39 files changed, 1602 insertions, 303 deletions
diff --git a/block/blk-core.c b/block/blk-core.c index 4201504158a1..227bef531c16 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -800,20 +800,21 @@ void submit_bio_noacct(struct bio *bio) goto end_io; } + if (WARN_ON_ONCE((bio->bi_opf & REQ_PREFLUSH) && + bio_op(bio) != REQ_OP_WRITE && + bio_op(bio) != REQ_OP_ZONE_APPEND)) + goto end_io; + /* * Filter flush bio's early so that bio based drivers without flush * support don't have to worry about them. */ - if (op_is_flush(bio->bi_opf)) { - if (WARN_ON_ONCE(bio_op(bio) != REQ_OP_WRITE && - bio_op(bio) != REQ_OP_ZONE_APPEND)) + if (op_is_flush(bio->bi_opf) && + !bdev_write_cache(bdev)) { + bio->bi_opf &= ~(REQ_PREFLUSH | REQ_FUA); + if (!bio_sectors(bio)) { + status = BLK_STS_OK; goto end_io; - if (!bdev_write_cache(bdev)) { - bio->bi_opf &= ~(REQ_PREFLUSH | REQ_FUA); - if (!bio_sectors(bio)) { - status = BLK_STS_OK; - goto end_io; - } } } diff --git a/fs/bcachefs/Kconfig b/fs/bcachefs/Kconfig index 8b846c09350b..5455412b2b75 100644 --- a/fs/bcachefs/Kconfig +++ b/fs/bcachefs/Kconfig @@ -3,7 +3,6 @@ config BCACHEFS_FS tristate "bcachefs filesystem support (EXPERIMENTAL)" depends on BLOCK select EXPORTFS - select CLOSURES select CRC32 select CRC64 select FS_POSIX_ACL diff --git a/fs/bcachefs/Makefile b/fs/bcachefs/Makefile index a4258615dffa..1e87eee962ec 100644 --- a/fs/bcachefs/Makefile +++ b/fs/bcachefs/Makefile @@ -98,7 +98,8 @@ bcachefs-y := \ two_state_shared_lock.o \ util.o \ varint.o \ - xattr.o + xattr.o \ + vendor/closure.o obj-$(CONFIG_MEAN_AND_VARIANCE_UNIT_TEST) += mean_and_variance_test.o diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index 83d6ab9c1a91..3ccca855f05e 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -196,7 +196,6 @@ #include <linux/backing-dev-defs.h> #include <linux/bug.h> #include <linux/bio.h> -#include <linux/closure.h> #include <linux/kobject.h> #include <linux/list.h> #include <linux/math64.h> @@ -217,6 +216,7 @@ #include "bcachefs_format.h" #include "btree_journal_iter_types.h" +#include "closure.h" #include "disk_accounting_types.h" #include "errcode.h" #include "fast_list.h" diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h index d29bd684b137..090f11e122ad 100644 --- a/fs/bcachefs/bcachefs_format.h +++ b/fs/bcachefs/bcachefs_format.h @@ -707,7 +707,8 @@ struct bch_sb_field_ext { x(inode_has_case_insensitive, BCH_VERSION(1, 28)) \ x(extent_snapshot_whiteouts, BCH_VERSION(1, 29)) \ x(31bit_dirent_offset, BCH_VERSION(1, 30)) \ - x(btree_node_accounting, BCH_VERSION(1, 31)) + x(btree_node_accounting, BCH_VERSION(1, 31)) \ + x(rebalance_v2, BCH_VERSION(1, 32)) enum bcachefs_metadata_version { bcachefs_metadata_version_min = 9, diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c index 59638d09e1fd..3b1d694dcb3a 100644 --- a/fs/bcachefs/btree_cache.c +++ b/fs/bcachefs/btree_cache.c @@ -15,6 +15,7 @@ #include <linux/prefetch.h> #include <linux/sched/mm.h> +#include <linux/seq_buf.h> #include <linux/swap.h> const char * const bch2_btree_node_flags[] = { @@ -565,6 +566,19 @@ static unsigned long bch2_btree_cache_count(struct shrinker *shrink, return btree_cache_can_free(list); } +static void bch2_btree_cache_shrinker_to_text(struct seq_buf *s, struct shrinker *shrink) +{ + struct btree_cache_list *list = shrink->private_data; + struct btree_cache *bc = container_of(list, struct btree_cache, live[list->idx]); + + char *cbuf; + size_t buflen = seq_buf_get_buf(s, &cbuf); + struct printbuf out = PRINTBUF_EXTERN(cbuf, buflen); + + 
bch2_btree_cache_to_text(&out, bc); + seq_buf_commit(s, out.pos); +} + void bch2_fs_btree_cache_exit(struct bch_fs *c) { struct btree_cache *bc = &c->btree_cache; @@ -659,6 +673,7 @@ int bch2_fs_btree_cache_init(struct bch_fs *c) bc->live[0].shrink = shrink; shrink->count_objects = bch2_btree_cache_count; shrink->scan_objects = bch2_btree_cache_scan; + shrink->to_text = bch2_btree_cache_shrinker_to_text; shrink->seeks = 2; shrink->private_data = &bc->live[0]; shrinker_register(shrink); @@ -669,6 +684,7 @@ int bch2_fs_btree_cache_init(struct bch_fs *c) bc->live[1].shrink = shrink; shrink->count_objects = bch2_btree_cache_count; shrink->scan_objects = bch2_btree_cache_scan; + shrink->to_text = bch2_btree_cache_shrinker_to_text; shrink->seeks = 8; shrink->private_data = &bc->live[1]; shrinker_register(shrink); diff --git a/fs/bcachefs/btree_key_cache.c b/fs/bcachefs/btree_key_cache.c index 4890cbc88e7c..e3336ab27ccc 100644 --- a/fs/bcachefs/btree_key_cache.c +++ b/fs/bcachefs/btree_key_cache.c @@ -13,6 +13,7 @@ #include "trace.h" #include <linux/sched/mm.h> +#include <linux/seq_buf.h> static inline bool btree_uses_pcpu_readers(enum btree_id id) { @@ -808,6 +809,18 @@ void bch2_fs_btree_key_cache_init_early(struct btree_key_cache *c) { } +static void bch2_btree_key_cache_shrinker_to_text(struct seq_buf *s, struct shrinker *shrink) +{ + struct bch_fs *c = shrink->private_data; + struct btree_key_cache *bc = &c->btree_key_cache; + char *cbuf; + size_t buflen = seq_buf_get_buf(s, &cbuf); + struct printbuf out = PRINTBUF_EXTERN(cbuf, buflen); + + bch2_btree_key_cache_to_text(&out, bc); + seq_buf_commit(s, out.pos); +} + int bch2_fs_btree_key_cache_init(struct btree_key_cache *bc) { struct bch_fs *c = container_of(bc, struct bch_fs, btree_key_cache); @@ -832,6 +845,7 @@ int bch2_fs_btree_key_cache_init(struct btree_key_cache *bc) bc->shrink = shrink; shrink->count_objects = bch2_btree_key_cache_count; shrink->scan_objects = bch2_btree_key_cache_scan; + shrink->to_text = bch2_btree_key_cache_shrinker_to_text; shrink->batch = 1 << 14; shrink->seeks = 0; shrink->private_data = c; diff --git a/fs/bcachefs/checksum.h b/fs/bcachefs/checksum.h index 10bfadcde80a..362846d5bb87 100644 --- a/fs/bcachefs/checksum.h +++ b/fs/bcachefs/checksum.h @@ -143,6 +143,17 @@ static inline enum bch_csum_type bch2_data_checksum_type(struct bch_fs *c, return bch2_csum_opt_to_type(opts.data_checksum, true); } +static inline enum bch_csum_type bch2_data_checksum_type_rb(struct bch_fs *c, + struct bch_extent_rebalance opts) +{ + if (c->sb.encryption_type) + return c->opts.wide_macs + ? 
BCH_CSUM_chacha20_poly1305_128 + : BCH_CSUM_chacha20_poly1305_80; + + return bch2_csum_opt_to_type(opts.data_checksum, true); +} + static inline enum bch_csum_type bch2_meta_checksum_type(struct bch_fs *c) { if (c->sb.encryption_type) diff --git a/fs/bcachefs/closure.h b/fs/bcachefs/closure.h new file mode 100644 index 000000000000..d8d4c7093ce0 --- /dev/null +++ b/fs/bcachefs/closure.h @@ -0,0 +1,5 @@ +#include "vendor/closure.h" + +#define closure_wait bch2_closure_wait +#define closure_return_sync bch2_closure_return_sync +#define __closure_wake_up __bch2_closure_wake_up diff --git a/fs/bcachefs/data_update.c b/fs/bcachefs/data_update.c index 155c1ad42fc1..6333af6adbae 100644 --- a/fs/bcachefs/data_update.c +++ b/fs/bcachefs/data_update.c @@ -208,28 +208,6 @@ static void trace_data_update2(struct data_update *m, } noinline_for_stack -static void trace_io_move_created_rebalance2(struct data_update *m, - struct bkey_s_c old, struct bkey_s_c k, - struct bkey_i *insert) -{ - struct bch_fs *c = m->op.c; - CLASS(printbuf, buf)(); - - bch2_data_update_opts_to_text(&buf, c, &m->op.opts, &m->data_opts); - - prt_str(&buf, "\nold: "); - bch2_bkey_val_to_text(&buf, c, old); - prt_str(&buf, "\nk: "); - bch2_bkey_val_to_text(&buf, c, k); - prt_str(&buf, "\nnew: "); - bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(insert)); - - trace_io_move_created_rebalance(c, buf.buf); - - count_event(c, io_move_created_rebalance); -} - -noinline_for_stack static int data_update_invalid_bkey(struct data_update *m, struct bkey_s_c old, struct bkey_s_c k, struct bkey_i *insert) @@ -438,7 +416,7 @@ restart_drop_extra_replicas: bch2_insert_snapshot_whiteouts(trans, m->btree_id, k.k->p, insert->k.p) ?: bch2_inum_snapshot_opts_get(trans, k.k->p.inode, k.k->p.snapshot, &opts) ?: - bch2_bkey_set_needs_rebalance(c, &opts, insert, + bch2_bkey_set_needs_rebalance(trans, NULL, &opts, insert, SET_NEEDS_REBALANCE_foreground, m->op.opts.change_cookie) ?: bch2_trans_update(trans, &iter, insert, @@ -449,10 +427,6 @@ restart_drop_extra_replicas: if (trace_data_update_enabled()) trace_data_update2(m, old, k, insert); - if (bch2_bkey_sectors_need_rebalance(c, bkey_i_to_s_c(insert)) * k.k->size > - bch2_bkey_sectors_need_rebalance(c, k) * insert->k.size) - trace_io_move_created_rebalance2(m, old, k, insert); - ret = bch2_trans_commit(trans, &op->res, NULL, BCH_TRANS_COMMIT_no_check_rw| diff --git a/fs/bcachefs/disk_accounting.c b/fs/bcachefs/disk_accounting.c index a99f821c6a1c..9da26e11446b 100644 --- a/fs/bcachefs/disk_accounting.c +++ b/fs/bcachefs/disk_accounting.c @@ -282,6 +282,9 @@ void bch2_accounting_key_to_text(struct printbuf *out, struct disk_accounting_po prt_str(out, "btree="); bch2_btree_id_to_text(out, k->btree.id); break; + case BCH_DISK_ACCOUNTING_rebalance_work_v2: + bch2_prt_rebalance_accounting_type(out, k->rebalance_work_v2.type); + break; } } diff --git a/fs/bcachefs/disk_accounting_format.h b/fs/bcachefs/disk_accounting_format.h index 730a17ea4243..0b61d6100180 100644 --- a/fs/bcachefs/disk_accounting_format.h +++ b/fs/bcachefs/disk_accounting_format.h @@ -110,7 +110,8 @@ static inline bool data_type_is_hidden(enum bch_data_type type) x(snapshot, 5, 1) \ x(btree, 6, 3) \ x(rebalance_work, 7, 1) \ - x(inum, 8, 3) + x(inum, 8, 3) \ + x(rebalance_work_v2, 9, 1) \ enum disk_accounting_type { #define x(f, nr, ...) 
BCH_DISK_ACCOUNTING_##f = nr, @@ -210,6 +211,10 @@ struct bch_acct_inum { struct bch_acct_rebalance_work { }; +struct bch_acct_rebalance_work_v2 { + __u8 type; +}; + struct disk_accounting_pos { union { struct { @@ -224,6 +229,7 @@ struct disk_accounting_pos { struct bch_acct_btree btree; struct bch_acct_rebalance_work rebalance_work; struct bch_acct_inum inum; + struct bch_acct_rebalance_work_v2 rebalance_work_v2; } __packed; } __packed; struct bpos _pad; diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c index 89a95b6c4e51..103719a76c81 100644 --- a/fs/bcachefs/ec.c +++ b/fs/bcachefs/ec.c @@ -1134,7 +1134,7 @@ static int ec_stripe_update_extent(struct btree_trans *trans, ret = bch2_extent_get_io_opts_one(trans, &opts, &iter, bkey_i_to_s_c(n), SET_NEEDS_REBALANCE_other) ?: - bch2_bkey_set_needs_rebalance(trans->c, &opts, n, + bch2_bkey_set_needs_rebalance(trans, NULL, &opts, n, SET_NEEDS_REBALANCE_other, 0) ?: bch2_trans_update(trans, &iter, n, 0); out: diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c index 3274ba42c995..c534b009bf60 100644 --- a/fs/bcachefs/extents.c +++ b/fs/bcachefs/extents.c @@ -1522,24 +1522,11 @@ int bch2_bkey_ptrs_validate(struct bch_fs *c, struct bkey_s_c k, "redundant stripe entry"); have_ec = true; break; - case BCH_EXTENT_ENTRY_rebalance: { - /* - * this shouldn't be a fsck error, for forward - * compatibility; the rebalance code should just refetch - * the compression opt if it's unknown - */ -#if 0 - const struct bch_extent_rebalance *r = &entry->rebalance; - - if (!bch2_compression_opt_valid(r->compression)) { - union bch_compression_opt opt = { .value = r->compression }; - prt_printf(err, "invalid compression opt %u:%u", - opt.type, opt.level); - return bch_err_throw(c, invalid_bkey); - } -#endif + case BCH_EXTENT_ENTRY_rebalance: + ret = bch2_extent_rebalance_validate(c, k, from, &entry->rebalance); + if (ret) + return ret; break; - } case BCH_EXTENT_ENTRY_flags: bkey_fsck_err_on(entry != ptrs.start, c, extent_flags_not_at_start, diff --git a/fs/bcachefs/io_read.c b/fs/bcachefs/io_read.c index ca480b8f8dae..ac545f962ce9 100644 --- a/fs/bcachefs/io_read.c +++ b/fs/bcachefs/io_read.c @@ -42,12 +42,6 @@ module_param_named(read_corrupt_device, bch2_read_corrupt_device, int, 0644); MODULE_PARM_DESC(read_corrupt_device, ""); #endif -static bool bch2_poison_extents_on_checksum_error; -module_param_named(poison_extents_on_checksum_error, - bch2_poison_extents_on_checksum_error, bool, 0644); -MODULE_PARM_DESC(poison_extents_on_checksum_error, - "Extents with checksum errors are marked as poisoned - unsafe without read fua support"); - #ifndef CONFIG_BCACHEFS_NO_LATENCY_ACCT static inline u32 bch2_dev_congested_read(struct bch_dev *ca, u64 now) @@ -551,9 +545,6 @@ static void get_rbio_extent(struct btree_trans *trans, static noinline int maybe_poison_extent(struct btree_trans *trans, struct bch_read_bio *rbio, enum btree_id btree, struct bkey_s_c read_k) { - if (!bch2_poison_extents_on_checksum_error) - return 0; - struct bch_fs *c = trans->c; struct data_update *u = rbio_data_update(rbio); @@ -1291,6 +1282,10 @@ retry_pick: async_object_list_add(c, rbio, rbio, &rbio->list_idx); + /* XXX: also nvme read recovery level */ + if (unlikely(failed && bch2_dev_io_failures(failed, pick.ptr.dev))) + rbio->bio.bi_opf |= REQ_FUA; + if (rbio->bounce) trace_and_count(c, io_read_bounce, &rbio->bio); diff --git a/fs/bcachefs/io_write.c b/fs/bcachefs/io_write.c index 6a5da02ce266..ccbca802db0b 100644 --- a/fs/bcachefs/io_write.c +++ b/fs/bcachefs/io_write.c @@ -365,7 
+365,7 @@ int bch2_extent_update(struct btree_trans *trans, min(k->k.p.offset << 9, new_i_size), i_sectors_delta, &inode) ?: (bch2_inode_opts_get_inode(c, &inode, &opts), - bch2_bkey_set_needs_rebalance(c, &opts, k, + bch2_bkey_set_needs_rebalance(trans, NULL, &opts, k, SET_NEEDS_REBALANCE_foreground, change_cookie)) ?: bch2_trans_update(trans, iter, k, 0) ?: @@ -1271,7 +1271,7 @@ static int bch2_nocow_write_convert_one_unwritten(struct btree_trans *trans, return bch2_extent_update_i_size_sectors(trans, iter, min(new->k.p.offset << 9, new_i_size), 0, &inode) ?: (bch2_inode_opts_get_inode(c, &inode, &opts), - bch2_bkey_set_needs_rebalance(c, &opts, new, + bch2_bkey_set_needs_rebalance(trans, NULL, &opts, new, SET_NEEDS_REBALANCE_foreground, op->opts.change_cookie)) ?: bch2_trans_update(trans, iter, new, diff --git a/fs/bcachefs/migrate.c b/fs/bcachefs/migrate.c index 139a6587a64e..9b172af4f8c8 100644 --- a/fs/bcachefs/migrate.c +++ b/fs/bcachefs/migrate.c @@ -84,7 +84,7 @@ static int bch2_dev_usrdata_drop_key(struct btree_trans *trans, struct bch_inode_opts opts; ret = bch2_extent_get_apply_io_opts_one(trans, &opts, iter, k, ctx) ?: - bch2_bkey_set_needs_rebalance(c, &opts, n, ctx, 0) ?: + bch2_bkey_set_needs_rebalance(trans, NULL, &opts, n, ctx, 0) ?: drop_dev_ptrs(c, bkey_i_to_s(n), dev_idx, flags, err, false); if (ret) return ret; diff --git a/fs/bcachefs/nocow_locking.c b/fs/bcachefs/nocow_locking.c index 58cfd540c6d6..71b17f18e90c 100644 --- a/fs/bcachefs/nocow_locking.c +++ b/fs/bcachefs/nocow_locking.c @@ -2,11 +2,10 @@ #include "bcachefs.h" #include "bkey_methods.h" +#include "closure.h" #include "nocow_locking.h" #include "util.h" -#include <linux/closure.h> - bool bch2_bucket_nocow_is_locked(struct bucket_nocow_lock_table *t, struct bpos bucket) { u64 dev_bucket = bucket_to_u64(bucket); diff --git a/fs/bcachefs/opts.c b/fs/bcachefs/opts.c index bd5faafc9aa7..365cce4a6b49 100644 --- a/fs/bcachefs/opts.c +++ b/fs/bcachefs/opts.c @@ -103,6 +103,13 @@ static const char * const __bch2_fs_usage_types[] = { #undef x +static const char * const __bch2_rebalance_accounting_types[] = { +#define x(n) #n, + BCH_REBALANCE_ACCOUNTING() +#undef x + NULL +}; + static void prt_str_opt_boundscheck(struct printbuf *out, const char * const opts[], unsigned nr, const char *type, unsigned idx) { @@ -125,6 +132,7 @@ PRT_STR_OPT_BOUNDSCHECKED(csum_opt, enum bch_csum_opt); PRT_STR_OPT_BOUNDSCHECKED(csum_type, enum bch_csum_type); PRT_STR_OPT_BOUNDSCHECKED(compression_type, enum bch_compression_type); PRT_STR_OPT_BOUNDSCHECKED(str_hash_type, enum bch_str_hash_type); +PRT_STR_OPT_BOUNDSCHECKED(rebalance_accounting_type, enum bch_rebalance_accounting_type); static int bch2_opt_fix_errors_parse(struct bch_fs *c, const char *val, u64 *res, struct printbuf *err) diff --git a/fs/bcachefs/opts.h b/fs/bcachefs/opts.h index 6b9f18839345..de1ac235e929 100644 --- a/fs/bcachefs/opts.h +++ b/fs/bcachefs/opts.h @@ -34,6 +34,7 @@ void bch2_prt_csum_opt(struct printbuf *, enum bch_csum_opt); void bch2_prt_csum_type(struct printbuf *, enum bch_csum_type); void bch2_prt_compression_type(struct printbuf *, enum bch_compression_type); void bch2_prt_str_hash_type(struct printbuf *, enum bch_str_hash_type); +void bch2_prt_rebalance_accounting_type(struct printbuf *, enum bch_rebalance_accounting_type); static inline const char *bch2_d_type_str(unsigned d_type) { diff --git a/fs/bcachefs/rebalance.c b/fs/bcachefs/rebalance.c index 67d6a90e86ef..0c29be7d662f 100644 --- a/fs/bcachefs/rebalance.c +++ b/fs/bcachefs/rebalance.c @@ 
-25,8 +25,29 @@ #include <linux/kthread.h> #include <linux/sched/cputime.h> +#define REBALANCE_WORK_SCAN_OFFSET (U64_MAX - 1) + /* bch_extent_rebalance: */ +int bch2_extent_rebalance_validate(struct bch_fs *c, + struct bkey_s_c k, + struct bkey_validate_context from, + const struct bch_extent_rebalance *r) +{ + int ret = 0; + + bkey_fsck_err_on(r->pending && !(r->need_rb & BIT(BCH_REBALANCE_background_target)), + c, extent_rebalance_bad_pending, + "pending incorrectly set"); + + bkey_fsck_err_on(r->hipri && !(r->need_rb & BIT(BCH_REBALANCE_data_replicas)), + c, extent_rebalance_bad_pending, + "hipri incorrectly set"); + +fsck_err: + return ret; +} + static const struct bch_extent_rebalance *bch2_bkey_ptrs_rebalance_opts(struct bkey_ptrs_c ptrs) { const union bch_extent_entry *entry; @@ -38,15 +59,30 @@ static const struct bch_extent_rebalance *bch2_bkey_ptrs_rebalance_opts(struct b return NULL; } -static const struct bch_extent_rebalance *bch2_bkey_rebalance_opts(struct bkey_s_c k) +const struct bch_extent_rebalance *bch2_bkey_rebalance_opts(struct bkey_s_c k) { return bch2_bkey_ptrs_rebalance_opts(bch2_bkey_ptrs_c(k)); } +static const char * const rebalance_opts[] = { +#define x(n) #n, + BCH_REBALANCE_OPTS() +#undef x + NULL +}; + void bch2_extent_rebalance_to_text(struct printbuf *out, struct bch_fs *c, const struct bch_extent_rebalance *r) { - prt_printf(out, "replicas=%u", r->data_replicas); + prt_str(out, "need_rb="); + prt_bitflags(out, rebalance_opts, r->need_rb); + + if (r->hipri) + prt_str(out, " hipri"); + if (r->pending) + prt_str(out, " pending"); + + prt_printf(out, " replicas=%u", r->data_replicas); if (r->data_replicas_from_inode) prt_str(out, " (inode)"); @@ -92,32 +128,54 @@ void bch2_extent_rebalance_to_text(struct printbuf *out, struct bch_fs *c, } } -int bch2_trigger_extent_rebalance(struct btree_trans *trans, - struct bkey_s_c old, struct bkey_s_c new, - enum btree_iter_update_trigger_flags flags) -{ - struct bch_fs *c = trans->c; - int need_rebalance_delta = 0; - s64 need_rebalance_sectors_delta[1] = { 0 }; +/* + * XXX: check in bkey_validate that if r->hipri or r->pending are set, + * r->data_replicas are also set + */ - s64 s = bch2_bkey_sectors_need_rebalance(c, old); - need_rebalance_delta -= s != 0; - need_rebalance_sectors_delta[0] -= s; +static inline unsigned rb_accounting_counters(const struct bch_extent_rebalance *r) +{ + if (!r) + return 0; + unsigned ret = r->need_rb; - s = bch2_bkey_sectors_need_rebalance(c, new); - need_rebalance_delta += s != 0; - need_rebalance_sectors_delta[0] += s; + if (r->hipri) + ret |= BIT(BCH_REBALANCE_ACCOUNTING_high_priority); + if (r->pending) { + ret |= BIT(BCH_REBALANCE_ACCOUNTING_pending); + ret &= ~BIT(BCH_REBALANCE_ACCOUNTING_background_target); + } + return ret; +} - if ((flags & BTREE_TRIGGER_transactional) && need_rebalance_delta) { +int __bch2_trigger_extent_rebalance(struct btree_trans *trans, + struct bkey_s_c old, struct bkey_s_c new, + unsigned old_r, unsigned new_r, + enum btree_iter_update_trigger_flags flags) +{ + int delta = (int) !!new_r - (int) !!old_r; + if ((flags & BTREE_TRIGGER_transactional) && delta) { int ret = bch2_btree_bit_mod_buffered(trans, BTREE_ID_rebalance_work, - new.k->p, need_rebalance_delta > 0); + new.k->p, delta > 0); if (ret) return ret; } - if (need_rebalance_sectors_delta[0]) { + delta = old.k->size == new.k->size + ? 
old_r ^ new_r + : old_r | new_r; + while (delta) { + unsigned c = __ffs(delta); + delta ^= BIT(c); + + s64 v[1] = { 0 }; + if (old_r & BIT(c)) + v[0] -= (s64) old.k->size; + if (new_r & BIT(c)) + v[0] += (s64) new.k->size; + int ret = bch2_disk_accounting_mod2(trans, flags & BTREE_TRIGGER_gc, - need_rebalance_sectors_delta, rebalance_work); + v, rebalance_work_v2, c); if (ret) return ret; } @@ -125,39 +183,48 @@ int bch2_trigger_extent_rebalance(struct btree_trans *trans, return 0; } -static void bch2_bkey_needs_rebalance(struct bch_fs *c, struct bkey_s_c k, - struct bch_inode_opts *io_opts, - unsigned *move_ptrs, - unsigned *compress_ptrs, - u64 *sectors) +static struct bch_extent_rebalance +bch2_bkey_needs_rebalance(struct bch_fs *c, struct bkey_s_c k, + struct bch_inode_opts *opts, + unsigned *move_ptrs, + unsigned *compress_ptrs, + unsigned *csum_ptrs, + bool may_update_indirect) { *move_ptrs = 0; *compress_ptrs = 0; - *sectors = 0; + *csum_ptrs = 0; struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); - - const struct bch_extent_rebalance *rb_opts = bch2_bkey_ptrs_rebalance_opts(ptrs); - if (!io_opts && !rb_opts) - return; + struct bch_extent_rebalance r = { .type = BIT(BCH_EXTENT_ENTRY_rebalance) }; if (bch2_bkey_extent_ptrs_flags(ptrs) & BIT_ULL(BCH_EXTENT_FLAG_poisoned)) - return; + return r; - unsigned compression_type = - bch2_compression_opt_to_type(io_opts - ? io_opts->background_compression - : rb_opts->background_compression); - unsigned target = io_opts - ? io_opts->background_target - : rb_opts->background_target; - if (target && !bch2_target_accepts_data(c, BCH_DATA_user, target)) - target = 0; + const struct bch_extent_rebalance *old_r = bch2_bkey_ptrs_rebalance_opts(ptrs); + if (old_r) { + r = *old_r; + r.need_rb = 0; + } + +#define x(_name) \ + if (k.k->type != KEY_TYPE_reflink_v || \ + may_update_indirect || \ + (!opts->_name##_from_inode && !r._name##_from_inode)) { \ + r._name = opts->_name; \ + r._name##_from_inode = opts->_name##_from_inode; \ + } + BCH_REBALANCE_OPTS() +#undef x + + unsigned compression_type = bch2_compression_opt_to_type(r.background_compression); + unsigned csum_type = bch2_data_checksum_type_rb(c, r); + + bool incompressible = false, unwritten = false, ec = false; + unsigned durability = 0, min_durability = INT_MAX; const union bch_extent_entry *entry; struct extent_ptr_decoded p; - bool incompressible = false, unwritten = false; - unsigned ptr_idx = 1; guard(rcu)(); @@ -166,102 +233,222 @@ static void bch2_bkey_needs_rebalance(struct bch_fs *c, struct bkey_s_c k, unwritten |= p.ptr.unwritten; if (!p.ptr.cached) { - if (p.crc.compression_type != compression_type) + if (p.crc.compression_type != compression_type) { *compress_ptrs |= ptr_idx; + r.need_rb |= BIT(BCH_REBALANCE_background_compression); + } - if (target && !bch2_dev_in_target(c, p.ptr.dev, target)) + if (p.crc.csum_type != csum_type) { + *csum_ptrs |= ptr_idx; + r.need_rb |= BIT(BCH_REBALANCE_data_checksum); + } + + if (r.background_target && + !bch2_dev_in_target(c, p.ptr.dev, r.background_target)) { *move_ptrs |= ptr_idx; + r.need_rb |= BIT(BCH_REBALANCE_background_target); + } + + unsigned d = bch2_extent_ptr_durability(c, &p); + durability += d; + min_durability = min(min_durability, d); + + ec |= p.has_ec; } ptr_idx <<= 1; } - if (unwritten) + if (unwritten || incompressible) { *compress_ptrs = 0; - if (incompressible) - *compress_ptrs = 0; - - unsigned rb_ptrs = *move_ptrs | *compress_ptrs; - - if (!rb_ptrs) - return; + r.need_rb &= ~BIT(BCH_REBALANCE_background_compression); + } - 
ptr_idx = 1; - bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { - if (rb_ptrs & ptr_idx) - *sectors += p.crc.compressed_size; - ptr_idx <<= 1; + if (unwritten) { + *csum_ptrs = 0; + r.need_rb &= !BIT(BCH_REBALANCE_data_checksum); } + + return r; } -u64 bch2_bkey_sectors_need_rebalance(struct bch_fs *c, struct bkey_s_c k) +static int check_rebalance_scan_cookie(struct btree_trans *trans, u64 inum, bool *v) { - unsigned move_ptrs = 0; - unsigned compress_ptrs = 0; - u64 sectors = 0; + if (v && *v) + return 1; + + /* + * If opts need to be propagated to the extent, a scan cookie should be + * present: + */ + CLASS(btree_iter, iter)(trans, BTREE_ID_rebalance_work, + SPOS(inum, REBALANCE_WORK_SCAN_OFFSET, U32_MAX), + 0); + struct bkey_s_c k = bch2_btree_iter_peek_slot(&iter); + int ret = bkey_err(k); + if (ret) + return ret; - bch2_bkey_needs_rebalance(c, k, NULL, &move_ptrs, &compress_ptrs, §ors); - return sectors; + ret = k.k->type == KEY_TYPE_cookie; + if (v) + *v = ret; + return ret; } -static unsigned bch2_bkey_ptrs_need_rebalance(struct bch_fs *c, - struct bch_inode_opts *opts, - struct bkey_s_c k) +static int new_needs_rb_allowed(struct btree_trans *trans, + struct per_snapshot_io_opts *s, + struct bkey_s_c k, + enum set_needs_rebalance_ctx ctx, + unsigned opt_change_cookie, + const struct bch_extent_rebalance *old, + const struct bch_extent_rebalance *new, + unsigned new_need_rb) { - unsigned move_ptrs = 0; - unsigned compress_ptrs = 0; - u64 sectors = 0; + struct bch_fs *c = trans->c; + /* + * New need_rb - pointers that don't match the current io path options - + * are only allowed in certain situations: + * + * Propagating new options: from bch2_set_rebalance_needs_scan + * + * Foreground writes: background_compression and background_target are + * allowed + * + * Foreground writes: we may have raced with an option change: + * opt_change_cookie checks for this + * + * XXX: foreground writes should still match compression, + * foreground_target - figure out how to check for this + */ + if (ctx == SET_NEEDS_REBALANCE_opt_change || + ctx == SET_NEEDS_REBALANCE_opt_change_indirect) + return 0; + + if (ctx == SET_NEEDS_REBALANCE_foreground) { + new_need_rb &= ~(BIT(BCH_REBALANCE_background_compression)| + BIT(BCH_REBALANCE_background_target)); + if (!new_need_rb) + return 0; + + if (opt_change_cookie != atomic_read(&c->opt_change_cookie)) + return 0; + } + + /* + * Either the extent data or the extent io options (from + * bch_extent_rebalance) should match the io_opts from the + * inode/filesystem, unless + * + * - There's a scan pending to propagate new options + * - It's an indirect extent: it may be referenced by inodes + * with inconsistent options + * + * For efficiency (so that we can cache checking for scan + * cookies), only check option consistency when we're called + * with snapshot_io_opts - don't bother when we're called from + * move_data_phys() -> get_io_opts_one() + * + * Note that we can cache the existence of a cookie, but not the + * non-existence, to avoid spurious false positives. + */ + int ret = check_rebalance_scan_cookie(trans, 0, s ? &s->fs_scan_cookie : NULL) ?: + check_rebalance_scan_cookie(trans, k.k->p.inode, s ? 
&s->inum_scan_cookie : NULL); + if (ret < 0) + return ret; + if (ret) + return 0; + + CLASS(printbuf, buf)(); + + prt_printf(&buf, "extent with incorrect/missing rebalance opts:\n"); + bch2_bkey_val_to_text(&buf, c, k); - bch2_bkey_needs_rebalance(c, k, opts, &move_ptrs, &compress_ptrs, §ors); - return move_ptrs|compress_ptrs; + const struct bch_extent_rebalance _old = {}; + if (!old) + old = &_old; + +#define x(_name) \ + if (new_need_rb & BIT(BCH_REBALANCE_##_name)) \ + prt_printf(&buf, "\n" #_name " %u != %u", old->_name, new->_name); + BCH_REBALANCE_OPTS() +#undef x + + fsck_err(trans, extent_io_opts_not_set, "%s", buf.buf); +fsck_err: + return ret; } -static inline bool bkey_should_have_rb_opts(struct bch_fs *c, - struct bch_inode_opts *opts, - struct bkey_s_c k) +static inline bool bkey_should_have_rb_opts(struct bkey_s_c k, + struct bch_extent_rebalance new) { if (k.k->type == KEY_TYPE_reflink_v) { -#define x(n) if (opts->n##_from_inode) return true; +#define x(n) if (new.n##_from_inode) return true; BCH_REBALANCE_OPTS() #undef x } - return bch2_bkey_ptrs_need_rebalance(c, opts, k); + return new.need_rb; } -int bch2_bkey_set_needs_rebalance(struct bch_fs *c, struct bch_inode_opts *opts, +int bch2_bkey_set_needs_rebalance(struct btree_trans *trans, + struct per_snapshot_io_opts *snapshot_io_opts, + struct bch_inode_opts *opts, struct bkey_i *_k, enum set_needs_rebalance_ctx ctx, - u32 change_cookie) + u32 opt_change_cookie) { if (!bkey_extent_is_direct_data(&_k->k)) return 0; + struct bch_fs *c = trans->c; struct bkey_s k = bkey_i_to_s(_k); struct bch_extent_rebalance *old = (struct bch_extent_rebalance *) bch2_bkey_rebalance_opts(k.s_c); - if (bkey_should_have_rb_opts(c, opts, k.s_c)) { + unsigned move_ptrs = 0; + unsigned compress_ptrs = 0; + unsigned csum_ptrs = 0; + struct bch_extent_rebalance new = + bch2_bkey_needs_rebalance(c, k.s_c, opts, &move_ptrs, &compress_ptrs, &csum_ptrs, + ctx == SET_NEEDS_REBALANCE_opt_change_indirect); + + bool should_have_rb = bkey_should_have_rb_opts(k.s_c, new); + + if (should_have_rb == !!old && + (should_have_rb ? !memcmp(old, &new, sizeof(new)) : !old)) + return 0; + + unsigned new_need_rb = new.need_rb & ~(old ? 
old->need_rb : 0); + + if (unlikely(new_need_rb)) { + int ret = new_needs_rb_allowed(trans, snapshot_io_opts, + k.s_c, ctx, opt_change_cookie, + old, &new, new_need_rb); + if (ret) + return ret; + } + + if (should_have_rb) { if (!old) { old = bkey_val_end(k); k.k->u64s += sizeof(*old) / sizeof(u64); } - *old = io_opts_to_rebalance_opts(c, opts); - } else { - if (old) - extent_entry_drop(k, (union bch_extent_entry *) old); - } + *old = new; + } else if (old) + extent_entry_drop(k, (union bch_extent_entry *) old); return 0; } static int bch2_get_update_rebalance_opts(struct btree_trans *trans, + struct per_snapshot_io_opts *snapshot_io_opts, struct bch_inode_opts *io_opts, struct btree_iter *iter, struct bkey_s_c k, enum set_needs_rebalance_ctx ctx) { struct bch_fs *c = trans->c; + int ret = 0; BUG_ON(iter->flags & BTREE_ITER_is_extents); BUG_ON(iter->flags & BTREE_ITER_filter_snapshots); @@ -269,36 +456,24 @@ static int bch2_get_update_rebalance_opts(struct btree_trans *trans, if (!bkey_extent_is_direct_data(k.k)) return 0; - bool may_update_indirect = ctx == SET_NEEDS_REBALANCE_opt_change_indirect; + struct bch_extent_rebalance *old = + (struct bch_extent_rebalance *) bch2_bkey_rebalance_opts(k); - /* - * If it's an indirect extent, and we walked to it directly, we won't - * have the options from the inode that were directly applied: options - * from the extent take precedence - unless the io_opts option came from - * the inode and may_update_indirect is true (walked from a - * REFLINK_P_MAY_UPDATE_OPTIONS pointer). - */ - const struct bch_extent_rebalance *old = bch2_bkey_rebalance_opts(k); - if (old && k.k->type == KEY_TYPE_reflink_v) { -#define x(_name) \ - if (old->_name##_from_inode && \ - !(may_update_indirect && io_opts->_name##_from_inode)) { \ - io_opts->_name = old->_name; \ - io_opts->_name##_from_inode = true; \ - } - BCH_REBALANCE_OPTS() -#undef x - } + unsigned move_ptrs = 0; + unsigned compress_ptrs = 0; + unsigned csum_ptrs = 0; + struct bch_extent_rebalance new = + bch2_bkey_needs_rebalance(c, k, io_opts, &move_ptrs, &compress_ptrs, &csum_ptrs, + ctx == SET_NEEDS_REBALANCE_opt_change_indirect); - struct bch_extent_rebalance new = io_opts_to_rebalance_opts(c, io_opts); + bool should_have_rb = bkey_should_have_rb_opts(k, new); - if (bkey_should_have_rb_opts(c, io_opts, k) - ? old && !memcmp(old, &new, sizeof(new)) - : !old) + if (should_have_rb == !!old && + (should_have_rb ? 
!memcmp(old, &new, sizeof(new)) : !old)) return 0; struct bkey_i *n = bch2_trans_kmalloc(trans, bkey_bytes(k.k) + 8); - int ret = PTR_ERR_OR_ZERO(n); + ret = PTR_ERR_OR_ZERO(n); if (ret) return ret; @@ -306,7 +481,7 @@ static int bch2_get_update_rebalance_opts(struct btree_trans *trans, /* On successfull transaction commit, @k was invalidated: */ - return bch2_bkey_set_needs_rebalance(c, io_opts, n, ctx, 0) ?: + return bch2_bkey_set_needs_rebalance(trans, snapshot_io_opts, io_opts, n, ctx, 0) ?: bch2_trans_update(trans, iter, n, BTREE_UPDATE_internal_snapshot_node) ?: bch2_trans_commit(trans, NULL, NULL, 0) ?: bch_err_throw(c, transaction_restart_commit); @@ -349,7 +524,8 @@ static struct bch_inode_opts *bch2_extent_get_io_opts(struct btree_trans *trans, darray_push(&io_opts->d, e); })); - io_opts->cur_inum = extent_pos.inode; + io_opts->cur_inum = extent_pos.inode; + io_opts->inum_scan_cookie = false; } ret = ret ?: trans_was_restarted(trans, restart_count); @@ -372,11 +548,13 @@ struct bch_inode_opts *bch2_extent_get_apply_io_opts(struct btree_trans *trans, enum set_needs_rebalance_ctx ctx) { struct bch_inode_opts *opts = - bch2_extent_get_io_opts(trans, snapshot_io_opts, extent_pos, extent_iter, extent_k); + bch2_extent_get_io_opts(trans, snapshot_io_opts, + extent_pos, extent_iter, extent_k); if (IS_ERR(opts) || btree_iter_path(trans, extent_iter)->level) return opts; - int ret = bch2_get_update_rebalance_opts(trans, opts, extent_iter, extent_k, ctx); + int ret = bch2_get_update_rebalance_opts(trans, snapshot_io_opts, opts, + extent_iter, extent_k, ctx); return ret ? ERR_PTR(ret) : opts; } @@ -420,11 +598,9 @@ int bch2_extent_get_apply_io_opts_one(struct btree_trans *trans, if (ret || btree_iter_path(trans, extent_iter)->level) return ret; - return bch2_get_update_rebalance_opts(trans, io_opts, extent_iter, extent_k, ctx); + return bch2_get_update_rebalance_opts(trans, NULL, io_opts, extent_iter, extent_k, ctx); } -#define REBALANCE_WORK_SCAN_OFFSET (U64_MAX - 1) - static const char * const bch2_rebalance_state_strs[] = { #define x(t) #t, BCH_REBALANCE_STATES() @@ -535,23 +711,6 @@ static struct bkey_i *next_rebalance_entry(struct btree_trans *trans, return &(&darray_pop(buf))->k_i; } -static int bch2_bkey_clear_needs_rebalance(struct btree_trans *trans, - struct btree_iter *iter, - struct bkey_s_c k) -{ - if (k.k->type == KEY_TYPE_reflink_v || !bch2_bkey_rebalance_opts(k)) - return 0; - - struct bkey_i *n = bch2_bkey_make_mut(trans, iter, &k, 0); - int ret = PTR_ERR_OR_ZERO(n); - if (ret) - return ret; - - extent_entry_drop(bkey_i_to_s(n), - (void *) bch2_bkey_rebalance_opts(bkey_i_to_s_c(n))); - return bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc); -} - static struct bkey_s_c next_rebalance_extent(struct btree_trans *trans, struct per_snapshot_io_opts *snapshot_io_opts, struct bpos work_pos, @@ -570,6 +729,10 @@ static struct bkey_s_c next_rebalance_extent(struct btree_trans *trans, if (bkey_err(k)) return k; + const struct bch_extent_rebalance *r = bch2_bkey_rebalance_opts(k); + if (!r || !r->need_rb) /* Write buffer race? 
*/ + return bkey_s_c_null; + struct bch_inode_opts *opts = bch2_extent_get_apply_io_opts(trans, snapshot_io_opts, extent_iter->pos, extent_iter, k, @@ -580,22 +743,24 @@ static struct bkey_s_c next_rebalance_extent(struct btree_trans *trans, *opts_ret = opts; + unsigned move_ptrs = 0; + unsigned compress_ptrs = 0; + unsigned csum_ptrs = 0; + bch2_bkey_needs_rebalance(c, k, opts, &move_ptrs, &compress_ptrs, &csum_ptrs, false); + memset(data_opts, 0, sizeof(*data_opts)); - data_opts->rewrite_ptrs = bch2_bkey_ptrs_need_rebalance(c, opts, k); + data_opts->rewrite_ptrs = move_ptrs|compress_ptrs|csum_ptrs; data_opts->target = opts->background_target; data_opts->write_flags |= BCH_WRITE_only_specified_devs; - if (!data_opts->rewrite_ptrs) { - /* - * device we would want to write to offline? devices in target - * changed? - * - * We'll now need a full scan before this extent is picked up - * again: - */ - int ret = bch2_bkey_clear_needs_rebalance(trans, extent_iter, k); - if (ret) - return bkey_s_c_err(ret); + if (!data_opts->rewrite_ptrs && + !data_opts->kill_ptrs && + !data_opts->kill_ec_ptrs && + !data_opts->extra_replicas) { + CLASS(printbuf, buf)(); + prt_printf(&buf, "got extent to rebalance but nothing to do, confused\n "); + bch2_bkey_val_to_text(&buf, c, k); + bch_err(c, "%s", buf.buf); return bkey_s_c_null; } @@ -605,12 +770,6 @@ static struct bkey_s_c next_rebalance_extent(struct btree_trans *trans, bch2_bkey_val_to_text(&buf, c, k); prt_newline(&buf); - unsigned move_ptrs = 0; - unsigned compress_ptrs = 0; - u64 sectors = 0; - - bch2_bkey_needs_rebalance(c, k, opts, &move_ptrs, &compress_ptrs, §ors); - if (move_ptrs) { prt_str(&buf, "move="); bch2_target_to_text(&buf, c, opts->background_target); @@ -627,6 +786,14 @@ static struct bkey_s_c next_rebalance_extent(struct btree_trans *trans, prt_newline(&buf); } + if (csum_ptrs) { + prt_str(&buf, "csum="); + bch2_prt_csum_opt(&buf, opts->data_checksum); + prt_str(&buf, " "); + bch2_prt_u64_base2(&buf, csum_ptrs); + prt_newline(&buf); + } + trace_rebalance_extent(c, buf.buf); } count_event(c, rebalance_extent); @@ -690,6 +857,7 @@ out: static int do_rebalance_scan_indirect(struct btree_trans *trans, struct bkey_s_c_reflink_p p, + struct per_snapshot_io_opts *snapshot_io_opts, struct bch_inode_opts *opts) { u64 idx = REFLINK_P_IDX(p.v) - le32_to_cpu(p.v->front_pad); @@ -702,7 +870,7 @@ static int do_rebalance_scan_indirect(struct btree_trans *trans, BTREE_ITER_not_extents, k, ({ if (bpos_ge(bkey_start_pos(k.k), POS(0, end))) break; - bch2_get_update_rebalance_opts(trans, opts, &iter, k, + bch2_get_update_rebalance_opts(trans, snapshot_io_opts, opts, &iter, k, SET_NEEDS_REBALANCE_opt_change_indirect); })); if (ret) @@ -750,7 +918,8 @@ static int do_rebalance_scan(struct moving_context *ctxt, (inum && k.k->type == KEY_TYPE_reflink_p && REFLINK_P_MAY_UPDATE_OPTIONS(bkey_s_c_to_reflink_p(k).v) - ? do_rebalance_scan_indirect(trans, bkey_s_c_to_reflink_p(k), opts) + ? 
do_rebalance_scan_indirect(trans, bkey_s_c_to_reflink_p(k), + snapshot_io_opts, opts) : 0); })); if (ret) @@ -1049,6 +1218,7 @@ int bch2_fs_rebalance_init(struct bch_fs *c) static int check_rebalance_work_one(struct btree_trans *trans, struct btree_iter *extent_iter, struct btree_iter *rebalance_iter, + struct per_snapshot_io_opts *snapshot_io_opts, struct bkey_buf *last_flushed) { struct bch_fs *c = trans->c; @@ -1089,8 +1259,7 @@ static int check_rebalance_work_one(struct btree_trans *trans, extent_k.k = &deleted; } - bool should_have_rebalance = - bch2_bkey_sectors_need_rebalance(c, extent_k) != 0; + bool should_have_rebalance = bch2_bkey_needs_rb(extent_k); bool have_rebalance = rebalance_k.k->type == KEY_TYPE_set; if (should_have_rebalance != have_rebalance) { @@ -1119,6 +1288,21 @@ static int check_rebalance_work_one(struct btree_trans *trans, return ret; } + struct bch_inode_opts *opts = bch2_extent_get_apply_io_opts(trans, + snapshot_io_opts, extent_iter->pos, extent_iter, extent_k, + SET_NEEDS_REBALANCE_other); + ret = PTR_ERR_OR_ZERO(opts); + if (ret == -BCH_ERR_transaction_restart_commit) { + /* + * If get_apply_io_opts() did work, just advance and check the + * next key; it may have updated the rebalance_work btree so + * we'd need a write buffer flush to check what it just did. + */ + ret = 0; + } + if (ret) + return ret; + if (cmp <= 0) bch2_btree_iter_advance(extent_iter); if (cmp >= 0) @@ -1131,10 +1315,14 @@ int bch2_check_rebalance_work(struct bch_fs *c) { CLASS(btree_trans, trans)(c); CLASS(btree_iter, extent_iter)(trans, BTREE_ID_reflink, POS_MIN, + BTREE_ITER_not_extents| BTREE_ITER_prefetch); CLASS(btree_iter, rebalance_iter)(trans, BTREE_ID_rebalance_work, POS_MIN, BTREE_ITER_prefetch); + struct per_snapshot_io_opts snapshot_io_opts; + per_snapshot_io_opts_init(&snapshot_io_opts, c); + struct bkey_buf last_flushed; bch2_bkey_buf_init(&last_flushed); bkey_init(&last_flushed.k->k); @@ -1148,12 +1336,15 @@ int bch2_check_rebalance_work(struct bch_fs *c) bch2_trans_begin(trans); - ret = check_rebalance_work_one(trans, &extent_iter, &rebalance_iter, &last_flushed); + ret = check_rebalance_work_one(trans, &extent_iter, &rebalance_iter, + &snapshot_io_opts, &last_flushed) ?: + bch2_trans_commit(trans, NULL, NULL, 0); if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) ret = 0; } + per_snapshot_io_opts_exit(&snapshot_io_opts); bch2_bkey_buf_exit(&last_flushed, c); return ret < 0 ? 
ret : 0; } diff --git a/fs/bcachefs/rebalance.h b/fs/bcachefs/rebalance.h index 24bafa42f070..f40f670af046 100644 --- a/fs/bcachefs/rebalance.h +++ b/fs/bcachefs/rebalance.h @@ -7,10 +7,14 @@ #include "opts.h" #include "rebalance_types.h" +int bch2_extent_rebalance_validate(struct bch_fs *, struct bkey_s_c, + struct bkey_validate_context, + const struct bch_extent_rebalance *); + static inline struct bch_extent_rebalance io_opts_to_rebalance_opts(struct bch_fs *c, struct bch_inode_opts *opts) { - struct bch_extent_rebalance r = { + return (struct bch_extent_rebalance) { .type = BIT(BCH_EXTENT_ENTRY_rebalance), #define x(_name) \ ._name = opts->_name, \ @@ -18,22 +22,36 @@ static inline struct bch_extent_rebalance io_opts_to_rebalance_opts(struct bch_f BCH_REBALANCE_OPTS() #undef x }; - - if (r.background_target && - !bch2_target_accepts_data(c, BCH_DATA_user, r.background_target)) - r.background_target = 0; - - return r; }; void bch2_extent_rebalance_to_text(struct printbuf *, struct bch_fs *, const struct bch_extent_rebalance *); -int bch2_trigger_extent_rebalance(struct btree_trans *, - struct bkey_s_c, struct bkey_s_c, - enum btree_iter_update_trigger_flags); +const struct bch_extent_rebalance *bch2_bkey_rebalance_opts(struct bkey_s_c); + +static inline int bch2_bkey_needs_rb(struct bkey_s_c k) +{ + const struct bch_extent_rebalance *r = bch2_bkey_rebalance_opts(k); + return r ? r->need_rb : 0; +} + +int __bch2_trigger_extent_rebalance(struct btree_trans *, + struct bkey_s_c, struct bkey_s_c, + unsigned, unsigned, + enum btree_iter_update_trigger_flags); -u64 bch2_bkey_sectors_need_rebalance(struct bch_fs *, struct bkey_s_c); +static inline int bch2_trigger_extent_rebalance(struct btree_trans *trans, + struct bkey_s_c old, struct bkey_s_c new, + enum btree_iter_update_trigger_flags flags) +{ + unsigned old_r = bch2_bkey_needs_rb(old); + unsigned new_r = bch2_bkey_needs_rb(new); + + return old_r != new_r || + (old.k->size != new.k->size && (old_r|new_r)) + ? 
__bch2_trigger_extent_rebalance(trans, old, new, old_r, new_r, flags) + : 0; +} enum set_needs_rebalance_ctx { SET_NEEDS_REBALANCE_opt_change, @@ -42,9 +60,6 @@ enum set_needs_rebalance_ctx { SET_NEEDS_REBALANCE_other, }; -int bch2_bkey_set_needs_rebalance(struct bch_fs *, struct bch_inode_opts *, - struct bkey_i *, enum set_needs_rebalance_ctx, u32); - /* Inodes in different snapshots may have different IO options: */ struct snapshot_io_opts_entry { u32 snapshot; @@ -53,6 +68,9 @@ struct snapshot_io_opts_entry { struct per_snapshot_io_opts { u64 cur_inum; + bool fs_scan_cookie; + bool inum_scan_cookie; + struct bch_inode_opts fs_io_opts; DARRAY(struct snapshot_io_opts_entry) d; }; @@ -68,6 +86,10 @@ static inline void per_snapshot_io_opts_exit(struct per_snapshot_io_opts *io_opt darray_exit(&io_opts->d); } +int bch2_bkey_set_needs_rebalance(struct btree_trans *, + struct per_snapshot_io_opts *, struct bch_inode_opts *, + struct bkey_i *, enum set_needs_rebalance_ctx, u32); + struct bch_inode_opts *bch2_extent_get_apply_io_opts(struct btree_trans *, struct per_snapshot_io_opts *, struct bpos, struct btree_iter *, struct bkey_s_c, diff --git a/fs/bcachefs/rebalance_format.h b/fs/bcachefs/rebalance_format.h index ff9a1342a22b..d7a5f899e789 100644 --- a/fs/bcachefs/rebalance_format.h +++ b/fs/bcachefs/rebalance_format.h @@ -5,49 +5,76 @@ struct bch_extent_rebalance { #if defined(__LITTLE_ENDIAN_BITFIELD) __u64 type:6, - unused:3, + unused:5, + hipri:1, + pending:1, + need_rb:5, - promote_target_from_inode:1, - erasure_code_from_inode:1, + data_replicas_from_inode:1, data_checksum_from_inode:1, + erasure_code_from_inode:1, background_compression_from_inode:1, - data_replicas_from_inode:1, background_target_from_inode:1, + promote_target_from_inode:1, - promote_target:16, - erasure_code:1, + data_replicas:3, data_checksum:4, - data_replicas:4, + erasure_code:1, background_compression:8, /* enum bch_compression_opt */ - background_target:16; + background_target:12, + promote_target:12; #elif defined (__BIG_ENDIAN_BITFIELD) - __u64 background_target:16, + __u64 promote_target:12, + background_target:12, background_compression:8, - data_replicas:4, - data_checksum:4, erasure_code:1, - promote_target:16, + data_checksum:4, + data_replicas:3, + promote_target_from_inode:1, background_target_from_inode:1, - data_replicas_from_inode:1, background_compression_from_inode:1, - data_checksum_from_inode:1, erasure_code_from_inode:1, - promote_target_from_inode:1, + data_checksum_from_inode:1, + data_replicas_from_inode:1, - unused:3, + need_rb:5, + pending:1, + hipri:1, + unused:5, type:6; #endif }; /* subset of BCH_INODE_OPTS */ #define BCH_REBALANCE_OPTS() \ + x(data_replicas) \ x(data_checksum) \ + x(erasure_code) \ x(background_compression) \ + x(background_target) \ + x(promote_target) + +enum bch_rebalance_opts { +#define x(n) BCH_REBALANCE_##n, + BCH_REBALANCE_OPTS() +#undef x +}; + +#define BCH_REBALANCE_ACCOUNTING() \ x(data_replicas) \ - x(promote_target) \ + x(data_checksum) \ + x(erasure_code) \ + x(background_compression) \ x(background_target) \ - x(erasure_code) + x(high_priority) \ + x(pending) \ + +enum bch_rebalance_accounting_type { +#define x(n) BCH_REBALANCE_ACCOUNTING_##n, + BCH_REBALANCE_ACCOUNTING() +#undef x +}; #endif /* _BCACHEFS_REBALANCE_FORMAT_H */ diff --git a/fs/bcachefs/sb-downgrade.c b/fs/bcachefs/sb-downgrade.c index bfd06fd5d506..66b7f19f0437 100644 --- a/fs/bcachefs/sb-downgrade.c +++ b/fs/bcachefs/sb-downgrade.c @@ -107,7 +107,10 @@ 
BCH_FSCK_ERR_inode_parent_has_case_insensitive_not_set)\ x(btree_node_accounting, \ BIT_ULL(BCH_RECOVERY_PASS_check_allocations), \ - BCH_FSCK_ERR_accounting_mismatch) + BCH_FSCK_ERR_accounting_mismatch) \ + x(rebalance_v2, \ + BIT_ULL(BCH_RECOVERY_PASS_check_rebalance_work), \ + BCH_FSCK_ERR_extent_io_opts_not_set) #define DOWNGRADE_TABLE() \ x(bucket_stripe_sectors, \ diff --git a/fs/bcachefs/sb-errors_format.h b/fs/bcachefs/sb-errors_format.h index 77e3fc92e39b..9ec2df6c8071 100644 --- a/fs/bcachefs/sb-errors_format.h +++ b/fs/bcachefs/sb-errors_format.h @@ -159,6 +159,8 @@ enum bch_fsck_flags { x(extent_ptrs_redundant_stripe, 139, 0) \ x(extent_ptrs_unwritten, 140, 0) \ x(extent_ptrs_written_and_unwritten, 141, 0) \ + x(extent_rebalance_bad_pending, 330, 0) \ + x(extent_rebalance_bad_hipri, 331, 0) \ x(ptr_to_invalid_device, 142, 0) \ x(ptr_to_removed_device, 322, FSCK_AUTOFIX) \ x(ptr_to_duplicate_device, 143, 0) \ @@ -339,7 +341,9 @@ enum bch_fsck_flags { x(dirent_stray_data_after_cf_name, 305, 0) \ x(rebalance_work_incorrectly_set, 309, FSCK_AUTOFIX) \ x(rebalance_work_incorrectly_unset, 310, FSCK_AUTOFIX) \ - x(MAX, 328, 0) + x(extent_io_opts_not_set, 328, FSCK_AUTOFIX) \ + x(extent_io_opts_unneeded, 329, FSCK_AUTOFIX) \ + x(MAX, 332, 0) enum bch_sb_error_id { #define x(t, n, ...) BCH_FSCK_ERR_##t = n, diff --git a/fs/bcachefs/sysfs.c b/fs/bcachefs/sysfs.c index 40adefe7170f..ef6312c50f88 100644 --- a/fs/bcachefs/sysfs.c +++ b/fs/bcachefs/sysfs.c @@ -45,6 +45,7 @@ #include <linux/blkdev.h> #include <linux/sort.h> +#include <linux/string_choices.h> #include <linux/sched/clock.h> #include "util.h" @@ -157,6 +158,7 @@ write_attribute(trigger_recalc_capacity); write_attribute(trigger_delete_dead_snapshots); write_attribute(trigger_emergency_read_only); read_attribute(gc_gens_pos); +__sysfs_attribute(read_fua_test, 0400); read_attribute(uuid); read_attribute(minor); @@ -304,6 +306,116 @@ static void bch2_fs_usage_base_to_text(struct printbuf *out, struct bch_fs *c) prt_printf(out, "reserved:\t\t%llu\n", b.reserved); } +static int bch2_read_fua_test(struct printbuf *out, struct bch_dev *ca) +{ + struct bch_fs *c = ca->fs; + struct bio *bio = NULL; + void *buf = NULL; + unsigned bs = c->opts.block_size, iters; + u64 end, test_duration = NSEC_PER_SEC * 2; + struct bch2_time_stats stats_nofua, stats_fua, stats_random; + int ret = 0; + + bch2_time_stats_init_no_pcpu(&stats_nofua); + bch2_time_stats_init_no_pcpu(&stats_fua); + bch2_time_stats_init_no_pcpu(&stats_random); + + if (!bch2_dev_get_ioref(c, ca->dev_idx, READ, BCH_DEV_READ_REF_read_fua_test)) { + prt_str(out, "offline\n"); + return 0; + } + + struct block_device *bdev = ca->disk_sb.bdev; + + bio = bio_kmalloc(1, GFP_KERNEL); + if (!bio) { + ret = -ENOMEM; + goto err; + } + + buf = kmalloc(bs, GFP_KERNEL); + if (!buf) + goto err; + + end = ktime_get_ns() + test_duration; + for (iters = 0; iters < 1000 && time_before64(ktime_get_ns(), end); iters++) { + bio_init(bio, bdev, bio->bi_inline_vecs, 1, READ); + bch2_bio_map(bio, buf, bs); + + u64 submit_time = ktime_get_ns(); + ret = submit_bio_wait(bio); + bch2_time_stats_update(&stats_nofua, submit_time); + + if (ret) + goto err; + } + + end = ktime_get_ns() + test_duration; + for (iters = 0; iters < 1000 && time_before64(ktime_get_ns(), end); iters++) { + bio_init(bio, bdev, bio->bi_inline_vecs, 1, REQ_FUA|READ); + bch2_bio_map(bio, buf, bs); + + u64 submit_time = ktime_get_ns(); + ret = submit_bio_wait(bio); + bch2_time_stats_update(&stats_fua, submit_time); + + if (ret) + goto err; + } 
+ + u64 dev_size = ca->mi.nbuckets * bucket_bytes(ca); + + end = ktime_get_ns() + test_duration; + for (iters = 0; iters < 1000 && time_before64(ktime_get_ns(), end); iters++) { + bio_init(bio, bdev, bio->bi_inline_vecs, 1, READ); + bio->bi_iter.bi_sector = (bch2_get_random_u64_below(dev_size) & ~((u64) bs - 1)) >> 9; + bch2_bio_map(bio, buf, bs); + + u64 submit_time = ktime_get_ns(); + ret = submit_bio_wait(bio); + bch2_time_stats_update(&stats_random, submit_time); + + if (ret) + goto err; + } + + u64 ns_nofua = mean_and_variance_get_mean(stats_nofua.duration_stats); + u64 ns_fua = mean_and_variance_get_mean(stats_fua.duration_stats); + u64 ns_rand = mean_and_variance_get_mean(stats_random.duration_stats); + + u64 stddev_nofua = mean_and_variance_get_stddev(stats_nofua.duration_stats); + u64 stddev_fua = mean_and_variance_get_stddev(stats_fua.duration_stats); + u64 stddev_rand = mean_and_variance_get_stddev(stats_random.duration_stats); + + printbuf_tabstop_push(out, 8); + printbuf_tabstop_push(out, 12); + printbuf_tabstop_push(out, 12); + prt_printf(out, "This test must be run on an idle drive for accurate results\n"); + prt_printf(out, "%s\n", dev_name(&ca->disk_sb.bdev->bd_device)); + prt_printf(out, "fua support advertized: %s\n", str_yes_no(bdev_fua(bdev))); + prt_newline(out); + prt_printf(out, "ns:\tlatency\rstddev\r\n"); + prt_printf(out, "nofua\t%llu\r%llu\r\n", ns_nofua, stddev_nofua); + prt_printf(out, "fua\t%llu\r%llu\r\n", ns_fua, stddev_fua); + prt_printf(out, "random\t%llu\r%llu\r\n", ns_rand, stddev_rand); + + bool read_cache = ns_nofua * 2 < ns_rand; + bool fua_cached = read_cache && ns_fua < (ns_nofua + ns_rand) / 2; + + if (!read_cache) + prt_str(out, "reads don't appear to be cached - safe\n"); + else if (!fua_cached) + prt_str(out, "fua reads don't appear to be cached - safe\n"); + else + prt_str(out, "fua reads appear to be cached - unsafe\n"); +err: + kfree(buf); + kfree(bio); + enumerated_ref_put(&ca->io_ref[READ], BCH_DEV_READ_REF_read_fua_test); + bch_err_fn(c, ret); + return ret; +} + SHOW(bch2_fs) { struct bch_fs *c = container_of(kobj, struct bch_fs, kobj); @@ -847,6 +959,9 @@ SHOW(bch2_dev) if (attr == &sysfs_open_buckets) bch2_open_buckets_to_text(out, c, ca); + if (attr == &sysfs_read_fua_test) + return bch2_read_fua_test(out, ca); + int opt_id = bch2_opt_lookup(attr->name); if (opt_id >= 0) return sysfs_opt_show(c, ca, opt_id, out); @@ -911,6 +1026,8 @@ struct attribute *bch2_dev_files[] = { &sysfs_congested, #endif + &sysfs_read_fua_test, + /* debug: */ &sysfs_alloc_debug, &sysfs_open_buckets, diff --git a/fs/bcachefs/trace.h b/fs/bcachefs/trace.h index 6c312fd9a447..c5d7be2eba03 100644 --- a/fs/bcachefs/trace.h +++ b/fs/bcachefs/trace.h @@ -1339,11 +1339,6 @@ DEFINE_EVENT(fs_str, io_move_pred, TP_ARGS(c, str) ); -DEFINE_EVENT(fs_str, io_move_created_rebalance, - TP_PROTO(struct bch_fs *c, const char *str), - TP_ARGS(c, str) -); - DEFINE_EVENT(fs_str, io_move_evacuate_bucket, TP_PROTO(struct bch_fs *c, const char *str), TP_ARGS(c, str) diff --git a/fs/bcachefs/util.h b/fs/bcachefs/util.h index 52ac8230be9f..555e0d8f3cf0 100644 --- a/fs/bcachefs/util.h +++ b/fs/bcachefs/util.h @@ -4,7 +4,6 @@ #include <linux/bio.h> #include <linux/blkdev.h> -#include <linux/closure.h> #include <linux/errno.h> #include <linux/freezer.h> #include <linux/kernel.h> @@ -21,6 +20,7 @@ #include <linux/vmalloc.h> #include <linux/workqueue.h> +#include "closure.h" #include "mean_and_variance.h" #include "darray.h" diff --git a/fs/bcachefs/vendor/closure.c 
b/fs/bcachefs/vendor/closure.c new file mode 100644 index 000000000000..bdafd3a57386 --- /dev/null +++ b/fs/bcachefs/vendor/closure.c @@ -0,0 +1,218 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Asynchronous refcounty things + * + * Copyright 2010, 2011 Kent Overstreet <kent.overstreet@gmail.com> + * Copyright 2012 Google, Inc. + */ + +#include "closure.h" +#include <linux/debugfs.h> +#include <linux/export.h> +#include <linux/rcupdate.h> +#include <linux/seq_file.h> +#include <linux/sched/debug.h> + +static void closure_val_checks(struct closure *cl, unsigned new, int d) +{ + unsigned count = new & CLOSURE_REMAINING_MASK; + + if (WARN(new & CLOSURE_GUARD_MASK, + "closure %ps has guard bits set: %x (%u), delta %i", + cl->fn, + new, (unsigned) __fls(new & CLOSURE_GUARD_MASK), d)) + new &= ~CLOSURE_GUARD_MASK; + + WARN(!count && (new & ~(CLOSURE_DESTRUCTOR|CLOSURE_SLEEPING)), + "closure %ps ref hit 0 with incorrect flags set: %x (%u)", + cl->fn, + new, (unsigned) __fls(new)); +} + +enum new_closure_state { + CLOSURE_normal_put, + CLOSURE_requeue, + CLOSURE_done, +}; + +/* For clearing flags with the same atomic op as a put */ +void bch2_closure_sub(struct closure *cl, int v) +{ + enum new_closure_state s; + struct task_struct *sleeper; + + /* rcu_read_lock, atomic_read_acquire() are both for cl->sleeper: */ + guard(rcu)(); + + int old = atomic_read_acquire(&cl->remaining), new; + do { + new = old - v; + + if (new & CLOSURE_REMAINING_MASK) { + s = CLOSURE_normal_put; + } else { + if ((cl->fn || (new & CLOSURE_SLEEPING)) && + !(new & CLOSURE_DESTRUCTOR)) { + s = CLOSURE_requeue; + new += CLOSURE_REMAINING_INITIALIZER; + } else + s = CLOSURE_done; + + sleeper = new & CLOSURE_SLEEPING ? cl->sleeper : NULL; + new &= ~CLOSURE_SLEEPING; + } + + closure_val_checks(cl, new, -v); + } while (!atomic_try_cmpxchg_release(&cl->remaining, &old, new)); + + if (s == CLOSURE_normal_put) + return; + + if (sleeper) { + smp_mb(); + wake_up_process(sleeper); + return; + } + + if (s == CLOSURE_requeue) { + closure_queue(cl); + } else { + struct closure *parent = cl->parent; + closure_fn *destructor = cl->fn; + + closure_debug_destroy(cl); + + if (destructor) + destructor(&cl->work); + + if (parent) + closure_put(parent); + } +} + +/* + * closure_wake_up - wake up all closures on a wait list, without memory barrier + */ +void __bch2_closure_wake_up(struct closure_waitlist *wait_list) +{ + struct llist_node *list; + struct closure *cl, *t; + struct llist_node *reverse = NULL; + + list = llist_del_all(&wait_list->list); + + /* We first reverse the list to preserve FIFO ordering and fairness */ + reverse = llist_reverse_order(list); + + /* Then do the wakeups */ + llist_for_each_entry_safe(cl, t, reverse, list) { + closure_set_waiting(cl, 0); + bch2_closure_sub(cl, CLOSURE_WAITING + 1); + } +} + +/** + * closure_wait - add a closure to a waitlist + * @waitlist: will own a ref on @cl, which will be released when + * closure_wake_up() is called on @waitlist. + * @cl: closure pointer. 
+ * + */ +bool bch2_closure_wait(struct closure_waitlist *waitlist, struct closure *cl) +{ + if (atomic_read(&cl->remaining) & CLOSURE_WAITING) + return false; + + closure_set_waiting(cl, _RET_IP_); + unsigned r = atomic_add_return(CLOSURE_WAITING + 1, &cl->remaining); + closure_val_checks(cl, r, CLOSURE_WAITING + 1); + + llist_add(&cl->list, &waitlist->list); + + return true; +} + +void __sched __bch2_closure_sync(struct closure *cl) +{ + cl->sleeper = current; + bch2_closure_sub(cl, + CLOSURE_REMAINING_INITIALIZER - + CLOSURE_SLEEPING); + + while (1) { + set_current_state(TASK_UNINTERRUPTIBLE); + if (!(atomic_read(&cl->remaining) & CLOSURE_SLEEPING)) + break; + schedule(); + } + + __set_current_state(TASK_RUNNING); +} + +/* + * closure_return_sync - finish running a closure, synchronously (i.e. waiting + * for outstanding get()s to finish) and returning once closure refcount is 0. + * + * Unlike closure_sync() this doesn't reinit the ref to 1; subsequent + * closure_get_not_zero() calls will fail. + */ +void __sched bch2_closure_return_sync(struct closure *cl) +{ + cl->sleeper = current; + bch2_closure_sub(cl, + CLOSURE_REMAINING_INITIALIZER - + CLOSURE_DESTRUCTOR - + CLOSURE_SLEEPING); + + while (1) { + set_current_state(TASK_UNINTERRUPTIBLE); + if (!(atomic_read(&cl->remaining) & CLOSURE_SLEEPING)) + break; + schedule(); + } + + __set_current_state(TASK_RUNNING); + + if (cl->parent) + closure_put(cl->parent); +} + +int __sched __bch2_closure_sync_timeout(struct closure *cl, unsigned long timeout) +{ + int ret = 0; + + cl->sleeper = current; + bch2_closure_sub(cl, + CLOSURE_REMAINING_INITIALIZER - + CLOSURE_SLEEPING); + + while (1) { + set_current_state(TASK_UNINTERRUPTIBLE); + /* + * Carefully undo the continue_at() - but only if it + * hasn't completed, i.e. the final closure_put() hasn't + * happened yet: + */ + unsigned old = atomic_read(&cl->remaining), new; + if (!(old & CLOSURE_SLEEPING)) + goto success; + + if (!timeout) { + do { + if (!(old & CLOSURE_SLEEPING)) + goto success; + + new = old + CLOSURE_REMAINING_INITIALIZER - CLOSURE_SLEEPING; + closure_val_checks(cl, new, CLOSURE_REMAINING_INITIALIZER - CLOSURE_SLEEPING); + } while (!atomic_try_cmpxchg(&cl->remaining, &old, new)); + + ret = -ETIME; + break; + } + + timeout = schedule_timeout(timeout); + } +success: + __set_current_state(TASK_RUNNING); + return ret; +} diff --git a/fs/bcachefs/vendor/closure.h b/fs/bcachefs/vendor/closure.h new file mode 100644 index 000000000000..79112efe30a7 --- /dev/null +++ b/fs/bcachefs/vendor/closure.h @@ -0,0 +1,490 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _LINUX_CLOSURE_H +#define _LINUX_CLOSURE_H + +#include <linux/llist.h> +#include <linux/sched.h> +#include <linux/sched/task_stack.h> +#include <linux/workqueue.h> + +/* + * Closure is perhaps the most overused and abused term in computer science, but + * since I've been unable to come up with anything better you're stuck with it + * again. + * + * What are closures? + * + * They embed a refcount. The basic idea is they count "things that are in + * progress" - in flight bios, some other thread that's doing something else - + * anything you might want to wait on. + * + * The refcount may be manipulated with closure_get() and closure_put(). + * closure_put() is where many of the interesting things happen, when it causes + * the refcount to go to 0. + * + * Closures can be used to wait on things both synchronously and asynchronously, + * and synchronous and asynchronous use can be mixed without restriction. 
To + * wait synchronously, use closure_sync() - you will sleep until your closure's + * refcount hits 1. + * + * To wait asynchronously, use + * continue_at(cl, next_function, workqueue); + * + * passing it, as you might expect, the function to run when nothing is pending + * and the workqueue to run that function out of. + * + * continue_at() also, critically, requires a 'return' immediately following the + * location where this macro is referenced, to return to the calling function. + * There's good reason for this. + * + * To use safely closures asynchronously, they must always have a refcount while + * they are running owned by the thread that is running them. Otherwise, suppose + * you submit some bios and wish to have a function run when they all complete: + * + * foo_endio(struct bio *bio) + * { + * closure_put(cl); + * } + * + * closure_init(cl); + * + * do_stuff(); + * closure_get(cl); + * bio1->bi_endio = foo_endio; + * bio_submit(bio1); + * + * do_more_stuff(); + * closure_get(cl); + * bio2->bi_endio = foo_endio; + * bio_submit(bio2); + * + * continue_at(cl, complete_some_read, system_wq); + * + * If closure's refcount started at 0, complete_some_read() could run before the + * second bio was submitted - which is almost always not what you want! More + * importantly, it wouldn't be possible to say whether the original thread or + * complete_some_read()'s thread owned the closure - and whatever state it was + * associated with! + * + * So, closure_init() initializes a closure's refcount to 1 - and when a + * closure_fn is run, the refcount will be reset to 1 first. + * + * Then, the rule is - if you got the refcount with closure_get(), release it + * with closure_put() (i.e, in a bio->bi_endio function). If you have a refcount + * on a closure because you called closure_init() or you were run out of a + * closure - _always_ use continue_at(). Doing so consistently will help + * eliminate an entire class of particularly pernicious races. + * + * Lastly, you might have a wait list dedicated to a specific event, and have no + * need for specifying the condition - you just want to wait until someone runs + * closure_wake_up() on the appropriate wait list. In that case, just use + * closure_wait(). It will return either true or false, depending on whether the + * closure was already on a wait list or not - a closure can only be on one wait + * list at a time. + * + * Parents: + * + * closure_init() takes two arguments - it takes the closure to initialize, and + * a (possibly null) parent. + * + * If parent is non null, the new closure will have a refcount for its lifetime; + * a closure is considered to be "finished" when its refcount hits 0 and the + * function to run is null. Hence + * + * continue_at(cl, NULL, NULL); + * + * returns up the (spaghetti) stack of closures, precisely like normal return + * returns up the C stack. continue_at() with non null fn is better thought of + * as doing a tail call. + * + * All this implies that a closure should typically be embedded in a particular + * struct (which its refcount will normally control the lifetime of), and that + * struct can very much be thought of as a stack frame. + */ + +struct closure; +struct closure_syncer; +typedef void (closure_fn) (struct work_struct *); +extern struct dentry *bcache_debug; + +struct closure_waitlist { + struct llist_head list; +}; + +enum closure_state { + /* + * CLOSURE_WAITING: Set iff the closure is on a waitlist. 
Must be set by + * the thread that owns the closure, and cleared by the thread that's + * waking up the closure. + * + * The rest are for debugging and don't affect behaviour: + * + * CLOSURE_RUNNING: Set when a closure is running (i.e. by + * closure_init() and when closure_put() runs the next function), and + * must be cleared before remaining hits 0. Primarily to help guard + * against incorrect usage and accidentally transferring references. + * continue_at() and closure_return() clear it for you, if you're doing + * something unusual you can use closure_set_dead() which also helps + * annotate where references are being transferred. + */ + + CLOSURE_BITS_START = (1U << 24), + CLOSURE_DESTRUCTOR = (1U << 24), + CLOSURE_SLEEPING = (1U << 26), + CLOSURE_WAITING = (1U << 28), + CLOSURE_RUNNING = (1U << 30), +}; + +#define CLOSURE_GUARD_MASK \ + (((CLOSURE_DESTRUCTOR|CLOSURE_SLEEPING|CLOSURE_WAITING|CLOSURE_RUNNING) << 1)|(CLOSURE_BITS_START >> 1)) + +#define CLOSURE_REMAINING_MASK (CLOSURE_BITS_START - 1) +#define CLOSURE_REMAINING_INITIALIZER (1|CLOSURE_RUNNING) + +struct closure { + union { + struct { + struct workqueue_struct *wq; + struct task_struct *sleeper; + struct llist_node list; + closure_fn *fn; + }; + struct work_struct work; + }; + + struct closure *parent; + + atomic_t remaining; + +#ifdef CONFIG_DEBUG_CLOSURES +#define CLOSURE_MAGIC_DEAD 0xc054dead +#define CLOSURE_MAGIC_ALIVE 0xc054a11e +#define CLOSURE_MAGIC_STACK 0xc05451cc + + unsigned int magic; + struct list_head all; + unsigned long ip; + unsigned long waiting_on; +#endif +}; + +void bch2_closure_sub(struct closure *cl, int v); +void __bch2_closure_wake_up(struct closure_waitlist *list); +bool bch2_closure_wait(struct closure_waitlist *list, struct closure *cl); +void __bch2_closure_sync(struct closure *cl); + +/* + * closure_put - decrement a closure's refcount + */ +static inline void closure_put(struct closure *cl) +{ + bch2_closure_sub(cl, 1); +} + +static inline unsigned closure_nr_remaining(struct closure *cl) +{ + return atomic_read(&cl->remaining) & CLOSURE_REMAINING_MASK; +} + +/** + * closure_sync - sleep until a closure has nothing left to wait on + * + * Sleeps until the refcount hits 1 - the thread that's running the closure owns + * the last refcount. + */ +static inline void closure_sync(struct closure *cl) +{ + if (closure_nr_remaining(cl) > 1) + __bch2_closure_sync(cl); +} + +int __bch2_closure_sync_timeout(struct closure *cl, unsigned long timeout); + +static inline int closure_sync_timeout(struct closure *cl, unsigned long timeout) +{ + return closure_nr_remaining(cl) > 1 + ? 
__bch2_closure_sync_timeout(cl, timeout) + : 0; +} + +//#ifdef CONFIG_DEBUG_CLOSURES +#if 0 + +void bch2_closure_debug_create(struct closure *cl); +void closure_debug_destroy(struct closure *cl); + +#else + +static inline void bch2_closure_debug_create(struct closure *cl) {} +static inline void closure_debug_destroy(struct closure *cl) {} + +#endif + +static inline void closure_set_ip(struct closure *cl) +{ +#ifdef CONFIG_DEBUG_CLOSURES + cl->ip = _THIS_IP_; +#endif +} + +static inline void closure_set_ret_ip(struct closure *cl) +{ +#ifdef CONFIG_DEBUG_CLOSURES + cl->ip = _RET_IP_; +#endif +} + +static inline void closure_set_waiting(struct closure *cl, unsigned long f) +{ +#ifdef CONFIG_DEBUG_CLOSURES + cl->waiting_on = f; +#endif +} + +static inline void closure_set_stopped(struct closure *cl) +{ + atomic_sub(CLOSURE_RUNNING, &cl->remaining); +} + +static inline void set_closure_fn(struct closure *cl, closure_fn *fn, + struct workqueue_struct *wq) +{ + closure_set_ip(cl); + cl->fn = fn; + cl->wq = wq; +} + +static inline void closure_queue(struct closure *cl) +{ + struct workqueue_struct *wq = cl->wq; + /** + * Changes made to closure, work_struct, or a couple of other structs + * may cause work.func not pointing to the right location. + */ + BUILD_BUG_ON(offsetof(struct closure, fn) + != offsetof(struct work_struct, func)); + + if (wq) { + INIT_WORK(&cl->work, cl->work.func); + BUG_ON(!queue_work(wq, &cl->work)); + } else + cl->fn(&cl->work); +} + +/** + * closure_get - increment a closure's refcount + */ +static inline void closure_get(struct closure *cl) +{ +#ifdef CONFIG_DEBUG_CLOSURES + BUG_ON((atomic_inc_return(&cl->remaining) & + CLOSURE_REMAINING_MASK) <= 1); +#else + atomic_inc(&cl->remaining); +#endif +} + +/** + * closure_get_not_zero + */ +static inline bool closure_get_not_zero(struct closure *cl) +{ + unsigned old = atomic_read(&cl->remaining); + do { + if (!(old & CLOSURE_REMAINING_MASK)) + return false; + + } while (!atomic_try_cmpxchg_acquire(&cl->remaining, &old, old + 1)); + + return true; +} + +/** + * closure_init - Initialize a closure, setting the refcount to 1 + * @cl: closure to initialize + * @parent: parent of the new closure. cl will take a refcount on it for its + * lifetime; may be NULL. 
+ */ +static inline void closure_init(struct closure *cl, struct closure *parent) +{ + cl->fn = NULL; + cl->parent = parent; + if (parent) + closure_get(parent); + + atomic_set(&cl->remaining, CLOSURE_REMAINING_INITIALIZER); + + bch2_closure_debug_create(cl); + closure_set_ip(cl); +} + +static inline void closure_init_stack(struct closure *cl) +{ + memset(cl, 0, sizeof(struct closure)); + atomic_set(&cl->remaining, CLOSURE_REMAINING_INITIALIZER); +#ifdef CONFIG_DEBUG_CLOSURES + cl->magic = CLOSURE_MAGIC_STACK; +#endif +} + +static inline void closure_init_stack_release(struct closure *cl) +{ + memset(cl, 0, sizeof(struct closure)); + atomic_set_release(&cl->remaining, CLOSURE_REMAINING_INITIALIZER); +#ifdef CONFIG_DEBUG_CLOSURES + cl->magic = CLOSURE_MAGIC_STACK; +#endif +} + +/** + * closure_wake_up - wake up all closures on a wait list, + * with memory barrier + */ +static inline void closure_wake_up(struct closure_waitlist *list) +{ + /* Memory barrier for the wait list */ + smp_mb(); + __bch2_closure_wake_up(list); +} + +#define CLOSURE_CALLBACK(name) void name(struct work_struct *ws) +#define closure_type(name, type, member) \ + struct closure *cl = container_of(ws, struct closure, work); \ + type *name = container_of(cl, type, member) + +/** + * continue_at - jump to another function with barrier + * + * After @cl is no longer waiting on anything (i.e. all outstanding refs have + * been dropped with closure_put()), it will resume execution at @fn running out + * of @wq (or, if @wq is NULL, @fn will be called by closure_put() directly). + * + * This is because after calling continue_at() you no longer have a ref on @cl, + * and whatever @cl owns may be freed out from under you - a running closure fn + * has a ref on its own closure which continue_at() drops. + * + * Note you are expected to immediately return after using this macro. + */ +#define continue_at(_cl, _fn, _wq) \ +do { \ + set_closure_fn(_cl, _fn, _wq); \ + bch2_closure_sub(_cl, CLOSURE_RUNNING + 1); \ +} while (0) + +/** + * closure_return - finish execution of a closure + * + * This is used to indicate that @cl is finished: when all outstanding refs on + * @cl have been dropped @cl's ref on its parent closure (as passed to + * closure_init()) will be dropped, if one was specified - thus this can be + * thought of as returning to the parent closure. + */ +#define closure_return(_cl) continue_at((_cl), NULL, NULL) + +void bch2_closure_return_sync(struct closure *cl); + +/** + * continue_at_nobarrier - jump to another function without barrier + * + * Causes @fn to be executed out of @cl, in @wq context (or called directly if + * @wq is NULL). + * + * The ref the caller of continue_at_nobarrier() had on @cl is now owned by @fn, + * thus it's not safe to touch anything protected by @cl after a + * continue_at_nobarrier(). + */ +#define continue_at_nobarrier(_cl, _fn, _wq) \ +do { \ + set_closure_fn(_cl, _fn, _wq); \ + closure_queue(_cl); \ +} while (0) + +/** + * closure_return_with_destructor - finish execution of a closure, + * with destructor + * + * Works like closure_return(), except @destructor will be called when all + * outstanding refs on @cl have been dropped; @destructor may be used to safely + * free the memory occupied by @cl, and it is called with the ref on the parent + * closure still held - so @destructor could safely return an item to a + * freelist protected by @cl's parent. 
+ */ +#define closure_return_with_destructor(_cl, _destructor) \ +do { \ + set_closure_fn(_cl, _destructor, NULL); \ + bch2_closure_sub(_cl, CLOSURE_RUNNING - CLOSURE_DESTRUCTOR + 1); \ +} while (0) + +/** + * closure_call - execute @fn out of a new, uninitialized closure + * + * Typically used when running out of one closure, and we want to run @fn + * asynchronously out of a new closure - @parent will then wait for @cl to + * finish. + */ +static inline void closure_call(struct closure *cl, closure_fn fn, + struct workqueue_struct *wq, + struct closure *parent) +{ + closure_init(cl, parent); + continue_at_nobarrier(cl, fn, wq); +} + +#define __closure_wait_event(waitlist, _cond) \ +do { \ + struct closure cl; \ + \ + closure_init_stack(&cl); \ + \ + while (1) { \ + bch2_closure_wait(waitlist, &cl); \ + if (_cond) \ + break; \ + closure_sync(&cl); \ + } \ + closure_wake_up(waitlist); \ + closure_sync(&cl); \ +} while (0) + +#define closure_wait_event(waitlist, _cond) \ +do { \ + if (!(_cond)) \ + __closure_wait_event(waitlist, _cond); \ +} while (0) + +#define __closure_wait_event_timeout(waitlist, _cond, _until) \ +({ \ + struct closure cl; \ + long _t; \ + \ + closure_init_stack(&cl); \ + \ + while (1) { \ + bch2_closure_wait(waitlist, &cl); \ + if (_cond) { \ + _t = max_t(long, 1L, _until - jiffies); \ + break; \ + } \ + _t = max_t(long, 0L, _until - jiffies); \ + if (!_t) \ + break; \ + closure_sync_timeout(&cl, _t); \ + } \ + closure_wake_up(waitlist); \ + closure_sync(&cl); \ + _t; \ +}) + +/* + * Returns 0 if timeout expired, remaining time in jiffies (at least 1) if + * condition became true + */ +#define closure_wait_event_timeout(waitlist, _cond, _timeout) \ +({ \ + unsigned long _until = jiffies + _timeout; \ + (_cond) \ + ? max_t(long, 1L, _until - jiffies) \ + : __closure_wait_event_timeout(waitlist, _cond, _until);\ +}) + +#endif /* _LINUX_CLOSURE_H */ diff --git a/include/linux/seq_buf.h b/include/linux/seq_buf.h index 52791e070506..c055243ec206 100644 --- a/include/linux/seq_buf.h +++ b/include/linux/seq_buf.h @@ -173,4 +173,8 @@ int seq_buf_bprintf(struct seq_buf *s, const char *fmt, const u32 *binary); void seq_buf_do_printk(struct seq_buf *s, const char *lvl); +enum string_size_units; +void seq_buf_human_readable_u64(struct seq_buf *s, u64 v, + const enum string_size_units units); + #endif /* _LINUX_SEQ_BUF_H */ diff --git a/include/linux/shrinker.h b/include/linux/shrinker.h index 1a00be90d93a..106622ddac77 100644 --- a/include/linux/shrinker.h +++ b/include/linux/shrinker.h @@ -24,6 +24,8 @@ struct shrinker_info { struct shrinker_info_unit *unit[]; }; +struct seq_buf; + /* * This struct is used to pass information from page reclaim to the shrinkers. * We consolidate the values for easier extension later. 
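Before moving on to the shrinker changes below, here is a brief, hypothetical usage sketch of the vendored closure interface above. It is illustrative only and not part of this series; batch_run(), batch_op_done(), flush_wait and the other names are invented. The pattern is the one the header documents: the submitter holds one ref per in-flight operation, each completion drops its ref with closure_put(), and closure_sync() or closure_wait_event() sleeps until everything has drained.

/*
 * Hypothetical caller of the vendored closure API, for illustration only;
 * none of these names appear in the patch itself.
 */
#include <linux/atomic.h>
#include "closure.h"	/* the vendored header above; adjust the include path as needed */

struct batch {
	struct closure	cl;	/* one ref per in-flight op, plus the submitter's */
	atomic_t	error;
};

/* Completion path for one async operation: record an error, drop our ref. */
static void batch_op_done(struct batch *b, int err)
{
	if (err)
		atomic_cmpxchg(&b->error, 0, err);
	closure_put(&b->cl);
}

/* Issue @nr operations and sleep until all of them have completed. */
static int batch_run(struct batch *b, unsigned nr)
{
	closure_init_stack(&b->cl);	/* refcount starts at 1, owned by us */
	atomic_set(&b->error, 0);

	for (unsigned i = 0; i < nr; i++) {
		closure_get(&b->cl);	/* taken before the op can complete */
		/* ... kick off async work that ends in batch_op_done() ... */
	}

	closure_sync(&b->cl);		/* sleep until only our ref remains */
	return atomic_read(&b->error);
}

/* The waitlist variant: sleep until another thread satisfies a condition. */
static struct closure_waitlist	flush_wait;
static atomic_t			flush_done;

static void wait_for_flush(void)
{
	closure_wait_event(&flush_wait, atomic_read(&flush_done));
}

static void flush_complete(void)
{
	atomic_set(&flush_done, 1);
	closure_wake_up(&flush_wait);
}

Note that closure_wake_up() already issues the smp_mb() needed to order the condition store against the waiter's check, so the producer side needs no explicit barrier.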
@@ -80,10 +82,12 @@ struct shrink_control { * @flags determine the shrinker abilities, like numa awareness */ struct shrinker { + const char *name; unsigned long (*count_objects)(struct shrinker *, struct shrink_control *sc); unsigned long (*scan_objects)(struct shrinker *, struct shrink_control *sc); + void (*to_text)(struct seq_buf *, struct shrinker *); long batch; /* reclaim batch size, 0 = default */ int seeks; /* seeks to recreate an obj */ @@ -110,11 +114,16 @@ struct shrinker { #endif #ifdef CONFIG_SHRINKER_DEBUG int debugfs_id; - const char *name; struct dentry *debugfs_entry; #endif /* objs pending delete, per node */ atomic_long_t *nr_deferred; + + atomic_long_t objects_requested_to_free; + atomic_long_t objects_freed; + unsigned long last_freed; /* timestamp, in jiffies */ + unsigned long last_scanned; /* timestamp, in jiffies */ + atomic64_t ns_run; }; #define DEFAULT_SEEKS 2 /* A good number if you don't know better. */ @@ -135,6 +144,8 @@ __printf(2, 3) struct shrinker *shrinker_alloc(unsigned int flags, const char *fmt, ...); void shrinker_register(struct shrinker *shrinker); void shrinker_free(struct shrinker *shrinker); +void shrinker_to_text(struct seq_buf *, struct shrinker *); +void shrinkers_to_text(struct seq_buf *); static inline bool shrinker_try_get(struct shrinker *shrinker) { diff --git a/lib/seq_buf.c b/lib/seq_buf.c index f3f3436d60a9..3c41ca83a0c3 100644 --- a/lib/seq_buf.c +++ b/lib/seq_buf.c @@ -436,3 +436,13 @@ int seq_buf_hex_dump(struct seq_buf *s, const char *prefix_str, int prefix_type, } return 0; } + +void seq_buf_human_readable_u64(struct seq_buf *s, u64 v, const enum string_size_units units) +{ + char *buf; + size_t size = seq_buf_get_buf(s, &buf); + int wrote = string_get_size(v, 1, units, buf, size); + + seq_buf_commit(s, wrote); +} +EXPORT_SYMBOL(seq_buf_human_readable_u64); diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 25923cfec9c6..ad1ac9be0db4 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -169,27 +169,6 @@ static bool oom_unkillable_task(struct task_struct *p) return false; } -/* - * Check whether unreclaimable slab amount is greater than - * all user memory(LRU pages). - * dump_unreclaimable_slab() could help in the case that - * oom due to too much unreclaimable slab used by kernel. 
-*/ -static bool should_dump_unreclaim_slab(void) -{ - unsigned long nr_lru; - - nr_lru = global_node_page_state(NR_ACTIVE_ANON) + - global_node_page_state(NR_INACTIVE_ANON) + - global_node_page_state(NR_ACTIVE_FILE) + - global_node_page_state(NR_INACTIVE_FILE) + - global_node_page_state(NR_ISOLATED_ANON) + - global_node_page_state(NR_ISOLATED_FILE) + - global_node_page_state(NR_UNEVICTABLE); - - return (global_node_page_state_pages(NR_SLAB_UNRECLAIMABLE_B) > nr_lru); -} - /** * oom_badness - heuristic function to determine which candidate task to kill * @p: task struct of which task we should calculate @@ -469,8 +448,6 @@ static void dump_header(struct oom_control *oc) mem_cgroup_print_oom_meminfo(oc->memcg); else { __show_mem(SHOW_MEM_FILTER_NODES, oc->nodemask, gfp_zone(oc->gfp_mask)); - if (should_dump_unreclaim_slab()) - dump_unreclaimable_slab(); } if (sysctl_oom_dump_tasks) dump_tasks(oc); diff --git a/mm/show_mem.c b/mm/show_mem.c index 41999e94a56d..013ad4da618c 100644 --- a/mm/show_mem.c +++ b/mm/show_mem.c @@ -7,15 +7,18 @@ #include <linux/blkdev.h> #include <linux/cma.h> +#include <linux/console.h> #include <linux/cpuset.h> #include <linux/highmem.h> #include <linux/hugetlb.h> #include <linux/mm.h> #include <linux/mmzone.h> +#include <linux/seq_buf.h> #include <linux/swap.h> #include <linux/vmstat.h> #include "internal.h" +#include "slab.h" #include "swap.h" atomic_long_t _totalram_pages __read_mostly; @@ -392,10 +395,31 @@ static void show_free_areas(unsigned int filter, nodemask_t *nodemask, int max_z show_swap_cache_info(); } +static void print_string_as_lines(const char *prefix, const char *lines) +{ + if (!lines) { + printk("%s (null)\n", prefix); + return; + } + + bool locked = console_trylock(); + + while (1) { + const char *p = strchrnul(lines, '\n'); + printk("%s%.*s\n", prefix, (int) (p - lines), lines); + if (!*p) + break; + lines = p + 1; + } + if (locked) + console_unlock(); +} + void __show_mem(unsigned int filter, nodemask_t *nodemask, int max_zone_idx) { unsigned long total = 0, reserved = 0, highmem = 0; struct zone *zone; + char *buf; printk("Mem-Info:\n"); show_free_areas(filter, nodemask, max_zone_idx); @@ -447,4 +471,30 @@ void __show_mem(unsigned int filter, nodemask_t *nodemask, int max_zone_idx) } } #endif + + const unsigned buf_size = 8192; + buf = kmalloc(buf_size, GFP_ATOMIC); + if (buf) { + struct seq_buf s; + + printk("Unreclaimable slab info:\n"); + seq_buf_init(&s, buf, buf_size); + dump_unreclaimable_slab(&s); + print_string_as_lines(KERN_NOTICE, seq_buf_str(&s)); + + static unsigned long shrinkers_last_print; + + /* Ratelimit to at most once every 30 seconds */ + if (!shrinkers_last_print || + time_after(jiffies, shrinkers_last_print + HZ * 30)) { + shrinkers_last_print = jiffies; + + printk("Shrinkers:\n"); + seq_buf_init(&s, buf, buf_size); + shrinkers_to_text(&s); + print_string_as_lines(KERN_NOTICE, seq_buf_str(&s)); + } + + kfree(buf); + } } diff --git a/mm/shrinker.c b/mm/shrinker.c index 4a93fd433689..4a76364d2b7e 100644 --- a/mm/shrinker.c +++ b/mm/shrinker.c @@ -1,8 +1,9 @@ // SPDX-License-Identifier: GPL-2.0 #include <linux/memcontrol.h> +#include <linux/rculist.h> #include <linux/rwsem.h> +#include <linux/seq_buf.h> #include <linux/shrinker.h> -#include <linux/rculist.h> #include <trace/events/vmscan.h> #include "internal.h" @@ -411,6 +412,7 @@ static unsigned long do_shrink_slab(struct shrink_control *shrinkctl, trace_mm_shrink_slab_start(shrinker, shrinkctl, nr, freeable, delta, total_scan, priority); + u64 start_time = 
ktime_get_ns(); /* * Normally, we should not scan less than batch_size objects in one @@ -461,6 +463,17 @@ static unsigned long do_shrink_slab(struct shrink_control *shrinkctl, */ new_nr = add_nr_deferred(next_deferred, shrinker, shrinkctl); + unsigned long now = jiffies; + if (freed) { + atomic_long_add(freed, &shrinker->objects_freed); + shrinker->last_freed = now; + } + shrinker->last_scanned = now; + atomic_long_add(scanned, &shrinker->objects_requested_to_free); + + atomic64_add(ktime_get_ns() - start_time, &shrinker->ns_run); + + trace_mm_shrink_slab_end(shrinker, shrinkctl->nid, freed, nr, new_nr, total_scan); return freed; } @@ -809,3 +822,95 @@ void shrinker_free(struct shrinker *shrinker) call_rcu(&shrinker->rcu, shrinker_free_rcu_cb); } EXPORT_SYMBOL_GPL(shrinker_free); + +void shrinker_to_text(struct seq_buf *out, struct shrinker *shrinker) +{ + struct shrink_control sc = { + .gfp_mask = GFP_KERNEL, +#ifdef CONFIG_MEMCG + .memcg = root_mem_cgroup, +#endif + }; + unsigned long nr_freed = atomic_long_read(&shrinker->objects_freed); + + seq_buf_printf(out, "%ps", shrinker->scan_objects); + if (shrinker->name) + seq_buf_printf(out, ": %s", shrinker->name); + seq_buf_putc(out, '\n'); + + seq_buf_printf(out, "objects: %lu\n", shrinker->count_objects(shrinker, &sc)); + seq_buf_printf(out, "requested to free: %lu\n", atomic_long_read(&shrinker->objects_requested_to_free)); + seq_buf_printf(out, "objects freed: %lu\n", nr_freed); + seq_buf_printf(out, "last scanned: %li sec ago\n", (jiffies - shrinker->last_scanned) / HZ); + seq_buf_printf(out, "last freed: %li sec ago\n", (jiffies - shrinker->last_freed) / HZ); + seq_buf_printf(out, "ns per object freed: %llu\n", nr_freed + ? div64_ul(atomic64_read(&shrinker->ns_run), nr_freed) + : 0); + + if (shrinker->to_text) { + shrinker->to_text(out, shrinker); + seq_buf_puts(out, "\n"); + } +} + +/** + * shrinkers_to_text - Report on shrinkers with highest usage + * + * This reports on the shrinkers with the highest object counts, in sorted order: + * intended to be used for OOM reporting. 
+ */ +void shrinkers_to_text(struct seq_buf *out) +{ + struct shrinker *shrinker; + struct shrinker_by_mem { + struct shrinker *shrinker; + unsigned long mem; + } shrinkers_by_mem[4]; + int i, nr = 0; + + if (!mutex_trylock(&shrinker_mutex)) { + seq_buf_puts(out, "(couldn't take shrinker lock)"); + return; + } + + list_for_each_entry(shrinker, &shrinker_list, list) { + struct shrink_control sc = { + .gfp_mask = GFP_KERNEL, +#ifdef CONFIG_MEMCG + .memcg = root_mem_cgroup, +#endif + }; + unsigned long mem = shrinker->count_objects(shrinker, &sc); + + if (!mem || mem == SHRINK_STOP || mem == SHRINK_EMPTY) + continue; + + for (i = 0; i < nr; i++) + if (mem < shrinkers_by_mem[i].mem) + break; + + if (nr < ARRAY_SIZE(shrinkers_by_mem)) { + memmove(&shrinkers_by_mem[i + 1], + &shrinkers_by_mem[i], + sizeof(shrinkers_by_mem[0]) * (nr - i)); + nr++; + } else if (i) { + i--; + memmove(&shrinkers_by_mem[0], + &shrinkers_by_mem[1], + sizeof(shrinkers_by_mem[0]) * i); + } else { + continue; + } + + shrinkers_by_mem[i] = (struct shrinker_by_mem) { + .shrinker = shrinker, + .mem = mem, + }; + } + + for (i = nr - 1; i >= 0; --i) + shrinker_to_text(out, shrinkers_by_mem[i].shrinker); + + mutex_unlock(&shrinker_mutex); +} diff --git a/mm/shrinker_debug.c b/mm/shrinker_debug.c index 20eaee3e97f7..a16c2848d332 100644 --- a/mm/shrinker_debug.c +++ b/mm/shrinker_debug.c @@ -2,6 +2,7 @@ #include <linux/idr.h> #include <linux/slab.h> #include <linux/debugfs.h> +#include <linux/seq_buf.h> #include <linux/seq_file.h> #include <linux/shrinker.h> #include <linux/memcontrol.h> @@ -159,6 +160,21 @@ static const struct file_operations shrinker_debugfs_scan_fops = { .write = shrinker_debugfs_scan_write, }; +static int shrinker_debugfs_report_show(struct seq_file *m, void *v) +{ + struct shrinker *shrinker = m->private; + char *bufp; + size_t buflen = seq_get_buf(m, &bufp); + struct seq_buf out; + + seq_buf_init(&out, bufp, buflen); + shrinker_to_text(&out, shrinker); + seq_commit(m, seq_buf_used(&out)); + + return 0; +} +DEFINE_SHOW_ATTRIBUTE(shrinker_debugfs_report); + int shrinker_debugfs_add(struct shrinker *shrinker) { struct dentry *entry; @@ -190,6 +206,8 @@ int shrinker_debugfs_add(struct shrinker *shrinker) &shrinker_debugfs_count_fops); debugfs_create_file("scan", 0220, entry, shrinker, &shrinker_debugfs_scan_fops); + debugfs_create_file("report", 0440, entry, shrinker, + &shrinker_debugfs_report_fops); return 0; } diff --git a/mm/slab.h b/mm/slab.h index 248b34c839b7..09be83786b68 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -587,10 +587,12 @@ static inline size_t slab_ksize(const struct kmem_cache *s) return s->size; } +struct seq_buf; + #ifdef CONFIG_SLUB_DEBUG -void dump_unreclaimable_slab(void); +void dump_unreclaimable_slab(struct seq_buf *); #else -static inline void dump_unreclaimable_slab(void) +static inline void dump_unreclaimable_slab(struct seq_buf *out) { } #endif diff --git a/mm/slab_common.c b/mm/slab_common.c index bfe7c40eeee1..a365f146e669 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -27,6 +27,7 @@ #include <asm/tlbflush.h> #include <asm/page.h> #include <linux/memcontrol.h> +#include <linux/seq_buf.h> #include <linux/stackdepot.h> #include <trace/events/rcu.h> @@ -1127,10 +1128,15 @@ static int slab_show(struct seq_file *m, void *p) return 0; } -void dump_unreclaimable_slab(void) +void dump_unreclaimable_slab(struct seq_buf *out) { struct kmem_cache *s; struct slabinfo sinfo; + struct slab_by_mem { + struct kmem_cache *s; + size_t total, active; + } slabs_by_mem[10], n; + int i, nr = 
0; /* * Here acquiring slab_mutex is risky since we don't prefer to get @@ -1140,24 +1146,52 @@ void dump_unreclaimable_slab(void) * without acquiring the mutex. */ if (!mutex_trylock(&slab_mutex)) { - pr_warn("excessive unreclaimable slab but cannot dump stats\n"); + seq_buf_puts(out, "excessive unreclaimable slab but cannot dump stats\n"); return; } - pr_info("Unreclaimable slab info:\n"); - pr_info("Name Used Total\n"); - list_for_each_entry(s, &slab_caches, list) { if (s->flags & SLAB_RECLAIM_ACCOUNT) continue; get_slabinfo(s, &sinfo); - if (sinfo.num_objs > 0) - pr_info("%-17s %10luKB %10luKB\n", s->name, - (sinfo.active_objs * s->size) / 1024, - (sinfo.num_objs * s->size) / 1024); + if (!sinfo.num_objs) + continue; + + n.s = s; + n.total = sinfo.num_objs * s->size; + n.active = sinfo.active_objs * s->size; + + for (i = 0; i < nr; i++) + if (n.total < slabs_by_mem[i].total) + break; + + if (nr < ARRAY_SIZE(slabs_by_mem)) { + memmove(&slabs_by_mem[i + 1], + &slabs_by_mem[i], + sizeof(slabs_by_mem[0]) * (nr - i)); + nr++; + } else if (i) { + i--; + memmove(&slabs_by_mem[0], + &slabs_by_mem[1], + sizeof(slabs_by_mem[0]) * i); + } else { + continue; + } + + slabs_by_mem[i] = n; } + + for (i = nr - 1; i >= 0; --i) { + seq_buf_printf(out, "%-17s total: ", slabs_by_mem[i].s->name); + seq_buf_human_readable_u64(out, slabs_by_mem[i].total, STRING_UNITS_2); + seq_buf_printf(out, " active: "); + seq_buf_human_readable_u64(out, slabs_by_mem[i].active, STRING_UNITS_2); + seq_buf_putc(out, '\n'); + } + mutex_unlock(&slab_mutex); } |
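To round out the series, here is a hypothetical sketch of how a subsystem might plug into the new shrinker reporting hooks; my_cache, my_cache_count() and my_cache_scan() are invented for the example and are not part of this series. A shrinker fills in the new ->to_text callback, and shrinker_to_text() appends its output after the generic counters, so the extra lines show up both in the OOM-time "Shrinkers:" dump and in the per-shrinker debugfs report file added above.

/* Hypothetical shrinker wiring up the new ->to_text hook, for illustration. */
#include <linux/seq_buf.h>
#include <linux/shrinker.h>
#include <linux/string_helpers.h>

struct my_cache {
	struct shrinker	*shrink;
	atomic_long_t	nr_objects;
	atomic_long_t	bytes_used;
};

static unsigned long my_cache_count(struct shrinker *shrink,
				    struct shrink_control *sc)
{
	struct my_cache *c = shrink->private_data;

	return atomic_long_read(&c->nr_objects);
}

static unsigned long my_cache_scan(struct shrinker *shrink,
				   struct shrink_control *sc)
{
	/* a real implementation would free up to sc->nr_to_scan objects here */
	return SHRINK_STOP;
}

/* Extra state appended after the generic counters in shrinker_to_text(). */
static void my_cache_to_text(struct seq_buf *out, struct shrinker *shrink)
{
	struct my_cache *c = shrink->private_data;

	seq_buf_printf(out, "cached objects:\t%ld\n",
		       atomic_long_read(&c->nr_objects));
	seq_buf_puts(out, "memory used:\t");
	seq_buf_human_readable_u64(out, atomic_long_read(&c->bytes_used),
				   STRING_UNITS_2);
	seq_buf_putc(out, '\n');
}

static int my_cache_register_shrinker(struct my_cache *c)
{
	c->shrink = shrinker_alloc(0, "my-cache");
	if (!c->shrink)
		return -ENOMEM;

	c->shrink->count_objects	= my_cache_count;
	c->shrink->scan_objects		= my_cache_scan;
	c->shrink->to_text		= my_cache_to_text;	/* new hook */
	c->shrink->private_data		= c;

	shrinker_register(c->shrink);
	return 0;
}

Using seq_buf throughout keeps the formatting independent of the sink: the same text can be returned from the debugfs report file, folded into shrinkers_to_text() for the OOM report, or pushed to the console via print_string_as_lines().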