Diffstat (limited to 'fs')
-rw-r--r--  fs/bcachefs/Kconfig                    |   4
-rw-r--r--  fs/bcachefs/Makefile                   |   8
-rw-r--r--  fs/bcachefs/async_objs.c               |   4
-rw-r--r--  fs/bcachefs/bcachefs.h                 |   2
-rw-r--r--  fs/bcachefs/bcachefs_format.h          |   3
-rw-r--r--  fs/bcachefs/closure.h                  |   5
-rw-r--r--  fs/bcachefs/data_update.c              |   8
-rw-r--r--  fs/bcachefs/disk_accounting.c          |   3
-rw-r--r--  fs/bcachefs/disk_accounting_format.h   |   8
-rw-r--r--  fs/bcachefs/errcode.h                  |   5
-rw-r--r--  fs/bcachefs/fast_list.c                |   3
-rw-r--r--  fs/bcachefs/fs.c                       |  14
-rw-r--r--  fs/bcachefs/movinggc.c                 |  44
-rw-r--r--  fs/bcachefs/nocow_locking.c            |   3
-rw-r--r--  fs/bcachefs/opts.c                     |   8
-rw-r--r--  fs/bcachefs/opts.h                     |   1
-rw-r--r--  fs/bcachefs/rebalance.c                | 308
-rw-r--r--  fs/bcachefs/rebalance.h                |  36
-rw-r--r--  fs/bcachefs/rebalance_format.h         |  63
-rw-r--r--  fs/bcachefs/sb-downgrade.c             |   4
-rw-r--r--  fs/bcachefs/util.h                     |   2
-rw-r--r--  fs/bcachefs/vendor/closure.c           | 218
-rw-r--r--  fs/bcachefs/vendor/closure.h           | 490
23 files changed, 968 insertions(+), 276 deletions(-)
diff --git a/fs/bcachefs/Kconfig b/fs/bcachefs/Kconfig
index 8cb2b9d5da96..5455412b2b75 100644
--- a/fs/bcachefs/Kconfig
+++ b/fs/bcachefs/Kconfig
@@ -3,14 +3,12 @@ config BCACHEFS_FS
 	tristate "bcachefs filesystem support (EXPERIMENTAL)"
 	depends on BLOCK
 	select EXPORTFS
-	select CLOSURES
 	select CRC32
 	select CRC64
 	select FS_POSIX_ACL
 	select LZ4_COMPRESS
 	select LZ4_DECOMPRESS
 	select LZ4HC_COMPRESS
-	select LZ4HC_DECOMPRESS
 	select ZLIB_DEFLATE
 	select ZLIB_INFLATE
 	select ZSTD_COMPRESS
@@ -22,10 +20,8 @@ config BCACHEFS_FS
 	select RAID6_PQ
 	select XOR_BLOCKS
 	select XXHASH
-	select SRCU
 	select SYMBOLIC_ERRNAME
 	select MIN_HEAP
-	select XARRAY_MULTI
 	help
 	  The bcachefs filesystem - a modern, copy on write filesystem, with
 	  support for multiple devices, compression, checksumming, etc.
diff --git a/fs/bcachefs/Makefile b/fs/bcachefs/Makefile
index bb2a80fba12b..1e87eee962ec 100644
--- a/fs/bcachefs/Makefile
+++ b/fs/bcachefs/Makefile
@@ -5,6 +5,7 @@ bcachefs-y := \
 	acl.o \
 	alloc_background.o \
 	alloc_foreground.o \
+	async_objs.o \
 	backpointers.o \
 	bkey.o \
 	bkey_methods.o \
@@ -41,6 +42,7 @@ bcachefs-y := \
 	extents.o \
 	extent_update.o \
 	eytzinger.o \
+	fast_list.o \
 	fs.o \
 	fs-ioctl.o \
 	fs-io.o \
@@ -96,10 +98,8 @@ bcachefs-y := \
 	two_state_shared_lock.o \
 	util.o \
 	varint.o \
-	xattr.o
-
-bcachefs-$(CONFIG_BCACHEFS_ASYNC_OBJECT_LISTS) += fast_list.o
-bcachefs-$(CONFIG_BCACHEFS_ASYNC_OBJECT_LISTS) += async_objs.o
+	xattr.o \
+	vendor/closure.o

 obj-$(CONFIG_MEAN_AND_VARIANCE_UNIT_TEST) += mean_and_variance_test.o
diff --git a/fs/bcachefs/async_objs.c b/fs/bcachefs/async_objs.c
index ad04e5f0f056..bd935782c5f7 100644
--- a/fs/bcachefs/async_objs.c
+++ b/fs/bcachefs/async_objs.c
@@ -1,4 +1,6 @@
 // SPDX-License-Identifier: GPL-2.0
+#ifdef CONFIG_BCACHEFS_ASYNC_OBJECT_LISTS
+
 /*
  * Async obj debugging: keep asynchronous objects on (very fast) lists, make
  * them visibile in debugfs:
@@ -139,3 +141,5 @@ int bch2_fs_async_obj_init(struct bch_fs *c)

 	return 0;
 }
+
+#endif /* CONFIG_BCACHEFS_ASYNC_OBJECT_LISTS */
diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h
index 83d6ab9c1a91..3ccca855f05e 100644
--- a/fs/bcachefs/bcachefs.h
+++ b/fs/bcachefs/bcachefs.h
@@ -196,7 +196,6 @@
 #include <linux/backing-dev-defs.h>
 #include <linux/bug.h>
 #include <linux/bio.h>
-#include <linux/closure.h>
 #include <linux/kobject.h>
 #include <linux/list.h>
 #include <linux/math64.h>
@@ -217,6 +216,7 @@

 #include "bcachefs_format.h"
 #include "btree_journal_iter_types.h"
+#include "closure.h"
 #include "disk_accounting_types.h"
 #include "errcode.h"
 #include "fast_list.h"
diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h
index 090f11e122ad..d29bd684b137 100644
--- a/fs/bcachefs/bcachefs_format.h
+++ b/fs/bcachefs/bcachefs_format.h
@@ -707,8 +707,7 @@ struct bch_sb_field_ext {
 	x(inode_has_case_insensitive, BCH_VERSION(1, 28)) \
 	x(extent_snapshot_whiteouts, BCH_VERSION(1, 29)) \
 	x(31bit_dirent_offset, BCH_VERSION(1, 30)) \
-	x(btree_node_accounting, BCH_VERSION(1, 31)) \
-	x(rebalance_v2, BCH_VERSION(1, 32))
+	x(btree_node_accounting, BCH_VERSION(1, 31))

 enum bcachefs_metadata_version {
 	bcachefs_metadata_version_min = 9,
diff --git a/fs/bcachefs/closure.h b/fs/bcachefs/closure.h
new file mode 100644
index 000000000000..d8d4c7093ce0
--- /dev/null
+++ b/fs/bcachefs/closure.h
@@ -0,0 +1,5 @@
+#include "vendor/closure.h"
+
+#define closure_wait bch2_closure_wait
+#define closure_return_sync bch2_closure_return_sync
+#define __closure_wake_up __bch2_closure_wake_up
diff --git a/fs/bcachefs/data_update.c b/fs/bcachefs/data_update.c
index 894ba0944fea..62d5d17d681e 100644
--- a/fs/bcachefs/data_update.c
+++ b/fs/bcachefs/data_update.c
@@ -91,8 +91,10 @@ bool __bkey_nocow_lock(struct bch_fs *c, struct moving_context *ctxt, struct bke
 		move_ctxt_wait_event(ctxt,
 			(locked = bch2_bucket_nocow_trylock(&c->nocow_locks, bucket, 0)) ||
 			list_empty(&ctxt->ios));
-		if (!locked)
+		if (!locked) {
+			bch2_trans_unlock(ctxt->trans);
 			bch2_bucket_nocow_lock(&c->nocow_locks, bucket, 0);
+		}
 	}
 	return true;
 }
@@ -449,8 +451,8 @@ restart_drop_extra_replicas:
 	if (trace_data_update_enabled())
 		trace_data_update2(m, old, k, insert);

-	if (bch2_bkey_needs_rb(bkey_i_to_s_c(insert)) &&
-	    !bch2_bkey_needs_rb(k))
+	if (bch2_bkey_sectors_need_rebalance(c, bkey_i_to_s_c(insert)) * k.k->size >
+	    bch2_bkey_sectors_need_rebalance(c, k) * insert->k.size)
 		trace_io_move_created_rebalance2(m, old, k, insert);

 	ret = bch2_trans_commit(trans, &op->res,
diff --git a/fs/bcachefs/disk_accounting.c b/fs/bcachefs/disk_accounting.c
index 9da26e11446b..a99f821c6a1c 100644
--- a/fs/bcachefs/disk_accounting.c
+++ b/fs/bcachefs/disk_accounting.c
@@ -282,9 +282,6 @@ void bch2_accounting_key_to_text(struct printbuf *out, struct disk_accounting_po
 		prt_str(out, "btree=");
 		bch2_btree_id_to_text(out, k->btree.id);
 		break;
-	case BCH_DISK_ACCOUNTING_rebalance_work_v2:
-		bch2_prt_rebalance_accounting_type(out, k->rebalance_work_v2.type);
-		break;
 	}
 }
diff --git a/fs/bcachefs/disk_accounting_format.h b/fs/bcachefs/disk_accounting_format.h
index 0b61d6100180..730a17ea4243 100644
--- a/fs/bcachefs/disk_accounting_format.h
+++ b/fs/bcachefs/disk_accounting_format.h
@@ -110,8 +110,7 @@ static inline bool data_type_is_hidden(enum bch_data_type type)
 	x(snapshot, 5, 1) \
 	x(btree, 6, 3) \
 	x(rebalance_work, 7, 1) \
-	x(inum, 8, 3) \
-	x(rebalance_work_v2, 9, 1) \
+	x(inum, 8, 3)

 enum disk_accounting_type {
 #define x(f, nr, ...)
BCH_DISK_ACCOUNTING_##f = nr, @@ -211,10 +210,6 @@ struct bch_acct_inum { struct bch_acct_rebalance_work { }; -struct bch_acct_rebalance_work_v2 { - __u8 type; -}; - struct disk_accounting_pos { union { struct { @@ -229,7 +224,6 @@ struct disk_accounting_pos { struct bch_acct_btree btree; struct bch_acct_rebalance_work rebalance_work; struct bch_acct_inum inum; - struct bch_acct_rebalance_work_v2 rebalance_work_v2; } __packed; } __packed; struct bpos _pad; diff --git a/fs/bcachefs/errcode.h b/fs/bcachefs/errcode.h index c4344a1d6976..cbf1eedddad7 100644 --- a/fs/bcachefs/errcode.h +++ b/fs/bcachefs/errcode.h @@ -188,6 +188,11 @@ x(BCH_ERR_recovery_will_run, recovery_pass_will_run) \ x(0, data_update_done) \ x(0, bkey_was_deleted) \ + x(0, bucket_not_moveable) \ + x(BCH_ERR_bucket_not_moveable, bucket_not_moveable_dev_not_rw) \ + x(BCH_ERR_bucket_not_moveable, bucket_not_moveable_bucket_open) \ + x(BCH_ERR_bucket_not_moveable, bucket_not_moveable_bp_mismatch) \ + x(BCH_ERR_bucket_not_moveable, bucket_not_moveable_lru_race) \ x(BCH_ERR_data_update_done, data_update_done_would_block) \ x(BCH_ERR_data_update_done, data_update_done_unwritten) \ x(BCH_ERR_data_update_done, data_update_done_no_writes_needed) \ diff --git a/fs/bcachefs/fast_list.c b/fs/bcachefs/fast_list.c index 6be2a45be1dd..de2947cd2bcf 100644 --- a/fs/bcachefs/fast_list.c +++ b/fs/bcachefs/fast_list.c @@ -1,4 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 +#ifdef CONFIG_BCACHEFS_ASYNC_OBJECT_LISTS /* * Fast, unordered lists @@ -166,3 +167,5 @@ int fast_list_init(struct fast_list *l) return -ENOMEM; return 0; } + +#endif /* CONFIG_BCACHEFS_ASYNC_OBJECT_LISTS */ diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c index 958849c30071..f1849eb8327d 100644 --- a/fs/bcachefs/fs.c +++ b/fs/bcachefs/fs.c @@ -1521,6 +1521,7 @@ static const struct vm_operations_struct bch_vm_ops = { .page_mkwrite = bch2_page_mkwrite, }; +#if LINUX_VERSION_CODE >= KERNEL_VERSION(6,17,0) static int bch2_mmap_prepare(struct vm_area_desc *desc) { file_accessed(desc->file); @@ -1528,6 +1529,15 @@ static int bch2_mmap_prepare(struct vm_area_desc *desc) desc->vm_ops = &bch_vm_ops; return 0; } +#else +static int bch2_mmap(struct file *file, struct vm_area_struct *vma) +{ + file_accessed(file); + + vma->vm_ops = &bch_vm_ops; + return 0; +} +#endif /* Directories: */ @@ -1719,7 +1729,11 @@ static const struct file_operations bch_file_operations = { .llseek = bch2_llseek, .read_iter = bch2_read_iter, .write_iter = bch2_write_iter, +#if LINUX_VERSION_CODE >= KERNEL_VERSION(6,17,0) .mmap_prepare = bch2_mmap_prepare, +#else + .mmap = bch2_mmap, +#endif .get_unmapped_area = thp_get_unmapped_area, .fsync = bch2_fsync, .splice_read = filemap_splice_read, diff --git a/fs/bcachefs/movinggc.c b/fs/bcachefs/movinggc.c index f36d60b8fb07..0f7e35684bc8 100644 --- a/fs/bcachefs/movinggc.c +++ b/fs/bcachefs/movinggc.c @@ -62,25 +62,38 @@ static int bch2_bucket_is_movable(struct btree_trans *trans, { struct bch_fs *c = trans->c; - if (bch2_bucket_is_open(c, b->k.bucket.inode, b->k.bucket.offset)) + /* + * Valid bucket? 
+ * + * XXX: we should kill the LRU entry here if it's not + */ + CLASS(bch2_dev_bucket_tryget, ca)(c, b->k.bucket); + if (!ca) return 0; - CLASS(btree_iter, iter)(trans, BTREE_ID_alloc, b->k.bucket, BTREE_ITER_cached); - struct bkey_s_c k = bch2_btree_iter_peek_slot(&iter); - int ret = bkey_err(k); - if (ret) - return ret; - - CLASS(bch2_dev_bucket_tryget, ca)(c, k.k->p); - if (!ca) + if (ca->mi.state != BCH_MEMBER_STATE_rw || + !bch2_dev_is_online(ca)) { + bch_err_throw(c, bucket_not_moveable_dev_not_rw); return 0; + } - if (bch2_bucket_bitmap_test(&ca->bucket_backpointer_mismatch, b->k.bucket.offset)) + /* Bucket still being written? */ + if (bch2_bucket_is_open(c, b->k.bucket.inode, b->k.bucket.offset)) { + bch_err_throw(c, bucket_not_moveable_bucket_open); return 0; + } - if (ca->mi.state != BCH_MEMBER_STATE_rw || - !bch2_dev_is_online(ca)) + /* We won't be able to evacuate it if there's missing backpointers */ + if (bch2_bucket_bitmap_test(&ca->bucket_backpointer_mismatch, b->k.bucket.offset)) { + bch_err_throw(c, bucket_not_moveable_bp_mismatch); return 0; + } + + CLASS(btree_iter, iter)(trans, BTREE_ID_alloc, b->k.bucket, BTREE_ITER_cached); + struct bkey_s_c k = bch2_btree_iter_peek_slot(&iter); + int ret = bkey_err(k); + if (ret) + return ret; struct bch_alloc_v4 _a; const struct bch_alloc_v4 *a = bch2_alloc_to_v4(k, &_a); @@ -88,7 +101,12 @@ static int bch2_bucket_is_movable(struct btree_trans *trans, b->sectors = bch2_bucket_sectors_dirty(*a); u64 lru_idx = alloc_lru_idx_fragmentation(*a, ca); - return lru_idx && lru_idx <= time; + if (!lru_idx || lru_idx > time) { + bch_err_throw(c, bucket_not_moveable_lru_race); + return 0; + } + + return true; } static void move_bucket_free(struct buckets_in_flight *list, diff --git a/fs/bcachefs/nocow_locking.c b/fs/bcachefs/nocow_locking.c index 58cfd540c6d6..71b17f18e90c 100644 --- a/fs/bcachefs/nocow_locking.c +++ b/fs/bcachefs/nocow_locking.c @@ -2,11 +2,10 @@ #include "bcachefs.h" #include "bkey_methods.h" +#include "closure.h" #include "nocow_locking.h" #include "util.h" -#include <linux/closure.h> - bool bch2_bucket_nocow_is_locked(struct bucket_nocow_lock_table *t, struct bpos bucket) { u64 dev_bucket = bucket_to_u64(bucket); diff --git a/fs/bcachefs/opts.c b/fs/bcachefs/opts.c index 365cce4a6b49..bd5faafc9aa7 100644 --- a/fs/bcachefs/opts.c +++ b/fs/bcachefs/opts.c @@ -103,13 +103,6 @@ static const char * const __bch2_fs_usage_types[] = { #undef x -static const char * const __bch2_rebalance_accounting_types[] = { -#define x(n) #n, - BCH_REBALANCE_ACCOUNTING() -#undef x - NULL -}; - static void prt_str_opt_boundscheck(struct printbuf *out, const char * const opts[], unsigned nr, const char *type, unsigned idx) { @@ -132,7 +125,6 @@ PRT_STR_OPT_BOUNDSCHECKED(csum_opt, enum bch_csum_opt); PRT_STR_OPT_BOUNDSCHECKED(csum_type, enum bch_csum_type); PRT_STR_OPT_BOUNDSCHECKED(compression_type, enum bch_compression_type); PRT_STR_OPT_BOUNDSCHECKED(str_hash_type, enum bch_str_hash_type); -PRT_STR_OPT_BOUNDSCHECKED(rebalance_accounting_type, enum bch_rebalance_accounting_type); static int bch2_opt_fix_errors_parse(struct bch_fs *c, const char *val, u64 *res, struct printbuf *err) diff --git a/fs/bcachefs/opts.h b/fs/bcachefs/opts.h index de1ac235e929..6b9f18839345 100644 --- a/fs/bcachefs/opts.h +++ b/fs/bcachefs/opts.h @@ -34,7 +34,6 @@ void bch2_prt_csum_opt(struct printbuf *, enum bch_csum_opt); void bch2_prt_csum_type(struct printbuf *, enum bch_csum_type); void bch2_prt_compression_type(struct printbuf *, enum bch_compression_type); void 
bch2_prt_str_hash_type(struct printbuf *, enum bch_str_hash_type); -void bch2_prt_rebalance_accounting_type(struct printbuf *, enum bch_rebalance_accounting_type); static inline const char *bch2_d_type_str(unsigned d_type) { diff --git a/fs/bcachefs/rebalance.c b/fs/bcachefs/rebalance.c index f2d0040d043c..67d6a90e86ef 100644 --- a/fs/bcachefs/rebalance.c +++ b/fs/bcachefs/rebalance.c @@ -38,30 +38,15 @@ static const struct bch_extent_rebalance *bch2_bkey_ptrs_rebalance_opts(struct b return NULL; } -const struct bch_extent_rebalance *bch2_bkey_rebalance_opts(struct bkey_s_c k) +static const struct bch_extent_rebalance *bch2_bkey_rebalance_opts(struct bkey_s_c k) { return bch2_bkey_ptrs_rebalance_opts(bch2_bkey_ptrs_c(k)); } -static const char * const rebalance_opts[] = { -#define x(n) #n, - BCH_REBALANCE_OPTS() -#undef x - NULL -}; - void bch2_extent_rebalance_to_text(struct printbuf *out, struct bch_fs *c, const struct bch_extent_rebalance *r) { - prt_str(out, "need_rb="); - prt_bitflags(out, rebalance_opts, r->need_rb); - - if (r->hipri) - prt_str(out, " hipri"); - if (r->pending) - prt_str(out, " pending"); - - prt_printf(out, " replicas=%u", r->data_replicas); + prt_printf(out, "replicas=%u", r->data_replicas); if (r->data_replicas_from_inode) prt_str(out, " (inode)"); @@ -107,54 +92,32 @@ void bch2_extent_rebalance_to_text(struct printbuf *out, struct bch_fs *c, } } -/* - * XXX: check in bkey_validate that if r->hipri or r->pending are set, - * r->data_replicas are also set - */ - -static inline unsigned rb_accounting_counters(const struct bch_extent_rebalance *r) +int bch2_trigger_extent_rebalance(struct btree_trans *trans, + struct bkey_s_c old, struct bkey_s_c new, + enum btree_iter_update_trigger_flags flags) { - if (!r) - return 0; - unsigned ret = r->need_rb; + struct bch_fs *c = trans->c; + int need_rebalance_delta = 0; + s64 need_rebalance_sectors_delta[1] = { 0 }; - if (r->hipri) - ret |= BIT(BCH_REBALANCE_ACCOUNTING_high_priority); - if (r->pending) { - ret |= BIT(BCH_REBALANCE_ACCOUNTING_pending); - ret &= ~BIT(BCH_REBALANCE_ACCOUNTING_background_target); - } - return ret; -} + s64 s = bch2_bkey_sectors_need_rebalance(c, old); + need_rebalance_delta -= s != 0; + need_rebalance_sectors_delta[0] -= s; -int __bch2_trigger_extent_rebalance(struct btree_trans *trans, - struct bkey_s_c old, struct bkey_s_c new, - unsigned old_r, unsigned new_r, - enum btree_iter_update_trigger_flags flags) -{ - int delta = (int) !!new_r - (int) !!old_r; - if ((flags & BTREE_TRIGGER_transactional) && delta) { + s = bch2_bkey_sectors_need_rebalance(c, new); + need_rebalance_delta += s != 0; + need_rebalance_sectors_delta[0] += s; + + if ((flags & BTREE_TRIGGER_transactional) && need_rebalance_delta) { int ret = bch2_btree_bit_mod_buffered(trans, BTREE_ID_rebalance_work, - new.k->p, delta > 0); + new.k->p, need_rebalance_delta > 0); if (ret) return ret; } - delta = old.k->size == new.k->size - ? 
old_r ^ new_r - : old_r | new_r; - while (delta) { - unsigned c = __ffs(delta); - delta ^= BIT(c); - - s64 v[1] = { 0 }; - if (old_r & BIT(c)) - v[0] -= (s64) old.k->size; - if (new_r & BIT(c)) - v[0] += (s64) new.k->size; - + if (need_rebalance_sectors_delta[0]) { int ret = bch2_disk_accounting_mod2(trans, flags & BTREE_TRIGGER_gc, - v, rebalance_work_v2, c); + need_rebalance_sectors_delta, rebalance_work); if (ret) return ret; } @@ -162,45 +125,39 @@ int __bch2_trigger_extent_rebalance(struct btree_trans *trans, return 0; } -static struct bch_extent_rebalance -bch2_bkey_needs_rebalance(struct bch_fs *c, struct bkey_s_c k, - struct bch_inode_opts *opts, - unsigned *move_ptrs, - unsigned *compress_ptrs, - bool may_update_indirect) +static void bch2_bkey_needs_rebalance(struct bch_fs *c, struct bkey_s_c k, + struct bch_inode_opts *io_opts, + unsigned *move_ptrs, + unsigned *compress_ptrs, + u64 *sectors) { *move_ptrs = 0; *compress_ptrs = 0; + *sectors = 0; struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); - struct bch_extent_rebalance r = { .type = BIT(BCH_EXTENT_ENTRY_rebalance) }; - - if (bch2_bkey_extent_ptrs_flags(ptrs) & BIT_ULL(BCH_EXTENT_FLAG_poisoned)) - return r; - - const struct bch_extent_rebalance *old_r = bch2_bkey_ptrs_rebalance_opts(ptrs); - if (old_r) { - r = *old_r; - r.need_rb = 0; - } -#define x(_name) \ - if (k.k->type != KEY_TYPE_reflink_v || \ - may_update_indirect || \ - (!opts->_name##_from_inode && !r._name##_from_inode)) { \ - r._name = opts->_name; \ - r._name##_from_inode = opts->_name##_from_inode; \ - } - BCH_REBALANCE_OPTS() -#undef x + const struct bch_extent_rebalance *rb_opts = bch2_bkey_ptrs_rebalance_opts(ptrs); + if (!io_opts && !rb_opts) + return; - unsigned compression_type = bch2_compression_opt_to_type(r.background_compression); + if (bch2_bkey_extent_ptrs_flags(ptrs) & BIT_ULL(BCH_EXTENT_FLAG_poisoned)) + return; - bool incompressible = false, unwritten = false, ec = false; - unsigned durability = 0, min_durability = INT_MAX; + unsigned compression_type = + bch2_compression_opt_to_type(io_opts + ? io_opts->background_compression + : rb_opts->background_compression); + unsigned target = io_opts + ? 
io_opts->background_target + : rb_opts->background_target; + if (target && !bch2_target_accepts_data(c, BCH_DATA_user, target)) + target = 0; const union bch_extent_entry *entry; struct extent_ptr_decoded p; + bool incompressible = false, unwritten = false; + unsigned ptr_idx = 1; guard(rcu)(); @@ -209,50 +166,72 @@ bch2_bkey_needs_rebalance(struct bch_fs *c, struct bkey_s_c k, unwritten |= p.ptr.unwritten; if (!p.ptr.cached) { - if (p.crc.compression_type != compression_type) { + if (p.crc.compression_type != compression_type) *compress_ptrs |= ptr_idx; - r.need_rb |= BIT(BCH_REBALANCE_background_compression); - } - if (r.background_target && - !bch2_dev_in_target(c, p.ptr.dev, r.background_target)) { + if (target && !bch2_dev_in_target(c, p.ptr.dev, target)) *move_ptrs |= ptr_idx; - r.need_rb |= BIT(BCH_REBALANCE_background_target); - } - - unsigned d = bch2_extent_ptr_durability(c, &p); - durability += d; - min_durability = min(min_durability, d); - - ec |= p.has_ec; } ptr_idx <<= 1; } - if (unwritten || incompressible) { + if (unwritten) *compress_ptrs = 0; - r.need_rb &= ~BIT(BCH_REBALANCE_background_compression); + if (incompressible) + *compress_ptrs = 0; + + unsigned rb_ptrs = *move_ptrs | *compress_ptrs; + + if (!rb_ptrs) + return; + + ptr_idx = 1; + bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { + if (rb_ptrs & ptr_idx) + *sectors += p.crc.compressed_size; + ptr_idx <<= 1; } - return r; } -static inline bool bkey_should_have_rb_opts(struct bkey_s_c k, - struct bch_extent_rebalance new) +u64 bch2_bkey_sectors_need_rebalance(struct bch_fs *c, struct bkey_s_c k) +{ + unsigned move_ptrs = 0; + unsigned compress_ptrs = 0; + u64 sectors = 0; + + bch2_bkey_needs_rebalance(c, k, NULL, &move_ptrs, &compress_ptrs, §ors); + return sectors; +} + +static unsigned bch2_bkey_ptrs_need_rebalance(struct bch_fs *c, + struct bch_inode_opts *opts, + struct bkey_s_c k) +{ + unsigned move_ptrs = 0; + unsigned compress_ptrs = 0; + u64 sectors = 0; + + bch2_bkey_needs_rebalance(c, k, opts, &move_ptrs, &compress_ptrs, §ors); + return move_ptrs|compress_ptrs; +} + +static inline bool bkey_should_have_rb_opts(struct bch_fs *c, + struct bch_inode_opts *opts, + struct bkey_s_c k) { if (k.k->type == KEY_TYPE_reflink_v) { -#define x(n) if (new.n##_from_inode) return true; +#define x(n) if (opts->n##_from_inode) return true; BCH_REBALANCE_OPTS() #undef x } - return new.need_rb; + return bch2_bkey_ptrs_need_rebalance(c, opts, k); } -int bch2_bkey_set_needs_rebalance(struct bch_fs *c, - struct bch_inode_opts *opts, +int bch2_bkey_set_needs_rebalance(struct bch_fs *c, struct bch_inode_opts *opts, struct bkey_i *_k, enum set_needs_rebalance_ctx ctx, - u32 opt_change_cookie) + u32 change_cookie) { if (!bkey_extent_is_direct_data(&_k->k)) return 0; @@ -261,27 +240,17 @@ int bch2_bkey_set_needs_rebalance(struct bch_fs *c, struct bch_extent_rebalance *old = (struct bch_extent_rebalance *) bch2_bkey_rebalance_opts(k.s_c); - unsigned move_ptrs = 0; - unsigned compress_ptrs = 0; - struct bch_extent_rebalance new = - bch2_bkey_needs_rebalance(c, k.s_c, opts, &move_ptrs, &compress_ptrs, - ctx == SET_NEEDS_REBALANCE_opt_change_indirect); - - bool should_have_rb = bkey_should_have_rb_opts(k.s_c, new); - - if (should_have_rb == !!old && - (should_have_rb ? 
!memcmp(old, &new, sizeof(new)) : !old)) - return 0; - - if (should_have_rb) { + if (bkey_should_have_rb_opts(c, opts, k.s_c)) { if (!old) { old = bkey_val_end(k); k.k->u64s += sizeof(*old) / sizeof(u64); } - *old = new; - } else if (old) - extent_entry_drop(k, (union bch_extent_entry *) old); + *old = io_opts_to_rebalance_opts(c, opts); + } else { + if (old) + extent_entry_drop(k, (union bch_extent_entry *) old); + } return 0; } @@ -300,19 +269,32 @@ static int bch2_get_update_rebalance_opts(struct btree_trans *trans, if (!bkey_extent_is_direct_data(k.k)) return 0; - struct bch_extent_rebalance *old = - (struct bch_extent_rebalance *) bch2_bkey_rebalance_opts(k); + bool may_update_indirect = ctx == SET_NEEDS_REBALANCE_opt_change_indirect; - unsigned move_ptrs = 0; - unsigned compress_ptrs = 0; - struct bch_extent_rebalance new = - bch2_bkey_needs_rebalance(c, k, io_opts, &move_ptrs, &compress_ptrs, - ctx == SET_NEEDS_REBALANCE_opt_change_indirect); + /* + * If it's an indirect extent, and we walked to it directly, we won't + * have the options from the inode that were directly applied: options + * from the extent take precedence - unless the io_opts option came from + * the inode and may_update_indirect is true (walked from a + * REFLINK_P_MAY_UPDATE_OPTIONS pointer). + */ + const struct bch_extent_rebalance *old = bch2_bkey_rebalance_opts(k); + if (old && k.k->type == KEY_TYPE_reflink_v) { +#define x(_name) \ + if (old->_name##_from_inode && \ + !(may_update_indirect && io_opts->_name##_from_inode)) { \ + io_opts->_name = old->_name; \ + io_opts->_name##_from_inode = true; \ + } + BCH_REBALANCE_OPTS() +#undef x + } - bool should_have_rb = bkey_should_have_rb_opts(k, new); + struct bch_extent_rebalance new = io_opts_to_rebalance_opts(c, io_opts); - if (should_have_rb == !!old && - (should_have_rb ? !memcmp(old, &new, sizeof(new)) : !old)) + if (bkey_should_have_rb_opts(c, io_opts, k) + ? 
old && !memcmp(old, &new, sizeof(new)) + : !old) return 0; struct bkey_i *n = bch2_trans_kmalloc(trans, bkey_bytes(k.k) + 8); @@ -324,7 +306,7 @@ static int bch2_get_update_rebalance_opts(struct btree_trans *trans, /* On successfull transaction commit, @k was invalidated: */ - return bch2_bkey_set_needs_rebalance(c, io_opts, n, ctx, 0) ?: + return bch2_bkey_set_needs_rebalance(c, io_opts, n, ctx, 0) ?: bch2_trans_update(trans, iter, n, BTREE_UPDATE_internal_snapshot_node) ?: bch2_trans_commit(trans, NULL, NULL, 0) ?: bch_err_throw(c, transaction_restart_commit); @@ -390,8 +372,7 @@ struct bch_inode_opts *bch2_extent_get_apply_io_opts(struct btree_trans *trans, enum set_needs_rebalance_ctx ctx) { struct bch_inode_opts *opts = - bch2_extent_get_io_opts(trans, snapshot_io_opts, - extent_pos, extent_iter, extent_k); + bch2_extent_get_io_opts(trans, snapshot_io_opts, extent_pos, extent_iter, extent_k); if (IS_ERR(opts) || btree_iter_path(trans, extent_iter)->level) return opts; @@ -554,6 +535,23 @@ static struct bkey_i *next_rebalance_entry(struct btree_trans *trans, return &(&darray_pop(buf))->k_i; } +static int bch2_bkey_clear_needs_rebalance(struct btree_trans *trans, + struct btree_iter *iter, + struct bkey_s_c k) +{ + if (k.k->type == KEY_TYPE_reflink_v || !bch2_bkey_rebalance_opts(k)) + return 0; + + struct bkey_i *n = bch2_bkey_make_mut(trans, iter, &k, 0); + int ret = PTR_ERR_OR_ZERO(n); + if (ret) + return ret; + + extent_entry_drop(bkey_i_to_s(n), + (void *) bch2_bkey_rebalance_opts(bkey_i_to_s_c(n))); + return bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc); +} + static struct bkey_s_c next_rebalance_extent(struct btree_trans *trans, struct per_snapshot_io_opts *snapshot_io_opts, struct bpos work_pos, @@ -572,10 +570,6 @@ static struct bkey_s_c next_rebalance_extent(struct btree_trans *trans, if (bkey_err(k)) return k; - const struct bch_extent_rebalance *r = bch2_bkey_rebalance_opts(k); - if (!r || !r->need_rb) /* Write buffer race? */ - return bkey_s_c_null; - struct bch_inode_opts *opts = bch2_extent_get_apply_io_opts(trans, snapshot_io_opts, extent_iter->pos, extent_iter, k, @@ -586,23 +580,22 @@ static struct bkey_s_c next_rebalance_extent(struct btree_trans *trans, *opts_ret = opts; - unsigned move_ptrs = 0; - unsigned compress_ptrs = 0; - bch2_bkey_needs_rebalance(c, k, opts, &move_ptrs, &compress_ptrs, false); - memset(data_opts, 0, sizeof(*data_opts)); - data_opts->rewrite_ptrs = move_ptrs|compress_ptrs; + data_opts->rewrite_ptrs = bch2_bkey_ptrs_need_rebalance(c, opts, k); data_opts->target = opts->background_target; data_opts->write_flags |= BCH_WRITE_only_specified_devs; - if (!data_opts->rewrite_ptrs && - !data_opts->kill_ptrs && - !data_opts->kill_ec_ptrs && - !data_opts->extra_replicas) { - CLASS(printbuf, buf)(); - prt_printf(&buf, "got extent to rebalance but nothing to do, confused\n "); - bch2_bkey_val_to_text(&buf, c, k); - bch_err(c, "%s", buf.buf); + if (!data_opts->rewrite_ptrs) { + /* + * device we would want to write to offline? devices in target + * changed? 
+ * + * We'll now need a full scan before this extent is picked up + * again: + */ + int ret = bch2_bkey_clear_needs_rebalance(trans, extent_iter, k); + if (ret) + return bkey_s_c_err(ret); return bkey_s_c_null; } @@ -612,6 +605,12 @@ static struct bkey_s_c next_rebalance_extent(struct btree_trans *trans, bch2_bkey_val_to_text(&buf, c, k); prt_newline(&buf); + unsigned move_ptrs = 0; + unsigned compress_ptrs = 0; + u64 sectors = 0; + + bch2_bkey_needs_rebalance(c, k, opts, &move_ptrs, &compress_ptrs, §ors); + if (move_ptrs) { prt_str(&buf, "move="); bch2_target_to_text(&buf, c, opts->background_target); @@ -1090,7 +1089,8 @@ static int check_rebalance_work_one(struct btree_trans *trans, extent_k.k = &deleted; } - bool should_have_rebalance = bch2_bkey_needs_rb(extent_k); + bool should_have_rebalance = + bch2_bkey_sectors_need_rebalance(c, extent_k) != 0; bool have_rebalance = rebalance_k.k->type == KEY_TYPE_set; if (should_have_rebalance != have_rebalance) { diff --git a/fs/bcachefs/rebalance.h b/fs/bcachefs/rebalance.h index ae576d8af609..24bafa42f070 100644 --- a/fs/bcachefs/rebalance.h +++ b/fs/bcachefs/rebalance.h @@ -10,7 +10,7 @@ static inline struct bch_extent_rebalance io_opts_to_rebalance_opts(struct bch_fs *c, struct bch_inode_opts *opts) { - return (struct bch_extent_rebalance) { + struct bch_extent_rebalance r = { .type = BIT(BCH_EXTENT_ENTRY_rebalance), #define x(_name) \ ._name = opts->_name, \ @@ -18,36 +18,22 @@ static inline struct bch_extent_rebalance io_opts_to_rebalance_opts(struct bch_f BCH_REBALANCE_OPTS() #undef x }; + + if (r.background_target && + !bch2_target_accepts_data(c, BCH_DATA_user, r.background_target)) + r.background_target = 0; + + return r; }; void bch2_extent_rebalance_to_text(struct printbuf *, struct bch_fs *, const struct bch_extent_rebalance *); -const struct bch_extent_rebalance *bch2_bkey_rebalance_opts(struct bkey_s_c); - -static inline int bch2_bkey_needs_rb(struct bkey_s_c k) -{ - const struct bch_extent_rebalance *r = bch2_bkey_rebalance_opts(k); - return r ? r->need_rb : 0; -} +int bch2_trigger_extent_rebalance(struct btree_trans *, + struct bkey_s_c, struct bkey_s_c, + enum btree_iter_update_trigger_flags); -int __bch2_trigger_extent_rebalance(struct btree_trans *, - struct bkey_s_c, struct bkey_s_c, - unsigned, unsigned, - enum btree_iter_update_trigger_flags); - -static inline int bch2_trigger_extent_rebalance(struct btree_trans *trans, - struct bkey_s_c old, struct bkey_s_c new, - enum btree_iter_update_trigger_flags flags) -{ - unsigned old_r = bch2_bkey_needs_rb(old); - unsigned new_r = bch2_bkey_needs_rb(new); - - return old_r != new_r || - (old.k->size != new.k->size && (old_r|new_r)) - ? 
__bch2_trigger_extent_rebalance(trans, old, new, old_r, new_r, flags) - : 0; -} +u64 bch2_bkey_sectors_need_rebalance(struct bch_fs *, struct bkey_s_c); enum set_needs_rebalance_ctx { SET_NEEDS_REBALANCE_opt_change, diff --git a/fs/bcachefs/rebalance_format.h b/fs/bcachefs/rebalance_format.h index d7a5f899e789..ff9a1342a22b 100644 --- a/fs/bcachefs/rebalance_format.h +++ b/fs/bcachefs/rebalance_format.h @@ -5,76 +5,49 @@ struct bch_extent_rebalance { #if defined(__LITTLE_ENDIAN_BITFIELD) __u64 type:6, - unused:5, - hipri:1, - pending:1, - need_rb:5, + unused:3, - data_replicas_from_inode:1, - data_checksum_from_inode:1, + promote_target_from_inode:1, erasure_code_from_inode:1, + data_checksum_from_inode:1, background_compression_from_inode:1, + data_replicas_from_inode:1, background_target_from_inode:1, - promote_target_from_inode:1, - data_replicas:3, - data_checksum:4, + promote_target:16, erasure_code:1, + data_checksum:4, + data_replicas:4, background_compression:8, /* enum bch_compression_opt */ - background_target:12, - promote_target:12; + background_target:16; #elif defined (__BIG_ENDIAN_BITFIELD) - __u64 promote_target:12, - background_target:12, + __u64 background_target:16, background_compression:8, - erasure_code:1, + data_replicas:4, data_checksum:4, - data_replicas:3, + erasure_code:1, + promote_target:16, - promote_target_from_inode:1, background_target_from_inode:1, + data_replicas_from_inode:1, background_compression_from_inode:1, - erasure_code_from_inode:1, data_checksum_from_inode:1, - data_replicas_from_inode:1, + erasure_code_from_inode:1, + promote_target_from_inode:1, - need_rb:5, - pending:1, - hipri:1, - unused:5, + unused:3, type:6; #endif }; /* subset of BCH_INODE_OPTS */ #define BCH_REBALANCE_OPTS() \ - x(data_replicas) \ x(data_checksum) \ - x(erasure_code) \ x(background_compression) \ - x(background_target) \ - x(promote_target) - -enum bch_rebalance_opts { -#define x(n) BCH_REBALANCE_##n, - BCH_REBALANCE_OPTS() -#undef x -}; - -#define BCH_REBALANCE_ACCOUNTING() \ x(data_replicas) \ - x(data_checksum) \ - x(erasure_code) \ - x(background_compression) \ + x(promote_target) \ x(background_target) \ - x(high_priority) \ - x(pending) \ - -enum bch_rebalance_accounting_type { -#define x(n) BCH_REBALANCE_ACCOUNTING_##n, - BCH_REBALANCE_ACCOUNTING() -#undef x -}; + x(erasure_code) #endif /* _BCACHEFS_REBALANCE_FORMAT_H */ diff --git a/fs/bcachefs/sb-downgrade.c b/fs/bcachefs/sb-downgrade.c index 0d47e83c28a6..bfd06fd5d506 100644 --- a/fs/bcachefs/sb-downgrade.c +++ b/fs/bcachefs/sb-downgrade.c @@ -107,9 +107,7 @@ BCH_FSCK_ERR_inode_parent_has_case_insensitive_not_set)\ x(btree_node_accounting, \ BIT_ULL(BCH_RECOVERY_PASS_check_allocations), \ - BCH_FSCK_ERR_accounting_mismatch) \ - x(rebalance_v2, \ - BIT_ULL(BCH_RECOVERY_PASS_check_rebalance_work)) + BCH_FSCK_ERR_accounting_mismatch) #define DOWNGRADE_TABLE() \ x(bucket_stripe_sectors, \ diff --git a/fs/bcachefs/util.h b/fs/bcachefs/util.h index 52ac8230be9f..555e0d8f3cf0 100644 --- a/fs/bcachefs/util.h +++ b/fs/bcachefs/util.h @@ -4,7 +4,6 @@ #include <linux/bio.h> #include <linux/blkdev.h> -#include <linux/closure.h> #include <linux/errno.h> #include <linux/freezer.h> #include <linux/kernel.h> @@ -21,6 +20,7 @@ #include <linux/vmalloc.h> #include <linux/workqueue.h> +#include "closure.h" #include "mean_and_variance.h" #include "darray.h" diff --git a/fs/bcachefs/vendor/closure.c b/fs/bcachefs/vendor/closure.c new file mode 100644 index 000000000000..bdafd3a57386 --- /dev/null +++ b/fs/bcachefs/vendor/closure.c 
@@ -0,0 +1,218 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Asynchronous refcounty things + * + * Copyright 2010, 2011 Kent Overstreet <kent.overstreet@gmail.com> + * Copyright 2012 Google, Inc. + */ + +#include "closure.h" +#include <linux/debugfs.h> +#include <linux/export.h> +#include <linux/rcupdate.h> +#include <linux/seq_file.h> +#include <linux/sched/debug.h> + +static void closure_val_checks(struct closure *cl, unsigned new, int d) +{ + unsigned count = new & CLOSURE_REMAINING_MASK; + + if (WARN(new & CLOSURE_GUARD_MASK, + "closure %ps has guard bits set: %x (%u), delta %i", + cl->fn, + new, (unsigned) __fls(new & CLOSURE_GUARD_MASK), d)) + new &= ~CLOSURE_GUARD_MASK; + + WARN(!count && (new & ~(CLOSURE_DESTRUCTOR|CLOSURE_SLEEPING)), + "closure %ps ref hit 0 with incorrect flags set: %x (%u)", + cl->fn, + new, (unsigned) __fls(new)); +} + +enum new_closure_state { + CLOSURE_normal_put, + CLOSURE_requeue, + CLOSURE_done, +}; + +/* For clearing flags with the same atomic op as a put */ +void bch2_closure_sub(struct closure *cl, int v) +{ + enum new_closure_state s; + struct task_struct *sleeper; + + /* rcu_read_lock, atomic_read_acquire() are both for cl->sleeper: */ + guard(rcu)(); + + int old = atomic_read_acquire(&cl->remaining), new; + do { + new = old - v; + + if (new & CLOSURE_REMAINING_MASK) { + s = CLOSURE_normal_put; + } else { + if ((cl->fn || (new & CLOSURE_SLEEPING)) && + !(new & CLOSURE_DESTRUCTOR)) { + s = CLOSURE_requeue; + new += CLOSURE_REMAINING_INITIALIZER; + } else + s = CLOSURE_done; + + sleeper = new & CLOSURE_SLEEPING ? cl->sleeper : NULL; + new &= ~CLOSURE_SLEEPING; + } + + closure_val_checks(cl, new, -v); + } while (!atomic_try_cmpxchg_release(&cl->remaining, &old, new)); + + if (s == CLOSURE_normal_put) + return; + + if (sleeper) { + smp_mb(); + wake_up_process(sleeper); + return; + } + + if (s == CLOSURE_requeue) { + closure_queue(cl); + } else { + struct closure *parent = cl->parent; + closure_fn *destructor = cl->fn; + + closure_debug_destroy(cl); + + if (destructor) + destructor(&cl->work); + + if (parent) + closure_put(parent); + } +} + +/* + * closure_wake_up - wake up all closures on a wait list, without memory barrier + */ +void __bch2_closure_wake_up(struct closure_waitlist *wait_list) +{ + struct llist_node *list; + struct closure *cl, *t; + struct llist_node *reverse = NULL; + + list = llist_del_all(&wait_list->list); + + /* We first reverse the list to preserve FIFO ordering and fairness */ + reverse = llist_reverse_order(list); + + /* Then do the wakeups */ + llist_for_each_entry_safe(cl, t, reverse, list) { + closure_set_waiting(cl, 0); + bch2_closure_sub(cl, CLOSURE_WAITING + 1); + } +} + +/** + * closure_wait - add a closure to a waitlist + * @waitlist: will own a ref on @cl, which will be released when + * closure_wake_up() is called on @waitlist. + * @cl: closure pointer. 
+ * + */ +bool bch2_closure_wait(struct closure_waitlist *waitlist, struct closure *cl) +{ + if (atomic_read(&cl->remaining) & CLOSURE_WAITING) + return false; + + closure_set_waiting(cl, _RET_IP_); + unsigned r = atomic_add_return(CLOSURE_WAITING + 1, &cl->remaining); + closure_val_checks(cl, r, CLOSURE_WAITING + 1); + + llist_add(&cl->list, &waitlist->list); + + return true; +} + +void __sched __bch2_closure_sync(struct closure *cl) +{ + cl->sleeper = current; + bch2_closure_sub(cl, + CLOSURE_REMAINING_INITIALIZER - + CLOSURE_SLEEPING); + + while (1) { + set_current_state(TASK_UNINTERRUPTIBLE); + if (!(atomic_read(&cl->remaining) & CLOSURE_SLEEPING)) + break; + schedule(); + } + + __set_current_state(TASK_RUNNING); +} + +/* + * closure_return_sync - finish running a closure, synchronously (i.e. waiting + * for outstanding get()s to finish) and returning once closure refcount is 0. + * + * Unlike closure_sync() this doesn't reinit the ref to 1; subsequent + * closure_get_not_zero() calls will fail. + */ +void __sched bch2_closure_return_sync(struct closure *cl) +{ + cl->sleeper = current; + bch2_closure_sub(cl, + CLOSURE_REMAINING_INITIALIZER - + CLOSURE_DESTRUCTOR - + CLOSURE_SLEEPING); + + while (1) { + set_current_state(TASK_UNINTERRUPTIBLE); + if (!(atomic_read(&cl->remaining) & CLOSURE_SLEEPING)) + break; + schedule(); + } + + __set_current_state(TASK_RUNNING); + + if (cl->parent) + closure_put(cl->parent); +} + +int __sched __bch2_closure_sync_timeout(struct closure *cl, unsigned long timeout) +{ + int ret = 0; + + cl->sleeper = current; + bch2_closure_sub(cl, + CLOSURE_REMAINING_INITIALIZER - + CLOSURE_SLEEPING); + + while (1) { + set_current_state(TASK_UNINTERRUPTIBLE); + /* + * Carefully undo the continue_at() - but only if it + * hasn't completed, i.e. the final closure_put() hasn't + * happened yet: + */ + unsigned old = atomic_read(&cl->remaining), new; + if (!(old & CLOSURE_SLEEPING)) + goto success; + + if (!timeout) { + do { + if (!(old & CLOSURE_SLEEPING)) + goto success; + + new = old + CLOSURE_REMAINING_INITIALIZER - CLOSURE_SLEEPING; + closure_val_checks(cl, new, CLOSURE_REMAINING_INITIALIZER - CLOSURE_SLEEPING); + } while (!atomic_try_cmpxchg(&cl->remaining, &old, new)); + + ret = -ETIME; + break; + } + + timeout = schedule_timeout(timeout); + } +success: + __set_current_state(TASK_RUNNING); + return ret; +} diff --git a/fs/bcachefs/vendor/closure.h b/fs/bcachefs/vendor/closure.h new file mode 100644 index 000000000000..79112efe30a7 --- /dev/null +++ b/fs/bcachefs/vendor/closure.h @@ -0,0 +1,490 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _LINUX_CLOSURE_H +#define _LINUX_CLOSURE_H + +#include <linux/llist.h> +#include <linux/sched.h> +#include <linux/sched/task_stack.h> +#include <linux/workqueue.h> + +/* + * Closure is perhaps the most overused and abused term in computer science, but + * since I've been unable to come up with anything better you're stuck with it + * again. + * + * What are closures? + * + * They embed a refcount. The basic idea is they count "things that are in + * progress" - in flight bios, some other thread that's doing something else - + * anything you might want to wait on. + * + * The refcount may be manipulated with closure_get() and closure_put(). + * closure_put() is where many of the interesting things happen, when it causes + * the refcount to go to 0. + * + * Closures can be used to wait on things both synchronously and asynchronously, + * and synchronous and asynchronous use can be mixed without restriction. 
To + * wait synchronously, use closure_sync() - you will sleep until your closure's + * refcount hits 1. + * + * To wait asynchronously, use + * continue_at(cl, next_function, workqueue); + * + * passing it, as you might expect, the function to run when nothing is pending + * and the workqueue to run that function out of. + * + * continue_at() also, critically, requires a 'return' immediately following the + * location where this macro is referenced, to return to the calling function. + * There's good reason for this. + * + * To use safely closures asynchronously, they must always have a refcount while + * they are running owned by the thread that is running them. Otherwise, suppose + * you submit some bios and wish to have a function run when they all complete: + * + * foo_endio(struct bio *bio) + * { + * closure_put(cl); + * } + * + * closure_init(cl); + * + * do_stuff(); + * closure_get(cl); + * bio1->bi_endio = foo_endio; + * bio_submit(bio1); + * + * do_more_stuff(); + * closure_get(cl); + * bio2->bi_endio = foo_endio; + * bio_submit(bio2); + * + * continue_at(cl, complete_some_read, system_wq); + * + * If closure's refcount started at 0, complete_some_read() could run before the + * second bio was submitted - which is almost always not what you want! More + * importantly, it wouldn't be possible to say whether the original thread or + * complete_some_read()'s thread owned the closure - and whatever state it was + * associated with! + * + * So, closure_init() initializes a closure's refcount to 1 - and when a + * closure_fn is run, the refcount will be reset to 1 first. + * + * Then, the rule is - if you got the refcount with closure_get(), release it + * with closure_put() (i.e, in a bio->bi_endio function). If you have a refcount + * on a closure because you called closure_init() or you were run out of a + * closure - _always_ use continue_at(). Doing so consistently will help + * eliminate an entire class of particularly pernicious races. + * + * Lastly, you might have a wait list dedicated to a specific event, and have no + * need for specifying the condition - you just want to wait until someone runs + * closure_wake_up() on the appropriate wait list. In that case, just use + * closure_wait(). It will return either true or false, depending on whether the + * closure was already on a wait list or not - a closure can only be on one wait + * list at a time. + * + * Parents: + * + * closure_init() takes two arguments - it takes the closure to initialize, and + * a (possibly null) parent. + * + * If parent is non null, the new closure will have a refcount for its lifetime; + * a closure is considered to be "finished" when its refcount hits 0 and the + * function to run is null. Hence + * + * continue_at(cl, NULL, NULL); + * + * returns up the (spaghetti) stack of closures, precisely like normal return + * returns up the C stack. continue_at() with non null fn is better thought of + * as doing a tail call. + * + * All this implies that a closure should typically be embedded in a particular + * struct (which its refcount will normally control the lifetime of), and that + * struct can very much be thought of as a stack frame. + */ + +struct closure; +struct closure_syncer; +typedef void (closure_fn) (struct work_struct *); +extern struct dentry *bcache_debug; + +struct closure_waitlist { + struct llist_head list; +}; + +enum closure_state { + /* + * CLOSURE_WAITING: Set iff the closure is on a waitlist. 
Must be set by + * the thread that owns the closure, and cleared by the thread that's + * waking up the closure. + * + * The rest are for debugging and don't affect behaviour: + * + * CLOSURE_RUNNING: Set when a closure is running (i.e. by + * closure_init() and when closure_put() runs then next function), and + * must be cleared before remaining hits 0. Primarily to help guard + * against incorrect usage and accidentally transferring references. + * continue_at() and closure_return() clear it for you, if you're doing + * something unusual you can use closure_set_dead() which also helps + * annotate where references are being transferred. + */ + + CLOSURE_BITS_START = (1U << 24), + CLOSURE_DESTRUCTOR = (1U << 24), + CLOSURE_SLEEPING = (1U << 26), + CLOSURE_WAITING = (1U << 28), + CLOSURE_RUNNING = (1U << 30), +}; + +#define CLOSURE_GUARD_MASK \ + (((CLOSURE_DESTRUCTOR|CLOSURE_SLEEPING|CLOSURE_WAITING|CLOSURE_RUNNING) << 1)|(CLOSURE_BITS_START >> 1)) + +#define CLOSURE_REMAINING_MASK (CLOSURE_BITS_START - 1) +#define CLOSURE_REMAINING_INITIALIZER (1|CLOSURE_RUNNING) + +struct closure { + union { + struct { + struct workqueue_struct *wq; + struct task_struct *sleeper; + struct llist_node list; + closure_fn *fn; + }; + struct work_struct work; + }; + + struct closure *parent; + + atomic_t remaining; + +#ifdef CONFIG_DEBUG_CLOSURES +#define CLOSURE_MAGIC_DEAD 0xc054dead +#define CLOSURE_MAGIC_ALIVE 0xc054a11e +#define CLOSURE_MAGIC_STACK 0xc05451cc + + unsigned int magic; + struct list_head all; + unsigned long ip; + unsigned long waiting_on; +#endif +}; + +void bch2_closure_sub(struct closure *cl, int v); +void __bch2_closure_wake_up(struct closure_waitlist *list); +bool bch2_closure_wait(struct closure_waitlist *list, struct closure *cl); +void __bch2_closure_sync(struct closure *cl); + +/* + * closure_put - decrement a closure's refcount + */ +static inline void closure_put(struct closure *cl) +{ + bch2_closure_sub(cl, 1); +} + +static inline unsigned closure_nr_remaining(struct closure *cl) +{ + return atomic_read(&cl->remaining) & CLOSURE_REMAINING_MASK; +} + +/** + * closure_sync - sleep until a closure a closure has nothing left to wait on + * + * Sleeps until the refcount hits 1 - the thread that's running the closure owns + * the last refcount. + */ +static inline void closure_sync(struct closure *cl) +{ + if (closure_nr_remaining(cl) > 1) + __bch2_closure_sync(cl); +} + +int __bch2_closure_sync_timeout(struct closure *cl, unsigned long timeout); + +static inline int closure_sync_timeout(struct closure *cl, unsigned long timeout) +{ + return closure_nr_remaining(cl) > 1 + ? 
__bch2_closure_sync_timeout(cl, timeout) + : 0; +} + +//#ifdef CONFIG_DEBUG_CLOSURES +#if 0 + +void bch2_closure_debug_create(struct closure *cl); +void closure_debug_destroy(struct closure *cl); + +#else + +static inline void bch2_closure_debug_create(struct closure *cl) {} +static inline void closure_debug_destroy(struct closure *cl) {} + +#endif + +static inline void closure_set_ip(struct closure *cl) +{ +#ifdef CONFIG_DEBUG_CLOSURES + cl->ip = _THIS_IP_; +#endif +} + +static inline void closure_set_ret_ip(struct closure *cl) +{ +#ifdef CONFIG_DEBUG_CLOSURES + cl->ip = _RET_IP_; +#endif +} + +static inline void closure_set_waiting(struct closure *cl, unsigned long f) +{ +#ifdef CONFIG_DEBUG_CLOSURES + cl->waiting_on = f; +#endif +} + +static inline void closure_set_stopped(struct closure *cl) +{ + atomic_sub(CLOSURE_RUNNING, &cl->remaining); +} + +static inline void set_closure_fn(struct closure *cl, closure_fn *fn, + struct workqueue_struct *wq) +{ + closure_set_ip(cl); + cl->fn = fn; + cl->wq = wq; +} + +static inline void closure_queue(struct closure *cl) +{ + struct workqueue_struct *wq = cl->wq; + /** + * Changes made to closure, work_struct, or a couple of other structs + * may cause work.func not pointing to the right location. + */ + BUILD_BUG_ON(offsetof(struct closure, fn) + != offsetof(struct work_struct, func)); + + if (wq) { + INIT_WORK(&cl->work, cl->work.func); + BUG_ON(!queue_work(wq, &cl->work)); + } else + cl->fn(&cl->work); +} + +/** + * closure_get - increment a closure's refcount + */ +static inline void closure_get(struct closure *cl) +{ +#ifdef CONFIG_DEBUG_CLOSURES + BUG_ON((atomic_inc_return(&cl->remaining) & + CLOSURE_REMAINING_MASK) <= 1); +#else + atomic_inc(&cl->remaining); +#endif +} + +/** + * closure_get_not_zero + */ +static inline bool closure_get_not_zero(struct closure *cl) +{ + unsigned old = atomic_read(&cl->remaining); + do { + if (!(old & CLOSURE_REMAINING_MASK)) + return false; + + } while (!atomic_try_cmpxchg_acquire(&cl->remaining, &old, old + 1)); + + return true; +} + +/** + * closure_init - Initialize a closure, setting the refcount to 1 + * @cl: closure to initialize + * @parent: parent of the new closure. cl will take a refcount on it for its + * lifetime; may be NULL. 
+ */ +static inline void closure_init(struct closure *cl, struct closure *parent) +{ + cl->fn = NULL; + cl->parent = parent; + if (parent) + closure_get(parent); + + atomic_set(&cl->remaining, CLOSURE_REMAINING_INITIALIZER); + + bch2_closure_debug_create(cl); + closure_set_ip(cl); +} + +static inline void closure_init_stack(struct closure *cl) +{ + memset(cl, 0, sizeof(struct closure)); + atomic_set(&cl->remaining, CLOSURE_REMAINING_INITIALIZER); +#ifdef CONFIG_DEBUG_CLOSURES + cl->magic = CLOSURE_MAGIC_STACK; +#endif +} + +static inline void closure_init_stack_release(struct closure *cl) +{ + memset(cl, 0, sizeof(struct closure)); + atomic_set_release(&cl->remaining, CLOSURE_REMAINING_INITIALIZER); +#ifdef CONFIG_DEBUG_CLOSURES + cl->magic = CLOSURE_MAGIC_STACK; +#endif +} + +/** + * closure_wake_up - wake up all closures on a wait list, + * with memory barrier + */ +static inline void closure_wake_up(struct closure_waitlist *list) +{ + /* Memory barrier for the wait list */ + smp_mb(); + __bch2_closure_wake_up(list); +} + +#define CLOSURE_CALLBACK(name) void name(struct work_struct *ws) +#define closure_type(name, type, member) \ + struct closure *cl = container_of(ws, struct closure, work); \ + type *name = container_of(cl, type, member) + +/** + * continue_at - jump to another function with barrier + * + * After @cl is no longer waiting on anything (i.e. all outstanding refs have + * been dropped with closure_put()), it will resume execution at @fn running out + * of @wq (or, if @wq is NULL, @fn will be called by closure_put() directly). + * + * This is because after calling continue_at() you no longer have a ref on @cl, + * and whatever @cl owns may be freed out from under you - a running closure fn + * has a ref on its own closure which continue_at() drops. + * + * Note you are expected to immediately return after using this macro. + */ +#define continue_at(_cl, _fn, _wq) \ +do { \ + set_closure_fn(_cl, _fn, _wq); \ + bch2_closure_sub(_cl, CLOSURE_RUNNING + 1); \ +} while (0) + +/** + * closure_return - finish execution of a closure + * + * This is used to indicate that @cl is finished: when all outstanding refs on + * @cl have been dropped @cl's ref on its parent closure (as passed to + * closure_init()) will be dropped, if one was specified - thus this can be + * thought of as returning to the parent closure. + */ +#define closure_return(_cl) continue_at((_cl), NULL, NULL) + +void bch2_closure_return_sync(struct closure *cl); + +/** + * continue_at_nobarrier - jump to another function without barrier + * + * Causes @fn to be executed out of @cl, in @wq context (or called directly if + * @wq is NULL). + * + * The ref the caller of continue_at_nobarrier() had on @cl is now owned by @fn, + * thus it's not safe to touch anything protected by @cl after a + * continue_at_nobarrier(). + */ +#define continue_at_nobarrier(_cl, _fn, _wq) \ +do { \ + set_closure_fn(_cl, _fn, _wq); \ + closure_queue(_cl); \ +} while (0) + +/** + * closure_return_with_destructor - finish execution of a closure, + * with destructor + * + * Works like closure_return(), except @destructor will be called when all + * outstanding refs on @cl have been dropped; @destructor may be used to safely + * free the memory occupied by @cl, and it is called with the ref on the parent + * closure still held - so @destructor could safely return an item to a + * freelist protected by @cl's parent. 
+ */ +#define closure_return_with_destructor(_cl, _destructor) \ +do { \ + set_closure_fn(_cl, _destructor, NULL); \ + bch2_closure_sub(_cl, CLOSURE_RUNNING - CLOSURE_DESTRUCTOR + 1); \ +} while (0) + +/** + * closure_call - execute @fn out of a new, uninitialized closure + * + * Typically used when running out of one closure, and we want to run @fn + * asynchronously out of a new closure - @parent will then wait for @cl to + * finish. + */ +static inline void closure_call(struct closure *cl, closure_fn fn, + struct workqueue_struct *wq, + struct closure *parent) +{ + closure_init(cl, parent); + continue_at_nobarrier(cl, fn, wq); +} + +#define __closure_wait_event(waitlist, _cond) \ +do { \ + struct closure cl; \ + \ + closure_init_stack(&cl); \ + \ + while (1) { \ + bch2_closure_wait(waitlist, &cl); \ + if (_cond) \ + break; \ + closure_sync(&cl); \ + } \ + closure_wake_up(waitlist); \ + closure_sync(&cl); \ +} while (0) + +#define closure_wait_event(waitlist, _cond) \ +do { \ + if (!(_cond)) \ + __closure_wait_event(waitlist, _cond); \ +} while (0) + +#define __closure_wait_event_timeout(waitlist, _cond, _until) \ +({ \ + struct closure cl; \ + long _t; \ + \ + closure_init_stack(&cl); \ + \ + while (1) { \ + bch2_closure_wait(waitlist, &cl); \ + if (_cond) { \ + _t = max_t(long, 1L, _until - jiffies); \ + break; \ + } \ + _t = max_t(long, 0L, _until - jiffies); \ + if (!_t) \ + break; \ + closure_sync_timeout(&cl, _t); \ + } \ + closure_wake_up(waitlist); \ + closure_sync(&cl); \ + _t; \ +}) + +/* + * Returns 0 if timeout expired, remaining time in jiffies (at least 1) if + * condition became true + */ +#define closure_wait_event_timeout(waitlist, _cond, _timeout) \ +({ \ + unsigned long _until = jiffies + _timeout; \ + (_cond) \ + ? max_t(long, 1L, _until - jiffies) \ + : __closure_wait_event_timeout(waitlist, _cond, _until);\ +}) + +#endif /* _LINUX_CLOSURE_H */ |
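
The long comment at the top of the vendored vendor/closure.h above documents the usage rules for this API: a closure starts with a refcount of 1 owned by the initializing thread, closure_get() takes an extra ref before each asynchronous submission, closure_put() drops it in the completion handler, and continue_at()/closure_return() hand the running thread's ref off rather than leaking it. As a quick illustration only, not part of this patch, here is a minimal sketch of that pattern against the vendored API; the foo_* names and the two-bio read are hypothetical, and the only calls assumed beyond this header are the standard block-layer helpers submit_bio(), bio_put() and blk_status_to_errno().

#include <linux/bio.h>
#include "vendor/closure.h"

/* Hypothetical example - not part of the patch above. */
struct foo_read {
	struct closure	cl;		/* counts everything still in flight */
	struct bio	*bio[2];
	int		err;
};

static void foo_endio(struct bio *bio)
{
	struct foo_read *rd = bio->bi_private;

	if (bio->bi_status)
		rd->err = blk_status_to_errno(bio->bi_status);
	bio_put(bio);

	/* Drop the ref taken before submission; the final put runs foo_read_done() */
	closure_put(&rd->cl);
}

static CLOSURE_CALLBACK(foo_read_done)
{
	closure_type(rd, struct foo_read, cl);

	if (rd->err)
		pr_err("foo read failed: %i\n", rd->err);

	/* Finished: drops our ref on the parent closure, if one was given */
	closure_return(cl);
}

static void foo_read_submit(struct foo_read *rd, struct closure *parent)
{
	closure_init(&rd->cl, parent);	/* refcount starts at 1, owned by us */
	rd->err = 0;

	for (unsigned i = 0; i < 2; i++) {
		closure_get(&rd->cl);	/* one ref per in-flight bio */
		rd->bio[i]->bi_end_io	= foo_endio;
		rd->bio[i]->bi_private	= rd;
		submit_bio(rd->bio[i]);
	}

	/*
	 * Hand our ref to foo_read_done(), run out of system_wq once the bios
	 * have dropped theirs; rd must not be touched past this point.
	 */
	continue_at(&rd->cl, foo_read_done, system_wq);
}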