From 90ef8b9f57c9114e82c41aef43db80776bbfaf82 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 18 Feb 2018 21:43:46 -0500 Subject: Update bcachefs sources to 90d78c2461 bcachefs: Option parsing for io targets --- .bcachefs_revision | 2 +- include/linux/sched/cputime.h | 6 + include/trace/events/bcachefs.h | 4 +- libbcachefs/alloc.c | 168 +++++++------------- libbcachefs/alloc.h | 2 +- libbcachefs/bcachefs.h | 30 ++-- libbcachefs/bcachefs_format.h | 27 +++- libbcachefs/btree_update_interior.c | 2 +- libbcachefs/chardev.c | 14 +- libbcachefs/compress.c | 70 ++++++--- libbcachefs/extents.c | 50 ++---- libbcachefs/extents.h | 2 +- libbcachefs/fs-io.c | 18 ++- libbcachefs/fs.c | 16 +- libbcachefs/io.c | 46 ++++-- libbcachefs/io.h | 11 +- libbcachefs/io_types.h | 3 +- libbcachefs/move.c | 150 ++++++++++-------- libbcachefs/move.h | 5 +- libbcachefs/movinggc.c | 12 +- libbcachefs/opts.c | 49 +++++- libbcachefs/opts.h | 35 ++++- libbcachefs/super-io.c | 206 ++++++++++++++++++++---- libbcachefs/super-io.h | 37 +++-- libbcachefs/super.c | 133 +++++++++++++--- libbcachefs/super.h | 2 + libbcachefs/super_types.h | 1 - libbcachefs/sysfs.c | 101 ++++++------ libbcachefs/tier.c | 306 ++++++++++++++++++++++-------------- libbcachefs/tier.h | 21 ++- libbcachefs/xattr.c | 22 +-- 31 files changed, 978 insertions(+), 573 deletions(-) create mode 100644 include/linux/sched/cputime.h diff --git a/.bcachefs_revision b/.bcachefs_revision index 76acdf93..d29d45d4 100644 --- a/.bcachefs_revision +++ b/.bcachefs_revision @@ -1 +1 @@ -e99d29e40210f6d9b7ec9e5b7aee1e48ae7655c5 +90d78c246188f4e90bd9ceb29fe95186b7dc680d diff --git a/include/linux/sched/cputime.h b/include/linux/sched/cputime.h new file mode 100644 index 00000000..a89c626f --- /dev/null +++ b/include/linux/sched/cputime.h @@ -0,0 +1,6 @@ + +static inline void task_cputime_adjusted(struct task_struct *p, u64 *utime, u64 *stime) +{ + *utime = 0; + *stime = 0; +} diff --git a/include/trace/events/bcachefs.h b/include/trace/events/bcachefs.h index d132dd8a..a7be2d82 100644 --- a/include/trace/events/bcachefs.h +++ b/include/trace/events/bcachefs.h @@ -49,15 +49,13 @@ DECLARE_EVENT_CLASS(bch_dev, TP_STRUCT__entry( __array(char, uuid, 16 ) - __field(unsigned, tier ) ), TP_fast_assign( memcpy(__entry->uuid, ca->uuid.b, 16); - __entry->tier = ca->mi.tier; ), - TP_printk("%pU tier %u", __entry->uuid, __entry->tier) + TP_printk("%pU", __entry->uuid) ); DECLARE_EVENT_CLASS(bch_fs, diff --git a/libbcachefs/alloc.c b/libbcachefs/alloc.c index 339ffd02..a76f2b7c 100644 --- a/libbcachefs/alloc.c +++ b/libbcachefs/alloc.c @@ -89,69 +89,29 @@ static void pd_controllers_update(struct work_struct *work) struct bch_fs, pd_controllers_update); struct bch_dev *ca; - unsigned i, iter; - - /* All units are in bytes */ - u64 faster_tiers_size = 0; - u64 faster_tiers_dirty = 0; - - u64 copygc_can_free = 0; - - rcu_read_lock(); - for (i = 0; i < ARRAY_SIZE(c->tiers); i++) { - bch2_pd_controller_update(&c->tiers[i].pd, - div_u64(faster_tiers_size * - c->tiering_percent, 100), - faster_tiers_dirty, - -1); - - for_each_member_device_rcu(ca, c, iter, &c->tiers[i].devs) { - struct bch_dev_usage stats = bch2_dev_usage_read(c, ca); - - u64 size = bucket_to_sector(ca, ca->mi.nbuckets - - ca->mi.first_bucket) << 9; - u64 dirty = bucket_to_sector(ca, - stats.buckets[BCH_DATA_USER]) << 9; - u64 free = bucket_to_sector(ca, - __dev_buckets_free(ca, stats)) << 9; - /* - * Bytes of internal fragmentation, which can be - * reclaimed by copy GC - */ - s64 fragmented = (bucket_to_sector(ca, 
- stats.buckets[BCH_DATA_USER] + - stats.buckets[BCH_DATA_CACHED]) - - (stats.sectors[BCH_DATA_USER] + - stats.sectors[BCH_DATA_CACHED])) << 9; + unsigned i; - fragmented = max(0LL, fragmented); + for_each_member_device(ca, c, i) { + struct bch_dev_usage stats = bch2_dev_usage_read(c, ca); - bch2_pd_controller_update(&ca->copygc_pd, - free, fragmented, -1); + u64 free = bucket_to_sector(ca, + __dev_buckets_free(ca, stats)) << 9; + /* + * Bytes of internal fragmentation, which can be + * reclaimed by copy GC + */ + s64 fragmented = (bucket_to_sector(ca, + stats.buckets[BCH_DATA_USER] + + stats.buckets[BCH_DATA_CACHED]) - + (stats.sectors[BCH_DATA_USER] + + stats.sectors[BCH_DATA_CACHED])) << 9; - faster_tiers_size += size; - faster_tiers_dirty += dirty; + fragmented = max(0LL, fragmented); - copygc_can_free += fragmented; - } + bch2_pd_controller_update(&ca->copygc_pd, + free, fragmented, -1); } - rcu_read_unlock(); - - /* - * Throttle foreground writes if tier 0 is running out of free buckets, - * and either tiering or copygc can free up space. - * - * Target will be small if there isn't any work to do - we don't want to - * throttle foreground writes if we currently have all the free space - * we're ever going to have. - * - * Otherwise, if there's work to do, try to keep 20% of tier0 available - * for foreground writes. - */ - if (c->fastest_tier) - copygc_can_free = U64_MAX; - schedule_delayed_work(&c->pd_controllers_update, c->pd_controllers_update_seconds * HZ); } @@ -1201,22 +1161,14 @@ out: return ob - c->open_buckets; } -static int __dev_alloc_cmp(struct bch_fs *c, - struct write_point *wp, +static int __dev_alloc_cmp(struct write_point *wp, unsigned l, unsigned r) { - struct bch_dev *ca_l = rcu_dereference(c->devs[l]); - struct bch_dev *ca_r = rcu_dereference(c->devs[r]); - - if (ca_l && ca_r && ca_l->mi.tier != ca_r->mi.tier) - return ((ca_l->mi.tier > ca_r->mi.tier) - - (ca_l->mi.tier < ca_r->mi.tier)); - return ((wp->next_alloc[l] > wp->next_alloc[r]) - (wp->next_alloc[l] < wp->next_alloc[r])); } -#define dev_alloc_cmp(l, r) __dev_alloc_cmp(c, wp, l, r) +#define dev_alloc_cmp(l, r) __dev_alloc_cmp(wp, l, r) struct dev_alloc_list bch2_wp_alloc_list(struct bch_fs *c, struct write_point *wp, @@ -1355,7 +1307,7 @@ static int bch2_bucket_alloc_set(struct bch_fs *c, struct write_point *wp, static void writepoint_drop_ptrs(struct bch_fs *c, struct write_point *wp, - struct bch_devs_mask *devs, + u16 target, bool in_target, unsigned nr_ptrs_dislike) { int i; @@ -1367,7 +1319,8 @@ static void writepoint_drop_ptrs(struct bch_fs *c, struct open_bucket *ob = wp->ptrs[i]; struct bch_dev *ca = bch_dev_bkey_exists(c, ob->ptr.dev); - if (nr_ptrs_dislike && !test_bit(ob->ptr.dev, devs->d)) { + if (nr_ptrs_dislike && + dev_in_target(ca, target) == in_target) { BUG_ON(ca->open_buckets_partial_nr >= ARRAY_SIZE(ca->open_buckets_partial)); @@ -1401,7 +1354,7 @@ static void verify_not_stale(struct bch_fs *c, const struct write_point *wp) } static int open_bucket_add_buckets(struct bch_fs *c, - struct bch_devs_mask *_devs, + u16 target, struct write_point *wp, struct bch_devs_list *devs_have, unsigned nr_replicas, @@ -1422,8 +1375,15 @@ static int open_bucket_add_buckets(struct bch_fs *c, writepoint_for_each_ptr(wp, ob, i) __clear_bit(ob->ptr.dev, devs.d); - if (_devs) - bitmap_and(devs.d, devs.d, _devs->d, BCH_SB_MEMBERS_MAX); + if (target) { + const struct bch_devs_mask *t; + + rcu_read_lock(); + t = bch2_target_to_mask(c, target); + if (t) + bitmap_and(devs.d, devs.d, t->d, BCH_SB_MEMBERS_MAX); + 
rcu_read_unlock(); + } return bch2_bucket_alloc_set(c, wp, nr_replicas, reserve, &devs, cl); } @@ -1503,7 +1463,7 @@ out: * Get us an open_bucket we can allocate from, return with it locked: */ struct write_point *bch2_alloc_sectors_start(struct bch_fs *c, - struct bch_devs_mask *devs, + unsigned target, struct write_point_specifier write_point, struct bch_devs_list *devs_have, unsigned nr_replicas, @@ -1525,17 +1485,27 @@ struct write_point *bch2_alloc_sectors_start(struct bch_fs *c, writepoint_for_each_ptr(wp, ob, i) if (bch2_dev_list_has_dev(*devs_have, ob->ptr.dev)) nr_ptrs_have++; - else if (devs && !test_bit(ob->ptr.dev, devs->d)) + else if (!dev_in_target(c->devs[ob->ptr.dev], target)) nr_ptrs_dislike++; - ret = open_bucket_add_buckets(c, devs, wp, devs_have, + ret = open_bucket_add_buckets(c, target, wp, devs_have, nr_replicas + nr_ptrs_have + nr_ptrs_dislike, reserve, cl); if (ret && ret != -EROFS) goto err; - if (wp->nr_ptrs < - nr_ptrs_have + nr_ptrs_dislike + nr_replicas_required) { + if (flags & BCH_WRITE_ONLY_SPECIFIED_DEVS) + goto alloc_done; + + ret = open_bucket_add_buckets(c, target, wp, devs_have, + nr_replicas + nr_ptrs_have, + reserve, cl); + if (ret && ret != -EROFS) + goto err; +alloc_done: + if (wp->nr_ptrs - nr_ptrs_have - + ((flags & BCH_WRITE_ONLY_SPECIFIED_DEVS) ? nr_ptrs_dislike : 0) + < nr_replicas_required) { ret = -EROFS; goto err; } @@ -1545,7 +1515,7 @@ struct write_point *bch2_alloc_sectors_start(struct bch_fs *c, 0, nr_ptrs_dislike); /* Remove pointers we don't want to use: */ - writepoint_drop_ptrs(c, wp, devs, nr_ptrs_dislike); + writepoint_drop_ptrs(c, wp, target, false, nr_ptrs_dislike); /* * Move pointers to devices we already have to end of open bucket @@ -1637,7 +1607,6 @@ void bch2_alloc_sectors_done(struct bch_fs *c, struct write_point *wp) void bch2_recalc_capacity(struct bch_fs *c) { - struct bch_tier *fastest_tier = NULL, *slowest_tier = NULL, *tier; struct bch_dev *ca; u64 total_capacity, capacity = 0, reserved_sectors = 0; unsigned long ra_pages = 0; @@ -1653,28 +1622,7 @@ void bch2_recalc_capacity(struct bch_fs *c) bch2_set_ra_pages(c, ra_pages); - /* Find fastest, slowest tiers with devices: */ - - for (tier = c->tiers; - tier < c->tiers + ARRAY_SIZE(c->tiers); tier++) { - if (!dev_mask_nr(&tier->devs)) - continue; - if (!fastest_tier) - fastest_tier = tier; - slowest_tier = tier; - } - - c->fastest_tier = fastest_tier != slowest_tier ? fastest_tier : NULL; - c->fastest_devs = fastest_tier != slowest_tier ? &fastest_tier->devs : NULL; - - if (!fastest_tier) - goto set_capacity; - - /* - * Capacity of the filesystem is the capacity of all the devices in the - * slowest (highest) tier - we don't include lower tier devices. 
- */ - for_each_member_device_rcu(ca, c, i, &slowest_tier->devs) { + for_each_rw_member(ca, c, i) { size_t reserve = 0; /* @@ -1700,16 +1648,14 @@ void bch2_recalc_capacity(struct bch_fs *c) reserve += ARRAY_SIZE(c->write_points); - if (ca->mi.tier) - reserve += 1; /* tiering write point */ - reserve += 1; /* btree write point */ + reserve += 1; /* btree write point */ reserved_sectors += bucket_to_sector(ca, reserve); capacity += bucket_to_sector(ca, ca->mi.nbuckets - ca->mi.first_bucket); } -set_capacity: + total_capacity = capacity; capacity *= (100 - c->opts.gc_reserve_percent); @@ -1745,7 +1691,8 @@ static void bch2_stop_write_point(struct bch_fs *c, struct bch_dev *ca, bitmap_complement(not_self.d, ca->self.d, BCH_SB_MEMBERS_MAX); mutex_lock(&wp->lock); - writepoint_drop_ptrs(c, wp, ¬_self, wp->nr_ptrs); + writepoint_drop_ptrs(c, wp, dev_to_target(ca->dev_idx), + true, wp->nr_ptrs); mutex_unlock(&wp->lock); } @@ -1776,7 +1723,6 @@ void bch2_dev_allocator_remove(struct bch_fs *c, struct bch_dev *ca) /* First, remove device from allocation groups: */ - clear_bit(ca->dev_idx, c->tiers[ca->mi.tier].devs.d); for (i = 0; i < ARRAY_SIZE(c->rw_devs); i++) clear_bit(ca->dev_idx, c->rw_devs[i].d); @@ -1790,7 +1736,7 @@ void bch2_dev_allocator_remove(struct bch_fs *c, struct bch_dev *ca) bch2_stop_write_point(c, ca, &c->write_points[i]); bch2_stop_write_point(c, ca, &ca->copygc_write_point); - bch2_stop_write_point(c, ca, &c->tiers[ca->mi.tier].wp); + bch2_stop_write_point(c, ca, &c->rebalance_write_point); bch2_stop_write_point(c, ca, &c->btree_write_point); mutex_lock(&c->btree_reserve_cache_lock); @@ -1828,7 +1774,6 @@ void bch2_dev_allocator_add(struct bch_fs *c, struct bch_dev *ca) for (i = 0; i < ARRAY_SIZE(c->rw_devs); i++) if (ca->mi.data_allowed & (1 << i)) set_bit(ca->dev_idx, c->rw_devs[i].d); - set_bit(ca->dev_idx, c->tiers[ca->mi.tier].devs.d); } /* stop allocator thread: */ @@ -2059,7 +2004,6 @@ void bch2_fs_allocator_init(struct bch_fs *c) { struct open_bucket *ob; struct write_point *wp; - unsigned i; mutex_init(&c->write_points_hash_lock); spin_lock_init(&c->freelist_lock); @@ -2079,9 +2023,7 @@ void bch2_fs_allocator_init(struct bch_fs *c) } writepoint_init(&c->btree_write_point, BCH_DATA_BTREE); - - for (i = 0; i < ARRAY_SIZE(c->tiers); i++) - writepoint_init(&c->tiers[i].wp, BCH_DATA_USER); + writepoint_init(&c->rebalance_write_point, BCH_DATA_USER); for (wp = c->write_points; wp < c->write_points + ARRAY_SIZE(c->write_points); wp++) { diff --git a/libbcachefs/alloc.h b/libbcachefs/alloc.h index 3bdc2946..5b589223 100644 --- a/libbcachefs/alloc.h +++ b/libbcachefs/alloc.h @@ -66,7 +66,7 @@ static inline void bch2_open_bucket_get(struct bch_fs *c, } struct write_point *bch2_alloc_sectors_start(struct bch_fs *, - struct bch_devs_mask *, + unsigned, struct write_point_specifier, struct bch_devs_list *, unsigned, unsigned, diff --git a/libbcachefs/bcachefs.h b/libbcachefs/bcachefs.h index 5a3e99b3..75f3a006 100644 --- a/libbcachefs/bcachefs.h +++ b/libbcachefs/bcachefs.h @@ -408,6 +408,8 @@ struct bch_dev { struct bch_pd_controller copygc_pd; struct write_point copygc_write_point; + atomic64_t rebalance_work; + struct journal_device journal; struct work_struct io_error_work; @@ -458,15 +460,6 @@ struct btree_debug { struct dentry *failed; }; -struct bch_tier { - unsigned idx; - struct task_struct *migrate; - struct bch_pd_controller pd; - - struct bch_devs_mask devs; - struct write_point wp; -}; - enum bch_fs_state { BCH_FS_STARTING = 0, BCH_FS_STOPPING, @@ -522,6 +515,7 @@ 
struct bch_fs { u64 time_base_lo; u32 time_base_hi; u32 time_precision; + u64 features; } sb; struct bch_sb *disk_sb; @@ -569,16 +563,13 @@ struct bch_fs { struct delayed_work pd_controllers_update; unsigned pd_controllers_update_seconds; + /* REBALANCE */ + struct task_struct *rebalance_thread; + struct bch_pd_controller rebalance_pd; + + atomic64_t rebalance_work_unknown_dev; - /* - * These contain all r/w devices - i.e. devices we can currently - * allocate from: - */ struct bch_devs_mask rw_devs[BCH_DATA_NR]; - struct bch_tier tiers[BCH_TIER_MAX]; - /* NULL if we only have devices in one tier: */ - struct bch_devs_mask *fastest_devs; - struct bch_tier *fastest_tier; u64 capacity; /* sectors */ @@ -615,6 +606,7 @@ struct bch_fs { struct open_bucket open_buckets[OPEN_BUCKETS_COUNT]; struct write_point btree_write_point; + struct write_point rebalance_write_point; struct write_point write_points[WRITE_POINT_COUNT]; struct hlist_head write_points_hash[WRITE_POINT_COUNT]; @@ -717,8 +709,8 @@ struct bch_fs { unsigned btree_gc_periodic:1; unsigned copy_gc_enabled:1; - unsigned tiering_enabled:1; - unsigned tiering_percent; + unsigned rebalance_enabled:1; + unsigned rebalance_percent; #define BCH_DEBUG_PARAM(name, description) bool name; BCH_DEBUG_PARAMS_ALL() diff --git a/libbcachefs/bcachefs_format.h b/libbcachefs/bcachefs_format.h index 5e406275..0f2c9cec 100644 --- a/libbcachefs/bcachefs_format.h +++ b/libbcachefs/bcachefs_format.h @@ -608,12 +608,22 @@ BKEY_VAL_TYPE(inode_generation, BCH_INODE_GENERATION); BCH_INODE_FIELD(bi_dev, 32) \ BCH_INODE_FIELD(bi_data_checksum, 8) \ BCH_INODE_FIELD(bi_compression, 8) \ - BCH_INODE_FIELD(bi_project, 32) + BCH_INODE_FIELD(bi_project, 32) \ + BCH_INODE_FIELD(bi_background_compression, 8) \ + BCH_INODE_FIELD(bi_data_replicas, 8) \ + BCH_INODE_FIELD(bi_promote_target, 16) \ + BCH_INODE_FIELD(bi_foreground_target, 16) \ + BCH_INODE_FIELD(bi_background_target, 16) #define BCH_INODE_FIELDS_INHERIT() \ BCH_INODE_FIELD(bi_data_checksum) \ BCH_INODE_FIELD(bi_compression) \ - BCH_INODE_FIELD(bi_project) + BCH_INODE_FIELD(bi_project) \ + BCH_INODE_FIELD(bi_background_compression) \ + BCH_INODE_FIELD(bi_data_replicas) \ + BCH_INODE_FIELD(bi_promote_target) \ + BCH_INODE_FIELD(bi_foreground_target) \ + BCH_INODE_FIELD(bi_background_target) enum { /* @@ -814,13 +824,14 @@ struct bch_member { }; LE64_BITMASK(BCH_MEMBER_STATE, struct bch_member, flags[0], 0, 4) -LE64_BITMASK(BCH_MEMBER_TIER, struct bch_member, flags[0], 4, 8) -/* 8-10 unused, was HAS_(META)DATA */ +/* 4-10 unused, was TIER, HAS_(META)DATA */ LE64_BITMASK(BCH_MEMBER_REPLACEMENT, struct bch_member, flags[0], 10, 14) LE64_BITMASK(BCH_MEMBER_DISCARD, struct bch_member, flags[0], 14, 15) LE64_BITMASK(BCH_MEMBER_DATA_ALLOWED, struct bch_member, flags[0], 15, 20) LE64_BITMASK(BCH_MEMBER_GROUP, struct bch_member, flags[0], 20, 28) +#define BCH_TIER_MAX 4U + #if 0 LE64_BITMASK(BCH_MEMBER_NR_READ_ERRORS, struct bch_member, flags[1], 0, 20); LE64_BITMASK(BCH_MEMBER_NR_WRITE_ERRORS,struct bch_member, flags[1], 20, 40); @@ -834,8 +845,6 @@ enum bch_member_state { BCH_MEMBER_STATE_NR = 4, }; -#define BCH_TIER_MAX 4U - enum cache_replacement { CACHE_REPLACEMENT_LRU = 0, CACHE_REPLACEMENT_FIFO = 1, @@ -1077,6 +1086,12 @@ LE64_BITMASK(BCH_SB_ENCODED_EXTENT_MAX_BITS, LE64_BITMASK(BCH_SB_META_REPLICAS_REQ, struct bch_sb, flags[1], 20, 24); LE64_BITMASK(BCH_SB_DATA_REPLICAS_REQ, struct bch_sb, flags[1], 24, 28); +LE64_BITMASK(BCH_SB_BACKGROUND_COMPRESSION_TYPE, + struct bch_sb, flags[1], 28, 32); + 
+LE64_BITMASK(BCH_SB_PROMOTE_TARGET, struct bch_sb, flags[1], 28, 40); +LE64_BITMASK(BCH_SB_FOREGROUND_TARGET, struct bch_sb, flags[1], 40, 52); +LE64_BITMASK(BCH_SB_BACKGROUND_TARGET, struct bch_sb, flags[1], 52, 64); /* Features: */ enum bch_sb_features { diff --git a/libbcachefs/btree_update_interior.c b/libbcachefs/btree_update_interior.c index 0e0156d9..f42239da 100644 --- a/libbcachefs/btree_update_interior.c +++ b/libbcachefs/btree_update_interior.c @@ -348,7 +348,7 @@ static struct btree *__bch2_btree_node_alloc(struct bch_fs *c, mutex_unlock(&c->btree_reserve_cache_lock); retry: - wp = bch2_alloc_sectors_start(c, NULL, + wp = bch2_alloc_sectors_start(c, c->opts.foreground_target, writepoint_ptr(&c->btree_write_point), &devs_have, res->nr_replicas, diff --git a/libbcachefs/chardev.c b/libbcachefs/chardev.c index 5ff90cc0..ab6dc665 100644 --- a/libbcachefs/chardev.c +++ b/libbcachefs/chardev.c @@ -40,27 +40,15 @@ static struct bch_dev *bch2_device_lookup(struct bch_fs *c, u64 dev, if (!ca) return ERR_PTR(-EINVAL); } else { - struct block_device *bdev; char *path; - unsigned i; path = strndup_user((const char __user *) (unsigned long) dev, PATH_MAX); if (IS_ERR(path)) return ERR_CAST(path); - bdev = lookup_bdev(path); + ca = bch2_dev_lookup(c, path); kfree(path); - if (IS_ERR(bdev)) - return ERR_CAST(bdev); - - for_each_member_device(ca, c, i) - if (ca->disk_sb.bdev == bdev) - goto found; - - ca = ERR_PTR(-ENOENT); -found: - bdput(bdev); } return ca; diff --git a/libbcachefs/compress.c b/libbcachefs/compress.c index 7726cfd8..18c94598 100644 --- a/libbcachefs/compress.c +++ b/libbcachefs/compress.c @@ -360,6 +360,9 @@ static unsigned __bio_compress(struct bch_fs *c, unsigned pad; int ret = 0; + BUG_ON(compression_type >= BCH_COMPRESSION_NR); + BUG_ON(!mempool_initialized(&c->compress_workspace[compression_type])); + /* If it's only one block, don't bother trying to compress: */ if (bio_sectors(src) <= c->opts.block_size) return 0; @@ -465,6 +468,8 @@ unsigned bch2_bio_compress(struct bch_fs *c, return compression_type; } +static int __bch2_fs_compress_init(struct bch_fs *, u64); + #define BCH_FEATURE_NONE 0 static const unsigned bch2_compression_opt_to_feature[] = { @@ -475,29 +480,42 @@ static const unsigned bch2_compression_opt_to_feature[] = { #undef BCH_FEATURE_NONE -/* doesn't write superblock: */ -int bch2_check_set_has_compressed_data(struct bch_fs *c, - unsigned compression_type) +int __bch2_check_set_has_compressed_data(struct bch_fs *c, u64 f) { - unsigned f; int ret = 0; - pr_verbose_init(c->opts, ""); + if ((c->sb.features & f) == f) + return 0; - BUG_ON(compression_type >= ARRAY_SIZE(bch2_compression_opt_to_feature)); + mutex_lock(&c->sb_lock); - if (!compression_type) - goto out; + if ((c->sb.features & f) == f) { + mutex_unlock(&c->sb_lock); + return 0; + } - f = bch2_compression_opt_to_feature[compression_type]; - if (bch2_sb_test_feature(c->disk_sb, f)) - goto out; + ret = __bch2_fs_compress_init(c, c->sb.features|f); + if (ret) { + mutex_unlock(&c->sb_lock); + return ret; + } - bch2_sb_set_feature(c->disk_sb, f); - ret = bch2_fs_compress_init(c); -out: - pr_verbose_init(c->opts, "ret %i", ret); - return ret; + c->disk_sb->features[0] |= cpu_to_le64(f); + bch2_write_super(c); + mutex_unlock(&c->sb_lock); + + return 0; +} + +int bch2_check_set_has_compressed_data(struct bch_fs *c, + unsigned compression_type) +{ + BUG_ON(compression_type >= ARRAY_SIZE(bch2_compression_opt_to_feature)); + + return compression_type + ? 
__bch2_check_set_has_compressed_data(c, + 1ULL << bch2_compression_opt_to_feature[compression_type]) + : 0; } void bch2_fs_compress_exit(struct bch_fs *c) @@ -531,7 +549,7 @@ static int mempool_init_kvpmalloc_pool(mempool_t *pool, int min_nr, size_t size) : 0; } -int bch2_fs_compress_init(struct bch_fs *c) +static int __bch2_fs_compress_init(struct bch_fs *c, u64 features) { size_t max_extent = c->sb.encoded_extent_max << 9; size_t order = get_order(max_extent); @@ -561,7 +579,7 @@ int bch2_fs_compress_init(struct bch_fs *c) for (i = compression_types; i < compression_types + ARRAY_SIZE(compression_types); i++) - if (bch2_sb_test_feature(c->disk_sb, i->feature)) + if (features & (1 << i->feature)) goto have_compressed; goto out; @@ -587,7 +605,7 @@ have_compressed: decompress_workspace_size = max(decompress_workspace_size, i->decompress_workspace); - if (!bch2_sb_test_feature(c->disk_sb, i->feature)) + if (!(features & (1 << i->feature))) continue; if (i->decompress_workspace) @@ -609,3 +627,17 @@ out: pr_verbose_init(c->opts, "ret %i", ret); return ret; } + +int bch2_fs_compress_init(struct bch_fs *c) +{ + u64 f = c->sb.features; + + if (c->opts.compression) + f |= 1ULL << bch2_compression_opt_to_feature[c->opts.compression]; + + if (c->opts.background_compression) + f |= 1ULL << bch2_compression_opt_to_feature[c->opts.background_compression]; + + return __bch2_fs_compress_init(c, f); + +} diff --git a/libbcachefs/extents.c b/libbcachefs/extents.c index ce1f8ba2..37470f86 100644 --- a/libbcachefs/extents.c +++ b/libbcachefs/extents.c @@ -1766,7 +1766,6 @@ static void bch2_extent_debugcheck_extent(struct bch_fs *c, struct btree *b, unsigned seq, stale; char buf[160]; bool bad; - unsigned ptrs_per_tier[BCH_TIER_MAX]; unsigned replicas = 0; /* @@ -1778,12 +1777,9 @@ static void bch2_extent_debugcheck_extent(struct bch_fs *c, struct btree *b, * going to get overwritten during replay) */ - memset(ptrs_per_tier, 0, sizeof(ptrs_per_tier)); - extent_for_each_ptr(e, ptr) { ca = bch_dev_bkey_exists(c, ptr->dev); replicas++; - ptrs_per_tier[ca->mi.tier]++; /* * If journal replay hasn't finished, we might be seeing keys @@ -1886,12 +1882,6 @@ static void bch2_extent_to_text(struct bch_fs *c, char *buf, #undef p } -static unsigned PTR_TIER(struct bch_fs *c, - const struct bch_extent_ptr *ptr) -{ - return bch_dev_bkey_exists(c, ptr->dev)->mi.tier; -} - static void bch2_extent_crc_init(union bch_extent_crc *crc, struct bch_extent_crc_unpacked new) { @@ -2014,45 +2004,31 @@ bool bch2_extent_normalize(struct bch_fs *c, struct bkey_s k) void bch2_extent_mark_replicas_cached(struct bch_fs *c, struct bkey_s_extent e, - unsigned nr_desired_replicas) + unsigned nr_desired_replicas, + unsigned target) { struct bch_extent_ptr *ptr; - unsigned tier = 0, nr_cached = 0; - unsigned nr_good = bch2_extent_nr_good_ptrs(c, e.c); - bool have_higher_tier; + unsigned nr_cached = 0, nr_good = bch2_extent_nr_good_ptrs(c, e.c); if (nr_good <= nr_desired_replicas) return; nr_cached = nr_good - nr_desired_replicas; - do { - have_higher_tier = false; - - extent_for_each_ptr(e, ptr) { - if (!ptr->cached && - PTR_TIER(c, ptr) == tier) { - ptr->cached = true; - nr_cached--; - if (!nr_cached) - return; - } - - if (PTR_TIER(c, ptr) > tier) - have_higher_tier = true; + extent_for_each_ptr(e, ptr) + if (!ptr->cached && + !dev_in_target(c->devs[ptr->dev], target)) { + ptr->cached = true; + nr_cached--; + if (!nr_cached) + return; } - - tier++; - } while (have_higher_tier); } /* - * This picks a non-stale pointer, preferabbly from a 
device other than - * avoid. Avoid can be NULL, meaning pick any. If there are no non-stale - * pointers to other devices, it will still pick a pointer from avoid. - * Note that it prefers lowered-numbered pointers to higher-numbered pointers - * as the pointers are sorted by tier, hence preferring pointers to tier 0 - * rather than pointers to tier 1. + * This picks a non-stale pointer, preferably from a device other than @avoid. + * Avoid can be NULL, meaning pick any. If there are no non-stale pointers to + * other devices, it will still pick a pointer from avoid. */ void bch2_extent_pick_ptr(struct bch_fs *c, struct bkey_s_c k, struct bch_devs_mask *avoid, diff --git a/libbcachefs/extents.h b/libbcachefs/extents.h index 75579273..83c0f24d 100644 --- a/libbcachefs/extents.h +++ b/libbcachefs/extents.h @@ -39,7 +39,7 @@ bch2_insert_fixup_extent(struct btree_insert *, bool bch2_extent_normalize(struct bch_fs *, struct bkey_s); void bch2_extent_mark_replicas_cached(struct bch_fs *, struct bkey_s_extent, - unsigned); + unsigned, unsigned); const struct bch_extent_ptr * bch2_extent_has_device(struct bkey_s_c_extent, unsigned); diff --git a/libbcachefs/fs-io.c b/libbcachefs/fs-io.c index 00475b99..46cffc5c 100644 --- a/libbcachefs/fs-io.c +++ b/libbcachefs/fs-io.c @@ -504,10 +504,8 @@ static inline void bch2_fswrite_op_init(struct bchfs_write_op *op, op->unalloc = false; op->new_i_size = U64_MAX; - bch2_write_op_init(&op->op, c); - op->op.csum_type = bch2_data_checksum_type(c, opts.data_checksum); - op->op.compression_type = bch2_compression_opt_to_type[opts.compression]; - op->op.devs = c->fastest_devs; + bch2_write_op_init(&op->op, c, opts); + op->op.target = opts.foreground_target; op->op.index_update_fn = bchfs_write_index_update; op_journal_seq_set(&op->op, &inode->ei_journal_seq); } @@ -615,8 +613,14 @@ static int bch2_get_page_reservation(struct bch_fs *c, struct bch_inode_info *in struct page *page, bool check_enospc) { struct bch_page_state *s = page_state(page), new, old; + + /* XXX: this should not be open coded */ + unsigned nr_replicas = inode->ei_inode.bi_data_replicas + ? 
inode->ei_inode.bi_data_replicas - 1 + : c->opts.data_replicas; + struct disk_reservation disk_res = bch2_disk_reservation_init(c, - READ_ONCE(c->opts.data_replicas)); + nr_replicas); struct quota_res quota_res = { 0 }; int ret = 0; @@ -1894,7 +1898,7 @@ static int bch2_direct_IO_write(struct kiocb *req, goto err; ret = bch2_disk_reservation_get(c, &dio->iop.op.res, iter->count >> 9, - c->opts.data_replicas, 0); + dio->iop.op.opts.data_replicas, 0); if (unlikely(ret)) { if (bch2_check_range_allocated(c, POS(inode->v.i_ino, offset >> 9), @@ -2351,7 +2355,7 @@ static long bch2_fallocate(struct bch_inode_info *inode, int mode, loff_t block_start, block_end; loff_t end = offset + len; unsigned sectors; - unsigned replicas = READ_ONCE(c->opts.data_replicas); + unsigned replicas = io_opts(c, inode).data_replicas; int ret; bch2_btree_iter_init(&iter, c, BTREE_ID_EXTENTS, POS_MIN, diff --git a/libbcachefs/fs.c b/libbcachefs/fs.c index 80962b5d..c7e842ee 100644 --- a/libbcachefs/fs.c +++ b/libbcachefs/fs.c @@ -1266,6 +1266,7 @@ static int bch2_show_options(struct seq_file *seq, struct dentry *root) { struct bch_fs *c = root->d_sb->s_fs_info; enum bch_opt_id i; + char buf[512]; for (i = 0; i < bch2_opts_nr; i++) { const struct bch_option *opt = &bch2_opt_table[i]; @@ -1277,17 +1278,10 @@ static int bch2_show_options(struct seq_file *seq, struct dentry *root) if (v == bch2_opt_get_by_id(&bch2_opts_default, i)) continue; - switch (opt->type) { - case BCH_OPT_BOOL: - seq_printf(seq, ",%s%s", v ? "" : "no", opt->attr.name); - break; - case BCH_OPT_UINT: - seq_printf(seq, ",%s=%llu", opt->attr.name, v); - break; - case BCH_OPT_STR: - seq_printf(seq, ",%s=%s", opt->attr.name, opt->choices[v]); - break; - } + bch2_opt_to_text(c, buf, sizeof(buf), opt, v, + OPT_SHOW_MOUNT_STYLE); + seq_putc(seq, ','); + seq_puts(seq, buf); } return 0; diff --git a/libbcachefs/io.c b/libbcachefs/io.c index 13495d48..6624d8af 100644 --- a/libbcachefs/io.c +++ b/libbcachefs/io.c @@ -22,6 +22,7 @@ #include "move.h" #include "super.h" #include "super-io.h" +#include "tier.h" #include #include @@ -220,9 +221,9 @@ int bch2_write_index_default(struct bch_write_op *op) BTREE_ITER_INTENT); ret = bch2_btree_insert_list_at(&iter, keys, &op->res, - NULL, op_journal_seq(op), - BTREE_INSERT_NOFAIL| - BTREE_INSERT_USE_RESERVE); + NULL, op_journal_seq(op), + BTREE_INSERT_NOFAIL| + BTREE_INSERT_USE_RESERVE); bch2_btree_iter_unlock(&iter); return ret; @@ -238,7 +239,7 @@ static void bch2_write_index(struct closure *cl) struct keylist *keys = &op->insert_keys; struct bkey_s_extent e; struct bch_extent_ptr *ptr; - struct bkey_i *src, *dst = keys->keys, *n; + struct bkey_i *src, *dst = keys->keys, *n, *k; int ret; op->flags |= BCH_WRITE_LOOPED; @@ -268,6 +269,14 @@ static void bch2_write_index(struct closure *cl) keys->top = dst; + /* + * probably not the ideal place to hook this in, but I don't + * particularly want to plumb io_opts all the way through the btree + * update stack right now + */ + for_each_keylist_key(keys, k) + bch2_rebalance_add_key(c, bkey_i_to_s_c(k), &op->opts); + if (!bch2_keylist_empty(keys)) { u64 sectors_start = keylist_sectors(keys); int ret = op->index_update_fn(op); @@ -735,7 +744,7 @@ static void __bch2_write(struct closure *cl) continue_at(cl, bch2_write_index, index_update_wq(op)); wp = bch2_alloc_sectors_start(c, - op->devs, + op->target, op->write_point, &op->devs_have, op->nr_replicas, @@ -935,29 +944,32 @@ static struct promote_op *promote_alloc(struct bch_read_bio *rbio, memcpy(bio->bi_io_vec, 
rbio->bio.bi_io_vec, sizeof(struct bio_vec) * rbio->bio.bi_vcnt); - ret = bch2_migrate_write_init(c, &op->write, c->fastest_devs, - writepoint_hashed((unsigned long) current), - rbio->opts, - DATA_PROMOTE, - (struct data_opts) { 0 }, - k); + ret = bch2_migrate_write_init(c, &op->write, + writepoint_hashed((unsigned long) current), + rbio->opts, + DATA_PROMOTE, + (struct data_opts) { + .target = rbio->opts.promote_target + }, + k); BUG_ON(ret); return op; } -/* only promote if we're not reading from the fastest tier: */ -static bool should_promote(struct bch_fs *c, - struct extent_pick_ptr *pick, unsigned flags) +static bool should_promote(struct bch_fs *c, struct bkey_s_c_extent e, + unsigned flags, u16 target) { + if (!target) + return false; + if (!(flags & BCH_READ_MAY_PROMOTE)) return false; if (percpu_ref_is_dying(&c->writes)) return false; - return c->fastest_tier && - c->fastest_tier < c->tiers + pick->ca->mi.tier; + return bch2_extent_has_target(c, e, target); } /* Read */ @@ -1323,7 +1335,7 @@ int __bch2_read_extent(struct bch_fs *c, struct bch_read_bio *orig, bounce = true; } - promote = should_promote(c, pick, flags); + promote = should_promote(c, e, flags, orig->opts.promote_target); /* could also set read_full */ if (promote) bounce = true; diff --git a/libbcachefs/io.h b/libbcachefs/io.h index 4208fd43..bf0b17e1 100644 --- a/libbcachefs/io.h +++ b/libbcachefs/io.h @@ -61,24 +61,25 @@ static inline struct workqueue_struct *index_update_wq(struct bch_write_op *op) int bch2_write_index_default(struct bch_write_op *); -static inline void bch2_write_op_init(struct bch_write_op *op, struct bch_fs *c) +static inline void bch2_write_op_init(struct bch_write_op *op, struct bch_fs *c, + struct bch_io_opts opts) { op->c = c; op->io_wq = index_update_wq(op); op->flags = 0; op->written = 0; op->error = 0; - op->csum_type = bch2_data_checksum_type(c, c->opts.data_checksum); - op->compression_type = - bch2_compression_opt_to_type[c->opts.compression]; + op->csum_type = bch2_data_checksum_type(c, opts.data_checksum); + op->compression_type = bch2_compression_opt_to_type[opts.compression]; op->nr_replicas = 0; op->nr_replicas_required = c->opts.data_replicas_required; op->alloc_reserve = RESERVE_NONE; op->open_buckets_nr = 0; op->devs_have.nr = 0; + op->target = 0; + op->opts = opts; op->pos = POS_MAX; op->version = ZERO_VERSION; - op->devs = NULL; op->write_point = (struct write_point_specifier) { 0 }; op->res = (struct disk_reservation) { 0 }; op->journal_seq = 0; diff --git a/libbcachefs/io_types.h b/libbcachefs/io_types.h index 32ecac24..a022ab33 100644 --- a/libbcachefs/io_types.h +++ b/libbcachefs/io_types.h @@ -103,13 +103,14 @@ struct bch_write_op { u16 target; u16 nonce; + struct bch_io_opts opts; + struct bpos pos; struct bversion version; /* For BCH_WRITE_DATA_ENCODED: */ struct bch_extent_crc_unpacked crc; - struct bch_devs_mask *devs; struct write_point_specifier write_point; struct disk_reservation res; diff --git a/libbcachefs/move.c b/libbcachefs/move.c index a176484a..a7c4c3ac 100644 --- a/libbcachefs/move.c +++ b/libbcachefs/move.c @@ -14,11 +14,16 @@ #include +#define SECTORS_IN_FLIGHT_PER_DEVICE 2048 + struct moving_io { struct list_head list; struct closure cl; bool read_completed; - unsigned sectors; + + unsigned read_dev; + unsigned read_sectors; + unsigned write_sectors; struct bch_read_bio rbio; @@ -34,7 +39,11 @@ struct moving_context { struct bch_move_stats *stats; struct list_head reads; - atomic_t sectors_in_flight; + + /* in flight sectors: */ + atomic_t 
read_sectors[BCH_SB_MEMBERS_MAX]; + atomic_t write_sectors; + wait_queue_head_t wait; }; @@ -116,7 +125,8 @@ static int bch2_migrate_index_update(struct bch_write_op *op) (struct bch_extent_crc_unpacked) { 0 }); bch2_extent_normalize(c, extent_i_to_s(insert).s); bch2_extent_mark_replicas_cached(c, extent_i_to_s(insert), - c->opts.data_replicas); + op->opts.background_target, + op->opts.data_replicas); /* * It's possible we race, and for whatever reason the extent now @@ -206,7 +216,6 @@ void bch2_migrate_read_done(struct migrate_write *m, struct bch_read_bio *rbio) } int bch2_migrate_write_init(struct bch_fs *c, struct migrate_write *m, - struct bch_devs_mask *devs, struct write_point_specifier wp, struct bch_io_opts io_opts, enum data_cmd data_cmd, @@ -219,11 +228,11 @@ int bch2_migrate_write_init(struct bch_fs *c, struct migrate_write *m, m->data_opts = data_opts; m->nr_ptrs_reserved = bch2_extent_nr_dirty_ptrs(k); - bch2_write_op_init(&m->op, c); - m->op.csum_type = bch2_data_checksum_type(c, io_opts.data_checksum); + bch2_write_op_init(&m->op, c, io_opts); m->op.compression_type = - bch2_compression_opt_to_type[io_opts.compression]; - m->op.devs = devs; + bch2_compression_opt_to_type[io_opts.background_compression ?: + io_opts.compression]; + m->op.target = data_opts.target, m->op.write_point = wp; if (m->data_opts.btree_insert_flags & BTREE_INSERT_USE_RESERVE) @@ -241,8 +250,8 @@ int bch2_migrate_write_init(struct bch_fs *c, struct migrate_write *m, switch (data_cmd) { case DATA_ADD_REPLICAS: - if (m->nr_ptrs_reserved < c->opts.data_replicas) { - m->op.nr_replicas = c->opts.data_replicas - m->nr_ptrs_reserved; + if (m->nr_ptrs_reserved < io_opts.data_replicas) { + m->op.nr_replicas = io_opts.data_replicas - m->nr_ptrs_reserved; ret = bch2_disk_reservation_get(c, &m->op.res, k.k->size, @@ -250,7 +259,7 @@ int bch2_migrate_write_init(struct bch_fs *c, struct migrate_write *m, if (ret) return ret; - m->nr_ptrs_reserved = c->opts.data_replicas; + m->nr_ptrs_reserved = io_opts.data_replicas; } break; case DATA_REWRITE: @@ -279,19 +288,29 @@ static void move_free(struct closure *cl) if (bv->bv_page) __free_page(bv->bv_page); - atomic_sub(io->sectors, &ctxt->sectors_in_flight); wake_up(&ctxt->wait); kfree(io); } +static void move_write_done(struct closure *cl) +{ + struct moving_io *io = container_of(cl, struct moving_io, cl); + + atomic_sub(io->write_sectors, &io->write.ctxt->write_sectors); + closure_return_with_destructor(cl, move_free); +} + static void move_write(struct closure *cl) { struct moving_io *io = container_of(cl, struct moving_io, cl); if (likely(!io->rbio.bio.bi_status)) { bch2_migrate_read_done(&io->write, &io->rbio); + + atomic_add(io->write_sectors, &io->write.ctxt->write_sectors); closure_call(&io->write.op.cl, bch2_write, NULL, cl); + continue_at(cl, move_write_done, NULL); } closure_return_with_destructor(cl, move_free); @@ -310,16 +329,46 @@ static void move_read_endio(struct bio *bio) struct moving_io *io = container_of(bio, struct moving_io, rbio.bio); struct moving_context *ctxt = io->write.ctxt; + atomic_sub(io->read_sectors, &ctxt->read_sectors[io->read_dev]); io->read_completed = true; + if (next_pending_write(ctxt)) wake_up(&ctxt->wait); closure_put(&ctxt->cl); } +static void do_pending_writes(struct moving_context *ctxt) +{ + struct moving_io *io; + + while ((io = next_pending_write(ctxt))) { + list_del(&io->list); + closure_call(&io->cl, move_write, NULL, &ctxt->cl); + } +} + +#define move_ctxt_wait_event(_ctxt, _cond) \ +do { \ + do_pending_writes(_ctxt); \ 
+ \ + if (_cond) \ + break; \ + __wait_event((_ctxt)->wait, \ + next_pending_write(_ctxt) || (_cond)); \ +} while (1) + +static void bch2_move_ctxt_wait_for_io(struct moving_context *ctxt) +{ + unsigned sectors_pending = atomic_read(&ctxt->write_sectors); + + move_ctxt_wait_event(ctxt, + !atomic_read(&ctxt->write_sectors) || + atomic_read(&ctxt->write_sectors) != sectors_pending); +} + static int bch2_move_extent(struct bch_fs *c, struct moving_context *ctxt, - struct bch_devs_mask *devs, struct write_point_specifier wp, struct bch_io_opts io_opts, struct bkey_s_c_extent e, @@ -333,10 +382,18 @@ static int bch2_move_extent(struct bch_fs *c, unsigned sectors = e.k->size, pages; int ret = -ENOMEM; + move_ctxt_wait_event(ctxt, + atomic_read(&ctxt->write_sectors) < + SECTORS_IN_FLIGHT_PER_DEVICE); + bch2_extent_pick_ptr(c, e.s_c, NULL, &pick); if (IS_ERR_OR_NULL(pick.ca)) return pick.ca ? PTR_ERR(pick.ca) : 0; + move_ctxt_wait_event(ctxt, + atomic_read(&ctxt->read_sectors[pick.ca->dev_idx]) < + SECTORS_IN_FLIGHT_PER_DEVICE); + /* write path might have to decompress data: */ extent_for_each_ptr_crc(e, ptr, crc) sectors = max_t(unsigned, sectors, crc.uncompressed_size); @@ -347,8 +404,10 @@ static int bch2_move_extent(struct bch_fs *c, if (!io) goto err; - io->write.ctxt = ctxt; - io->sectors = e.k->size; + io->write.ctxt = ctxt; + io->read_dev = pick.ca->dev_idx; + io->read_sectors = pick.crc.uncompressed_size; + io->write_sectors = e.k->size; bio_init(&io->write.op.wbio.bio, io->bi_inline_vecs, pages); bio_set_prio(&io->write.op.wbio.bio, @@ -368,8 +427,8 @@ static int bch2_move_extent(struct bch_fs *c, io->rbio.bio.bi_iter.bi_sector = bkey_start_offset(e.k); io->rbio.bio.bi_end_io = move_read_endio; - ret = bch2_migrate_write_init(c, &io->write, devs, wp, - io_opts, data_cmd, data_opts, e.s_c); + ret = bch2_migrate_write_init(c, &io->write, wp, io_opts, + data_cmd, data_opts, e.s_c); if (ret) goto err_free_pages; @@ -378,7 +437,7 @@ static int bch2_move_extent(struct bch_fs *c, trace_move_extent(e.k); - atomic_add(io->sectors, &ctxt->sectors_in_flight); + atomic_add(io->read_sectors, &ctxt->read_sectors[io->read_dev]); list_add_tail(&io->list, &ctxt->reads); /* @@ -398,39 +457,8 @@ err: return ret; } -static void do_pending_writes(struct moving_context *ctxt) -{ - struct moving_io *io; - - while ((io = next_pending_write(ctxt))) { - list_del(&io->list); - closure_call(&io->cl, move_write, NULL, &ctxt->cl); - } -} - -#define move_ctxt_wait_event(_ctxt, _cond) \ -do { \ - do_pending_writes(_ctxt); \ - \ - if (_cond) \ - break; \ - __wait_event((_ctxt)->wait, \ - next_pending_write(_ctxt) || (_cond)); \ -} while (1) - -static void bch2_move_ctxt_wait_for_io(struct moving_context *ctxt) -{ - unsigned sectors_pending = atomic_read(&ctxt->sectors_in_flight); - - move_ctxt_wait_event(ctxt, - !atomic_read(&ctxt->sectors_in_flight) || - atomic_read(&ctxt->sectors_in_flight) != sectors_pending); -} - int bch2_move_data(struct bch_fs *c, struct bch_ratelimit *rate, - unsigned sectors_in_flight, - struct bch_devs_mask *devs, struct write_point_specifier wp, struct bpos start, struct bpos end, @@ -460,13 +488,6 @@ int bch2_move_data(struct bch_fs *c, bch2_ratelimit_reset(rate); while (!kthread || !(ret = kthread_should_stop())) { - if (atomic_read(&ctxt.sectors_in_flight) >= sectors_in_flight) { - bch2_btree_iter_unlock(&stats->iter); - move_ctxt_wait_event(&ctxt, - atomic_read(&ctxt.sectors_in_flight) < - sectors_in_flight); - } - if (rate && bch2_ratelimit_delay(rate) && 
(bch2_btree_iter_unlock(&stats->iter), @@ -519,7 +540,7 @@ peek: k = bkey_i_to_s_c(&tmp.k); bch2_btree_iter_unlock(&stats->iter); - ret2 = bch2_move_extent(c, &ctxt, devs, wp, io_opts, + ret2 = bch2_move_extent(c, &ctxt, wp, io_opts, bkey_s_c_to_extent(k), data_cmd, data_opts); if (ret2) { @@ -545,11 +566,10 @@ next_nondata: bch2_btree_iter_unlock(&stats->iter); - move_ctxt_wait_event(&ctxt, !atomic_read(&ctxt.sectors_in_flight)); + move_ctxt_wait_event(&ctxt, list_empty(&ctxt.reads)); closure_sync(&ctxt.cl); - EBUG_ON(!list_empty(&ctxt.reads)); - EBUG_ON(atomic_read(&ctxt.sectors_in_flight)); + EBUG_ON(atomic_read(&ctxt.write_sectors)); trace_move_data(c, atomic64_read(&stats->sectors_moved), @@ -671,11 +691,12 @@ static enum data_cmd rereplicate_pred(struct bch_fs *c, void *arg, unsigned nr_good = bch2_extent_nr_good_ptrs(c, e); unsigned replicas = type == BKEY_TYPE_BTREE ? c->opts.metadata_replicas - : c->opts.data_replicas; + : io_opts->data_replicas; if (!nr_good || nr_good >= replicas) return DATA_SKIP; + data_opts->target = 0; data_opts->btree_insert_flags = 0; return DATA_ADD_REPLICAS; } @@ -691,6 +712,7 @@ static enum data_cmd migrate_pred(struct bch_fs *c, void *arg, if (!bch2_extent_has_device(e, op->migrate.dev)) return DATA_SKIP; + data_opts->target = 0; data_opts->btree_insert_flags = 0; data_opts->rewrite_dev = op->migrate.dev; return DATA_REWRITE; @@ -710,8 +732,7 @@ int bch2_data_job(struct bch_fs *c, ret = bch2_move_btree(c, rereplicate_pred, c, stats) ?: ret; ret = bch2_gc_btree_replicas(c) ?: ret; - ret = bch2_move_data(c, NULL, SECTORS_IN_FLIGHT_PER_DEVICE, - NULL, + ret = bch2_move_data(c, NULL, writepoint_hashed((unsigned long) current), op.start, op.end, @@ -728,8 +749,7 @@ int bch2_data_job(struct bch_fs *c, ret = bch2_move_btree(c, migrate_pred, &op, stats) ?: ret; ret = bch2_gc_btree_replicas(c) ?: ret; - ret = bch2_move_data(c, NULL, SECTORS_IN_FLIGHT_PER_DEVICE, - NULL, + ret = bch2_move_data(c, NULL, writepoint_hashed((unsigned long) current), op.start, op.end, diff --git a/libbcachefs/move.h b/libbcachefs/move.h index 819e5d9f..bc98f94b 100644 --- a/libbcachefs/move.h +++ b/libbcachefs/move.h @@ -17,6 +17,7 @@ enum data_cmd { }; struct data_opts { + u16 target; unsigned rewrite_dev; int btree_insert_flags; }; @@ -38,14 +39,11 @@ struct migrate_write { void bch2_migrate_read_done(struct migrate_write *, struct bch_read_bio *); int bch2_migrate_write_init(struct bch_fs *, struct migrate_write *, - struct bch_devs_mask *, struct write_point_specifier, struct bch_io_opts, enum data_cmd, struct data_opts, struct bkey_s_c); -#define SECTORS_IN_FLIGHT_PER_DEVICE 2048 - typedef enum data_cmd (*move_pred_fn)(struct bch_fs *, void *, enum bkey_type, struct bkey_s_c_extent, struct bch_io_opts *, struct data_opts *); @@ -61,7 +59,6 @@ struct bch_move_stats { }; int bch2_move_data(struct bch_fs *, struct bch_ratelimit *, - unsigned, struct bch_devs_mask *, struct write_point_specifier, struct bpos, struct bpos, move_pred_fn, void *, diff --git a/libbcachefs/movinggc.c b/libbcachefs/movinggc.c index c306a89f..ad56e039 100644 --- a/libbcachefs/movinggc.c +++ b/libbcachefs/movinggc.c @@ -21,6 +21,7 @@ #include #include #include +#include #include #include @@ -94,7 +95,8 @@ static enum data_cmd copygc_pred(struct bch_fs *c, void *arg, if (!__copygc_pred(ca, e)) return DATA_SKIP; - data_opts->btree_insert_flags = BTREE_INSERT_USE_RESERVE, + data_opts->target = dev_to_target(ca->dev_idx); + data_opts->btree_insert_flags = BTREE_INSERT_USE_RESERVE; data_opts->rewrite_dev = 
ca->dev_idx; return DATA_REWRITE; } @@ -178,8 +180,6 @@ static void bch2_copygc(struct bch_fs *c, struct bch_dev *ca) bucket_offset_cmp, NULL); ret = bch2_move_data(c, &ca->copygc_pd.rate, - SECTORS_IN_FLIGHT_PER_DEVICE, - &ca->self, writepoint_ptr(&ca->copygc_write_point), POS_MIN, POS_MAX, copygc_pred, ca, @@ -248,8 +248,10 @@ void bch2_copygc_stop(struct bch_dev *ca) ca->copygc_pd.rate.rate = UINT_MAX; bch2_ratelimit_reset(&ca->copygc_pd.rate); - if (ca->copygc_thread) + if (ca->copygc_thread) { kthread_stop(ca->copygc_thread); + put_task_struct(ca->copygc_thread); + } ca->copygc_thread = NULL; } @@ -269,6 +271,8 @@ int bch2_copygc_start(struct bch_fs *c, struct bch_dev *ca) if (IS_ERR(t)) return PTR_ERR(t); + get_task_struct(t); + ca->copygc_thread = t; wake_up_process(ca->copygc_thread); diff --git a/libbcachefs/opts.c b/libbcachefs/opts.c index ec50345f..326b8ad9 100644 --- a/libbcachefs/opts.c +++ b/libbcachefs/opts.c @@ -1,7 +1,9 @@ #include +#include "bcachefs.h" #include "opts.h" +#include "super-io.h" #include "util.h" const char * const bch2_error_actions[] = { @@ -139,6 +141,9 @@ const struct bch_option bch2_opt_table[] = { #define OPT_BOOL() .type = BCH_OPT_BOOL #define OPT_UINT(_min, _max) .type = BCH_OPT_UINT, .min = _min, .max = _max #define OPT_STR(_choices) .type = BCH_OPT_STR, .choices = _choices +#define OPT_FN(_fn) .type = BCH_OPT_FN, \ + .parse = _fn##_parse, \ + .print = _fn##_print #define BCH_OPT(_name, _bits, _mode, _type, _sb_opt, _default) \ [Opt_##_name] = { \ @@ -189,7 +194,8 @@ static int bch2_mount_opt_lookup(const char *name) return bch2_opt_lookup(name); } -int bch2_opt_parse(const struct bch_option *opt, const char *val, u64 *res) +int bch2_opt_parse(struct bch_fs *c, const struct bch_option *opt, + const char *val, u64 *res) { ssize_t ret; @@ -217,11 +223,50 @@ int bch2_opt_parse(const struct bch_option *opt, const char *val, u64 *res) *res = ret; break; + case BCH_OPT_FN: + if (!c) + return -EINVAL; + + return opt->parse(c, val, res); } return 0; } +int bch2_opt_to_text(struct bch_fs *c, char *buf, size_t len, + const struct bch_option *opt, u64 v, + unsigned flags) +{ + char *out = buf, *end = buf + len; + + if (flags & OPT_SHOW_MOUNT_STYLE) { + if (opt->type == BCH_OPT_BOOL) + return scnprintf(out, end - out, "%s%s", + v ? "" : "no", + opt->attr.name); + + out += scnprintf(out, end - out, "%s=", opt->attr.name); + } + + switch (opt->type) { + case BCH_OPT_BOOL: + case BCH_OPT_UINT: + out += scnprintf(out, end - out, "%lli", v); + break; + case BCH_OPT_STR: + out += (flags & OPT_SHOW_FULL_LIST) + ? 
bch2_scnprint_string_list(out, end - out, opt->choices, v) + : scnprintf(out, end - out, opt->choices[v]); + break; + case BCH_OPT_FN: + return opt->print(c, out, end - out, v); + default: + BUG(); + } + + return out - buf; +} + int bch2_parse_mount_opts(struct bch_opts *opts, char *options) { char *opt, *name, *val; @@ -237,7 +282,7 @@ int bch2_parse_mount_opts(struct bch_opts *opts, char *options) if (id < 0) goto bad_opt; - ret = bch2_opt_parse(&bch2_opt_table[id], val, &v); + ret = bch2_opt_parse(NULL, &bch2_opt_table[id], val, &v); if (ret < 0) goto bad_val; } else { diff --git a/libbcachefs/opts.h b/libbcachefs/opts.h index 8a3ac66b..e7ab8870 100644 --- a/libbcachefs/opts.h +++ b/libbcachefs/opts.h @@ -42,6 +42,7 @@ enum opt_type { BCH_OPT_BOOL, BCH_OPT_UINT, BCH_OPT_STR, + BCH_OPT_FN, }; /** @@ -94,9 +95,21 @@ enum opt_type { BCH_OPT(compression, u8, OPT_RUNTIME, \ OPT_STR(bch2_compression_types), \ BCH_SB_COMPRESSION_TYPE, BCH_COMPRESSION_OPT_NONE)\ + BCH_OPT(background_compression, u8, OPT_RUNTIME, \ + OPT_STR(bch2_compression_types), \ + BCH_SB_BACKGROUND_COMPRESSION_TYPE,BCH_COMPRESSION_OPT_NONE)\ BCH_OPT(str_hash, u8, OPT_RUNTIME, \ OPT_STR(bch2_str_hash_types), \ BCH_SB_STR_HASH_TYPE, BCH_STR_HASH_SIPHASH) \ + BCH_OPT(foreground_target, u16, OPT_RUNTIME, \ + OPT_FN(bch2_opt_target), \ + BCH_SB_FOREGROUND_TARGET, 0) \ + BCH_OPT(background_target, u16, OPT_RUNTIME, \ + OPT_FN(bch2_opt_target), \ + BCH_SB_BACKGROUND_TARGET, 0) \ + BCH_OPT(promote_target, u16, OPT_RUNTIME, \ + OPT_FN(bch2_opt_target), \ + BCH_SB_PROMOTE_TARGET, 0) \ BCH_OPT(inodes_32bit, u8, OPT_RUNTIME, \ OPT_BOOL(), \ BCH_SB_INODE_32BIT, false) \ @@ -205,6 +218,8 @@ enum bch_opt_id { bch2_opts_nr }; +struct bch_fs; + struct bch_option { struct attribute attr; void (*set_sb)(struct bch_sb *, u64); @@ -218,6 +233,10 @@ struct bch_option { struct { const char * const *choices; }; + struct { + int (*parse)(struct bch_fs *, const char *, u64 *); + int (*print)(struct bch_fs *, char *, size_t, u64); + }; }; }; @@ -231,14 +250,26 @@ void bch2_opt_set_by_id(struct bch_opts *, enum bch_opt_id, u64); struct bch_opts bch2_opts_from_sb(struct bch_sb *); int bch2_opt_lookup(const char *); -int bch2_opt_parse(const struct bch_option *, const char *, u64 *); +int bch2_opt_parse(struct bch_fs *, const struct bch_option *, const char *, u64 *); + +#define OPT_SHOW_FULL_LIST (1 << 0) +#define OPT_SHOW_MOUNT_STYLE (1 << 1) + +int bch2_opt_to_text(struct bch_fs *, char *, size_t, + const struct bch_option *, u64, unsigned); + int bch2_parse_mount_opts(struct bch_opts *, char *); /* inode opts: */ #define BCH_INODE_OPTS() \ BCH_INODE_OPT(data_checksum, 8) \ - BCH_INODE_OPT(compression, 8) + BCH_INODE_OPT(compression, 8) \ + BCH_INODE_OPT(background_compression, 8) \ + BCH_INODE_OPT(data_replicas, 8) \ + BCH_INODE_OPT(promote_target, 16) \ + BCH_INODE_OPT(foreground_target, 16) \ + BCH_INODE_OPT(background_target, 16) struct bch_io_opts { #define BCH_INODE_OPT(_name, _bits) unsigned _name##_defined:1; diff --git a/libbcachefs/super-io.c b/libbcachefs/super-io.c index c7473917..69101f3a 100644 --- a/libbcachefs/super-io.c +++ b/libbcachefs/super-io.c @@ -400,6 +400,7 @@ static void bch2_sb_update(struct bch_fs *c) c->sb.time_base_lo = le64_to_cpu(src->time_base_lo); c->sb.time_base_hi = le32_to_cpu(src->time_base_hi); c->sb.time_precision = le32_to_cpu(src->time_precision); + c->sb.features = le64_to_cpu(src->features[0]); for_each_member_device(ca, c, i) ca->mi = bch2_mi_to_cpu(mi->members + i); @@ -1600,24 +1601,22 @@ static const 
char *bch2_sb_validate_quota(struct bch_sb *sb, /* Disk groups: */ -#if 0 -static size_t trim_nulls(const char *str, size_t len) +static int strcmp_void(const void *l, const void *r) { - while (len && !str[len - 1]) - --len; - return len; + return strcmp(l, r); } -#endif static const char *bch2_sb_validate_disk_groups(struct bch_sb *sb, struct bch_sb_field *f) { struct bch_sb_field_disk_groups *groups = field_to_type(f, disk_groups); + struct bch_disk_group *g; struct bch_sb_field_members *mi; struct bch_member *m; - struct bch_disk_group *g; - unsigned nr_groups; + unsigned i, nr_groups, nr_live = 0, len; + char **labels, *l; + const char *err = NULL; mi = bch2_sb_get_members(sb); groups = bch2_sb_get_disk_groups(sb); @@ -1626,32 +1625,57 @@ static const char *bch2_sb_validate_disk_groups(struct bch_sb *sb, for (m = mi->members; m < mi->members + sb->nr_devices; m++) { + unsigned g; + if (!BCH_MEMBER_GROUP(m)) continue; - if (BCH_MEMBER_GROUP(m) >= nr_groups) - return "disk has invalid group"; + g = BCH_MEMBER_GROUP(m) - 1; - g = &groups->entries[BCH_MEMBER_GROUP(m)]; - if (BCH_GROUP_DELETED(g)) + if (g >= nr_groups || + BCH_GROUP_DELETED(&groups->entries[g])) return "disk has invalid group"; } -#if 0 - if (!groups) + + if (!nr_groups) return NULL; - char **labels; labels = kcalloc(nr_groups, sizeof(char *), GFP_KERNEL); if (!labels) return "cannot allocate memory"; - for (g = groups->groups; - g < groups->groups + nr_groups; + for (g = groups->entries; + g < groups->entries + nr_groups; g++) { + if (BCH_GROUP_DELETED(g)) + continue; + + len = strnlen(g->label, sizeof(g->label)); + labels[nr_live++] = l = kmalloc(len + 1, GFP_KERNEL); + if (!l) { + err = "cannot allocate memory"; + goto err; + } + + memcpy(l, g->label, len); + l[len] = '\0'; } -#endif - return NULL; + + sort(labels, nr_live, sizeof(labels[0]), strcmp_void, NULL); + + for (i = 0; i + 1 < nr_live; i++) + if (!strcmp(labels[i], labels[i + 1])) { + err = "duplicate group labels"; + goto err; + } + + err = NULL; +err: + for (i = 0; i < nr_live; i++) + kfree(labels[i]); + kfree(labels); + return err; } static int bch2_sb_disk_groups_to_cpu(struct bch_fs *c) @@ -1692,7 +1716,11 @@ static int bch2_sb_disk_groups_to_cpu(struct bch_fs *c) if (!bch2_member_exists(m)) continue; - __set_bit(i, dst->devs.d); + dst = BCH_MEMBER_GROUP(m) + ? &cpu_g->entries[BCH_MEMBER_GROUP(m) - 1] + : NULL; + if (dst) + __set_bit(i, dst->devs.d); } old_g = c->disk_groups; @@ -1708,18 +1736,140 @@ const struct bch_devs_mask *bch2_target_to_mask(struct bch_fs *c, unsigned targe struct target t = target_decode(target); switch (t.type) { - case TARGET_DEV: - BUG_ON(t.dev >= c->sb.nr_devices && !c->devs[t.dev]); - return &c->devs[t.dev]->self; + case TARGET_DEV: { + struct bch_dev *ca = t.dev < c->sb.nr_devices + ? rcu_dereference(c->devs[t.dev]) + : NULL; + return ca ? &ca->self : NULL; + } + case TARGET_GROUP: { + struct bch_disk_groups_cpu *g = rcu_dereference(c->disk_groups); + + return t.group < g->nr && !g->entries[t.group].deleted + ? 
&g->entries[t.group].devs + : NULL; + } + default: + BUG(); + } +} + +int __bch2_disk_group_find(struct bch_sb_field_disk_groups *groups, + const char *name) +{ + unsigned i, nr_groups = disk_groups_nr(groups); + unsigned len = strlen(name); + + for (i = 0; i < nr_groups; i++) { + struct bch_disk_group *g = groups->entries + i; + + if (BCH_GROUP_DELETED(g)) + continue; + + if (strnlen(g->label, sizeof(g->label)) == len && + !memcmp(name, g->label, len)) + return i; + } + + return -1; +} + +static int bch2_disk_group_find(struct bch_fs *c, const char *name) +{ + int ret; + + mutex_lock(&c->sb_lock); + ret = __bch2_disk_group_find(bch2_sb_get_disk_groups(c->disk_sb), name); + mutex_unlock(&c->sb_lock); + + return ret; +} + +int bch2_opt_target_parse(struct bch_fs *c, const char *buf, u64 *v) +{ + struct bch_dev *ca; + int g; + + if (!strlen(buf) || !strcmp(buf, "none")) { + *v = 0; + return 0; + } + + /* Is it a device? */ + ca = bch2_dev_lookup(c, buf); + if (!IS_ERR(ca)) { + *v = dev_to_target(ca->dev_idx); + percpu_ref_put(&ca->ref); + return 0; + } + + g = bch2_disk_group_find(c, buf); + if (g >= 0) { + *v = group_to_target(g); + return 0; + } + + return -EINVAL; +} + +int bch2_opt_target_print(struct bch_fs *c, char *buf, size_t len, u64 v) +{ + struct target t = target_decode(v); + int ret; + + switch (t.type) { + case TARGET_NULL: + return scnprintf(buf, len, "none"); + case TARGET_DEV: { + struct bch_dev *ca; + + rcu_read_lock(); + ca = t.dev < c->sb.nr_devices + ? rcu_dereference(c->devs[t.dev]) + : NULL; + + if (ca && percpu_ref_tryget(&ca->io_ref)) { + char b[BDEVNAME_SIZE]; + + ret = scnprintf(buf, len, "/dev/%s", + bdevname(ca->disk_sb.bdev, b)); + percpu_ref_put(&ca->io_ref); + } else if (ca) { + ret = scnprintf(buf, len, "offline device %u", t.dev); + } else { + ret = scnprintf(buf, len, "invalid device %u", t.dev); + } + + rcu_read_unlock(); + break; + } case TARGET_GROUP: { - struct bch_disk_groups_cpu *g = - rcu_dereference(c->disk_groups); + struct bch_sb_field_disk_groups *groups; + struct bch_disk_group *g; + + mutex_lock(&c->sb_lock); + groups = bch2_sb_get_disk_groups(c->disk_sb); + + g = t.group < disk_groups_nr(groups) + ? groups->entries + t.group + : NULL; + + if (g && !BCH_GROUP_DELETED(g)) { + ret = len ? min(len - 1, strnlen(g->label, sizeof(g->label))) : 0; - /* XXX: what to do here? 
*/ - BUG_ON(t.group >= g->nr || g->entries[t.group].deleted); - return &g->entries[t.group].devs; + memcpy(buf, g->label, ret); + if (len) + buf[ret] = '\0'; + } else { + ret = scnprintf(buf, len, "invalid group %u", t.group); + } + + mutex_unlock(&c->sb_lock); + break; } default: BUG(); } + + return ret; } diff --git a/libbcachefs/super-io.h b/libbcachefs/super-io.h index d7fecf02..3811de72 100644 --- a/libbcachefs/super-io.h +++ b/libbcachefs/super-io.h @@ -129,7 +129,6 @@ static inline struct bch_member_cpu bch2_mi_to_cpu(struct bch_member *mi) .bucket_size = le16_to_cpu(mi->bucket_size), .group = BCH_MEMBER_GROUP(mi), .state = BCH_MEMBER_STATE(mi), - .tier = BCH_MEMBER_TIER(mi), .replacement = BCH_MEMBER_REPLACEMENT(mi), .discard = BCH_MEMBER_DISCARD(mi), .data_allowed = BCH_MEMBER_DATA_ALLOWED(mi), @@ -204,27 +203,34 @@ struct target { }; }; +#define TARGET_DEV_START 1 +#define TARGET_GROUP_START (256 + TARGET_DEV_START) + static inline u16 dev_to_target(unsigned dev) { - return 1 + dev; + return TARGET_DEV_START + dev; } static inline u16 group_to_target(unsigned group) { - return 1 + U8_MAX + group; + return TARGET_GROUP_START + group; } static inline struct target target_decode(unsigned target) { - if (!target) - return (struct target) { .type = TARGET_NULL }; - - --target; - if (target <= U8_MAX) - return (struct target) { .type = TARGET_DEV, .dev = target }; - - target -= U8_MAX; - return (struct target) { .type = TARGET_GROUP, .group = target }; + if (target >= TARGET_GROUP_START) + return (struct target) { + .type = TARGET_GROUP, + .group = target - TARGET_GROUP_START + }; + + if (target >= TARGET_DEV_START) + return (struct target) { + .type = TARGET_DEV, + .group = target - TARGET_DEV_START + }; + + return (struct target) { .type = TARGET_NULL }; } static inline bool dev_in_target(struct bch_dev *ca, unsigned target) @@ -232,6 +238,8 @@ static inline bool dev_in_target(struct bch_dev *ca, unsigned target) struct target t = target_decode(target); switch (t.type) { + case TARGET_NULL: + return false; case TARGET_DEV: return ca->dev_idx == t.dev; case TARGET_GROUP: @@ -243,4 +251,9 @@ static inline bool dev_in_target(struct bch_dev *ca, unsigned target) const struct bch_devs_mask *bch2_target_to_mask(struct bch_fs *, unsigned); +int __bch2_disk_group_find(struct bch_sb_field_disk_groups *, const char *); + +int bch2_opt_target_parse(struct bch_fs *, const char *, u64 *); +int bch2_opt_target_print(struct bch_fs *, char *, size_t, u64); + #endif /* _BCACHEFS_SUPER_IO_H */ diff --git a/libbcachefs/super.c b/libbcachefs/super.c index 58bcd7d1..abb97128 100644 --- a/libbcachefs/super.c +++ b/libbcachefs/super.c @@ -149,6 +149,7 @@ int bch2_congested(void *data, int bdi_bits) unsigned i; int ret = 0; + rcu_read_lock(); if (bdi_bits & (1 << WB_sync_congested)) { /* Reads - check all devices: */ for_each_readable_member(ca, c, i) { @@ -160,12 +161,11 @@ int bch2_congested(void *data, int bdi_bits) } } } else { - /* Writes prefer fastest tier: */ - struct bch_tier *tier = READ_ONCE(c->fastest_tier); - struct bch_devs_mask *devs = - tier ? &tier->devs : &c->rw_devs[BCH_DATA_USER]; + unsigned target = READ_ONCE(c->opts.foreground_target); + const struct bch_devs_mask *devs = target + ? 
bch2_target_to_mask(c, target) + : &c->rw_devs[BCH_DATA_USER]; - rcu_read_lock(); for_each_member_device_rcu(ca, c, i, devs) { bdi = ca->disk_sb.bdev->bd_bdi; @@ -174,8 +174,8 @@ int bch2_congested(void *data, int bdi_bits) break; } } - rcu_read_unlock(); } + rcu_read_unlock(); return ret; } @@ -185,9 +185,9 @@ int bch2_congested(void *data, int bdi_bits) /* * For startup/shutdown of RW stuff, the dependencies are: * - * - foreground writes depend on copygc and tiering (to free up space) + * - foreground writes depend on copygc and rebalance (to free up space) * - * - copygc and tiering depend on mark and sweep gc (they actually probably + * - copygc and rebalance depend on mark and sweep gc (they actually probably * don't because they either reserve ahead of time or don't block if * allocations fail, but allocations can require mark and sweep gc to run * because of generation number wraparound) @@ -225,7 +225,7 @@ static void __bch2_fs_read_only(struct bch_fs *c) struct bch_dev *ca; unsigned i; - bch2_tiering_stop(c); + bch2_rebalance_stop(c); for_each_member_device(ca, c, i) bch2_copygc_stop(ca); @@ -385,8 +385,8 @@ const char *bch2_fs_read_write(struct bch_fs *c) goto err; } - err = "error starting tiering thread"; - if (bch2_tiering_start(c)) + err = "error starting rebalance thread"; + if (bch2_rebalance_start(c)) goto err; schedule_delayed_work(&c->pd_controllers_update, 5 * HZ); @@ -531,7 +531,7 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) #undef BCH_TIME_STAT bch2_fs_allocator_init(c); - bch2_fs_tiering_init(c); + bch2_fs_rebalance_init(c); bch2_fs_quota_init(c); INIT_LIST_HEAD(&c->list); @@ -555,8 +555,8 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) c->writeback_pages_max = (256 << 10) / PAGE_SIZE; c->copy_gc_enabled = 1; - c->tiering_enabled = 1; - c->tiering_percent = 10; + c->rebalance_enabled = 1; + c->rebalance_percent = 10; c->journal.write_time = &c->journal_write_time; c->journal.delay_time = &c->journal_delay_time; @@ -626,7 +626,6 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) bch2_fs_btree_cache_init(c) || bch2_fs_encryption_init(c) || bch2_fs_compress_init(c) || - bch2_check_set_has_compressed_data(c, c->opts.compression) || bch2_fs_fsio_init(c)) goto err; @@ -1216,6 +1215,8 @@ static int __bch2_dev_online(struct bch_fs *c, struct bch_sb_handle *sb) if (ca->mi.state == BCH_MEMBER_STATE_RW) bch2_dev_allocator_add(c, ca); + rebalance_wakeup(c); + percpu_ref_reinit(&ca->io_ref); return 0; } @@ -1340,9 +1341,6 @@ static const char *__bch2_dev_read_write(struct bch_fs *c, struct bch_dev *ca) if (bch2_copygc_start(c, ca)) return "error starting copygc thread"; - if (bch2_tiering_start(c)) - return "error starting tiering thread"; - return NULL; } @@ -1350,6 +1348,7 @@ int __bch2_dev_set_state(struct bch_fs *c, struct bch_dev *ca, enum bch_member_state new_state, int flags) { struct bch_sb_field_members *mi; + int ret = 0; if (ca->mi.state == new_state) return 0; @@ -1368,10 +1367,13 @@ int __bch2_dev_set_state(struct bch_fs *c, struct bch_dev *ca, bch2_write_super(c); mutex_unlock(&c->sb_lock); - if (new_state == BCH_MEMBER_STATE_RW) - return __bch2_dev_read_write(c, ca) ? 
-ENOMEM : 0; + if (new_state == BCH_MEMBER_STATE_RW && + __bch2_dev_read_write(c, ca)) + ret = -ENOMEM; - return 0; + rebalance_wakeup(c); + + return ret; } int bch2_dev_set_state(struct bch_fs *c, struct bch_dev *ca, @@ -1700,6 +1702,95 @@ err: return ret; } +/* return with ref on ca->ref: */ +struct bch_dev *bch2_dev_lookup(struct bch_fs *c, const char *path) +{ + + struct block_device *bdev = lookup_bdev(path); + struct bch_dev *ca; + unsigned i; + + if (IS_ERR(bdev)) + return ERR_CAST(bdev); + + for_each_member_device(ca, c, i) + if (ca->disk_sb.bdev == bdev) + goto found; + + ca = ERR_PTR(-ENOENT); +found: + bdput(bdev); + return ca; +} + +int bch2_dev_group_set(struct bch_fs *c, struct bch_dev *ca, const char *label) +{ + struct bch_sb_field_disk_groups *groups; + struct bch_disk_group *g; + struct bch_member *mi; + unsigned i, v, nr_groups; + int ret; + + if (strlen(label) > BCH_SB_LABEL_SIZE) + return -EINVAL; + + mutex_lock(&c->sb_lock); + groups = bch2_sb_get_disk_groups(c->disk_sb); + nr_groups = disk_groups_nr(groups); + + if (!strcmp(label, "none")) { + v = 0; + goto write_sb; + } + + ret = __bch2_disk_group_find(groups, label); + if (ret >= 0) { + v = ret + 1; + goto write_sb; + } + + /* not found - create a new disk group: */ + + for (i = 0; + i < nr_groups && !BCH_GROUP_DELETED(&groups->entries[i]); + i++) + ; + + if (i == nr_groups) { + unsigned u64s = + (sizeof(struct bch_sb_field_disk_groups) + + sizeof(struct bch_disk_group) * (nr_groups + 1)) / + sizeof(u64); + + groups = bch2_fs_sb_resize_disk_groups(c, u64s); + if (!groups) { + mutex_unlock(&c->sb_lock); + return -ENOSPC; + } + + nr_groups = disk_groups_nr(groups); + } + + BUG_ON(i >= nr_groups); + + g = &groups->entries[i]; + v = i + 1; + + memcpy(g->label, label, strlen(label)); + if (strlen(label) < sizeof(g->label)) + g->label[strlen(label)] = '\0'; + SET_BCH_GROUP_DELETED(g, 0); + SET_BCH_GROUP_DATA_ALLOWED(g, ~0); +write_sb: + mi = &bch2_sb_get_members(c->disk_sb)->members[ca->dev_idx]; + SET_BCH_MEMBER_GROUP(mi, v); + + bch2_write_super(c); + mutex_unlock(&c->sb_lock); + + return 0; +} + /* Filesystem open: */ struct bch_fs *bch2_fs_open(char * const *devices, unsigned nr_devices, diff --git a/libbcachefs/super.h b/libbcachefs/super.h index 1718f5c1..652a572f 100644 --- a/libbcachefs/super.h +++ b/libbcachefs/super.h @@ -194,6 +194,8 @@ int bch2_dev_add(struct bch_fs *, const char *); int bch2_dev_online(struct bch_fs *, const char *); int bch2_dev_offline(struct bch_fs *, struct bch_dev *, int); int bch2_dev_resize(struct bch_fs *, struct bch_dev *, u64); +struct bch_dev *bch2_dev_lookup(struct bch_fs *, const char *); +int bch2_dev_group_set(struct bch_fs *, struct bch_dev *, const char *); bool bch2_fs_emergency_read_only(struct bch_fs *); void bch2_fs_read_only(struct bch_fs *); diff --git a/libbcachefs/super_types.h b/libbcachefs/super_types.h index d76d917c..3be05e9b 100644 --- a/libbcachefs/super_types.h +++ b/libbcachefs/super_types.h @@ -24,7 +24,6 @@ struct bch_member_cpu { u16 bucket_size; /* sectors */ u16 group; u8 state; - u8 tier; u8 replacement; u8 discard; u8 data_allowed; diff --git a/libbcachefs/sysfs.c b/libbcachefs/sysfs.c index 2e958a8e..e42bc1da 100644 --- a/libbcachefs/sysfs.c +++ b/libbcachefs/sysfs.c @@ -168,15 +168,14 @@ rw_attribute(writeback_pages_max); rw_attribute(discard); rw_attribute(cache_replacement_policy); +rw_attribute(group); rw_attribute(copy_gc_enabled); sysfs_pd_controller_attribute(copy_gc); -rw_attribute(tier); -rw_attribute(tiering_enabled); 
-rw_attribute(tiering_percent); -sysfs_pd_controller_attribute(tiering); - +rw_attribute(rebalance_enabled); +rw_attribute(rebalance_percent); +sysfs_pd_controller_attribute(rebalance); rw_attribute(pd_controllers_update_seconds); @@ -332,10 +331,10 @@ SHOW(bch2_fs) sysfs_print(pd_controllers_update_seconds, c->pd_controllers_update_seconds); - sysfs_printf(tiering_enabled, "%i", c->tiering_enabled); - sysfs_print(tiering_percent, c->tiering_percent); + sysfs_printf(rebalance_enabled, "%i", c->rebalance_enabled); + sysfs_print(rebalance_percent, c->rebalance_percent); - sysfs_pd_controller_show(tiering, &c->tiers[1].pd); /* XXX */ + sysfs_pd_controller_show(rebalance, &c->rebalance_pd); /* XXX */ sysfs_printf(meta_replicas_have, "%u", bch2_replicas_online(c, true)); sysfs_printf(data_replicas_have, "%u", bch2_replicas_online(c, false)); @@ -397,19 +396,19 @@ STORE(__bch2_fs) return ret; } - if (attr == &sysfs_tiering_enabled) { - ssize_t ret = strtoul_safe(buf, c->tiering_enabled) + if (attr == &sysfs_rebalance_enabled) { + ssize_t ret = strtoul_safe(buf, c->rebalance_enabled) ?: (ssize_t) size; - bch2_tiering_start(c); /* issue wakeups */ + rebalance_wakeup(c); return ret; } sysfs_strtoul(pd_controllers_update_seconds, c->pd_controllers_update_seconds); - sysfs_strtoul(tiering_percent, c->tiering_percent); - sysfs_pd_controller_store(tiering, &c->tiers[1].pd); /* XXX */ + sysfs_strtoul(rebalance_percent, c->rebalance_percent); + sysfs_pd_controller_store(rebalance, &c->rebalance_pd); /* Debugging: */ @@ -468,7 +467,7 @@ struct attribute *bch2_fs_files[] = { &sysfs_writeback_pages_max, - &sysfs_tiering_percent, + &sysfs_rebalance_percent, &sysfs_compression_stats, NULL @@ -506,8 +505,8 @@ struct attribute *bch2_fs_internal_files[] = { &sysfs_prune_cache, &sysfs_copy_gc_enabled, - &sysfs_tiering_enabled, - sysfs_pd_controller_files(tiering), + &sysfs_rebalance_enabled, + sysfs_pd_controller_files(rebalance), &sysfs_internal_uuid, #define BCH_DEBUG_PARAM(name, description) &sysfs_##name, @@ -527,9 +526,7 @@ SHOW(bch2_fs_opts_dir) int id = opt - bch2_opt_table; u64 v = bch2_opt_get_by_id(&c->opts, id); - out += opt->type == BCH_OPT_STR - ? 
bch2_scnprint_string_list(out, end - out, opt->choices, v) - : scnprintf(out, end - out, "%lli", v); + out += bch2_opt_to_text(c, out, end - out, opt, v, OPT_SHOW_FULL_LIST); out += scnprintf(out, end - out, "\n"); return out - buf; @@ -542,13 +539,12 @@ STORE(bch2_fs_opts_dir) int ret, id = opt - bch2_opt_table; u64 v; - ret = bch2_opt_parse(opt, buf, &v); + ret = bch2_opt_parse(c, opt, buf, &v); if (ret < 0) return ret; - mutex_lock(&c->sb_lock); - - if (id == Opt_compression) { + if (id == Opt_compression || + id == Opt_background_compression) { int ret = bch2_check_set_has_compressed_data(c, v); if (ret) { mutex_unlock(&c->sb_lock); @@ -557,13 +553,19 @@ STORE(bch2_fs_opts_dir) } if (opt->set_sb != SET_NO_SB_OPT) { + mutex_lock(&c->sb_lock); opt->set_sb(c->disk_sb, v); bch2_write_super(c); + mutex_unlock(&c->sb_lock); } bch2_opt_set_by_id(&c->opts, id, v); - mutex_unlock(&c->sb_lock); + if ((id == Opt_background_target || + id == Opt_background_compression) && v) { + bch2_rebalance_add_work(c, S64_MAX); + rebalance_wakeup(c); + } return size; } @@ -809,6 +811,26 @@ SHOW(bch2_dev) sysfs_print(nbuckets, ca->mi.nbuckets); sysfs_print(discard, ca->mi.discard); + if (attr == &sysfs_group) { + struct bch_sb_field_disk_groups *groups; + struct bch_disk_group *g; + unsigned len; + + if (!ca->mi.group) + return scnprintf(out, end - out, "none\n"); + + mutex_lock(&c->sb_lock); + groups = bch2_sb_get_disk_groups(c->disk_sb); + + g = &groups->entries[ca->mi.group - 1]; + len = strnlen(g->label, sizeof(g->label)); + memcpy(buf, g->label, len); + mutex_unlock(&c->sb_lock); + + buf[len++] = '\n'; + return len; + } + if (attr == &sysfs_has_data) { out += bch2_scnprint_flag_list(out, end - out, bch2_data_types, @@ -827,8 +849,6 @@ SHOW(bch2_dev) return out - buf; } - sysfs_print(tier, ca->mi.tier); - if (attr == &sysfs_state_rw) { out += bch2_scnprint_string_list(out, end - out, bch2_dev_state, @@ -892,29 +912,10 @@ STORE(bch2_dev) mutex_unlock(&c->sb_lock); } - if (attr == &sysfs_tier) { - unsigned prev_tier; - unsigned v = strtoul_restrict_or_return(buf, - 0, BCH_TIER_MAX - 1); - - mutex_lock(&c->sb_lock); - prev_tier = ca->mi.tier; - - if (v == ca->mi.tier) { - mutex_unlock(&c->sb_lock); - return size; - } - - mi = &bch2_sb_get_members(c->disk_sb)->members[ca->dev_idx]; - SET_BCH_MEMBER_TIER(mi, v); - bch2_write_super(c); - - clear_bit(ca->dev_idx, c->tiers[prev_tier].devs.d); - set_bit(ca->dev_idx, c->tiers[ca->mi.tier].devs.d); - mutex_unlock(&c->sb_lock); - - bch2_recalc_capacity(c); - bch2_tiering_start(c); + if (attr == &sysfs_group) { + int ret = bch2_dev_group_set(c, ca, buf); + if (ret) + return ret; } if (attr == &sysfs_wake_allocator) @@ -934,8 +935,8 @@ struct attribute *bch2_dev_files[] = { /* settings: */ &sysfs_discard, &sysfs_cache_replacement_policy, - &sysfs_tier, &sysfs_state_rw, + &sysfs_group, &sysfs_has_data, &sysfs_iostats, diff --git a/libbcachefs/tier.c b/libbcachefs/tier.c index 775c2e2b..211a844c 100644 --- a/libbcachefs/tier.c +++ b/libbcachefs/tier.c @@ -12,173 +12,247 @@ #include #include +#include #include -static bool __tiering_pred(struct bch_fs *c, struct bch_tier *tier, - struct bkey_s_c_extent e) +static inline bool rebalance_ptr_pred(struct bch_fs *c, + const struct bch_extent_ptr *ptr, + struct bch_extent_crc_unpacked crc, + struct bch_io_opts *io_opts) +{ + struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); + + if (io_opts->background_target && + !dev_in_target(ca, io_opts->background_target) && + !ptr->cached) + return true; + + if 
(io_opts->background_compression && + crc.compression_type != + bch2_compression_opt_to_type[io_opts->background_compression]) + return true; + + return false; +} + +void bch2_rebalance_add_key(struct bch_fs *c, + struct bkey_s_c k, + struct bch_io_opts *io_opts) +{ + const struct bch_extent_ptr *ptr; + struct bch_extent_crc_unpacked crc; + struct bkey_s_c_extent e; + + if (!bkey_extent_is_data(k.k)) + return; + + if (!io_opts->background_target && + !io_opts->background_compression) + return; + + e = bkey_s_c_to_extent(k); + + extent_for_each_ptr_crc(e, ptr, crc) + if (rebalance_ptr_pred(c, ptr, crc, io_opts)) { + struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); + + if (!atomic64_add_return(crc.compressed_size, + &ca->rebalance_work)) + rebalance_wakeup(c); + } +} + +void bch2_rebalance_add_work(struct bch_fs *c, u64 sectors) +{ + if (!atomic64_add_return(sectors, &c->rebalance_work_unknown_dev)) + rebalance_wakeup(c); +} + +static enum data_cmd rebalance_pred(struct bch_fs *c, void *arg, + enum bkey_type type, + struct bkey_s_c_extent e, + struct bch_io_opts *io_opts, + struct data_opts *data_opts) { const struct bch_extent_ptr *ptr; - unsigned replicas = 0; + struct bch_extent_crc_unpacked crc; /* Make sure we have room to add a new pointer: */ if (bkey_val_u64s(e.k) + BKEY_EXTENT_PTR_U64s_MAX > BKEY_EXTENT_VAL_U64s_MAX) - return false; + return DATA_SKIP; - extent_for_each_ptr(e, ptr) - if (bch_dev_bkey_exists(c, ptr->dev)->mi.tier >= tier->idx) - replicas++; + extent_for_each_ptr_crc(e, ptr, crc) + if (rebalance_ptr_pred(c, ptr, crc, io_opts)) + goto found; - return replicas < c->opts.data_replicas; + return DATA_SKIP; +found: + data_opts->target = io_opts->background_target; + data_opts->btree_insert_flags = 0; + return DATA_ADD_REPLICAS; } -static enum data_cmd tiering_pred(struct bch_fs *c, void *arg, - enum bkey_type type, - struct bkey_s_c_extent e, - struct bch_io_opts *io_opts, - struct data_opts *data_opts) +struct rebalance_work { + unsigned dev_most_full_percent; + u64 dev_most_full_work; + u64 dev_most_full_capacity; + u64 total_work; +}; + +static struct rebalance_work rebalance_work(struct bch_fs *c) { - struct bch_tier *tier = arg; + struct bch_dev *ca; + struct rebalance_work ret = { 0 }; + unsigned i; - if (!__tiering_pred(c, tier, e)) - return DATA_SKIP; + for_each_online_member(ca, c, i) { + u64 capacity = bucket_to_sector(ca, ca->mi.nbuckets - + ca->mi.first_bucket); + u64 work = atomic64_read(&ca->rebalance_work) + + atomic64_read(&c->rebalance_work_unknown_dev); + unsigned percent_full = div_u64(work * 100, capacity); + + if (percent_full > ret.dev_most_full_percent) { + ret.dev_most_full_percent = percent_full; + ret.dev_most_full_work = work; + ret.dev_most_full_capacity = capacity; + } - data_opts->btree_insert_flags = 0; - return DATA_ADD_REPLICAS; + ret.total_work += atomic64_read(&ca->rebalance_work); + } + + ret.total_work += atomic64_read(&c->rebalance_work_unknown_dev); + + return ret; } -static int bch2_tiering_thread(void *arg) +static void rebalance_work_reset(struct bch_fs *c) { - struct bch_tier *tier = arg; - struct bch_fs *c = container_of(tier, struct bch_fs, tiers[tier->idx]); - struct io_clock *clock = &c->io_clock[WRITE]; struct bch_dev *ca; - struct bch_move_stats move_stats; - u64 tier_capacity, available_sectors; - unsigned long last; - unsigned i, nr_devices; + unsigned i; + + for_each_online_member(ca, c, i) + atomic64_set(&ca->rebalance_work, 0); + + atomic64_set(&c->rebalance_work_unknown_dev, 0); +} + +static unsigned long 
curr_cputime(void) +{ + u64 utime, stime; + + task_cputime_adjusted(current, &utime, &stime); + return nsecs_to_jiffies(utime + stime); +} + +static int bch2_rebalance_thread(void *arg) +{ + struct bch_fs *c = arg; + struct io_clock *clock = &c->io_clock[WRITE]; + struct rebalance_work w, p; + unsigned long start, prev_start; + unsigned long prev_run_time, prev_run_cputime; + unsigned long cputime, prev_cputime; - memset(&move_stats, 0, sizeof(move_stats)); set_freezable(); - while (!kthread_should_stop()) { - if (kthread_wait_freezable(c->tiering_enabled && - (nr_devices = dev_mask_nr(&tier->devs)))) - break; - - while (1) { - struct bch_tier *faster_tier; - - last = atomic_long_read(&clock->now); - - tier_capacity = available_sectors = 0; - for (faster_tier = c->tiers; - faster_tier != tier; - faster_tier++) { - rcu_read_lock(); - for_each_member_device_rcu(ca, c, i, - &faster_tier->devs) { - tier_capacity += - bucket_to_sector(ca, - ca->mi.nbuckets - - ca->mi.first_bucket); - available_sectors += - bucket_to_sector(ca, - dev_buckets_available(c, ca)); - } - rcu_read_unlock(); - } + p = rebalance_work(c); + prev_start = jiffies; + prev_cputime = curr_cputime(); + + while (!kthread_wait_freezable(c->rebalance_enabled)) { + struct bch_move_stats move_stats = { 0 }; - if (available_sectors < (tier_capacity >> 1)) - break; + w = rebalance_work(c); + start = jiffies; + cputime = curr_cputime(); + + prev_run_time = start - prev_start; + prev_run_cputime = cputime - prev_cputime; + + if (!w.total_work) { + kthread_wait_freezable(rebalance_work(c).total_work); + continue; + } - bch2_kthread_io_clock_wait(clock, - last + - available_sectors - - (tier_capacity >> 1)); - if (kthread_should_stop()) - return 0; + if (w.dev_most_full_percent < 20 && + prev_run_cputime * 5 > prev_run_time) { + if (w.dev_most_full_capacity) { + bch2_kthread_io_clock_wait(clock, + atomic_long_read(&clock->now) + + div_u64(w.dev_most_full_capacity, 5)); + } else { + + set_current_state(TASK_INTERRUPTIBLE); + if (kthread_should_stop()) + break; + + schedule_timeout(prev_run_cputime * 5 - + prev_run_time); + continue; + } } - bch2_move_data(c, &tier->pd.rate, - SECTORS_IN_FLIGHT_PER_DEVICE * nr_devices, - &tier->devs, - writepoint_ptr(&tier->wp), + /* minimum 1 mb/sec: */ + c->rebalance_pd.rate.rate = + max_t(u64, 1 << 11, + c->rebalance_pd.rate.rate * + max(p.dev_most_full_percent, 1U) / + max(w.dev_most_full_percent, 1U)); + + rebalance_work_reset(c); + + bch2_move_data(c, &c->rebalance_pd.rate, + writepoint_ptr(&c->rebalance_write_point), POS_MIN, POS_MAX, - tiering_pred, tier, + rebalance_pred, NULL, &move_stats); } return 0; } -static void __bch2_tiering_stop(struct bch_tier *tier) +void bch2_rebalance_stop(struct bch_fs *c) { - tier->pd.rate.rate = UINT_MAX; - bch2_ratelimit_reset(&tier->pd.rate); - - if (tier->migrate) - kthread_stop(tier->migrate); + struct task_struct *p; - tier->migrate = NULL; -} + c->rebalance_pd.rate.rate = UINT_MAX; + bch2_ratelimit_reset(&c->rebalance_pd.rate); -void bch2_tiering_stop(struct bch_fs *c) -{ - struct bch_tier *tier; + p = c->rebalance_thread; + c->rebalance_thread = NULL; - for (tier = c->tiers; tier < c->tiers + ARRAY_SIZE(c->tiers); tier++) - __bch2_tiering_stop(tier); -} + if (p) { + /* for sychronizing with rebalance_wakeup() */ + synchronize_rcu(); -static int __bch2_tiering_start(struct bch_tier *tier) -{ - if (!tier->migrate) { - struct task_struct *p = - kthread_create(bch2_tiering_thread, tier, - "bch_tier[%u]", tier->idx); - if (IS_ERR(p)) - return PTR_ERR(p); - - 
tier->migrate = p; + kthread_stop(p); + put_task_struct(p); } - - wake_up_process(tier->migrate); - return 0; } -int bch2_tiering_start(struct bch_fs *c) +int bch2_rebalance_start(struct bch_fs *c) { - struct bch_tier *tier; - bool have_faster_tier = false; + struct task_struct *p; if (c->opts.nochanges) return 0; - for (tier = c->tiers; tier < c->tiers + ARRAY_SIZE(c->tiers); tier++) { - if (!dev_mask_nr(&tier->devs)) - continue; - - if (have_faster_tier) { - int ret = __bch2_tiering_start(tier); - if (ret) - return ret; - } else { - __bch2_tiering_stop(tier); - } + p = kthread_create(bch2_rebalance_thread, c, "bch_rebalance"); + if (IS_ERR(p)) + return PTR_ERR(p); - have_faster_tier = true; - } + get_task_struct(p); + rcu_assign_pointer(c->rebalance_thread, p); + wake_up_process(c->rebalance_thread); return 0; } -void bch2_fs_tiering_init(struct bch_fs *c) +void bch2_fs_rebalance_init(struct bch_fs *c) { - unsigned i; + bch2_pd_controller_init(&c->rebalance_pd); - for (i = 0; i < ARRAY_SIZE(c->tiers); i++) { - c->tiers[i].idx = i; - bch2_pd_controller_init(&c->tiers[i].pd); - } + atomic64_set(&c->rebalance_work_unknown_dev, S64_MAX); } diff --git a/libbcachefs/tier.h b/libbcachefs/tier.h index f8eaa9b0..0c66dfea 100644 --- a/libbcachefs/tier.h +++ b/libbcachefs/tier.h @@ -1,8 +1,23 @@ #ifndef _BCACHEFS_TIER_H #define _BCACHEFS_TIER_H -void bch2_tiering_stop(struct bch_fs *); -int bch2_tiering_start(struct bch_fs *); -void bch2_fs_tiering_init(struct bch_fs *); +static inline void rebalance_wakeup(struct bch_fs *c) +{ + struct task_struct *p; + + rcu_read_lock(); + p = rcu_dereference(c->rebalance_thread); + if (p) + wake_up_process(p); + rcu_read_unlock(); +} + +void bch2_rebalance_add_key(struct bch_fs *, struct bkey_s_c, + struct bch_io_opts *); +void bch2_rebalance_add_work(struct bch_fs *, u64); + +void bch2_rebalance_stop(struct bch_fs *); +int bch2_rebalance_start(struct bch_fs *); +void bch2_fs_rebalance_init(struct bch_fs *); #endif /* _BCACHEFS_TIER_H */ diff --git a/libbcachefs/xattr.c b/libbcachefs/xattr.c index 1d6cbe72..81e942e5 100644 --- a/libbcachefs/xattr.c +++ b/libbcachefs/xattr.c @@ -6,6 +6,7 @@ #include "extents.h" #include "fs.h" #include "str_hash.h" +#include "tier.h" #include "xattr.h" #include @@ -366,6 +367,7 @@ static int bch2_xattr_bcachefs_get(const struct xattr_handler *handler, const char *name, void *buffer, size_t size) { struct bch_inode_info *inode = to_bch_ei(vinode); + struct bch_fs *c = inode->v.i_sb->s_fs_info; struct bch_opts opts = bch2_inode_opts_to_opts(bch2_inode_opts_get(&inode->ei_inode)); const struct bch_option *opt; @@ -383,12 +385,9 @@ static int bch2_xattr_bcachefs_get(const struct xattr_handler *handler, v = bch2_opt_get_by_id(&opts, id); - if (opt->type == BCH_OPT_STR) - ret = snprintf(buffer, size, "%s", opt->choices[v]); - else - ret = snprintf(buffer, size, "%llu", v); + ret = bch2_opt_to_text(c, buffer, size, opt, v, 0); - return ret <= size || !buffer ? ret : -ERANGE; + return ret < size || !buffer ? 
ret : -ERANGE; } struct inode_opt_set { @@ -435,17 +434,15 @@ static int bch2_xattr_bcachefs_set(const struct xattr_handler *handler, memcpy(buf, value, size); buf[size] = '\0'; - ret = bch2_opt_parse(opt, buf, &s.v); + ret = bch2_opt_parse(c, opt, buf, &s.v); kfree(buf); if (ret < 0) return ret; - if (s.id == Opt_compression) { - mutex_lock(&c->sb_lock); + if (s.id == Opt_compression || + s.id == Opt_background_compression) { ret = bch2_check_set_has_compressed_data(c, s.v); - mutex_unlock(&c->sb_lock); - if (ret) return ret; } @@ -459,6 +456,11 @@ static int bch2_xattr_bcachefs_set(const struct xattr_handler *handler, ret = __bch2_write_inode(c, inode, inode_opt_set_fn, &s); mutex_unlock(&inode->ei_update_lock); + if (value && + (s.id == Opt_background_compression || + s.id == Opt_background_target)) + bch2_rebalance_add_work(c, inode->v.i_blocks); + return ret; } -- cgit v1.2.3
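
The patch replaces the old tier numbering with a single u16 "target" value used by the new foreground/background target options: 0 means no target, values from TARGET_DEV_START upward name a device index, and values from TARGET_GROUP_START upward name a disk-group index. Below is a minimal, standalone sketch of that encoding so the round-trip is easy to verify outside the kernel tree; the demo_* names and struct are hypothetical, only the constants and decode order mirror the patch.

/* cc -std=c11 -Wall target_demo.c */
#include <assert.h>
#include <stdio.h>

#define TARGET_DEV_START	1
#define TARGET_GROUP_START	(256 + TARGET_DEV_START)

enum demo_target_type { T_NULL, T_DEV, T_GROUP };

struct demo_target {
	enum demo_target_type	type;
	unsigned		idx;	/* device or group index */
};

static unsigned short demo_dev_to_target(unsigned dev)
{
	return TARGET_DEV_START + dev;
}

static unsigned short demo_group_to_target(unsigned group)
{
	return TARGET_GROUP_START + group;
}

static struct demo_target demo_target_decode(unsigned target)
{
	/* groups occupy the range above devices, so test them first: */
	if (target >= TARGET_GROUP_START)
		return (struct demo_target) { T_GROUP, target - TARGET_GROUP_START };
	if (target >= TARGET_DEV_START)
		return (struct demo_target) { T_DEV, target - TARGET_DEV_START };
	return (struct demo_target) { T_NULL, 0 };
}

int main(void)
{
	/* device 3 and group 3 map to distinct, reversible encodings: */
	struct demo_target d = demo_target_decode(demo_dev_to_target(3));
	struct demo_target g = demo_target_decode(demo_group_to_target(3));

	assert(d.type == T_DEV   && d.idx == 3);
	assert(g.type == T_GROUP && g.idx == 3);

	/* 0 always decodes to "none": */
	assert(demo_target_decode(0).type == T_NULL);

	printf("dev 3 -> %u, group 3 -> %u\n",
	       (unsigned) demo_dev_to_target(3),
	       (unsigned) demo_group_to_target(3));
	return 0;
}

In the patch itself, bch2_opt_target_parse() produces these values from either a block-device path (via bch2_dev_lookup()) or a group label (via bch2_disk_group_find()), and bch2_opt_target_print() renders them back; presumably they are set through the filesystem's options directory in sysfs alongside the other mount options.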
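
__bch2_disk_group_find() and bch2_dev_group_set() compare group labels as fixed-width fields: a label fills a fixed-size buffer and is only NUL-terminated when shorter than the buffer, which is why the lookup uses strnlen()+memcmp() rather than strcmp(). A small userspace sketch of that matching logic, with an illustrative struct and buffer size rather than the on-disk bch_disk_group layout:

/* cc -std=c11 -Wall group_find_demo.c */
#define _POSIX_C_SOURCE 200809L	/* for strnlen() */
#include <stdbool.h>
#include <stdio.h>
#include <string.h>

#define DEMO_LABEL_SIZE 32

struct demo_group {
	char	label[DEMO_LABEL_SIZE];	/* not necessarily NUL-terminated */
	bool	deleted;
};

static int demo_group_find(const struct demo_group *groups, unsigned nr,
			   const char *name)
{
	size_t len = strlen(name);

	for (unsigned i = 0; i < nr; i++) {
		const struct demo_group *g = &groups[i];

		if (g->deleted)
			continue;

		/* exact-length match against the fixed-width label field: */
		if (strnlen(g->label, sizeof(g->label)) == len &&
		    !memcmp(name, g->label, len))
			return (int) i;
	}

	return -1;
}

int main(void)
{
	static struct demo_group groups[2];	/* zero-initialized */

	memcpy(groups[0].label, "ssd", 3);
	memcpy(groups[1].label, "hdd", 3);

	printf("ssd -> %d, hdd -> %d, nvme -> %d\n",
	       demo_group_find(groups, 2, "ssd"),
	       demo_group_find(groups, 2, "hdd"),
	       demo_group_find(groups, 2, "nvme"));	/* prints 0, 1, -1 */
	return 0;
}

bch2_dev_group_set() follows the same pattern: if the label is not found it reuses a deleted slot or grows the superblock section by one entry, then stores the (index + 1) in the member's group field, with 0 reserved for "none".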
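
The rebalance thread's rate control in bch2_rebalance_thread() scales the pd-controller rate by roughly the ratio of pending work on the previous pass to pending work now, clamped to a floor of 1 << 11 sectors per second (2048 sectors * 512 bytes = 1 MiB/s, matching the "minimum 1 mb/sec" comment). A back-of-the-envelope sketch of just that arithmetic, with purely illustrative input values and a hypothetical helper name:

/* cc -std=c11 -Wall rebalance_rate_demo.c */
#include <stdint.h>
#include <stdio.h>

static uint64_t demo_next_rate(uint64_t cur_rate,
			       unsigned prev_percent_full,
			       unsigned cur_percent_full)
{
	uint64_t min_rate = 1ULL << 11;	/* sectors/sec == 1 MiB/s */
	uint64_t scaled;

	/* avoid division by zero, as the patch does with max(..., 1U): */
	if (prev_percent_full < 1) prev_percent_full = 1;
	if (cur_percent_full < 1)  cur_percent_full  = 1;

	scaled = cur_rate * prev_percent_full / cur_percent_full;
	return scaled > min_rate ? scaled : min_rate;
}

int main(void)
{
	/* pending work shrinking (40% -> 10% of the fullest device): rate ramps up 4x */
	printf("%llu sectors/s\n",
	       (unsigned long long) demo_next_rate(8192, 40, 10));	/* 32768 */

	/* pending work growing (10% -> 40%): rate drops, clamped at 1 MiB/s */
	printf("%llu sectors/s\n",
	       (unsigned long long) demo_next_rate(8192, 10, 40));	/* 2048 */
	return 0;
}

The surrounding heuristic in the patch additionally backs off entirely when the fullest device is under 20% occupied by pending work and the previous pass spent more than about a fifth of wall-clock time on CPU, either by waiting on the write clock or by sleeping for a multiple of the previous pass's CPU time.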