-rw-r--r--  fs/btrfs/block-group.c |  18
-rw-r--r--  fs/btrfs/space-info.c  | 127
-rw-r--r--  fs/btrfs/space-info.h  |   8
-rw-r--r--  fs/btrfs/sysfs.c       |  43
4 files changed, 169 insertions, 27 deletions
diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c
index b530302b469d..8169a1465db1 100644
--- a/fs/btrfs/block-group.c
+++ b/fs/btrfs/block-group.c
@@ -1764,24 +1764,21 @@ static inline bool btrfs_should_reclaim(struct btrfs_fs_info *fs_info)
 static bool should_reclaim_block_group(struct btrfs_block_group *bg, u64 bytes_freed)
 {
-        const struct btrfs_space_info *space_info = bg->space_info;
-        const int reclaim_thresh = READ_ONCE(space_info->bg_reclaim_threshold);
+        const int thresh_pct = btrfs_calc_reclaim_threshold(bg->space_info);
+        u64 thresh_bytes = mult_perc(bg->length, thresh_pct);
         const u64 new_val = bg->used;
         const u64 old_val = new_val + bytes_freed;
-        u64 thresh;
 
-        if (reclaim_thresh == 0)
+        if (thresh_bytes == 0)
                 return false;
 
-        thresh = mult_perc(bg->length, reclaim_thresh);
-
         /*
          * If we were below the threshold before don't reclaim, we are likely a
          * brand new block group and we don't want to relocate new block groups.
          */
-        if (old_val < thresh)
+        if (old_val < thresh_bytes)
                 return false;
 
-        if (new_val >= thresh)
+        if (new_val >= thresh_bytes)
                 return false;
         return true;
 }
@@ -1843,6 +1840,7 @@ void btrfs_reclaim_bgs_work(struct work_struct *work)
 
                 /* Don't race with allocators so take the groups_sem */
                 down_write(&space_info->groups_sem);
+                spin_lock(&space_info->lock);
                 spin_lock(&bg->lock);
                 if (bg->reserved || bg->pinned || bg->ro) {
                         /*
@@ -1852,6 +1850,7 @@ void btrfs_reclaim_bgs_work(struct work_struct *work)
                          * this block group.
                          */
                         spin_unlock(&bg->lock);
+                        spin_unlock(&space_info->lock);
                         up_write(&space_info->groups_sem);
                         goto next;
                 }
@@ -1870,6 +1869,7 @@ void btrfs_reclaim_bgs_work(struct work_struct *work)
                         if (!btrfs_test_opt(fs_info, DISCARD_ASYNC))
                                 btrfs_mark_bg_unused(bg);
                         spin_unlock(&bg->lock);
+                        spin_unlock(&space_info->lock);
                         up_write(&space_info->groups_sem);
                         goto next;
 
@@ -1886,10 +1886,12 @@ void btrfs_reclaim_bgs_work(struct work_struct *work)
                  */
                 if (!should_reclaim_block_group(bg, bg->length)) {
                         spin_unlock(&bg->lock);
+                        spin_unlock(&space_info->lock);
                         up_write(&space_info->groups_sem);
                         goto next;
                 }
                 spin_unlock(&bg->lock);
+                spin_unlock(&space_info->lock);
 
                 /*
                  * Get out fast, in case we're read-only or unmounting the
diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c
index 3608c1f950f0..5b1940432ff3 100644
--- a/fs/btrfs/space-info.c
+++ b/fs/btrfs/space-info.c
@@ -1,5 +1,6 @@
 // SPDX-License-Identifier: GPL-2.0
 
+#include <linux/minmax.h>
 #include "misc.h"
 #include "ctree.h"
 #include "space-info.h"
@@ -190,6 +191,8 @@ void btrfs_clear_space_info_full(struct btrfs_fs_info *info)
  */
 #define BTRFS_DEFAULT_ZONED_RECLAIM_THRESH      (75)
 
+#define BTRFS_UNALLOC_BLOCK_GROUP_TARGET        (10ULL)
+
 /*
  * Calculate chunk size depending on volume type (regular or zoned).
  */
@@ -341,11 +344,32 @@ struct btrfs_space_info *btrfs_find_space_info(struct btrfs_fs_info *info,
         return NULL;
 }
 
+static u64 calc_effective_data_chunk_size(struct btrfs_fs_info *fs_info)
+{
+        struct btrfs_space_info *data_sinfo;
+        u64 data_chunk_size;
+
+        /*
+         * Calculate the data_chunk_size, space_info->chunk_size is the
+         * "optimal" chunk size based on the fs size. However when we actually
+         * allocate the chunk we will strip this down further, making it no
+         * more than 10% of the disk or 1G, whichever is smaller.
+         *
+         * On the zoned mode, we need to use zone_size (= data_sinfo->chunk_size)
+         * as it is.
+         */
+        data_sinfo = btrfs_find_space_info(fs_info, BTRFS_BLOCK_GROUP_DATA);
+        if (btrfs_is_zoned(fs_info))
+                return data_sinfo->chunk_size;
+        data_chunk_size = min(data_sinfo->chunk_size,
+                              mult_perc(fs_info->fs_devices->total_rw_bytes, 10));
+        return min_t(u64, data_chunk_size, SZ_1G);
+}
+
 static u64 calc_available_free_space(struct btrfs_fs_info *fs_info,
                                      struct btrfs_space_info *space_info,
                                      enum btrfs_reserve_flush_enum flush)
 {
-        struct btrfs_space_info *data_sinfo;
         u64 profile;
         u64 avail;
         u64 data_chunk_size;
@@ -369,23 +393,7 @@ static u64 calc_available_free_space(struct btrfs_fs_info *fs_info,
         if (avail == 0)
                 return 0;
 
-        /*
-         * Calculate the data_chunk_size, space_info->chunk_size is the
-         * "optimal" chunk size based on the fs size. However when we actually
-         * allocate the chunk we will strip this down further, making it no more
-         * than 10% of the disk or 1G, whichever is smaller.
-         *
-         * On the zoned mode, we need to use zone_size (=
-         * data_sinfo->chunk_size) as it is.
-         */
-        data_sinfo = btrfs_find_space_info(fs_info, BTRFS_BLOCK_GROUP_DATA);
-        if (!btrfs_is_zoned(fs_info)) {
-                data_chunk_size = min(data_sinfo->chunk_size,
-                                      mult_perc(fs_info->fs_devices->total_rw_bytes, 10));
-                data_chunk_size = min_t(u64, data_chunk_size, SZ_1G);
-        } else {
-                data_chunk_size = data_sinfo->chunk_size;
-        }
+        data_chunk_size = calc_effective_data_chunk_size(fs_info);
 
         /*
          * Since data allocations immediately use block groups as part of the
@@ -1878,3 +1886,86 @@ u64 btrfs_account_ro_block_groups_free_space(struct btrfs_space_info *sinfo)
 
         return free_bytes;
 }
+
+static u64 calc_pct_ratio(u64 x, u64 y)
+{
+        int err;
+
+        if (!y)
+                return 0;
+again:
+        err = check_mul_overflow(100, x, &x);
+        if (err)
+                goto lose_precision;
+        return div64_u64(x, y);
+lose_precision:
+        x >>= 10;
+        y >>= 10;
+        if (!y)
+                y = 1;
+        goto again;
+}
+
+/*
+ * A reasonable buffer for unallocated space is 10 data block_groups.
+ * If we claw this back repeatedly, we can still achieve efficient
+ * utilization when near full, and not do too much reclaim while
+ * always maintaining a solid buffer for workloads that quickly
+ * allocate and pressure the unallocated space.
+ */
+static u64 calc_unalloc_target(struct btrfs_fs_info *fs_info)
+{
+        return BTRFS_UNALLOC_BLOCK_GROUP_TARGET * calc_effective_data_chunk_size(fs_info);
+}
+
+/*
+ * The fundamental goal of automatic reclaim is to protect the filesystem's
+ * unallocated space and thus minimize the probability of the filesystem going
+ * read only when a metadata allocation failure causes a transaction abort.
+ *
+ * However, relocations happen into the space_info's unused space, therefore
+ * automatic reclaim must also back off as that space runs low. There is no
+ * value in doing trivial "relocations" of re-writing the same block group
+ * into a fresh one.
+ *
+ * Furthermore, we want to avoid doing too much reclaim even if there are good
+ * candidates. This is because the allocator is pretty good at filling up the
+ * holes with writes. So we want to do just enough reclaim to try and stay
+ * safe from running out of unallocated space but not be wasteful about it.
+ *
+ * Therefore, the dynamic reclaim threshold is calculated as follows:
+ * - calculate a target unallocated amount of 10 block group sized chunks
+ * - ratchet up the intensity of reclaim depending on how far we are from
+ *   that target by using a formula of unalloc / target to set the threshold.
+ *
+ * Typically with 10 block groups as the target, the discrete values this comes
+ * out to are 0, 10, 20, ... , 80, 90, and 99.
+ */
+static int calc_dynamic_reclaim_threshold(struct btrfs_space_info *space_info)
+{
+        struct btrfs_fs_info *fs_info = space_info->fs_info;
+        u64 unalloc = atomic64_read(&fs_info->free_chunk_space);
+        u64 target = calc_unalloc_target(fs_info);
+        u64 alloc = space_info->total_bytes;
+        u64 used = btrfs_space_info_used(space_info, false);
+        u64 unused = alloc - used;
+        u64 want = target > unalloc ? target - unalloc : 0;
+        u64 data_chunk_size = calc_effective_data_chunk_size(fs_info);
+        /* Cast to int is OK because want <= target */
+        int ratio = calc_pct_ratio(want, target);
+
+        /* If we have no unused space, don't bother, it won't work anyway */
+        if (unused < data_chunk_size)
+                return 0;
+
+        return ratio;
+}
+
+int btrfs_calc_reclaim_threshold(struct btrfs_space_info *space_info)
+{
+        lockdep_assert_held(&space_info->lock);
+
+        if (READ_ONCE(space_info->dynamic_reclaim))
+                return calc_dynamic_reclaim_threshold(space_info);
+        return READ_ONCE(space_info->bg_reclaim_threshold);
+}
diff --git a/fs/btrfs/space-info.h b/fs/btrfs/space-info.h
index 25edfd453b27..2cac771321c7 100644
--- a/fs/btrfs/space-info.h
+++ b/fs/btrfs/space-info.h
@@ -184,6 +184,12 @@ struct btrfs_space_info {
          * Exposed in /sys/fs/<uuid>/allocation/<type>/reclaim_errors
          */
         u64 reclaim_errors;
+
+        /*
+         * If true, use the dynamic relocation threshold, instead of the
+         * fixed bg_reclaim_threshold.
+         */
+        bool dynamic_reclaim;
 };
 
 struct reserve_ticket {
@@ -266,4 +272,6 @@ void btrfs_dump_space_info_for_trans_abort(struct btrfs_fs_info *fs_info);
 void btrfs_init_async_reclaim_work(struct btrfs_fs_info *fs_info);
 u64 btrfs_account_ro_block_groups_free_space(struct btrfs_space_info *sinfo);
 
+int btrfs_calc_reclaim_threshold(struct btrfs_space_info *space_info);
+
 #endif /* BTRFS_SPACE_INFO_H */
diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c
index 919c7ba45121..360d6093476f 100644
--- a/fs/btrfs/sysfs.c
+++ b/fs/btrfs/sysfs.c
@@ -905,8 +905,12 @@ static ssize_t btrfs_sinfo_bg_reclaim_threshold_show(struct kobject *kobj,
                                                      char *buf)
 {
         struct btrfs_space_info *space_info = to_space_info(kobj);
+        ssize_t ret;
 
-        return sysfs_emit(buf, "%d\n", READ_ONCE(space_info->bg_reclaim_threshold));
+        spin_lock(&space_info->lock);
+        ret = sysfs_emit(buf, "%d\n", btrfs_calc_reclaim_threshold(space_info));
+        spin_unlock(&space_info->lock);
+        return ret;
 }
 
 static ssize_t btrfs_sinfo_bg_reclaim_threshold_store(struct kobject *kobj,
@@ -917,6 +921,9 @@ static ssize_t btrfs_sinfo_bg_reclaim_threshold_store(struct kobject *kobj,
         int thresh;
         int ret;
 
+        if (READ_ONCE(space_info->dynamic_reclaim))
+                return -EINVAL;
+
         ret = kstrtoint(buf, 10, &thresh);
         if (ret)
                 return ret;
@@ -933,6 +940,39 @@ BTRFS_ATTR_RW(space_info, bg_reclaim_threshold,
               btrfs_sinfo_bg_reclaim_threshold_show,
               btrfs_sinfo_bg_reclaim_threshold_store);
 
+static ssize_t btrfs_sinfo_dynamic_reclaim_show(struct kobject *kobj,
+                                                struct kobj_attribute *a,
+                                                char *buf)
+{
+        struct btrfs_space_info *space_info = to_space_info(kobj);
+
+        return sysfs_emit(buf, "%d\n", READ_ONCE(space_info->dynamic_reclaim));
+}
+
+static ssize_t btrfs_sinfo_dynamic_reclaim_store(struct kobject *kobj,
+                                                 struct kobj_attribute *a,
+                                                 const char *buf, size_t len)
+{
+        struct btrfs_space_info *space_info = to_space_info(kobj);
+        int dynamic_reclaim;
+        int ret;
+
+        ret = kstrtoint(buf, 10, &dynamic_reclaim);
+        if (ret)
+                return ret;
+
+        if (dynamic_reclaim < 0)
+                return -EINVAL;
+
+        WRITE_ONCE(space_info->dynamic_reclaim, dynamic_reclaim != 0);
+
+        return len;
+}
+
+BTRFS_ATTR_RW(space_info, dynamic_reclaim,
+              btrfs_sinfo_dynamic_reclaim_show,
+              btrfs_sinfo_dynamic_reclaim_store);
+
 /*
  * Allocation information about block group types.
  *
@@ -950,6 +990,7 @@ static struct attribute *space_info_attrs[] = {
         BTRFS_ATTR_PTR(space_info, disk_used),
         BTRFS_ATTR_PTR(space_info, disk_total),
         BTRFS_ATTR_PTR(space_info, bg_reclaim_threshold),
+        BTRFS_ATTR_PTR(space_info, dynamic_reclaim),
         BTRFS_ATTR_PTR(space_info, chunk_size),
         BTRFS_ATTR_PTR(space_info, size_classes),
         BTRFS_ATTR_PTR(space_info, reclaim_count),
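For reference, the threshold arithmetic above can be exercised in isolation. Below is a minimal user-space sketch, not kernel code: the helper names, the 1 GiB chunk size, the 1 TiB device size and the 20 GiB of unused data space are made-up example values. It only mirrors the formula used by calc_dynamic_reclaim_threshold(): the effective data chunk size is min(chunk_size, 10% of the disk, 1G), the target is 10 such chunks, and the threshold is 100 * (target - unallocated) / target, forced to 0 when the space_info has less than one chunk of unused space to relocate into.

/*
 * Minimal user-space sketch of the dynamic reclaim threshold.
 * All sizes below are hypothetical and only illustrate the arithmetic;
 * the kernel reads the real values under space_info->lock.
 */
#include <stdio.h>
#include <stdint.h>

#define SZ_1G                           (1024ULL * 1024 * 1024)
#define UNALLOC_BLOCK_GROUP_TARGET      10ULL   /* mirrors BTRFS_UNALLOC_BLOCK_GROUP_TARGET */

static uint64_t min_u64(uint64_t a, uint64_t b)
{
        return a < b ? a : b;
}

/* min(optimal chunk size, 10% of the disk, 1G), as in calc_effective_data_chunk_size() */
static uint64_t effective_data_chunk_size(uint64_t chunk_size, uint64_t total_rw_bytes)
{
        return min_u64(min_u64(chunk_size, total_rw_bytes / 10), SZ_1G);
}

/* Same shape as calc_dynamic_reclaim_threshold(): 100 * want / target, in percent */
static int dynamic_reclaim_threshold(uint64_t unalloc, uint64_t unused,
                                     uint64_t chunk_size, uint64_t total_rw_bytes)
{
        uint64_t bg = effective_data_chunk_size(chunk_size, total_rw_bytes);
        uint64_t target = UNALLOC_BLOCK_GROUP_TARGET * bg;
        uint64_t want = target > unalloc ? target - unalloc : 0;

        /* No unused space to relocate into, reclaim cannot help. */
        if (unused < bg)
                return 0;

        return (int)(100 * want / target);
}

int main(void)
{
        uint64_t total = 1024 * SZ_1G;  /* hypothetical 1 TiB filesystem */
        uint64_t unused = 20 * SZ_1G;   /* unused space inside the data space_info */

        for (int bgs = 10; bgs >= 1; bgs--)
                printf("unallocated = %2d GiB -> reclaim threshold = %d%%\n", bgs,
                       dynamic_reclaim_threshold((uint64_t)bgs * SZ_1G, unused,
                                                 SZ_1G, total));
        return 0;
}

With these example numbers the threshold steps through 0, 10, 20, ..., 90 as unallocated space shrinks from ten block groups' worth down to one, matching the discrete values mentioned in the comment above calc_dynamic_reclaim_threshold().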