From a5ed45f8224f2c7e4ad5a9673cb50e8e3128bd88 Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Thu, 11 May 2017 09:17:46 +0300 Subject: btrfs: Convert fs_info->free_chunk_space to atomic64_t The ->free_chunk_space variable is used to track the unallocated space and access to it is protected by a spinlock, which is not used for anything else. Make the code a bit self-explanatory by switching the variable to an atomic64_t type and kill the spinlock. Signed-off-by: Nikolay Borisov [ not a performance critical code, use of atomic type is ok ] Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/extent-tree.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'fs/btrfs/extent-tree.c') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 33d979e9ea2a..4c0d3980fe3f 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -4646,9 +4646,7 @@ static int can_overcommit(struct btrfs_root *root, used += space_info->bytes_may_use; - spin_lock(&fs_info->free_chunk_lock); - avail = fs_info->free_chunk_space; - spin_unlock(&fs_info->free_chunk_lock); + avail = atomic64_read(&fs_info->free_chunk_space); /* * If we have dup, raid1 or raid10 then only half of the free -- cgit v1.2.3 From 1b86826d12dc4acf9576cad9c43da74025dc8074 Mon Sep 17 00:00:00 2001 From: Jeff Mahoney Date: Wed, 17 May 2017 11:38:35 -0400 Subject: btrfs: cleanup root usage by btrfs_get_alloc_profile There are two places where we don't already know what kind of alloc profile we need before calling btrfs_get_alloc_profile, but we need access to a root everywhere we call it. This patch adds helpers for btrfs_{data,metadata,system}_alloc_profile() and relegates btrfs_system_alloc_profile to a static for use in those two cases. The next patch will eliminate one of those. Signed-off-by: Jeff Mahoney Reviewed-by: Liu Bo Signed-off-by: David Sterba --- fs/btrfs/ctree.h | 4 +++- fs/btrfs/extent-tree.c | 28 +++++++++++++++++++++------- fs/btrfs/inode.c | 3 +-- fs/btrfs/super.c | 3 +-- fs/btrfs/volumes.c | 5 ++--- 5 files changed, 28 insertions(+), 15 deletions(-) (limited to 'fs/btrfs/extent-tree.c') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index c457cb177340..c13b4859df73 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -2681,7 +2681,9 @@ void btrfs_get_block_group_trimming(struct btrfs_block_group_cache *cache); void btrfs_put_block_group_trimming(struct btrfs_block_group_cache *cache); void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info); -u64 btrfs_get_alloc_profile(struct btrfs_root *root, int data); +u64 btrfs_data_alloc_profile(struct btrfs_fs_info *fs_info); +u64 btrfs_metadata_alloc_profile(struct btrfs_fs_info *fs_info); +u64 btrfs_system_alloc_profile(struct btrfs_fs_info *fs_info); void btrfs_clear_space_info_full(struct btrfs_fs_info *info); enum btrfs_reserve_flush_enum { diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 4c0d3980fe3f..47eb98ffdfc6 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -4121,7 +4121,7 @@ static u64 get_alloc_profile(struct btrfs_fs_info *fs_info, u64 orig_flags) return btrfs_reduce_alloc_profile(fs_info, flags); } -u64 btrfs_get_alloc_profile(struct btrfs_root *root, int data) +static u64 get_alloc_profile_by_root(struct btrfs_root *root, int data) { struct btrfs_fs_info *fs_info = root->fs_info; u64 flags; @@ -4138,6 +4138,21 @@ u64 btrfs_get_alloc_profile(struct btrfs_root *root, int data) return ret; } +u64 btrfs_data_alloc_profile(struct btrfs_fs_info *fs_info) +{ + return get_alloc_profile(fs_info, BTRFS_BLOCK_GROUP_DATA); +} + +u64 btrfs_metadata_alloc_profile(struct btrfs_fs_info *fs_info) +{ + return get_alloc_profile(fs_info, BTRFS_BLOCK_GROUP_METADATA); +} + +u64 btrfs_system_alloc_profile(struct btrfs_fs_info *fs_info) +{ + return get_alloc_profile(fs_info, BTRFS_BLOCK_GROUP_SYSTEM); +} + static u64 btrfs_space_info_used(struct btrfs_space_info *s_info, bool may_use_included) { @@ -4187,7 +4202,7 @@ again: data_sinfo->force_alloc = CHUNK_ALLOC_FORCE; spin_unlock(&data_sinfo->lock); alloc: - alloc_target = btrfs_get_alloc_profile(root, 1); + alloc_target = btrfs_data_alloc_profile(fs_info); /* * It is ugly that we don't call nolock join * transaction for the free space inode case here. @@ -4463,9 +4478,8 @@ void check_system_chunk(struct btrfs_trans_handle *trans, } if (left < thresh) { - u64 flags; + u64 flags = btrfs_system_alloc_profile(fs_info); - flags = btrfs_get_alloc_profile(fs_info->chunk_root, 0); /* * Ignore failure to create system chunk. We might end up not * needing it, as we might not need to COW all nodes/leafs from @@ -4629,7 +4643,7 @@ static int can_overcommit(struct btrfs_root *root, if (space_info->flags & BTRFS_BLOCK_GROUP_DATA) return 0; - profile = btrfs_get_alloc_profile(root, 0); + profile = get_alloc_profile_by_root(root, 0); used = btrfs_space_info_used(space_info, false); /* @@ -4894,7 +4908,7 @@ static int flush_space(struct btrfs_fs_info *fs_info, break; } ret = do_chunk_alloc(trans, fs_info, - btrfs_get_alloc_profile(root, 0), + btrfs_metadata_alloc_profile(fs_info), CHUNK_ALLOC_NO_FORCE); btrfs_end_transaction(trans); if (ret > 0 || ret == -ENOSPC) @@ -7954,7 +7968,7 @@ int btrfs_reserve_extent(struct btrfs_root *root, u64 ram_bytes, u64 flags; int ret; - flags = btrfs_get_alloc_profile(root, is_data); + flags = get_alloc_profile_by_root(root, is_data); again: WARN_ON(num_bytes < fs_info->sectorsize); ret = find_free_extent(fs_info, ram_bytes, num_bytes, empty_size, diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 683ee05798e5..222d33d78092 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -8473,7 +8473,6 @@ static int btrfs_submit_direct_hook(struct btrfs_dio_private *dip, { struct inode *inode = dip->inode; struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); - struct btrfs_root *root = BTRFS_I(inode)->root; struct bio *bio; struct bio *orig_bio = dip->orig_bio; u64 start_sector = orig_bio->bi_iter.bi_sector; @@ -8499,7 +8498,7 @@ static int btrfs_submit_direct_hook(struct btrfs_dio_private *dip, } /* async crcs make it difficult to collect full stripe writes. */ - if (btrfs_get_alloc_profile(root, 1) & BTRFS_BLOCK_GROUP_RAID56_MASK) + if (btrfs_data_alloc_profile(fs_info) & BTRFS_BLOCK_GROUP_RAID56_MASK) async_submit = 0; else async_submit = 1; diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 4f1cdd5058f1..3371213924bd 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -1898,7 +1898,6 @@ static inline void btrfs_descending_sort_devices( static int btrfs_calc_avail_data_space(struct btrfs_fs_info *fs_info, u64 *free_bytes) { - struct btrfs_root *root = fs_info->tree_root; struct btrfs_device_info *devices_info; struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; struct btrfs_device *device; @@ -1932,7 +1931,7 @@ static int btrfs_calc_avail_data_space(struct btrfs_fs_info *fs_info, return -ENOMEM; /* calc min stripe number for data space allocation */ - type = btrfs_get_alloc_profile(root, 1); + type = btrfs_data_alloc_profile(fs_info); if (type & BTRFS_BLOCK_GROUP_RAID0) { min_stripes = 2; num_stripes = nr_devices; diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index e37f95976443..e28c113785bb 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -5019,20 +5019,19 @@ int btrfs_alloc_chunk(struct btrfs_trans_handle *trans, static noinline int init_first_rw_device(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info) { - struct btrfs_root *extent_root = fs_info->extent_root; u64 chunk_offset; u64 sys_chunk_offset; u64 alloc_profile; int ret; chunk_offset = find_next_chunk(fs_info); - alloc_profile = btrfs_get_alloc_profile(extent_root, 0); + alloc_profile = btrfs_metadata_alloc_profile(fs_info); ret = __btrfs_alloc_chunk(trans, chunk_offset, alloc_profile); if (ret) return ret; sys_chunk_offset = find_next_chunk(fs_info); - alloc_profile = btrfs_get_alloc_profile(fs_info->chunk_root, 0); + alloc_profile = btrfs_system_alloc_profile(fs_info); ret = __btrfs_alloc_chunk(trans, sys_chunk_offset, alloc_profile); return ret; } -- cgit v1.2.3 From c1c4919b112d2591bea5c68b54a24f083619f81b Mon Sep 17 00:00:00 2001 From: Jeff Mahoney Date: Wed, 17 May 2017 11:38:36 -0400 Subject: btrfs: remove root usage from can_overcommit can_overcommit using the root to determine the allocation profile is the only use of a root in the call graph below reserve_metadata_bytes. It turns out that we only need to know whether the allocation is for the chunk root or not -- and we can pass that around as a bool instead. This allows us to pull root usage out of the reservation path all the way up to reserve_metadata_bytes itself, which uses it only to compare against fs_info->chunk_root to set the bool. In turn, this eliminates a bunch of races where we use a particular root too early in the mount process. Signed-off-by: Jeff Mahoney Reviewed-by: Liu Bo Signed-off-by: David Sterba --- fs/btrfs/extent-tree.c | 82 ++++++++++++++++++++++++++++---------------------- 1 file changed, 46 insertions(+), 36 deletions(-) (limited to 'fs/btrfs/extent-tree.c') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 47eb98ffdfc6..87eab8fdac73 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -97,10 +97,11 @@ static int btrfs_free_reserved_bytes(struct btrfs_block_group_cache *cache, u64 num_bytes, int delalloc); static int block_rsv_use_bytes(struct btrfs_block_rsv *block_rsv, u64 num_bytes); -static int __reserve_metadata_bytes(struct btrfs_root *root, +static int __reserve_metadata_bytes(struct btrfs_fs_info *fs_info, struct btrfs_space_info *space_info, u64 orig_bytes, - enum btrfs_reserve_flush_enum flush); + enum btrfs_reserve_flush_enum flush, + bool system_chunk); static void space_info_add_new_bytes(struct btrfs_fs_info *fs_info, struct btrfs_space_info *space_info, u64 num_bytes); @@ -4628,11 +4629,11 @@ out: return ret; } -static int can_overcommit(struct btrfs_root *root, +static int can_overcommit(struct btrfs_fs_info *fs_info, struct btrfs_space_info *space_info, u64 bytes, - enum btrfs_reserve_flush_enum flush) + enum btrfs_reserve_flush_enum flush, + bool system_chunk) { - struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv; u64 profile; u64 space_size; @@ -4643,7 +4644,11 @@ static int can_overcommit(struct btrfs_root *root, if (space_info->flags & BTRFS_BLOCK_GROUP_DATA) return 0; - profile = get_alloc_profile_by_root(root, 0); + if (system_chunk) + profile = btrfs_system_alloc_profile(fs_info); + else + profile = btrfs_metadata_alloc_profile(fs_info); + used = btrfs_space_info_used(space_info, false); /* @@ -4728,10 +4733,9 @@ static inline int calc_reclaim_items_nr(struct btrfs_fs_info *fs_info, /* * shrink metadata reservation for delalloc */ -static void shrink_delalloc(struct btrfs_root *root, u64 to_reclaim, u64 orig, - bool wait_ordered) +static void shrink_delalloc(struct btrfs_fs_info *fs_info, u64 to_reclaim, + u64 orig, bool wait_ordered) { - struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_block_rsv *block_rsv; struct btrfs_space_info *space_info; struct btrfs_trans_handle *trans; @@ -4788,7 +4792,7 @@ skip_async: else flush = BTRFS_RESERVE_NO_FLUSH; spin_lock(&space_info->lock); - if (can_overcommit(root, space_info, orig, flush)) { + if (can_overcommit(fs_info, space_info, orig, flush, false)) { spin_unlock(&space_info->lock); break; } @@ -4898,7 +4902,7 @@ static int flush_space(struct btrfs_fs_info *fs_info, break; case FLUSH_DELALLOC: case FLUSH_DELALLOC_WAIT: - shrink_delalloc(root, num_bytes * 2, orig_bytes, + shrink_delalloc(fs_info, num_bytes * 2, orig_bytes, state == FLUSH_DELALLOC_WAIT); break; case ALLOC_CHUNK: @@ -4929,8 +4933,9 @@ static int flush_space(struct btrfs_fs_info *fs_info, } static inline u64 -btrfs_calc_reclaim_metadata_size(struct btrfs_root *root, - struct btrfs_space_info *space_info) +btrfs_calc_reclaim_metadata_size(struct btrfs_fs_info *fs_info, + struct btrfs_space_info *space_info, + bool system_chunk) { struct reserve_ticket *ticket; u64 used; @@ -4945,14 +4950,15 @@ btrfs_calc_reclaim_metadata_size(struct btrfs_root *root, return to_reclaim; to_reclaim = min_t(u64, num_online_cpus() * SZ_1M, SZ_16M); - if (can_overcommit(root, space_info, to_reclaim, - BTRFS_RESERVE_FLUSH_ALL)) + if (can_overcommit(fs_info, space_info, to_reclaim, + BTRFS_RESERVE_FLUSH_ALL, system_chunk)) return 0; used = space_info->bytes_used + space_info->bytes_reserved + space_info->bytes_pinned + space_info->bytes_readonly + space_info->bytes_may_use; - if (can_overcommit(root, space_info, SZ_1M, BTRFS_RESERVE_FLUSH_ALL)) + if (can_overcommit(fs_info, space_info, SZ_1M, + BTRFS_RESERVE_FLUSH_ALL, system_chunk)) expected = div_factor_fine(space_info->total_bytes, 95); else expected = div_factor_fine(space_info->total_bytes, 90); @@ -4966,17 +4972,18 @@ btrfs_calc_reclaim_metadata_size(struct btrfs_root *root, return to_reclaim; } -static inline int need_do_async_reclaim(struct btrfs_space_info *space_info, - struct btrfs_root *root, u64 used) +static inline int need_do_async_reclaim(struct btrfs_fs_info *fs_info, + struct btrfs_space_info *space_info, + u64 used, bool system_chunk) { - struct btrfs_fs_info *fs_info = root->fs_info; u64 thresh = div_factor_fine(space_info->total_bytes, 98); /* If we're just plain full then async reclaim just slows us down. */ if ((space_info->bytes_used + space_info->bytes_reserved) >= thresh) return 0; - if (!btrfs_calc_reclaim_metadata_size(root, space_info)) + if (!btrfs_calc_reclaim_metadata_size(fs_info, space_info, + system_chunk)) return 0; return (used >= thresh && !btrfs_fs_closing(fs_info) && @@ -5013,8 +5020,8 @@ static void btrfs_async_reclaim_metadata_space(struct work_struct *work) space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA); spin_lock(&space_info->lock); - to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info->fs_root, - space_info); + to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info, space_info, + false); if (!to_reclaim) { space_info->flush = 0; spin_unlock(&space_info->lock); @@ -5036,8 +5043,9 @@ static void btrfs_async_reclaim_metadata_space(struct work_struct *work) spin_unlock(&space_info->lock); return; } - to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info->fs_root, - space_info); + to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info, + space_info, + false); ticket = list_first_entry(&space_info->tickets, struct reserve_ticket, list); if (last_tickets_id == space_info->tickets_id) { @@ -5075,8 +5083,8 @@ static void priority_reclaim_metadata_space(struct btrfs_fs_info *fs_info, int flush_state = FLUSH_DELAYED_ITEMS_NR; spin_lock(&space_info->lock); - to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info->extent_root, - space_info); + to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info, space_info, + false); if (!to_reclaim) { spin_unlock(&space_info->lock); return; @@ -5155,12 +5163,12 @@ static int wait_reserve_ticket(struct btrfs_fs_info *fs_info, * regain reservations will be made and this will fail if there is not enough * space already. */ -static int __reserve_metadata_bytes(struct btrfs_root *root, +static int __reserve_metadata_bytes(struct btrfs_fs_info *fs_info, struct btrfs_space_info *space_info, u64 orig_bytes, - enum btrfs_reserve_flush_enum flush) + enum btrfs_reserve_flush_enum flush, + bool system_chunk) { - struct btrfs_fs_info *fs_info = root->fs_info; struct reserve_ticket ticket; u64 used; int ret = 0; @@ -5182,7 +5190,8 @@ static int __reserve_metadata_bytes(struct btrfs_root *root, trace_btrfs_space_reservation(fs_info, "space_info", space_info->flags, orig_bytes, 1); ret = 0; - } else if (can_overcommit(root, space_info, orig_bytes, flush)) { + } else if (can_overcommit(fs_info, space_info, orig_bytes, flush, + system_chunk)) { space_info->bytes_may_use += orig_bytes; trace_btrfs_space_reservation(fs_info, "space_info", space_info->flags, orig_bytes, 1); @@ -5209,7 +5218,7 @@ static int __reserve_metadata_bytes(struct btrfs_root *root, orig_bytes, flush, "enospc"); queue_work(system_unbound_wq, - &root->fs_info->async_reclaim_work); + &fs_info->async_reclaim_work); } } else { list_add_tail(&ticket.list, @@ -5223,7 +5232,8 @@ static int __reserve_metadata_bytes(struct btrfs_root *root, * the async reclaim as we will panic. */ if (!test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags) && - need_do_async_reclaim(space_info, root, used) && + need_do_async_reclaim(fs_info, space_info, + used, system_chunk) && !work_busy(&fs_info->async_reclaim_work)) { trace_btrfs_trigger_flush(fs_info, space_info->flags, orig_bytes, flush, "preempt"); @@ -5281,9 +5291,10 @@ static int reserve_metadata_bytes(struct btrfs_root *root, struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv; int ret; + bool system_chunk = (root == fs_info->chunk_root); - ret = __reserve_metadata_bytes(root, block_rsv->space_info, orig_bytes, - flush); + ret = __reserve_metadata_bytes(fs_info, block_rsv->space_info, + orig_bytes, flush, system_chunk); if (ret == -ENOSPC && unlikely(root->orphan_cleanup_state == ORPHAN_CLEANUP_STARTED)) { if (block_rsv != global_rsv && @@ -5406,8 +5417,7 @@ again: * adding the ticket space would be a double count. */ if (check_overcommit && - !can_overcommit(fs_info->extent_root, space_info, 0, - flush)) + !can_overcommit(fs_info, space_info, 0, flush, false)) break; if (num_bytes >= ticket->bytes) { list_del_init(&ticket->list); -- cgit v1.2.3 From 28785f70ef882e4798cd5706066a55dbf7adf80e Mon Sep 17 00:00:00 2001 From: Liu Bo Date: Fri, 19 May 2017 11:39:15 -0600 Subject: Btrfs: skip commit transaction if we don't have enough pinned bytes We commit transaction in order to reclaim space from pinned bytes because it could process delayed refs, and in may_commit_transaction(), we check first if pinned bytes are enough for the required space, we then check if that plus bytes reserved for delayed insert are enough for the required space. This changes the code to the above logic. Fixes: b150a4f10d87 ("Btrfs: use a percpu to keep track of possibly pinned bytes") Tested-by: Nikolay Borisov Reported-by: Nikolay Borisov Reviewed-by: Nikolay Borisov Signed-off-by: Liu Bo Signed-off-by: David Sterba --- fs/btrfs/extent-tree.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs/btrfs/extent-tree.c') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 87eab8fdac73..da8e49e04eb0 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -4854,7 +4854,7 @@ static int may_commit_transaction(struct btrfs_fs_info *fs_info, spin_lock(&delayed_rsv->lock); if (percpu_counter_compare(&space_info->total_bytes_pinned, - bytes - delayed_rsv->size) >= 0) { + bytes - delayed_rsv->size) < 0) { spin_unlock(&delayed_rsv->lock); return -ENOSPC; } -- cgit v1.2.3 From 2be12ef79fe9f5333cb9fadc85ef20b6832a56d7 Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Mon, 22 May 2017 09:35:49 +0300 Subject: btrfs: Separate space_info create/update Currently the struct space_info creation code is intermixed in the udpate_space_info function. There are well-defined points at which the we actually want to create brand-new space_info structs (e.g. during mount of the filesystem as well as sometimes when adding/initialising new chunks). In such cases update_space_info is called with 0 as the bytes parameter. All of this makes for spaghetti code. Fix it by factoring out the creation code in a separate create_space_info structure. This also allows to simplify the internals. Also remove BUG_ON from do_alloc_chunk since the callers handle errors. Furthermore it will make the update_space_info function not fail, allowing us to remove error handling in callers. This will come in a follow up patch. Signed-off-by: Nikolay Borisov Reviewed-by: Jeff Mahoney Reviewed-by: Liu Bo Signed-off-by: David Sterba --- fs/btrfs/extent-tree.c | 132 ++++++++++++++++++++++++------------------------- 1 file changed, 66 insertions(+), 66 deletions(-) (limited to 'fs/btrfs/extent-tree.c') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index da8e49e04eb0..4e88df0b3f0a 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -3925,15 +3925,60 @@ static const char *alloc_name(u64 flags) }; } +static int create_space_info(struct btrfs_fs_info *info, u64 flags, + struct btrfs_space_info **new) +{ + + struct btrfs_space_info *space_info; + int i; + int ret; + + space_info = kzalloc(sizeof(*space_info), GFP_NOFS); + if (!space_info) + return -ENOMEM; + + ret = percpu_counter_init(&space_info->total_bytes_pinned, 0, + GFP_KERNEL); + if (ret) { + kfree(space_info); + return ret; + } + + for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) + INIT_LIST_HEAD(&space_info->block_groups[i]); + init_rwsem(&space_info->groups_sem); + spin_lock_init(&space_info->lock); + space_info->flags = flags & BTRFS_BLOCK_GROUP_TYPE_MASK; + space_info->force_alloc = CHUNK_ALLOC_NO_FORCE; + init_waitqueue_head(&space_info->wait); + INIT_LIST_HEAD(&space_info->ro_bgs); + INIT_LIST_HEAD(&space_info->tickets); + INIT_LIST_HEAD(&space_info->priority_tickets); + + ret = kobject_init_and_add(&space_info->kobj, &space_info_ktype, + info->space_info_kobj, "%s", + alloc_name(space_info->flags)); + if (ret) { + percpu_counter_destroy(&space_info->total_bytes_pinned); + kfree(space_info); + return ret; + } + + *new = space_info; + list_add_rcu(&space_info->list, &info->space_info); + if (flags & BTRFS_BLOCK_GROUP_DATA) + info->data_sinfo = space_info; + + return ret; +} + static int update_space_info(struct btrfs_fs_info *info, u64 flags, u64 total_bytes, u64 bytes_used, u64 bytes_readonly, struct btrfs_space_info **space_info) { struct btrfs_space_info *found; - int i; int factor; - int ret; if (flags & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10)) @@ -3957,54 +4002,6 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags, *space_info = found; return 0; } - found = kzalloc(sizeof(*found), GFP_NOFS); - if (!found) - return -ENOMEM; - - ret = percpu_counter_init(&found->total_bytes_pinned, 0, GFP_KERNEL); - if (ret) { - kfree(found); - return ret; - } - - for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) - INIT_LIST_HEAD(&found->block_groups[i]); - init_rwsem(&found->groups_sem); - spin_lock_init(&found->lock); - found->flags = flags & BTRFS_BLOCK_GROUP_TYPE_MASK; - found->total_bytes = total_bytes; - found->disk_total = total_bytes * factor; - found->bytes_used = bytes_used; - found->disk_used = bytes_used * factor; - found->bytes_pinned = 0; - found->bytes_reserved = 0; - found->bytes_readonly = bytes_readonly; - found->bytes_may_use = 0; - found->full = 0; - found->max_extent_size = 0; - found->force_alloc = CHUNK_ALLOC_NO_FORCE; - found->chunk_alloc = 0; - found->flush = 0; - init_waitqueue_head(&found->wait); - INIT_LIST_HEAD(&found->ro_bgs); - INIT_LIST_HEAD(&found->tickets); - INIT_LIST_HEAD(&found->priority_tickets); - - ret = kobject_init_and_add(&found->kobj, &space_info_ktype, - info->space_info_kobj, "%s", - alloc_name(found->flags)); - if (ret) { - percpu_counter_destroy(&found->total_bytes_pinned); - kfree(found); - return ret; - } - - *space_info = found; - list_add_rcu(&found->list, &info->space_info); - if (flags & BTRFS_BLOCK_GROUP_DATA) - info->data_sinfo = found; - - return ret; } static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags) @@ -4521,10 +4518,10 @@ static int do_chunk_alloc(struct btrfs_trans_handle *trans, space_info = __find_space_info(fs_info, flags); if (!space_info) { - ret = update_space_info(fs_info, flags, 0, 0, 0, &space_info); - BUG_ON(ret); /* -ENOMEM */ + ret = create_space_info(fs_info, flags, &space_info); + if (ret) + return ret; } - BUG_ON(!space_info); /* Logic error */ again: spin_lock(&space_info->lock); @@ -10225,16 +10222,19 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans, } #endif /* - * Call to ensure the corresponding space_info object is created and - * assigned to our block group, but don't update its counters just yet. - * We want our bg to be added to the rbtree with its ->space_info set. + * Ensure the corresponding space_info object is created and + * assigned to our block group. We want our bg to be added to the rbtree + * with its ->space_info set. */ - ret = update_space_info(fs_info, cache->flags, 0, 0, 0, - &cache->space_info); - if (ret) { - btrfs_remove_free_space_cache(cache); - btrfs_put_block_group(cache); - return ret; + cache->space_info = __find_space_info(fs_info, cache->flags); + if (!cache->space_info) { + ret = create_space_info(fs_info, cache->flags, + &cache->space_info); + if (ret) { + btrfs_remove_free_space_cache(cache); + btrfs_put_block_group(cache); + return ret; + } } ret = btrfs_add_block_group_cache(fs_info, cache); @@ -10808,21 +10808,21 @@ int btrfs_init_space_info(struct btrfs_fs_info *fs_info) mixed = 1; flags = BTRFS_BLOCK_GROUP_SYSTEM; - ret = update_space_info(fs_info, flags, 0, 0, 0, &space_info); + ret = create_space_info(fs_info, flags, &space_info); if (ret) goto out; if (mixed) { flags = BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_DATA; - ret = update_space_info(fs_info, flags, 0, 0, 0, &space_info); + ret = create_space_info(fs_info, flags, &space_info); } else { flags = BTRFS_BLOCK_GROUP_METADATA; - ret = update_space_info(fs_info, flags, 0, 0, 0, &space_info); + ret = create_space_info(fs_info, flags, &space_info); if (ret) goto out; flags = BTRFS_BLOCK_GROUP_DATA; - ret = update_space_info(fs_info, flags, 0, 0, 0, &space_info); + ret = create_space_info(fs_info, flags, &space_info); } out: return ret; -- cgit v1.2.3 From d2006e6d28ae2182db9f12dc686cfca78e878f52 Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Mon, 22 May 2017 09:35:50 +0300 Subject: btrfs: Refactor update_space_info Following the factoring out of the creation code udpate_space_info can only be called for already-existing space_info structs. As such it cannot fail. Remove superfluous error handling and make the function return void. Signed-off-by: Nikolay Borisov Reviewed-by: Jeff Mahoney Reviewed-by: Liu Bo Signed-off-by: David Sterba --- fs/btrfs/extent-tree.c | 58 ++++++++++++++++---------------------------------- 1 file changed, 18 insertions(+), 40 deletions(-) (limited to 'fs/btrfs/extent-tree.c') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 4e88df0b3f0a..4689b88ac408 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -3972,7 +3972,7 @@ static int create_space_info(struct btrfs_fs_info *info, u64 flags, return ret; } -static int update_space_info(struct btrfs_fs_info *info, u64 flags, +static void update_space_info(struct btrfs_fs_info *info, u64 flags, u64 total_bytes, u64 bytes_used, u64 bytes_readonly, struct btrfs_space_info **space_info) @@ -3987,21 +3987,19 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags, factor = 1; found = __find_space_info(info, flags); - if (found) { - spin_lock(&found->lock); - found->total_bytes += total_bytes; - found->disk_total += total_bytes * factor; - found->bytes_used += bytes_used; - found->disk_used += bytes_used * factor; - found->bytes_readonly += bytes_readonly; - if (total_bytes > 0) - found->full = 0; - space_info_add_new_bytes(info, found, total_bytes - - bytes_used - bytes_readonly); - spin_unlock(&found->lock); - *space_info = found; - return 0; - } + ASSERT(found); + spin_lock(&found->lock); + found->total_bytes += total_bytes; + found->disk_total += total_bytes * factor; + found->bytes_used += bytes_used; + found->disk_used += bytes_used * factor; + found->bytes_readonly += bytes_readonly; + if (total_bytes > 0) + found->full = 0; + space_info_add_new_bytes(info, found, total_bytes - + bytes_used - bytes_readonly); + spin_unlock(&found->lock); + *space_info = found; } static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags) @@ -10078,19 +10076,9 @@ int btrfs_read_block_groups(struct btrfs_fs_info *info) } trace_btrfs_add_block_group(info, cache, 0); - ret = update_space_info(info, cache->flags, found_key.offset, - btrfs_block_group_used(&cache->item), - cache->bytes_super, &space_info); - if (ret) { - btrfs_remove_free_space_cache(cache); - spin_lock(&info->block_group_cache_lock); - rb_erase(&cache->cache_node, - &info->block_group_cache_tree); - RB_CLEAR_NODE(&cache->cache_node); - spin_unlock(&info->block_group_cache_lock); - btrfs_put_block_group(cache); - goto error; - } + update_space_info(info, cache->flags, found_key.offset, + btrfs_block_group_used(&cache->item), + cache->bytes_super, &space_info); cache->space_info = space_info; @@ -10249,18 +10237,8 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans, * the rbtree, update the space info's counters. */ trace_btrfs_add_block_group(fs_info, cache, 1); - ret = update_space_info(fs_info, cache->flags, size, bytes_used, + update_space_info(fs_info, cache->flags, size, bytes_used, cache->bytes_super, &cache->space_info); - if (ret) { - btrfs_remove_free_space_cache(cache); - spin_lock(&fs_info->block_group_cache_lock); - rb_erase(&cache->cache_node, - &fs_info->block_group_cache_tree); - RB_CLEAR_NODE(&cache->cache_node); - spin_unlock(&fs_info->block_group_cache_lock); - btrfs_put_block_group(cache); - return ret; - } update_global_block_rsv(fs_info); __link_block_group(cache->space_info, cache); -- cgit v1.2.3 From 0eee8a494e786672c57d86c445e24e678d562bf4 Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Wed, 14 Jun 2017 11:35:34 +0300 Subject: btrfs: Use btrfs_space_info_used instead of opencoding it Signed-off-by: Nikolay Borisov Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/extent-tree.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) (limited to 'fs/btrfs/extent-tree.c') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 4689b88ac408..65659bd55a4e 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -4949,9 +4949,8 @@ btrfs_calc_reclaim_metadata_size(struct btrfs_fs_info *fs_info, BTRFS_RESERVE_FLUSH_ALL, system_chunk)) return 0; - used = space_info->bytes_used + space_info->bytes_reserved + - space_info->bytes_pinned + space_info->bytes_readonly + - space_info->bytes_may_use; + used = btrfs_space_info_used(space_info, true); + if (can_overcommit(fs_info, space_info, SZ_1M, BTRFS_RESERVE_FLUSH_ALL, system_chunk)) expected = div_factor_fine(space_info->total_bytes, 95); @@ -5398,9 +5397,7 @@ static void space_info_add_old_bytes(struct btrfs_fs_info *fs_info, * overcommit, and if we can't then we just need to free up our space * and not satisfy any requests. */ - used = space_info->bytes_used + space_info->bytes_reserved + - space_info->bytes_pinned + space_info->bytes_readonly + - space_info->bytes_may_use; + used = btrfs_space_info_used(space_info, true); if (used - num_bytes >= space_info->total_bytes) check_overcommit = true; again: -- cgit v1.2.3 From 0d9f824df35d11215016785213f9c0fd06a72fc0 Mon Sep 17 00:00:00 2001 From: Omar Sandoval Date: Tue, 6 Jun 2017 16:45:26 -0700 Subject: Btrfs: make add_pinned_bytes() take an s64 num_bytes instead of u64 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit There are a few places where we pass in a negative num_bytes, so make it signed for clarity. Also move it up in the file since later patches will need it there. Signed-off-by: Omar Sandoval Tested-by: Holger Hoffstätte Reviewed-by: Liu Bo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/extent-tree.c | 41 ++++++++++++++++++++--------------------- 1 file changed, 20 insertions(+), 21 deletions(-) (limited to 'fs/btrfs/extent-tree.c') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 65659bd55a4e..d784ecef27c0 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -767,6 +767,26 @@ static struct btrfs_space_info *__find_space_info(struct btrfs_fs_info *info, return NULL; } +static void add_pinned_bytes(struct btrfs_fs_info *fs_info, s64 num_bytes, + u64 owner, u64 root_objectid) +{ + struct btrfs_space_info *space_info; + u64 flags; + + if (owner < BTRFS_FIRST_FREE_OBJECTID) { + if (root_objectid == BTRFS_CHUNK_TREE_OBJECTID) + flags = BTRFS_BLOCK_GROUP_SYSTEM; + else + flags = BTRFS_BLOCK_GROUP_METADATA; + } else { + flags = BTRFS_BLOCK_GROUP_DATA; + } + + space_info = __find_space_info(fs_info, flags); + BUG_ON(!space_info); /* Logic bug */ + percpu_counter_add(&space_info->total_bytes_pinned, num_bytes); +} + /* * after adding space to the filesystem, we need to clear the full flags * on all the space infos. @@ -6808,27 +6828,6 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans, return 0; } -static void add_pinned_bytes(struct btrfs_fs_info *fs_info, u64 num_bytes, - u64 owner, u64 root_objectid) -{ - struct btrfs_space_info *space_info; - u64 flags; - - if (owner < BTRFS_FIRST_FREE_OBJECTID) { - if (root_objectid == BTRFS_CHUNK_TREE_OBJECTID) - flags = BTRFS_BLOCK_GROUP_SYSTEM; - else - flags = BTRFS_BLOCK_GROUP_METADATA; - } else { - flags = BTRFS_BLOCK_GROUP_DATA; - } - - space_info = __find_space_info(fs_info, flags); - BUG_ON(!space_info); /* Logic bug */ - percpu_counter_add(&space_info->total_bytes_pinned, num_bytes); -} - - static int __btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_fs_info *info, struct btrfs_delayed_ref_node *node, u64 parent, -- cgit v1.2.3 From 55e8196a57cfe603ce3480a66c15dde3a13fe218 Mon Sep 17 00:00:00 2001 From: Omar Sandoval Date: Tue, 6 Jun 2017 16:45:27 -0700 Subject: Btrfs: make BUG_ON() in add_pinned_bytes() an ASSERT() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The value of flags is one of DATA/METADATA/SYSTEM, they must exist at when add_pinned_bytes is called. Signed-off-by: Omar Sandoval Tested-by: Holger Hoffstätte Reviewed-by: David Sterba [ added changelog ] Signed-off-by: David Sterba --- fs/btrfs/extent-tree.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs/btrfs/extent-tree.c') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index d784ecef27c0..b344966585c8 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -783,7 +783,7 @@ static void add_pinned_bytes(struct btrfs_fs_info *fs_info, s64 num_bytes, } space_info = __find_space_info(fs_info, flags); - BUG_ON(!space_info); /* Logic bug */ + ASSERT(space_info); percpu_counter_add(&space_info->total_bytes_pinned, num_bytes); } -- cgit v1.2.3 From 4da8b76d347bcae951f89522a040c36d9fc9f3b3 Mon Sep 17 00:00:00 2001 From: Omar Sandoval Date: Tue, 6 Jun 2017 16:45:28 -0700 Subject: Btrfs: update total_bytes_pinned when pinning down extents MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The extents marked in pin_down_extent() will be unpinned later in unpin_extent_range(), which decrements total_bytes_pinned. pin_down_extent() must increment the counter to avoid underflowing it. Also adjust btrfs_free_tree_block() to avoid accounting for the same extent twice. Signed-off-by: Omar Sandoval Tested-by: Holger Hoffstätte Reviewed-by: Liu Bo Signed-off-by: David Sterba --- fs/btrfs/extent-tree.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs/btrfs/extent-tree.c') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index b344966585c8..152e04773767 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -6358,6 +6358,7 @@ static int pin_down_extent(struct btrfs_fs_info *fs_info, trace_btrfs_space_reservation(fs_info, "pinned", cache->space_info->flags, num_bytes, 1); + percpu_counter_add(&cache->space_info->total_bytes_pinned, num_bytes); set_extent_dirty(fs_info->pinned_extents, bytenr, bytenr + num_bytes - 1, GFP_NOFS | __GFP_NOFAIL); return 0; @@ -7204,6 +7205,7 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans, goto out; } + pin = 0; cache = btrfs_lookup_block_group(fs_info, buf->start); if (btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) { @@ -7219,7 +7221,6 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans, btrfs_free_reserved_bytes(cache, buf->len, 0); btrfs_put_block_group(cache); trace_btrfs_reserved_extent_free(fs_info, buf->start, buf->len); - pin = 0; } out: if (pin) -- cgit v1.2.3 From 0a16c7d7aecfae8987197e50116ebfc338cbe0a2 Mon Sep 17 00:00:00 2001 From: Omar Sandoval Date: Tue, 6 Jun 2017 16:45:29 -0700 Subject: Btrfs: always account pinned bytes when dropping a tree block ref MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Currently, we only increment total_bytes_pinned in btrfs_free_tree_block() when dropping the last reference on the block. However, when the delayed ref is run later, we will decrement total_bytes_pinned regardless of whether it was the last reference or not. This causes the counter to underflow when the reference we dropped was not the last reference. Fix it by incrementing the counter unconditionally, which is what btrfs_free_extent() does. This makes total_bytes_pinned an overestimate when references to shared extents are dropped, but in the worst case this will just make us try to commit the transaction to try to free up space and find we didn't free enough. Signed-off-by: Omar Sandoval Tested-by: Holger Hoffstätte Reviewed-by: Liu Bo Signed-off-by: David Sterba --- fs/btrfs/extent-tree.c | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) (limited to 'fs/btrfs/extent-tree.c') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 152e04773767..40c80ed8d1f9 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -7193,10 +7193,7 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans, BUG_ON(ret); /* -ENOMEM */ } - if (!last_ref) - return; - - if (btrfs_header_generation(buf) == trans->transid) { + if (last_ref && btrfs_header_generation(buf) == trans->transid) { struct btrfs_block_group_cache *cache; if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) { @@ -7227,11 +7224,13 @@ out: add_pinned_bytes(fs_info, buf->len, btrfs_header_level(buf), root->root_key.objectid); - /* - * Deleting the buffer, clear the corrupt flag since it doesn't matter - * anymore. - */ - clear_bit(EXTENT_BUFFER_CORRUPT, &buf->bflags); + if (last_ref) { + /* + * Deleting the buffer, clear the corrupt flag since it doesn't + * matter anymore. + */ + clear_bit(EXTENT_BUFFER_CORRUPT, &buf->bflags); + } } /* Can return -ENOMEM */ -- cgit v1.2.3 From 7be07912b32d103d9789082f27dd54b47c89c744 Mon Sep 17 00:00:00 2001 From: Omar Sandoval Date: Tue, 6 Jun 2017 16:45:30 -0700 Subject: Btrfs: return old and new total ref mods when adding delayed refs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit We need this to decide when to account pinned bytes. Signed-off-by: Omar Sandoval Tested-by: Holger Hoffstätte Signed-off-by: David Sterba --- fs/btrfs/delayed-ref.c | 29 ++++++++++++++++++++-------- fs/btrfs/delayed-ref.h | 6 ++++-- fs/btrfs/extent-tree.c | 51 ++++++++++++++++++++++++++------------------------ 3 files changed, 52 insertions(+), 34 deletions(-) (limited to 'fs/btrfs/extent-tree.c') diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c index be70d90dfee5..93ffa898df6d 100644 --- a/fs/btrfs/delayed-ref.c +++ b/fs/btrfs/delayed-ref.c @@ -470,7 +470,8 @@ add_tail: static noinline void update_existing_head_ref(struct btrfs_delayed_ref_root *delayed_refs, struct btrfs_delayed_ref_node *existing, - struct btrfs_delayed_ref_node *update) + struct btrfs_delayed_ref_node *update, + int *old_ref_mod_ret) { struct btrfs_delayed_ref_head *existing_ref; struct btrfs_delayed_ref_head *ref; @@ -523,6 +524,8 @@ update_existing_head_ref(struct btrfs_delayed_ref_root *delayed_refs, * currently, for refs we just added we know we're a-ok. */ old_ref_mod = existing_ref->total_ref_mod; + if (old_ref_mod_ret) + *old_ref_mod_ret = old_ref_mod; existing->ref_mod += update->ref_mod; existing_ref->total_ref_mod += update->ref_mod; @@ -550,7 +553,8 @@ add_delayed_ref_head(struct btrfs_fs_info *fs_info, struct btrfs_delayed_ref_node *ref, struct btrfs_qgroup_extent_record *qrecord, u64 bytenr, u64 num_bytes, u64 ref_root, u64 reserved, - int action, int is_data, int *qrecord_inserted_ret) + int action, int is_data, int *qrecord_inserted_ret, + int *old_ref_mod, int *new_ref_mod) { struct btrfs_delayed_ref_head *existing; struct btrfs_delayed_ref_head *head_ref = NULL; @@ -638,7 +642,8 @@ add_delayed_ref_head(struct btrfs_fs_info *fs_info, if (existing) { WARN_ON(ref_root && reserved && existing->qgroup_ref_root && existing->qgroup_reserved); - update_existing_head_ref(delayed_refs, &existing->node, ref); + update_existing_head_ref(delayed_refs, &existing->node, ref, + old_ref_mod); /* * we've updated the existing ref, free the newly * allocated ref @@ -646,6 +651,8 @@ add_delayed_ref_head(struct btrfs_fs_info *fs_info, kmem_cache_free(btrfs_delayed_ref_head_cachep, head_ref); head_ref = existing; } else { + if (old_ref_mod) + *old_ref_mod = 0; if (is_data && count_mod < 0) delayed_refs->pending_csums += num_bytes; delayed_refs->num_heads++; @@ -655,6 +662,8 @@ add_delayed_ref_head(struct btrfs_fs_info *fs_info, } if (qrecord_inserted_ret) *qrecord_inserted_ret = qrecord_inserted; + if (new_ref_mod) + *new_ref_mod = head_ref->total_ref_mod; return head_ref; } @@ -778,7 +787,8 @@ int btrfs_add_delayed_tree_ref(struct btrfs_fs_info *fs_info, struct btrfs_trans_handle *trans, u64 bytenr, u64 num_bytes, u64 parent, u64 ref_root, int level, int action, - struct btrfs_delayed_extent_op *extent_op) + struct btrfs_delayed_extent_op *extent_op, + int *old_ref_mod, int *new_ref_mod) { struct btrfs_delayed_tree_ref *ref; struct btrfs_delayed_ref_head *head_ref; @@ -813,7 +823,8 @@ int btrfs_add_delayed_tree_ref(struct btrfs_fs_info *fs_info, */ head_ref = add_delayed_ref_head(fs_info, trans, &head_ref->node, record, bytenr, num_bytes, 0, 0, action, 0, - &qrecord_inserted); + &qrecord_inserted, old_ref_mod, + new_ref_mod); add_delayed_tree_ref(fs_info, trans, head_ref, &ref->node, bytenr, num_bytes, parent, ref_root, level, action); @@ -838,7 +849,8 @@ int btrfs_add_delayed_data_ref(struct btrfs_fs_info *fs_info, struct btrfs_trans_handle *trans, u64 bytenr, u64 num_bytes, u64 parent, u64 ref_root, - u64 owner, u64 offset, u64 reserved, int action) + u64 owner, u64 offset, u64 reserved, int action, + int *old_ref_mod, int *new_ref_mod) { struct btrfs_delayed_data_ref *ref; struct btrfs_delayed_ref_head *head_ref; @@ -878,7 +890,8 @@ int btrfs_add_delayed_data_ref(struct btrfs_fs_info *fs_info, */ head_ref = add_delayed_ref_head(fs_info, trans, &head_ref->node, record, bytenr, num_bytes, ref_root, reserved, - action, 1, &qrecord_inserted); + action, 1, &qrecord_inserted, + old_ref_mod, new_ref_mod); add_delayed_data_ref(fs_info, trans, head_ref, &ref->node, bytenr, num_bytes, parent, ref_root, owner, offset, @@ -909,7 +922,7 @@ int btrfs_add_delayed_extent_op(struct btrfs_fs_info *fs_info, add_delayed_ref_head(fs_info, trans, &head_ref->node, NULL, bytenr, num_bytes, 0, 0, BTRFS_UPDATE_DELAYED_HEAD, - extent_op->is_data, NULL); + extent_op->is_data, NULL, NULL, NULL); spin_unlock(&delayed_refs->lock); return 0; diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h index c0264ff01b53..ce88e4ac5276 100644 --- a/fs/btrfs/delayed-ref.h +++ b/fs/btrfs/delayed-ref.h @@ -247,12 +247,14 @@ int btrfs_add_delayed_tree_ref(struct btrfs_fs_info *fs_info, struct btrfs_trans_handle *trans, u64 bytenr, u64 num_bytes, u64 parent, u64 ref_root, int level, int action, - struct btrfs_delayed_extent_op *extent_op); + struct btrfs_delayed_extent_op *extent_op, + int *old_ref_mod, int *new_ref_mod); int btrfs_add_delayed_data_ref(struct btrfs_fs_info *fs_info, struct btrfs_trans_handle *trans, u64 bytenr, u64 num_bytes, u64 parent, u64 ref_root, - u64 owner, u64 offset, u64 reserved, int action); + u64 owner, u64 offset, u64 reserved, int action, + int *old_ref_mod, int *new_ref_mod); int btrfs_add_delayed_extent_op(struct btrfs_fs_info *fs_info, struct btrfs_trans_handle *trans, u64 bytenr, u64 num_bytes, diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 40c80ed8d1f9..8121a78f6cbd 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -2120,14 +2120,16 @@ int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, if (owner < BTRFS_FIRST_FREE_OBJECTID) { ret = btrfs_add_delayed_tree_ref(fs_info, trans, bytenr, - num_bytes, - parent, root_objectid, (int)owner, - BTRFS_ADD_DELAYED_REF, NULL); + num_bytes, parent, + root_objectid, (int)owner, + BTRFS_ADD_DELAYED_REF, NULL, + NULL, NULL); } else { ret = btrfs_add_delayed_data_ref(fs_info, trans, bytenr, - num_bytes, parent, root_objectid, - owner, offset, 0, - BTRFS_ADD_DELAYED_REF); + num_bytes, parent, + root_objectid, owner, offset, + 0, BTRFS_ADD_DELAYED_REF, NULL, + NULL); } return ret; } @@ -7184,12 +7186,12 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans, int ret; if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) { - ret = btrfs_add_delayed_tree_ref(fs_info, trans, - buf->start, buf->len, - parent, + ret = btrfs_add_delayed_tree_ref(fs_info, trans, buf->start, + buf->len, parent, root->root_key.objectid, btrfs_header_level(buf), - BTRFS_DROP_DELAYED_REF, NULL); + BTRFS_DROP_DELAYED_REF, NULL, + NULL, NULL); BUG_ON(ret); /* -ENOMEM */ } @@ -7257,15 +7259,16 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans, ret = 0; } else if (owner < BTRFS_FIRST_FREE_OBJECTID) { ret = btrfs_add_delayed_tree_ref(fs_info, trans, bytenr, - num_bytes, - parent, root_objectid, (int)owner, - BTRFS_DROP_DELAYED_REF, NULL); + num_bytes, parent, + root_objectid, (int)owner, + BTRFS_DROP_DELAYED_REF, NULL, + NULL, NULL); } else { ret = btrfs_add_delayed_data_ref(fs_info, trans, bytenr, - num_bytes, - parent, root_objectid, owner, - offset, 0, - BTRFS_DROP_DELAYED_REF); + num_bytes, parent, + root_objectid, owner, offset, + 0, BTRFS_DROP_DELAYED_REF, + NULL, NULL); } return ret; } @@ -8213,9 +8216,9 @@ int btrfs_alloc_reserved_file_extent(struct btrfs_trans_handle *trans, BUG_ON(root_objectid == BTRFS_TREE_LOG_OBJECTID); ret = btrfs_add_delayed_data_ref(fs_info, trans, ins->objectid, - ins->offset, 0, - root_objectid, owner, offset, - ram_bytes, BTRFS_ADD_DELAYED_EXTENT); + ins->offset, 0, root_objectid, owner, + offset, ram_bytes, + BTRFS_ADD_DELAYED_EXTENT, NULL, NULL); return ret; } @@ -8435,11 +8438,11 @@ struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans, extent_op->is_data = false; extent_op->level = level; - ret = btrfs_add_delayed_tree_ref(fs_info, trans, - ins.objectid, ins.offset, - parent, root_objectid, level, + ret = btrfs_add_delayed_tree_ref(fs_info, trans, ins.objectid, + ins.offset, parent, + root_objectid, level, BTRFS_ADD_DELAYED_EXTENT, - extent_op); + extent_op, NULL, NULL); if (ret) goto out_free_delayed; } -- cgit v1.2.3 From d7eae3403f46646889a9d172476e61a7aa822cc7 Mon Sep 17 00:00:00 2001 From: Omar Sandoval Date: Tue, 6 Jun 2017 16:45:31 -0700 Subject: Btrfs: rework delayed ref total_bytes_pinned accounting MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The total_bytes_pinned counter is completely broken when accounting delayed refs: - If two drops for the same extent are merged, we will decrement total_bytes_pinned twice but only increment it once. - If an add is merged into a drop or vice versa, we will decrement the total_bytes_pinned counter but never increment it. - If multiple references to an extent are dropped, we will account it multiple times, potentially vastly over-estimating the number of bytes that will be freed by a commit and doing unnecessary work when we're close to ENOSPC. The last issue is relatively minor, but the first two make the total_bytes_pinned counter leak or underflow very often. These accounting issues were introduced in b150a4f10d87 ("Btrfs: use a percpu to keep track of possibly pinned bytes"), but they were papered over by zeroing out the counter on every commit until d288db5dc011 ("Btrfs: fix race of using total_bytes_pinned"). We need to make sure that an extent is accounted as pinned exactly once if and only if we will drop references to it when when the transaction is committed. Ideally we would only add to total_bytes_pinned when the *last* reference is dropped, but this information isn't readily available for data extents. Again, this over-estimation can lead to extra commits when we're close to ENOSPC, but it's not as bad as before. The fix implemented here is to increment total_bytes_pinned when the total refmod count for an extent goes negative and decrement it if the refmod count goes back to non-negative or after we've run all of the delayed refs for that extent. Signed-off-by: Omar Sandoval Tested-by: Holger Hoffstätte Reviewed-by: Liu Bo Signed-off-by: David Sterba --- fs/btrfs/extent-tree.c | 41 ++++++++++++++++++++++++++++++++--------- 1 file changed, 32 insertions(+), 9 deletions(-) (limited to 'fs/btrfs/extent-tree.c') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 8121a78f6cbd..f2a6a59da20a 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -2113,6 +2113,7 @@ int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, u64 bytenr, u64 num_bytes, u64 parent, u64 root_objectid, u64 owner, u64 offset) { + int old_ref_mod, new_ref_mod; int ret; BUG_ON(owner < BTRFS_FIRST_FREE_OBJECTID && @@ -2123,14 +2124,18 @@ int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, num_bytes, parent, root_objectid, (int)owner, BTRFS_ADD_DELAYED_REF, NULL, - NULL, NULL); + &old_ref_mod, &new_ref_mod); } else { ret = btrfs_add_delayed_data_ref(fs_info, trans, bytenr, num_bytes, parent, root_objectid, owner, offset, - 0, BTRFS_ADD_DELAYED_REF, NULL, - NULL); + 0, BTRFS_ADD_DELAYED_REF, + &old_ref_mod, &new_ref_mod); } + + if (ret == 0 && old_ref_mod < 0 && new_ref_mod >= 0) + add_pinned_bytes(fs_info, -num_bytes, owner, root_objectid); + return ret; } @@ -2434,6 +2439,16 @@ static int run_one_delayed_ref(struct btrfs_trans_handle *trans, head = btrfs_delayed_node_to_head(node); trace_run_delayed_ref_head(fs_info, node, head, node->action); + if (head->total_ref_mod < 0) { + struct btrfs_block_group_cache *cache; + + cache = btrfs_lookup_block_group(fs_info, node->bytenr); + ASSERT(cache); + percpu_counter_add(&cache->space_info->total_bytes_pinned, + -node->num_bytes); + btrfs_put_block_group(cache); + } + if (insert_reserved) { btrfs_pin_extent(fs_info, node->bytenr, node->num_bytes, 1); @@ -6284,6 +6299,8 @@ static int update_block_group(struct btrfs_trans_handle *trans, trace_btrfs_space_reservation(info, "pinned", cache->space_info->flags, num_bytes, 1); + percpu_counter_add(&cache->space_info->total_bytes_pinned, + num_bytes); set_extent_dirty(info->pinned_extents, bytenr, bytenr + num_bytes - 1, GFP_NOFS | __GFP_NOFAIL); @@ -7053,8 +7070,6 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, goto out; } } - add_pinned_bytes(info, -num_bytes, owner_objectid, - root_objectid); } else { if (found_extent) { BUG_ON(is_data && refs_to_drop != @@ -7186,13 +7201,16 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans, int ret; if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) { + int old_ref_mod, new_ref_mod; + ret = btrfs_add_delayed_tree_ref(fs_info, trans, buf->start, buf->len, parent, root->root_key.objectid, btrfs_header_level(buf), BTRFS_DROP_DELAYED_REF, NULL, - NULL, NULL); + &old_ref_mod, &new_ref_mod); BUG_ON(ret); /* -ENOMEM */ + pin = old_ref_mod >= 0 && new_ref_mod < 0; } if (last_ref && btrfs_header_generation(buf) == trans->transid) { @@ -7241,12 +7259,12 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans, u64 bytenr, u64 num_bytes, u64 parent, u64 root_objectid, u64 owner, u64 offset) { + int old_ref_mod, new_ref_mod; int ret; if (btrfs_is_testing(fs_info)) return 0; - add_pinned_bytes(fs_info, num_bytes, owner, root_objectid); /* * tree log blocks never actually go into the extent allocation @@ -7256,20 +7274,25 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans, WARN_ON(owner >= BTRFS_FIRST_FREE_OBJECTID); /* unlocks the pinned mutex */ btrfs_pin_extent(fs_info, bytenr, num_bytes, 1); + old_ref_mod = new_ref_mod = 0; ret = 0; } else if (owner < BTRFS_FIRST_FREE_OBJECTID) { ret = btrfs_add_delayed_tree_ref(fs_info, trans, bytenr, num_bytes, parent, root_objectid, (int)owner, BTRFS_DROP_DELAYED_REF, NULL, - NULL, NULL); + &old_ref_mod, &new_ref_mod); } else { ret = btrfs_add_delayed_data_ref(fs_info, trans, bytenr, num_bytes, parent, root_objectid, owner, offset, 0, BTRFS_DROP_DELAYED_REF, - NULL, NULL); + &old_ref_mod, &new_ref_mod); } + + if (ret == 0 && old_ref_mod >= 0 && new_ref_mod < 0) + add_pinned_bytes(fs_info, num_bytes, owner, root_objectid); + return ret; } -- cgit v1.2.3 From 7bc329c1836866ffac8b2613f780a51b3ffe786d Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Mon, 27 Feb 2017 15:10:36 +0800 Subject: btrfs: qgroup: Return actually freed bytes for qgroup release or free data btrfs_qgroup_release/free_data() only returns 0 or a negative error number (ENOMEM is the only possible error). This is normally good enough, but sometimes we need the exact byte count it freed/released. Change it to return actually released/freed bytenr number instead of 0 for success. And slightly modify related extent_changeset structure, since in btrfs one no-hole data extent won't be larger than 128M, so "unsigned int" is large enough for the use case. Signed-off-by: Qu Wenruo Signed-off-by: David Sterba --- fs/btrfs/extent-tree.c | 2 +- fs/btrfs/extent_io.h | 2 +- fs/btrfs/qgroup.c | 1 + 3 files changed, 3 insertions(+), 2 deletions(-) (limited to 'fs/btrfs/extent-tree.c') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index f2a6a59da20a..b0b02c6c71aa 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -4347,7 +4347,7 @@ int btrfs_check_data_free_space(struct inode *inode, u64 start, u64 len) /* Use new btrfs_qgroup_reserve_data to reserve precious data space. */ ret = btrfs_qgroup_reserve_data(inode, start, len); - if (ret) + if (ret < 0) btrfs_free_reserved_data_space_noquota(inode, start, len); return ret; } diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index 1e508a8f876e..ce670d213913 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -209,7 +209,7 @@ struct extent_buffer { */ struct extent_changeset { /* How many bytes are set/cleared in this operation */ - u64 bytes_changed; + unsigned int bytes_changed; /* Changed ranges */ struct ulist range_changed; diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index fc2260359211..475d53c492c8 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -2897,6 +2897,7 @@ static int __btrfs_qgroup_release_data(struct inode *inode, u64 start, u64 len, btrfs_qgroup_free_refroot(BTRFS_I(inode)->root->fs_info, BTRFS_I(inode)->root->objectid, changeset.bytes_changed); + ret = changeset.bytes_changed; out: ulist_release(&changeset.range_changed); return ret; -- cgit v1.2.3 From 364ecf3651e0862152c8b340d7cb3021dc0122c7 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Mon, 27 Feb 2017 15:10:38 +0800 Subject: btrfs: qgroup: Introduce extent changeset for qgroup reserve functions Introduce a new parameter, struct extent_changeset for btrfs_qgroup_reserved_data() and its callers. Such extent_changeset was used in btrfs_qgroup_reserve_data() to record which range it reserved in current reserve, so it can free it in error paths. The reason we need to export it to callers is, at buffered write error path, without knowing what exactly which range we reserved in current allocation, we can free space which is not reserved by us. This will lead to qgroup reserved space underflow. Reviewed-by: Chandan Rajendra Signed-off-by: Qu Wenruo Signed-off-by: David Sterba --- fs/btrfs/ctree.h | 6 ++++-- fs/btrfs/extent-tree.c | 23 +++++++++++++---------- fs/btrfs/extent_io.h | 34 +++++++++++++++++++++++++++++++++ fs/btrfs/file.c | 12 +++++++++--- fs/btrfs/inode-map.c | 4 +++- fs/btrfs/inode.c | 18 ++++++++++++++---- fs/btrfs/ioctl.c | 5 ++++- fs/btrfs/qgroup.c | 51 ++++++++++++++++++++++++++++++++------------------ fs/btrfs/qgroup.h | 3 ++- fs/btrfs/relocation.c | 4 +++- 10 files changed, 119 insertions(+), 41 deletions(-) (limited to 'fs/btrfs/extent-tree.c') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 5e33e1d6d5c9..1ee4489dc398 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -2708,8 +2708,9 @@ enum btrfs_flush_state { COMMIT_TRANS = 6, }; -int btrfs_check_data_free_space(struct inode *inode, u64 start, u64 len); int btrfs_alloc_data_chunk_ondemand(struct btrfs_inode *inode, u64 bytes); +int btrfs_check_data_free_space(struct inode *inode, + struct extent_changeset **reserved, u64 start, u64 len); void btrfs_free_reserved_data_space(struct inode *inode, u64 start, u64 len); void btrfs_free_reserved_data_space_noquota(struct inode *inode, u64 start, u64 len); @@ -2727,7 +2728,8 @@ void btrfs_subvolume_release_metadata(struct btrfs_fs_info *fs_info, struct btrfs_block_rsv *rsv); int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes); void btrfs_delalloc_release_metadata(struct btrfs_inode *inode, u64 num_bytes); -int btrfs_delalloc_reserve_space(struct inode *inode, u64 start, u64 len); +int btrfs_delalloc_reserve_space(struct inode *inode, + struct extent_changeset **reserved, u64 start, u64 len); void btrfs_delalloc_release_space(struct inode *inode, u64 start, u64 len); void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv, unsigned short type); struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_fs_info *fs_info, diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index b0b02c6c71aa..70f85cfdbd46 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -3402,6 +3402,7 @@ static int cache_save_setup(struct btrfs_block_group_cache *block_group, struct btrfs_fs_info *fs_info = block_group->fs_info; struct btrfs_root *root = fs_info->tree_root; struct inode *inode = NULL; + struct extent_changeset *data_reserved = NULL; u64 alloc_hint = 0; int dcs = BTRFS_DC_ERROR; u64 num_pages = 0; @@ -3521,7 +3522,7 @@ again: num_pages *= 16; num_pages *= PAGE_SIZE; - ret = btrfs_check_data_free_space(inode, 0, num_pages); + ret = btrfs_check_data_free_space(inode, &data_reserved, 0, num_pages); if (ret) goto out_put; @@ -3552,6 +3553,7 @@ out: block_group->disk_cache_state = dcs; spin_unlock(&block_group->lock); + extent_changeset_free(data_reserved); return ret; } @@ -4326,12 +4328,8 @@ commit_trans: return ret; } -/* - * New check_data_free_space() with ability for precious data reservation - * Will replace old btrfs_check_data_free_space(), but for patch split, - * add a new function first and then replace it. - */ -int btrfs_check_data_free_space(struct inode *inode, u64 start, u64 len) +int btrfs_check_data_free_space(struct inode *inode, + struct extent_changeset **reserved, u64 start, u64 len) { struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); int ret; @@ -4346,9 +4344,11 @@ int btrfs_check_data_free_space(struct inode *inode, u64 start, u64 len) return ret; /* Use new btrfs_qgroup_reserve_data to reserve precious data space. */ - ret = btrfs_qgroup_reserve_data(inode, start, len); + ret = btrfs_qgroup_reserve_data(inode, reserved, start, len); if (ret < 0) btrfs_free_reserved_data_space_noquota(inode, start, len); + else + ret = 0; return ret; } @@ -6175,6 +6175,8 @@ void btrfs_delalloc_release_metadata(struct btrfs_inode *inode, u64 num_bytes) * @inode: inode we're writing to * @start: start range we are writing to * @len: how long the range we are writing to + * @reserved: mandatory parameter, record actually reserved qgroup ranges of + * current reservation. * * This will do the following things * @@ -6192,11 +6194,12 @@ void btrfs_delalloc_release_metadata(struct btrfs_inode *inode, u64 num_bytes) * Return 0 for success * Return <0 for error(-ENOSPC or -EQUOT) */ -int btrfs_delalloc_reserve_space(struct inode *inode, u64 start, u64 len) +int btrfs_delalloc_reserve_space(struct inode *inode, + struct extent_changeset **reserved, u64 start, u64 len) { int ret; - ret = btrfs_check_data_free_space(inode, start, len); + ret = btrfs_check_data_free_space(inode, reserved, start, len); if (ret < 0) return ret; ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode), len); diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index ce670d213913..aeafdb35d90b 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -215,6 +215,40 @@ struct extent_changeset { struct ulist range_changed; }; +static inline void extent_changeset_init(struct extent_changeset *changeset) +{ + changeset->bytes_changed = 0; + ulist_init(&changeset->range_changed); +} + +static inline struct extent_changeset *extent_changeset_alloc(void) +{ + struct extent_changeset *ret; + + ret = kmalloc(sizeof(*ret), GFP_KERNEL); + if (!ret) + return NULL; + + extent_changeset_init(ret); + return ret; +} + +static inline void extent_changeset_release(struct extent_changeset *changeset) +{ + if (!changeset) + return; + changeset->bytes_changed = 0; + ulist_release(&changeset->range_changed); +} + +static inline void extent_changeset_free(struct extent_changeset *changeset) +{ + if (!changeset) + return; + extent_changeset_release(changeset); + kfree(changeset); +} + static inline void extent_set_compress_type(unsigned long *bio_flags, int compress_type) { diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 5da85b080368..1b5cce51728b 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -1581,6 +1581,7 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file, struct btrfs_root *root = BTRFS_I(inode)->root; struct page **pages = NULL; struct extent_state *cached_state = NULL; + struct extent_changeset *data_reserved = NULL; u64 release_bytes = 0; u64 lockstart; u64 lockend; @@ -1628,7 +1629,9 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file, reserve_bytes = round_up(write_bytes + sector_offset, fs_info->sectorsize); - ret = btrfs_check_data_free_space(inode, pos, write_bytes); + extent_changeset_release(data_reserved); + ret = btrfs_check_data_free_space(inode, &data_reserved, pos, + write_bytes); if (ret < 0) { if ((BTRFS_I(inode)->flags & (BTRFS_INODE_NODATACOW | BTRFS_INODE_PREALLOC)) && @@ -1802,6 +1805,7 @@ again: } } + extent_changeset_free(data_reserved); return num_written ? num_written : ret; } @@ -2772,6 +2776,7 @@ static long btrfs_fallocate(struct file *file, int mode, { struct inode *inode = file_inode(file); struct extent_state *cached_state = NULL; + struct extent_changeset *data_reserved = NULL; struct falloc_range *range; struct falloc_range *tmp; struct list_head reserve_list; @@ -2901,8 +2906,8 @@ static long btrfs_fallocate(struct file *file, int mode, free_extent_map(em); break; } - ret = btrfs_qgroup_reserve_data(inode, cur_offset, - last_byte - cur_offset); + ret = btrfs_qgroup_reserve_data(inode, &data_reserved, + cur_offset, last_byte - cur_offset); if (ret < 0) { free_extent_map(em); break; @@ -2974,6 +2979,7 @@ out: if (ret != 0) btrfs_free_reserved_data_space(inode, alloc_start, alloc_end - cur_offset); + extent_changeset_free(data_reserved); return ret; } diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c index 5c6c20ec64d8..d02019747d00 100644 --- a/fs/btrfs/inode-map.c +++ b/fs/btrfs/inode-map.c @@ -400,6 +400,7 @@ int btrfs_save_ino_cache(struct btrfs_root *root, struct btrfs_path *path; struct inode *inode; struct btrfs_block_rsv *rsv; + struct extent_changeset *data_reserved = NULL; u64 num_bytes; u64 alloc_hint = 0; int ret; @@ -492,7 +493,7 @@ again: /* Just to make sure we have enough space */ prealloc += 8 * PAGE_SIZE; - ret = btrfs_delalloc_reserve_space(inode, 0, prealloc); + ret = btrfs_delalloc_reserve_space(inode, &data_reserved, 0, prealloc); if (ret) goto out_put; @@ -516,6 +517,7 @@ out: trans->bytes_reserved = num_bytes; btrfs_free_path(path); + extent_changeset_free(data_reserved); return ret; } diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index b66ea03a3a1c..d9cf6df40d5e 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -2037,6 +2037,7 @@ static void btrfs_writepage_fixup_worker(struct btrfs_work *work) struct btrfs_writepage_fixup *fixup; struct btrfs_ordered_extent *ordered; struct extent_state *cached_state = NULL; + struct extent_changeset *data_reserved = NULL; struct page *page; struct inode *inode; u64 page_start; @@ -2074,7 +2075,7 @@ again: goto again; } - ret = btrfs_delalloc_reserve_space(inode, page_start, + ret = btrfs_delalloc_reserve_space(inode, &data_reserved, page_start, PAGE_SIZE); if (ret) { mapping_set_error(page->mapping, ret); @@ -2094,6 +2095,7 @@ out_page: unlock_page(page); put_page(page); kfree(fixup); + extent_changeset_free(data_reserved); } /* @@ -4769,6 +4771,7 @@ int btrfs_truncate_block(struct inode *inode, loff_t from, loff_t len, struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; struct btrfs_ordered_extent *ordered; struct extent_state *cached_state = NULL; + struct extent_changeset *data_reserved = NULL; char *kaddr; u32 blocksize = fs_info->sectorsize; pgoff_t index = from >> PAGE_SHIFT; @@ -4783,7 +4786,7 @@ int btrfs_truncate_block(struct inode *inode, loff_t from, loff_t len, (!len || ((len & (blocksize - 1)) == 0))) goto out; - ret = btrfs_delalloc_reserve_space(inode, + ret = btrfs_delalloc_reserve_space(inode, &data_reserved, round_down(from, blocksize), blocksize); if (ret) goto out; @@ -4868,6 +4871,7 @@ out_unlock: unlock_page(page); put_page(page); out: + extent_changeset_free(data_reserved); return ret; } @@ -8718,6 +8722,7 @@ static ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter) struct inode *inode = file->f_mapping->host; struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct btrfs_dio_data dio_data = { 0 }; + struct extent_changeset *data_reserved = NULL; loff_t offset = iocb->ki_pos; size_t count = 0; int flags = 0; @@ -8754,7 +8759,8 @@ static ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter) inode_unlock(inode); relock = true; } - ret = btrfs_delalloc_reserve_space(inode, offset, count); + ret = btrfs_delalloc_reserve_space(inode, &data_reserved, + offset, count); if (ret) goto out; dio_data.outstanding_extents = count_max_extents(count); @@ -8811,6 +8817,7 @@ out: if (relock) inode_lock(inode); + extent_changeset_free(data_reserved); return ret; } @@ -9043,6 +9050,7 @@ int btrfs_page_mkwrite(struct vm_fault *vmf) struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; struct btrfs_ordered_extent *ordered; struct extent_state *cached_state = NULL; + struct extent_changeset *data_reserved = NULL; char *kaddr; unsigned long zero_start; loff_t size; @@ -9068,7 +9076,7 @@ int btrfs_page_mkwrite(struct vm_fault *vmf) * end up waiting indefinitely to get a lock on the page currently * being processed by btrfs_page_mkwrite() function. */ - ret = btrfs_delalloc_reserve_space(inode, page_start, + ret = btrfs_delalloc_reserve_space(inode, &data_reserved, page_start, reserved_space); if (!ret) { ret = file_update_time(vmf->vma->vm_file); @@ -9174,6 +9182,7 @@ again: out_unlock: if (!ret) { sb_end_pagefault(inode->i_sb); + extent_changeset_free(data_reserved); return VM_FAULT_LOCKED; } unlock_page(page); @@ -9181,6 +9190,7 @@ out: btrfs_delalloc_release_space(inode, page_start, reserved_space); out_noreserve: sb_end_pagefault(inode->i_sb); + extent_changeset_free(data_reserved); return ret; } diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index e4116f9248c2..ccee5417d3f6 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -1127,6 +1127,7 @@ static int cluster_pages_for_defrag(struct inode *inode, struct btrfs_ordered_extent *ordered; struct extent_state *cached_state = NULL; struct extent_io_tree *tree; + struct extent_changeset *data_reserved = NULL; gfp_t mask = btrfs_alloc_write_mask(inode->i_mapping); file_end = (isize - 1) >> PAGE_SHIFT; @@ -1135,7 +1136,7 @@ static int cluster_pages_for_defrag(struct inode *inode, page_cnt = min_t(u64, (u64)num_pages, (u64)file_end - start_index + 1); - ret = btrfs_delalloc_reserve_space(inode, + ret = btrfs_delalloc_reserve_space(inode, &data_reserved, start_index << PAGE_SHIFT, page_cnt << PAGE_SHIFT); if (ret) @@ -1247,6 +1248,7 @@ again: unlock_page(pages[i]); put_page(pages[i]); } + extent_changeset_free(data_reserved); return i_done; out: for (i = 0; i < i_done; i++) { @@ -1256,6 +1258,7 @@ out: btrfs_delalloc_release_space(inode, start_index << PAGE_SHIFT, page_cnt << PAGE_SHIFT); + extent_changeset_free(data_reserved); return ret; } diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index 475d53c492c8..002088937021 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -2835,43 +2835,60 @@ btrfs_qgroup_rescan_resume(struct btrfs_fs_info *fs_info) * Return <0 for error (including -EQUOT) * * NOTE: this function may sleep for memory allocation. + * if btrfs_qgroup_reserve_data() is called multiple times with + * same @reserved, caller must ensure when error happens it's OK + * to free *ALL* reserved space. */ -int btrfs_qgroup_reserve_data(struct inode *inode, u64 start, u64 len) +int btrfs_qgroup_reserve_data(struct inode *inode, + struct extent_changeset **reserved_ret, u64 start, + u64 len) { struct btrfs_root *root = BTRFS_I(inode)->root; - struct extent_changeset changeset; struct ulist_node *unode; struct ulist_iterator uiter; + struct extent_changeset *reserved; + u64 orig_reserved; + u64 to_reserve; int ret; if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &root->fs_info->flags) || !is_fstree(root->objectid) || len == 0) return 0; - changeset.bytes_changed = 0; - ulist_init(&changeset.range_changed); + /* @reserved parameter is mandatory for qgroup */ + if (WARN_ON(!reserved_ret)) + return -EINVAL; + if (!*reserved_ret) { + *reserved_ret = extent_changeset_alloc(); + if (!*reserved_ret) + return -ENOMEM; + } + reserved = *reserved_ret; + /* Record already reserved space */ + orig_reserved = reserved->bytes_changed; ret = set_record_extent_bits(&BTRFS_I(inode)->io_tree, start, - start + len -1, EXTENT_QGROUP_RESERVED, &changeset); + start + len -1, EXTENT_QGROUP_RESERVED, reserved); + + /* Newly reserved space */ + to_reserve = reserved->bytes_changed - orig_reserved; trace_btrfs_qgroup_reserve_data(inode, start, len, - changeset.bytes_changed, - QGROUP_RESERVE); + to_reserve, QGROUP_RESERVE); if (ret < 0) goto cleanup; - ret = qgroup_reserve(root, changeset.bytes_changed, true); + ret = qgroup_reserve(root, to_reserve, true); if (ret < 0) goto cleanup; - ulist_release(&changeset.range_changed); return ret; cleanup: - /* cleanup already reserved ranges */ + /* cleanup *ALL* already reserved ranges */ ULIST_ITER_INIT(&uiter); - while ((unode = ulist_next(&changeset.range_changed, &uiter))) + while ((unode = ulist_next(&reserved->range_changed, &uiter))) clear_extent_bit(&BTRFS_I(inode)->io_tree, unode->val, unode->aux, EXTENT_QGROUP_RESERVED, 0, 0, NULL, GFP_NOFS); - ulist_release(&changeset.range_changed); + extent_changeset_release(reserved); return ret; } @@ -2882,8 +2899,7 @@ static int __btrfs_qgroup_release_data(struct inode *inode, u64 start, u64 len, int trace_op = QGROUP_RELEASE; int ret; - changeset.bytes_changed = 0; - ulist_init(&changeset.range_changed); + extent_changeset_init(&changeset); ret = clear_record_extent_bits(&BTRFS_I(inode)->io_tree, start, start + len -1, EXTENT_QGROUP_RESERVED, &changeset); if (ret < 0) @@ -2899,7 +2915,7 @@ static int __btrfs_qgroup_release_data(struct inode *inode, u64 start, u64 len, changeset.bytes_changed); ret = changeset.bytes_changed; out: - ulist_release(&changeset.range_changed); + extent_changeset_release(&changeset); return ret; } @@ -2999,8 +3015,7 @@ void btrfs_qgroup_check_reserved_leak(struct inode *inode) struct ulist_iterator iter; int ret; - changeset.bytes_changed = 0; - ulist_init(&changeset.range_changed); + extent_changeset_init(&changeset); ret = clear_record_extent_bits(&BTRFS_I(inode)->io_tree, 0, (u64)-1, EXTENT_QGROUP_RESERVED, &changeset); @@ -3017,5 +3032,5 @@ void btrfs_qgroup_check_reserved_leak(struct inode *inode) changeset.bytes_changed); } - ulist_release(&changeset.range_changed); + extent_changeset_release(&changeset); } diff --git a/fs/btrfs/qgroup.h b/fs/btrfs/qgroup.h index d11125d6afb9..99408e93eb0d 100644 --- a/fs/btrfs/qgroup.h +++ b/fs/btrfs/qgroup.h @@ -242,7 +242,8 @@ int btrfs_verify_qgroup_counts(struct btrfs_fs_info *fs_info, u64 qgroupid, #endif /* New io_tree based accurate qgroup reserve API */ -int btrfs_qgroup_reserve_data(struct inode *inode, u64 start, u64 len); +int btrfs_qgroup_reserve_data(struct inode *inode, + struct extent_changeset **reserved, u64 start, u64 len); int btrfs_qgroup_release_data(struct inode *inode, u64 start, u64 len); int btrfs_qgroup_free_data(struct inode *inode, u64 start, u64 len); diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index b291d1bebb4c..6407423151ab 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -3093,11 +3093,12 @@ int prealloc_file_extent_cluster(struct inode *inode, u64 prealloc_start = cluster->start - offset; u64 prealloc_end = cluster->end - offset; u64 cur_offset; + struct extent_changeset *data_reserved = NULL; BUG_ON(cluster->start != cluster->boundary[0]); inode_lock(inode); - ret = btrfs_check_data_free_space(inode, prealloc_start, + ret = btrfs_check_data_free_space(inode, &data_reserved, prealloc_start, prealloc_end + 1 - prealloc_start); if (ret) goto out; @@ -3129,6 +3130,7 @@ int prealloc_file_extent_cluster(struct inode *inode, prealloc_end + 1 - cur_offset); out: inode_unlock(inode); + extent_changeset_free(data_reserved); return ret; } -- cgit v1.2.3 From bc42bda22345efdb5d8b578d1b4df2c6eaa85c58 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Mon, 27 Feb 2017 15:10:39 +0800 Subject: btrfs: qgroup: Fix qgroup reserved space underflow by only freeing reserved ranges [BUG] For the following case, btrfs can underflow qgroup reserved space at an error path: (Page size 4K, function name without "btrfs_" prefix) Task A | Task B ---------------------------------------------------------------------- Buffered_write [0, 2K) | |- check_data_free_space() | | |- qgroup_reserve_data() | | Range aligned to page | | range [0, 4K) <<< | | 4K bytes reserved <<< | |- copy pages to page cache | | Buffered_write [2K, 4K) | |- check_data_free_space() | | |- qgroup_reserved_data() | | Range alinged to page | | range [0, 4K) | | Already reserved by A <<< | | 0 bytes reserved <<< | |- delalloc_reserve_metadata() | | And it *FAILED* (Maybe EQUOTA) | |- free_reserved_data_space() |- qgroup_free_data() Range aligned to page range [0, 4K) Freeing 4K (Special thanks to Chandan for the detailed report and analyse) [CAUSE] Above Task B is freeing reserved data range [0, 4K) which is actually reserved by Task A. And at writeback time, page dirty by Task A will go through writeback routine, which will free 4K reserved data space at file extent insert time, causing the qgroup underflow. [FIX] For btrfs_qgroup_free_data(), add @reserved parameter to only free data ranges reserved by previous btrfs_qgroup_reserve_data(). So in above case, Task B will try to free 0 byte, so no underflow. Reported-by: Chandan Rajendra Signed-off-by: Qu Wenruo Reviewed-by: Chandan Rajendra Tested-by: Chandan Rajendra Signed-off-by: David Sterba --- fs/btrfs/ctree.h | 6 +++-- fs/btrfs/extent-tree.c | 12 +++++---- fs/btrfs/file.c | 29 +++++++++++--------- fs/btrfs/inode.c | 29 ++++++++++---------- fs/btrfs/ioctl.c | 4 +-- fs/btrfs/qgroup.c | 72 ++++++++++++++++++++++++++++++++++++++++++++++---- fs/btrfs/qgroup.h | 3 ++- fs/btrfs/relocation.c | 8 +++--- 8 files changed, 117 insertions(+), 46 deletions(-) (limited to 'fs/btrfs/extent-tree.c') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 1ee4489dc398..5bdd36664421 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -2711,7 +2711,10 @@ enum btrfs_flush_state { int btrfs_alloc_data_chunk_ondemand(struct btrfs_inode *inode, u64 bytes); int btrfs_check_data_free_space(struct inode *inode, struct extent_changeset **reserved, u64 start, u64 len); -void btrfs_free_reserved_data_space(struct inode *inode, u64 start, u64 len); +void btrfs_free_reserved_data_space(struct inode *inode, + struct extent_changeset *reserved, u64 start, u64 len); +void btrfs_delalloc_release_space(struct inode *inode, + struct extent_changeset *reserved, u64 start, u64 len); void btrfs_free_reserved_data_space_noquota(struct inode *inode, u64 start, u64 len); void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans, @@ -2730,7 +2733,6 @@ int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes); void btrfs_delalloc_release_metadata(struct btrfs_inode *inode, u64 num_bytes); int btrfs_delalloc_reserve_space(struct inode *inode, struct extent_changeset **reserved, u64 start, u64 len); -void btrfs_delalloc_release_space(struct inode *inode, u64 start, u64 len); void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv, unsigned short type); struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_fs_info *fs_info, unsigned short type); diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 70f85cfdbd46..a3339acec28c 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -4389,7 +4389,8 @@ void btrfs_free_reserved_data_space_noquota(struct inode *inode, u64 start, * This one will handle the per-inode data rsv map for accurate reserved * space framework. */ -void btrfs_free_reserved_data_space(struct inode *inode, u64 start, u64 len) +void btrfs_free_reserved_data_space(struct inode *inode, + struct extent_changeset *reserved, u64 start, u64 len) { struct btrfs_root *root = BTRFS_I(inode)->root; @@ -4399,7 +4400,7 @@ void btrfs_free_reserved_data_space(struct inode *inode, u64 start, u64 len) start = round_down(start, root->fs_info->sectorsize); btrfs_free_reserved_data_space_noquota(inode, start, len); - btrfs_qgroup_free_data(inode, start, len); + btrfs_qgroup_free_data(inode, reserved, start, len); } static void force_metadata_allocation(struct btrfs_fs_info *info) @@ -6204,7 +6205,7 @@ int btrfs_delalloc_reserve_space(struct inode *inode, return ret; ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode), len); if (ret < 0) - btrfs_free_reserved_data_space(inode, start, len); + btrfs_free_reserved_data_space(inode, *reserved, start, len); return ret; } @@ -6223,10 +6224,11 @@ int btrfs_delalloc_reserve_space(struct inode *inode, * list if there are no delalloc bytes left. * Also it will handle the qgroup reserved space. */ -void btrfs_delalloc_release_space(struct inode *inode, u64 start, u64 len) +void btrfs_delalloc_release_space(struct inode *inode, + struct extent_changeset *reserved, u64 start, u64 len) { btrfs_delalloc_release_metadata(BTRFS_I(inode), len); - btrfs_free_reserved_data_space(inode, start, len); + btrfs_free_reserved_data_space(inode, reserved, start, len); } static int update_block_group(struct btrfs_trans_handle *trans, diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 1b5cce51728b..0f102a1b851f 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -1660,8 +1660,9 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file, reserve_bytes); if (ret) { if (!only_release_metadata) - btrfs_free_reserved_data_space(inode, pos, - write_bytes); + btrfs_free_reserved_data_space(inode, + data_reserved, pos, + write_bytes); else btrfs_end_write_no_snapshoting(root); break; @@ -1743,8 +1744,9 @@ again: __pos = round_down(pos, fs_info->sectorsize) + (dirty_pages << PAGE_SHIFT); - btrfs_delalloc_release_space(inode, __pos, - release_bytes); + btrfs_delalloc_release_space(inode, + data_reserved, __pos, + release_bytes); } } @@ -1799,9 +1801,9 @@ again: btrfs_delalloc_release_metadata(BTRFS_I(inode), release_bytes); } else { - btrfs_delalloc_release_space(inode, - round_down(pos, fs_info->sectorsize), - release_bytes); + btrfs_delalloc_release_space(inode, data_reserved, + round_down(pos, fs_info->sectorsize), + release_bytes); } } @@ -2918,8 +2920,8 @@ static long btrfs_fallocate(struct file *file, int mode, * range, free reserved data space first, otherwise * it'll result in false ENOSPC error. */ - btrfs_free_reserved_data_space(inode, cur_offset, - last_byte - cur_offset); + btrfs_free_reserved_data_space(inode, data_reserved, + cur_offset, last_byte - cur_offset); } free_extent_map(em); cur_offset = last_byte; @@ -2938,8 +2940,9 @@ static long btrfs_fallocate(struct file *file, int mode, range->len, i_blocksize(inode), offset + len, &alloc_hint); else - btrfs_free_reserved_data_space(inode, range->start, - range->len); + btrfs_free_reserved_data_space(inode, + data_reserved, range->start, + range->len); list_del(&range->list); kfree(range); } @@ -2977,8 +2980,8 @@ out: inode_unlock(inode); /* Let go of our reservation. */ if (ret != 0) - btrfs_free_reserved_data_space(inode, alloc_start, - alloc_end - cur_offset); + btrfs_free_reserved_data_space(inode, data_reserved, + alloc_start, alloc_end - cur_offset); extent_changeset_free(data_reserved); return ret; } diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index d9cf6df40d5e..5d3c6ac960fd 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -345,7 +345,7 @@ out: * And at reserve time, it's always aligned to page size, so * just free one page here. */ - btrfs_qgroup_free_data(inode, 0, PAGE_SIZE); + btrfs_qgroup_free_data(inode, NULL, 0, PAGE_SIZE); btrfs_free_path(path); btrfs_end_transaction(trans); return ret; @@ -2935,7 +2935,7 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent) * space for NOCOW range. * As NOCOW won't cause a new delayed ref, just free the space */ - btrfs_qgroup_free_data(inode, ordered_extent->file_offset, + btrfs_qgroup_free_data(inode, NULL, ordered_extent->file_offset, ordered_extent->len); btrfs_ordered_update_i_size(inode, 0, ordered_extent); if (nolock) @@ -4794,7 +4794,7 @@ int btrfs_truncate_block(struct inode *inode, loff_t from, loff_t len, again: page = find_or_create_page(mapping, index, mask); if (!page) { - btrfs_delalloc_release_space(inode, + btrfs_delalloc_release_space(inode, data_reserved, round_down(from, blocksize), blocksize); ret = -ENOMEM; @@ -4866,7 +4866,7 @@ again: out_unlock: if (ret) - btrfs_delalloc_release_space(inode, block_start, + btrfs_delalloc_release_space(inode, data_reserved, block_start, blocksize); unlock_page(page); put_page(page); @@ -5266,7 +5266,7 @@ static void evict_inode_truncate_pages(struct inode *inode) * Note, end is the bytenr of last byte, so we need + 1 here. */ if (state->state & EXTENT_DELALLOC) - btrfs_qgroup_free_data(inode, start, end - start + 1); + btrfs_qgroup_free_data(inode, NULL, start, end - start + 1); clear_extent_bit(io_tree, start, end, EXTENT_LOCKED | EXTENT_DIRTY | @@ -8792,8 +8792,8 @@ static ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter) current->journal_info = NULL; if (ret < 0 && ret != -EIOCBQUEUED) { if (dio_data.reserve) - btrfs_delalloc_release_space(inode, offset, - dio_data.reserve); + btrfs_delalloc_release_space(inode, data_reserved, + offset, dio_data.reserve); /* * On error we might have left some ordered extents * without submitting corresponding bios for them, so @@ -8808,8 +8808,8 @@ static ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter) dio_data.unsubmitted_oe_range_start, false); } else if (ret >= 0 && (size_t)ret < count) - btrfs_delalloc_release_space(inode, offset, - count - (size_t)ret); + btrfs_delalloc_release_space(inode, data_reserved, + offset, count - (size_t)ret); } out: if (wakeup) @@ -9008,7 +9008,7 @@ again: * free the entire extent. */ if (PageDirty(page)) - btrfs_qgroup_free_data(inode, page_start, PAGE_SIZE); + btrfs_qgroup_free_data(inode, NULL, page_start, PAGE_SIZE); if (!inode_evicting) { clear_extent_bit(tree, page_start, page_end, EXTENT_LOCKED | EXTENT_DIRTY | @@ -9130,8 +9130,8 @@ again: spin_lock(&BTRFS_I(inode)->lock); BTRFS_I(inode)->outstanding_extents++; spin_unlock(&BTRFS_I(inode)->lock); - btrfs_delalloc_release_space(inode, page_start, - PAGE_SIZE - reserved_space); + btrfs_delalloc_release_space(inode, data_reserved, + page_start, PAGE_SIZE - reserved_space); } } @@ -9187,7 +9187,8 @@ out_unlock: } unlock_page(page); out: - btrfs_delalloc_release_space(inode, page_start, reserved_space); + btrfs_delalloc_release_space(inode, data_reserved, page_start, + reserved_space); out_noreserve: sb_end_pagefault(inode->i_sb); extent_changeset_free(data_reserved); @@ -10557,7 +10558,7 @@ next: btrfs_end_transaction(trans); } if (cur_offset < end) - btrfs_free_reserved_data_space(inode, cur_offset, + btrfs_free_reserved_data_space(inode, NULL, cur_offset, end - cur_offset + 1); return ret; } diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index ccee5417d3f6..b4e9941efb60 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -1227,7 +1227,7 @@ again: spin_lock(&BTRFS_I(inode)->lock); BTRFS_I(inode)->outstanding_extents++; spin_unlock(&BTRFS_I(inode)->lock); - btrfs_delalloc_release_space(inode, + btrfs_delalloc_release_space(inode, data_reserved, start_index << PAGE_SHIFT, (page_cnt - i_done) << PAGE_SHIFT); } @@ -1255,7 +1255,7 @@ out: unlock_page(pages[i]); put_page(pages[i]); } - btrfs_delalloc_release_space(inode, + btrfs_delalloc_release_space(inode, data_reserved, start_index << PAGE_SHIFT, page_cnt << PAGE_SHIFT); extent_changeset_free(data_reserved); diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index 002088937021..fc9dffaa9524 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -2892,13 +2892,72 @@ cleanup: return ret; } -static int __btrfs_qgroup_release_data(struct inode *inode, u64 start, u64 len, - int free) +/* Free ranges specified by @reserved, normally in error path */ +static int qgroup_free_reserved_data(struct inode *inode, + struct extent_changeset *reserved, u64 start, u64 len) +{ + struct btrfs_root *root = BTRFS_I(inode)->root; + struct ulist_node *unode; + struct ulist_iterator uiter; + struct extent_changeset changeset; + int freed = 0; + int ret; + + extent_changeset_init(&changeset); + len = round_up(start + len, root->fs_info->sectorsize); + start = round_down(start, root->fs_info->sectorsize); + + ULIST_ITER_INIT(&uiter); + while ((unode = ulist_next(&reserved->range_changed, &uiter))) { + u64 range_start = unode->val; + /* unode->aux is the inclusive end */ + u64 range_len = unode->aux - range_start + 1; + u64 free_start; + u64 free_len; + + extent_changeset_release(&changeset); + + /* Only free range in range [start, start + len) */ + if (range_start >= start + len || + range_start + range_len <= start) + continue; + free_start = max(range_start, start); + free_len = min(start + len, range_start + range_len) - + free_start; + /* + * TODO: To also modify reserved->ranges_reserved to reflect + * the modification. + * + * However as long as we free qgroup reserved according to + * EXTENT_QGROUP_RESERVED, we won't double free. + * So not need to rush. + */ + ret = clear_record_extent_bits(&BTRFS_I(inode)->io_failure_tree, + free_start, free_start + free_len - 1, + EXTENT_QGROUP_RESERVED, &changeset); + if (ret < 0) + goto out; + freed += changeset.bytes_changed; + } + btrfs_qgroup_free_refroot(root->fs_info, root->objectid, freed); + ret = freed; +out: + extent_changeset_release(&changeset); + return ret; +} + +static int __btrfs_qgroup_release_data(struct inode *inode, + struct extent_changeset *reserved, u64 start, u64 len, + int free) { struct extent_changeset changeset; int trace_op = QGROUP_RELEASE; int ret; + /* In release case, we shouldn't have @reserved */ + WARN_ON(!free && reserved); + if (free && reserved) + return qgroup_free_reserved_data(inode, reserved, start, len); extent_changeset_init(&changeset); ret = clear_record_extent_bits(&BTRFS_I(inode)->io_tree, start, start + len -1, EXTENT_QGROUP_RESERVED, &changeset); @@ -2924,14 +2983,17 @@ out: * * Should be called when a range of pages get invalidated before reaching disk. * Or for error cleanup case. + * if @reserved is given, only reserved range in [@start, @start + @len) will + * be freed. * * For data written to disk, use btrfs_qgroup_release_data(). * * NOTE: This function may sleep for memory allocation. */ -int btrfs_qgroup_free_data(struct inode *inode, u64 start, u64 len) +int btrfs_qgroup_free_data(struct inode *inode, + struct extent_changeset *reserved, u64 start, u64 len) { - return __btrfs_qgroup_release_data(inode, start, len, 1); + return __btrfs_qgroup_release_data(inode, reserved, start, len, 1); } /* @@ -2951,7 +3013,7 @@ int btrfs_qgroup_free_data(struct inode *inode, u64 start, u64 len) */ int btrfs_qgroup_release_data(struct inode *inode, u64 start, u64 len) { - return __btrfs_qgroup_release_data(inode, start, len, 0); + return __btrfs_qgroup_release_data(inode, NULL, start, len, 0); } int btrfs_qgroup_reserve_meta(struct btrfs_root *root, int num_bytes, diff --git a/fs/btrfs/qgroup.h b/fs/btrfs/qgroup.h index 99408e93eb0d..d9984e87cddf 100644 --- a/fs/btrfs/qgroup.h +++ b/fs/btrfs/qgroup.h @@ -245,7 +245,8 @@ int btrfs_verify_qgroup_counts(struct btrfs_fs_info *fs_info, u64 qgroupid, int btrfs_qgroup_reserve_data(struct inode *inode, struct extent_changeset **reserved, u64 start, u64 len); int btrfs_qgroup_release_data(struct inode *inode, u64 start, u64 len); -int btrfs_qgroup_free_data(struct inode *inode, u64 start, u64 len); +int btrfs_qgroup_free_data(struct inode *inode, + struct extent_changeset *reserved, u64 start, u64 len); int btrfs_qgroup_reserve_meta(struct btrfs_root *root, int num_bytes, bool enforce); diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 6407423151ab..dc69b6ba29af 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -3114,8 +3114,8 @@ int prealloc_file_extent_cluster(struct inode *inode, lock_extent(&BTRFS_I(inode)->io_tree, start, end); num_bytes = end + 1 - start; if (cur_offset < start) - btrfs_free_reserved_data_space(inode, cur_offset, - start - cur_offset); + btrfs_free_reserved_data_space(inode, data_reserved, + cur_offset, start - cur_offset); ret = btrfs_prealloc_file_range(inode, 0, start, num_bytes, num_bytes, end + 1, &alloc_hint); @@ -3126,8 +3126,8 @@ int prealloc_file_extent_cluster(struct inode *inode, nr++; } if (cur_offset < prealloc_end) - btrfs_free_reserved_data_space(inode, cur_offset, - prealloc_end + 1 - cur_offset); + btrfs_free_reserved_data_space(inode, data_reserved, + cur_offset, prealloc_end + 1 - cur_offset); out: inode_unlock(inode); extent_changeset_free(data_reserved); -- cgit v1.2.3 From 6374e57ad8091b9c2db2eecc536c7f0166ce099e Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Fri, 23 Jun 2017 09:48:21 -0700 Subject: btrfs: fix integer overflow in calc_reclaim_items_nr Dave Jones hit a WARN_ON(nr < 0) in btrfs_wait_ordered_roots() with v4.12-rc6. This was because commit 70e7af244 made it possible for calc_reclaim_items_nr() to return a negative number. It's not really a bug in that commit, it just didn't go far enough down the stack to find all the possible 64->32 bit overflows. This switches calc_reclaim_items_nr() to return a u64 and changes everyone that uses the results of that math to u64 as well. Reported-by: Dave Jones Fixes: 70e7af2 ("Btrfs: fix delalloc accounting leak caused by u32 overflow") Signed-off-by: Chris Mason Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/dev-replace.c | 4 ++-- fs/btrfs/extent-tree.c | 12 ++++++------ fs/btrfs/ioctl.c | 2 +- fs/btrfs/ordered-data.c | 17 ++++++++--------- fs/btrfs/ordered-data.h | 4 ++-- fs/btrfs/qgroup.c | 2 +- fs/btrfs/relocation.c | 2 +- fs/btrfs/scrub.c | 2 +- fs/btrfs/super.c | 2 +- fs/btrfs/transaction.c | 2 +- 10 files changed, 24 insertions(+), 25 deletions(-) (limited to 'fs/btrfs/extent-tree.c') diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c index 5fe1ca8abc70..bee3edeea7a3 100644 --- a/fs/btrfs/dev-replace.c +++ b/fs/btrfs/dev-replace.c @@ -388,7 +388,7 @@ int btrfs_dev_replace_start(struct btrfs_fs_info *fs_info, if (ret) btrfs_err(fs_info, "kobj add dev failed %d", ret); - btrfs_wait_ordered_roots(fs_info, -1, 0, (u64)-1); + btrfs_wait_ordered_roots(fs_info, U64_MAX, 0, (u64)-1); /* force writing the updated state information to disk */ trans = btrfs_start_transaction(root, 0); @@ -507,7 +507,7 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info, mutex_unlock(&dev_replace->lock_finishing_cancel_unmount); return ret; } - btrfs_wait_ordered_roots(fs_info, -1, 0, (u64)-1); + btrfs_wait_ordered_roots(fs_info, U64_MAX, 0, (u64)-1); trans = btrfs_start_transaction(root, 0); if (IS_ERR(trans)) { diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index a3339acec28c..375f8c728d91 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -4288,7 +4288,7 @@ commit_trans: if (need_commit > 0) { btrfs_start_delalloc_roots(fs_info, 0, -1); - btrfs_wait_ordered_roots(fs_info, -1, 0, + btrfs_wait_ordered_roots(fs_info, U64_MAX, 0, (u64)-1); } @@ -4748,14 +4748,14 @@ static void btrfs_writeback_inodes_sb_nr(struct btrfs_fs_info *fs_info, } } -static inline int calc_reclaim_items_nr(struct btrfs_fs_info *fs_info, +static inline u64 calc_reclaim_items_nr(struct btrfs_fs_info *fs_info, u64 to_reclaim) { u64 bytes; - int nr; + u64 nr; bytes = btrfs_calc_trans_metadata_size(fs_info, 1); - nr = (int)div64_u64(to_reclaim, bytes); + nr = div64_u64(to_reclaim, bytes); if (!nr) nr = 1; return nr; @@ -4774,15 +4774,15 @@ static void shrink_delalloc(struct btrfs_fs_info *fs_info, u64 to_reclaim, struct btrfs_trans_handle *trans; u64 delalloc_bytes; u64 max_reclaim; + u64 items; long time_left; unsigned long nr_pages; int loops; - int items; enum btrfs_reserve_flush_enum flush; /* Calc the number of the pages we need flush for space reservation */ items = calc_reclaim_items_nr(fs_info, to_reclaim); - to_reclaim = (u64)items * EXTENT_SIZE_PER_ITEM; + to_reclaim = items * EXTENT_SIZE_PER_ITEM; trans = (struct btrfs_trans_handle *)current->journal_info; block_rsv = &fs_info->delalloc_block_rsv; diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index b4e9941efb60..fa1b78cf25f6 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -689,7 +689,7 @@ static int create_snapshot(struct btrfs_root *root, struct inode *dir, if (ret) goto dec_and_free; - btrfs_wait_ordered_extents(root, -1, 0, (u64)-1); + btrfs_wait_ordered_extents(root, U64_MAX, 0, (u64)-1); btrfs_init_block_rsv(&pending_snapshot->block_rsv, BTRFS_BLOCK_RSV_TEMP); diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index 7b40e2e7292a..a3aca495e33e 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -663,7 +663,7 @@ static void btrfs_run_ordered_extent_work(struct btrfs_work *work) * wait for all the ordered extents in a root. This is done when balancing * space between drives. */ -int btrfs_wait_ordered_extents(struct btrfs_root *root, int nr, +u64 btrfs_wait_ordered_extents(struct btrfs_root *root, u64 nr, const u64 range_start, const u64 range_len) { struct btrfs_fs_info *fs_info = root->fs_info; @@ -671,7 +671,7 @@ int btrfs_wait_ordered_extents(struct btrfs_root *root, int nr, LIST_HEAD(skipped); LIST_HEAD(works); struct btrfs_ordered_extent *ordered, *next; - int count = 0; + u64 count = 0; const u64 range_end = range_start + range_len; mutex_lock(&root->ordered_extent_mutex); @@ -701,7 +701,7 @@ int btrfs_wait_ordered_extents(struct btrfs_root *root, int nr, cond_resched(); spin_lock(&root->ordered_extent_lock); - if (nr != -1) + if (nr != U64_MAX) nr--; count++; } @@ -720,13 +720,13 @@ int btrfs_wait_ordered_extents(struct btrfs_root *root, int nr, return count; } -int btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, int nr, - const u64 range_start, const u64 range_len) +u64 btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, u64 nr, + const u64 range_start, const u64 range_len) { struct btrfs_root *root; struct list_head splice; - int done; - int total_done = 0; + u64 total_done = 0; + u64 done; INIT_LIST_HEAD(&splice); @@ -748,9 +748,8 @@ int btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, int nr, total_done += done; spin_lock(&fs_info->ordered_root_lock); - if (nr != -1) { + if (nr != U64_MAX) { nr -= done; - WARN_ON(nr < 0); } } list_splice_tail(&splice, &fs_info->ordered_roots); diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h index e0c1d5b8d859..56c4c0ee6381 100644 --- a/fs/btrfs/ordered-data.h +++ b/fs/btrfs/ordered-data.h @@ -200,9 +200,9 @@ int btrfs_ordered_update_i_size(struct inode *inode, u64 offset, struct btrfs_ordered_extent *ordered); int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr, u32 *sum, int len); -int btrfs_wait_ordered_extents(struct btrfs_root *root, int nr, +u64 btrfs_wait_ordered_extents(struct btrfs_root *root, u64 nr, const u64 range_start, const u64 range_len); -int btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, int nr, +u64 btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, u64 nr, const u64 range_start, const u64 range_len); void btrfs_get_logged_extents(struct btrfs_inode *inode, struct list_head *logged_list, diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index fc9dffaa9524..4ce351efe281 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -2405,7 +2405,7 @@ retry: ret = btrfs_start_delalloc_inodes(root, 0); if (ret) return ret; - btrfs_wait_ordered_extents(root, -1, 0, (u64)-1); + btrfs_wait_ordered_extents(root, U64_MAX, 0, (u64)-1); trans = btrfs_join_transaction(root); if (IS_ERR(trans)) return PTR_ERR(trans); diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index dc69b6ba29af..65661d1aae4e 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -4373,7 +4373,7 @@ int btrfs_relocate_block_group(struct btrfs_fs_info *fs_info, u64 group_start) btrfs_wait_block_group_reservations(rc->block_group); btrfs_wait_nocow_writers(rc->block_group); - btrfs_wait_ordered_roots(fs_info, -1, + btrfs_wait_ordered_roots(fs_info, U64_MAX, rc->block_group->key.objectid, rc->block_group->key.offset); diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 738e784ba20d..0cebeb5eb5d0 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -3836,7 +3836,7 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx, */ btrfs_wait_block_group_reservations(cache); btrfs_wait_nocow_writers(cache); - ret = btrfs_wait_ordered_roots(fs_info, -1, + ret = btrfs_wait_ordered_roots(fs_info, U64_MAX, cache->key.objectid, cache->key.offset); if (ret > 0) { diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index ba2cb7f96986..74e47794e63f 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -1177,7 +1177,7 @@ int btrfs_sync_fs(struct super_block *sb, int wait) return 0; } - btrfs_wait_ordered_roots(fs_info, -1, 0, (u64)-1); + btrfs_wait_ordered_roots(fs_info, U64_MAX, 0, (u64)-1); trans = btrfs_attach_transaction_barrier(root); if (IS_ERR(trans)) { diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 309b73da756b..f615d59b0489 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -1923,7 +1923,7 @@ static inline int btrfs_start_delalloc_flush(struct btrfs_fs_info *fs_info) static inline void btrfs_wait_delalloc_flush(struct btrfs_fs_info *fs_info) { if (btrfs_test_opt(fs_info, FLUSHONCOMMIT)) - btrfs_wait_ordered_roots(fs_info, -1, 0, (u64)-1); + btrfs_wait_ordered_roots(fs_info, U64_MAX, 0, (u64)-1); } static inline void -- cgit v1.2.3