From 8eaf40c0e24e98899a0f3ac9d25a33aafe13822a Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Wed, 12 Jun 2019 11:05:42 +0100 Subject: Btrfs: fix race between block group removal and block group allocation If a task is removing the block group that currently has the highest start offset amongst all existing block groups, there is a short time window where it races with a concurrent block group allocation, resulting in a transaction abort with an error code of EEXIST. The following diagram explains the race in detail: Task A Task B btrfs_remove_block_group(bg offset X) remove_extent_mapping(em offset X) -> removes extent map X from the tree of extent maps (fs_info->mapping_tree), so the next call to find_next_chunk() will return offset X btrfs_alloc_chunk() find_next_chunk() --> returns offset X __btrfs_alloc_chunk(offset X) btrfs_make_block_group() btrfs_create_block_group_cache() --> creates btrfs_block_group_cache object with a key corresponding to the block group item in the extent, the key is: (offset X, BTRFS_BLOCK_GROUP_ITEM_KEY, 1G) --> adds the btrfs_block_group_cache object to the list new_bgs of the transaction handle btrfs_end_transaction(trans handle) __btrfs_end_transaction() btrfs_create_pending_block_groups() --> sees the new btrfs_block_group_cache in the new_bgs list of the transaction handle --> its call to btrfs_insert_item() fails with -EEXIST when attempting to insert the block group item key (offset X, BTRFS_BLOCK_GROUP_ITEM_KEY, 1G) because task A has not removed that key yet --> aborts the running transaction with error -EEXIST btrfs_del_item() -> removes the block group's key from the extent tree, key is (offset X, BTRFS_BLOCK_GROUP_ITEM_KEY, 1G) A sample transaction abort trace: [78912.403537] ------------[ cut here ]------------ [78912.403811] BTRFS: Transaction aborted (error -17) [78912.404082] WARNING: CPU: 2 PID: 20465 at fs/btrfs/extent-tree.c:10551 btrfs_create_pending_block_groups+0x196/0x250 [btrfs] (...) [78912.405642] CPU: 2 PID: 20465 Comm: btrfs Tainted: G W 5.0.0-btrfs-next-46 #1 [78912.405941] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.11.2-0-gf9626ccb91-prebuilt.qemu-project.org 04/01/2014 [78912.406586] RIP: 0010:btrfs_create_pending_block_groups+0x196/0x250 [btrfs] (...) [78912.407636] RSP: 0018:ffff9d3d4b7e3b08 EFLAGS: 00010282 [78912.407997] RAX: 0000000000000000 RBX: ffff90959a3796f0 RCX: 0000000000000006 [78912.408369] RDX: 0000000000000007 RSI: 0000000000000001 RDI: ffff909636b16860 [78912.408746] RBP: ffff909626758a58 R08: 0000000000000000 R09: 0000000000000000 [78912.409144] R10: ffff9095ff462400 R11: 0000000000000000 R12: ffff90959a379588 [78912.409521] R13: ffff909626758ab0 R14: ffff9095036c0000 R15: ffff9095299e1158 [78912.409899] FS: 00007f387f16f700(0000) GS:ffff909636b00000(0000) knlGS:0000000000000000 [78912.410285] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [78912.410673] CR2: 00007f429fc87cbc CR3: 000000014440a004 CR4: 00000000003606e0 [78912.411095] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [78912.411496] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 [78912.411898] Call Trace: [78912.412318] __btrfs_end_transaction+0x5b/0x1c0 [btrfs] [78912.412746] btrfs_inc_block_group_ro+0xcf/0x160 [btrfs] [78912.413179] scrub_enumerate_chunks+0x188/0x5b0 [btrfs] [78912.413622] ? __mutex_unlock_slowpath+0x100/0x2a0 [78912.414078] btrfs_scrub_dev+0x2ef/0x720 [btrfs] [78912.414535] ? __sb_start_write+0xd4/0x1c0 [78912.414963] ? mnt_want_write_file+0x24/0x50 [78912.415403] btrfs_ioctl+0x17fb/0x3120 [btrfs] [78912.415832] ? lock_acquire+0xa6/0x190 [78912.416256] ? do_vfs_ioctl+0xa2/0x6f0 [78912.416685] ? btrfs_ioctl_get_supported_features+0x30/0x30 [btrfs] [78912.417116] do_vfs_ioctl+0xa2/0x6f0 [78912.417534] ? __fget+0x113/0x200 [78912.417954] ksys_ioctl+0x70/0x80 [78912.418369] __x64_sys_ioctl+0x16/0x20 [78912.418812] do_syscall_64+0x60/0x1b0 [78912.419231] entry_SYSCALL_64_after_hwframe+0x49/0xbe [78912.419644] RIP: 0033:0x7f3880252dd7 (...) [78912.420957] RSP: 002b:00007f387f16ed68 EFLAGS: 00000246 ORIG_RAX: 0000000000000010 [78912.421426] RAX: ffffffffffffffda RBX: 000055f5becc1df0 RCX: 00007f3880252dd7 [78912.421889] RDX: 000055f5becc1df0 RSI: 00000000c400941b RDI: 0000000000000003 [78912.422354] RBP: 0000000000000000 R08: 00007f387f16f700 R09: 0000000000000000 [78912.422790] R10: 00007f387f16f700 R11: 0000000000000246 R12: 0000000000000000 [78912.423202] R13: 00007ffda49c266f R14: 0000000000000000 R15: 00007f388145e040 [78912.425505] ---[ end trace eb9bfe7c426fc4d3 ]--- Fix this by calling remove_extent_mapping(), at btrfs_remove_block_group(), only at the very end, after removing the block group item key from the extent tree (and removing the free space tree entry if we are using the free space tree feature). Fixes: 04216820fe83d5 ("Btrfs: fix race between fs trimming and block group remove/allocation") CC: stable@vger.kernel.org # 4.4+ Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/extent-tree.c | 24 +++++++++++++----------- 1 file changed, 13 insertions(+), 11 deletions(-) diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index c7adff343ba9..5faf057f6f37 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -10831,17 +10831,6 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans, remove_em = (atomic_read(&block_group->trimming) == 0); spin_unlock(&block_group->lock); - if (remove_em) { - struct extent_map_tree *em_tree; - - em_tree = &fs_info->mapping_tree.map_tree; - write_lock(&em_tree->lock); - remove_extent_mapping(em_tree, em); - write_unlock(&em_tree->lock); - /* once for the tree */ - free_extent_map(em); - } - mutex_unlock(&fs_info->chunk_mutex); ret = remove_block_group_free_space(trans, block_group); @@ -10858,6 +10847,19 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans, goto out; ret = btrfs_del_item(trans, root, path); + if (ret) + goto out; + + if (remove_em) { + struct extent_map_tree *em_tree; + + em_tree = &fs_info->mapping_tree.map_tree; + write_lock(&em_tree->lock); + remove_extent_mapping(em_tree, em); + write_unlock(&em_tree->lock); + /* once for the tree */ + free_extent_map(em); + } out: if (remove_rsv) btrfs_delayed_refs_rsv_release(fs_info, 1); -- cgit v1.2.3 From c4e0540d0ad49c8ceab06cceed1de27c4fe29f6e Mon Sep 17 00:00:00 2001 From: Naohiro Aota Date: Thu, 6 Jun 2019 16:54:44 +0900 Subject: btrfs: start readahead also in seed devices Currently, btrfs does not consult seed devices to start readahead. As a result, if readahead zone is added to the seed devices, btrfs_reada_wait() indefinitely wait for the reada_ctl to finish. You can reproduce the hung by modifying btrfs/163 to have larger initial file size (e.g. xfs_io pwrite 4M instead of current 256K). Fixes: 7414a03fbf9e ("btrfs: initial readahead code and prototypes") Cc: stable@vger.kernel.org # 3.2+: ce7791ffee1e: Btrfs: fix race between readahead and device replace/removal Cc: stable@vger.kernel.org # 3.2+ Reviewed-by: Filipe Manana Signed-off-by: Naohiro Aota Signed-off-by: David Sterba --- fs/btrfs/reada.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/fs/btrfs/reada.c b/fs/btrfs/reada.c index 10d9589001a9..bb5bd49573b4 100644 --- a/fs/btrfs/reada.c +++ b/fs/btrfs/reada.c @@ -747,6 +747,7 @@ static void __reada_start_machine(struct btrfs_fs_info *fs_info) u64 total = 0; int i; +again: do { enqueued = 0; mutex_lock(&fs_devices->device_list_mutex); @@ -758,6 +759,10 @@ static void __reada_start_machine(struct btrfs_fs_info *fs_info) mutex_unlock(&fs_devices->device_list_mutex); total += enqueued; } while (enqueued && total < 10000); + if (fs_devices->seed) { + fs_devices = fs_devices->seed; + goto again; + } if (enqueued == 0) return; -- cgit v1.2.3 From 3763771cf60236caaf7ccc79cea244c63d7c49a0 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Wed, 12 Jun 2019 15:14:11 +0100 Subject: Btrfs: fix failure to persist compression property xattr deletion on fsync After the recent series of cleanups in the properties and xattrs modules that landed in the 5.2 merge window, we ended up with a regression where after deleting the compression xattr property through the setflags ioctl, we don't set the BTRFS_INODE_COPY_EVERYTHING flag in the inode anymore. As a consequence, if the inode was fsync'ed when it had the compression property set, after deleting the compression property through the setflags ioctl and fsync'ing again the inode, the log will still contain the compression xattr, because the inode did not had that bit set, which made the fsync not delete all xattrs from the log and copy all xattrs from the subvolume tree to the log tree. This regression happens due to the fact that that series of cleanups made btrfs_set_prop() call the old function do_setxattr() (which is now named btrfs_setxattr()), and not the old version of btrfs_setxattr(), which is now called btrfs_setxattr_trans(). Fix this by setting the BTRFS_INODE_COPY_EVERYTHING bit in the current btrfs_setxattr() function and remove it from everywhere else, including its setup at btrfs_ioctl_setflags(). This is cleaner, avoids similar regressions in the future, and centralizes the setup of the bit. After all, the need to setup this bit should only be in the xattrs module, since it is an implementation of xattrs. Fixes: 04e6863b19c722 ("btrfs: split btrfs_setxattr calls regarding transaction") CC: stable@vger.kernel.org # 4.4+ Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/ioctl.c | 2 -- fs/btrfs/xattr.c | 6 +++--- 2 files changed, 3 insertions(+), 5 deletions(-) diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 6dafa857bbb9..2a1be0d1a698 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -312,8 +312,6 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg) btrfs_abort_transaction(trans, ret); goto out_end_trans; } - set_bit(BTRFS_INODE_COPY_EVERYTHING, - &BTRFS_I(inode)->runtime_flags); } else { ret = btrfs_set_prop(trans, inode, "btrfs.compression", NULL, 0, 0); diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c index 78b6ba2029e8..95d9aebff2c4 100644 --- a/fs/btrfs/xattr.c +++ b/fs/btrfs/xattr.c @@ -213,6 +213,9 @@ int btrfs_setxattr(struct btrfs_trans_handle *trans, struct inode *inode, } out: btrfs_free_path(path); + if (!ret) + set_bit(BTRFS_INODE_COPY_EVERYTHING, + &BTRFS_I(inode)->runtime_flags); return ret; } @@ -236,7 +239,6 @@ int btrfs_setxattr_trans(struct inode *inode, const char *name, inode_inc_iversion(inode); inode->i_ctime = current_time(inode); - set_bit(BTRFS_INODE_COPY_EVERYTHING, &BTRFS_I(inode)->runtime_flags); ret = btrfs_update_inode(trans, root, inode); BUG_ON(ret); out: @@ -388,8 +390,6 @@ static int btrfs_xattr_handler_set_prop(const struct xattr_handler *handler, if (!ret) { inode_inc_iversion(inode); inode->i_ctime = current_time(inode); - set_bit(BTRFS_INODE_COPY_EVERYTHING, - &BTRFS_I(inode)->runtime_flags); ret = btrfs_update_inode(trans, root, inode); BUG_ON(ret); } -- cgit v1.2.3