From 8461a3de770477a9a7b8eeaebcc4804dbc26ca38 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 13 Mar 2015 15:12:08 -0400 Subject: Btrfs: fix merge delalloc logic My patch to properly count outstanding extents wrt MAX_EXTENT_SIZE introduced a regression when re-dirtying already dirty areas. We have logic in split to make sure we are taking the largest space into account but didn't have it for merge, so it was sometimes making us think we were turning a tiny extent into a huge extent, when in reality we already had a huge extent and needed to use the other side in our logic. This fixes the regression that was reported by a user on list. Thanks, Reported-by: Markus Trippelsdorf Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/inode.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'fs/btrfs/inode.c') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 91a87f53be3c..97b601bec326 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -1592,7 +1592,12 @@ static void btrfs_merge_extent_hook(struct inode *inode, return; old_size = other->end - other->start + 1; - new_size = old_size + (new->end - new->start + 1); + if (old_size < (new->end - new->start + 1)) + old_size = (new->end - new->start + 1); + if (new->start > other->start) + new_size = new->end - other->start + 1; + else + new_size = other->end - new->start + 1; /* we're not bigger than the max, unreserve the space and go */ if (new_size <= BTRFS_MAX_EXTENT_SIZE) { -- cgit v1.2.3 From ba117213554bc747561c5b7bf274d60ac93b8598 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 13 Mar 2015 15:01:24 -0400 Subject: Btrfs: account merges/splits properly My fix Btrfs: fix merge delalloc logic only fixed half of the problems, it didn't fix the case where we have two large extents on either side and then join them together with a new small extent. We need to instead keep track of how many extents we have accounted for with each side of the new extent, and then see how many extents we need for the new large extent. If they match then we know we need to keep our reservation, otherwise we need to drop our reservation. This shows up with a case like this [BTRFS_MAX_EXTENT_SIZE+4K][4K HOLE][BTRFS_MAX_EXTENT_SIZE+4K] Previously the logic would have said that the number extents required for the new size (3) is larger than the number of extents required for the largest side (2) therefore we need to keep our reservation. But this isn't the case, since both sides require a reservation of 2 which leads to 4 for the whole range currently reserved, but we only need 3, so we need to drop one of the reservations. The same problem existed for splits, we'd think we only need 3 extents when creating the hole but in reality we need 4. Thanks, Signed-off-by: Josef Bacik --- fs/btrfs/inode.c | 57 +++++++++++++++++++++++++++++--------------------------- 1 file changed, 30 insertions(+), 27 deletions(-) (limited to 'fs/btrfs/inode.c') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 97b601bec326..bb74a4181075 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -1542,30 +1542,17 @@ static void btrfs_split_extent_hook(struct inode *inode, u64 new_size; /* - * We need the largest size of the remaining extent to see if we - * need to add a new outstanding extent. Think of the following - * case - * - * [MEAX_EXTENT_SIZEx2 - 4k][4k] - * - * The new_size would just be 4k and we'd think we had enough - * outstanding extents for this if we only took one side of the - * split, same goes for the other direction. We need to see if - * the larger size still is the same amount of extents as the - * original size, because if it is we need to add a new - * outstanding extent. But if we split up and the larger size - * is less than the original then we are good to go since we've - * already accounted for the extra extent in our original - * accounting. + * See the explanation in btrfs_merge_extent_hook, the same + * applies here, just in reverse. */ new_size = orig->end - split + 1; - if ((split - orig->start) > new_size) - new_size = split - orig->start; - - num_extents = div64_u64(size + BTRFS_MAX_EXTENT_SIZE - 1, + num_extents = div64_u64(new_size + BTRFS_MAX_EXTENT_SIZE - 1, BTRFS_MAX_EXTENT_SIZE); - if (div64_u64(new_size + BTRFS_MAX_EXTENT_SIZE - 1, - BTRFS_MAX_EXTENT_SIZE) < num_extents) + new_size = split - orig->start; + num_extents += div64_u64(new_size + BTRFS_MAX_EXTENT_SIZE - 1, + BTRFS_MAX_EXTENT_SIZE); + if (div64_u64(size + BTRFS_MAX_EXTENT_SIZE - 1, + BTRFS_MAX_EXTENT_SIZE) >= num_extents) return; } @@ -1591,9 +1578,6 @@ static void btrfs_merge_extent_hook(struct inode *inode, if (!(other->state & EXTENT_DELALLOC)) return; - old_size = other->end - other->start + 1; - if (old_size < (new->end - new->start + 1)) - old_size = (new->end - new->start + 1); if (new->start > other->start) new_size = new->end - other->start + 1; else @@ -1608,13 +1592,32 @@ static void btrfs_merge_extent_hook(struct inode *inode, } /* - * If we grew by another max_extent, just return, we want to keep that - * reserved amount. + * We have to add up either side to figure out how many extents were + * accounted for before we merged into one big extent. If the number of + * extents we accounted for is <= the amount we need for the new range + * then we can return, otherwise drop. Think of it like this + * + * [ 4k][MAX_SIZE] + * + * So we've grown the extent by a MAX_SIZE extent, this would mean we + * need 2 outstanding extents, on one side we have 1 and the other side + * we have 1 so they are == and we can return. But in this case + * + * [MAX_SIZE+4k][MAX_SIZE+4k] + * + * Each range on their own accounts for 2 extents, but merged together + * they are only 3 extents worth of accounting, so we need to drop in + * this case. */ + old_size = other->end - other->start + 1; num_extents = div64_u64(old_size + BTRFS_MAX_EXTENT_SIZE - 1, BTRFS_MAX_EXTENT_SIZE); + old_size = new->end - new->start + 1; + num_extents += div64_u64(old_size + BTRFS_MAX_EXTENT_SIZE - 1, + BTRFS_MAX_EXTENT_SIZE); + if (div64_u64(new_size + BTRFS_MAX_EXTENT_SIZE - 1, - BTRFS_MAX_EXTENT_SIZE) > num_extents) + BTRFS_MAX_EXTENT_SIZE) >= num_extents) return; spin_lock(&BTRFS_I(inode)->lock); -- cgit v1.2.3 From 6a3891c551268dd4ff0969b883c4c8b8d974db8f Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Mon, 16 Mar 2015 17:38:52 -0400 Subject: Btrfs: add sanity test for outstanding_extents accounting I introduced a regression wrt outstanding_extents accounting. These are tricky areas that aren't easily covered by xfstests as we could change MAX_EXTENT_SIZE at any time. So add sanity tests to cover the various conditions that are tricky in order to make sure we don't introduce regressions in the future. Thanks, Signed-off-by: Josef Bacik --- fs/btrfs/ctree.h | 3 + fs/btrfs/extent-tree.c | 3 + fs/btrfs/inode.c | 15 ++++ fs/btrfs/tests/inode-tests.c | 197 ++++++++++++++++++++++++++++++++++++++++++- 4 files changed, 217 insertions(+), 1 deletion(-) (limited to 'fs/btrfs/inode.c') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index a0c90a324f53..5bd721a1516e 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -3910,6 +3910,9 @@ int btrfs_prealloc_file_range_trans(struct inode *inode, loff_t actual_len, u64 *alloc_hint); int btrfs_inode_check_errors(struct inode *inode); extern const struct dentry_operations btrfs_dentry_operations; +#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS +void btrfs_test_inode_set_ops(struct inode *inode); +#endif /* ioctl.c */ long btrfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg); diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 3ac3fefb1625..d1ca86ea9993 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -5285,6 +5285,9 @@ void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes) if (dropped > 0) to_free += btrfs_calc_trans_metadata_size(root, dropped); + if (btrfs_test_is_dummy_root(root)) + return; + trace_btrfs_space_reservation(root->fs_info, "delalloc", btrfs_ino(inode), to_free, 0); if (root->fs_info->quota_enabled) { diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index bb74a4181075..3717b3d4c2ce 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -108,6 +108,13 @@ static struct extent_map *create_pinned_em(struct inode *inode, u64 start, static int btrfs_dirty_inode(struct inode *inode); +#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS +void btrfs_test_inode_set_ops(struct inode *inode) +{ + BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops; +} +#endif + static int btrfs_init_inode_security(struct btrfs_trans_handle *trans, struct inode *inode, struct inode *dir, const struct qstr *qstr) @@ -1694,6 +1701,10 @@ static void btrfs_set_bit_hook(struct inode *inode, spin_unlock(&BTRFS_I(inode)->lock); } + /* For sanity tests */ + if (btrfs_test_is_dummy_root(root)) + return; + __percpu_counter_add(&root->fs_info->delalloc_bytes, len, root->fs_info->delalloc_batch); spin_lock(&BTRFS_I(inode)->lock); @@ -1749,6 +1760,10 @@ static void btrfs_clear_bit_hook(struct inode *inode, root != root->fs_info->tree_root) btrfs_delalloc_release_metadata(inode, len); + /* For sanity tests. */ + if (btrfs_test_is_dummy_root(root)) + return; + if (root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID && do_list && !(state->state & EXTENT_NORESERVE)) btrfs_free_reserved_data_space(inode, len); diff --git a/fs/btrfs/tests/inode-tests.c b/fs/btrfs/tests/inode-tests.c index a116b55ce788..054fc0d97131 100644 --- a/fs/btrfs/tests/inode-tests.c +++ b/fs/btrfs/tests/inode-tests.c @@ -911,6 +911,197 @@ out: return ret; } +static int test_extent_accounting(void) +{ + struct inode *inode = NULL; + struct btrfs_root *root = NULL; + int ret = -ENOMEM; + + inode = btrfs_new_test_inode(); + if (!inode) { + test_msg("Couldn't allocate inode\n"); + return ret; + } + + root = btrfs_alloc_dummy_root(); + if (IS_ERR(root)) { + test_msg("Couldn't allocate root\n"); + goto out; + } + + root->fs_info = btrfs_alloc_dummy_fs_info(); + if (!root->fs_info) { + test_msg("Couldn't allocate dummy fs info\n"); + goto out; + } + + BTRFS_I(inode)->root = root; + btrfs_test_inode_set_ops(inode); + + /* [BTRFS_MAX_EXTENT_SIZE] */ + BTRFS_I(inode)->outstanding_extents++; + ret = btrfs_set_extent_delalloc(inode, 0, BTRFS_MAX_EXTENT_SIZE - 1, + NULL); + if (ret) { + test_msg("btrfs_set_extent_delalloc returned %d\n", ret); + goto out; + } + if (BTRFS_I(inode)->outstanding_extents != 1) { + ret = -EINVAL; + test_msg("Miscount, wanted 1, got %u\n", + BTRFS_I(inode)->outstanding_extents); + goto out; + } + + /* [BTRFS_MAX_EXTENT_SIZE][4k] */ + BTRFS_I(inode)->outstanding_extents++; + ret = btrfs_set_extent_delalloc(inode, BTRFS_MAX_EXTENT_SIZE, + BTRFS_MAX_EXTENT_SIZE + 4095, NULL); + if (ret) { + test_msg("btrfs_set_extent_delalloc returned %d\n", ret); + goto out; + } + if (BTRFS_I(inode)->outstanding_extents != 2) { + ret = -EINVAL; + test_msg("Miscount, wanted 2, got %u\n", + BTRFS_I(inode)->outstanding_extents); + goto out; + } + + /* [BTRFS_MAX_EXTENT_SIZE/2][4K HOLE][the rest] */ + ret = clear_extent_bit(&BTRFS_I(inode)->io_tree, + BTRFS_MAX_EXTENT_SIZE >> 1, + (BTRFS_MAX_EXTENT_SIZE >> 1) + 4095, + EXTENT_DELALLOC | EXTENT_DIRTY | + EXTENT_UPTODATE | EXTENT_DO_ACCOUNTING, 0, 0, + NULL, GFP_NOFS); + if (ret) { + test_msg("clear_extent_bit returned %d\n", ret); + goto out; + } + if (BTRFS_I(inode)->outstanding_extents != 2) { + ret = -EINVAL; + test_msg("Miscount, wanted 2, got %u\n", + BTRFS_I(inode)->outstanding_extents); + goto out; + } + + /* [BTRFS_MAX_EXTENT_SIZE][4K] */ + BTRFS_I(inode)->outstanding_extents++; + ret = btrfs_set_extent_delalloc(inode, BTRFS_MAX_EXTENT_SIZE >> 1, + (BTRFS_MAX_EXTENT_SIZE >> 1) + 4095, + NULL); + if (ret) { + test_msg("btrfs_set_extent_delalloc returned %d\n", ret); + goto out; + } + if (BTRFS_I(inode)->outstanding_extents != 2) { + ret = -EINVAL; + test_msg("Miscount, wanted 2, got %u\n", + BTRFS_I(inode)->outstanding_extents); + goto out; + } + + /* + * [BTRFS_MAX_EXTENT_SIZE+4K][4K HOLE][BTRFS_MAX_EXTENT_SIZE+4K] + * + * I'm artificially adding 2 to outstanding_extents because in the + * buffered IO case we'd add things up as we go, but I don't feel like + * doing that here, this isn't the interesting case we want to test. + */ + BTRFS_I(inode)->outstanding_extents += 2; + ret = btrfs_set_extent_delalloc(inode, BTRFS_MAX_EXTENT_SIZE + 8192, + (BTRFS_MAX_EXTENT_SIZE << 1) + 12287, + NULL); + if (ret) { + test_msg("btrfs_set_extent_delalloc returned %d\n", ret); + goto out; + } + if (BTRFS_I(inode)->outstanding_extents != 4) { + ret = -EINVAL; + test_msg("Miscount, wanted 4, got %u\n", + BTRFS_I(inode)->outstanding_extents); + goto out; + } + + /* [BTRFS_MAX_EXTENT_SIZE+4k][4k][BTRFS_MAX_EXTENT_SIZE+4k] */ + BTRFS_I(inode)->outstanding_extents++; + ret = btrfs_set_extent_delalloc(inode, BTRFS_MAX_EXTENT_SIZE+4096, + BTRFS_MAX_EXTENT_SIZE+8191, NULL); + if (ret) { + test_msg("btrfs_set_extent_delalloc returned %d\n", ret); + goto out; + } + if (BTRFS_I(inode)->outstanding_extents != 3) { + ret = -EINVAL; + test_msg("Miscount, wanted 3, got %u\n", + BTRFS_I(inode)->outstanding_extents); + goto out; + } + + /* [BTRFS_MAX_EXTENT_SIZE+4k][4K HOLE][BTRFS_MAX_EXTENT_SIZE+4k] */ + ret = clear_extent_bit(&BTRFS_I(inode)->io_tree, + BTRFS_MAX_EXTENT_SIZE+4096, + BTRFS_MAX_EXTENT_SIZE+8191, + EXTENT_DIRTY | EXTENT_DELALLOC | + EXTENT_DO_ACCOUNTING | EXTENT_UPTODATE, 0, 0, + NULL, GFP_NOFS); + if (ret) { + test_msg("clear_extent_bit returned %d\n", ret); + goto out; + } + if (BTRFS_I(inode)->outstanding_extents != 4) { + ret = -EINVAL; + test_msg("Miscount, wanted 4, got %u\n", + BTRFS_I(inode)->outstanding_extents); + goto out; + } + + /* + * Refill the hole again just for good measure, because I thought it + * might fail and I'd rather satisfy my paranoia at this point. + */ + BTRFS_I(inode)->outstanding_extents++; + ret = btrfs_set_extent_delalloc(inode, BTRFS_MAX_EXTENT_SIZE+4096, + BTRFS_MAX_EXTENT_SIZE+8191, NULL); + if (ret) { + test_msg("btrfs_set_extent_delalloc returned %d\n", ret); + goto out; + } + if (BTRFS_I(inode)->outstanding_extents != 3) { + ret = -EINVAL; + test_msg("Miscount, wanted 3, got %u\n", + BTRFS_I(inode)->outstanding_extents); + goto out; + } + + /* Empty */ + ret = clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, (u64)-1, + EXTENT_DIRTY | EXTENT_DELALLOC | + EXTENT_DO_ACCOUNTING | EXTENT_UPTODATE, 0, 0, + NULL, GFP_NOFS); + if (ret) { + test_msg("clear_extent_bit returned %d\n", ret); + goto out; + } + if (BTRFS_I(inode)->outstanding_extents) { + ret = -EINVAL; + test_msg("Miscount, wanted 0, got %u\n", + BTRFS_I(inode)->outstanding_extents); + goto out; + } + ret = 0; +out: + if (ret) + clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, (u64)-1, + EXTENT_DIRTY | EXTENT_DELALLOC | + EXTENT_DO_ACCOUNTING | EXTENT_UPTODATE, 0, 0, + NULL, GFP_NOFS); + iput(inode); + btrfs_free_dummy_root(root); + return ret; +} + int btrfs_test_inodes(void) { int ret; @@ -924,5 +1115,9 @@ int btrfs_test_inodes(void) if (ret) return ret; test_msg("Running hole first btrfs_get_extent test\n"); - return test_hole_first(); + ret = test_hole_first(); + if (ret) + return ret; + test_msg("Running outstanding_extents tests\n"); + return test_extent_accounting(); } -- cgit v1.2.3 From e1cbbfa5f5aaf40a1fe70856fac4dfcc33e0e651 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Tue, 17 Mar 2015 10:52:28 -0400 Subject: Btrfs: fix outstanding_extents accounting in DIO We are keeping track of how many extents we need to reserve properly based on the amount we want to write, but we were still incrementing outstanding_extents if we wrote less than what we requested. This isn't quite right since we will be limited to our max extent size. So instead lets do something horrible! Keep track of how many outstanding_extents we reserved, and decrement each time we allocate an extent. If we use our entire reserve make sure to jack up outstanding_extents on the inode so the accounting works out properly. Thanks, Reported-by: Filipe Manana Signed-off-by: Josef Bacik --- fs/btrfs/inode.c | 37 +++++++++++++++++++++++++++++++++++-- 1 file changed, 35 insertions(+), 2 deletions(-) (limited to 'fs/btrfs/inode.c') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 3717b3d4c2ce..aa1fb534f69f 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -7239,7 +7239,7 @@ static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock, u64 start = iblock << inode->i_blkbits; u64 lockstart, lockend; u64 len = bh_result->b_size; - u64 orig_len = len; + u64 *outstanding_extents = NULL; int unlock_bits = EXTENT_LOCKED; int ret = 0; @@ -7251,6 +7251,16 @@ static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock, lockstart = start; lockend = start + len - 1; + if (current->journal_info) { + /* + * Need to pull our outstanding extents and set journal_info to NULL so + * that anything that needs to check if there's a transction doesn't get + * confused. + */ + outstanding_extents = current->journal_info; + current->journal_info = NULL; + } + /* * If this errors out it's because we couldn't invalidate pagecache for * this range and we need to fallback to buffered. @@ -7374,11 +7384,20 @@ unlock: if (start + len > i_size_read(inode)) i_size_write(inode, start + len); - if (len < orig_len) { + /* + * If we have an outstanding_extents count still set then we're + * within our reservation, otherwise we need to adjust our inode + * counter appropriately. + */ + if (*outstanding_extents) { + (*outstanding_extents)--; + } else { spin_lock(&BTRFS_I(inode)->lock); BTRFS_I(inode)->outstanding_extents++; spin_unlock(&BTRFS_I(inode)->lock); } + + current->journal_info = outstanding_extents; btrfs_free_reserved_data_space(inode, len); } @@ -7402,6 +7421,8 @@ unlock: unlock_err: clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, lockend, unlock_bits, 1, 0, &cached_state, GFP_NOFS); + if (outstanding_extents) + current->journal_info = outstanding_extents; return ret; } @@ -8101,6 +8122,7 @@ static ssize_t btrfs_direct_IO(int rw, struct kiocb *iocb, { struct file *file = iocb->ki_filp; struct inode *inode = file->f_mapping->host; + u64 outstanding_extents = 0; size_t count = 0; int flags = 0; bool wakeup = true; @@ -8138,6 +8160,16 @@ static ssize_t btrfs_direct_IO(int rw, struct kiocb *iocb, ret = btrfs_delalloc_reserve_space(inode, count); if (ret) goto out; + outstanding_extents = div64_u64(count + + BTRFS_MAX_EXTENT_SIZE - 1, + BTRFS_MAX_EXTENT_SIZE); + + /* + * We need to know how many extents we reserved so that we can + * do the accounting properly if we go over the number we + * originally calculated. Abuse current->journal_info for this. + */ + current->journal_info = &outstanding_extents; } else if (test_bit(BTRFS_INODE_READDIO_NEED_LOCK, &BTRFS_I(inode)->runtime_flags)) { inode_dio_done(inode); @@ -8150,6 +8182,7 @@ static ssize_t btrfs_direct_IO(int rw, struct kiocb *iocb, iter, offset, btrfs_get_blocks_direct, NULL, btrfs_submit_direct, flags); if (rw & WRITE) { + current->journal_info = NULL; if (ret < 0 && ret != -EIOCBQUEUED) btrfs_delalloc_release_space(inode, count); else if (ret >= 0 && (size_t)ret < count) -- cgit v1.2.3