summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2023-08-12 13:28:55 -0700
committerLinus Torvalds <torvalds@linux-foundation.org>2023-08-12 13:28:55 -0700
commita785fd28d31f76d50004712b6e0b409d5a8239d8 (patch)
treea1ddcf141f79219555677908e1f2fc49a1cb83fd
parentae545c3283dc673f7e748065efa46ba95f678ef2 (diff)
parent92fb94b69c6accf1e49fff699640fa0ce03dc910 (diff)
Merge tag 'for-6.5-rc5-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/kdave/linux
Pull btrfs fixes from David Sterba: "More fixes, some of them going back to older releases and there are fixes for hangs in stress tests regarding space caching: - fixes and progress tracking for hangs in free space caching, found by test generic/475 - writeback fixes, write pages in integrity mode and skip writing pages that have been written meanwhile - properly clear end of extent range after an error - relocation fixes: - fix race betwen qgroup tree creation and relocation - detect and report invalid reloc roots" * tag 'for-6.5-rc5-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/kdave/linux: btrfs: set cache_block_group_error if we find an error btrfs: reject invalid reloc tree root keys with stack dump btrfs: exit gracefully if reloc roots don't match btrfs: avoid race between qgroup tree creation and relocation btrfs: properly clear end of the unreserved range in cow_file_range btrfs: don't wait for writeback on clean pages in extent_write_cache_pages btrfs: don't stop integrity writeback too early btrfs: wait for actual caching progress during allocation
-rw-r--r--fs/btrfs/block-group.c17
-rw-r--r--fs/btrfs/block-group.h2
-rw-r--r--fs/btrfs/disk-io.c13
-rw-r--r--fs/btrfs/extent-tree.c5
-rw-r--r--fs/btrfs/extent_io.c13
-rw-r--r--fs/btrfs/inode.c10
-rw-r--r--fs/btrfs/relocation.c45
-rw-r--r--fs/btrfs/tree-checker.c14
8 files changed, 99 insertions, 20 deletions
diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c
index 030ab44fce18..82324c327a50 100644
--- a/fs/btrfs/block-group.c
+++ b/fs/btrfs/block-group.c
@@ -441,13 +441,23 @@ void btrfs_wait_block_group_cache_progress(struct btrfs_block_group *cache,
u64 num_bytes)
{
struct btrfs_caching_control *caching_ctl;
+ int progress;
caching_ctl = btrfs_get_caching_control(cache);
if (!caching_ctl)
return;
+ /*
+ * We've already failed to allocate from this block group, so even if
+ * there's enough space in the block group it isn't contiguous enough to
+ * allow for an allocation, so wait for at least the next wakeup tick,
+ * or for the thing to be done.
+ */
+ progress = atomic_read(&caching_ctl->progress);
+
wait_event(caching_ctl->wait, btrfs_block_group_done(cache) ||
- (cache->free_space_ctl->free_space >= num_bytes));
+ (progress != atomic_read(&caching_ctl->progress) &&
+ (cache->free_space_ctl->free_space >= num_bytes)));
btrfs_put_caching_control(caching_ctl);
}
@@ -802,8 +812,10 @@ next:
if (total_found > CACHING_CTL_WAKE_UP) {
total_found = 0;
- if (wakeup)
+ if (wakeup) {
+ atomic_inc(&caching_ctl->progress);
wake_up(&caching_ctl->wait);
+ }
}
}
path->slots[0]++;
@@ -910,6 +922,7 @@ int btrfs_cache_block_group(struct btrfs_block_group *cache, bool wait)
init_waitqueue_head(&caching_ctl->wait);
caching_ctl->block_group = cache;
refcount_set(&caching_ctl->count, 2);
+ atomic_set(&caching_ctl->progress, 0);
btrfs_init_work(&caching_ctl->work, caching_thread, NULL, NULL);
spin_lock(&cache->lock);
diff --git a/fs/btrfs/block-group.h b/fs/btrfs/block-group.h
index aba5dff66c19..74b61e663028 100644
--- a/fs/btrfs/block-group.h
+++ b/fs/btrfs/block-group.h
@@ -90,6 +90,8 @@ struct btrfs_caching_control {
wait_queue_head_t wait;
struct btrfs_work work;
struct btrfs_block_group *block_group;
+ /* Track progress of caching during allocation. */
+ atomic_t progress;
refcount_t count;
};
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 9b9914e5f03d..a9a2c5446c18 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -1103,7 +1103,8 @@ static int btrfs_init_fs_root(struct btrfs_root *root, dev_t anon_dev)
btrfs_drew_lock_init(&root->snapshot_lock);
if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID &&
- !btrfs_is_data_reloc_root(root)) {
+ !btrfs_is_data_reloc_root(root) &&
+ is_fstree(root->root_key.objectid)) {
set_bit(BTRFS_ROOT_SHAREABLE, &root->state);
btrfs_check_and_init_root_item(&root->root_item);
}
@@ -1300,6 +1301,16 @@ static struct btrfs_root *btrfs_get_root_ref(struct btrfs_fs_info *fs_info,
root = btrfs_get_global_root(fs_info, objectid);
if (root)
return root;
+
+ /*
+ * If we're called for non-subvolume trees, and above function didn't
+ * find one, do not try to read it from disk.
+ *
+ * This is namely for free-space-tree and quota tree, which can change
+ * at runtime and should only be grabbed from fs_info.
+ */
+ if (!is_fstree(objectid) && objectid != BTRFS_DATA_RELOC_TREE_OBJECTID)
+ return ERR_PTR(-ENOENT);
again:
root = btrfs_lookup_fs_root(fs_info, objectid);
if (root) {
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 911908ea5f6f..f396a9afa403 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -4310,8 +4310,11 @@ have_block_group:
ret = 0;
}
- if (unlikely(block_group->cached == BTRFS_CACHE_ERROR))
+ if (unlikely(block_group->cached == BTRFS_CACHE_ERROR)) {
+ if (!cache_block_group_error)
+ cache_block_group_error = -EIO;
goto loop;
+ }
if (!find_free_extent_check_size_class(ffe_ctl, block_group))
goto loop;
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index a91d5ad27984..ca765d62324f 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -2145,6 +2145,12 @@ retry:
continue;
}
+ if (!folio_test_dirty(folio)) {
+ /* Someone wrote it for us. */
+ folio_unlock(folio);
+ continue;
+ }
+
if (wbc->sync_mode != WB_SYNC_NONE) {
if (folio_test_writeback(folio))
submit_write_bio(bio_ctrl, 0);
@@ -2164,11 +2170,12 @@ retry:
}
/*
- * the filesystem may choose to bump up nr_to_write.
+ * The filesystem may choose to bump up nr_to_write.
* We have to make sure to honor the new nr_to_write
- * at any time
+ * at any time.
*/
- nr_to_write_done = wbc->nr_to_write <= 0;
+ nr_to_write_done = (wbc->sync_mode == WB_SYNC_NONE &&
+ wbc->nr_to_write <= 0);
}
folio_batch_release(&fbatch);
cond_resched();
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 49cef61f6a39..9055e19b01ef 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -1654,8 +1654,6 @@ out_unlock:
clear_bits,
page_ops);
start += cur_alloc_size;
- if (start >= end)
- return ret;
}
/*
@@ -1664,9 +1662,11 @@ out_unlock:
* space_info's bytes_may_use counter, reserved in
* btrfs_check_data_free_space().
*/
- extent_clear_unlock_delalloc(inode, start, end, locked_page,
- clear_bits | EXTENT_CLEAR_DATA_RESV,
- page_ops);
+ if (start < end) {
+ clear_bits |= EXTENT_CLEAR_DATA_RESV;
+ extent_clear_unlock_delalloc(inode, start, end, locked_page,
+ clear_bits, page_ops);
+ }
return ret;
}
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index 25a3361caedc..46c3c1d57266 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -1916,7 +1916,39 @@ again:
err = PTR_ERR(root);
break;
}
- ASSERT(root->reloc_root == reloc_root);
+
+ if (unlikely(root->reloc_root != reloc_root)) {
+ if (root->reloc_root) {
+ btrfs_err(fs_info,
+"reloc tree mismatch, root %lld has reloc root key (%lld %u %llu) gen %llu, expect reloc root key (%lld %u %llu) gen %llu",
+ root->root_key.objectid,
+ root->reloc_root->root_key.objectid,
+ root->reloc_root->root_key.type,
+ root->reloc_root->root_key.offset,
+ btrfs_root_generation(
+ &root->reloc_root->root_item),
+ reloc_root->root_key.objectid,
+ reloc_root->root_key.type,
+ reloc_root->root_key.offset,
+ btrfs_root_generation(
+ &reloc_root->root_item));
+ } else {
+ btrfs_err(fs_info,
+"reloc tree mismatch, root %lld has no reloc root, expect reloc root key (%lld %u %llu) gen %llu",
+ root->root_key.objectid,
+ reloc_root->root_key.objectid,
+ reloc_root->root_key.type,
+ reloc_root->root_key.offset,
+ btrfs_root_generation(
+ &reloc_root->root_item));
+ }
+ list_add(&reloc_root->root_list, &reloc_roots);
+ btrfs_put_root(root);
+ btrfs_abort_transaction(trans, -EUCLEAN);
+ if (!err)
+ err = -EUCLEAN;
+ break;
+ }
/*
* set reference count to 1, so btrfs_recover_relocation
@@ -1989,7 +2021,7 @@ again:
root = btrfs_get_fs_root(fs_info, reloc_root->root_key.offset,
false);
if (btrfs_root_refs(&reloc_root->root_item) > 0) {
- if (IS_ERR(root)) {
+ if (WARN_ON(IS_ERR(root))) {
/*
* For recovery we read the fs roots on mount,
* and if we didn't find the root then we marked
@@ -1998,17 +2030,14 @@ again:
* memory. However there's no reason we can't
* handle the error properly here just in case.
*/
- ASSERT(0);
ret = PTR_ERR(root);
goto out;
}
- if (root->reloc_root != reloc_root) {
+ if (WARN_ON(root->reloc_root != reloc_root)) {
/*
- * This is actually impossible without something
- * going really wrong (like weird race condition
- * or cosmic rays).
+ * This can happen if on-disk metadata has some
+ * corruption, e.g. bad reloc tree key offset.
*/
- ASSERT(0);
ret = -EINVAL;
goto out;
}
diff --git a/fs/btrfs/tree-checker.c b/fs/btrfs/tree-checker.c
index 038dfa8f1788..ab08a0b01311 100644
--- a/fs/btrfs/tree-checker.c
+++ b/fs/btrfs/tree-checker.c
@@ -446,6 +446,20 @@ static int check_root_key(struct extent_buffer *leaf, struct btrfs_key *key,
btrfs_item_key_to_cpu(leaf, &item_key, slot);
is_root_item = (item_key.type == BTRFS_ROOT_ITEM_KEY);
+ /*
+ * Bad rootid for reloc trees.
+ *
+ * Reloc trees are only for subvolume trees, other trees only need
+ * to be COWed to be relocated.
+ */
+ if (unlikely(is_root_item && key->objectid == BTRFS_TREE_RELOC_OBJECTID &&
+ !is_fstree(key->offset))) {
+ generic_err(leaf, slot,
+ "invalid reloc tree for root %lld, root id is not a subvolume tree",
+ key->offset);
+ return -EUCLEAN;
+ }
+
/* No such tree id */
if (unlikely(key->objectid == 0)) {
if (is_root_item)