summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--block/blk-merge.c3
-rw-r--r--fs/btrfs/Makefile6
-rw-r--r--fs/btrfs/backref.c33
-rw-r--r--fs/btrfs/bio.c557
-rw-r--r--fs/btrfs/bio.h67
-rw-r--r--fs/btrfs/block-group.c273
-rw-r--r--fs/btrfs/block-group.h24
-rw-r--r--fs/btrfs/btrfs_inode.h22
-rw-r--r--fs/btrfs/compression.c276
-rw-r--r--fs/btrfs/compression.h3
-rw-r--r--fs/btrfs/ctree.c62
-rw-r--r--fs/btrfs/ctree.h15
-rw-r--r--fs/btrfs/defrag.c4
-rw-r--r--fs/btrfs/delayed-ref.c24
-rw-r--r--fs/btrfs/delayed-ref.h2
-rw-r--r--fs/btrfs/discard.c41
-rw-r--r--fs/btrfs/disk-io.c225
-rw-r--r--fs/btrfs/disk-io.h14
-rw-r--r--fs/btrfs/extent-io-tree.c10
-rw-r--r--fs/btrfs/extent-io-tree.h1
-rw-r--r--fs/btrfs/extent-tree.c181
-rw-r--r--fs/btrfs/extent-tree.h81
-rw-r--r--fs/btrfs/extent_io.c582
-rw-r--r--fs/btrfs/extent_io.h36
-rw-r--r--fs/btrfs/file-item.c72
-rw-r--r--fs/btrfs/file-item.h8
-rw-r--r--fs/btrfs/file.c2
-rw-r--r--fs/btrfs/free-space-tree.c2
-rw-r--r--fs/btrfs/fs.c4
-rw-r--r--fs/btrfs/fs.h11
-rw-r--r--fs/btrfs/inode.c641
-rw-r--r--fs/btrfs/ioctl.c2
-rw-r--r--fs/btrfs/lru_cache.c166
-rw-r--r--fs/btrfs/lru_cache.h80
-rw-r--r--fs/btrfs/lzo.c2
-rw-r--r--fs/btrfs/messages.c30
-rw-r--r--fs/btrfs/messages.h34
-rw-r--r--fs/btrfs/ordered-data.c25
-rw-r--r--fs/btrfs/ordered-data.h3
-rw-r--r--fs/btrfs/qgroup.c2
-rw-r--r--fs/btrfs/raid56.c334
-rw-r--r--fs/btrfs/raid56.h4
-rw-r--r--fs/btrfs/relocation.c2
-rw-r--r--fs/btrfs/scrub.c51
-rw-r--r--fs/btrfs/send.c684
-rw-r--r--fs/btrfs/super.c3
-rw-r--r--fs/btrfs/sysfs.c41
-rw-r--r--fs/btrfs/sysfs.h3
-rw-r--r--fs/btrfs/tests/extent-map-tests.c2
-rw-r--r--fs/btrfs/transaction.c34
-rw-r--r--fs/btrfs/transaction.h31
-rw-r--r--fs/btrfs/tree-log.c87
-rw-r--r--fs/btrfs/tree-log.h9
-rw-r--r--fs/btrfs/volumes.c116
-rw-r--r--fs/btrfs/volumes.h18
-rw-r--r--fs/btrfs/zoned.c146
-rw-r--r--fs/btrfs/zoned.h20
-rw-r--r--fs/iomap/direct-io.c10
-rw-r--r--include/linux/bio.h4
-rw-r--r--include/linux/iomap.h3
-rw-r--r--include/trace/events/btrfs.h127
61 files changed, 2457 insertions, 2898 deletions
diff --git a/block/blk-merge.c b/block/blk-merge.c
index b7c193d67185..64bf7d9dd8e8 100644
--- a/block/blk-merge.c
+++ b/block/blk-merge.c
@@ -276,7 +276,7 @@ static bool bvec_split_segs(const struct queue_limits *lim,
* responsible for ensuring that @bs is only destroyed after processing of the
* split bio has finished.
*/
-static struct bio *bio_split_rw(struct bio *bio, const struct queue_limits *lim,
+struct bio *bio_split_rw(struct bio *bio, const struct queue_limits *lim,
unsigned *segs, struct bio_set *bs, unsigned max_bytes)
{
struct bio_vec bv, bvprv, *bvprvp = NULL;
@@ -336,6 +336,7 @@ split:
bio_clear_polled(bio);
return bio_split(bio, bytes >> SECTOR_SHIFT, GFP_NOIO, bs);
}
+EXPORT_SYMBOL_GPL(bio_split_rw);
/**
* __bio_split_to_limits - split a bio to fit the queue limits
diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile
index 555c962fdad6..90d53209755b 100644
--- a/fs/btrfs/Makefile
+++ b/fs/btrfs/Makefile
@@ -11,7 +11,8 @@ condflags := \
$(call cc-option, -Wunused-but-set-variable) \
$(call cc-option, -Wunused-const-variable) \
$(call cc-option, -Wpacked-not-aligned) \
- $(call cc-option, -Wstringop-truncation)
+ $(call cc-option, -Wstringop-truncation) \
+ $(call cc-option, -Wmaybe-uninitialized)
subdir-ccflags-y += $(condflags)
# The following turn off the warnings enabled by -Wextra
subdir-ccflags-y += -Wno-missing-field-initializers
@@ -31,7 +32,8 @@ btrfs-y += super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \
backref.o ulist.o qgroup.o send.o dev-replace.o raid56.o \
uuid-tree.o props.o free-space-tree.o tree-checker.o space-info.o \
block-rsv.o delalloc-space.o block-group.o discard.o reflink.o \
- subpage.o tree-mod-log.o extent-io-tree.o fs.o messages.o bio.o
+ subpage.o tree-mod-log.o extent-io-tree.o fs.o messages.o bio.o \
+ lru_cache.o
btrfs-$(CONFIG_BTRFS_FS_POSIX_ACL) += acl.o
btrfs-$(CONFIG_BTRFS_FS_CHECK_INTEGRITY) += check-integrity.o
diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c
index 46851511b661..90e40d5ceccd 100644
--- a/fs/btrfs/backref.c
+++ b/fs/btrfs/backref.c
@@ -1252,8 +1252,12 @@ static bool lookup_backref_shared_cache(struct btrfs_backref_share_check_ctx *ct
struct btrfs_root *root,
u64 bytenr, int level, bool *is_shared)
{
+ const struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_backref_shared_cache_entry *entry;
+ if (!current->journal_info)
+ lockdep_assert_held(&fs_info->commit_root_sem);
+
if (!ctx->use_path_cache)
return false;
@@ -1288,7 +1292,7 @@ static bool lookup_backref_shared_cache(struct btrfs_backref_share_check_ctx *ct
* could be a snapshot sharing this extent buffer.
*/
if (entry->is_shared &&
- entry->gen != btrfs_get_last_root_drop_gen(root->fs_info))
+ entry->gen != btrfs_get_last_root_drop_gen(fs_info))
return false;
*is_shared = entry->is_shared;
@@ -1318,9 +1322,13 @@ static void store_backref_shared_cache(struct btrfs_backref_share_check_ctx *ctx
struct btrfs_root *root,
u64 bytenr, int level, bool is_shared)
{
+ const struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_backref_shared_cache_entry *entry;
u64 gen;
+ if (!current->journal_info)
+ lockdep_assert_held(&fs_info->commit_root_sem);
+
if (!ctx->use_path_cache)
return;
@@ -1336,7 +1344,7 @@ static void store_backref_shared_cache(struct btrfs_backref_share_check_ctx *ctx
ASSERT(level >= 0);
if (is_shared)
- gen = btrfs_get_last_root_drop_gen(root->fs_info);
+ gen = btrfs_get_last_root_drop_gen(fs_info);
else
gen = btrfs_root_last_snapshot(&root->root_item);
@@ -1864,6 +1872,8 @@ int btrfs_is_data_extent_shared(struct btrfs_inode *inode, u64 bytenr,
.have_delayed_delete_refs = false,
};
int level;
+ bool leaf_cached;
+ bool leaf_is_shared;
for (int i = 0; i < BTRFS_BACKREF_CTX_PREV_EXTENTS_SIZE; i++) {
if (ctx->prev_extents_cache[i].bytenr == bytenr)
@@ -1885,6 +1895,23 @@ int btrfs_is_data_extent_shared(struct btrfs_inode *inode, u64 bytenr,
walk_ctx.time_seq = elem.seq;
}
+ ctx->use_path_cache = true;
+
+ /*
+ * We may have previously determined that the current leaf is shared.
+ * If it is, then we have a data extent that is shared due to a shared
+ * subtree (caused by snapshotting) and we don't need to check for data
+ * backrefs. If the leaf is not shared, then we must do backref walking
+ * to determine if the data extent is shared through reflinks.
+ */
+ leaf_cached = lookup_backref_shared_cache(ctx, root,
+ ctx->curr_leaf_bytenr, 0,
+ &leaf_is_shared);
+ if (leaf_cached && leaf_is_shared) {
+ ret = 1;
+ goto out_trans;
+ }
+
walk_ctx.ignore_extent_item_pos = true;
walk_ctx.trans = trans;
walk_ctx.fs_info = fs_info;
@@ -1893,7 +1920,6 @@ int btrfs_is_data_extent_shared(struct btrfs_inode *inode, u64 bytenr,
/* -1 means we are in the bytenr of the data extent. */
level = -1;
ULIST_ITER_INIT(&uiter);
- ctx->use_path_cache = true;
while (1) {
bool is_shared;
bool cached;
@@ -1964,6 +1990,7 @@ int btrfs_is_data_extent_shared(struct btrfs_inode *inode, u64 bytenr,
ctx->prev_extents_cache_slot = slot;
}
+out_trans:
if (trans) {
btrfs_put_tree_mod_seq(fs_info, &elem);
btrfs_end_transaction(trans);
diff --git a/fs/btrfs/bio.c b/fs/btrfs/bio.c
index 8affc88b0e0a..d8b90f95b157 100644
--- a/fs/btrfs/bio.c
+++ b/fs/btrfs/bio.c
@@ -14,19 +14,31 @@
#include "dev-replace.h"
#include "rcu-string.h"
#include "zoned.h"
+#include "file-item.h"
static struct bio_set btrfs_bioset;
+static struct bio_set btrfs_clone_bioset;
+static struct bio_set btrfs_repair_bioset;
+static mempool_t btrfs_failed_bio_pool;
+
+struct btrfs_failed_bio {
+ struct btrfs_bio *bbio; /* the read bio that failed and is being repaired */
+ int num_copies; /* number of mirrors, as returned by btrfs_num_copies() */
+ atomic_t repair_count; /* references; the final btrfs_repair_done() completes bbio */
+};
/*
* Initialize a btrfs_bio structure. This skips the embedded bio itself as it
* is already initialized by the block layer.
*/
-static inline void btrfs_bio_init(struct btrfs_bio *bbio,
- btrfs_bio_end_io_t end_io, void *private)
+void btrfs_bio_init(struct btrfs_bio *bbio, struct btrfs_inode *inode,
+ btrfs_bio_end_io_t end_io, void *private)
{
memset(bbio, 0, offsetof(struct btrfs_bio, bio));
+ bbio->inode = inode;
bbio->end_io = end_io;
bbio->private = private;
+ atomic_set(&bbio->pending_ios, 1);
}
/*
@@ -37,32 +49,235 @@ static inline void btrfs_bio_init(struct btrfs_bio *bbio,
* a mempool.
*/
struct bio *btrfs_bio_alloc(unsigned int nr_vecs, blk_opf_t opf,
+ struct btrfs_inode *inode,
btrfs_bio_end_io_t end_io, void *private)
{
struct bio *bio;
bio = bio_alloc_bioset(NULL, nr_vecs, opf, GFP_NOFS, &btrfs_bioset);
- btrfs_bio_init(btrfs_bio(bio), end_io, private);
+ btrfs_bio_init(btrfs_bio(bio), inode, end_io, private);
return bio;
}
-struct bio *btrfs_bio_clone_partial(struct bio *orig, u64 offset, u64 size,
- btrfs_bio_end_io_t end_io, void *private)
+static struct bio *btrfs_split_bio(struct btrfs_fs_info *fs_info,
+ struct bio *orig, u64 map_length,
+ bool use_append)
{
+ struct btrfs_bio *orig_bbio = btrfs_bio(orig);
struct bio *bio;
- struct btrfs_bio *bbio;
- ASSERT(offset <= UINT_MAX && size <= UINT_MAX);
+ if (use_append) {
+ unsigned int nr_segs;
+
+ bio = bio_split_rw(orig, &fs_info->limits, &nr_segs,
+ &btrfs_clone_bioset, map_length);
+ } else {
+ bio = bio_split(orig, map_length >> SECTOR_SHIFT, GFP_NOFS,
+ &btrfs_clone_bioset);
+ }
+ btrfs_bio_init(btrfs_bio(bio), orig_bbio->inode, NULL, orig_bbio);
- bio = bio_alloc_clone(orig->bi_bdev, orig, GFP_NOFS, &btrfs_bioset);
- bbio = btrfs_bio(bio);
- btrfs_bio_init(bbio, end_io, private);
+ btrfs_bio(bio)->file_offset = orig_bbio->file_offset;
+ if (!(orig->bi_opf & REQ_BTRFS_ONE_ORDERED))
+ orig_bbio->file_offset += map_length;
- bio_trim(bio, offset >> 9, size >> 9);
- bbio->iter = bio->bi_iter;
+ atomic_inc(&orig_bbio->pending_ios);
return bio;
}
+static void btrfs_orig_write_end_io(struct bio *bio);
+
+static void btrfs_bbio_propagate_error(struct btrfs_bio *bbio,
+ struct btrfs_bio *orig_bbio)
+{
+ /*
+ * For writes we tolerate nr_mirrors - 1 write failures, so we can't
+ * just blindly propagate a write failure here. Instead increment the
+ * error count in the original I/O context so that it is guaranteed to
+ * be larger than the error tolerance.
+ */
+ if (bbio->bio.bi_end_io == &btrfs_orig_write_end_io) {
+ struct btrfs_io_stripe *orig_stripe = orig_bbio->bio.bi_private;
+ struct btrfs_io_context *orig_bioc = orig_stripe->bioc;
+
+ atomic_add(orig_bioc->max_errors, &orig_bioc->error);
+ } else {
+ orig_bbio->bio.bi_status = bbio->bio.bi_status; /* all other cases: copy the status over */
+ }
+}
+
+static void btrfs_orig_bbio_end_io(struct btrfs_bio *bbio)
+{
+ if (bbio->bio.bi_pool == &btrfs_clone_bioset) { /* bbio was created by btrfs_split_bio() */
+ struct btrfs_bio *orig_bbio = bbio->private; /* btrfs_split_bio() stores the original here */
+
+ if (bbio->bio.bi_status)
+ btrfs_bbio_propagate_error(bbio, orig_bbio);
+ bio_put(&bbio->bio);
+ bbio = orig_bbio; /* from here on account against the original bio */
+ }
+
+ if (atomic_dec_and_test(&bbio->pending_ios)) /* only the last outstanding piece calls end_io */
+ bbio->end_io(bbio);
+}
+
+static int next_repair_mirror(struct btrfs_failed_bio *fbio, int cur_mirror)
+{
+ if (cur_mirror == fbio->num_copies) /* at the last mirror: wrap around to mirror 1 */
+ return cur_mirror + 1 - fbio->num_copies;
+ return cur_mirror + 1;
+}
+
+static int prev_repair_mirror(struct btrfs_failed_bio *fbio, int cur_mirror)
+{
+ if (cur_mirror == 1) /* at the first mirror: wrap around to the last one */
+ return fbio->num_copies;
+ return cur_mirror - 1;
+}
+
+static void btrfs_repair_done(struct btrfs_failed_bio *fbio)
+{
+ if (atomic_dec_and_test(&fbio->repair_count)) { /* last repair reference dropped */
+ btrfs_orig_bbio_end_io(fbio->bbio); /* complete the bio that originally failed */
+ mempool_free(fbio, &btrfs_failed_bio_pool);
+ }
+}
+
+static void btrfs_end_repair_bio(struct btrfs_bio *repair_bbio,
+ struct btrfs_device *dev)
+{
+ struct btrfs_failed_bio *fbio = repair_bbio->private;
+ struct btrfs_inode *inode = repair_bbio->inode;
+ struct btrfs_fs_info *fs_info = inode->root->fs_info;
+ struct bio_vec *bv = bio_first_bvec_all(&repair_bbio->bio);
+ int mirror = repair_bbio->mirror_num;
+
+ if (repair_bbio->bio.bi_status ||
+ !btrfs_data_csum_ok(repair_bbio, dev, 0, bv)) {
+ bio_reset(&repair_bbio->bio, NULL, REQ_OP_READ); /* this mirror is bad too: reuse the bio */
+ repair_bbio->bio.bi_iter = repair_bbio->saved_iter; /* restore the iter saved at submission */
+
+ mirror = next_repair_mirror(fbio, mirror);
+ if (mirror == fbio->bbio->mirror_num) { /* wrapped around to the mirror of the failed bio */
+ btrfs_debug(fs_info, "no mirror left");
+ fbio->bbio->bio.bi_status = BLK_STS_IOERR;
+ goto done;
+ }
+
+ btrfs_submit_bio(&repair_bbio->bio, mirror);
+ return;
+ }
+
+ do { /* good copy read: write it back to each mirror tried before this one */
+ mirror = prev_repair_mirror(fbio, mirror);
+ btrfs_repair_io_failure(fs_info, btrfs_ino(inode),
+ repair_bbio->file_offset, fs_info->sectorsize,
+ repair_bbio->saved_iter.bi_sector << SECTOR_SHIFT,
+ bv->bv_page, bv->bv_offset, mirror);
+ } while (mirror != fbio->bbio->mirror_num); /* stop after the originally failed mirror */
+
+done:
+ btrfs_repair_done(fbio); /* drops the reference taken in repair_one_sector() */
+ bio_put(&repair_bbio->bio);
+}
+
+/*
+ * Try to kick off a repair read to the next available mirror for a bad sector.
+ *
+ * This primarily tries to recover good data to serve the actual read request,
+ * but also tries to write the good data back to the bad mirror(s) when a
+ * read succeeded to restore the redundancy.
+ */
+static struct btrfs_failed_bio *repair_one_sector(struct btrfs_bio *failed_bbio,
+ u32 bio_offset,
+ struct bio_vec *bv,
+ struct btrfs_failed_bio *fbio)
+{
+ struct btrfs_inode *inode = failed_bbio->inode;
+ struct btrfs_fs_info *fs_info = inode->root->fs_info;
+ const u32 sectorsize = fs_info->sectorsize;
+ const u64 logical = (failed_bbio->saved_iter.bi_sector << SECTOR_SHIFT);
+ struct btrfs_bio *repair_bbio;
+ struct bio *repair_bio;
+ int num_copies;
+ int mirror;
+
+ btrfs_debug(fs_info, "repair read error: read error at %llu",
+ failed_bbio->file_offset + bio_offset);
+
+ num_copies = btrfs_num_copies(fs_info, logical, sectorsize);
+ if (num_copies == 1) {
+ btrfs_debug(fs_info, "no copy to repair from");
+ failed_bbio->bio.bi_status = BLK_STS_IOERR;
+ return fbio; /* returned unchanged; may still be NULL */
+ }
+
+ if (!fbio) { /* first bad sector of this bio: set up the shared repair state */
+ fbio = mempool_alloc(&btrfs_failed_bio_pool, GFP_NOFS);
+ fbio->bbio = failed_bbio;
+ fbio->num_copies = num_copies;
+ atomic_set(&fbio->repair_count, 1); /* initial reference, dropped by btrfs_check_read_bio() */
+ }
+
+ atomic_inc(&fbio->repair_count); /* one reference per repair bio, dropped in btrfs_end_repair_bio() */
+
+ repair_bio = bio_alloc_bioset(NULL, 1, REQ_OP_READ, GFP_NOFS,
+ &btrfs_repair_bioset);
+ repair_bio->bi_iter.bi_sector = failed_bbio->saved_iter.bi_sector; /* the caller advances saved_iter per sector */
+ bio_add_page(repair_bio, bv->bv_page, bv->bv_len, bv->bv_offset);
+
+ repair_bbio = btrfs_bio(repair_bio);
+ btrfs_bio_init(repair_bbio, failed_bbio->inode, NULL, fbio);
+ repair_bbio->file_offset = failed_bbio->file_offset + bio_offset;
+
+ mirror = next_repair_mirror(fbio, failed_bbio->mirror_num);
+ btrfs_debug(fs_info, "submitting repair read to mirror %d", mirror);
+ btrfs_submit_bio(repair_bio, mirror);
+ return fbio;
+}
+
+static void btrfs_check_read_bio(struct btrfs_bio *bbio, struct btrfs_device *dev)
+{
+ struct btrfs_inode *inode = bbio->inode;
+ struct btrfs_fs_info *fs_info = inode->root->fs_info;
+ u32 sectorsize = fs_info->sectorsize;
+ struct bvec_iter *iter = &bbio->saved_iter; /* advanced in place by the loop below */
+ blk_status_t status = bbio->bio.bi_status; /* remembered before it is cleared below */
+ struct btrfs_failed_bio *fbio = NULL;
+ u32 offset = 0;
+
+ /*
+ * Hand off repair bios to the repair code as there is no upper level
+ * submitter for them.
+ */
+ if (bbio->bio.bi_pool == &btrfs_repair_bioset) {
+ btrfs_end_repair_bio(bbio, dev);
+ return;
+ }
+
+ /* Clear the I/O error. A failed repair will reset it. */
+ bbio->bio.bi_status = BLK_STS_OK;
+
+ while (iter->bi_size) {
+ struct bio_vec bv = bio_iter_iovec(&bbio->bio, *iter);
+
+ bv.bv_len = min(bv.bv_len, sectorsize); /* handle one sector per iteration */
+ if (status || !btrfs_data_csum_ok(bbio, dev, offset, &bv))
+ fbio = repair_one_sector(bbio, offset, &bv, fbio);
+
+ bio_advance_iter_single(&bbio->bio, iter, sectorsize);
+ offset += sectorsize;
+ }
+
+ if (bbio->csum != bbio->csum_inline) /* do not kfree() the embedded inline array */
+ kfree(bbio->csum);
+
+ if (fbio) /* repairs were started: the last btrfs_repair_done() completes bbio */
+ btrfs_repair_done(fbio);
+ else
+ btrfs_orig_bbio_end_io(bbio);
+}
+
static void btrfs_log_dev_io_error(struct bio *bio, struct btrfs_device *dev)
{
if (!dev || !dev->bdev)
@@ -90,24 +305,31 @@ static void btrfs_end_bio_work(struct work_struct *work)
{
struct btrfs_bio *bbio = container_of(work, struct btrfs_bio, end_io_work);
- bbio->end_io(bbio);
+ /* Metadata reads are checked and repaired by the submitter. */
+ if (bbio->bio.bi_opf & REQ_META)
+ bbio->end_io(bbio);
+ else
+ btrfs_check_read_bio(bbio, bbio->bio.bi_private);
}
static void btrfs_simple_end_io(struct bio *bio)
{
- struct btrfs_fs_info *fs_info = bio->bi_private;
struct btrfs_bio *bbio = btrfs_bio(bio);
+ struct btrfs_device *dev = bio->bi_private;
+ struct btrfs_fs_info *fs_info = bbio->inode->root->fs_info;
btrfs_bio_counter_dec(fs_info);
if (bio->bi_status)
- btrfs_log_dev_io_error(bio, bbio->device);
+ btrfs_log_dev_io_error(bio, dev);
if (bio_op(bio) == REQ_OP_READ) {
INIT_WORK(&bbio->end_io_work, btrfs_end_bio_work);
queue_work(btrfs_end_io_wq(fs_info, bio), &bbio->end_io_work);
} else {
- bbio->end_io(bbio);
+ if (bio_op(bio) == REQ_OP_ZONE_APPEND)
+ btrfs_record_physical_zoned(bbio);
+ btrfs_orig_bbio_end_io(bbio);
}
}
@@ -118,7 +340,10 @@ static void btrfs_raid56_end_io(struct bio *bio)
btrfs_bio_counter_dec(bioc->fs_info);
bbio->mirror_num = bioc->mirror_num;
- bbio->end_io(bbio);
+ if (bio_op(bio) == REQ_OP_READ && !(bbio->bio.bi_opf & REQ_META))
+ btrfs_check_read_bio(bbio, NULL);
+ else
+ btrfs_orig_bbio_end_io(bbio);
btrfs_put_bioc(bioc);
}
@@ -145,7 +370,7 @@ static void btrfs_orig_write_end_io(struct bio *bio)
else
bio->bi_status = BLK_STS_OK;
- bbio->end_io(bbio);
+ btrfs_orig_bbio_end_io(bbio);
btrfs_put_bioc(bioc);
}
@@ -181,16 +406,10 @@ static void btrfs_submit_dev_bio(struct btrfs_device *dev, struct bio *bio)
*/
if (bio_op(bio) == REQ_OP_ZONE_APPEND) {
u64 physical = bio->bi_iter.bi_sector << SECTOR_SHIFT;
+ u64 zone_start = round_down(physical, dev->fs_info->zone_size);
- if (btrfs_dev_is_sequential(dev, physical)) {
- u64 zone_start = round_down(physical,
- dev->fs_info->zone_size);
-
- bio->bi_iter.bi_sector = zone_start >> SECTOR_SHIFT;
- } else {
- bio->bi_opf &= ~REQ_OP_ZONE_APPEND;
- bio->bi_opf |= REQ_OP_WRITE;
- }
+ ASSERT(btrfs_dev_is_sequential(dev, physical));
+ bio->bi_iter.bi_sector = zone_start >> SECTOR_SHIFT;
}
btrfs_debug_in_rcu(dev->fs_info,
"%s: rw %d 0x%x, sector=%llu, dev=%lu (%s id %llu), size=%u",
@@ -224,41 +443,21 @@ static void btrfs_submit_mirrored_bio(struct btrfs_io_context *bioc, int dev_nr)
btrfs_submit_dev_bio(bioc->stripes[dev_nr].dev, bio);
}
-void btrfs_submit_bio(struct btrfs_fs_info *fs_info, struct bio *bio, int mirror_num)
+static void __btrfs_submit_bio(struct bio *bio, struct btrfs_io_context *bioc,
+ struct btrfs_io_stripe *smap, int mirror_num)
{
- u64 logical = bio->bi_iter.bi_sector << 9;
- u64 length = bio->bi_iter.bi_size;
- u64 map_length = length;
- struct btrfs_io_context *bioc = NULL;
- struct btrfs_io_stripe smap;
- int ret;
-
- btrfs_bio_counter_inc_blocked(fs_info);
- ret = __btrfs_map_block(fs_info, btrfs_op(bio), logical, &map_length,
- &bioc, &smap, &mirror_num, 1);
- if (ret) {
- btrfs_bio_counter_dec(fs_info);
- btrfs_bio_end_io(btrfs_bio(bio), errno_to_blk_status(ret));
- return;
- }
-
- if (map_length < length) {
- btrfs_crit(fs_info,
- "mapping failed logical %llu bio len %llu len %llu",
- logical, length, map_length);
- BUG();
- }
+ /* Do not leak our private flag into the block layer. */
+ bio->bi_opf &= ~REQ_BTRFS_ONE_ORDERED;
if (!bioc) {
- /* Single mirror read/write fast path */
+ /* Single mirror read/write fast path. */
btrfs_bio(bio)->mirror_num = mirror_num;
- btrfs_bio(bio)->device = smap.dev;
- bio->bi_iter.bi_sector = smap.physical >> SECTOR_SHIFT;
- bio->bi_private = fs_info;
+ bio->bi_iter.bi_sector = smap->physical >> SECTOR_SHIFT;
+ bio->bi_private = smap->dev;
bio->bi_end_io = btrfs_simple_end_io;
- btrfs_submit_dev_bio(smap.dev, bio);
+ btrfs_submit_dev_bio(smap->dev, bio);
} else if (bioc->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
- /* Parity RAID write or read recovery */
+ /* Parity RAID write or read recovery. */
bio->bi_private = bioc;
bio->bi_end_io = btrfs_raid56_end_io;
if (bio_op(bio) == REQ_OP_READ)
@@ -266,16 +465,233 @@ void btrfs_submit_bio(struct btrfs_fs_info *fs_info, struct bio *bio, int mirror
else
raid56_parity_write(bio, bioc);
} else {
- /* Write to multiple mirrors */
+ /* Write to multiple mirrors. */
int total_devs = bioc->num_stripes;
- int dev_nr;
bioc->orig_bio = bio;
- for (dev_nr = 0; dev_nr < total_devs; dev_nr++)
+ for (int dev_nr = 0; dev_nr < total_devs; dev_nr++)
btrfs_submit_mirrored_bio(bioc, dev_nr);
}
}
+static blk_status_t btrfs_bio_csum(struct btrfs_bio *bbio)
+{
+ if (bbio->bio.bi_opf & REQ_META) /* metadata bios use the btree checksum helper */
+ return btree_csum_one_bio(bbio);
+ return btrfs_csum_one_bio(bbio);
+}
+
+/*
+ * Async submit bios are used to offload expensive checksumming onto the worker
+ * threads.
+ */
+struct async_submit_bio {
+ struct btrfs_bio *bbio; /* bio to checksum and then submit */
+ struct btrfs_io_context *bioc; /* passed on to __btrfs_submit_bio() */
+ struct btrfs_io_stripe smap; /* stored by value, see btrfs_wq_submit_bio() */
+ int mirror_num; /* passed on to __btrfs_submit_bio() */
+ struct btrfs_work work; /* used with container_of() to get back to this struct */
+};
+
+/*
+ * In order to insert checksums into the metadata in large chunks, we wait
+ * until bio submission time. All the pages in the bio are checksummed and
+ * sums are attached onto the ordered extent record.
+ *
+ * At IO completion time the csums attached on the ordered extent record are
+ * inserted into the btree.
+ */
+static void run_one_async_start(struct btrfs_work *work)
+{
+ struct async_submit_bio *async =
+ container_of(work, struct async_submit_bio, work);
+ blk_status_t ret;
+
+ ret = btrfs_bio_csum(async->bbio);
+ if (ret)
+ async->bbio->bio.bi_status = ret; /* run_one_async_done() checks this before submitting */
+}
+
+/*
+ * Submission stage of an async bio. It relies on run_one_async_start
+ * having run first: that stage checksums the bio and records any failure
+ * in bio->bi_status.
+ *
+ * A bio that carries an error is completed here without being submitted;
+ * otherwise it is passed to __btrfs_submit_bio with the saved mapping.
+ */
+static void run_one_async_done(struct btrfs_work *work)
+{
+ struct async_submit_bio *async =
+ container_of(work, struct async_submit_bio, work);
+ struct bio *bio = &async->bbio->bio;
+
+ /* If an error occurred we just want to clean up the bio and move on. */
+ if (bio->bi_status) {
+ btrfs_orig_bbio_end_io(async->bbio);
+ return;
+ }
+
+ /*
+ * All of the bios that pass through here are from async helpers.
+ * Use REQ_CGROUP_PUNT to issue them from the owning cgroup's context.
+ * This changes nothing when cgroups aren't in use.
+ */
+ bio->bi_opf |= REQ_CGROUP_PUNT;
+ __btrfs_submit_bio(bio, async->bioc, &async->smap, async->mirror_num);
+}
+
+static void run_one_async_free(struct btrfs_work *work)
+{
+ kfree(container_of(work, struct async_submit_bio, work)); /* allocated in btrfs_wq_submit_bio() */
+}
+
+static bool should_async_write(struct btrfs_bio *bbio)
+{
+ /*
+ * I/O issued by fsync and friends (->sync_writers != 0) is submitted
+ * synchronously. Otherwise try to defer the submission to a workqueue
+ * to parallelize the checksum calculation.
+ */
+ if (atomic_read(&bbio->inode->sync_writers))
+ return false;
+
+ /*
+ * Submit metadata writes synchronously if the checksum implementation
+ * is fast, or we are on a zoned device that wants I/O to be submitted
+ * in order.
+ */
+ if (bbio->bio.bi_opf & REQ_META) {
+ struct btrfs_fs_info *fs_info = bbio->inode->root->fs_info;
+
+ if (btrfs_is_zoned(fs_info))
+ return false;
+ if (test_bit(BTRFS_FS_CSUM_IMPL_FAST, &fs_info->flags))
+ return false;
+ }
+
+ return true;
+}
+
+/*
+ * Submit bio to an async queue.
+ *
+ * Return true if the work has been successfully submitted, else false.
+ */
+static bool btrfs_wq_submit_bio(struct btrfs_bio *bbio,
+ struct btrfs_io_context *bioc,
+ struct btrfs_io_stripe *smap, int mirror_num)
+{
+ struct btrfs_fs_info *fs_info = bbio->inode->root->fs_info;
+ struct async_submit_bio *async;
+
+ async = kmalloc(sizeof(*async), GFP_NOFS);
+ if (!async)
+ return false; /* btrfs_submit_chunk() then checksums and submits inline */
+
+ async->bbio = bbio;
+ async->bioc = bioc;
+ async->smap = *smap; /* copy: the caller's smap is a local in btrfs_submit_chunk() */
+ async->mirror_num = mirror_num;
+
+ btrfs_init_work(&async->work, run_one_async_start, run_one_async_done,
+ run_one_async_free);
+ if (op_is_sync(bbio->bio.bi_opf)) /* sync requests go to the hipri workers */
+ btrfs_queue_work(fs_info->hipri_workers, &async->work);
+ else
+ btrfs_queue_work(fs_info->workers, &async->work);
+ return true;
+}
+
+static bool btrfs_submit_chunk(struct bio *bio, int mirror_num)
+{
+ struct btrfs_bio *bbio = btrfs_bio(bio);
+ struct btrfs_inode *inode = bbio->inode;
+ struct btrfs_fs_info *fs_info = inode->root->fs_info;
+ struct btrfs_bio *orig_bbio = bbio; /* kept so errors are reported on the caller's bio */
+ u64 logical = bio->bi_iter.bi_sector << 9;
+ u64 length = bio->bi_iter.bi_size;
+ u64 map_length = length;
+ bool use_append = btrfs_use_zone_append(bbio);
+ struct btrfs_io_context *bioc = NULL;
+ struct btrfs_io_stripe smap;
+ blk_status_t ret;
+ int error;
+
+ btrfs_bio_counter_inc_blocked(fs_info);
+ error = __btrfs_map_block(fs_info, btrfs_op(bio), logical, &map_length,
+ &bioc, &smap, &mirror_num, 1);
+ if (error) {
+ ret = errno_to_blk_status(error);
+ goto fail;
+ }
+
+ map_length = min(map_length, length);
+ if (use_append)
+ map_length = min(map_length, fs_info->max_zone_append_size);
+
+ if (map_length < length) { /* only map_length bytes are submitted by this call */
+ bio = btrfs_split_bio(fs_info, bio, map_length, use_append);
+ bbio = btrfs_bio(bio); /* from here on bio/bbio refer to the split-off part */
+ }
+
+ /*
+ * Save the iter for the end_io handler and preload the checksums for
+ * data reads.
+ */
+ if (bio_op(bio) == REQ_OP_READ && !(bio->bi_opf & REQ_META)) {
+ bbio->saved_iter = bio->bi_iter;
+ ret = btrfs_lookup_bio_sums(bbio);
+ if (ret)
+ goto fail_put_bio;
+ }
+
+ if (btrfs_op(bio) == BTRFS_MAP_WRITE) {
+ if (use_append) {
+ bio->bi_opf &= ~REQ_OP_WRITE;
+ bio->bi_opf |= REQ_OP_ZONE_APPEND;
+ ret = btrfs_extract_ordered_extent(btrfs_bio(bio));
+ if (ret)
+ goto fail_put_bio;
+ }
+
+ /*
+ * Csum items for reloc roots have already been cloned at this
+ * point, so they are handled as part of the no-checksum case.
+ */
+ if (!(inode->flags & BTRFS_INODE_NODATASUM) &&
+ !test_bit(BTRFS_FS_STATE_NO_CSUMS, &fs_info->fs_state) &&
+ !btrfs_is_data_reloc_root(inode->root)) {
+ if (should_async_write(bbio) &&
+ btrfs_wq_submit_bio(bbio, bioc, &smap, mirror_num))
+ goto done; /* the queued work checksums and submits the bio */
+
+ ret = btrfs_bio_csum(bbio);
+ if (ret)
+ goto fail_put_bio;
+ }
+ }
+
+ __btrfs_submit_bio(bio, bioc, &smap, mirror_num);
+done:
+ return map_length == length; /* false: btrfs_submit_bio() calls again for the rest */
+
+fail_put_bio:
+ if (map_length < length) /* bio is the split-off part here, not the caller's bio */
+ bio_put(bio);
+fail:
+ btrfs_bio_counter_dec(fs_info);
+ btrfs_bio_end_io(orig_bbio, ret);
+ /* Do not submit another chunk */
+ return true;
+}
+
+void btrfs_submit_bio(struct bio *bio, int mirror_num)
+{
+ while (!btrfs_submit_chunk(bio, mirror_num)) /* false: part of bio is still unsubmitted */
+ ;
+}
+
/*
* Submit a repair write.
*
@@ -283,7 +699,7 @@ void btrfs_submit_bio(struct btrfs_fs_info *fs_info, struct bio *bio, int mirror
* RAID setup. Here we only want to write the one bad copy, so we do the
* mapping ourselves and submit the bio directly.
*
- * The I/O is issued sychronously to block the repair read completion from
+ * The I/O is issued synchronously to block the repair read completion from
* freeing the bio.
*/
int btrfs_repair_io_failure(struct btrfs_fs_info *fs_info, u64 ino, u64 start,
@@ -381,10 +797,31 @@ int __init btrfs_bioset_init(void)
offsetof(struct btrfs_bio, bio),
BIOSET_NEED_BVECS))
return -ENOMEM;
+ if (bioset_init(&btrfs_clone_bioset, BIO_POOL_SIZE,
+ offsetof(struct btrfs_bio, bio), 0))
+ goto out_free_bioset;
+ if (bioset_init(&btrfs_repair_bioset, BIO_POOL_SIZE,
+ offsetof(struct btrfs_bio, bio),
+ BIOSET_NEED_BVECS))
+ goto out_free_clone_bioset;
+ if (mempool_init_kmalloc_pool(&btrfs_failed_bio_pool, BIO_POOL_SIZE,
+ sizeof(struct btrfs_failed_bio)))
+ goto out_free_repair_bioset;
return 0;
+
+out_free_repair_bioset:
+ bioset_exit(&btrfs_repair_bioset);
+out_free_clone_bioset:
+ bioset_exit(&btrfs_clone_bioset);
+out_free_bioset:
+ bioset_exit(&btrfs_bioset);
+ return -ENOMEM;
}
void __cold btrfs_bioset_exit(void)
{
+ mempool_exit(&btrfs_failed_bio_pool);
+ bioset_exit(&btrfs_repair_bioset);
+ bioset_exit(&btrfs_clone_bioset);
bioset_exit(&btrfs_bioset);
}
diff --git a/fs/btrfs/bio.h b/fs/btrfs/bio.h
index b12f84b3b341..873ff85817f0 100644
--- a/fs/btrfs/bio.h
+++ b/fs/btrfs/bio.h
@@ -26,32 +26,23 @@ struct btrfs_fs_info;
typedef void (*btrfs_bio_end_io_t)(struct btrfs_bio *bbio);
/*
- * Additional info to pass along bio.
- *
- * Mostly for btrfs specific features like csum and mirror_num.
+ * High-level btrfs I/O structure. It is allocated by btrfs_bio_alloc and
+ * passed to btrfs_submit_bio for mapping to the physical devices.
*/
struct btrfs_bio {
- unsigned int mirror_num:7;
-
- /*
- * Extra indicator for metadata bios.
- * For some btrfs bios they use pages without a mapping, thus
- * we can not rely on page->mapping->host to determine if
- * it's a metadata bio.
- */
- unsigned int is_metadata:1;
- struct bvec_iter iter;
-
- /* for direct I/O */
+ /* Inode and offset into it that this I/O operates on. */
+ struct btrfs_inode *inode;
u64 file_offset;
- /* @device is for stripe IO submission. */
- struct btrfs_device *device;
union {
- /* For data checksum verification. */
+ /*
+ * Data checksumming and original I/O information for internal
+ * use in the btrfs_submit_bio machinery.
+ */
struct {
u8 *csum;
u8 csum_inline[BTRFS_BIO_INLINE_CSUM_SIZE];
+ struct bvec_iter saved_iter;
};
/* For metadata parentness verification. */
@@ -62,7 +53,9 @@ struct btrfs_bio {
btrfs_bio_end_io_t end_io;
void *private;
- /* For read end I/O handling */
+ /* For internal use in read end I/O handling */
+ unsigned int mirror_num;
+ atomic_t pending_ios;
struct work_struct end_io_work;
/*
@@ -80,11 +73,11 @@ static inline struct btrfs_bio *btrfs_bio(struct bio *bio)
int __init btrfs_bioset_init(void);
void __cold btrfs_bioset_exit(void);
+void btrfs_bio_init(struct btrfs_bio *bbio, struct btrfs_inode *inode,
+ btrfs_bio_end_io_t end_io, void *private);
struct bio *btrfs_bio_alloc(unsigned int nr_vecs, blk_opf_t opf,
+ struct btrfs_inode *inode,
btrfs_bio_end_io_t end_io, void *private);
-struct bio *btrfs_bio_clone_partial(struct bio *orig, u64 offset, u64 size,
- btrfs_bio_end_io_t end_io, void *private);
-
static inline void btrfs_bio_end_io(struct btrfs_bio *bbio, blk_status_t status)
{
@@ -92,34 +85,10 @@ static inline void btrfs_bio_end_io(struct btrfs_bio *bbio, blk_status_t status)
bbio->end_io(bbio);
}
-static inline void btrfs_bio_free_csum(struct btrfs_bio *bbio)
-{
- if (bbio->is_metadata)
- return;
- if (bbio->csum != bbio->csum_inline) {
- kfree(bbio->csum);
- bbio->csum = NULL;
- }
-}
+/* Bio only refers to one ordered extent. */
+#define REQ_BTRFS_ONE_ORDERED REQ_DRV
-/*
- * Iterate through a btrfs_bio (@bbio) on a per-sector basis.
- *
- * bvl - struct bio_vec
- * bbio - struct btrfs_bio
- * iters - struct bvec_iter
- * bio_offset - unsigned int
- */
-#define btrfs_bio_for_each_sector(fs_info, bvl, bbio, iter, bio_offset) \
- for ((iter) = (bbio)->iter, (bio_offset) = 0; \
- (iter).bi_size && \
- (((bvl) = bio_iter_iovec((&(bbio)->bio), (iter))), 1); \
- (bio_offset) += fs_info->sectorsize, \
- bio_advance_iter_single(&(bbio)->bio, &(iter), \
- (fs_info)->sectorsize))
-
-void btrfs_submit_bio(struct btrfs_fs_info *fs_info, struct bio *bio,
- int mirror_num);
+void btrfs_submit_bio(struct bio *bio, int mirror_num);
int btrfs_repair_io_failure(struct btrfs_fs_info *fs_info, u64 ino, u64 start,
u64 length, u64 logical, struct page *page,
unsigned int pg_offset, int mirror_num);
diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c
index 708d843daa72..5b10401d803b 100644
--- a/fs/btrfs/block-group.c
+++ b/fs/btrfs/block-group.c
@@ -1,5 +1,6 @@
// SPDX-License-Identifier: GPL-2.0
+#include <linux/sizes.h>
#include <linux/list_sort.h>
#include "misc.h"
#include "ctree.h"
@@ -539,6 +540,153 @@ u64 add_new_free_space(struct btrfs_block_group *block_group, u64 start, u64 end
return total_added;
}
+/*
+ * Get an arbitrary extent item index / max_index through the block group
+ *
+ * @caching_ctl: the caching control whose mutex is held by the caller
+ * @block_group: the block group to sample from
+ * @index: the integral step through the block group to grab from
+ * @max_index: the granularity of the sampling
+ * @key: return value parameter for the item we find
+ *
+ * Pre-conditions on indices:
+ * 0 <= index <= max_index
+ * 0 < max_index
+ *
+ * The search key starts at
+ * block_group->start + index * (block_group->length / max_index)
+ * and is stepped forward until an extent item that lies entirely inside the
+ * block group is seen, or until the key moves past the end of the block group.
+ *
+ * Locking: the caller must hold caching_ctl->mutex and must hold
+ * fs_info->commit_root_sem for read. Both are dropped and re-taken around
+ * cond_resched() between search attempts, and both are held again on return.
+ *
+ * Returns: 0 on success, 1 if the search didn't yield a useful item, negative
+ * error code on error.
+ */
+static int sample_block_group_extent_item(struct btrfs_caching_control *caching_ctl,
+ struct btrfs_block_group *block_group,
+ int index, int max_index,
+ struct btrfs_key *key)
+{
+ struct btrfs_fs_info *fs_info = block_group->fs_info;
+ struct btrfs_root *extent_root;
+ int ret = 0;
+ u64 search_offset;
+ u64 search_end = block_group->start + block_group->length;
+ struct btrfs_path *path;
+
+ ASSERT(index >= 0);
+ ASSERT(index <= max_index);
+ ASSERT(max_index > 0);
+ lockdep_assert_held(&caching_ctl->mutex);
+ lockdep_assert_held_read(&fs_info->commit_root_sem);
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+ extent_root = btrfs_extent_root(fs_info, max_t(u64, block_group->start,
+ BTRFS_SUPER_INFO_OFFSET));
+
+ path->skip_locking = 1;
+ path->search_commit_root = 1;
+ path->reada = READA_FORWARD;
+
+ search_offset = index * div_u64(block_group->length, max_index);
+ key->objectid = block_group->start + search_offset;
+ key->type = BTRFS_EXTENT_ITEM_KEY;
+ key->offset = 0;
+
+ while (1) {
+ ret = btrfs_search_forward(extent_root, key, path, 0);
+ if (ret != 0)
+ goto out;
+ /*
+ * Success; sampled an extent item in the block group. The item
+ * must start at or after the block group start and end at or
+ * before the block group end.
+ */
+ if (key->type == BTRFS_EXTENT_ITEM_KEY &&
+ key->objectid >= block_group->start &&
+ key->objectid + key->offset <= search_end)
+ goto out;
+
+ /* We can't possibly find a valid extent item anymore */
+ if (key->objectid >= search_end) {
+ ret = 1;
+ break;
+ }
+ if (key->type < BTRFS_EXTENT_ITEM_KEY)
+ key->type = BTRFS_EXTENT_ITEM_KEY;
+ else
+ key->objectid++;
+ btrfs_release_path(path);
+ up_read(&fs_info->commit_root_sem);
+ mutex_unlock(&caching_ctl->mutex);
+ cond_resched();
+ mutex_lock(&caching_ctl->mutex);
+ down_read(&fs_info->commit_root_sem);
+ }
+out:
+ lockdep_assert_held(&caching_ctl->mutex);
+ lockdep_assert_held_read(&fs_info->commit_root_sem);
+ btrfs_free_path(path);
+ return ret;
+}
+
+/*
+ * Best effort attempt to compute a block group's size class while caching it.
+ *
+ * @caching_ctl: the caching control, passed through to the extent item sampler
+ * @block_group: the block group we are caching
+ *
+ * We cannot infer the size class while adding free space extents, because that
+ * logic doesn't care about contiguous file extents (it doesn't differentiate
+ * between a 100M extent and 100 contiguous 1M extents). So we need to read the
+ * file extent items. Reading all of them is quite wasteful, because usually
+ * only a handful are enough to give a good answer. Therefore, we just grab 5 of
+ * them at even steps through the block group and pick the smallest size class
+ * we see. Since size class is best effort, and not guaranteed in general,
+ * inaccuracy is acceptable.
+ *
+ * To be more explicit about why this algorithm makes sense:
+ *
+ * If we are caching in a block group from disk, then there are three major cases
+ * to consider:
+ * 1. the block group is well behaved and all extents in it are the same size
+ * class.
+ * 2. the block group is mostly one size class with rare exceptions for last
+ * ditch allocations
+ * 3. the block group was populated before size classes and can have a totally
+ * arbitrary mix of size classes.
+ *
+ * In case 1, looking at any extent in the block group will yield the correct
+ * result. For the mixed cases, taking the minimum size class seems like a good
+ * approximation, since gaps from frees will be usable to the size class. In
+ * case 2, a small handful of file extents is likely to yield the right answer.
+ * In case 3, we can either read every file extent, or admit that this is best
+ * effort anyway and try to stay fast.
+ *
+ * Returns: 0 on success (including when no sample yielded a usable extent item
+ * and the size class is left untouched), negative error code on error.
+ */
+static int load_block_group_size_class(struct btrfs_caching_control *caching_ctl,
+ struct btrfs_block_group *block_group)
+{
+ struct btrfs_key key;
+ int i;
+ u64 min_size = block_group->length;
+ enum btrfs_block_group_size_class size_class = BTRFS_BG_SZ_NONE;
+ int ret;
+
+ if (!btrfs_block_group_should_use_size_class(block_group))
+ return 0;
+
+ for (i = 0; i < 5; ++i) {
+ ret = sample_block_group_extent_item(caching_ctl, block_group, i, 5, &key);
+ if (ret < 0)
+ return ret;
+ /* This sample found no usable extent item, try the next step. */
+ if (ret > 0)
+ continue;
+ min_size = min_t(u64, min_size, key.offset);
+ size_class = btrfs_calc_block_group_size_class(min_size);
+ }
+ if (size_class != BTRFS_BG_SZ_NONE) {
+ spin_lock(&block_group->lock);
+ block_group->size_class = size_class;
+ spin_unlock(&block_group->lock);
+ }
+
+ /*
+ * A positive result from the last sample only means "nothing found" and
+ * must not leak out as this function's return value.
+ */
+ return 0;
+}
+
static int load_extent_tree_free(struct btrfs_caching_control *caching_ctl)
{
struct btrfs_block_group *block_group = caching_ctl->block_group;
@@ -683,6 +831,7 @@ static noinline void caching_thread(struct btrfs_work *work)
mutex_lock(&caching_ctl->mutex);
down_read(&fs_info->commit_root_sem);
+ load_block_group_size_class(caching_ctl, block_group);
if (btrfs_test_opt(fs_info, SPACE_CACHE)) {
ret = load_free_space_cache(block_group);
if (ret == 1) {
@@ -1816,7 +1965,6 @@ static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags)
*
* @fs_info: the filesystem
* @chunk_start: logical address of block group
- * @bdev: physical device to resolve, can be NULL to indicate any device
* @physical: physical address to map to logical addresses
* @logical: return array of logical addresses which map to @physical
* @naddrs: length of @logical
@@ -1827,8 +1975,7 @@ static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags)
* block copies.
*/
int btrfs_rmap_block(struct btrfs_fs_info *fs_info, u64 chunk_start,
- struct block_device *bdev, u64 physical, u64 **logical,
- int *naddrs, int *stripe_len)
+ u64 physical, u64 **logical, int *naddrs, int *stripe_len)
{
struct extent_map *em;
struct map_lookup *map;
@@ -1868,9 +2015,6 @@ int btrfs_rmap_block(struct btrfs_fs_info *fs_info, u64 chunk_start,
data_stripe_length))
continue;
- if (bdev && map->stripes[i].dev->bdev != bdev)
- continue;
-
stripe_nr = physical - map->stripes[i].physical;
stripe_nr = div64_u64_rem(stripe_nr, map->stripe_len, &offset);
@@ -1927,7 +2071,7 @@ static int exclude_super_stripes(struct btrfs_block_group *cache)
for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
bytenr = btrfs_sb_offset(i);
- ret = btrfs_rmap_block(fs_info, cache->start, NULL,
+ ret = btrfs_rmap_block(fs_info, cache->start,
bytenr, &logical, &nr, &stripe_len);
if (ret)
return ret;
@@ -3330,7 +3474,7 @@ int btrfs_update_block_group(struct btrfs_trans_handle *trans,
spin_unlock(&info->delalloc_root_lock);
while (total) {
- bool reclaim;
+ bool reclaim = false;
cache = btrfs_lookup_block_group(info, bytenr);
if (!cache) {
@@ -3379,6 +3523,7 @@ int btrfs_update_block_group(struct btrfs_trans_handle *trans,
cache->space_info->disk_used -= num_bytes * factor;
reclaim = should_reclaim_block_group(cache, num_bytes);
+
spin_unlock(&cache->lock);
spin_unlock(&cache->space_info->lock);
@@ -3433,32 +3578,42 @@ int btrfs_update_block_group(struct btrfs_trans_handle *trans,
* reservation and return -EAGAIN, otherwise this function always succeeds.
*/
int btrfs_add_reserved_bytes(struct btrfs_block_group *cache,
- u64 ram_bytes, u64 num_bytes, int delalloc)
+ u64 ram_bytes, u64 num_bytes, int delalloc,
+ bool force_wrong_size_class)
{
struct btrfs_space_info *space_info = cache->space_info;
+ enum btrfs_block_group_size_class size_class;
int ret = 0;
spin_lock(&space_info->lock);
spin_lock(&cache->lock);
if (cache->ro) {
ret = -EAGAIN;
- } else {
- cache->reserved += num_bytes;
- space_info->bytes_reserved += num_bytes;
- trace_btrfs_space_reservation(cache->fs_info, "space_info",
- space_info->flags, num_bytes, 1);
- btrfs_space_info_update_bytes_may_use(cache->fs_info,
- space_info, -ram_bytes);
- if (delalloc)
- cache->delalloc_bytes += num_bytes;
+ goto out;
+ }
- /*
- * Compression can use less space than we reserved, so wake
- * tickets if that happens
- */
- if (num_bytes < ram_bytes)
- btrfs_try_granting_tickets(cache->fs_info, space_info);
+ if (btrfs_block_group_should_use_size_class(cache)) {
+ size_class = btrfs_calc_block_group_size_class(num_bytes);
+ ret = btrfs_use_block_group_size_class(cache, size_class, force_wrong_size_class);
+ if (ret)
+ goto out;
}
+ cache->reserved += num_bytes;
+ space_info->bytes_reserved += num_bytes;
+ trace_btrfs_space_reservation(cache->fs_info, "space_info",
+ space_info->flags, num_bytes, 1);
+ btrfs_space_info_update_bytes_may_use(cache->fs_info,
+ space_info, -ram_bytes);
+ if (delalloc)
+ cache->delalloc_bytes += num_bytes;
+
+ /*
+ * Compression can use less space than we reserved, so wake tickets if
+ * that happens.
+ */
+ if (num_bytes < ram_bytes)
+ btrfs_try_granting_tickets(cache->fs_info, space_info);
+out:
spin_unlock(&cache->lock);
spin_unlock(&space_info->lock);
return ret;
@@ -4218,3 +4373,73 @@ void btrfs_dec_block_group_swap_extents(struct btrfs_block_group *bg, int amount
bg->swap_extents -= amount;
spin_unlock(&bg->lock);
}
+
+/*
+ * Map an extent size in bytes to a block group size class.
+ *
+ * Sizes up to and including SZ_128K are small, sizes above that up to and
+ * including SZ_8M are medium, and everything larger is large.
+ */
+enum btrfs_block_group_size_class btrfs_calc_block_group_size_class(u64 size)
+{
+ if (size > SZ_8M)
+ return BTRFS_BG_SZ_LARGE;
+ if (size > SZ_128K)
+ return BTRFS_BG_SZ_MEDIUM;
+ return BTRFS_BG_SZ_SMALL;
+}
+
+/*
+ * Account for an allocation of a given size class landing in a block group.
+ *
+ * @bg: The block group we allocated in.
+ * @size_class: The size class of the allocation, must not be
+ * BTRFS_BG_SZ_NONE.
+ * @force_wrong_size_class: Whether we are desperate enough to allow
+ * mismatched size classes.
+ *
+ * Returns: 0 if the allocation may use this block group, -EAGAIN if the block
+ * group already has a different size class and force_wrong_size_class
+ * is not set.
+ *
+ * find_free_extent skips block groups whose size class does not match until it
+ * really needs to avoid ENOSPC, at which point it sets force_wrong_size_class.
+ * A newly allocated block group has no size class yet, so two allocations of
+ * different sizes can race and both pick it. The first one to get here claims
+ * the block group for its size class; the loser is caught here and has to
+ * retry.
+ */
+int btrfs_use_block_group_size_class(struct btrfs_block_group *bg,
+ enum btrfs_block_group_size_class size_class,
+ bool force_wrong_size_class)
+{
+ ASSERT(size_class != BTRFS_BG_SZ_NONE);
+
+ /* Matching size class: nothing to update. */
+ if (bg->size_class == size_class)
+ return 0;
+
+ /*
+ * First allocation in a block group that has no size class yet: the
+ * block group takes on the size class of this allocation.
+ */
+ if (bg->size_class == BTRFS_BG_SZ_NONE) {
+ bg->size_class = size_class;
+ return 0;
+ }
+
+ /*
+ * Mismatched size class. Either this allocation lost a race for an
+ * empty block group and must try again, or find_free_extent set
+ * force_wrong_size_class, in which case the allocation is allowed and
+ * the block group keeps its current size class.
+ */
+ return force_wrong_size_class ? 0 : -EAGAIN;
+}
+
+/*
+ * Tell whether size classes apply to a block group: only on filesystems that
+ * are not zoned, and only when btrfs_is_block_group_data_only() holds for it.
+ */
+bool btrfs_block_group_should_use_size_class(struct btrfs_block_group *bg)
+{
+ return !btrfs_is_zoned(bg->fs_info) &&
+ btrfs_is_block_group_data_only(bg);
+}
diff --git a/fs/btrfs/block-group.h b/fs/btrfs/block-group.h
index a02ea76fd6cf..6e4a0b429ac3 100644
--- a/fs/btrfs/block-group.h
+++ b/fs/btrfs/block-group.h
@@ -12,6 +12,17 @@ enum btrfs_disk_cache_state {
BTRFS_DC_SETUP,
};
+enum btrfs_block_group_size_class {
+ /* Unset: no size class has been assigned to the block group yet */
+ BTRFS_BG_SZ_NONE,
+ /* Small extents: 0 < size <= 128K */
+ BTRFS_BG_SZ_SMALL,
+ /* Medium extents: 128K < size <= 8M */
+ BTRFS_BG_SZ_MEDIUM,
+ /* Large extents: 8M < size < BG_LENGTH */
+ BTRFS_BG_SZ_LARGE,
+};
+
/*
* This describes the state of the block_group for async discard. This is due
* to the two pass nature of it where extent discarding is prioritized over
@@ -233,6 +244,7 @@ struct btrfs_block_group {
struct list_head active_bg_list;
struct work_struct zone_finish_work;
struct extent_buffer *last_eb;
+ enum btrfs_block_group_size_class size_class;
};
static inline u64 btrfs_block_group_end(struct btrfs_block_group *block_group)
@@ -302,7 +314,8 @@ int btrfs_setup_space_cache(struct btrfs_trans_handle *trans);
int btrfs_update_block_group(struct btrfs_trans_handle *trans,
u64 bytenr, u64 num_bytes, bool alloc);
int btrfs_add_reserved_bytes(struct btrfs_block_group *cache,
- u64 ram_bytes, u64 num_bytes, int delalloc);
+ u64 ram_bytes, u64 num_bytes, int delalloc,
+ bool force_wrong_size_class);
void btrfs_free_reserved_bytes(struct btrfs_block_group *cache,
u64 num_bytes, int delalloc);
int btrfs_chunk_alloc(struct btrfs_trans_handle *trans, u64 flags,
@@ -315,8 +328,7 @@ u64 btrfs_get_alloc_profile(struct btrfs_fs_info *fs_info, u64 orig_flags);
void btrfs_put_block_group_cache(struct btrfs_fs_info *info);
int btrfs_free_block_groups(struct btrfs_fs_info *info);
int btrfs_rmap_block(struct btrfs_fs_info *fs_info, u64 chunk_start,
- struct block_device *bdev, u64 physical, u64 **logical,
- int *naddrs, int *stripe_len);
+ u64 physical, u64 **logical, int *naddrs, int *stripe_len);
static inline u64 btrfs_data_alloc_profile(struct btrfs_fs_info *fs_info)
{
@@ -346,4 +358,10 @@ void btrfs_unfreeze_block_group(struct btrfs_block_group *cache);
bool btrfs_inc_block_group_swap_extents(struct btrfs_block_group *bg);
void btrfs_dec_block_group_swap_extents(struct btrfs_block_group *bg, int amount);
+enum btrfs_block_group_size_class btrfs_calc_block_group_size_class(u64 size);
+int btrfs_use_block_group_size_class(struct btrfs_block_group *bg,
+ enum btrfs_block_group_size_class size_class,
+ bool force_wrong_size_class);
+bool btrfs_block_group_should_use_size_class(struct btrfs_block_group *bg);
+
#endif /* BTRFS_BLOCK_GROUP_H */
diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index 7c1527fcc7b3..9dc21622806e 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -93,12 +93,6 @@ struct btrfs_inode {
/* the io_tree does range state (DIRTY, LOCKED etc) */
struct extent_io_tree io_tree;
- /* special utility tree used to record which mirrors have already been
- * tried when checksums fail for a given block
- */
- struct rb_root io_failure_tree;
- spinlock_t io_failure_lock;
-
/*
* Keep track of where the inode has extent items mapped in order to
* make sure the i_size adjustments are accurate
@@ -411,21 +405,11 @@ static inline void btrfs_inode_split_flags(u64 inode_item_flags,
#define CSUM_FMT "0x%*phN"
#define CSUM_FMT_VALUE(size, bytes) size, bytes
-void btrfs_submit_data_write_bio(struct btrfs_inode *inode, struct bio *bio, int mirror_num);
-void btrfs_submit_data_read_bio(struct btrfs_inode *inode, struct bio *bio,
- int mirror_num, enum btrfs_compression_type compress_type);
-void btrfs_submit_dio_repair_bio(struct btrfs_inode *inode, struct bio *bio, int mirror_num);
-blk_status_t btrfs_submit_bio_start(struct btrfs_inode *inode, struct bio *bio);
-blk_status_t btrfs_submit_bio_start_direct_io(struct btrfs_inode *inode,
- struct bio *bio,
- u64 dio_file_offset);
int btrfs_check_sector_csum(struct btrfs_fs_info *fs_info, struct page *page,
u32 pgoff, u8 *csum, const u8 * const csum_expected);
-int btrfs_check_data_csum(struct btrfs_inode *inode, struct btrfs_bio *bbio,
- u32 bio_offset, struct page *page, u32 pgoff);
-unsigned int btrfs_verify_data_csum(struct btrfs_bio *bbio,
- u32 bio_offset, struct page *page,
- u64 start, u64 end);
+blk_status_t btrfs_extract_ordered_extent(struct btrfs_bio *bbio);
+bool btrfs_data_csum_ok(struct btrfs_bio *bbio, struct btrfs_device *dev,
+ u32 bio_offset, struct bio_vec *bv);
noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len,
u64 *orig_start, u64 *orig_block_len,
u64 *ram_bytes, bool nowait, bool strict);
diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index 5122ca79f7ea..f42f31f22d13 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -141,12 +141,15 @@ static int compression_decompress(int type, struct list_head *ws,
static int btrfs_decompress_bio(struct compressed_bio *cb);
-static void finish_compressed_bio_read(struct compressed_bio *cb)
+static void end_compressed_bio_read(struct btrfs_bio *bbio)
{
+ struct compressed_bio *cb = bbio->private;
unsigned int index;
struct page *page;
- if (cb->status == BLK_STS_OK)
+ if (bbio->bio.bi_status)
+ cb->status = bbio->bio.bi_status;
+ else
cb->status = errno_to_blk_status(btrfs_decompress_bio(cb));
/* Release the compressed pages */
@@ -162,54 +165,6 @@ static void finish_compressed_bio_read(struct compressed_bio *cb)
/* Finally free the cb struct */
kfree(cb->compressed_pages);
kfree(cb);
-}
-
-/*
- * Verify the checksums and kick off repair if needed on the uncompressed data
- * before decompressing it into the original bio and freeing the uncompressed
- * pages.
- */
-static void end_compressed_bio_read(struct btrfs_bio *bbio)
-{
- struct compressed_bio *cb = bbio->private;
- struct inode *inode = cb->inode;
- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
- struct btrfs_inode *bi = BTRFS_I(inode);
- bool csum = !(bi->flags & BTRFS_INODE_NODATASUM) &&
- !test_bit(BTRFS_FS_STATE_NO_CSUMS, &fs_info->fs_state);
- blk_status_t status = bbio->bio.bi_status;
- struct bvec_iter iter;
- struct bio_vec bv;
- u32 offset;
-
- btrfs_bio_for_each_sector(fs_info, bv, bbio, iter, offset) {
- u64 start = bbio->file_offset + offset;
-
- if (!status &&
- (!csum || !btrfs_check_data_csum(bi, bbio, offset,
- bv.bv_page, bv.bv_offset))) {
- btrfs_clean_io_failure(bi, start, bv.bv_page,
- bv.bv_offset);
- } else {
- int ret;
-
- refcount_inc(&cb->pending_ios);
- ret = btrfs_repair_one_sector(BTRFS_I(inode), bbio, offset,
- bv.bv_page, bv.bv_offset,
- true);
- if (ret) {
- refcount_dec(&cb->pending_ios);
- status = errno_to_blk_status(ret);
- }
- }
- }
-
- if (status)
- cb->status = status;
-
- if (refcount_dec_and_test(&cb->pending_ios))
- finish_compressed_bio_read(cb);
- btrfs_bio_free_csum(bbio);
bio_put(&bbio->bio);
}
@@ -303,68 +258,12 @@ static void btrfs_finish_compressed_write_work(struct work_struct *work)
static void end_compressed_bio_write(struct btrfs_bio *bbio)
{
struct compressed_bio *cb = bbio->private;
-
- if (bbio->bio.bi_status)
- cb->status = bbio->bio.bi_status;
-
- if (refcount_dec_and_test(&cb->pending_ios)) {
- struct btrfs_fs_info *fs_info = btrfs_sb(cb->inode->i_sb);
-
- btrfs_record_physical_zoned(cb->inode, cb->start, &bbio->bio);
- queue_work(fs_info->compressed_write_workers, &cb->write_end_work);
- }
- bio_put(&bbio->bio);
-}
-
-/*
- * Allocate a compressed_bio, which will be used to read/write on-disk
- * (aka, compressed) * data.
- *
- * @cb: The compressed_bio structure, which records all the needed
- * information to bind the compressed data to the uncompressed
- * page cache.
- * @disk_byten: The logical bytenr where the compressed data will be read
- * from or written to.
- * @endio_func: The endio function to call after the IO for compressed data
- * is finished.
- * @next_stripe_start: Return value of logical bytenr of where next stripe starts.
- * Let the caller know to only fill the bio up to the stripe
- * boundary.
- */
-
-
-static struct bio *alloc_compressed_bio(struct compressed_bio *cb, u64 disk_bytenr,
- blk_opf_t opf,
- btrfs_bio_end_io_t endio_func,
- u64 *next_stripe_start)
-{
struct btrfs_fs_info *fs_info = btrfs_sb(cb->inode->i_sb);
- struct btrfs_io_geometry geom;
- struct extent_map *em;
- struct bio *bio;
- int ret;
- bio = btrfs_bio_alloc(BIO_MAX_VECS, opf, endio_func, cb);
- bio->bi_iter.bi_sector = disk_bytenr >> SECTOR_SHIFT;
+ cb->status = bbio->bio.bi_status;
+ queue_work(fs_info->compressed_write_workers, &cb->write_end_work);
- em = btrfs_get_chunk_map(fs_info, disk_bytenr, fs_info->sectorsize);
- if (IS_ERR(em)) {
- bio_put(bio);
- return ERR_CAST(em);
- }
-
- if (bio_op(bio) == REQ_OP_ZONE_APPEND)
- bio_set_dev(bio, em->map_lookup->stripes[0].dev->bdev);
-
- ret = btrfs_get_io_geometry(fs_info, em, btrfs_op(bio), disk_bytenr, &geom);
- free_extent_map(em);
- if (ret < 0) {
- bio_put(bio);
- return ERR_PTR(ret);
- }
- *next_stripe_start = disk_bytenr + geom.len;
- refcount_inc(&cb->pending_ios);
- return bio;
+ bio_put(&bbio->bio);
}
/*
@@ -389,18 +288,13 @@ blk_status_t btrfs_submit_compressed_write(struct btrfs_inode *inode, u64 start,
struct bio *bio = NULL;
struct compressed_bio *cb;
u64 cur_disk_bytenr = disk_start;
- u64 next_stripe_start;
blk_status_t ret = BLK_STS_OK;
- int skip_sum = inode->flags & BTRFS_INODE_NODATASUM;
- const bool use_append = btrfs_use_zone_append(inode, disk_start);
- const enum req_op bio_op = use_append ? REQ_OP_ZONE_APPEND : REQ_OP_WRITE;
ASSERT(IS_ALIGNED(start, fs_info->sectorsize) &&
IS_ALIGNED(len, fs_info->sectorsize));
cb = kmalloc(sizeof(struct compressed_bio), GFP_NOFS);
if (!cb)
return BLK_STS_RESOURCE;
- refcount_set(&cb->pending_ios, 1);
cb->status = BLK_STS_OK;
cb->inode = &inode->vfs_inode;
cb->start = start;
@@ -411,8 +305,16 @@ blk_status_t btrfs_submit_compressed_write(struct btrfs_inode *inode, u64 start,
INIT_WORK(&cb->write_end_work, btrfs_finish_compressed_write_work);
cb->nr_pages = nr_pages;
- if (blkcg_css)
+ if (blkcg_css) {
kthread_associate_blkcg(blkcg_css);
+ write_flags |= REQ_CGROUP_PUNT;
+ }
+
+ write_flags |= REQ_BTRFS_ONE_ORDERED;
+ bio = btrfs_bio_alloc(BIO_MAX_VECS, REQ_OP_WRITE | write_flags,
+ BTRFS_I(cb->inode), end_compressed_bio_write, cb);
+ bio->bi_iter.bi_sector = cur_disk_bytenr >> SECTOR_SHIFT;
+ btrfs_bio(bio)->file_offset = start;
while (cur_disk_bytenr < disk_start + compressed_len) {
u64 offset = cur_disk_bytenr - disk_start;
@@ -420,77 +322,30 @@ blk_status_t btrfs_submit_compressed_write(struct btrfs_inode *inode, u64 start,
unsigned int real_size;
unsigned int added;
struct page *page = compressed_pages[index];
- bool submit = false;
-
- /* Allocate new bio if submitted or not yet allocated */
- if (!bio) {
- bio = alloc_compressed_bio(cb, cur_disk_bytenr,
- bio_op | write_flags, end_compressed_bio_write,
- &next_stripe_start);
- if (IS_ERR(bio)) {
- ret = errno_to_blk_status(PTR_ERR(bio));
- break;
- }
- if (blkcg_css)
- bio->bi_opf |= REQ_CGROUP_PUNT;
- }
- /*
- * We should never reach next_stripe_start start as we will
- * submit comp_bio when reach the boundary immediately.
- */
- ASSERT(cur_disk_bytenr != next_stripe_start);
/*
* We have various limits on the real read size:
- * - stripe boundary
* - page boundary
* - compressed length boundary
*/
- real_size = min_t(u64, U32_MAX, next_stripe_start - cur_disk_bytenr);
- real_size = min_t(u64, real_size, PAGE_SIZE - offset_in_page(offset));
+ real_size = min_t(u64, U32_MAX, PAGE_SIZE - offset_in_page(offset));
real_size = min_t(u64, real_size, compressed_len - offset);
ASSERT(IS_ALIGNED(real_size, fs_info->sectorsize));
- if (use_append)
- added = bio_add_zone_append_page(bio, page, real_size,
- offset_in_page(offset));
- else
- added = bio_add_page(bio, page, real_size,
- offset_in_page(offset));
- /* Reached zoned boundary */
- if (added == 0)
- submit = true;
-
+ added = bio_add_page(bio, page, real_size, offset_in_page(offset));
+ /*
+ * Maximum compressed extent is smaller than bio size limit,
+ * thus bio_add_page() should always succeed.
+ */
+ ASSERT(added == real_size);
cur_disk_bytenr += added;
- /* Reached stripe boundary */
- if (cur_disk_bytenr == next_stripe_start)
- submit = true;
-
- /* Finished the range */
- if (cur_disk_bytenr == disk_start + compressed_len)
- submit = true;
-
- if (submit) {
- if (!skip_sum) {
- ret = btrfs_csum_one_bio(inode, bio, start, true);
- if (ret) {
- btrfs_bio_end_io(btrfs_bio(bio), ret);
- break;
- }
- }
-
- ASSERT(bio->bi_iter.bi_size);
- btrfs_submit_bio(fs_info, bio, 0);
- bio = NULL;
- }
- cond_resched();
}
+ /* Finished the range. */
+ ASSERT(bio->bi_iter.bi_size);
+ btrfs_submit_bio(bio, 0);
if (blkcg_css)
kthread_associate_blkcg(NULL);
-
- if (refcount_dec_and_test(&cb->pending_ios))
- finish_compressed_bio_write(cb);
return ret;
}
@@ -667,10 +522,9 @@ void btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
struct extent_map_tree *em_tree;
struct compressed_bio *cb;
unsigned int compressed_len;
- struct bio *comp_bio = NULL;
+ struct bio *comp_bio;
const u64 disk_bytenr = bio->bi_iter.bi_sector << SECTOR_SHIFT;
u64 cur_disk_byte = disk_bytenr;
- u64 next_stripe_start;
u64 file_offset;
u64 em_len;
u64 em_start;
@@ -703,7 +557,6 @@ void btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
goto out;
}
- refcount_set(&cb->pending_ios, 1);
cb->status = BLK_STS_OK;
cb->inode = inode;
@@ -737,37 +590,23 @@ void btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
/* include any pages we added in add_ra-bio_pages */
cb->len = bio->bi_iter.bi_size;
+ comp_bio = btrfs_bio_alloc(BIO_MAX_VECS, REQ_OP_READ, BTRFS_I(cb->inode),
+ end_compressed_bio_read, cb);
+ comp_bio->bi_iter.bi_sector = (cur_disk_byte >> SECTOR_SHIFT);
+
while (cur_disk_byte < disk_bytenr + compressed_len) {
u64 offset = cur_disk_byte - disk_bytenr;
unsigned int index = offset >> PAGE_SHIFT;
unsigned int real_size;
unsigned int added;
struct page *page = cb->compressed_pages[index];
- bool submit = false;
-
- /* Allocate new bio if submitted or not yet allocated */
- if (!comp_bio) {
- comp_bio = alloc_compressed_bio(cb, cur_disk_byte,
- REQ_OP_READ, end_compressed_bio_read,
- &next_stripe_start);
- if (IS_ERR(comp_bio)) {
- cb->status = errno_to_blk_status(PTR_ERR(comp_bio));
- break;
- }
- }
- /*
- * We should never reach next_stripe_start start as we will
- * submit comp_bio when reach the boundary immediately.
- */
- ASSERT(cur_disk_byte != next_stripe_start);
+
/*
* We have various limit on the real read size:
- * - stripe boundary
* - page boundary
* - compressed length boundary
*/
- real_size = min_t(u64, U32_MAX, next_stripe_start - cur_disk_byte);
- real_size = min_t(u64, real_size, PAGE_SIZE - offset_in_page(offset));
+ real_size = min_t(u64, U32_MAX, PAGE_SIZE - offset_in_page(offset));
real_size = min_t(u64, real_size, compressed_len - offset);
ASSERT(IS_ALIGNED(real_size, fs_info->sectorsize));
@@ -778,45 +617,20 @@ void btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
*/
ASSERT(added == real_size);
cur_disk_byte += added;
-
- /* Reached stripe boundary, need to submit */
- if (cur_disk_byte == next_stripe_start)
- submit = true;
-
- /* Has finished the range, need to submit */
- if (cur_disk_byte == disk_bytenr + compressed_len)
- submit = true;
-
- if (submit) {
- /* Save the original iter for read repair */
- if (bio_op(comp_bio) == REQ_OP_READ)
- btrfs_bio(comp_bio)->iter = comp_bio->bi_iter;
-
- /*
- * Save the initial offset of this chunk, as there
- * is no direct correlation between compressed pages and
- * the original file offset. The field is only used for
- * priting error messages.
- */
- btrfs_bio(comp_bio)->file_offset = file_offset;
-
- ret = btrfs_lookup_bio_sums(inode, comp_bio, NULL);
- if (ret) {
- btrfs_bio_end_io(btrfs_bio(comp_bio), ret);
- break;
- }
-
- ASSERT(comp_bio->bi_iter.bi_size);
- btrfs_submit_bio(fs_info, comp_bio, mirror_num);
- comp_bio = NULL;
- }
}
if (memstall)
psi_memstall_leave(&pflags);
- if (refcount_dec_and_test(&cb->pending_ios))
- finish_compressed_bio_read(cb);
+ /*
+ * Stash the initial offset of this chunk, as there is no direct
+ * correlation between compressed pages and the original file offset.
+ * The field is only used for printing error messages anyway.
+ */
+ btrfs_bio(comp_bio)->file_offset = file_offset;
+
+ ASSERT(comp_bio->bi_iter.bi_size);
+ btrfs_submit_bio(comp_bio, mirror_num);
return;
fail:
@@ -1609,7 +1423,7 @@ static void heuristic_collect_sample(struct inode *inode, u64 start, u64 end,
index_end = end >> PAGE_SHIFT;
/* Don't miss unaligned end */
- if (!IS_ALIGNED(end, PAGE_SIZE))
+ if (!PAGE_ALIGNED(end))
index_end++;
curr_sample_pos = 0;
@@ -1642,7 +1456,7 @@ static void heuristic_collect_sample(struct inode *inode, u64 start, u64 end,
*
* For now is's a naive and optimistic 'return true', we'll extend the logic to
* quickly (compared to direct compression) detect data characteristics
- * (compressible/uncompressible) to avoid wasting CPU time on uncompressible
+ * (compressible/incompressible) to avoid wasting CPU time on incompressible
* data.
*
* The following types of analysis can be performed:
diff --git a/fs/btrfs/compression.h b/fs/btrfs/compression.h
index 6209d40a1e08..a5e3377db9ad 100644
--- a/fs/btrfs/compression.h
+++ b/fs/btrfs/compression.h
@@ -31,9 +31,6 @@ static_assert((BTRFS_MAX_COMPRESSED % PAGE_SIZE) == 0);
#define BTRFS_ZLIB_DEFAULT_LEVEL 3
struct compressed_bio {
- /* Number of outstanding bios */
- refcount_t pending_ios;
-
/* Number of compressed pages in the array */
unsigned int nr_pages;
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 4754c9101a4c..a5b6bb54545f 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -484,7 +484,7 @@ static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans,
if (ret)
return ret;
}
- btrfs_clean_tree_block(buf);
+ btrfs_clear_buffer_dirty(trans, buf);
*last_ref = 1;
}
return 0;
@@ -853,8 +853,8 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans,
/*
* Search for a key in the given extent_buffer.
*
- * The lower boundary for the search is specified by the slot number @low. Use a
- * value of 0 to search over the whole extent buffer.
+ * The lower boundary for the search is specified by the slot number @first_slot.
+ * Use a value of 0 to search over the whole extent buffer.
*
* The slot in the extent buffer is returned via @slot. If the key exists in the
* extent buffer, then @slot will point to the slot where the key is, otherwise
@@ -863,18 +863,23 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans,
* Slot may point to the total number of items (i.e. one position beyond the last
* key) if the key is bigger than the last key in the extent buffer.
*/
-static noinline int generic_bin_search(struct extent_buffer *eb, int low,
- const struct btrfs_key *key, int *slot)
+int btrfs_generic_bin_search(struct extent_buffer *eb, int first_slot,
+ const struct btrfs_key *key, int *slot)
{
unsigned long p;
int item_size;
- int high = btrfs_header_nritems(eb);
+ /*
+ * Use unsigned types for the low and high slots, so that we get a more
+ * efficient division in the search loop below.
+ */
+ u32 low = first_slot;
+ u32 high = btrfs_header_nritems(eb);
int ret;
const int key_size = sizeof(struct btrfs_disk_key);
- if (low > high) {
+ if (unlikely(low > high)) {
btrfs_err(eb->fs_info,
- "%s: low (%d) > high (%d) eb %llu owner %llu level %d",
+ "%s: low (%u) > high (%u) eb %llu owner %llu level %d",
__func__, low, high, eb->start,
btrfs_header_owner(eb), btrfs_header_level(eb));
return -EINVAL;
@@ -925,16 +930,6 @@ static noinline int generic_bin_search(struct extent_buffer *eb, int low,
return 1;
}
-/*
- * Simple binary search on an extent buffer. Works for both leaves and nodes, and
- * always searches over the whole range of keys (slot 0 to slot 'nritems - 1').
- */
-int btrfs_bin_search(struct extent_buffer *eb, const struct btrfs_key *key,
- int *slot)
-{
- return generic_bin_search(eb, 0, key, slot);
-}
-
static void root_add_used(struct btrfs_root *root, u32 size)
{
spin_lock(&root->accounting_lock);
@@ -1054,7 +1049,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
path->locks[level] = 0;
path->nodes[level] = NULL;
- btrfs_clean_tree_block(mid);
+ btrfs_clear_buffer_dirty(trans, mid);
btrfs_tree_unlock(mid);
/* once for the path */
free_extent_buffer(mid);
@@ -1115,7 +1110,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
if (wret < 0 && wret != -ENOSPC)
ret = wret;
if (btrfs_header_nritems(right) == 0) {
- btrfs_clean_tree_block(right);
+ btrfs_clear_buffer_dirty(trans, right);
btrfs_tree_unlock(right);
del_ptr(root, path, level + 1, pslot + 1);
root_sub_used(root, right->len);
@@ -1161,7 +1156,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
BUG_ON(wret == 1);
}
if (btrfs_header_nritems(mid) == 0) {
- btrfs_clean_tree_block(mid);
+ btrfs_clear_buffer_dirty(trans, mid);
btrfs_tree_unlock(mid);
del_ptr(root, path, level + 1, pslot);
root_sub_used(root, mid->len);
@@ -1869,7 +1864,7 @@ static inline int search_for_key_slot(struct extent_buffer *eb,
return 0;
}
- return generic_bin_search(eb, search_low_slot, key, slot);
+ return btrfs_generic_bin_search(eb, search_low_slot, key, slot);
}
static int search_leaf(struct btrfs_trans_handle *trans,
@@ -3041,7 +3036,8 @@ noinline int btrfs_leaf_free_space(struct extent_buffer *leaf)
* min slot controls the lowest index we're willing to push to the
* right. We'll push up to and including min_slot, but no lower
*/
-static noinline int __push_leaf_right(struct btrfs_path *path,
+static noinline int __push_leaf_right(struct btrfs_trans_handle *trans,
+ struct btrfs_path *path,
int data_size, int empty,
struct extent_buffer *right,
int free_space, u32 left_nritems,
@@ -3139,7 +3135,7 @@ static noinline int __push_leaf_right(struct btrfs_path *path,
if (left_nritems)
btrfs_mark_buffer_dirty(left);
else
- btrfs_clean_tree_block(left);
+ btrfs_clear_buffer_dirty(trans, left);
btrfs_mark_buffer_dirty(right);
@@ -3151,7 +3147,7 @@ static noinline int __push_leaf_right(struct btrfs_path *path,
if (path->slots[0] >= left_nritems) {
path->slots[0] -= left_nritems;
if (btrfs_header_nritems(path->nodes[0]) == 0)
- btrfs_clean_tree_block(path->nodes[0]);
+ btrfs_clear_buffer_dirty(trans, path->nodes[0]);
btrfs_tree_unlock(path->nodes[0]);
free_extent_buffer(path->nodes[0]);
path->nodes[0] = right;
@@ -3243,8 +3239,8 @@ static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root
return 0;
}
- return __push_leaf_right(path, min_data_size, empty,
- right, free_space, left_nritems, min_slot);
+ return __push_leaf_right(trans, path, min_data_size, empty, right,
+ free_space, left_nritems, min_slot);
out_unlock:
btrfs_tree_unlock(right);
free_extent_buffer(right);
@@ -3259,7 +3255,8 @@ out_unlock:
* item at 'max_slot' won't be touched. Use (u32)-1 to make us do all the
* items
*/
-static noinline int __push_leaf_left(struct btrfs_path *path, int data_size,
+static noinline int __push_leaf_left(struct btrfs_trans_handle *trans,
+ struct btrfs_path *path, int data_size,
int empty, struct extent_buffer *left,
int free_space, u32 right_nritems,
u32 max_slot)
@@ -3363,7 +3360,7 @@ static noinline int __push_leaf_left(struct btrfs_path *path, int data_size,
if (right_nritems)
btrfs_mark_buffer_dirty(right);
else
- btrfs_clean_tree_block(right);
+ btrfs_clear_buffer_dirty(trans, right);
btrfs_item_key(right, &disk_key, 0);
fixup_low_keys(path, &disk_key, 1);
@@ -3449,9 +3446,8 @@ static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root
ret = -EUCLEAN;
goto out;
}
- return __push_leaf_left(path, min_data_size,
- empty, left, free_space, right_nritems,
- max_slot);
+ return __push_leaf_left(trans, path, min_data_size, empty, left,
+ free_space, right_nritems, max_slot);
out:
btrfs_tree_unlock(left);
free_extent_buffer(left);
@@ -4400,7 +4396,7 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
if (leaf == root->node) {
btrfs_set_header_level(leaf, 0);
} else {
- btrfs_clean_tree_block(leaf);
+ btrfs_clear_buffer_dirty(trans, leaf);
btrfs_del_leaf(trans, root, path, leaf);
}
} else {
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 6965703a81b6..97897107fab5 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -507,6 +507,21 @@ int btrfs_trim_fs(struct btrfs_fs_info *fs_info, struct fstrim_range *range);
/* ctree.c */
int __init btrfs_ctree_init(void);
void __cold btrfs_ctree_exit(void);
+
+int btrfs_generic_bin_search(struct extent_buffer *eb, int first_slot,
+ const struct btrfs_key *key, int *slot);
+
+/*
+ * Binary search on an extent buffer, leaf or node, over the whole range of keys
+ * (slot 0 to 'nritems - 1'): btrfs_generic_bin_search() with first_slot == 0.
+ */
+static inline int btrfs_bin_search(struct extent_buffer *eb,
+ const struct btrfs_key *key,
+ int *slot)
+{
+ return btrfs_generic_bin_search(eb, 0, key, slot);
+}
+
int btrfs_bin_search(struct extent_buffer *eb, const struct btrfs_key *key,
int *slot);
int __pure btrfs_comp_cpu_keys(const struct btrfs_key *k1, const struct btrfs_key *k2);
diff --git a/fs/btrfs/defrag.c b/fs/btrfs/defrag.c
index d81b764a7644..8065341d831a 100644
--- a/fs/btrfs/defrag.c
+++ b/fs/btrfs/defrag.c
@@ -765,7 +765,7 @@ again:
break;
unlock_page(page);
- btrfs_start_ordered_extent(ordered, 1);
+ btrfs_start_ordered_extent(ordered);
btrfs_put_ordered_extent(ordered);
lock_page(page);
/*
@@ -999,7 +999,7 @@ next:
}
#define CLUSTER_SIZE (SZ_256K)
-static_assert(IS_ALIGNED(CLUSTER_SIZE, PAGE_SIZE));
+static_assert(PAGE_ALIGNED(CLUSTER_SIZE));
/*
* Defrag one contiguous target range.
diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c
index 573ebab886e2..886ffb232eac 100644
--- a/fs/btrfs/delayed-ref.c
+++ b/fs/btrfs/delayed-ref.c
@@ -437,8 +437,7 @@ int btrfs_delayed_ref_lock(struct btrfs_delayed_ref_root *delayed_refs,
return 0;
}
-static inline void drop_delayed_ref(struct btrfs_trans_handle *trans,
- struct btrfs_delayed_ref_root *delayed_refs,
+static inline void drop_delayed_ref(struct btrfs_delayed_ref_root *delayed_refs,
struct btrfs_delayed_ref_head *head,
struct btrfs_delayed_ref_node *ref)
{
@@ -452,8 +451,7 @@ static inline void drop_delayed_ref(struct btrfs_trans_handle *trans,
atomic_dec(&delayed_refs->num_entries);
}
-static bool merge_ref(struct btrfs_trans_handle *trans,
- struct btrfs_delayed_ref_root *delayed_refs,
+static bool merge_ref(struct btrfs_delayed_ref_root *delayed_refs,
struct btrfs_delayed_ref_head *head,
struct btrfs_delayed_ref_node *ref,
u64 seq)
@@ -482,10 +480,10 @@ static bool merge_ref(struct btrfs_trans_handle *trans,
mod = -next->ref_mod;
}
- drop_delayed_ref(trans, delayed_refs, head, next);
+ drop_delayed_ref(delayed_refs, head, next);
ref->ref_mod += mod;
if (ref->ref_mod == 0) {
- drop_delayed_ref(trans, delayed_refs, head, ref);
+ drop_delayed_ref(delayed_refs, head, ref);
done = true;
} else {
/*
@@ -499,11 +497,10 @@ static bool merge_ref(struct btrfs_trans_handle *trans,
return done;
}
-void btrfs_merge_delayed_refs(struct btrfs_trans_handle *trans,
+void btrfs_merge_delayed_refs(struct btrfs_fs_info *fs_info,
struct btrfs_delayed_ref_root *delayed_refs,
struct btrfs_delayed_ref_head *head)
{
- struct btrfs_fs_info *fs_info = trans->fs_info;
struct btrfs_delayed_ref_node *ref;
struct rb_node *node;
u64 seq = 0;
@@ -524,7 +521,7 @@ again:
ref = rb_entry(node, struct btrfs_delayed_ref_node, ref_node);
if (seq && ref->seq >= seq)
continue;
- if (merge_ref(trans, delayed_refs, head, ref, seq))
+ if (merge_ref(delayed_refs, head, ref, seq))
goto again;
}
}
@@ -601,8 +598,7 @@ void btrfs_delete_ref_head(struct btrfs_delayed_ref_root *delayed_refs,
* Return 0 for insert.
* Return >0 for merge.
*/
-static int insert_delayed_ref(struct btrfs_trans_handle *trans,
- struct btrfs_delayed_ref_root *root,
+static int insert_delayed_ref(struct btrfs_delayed_ref_root *root,
struct btrfs_delayed_ref_head *href,
struct btrfs_delayed_ref_node *ref)
{
@@ -641,7 +637,7 @@ static int insert_delayed_ref(struct btrfs_trans_handle *trans,
/* remove existing tail if its ref_mod is zero */
if (exist->ref_mod == 0)
- drop_delayed_ref(trans, root, href, exist);
+ drop_delayed_ref(root, href, exist);
spin_unlock(&href->lock);
return ret;
inserted:
@@ -978,7 +974,7 @@ int btrfs_add_delayed_tree_ref(struct btrfs_trans_handle *trans,
head_ref = add_delayed_ref_head(trans, head_ref, record,
action, &qrecord_inserted);
- ret = insert_delayed_ref(trans, delayed_refs, head_ref, &ref->node);
+ ret = insert_delayed_ref(delayed_refs, head_ref, &ref->node);
spin_unlock(&delayed_refs->lock);
/*
@@ -1070,7 +1066,7 @@ int btrfs_add_delayed_data_ref(struct btrfs_trans_handle *trans,
head_ref = add_delayed_ref_head(trans, head_ref, record,
action, &qrecord_inserted);
- ret = insert_delayed_ref(trans, delayed_refs, head_ref, &ref->node);
+ ret = insert_delayed_ref(delayed_refs, head_ref, &ref->node);
spin_unlock(&delayed_refs->lock);
/*
diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h
index d6304b690ec4..2eb34abf700f 100644
--- a/fs/btrfs/delayed-ref.h
+++ b/fs/btrfs/delayed-ref.h
@@ -357,7 +357,7 @@ int btrfs_add_delayed_data_ref(struct btrfs_trans_handle *trans,
int btrfs_add_delayed_extent_op(struct btrfs_trans_handle *trans,
u64 bytenr, u64 num_bytes,
struct btrfs_delayed_extent_op *extent_op);
-void btrfs_merge_delayed_refs(struct btrfs_trans_handle *trans,
+void btrfs_merge_delayed_refs(struct btrfs_fs_info *fs_info,
struct btrfs_delayed_ref_root *delayed_refs,
struct btrfs_delayed_ref_head *head);
diff --git a/fs/btrfs/discard.c b/fs/btrfs/discard.c
index ff2e524d9937..317aeff6c1da 100644
--- a/fs/btrfs/discard.c
+++ b/fs/btrfs/discard.c
@@ -78,6 +78,7 @@ static struct list_head *get_discard_list(struct btrfs_discard_ctl *discard_ctl,
static void __add_to_discard_list(struct btrfs_discard_ctl *discard_ctl,
struct btrfs_block_group *block_group)
{
+ lockdep_assert_held(&discard_ctl->lock);
if (!btrfs_run_discard_work(discard_ctl))
return;
@@ -89,6 +90,8 @@ static void __add_to_discard_list(struct btrfs_discard_ctl *discard_ctl,
BTRFS_DISCARD_DELAY);
block_group->discard_state = BTRFS_DISCARD_RESET_CURSOR;
}
+ if (list_empty(&block_group->discard_list))
+ btrfs_get_block_group(block_group);
list_move_tail(&block_group->discard_list,
get_discard_list(discard_ctl, block_group));
@@ -108,8 +111,12 @@ static void add_to_discard_list(struct btrfs_discard_ctl *discard_ctl,
static void add_to_discard_unused_list(struct btrfs_discard_ctl *discard_ctl,
struct btrfs_block_group *block_group)
{
+ bool queued;
+
spin_lock(&discard_ctl->lock);
+ queued = !list_empty(&block_group->discard_list);
+
if (!btrfs_run_discard_work(discard_ctl)) {
spin_unlock(&discard_ctl->lock);
return;
@@ -121,6 +128,8 @@ static void add_to_discard_unused_list(struct btrfs_discard_ctl *discard_ctl,
block_group->discard_eligible_time = (ktime_get_ns() +
BTRFS_DISCARD_UNUSED_DELAY);
block_group->discard_state = BTRFS_DISCARD_RESET_CURSOR;
+ if (!queued)
+ btrfs_get_block_group(block_group);
list_add_tail(&block_group->discard_list,
&discard_ctl->discard_list[BTRFS_DISCARD_INDEX_UNUSED]);
@@ -131,6 +140,7 @@ static bool remove_from_discard_list(struct btrfs_discard_ctl *discard_ctl,
struct btrfs_block_group *block_group)
{
bool running = false;
+ bool queued = false;
spin_lock(&discard_ctl->lock);
@@ -140,7 +150,16 @@ static bool remove_from_discard_list(struct btrfs_discard_ctl *discard_ctl,
}
block_group->discard_eligible_time = 0;
+ queued = !list_empty(&block_group->discard_list);
list_del_init(&block_group->discard_list);
+ /*
+ * If the block group is currently running in the discard workfn, we
+ * don't want to deref it, since it's still being used by the workfn.
+ * The workfn will notice this case and deref the block group when it is
+ * finished.
+ */
+ if (queued && !running)
+ btrfs_put_block_group(block_group);
spin_unlock(&discard_ctl->lock);
@@ -214,10 +233,12 @@ again:
if (block_group && now >= block_group->discard_eligible_time) {
if (block_group->discard_index == BTRFS_DISCARD_INDEX_UNUSED &&
block_group->used != 0) {
- if (btrfs_is_block_group_data_only(block_group))
+ if (btrfs_is_block_group_data_only(block_group)) {
__add_to_discard_list(discard_ctl, block_group);
- else
+ } else {
list_del_init(&block_group->discard_list);
+ btrfs_put_block_group(block_group);
+ }
goto again;
}
if (block_group->discard_state == BTRFS_DISCARD_RESET_CURSOR) {
@@ -511,6 +532,15 @@ static void btrfs_discard_workfn(struct work_struct *work)
spin_lock(&discard_ctl->lock);
discard_ctl->prev_discard = trimmed;
discard_ctl->prev_discard_time = now;
+ /*
+ * If the block group was removed from the discard list while it was
+ * running in this workfn, then we didn't deref it, since this function
+ * still owned that reference. But we set the discard_ctl->block_group
+ * back to NULL, so we can use that condition to know that now we need
+ * to deref the block_group.
+ */
+ if (discard_ctl->block_group == NULL)
+ btrfs_put_block_group(block_group);
discard_ctl->block_group = NULL;
__btrfs_discard_schedule_work(discard_ctl, now, false);
spin_unlock(&discard_ctl->lock);
@@ -651,8 +681,12 @@ void btrfs_discard_punt_unused_bgs_list(struct btrfs_fs_info *fs_info)
list_for_each_entry_safe(block_group, next, &fs_info->unused_bgs,
bg_list) {
list_del_init(&block_group->bg_list);
- btrfs_put_block_group(block_group);
btrfs_discard_queue_work(&fs_info->discard_ctl, block_group);
+ /*
+ * This put is for the get done by btrfs_mark_bg_unused.
+ * Queueing discard incremented it for discard's reference.
+ */
+ btrfs_put_block_group(block_group);
}
spin_unlock(&fs_info->unused_bgs_lock);
}
@@ -683,6 +717,7 @@ static void btrfs_discard_purge_list(struct btrfs_discard_ctl *discard_ctl)
if (block_group->used == 0)
btrfs_mark_bg_unused(block_group);
spin_lock(&discard_ctl->lock);
+ btrfs_put_block_group(block_group);
}
}
spin_unlock(&discard_ctl->lock);
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 3aa04224315e..b53f0e30ce2b 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -79,23 +79,6 @@ static void btrfs_free_csum_hash(struct btrfs_fs_info *fs_info)
}
/*
- * async submit bios are used to offload expensive checksumming
- * onto the worker threads. They checksum file and metadata bios
- * just before they are sent down the IO stack.
- */
-struct async_submit_bio {
- struct btrfs_inode *inode;
- struct bio *bio;
- enum btrfs_wq_submit_cmd submit_cmd;
- int mirror_num;
-
- /* Optional parameter for used by direct io */
- u64 dio_file_offset;
- struct btrfs_work work;
- blk_status_t status;
-};
-
-/*
* Compute the csum of a btree block and store the result to provided buffer.
*/
static void csum_tree_block(struct extent_buffer *buf, u8 *result)
@@ -455,6 +438,22 @@ static int csum_dirty_buffer(struct btrfs_fs_info *fs_info, struct bio_vec *bvec
return csum_one_extent_buffer(eb);
}
+blk_status_t btree_csum_one_bio(struct btrfs_bio *bbio)
+{
+ struct btrfs_fs_info *fs_info = bbio->inode->root->fs_info;
+ struct bvec_iter iter;
+ struct bio_vec bv;
+ int ret = 0;
+
+ bio_for_each_segment(bv, &bbio->bio, iter) {
+ ret = csum_dirty_buffer(fs_info, &bv);
+ if (ret)
+ break;
+ }
+
+ return errno_to_blk_status(ret);
+}
+
static int check_tree_block_fsid(struct extent_buffer *eb)
{
struct btrfs_fs_info *fs_info = eb->fs_info;
@@ -700,172 +699,6 @@ err:
return ret;
}
-static void run_one_async_start(struct btrfs_work *work)
-{
- struct async_submit_bio *async;
- blk_status_t ret;
-
- async = container_of(work, struct async_submit_bio, work);
- switch (async->submit_cmd) {
- case WQ_SUBMIT_METADATA:
- ret = btree_submit_bio_start(async->bio);
- break;
- case WQ_SUBMIT_DATA:
- ret = btrfs_submit_bio_start(async->inode, async->bio);
- break;
- case WQ_SUBMIT_DATA_DIO:
- ret = btrfs_submit_bio_start_direct_io(async->inode,
- async->bio, async->dio_file_offset);
- break;
- }
- if (ret)
- async->status = ret;
-}
-
-/*
- * In order to insert checksums into the metadata in large chunks, we wait
- * until bio submission time. All the pages in the bio are checksummed and
- * sums are attached onto the ordered extent record.
- *
- * At IO completion time the csums attached on the ordered extent record are
- * inserted into the tree.
- */
-static void run_one_async_done(struct btrfs_work *work)
-{
- struct async_submit_bio *async =
- container_of(work, struct async_submit_bio, work);
- struct btrfs_inode *inode = async->inode;
- struct btrfs_bio *bbio = btrfs_bio(async->bio);
-
- /* If an error occurred we just want to clean up the bio and move on */
- if (async->status) {
- btrfs_bio_end_io(bbio, async->status);
- return;
- }
-
- /*
- * All of the bios that pass through here are from async helpers.
- * Use REQ_CGROUP_PUNT to issue them from the owning cgroup's context.
- * This changes nothing when cgroups aren't in use.
- */
- async->bio->bi_opf |= REQ_CGROUP_PUNT;
- btrfs_submit_bio(inode->root->fs_info, async->bio, async->mirror_num);
-}
-
-static void run_one_async_free(struct btrfs_work *work)
-{
- struct async_submit_bio *async;
-
- async = container_of(work, struct async_submit_bio, work);
- kfree(async);
-}
-
-/*
- * Submit bio to an async queue.
- *
- * Retrun:
- * - true if the work has been succesfuly submitted
- * - false in case of error
- */
-bool btrfs_wq_submit_bio(struct btrfs_inode *inode, struct bio *bio, int mirror_num,
- u64 dio_file_offset, enum btrfs_wq_submit_cmd cmd)
-{
- struct btrfs_fs_info *fs_info = inode->root->fs_info;
- struct async_submit_bio *async;
-
- async = kmalloc(sizeof(*async), GFP_NOFS);
- if (!async)
- return false;
-
- async->inode = inode;
- async->bio = bio;
- async->mirror_num = mirror_num;
- async->submit_cmd = cmd;
-
- btrfs_init_work(&async->work, run_one_async_start, run_one_async_done,
- run_one_async_free);
-
- async->dio_file_offset = dio_file_offset;
-
- async->status = 0;
-
- if (op_is_sync(bio->bi_opf))
- btrfs_queue_work(fs_info->hipri_workers, &async->work);
- else
- btrfs_queue_work(fs_info->workers, &async->work);
- return true;
-}
-
-static blk_status_t btree_csum_one_bio(struct bio *bio)
-{
- struct bio_vec *bvec;
- struct btrfs_root *root;
- int ret = 0;
- struct bvec_iter_all iter_all;
-
- ASSERT(!bio_flagged(bio, BIO_CLONED));
- bio_for_each_segment_all(bvec, bio, iter_all) {
- root = BTRFS_I(bvec->bv_page->mapping->host)->root;
- ret = csum_dirty_buffer(root->fs_info, bvec);
- if (ret)
- break;
- }
-
- return errno_to_blk_status(ret);
-}
-
-blk_status_t btree_submit_bio_start(struct bio *bio)
-{
- /*
- * when we're called for a write, we're already in the async
- * submission context. Just jump into btrfs_submit_bio.
- */
- return btree_csum_one_bio(bio);
-}
-
-static bool should_async_write(struct btrfs_fs_info *fs_info,
- struct btrfs_inode *bi)
-{
- if (btrfs_is_zoned(fs_info))
- return false;
- if (atomic_read(&bi->sync_writers))
- return false;
- if (test_bit(BTRFS_FS_CSUM_IMPL_FAST, &fs_info->flags))
- return false;
- return true;
-}
-
-void btrfs_submit_metadata_bio(struct btrfs_inode *inode, struct bio *bio, int mirror_num)
-{
- struct btrfs_fs_info *fs_info = inode->root->fs_info;
- struct btrfs_bio *bbio = btrfs_bio(bio);
- blk_status_t ret;
-
- bio->bi_opf |= REQ_META;
- bbio->is_metadata = 1;
-
- if (btrfs_op(bio) != BTRFS_MAP_WRITE) {
- btrfs_submit_bio(fs_info, bio, mirror_num);
- return;
- }
-
- /*
- * Kthread helpers are used to submit writes so that checksumming can
- * happen in parallel across all CPUs.
- */
- if (should_async_write(fs_info, inode) &&
- btrfs_wq_submit_bio(inode, bio, mirror_num, 0, WQ_SUBMIT_METADATA))
- return;
-
- ret = btree_csum_one_bio(bio);
- if (ret) {
- btrfs_bio_end_io(bbio, ret);
- return;
- }
-
- btrfs_submit_bio(fs_info, bio, mirror_num);
-}
-
#ifdef CONFIG_MIGRATION
static int btree_migrate_folio(struct address_space *mapping,
struct folio *dst, struct folio *src, enum migrate_mode mode)
@@ -1035,22 +868,6 @@ struct extent_buffer *read_tree_block(struct btrfs_fs_info *fs_info, u64 bytenr,
}
-void btrfs_clean_tree_block(struct extent_buffer *buf)
-{
- struct btrfs_fs_info *fs_info = buf->fs_info;
- if (btrfs_header_generation(buf) ==
- fs_info->running_transaction->transid) {
- btrfs_assert_tree_write_locked(buf);
-
- if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &buf->bflags)) {
- percpu_counter_add_batch(&fs_info->dirty_metadata_bytes,
- -buf->len,
- fs_info->dirty_metadata_batch);
- clear_extent_buffer_dirty(buf);
- }
- }
-}
-
static void __setup_root(struct btrfs_root *root, struct btrfs_fs_info *fs_info,
u64 objectid)
{
@@ -1910,6 +1727,9 @@ static int cleaner_kthread(void *arg)
goto sleep;
}
+ if (test_and_clear_bit(BTRFS_FS_FEATURE_CHANGED, &fs_info->flags))
+ btrfs_sysfs_feature_update(fs_info);
+
btrfs_run_delayed_iputs(fs_info);
again = btrfs_clean_one_deleted_snapshot(fs_info);
@@ -5159,11 +4979,12 @@ static int btrfs_destroy_marked_extents(struct btrfs_fs_info *fs_info,
start += fs_info->nodesize;
if (!eb)
continue;
+
+ btrfs_tree_lock(eb);
wait_on_extent_buffer_writeback(eb);
+ btrfs_clear_buffer_dirty(NULL, eb);
+ btrfs_tree_unlock(eb);
- if (test_and_clear_bit(EXTENT_BUFFER_DIRTY,
- &eb->bflags))
- clear_extent_buffer_dirty(eb);
free_extent_buffer_stale(eb);
}
}
diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
index f2f295eb6103..4d5772330110 100644
--- a/fs/btrfs/disk-io.h
+++ b/fs/btrfs/disk-io.h
@@ -39,7 +39,8 @@ struct extent_buffer *btrfs_find_create_tree_block(
struct btrfs_fs_info *fs_info,
u64 bytenr, u64 owner_root,
int level);
-void btrfs_clean_tree_block(struct extent_buffer *buf);
+void btrfs_clear_buffer_dirty(struct btrfs_trans_handle *trans,
+ struct extent_buffer *buf);
void btrfs_clear_oneshot_options(struct btrfs_fs_info *fs_info);
int btrfs_start_pre_rw_mount(struct btrfs_fs_info *fs_info);
int btrfs_check_super_csum(struct btrfs_fs_info *fs_info,
@@ -86,7 +87,6 @@ void btrfs_drop_and_free_fs_root(struct btrfs_fs_info *fs_info,
int btrfs_validate_metadata_buffer(struct btrfs_bio *bbio,
struct page *page, u64 start, u64 end,
int mirror);
-void btrfs_submit_metadata_bio(struct btrfs_inode *inode, struct bio *bio, int mirror_num);
#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
struct btrfs_root *btrfs_alloc_dummy_root(struct btrfs_fs_info *fs_info);
#endif
@@ -114,15 +114,7 @@ int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid,
int btrfs_read_extent_buffer(struct extent_buffer *buf,
struct btrfs_tree_parent_check *check);
-enum btrfs_wq_submit_cmd {
- WQ_SUBMIT_METADATA,
- WQ_SUBMIT_DATA,
- WQ_SUBMIT_DATA_DIO,
-};
-
-bool btrfs_wq_submit_bio(struct btrfs_inode *inode, struct bio *bio, int mirror_num,
- u64 dio_file_offset, enum btrfs_wq_submit_cmd cmd);
-blk_status_t btree_submit_bio_start(struct bio *bio);
+blk_status_t btree_csum_one_bio(struct btrfs_bio *bbio);
int btrfs_alloc_log_tree_node(struct btrfs_trans_handle *trans,
struct btrfs_root *root);
int btrfs_init_log_root_tree(struct btrfs_trans_handle *trans,
diff --git a/fs/btrfs/extent-io-tree.c b/fs/btrfs/extent-io-tree.c
index 3c7766dfaa69..29a225836e28 100644
--- a/fs/btrfs/extent-io-tree.c
+++ b/fs/btrfs/extent-io-tree.c
@@ -972,8 +972,8 @@ static int __set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
{
struct extent_state *state;
struct extent_state *prealloc = NULL;
- struct rb_node **p;
- struct rb_node *parent;
+ struct rb_node **p = NULL;
+ struct rb_node *parent = NULL;
int err = 0;
u64 last_start;
u64 last_end;
@@ -1218,8 +1218,8 @@ int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
{
struct extent_state *state;
struct extent_state *prealloc = NULL;
- struct rb_node **p;
- struct rb_node *parent;
+ struct rb_node **p = NULL;
+ struct rb_node *parent = NULL;
int err = 0;
u64 last_start;
u64 last_end;
@@ -1625,7 +1625,7 @@ search:
}
/*
- * Searche a range in the state tree for a given mask. If 'filled' == 1, this
+ * Search a range in the state tree for a given mask. If 'filled' == 1, this
* returns 1 only if every extent in the tree has the bits set. Otherwise, 1
* is returned if any bit in the range is found set.
*/
diff --git a/fs/btrfs/extent-io-tree.h b/fs/btrfs/extent-io-tree.h
index e3eeec380844..21766e49ec02 100644
--- a/fs/btrfs/extent-io-tree.h
+++ b/fs/btrfs/extent-io-tree.h
@@ -6,7 +6,6 @@
#include "misc.h"
struct extent_changeset;
-struct io_failure_record;
/* Bits for the extent state */
enum {
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 72ba13b027a9..824c657f59e8 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -16,7 +16,8 @@
#include <linux/percpu_counter.h>
#include <linux/lockdep.h>
#include <linux/crc32c.h>
-#include "misc.h"
+#include "ctree.h"
+#include "extent-tree.h"
#include "tree-log.h"
#include "disk-io.h"
#include "print-tree.h"
@@ -31,14 +32,12 @@
#include "space-info.h"
#include "block-rsv.h"
#include "delalloc-space.h"
-#include "block-group.h"
#include "discard.h"
#include "rcu-string.h"
#include "zoned.h"
#include "dev-replace.h"
#include "fs.h"
#include "accessors.h"
-#include "extent-tree.h"
#include "root-tree.h"
#include "file-item.h"
#include "orphan.h"
@@ -1966,7 +1965,7 @@ static int btrfs_run_delayed_refs_for_head(struct btrfs_trans_handle *trans,
cond_resched();
spin_lock(&locked_ref->lock);
- btrfs_merge_delayed_refs(trans, delayed_refs, locked_ref);
+ btrfs_merge_delayed_refs(fs_info, delayed_refs, locked_ref);
}
return 0;
@@ -2013,7 +2012,7 @@ static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
* insert_inline_extent_backref()).
*/
spin_lock(&locked_ref->lock);
- btrfs_merge_delayed_refs(trans, delayed_refs, locked_ref);
+ btrfs_merge_delayed_refs(fs_info, delayed_refs, locked_ref);
ret = btrfs_run_delayed_refs_for_head(trans, locked_ref,
&actual_count);
@@ -3385,7 +3384,9 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_ref *ref)
enum btrfs_loop_type {
LOOP_CACHING_NOWAIT,
LOOP_CACHING_WAIT,
+ LOOP_UNSET_SIZE_CLASS, /* accept block groups with size class BTRFS_BG_SZ_NONE */
LOOP_ALLOC_CHUNK,
+ LOOP_WRONG_SIZE_CLASS, /* accept block groups of any size class */
LOOP_NO_EMPTY_SIZE,
};
@@ -3453,81 +3454,6 @@ btrfs_release_block_group(struct btrfs_block_group *cache,
btrfs_put_block_group(cache);
}
-enum btrfs_extent_allocation_policy {
- BTRFS_EXTENT_ALLOC_CLUSTERED,
- BTRFS_EXTENT_ALLOC_ZONED,
-};
-
-/*
- * Structure used internally for find_free_extent() function. Wraps needed
- * parameters.
- */
-struct find_free_extent_ctl {
- /* Basic allocation info */
- u64 ram_bytes;
- u64 num_bytes;
- u64 min_alloc_size;
- u64 empty_size;
- u64 flags;
- int delalloc;
-
- /* Where to start the search inside the bg */
- u64 search_start;
-
- /* For clustered allocation */
- u64 empty_cluster;
- struct btrfs_free_cluster *last_ptr;
- bool use_cluster;
-
- bool have_caching_bg;
- bool orig_have_caching_bg;
-
- /* Allocation is called for tree-log */
- bool for_treelog;
-
- /* Allocation is called for data relocation */
- bool for_data_reloc;
-
- /* RAID index, converted from flags */
- int index;
-
- /*
- * Current loop number, check find_free_extent_update_loop() for details
- */
- int loop;
-
- /*
- * Whether we're refilling a cluster, if true we need to re-search
- * current block group but don't try to refill the cluster again.
- */
- bool retry_clustered;
-
- /*
- * Whether we're updating free space cache, if true we need to re-search
- * current block group but don't try updating free space cache again.
- */
- bool retry_unclustered;
-
- /* If current block group is cached */
- int cached;
-
- /* Max contiguous hole found */
- u64 max_extent_size;
-
- /* Total free space from free space cache, not always contiguous */
- u64 total_free_space;
-
- /* Found result */
- u64 found_offset;
-
- /* Hint where to start looking for an empty space */
- u64 hint_byte;
-
- /* Allocation policy */
- enum btrfs_extent_allocation_policy policy;
-};
-
-
/*
* Helper function for find_free_extent().
*
@@ -3559,8 +3485,7 @@ static int find_free_extent_clustered(struct btrfs_block_group *bg,
if (offset) {
/* We have a block, we're done */
spin_unlock(&last_ptr->refill_lock);
- trace_btrfs_reserve_extent_cluster(cluster_bg,
- ffe_ctl->search_start, ffe_ctl->num_bytes);
+ trace_btrfs_reserve_extent_cluster(cluster_bg, ffe_ctl);
*cluster_bg_ret = cluster_bg;
ffe_ctl->found_offset = offset;
return 0;
@@ -3610,10 +3535,8 @@ refill_cluster:
if (offset) {
/* We found one, proceed */
spin_unlock(&last_ptr->refill_lock);
- trace_btrfs_reserve_extent_cluster(bg,
- ffe_ctl->search_start,
- ffe_ctl->num_bytes);
ffe_ctl->found_offset = offset;
+ trace_btrfs_reserve_extent_cluster(bg, ffe_ctl);
return 0;
}
} else if (!ffe_ctl->cached && ffe_ctl->loop > LOOP_CACHING_NOWAIT &&
@@ -4028,24 +3951,6 @@ static int can_allocate_chunk(struct btrfs_fs_info *fs_info,
}
}
-static int chunk_allocation_failed(struct find_free_extent_ctl *ffe_ctl)
-{
- switch (ffe_ctl->policy) {
- case BTRFS_EXTENT_ALLOC_CLUSTERED:
- /*
- * If we can't allocate a new chunk we've already looped through
- * at least once, move on to the NO_EMPTY_SIZE case.
- */
- ffe_ctl->loop = LOOP_NO_EMPTY_SIZE;
- return 0;
- case BTRFS_EXTENT_ALLOC_ZONED:
- /* Give up here */
- return -ENOSPC;
- default:
- BUG();
- }
-}
-
/*
* Return >0 means caller needs to re-search for free extent
* Return 0 means we have the needed free extent.
@@ -4079,31 +3984,28 @@ static int find_free_extent_update_loop(struct btrfs_fs_info *fs_info,
* LOOP_CACHING_NOWAIT, search partially cached block groups, kicking
* caching kthreads as we move along
* LOOP_CACHING_WAIT, search everything, and wait if our bg is caching
+ * LOOP_UNSET_SIZE_CLASS, allow unset size class
* LOOP_ALLOC_CHUNK, force a chunk allocation and try again
* LOOP_NO_EMPTY_SIZE, set empty_size and empty_cluster to 0 and try
* again
*/
if (ffe_ctl->loop < LOOP_NO_EMPTY_SIZE) {
ffe_ctl->index = 0;
- if (ffe_ctl->loop == LOOP_CACHING_NOWAIT) {
- /*
- * We want to skip the LOOP_CACHING_WAIT step if we
- * don't have any uncached bgs and we've already done a
- * full search through.
- */
- if (ffe_ctl->orig_have_caching_bg || !full_search)
- ffe_ctl->loop = LOOP_CACHING_WAIT;
- else
- ffe_ctl->loop = LOOP_ALLOC_CHUNK;
- } else {
+ /*
+ * We want to skip the LOOP_CACHING_WAIT step if we don't have
+ * any uncached bgs and we've already done a full search
+ * through.
+ */
+ if (ffe_ctl->loop == LOOP_CACHING_NOWAIT &&
+ (!ffe_ctl->orig_have_caching_bg && full_search))
ffe_ctl->loop++;
- }
+ ffe_ctl->loop++;
if (ffe_ctl->loop == LOOP_ALLOC_CHUNK) {
struct btrfs_trans_handle *trans;
int exist = 0;
- /*Check if allocation policy allows to create a new chunk */
+ /* Check if allocation policy allows creating a new chunk */
ret = can_allocate_chunk(fs_info, ffe_ctl);
if (ret)
return ret;
@@ -4123,8 +4025,10 @@ static int find_free_extent_update_loop(struct btrfs_fs_info *fs_info,
CHUNK_ALLOC_FORCE_FOR_EXTENT);
/* Do not bail out on ENOSPC since we can do more. */
- if (ret == -ENOSPC)
- ret = chunk_allocation_failed(ffe_ctl);
+ if (ret == -ENOSPC) {
+ ret = 0;
+ ffe_ctl->loop++;
+ }
else if (ret < 0)
btrfs_abort_transaction(trans, ret);
else
@@ -4154,6 +4058,21 @@ static int find_free_extent_update_loop(struct btrfs_fs_info *fs_info,
return -ENOSPC;
}
+/* Size class filter: may @bg serve this allocation at the current loop stage? */
+static bool find_free_extent_check_size_class(struct find_free_extent_ctl *ffe_ctl,
+					      struct btrfs_block_group *bg)
+{
+	const int stage = ffe_ctl->loop;
+
+	if (ffe_ctl->policy == BTRFS_EXTENT_ALLOC_ZONED ||
+	    !btrfs_block_group_should_use_size_class(bg) ||
+	    stage >= LOOP_WRONG_SIZE_CLASS)
+		return true;
+	if (stage >= LOOP_UNSET_SIZE_CLASS && bg->size_class == BTRFS_BG_SZ_NONE)
+		return true;
+	return ffe_ctl->size_class == bg->size_class;
+}
+
static int prepare_allocation_clustered(struct btrfs_fs_info *fs_info,
struct find_free_extent_ctl *ffe_ctl,
struct btrfs_space_info *space_info,
@@ -4288,6 +4207,7 @@ static noinline int find_free_extent(struct btrfs_root *root,
ffe_ctl->total_free_space = 0;
ffe_ctl->found_offset = 0;
ffe_ctl->policy = BTRFS_EXTENT_ALLOC_CLUSTERED;
+ ffe_ctl->size_class = btrfs_calc_block_group_size_class(ffe_ctl->num_bytes);
if (btrfs_is_zoned(fs_info))
ffe_ctl->policy = BTRFS_EXTENT_ALLOC_ZONED;
@@ -4296,8 +4216,7 @@ static noinline int find_free_extent(struct btrfs_root *root,
ins->objectid = 0;
ins->offset = 0;
- trace_find_free_extent(root, ffe_ctl->num_bytes, ffe_ctl->empty_size,
- ffe_ctl->flags);
+ trace_find_free_extent(root, ffe_ctl);
space_info = btrfs_find_space_info(fs_info, ffe_ctl->flags);
if (!space_info) {
@@ -4340,6 +4259,7 @@ static noinline int find_free_extent(struct btrfs_root *root,
block_group->flags);
btrfs_lock_block_group(block_group,
ffe_ctl->delalloc);
+ ffe_ctl->hinted = true;
goto have_block_group;
}
} else if (block_group) {
@@ -4347,6 +4267,7 @@ static noinline int find_free_extent(struct btrfs_root *root,
}
}
search:
+ trace_find_free_extent_search_loop(root, ffe_ctl);
ffe_ctl->have_caching_bg = false;
if (ffe_ctl->index == btrfs_bg_flags_to_raid_index(ffe_ctl->flags) ||
ffe_ctl->index == 0)
@@ -4356,6 +4277,7 @@ search:
&space_info->block_groups[ffe_ctl->index], list) {
struct btrfs_block_group *bg_ret;
+ ffe_ctl->hinted = false;
/* If the block group is read-only, we can skip it entirely. */
if (unlikely(block_group->ro)) {
if (ffe_ctl->for_treelog)
@@ -4397,6 +4319,7 @@ search:
}
have_block_group:
+ trace_find_free_extent_have_block_group(root, ffe_ctl, block_group);
ffe_ctl->cached = btrfs_block_group_done(block_group);
if (unlikely(!ffe_ctl->cached)) {
ffe_ctl->have_caching_bg = true;
@@ -4421,6 +4344,9 @@ have_block_group:
if (unlikely(block_group->cached == BTRFS_CACHE_ERROR))
goto loop;
+ if (!find_free_extent_check_size_class(ffe_ctl, block_group))
+ goto loop;
+
bg_ret = NULL;
ret = do_allocation(block_group, ffe_ctl, &bg_ret);
if (ret == 0) {
@@ -4455,7 +4381,8 @@ have_block_group:
ret = btrfs_add_reserved_bytes(block_group, ffe_ctl->ram_bytes,
ffe_ctl->num_bytes,
- ffe_ctl->delalloc);
+ ffe_ctl->delalloc,
+ ffe_ctl->loop >= LOOP_WRONG_SIZE_CLASS);
if (ret == -EAGAIN) {
btrfs_add_free_space_unused(block_group,
ffe_ctl->found_offset,
@@ -4468,8 +4395,7 @@ have_block_group:
ins->objectid = ffe_ctl->search_start;
ins->offset = ffe_ctl->num_bytes;
- trace_btrfs_reserve_extent(block_group, ffe_ctl->search_start,
- ffe_ctl->num_bytes);
+ trace_btrfs_reserve_extent(block_group, ffe_ctl);
btrfs_release_block_group(block_group, ffe_ctl->delalloc);
break;
loop:
@@ -4912,7 +4838,7 @@ btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root,
btrfs_set_buffer_lockdep_class(lockdep_owner, buf, level);
__btrfs_tree_lock(buf, nest);
- btrfs_clean_tree_block(buf);
+ btrfs_clear_buffer_dirty(trans, buf);
clear_bit(EXTENT_BUFFER_STALE, &buf->bflags);
clear_bit(EXTENT_BUFFER_NO_CHECK, &buf->bflags);
@@ -5542,13 +5468,12 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans,
}
}
}
- /* make block locked assertion in btrfs_clean_tree_block happy */
- if (!path->locks[level] &&
- btrfs_header_generation(eb) == trans->transid) {
+ /* Make block locked assertion in btrfs_clear_buffer_dirty happy. */
+ if (!path->locks[level]) {
btrfs_tree_lock(eb);
path->locks[level] = BTRFS_WRITE_LOCK;
}
- btrfs_clean_tree_block(eb);
+ btrfs_clear_buffer_dirty(trans, eb);
}
if (eb == root->node) {
diff --git a/fs/btrfs/extent-tree.h b/fs/btrfs/extent-tree.h
index ae5425253603..0c958fc1b3b8 100644
--- a/fs/btrfs/extent-tree.h
+++ b/fs/btrfs/extent-tree.h
@@ -3,6 +3,87 @@
#ifndef BTRFS_EXTENT_TREE_H
#define BTRFS_EXTENT_TREE_H
+#include "misc.h"
+#include "block-group.h"
+
+struct btrfs_free_cluster;
+
+enum btrfs_extent_allocation_policy {
+ BTRFS_EXTENT_ALLOC_CLUSTERED,
+ BTRFS_EXTENT_ALLOC_ZONED,
+};
+
+struct find_free_extent_ctl {
+ /* Basic allocation info */
+ u64 ram_bytes;
+ u64 num_bytes;
+ u64 min_alloc_size;
+ u64 empty_size;
+ u64 flags;
+ int delalloc;
+
+ /* Where to start the search inside the bg */
+ u64 search_start;
+
+ /* For clustered allocation */
+ u64 empty_cluster;
+ struct btrfs_free_cluster *last_ptr;
+ bool use_cluster;
+
+ bool have_caching_bg;
+ bool orig_have_caching_bg;
+
+ /* Allocation is called for tree-log */
+ bool for_treelog;
+
+ /* Allocation is called for data relocation */
+ bool for_data_reloc;
+
+ /* RAID index, converted from flags */
+ int index;
+
+ /*
+ * Current loop number, check find_free_extent_update_loop() for details
+ */
+ int loop;
+
+ /*
+ * Whether we're refilling a cluster, if true we need to re-search
+ * current block group but don't try to refill the cluster again.
+ */
+ bool retry_clustered;
+
+ /*
+ * Whether we're updating free space cache, if true we need to re-search
+ * current block group but don't try updating free space cache again.
+ */
+ bool retry_unclustered;
+
+ /* If current block group is cached */
+ int cached;
+
+ /* Max contiguous hole found */
+ u64 max_extent_size;
+
+ /* Total free space from free space cache, not always contiguous */
+ u64 total_free_space;
+
+ /* Found result */
+ u64 found_offset;
+
+ /* Hint where to start looking for an empty space */
+ u64 hint_byte;
+
+ /* Allocation policy */
+ enum btrfs_extent_allocation_policy policy;
+
+ /* Whether or not the allocator is currently following a hint */
+ bool hinted;
+
+ /* Size class of block groups to prefer in early loops */
+ enum btrfs_block_group_size_class size_class;
+};
+
enum btrfs_inline_ref_type {
BTRFS_REF_TYPE_INVALID,
BTRFS_REF_TYPE_BLOCK,
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 3bbf8703db2a..c25fa74d7615 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -36,6 +36,7 @@
#include "file.h"
#include "dev-replace.h"
#include "super.h"
+#include "transaction.h"
static struct kmem_cache *extent_buffer_cache;
@@ -99,7 +100,6 @@ struct btrfs_bio_ctrl {
struct bio *bio;
int mirror_num;
enum btrfs_compression_type compress_type;
- u32 len_to_stripe_boundary;
u32 len_to_oe_boundary;
btrfs_bio_end_io_t end_io_func;
@@ -126,7 +126,7 @@ static void submit_one_bio(struct btrfs_bio_ctrl *bio_ctrl)
{
struct bio *bio;
struct bio_vec *bv;
- struct btrfs_inode *inode;
+ struct inode *inode;
int mirror_num;
if (!bio_ctrl->bio)
@@ -134,15 +134,13 @@ static void submit_one_bio(struct btrfs_bio_ctrl *bio_ctrl)
bio = bio_ctrl->bio;
bv = bio_first_bvec_all(bio);
- inode = BTRFS_I(bv->bv_page->mapping->host);
+ inode = bv->bv_page->mapping->host;
mirror_num = bio_ctrl->mirror_num;
/* Caller should ensure the bio has at least some range added */
ASSERT(bio->bi_iter.bi_size);
- btrfs_bio(bio)->file_offset = page_offset(bv->bv_page) + bv->bv_offset;
-
- if (!is_data_inode(&inode->vfs_inode)) {
+ if (!is_data_inode(inode)) {
if (btrfs_op(bio) != BTRFS_MAP_WRITE) {
/*
* For metadata read, we should have the parent_check,
@@ -153,14 +151,15 @@ static void submit_one_bio(struct btrfs_bio_ctrl *bio_ctrl)
bio_ctrl->parent_check,
sizeof(struct btrfs_tree_parent_check));
}
- btrfs_submit_metadata_bio(inode, bio, mirror_num);
- } else if (btrfs_op(bio) == BTRFS_MAP_WRITE) {
- btrfs_submit_data_write_bio(inode, bio, mirror_num);
- } else {
- btrfs_submit_data_read_bio(inode, bio, mirror_num,
- bio_ctrl->compress_type);
+ bio->bi_opf |= REQ_META;
}
+ if (btrfs_op(bio) == BTRFS_MAP_READ &&
+ bio_ctrl->compress_type != BTRFS_COMPRESS_NONE)
+ btrfs_submit_compressed_read(inode, bio, mirror_num);
+ else
+ btrfs_submit_bio(bio, mirror_num);
+
/* The bio is owned by the end_io handler now */
bio_ctrl->bio = NULL;
}
@@ -515,266 +514,6 @@ void extent_clear_unlock_delalloc(struct btrfs_inode *inode, u64 start, u64 end,
start, end, page_ops, NULL);
}
-static int insert_failrec(struct btrfs_inode *inode,
- struct io_failure_record *failrec)
-{
- struct rb_node *exist;
-
- spin_lock(&inode->io_failure_lock);
- exist = rb_simple_insert(&inode->io_failure_tree, failrec->bytenr,
- &failrec->rb_node);
- spin_unlock(&inode->io_failure_lock);
-
- return (exist == NULL) ? 0 : -EEXIST;
-}
-
-static struct io_failure_record *get_failrec(struct btrfs_inode *inode, u64 start)
-{
- struct rb_node *node;
- struct io_failure_record *failrec = ERR_PTR(-ENOENT);
-
- spin_lock(&inode->io_failure_lock);
- node = rb_simple_search(&inode->io_failure_tree, start);
- if (node)
- failrec = rb_entry(node, struct io_failure_record, rb_node);
- spin_unlock(&inode->io_failure_lock);
- return failrec;
-}
-
-static void free_io_failure(struct btrfs_inode *inode,
- struct io_failure_record *rec)
-{
- spin_lock(&inode->io_failure_lock);
- rb_erase(&rec->rb_node, &inode->io_failure_tree);
- spin_unlock(&inode->io_failure_lock);
-
- kfree(rec);
-}
-
-static int next_mirror(const struct io_failure_record *failrec, int cur_mirror)
-{
- if (cur_mirror == failrec->num_copies)
- return cur_mirror + 1 - failrec->num_copies;
- return cur_mirror + 1;
-}
-
-static int prev_mirror(const struct io_failure_record *failrec, int cur_mirror)
-{
- if (cur_mirror == 1)
- return failrec->num_copies;
- return cur_mirror - 1;
-}
-
-/*
- * each time an IO finishes, we do a fast check in the IO failure tree
- * to see if we need to process or clean up an io_failure_record
- */
-int btrfs_clean_io_failure(struct btrfs_inode *inode, u64 start,
- struct page *page, unsigned int pg_offset)
-{
- struct btrfs_fs_info *fs_info = inode->root->fs_info;
- struct extent_io_tree *io_tree = &inode->io_tree;
- u64 ino = btrfs_ino(inode);
- u64 locked_start, locked_end;
- struct io_failure_record *failrec;
- int mirror;
- int ret;
-
- failrec = get_failrec(inode, start);
- if (IS_ERR(failrec))
- return 0;
-
- BUG_ON(!failrec->this_mirror);
-
- if (sb_rdonly(fs_info->sb))
- goto out;
-
- ret = find_first_extent_bit(io_tree, failrec->bytenr, &locked_start,
- &locked_end, EXTENT_LOCKED, NULL);
- if (ret || locked_start > failrec->bytenr ||
- locked_end < failrec->bytenr + failrec->len - 1)
- goto out;
-
- mirror = failrec->this_mirror;
- do {
- mirror = prev_mirror(failrec, mirror);
- btrfs_repair_io_failure(fs_info, ino, start, failrec->len,
- failrec->logical, page, pg_offset, mirror);
- } while (mirror != failrec->failed_mirror);
-
-out:
- free_io_failure(inode, failrec);
- return 0;
-}
-
-/*
- * Can be called when
- * - hold extent lock
- * - under ordered extent
- * - the inode is freeing
- */
-void btrfs_free_io_failure_record(struct btrfs_inode *inode, u64 start, u64 end)
-{
- struct io_failure_record *failrec;
- struct rb_node *node, *next;
-
- if (RB_EMPTY_ROOT(&inode->io_failure_tree))
- return;
-
- spin_lock(&inode->io_failure_lock);
- node = rb_simple_search_first(&inode->io_failure_tree, start);
- while (node) {
- failrec = rb_entry(node, struct io_failure_record, rb_node);
- if (failrec->bytenr > end)
- break;
-
- next = rb_next(node);
- rb_erase(&failrec->rb_node, &inode->io_failure_tree);
- kfree(failrec);
-
- node = next;
- }
- spin_unlock(&inode->io_failure_lock);
-}
-
-static struct io_failure_record *btrfs_get_io_failure_record(struct inode *inode,
- struct btrfs_bio *bbio,
- unsigned int bio_offset)
-{
- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
- u64 start = bbio->file_offset + bio_offset;
- struct io_failure_record *failrec;
- const u32 sectorsize = fs_info->sectorsize;
- int ret;
-
- failrec = get_failrec(BTRFS_I(inode), start);
- if (!IS_ERR(failrec)) {
- btrfs_debug(fs_info,
- "Get IO Failure Record: (found) logical=%llu, start=%llu, len=%llu",
- failrec->logical, failrec->bytenr, failrec->len);
- /*
- * when data can be on disk more than twice, add to failrec here
- * (e.g. with a list for failed_mirror) to make
- * clean_io_failure() clean all those errors at once.
- */
- ASSERT(failrec->this_mirror == bbio->mirror_num);
- ASSERT(failrec->len == fs_info->sectorsize);
- return failrec;
- }
-
- failrec = kzalloc(sizeof(*failrec), GFP_NOFS);
- if (!failrec)
- return ERR_PTR(-ENOMEM);
-
- RB_CLEAR_NODE(&failrec->rb_node);
- failrec->bytenr = start;
- failrec->len = sectorsize;
- failrec->failed_mirror = bbio->mirror_num;
- failrec->this_mirror = bbio->mirror_num;
- failrec->logical = (bbio->iter.bi_sector << SECTOR_SHIFT) + bio_offset;
-
- btrfs_debug(fs_info,
- "new io failure record logical %llu start %llu",
- failrec->logical, start);
-
- failrec->num_copies = btrfs_num_copies(fs_info, failrec->logical, sectorsize);
- if (failrec->num_copies == 1) {
- /*
- * We only have a single copy of the data, so don't bother with
- * all the retry and error correction code that follows. No
- * matter what the error is, it is very likely to persist.
- */
- btrfs_debug(fs_info,
- "cannot repair logical %llu num_copies %d",
- failrec->logical, failrec->num_copies);
- kfree(failrec);
- return ERR_PTR(-EIO);
- }
-
- /* Set the bits in the private failure tree */
- ret = insert_failrec(BTRFS_I(inode), failrec);
- if (ret) {
- kfree(failrec);
- return ERR_PTR(ret);
- }
-
- return failrec;
-}
-
-int btrfs_repair_one_sector(struct btrfs_inode *inode, struct btrfs_bio *failed_bbio,
- u32 bio_offset, struct page *page, unsigned int pgoff,
- bool submit_buffered)
-{
- u64 start = failed_bbio->file_offset + bio_offset;
- struct io_failure_record *failrec;
- struct btrfs_fs_info *fs_info = inode->root->fs_info;
- struct bio *failed_bio = &failed_bbio->bio;
- const int icsum = bio_offset >> fs_info->sectorsize_bits;
- struct bio *repair_bio;
- struct btrfs_bio *repair_bbio;
-
- btrfs_debug(fs_info,
- "repair read error: read error at %llu", start);
-
- BUG_ON(bio_op(failed_bio) == REQ_OP_WRITE);
-
- failrec = btrfs_get_io_failure_record(&inode->vfs_inode, failed_bbio, bio_offset);
- if (IS_ERR(failrec))
- return PTR_ERR(failrec);
-
- /*
- * There are two premises:
- * a) deliver good data to the caller
- * b) correct the bad sectors on disk
- *
- * Since we're only doing repair for one sector, we only need to get
- * a good copy of the failed sector and if we succeed, we have setup
- * everything for btrfs_repair_io_failure to do the rest for us.
- */
- failrec->this_mirror = next_mirror(failrec, failrec->this_mirror);
- if (failrec->this_mirror == failrec->failed_mirror) {
- btrfs_debug(fs_info,
- "failed to repair num_copies %d this_mirror %d failed_mirror %d",
- failrec->num_copies, failrec->this_mirror, failrec->failed_mirror);
- free_io_failure(inode, failrec);
- return -EIO;
- }
-
- repair_bio = btrfs_bio_alloc(1, REQ_OP_READ, failed_bbio->end_io,
- failed_bbio->private);
- repair_bbio = btrfs_bio(repair_bio);
- repair_bbio->file_offset = start;
- repair_bio->bi_iter.bi_sector = failrec->logical >> 9;
-
- if (failed_bbio->csum) {
- const u32 csum_size = fs_info->csum_size;
-
- repair_bbio->csum = repair_bbio->csum_inline;
- memcpy(repair_bbio->csum,
- failed_bbio->csum + csum_size * icsum, csum_size);
- }
-
- bio_add_page(repair_bio, page, failrec->len, pgoff);
- repair_bbio->iter = repair_bio->bi_iter;
-
- btrfs_debug(fs_info,
- "repair read error: submitting new read to mirror %d",
- failrec->this_mirror);
-
- /*
- * At this point we have a bio, so any errors from bio submission will
- * be handled by the endio on the repair_bio, so we can't return an
- * error here.
- */
- if (submit_buffered)
- btrfs_submit_data_read_bio(inode, repair_bio,
- failrec->this_mirror, 0);
- else
- btrfs_submit_dio_repair_bio(inode, repair_bio, failrec->this_mirror);
-
- return BLK_STS_OK;
-}
-
static void end_page_read(struct page *page, bool uptodate, u64 start, u32 len)
{
struct btrfs_fs_info *fs_info = btrfs_sb(page->mapping->host->i_sb);
@@ -803,79 +542,6 @@ static void end_page_read(struct page *page, bool uptodate, u64 start, u32 len)
btrfs_subpage_end_reader(fs_info, page, start, len);
}
-static void end_sector_io(struct page *page, u64 offset, bool uptodate)
-{
- struct btrfs_inode *inode = BTRFS_I(page->mapping->host);
- const u32 sectorsize = inode->root->fs_info->sectorsize;
-
- end_page_read(page, uptodate, offset, sectorsize);
- unlock_extent(&inode->io_tree, offset, offset + sectorsize - 1, NULL);
-}
-
-static void submit_data_read_repair(struct inode *inode,
- struct btrfs_bio *failed_bbio,
- u32 bio_offset, const struct bio_vec *bvec,
- unsigned int error_bitmap)
-{
- const unsigned int pgoff = bvec->bv_offset;
- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
- struct page *page = bvec->bv_page;
- const u64 start = page_offset(bvec->bv_page) + bvec->bv_offset;
- const u64 end = start + bvec->bv_len - 1;
- const u32 sectorsize = fs_info->sectorsize;
- const int nr_bits = (end + 1 - start) >> fs_info->sectorsize_bits;
- int i;
-
- BUG_ON(bio_op(&failed_bbio->bio) == REQ_OP_WRITE);
-
- /* This repair is only for data */
- ASSERT(is_data_inode(inode));
-
- /* We're here because we had some read errors or csum mismatch */
- ASSERT(error_bitmap);
-
- /*
- * We only get called on buffered IO, thus page must be mapped and bio
- * must not be cloned.
- */
- ASSERT(page->mapping && !bio_flagged(&failed_bbio->bio, BIO_CLONED));
-
- /* Iterate through all the sectors in the range */
- for (i = 0; i < nr_bits; i++) {
- const unsigned int offset = i * sectorsize;
- bool uptodate = false;
- int ret;
-
- if (!(error_bitmap & (1U << i))) {
- /*
- * This sector has no error, just end the page read
- * and unlock the range.
- */
- uptodate = true;
- goto next;
- }
-
- ret = btrfs_repair_one_sector(BTRFS_I(inode), failed_bbio,
- bio_offset + offset, page, pgoff + offset,
- true);
- if (!ret) {
- /*
- * We have submitted the read repair, the page release
- * will be handled by the endio function of the
- * submitted repair bio.
- * Thus we don't need to do any thing here.
- */
- continue;
- }
- /*
- * Continue on failed repair, otherwise the remaining sectors
- * will not be properly unlocked.
- */
-next:
- end_sector_io(page, start + offset, uptodate);
- }
-}
-
/* lots and lots of room for performance fixes in the end_bio funcs */
void end_extent_writepage(struct page *page, int err, u64 start, u64 end)
@@ -919,7 +585,6 @@ static void end_bio_extent_writepage(struct btrfs_bio *bbio)
u64 start;
u64 end;
struct bvec_iter_all iter_all;
- bool first_bvec = true;
ASSERT(!bio_flagged(bio, BIO_CLONED));
bio_for_each_segment_all(bvec, bio, iter_all) {
@@ -941,11 +606,6 @@ static void end_bio_extent_writepage(struct btrfs_bio *bbio)
start = page_offset(page) + bvec->bv_offset;
end = start + bvec->bv_len - 1;
- if (first_bvec) {
- btrfs_record_physical_zoned(inode, start, bio);
- first_bvec = false;
- }
-
end_extent_writepage(page, error, start, end);
btrfs_page_clear_writeback(fs_info, page, start, bvec->bv_len);
@@ -1093,8 +753,6 @@ static void end_bio_extent_readpage(struct btrfs_bio *bbio)
struct inode *inode = page->mapping->host;
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
const u32 sectorsize = fs_info->sectorsize;
- unsigned int error_bitmap = (unsigned int)-1;
- bool repair = false;
u64 start;
u64 end;
u32 len;
@@ -1126,25 +784,14 @@ static void end_bio_extent_readpage(struct btrfs_bio *bbio)
len = bvec->bv_len;
mirror = bbio->mirror_num;
- if (likely(uptodate)) {
- if (is_data_inode(inode)) {
- error_bitmap = btrfs_verify_data_csum(bbio,
- bio_offset, page, start, end);
- if (error_bitmap)
- uptodate = false;
- } else {
- if (btrfs_validate_metadata_buffer(bbio,
- page, start, end, mirror))
- uptodate = false;
- }
- }
+ if (uptodate && !is_data_inode(inode) &&
+ btrfs_validate_metadata_buffer(bbio, page, start, end, mirror))
+ uptodate = false;
if (likely(uptodate)) {
loff_t i_size = i_size_read(inode);
pgoff_t end_index = i_size >> PAGE_SHIFT;
- btrfs_clean_io_failure(BTRFS_I(inode), start, page, 0);
-
/*
* Zero out the remaining part if this range straddles
* i_size.
@@ -1161,19 +808,7 @@ static void end_bio_extent_readpage(struct btrfs_bio *bbio)
zero_user_segment(page, zero_start,
offset_in_page(end) + 1);
}
- } else if (is_data_inode(inode)) {
- /*
- * Only try to repair bios that actually made it to a
- * device. If the bio failed to be submitted mirror
- * is 0 and we need to fail it without retrying.
- *
- * This also includes the high level bios for compressed
- * extents - these never make it to a device and repair
- * is already handled on the lower compressed bio.
- */
- if (mirror > 0)
- repair = true;
- } else {
+ } else if (!is_data_inode(inode)) {
struct extent_buffer *eb;
eb = find_extent_buffer_readpage(fs_info, page, start);
@@ -1182,19 +817,10 @@ static void end_bio_extent_readpage(struct btrfs_bio *bbio)
atomic_dec(&eb->io_pages);
}
- if (repair) {
- /*
- * submit_data_read_repair() will handle all the good
- * and bad sectors, we just continue to the next bvec.
- */
- submit_data_read_repair(inode, bbio, bio_offset, bvec,
- error_bitmap);
- } else {
- /* Update page status and unlock */
- end_page_read(page, uptodate, start, len);
- endio_readpage_release_extent(&processed, BTRFS_I(inode),
- start, end, PageUptodate(page));
- }
+ /* Update page status and unlock. */
+ end_page_read(page, uptodate, start, len);
+ endio_readpage_release_extent(&processed, BTRFS_I(inode),
+ start, end, PageUptodate(page));
ASSERT(bio_offset + len > bio_offset);
bio_offset += len;
@@ -1202,7 +828,6 @@ static void end_bio_extent_readpage(struct btrfs_bio *bbio)
}
/* Release the last extent */
endio_readpage_release_extent(&processed, NULL, 0, 0, false);
- btrfs_bio_free_csum(bbio);
bio_put(bio);
}
@@ -1270,11 +895,10 @@ static int btrfs_bio_add_page(struct btrfs_bio_ctrl *bio_ctrl,
u32 real_size;
const sector_t sector = disk_bytenr >> SECTOR_SHIFT;
bool contig = false;
- int ret;
ASSERT(bio);
/* The limit should be calculated when bio_ctrl->bio is allocated */
- ASSERT(bio_ctrl->len_to_oe_boundary && bio_ctrl->len_to_stripe_boundary);
+ ASSERT(bio_ctrl->len_to_oe_boundary);
if (bio_ctrl->compress_type != compress_type)
return 0;
@@ -1310,9 +934,7 @@ static int btrfs_bio_add_page(struct btrfs_bio_ctrl *bio_ctrl,
if (!contig)
return 0;
- real_size = min(bio_ctrl->len_to_oe_boundary,
- bio_ctrl->len_to_stripe_boundary) - bio_size;
- real_size = min(real_size, size);
+ real_size = min(bio_ctrl->len_to_oe_boundary - bio_size, size);
/*
* If real_size is 0, never call bio_add_*_page(), as even size is 0,
@@ -1321,82 +943,45 @@ static int btrfs_bio_add_page(struct btrfs_bio_ctrl *bio_ctrl,
if (real_size == 0)
return 0;
- if (bio_op(bio) == REQ_OP_ZONE_APPEND)
- ret = bio_add_zone_append_page(bio, page, real_size, pg_offset);
- else
- ret = bio_add_page(bio, page, real_size, pg_offset);
-
- return ret;
+ return bio_add_page(bio, page, real_size, pg_offset);
}
-static int calc_bio_boundaries(struct btrfs_bio_ctrl *bio_ctrl,
- struct btrfs_inode *inode, u64 file_offset)
+static void calc_bio_boundaries(struct btrfs_bio_ctrl *bio_ctrl,
+ struct btrfs_inode *inode, u64 file_offset)
{
- struct btrfs_fs_info *fs_info = inode->root->fs_info;
- struct btrfs_io_geometry geom;
struct btrfs_ordered_extent *ordered;
- struct extent_map *em;
- u64 logical = (bio_ctrl->bio->bi_iter.bi_sector << SECTOR_SHIFT);
- int ret;
/*
- * Pages for compressed extent are never submitted to disk directly,
- * thus it has no real boundary, just set them to U32_MAX.
- *
- * The split happens for real compressed bio, which happens in
- * btrfs_submit_compressed_read/write().
+ * Limit the extent to the ordered boundary for Zone Append.
+ * Compressed bios aren't submitted directly, so it doesn't apply to
+ * them.
*/
- if (bio_ctrl->compress_type != BTRFS_COMPRESS_NONE) {
- bio_ctrl->len_to_oe_boundary = U32_MAX;
- bio_ctrl->len_to_stripe_boundary = U32_MAX;
- return 0;
- }
- em = btrfs_get_chunk_map(fs_info, logical, fs_info->sectorsize);
- if (IS_ERR(em))
- return PTR_ERR(em);
- ret = btrfs_get_io_geometry(fs_info, em, btrfs_op(bio_ctrl->bio),
- logical, &geom);
- free_extent_map(em);
- if (ret < 0) {
- return ret;
- }
- if (geom.len > U32_MAX)
- bio_ctrl->len_to_stripe_boundary = U32_MAX;
- else
- bio_ctrl->len_to_stripe_boundary = (u32)geom.len;
-
- if (bio_op(bio_ctrl->bio) != REQ_OP_ZONE_APPEND) {
- bio_ctrl->len_to_oe_boundary = U32_MAX;
- return 0;
- }
-
- /* Ordered extent not yet created, so we're good */
- ordered = btrfs_lookup_ordered_extent(inode, file_offset);
- if (!ordered) {
- bio_ctrl->len_to_oe_boundary = U32_MAX;
- return 0;
+ if (bio_ctrl->compress_type == BTRFS_COMPRESS_NONE &&
+ btrfs_use_zone_append(btrfs_bio(bio_ctrl->bio))) {
+ ordered = btrfs_lookup_ordered_extent(inode, file_offset);
+ if (ordered) {
+ bio_ctrl->len_to_oe_boundary = min_t(u32, U32_MAX,
+ ordered->file_offset +
+ ordered->disk_num_bytes - file_offset);
+ btrfs_put_ordered_extent(ordered);
+ return;
+ }
}
- bio_ctrl->len_to_oe_boundary = min_t(u32, U32_MAX,
- ordered->disk_bytenr + ordered->disk_num_bytes - logical);
- btrfs_put_ordered_extent(ordered);
- return 0;
+ bio_ctrl->len_to_oe_boundary = U32_MAX;
}
-static int alloc_new_bio(struct btrfs_inode *inode,
- struct btrfs_bio_ctrl *bio_ctrl,
- struct writeback_control *wbc,
- blk_opf_t opf,
- u64 disk_bytenr, u32 offset, u64 file_offset,
- enum btrfs_compression_type compress_type)
+static void alloc_new_bio(struct btrfs_inode *inode,
+ struct btrfs_bio_ctrl *bio_ctrl,
+ struct writeback_control *wbc, blk_opf_t opf,
+ u64 disk_bytenr, u32 offset, u64 file_offset,
+ enum btrfs_compression_type compress_type)
{
struct btrfs_fs_info *fs_info = inode->root->fs_info;
struct bio *bio;
- int ret;
- ASSERT(bio_ctrl->end_io_func);
-
- bio = btrfs_bio_alloc(BIO_MAX_VECS, opf, bio_ctrl->end_io_func, NULL);
+ bio = btrfs_bio_alloc(BIO_MAX_VECS, opf, inode, bio_ctrl->end_io_func,
+ NULL);
/*
* For compressed page range, its disk_bytenr is always @disk_bytenr
* passed in, no matter if we have added any range into previous bio.
@@ -1405,48 +990,21 @@ static int alloc_new_bio(struct btrfs_inode *inode,
bio->bi_iter.bi_sector = disk_bytenr >> SECTOR_SHIFT;
else
bio->bi_iter.bi_sector = (disk_bytenr + offset) >> SECTOR_SHIFT;
+ btrfs_bio(bio)->file_offset = file_offset;
bio_ctrl->bio = bio;
bio_ctrl->compress_type = compress_type;
- ret = calc_bio_boundaries(bio_ctrl, inode, file_offset);
- if (ret < 0)
- goto error;
+ calc_bio_boundaries(bio_ctrl, inode, file_offset);
if (wbc) {
/*
- * For Zone append we need the correct block_device that we are
- * going to write to set in the bio to be able to respect the
- * hardware limitation. Look it up here:
+ * Pick the last added device to support cgroup writeback. For
+ * multi-device file systems this means blk-cgroup policies have
+ * to always be set on the last added/replaced device.
+ * This is a bit odd but has been like that for a long time.
*/
- if (bio_op(bio) == REQ_OP_ZONE_APPEND) {
- struct btrfs_device *dev;
-
- dev = btrfs_zoned_get_device(fs_info, disk_bytenr,
- fs_info->sectorsize);
- if (IS_ERR(dev)) {
- ret = PTR_ERR(dev);
- goto error;
- }
-
- bio_set_dev(bio, dev->bdev);
- } else {
- /*
- * Otherwise pick the last added device to support
- * cgroup writeback. For multi-device file systems this
- * means blk-cgroup policies have to always be set on the
- * last added/replaced device. This is a bit odd but has
- * been like that for a long time.
- */
- bio_set_dev(bio, fs_info->fs_devices->latest_dev->bdev);
- }
+ bio_set_dev(bio, fs_info->fs_devices->latest_dev->bdev);
wbc_init_bio(wbc, bio);
- } else {
- ASSERT(bio_op(bio) != REQ_OP_ZONE_APPEND);
}
- return 0;
-error:
- bio_ctrl->bio = NULL;
- btrfs_bio_end_io(btrfs_bio(bio), errno_to_blk_status(ret));
- return ret;
}
/*
@@ -1472,7 +1030,6 @@ static int submit_extent_page(blk_opf_t opf,
enum btrfs_compression_type compress_type,
bool force_bio_submit)
{
- int ret = 0;
struct btrfs_inode *inode = BTRFS_I(page->mapping->host);
unsigned int cur = pg_offset;
@@ -1492,12 +1049,9 @@ static int submit_extent_page(blk_opf_t opf,
/* Allocate new bio if needed */
if (!bio_ctrl->bio) {
- ret = alloc_new_bio(inode, bio_ctrl, wbc, opf,
- disk_bytenr, offset,
- page_offset(page) + cur,
- compress_type);
- if (ret < 0)
- return ret;
+ alloc_new_bio(inode, bio_ctrl, wbc, opf, disk_bytenr,
+ offset, page_offset(page) + cur,
+ compress_type);
}
/*
* We must go through btrfs_bio_add_page() to ensure each
@@ -2054,10 +1608,6 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode,
* find_next_dirty_byte() are all exclusive
*/
iosize = min(min(em_end, end + 1), dirty_range_end) - cur;
-
- if (btrfs_use_zone_append(inode, em->block_start))
- op = REQ_OP_ZONE_APPEND;
-
free_extent_map(em);
em = NULL;
@@ -2361,13 +1911,6 @@ static void set_btree_ioerr(struct page *page, struct extent_buffer *eb)
mapping_set_error(page->mapping, -EIO);
/*
- * If we error out, we should add back the dirty_metadata_bytes
- * to make it consistent.
- */
- percpu_counter_add_batch(&fs_info->dirty_metadata_bytes,
- eb->len, fs_info->dirty_metadata_batch);
-
- /*
* If writeback for a btree extent that doesn't belong to a log tree
* failed, increment the counter transaction->eb_write_errors.
* We do this because while the transaction is running and before it's
@@ -4724,12 +4267,25 @@ static void clear_subpage_extent_buffer_dirty(const struct extent_buffer *eb)
WARN_ON(atomic_read(&eb->refs) == 0);
}
-void clear_extent_buffer_dirty(const struct extent_buffer *eb)
+void btrfs_clear_buffer_dirty(struct btrfs_trans_handle *trans,
+ struct extent_buffer *eb)
{
+ struct btrfs_fs_info *fs_info = eb->fs_info;
int i;
int num_pages;
struct page *page;
+ btrfs_assert_tree_write_locked(eb);
+
+ if (trans && btrfs_header_generation(eb) != trans->transid)
+ return;
+
+ if (!test_and_clear_bit(EXTENT_BUFFER_DIRTY, &eb->bflags))
+ return;
+
+ percpu_counter_add_batch(&fs_info->dirty_metadata_bytes, -eb->len,
+ fs_info->dirty_metadata_batch);
+
if (eb->fs_info->nodesize < PAGE_SIZE)
return clear_subpage_extent_buffer_dirty(eb);
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index a2c82448b2e0..4341ad978fb8 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -11,6 +11,8 @@
#include "ulist.h"
#include "misc.h"
+struct btrfs_trans_handle;
+
enum {
EXTENT_BUFFER_UPTODATE,
EXTENT_BUFFER_DIRTY,
@@ -60,11 +62,9 @@ enum {
#define BITMAP_LAST_BYTE_MASK(nbits) \
(BYTE_MASK >> (-(nbits) & (BITS_PER_BYTE - 1)))
-struct btrfs_bio;
struct btrfs_root;
struct btrfs_inode;
struct btrfs_fs_info;
-struct io_failure_record;
struct extent_io_tree;
struct btrfs_tree_parent_check;
@@ -262,7 +262,6 @@ void extent_buffer_bitmap_set(const struct extent_buffer *eb, unsigned long star
void extent_buffer_bitmap_clear(const struct extent_buffer *eb,
unsigned long start, unsigned long pos,
unsigned long len);
-void clear_extent_buffer_dirty(const struct extent_buffer *eb);
bool set_extent_buffer_dirty(struct extent_buffer *eb);
void set_extent_buffer_uptodate(struct extent_buffer *eb);
void clear_extent_buffer_uptodate(struct extent_buffer *eb);
@@ -274,40 +273,13 @@ void extent_clear_unlock_delalloc(struct btrfs_inode *inode, u64 start, u64 end,
u32 bits_to_clear, unsigned long page_ops);
int extent_invalidate_folio(struct extent_io_tree *tree,
struct folio *folio, size_t offset);
+void btrfs_clear_buffer_dirty(struct btrfs_trans_handle *trans,
+ struct extent_buffer *buf);
int btrfs_alloc_page_array(unsigned int nr_pages, struct page **page_array);
void end_extent_writepage(struct page *page, int err, u64 start, u64 end);
-/*
- * When IO fails, either with EIO or csum verification fails, we
- * try other mirrors that might have a good copy of the data. This
- * io_failure_record is used to record state as we go through all the
- * mirrors. If another mirror has good data, the sector is set up to date
- * and things continue. If a good mirror can't be found, the original
- * bio end_io callback is called to indicate things have failed.
- */
-struct io_failure_record {
- /* Use rb_simple_node for search/insert */
- struct {
- struct rb_node rb_node;
- u64 bytenr;
- };
- struct page *page;
- u64 len;
- u64 logical;
- int this_mirror;
- int failed_mirror;
- int num_copies;
-};
-
-int btrfs_repair_one_sector(struct btrfs_inode *inode, struct btrfs_bio *failed_bbio,
- u32 bio_offset, struct page *page, unsigned int pgoff,
- bool submit_buffered);
-void btrfs_free_io_failure_record(struct btrfs_inode *inode, u64 start, u64 end);
-int btrfs_clean_io_failure(struct btrfs_inode *inode, u64 start,
- struct page *page, unsigned int pg_offset);
-
#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
bool find_lock_delalloc_range(struct inode *inode,
struct page *locked_page, u64 *start,
diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c
index 5de73466b2ca..41c77a100853 100644
--- a/fs/btrfs/file-item.c
+++ b/fs/btrfs/file-item.c
@@ -380,32 +380,25 @@ static int search_file_offset_in_bio(struct bio *bio, struct inode *inode,
/*
* Lookup the checksum for the read bio in csum tree.
*
- * @inode: inode that the bio is for.
- * @bio: bio to look up.
- * @dst: Buffer of size nblocks * btrfs_super_csum_size() used to return
- * checksum (nblocks = bio->bi_iter.bi_size / fs_info->sectorsize). If
- * NULL, the checksum buffer is allocated and returned in
- * btrfs_bio(bio)->csum instead.
- *
* Return: BLK_STS_RESOURCE if allocating memory fails, BLK_STS_OK otherwise.
*/
-blk_status_t btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio, u8 *dst)
+blk_status_t btrfs_lookup_bio_sums(struct btrfs_bio *bbio)
{
- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
- struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
- struct btrfs_bio *bbio = NULL;
+ struct btrfs_inode *inode = bbio->inode;
+ struct btrfs_fs_info *fs_info = inode->root->fs_info;
+ struct extent_io_tree *io_tree = &inode->io_tree;
+ struct bio *bio = &bbio->bio;
struct btrfs_path *path;
const u32 sectorsize = fs_info->sectorsize;
const u32 csum_size = fs_info->csum_size;
u32 orig_len = bio->bi_iter.bi_size;
u64 orig_disk_bytenr = bio->bi_iter.bi_sector << SECTOR_SHIFT;
u64 cur_disk_bytenr;
- u8 *csum;
const unsigned int nblocks = orig_len >> fs_info->sectorsize_bits;
int count = 0;
blk_status_t ret = BLK_STS_OK;
- if ((BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM) ||
+ if ((inode->flags & BTRFS_INODE_NODATASUM) ||
test_bit(BTRFS_FS_STATE_NO_CSUMS, &fs_info->fs_state))
return BLK_STS_OK;
@@ -426,21 +419,14 @@ blk_status_t btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio, u8 *dst
if (!path)
return BLK_STS_RESOURCE;
- if (!dst) {
- bbio = btrfs_bio(bio);
-
- if (nblocks * csum_size > BTRFS_BIO_INLINE_CSUM_SIZE) {
- bbio->csum = kmalloc_array(nblocks, csum_size, GFP_NOFS);
- if (!bbio->csum) {
- btrfs_free_path(path);
- return BLK_STS_RESOURCE;
- }
- } else {
- bbio->csum = bbio->csum_inline;
+ if (nblocks * csum_size > BTRFS_BIO_INLINE_CSUM_SIZE) {
+ bbio->csum = kmalloc_array(nblocks, csum_size, GFP_NOFS);
+ if (!bbio->csum) {
+ btrfs_free_path(path);
+ return BLK_STS_RESOURCE;
}
- csum = bbio->csum;
} else {
- csum = dst;
+ bbio->csum = bbio->csum_inline;
}
/*
@@ -456,7 +442,7 @@ blk_status_t btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio, u8 *dst
* read from the commit root and sidestep a nasty deadlock
* between reading the free space cache and updating the csum tree.
*/
- if (btrfs_is_free_space_inode(BTRFS_I(inode))) {
+ if (btrfs_is_free_space_inode(inode)) {
path->search_commit_root = 1;
path->skip_locking = 1;
}
@@ -479,14 +465,15 @@ blk_status_t btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio, u8 *dst
ASSERT(cur_disk_bytenr - orig_disk_bytenr < UINT_MAX);
sector_offset = (cur_disk_bytenr - orig_disk_bytenr) >>
fs_info->sectorsize_bits;
- csum_dst = csum + sector_offset * csum_size;
+ csum_dst = bbio->csum + sector_offset * csum_size;
count = search_csum_tree(fs_info, path, cur_disk_bytenr,
search_len, csum_dst);
if (count < 0) {
ret = errno_to_blk_status(count);
- if (bbio)
- btrfs_bio_free_csum(bbio);
+ if (bbio->csum != bbio->csum_inline)
+ kfree(bbio->csum);
+ bbio->csum = NULL;
break;
}
@@ -504,12 +491,13 @@ blk_status_t btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio, u8 *dst
memset(csum_dst, 0, csum_size);
count = 1;
- if (BTRFS_I(inode)->root->root_key.objectid ==
+ if (inode->root->root_key.objectid ==
BTRFS_DATA_RELOC_TREE_OBJECTID) {
u64 file_offset;
int ret;
- ret = search_file_offset_in_bio(bio, inode,
+ ret = search_file_offset_in_bio(bio,
+ &inode->vfs_inode,
cur_disk_bytenr, &file_offset);
if (ret)
set_extent_bits(io_tree, file_offset,
@@ -784,23 +772,16 @@ fail:
/*
* Calculate checksums of the data contained inside a bio.
- *
- * @inode: Owner of the data inside the bio
- * @bio: Contains the data to be checksummed
- * @offset: If (u64)-1, @bio may contain discontiguous bio vecs, so the
- * file offsets are determined from the page offsets in the bio.
- * Otherwise, this is the starting file offset of the bio vecs in
- * @bio, which must be contiguous.
- * @one_ordered: If true, @bio only refers to one ordered extent.
*/
-blk_status_t btrfs_csum_one_bio(struct btrfs_inode *inode, struct bio *bio,
- u64 offset, bool one_ordered)
+blk_status_t btrfs_csum_one_bio(struct btrfs_bio *bbio)
{
+ struct btrfs_inode *inode = bbio->inode;
struct btrfs_fs_info *fs_info = inode->root->fs_info;
SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);
+ struct bio *bio = &bbio->bio;
+ u64 offset = bbio->file_offset;
struct btrfs_ordered_sum *sums;
struct btrfs_ordered_extent *ordered = NULL;
- const bool use_page_offsets = (offset == (u64)-1);
char *data;
struct bvec_iter iter;
struct bio_vec bvec;
@@ -828,9 +809,6 @@ blk_status_t btrfs_csum_one_bio(struct btrfs_inode *inode, struct bio *bio,
shash->tfm = fs_info->csum_shash;
bio_for_each_segment(bvec, bio, iter) {
- if (use_page_offsets)
- offset = page_offset(bvec.bv_page) + bvec.bv_offset;
-
if (!ordered) {
ordered = btrfs_lookup_ordered_extent(inode, offset);
/*
@@ -852,7 +830,7 @@ blk_status_t btrfs_csum_one_bio(struct btrfs_inode *inode, struct bio *bio,
- 1);
for (i = 0; i < blockcount; i++) {
- if (!one_ordered &&
+ if (!(bio->bi_opf & REQ_BTRFS_ONE_ORDERED) &&
!in_range(offset, ordered->file_offset,
ordered->num_bytes)) {
unsigned long bytes_left;
diff --git a/fs/btrfs/file-item.h b/fs/btrfs/file-item.h
index 031225668434..cd7f2ae515c0 100644
--- a/fs/btrfs/file-item.h
+++ b/fs/btrfs/file-item.h
@@ -38,7 +38,7 @@ static inline u32 btrfs_file_extent_calc_inline_size(u32 datasize)
int btrfs_del_csums(struct btrfs_trans_handle *trans,
struct btrfs_root *root, u64 bytenr, u64 len);
-blk_status_t btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio, u8 *dst);
+blk_status_t btrfs_lookup_bio_sums(struct btrfs_bio *bbio);
int btrfs_insert_hole_extent(struct btrfs_trans_handle *trans,
struct btrfs_root *root, u64 objectid, u64 pos,
u64 num_bytes);
@@ -49,8 +49,10 @@ int btrfs_lookup_file_extent(struct btrfs_trans_handle *trans,
int btrfs_csum_file_blocks(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct btrfs_ordered_sum *sums);
-blk_status_t btrfs_csum_one_bio(struct btrfs_inode *inode, struct bio *bio,
- u64 offset, bool one_ordered);
+blk_status_t btrfs_csum_one_bio(struct btrfs_bio *bbio);
+int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end,
+ struct list_head *list, int search_commit,
+ bool nowait);
int btrfs_lookup_csums_list(struct btrfs_root *root, u64 start, u64 end,
struct list_head *list, int search_commit,
bool nowait);
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index af046d22300e..5cc5a1faaef5 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -1017,7 +1017,7 @@ lock_and_cleanup_extent_if_need(struct btrfs_inode *inode, struct page **pages,
unlock_page(pages[i]);
put_page(pages[i]);
}
- btrfs_start_ordered_extent(ordered, 1);
+ btrfs_start_ordered_extent(ordered);
btrfs_put_ordered_extent(ordered);
return -EAGAIN;
}
diff --git a/fs/btrfs/free-space-tree.c b/fs/btrfs/free-space-tree.c
index c667e878ef1a..4d155a48ec59 100644
--- a/fs/btrfs/free-space-tree.c
+++ b/fs/btrfs/free-space-tree.c
@@ -1283,7 +1283,7 @@ int btrfs_clear_free_space_tree(struct btrfs_fs_info *fs_info)
list_del(&free_space_root->dirty_list);
btrfs_tree_lock(free_space_root->node);
- btrfs_clean_tree_block(free_space_root->node);
+ btrfs_clear_buffer_dirty(trans, free_space_root->node);
btrfs_tree_unlock(free_space_root->node);
btrfs_free_tree_block(trans, btrfs_root_id(free_space_root),
free_space_root->node, 0, 1);
diff --git a/fs/btrfs/fs.c b/fs/btrfs/fs.c
index 5553e1f8afe8..31c1648bc0b4 100644
--- a/fs/btrfs/fs.c
+++ b/fs/btrfs/fs.c
@@ -24,6 +24,7 @@ void __btrfs_set_fs_incompat(struct btrfs_fs_info *fs_info, u64 flag,
name, flag);
}
spin_unlock(&fs_info->super_lock);
+ set_bit(BTRFS_FS_FEATURE_CHANGED, &fs_info->flags);
}
}
@@ -46,6 +47,7 @@ void __btrfs_clear_fs_incompat(struct btrfs_fs_info *fs_info, u64 flag,
name, flag);
}
spin_unlock(&fs_info->super_lock);
+ set_bit(BTRFS_FS_FEATURE_CHANGED, &fs_info->flags);
}
}
@@ -68,6 +70,7 @@ void __btrfs_set_fs_compat_ro(struct btrfs_fs_info *fs_info, u64 flag,
name, flag);
}
spin_unlock(&fs_info->super_lock);
+ set_bit(BTRFS_FS_FEATURE_CHANGED, &fs_info->flags);
}
}
@@ -90,5 +93,6 @@ void __btrfs_clear_fs_compat_ro(struct btrfs_fs_info *fs_info, u64 flag,
name, flag);
}
spin_unlock(&fs_info->super_lock);
+ set_bit(BTRFS_FS_FEATURE_CHANGED, &fs_info->flags);
}
}
diff --git a/fs/btrfs/fs.h b/fs/btrfs/fs.h
index 37b86acfcbcf..4c477eae6891 100644
--- a/fs/btrfs/fs.h
+++ b/fs/btrfs/fs.h
@@ -3,6 +3,7 @@
#ifndef BTRFS_FS_H
#define BTRFS_FS_H
+#include <linux/blkdev.h>
#include <linux/fs.h>
#include <linux/btrfs_tree.h>
#include <linux/sizes.h>
@@ -125,6 +126,12 @@ enum {
*/
BTRFS_FS_NO_OVERCOMMIT,
+ /*
+ * Indicate if we have some features changed, this is mostly for
+ * cleaner thread to update the sysfs interface.
+ */
+ BTRFS_FS_FEATURE_CHANGED,
+
#if BITS_PER_LONG == 32
/* Indicate if we have error/warn message printed on 32bit systems */
BTRFS_FS_32BIT_ERROR,
@@ -742,8 +749,10 @@ struct btrfs_fs_info {
*/
u64 zone_size;
- /* Max size to emit ZONE_APPEND write command */
+ /* Constraints for ZONE_APPEND commands: */
+ struct queue_limits limits;
u64 max_zone_append_size;
+
struct mutex zoned_meta_io_lock;
spinlock_t treelog_bg_lock;
u64 treelog_bg;
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index efee6d35af52..6c18dc9a1831 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -84,27 +84,12 @@ struct btrfs_dio_data {
};
struct btrfs_dio_private {
- struct btrfs_inode *inode;
-
- /*
- * Since DIO can use anonymous page, we cannot use page_offset() to
- * grab the file offset, thus need a dedicated member for file offset.
- */
+ /* Range of I/O */
u64 file_offset;
- /* Used for bio::bi_size */
u32 bytes;
- /*
- * References to this structure. There is one reference per in-flight
- * bio plus one while we're still setting up.
- */
- refcount_t refs;
-
- /* Array of checksums */
- u8 *csums;
-
/* This must be last */
- struct bio bio;
+ struct btrfs_bio bbio;
};
static struct bio_set btrfs_dio_bioset;
@@ -228,7 +213,7 @@ static inline void btrfs_cleanup_ordered_extents(struct btrfs_inode *inode,
{
unsigned long index = offset >> PAGE_SHIFT;
unsigned long end_index = (offset + bytes - 1) >> PAGE_SHIFT;
- u64 page_start, page_end;
+ u64 page_start = 0, page_end = 0;
struct page *page;
if (locked_page) {
@@ -2536,19 +2521,6 @@ void btrfs_clear_delalloc_extent(struct btrfs_inode *inode,
}
/*
- * in order to insert checksums into the metadata in large chunks,
- * we wait until bio submission time. All the pages in the bio are
- * checksummed and sums are attached onto the ordered extent record.
- *
- * At IO completion time the cums attached on the ordered extent record
- * are inserted into the btree
- */
-blk_status_t btrfs_submit_bio_start(struct btrfs_inode *inode, struct bio *bio)
-{
- return btrfs_csum_one_bio(inode, bio, (u64)-1, false);
-}
-
-/*
* Split an extent_map at [start, start + len]
*
* This function is intended to be used only for extract_ordered_extent().
@@ -2663,19 +2635,19 @@ out:
return ret;
}
-static blk_status_t extract_ordered_extent(struct btrfs_inode *inode,
- struct bio *bio, loff_t file_offset)
+blk_status_t btrfs_extract_ordered_extent(struct btrfs_bio *bbio)
{
+ u64 start = (u64)bbio->bio.bi_iter.bi_sector << SECTOR_SHIFT;
+ u64 len = bbio->bio.bi_iter.bi_size;
+ struct btrfs_inode *inode = bbio->inode;
struct btrfs_ordered_extent *ordered;
- u64 start = (u64)bio->bi_iter.bi_sector << SECTOR_SHIFT;
u64 file_len;
- u64 len = bio->bi_iter.bi_size;
u64 end = start + len;
u64 ordered_end;
u64 pre, post;
int ret = 0;
- ordered = btrfs_lookup_ordered_extent(inode, file_offset);
+ ordered = btrfs_lookup_ordered_extent(inode, bbio->file_offset);
if (WARN_ON_ONCE(!ordered))
return BLK_STS_IOERR;
@@ -2715,7 +2687,7 @@ static blk_status_t extract_ordered_extent(struct btrfs_inode *inode,
ret = btrfs_split_ordered_extent(ordered, pre, post);
if (ret)
goto out;
- ret = split_zoned_em(inode, file_offset, file_len, pre, post);
+ ret = split_zoned_em(inode, bbio->file_offset, file_len, pre, post);
out:
btrfs_put_ordered_extent(ordered);
@@ -2723,75 +2695,6 @@ out:
return errno_to_blk_status(ret);
}
-void btrfs_submit_data_write_bio(struct btrfs_inode *inode, struct bio *bio, int mirror_num)
-{
- struct btrfs_fs_info *fs_info = inode->root->fs_info;
- blk_status_t ret;
-
- if (bio_op(bio) == REQ_OP_ZONE_APPEND) {
- ret = extract_ordered_extent(inode, bio,
- page_offset(bio_first_bvec_all(bio)->bv_page));
- if (ret) {
- btrfs_bio_end_io(btrfs_bio(bio), ret);
- return;
- }
- }
-
- /*
- * If we need to checksum, and the I/O is not issued by fsync and
- * friends, that is ->sync_writers != 0, defer the submission to a
- * workqueue to parallelize it.
- *
- * Csum items for reloc roots have already been cloned at this point,
- * so they are handled as part of the no-checksum case.
- */
- if (!(inode->flags & BTRFS_INODE_NODATASUM) &&
- !test_bit(BTRFS_FS_STATE_NO_CSUMS, &fs_info->fs_state) &&
- !btrfs_is_data_reloc_root(inode->root)) {
- if (!atomic_read(&inode->sync_writers) &&
- btrfs_wq_submit_bio(inode, bio, mirror_num, 0, WQ_SUBMIT_DATA))
- return;
-
- ret = btrfs_csum_one_bio(inode, bio, (u64)-1, false);
- if (ret) {
- btrfs_bio_end_io(btrfs_bio(bio), ret);
- return;
- }
- }
- btrfs_submit_bio(fs_info, bio, mirror_num);
-}
-
-void btrfs_submit_data_read_bio(struct btrfs_inode *inode, struct bio *bio,
- int mirror_num, enum btrfs_compression_type compress_type)
-{
- struct btrfs_fs_info *fs_info = inode->root->fs_info;
- blk_status_t ret;
-
- if (compress_type != BTRFS_COMPRESS_NONE) {
- /*
- * btrfs_submit_compressed_read will handle completing the bio
- * if there were any errors, so just return here.
- */
- btrfs_submit_compressed_read(&inode->vfs_inode, bio, mirror_num);
- return;
- }
-
- /* Save the original iter for read repair */
- btrfs_bio(bio)->iter = bio->bi_iter;
-
- /*
- * Lookup bio sums does extra checks around whether we need to csum or
- * not, which is why we ignore skip_sum here.
- */
- ret = btrfs_lookup_bio_sums(&inode->vfs_inode, bio, NULL);
- if (ret) {
- btrfs_bio_end_io(btrfs_bio(bio), ret);
- return;
- }
-
- btrfs_submit_bio(fs_info, bio, mirror_num);
-}
-
/*
* given a list of ordered sums record them in the inode. This happens
* at IO completion time based on sums calculated at bio submission time.
@@ -2969,7 +2872,7 @@ again:
unlock_extent(&inode->io_tree, page_start, page_end,
&cached_state);
unlock_page(page);
- btrfs_start_ordered_extent(ordered, 1);
+ btrfs_start_ordered_extent(ordered);
btrfs_put_ordered_extent(ordered);
goto again;
}
@@ -3259,15 +3162,13 @@ int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent)
goto out;
}
- /* A valid bdev implies a write on a sequential zone */
- if (ordered_extent->bdev) {
+ /* A valid ->physical implies a write on a sequential zone. */
+ if (ordered_extent->physical != (u64)-1) {
btrfs_rewrite_logical_zoned(ordered_extent);
btrfs_zone_finish_endio(fs_info, ordered_extent->disk_bytenr,
ordered_extent->disk_num_bytes);
}
- btrfs_free_io_failure_record(inode, start, end);
-
if (test_bit(BTRFS_ORDERED_TRUNCATED, &ordered_extent->flags)) {
truncated = true;
logical_len = ordered_extent->truncated_len;
@@ -3474,109 +3375,55 @@ static u8 *btrfs_csum_ptr(const struct btrfs_fs_info *fs_info, u8 *csums, u64 of
}
/*
- * check_data_csum - verify checksum of one sector of uncompressed data
- * @inode: inode
- * @bbio: btrfs_bio which contains the csum
+ * Verify the checksum of a single data sector.
+ *
+ * @bbio: btrfs_io_bio which contains the csum
+ * @dev: device the sector is on
* @bio_offset: offset to the beginning of the bio (in bytes)
- * @page: page where is the data to be verified
- * @pgoff: offset inside the page
+ * @bv: bio_vec to check
*
- * The length of such check is always one sector size.
+ * Check if the checksum on a data block is valid. When a checksum mismatch is
+ * detected, report the error and fill the corrupted range with zero.
*
- * When csum mismatch is detected, we will also report the error and fill the
- * corrupted range with zero. (Thus it needs the extra parameters)
+ * Return %true if the sector is ok or had no checksum to start with, else %false.
*/
-int btrfs_check_data_csum(struct btrfs_inode *inode, struct btrfs_bio *bbio,
- u32 bio_offset, struct page *page, u32 pgoff)
+bool btrfs_data_csum_ok(struct btrfs_bio *bbio, struct btrfs_device *dev,
+ u32 bio_offset, struct bio_vec *bv)
{
+ struct btrfs_inode *inode = bbio->inode;
struct btrfs_fs_info *fs_info = inode->root->fs_info;
- u32 len = fs_info->sectorsize;
+ u64 file_offset = bbio->file_offset + bio_offset;
+ u64 end = file_offset + bv->bv_len - 1;
u8 *csum_expected;
u8 csum[BTRFS_CSUM_SIZE];
- ASSERT(pgoff + len <= PAGE_SIZE);
+ ASSERT(bv->bv_len == fs_info->sectorsize);
- csum_expected = btrfs_csum_ptr(fs_info, bbio->csum, bio_offset);
+ if (!bbio->csum)
+ return true;
- if (btrfs_check_sector_csum(fs_info, page, pgoff, csum, csum_expected))
+ if (btrfs_is_data_reloc_root(inode->root) &&
+ test_range_bit(&inode->io_tree, file_offset, end, EXTENT_NODATASUM,
+ 1, NULL)) {
+ /* Skip the range without csum for data reloc inode */
+ clear_extent_bits(&inode->io_tree, file_offset, end,
+ EXTENT_NODATASUM);
+ return true;
+ }
+
+ csum_expected = btrfs_csum_ptr(fs_info, bbio->csum, bio_offset);
+ if (btrfs_check_sector_csum(fs_info, bv->bv_page, bv->bv_offset, csum,
+ csum_expected))
goto zeroit;
- return 0;
+ return true;
zeroit:
- btrfs_print_data_csum_error(inode, bbio->file_offset + bio_offset,
- csum, csum_expected, bbio->mirror_num);
- if (bbio->device)
- btrfs_dev_stat_inc_and_print(bbio->device,
- BTRFS_DEV_STAT_CORRUPTION_ERRS);
- memzero_page(page, pgoff, len);
- return -EIO;
-}
-
-/*
- * When reads are done, we need to check csums to verify the data is correct.
- * if there's a match, we allow the bio to finish. If not, the code in
- * extent_io.c will try to find good copies for us.
- *
- * @bio_offset: offset to the beginning of the bio (in bytes)
- * @start: file offset of the range start
- * @end: file offset of the range end (inclusive)
- *
- * Return a bitmap where bit set means a csum mismatch, and bit not set means
- * csum match.
- */
-unsigned int btrfs_verify_data_csum(struct btrfs_bio *bbio,
- u32 bio_offset, struct page *page,
- u64 start, u64 end)
-{
- struct btrfs_inode *inode = BTRFS_I(page->mapping->host);
- struct btrfs_root *root = inode->root;
- struct btrfs_fs_info *fs_info = root->fs_info;
- struct extent_io_tree *io_tree = &inode->io_tree;
- const u32 sectorsize = root->fs_info->sectorsize;
- u32 pg_off;
- unsigned int result = 0;
-
- /*
- * This only happens for NODATASUM or compressed read.
- * Normally this should be covered by above check for compressed read
- * or the next check for NODATASUM. Just do a quicker exit here.
- */
- if (bbio->csum == NULL)
- return 0;
-
- if (inode->flags & BTRFS_INODE_NODATASUM)
- return 0;
-
- if (unlikely(test_bit(BTRFS_FS_STATE_NO_CSUMS, &fs_info->fs_state)))
- return 0;
-
- ASSERT(page_offset(page) <= start &&
- end <= page_offset(page) + PAGE_SIZE - 1);
- for (pg_off = offset_in_page(start);
- pg_off < offset_in_page(end);
- pg_off += sectorsize, bio_offset += sectorsize) {
- u64 file_offset = pg_off + page_offset(page);
- int ret;
-
- if (btrfs_is_data_reloc_root(root) &&
- test_range_bit(io_tree, file_offset,
- file_offset + sectorsize - 1,
- EXTENT_NODATASUM, 1, NULL)) {
- /* Skip the range without csum for data reloc inode */
- clear_extent_bits(io_tree, file_offset,
- file_offset + sectorsize - 1,
- EXTENT_NODATASUM);
- continue;
- }
- ret = btrfs_check_data_csum(inode, bbio, bio_offset, page, pg_off);
- if (ret < 0) {
- const int nr_bit = (pg_off - offset_in_page(start)) >>
- root->fs_info->sectorsize_bits;
-
- result |= (1U << nr_bit);
- }
- }
- return result;
+ btrfs_print_data_csum_error(inode, file_offset, csum, csum_expected,
+ bbio->mirror_num);
+ if (dev)
+ btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_CORRUPTION_ERRS);
+ memzero_bvec(bv);
+ return false;
}
/*
@@ -4987,7 +4834,7 @@ again:
unlock_extent(io_tree, block_start, block_end, &cached_state);
unlock_page(page);
put_page(page);
- btrfs_start_ordered_extent(ordered, 1);
+ btrfs_start_ordered_extent(ordered);
btrfs_put_ordered_extent(ordered);
goto again;
}
@@ -5466,8 +5313,6 @@ void btrfs_evict_inode(struct inode *inode)
if (is_bad_inode(inode))
goto no_delete;
- btrfs_free_io_failure_record(BTRFS_I(inode), 0, (u64)-1);
-
if (test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags))
goto no_delete;
@@ -7392,7 +7237,7 @@ static int lock_extent_direct(struct inode *inode, u64 lockstart, u64 lockend,
*/
if (writing ||
test_bit(BTRFS_ORDERED_DIRECT, &ordered->flags))
- btrfs_start_ordered_extent(ordered, 1);
+ btrfs_start_ordered_extent(ordered);
else
ret = nowait ? -EAGAIN : -ENOTBLK;
btrfs_put_ordered_extent(ordered);
@@ -7833,10 +7678,6 @@ static int btrfs_dio_iomap_begin(struct inode *inode, loff_t start,
iomap->offset = start;
iomap->bdev = fs_info->fs_devices->latest_dev->bdev;
iomap->length = len;
-
- if (write && btrfs_use_zone_append(BTRFS_I(inode), em->block_start))
- iomap->flags |= IOMAP_F_ZONE_APPEND;
-
free_extent_map(em);
return 0;
@@ -7888,267 +7729,47 @@ static int btrfs_dio_iomap_end(struct inode *inode, loff_t pos, loff_t length,
return ret;
}
-static void btrfs_dio_private_put(struct btrfs_dio_private *dip)
-{
- /*
- * This implies a barrier so that stores to dio_bio->bi_status before
- * this and loads of dio_bio->bi_status after this are fully ordered.
- */
- if (!refcount_dec_and_test(&dip->refs))
- return;
-
- if (btrfs_op(&dip->bio) == BTRFS_MAP_WRITE) {
- btrfs_mark_ordered_io_finished(dip->inode, NULL,
- dip->file_offset, dip->bytes,
- !dip->bio.bi_status);
- } else {
- unlock_extent(&dip->inode->io_tree,
- dip->file_offset,
- dip->file_offset + dip->bytes - 1, NULL);
- }
-
- kfree(dip->csums);
- bio_endio(&dip->bio);
-}
-
-void btrfs_submit_dio_repair_bio(struct btrfs_inode *inode, struct bio *bio, int mirror_num)
-{
- struct btrfs_dio_private *dip = btrfs_bio(bio)->private;
-
- BUG_ON(bio_op(bio) == REQ_OP_WRITE);
-
- refcount_inc(&dip->refs);
- btrfs_submit_bio(inode->root->fs_info, bio, mirror_num);
-}
-
-static blk_status_t btrfs_check_read_dio_bio(struct btrfs_dio_private *dip,
- struct btrfs_bio *bbio,
- const bool uptodate)
-{
- struct inode *inode = &dip->inode->vfs_inode;
- struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info;
- const bool csum = !(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM);
- blk_status_t err = BLK_STS_OK;
- struct bvec_iter iter;
- struct bio_vec bv;
- u32 offset;
-
- btrfs_bio_for_each_sector(fs_info, bv, bbio, iter, offset) {
- u64 start = bbio->file_offset + offset;
-
- if (uptodate &&
- (!csum || !btrfs_check_data_csum(BTRFS_I(inode), bbio, offset,
- bv.bv_page, bv.bv_offset))) {
- btrfs_clean_io_failure(BTRFS_I(inode), start,
- bv.bv_page, bv.bv_offset);
- } else {
- int ret;
-
- ret = btrfs_repair_one_sector(BTRFS_I(inode), bbio, offset,
- bv.bv_page, bv.bv_offset, false);
- if (ret)
- err = errno_to_blk_status(ret);
- }
- }
-
- return err;
-}
-
-blk_status_t btrfs_submit_bio_start_direct_io(struct btrfs_inode *inode,
- struct bio *bio,
- u64 dio_file_offset)
+static void btrfs_dio_end_io(struct btrfs_bio *bbio)
{
- return btrfs_csum_one_bio(inode, bio, dio_file_offset, false);
-}
-
-static void btrfs_end_dio_bio(struct btrfs_bio *bbio)
-{
- struct btrfs_dio_private *dip = bbio->private;
+ struct btrfs_dio_private *dip =
+ container_of(bbio, struct btrfs_dio_private, bbio);
+ struct btrfs_inode *inode = bbio->inode;
struct bio *bio = &bbio->bio;
- blk_status_t err = bio->bi_status;
-
- if (err)
- btrfs_warn(dip->inode->root->fs_info,
- "direct IO failed ino %llu rw %d,%u sector %#Lx len %u err no %d",
- btrfs_ino(dip->inode), bio_op(bio),
- bio->bi_opf, bio->bi_iter.bi_sector,
- bio->bi_iter.bi_size, err);
-
- if (bio_op(bio) == REQ_OP_READ)
- err = btrfs_check_read_dio_bio(dip, bbio, !err);
-
- if (err)
- dip->bio.bi_status = err;
-
- btrfs_record_physical_zoned(&dip->inode->vfs_inode, bbio->file_offset, bio);
-
- bio_put(bio);
- btrfs_dio_private_put(dip);
-}
-static void btrfs_submit_dio_bio(struct bio *bio, struct btrfs_inode *inode,
- u64 file_offset, int async_submit)
-{
- struct btrfs_fs_info *fs_info = inode->root->fs_info;
- struct btrfs_dio_private *dip = btrfs_bio(bio)->private;
- blk_status_t ret;
-
- /* Save the original iter for read repair */
- if (btrfs_op(bio) == BTRFS_MAP_READ)
- btrfs_bio(bio)->iter = bio->bi_iter;
-
- if (inode->flags & BTRFS_INODE_NODATASUM)
- goto map;
+ if (bio->bi_status) {
+ btrfs_warn(inode->root->fs_info,
+ "direct IO failed ino %llu op 0x%0x offset %#llx len %u err no %d",
+ btrfs_ino(inode), bio->bi_opf,
+ dip->file_offset, dip->bytes, bio->bi_status);
+ }
- if (btrfs_op(bio) == BTRFS_MAP_WRITE) {
- /* Check btrfs_submit_data_write_bio() for async submit rules */
- if (async_submit && !atomic_read(&inode->sync_writers) &&
- btrfs_wq_submit_bio(inode, bio, 0, file_offset,
- WQ_SUBMIT_DATA_DIO))
- return;
+ if (btrfs_op(bio) == BTRFS_MAP_WRITE)
+ btrfs_mark_ordered_io_finished(inode, NULL, dip->file_offset,
+ dip->bytes, !bio->bi_status);
+ else
+ unlock_extent(&inode->io_tree, dip->file_offset,
+ dip->file_offset + dip->bytes - 1, NULL);
- /*
- * If we aren't doing async submit, calculate the csum of the
- * bio now.
- */
- ret = btrfs_csum_one_bio(inode, bio, file_offset, false);
- if (ret) {
- btrfs_bio_end_io(btrfs_bio(bio), ret);
- return;
- }
- } else {
- btrfs_bio(bio)->csum = btrfs_csum_ptr(fs_info, dip->csums,
- file_offset - dip->file_offset);
- }
-map:
- btrfs_submit_bio(fs_info, bio, 0);
+ bbio->bio.bi_private = bbio->private;
+ iomap_dio_bio_end_io(bio);
}
-static void btrfs_submit_direct(const struct iomap_iter *iter,
- struct bio *dio_bio, loff_t file_offset)
+static void btrfs_dio_submit_io(const struct iomap_iter *iter, struct bio *bio,
+ loff_t file_offset)
{
+ struct btrfs_bio *bbio = btrfs_bio(bio);
struct btrfs_dio_private *dip =
- container_of(dio_bio, struct btrfs_dio_private, bio);
- struct inode *inode = iter->inode;
- const bool write = (btrfs_op(dio_bio) == BTRFS_MAP_WRITE);
- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
- const bool raid56 = (btrfs_data_alloc_profile(fs_info) &
- BTRFS_BLOCK_GROUP_RAID56_MASK);
- struct bio *bio;
- u64 start_sector;
- int async_submit = 0;
- u64 submit_len;
- u64 clone_offset = 0;
- u64 clone_len;
- u64 logical;
- int ret;
- blk_status_t status;
- struct btrfs_io_geometry geom;
+ container_of(bbio, struct btrfs_dio_private, bbio);
struct btrfs_dio_data *dio_data = iter->private;
- struct extent_map *em = NULL;
-
- dip->inode = BTRFS_I(inode);
- dip->file_offset = file_offset;
- dip->bytes = dio_bio->bi_iter.bi_size;
- refcount_set(&dip->refs, 1);
- dip->csums = NULL;
-
- if (!write && !(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)) {
- unsigned int nr_sectors =
- (dio_bio->bi_iter.bi_size >> fs_info->sectorsize_bits);
-
- /*
- * Load the csums up front to reduce csum tree searches and
- * contention when submitting bios.
- */
- status = BLK_STS_RESOURCE;
- dip->csums = kcalloc(nr_sectors, fs_info->csum_size, GFP_NOFS);
- if (!dip->csums)
- goto out_err;
-
- status = btrfs_lookup_bio_sums(inode, dio_bio, dip->csums);
- if (status != BLK_STS_OK)
- goto out_err;
- }
-
- start_sector = dio_bio->bi_iter.bi_sector;
- submit_len = dio_bio->bi_iter.bi_size;
-
- do {
- logical = start_sector << 9;
- em = btrfs_get_chunk_map(fs_info, logical, submit_len);
- if (IS_ERR(em)) {
- status = errno_to_blk_status(PTR_ERR(em));
- em = NULL;
- goto out_err_em;
- }
- ret = btrfs_get_io_geometry(fs_info, em, btrfs_op(dio_bio),
- logical, &geom);
- if (ret) {
- status = errno_to_blk_status(ret);
- goto out_err_em;
- }
- clone_len = min(submit_len, geom.len);
- ASSERT(clone_len <= UINT_MAX);
+ btrfs_bio_init(bbio, BTRFS_I(iter->inode), btrfs_dio_end_io, bio->bi_private);
+ bbio->file_offset = file_offset;
- /*
- * This will never fail as it's passing GPF_NOFS and
- * the allocation is backed by btrfs_bioset.
- */
- bio = btrfs_bio_clone_partial(dio_bio, clone_offset, clone_len,
- btrfs_end_dio_bio, dip);
- btrfs_bio(bio)->file_offset = file_offset;
-
- if (bio_op(bio) == REQ_OP_ZONE_APPEND) {
- status = extract_ordered_extent(BTRFS_I(inode), bio,
- file_offset);
- if (status) {
- bio_put(bio);
- goto out_err;
- }
- }
-
- ASSERT(submit_len >= clone_len);
- submit_len -= clone_len;
-
- /*
- * Increase the count before we submit the bio so we know
- * the end IO handler won't happen before we increase the
- * count. Otherwise, the dip might get freed before we're
- * done setting it up.
- *
- * We transfer the initial reference to the last bio, so we
- * don't need to increment the reference count for the last one.
- */
- if (submit_len > 0) {
- refcount_inc(&dip->refs);
- /*
- * If we are submitting more than one bio, submit them
- * all asynchronously. The exception is RAID 5 or 6, as
- * asynchronous checksums make it difficult to collect
- * full stripe writes.
- */
- if (!raid56)
- async_submit = 1;
- }
-
- btrfs_submit_dio_bio(bio, BTRFS_I(inode), file_offset, async_submit);
-
- dio_data->submitted += clone_len;
- clone_offset += clone_len;
- start_sector += clone_len >> 9;
- file_offset += clone_len;
-
- free_extent_map(em);
- } while (submit_len > 0);
- return;
+ dip->file_offset = file_offset;
+ dip->bytes = bio->bi_iter.bi_size;
-out_err_em:
- free_extent_map(em);
-out_err:
- dio_bio->bi_status = status;
- btrfs_dio_private_put(dip);
+ dio_data->submitted += bio->bi_iter.bi_size;
+ btrfs_submit_bio(bio, 0);
}
static const struct iomap_ops btrfs_dio_iomap_ops = {
@@ -8157,7 +7778,7 @@ static const struct iomap_ops btrfs_dio_iomap_ops = {
};
static const struct iomap_dio_ops btrfs_dio_ops = {
- .submit_io = btrfs_submit_direct,
+ .submit_io = btrfs_dio_submit_io,
.bio_set = &btrfs_dio_bioset,
};
@@ -8552,7 +8173,7 @@ again:
unlock_extent(io_tree, page_start, page_end, &cached_state);
unlock_page(page);
up_read(&BTRFS_I(inode)->i_mmap_lock);
- btrfs_start_ordered_extent(ordered, 1);
+ btrfs_start_ordered_extent(ordered);
btrfs_put_ordered_extent(ordered);
goto again;
}
@@ -8850,7 +8471,6 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)
ei->last_log_commit = 0;
spin_lock_init(&ei->lock);
- spin_lock_init(&ei->io_failure_lock);
ei->outstanding_extents = 0;
if (sb->s_magic != BTRFS_TEST_MAGIC)
btrfs_init_metadata_block_rsv(fs_info, &ei->block_rsv,
@@ -8870,7 +8490,6 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)
ei->io_tree.inode = ei;
extent_io_tree_init(fs_info, &ei->file_extent_tree,
IO_TREE_INODE_FILE_EXTENT);
- ei->io_failure_tree = RB_ROOT;
atomic_set(&ei->sync_writers, 0);
mutex_init(&ei->log_mutex);
btrfs_ordered_inode_tree_init(&ei->ordered_tree);
@@ -8994,7 +8613,7 @@ int __init btrfs_init_cachep(void)
goto fail;
if (bioset_init(&btrfs_dio_bioset, BIO_POOL_SIZE,
- offsetof(struct btrfs_dio_private, bio),
+ offsetof(struct btrfs_dio_private, bbio.bio),
BIOSET_NEED_BVECS))
goto fail;
@@ -10289,65 +9908,13 @@ struct btrfs_encoded_read_private {
wait_queue_head_t wait;
atomic_t pending;
blk_status_t status;
- bool skip_csum;
};
-static blk_status_t submit_encoded_read_bio(struct btrfs_inode *inode,
- struct bio *bio, int mirror_num)
-{
- struct btrfs_encoded_read_private *priv = btrfs_bio(bio)->private;
- struct btrfs_fs_info *fs_info = inode->root->fs_info;
- blk_status_t ret;
-
- if (!priv->skip_csum) {
- ret = btrfs_lookup_bio_sums(&inode->vfs_inode, bio, NULL);
- if (ret)
- return ret;
- }
-
- atomic_inc(&priv->pending);
- btrfs_submit_bio(fs_info, bio, mirror_num);
- return BLK_STS_OK;
-}
-
-static blk_status_t btrfs_encoded_read_verify_csum(struct btrfs_bio *bbio)
-{
- const bool uptodate = (bbio->bio.bi_status == BLK_STS_OK);
- struct btrfs_encoded_read_private *priv = bbio->private;
- struct btrfs_inode *inode = priv->inode;
- struct btrfs_fs_info *fs_info = inode->root->fs_info;
- u32 sectorsize = fs_info->sectorsize;
- struct bio_vec *bvec;
- struct bvec_iter_all iter_all;
- u32 bio_offset = 0;
-
- if (priv->skip_csum || !uptodate)
- return bbio->bio.bi_status;
-
- bio_for_each_segment_all(bvec, &bbio->bio, iter_all) {
- unsigned int i, nr_sectors, pgoff;
-
- nr_sectors = BTRFS_BYTES_TO_BLKS(fs_info, bvec->bv_len);
- pgoff = bvec->bv_offset;
- for (i = 0; i < nr_sectors; i++) {
- ASSERT(pgoff < PAGE_SIZE);
- if (btrfs_check_data_csum(inode, bbio, bio_offset,
- bvec->bv_page, pgoff))
- return BLK_STS_IOERR;
- bio_offset += sectorsize;
- pgoff += sectorsize;
- }
- }
- return BLK_STS_OK;
-}
-
static void btrfs_encoded_read_endio(struct btrfs_bio *bbio)
{
struct btrfs_encoded_read_private *priv = bbio->private;
- blk_status_t status;
- status = btrfs_encoded_read_verify_csum(bbio);
- if (status) {
+ if (bbio->bio.bi_status) {
/*
* The memory barrier implied by the atomic_dec_return() here
* pairs with the memory barrier implied by the
@@ -10356,11 +9923,10 @@ static void btrfs_encoded_read_endio(struct btrfs_bio *bbio)
* write is observed before the load of status in
* btrfs_encoded_read_regular_fill_pages().
*/
- WRITE_ONCE(priv->status, status);
+ WRITE_ONCE(priv->status, bbio->bio.bi_status);
}
if (!atomic_dec_return(&priv->pending))
wake_up(&priv->wait);
- btrfs_bio_free_csum(bbio);
bio_put(&bbio->bio);
}
@@ -10368,47 +9934,26 @@ int btrfs_encoded_read_regular_fill_pages(struct btrfs_inode *inode,
u64 file_offset, u64 disk_bytenr,
u64 disk_io_size, struct page **pages)
{
- struct btrfs_fs_info *fs_info = inode->root->fs_info;
struct btrfs_encoded_read_private priv = {
.inode = inode,
.file_offset = file_offset,
.pending = ATOMIC_INIT(1),
- .skip_csum = (inode->flags & BTRFS_INODE_NODATASUM),
};
unsigned long i = 0;
u64 cur = 0;
- int ret;
init_waitqueue_head(&priv.wait);
- /*
- * Submit bios for the extent, splitting due to bio or stripe limits as
- * necessary.
- */
+ /* Submit bios for the extent, splitting due to bio limits as necessary. */
while (cur < disk_io_size) {
- struct extent_map *em;
- struct btrfs_io_geometry geom;
struct bio *bio = NULL;
- u64 remaining;
+ u64 remaining = disk_io_size - cur;
- em = btrfs_get_chunk_map(fs_info, disk_bytenr + cur,
- disk_io_size - cur);
- if (IS_ERR(em)) {
- ret = PTR_ERR(em);
- } else {
- ret = btrfs_get_io_geometry(fs_info, em, BTRFS_MAP_READ,
- disk_bytenr + cur, &geom);
- free_extent_map(em);
- }
- if (ret) {
- WRITE_ONCE(priv.status, errno_to_blk_status(ret));
- break;
- }
- remaining = min(geom.len, disk_io_size - cur);
while (bio || remaining) {
size_t bytes = min_t(u64, remaining, PAGE_SIZE);
if (!bio) {
bio = btrfs_bio_alloc(BIO_MAX_VECS, REQ_OP_READ,
+ inode,
btrfs_encoded_read_endio,
&priv);
bio->bi_iter.bi_sector =
@@ -10417,14 +9962,8 @@ int btrfs_encoded_read_regular_fill_pages(struct btrfs_inode *inode,
if (!bytes ||
bio_add_page(bio, pages[i], bytes, 0) < bytes) {
- blk_status_t status;
-
- status = submit_encoded_read_bio(inode, bio, 0);
- if (status) {
- WRITE_ONCE(priv.status, status);
- bio_put(bio);
- goto out;
- }
+ atomic_inc(&priv.pending);
+ btrfs_submit_bio(bio, 0);
bio = NULL;
continue;
}
@@ -10435,7 +9974,6 @@ int btrfs_encoded_read_regular_fill_pages(struct btrfs_inode *inode,
}
}
-out:
if (atomic_dec_return(&priv.pending))
io_wait_event(priv.wait, !atomic_read(&priv.pending));
/* See btrfs_encoded_read_endio() for ordering. */
@@ -10995,9 +10533,8 @@ static int btrfs_add_swap_extent(struct swap_info_struct *sis,
return 0;
max_pages = sis->max - bsi->nr_pages;
- first_ppage = ALIGN(bsi->block_start, PAGE_SIZE) >> PAGE_SHIFT;
- next_ppage = ALIGN_DOWN(bsi->block_start + bsi->block_len,
- PAGE_SIZE) >> PAGE_SHIFT;
+ first_ppage = PAGE_ALIGN(bsi->block_start) >> PAGE_SHIFT;
+ next_ppage = PAGE_ALIGN_DOWN(bsi->block_start + bsi->block_len) >> PAGE_SHIFT;
if (first_ppage >= next_ppage)
return 0;
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 5ba1ff31713b..84626c8ad5bf 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -707,7 +707,7 @@ static noinline int create_subvol(struct mnt_idmap *idmap,
* exists).
*/
btrfs_tree_lock(leaf);
- btrfs_clean_tree_block(leaf);
+ btrfs_clear_buffer_dirty(trans, leaf);
btrfs_tree_unlock(leaf);
btrfs_free_tree_block(trans, objectid, leaf, 0, 1);
free_extent_buffer(leaf);
diff --git a/fs/btrfs/lru_cache.c b/fs/btrfs/lru_cache.c
new file mode 100644
index 000000000000..0fe0ae54ac67
--- /dev/null
+++ b/fs/btrfs/lru_cache.c
@@ -0,0 +1,166 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <linux/mm.h>
+#include "lru_cache.h"
+#include "messages.h"
+
+/*
+ * Set up an empty cache.
+ *
+ * @cache:    The cache to initialize.
+ * @max_size: Upper bound on the number of entries kept in the cache. Pass 0
+ *            for no limit, in which case the user is responsible for
+ *            trimming the cache.
+ */
+void btrfs_lru_cache_init(struct btrfs_lru_cache *cache, unsigned int max_size)
+{
+	cache->max_size = max_size;
+	cache->size = 0;
+	mt_init(&cache->entries);
+	INIT_LIST_HEAD(&cache->lru_list);
+}
+
+/*
+ * Walk the list anchored at @head looking for the entry that has both the
+ * given @key and the given @gen. Returns that entry, or NULL if the list has
+ * no such entry.
+ */
+static struct btrfs_lru_cache_entry *match_entry(struct list_head *head, u64 key,
+						 u64 gen)
+{
+	struct btrfs_lru_cache_entry *cur;
+
+	list_for_each_entry(cur, head, list) {
+		if (cur->key != key)
+			continue;
+		if (cur->gen != gen)
+			continue;
+		return cur;
+	}
+
+	return NULL;
+}
+
+/*
+ * Look up an entry in the cache.
+ *
+ * @cache: The cache to search.
+ * @key:   The key of the wanted entry.
+ * @gen:   The generation that must be associated to the key.
+ *
+ * A hit moves the entry to the tail of the cache's LRU list.
+ *
+ * Returns the matching entry, or NULL if the cache has none.
+ */
+struct btrfs_lru_cache_entry *btrfs_lru_cache_lookup(struct btrfs_lru_cache *cache,
+						     u64 key, u64 gen)
+{
+	struct list_head *bucket = mtree_load(&cache->entries, key);
+	struct btrfs_lru_cache_entry *found;
+
+	if (!bucket)
+		return NULL;
+
+	found = match_entry(bucket, key, gen);
+	if (!found)
+		return NULL;
+
+	list_move_tail(&found->lru_list, &cache->lru_list);
+	return found;
+}
+
+/*
+ * Remove an entry from the cache.
+ *
+ * @cache: The cache to remove from.
+ * @entry: The entry to remove from the cache.
+ *
+ * The entry is unlinked from both its per-key list and the LRU list. If the
+ * per-key list becomes empty, its list head is erased from the maple tree and
+ * freed as well.
+ *
+ * Note: this also frees the memory used by the entry, so the caller must not
+ * touch @entry after this returns.
+ */
+void btrfs_lru_cache_remove(struct btrfs_lru_cache *cache,
+ struct btrfs_lru_cache_entry *entry)
+{
+ /*
+ * Neighbour of the entry in its per-key list, sampled before the entry
+ * is unlinked below so that we can check afterwards if that list
+ * became empty.
+ */
+ struct list_head *prev = entry->list.prev;
+
+ ASSERT(cache->size > 0);
+ ASSERT(!mtree_empty(&cache->entries));
+
+ list_del(&entry->list);
+ list_del(&entry->lru_list);
+
+ if (list_empty(prev)) {
+ struct list_head *head;
+
+ /*
+ * If previous element in the list entry->list is now empty, it
+ * means it's a head entry not pointing to any cached entries,
+ * so remove it from the maple tree and free it.
+ */
+ head = mtree_erase(&cache->entries, entry->key);
+ ASSERT(head == prev);
+ kfree(head);
+ }
+
+ kfree(entry);
+ cache->size--;
+}
+
+/*
+ * Store an entry in the cache.
+ *
+ * @cache: The cache.
+ * @new_entry: The entry to store.
+ * @gfp: Allocation flags, used for the per-key list head and passed to
+ * the maple tree insertion.
+ *
+ * If the cache has a maximum size and already holds that many entries, the
+ * entry at the head of the LRU list (the least recently used one) is removed
+ * and freed to make room for the new one.
+ *
+ * Returns 0 on success and < 0 on error: -ENOMEM if the list head can not be
+ * allocated, -EEXIST if an entry with the same key and generation is already
+ * in the cache, or the error returned by mtree_insert(). On error @new_entry
+ * is neither linked into the cache nor freed.
+ */
+int btrfs_lru_cache_store(struct btrfs_lru_cache *cache,
+ struct btrfs_lru_cache_entry *new_entry,
+ gfp_t gfp)
+{
+ const u64 key = new_entry->key;
+ struct list_head *head;
+ int ret;
+
+ /*
+ * Allocate a list head up front, it is only kept if the key has no
+ * slot in the maple tree yet.
+ */
+ head = kmalloc(sizeof(*head), gfp);
+ if (!head)
+ return -ENOMEM;
+
+ ret = mtree_insert(&cache->entries, key, head, gfp);
+ if (ret == 0) {
+ /* New slot, our list head is now referenced by the maple tree. */
+ INIT_LIST_HEAD(head);
+ list_add_tail(&new_entry->list, head);
+ } else if (ret == -EEXIST) {
+ /*
+ * The slot already exists, so drop our list head and chain the
+ * new entry to the existing one, unless an entry with the same
+ * key and generation is already there.
+ */
+ kfree(head);
+ head = mtree_load(&cache->entries, key);
+ ASSERT(head != NULL);
+ if (match_entry(head, key, new_entry->gen) != NULL)
+ return -EEXIST;
+ list_add_tail(&new_entry->list, head);
+ } else if (ret < 0) {
+ kfree(head);
+ return ret;
+ }
+
+ /*
+ * At capacity, evict the entry at the head of the LRU list. The new
+ * entry is not in that list yet, so it can not be the one picked.
+ */
+ if (cache->max_size > 0 && cache->size == cache->max_size) {
+ struct btrfs_lru_cache_entry *lru_entry;
+
+ lru_entry = list_first_entry(&cache->lru_list,
+ struct btrfs_lru_cache_entry,
+ lru_list);
+ btrfs_lru_cache_remove(cache, lru_entry);
+ }
+
+ list_add_tail(&new_entry->lru_list, &cache->lru_list);
+ cache->size++;
+
+ return 0;
+}
+
+/*
+ * Drop every entry of a cache.
+ *
+ * @cache: The cache to empty.
+ *
+ * Entries are removed (and freed) one by one, starting at the head of the LRU
+ * list, until the list is empty.
+ */
+void btrfs_lru_cache_clear(struct btrfs_lru_cache *cache)
+{
+	while (!list_empty(&cache->lru_list)) {
+		struct btrfs_lru_cache_entry *victim;
+
+		victim = list_first_entry(&cache->lru_list,
+					  struct btrfs_lru_cache_entry,
+					  lru_list);
+		btrfs_lru_cache_remove(cache, victim);
+	}
+
+	ASSERT(cache->size == 0);
+	ASSERT(mtree_empty(&cache->entries));
+}
diff --git a/fs/btrfs/lru_cache.h b/fs/btrfs/lru_cache.h
new file mode 100644
index 000000000000..de3e18bce24a
--- /dev/null
+++ b/fs/btrfs/lru_cache.h
@@ -0,0 +1,80 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#ifndef BTRFS_LRU_CACHE_H
+#define BTRFS_LRU_CACHE_H
+
+#include <linux/maple_tree.h>
+#include <linux/list.h>
+
+/*
+ * A cache entry. This is meant to be embedded in a structure of a user of
+ * this module. Similar to how struct list_head and struct rb_node are used.
+ *
+ * Note: it should be embedded as the first element in a struct (offset 0), and
+ * this module assumes it was allocated with kmalloc(), so it calls kfree() when
+ * it needs to free an entry.
+ */
+struct btrfs_lru_cache_entry {
+ /* Link in the LRU list of the cache (struct btrfs_lru_cache::lru_list). */
+ struct list_head lru_list;
+ /* The full 64 bits key, compared together with gen when matching entries. */
+ u64 key;
+ /*
+ * Optional generation associated to a key. Use 0 if not needed/used.
+ * Entries with the same key and different generations are stored in a
+ * linked list, so use this only for cases where there's a small number
+ * of different generations.
+ */
+ u64 gen;
+ /*
+ * The maple tree uses unsigned long type for the keys, which is 32 bits
+ * on 32 bits systems, and 64 bits on 64 bits systems. So if we want to
+ * use something like inode numbers as keys, which are always a u64, we
+ * have to deal with this in a special way - we store the key in the
+ * entry itself, as a u64, and the values inserted into the maple tree
+ * are linked lists of entries - so in case we are on a 64 bits system,
+ * that list always has a single entry, while on 32 bits systems it
+ * may have more than one, with each entry having the same value for
+ * their lower 32 bits of the u64 key.
+ */
+ struct list_head list;
+};
+
+/*
+ * An LRU cache: entries are indexed by key in a maple tree and kept ordered by
+ * recency of use in a linked list.
+ */
+struct btrfs_lru_cache {
+ /*
+ * All the entries of the cache. Stores add entries at the tail and
+ * lookup hits move them to the tail, so the head is the least recently
+ * used entry.
+ */
+ struct list_head lru_list;
+ /* Maps a key to the head of the list of entries stored for that key. */
+ struct maple_tree entries;
+ /* Number of entries stored in the cache. */
+ unsigned int size;
+ /* Maximum number of entries the cache can have, 0 means no limit. */
+ unsigned int max_size;
+};
+
+/*
+ * Iterate over all the entries of a cache, walking the LRU list backwards,
+ * that is, from its tail (most recently used) to its head (least recently
+ * used). Uses the "safe" list iterator, with @tmp as the extra cursor it
+ * needs, so that @entry can be removed from the cache while iterating.
+ */
+#define btrfs_lru_cache_for_each_entry_safe(cache, entry, tmp) \
+ list_for_each_entry_safe_reverse((entry), (tmp), &(cache)->lru_list, lru_list)
+
+/* Return the number of entries currently stored in the cache. */
+static inline unsigned int btrfs_lru_cache_size(const struct btrfs_lru_cache *cache)
+{
+ return cache->size;
+}
+
+/*
+ * Check if a cache has reached its maximum number of entries.
+ *
+ * A cache initialized with a max_size of 0 has no limit (trimming it is up to
+ * the user), so it is never reported as full. This matches
+ * btrfs_lru_cache_store(), which only enforces the limit when max_size > 0.
+ */
+static inline bool btrfs_lru_cache_is_full(const struct btrfs_lru_cache *cache)
+{
+	return cache->max_size > 0 && cache->size >= cache->max_size;
+}
+
+/*
+ * Return the entry at the head of the LRU list, which is the least recently
+ * used one, or NULL if the list is empty. The entry is not removed from the
+ * cache and its position in the LRU list is not changed.
+ */
+static inline struct btrfs_lru_cache_entry *btrfs_lru_cache_lru_entry(
+ struct btrfs_lru_cache *cache)
+{
+ return list_first_entry_or_null(&cache->lru_list,
+ struct btrfs_lru_cache_entry, lru_list);
+}
+
+void btrfs_lru_cache_init(struct btrfs_lru_cache *cache, unsigned int max_size);
+struct btrfs_lru_cache_entry *btrfs_lru_cache_lookup(struct btrfs_lru_cache *cache,
+ u64 key, u64 gen);
+int btrfs_lru_cache_store(struct btrfs_lru_cache *cache,
+ struct btrfs_lru_cache_entry *new_entry,
+ gfp_t gfp);
+void btrfs_lru_cache_remove(struct btrfs_lru_cache *cache,
+ struct btrfs_lru_cache_entry *entry);
+void btrfs_lru_cache_clear(struct btrfs_lru_cache *cache);
+
+#endif
diff --git a/fs/btrfs/lzo.c b/fs/btrfs/lzo.c
index d5e78cbc8fbc..71f6d8302d50 100644
--- a/fs/btrfs/lzo.c
+++ b/fs/btrfs/lzo.c
@@ -280,7 +280,7 @@ int lzo_compress_pages(struct list_head *ws, struct address_space *mapping,
}
/* Check if we have reached page boundary */
- if (IS_ALIGNED(cur_in, PAGE_SIZE)) {
+ if (PAGE_ALIGNED(cur_in)) {
put_page(page_in);
page_in = NULL;
}
diff --git a/fs/btrfs/messages.c b/fs/btrfs/messages.c
index 625bbbbb2608..fde5aaa6e7c9 100644
--- a/fs/btrfs/messages.c
+++ b/fs/btrfs/messages.c
@@ -293,36 +293,6 @@ void __cold btrfs_err_32bit_limit(struct btrfs_fs_info *fs_info)
#endif
/*
- * We only mark the transaction aborted and then set the file system read-only.
- * This will prevent new transactions from starting or trying to join this
- * one.
- *
- * This means that error recovery at the call site is limited to freeing
- * any local memory allocations and passing the error code up without
- * further cleanup. The transaction should complete as it normally would
- * in the call path but will return -EIO.
- *
- * We'll complete the cleanup in btrfs_end_transaction and
- * btrfs_commit_transaction.
- */
-__cold
-void __btrfs_abort_transaction(struct btrfs_trans_handle *trans,
- const char *function,
- unsigned int line, int errno, bool first_hit)
-{
- struct btrfs_fs_info *fs_info = trans->fs_info;
-
- WRITE_ONCE(trans->aborted, errno);
- WRITE_ONCE(trans->transaction->aborted, errno);
- if (first_hit && errno == -ENOSPC)
- btrfs_dump_space_info_for_trans_abort(fs_info);
- /* Wake up anybody who may be waiting on this transaction */
- wake_up(&fs_info->transaction_wait);
- wake_up(&fs_info->transaction_blocked_wait);
- __btrfs_handle_fs_error(fs_info, function, line, errno, NULL);
-}
-
-/*
* __btrfs_panic decodes unexpected, fatal errors from the caller, issues an
* alert, and either panics or BUGs, depending on mount options.
*/
diff --git a/fs/btrfs/messages.h b/fs/btrfs/messages.h
index 190af1f698d9..8c516ee58ff9 100644
--- a/fs/btrfs/messages.h
+++ b/fs/btrfs/messages.h
@@ -6,7 +6,6 @@
#include <linux/types.h>
struct btrfs_fs_info;
-struct btrfs_trans_handle;
static inline __printf(2, 3) __cold
void btrfs_no_printk(const struct btrfs_fs_info *fs_info, const char *fmt, ...)
@@ -178,39 +177,6 @@ void __btrfs_handle_fs_error(struct btrfs_fs_info *fs_info, const char *function
const char * __attribute_const__ btrfs_decode_error(int errno);
-__cold
-void __btrfs_abort_transaction(struct btrfs_trans_handle *trans,
- const char *function,
- unsigned int line, int errno, bool first_hit);
-
-bool __cold abort_should_print_stack(int errno);
-
-/*
- * Call btrfs_abort_transaction as early as possible when an error condition is
- * detected, that way the exact stack trace is reported for some errors.
- */
-#define btrfs_abort_transaction(trans, errno) \
-do { \
- bool first = false; \
- /* Report first abort since mount */ \
- if (!test_and_set_bit(BTRFS_FS_STATE_TRANS_ABORTED, \
- &((trans)->fs_info->fs_state))) { \
- first = true; \
- if (WARN(abort_should_print_stack(errno), \
- KERN_ERR \
- "BTRFS: Transaction aborted (error %d)\n", \
- (errno))) { \
- /* Stack trace printed. */ \
- } else { \
- btrfs_err((trans)->fs_info, \
- "Transaction aborted (error %d)", \
- (errno)); \
- } \
- } \
- __btrfs_abort_transaction((trans), __func__, \
- __LINE__, (errno), first); \
-} while (0)
-
#define btrfs_handle_fs_error(fs_info, errno, fmt, args...) \
__btrfs_handle_fs_error((fs_info), __func__, __LINE__, \
(errno), fmt, ##args)
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index 57d8c72737e1..6c24b69e2d0a 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -616,7 +616,7 @@ static void btrfs_run_ordered_extent_work(struct btrfs_work *work)
struct btrfs_ordered_extent *ordered;
ordered = container_of(work, struct btrfs_ordered_extent, flush_work);
- btrfs_start_ordered_extent(ordered, 1);
+ btrfs_start_ordered_extent(ordered);
complete(&ordered->completion);
}
@@ -716,13 +716,12 @@ void btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, u64 nr,
}
/*
- * Used to start IO or wait for a given ordered extent to finish.
+ * Start IO and wait for a given ordered extent to finish.
*
- * If wait is one, this effectively waits on page writeback for all the pages
- * in the extent, and it waits on the io completion code to insert
- * metadata into the btree corresponding to the extent
+ * Wait on page writeback for all the pages in the extent and the IO completion
+ * code to insert metadata into the btree corresponding to the extent.
*/
-void btrfs_start_ordered_extent(struct btrfs_ordered_extent *entry, int wait)
+void btrfs_start_ordered_extent(struct btrfs_ordered_extent *entry)
{
u64 start = entry->file_offset;
u64 end = start + entry->num_bytes - 1;
@@ -744,12 +743,10 @@ void btrfs_start_ordered_extent(struct btrfs_ordered_extent *entry, int wait)
*/
if (!test_bit(BTRFS_ORDERED_DIRECT, &entry->flags))
filemap_fdatawrite_range(inode->vfs_inode.i_mapping, start, end);
- if (wait) {
- if (!freespace_inode)
- btrfs_might_wait_for_event(inode->root->fs_info, btrfs_ordered_extent);
- wait_event(entry->wait, test_bit(BTRFS_ORDERED_COMPLETE,
- &entry->flags));
- }
+
+ if (!freespace_inode)
+ btrfs_might_wait_for_event(inode->root->fs_info, btrfs_ordered_extent);
+ wait_event(entry->wait, test_bit(BTRFS_ORDERED_COMPLETE, &entry->flags));
}
/*
@@ -800,7 +797,7 @@ int btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len)
btrfs_put_ordered_extent(ordered);
break;
}
- btrfs_start_ordered_extent(ordered, 1);
+ btrfs_start_ordered_extent(ordered);
end = ordered->file_offset;
/*
* If the ordered extent had an error save the error but don't
@@ -1061,7 +1058,7 @@ void btrfs_lock_and_flush_ordered_range(struct btrfs_inode *inode, u64 start,
break;
}
unlock_extent(&inode->io_tree, start, end, cachedp);
- btrfs_start_ordered_extent(ordered, 1);
+ btrfs_start_ordered_extent(ordered);
btrfs_put_ordered_extent(ordered);
}
}
diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h
index 89f82b78f590..eb40cb39f842 100644
--- a/fs/btrfs/ordered-data.h
+++ b/fs/btrfs/ordered-data.h
@@ -157,7 +157,6 @@ struct btrfs_ordered_extent {
* command in a workqueue context
*/
u64 physical;
- struct block_device *bdev;
};
static inline void
@@ -187,7 +186,7 @@ void btrfs_add_ordered_sum(struct btrfs_ordered_extent *entry,
struct btrfs_ordered_sum *sum);
struct btrfs_ordered_extent *btrfs_lookup_ordered_extent(struct btrfs_inode *inode,
u64 file_offset);
-void btrfs_start_ordered_extent(struct btrfs_ordered_extent *entry, int wait);
+void btrfs_start_ordered_extent(struct btrfs_ordered_extent *entry);
int btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len);
struct btrfs_ordered_extent *
btrfs_lookup_first_ordered_extent(struct btrfs_inode *inode, u64 file_offset);
diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
index af97413abcf4..52a7d2fa2284 100644
--- a/fs/btrfs/qgroup.c
+++ b/fs/btrfs/qgroup.c
@@ -1304,7 +1304,7 @@ int btrfs_quota_disable(struct btrfs_fs_info *fs_info)
list_del(&quota_root->dirty_list);
btrfs_tree_lock(quota_root->node);
- btrfs_clean_tree_block(quota_root->node);
+ btrfs_clear_buffer_dirty(trans, quota_root->node);
btrfs_tree_unlock(quota_root->node);
btrfs_free_tree_block(trans, btrfs_root_id(quota_root),
quota_root->node, 0, 1);
diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c
index ff4b1d583788..642828c1b299 100644
--- a/fs/btrfs/raid56.c
+++ b/fs/btrfs/raid56.c
@@ -998,7 +998,7 @@ static int alloc_rbio_parity_pages(struct btrfs_raid_bio *rbio)
}
/*
- * Return the total numer of errors found in the vertical stripe of @sector_nr.
+ * Return the total number of errors found in the vertical stripe of @sector_nr.
*
* @faila and @failb will also be updated to the first and second stripe
* number of the errors.
@@ -1183,7 +1183,15 @@ not_found:
trace_info->stripe_nr = -1;
}
-/* Generate PQ for one veritical stripe. */
+static inline void bio_list_put(struct bio_list *bio_list)
+{
+ struct bio *bio;
+
+ while ((bio = bio_list_pop(bio_list)))
+ bio_put(bio);
+}
+
+/* Generate PQ for one vertical stripe. */
static void generate_pq_vertical(struct btrfs_raid_bio *rbio, int sectornr)
{
void **pointers = rbio->finish_pointers;
@@ -1228,7 +1236,6 @@ static void generate_pq_vertical(struct btrfs_raid_bio *rbio, int sectornr)
static int rmw_assemble_write_bios(struct btrfs_raid_bio *rbio,
struct bio_list *bio_list)
{
- struct bio *bio;
/* The total sector number inside the full stripe. */
int total_sector_nr;
int sectornr;
@@ -1317,8 +1324,7 @@ static int rmw_assemble_write_bios(struct btrfs_raid_bio *rbio,
return 0;
error:
- while ((bio = bio_list_pop(bio_list)))
- bio_put(bio);
+ bio_list_put(bio_list);
return -EIO;
}
@@ -1357,7 +1363,7 @@ static void set_rbio_range_error(struct btrfs_raid_bio *rbio, struct bio *bio)
}
/*
- * For subpage case, we can no longer set page Uptodate directly for
+ * For subpage case, we can no longer set page Up-to-date directly for
* stripe_pages[], thus we need to locate the sector.
*/
static struct sector_ptr *find_stripe_sector(struct btrfs_raid_bio *rbio,
@@ -1425,10 +1431,9 @@ static void rbio_update_error_bitmap(struct btrfs_raid_bio *rbio, struct bio *bi
int total_sector_nr = get_bio_sector_nr(rbio, bio);
u32 bio_size = 0;
struct bio_vec *bvec;
- struct bvec_iter_all iter_all;
int i;
- bio_for_each_segment_all(bvec, bio, iter_all)
+ bio_for_each_bvec_all(bvec, bio, i)
bio_size += bvec->bv_len;
/*
@@ -1498,7 +1503,7 @@ static void raid_wait_read_end_io(struct bio *bio)
wake_up(&rbio->io_wait);
}
-static void submit_read_bios(struct btrfs_raid_bio *rbio,
+static void submit_read_wait_bio_list(struct btrfs_raid_bio *rbio,
struct bio_list *bio_list)
{
struct bio *bio;
@@ -1515,41 +1520,8 @@ static void submit_read_bios(struct btrfs_raid_bio *rbio,
}
submit_bio(bio);
}
-}
-
-static int rmw_assemble_read_bios(struct btrfs_raid_bio *rbio,
- struct bio_list *bio_list)
-{
- struct bio *bio;
- int total_sector_nr;
- int ret = 0;
-
- ASSERT(bio_list_size(bio_list) == 0);
-
- /*
- * Build a list of bios to read all sectors (including data and P/Q).
- *
- * This behaviro is to compensate the later csum verification and
- * recovery.
- */
- for (total_sector_nr = 0; total_sector_nr < rbio->nr_sectors;
- total_sector_nr++) {
- struct sector_ptr *sector;
- int stripe = total_sector_nr / rbio->stripe_nsectors;
- int sectornr = total_sector_nr % rbio->stripe_nsectors;
-
- sector = rbio_stripe_sector(rbio, stripe, sectornr);
- ret = rbio_add_io_sector(rbio, bio_list, sector,
- stripe, sectornr, REQ_OP_READ);
- if (ret)
- goto cleanup;
- }
- return 0;
-cleanup:
- while ((bio = bio_list_pop(bio_list)))
- bio_put(bio);
- return ret;
+ wait_event(rbio->io_wait, atomic_read(&rbio->stripes_pending) == 0);
}
static int alloc_rbio_data_pages(struct btrfs_raid_bio *rbio)
@@ -1668,12 +1640,12 @@ void raid56_parity_write(struct bio *bio, struct btrfs_io_context *bioc)
struct btrfs_raid_bio *rbio;
struct btrfs_plug_cb *plug = NULL;
struct blk_plug_cb *cb;
- int ret = 0;
rbio = alloc_rbio(fs_info, bioc);
if (IS_ERR(rbio)) {
- ret = PTR_ERR(rbio);
- goto fail;
+ bio->bi_status = errno_to_blk_status(PTR_ERR(rbio));
+ bio_endio(bio);
+ return;
}
rbio->operation = BTRFS_RBIO_WRITE;
rbio_add_bio(rbio, bio);
@@ -1682,31 +1654,24 @@ void raid56_parity_write(struct bio *bio, struct btrfs_io_context *bioc)
* Don't plug on full rbios, just get them out the door
* as quickly as we can
*/
- if (rbio_is_full(rbio))
- goto queue_rbio;
-
- cb = blk_check_plugged(raid_unplug, fs_info, sizeof(*plug));
- if (cb) {
- plug = container_of(cb, struct btrfs_plug_cb, cb);
- if (!plug->info) {
- plug->info = fs_info;
- INIT_LIST_HEAD(&plug->rbio_list);
+ if (!rbio_is_full(rbio)) {
+ cb = blk_check_plugged(raid_unplug, fs_info, sizeof(*plug));
+ if (cb) {
+ plug = container_of(cb, struct btrfs_plug_cb, cb);
+ if (!plug->info) {
+ plug->info = fs_info;
+ INIT_LIST_HEAD(&plug->rbio_list);
+ }
+ list_add_tail(&rbio->plug_list, &plug->rbio_list);
+ return;
}
- list_add_tail(&rbio->plug_list, &plug->rbio_list);
- return;
}
-queue_rbio:
+
/*
* Either we don't have any existing plug, or we're doing a full stripe,
- * can queue the rmw work now.
+ * queue the rmw work now.
*/
start_async_work(rbio, rmw_rbio_work);
-
- return;
-
-fail:
- bio->bi_status = errno_to_blk_status(ret);
- bio_endio(bio);
}
static int verify_one_sector(struct btrfs_raid_bio *rbio,
@@ -1773,7 +1738,7 @@ static int recover_vertical(struct btrfs_raid_bio *rbio, int sector_nr,
found_errors = get_rbio_veritical_errors(rbio, sector_nr, &faila,
&failb);
/*
- * No errors in the veritical stripe, skip it. Can happen for recovery
+ * No errors in the vertical stripe, skip it. Can happen for recovery
* which only part of a stripe failed csum check.
*/
if (!found_errors)
@@ -1949,14 +1914,25 @@ out:
return ret;
}
-static int recover_assemble_read_bios(struct btrfs_raid_bio *rbio,
- struct bio_list *bio_list)
+static void recover_rbio(struct btrfs_raid_bio *rbio)
{
- struct bio *bio;
+ struct bio_list bio_list = BIO_EMPTY_LIST;
int total_sector_nr;
int ret = 0;
- ASSERT(bio_list_size(bio_list) == 0);
+ /*
+ * Either we're doing recover for a read failure or degraded write,
+ * caller should have set error bitmap correctly.
+ */
+ ASSERT(bitmap_weight(rbio->error_bitmap, rbio->nr_sectors));
+
+ /* For recovery, we need to read all sectors including P/Q. */
+ ret = alloc_rbio_pages(rbio);
+ if (ret < 0)
+ goto out;
+
+ index_rbio_pages(rbio);
+
/*
* Read everything that hasn't failed. However this time we will
* not trust any cached sector.
@@ -1987,78 +1963,32 @@ static int recover_assemble_read_bios(struct btrfs_raid_bio *rbio,
}
sector = rbio_stripe_sector(rbio, stripe, sectornr);
- ret = rbio_add_io_sector(rbio, bio_list, sector, stripe,
+ ret = rbio_add_io_sector(rbio, &bio_list, sector, stripe,
sectornr, REQ_OP_READ);
- if (ret < 0)
- goto error;
+ if (ret < 0) {
+ bio_list_put(&bio_list);
+ goto out;
+ }
}
- return 0;
-error:
- while ((bio = bio_list_pop(bio_list)))
- bio_put(bio);
-
- return -EIO;
-}
-
-static int recover_rbio(struct btrfs_raid_bio *rbio)
-{
- struct bio_list bio_list;
- struct bio *bio;
- int ret;
-
- /*
- * Either we're doing recover for a read failure or degraded write,
- * caller should have set error bitmap correctly.
- */
- ASSERT(bitmap_weight(rbio->error_bitmap, rbio->nr_sectors));
- bio_list_init(&bio_list);
-
- /* For recovery, we need to read all sectors including P/Q. */
- ret = alloc_rbio_pages(rbio);
- if (ret < 0)
- goto out;
-
- index_rbio_pages(rbio);
-
- ret = recover_assemble_read_bios(rbio, &bio_list);
- if (ret < 0)
- goto out;
-
- submit_read_bios(rbio, &bio_list);
- wait_event(rbio->io_wait, atomic_read(&rbio->stripes_pending) == 0);
+ submit_read_wait_bio_list(rbio, &bio_list);
ret = recover_sectors(rbio);
-
out:
- while ((bio = bio_list_pop(&bio_list)))
- bio_put(bio);
-
- return ret;
+ rbio_orig_end_io(rbio, errno_to_blk_status(ret));
}
static void recover_rbio_work(struct work_struct *work)
{
struct btrfs_raid_bio *rbio;
- int ret;
rbio = container_of(work, struct btrfs_raid_bio, work);
-
- ret = lock_stripe_add(rbio);
- if (ret == 0) {
- ret = recover_rbio(rbio);
- rbio_orig_end_io(rbio, errno_to_blk_status(ret));
- }
+ if (!lock_stripe_add(rbio))
+ recover_rbio(rbio);
}
static void recover_rbio_work_locked(struct work_struct *work)
{
- struct btrfs_raid_bio *rbio;
- int ret;
-
- rbio = container_of(work, struct btrfs_raid_bio, work);
-
- ret = recover_rbio(rbio);
- rbio_orig_end_io(rbio, errno_to_blk_status(ret));
+ recover_rbio(container_of(work, struct btrfs_raid_bio, work));
}
static void set_rbio_raid6_extra_error(struct btrfs_raid_bio *rbio, int mirror_num)
@@ -2204,11 +2134,9 @@ no_csum:
static int rmw_read_wait_recover(struct btrfs_raid_bio *rbio)
{
- struct bio_list bio_list;
- struct bio *bio;
- int ret;
-
- bio_list_init(&bio_list);
+ struct bio_list bio_list = BIO_EMPTY_LIST;
+ int total_sector_nr;
+ int ret = 0;
/*
* Fill the data csums we need for data verification. We need to fill
@@ -2217,24 +2145,32 @@ static int rmw_read_wait_recover(struct btrfs_raid_bio *rbio)
*/
fill_data_csums(rbio);
- ret = rmw_assemble_read_bios(rbio, &bio_list);
- if (ret < 0)
- goto out;
+ /*
+ * Build a list of bios to read all sectors (including data and P/Q).
+ *
+ * This behavior is to compensate the later csum verification and recovery.
+ */
+ for (total_sector_nr = 0; total_sector_nr < rbio->nr_sectors;
+ total_sector_nr++) {
+ struct sector_ptr *sector;
+ int stripe = total_sector_nr / rbio->stripe_nsectors;
+ int sectornr = total_sector_nr % rbio->stripe_nsectors;
- submit_read_bios(rbio, &bio_list);
- wait_event(rbio->io_wait, atomic_read(&rbio->stripes_pending) == 0);
+ sector = rbio_stripe_sector(rbio, stripe, sectornr);
+ ret = rbio_add_io_sector(rbio, &bio_list, sector,
+ stripe, sectornr, REQ_OP_READ);
+ if (ret) {
+ bio_list_put(&bio_list);
+ return ret;
+ }
+ }
/*
* We may or may not have any corrupted sectors (including missing dev
* and csum mismatch), just let recover_sectors() to handle them all.
*/
- ret = recover_sectors(rbio);
- return ret;
-out:
- while ((bio = bio_list_pop(&bio_list)))
- bio_put(bio);
-
- return ret;
+ submit_read_wait_bio_list(rbio, &bio_list);
+ return recover_sectors(rbio);
}
static void raid_wait_write_end_io(struct bio *bio)
@@ -2290,7 +2226,7 @@ static bool need_read_stripe_sectors(struct btrfs_raid_bio *rbio)
return false;
}
-static int rmw_rbio(struct btrfs_raid_bio *rbio)
+static void rmw_rbio(struct btrfs_raid_bio *rbio)
{
struct bio_list bio_list;
int sectornr;
@@ -2302,30 +2238,28 @@ static int rmw_rbio(struct btrfs_raid_bio *rbio)
*/
ret = alloc_rbio_parity_pages(rbio);
if (ret < 0)
- return ret;
+ goto out;
/*
* Either full stripe write, or we have every data sector already
* cached, can go to write path immediately.
*/
- if (rbio_is_full(rbio) || !need_read_stripe_sectors(rbio))
- goto write;
-
- /*
- * Now we're doing sub-stripe write, also need all data stripes to do
- * the full RMW.
- */
- ret = alloc_rbio_data_pages(rbio);
- if (ret < 0)
- return ret;
+ if (!rbio_is_full(rbio) && need_read_stripe_sectors(rbio)) {
+ /*
+ * Now we're doing sub-stripe write, also need all data stripes
+ * to do the full RMW.
+ */
+ ret = alloc_rbio_data_pages(rbio);
+ if (ret < 0)
+ goto out;
- index_rbio_pages(rbio);
+ index_rbio_pages(rbio);
- ret = rmw_read_wait_recover(rbio);
- if (ret < 0)
- return ret;
+ ret = rmw_read_wait_recover(rbio);
+ if (ret < 0)
+ goto out;
+ }
-write:
/*
* At this stage we're not allowed to add any new bios to the
* bio list any more, anyone else that wants to change this stripe
@@ -2356,7 +2290,7 @@ write:
bio_list_init(&bio_list);
ret = rmw_assemble_write_bios(rbio, &bio_list);
if (ret < 0)
- return ret;
+ goto out;
/* We should have at least one bio assembled. */
ASSERT(bio_list_size(&bio_list));
@@ -2373,32 +2307,22 @@ write:
break;
}
}
- return ret;
+out:
+ rbio_orig_end_io(rbio, errno_to_blk_status(ret));
}
static void rmw_rbio_work(struct work_struct *work)
{
struct btrfs_raid_bio *rbio;
- int ret;
rbio = container_of(work, struct btrfs_raid_bio, work);
-
- ret = lock_stripe_add(rbio);
- if (ret == 0) {
- ret = rmw_rbio(rbio);
- rbio_orig_end_io(rbio, errno_to_blk_status(ret));
- }
+ if (lock_stripe_add(rbio) == 0)
+ rmw_rbio(rbio);
}
static void rmw_rbio_work_locked(struct work_struct *work)
{
- struct btrfs_raid_bio *rbio;
- int ret;
-
- rbio = container_of(work, struct btrfs_raid_bio, work);
-
- ret = rmw_rbio(rbio);
- rbio_orig_end_io(rbio, errno_to_blk_status(ret));
+ rmw_rbio(container_of(work, struct btrfs_raid_bio, work));
}
/*
@@ -2506,7 +2430,6 @@ static int finish_parity_scrub(struct btrfs_raid_bio *rbio, int need_check)
struct sector_ptr p_sector = { 0 };
struct sector_ptr q_sector = { 0 };
struct bio_list bio_list;
- struct bio *bio;
int is_replace = 0;
int ret;
@@ -2637,8 +2560,7 @@ submit_write:
return 0;
cleanup:
- while ((bio = bio_list_pop(&bio_list)))
- bio_put(bio);
+ bio_list_put(&bio_list);
return ret;
}
@@ -2733,15 +2655,12 @@ out:
return ret;
}
-static int scrub_assemble_read_bios(struct btrfs_raid_bio *rbio,
- struct bio_list *bio_list)
+static int scrub_assemble_read_bios(struct btrfs_raid_bio *rbio)
{
- struct bio *bio;
+ struct bio_list bio_list = BIO_EMPTY_LIST;
int total_sector_nr;
int ret = 0;
- ASSERT(bio_list_size(bio_list) == 0);
-
/* Build a list of bios to read all the missing parts. */
for (total_sector_nr = 0; total_sector_nr < rbio->nr_sectors;
total_sector_nr++) {
@@ -2770,45 +2689,38 @@ static int scrub_assemble_read_bios(struct btrfs_raid_bio *rbio,
if (sector->uptodate)
continue;
- ret = rbio_add_io_sector(rbio, bio_list, sector, stripe,
+ ret = rbio_add_io_sector(rbio, &bio_list, sector, stripe,
sectornr, REQ_OP_READ);
- if (ret)
- goto error;
+ if (ret) {
+ bio_list_put(&bio_list);
+ return ret;
+ }
}
+
+ submit_read_wait_bio_list(rbio, &bio_list);
return 0;
-error:
- while ((bio = bio_list_pop(bio_list)))
- bio_put(bio);
- return ret;
}
-static int scrub_rbio(struct btrfs_raid_bio *rbio)
+static void scrub_rbio(struct btrfs_raid_bio *rbio)
{
bool need_check = false;
- struct bio_list bio_list;
int sector_nr;
int ret;
- struct bio *bio;
-
- bio_list_init(&bio_list);
ret = alloc_rbio_essential_pages(rbio);
if (ret)
- goto cleanup;
+ goto out;
bitmap_clear(rbio->error_bitmap, 0, rbio->nr_sectors);
- ret = scrub_assemble_read_bios(rbio, &bio_list);
+ ret = scrub_assemble_read_bios(rbio);
if (ret < 0)
- goto cleanup;
-
- submit_read_bios(rbio, &bio_list);
- wait_event(rbio->io_wait, atomic_read(&rbio->stripes_pending) == 0);
+ goto out;
/* We may have some failures, recover the failed sectors first. */
ret = recover_scrub_rbio(rbio);
if (ret < 0)
- goto cleanup;
+ goto out;
/*
* We have every sector properly prepared. Can finish the scrub
@@ -2825,23 +2737,13 @@ static int scrub_rbio(struct btrfs_raid_bio *rbio)
break;
}
}
- return ret;
-
-cleanup:
- while ((bio = bio_list_pop(&bio_list)))
- bio_put(bio);
-
- return ret;
+out:
+ rbio_orig_end_io(rbio, errno_to_blk_status(ret));
}
static void scrub_rbio_work_locked(struct work_struct *work)
{
- struct btrfs_raid_bio *rbio;
- int ret;
-
- rbio = container_of(work, struct btrfs_raid_bio, work);
- ret = scrub_rbio(rbio);
- rbio_orig_end_io(rbio, errno_to_blk_status(ret));
+ scrub_rbio(container_of(work, struct btrfs_raid_bio, work));
}
void raid56_parity_submit_scrub_rbio(struct btrfs_raid_bio *rbio)
diff --git a/fs/btrfs/raid56.h b/fs/btrfs/raid56.h
index 7c73a443939e..df0e0abdeb1f 100644
--- a/fs/btrfs/raid56.h
+++ b/fs/btrfs/raid56.h
@@ -65,7 +65,7 @@ struct btrfs_raid_bio {
/* Number of data stripes (no p/q) */
u8 nr_data;
- /* Numer of all stripes (including P/Q) */
+ /* Number of all stripes (including P/Q) */
u8 real_stripes;
/* How many pages there are for each stripe */
@@ -132,7 +132,7 @@ struct btrfs_raid_bio {
/*
* Checksum buffer if the rbio is for data. The buffer should cover
- * all data sectors (exlcuding P/Q sectors).
+ * all data sectors (excluding P/Q sectors).
*/
u8 *csum_buf;
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index 31ec4a7658ce..ef13a9d4e370 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -2825,7 +2825,7 @@ static noinline_for_stack int prealloc_file_extent_cluster(
*
* Here we have to manually invalidate the range (i_size, PAGE_END + 1).
*/
- if (!IS_ALIGNED(i_size, PAGE_SIZE)) {
+ if (!PAGE_ALIGNED(i_size)) {
struct address_space *mapping = inode->vfs_inode.i_mapping;
struct btrfs_fs_info *fs_info = inode->root->fs_info;
const u32 sectorsize = fs_info->sectorsize;
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index 52b346795f66..69c93ae333f6 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -229,7 +229,7 @@ struct full_stripe_lock {
};
#ifndef CONFIG_64BIT
-/* This structure is for archtectures whose (void *) is smaller than u64 */
+/* This structure is for architectures whose (void *) is smaller than u64 */
struct scrub_page_private {
u64 logical;
};
@@ -2053,20 +2053,33 @@ static int scrub_checksum_tree_block(struct scrub_block *sblock)
* a) don't have an extent buffer and
* b) the page is already kmapped
*/
- if (sblock->logical != btrfs_stack_header_bytenr(h))
+ if (sblock->logical != btrfs_stack_header_bytenr(h)) {
sblock->header_error = 1;
-
- if (sector->generation != btrfs_stack_header_generation(h)) {
- sblock->header_error = 1;
- sblock->generation_error = 1;
+ btrfs_warn_rl(fs_info,
+ "tree block %llu mirror %u has bad bytenr, has %llu want %llu",
+ sblock->logical, sblock->mirror_num,
+ btrfs_stack_header_bytenr(h),
+ sblock->logical);
+ goto out;
}
- if (!scrub_check_fsid(h->fsid, sector))
+ if (!scrub_check_fsid(h->fsid, sector)) {
sblock->header_error = 1;
+ btrfs_warn_rl(fs_info,
+ "tree block %llu mirror %u has bad fsid, has %pU want %pU",
+ sblock->logical, sblock->mirror_num,
+ h->fsid, sblock->dev->fs_devices->fsid);
+ goto out;
+ }
- if (memcmp(h->chunk_tree_uuid, fs_info->chunk_tree_uuid,
- BTRFS_UUID_SIZE))
+ if (memcmp(h->chunk_tree_uuid, fs_info->chunk_tree_uuid, BTRFS_UUID_SIZE)) {
sblock->header_error = 1;
+ btrfs_warn_rl(fs_info,
+ "tree block %llu mirror %u has bad chunk tree uuid, has %pU want %pU",
+ sblock->logical, sblock->mirror_num,
+ h->chunk_tree_uuid, fs_info->chunk_tree_uuid);
+ goto out;
+ }
shash->tfm = fs_info->csum_shash;
crypto_shash_init(shash);
@@ -2079,9 +2092,27 @@ static int scrub_checksum_tree_block(struct scrub_block *sblock)
}
crypto_shash_final(shash, calculated_csum);
- if (memcmp(calculated_csum, on_disk_csum, sctx->fs_info->csum_size))
+ if (memcmp(calculated_csum, on_disk_csum, sctx->fs_info->csum_size)) {
sblock->checksum_error = 1;
+ btrfs_warn_rl(fs_info,
+ "tree block %llu mirror %u has bad csum, has " CSUM_FMT " want " CSUM_FMT,
+ sblock->logical, sblock->mirror_num,
+ CSUM_FMT_VALUE(fs_info->csum_size, on_disk_csum),
+ CSUM_FMT_VALUE(fs_info->csum_size, calculated_csum));
+ goto out;
+ }
+
+ if (sector->generation != btrfs_stack_header_generation(h)) {
+ sblock->header_error = 1;
+ sblock->generation_error = 1;
+ btrfs_warn_rl(fs_info,
+ "tree block %llu mirror %u has bad generation, has %llu want %llu",
+ sblock->logical, sblock->mirror_num,
+ btrfs_stack_header_generation(h),
+ sector->generation);
+ }
+out:
return sblock->header_error || sblock->checksum_error;
}
diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c
index d50182b6deec..e5c963bb873d 100644
--- a/fs/btrfs/send.c
+++ b/fs/btrfs/send.c
@@ -32,6 +32,7 @@
#include "file-item.h"
#include "ioctl.h"
#include "verity.h"
+#include "lru_cache.h"
/*
* Maximum number of references an extent can have in order for us to attempt to
@@ -80,23 +81,23 @@ struct clone_root {
bool found_ref;
};
-#define SEND_CTX_MAX_NAME_CACHE_SIZE 128
-#define SEND_CTX_NAME_CACHE_CLEAN_SIZE (SEND_CTX_MAX_NAME_CACHE_SIZE * 2)
+#define SEND_MAX_NAME_CACHE_SIZE 256
/*
- * Limit the root_ids array of struct backref_cache_entry to 12 elements.
- * This makes the size of a cache entry to be exactly 128 bytes on x86_64.
+ * Limit the root_ids array of struct backref_cache_entry to 17 elements.
+ * This makes the size of a cache entry to be exactly 192 bytes on x86_64, which
+ * can be satisfied from the kmalloc-192 slab, without wasting any space.
* The most common case is to have a single root for cloning, which corresponds
- * to the send root. Having the user specify more than 11 clone roots is not
+ * to the send root. Having the user specify more than 16 clone roots is not
* common, and in such rare cases we simply don't use caching if the number of
- * cloning roots that lead down to a leaf is more than 12.
+ * cloning roots that lead down to a leaf is more than 17.
*/
-#define SEND_MAX_BACKREF_CACHE_ROOTS 12
+#define SEND_MAX_BACKREF_CACHE_ROOTS 17
/*
* Max number of entries in the cache.
- * With SEND_MAX_BACKREF_CACHE_ROOTS as 12, the size in bytes, excluding
- * maple tree's internal nodes, is 16K.
+ * With SEND_MAX_BACKREF_CACHE_ROOTS as 17, the size in bytes, excluding
+ * maple tree's internal nodes, is 24K.
*/
#define SEND_MAX_BACKREF_CACHE_SIZE 128
@@ -107,15 +108,31 @@ struct clone_root {
* x86_64).
*/
struct backref_cache_entry {
- /* List to link to the cache's lru list. */
- struct list_head list;
- /* The key for this entry in the cache. */
- u64 key;
+ struct btrfs_lru_cache_entry entry;
u64 root_ids[SEND_MAX_BACKREF_CACHE_ROOTS];
/* Number of valid elements in the root_ids array. */
int num_roots;
};
+/* See the comment at lru_cache.h about struct btrfs_lru_cache_entry. */
+static_assert(offsetof(struct backref_cache_entry, entry) == 0);
+
+/*
+ * Max number of entries in the cache that stores directories that were already
+ * created. The cache uses raw struct btrfs_lru_cache_entry entries, so it uses
+ * at most 4096 bytes - sizeof(struct btrfs_lru_cache_entry) is 48 bytes, but
+ * the kmalloc-64 slab is used, so we get 4096 bytes (64 bytes * 64).
+ */
+#define SEND_MAX_DIR_CREATED_CACHE_SIZE 64
+
+/*
+ * Max number of entries in the cache that stores directories whose utimes are
+ * pending. The cache uses raw struct btrfs_lru_cache_entry entries, so it uses
+ * at most 4096 bytes - sizeof(struct btrfs_lru_cache_entry) is 48 bytes, but
+ * the kmalloc-64 slab is used, so we get 4096 bytes (64 bytes * 64).
+ */
+#define SEND_MAX_DIR_UTIMES_CACHE_SIZE 64
+
struct send_ctx {
struct file *send_filp;
loff_t send_off;
@@ -174,9 +191,7 @@ struct send_ctx {
struct list_head new_refs;
struct list_head deleted_refs;
- struct radix_tree_root name_cache;
- struct list_head name_cache_list;
- int name_cache_size;
+ struct btrfs_lru_cache name_cache;
/*
* The inode we are currently processing. It's not NULL only when we
@@ -285,13 +300,11 @@ struct send_ctx {
struct rb_root rbtree_new_refs;
struct rb_root rbtree_deleted_refs;
- struct {
- u64 last_reloc_trans;
- struct list_head lru_list;
- struct maple_tree entries;
- /* Number of entries stored in the cache. */
- int size;
- } backref_cache;
+ struct btrfs_lru_cache backref_cache;
+ u64 backref_cache_last_reloc_trans;
+
+ struct btrfs_lru_cache dir_created_cache;
+ struct btrfs_lru_cache dir_utimes_cache;
};
struct pending_dir_move {
@@ -321,21 +334,15 @@ struct orphan_dir_info {
u64 ino;
u64 gen;
u64 last_dir_index_offset;
+ u64 dir_high_seq_ino;
};
struct name_cache_entry {
- struct list_head list;
/*
- * radix_tree has only 32bit entries but we need to handle 64bit inums.
- * We use the lower 32bit of the 64bit inum to store it in the tree. If
- * more then one inum would fall into the same entry, we use radix_list
- * to store the additional entries. radix_list is also used to store
- * entries where two entries have the same inum but different
- * generations.
+ * The key in the entry is an inode number, and the generation matches
+ * the inode's generation.
*/
- struct list_head radix_list;
- u64 ino;
- u64 gen;
+ struct btrfs_lru_cache_entry entry;
u64 parent_ino;
u64 parent_gen;
int ret;
@@ -344,6 +351,9 @@ struct name_cache_entry {
char name[];
};
+/* See the comment at lru_cache.h about struct btrfs_lru_cache_entry. */
+static_assert(offsetof(struct name_cache_entry, entry) == 0);
+
#define ADVANCE 1
#define ADVANCE_ONLY_NEXT -1
@@ -956,14 +966,12 @@ out:
static int get_inode_gen(struct btrfs_root *root, u64 ino, u64 *gen)
{
int ret;
- struct btrfs_inode_info info;
+ struct btrfs_inode_info info = { 0 };
- if (!gen)
- return -EPERM;
+ ASSERT(gen);
ret = get_inode_info(root, ino, &info);
- if (!ret)
- *gen = info.gen;
+ *gen = info.gen;
return ret;
}
@@ -1388,19 +1396,6 @@ static int iterate_backrefs(u64 ino, u64 offset, u64 num_bytes, u64 root_id,
return 0;
}
-static void empty_backref_cache(struct send_ctx *sctx)
-{
- struct backref_cache_entry *entry;
- struct backref_cache_entry *tmp;
-
- list_for_each_entry_safe(entry, tmp, &sctx->backref_cache.lru_list, list)
- kfree(entry);
-
- INIT_LIST_HEAD(&sctx->backref_cache.lru_list);
- mtree_destroy(&sctx->backref_cache.entries);
- sctx->backref_cache.size = 0;
-}
-
static bool lookup_backref_cache(u64 leaf_bytenr, void *ctx,
const u64 **root_ids_ret, int *root_count_ret)
{
@@ -1408,9 +1403,10 @@ static bool lookup_backref_cache(u64 leaf_bytenr, void *ctx,
struct send_ctx *sctx = bctx->sctx;
struct btrfs_fs_info *fs_info = sctx->send_root->fs_info;
const u64 key = leaf_bytenr >> fs_info->sectorsize_bits;
+ struct btrfs_lru_cache_entry *raw_entry;
struct backref_cache_entry *entry;
- if (sctx->backref_cache.size == 0)
+ if (btrfs_lru_cache_size(&sctx->backref_cache) == 0)
return false;
/*
@@ -1424,18 +1420,18 @@ static bool lookup_backref_cache(u64 leaf_bytenr, void *ctx,
* transaction handle or holding fs_info->commit_root_sem, so no need
* to take any lock here.
*/
- if (fs_info->last_reloc_trans > sctx->backref_cache.last_reloc_trans) {
- empty_backref_cache(sctx);
+ if (fs_info->last_reloc_trans > sctx->backref_cache_last_reloc_trans) {
+ btrfs_lru_cache_clear(&sctx->backref_cache);
return false;
}
- entry = mtree_load(&sctx->backref_cache.entries, key);
- if (!entry)
+ raw_entry = btrfs_lru_cache_lookup(&sctx->backref_cache, key, 0);
+ if (!raw_entry)
return false;
+ entry = container_of(raw_entry, struct backref_cache_entry, entry);
*root_ids_ret = entry->root_ids;
*root_count_ret = entry->num_roots;
- list_move_tail(&entry->list, &sctx->backref_cache.lru_list);
return true;
}
@@ -1461,7 +1457,8 @@ static void store_backref_cache(u64 leaf_bytenr, const struct ulist *root_ids,
if (!new_entry)
return;
- new_entry->key = leaf_bytenr >> fs_info->sectorsize_bits;
+ new_entry->entry.key = leaf_bytenr >> fs_info->sectorsize_bits;
+ new_entry->entry.gen = 0;
new_entry->num_roots = 0;
ULIST_ITER_INIT(&uiter);
while ((node = ulist_next(root_ids, &uiter)) != NULL) {
@@ -1489,23 +1486,12 @@ static void store_backref_cache(u64 leaf_bytenr, const struct ulist *root_ids,
* none of the roots is part of the list of roots from which we are
* allowed to clone. Cache the new entry as it's still useful to avoid
* backref walking to determine which roots have a path to the leaf.
+ *
+ * Also use GFP_NOFS because we're called while holding a transaction
+ * handle or while holding fs_info->commit_root_sem.
*/
-
- if (sctx->backref_cache.size >= SEND_MAX_BACKREF_CACHE_SIZE) {
- struct backref_cache_entry *lru_entry;
- struct backref_cache_entry *mt_entry;
-
- lru_entry = list_first_entry(&sctx->backref_cache.lru_list,
- struct backref_cache_entry, list);
- mt_entry = mtree_erase(&sctx->backref_cache.entries, lru_entry->key);
- ASSERT(mt_entry == lru_entry);
- list_del(&mt_entry->list);
- kfree(mt_entry);
- sctx->backref_cache.size--;
- }
-
- ret = mtree_insert(&sctx->backref_cache.entries, new_entry->key,
- new_entry, GFP_NOFS);
+ ret = btrfs_lru_cache_store(&sctx->backref_cache, &new_entry->entry,
+ GFP_NOFS);
ASSERT(ret == 0 || ret == -ENOMEM);
if (ret) {
/* Caching is optional, no worries. */
@@ -1513,17 +1499,13 @@ static void store_backref_cache(u64 leaf_bytenr, const struct ulist *root_ids,
return;
}
- list_add_tail(&new_entry->list, &sctx->backref_cache.lru_list);
-
/*
* We are called from iterate_extent_inodes() while either holding a
* transaction handle or holding fs_info->commit_root_sem, so no need
* to take any lock here.
*/
- if (sctx->backref_cache.size == 0)
- sctx->backref_cache.last_reloc_trans = fs_info->last_reloc_trans;
-
- sctx->backref_cache.size++;
+ if (btrfs_lru_cache_size(&sctx->backref_cache) == 1)
+ sctx->backref_cache_last_reloc_trans = fs_info->last_reloc_trans;
}
static int check_extent_item(u64 bytenr, const struct btrfs_extent_item *ei,
@@ -1886,7 +1868,8 @@ enum inode_state {
inode_state_did_delete,
};
-static int get_cur_inode_state(struct send_ctx *sctx, u64 ino, u64 gen)
+static int get_cur_inode_state(struct send_ctx *sctx, u64 ino, u64 gen,
+ u64 *send_gen, u64 *parent_gen)
{
int ret;
int left_ret;
@@ -1900,6 +1883,8 @@ static int get_cur_inode_state(struct send_ctx *sctx, u64 ino, u64 gen)
goto out;
left_ret = (info.nlink == 0) ? -ENOENT : ret;
left_gen = info.gen;
+ if (send_gen)
+ *send_gen = ((left_ret == -ENOENT) ? 0 : info.gen);
if (!sctx->parent_root) {
right_ret = -ENOENT;
@@ -1909,6 +1894,8 @@ static int get_cur_inode_state(struct send_ctx *sctx, u64 ino, u64 gen)
goto out;
right_ret = (info.nlink == 0) ? -ENOENT : ret;
right_gen = info.gen;
+ if (parent_gen)
+ *parent_gen = ((right_ret == -ENOENT) ? 0 : info.gen);
}
if (!left_ret && !right_ret) {
@@ -1953,14 +1940,15 @@ out:
return ret;
}
-static int is_inode_existent(struct send_ctx *sctx, u64 ino, u64 gen)
+static int is_inode_existent(struct send_ctx *sctx, u64 ino, u64 gen,
+ u64 *send_gen, u64 *parent_gen)
{
int ret;
if (ino == BTRFS_FIRST_FREE_OBJECTID)
return 1;
- ret = get_cur_inode_state(sctx, ino, gen);
+ ret = get_cur_inode_state(sctx, ino, gen, send_gen, parent_gen);
if (ret < 0)
goto out;
@@ -2121,43 +2109,36 @@ static int will_overwrite_ref(struct send_ctx *sctx, u64 dir, u64 dir_gen,
const char *name, int name_len,
u64 *who_ino, u64 *who_gen, u64 *who_mode)
{
- int ret = 0;
- u64 gen;
+ int ret;
+ u64 parent_root_dir_gen;
u64 other_inode = 0;
struct btrfs_inode_info info;
if (!sctx->parent_root)
- goto out;
+ return 0;
- ret = is_inode_existent(sctx, dir, dir_gen);
+ ret = is_inode_existent(sctx, dir, dir_gen, NULL, &parent_root_dir_gen);
if (ret <= 0)
- goto out;
+ return 0;
/*
* If we have a parent root we need to verify that the parent dir was
* not deleted and then re-created, if it was then we have no overwrite
* and we can just unlink this entry.
+ *
+ * @parent_root_dir_gen was set to 0 if the inode does not exist in the
+ * parent root.
*/
- if (sctx->parent_root && dir != BTRFS_FIRST_FREE_OBJECTID) {
- ret = get_inode_gen(sctx->parent_root, dir, &gen);
- if (ret < 0 && ret != -ENOENT)
- goto out;
- if (ret) {
- ret = 0;
- goto out;
- }
- if (gen != dir_gen)
- goto out;
- }
+ if (sctx->parent_root && dir != BTRFS_FIRST_FREE_OBJECTID &&
+ parent_root_dir_gen != dir_gen)
+ return 0;
ret = lookup_dir_item_inode(sctx->parent_root, dir, name, name_len,
&other_inode);
- if (ret < 0 && ret != -ENOENT)
- goto out;
- if (ret) {
- ret = 0;
- goto out;
- }
+ if (ret == -ENOENT)
+ return 0;
+ else if (ret < 0)
+ return ret;
/*
* Check if the overwritten ref was already processed. If yes, the ref
@@ -2168,18 +2149,15 @@ static int will_overwrite_ref(struct send_ctx *sctx, u64 dir, u64 dir_gen,
is_waiting_for_move(sctx, other_inode)) {
ret = get_inode_info(sctx->parent_root, other_inode, &info);
if (ret < 0)
- goto out;
+ return ret;
- ret = 1;
*who_ino = other_inode;
*who_gen = info.gen;
*who_mode = info.mode;
- } else {
- ret = 0;
+ return 1;
}
-out:
- return ret;
+ return 0;
}
/*
@@ -2194,47 +2172,43 @@ static int did_overwrite_ref(struct send_ctx *sctx,
u64 ino, u64 ino_gen,
const char *name, int name_len)
{
- int ret = 0;
- u64 gen;
+ int ret;
u64 ow_inode;
+ u64 ow_gen = 0;
+ u64 send_root_dir_gen;
if (!sctx->parent_root)
- goto out;
+ return 0;
- ret = is_inode_existent(sctx, dir, dir_gen);
+ ret = is_inode_existent(sctx, dir, dir_gen, &send_root_dir_gen, NULL);
if (ret <= 0)
- goto out;
+ return ret;
- if (dir != BTRFS_FIRST_FREE_OBJECTID) {
- ret = get_inode_gen(sctx->send_root, dir, &gen);
- if (ret < 0 && ret != -ENOENT)
- goto out;
- if (ret) {
- ret = 0;
- goto out;
- }
- if (gen != dir_gen)
- goto out;
- }
+ /*
+ * @send_root_dir_gen was set to 0 if the inode does not exist in the
+ * send root.
+ */
+ if (dir != BTRFS_FIRST_FREE_OBJECTID && send_root_dir_gen != dir_gen)
+ return 0;
/* check if the ref was overwritten by another ref */
ret = lookup_dir_item_inode(sctx->send_root, dir, name, name_len,
&ow_inode);
- if (ret < 0 && ret != -ENOENT)
- goto out;
- if (ret) {
+ if (ret == -ENOENT) {
/* was never and will never be overwritten */
- ret = 0;
- goto out;
+ return 0;
+ } else if (ret < 0) {
+ return ret;
}
- ret = get_inode_gen(sctx->send_root, ow_inode, &gen);
- if (ret < 0)
- goto out;
+ if (ow_inode == ino) {
+ ret = get_inode_gen(sctx->send_root, ow_inode, &ow_gen);
+ if (ret < 0)
+ return ret;
- if (ow_inode == ino && gen == ino_gen) {
- ret = 0;
- goto out;
+ /* It's the same inode, so no overwrite happened. */
+ if (ow_gen == ino_gen)
+ return 0;
}
/*
@@ -2243,15 +2217,20 @@ static int did_overwrite_ref(struct send_ctx *sctx,
* inode 'ino' to be orphanized, therefore check if ow_inode matches
* the current inode being processed.
*/
- if ((ow_inode < sctx->send_progress) ||
- (ino != sctx->cur_ino && ow_inode == sctx->cur_ino &&
- gen == sctx->cur_inode_gen))
- ret = 1;
- else
- ret = 0;
+ if (ow_inode < sctx->send_progress)
+ return 1;
-out:
- return ret;
+ if (ino != sctx->cur_ino && ow_inode == sctx->cur_ino) {
+ if (ow_gen == 0) {
+ ret = get_inode_gen(sctx->send_root, ow_inode, &ow_gen);
+ if (ret < 0)
+ return ret;
+ }
+ if (ow_gen == sctx->cur_inode_gen)
+ return 1;
+ }
+
+ return 0;
}
/*
@@ -2285,113 +2264,16 @@ out:
return ret;
}
-/*
- * Insert a name cache entry. On 32bit kernels the radix tree index is 32bit,
- * so we need to do some special handling in case we have clashes. This function
- * takes care of this with the help of name_cache_entry::radix_list.
- * In case of error, nce is kfreed.
- */
-static int name_cache_insert(struct send_ctx *sctx,
- struct name_cache_entry *nce)
+static inline struct name_cache_entry *name_cache_search(struct send_ctx *sctx,
+ u64 ino, u64 gen)
{
- int ret = 0;
- struct list_head *nce_head;
-
- nce_head = radix_tree_lookup(&sctx->name_cache,
- (unsigned long)nce->ino);
- if (!nce_head) {
- nce_head = kmalloc(sizeof(*nce_head), GFP_KERNEL);
- if (!nce_head) {
- kfree(nce);
- return -ENOMEM;
- }
- INIT_LIST_HEAD(nce_head);
-
- ret = radix_tree_insert(&sctx->name_cache, nce->ino, nce_head);
- if (ret < 0) {
- kfree(nce_head);
- kfree(nce);
- return ret;
- }
- }
- list_add_tail(&nce->radix_list, nce_head);
- list_add_tail(&nce->list, &sctx->name_cache_list);
- sctx->name_cache_size++;
-
- return ret;
-}
+ struct btrfs_lru_cache_entry *entry;
-static void name_cache_delete(struct send_ctx *sctx,
- struct name_cache_entry *nce)
-{
- struct list_head *nce_head;
-
- nce_head = radix_tree_lookup(&sctx->name_cache,
- (unsigned long)nce->ino);
- if (!nce_head) {
- btrfs_err(sctx->send_root->fs_info,
- "name_cache_delete lookup failed ino %llu cache size %d, leaking memory",
- nce->ino, sctx->name_cache_size);
- }
-
- list_del(&nce->radix_list);
- list_del(&nce->list);
- sctx->name_cache_size--;
-
- /*
- * We may not get to the final release of nce_head if the lookup fails
- */
- if (nce_head && list_empty(nce_head)) {
- radix_tree_delete(&sctx->name_cache, (unsigned long)nce->ino);
- kfree(nce_head);
- }
-}
-
-static struct name_cache_entry *name_cache_search(struct send_ctx *sctx,
- u64 ino, u64 gen)
-{
- struct list_head *nce_head;
- struct name_cache_entry *cur;
-
- nce_head = radix_tree_lookup(&sctx->name_cache, (unsigned long)ino);
- if (!nce_head)
+ entry = btrfs_lru_cache_lookup(&sctx->name_cache, ino, gen);
+ if (!entry)
return NULL;
- list_for_each_entry(cur, nce_head, radix_list) {
- if (cur->ino == ino && cur->gen == gen)
- return cur;
- }
- return NULL;
-}
-
-/*
- * Remove some entries from the beginning of name_cache_list.
- */
-static void name_cache_clean_unused(struct send_ctx *sctx)
-{
- struct name_cache_entry *nce;
-
- if (sctx->name_cache_size < SEND_CTX_NAME_CACHE_CLEAN_SIZE)
- return;
-
- while (sctx->name_cache_size > SEND_CTX_MAX_NAME_CACHE_SIZE) {
- nce = list_entry(sctx->name_cache_list.next,
- struct name_cache_entry, list);
- name_cache_delete(sctx, nce);
- kfree(nce);
- }
-}
-
-static void name_cache_free(struct send_ctx *sctx)
-{
- struct name_cache_entry *nce;
-
- while (!list_empty(&sctx->name_cache_list)) {
- nce = list_entry(sctx->name_cache_list.next,
- struct name_cache_entry, list);
- name_cache_delete(sctx, nce);
- kfree(nce);
- }
+ return container_of(entry, struct name_cache_entry, entry);
}
/*
@@ -2410,7 +2292,7 @@ static int __get_cur_name_and_parent(struct send_ctx *sctx,
{
int ret;
int nce_ret;
- struct name_cache_entry *nce = NULL;
+ struct name_cache_entry *nce;
/*
* First check if we already did a call to this function with the same
@@ -2420,17 +2302,9 @@ static int __get_cur_name_and_parent(struct send_ctx *sctx,
nce = name_cache_search(sctx, ino, gen);
if (nce) {
if (ino < sctx->send_progress && nce->need_later_update) {
- name_cache_delete(sctx, nce);
- kfree(nce);
+ btrfs_lru_cache_remove(&sctx->name_cache, &nce->entry);
nce = NULL;
} else {
- /*
- * Removes the entry from the list and adds it back to
- * the end. This marks the entry as recently used so
- * that name_cache_clean_unused does not remove it.
- */
- list_move_tail(&nce->list, &sctx->name_cache_list);
-
*parent_ino = nce->parent_ino;
*parent_gen = nce->parent_gen;
ret = fs_path_add(dest, nce->name, nce->name_len);
@@ -2446,7 +2320,7 @@ static int __get_cur_name_and_parent(struct send_ctx *sctx,
* This should only happen for the parent dir that we determine in
* record_new_ref_if_needed().
*/
- ret = is_inode_existent(sctx, ino, gen);
+ ret = is_inode_existent(sctx, ino, gen, NULL, NULL);
if (ret < 0)
goto out;
@@ -2497,8 +2371,8 @@ out_cache:
goto out;
}
- nce->ino = ino;
- nce->gen = gen;
+ nce->entry.key = ino;
+ nce->entry.gen = gen;
nce->parent_ino = *parent_ino;
nce->parent_gen = *parent_gen;
nce->name_len = fs_path_len(dest);
@@ -2510,10 +2384,11 @@ out_cache:
else
nce->need_later_update = 1;
- nce_ret = name_cache_insert(sctx, nce);
- if (nce_ret < 0)
+ nce_ret = btrfs_lru_cache_store(&sctx->name_cache, &nce->entry, GFP_KERNEL);
+ if (nce_ret < 0) {
+ kfree(nce);
ret = nce_ret;
- name_cache_clean_unused(sctx);
+ }
out:
return ret;
@@ -2884,6 +2759,63 @@ out:
}
/*
+ * If the cache is full, we can't remove entries from it and do a call to
+ * send_utimes() for each respective inode, because we might be finishing
+ * processing an inode that is a directory and it just got renamed, and existing
+ * entries in the cache may refer to inodes that have the directory in their
+ * full path - in which case we would generate outdated paths (pre-rename)
+ * for the inodes that the cache entries point to. Instead of pruning the
+ * cache when inserting, do it after we finish processing each inode at
+ * finish_inode_if_needed().
+ */
+static int cache_dir_utimes(struct send_ctx *sctx, u64 dir, u64 gen)
+{
+ struct btrfs_lru_cache_entry *entry;
+ int ret;
+
+ entry = btrfs_lru_cache_lookup(&sctx->dir_utimes_cache, dir, gen);
+ if (entry != NULL)
+ return 0;
+
+ /* Caching is optional, don't fail if we can't allocate memory. */
+ entry = kmalloc(sizeof(*entry), GFP_KERNEL);
+ if (!entry)
+ return send_utimes(sctx, dir, gen);
+
+ entry->key = dir;
+ entry->gen = gen;
+
+ ret = btrfs_lru_cache_store(&sctx->dir_utimes_cache, entry, GFP_KERNEL);
+ ASSERT(ret != -EEXIST);
+ if (ret) {
+ kfree(entry);
+ return send_utimes(sctx, dir, gen);
+ }
+
+ return 0;
+}
+
+static int trim_dir_utimes_cache(struct send_ctx *sctx)
+{
+ while (btrfs_lru_cache_size(&sctx->dir_utimes_cache) >
+ SEND_MAX_DIR_UTIMES_CACHE_SIZE) {
+ struct btrfs_lru_cache_entry *lru;
+ int ret;
+
+ lru = btrfs_lru_cache_lru_entry(&sctx->dir_utimes_cache);
+ ASSERT(lru != NULL);
+
+ ret = send_utimes(sctx, lru->key, lru->gen);
+ if (ret)
+ return ret;
+
+ btrfs_lru_cache_remove(&sctx->dir_utimes_cache, lru);
+ }
+
+ return 0;
+}
+
+/*
* Sends a BTRFS_SEND_C_MKXXX or SYMLINK command to user space. We don't have
* a valid path yet because we did not process the refs yet. So, the inode
* is created as orphan.
@@ -2971,6 +2903,23 @@ out:
return ret;
}
+static void cache_dir_created(struct send_ctx *sctx, u64 dir)
+{
+ struct btrfs_lru_cache_entry *entry;
+ int ret;
+
+ /* Caching is optional, ignore any failures. */
+ entry = kmalloc(sizeof(*entry), GFP_KERNEL);
+ if (!entry)
+ return;
+
+ entry->key = dir;
+ entry->gen = 0;
+ ret = btrfs_lru_cache_store(&sctx->dir_created_cache, entry, GFP_KERNEL);
+ if (ret < 0)
+ kfree(entry);
+}
+
/*
* We need some special handling for inodes that get processed before the parent
* directory got created. See process_recorded_refs for details.
@@ -2986,6 +2935,9 @@ static int did_create_dir(struct send_ctx *sctx, u64 dir)
struct btrfs_key di_key;
struct btrfs_dir_item *di;
+ if (btrfs_lru_cache_lookup(&sctx->dir_created_cache, dir, 0))
+ return 1;
+
path = alloc_path_for_send();
if (!path)
return -ENOMEM;
@@ -3009,6 +2961,7 @@ static int did_create_dir(struct send_ctx *sctx, u64 dir)
if (di_key.type != BTRFS_ROOT_ITEM_KEY &&
di_key.objectid < sctx->send_progress) {
ret = 1;
+ cache_dir_created(sctx, dir);
break;
}
}
@@ -3038,7 +2991,12 @@ static int send_create_inode_if_needed(struct send_ctx *sctx)
return 0;
}
- return send_create_inode(sctx, sctx->cur_ino);
+ ret = send_create_inode(sctx, sctx->cur_ino);
+
+ if (ret == 0 && S_ISDIR(sctx->cur_inode_mode))
+ cache_dir_created(sctx, sctx->cur_ino);
+
+ return ret;
}
struct recorded_ref {
@@ -3166,6 +3124,7 @@ static struct orphan_dir_info *add_orphan_dir_info(struct send_ctx *sctx,
odi->ino = dir_ino;
odi->gen = dir_gen;
odi->last_dir_index_offset = 0;
+ odi->dir_high_seq_ino = 0;
rb_link_node(&odi->node, parent, p);
rb_insert_color(&odi->node, &sctx->orphan_dirs);
@@ -3215,8 +3174,7 @@ static void free_orphan_dir_info(struct send_ctx *sctx,
* We check this by iterating all dir items and checking if the inode behind
* the dir item was already processed.
*/
-static int can_rmdir(struct send_ctx *sctx, u64 dir, u64 dir_gen,
- u64 send_progress)
+static int can_rmdir(struct send_ctx *sctx, u64 dir, u64 dir_gen)
{
int ret = 0;
int iter_ret = 0;
@@ -3227,6 +3185,8 @@ static int can_rmdir(struct send_ctx *sctx, u64 dir, u64 dir_gen,
struct btrfs_key loc;
struct btrfs_dir_item *di;
struct orphan_dir_info *odi = NULL;
+ u64 dir_high_seq_ino = 0;
+ u64 last_dir_index_offset = 0;
/*
* Don't try to rmdir the top/root subvolume dir.
@@ -3234,17 +3194,62 @@ static int can_rmdir(struct send_ctx *sctx, u64 dir, u64 dir_gen,
if (dir == BTRFS_FIRST_FREE_OBJECTID)
return 0;
+ odi = get_orphan_dir_info(sctx, dir, dir_gen);
+ if (odi && sctx->cur_ino < odi->dir_high_seq_ino)
+ return 0;
+
path = alloc_path_for_send();
if (!path)
return -ENOMEM;
+ if (!odi) {
+ /*
+ * Find the inode number associated with the last dir index
+ * entry. This is very likely the inode with the highest number
+ * of all inodes that have an entry in the directory. We can
+ * then use it to avoid future calls to can_rmdir(), when
+ * processing inodes with a lower number, from having to search
+ * the parent root b+tree for dir index keys.
+ */
+ key.objectid = dir;
+ key.type = BTRFS_DIR_INDEX_KEY;
+ key.offset = (u64)-1;
+
+ ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+ if (ret < 0) {
+ goto out;
+ } else if (ret > 0) {
+ /* Can't happen, the root is never empty. */
+ ASSERT(path->slots[0] > 0);
+ if (WARN_ON(path->slots[0] == 0)) {
+ ret = -EUCLEAN;
+ goto out;
+ }
+ path->slots[0]--;
+ }
+
+ btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
+ if (key.objectid != dir || key.type != BTRFS_DIR_INDEX_KEY) {
+ /* No index keys, dir can be removed. */
+ ret = 1;
+ goto out;
+ }
+
+ di = btrfs_item_ptr(path->nodes[0], path->slots[0],
+ struct btrfs_dir_item);
+ btrfs_dir_item_key_to_cpu(path->nodes[0], di, &loc);
+ dir_high_seq_ino = loc.objectid;
+ if (sctx->cur_ino < dir_high_seq_ino) {
+ ret = 0;
+ goto out;
+ }
+
+ btrfs_release_path(path);
+ }
+
key.objectid = dir;
key.type = BTRFS_DIR_INDEX_KEY;
- key.offset = 0;
-
- odi = get_orphan_dir_info(sctx, dir, dir_gen);
- if (odi)
- key.offset = odi->last_dir_index_offset;
+ key.offset = (odi ? odi->last_dir_index_offset : 0);
btrfs_for_each_slot(root, &key, &found_key, path, iter_ret) {
struct waiting_dir_move *dm;
@@ -3257,29 +3262,18 @@ static int can_rmdir(struct send_ctx *sctx, u64 dir, u64 dir_gen,
struct btrfs_dir_item);
btrfs_dir_item_key_to_cpu(path->nodes[0], di, &loc);
+ dir_high_seq_ino = max(dir_high_seq_ino, loc.objectid);
+ last_dir_index_offset = found_key.offset;
+
dm = get_waiting_dir_move(sctx, loc.objectid);
if (dm) {
- odi = add_orphan_dir_info(sctx, dir, dir_gen);
- if (IS_ERR(odi)) {
- ret = PTR_ERR(odi);
- goto out;
- }
- odi->gen = dir_gen;
- odi->last_dir_index_offset = found_key.offset;
dm->rmdir_ino = dir;
dm->rmdir_gen = dir_gen;
ret = 0;
goto out;
}
- if (loc.objectid > send_progress) {
- odi = add_orphan_dir_info(sctx, dir, dir_gen);
- if (IS_ERR(odi)) {
- ret = PTR_ERR(odi);
- goto out;
- }
- odi->gen = dir_gen;
- odi->last_dir_index_offset = found_key.offset;
+ if (loc.objectid > sctx->cur_ino) {
ret = 0;
goto out;
}
@@ -3294,7 +3288,22 @@ static int can_rmdir(struct send_ctx *sctx, u64 dir, u64 dir_gen,
out:
btrfs_free_path(path);
- return ret;
+
+ if (ret)
+ return ret;
+
+ if (!odi) {
+ odi = add_orphan_dir_info(sctx, dir, dir_gen);
+ if (IS_ERR(odi))
+ return PTR_ERR(odi);
+
+ odi->gen = dir_gen;
+ }
+
+ odi->last_dir_index_offset = last_dir_index_offset;
+ odi->dir_high_seq_ino = max(odi->dir_high_seq_ino, dir_high_seq_ino);
+
+ return 0;
}
static int is_waiting_for_move(struct send_ctx *sctx, u64 ino)
@@ -3579,7 +3588,7 @@ static int apply_dir_move(struct send_ctx *sctx, struct pending_dir_move *pm)
}
gen = odi->gen;
- ret = can_rmdir(sctx, rmdir_ino, gen, sctx->cur_ino);
+ ret = can_rmdir(sctx, rmdir_ino, gen);
if (ret < 0)
goto out;
if (!ret)
@@ -3599,7 +3608,7 @@ static int apply_dir_move(struct send_ctx *sctx, struct pending_dir_move *pm)
}
finish:
- ret = send_utimes(sctx, pm->ino, pm->gen);
+ ret = cache_dir_utimes(sctx, pm->ino, pm->gen);
if (ret < 0)
goto out;
@@ -3619,7 +3628,7 @@ finish:
if (ret < 0)
goto out;
- ret = send_utimes(sctx, cur->dir, cur->dir_gen);
+ ret = cache_dir_utimes(sctx, cur->dir, cur->dir_gen);
if (ret < 0)
goto out;
}
@@ -4242,7 +4251,7 @@ static int process_recorded_refs(struct send_ctx *sctx, int *pending_move)
* "testdir_2".
*/
list_for_each_entry(cur, &sctx->new_refs, list) {
- ret = get_cur_inode_state(sctx, cur->dir, cur->dir_gen);
+ ret = get_cur_inode_state(sctx, cur->dir, cur->dir_gen, NULL, NULL);
if (ret < 0)
goto out;
if (ret == inode_state_will_create)
@@ -4288,12 +4297,9 @@ static int process_recorded_refs(struct send_ctx *sctx, int *pending_move)
* the source path when performing its rename
* operation.
*/
- if (is_waiting_for_move(sctx, ow_inode)) {
- wdm = get_waiting_dir_move(sctx,
- ow_inode);
- ASSERT(wdm);
+ wdm = get_waiting_dir_move(sctx, ow_inode);
+ if (wdm)
wdm->orphanized = true;
- }
/*
* Make sure we clear our orphanized inode's
@@ -4306,10 +4312,9 @@ static int process_recorded_refs(struct send_ctx *sctx, int *pending_move)
* and get instead the orphan name.
*/
nce = name_cache_search(sctx, ow_inode, ow_gen);
- if (nce) {
- name_cache_delete(sctx, nce);
- kfree(nce);
- }
+ if (nce)
+ btrfs_lru_cache_remove(&sctx->name_cache,
+ &nce->entry);
/*
* ow_inode might currently be an ancestor of
@@ -4358,7 +4363,7 @@ static int process_recorded_refs(struct send_ctx *sctx, int *pending_move)
* parent directory out of order. But we need to check if this
* did already happen before due to other refs in the same dir.
*/
- ret = get_cur_inode_state(sctx, cur->dir, cur->dir_gen);
+ ret = get_cur_inode_state(sctx, cur->dir, cur->dir_gen, NULL, NULL);
if (ret < 0)
goto out;
if (ret == inode_state_will_create) {
@@ -4388,6 +4393,7 @@ static int process_recorded_refs(struct send_ctx *sctx, int *pending_move)
ret = send_create_inode(sctx, cur->dir);
if (ret < 0)
goto out;
+ cache_dir_created(sctx, cur->dir);
}
}
@@ -4470,8 +4476,7 @@ static int process_recorded_refs(struct send_ctx *sctx, int *pending_move)
* later, we do this check again and rmdir it then if possible.
* See the use of check_dirs for more details.
*/
- ret = can_rmdir(sctx, sctx->cur_ino, sctx->cur_inode_gen,
- sctx->cur_ino);
+ ret = can_rmdir(sctx, sctx->cur_ino, sctx->cur_inode_gen);
if (ret < 0)
goto out;
if (ret) {
@@ -4564,20 +4569,18 @@ static int process_recorded_refs(struct send_ctx *sctx, int *pending_move)
if (cur->dir > sctx->cur_ino)
continue;
- ret = get_cur_inode_state(sctx, cur->dir, cur->dir_gen);
+ ret = get_cur_inode_state(sctx, cur->dir, cur->dir_gen, NULL, NULL);
if (ret < 0)
goto out;
if (ret == inode_state_did_create ||
ret == inode_state_no_change) {
- /* TODO delayed utimes */
- ret = send_utimes(sctx, cur->dir, cur->dir_gen);
+ ret = cache_dir_utimes(sctx, cur->dir, cur->dir_gen);
if (ret < 0)
goto out;
} else if (ret == inode_state_did_delete &&
cur->dir != last_dir_ino_rm) {
- ret = can_rmdir(sctx, cur->dir, cur->dir_gen,
- sctx->cur_ino);
+ ret = can_rmdir(sctx, cur->dir, cur->dir_gen);
if (ret < 0)
goto out;
if (ret) {
@@ -5635,7 +5638,7 @@ static int send_encoded_extent(struct send_ctx *sctx, struct btrfs_path *path,
* boundary in the send buffer. This means that there may be a gap
* between the beginning of the command and the file data.
*/
- data_offset = ALIGN(sctx->send_size, PAGE_SIZE);
+ data_offset = PAGE_ALIGN(sctx->send_size);
if (data_offset > sctx->send_max_size ||
sctx->send_max_size - data_offset < disk_num_bytes) {
ret = -EOVERFLOW;
@@ -5759,7 +5762,7 @@ static int send_extent_data(struct send_ctx *sctx, struct btrfs_path *path,
sent += size;
}
- if (sctx->clean_page_cache && IS_ALIGNED(end, PAGE_SIZE)) {
+ if (sctx->clean_page_cache && PAGE_ALIGNED(end)) {
/*
* Always operate only on ranges that are a multiple of the page
* size. This is not only to prevent zeroing parts of a page in
@@ -6754,12 +6757,26 @@ static int finish_inode_if_needed(struct send_ctx *sctx, int at_end)
* it's moved/renamed, therefore we don't need to do it here.
*/
sctx->send_progress = sctx->cur_ino + 1;
- ret = send_utimes(sctx, sctx->cur_ino, sctx->cur_inode_gen);
+
+ /*
+ * If the current inode is a non-empty directory, delay issuing
+ * the utimes command for it, as it's very likely we have inodes
+ * with a higher number inside it. We want to issue the utimes
+ * command only after adding all dentries to it.
+ */
+ if (S_ISDIR(sctx->cur_inode_mode) && sctx->cur_inode_size > 0)
+ ret = cache_dir_utimes(sctx, sctx->cur_ino, sctx->cur_inode_gen);
+ else
+ ret = send_utimes(sctx, sctx->cur_ino, sctx->cur_inode_gen);
+
if (ret < 0)
goto out;
}
out:
+ if (!ret)
+ ret = trim_dir_utimes_cache(sctx);
+
return ret;
}
@@ -8044,6 +8061,8 @@ long btrfs_ioctl_send(struct inode *inode, struct btrfs_ioctl_send_args *arg)
int clone_sources_to_rollback = 0;
size_t alloc_size;
int sort_clone_roots = 0;
+ struct btrfs_lru_cache_entry *entry;
+ struct btrfs_lru_cache_entry *tmp;
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
@@ -8094,11 +8113,22 @@ long btrfs_ioctl_send(struct inode *inode, struct btrfs_ioctl_send_args *arg)
INIT_LIST_HEAD(&sctx->new_refs);
INIT_LIST_HEAD(&sctx->deleted_refs);
- INIT_RADIX_TREE(&sctx->name_cache, GFP_KERNEL);
- INIT_LIST_HEAD(&sctx->name_cache_list);
- INIT_LIST_HEAD(&sctx->backref_cache.lru_list);
- mt_init(&sctx->backref_cache.entries);
+ btrfs_lru_cache_init(&sctx->name_cache, SEND_MAX_NAME_CACHE_SIZE);
+ btrfs_lru_cache_init(&sctx->backref_cache, SEND_MAX_BACKREF_CACHE_SIZE);
+ btrfs_lru_cache_init(&sctx->dir_created_cache,
+ SEND_MAX_DIR_CREATED_CACHE_SIZE);
+ /*
+ * This cache is periodically trimmed to a fixed size elsewhere, see
+ * cache_dir_utimes() and trim_dir_utimes_cache().
+ */
+ btrfs_lru_cache_init(&sctx->dir_utimes_cache, 0);
+
+ sctx->pending_dir_moves = RB_ROOT;
+ sctx->waiting_dir_moves = RB_ROOT;
+ sctx->orphan_dirs = RB_ROOT;
+ sctx->rbtree_new_refs = RB_ROOT;
+ sctx->rbtree_deleted_refs = RB_ROOT;
sctx->flags = arg->flags;
@@ -8165,12 +8195,6 @@ long btrfs_ioctl_send(struct inode *inode, struct btrfs_ioctl_send_args *arg)
goto out;
}
- sctx->pending_dir_moves = RB_ROOT;
- sctx->waiting_dir_moves = RB_ROOT;
- sctx->orphan_dirs = RB_ROOT;
- sctx->rbtree_new_refs = RB_ROOT;
- sctx->rbtree_deleted_refs = RB_ROOT;
-
sctx->clone_roots = kvcalloc(sizeof(*sctx->clone_roots),
arg->clone_sources_count + 1,
GFP_KERNEL);
@@ -8279,6 +8303,13 @@ long btrfs_ioctl_send(struct inode *inode, struct btrfs_ioctl_send_args *arg)
if (ret < 0)
goto out;
+ btrfs_lru_cache_for_each_entry_safe(&sctx->dir_utimes_cache, entry, tmp) {
+ ret = send_utimes(sctx, entry->key, entry->gen);
+ if (ret < 0)
+ goto out;
+ btrfs_lru_cache_remove(&sctx->dir_utimes_cache, entry);
+ }
+
if (!(sctx->flags & BTRFS_SEND_FLAG_OMIT_END_CMD)) {
ret = begin_cmd(sctx, BTRFS_SEND_C_END);
if (ret < 0)
@@ -8358,11 +8389,12 @@ out:
kvfree(sctx->send_buf);
kvfree(sctx->verity_descriptor);
- name_cache_free(sctx);
-
close_current_inode(sctx);
- empty_backref_cache(sctx);
+ btrfs_lru_cache_clear(&sctx->name_cache);
+ btrfs_lru_cache_clear(&sctx->backref_cache);
+ btrfs_lru_cache_clear(&sctx->dir_created_cache);
+ btrfs_lru_cache_clear(&sctx->dir_utimes_cache);
kfree(sctx);
}
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 433ce221dc5c..581845bc206a 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -58,6 +58,7 @@
#include "scrub.h"
#include "verity.h"
#include "super.h"
+#include "extent-tree.h"
#define CREATE_TRACE_POINTS
#include <trace/events/btrfs.h>
@@ -2049,7 +2050,7 @@ static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf)
}
/*
- * Metadata in mixed block goup profiles are accounted in data
+ * Metadata in mixed block group profiles are accounted in data
*/
if (!mixed && found->flags & BTRFS_BLOCK_GROUP_METADATA) {
if (found->flags & BTRFS_BLOCK_GROUP_DATA)
diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c
index 45615ce36498..8c5efa5813b3 100644
--- a/fs/btrfs/sysfs.c
+++ b/fs/btrfs/sysfs.c
@@ -702,7 +702,7 @@ static void release_raid_kobj(struct kobject *kobj)
kfree(to_raid_kobj(kobj));
}
-static struct kobj_type btrfs_raid_ktype = {
+static const struct kobj_type btrfs_raid_ktype = {
.sysfs_ops = &kobj_sysfs_ops,
.release = release_raid_kobj,
.default_groups = raid_groups,
@@ -900,7 +900,7 @@ static void space_info_release(struct kobject *kobj)
kfree(sinfo);
}
-static struct kobj_type space_info_ktype = {
+static const struct kobj_type space_info_ktype = {
.sysfs_ops = &kobj_sysfs_ops,
.release = space_info_release,
.default_groups = space_info_groups,
@@ -1259,7 +1259,7 @@ static void btrfs_release_fsid_kobj(struct kobject *kobj)
complete(&fs_devs->kobj_unregister);
}
-static struct kobj_type btrfs_ktype = {
+static const struct kobj_type btrfs_ktype = {
.sysfs_ops = &kobj_sysfs_ops,
.release = btrfs_release_fsid_kobj,
};
@@ -1789,7 +1789,7 @@ static void btrfs_release_devid_kobj(struct kobject *kobj)
complete(&device->kobj_unregister);
}
-static struct kobj_type devid_ktype = {
+static const struct kobj_type devid_ktype = {
.sysfs_ops = &kobj_sysfs_ops,
.default_groups = devid_groups,
.release = btrfs_release_devid_kobj,
@@ -2103,7 +2103,7 @@ static void qgroups_release(struct kobject *kobj)
kfree(kobj);
}
-static struct kobj_type qgroups_ktype = {
+static const struct kobj_type qgroups_ktype = {
.sysfs_ops = &kobj_sysfs_ops,
.default_groups = qgroups_groups,
.release = qgroups_release,
@@ -2173,7 +2173,7 @@ static void qgroup_release(struct kobject *kobj)
memset(&qgroup->kobj, 0, sizeof(*kobj));
}
-static struct kobj_type qgroup_ktype = {
+static const struct kobj_type qgroup_ktype = {
.sysfs_ops = &kobj_sysfs_ops,
.release = qgroup_release,
.default_groups = qgroup_groups,
@@ -2272,36 +2272,23 @@ void btrfs_sysfs_del_one_qgroup(struct btrfs_fs_info *fs_info,
* Change per-fs features in /sys/fs/btrfs/UUID/features to match current
* values in superblock. Call after any changes to incompat/compat_ro flags
*/
-void btrfs_sysfs_feature_update(struct btrfs_fs_info *fs_info,
- u64 bit, enum btrfs_feature_set set)
+void btrfs_sysfs_feature_update(struct btrfs_fs_info *fs_info)
{
- struct btrfs_fs_devices *fs_devs;
struct kobject *fsid_kobj;
- u64 __maybe_unused features;
- int __maybe_unused ret;
+ int ret;
if (!fs_info)
return;
- /*
- * See 14e46e04958df74 and e410e34fad913dd, feature bit updates are not
- * safe when called from some contexts (eg. balance)
- */
- features = get_features(fs_info, set);
- ASSERT(bit & supported_feature_masks[set]);
-
- fs_devs = fs_info->fs_devices;
- fsid_kobj = &fs_devs->fsid_kobj;
-
+ fsid_kobj = &fs_info->fs_devices->fsid_kobj;
if (!fsid_kobj->state_initialized)
return;
- /*
- * FIXME: this is too heavy to update just one value, ideally we'd like
- * to use sysfs_update_group but some refactoring is needed first.
- */
- sysfs_remove_group(fsid_kobj, &btrfs_feature_attr_group);
- ret = sysfs_create_group(fsid_kobj, &btrfs_feature_attr_group);
+ ret = sysfs_update_group(fsid_kobj, &btrfs_feature_attr_group);
+ if (ret < 0)
+ btrfs_warn(fs_info,
+ "failed to update /sys/fs/btrfs/%pU/features: %d",
+ fs_info->fs_devices->fsid, ret);
}
int __init btrfs_init_sysfs(void)
diff --git a/fs/btrfs/sysfs.h b/fs/btrfs/sysfs.h
index bacef43f7267..86c7eef12873 100644
--- a/fs/btrfs/sysfs.h
+++ b/fs/btrfs/sysfs.h
@@ -19,8 +19,7 @@ void btrfs_sysfs_remove_device(struct btrfs_device *device);
int btrfs_sysfs_add_fsid(struct btrfs_fs_devices *fs_devs);
void btrfs_sysfs_remove_fsid(struct btrfs_fs_devices *fs_devs);
void btrfs_sysfs_update_sprout_fsid(struct btrfs_fs_devices *fs_devices);
-void btrfs_sysfs_feature_update(struct btrfs_fs_info *fs_info,
- u64 bit, enum btrfs_feature_set set);
+void btrfs_sysfs_feature_update(struct btrfs_fs_info *fs_info);
void btrfs_kobject_uevent(struct block_device *bdev, enum kobject_action action);
int __init btrfs_init_sysfs(void);
diff --git a/fs/btrfs/tests/extent-map-tests.c b/fs/btrfs/tests/extent-map-tests.c
index c5b3a631bf4f..f2f2e11dac4c 100644
--- a/fs/btrfs/tests/extent-map-tests.c
+++ b/fs/btrfs/tests/extent-map-tests.c
@@ -509,7 +509,7 @@ static int test_rmap_block(struct btrfs_fs_info *fs_info,
goto out_free;
}
- ret = btrfs_rmap_block(fs_info, em->start, NULL, btrfs_sb_offset(1),
+ ret = btrfs_rmap_block(fs_info, em->start, btrfs_sb_offset(1),
&logical, &out_ndaddrs, &out_stripe_len);
if (ret || (out_ndaddrs == 0 && test->expected_mapped_addr)) {
test_err("didn't rmap anything but expected %d",
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index b8c52e89688c..18329ebcb1cb 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -2464,6 +2464,11 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
wake_up(&fs_info->transaction_wait);
btrfs_trans_state_lockdep_release(fs_info, BTRFS_LOCKDEP_TRANS_UNBLOCKED);
+ /* If any features have changed, wake up the cleaner to update sysfs. */
+ if (test_bit(BTRFS_FS_FEATURE_CHANGED, &fs_info->flags) &&
+ fs_info->cleaner_kthread)
+ wake_up_process(fs_info->cleaner_kthread);
+
ret = btrfs_write_and_wait_transaction(trans);
if (ret) {
btrfs_handle_fs_error(fs_info, ret,
@@ -2604,6 +2609,35 @@ int btrfs_clean_one_deleted_snapshot(struct btrfs_fs_info *fs_info)
return (ret < 0) ? 0 : 1;
}
+/*
+ * We only mark the transaction aborted and then set the file system read-only.
+ * This will prevent new transactions from starting or trying to join this
+ * one.
+ *
+ * This means that error recovery at the call site is limited to freeing
+ * any local memory allocations and passing the error code up without
+ * further cleanup. The transaction should complete as it normally would
+ * in the call path but will return -EIO.
+ *
+ * We'll complete the cleanup in btrfs_end_transaction and
+ * btrfs_commit_transaction.
+ */
+void __cold __btrfs_abort_transaction(struct btrfs_trans_handle *trans,
+ const char *function,
+ unsigned int line, int errno, bool first_hit)
+{
+ struct btrfs_fs_info *fs_info = trans->fs_info;
+
+ WRITE_ONCE(trans->aborted, errno);
+ WRITE_ONCE(trans->transaction->aborted, errno);
+ if (first_hit && errno == -ENOSPC)
+ btrfs_dump_space_info_for_trans_abort(fs_info);
+ /* Wake up anybody who may be waiting on this transaction */
+ wake_up(&fs_info->transaction_wait);
+ wake_up(&fs_info->transaction_blocked_wait);
+ __btrfs_handle_fs_error(fs_info, function, line, errno, NULL);
+}
+
int __init btrfs_transaction_init(void)
{
btrfs_trans_handle_cachep = kmem_cache_create("btrfs_trans_handle",
diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h
index 97f6c39f59c8..fa728ab80826 100644
--- a/fs/btrfs/transaction.h
+++ b/fs/btrfs/transaction.h
@@ -202,6 +202,34 @@ static inline void btrfs_clear_skip_qgroup(struct btrfs_trans_handle *trans)
delayed_refs->qgroup_to_skip = 0;
}
+bool __cold abort_should_print_stack(int errno);
+
+/*
+ * Call btrfs_abort_transaction as early as possible when an error condition is
+ * detected, that way the exact stack trace is reported for some errors.
+ */
+#define btrfs_abort_transaction(trans, errno) \
+do { \
+ bool first = false; \
+ /* Report first abort since mount */ \
+ if (!test_and_set_bit(BTRFS_FS_STATE_TRANS_ABORTED, \
+ &((trans)->fs_info->fs_state))) { \
+ first = true; \
+ if (WARN(abort_should_print_stack(errno), \
+ KERN_ERR \
+ "BTRFS: Transaction aborted (error %d)\n", \
+ (errno))) { \
+ /* Stack trace printed. */ \
+ } else { \
+ btrfs_debug((trans)->fs_info, \
+ "Transaction aborted (error %d)", \
+ (errno)); \
+ } \
+ } \
+ __btrfs_abort_transaction((trans), __func__, \
+ __LINE__, (errno), first); \
+} while (0)
+
int btrfs_end_transaction(struct btrfs_trans_handle *trans);
struct btrfs_trans_handle *btrfs_start_transaction(struct btrfs_root *root,
unsigned int num_items);
@@ -236,6 +264,9 @@ void btrfs_put_transaction(struct btrfs_transaction *transaction);
void btrfs_add_dropped_root(struct btrfs_trans_handle *trans,
struct btrfs_root *root);
void btrfs_trans_release_chunk_metadata(struct btrfs_trans_handle *trans);
+void __cold __btrfs_abort_transaction(struct btrfs_trans_handle *trans,
+ const char *function,
+ unsigned int line, int errno, bool first_hit);
int __init btrfs_transaction_init(void);
void __cold btrfs_transaction_exit(void);
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 58599189bd18..200cea6e49e5 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -279,12 +279,6 @@ void btrfs_end_log_trans(struct btrfs_root *root)
}
}
-static void btrfs_wait_tree_block_writeback(struct extent_buffer *buf)
-{
- filemap_fdatawait_range(buf->pages[0]->mapping,
- buf->start, buf->start + buf->len - 1);
-}
-
/*
* the walk control struct is used to pass state down the chain when
* processing the log tree. The stage field tells us which part
@@ -2623,11 +2617,12 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans,
return ret;
}
+ btrfs_tree_lock(next);
+ btrfs_clear_buffer_dirty(trans, next);
+ wait_on_extent_buffer_writeback(next);
+ btrfs_tree_unlock(next);
+
if (trans) {
- btrfs_tree_lock(next);
- btrfs_clean_tree_block(next);
- btrfs_wait_tree_block_writeback(next);
- btrfs_tree_unlock(next);
ret = btrfs_pin_reserved_extent(trans,
bytenr, blocksize);
if (ret) {
@@ -2637,8 +2632,6 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans,
btrfs_redirty_list_add(
trans->transaction, next);
} else {
- if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &next->bflags))
- clear_extent_buffer_dirty(next);
unaccount_log_buffer(fs_info, bytenr);
}
}
@@ -2693,11 +2686,12 @@ static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans,
next = path->nodes[*level];
+ btrfs_tree_lock(next);
+ btrfs_clear_buffer_dirty(trans, next);
+ wait_on_extent_buffer_writeback(next);
+ btrfs_tree_unlock(next);
+
if (trans) {
- btrfs_tree_lock(next);
- btrfs_clean_tree_block(next);
- btrfs_wait_tree_block_writeback(next);
- btrfs_tree_unlock(next);
ret = btrfs_pin_reserved_extent(trans,
path->nodes[*level]->start,
path->nodes[*level]->len);
@@ -2706,9 +2700,6 @@ static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans,
btrfs_redirty_list_add(trans->transaction,
next);
} else {
- if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &next->bflags))
- clear_extent_buffer_dirty(next);
-
unaccount_log_buffer(fs_info,
path->nodes[*level]->start);
}
@@ -2776,19 +2767,18 @@ static int walk_log_tree(struct btrfs_trans_handle *trans,
next = path->nodes[orig_level];
+ btrfs_tree_lock(next);
+ btrfs_clear_buffer_dirty(trans, next);
+ wait_on_extent_buffer_writeback(next);
+ btrfs_tree_unlock(next);
+
if (trans) {
- btrfs_tree_lock(next);
- btrfs_clean_tree_block(next);
- btrfs_wait_tree_block_writeback(next);
- btrfs_tree_unlock(next);
ret = btrfs_pin_reserved_extent(trans,
next->start, next->len);
if (ret)
goto out;
btrfs_redirty_list_add(trans->transaction, next);
} else {
- if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &next->bflags))
- clear_extent_buffer_dirty(next);
unaccount_log_buffer(fs_info, next->start);
}
}
@@ -3652,11 +3642,10 @@ static int flush_dir_items_batch(struct btrfs_trans_handle *trans,
/*
* If for some unexpected reason the last item's index is not greater
- * than the last index we logged, warn and return an error to fallback
- * to a transaction commit.
+ * than the last index we logged, warn and force a transaction commit.
*/
if (WARN_ON(last_index <= inode->last_dir_index_offset))
- ret = -EUCLEAN;
+ ret = BTRFS_LOG_FORCE_COMMIT;
else
inode->last_dir_index_offset = last_index;
out:
@@ -3794,7 +3783,6 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans,
struct btrfs_key min_key;
struct btrfs_root *root = inode->root;
struct btrfs_root *log = root->log_root;
- int err = 0;
int ret;
u64 last_old_dentry_offset = min_offset - 1;
u64 last_offset = (u64)-1;
@@ -3835,8 +3823,8 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans,
path->slots[0]);
if (tmp.type == BTRFS_DIR_INDEX_KEY)
last_old_dentry_offset = tmp.offset;
- } else if (ret < 0) {
- err = ret;
+ } else if (ret > 0) {
+ ret = 0;
}
goto done;
@@ -3859,7 +3847,6 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans,
if (tmp.type == BTRFS_DIR_INDEX_KEY)
last_old_dentry_offset = tmp.offset;
} else if (ret < 0) {
- err = ret;
goto done;
}
@@ -3881,12 +3868,15 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans,
*/
search:
ret = btrfs_search_slot(NULL, root, &min_key, path, 0, 0);
- if (ret > 0)
+ if (ret > 0) {
ret = btrfs_next_item(root, path);
+ if (ret > 0) {
+ /* There are no more keys in the inode's root. */
+ ret = 0;
+ goto done;
+ }
+ }
if (ret < 0)
- err = ret;
- /* If ret is 1, there are no more keys in the inode's root. */
- if (ret != 0)
goto done;
/*
@@ -3897,8 +3887,8 @@ search:
ret = process_dir_items_leaf(trans, inode, path, dst_path, ctx,
&last_old_dentry_offset);
if (ret != 0) {
- if (ret < 0)
- err = ret;
+ if (ret > 0)
+ ret = 0;
goto done;
}
path->slots[0] = btrfs_header_nritems(path->nodes[0]);
@@ -3909,10 +3899,10 @@ search:
*/
ret = btrfs_next_leaf(root, path);
if (ret) {
- if (ret == 1)
+ if (ret == 1) {
last_offset = (u64)-1;
- else
- err = ret;
+ ret = 0;
+ }
goto done;
}
btrfs_item_key_to_cpu(path->nodes[0], &min_key, path->slots[0]);
@@ -3943,7 +3933,7 @@ done:
btrfs_release_path(path);
btrfs_release_path(dst_path);
- if (err == 0) {
+ if (ret == 0) {
*last_offset_ret = last_offset;
/*
* In case the leaf was changed in the current transaction but
@@ -3954,15 +3944,13 @@ done:
* a range, last_old_dentry_offset is == to last_offset.
*/
ASSERT(last_old_dentry_offset <= last_offset);
- if (last_old_dentry_offset < last_offset) {
+ if (last_old_dentry_offset < last_offset)
ret = insert_dir_log_key(trans, log, path, ino,
last_old_dentry_offset + 1,
last_offset);
- if (ret)
- err = ret;
- }
}
- return err;
+
+ return ret;
}
/*
@@ -5604,10 +5592,8 @@ static int add_conflicting_inode(struct btrfs_trans_handle *trans,
* LOG_INODE_EXISTS mode) and slow down other fsyncs or transaction
* commits.
*/
- if (ctx->num_conflict_inodes >= MAX_CONFLICT_INODES) {
- btrfs_set_log_full_commit(trans);
+ if (ctx->num_conflict_inodes >= MAX_CONFLICT_INODES)
return BTRFS_LOG_FORCE_COMMIT;
- }
inode = btrfs_iget(root->fs_info->sb, ino, root);
/*
@@ -6466,7 +6452,6 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
* result in losing the file after a log replay.
*/
if (full_dir_logging && inode->last_unlink_trans >= trans->transid) {
- btrfs_set_log_full_commit(trans);
ret = BTRFS_LOG_FORCE_COMMIT;
goto out_unlock;
}
diff --git a/fs/btrfs/tree-log.h b/fs/btrfs/tree-log.h
index 85cd24cb0540..bdeb5216718f 100644
--- a/fs/btrfs/tree-log.h
+++ b/fs/btrfs/tree-log.h
@@ -13,8 +13,13 @@
/* return value for btrfs_log_dentry_safe that means we don't need to log it at all */
#define BTRFS_NO_LOG_SYNC 256
-/* We can't use the tree log for whatever reason, force a transaction commit */
-#define BTRFS_LOG_FORCE_COMMIT (1)
+/*
+ * We can't use the tree log for whatever reason, force a transaction commit.
+ * We use a negative value because there are functions in the logging code
+ * that need to return an error (< 0 value), false (0) or true (1). Any negative
+ * value will do, as it will cause the log to be marked for a full sync.
+ */
+#define BTRFS_LOG_FORCE_COMMIT (-(MAX_ERRNO + 1))
struct btrfs_log_ctx {
int log_ret;
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index df43093b7a46..7823168c08a6 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -728,7 +728,7 @@ static struct btrfs_fs_devices *find_fsid_reverted_metadata(
/*
* Handle the case where the scanned device is part of an fs whose last
* metadata UUID change reverted it to the original FSID. At the same
- * time * fs_devices was first created by another constitutent device
+ * time fs_devices was first created by another constituent device
* which didn't fully observe the operation. This results in an
* btrfs_fs_devices created with metadata/fsid different AND
* btrfs_fs_devices::fsid_change set AND the metadata_uuid of the
@@ -6284,91 +6284,42 @@ static bool need_full_stripe(enum btrfs_map_op op)
return (op == BTRFS_MAP_WRITE || op == BTRFS_MAP_GET_READ_MIRRORS);
}
-/*
- * Calculate the geometry of a particular (address, len) tuple. This
- * information is used to calculate how big a particular bio can get before it
- * straddles a stripe.
- *
- * @fs_info: the filesystem
- * @em: mapping containing the logical extent
- * @op: type of operation - write or read
- * @logical: address that we want to figure out the geometry of
- * @io_geom: pointer used to return values
- *
- * Returns < 0 in case a chunk for the given logical address cannot be found,
- * usually shouldn't happen unless @logical is corrupted, 0 otherwise.
- */
-int btrfs_get_io_geometry(struct btrfs_fs_info *fs_info, struct extent_map *em,
- enum btrfs_map_op op, u64 logical,
- struct btrfs_io_geometry *io_geom)
+static u64 btrfs_max_io_len(struct map_lookup *map, enum btrfs_map_op op,
+ u64 offset, u64 *stripe_nr, u64 *stripe_offset,
+ u64 *full_stripe_start)
{
- struct map_lookup *map;
- u64 len;
- u64 offset;
- u64 stripe_offset;
- u64 stripe_nr;
- u32 stripe_len;
- u64 raid56_full_stripe_start = (u64)-1;
- int data_stripes;
+ u32 stripe_len = map->stripe_len;
ASSERT(op != BTRFS_MAP_DISCARD);
- map = em->map_lookup;
- /* Offset of this logical address in the chunk */
- offset = logical - em->start;
- /* Len of a stripe in a chunk */
- stripe_len = map->stripe_len;
/*
- * Stripe_nr is where this block falls in
- * stripe_offset is the offset of this block in its stripe.
+ * Stripe_nr is the stripe where this block falls. stripe_offset is
+ * the offset of this block in its stripe.
*/
- stripe_nr = div64_u64_rem(offset, stripe_len, &stripe_offset);
- ASSERT(stripe_offset < U32_MAX);
+ *stripe_nr = div64_u64_rem(offset, stripe_len, stripe_offset);
+ ASSERT(*stripe_offset < U32_MAX);
- data_stripes = nr_data_stripes(map);
+ if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
+ unsigned long full_stripe_len = stripe_len * nr_data_stripes(map);
- /* Only stripe based profiles needs to check against stripe length. */
- if (map->type & BTRFS_BLOCK_GROUP_STRIPE_MASK) {
- u64 max_len = stripe_len - stripe_offset;
+ *full_stripe_start =
+ div64_u64(offset, full_stripe_len) * full_stripe_len;
/*
- * In case of raid56, we need to know the stripe aligned start
+ * For writes to RAID56, allow writing a full stripe set, but
+ * do not allow straddling stripe sets.
*/
- if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
- unsigned long full_stripe_len = stripe_len * data_stripes;
- raid56_full_stripe_start = offset;
-
- /*
- * Allow a write of a full stripe, but make sure we
- * don't allow straddling of stripes
- */
- raid56_full_stripe_start = div64_u64(raid56_full_stripe_start,
- full_stripe_len);
- raid56_full_stripe_start *= full_stripe_len;
-
- /*
- * For writes to RAID[56], allow a full stripeset across
- * all disks. For other RAID types and for RAID[56]
- * reads, just allow a single stripe (on a single disk).
- */
- if (op == BTRFS_MAP_WRITE) {
- max_len = stripe_len * data_stripes -
- (offset - raid56_full_stripe_start);
- }
- }
- len = min_t(u64, em->len - offset, max_len);
- } else {
- len = em->len - offset;
+ if (op == BTRFS_MAP_WRITE)
+ return full_stripe_len - (offset - *full_stripe_start);
}
- io_geom->len = len;
- io_geom->offset = offset;
- io_geom->stripe_len = stripe_len;
- io_geom->stripe_nr = stripe_nr;
- io_geom->stripe_offset = stripe_offset;
- io_geom->raid56_stripe_offset = raid56_full_stripe_start;
-
- return 0;
+ /*
+ * For other RAID types and for RAID56 reads, allow a single stripe (on
+ * a single disk).
+ */
+ if (map->type & BTRFS_BLOCK_GROUP_STRIPE_MASK)
+ return stripe_len - *stripe_offset;
+ return U64_MAX;
}
static void set_io_stripe(struct btrfs_io_stripe *dst, const struct map_lookup *map,
@@ -6387,6 +6338,7 @@ int __btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
{
struct extent_map *em;
struct map_lookup *map;
+ u64 map_offset;
u64 stripe_offset;
u64 stripe_nr;
u64 stripe_len;
@@ -6405,7 +6357,7 @@ int __btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
int patch_the_first_stripe_for_dev_replace = 0;
u64 physical_to_patch_in_first_stripe = 0;
u64 raid56_full_stripe_start = (u64)-1;
- struct btrfs_io_geometry geom;
+ u64 max_len;
ASSERT(bioc_ret);
ASSERT(op != BTRFS_MAP_DISCARD);
@@ -6413,18 +6365,14 @@ int __btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
em = btrfs_get_chunk_map(fs_info, logical, *length);
ASSERT(!IS_ERR(em));
- ret = btrfs_get_io_geometry(fs_info, em, op, logical, &geom);
- if (ret < 0)
- return ret;
-
map = em->map_lookup;
-
- *length = geom.len;
- stripe_len = geom.stripe_len;
- stripe_nr = geom.stripe_nr;
- stripe_offset = geom.stripe_offset;
- raid56_full_stripe_start = geom.raid56_stripe_offset;
data_stripes = nr_data_stripes(map);
+ stripe_len = map->stripe_len;
+
+ map_offset = logical - em->start;
+ max_len = btrfs_max_io_len(map, op, map_offset, &stripe_nr,
+ &stripe_offset, &raid56_full_stripe_start);
+ *length = min_t(u64, em->len - map_offset, max_len);
down_read(&dev_replace->rwsem);
dev_replace_is_ongoing = btrfs_dev_replace_is_ongoing(dev_replace);
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index 6b7a05f6cf82..7e51f2238f72 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -53,21 +53,6 @@ enum btrfs_raid_types {
BTRFS_NR_RAID_TYPES
};
-struct btrfs_io_geometry {
- /* remaining bytes before crossing a stripe */
- u64 len;
- /* offset of logical address in chunk */
- u64 offset;
- /* length of single IO stripe */
- u32 stripe_len;
- /* offset of address in stripe */
- u32 stripe_offset;
- /* number of stripe where address falls */
- u64 stripe_nr;
- /* offset of raid56 stripe into the chunk */
- u64 raid56_stripe_offset;
-};
-
/*
* Use sequence counter to get consistent device stat data on
* 32-bit processors.
@@ -545,9 +530,6 @@ int __btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
struct btrfs_discard_stripe *btrfs_map_discard(struct btrfs_fs_info *fs_info,
u64 logical, u64 *length_ret,
u32 *num_stripes);
-int btrfs_get_io_geometry(struct btrfs_fs_info *fs_info, struct extent_map *map,
- enum btrfs_map_op op, u64 logical,
- struct btrfs_io_geometry *io_geom);
int btrfs_read_sys_array(struct btrfs_fs_info *fs_info);
int btrfs_read_chunk_tree(struct btrfs_fs_info *fs_info);
struct btrfs_block_group *btrfs_create_chunk(struct btrfs_trans_handle *trans,
diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c
index 1f503e8e42d4..f95b2c94d619 100644
--- a/fs/btrfs/zoned.c
+++ b/fs/btrfs/zoned.c
@@ -17,6 +17,7 @@
#include "space-info.h"
#include "fs.h"
#include "accessors.h"
+#include "bio.h"
/* Maximum number of zones to report per blkdev_report_zones() call */
#define BTRFS_REPORT_NR_ZONES 4096
@@ -160,7 +161,7 @@ static int sb_write_pointer(struct block_device *bdev, struct blk_zone *zones,
*/
static inline u32 sb_zone_number(int shift, int mirror)
{
- u64 zone;
+ u64 zone = U64_MAX;
ASSERT(mirror < BTRFS_SUPER_MIRROR_MAX);
switch (mirror) {
@@ -220,7 +221,6 @@ static int btrfs_get_dev_zones(struct btrfs_device *device, u64 pos,
struct blk_zone *zones, unsigned int *nr_zones)
{
struct btrfs_zoned_device_info *zinfo = device->zone_info;
- u32 zno;
int ret;
if (!*nr_zones)
@@ -235,6 +235,7 @@ static int btrfs_get_dev_zones(struct btrfs_device *device, u64 pos,
/* Check cache */
if (zinfo->zone_cache) {
unsigned int i;
+ u32 zno;
ASSERT(IS_ALIGNED(pos, zinfo->zone_size));
zno = pos >> zinfo->zone_size_shift;
@@ -274,9 +275,12 @@ static int btrfs_get_dev_zones(struct btrfs_device *device, u64 pos,
return -EIO;
/* Populate cache */
- if (zinfo->zone_cache)
+ if (zinfo->zone_cache) {
+ u32 zno = pos >> zinfo->zone_size_shift;
+
memcpy(zinfo->zone_cache + zno, zones,
sizeof(*zinfo->zone_cache) * *nr_zones);
+ }
return 0;
}
@@ -417,25 +421,6 @@ int btrfs_get_dev_zone_info(struct btrfs_device *device, bool populate_cache)
nr_sectors = bdev_nr_sectors(bdev);
zone_info->zone_size_shift = ilog2(zone_info->zone_size);
zone_info->nr_zones = nr_sectors >> ilog2(zone_sectors);
- /*
- * We limit max_zone_append_size also by max_segments *
- * PAGE_SIZE. Technically, we can have multiple pages per segment. But,
- * since btrfs adds the pages one by one to a bio, and btrfs cannot
- * increase the metadata reservation even if it increases the number of
- * extents, it is safe to stick with the limit.
- *
- * With the zoned emulation, we can have non-zoned device on the zoned
- * mode. In this case, we don't have a valid max zone append size. So,
- * use max_segments * PAGE_SIZE as the pseudo max_zone_append_size.
- */
- if (bdev_is_zoned(bdev)) {
- zone_info->max_zone_append_size = min_t(u64,
- (u64)bdev_max_zone_append_sectors(bdev) << SECTOR_SHIFT,
- (u64)bdev_max_segments(bdev) << PAGE_SHIFT);
- } else {
- zone_info->max_zone_append_size =
- (u64)bdev_max_segments(bdev) << PAGE_SHIFT;
- }
if (!IS_ALIGNED(nr_sectors, zone_sectors))
zone_info->nr_zones++;
@@ -715,9 +700,9 @@ static int btrfs_check_for_zoned_device(struct btrfs_fs_info *fs_info)
int btrfs_check_zoned_mode(struct btrfs_fs_info *fs_info)
{
+ struct queue_limits *lim = &fs_info->limits;
struct btrfs_device *device;
u64 zone_size = 0;
- u64 max_zone_append_size = 0;
int ret;
/*
@@ -727,6 +712,8 @@ int btrfs_check_zoned_mode(struct btrfs_fs_info *fs_info)
if (!btrfs_fs_incompat(fs_info, ZONED))
return btrfs_check_for_zoned_device(fs_info);
+ blk_set_stacking_limits(lim);
+
list_for_each_entry(device, &fs_info->fs_devices->devices, dev_list) {
struct btrfs_zoned_device_info *zone_info = device->zone_info;
@@ -741,10 +728,17 @@ int btrfs_check_zoned_mode(struct btrfs_fs_info *fs_info)
zone_info->zone_size, zone_size);
return -EINVAL;
}
- if (!max_zone_append_size ||
- (zone_info->max_zone_append_size &&
- zone_info->max_zone_append_size < max_zone_append_size))
- max_zone_append_size = zone_info->max_zone_append_size;
+
+ /*
+ * With zoned emulation, we can have a non-zoned device in zoned
+ * mode. In this case, we don't have a valid max zone append
+ * size.
+ */
+ if (bdev_is_zoned(device->bdev)) {
+ blk_stack_limits(lim,
+ &bdev_get_queue(device->bdev)->limits,
+ 0);
+ }
}
/*
@@ -765,8 +759,18 @@ int btrfs_check_zoned_mode(struct btrfs_fs_info *fs_info)
}
fs_info->zone_size = zone_size;
- fs_info->max_zone_append_size = ALIGN_DOWN(max_zone_append_size,
- fs_info->sectorsize);
+ /*
+ * Also limit max_zone_append_size by max_segments * PAGE_SIZE.
+ * Technically, we can have multiple pages per segment. But, since
+ * we add the pages one by one to a bio, and cannot increase the
+ * metadata reservation even if it increases the number of extents, it
+ * is safe to stick with the limit.
+ */
+ fs_info->max_zone_append_size = ALIGN_DOWN(
+ min3((u64)lim->max_zone_append_sectors << SECTOR_SHIFT,
+ (u64)lim->max_sectors << SECTOR_SHIFT,
+ (u64)lim->max_segments << PAGE_SHIFT),
+ fs_info->sectorsize);
fs_info->fs_devices->chunk_alloc_policy = BTRFS_CHUNK_ALLOC_ZONED;
if (fs_info->max_zone_append_size < fs_info->max_extent_size)
fs_info->max_extent_size = fs_info->max_zone_append_size;
@@ -1623,8 +1627,10 @@ void btrfs_free_redirty_list(struct btrfs_transaction *trans)
spin_unlock(&trans->releasing_ebs_lock);
}
-bool btrfs_use_zone_append(struct btrfs_inode *inode, u64 start)
+bool btrfs_use_zone_append(struct btrfs_bio *bbio)
{
+ u64 start = (bbio->bio.bi_iter.bi_sector << SECTOR_SHIFT);
+ struct btrfs_inode *inode = bbio->inode;
struct btrfs_fs_info *fs_info = inode->root->fs_info;
struct btrfs_block_group *cache;
bool ret = false;
@@ -1635,6 +1641,9 @@ bool btrfs_use_zone_append(struct btrfs_inode *inode, u64 start)
if (!is_data_inode(&inode->vfs_inode))
return false;
+ if (btrfs_op(&bbio->bio) != BTRFS_MAP_WRITE)
+ return false;
+
/*
* Using REQ_OP_ZONE_APPEND for relocation can break assumptions on the
* extent layout the relocation code has.
@@ -1657,22 +1666,16 @@ bool btrfs_use_zone_append(struct btrfs_inode *inode, u64 start)
return ret;
}
-void btrfs_record_physical_zoned(struct inode *inode, u64 file_offset,
- struct bio *bio)
+void btrfs_record_physical_zoned(struct btrfs_bio *bbio)
{
+ const u64 physical = bbio->bio.bi_iter.bi_sector << SECTOR_SHIFT;
struct btrfs_ordered_extent *ordered;
- const u64 physical = bio->bi_iter.bi_sector << SECTOR_SHIFT;
- if (bio_op(bio) != REQ_OP_ZONE_APPEND)
- return;
-
- ordered = btrfs_lookup_ordered_extent(BTRFS_I(inode), file_offset);
+ ordered = btrfs_lookup_ordered_extent(bbio->inode, bbio->file_offset);
if (WARN_ON(!ordered))
return;
ordered->physical = physical;
- ordered->bdev = bio->bi_bdev;
-
btrfs_put_ordered_extent(ordered);
}
@@ -1684,43 +1687,46 @@ void btrfs_rewrite_logical_zoned(struct btrfs_ordered_extent *ordered)
struct extent_map *em;
struct btrfs_ordered_sum *sum;
u64 orig_logical = ordered->disk_bytenr;
- u64 *logical = NULL;
- int nr, stripe_len;
+ struct map_lookup *map;
+ u64 physical = ordered->physical;
+ u64 chunk_start_phys;
+ u64 logical;
- /* Zoned devices should not have partitions. So, we can assume it is 0 */
- ASSERT(!bdev_is_partition(ordered->bdev));
- if (WARN_ON(!ordered->bdev))
+ em = btrfs_get_chunk_map(fs_info, orig_logical, 1);
+ if (IS_ERR(em))
return;
+ map = em->map_lookup;
+ chunk_start_phys = map->stripes[0].physical;
- if (WARN_ON(btrfs_rmap_block(fs_info, orig_logical, ordered->bdev,
- ordered->physical, &logical, &nr,
- &stripe_len)))
- goto out;
-
- WARN_ON(nr != 1);
+ if (WARN_ON_ONCE(map->num_stripes > 1) ||
+ WARN_ON_ONCE((map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) != 0) ||
+ WARN_ON_ONCE(physical < chunk_start_phys) ||
+ WARN_ON_ONCE(physical > chunk_start_phys + em->orig_block_len)) {
+ free_extent_map(em);
+ return;
+ }
+ logical = em->start + (physical - map->stripes[0].physical);
+ free_extent_map(em);
- if (orig_logical == *logical)
- goto out;
+ if (orig_logical == logical)
+ return;
- ordered->disk_bytenr = *logical;
+ ordered->disk_bytenr = logical;
em_tree = &inode->extent_tree;
write_lock(&em_tree->lock);
em = search_extent_mapping(em_tree, ordered->file_offset,
ordered->num_bytes);
- em->block_start = *logical;
+ em->block_start = logical;
free_extent_map(em);
write_unlock(&em_tree->lock);
list_for_each_entry(sum, &ordered->list, list) {
- if (*logical < orig_logical)
- sum->bytenr -= orig_logical - *logical;
+ if (logical < orig_logical)
+ sum->bytenr -= orig_logical - logical;
else
- sum->bytenr += *logical - orig_logical;
+ sum->bytenr += logical - orig_logical;
}
-
-out:
- kfree(logical);
}
bool btrfs_check_meta_write_pointer(struct btrfs_fs_info *fs_info,
@@ -1845,26 +1851,6 @@ int btrfs_sync_zone_write_pointer(struct btrfs_device *tgt_dev, u64 logical,
return btrfs_zoned_issue_zeroout(tgt_dev, physical_pos, length);
}
-struct btrfs_device *btrfs_zoned_get_device(struct btrfs_fs_info *fs_info,
- u64 logical, u64 length)
-{
- struct btrfs_device *device;
- struct extent_map *em;
- struct map_lookup *map;
-
- em = btrfs_get_chunk_map(fs_info, logical, length);
- if (IS_ERR(em))
- return ERR_CAST(em);
-
- map = em->map_lookup;
- /* We only support single profile for now */
- device = map->stripes[0].dev;
-
- free_extent_map(em);
-
- return device;
-}
-
/*
* Activate block group and underlying device zones
*
diff --git a/fs/btrfs/zoned.h b/fs/btrfs/zoned.h
index f43990985d80..c0570d35fea2 100644
--- a/fs/btrfs/zoned.h
+++ b/fs/btrfs/zoned.h
@@ -20,7 +20,6 @@ struct btrfs_zoned_device_info {
*/
u64 zone_size;
u8 zone_size_shift;
- u64 max_zone_append_size;
u32 nr_zones;
unsigned int max_active_zones;
atomic_t active_zones_left;
@@ -56,9 +55,8 @@ void btrfs_calc_zone_unusable(struct btrfs_block_group *cache);
void btrfs_redirty_list_add(struct btrfs_transaction *trans,
struct extent_buffer *eb);
void btrfs_free_redirty_list(struct btrfs_transaction *trans);
-bool btrfs_use_zone_append(struct btrfs_inode *inode, u64 start);
-void btrfs_record_physical_zoned(struct inode *inode, u64 file_offset,
- struct bio *bio);
+bool btrfs_use_zone_append(struct btrfs_bio *bbio);
+void btrfs_record_physical_zoned(struct btrfs_bio *bbio);
void btrfs_rewrite_logical_zoned(struct btrfs_ordered_extent *ordered);
bool btrfs_check_meta_write_pointer(struct btrfs_fs_info *fs_info,
struct extent_buffer *eb,
@@ -68,8 +66,6 @@ void btrfs_revert_meta_write_pointer(struct btrfs_block_group *cache,
int btrfs_zoned_issue_zeroout(struct btrfs_device *device, u64 physical, u64 length);
int btrfs_sync_zone_write_pointer(struct btrfs_device *tgt_dev, u64 logical,
u64 physical_start, u64 physical_pos);
-struct btrfs_device *btrfs_zoned_get_device(struct btrfs_fs_info *fs_info,
- u64 logical, u64 length);
bool btrfs_zone_activate(struct btrfs_block_group *block_group);
int btrfs_zone_finish(struct btrfs_block_group *block_group);
bool btrfs_can_activate_zone(struct btrfs_fs_devices *fs_devices, u64 flags);
@@ -185,13 +181,12 @@ static inline void btrfs_redirty_list_add(struct btrfs_transaction *trans,
struct extent_buffer *eb) { }
static inline void btrfs_free_redirty_list(struct btrfs_transaction *trans) { }
-static inline bool btrfs_use_zone_append(struct btrfs_inode *inode, u64 start)
+static inline bool btrfs_use_zone_append(struct btrfs_bio *bbio)
{
return false;
}
-static inline void btrfs_record_physical_zoned(struct inode *inode,
- u64 file_offset, struct bio *bio)
+static inline void btrfs_record_physical_zoned(struct btrfs_bio *bbio)
{
}
@@ -224,13 +219,6 @@ static inline int btrfs_sync_zone_write_pointer(struct btrfs_device *tgt_dev,
return -EOPNOTSUPP;
}
-static inline struct btrfs_device *btrfs_zoned_get_device(
- struct btrfs_fs_info *fs_info,
- u64 logical, u64 length)
-{
- return ERR_PTR(-EOPNOTSUPP);
-}
-
static inline bool btrfs_zone_activate(struct btrfs_block_group *block_group)
{
return true;
diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c
index 9804714b1751..f771001574d0 100644
--- a/fs/iomap/direct-io.c
+++ b/fs/iomap/direct-io.c
@@ -217,16 +217,10 @@ static inline blk_opf_t iomap_dio_bio_opflags(struct iomap_dio *dio,
{
blk_opf_t opflags = REQ_SYNC | REQ_IDLE;
- if (!(dio->flags & IOMAP_DIO_WRITE)) {
- WARN_ON_ONCE(iomap->flags & IOMAP_F_ZONE_APPEND);
+ if (!(dio->flags & IOMAP_DIO_WRITE))
return REQ_OP_READ;
- }
-
- if (iomap->flags & IOMAP_F_ZONE_APPEND)
- opflags |= REQ_OP_ZONE_APPEND;
- else
- opflags |= REQ_OP_WRITE;
+ opflags |= REQ_OP_WRITE;
if (use_fua)
opflags |= REQ_FUA;
else
diff --git a/include/linux/bio.h b/include/linux/bio.h
index c1da63f6c808..d766be7152e1 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -12,6 +12,8 @@
#define BIO_MAX_VECS 256U
+struct queue_limits;
+
static inline unsigned int bio_max_segs(unsigned int nr_segs)
{
return min(nr_segs, BIO_MAX_VECS);
@@ -375,6 +377,8 @@ static inline void bip_set_seed(struct bio_integrity_payload *bip,
void bio_trim(struct bio *bio, sector_t offset, sector_t size);
extern struct bio *bio_split(struct bio *bio, int sectors,
gfp_t gfp, struct bio_set *bs);
+struct bio *bio_split_rw(struct bio *bio, const struct queue_limits *lim,
+ unsigned *segs, struct bio_set *bs, unsigned max_bytes);
/**
* bio_next_split - get next @sectors from a bio, splitting if necessary
diff --git a/include/linux/iomap.h b/include/linux/iomap.h
index 0983dfc9a203..fca43a4bd96b 100644
--- a/include/linux/iomap.h
+++ b/include/linux/iomap.h
@@ -58,8 +58,7 @@ struct vm_fault;
#define IOMAP_F_SHARED (1U << 2)
#define IOMAP_F_MERGED (1U << 3)
#define IOMAP_F_BUFFER_HEAD (1U << 4)
-#define IOMAP_F_ZONE_APPEND (1U << 5)
-#define IOMAP_F_XATTR (1U << 6)
+#define IOMAP_F_XATTR (1U << 5)
/*
* Flags set by the core iomap code during operations:
diff --git a/include/trace/events/btrfs.h b/include/trace/events/btrfs.h
index 6548b5b5aa60..75d7d22c3a27 100644
--- a/include/trace/events/btrfs.h
+++ b/include/trace/events/btrfs.h
@@ -32,6 +32,7 @@ struct prelim_ref;
struct btrfs_space_info;
struct btrfs_raid_bio;
struct raid56_bio_trace_info;
+struct find_free_extent_ctl;
#define show_ref_type(type) \
__print_symbolic(type, \
@@ -1241,76 +1242,156 @@ DEFINE_EVENT(btrfs__reserved_extent, btrfs_reserved_extent_free,
TRACE_EVENT(find_free_extent,
- TP_PROTO(const struct btrfs_root *root, u64 num_bytes,
- u64 empty_size, u64 data),
+ TP_PROTO(const struct btrfs_root *root,
+ const struct find_free_extent_ctl *ffe_ctl),
- TP_ARGS(root, num_bytes, empty_size, data),
+ TP_ARGS(root, ffe_ctl),
TP_STRUCT__entry_btrfs(
__field( u64, root_objectid )
__field( u64, num_bytes )
__field( u64, empty_size )
- __field( u64, data )
+ __field( u64, flags )
),
TP_fast_assign_btrfs(root->fs_info,
__entry->root_objectid = root->root_key.objectid;
- __entry->num_bytes = num_bytes;
- __entry->empty_size = empty_size;
- __entry->data = data;
+ __entry->num_bytes = ffe_ctl->num_bytes;
+ __entry->empty_size = ffe_ctl->empty_size;
+ __entry->flags = ffe_ctl->flags;
),
TP_printk_btrfs("root=%llu(%s) len=%llu empty_size=%llu flags=%llu(%s)",
show_root_type(__entry->root_objectid),
- __entry->num_bytes, __entry->empty_size, __entry->data,
- __print_flags((unsigned long)__entry->data, "|",
+ __entry->num_bytes, __entry->empty_size, __entry->flags,
+ __print_flags((unsigned long)__entry->flags, "|",
+ BTRFS_GROUP_FLAGS))
+);
+
+TRACE_EVENT(find_free_extent_search_loop,
+
+ TP_PROTO(const struct btrfs_root *root,
+ const struct find_free_extent_ctl *ffe_ctl),
+
+ TP_ARGS(root, ffe_ctl),
+
+ TP_STRUCT__entry_btrfs(
+ __field( u64, root_objectid )
+ __field( u64, num_bytes )
+ __field( u64, empty_size )
+ __field( u64, flags )
+ __field( u64, loop )
+ ),
+
+ TP_fast_assign_btrfs(root->fs_info,
+ __entry->root_objectid = root->root_key.objectid;
+ __entry->num_bytes = ffe_ctl->num_bytes;
+ __entry->empty_size = ffe_ctl->empty_size;
+ __entry->flags = ffe_ctl->flags;
+ __entry->loop = ffe_ctl->loop;
+ ),
+
+ TP_printk_btrfs("root=%llu(%s) len=%llu empty_size=%llu flags=%llu(%s) loop=%llu",
+ show_root_type(__entry->root_objectid),
+ __entry->num_bytes, __entry->empty_size, __entry->flags,
+ __print_flags((unsigned long)__entry->flags, "|", BTRFS_GROUP_FLAGS),
+ __entry->loop)
+);
+
+TRACE_EVENT(find_free_extent_have_block_group,
+
+ TP_PROTO(const struct btrfs_root *root,
+ const struct find_free_extent_ctl *ffe_ctl,
+ const struct btrfs_block_group *block_group),
+
+ TP_ARGS(root, ffe_ctl, block_group),
+
+ TP_STRUCT__entry_btrfs(
+ __field( u64, root_objectid )
+ __field( u64, num_bytes )
+ __field( u64, empty_size )
+ __field( u64, flags )
+ __field( u64, loop )
+ __field( bool, hinted )
+ __field( u64, bg_start )
+ __field( u64, bg_flags )
+ ),
+
+ TP_fast_assign_btrfs(root->fs_info,
+ __entry->root_objectid = root->root_key.objectid;
+ __entry->num_bytes = ffe_ctl->num_bytes;
+ __entry->empty_size = ffe_ctl->empty_size;
+ __entry->flags = ffe_ctl->flags;
+ __entry->loop = ffe_ctl->loop;
+ __entry->hinted = ffe_ctl->hinted;
+ __entry->bg_start = block_group->start;
+ __entry->bg_flags = block_group->flags;
+ ),
+
+ TP_printk_btrfs(
+"root=%llu(%s) len=%llu empty_size=%llu flags=%llu(%s) loop=%llu hinted=%d block_group=%llu bg_flags=%llu(%s)",
+ show_root_type(__entry->root_objectid),
+ __entry->num_bytes, __entry->empty_size, __entry->flags,
+ __print_flags((unsigned long)__entry->flags, "|", BTRFS_GROUP_FLAGS),
+ __entry->loop, __entry->hinted,
+ __entry->bg_start, __entry->bg_flags,
+ __print_flags((unsigned long)__entry->bg_flags, "|",
BTRFS_GROUP_FLAGS))
);
DECLARE_EVENT_CLASS(btrfs__reserve_extent,
- TP_PROTO(const struct btrfs_block_group *block_group, u64 start,
- u64 len),
+ TP_PROTO(const struct btrfs_block_group *block_group,
+ const struct find_free_extent_ctl *ffe_ctl),
- TP_ARGS(block_group, start, len),
+ TP_ARGS(block_group, ffe_ctl),
TP_STRUCT__entry_btrfs(
__field( u64, bg_objectid )
__field( u64, flags )
+ __field( int, bg_size_class )
__field( u64, start )
__field( u64, len )
+ __field( u64, loop )
+ __field( bool, hinted )
+ __field( int, size_class )
),
TP_fast_assign_btrfs(block_group->fs_info,
__entry->bg_objectid = block_group->start;
__entry->flags = block_group->flags;
- __entry->start = start;
- __entry->len = len;
+ __entry->bg_size_class = block_group->size_class;
+ __entry->start = ffe_ctl->search_start;
+ __entry->len = ffe_ctl->num_bytes;
+ __entry->loop = ffe_ctl->loop;
+ __entry->hinted = ffe_ctl->hinted;
+ __entry->size_class = ffe_ctl->size_class;
),
- TP_printk_btrfs("root=%llu(%s) block_group=%llu flags=%llu(%s) "
- "start=%llu len=%llu",
+ TP_printk_btrfs(
+"root=%llu(%s) block_group=%llu flags=%llu(%s) bg_size_class=%d start=%llu len=%llu loop=%llu hinted=%d size_class=%d",
show_root_type(BTRFS_EXTENT_TREE_OBJECTID),
__entry->bg_objectid,
__entry->flags, __print_flags((unsigned long)__entry->flags,
"|", BTRFS_GROUP_FLAGS),
- __entry->start, __entry->len)
+ __entry->bg_size_class, __entry->start, __entry->len,
+ __entry->loop, __entry->hinted, __entry->size_class)
);
DEFINE_EVENT(btrfs__reserve_extent, btrfs_reserve_extent,
- TP_PROTO(const struct btrfs_block_group *block_group, u64 start,
- u64 len),
+ TP_PROTO(const struct btrfs_block_group *block_group,
+ const struct find_free_extent_ctl *ffe_ctl),
- TP_ARGS(block_group, start, len)
+ TP_ARGS(block_group, ffe_ctl)
);
DEFINE_EVENT(btrfs__reserve_extent, btrfs_reserve_extent_cluster,
- TP_PROTO(const struct btrfs_block_group *block_group, u64 start,
- u64 len),
+ TP_PROTO(const struct btrfs_block_group *block_group,
+ const struct find_free_extent_ctl *ffe_ctl),
- TP_ARGS(block_group, start, len)
+ TP_ARGS(block_group, ffe_ctl)
);
TRACE_EVENT(btrfs_find_cluster,