Diffstat (limited to 'fs/btrfs/extent_io.c')
-rw-r--r-- | fs/btrfs/extent_io.c | 638
1 file changed, 415 insertions, 223 deletions
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 9e81d25dea70..4e03a6d3aa32 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -13,6 +13,7 @@ #include <linux/pagevec.h> #include <linux/prefetch.h> #include <linux/cleancache.h> +#include <linux/fsverity.h> #include "misc.h" #include "extent_io.h" #include "extent-io-tree.h" @@ -172,6 +173,8 @@ int __must_check submit_one_bio(struct bio *bio, int mirror_num, bio->bi_private = NULL; + /* Caller should ensure the bio has at least some range added */ + ASSERT(bio->bi_iter.bi_size); if (is_data_inode(tree->private_data)) ret = btrfs_submit_data_bio(tree->private_data, bio, mirror_num, bio_flags); @@ -238,7 +241,7 @@ int __init extent_io_init(void) return -ENOMEM; if (bioset_init(&btrfs_bioset, BIO_POOL_SIZE, - offsetof(struct btrfs_io_bio, bio), + offsetof(struct btrfs_bio, bio), BIOSET_NEED_BVECS)) goto free_buffer_cache; @@ -1972,10 +1975,18 @@ static noinline int lock_delalloc_pages(struct inode *inode, /* * Find and lock a contiguous range of bytes in the file marked as delalloc, no - * more than @max_bytes. @Start and @end are used to return the range, + * more than @max_bytes. * - * Return: true if we find something - * false if nothing was in the tree + * @start: The original start bytenr to search. + * Will store the extent range start bytenr. + * @end: The original end bytenr of the search range + * Will store the extent range end bytenr. + * + * Return true if we find a delalloc range which starts inside the original + * range, and @start/@end will store the delalloc range start/end. + * + * Return false if we can't find any delalloc range which starts inside the + * original range, and @start/@end will be the non-delalloc range start/end. */ EXPORT_FOR_TESTS noinline_for_stack bool find_lock_delalloc_range(struct inode *inode, @@ -1983,6 +1994,8 @@ noinline_for_stack bool find_lock_delalloc_range(struct inode *inode, u64 *end) { struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree; + const u64 orig_start = *start; + const u64 orig_end = *end; u64 max_bytes = BTRFS_MAX_EXTENT_SIZE; u64 delalloc_start; u64 delalloc_end; @@ -1991,15 +2004,23 @@ noinline_for_stack bool find_lock_delalloc_range(struct inode *inode, int ret; int loops = 0; + /* Caller should pass a valid @end to indicate the search range end */ + ASSERT(orig_end > orig_start); + + /* The range should at least cover part of the page */ + ASSERT(!(orig_start >= page_offset(locked_page) + PAGE_SIZE || + orig_end <= page_offset(locked_page))); again: /* step one, find a bunch of delalloc bytes starting at start */ delalloc_start = *start; delalloc_end = 0; found = btrfs_find_delalloc_range(tree, &delalloc_start, &delalloc_end, max_bytes, &cached_state); - if (!found || delalloc_end <= *start) { + if (!found || delalloc_end <= *start || delalloc_start > orig_end) { *start = delalloc_start; - *end = delalloc_end; + + /* @delalloc_end can be -1, never go beyond @orig_end */ + *end = min(delalloc_end, orig_end); free_extent_state(cached_state); return false; } @@ -2245,18 +2266,6 @@ int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end, return bitset; } -/* - * helper function to set a given page up to date if all the - * extents in the tree for that page are up to date - */ -static void check_page_uptodate(struct extent_io_tree *tree, struct page *page) -{ - u64 start = page_offset(page); - u64 end = start + PAGE_SIZE - 1; - if (test_range_bit(tree, start, end, EXTENT_UPTODATE, 1, NULL)) - SetPageUptodate(page); -} - int 
free_io_failure(struct extent_io_tree *failure_tree, struct extent_io_tree *io_tree, struct io_failure_record *rec) @@ -2291,15 +2300,15 @@ int free_io_failure(struct extent_io_tree *failure_tree, * currently, there can be no more than two copies of every data bit. thus, * exactly one rewrite is required. */ -int repair_io_failure(struct btrfs_fs_info *fs_info, u64 ino, u64 start, - u64 length, u64 logical, struct page *page, - unsigned int pg_offset, int mirror_num) +static int repair_io_failure(struct btrfs_fs_info *fs_info, u64 ino, u64 start, + u64 length, u64 logical, struct page *page, + unsigned int pg_offset, int mirror_num) { struct bio *bio; struct btrfs_device *dev; u64 map_length = 0; u64 sector; - struct btrfs_bio *bbio = NULL; + struct btrfs_io_context *bioc = NULL; int ret; ASSERT(!(fs_info->sb->s_flags & SB_RDONLY)); @@ -2308,12 +2317,12 @@ int repair_io_failure(struct btrfs_fs_info *fs_info, u64 ino, u64 start, if (btrfs_is_zoned(fs_info)) return btrfs_repair_one_zone(fs_info, logical); - bio = btrfs_io_bio_alloc(1); + bio = btrfs_bio_alloc(1); bio->bi_iter.bi_size = 0; map_length = length; /* - * Avoid races with device replace and make sure our bbio has devices + * Avoid races with device replace and make sure our bioc has devices * associated to its stripes that don't go away while we are doing the * read repair operation. */ @@ -2326,28 +2335,28 @@ int repair_io_failure(struct btrfs_fs_info *fs_info, u64 ino, u64 start, * stripe's dev and sector. */ ret = btrfs_map_block(fs_info, BTRFS_MAP_READ, logical, - &map_length, &bbio, 0); + &map_length, &bioc, 0); if (ret) { btrfs_bio_counter_dec(fs_info); bio_put(bio); return -EIO; } - ASSERT(bbio->mirror_num == 1); + ASSERT(bioc->mirror_num == 1); } else { ret = btrfs_map_block(fs_info, BTRFS_MAP_WRITE, logical, - &map_length, &bbio, mirror_num); + &map_length, &bioc, mirror_num); if (ret) { btrfs_bio_counter_dec(fs_info); bio_put(bio); return -EIO; } - BUG_ON(mirror_num != bbio->mirror_num); + BUG_ON(mirror_num != bioc->mirror_num); } - sector = bbio->stripes[bbio->mirror_num - 1].physical >> 9; + sector = bioc->stripes[bioc->mirror_num - 1].physical >> 9; bio->bi_iter.bi_sector = sector; - dev = bbio->stripes[bbio->mirror_num - 1].dev; - btrfs_put_bbio(bbio); + dev = bioc->stripes[bioc->mirror_num - 1].dev; + btrfs_put_bioc(bioc); if (!dev || !dev->bdev || !test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state)) { btrfs_bio_counter_dec(fs_info); @@ -2627,10 +2636,10 @@ int btrfs_repair_one_sector(struct inode *inode, struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree; struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree; - struct btrfs_io_bio *failed_io_bio = btrfs_io_bio(failed_bio); + struct btrfs_bio *failed_bbio = btrfs_bio(failed_bio); const int icsum = bio_offset >> fs_info->sectorsize_bits; struct bio *repair_bio; - struct btrfs_io_bio *repair_io_bio; + struct btrfs_bio *repair_bbio; blk_status_t status; btrfs_debug(fs_info, @@ -2648,24 +2657,23 @@ int btrfs_repair_one_sector(struct inode *inode, return -EIO; } - repair_bio = btrfs_io_bio_alloc(1); - repair_io_bio = btrfs_io_bio(repair_bio); + repair_bio = btrfs_bio_alloc(1); + repair_bbio = btrfs_bio(repair_bio); repair_bio->bi_opf = REQ_OP_READ; repair_bio->bi_end_io = failed_bio->bi_end_io; repair_bio->bi_iter.bi_sector = failrec->logical >> 9; repair_bio->bi_private = failed_bio->bi_private; - if (failed_io_bio->csum) { + if (failed_bbio->csum) { const u32 csum_size = fs_info->csum_size; - 
repair_io_bio->csum = repair_io_bio->csum_inline; - memcpy(repair_io_bio->csum, - failed_io_bio->csum + csum_size * icsum, csum_size); + repair_bbio->csum = repair_bbio->csum_inline; + memcpy(repair_bbio->csum, + failed_bbio->csum + csum_size * icsum, csum_size); } bio_add_page(repair_bio, page, failrec->len, pgoff); - repair_io_bio->logical = failrec->start; - repair_io_bio->iter = repair_bio->bi_iter; + repair_bbio->iter = repair_bio->bi_iter; btrfs_debug(btrfs_sb(inode->i_sb), "repair read error: submitting new read to mirror %d", @@ -2688,7 +2696,15 @@ static void end_page_read(struct page *page, bool uptodate, u64 start, u32 len) start + len <= page_offset(page) + PAGE_SIZE); if (uptodate) { - btrfs_page_set_uptodate(fs_info, page, start, len); + if (fsverity_active(page->mapping->host) && + !PageError(page) && + !PageUptodate(page) && + start < i_size_read(page->mapping->host) && + !fsverity_verify_page(page)) { + btrfs_page_set_error(fs_info, page, start, len); + } else { + btrfs_page_set_uptodate(fs_info, page, start, len); + } } else { btrfs_page_clear_uptodate(fs_info, page, start, len); btrfs_page_set_error(fs_info, page, start, len); @@ -2779,7 +2795,7 @@ next: void end_extent_writepage(struct page *page, int err, u64 start, u64 end) { struct btrfs_inode *inode; - int uptodate = (err == 0); + const bool uptodate = (err == 0); int ret = 0; ASSERT(page && page->mapping); @@ -2787,8 +2803,14 @@ void end_extent_writepage(struct page *page, int err, u64 start, u64 end) btrfs_writepage_endio_finish_ordered(inode, page, start, end, uptodate); if (!uptodate) { - ClearPageUptodate(page); - SetPageError(page); + const struct btrfs_fs_info *fs_info = inode->root->fs_info; + u32 len; + + ASSERT(end + 1 - start <= U32_MAX); + len = end + 1 - start; + + btrfs_page_clear_uptodate(fs_info, page, start, len); + btrfs_page_set_error(fs_info, page, start, len); ret = err < 0 ? 
err : -EIO; mapping_set_error(page->mapping, ret); } @@ -2971,7 +2993,7 @@ static struct extent_buffer *find_extent_buffer_readpage( static void end_bio_extent_readpage(struct bio *bio) { struct bio_vec *bvec; - struct btrfs_io_bio *io_bio = btrfs_io_bio(bio); + struct btrfs_bio *bbio = btrfs_bio(bio); struct extent_io_tree *tree, *failure_tree; struct processed_extent processed = { 0 }; /* @@ -2998,7 +3020,7 @@ static void end_bio_extent_readpage(struct bio *bio) btrfs_debug(fs_info, "end_bio_extent_readpage: bi_sector=%llu, err=%d, mirror=%u", bio->bi_iter.bi_sector, bio->bi_status, - io_bio->mirror_num); + bbio->mirror_num); tree = &BTRFS_I(inode)->io_tree; failure_tree = &BTRFS_I(inode)->io_failure_tree; @@ -3023,14 +3045,14 @@ static void end_bio_extent_readpage(struct bio *bio) end = start + bvec->bv_len - 1; len = bvec->bv_len; - mirror = io_bio->mirror_num; + mirror = bbio->mirror_num; if (likely(uptodate)) { if (is_data_inode(inode)) { - error_bitmap = btrfs_verify_data_csum(io_bio, + error_bitmap = btrfs_verify_data_csum(bbio, bio_offset, page, start, end); ret = error_bitmap; } else { - ret = btrfs_validate_metadata_buffer(io_bio, + ret = btrfs_validate_metadata_buffer(bbio, page, start, end, mirror); } if (ret) @@ -3097,11 +3119,11 @@ readpage_ok: /* Update page status and unlock */ end_page_read(page, uptodate, start, len); endio_readpage_release_extent(&processed, BTRFS_I(inode), - start, end, uptodate); + start, end, PageUptodate(page)); } /* Release the last extent */ endio_readpage_release_extent(&processed, NULL, 0, 0, false); - btrfs_io_bio_free_csum(io_bio); + btrfs_bio_free_csum(bbio); bio_put(bio); } @@ -3110,63 +3132,55 @@ readpage_ok: * new bio by bio_alloc_bioset as it does not initialize the bytes outside of * 'bio' because use of __GFP_ZERO is not supported. */ -static inline void btrfs_io_bio_init(struct btrfs_io_bio *btrfs_bio) +static inline void btrfs_bio_init(struct btrfs_bio *bbio) { - memset(btrfs_bio, 0, offsetof(struct btrfs_io_bio, bio)); + memset(bbio, 0, offsetof(struct btrfs_bio, bio)); } /* - * The following helpers allocate a bio. As it's backed by a bioset, it'll - * never fail. We're returning a bio right now but you can call btrfs_io_bio - * for the appropriate container_of magic + * Allocate a btrfs_io_bio, with @nr_iovecs as maximum number of iovecs. + * + * The bio allocation is backed by bioset and does not fail. 
*/ -struct bio *btrfs_bio_alloc(u64 first_byte) +struct bio *btrfs_bio_alloc(unsigned int nr_iovecs) { struct bio *bio; - bio = bio_alloc_bioset(GFP_NOFS, BIO_MAX_VECS, &btrfs_bioset); - bio->bi_iter.bi_sector = first_byte >> 9; - btrfs_io_bio_init(btrfs_io_bio(bio)); + ASSERT(0 < nr_iovecs && nr_iovecs <= BIO_MAX_VECS); + bio = bio_alloc_bioset(GFP_NOFS, nr_iovecs, &btrfs_bioset); + btrfs_bio_init(btrfs_bio(bio)); return bio; } struct bio *btrfs_bio_clone(struct bio *bio) { - struct btrfs_io_bio *btrfs_bio; + struct btrfs_bio *bbio; struct bio *new; /* Bio allocation backed by a bioset does not fail */ new = bio_clone_fast(bio, GFP_NOFS, &btrfs_bioset); - btrfs_bio = btrfs_io_bio(new); - btrfs_io_bio_init(btrfs_bio); - btrfs_bio->iter = bio->bi_iter; + bbio = btrfs_bio(new); + btrfs_bio_init(bbio); + bbio->iter = bio->bi_iter; return new; } -struct bio *btrfs_io_bio_alloc(unsigned int nr_iovecs) +struct bio *btrfs_bio_clone_partial(struct bio *orig, u64 offset, u64 size) { struct bio *bio; + struct btrfs_bio *bbio; - /* Bio allocation backed by a bioset does not fail */ - bio = bio_alloc_bioset(GFP_NOFS, nr_iovecs, &btrfs_bioset); - btrfs_io_bio_init(btrfs_io_bio(bio)); - return bio; -} - -struct bio *btrfs_bio_clone_partial(struct bio *orig, int offset, int size) -{ - struct bio *bio; - struct btrfs_io_bio *btrfs_bio; + ASSERT(offset <= UINT_MAX && size <= UINT_MAX); /* this will never fail when it's backed by a bioset */ bio = bio_clone_fast(orig, GFP_NOFS, &btrfs_bioset); ASSERT(bio); - btrfs_bio = btrfs_io_bio(bio); - btrfs_io_bio_init(btrfs_bio); + bbio = btrfs_bio(bio); + btrfs_bio_init(bbio); bio_trim(bio, offset >> 9, size >> 9); - btrfs_bio->iter = bio->bi_iter; + bbio->iter = bio->bi_iter; return bio; } @@ -3181,20 +3195,22 @@ struct bio *btrfs_bio_clone_partial(struct bio *orig, int offset, int size) * @size: portion of page that we want to write * @prev_bio_flags: flags of previous bio to see if we can merge the current one * @bio_flags: flags of the current bio to see if we can merge them - * @return: true if page was added, false otherwise * * Attempt to add a page to bio considering stripe alignment etc. * - * Return true if successfully page added. Otherwise, return false. + * Return >= 0 for the number of bytes added to the bio. + * Can return 0 if the current bio is already at stripe/zone boundary. + * Return <0 for error. 
*/ -static bool btrfs_bio_add_page(struct btrfs_bio_ctrl *bio_ctrl, - struct page *page, - u64 disk_bytenr, unsigned int size, - unsigned int pg_offset, - unsigned long bio_flags) +static int btrfs_bio_add_page(struct btrfs_bio_ctrl *bio_ctrl, + struct page *page, + u64 disk_bytenr, unsigned int size, + unsigned int pg_offset, + unsigned long bio_flags) { struct bio *bio = bio_ctrl->bio; u32 bio_size = bio->bi_iter.bi_size; + u32 real_size; const sector_t sector = disk_bytenr >> SECTOR_SHIFT; bool contig; int ret; @@ -3203,29 +3219,36 @@ static bool btrfs_bio_add_page(struct btrfs_bio_ctrl *bio_ctrl, /* The limit should be calculated when bio_ctrl->bio is allocated */ ASSERT(bio_ctrl->len_to_oe_boundary && bio_ctrl->len_to_stripe_boundary); if (bio_ctrl->bio_flags != bio_flags) - return false; + return 0; if (bio_ctrl->bio_flags & EXTENT_BIO_COMPRESSED) contig = bio->bi_iter.bi_sector == sector; else contig = bio_end_sector(bio) == sector; if (!contig) - return false; + return 0; - if (bio_size + size > bio_ctrl->len_to_oe_boundary || - bio_size + size > bio_ctrl->len_to_stripe_boundary) - return false; + real_size = min(bio_ctrl->len_to_oe_boundary, + bio_ctrl->len_to_stripe_boundary) - bio_size; + real_size = min(real_size, size); + + /* + * If real_size is 0, never call bio_add_*_page(), as even size is 0, + * bio will still execute its endio function on the page! + */ + if (real_size == 0) + return 0; if (bio_op(bio) == REQ_OP_ZONE_APPEND) - ret = bio_add_zone_append_page(bio, page, size, pg_offset); + ret = bio_add_zone_append_page(bio, page, real_size, pg_offset); else - ret = bio_add_page(bio, page, size, pg_offset); + ret = bio_add_page(bio, page, real_size, pg_offset); - return ret == size; + return ret; } static int calc_bio_boundaries(struct btrfs_bio_ctrl *bio_ctrl, - struct btrfs_inode *inode) + struct btrfs_inode *inode, u64 file_offset) { struct btrfs_fs_info *fs_info = inode->root->fs_info; struct btrfs_io_geometry geom; @@ -3266,9 +3289,8 @@ static int calc_bio_boundaries(struct btrfs_bio_ctrl *bio_ctrl, return 0; } - ASSERT(fs_info->max_zone_append_size > 0); /* Ordered extent not yet created, so we're good */ - ordered = btrfs_lookup_ordered_extent(inode, logical); + ordered = btrfs_lookup_ordered_extent(inode, file_offset); if (!ordered) { bio_ctrl->len_to_oe_boundary = U32_MAX; return 0; @@ -3280,6 +3302,63 @@ static int calc_bio_boundaries(struct btrfs_bio_ctrl *bio_ctrl, return 0; } +static int alloc_new_bio(struct btrfs_inode *inode, + struct btrfs_bio_ctrl *bio_ctrl, + struct writeback_control *wbc, + unsigned int opf, + bio_end_io_t end_io_func, + u64 disk_bytenr, u32 offset, u64 file_offset, + unsigned long bio_flags) +{ + struct btrfs_fs_info *fs_info = inode->root->fs_info; + struct bio *bio; + int ret; + + bio = btrfs_bio_alloc(BIO_MAX_VECS); + /* + * For compressed page range, its disk_bytenr is always @disk_bytenr + * passed in, no matter if we have added any range into previous bio. 
+ */ + if (bio_flags & EXTENT_BIO_COMPRESSED) + bio->bi_iter.bi_sector = disk_bytenr >> SECTOR_SHIFT; + else + bio->bi_iter.bi_sector = (disk_bytenr + offset) >> SECTOR_SHIFT; + bio_ctrl->bio = bio; + bio_ctrl->bio_flags = bio_flags; + bio->bi_end_io = end_io_func; + bio->bi_private = &inode->io_tree; + bio->bi_write_hint = inode->vfs_inode.i_write_hint; + bio->bi_opf = opf; + ret = calc_bio_boundaries(bio_ctrl, inode, file_offset); + if (ret < 0) + goto error; + if (wbc) { + struct block_device *bdev; + + bdev = fs_info->fs_devices->latest_dev->bdev; + bio_set_dev(bio, bdev); + wbc_init_bio(wbc, bio); + } + if (btrfs_is_zoned(fs_info) && bio_op(bio) == REQ_OP_ZONE_APPEND) { + struct btrfs_device *device; + + device = btrfs_zoned_get_device(fs_info, disk_bytenr, + fs_info->sectorsize); + if (IS_ERR(device)) { + ret = PTR_ERR(device); + goto error; + } + + btrfs_bio(bio)->device = device; + } + return 0; +error: + bio_ctrl->bio = NULL; + bio->bi_status = errno_to_blk_status(ret); + bio_endio(bio); + return ret; +} + /* * @opf: bio REQ_OP_* and REQ_* flags as one value * @wbc: optional writeback control for io accounting @@ -3305,61 +3384,67 @@ static int submit_extent_page(unsigned int opf, bool force_bio_submit) { int ret = 0; - struct bio *bio; - size_t io_size = min_t(size_t, size, PAGE_SIZE); struct btrfs_inode *inode = BTRFS_I(page->mapping->host); - struct extent_io_tree *tree = &inode->io_tree; - struct btrfs_fs_info *fs_info = inode->root->fs_info; + unsigned int cur = pg_offset; ASSERT(bio_ctrl); ASSERT(pg_offset < PAGE_SIZE && size <= PAGE_SIZE && pg_offset + size <= PAGE_SIZE); - if (bio_ctrl->bio) { - bio = bio_ctrl->bio; - if (force_bio_submit || - !btrfs_bio_add_page(bio_ctrl, page, disk_bytenr, io_size, - pg_offset, bio_flags)) { - ret = submit_one_bio(bio, mirror_num, bio_ctrl->bio_flags); + if (force_bio_submit && bio_ctrl->bio) { + ret = submit_one_bio(bio_ctrl->bio, mirror_num, bio_ctrl->bio_flags); + bio_ctrl->bio = NULL; + if (ret < 0) + return ret; + } + + while (cur < pg_offset + size) { + u32 offset = cur - pg_offset; + int added; + + /* Allocate new bio if needed */ + if (!bio_ctrl->bio) { + ret = alloc_new_bio(inode, bio_ctrl, wbc, opf, + end_io_func, disk_bytenr, offset, + page_offset(page) + cur, + bio_flags); + if (ret < 0) + return ret; + } + /* + * We must go through btrfs_bio_add_page() to ensure each + * page range won't cross various boundaries. 
+ */ + if (bio_flags & EXTENT_BIO_COMPRESSED) + added = btrfs_bio_add_page(bio_ctrl, page, disk_bytenr, + size - offset, pg_offset + offset, + bio_flags); + else + added = btrfs_bio_add_page(bio_ctrl, page, + disk_bytenr + offset, size - offset, + pg_offset + offset, bio_flags); + + /* Metadata page range should never be split */ + if (!is_data_inode(&inode->vfs_inode)) + ASSERT(added == 0 || added == size - offset); + + /* At least we added some page, update the account */ + if (wbc && added) + wbc_account_cgroup_owner(wbc, page, added); + + /* We have reached boundary, submit right now */ + if (added < size - offset) { + /* The bio should contain some page(s) */ + ASSERT(bio_ctrl->bio->bi_iter.bi_size); + ret = submit_one_bio(bio_ctrl->bio, mirror_num, + bio_ctrl->bio_flags); bio_ctrl->bio = NULL; if (ret < 0) return ret; - } else { - if (wbc) - wbc_account_cgroup_owner(wbc, page, io_size); - return 0; } + cur += added; } - - bio = btrfs_bio_alloc(disk_bytenr); - bio_add_page(bio, page, io_size, pg_offset); - bio->bi_end_io = end_io_func; - bio->bi_private = tree; - bio->bi_write_hint = page->mapping->host->i_write_hint; - bio->bi_opf = opf; - if (wbc) { - struct block_device *bdev; - - bdev = fs_info->fs_devices->latest_bdev; - bio_set_dev(bio, bdev); - wbc_init_bio(wbc, bio); - wbc_account_cgroup_owner(wbc, page, io_size); - } - if (btrfs_is_zoned(fs_info) && bio_op(bio) == REQ_OP_ZONE_APPEND) { - struct btrfs_device *device; - - device = btrfs_zoned_get_device(fs_info, disk_bytenr, io_size); - if (IS_ERR(device)) - return PTR_ERR(device); - - btrfs_io_bio(bio)->device = device; - } - - bio_ctrl->bio = bio; - bio_ctrl->bio_flags = bio_flags; - ret = calc_bio_boundaries(bio_ctrl, inode); - - return ret; + return 0; } static int attach_extent_buffer_page(struct extent_buffer *eb, @@ -3488,7 +3573,6 @@ int btrfs_do_readpage(struct page *page, struct extent_map **em_cached, size_t pg_offset = 0; size_t iosize; size_t blocksize = inode->i_sb->s_blocksize; - unsigned long this_bio_flag = 0; struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree; ret = set_page_extent_mapped(page); @@ -3519,9 +3603,11 @@ int btrfs_do_readpage(struct page *page, struct extent_map **em_cached, } begin_page_read(fs_info, page); while (cur <= end) { + unsigned long this_bio_flag = 0; bool force_bio_submit = false; u64 disk_bytenr; + ASSERT(IS_ALIGNED(cur, fs_info->sectorsize)); if (cur >= last_byte) { struct extent_state *cached = NULL; @@ -3627,7 +3713,6 @@ int btrfs_do_readpage(struct page *page, struct extent_map **em_cached, /* the get_extent function already copied into the page */ if (test_range_bit(tree, cur, cur_end, EXTENT_UPTODATE, 1, NULL)) { - check_page_uptodate(tree, page); unlock_extent(tree, cur, cur + iosize - 1); end_page_read(page, true, cur, iosize); cur = cur + iosize; @@ -3701,17 +3786,18 @@ static void update_nr_written(struct writeback_control *wbc, */ static noinline_for_stack int writepage_delalloc(struct btrfs_inode *inode, struct page *page, struct writeback_control *wbc, - u64 delalloc_start, unsigned long *nr_written) + unsigned long *nr_written) { - u64 page_end = delalloc_start + PAGE_SIZE - 1; - bool found; + const u64 page_end = page_offset(page) + PAGE_SIZE - 1; + u64 delalloc_start = page_offset(page); u64 delalloc_to_write = 0; - u64 delalloc_end = 0; int ret; int page_started = 0; + while (delalloc_start < page_end) { + u64 delalloc_end = page_end; + bool found; - while (delalloc_end < page_end) { found = find_lock_delalloc_range(&inode->vfs_inode, page, &delalloc_start, 
&delalloc_end); @@ -3722,14 +3808,9 @@ static noinline_for_stack int writepage_delalloc(struct btrfs_inode *inode, ret = btrfs_run_delalloc_range(inode, page, delalloc_start, delalloc_end, &page_started, nr_written, wbc); if (ret) { - SetPageError(page); - /* - * btrfs_run_delalloc_range should return < 0 for error - * but just in case, we use > 0 here meaning the IO is - * started, so we don't want to return > 0 unless - * things are going well. - */ - return ret < 0 ? ret : -EIO; + btrfs_page_set_error(inode->root->fs_info, page, + page_offset(page), PAGE_SIZE); + return ret; } /* * delalloc_end is already one less than the total length, so @@ -3783,12 +3864,11 @@ static void find_next_dirty_byte(struct btrfs_fs_info *fs_info, struct page *page, u64 *start, u64 *end) { struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private; + struct btrfs_subpage_info *spi = fs_info->subpage_info; u64 orig_start = *start; /* Declare as unsigned long so we can use bitmap ops */ - unsigned long dirty_bitmap; unsigned long flags; - int nbits = (orig_start - page_offset(page)) >> fs_info->sectorsize_bits; - int range_start_bit = nbits; + int range_start_bit; int range_end_bit; /* @@ -3801,13 +3881,18 @@ static void find_next_dirty_byte(struct btrfs_fs_info *fs_info, return; } + range_start_bit = spi->dirty_offset + + (offset_in_page(orig_start) >> fs_info->sectorsize_bits); + /* We should have the page locked, but just in case */ spin_lock_irqsave(&subpage->lock, flags); - dirty_bitmap = subpage->dirty_bitmap; + bitmap_next_set_region(subpage->bitmaps, &range_start_bit, &range_end_bit, + spi->dirty_offset + spi->bitmap_nr_bits); spin_unlock_irqrestore(&subpage->lock, flags); - bitmap_next_set_region(&dirty_bitmap, &range_start_bit, &range_end_bit, - BTRFS_SUBPAGE_BITMAP_SIZE); + range_start_bit -= spi->dirty_offset; + range_end_bit -= spi->dirty_offset; + *start = page_offset(page) + range_start_bit * fs_info->sectorsize; *end = page_offset(page) + range_end_bit * fs_info->sectorsize; } @@ -3829,9 +3914,8 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode, int *nr_ret) { struct btrfs_fs_info *fs_info = inode->root->fs_info; - u64 start = page_offset(page); - u64 end = start + PAGE_SIZE - 1; - u64 cur = start; + u64 cur = page_offset(page); + u64 end = cur + PAGE_SIZE - 1; u64 extent_offset; u64 block_start; struct extent_map *em; @@ -3841,7 +3925,7 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode, const unsigned int write_flags = wbc_to_write_flags(wbc); bool compressed; - ret = btrfs_writepage_cow_fixup(page, start, end); + ret = btrfs_writepage_cow_fixup(page); if (ret) { /* Fixup worker will requeue */ redirty_page_for_writepage(wbc, page); @@ -3865,7 +3949,16 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode, if (cur >= i_size) { btrfs_writepage_endio_finish_ordered(inode, page, cur, - end, 1); + end, true); + /* + * This range is beyond i_size, thus we don't need to + * bother writing back. + * But we still need to clear the dirty subpage bit, or + * the next time the page gets dirtied, we will try to + * writeback the sectors with subpage dirty bits, + * causing writeback without ordered extent. 
+ */ + btrfs_page_clear_dirty(fs_info, page, cur, end + 1 - cur); break; } @@ -3915,7 +4008,8 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode, nr++; else btrfs_writepage_endio_finish_ordered(inode, - page, cur, cur + iosize - 1, 1); + page, cur, cur + iosize - 1, true); + btrfs_page_clear_dirty(fs_info, page, cur, iosize); cur += iosize; continue; } @@ -3951,6 +4045,12 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode, cur += iosize; nr++; } + /* + * If we finish without problem, we should not only clear page dirty, + * but also empty subpage dirty bits + */ + if (!ret) + btrfs_page_assert_not_dirty(fs_info, page); *nr_ret = nr; return ret; } @@ -3968,8 +4068,9 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc, struct extent_page_data *epd) { struct inode *inode = page->mapping->host; - u64 start = page_offset(page); - u64 page_end = start + PAGE_SIZE - 1; + struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + const u64 page_start = page_offset(page); + const u64 page_end = page_start + PAGE_SIZE - 1; int ret; int nr = 0; size_t pg_offset; @@ -3981,7 +4082,8 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc, WARN_ON(!PageLocked(page)); - ClearPageError(page); + btrfs_page_clear_error(btrfs_sb(inode->i_sb), page, + page_offset(page), PAGE_SIZE); pg_offset = offset_in_page(i_size); if (page->index > end_index || @@ -4003,8 +4105,7 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc, } if (!epd->extent_locked) { - ret = writepage_delalloc(BTRFS_I(inode), page, wbc, start, - &nr_written); + ret = writepage_delalloc(BTRFS_I(inode), page, wbc, &nr_written); if (ret == 1) return 0; if (ret) @@ -4022,11 +4123,52 @@ done: set_page_writeback(page); end_page_writeback(page); } - if (PageError(page)) { - ret = ret < 0 ? ret : -EIO; - end_extent_writepage(page, ret, start, page_end); + /* + * Here we used to have a check for PageError() and then set @ret and + * call end_extent_writepage(). + * + * But in fact setting @ret here will cause different error paths + * between subpage and regular sectorsize. + * + * For regular page size, we never submit current page, but only add + * current page to current bio. + * The bio submission can only happen in next page. + * Thus if we hit the PageError() branch, @ret is already set to + * non-zero value and will not get updated for regular sectorsize. + * + * But for subpage case, it's possible we submit part of current page, + * thus can get PageError() set by submitted bio of the same page, + * while our @ret is still 0. + * + * So here we unify the behavior and don't set @ret. + * Error can still be properly passed to higher layer as page will + * be set error, here we just don't handle the IO failure. + * + * NOTE: This is just a hotfix for subpage. + * The root fix will be properly ending ordered extent when we hit + * an error during writeback. + * + * But that needs a bigger refactoring, as we not only need to grab the + * submitted OE, but also need to know exactly at which bytenr we hit + * the error. + * Currently the full page based __extent_writepage_io() is not + * capable of that. + */ + if (PageError(page)) + end_extent_writepage(page, ret, page_start, page_end); + if (epd->extent_locked) { + /* + * If epd->extent_locked, it's from extent_write_locked_range(), + * the page can either be locked by lock_page() or + * process_one_page(). + * Let btrfs_page_unlock_writer() handle both cases. 
+ */ + ASSERT(wbc); + btrfs_page_unlock_writer(fs_info, page, wbc->range_start, + wbc->range_end + 1 - wbc->range_start); + } else { + unlock_page(page); } - unlock_page(page); ASSERT(ret <= 0); return ret; } @@ -4039,6 +4181,9 @@ void wait_on_extent_buffer_writeback(struct extent_buffer *eb) static void end_extent_buffer_writeback(struct extent_buffer *eb) { + if (test_bit(EXTENT_BUFFER_ZONE_FINISH, &eb->bflags)) + btrfs_zone_finish_endio(eb->fs_info, eb->start, eb->len); + clear_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags); smp_mb__after_atomic(); wake_up_bit(&eb->bflags, EXTENT_BUFFER_WRITEBACK); @@ -4486,12 +4631,11 @@ static int submit_eb_subpage(struct page *page, int submitted = 0; u64 page_start = page_offset(page); int bit_start = 0; - const int nbits = BTRFS_SUBPAGE_BITMAP_SIZE; int sectors_per_node = fs_info->nodesize >> fs_info->sectorsize_bits; int ret; /* Lock and write each dirty extent buffers in the range */ - while (bit_start < nbits) { + while (bit_start < fs_info->subpage_info->bitmap_nr_bits) { struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private; struct extent_buffer *eb; unsigned long flags; @@ -4507,7 +4651,8 @@ static int submit_eb_subpage(struct page *page, break; } spin_lock_irqsave(&subpage->lock, flags); - if (!((1 << bit_start) & subpage->dirty_bitmap)) { + if (!test_bit(bit_start + fs_info->subpage_info->dirty_offset, + subpage->bitmaps)) { spin_unlock_irqrestore(&subpage->lock, flags); spin_unlock(&page->mapping->private_lock); bit_start++; @@ -4640,8 +4785,13 @@ static int submit_eb_page(struct page *page, struct writeback_control *wbc, free_extent_buffer(eb); return ret; } - if (cache) + if (cache) { + /* Impiles write in zoned mode */ btrfs_put_block_group(cache); + /* Mark the last eb in a block group */ + if (cache->seq_zone && eb->start + eb->len == cache->zone_capacity) + set_bit(EXTENT_BUFFER_ZONE_FINISH, &eb->bflags); + } ret = write_one_eb(eb, wbc, epd); free_extent_buffer(eb); if (ret < 0) @@ -4757,7 +4907,7 @@ retry: * extent io tree. Thus we don't want to submit such wild eb * if the fs already has error. */ - if (!test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) { + if (!BTRFS_FS_ERROR(fs_info)) { ret = flush_write_bio(&epd); } else { ret = -EROFS; @@ -4953,23 +5103,28 @@ int extent_write_full_page(struct page *page, struct writeback_control *wbc) return ret; } -int extent_write_locked_range(struct inode *inode, u64 start, u64 end, - int mode) +/* + * Submit the pages in the range to bio for call sites which delalloc range has + * already been ran (aka, ordered extent inserted) and all pages are still + * locked. 
+ */ +int extent_write_locked_range(struct inode *inode, u64 start, u64 end) { + bool found_error = false; + int first_error = 0; int ret = 0; struct address_space *mapping = inode->i_mapping; struct page *page; - unsigned long nr_pages = (end - start + PAGE_SIZE) >> - PAGE_SHIFT; - + u64 cur = start; + unsigned long nr_pages; + const u32 sectorsize = btrfs_sb(inode->i_sb)->sectorsize; struct extent_page_data epd = { .bio_ctrl = { 0 }, .extent_locked = 1, - .sync_io = mode == WB_SYNC_ALL, + .sync_io = 1, }; struct writeback_control wbc_writepages = { - .sync_mode = mode, - .nr_to_write = nr_pages * 2, + .sync_mode = WB_SYNC_ALL, .range_start = start, .range_end = end + 1, /* We're called from an async helper function */ @@ -4977,33 +5132,51 @@ int extent_write_locked_range(struct inode *inode, u64 start, u64 end, .no_cgroup_owner = 1, }; + ASSERT(IS_ALIGNED(start, sectorsize) && IS_ALIGNED(end + 1, sectorsize)); + nr_pages = (round_up(end, PAGE_SIZE) - round_down(start, PAGE_SIZE)) >> + PAGE_SHIFT; + wbc_writepages.nr_to_write = nr_pages * 2; + wbc_attach_fdatawrite_inode(&wbc_writepages, inode); - while (start <= end) { - page = find_get_page(mapping, start >> PAGE_SHIFT); - if (clear_page_dirty_for_io(page)) - ret = __extent_writepage(page, &wbc_writepages, &epd); - else { - btrfs_writepage_endio_finish_ordered(BTRFS_I(inode), - page, start, start + PAGE_SIZE - 1, 1); - unlock_page(page); + while (cur <= end) { + u64 cur_end = min(round_down(cur, PAGE_SIZE) + PAGE_SIZE - 1, end); + + page = find_get_page(mapping, cur >> PAGE_SHIFT); + /* + * All pages in the range are locked since + * btrfs_run_delalloc_range(), thus there is no way to clear + * the page dirty flag. + */ + ASSERT(PageLocked(page)); + ASSERT(PageDirty(page)); + clear_page_dirty_for_io(page); + ret = __extent_writepage(page, &wbc_writepages, &epd); + ASSERT(ret <= 0); + if (ret < 0) { + found_error = true; + first_error = ret; } put_page(page); - start += PAGE_SIZE; + cur = cur_end + 1; } - ASSERT(ret <= 0); - if (ret == 0) + if (!found_error) ret = flush_write_bio(&epd); else end_write_bio(&epd, ret); wbc_detach_inode(&wbc_writepages); + if (found_error) + return first_error; return ret; } int extent_writepages(struct address_space *mapping, struct writeback_control *wbc) { + struct inode *inode = mapping->host; + const bool data_reloc = btrfs_is_data_reloc_root(BTRFS_I(inode)->root); + const bool zoned = btrfs_is_zoned(BTRFS_I(inode)->root->fs_info); int ret = 0; struct extent_page_data epd = { .bio_ctrl = { 0 }, @@ -5011,7 +5184,15 @@ int extent_writepages(struct address_space *mapping, .sync_io = wbc->sync_mode == WB_SYNC_ALL, }; + /* + * Allow only a single thread to do the reloc work in zoned mode to + * protect the write pointer updates. + */ + if (data_reloc && zoned) + btrfs_inode_lock(inode, 0); ret = extent_write_cache_pages(mapping, wbc, &epd); + if (data_reloc && zoned) + btrfs_inode_unlock(inode, 0); ASSERT(ret <= 0); if (ret < 0) { end_write_bio(&epd, ret); @@ -6021,13 +6202,15 @@ struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info, * page, but it may change in the future for 16K page size * support, so we still preallocate the memory in the loop. 
*/ - ret = btrfs_alloc_subpage(fs_info, &prealloc, - BTRFS_SUBPAGE_METADATA); - if (ret < 0) { - unlock_page(p); - put_page(p); - exists = ERR_PTR(ret); - goto free_eb; + if (fs_info->sectorsize < PAGE_SIZE) { + prealloc = btrfs_alloc_subpage(fs_info, BTRFS_SUBPAGE_METADATA); + if (IS_ERR(prealloc)) { + ret = PTR_ERR(prealloc); + unlock_page(p); + put_page(p); + exists = ERR_PTR(ret); + goto free_eb; + } } spin_lock(&mapping->private_lock); @@ -7051,32 +7234,41 @@ void memmove_extent_buffer(const struct extent_buffer *dst, } } +#define GANG_LOOKUP_SIZE 16 static struct extent_buffer *get_next_extent_buffer( struct btrfs_fs_info *fs_info, struct page *page, u64 bytenr) { - struct extent_buffer *gang[BTRFS_SUBPAGE_BITMAP_SIZE]; + struct extent_buffer *gang[GANG_LOOKUP_SIZE]; struct extent_buffer *found = NULL; u64 page_start = page_offset(page); - int ret; - int i; + u64 cur = page_start; ASSERT(in_range(bytenr, page_start, PAGE_SIZE)); - ASSERT(PAGE_SIZE / fs_info->nodesize <= BTRFS_SUBPAGE_BITMAP_SIZE); lockdep_assert_held(&fs_info->buffer_lock); - ret = radix_tree_gang_lookup(&fs_info->buffer_radix, (void **)gang, - bytenr >> fs_info->sectorsize_bits, - PAGE_SIZE / fs_info->nodesize); - for (i = 0; i < ret; i++) { - /* Already beyond page end */ - if (gang[i]->start >= page_start + PAGE_SIZE) - break; - /* Found one */ - if (gang[i]->start >= bytenr) { - found = gang[i]; - break; + while (cur < page_start + PAGE_SIZE) { + int ret; + int i; + + ret = radix_tree_gang_lookup(&fs_info->buffer_radix, + (void **)gang, cur >> fs_info->sectorsize_bits, + min_t(unsigned int, GANG_LOOKUP_SIZE, + PAGE_SIZE / fs_info->nodesize)); + if (ret == 0) + goto out; + for (i = 0; i < ret; i++) { + /* Already beyond page end */ + if (gang[i]->start >= page_start + PAGE_SIZE) + goto out; + /* Found one */ + if (gang[i]->start >= bytenr) { + found = gang[i]; + goto out; + } } + cur = gang[ret - 1]->start + gang[ret - 1]->len; } +out: return found; } |
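
The core behavioural change in this patch is that btrfs_bio_add_page() no longer answers a yes/no question: it returns how many bytes were actually added, clamped to the stripe and ordered-extent boundaries, and submit_extent_page() loops over the page range, submitting the current bio and allocating a new one whenever a boundary cuts the range short. The user-space sketch below illustrates only that splitting pattern; the struct, helper, and limits are simplified stand-ins invented for the example, not the kernel API.

/*
 * Minimal sketch of the range-splitting loop, assuming a single combined
 * boundary limit (the kernel uses min(len_to_oe_boundary,
 * len_to_stripe_boundary)).  add_to_bio() mimics btrfs_bio_add_page() by
 * returning the number of bytes actually added; 0 means the current bio
 * already sits at the boundary.
 */
#include <stdio.h>

struct fake_bio_ctrl {
	unsigned int bio_size;		/* bytes already in the current bio */
	unsigned int len_to_boundary;	/* bytes allowed before a boundary */
};

static unsigned int add_to_bio(struct fake_bio_ctrl *ctrl, unsigned int size)
{
	unsigned int real_size = ctrl->len_to_boundary - ctrl->bio_size;

	if (real_size > size)
		real_size = size;
	ctrl->bio_size += real_size;
	return real_size;
}

int main(void)
{
	struct fake_bio_ctrl ctrl = { .bio_size = 0, .len_to_boundary = 64 * 1024 };
	unsigned int size = 100 * 1024;	/* 100 KiB range to write out */
	unsigned int cur = 0;

	while (cur < size) {
		unsigned int added = add_to_bio(&ctrl, size - cur);

		if (added < size - cur) {
			/* Boundary hit: submit what we have, start a new bio */
			printf("submit bio of %u bytes at boundary\n",
			       ctrl.bio_size);
			ctrl.bio_size = 0;	/* models allocating a fresh bio */
		}
		cur += added;
	}
	if (ctrl.bio_size)
		printf("submit final bio of %u bytes\n", ctrl.bio_size);
	return 0;
}

With a 64 KiB boundary and a 100 KiB range this prints one 64 KiB submission followed by a final 36 KiB bio, which is the behaviour the reworked submit_extent_page() aims for: no bio ever crosses a stripe or ordered-extent boundary, and an empty bio is never submitted.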