Diffstat (limited to 'fs/iomap/buffered-io.c')
-rw-r--r--	fs/iomap/buffered-io.c	288
1 file changed, 143 insertions(+), 145 deletions(-)
diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c
index 9b4ca3811a24..ef0b68bccbb6 100644
--- a/fs/iomap/buffered-io.c
+++ b/fs/iomap/buffered-io.c
@@ -23,7 +23,6 @@
 
 #define IOEND_BATCH_SIZE	4096
 
-typedef int (*iomap_punch_t)(struct inode *inode, loff_t offset, loff_t length);
 /*
  * Structure allocated for each folio to track per-block uptodate, dirty state
  * and I/O completions.
@@ -1022,13 +1021,14 @@ retry:
 
 ssize_t
 iomap_file_buffered_write(struct kiocb *iocb, struct iov_iter *i,
-		const struct iomap_ops *ops)
+		const struct iomap_ops *ops, void *private)
 {
 	struct iomap_iter iter = {
 		.inode		= iocb->ki_filp->f_mapping->host,
 		.pos		= iocb->ki_pos,
 		.len		= iov_iter_count(i),
 		.flags		= IOMAP_WRITE,
+		.private	= private,
 	};
 	ssize_t ret;
 
@@ -1046,15 +1046,14 @@ iomap_file_buffered_write(struct kiocb *iocb, struct iov_iter *i,
 }
 EXPORT_SYMBOL_GPL(iomap_file_buffered_write);
 
-static int iomap_write_delalloc_ifs_punch(struct inode *inode,
+static void iomap_write_delalloc_ifs_punch(struct inode *inode,
 		struct folio *folio, loff_t start_byte, loff_t end_byte,
-		iomap_punch_t punch)
+		struct iomap *iomap, iomap_punch_t punch)
 {
 	unsigned int first_blk, last_blk, i;
 	loff_t last_byte;
 	u8 blkbits = inode->i_blkbits;
 	struct iomap_folio_state *ifs;
-	int ret = 0;
 
 	/*
 	 * When we have per-block dirty tracking, there can be
@@ -1064,47 +1063,35 @@ static int iomap_write_delalloc_ifs_punch(struct inode *inode,
 	 */
 	ifs = folio->private;
 	if (!ifs)
-		return ret;
+		return;
 
 	last_byte = min_t(loff_t, end_byte - 1,
 			folio_pos(folio) + folio_size(folio) - 1);
 	first_blk = offset_in_folio(folio, start_byte) >> blkbits;
 	last_blk = offset_in_folio(folio, last_byte) >> blkbits;
 	for (i = first_blk; i <= last_blk; i++) {
-		if (!ifs_block_is_dirty(folio, ifs, i)) {
-			ret = punch(inode, folio_pos(folio) + (i << blkbits),
-					1 << blkbits);
-			if (ret)
-				return ret;
-		}
+		if (!ifs_block_is_dirty(folio, ifs, i))
+			punch(inode, folio_pos(folio) + (i << blkbits),
+					1 << blkbits, iomap);
 	}
-
-	return ret;
 }
-
-static int iomap_write_delalloc_punch(struct inode *inode, struct folio *folio,
+static void iomap_write_delalloc_punch(struct inode *inode, struct folio *folio,
 		loff_t *punch_start_byte, loff_t start_byte, loff_t end_byte,
-		iomap_punch_t punch)
+		struct iomap *iomap, iomap_punch_t punch)
 {
-	int ret = 0;
-
 	if (!folio_test_dirty(folio))
-		return ret;
+		return;
 
 	/* if dirty, punch up to offset */
 	if (start_byte > *punch_start_byte) {
-		ret = punch(inode, *punch_start_byte,
-				start_byte - *punch_start_byte);
-		if (ret)
-			return ret;
+		punch(inode, *punch_start_byte, start_byte - *punch_start_byte,
+				iomap);
 	}
 
 	/* Punch non-dirty blocks within folio */
-	ret = iomap_write_delalloc_ifs_punch(inode, folio, start_byte,
-			end_byte, punch);
-	if (ret)
-		return ret;
+	iomap_write_delalloc_ifs_punch(inode, folio, start_byte, end_byte,
+			iomap, punch);
 
 	/*
 	 * Make sure the next punch start is correctly bound to
@@ -1112,8 +1099,6 @@ static int iomap_write_delalloc_punch(struct inode *inode, struct folio *folio,
 	 */
 	*punch_start_byte = min_t(loff_t, end_byte,
 			folio_pos(folio) + folio_size(folio));
-
-	return ret;
 }
 
 /*
@@ -1133,13 +1118,12 @@ static int iomap_write_delalloc_punch(struct inode *inode, struct folio *folio,
  * This function uses [start_byte, end_byte) intervals (i.e. open ended) to
  * simplify range iterations.
  */
-static int iomap_write_delalloc_scan(struct inode *inode,
+static void iomap_write_delalloc_scan(struct inode *inode,
 		loff_t *punch_start_byte, loff_t start_byte, loff_t end_byte,
-		iomap_punch_t punch)
+		struct iomap *iomap, iomap_punch_t punch)
 {
 	while (start_byte < end_byte) {
 		struct folio	*folio;
-		int ret;
 
 		/* grab locked page */
 		folio = filemap_lock_folio(inode->i_mapping,
@@ -1150,27 +1134,47 @@ static int iomap_write_delalloc_scan(struct inode *inode,
 			continue;
 		}
 
-		ret = iomap_write_delalloc_punch(inode, folio, punch_start_byte,
-				start_byte, end_byte, punch);
-		if (ret) {
-			folio_unlock(folio);
-			folio_put(folio);
-			return ret;
-		}
+		iomap_write_delalloc_punch(inode, folio, punch_start_byte,
+				start_byte, end_byte, iomap, punch);
 
 		/* move offset to start of next folio in range */
 		start_byte = folio_next_index(folio) << PAGE_SHIFT;
 		folio_unlock(folio);
 		folio_put(folio);
 	}
-	return 0;
 }
 
 /*
+ * When a short write occurs, the filesystem might need to use ->iomap_end
+ * to remove space reservations created in ->iomap_begin.
+ *
+ * For filesystems that use delayed allocation, there can be dirty pages over
+ * the delalloc extent outside the range of a short write but still within the
+ * delalloc extent allocated for this iomap if the write raced with page
+ * faults.
+ *
  * Punch out all the delalloc blocks in the range given except for those that
  * have dirty data still pending in the page cache - those are going to be
  * written and so must still retain the delalloc backing for writeback.
  *
+ * The punch() callback *must* only punch delalloc extents in the range passed
+ * to it. It must skip over all other types of extents in the range and leave
+ * them completely unchanged. It must do this punch atomically with respect to
+ * other extent modifications.
+ *
+ * The punch() callback may be called with a folio locked to prevent writeback
+ * extent allocation racing at the edge of the range we are currently punching.
+ * The locked folio may or may not cover the range being punched, so it is not
+ * safe for the punch() callback to lock folios itself.
+ *
+ * Lock order is:
+ *
+ * inode->i_rwsem (shared or exclusive)
+ *   inode->i_mapping->invalidate_lock (exclusive)
+ *     folio_lock()
+ *       ->punch
+ *         internal filesystem allocation lock
+ *
  * As we are scanning the page cache for data, we don't need to reimplement the
  * wheel - mapping_seek_hole_data() does exactly what we need to identify the
  * start and end of data ranges correctly even for sub-folio block sizes. This
@@ -1199,20 +1203,21 @@ static int iomap_write_delalloc_scan(struct inode *inode,
  * require sprinkling this code with magic "+ 1" and "- 1" arithmetic and expose
  * the code to subtle off-by-one bugs....
  */
-static int iomap_write_delalloc_release(struct inode *inode,
-		loff_t start_byte, loff_t end_byte, iomap_punch_t punch)
+void iomap_write_delalloc_release(struct inode *inode, loff_t start_byte,
+		loff_t end_byte, unsigned flags, struct iomap *iomap,
+		iomap_punch_t punch)
 {
 	loff_t punch_start_byte = start_byte;
 	loff_t scan_end_byte = min(i_size_read(inode), end_byte);
-	int error = 0;
 
 	/*
-	 * Lock the mapping to avoid races with page faults re-instantiating
-	 * folios and dirtying them via ->page_mkwrite whilst we walk the
-	 * cache and perform delalloc extent removal. Failing to do this can
-	 * leave dirty pages with no space reservation in the cache.
+	 * The caller must hold invalidate_lock to avoid races with page faults
+	 * re-instantiating folios and dirtying them via ->page_mkwrite whilst
+	 * we walk the cache and perform delalloc extent removal. Failing to do
+	 * this can leave dirty pages with no space reservation in the cache.
 	 */
-	filemap_invalidate_lock(inode->i_mapping);
+	lockdep_assert_held_write(&inode->i_mapping->invalidate_lock);
+
 	while (start_byte < scan_end_byte) {
 		loff_t		data_end;
 
@@ -1221,13 +1226,15 @@ static int iomap_write_delalloc_release(struct inode *inode,
 		/*
 		 * If there is no more data to scan, all that is left is to
 		 * punch out the remaining range.
+		 *
+		 * Note that mapping_seek_hole_data is only supposed to return
+		 * either an offset or -ENXIO, so WARN on any other error as
+		 * that would be an API change without updating the callers.
 		 */
 		if (start_byte == -ENXIO || start_byte == scan_end_byte)
 			break;
-		if (start_byte < 0) {
-			error = start_byte;
-			goto out_unlock;
-		}
+		if (WARN_ON_ONCE(start_byte < 0))
+			return;
 		WARN_ON_ONCE(start_byte < punch_start_byte);
 		WARN_ON_ONCE(start_byte > scan_end_byte);
 
@@ -1237,108 +1244,40 @@ static int iomap_write_delalloc_release(struct inode *inode,
 		 */
 		data_end = mapping_seek_hole_data(inode->i_mapping, start_byte,
 				scan_end_byte, SEEK_HOLE);
-		if (data_end < 0) {
-			error = data_end;
-			goto out_unlock;
-		}
-		WARN_ON_ONCE(data_end <= start_byte);
+		if (WARN_ON_ONCE(data_end < 0))
+			return;
+
+		/*
+		 * If we race with post-direct I/O invalidation of the page cache,
+		 * there might be no data left at start_byte.
+		 */
+		if (data_end == start_byte)
+			continue;
+
+		WARN_ON_ONCE(data_end < start_byte);
 		WARN_ON_ONCE(data_end > scan_end_byte);
 
-		error = iomap_write_delalloc_scan(inode, &punch_start_byte,
-				start_byte, data_end, punch);
-		if (error)
-			goto out_unlock;
+		iomap_write_delalloc_scan(inode, &punch_start_byte, start_byte,
+				data_end, iomap, punch);
 
 		/* The next data search starts at the end of this one. */
 		start_byte = data_end;
 	}
 
 	if (punch_start_byte < end_byte)
-		error = punch(inode, punch_start_byte,
-				end_byte - punch_start_byte);
-out_unlock:
-	filemap_invalidate_unlock(inode->i_mapping);
-	return error;
+		punch(inode, punch_start_byte, end_byte - punch_start_byte,
+				iomap);
 }
-
-/*
- * When a short write occurs, the filesystem may need to remove reserved space
- * that was allocated in ->iomap_begin from it's ->iomap_end method. For
- * filesystems that use delayed allocation, we need to punch out delalloc
- * extents from the range that are not dirty in the page cache. As the write can
- * race with page faults, there can be dirty pages over the delalloc extent
- * outside the range of a short write but still within the delalloc extent
- * allocated for this iomap.
- *
- * This function uses [start_byte, end_byte) intervals (i.e. open ended) to
- * simplify range iterations.
- *
- * The punch() callback *must* only punch delalloc extents in the range passed
- * to it. It must skip over all other types of extents in the range and leave
- * them completely unchanged. It must do this punch atomically with respect to
- * other extent modifications.
- *
- * The punch() callback may be called with a folio locked to prevent writeback
- * extent allocation racing at the edge of the range we are currently punching.
- * The locked folio may or may not cover the range being punched, so it is not
- * safe for the punch() callback to lock folios itself.
- *
- * Lock order is:
- *
- * inode->i_rwsem (shared or exclusive)
- *   inode->i_mapping->invalidate_lock (exclusive)
- *     folio_lock()
- *       ->punch
- *         internal filesystem allocation lock
- */
-int iomap_file_buffered_write_punch_delalloc(struct inode *inode,
-		struct iomap *iomap, loff_t pos, loff_t length,
-		ssize_t written, iomap_punch_t punch)
-{
-	loff_t			start_byte;
-	loff_t			end_byte;
-	unsigned int		blocksize = i_blocksize(inode);
-
-	if (iomap->type != IOMAP_DELALLOC)
-		return 0;
-
-	/* If we didn't reserve the blocks, we're not allowed to punch them. */
-	if (!(iomap->flags & IOMAP_F_NEW))
-		return 0;
-
-	/*
-	 * start_byte refers to the first unused block after a short write. If
-	 * nothing was written, round offset down to point at the first block in
-	 * the range.
-	 */
-	if (unlikely(!written))
-		start_byte = round_down(pos, blocksize);
-	else
-		start_byte = round_up(pos + written, blocksize);
-	end_byte = round_up(pos + length, blocksize);
-
-	/* Nothing to do if we've written the entire delalloc extent */
-	if (start_byte >= end_byte)
-		return 0;
-
-	return iomap_write_delalloc_release(inode, start_byte, end_byte,
-			punch);
-}
-EXPORT_SYMBOL_GPL(iomap_file_buffered_write_punch_delalloc);
+EXPORT_SYMBOL_GPL(iomap_write_delalloc_release);
 
 static loff_t iomap_unshare_iter(struct iomap_iter *iter)
 {
 	struct iomap *iomap = &iter->iomap;
-	const struct iomap *srcmap = iomap_iter_srcmap(iter);
 	loff_t pos = iter->pos;
 	loff_t length = iomap_length(iter);
 	loff_t written = 0;
 
-	/* don't bother with blocks that are not shared to start with */
-	if (!(iomap->flags & IOMAP_F_SHARED))
-		return length;
-	/* don't bother with holes or unwritten extents */
-	if (srcmap->type == IOMAP_HOLE || srcmap->type == IOMAP_UNWRITTEN)
+	if (!iomap_want_unshare_iter(iter))
 		return length;
 
 	do {
@@ -1382,27 +1321,68 @@ iomap_file_unshare(struct inode *inode, loff_t pos, loff_t len,
 	struct iomap_iter iter = {
 		.inode		= inode,
 		.pos		= pos,
-		.len		= len,
 		.flags		= IOMAP_WRITE | IOMAP_UNSHARE,
 	};
+	loff_t size = i_size_read(inode);
 	int ret;
 
+	if (pos < 0 || pos >= size)
+		return 0;
+
+	iter.len = min(len, size - pos);
 	while ((ret = iomap_iter(&iter, ops)) > 0)
 		iter.processed = iomap_unshare_iter(&iter);
 	return ret;
 }
 EXPORT_SYMBOL_GPL(iomap_file_unshare);
 
-static loff_t iomap_zero_iter(struct iomap_iter *iter, bool *did_zero)
+/*
+ * Flush the remaining range of the iter and mark the current mapping stale.
+ * This is used when zero range sees an unwritten mapping that may have had
+ * dirty pagecache over it.
+ */
+static inline int iomap_zero_iter_flush_and_stale(struct iomap_iter *i)
+{
+	struct address_space *mapping = i->inode->i_mapping;
+	loff_t end = i->pos + i->len - 1;
+
+	i->iomap.flags |= IOMAP_F_STALE;
+	return filemap_write_and_wait_range(mapping, i->pos, end);
+}
+
+static loff_t iomap_zero_iter(struct iomap_iter *iter, bool *did_zero,
+		bool *range_dirty)
 {
 	const struct iomap *srcmap = iomap_iter_srcmap(iter);
 	loff_t pos = iter->pos;
 	loff_t length = iomap_length(iter);
 	loff_t written = 0;
 
-	/* already zeroed? we're done. */
-	if (srcmap->type == IOMAP_HOLE || srcmap->type == IOMAP_UNWRITTEN)
+	/*
+	 * We must zero subranges of unwritten mappings that might be dirty in
+	 * pagecache from previous writes. We only know whether the entire range
+	 * was clean or not, however, and dirty folios may have been written
+	 * back or reclaimed at any point after mapping lookup.
+	 *
+	 * The easiest way to deal with this is to flush pagecache to trigger
+	 * any pending unwritten conversions and then grab the updated extents
+	 * from the fs. The flush may change the current mapping, so mark it
+	 * stale for the iterator to remap it for the next pass to handle
+	 * properly.
+	 *
+	 * Note that holes are treated the same as unwritten because zero range
+	 * is (ab)used for partial folio zeroing in some cases. Hole backed
+	 * post-eof ranges can be dirtied via mapped write and the flush
+	 * triggers writeback time post-eof zeroing.
+	 */
+	if (srcmap->type == IOMAP_HOLE || srcmap->type == IOMAP_UNWRITTEN) {
+		if (*range_dirty) {
+			*range_dirty = false;
+			return iomap_zero_iter_flush_and_stale(iter);
+		}
+		/* range is clean and already zeroed, nothing to do */
 		return length;
+	}
 
 	do {
 		struct folio *folio;
@@ -1450,9 +1430,27 @@ iomap_zero_range(struct inode *inode, loff_t pos, loff_t len, bool *did_zero,
 		.flags		= IOMAP_ZERO,
 	};
 	int ret;
+	bool range_dirty;
+
+	/*
+	 * Zero range wants to skip pre-zeroed (i.e. unwritten) mappings, but
+	 * pagecache must be flushed to ensure stale data from previous
+	 * buffered writes is not exposed. A flush is only required for certain
+	 * types of mappings, but checking pagecache after mapping lookup is
+	 * racy with writeback and reclaim.
+	 *
+	 * Therefore, check the entire range first and pass along whether any
+	 * part of it is dirty. If so and an underlying mapping warrants it,
+	 * flush the cache at that point. This trades off the occasional false
+	 * positive (and spurious flush, if the dirty data and mapping don't
+	 * happen to overlap) for simplicity in handling a relatively uncommon
	 * situation.
+	 */
+	range_dirty = filemap_range_needs_writeback(inode->i_mapping,
+					pos, pos + len - 1);
 
 	while ((ret = iomap_iter(&iter, ops)) > 0)
-		iter.processed = iomap_zero_iter(&iter, did_zero);
+		iter.processed = iomap_zero_iter(&iter, did_zero, &range_dirty);
 	return ret;
 }
 EXPORT_SYMBOL_GPL(iomap_zero_range);
@@ -2007,10 +2005,10 @@ iomap_writepages(struct address_space *mapping, struct writeback_control *wbc,
 }
 EXPORT_SYMBOL_GPL(iomap_writepages);
 
-static int __init iomap_init(void)
+static int __init iomap_buffered_init(void)
 {
 	return bioset_init(&iomap_ioend_bioset, 4 * (PAGE_SIZE / SECTOR_SIZE),
 			   offsetof(struct iomap_ioend, io_bio),
 			   BIOSET_NEED_BVECS);
 }
-fs_initcall(iomap_init);
+fs_initcall(iomap_buffered_init);
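With iomap_file_buffered_write_punch_delalloc() removed above, its rounding and IOMAP_DELALLOC/IOMAP_F_NEW checks move into the filesystem, which now also takes invalidate_lock itself before calling the exported iomap_write_delalloc_release(). A minimal sketch of how a filesystem's buffered-write ->iomap_end handler might wire this up, following the logic of the removed wrapper; the myfs_* names and the myfs_remove_delalloc_blocks() helper are hypothetical:

/* Hypothetical punch callback matching the new iomap_punch_t signature. */
static void myfs_punch_delalloc(struct inode *inode, loff_t offset,
		loff_t length, struct iomap *iomap)
{
	/*
	 * Must remove only delalloc extents inside [offset, offset + length),
	 * skip all other extent types unchanged, and do so atomically with
	 * respect to other extent modifications (see the comment block in the
	 * diff above). myfs_remove_delalloc_blocks() is a placeholder.
	 */
	myfs_remove_delalloc_blocks(inode, offset, length);
}

static int myfs_buffered_write_iomap_end(struct inode *inode, loff_t pos,
		loff_t length, ssize_t written, unsigned flags,
		struct iomap *iomap)
{
	unsigned int blocksize = i_blocksize(inode);
	loff_t start_byte, end_byte;

	/* Only newly reserved delalloc mappings may be punched out. */
	if (iomap->type != IOMAP_DELALLOC || !(iomap->flags & IOMAP_F_NEW))
		return 0;

	/*
	 * start_byte is the first unused block after a short write; if
	 * nothing was written, round down to the first block in the range.
	 */
	if (unlikely(!written))
		start_byte = round_down(pos, blocksize);
	else
		start_byte = round_up(pos + written, blocksize);
	end_byte = round_up(pos + length, blocksize);

	/* Nothing to do if the entire delalloc extent was written. */
	if (start_byte >= end_byte)
		return 0;

	/* The caller, not iomap, now provides the invalidate_lock exclusion. */
	filemap_invalidate_lock(inode->i_mapping);
	iomap_write_delalloc_release(inode, start_byte, end_byte, flags,
			iomap, myfs_punch_delalloc);
	filemap_invalidate_unlock(inode->i_mapping);
	return 0;
}

Note that the lockdep_assert_held_write() added to iomap_write_delalloc_release() will catch callers that forget to take invalidate_lock exclusively.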
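The open-coded skip checks deleted from iomap_unshare_iter() are folded into the new iomap_want_unshare_iter() helper. The sketch below reconstructs the equivalent logic from the removed lines; the actual helper is introduced elsewhere in this series and its exact form may differ:

static inline bool iomap_want_unshare_iter(const struct iomap_iter *iter)
{
	const struct iomap *srcmap = iomap_iter_srcmap(iter);

	/* don't bother with blocks that are not shared to start with */
	if (!(iter->iomap.flags & IOMAP_F_SHARED))
		return false;

	/* don't bother with holes or unwritten extents */
	return srcmap->type != IOMAP_HOLE && srcmap->type != IOMAP_UNWRITTEN;
}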