Diffstat (limited to 'fs/xfs/xfs_file.c')
-rw-r--r-- | fs/xfs/xfs_file.c | 470
1 file changed, 114 insertions, 356 deletions
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index a314fc7b56fa..65d27a502909 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -48,40 +48,6 @@ static const struct vm_operations_struct xfs_file_vm_ops;
 
 /*
- * Locking primitives for read and write IO paths to ensure we consistently use
- * and order the inode->i_mutex, ip->i_lock and ip->i_iolock.
- */
-static inline void
-xfs_rw_ilock(
-	struct xfs_inode	*ip,
-	int			type)
-{
-	if (type & XFS_IOLOCK_EXCL)
-		inode_lock(VFS_I(ip));
-	xfs_ilock(ip, type);
-}
-
-static inline void
-xfs_rw_iunlock(
-	struct xfs_inode	*ip,
-	int			type)
-{
-	xfs_iunlock(ip, type);
-	if (type & XFS_IOLOCK_EXCL)
-		inode_unlock(VFS_I(ip));
-}
-
-static inline void
-xfs_rw_ilock_demote(
-	struct xfs_inode	*ip,
-	int			type)
-{
-	xfs_ilock_demote(ip, type);
-	if (type & XFS_IOLOCK_EXCL)
-		inode_unlock(VFS_I(ip));
-}
-
-/*
  * Clear the specified ranges to zero through either the pagecache or DAX.
  * Holes and unwritten extents will be left as-is as they already are zeroed.
  */
@@ -183,19 +149,16 @@ xfs_file_fsync(
 
 	xfs_iflags_clear(ip, XFS_ITRUNCATED);
 
-	if (mp->m_flags & XFS_MOUNT_BARRIER) {
-		/*
-		 * If we have an RT and/or log subvolume we need to make sure
-		 * to flush the write cache the device used for file data
-		 * first. This is to ensure newly written file data make
-		 * it to disk before logging the new inode size in case of
-		 * an extending write.
-		 */
-		if (XFS_IS_REALTIME_INODE(ip))
-			xfs_blkdev_issue_flush(mp->m_rtdev_targp);
-		else if (mp->m_logdev_targp != mp->m_ddev_targp)
-			xfs_blkdev_issue_flush(mp->m_ddev_targp);
-	}
+	/*
+	 * If we have an RT and/or log subvolume we need to make sure to flush
+	 * the write cache the device used for file data first. This is to
+	 * ensure newly written file data make it to disk before logging the new
+	 * inode size in case of an extending write.
+	 */
+	if (XFS_IS_REALTIME_INODE(ip))
+		xfs_blkdev_issue_flush(mp->m_rtdev_targp);
+	else if (mp->m_logdev_targp != mp->m_ddev_targp)
+		xfs_blkdev_issue_flush(mp->m_ddev_targp);
 
 	/*
 	 * All metadata updates are logged, which means that we just have to
@@ -230,10 +193,8 @@ xfs_file_fsync(
 	 * an already allocated file and thus do not have any metadata to
 	 * commit.
 	 */
-	if ((mp->m_flags & XFS_MOUNT_BARRIER) &&
-	    mp->m_logdev_targp == mp->m_ddev_targp &&
-	    !XFS_IS_REALTIME_INODE(ip) &&
-	    !log_flushed)
+	if (!log_flushed && !XFS_IS_REALTIME_INODE(ip) &&
+	    mp->m_logdev_targp == mp->m_ddev_targp)
 		xfs_blkdev_issue_flush(mp->m_ddev_targp);
 
 	return error;
@@ -244,87 +205,20 @@ xfs_file_dio_aio_read(
 	struct kiocb		*iocb,
 	struct iov_iter		*to)
 {
-	struct address_space	*mapping = iocb->ki_filp->f_mapping;
-	struct inode		*inode = mapping->host;
-	struct xfs_inode	*ip = XFS_I(inode);
-	loff_t			isize = i_size_read(inode);
+	struct xfs_inode	*ip = XFS_I(file_inode(iocb->ki_filp));
 	size_t			count = iov_iter_count(to);
-	struct iov_iter		data;
-	struct xfs_buftarg	*target;
-	ssize_t			ret = 0;
+	ssize_t			ret;
 
 	trace_xfs_file_direct_read(ip, count, iocb->ki_pos);
 
 	if (!count)
 		return 0; /* skip atime */
 
-	if (XFS_IS_REALTIME_INODE(ip))
-		target = ip->i_mount->m_rtdev_targp;
-	else
-		target = ip->i_mount->m_ddev_targp;
-
-	/* DIO must be aligned to device logical sector size */
-	if ((iocb->ki_pos | count) & target->bt_logical_sectormask) {
-		if (iocb->ki_pos == isize)
-			return 0;
-		return -EINVAL;
-	}
-
 	file_accessed(iocb->ki_filp);
 
-	/*
-	 * Locking is a bit tricky here. If we take an exclusive lock for direct
-	 * IO, we effectively serialise all new concurrent read IO to this file
-	 * and block it behind IO that is currently in progress because IO in
-	 * progress holds the IO lock shared. We only need to hold the lock
-	 * exclusive to blow away the page cache, so only take lock exclusively
-	 * if the page cache needs invalidation. This allows the normal direct
-	 * IO case of no page cache pages to proceeed concurrently without
-	 * serialisation.
-	 */
-	xfs_rw_ilock(ip, XFS_IOLOCK_SHARED);
-	if (mapping->nrpages) {
-		xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED);
-		xfs_rw_ilock(ip, XFS_IOLOCK_EXCL);
-
-		/*
-		 * The generic dio code only flushes the range of the particular
-		 * I/O. Because we take an exclusive lock here, this whole
-		 * sequence is considerably more expensive for us. This has a
-		 * noticeable performance impact for any file with cached pages,
-		 * even when outside of the range of the particular I/O.
-		 *
-		 * Hence, amortize the cost of the lock against a full file
-		 * flush and reduce the chances of repeated iolock cycles going
-		 * forward.
-		 */
-		if (mapping->nrpages) {
-			ret = filemap_write_and_wait(mapping);
-			if (ret) {
-				xfs_rw_iunlock(ip, XFS_IOLOCK_EXCL);
-				return ret;
-			}
-
-			/*
-			 * Invalidate whole pages. This can return an error if
-			 * we fail to invalidate a page, but this should never
-			 * happen on XFS. Warn if it does fail.
-			 */
-			ret = invalidate_inode_pages2(mapping);
-			WARN_ON_ONCE(ret);
-			ret = 0;
-		}
-		xfs_rw_ilock_demote(ip, XFS_IOLOCK_EXCL);
-	}
-
-	data = *to;
-	ret = __blockdev_direct_IO(iocb, inode, target->bt_bdev, &data,
-			xfs_get_blocks_direct, NULL, NULL, 0);
-	if (ret >= 0) {
-		iocb->ki_pos += ret;
-		iov_iter_advance(to, ret);
-	}
-	xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED);
+	xfs_ilock(ip, XFS_IOLOCK_SHARED);
+	ret = iomap_dio_rw(iocb, to, &xfs_iomap_ops, NULL);
+	xfs_iunlock(ip, XFS_IOLOCK_SHARED);
 
 	return ret;
 }
@@ -343,9 +237,9 @@ xfs_file_dax_read(
 	if (!count)
 		return 0; /* skip atime */
 
-	xfs_rw_ilock(ip, XFS_IOLOCK_SHARED);
-	ret = iomap_dax_rw(iocb, to, &xfs_iomap_ops);
-	xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED);
+	xfs_ilock(ip, XFS_IOLOCK_SHARED);
+	ret = dax_iomap_rw(iocb, to, &xfs_iomap_ops);
+	xfs_iunlock(ip, XFS_IOLOCK_SHARED);
 
 	file_accessed(iocb->ki_filp);
 	return ret;
@@ -361,9 +255,9 @@ xfs_file_buffered_aio_read(
 
 	trace_xfs_file_buffered_read(ip, iov_iter_count(to), iocb->ki_pos);
 
-	xfs_rw_ilock(ip, XFS_IOLOCK_SHARED);
+	xfs_ilock(ip, XFS_IOLOCK_SHARED);
 	ret = generic_file_read_iter(iocb, to);
-	xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED);
+	xfs_iunlock(ip, XFS_IOLOCK_SHARED);
 
 	return ret;
 }
@@ -444,15 +338,18 @@ restart:
 	if (error <= 0)
 		return error;
 
-	error = xfs_break_layouts(inode, iolock, true);
+	error = xfs_break_layouts(inode, iolock);
 	if (error)
 		return error;
 
-	/* For changing security info in file_remove_privs() we need i_mutex */
+	/*
+	 * For changing security info in file_remove_privs() we need i_rwsem
+	 * exclusively.
+	 */
 	if (*iolock == XFS_IOLOCK_SHARED && !IS_NOSEC(inode)) {
-		xfs_rw_iunlock(ip, *iolock);
+		xfs_iunlock(ip, *iolock);
 		*iolock = XFS_IOLOCK_EXCL;
-		xfs_rw_ilock(ip, *iolock);
+		xfs_ilock(ip, *iolock);
 		goto restart;
 	}
 	/*
@@ -477,9 +374,9 @@ restart:
 		spin_unlock(&ip->i_flags_lock);
 		if (!drained_dio) {
 			if (*iolock == XFS_IOLOCK_SHARED) {
-				xfs_rw_iunlock(ip, *iolock);
+				xfs_iunlock(ip, *iolock);
 				*iolock = XFS_IOLOCK_EXCL;
-				xfs_rw_ilock(ip, *iolock);
+				xfs_ilock(ip, *iolock);
 				iov_iter_reexpand(from, count);
 			}
 			/*
@@ -522,6 +419,58 @@ restart:
 	return 0;
 }
 
+static int
+xfs_dio_write_end_io(
+	struct kiocb		*iocb,
+	ssize_t			size,
+	unsigned		flags)
+{
+	struct inode		*inode = file_inode(iocb->ki_filp);
+	struct xfs_inode	*ip = XFS_I(inode);
+	loff_t			offset = iocb->ki_pos;
+	bool			update_size = false;
+	int			error = 0;
+
+	trace_xfs_end_io_direct_write(ip, offset, size);
+
+	if (XFS_FORCED_SHUTDOWN(ip->i_mount))
+		return -EIO;
+
+	if (size <= 0)
+		return size;
+
+	/*
+	 * We need to update the in-core inode size here so that we don't end up
+	 * with the on-disk inode size being outside the in-core inode size. We
+	 * have no other method of updating EOF for AIO, so always do it here
+	 * if necessary.
+	 *
+	 * We need to lock the test/set EOF update as we can be racing with
+	 * other IO completions here to update the EOF. Failing to serialise
+	 * here can result in EOF moving backwards and Bad Things Happen when
+	 * that occurs.
+	 */
+	spin_lock(&ip->i_flags_lock);
+	if (offset + size > i_size_read(inode)) {
+		i_size_write(inode, offset + size);
+		update_size = true;
+	}
+	spin_unlock(&ip->i_flags_lock);
+
+	if (flags & IOMAP_DIO_COW) {
+		error = xfs_reflink_end_cow(ip, offset, size);
+		if (error)
+			return error;
+	}
+
+	if (flags & IOMAP_DIO_UNWRITTEN)
+		error = xfs_iomap_write_unwritten(ip, offset, size);
+	else if (update_size)
+		error = xfs_setfilesize(ip, offset, size);
+
+	return error;
+}
+
 /*
  * xfs_file_dio_aio_write - handle direct IO writes
  *
@@ -561,75 +510,44 @@ xfs_file_dio_aio_write(
 	int			unaligned_io = 0;
 	int			iolock;
 	size_t			count = iov_iter_count(from);
-	loff_t			end;
-	struct iov_iter		data;
-	struct xfs_buftarg	*target = XFS_IS_REALTIME_INODE(ip) ?
+	struct xfs_buftarg	*target = XFS_IS_REALTIME_INODE(ip) ?
 					mp->m_rtdev_targp : mp->m_ddev_targp;
 
 	/* DIO must be aligned to device logical sector size */
 	if ((iocb->ki_pos | count) & target->bt_logical_sectormask)
 		return -EINVAL;
 
-	/* "unaligned" here means not aligned to a filesystem block */
-	if ((iocb->ki_pos & mp->m_blockmask) ||
-	    ((iocb->ki_pos + count) & mp->m_blockmask))
-		unaligned_io = 1;
-
 	/*
-	 * We don't need to take an exclusive lock unless there page cache needs
-	 * to be invalidated or unaligned IO is being executed. We don't need to
-	 * consider the EOF extension case here because
-	 * xfs_file_aio_write_checks() will relock the inode as necessary for
-	 * EOF zeroing cases and fill out the new inode size as appropriate.
+	 * Don't take the exclusive iolock here unless the I/O is unaligned to
+	 * the file system block size. We don't need to consider the EOF
+	 * extension case here because xfs_file_aio_write_checks() will relock
+	 * the inode as necessary for EOF zeroing cases and fill out the new
+	 * inode size as appropriate.
 	 */
-	if (unaligned_io || mapping->nrpages)
+	if ((iocb->ki_pos & mp->m_blockmask) ||
+	    ((iocb->ki_pos + count) & mp->m_blockmask)) {
+		unaligned_io = 1;
 		iolock = XFS_IOLOCK_EXCL;
-	else
+	} else {
 		iolock = XFS_IOLOCK_SHARED;
-
-	xfs_rw_ilock(ip, iolock);
-
-	/*
-	 * Recheck if there are cached pages that need invalidate after we got
-	 * the iolock to protect against other threads adding new pages while
-	 * we were waiting for the iolock.
-	 */
-	if (mapping->nrpages && iolock == XFS_IOLOCK_SHARED) {
-		xfs_rw_iunlock(ip, iolock);
-		iolock = XFS_IOLOCK_EXCL;
-		xfs_rw_ilock(ip, iolock);
 	}
 
+	xfs_ilock(ip, iolock);
+
 	ret = xfs_file_aio_write_checks(iocb, from, &iolock);
 	if (ret)
 		goto out;
 	count = iov_iter_count(from);
-	end = iocb->ki_pos + count - 1;
-
-	/*
-	 * See xfs_file_dio_aio_read() for why we do a full-file flush here.
-	 */
-	if (mapping->nrpages) {
-		ret = filemap_write_and_wait(VFS_I(ip)->i_mapping);
-		if (ret)
-			goto out;
-		/*
-		 * Invalidate whole pages. This can return an error if we fail
-		 * to invalidate a page, but this should never happen on XFS.
-		 * Warn if it does fail.
-		 */
-		ret = invalidate_inode_pages2(VFS_I(ip)->i_mapping);
-		WARN_ON_ONCE(ret);
-		ret = 0;
-	}
 
 	/*
 	 * If we are doing unaligned IO, wait for all other IO to drain,
-	 * otherwise demote the lock if we had to flush cached pages
+	 * otherwise demote the lock if we had to take the exclusive lock
+	 * for other reasons in xfs_file_aio_write_checks.
 	 */
 	if (unaligned_io)
 		inode_dio_wait(inode);
 	else if (iolock == XFS_IOLOCK_EXCL) {
-		xfs_rw_ilock_demote(ip, XFS_IOLOCK_EXCL);
+		xfs_ilock_demote(ip, XFS_IOLOCK_EXCL);
 		iolock = XFS_IOLOCK_SHARED;
 	}
@@ -642,24 +560,9 @@ xfs_file_dio_aio_write(
 		goto out;
 	}
 
-	data = *from;
-	ret = __blockdev_direct_IO(iocb, inode, target->bt_bdev, &data,
-			xfs_get_blocks_direct, xfs_end_io_direct_write,
-			NULL, DIO_ASYNC_EXTEND);
-
-	/* see generic_file_direct_write() for why this is necessary */
-	if (mapping->nrpages) {
-		invalidate_inode_pages2_range(mapping,
-			iocb->ki_pos >> PAGE_SHIFT,
-			end >> PAGE_SHIFT);
-	}
-
-	if (ret > 0) {
-		iocb->ki_pos += ret;
-		iov_iter_advance(from, ret);
-	}
+	ret = iomap_dio_rw(iocb, from, &xfs_iomap_ops, xfs_dio_write_end_io);
 out:
-	xfs_rw_iunlock(ip, iolock);
+	xfs_iunlock(ip, iolock);
 
 	/*
 	 * No fallback to buffered IO on errors for XFS, direct IO will either
@@ -681,7 +584,7 @@ xfs_file_dax_write(
 	size_t			count;
 	loff_t			pos;
 
-	xfs_rw_ilock(ip, iolock);
+	xfs_ilock(ip, iolock);
 	ret = xfs_file_aio_write_checks(iocb, from, &iolock);
 	if (ret)
 		goto out;
@@ -690,15 +593,13 @@ xfs_file_dax_write(
 	count = iov_iter_count(from);
 
 	trace_xfs_file_dax_write(ip, count, pos);
-
-	ret = iomap_dax_rw(iocb, from, &xfs_iomap_ops);
+	ret = dax_iomap_rw(iocb, from, &xfs_iomap_ops);
 	if (ret > 0 && iocb->ki_pos > i_size_read(inode)) {
 		i_size_write(inode, iocb->ki_pos);
 		error = xfs_setfilesize(ip, pos, ret);
 	}
-
 out:
-	xfs_rw_iunlock(ip, iolock);
+	xfs_iunlock(ip, iolock);
 	return error ? error : ret;
 }
 
@@ -715,7 +616,7 @@ xfs_file_buffered_aio_write(
 	int			enospc = 0;
 	int			iolock = XFS_IOLOCK_EXCL;
 
-	xfs_rw_ilock(ip, iolock);
+	xfs_ilock(ip, iolock);
 
 	ret = xfs_file_aio_write_checks(iocb, from, &iolock);
 	if (ret)
@@ -759,7 +660,7 @@ write_retry:
 
 	current->backing_dev_info = NULL;
 out:
-	xfs_rw_iunlock(ip, iolock);
+	xfs_iunlock(ip, iolock);
 	return ret;
 }
 
@@ -835,7 +736,7 @@ xfs_file_fallocate(
 		return -EOPNOTSUPP;
 
 	xfs_ilock(ip, iolock);
-	error = xfs_break_layouts(inode, &iolock, false);
+	error = xfs_break_layouts(inode, &iolock);
 	if (error)
 		goto out_unlock;
 
@@ -947,134 +848,6 @@ out_unlock:
 	return error;
 }
 
-/*
- * Flush all file writes out to disk.
- */
-static int
-xfs_file_wait_for_io(
-	struct inode	*inode,
-	loff_t		offset,
-	size_t		len)
-{
-	loff_t		rounding;
-	loff_t		ioffset;
-	loff_t		iendoffset;
-	loff_t		bs;
-	int		ret;
-
-	bs = inode->i_sb->s_blocksize;
-	inode_dio_wait(inode);
-
-	rounding = max_t(xfs_off_t, bs, PAGE_SIZE);
-	ioffset = round_down(offset, rounding);
-	iendoffset = round_up(offset + len, rounding) - 1;
-	ret = filemap_write_and_wait_range(inode->i_mapping, ioffset,
-					   iendoffset);
-	return ret;
-}
-
-/* Hook up to the VFS reflink function */
-STATIC int
-xfs_file_share_range(
-	struct file	*file_in,
-	loff_t		pos_in,
-	struct file	*file_out,
-	loff_t		pos_out,
-	u64		len,
-	bool		is_dedupe)
-{
-	struct inode	*inode_in;
-	struct inode	*inode_out;
-	ssize_t		ret;
-	loff_t		bs;
-	loff_t		isize;
-	int		same_inode;
-	loff_t		blen;
-	unsigned int	flags = 0;
-
-	inode_in = file_inode(file_in);
-	inode_out = file_inode(file_out);
-	bs = inode_out->i_sb->s_blocksize;
-
-	/* Don't touch certain kinds of inodes */
-	if (IS_IMMUTABLE(inode_out))
-		return -EPERM;
-	if (IS_SWAPFILE(inode_in) ||
-	    IS_SWAPFILE(inode_out))
-		return -ETXTBSY;
-
-	/* Reflink only works within this filesystem. */
-	if (inode_in->i_sb != inode_out->i_sb)
-		return -EXDEV;
-	same_inode = (inode_in->i_ino == inode_out->i_ino);
-
-	/* Don't reflink dirs, pipes, sockets... */
-	if (S_ISDIR(inode_in->i_mode) || S_ISDIR(inode_out->i_mode))
-		return -EISDIR;
-	if (S_ISFIFO(inode_in->i_mode) || S_ISFIFO(inode_out->i_mode))
-		return -EINVAL;
-	if (!S_ISREG(inode_in->i_mode) || !S_ISREG(inode_out->i_mode))
-		return -EINVAL;
-
-	/* Don't share DAX file data for now. */
-	if (IS_DAX(inode_in) || IS_DAX(inode_out))
-		return -EINVAL;
-
-	/* Are we going all the way to the end? */
-	isize = i_size_read(inode_in);
-	if (isize == 0)
-		return 0;
-	if (len == 0)
-		len = isize - pos_in;
-
-	/* Ensure offsets don't wrap and the input is inside i_size */
-	if (pos_in + len < pos_in || pos_out + len < pos_out ||
-	    pos_in + len > isize)
-		return -EINVAL;
-
-	/* Don't allow dedupe past EOF in the dest file */
-	if (is_dedupe) {
-		loff_t	disize;
-
-		disize = i_size_read(inode_out);
-		if (pos_out >= disize || pos_out + len > disize)
-			return -EINVAL;
-	}
-
-	/* If we're linking to EOF, continue to the block boundary. */
-	if (pos_in + len == isize)
-		blen = ALIGN(isize, bs) - pos_in;
-	else
-		blen = len;
-
-	/* Only reflink if we're aligned to block boundaries */
-	if (!IS_ALIGNED(pos_in, bs) || !IS_ALIGNED(pos_in + blen, bs) ||
-	    !IS_ALIGNED(pos_out, bs) || !IS_ALIGNED(pos_out + blen, bs))
-		return -EINVAL;
-
-	/* Don't allow overlapped reflink within the same file */
-	if (same_inode && pos_out + blen > pos_in && pos_out < pos_in + blen)
-		return -EINVAL;
-
-	/* Wait for the completion of any pending IOs on srcfile */
-	ret = xfs_file_wait_for_io(inode_in, pos_in, len);
-	if (ret)
-		goto out;
-	ret = xfs_file_wait_for_io(inode_out, pos_out, len);
-	if (ret)
-		goto out;
-
-	if (is_dedupe)
-		flags |= XFS_REFLINK_DEDUPE;
-	ret = xfs_reflink_remap_range(XFS_I(inode_in), pos_in, XFS_I(inode_out),
-			pos_out, len, flags);
-	if (ret < 0)
-		goto out;
-
-out:
-	return ret;
-}
-
 STATIC ssize_t
 xfs_file_copy_range(
 	struct file	*file_in,
 	loff_t		pos_in,
 	struct file	*file_out,
 	loff_t		pos_out,
 	u64		len,
@@ -1086,7 +859,7 @@ xfs_file_copy_range(
 {
 	int		error;
 
-	error = xfs_file_share_range(file_in, pos_in, file_out, pos_out,
+	error = xfs_reflink_remap_range(file_in, pos_in, file_out, pos_out,
 			len, false);
 	if (error)
 		return error;
@@ -1101,11 +874,10 @@ xfs_file_clone_range(
 	loff_t		pos_out,
 	u64		len)
 {
-	return xfs_file_share_range(file_in, pos_in, file_out, pos_out,
+	return xfs_reflink_remap_range(file_in, pos_in, file_out, pos_out,
 			len, false);
 }
 
-#define XFS_MAX_DEDUPE_LEN	(16 * 1024 * 1024)
 STATIC ssize_t
 xfs_file_dedupe_range(
@@ -1116,15 +888,7 @@ xfs_file_dedupe_range(
 {
 	int		error;
 
-	/*
-	 * Limit the total length we will dedupe for each operation.
-	 * This is intended to bound the total time spent in this
-	 * ioctl to something sane.
-	 */
-	if (len > XFS_MAX_DEDUPE_LEN)
-		len = XFS_MAX_DEDUPE_LEN;
-
-	error = xfs_file_share_range(src_file, loff, dst_file, dst_loff,
+	error = xfs_reflink_remap_range(src_file, loff, dst_file, dst_loff,
 			len, true);
 	if (error)
 		return error;
@@ -1640,7 +1404,7 @@ xfs_filemap_page_mkwrite(
 	xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
 
 	if (IS_DAX(inode)) {
-		ret = iomap_dax_fault(vma, vmf, &xfs_iomap_ops);
+		ret = dax_iomap_fault(vma, vmf, &xfs_iomap_ops);
 	} else {
 		ret = iomap_page_mkwrite(vma, vmf, &xfs_iomap_ops);
 		ret = block_page_mkwrite_return(ret);
@@ -1667,15 +1431,9 @@ xfs_filemap_fault(
 		return xfs_filemap_page_mkwrite(vma, vmf);
 
 	xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
-	if (IS_DAX(inode)) {
-		/*
-		 * we do not want to trigger unwritten extent conversion on read
-		 * faults - that is unnecessary overhead and would also require
-		 * changes to xfs_get_blocks_direct() to map unwritten extent
-		 * ioend for conversion on read-only mappings.
-		 */
-		ret = iomap_dax_fault(vma, vmf, &xfs_iomap_ops);
-	} else
+	if (IS_DAX(inode))
+		ret = dax_iomap_fault(vma, vmf, &xfs_iomap_ops);
+	else
 		ret = filemap_fault(vma, vmf);
 	xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
 
@@ -1711,7 +1469,7 @@ xfs_filemap_pmd_fault(
 	}
 
 	xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
-	ret = dax_pmd_fault(vma, addr, pmd, flags, xfs_get_blocks_dax_fault);
+	ret = dax_iomap_pmd_fault(vma, addr, pmd, flags, &xfs_iomap_ops);
 	xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
 
 	if (flags & FAULT_FLAG_WRITE)