summaryrefslogtreecommitdiff
path: root/fs/xfs/xfs_aops.c
diff options
context:
space:
mode:
Diffstat (limited to 'fs/xfs/xfs_aops.c')
-rw-r--r--fs/xfs/xfs_aops.c248
1 files changed, 152 insertions, 96 deletions
diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index 8dad722c0041..596ec71da00e 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -31,6 +31,7 @@
#include "xfs_vnodeops.h"
#include "xfs_trace.h"
#include "xfs_bmap.h"
+#include <linux/aio.h>
#include <linux/gfp.h>
#include <linux/mpage.h>
#include <linux/pagevec.h>
@@ -86,11 +87,11 @@ xfs_destroy_ioend(
}
if (ioend->io_iocb) {
+ inode_dio_done(ioend->io_inode);
if (ioend->io_isasync) {
aio_complete(ioend->io_iocb, ioend->io_error ?
ioend->io_error : ioend->io_result, 0);
}
- inode_dio_done(ioend->io_inode);
}
mempool_free(ioend, xfs_ioend_pool);
@@ -124,6 +125,12 @@ xfs_setfilesize_trans_alloc(
ioend->io_append_trans = tp;
/*
+ * We may pass freeze protection with a transaction. So tell lockdep
+ * we released it.
+ */
+ rwsem_release(&ioend->io_inode->i_sb->s_writers.lock_map[SB_FREEZE_FS-1],
+ 1, _THIS_IP_);
+ /*
* We hand off the transaction to the completion thread now, so
* clear the flag here.
*/
@@ -143,11 +150,13 @@ xfs_setfilesize(
xfs_fsize_t isize;
/*
- * The transaction was allocated in the I/O submission thread,
- * thus we need to mark ourselves as beeing in a transaction
- * manually.
+ * The transaction may have been allocated in the I/O submission thread,
+ * thus we need to mark ourselves as beeing in a transaction manually.
+ * Similarly for freeze protection.
*/
current_set_flags_nested(&tp->t_pflags, PF_FSTRANS);
+ rwsem_acquire_read(&VFS_I(ip)->i_sb->s_writers.lock_map[SB_FREEZE_FS-1],
+ 0, 1, _THIS_IP_);
xfs_ilock(ip, XFS_ILOCK_EXCL);
isize = xfs_new_eof(ip, ioend->io_offset + ioend->io_size);
@@ -179,9 +188,10 @@ xfs_finish_ioend(
if (atomic_dec_and_test(&ioend->io_remaining)) {
struct xfs_mount *mp = XFS_I(ioend->io_inode)->i_mount;
- if (ioend->io_type == IO_UNWRITTEN)
+ if (ioend->io_type == XFS_IO_UNWRITTEN)
queue_work(mp->m_unwritten_workqueue, &ioend->io_work);
- else if (ioend->io_append_trans)
+ else if (ioend->io_append_trans ||
+ (ioend->io_isdirect && xfs_ioend_is_append(ioend)))
queue_work(mp->m_data_workqueue, &ioend->io_work);
else
xfs_destroy_ioend(ioend);
@@ -210,36 +220,32 @@ xfs_end_io(
* For unwritten extents we need to issue transactions to convert a
* range to normal written extens after the data I/O has finished.
*/
- if (ioend->io_type == IO_UNWRITTEN) {
+ if (ioend->io_type == XFS_IO_UNWRITTEN) {
+ error = xfs_iomap_write_unwritten(ip, ioend->io_offset,
+ ioend->io_size);
+ } else if (ioend->io_isdirect && xfs_ioend_is_append(ioend)) {
/*
- * For buffered I/O we never preallocate a transaction when
- * doing the unwritten extent conversion, but for direct I/O
- * we do not know if we are converting an unwritten extent
- * or not at the point where we preallocate the transaction.
+ * For direct I/O we do not know if we need to allocate blocks
+ * or not so we can't preallocate an append transaction as that
+ * results in nested reservations and log space deadlocks. Hence
+ * allocate the transaction here. While this is sub-optimal and
+ * can block IO completion for some time, we're stuck with doing
+ * it this way until we can pass the ioend to the direct IO
+ * allocation callbacks and avoid nesting that way.
*/
- if (ioend->io_append_trans) {
- ASSERT(ioend->io_isdirect);
-
- current_set_flags_nested(
- &ioend->io_append_trans->t_pflags, PF_FSTRANS);
- xfs_trans_cancel(ioend->io_append_trans, 0);
- }
-
- error = xfs_iomap_write_unwritten(ip, ioend->io_offset,
- ioend->io_size);
- if (error) {
- ioend->io_error = -error;
+ error = xfs_setfilesize_trans_alloc(ioend);
+ if (error)
goto done;
- }
+ error = xfs_setfilesize(ioend);
} else if (ioend->io_append_trans) {
error = xfs_setfilesize(ioend);
- if (error)
- ioend->io_error = -error;
} else {
ASSERT(!xfs_ioend_is_append(ioend));
}
done:
+ if (error)
+ ioend->io_error = -error;
xfs_destroy_ioend(ioend);
}
@@ -312,7 +318,7 @@ xfs_map_blocks(
if (XFS_FORCED_SHUTDOWN(mp))
return -XFS_ERROR(EIO);
- if (type == IO_UNWRITTEN)
+ if (type == XFS_IO_UNWRITTEN)
bmapi_flags |= XFS_BMAPI_IGSTATE;
if (!xfs_ilock_nowait(ip, XFS_ILOCK_SHARED)) {
@@ -323,10 +329,10 @@ xfs_map_blocks(
ASSERT(ip->i_d.di_format != XFS_DINODE_FMT_BTREE ||
(ip->i_df.if_flags & XFS_IFEXTENTS));
- ASSERT(offset <= mp->m_maxioffset);
+ ASSERT(offset <= mp->m_super->s_maxbytes);
- if (offset + count > mp->m_maxioffset)
- count = mp->m_maxioffset - offset;
+ if (offset + count > mp->m_super->s_maxbytes)
+ count = mp->m_super->s_maxbytes - offset;
end_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)offset + count);
offset_fsb = XFS_B_TO_FSBT(mp, offset);
error = xfs_bmapi_read(ip, offset_fsb, end_fsb - offset_fsb,
@@ -336,7 +342,7 @@ xfs_map_blocks(
if (error)
return -XFS_ERROR(error);
- if (type == IO_DELALLOC &&
+ if (type == XFS_IO_DELALLOC &&
(!nimaps || isnullstartblock(imap->br_startblock))) {
error = xfs_iomap_write_allocate(ip, offset, count, imap);
if (!error)
@@ -345,7 +351,7 @@ xfs_map_blocks(
}
#ifdef DEBUG
- if (type == IO_UNWRITTEN) {
+ if (type == XFS_IO_UNWRITTEN) {
ASSERT(nimaps);
ASSERT(imap->br_startblock != HOLESTARTBLOCK);
ASSERT(imap->br_startblock != DELAYSTARTBLOCK);
@@ -466,11 +472,17 @@ static inline int bio_add_buffer(struct bio *bio, struct buffer_head *bh)
*
* The fix is two passes across the ioend list - one to start writeback on the
* buffer_heads, and then submit them for I/O on the second pass.
+ *
+ * If @fail is non-zero, it means that we have a situation where some part of
+ * the submission process has failed after we have marked paged for writeback
+ * and unlocked them. In this situation, we need to fail the ioend chain rather
+ * than submit it to IO. This typically only happens on a filesystem shutdown.
*/
STATIC void
xfs_submit_ioend(
struct writeback_control *wbc,
- xfs_ioend_t *ioend)
+ xfs_ioend_t *ioend,
+ int fail)
{
xfs_ioend_t *head = ioend;
xfs_ioend_t *next;
@@ -491,6 +503,18 @@ xfs_submit_ioend(
next = ioend->io_list;
bio = NULL;
+ /*
+ * If we are failing the IO now, just mark the ioend with an
+ * error and finish it. This will run IO completion immediately
+ * as there is only one reference to the ioend at this point in
+ * time.
+ */
+ if (fail) {
+ ioend->io_error = -fail;
+ xfs_finish_ioend(ioend);
+ continue;
+ }
+
for (bh = ioend->io_buffer_head; bh; bh = bh->b_private) {
if (!bio) {
@@ -634,11 +658,11 @@ xfs_check_page_type(
bh = head = page_buffers(page);
do {
if (buffer_unwritten(bh))
- acceptable += (type == IO_UNWRITTEN);
+ acceptable += (type == XFS_IO_UNWRITTEN);
else if (buffer_delay(bh))
- acceptable += (type == IO_DELALLOC);
+ acceptable += (type == XFS_IO_DELALLOC);
else if (buffer_dirty(bh) && buffer_mapped(bh))
- acceptable += (type == IO_OVERWRITE);
+ acceptable += (type == XFS_IO_OVERWRITE);
else
break;
} while ((bh = bh->b_this_page) != head);
@@ -701,6 +725,25 @@ xfs_convert_page(
(xfs_off_t)(page->index + 1) << PAGE_CACHE_SHIFT,
i_size_read(inode));
+ /*
+ * If the current map does not span the entire page we are about to try
+ * to write, then give up. The only way we can write a page that spans
+ * multiple mappings in a single writeback iteration is via the
+ * xfs_vm_writepage() function. Data integrity writeback requires the
+ * entire page to be written in a single attempt, otherwise the part of
+ * the page we don't write here doesn't get written as part of the data
+ * integrity sync.
+ *
+ * For normal writeback, we also don't attempt to write partial pages
+ * here as it simply means that write_cache_pages() will see it under
+ * writeback and ignore the page until some point in the future, at
+ * which time this will be the only page in the file that needs
+ * writeback. Hence for more optimal IO patterns, we should always
+ * avoid partial page writeback due to multiple mappings on a page here.
+ */
+ if (!xfs_imap_valid(inode, imap, end_offset))
+ goto fail_unlock_page;
+
len = 1 << inode->i_blkbits;
p_offset = min_t(unsigned long, end_offset & (PAGE_CACHE_SIZE - 1),
PAGE_CACHE_SIZE);
@@ -721,11 +764,11 @@ xfs_convert_page(
if (buffer_unwritten(bh) || buffer_delay(bh) ||
buffer_mapped(bh)) {
if (buffer_unwritten(bh))
- type = IO_UNWRITTEN;
+ type = XFS_IO_UNWRITTEN;
else if (buffer_delay(bh))
- type = IO_DELALLOC;
+ type = XFS_IO_DELALLOC;
else
- type = IO_OVERWRITE;
+ type = XFS_IO_OVERWRITE;
if (!xfs_imap_valid(inode, imap, offset)) {
done = 1;
@@ -733,7 +776,7 @@ xfs_convert_page(
}
lock_buffer(bh);
- if (type != IO_OVERWRITE)
+ if (type != XFS_IO_OVERWRITE)
xfs_map_at_offset(inode, bh, imap, offset);
xfs_add_to_ioend(inode, bh, offset, type,
ioendp, done);
@@ -800,10 +843,12 @@ xfs_cluster_write(
STATIC void
xfs_vm_invalidatepage(
struct page *page,
- unsigned long offset)
+ unsigned int offset,
+ unsigned int length)
{
- trace_xfs_invalidatepage(page->mapping->host, page, offset);
- block_invalidatepage(page, offset);
+ trace_xfs_invalidatepage(page->mapping->host, page, offset,
+ length);
+ block_invalidatepage(page, offset, length);
}
/*
@@ -831,7 +876,7 @@ xfs_aops_discard_page(
struct buffer_head *bh, *head;
loff_t offset = page_offset(page);
- if (!xfs_check_page_type(page, IO_DELALLOC))
+ if (!xfs_check_page_type(page, XFS_IO_DELALLOC))
goto out_invalidate;
if (XFS_FORCED_SHUTDOWN(ip->i_mount))
@@ -867,7 +912,7 @@ next_buffer:
xfs_iunlock(ip, XFS_ILOCK_EXCL);
out_invalidate:
- xfs_vm_invalidatepage(page, 0);
+ xfs_vm_invalidatepage(page, 0, PAGE_CACHE_SIZE);
return;
}
@@ -897,7 +942,7 @@ xfs_vm_writepage(
int count = 0;
int nonblocking = 0;
- trace_xfs_writepage(inode, page, 0);
+ trace_xfs_writepage(inode, page, 0, 0);
ASSERT(page_has_buffers(page));
@@ -927,11 +972,26 @@ xfs_vm_writepage(
end_index = offset >> PAGE_CACHE_SHIFT;
last_index = (offset - 1) >> PAGE_CACHE_SHIFT;
if (page->index >= end_index) {
- if ((page->index >= end_index + 1) ||
- !(i_size_read(inode) & (PAGE_CACHE_SIZE - 1))) {
- unlock_page(page);
- return 0;
- }
+ unsigned offset_into_page = offset & (PAGE_CACHE_SIZE - 1);
+
+ /*
+ * Skip the page if it is fully outside i_size, e.g. due to a
+ * truncate operation that is in progress. We must redirty the
+ * page so that reclaim stops reclaiming it. Otherwise
+ * xfs_vm_releasepage() is called on it and gets confused.
+ */
+ if (page->index >= end_index + 1 || offset_into_page == 0)
+ goto redirty;
+
+ /*
+ * The page straddles i_size. It must be zeroed out on each
+ * and every writepage invocation because it may be mmapped.
+ * "A file is mapped in multiples of the page size. For a file
+ * that is not a multiple of the page size, the remaining
+ * memory is zeroed when mapped, and writes to that region are
+ * not written out to the file."
+ */
+ zero_user_segment(page, offset_into_page, PAGE_CACHE_SIZE);
}
end_offset = min_t(unsigned long long,
@@ -941,7 +1001,7 @@ xfs_vm_writepage(
bh = head = page_buffers(page);
offset = page_offset(page);
- type = IO_OVERWRITE;
+ type = XFS_IO_OVERWRITE;
if (wbc->sync_mode == WB_SYNC_NONE)
nonblocking = 1;
@@ -966,18 +1026,18 @@ xfs_vm_writepage(
}
if (buffer_unwritten(bh)) {
- if (type != IO_UNWRITTEN) {
- type = IO_UNWRITTEN;
+ if (type != XFS_IO_UNWRITTEN) {
+ type = XFS_IO_UNWRITTEN;
imap_valid = 0;
}
} else if (buffer_delay(bh)) {
- if (type != IO_DELALLOC) {
- type = IO_DELALLOC;
+ if (type != XFS_IO_DELALLOC) {
+ type = XFS_IO_DELALLOC;
imap_valid = 0;
}
} else if (buffer_uptodate(bh)) {
- if (type != IO_OVERWRITE) {
- type = IO_OVERWRITE;
+ if (type != XFS_IO_OVERWRITE) {
+ type = XFS_IO_OVERWRITE;
imap_valid = 0;
}
} else {
@@ -1013,7 +1073,7 @@ xfs_vm_writepage(
}
if (imap_valid) {
lock_buffer(bh);
- if (type != IO_OVERWRITE)
+ if (type != XFS_IO_OVERWRITE)
xfs_map_at_offset(inode, bh, &imap, offset);
xfs_add_to_ioend(inode, bh, offset, type, &ioend,
new_ioend);
@@ -1030,7 +1090,18 @@ xfs_vm_writepage(
xfs_start_page_writeback(page, 1, count);
- if (ioend && imap_valid) {
+ /* if there is no IO to be submitted for this page, we are done */
+ if (!ioend)
+ return 0;
+
+ ASSERT(iohead);
+
+ /*
+ * Any errors from this point onwards need tobe reported through the IO
+ * completion path as we have marked the initial page as under writeback
+ * and unlocked it.
+ */
+ if (imap_valid) {
xfs_off_t end_index;
end_index = imap.br_startoff + imap.br_blockcount;
@@ -1049,20 +1120,15 @@ xfs_vm_writepage(
wbc, end_index);
}
- if (iohead) {
- /*
- * Reserve log space if we might write beyond the on-disk
- * inode size.
- */
- if (ioend->io_type != IO_UNWRITTEN &&
- xfs_ioend_is_append(ioend)) {
- err = xfs_setfilesize_trans_alloc(ioend);
- if (err)
- goto error;
- }
- xfs_submit_ioend(wbc, iohead);
- }
+ /*
+ * Reserve log space if we might write beyond the on-disk inode size.
+ */
+ err = 0;
+ if (ioend->io_type != XFS_IO_UNWRITTEN && xfs_ioend_is_append(ioend))
+ err = xfs_setfilesize_trans_alloc(ioend);
+
+ xfs_submit_ioend(wbc, iohead, err);
return 0;
@@ -1107,7 +1173,7 @@ xfs_vm_releasepage(
{
int delalloc, unwritten;
- trace_xfs_releasepage(page->mapping->host, page, 0);
+ trace_xfs_releasepage(page->mapping->host, page, 0, 0);
xfs_count_page_state(page, &delalloc, &unwritten);
@@ -1162,9 +1228,9 @@ __xfs_get_blocks(
lockmode = xfs_ilock_map_shared(ip);
}
- ASSERT(offset <= mp->m_maxioffset);
- if (offset + size > mp->m_maxioffset)
- size = mp->m_maxioffset - offset;
+ ASSERT(offset <= mp->m_super->s_maxbytes);
+ if (offset + size > mp->m_super->s_maxbytes)
+ size = mp->m_super->s_maxbytes - offset;
end_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)offset + size);
offset_fsb = XFS_B_TO_FSBT(mp, offset);
@@ -1351,7 +1417,7 @@ xfs_end_io_direct_write(
ioend->io_iocb = iocb;
ioend->io_result = ret;
if (private && size > 0)
- ioend->io_type = IO_UNWRITTEN;
+ ioend->io_type = XFS_IO_UNWRITTEN;
if (is_async) {
ioend->io_isasync = 1;
@@ -1378,25 +1444,21 @@ xfs_vm_direct_IO(
size_t size = iov_length(iov, nr_segs);
/*
- * We need to preallocate a transaction for a size update
- * here. In the case that this write both updates the size
- * and converts at least on unwritten extent we will cancel
- * the still clean transaction after the I/O has finished.
+ * We cannot preallocate a size update transaction here as we
+ * don't know whether allocation is necessary or not. Hence we
+ * can only tell IO completion that one is necessary if we are
+ * not doing unwritten extent conversion.
*/
- iocb->private = ioend = xfs_alloc_ioend(inode, IO_DIRECT);
- if (offset + size > XFS_I(inode)->i_d.di_size) {
- ret = xfs_setfilesize_trans_alloc(ioend);
- if (ret)
- goto out_destroy_ioend;
+ iocb->private = ioend = xfs_alloc_ioend(inode, XFS_IO_DIRECT);
+ if (offset + size > XFS_I(inode)->i_d.di_size)
ioend->io_isdirect = 1;
- }
ret = __blockdev_direct_IO(rw, iocb, inode, bdev, iov,
offset, nr_segs,
xfs_get_blocks_direct,
xfs_end_io_direct_write, NULL, 0);
if (ret != -EIOCBQUEUED && iocb->private)
- goto out_trans_cancel;
+ goto out_destroy_ioend;
} else {
ret = __blockdev_direct_IO(rw, iocb, inode, bdev, iov,
offset, nr_segs,
@@ -1406,12 +1468,6 @@ xfs_vm_direct_IO(
return ret;
-out_trans_cancel:
- if (ioend->io_append_trans) {
- current_set_flags_nested(&ioend->io_append_trans->t_pflags,
- PF_FSTRANS);
- xfs_trans_cancel(ioend->io_append_trans, 0);
- }
out_destroy_ioend:
xfs_destroy_ioend(ioend);
return ret;
@@ -1584,7 +1640,7 @@ xfs_vm_bmap(
trace_xfs_vm_bmap(XFS_I(inode));
xfs_ilock(ip, XFS_IOLOCK_SHARED);
- xfs_flush_pages(ip, (xfs_off_t)0, -1, 0, FI_REMAPF);
+ filemap_write_and_wait(mapping);
xfs_iunlock(ip, XFS_IOLOCK_SHARED);
return generic_block_bmap(mapping, block, xfs_get_blocks);
}