From 6d17d653c9f152e113043d00f3bcf00c0eb5f5a2 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Sun, 9 Jul 2017 13:45:27 -0400 Subject: NFS: Simplify page writeback We don't expect the page header lock to ever be held across I/O, so it should always be safe to wait for it, even if we're doing nonblocking writebacks. Signed-off-by: Trond Myklebust --- fs/nfs/write.c | 30 ++++++++++-------------------- 1 file changed, 10 insertions(+), 20 deletions(-) (limited to 'fs/nfs/write.c') diff --git a/fs/nfs/write.c b/fs/nfs/write.c index b1af5dee5e0a..1d447e37f472 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -366,7 +366,6 @@ nfs_page_group_clear_bits(struct nfs_page *req) * @inode - inode associated with request page group, must be holding inode lock * @head - head request of page group, must be holding head lock * @req - request that couldn't lock and needs to wait on the req bit lock - * @nonblock - if true, don't actually wait * * NOTE: this must be called holding page_group bit lock and inode spin lock * and BOTH will be released before returning. @@ -375,7 +374,7 @@ nfs_page_group_clear_bits(struct nfs_page *req) */ static int nfs_unroll_locks_and_wait(struct inode *inode, struct nfs_page *head, - struct nfs_page *req, bool nonblock) + struct nfs_page *req) __releases(&inode->i_lock) { struct nfs_page *tmp; @@ -396,10 +395,7 @@ nfs_unroll_locks_and_wait(struct inode *inode, struct nfs_page *head, /* release ref from nfs_page_find_head_request_locked */ nfs_release_request(head); - if (!nonblock) - ret = nfs_wait_on_request(req); - else - ret = -EAGAIN; + ret = nfs_wait_on_request(req); nfs_release_request(req); return ret; @@ -464,7 +460,6 @@ nfs_destroy_unlinked_subrequests(struct nfs_page *destroy_list, * operations for this page. * * @page - the page used to lookup the "page group" of nfs_page structures - * @nonblock - if true, don't block waiting for request locks * * This function joins all sub requests to the head request by first * locking all requests in the group, cancelling any pending operations @@ -478,7 +473,7 @@ nfs_destroy_unlinked_subrequests(struct nfs_page *destroy_list, * error was encountered. */ static struct nfs_page * -nfs_lock_and_join_requests(struct page *page, bool nonblock) +nfs_lock_and_join_requests(struct page *page) { struct inode *inode = page_file_mapping(page)->host; struct nfs_page *head, *subreq; @@ -511,14 +506,9 @@ try_again: if (ret < 0) { spin_unlock(&inode->i_lock); - if (!nonblock && ret == -EAGAIN) { - nfs_page_group_lock_wait(head); - nfs_release_request(head); - goto try_again; - } - + nfs_page_group_lock_wait(head); nfs_release_request(head); - return ERR_PTR(ret); + goto try_again; } /* lock each request in the page group */ @@ -543,7 +533,7 @@ try_again: /* releases page group bit lock and * inode spin lock and all references */ ret = nfs_unroll_locks_and_wait(inode, head, - subreq, nonblock); + subreq); if (ret == 0) goto try_again; @@ -624,12 +614,12 @@ nfs_error_is_fatal_on_server(int err) * May return an error if the user signalled nfs_wait_on_request(). */ static int nfs_page_async_flush(struct nfs_pageio_descriptor *pgio, - struct page *page, bool nonblock) + struct page *page) { struct nfs_page *req; int ret = 0; - req = nfs_lock_and_join_requests(page, nonblock); + req = nfs_lock_and_join_requests(page); if (!req) goto out; ret = PTR_ERR(req); @@ -672,7 +662,7 @@ static int nfs_do_writepage(struct page *page, struct writeback_control *wbc, int ret; nfs_pageio_cond_complete(pgio, page_index(page)); - ret = nfs_page_async_flush(pgio, page, wbc->sync_mode == WB_SYNC_NONE); + ret = nfs_page_async_flush(pgio, page); if (ret == -EAGAIN) { redirty_page_for_writepage(wbc, page); ret = 0; @@ -2015,7 +2005,7 @@ int nfs_wb_page_cancel(struct inode *inode, struct page *page) /* blocking call to cancel all requests and join to a single (head) * request */ - req = nfs_lock_and_join_requests(page, false); + req = nfs_lock_and_join_requests(page); if (IS_ERR(req)) { ret = PTR_ERR(req); -- cgit v1.2.3 From 82749dd4efcec8e90fa7769eec3dd0afa2e3396a Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Mon, 17 Jul 2017 09:30:13 -0400 Subject: NFS: Reduce lock contention in nfs_page_find_head_request() Add a lockless check for whether or not the page might be carrying an existing writeback before we grab the inode->i_lock. Reported-by: Chuck Lever Signed-off-by: Trond Myklebust --- fs/nfs/write.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'fs/nfs/write.c') diff --git a/fs/nfs/write.c b/fs/nfs/write.c index 1d447e37f472..06e150c4e315 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -190,9 +190,11 @@ static struct nfs_page *nfs_page_find_head_request(struct page *page) struct inode *inode = page_file_mapping(page)->host; struct nfs_page *req = NULL; - spin_lock(&inode->i_lock); - req = nfs_page_find_head_request_locked(NFS_I(inode), page); - spin_unlock(&inode->i_lock); + if (PagePrivate(page) || PageSwapCache(page)) { + spin_lock(&inode->i_lock); + req = nfs_page_find_head_request_locked(NFS_I(inode), page); + spin_unlock(&inode->i_lock); + } return req; } -- cgit v1.2.3 From 1403390d8366c717139cab26b8e94d943915fa12 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Mon, 17 Jul 2017 09:37:12 -0400 Subject: NFS: Reduce lock contention in nfs_try_to_update_request() Micro-optimisation to move the lockless check into the for(;;) loop. Signed-off-by: Trond Myklebust --- fs/nfs/write.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) (limited to 'fs/nfs/write.c') diff --git a/fs/nfs/write.c b/fs/nfs/write.c index 06e150c4e315..bb019096c331 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -1097,13 +1097,12 @@ static struct nfs_page *nfs_try_to_update_request(struct inode *inode, unsigned int end; int error; - if (!PagePrivate(page)) - return NULL; - end = offset + bytes; - spin_lock(&inode->i_lock); for (;;) { + if (!(PagePrivate(page) || PageSwapCache(page))) + return NULL; + spin_lock(&inode->i_lock); req = nfs_page_find_head_request_locked(NFS_I(inode), page); if (req == NULL) goto out_unlock; @@ -1132,7 +1131,6 @@ static struct nfs_page *nfs_try_to_update_request(struct inode *inode, nfs_release_request(req); if (error != 0) goto out_err; - spin_lock(&inode->i_lock); } /* Okay, the request matches. Update the region */ -- cgit v1.2.3 From 7cb9cd9aa2eafe869935d4168031f1ed376d924c Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Mon, 17 Jul 2017 10:11:11 -0400 Subject: NFS: Fix a reference and lock leak in nfs_lock_and_join_requests() Yes, this is a situation that should never happen (hence the WARN_ON) but we should still ensure that we free up the locks and references to the faulty pages. Signed-off-by: Trond Myklebust --- fs/nfs/write.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'fs/nfs/write.c') diff --git a/fs/nfs/write.c b/fs/nfs/write.c index bb019096c331..1ca759719429 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -526,8 +526,7 @@ try_again: } else if (WARN_ON_ONCE(subreq->wb_offset < head->wb_offset || ((subreq->wb_offset + subreq->wb_bytes) > (head->wb_offset + total_bytes)))) { - nfs_page_group_unlock(head); - spin_unlock(&inode->i_lock); + nfs_unroll_locks_and_wait(inode, head, subreq); return ERR_PTR(-EIO); } -- cgit v1.2.3 From a0e265bc78010d2d831a968d4cea3c40a0efac8b Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Mon, 17 Jul 2017 10:29:32 -0400 Subject: NFS: Fix an ABBA issue in nfs_lock_and_join_requests() All other callers of nfs_page_group_lock() appear to already hold the page lock on the head page, so doing it in the opposite order here is inefficient, although not deadlock prone since we roll back all locks on contention. Signed-off-by: Trond Myklebust --- fs/nfs/write.c | 29 +++++++++++++++++------------ 1 file changed, 17 insertions(+), 12 deletions(-) (limited to 'fs/nfs/write.c') diff --git a/fs/nfs/write.c b/fs/nfs/write.c index 1ca759719429..c940e615f5dc 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -383,7 +383,7 @@ nfs_unroll_locks_and_wait(struct inode *inode, struct nfs_page *head, int ret; /* relinquish all the locks successfully grabbed this run */ - for (tmp = head ; tmp != req; tmp = tmp->wb_this_page) + for (tmp = head->wb_this_page ; tmp != req; tmp = tmp->wb_this_page) nfs_unlock_request(tmp); WARN_ON_ONCE(test_bit(PG_TEARDOWN, &req->wb_flags)); @@ -395,7 +395,7 @@ nfs_unroll_locks_and_wait(struct inode *inode, struct nfs_page *head, spin_unlock(&inode->i_lock); /* release ref from nfs_page_find_head_request_locked */ - nfs_release_request(head); + nfs_unlock_and_release_request(head); ret = nfs_wait_on_request(req); nfs_release_request(req); @@ -484,10 +484,6 @@ nfs_lock_and_join_requests(struct page *page) int ret; try_again: - total_bytes = 0; - - WARN_ON_ONCE(destroy_list); - spin_lock(&inode->i_lock); /* @@ -502,6 +498,16 @@ try_again: return NULL; } + /* lock the page head first in order to avoid an ABBA inefficiency */ + if (!nfs_lock_request(head)) { + spin_unlock(&inode->i_lock); + ret = nfs_wait_on_request(head); + nfs_release_request(head); + if (ret < 0) + return ERR_PTR(ret); + goto try_again; + } + /* holding inode lock, so always make a non-blocking call to try the * page group lock */ ret = nfs_page_group_lock(head, true); @@ -509,13 +515,14 @@ try_again: spin_unlock(&inode->i_lock); nfs_page_group_lock_wait(head); - nfs_release_request(head); + nfs_unlock_and_release_request(head); goto try_again; } /* lock each request in the page group */ - subreq = head; - do { + total_bytes = head->wb_bytes; + for (subreq = head->wb_this_page; subreq != head; + subreq = subreq->wb_this_page) { /* * Subrequests are always contiguous, non overlapping * and in order - but may be repeated (mirrored writes). @@ -541,9 +548,7 @@ try_again: return ERR_PTR(ret); } - - subreq = subreq->wb_this_page; - } while (subreq != head); + } /* Now that all requests are locked, make sure they aren't on any list. * Commit list removal accounting is done after locks are dropped */ -- cgit v1.2.3 From e14bebf6de11a4b8476cf2b0a75bf7c3e69112d5 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Mon, 17 Jul 2017 11:11:49 -0400 Subject: NFS: Don't check request offset and size without holding a lock Request offsets and sizes are not guaranteed to be stable unless you are holding the request locked. Signed-off-by: Trond Myklebust --- fs/nfs/write.c | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) (limited to 'fs/nfs/write.c') diff --git a/fs/nfs/write.c b/fs/nfs/write.c index c940e615f5dc..84b6818e5278 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -523,6 +523,17 @@ try_again: total_bytes = head->wb_bytes; for (subreq = head->wb_this_page; subreq != head; subreq = subreq->wb_this_page) { + if (!nfs_lock_request(subreq)) { + /* releases page group bit lock and + * inode spin lock and all references */ + ret = nfs_unroll_locks_and_wait(inode, head, + subreq); + + if (ret == 0) + goto try_again; + + return ERR_PTR(ret); + } /* * Subrequests are always contiguous, non overlapping * and in order - but may be repeated (mirrored writes). @@ -533,21 +544,10 @@ try_again: } else if (WARN_ON_ONCE(subreq->wb_offset < head->wb_offset || ((subreq->wb_offset + subreq->wb_bytes) > (head->wb_offset + total_bytes)))) { + nfs_unlock_request(subreq); nfs_unroll_locks_and_wait(inode, head, subreq); return ERR_PTR(-EIO); } - - if (!nfs_lock_request(subreq)) { - /* releases page group bit lock and - * inode spin lock and all references */ - ret = nfs_unroll_locks_and_wait(inode, head, - subreq); - - if (ret == 0) - goto try_again; - - return ERR_PTR(ret); - } } /* Now that all requests are locked, make sure they aren't on any list. -- cgit v1.2.3 From 31a01f093edbc687e783a4c8adcd76d3cc91a559 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 18 Jul 2017 19:18:49 -0400 Subject: NFS: Don't unlock writebacks before declaring PG_WB_END We don't want nfs_lock_and_join_requests() to start fiddling with the request before the call to nfs_page_group_sync_on_bit(). Signed-off-by: Trond Myklebust --- fs/nfs/write.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'fs/nfs/write.c') diff --git a/fs/nfs/write.c b/fs/nfs/write.c index 84b6818e5278..bb38c881fc48 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -335,8 +335,11 @@ static void nfs_end_page_writeback(struct nfs_page *req) { struct inode *inode = page_file_mapping(req->wb_page)->host; struct nfs_server *nfss = NFS_SERVER(inode); + bool is_done; - if (!nfs_page_group_sync_on_bit(req, PG_WB_END)) + is_done = nfs_page_group_sync_on_bit(req, PG_WB_END); + nfs_unlock_request(req); + if (!is_done) return; end_page_writeback(req->wb_page); @@ -596,7 +599,6 @@ try_again: static void nfs_write_error_remove_page(struct nfs_page *req) { - nfs_unlock_request(req); nfs_end_page_writeback(req); generic_error_remove_page(page_file_mapping(req->wb_page), req->wb_page); @@ -1019,7 +1021,6 @@ static void nfs_write_completion(struct nfs_pgio_header *hdr) remove_req: nfs_inode_remove_request(req); next: - nfs_unlock_request(req); nfs_end_page_writeback(req); nfs_release_request(req); } @@ -1406,7 +1407,6 @@ static void nfs_redirty_request(struct nfs_page *req) { nfs_mark_request_dirty(req); set_bit(NFS_CONTEXT_RESEND_WRITES, &req->wb_context->flags); - nfs_unlock_request(req); nfs_end_page_writeback(req); nfs_release_request(req); } -- cgit v1.2.3 From b66aaa8dfeda7b5c7df513cf3b36e1290fa84055 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 18 Jul 2017 15:22:12 -0400 Subject: NFS: Fix the inode request accounting when pages have subrequests Both nfs_destroy_unlinked_subrequests() and nfs_lock_and_join_requests() manipulate the inode flags adjusting the NFS_I(inode)->nrequests. Signed-off-by: Trond Myklebust --- fs/nfs/write.c | 27 +++++++++++++++------------ 1 file changed, 15 insertions(+), 12 deletions(-) (limited to 'fs/nfs/write.c') diff --git a/fs/nfs/write.c b/fs/nfs/write.c index bb38c881fc48..ee981353d4aa 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -418,7 +418,8 @@ nfs_unroll_locks_and_wait(struct inode *inode, struct nfs_page *head, */ static void nfs_destroy_unlinked_subrequests(struct nfs_page *destroy_list, - struct nfs_page *old_head) + struct nfs_page *old_head, + struct inode *inode) { while (destroy_list) { struct nfs_page *subreq = destroy_list; @@ -443,9 +444,12 @@ nfs_destroy_unlinked_subrequests(struct nfs_page *destroy_list, nfs_page_group_clear_bits(subreq); /* release the PG_INODE_REF reference */ - if (test_and_clear_bit(PG_INODE_REF, &subreq->wb_flags)) + if (test_and_clear_bit(PG_INODE_REF, &subreq->wb_flags)) { nfs_release_request(subreq); - else + spin_lock(&inode->i_lock); + NFS_I(inode)->nrequests--; + spin_unlock(&inode->i_lock); + } else WARN_ON_ONCE(1); } else { WARN_ON_ONCE(test_bit(PG_CLEAN, &subreq->wb_flags)); @@ -572,25 +576,24 @@ try_again: head->wb_bytes = total_bytes; } + /* Postpone destruction of this request */ + if (test_and_clear_bit(PG_REMOVE, &head->wb_flags)) { + set_bit(PG_INODE_REF, &head->wb_flags); + kref_get(&head->wb_kref); + NFS_I(inode)->nrequests++; + } + /* * prepare head request to be added to new pgio descriptor */ nfs_page_group_clear_bits(head); - /* - * some part of the group was still on the inode list - otherwise - * the group wouldn't be involved in async write. - * grab a reference for the head request, iff it needs one. - */ - if (!test_and_set_bit(PG_INODE_REF, &head->wb_flags)) - kref_get(&head->wb_kref); - nfs_page_group_unlock(head); /* drop lock to clean uprequests on destroy list */ spin_unlock(&inode->i_lock); - nfs_destroy_unlinked_subrequests(destroy_list, head); + nfs_destroy_unlinked_subrequests(destroy_list, head, inode); /* still holds ref on head from nfs_page_find_head_request_locked * and still has lock on head from lock loop */ -- cgit v1.2.3 From f6032f216fca8a1fa7f43a652f26cdf633183745 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Mon, 17 Jul 2017 19:50:23 -0400 Subject: NFS: Teach nfs_try_to_update_request() to deal with request page_groups Simplify the code, and avoid some flushes to disk. Signed-off-by: Trond Myklebust --- fs/nfs/write.c | 60 ++++++++++++++++++++-------------------------------------- 1 file changed, 20 insertions(+), 40 deletions(-) (limited to 'fs/nfs/write.c') diff --git a/fs/nfs/write.c b/fs/nfs/write.c index ee981353d4aa..0b4d1ef168e0 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -1107,39 +1107,19 @@ static struct nfs_page *nfs_try_to_update_request(struct inode *inode, end = offset + bytes; - for (;;) { - if (!(PagePrivate(page) || PageSwapCache(page))) - return NULL; - spin_lock(&inode->i_lock); - req = nfs_page_find_head_request_locked(NFS_I(inode), page); - if (req == NULL) - goto out_unlock; - - /* should be handled by nfs_flush_incompatible */ - WARN_ON_ONCE(req->wb_head != req); - WARN_ON_ONCE(req->wb_this_page != req); - - rqend = req->wb_offset + req->wb_bytes; - /* - * Tell the caller to flush out the request if - * the offsets are non-contiguous. - * Note: nfs_flush_incompatible() will already - * have flushed out requests having wrong owners. - */ - if (offset > rqend - || end < req->wb_offset) - goto out_flushme; - - if (nfs_lock_request(req)) - break; + req = nfs_lock_and_join_requests(page); + if (IS_ERR_OR_NULL(req)) + return req; - /* The request is locked, so wait and then retry */ - spin_unlock(&inode->i_lock); - error = nfs_wait_on_request(req); - nfs_release_request(req); - if (error != 0) - goto out_err; - } + rqend = req->wb_offset + req->wb_bytes; + /* + * Tell the caller to flush out the request if + * the offsets are non-contiguous. + * Note: nfs_flush_incompatible() will already + * have flushed out requests having wrong owners. + */ + if (offset > rqend || end < req->wb_offset) + goto out_flushme; /* Okay, the request matches. Update the region */ if (offset < req->wb_offset) { @@ -1150,17 +1130,17 @@ static struct nfs_page *nfs_try_to_update_request(struct inode *inode, req->wb_bytes = end - req->wb_offset; else req->wb_bytes = rqend - req->wb_offset; -out_unlock: - if (req) - nfs_clear_request_commit(req); - spin_unlock(&inode->i_lock); return req; out_flushme: - spin_unlock(&inode->i_lock); - nfs_release_request(req); + /* + * Note: we mark the request dirty here because + * nfs_lock_and_join_requests() cannot preserve + * commit flags, so we have to replay the write. + */ + nfs_mark_request_dirty(req); + nfs_unlock_and_release_request(req); error = nfs_wb_page(inode, page); -out_err: - return ERR_PTR(error); + return (error < 0) ? ERR_PTR(error) : NULL; } /* -- cgit v1.2.3 From 7e6cca6caf7230b049bd681c5400b01c365ee452 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Mon, 17 Jul 2017 20:00:46 -0400 Subject: NFS: Remove page group limit in nfs_flush_incompatible() nfs_try_to_update_request() should be able to cope now. Signed-off-by: Trond Myklebust --- fs/nfs/write.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'fs/nfs/write.c') diff --git a/fs/nfs/write.c b/fs/nfs/write.c index 0b4d1ef168e0..08c1ce968cce 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -1205,8 +1205,6 @@ int nfs_flush_incompatible(struct file *file, struct page *page) l_ctx = req->wb_lock_context; do_flush = req->wb_page != page || !nfs_match_open_context(req->wb_context, ctx); - /* for now, flush if more than 1 request in page_group */ - do_flush |= req->wb_this_page != req; if (l_ctx && flctx && !(list_empty_careful(&flctx->flc_posix) && list_empty_careful(&flctx->flc_flock))) { -- cgit v1.2.3 From b5bab9bf91324a7fe21b365d6966cfd087d08e3a Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Mon, 17 Jul 2017 10:34:21 -0400 Subject: NFS: Reduce inode->i_lock contention in nfs_lock_and_join_requests() We should no longer need the inode->i_lock, now that we've straightened out the request locking. The locking schema is now: 1) Lock page head request 2) Lock the page group 3) Lock the subrequests one by one Note that there is a subtle race with nfs_inode_remove_request() due to the fact that the latter does not lock the page head, when removing it from the struct page. Only the last subrequest is locked, hence we need to re-check that the PagePrivate(page) is still set after we've locked all the subrequests. Signed-off-by: Trond Myklebust --- fs/nfs/write.c | 40 ++++++++++++++++++++++------------------ 1 file changed, 22 insertions(+), 18 deletions(-) (limited to 'fs/nfs/write.c') diff --git a/fs/nfs/write.c b/fs/nfs/write.c index 08c1ce968cce..ff7c90c7ff79 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -372,15 +372,14 @@ nfs_page_group_clear_bits(struct nfs_page *req) * @head - head request of page group, must be holding head lock * @req - request that couldn't lock and needs to wait on the req bit lock * - * NOTE: this must be called holding page_group bit lock and inode spin lock - * and BOTH will be released before returning. + * NOTE: this must be called holding page_group bit lock + * which will be released before returning. * * returns 0 on success, < 0 on error. */ static int nfs_unroll_locks_and_wait(struct inode *inode, struct nfs_page *head, struct nfs_page *req) - __releases(&inode->i_lock) { struct nfs_page *tmp; int ret; @@ -395,7 +394,6 @@ nfs_unroll_locks_and_wait(struct inode *inode, struct nfs_page *head, kref_get(&req->wb_kref); nfs_page_group_unlock(head); - spin_unlock(&inode->i_lock); /* release ref from nfs_page_find_head_request_locked */ nfs_unlock_and_release_request(head); @@ -491,8 +489,9 @@ nfs_lock_and_join_requests(struct page *page) int ret; try_again: + if (!(PagePrivate(page) || PageSwapCache(page))) + return NULL; spin_lock(&inode->i_lock); - /* * A reference is taken only on the head request which acts as a * reference to the whole page group - the group will not be destroyed @@ -514,16 +513,12 @@ try_again: return ERR_PTR(ret); goto try_again; } + spin_unlock(&inode->i_lock); - /* holding inode lock, so always make a non-blocking call to try the - * page group lock */ - ret = nfs_page_group_lock(head, true); + ret = nfs_page_group_lock(head, false); if (ret < 0) { - spin_unlock(&inode->i_lock); - - nfs_page_group_lock_wait(head); nfs_unlock_and_release_request(head); - goto try_again; + return ERR_PTR(ret); } /* lock each request in the page group */ @@ -531,8 +526,10 @@ try_again: for (subreq = head->wb_this_page; subreq != head; subreq = subreq->wb_this_page) { if (!nfs_lock_request(subreq)) { - /* releases page group bit lock and - * inode spin lock and all references */ + /* + * releases page group bit lock and + * page locks and all references + */ ret = nfs_unroll_locks_and_wait(inode, head, subreq); @@ -580,7 +577,9 @@ try_again: if (test_and_clear_bit(PG_REMOVE, &head->wb_flags)) { set_bit(PG_INODE_REF, &head->wb_flags); kref_get(&head->wb_kref); + spin_lock(&inode->i_lock); NFS_I(inode)->nrequests++; + spin_unlock(&inode->i_lock); } /* @@ -590,11 +589,14 @@ try_again: nfs_page_group_unlock(head); - /* drop lock to clean uprequests on destroy list */ - spin_unlock(&inode->i_lock); - nfs_destroy_unlinked_subrequests(destroy_list, head, inode); + /* Did we lose a race with nfs_inode_remove_request()? */ + if (!(PagePrivate(page) || PageSwapCache(page))) { + nfs_unlock_and_release_request(head); + return NULL; + } + /* still holds ref on head from nfs_page_find_head_request_locked * and still has lock on head from lock loop */ return head; @@ -968,7 +970,7 @@ nfs_clear_page_commit(struct page *page) WB_RECLAIMABLE); } -/* Called holding inode (/cinfo) lock */ +/* Called holding the request lock on @req */ static void nfs_clear_request_commit(struct nfs_page *req) { @@ -977,9 +979,11 @@ nfs_clear_request_commit(struct nfs_page *req) struct nfs_commit_info cinfo; nfs_init_cinfo_from_inode(&cinfo, inode); + spin_lock(&inode->i_lock); if (!pnfs_clear_request_commit(req, &cinfo)) { nfs_request_remove_commit_list(req, &cinfo); } + spin_unlock(&inode->i_lock); nfs_clear_page_commit(req->wb_page); } } -- cgit v1.2.3 From 74a6d4b5ae4ec7e93c72a92decb2f8c16c812416 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Wed, 19 Jul 2017 08:23:10 -0400 Subject: NFS: Further optimise nfs_lock_and_join_requests() When locking the entire group in order to remove subrequests, the locks are always taken in order, and with the page group lock being taken after the page head is locked. The intention is that: 1) The lock on the group head guarantees that requests may not be removed from the group (although new entries could be appended if we're not holding the group lock). 2) It is safe to drop and retake the page group lock while iterating through the list, in particular when waiting for a subrequest lock. Signed-off-by: Trond Myklebust --- fs/nfs/write.c | 45 ++++++++++++++++++--------------------------- 1 file changed, 18 insertions(+), 27 deletions(-) (limited to 'fs/nfs/write.c') diff --git a/fs/nfs/write.c b/fs/nfs/write.c index ff7c90c7ff79..1ee5d89380d9 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -377,31 +377,17 @@ nfs_page_group_clear_bits(struct nfs_page *req) * * returns 0 on success, < 0 on error. */ -static int -nfs_unroll_locks_and_wait(struct inode *inode, struct nfs_page *head, +static void +nfs_unroll_locks(struct inode *inode, struct nfs_page *head, struct nfs_page *req) { struct nfs_page *tmp; - int ret; /* relinquish all the locks successfully grabbed this run */ for (tmp = head->wb_this_page ; tmp != req; tmp = tmp->wb_this_page) nfs_unlock_request(tmp); WARN_ON_ONCE(test_bit(PG_TEARDOWN, &req->wb_flags)); - - /* grab a ref on the request that will be waited on */ - kref_get(&req->wb_kref); - - nfs_page_group_unlock(head); - - /* release ref from nfs_page_find_head_request_locked */ - nfs_unlock_and_release_request(head); - - ret = nfs_wait_on_request(req); - nfs_release_request(req); - - return ret; } /* @@ -525,18 +511,21 @@ try_again: total_bytes = head->wb_bytes; for (subreq = head->wb_this_page; subreq != head; subreq = subreq->wb_this_page) { - if (!nfs_lock_request(subreq)) { + + while (!nfs_lock_request(subreq)) { /* - * releases page group bit lock and - * page locks and all references + * Unlock page to allow nfs_page_group_sync_on_bit() + * to succeed */ - ret = nfs_unroll_locks_and_wait(inode, head, - subreq); - - if (ret == 0) - goto try_again; - - return ERR_PTR(ret); + nfs_page_group_unlock(head); + ret = nfs_wait_on_request(subreq); + if (!ret) + ret = nfs_page_group_lock(head, false); + if (ret < 0) { + nfs_unroll_locks(inode, head, subreq); + nfs_unlock_and_release_request(head); + return ERR_PTR(ret); + } } /* * Subrequests are always contiguous, non overlapping @@ -549,7 +538,9 @@ try_again: ((subreq->wb_offset + subreq->wb_bytes) > (head->wb_offset + total_bytes)))) { nfs_unlock_request(subreq); - nfs_unroll_locks_and_wait(inode, head, subreq); + nfs_unroll_locks(inode, head, subreq); + nfs_page_group_unlock(head); + nfs_unlock_and_release_request(head); return ERR_PTR(-EIO); } } -- cgit v1.2.3 From 5b2b5187fa85665f0c47029ecaf49186ec138d9b Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Wed, 19 Jul 2017 10:06:36 -0400 Subject: NFS: Fix nfs_page_group_destroy() and nfs_lock_and_join_requests() race cases Since nfs_page_group_destroy() does not take any locks on the requests to be freed, we need to ensure that we don't inadvertently free the request in nfs_destroy_unlinked_subrequests() while the last reference is being released elsewhere. Do this by: 1) Taking a reference to the request unless it is already being freed 2) Checking (under the page group lock) if PG_TEARDOWN is already set before freeing an unreferenced request in nfs_destroy_unlinked_subrequests() Signed-off-by: Trond Myklebust --- fs/nfs/write.c | 58 +++++++++++++++++++++++++++++----------------------------- 1 file changed, 29 insertions(+), 29 deletions(-) (limited to 'fs/nfs/write.c') diff --git a/fs/nfs/write.c b/fs/nfs/write.c index 1ee5d89380d9..ffb9934607ef 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -384,10 +384,11 @@ nfs_unroll_locks(struct inode *inode, struct nfs_page *head, struct nfs_page *tmp; /* relinquish all the locks successfully grabbed this run */ - for (tmp = head->wb_this_page ; tmp != req; tmp = tmp->wb_this_page) - nfs_unlock_request(tmp); - - WARN_ON_ONCE(test_bit(PG_TEARDOWN, &req->wb_flags)); + for (tmp = head->wb_this_page ; tmp != req; tmp = tmp->wb_this_page) { + if (!kref_read(&tmp->wb_kref)) + continue; + nfs_unlock_and_release_request(tmp); + } } /* @@ -414,36 +415,32 @@ nfs_destroy_unlinked_subrequests(struct nfs_page *destroy_list, WARN_ON_ONCE(old_head != subreq->wb_head); /* make sure old group is not used */ - subreq->wb_head = subreq; subreq->wb_this_page = subreq; - /* subreq is now totally disconnected from page group or any - * write / commit lists. last chance to wake any waiters */ - nfs_unlock_request(subreq); - - if (!test_bit(PG_TEARDOWN, &subreq->wb_flags)) { - /* release ref on old head request */ - nfs_release_request(old_head); + /* Note: races with nfs_page_group_destroy() */ + if (!kref_read(&subreq->wb_kref)) { + bool freeme = test_bit(PG_TEARDOWN, &subreq->wb_flags); nfs_page_group_clear_bits(subreq); + /* Check if we raced with nfs_page_group_destroy() */ + if (freeme) + nfs_free_request(subreq); + continue; + } - /* release the PG_INODE_REF reference */ - if (test_and_clear_bit(PG_INODE_REF, &subreq->wb_flags)) { - nfs_release_request(subreq); - spin_lock(&inode->i_lock); - NFS_I(inode)->nrequests--; - spin_unlock(&inode->i_lock); - } else - WARN_ON_ONCE(1); - } else { - WARN_ON_ONCE(test_bit(PG_CLEAN, &subreq->wb_flags)); - /* zombie requests have already released the last - * reference and were waiting on the rest of the - * group to complete. Since it's no longer part of a - * group, simply free the request */ - nfs_page_group_clear_bits(subreq); - nfs_free_request(subreq); + subreq->wb_head = subreq; + + if (test_and_clear_bit(PG_INODE_REF, &subreq->wb_flags)) { + nfs_release_request(subreq); + spin_lock(&inode->i_lock); + NFS_I(inode)->nrequests--; + spin_unlock(&inode->i_lock); } + + nfs_page_group_clear_bits(subreq); + /* subreq is now totally disconnected from page group or any + * write / commit lists. last chance to wake any waiters */ + nfs_unlock_and_release_request(subreq); } } @@ -512,6 +509,8 @@ try_again: for (subreq = head->wb_this_page; subreq != head; subreq = subreq->wb_this_page) { + if (!kref_get_unless_zero(&subreq->wb_kref)) + continue; while (!nfs_lock_request(subreq)) { /* * Unlock page to allow nfs_page_group_sync_on_bit() @@ -523,6 +522,7 @@ try_again: ret = nfs_page_group_lock(head, false); if (ret < 0) { nfs_unroll_locks(inode, head, subreq); + nfs_release_request(subreq); nfs_unlock_and_release_request(head); return ERR_PTR(ret); } @@ -537,8 +537,8 @@ try_again: } else if (WARN_ON_ONCE(subreq->wb_offset < head->wb_offset || ((subreq->wb_offset + subreq->wb_bytes) > (head->wb_offset + total_bytes)))) { - nfs_unlock_request(subreq); nfs_unroll_locks(inode, head, subreq); + nfs_unlock_and_release_request(subreq); nfs_page_group_unlock(head); nfs_unlock_and_release_request(head); return ERR_PTR(-EIO); -- cgit v1.2.3 From 902a4c00462a755fe4a6ca655813c8b2a51fab4c Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Wed, 19 Jul 2017 13:50:07 -0400 Subject: NFS: Remove nfs_page_group_clear_bits() At this point, we only expect ever to potentially see PG_REMOVE and PG_TEARDOWN being set on the subrequests. Signed-off-by: Trond Myklebust --- fs/nfs/write.c | 29 +++-------------------------- 1 file changed, 3 insertions(+), 26 deletions(-) (limited to 'fs/nfs/write.c') diff --git a/fs/nfs/write.c b/fs/nfs/write.c index ffb9934607ef..20d44ea328b6 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -347,22 +347,6 @@ static void nfs_end_page_writeback(struct nfs_page *req) clear_bdi_congested(inode_to_bdi(inode), BLK_RW_ASYNC); } - -/* nfs_page_group_clear_bits - * @req - an nfs request - * clears all page group related bits from @req - */ -static void -nfs_page_group_clear_bits(struct nfs_page *req) -{ - clear_bit(PG_TEARDOWN, &req->wb_flags); - clear_bit(PG_UNLOCKPAGE, &req->wb_flags); - clear_bit(PG_UPTODATE, &req->wb_flags); - clear_bit(PG_WB_END, &req->wb_flags); - clear_bit(PG_REMOVE, &req->wb_flags); -} - - /* * nfs_unroll_locks_and_wait - unlock all newly locked reqs and wait on @req * @@ -417,13 +401,12 @@ nfs_destroy_unlinked_subrequests(struct nfs_page *destroy_list, /* make sure old group is not used */ subreq->wb_this_page = subreq; + clear_bit(PG_REMOVE, &subreq->wb_flags); + /* Note: races with nfs_page_group_destroy() */ if (!kref_read(&subreq->wb_kref)) { - bool freeme = test_bit(PG_TEARDOWN, &subreq->wb_flags); - - nfs_page_group_clear_bits(subreq); /* Check if we raced with nfs_page_group_destroy() */ - if (freeme) + if (test_and_clear_bit(PG_TEARDOWN, &subreq->wb_flags)) nfs_free_request(subreq); continue; } @@ -437,7 +420,6 @@ nfs_destroy_unlinked_subrequests(struct nfs_page *destroy_list, spin_unlock(&inode->i_lock); } - nfs_page_group_clear_bits(subreq); /* subreq is now totally disconnected from page group or any * write / commit lists. last chance to wake any waiters */ nfs_unlock_and_release_request(subreq); @@ -573,11 +555,6 @@ try_again: spin_unlock(&inode->i_lock); } - /* - * prepare head request to be added to new pgio descriptor - */ - nfs_page_group_clear_bits(head); - nfs_page_group_unlock(head); nfs_destroy_unlinked_subrequests(destroy_list, head, inode); -- cgit v1.2.3 From 1344b7ea172b4911a8ee8a6ff26c5bc6b5abb302 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Mon, 17 Jul 2017 10:54:14 -0400 Subject: NFS: Remove unused parameter from nfs_page_group_lock() nfs_page_group_lock() is now always called with the 'nonblock' parameter set to 'false'. Signed-off-by: Trond Myklebust --- fs/nfs/pagelist.c | 31 +++++++++++-------------------- fs/nfs/write.c | 6 +++--- include/linux/nfs_page.h | 2 +- 3 files changed, 15 insertions(+), 24 deletions(-) (limited to 'fs/nfs/write.c') diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c index ced7974622dd..af6731dd4324 100644 --- a/fs/nfs/pagelist.c +++ b/fs/nfs/pagelist.c @@ -134,19 +134,14 @@ EXPORT_SYMBOL_GPL(nfs_async_iocounter_wait); /* * nfs_page_group_lock - lock the head of the page group * @req - request in group that is to be locked - * @nonblock - if true don't block waiting for lock * - * this lock must be held if modifying the page group list + * this lock must be held when traversing or modifying the page + * group list * - * return 0 on success, < 0 on error: -EDELAY if nonblocking or the - * result from wait_on_bit_lock - * - * NOTE: calling with nonblock=false should always have set the - * lock bit (see fs/buffer.c and other uses of wait_on_bit_lock - * with TASK_UNINTERRUPTIBLE), so there is no need to check the result. + * return 0 on success, < 0 on error */ int -nfs_page_group_lock(struct nfs_page *req, bool nonblock) +nfs_page_group_lock(struct nfs_page *req) { struct nfs_page *head = req->wb_head; @@ -155,14 +150,10 @@ nfs_page_group_lock(struct nfs_page *req, bool nonblock) if (!test_and_set_bit(PG_HEADLOCK, &head->wb_flags)) return 0; - if (!nonblock) { - set_bit(PG_CONTENDED1, &head->wb_flags); - smp_mb__after_atomic(); - return wait_on_bit_lock(&head->wb_flags, PG_HEADLOCK, + set_bit(PG_CONTENDED1, &head->wb_flags); + smp_mb__after_atomic(); + return wait_on_bit_lock(&head->wb_flags, PG_HEADLOCK, TASK_UNINTERRUPTIBLE); - } - - return -EAGAIN; } /* @@ -225,7 +216,7 @@ bool nfs_page_group_sync_on_bit(struct nfs_page *req, unsigned int bit) { bool ret; - nfs_page_group_lock(req, false); + nfs_page_group_lock(req); ret = nfs_page_group_sync_on_bit_locked(req, bit); nfs_page_group_unlock(req); @@ -1016,7 +1007,7 @@ static int __nfs_pageio_add_request(struct nfs_pageio_descriptor *desc, unsigned int bytes_left = 0; unsigned int offset, pgbase; - nfs_page_group_lock(req, false); + nfs_page_group_lock(req); subreq = req; bytes_left = subreq->wb_bytes; @@ -1038,7 +1029,7 @@ static int __nfs_pageio_add_request(struct nfs_pageio_descriptor *desc, if (mirror->pg_recoalesce) return 0; /* retry add_request for this subreq */ - nfs_page_group_lock(req, false); + nfs_page_group_lock(req); continue; } @@ -1135,7 +1126,7 @@ int nfs_pageio_add_request(struct nfs_pageio_descriptor *desc, for (midx = 0; midx < desc->pg_mirror_count; midx++) { if (midx) { - nfs_page_group_lock(req, false); + nfs_page_group_lock(req); /* find the last request */ for (lastreq = req->wb_head; diff --git a/fs/nfs/write.c b/fs/nfs/write.c index 20d44ea328b6..0f418d825185 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -271,7 +271,7 @@ static bool nfs_page_group_covers_page(struct nfs_page *req) unsigned int pos = 0; unsigned int len = nfs_page_length(req->wb_page); - nfs_page_group_lock(req, false); + nfs_page_group_lock(req); do { tmp = nfs_page_group_search_locked(req->wb_head, pos); @@ -480,7 +480,7 @@ try_again: } spin_unlock(&inode->i_lock); - ret = nfs_page_group_lock(head, false); + ret = nfs_page_group_lock(head); if (ret < 0) { nfs_unlock_and_release_request(head); return ERR_PTR(ret); @@ -501,7 +501,7 @@ try_again: nfs_page_group_unlock(head); ret = nfs_wait_on_request(subreq); if (!ret) - ret = nfs_page_group_lock(head, false); + ret = nfs_page_group_lock(head); if (ret < 0) { nfs_unroll_locks(inode, head, subreq); nfs_release_request(subreq); diff --git a/include/linux/nfs_page.h b/include/linux/nfs_page.h index de1d24cedaa2..2f4fdafb6746 100644 --- a/include/linux/nfs_page.h +++ b/include/linux/nfs_page.h @@ -139,7 +139,7 @@ extern size_t nfs_generic_pg_test(struct nfs_pageio_descriptor *desc, extern int nfs_wait_on_request(struct nfs_page *); extern void nfs_unlock_request(struct nfs_page *req); extern void nfs_unlock_and_release_request(struct nfs_page *); -extern int nfs_page_group_lock(struct nfs_page *, bool); +extern int nfs_page_group_lock(struct nfs_page *); extern void nfs_page_group_unlock(struct nfs_page *); extern bool nfs_page_group_sync_on_bit(struct nfs_page *, unsigned int); extern bool nfs_async_iocounter_wait(struct rpc_task *, struct nfs_lock_context *); -- cgit v1.2.3 From 7e8a30f8b497315a6467d86c82f6cc8acaa9db61 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Mon, 17 Jul 2017 16:58:07 -0400 Subject: NFS: Fix up nfs_page_group_covers_page() Fix up the test in nfs_page_group_covers_page(). The simplest implementation is to check that we have a set of intersecting or contiguous subrequests that connect page offset 0 to nfs_page_length(req->wb_page). Signed-off-by: Trond Myklebust --- fs/nfs/write.c | 18 ++++++------------ 1 file changed, 6 insertions(+), 12 deletions(-) (limited to 'fs/nfs/write.c') diff --git a/fs/nfs/write.c b/fs/nfs/write.c index 0f418d825185..759e37d26acf 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -243,9 +243,6 @@ nfs_page_group_search_locked(struct nfs_page *head, unsigned int page_offset) { struct nfs_page *req; - WARN_ON_ONCE(head != head->wb_head); - WARN_ON_ONCE(!test_bit(PG_HEADLOCK, &head->wb_head->wb_flags)); - req = head; do { if (page_offset >= req->wb_pgbase && @@ -273,18 +270,15 @@ static bool nfs_page_group_covers_page(struct nfs_page *req) nfs_page_group_lock(req); - do { + for (;;) { tmp = nfs_page_group_search_locked(req->wb_head, pos); - if (tmp) { - /* no way this should happen */ - WARN_ON_ONCE(tmp->wb_pgbase != pos); - pos += tmp->wb_bytes - (pos - tmp->wb_pgbase); - } - } while (tmp && pos < len); + if (!tmp) + break; + pos = tmp->wb_pgbase + tmp->wb_bytes; + } nfs_page_group_unlock(req); - WARN_ON_ONCE(pos > len); - return pos == len; + return pos >= len; } /* We can set the PG_uptodate flag if we see that a write request -- cgit v1.2.3 From bd37d6fce184836bd5e7cd90ce40116a4fadaf2a Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 1 Aug 2017 12:06:30 -0400 Subject: NFSv4: Convert nfs_lock_and_join_requests() to use nfs_page_find_head_request() Hide the locking from nfs_lock_and_join_requests() so that we can separate out the requirements for swapcache pages. Signed-off-by: Trond Myklebust --- fs/nfs/write.c | 35 ++++++++++++++++++++--------------- 1 file changed, 20 insertions(+), 15 deletions(-) (limited to 'fs/nfs/write.c') diff --git a/fs/nfs/write.c b/fs/nfs/write.c index 759e37d26acf..a06167e20b72 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -154,6 +154,14 @@ static void nfs_context_set_write_error(struct nfs_open_context *ctx, int error) set_bit(NFS_CONTEXT_ERROR_WRITE, &ctx->flags); } +static struct nfs_page * +nfs_page_private_request(struct page *page) +{ + if (!PagePrivate(page)) + return NULL; + return (struct nfs_page *)page_private(page); +} + /* * nfs_page_find_head_request_locked - find head request associated with @page * @@ -164,11 +172,10 @@ static void nfs_context_set_write_error(struct nfs_open_context *ctx, int error) static struct nfs_page * nfs_page_find_head_request_locked(struct nfs_inode *nfsi, struct page *page) { - struct nfs_page *req = NULL; + struct nfs_page *req; - if (PagePrivate(page)) - req = (struct nfs_page *)page_private(page); - else if (unlikely(PageSwapCache(page))) + req = nfs_page_private_request(page); + if (!req && unlikely(PageSwapCache(page))) req = nfs_page_search_commits_for_head_request_locked(nfsi, page); @@ -448,31 +455,29 @@ nfs_lock_and_join_requests(struct page *page) int ret; try_again: - if (!(PagePrivate(page) || PageSwapCache(page))) - return NULL; - spin_lock(&inode->i_lock); /* * A reference is taken only on the head request which acts as a * reference to the whole page group - the group will not be destroyed * until the head reference is released. */ - head = nfs_page_find_head_request_locked(NFS_I(inode), page); - - if (!head) { - spin_unlock(&inode->i_lock); + head = nfs_page_find_head_request(page); + if (!head) return NULL; - } /* lock the page head first in order to avoid an ABBA inefficiency */ if (!nfs_lock_request(head)) { - spin_unlock(&inode->i_lock); ret = nfs_wait_on_request(head); nfs_release_request(head); if (ret < 0) return ERR_PTR(ret); goto try_again; } - spin_unlock(&inode->i_lock); + + /* Ensure that nobody removed the request before we locked it */ + if (head != nfs_page_private_request(page) && !PageSwapCache(page)) { + nfs_unlock_and_release_request(head); + goto try_again; + } ret = nfs_page_group_lock(head); if (ret < 0) { @@ -559,7 +564,7 @@ try_again: return NULL; } - /* still holds ref on head from nfs_page_find_head_request_locked + /* still holds ref on head from nfs_page_find_head_request * and still has lock on head from lock loop */ return head; } -- cgit v1.2.3 From b30d2f04c35d539bf8003b3e014c389abefc249b Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 1 Aug 2017 12:26:53 -0400 Subject: NFS: Refactor nfs_page_find_head_request() Split out the 2 cases so that we can treat the locking differently. The issue is that the locking in the pageswapcache cache is highly linked to the commit list locking. Signed-off-by: Trond Myklebust --- fs/nfs/write.c | 42 ++++++++++++++++++++++++++++++------------ 1 file changed, 30 insertions(+), 12 deletions(-) (limited to 'fs/nfs/write.c') diff --git a/fs/nfs/write.c b/fs/nfs/write.c index a06167e20b72..8d8fa6d4cfcc 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -170,20 +170,41 @@ nfs_page_private_request(struct page *page) * returns matching head request with reference held, or NULL if not found. */ static struct nfs_page * -nfs_page_find_head_request_locked(struct nfs_inode *nfsi, struct page *page) +nfs_page_find_private_request(struct page *page) { + struct inode *inode = page_file_mapping(page)->host; struct nfs_page *req; + if (!PagePrivate(page)) + return NULL; + spin_lock(&inode->i_lock); req = nfs_page_private_request(page); - if (!req && unlikely(PageSwapCache(page))) - req = nfs_page_search_commits_for_head_request_locked(nfsi, - page); - if (req) { WARN_ON_ONCE(req->wb_head != req); kref_get(&req->wb_kref); } + spin_unlock(&inode->i_lock); + return req; +} +static struct nfs_page * +nfs_page_find_swap_request(struct page *page) +{ + struct inode *inode = page_file_mapping(page)->host; + struct nfs_inode *nfsi = NFS_I(inode); + struct nfs_page *req = NULL; + if (!PageSwapCache(page)) + return NULL; + spin_lock(&inode->i_lock); + if (PageSwapCache(page)) { + req = nfs_page_search_commits_for_head_request_locked(nfsi, + page); + if (req) { + WARN_ON_ONCE(req->wb_head != req); + kref_get(&req->wb_kref); + } + } + spin_unlock(&inode->i_lock); return req; } @@ -194,14 +215,11 @@ nfs_page_find_head_request_locked(struct nfs_inode *nfsi, struct page *page) */ static struct nfs_page *nfs_page_find_head_request(struct page *page) { - struct inode *inode = page_file_mapping(page)->host; - struct nfs_page *req = NULL; + struct nfs_page *req; - if (PagePrivate(page) || PageSwapCache(page)) { - spin_lock(&inode->i_lock); - req = nfs_page_find_head_request_locked(NFS_I(inode), page); - spin_unlock(&inode->i_lock); - } + req = nfs_page_find_private_request(page); + if (!req) + req = nfs_page_find_swap_request(page); return req; } -- cgit v1.2.3 From e824f99adaaf1ed0e03eac8574599af6d992163d Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 1 Aug 2017 11:53:49 -0400 Subject: NFSv4: Use a mutex to protect the per-inode commit lists The commit lists can get very large, so using the inode->i_lock can end up affecting general metadata performance. Signed-off-by: Trond Myklebust --- fs/nfs/direct.c | 4 ++-- fs/nfs/inode.c | 1 + fs/nfs/pnfs_nfs.c | 15 +++++++-------- fs/nfs/write.c | 24 ++++++++++++------------ include/linux/nfs_fs.h | 1 + 5 files changed, 23 insertions(+), 22 deletions(-) (limited to 'fs/nfs/write.c') diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c index 6fb9fad2d1e6..d2972d537469 100644 --- a/fs/nfs/direct.c +++ b/fs/nfs/direct.c @@ -616,13 +616,13 @@ nfs_direct_write_scan_commit_list(struct inode *inode, struct list_head *list, struct nfs_commit_info *cinfo) { - spin_lock(&cinfo->inode->i_lock); + mutex_lock(&NFS_I(cinfo->inode)->commit_mutex); #ifdef CONFIG_NFS_V4_1 if (cinfo->ds != NULL && cinfo->ds->nwritten != 0) NFS_SERVER(inode)->pnfs_curr_ld->recover_commit_reqs(list, cinfo); #endif nfs_scan_commit_list(&cinfo->mds->list, list, cinfo, 0); - spin_unlock(&cinfo->inode->i_lock); + mutex_unlock(&NFS_I(cinfo->inode)->commit_mutex); } static void nfs_direct_write_reschedule(struct nfs_direct_req *dreq) diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index 109279d6d91b..34d9ebbc0dfd 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -2016,6 +2016,7 @@ static void init_once(void *foo) nfsi->commit_info.ncommit = 0; atomic_set(&nfsi->commit_info.rpcs_out, 0); init_rwsem(&nfsi->rmdir_sem); + mutex_init(&nfsi->commit_mutex); nfs4_init_once(nfsi); } diff --git a/fs/nfs/pnfs_nfs.c b/fs/nfs/pnfs_nfs.c index 25f28fa64c57..2cdee8ce2094 100644 --- a/fs/nfs/pnfs_nfs.c +++ b/fs/nfs/pnfs_nfs.c @@ -98,14 +98,13 @@ pnfs_generic_transfer_commit_list(struct list_head *src, struct list_head *dst, if (!nfs_lock_request(req)) continue; kref_get(&req->wb_kref); - if (cond_resched_lock(&cinfo->inode->i_lock)) - list_safe_reset_next(req, tmp, wb_list); nfs_request_remove_commit_list(req, cinfo); clear_bit(PG_COMMIT_TO_DS, &req->wb_flags); nfs_list_add_request(req, dst); ret++; if ((ret == max) && !cinfo->dreq) break; + cond_resched(); } return ret; } @@ -119,7 +118,7 @@ pnfs_generic_scan_ds_commit_list(struct pnfs_commit_bucket *bucket, struct list_head *dst = &bucket->committing; int ret; - lockdep_assert_held(&cinfo->inode->i_lock); + lockdep_assert_held(&NFS_I(cinfo->inode)->commit_mutex); ret = pnfs_generic_transfer_commit_list(src, dst, cinfo, max); if (ret) { cinfo->ds->nwritten -= ret; @@ -142,7 +141,7 @@ int pnfs_generic_scan_commit_lists(struct nfs_commit_info *cinfo, { int i, rv = 0, cnt; - lockdep_assert_held(&cinfo->inode->i_lock); + lockdep_assert_held(&NFS_I(cinfo->inode)->commit_mutex); for (i = 0; i < cinfo->ds->nbuckets && max != 0; i++) { cnt = pnfs_generic_scan_ds_commit_list(&cinfo->ds->buckets[i], cinfo, max); @@ -162,7 +161,7 @@ void pnfs_generic_recover_commit_reqs(struct list_head *dst, int nwritten; int i; - lockdep_assert_held(&cinfo->inode->i_lock); + lockdep_assert_held(&NFS_I(cinfo->inode)->commit_mutex); restart: for (i = 0, b = cinfo->ds->buckets; i < cinfo->ds->nbuckets; i++, b++) { nwritten = pnfs_generic_transfer_commit_list(&b->written, @@ -953,12 +952,12 @@ pnfs_layout_mark_request_commit(struct nfs_page *req, struct list_head *list; struct pnfs_commit_bucket *buckets; - spin_lock(&cinfo->inode->i_lock); + mutex_lock(&NFS_I(cinfo->inode)->commit_mutex); buckets = cinfo->ds->buckets; list = &buckets[ds_commit_idx].written; if (list_empty(list)) { if (!pnfs_is_valid_lseg(lseg)) { - spin_unlock(&cinfo->inode->i_lock); + mutex_unlock(&NFS_I(cinfo->inode)->commit_mutex); cinfo->completion_ops->resched_write(cinfo, req); return; } @@ -975,7 +974,7 @@ pnfs_layout_mark_request_commit(struct nfs_page *req, cinfo->ds->nwritten++; nfs_request_add_commit_list_locked(req, list, cinfo); - spin_unlock(&cinfo->inode->i_lock); + mutex_unlock(&NFS_I(cinfo->inode)->commit_mutex); nfs_mark_page_unstable(req->wb_page, cinfo); } EXPORT_SYMBOL_GPL(pnfs_layout_mark_request_commit); diff --git a/fs/nfs/write.c b/fs/nfs/write.c index 8d8fa6d4cfcc..5ab5ca24b48a 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -195,7 +195,7 @@ nfs_page_find_swap_request(struct page *page) struct nfs_page *req = NULL; if (!PageSwapCache(page)) return NULL; - spin_lock(&inode->i_lock); + mutex_lock(&nfsi->commit_mutex); if (PageSwapCache(page)) { req = nfs_page_search_commits_for_head_request_locked(nfsi, page); @@ -204,7 +204,7 @@ nfs_page_find_swap_request(struct page *page) kref_get(&req->wb_kref); } } - spin_unlock(&inode->i_lock); + mutex_unlock(&nfsi->commit_mutex); return req; } @@ -856,7 +856,8 @@ nfs_page_search_commits_for_head_request_locked(struct nfs_inode *nfsi, * number of outstanding requests requiring a commit as well as * the MM page stats. * - * The caller must hold cinfo->inode->i_lock, and the nfs_page lock. + * The caller must hold NFS_I(cinfo->inode)->commit_mutex, and the + * nfs_page lock. */ void nfs_request_add_commit_list_locked(struct nfs_page *req, struct list_head *dst, @@ -884,9 +885,9 @@ EXPORT_SYMBOL_GPL(nfs_request_add_commit_list_locked); void nfs_request_add_commit_list(struct nfs_page *req, struct nfs_commit_info *cinfo) { - spin_lock(&cinfo->inode->i_lock); + mutex_lock(&NFS_I(cinfo->inode)->commit_mutex); nfs_request_add_commit_list_locked(req, &cinfo->mds->list, cinfo); - spin_unlock(&cinfo->inode->i_lock); + mutex_unlock(&NFS_I(cinfo->inode)->commit_mutex); if (req->wb_page) nfs_mark_page_unstable(req->wb_page, cinfo); } @@ -964,11 +965,11 @@ nfs_clear_request_commit(struct nfs_page *req) struct nfs_commit_info cinfo; nfs_init_cinfo_from_inode(&cinfo, inode); - spin_lock(&inode->i_lock); + mutex_lock(&NFS_I(inode)->commit_mutex); if (!pnfs_clear_request_commit(req, &cinfo)) { nfs_request_remove_commit_list(req, &cinfo); } - spin_unlock(&inode->i_lock); + mutex_unlock(&NFS_I(inode)->commit_mutex); nfs_clear_page_commit(req->wb_page); } } @@ -1027,7 +1028,7 @@ nfs_reqs_to_commit(struct nfs_commit_info *cinfo) return cinfo->mds->ncommit; } -/* cinfo->inode->i_lock held by caller */ +/* NFS_I(cinfo->inode)->commit_mutex held by caller */ int nfs_scan_commit_list(struct list_head *src, struct list_head *dst, struct nfs_commit_info *cinfo, int max) @@ -1039,13 +1040,12 @@ nfs_scan_commit_list(struct list_head *src, struct list_head *dst, if (!nfs_lock_request(req)) continue; kref_get(&req->wb_kref); - if (cond_resched_lock(&cinfo->inode->i_lock)) - list_safe_reset_next(req, tmp, wb_list); nfs_request_remove_commit_list(req, cinfo); nfs_list_add_request(req, dst); ret++; if ((ret == max) && !cinfo->dreq) break; + cond_resched(); } return ret; } @@ -1065,7 +1065,7 @@ nfs_scan_commit(struct inode *inode, struct list_head *dst, { int ret = 0; - spin_lock(&cinfo->inode->i_lock); + mutex_lock(&NFS_I(cinfo->inode)->commit_mutex); if (cinfo->mds->ncommit > 0) { const int max = INT_MAX; @@ -1073,7 +1073,7 @@ nfs_scan_commit(struct inode *inode, struct list_head *dst, cinfo, max); ret += pnfs_scan_commit_lists(inode, cinfo, max - ret); } - spin_unlock(&cinfo->inode->i_lock); + mutex_unlock(&NFS_I(cinfo->inode)->commit_mutex); return ret; } diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h index 5cc91d6381a3..121a702888b4 100644 --- a/include/linux/nfs_fs.h +++ b/include/linux/nfs_fs.h @@ -163,6 +163,7 @@ struct nfs_inode { /* Readers: in-flight sillydelete RPC calls */ /* Writers: rmdir */ struct rw_semaphore rmdir_sem; + struct mutex commit_mutex; #if IS_ENABLED(CONFIG_NFS_V4) struct nfs4_cached_acl *nfs4_acl; -- cgit v1.2.3 From a6b6d5b85abf4914bbceade5dddd54c345c64136 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 1 Aug 2017 15:39:46 -0400 Subject: NFS: Use an atomic_long_t to count the number of requests Rather than forcing us to take the inode->i_lock just in order to bump the number. Signed-off-by: Trond Myklebust --- fs/nfs/callback_proc.c | 2 +- fs/nfs/delegation.c | 2 +- fs/nfs/inode.c | 7 +++---- fs/nfs/pagelist.c | 4 +--- fs/nfs/write.c | 18 +++++------------- include/linux/nfs_fs.h | 4 ++-- 6 files changed, 13 insertions(+), 24 deletions(-) (limited to 'fs/nfs/write.c') diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c index 5427cdf04c5a..14358de173fb 100644 --- a/fs/nfs/callback_proc.c +++ b/fs/nfs/callback_proc.c @@ -51,7 +51,7 @@ __be32 nfs4_callback_getattr(void *argp, void *resp, goto out_iput; res->size = i_size_read(inode); res->change_attr = delegation->change_attr; - if (nfsi->nrequests != 0) + if (nfs_have_writebacks(inode)) res->change_attr++; res->ctime = inode->i_ctime; res->mtime = inode->i_mtime; diff --git a/fs/nfs/delegation.c b/fs/nfs/delegation.c index d7df5e67b0c1..606dd3871f66 100644 --- a/fs/nfs/delegation.c +++ b/fs/nfs/delegation.c @@ -1089,7 +1089,7 @@ bool nfs4_delegation_flush_on_close(const struct inode *inode) delegation = rcu_dereference(nfsi->delegation); if (delegation == NULL || !(delegation->type & FMODE_WRITE)) goto out; - if (nfsi->nrequests < delegation->pagemod_limit) + if (atomic_long_read(&nfsi->nrequests) < delegation->pagemod_limit) ret = false; out: rcu_read_unlock(); diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index 34d9ebbc0dfd..0480eb02299a 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -1285,7 +1285,6 @@ static bool nfs_file_has_buffered_writers(struct nfs_inode *nfsi) static unsigned long nfs_wcc_update_inode(struct inode *inode, struct nfs_fattr *fattr) { - struct nfs_inode *nfsi = NFS_I(inode); unsigned long ret = 0; if ((fattr->valid & NFS_ATTR_FATTR_PRECHANGE) @@ -1315,7 +1314,7 @@ static unsigned long nfs_wcc_update_inode(struct inode *inode, struct nfs_fattr if ((fattr->valid & NFS_ATTR_FATTR_PRESIZE) && (fattr->valid & NFS_ATTR_FATTR_SIZE) && i_size_read(inode) == nfs_size_to_loff_t(fattr->pre_size) - && nfsi->nrequests == 0) { + && !nfs_have_writebacks(inode)) { i_size_write(inode, nfs_size_to_loff_t(fattr->size)); ret |= NFS_INO_INVALID_ATTR; } @@ -1823,7 +1822,7 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) if (new_isize != cur_isize) { /* Do we perhaps have any outstanding writes, or has * the file grown beyond our last write? */ - if (nfsi->nrequests == 0 || new_isize > cur_isize) { + if (!nfs_have_writebacks(inode) || new_isize > cur_isize) { i_size_write(inode, new_isize); if (!have_writers) invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA; @@ -2012,7 +2011,7 @@ static void init_once(void *foo) INIT_LIST_HEAD(&nfsi->access_cache_entry_lru); INIT_LIST_HEAD(&nfsi->access_cache_inode_lru); INIT_LIST_HEAD(&nfsi->commit_info.list); - nfsi->nrequests = 0; + atomic_long_set(&nfsi->nrequests, 0); nfsi->commit_info.ncommit = 0; atomic_set(&nfsi->commit_info.rpcs_out, 0); init_rwsem(&nfsi->rmdir_sem); diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c index af6731dd4324..ec97c301899b 100644 --- a/fs/nfs/pagelist.c +++ b/fs/nfs/pagelist.c @@ -258,9 +258,7 @@ nfs_page_group_init(struct nfs_page *req, struct nfs_page *prev) inode = page_file_mapping(req->wb_page)->host; set_bit(PG_INODE_REF, &req->wb_flags); kref_get(&req->wb_kref); - spin_lock(&inode->i_lock); - NFS_I(inode)->nrequests++; - spin_unlock(&inode->i_lock); + atomic_long_inc(&NFS_I(inode)->nrequests); } } } diff --git a/fs/nfs/write.c b/fs/nfs/write.c index 5ab5ca24b48a..08093552f115 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -434,9 +434,7 @@ nfs_destroy_unlinked_subrequests(struct nfs_page *destroy_list, if (test_and_clear_bit(PG_INODE_REF, &subreq->wb_flags)) { nfs_release_request(subreq); - spin_lock(&inode->i_lock); - NFS_I(inode)->nrequests--; - spin_unlock(&inode->i_lock); + atomic_long_dec(&NFS_I(inode)->nrequests); } /* subreq is now totally disconnected from page group or any @@ -567,9 +565,7 @@ try_again: if (test_and_clear_bit(PG_REMOVE, &head->wb_flags)) { set_bit(PG_INODE_REF, &head->wb_flags); kref_get(&head->wb_kref); - spin_lock(&inode->i_lock); - NFS_I(inode)->nrequests++; - spin_unlock(&inode->i_lock); + atomic_long_inc(&NFS_I(inode)->nrequests); } nfs_page_group_unlock(head); @@ -755,7 +751,7 @@ static void nfs_inode_add_request(struct inode *inode, struct nfs_page *req) nfs_lock_request(req); spin_lock(&inode->i_lock); - if (!nfsi->nrequests && + if (!nfs_have_writebacks(inode) && NFS_PROTO(inode)->have_delegation(inode, FMODE_WRITE)) inode->i_version++; /* @@ -767,7 +763,7 @@ static void nfs_inode_add_request(struct inode *inode, struct nfs_page *req) SetPagePrivate(req->wb_page); set_page_private(req->wb_page, (unsigned long)req); } - nfsi->nrequests++; + atomic_long_inc(&nfsi->nrequests); /* this a head request for a page group - mark it as having an * extra reference so sub groups can follow suit. * This flag also informs pgio layer when to bump nrequests when @@ -786,6 +782,7 @@ static void nfs_inode_remove_request(struct nfs_page *req) struct nfs_inode *nfsi = NFS_I(inode); struct nfs_page *head; + atomic_long_dec(&nfsi->nrequests); if (nfs_page_group_sync_on_bit(req, PG_REMOVE)) { head = req->wb_head; @@ -795,11 +792,6 @@ static void nfs_inode_remove_request(struct nfs_page *req) ClearPagePrivate(head->wb_page); clear_bit(PG_MAPPED, &head->wb_flags); } - nfsi->nrequests--; - spin_unlock(&inode->i_lock); - } else { - spin_lock(&inode->i_lock); - nfsi->nrequests--; spin_unlock(&inode->i_lock); } diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h index 121a702888b4..238fdc4c46df 100644 --- a/include/linux/nfs_fs.h +++ b/include/linux/nfs_fs.h @@ -154,7 +154,7 @@ struct nfs_inode { */ __be32 cookieverf[2]; - unsigned long nrequests; + atomic_long_t nrequests; struct nfs_mds_commit_info commit_info; /* Open contexts for shared mmap writes */ @@ -511,7 +511,7 @@ extern void nfs_commit_free(struct nfs_commit_data *data); static inline int nfs_have_writebacks(struct inode *inode) { - return NFS_I(inode)->nrequests != 0; + return atomic_long_read(&NFS_I(inode)->nrequests) != 0; } /* -- cgit v1.2.3 From 5cb953d4b1e70a09084f71594c45d47458346bc2 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 1 Aug 2017 17:04:12 -0400 Subject: NFS: Use an atomic_long_t to count the number of commits Signed-off-by: Trond Myklebust --- fs/nfs/inode.c | 2 +- fs/nfs/write.c | 12 +++++++----- include/linux/nfs_xdr.h | 2 +- 3 files changed, 9 insertions(+), 7 deletions(-) (limited to 'fs/nfs/write.c') diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index 0480eb02299a..134d9f560240 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -2012,7 +2012,7 @@ static void init_once(void *foo) INIT_LIST_HEAD(&nfsi->access_cache_inode_lru); INIT_LIST_HEAD(&nfsi->commit_info.list); atomic_long_set(&nfsi->nrequests, 0); - nfsi->commit_info.ncommit = 0; + atomic_long_set(&nfsi->commit_info.ncommit, 0); atomic_set(&nfsi->commit_info.rpcs_out, 0); init_rwsem(&nfsi->rmdir_sem); mutex_init(&nfsi->commit_mutex); diff --git a/fs/nfs/write.c b/fs/nfs/write.c index 08093552f115..12479c25028e 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -857,7 +857,7 @@ nfs_request_add_commit_list_locked(struct nfs_page *req, struct list_head *dst, { set_bit(PG_CLEAN, &req->wb_flags); nfs_list_add_request(req, dst); - cinfo->mds->ncommit++; + atomic_long_inc(&cinfo->mds->ncommit); } EXPORT_SYMBOL_GPL(nfs_request_add_commit_list_locked); @@ -903,7 +903,7 @@ nfs_request_remove_commit_list(struct nfs_page *req, if (!test_and_clear_bit(PG_CLEAN, &(req)->wb_flags)) return; nfs_list_remove_request(req); - cinfo->mds->ncommit--; + atomic_long_dec(&cinfo->mds->ncommit); } EXPORT_SYMBOL_GPL(nfs_request_remove_commit_list); @@ -1017,7 +1017,7 @@ out: unsigned long nfs_reqs_to_commit(struct nfs_commit_info *cinfo) { - return cinfo->mds->ncommit; + return atomic_long_read(&cinfo->mds->ncommit); } /* NFS_I(cinfo->inode)->commit_mutex held by caller */ @@ -1057,8 +1057,10 @@ nfs_scan_commit(struct inode *inode, struct list_head *dst, { int ret = 0; + if (!atomic_long_read(&cinfo->mds->ncommit)) + return 0; mutex_lock(&NFS_I(cinfo->inode)->commit_mutex); - if (cinfo->mds->ncommit > 0) { + if (atomic_long_read(&cinfo->mds->ncommit) > 0) { const int max = INT_MAX; ret = nfs_scan_commit_list(&cinfo->mds->list, dst, @@ -1890,7 +1892,7 @@ int nfs_write_inode(struct inode *inode, struct writeback_control *wbc) int ret = 0; /* no commits means nothing needs to be done */ - if (!nfsi->commit_info.ncommit) + if (!atomic_long_read(&nfsi->commit_info.ncommit)) return ret; if (wbc->sync_mode == WB_SYNC_NONE) { diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h index 62cbcb842f99..164d5359d4ab 100644 --- a/include/linux/nfs_xdr.h +++ b/include/linux/nfs_xdr.h @@ -1476,7 +1476,7 @@ struct nfs_pgio_header { struct nfs_mds_commit_info { atomic_t rpcs_out; - unsigned long ncommit; + atomic_long_t ncommit; struct list_head list; }; -- cgit v1.2.3 From 4b9bb25b36baa3e2e42b91e451bcd3acfe197a1d Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 1 Aug 2017 12:34:44 -0400 Subject: NFS: Switch to using mapping->private_lock for page writeback lookups. Switch from using the inode->i_lock for this to avoid contention with other metadata manipulation. Signed-off-by: Trond Myklebust --- fs/nfs/write.c | 27 ++++++++++++++++----------- 1 file changed, 16 insertions(+), 11 deletions(-) (limited to 'fs/nfs/write.c') diff --git a/fs/nfs/write.c b/fs/nfs/write.c index 12479c25028e..866702823869 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -172,18 +172,18 @@ nfs_page_private_request(struct page *page) static struct nfs_page * nfs_page_find_private_request(struct page *page) { - struct inode *inode = page_file_mapping(page)->host; + struct address_space *mapping = page_file_mapping(page); struct nfs_page *req; if (!PagePrivate(page)) return NULL; - spin_lock(&inode->i_lock); + spin_lock(&mapping->private_lock); req = nfs_page_private_request(page); if (req) { WARN_ON_ONCE(req->wb_head != req); kref_get(&req->wb_kref); } - spin_unlock(&inode->i_lock); + spin_unlock(&mapping->private_lock); return req; } @@ -743,6 +743,7 @@ out_err: */ static void nfs_inode_add_request(struct inode *inode, struct nfs_page *req) { + struct address_space *mapping = page_file_mapping(req->wb_page); struct nfs_inode *nfsi = NFS_I(inode); WARN_ON_ONCE(req->wb_this_page != req); @@ -750,19 +751,23 @@ static void nfs_inode_add_request(struct inode *inode, struct nfs_page *req) /* Lock the request! */ nfs_lock_request(req); - spin_lock(&inode->i_lock); - if (!nfs_have_writebacks(inode) && - NFS_PROTO(inode)->have_delegation(inode, FMODE_WRITE)) - inode->i_version++; /* * Swap-space should not get truncated. Hence no need to plug the race * with invalidate/truncate. */ + spin_lock(&mapping->private_lock); + if (!nfs_have_writebacks(inode) && + NFS_PROTO(inode)->have_delegation(inode, FMODE_WRITE)) { + spin_lock(&inode->i_lock); + inode->i_version++; + spin_unlock(&inode->i_lock); + } if (likely(!PageSwapCache(req->wb_page))) { set_bit(PG_MAPPED, &req->wb_flags); SetPagePrivate(req->wb_page); set_page_private(req->wb_page, (unsigned long)req); } + spin_unlock(&mapping->private_lock); atomic_long_inc(&nfsi->nrequests); /* this a head request for a page group - mark it as having an * extra reference so sub groups can follow suit. @@ -770,7 +775,6 @@ static void nfs_inode_add_request(struct inode *inode, struct nfs_page *req) * adding subrequests. */ WARN_ON(test_and_set_bit(PG_INODE_REF, &req->wb_flags)); kref_get(&req->wb_kref); - spin_unlock(&inode->i_lock); } /* @@ -778,7 +782,8 @@ static void nfs_inode_add_request(struct inode *inode, struct nfs_page *req) */ static void nfs_inode_remove_request(struct nfs_page *req) { - struct inode *inode = d_inode(req->wb_context->dentry); + struct address_space *mapping = page_file_mapping(req->wb_page); + struct inode *inode = mapping->host; struct nfs_inode *nfsi = NFS_I(inode); struct nfs_page *head; @@ -786,13 +791,13 @@ static void nfs_inode_remove_request(struct nfs_page *req) if (nfs_page_group_sync_on_bit(req, PG_REMOVE)) { head = req->wb_head; - spin_lock(&inode->i_lock); + spin_lock(&mapping->private_lock); if (likely(head->wb_page && !PageSwapCache(head->wb_page))) { set_page_private(head->wb_page, 0); ClearPagePrivate(head->wb_page); clear_bit(PG_MAPPED, &head->wb_flags); } - spin_unlock(&inode->i_lock); + spin_unlock(&mapping->private_lock); } if (test_and_clear_bit(PG_INODE_REF, &req->wb_flags)) -- cgit v1.2.3 From 2ce209c42c01ca976ad680fea52a8e8b9a53643b Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 1 Aug 2017 17:29:29 -0400 Subject: NFS: Wait for requests that are locked on the commit list If a request is on the commit list, but is locked, we will currently skip it, which can lead to livelocking when the commit count doesn't reduce to zero. Signed-off-by: Trond Myklebust --- fs/nfs/pagelist.c | 2 ++ fs/nfs/pnfs_nfs.c | 18 ++++++++++++++---- fs/nfs/write.c | 17 +++++++++++++---- 3 files changed, 29 insertions(+), 8 deletions(-) (limited to 'fs/nfs/write.c') diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c index ec97c301899b..548ebc7256ff 100644 --- a/fs/nfs/pagelist.c +++ b/fs/nfs/pagelist.c @@ -434,6 +434,7 @@ void nfs_release_request(struct nfs_page *req) { kref_put(&req->wb_kref, nfs_page_group_destroy); } +EXPORT_SYMBOL_GPL(nfs_release_request); /** * nfs_wait_on_request - Wait for a request to complete. @@ -452,6 +453,7 @@ nfs_wait_on_request(struct nfs_page *req) return wait_on_bit_io(&req->wb_flags, PG_BUSY, TASK_UNINTERRUPTIBLE); } +EXPORT_SYMBOL_GPL(nfs_wait_on_request); /* * nfs_generic_pg_test - determine if requests can be coalesced diff --git a/fs/nfs/pnfs_nfs.c b/fs/nfs/pnfs_nfs.c index 4b0a809653d1..303ff171cb5d 100644 --- a/fs/nfs/pnfs_nfs.c +++ b/fs/nfs/pnfs_nfs.c @@ -91,13 +91,23 @@ static int pnfs_generic_transfer_commit_list(struct list_head *src, struct list_head *dst, struct nfs_commit_info *cinfo, int max) { - struct nfs_page *req, *tmp; + struct nfs_page *req; int ret = 0; - list_for_each_entry_safe(req, tmp, src, wb_list) { - if (!nfs_lock_request(req)) - continue; + while(!list_empty(src)) { + req = list_first_entry(src, struct nfs_page, wb_list); + kref_get(&req->wb_kref); + if (!nfs_lock_request(req)) { + int status; + mutex_unlock(&NFS_I(cinfo->inode)->commit_mutex); + status = nfs_wait_on_request(req); + nfs_release_request(req); + mutex_lock(&NFS_I(cinfo->inode)->commit_mutex); + if (status < 0) + break; + continue; + } nfs_request_remove_commit_list(req, cinfo); clear_bit(PG_COMMIT_TO_DS, &req->wb_flags); nfs_list_add_request(req, dst); diff --git a/fs/nfs/write.c b/fs/nfs/write.c index 866702823869..5dd3b212376e 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -1030,13 +1030,22 @@ int nfs_scan_commit_list(struct list_head *src, struct list_head *dst, struct nfs_commit_info *cinfo, int max) { - struct nfs_page *req, *tmp; + struct nfs_page *req; int ret = 0; - list_for_each_entry_safe(req, tmp, src, wb_list) { - if (!nfs_lock_request(req)) - continue; + while(!list_empty(src)) { + req = list_first_entry(src, struct nfs_page, wb_list); kref_get(&req->wb_kref); + if (!nfs_lock_request(req)) { + int status; + mutex_unlock(&NFS_I(cinfo->inode)->commit_mutex); + status = nfs_wait_on_request(req); + nfs_release_request(req); + mutex_lock(&NFS_I(cinfo->inode)->commit_mutex); + if (status < 0) + break; + continue; + } nfs_request_remove_commit_list(req, cinfo); nfs_list_add_request(req, dst); ret++; -- cgit v1.2.3 From 3bde7afdabe9f37974af806abe646c2ca43c67c7 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Sun, 20 Aug 2017 11:33:25 -0400 Subject: NFS: Remove unused parameter gfp_flags from nfs_pageio_init() Now that the mirror allocation has been moved, the parameter can go. Also remove the redundant symbol export. Signed-off-by: Trond Myklebust --- fs/nfs/pagelist.c | 4 +--- fs/nfs/read.c | 2 +- fs/nfs/write.c | 2 +- include/linux/nfs_page.h | 3 +-- 4 files changed, 4 insertions(+), 7 deletions(-) (limited to 'fs/nfs/write.c') diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c index b7d193e2a243..ee66d8c3336c 100644 --- a/fs/nfs/pagelist.c +++ b/fs/nfs/pagelist.c @@ -711,8 +711,7 @@ void nfs_pageio_init(struct nfs_pageio_descriptor *desc, const struct nfs_pgio_completion_ops *compl_ops, const struct nfs_rw_ops *rw_ops, size_t bsize, - int io_flags, - gfp_t gfp_flags) + int io_flags) { desc->pg_moreio = 0; desc->pg_inode = inode; @@ -733,7 +732,6 @@ void nfs_pageio_init(struct nfs_pageio_descriptor *desc, desc->pg_mirrors = desc->pg_mirrors_static; nfs_pageio_mirror_init(&desc->pg_mirrors[0], bsize); } -EXPORT_SYMBOL_GPL(nfs_pageio_init); /** * nfs_pgio_result - Basic pageio error handling diff --git a/fs/nfs/read.c b/fs/nfs/read.c index a8421d9dab6a..0d42573d423d 100644 --- a/fs/nfs/read.c +++ b/fs/nfs/read.c @@ -68,7 +68,7 @@ void nfs_pageio_init_read(struct nfs_pageio_descriptor *pgio, pg_ops = server->pnfs_curr_ld->pg_read_ops; #endif nfs_pageio_init(pgio, inode, pg_ops, compl_ops, &nfs_rw_read_ops, - server->rsize, 0, GFP_KERNEL); + server->rsize, 0); } EXPORT_SYMBOL_GPL(nfs_pageio_init_read); diff --git a/fs/nfs/write.c b/fs/nfs/write.c index b1af5dee5e0a..ae78ac0a7a8a 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -1452,7 +1452,7 @@ void nfs_pageio_init_write(struct nfs_pageio_descriptor *pgio, pg_ops = server->pnfs_curr_ld->pg_write_ops; #endif nfs_pageio_init(pgio, inode, pg_ops, compl_ops, &nfs_rw_write_ops, - server->wsize, ioflags, GFP_NOIO); + server->wsize, ioflags); } EXPORT_SYMBOL_GPL(nfs_pageio_init_write); diff --git a/include/linux/nfs_page.h b/include/linux/nfs_page.h index d67b67ae6c8b..8b1a35aad0c3 100644 --- a/include/linux/nfs_page.h +++ b/include/linux/nfs_page.h @@ -125,8 +125,7 @@ extern void nfs_pageio_init(struct nfs_pageio_descriptor *desc, const struct nfs_pgio_completion_ops *compl_ops, const struct nfs_rw_ops *rw_ops, size_t bsize, - int how, - gfp_t gfp_flags); + int how); extern int nfs_pageio_add_request(struct nfs_pageio_descriptor *, struct nfs_page *); extern int nfs_pageio_resend(struct nfs_pageio_descriptor *, -- cgit v1.2.3 From 237f8306c30206d997e265ff46652e602a50824f Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Fri, 18 Aug 2017 17:12:51 +1000 Subject: NFS: don't expect errors from mempool_alloc(). Commit fbe77c30e9ab ("NFS: move rw_mode to nfs_pageio_header") reintroduced some pointless code that commit 518662e0fcb9 ("NFS: fix usage of mempools.") had recently removed. Remove it again. Cc: Benjamin Coddington Signed-off-by: NeilBrown Signed-off-by: Trond Myklebust --- fs/nfs/write.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'fs/nfs/write.c') diff --git a/fs/nfs/write.c b/fs/nfs/write.c index 1a877d17e6cb..ae26775b5448 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -102,10 +102,8 @@ static struct nfs_pgio_header *nfs_writehdr_alloc(void) { struct nfs_pgio_header *p = mempool_alloc(nfs_wdata_mempool, GFP_NOIO); - if (p) { - memset(p, 0, sizeof(*p)); - p->rw_mode = FMODE_WRITE; - } + memset(p, 0, sizeof(*p)); + p->rw_mode = FMODE_WRITE; return p; } -- cgit v1.2.3 From 137da553dba62dfc64fb8f4ccb5be769acbf615e Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Sat, 9 Sep 2017 12:28:01 -0400 Subject: NFS: nfs_lock_and_join_requests and nfs_scan_commit_list can deadlock Since the commit list is not ordered, it is possible for nfs_scan_commit_list to hold a request that nfs_lock_and_join_requests() is waiting for, while at the same time trying to grab a request that nfs_lock_and_join_requests already holds. Signed-off-by: Trond Myklebust --- fs/nfs/pnfs_nfs.c | 16 +++++++++++----- fs/nfs/write.c | 15 +++++++++++---- 2 files changed, 22 insertions(+), 9 deletions(-) (limited to 'fs/nfs/write.c') diff --git a/fs/nfs/pnfs_nfs.c b/fs/nfs/pnfs_nfs.c index 303ff171cb5d..d03d836b6ee0 100644 --- a/fs/nfs/pnfs_nfs.c +++ b/fs/nfs/pnfs_nfs.c @@ -91,22 +91,28 @@ static int pnfs_generic_transfer_commit_list(struct list_head *src, struct list_head *dst, struct nfs_commit_info *cinfo, int max) { - struct nfs_page *req; + struct nfs_page *req, *tmp; int ret = 0; - while(!list_empty(src)) { - req = list_first_entry(src, struct nfs_page, wb_list); - +restart: + list_for_each_entry_safe(req, tmp, src, wb_list) { kref_get(&req->wb_kref); if (!nfs_lock_request(req)) { int status; + + /* Prevent deadlock with nfs_lock_and_join_requests */ + if (!list_empty(dst)) { + nfs_release_request(req); + continue; + } + /* Ensure we make progress to prevent livelock */ mutex_unlock(&NFS_I(cinfo->inode)->commit_mutex); status = nfs_wait_on_request(req); nfs_release_request(req); mutex_lock(&NFS_I(cinfo->inode)->commit_mutex); if (status < 0) break; - continue; + goto restart; } nfs_request_remove_commit_list(req, cinfo); clear_bit(PG_COMMIT_TO_DS, &req->wb_flags); diff --git a/fs/nfs/write.c b/fs/nfs/write.c index ae26775b5448..c3f627b08ec6 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -1028,21 +1028,28 @@ int nfs_scan_commit_list(struct list_head *src, struct list_head *dst, struct nfs_commit_info *cinfo, int max) { - struct nfs_page *req; + struct nfs_page *req, *tmp; int ret = 0; - while(!list_empty(src)) { - req = list_first_entry(src, struct nfs_page, wb_list); +restart: + list_for_each_entry_safe(req, tmp, src, wb_list) { kref_get(&req->wb_kref); if (!nfs_lock_request(req)) { int status; + + /* Prevent deadlock with nfs_lock_and_join_requests */ + if (!list_empty(dst)) { + nfs_release_request(req); + continue; + } + /* Ensure we make progress to prevent livelock */ mutex_unlock(&NFS_I(cinfo->inode)->commit_mutex); status = nfs_wait_on_request(req); nfs_release_request(req); mutex_lock(&NFS_I(cinfo->inode)->commit_mutex); if (status < 0) break; - continue; + goto restart; } nfs_request_remove_commit_list(req, cinfo); nfs_list_add_request(req, dst); -- cgit v1.2.3 From 5d2a9d9dac902f438daa1334540398aec6c8dcfa Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Sat, 9 Sep 2017 12:40:38 -0400 Subject: NFS: Remove pnfs_generic_transfer_commit_list() It's pretty much a duplicate of nfs_scan_commit_list() that also clears the PG_COMMIT_TO_DS flag. Signed-off-by: Trond Myklebust --- fs/nfs/pnfs_nfs.c | 43 ++----------------------------------------- fs/nfs/write.c | 2 ++ 2 files changed, 4 insertions(+), 41 deletions(-) (limited to 'fs/nfs/write.c') diff --git a/fs/nfs/pnfs_nfs.c b/fs/nfs/pnfs_nfs.c index d03d836b6ee0..60da59be83b6 100644 --- a/fs/nfs/pnfs_nfs.c +++ b/fs/nfs/pnfs_nfs.c @@ -87,44 +87,6 @@ out: } EXPORT_SYMBOL_GPL(pnfs_generic_clear_request_commit); -static int -pnfs_generic_transfer_commit_list(struct list_head *src, struct list_head *dst, - struct nfs_commit_info *cinfo, int max) -{ - struct nfs_page *req, *tmp; - int ret = 0; - -restart: - list_for_each_entry_safe(req, tmp, src, wb_list) { - kref_get(&req->wb_kref); - if (!nfs_lock_request(req)) { - int status; - - /* Prevent deadlock with nfs_lock_and_join_requests */ - if (!list_empty(dst)) { - nfs_release_request(req); - continue; - } - /* Ensure we make progress to prevent livelock */ - mutex_unlock(&NFS_I(cinfo->inode)->commit_mutex); - status = nfs_wait_on_request(req); - nfs_release_request(req); - mutex_lock(&NFS_I(cinfo->inode)->commit_mutex); - if (status < 0) - break; - goto restart; - } - nfs_request_remove_commit_list(req, cinfo); - clear_bit(PG_COMMIT_TO_DS, &req->wb_flags); - nfs_list_add_request(req, dst); - ret++; - if ((ret == max) && !cinfo->dreq) - break; - cond_resched(); - } - return ret; -} - static int pnfs_generic_scan_ds_commit_list(struct pnfs_commit_bucket *bucket, struct nfs_commit_info *cinfo, @@ -135,7 +97,7 @@ pnfs_generic_scan_ds_commit_list(struct pnfs_commit_bucket *bucket, int ret; lockdep_assert_held(&NFS_I(cinfo->inode)->commit_mutex); - ret = pnfs_generic_transfer_commit_list(src, dst, cinfo, max); + ret = nfs_scan_commit_list(src, dst, cinfo, max); if (ret) { cinfo->ds->nwritten -= ret; cinfo->ds->ncommitting += ret; @@ -180,8 +142,7 @@ void pnfs_generic_recover_commit_reqs(struct list_head *dst, lockdep_assert_held(&NFS_I(cinfo->inode)->commit_mutex); restart: for (i = 0, b = cinfo->ds->buckets; i < cinfo->ds->nbuckets; i++, b++) { - nwritten = pnfs_generic_transfer_commit_list(&b->written, - dst, cinfo, 0); + nwritten = nfs_scan_commit_list(&b->written, dst, cinfo, 0); if (!nwritten) continue; cinfo->ds->nwritten -= nwritten; diff --git a/fs/nfs/write.c b/fs/nfs/write.c index c3f627b08ec6..121218d4e5ed 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -1052,6 +1052,7 @@ restart: goto restart; } nfs_request_remove_commit_list(req, cinfo); + clear_bit(PG_COMMIT_TO_DS, &req->wb_flags); nfs_list_add_request(req, dst); ret++; if ((ret == max) && !cinfo->dreq) @@ -1060,6 +1061,7 @@ restart: } return ret; } +EXPORT_SYMBOL_GPL(nfs_scan_commit_list); /* * nfs_scan_commit - Scan an inode for commit requests -- cgit v1.2.3 From 8b77484f2b3d8f1096a2231d513fc589e4857a73 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Sat, 9 Sep 2017 15:31:28 -0400 Subject: NFS: Don't hold the group lock when calling nfs_release_request() That can deadlock if this is the last reference since nfs_page_group_destroy() calls nfs_page_group_sync_on_bit(). Note that even if the page was removed from the subpage list, the req->wb_head could still be pointing to the old head. Signed-off-by: Trond Myklebust --- fs/nfs/write.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs/nfs/write.c') diff --git a/fs/nfs/write.c b/fs/nfs/write.c index 121218d4e5ed..36d34a4c86bd 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -532,9 +532,9 @@ try_again: } else if (WARN_ON_ONCE(subreq->wb_offset < head->wb_offset || ((subreq->wb_offset + subreq->wb_bytes) > (head->wb_offset + total_bytes)))) { + nfs_page_group_unlock(head); nfs_unroll_locks(inode, head, subreq); nfs_unlock_and_release_request(subreq); - nfs_page_group_unlock(head); nfs_unlock_and_release_request(head); return ERR_PTR(-EIO); } -- cgit v1.2.3 From 1bd5d6d08ea7ed0794c8a3908383d6d6fc202cdd Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Sat, 9 Sep 2017 16:43:09 -0400 Subject: NFS: Count the bytes of skipped subrequests in nfs_lock_and_join_requests() If we skip a subrequest due to a zero refcount, we should still count the byte range that it covered so that we accurately reconstruct the original request size. Signed-off-by: Trond Myklebust --- fs/nfs/write.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'fs/nfs/write.c') diff --git a/fs/nfs/write.c b/fs/nfs/write.c index 36d34a4c86bd..f68083db63c8 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -504,8 +504,12 @@ try_again: for (subreq = head->wb_this_page; subreq != head; subreq = subreq->wb_this_page) { - if (!kref_get_unless_zero(&subreq->wb_kref)) + if (!kref_get_unless_zero(&subreq->wb_kref)) { + if (subreq->wb_offset == head->wb_offset + total_bytes) + total_bytes += subreq->wb_bytes; continue; + } + while (!nfs_lock_request(subreq)) { /* * Unlock page to allow nfs_page_group_sync_on_bit() -- cgit v1.2.3 From 8224b2734ab1da4996b851e1e5d3047e7a0df499 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Mon, 21 Aug 2017 15:00:49 -0400 Subject: NFS: Add static NFS I/O tracepoints Tools like tcpdump and rpcdebug can be very useful. But there are plenty of environments where they are difficult or impossible to use. For example, we've had customers report I/O failures during workloads so heavy that collecting network traffic or enabling RPC debugging are themselves onerous. The kernel's static tracepoints are lightweight (less likely to introduce timing changes) and efficient (the trace data is compact). They also work in scenarios where capturing network traffic is not possible due to lack of hardware support (some InfiniBand HCAs) or where data or network privacy is a concern. Introduce tracepoints that show when an NFS READ, WRITE, or COMMIT is initiated, and when it completes. Record the arguments and results of each operation, which are not shown by existing sunrpc module's tracepoints. For instance, the recorded offset and count can be used to match an "initiate" event to a "done" event. If an NFS READ result returns fewer bytes than requested or zero, seeing the EOF flag can be probative. Seeing an NFS4ERR_BAD_STATEID result is also indication of a particular class of problems. The timing information attached to each event record can often be useful as well. Usage example: [root@manet tmp]# trace-cmd record -e nfs:*initiate* -e nfs:*done /sys/kernel/debug/tracing/events/nfs/*initiate*/filter /sys/kernel/debug/tracing/events/nfs/*done/filter Hit Ctrl^C to stop recording ^CKernel buffer statistics: Note: "entries" are the entries left in the kernel ring buffer and are not recorded in the trace data. They should all be zero. CPU: 0 entries: 0 overrun: 0 commit overrun: 0 bytes: 3680 oldest event ts: 78.367422 now ts: 100.124419 dropped events: 0 read events: 74 ... and so on. Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust --- fs/nfs/nfstrace.h | 248 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ fs/nfs/read.c | 4 + fs/nfs/write.c | 7 ++ 3 files changed, 259 insertions(+) (limited to 'fs/nfs/write.c') diff --git a/fs/nfs/nfstrace.h b/fs/nfs/nfstrace.h index 2ca9167bc97d..551711042ba4 100644 --- a/fs/nfs/nfstrace.h +++ b/fs/nfs/nfstrace.h @@ -719,6 +719,254 @@ TRACE_EVENT(nfs_sillyrename_unlink, __get_str(name) ) ); + +TRACE_EVENT(nfs_initiate_read, + TP_PROTO( + const struct inode *inode, + loff_t offset, unsigned long count + ), + + TP_ARGS(inode, offset, count), + + TP_STRUCT__entry( + __field(loff_t, offset) + __field(unsigned long, count) + __field(dev_t, dev) + __field(u32, fhandle) + __field(u64, fileid) + ), + + TP_fast_assign( + const struct nfs_inode *nfsi = NFS_I(inode); + + __entry->offset = offset; + __entry->count = count; + __entry->dev = inode->i_sb->s_dev; + __entry->fileid = nfsi->fileid; + __entry->fhandle = nfs_fhandle_hash(&nfsi->fh); + ), + + TP_printk( + "fileid=%02x:%02x:%llu fhandle=0x%08x " + "offset=%lld count=%lu", + MAJOR(__entry->dev), MINOR(__entry->dev), + (unsigned long long)__entry->fileid, + __entry->fhandle, + __entry->offset, __entry->count + ) +); + +TRACE_EVENT(nfs_readpage_done, + TP_PROTO( + const struct inode *inode, + int status, loff_t offset, bool eof + ), + + TP_ARGS(inode, status, offset, eof), + + TP_STRUCT__entry( + __field(int, status) + __field(loff_t, offset) + __field(bool, eof) + __field(dev_t, dev) + __field(u32, fhandle) + __field(u64, fileid) + ), + + TP_fast_assign( + const struct nfs_inode *nfsi = NFS_I(inode); + + __entry->status = status; + __entry->offset = offset; + __entry->eof = eof; + __entry->dev = inode->i_sb->s_dev; + __entry->fileid = nfsi->fileid; + __entry->fhandle = nfs_fhandle_hash(&nfsi->fh); + ), + + TP_printk( + "fileid=%02x:%02x:%llu fhandle=0x%08x " + "offset=%lld status=%d%s", + MAJOR(__entry->dev), MINOR(__entry->dev), + (unsigned long long)__entry->fileid, + __entry->fhandle, + __entry->offset, __entry->status, + __entry->eof ? " eof" : "" + ) +); + +/* + * XXX: I tried using NFS_UNSTABLE and friends in this table, but they + * all evaluate to 0 for some reason, even if I include linux/nfs.h. + */ +#define nfs_show_stable(stable) \ + __print_symbolic(stable, \ + { 0, " (UNSTABLE)" }, \ + { 1, " (DATA_SYNC)" }, \ + { 2, " (FILE_SYNC)" }) + +TRACE_EVENT(nfs_initiate_write, + TP_PROTO( + const struct inode *inode, + loff_t offset, unsigned long count, + enum nfs3_stable_how stable + ), + + TP_ARGS(inode, offset, count, stable), + + TP_STRUCT__entry( + __field(loff_t, offset) + __field(unsigned long, count) + __field(enum nfs3_stable_how, stable) + __field(dev_t, dev) + __field(u32, fhandle) + __field(u64, fileid) + ), + + TP_fast_assign( + const struct nfs_inode *nfsi = NFS_I(inode); + + __entry->offset = offset; + __entry->count = count; + __entry->stable = stable; + __entry->dev = inode->i_sb->s_dev; + __entry->fileid = nfsi->fileid; + __entry->fhandle = nfs_fhandle_hash(&nfsi->fh); + ), + + TP_printk( + "fileid=%02x:%02x:%llu fhandle=0x%08x " + "offset=%lld count=%lu stable=%d%s", + MAJOR(__entry->dev), MINOR(__entry->dev), + (unsigned long long)__entry->fileid, + __entry->fhandle, + __entry->offset, __entry->count, + __entry->stable, nfs_show_stable(__entry->stable) + ) +); + +TRACE_EVENT(nfs_writeback_done, + TP_PROTO( + const struct inode *inode, + int status, + loff_t offset, + struct nfs_writeverf *writeverf + ), + + TP_ARGS(inode, status, offset, writeverf), + + TP_STRUCT__entry( + __field(int, status) + __field(loff_t, offset) + __field(enum nfs3_stable_how, stable) + __field(unsigned long long, verifier) + __field(dev_t, dev) + __field(u32, fhandle) + __field(u64, fileid) + ), + + TP_fast_assign( + const struct nfs_inode *nfsi = NFS_I(inode); + + __entry->status = status; + __entry->offset = offset; + __entry->stable = writeverf->committed; + memcpy(&__entry->verifier, &writeverf->verifier, + sizeof(__entry->verifier)); + __entry->dev = inode->i_sb->s_dev; + __entry->fileid = nfsi->fileid; + __entry->fhandle = nfs_fhandle_hash(&nfsi->fh); + ), + + TP_printk( + "fileid=%02x:%02x:%llu fhandle=0x%08x " + "offset=%lld status=%d stable=%d%s " + "verifier 0x%016llx", + MAJOR(__entry->dev), MINOR(__entry->dev), + (unsigned long long)__entry->fileid, + __entry->fhandle, + __entry->offset, __entry->status, + __entry->stable, nfs_show_stable(__entry->stable), + __entry->verifier + ) +); + +TRACE_EVENT(nfs_initiate_commit, + TP_PROTO( + const struct nfs_commit_data *data + ), + + TP_ARGS(data), + + TP_STRUCT__entry( + __field(loff_t, offset) + __field(unsigned long, count) + __field(dev_t, dev) + __field(u32, fhandle) + __field(u64, fileid) + ), + + TP_fast_assign( + const struct inode *inode = data->inode; + const struct nfs_inode *nfsi = NFS_I(inode); + + __entry->offset = data->args.offset; + __entry->count = data->args.count; + __entry->dev = inode->i_sb->s_dev; + __entry->fileid = nfsi->fileid; + __entry->fhandle = nfs_fhandle_hash(&nfsi->fh); + ), + + TP_printk( + "fileid=%02x:%02x:%llu fhandle=0x%08x " + "offset=%lld count=%lu", + MAJOR(__entry->dev), MINOR(__entry->dev), + (unsigned long long)__entry->fileid, + __entry->fhandle, + __entry->offset, __entry->count + ) +); + +TRACE_EVENT(nfs_commit_done, + TP_PROTO( + const struct nfs_commit_data *data + ), + + TP_ARGS(data), + + TP_STRUCT__entry( + __field(int, status) + __field(loff_t, offset) + __field(unsigned long long, verifier) + __field(dev_t, dev) + __field(u32, fhandle) + __field(u64, fileid) + ), + + TP_fast_assign( + const struct inode *inode = data->inode; + const struct nfs_inode *nfsi = NFS_I(inode); + + __entry->status = data->res.op_status; + __entry->offset = data->args.offset; + memcpy(&__entry->verifier, &data->verf.verifier, + sizeof(__entry->verifier)); + __entry->dev = inode->i_sb->s_dev; + __entry->fileid = nfsi->fileid; + __entry->fhandle = nfs_fhandle_hash(&nfsi->fh); + ), + + TP_printk( + "fileid=%02x:%02x:%llu fhandle=0x%08x " + "offset=%lld status=%d verifier 0x%016llx", + MAJOR(__entry->dev), MINOR(__entry->dev), + (unsigned long long)__entry->fileid, + __entry->fhandle, + __entry->offset, __entry->status, + __entry->verifier + ) +); + #endif /* _TRACE_NFS_H */ #undef TRACE_INCLUDE_PATH diff --git a/fs/nfs/read.c b/fs/nfs/read.c index 0d42573d423d..48d7277c60a9 100644 --- a/fs/nfs/read.c +++ b/fs/nfs/read.c @@ -25,6 +25,7 @@ #include "iostat.h" #include "fscache.h" #include "pnfs.h" +#include "nfstrace.h" #define NFSDBG_FACILITY NFSDBG_PAGECACHE @@ -200,6 +201,7 @@ static void nfs_initiate_read(struct nfs_pgio_header *hdr, task_setup_data->flags |= swap_flags; rpc_ops->read_setup(hdr, msg); + trace_nfs_initiate_read(inode, hdr->io_start, hdr->good_bytes); } static void @@ -232,6 +234,8 @@ static int nfs_readpage_done(struct rpc_task *task, return status; nfs_add_stats(inode, NFSIOS_SERVERREADBYTES, hdr->res.count); + trace_nfs_readpage_done(inode, task->tk_status, + hdr->args.offset, hdr->res.eof); if (task->tk_status == -ESTALE) { set_bit(NFS_INO_STALE, &NFS_I(inode)->flags); diff --git a/fs/nfs/write.c b/fs/nfs/write.c index f68083db63c8..c66206ac4ebf 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -1383,6 +1383,8 @@ static void nfs_initiate_write(struct nfs_pgio_header *hdr, task_setup_data->priority = priority; rpc_ops->write_setup(hdr, msg); + trace_nfs_initiate_write(hdr->inode, hdr->io_start, hdr->good_bytes, + hdr->args.stable); nfs4_state_protect_write(NFS_SERVER(hdr->inode)->nfs_client, &task_setup_data->rpc_client, msg, hdr); @@ -1540,7 +1542,10 @@ static int nfs_writeback_done(struct rpc_task *task, status = NFS_PROTO(inode)->write_done(task, hdr); if (status != 0) return status; + nfs_add_stats(inode, NFSIOS_SERVERWRITTENBYTES, hdr->res.count); + trace_nfs_writeback_done(inode, task->tk_status, + hdr->args.offset, hdr->res.verf); if (hdr->res.verf->committed < hdr->args.stable && task->tk_status >= 0) { @@ -1669,6 +1674,7 @@ int nfs_initiate_commit(struct rpc_clnt *clnt, struct nfs_commit_data *data, }; /* Set up the initial task struct. */ nfs_ops->commit_setup(data, &msg); + trace_nfs_initiate_commit(data); dprintk("NFS: initiated commit call\n"); @@ -1793,6 +1799,7 @@ static void nfs_commit_done(struct rpc_task *task, void *calldata) /* Call the NFS version-specific code */ NFS_PROTO(data->inode)->commit_done(task, data); + trace_nfs_commit_done(data); } static void nfs_commit_release_pages(struct nfs_commit_data *data) -- cgit v1.2.3 From bf4b49059718b2217339eb15c60f8753d5b0da99 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Mon, 11 Sep 2017 13:15:50 +1000 Subject: NFS: various changes relating to reporting IO errors. 1/ remove 'start' and 'end' args from nfs_file_fsync_commit(). They aren't used. 2/ Make nfs_context_set_write_error() a "static inline" in internal.h so we can... 3/ Use nfs_context_set_write_error() instead of mapping_set_error() if nfs_pageio_add_request() fails before sending any request. NFS generally keeps errors in the open_context, not the mapping, so this is more consistent. 4/ If filemap_write_and_write_range() reports any error, still check ctx->error. The value in ctx->error is likely to be more useful. As part of this, NFS_CONTEXT_ERROR_WRITE is cleared slightly earlier, before nfs_file_fsync_commit() is called, rather than at the start of that function. Signed-off-by: NeilBrown Signed-off-by: Trond Myklebust --- fs/nfs/file.c | 16 ++++++++++------ fs/nfs/internal.h | 7 +++++++ fs/nfs/pagelist.c | 4 ++-- fs/nfs/write.c | 7 ------- 4 files changed, 19 insertions(+), 15 deletions(-) (limited to 'fs/nfs/write.c') diff --git a/fs/nfs/file.c b/fs/nfs/file.c index a385d1c3f146..0214dd1e1060 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -208,21 +208,19 @@ EXPORT_SYMBOL_GPL(nfs_file_mmap); * fall back to doing a synchronous write. */ static int -nfs_file_fsync_commit(struct file *file, loff_t start, loff_t end, int datasync) +nfs_file_fsync_commit(struct file *file, int datasync) { struct nfs_open_context *ctx = nfs_file_open_context(file); struct inode *inode = file_inode(file); - int have_error, do_resend, status; + int do_resend, status; int ret = 0; dprintk("NFS: fsync file(%pD2) datasync %d\n", file, datasync); nfs_inc_stats(inode, NFSIOS_VFSFSYNC); do_resend = test_and_clear_bit(NFS_CONTEXT_RESEND_WRITES, &ctx->flags); - have_error = test_and_clear_bit(NFS_CONTEXT_ERROR_WRITE, &ctx->flags); status = nfs_commit_inode(inode, FLUSH_SYNC); - have_error |= test_bit(NFS_CONTEXT_ERROR_WRITE, &ctx->flags); - if (have_error) { + if (test_bit(NFS_CONTEXT_ERROR_WRITE, &ctx->flags)) { ret = xchg(&ctx->error, 0); if (ret) goto out; @@ -247,10 +245,16 @@ nfs_file_fsync(struct file *file, loff_t start, loff_t end, int datasync) trace_nfs_fsync_enter(inode); do { + struct nfs_open_context *ctx = nfs_file_open_context(file); ret = filemap_write_and_wait_range(inode->i_mapping, start, end); + if (test_and_clear_bit(NFS_CONTEXT_ERROR_WRITE, &ctx->flags)) { + int ret2 = xchg(&ctx->error, 0); + if (ret2) + ret = ret2; + } if (ret != 0) break; - ret = nfs_file_fsync_commit(file, start, end, datasync); + ret = nfs_file_fsync_commit(file, datasync); if (!ret) ret = pnfs_sync_inode(inode, !!datasync); /* diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h index 68cc22083639..5bdf952f414b 100644 --- a/fs/nfs/internal.h +++ b/fs/nfs/internal.h @@ -768,3 +768,10 @@ static inline bool nfs_error_is_fatal(int err) return false; } } + +static inline void nfs_context_set_write_error(struct nfs_open_context *ctx, int error) +{ + ctx->error = error; + smp_wmb(); + set_bit(NFS_CONTEXT_ERROR_WRITE, &ctx->flags); +} diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c index bec120ec1967..d0543e19098a 100644 --- a/fs/nfs/pagelist.c +++ b/fs/nfs/pagelist.c @@ -1170,8 +1170,8 @@ out_failed: /* remember fatal errors */ if (nfs_error_is_fatal(desc->pg_error)) - mapping_set_error(desc->pg_inode->i_mapping, - desc->pg_error); + nfs_context_set_write_error(req->wb_context, + desc->pg_error); func = desc->pg_completion_ops->error_cleanup; for (midx = 0; midx < desc->pg_mirror_count; midx++) { diff --git a/fs/nfs/write.c b/fs/nfs/write.c index c66206ac4ebf..babebbccae2a 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -145,13 +145,6 @@ static void nfs_io_completion_put(struct nfs_io_completion *ioc) kref_put(&ioc->refcount, nfs_io_completion_release); } -static void nfs_context_set_write_error(struct nfs_open_context *ctx, int error) -{ - ctx->error = error; - smp_wmb(); - set_bit(NFS_CONTEXT_ERROR_WRITE, &ctx->flags); -} - static struct nfs_page * nfs_page_private_request(struct page *page) { -- cgit v1.2.3 From 0671d8f108762efc51ca893dbf8f0ba72f655c3d Mon Sep 17 00:00:00 2001 From: Markus Elfring Date: Tue, 7 Nov 2017 08:51:00 +0100 Subject: nfs/write: Use common error handling code in nfs_lock_and_join_requests() Add a jump target so that a bit of exception handling can be better reused at the end of this function. This issue was detected by using the Coccinelle software. Signed-off-by: Markus Elfring Signed-off-by: Anna Schumaker --- fs/nfs/write.c | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) (limited to 'fs/nfs/write.c') diff --git a/fs/nfs/write.c b/fs/nfs/write.c index babebbccae2a..5b5f464f6f2a 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -487,10 +487,8 @@ try_again: } ret = nfs_page_group_lock(head); - if (ret < 0) { - nfs_unlock_and_release_request(head); - return ERR_PTR(ret); - } + if (ret < 0) + goto release_request; /* lock each request in the page group */ total_bytes = head->wb_bytes; @@ -515,8 +513,7 @@ try_again: if (ret < 0) { nfs_unroll_locks(inode, head, subreq); nfs_release_request(subreq); - nfs_unlock_and_release_request(head); - return ERR_PTR(ret); + goto release_request; } } /* @@ -532,8 +529,8 @@ try_again: nfs_page_group_unlock(head); nfs_unroll_locks(inode, head, subreq); nfs_unlock_and_release_request(subreq); - nfs_unlock_and_release_request(head); - return ERR_PTR(-EIO); + ret = -EIO; + goto release_request; } } @@ -576,6 +573,10 @@ try_again: /* still holds ref on head from nfs_page_find_head_request * and still has lock on head from lock loop */ return head; + +release_request: + nfs_unlock_and_release_request(head); + return ERR_PTR(ret); } static void nfs_write_error_remove_page(struct nfs_page *req) -- cgit v1.2.3