From fd9a8d7160937f94aad36ac80d7255b4988740ac Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 12 Feb 2013 09:48:42 -0500 Subject: NFSv4.1: Fix bulk recall and destroy of layouts The current code in pnfs_destroy_all_layouts() assumes that removing the layout from the server->layouts list is sufficient to make it invisible to other processes. This ignores the fact that most users access the layout through the nfs_inode->layout... There is further breakage due to lack of reference counting of the layouts, meaning that the whole thing Oopses at the drop of a hat. The code in initiate_bulk_draining() is almost correct, and can be used as a model for pnfs_destroy_all_layouts(), so move that code to pnfs.c, and refactor the code to allow us to choose between a single filesystem bulk recall, and a recall of all layouts. Also note that initiate_bulk_draining() currently calls iput() while holding locks. Fix that too. Signed-off-by: Trond Myklebust Cc: stable@vger.kernel.org --- fs/nfs/pnfs.c | 150 ++++++++++++++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 130 insertions(+), 20 deletions(-) (limited to 'fs/nfs/pnfs.c') diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c index d00260b08103..6be70f622b62 100644 --- a/fs/nfs/pnfs.c +++ b/fs/nfs/pnfs.c @@ -505,37 +505,147 @@ pnfs_destroy_layout(struct nfs_inode *nfsi) } EXPORT_SYMBOL_GPL(pnfs_destroy_layout); -/* - * Called by the state manger to remove all layouts established under an - * expired lease. - */ -void -pnfs_destroy_all_layouts(struct nfs_client *clp) +static bool +pnfs_layout_add_bulk_destroy_list(struct inode *inode, + struct list_head *layout_list) { - struct nfs_server *server; struct pnfs_layout_hdr *lo; - LIST_HEAD(tmp_list); + bool ret = false; - nfs4_deviceid_mark_client_invalid(clp); - nfs4_deviceid_purge_client(clp); + spin_lock(&inode->i_lock); + lo = NFS_I(inode)->layout; + if (lo != NULL && list_empty(&lo->plh_bulk_destroy)) { + pnfs_get_layout_hdr(lo); + list_add(&lo->plh_bulk_destroy, layout_list); + ret = true; + } + spin_unlock(&inode->i_lock); + return ret; +} + +/* Caller must hold rcu_read_lock and clp->cl_lock */ +static int +pnfs_layout_bulk_destroy_byserver_locked(struct nfs_client *clp, + struct nfs_server *server, + struct list_head *layout_list) +{ + struct pnfs_layout_hdr *lo, *next; + struct inode *inode; + + list_for_each_entry_safe(lo, next, &server->layouts, plh_layouts) { + inode = igrab(lo->plh_inode); + if (inode == NULL) + continue; + list_del_init(&lo->plh_layouts); + if (pnfs_layout_add_bulk_destroy_list(inode, layout_list)) + continue; + rcu_read_unlock(); + spin_unlock(&clp->cl_lock); + iput(inode); + spin_lock(&clp->cl_lock); + rcu_read_lock(); + return -EAGAIN; + } + return 0; +} + +static int +pnfs_layout_free_bulk_destroy_list(struct list_head *layout_list, + bool is_bulk_recall) +{ + struct pnfs_layout_hdr *lo; + struct inode *inode; + struct pnfs_layout_range range = { + .iomode = IOMODE_ANY, + .offset = 0, + .length = NFS4_MAX_UINT64, + }; + LIST_HEAD(lseg_list); + int ret = 0; + + while (!list_empty(layout_list)) { + lo = list_entry(layout_list->next, struct pnfs_layout_hdr, + plh_bulk_destroy); + dprintk("%s freeing layout for inode %lu\n", __func__, + lo->plh_inode->i_ino); + inode = lo->plh_inode; + spin_lock(&inode->i_lock); + list_del_init(&lo->plh_bulk_destroy); + lo->plh_block_lgets++; /* permanently block new LAYOUTGETs */ + if (is_bulk_recall) + set_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags); + if (pnfs_mark_matching_lsegs_invalid(lo, &lseg_list, &range)) + ret = -EAGAIN; + spin_unlock(&inode->i_lock); + pnfs_free_lseg_list(&lseg_list); + pnfs_put_layout_hdr(lo); + iput(inode); + } + return ret; +} + +int +pnfs_destroy_layouts_byfsid(struct nfs_client *clp, + struct nfs_fsid *fsid, + bool is_recall) +{ + struct nfs_server *server; + LIST_HEAD(layout_list); spin_lock(&clp->cl_lock); rcu_read_lock(); +restart: list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) { - if (!list_empty(&server->layouts)) - list_splice_init(&server->layouts, &tmp_list); + if (memcmp(&server->fsid, fsid, sizeof(*fsid)) != 0) + continue; + if (pnfs_layout_bulk_destroy_byserver_locked(clp, + server, + &layout_list) != 0) + goto restart; } rcu_read_unlock(); spin_unlock(&clp->cl_lock); - while (!list_empty(&tmp_list)) { - lo = list_entry(tmp_list.next, struct pnfs_layout_hdr, - plh_layouts); - dprintk("%s freeing layout for inode %lu\n", __func__, - lo->plh_inode->i_ino); - list_del_init(&lo->plh_layouts); - pnfs_destroy_layout(NFS_I(lo->plh_inode)); + if (list_empty(&layout_list)) + return 0; + return pnfs_layout_free_bulk_destroy_list(&layout_list, is_recall); +} + +int +pnfs_destroy_layouts_byclid(struct nfs_client *clp, + bool is_recall) +{ + struct nfs_server *server; + LIST_HEAD(layout_list); + + spin_lock(&clp->cl_lock); + rcu_read_lock(); +restart: + list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) { + if (pnfs_layout_bulk_destroy_byserver_locked(clp, + server, + &layout_list) != 0) + goto restart; } + rcu_read_unlock(); + spin_unlock(&clp->cl_lock); + + if (list_empty(&layout_list)) + return 0; + return pnfs_layout_free_bulk_destroy_list(&layout_list, is_recall); +} + +/* + * Called by the state manger to remove all layouts established under an + * expired lease. + */ +void +pnfs_destroy_all_layouts(struct nfs_client *clp) +{ + nfs4_deviceid_mark_client_invalid(clp); + nfs4_deviceid_purge_client(clp); + + pnfs_destroy_layouts_byclid(clp, false); } /* @@ -888,7 +998,7 @@ alloc_init_layout_hdr(struct inode *ino, atomic_set(&lo->plh_refcount, 1); INIT_LIST_HEAD(&lo->plh_layouts); INIT_LIST_HEAD(&lo->plh_segs); - INIT_LIST_HEAD(&lo->plh_bulk_recall); + INIT_LIST_HEAD(&lo->plh_bulk_destroy); lo->plh_inode = ino; lo->plh_lc_cred = get_rpccred(ctx->state->owner->so_cred); return lo; -- cgit v1.2.3 From 78f33277f96430ea001c39e952f6b8200b2ab850 Mon Sep 17 00:00:00 2001 From: Benny Halevy Date: Sun, 24 Feb 2013 09:55:57 -0500 Subject: pnfs: fix resend_to_mds for directio Pass the directio request on pageio_init to clean up the API. Percolate pg_dreq from original nfs_pageio_descriptor to the pnfs_{read,write}_done_resend_to_mds and use it on respective call to nfs_pageio_init_{read,write} on the newly created nfs_pageio_descriptor. Reproduced by command: mount -o vers=4.1 server:/ /mnt dd bs=128k count=8 if=/dev/zero of=/mnt/dd.out oflag=direct BUG: unable to handle kernel NULL pointer dereference at 0000000000000028 IP: [] atomic_inc+0x4/0x9 [nfs] PGD 34786067 PUD 34794067 PMD 0 Oops: 0002 [#1] SMP Modules linked in: nfs_layout_nfsv41_files nfsv4 nfs nfsd lockd nfs_acl auth_rpcgss exportfs sunrpc btrfs zlib_deflate libcrc32c ipv6 autofs4 CPU 1 Pid: 259, comm: kworker/1:2 Not tainted 3.8.0-rc6 #2 Bochs Bochs RIP: 0010:[] [] atomic_inc+0x4/0x9 [nfs] RSP: 0018:ffff880038f8fa68 EFLAGS: 00010206 RAX: ffffffffa021a6a9 RBX: ffff880038f8fb48 RCX: 00000000000a0000 RDX: ffffffffa021e616 RSI: ffff8800385e9a40 RDI: 0000000000000028 RBP: ffff880038f8fa68 R08: ffffffff81ad6720 R09: ffff8800385e9510 R10: ffffffffa0228450 R11: ffff880038e87418 R12: ffff8800385e9a40 R13: ffff8800385e9a70 R14: ffff880038f8fb38 R15: ffffffffa0148878 FS: 0000000000000000(0000) GS:ffff88003e400000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 000000008005003b CR2: 0000000000000028 CR3: 0000000034789000 CR4: 00000000000006e0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400 Process kworker/1:2 (pid: 259, threadinfo ffff880038f8e000, task ffff880038302480) Stack: ffff880038f8fa78 ffffffffa021a6bf ffff880038f8fa88 ffffffffa021bb82 ffff880038f8fae8 ffffffffa021f454 ffff880038f8fae8 ffffffff8109689d ffff880038f8fab8 ffffffff00000006 0000000000000000 ffff880038f8fb48 Call Trace: [] nfs_direct_pgio_init+0x16/0x18 [nfs] [] nfs_pgheader_init+0x6a/0x6c [nfs] [] nfs_generic_pg_writepages+0x51/0xf8 [nfs] [] ? mark_held_locks+0x71/0x99 [] ? rpc_release_resources_task+0x37/0x37 [sunrpc] [] nfs_pageio_doio+0x1a/0x43 [nfs] [] nfs_pageio_complete+0x16/0x2c [nfs] [] pnfs_write_done_resend_to_mds+0x95/0xc5 [nfsv4] [] ? rpc_release_resources_task+0x37/0x37 [sunrpc] [] filelayout_reset_write+0x8c/0x99 [nfs_layout_nfsv41_files] [] filelayout_write_done_cb+0x4d/0xc1 [nfs_layout_nfsv41_files] [] nfs4_write_done+0x36/0x49 [nfsv4] [] nfs_writeback_done+0x53/0x1cc [nfs] [] nfs_writeback_done_common+0xe/0x10 [nfs] [] filelayout_write_call_done+0x28/0x2a [nfs_layout_nfsv41_files] [] rpc_exit_task+0x29/0x87 [sunrpc] [] __rpc_execute+0x11d/0x3cc [sunrpc] [] ? trace_hardirqs_on_caller+0x117/0x173 [] rpc_async_schedule+0x27/0x32 [sunrpc] [] ? __rpc_execute+0x3cc/0x3cc [sunrpc] [] process_one_work+0x226/0x422 [] ? process_one_work+0x159/0x422 [] ? lock_acquired+0x210/0x249 [] ? __rpc_execute+0x3cc/0x3cc [sunrpc] [] worker_thread+0x126/0x1c4 [] ? manage_workers+0x240/0x240 [] kthread+0xb1/0xb9 [] ? __kthread_parkme+0x65/0x65 [] ret_from_fork+0x7c/0xb0 [] ? __kthread_parkme+0x65/0x65 Code: 00 83 38 02 74 12 48 81 4b 50 00 00 01 00 c7 83 60 07 00 00 01 00 00 00 48 89 df e8 55 fe ff ff 5b 41 5c 5d c3 66 90 55 48 89 e5 ff 07 5d c3 55 48 89 e5 f0 ff 0f 0f 94 c0 84 c0 0f 95 c0 0f RIP [] atomic_inc+0x4/0x9 [nfs] RSP CR2: 0000000000000028 Signed-off-by: Benny Halevy Cc: stable@kernel.org [>= 3.6] Signed-off-by: Trond Myklebust --- fs/nfs/nfs4filelayout.c | 6 ++++-- fs/nfs/pnfs.c | 14 ++++++++++---- fs/nfs/pnfs.h | 6 ++++-- 3 files changed, 18 insertions(+), 8 deletions(-) (limited to 'fs/nfs/pnfs.c') diff --git a/fs/nfs/nfs4filelayout.c b/fs/nfs/nfs4filelayout.c index 194c48410336..49eeb044c109 100644 --- a/fs/nfs/nfs4filelayout.c +++ b/fs/nfs/nfs4filelayout.c @@ -99,7 +99,8 @@ static void filelayout_reset_write(struct nfs_write_data *data) task->tk_status = pnfs_write_done_resend_to_mds(hdr->inode, &hdr->pages, - hdr->completion_ops); + hdr->completion_ops, + hdr->dreq); } } @@ -119,7 +120,8 @@ static void filelayout_reset_read(struct nfs_read_data *data) task->tk_status = pnfs_read_done_resend_to_mds(hdr->inode, &hdr->pages, - hdr->completion_ops); + hdr->completion_ops, + hdr->dreq); } } diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c index 6be70f622b62..97767c8683f9 100644 --- a/fs/nfs/pnfs.c +++ b/fs/nfs/pnfs.c @@ -1422,13 +1422,15 @@ EXPORT_SYMBOL_GPL(pnfs_generic_pg_test); int pnfs_write_done_resend_to_mds(struct inode *inode, struct list_head *head, - const struct nfs_pgio_completion_ops *compl_ops) + const struct nfs_pgio_completion_ops *compl_ops, + struct nfs_direct_req *dreq) { struct nfs_pageio_descriptor pgio; LIST_HEAD(failed); /* Resend all requests through the MDS */ nfs_pageio_init_write(&pgio, inode, FLUSH_STABLE, compl_ops); + pgio.pg_dreq = dreq; while (!list_empty(head)) { struct nfs_page *req = nfs_list_entry(head->next); @@ -1463,7 +1465,8 @@ static void pnfs_ld_handle_write_error(struct nfs_write_data *data) if (!test_and_set_bit(NFS_IOHDR_REDO, &hdr->flags)) data->task.tk_status = pnfs_write_done_resend_to_mds(hdr->inode, &hdr->pages, - hdr->completion_ops); + hdr->completion_ops, + hdr->dreq); } /* @@ -1578,13 +1581,15 @@ EXPORT_SYMBOL_GPL(pnfs_generic_pg_writepages); int pnfs_read_done_resend_to_mds(struct inode *inode, struct list_head *head, - const struct nfs_pgio_completion_ops *compl_ops) + const struct nfs_pgio_completion_ops *compl_ops, + struct nfs_direct_req *dreq) { struct nfs_pageio_descriptor pgio; LIST_HEAD(failed); /* Resend all requests through the MDS */ nfs_pageio_init_read(&pgio, inode, compl_ops); + pgio.pg_dreq = dreq; while (!list_empty(head)) { struct nfs_page *req = nfs_list_entry(head->next); @@ -1615,7 +1620,8 @@ static void pnfs_ld_handle_read_error(struct nfs_read_data *data) if (!test_and_set_bit(NFS_IOHDR_REDO, &hdr->flags)) data->task.tk_status = pnfs_read_done_resend_to_mds(hdr->inode, &hdr->pages, - hdr->completion_ops); + hdr->completion_ops, + hdr->dreq); } /* diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h index 97cb358bb882..94ba80417748 100644 --- a/fs/nfs/pnfs.h +++ b/fs/nfs/pnfs.h @@ -230,9 +230,11 @@ struct pnfs_layout_segment *pnfs_update_layout(struct inode *ino, void nfs4_deviceid_mark_client_invalid(struct nfs_client *clp); int pnfs_read_done_resend_to_mds(struct inode *inode, struct list_head *head, - const struct nfs_pgio_completion_ops *compl_ops); + const struct nfs_pgio_completion_ops *compl_ops, + struct nfs_direct_req *dreq); int pnfs_write_done_resend_to_mds(struct inode *inode, struct list_head *head, - const struct nfs_pgio_completion_ops *compl_ops); + const struct nfs_pgio_completion_ops *compl_ops, + struct nfs_direct_req *dreq); struct nfs4_threshold *pnfs_mdsthreshold_alloc(void); /* nfs4_deviceid_flags */ -- cgit v1.2.3 From 3000512137602b84d1ad5fd89d62984993a19bb6 Mon Sep 17 00:00:00 2001 From: Weston Andros Adamson Date: Thu, 28 Feb 2013 20:30:10 -0500 Subject: NFSv4.1: LAYOUTGET EDELAY loops timeout to the MDS The client will currently try LAYOUTGETs forever if a server is returning NFS4ERR_LAYOUTTRYLATER or NFS4ERR_RECALLCONFLICT - even if the client no longer needs the layout (ie process killed, unmounted). This patch uses the DS timeout value (module parameter 'dataserver_timeo' via rpc layer) to set an upper limit of how long the client tries LATOUTGETs in this situation. Once the timeout is reached, IO is redirected to the MDS. This also changes how the client checks if a layout is on the clp list to avoid a double list_add. Signed-off-by: Weston Andros Adamson Signed-off-by: Trond Myklebust --- fs/nfs/nfs4proc.c | 9 ++++++++- fs/nfs/pnfs.c | 7 +++---- include/linux/nfs_xdr.h | 1 + 3 files changed, 12 insertions(+), 5 deletions(-) (limited to 'fs/nfs/pnfs.c') diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 37eb38566fbc..b2671cb0f901 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -93,6 +93,8 @@ static int nfs4_map_errors(int err) return err; switch (err) { case -NFS4ERR_RESOURCE: + case -NFS4ERR_LAYOUTTRYLATER: + case -NFS4ERR_RECALLCONFLICT: return -EREMOTEIO; case -NFS4ERR_WRONGSEC: return -EPERM; @@ -6046,6 +6048,7 @@ static void nfs4_layoutget_done(struct rpc_task *task, void *calldata) struct nfs_server *server = NFS_SERVER(inode); struct pnfs_layout_hdr *lo; struct nfs4_state *state = NULL; + unsigned long timeo, giveup; dprintk("--> %s\n", __func__); @@ -6057,7 +6060,10 @@ static void nfs4_layoutget_done(struct rpc_task *task, void *calldata) goto out; case -NFS4ERR_LAYOUTTRYLATER: case -NFS4ERR_RECALLCONFLICT: - task->tk_status = -NFS4ERR_DELAY; + timeo = rpc_get_timeout(task->tk_client); + giveup = lgp->args.timestamp + timeo; + if (time_after(giveup, jiffies)) + task->tk_status = -NFS4ERR_DELAY; break; case -NFS4ERR_EXPIRED: case -NFS4ERR_BAD_STATEID: @@ -6178,6 +6184,7 @@ nfs4_proc_layoutget(struct nfs4_layoutget *lgp, gfp_t gfp_flags) return ERR_PTR(-ENOMEM); } lgp->args.layout.pglen = max_pages * PAGE_SIZE; + lgp->args.timestamp = jiffies; lgp->res.layoutp = &lgp->args.layout; lgp->res.seq_res.sr_slot = NULL; diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c index 97767c8683f9..48ac5aad6258 100644 --- a/fs/nfs/pnfs.c +++ b/fs/nfs/pnfs.c @@ -1181,7 +1181,7 @@ pnfs_update_layout(struct inode *ino, struct nfs_client *clp = server->nfs_client; struct pnfs_layout_hdr *lo; struct pnfs_layout_segment *lseg = NULL; - bool first = false; + bool first; if (!pnfs_enabled_sb(NFS_SERVER(ino))) goto out; @@ -1215,10 +1215,9 @@ pnfs_update_layout(struct inode *ino, goto out_unlock; atomic_inc(&lo->plh_outstanding); - if (list_empty(&lo->plh_segs)) - first = true; - + first = list_empty(&lo->plh_layouts) ? true : false; spin_unlock(&ino->i_lock); + if (first) { /* The lo must be on the clp list if there is any * chance of a CB_LAYOUTRECALL(FILE) coming in. diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h index 29adb12c7ecf..2250cab6fc4b 100644 --- a/include/linux/nfs_xdr.h +++ b/include/linux/nfs_xdr.h @@ -233,6 +233,7 @@ struct nfs4_layoutget_args { struct inode *inode; struct nfs_open_context *ctx; nfs4_stateid stateid; + unsigned long timestamp; struct nfs4_layoutdriver_data layout; }; -- cgit v1.2.3