From 6820bf77864d5894ff67b5c00d7dba8f92011e3d Mon Sep 17 00:00:00 2001 From: Timo Rothenpieler Date: Tue, 23 Feb 2021 00:36:19 +0100 Subject: svcrdma: disable timeouts on rdma backchannel This brings it in line with the regular tcp backchannel, which also has all those timeouts disabled. Prevents the backchannel from timing out, getting some async operations like server side copying getting stuck indefinitely on the client side. Signed-off-by: Timo Rothenpieler Fixes: 5d252f90a800 ("svcrdma: Add class for RDMA backwards direction transport") Signed-off-by: Chuck Lever --- net/sunrpc/xprtrdma/svc_rdma_backchannel.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'net/sunrpc') diff --git a/net/sunrpc/xprtrdma/svc_rdma_backchannel.c b/net/sunrpc/xprtrdma/svc_rdma_backchannel.c index 4a1edbb4028e..9150df35fb6f 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_backchannel.c +++ b/net/sunrpc/xprtrdma/svc_rdma_backchannel.c @@ -252,9 +252,9 @@ xprt_setup_rdma_bc(struct xprt_create *args) xprt->timeout = &xprt_rdma_bc_timeout; xprt_set_bound(xprt); xprt_set_connected(xprt); - xprt->bind_timeout = RPCRDMA_BIND_TO; - xprt->reestablish_timeout = RPCRDMA_INIT_REEST_TO; - xprt->idle_timeout = RPCRDMA_IDLE_DISC_TO; + xprt->bind_timeout = 0; + xprt->reestablish_timeout = 0; + xprt->idle_timeout = 0; xprt->prot = XPRT_TRANSPORT_BC_RDMA; xprt->ops = &xprt_rdma_bc_procs; -- cgit v1.2.3 From c7de87ff9dac5f396f62d584f3908f80ddc0e07b Mon Sep 17 00:00:00 2001 From: Joe Korty Date: Fri, 26 Feb 2021 09:38:20 -0500 Subject: NFSD: Repair misuse of sv_lock in 5.10.16-rt30. [ This problem is in mainline, but only rt has the chops to be able to detect it. ] Lockdep reports a circular lock dependency between serv->sv_lock and softirq_ctl.lock on system shutdown, when using a kernel built with CONFIG_PREEMPT_RT=y, and a nfs mount exists. This is due to the definition of spin_lock_bh on rt: local_bh_disable(); rt_spin_lock(lock); which forces a softirq_ctl.lock -> serv->sv_lock dependency. This is not a problem as long as _every_ lock of serv->sv_lock is a: spin_lock_bh(&serv->sv_lock); but there is one of the form: spin_lock(&serv->sv_lock); This is what is causing the circular dependency splat. The spin_lock() grabs the lock without first grabbing softirq_ctl.lock via local_bh_disable. If later on in the critical region, someone does a local_bh_disable, we get a serv->sv_lock -> softirq_ctrl.lock dependency established. Deadlock. Fix is to make serv->sv_lock be locked with spin_lock_bh everywhere, no exceptions. [ OK ] Stopped target NFS client services. Stopping Logout off all iSCSI sessions on shutdown... Stopping NFS server and services... [ 109.442380] [ 109.442385] ====================================================== [ 109.442386] WARNING: possible circular locking dependency detected [ 109.442387] 5.10.16-rt30 #1 Not tainted [ 109.442389] ------------------------------------------------------ [ 109.442390] nfsd/1032 is trying to acquire lock: [ 109.442392] ffff994237617f60 ((softirq_ctrl.lock).lock){+.+.}-{2:2}, at: __local_bh_disable_ip+0xd9/0x270 [ 109.442405] [ 109.442405] but task is already holding lock: [ 109.442406] ffff994245cb00b0 (&serv->sv_lock){+.+.}-{0:0}, at: svc_close_list+0x1f/0x90 [ 109.442415] [ 109.442415] which lock already depends on the new lock. [ 109.442415] [ 109.442416] [ 109.442416] the existing dependency chain (in reverse order) is: [ 109.442417] [ 109.442417] -> #1 (&serv->sv_lock){+.+.}-{0:0}: [ 109.442421] rt_spin_lock+0x2b/0xc0 [ 109.442428] svc_add_new_perm_xprt+0x42/0xa0 [ 109.442430] svc_addsock+0x135/0x220 [ 109.442434] write_ports+0x4b3/0x620 [ 109.442438] nfsctl_transaction_write+0x45/0x80 [ 109.442440] vfs_write+0xff/0x420 [ 109.442444] ksys_write+0x4f/0xc0 [ 109.442446] do_syscall_64+0x33/0x40 [ 109.442450] entry_SYSCALL_64_after_hwframe+0x44/0xa9 [ 109.442454] [ 109.442454] -> #0 ((softirq_ctrl.lock).lock){+.+.}-{2:2}: [ 109.442457] __lock_acquire+0x1264/0x20b0 [ 109.442463] lock_acquire+0xc2/0x400 [ 109.442466] rt_spin_lock+0x2b/0xc0 [ 109.442469] __local_bh_disable_ip+0xd9/0x270 [ 109.442471] svc_xprt_do_enqueue+0xc0/0x4d0 [ 109.442474] svc_close_list+0x60/0x90 [ 109.442476] svc_close_net+0x49/0x1a0 [ 109.442478] svc_shutdown_net+0x12/0x40 [ 109.442480] nfsd_destroy+0xc5/0x180 [ 109.442482] nfsd+0x1bc/0x270 [ 109.442483] kthread+0x194/0x1b0 [ 109.442487] ret_from_fork+0x22/0x30 [ 109.442492] [ 109.442492] other info that might help us debug this: [ 109.442492] [ 109.442493] Possible unsafe locking scenario: [ 109.442493] [ 109.442493] CPU0 CPU1 [ 109.442494] ---- ---- [ 109.442495] lock(&serv->sv_lock); [ 109.442496] lock((softirq_ctrl.lock).lock); [ 109.442498] lock(&serv->sv_lock); [ 109.442499] lock((softirq_ctrl.lock).lock); [ 109.442501] [ 109.442501] *** DEADLOCK *** [ 109.442501] [ 109.442501] 3 locks held by nfsd/1032: [ 109.442503] #0: ffffffff93b49258 (nfsd_mutex){+.+.}-{3:3}, at: nfsd+0x19a/0x270 [ 109.442508] #1: ffff994245cb00b0 (&serv->sv_lock){+.+.}-{0:0}, at: svc_close_list+0x1f/0x90 [ 109.442512] #2: ffffffff93a81b20 (rcu_read_lock){....}-{1:2}, at: rt_spin_lock+0x5/0xc0 [ 109.442518] [ 109.442518] stack backtrace: [ 109.442519] CPU: 0 PID: 1032 Comm: nfsd Not tainted 5.10.16-rt30 #1 [ 109.442522] Hardware name: Supermicro X9DRL-3F/iF/X9DRL-3F/iF, BIOS 3.2 09/22/2015 [ 109.442524] Call Trace: [ 109.442527] dump_stack+0x77/0x97 [ 109.442533] check_noncircular+0xdc/0xf0 [ 109.442546] __lock_acquire+0x1264/0x20b0 [ 109.442553] lock_acquire+0xc2/0x400 [ 109.442564] rt_spin_lock+0x2b/0xc0 [ 109.442570] __local_bh_disable_ip+0xd9/0x270 [ 109.442573] svc_xprt_do_enqueue+0xc0/0x4d0 [ 109.442577] svc_close_list+0x60/0x90 [ 109.442581] svc_close_net+0x49/0x1a0 [ 109.442585] svc_shutdown_net+0x12/0x40 [ 109.442588] nfsd_destroy+0xc5/0x180 [ 109.442590] nfsd+0x1bc/0x270 [ 109.442595] kthread+0x194/0x1b0 [ 109.442600] ret_from_fork+0x22/0x30 [ 109.518225] nfsd: last server has exited, flushing export cache [ OK ] Stopped NFSv4 ID-name mapping service. [ OK ] Stopped GSSAPI Proxy Daemon. [ OK ] Stopped NFS Mount Daemon. [ OK ] Stopped NFS status monitor for NFSv2/3 locking.. Fixes: 719f8bcc883e ("svcrpc: fix xpt_list traversal locking on shutdown") Signed-off-by: Joe Korty Signed-off-by: Chuck Lever --- net/sunrpc/svc_xprt.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net/sunrpc') diff --git a/net/sunrpc/svc_xprt.c b/net/sunrpc/svc_xprt.c index dcc50ae54550..3cdd71a8df1e 100644 --- a/net/sunrpc/svc_xprt.c +++ b/net/sunrpc/svc_xprt.c @@ -1060,7 +1060,7 @@ static int svc_close_list(struct svc_serv *serv, struct list_head *xprt_list, st struct svc_xprt *xprt; int ret = 0; - spin_lock(&serv->sv_lock); + spin_lock_bh(&serv->sv_lock); list_for_each_entry(xprt, xprt_list, xpt_list) { if (xprt->xpt_net != net) continue; @@ -1068,7 +1068,7 @@ static int svc_close_list(struct svc_serv *serv, struct list_head *xprt_list, st set_bit(XPT_CLOSE, &xprt->xpt_flags); svc_xprt_enqueue(xprt); } - spin_unlock(&serv->sv_lock); + spin_unlock_bh(&serv->sv_lock); return ret; } -- cgit v1.2.3 From f1442d6349a2e7bb7a6134791bdc26cb776c79af Mon Sep 17 00:00:00 2001 From: Daniel Kobras Date: Sat, 27 Feb 2021 00:04:37 +0100 Subject: sunrpc: fix refcount leak for rpc auth modules If an auth module's accept op returns SVC_CLOSE, svc_process_common() enters a call path that does not call svc_authorise() before leaving the function, and thus leaks a reference on the auth module's refcount. Hence, make sure calls to svc_authenticate() and svc_authorise() are paired for all call paths, to make sure rpc auth modules can be unloaded. Signed-off-by: Daniel Kobras Fixes: 4d712ef1db05 ("svcauth_gss: Close connection when dropping an incoming message") Link: https://lore.kernel.org/linux-nfs/3F1B347F-B809-478F-A1E9-0BE98E22B0F0@oracle.com/T/#t Signed-off-by: Chuck Lever --- net/sunrpc/svc.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'net/sunrpc') diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c index 61fb8a18552c..d76dc9d95d16 100644 --- a/net/sunrpc/svc.c +++ b/net/sunrpc/svc.c @@ -1413,7 +1413,7 @@ svc_process_common(struct svc_rqst *rqstp, struct kvec *argv, struct kvec *resv) sendit: if (svc_authorise(rqstp)) - goto close; + goto close_xprt; return 1; /* Caller can now send it */ release_dropit: @@ -1425,6 +1425,8 @@ release_dropit: return 0; close: + svc_authorise(rqstp); +close_xprt: if (rqstp->rq_xprt && test_bit(XPT_TEMP, &rqstp->rq_xprt->xpt_flags)) svc_close_xprt(rqstp->rq_xprt); dprintk("svc: svc_process close\n"); @@ -1433,7 +1435,7 @@ release_dropit: err_short_len: svc_printk(rqstp, "short len %zd, dropping request\n", argv->iov_len); - goto close; + goto close_xprt; err_bad_rpc: serv->sv_stats->rpcbadfmt++; -- cgit v1.2.3 From 0ddc942394013f08992fc379ca04cffacbbe3dae Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Tue, 2 Mar 2021 10:48:38 -0500 Subject: rpc: fix NULL dereference on kmalloc failure I think this is unlikely but possible: svc_authenticate sets rq_authop and calls svcauth_gss_accept. The kmalloc(sizeof(*svcdata), GFP_KERNEL) fails, leaving rq_auth_data NULL, and returning SVC_DENIED. This causes svc_process_common to go to err_bad_auth, and eventually call svc_authorise. That calls ->release == svcauth_gss_release, which tries to dereference rq_auth_data. Signed-off-by: J. Bruce Fields Link: https://lore.kernel.org/linux-nfs/3F1B347F-B809-478F-A1E9-0BE98E22B0F0@oracle.com/T/#t Signed-off-by: Chuck Lever --- net/sunrpc/auth_gss/svcauth_gss.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) (limited to 'net/sunrpc') diff --git a/net/sunrpc/auth_gss/svcauth_gss.c b/net/sunrpc/auth_gss/svcauth_gss.c index bd4678db9d76..6dff64374bfe 100644 --- a/net/sunrpc/auth_gss/svcauth_gss.c +++ b/net/sunrpc/auth_gss/svcauth_gss.c @@ -1825,11 +1825,14 @@ static int svcauth_gss_release(struct svc_rqst *rqstp) { struct gss_svc_data *gsd = (struct gss_svc_data *)rqstp->rq_auth_data; - struct rpc_gss_wire_cred *gc = &gsd->clcred; + struct rpc_gss_wire_cred *gc; struct xdr_buf *resbuf = &rqstp->rq_res; int stat = -EINVAL; struct sunrpc_net *sn = net_generic(SVC_NET(rqstp), sunrpc_net_id); + if (!gsd) + goto out; + gc = &gsd->clcred; if (gc->gc_proc != RPC_GSS_PROC_DATA) goto out; /* Release can be called twice, but we only wrap once. */ @@ -1870,10 +1873,10 @@ out_err: if (rqstp->rq_cred.cr_group_info) put_group_info(rqstp->rq_cred.cr_group_info); rqstp->rq_cred.cr_group_info = NULL; - if (gsd->rsci) + if (gsd && gsd->rsci) { cache_put(&gsd->rsci->h, sn->rsc_cache); - gsd->rsci = NULL; - + gsd->rsci = NULL; + } return stat; } -- cgit v1.2.3 From bade4be69a6ea6f38c5894468ede10ee60b6f7a0 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Thu, 11 Mar 2021 13:25:01 -0500 Subject: svcrdma: Revert "svcrdma: Reduce Receive doorbell rate" I tested commit 43042b90cae1 ("svcrdma: Reduce Receive doorbell rate") with mlx4 (IB) and software iWARP and didn't find any issues. However, I recently got my hardware iWARP setup back on line (FastLinQ) and it's crashing hard on this commit (confirmed via bisect). The failure mode is complex. - After a connection is established, the first Receive completes normally. - But the second and third Receives have garbage in their Receive buffers. The server responds with ERR_VERS as a result. - When the client tears down the connection to retry, a couple of posted Receives flush twice, and that corrupts the recv_ctxt free list. - __svc_rdma_free then faults or loops infinitely while destroying the xprt's recv_ctxts. Since 43042b90cae1 ("svcrdma: Reduce Receive doorbell rate") does not fix a bug but is a scalability enhancement, it's safe and appropriate to revert it while working on a replacement. Fixes: 43042b90cae1 ("svcrdma: Reduce Receive doorbell rate") Signed-off-by: Chuck Lever --- include/linux/sunrpc/svc_rdma.h | 1 - net/sunrpc/xprtrdma/svc_rdma_recvfrom.c | 82 ++++++++++++++++----------------- 2 files changed, 39 insertions(+), 44 deletions(-) (limited to 'net/sunrpc') diff --git a/include/linux/sunrpc/svc_rdma.h b/include/linux/sunrpc/svc_rdma.h index 7c693b31965e..1e76ed688044 100644 --- a/include/linux/sunrpc/svc_rdma.h +++ b/include/linux/sunrpc/svc_rdma.h @@ -104,7 +104,6 @@ struct svcxprt_rdma { wait_queue_head_t sc_send_wait; /* SQ exhaustion waitlist */ unsigned long sc_flags; - u32 sc_pending_recvs; struct list_head sc_read_complete_q; struct work_struct sc_work; diff --git a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c index 6d28f23ceb35..7d34290e2ff8 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c +++ b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c @@ -266,46 +266,33 @@ void svc_rdma_release_rqst(struct svc_rqst *rqstp) svc_rdma_recv_ctxt_put(rdma, ctxt); } -static bool svc_rdma_refresh_recvs(struct svcxprt_rdma *rdma, - unsigned int wanted, bool temp) +static int __svc_rdma_post_recv(struct svcxprt_rdma *rdma, + struct svc_rdma_recv_ctxt *ctxt) { - const struct ib_recv_wr *bad_wr = NULL; - struct svc_rdma_recv_ctxt *ctxt; - struct ib_recv_wr *recv_chain; int ret; - recv_chain = NULL; - while (wanted--) { - ctxt = svc_rdma_recv_ctxt_get(rdma); - if (!ctxt) - break; - - trace_svcrdma_post_recv(ctxt); - ctxt->rc_temp = temp; - ctxt->rc_recv_wr.next = recv_chain; - recv_chain = &ctxt->rc_recv_wr; - rdma->sc_pending_recvs++; - } - if (!recv_chain) - return false; - - ret = ib_post_recv(rdma->sc_qp, recv_chain, &bad_wr); + trace_svcrdma_post_recv(ctxt); + ret = ib_post_recv(rdma->sc_qp, &ctxt->rc_recv_wr, NULL); if (ret) goto err_post; - return true; + return 0; err_post: - while (bad_wr) { - ctxt = container_of(bad_wr, struct svc_rdma_recv_ctxt, - rc_recv_wr); - bad_wr = bad_wr->next; - svc_rdma_recv_ctxt_put(rdma, ctxt); - } - trace_svcrdma_rq_post_err(rdma, ret); - /* Since we're destroying the xprt, no need to reset - * sc_pending_recvs. */ - return false; + svc_rdma_recv_ctxt_put(rdma, ctxt); + return ret; +} + +static int svc_rdma_post_recv(struct svcxprt_rdma *rdma) +{ + struct svc_rdma_recv_ctxt *ctxt; + + if (test_bit(XPT_CLOSE, &rdma->sc_xprt.xpt_flags)) + return 0; + ctxt = svc_rdma_recv_ctxt_get(rdma); + if (!ctxt) + return -ENOMEM; + return __svc_rdma_post_recv(rdma, ctxt); } /** @@ -316,7 +303,20 @@ err_post: */ bool svc_rdma_post_recvs(struct svcxprt_rdma *rdma) { - return svc_rdma_refresh_recvs(rdma, rdma->sc_max_requests, true); + struct svc_rdma_recv_ctxt *ctxt; + unsigned int i; + int ret; + + for (i = 0; i < rdma->sc_max_requests; i++) { + ctxt = svc_rdma_recv_ctxt_get(rdma); + if (!ctxt) + return false; + ctxt->rc_temp = true; + ret = __svc_rdma_post_recv(rdma, ctxt); + if (ret) + return false; + } + return true; } /** @@ -324,6 +324,8 @@ bool svc_rdma_post_recvs(struct svcxprt_rdma *rdma) * @cq: Completion Queue context * @wc: Work Completion object * + * NB: The svc_xprt/svcxprt_rdma is pinned whenever it's possible that + * the Receive completion handler could be running. */ static void svc_rdma_wc_receive(struct ib_cq *cq, struct ib_wc *wc) { @@ -331,8 +333,6 @@ static void svc_rdma_wc_receive(struct ib_cq *cq, struct ib_wc *wc) struct ib_cqe *cqe = wc->wr_cqe; struct svc_rdma_recv_ctxt *ctxt; - rdma->sc_pending_recvs--; - /* WARNING: Only wc->wr_cqe and wc->status are reliable */ ctxt = container_of(cqe, struct svc_rdma_recv_ctxt, rc_cqe); @@ -340,6 +340,9 @@ static void svc_rdma_wc_receive(struct ib_cq *cq, struct ib_wc *wc) if (wc->status != IB_WC_SUCCESS) goto flushed; + if (svc_rdma_post_recv(rdma)) + goto post_err; + /* All wc fields are now known to be valid */ ctxt->rc_byte_len = wc->byte_len; @@ -350,18 +353,11 @@ static void svc_rdma_wc_receive(struct ib_cq *cq, struct ib_wc *wc) spin_unlock(&rdma->sc_rq_dto_lock); if (!test_bit(RDMAXPRT_CONN_PENDING, &rdma->sc_flags)) svc_xprt_enqueue(&rdma->sc_xprt); - - if (!test_bit(XPT_CLOSE, &rdma->sc_xprt.xpt_flags) && - rdma->sc_pending_recvs < rdma->sc_max_requests) - if (!svc_rdma_refresh_recvs(rdma, RPCRDMA_MAX_RECV_BATCH, - false)) - goto post_err; - return; flushed: - svc_rdma_recv_ctxt_put(rdma, ctxt); post_err: + svc_rdma_recv_ctxt_put(rdma, ctxt); set_bit(XPT_CLOSE, &rdma->sc_xprt.xpt_flags); svc_xprt_enqueue(&rdma->sc_xprt); } -- cgit v1.2.3