From 5fb45f95eec682621748b7cb012c6a8f0f981e6a Mon Sep 17 00:00:00 2001 From: Qingfang DENG Date: Thu, 8 Dec 2022 20:35:29 +0800 Subject: netfilter: flowtable: really fix NAT IPv6 offload The for-loop was broken from the start. It translates to: for (i = 0; i < 4; i += 4) which means the loop body runs only once, so only the highest 32 bits of the IPv6 address get mangled. Fix the loop increment. Fixes: 0e07e25b481a ("netfilter: flowtable: fix NAT IPv6 offload mangling") Fixes: 5c27d8d76ce8 ("netfilter: nf_flow_table_offload: add IPv6 support") Signed-off-by: Qingfang DENG Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_flow_table_offload.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/netfilter/nf_flow_table_offload.c b/net/netfilter/nf_flow_table_offload.c index 0fdcdb2c9ae4..4d9b99abe37d 100644 --- a/net/netfilter/nf_flow_table_offload.c +++ b/net/netfilter/nf_flow_table_offload.c @@ -383,12 +383,12 @@ static void flow_offload_ipv6_mangle(struct nf_flow_rule *flow_rule, const __be32 *addr, const __be32 *mask) { struct flow_action_entry *entry; - int i, j; + int i; - for (i = 0, j = 0; i < sizeof(struct in6_addr) / sizeof(u32); i += sizeof(u32), j++) { + for (i = 0; i < sizeof(struct in6_addr) / sizeof(u32); i++) { entry = flow_action_entry_next(flow_rule); flow_offload_mangle(entry, FLOW_ACT_MANGLE_HDR_TYPE_IP6, - offset + i, &addr[j], mask); + offset + i * sizeof(u32), &addr[i], mask); } } -- cgit v1.2.3 From ba57ee0944ff0085652cf8df91f9c571883debe6 Mon Sep 17 00:00:00 2001 From: Li Qiong Date: Mon, 12 Dec 2022 15:43:51 +0800 Subject: ipvs: add a 'default' case in do_ip_vs_set_ctl() It is better to have a 'default' switch case that returns '-EINVAL', in case new commands are added; otherwise an uninitialized value of ret is returned. Signed-off-by: Li Qiong Reviewed-by: Simon Horman Acked-by: Julian Anastasov Signed-off-by: Pablo Neira Ayuso --- net/netfilter/ipvs/ip_vs_ctl.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'net') diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c index 988222fff9f0..97f6a1c8933a 100644 --- a/net/netfilter/ipvs/ip_vs_ctl.c +++ b/net/netfilter/ipvs/ip_vs_ctl.c @@ -2590,6 +2590,11 @@ do_ip_vs_set_ctl(struct sock *sk, int cmd, sockptr_t ptr, unsigned int len) break; case IP_VS_SO_SET_DELDEST: ret = ip_vs_del_dest(svc, &udest); + break; + default: + WARN_ON_ONCE(1); + ret = -EINVAL; + break; } out_unlock: -- cgit v1.2.3 From 3ff8bff704f4de125dca2262e5b5b963a3da1d87 Mon Sep 17 00:00:00 2001 From: Kirill Tkhai Date: Tue, 13 Dec 2022 00:05:53 +0300 Subject: unix: Fix race in SOCK_SEQPACKET's unix_dgram_sendmsg() There is a race as a result of which an alive SOCK_SEQPACKET socket may change its state from TCP_ESTABLISHED to TCP_CLOSE: unix_release_sock(peer) unix_dgram_sendmsg(sk) sock_orphan(peer) sock_set_flag(peer, SOCK_DEAD) sock_alloc_send_pskb() if !(sk->sk_shutdown & SEND_SHUTDOWN) OK if sock_flag(peer, SOCK_DEAD) sk->sk_state = TCP_CLOSE sk->sk_shutdown = SHUTDOWN_MASK After that, socket sk remains almost normal: it is able to connect, listen, accept and recvmsg, while it can't sendmsg. Since this is the only way for an alive SOCK_SEQPACKET socket to change its state like this, we had better fix this strange and potentially dangerous corner case. Note that we will return EPIPE here, as is normally done in sock_alloc_send_pskb(). 
The originally used ECONNREFUSED looks strange, since it's odd to return a specific retval depending on a race inside the kernel, which the user cannot influence. Also, move the TCP_CLOSE assignment for SOCK_DGRAM sockets under the state lock to fix a race with unix_dgram_connect(): unix_dgram_connect(other) unix_dgram_sendmsg(sk) unix_peer(sk) = NULL unix_state_unlock(sk) unix_state_double_lock(sk, other) sk->sk_state = TCP_ESTABLISHED unix_peer(sk) = other unix_state_double_unlock(sk, other) sk->sk_state = TCP_CLOSE This patch fixes both of these races. Fixes: 83301b5367a9 ("af_unix: Set TCP_ESTABLISHED for datagram sockets too") Signed-off-by: Kirill Tkhai Link: https://lore.kernel.org/r/135fda25-22d5-837a-782b-ceee50e19844@ya.ru Signed-off-by: Paolo Abeni --- net/unix/af_unix.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index ede2b2a140a4..f0c2293f1d3b 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -1999,13 +1999,20 @@ restart_locked: unix_state_lock(sk); err = 0; - if (unix_peer(sk) == other) { + if (sk->sk_type == SOCK_SEQPACKET) { + /* We are here only when racing with unix_release_sock() + * is clearing @other. Never change state to TCP_CLOSE + * unlike SOCK_DGRAM wants. + */ + unix_state_unlock(sk); + err = -EPIPE; + } else if (unix_peer(sk) == other) { unix_peer(sk) = NULL; unix_dgram_peer_wake_disconnect_wakeup(sk, other); + sk->sk_state = TCP_CLOSE; unix_state_unlock(sk); - sk->sk_state = TCP_CLOSE; unix_dgram_disconnected(sk, other); sock_put(other); err = -ECONNREFUSED; -- cgit v1.2.3 From b4cafb3d2c740f8d1b1234b43ac4a60e5291c960 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Wed, 14 Dec 2022 18:01:00 -0800 Subject: devlink: hold region lock when flushing snapshots Netdevsim triggers a splat on reload, when it destroys regions with snapshots pending: WARNING: CPU: 1 PID: 787 at net/core/devlink.c:6291 devlink_region_snapshot_del+0x12e/0x140 CPU: 1 PID: 787 Comm: devlink Not tainted 6.1.0-07460-g7ae9888d6e1c #580 RIP: 0010:devlink_region_snapshot_del+0x12e/0x140 Call Trace: devl_region_destroy+0x70/0x140 nsim_dev_reload_down+0x2f/0x60 [netdevsim] devlink_reload+0x1f7/0x360 devlink_nl_cmd_reload+0x6ce/0x860 genl_family_rcv_msg_doit.isra.0+0x145/0x1c0 This is the locking assert in devlink_region_snapshot_del(); we're supposed to be holding the region->snapshot_lock here. Fixes: 2dec18ad826f ("net: devlink: remove region snapshots list dependency on devlink->lock") Signed-off-by: Jakub Kicinski Reviewed-by: Jiri Pirko Signed-off-by: David S. Miller --- net/core/devlink.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'net') diff --git a/net/core/devlink.c b/net/core/devlink.c index 6004bd0ccee4..d2df30829083 100644 --- a/net/core/devlink.c +++ b/net/core/devlink.c @@ -11925,8 +11925,10 @@ void devl_region_destroy(struct devlink_region *region) devl_assert_locked(devlink); /* Free all snapshots of region */ + mutex_lock(&region->snapshot_lock); list_for_each_entry_safe(snapshot, ts, &region->snapshot_list, list) devlink_region_snapshot_del(region, snapshot); + mutex_unlock(&region->snapshot_lock); list_del(&region->list); mutex_destroy(&region->snapshot_lock); -- cgit v1.2.3 From 68bb10101e6b0a6bb44e9c908ef795fc4af99eae Mon Sep 17 00:00:00 2001 From: Eelco Chaudron Date: Thu, 15 Dec 2022 15:46:33 +0100 Subject: openvswitch: Fix flow lookup to use unmasked key The commit mentioned below causes the ovs_flow_tbl_lookup() function to be called with the masked key. 
However, it's supposed to be called with the unmasked key. This is due to the fact that the datapath supports installing wider flows, and OVS relies on this behavior. For example, if ipv4(src=1.1.1.1/192.0.0.0, dst=1.1.1.2/192.0.0.0) exists, a wider flow (smaller mask) of ipv4(src=192.1.1.1/128.0.0.0,dst=192.1.1.2/128.0.0.0) is allowed to be added. However, if we try to add a wildcard rule, the installation fails: $ ovs-appctl dpctl/add-flow system@myDP "in_port(1),eth_type(0x0800), \ ipv4(src=1.1.1.1/192.0.0.0,dst=1.1.1.2/192.0.0.0,frag=no)" 2 $ ovs-appctl dpctl/add-flow system@myDP "in_port(1),eth_type(0x0800), \ ipv4(src=192.1.1.1/0.0.0.0,dst=49.1.1.2/0.0.0.0,frag=no)" 2 ovs-vswitchd: updating flow table (File exists) The reason is that the key used to determine whether the flow is already present in the system is the original key ANDed with the mask. This results in the IP address not being part of the (miniflow) key, i.e., being substituted with an all-zero value. When doing the actual lookup, this results in the key wrongfully matching the first flow, and therefore the flow does not get installed. This change reverts the commit below, but rather than having the key on the stack, it is allocated. Fixes: 190aa3e77880 ("openvswitch: Fix Frame-size larger than 1024 bytes warning.") Signed-off-by: Eelco Chaudron Signed-off-by: David S. Miller --- net/openvswitch/datapath.c | 25 ++++++++++++++++--------- 1 file changed, 16 insertions(+), 9 deletions(-) (limited to 'net') diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c index 932bcf766d63..9ca721c9fa71 100644 --- a/net/openvswitch/datapath.c +++ b/net/openvswitch/datapath.c @@ -973,6 +973,7 @@ static int ovs_flow_cmd_new(struct sk_buff *skb, struct genl_info *info) struct sw_flow_mask mask; struct sk_buff *reply; struct datapath *dp; + struct sw_flow_key *key; struct sw_flow_actions *acts; struct sw_flow_match match; u32 ufid_flags = ovs_nla_get_ufid_flags(a[OVS_FLOW_ATTR_UFID_FLAGS]); @@ -1000,24 +1001,26 @@ static int ovs_flow_cmd_new(struct sk_buff *skb, struct genl_info *info) } /* Extract key. */ - ovs_match_init(&match, &new_flow->key, false, &mask); + key = kzalloc(sizeof(*key), GFP_KERNEL); + if (!key) { + error = -ENOMEM; + goto err_kfree_key; + } + + ovs_match_init(&match, key, false, &mask); error = ovs_nla_get_match(net, &match, a[OVS_FLOW_ATTR_KEY], a[OVS_FLOW_ATTR_MASK], log); if (error) goto err_kfree_flow; + ovs_flow_mask_key(&new_flow->key, key, true, &mask); + /* Extract flow identifier. */ error = ovs_nla_get_identifier(&new_flow->id, a[OVS_FLOW_ATTR_UFID], - &new_flow->key, log); + key, log); if (error) goto err_kfree_flow; - /* unmasked key is needed to match when ufid is not used. */ - if (ovs_identifier_is_key(&new_flow->id)) - match.key = new_flow->id.unmasked_key; - - ovs_flow_mask_key(&new_flow->key, &new_flow->key, true, &mask); - /* Validate actions. 
*/ error = ovs_nla_copy_actions(net, a[OVS_FLOW_ATTR_ACTIONS], &new_flow->key, &acts, log); @@ -1044,7 +1047,7 @@ static int ovs_flow_cmd_new(struct sk_buff *skb, struct genl_info *info) if (ovs_identifier_is_ufid(&new_flow->id)) flow = ovs_flow_tbl_lookup_ufid(&dp->table, &new_flow->id); if (!flow) - flow = ovs_flow_tbl_lookup(&dp->table, &new_flow->key); + flow = ovs_flow_tbl_lookup(&dp->table, key); if (likely(!flow)) { rcu_assign_pointer(new_flow->sf_acts, acts); @@ -1114,6 +1117,8 @@ static int ovs_flow_cmd_new(struct sk_buff *skb, struct genl_info *info) if (reply) ovs_notify(&dp_flow_genl_family, reply, info); + + kfree(key); return 0; err_unlock_ovs: @@ -1123,6 +1128,8 @@ err_kfree_acts: ovs_nla_free_flow_actions(acts); err_kfree_flow: ovs_flow_free(new_flow, false); +err_kfree_key: + kfree(key); error: return error; } -- cgit v1.2.3 From 214964a13ab56a9757d146b79b468a7ca190fbfb Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Thu, 15 Dec 2022 20:41:22 -0800 Subject: devlink: protect devlink dump by the instance lock Take the instance lock around devlink_nl_fill() when dumping; doit takes it already. We are only dumping basic info, so in the worst case we were risking data races around the reload statistics. Until the big devlink mutex was removed, all relevant code was protected by it, so the missing instance lock was not exposed. Fixes: d3efc2a6a6d8 ("net: devlink: remove devlink_mutex") Reviewed-by: Jiri Pirko Reviewed-by: Jacob Keller Link: https://lore.kernel.org/r/20221216044122.1863550-1-kuba@kernel.org Signed-off-by: Jakub Kicinski --- net/core/devlink.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'net') diff --git a/net/core/devlink.c b/net/core/devlink.c index d2df30829083..032d6d0a5ce6 100644 --- a/net/core/devlink.c +++ b/net/core/devlink.c @@ -1648,10 +1648,13 @@ static int devlink_nl_cmd_get_dumpit(struct sk_buff *msg, continue; } + devl_lock(devlink); err = devlink_nl_fill(msg, devlink, DEVLINK_CMD_NEW, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, NLM_F_MULTI); + devl_unlock(devlink); devlink_put(devlink); + if (err) goto out; idx++; -- cgit v1.2.3 From 2d7afdcbc9d32423f177ee12b7c93783aea338fb Mon Sep 17 00:00:00 2001 From: Subash Abhinov Kasiviswanathan Date: Wed, 14 Dec 2022 23:11:58 -0700 Subject: skbuff: Account for tail adjustment during pull operations Extending the tail can have some unexpected side effects if a program uses a helper like BPF_FUNC_skb_pull_data to read partial content beyond the head skb headlen when all the skbs in the gso frag_list are linear with no head_frag: kernel BUG at net/core/skbuff.c:4219! pc : skb_segment+0xcf4/0xd2c lr : skb_segment+0x63c/0xd2c Call trace: skb_segment+0xcf4/0xd2c __udp_gso_segment+0xa4/0x544 udp4_ufo_fragment+0x184/0x1c0 inet_gso_segment+0x16c/0x3a4 skb_mac_gso_segment+0xd4/0x1b0 __skb_gso_segment+0xcc/0x12c udp_rcv_segment+0x54/0x16c udp_queue_rcv_skb+0x78/0x144 udp_unicast_rcv_skb+0x8c/0xa4 __udp4_lib_rcv+0x490/0x68c udp_rcv+0x20/0x30 ip_protocol_deliver_rcu+0x1b0/0x33c ip_local_deliver+0xd8/0x1f0 ip_rcv+0x98/0x1a4 deliver_ptype_list_skb+0x98/0x1ec __netif_receive_skb_core+0x978/0xc60 Fix this by marking these skbs as GSO_DODGY so segmentation can handle the tail updates accordingly. 
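[Editor's note: a minimal sketch of the kind of tc BPF program that can trigger this path. The program and section names are illustrative, not taken from the report; bpf_skb_pull_data() and TC_ACT_OK are the real interfaces involved. Requesting more linear bytes than the head skb holds is what drives __pskb_pull_tail() into the frag_list:]

#include <linux/bpf.h>
#include <linux/pkt_cls.h>
#include <bpf/bpf_helpers.h>

SEC("tc")
int pull_past_headlen(struct __sk_buff *skb)
{
	/* Ask for the whole packet as linear data; on a GSO skb this
	 * reads partial content beyond the head skb's headlen.
	 */
	bpf_skb_pull_data(skb, skb->len);
	return TC_ACT_OK;
}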
Fixes: 3dcbdb134f32 ("net: gso: Fix skb_segment splat when splitting gso_size mangled skb having linear-headed frag_list") Signed-off-by: Sean Tranchetti Signed-off-by: Subash Abhinov Kasiviswanathan Reviewed-by: Alexander Duyck Link: https://lore.kernel.org/r/1671084718-24796-1-git-send-email-quic_subashab@quicinc.com Signed-off-by: Jakub Kicinski --- net/core/skbuff.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'net') diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 3cbba7099c0f..4a0eb5593275 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -2482,6 +2482,9 @@ void *__pskb_pull_tail(struct sk_buff *skb, int delta) insp = list; } else { /* Eaten partially. */ + if (skb_is_gso(skb) && !list->head_frag && + skb_headlen(list)) + skb_shinfo(skb)->gso_type |= SKB_GSO_DODGY; if (skb_shared(list)) { /* Sucks! We need to fork list. :-( */ -- cgit v1.2.3 From 9cd3fd2054c3b3055163accbf2f31a4426f10317 Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Sat, 17 Dec 2022 14:17:07 -0800 Subject: net_sched: reject TCF_EM_SIMPLE case for complex ematch module When TCF_EM_SIMPLE was introduced, it was supposed to be convenient for ematch implementations: https://lore.kernel.org/all/20050105110048.GO26856@postel.suug.ch/ "You don't have to, providing a 32bit data chunk without TCF_EM_SIMPLE set will simply result in allocating & copy. It's an optimization, nothing more." So if an ematch module provides ops->datalen, that means it wants a complex data structure (saved in its em->data) instead of a simple u32 value. We should simply reject such a combination, otherwise this u32 could be misinterpreted as a pointer. Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Reported-and-tested-by: syzbot+4caeae4c7103813598ae@syzkaller.appspotmail.com Reported-by: Jun Nie Cc: Jamal Hadi Salim Cc: Paolo Abeni Signed-off-by: Cong Wang Acked-by: Paolo Abeni Signed-off-by: David S. Miller --- net/sched/ematch.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'net') diff --git a/net/sched/ematch.c b/net/sched/ematch.c index 4ce681361851..5c1235e6076a 100644 --- a/net/sched/ematch.c +++ b/net/sched/ematch.c @@ -255,6 +255,8 @@ static int tcf_em_validate(struct tcf_proto *tp, * the value carried. */ if (em_hdr->flags & TCF_EM_SIMPLE) { + if (em->ops->datalen > 0) + goto errout; if (data_len < sizeof(u32)) goto errout; em->data = *(u32 *) data; -- cgit v1.2.3 From 4feb2c44629e6f9b459b41a5a60491069d346a95 Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 15 Dec 2022 16:19:47 +0000 Subject: rxrpc: Fix missing unlock in rxrpc_do_sendmsg() One of the error paths in rxrpc_do_sendmsg() doesn't unlock the call mutex before returning. Fix it to do this. Note that this still doesn't get rid of the checker warning: ../net/rxrpc/sendmsg.c:617:5: warning: context imbalance in 'rxrpc_do_sendmsg' - wrong count at exit I think the interplay between the socket lock and the call's user_mutex may be too complicated for the checker to analyse, especially as rxrpc_new_client_call_for_sendmsg(), which it calls, returns with the call's user_mutex held if successful but unconditionally drops the socket lock. Fixes: e754eba685aa ("rxrpc: Provide a cmsg to specify the amount of Tx data for a call") Signed-off-by: David Howells cc: Marc Dionne cc: linux-afs@lists.infradead.org Signed-off-by: David S. 
Miller --- net/rxrpc/sendmsg.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/rxrpc/sendmsg.c b/net/rxrpc/sendmsg.c index 9fa7e37f7155..cde1e65f16b4 100644 --- a/net/rxrpc/sendmsg.c +++ b/net/rxrpc/sendmsg.c @@ -625,7 +625,7 @@ int rxrpc_do_sendmsg(struct rxrpc_sock *rx, struct msghdr *msg, size_t len) if (call->tx_total_len != -1 || call->tx_pending || call->tx_top != 0) - goto error_put; + goto out_put_unlock; call->tx_total_len = p.call.tx_total_len; } } -- cgit v1.2.3 From fdb99487b0189f0ef883e353ad7484c78a8bd425 Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 15 Dec 2022 16:19:56 +0000 Subject: rxrpc: Fix security setting propagation Fix the propagation of the security settings from sendmsg to the rxrpc_call struct. Fixes: f3441d4125fc ("rxrpc: Copy client call parameters into rxrpc_call earlier") Signed-off-by: David Howells cc: Marc Dionne cc: linux-afs@lists.infradead.org Signed-off-by: David S. Miller --- net/rxrpc/call_object.c | 1 + net/rxrpc/conn_client.c | 2 -- net/rxrpc/security.c | 6 +++--- 3 files changed, 4 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/rxrpc/call_object.c b/net/rxrpc/call_object.c index be5eb8cdf549..89dcf60b1158 100644 --- a/net/rxrpc/call_object.c +++ b/net/rxrpc/call_object.c @@ -217,6 +217,7 @@ static struct rxrpc_call *rxrpc_alloc_client_call(struct rxrpc_sock *rx, call->tx_total_len = p->tx_total_len; call->key = key_get(cp->key); call->local = rxrpc_get_local(cp->local, rxrpc_local_get_call); + call->security_level = cp->security_level; if (p->kernel) __set_bit(RXRPC_CALL_KERNEL, &call->flags); if (cp->upgrade) diff --git a/net/rxrpc/conn_client.c b/net/rxrpc/conn_client.c index a08e33c9e54b..87efa0373aed 100644 --- a/net/rxrpc/conn_client.c +++ b/net/rxrpc/conn_client.c @@ -551,8 +551,6 @@ static void rxrpc_activate_one_channel(struct rxrpc_connection *conn, call->conn = rxrpc_get_connection(conn, rxrpc_conn_get_activate_call); call->cid = conn->proto.cid | channel; call->call_id = call_id; - call->security = conn->security; - call->security_ix = conn->security_ix; call->dest_srx.srx_service = conn->service_id; trace_rxrpc_connect_call(call); diff --git a/net/rxrpc/security.c b/net/rxrpc/security.c index 209f2c25a0da..ab968f65a490 100644 --- a/net/rxrpc/security.c +++ b/net/rxrpc/security.c @@ -67,13 +67,13 @@ const struct rxrpc_security *rxrpc_security_lookup(u8 security_index) */ int rxrpc_init_client_call_security(struct rxrpc_call *call) { - const struct rxrpc_security *sec; + const struct rxrpc_security *sec = &rxrpc_no_security; struct rxrpc_key_token *token; struct key *key = call->key; int ret; if (!key) - return 0; + goto found; ret = key_validate(key); if (ret < 0) @@ -88,7 +88,7 @@ int rxrpc_init_client_call_security(struct rxrpc_call *call) found: call->security = sec; - _leave(" = 0"); + call->security_ix = sec->security_index; return 0; } -- cgit v1.2.3 From eaa02390adb03b82f04babebf0cdd233793aecf5 Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 15 Dec 2022 16:20:04 +0000 Subject: rxrpc: Fix NULL deref in rxrpc_unuse_local() Fix rxrpc_unuse_local() to get the debug_id *after* checking to see if local is NULL. Fixes: a2cf3264f331 ("rxrpc: Fold __rxrpc_unuse_local() into rxrpc_unuse_local()") Reported-by: syzbot+3538a6a72efa8b059c38@syzkaller.appspotmail.com Signed-off-by: David Howells Tested-by: syzbot+3538a6a72efa8b059c38@syzkaller.appspotmail.com cc: Marc Dionne cc: linux-afs@lists.infradead.org Signed-off-by: David S. 
Miller --- net/rxrpc/local_object.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/rxrpc/local_object.c b/net/rxrpc/local_object.c index 44222923c0d1..24ee585d9aaf 100644 --- a/net/rxrpc/local_object.c +++ b/net/rxrpc/local_object.c @@ -357,10 +357,11 @@ struct rxrpc_local *rxrpc_use_local(struct rxrpc_local *local, */ void rxrpc_unuse_local(struct rxrpc_local *local, enum rxrpc_local_trace why) { - unsigned int debug_id = local->debug_id; + unsigned int debug_id; int r, u; if (local) { + debug_id = local->debug_id; r = refcount_read(&local->ref); u = atomic_dec_return(&local->active_users); trace_rxrpc_local(debug_id, why, r, u); -- cgit v1.2.3 From 8fbcc83334a7b5b42b6bc1fae2458bf25eb57768 Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 15 Dec 2022 16:20:13 +0000 Subject: rxrpc: Fix I/O thread startup getting skipped When starting a kthread, the __kthread_create_on_node() function, as called from kthread_run(), waits for a completion to indicate that the task_struct (or failure state) of the new kernel thread is available before continuing. This does not wait, however, for the thread function to be invoked and, indeed, will skip it if kthread_stop() gets called before it gets there. If this happens, though, kthread_run() will have returned successfully, indicating that the thread was started and returning the task_struct pointer. The actual error indication is returned by kthread_stop(). Note that this is ambiguous, as the caller cannot tell whether the -EINTR error code came from kthread() or from the thread function. This was encountered in the new rxrpc I/O thread, where if the system is being pounded hard by, say, syzbot, the check of KTHREAD_SHOULD_STOP can be delayed long enough for kthread_stop() to get called when rxrpc releases a socket - and this causes an oops because the I/O thread function doesn't get started and thus doesn't remove the rxrpc_local struct from the local_endpoints list. Fix this by using a completion to wait for the thread to actually enter rxrpc_io_thread(). This makes sure the thread can't be prematurely stopped and makes sure the relied-upon cleanup is done. Fixes: a275da62e8c1 ("rxrpc: Create a per-local endpoint receive queue and I/O thread") Reported-by: syzbot+3538a6a72efa8b059c38@syzkaller.appspotmail.com Signed-off-by: David Howells cc: Marc Dionne cc: Hillf Danton Link: https://lore.kernel.org/r/000000000000229f1505ef2b6159@google.com/ Signed-off-by: David S. 
Miller --- net/rxrpc/ar-internal.h | 1 + net/rxrpc/io_thread.c | 2 ++ net/rxrpc/local_object.c | 2 ++ 3 files changed, 5 insertions(+) (limited to 'net') diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h index e7dccab7b741..37f3aec784cc 100644 --- a/net/rxrpc/ar-internal.h +++ b/net/rxrpc/ar-internal.h @@ -287,6 +287,7 @@ struct rxrpc_local { struct hlist_node link; struct socket *socket; /* my UDP socket */ struct task_struct *io_thread; + struct completion io_thread_ready; /* Indication that the I/O thread started */ struct rxrpc_sock __rcu *service; /* Service(s) listening on this endpoint */ struct rw_semaphore defrag_sem; /* control re-enablement of IP DF bit */ struct sk_buff_head rx_queue; /* Received packets */ diff --git a/net/rxrpc/io_thread.c b/net/rxrpc/io_thread.c index d83ae3193032..e460e4151c16 100644 --- a/net/rxrpc/io_thread.c +++ b/net/rxrpc/io_thread.c @@ -426,6 +426,8 @@ int rxrpc_io_thread(void *data) struct rxrpc_call *call; struct sk_buff *skb; + complete(&local->io_thread_ready); + skb_queue_head_init(&rx_queue); set_user_nice(current, MIN_NICE); diff --git a/net/rxrpc/local_object.c b/net/rxrpc/local_object.c index 24ee585d9aaf..270b63d8f37a 100644 --- a/net/rxrpc/local_object.c +++ b/net/rxrpc/local_object.c @@ -97,6 +97,7 @@ static struct rxrpc_local *rxrpc_alloc_local(struct rxrpc_net *rxnet, local->rxnet = rxnet; INIT_HLIST_NODE(&local->link); init_rwsem(&local->defrag_sem); + init_completion(&local->io_thread_ready); skb_queue_head_init(&local->rx_queue); INIT_LIST_HEAD(&local->call_attend_q); local->client_bundles = RB_ROOT; @@ -189,6 +190,7 @@ static int rxrpc_open_socket(struct rxrpc_local *local, struct net *net) goto error_sock; } + wait_for_completion(&local->io_thread_ready); local->io_thread = io_thread; _leave(" = 0"); return 0; -- cgit v1.2.3 From 608aecd16a31269485e2980898029dd01b03a73e Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 15 Dec 2022 16:20:21 +0000 Subject: rxrpc: Fix locking issues in rxrpc_put_peer_locked() Now that rxrpc_put_local() may call kthread_stop(), it can't be called under spinlock as it might sleep. This can cause a problem in the peer keepalive code in rxrpc as it tries to avoid dropping the peer_hash_lock from the point it needs to re-add peer->keepalive_link to going round the loop again in rxrpc_peer_keepalive_dispatch(). Fix this by just dropping the lock when we don't need it and accepting that we'll have to take it again. This code is only called about every 20s for each peer, so not very often. This allows rxrpc_put_peer_unlocked() to be removed also. If triggered, this bug produces an oops like the following, as reproduced by a syzbot reproducer for a different oops[1]: BUG: sleeping function called from invalid context at kernel/sched/completion.c:101 ... RCU nest depth: 0, expected: 0 3 locks held by kworker/u9:0/50: #0: ffff88810e74a138 ((wq_completion)krxrpcd){+.+.}-{0:0}, at: process_one_work+0x294/0x636 #1: ffff8881013a7e20 ((work_completion)(&rxnet->peer_keepalive_work)){+.+.}-{0:0}, at: process_one_work+0x294/0x636 #2: ffff88817d366390 (&rxnet->peer_hash_lock){+.+.}-{2:2}, at: rxrpc_peer_keepalive_dispatch+0x2bd/0x35f ... 
Call Trace: dump_stack_lvl+0x4c/0x5f __might_resched+0x2cf/0x2f2 __wait_for_common+0x87/0x1e8 kthread_stop+0x14d/0x255 rxrpc_peer_keepalive_dispatch+0x333/0x35f rxrpc_peer_keepalive_worker+0x2e9/0x449 process_one_work+0x3c1/0x636 worker_thread+0x25f/0x359 kthread+0x1a6/0x1b5 ret_from_fork+0x1f/0x30 Fixes: a275da62e8c1 ("rxrpc: Create a per-local endpoint receive queue and I/O thread") Signed-off-by: David Howells cc: Marc Dionne cc: linux-afs@lists.infradead.org Link: https://lore.kernel.org/r/0000000000002b4a9f05ef2b616f@google.com/ [1] Signed-off-by: David S. Miller --- net/rxrpc/ar-internal.h | 1 - net/rxrpc/peer_event.c | 10 +++++++--- net/rxrpc/peer_object.c | 19 ------------------- 3 files changed, 7 insertions(+), 23 deletions(-) (limited to 'net') diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h index 37f3aec784cc..5b732a4af009 100644 --- a/net/rxrpc/ar-internal.h +++ b/net/rxrpc/ar-internal.h @@ -1073,7 +1073,6 @@ void rxrpc_destroy_all_peers(struct rxrpc_net *); struct rxrpc_peer *rxrpc_get_peer(struct rxrpc_peer *, enum rxrpc_peer_trace); struct rxrpc_peer *rxrpc_get_peer_maybe(struct rxrpc_peer *, enum rxrpc_peer_trace); void rxrpc_put_peer(struct rxrpc_peer *, enum rxrpc_peer_trace); -void rxrpc_put_peer_locked(struct rxrpc_peer *, enum rxrpc_peer_trace); /* * proc.c diff --git a/net/rxrpc/peer_event.c b/net/rxrpc/peer_event.c index 6685bf917aa6..552ba84a255c 100644 --- a/net/rxrpc/peer_event.c +++ b/net/rxrpc/peer_event.c @@ -235,6 +235,7 @@ static void rxrpc_peer_keepalive_dispatch(struct rxrpc_net *rxnet, struct rxrpc_peer *peer; const u8 mask = ARRAY_SIZE(rxnet->peer_keepalive) - 1; time64_t keepalive_at; + bool use; int slot; spin_lock(&rxnet->peer_hash_lock); @@ -247,9 +248,10 @@ static void rxrpc_peer_keepalive_dispatch(struct rxrpc_net *rxnet, if (!rxrpc_get_peer_maybe(peer, rxrpc_peer_get_keepalive)) continue; - if (__rxrpc_use_local(peer->local, rxrpc_local_use_peer_keepalive)) { - spin_unlock(&rxnet->peer_hash_lock); + use = __rxrpc_use_local(peer->local, rxrpc_local_use_peer_keepalive); + spin_unlock(&rxnet->peer_hash_lock); + if (use) { keepalive_at = peer->last_tx_at + RXRPC_KEEPALIVE_TIME; slot = keepalive_at - base; _debug("%02x peer %u t=%d {%pISp}", @@ -270,9 +272,11 @@ static void rxrpc_peer_keepalive_dispatch(struct rxrpc_net *rxnet, spin_lock(&rxnet->peer_hash_lock); list_add_tail(&peer->keepalive_link, &rxnet->peer_keepalive[slot & mask]); + spin_unlock(&rxnet->peer_hash_lock); rxrpc_unuse_local(peer->local, rxrpc_local_unuse_peer_keepalive); } - rxrpc_put_peer_locked(peer, rxrpc_peer_put_keepalive); + rxrpc_put_peer(peer, rxrpc_peer_put_keepalive); + spin_lock(&rxnet->peer_hash_lock); } spin_unlock(&rxnet->peer_hash_lock); diff --git a/net/rxrpc/peer_object.c b/net/rxrpc/peer_object.c index 608946dcc505..82de295393a0 100644 --- a/net/rxrpc/peer_object.c +++ b/net/rxrpc/peer_object.c @@ -438,25 +438,6 @@ void rxrpc_put_peer(struct rxrpc_peer *peer, enum rxrpc_peer_trace why) } } -/* - * Drop a ref on a peer record where the caller already holds the - * peer_hash_lock. - */ -void rxrpc_put_peer_locked(struct rxrpc_peer *peer, enum rxrpc_peer_trace why) -{ - unsigned int debug_id = peer->debug_id; - bool dead; - int r; - - dead = __refcount_dec_and_test(&peer->ref, &r); - trace_rxrpc_peer(debug_id, r - 1, why); - if (dead) { - hash_del_rcu(&peer->hash_link); - list_del_init(&peer->keepalive_link); - rxrpc_free_peer(peer); - } -} - /* * Make sure all peer records have been discarded. 
*/ -- cgit v1.2.3 From c838f1a73d77abadb0810eff0e150ac88fef3da5 Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 15 Dec 2022 16:20:30 +0000 Subject: rxrpc: Fix switched parameters in peer tracing Fix the switched parameters on rxrpc_alloc_peer() and rxrpc_get_peer(). The ref argument and the why argument got mixed up. Fixes: 47c810a79844 ("rxrpc: trace: Don't use __builtin_return_address for rxrpc_peer tracing") Signed-off-by: David Howells cc: Marc Dionne cc: linux-afs@lists.infradead.org Signed-off-by: David S. Miller --- include/trace/events/rxrpc.h | 2 +- net/rxrpc/peer_object.c | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/include/trace/events/rxrpc.h b/include/trace/events/rxrpc.h index 049b52e7aa6a..c6cfed00d0c6 100644 --- a/include/trace/events/rxrpc.h +++ b/include/trace/events/rxrpc.h @@ -471,7 +471,7 @@ TRACE_EVENT(rxrpc_peer, TP_STRUCT__entry( __field(unsigned int, peer ) __field(int, ref ) - __field(int, why ) + __field(enum rxrpc_peer_trace, why ) ), TP_fast_assign( diff --git a/net/rxrpc/peer_object.c b/net/rxrpc/peer_object.c index 82de295393a0..4eecea2be307 100644 --- a/net/rxrpc/peer_object.c +++ b/net/rxrpc/peer_object.c @@ -226,7 +226,7 @@ struct rxrpc_peer *rxrpc_alloc_peer(struct rxrpc_local *local, gfp_t gfp, rxrpc_peer_init_rtt(peer); peer->cong_ssthresh = RXRPC_TX_MAX_WINDOW; - trace_rxrpc_peer(peer->debug_id, why, 1); + trace_rxrpc_peer(peer->debug_id, 1, why); } _leave(" = %p", peer); @@ -382,7 +382,7 @@ struct rxrpc_peer *rxrpc_get_peer(struct rxrpc_peer *peer, enum rxrpc_peer_trace int r; __refcount_inc(&peer->ref, &r); - trace_rxrpc_peer(peer->debug_id, why, r + 1); + trace_rxrpc_peer(peer->debug_id, r + 1, why); return peer; } -- cgit v1.2.3 From 743d1768a008c8eae56ead497c9ba8237b14ee81 Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 15 Dec 2022 16:20:38 +0000 Subject: rxrpc: Fix I/O thread stop The rxrpc I/O thread checks to see if there's any work it needs to do, and if not, checks kthread_should_stop() before scheduling, and if it should stop, breaks out of the loop and tries to clean up and exit. This can, however, race with socket destruction, wherein outstanding calls are aborted and released from the socket and then the socket unuses the local endpoint, causing kthread_stop() to be issued. The abort is deferred to the I/O thread and the event can be issued between the I/O thread checking if there's any work to be done (such as processing call aborts) and the stop being seen. This results in the I/O thread stopping processing of events whilst call cleanup events are still outstanding, leading to connections or other objects still being around and uncleaned up, which can result in assertions being triggered, e.g.: rxrpc: AF_RXRPC: Leaked client conn 00000000e8009865 {2} ------------[ cut here ]------------ kernel BUG at net/rxrpc/conn_client.c:64! Fix this by retrieving the kthread_should_stop() indication, then checking to see if there's more work to do, and going back round the loop if there is, and breaking out of the loop only if there wasn't. This was triggered by a syzbot test that produced some other symptom[1]. Fixes: a275da62e8c1 ("rxrpc: Create a per-local endpoint receive queue and I/O thread") Signed-off-by: David Howells cc: Marc Dionne cc: linux-afs@lists.infradead.org Link: https://lore.kernel.org/r/0000000000002b4a9f05ef2b616f@google.com/ [1] Signed-off-by: David S. 
Miller --- net/rxrpc/io_thread.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/rxrpc/io_thread.c b/net/rxrpc/io_thread.c index e460e4151c16..e6b9f0ceae17 100644 --- a/net/rxrpc/io_thread.c +++ b/net/rxrpc/io_thread.c @@ -425,6 +425,7 @@ int rxrpc_io_thread(void *data) struct rxrpc_local *local = data; struct rxrpc_call *call; struct sk_buff *skb; + bool should_stop; complete(&local->io_thread_ready); @@ -478,13 +479,14 @@ int rxrpc_io_thread(void *data) } set_current_state(TASK_INTERRUPTIBLE); + should_stop = kthread_should_stop(); if (!skb_queue_empty(&local->rx_queue) || !list_empty(&local->call_attend_q)) { __set_current_state(TASK_RUNNING); continue; } - if (kthread_should_stop()) + if (should_stop) break; schedule(); } -- cgit v1.2.3 From 11e1706bc84f60040578056f8cef3d0139b92dda Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 15 Dec 2022 16:20:46 +0000 Subject: rxrpc: rxperf: Fix uninitialised variable Dan Carpenter sayeth[1]: The patch 75bfdbf2fca3: "rxrpc: Implement an in-kernel rxperf server for testing purposes" from Nov 3, 2022, leads to the following Smatch static checker warning: net/rxrpc/rxperf.c:337 rxperf_deliver_to_call() error: uninitialized symbol 'ret'. Fix this by initialising ret to 0. The value is only used for tracing purposes in the rxperf server. Fixes: 75bfdbf2fca3 ("rxrpc: Implement an in-kernel rxperf server for testing purposes") Reported-by: Dan Carpenter Signed-off-by: David Howells cc: Marc Dionne cc: linux-afs@lists.infradead.org Link: http://lists.infradead.org/pipermail/linux-afs/2022-December/006124.html [1] Signed-off-by: David S. Miller --- net/rxrpc/rxperf.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/rxrpc/rxperf.c b/net/rxrpc/rxperf.c index 66f5eea291ff..d33a109e846c 100644 --- a/net/rxrpc/rxperf.c +++ b/net/rxrpc/rxperf.c @@ -275,7 +275,7 @@ static void rxperf_deliver_to_call(struct work_struct *work) struct rxperf_call *call = container_of(work, struct rxperf_call, work); enum rxperf_call_state state; u32 abort_code, remote_abort = 0; - int ret; + int ret = 0; if (call->state == RXPERF_CALL_COMPLETE) return; -- cgit v1.2.3 From 31d35a02ad5b803354fe0727686fcbace7a343fe Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 15 Dec 2022 16:20:55 +0000 Subject: rxrpc: Fix the return value of rxrpc_new_incoming_call() Dan Carpenter sayeth[1]: The patch 5e6ef4f1017c: "rxrpc: Make the I/O thread take over the call and local processor work" from Jan 23, 2020, leads to the following Smatch static checker warning: net/rxrpc/io_thread.c:283 rxrpc_input_packet() warn: bool is not less than zero. Fix this (for now) by changing rxrpc_new_incoming_call() to return an int with 0 or error code rather than bool. Note that the actual return value of rxrpc_input_packet() is currently ignored. I have a separate patch to clean that up. Fixes: 5e6ef4f1017c ("rxrpc: Make the I/O thread take over the call and local processor work") Reported-by: Dan Carpenter Signed-off-by: David Howells cc: Marc Dionne cc: linux-afs@lists.infradead.org Link: http://lists.infradead.org/pipermail/linux-afs/2022-December/006123.html [1] Signed-off-by: David S. 
Miller --- net/rxrpc/ar-internal.h | 6 +++--- net/rxrpc/call_accept.c | 18 +++++++++--------- net/rxrpc/io_thread.c | 4 ++-- 3 files changed, 14 insertions(+), 14 deletions(-) (limited to 'net') diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h index 5b732a4af009..18092526d3c8 100644 --- a/net/rxrpc/ar-internal.h +++ b/net/rxrpc/ar-internal.h @@ -812,9 +812,9 @@ extern struct workqueue_struct *rxrpc_workqueue; */ int rxrpc_service_prealloc(struct rxrpc_sock *, gfp_t); void rxrpc_discard_prealloc(struct rxrpc_sock *); -bool rxrpc_new_incoming_call(struct rxrpc_local *, struct rxrpc_peer *, - struct rxrpc_connection *, struct sockaddr_rxrpc *, - struct sk_buff *); +int rxrpc_new_incoming_call(struct rxrpc_local *, struct rxrpc_peer *, + struct rxrpc_connection *, struct sockaddr_rxrpc *, + struct sk_buff *); void rxrpc_accept_incoming_calls(struct rxrpc_local *); int rxrpc_user_charge_accept(struct rxrpc_sock *, unsigned long); diff --git a/net/rxrpc/call_accept.c b/net/rxrpc/call_accept.c index d1850863507f..c02401656fa9 100644 --- a/net/rxrpc/call_accept.c +++ b/net/rxrpc/call_accept.c @@ -326,11 +326,11 @@ static struct rxrpc_call *rxrpc_alloc_incoming_call(struct rxrpc_sock *rx, * If we want to report an error, we mark the skb with the packet type and * abort code and return false. */ -bool rxrpc_new_incoming_call(struct rxrpc_local *local, - struct rxrpc_peer *peer, - struct rxrpc_connection *conn, - struct sockaddr_rxrpc *peer_srx, - struct sk_buff *skb) +int rxrpc_new_incoming_call(struct rxrpc_local *local, + struct rxrpc_peer *peer, + struct rxrpc_connection *conn, + struct sockaddr_rxrpc *peer_srx, + struct sk_buff *skb) { const struct rxrpc_security *sec = NULL; struct rxrpc_skb_priv *sp = rxrpc_skb(skb); @@ -342,7 +342,7 @@ bool rxrpc_new_incoming_call(struct rxrpc_local *local, /* Don't set up a call for anything other than the first DATA packet. */ if (sp->hdr.seq != 1 || sp->hdr.type != RXRPC_PACKET_TYPE_DATA) - return true; /* Just discard */ + return 0; /* Just discard */ rcu_read_lock(); @@ -413,7 +413,7 @@ bool rxrpc_new_incoming_call(struct rxrpc_local *local, _leave(" = %p{%d}", call, call->debug_id); rxrpc_input_call_event(call, skb); rxrpc_put_call(call, rxrpc_call_put_input); - return true; + return 0; unsupported_service: trace_rxrpc_abort(0, "INV", sp->hdr.cid, sp->hdr.callNumber, sp->hdr.seq, @@ -425,10 +425,10 @@ no_call: reject: rcu_read_unlock(); _leave(" = f [%u]", skb->mark); - return false; + return -EPROTO; discard: rcu_read_unlock(); - return true; + return 0; } /* diff --git a/net/rxrpc/io_thread.c b/net/rxrpc/io_thread.c index e6b9f0ceae17..1ad067d66fb6 100644 --- a/net/rxrpc/io_thread.c +++ b/net/rxrpc/io_thread.c @@ -292,7 +292,7 @@ protocol_error: skb->mark = RXRPC_SKB_MARK_REJECT_ABORT; reject_packet: rxrpc_reject_packet(local, skb); - return ret; + return 0; } /* @@ -384,7 +384,7 @@ static int rxrpc_input_packet_on_conn(struct rxrpc_connection *conn, if (rxrpc_to_client(sp)) goto bad_message; if (rxrpc_new_incoming_call(conn->local, conn->peer, conn, - peer_srx, skb)) + peer_srx, skb) == 0) return 0; goto reject_packet; } -- cgit v1.2.3 From e0c8bccd40fc1c19e1d246c39bcf79e357e1ada3 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 16 Dec 2022 16:29:17 +0000 Subject: net: stream: purge sk_error_queue in sk_stream_kill_queues() Changheon Lee reported TCP socket leaks, with a nice repro. It seems we leak TCP sockets with the following sequence: 1) SOF_TIMESTAMPING_TX_ACK is enabled on the socket. 
Each ACK will cook an skb that is put in the error queue, from __skb_tstamp_tx(). __skb_tstamp_tx() uses skb_clone(), unless SOF_TIMESTAMPING_OPT_TSONLY was also requested. 2) If the application is also using MSG_ZEROCOPY, then we put in the error queue cloned skbs that had a struct ubuf_info attached to them. Whenever a struct ubuf_info is allocated, sock_zerocopy_alloc() does a sock_hold(). As long as the cloned skbs are still in sk_error_queue, the socket refcount is kept elevated. 3) Application closes the socket, while the error queue is not empty. Since tcp_close() no longer purges the socket error queue, we might end up with a TCP socket with at least one skb in the error queue keeping the socket alive forever. This bug can be (ab)used to consume all kernel memory and freeze the host. We need to purge the error queue, with proper synchronization against concurrent writers. Fixes: 24bcbe1cc69f ("net: stream: don't purge sk_error_queue in sk_stream_kill_queues()") Reported-by: Changheon Lee Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/stream.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'net') diff --git a/net/core/stream.c b/net/core/stream.c index 5b1fe2b82eac..cd06750dd329 100644 --- a/net/core/stream.c +++ b/net/core/stream.c @@ -196,6 +196,12 @@ void sk_stream_kill_queues(struct sock *sk) /* First the read buffer. */ __skb_queue_purge(&sk->sk_receive_queue); + /* Next, the error queue. + * We need to use queue lock, because other threads might + * add packets to the queue without socket lock being held. + */ + skb_queue_purge(&sk->sk_error_queue); + /* Next, the write queue. */ WARN_ON_ONCE(!skb_queue_empty(&sk->sk_write_queue)); -- cgit v1.2.3 From b389a902dd5be4ece505a2e0463b9b034de04bf5 Mon Sep 17 00:00:00 2001 From: Matt Johnston Date: Thu, 15 Dec 2022 13:49:33 +0800 Subject: mctp: Remove device type check at unregister The unregister check could be incorrectly triggered if a netdev changes its type after registration. That is possible for a tun device using the TUNSETLINK ioctl, resulting in mctp unregister failing and the netdev unregister waiting forever. This was encountered by https://github.com/openthread/openthread/issues/8523 Neither the check at register nor the one at unregister is required. They were added in an attempt to track down mctp_ptr being set unexpectedly, which should not happen in normal operation. Fixes: 7b1871af75f3 ("mctp: Warn if pointer is set for a wrong dev type") Signed-off-by: Matt Johnston Link: https://lore.kernel.org/r/20221215054933.2403401-1-matt@codeconstruct.com.au Signed-off-by: Jakub Kicinski --- net/mctp/device.c | 14 +------------- 1 file changed, 1 insertion(+), 13 deletions(-) (limited to 'net') diff --git a/net/mctp/device.c b/net/mctp/device.c index 99a3bda8852f..acb97b257428 100644 --- a/net/mctp/device.c +++ b/net/mctp/device.c @@ -429,12 +429,6 @@ static void mctp_unregister(struct net_device *dev) struct mctp_dev *mdev; mdev = mctp_dev_get_rtnl(dev); - if (mdev && !mctp_known(dev)) { - // Sanity check, should match what was set in mctp_register - netdev_warn(dev, "%s: BUG mctp_ptr set for unknown type %d", - __func__, dev->type); - return; - } if (!mdev) return; @@ -451,14 +445,8 @@ static int mctp_register(struct net_device *dev) struct mctp_dev *mdev; /* Already registered? 
*/ - mdev = rtnl_dereference(dev->mctp_ptr); - - if (mdev) { - if (!mctp_known(dev)) - netdev_warn(dev, "%s: BUG mctp_ptr set for unknown type %d", - __func__, dev->type); + if (rtnl_dereference(dev->mctp_ptr)) return 0; - } /* only register specific types */ if (!mctp_known(dev)) -- cgit v1.2.3 From fb87bd47516d9a26b6d549231aa743b20fd4a569 Mon Sep 17 00:00:00 2001 From: Guillaume Nault Date: Fri, 16 Dec 2022 07:45:26 -0500 Subject: net: Introduce sk_use_task_frag in struct sock. Sockets that can be used while recursing into memory reclaim, like those used by network block devices and file systems, mustn't use current->task_frag: if the current process is already using it, then the inner memory reclaim call would corrupt the task_frag structure. To avoid this, sk_page_frag() uses ->sk_allocation to detect sockets that mustn't use current->task_frag, assuming that those used during memory reclaim had their allocation constraints reflected in ->sk_allocation. This unfortunately doesn't cover all cases: in an attempt to remove all usage of GFP_NOFS and GFP_NOIO, sunrpc stopped setting these flags in ->sk_allocation, and used memalloc_nofs critical sections instead. This breaks the sk_page_frag() heuristic since the allocation constraints are now stored in current->flags, which sk_page_frag() can't read without risking triggering a cache miss and slowing down TCP's fast path. This patch creates a new field in struct sock, named sk_use_task_frag, which sockets with memory reclaim constraints can set to false if they can't safely use current->task_frag. In such cases, sk_page_frag() now always returns the socket's page_frag (->sk_frag). The first user is sunrpc, which needs to avoid using current->task_frag but can keep ->sk_allocation set to GFP_KERNEL otherwise. Eventually, it might be possible to simplify sk_page_frag() by testing only ->sk_use_task_frag, avoiding reliance on the ->sk_allocation heuristic entirely (assuming other sockets will set ->sk_use_task_frag according to their constraints in the future). The new ->sk_use_task_frag field is placed in a hole in struct sock and belongs to a cache line shared with ->sk_shutdown. Therefore it should be hot and shouldn't have negative performance impacts on TCP's fast path (sk_shutdown is tested just before the while() loop in tcp_sendmsg_locked()). Link: https://lore.kernel.org/netdev/b4d8cb09c913d3e34f853736f3f5628abfd7f4b6.1656699567.git.gnault@redhat.com/ Signed-off-by: Guillaume Nault Reviewed-by: Benjamin Coddington Signed-off-by: Jakub Kicinski --- include/net/sock.h | 11 +++++++++-- net/core/sock.c | 1 + 2 files changed, 10 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/include/net/sock.h b/include/net/sock.h index ecea3dcc2217..fefe1f4abf19 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -318,6 +318,9 @@ struct sk_filter; * @sk_stamp: time stamp of last packet received * @sk_stamp_seq: lock for accessing sk_stamp on 32 bit architectures only * @sk_tsflags: SO_TIMESTAMPING flags + * @sk_use_task_frag: allow sk_page_frag() to use current->task_frag. + * Sockets that can be used under memory reclaim should + * set this to false. 
* @sk_bind_phc: SO_TIMESTAMPING bind PHC index of PTP virtual clock * for timestamping * @sk_tskey: counter to disambiguate concurrent tstamp requests @@ -512,6 +515,7 @@ struct sock { u8 sk_txtime_deadline_mode : 1, sk_txtime_report_errors : 1, sk_txtime_unused : 6; + bool sk_use_task_frag; struct socket *sk_socket; void *sk_user_data; @@ -2561,14 +2565,17 @@ static inline void sk_stream_moderate_sndbuf(struct sock *sk) * socket operations and end up recursing into sk_page_frag() * while it's already in use: explicitly avoid task page_frag * usage if the caller is potentially doing any of them. - * This assumes that page fault handlers use the GFP_NOFS flags. + * This assumes that page fault handlers use the GFP_NOFS flags or + * explicitly disable sk_use_task_frag. * * Return: a per task page_frag if context allows that, * otherwise a per socket one. */ static inline struct page_frag *sk_page_frag(struct sock *sk) { - if ((sk->sk_allocation & (__GFP_DIRECT_RECLAIM | __GFP_MEMALLOC | __GFP_FS)) == + if (sk->sk_use_task_frag && + (sk->sk_allocation & (__GFP_DIRECT_RECLAIM | __GFP_MEMALLOC | + __GFP_FS)) == (__GFP_DIRECT_RECLAIM | __GFP_FS)) return &current->task_frag; diff --git a/net/core/sock.c b/net/core/sock.c index d2587d8712db..f954d5893e79 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -3390,6 +3390,7 @@ void sock_init_data(struct socket *sock, struct sock *sk) sk->sk_rcvbuf = READ_ONCE(sysctl_rmem_default); sk->sk_sndbuf = READ_ONCE(sysctl_wmem_default); sk->sk_state = TCP_CLOSE; + sk->sk_use_task_frag = true; sk_set_socket(sk, sock); sock_set_flag(sk, SOCK_ZAPPED); -- cgit v1.2.3 From 98123866fcf3fe95a0c1b198ef122dfdbd351916 Mon Sep 17 00:00:00 2001 From: Benjamin Coddington Date: Fri, 16 Dec 2022 07:45:27 -0500 Subject: Treewide: Stop corrupting socket's task_frag MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Since moving to memalloc_nofs_save/restore, SUNRPC has stopped setting the GFP_NOIO flag on sk_allocation which the networking system uses to decide when it is safe to use current->task_frag. The results of this are unexpected corruption in task_frag when SUNRPC is involved in memory reclaim. The corruption can be seen in crashes, but the root cause is often difficult to ascertain as a crashing machine's stack trace will have no evidence of being near NFS or SUNRPC code. I believe this problem to be much more pervasive than reports to the community may indicate. Fix this by having kernel users of sockets that may corrupt task_frag due to reclaim set sk_use_task_frag = false. Preemptively correcting this situation for users that still set sk_allocation allows them to convert to memalloc_nofs_save/restore without the same unexpected corruptions that are sure to follow, unlikely to show up in testing, and difficult to bisect. CC: Philipp Reisner CC: Lars Ellenberg CC: "Christoph Böhmwalder" CC: Jens Axboe CC: Josef Bacik CC: Keith Busch CC: Christoph Hellwig CC: Sagi Grimberg CC: Lee Duncan CC: Chris Leech CC: Mike Christie CC: "James E.J. Bottomley" CC: "Martin K. 
Petersen" CC: Valentina Manea CC: Shuah Khan CC: Greg Kroah-Hartman CC: David Howells CC: Marc Dionne CC: Steve French CC: Christine Caulfield CC: David Teigland CC: Mark Fasheh CC: Joel Becker CC: Joseph Qi CC: Eric Van Hensbergen CC: Latchesar Ionkov CC: Dominique Martinet CC: Ilya Dryomov CC: Xiubo Li CC: Chuck Lever CC: Jeff Layton CC: Trond Myklebust CC: Anna Schumaker CC: Steffen Klassert CC: Herbert Xu Suggested-by: Guillaume Nault Signed-off-by: Benjamin Coddington Reviewed-by: Guillaume Nault Signed-off-by: Jakub Kicinski --- drivers/block/drbd/drbd_receiver.c | 3 +++ drivers/block/nbd.c | 1 + drivers/nvme/host/tcp.c | 1 + drivers/scsi/iscsi_tcp.c | 1 + drivers/usb/usbip/usbip_common.c | 1 + fs/cifs/connect.c | 1 + fs/dlm/lowcomms.c | 2 ++ fs/ocfs2/cluster/tcp.c | 1 + net/9p/trans_fd.c | 1 + net/ceph/messenger.c | 1 + net/sunrpc/xprtsock.c | 3 +++ net/xfrm/espintcp.c | 1 + 12 files changed, 17 insertions(+) (limited to 'net') diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c index 0e58a3187345..757f4692b5bd 100644 --- a/drivers/block/drbd/drbd_receiver.c +++ b/drivers/block/drbd/drbd_receiver.c @@ -1030,6 +1030,9 @@ randomize: sock.socket->sk->sk_allocation = GFP_NOIO; msock.socket->sk->sk_allocation = GFP_NOIO; + sock.socket->sk->sk_use_task_frag = false; + msock.socket->sk->sk_use_task_frag = false; + sock.socket->sk->sk_priority = TC_PRIO_INTERACTIVE_BULK; msock.socket->sk->sk_priority = TC_PRIO_INTERACTIVE; diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c index e379ccc63c52..592cfa8b765a 100644 --- a/drivers/block/nbd.c +++ b/drivers/block/nbd.c @@ -512,6 +512,7 @@ static int sock_xmit(struct nbd_device *nbd, int index, int send, noreclaim_flag = memalloc_noreclaim_save(); do { sock->sk->sk_allocation = GFP_NOIO | __GFP_MEMALLOC; + sock->sk->sk_use_task_frag = false; msg.msg_name = NULL; msg.msg_namelen = 0; msg.msg_control = NULL; diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c index b69b89166b6b..8cedc1ef496c 100644 --- a/drivers/nvme/host/tcp.c +++ b/drivers/nvme/host/tcp.c @@ -1537,6 +1537,7 @@ static int nvme_tcp_alloc_queue(struct nvme_ctrl *nctrl, int qid) queue->sock->sk->sk_rcvtimeo = 10 * HZ; queue->sock->sk->sk_allocation = GFP_ATOMIC; + queue->sock->sk->sk_use_task_frag = false; nvme_tcp_set_queue_io_cpu(queue); queue->request = NULL; queue->data_remaining = 0; diff --git a/drivers/scsi/iscsi_tcp.c b/drivers/scsi/iscsi_tcp.c index 5fb1f364e815..1d1cf641937c 100644 --- a/drivers/scsi/iscsi_tcp.c +++ b/drivers/scsi/iscsi_tcp.c @@ -738,6 +738,7 @@ iscsi_sw_tcp_conn_bind(struct iscsi_cls_session *cls_session, sk->sk_reuse = SK_CAN_REUSE; sk->sk_sndtimeo = 15 * HZ; /* FIXME: make it configurable */ sk->sk_allocation = GFP_ATOMIC; + sk->sk_use_task_frag = false; sk_set_memalloc(sk); sock_no_linger(sk); diff --git a/drivers/usb/usbip/usbip_common.c b/drivers/usb/usbip/usbip_common.c index f8b326eed54d..a2b2da1255dd 100644 --- a/drivers/usb/usbip/usbip_common.c +++ b/drivers/usb/usbip/usbip_common.c @@ -315,6 +315,7 @@ int usbip_recv(struct socket *sock, void *buf, int size) do { sock->sk->sk_allocation = GFP_NOIO; + sock->sk->sk_use_task_frag = false; result = sock_recvmsg(sock, &msg, MSG_WAITALL); if (result <= 0) diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index e80252a83225..7bc7b5e03c51 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -2944,6 +2944,7 @@ generic_ip_connect(struct TCP_Server_Info *server) cifs_dbg(FYI, "Socket created\n"); server->ssocket = socket; socket->sk->sk_allocation = 
GFP_NOFS; + socket->sk->sk_use_task_frag = false; if (sfamily == AF_INET6) cifs_reclassify_socket6(socket); else diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c index 8b80ca0cd65f..4450721ec83c 100644 --- a/fs/dlm/lowcomms.c +++ b/fs/dlm/lowcomms.c @@ -645,6 +645,7 @@ static void add_sock(struct socket *sock, struct connection *con) if (dlm_config.ci_protocol == DLM_PROTO_SCTP) sk->sk_state_change = lowcomms_state_change; sk->sk_allocation = GFP_NOFS; + sk->sk_use_task_frag = false; sk->sk_error_report = lowcomms_error_report; release_sock(sk); } @@ -1769,6 +1770,7 @@ static int dlm_listen_for_all(void) listen_con.sock = sock; sock->sk->sk_allocation = GFP_NOFS; + sock->sk->sk_use_task_frag = false; sock->sk->sk_data_ready = lowcomms_listen_data_ready; release_sock(sock->sk); diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c index 37d222bdfc8c..a07b24d170f2 100644 --- a/fs/ocfs2/cluster/tcp.c +++ b/fs/ocfs2/cluster/tcp.c @@ -1602,6 +1602,7 @@ static void o2net_start_connect(struct work_struct *work) sc->sc_sock = sock; /* freed by sc_kref_release */ sock->sk->sk_allocation = GFP_ATOMIC; + sock->sk->sk_use_task_frag = false; myaddr.sin_family = AF_INET; myaddr.sin_addr.s_addr = mynode->nd_ipv4_address; diff --git a/net/9p/trans_fd.c b/net/9p/trans_fd.c index 07db2f436d44..d9120f14684b 100644 --- a/net/9p/trans_fd.c +++ b/net/9p/trans_fd.c @@ -868,6 +868,7 @@ static int p9_socket_open(struct p9_client *client, struct socket *csocket) } csocket->sk->sk_allocation = GFP_NOIO; + csocket->sk->sk_use_task_frag = false; file = sock_alloc_file(csocket, 0, NULL); if (IS_ERR(file)) { pr_err("%s (%d): failed to map fd\n", diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c index dfa237fbd5a3..1d06e114ba3f 100644 --- a/net/ceph/messenger.c +++ b/net/ceph/messenger.c @@ -446,6 +446,7 @@ int ceph_tcp_connect(struct ceph_connection *con) if (ret) return ret; sock->sk->sk_allocation = GFP_NOFS; + sock->sk->sk_use_task_frag = false; #ifdef CONFIG_LOCKDEP lockdep_set_class(&sock->sk->sk_lock, &socket_class); diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index c0506d0d7478..aaa5b2741b79 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -1882,6 +1882,7 @@ static int xs_local_finish_connecting(struct rpc_xprt *xprt, sk->sk_write_space = xs_udp_write_space; sk->sk_state_change = xs_local_state_change; sk->sk_error_report = xs_error_report; + sk->sk_use_task_frag = false; xprt_clear_connected(xprt); @@ -2082,6 +2083,7 @@ static void xs_udp_finish_connecting(struct rpc_xprt *xprt, struct socket *sock) sk->sk_user_data = xprt; sk->sk_data_ready = xs_data_ready; sk->sk_write_space = xs_udp_write_space; + sk->sk_use_task_frag = false; xprt_set_connected(xprt); @@ -2249,6 +2251,7 @@ static int xs_tcp_finish_connecting(struct rpc_xprt *xprt, struct socket *sock) sk->sk_state_change = xs_tcp_state_change; sk->sk_write_space = xs_tcp_write_space; sk->sk_error_report = xs_error_report; + sk->sk_use_task_frag = false; /* socket options */ sock_reset_flag(sk, SOCK_LINGER); diff --git a/net/xfrm/espintcp.c b/net/xfrm/espintcp.c index d6fece1ed982..74a54295c164 100644 --- a/net/xfrm/espintcp.c +++ b/net/xfrm/espintcp.c @@ -489,6 +489,7 @@ static int espintcp_init_sk(struct sock *sk) /* avoid using task_frag */ sk->sk_allocation = GFP_ATOMIC; + sk->sk_use_task_frag = false; return 0; -- cgit v1.2.3
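[Editor's note: for kernel code creating its own sockets, a minimal sketch of the pattern this series establishes. The function and its name below are illustrative only; sock_create_kern(), sk_allocation and sk_use_task_frag are the real interfaces shown in the patches above:]

#include <linux/net.h>
#include <linux/in.h>
#include <net/sock.h>
#include <net/net_namespace.h>

/* A kernel socket that may be driven from memory-reclaim context
 * (e.g. by a network block device or filesystem) must not borrow
 * current->task_frag; opting out makes sk_page_frag() fall back to
 * the per-socket sk_frag instead.
 */
static int example_open_reclaim_safe_socket(struct net *net,
					    struct socket **res)
{
	struct socket *sock;
	int ret;

	ret = sock_create_kern(net, AF_INET, SOCK_STREAM, IPPROTO_TCP,
			       &sock);
	if (ret < 0)
		return ret;

	sock->sk->sk_allocation = GFP_NOIO;
	sock->sk->sk_use_task_frag = false;

	*res = sock;
	return 0;
}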