From c6a953cce8d0438391e6da48c8d0793d3fbfcfa6 Mon Sep 17 00:00:00 2001 From: Sven Eckelmann Date: Wed, 19 Jul 2023 09:29:29 +0200 Subject: batman-adv: Trigger events for auto adjusted MTU If an interface changes the MTU, it is expected that an NETDEV_PRECHANGEMTU and NETDEV_CHANGEMTU notification events is triggered. This worked fine for .ndo_change_mtu based changes because core networking code took care of it. But for auto-adjustments after hard-interfaces changes, these events were simply missing. Due to this problem, non-batman-adv components weren't aware of MTU changes and thus couldn't perform their own tasks correctly. Fixes: c6c8fea29769 ("net: Add batman-adv meshing protocol") Cc: stable@vger.kernel.org Signed-off-by: Sven Eckelmann Signed-off-by: Simon Wunderlich --- net/batman-adv/hard-interface.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/batman-adv/hard-interface.c b/net/batman-adv/hard-interface.c index 41c1ad33d009..ae5762af0146 100644 --- a/net/batman-adv/hard-interface.c +++ b/net/batman-adv/hard-interface.c @@ -630,7 +630,7 @@ out: */ void batadv_update_min_mtu(struct net_device *soft_iface) { - soft_iface->mtu = batadv_hardif_min_mtu(soft_iface); + dev_set_mtu(soft_iface, batadv_hardif_min_mtu(soft_iface)); /* Check if the local translate table should be cleaned up to match a * new (and smaller) MTU. -- cgit v1.2.3 From d8e42a2b0addf238be8b3b37dcd9795a5c1be459 Mon Sep 17 00:00:00 2001 From: Sven Eckelmann Date: Wed, 19 Jul 2023 10:01:15 +0200 Subject: batman-adv: Don't increase MTU when set by user If the user set an MTU value, it usually means that there are special requirements for the MTU. But if an interface gots activated, the MTU was always recalculated and then the user set value was overwritten. The only reason why this user set value has to be overwritten, is when the MTU has to be decreased because batman-adv is not able to transfer packets with the user specified size. Fixes: c6c8fea29769 ("net: Add batman-adv meshing protocol") Cc: stable@vger.kernel.org Signed-off-by: Sven Eckelmann Signed-off-by: Simon Wunderlich --- net/batman-adv/hard-interface.c | 14 +++++++++++++- net/batman-adv/soft-interface.c | 3 +++ net/batman-adv/types.h | 6 ++++++ 3 files changed, 22 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/batman-adv/hard-interface.c b/net/batman-adv/hard-interface.c index ae5762af0146..24c9c0c3f316 100644 --- a/net/batman-adv/hard-interface.c +++ b/net/batman-adv/hard-interface.c @@ -630,7 +630,19 @@ out: */ void batadv_update_min_mtu(struct net_device *soft_iface) { - dev_set_mtu(soft_iface, batadv_hardif_min_mtu(soft_iface)); + struct batadv_priv *bat_priv = netdev_priv(soft_iface); + int limit_mtu; + int mtu; + + mtu = batadv_hardif_min_mtu(soft_iface); + + if (bat_priv->mtu_set_by_user) + limit_mtu = bat_priv->mtu_set_by_user; + else + limit_mtu = ETH_DATA_LEN; + + mtu = min(mtu, limit_mtu); + dev_set_mtu(soft_iface, mtu); /* Check if the local translate table should be cleaned up to match a * new (and smaller) MTU. diff --git a/net/batman-adv/soft-interface.c b/net/batman-adv/soft-interface.c index d3fdf82282af..85d00dc9ce32 100644 --- a/net/batman-adv/soft-interface.c +++ b/net/batman-adv/soft-interface.c @@ -153,11 +153,14 @@ static int batadv_interface_set_mac_addr(struct net_device *dev, void *p) static int batadv_interface_change_mtu(struct net_device *dev, int new_mtu) { + struct batadv_priv *bat_priv = netdev_priv(dev); + /* check ranges */ if (new_mtu < 68 || new_mtu > batadv_hardif_min_mtu(dev)) return -EINVAL; dev->mtu = new_mtu; + bat_priv->mtu_set_by_user = new_mtu; return 0; } diff --git a/net/batman-adv/types.h b/net/batman-adv/types.h index ca9449ec9836..cf1a0eafe3ab 100644 --- a/net/batman-adv/types.h +++ b/net/batman-adv/types.h @@ -1546,6 +1546,12 @@ struct batadv_priv { /** @soft_iface: net device which holds this struct as private data */ struct net_device *soft_iface; + /** + * @mtu_set_by_user: MTU was set once by user + * protected by rtnl_lock + */ + int mtu_set_by_user; + /** * @bat_counters: mesh internal traffic statistic counters (see * batadv_counters) -- cgit v1.2.3 From eac27a41ab641de074655d2932fc7f8cdb446881 Mon Sep 17 00:00:00 2001 From: Remi Pommarel Date: Fri, 28 Jul 2023 15:38:50 +0200 Subject: batman-adv: Do not get eth header before batadv_check_management_packet If received skb in batadv_v_elp_packet_recv or batadv_v_ogm_packet_recv is either cloned or non linearized then its data buffer will be reallocated by batadv_check_management_packet when skb_cow or skb_linearize get called. Thus geting ethernet header address inside skb data buffer before batadv_check_management_packet had any chance to reallocate it could lead to the following kernel panic: Unable to handle kernel paging request at virtual address ffffff8020ab069a Mem abort info: ESR = 0x96000007 EC = 0x25: DABT (current EL), IL = 32 bits SET = 0, FnV = 0 EA = 0, S1PTW = 0 FSC = 0x07: level 3 translation fault Data abort info: ISV = 0, ISS = 0x00000007 CM = 0, WnR = 0 swapper pgtable: 4k pages, 39-bit VAs, pgdp=0000000040f45000 [ffffff8020ab069a] pgd=180000007fffa003, p4d=180000007fffa003, pud=180000007fffa003, pmd=180000007fefe003, pte=0068000020ab0706 Internal error: Oops: 96000007 [#1] SMP Modules linked in: ahci_mvebu libahci_platform libahci dvb_usb_af9035 dvb_usb_dib0700 dib0070 dib7000m dibx000_common ath11k_pci ath10k_pci ath10k_core mwl8k_new nf_nat_sip nf_conntrack_sip xhci_plat_hcd xhci_hcd nf_nat_pptp nf_conntrack_pptp at24 sbsa_gwdt CPU: 1 PID: 16 Comm: ksoftirqd/1 Not tainted 5.15.42-00066-g3242268d425c-dirty #550 Hardware name: A8k (DT) pstate: 60000005 (nZCv daif -PAN -UAO -TCO -DIT -SSBS BTYPE=--) pc : batadv_is_my_mac+0x60/0xc0 lr : batadv_v_ogm_packet_recv+0x98/0x5d0 sp : ffffff8000183820 x29: ffffff8000183820 x28: 0000000000000001 x27: ffffff8014f9af00 x26: 0000000000000000 x25: 0000000000000543 x24: 0000000000000003 x23: ffffff8020ab0580 x22: 0000000000000110 x21: ffffff80168ae880 x20: 0000000000000000 x19: ffffff800b561000 x18: 0000000000000000 x17: 0000000000000000 x16: 0000000000000000 x15: 00dc098924ae0032 x14: 0f0405433e0054b0 x13: ffffffff00000080 x12: 0000004000000001 x11: 0000000000000000 x10: 0000000000000000 x9 : 0000000000000000 x8 : 0000000000000000 x7 : ffffffc076dae000 x6 : ffffff8000183700 x5 : ffffffc00955e698 x4 : ffffff80168ae000 x3 : ffffff80059cf000 x2 : ffffff800b561000 x1 : ffffff8020ab0696 x0 : ffffff80168ae880 Call trace: batadv_is_my_mac+0x60/0xc0 batadv_v_ogm_packet_recv+0x98/0x5d0 batadv_batman_skb_recv+0x1b8/0x244 __netif_receive_skb_core.isra.0+0x440/0xc74 __netif_receive_skb_one_core+0x14/0x20 netif_receive_skb+0x68/0x140 br_pass_frame_up+0x70/0x80 br_handle_frame_finish+0x108/0x284 br_handle_frame+0x190/0x250 __netif_receive_skb_core.isra.0+0x240/0xc74 __netif_receive_skb_list_core+0x6c/0x90 netif_receive_skb_list_internal+0x1f4/0x310 napi_complete_done+0x64/0x1d0 gro_cell_poll+0x7c/0xa0 __napi_poll+0x34/0x174 net_rx_action+0xf8/0x2a0 _stext+0x12c/0x2ac run_ksoftirqd+0x4c/0x7c smpboot_thread_fn+0x120/0x210 kthread+0x140/0x150 ret_from_fork+0x10/0x20 Code: f9403844 eb03009f 54fffee1 f94 Thus ethernet header address should only be fetched after batadv_check_management_packet has been called. Fixes: 0da0035942d4 ("batman-adv: OGMv2 - add basic infrastructure") Cc: stable@vger.kernel.org Signed-off-by: Remi Pommarel Signed-off-by: Sven Eckelmann Signed-off-by: Simon Wunderlich --- net/batman-adv/bat_v_elp.c | 3 ++- net/batman-adv/bat_v_ogm.c | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/batman-adv/bat_v_elp.c b/net/batman-adv/bat_v_elp.c index acff565849ae..1d704574e6bf 100644 --- a/net/batman-adv/bat_v_elp.c +++ b/net/batman-adv/bat_v_elp.c @@ -505,7 +505,7 @@ int batadv_v_elp_packet_recv(struct sk_buff *skb, struct batadv_priv *bat_priv = netdev_priv(if_incoming->soft_iface); struct batadv_elp_packet *elp_packet; struct batadv_hard_iface *primary_if; - struct ethhdr *ethhdr = (struct ethhdr *)skb_mac_header(skb); + struct ethhdr *ethhdr; bool res; int ret = NET_RX_DROP; @@ -513,6 +513,7 @@ int batadv_v_elp_packet_recv(struct sk_buff *skb, if (!res) goto free_skb; + ethhdr = eth_hdr(skb); if (batadv_is_my_mac(bat_priv, ethhdr->h_source)) goto free_skb; diff --git a/net/batman-adv/bat_v_ogm.c b/net/batman-adv/bat_v_ogm.c index e710e9afe78f..84eac41d4658 100644 --- a/net/batman-adv/bat_v_ogm.c +++ b/net/batman-adv/bat_v_ogm.c @@ -985,7 +985,7 @@ int batadv_v_ogm_packet_recv(struct sk_buff *skb, { struct batadv_priv *bat_priv = netdev_priv(if_incoming->soft_iface); struct batadv_ogm2_packet *ogm_packet; - struct ethhdr *ethhdr = eth_hdr(skb); + struct ethhdr *ethhdr; int ogm_offset; u8 *packet_pos; int ret = NET_RX_DROP; @@ -999,6 +999,7 @@ int batadv_v_ogm_packet_recv(struct sk_buff *skb, if (!batadv_check_management_packet(skb, if_incoming, BATADV_OGM2_HLEN)) goto free_skb; + ethhdr = eth_hdr(skb); if (batadv_is_my_mac(bat_priv, ethhdr->h_source)) goto free_skb; -- cgit v1.2.3 From d25ddb7e788d34cf27ff1738d11a87cb4b67d446 Mon Sep 17 00:00:00 2001 From: Remi Pommarel Date: Fri, 4 Aug 2023 11:39:36 +0200 Subject: batman-adv: Fix TT global entry leak when client roamed back When a client roamed back to a node before it got time to destroy the pending local entry (i.e. within the same originator interval) the old global one is directly removed from hash table and left as such. But because this entry had an extra reference taken at lookup (i.e using batadv_tt_global_hash_find) there is no way its memory will be reclaimed at any time causing the following memory leak: unreferenced object 0xffff0000073c8000 (size 18560): comm "softirq", pid 0, jiffies 4294907738 (age 228.644s) hex dump (first 32 bytes): 06 31 ac 12 c7 7a 05 00 01 00 00 00 00 00 00 00 .1...z.......... 2c ad be 08 00 80 ff ff 6c b6 be 08 00 80 ff ff ,.......l....... backtrace: [<00000000ee6e0ffa>] kmem_cache_alloc+0x1b4/0x300 [<000000000ff2fdbc>] batadv_tt_global_add+0x700/0xe20 [<00000000443897c7>] _batadv_tt_update_changes+0x21c/0x790 [<000000005dd90463>] batadv_tt_update_changes+0x3c/0x110 [<00000000a2d7fc57>] batadv_tt_tvlv_unicast_handler_v1+0xafc/0xe10 [<0000000011793f2a>] batadv_tvlv_containers_process+0x168/0x2b0 [<00000000b7cbe2ef>] batadv_recv_unicast_tvlv+0xec/0x1f4 [<0000000042aef1d8>] batadv_batman_skb_recv+0x25c/0x3a0 [<00000000bbd8b0a2>] __netif_receive_skb_core.isra.0+0x7a8/0xe90 [<000000004033d428>] __netif_receive_skb_one_core+0x64/0x74 [<000000000f39a009>] __netif_receive_skb+0x48/0xe0 [<00000000f2cd8888>] process_backlog+0x174/0x344 [<00000000507d6564>] __napi_poll+0x58/0x1f4 [<00000000b64ef9eb>] net_rx_action+0x504/0x590 [<00000000056fa5e4>] _stext+0x1b8/0x418 [<00000000878879d6>] run_ksoftirqd+0x74/0xa4 unreferenced object 0xffff00000bae1a80 (size 56): comm "softirq", pid 0, jiffies 4294910888 (age 216.092s) hex dump (first 32 bytes): 00 78 b1 0b 00 00 ff ff 0d 50 00 00 00 00 00 00 .x.......P...... 00 00 00 00 00 00 00 00 50 c8 3c 07 00 00 ff ff ........P.<..... backtrace: [<00000000ee6e0ffa>] kmem_cache_alloc+0x1b4/0x300 [<00000000d9aaa49e>] batadv_tt_global_add+0x53c/0xe20 [<00000000443897c7>] _batadv_tt_update_changes+0x21c/0x790 [<000000005dd90463>] batadv_tt_update_changes+0x3c/0x110 [<00000000a2d7fc57>] batadv_tt_tvlv_unicast_handler_v1+0xafc/0xe10 [<0000000011793f2a>] batadv_tvlv_containers_process+0x168/0x2b0 [<00000000b7cbe2ef>] batadv_recv_unicast_tvlv+0xec/0x1f4 [<0000000042aef1d8>] batadv_batman_skb_recv+0x25c/0x3a0 [<00000000bbd8b0a2>] __netif_receive_skb_core.isra.0+0x7a8/0xe90 [<000000004033d428>] __netif_receive_skb_one_core+0x64/0x74 [<000000000f39a009>] __netif_receive_skb+0x48/0xe0 [<00000000f2cd8888>] process_backlog+0x174/0x344 [<00000000507d6564>] __napi_poll+0x58/0x1f4 [<00000000b64ef9eb>] net_rx_action+0x504/0x590 [<00000000056fa5e4>] _stext+0x1b8/0x418 [<00000000878879d6>] run_ksoftirqd+0x74/0xa4 Releasing the extra reference from batadv_tt_global_hash_find even at roam back when batadv_tt_global_free is called fixes this memory leak. Cc: stable@vger.kernel.org Fixes: 068ee6e204e1 ("batman-adv: roaming handling mechanism redesign") Signed-off-by: Remi Pommarel Signed-off-by; Sven Eckelmann Signed-off-by: Simon Wunderlich --- net/batman-adv/translation-table.c | 1 - 1 file changed, 1 deletion(-) (limited to 'net') diff --git a/net/batman-adv/translation-table.c b/net/batman-adv/translation-table.c index 36ca31252a73..b95c36765d04 100644 --- a/net/batman-adv/translation-table.c +++ b/net/batman-adv/translation-table.c @@ -774,7 +774,6 @@ check_roaming: if (roamed_back) { batadv_tt_global_free(bat_priv, tt_global, "Roaming canceled"); - tt_global = NULL; } else { /* The global entry has to be marked as ROAMING and * has to be kept for consistency purpose -- cgit v1.2.3 From 421d467dc2d483175bad4fb76a31b9e5a3d744cf Mon Sep 17 00:00:00 2001 From: Remi Pommarel Date: Wed, 9 Aug 2023 17:29:13 +0200 Subject: batman-adv: Fix batadv_v_ogm_aggr_send memory leak When batadv_v_ogm_aggr_send is called for an inactive interface, the skb is silently dropped by batadv_v_ogm_send_to_if() but never freed causing the following memory leak: unreferenced object 0xffff00000c164800 (size 512): comm "kworker/u8:1", pid 2648, jiffies 4295122303 (age 97.656s) hex dump (first 32 bytes): 00 80 af 09 00 00 ff ff e1 09 00 00 75 01 60 83 ............u.`. 1f 00 00 00 b8 00 00 00 15 00 05 00 da e3 d3 64 ...............d backtrace: [<0000000007ad20f6>] __kmalloc_track_caller+0x1a8/0x310 [<00000000d1029e55>] kmalloc_reserve.constprop.0+0x70/0x13c [<000000008b9d4183>] __alloc_skb+0xec/0x1fc [<00000000c7af5051>] __netdev_alloc_skb+0x48/0x23c [<00000000642ee5f5>] batadv_v_ogm_aggr_send+0x50/0x36c [<0000000088660bd7>] batadv_v_ogm_aggr_work+0x24/0x40 [<0000000042fc2606>] process_one_work+0x3b0/0x610 [<000000002f2a0b1c>] worker_thread+0xa0/0x690 [<0000000059fae5d4>] kthread+0x1fc/0x210 [<000000000c587d3a>] ret_from_fork+0x10/0x20 Free the skb in that case to fix this leak. Cc: stable@vger.kernel.org Fixes: 0da0035942d4 ("batman-adv: OGMv2 - add basic infrastructure") Signed-off-by: Remi Pommarel Signed-off-by: Sven Eckelmann Signed-off-by: Simon Wunderlich --- net/batman-adv/bat_v_ogm.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/batman-adv/bat_v_ogm.c b/net/batman-adv/bat_v_ogm.c index 84eac41d4658..e503ee0d896b 100644 --- a/net/batman-adv/bat_v_ogm.c +++ b/net/batman-adv/bat_v_ogm.c @@ -123,8 +123,10 @@ static void batadv_v_ogm_send_to_if(struct sk_buff *skb, { struct batadv_priv *bat_priv = netdev_priv(hard_iface->soft_iface); - if (hard_iface->if_status != BATADV_IF_ACTIVE) + if (hard_iface->if_status != BATADV_IF_ACTIVE) { + kfree_skb(skb); return; + } batadv_inc_counter(bat_priv, BATADV_CNT_MGMT_TX); batadv_add_counter(bat_priv, BATADV_CNT_MGMT_TX_BYTES, -- cgit v1.2.3 From 2ebbc9752d06bb1d01201fe632cb6da033b0248d Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Thu, 17 Aug 2023 14:52:40 +0200 Subject: devlink: add missing unregister linecard notification Cited fixes commit introduced linecard notifications for register, however it didn't add them for unregister. Fix that by adding them. Fixes: c246f9b5fd61 ("devlink: add support to create line card and expose to user") Signed-off-by: Jiri Pirko Reviewed-by: Simon Horman Link: https://lore.kernel.org/r/20230817125240.2144794-1-jiri@resnulli.us Signed-off-by: Jakub Kicinski --- net/devlink/leftover.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'net') diff --git a/net/devlink/leftover.c b/net/devlink/leftover.c index 1f00f874471f..bfed7929a904 100644 --- a/net/devlink/leftover.c +++ b/net/devlink/leftover.c @@ -6704,6 +6704,7 @@ void devlink_notify_unregister(struct devlink *devlink) struct devlink_param_item *param_item; struct devlink_trap_item *trap_item; struct devlink_port *devlink_port; + struct devlink_linecard *linecard; struct devlink_rate *rate_node; struct devlink_region *region; unsigned long port_index; @@ -6732,6 +6733,8 @@ void devlink_notify_unregister(struct devlink *devlink) xa_for_each(&devlink->ports, port_index, devlink_port) devlink_port_notify(devlink_port, DEVLINK_CMD_PORT_DEL); + list_for_each_entry_reverse(linecard, &devlink->linecard_list, list) + devlink_linecard_notify(linecard, DEVLINK_CMD_LINECARD_DEL); devlink_notify(devlink, DEVLINK_CMD_DEL); } -- cgit v1.2.3 From 76f33296d2e09f63118db78125c95ef56df438e9 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 18 Aug 2023 01:51:32 +0000 Subject: sock: annotate data-races around prot->memory_pressure *prot->memory_pressure is read/writen locklessly, we need to add proper annotations. A recent commit added a new race, it is time to audit all accesses. Fixes: 2d0c88e84e48 ("sock: Fix misuse of sk_under_memory_pressure()") Fixes: 4d93df0abd50 ("[SCTP]: Rewrite of sctp buffer management code") Signed-off-by: Eric Dumazet Cc: Abel Wu Reviewed-by: Shakeel Butt Link: https://lore.kernel.org/r/20230818015132.2699348-1-edumazet@google.com Signed-off-by: Jakub Kicinski --- include/net/sock.h | 7 ++++--- net/sctp/socket.c | 2 +- 2 files changed, 5 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/include/net/sock.h b/include/net/sock.h index e3d987b2ef12..690e22139543 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -1323,6 +1323,7 @@ struct proto { /* * Pressure flag: try to collapse. * Technical note: it is used by multiple contexts non atomically. + * Make sure to use READ_ONCE()/WRITE_ONCE() for all reads/writes. * All the __sk_mem_schedule() is of this nature: accounting * is strict, actions are advisory and have some latency. */ @@ -1423,7 +1424,7 @@ static inline bool sk_has_memory_pressure(const struct sock *sk) static inline bool sk_under_global_memory_pressure(const struct sock *sk) { return sk->sk_prot->memory_pressure && - !!*sk->sk_prot->memory_pressure; + !!READ_ONCE(*sk->sk_prot->memory_pressure); } static inline bool sk_under_memory_pressure(const struct sock *sk) @@ -1435,7 +1436,7 @@ static inline bool sk_under_memory_pressure(const struct sock *sk) mem_cgroup_under_socket_pressure(sk->sk_memcg)) return true; - return !!*sk->sk_prot->memory_pressure; + return !!READ_ONCE(*sk->sk_prot->memory_pressure); } static inline long @@ -1512,7 +1513,7 @@ proto_memory_pressure(struct proto *prot) { if (!prot->memory_pressure) return false; - return !!*prot->memory_pressure; + return !!READ_ONCE(*prot->memory_pressure); } diff --git a/net/sctp/socket.c b/net/sctp/socket.c index 9388d98aebc0..6da738f60f4b 100644 --- a/net/sctp/socket.c +++ b/net/sctp/socket.c @@ -99,7 +99,7 @@ struct percpu_counter sctp_sockets_allocated; static void sctp_enter_memory_pressure(struct sock *sk) { - sctp_memory_pressure = 1; + WRITE_ONCE(sctp_memory_pressure, 1); } -- cgit v1.2.3 From cba3f1786916063261e3e5ccbb803abc325b24ef Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 18 Aug 2023 01:58:20 +0000 Subject: dccp: annotate data-races in dccp_poll() We changed tcp_poll() over time, bug never updated dccp. Note that we also could remove dccp instead of maintaining it. Fixes: 7c657876b63c ("[DCCP]: Initial implementation") Signed-off-by: Eric Dumazet Link: https://lore.kernel.org/r/20230818015820.2701595-1-edumazet@google.com Signed-off-by: Jakub Kicinski --- net/dccp/proto.c | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) (limited to 'net') diff --git a/net/dccp/proto.c b/net/dccp/proto.c index 4e3266e4d7c3..fcc5c9d64f46 100644 --- a/net/dccp/proto.c +++ b/net/dccp/proto.c @@ -315,11 +315,15 @@ EXPORT_SYMBOL_GPL(dccp_disconnect); __poll_t dccp_poll(struct file *file, struct socket *sock, poll_table *wait) { - __poll_t mask; struct sock *sk = sock->sk; + __poll_t mask; + u8 shutdown; + int state; sock_poll_wait(file, sock, wait); - if (sk->sk_state == DCCP_LISTEN) + + state = inet_sk_state_load(sk); + if (state == DCCP_LISTEN) return inet_csk_listen_poll(sk); /* Socket is not locked. We are protected from async events @@ -328,20 +332,21 @@ __poll_t dccp_poll(struct file *file, struct socket *sock, */ mask = 0; - if (sk->sk_err) + if (READ_ONCE(sk->sk_err)) mask = EPOLLERR; + shutdown = READ_ONCE(sk->sk_shutdown); - if (sk->sk_shutdown == SHUTDOWN_MASK || sk->sk_state == DCCP_CLOSED) + if (shutdown == SHUTDOWN_MASK || state == DCCP_CLOSED) mask |= EPOLLHUP; - if (sk->sk_shutdown & RCV_SHUTDOWN) + if (shutdown & RCV_SHUTDOWN) mask |= EPOLLIN | EPOLLRDNORM | EPOLLRDHUP; /* Connected? */ - if ((1 << sk->sk_state) & ~(DCCPF_REQUESTING | DCCPF_RESPOND)) { + if ((1 << state) & ~(DCCPF_REQUESTING | DCCPF_RESPOND)) { if (atomic_read(&sk->sk_rmem_alloc) > 0) mask |= EPOLLIN | EPOLLRDNORM; - if (!(sk->sk_shutdown & SEND_SHUTDOWN)) { + if (!(shutdown & SEND_SHUTDOWN)) { if (sk_stream_is_writeable(sk)) { mask |= EPOLLOUT | EPOLLWRNORM; } else { /* send SIGIO later */ @@ -359,7 +364,6 @@ __poll_t dccp_poll(struct file *file, struct socket *sock, } return mask; } - EXPORT_SYMBOL_GPL(dccp_poll); int dccp_ioctl(struct sock *sk, int cmd, int *karg) -- cgit v1.2.3 From 895cedc1791916e8a98864f12b656702fad0bb67 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Mon, 3 Jul 2023 14:18:29 -0400 Subject: xprtrdma: Remap Receive buffers after a reconnect On server-initiated disconnect, rpcrdma_xprt_disconnect() was DMA- unmapping the Receive buffers, but rpcrdma_post_recvs() neglected to remap them after a new connection had been established. The result was immediate failure of the new connection with the Receives flushing with LOCAL_PROT_ERR. Fixes: 671c450b6fe0 ("xprtrdma: Fix oops in Receive handler after device removal") Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust --- net/sunrpc/xprtrdma/verbs.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c index b098fde373ab..28c0771c4e8c 100644 --- a/net/sunrpc/xprtrdma/verbs.c +++ b/net/sunrpc/xprtrdma/verbs.c @@ -935,9 +935,6 @@ struct rpcrdma_rep *rpcrdma_rep_create(struct rpcrdma_xprt *r_xprt, if (!rep->rr_rdmabuf) goto out_free; - if (!rpcrdma_regbuf_dma_map(r_xprt, rep->rr_rdmabuf)) - goto out_free_regbuf; - rep->rr_cid.ci_completion_id = atomic_inc_return(&r_xprt->rx_ep->re_completion_ids); @@ -956,8 +953,6 @@ struct rpcrdma_rep *rpcrdma_rep_create(struct rpcrdma_xprt *r_xprt, spin_unlock(&buf->rb_lock); return rep; -out_free_regbuf: - rpcrdma_regbuf_free(rep->rr_rdmabuf); out_free: kfree(rep); out: @@ -1363,6 +1358,10 @@ void rpcrdma_post_recvs(struct rpcrdma_xprt *r_xprt, int needed, bool temp) rep = rpcrdma_rep_create(r_xprt, temp); if (!rep) break; + if (!rpcrdma_regbuf_dma_map(r_xprt, rep->rr_rdmabuf)) { + rpcrdma_rep_put(buf, rep); + break; + } rep->rr_cid.ci_queue_id = ep->re_attr.recv_cq->res.id; trace_xprtrdma_post_recv(rep); -- cgit v1.2.3 From f534f6581ec084fe94d6759f7672bd009794b07e Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Fri, 18 Aug 2023 18:26:02 -0700 Subject: net: validate veth and vxcan peer ifindexes veth and vxcan need to make sure the ifindexes of the peer are not negative, core does not validate this. Using iproute2 with user-space-level checking removed: Before: # ./ip link add index 10 type veth peer index -1 # ip link show 1: lo: mtu 65536 qdisc noqueue state UNKNOWN mode DEFAULT group default qlen 1000 link/loopback 00:00:00:00:00:00 brd 00:00:00:00:00:00 2: enp1s0: mtu 1500 qdisc fq_codel state UP mode DEFAULT group default qlen 1000 link/ether 52:54:00:74:b2:03 brd ff:ff:ff:ff:ff:ff 10: veth1@veth0: mtu 1500 qdisc noop state DOWN mode DEFAULT group default qlen 1000 link/ether 8a:90:ff:57:6d:5d brd ff:ff:ff:ff:ff:ff -1: veth0@veth1: mtu 1500 qdisc noop state DOWN mode DEFAULT group default qlen 1000 link/ether ae:ed:18:e6:fa:7f brd ff:ff:ff:ff:ff:ff Now: $ ./ip link add index 10 type veth peer index -1 Error: ifindex can't be negative. This problem surfaced in net-next because an explicit WARN() was added, the root cause is older. Fixes: e6f8f1a739b6 ("veth: Allow to create peer link with given ifindex") Fixes: a8f820a380a2 ("can: add Virtual CAN Tunnel driver (vxcan)") Reported-by: syzbot+5ba06978f34abb058571@syzkaller.appspotmail.com Signed-off-by: Jakub Kicinski Reviewed-by: Eric Dumazet Signed-off-by: David S. Miller --- drivers/net/can/vxcan.c | 7 +------ drivers/net/veth.c | 5 +---- include/net/rtnetlink.h | 4 ++-- net/core/rtnetlink.c | 22 ++++++++++++++++++---- 4 files changed, 22 insertions(+), 16 deletions(-) (limited to 'net') diff --git a/drivers/net/can/vxcan.c b/drivers/net/can/vxcan.c index 4068d962203d..98c669ad5141 100644 --- a/drivers/net/can/vxcan.c +++ b/drivers/net/can/vxcan.c @@ -192,12 +192,7 @@ static int vxcan_newlink(struct net *net, struct net_device *dev, nla_peer = data[VXCAN_INFO_PEER]; ifmp = nla_data(nla_peer); - err = rtnl_nla_parse_ifla(peer_tb, - nla_data(nla_peer) + - sizeof(struct ifinfomsg), - nla_len(nla_peer) - - sizeof(struct ifinfomsg), - NULL); + err = rtnl_nla_parse_ifinfomsg(peer_tb, nla_peer, extack); if (err < 0) return err; diff --git a/drivers/net/veth.c b/drivers/net/veth.c index 509e901da41d..ef8eacb596f7 100644 --- a/drivers/net/veth.c +++ b/drivers/net/veth.c @@ -1861,10 +1861,7 @@ static int veth_newlink(struct net *src_net, struct net_device *dev, nla_peer = data[VETH_INFO_PEER]; ifmp = nla_data(nla_peer); - err = rtnl_nla_parse_ifla(peer_tb, - nla_data(nla_peer) + sizeof(struct ifinfomsg), - nla_len(nla_peer) - sizeof(struct ifinfomsg), - NULL); + err = rtnl_nla_parse_ifinfomsg(peer_tb, nla_peer, extack); if (err < 0) return err; diff --git a/include/net/rtnetlink.h b/include/net/rtnetlink.h index d9076a7a430c..6506221c5fe3 100644 --- a/include/net/rtnetlink.h +++ b/include/net/rtnetlink.h @@ -190,8 +190,8 @@ int rtnl_delete_link(struct net_device *dev, u32 portid, const struct nlmsghdr * int rtnl_configure_link(struct net_device *dev, const struct ifinfomsg *ifm, u32 portid, const struct nlmsghdr *nlh); -int rtnl_nla_parse_ifla(struct nlattr **tb, const struct nlattr *head, int len, - struct netlink_ext_ack *exterr); +int rtnl_nla_parse_ifinfomsg(struct nlattr **tb, const struct nlattr *nla_peer, + struct netlink_ext_ack *exterr); struct net *rtnl_get_net_ns_capable(struct sock *sk, int netnsid); #define MODULE_ALIAS_RTNL_LINK(kind) MODULE_ALIAS("rtnl-link-" kind) diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index aef25aa5cf1d..bcebdeb59163 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -2268,13 +2268,27 @@ out_err: return err; } -int rtnl_nla_parse_ifla(struct nlattr **tb, const struct nlattr *head, int len, - struct netlink_ext_ack *exterr) +int rtnl_nla_parse_ifinfomsg(struct nlattr **tb, const struct nlattr *nla_peer, + struct netlink_ext_ack *exterr) { - return nla_parse_deprecated(tb, IFLA_MAX, head, len, ifla_policy, + const struct ifinfomsg *ifmp; + const struct nlattr *attrs; + size_t len; + + ifmp = nla_data(nla_peer); + attrs = nla_data(nla_peer) + sizeof(struct ifinfomsg); + len = nla_len(nla_peer) - sizeof(struct ifinfomsg); + + if (ifmp->ifi_index < 0) { + NL_SET_ERR_MSG_ATTR(exterr, nla_peer, + "ifindex can't be negative"); + return -EINVAL; + } + + return nla_parse_deprecated(tb, IFLA_MAX, attrs, len, ifla_policy, exterr); } -EXPORT_SYMBOL(rtnl_nla_parse_ifla); +EXPORT_SYMBOL(rtnl_nla_parse_ifinfomsg); struct net *rtnl_link_get_net(struct net *src_net, struct nlattr *tb[]) { -- cgit v1.2.3 From f866fbc842de5976e41ba874b76ce31710b634b5 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sat, 19 Aug 2023 03:17:07 +0000 Subject: ipv4: fix data-races around inet->inet_id UDP sendmsg() is lockless, so ip_select_ident_segs() can very well be run from multiple cpus [1] Convert inet->inet_id to an atomic_t, but implement a dedicated path for TCP, avoiding cost of a locked instruction (atomic_add_return()) Note that this patch will cause a trivial merge conflict because we added inet->flags in net-next tree. v2: added missing change in drivers/net/ethernet/chelsio/inline_crypto/chtls/chtls_cm.c (David Ahern) [1] BUG: KCSAN: data-race in __ip_make_skb / __ip_make_skb read-write to 0xffff888145af952a of 2 bytes by task 7803 on cpu 1: ip_select_ident_segs include/net/ip.h:542 [inline] ip_select_ident include/net/ip.h:556 [inline] __ip_make_skb+0x844/0xc70 net/ipv4/ip_output.c:1446 ip_make_skb+0x233/0x2c0 net/ipv4/ip_output.c:1560 udp_sendmsg+0x1199/0x1250 net/ipv4/udp.c:1260 inet_sendmsg+0x63/0x80 net/ipv4/af_inet.c:830 sock_sendmsg_nosec net/socket.c:725 [inline] sock_sendmsg net/socket.c:748 [inline] ____sys_sendmsg+0x37c/0x4d0 net/socket.c:2494 ___sys_sendmsg net/socket.c:2548 [inline] __sys_sendmmsg+0x269/0x500 net/socket.c:2634 __do_sys_sendmmsg net/socket.c:2663 [inline] __se_sys_sendmmsg net/socket.c:2660 [inline] __x64_sys_sendmmsg+0x57/0x60 net/socket.c:2660 do_syscall_x64 arch/x86/entry/common.c:50 [inline] do_syscall_64+0x41/0xc0 arch/x86/entry/common.c:80 entry_SYSCALL_64_after_hwframe+0x63/0xcd read to 0xffff888145af952a of 2 bytes by task 7804 on cpu 0: ip_select_ident_segs include/net/ip.h:541 [inline] ip_select_ident include/net/ip.h:556 [inline] __ip_make_skb+0x817/0xc70 net/ipv4/ip_output.c:1446 ip_make_skb+0x233/0x2c0 net/ipv4/ip_output.c:1560 udp_sendmsg+0x1199/0x1250 net/ipv4/udp.c:1260 inet_sendmsg+0x63/0x80 net/ipv4/af_inet.c:830 sock_sendmsg_nosec net/socket.c:725 [inline] sock_sendmsg net/socket.c:748 [inline] ____sys_sendmsg+0x37c/0x4d0 net/socket.c:2494 ___sys_sendmsg net/socket.c:2548 [inline] __sys_sendmmsg+0x269/0x500 net/socket.c:2634 __do_sys_sendmmsg net/socket.c:2663 [inline] __se_sys_sendmmsg net/socket.c:2660 [inline] __x64_sys_sendmmsg+0x57/0x60 net/socket.c:2660 do_syscall_x64 arch/x86/entry/common.c:50 [inline] do_syscall_64+0x41/0xc0 arch/x86/entry/common.c:80 entry_SYSCALL_64_after_hwframe+0x63/0xcd value changed: 0x184d -> 0x184e Reported by Kernel Concurrency Sanitizer on: CPU: 0 PID: 7804 Comm: syz-executor.1 Not tainted 6.5.0-rc6-syzkaller #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 07/26/2023 ================================================================== Fixes: 23f57406b82d ("ipv4: avoid using shared IP generator for connected sockets") Reported-by: syzbot Signed-off-by: Eric Dumazet Reviewed-by: David Ahern Signed-off-by: David S. Miller --- .../net/ethernet/chelsio/inline_crypto/chtls/chtls_cm.c | 2 +- include/net/inet_sock.h | 2 +- include/net/ip.h | 15 +++++++++++++-- net/dccp/ipv4.c | 4 ++-- net/ipv4/af_inet.c | 2 +- net/ipv4/datagram.c | 2 +- net/ipv4/tcp_ipv4.c | 4 ++-- net/sctp/socket.c | 2 +- 8 files changed, 22 insertions(+), 11 deletions(-) (limited to 'net') diff --git a/drivers/net/ethernet/chelsio/inline_crypto/chtls/chtls_cm.c b/drivers/net/ethernet/chelsio/inline_crypto/chtls/chtls_cm.c index c2e7037c7ba1..7750702900fa 100644 --- a/drivers/net/ethernet/chelsio/inline_crypto/chtls/chtls_cm.c +++ b/drivers/net/ethernet/chelsio/inline_crypto/chtls/chtls_cm.c @@ -1466,7 +1466,7 @@ static void make_established(struct sock *sk, u32 snd_isn, unsigned int opt) tp->write_seq = snd_isn; tp->snd_nxt = snd_isn; tp->snd_una = snd_isn; - inet_sk(sk)->inet_id = get_random_u16(); + atomic_set(&inet_sk(sk)->inet_id, get_random_u16()); assign_rxopt(sk, opt); if (tp->rcv_wnd > (RCV_BUFSIZ_M << 10)) diff --git a/include/net/inet_sock.h b/include/net/inet_sock.h index 0bb32bfc6183..491ceb7ebe5d 100644 --- a/include/net/inet_sock.h +++ b/include/net/inet_sock.h @@ -222,8 +222,8 @@ struct inet_sock { __s16 uc_ttl; __u16 cmsg_flags; struct ip_options_rcu __rcu *inet_opt; + atomic_t inet_id; __be16 inet_sport; - __u16 inet_id; __u8 tos; __u8 min_ttl; diff --git a/include/net/ip.h b/include/net/ip.h index 332521170d9b..19adacd5ece0 100644 --- a/include/net/ip.h +++ b/include/net/ip.h @@ -538,8 +538,19 @@ static inline void ip_select_ident_segs(struct net *net, struct sk_buff *skb, * generator as much as we can. */ if (sk && inet_sk(sk)->inet_daddr) { - iph->id = htons(inet_sk(sk)->inet_id); - inet_sk(sk)->inet_id += segs; + int val; + + /* avoid atomic operations for TCP, + * as we hold socket lock at this point. + */ + if (sk_is_tcp(sk)) { + sock_owned_by_me(sk); + val = atomic_read(&inet_sk(sk)->inet_id); + atomic_set(&inet_sk(sk)->inet_id, val + segs); + } else { + val = atomic_add_return(segs, &inet_sk(sk)->inet_id); + } + iph->id = htons(val); return; } if ((iph->frag_off & htons(IP_DF)) && !skb->ignore_df) { diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c index fa8079303cb0..a545ad71201c 100644 --- a/net/dccp/ipv4.c +++ b/net/dccp/ipv4.c @@ -130,7 +130,7 @@ int dccp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) inet->inet_daddr, inet->inet_sport, inet->inet_dport); - inet->inet_id = get_random_u16(); + atomic_set(&inet->inet_id, get_random_u16()); err = dccp_connect(sk); rt = NULL; @@ -432,7 +432,7 @@ struct sock *dccp_v4_request_recv_sock(const struct sock *sk, RCU_INIT_POINTER(newinet->inet_opt, rcu_dereference(ireq->ireq_opt)); newinet->mc_index = inet_iif(skb); newinet->mc_ttl = ip_hdr(skb)->ttl; - newinet->inet_id = get_random_u16(); + atomic_set(&newinet->inet_id, get_random_u16()); if (dst == NULL && (dst = inet_csk_route_child_sock(sk, newsk, req)) == NULL) goto put_and_exit; diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index 9b2ca2fcc5a1..02736b83c303 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -340,7 +340,7 @@ lookup_protocol: else inet->pmtudisc = IP_PMTUDISC_WANT; - inet->inet_id = 0; + atomic_set(&inet->inet_id, 0); sock_init_data(sock, sk); diff --git a/net/ipv4/datagram.c b/net/ipv4/datagram.c index 4d1af0cd7d99..cb5dbee9e018 100644 --- a/net/ipv4/datagram.c +++ b/net/ipv4/datagram.c @@ -73,7 +73,7 @@ int __ip4_datagram_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len reuseport_has_conns_set(sk); sk->sk_state = TCP_ESTABLISHED; sk_set_txhash(sk); - inet->inet_id = get_random_u16(); + atomic_set(&inet->inet_id, get_random_u16()); sk_dst_set(sk, &rt->dst); err = 0; diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index a59cc4b83861..2dbdc26da86e 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -312,7 +312,7 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) inet->inet_daddr)); } - inet->inet_id = get_random_u16(); + atomic_set(&inet->inet_id, get_random_u16()); if (tcp_fastopen_defer_connect(sk, &err)) return err; @@ -1596,7 +1596,7 @@ struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb, inet_csk(newsk)->icsk_ext_hdr_len = 0; if (inet_opt) inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen; - newinet->inet_id = get_random_u16(); + atomic_set(&newinet->inet_id, get_random_u16()); /* Set ToS of the new socket based upon the value of incoming SYN. * ECT bits are set later in tcp_init_transfer(). diff --git a/net/sctp/socket.c b/net/sctp/socket.c index 6da738f60f4b..76f1bce49a8e 100644 --- a/net/sctp/socket.c +++ b/net/sctp/socket.c @@ -9479,7 +9479,7 @@ void sctp_copy_sock(struct sock *newsk, struct sock *sk, newinet->inet_rcv_saddr = inet->inet_rcv_saddr; newinet->inet_dport = htons(asoc->peer.port); newinet->pmtudisc = inet->pmtudisc; - newinet->inet_id = get_random_u16(); + atomic_set(&newinet->inet_id, get_random_u16()); newinet->uc_ttl = inet->uc_ttl; newinet->mc_loop = 1; -- cgit v1.2.3 From b98c16107cc1647242abbd11f234c05a3a5864f6 Mon Sep 17 00:00:00 2001 From: Ping-Ke Shih Date: Fri, 18 Aug 2023 09:40:04 +0800 Subject: wifi: mac80211: limit reorder_buf_filtered to avoid UBSAN warning The commit 06470f7468c8 ("mac80211: add API to allow filtering frames in BA sessions") added reorder_buf_filtered to mark frames filtered by firmware, and it can only work correctly if hw.max_rx_aggregation_subframes <= 64 since it stores the bitmap in a u64 variable. However, new HE or EHT devices can support BlockAck number up to 256 or 1024, and then using a higher subframe index leads UBSAN warning: UBSAN: shift-out-of-bounds in net/mac80211/rx.c:1129:39 shift exponent 215 is too large for 64-bit type 'long long unsigned int' Call Trace: dump_stack_lvl+0x48/0x70 dump_stack+0x10/0x20 __ubsan_handle_shift_out_of_bounds+0x1ac/0x360 ieee80211_release_reorder_frame.constprop.0.cold+0x64/0x69 [mac80211] ieee80211_sta_reorder_release+0x9c/0x400 [mac80211] ieee80211_prepare_and_rx_handle+0x1234/0x1420 [mac80211] ieee80211_rx_list+0xaef/0xf60 [mac80211] ieee80211_rx_napi+0x53/0xd0 [mac80211] Since only old hardware that supports <=64 BlockAck uses ieee80211_mark_rx_ba_filtered_frames(), limit the use as it is, so add a WARN_ONCE() and comment to note to avoid using this function if hardware capability is not suitable. Signed-off-by: Ping-Ke Shih Link: https://lore.kernel.org/r/20230818014004.16177-1-pkshih@realtek.com [edit commit message] Signed-off-by: Johannes Berg --- include/net/mac80211.h | 1 + net/mac80211/rx.c | 12 ++++++++++-- 2 files changed, 11 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/include/net/mac80211.h b/include/net/mac80211.h index 3a8a2d2c58c3..2a55ae932c56 100644 --- a/include/net/mac80211.h +++ b/include/net/mac80211.h @@ -6612,6 +6612,7 @@ void ieee80211_stop_rx_ba_session(struct ieee80211_vif *vif, u16 ba_rx_bitmap, * marks frames marked in the bitmap as having been filtered. Afterwards, it * checks if any frames in the window starting from @ssn can now be released * (in case they were only waiting for frames that were filtered.) + * (Only work correctly if @max_rx_aggregation_subframes <= 64 frames) */ void ieee80211_mark_rx_ba_filtered_frames(struct ieee80211_sta *pubsta, u8 tid, u16 ssn, u64 filtered, diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c index 4f707d2a160f..0af2599c17e8 100644 --- a/net/mac80211/rx.c +++ b/net/mac80211/rx.c @@ -1083,7 +1083,8 @@ static inline bool ieee80211_rx_reorder_ready(struct tid_ampdu_rx *tid_agg_rx, struct sk_buff *tail = skb_peek_tail(frames); struct ieee80211_rx_status *status; - if (tid_agg_rx->reorder_buf_filtered & BIT_ULL(index)) + if (tid_agg_rx->reorder_buf_filtered && + tid_agg_rx->reorder_buf_filtered & BIT_ULL(index)) return true; if (!tail) @@ -1124,7 +1125,8 @@ static void ieee80211_release_reorder_frame(struct ieee80211_sub_if_data *sdata, } no_frame: - tid_agg_rx->reorder_buf_filtered &= ~BIT_ULL(index); + if (tid_agg_rx->reorder_buf_filtered) + tid_agg_rx->reorder_buf_filtered &= ~BIT_ULL(index); tid_agg_rx->head_seq_num = ieee80211_sn_inc(tid_agg_rx->head_seq_num); } @@ -4264,6 +4266,7 @@ void ieee80211_mark_rx_ba_filtered_frames(struct ieee80211_sta *pubsta, u8 tid, u16 ssn, u64 filtered, u16 received_mpdus) { + struct ieee80211_local *local; struct sta_info *sta; struct tid_ampdu_rx *tid_agg_rx; struct sk_buff_head frames; @@ -4281,6 +4284,11 @@ void ieee80211_mark_rx_ba_filtered_frames(struct ieee80211_sta *pubsta, u8 tid, sta = container_of(pubsta, struct sta_info, sta); + local = sta->sdata->local; + WARN_ONCE(local->hw.max_rx_aggregation_subframes > 64, + "RX BA marker can't support max_rx_aggregation_subframes %u > 64\n", + local->hw.max_rx_aggregation_subframes); + if (!ieee80211_rx_data_set_sta(&rx, sta, -1)) return; -- cgit v1.2.3 From 0bfe71159230bab79ee230225ae12ffecbb69f3e Mon Sep 17 00:00:00 2001 From: Oliver Hartkopp Date: Mon, 21 Aug 2023 16:45:46 +0200 Subject: can: isotp: fix support for transmission of SF without flow control The original implementation had a very simple handling for single frame transmissions as it just sent the single frame without a timeout handling. With the new echo frame handling the echo frame was also introduced for single frames but the former exception ('simple without timers') has been maintained by accident. This leads to a 1 second timeout when closing the socket and to an -ECOMM error when CAN_ISOTP_WAIT_TX_DONE is selected. As the echo handling is always active (also for single frames) remove the wrong extra condition for single frames. Fixes: 9f39d36530e5 ("can: isotp: add support for transmission without flow control") Signed-off-by: Oliver Hartkopp Link: https://lore.kernel.org/r/20230821144547.6658-2-socketcan@hartkopp.net Signed-off-by: Jakub Kicinski --- net/can/isotp.c | 22 +++++++--------------- 1 file changed, 7 insertions(+), 15 deletions(-) (limited to 'net') diff --git a/net/can/isotp.c b/net/can/isotp.c index 99770ed28531..f02b5d3e4733 100644 --- a/net/can/isotp.c +++ b/net/can/isotp.c @@ -188,12 +188,6 @@ static bool isotp_register_rxid(struct isotp_sock *so) return (isotp_bc_flags(so) == 0); } -static bool isotp_register_txecho(struct isotp_sock *so) -{ - /* all modes but SF_BROADCAST register for tx echo skbs */ - return (isotp_bc_flags(so) != CAN_ISOTP_SF_BROADCAST); -} - static enum hrtimer_restart isotp_rx_timer_handler(struct hrtimer *hrtimer) { struct isotp_sock *so = container_of(hrtimer, struct isotp_sock, @@ -1209,7 +1203,7 @@ static int isotp_release(struct socket *sock) lock_sock(sk); /* remove current filters & unregister */ - if (so->bound && isotp_register_txecho(so)) { + if (so->bound) { if (so->ifindex) { struct net_device *dev; @@ -1332,14 +1326,12 @@ static int isotp_bind(struct socket *sock, struct sockaddr *uaddr, int len) can_rx_register(net, dev, rx_id, SINGLE_MASK(rx_id), isotp_rcv, sk, "isotp", sk); - if (isotp_register_txecho(so)) { - /* no consecutive frame echo skb in flight */ - so->cfecho = 0; + /* no consecutive frame echo skb in flight */ + so->cfecho = 0; - /* register for echo skb's */ - can_rx_register(net, dev, tx_id, SINGLE_MASK(tx_id), - isotp_rcv_echo, sk, "isotpe", sk); - } + /* register for echo skb's */ + can_rx_register(net, dev, tx_id, SINGLE_MASK(tx_id), + isotp_rcv_echo, sk, "isotpe", sk); dev_put(dev); @@ -1560,7 +1552,7 @@ static void isotp_notify(struct isotp_sock *so, unsigned long msg, case NETDEV_UNREGISTER: lock_sock(sk); /* remove current filters & unregister */ - if (so->bound && isotp_register_txecho(so)) { + if (so->bound) { if (isotp_register_rxid(so)) can_rx_unregister(dev_net(dev), dev, so->rxid, SINGLE_MASK(so->rxid), -- cgit v1.2.3 From c275a176e4b69868576e543409927ae75e3a3288 Mon Sep 17 00:00:00 2001 From: Oliver Hartkopp Date: Mon, 21 Aug 2023 16:45:47 +0200 Subject: can: raw: add missing refcount for memory leak fix Commit ee8b94c8510c ("can: raw: fix receiver memory leak") introduced a new reference to the CAN netdevice that has assigned CAN filters. But this new ro->dev reference did not maintain its own refcount which lead to another KASAN use-after-free splat found by Eric Dumazet. This patch ensures a proper refcount for the CAN nedevice. Fixes: ee8b94c8510c ("can: raw: fix receiver memory leak") Reported-by: Eric Dumazet Cc: Ziyang Xuan Signed-off-by: Oliver Hartkopp Link: https://lore.kernel.org/r/20230821144547.6658-3-socketcan@hartkopp.net Signed-off-by: Jakub Kicinski --- net/can/raw.c | 35 ++++++++++++++++++++++++++--------- 1 file changed, 26 insertions(+), 9 deletions(-) (limited to 'net') diff --git a/net/can/raw.c b/net/can/raw.c index e10f59375659..d50c3f3d892f 100644 --- a/net/can/raw.c +++ b/net/can/raw.c @@ -85,6 +85,7 @@ struct raw_sock { int bound; int ifindex; struct net_device *dev; + netdevice_tracker dev_tracker; struct list_head notifier; int loopback; int recv_own_msgs; @@ -285,8 +286,10 @@ static void raw_notify(struct raw_sock *ro, unsigned long msg, case NETDEV_UNREGISTER: lock_sock(sk); /* remove current filters & unregister */ - if (ro->bound) + if (ro->bound) { raw_disable_allfilters(dev_net(dev), dev, sk); + netdev_put(dev, &ro->dev_tracker); + } if (ro->count > 1) kfree(ro->filter); @@ -391,10 +394,12 @@ static int raw_release(struct socket *sock) /* remove current filters & unregister */ if (ro->bound) { - if (ro->dev) + if (ro->dev) { raw_disable_allfilters(dev_net(ro->dev), ro->dev, sk); - else + netdev_put(ro->dev, &ro->dev_tracker); + } else { raw_disable_allfilters(sock_net(sk), NULL, sk); + } } if (ro->count > 1) @@ -445,10 +450,10 @@ static int raw_bind(struct socket *sock, struct sockaddr *uaddr, int len) goto out; } if (dev->type != ARPHRD_CAN) { - dev_put(dev); err = -ENODEV; - goto out; + goto out_put_dev; } + if (!(dev->flags & IFF_UP)) notify_enetdown = 1; @@ -456,7 +461,9 @@ static int raw_bind(struct socket *sock, struct sockaddr *uaddr, int len) /* filters set by default/setsockopt */ err = raw_enable_allfilters(sock_net(sk), dev, sk); - dev_put(dev); + if (err) + goto out_put_dev; + } else { ifindex = 0; @@ -467,18 +474,28 @@ static int raw_bind(struct socket *sock, struct sockaddr *uaddr, int len) if (!err) { if (ro->bound) { /* unregister old filters */ - if (ro->dev) + if (ro->dev) { raw_disable_allfilters(dev_net(ro->dev), ro->dev, sk); - else + /* drop reference to old ro->dev */ + netdev_put(ro->dev, &ro->dev_tracker); + } else { raw_disable_allfilters(sock_net(sk), NULL, sk); + } } ro->ifindex = ifindex; ro->bound = 1; + /* bind() ok -> hold a reference for new ro->dev */ ro->dev = dev; + if (ro->dev) + netdev_hold(ro->dev, &ro->dev_tracker, GFP_KERNEL); } - out: +out_put_dev: + /* remove potential reference from dev_get_by_index() */ + if (dev) + dev_put(dev); +out: release_sock(sk); rtnl_unlock(); -- cgit v1.2.3 From 987aae75fc1041072941ffb622b45ce2359a99b9 Mon Sep 17 00:00:00 2001 From: Sven Eckelmann Date: Mon, 21 Aug 2023 21:48:48 +0200 Subject: batman-adv: Hold rtnl lock during MTU update via netlink The automatic recalculation of the maximum allowed MTU is usually triggered by code sections which are already rtnl lock protected by callers outside of batman-adv. But when the fragmentation setting is changed via batman-adv's own batadv genl family, then the rtnl lock is not yet taken. But dev_set_mtu requires that the caller holds the rtnl lock because it uses netdevice notifiers. And this code will then fail the check for this lock: RTNL: assertion failed at net/core/dev.c (1953) Cc: stable@vger.kernel.org Reported-by: syzbot+f8812454d9b3ac00d282@syzkaller.appspotmail.com Fixes: c6a953cce8d0 ("batman-adv: Trigger events for auto adjusted MTU") Signed-off-by: Sven Eckelmann Reviewed-by: Simon Horman Link: https://lore.kernel.org/r/20230821-batadv-missing-mtu-rtnl-lock-v1-1-1c5a7bfe861e@narfation.org Signed-off-by: Jakub Kicinski --- net/batman-adv/netlink.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'net') diff --git a/net/batman-adv/netlink.c b/net/batman-adv/netlink.c index ad5714f737be..6efbc9275aec 100644 --- a/net/batman-adv/netlink.c +++ b/net/batman-adv/netlink.c @@ -495,7 +495,10 @@ static int batadv_netlink_set_mesh(struct sk_buff *skb, struct genl_info *info) attr = info->attrs[BATADV_ATTR_FRAGMENTATION_ENABLED]; atomic_set(&bat_priv->fragmentation, !!nla_get_u8(attr)); + + rtnl_lock(); batadv_update_min_mtu(bat_priv->soft_iface); + rtnl_unlock(); } if (info->attrs[BATADV_ATTR_GW_BANDWIDTH_DOWN]) { -- cgit v1.2.3 From da71714e359b64bd7aab3bd56ec53f307f058133 Mon Sep 17 00:00:00 2001 From: Jamal Hadi Salim Date: Tue, 22 Aug 2023 06:12:31 -0400 Subject: net/sched: fix a qdisc modification with ambiguous command request When replacing an existing root qdisc, with one that is of the same kind, the request boils down to essentially a parameterization change i.e not one that requires allocation and grafting of a new qdisc. syzbot was able to create a scenario which resulted in a taprio qdisc replacing an existing taprio qdisc with a combination of NLM_F_CREATE, NLM_F_REPLACE and NLM_F_EXCL leading to create and graft scenario. The fix ensures that only when the qdisc kinds are different that we should allow a create and graft, otherwise it goes into the "change" codepath. While at it, fix the code and comments to improve readability. While syzbot was able to create the issue, it did not zone on the root cause. Analysis from Vladimir Oltean helped narrow it down. v1->V2 changes: - remove "inline" function definition (Vladmir) - remove extrenous braces in branches (Vladmir) - change inline function names (Pedro) - Run tdc tests (Victor) v2->v3 changes: - dont break else/if (Simon) Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Reported-by: syzbot+a3618a167af2021433cd@syzkaller.appspotmail.com Closes: https://lore.kernel.org/netdev/20230816225759.g25x76kmgzya2gei@skbuf/T/ Tested-by: Vladimir Oltean Tested-by: Victor Nogueira Reviewed-by: Pedro Tammela Reviewed-by: Victor Nogueira Signed-off-by: Jamal Hadi Salim Signed-off-by: David S. Miller --- net/sched/sch_api.c | 53 ++++++++++++++++++++++++++++++++++++++++------------- 1 file changed, 40 insertions(+), 13 deletions(-) (limited to 'net') diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c index aa6b1fe65151..e9eaf637220e 100644 --- a/net/sched/sch_api.c +++ b/net/sched/sch_api.c @@ -1547,10 +1547,28 @@ static int tc_get_qdisc(struct sk_buff *skb, struct nlmsghdr *n, return 0; } +static bool req_create_or_replace(struct nlmsghdr *n) +{ + return (n->nlmsg_flags & NLM_F_CREATE && + n->nlmsg_flags & NLM_F_REPLACE); +} + +static bool req_create_exclusive(struct nlmsghdr *n) +{ + return (n->nlmsg_flags & NLM_F_CREATE && + n->nlmsg_flags & NLM_F_EXCL); +} + +static bool req_change(struct nlmsghdr *n) +{ + return (!(n->nlmsg_flags & NLM_F_CREATE) && + !(n->nlmsg_flags & NLM_F_REPLACE) && + !(n->nlmsg_flags & NLM_F_EXCL)); +} + /* * Create/change qdisc. */ - static int tc_modify_qdisc(struct sk_buff *skb, struct nlmsghdr *n, struct netlink_ext_ack *extack) { @@ -1644,27 +1662,35 @@ replay: * * We know, that some child q is already * attached to this parent and have choice: - * either to change it or to create/graft new one. + * 1) change it or 2) create/graft new one. + * If the requested qdisc kind is different + * than the existing one, then we choose graft. + * If they are the same then this is "change" + * operation - just let it fallthrough.. * * 1. We are allowed to create/graft only - * if CREATE and REPLACE flags are set. + * if the request is explicitly stating + * "please create if it doesn't exist". * - * 2. If EXCL is set, requestor wanted to say, - * that qdisc tcm_handle is not expected + * 2. If the request is to exclusive create + * then the qdisc tcm_handle is not expected * to exist, so that we choose create/graft too. * * 3. The last case is when no flags are set. + * This will happen when for example tc + * utility issues a "change" command. * Alas, it is sort of hole in API, we * cannot decide what to do unambiguously. - * For now we select create/graft, if - * user gave KIND, which does not match existing. + * For now we select create/graft. */ - if ((n->nlmsg_flags & NLM_F_CREATE) && - (n->nlmsg_flags & NLM_F_REPLACE) && - ((n->nlmsg_flags & NLM_F_EXCL) || - (tca[TCA_KIND] && - nla_strcmp(tca[TCA_KIND], q->ops->id)))) - goto create_n_graft; + if (tca[TCA_KIND] && + nla_strcmp(tca[TCA_KIND], q->ops->id)) { + if (req_create_or_replace(n) || + req_create_exclusive(n)) + goto create_n_graft; + else if (req_change(n)) + goto create_n_graft2; + } } } } else { @@ -1698,6 +1724,7 @@ create_n_graft: NL_SET_ERR_MSG(extack, "Qdisc not found. To create specify NLM_F_CREATE flag"); return -ENOENT; } +create_n_graft2: if (clid == TC_H_INGRESS) { if (dev_ingress_queue(dev)) { q = qdisc_create(dev, dev_ingress_queue(dev), -- cgit v1.2.3 From 4b80ced971b0d118f9a11dd503a5833a5016de92 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Thu, 17 Aug 2023 20:28:32 +0200 Subject: netfilter: nf_tables: validate all pending tables We have to validate all tables in the transaction that are in VALIDATE_DO state, the blamed commit below did not move the break statement to its right location so we only validate one table. Moreover, we can't init table->validate to _SKIP when a table object is allocated. If we do, then if a transcaction creates a new table and then fails the transaction, nfnetlink will loop and nft will hang until user cancels the command. Add back the pernet state as a place to stash the last state encountered. This is either _DO (we hit an error during commit validation) or _SKIP (transaction passed all checks). Fixes: 00c320f9b755 ("netfilter: nf_tables: make validation state per table") Reported-by: Pablo Neira Ayuso Signed-off-by: Florian Westphal --- include/net/netfilter/nf_tables.h | 1 + net/netfilter/nf_tables_api.c | 11 +++++++---- 2 files changed, 8 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index e9ae567c037d..ffcbdf08380f 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -1729,6 +1729,7 @@ struct nftables_pernet { u64 table_handle; unsigned int base_seq; unsigned int gc_seq; + u8 validate_state; }; extern unsigned int nf_tables_net_id; diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 3e841e45f2c0..a76a62ebe9c9 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -1373,7 +1373,7 @@ static int nf_tables_newtable(struct sk_buff *skb, const struct nfnl_info *info, if (table == NULL) goto err_kzalloc; - table->validate_state = NFT_VALIDATE_SKIP; + table->validate_state = nft_net->validate_state; table->name = nla_strdup(attr, GFP_KERNEL_ACCOUNT); if (table->name == NULL) goto err_strdup; @@ -9051,9 +9051,8 @@ static int nf_tables_validate(struct net *net) return -EAGAIN; nft_validate_state_update(table, NFT_VALIDATE_SKIP); + break; } - - break; } return 0; @@ -9799,8 +9798,10 @@ static int nf_tables_commit(struct net *net, struct sk_buff *skb) } /* 0. Validate ruleset, otherwise roll back for error reporting. */ - if (nf_tables_validate(net) < 0) + if (nf_tables_validate(net) < 0) { + nft_net->validate_state = NFT_VALIDATE_DO; return -EAGAIN; + } err = nft_flow_rule_offload_commit(net); if (err < 0) @@ -10059,6 +10060,7 @@ static int nf_tables_commit(struct net *net, struct sk_buff *skb) nf_tables_commit_audit_log(&adl, nft_net->base_seq); nft_gc_seq_end(nft_net, gc_seq); + nft_net->validate_state = NFT_VALIDATE_SKIP; nf_tables_commit_release(net); return 0; @@ -11115,6 +11117,7 @@ static int __net_init nf_tables_init_net(struct net *net) mutex_init(&nft_net->commit_mutex); nft_net->base_seq = 1; nft_net->gc_seq = 0; + nft_net->validate_state = NFT_VALIDATE_SKIP; return 0; } -- cgit v1.2.3 From 2c9f0293280e258606e54ed2b96fa71498432eae Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Fri, 18 Aug 2023 01:13:31 +0200 Subject: netfilter: nf_tables: flush pending destroy work before netlink notifier Destroy work waits for the RCU grace period then it releases the objects with no mutex held. All releases objects follow this path for transactions, therefore, order is guaranteed and references to top-level objects in the hierarchy remain valid. However, netlink notifier might interfer with pending destroy work. rcu_barrier() is not correct because objects are not release via RCU callback. Flush destroy work before releasing objects from netlink notifier path. Fixes: d4bc8271db21 ("netfilter: nf_tables: netlink notifier might race to release objects") Signed-off-by: Pablo Neira Ayuso Signed-off-by: Florian Westphal --- net/netfilter/nf_tables_api.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index a76a62ebe9c9..d299e7aa1b96 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -11073,7 +11073,7 @@ static int nft_rcv_nl_event(struct notifier_block *this, unsigned long event, gc_seq = nft_gc_seq_begin(nft_net); if (!list_empty(&nf_tables_destroy_list)) - rcu_barrier(); + nf_tables_trans_destroy_flush_work(); again: list_for_each_entry(table, &nft_net->tables, list) { if (nft_table_has_owner(table) && -- cgit v1.2.3 From 720344340fb9be2765bbaab7b292ece0a4570eae Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Fri, 18 Aug 2023 01:13:52 +0200 Subject: netfilter: nf_tables: GC transaction race with abort path Abort path is missing a synchronization point with GC transactions. Add GC sequence number hence any GC transaction losing race will be discarded. Fixes: 5f68718b34a5 ("netfilter: nf_tables: GC transaction API to avoid race with control plane") Signed-off-by: Pablo Neira Ayuso Signed-off-by: Florian Westphal --- net/netfilter/nf_tables_api.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index d299e7aa1b96..a255456efae4 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -10337,8 +10337,12 @@ static int nf_tables_abort(struct net *net, struct sk_buff *skb, enum nfnl_abort_action action) { struct nftables_pernet *nft_net = nft_pernet(net); - int ret = __nf_tables_abort(net, action); + unsigned int gc_seq; + int ret; + gc_seq = nft_gc_seq_begin(nft_net); + ret = __nf_tables_abort(net, action); + nft_gc_seq_end(nft_net, gc_seq); mutex_unlock(&nft_net->commit_mutex); return ret; -- cgit v1.2.3 From 8357bc946a2abc2a10ca40e5a2105d2b4c57515e Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Mon, 21 Aug 2023 14:33:32 +0200 Subject: netfilter: nf_tables: use correct lock to protect gc_list Use nf_tables_gc_list_lock spinlock, not nf_tables_destroy_list_lock to protect the gc list. Fixes: 5f68718b34a5 ("netfilter: nf_tables: GC transaction API to avoid race with control plane") Signed-off-by: Pablo Neira Ayuso Signed-off-by: Florian Westphal --- net/netfilter/nf_tables_api.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index a255456efae4..eb8b1167dced 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -9456,9 +9456,9 @@ static void nft_trans_gc_work(struct work_struct *work) struct nft_trans_gc *trans, *next; LIST_HEAD(trans_gc_list); - spin_lock(&nf_tables_destroy_list_lock); + spin_lock(&nf_tables_gc_list_lock); list_splice_init(&nf_tables_gc_list, &trans_gc_list); - spin_unlock(&nf_tables_destroy_list_lock); + spin_unlock(&nf_tables_gc_list_lock); list_for_each_entry_safe(trans, next, &trans_gc_list, list) { list_del(&trans->list); -- cgit v1.2.3 From 5e1be4cdc98c989d5387ce94ff15b5ad06a5b681 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Tue, 22 Aug 2023 19:49:52 +0200 Subject: netfilter: nf_tables: fix out of memory error handling Several instances of pipapo_resize() don't propagate allocation failures, this causes a crash when fault injection is enabled for gfp_kernel slabs. Fixes: 3c4287f62044 ("nf_tables: Add set type for arbitrary concatenation of ranges") Signed-off-by: Florian Westphal Reviewed-by: Stefano Brivio --- net/netfilter/nft_set_pipapo.c | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/netfilter/nft_set_pipapo.c b/net/netfilter/nft_set_pipapo.c index 3757fcc55723..6af9c9ed4b5c 100644 --- a/net/netfilter/nft_set_pipapo.c +++ b/net/netfilter/nft_set_pipapo.c @@ -902,12 +902,14 @@ static void pipapo_lt_bits_adjust(struct nft_pipapo_field *f) static int pipapo_insert(struct nft_pipapo_field *f, const uint8_t *k, int mask_bits) { - int rule = f->rules++, group, ret, bit_offset = 0; + int rule = f->rules, group, ret, bit_offset = 0; - ret = pipapo_resize(f, f->rules - 1, f->rules); + ret = pipapo_resize(f, f->rules, f->rules + 1); if (ret) return ret; + f->rules++; + for (group = 0; group < f->groups; group++) { int i, v; u8 mask; @@ -1052,7 +1054,9 @@ static int pipapo_expand(struct nft_pipapo_field *f, step++; if (step >= len) { if (!masks) { - pipapo_insert(f, base, 0); + err = pipapo_insert(f, base, 0); + if (err < 0) + return err; masks = 1; } goto out; @@ -1235,6 +1239,9 @@ static int nft_pipapo_insert(const struct net *net, const struct nft_set *set, else ret = pipapo_expand(f, start, end, f->groups * f->bb); + if (ret < 0) + return ret; + if (f->bsize > bsize_max) bsize_max = f->bsize; -- cgit v1.2.3 From 8e51830e29e12670b4c10df070a4ea4c9593e961 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Tue, 22 Aug 2023 22:03:57 +0200 Subject: netfilter: nf_tables: defer gc run if previous batch is still pending Don't queue more gc work, else we may queue the same elements multiple times. If an element is flagged as dead, this can mean that either the previous gc request was invalidated/discarded by a transaction or that the previous request is still pending in the system work queue. The latter will happen if the gc interval is set to a very low value, e.g. 1ms, and system work queue is backlogged. The sets refcount is 1 if no previous gc requeusts are queued, so add a helper for this and skip gc run if old requests are pending. Add a helper for this and skip the gc run in this case. Fixes: f6c383b8c31a ("netfilter: nf_tables: adapt set backend to use GC transaction API") Signed-off-by: Florian Westphal Reviewed-by: Pablo Neira Ayuso --- include/net/netfilter/nf_tables.h | 5 +++++ net/netfilter/nft_set_hash.c | 3 +++ net/netfilter/nft_set_rbtree.c | 3 +++ 3 files changed, 11 insertions(+) (limited to 'net') diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index ffcbdf08380f..dd40c75011d2 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -587,6 +587,11 @@ static inline void *nft_set_priv(const struct nft_set *set) return (void *)set->data; } +static inline bool nft_set_gc_is_pending(const struct nft_set *s) +{ + return refcount_read(&s->refs) != 1; +} + static inline struct nft_set *nft_set_container_of(const void *priv) { return (void *)priv - offsetof(struct nft_set, data); diff --git a/net/netfilter/nft_set_hash.c b/net/netfilter/nft_set_hash.c index cef5df846000..524763659f25 100644 --- a/net/netfilter/nft_set_hash.c +++ b/net/netfilter/nft_set_hash.c @@ -326,6 +326,9 @@ static void nft_rhash_gc(struct work_struct *work) nft_net = nft_pernet(net); gc_seq = READ_ONCE(nft_net->gc_seq); + if (nft_set_gc_is_pending(set)) + goto done; + gc = nft_trans_gc_alloc(set, gc_seq, GFP_KERNEL); if (!gc) goto done; diff --git a/net/netfilter/nft_set_rbtree.c b/net/netfilter/nft_set_rbtree.c index f9d4c8fcbbf8..c6435e709231 100644 --- a/net/netfilter/nft_set_rbtree.c +++ b/net/netfilter/nft_set_rbtree.c @@ -611,6 +611,9 @@ static void nft_rbtree_gc(struct work_struct *work) nft_net = nft_pernet(net); gc_seq = READ_ONCE(nft_net->gc_seq); + if (nft_set_gc_is_pending(set)) + goto done; + gc = nft_trans_gc_alloc(set, gc_seq, GFP_KERNEL); if (!gc) goto done; -- cgit v1.2.3 From 30188bd7838c16a98a520db1fe9df01ffc6ed368 Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Wed, 23 Aug 2023 09:43:48 +0300 Subject: rtnetlink: Reject negative ifindexes in RTM_NEWLINK Negative ifindexes are illegal, but the kernel does not validate the ifindex in the ancillary header of RTM_NEWLINK messages, resulting in the kernel generating a warning [1] when such an ifindex is specified. Fix by rejecting negative ifindexes. [1] WARNING: CPU: 0 PID: 5031 at net/core/dev.c:9593 dev_index_reserve+0x1a2/0x1c0 net/core/dev.c:9593 [...] Call Trace: register_netdevice+0x69a/0x1490 net/core/dev.c:10081 br_dev_newlink+0x27/0x110 net/bridge/br_netlink.c:1552 rtnl_newlink_create net/core/rtnetlink.c:3471 [inline] __rtnl_newlink+0x115e/0x18c0 net/core/rtnetlink.c:3688 rtnl_newlink+0x67/0xa0 net/core/rtnetlink.c:3701 rtnetlink_rcv_msg+0x439/0xd30 net/core/rtnetlink.c:6427 netlink_rcv_skb+0x16b/0x440 net/netlink/af_netlink.c:2545 netlink_unicast_kernel net/netlink/af_netlink.c:1342 [inline] netlink_unicast+0x536/0x810 net/netlink/af_netlink.c:1368 netlink_sendmsg+0x93c/0xe40 net/netlink/af_netlink.c:1910 sock_sendmsg_nosec net/socket.c:728 [inline] sock_sendmsg+0xd9/0x180 net/socket.c:751 ____sys_sendmsg+0x6ac/0x940 net/socket.c:2538 ___sys_sendmsg+0x135/0x1d0 net/socket.c:2592 __sys_sendmsg+0x117/0x1e0 net/socket.c:2621 do_syscall_x64 arch/x86/entry/common.c:50 [inline] do_syscall_64+0x38/0xb0 arch/x86/entry/common.c:80 entry_SYSCALL_64_after_hwframe+0x63/0xcd Fixes: 38f7b870d4a6 ("[RTNETLINK]: Link creation API") Reported-by: syzbot+5ba06978f34abb058571@syzkaller.appspotmail.com Signed-off-by: Ido Schimmel Reviewed-by: Jiri Pirko Reviewed-by: Jakub Kicinski Link: https://lore.kernel.org/r/20230823064348.2252280-1-idosch@nvidia.com Signed-off-by: Paolo Abeni --- net/core/rtnetlink.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'net') diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index bcebdeb59163..00c94d9622b4 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -3561,6 +3561,9 @@ replay: if (ifm->ifi_index > 0) { link_specified = true; dev = __dev_get_by_index(net, ifm->ifi_index); + } else if (ifm->ifi_index < 0) { + NL_SET_ERR_MSG(extack, "ifindex can't be negative"); + return -EINVAL; } else if (tb[IFLA_IFNAME] || tb[IFLA_ALT_IFNAME]) { link_specified = true; dev = rtnl_dev_get(net, tb); -- cgit v1.2.3