From fa3fbe64037839f448dc569212bafc5a495d8219 Mon Sep 17 00:00:00 2001 From: Deren Wu Date: Tue, 2 Aug 2022 23:15:07 +0800 Subject: wifi: mt76: mt7921e: fix crash in chip reset fail In case of drv own fail in reset, we may need to run mac_reset several times. The sequence would trigger system crash as the log below. Because we do not re-enable/schedule "tx_napi" before disable it again, the process would keep waiting for state change in napi_diable(). To avoid the problem and keep status synchronize for each run, goto final resource handling if drv own failed. [ 5857.353423] mt7921e 0000:3b:00.0: driver own failed [ 5858.433427] mt7921e 0000:3b:00.0: Timeout for driver own [ 5859.633430] mt7921e 0000:3b:00.0: driver own failed [ 5859.633444] ------------[ cut here ]------------ [ 5859.633446] WARNING: CPU: 6 at kernel/kthread.c:659 kthread_park+0x11d [ 5859.633717] Workqueue: mt76 mt7921_mac_reset_work [mt7921_common] [ 5859.633728] RIP: 0010:kthread_park+0x11d/0x150 [ 5859.633736] RSP: 0018:ffff8881b676fc68 EFLAGS: 00010202 ...... [ 5859.633766] Call Trace: [ 5859.633768] [ 5859.633771] mt7921e_mac_reset+0x176/0x6f0 [mt7921e] [ 5859.633778] mt7921_mac_reset_work+0x184/0x3a0 [mt7921_common] [ 5859.633785] ? mt7921_mac_set_timing+0x520/0x520 [mt7921_common] [ 5859.633794] ? __kasan_check_read+0x11/0x20 [ 5859.633802] process_one_work+0x7ee/0x1320 [ 5859.633810] worker_thread+0x53c/0x1240 [ 5859.633818] kthread+0x2b8/0x370 [ 5859.633824] ? process_one_work+0x1320/0x1320 [ 5859.633828] ? kthread_complete_and_exit+0x30/0x30 [ 5859.633834] ret_from_fork+0x1f/0x30 [ 5859.633842] Cc: stable@vger.kernel.org Fixes: 0efaf31dec57 ("mt76: mt7921: fix MT7921E reset failure") Signed-off-by: Deren Wu Link: https://lore.kernel.org/r/727eb5ffd3c7c805245e512da150ecf0a7154020.1659452909.git.deren.wu@mediatek.com Signed-off-by: Johannes Berg --- drivers/net/wireless/mediatek/mt76/mt7921/pci_mac.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/wireless/mediatek/mt76/mt7921/pci_mac.c b/drivers/net/wireless/mediatek/mt76/mt7921/pci_mac.c index e1800674089a..576a0149251b 100644 --- a/drivers/net/wireless/mediatek/mt76/mt7921/pci_mac.c +++ b/drivers/net/wireless/mediatek/mt76/mt7921/pci_mac.c @@ -261,7 +261,7 @@ int mt7921e_mac_reset(struct mt7921_dev *dev) err = mt7921e_driver_own(dev); if (err) - return err; + goto out; err = mt7921_run_firmware(dev); if (err) -- cgit v1.2.3 From 40b717bfcefab28a0656b8caa5e43d5449e5a671 Mon Sep 17 00:00:00 2001 From: "Ajay.Kathat@microchip.com" Date: Tue, 9 Aug 2022 07:57:56 +0000 Subject: wifi: wilc1000: fix DMA on stack objects Sometimes 'wilc_sdio_cmd53' is called with addresses pointing to an object on the stack. Use dynamically allocated memory for cmd53 instead of stack address which is not DMA'able. Fixes: 5625f965d764 ("wilc1000: move wilc driver out of staging") Reported-by: Michael Walle Suggested-by: Michael Walle Signed-off-by: Ajay Singh Reviewed-by: Michael Walle Tested-by: Michael Walle Signed-off-by: Kalle Valo Link: https://lore.kernel.org/r/20220809075749.62752-1-ajay.kathat@microchip.com --- drivers/net/wireless/microchip/wilc1000/netdev.h | 1 + drivers/net/wireless/microchip/wilc1000/sdio.c | 39 ++++++++++++++++++++---- drivers/net/wireless/microchip/wilc1000/wlan.c | 15 +++++++-- 3 files changed, 47 insertions(+), 8 deletions(-) diff --git a/drivers/net/wireless/microchip/wilc1000/netdev.h b/drivers/net/wireless/microchip/wilc1000/netdev.h index 43c085c74b7a..bb1a315a7b7e 100644 --- a/drivers/net/wireless/microchip/wilc1000/netdev.h +++ b/drivers/net/wireless/microchip/wilc1000/netdev.h @@ -245,6 +245,7 @@ struct wilc { u8 *rx_buffer; u32 rx_buffer_offset; u8 *tx_buffer; + u32 *vmm_table; struct txq_handle txq[NQUEUES]; int txq_entries; diff --git a/drivers/net/wireless/microchip/wilc1000/sdio.c b/drivers/net/wireless/microchip/wilc1000/sdio.c index 600cc57e9da2..7390f94cd4ca 100644 --- a/drivers/net/wireless/microchip/wilc1000/sdio.c +++ b/drivers/net/wireless/microchip/wilc1000/sdio.c @@ -28,6 +28,7 @@ struct wilc_sdio { u32 block_size; bool isinit; int has_thrpt_enh3; + u8 *cmd53_buf; }; struct sdio_cmd52 { @@ -47,6 +48,7 @@ struct sdio_cmd53 { u32 count: 9; u8 *buffer; u32 block_size; + bool use_global_buf; }; static const struct wilc_hif_func wilc_hif_sdio; @@ -91,6 +93,8 @@ static int wilc_sdio_cmd53(struct wilc *wilc, struct sdio_cmd53 *cmd) { struct sdio_func *func = container_of(wilc->dev, struct sdio_func, dev); int size, ret; + struct wilc_sdio *sdio_priv = wilc->bus_data; + u8 *buf = cmd->buffer; sdio_claim_host(func); @@ -101,12 +105,23 @@ static int wilc_sdio_cmd53(struct wilc *wilc, struct sdio_cmd53 *cmd) else size = cmd->count; + if (cmd->use_global_buf) { + if (size > sizeof(u32)) + return -EINVAL; + + buf = sdio_priv->cmd53_buf; + } + if (cmd->read_write) { /* write */ - ret = sdio_memcpy_toio(func, cmd->address, - (void *)cmd->buffer, size); + if (cmd->use_global_buf) + memcpy(buf, cmd->buffer, size); + + ret = sdio_memcpy_toio(func, cmd->address, buf, size); } else { /* read */ - ret = sdio_memcpy_fromio(func, (void *)cmd->buffer, - cmd->address, size); + ret = sdio_memcpy_fromio(func, buf, cmd->address, size); + + if (cmd->use_global_buf) + memcpy(cmd->buffer, buf, size); } sdio_release_host(func); @@ -128,6 +143,12 @@ static int wilc_sdio_probe(struct sdio_func *func, if (!sdio_priv) return -ENOMEM; + sdio_priv->cmd53_buf = kzalloc(sizeof(u32), GFP_KERNEL); + if (!sdio_priv->cmd53_buf) { + ret = -ENOMEM; + goto free; + } + ret = wilc_cfg80211_init(&wilc, &func->dev, WILC_HIF_SDIO, &wilc_hif_sdio); if (ret) @@ -161,6 +182,7 @@ dispose_irq: irq_dispose_mapping(wilc->dev_irq_num); wilc_netdev_cleanup(wilc); free: + kfree(sdio_priv->cmd53_buf); kfree(sdio_priv); return ret; } @@ -172,6 +194,7 @@ static void wilc_sdio_remove(struct sdio_func *func) clk_disable_unprepare(wilc->rtc_clk); wilc_netdev_cleanup(wilc); + kfree(sdio_priv->cmd53_buf); kfree(sdio_priv); } @@ -375,8 +398,9 @@ static int wilc_sdio_write_reg(struct wilc *wilc, u32 addr, u32 data) cmd.address = WILC_SDIO_FBR_DATA_REG; cmd.block_mode = 0; cmd.increment = 1; - cmd.count = 4; + cmd.count = sizeof(u32); cmd.buffer = (u8 *)&data; + cmd.use_global_buf = true; cmd.block_size = sdio_priv->block_size; ret = wilc_sdio_cmd53(wilc, &cmd); if (ret) @@ -414,6 +438,7 @@ static int wilc_sdio_write(struct wilc *wilc, u32 addr, u8 *buf, u32 size) nblk = size / block_size; nleft = size % block_size; + cmd.use_global_buf = false; if (nblk > 0) { cmd.block_mode = 1; cmd.increment = 1; @@ -492,8 +517,9 @@ static int wilc_sdio_read_reg(struct wilc *wilc, u32 addr, u32 *data) cmd.address = WILC_SDIO_FBR_DATA_REG; cmd.block_mode = 0; cmd.increment = 1; - cmd.count = 4; + cmd.count = sizeof(u32); cmd.buffer = (u8 *)data; + cmd.use_global_buf = true; cmd.block_size = sdio_priv->block_size; ret = wilc_sdio_cmd53(wilc, &cmd); @@ -535,6 +561,7 @@ static int wilc_sdio_read(struct wilc *wilc, u32 addr, u8 *buf, u32 size) nblk = size / block_size; nleft = size % block_size; + cmd.use_global_buf = false; if (nblk > 0) { cmd.block_mode = 1; cmd.increment = 1; diff --git a/drivers/net/wireless/microchip/wilc1000/wlan.c b/drivers/net/wireless/microchip/wilc1000/wlan.c index 947d9a0a494e..58bbf50081e4 100644 --- a/drivers/net/wireless/microchip/wilc1000/wlan.c +++ b/drivers/net/wireless/microchip/wilc1000/wlan.c @@ -714,7 +714,7 @@ int wilc_wlan_handle_txq(struct wilc *wilc, u32 *txq_count) int ret = 0; int counter; int timeout; - u32 vmm_table[WILC_VMM_TBL_SIZE]; + u32 *vmm_table = wilc->vmm_table; u8 ac_pkt_num_to_chip[NQUEUES] = {0, 0, 0, 0}; const struct wilc_hif_func *func; int srcu_idx; @@ -1252,6 +1252,8 @@ void wilc_wlan_cleanup(struct net_device *dev) while ((rqe = wilc_wlan_rxq_remove(wilc))) kfree(rqe); + kfree(wilc->vmm_table); + wilc->vmm_table = NULL; kfree(wilc->rx_buffer); wilc->rx_buffer = NULL; kfree(wilc->tx_buffer); @@ -1489,6 +1491,14 @@ int wilc_wlan_init(struct net_device *dev) goto fail; } + if (!wilc->vmm_table) + wilc->vmm_table = kzalloc(WILC_VMM_TBL_SIZE, GFP_KERNEL); + + if (!wilc->vmm_table) { + ret = -ENOBUFS; + goto fail; + } + if (!wilc->tx_buffer) wilc->tx_buffer = kmalloc(WILC_TX_BUFF_SIZE, GFP_KERNEL); @@ -1513,7 +1523,8 @@ int wilc_wlan_init(struct net_device *dev) return 0; fail: - + kfree(wilc->vmm_table); + wilc->vmm_table = NULL; kfree(wilc->rx_buffer); wilc->rx_buffer = NULL; kfree(wilc->tx_buffer); -- cgit v1.2.3 From 6d0ef7241553f3553a0a2764c69b07892705924c Mon Sep 17 00:00:00 2001 From: Stanislaw Gruszka Date: Mon, 15 Aug 2022 09:37:37 +0200 Subject: wifi: iwlegacy: 4965: corrected fix for potential off-by-one overflow in il4965_rs_fill_link_cmd() This reverts commit a8eb8e6f7159c7c20c0ddac428bde3d110890aa7 as it can cause invalid link quality command sent to the firmware and address the off-by-one issue by fixing condition of while loop. Cc: stable@vger.kernel.org Fixes: a8eb8e6f7159 ("wifi: iwlegacy: 4965: fix potential off-by-one overflow in il4965_rs_fill_link_cmd()") Signed-off-by: Stanislaw Gruszka Signed-off-by: Kalle Valo Link: https://lore.kernel.org/r/20220815073737.GA999388@wp.pl --- drivers/net/wireless/intel/iwlegacy/4965-rs.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/drivers/net/wireless/intel/iwlegacy/4965-rs.c b/drivers/net/wireless/intel/iwlegacy/4965-rs.c index c62f299b9e0a..d8a5dbf89a02 100644 --- a/drivers/net/wireless/intel/iwlegacy/4965-rs.c +++ b/drivers/net/wireless/intel/iwlegacy/4965-rs.c @@ -2403,7 +2403,7 @@ il4965_rs_fill_link_cmd(struct il_priv *il, struct il_lq_sta *lq_sta, /* Repeat initial/next rate. * For legacy IL_NUMBER_TRY == 1, this loop will not execute. * For HT IL_HT_NUMBER_TRY == 3, this executes twice. */ - while (repeat_rate > 0) { + while (repeat_rate > 0 && idx < (LINK_QUAL_MAX_RETRY_NUM - 1)) { if (is_legacy(tbl_type.lq_type)) { if (ant_toggle_cnt < NUM_TRY_BEFORE_ANT_TOGGLE) ant_toggle_cnt++; @@ -2422,8 +2422,6 @@ il4965_rs_fill_link_cmd(struct il_priv *il, struct il_lq_sta *lq_sta, cpu_to_le32(new_rate); repeat_rate--; idx++; - if (idx >= LINK_QUAL_MAX_RETRY_NUM) - goto out; } il4965_rs_get_tbl_info_from_mcs(new_rate, lq_sta->band, @@ -2468,7 +2466,6 @@ il4965_rs_fill_link_cmd(struct il_priv *il, struct il_lq_sta *lq_sta, repeat_rate--; } -out: lq_cmd->agg_params.agg_frame_cnt_limit = LINK_QUAL_AGG_FRAME_LIMIT_DEF; lq_cmd->agg_params.agg_dis_start_th = LINK_QUAL_AGG_DISABLE_START_DEF; -- cgit v1.2.3 From b118509076b39cc5e616c0680312b5caaca535fe Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Fri, 26 Aug 2022 08:49:16 +0200 Subject: netfilter: remove nf_conntrack_helper sysctl and modparam toggles __nf_ct_try_assign_helper() remains in place but it now requires a template to configure the helper. A toggle to disable automatic helper assignment was added by: a9006892643a ("netfilter: nf_ct_helper: allow to disable automatic helper assignment") in 2012 to address the issues described in "Secure use of iptables and connection tracking helpers". Automatic conntrack helper assignment was disabled by: 3bb398d925ec ("netfilter: nf_ct_helper: disable automatic helper assignment") back in 2016. This patch removes the sysctl and modparam toggles, users now have to rely on explicit conntrack helper configuration via ruleset. Update tools/testing/selftests/netfilter/nft_conntrack_helper.sh to check that auto-assignment does not happen anymore. Acked-by: Aaron Conole Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_conntrack.h | 2 - include/net/netns/conntrack.h | 1 - net/netfilter/nf_conntrack_core.c | 7 +- net/netfilter/nf_conntrack_helper.c | 80 +++------------------- net/netfilter/nf_conntrack_netlink.c | 5 -- net/netfilter/nf_conntrack_standalone.c | 10 --- net/netfilter/nft_ct.c | 3 - .../selftests/netfilter/nft_conntrack_helper.sh | 36 +++++++--- 8 files changed, 37 insertions(+), 107 deletions(-) diff --git a/include/net/netfilter/nf_conntrack.h b/include/net/netfilter/nf_conntrack.h index a32be8aa7ed2..6a2019aaa464 100644 --- a/include/net/netfilter/nf_conntrack.h +++ b/include/net/netfilter/nf_conntrack.h @@ -53,8 +53,6 @@ struct nf_conntrack_net { /* only used when new connection is allocated: */ atomic_t count; unsigned int expect_count; - u8 sysctl_auto_assign_helper; - bool auto_assign_helper_warned; /* only used from work queues, configuration plane, and so on: */ unsigned int users4; diff --git a/include/net/netns/conntrack.h b/include/net/netns/conntrack.h index c396a3862e80..e1290c159184 100644 --- a/include/net/netns/conntrack.h +++ b/include/net/netns/conntrack.h @@ -101,7 +101,6 @@ struct netns_ct { u8 sysctl_log_invalid; /* Log invalid packets */ u8 sysctl_events; u8 sysctl_acct; - u8 sysctl_auto_assign_helper; u8 sysctl_tstamp; u8 sysctl_checksum; diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index 71c2f4f95d36..1357a2729a4b 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -1782,7 +1782,7 @@ init_conntrack(struct net *net, struct nf_conn *tmpl, } spin_unlock_bh(&nf_conntrack_expect_lock); } - if (!exp) + if (!exp && tmpl) __nf_ct_try_assign_helper(ct, tmpl, GFP_ATOMIC); /* Other CPU might have obtained a pointer to this object before it was @@ -2068,10 +2068,6 @@ void nf_conntrack_alter_reply(struct nf_conn *ct, ct->tuplehash[IP_CT_DIR_REPLY].tuple = *newreply; if (ct->master || (help && !hlist_empty(&help->expectations))) return; - - rcu_read_lock(); - __nf_ct_try_assign_helper(ct, NULL, GFP_ATOMIC); - rcu_read_unlock(); } EXPORT_SYMBOL_GPL(nf_conntrack_alter_reply); @@ -2797,7 +2793,6 @@ int nf_conntrack_init_net(struct net *net) nf_conntrack_acct_pernet_init(net); nf_conntrack_tstamp_pernet_init(net); nf_conntrack_ecache_pernet_init(net); - nf_conntrack_helper_pernet_init(net); nf_conntrack_proto_pernet_init(net); return 0; diff --git a/net/netfilter/nf_conntrack_helper.c b/net/netfilter/nf_conntrack_helper.c index e96b32221444..ff737a76052e 100644 --- a/net/netfilter/nf_conntrack_helper.c +++ b/net/netfilter/nf_conntrack_helper.c @@ -35,11 +35,6 @@ unsigned int nf_ct_helper_hsize __read_mostly; EXPORT_SYMBOL_GPL(nf_ct_helper_hsize); static unsigned int nf_ct_helper_count __read_mostly; -static bool nf_ct_auto_assign_helper __read_mostly = false; -module_param_named(nf_conntrack_helper, nf_ct_auto_assign_helper, bool, 0644); -MODULE_PARM_DESC(nf_conntrack_helper, - "Enable automatic conntrack helper assignment (default 0)"); - static DEFINE_MUTEX(nf_ct_nat_helpers_mutex); static struct list_head nf_ct_nat_helpers __read_mostly; @@ -51,24 +46,6 @@ static unsigned int helper_hash(const struct nf_conntrack_tuple *tuple) (__force __u16)tuple->src.u.all) % nf_ct_helper_hsize; } -static struct nf_conntrack_helper * -__nf_ct_helper_find(const struct nf_conntrack_tuple *tuple) -{ - struct nf_conntrack_helper *helper; - struct nf_conntrack_tuple_mask mask = { .src.u.all = htons(0xFFFF) }; - unsigned int h; - - if (!nf_ct_helper_count) - return NULL; - - h = helper_hash(tuple); - hlist_for_each_entry_rcu(helper, &nf_ct_helper_hash[h], hnode) { - if (nf_ct_tuple_src_mask_cmp(tuple, &helper->tuple, &mask)) - return helper; - } - return NULL; -} - struct nf_conntrack_helper * __nf_conntrack_helper_find(const char *name, u16 l3num, u8 protonum) { @@ -209,33 +186,11 @@ nf_ct_helper_ext_add(struct nf_conn *ct, gfp_t gfp) } EXPORT_SYMBOL_GPL(nf_ct_helper_ext_add); -static struct nf_conntrack_helper * -nf_ct_lookup_helper(struct nf_conn *ct, struct net *net) -{ - struct nf_conntrack_net *cnet = nf_ct_pernet(net); - - if (!cnet->sysctl_auto_assign_helper) { - if (cnet->auto_assign_helper_warned) - return NULL; - if (!__nf_ct_helper_find(&ct->tuplehash[IP_CT_DIR_REPLY].tuple)) - return NULL; - pr_info("nf_conntrack: default automatic helper assignment " - "has been turned off for security reasons and CT-based " - "firewall rule not found. Use the iptables CT target " - "to attach helpers instead.\n"); - cnet->auto_assign_helper_warned = true; - return NULL; - } - - return __nf_ct_helper_find(&ct->tuplehash[IP_CT_DIR_REPLY].tuple); -} - int __nf_ct_try_assign_helper(struct nf_conn *ct, struct nf_conn *tmpl, gfp_t flags) { struct nf_conntrack_helper *helper = NULL; struct nf_conn_help *help; - struct net *net = nf_ct_net(ct); /* We already got a helper explicitly attached. The function * nf_conntrack_alter_reply - in case NAT is in use - asks for looking @@ -246,23 +201,21 @@ int __nf_ct_try_assign_helper(struct nf_conn *ct, struct nf_conn *tmpl, if (test_bit(IPS_HELPER_BIT, &ct->status)) return 0; - if (tmpl != NULL) { - help = nfct_help(tmpl); - if (help != NULL) { - helper = rcu_dereference(help->helper); - set_bit(IPS_HELPER_BIT, &ct->status); - } + if (WARN_ON_ONCE(!tmpl)) + return 0; + + help = nfct_help(tmpl); + if (help != NULL) { + helper = rcu_dereference(help->helper); + set_bit(IPS_HELPER_BIT, &ct->status); } help = nfct_help(ct); if (helper == NULL) { - helper = nf_ct_lookup_helper(ct, net); - if (helper == NULL) { - if (help) - RCU_INIT_POINTER(help->helper, NULL); - return 0; - } + if (help) + RCU_INIT_POINTER(help->helper, NULL); + return 0; } if (help == NULL) { @@ -545,19 +498,6 @@ void nf_nat_helper_unregister(struct nf_conntrack_nat_helper *nat) } EXPORT_SYMBOL_GPL(nf_nat_helper_unregister); -void nf_ct_set_auto_assign_helper_warned(struct net *net) -{ - nf_ct_pernet(net)->auto_assign_helper_warned = true; -} -EXPORT_SYMBOL_GPL(nf_ct_set_auto_assign_helper_warned); - -void nf_conntrack_helper_pernet_init(struct net *net) -{ - struct nf_conntrack_net *cnet = nf_ct_pernet(net); - - cnet->sysctl_auto_assign_helper = nf_ct_auto_assign_helper; -} - int nf_conntrack_helper_init(void) { nf_ct_helper_hsize = 1; /* gets rounded up to use one page */ diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c index 04169b54f2a2..7562b215b932 100644 --- a/net/netfilter/nf_conntrack_netlink.c +++ b/net/netfilter/nf_conntrack_netlink.c @@ -2298,11 +2298,6 @@ ctnetlink_create_conntrack(struct net *net, ct->status |= IPS_HELPER; RCU_INIT_POINTER(help->helper, helper); } - } else { - /* try an implicit helper assignation */ - err = __nf_ct_try_assign_helper(ct, NULL, GFP_ATOMIC); - if (err < 0) - goto err2; } err = ctnetlink_setup_nat(ct, cda); diff --git a/net/netfilter/nf_conntrack_standalone.c b/net/netfilter/nf_conntrack_standalone.c index 05895878610c..4ffe84c5a82c 100644 --- a/net/netfilter/nf_conntrack_standalone.c +++ b/net/netfilter/nf_conntrack_standalone.c @@ -561,7 +561,6 @@ enum nf_ct_sysctl_index { NF_SYSCTL_CT_LOG_INVALID, NF_SYSCTL_CT_EXPECT_MAX, NF_SYSCTL_CT_ACCT, - NF_SYSCTL_CT_HELPER, #ifdef CONFIG_NF_CONNTRACK_EVENTS NF_SYSCTL_CT_EVENTS, #endif @@ -680,14 +679,6 @@ static struct ctl_table nf_ct_sysctl_table[] = { .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE, }, - [NF_SYSCTL_CT_HELPER] = { - .procname = "nf_conntrack_helper", - .maxlen = sizeof(u8), - .mode = 0644, - .proc_handler = proc_dou8vec_minmax, - .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_ONE, - }, #ifdef CONFIG_NF_CONNTRACK_EVENTS [NF_SYSCTL_CT_EVENTS] = { .procname = "nf_conntrack_events", @@ -1100,7 +1091,6 @@ static int nf_conntrack_standalone_init_sysctl(struct net *net) table[NF_SYSCTL_CT_CHECKSUM].data = &net->ct.sysctl_checksum; table[NF_SYSCTL_CT_LOG_INVALID].data = &net->ct.sysctl_log_invalid; table[NF_SYSCTL_CT_ACCT].data = &net->ct.sysctl_acct; - table[NF_SYSCTL_CT_HELPER].data = &cnet->sysctl_auto_assign_helper; #ifdef CONFIG_NF_CONNTRACK_EVENTS table[NF_SYSCTL_CT_EVENTS].data = &net->ct.sysctl_events; #endif diff --git a/net/netfilter/nft_ct.c b/net/netfilter/nft_ct.c index b04995c3e17f..a3f01f209a53 100644 --- a/net/netfilter/nft_ct.c +++ b/net/netfilter/nft_ct.c @@ -1089,9 +1089,6 @@ static int nft_ct_helper_obj_init(const struct nft_ctx *ctx, if (err < 0) goto err_put_helper; - /* Avoid the bogus warning, helper will be assigned after CT init */ - nf_ct_set_auto_assign_helper_warned(ctx->net); - return 0; err_put_helper: diff --git a/tools/testing/selftests/netfilter/nft_conntrack_helper.sh b/tools/testing/selftests/netfilter/nft_conntrack_helper.sh index bf6b9626c7dd..faa7778d7bd1 100755 --- a/tools/testing/selftests/netfilter/nft_conntrack_helper.sh +++ b/tools/testing/selftests/netfilter/nft_conntrack_helper.sh @@ -102,26 +102,42 @@ check_for_helper() ip netns exec ${netns} conntrack -L -f $family -p tcp --dport $port 2> /dev/null |grep -q 'helper=ftp' if [ $? -ne 0 ] ; then - echo "FAIL: ${netns} did not show attached helper $message" 1>&2 - ret=1 + if [ $autoassign -eq 0 ] ;then + echo "FAIL: ${netns} did not show attached helper $message" 1>&2 + ret=1 + else + echo "PASS: ${netns} did not show attached helper $message" 1>&2 + fi + else + if [ $autoassign -eq 0 ] ;then + echo "PASS: ${netns} connection on port $port has ftp helper attached" 1>&2 + else + echo "FAIL: ${netns} connection on port $port has ftp helper attached" 1>&2 + ret=1 + fi fi - echo "PASS: ${netns} connection on port $port has ftp helper attached" 1>&2 return 0 } test_helper() { local port=$1 - local msg=$2 + local autoassign=$2 + + if [ $autoassign -eq 0 ] ;then + msg="set via ruleset" + else + msg="auto-assign" + fi sleep 3 | ip netns exec ${ns2} nc -w 2 -l -p $port > /dev/null & sleep 1 | ip netns exec ${ns1} nc -w 2 10.0.1.2 $port > /dev/null & sleep 1 - check_for_helper "$ns1" "ip $msg" $port - check_for_helper "$ns2" "ip $msg" $port + check_for_helper "$ns1" "ip $msg" $port $autoassign + check_for_helper "$ns2" "ip $msg" $port $autoassign wait @@ -173,9 +189,9 @@ if [ $? -ne 0 ];then fi fi -test_helper 2121 "set via ruleset" -ip netns exec ${ns1} sysctl -q 'net.netfilter.nf_conntrack_helper=1' -ip netns exec ${ns2} sysctl -q 'net.netfilter.nf_conntrack_helper=1' -test_helper 21 "auto-assign" +test_helper 2121 0 +ip netns exec ${ns1} sysctl -qe 'net.netfilter.nf_conntrack_helper=1' +ip netns exec ${ns2} sysctl -qe 'net.netfilter.nf_conntrack_helper=1' +test_helper 21 1 exit $ret -- cgit v1.2.3 From d047283a7034140ea5da759a494fd2274affdd46 Mon Sep 17 00:00:00 2001 From: Harsh Modi Date: Tue, 30 Aug 2022 22:36:03 -0700 Subject: netfilter: br_netfilter: Drop dst references before setting. The IPv6 path already drops dst in the daddr changed case, but the IPv4 path does not. This change makes the two code paths consistent. Further, it is possible that there is already a metadata_dst allocated from ingress that might already be attached to skbuff->dst while following the bridge path. If it is not released before setting a new metadata_dst, it will be leaked. This is similar to what is done in bpf_set_tunnel_key() or ip6_route_input(). It is important to note that the memory being leaked is not the dst being set in the bridge code, but rather memory allocated from some other code path that is not being freed correctly before the skb dst is overwritten. An example of the leakage fixed by this commit found using kmemleak: unreferenced object 0xffff888010112b00 (size 256): comm "softirq", pid 0, jiffies 4294762496 (age 32.012s) hex dump (first 32 bytes): 00 00 00 00 00 00 00 00 80 16 f1 83 ff ff ff ff ................ e1 4e f6 82 ff ff ff ff 00 00 00 00 00 00 00 00 .N.............. backtrace: [<00000000d79567ea>] metadata_dst_alloc+0x1b/0xe0 [<00000000be113e13>] udp_tun_rx_dst+0x174/0x1f0 [<00000000a36848f4>] geneve_udp_encap_recv+0x350/0x7b0 [<00000000d4afb476>] udp_queue_rcv_one_skb+0x380/0x560 [<00000000ac064aea>] udp_unicast_rcv_skb+0x75/0x90 [<000000009a8ee8c5>] ip_protocol_deliver_rcu+0xd8/0x230 [<00000000ef4980bb>] ip_local_deliver_finish+0x7a/0xa0 [<00000000d7533c8c>] __netif_receive_skb_one_core+0x89/0xa0 [<00000000a879497d>] process_backlog+0x93/0x190 [<00000000e41ade9f>] __napi_poll+0x28/0x170 [<00000000b4c0906b>] net_rx_action+0x14f/0x2a0 [<00000000b20dd5d4>] __do_softirq+0xf4/0x305 [<000000003a7d7e15>] __irq_exit_rcu+0xc3/0x140 [<00000000968d39a2>] sysvec_apic_timer_interrupt+0x9e/0xc0 [<000000009e920794>] asm_sysvec_apic_timer_interrupt+0x16/0x20 [<000000008942add0>] native_safe_halt+0x13/0x20 Florian Westphal says: "Original code was likely fine because nothing ever did set a skb->dst entry earlier than bridge in those days." Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Signed-off-by: Harsh Modi Acked-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/bridge/br_netfilter_hooks.c | 2 ++ net/bridge/br_netfilter_ipv6.c | 1 + 2 files changed, 3 insertions(+) diff --git a/net/bridge/br_netfilter_hooks.c b/net/bridge/br_netfilter_hooks.c index ff4779036649..f20f4373ff40 100644 --- a/net/bridge/br_netfilter_hooks.c +++ b/net/bridge/br_netfilter_hooks.c @@ -384,6 +384,7 @@ static int br_nf_pre_routing_finish(struct net *net, struct sock *sk, struct sk_ /* - Bridged-and-DNAT'ed traffic doesn't * require ip_forwarding. */ if (rt->dst.dev == dev) { + skb_dst_drop(skb); skb_dst_set(skb, &rt->dst); goto bridged_dnat; } @@ -413,6 +414,7 @@ bridged_dnat: kfree_skb(skb); return 0; } + skb_dst_drop(skb); skb_dst_set_noref(skb, &rt->dst); } diff --git a/net/bridge/br_netfilter_ipv6.c b/net/bridge/br_netfilter_ipv6.c index e4e0c836c3f5..6b07f30675bb 100644 --- a/net/bridge/br_netfilter_ipv6.c +++ b/net/bridge/br_netfilter_ipv6.c @@ -197,6 +197,7 @@ static int br_nf_pre_routing_finish_ipv6(struct net *net, struct sock *sk, struc kfree_skb(skb); return 0; } + skb_dst_drop(skb); skb_dst_set_noref(skb, &rt->dst); } -- cgit v1.2.3 From 77972a36ecc4db7fc7c68f0e80714263c5f03f65 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Wed, 31 Aug 2022 13:11:47 +0200 Subject: netfilter: nf_tables: clean up hook list when offload flags check fails splice back the hook list so nft_chain_release_hook() has a chance to release the hooks. BUG: memory leak unreferenced object 0xffff88810180b100 (size 96): comm "syz-executor133", pid 3619, jiffies 4294945714 (age 12.690s) hex dump (first 32 bytes): 28 64 23 02 81 88 ff ff 28 64 23 02 81 88 ff ff (d#.....(d#..... 90 a8 aa 83 ff ff ff ff 00 00 b5 0f 81 88 ff ff ................ backtrace: [] kmalloc include/linux/slab.h:600 [inline] [] nft_netdev_hook_alloc+0x3b/0xc0 net/netfilter/nf_tables_api.c:1901 [] nft_chain_parse_netdev net/netfilter/nf_tables_api.c:1998 [inline] [] nft_chain_parse_hook+0x33a/0x530 net/netfilter/nf_tables_api.c:2073 [] nf_tables_addchain.constprop.0+0x10b/0x950 net/netfilter/nf_tables_api.c:2218 [] nf_tables_newchain+0xa8b/0xc60 net/netfilter/nf_tables_api.c:2593 [] nfnetlink_rcv_batch+0xa46/0xd20 net/netfilter/nfnetlink.c:517 [] nfnetlink_rcv_skb_batch net/netfilter/nfnetlink.c:638 [inline] [] nfnetlink_rcv+0x1f9/0x220 net/netfilter/nfnetlink.c:656 [] netlink_unicast_kernel net/netlink/af_netlink.c:1319 [inline] [] netlink_unicast+0x397/0x4c0 net/netlink/af_netlink.c:1345 [] netlink_sendmsg+0x396/0x710 net/netlink/af_netlink.c:1921 [] sock_sendmsg_nosec net/socket.c:714 [inline] [] sock_sendmsg+0x56/0x80 net/socket.c:734 [] ____sys_sendmsg+0x36c/0x390 net/socket.c:2482 [] ___sys_sendmsg+0xa8/0x110 net/socket.c:2536 [] __sys_sendmsg+0x88/0x100 net/socket.c:2565 [] do_syscall_x64 arch/x86/entry/common.c:50 [inline] [] do_syscall_64+0x35/0xb0 arch/x86/entry/common.c:80 [] entry_SYSCALL_64_after_hwframe+0x63/0xcd Fixes: d54725cd11a5 ("netfilter: nf_tables: support for multiple devices per netdev hook") Reported-by: syzbot+5fcdbfab6d6744c57418@syzkaller.appspotmail.com Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_tables_api.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 2ee50e23c9b7..816052089b33 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -2166,8 +2166,10 @@ static int nft_basechain_init(struct nft_base_chain *basechain, u8 family, chain->flags |= NFT_CHAIN_BASE | flags; basechain->policy = NF_ACCEPT; if (chain->flags & NFT_CHAIN_HW_OFFLOAD && - !nft_chain_offload_support(basechain)) + !nft_chain_offload_support(basechain)) { + list_splice_init(&basechain->hook_list, &hook->list); return -EOPNOTSUPP; + } flow_block_init(&basechain->flow_block); -- cgit v1.2.3 From 0efe125cfb99e6773a7434f3463f7c2fa28f3a43 Mon Sep 17 00:00:00 2001 From: David Leadbeater Date: Fri, 26 Aug 2022 14:56:58 +1000 Subject: netfilter: nf_conntrack_irc: Fix forged IP logic Ensure the match happens in the right direction, previously the destination used was the server, not the NAT host, as the comment shows the code intended. Additionally nf_nat_irc uses port 0 as a signal and there's no valid way it can appear in a DCC message, so consider port 0 also forged. Fixes: 869f37d8e48f ("[NETFILTER]: nf_conntrack/nf_nat: add IRC helper port") Signed-off-by: David Leadbeater Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_conntrack_irc.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/net/netfilter/nf_conntrack_irc.c b/net/netfilter/nf_conntrack_irc.c index 1796c456ac98..992decbcaa5c 100644 --- a/net/netfilter/nf_conntrack_irc.c +++ b/net/netfilter/nf_conntrack_irc.c @@ -194,8 +194,9 @@ static int help(struct sk_buff *skb, unsigned int protoff, /* dcc_ip can be the internal OR external (NAT'ed) IP */ tuple = &ct->tuplehash[dir].tuple; - if (tuple->src.u3.ip != dcc_ip && - tuple->dst.u3.ip != dcc_ip) { + if ((tuple->src.u3.ip != dcc_ip && + ct->tuplehash[!dir].tuple.dst.u3.ip != dcc_ip) || + dcc_port == 0) { net_warn_ratelimited("Forged DCC command from %pI4: %pI4:%u\n", &tuple->src.u3.ip, &dcc_ip, dcc_port); -- cgit v1.2.3 From ac56a0b48da86fd1b4389632fb7c4c8a5d86eefa Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 26 Aug 2022 15:39:28 +0100 Subject: rxrpc: Fix ICMP/ICMP6 error handling Because rxrpc pretends to be a tunnel on top of a UDP/UDP6 socket, allowing it to siphon off UDP packets early in the handling of received UDP packets thereby avoiding the packet going through the UDP receive queue, it doesn't get ICMP packets through the UDP ->sk_error_report() callback. In fact, it doesn't appear that there's any usable option for getting hold of ICMP packets. Fix this by adding a new UDP encap hook to distribute error messages for UDP tunnels. If the hook is set, then the tunnel driver will be able to see ICMP packets. The hook provides the offset into the packet of the UDP header of the original packet that caused the notification. An alternative would be to call the ->error_handler() hook - but that requires that the skbuff be cloned (as ip_icmp_error() or ipv6_cmp_error() do, though isn't really necessary or desirable in rxrpc's case is we want to parse them there and then, not queue them). Changes ======= ver #3) - Fixed an uninitialised variable. ver #2) - Fixed some missing CONFIG_AF_RXRPC_IPV6 conditionals. Fixes: 5271953cad31 ("rxrpc: Use the UDP encap_rcv hook") Signed-off-by: David Howells --- include/linux/udp.h | 1 + include/net/udp_tunnel.h | 4 + net/ipv4/udp.c | 2 + net/ipv4/udp_tunnel_core.c | 1 + net/ipv6/udp.c | 5 +- net/rxrpc/ar-internal.h | 1 + net/rxrpc/local_object.c | 1 + net/rxrpc/peer_event.c | 293 +++++++++++++++++++++++++++++++++++++++------ 8 files changed, 270 insertions(+), 38 deletions(-) diff --git a/include/linux/udp.h b/include/linux/udp.h index 254a2654400f..e96da4157d04 100644 --- a/include/linux/udp.h +++ b/include/linux/udp.h @@ -70,6 +70,7 @@ struct udp_sock { * For encapsulation sockets. */ int (*encap_rcv)(struct sock *sk, struct sk_buff *skb); + void (*encap_err_rcv)(struct sock *sk, struct sk_buff *skb, unsigned int udp_offset); int (*encap_err_lookup)(struct sock *sk, struct sk_buff *skb); void (*encap_destroy)(struct sock *sk); diff --git a/include/net/udp_tunnel.h b/include/net/udp_tunnel.h index afc7ce713657..72394f441dad 100644 --- a/include/net/udp_tunnel.h +++ b/include/net/udp_tunnel.h @@ -67,6 +67,9 @@ static inline int udp_sock_create(struct net *net, typedef int (*udp_tunnel_encap_rcv_t)(struct sock *sk, struct sk_buff *skb); typedef int (*udp_tunnel_encap_err_lookup_t)(struct sock *sk, struct sk_buff *skb); +typedef void (*udp_tunnel_encap_err_rcv_t)(struct sock *sk, + struct sk_buff *skb, + unsigned int udp_offset); typedef void (*udp_tunnel_encap_destroy_t)(struct sock *sk); typedef struct sk_buff *(*udp_tunnel_gro_receive_t)(struct sock *sk, struct list_head *head, @@ -80,6 +83,7 @@ struct udp_tunnel_sock_cfg { __u8 encap_type; udp_tunnel_encap_rcv_t encap_rcv; udp_tunnel_encap_err_lookup_t encap_err_lookup; + udp_tunnel_encap_err_rcv_t encap_err_rcv; udp_tunnel_encap_destroy_t encap_destroy; udp_tunnel_gro_receive_t gro_receive; udp_tunnel_gro_complete_t gro_complete; diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 34eda973bbf1..cd72158e953a 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -783,6 +783,8 @@ int __udp4_lib_err(struct sk_buff *skb, u32 info, struct udp_table *udptable) */ if (tunnel) { /* ...not for tunnels though: we don't have a sending socket */ + if (udp_sk(sk)->encap_err_rcv) + udp_sk(sk)->encap_err_rcv(sk, skb, iph->ihl << 2); goto out; } if (!inet->recverr) { diff --git a/net/ipv4/udp_tunnel_core.c b/net/ipv4/udp_tunnel_core.c index 8efaf8c3fe2a..8242c8947340 100644 --- a/net/ipv4/udp_tunnel_core.c +++ b/net/ipv4/udp_tunnel_core.c @@ -72,6 +72,7 @@ void setup_udp_tunnel_sock(struct net *net, struct socket *sock, udp_sk(sk)->encap_type = cfg->encap_type; udp_sk(sk)->encap_rcv = cfg->encap_rcv; + udp_sk(sk)->encap_err_rcv = cfg->encap_err_rcv; udp_sk(sk)->encap_err_lookup = cfg->encap_err_lookup; udp_sk(sk)->encap_destroy = cfg->encap_destroy; udp_sk(sk)->gro_receive = cfg->gro_receive; diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index 16c176e7c69a..3366d6a77ff2 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -616,8 +616,11 @@ int __udp6_lib_err(struct sk_buff *skb, struct inet6_skb_parm *opt, } /* Tunnels don't have an application socket: don't pass errors back */ - if (tunnel) + if (tunnel) { + if (udp_sk(sk)->encap_err_rcv) + udp_sk(sk)->encap_err_rcv(sk, skb, offset); goto out; + } if (!np->recverr) { if (!harderr || sk->sk_state != TCP_ESTABLISHED) diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h index 571436064cd6..62c70709d798 100644 --- a/net/rxrpc/ar-internal.h +++ b/net/rxrpc/ar-internal.h @@ -982,6 +982,7 @@ void rxrpc_send_keepalive(struct rxrpc_peer *); /* * peer_event.c */ +void rxrpc_encap_err_rcv(struct sock *sk, struct sk_buff *skb, unsigned int udp_offset); void rxrpc_error_report(struct sock *); void rxrpc_peer_keepalive_worker(struct work_struct *); diff --git a/net/rxrpc/local_object.c b/net/rxrpc/local_object.c index 96ecb7356c0f..79bb02eb67b2 100644 --- a/net/rxrpc/local_object.c +++ b/net/rxrpc/local_object.c @@ -137,6 +137,7 @@ static int rxrpc_open_socket(struct rxrpc_local *local, struct net *net) tuncfg.encap_type = UDP_ENCAP_RXRPC; tuncfg.encap_rcv = rxrpc_input_packet; + tuncfg.encap_err_rcv = rxrpc_encap_err_rcv; tuncfg.sk_user_data = local; setup_udp_tunnel_sock(net, local->socket, &tuncfg); diff --git a/net/rxrpc/peer_event.c b/net/rxrpc/peer_event.c index be032850ae8c..32561e9567fe 100644 --- a/net/rxrpc/peer_event.c +++ b/net/rxrpc/peer_event.c @@ -16,22 +16,105 @@ #include #include #include +#include #include "ar-internal.h" +static void rxrpc_adjust_mtu(struct rxrpc_peer *, unsigned int); static void rxrpc_store_error(struct rxrpc_peer *, struct sock_exterr_skb *); static void rxrpc_distribute_error(struct rxrpc_peer *, int, enum rxrpc_call_completion); /* - * Find the peer associated with an ICMP packet. + * Find the peer associated with an ICMPv4 packet. */ static struct rxrpc_peer *rxrpc_lookup_peer_icmp_rcu(struct rxrpc_local *local, - const struct sk_buff *skb, + struct sk_buff *skb, + unsigned int udp_offset, + unsigned int *info, struct sockaddr_rxrpc *srx) { - struct sock_exterr_skb *serr = SKB_EXT_ERR(skb); + struct iphdr *ip, *ip0 = ip_hdr(skb); + struct icmphdr *icmp = icmp_hdr(skb); + struct udphdr *udp = (struct udphdr *)(skb->data + udp_offset); - _enter(""); + _enter("%u,%u,%u", ip0->protocol, icmp->type, icmp->code); + + switch (icmp->type) { + case ICMP_DEST_UNREACH: + *info = ntohs(icmp->un.frag.mtu); + fallthrough; + case ICMP_TIME_EXCEEDED: + case ICMP_PARAMETERPROB: + ip = (struct iphdr *)((void *)icmp + 8); + break; + default: + return NULL; + } + + memset(srx, 0, sizeof(*srx)); + srx->transport_type = local->srx.transport_type; + srx->transport_len = local->srx.transport_len; + srx->transport.family = local->srx.transport.family; + + /* Can we see an ICMP4 packet on an ICMP6 listening socket? and vice + * versa? + */ + switch (srx->transport.family) { + case AF_INET: + srx->transport_len = sizeof(srx->transport.sin); + srx->transport.family = AF_INET; + srx->transport.sin.sin_port = udp->dest; + memcpy(&srx->transport.sin.sin_addr, &ip->daddr, + sizeof(struct in_addr)); + break; + +#ifdef CONFIG_AF_RXRPC_IPV6 + case AF_INET6: + srx->transport_len = sizeof(srx->transport.sin); + srx->transport.family = AF_INET; + srx->transport.sin.sin_port = udp->dest; + memcpy(&srx->transport.sin.sin_addr, &ip->daddr, + sizeof(struct in_addr)); + break; +#endif + + default: + WARN_ON_ONCE(1); + return NULL; + } + + _net("ICMP {%pISp}", &srx->transport); + return rxrpc_lookup_peer_rcu(local, srx); +} + +#ifdef CONFIG_AF_RXRPC_IPV6 +/* + * Find the peer associated with an ICMPv6 packet. + */ +static struct rxrpc_peer *rxrpc_lookup_peer_icmp6_rcu(struct rxrpc_local *local, + struct sk_buff *skb, + unsigned int udp_offset, + unsigned int *info, + struct sockaddr_rxrpc *srx) +{ + struct icmp6hdr *icmp = icmp6_hdr(skb); + struct ipv6hdr *ip, *ip0 = ipv6_hdr(skb); + struct udphdr *udp = (struct udphdr *)(skb->data + udp_offset); + + _enter("%u,%u,%u", ip0->nexthdr, icmp->icmp6_type, icmp->icmp6_code); + + switch (icmp->icmp6_type) { + case ICMPV6_DEST_UNREACH: + *info = ntohl(icmp->icmp6_mtu); + fallthrough; + case ICMPV6_PKT_TOOBIG: + case ICMPV6_TIME_EXCEED: + case ICMPV6_PARAMPROB: + ip = (struct ipv6hdr *)((void *)icmp + 8); + break; + default: + return NULL; + } memset(srx, 0, sizeof(*srx)); srx->transport_type = local->srx.transport_type; @@ -41,6 +124,165 @@ static struct rxrpc_peer *rxrpc_lookup_peer_icmp_rcu(struct rxrpc_local *local, /* Can we see an ICMP4 packet on an ICMP6 listening socket? and vice * versa? */ + switch (srx->transport.family) { + case AF_INET: + _net("Rx ICMP6 on v4 sock"); + srx->transport_len = sizeof(srx->transport.sin); + srx->transport.family = AF_INET; + srx->transport.sin.sin_port = udp->dest; + memcpy(&srx->transport.sin.sin_addr, + &ip->daddr.s6_addr32[3], sizeof(struct in_addr)); + break; + case AF_INET6: + _net("Rx ICMP6"); + srx->transport.sin.sin_port = udp->dest; + memcpy(&srx->transport.sin6.sin6_addr, &ip->daddr, + sizeof(struct in6_addr)); + break; + default: + WARN_ON_ONCE(1); + return NULL; + } + + _net("ICMP {%pISp}", &srx->transport); + return rxrpc_lookup_peer_rcu(local, srx); +} +#endif /* CONFIG_AF_RXRPC_IPV6 */ + +/* + * Handle an error received on the local endpoint as a tunnel. + */ +void rxrpc_encap_err_rcv(struct sock *sk, struct sk_buff *skb, + unsigned int udp_offset) +{ + struct sock_extended_err ee; + struct sockaddr_rxrpc srx; + struct rxrpc_local *local; + struct rxrpc_peer *peer; + unsigned int info = 0; + int err; + u8 version = ip_hdr(skb)->version; + u8 type = icmp_hdr(skb)->type; + u8 code = icmp_hdr(skb)->code; + + rcu_read_lock(); + local = rcu_dereference_sk_user_data(sk); + if (unlikely(!local)) { + rcu_read_unlock(); + return; + } + + rxrpc_new_skb(skb, rxrpc_skb_received); + + switch (ip_hdr(skb)->version) { + case IPVERSION: + peer = rxrpc_lookup_peer_icmp_rcu(local, skb, udp_offset, + &info, &srx); + break; +#ifdef CONFIG_AF_RXRPC_IPV6 + case 6: + peer = rxrpc_lookup_peer_icmp6_rcu(local, skb, udp_offset, + &info, &srx); + break; +#endif + default: + rcu_read_unlock(); + return; + } + + if (peer && !rxrpc_get_peer_maybe(peer)) + peer = NULL; + if (!peer) { + rcu_read_unlock(); + return; + } + + memset(&ee, 0, sizeof(ee)); + + switch (version) { + case IPVERSION: + switch (type) { + case ICMP_DEST_UNREACH: + switch (code) { + case ICMP_FRAG_NEEDED: + rxrpc_adjust_mtu(peer, info); + rcu_read_unlock(); + rxrpc_put_peer(peer); + return; + default: + break; + } + + err = EHOSTUNREACH; + if (code <= NR_ICMP_UNREACH) { + /* Might want to do something different with + * non-fatal errors + */ + //harderr = icmp_err_convert[code].fatal; + err = icmp_err_convert[code].errno; + } + break; + + case ICMP_TIME_EXCEEDED: + err = EHOSTUNREACH; + break; + default: + err = EPROTO; + break; + } + + ee.ee_origin = SO_EE_ORIGIN_ICMP; + ee.ee_type = type; + ee.ee_code = code; + ee.ee_errno = err; + break; + +#ifdef CONFIG_AF_RXRPC_IPV6 + case 6: + switch (type) { + case ICMPV6_PKT_TOOBIG: + rxrpc_adjust_mtu(peer, info); + rcu_read_unlock(); + rxrpc_put_peer(peer); + return; + } + + icmpv6_err_convert(type, code, &err); + + if (err == EACCES) + err = EHOSTUNREACH; + + ee.ee_origin = SO_EE_ORIGIN_ICMP6; + ee.ee_type = type; + ee.ee_code = code; + ee.ee_errno = err; + break; +#endif + } + + trace_rxrpc_rx_icmp(peer, &ee, &srx); + + rxrpc_distribute_error(peer, err, RXRPC_CALL_NETWORK_ERROR); + rcu_read_unlock(); + rxrpc_put_peer(peer); +} + +/* + * Find the peer associated with a local error. + */ +static struct rxrpc_peer *rxrpc_lookup_peer_local_rcu(struct rxrpc_local *local, + const struct sk_buff *skb, + struct sockaddr_rxrpc *srx) +{ + struct sock_exterr_skb *serr = SKB_EXT_ERR(skb); + + _enter(""); + + memset(srx, 0, sizeof(*srx)); + srx->transport_type = local->srx.transport_type; + srx->transport_len = local->srx.transport_len; + srx->transport.family = local->srx.transport.family; + switch (srx->transport.family) { case AF_INET: srx->transport_len = sizeof(srx->transport.sin); @@ -104,10 +346,8 @@ static struct rxrpc_peer *rxrpc_lookup_peer_icmp_rcu(struct rxrpc_local *local, /* * Handle an MTU/fragmentation problem. */ -static void rxrpc_adjust_mtu(struct rxrpc_peer *peer, struct sock_exterr_skb *serr) +static void rxrpc_adjust_mtu(struct rxrpc_peer *peer, unsigned int mtu) { - u32 mtu = serr->ee.ee_info; - _net("Rx ICMP Fragmentation Needed (%d)", mtu); /* wind down the local interface MTU */ @@ -148,7 +388,7 @@ void rxrpc_error_report(struct sock *sk) struct sock_exterr_skb *serr; struct sockaddr_rxrpc srx; struct rxrpc_local *local; - struct rxrpc_peer *peer; + struct rxrpc_peer *peer = NULL; struct sk_buff *skb; rcu_read_lock(); @@ -172,41 +412,20 @@ void rxrpc_error_report(struct sock *sk) } rxrpc_new_skb(skb, rxrpc_skb_received); serr = SKB_EXT_ERR(skb); - if (!skb->len && serr->ee.ee_origin == SO_EE_ORIGIN_TIMESTAMPING) { - _leave("UDP empty message"); - rcu_read_unlock(); - rxrpc_free_skb(skb, rxrpc_skb_freed); - return; - } - peer = rxrpc_lookup_peer_icmp_rcu(local, skb, &srx); - if (peer && !rxrpc_get_peer_maybe(peer)) - peer = NULL; - if (!peer) { - rcu_read_unlock(); - rxrpc_free_skb(skb, rxrpc_skb_freed); - _leave(" [no peer]"); - return; - } - - trace_rxrpc_rx_icmp(peer, &serr->ee, &srx); - - if ((serr->ee.ee_origin == SO_EE_ORIGIN_ICMP && - serr->ee.ee_type == ICMP_DEST_UNREACH && - serr->ee.ee_code == ICMP_FRAG_NEEDED)) { - rxrpc_adjust_mtu(peer, serr); - rcu_read_unlock(); - rxrpc_free_skb(skb, rxrpc_skb_freed); - rxrpc_put_peer(peer); - _leave(" [MTU update]"); - return; + if (serr->ee.ee_origin == SO_EE_ORIGIN_LOCAL) { + peer = rxrpc_lookup_peer_local_rcu(local, skb, &srx); + if (peer && !rxrpc_get_peer_maybe(peer)) + peer = NULL; + if (peer) { + trace_rxrpc_rx_icmp(peer, &serr->ee, &srx); + rxrpc_store_error(peer, serr); + } } - rxrpc_store_error(peer, serr); rcu_read_unlock(); rxrpc_free_skb(skb, rxrpc_skb_freed); rxrpc_put_peer(peer); - _leave(""); } -- cgit v1.2.3 From 0d40f728e28393a8817d1fcae923dfa3409e488c Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 24 Aug 2022 22:39:28 +0100 Subject: rxrpc: Fix an insufficiently large sglist in rxkad_verify_packet_2() rxkad_verify_packet_2() has a small stack-allocated sglist of 4 elements, but if that isn't sufficient for the number of fragments in the socket buffer, we try to allocate an sglist large enough to hold all the fragments. However, for large packets with a lot of fragments, this isn't sufficient and we need at least one additional fragment. The problem manifests as skb_to_sgvec() returning -EMSGSIZE and this then getting returned by userspace. Most of the time, this isn't a problem as rxrpc sets a limit of 5692, big enough for 4 jumbo subpackets to be glued together; occasionally, however, the server will ignore the reported limit and give a packet that's a lot bigger - say 19852 bytes with ->nr_frags being 7. skb_to_sgvec() then tries to return a "zeroth" fragment that seems to occur before the fragments counted by ->nr_frags and we hit the end of the sglist too early. Note that __skb_to_sgvec() also has an skb_walk_frags() loop that is recursive up to 24 deep. I'm not sure if I need to take account of that too - or if there's an easy way of counting those frags too. Fix this by counting an extra frag and allocating a larger sglist based on that. Fixes: d0d5c0cd1e71 ("rxrpc: Use skb_unshare() rather than skb_cow_data()") Reported-by: Marc Dionne Signed-off-by: David Howells cc: linux-afs@lists.infradead.org --- net/rxrpc/rxkad.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/rxrpc/rxkad.c b/net/rxrpc/rxkad.c index 258917a714c8..78fa0524156f 100644 --- a/net/rxrpc/rxkad.c +++ b/net/rxrpc/rxkad.c @@ -540,7 +540,7 @@ static int rxkad_verify_packet_2(struct rxrpc_call *call, struct sk_buff *skb, * directly into the target buffer. */ sg = _sg; - nsg = skb_shinfo(skb)->nr_frags; + nsg = skb_shinfo(skb)->nr_frags + 1; if (nsg <= 4) { nsg = 4; } else { -- cgit v1.2.3 From d3d863036d688313f8d566b87acd7d99daf82749 Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 6 May 2022 23:55:21 +0100 Subject: rxrpc: Fix local destruction being repeated If the local processor work item for the rxrpc local endpoint gets requeued by an event (such as an incoming packet) between it getting scheduled for destruction and the UDP socket being closed, the rxrpc_local_destroyer() function can get run twice. The second time it can hang because it can end up waiting for cleanup events that will never happen. Signed-off-by: David Howells --- net/rxrpc/local_object.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/net/rxrpc/local_object.c b/net/rxrpc/local_object.c index 79bb02eb67b2..38ea98ff426b 100644 --- a/net/rxrpc/local_object.c +++ b/net/rxrpc/local_object.c @@ -406,6 +406,9 @@ static void rxrpc_local_processor(struct work_struct *work) container_of(work, struct rxrpc_local, processor); bool again; + if (local->dead) + return; + trace_rxrpc_local(local->debug_id, rxrpc_local_processing, refcount_read(&local->ref), NULL); -- cgit v1.2.3 From 214a9dc7d852216e83acac7b75bc18f01ce184c2 Mon Sep 17 00:00:00 2001 From: David Howells Date: Tue, 5 Apr 2022 13:34:09 +0100 Subject: rxrpc: Fix calc of resend age Fix the calculation of the resend age to add a microsecond value as microseconds, not nanoseconds. Signed-off-by: David Howells --- net/rxrpc/call_event.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/rxrpc/call_event.c b/net/rxrpc/call_event.c index f8ecad2b730e..2a93e7b5fbd0 100644 --- a/net/rxrpc/call_event.c +++ b/net/rxrpc/call_event.c @@ -166,7 +166,7 @@ static void rxrpc_resend(struct rxrpc_call *call, unsigned long now_j) _enter("{%d,%d}", call->tx_hard_ack, call->tx_top); now = ktime_get_real(); - max_age = ktime_sub(now, jiffies_to_usecs(call->peer->rto_j)); + max_age = ktime_sub_us(now, jiffies_to_usecs(call->peer->rto_j)); spin_lock_bh(&call->lock); -- cgit v1.2.3 From 7903192c4b4a82d792cb0dc5e2779a2efe60d45b Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 31 Aug 2022 13:16:42 +0100 Subject: afs: Use the operation issue time instead of the reply time for callbacks rxrpc and kafs between them try to use the receive timestamp on the first data packet (ie. the one with sequence number 1) as a base from which to calculate the time at which callback promise and lock expiration occurs. However, we don't know how long it took for the server to send us the reply from it having completed the basic part of the operation - it might then, for instance, have to send a bunch of a callback breaks, depending on the particular operation. Fix this by using the time at which the operation is issued on the client as a base instead. That should never be longer than the server's idea of the expiry time. Fixes: 781070551c26 ("afs: Fix calculation of callback expiry time") Fixes: 2070a3e44962 ("rxrpc: Allow the reply time to be obtained on a client call") Suggested-by: Jeffrey E Altman Signed-off-by: David Howells --- fs/afs/flock.c | 2 +- fs/afs/fsclient.c | 2 +- fs/afs/internal.h | 3 +-- fs/afs/rxrpc.c | 7 +------ fs/afs/yfsclient.c | 3 +-- 5 files changed, 5 insertions(+), 12 deletions(-) diff --git a/fs/afs/flock.c b/fs/afs/flock.c index c4210a3964d8..bbcc5afd1576 100644 --- a/fs/afs/flock.c +++ b/fs/afs/flock.c @@ -76,7 +76,7 @@ void afs_lock_op_done(struct afs_call *call) if (call->error == 0) { spin_lock(&vnode->lock); trace_afs_flock_ev(vnode, NULL, afs_flock_timestamp, 0); - vnode->locked_at = call->reply_time; + vnode->locked_at = call->issue_time; afs_schedule_lock_extension(vnode); spin_unlock(&vnode->lock); } diff --git a/fs/afs/fsclient.c b/fs/afs/fsclient.c index 4943413d9c5f..7d37f63ef0f0 100644 --- a/fs/afs/fsclient.c +++ b/fs/afs/fsclient.c @@ -131,7 +131,7 @@ bad: static time64_t xdr_decode_expiry(struct afs_call *call, u32 expiry) { - return ktime_divns(call->reply_time, NSEC_PER_SEC) + expiry; + return ktime_divns(call->issue_time, NSEC_PER_SEC) + expiry; } static void xdr_decode_AFSCallBack(const __be32 **_bp, diff --git a/fs/afs/internal.h b/fs/afs/internal.h index 64ad55494349..723d162078a3 100644 --- a/fs/afs/internal.h +++ b/fs/afs/internal.h @@ -137,7 +137,6 @@ struct afs_call { bool need_attention; /* T if RxRPC poked us */ bool async; /* T if asynchronous */ bool upgrade; /* T to request service upgrade */ - bool have_reply_time; /* T if have got reply_time */ bool intr; /* T if interruptible */ bool unmarshalling_error; /* T if an unmarshalling error occurred */ u16 service_id; /* Actual service ID (after upgrade) */ @@ -151,7 +150,7 @@ struct afs_call { } __attribute__((packed)); __be64 tmp64; }; - ktime_t reply_time; /* Time of first reply packet */ + ktime_t issue_time; /* Time of issue of operation */ }; struct afs_call_type { diff --git a/fs/afs/rxrpc.c b/fs/afs/rxrpc.c index d5c4785c862d..eccc3cd0cb70 100644 --- a/fs/afs/rxrpc.c +++ b/fs/afs/rxrpc.c @@ -351,6 +351,7 @@ void afs_make_call(struct afs_addr_cursor *ac, struct afs_call *call, gfp_t gfp) if (call->max_lifespan) rxrpc_kernel_set_max_life(call->net->socket, rxcall, call->max_lifespan); + call->issue_time = ktime_get_real(); /* send the request */ iov[0].iov_base = call->request; @@ -501,12 +502,6 @@ static void afs_deliver_to_call(struct afs_call *call) return; } - if (!call->have_reply_time && - rxrpc_kernel_get_reply_time(call->net->socket, - call->rxcall, - &call->reply_time)) - call->have_reply_time = true; - ret = call->type->deliver(call); state = READ_ONCE(call->state); if (ret == 0 && call->unmarshalling_error) diff --git a/fs/afs/yfsclient.c b/fs/afs/yfsclient.c index fdc7d675b4b0..11571cca86c1 100644 --- a/fs/afs/yfsclient.c +++ b/fs/afs/yfsclient.c @@ -232,8 +232,7 @@ static void xdr_decode_YFSCallBack(const __be32 **_bp, struct afs_callback *cb = &scb->callback; ktime_t cb_expiry; - cb_expiry = call->reply_time; - cb_expiry = ktime_add(cb_expiry, xdr_to_u64(x->expiration_time) * 100); + cb_expiry = ktime_add(call->issue_time, xdr_to_u64(x->expiration_time) * 100); cb->expires_at = ktime_divns(cb_expiry, NSEC_PER_SEC); scb->have_cb = true; *_bp += xdr_size(x); -- cgit v1.2.3 From 21457f4a91cb522f1a3ad9741ff1d25fadfaa3c5 Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 31 Aug 2022 13:24:44 +0100 Subject: rxrpc: Remove rxrpc_get_reply_time() which is no longer used Remove rxrpc_get_reply_time() as that is no longer used now that the call issue time is used instead of the reply time. Signed-off-by: David Howells --- Documentation/networking/rxrpc.rst | 11 ---------- include/net/af_rxrpc.h | 2 -- net/rxrpc/recvmsg.c | 43 -------------------------------------- 3 files changed, 56 deletions(-) diff --git a/Documentation/networking/rxrpc.rst b/Documentation/networking/rxrpc.rst index 39c2249c7aa7..39494a6ea739 100644 --- a/Documentation/networking/rxrpc.rst +++ b/Documentation/networking/rxrpc.rst @@ -1055,17 +1055,6 @@ The kernel interface functions are as follows: first function to change. Note that this must be called in TASK_RUNNING state. - (#) Get reply timestamp:: - - bool rxrpc_kernel_get_reply_time(struct socket *sock, - struct rxrpc_call *call, - ktime_t *_ts) - - This allows the timestamp on the first DATA packet of the reply of a - client call to be queried, provided that it is still in the Rx ring. If - successful, the timestamp will be stored into ``*_ts`` and true will be - returned; false will be returned otherwise. - (#) Get remote client epoch:: u32 rxrpc_kernel_get_epoch(struct socket *sock, diff --git a/include/net/af_rxrpc.h b/include/net/af_rxrpc.h index cee5f83c0f11..b69ca695935c 100644 --- a/include/net/af_rxrpc.h +++ b/include/net/af_rxrpc.h @@ -66,8 +66,6 @@ int rxrpc_kernel_charge_accept(struct socket *, rxrpc_notify_rx_t, void rxrpc_kernel_set_tx_length(struct socket *, struct rxrpc_call *, s64); bool rxrpc_kernel_check_life(const struct socket *, const struct rxrpc_call *); u32 rxrpc_kernel_get_epoch(struct socket *, struct rxrpc_call *); -bool rxrpc_kernel_get_reply_time(struct socket *, struct rxrpc_call *, - ktime_t *); bool rxrpc_kernel_call_is_complete(struct rxrpc_call *); void rxrpc_kernel_set_max_life(struct socket *, struct rxrpc_call *, unsigned long); diff --git a/net/rxrpc/recvmsg.c b/net/rxrpc/recvmsg.c index 250f23bc1c07..7e39c262fd79 100644 --- a/net/rxrpc/recvmsg.c +++ b/net/rxrpc/recvmsg.c @@ -771,46 +771,3 @@ call_complete: goto out; } EXPORT_SYMBOL(rxrpc_kernel_recv_data); - -/** - * rxrpc_kernel_get_reply_time - Get timestamp on first reply packet - * @sock: The socket that the call exists on - * @call: The call to query - * @_ts: Where to put the timestamp - * - * Retrieve the timestamp from the first DATA packet of the reply if it is - * in the ring. Returns true if successful, false if not. - */ -bool rxrpc_kernel_get_reply_time(struct socket *sock, struct rxrpc_call *call, - ktime_t *_ts) -{ - struct sk_buff *skb; - rxrpc_seq_t hard_ack, top, seq; - bool success = false; - - mutex_lock(&call->user_mutex); - - if (READ_ONCE(call->state) != RXRPC_CALL_CLIENT_RECV_REPLY) - goto out; - - hard_ack = call->rx_hard_ack; - if (hard_ack != 0) - goto out; - - seq = hard_ack + 1; - top = smp_load_acquire(&call->rx_top); - if (after(seq, top)) - goto out; - - skb = call->rxtx_buffer[seq & RXRPC_RXTX_BUFF_MASK]; - if (!skb) - goto out; - - *_ts = skb_get_ktime(skb); - success = true; - -out: - mutex_unlock(&call->user_mutex); - return success; -} -EXPORT_SYMBOL(rxrpc_kernel_get_reply_time); -- cgit v1.2.3 From 7fdc77665f3d45c9da7c6edd4beadee9790f43aa Mon Sep 17 00:00:00 2001 From: Heiner Kallweit Date: Wed, 31 Aug 2022 21:20:49 +0200 Subject: Revert "net: phy: meson-gxl: improve link-up behavior" This reverts commit 2c87c6f9fbddc5b84d67b2fa3f432fcac6d99d93. Meanwhile it turned out that the following commit is the proper workaround for the issue that 2c87c6f9fbdd tries to address. a3a57bf07de2 ("net: stmmac: work around sporadic tx issue on link-up") It's nor clear why the to be reverted commit helped for one user, for others it didn't make a difference. Fixes: 2c87c6f9fbdd ("net: phy: meson-gxl: improve link-up behavior") Signed-off-by: Heiner Kallweit Link: https://lore.kernel.org/r/8deeeddc-6b71-129b-1918-495a12dc11e3@gmail.com Signed-off-by: Jakub Kicinski --- drivers/net/phy/meson-gxl.c | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/drivers/net/phy/meson-gxl.c b/drivers/net/phy/meson-gxl.c index 73f7962a37d3..c49062ad72c6 100644 --- a/drivers/net/phy/meson-gxl.c +++ b/drivers/net/phy/meson-gxl.c @@ -243,13 +243,7 @@ static irqreturn_t meson_gxl_handle_interrupt(struct phy_device *phydev) irq_status == INTSRC_ENERGY_DETECT) return IRQ_HANDLED; - /* Give PHY some time before MAC starts sending data. This works - * around an issue where network doesn't come up properly. - */ - if (!(irq_status & INTSRC_LINK_DOWN)) - phy_queue_state_machine(phydev, msecs_to_jiffies(100)); - else - phy_trigger_machine(phydev); + phy_trigger_machine(phydev); return IRQ_HANDLED; } -- cgit v1.2.3 From 9efd23297cca530bb35e1848665805d3fcdd7889 Mon Sep 17 00:00:00 2001 From: Toke Høiland-Jørgensen Date: Wed, 31 Aug 2022 23:52:18 +0200 Subject: sch_sfb: Don't assume the skb is still around after enqueueing to child MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The sch_sfb enqueue() routine assumes the skb is still alive after it has been enqueued into a child qdisc, using the data in the skb cb field in the increment_qlen() routine after enqueue. However, the skb may in fact have been freed, causing a use-after-free in this case. In particular, this happens if sch_cake is used as a child of sfb, and the GSO splitting mode of CAKE is enabled (in which case the skb will be split into segments and the original skb freed). Fix this by copying the sfb cb data to the stack before enqueueing the skb, and using this stack copy in increment_qlen() instead of the skb pointer itself. Reported-by: zdi-disclosures@trendmicro.com # ZDI-CAN-18231 Fixes: e13e02a3c68d ("net_sched: SFB flow scheduler") Signed-off-by: Toke Høiland-Jørgensen Signed-off-by: David S. Miller --- net/sched/sch_sfb.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/net/sched/sch_sfb.c b/net/sched/sch_sfb.c index 3d061a13d7ed..0d761f454ae8 100644 --- a/net/sched/sch_sfb.c +++ b/net/sched/sch_sfb.c @@ -135,15 +135,15 @@ static void increment_one_qlen(u32 sfbhash, u32 slot, struct sfb_sched_data *q) } } -static void increment_qlen(const struct sk_buff *skb, struct sfb_sched_data *q) +static void increment_qlen(const struct sfb_skb_cb *cb, struct sfb_sched_data *q) { u32 sfbhash; - sfbhash = sfb_hash(skb, 0); + sfbhash = cb->hashes[0]; if (sfbhash) increment_one_qlen(sfbhash, 0, q); - sfbhash = sfb_hash(skb, 1); + sfbhash = cb->hashes[1]; if (sfbhash) increment_one_qlen(sfbhash, 1, q); } @@ -283,6 +283,7 @@ static int sfb_enqueue(struct sk_buff *skb, struct Qdisc *sch, struct sfb_sched_data *q = qdisc_priv(sch); struct Qdisc *child = q->qdisc; struct tcf_proto *fl; + struct sfb_skb_cb cb; int i; u32 p_min = ~0; u32 minqlen = ~0; @@ -399,11 +400,12 @@ static int sfb_enqueue(struct sk_buff *skb, struct Qdisc *sch, } enqueue: + memcpy(&cb, sfb_skb_cb(skb), sizeof(cb)); ret = qdisc_enqueue(skb, child, to_free); if (likely(ret == NET_XMIT_SUCCESS)) { qdisc_qstats_backlog_inc(sch, skb); sch->q.qlen++; - increment_qlen(skb, q); + increment_qlen(&cb, q); } else if (net_xmit_drop_count(ret)) { q->stats.childdrop++; qdisc_qstats_drop(sch); -- cgit v1.2.3 From e2b224abd9bf45dcb55750479fc35970725a430b Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Wed, 31 Aug 2022 17:47:56 +0300 Subject: tipc: fix shift wrapping bug in map_get() There is a shift wrapping bug in this code so anything thing above 31 will return false. Fixes: 35c55c9877f8 ("tipc: add neighbor monitoring framework") Signed-off-by: Dan Carpenter Signed-off-by: David S. Miller --- net/tipc/monitor.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/tipc/monitor.c b/net/tipc/monitor.c index 2f4d23238a7e..9618e4429f0f 100644 --- a/net/tipc/monitor.c +++ b/net/tipc/monitor.c @@ -160,7 +160,7 @@ static void map_set(u64 *up_map, int i, unsigned int v) static int map_get(u64 up_map, int i) { - return (up_map & (1 << i)) >> i; + return (up_map & (1ULL << i)) >> i; } static struct tipc_peer *peer_prev(struct tipc_peer *peer) -- cgit v1.2.3 From 3261400639463a853ba2b3be8bd009c2a8089775 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 31 Aug 2022 23:38:09 +0000 Subject: tcp: TX zerocopy should not sense pfmemalloc status We got a recent syzbot report [1] showing a possible misuse of pfmemalloc page status in TCP zerocopy paths. Indeed, for pages coming from user space or other layers, using page_is_pfmemalloc() is moot, and possibly could give false positives. There has been attempts to make page_is_pfmemalloc() more robust, but not using it in the first place in this context is probably better, removing cpu cycles. Note to stable teams : You need to backport 84ce071e38a6 ("net: introduce __skb_fill_page_desc_noacc") as a prereq. Race is more probable after commit c07aea3ef4d4 ("mm: add a signature in struct page") because page_is_pfmemalloc() is now using low order bit from page->lru.next, which can change more often than page->index. Low order bit should never be set for lru.next (when used as an anchor in LRU list), so KCSAN report is mostly a false positive. Backporting to older kernel versions seems not necessary. [1] BUG: KCSAN: data-race in lru_add_fn / tcp_build_frag write to 0xffffea0004a1d2c8 of 8 bytes by task 18600 on cpu 0: __list_add include/linux/list.h:73 [inline] list_add include/linux/list.h:88 [inline] lruvec_add_folio include/linux/mm_inline.h:105 [inline] lru_add_fn+0x440/0x520 mm/swap.c:228 folio_batch_move_lru+0x1e1/0x2a0 mm/swap.c:246 folio_batch_add_and_move mm/swap.c:263 [inline] folio_add_lru+0xf1/0x140 mm/swap.c:490 filemap_add_folio+0xf8/0x150 mm/filemap.c:948 __filemap_get_folio+0x510/0x6d0 mm/filemap.c:1981 pagecache_get_page+0x26/0x190 mm/folio-compat.c:104 grab_cache_page_write_begin+0x2a/0x30 mm/folio-compat.c:116 ext4_da_write_begin+0x2dd/0x5f0 fs/ext4/inode.c:2988 generic_perform_write+0x1d4/0x3f0 mm/filemap.c:3738 ext4_buffered_write_iter+0x235/0x3e0 fs/ext4/file.c:270 ext4_file_write_iter+0x2e3/0x1210 call_write_iter include/linux/fs.h:2187 [inline] new_sync_write fs/read_write.c:491 [inline] vfs_write+0x468/0x760 fs/read_write.c:578 ksys_write+0xe8/0x1a0 fs/read_write.c:631 __do_sys_write fs/read_write.c:643 [inline] __se_sys_write fs/read_write.c:640 [inline] __x64_sys_write+0x3e/0x50 fs/read_write.c:640 do_syscall_x64 arch/x86/entry/common.c:50 [inline] do_syscall_64+0x2b/0x70 arch/x86/entry/common.c:80 entry_SYSCALL_64_after_hwframe+0x63/0xcd read to 0xffffea0004a1d2c8 of 8 bytes by task 18611 on cpu 1: page_is_pfmemalloc include/linux/mm.h:1740 [inline] __skb_fill_page_desc include/linux/skbuff.h:2422 [inline] skb_fill_page_desc include/linux/skbuff.h:2443 [inline] tcp_build_frag+0x613/0xb20 net/ipv4/tcp.c:1018 do_tcp_sendpages+0x3e8/0xaf0 net/ipv4/tcp.c:1075 tcp_sendpage_locked net/ipv4/tcp.c:1140 [inline] tcp_sendpage+0x89/0xb0 net/ipv4/tcp.c:1150 inet_sendpage+0x7f/0xc0 net/ipv4/af_inet.c:833 kernel_sendpage+0x184/0x300 net/socket.c:3561 sock_sendpage+0x5a/0x70 net/socket.c:1054 pipe_to_sendpage+0x128/0x160 fs/splice.c:361 splice_from_pipe_feed fs/splice.c:415 [inline] __splice_from_pipe+0x222/0x4d0 fs/splice.c:559 splice_from_pipe fs/splice.c:594 [inline] generic_splice_sendpage+0x89/0xc0 fs/splice.c:743 do_splice_from fs/splice.c:764 [inline] direct_splice_actor+0x80/0xa0 fs/splice.c:931 splice_direct_to_actor+0x305/0x620 fs/splice.c:886 do_splice_direct+0xfb/0x180 fs/splice.c:974 do_sendfile+0x3bf/0x910 fs/read_write.c:1249 __do_sys_sendfile64 fs/read_write.c:1317 [inline] __se_sys_sendfile64 fs/read_write.c:1303 [inline] __x64_sys_sendfile64+0x10c/0x150 fs/read_write.c:1303 do_syscall_x64 arch/x86/entry/common.c:50 [inline] do_syscall_64+0x2b/0x70 arch/x86/entry/common.c:80 entry_SYSCALL_64_after_hwframe+0x63/0xcd value changed: 0x0000000000000000 -> 0xffffea0004a1d288 Reported by Kernel Concurrency Sanitizer on: CPU: 1 PID: 18611 Comm: syz-executor.4 Not tainted 6.0.0-rc2-syzkaller-00248-ge022620b5d05-dirty #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 07/22/2022 Fixes: c07aea3ef4d4 ("mm: add a signature in struct page") Reported-by: syzbot Signed-off-by: Eric Dumazet Cc: Shakeel Butt Reviewed-by: Shakeel Butt Signed-off-by: David S. Miller --- include/linux/skbuff.h | 21 +++++++++++++++++++++ net/core/datagram.c | 2 +- net/ipv4/tcp.c | 2 +- 3 files changed, 23 insertions(+), 2 deletions(-) diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index ca8afa382bf2..18e163a3460d 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -2444,6 +2444,27 @@ static inline void skb_fill_page_desc(struct sk_buff *skb, int i, skb_shinfo(skb)->nr_frags = i + 1; } +/** + * skb_fill_page_desc_noacc - initialise a paged fragment in an skb + * @skb: buffer containing fragment to be initialised + * @i: paged fragment index to initialise + * @page: the page to use for this fragment + * @off: the offset to the data with @page + * @size: the length of the data + * + * Variant of skb_fill_page_desc() which does not deal with + * pfmemalloc, if page is not owned by us. + */ +static inline void skb_fill_page_desc_noacc(struct sk_buff *skb, int i, + struct page *page, int off, + int size) +{ + struct skb_shared_info *shinfo = skb_shinfo(skb); + + __skb_fill_page_desc_noacc(shinfo, i, page, off, size); + shinfo->nr_frags = i + 1; +} + void skb_add_rx_frag(struct sk_buff *skb, int i, struct page *page, int off, int size, unsigned int truesize); diff --git a/net/core/datagram.c b/net/core/datagram.c index 7255531f63ae..e4ff2db40c98 100644 --- a/net/core/datagram.c +++ b/net/core/datagram.c @@ -677,7 +677,7 @@ int __zerocopy_sg_from_iter(struct msghdr *msg, struct sock *sk, page_ref_sub(last_head, refs); refs = 0; } - skb_fill_page_desc(skb, frag++, head, start, size); + skb_fill_page_desc_noacc(skb, frag++, head, start, size); } if (refs) page_ref_sub(last_head, refs); diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index e5011c136fdb..6cdfce6f2867 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -1015,7 +1015,7 @@ new_segment: skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy); } else { get_page(page); - skb_fill_page_desc(skb, i, page, offset, copy); + skb_fill_page_desc_noacc(skb, i, page, offset, copy); } if (!(flags & MSG_NO_SHARED_FRAGS)) -- cgit v1.2.3 From 7e753eb675f0523207b184558638ee2eed6c9ac2 Mon Sep 17 00:00:00 2001 From: Przemyslaw Patynowski Date: Thu, 11 Aug 2022 12:09:22 +0200 Subject: ice: Fix DMA mappings leak Fix leak, when user changes ring parameters. During reallocation of RX buffers, new DMA mappings are created for those buffers. New buffers with different RX ring count should substitute older ones, but those buffers were freed in ice_vsi_cfg_rxq and reallocated again with ice_alloc_rx_buf. kfree on rx_buf caused leak of already mapped DMA. Reallocate ZC with xdp_buf struct, when BPF program loads. Reallocate back to rx_buf, when BPF program unloads. If BPF program is loaded/unloaded and XSK pools are created, reallocate RX queues accordingly in XDP_SETUP_XSK_POOL handler. Steps for reproduction: while : do for ((i=0; i<=8160; i=i+32)) do ethtool -G enp130s0f0 rx $i tx $i sleep 0.5 ethtool -g enp130s0f0 done done Fixes: 617f3e1b588c ("ice: xsk: allocate separate memory for XDP SW ring") Signed-off-by: Przemyslaw Patynowski Signed-off-by: Mateusz Palczewski Tested-by: Chandan (A Contingent Worker at Intel) Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/ice/ice_base.c | 17 --------- drivers/net/ethernet/intel/ice/ice_main.c | 8 ++++ drivers/net/ethernet/intel/ice/ice_xsk.c | 63 +++++++++++++++++++++++++++++++ drivers/net/ethernet/intel/ice/ice_xsk.h | 8 ++++ 4 files changed, 79 insertions(+), 17 deletions(-) diff --git a/drivers/net/ethernet/intel/ice/ice_base.c b/drivers/net/ethernet/intel/ice/ice_base.c index 136d7911adb4..1e3243808178 100644 --- a/drivers/net/ethernet/intel/ice/ice_base.c +++ b/drivers/net/ethernet/intel/ice/ice_base.c @@ -7,18 +7,6 @@ #include "ice_dcb_lib.h" #include "ice_sriov.h" -static bool ice_alloc_rx_buf_zc(struct ice_rx_ring *rx_ring) -{ - rx_ring->xdp_buf = kcalloc(rx_ring->count, sizeof(*rx_ring->xdp_buf), GFP_KERNEL); - return !!rx_ring->xdp_buf; -} - -static bool ice_alloc_rx_buf(struct ice_rx_ring *rx_ring) -{ - rx_ring->rx_buf = kcalloc(rx_ring->count, sizeof(*rx_ring->rx_buf), GFP_KERNEL); - return !!rx_ring->rx_buf; -} - /** * __ice_vsi_get_qs_contig - Assign a contiguous chunk of queues to VSI * @qs_cfg: gathered variables needed for PF->VSI queues assignment @@ -519,11 +507,8 @@ int ice_vsi_cfg_rxq(struct ice_rx_ring *ring) xdp_rxq_info_reg(&ring->xdp_rxq, ring->netdev, ring->q_index, ring->q_vector->napi.napi_id); - kfree(ring->rx_buf); ring->xsk_pool = ice_xsk_pool(ring); if (ring->xsk_pool) { - if (!ice_alloc_rx_buf_zc(ring)) - return -ENOMEM; xdp_rxq_info_unreg_mem_model(&ring->xdp_rxq); ring->rx_buf_len = @@ -538,8 +523,6 @@ int ice_vsi_cfg_rxq(struct ice_rx_ring *ring) dev_info(dev, "Registered XDP mem model MEM_TYPE_XSK_BUFF_POOL on Rx ring %d\n", ring->q_index); } else { - if (!ice_alloc_rx_buf(ring)) - return -ENOMEM; if (!xdp_rxq_info_is_reg(&ring->xdp_rxq)) /* coverity[check_return] */ xdp_rxq_info_reg(&ring->xdp_rxq, diff --git a/drivers/net/ethernet/intel/ice/ice_main.c b/drivers/net/ethernet/intel/ice/ice_main.c index 173fe6c31341..e5bc69a9a37c 100644 --- a/drivers/net/ethernet/intel/ice/ice_main.c +++ b/drivers/net/ethernet/intel/ice/ice_main.c @@ -2898,10 +2898,18 @@ ice_xdp_setup_prog(struct ice_vsi *vsi, struct bpf_prog *prog, if (xdp_ring_err) NL_SET_ERR_MSG_MOD(extack, "Setting up XDP Tx resources failed"); } + /* reallocate Rx queues that are used for zero-copy */ + xdp_ring_err = ice_realloc_zc_buf(vsi, true); + if (xdp_ring_err) + NL_SET_ERR_MSG_MOD(extack, "Setting up XDP Rx resources failed"); } else if (ice_is_xdp_ena_vsi(vsi) && !prog) { xdp_ring_err = ice_destroy_xdp_rings(vsi); if (xdp_ring_err) NL_SET_ERR_MSG_MOD(extack, "Freeing XDP Tx resources failed"); + /* reallocate Rx queues that were used for zero-copy */ + xdp_ring_err = ice_realloc_zc_buf(vsi, false); + if (xdp_ring_err) + NL_SET_ERR_MSG_MOD(extack, "Freeing XDP Rx resources failed"); } else { /* safe to call even when prog == vsi->xdp_prog as * dev_xdp_install in net/core/dev.c incremented prog's diff --git a/drivers/net/ethernet/intel/ice/ice_xsk.c b/drivers/net/ethernet/intel/ice/ice_xsk.c index e48e29258450..03ce85f6e6df 100644 --- a/drivers/net/ethernet/intel/ice/ice_xsk.c +++ b/drivers/net/ethernet/intel/ice/ice_xsk.c @@ -192,6 +192,7 @@ static int ice_qp_dis(struct ice_vsi *vsi, u16 q_idx) err = ice_vsi_ctrl_one_rx_ring(vsi, false, q_idx, true); if (err) return err; + ice_clean_rx_ring(rx_ring); ice_qvec_toggle_napi(vsi, q_vector, false); ice_qp_clean_rings(vsi, q_idx); @@ -316,6 +317,62 @@ ice_xsk_pool_enable(struct ice_vsi *vsi, struct xsk_buff_pool *pool, u16 qid) return 0; } +/** + * ice_realloc_rx_xdp_bufs - reallocate for either XSK or normal buffer + * @rx_ring: Rx ring + * @pool_present: is pool for XSK present + * + * Try allocating memory and return ENOMEM, if failed to allocate. + * If allocation was successful, substitute buffer with allocated one. + * Returns 0 on success, negative on failure + */ +static int +ice_realloc_rx_xdp_bufs(struct ice_rx_ring *rx_ring, bool pool_present) +{ + size_t elem_size = pool_present ? sizeof(*rx_ring->xdp_buf) : + sizeof(*rx_ring->rx_buf); + void *sw_ring = kcalloc(rx_ring->count, elem_size, GFP_KERNEL); + + if (!sw_ring) + return -ENOMEM; + + if (pool_present) { + kfree(rx_ring->rx_buf); + rx_ring->rx_buf = NULL; + rx_ring->xdp_buf = sw_ring; + } else { + kfree(rx_ring->xdp_buf); + rx_ring->xdp_buf = NULL; + rx_ring->rx_buf = sw_ring; + } + + return 0; +} + +/** + * ice_realloc_zc_buf - reallocate XDP ZC queue pairs + * @vsi: Current VSI + * @zc: is zero copy set + * + * Reallocate buffer for rx_rings that might be used by XSK. + * XDP requires more memory, than rx_buf provides. + * Returns 0 on success, negative on failure + */ +int ice_realloc_zc_buf(struct ice_vsi *vsi, bool zc) +{ + struct ice_rx_ring *rx_ring; + unsigned long q; + + for_each_set_bit(q, vsi->af_xdp_zc_qps, + max_t(int, vsi->alloc_txq, vsi->alloc_rxq)) { + rx_ring = vsi->rx_rings[q]; + if (ice_realloc_rx_xdp_bufs(rx_ring, zc)) + return -ENOMEM; + } + + return 0; +} + /** * ice_xsk_pool_setup - enable/disable a buffer pool region depending on its state * @vsi: Current VSI @@ -345,11 +402,17 @@ int ice_xsk_pool_setup(struct ice_vsi *vsi, struct xsk_buff_pool *pool, u16 qid) if_running = netif_running(vsi->netdev) && ice_is_xdp_ena_vsi(vsi); if (if_running) { + struct ice_rx_ring *rx_ring = vsi->rx_rings[qid]; + ret = ice_qp_dis(vsi, qid); if (ret) { netdev_err(vsi->netdev, "ice_qp_dis error = %d\n", ret); goto xsk_pool_if_up; } + + ret = ice_realloc_rx_xdp_bufs(rx_ring, pool_present); + if (ret) + goto xsk_pool_if_up; } pool_failure = pool_present ? ice_xsk_pool_enable(vsi, pool, qid) : diff --git a/drivers/net/ethernet/intel/ice/ice_xsk.h b/drivers/net/ethernet/intel/ice/ice_xsk.h index 21faec8e97db..4edbe81eb646 100644 --- a/drivers/net/ethernet/intel/ice/ice_xsk.h +++ b/drivers/net/ethernet/intel/ice/ice_xsk.h @@ -27,6 +27,7 @@ bool ice_xsk_any_rx_ring_ena(struct ice_vsi *vsi); void ice_xsk_clean_rx_ring(struct ice_rx_ring *rx_ring); void ice_xsk_clean_xdp_ring(struct ice_tx_ring *xdp_ring); bool ice_xmit_zc(struct ice_tx_ring *xdp_ring, u32 budget, int napi_budget); +int ice_realloc_zc_buf(struct ice_vsi *vsi, bool zc); #else static inline bool ice_xmit_zc(struct ice_tx_ring __always_unused *xdp_ring, @@ -72,5 +73,12 @@ ice_xsk_wakeup(struct net_device __always_unused *netdev, static inline void ice_xsk_clean_rx_ring(struct ice_rx_ring *rx_ring) { } static inline void ice_xsk_clean_xdp_ring(struct ice_tx_ring *xdp_ring) { } + +static inline int +ice_realloc_zc_buf(struct ice_vsi __always_unused *vsi, + bool __always_unused zc) +{ + return 0; +} #endif /* CONFIG_XDP_SOCKETS */ #endif /* !_ICE_XSK_H_ */ -- cgit v1.2.3 From 59ac325557b6c14f1f793b90d3946bc145ffa085 Mon Sep 17 00:00:00 2001 From: Michal Swiatkowski Date: Wed, 17 Aug 2022 10:53:20 +0200 Subject: ice: use bitmap_free instead of devm_kfree pf->avail_txqs was allocated using bitmap_zalloc, bitmap_free should be used to free this memory. Fixes: 78b5713ac1241 ("ice: Alloc queue management bitmaps and arrays dynamically") Signed-off-by: Michal Swiatkowski Tested-by: Gurucharan (A Contingent worker at Intel) Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/ice/ice_main.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/intel/ice/ice_main.c b/drivers/net/ethernet/intel/ice/ice_main.c index e5bc69a9a37c..8c30eea61b6d 100644 --- a/drivers/net/ethernet/intel/ice/ice_main.c +++ b/drivers/net/ethernet/intel/ice/ice_main.c @@ -3913,7 +3913,7 @@ static int ice_init_pf(struct ice_pf *pf) pf->avail_rxqs = bitmap_zalloc(pf->max_pf_rxqs, GFP_KERNEL); if (!pf->avail_rxqs) { - devm_kfree(ice_pf_to_dev(pf), pf->avail_txqs); + bitmap_free(pf->avail_txqs); pf->avail_txqs = NULL; return -ENOMEM; } -- cgit v1.2.3 From 45bb006d3c924b1201ed43c87a96b437662dcaa8 Mon Sep 17 00:00:00 2001 From: Przemyslaw Patynowski Date: Tue, 9 Aug 2022 10:57:44 +0200 Subject: i40e: Fix ADQ rate limiting for PF Fix HW rate limiting for ADQ. Fallback to kernel queue selection for ADQ, as it is network stack that decides which queue to use for transmit with ADQ configured. Reset PF after creation of VMDq2 VSIs required for ADQ, as to reprogram TX queue contexts in i40e_configure_tx_ring. Without this patch PF would limit TX rate only according to TC0. Fixes: a9ce82f744dc ("i40e: Enable 'channel' mode in mqprio for TC configs") Signed-off-by: Przemyslaw Patynowski Signed-off-by: Jan Sokolowski Tested-by: Bharathi Sreenivas Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/i40e/i40e_main.c | 3 +++ drivers/net/ethernet/intel/i40e/i40e_txrx.c | 3 ++- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c b/drivers/net/ethernet/intel/i40e/i40e_main.c index 9f1d5de7bf16..10c1e1ea83a1 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_main.c +++ b/drivers/net/ethernet/intel/i40e/i40e_main.c @@ -6659,6 +6659,9 @@ static int i40e_configure_queue_channels(struct i40e_vsi *vsi) vsi->tc_seid_map[i] = ch->seid; } } + + /* reset to reconfigure TX queue contexts */ + i40e_do_reset(vsi->back, I40E_PF_RESET_FLAG, true); return ret; err_free: diff --git a/drivers/net/ethernet/intel/i40e/i40e_txrx.c b/drivers/net/ethernet/intel/i40e/i40e_txrx.c index d4226161a3ef..69e67eb6aea7 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_txrx.c +++ b/drivers/net/ethernet/intel/i40e/i40e_txrx.c @@ -3688,7 +3688,8 @@ u16 i40e_lan_select_queue(struct net_device *netdev, u8 prio; /* is DCB enabled at all? */ - if (vsi->tc_config.numtc == 1) + if (vsi->tc_config.numtc == 1 || + i40e_is_tc_mqprio_enabled(vsi->back)) return netdev_pick_tx(netdev, skb, sb_dev); prio = skb->priority; -- cgit v1.2.3 From fb8396aeda5872369a8ed6d2301e2c86e303c520 Mon Sep 17 00:00:00 2001 From: Ivan Vecera Date: Tue, 16 Aug 2022 18:22:30 +0200 Subject: i40e: Fix kernel crash during module removal The driver incorrectly frees client instance and subsequent i40e module removal leads to kernel crash. Reproducer: 1. Do ethtool offline test followed immediately by another one host# ethtool -t eth0 offline; ethtool -t eth0 offline 2. Remove recursively irdma module that also removes i40e module host# modprobe -r irdma Result: [ 8675.035651] i40e 0000:3d:00.0 eno1: offline testing starting [ 8675.193774] i40e 0000:3d:00.0 eno1: testing finished [ 8675.201316] i40e 0000:3d:00.0 eno1: offline testing starting [ 8675.358921] i40e 0000:3d:00.0 eno1: testing finished [ 8675.496921] i40e 0000:3d:00.0: IRDMA hardware initialization FAILED init_state=2 status=-110 [ 8686.188955] i40e 0000:3d:00.1: i40e_ptp_stop: removed PHC on eno2 [ 8686.943890] i40e 0000:3d:00.1: Deleted LAN device PF1 bus=0x3d dev=0x00 func=0x01 [ 8686.952669] i40e 0000:3d:00.0: i40e_ptp_stop: removed PHC on eno1 [ 8687.761787] BUG: kernel NULL pointer dereference, address: 0000000000000030 [ 8687.768755] #PF: supervisor read access in kernel mode [ 8687.773895] #PF: error_code(0x0000) - not-present page [ 8687.779034] PGD 0 P4D 0 [ 8687.781575] Oops: 0000 [#1] PREEMPT SMP NOPTI [ 8687.785935] CPU: 51 PID: 172891 Comm: rmmod Kdump: loaded Tainted: G W I 5.19.0+ #2 [ 8687.794800] Hardware name: Intel Corporation S2600WFD/S2600WFD, BIOS SE5C620.86B.0X.02.0001.051420190324 05/14/2019 [ 8687.805222] RIP: 0010:i40e_lan_del_device+0x13/0xb0 [i40e] [ 8687.810719] Code: d4 84 c0 0f 84 b8 25 01 00 e9 9c 25 01 00 41 bc f4 ff ff ff eb 91 90 0f 1f 44 00 00 41 54 55 53 48 8b 87 58 08 00 00 48 89 fb <48> 8b 68 30 48 89 ef e8 21 8a 0f d5 48 89 ef e8 a9 78 0f d5 48 8b [ 8687.829462] RSP: 0018:ffffa604072efce0 EFLAGS: 00010202 [ 8687.834689] RAX: 0000000000000000 RBX: ffff8f43833b2000 RCX: 0000000000000000 [ 8687.841821] RDX: 0000000000000000 RSI: ffff8f4b0545b298 RDI: ffff8f43833b2000 [ 8687.848955] RBP: ffff8f43833b2000 R08: 0000000000000001 R09: 0000000000000000 [ 8687.856086] R10: 0000000000000000 R11: 000ffffffffff000 R12: ffff8f43833b2ef0 [ 8687.863218] R13: ffff8f43833b2ef0 R14: ffff915103966000 R15: ffff8f43833b2008 [ 8687.870342] FS: 00007f79501c3740(0000) GS:ffff8f4adffc0000(0000) knlGS:0000000000000000 [ 8687.878427] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 8687.884174] CR2: 0000000000000030 CR3: 000000014276e004 CR4: 00000000007706e0 [ 8687.891306] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [ 8687.898441] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 [ 8687.905572] PKRU: 55555554 [ 8687.908286] Call Trace: [ 8687.910737] [ 8687.912843] i40e_remove+0x2c0/0x330 [i40e] [ 8687.917040] pci_device_remove+0x33/0xa0 [ 8687.920962] device_release_driver_internal+0x1aa/0x230 [ 8687.926188] driver_detach+0x44/0x90 [ 8687.929770] bus_remove_driver+0x55/0xe0 [ 8687.933693] pci_unregister_driver+0x2a/0xb0 [ 8687.937967] i40e_exit_module+0xc/0xf48 [i40e] Two offline tests cause IRDMA driver failure (ETIMEDOUT) and this failure is indicated back to i40e_client_subtask() that calls i40e_client_del_instance() to free client instance referenced by pf->cinst and sets this pointer to NULL. During the module removal i40e_remove() calls i40e_lan_del_device() that dereferences pf->cinst that is NULL -> crash. Do not remove client instance when client open callbacks fails and just clear __I40E_CLIENT_INSTANCE_OPENED bit. The driver also needs to take care about this situation (when netdev is up and client is NOT opened) in i40e_notify_client_of_netdev_close() and calls client close callback only when __I40E_CLIENT_INSTANCE_OPENED is set. Fixes: 0ef2d5afb12d ("i40e: KISS the client interface") Signed-off-by: Ivan Vecera Tested-by: Helena Anna Dubel Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/i40e/i40e_client.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/intel/i40e/i40e_client.c b/drivers/net/ethernet/intel/i40e/i40e_client.c index ea2bb0140a6e..10d7a982a5b9 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_client.c +++ b/drivers/net/ethernet/intel/i40e/i40e_client.c @@ -177,6 +177,10 @@ void i40e_notify_client_of_netdev_close(struct i40e_vsi *vsi, bool reset) "Cannot locate client instance close routine\n"); return; } + if (!test_bit(__I40E_CLIENT_INSTANCE_OPENED, &cdev->state)) { + dev_dbg(&pf->pdev->dev, "Client is not open, abort close\n"); + return; + } cdev->client->ops->close(&cdev->lan_info, cdev->client, reset); clear_bit(__I40E_CLIENT_INSTANCE_OPENED, &cdev->state); i40e_client_release_qvlist(&cdev->lan_info); @@ -429,7 +433,6 @@ void i40e_client_subtask(struct i40e_pf *pf) /* Remove failed client instance */ clear_bit(__I40E_CLIENT_INSTANCE_OPENED, &cdev->state); - i40e_client_del_instance(pf); return; } } -- cgit v1.2.3 From aa626da947e9cd30c4cf727493903e1adbb2c0a0 Mon Sep 17 00:00:00 2001 From: Ivan Vecera Date: Tue, 30 Aug 2022 10:16:27 +0200 Subject: iavf: Detach device during reset task iavf_reset_task() takes crit_lock at the beginning and holds it during whole call. The function subsequently calls iavf_init_interrupt_scheme() that grabs RTNL. Problem occurs when userspace initiates during the reset task any ndo callback that runs under RTNL like iavf_open() because some of that functions tries to take crit_lock. This leads to classic A-B B-A deadlock scenario. To resolve this situation the device should be detached in iavf_reset_task() prior taking crit_lock to avoid subsequent ndos running under RTNL and reattach the device at the end. Fixes: 62fe2a865e6d ("i40evf: add missing rtnl_lock() around i40evf_set_interrupt_capability") Cc: Jacob Keller Cc: Patryk Piotrowski Cc: SlawomirX Laba Tested-by: Vitaly Grinberg Signed-off-by: Ivan Vecera Tested-by: Konrad Jankowski Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/iavf/iavf_main.c | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/intel/iavf/iavf_main.c b/drivers/net/ethernet/intel/iavf/iavf_main.c index f39440ad5c50..10aa99dfdcdb 100644 --- a/drivers/net/ethernet/intel/iavf/iavf_main.c +++ b/drivers/net/ethernet/intel/iavf/iavf_main.c @@ -2877,6 +2877,11 @@ static void iavf_reset_task(struct work_struct *work) int i = 0, err; bool running; + /* Detach interface to avoid subsequent NDO callbacks */ + rtnl_lock(); + netif_device_detach(netdev); + rtnl_unlock(); + /* When device is being removed it doesn't make sense to run the reset * task, just return in such a case. */ @@ -2884,7 +2889,7 @@ static void iavf_reset_task(struct work_struct *work) if (adapter->state != __IAVF_REMOVE) queue_work(iavf_wq, &adapter->reset_task); - return; + goto reset_finish; } while (!mutex_trylock(&adapter->client_lock)) @@ -2954,7 +2959,6 @@ continue_reset: if (running) { netif_carrier_off(netdev); - netif_tx_stop_all_queues(netdev); adapter->link_up = false; iavf_napi_disable_all(adapter); } @@ -3084,7 +3088,7 @@ continue_reset: mutex_unlock(&adapter->client_lock); mutex_unlock(&adapter->crit_lock); - return; + goto reset_finish; reset_err: if (running) { set_bit(__IAVF_VSI_DOWN, adapter->vsi.state); @@ -3095,6 +3099,10 @@ reset_err: mutex_unlock(&adapter->client_lock); mutex_unlock(&adapter->crit_lock); dev_err(&adapter->pdev->dev, "failed to allocate resources during reinit\n"); +reset_finish: + rtnl_lock(); + netif_device_attach(netdev); + rtnl_unlock(); } /** -- cgit v1.2.3 From be318363daa2939453b4d80981de3e9c28b66135 Mon Sep 17 00:00:00 2001 From: Luiz Augusto von Dentz Date: Thu, 1 Sep 2022 17:24:13 -0700 Subject: Bluetooth: hci_sync: Fix hci_read_buffer_size_sync hci_read_buffer_size_sync shall not use HCI_OP_LE_READ_BUFFER_SIZE_V2 sinze that is LE specific, instead it is hci_le_read_buffer_size_sync version that shall use it. Link: https://bugzilla.kernel.org/show_bug.cgi?id=216382 Fixes: 26afbd826ee3 ("Bluetooth: Add initial implementation of CIS connections") Signed-off-by: Luiz Augusto von Dentz --- net/bluetooth/hci_sync.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/net/bluetooth/hci_sync.c b/net/bluetooth/hci_sync.c index 187786454d98..fbd5613eebfc 100644 --- a/net/bluetooth/hci_sync.c +++ b/net/bluetooth/hci_sync.c @@ -3018,12 +3018,6 @@ static const struct hci_init_stage amp_init2[] = { /* Read Buffer Size (ACL mtu, max pkt, etc.) */ static int hci_read_buffer_size_sync(struct hci_dev *hdev) { - /* Use Read LE Buffer Size V2 if supported */ - if (hdev->commands[41] & 0x20) - return __hci_cmd_sync_status(hdev, - HCI_OP_LE_READ_BUFFER_SIZE_V2, - 0, NULL, HCI_CMD_TIMEOUT); - return __hci_cmd_sync_status(hdev, HCI_OP_READ_BUFFER_SIZE, 0, NULL, HCI_CMD_TIMEOUT); } @@ -3237,6 +3231,12 @@ static const struct hci_init_stage hci_init2[] = { /* Read LE Buffer Size */ static int hci_le_read_buffer_size_sync(struct hci_dev *hdev) { + /* Use Read LE Buffer Size V2 if supported */ + if (hdev->commands[41] & 0x20) + return __hci_cmd_sync_status(hdev, + HCI_OP_LE_READ_BUFFER_SIZE_V2, + 0, NULL, HCI_CMD_TIMEOUT); + return __hci_cmd_sync_status(hdev, HCI_OP_LE_READ_BUFFER_SIZE, 0, NULL, HCI_CMD_TIMEOUT); } -- cgit v1.2.3 From 7d650df99d528f674cc744719a00a20be1f912f8 Mon Sep 17 00:00:00 2001 From: Wei Fang Date: Tue, 30 Aug 2022 15:01:48 +0800 Subject: net: fec: add pm_qos support on imx6q platform There is a very low probability that tx timeout will occur during suspend and resume stress test on imx6q platform. So we add pm_qos support to prevent system from entering low level idles which may affect the transmission of tx. Signed-off-by: Wei Fang Link: https://lore.kernel.org/r/20220830070148.2021947-1-wei.fang@nxp.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/freescale/fec.h | 5 +++++ drivers/net/ethernet/freescale/fec_main.c | 9 ++++++++- 2 files changed, 13 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/freescale/fec.h b/drivers/net/ethernet/freescale/fec.h index 0cebe4b63adb..20369f177863 100644 --- a/drivers/net/ethernet/freescale/fec.h +++ b/drivers/net/ethernet/freescale/fec.h @@ -16,6 +16,7 @@ #include #include +#include #include #include @@ -498,6 +499,9 @@ struct bufdesc_ex { /* i.MX8MQ SoC integration mix wakeup interrupt signal into "int2" interrupt line. */ #define FEC_QUIRK_WAKEUP_FROM_INT2 (1 << 22) +/* i.MX6Q adds pm_qos support */ +#define FEC_QUIRK_HAS_PMQOS BIT(23) + struct bufdesc_prop { int qid; /* Address of Rx and Tx buffers */ @@ -608,6 +612,7 @@ struct fec_enet_private { struct delayed_work time_keep; struct regulator *reg_phy; struct fec_stop_mode_gpr stop_gpr; + struct pm_qos_request pm_qos_req; unsigned int tx_align; unsigned int rx_align; diff --git a/drivers/net/ethernet/freescale/fec_main.c b/drivers/net/ethernet/freescale/fec_main.c index b0d60f898249..111aaef33080 100644 --- a/drivers/net/ethernet/freescale/fec_main.c +++ b/drivers/net/ethernet/freescale/fec_main.c @@ -111,7 +111,8 @@ static const struct fec_devinfo fec_imx6q_info = { .quirks = FEC_QUIRK_ENET_MAC | FEC_QUIRK_HAS_GBIT | FEC_QUIRK_HAS_BUFDESC_EX | FEC_QUIRK_HAS_CSUM | FEC_QUIRK_HAS_VLAN | FEC_QUIRK_ERR006358 | - FEC_QUIRK_HAS_RACC | FEC_QUIRK_CLEAR_SETUP_MII, + FEC_QUIRK_HAS_RACC | FEC_QUIRK_CLEAR_SETUP_MII | + FEC_QUIRK_HAS_PMQOS, }; static const struct fec_devinfo fec_mvf600_info = { @@ -3244,6 +3245,9 @@ fec_enet_open(struct net_device *ndev) if (fep->quirks & FEC_QUIRK_ERR006687) imx6q_cpuidle_fec_irqs_used(); + if (fep->quirks & FEC_QUIRK_HAS_PMQOS) + cpu_latency_qos_add_request(&fep->pm_qos_req, 0); + napi_enable(&fep->napi); phy_start(ndev->phydev); netif_tx_start_all_queues(ndev); @@ -3285,6 +3289,9 @@ fec_enet_close(struct net_device *ndev) fec_enet_update_ethtool_stats(ndev); fec_enet_clk_enable(ndev, false); + if (fep->quirks & FEC_QUIRK_HAS_PMQOS) + cpu_latency_qos_remove_request(&fep->pm_qos_req); + pinctrl_pm_select_sleep_state(&fep->pdev->dev); pm_runtime_mark_last_busy(&fep->pdev->dev); pm_runtime_put_autosuspend(&fep->pdev->dev); -- cgit v1.2.3 From b353b241f1eb9b6265358ffbe2632fdcb563354f Mon Sep 17 00:00:00 2001 From: Csókás Bence Date: Thu, 1 Sep 2022 16:04:03 +0200 Subject: net: fec: Use a spinlock to guard `fep->ptp_clk_on` MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Mutexes cannot be taken in a non-preemptible context, causing a panic in `fec_ptp_save_state()`. Replacing `ptp_clk_mutex` by `tmreg_lock` fixes this. Fixes: 6a4d7234ae9a ("net: fec: ptp: avoid register access when ipg clock is disabled") Fixes: f79959220fa5 ("fec: Restart PPS after link state change") Reported-by: Marc Kleine-Budde Link: https://lore.kernel.org/all/20220827160922.642zlcd5foopozru@pengutronix.de/ Signed-off-by: Csókás Bence Tested-by: Francesco Dolcini # Toradex Apalis iMX6 Link: https://lore.kernel.org/r/20220901140402.64804-1-csokas.bence@prolan.hu Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/freescale/fec.h | 1 - drivers/net/ethernet/freescale/fec_main.c | 17 +++++++++-------- drivers/net/ethernet/freescale/fec_ptp.c | 28 ++++++++++------------------ 3 files changed, 19 insertions(+), 27 deletions(-) diff --git a/drivers/net/ethernet/freescale/fec.h b/drivers/net/ethernet/freescale/fec.h index 20369f177863..d77ee8936c6a 100644 --- a/drivers/net/ethernet/freescale/fec.h +++ b/drivers/net/ethernet/freescale/fec.h @@ -561,7 +561,6 @@ struct fec_enet_private { struct clk *clk_2x_txclk; bool ptp_clk_on; - struct mutex ptp_clk_mutex; unsigned int num_tx_queues; unsigned int num_rx_queues; diff --git a/drivers/net/ethernet/freescale/fec_main.c b/drivers/net/ethernet/freescale/fec_main.c index 111aaef33080..6152f6dbf1bc 100644 --- a/drivers/net/ethernet/freescale/fec_main.c +++ b/drivers/net/ethernet/freescale/fec_main.c @@ -2029,6 +2029,7 @@ static void fec_enet_phy_reset_after_clk_enable(struct net_device *ndev) static int fec_enet_clk_enable(struct net_device *ndev, bool enable) { struct fec_enet_private *fep = netdev_priv(ndev); + unsigned long flags; int ret; if (enable) { @@ -2037,15 +2038,15 @@ static int fec_enet_clk_enable(struct net_device *ndev, bool enable) return ret; if (fep->clk_ptp) { - mutex_lock(&fep->ptp_clk_mutex); + spin_lock_irqsave(&fep->tmreg_lock, flags); ret = clk_prepare_enable(fep->clk_ptp); if (ret) { - mutex_unlock(&fep->ptp_clk_mutex); + spin_unlock_irqrestore(&fep->tmreg_lock, flags); goto failed_clk_ptp; } else { fep->ptp_clk_on = true; } - mutex_unlock(&fep->ptp_clk_mutex); + spin_unlock_irqrestore(&fep->tmreg_lock, flags); } ret = clk_prepare_enable(fep->clk_ref); @@ -2060,10 +2061,10 @@ static int fec_enet_clk_enable(struct net_device *ndev, bool enable) } else { clk_disable_unprepare(fep->clk_enet_out); if (fep->clk_ptp) { - mutex_lock(&fep->ptp_clk_mutex); + spin_lock_irqsave(&fep->tmreg_lock, flags); clk_disable_unprepare(fep->clk_ptp); fep->ptp_clk_on = false; - mutex_unlock(&fep->ptp_clk_mutex); + spin_unlock_irqrestore(&fep->tmreg_lock, flags); } clk_disable_unprepare(fep->clk_ref); clk_disable_unprepare(fep->clk_2x_txclk); @@ -2076,10 +2077,10 @@ failed_clk_2x_txclk: clk_disable_unprepare(fep->clk_ref); failed_clk_ref: if (fep->clk_ptp) { - mutex_lock(&fep->ptp_clk_mutex); + spin_lock_irqsave(&fep->tmreg_lock, flags); clk_disable_unprepare(fep->clk_ptp); fep->ptp_clk_on = false; - mutex_unlock(&fep->ptp_clk_mutex); + spin_unlock_irqrestore(&fep->tmreg_lock, flags); } failed_clk_ptp: clk_disable_unprepare(fep->clk_enet_out); @@ -3914,7 +3915,7 @@ fec_probe(struct platform_device *pdev) } fep->ptp_clk_on = false; - mutex_init(&fep->ptp_clk_mutex); + spin_lock_init(&fep->tmreg_lock); /* clk_ref is optional, depends on board */ fep->clk_ref = devm_clk_get_optional(&pdev->dev, "enet_clk_ref"); diff --git a/drivers/net/ethernet/freescale/fec_ptp.c b/drivers/net/ethernet/freescale/fec_ptp.c index c74d04f4b2fd..8dd5a2615a89 100644 --- a/drivers/net/ethernet/freescale/fec_ptp.c +++ b/drivers/net/ethernet/freescale/fec_ptp.c @@ -365,21 +365,19 @@ static int fec_ptp_adjtime(struct ptp_clock_info *ptp, s64 delta) */ static int fec_ptp_gettime(struct ptp_clock_info *ptp, struct timespec64 *ts) { - struct fec_enet_private *adapter = + struct fec_enet_private *fep = container_of(ptp, struct fec_enet_private, ptp_caps); u64 ns; unsigned long flags; - mutex_lock(&adapter->ptp_clk_mutex); + spin_lock_irqsave(&fep->tmreg_lock, flags); /* Check the ptp clock */ - if (!adapter->ptp_clk_on) { - mutex_unlock(&adapter->ptp_clk_mutex); + if (!fep->ptp_clk_on) { + spin_unlock_irqrestore(&fep->tmreg_lock, flags); return -EINVAL; } - spin_lock_irqsave(&adapter->tmreg_lock, flags); - ns = timecounter_read(&adapter->tc); - spin_unlock_irqrestore(&adapter->tmreg_lock, flags); - mutex_unlock(&adapter->ptp_clk_mutex); + ns = timecounter_read(&fep->tc); + spin_unlock_irqrestore(&fep->tmreg_lock, flags); *ts = ns_to_timespec64(ns); @@ -404,10 +402,10 @@ static int fec_ptp_settime(struct ptp_clock_info *ptp, unsigned long flags; u32 counter; - mutex_lock(&fep->ptp_clk_mutex); + spin_lock_irqsave(&fep->tmreg_lock, flags); /* Check the ptp clock */ if (!fep->ptp_clk_on) { - mutex_unlock(&fep->ptp_clk_mutex); + spin_unlock_irqrestore(&fep->tmreg_lock, flags); return -EINVAL; } @@ -417,11 +415,9 @@ static int fec_ptp_settime(struct ptp_clock_info *ptp, */ counter = ns & fep->cc.mask; - spin_lock_irqsave(&fep->tmreg_lock, flags); writel(counter, fep->hwp + FEC_ATIME); timecounter_init(&fep->tc, &fep->cc, ns); spin_unlock_irqrestore(&fep->tmreg_lock, flags); - mutex_unlock(&fep->ptp_clk_mutex); return 0; } @@ -518,13 +514,11 @@ static void fec_time_keep(struct work_struct *work) struct fec_enet_private *fep = container_of(dwork, struct fec_enet_private, time_keep); unsigned long flags; - mutex_lock(&fep->ptp_clk_mutex); + spin_lock_irqsave(&fep->tmreg_lock, flags); if (fep->ptp_clk_on) { - spin_lock_irqsave(&fep->tmreg_lock, flags); timecounter_read(&fep->tc); - spin_unlock_irqrestore(&fep->tmreg_lock, flags); } - mutex_unlock(&fep->ptp_clk_mutex); + spin_unlock_irqrestore(&fep->tmreg_lock, flags); schedule_delayed_work(&fep->time_keep, HZ); } @@ -599,8 +593,6 @@ void fec_ptp_init(struct platform_device *pdev, int irq_idx) } fep->ptp_inc = NSEC_PER_SEC / fep->cycle_speed; - spin_lock_init(&fep->tmreg_lock); - fec_ptp_start_cyclecounter(ndev); INIT_DELAYED_WORK(&fep->time_keep, fec_time_keep); -- cgit v1.2.3 From c55f34b6aec2a8cb47eadaffea773e83bf85de91 Mon Sep 17 00:00:00 2001 From: Paul Durrant Date: Thu, 1 Sep 2022 12:55:54 +0100 Subject: xen-netback: only remove 'hotplug-status' when the vif is actually destroyed Removing 'hotplug-status' in backend_disconnected() means that it will be removed even in the case that the frontend unilaterally disconnects (which it is free to do at any time). The consequence of this is that, when the frontend attempts to re-connect, the backend gets stuck in 'InitWait' rather than moving straight to 'Connected' (which it can do because the hotplug script has already run). Instead, the 'hotplug-status' mode should be removed in netback_remove() i.e. when the vif really is going away. Fixes: 0f4558ae9187 ("Revert "xen-netback: remove 'hotplug-status' once it has served its purpose"") Signed-off-by: Paul Durrant Signed-off-by: David S. Miller --- drivers/net/xen-netback/xenbus.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/xen-netback/xenbus.c b/drivers/net/xen-netback/xenbus.c index 990360d75cb6..e85b3c5d4acc 100644 --- a/drivers/net/xen-netback/xenbus.c +++ b/drivers/net/xen-netback/xenbus.c @@ -256,7 +256,6 @@ static void backend_disconnect(struct backend_info *be) unsigned int queue_index; xen_unregister_watchers(vif); - xenbus_rm(XBT_NIL, be->dev->nodename, "hotplug-status"); #ifdef CONFIG_DEBUG_FS xenvif_debugfs_delif(vif); #endif /* CONFIG_DEBUG_FS */ @@ -984,6 +983,7 @@ static int netback_remove(struct xenbus_device *dev) struct backend_info *be = dev_get_drvdata(&dev->dev); unregister_hotplug_status_watch(be); + xenbus_rm(XBT_NIL, dev->nodename, "hotplug-status"); if (be->vif) { kobject_uevent(&dev->dev.kobj, KOBJ_OFFLINE); backend_disconnect(be); -- cgit v1.2.3 From 3015c50384743eb14d20c907a400e10225da033b Mon Sep 17 00:00:00 2001 From: Oleksij Rempel Date: Fri, 2 Sep 2022 11:27:37 +0200 Subject: net: dsa: microchip: fix kernel oops on ksz8 switches After driver refactoring we was running ksz9477 specific CPU port configuration on ksz8 family which ended with kernel oops. So, make sure we run this code only on ksz9477 compatible devices. Tested on KSZ8873 and KSZ9477. Fixes: da8cd08520f3 ("net: dsa: microchip: add support for common phylink mac link up") Signed-off-by: Oleksij Rempel Signed-off-by: David S. Miller --- drivers/net/dsa/microchip/ksz_common.c | 30 ++++++++++++++++++++++++------ 1 file changed, 24 insertions(+), 6 deletions(-) diff --git a/drivers/net/dsa/microchip/ksz_common.c b/drivers/net/dsa/microchip/ksz_common.c index 6bd69a7e6809..872aba63e7d4 100644 --- a/drivers/net/dsa/microchip/ksz_common.c +++ b/drivers/net/dsa/microchip/ksz_common.c @@ -170,6 +170,13 @@ static const struct ksz_dev_ops ksz8_dev_ops = { .exit = ksz8_switch_exit, }; +static void ksz9477_phylink_mac_link_up(struct ksz_device *dev, int port, + unsigned int mode, + phy_interface_t interface, + struct phy_device *phydev, int speed, + int duplex, bool tx_pause, + bool rx_pause); + static const struct ksz_dev_ops ksz9477_dev_ops = { .setup = ksz9477_setup, .get_port_addr = ksz9477_get_port_addr, @@ -196,6 +203,7 @@ static const struct ksz_dev_ops ksz9477_dev_ops = { .mdb_del = ksz9477_mdb_del, .change_mtu = ksz9477_change_mtu, .max_mtu = ksz9477_max_mtu, + .phylink_mac_link_up = ksz9477_phylink_mac_link_up, .config_cpu_port = ksz9477_config_cpu_port, .enable_stp_addr = ksz9477_enable_stp_addr, .reset = ksz9477_reset_switch, @@ -230,6 +238,7 @@ static const struct ksz_dev_ops lan937x_dev_ops = { .mdb_del = ksz9477_mdb_del, .change_mtu = lan937x_change_mtu, .max_mtu = ksz9477_max_mtu, + .phylink_mac_link_up = ksz9477_phylink_mac_link_up, .config_cpu_port = lan937x_config_cpu_port, .enable_stp_addr = ksz9477_enable_stp_addr, .reset = lan937x_reset_switch, @@ -1656,13 +1665,13 @@ static void ksz_duplex_flowctrl(struct ksz_device *dev, int port, int duplex, ksz_prmw8(dev, port, regs[P_XMII_CTRL_0], mask, val); } -static void ksz_phylink_mac_link_up(struct dsa_switch *ds, int port, - unsigned int mode, - phy_interface_t interface, - struct phy_device *phydev, int speed, - int duplex, bool tx_pause, bool rx_pause) +static void ksz9477_phylink_mac_link_up(struct ksz_device *dev, int port, + unsigned int mode, + phy_interface_t interface, + struct phy_device *phydev, int speed, + int duplex, bool tx_pause, + bool rx_pause) { - struct ksz_device *dev = ds->priv; struct ksz_port *p; p = &dev->ports[port]; @@ -1676,6 +1685,15 @@ static void ksz_phylink_mac_link_up(struct dsa_switch *ds, int port, ksz_port_set_xmii_speed(dev, port, speed); ksz_duplex_flowctrl(dev, port, duplex, tx_pause, rx_pause); +} + +static void ksz_phylink_mac_link_up(struct dsa_switch *ds, int port, + unsigned int mode, + phy_interface_t interface, + struct phy_device *phydev, int speed, + int duplex, bool tx_pause, bool rx_pause) +{ + struct ksz_device *dev = ds->priv; if (dev->dev_ops->phylink_mac_link_up) dev->dev_ops->phylink_mac_link_up(dev, port, mode, interface, -- cgit v1.2.3 From 4a86c5462616e0d690ad3c94dc84c3b5f1ea5631 Mon Sep 17 00:00:00 2001 From: Mukesh Sisodiya Date: Fri, 2 Sep 2022 16:11:31 +0200 Subject: wifi: mac80211: fix link warning in RX agg timer expiry The rx data link pointer isn't set from the RX aggregation timer, resulting in a later warning. Fix that by setting it to the first valid link for now, with a FIXME to worry about statistics later, it's not very important since it's just the timeout case. Reported-by: Hans de Goede Link: https://lore.kernel.org/r/498d714c-76be-9d04-26db-a1206878de5e@redhat.com Fixes: 56057da4569b ("wifi: mac80211: rx: track link in RX data") Signed-off-by: Mukesh Sisodiya Signed-off-by: Johannes Berg --- net/mac80211/rx.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c index 57df21e2170a..45d7e71661e3 100644 --- a/net/mac80211/rx.c +++ b/net/mac80211/rx.c @@ -4074,6 +4074,7 @@ void ieee80211_release_reorder_timeout(struct sta_info *sta, int tid) .link_id = -1, }; struct tid_ampdu_rx *tid_agg_rx; + u8 link_id; tid_agg_rx = rcu_dereference(sta->ampdu_mlme.tid_rx[tid]); if (!tid_agg_rx) @@ -4093,6 +4094,9 @@ void ieee80211_release_reorder_timeout(struct sta_info *sta, int tid) }; drv_event_callback(rx.local, rx.sdata, &event); } + /* FIXME: statistics won't be right with this */ + link_id = sta->sta.valid_links ? ffs(sta->sta.valid_links) - 1 : 0; + rx.link = rcu_dereference(sta->sdata->link[link_id]); ieee80211_rx_handlers(&rx, &frames); } -- cgit v1.2.3 From 7a2c6d1616be5d49c0dae2c876af3fe20e71a111 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Fri, 2 Sep 2022 16:11:15 +0200 Subject: wifi: mac80211: mlme: release deflink channel in error case In the prep_channel error case we didn't release the deflink channel leaving it to be left around. Fix that. Change-Id: If0dfd748125ec46a31fc6045a480dc28e03723d2 Signed-off-by: Johannes Berg --- net/mac80211/mlme.c | 1 + 1 file changed, 1 insertion(+) diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c index 3d4ab711f0d1..4c40f0427e88 100644 --- a/net/mac80211/mlme.c +++ b/net/mac80211/mlme.c @@ -6509,6 +6509,7 @@ static int ieee80211_prep_connection(struct ieee80211_sub_if_data *sdata, return 0; out_err: + ieee80211_link_release_channel(&sdata->deflink); ieee80211_vif_set_links(sdata, 0); return err; } -- cgit v1.2.3 From 69371801f929ff4b3c846a45b9d49db631897cd9 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Fri, 2 Sep 2022 16:11:14 +0200 Subject: wifi: mac80211: fix locking in auth/assoc timeout If we hit an authentication or association timeout, we only release the chanctx for the deflink, and the other link(s) are released later by ieee80211_vif_set_links(), but we're not locking this correctly. Fix the locking here while releasing the channels and links. Change-Id: I9e08c1a5434592bdc75253c1abfa6c788f9f39b1 Fixes: 81151ce462e5 ("wifi: mac80211: support MLO authentication/association with one link") Signed-off-by: Johannes Berg --- net/mac80211/mlme.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c index 4c40f0427e88..5265d2b6db12 100644 --- a/net/mac80211/mlme.c +++ b/net/mac80211/mlme.c @@ -3420,11 +3420,11 @@ static void ieee80211_destroy_auth_data(struct ieee80211_sub_if_data *sdata, ieee80211_link_info_change_notify(sdata, &sdata->deflink, BSS_CHANGED_BSSID); sdata->u.mgd.flags = 0; + mutex_lock(&sdata->local->mtx); ieee80211_link_release_channel(&sdata->deflink); - mutex_unlock(&sdata->local->mtx); - ieee80211_vif_set_links(sdata, 0); + mutex_unlock(&sdata->local->mtx); } cfg80211_put_bss(sdata->local->hw.wiphy, auth_data->bss); @@ -3462,10 +3462,6 @@ static void ieee80211_destroy_assoc_data(struct ieee80211_sub_if_data *sdata, sdata->u.mgd.flags = 0; sdata->vif.bss_conf.mu_mimo_owner = false; - mutex_lock(&sdata->local->mtx); - ieee80211_link_release_channel(&sdata->deflink); - mutex_unlock(&sdata->local->mtx); - if (status != ASSOC_REJECTED) { struct cfg80211_assoc_failure data = { .timeout = status == ASSOC_TIMEOUT, @@ -3484,7 +3480,10 @@ static void ieee80211_destroy_assoc_data(struct ieee80211_sub_if_data *sdata, cfg80211_assoc_failure(sdata->dev, &data); } + mutex_lock(&sdata->local->mtx); + ieee80211_link_release_channel(&sdata->deflink); ieee80211_vif_set_links(sdata, 0); + mutex_unlock(&sdata->local->mtx); } kfree(assoc_data); -- cgit v1.2.3 From 8c0427842aaef161a38ac83b7e8d8fe050b4be04 Mon Sep 17 00:00:00 2001 From: Soenke Huster Date: Fri, 2 Sep 2022 10:19:58 +0200 Subject: wifi: mac80211_hwsim: check length for virtio packets An invalid packet with a length shorter than the specified length in the netlink header can lead to use-after-frees and slab-out-of-bounds in the processing of the netlink attributes, such as the following: BUG: KASAN: slab-out-of-bounds in __nla_validate_parse+0x1258/0x2010 Read of size 2 at addr ffff88800ac7952c by task kworker/0:1/12 Workqueue: events hwsim_virtio_rx_work Call Trace: dump_stack_lvl+0x45/0x5d print_report.cold+0x5e/0x5e5 kasan_report+0xb1/0x1c0 __nla_validate_parse+0x1258/0x2010 __nla_parse+0x22/0x30 hwsim_virtio_handle_cmd.isra.0+0x13f/0x2d0 hwsim_virtio_rx_work+0x1b2/0x370 process_one_work+0x8df/0x1530 worker_thread+0x575/0x11a0 kthread+0x29d/0x340 ret_from_fork+0x22/0x30 Discarding packets with an invalid length solves this. Therefore, skb->len must be set at reception. Change-Id: Ieaeb9a4c62d3beede274881a7c2722c6c6f477b6 Signed-off-by: Soenke Huster Signed-off-by: Johannes Berg --- drivers/net/wireless/mac80211_hwsim.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/drivers/net/wireless/mac80211_hwsim.c b/drivers/net/wireless/mac80211_hwsim.c index 6e55f153ff26..1f301a5fb396 100644 --- a/drivers/net/wireless/mac80211_hwsim.c +++ b/drivers/net/wireless/mac80211_hwsim.c @@ -5060,6 +5060,10 @@ static int hwsim_virtio_handle_cmd(struct sk_buff *skb) nlh = nlmsg_hdr(skb); gnlh = nlmsg_data(nlh); + + if (skb->len < nlh->nlmsg_len) + return -EINVAL; + err = genlmsg_parse(nlh, &hwsim_genl_family, tb, HWSIM_ATTR_MAX, hwsim_genl_policy, NULL); if (err) { @@ -5102,7 +5106,8 @@ static void hwsim_virtio_rx_work(struct work_struct *work) spin_unlock_irqrestore(&hwsim_virtio_lock, flags); skb->data = skb->head; - skb_set_tail_pointer(skb, len); + skb_reset_tail_pointer(skb); + skb_put(skb, len); hwsim_virtio_handle_cmd(skb); spin_lock_irqsave(&hwsim_virtio_lock, flags); -- cgit v1.2.3 From 2aec909912da55a6e469fd6ee8412080a5433ed2 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Mon, 29 Aug 2022 11:46:38 +0200 Subject: wifi: use struct_group to copy addresses We sometimes copy all the addresses from the 802.11 header for the AAD, which may cause complaints from fortify checks. Use struct_group() to avoid the compiler warnings/errors. Change-Id: Ic3ea389105e7813b22095b295079eecdabde5045 Signed-off-by: Johannes Berg --- include/linux/ieee80211.h | 8 +++++--- net/mac80211/wpa.c | 4 ++-- net/wireless/lib80211_crypt_ccmp.c | 2 +- 3 files changed, 8 insertions(+), 6 deletions(-) diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h index 55e6f4ad0ca6..b6e6d5b40774 100644 --- a/include/linux/ieee80211.h +++ b/include/linux/ieee80211.h @@ -310,9 +310,11 @@ static inline u16 ieee80211_sn_sub(u16 sn1, u16 sn2) struct ieee80211_hdr { __le16 frame_control; __le16 duration_id; - u8 addr1[ETH_ALEN]; - u8 addr2[ETH_ALEN]; - u8 addr3[ETH_ALEN]; + struct_group(addrs, + u8 addr1[ETH_ALEN]; + u8 addr2[ETH_ALEN]; + u8 addr3[ETH_ALEN]; + ); __le16 seq_ctrl; u8 addr4[ETH_ALEN]; } __packed __aligned(2); diff --git a/net/mac80211/wpa.c b/net/mac80211/wpa.c index 93ec2f349748..20f742b5503b 100644 --- a/net/mac80211/wpa.c +++ b/net/mac80211/wpa.c @@ -351,7 +351,7 @@ static u8 ccmp_gcmp_aad(struct sk_buff *skb, u8 *aad) * FC | A1 | A2 | A3 | SC | [A4] | [QC] */ put_unaligned_be16(len_a, &aad[0]); put_unaligned(mask_fc, (__le16 *)&aad[2]); - memcpy(&aad[4], &hdr->addr1, 3 * ETH_ALEN); + memcpy(&aad[4], &hdr->addrs, 3 * ETH_ALEN); /* Mask Seq#, leave Frag# */ aad[22] = *((u8 *) &hdr->seq_ctrl) & 0x0f; @@ -792,7 +792,7 @@ static void bip_aad(struct sk_buff *skb, u8 *aad) IEEE80211_FCTL_MOREDATA); put_unaligned(mask_fc, (__le16 *) &aad[0]); /* A1 || A2 || A3 */ - memcpy(aad + 2, &hdr->addr1, 3 * ETH_ALEN); + memcpy(aad + 2, &hdr->addrs, 3 * ETH_ALEN); } diff --git a/net/wireless/lib80211_crypt_ccmp.c b/net/wireless/lib80211_crypt_ccmp.c index 6a5f08f7491e..cca5e1cf089e 100644 --- a/net/wireless/lib80211_crypt_ccmp.c +++ b/net/wireless/lib80211_crypt_ccmp.c @@ -136,7 +136,7 @@ static int ccmp_init_iv_and_aad(const struct ieee80211_hdr *hdr, pos = (u8 *) hdr; aad[0] = pos[0] & 0x8f; aad[1] = pos[1] & 0xc7; - memcpy(aad + 2, hdr->addr1, 3 * ETH_ALEN); + memcpy(aad + 2, &hdr->addrs, 3 * ETH_ALEN); pos = (u8 *) & hdr->seq_ctrl; aad[20] = pos[0] & 0x0f; aad[21] = 0; /* all bits masked */ -- cgit v1.2.3 From b7f14132bf58256e841774ae07d3ffb7a841c2bc Mon Sep 17 00:00:00 2001 From: Hangbin Liu Date: Tue, 30 Aug 2022 17:37:20 +0800 Subject: bonding: use unspecified address if no available link local address When ns_ip6_target was set, the ipv6_dev_get_saddr() will be called to get available source address and send IPv6 neighbor solicit message. If the target is global address, ipv6_dev_get_saddr() will get any available src address. But if the target is link local address, ipv6_dev_get_saddr() will only get available address from our interface, i.e. the corresponding bond interface. But before bond interface up, all the address is tentative, while ipv6_dev_get_saddr() will ignore tentative address. This makes we can't find available link local src address, then bond_ns_send() will not be called and no NS message was sent. Finally bond interface will keep in down state. Fix this by sending NS with unspecified address if there is no available source address. Reported-by: LiLiang Fixes: 5e1eeef69c0f ("bonding: NS target should accept link local address") Signed-off-by: Hangbin Liu Signed-off-by: David S. Miller --- drivers/net/bonding/bond_main.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c index 2f4da2c13c0a..531c7465bc51 100644 --- a/drivers/net/bonding/bond_main.c +++ b/drivers/net/bonding/bond_main.c @@ -3167,6 +3167,9 @@ static void bond_ns_send_all(struct bonding *bond, struct slave *slave) found: if (!ipv6_dev_get_saddr(dev_net(dst->dev), dst->dev, &targets[i], 0, &saddr)) bond_ns_send(slave, &targets[i], &saddr, tags); + else + bond_ns_send(slave, &targets[i], &in6addr_any, tags); + dst_release(dst); kfree(tags); } -- cgit v1.2.3 From fd16eb948ea8b28afb03e11a5b11841e6ac2aa2b Mon Sep 17 00:00:00 2001 From: Hangbin Liu Date: Tue, 30 Aug 2022 17:37:21 +0800 Subject: bonding: add all node mcast address when slave up When a link is enslave to bond, it need to set the interface down first. This makes the slave remove mac multicast address 33:33:00:00:00:01(The IPv6 multicast address ff02::1 is kept even when the interface down). When bond set the slave up, ipv6_mc_up() was not called due to commit c2edacf80e15 ("bonding / ipv6: no addrconf for slaves separately from master"). This is not an issue before we adding the lladdr target feature for bonding, as the mac multicast address will be added back when bond interface up and join group ff02::1. But after adding lladdr target feature for bonding. When user set a lladdr target, the unsolicited NA message with all-nodes multicast dest will be dropped as the slave interface never add 33:33:00:00:00:01 back. Fix this by calling ipv6_mc_up() to add 33:33:00:00:00:01 back when the slave interface up. Reported-by: LiLiang Fixes: 5e1eeef69c0f ("bonding: NS target should accept link local address") Signed-off-by: Hangbin Liu Signed-off-by: David S. Miller --- net/ipv6/addrconf.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index e15f64f22fa8..10ce86bf228e 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -3557,11 +3557,15 @@ static int addrconf_notify(struct notifier_block *this, unsigned long event, fallthrough; case NETDEV_UP: case NETDEV_CHANGE: - if (dev->flags & IFF_SLAVE) + if (idev && idev->cnf.disable_ipv6) break; - if (idev && idev->cnf.disable_ipv6) + if (dev->flags & IFF_SLAVE) { + if (event == NETDEV_UP && !IS_ERR_OR_NULL(idev) && + dev->flags & IFF_UP && dev->flags & IFF_MULTICAST) + ipv6_mc_up(idev); break; + } if (event == NETDEV_UP) { /* restore routes for permanent addresses */ -- cgit v1.2.3 From 592335a4164c3c41f57967223a1e1efe3a0c6eb3 Mon Sep 17 00:00:00 2001 From: Hangbin Liu Date: Tue, 30 Aug 2022 17:37:22 +0800 Subject: bonding: accept unsolicited NA message The unsolicited NA message with all-nodes multicast dest address should be valid, as this also means the link could reach the target. Also rename bond_validate_ns() to bond_validate_na(). Reported-by: LiLiang Fixes: 5e1eeef69c0f ("bonding: NS target should accept link local address") Signed-off-by: Hangbin Liu Signed-off-by: David S. Miller --- drivers/net/bonding/bond_main.c | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c index 531c7465bc51..5c2febe94428 100644 --- a/drivers/net/bonding/bond_main.c +++ b/drivers/net/bonding/bond_main.c @@ -3201,12 +3201,19 @@ static bool bond_has_this_ip6(struct bonding *bond, struct in6_addr *addr) return ret; } -static void bond_validate_ns(struct bonding *bond, struct slave *slave, +static void bond_validate_na(struct bonding *bond, struct slave *slave, struct in6_addr *saddr, struct in6_addr *daddr) { int i; - if (ipv6_addr_any(saddr) || !bond_has_this_ip6(bond, daddr)) { + /* Ignore NAs that: + * 1. Source address is unspecified address. + * 2. Dest address is neither all-nodes multicast address nor + * exist on bond interface. + */ + if (ipv6_addr_any(saddr) || + (!ipv6_addr_equal(daddr, &in6addr_linklocal_allnodes) && + !bond_has_this_ip6(bond, daddr))) { slave_dbg(bond->dev, slave->dev, "%s: sip %pI6c tip %pI6c not found\n", __func__, saddr, daddr); return; @@ -3249,14 +3256,14 @@ static int bond_na_rcv(const struct sk_buff *skb, struct bonding *bond, * see bond_arp_rcv(). */ if (bond_is_active_slave(slave)) - bond_validate_ns(bond, slave, saddr, daddr); + bond_validate_na(bond, slave, saddr, daddr); else if (curr_active_slave && time_after(slave_last_rx(bond, curr_active_slave), curr_active_slave->last_link_up)) - bond_validate_ns(bond, slave, saddr, daddr); + bond_validate_na(bond, slave, saddr, daddr); else if (curr_arp_slave && bond_time_in_interval(bond, slave_last_tx(curr_arp_slave), 1)) - bond_validate_ns(bond, slave, saddr, daddr); + bond_validate_na(bond, slave, saddr, daddr); out: return RX_HANDLER_ANOTHER; -- cgit v1.2.3 From 84a53580c5d2138c7361c7c3eea5b31827e63b35 Mon Sep 17 00:00:00 2001 From: David Lebrun Date: Fri, 2 Sep 2022 10:45:06 +0100 Subject: ipv6: sr: fix out-of-bounds read when setting HMAC data. The SRv6 layer allows defining HMAC data that can later be used to sign IPv6 Segment Routing Headers. This configuration is realised via netlink through four attributes: SEG6_ATTR_HMACKEYID, SEG6_ATTR_SECRET, SEG6_ATTR_SECRETLEN and SEG6_ATTR_ALGID. Because the SECRETLEN attribute is decoupled from the actual length of the SECRET attribute, it is possible to provide invalid combinations (e.g., secret = "", secretlen = 64). This case is not checked in the code and with an appropriately crafted netlink message, an out-of-bounds read of up to 64 bytes (max secret length) can occur past the skb end pointer and into skb_shared_info: Breakpoint 1, seg6_genl_sethmac (skb=, info=) at net/ipv6/seg6.c:208 208 memcpy(hinfo->secret, secret, slen); (gdb) bt #0 seg6_genl_sethmac (skb=, info=) at net/ipv6/seg6.c:208 #1 0xffffffff81e012e9 in genl_family_rcv_msg_doit (skb=skb@entry=0xffff88800b1f9f00, nlh=nlh@entry=0xffff88800b1b7600, extack=extack@entry=0xffffc90000ba7af0, ops=ops@entry=0xffffc90000ba7a80, hdrlen=4, net=0xffffffff84237580 , family=, family=) at net/netlink/genetlink.c:731 #2 0xffffffff81e01435 in genl_family_rcv_msg (extack=0xffffc90000ba7af0, nlh=0xffff88800b1b7600, skb=0xffff88800b1f9f00, family=0xffffffff82fef6c0 ) at net/netlink/genetlink.c:775 #3 genl_rcv_msg (skb=0xffff88800b1f9f00, nlh=0xffff88800b1b7600, extack=0xffffc90000ba7af0) at net/netlink/genetlink.c:792 #4 0xffffffff81dfffc3 in netlink_rcv_skb (skb=skb@entry=0xffff88800b1f9f00, cb=cb@entry=0xffffffff81e01350 ) at net/netlink/af_netlink.c:2501 #5 0xffffffff81e00919 in genl_rcv (skb=0xffff88800b1f9f00) at net/netlink/genetlink.c:803 #6 0xffffffff81dff6ae in netlink_unicast_kernel (ssk=0xffff888010eec800, skb=0xffff88800b1f9f00, sk=0xffff888004aed000) at net/netlink/af_netlink.c:1319 #7 netlink_unicast (ssk=ssk@entry=0xffff888010eec800, skb=skb@entry=0xffff88800b1f9f00, portid=portid@entry=0, nonblock=) at net/netlink/af_netlink.c:1345 #8 0xffffffff81dff9a4 in netlink_sendmsg (sock=, msg=0xffffc90000ba7e48, len=) at net/netlink/af_netlink.c:1921 ... (gdb) p/x ((struct sk_buff *)0xffff88800b1f9f00)->head + ((struct sk_buff *)0xffff88800b1f9f00)->end $1 = 0xffff88800b1b76c0 (gdb) p/x secret $2 = 0xffff88800b1b76c0 (gdb) p slen $3 = 64 '@' The OOB data can then be read back from userspace by dumping HMAC state. This commit fixes this by ensuring SECRETLEN cannot exceed the actual length of SECRET. Reported-by: Lucas Leong Tested: verified that EINVAL is correctly returned when secretlen > len(secret) Fixes: 4f4853dc1c9c1 ("ipv6: sr: implement API to control SR HMAC structure") Signed-off-by: David Lebrun Signed-off-by: David S. Miller --- net/ipv6/seg6.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/net/ipv6/seg6.c b/net/ipv6/seg6.c index 73aaabf0e966..0b0e34ddc64e 100644 --- a/net/ipv6/seg6.c +++ b/net/ipv6/seg6.c @@ -191,6 +191,11 @@ static int seg6_genl_sethmac(struct sk_buff *skb, struct genl_info *info) goto out_unlock; } + if (slen > nla_len(info->attrs[SEG6_ATTR_SECRET])) { + err = -EINVAL; + goto out_unlock; + } + if (hinfo) { err = seg6_hmac_info_del(net, hmackeyid); if (err) -- cgit v1.2.3 From fe2c9c61f668cde28dac2b188028c5299cedcc1e Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Fri, 2 Sep 2022 15:41:11 +0200 Subject: net: mvpp2: debugfs: fix memory leak when using debugfs_lookup() When calling debugfs_lookup() the result must have dput() called on it, otherwise the memory will leak over time. Fix this up to be much simpler logic and only create the root debugfs directory once when the driver is first accessed. That resolves the memory leak and makes things more obvious as to what the intent is. Cc: Marcin Wojtas Cc: Russell King Cc: "David S. Miller" Cc: Eric Dumazet Cc: Jakub Kicinski Cc: Paolo Abeni Cc: netdev@vger.kernel.org Cc: stable Fixes: 21da57a23125 ("net: mvpp2: add a debugfs interface for the Header Parser") Signed-off-by: Greg Kroah-Hartman Signed-off-by: David S. Miller --- drivers/net/ethernet/marvell/mvpp2/mvpp2_debugfs.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/marvell/mvpp2/mvpp2_debugfs.c b/drivers/net/ethernet/marvell/mvpp2/mvpp2_debugfs.c index 4a3baa7e0142..0eec05d905eb 100644 --- a/drivers/net/ethernet/marvell/mvpp2/mvpp2_debugfs.c +++ b/drivers/net/ethernet/marvell/mvpp2/mvpp2_debugfs.c @@ -700,10 +700,10 @@ void mvpp2_dbgfs_cleanup(struct mvpp2 *priv) void mvpp2_dbgfs_init(struct mvpp2 *priv, const char *name) { - struct dentry *mvpp2_dir, *mvpp2_root; + static struct dentry *mvpp2_root; + struct dentry *mvpp2_dir; int ret, i; - mvpp2_root = debugfs_lookup(MVPP2_DRIVER_NAME, NULL); if (!mvpp2_root) mvpp2_root = debugfs_create_dir(MVPP2_DRIVER_NAME, NULL); -- cgit v1.2.3 From 1621e70fc79d77dfeef4e4ba90e125405a274db3 Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Fri, 2 Sep 2022 18:26:56 +0200 Subject: stmmac: intel: Simplify intel_eth_pci_remove() There is no point to call pcim_iounmap_regions() in the remove function, this frees a managed resource that would be release by the framework anyway. Signed-off-by: Christophe JAILLET Reviewed-by: Andy Shevchenko Signed-off-by: David S. Miller --- drivers/net/ethernet/stmicro/stmmac/dwmac-intel.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-intel.c b/drivers/net/ethernet/stmicro/stmmac/dwmac-intel.c index 4f2b82a884b9..9af25be42401 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac-intel.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-intel.c @@ -1136,8 +1136,6 @@ static void intel_eth_pci_remove(struct pci_dev *pdev) clk_disable_unprepare(priv->plat->stmmac_clk); clk_unregister_fixed_rate(priv->plat->stmmac_clk); - - pcim_iounmap_regions(pdev, BIT(0)); } static int __maybe_unused intel_eth_pci_suspend(struct device *dev) -- cgit v1.2.3 From 686dc2db2a0fdc1d34b424ec2c0a735becd8d62b Mon Sep 17 00:00:00 2001 From: Neal Cardwell Date: Sat, 3 Sep 2022 08:10:23 -0400 Subject: tcp: fix early ETIMEDOUT after spurious non-SACK RTO Fix a bug reported and analyzed by Nagaraj Arankal, where the handling of a spurious non-SACK RTO could cause a connection to fail to clear retrans_stamp, causing a later RTO to very prematurely time out the connection with ETIMEDOUT. Here is the buggy scenario, expanding upon Nagaraj Arankal's excellent report: (*1) Send one data packet on a non-SACK connection (*2) Because no ACK packet is received, the packet is retransmitted and we enter CA_Loss; but this retransmission is spurious. (*3) The ACK for the original data is received. The transmitted packet is acknowledged. The TCP timestamp is before the retrans_stamp, so tcp_may_undo() returns true, and tcp_try_undo_loss() returns true without changing state to Open (because tcp_is_sack() is false), and tcp_process_loss() returns without calling tcp_try_undo_recovery(). Normally after undoing a CA_Loss episode, tcp_fastretrans_alert() would see that the connection has returned to CA_Open and fall through and call tcp_try_to_open(), which would set retrans_stamp to 0. However, for non-SACK connections we hold the connection in CA_Loss, so do not fall through to call tcp_try_to_open() and do not set retrans_stamp to 0. So retrans_stamp is (erroneously) still non-zero. At this point the first "retransmission event" has passed and been recovered from. Any future retransmission is a completely new "event". However, retrans_stamp is erroneously still set. (And we are still in CA_Loss, which is correct.) (*4) After 16 minutes (to correspond with tcp_retries2=15), a new data packet is sent. Note: No data is transmitted between (*3) and (*4) and we disabled keep alives. The socket's timeout SHOULD be calculated from this point in time, but instead it's calculated from the prior "event" 16 minutes ago (step (*2)). (*5) Because no ACK packet is received, the packet is retransmitted. (*6) At the time of the 2nd retransmission, the socket returns ETIMEDOUT, prematurely, because retrans_stamp is (erroneously) too far in the past (set at the time of (*2)). This commit fixes this bug by ensuring that we reuse in tcp_try_undo_loss() the same careful logic for non-SACK connections that we have in tcp_try_undo_recovery(). To avoid duplicating logic, we factor out that logic into a new tcp_is_non_sack_preventing_reopen() helper and call that helper from both undo functions. Fixes: da34ac7626b5 ("tcp: only undo on partial ACKs in CA_Loss") Reported-by: Nagaraj Arankal Link: https://lore.kernel.org/all/SJ0PR84MB1847BE6C24D274C46A1B9B0EB27A9@SJ0PR84MB1847.NAMPRD84.PROD.OUTLOOK.COM/ Signed-off-by: Neal Cardwell Signed-off-by: Yuchung Cheng Reviewed-by: Eric Dumazet Link: https://lore.kernel.org/r/20220903121023.866900-1-ncardwell.kernel@gmail.com Signed-off-by: Paolo Abeni --- net/ipv4/tcp_input.c | 25 ++++++++++++++++++------- 1 file changed, 18 insertions(+), 7 deletions(-) diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index b85a9f755da4..bc2ea12221f9 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -2513,6 +2513,21 @@ static inline bool tcp_may_undo(const struct tcp_sock *tp) return tp->undo_marker && (!tp->undo_retrans || tcp_packet_delayed(tp)); } +static bool tcp_is_non_sack_preventing_reopen(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + + if (tp->snd_una == tp->high_seq && tcp_is_reno(tp)) { + /* Hold old state until something *above* high_seq + * is ACKed. For Reno it is MUST to prevent false + * fast retransmits (RFC2582). SACK TCP is safe. */ + if (!tcp_any_retrans_done(sk)) + tp->retrans_stamp = 0; + return true; + } + return false; +} + /* People celebrate: "We love our President!" */ static bool tcp_try_undo_recovery(struct sock *sk) { @@ -2535,14 +2550,8 @@ static bool tcp_try_undo_recovery(struct sock *sk) } else if (tp->rack.reo_wnd_persist) { tp->rack.reo_wnd_persist--; } - if (tp->snd_una == tp->high_seq && tcp_is_reno(tp)) { - /* Hold old state until something *above* high_seq - * is ACKed. For Reno it is MUST to prevent false - * fast retransmits (RFC2582). SACK TCP is safe. */ - if (!tcp_any_retrans_done(sk)) - tp->retrans_stamp = 0; + if (tcp_is_non_sack_preventing_reopen(sk)) return true; - } tcp_set_ca_state(sk, TCP_CA_Open); tp->is_sack_reneg = 0; return false; @@ -2578,6 +2587,8 @@ static bool tcp_try_undo_loss(struct sock *sk, bool frto_undo) NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPSPURIOUSRTOS); inet_csk(sk)->icsk_retransmits = 0; + if (tcp_is_non_sack_preventing_reopen(sk)) + return true; if (frto_undo || tcp_is_sack(tp)) { tcp_set_ca_state(sk, TCP_CA_Open); tp->is_sack_reneg = 0; -- cgit v1.2.3 From 42b998d4aa59b9dea51665e4f3be1d733c47e2bf Mon Sep 17 00:00:00 2001 From: Christian Marangi Date: Sun, 4 Sep 2022 23:53:19 +0200 Subject: net: dsa: qca8k: fix NULL pointer dereference for of_device_get_match_data of_device_get_match_data is called on priv->dev before priv->dev is actually set. Move of_device_get_match_data after priv->dev is correctly set to fix this kernel panic. Fixes: 3bb0844e7bcd ("net: dsa: qca8k: cache match data to speed up access") Signed-off-by: Christian Marangi Reviewed-by: Vladimir Oltean Link: https://lore.kernel.org/r/20220904215319.13070-1-ansuelsmth@gmail.com Signed-off-by: Paolo Abeni --- drivers/net/dsa/qca/qca8k-8xxx.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/dsa/qca/qca8k-8xxx.c b/drivers/net/dsa/qca/qca8k-8xxx.c index 1d3e7782a71f..c181346388a4 100644 --- a/drivers/net/dsa/qca/qca8k-8xxx.c +++ b/drivers/net/dsa/qca/qca8k-8xxx.c @@ -1889,9 +1889,9 @@ qca8k_sw_probe(struct mdio_device *mdiodev) if (!priv) return -ENOMEM; - priv->info = of_device_get_match_data(priv->dev); priv->bus = mdiodev->bus; priv->dev = &mdiodev->dev; + priv->info = of_device_get_match_data(priv->dev); priv->reset_gpio = devm_gpiod_get_optional(priv->dev, "reset", GPIOD_ASIS); -- cgit v1.2.3 From e1091e226a2bab4ded1fe26efba2aee1aab06450 Mon Sep 17 00:00:00 2001 From: "jerry.meng" Date: Mon, 5 Sep 2022 09:24:52 +0800 Subject: net: usb: qmi_wwan: add Quectel RM520N MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit add support for Quectel RM520N which is based on Qualcomm SDX62 chip. 0x0801: DIAG + NMEA + AT + MODEM + RMNET T: Bus=03 Lev=01 Prnt=01 Port=01 Cnt=02 Dev#= 10 Spd=480 MxCh= 0 D: Ver= 2.10 Cls=00(>ifc ) Sub=00 Prot=00 MxPS=64 #Cfgs= 1 P: Vendor=2c7c ProdID=0801 Rev= 5.04 S: Manufacturer=Quectel S: Product=RM520N-GL S: SerialNumber=384af524 C:* #Ifs= 5 Cfg#= 1 Atr=a0 MxPwr=500mA I:* If#= 0 Alt= 0 #EPs= 2 Cls=ff(vend.) Sub=ff Prot=30 Driver=option E: Ad=01(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=81(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms I:* If#= 1 Alt= 0 #EPs= 3 Cls=ff(vend.) Sub=00 Prot=40 Driver=option E: Ad=83(I) Atr=03(Int.) MxPS= 10 Ivl=32ms E: Ad=82(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=02(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms I:* If#= 2 Alt= 0 #EPs= 3 Cls=ff(vend.) Sub=00 Prot=00 Driver=option E: Ad=85(I) Atr=03(Int.) MxPS= 10 Ivl=32ms E: Ad=84(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=03(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms I:* If#= 3 Alt= 0 #EPs= 3 Cls=ff(vend.) Sub=00 Prot=00 Driver=option E: Ad=87(I) Atr=03(Int.) MxPS= 10 Ivl=32ms E: Ad=86(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=04(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms I:* If#= 4 Alt= 0 #EPs= 3 Cls=ff(vend.) Sub=ff Prot=ff Driver=qmi_wwan E: Ad=88(I) Atr=03(Int.) MxPS= 8 Ivl=32ms E: Ad=8e(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=0f(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms Signed-off-by: jerry.meng Acked-by: Bjørn Mork Link: https://lore.kernel.org/r/tencent_E50CA8A206904897C2D20DDAE90731183C05@qq.com Signed-off-by: Paolo Abeni --- drivers/net/usb/qmi_wwan.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/usb/qmi_wwan.c b/drivers/net/usb/qmi_wwan.c index 709e3c59e340..0cb187def5bc 100644 --- a/drivers/net/usb/qmi_wwan.c +++ b/drivers/net/usb/qmi_wwan.c @@ -1087,6 +1087,7 @@ static const struct usb_device_id products[] = { {QMI_MATCH_FF_FF_FF(0x2c7c, 0x0512)}, /* Quectel EG12/EM12 */ {QMI_MATCH_FF_FF_FF(0x2c7c, 0x0620)}, /* Quectel EM160R-GL */ {QMI_MATCH_FF_FF_FF(0x2c7c, 0x0800)}, /* Quectel RM500Q-GL */ + {QMI_MATCH_FF_FF_FF(0x2c7c, 0x0801)}, /* Quectel RM520N */ /* 3. Combined interface devices matching on interface number */ {QMI_FIXED_INTF(0x0408, 0xea42, 4)}, /* Yota / Megafon M100-1 */ -- cgit v1.2.3 From 11afdc6526de0e0368c05da632a8c0d29fc60bb8 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Mon, 5 Sep 2022 20:01:23 +0300 Subject: net: dsa: felix: tc-taprio intervals smaller than MTU should send at least one packet The blamed commit broke tc-taprio schedules such as this one: tc qdisc replace dev $swp1 root taprio \ num_tc 8 \ map 0 1 2 3 4 5 6 7 \ queues 1@0 1@1 1@2 1@3 1@4 1@5 1@6 1@7 \ base-time 0 \ sched-entry S 0x7f 990000 \ sched-entry S 0x80 10000 \ flags 0x2 because the gate entry for TC 7 (S 0x80 10000 ns) now has a static guard band added earlier than its 'gate close' event, such that packet overruns won't occur in the worst case of the largest packet possible. Since guard bands are statically determined based on the per-tc QSYS_QMAXSDU_CFG_* with a fallback on the port-based QSYS_PORT_MAX_SDU, we need to discuss what happens with TC 7 depending on kernel version, since the driver, prior to commit 55a515b1f5a9 ("net: dsa: felix: drop oversized frames with tc-taprio instead of hanging the port"), did not touch QSYS_QMAXSDU_CFG_*, and therefore relied on QSYS_PORT_MAX_SDU. 1 (before vsc9959_tas_guard_bands_update): QSYS_PORT_MAX_SDU defaults to 1518, and at gigabit this introduces a static guard band (independent of packet sizes) of 12144 ns, plus QSYS::HSCH_MISC_CFG.FRM_ADJ (bit time of 20 octets => 160 ns). But this is larger than the time window itself, of 10000 ns. So, the queue system never considers a frame with TC 7 as eligible for transmission, since the gate practically never opens, and these frames are forever stuck in the TX queues and hang the port. 2 (after vsc9959_tas_guard_bands_update): Under the sole goal of enabling oversized frame dropping, we make an effort to set QSYS_QMAXSDU_CFG_7 to 1230 bytes. But QSYS_QMAXSDU_CFG_7 plays one more role, which we did not take into account: per-tc static guard band, expressed in L2 byte time (auto-adjusted for FCS and L1 overhead). There is a discrepancy between what the driver thinks (that there is no guard band, and 100% of min_gate_len[tc] is available for egress scheduling) and what the hardware actually does (crops the equivalent of QSYS_QMAXSDU_CFG_7 ns out of min_gate_len[tc]). In practice, this means that the hardware thinks it has exactly 0 ns for scheduling tc 7. In both cases, even minimum sized Ethernet frames are stuck on egress rather than being considered for scheduling on TC 7, even if they would fit given a proper configuration. Considering the current situation, with vsc9959_tas_guard_bands_update(), frames between 60 octets and 1230 octets in size are not eligible for oversized dropping (because they are smaller than QSYS_QMAXSDU_CFG_7), but won't be considered as eligible for scheduling either, because the min_gate_len[7] (10000 ns) minus the guard band determined by QSYS_QMAXSDU_CFG_7 (1230 octets * 8 ns per octet == 9840 ns) minus the guard band auto-added for L1 overhead by QSYS::HSCH_MISC_CFG.FRM_ADJ (20 octets * 8 ns per octet == 160 octets) leaves 0 ns for scheduling in the queue system proper. Investigating the hardware behavior, it becomes apparent that the queue system needs precisely 33 ns of 'gate open' time in order to consider a frame as eligible for scheduling to a tc. So the solution to this problem is to amend vsc9959_tas_guard_bands_update(), by giving the per-tc guard bands less space by exactly 33 ns, just enough for one frame to be scheduled in that interval. This allows the queue system to make forward progress for that port-tc, and prevents it from hanging. Fixes: 297c4de6f780 ("net: dsa: felix: re-enable TAS guard band mode") Reported-by: Xiaoliang Yang Signed-off-by: Vladimir Oltean Signed-off-by: David S. Miller --- drivers/net/dsa/ocelot/felix_vsc9959.c | 35 ++++++++++++++++++++++++++++++---- 1 file changed, 31 insertions(+), 4 deletions(-) diff --git a/drivers/net/dsa/ocelot/felix_vsc9959.c b/drivers/net/dsa/ocelot/felix_vsc9959.c index 1cdce8a98d1d..262be2cf4e4b 100644 --- a/drivers/net/dsa/ocelot/felix_vsc9959.c +++ b/drivers/net/dsa/ocelot/felix_vsc9959.c @@ -22,6 +22,7 @@ #define VSC9959_NUM_PORTS 6 #define VSC9959_TAS_GCL_ENTRY_MAX 63 +#define VSC9959_TAS_MIN_GATE_LEN_NS 33 #define VSC9959_VCAP_POLICER_BASE 63 #define VSC9959_VCAP_POLICER_MAX 383 #define VSC9959_SWITCH_PCI_BAR 4 @@ -1478,6 +1479,23 @@ static void vsc9959_mdio_bus_free(struct ocelot *ocelot) mdiobus_free(felix->imdio); } +/* The switch considers any frame (regardless of size) as eligible for + * transmission if the traffic class gate is open for at least 33 ns. + * Overruns are prevented by cropping an interval at the end of the gate time + * slot for which egress scheduling is blocked, but we need to still keep 33 ns + * available for one packet to be transmitted, otherwise the port tc will hang. + * This function returns the size of a gate interval that remains available for + * setting the guard band, after reserving the space for one egress frame. + */ +static u64 vsc9959_tas_remaining_gate_len_ps(u64 gate_len_ns) +{ + /* Gate always open */ + if (gate_len_ns == U64_MAX) + return U64_MAX; + + return (gate_len_ns - VSC9959_TAS_MIN_GATE_LEN_NS) * PSEC_PER_NSEC; +} + /* Extract shortest continuous gate open intervals in ns for each traffic class * of a cyclic tc-taprio schedule. If a gate is always open, the duration is * considered U64_MAX. If the gate is always closed, it is considered 0. @@ -1596,10 +1614,13 @@ static void vsc9959_tas_guard_bands_update(struct ocelot *ocelot, int port) vsc9959_tas_min_gate_lengths(ocelot_port->taprio, min_gate_len); for (tc = 0; tc < OCELOT_NUM_TC; tc++) { + u64 remaining_gate_len_ps; u32 max_sdu; - if (min_gate_len[tc] == U64_MAX /* Gate always open */ || - min_gate_len[tc] * PSEC_PER_NSEC > needed_bit_time_ps) { + remaining_gate_len_ps = + vsc9959_tas_remaining_gate_len_ps(min_gate_len[tc]); + + if (remaining_gate_len_ps > needed_bit_time_ps) { /* Setting QMAXSDU_CFG to 0 disables oversized frame * dropping. */ @@ -1612,9 +1633,15 @@ static void vsc9959_tas_guard_bands_update(struct ocelot *ocelot, int port) /* If traffic class doesn't support a full MTU sized * frame, make sure to enable oversize frame dropping * for frames larger than the smallest that would fit. + * + * However, the exact same register, QSYS_QMAXSDU_CFG_*, + * controls not only oversized frame dropping, but also + * per-tc static guard band lengths, so it reduces the + * useful gate interval length. Therefore, be careful + * to calculate a guard band (and therefore max_sdu) + * that still leaves 33 ns available in the time slot. */ - max_sdu = div_u64(min_gate_len[tc] * PSEC_PER_NSEC, - picos_per_byte); + max_sdu = div_u64(remaining_gate_len_ps, picos_per_byte); /* A TC gate may be completely closed, which is a * special case where all packets are oversized. * Any limit smaller than 64 octets accomplishes this -- cgit v1.2.3 From 843794bbdef83955ae5b43dfafc355c3786e2145 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Mon, 5 Sep 2022 20:01:24 +0300 Subject: net: dsa: felix: disable cut-through forwarding for frames oversized for tc-taprio Experimentally, it looks like when QSYS_QMAXSDU_CFG_7 is set to 605, frames even way larger than 601 octets are transmitted even though these should be considered as oversized, according to the documentation, and dropped. Since oversized frame dropping depends on frame size, which is only known at the EOF stage, and therefore not at SOF when cut-through forwarding begins, it means that the switch cannot take QSYS_QMAXSDU_CFG_* into consideration for traffic classes that are cut-through. Since cut-through forwarding has no UAPI to control it, and the driver enables it based on the mantra "if we can, then why not", the strategy is to alter vsc9959_cut_through_fwd() to take into consideration which tc's have oversize frame dropping enabled, and disable cut-through for them. Then, from vsc9959_tas_guard_bands_update(), we re-trigger the cut-through determination process. There are 2 strategies for vsc9959_cut_through_fwd() to determine whether a tc has oversized dropping enabled or not. One is to keep a bit mask of traffic classes per port, and the other is to read back from the hardware registers (a non-zero value of QSYS_QMAXSDU_CFG_* means the feature is enabled). We choose reading back from registers, because struct ocelot_port is shared with drivers (ocelot, seville) that don't support either cut-through nor tc-taprio, and we don't have a felix specific extension of struct ocelot_port. Furthermore, reading registers from the Felix hardware is quite cheap, since they are memory-mapped. Fixes: 55a515b1f5a9 ("net: dsa: felix: drop oversized frames with tc-taprio instead of hanging the port") Signed-off-by: Vladimir Oltean Signed-off-by: David S. Miller --- drivers/net/dsa/ocelot/felix_vsc9959.c | 122 +++++++++++++++++++++------------ 1 file changed, 79 insertions(+), 43 deletions(-) diff --git a/drivers/net/dsa/ocelot/felix_vsc9959.c b/drivers/net/dsa/ocelot/felix_vsc9959.c index 262be2cf4e4b..ad7c795d6612 100644 --- a/drivers/net/dsa/ocelot/felix_vsc9959.c +++ b/drivers/net/dsa/ocelot/felix_vsc9959.c @@ -1557,6 +1557,65 @@ static void vsc9959_tas_min_gate_lengths(struct tc_taprio_qopt_offload *taprio, min_gate_len[tc] = 0; } +/* ocelot_write_rix is a macro that concatenates QSYS_MAXSDU_CFG_* with _RSZ, + * so we need to spell out the register access to each traffic class in helper + * functions, to simplify callers + */ +static void vsc9959_port_qmaxsdu_set(struct ocelot *ocelot, int port, int tc, + u32 max_sdu) +{ + switch (tc) { + case 0: + ocelot_write_rix(ocelot, max_sdu, QSYS_QMAXSDU_CFG_0, + port); + break; + case 1: + ocelot_write_rix(ocelot, max_sdu, QSYS_QMAXSDU_CFG_1, + port); + break; + case 2: + ocelot_write_rix(ocelot, max_sdu, QSYS_QMAXSDU_CFG_2, + port); + break; + case 3: + ocelot_write_rix(ocelot, max_sdu, QSYS_QMAXSDU_CFG_3, + port); + break; + case 4: + ocelot_write_rix(ocelot, max_sdu, QSYS_QMAXSDU_CFG_4, + port); + break; + case 5: + ocelot_write_rix(ocelot, max_sdu, QSYS_QMAXSDU_CFG_5, + port); + break; + case 6: + ocelot_write_rix(ocelot, max_sdu, QSYS_QMAXSDU_CFG_6, + port); + break; + case 7: + ocelot_write_rix(ocelot, max_sdu, QSYS_QMAXSDU_CFG_7, + port); + break; + } +} + +static u32 vsc9959_port_qmaxsdu_get(struct ocelot *ocelot, int port, int tc) +{ + switch (tc) { + case 0: return ocelot_read_rix(ocelot, QSYS_QMAXSDU_CFG_0, port); + case 1: return ocelot_read_rix(ocelot, QSYS_QMAXSDU_CFG_1, port); + case 2: return ocelot_read_rix(ocelot, QSYS_QMAXSDU_CFG_2, port); + case 3: return ocelot_read_rix(ocelot, QSYS_QMAXSDU_CFG_3, port); + case 4: return ocelot_read_rix(ocelot, QSYS_QMAXSDU_CFG_4, port); + case 5: return ocelot_read_rix(ocelot, QSYS_QMAXSDU_CFG_5, port); + case 6: return ocelot_read_rix(ocelot, QSYS_QMAXSDU_CFG_6, port); + case 7: return ocelot_read_rix(ocelot, QSYS_QMAXSDU_CFG_7, port); + default: + return 0; + } +} + /* Update QSYS_PORT_MAX_SDU to make sure the static guard bands added by the * switch (see the ALWAYS_GUARD_BAND_SCH_Q comment) are correct at all MTU * values (the default value is 1518). Also, for traffic class windows smaller @@ -1613,6 +1672,8 @@ static void vsc9959_tas_guard_bands_update(struct ocelot *ocelot, int port) vsc9959_tas_min_gate_lengths(ocelot_port->taprio, min_gate_len); + mutex_lock(&ocelot->fwd_domain_lock); + for (tc = 0; tc < OCELOT_NUM_TC; tc++) { u64 remaining_gate_len_ps; u32 max_sdu; @@ -1664,47 +1725,14 @@ static void vsc9959_tas_guard_bands_update(struct ocelot *ocelot, int port) max_sdu); } - /* ocelot_write_rix is a macro that concatenates - * QSYS_MAXSDU_CFG_* with _RSZ, so we need to spell out - * the writes to each traffic class - */ - switch (tc) { - case 0: - ocelot_write_rix(ocelot, max_sdu, QSYS_QMAXSDU_CFG_0, - port); - break; - case 1: - ocelot_write_rix(ocelot, max_sdu, QSYS_QMAXSDU_CFG_1, - port); - break; - case 2: - ocelot_write_rix(ocelot, max_sdu, QSYS_QMAXSDU_CFG_2, - port); - break; - case 3: - ocelot_write_rix(ocelot, max_sdu, QSYS_QMAXSDU_CFG_3, - port); - break; - case 4: - ocelot_write_rix(ocelot, max_sdu, QSYS_QMAXSDU_CFG_4, - port); - break; - case 5: - ocelot_write_rix(ocelot, max_sdu, QSYS_QMAXSDU_CFG_5, - port); - break; - case 6: - ocelot_write_rix(ocelot, max_sdu, QSYS_QMAXSDU_CFG_6, - port); - break; - case 7: - ocelot_write_rix(ocelot, max_sdu, QSYS_QMAXSDU_CFG_7, - port); - break; - } + vsc9959_port_qmaxsdu_set(ocelot, port, tc, max_sdu); } ocelot_write_rix(ocelot, maxlen, QSYS_PORT_MAX_SDU, port); + + ocelot->ops->cut_through_fwd(ocelot); + + mutex_unlock(&ocelot->fwd_domain_lock); } static void vsc9959_sched_speed_set(struct ocelot *ocelot, int port, @@ -2797,7 +2825,7 @@ static void vsc9959_cut_through_fwd(struct ocelot *ocelot) { struct felix *felix = ocelot_to_felix(ocelot); struct dsa_switch *ds = felix->ds; - int port, other_port; + int tc, port, other_port; lockdep_assert_held(&ocelot->fwd_domain_lock); @@ -2841,19 +2869,27 @@ static void vsc9959_cut_through_fwd(struct ocelot *ocelot) min_speed = other_ocelot_port->speed; } - /* Enable cut-through forwarding for all traffic classes. */ - if (ocelot_port->speed == min_speed) + /* Enable cut-through forwarding for all traffic classes that + * don't have oversized dropping enabled, since this check is + * bypassed in cut-through mode. + */ + if (ocelot_port->speed == min_speed) { val = GENMASK(7, 0); + for (tc = 0; tc < OCELOT_NUM_TC; tc++) + if (vsc9959_port_qmaxsdu_get(ocelot, port, tc)) + val &= ~BIT(tc); + } + set: tmp = ocelot_read_rix(ocelot, ANA_CUT_THRU_CFG, port); if (tmp == val) continue; dev_dbg(ocelot->dev, - "port %d fwd mask 0x%lx speed %d min_speed %d, %s cut-through forwarding\n", + "port %d fwd mask 0x%lx speed %d min_speed %d, %s cut-through forwarding on TC mask 0x%x\n", port, mask, ocelot_port->speed, min_speed, - val ? "enabling" : "disabling"); + val ? "enabling" : "disabling", val); ocelot_write_rix(ocelot, val, ANA_CUT_THRU_CFG, port); } -- cgit v1.2.3 From a4bb481aeb9d84cb53112a478e6db4705b794c34 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Mon, 5 Sep 2022 20:01:25 +0300 Subject: net: dsa: felix: access QSYS_TAG_CONFIG under tas_lock in vsc9959_sched_speed_set The read-modify-write of QSYS_TAG_CONFIG from vsc9959_sched_speed_set() runs unlocked with respect to the other functions that access it, which are vsc9959_tas_guard_bands_update(), vsc9959_qos_port_tas_set() and vsc9959_tas_clock_adjust(). All the others are under ocelot->tas_lock, so move the vsc9959_sched_speed_set() access under that lock as well, to resolve the concurrency. Fixes: 55a515b1f5a9 ("net: dsa: felix: drop oversized frames with tc-taprio instead of hanging the port") Signed-off-by: Vladimir Oltean Signed-off-by: David S. Miller --- drivers/net/dsa/ocelot/felix_vsc9959.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/dsa/ocelot/felix_vsc9959.c b/drivers/net/dsa/ocelot/felix_vsc9959.c index ad7c795d6612..f8f19a85744c 100644 --- a/drivers/net/dsa/ocelot/felix_vsc9959.c +++ b/drivers/net/dsa/ocelot/felix_vsc9959.c @@ -1759,13 +1759,13 @@ static void vsc9959_sched_speed_set(struct ocelot *ocelot, int port, break; } + mutex_lock(&ocelot->tas_lock); + ocelot_rmw_rix(ocelot, QSYS_TAG_CONFIG_LINK_SPEED(tas_speed), QSYS_TAG_CONFIG_LINK_SPEED_M, QSYS_TAG_CONFIG, port); - mutex_lock(&ocelot->tas_lock); - if (ocelot_port->taprio) vsc9959_tas_guard_bands_update(ocelot, port); -- cgit v1.2.3 From 0e80707d94e4c88f9879bdafcbaceb13432ec1f4 Mon Sep 17 00:00:00 2001 From: Lorenzo Bianconi Date: Tue, 6 Sep 2022 16:36:32 +0200 Subject: net: ethernet: mtk_eth_soc: fix typo in __mtk_foe_entry_clear Set ib1 state to MTK_FOE_STATE_UNBIND in __mtk_foe_entry_clear routine. Fixes: 33fc42de33278 ("net: ethernet: mtk_eth_soc: support creating mac address based offload entries") Signed-off-by: Lorenzo Bianconi Signed-off-by: David S. Miller --- drivers/net/ethernet/mediatek/mtk_ppe.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/mediatek/mtk_ppe.c b/drivers/net/ethernet/mediatek/mtk_ppe.c index dab8f3f771f8..cfe804bc8d20 100644 --- a/drivers/net/ethernet/mediatek/mtk_ppe.c +++ b/drivers/net/ethernet/mediatek/mtk_ppe.c @@ -412,7 +412,7 @@ __mtk_foe_entry_clear(struct mtk_ppe *ppe, struct mtk_flow_entry *entry) if (entry->hash != 0xffff) { ppe->foe_table[entry->hash].ib1 &= ~MTK_FOE_IB1_STATE; ppe->foe_table[entry->hash].ib1 |= FIELD_PREP(MTK_FOE_IB1_STATE, - MTK_FOE_STATE_BIND); + MTK_FOE_STATE_UNBIND); dma_wmb(); } entry->hash = 0xffff; -- cgit v1.2.3 From 9cb252c4c1c53ae58bc565bab76e98133288f23a Mon Sep 17 00:00:00 2001 From: Menglong Dong Date: Mon, 5 Sep 2022 11:50:15 +0800 Subject: net: skb: export skb drop reaons to user by TRACE_DEFINE_ENUM As Eric reported, the 'reason' field is not presented when trace the kfree_skb event by perf: $ perf record -e skb:kfree_skb -a sleep 10 $ perf script ip_defrag 14605 [021] 221.614303: skb:kfree_skb: skbaddr=0xffff9d2851242700 protocol=34525 location=0xffffffffa39346b1 reason: The cause seems to be passing kernel address directly to TP_printk(), which is not right. As the enum 'skb_drop_reason' is not exported to user space through TRACE_DEFINE_ENUM(), perf can't get the drop reason string from the 'reason' field, which is a number. Therefore, we introduce the macro DEFINE_DROP_REASON(), which is used to define the trace enum by TRACE_DEFINE_ENUM(). With the help of DEFINE_DROP_REASON(), now we can remove the auto-generate that we introduced in the commit ec43908dd556 ("net: skb: use auto-generation to convert skb drop reason to string"), and define the string array 'drop_reasons'. Hmmmm...now we come back to the situation that have to maintain drop reasons in both enum skb_drop_reason and DEFINE_DROP_REASON. But they are both in dropreason.h, which makes it easier. After this commit, now the format of kfree_skb is like this: $ cat /tracing/events/skb/kfree_skb/format name: kfree_skb ID: 1524 format: field:unsigned short common_type; offset:0; size:2; signed:0; field:unsigned char common_flags; offset:2; size:1; signed:0; field:unsigned char common_preempt_count; offset:3; size:1; signed:0; field:int common_pid; offset:4; size:4; signed:1; field:void * skbaddr; offset:8; size:8; signed:0; field:void * location; offset:16; size:8; signed:0; field:unsigned short protocol; offset:24; size:2; signed:0; field:enum skb_drop_reason reason; offset:28; size:4; signed:0; print fmt: "skbaddr=%p protocol=%u location=%p reason: %s", REC->skbaddr, REC->protocol, REC->location, __print_symbolic(REC->reason, { 1, "NOT_SPECIFIED" }, { 2, "NO_SOCKET" } ...... Fixes: ec43908dd556 ("net: skb: use auto-generation to convert skb drop reason to string") Link: https://lore.kernel.org/netdev/CANn89i+bx0ybvE55iMYf5GJM48WwV1HNpdm9Q6t-HaEstqpCSA@mail.gmail.com/ Reported-by: Eric Dumazet Signed-off-by: Menglong Dong Signed-off-by: David S. Miller --- include/net/dropreason.h | 67 ++++++++++++++++++++++++++++++++++++++++++++++ include/trace/events/skb.h | 15 ++++++++++- net/core/.gitignore | 1 - net/core/Makefile | 22 +-------------- net/core/skbuff.c | 6 ++++- 5 files changed, 87 insertions(+), 24 deletions(-) delete mode 100644 net/core/.gitignore diff --git a/include/net/dropreason.h b/include/net/dropreason.h index fae9b40e54fa..c1cbcdbaf149 100644 --- a/include/net/dropreason.h +++ b/include/net/dropreason.h @@ -3,6 +3,73 @@ #ifndef _LINUX_DROPREASON_H #define _LINUX_DROPREASON_H +#define DEFINE_DROP_REASON(FN, FNe) \ + FN(NOT_SPECIFIED) \ + FN(NO_SOCKET) \ + FN(PKT_TOO_SMALL) \ + FN(TCP_CSUM) \ + FN(SOCKET_FILTER) \ + FN(UDP_CSUM) \ + FN(NETFILTER_DROP) \ + FN(OTHERHOST) \ + FN(IP_CSUM) \ + FN(IP_INHDR) \ + FN(IP_RPFILTER) \ + FN(UNICAST_IN_L2_MULTICAST) \ + FN(XFRM_POLICY) \ + FN(IP_NOPROTO) \ + FN(SOCKET_RCVBUFF) \ + FN(PROTO_MEM) \ + FN(TCP_MD5NOTFOUND) \ + FN(TCP_MD5UNEXPECTED) \ + FN(TCP_MD5FAILURE) \ + FN(SOCKET_BACKLOG) \ + FN(TCP_FLAGS) \ + FN(TCP_ZEROWINDOW) \ + FN(TCP_OLD_DATA) \ + FN(TCP_OVERWINDOW) \ + FN(TCP_OFOMERGE) \ + FN(TCP_RFC7323_PAWS) \ + FN(TCP_INVALID_SEQUENCE) \ + FN(TCP_RESET) \ + FN(TCP_INVALID_SYN) \ + FN(TCP_CLOSE) \ + FN(TCP_FASTOPEN) \ + FN(TCP_OLD_ACK) \ + FN(TCP_TOO_OLD_ACK) \ + FN(TCP_ACK_UNSENT_DATA) \ + FN(TCP_OFO_QUEUE_PRUNE) \ + FN(TCP_OFO_DROP) \ + FN(IP_OUTNOROUTES) \ + FN(BPF_CGROUP_EGRESS) \ + FN(IPV6DISABLED) \ + FN(NEIGH_CREATEFAIL) \ + FN(NEIGH_FAILED) \ + FN(NEIGH_QUEUEFULL) \ + FN(NEIGH_DEAD) \ + FN(TC_EGRESS) \ + FN(QDISC_DROP) \ + FN(CPU_BACKLOG) \ + FN(XDP) \ + FN(TC_INGRESS) \ + FN(UNHANDLED_PROTO) \ + FN(SKB_CSUM) \ + FN(SKB_GSO_SEG) \ + FN(SKB_UCOPY_FAULT) \ + FN(DEV_HDR) \ + FN(DEV_READY) \ + FN(FULL_RING) \ + FN(NOMEM) \ + FN(HDR_TRUNC) \ + FN(TAP_FILTER) \ + FN(TAP_TXFILTER) \ + FN(ICMP_CSUM) \ + FN(INVALID_PROTO) \ + FN(IP_INADDRERRORS) \ + FN(IP_INNOROUTES) \ + FN(PKT_TOO_BIG) \ + FNe(MAX) + /** * enum skb_drop_reason - the reasons of skb drops * diff --git a/include/trace/events/skb.h b/include/trace/events/skb.h index 45264e4bb254..50a974f7dfb4 100644 --- a/include/trace/events/skb.h +++ b/include/trace/events/skb.h @@ -9,6 +9,15 @@ #include #include +#undef FN +#define FN(reason) TRACE_DEFINE_ENUM(SKB_DROP_REASON_##reason); +DEFINE_DROP_REASON(FN, FN) + +#undef FN +#undef FNe +#define FN(reason) { SKB_DROP_REASON_##reason, #reason }, +#define FNe(reason) { SKB_DROP_REASON_##reason, #reason } + /* * Tracepoint for free an sk_buff: */ @@ -35,9 +44,13 @@ TRACE_EVENT(kfree_skb, TP_printk("skbaddr=%p protocol=%u location=%p reason: %s", __entry->skbaddr, __entry->protocol, __entry->location, - drop_reasons[__entry->reason]) + __print_symbolic(__entry->reason, + DEFINE_DROP_REASON(FN, FNe))) ); +#undef FN +#undef FNe + TRACE_EVENT(consume_skb, TP_PROTO(struct sk_buff *skb), diff --git a/net/core/.gitignore b/net/core/.gitignore deleted file mode 100644 index df1e74372cce..000000000000 --- a/net/core/.gitignore +++ /dev/null @@ -1 +0,0 @@ -dropreason_str.c diff --git a/net/core/Makefile b/net/core/Makefile index e8ce3bd283a6..5857cec87b83 100644 --- a/net/core/Makefile +++ b/net/core/Makefile @@ -5,7 +5,7 @@ obj-y := sock.o request_sock.o skbuff.o datagram.o stream.o scm.o \ gen_stats.o gen_estimator.o net_namespace.o secure_seq.o \ - flow_dissector.o dropreason_str.o + flow_dissector.o obj-$(CONFIG_SYSCTL) += sysctl_net_core.o @@ -40,23 +40,3 @@ obj-$(CONFIG_NET_SOCK_MSG) += skmsg.o obj-$(CONFIG_BPF_SYSCALL) += sock_map.o obj-$(CONFIG_BPF_SYSCALL) += bpf_sk_storage.o obj-$(CONFIG_OF) += of_net.o - -clean-files := dropreason_str.c - -quiet_cmd_dropreason_str = GEN $@ -cmd_dropreason_str = awk -F ',' 'BEGIN{ print "\#include \n"; \ - print "const char * const drop_reasons[] = {" }\ - /^enum skb_drop/ { dr=1; }\ - /^\};/ { dr=0; }\ - /^\tSKB_DROP_REASON_/ {\ - if (dr) {\ - sub(/\tSKB_DROP_REASON_/, "", $$1);\ - printf "\t[SKB_DROP_REASON_%s] = \"%s\",\n", $$1, $$1;\ - }\ - }\ - END{ print "};" }' $< > $@ - -$(obj)/dropreason_str.c: $(srctree)/include/net/dropreason.h - $(call cmd,dropreason_str) - -$(obj)/dropreason_str.o: $(obj)/dropreason_str.c diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 84bb5e188d0d..417463da4fac 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -91,7 +91,11 @@ static struct kmem_cache *skbuff_ext_cache __ro_after_init; int sysctl_max_skb_frags __read_mostly = MAX_SKB_FRAGS; EXPORT_SYMBOL(sysctl_max_skb_frags); -/* The array 'drop_reasons' is auto-generated in dropreason_str.c */ +#undef FN +#define FN(reason) [SKB_DROP_REASON_##reason] = #reason, +const char * const drop_reasons[] = { + DEFINE_DROP_REASON(FN, FN) +}; EXPORT_SYMBOL(drop_reasons); /** -- cgit v1.2.3 From f27b405ef43319a3ceefc2123245201a63ed4e00 Mon Sep 17 00:00:00 2001 From: Lorenzo Bianconi Date: Mon, 5 Sep 2022 14:41:28 +0200 Subject: net: ethernet: mtk_eth_soc: check max allowed hash in mtk_ppe_check_skb Even if max hash configured in hw in mtk_ppe_hash_entry is MTK_PPE_ENTRIES - 1, check theoretical OOB accesses in mtk_ppe_check_skb routine Fixes: c4f033d9e03e9 ("net: ethernet: mtk_eth_soc: rework hardware flow table management") Signed-off-by: Lorenzo Bianconi Signed-off-by: David S. Miller --- drivers/net/ethernet/mediatek/mtk_ppe.h | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/net/ethernet/mediatek/mtk_ppe.h b/drivers/net/ethernet/mediatek/mtk_ppe.h index 1f5cf1c9a947..69ffce04d630 100644 --- a/drivers/net/ethernet/mediatek/mtk_ppe.h +++ b/drivers/net/ethernet/mediatek/mtk_ppe.h @@ -293,6 +293,9 @@ mtk_ppe_check_skb(struct mtk_ppe *ppe, struct sk_buff *skb, u16 hash) if (!ppe) return; + if (hash > MTK_PPE_HASH_MASK) + return; + now = (u16)jiffies; diff = now - ppe->foe_check_time[hash]; if (diff < HZ / 10) -- cgit v1.2.3 From e9b1a4f867ae9c1dbd1d71cd09cbdb3239fb4968 Mon Sep 17 00:00:00 2001 From: Yacan Liu Date: Tue, 6 Sep 2022 21:01:39 +0800 Subject: net/smc: Fix possible access to freed memory in link clear After modifying the QP to the Error state, all RX WR would be completed with WC in IB_WC_WR_FLUSH_ERR status. Current implementation does not wait for it is done, but destroy the QP and free the link group directly. So there is a risk that accessing the freed memory in tasklet context. Here is a crash example: BUG: unable to handle page fault for address: ffffffff8f220860 #PF: supervisor write access in kernel mode #PF: error_code(0x0002) - not-present page PGD f7300e067 P4D f7300e067 PUD f7300f063 PMD 8c4e45063 PTE 800ffff08c9df060 Oops: 0002 [#1] SMP PTI CPU: 1 PID: 0 Comm: swapper/1 Kdump: loaded Tainted: G S OE 5.10.0-0607+ #23 Hardware name: Inspur NF5280M4/YZMB-00689-101, BIOS 4.1.20 07/09/2018 RIP: 0010:native_queued_spin_lock_slowpath+0x176/0x1b0 Code: f3 90 48 8b 32 48 85 f6 74 f6 eb d5 c1 ee 12 83 e0 03 83 ee 01 48 c1 e0 05 48 63 f6 48 05 00 c8 02 00 48 03 04 f5 00 09 98 8e <48> 89 10 8b 42 08 85 c0 75 09 f3 90 8b 42 08 85 c0 74 f7 48 8b 32 RSP: 0018:ffffb3b6c001ebd8 EFLAGS: 00010086 RAX: ffffffff8f220860 RBX: 0000000000000246 RCX: 0000000000080000 RDX: ffff91db1f86c800 RSI: 000000000000173c RDI: ffff91db62bace00 RBP: ffff91db62bacc00 R08: 0000000000000000 R09: c00000010000028b R10: 0000000000055198 R11: ffffb3b6c001ea58 R12: ffff91db80e05010 R13: 000000000000000a R14: 0000000000000006 R15: 0000000000000040 FS: 0000000000000000(0000) GS:ffff91db1f840000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: ffffffff8f220860 CR3: 00000001f9580004 CR4: 00000000003706e0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: _raw_spin_lock_irqsave+0x30/0x40 mlx5_ib_poll_cq+0x4c/0xc50 [mlx5_ib] smc_wr_rx_tasklet_fn+0x56/0xa0 [smc] tasklet_action_common.isra.21+0x66/0x100 __do_softirq+0xd5/0x29c asm_call_irq_on_stack+0x12/0x20 do_softirq_own_stack+0x37/0x40 irq_exit_rcu+0x9d/0xa0 sysvec_call_function_single+0x34/0x80 asm_sysvec_call_function_single+0x12/0x20 Fixes: bd4ad57718cc ("smc: initialize IB transport incl. PD, MR, QP, CQ, event, WR") Signed-off-by: Yacan Liu Reviewed-by: Tony Lu Signed-off-by: David S. Miller --- net/smc/smc_core.c | 1 + net/smc/smc_core.h | 2 ++ net/smc/smc_wr.c | 5 +++++ net/smc/smc_wr.h | 5 +++++ 4 files changed, 13 insertions(+) diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c index ff49a11f57b8..ebf56cdf17db 100644 --- a/net/smc/smc_core.c +++ b/net/smc/smc_core.c @@ -757,6 +757,7 @@ int smcr_link_init(struct smc_link_group *lgr, struct smc_link *lnk, lnk->lgr = lgr; smc_lgr_hold(lgr); /* lgr_put in smcr_link_clear() */ lnk->link_idx = link_idx; + lnk->wr_rx_id_compl = 0; smc_ibdev_cnt_inc(lnk); smcr_copy_dev_info_to_link(lnk); atomic_set(&lnk->conn_cnt, 0); diff --git a/net/smc/smc_core.h b/net/smc/smc_core.h index fe8b524ad846..285f9bd8e232 100644 --- a/net/smc/smc_core.h +++ b/net/smc/smc_core.h @@ -115,8 +115,10 @@ struct smc_link { dma_addr_t wr_rx_dma_addr; /* DMA address of wr_rx_bufs */ dma_addr_t wr_rx_v2_dma_addr; /* DMA address of v2 rx buf*/ u64 wr_rx_id; /* seq # of last recv WR */ + u64 wr_rx_id_compl; /* seq # of last completed WR */ u32 wr_rx_cnt; /* number of WR recv buffers */ unsigned long wr_rx_tstamp; /* jiffies when last buf rx */ + wait_queue_head_t wr_rx_empty_wait; /* wait for RQ empty */ struct ib_reg_wr wr_reg; /* WR register memory region */ wait_queue_head_t wr_reg_wait; /* wait for wr_reg result */ diff --git a/net/smc/smc_wr.c b/net/smc/smc_wr.c index 26f8f240d9e8..b0678a417e09 100644 --- a/net/smc/smc_wr.c +++ b/net/smc/smc_wr.c @@ -454,6 +454,7 @@ static inline void smc_wr_rx_process_cqes(struct ib_wc wc[], int num) for (i = 0; i < num; i++) { link = wc[i].qp->qp_context; + link->wr_rx_id_compl = wc[i].wr_id; if (wc[i].status == IB_WC_SUCCESS) { link->wr_rx_tstamp = jiffies; smc_wr_rx_demultiplex(&wc[i]); @@ -465,6 +466,8 @@ static inline void smc_wr_rx_process_cqes(struct ib_wc wc[], int num) case IB_WC_RNR_RETRY_EXC_ERR: case IB_WC_WR_FLUSH_ERR: smcr_link_down_cond_sched(link); + if (link->wr_rx_id_compl == link->wr_rx_id) + wake_up(&link->wr_rx_empty_wait); break; default: smc_wr_rx_post(link); /* refill WR RX */ @@ -639,6 +642,7 @@ void smc_wr_free_link(struct smc_link *lnk) return; ibdev = lnk->smcibdev->ibdev; + smc_wr_drain_cq(lnk); smc_wr_wakeup_reg_wait(lnk); smc_wr_wakeup_tx_wait(lnk); @@ -889,6 +893,7 @@ int smc_wr_create_link(struct smc_link *lnk) atomic_set(&lnk->wr_tx_refcnt, 0); init_waitqueue_head(&lnk->wr_reg_wait); atomic_set(&lnk->wr_reg_refcnt, 0); + init_waitqueue_head(&lnk->wr_rx_empty_wait); return rc; dma_unmap: diff --git a/net/smc/smc_wr.h b/net/smc/smc_wr.h index a54e90a1110f..45e9b894d3f8 100644 --- a/net/smc/smc_wr.h +++ b/net/smc/smc_wr.h @@ -73,6 +73,11 @@ static inline void smc_wr_tx_link_put(struct smc_link *link) wake_up_all(&link->wr_tx_wait); } +static inline void smc_wr_drain_cq(struct smc_link *lnk) +{ + wait_event(lnk->wr_rx_empty_wait, lnk->wr_rx_id_compl == lnk->wr_rx_id); +} + static inline void smc_wr_wakeup_tx_wait(struct smc_link *lnk) { wake_up_all(&lnk->wr_tx_wait); -- cgit v1.2.3 From 5382033a35227c57a349d74752ad2527780159a9 Mon Sep 17 00:00:00 2001 From: Arun Ramadoss Date: Mon, 5 Sep 2022 20:57:50 +0530 Subject: net: phy: lan87xx: change interrupt src of link_up to comm_ready Currently phy link up/down interrupt is enabled using the LAN87xx_INTERRUPT_MASK register. In the lan87xx_read_status function, phy link is determined using the T1_MODE_STAT_REG register comm_ready bit. comm_ready bit is set using the loc_rcvr_status & rem_rcvr_status. Whenever the phy link is up, LAN87xx_INTERRUPT_SOURCE link_up bit is set first but comm_ready bit takes some time to set based on local and remote receiver status. As per the current implementation, interrupt is triggered using link_up but the comm_ready bit is still cleared in the read_status function. So, link is always down. Initially tested with the shared interrupt mechanism with switch and internal phy which is working, but after implementing interrupt controller it is not working. It can fixed either by updating the read_status function to read from LAN87XX_INTERRUPT_SOURCE register or enable the interrupt mask for comm_ready bit. But the validation team recommends the use of comm_ready for link detection. This patch fixes by enabling the comm_ready bit for link_up in the LAN87XX_INTERRUPT_MASK_2 register (MISC Bank) and link_down in LAN87xx_INTERRUPT_MASK register. Fixes: 8a1b415d70b7 ("net: phy: added ethtool master-slave configuration support") Signed-off-by: Arun Ramadoss Reviewed-by: Andrew Lunn Link: https://lore.kernel.org/r/20220905152750.5079-1-arun.ramadoss@microchip.com Signed-off-by: Paolo Abeni --- drivers/net/phy/microchip_t1.c | 58 +++++++++++++++++++++++++++++++++++++++--- 1 file changed, 54 insertions(+), 4 deletions(-) diff --git a/drivers/net/phy/microchip_t1.c b/drivers/net/phy/microchip_t1.c index d4c93d59bc53..8569a545e0a3 100644 --- a/drivers/net/phy/microchip_t1.c +++ b/drivers/net/phy/microchip_t1.c @@ -28,12 +28,16 @@ /* Interrupt Source Register */ #define LAN87XX_INTERRUPT_SOURCE (0x18) +#define LAN87XX_INTERRUPT_SOURCE_2 (0x08) /* Interrupt Mask Register */ #define LAN87XX_INTERRUPT_MASK (0x19) #define LAN87XX_MASK_LINK_UP (0x0004) #define LAN87XX_MASK_LINK_DOWN (0x0002) +#define LAN87XX_INTERRUPT_MASK_2 (0x09) +#define LAN87XX_MASK_COMM_RDY BIT(10) + /* MISC Control 1 Register */ #define LAN87XX_CTRL_1 (0x11) #define LAN87XX_MASK_RGMII_TXC_DLY_EN (0x4000) @@ -424,17 +428,55 @@ static int lan87xx_phy_config_intr(struct phy_device *phydev) int rc, val = 0; if (phydev->interrupts == PHY_INTERRUPT_ENABLED) { - /* unmask all source and clear them before enable */ - rc = phy_write(phydev, LAN87XX_INTERRUPT_MASK, 0x7FFF); + /* clear all interrupt */ + rc = phy_write(phydev, LAN87XX_INTERRUPT_MASK, val); + if (rc < 0) + return rc; + rc = phy_read(phydev, LAN87XX_INTERRUPT_SOURCE); - val = LAN87XX_MASK_LINK_UP | LAN87XX_MASK_LINK_DOWN; + if (rc < 0) + return rc; + + rc = access_ereg(phydev, PHYACC_ATTR_MODE_WRITE, + PHYACC_ATTR_BANK_MISC, + LAN87XX_INTERRUPT_MASK_2, val); + if (rc < 0) + return rc; + + rc = access_ereg(phydev, PHYACC_ATTR_MODE_READ, + PHYACC_ATTR_BANK_MISC, + LAN87XX_INTERRUPT_SOURCE_2, 0); + if (rc < 0) + return rc; + + /* enable link down and comm ready interrupt */ + val = LAN87XX_MASK_LINK_DOWN; rc = phy_write(phydev, LAN87XX_INTERRUPT_MASK, val); + if (rc < 0) + return rc; + + val = LAN87XX_MASK_COMM_RDY; + rc = access_ereg(phydev, PHYACC_ATTR_MODE_WRITE, + PHYACC_ATTR_BANK_MISC, + LAN87XX_INTERRUPT_MASK_2, val); } else { rc = phy_write(phydev, LAN87XX_INTERRUPT_MASK, val); - if (rc) + if (rc < 0) return rc; rc = phy_read(phydev, LAN87XX_INTERRUPT_SOURCE); + if (rc < 0) + return rc; + + rc = access_ereg(phydev, PHYACC_ATTR_MODE_WRITE, + PHYACC_ATTR_BANK_MISC, + LAN87XX_INTERRUPT_MASK_2, val); + if (rc < 0) + return rc; + + rc = access_ereg(phydev, PHYACC_ATTR_MODE_READ, + PHYACC_ATTR_BANK_MISC, + LAN87XX_INTERRUPT_SOURCE_2, 0); } return rc < 0 ? rc : 0; @@ -444,6 +486,14 @@ static irqreturn_t lan87xx_handle_interrupt(struct phy_device *phydev) { int irq_status; + irq_status = access_ereg(phydev, PHYACC_ATTR_MODE_READ, + PHYACC_ATTR_BANK_MISC, + LAN87XX_INTERRUPT_SOURCE_2, 0); + if (irq_status < 0) { + phy_error(phydev); + return IRQ_NONE; + } + irq_status = phy_read(phydev, LAN87XX_INTERRUPT_SOURCE); if (irq_status < 0) { phy_error(phydev); -- cgit v1.2.3 From 2f09707d0c972120bf794cfe0f0c67e2c2ddb252 Mon Sep 17 00:00:00 2001 From: Toke Høiland-Jørgensen Date: Mon, 5 Sep 2022 21:21:36 +0200 Subject: sch_sfb: Also store skb len before calling child enqueue MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Cong Wang noticed that the previous fix for sch_sfb accessing the queued skb after enqueueing it to a child qdisc was incomplete: the SFB enqueue function was also calling qdisc_qstats_backlog_inc() after enqueue, which reads the pkt len from the skb cb field. Fix this by also storing the skb len, and using the stored value to increment the backlog after enqueueing. Fixes: 9efd23297cca ("sch_sfb: Don't assume the skb is still around after enqueueing to child") Signed-off-by: Toke Høiland-Jørgensen Acked-by: Cong Wang Link: https://lore.kernel.org/r/20220905192137.965549-1-toke@toke.dk Signed-off-by: Paolo Abeni --- net/sched/sch_sfb.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/net/sched/sch_sfb.c b/net/sched/sch_sfb.c index 0d761f454ae8..2829455211f8 100644 --- a/net/sched/sch_sfb.c +++ b/net/sched/sch_sfb.c @@ -281,6 +281,7 @@ static int sfb_enqueue(struct sk_buff *skb, struct Qdisc *sch, { struct sfb_sched_data *q = qdisc_priv(sch); + unsigned int len = qdisc_pkt_len(skb); struct Qdisc *child = q->qdisc; struct tcf_proto *fl; struct sfb_skb_cb cb; @@ -403,7 +404,7 @@ enqueue: memcpy(&cb, sfb_skb_cb(skb), sizeof(cb)); ret = qdisc_enqueue(skb, child, to_free); if (likely(ret == NET_XMIT_SUCCESS)) { - qdisc_qstats_backlog_inc(sch, skb); + sch->qstats.backlog += len; sch->q.qlen++; increment_qlen(&cb, q); } else if (net_xmit_drop_count(ret)) { -- cgit v1.2.3