From 52bd2d62ce6758d811edcbd2256eb9ea7f6a56cb Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 18 Nov 2015 06:30:50 -0800 Subject: net: better skb->sender_cpu and skb->napi_id cohabitation skb->sender_cpu and skb->napi_id share a common storage, and we had various bugs about this. We had to call skb_sender_cpu_clear() in some places to not leave a prior skb->napi_id and fool netdev_pick_tx() As suggested by Alexei, we could split the space so that these errors can not happen. 0 value being reserved as the common (not initialized) value, let's reserve [1 .. NR_CPUS] range for valid sender_cpu, and [NR_CPUS+1 .. ~0U] for valid napi_id. This will allow proper busy polling support over tunnels. Signed-off-by: Eric Dumazet Suggested-by: Alexei Starovoitov Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- net/core/dev.c | 33 ++++++++++++++++----------------- 1 file changed, 16 insertions(+), 17 deletions(-) (limited to 'net/core/dev.c') diff --git a/net/core/dev.c b/net/core/dev.c index ae00b894e675..2582c24a75c6 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -182,7 +182,7 @@ EXPORT_SYMBOL(dev_base_lock); /* protects napi_hash addition/deletion and napi_gen_id */ static DEFINE_SPINLOCK(napi_hash_lock); -static unsigned int napi_gen_id; +static unsigned int napi_gen_id = NR_CPUS; static DEFINE_HASHTABLE(napi_hash, 8); static seqcount_t devnet_rename_seq; @@ -3021,7 +3021,9 @@ struct netdev_queue *netdev_pick_tx(struct net_device *dev, int queue_index = 0; #ifdef CONFIG_XPS - if (skb->sender_cpu == 0) + u32 sender_cpu = skb->sender_cpu - 1; + + if (sender_cpu >= (u32)NR_CPUS) skb->sender_cpu = raw_smp_processor_id() + 1; #endif @@ -4676,25 +4678,22 @@ EXPORT_SYMBOL_GPL(napi_by_id); void napi_hash_add(struct napi_struct *napi) { - if (!test_and_set_bit(NAPI_STATE_HASHED, &napi->state)) { + if (test_and_set_bit(NAPI_STATE_HASHED, &napi->state)) + return; - spin_lock(&napi_hash_lock); + spin_lock(&napi_hash_lock); - /* 0 is not a valid id, we also skip an id that is taken - * we expect both events to be extremely rare - */ - napi->napi_id = 0; - while (!napi->napi_id) { - napi->napi_id = ++napi_gen_id; - if (napi_by_id(napi->napi_id)) - napi->napi_id = 0; - } + /* 0..NR_CPUS+1 range is reserved for sender_cpu use */ + do { + if (unlikely(++napi_gen_id < NR_CPUS + 1)) + napi_gen_id = NR_CPUS + 1; + } while (napi_by_id(napi_gen_id)); + napi->napi_id = napi_gen_id; - hlist_add_head_rcu(&napi->napi_hash_node, - &napi_hash[napi->napi_id % HASH_SIZE(napi_hash)]); + hlist_add_head_rcu(&napi->napi_hash_node, + &napi_hash[napi->napi_id % HASH_SIZE(napi_hash)]); - spin_unlock(&napi_hash_lock); - } + spin_unlock(&napi_hash_lock); } EXPORT_SYMBOL_GPL(napi_hash_add); -- cgit v1.2.3 From 02d62e86fe892c59a1259d089d4d16ac76977a37 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 18 Nov 2015 06:30:52 -0800 Subject: net: un-inline sk_busy_loop() There is really little gain from inlining this big function. We'll soon make it even bigger in following patches. This means we no longer need to export napi_by_id() Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/linux/netdevice.h | 9 --------- include/net/busy_poll.h | 45 +----------------------------------------- net/core/dev.c | 50 +++++++++++++++++++++++++++++++++++++++++++++-- 3 files changed, 49 insertions(+), 55 deletions(-) (limited to 'net/core/dev.c') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 67bfac1abfc1..2020a89df12b 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -460,15 +460,6 @@ static inline void napi_complete(struct napi_struct *n) return napi_complete_done(n, 0); } -/** - * napi_by_id - lookup a NAPI by napi_id - * @napi_id: hashed napi_id - * - * lookup @napi_id in napi_hash table - * must be called under rcu_read_lock() - */ -struct napi_struct *napi_by_id(unsigned int napi_id); - /** * napi_hash_add - add a NAPI to global hashtable * @napi: napi context diff --git a/include/net/busy_poll.h b/include/net/busy_poll.h index 1d67fb6b23a0..2fbeb1313c0f 100644 --- a/include/net/busy_poll.h +++ b/include/net/busy_poll.h @@ -72,50 +72,7 @@ static inline bool busy_loop_timeout(unsigned long end_time) return time_after(now, end_time); } -/* when used in sock_poll() nonblock is known at compile time to be true - * so the loop and end_time will be optimized out - */ -static inline bool sk_busy_loop(struct sock *sk, int nonblock) -{ - unsigned long end_time = !nonblock ? sk_busy_loop_end_time(sk) : 0; - const struct net_device_ops *ops; - struct napi_struct *napi; - int rc = false; - - /* - * rcu read lock for napi hash - * bh so we don't race with net_rx_action - */ - rcu_read_lock_bh(); - - napi = napi_by_id(sk->sk_napi_id); - if (!napi) - goto out; - - ops = napi->dev->netdev_ops; - if (!ops->ndo_busy_poll) - goto out; - - do { - rc = ops->ndo_busy_poll(napi); - - if (rc == LL_FLUSH_FAILED) - break; /* permanent failure */ - - if (rc > 0) - /* local bh are disabled so it is ok to use _BH */ - NET_ADD_STATS_BH(sock_net(sk), - LINUX_MIB_BUSYPOLLRXPACKETS, rc); - cpu_relax(); - - } while (!nonblock && skb_queue_empty(&sk->sk_receive_queue) && - !need_resched() && !busy_loop_timeout(end_time)); - - rc = !skb_queue_empty(&sk->sk_receive_queue); -out: - rcu_read_unlock_bh(); - return rc; -} +bool sk_busy_loop(struct sock *sk, int nonblock); /* used in the NIC receive handler to mark the skb */ static inline void skb_mark_napi_id(struct sk_buff *skb, diff --git a/net/core/dev.c b/net/core/dev.c index 2582c24a75c6..74a816b299df 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -96,6 +96,7 @@ #include #include #include +#include #include #include #include @@ -4663,7 +4664,7 @@ void napi_complete_done(struct napi_struct *n, int work_done) EXPORT_SYMBOL(napi_complete_done); /* must be called under rcu_read_lock(), as we dont take a reference */ -struct napi_struct *napi_by_id(unsigned int napi_id) +static struct napi_struct *napi_by_id(unsigned int napi_id) { unsigned int hash = napi_id % HASH_SIZE(napi_hash); struct napi_struct *napi; @@ -4674,7 +4675,52 @@ struct napi_struct *napi_by_id(unsigned int napi_id) return NULL; } -EXPORT_SYMBOL_GPL(napi_by_id); + +#if defined(CONFIG_NET_RX_BUSY_POLL) +bool sk_busy_loop(struct sock *sk, int nonblock) +{ + unsigned long end_time = !nonblock ? sk_busy_loop_end_time(sk) : 0; + const struct net_device_ops *ops; + struct napi_struct *napi; + int rc = false; + + /* + * rcu read lock for napi hash + * bh so we don't race with net_rx_action + */ + rcu_read_lock_bh(); + + napi = napi_by_id(sk->sk_napi_id); + if (!napi) + goto out; + + ops = napi->dev->netdev_ops; + if (!ops->ndo_busy_poll) + goto out; + + do { + rc = ops->ndo_busy_poll(napi); + + if (rc == LL_FLUSH_FAILED) + break; /* permanent failure */ + + if (rc > 0) + /* local bh are disabled so it is ok to use _BH */ + NET_ADD_STATS_BH(sock_net(sk), + LINUX_MIB_BUSYPOLLRXPACKETS, rc); + cpu_relax(); + + } while (!nonblock && skb_queue_empty(&sk->sk_receive_queue) && + !need_resched() && !busy_loop_timeout(end_time)); + + rc = !skb_queue_empty(&sk->sk_receive_queue); +out: + rcu_read_unlock_bh(); + return rc; +} +EXPORT_SYMBOL(sk_busy_loop); + +#endif /* CONFIG_NET_RX_BUSY_POLL */ void napi_hash_add(struct napi_struct *napi) { -- cgit v1.2.3 From 2a028ecb76497d05e5cd4e3e8b09d965cac2e3f1 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 18 Nov 2015 06:30:53 -0800 Subject: net: allow BH servicing in sk_busy_loop() Instead of blocking BH in whole sk_busy_loop(), block them only around ->ndo_busy_poll() calls. This has many benefits. 1) allow tunneled traffic to use busy poll as well as native traffic. Tunnels handlers usually call netif_rx() and depend on net_rx_action() being run (from sofirq handler) 2) allow RFS/RPS being used (sending IPI to other cpus if needed) 3) use the 'lets burn cpu cycles' budget to do useful work (like TX completions, timers, RCU callbacks...) 4) reduce BH latencies, making busy poll a better citizen. Tested: Tested with SIT tunnel lpaa5:~# echo 0 >/proc/sys/net/core/busy_read lpaa5:~# ./netperf -H 2002:af6:786::1 -t TCP_RR MIGRATED TCP REQUEST/RESPONSE TEST from ::0 (::) port 0 AF_INET6 to 2002:af6:786::1 () port 0 AF_INET6 : first burst 0 Local /Remote Socket Size Request Resp. Elapsed Trans. Send Recv Size Size Time Rate bytes Bytes bytes bytes secs. per sec 16384 87380 1 1 10.00 37373.93 16384 87380 Now enable busy poll on both hosts lpaa5:~# echo 70 >/proc/sys/net/core/busy_read lpaa6:~# echo 70 >/proc/sys/net/core/busy_read lpaa5:~# ./netperf -H 2002:af6:786::1 -t TCP_RR MIGRATED TCP REQUEST/RESPONSE TEST from ::0 (::) port 0 AF_INET6 to 2002:af6:786::1 () port 0 AF_INET6 : first burst 0 Local /Remote Socket Size Request Resp. Elapsed Trans. Send Recv Size Size Time Rate bytes Bytes bytes bytes secs. per sec 16384 87380 1 1 10.00 58314.77 16384 87380 Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/dev.c | 18 +++++++----------- 1 file changed, 7 insertions(+), 11 deletions(-) (limited to 'net/core/dev.c') diff --git a/net/core/dev.c b/net/core/dev.c index 74a816b299df..2002eec2617d 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -4684,11 +4684,7 @@ bool sk_busy_loop(struct sock *sk, int nonblock) struct napi_struct *napi; int rc = false; - /* - * rcu read lock for napi hash - * bh so we don't race with net_rx_action - */ - rcu_read_lock_bh(); + rcu_read_lock(); napi = napi_by_id(sk->sk_napi_id); if (!napi) @@ -4699,23 +4695,23 @@ bool sk_busy_loop(struct sock *sk, int nonblock) goto out; do { + local_bh_disable(); rc = ops->ndo_busy_poll(napi); + if (rc > 0) + NET_ADD_STATS_BH(sock_net(sk), + LINUX_MIB_BUSYPOLLRXPACKETS, rc); + local_bh_enable(); if (rc == LL_FLUSH_FAILED) break; /* permanent failure */ - if (rc > 0) - /* local bh are disabled so it is ok to use _BH */ - NET_ADD_STATS_BH(sock_net(sk), - LINUX_MIB_BUSYPOLLRXPACKETS, rc); cpu_relax(); - } while (!nonblock && skb_queue_empty(&sk->sk_receive_queue) && !need_resched() && !busy_loop_timeout(end_time)); rc = !skb_queue_empty(&sk->sk_receive_queue); out: - rcu_read_unlock_bh(); + rcu_read_unlock(); return rc; } EXPORT_SYMBOL(sk_busy_loop); -- cgit v1.2.3 From ce6aea93f7510437dde625b77a7a2f4d20b72660 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 18 Nov 2015 06:30:54 -0800 Subject: net: network drivers no longer need to implement ndo_busy_poll() Instead of having to implement complex ndo_busy_poll() method, drivers can simply rely on NAPI poll logic. Busy polling gains are mainly coming from polling itself, not on exact details on how we poll the device. ndo_busy_poll() if implemented can avoid touching napi state, but it adds extra synchronization between normal napi->poll() and busy poll handler, slowing down the common path (non busy polling) with extra atomic operations. In practice few drivers ever got busy poll because of the complexity. We could go one step further, and make busy polling available for all NAPI drivers, but this would require that all netif_napi_del() calls are done in process context so that we can call synchronize_rcu(). Full audit would be required. Before this is done, a driver still needs to call : - skb_mark_napi_id() for each skb provided to the stack. - napi_hash_add() and napi_hash_del() to allocate a napi_id per napi struct. - Make sure RCU grace period is respected after napi_hash_del() before memory containing napi structure is freed. Followup patch implements busy poll for mlx5 driver as an example. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/dev.c | 25 ++++++++++++++++++++----- 1 file changed, 20 insertions(+), 5 deletions(-) (limited to 'net/core/dev.c') diff --git a/net/core/dev.c b/net/core/dev.c index 2002eec2617d..93009610aee8 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -4677,10 +4677,11 @@ static struct napi_struct *napi_by_id(unsigned int napi_id) } #if defined(CONFIG_NET_RX_BUSY_POLL) +#define BUSY_POLL_BUDGET 8 bool sk_busy_loop(struct sock *sk, int nonblock) { unsigned long end_time = !nonblock ? sk_busy_loop_end_time(sk) : 0; - const struct net_device_ops *ops; + int (*busy_poll)(struct napi_struct *dev); struct napi_struct *napi; int rc = false; @@ -4690,13 +4691,27 @@ bool sk_busy_loop(struct sock *sk, int nonblock) if (!napi) goto out; - ops = napi->dev->netdev_ops; - if (!ops->ndo_busy_poll) - goto out; + /* Note: ndo_busy_poll method is optional in linux-4.5 */ + busy_poll = napi->dev->netdev_ops->ndo_busy_poll; do { + rc = 0; local_bh_disable(); - rc = ops->ndo_busy_poll(napi); + if (busy_poll) { + rc = busy_poll(napi); + } else if (napi_schedule_prep(napi)) { + void *have = netpoll_poll_lock(napi); + + if (test_bit(NAPI_STATE_SCHED, &napi->state)) { + rc = napi->poll(napi, BUSY_POLL_BUDGET); + trace_napi_poll(napi); + if (rc == BUSY_POLL_BUDGET) { + napi_complete_done(napi, rc); + napi_schedule(napi); + } + } + netpoll_poll_unlock(have); + } if (rc > 0) NET_ADD_STATS_BH(sock_net(sk), LINUX_MIB_BUSYPOLLRXPACKETS, rc); -- cgit v1.2.3 From 93f93a4404159ecf7e9148f5ad0718ec702ac4cb Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 18 Nov 2015 06:30:59 -0800 Subject: net: move skb_mark_napi_id() into core networking stack We would like to automatically provide busy polling support to all NAPI drivers, without them having to implement anything. skb_mark_napi_id() can be called from napi_gro_receive() and napi_get_frags(). Few drivers are still calling skb_mark_napi_id() because they use netif_receive_skb(). They should eventually call napi_gro_receive() instead. I will leave this to drivers maintainers. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- drivers/net/ethernet/amd/xgbe/xgbe-drv.c | 1 - drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c | 2 -- drivers/net/ethernet/chelsio/cxgb4/sge.c | 1 - drivers/net/ethernet/emulex/benet/be_main.c | 1 - drivers/net/ethernet/intel/i40e/i40e_txrx.c | 1 - drivers/net/ethernet/intel/i40evf/i40e_txrx.c | 1 - drivers/net/ethernet/intel/ixgbe/ixgbe_main.c | 2 +- drivers/net/ethernet/mellanox/mlx4/en_rx.c | 3 --- drivers/net/ethernet/mellanox/mlx5/core/en_rx.c | 1 - drivers/net/ethernet/myricom/myri10ge/myri10ge.c | 2 +- drivers/net/ethernet/sfc/rx.c | 1 - drivers/net/virtio_net.c | 2 -- net/core/dev.c | 2 ++ 13 files changed, 4 insertions(+), 16 deletions(-) (limited to 'net/core/dev.c') diff --git a/drivers/net/ethernet/amd/xgbe/xgbe-drv.c b/drivers/net/ethernet/amd/xgbe/xgbe-drv.c index 53ce1222b11d..8a9b493566c9 100644 --- a/drivers/net/ethernet/amd/xgbe/xgbe-drv.c +++ b/drivers/net/ethernet/amd/xgbe/xgbe-drv.c @@ -2024,7 +2024,6 @@ read_again: skb->dev = netdev; skb->protocol = eth_type_trans(skb, netdev); skb_record_rx_queue(skb, channel->queue_index); - skb_mark_napi_id(skb, napi); napi_gro_receive(napi, skb); diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c index ca208a7eecd5..ab9222924bd9 100644 --- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c +++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c @@ -1094,8 +1094,6 @@ reuse_rx: __vlan_hwaccel_put_tag(skb, htons(ETH_P_8021Q), le16_to_cpu(cqe_fp->vlan_tag)); - skb_mark_napi_id(skb, &fp->napi); - napi_gro_receive(&fp->napi, skb); next_rx: rx_buf->data = NULL; diff --git a/drivers/net/ethernet/chelsio/cxgb4/sge.c b/drivers/net/ethernet/chelsio/cxgb4/sge.c index b7b93e7a643d..f650f295f264 100644 --- a/drivers/net/ethernet/chelsio/cxgb4/sge.c +++ b/drivers/net/ethernet/chelsio/cxgb4/sge.c @@ -1864,7 +1864,6 @@ static void do_gro(struct sge_eth_rxq *rxq, const struct pkt_gl *gl, skb->truesize += skb->data_len; skb->ip_summed = CHECKSUM_UNNECESSARY; skb_record_rx_queue(skb, rxq->rspq.idx); - skb_mark_napi_id(skb, &rxq->rspq.napi); pi = netdev_priv(skb->dev); if (pi->rxtstamp) cxgb4_sgetim_to_hwtstamp(adapter, skb_hwtstamps(skb), diff --git a/drivers/net/ethernet/emulex/benet/be_main.c b/drivers/net/ethernet/emulex/benet/be_main.c index b6ad02909d6b..c29d62496ad9 100644 --- a/drivers/net/ethernet/emulex/benet/be_main.c +++ b/drivers/net/ethernet/emulex/benet/be_main.c @@ -2184,7 +2184,6 @@ static void be_rx_compl_process_gro(struct be_rx_obj *rxo, skb_set_hash(skb, rxcp->rss_hash, PKT_HASH_TYPE_L3); skb->csum_level = rxcp->tunneled; - skb_mark_napi_id(skb, napi); if (rxcp->vlanf) __vlan_hwaccel_put_tag(skb, htons(ETH_P_8021Q), rxcp->vlan_tag); diff --git a/drivers/net/ethernet/intel/i40e/i40e_txrx.c b/drivers/net/ethernet/intel/i40e/i40e_txrx.c index 635b3ac17877..6649ce4ba2de 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_txrx.c +++ b/drivers/net/ethernet/intel/i40e/i40e_txrx.c @@ -1632,7 +1632,6 @@ static int i40e_clean_rx_irq_ps(struct i40e_ring *rx_ring, int budget) continue; } #endif - skb_mark_napi_id(skb, &rx_ring->q_vector->napi); i40e_receive_skb(rx_ring, skb, vlan_tag); rx_desc->wb.qword1.status_error_len = 0; diff --git a/drivers/net/ethernet/intel/i40evf/i40e_txrx.c b/drivers/net/ethernet/intel/i40evf/i40e_txrx.c index 47e9a90d6b10..77968b184b1f 100644 --- a/drivers/net/ethernet/intel/i40evf/i40e_txrx.c +++ b/drivers/net/ethernet/intel/i40evf/i40e_txrx.c @@ -1090,7 +1090,6 @@ static int i40e_clean_rx_irq_ps(struct i40e_ring *rx_ring, int budget) continue; } #endif - skb_mark_napi_id(skb, &rx_ring->q_vector->napi); i40e_receive_skb(rx_ring, skb, vlan_tag); rx_desc->wb.qword1.status_error_len = 0; diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c index 47395ff5d908..4089d776d01a 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c @@ -1659,6 +1659,7 @@ static void ixgbe_process_skb_fields(struct ixgbe_ring *rx_ring, static void ixgbe_rx_skb(struct ixgbe_q_vector *q_vector, struct sk_buff *skb) { + skb_mark_napi_id(skb, &q_vector->napi); if (ixgbe_qv_busy_polling(q_vector)) netif_receive_skb(skb); else @@ -2123,7 +2124,6 @@ static int ixgbe_clean_rx_irq(struct ixgbe_q_vector *q_vector, } #endif /* IXGBE_FCOE */ - skb_mark_napi_id(skb, &q_vector->napi); ixgbe_rx_skb(q_vector, skb); /* update budget accounting */ diff --git a/drivers/net/ethernet/mellanox/mlx4/en_rx.c b/drivers/net/ethernet/mellanox/mlx4/en_rx.c index 1feead34093b..41440b2b20a3 100644 --- a/drivers/net/ethernet/mellanox/mlx4/en_rx.c +++ b/drivers/net/ethernet/mellanox/mlx4/en_rx.c @@ -925,7 +925,6 @@ int mlx4_en_process_rx_cq(struct net_device *dev, struct mlx4_en_cq *cq, int bud PKT_HASH_TYPE_L3); skb_record_rx_queue(gro_skb, cq->ring); - skb_mark_napi_id(gro_skb, &cq->napi); if (ring->hwtstamp_rx_filter == HWTSTAMP_FILTER_ALL) { timestamp = mlx4_en_get_cqe_ts(cqe); @@ -988,8 +987,6 @@ int mlx4_en_process_rx_cq(struct net_device *dev, struct mlx4_en_cq *cq, int bud timestamp); } - skb_mark_napi_id(skb, &cq->napi); - napi_gro_receive(&cq->napi, skb); next: for (nr = 0; nr < priv->num_frags; nr++) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c index fe752f8e24b9..7c8c4088d1be 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c @@ -243,7 +243,6 @@ int mlx5e_poll_rx_cq(struct mlx5e_cq *cq, int budget) wqe = mlx5_wq_ll_get_wqe(&rq->wq, wqe_counter); skb = rq->skb[wqe_counter]; prefetch(skb->data); - skb_mark_napi_id(skb, cq->napi); rq->skb[wqe_counter] = NULL; dma_unmap_single(rq->pdev, diff --git a/drivers/net/ethernet/myricom/myri10ge/myri10ge.c b/drivers/net/ethernet/myricom/myri10ge/myri10ge.c index 83651ac8ddb9..acf866147d65 100644 --- a/drivers/net/ethernet/myricom/myri10ge/myri10ge.c +++ b/drivers/net/ethernet/myricom/myri10ge/myri10ge.c @@ -1488,7 +1488,6 @@ myri10ge_rx_done(struct myri10ge_slice_state *ss, int len, __wsum csum) } myri10ge_vlan_rx(mgp->dev, va, skb); skb_record_rx_queue(skb, ss - &mgp->ss[0]); - skb_mark_napi_id(skb, &ss->napi); if (polling) { int hlen; @@ -1506,6 +1505,7 @@ myri10ge_rx_done(struct myri10ge_slice_state *ss, int len, __wsum csum) skb->data_len -= hlen; skb->tail += hlen; skb->protocol = eth_type_trans(skb, dev); + skb_mark_napi_id(skb, &ss->napi); netif_receive_skb(skb); } else diff --git a/drivers/net/ethernet/sfc/rx.c b/drivers/net/ethernet/sfc/rx.c index 809ea4610a77..8956995b2fe7 100644 --- a/drivers/net/ethernet/sfc/rx.c +++ b/drivers/net/ethernet/sfc/rx.c @@ -463,7 +463,6 @@ efx_rx_packet_gro(struct efx_channel *channel, struct efx_rx_buffer *rx_buf, skb_record_rx_queue(skb, channel->rx_queue.core_index); - skb_mark_napi_id(skb, &channel->napi_str); gro_result = napi_gro_frags(napi); if (gro_result != GRO_DROP) channel->irq_mod_score += 2; diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c index d8838dedb7a4..d1d14cecf450 100644 --- a/drivers/net/virtio_net.c +++ b/drivers/net/virtio_net.c @@ -516,8 +516,6 @@ static void receive_buf(struct virtnet_info *vi, struct receive_queue *rq, skb_shinfo(skb)->gso_segs = 0; } - skb_mark_napi_id(skb, &rq->napi); - napi_gro_receive(&rq->napi, skb); return; diff --git a/net/core/dev.c b/net/core/dev.c index 93009610aee8..83b48747928c 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -4356,6 +4356,7 @@ static gro_result_t napi_skb_finish(gro_result_t ret, struct sk_buff *skb) gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb) { + skb_mark_napi_id(skb, napi); trace_napi_gro_receive_entry(skb); skb_gro_reset_offset(skb); @@ -4390,6 +4391,7 @@ struct sk_buff *napi_get_frags(struct napi_struct *napi) if (!skb) { skb = napi_alloc_skb(napi, GRO_MAX_HEAD); napi->skb = skb; + skb_mark_napi_id(skb, napi); } return skb; } -- cgit v1.2.3 From d64b5e85bfe2fe4c790abcbd16d9ae32391ddd7e Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 18 Nov 2015 06:31:00 -0800 Subject: net: add netif_tx_napi_add() netif_tx_napi_add() is a variant of netif_napi_add() It should be used by drivers that use a napi structure to exclusively poll TX. We do not want to add this kind of napi in napi_hash[] in following patches, adding generic busy polling to all NAPI drivers. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- drivers/net/ethernet/broadcom/bcmsysport.c | 2 +- drivers/net/ethernet/broadcom/genet/bcmgenet.c | 4 ++-- .../net/ethernet/freescale/fs_enet/fs_enet-main.c | 2 +- drivers/net/ethernet/freescale/gianfar.c | 4 ++-- drivers/net/ethernet/mellanox/mlx4/en_cq.c | 4 ++-- drivers/net/ethernet/qlogic/qlcnic/qlcnic_io.c | 4 ++-- drivers/net/ethernet/rocker/rocker.c | 2 +- drivers/net/ethernet/ti/cpsw.c | 2 +- drivers/net/ethernet/ti/netcp_core.c | 2 +- drivers/net/wireless/ath/wil6210/netdev.c | 2 +- include/linux/netdevice.h | 23 +++++++++++++++++++++- net/core/dev.c | 3 ++- 12 files changed, 38 insertions(+), 16 deletions(-) (limited to 'net/core/dev.c') diff --git a/drivers/net/ethernet/broadcom/bcmsysport.c b/drivers/net/ethernet/broadcom/bcmsysport.c index 858106352ce9..993c780bdfab 100644 --- a/drivers/net/ethernet/broadcom/bcmsysport.c +++ b/drivers/net/ethernet/broadcom/bcmsysport.c @@ -1216,7 +1216,7 @@ static int bcm_sysport_init_tx_ring(struct bcm_sysport_priv *priv, /* Initialize SW view of the ring */ spin_lock_init(&ring->lock); ring->priv = priv; - netif_napi_add(priv->netdev, &ring->napi, bcm_sysport_tx_poll, 64); + netif_tx_napi_add(priv->netdev, &ring->napi, bcm_sysport_tx_poll, 64); ring->index = index; ring->size = size; ring->alloc_size = ring->size; diff --git a/drivers/net/ethernet/broadcom/genet/bcmgenet.c b/drivers/net/ethernet/broadcom/genet/bcmgenet.c index 17f017ab4dac..b15a60d787c7 100644 --- a/drivers/net/ethernet/broadcom/genet/bcmgenet.c +++ b/drivers/net/ethernet/broadcom/genet/bcmgenet.c @@ -2041,11 +2041,11 @@ static void bcmgenet_init_tx_napi(struct bcmgenet_priv *priv) for (i = 0; i < priv->hw_params->tx_queues; ++i) { ring = &priv->tx_rings[i]; - netif_napi_add(priv->dev, &ring->napi, bcmgenet_tx_poll, 64); + netif_tx_napi_add(priv->dev, &ring->napi, bcmgenet_tx_poll, 64); } ring = &priv->tx_rings[DESC_INDEX]; - netif_napi_add(priv->dev, &ring->napi, bcmgenet_tx_poll, 64); + netif_tx_napi_add(priv->dev, &ring->napi, bcmgenet_tx_poll, 64); } static void bcmgenet_enable_tx_napi(struct bcmgenet_priv *priv) diff --git a/drivers/net/ethernet/freescale/fs_enet/fs_enet-main.c b/drivers/net/ethernet/freescale/fs_enet/fs_enet-main.c index cf8e54652df9..48a9c176e0d1 100644 --- a/drivers/net/ethernet/freescale/fs_enet/fs_enet-main.c +++ b/drivers/net/ethernet/freescale/fs_enet/fs_enet-main.c @@ -1050,7 +1050,7 @@ static int fs_enet_probe(struct platform_device *ofdev) ndev->netdev_ops = &fs_enet_netdev_ops; ndev->watchdog_timeo = 2 * HZ; netif_napi_add(ndev, &fep->napi, fs_enet_rx_napi, fpi->napi_weight); - netif_napi_add(ndev, &fep->napi_tx, fs_enet_tx_napi, 2); + netif_tx_napi_add(ndev, &fep->napi_tx, fs_enet_tx_napi, 2); ndev->ethtool_ops = &fs_ethtool_ops; diff --git a/drivers/net/ethernet/freescale/gianfar.c b/drivers/net/ethernet/freescale/gianfar.c index 3e6b9b437497..c8bc43e99a35 100644 --- a/drivers/net/ethernet/freescale/gianfar.c +++ b/drivers/net/ethernet/freescale/gianfar.c @@ -1347,12 +1347,12 @@ static int gfar_probe(struct platform_device *ofdev) if (priv->poll_mode == GFAR_SQ_POLLING) { netif_napi_add(dev, &priv->gfargrp[i].napi_rx, gfar_poll_rx_sq, GFAR_DEV_WEIGHT); - netif_napi_add(dev, &priv->gfargrp[i].napi_tx, + netif_tx_napi_add(dev, &priv->gfargrp[i].napi_tx, gfar_poll_tx_sq, 2); } else { netif_napi_add(dev, &priv->gfargrp[i].napi_rx, gfar_poll_rx, GFAR_DEV_WEIGHT); - netif_napi_add(dev, &priv->gfargrp[i].napi_tx, + netif_tx_napi_add(dev, &priv->gfargrp[i].napi_tx, gfar_poll_tx, 2); } } diff --git a/drivers/net/ethernet/mellanox/mlx4/en_cq.c b/drivers/net/ethernet/mellanox/mlx4/en_cq.c index eb8a4988de63..3a6176fea78d 100644 --- a/drivers/net/ethernet/mellanox/mlx4/en_cq.c +++ b/drivers/net/ethernet/mellanox/mlx4/en_cq.c @@ -156,8 +156,8 @@ int mlx4_en_activate_cq(struct mlx4_en_priv *priv, struct mlx4_en_cq *cq, cq->mcq.event = mlx4_en_cq_event; if (cq->is_tx) { - netif_napi_add(cq->dev, &cq->napi, mlx4_en_poll_tx_cq, - NAPI_POLL_WEIGHT); + netif_tx_napi_add(cq->dev, &cq->napi, mlx4_en_poll_tx_cq, + NAPI_POLL_WEIGHT); } else { netif_napi_add(cq->dev, &cq->napi, mlx4_en_poll_rx_cq, 64); napi_hash_add(&cq->napi); diff --git a/drivers/net/ethernet/qlogic/qlcnic/qlcnic_io.c b/drivers/net/ethernet/qlogic/qlcnic/qlcnic_io.c index d4b5085a21fa..7bd6f25b4625 100644 --- a/drivers/net/ethernet/qlogic/qlcnic/qlcnic_io.c +++ b/drivers/net/ethernet/qlogic/qlcnic/qlcnic_io.c @@ -1604,7 +1604,7 @@ int qlcnic_82xx_napi_add(struct qlcnic_adapter *adapter, if (qlcnic_check_multi_tx(adapter) && !adapter->ahw->diag_test) { for (ring = 0; ring < adapter->drv_tx_rings; ring++) { tx_ring = &adapter->tx_ring[ring]; - netif_napi_add(netdev, &tx_ring->napi, qlcnic_tx_poll, + netif_tx_napi_add(netdev, &tx_ring->napi, qlcnic_tx_poll, NAPI_POLL_WEIGHT); } } @@ -2135,7 +2135,7 @@ int qlcnic_83xx_napi_add(struct qlcnic_adapter *adapter, !(adapter->flags & QLCNIC_TX_INTR_SHARED)) { for (ring = 0; ring < adapter->drv_tx_rings; ring++) { tx_ring = &adapter->tx_ring[ring]; - netif_napi_add(netdev, &tx_ring->napi, + netif_tx_napi_add(netdev, &tx_ring->napi, qlcnic_83xx_msix_tx_poll, NAPI_POLL_WEIGHT); } diff --git a/drivers/net/ethernet/rocker/rocker.c b/drivers/net/ethernet/rocker/rocker.c index e9f2349e98bc..a4ab71d43e4e 100644 --- a/drivers/net/ethernet/rocker/rocker.c +++ b/drivers/net/ethernet/rocker/rocker.c @@ -4998,7 +4998,7 @@ static int rocker_probe_port(struct rocker *rocker, unsigned int port_number) dev->netdev_ops = &rocker_port_netdev_ops; dev->ethtool_ops = &rocker_port_ethtool_ops; dev->switchdev_ops = &rocker_port_switchdev_ops; - netif_napi_add(dev, &rocker_port->napi_tx, rocker_port_poll_tx, + netif_tx_napi_add(dev, &rocker_port->napi_tx, rocker_port_poll_tx, NAPI_POLL_WEIGHT); netif_napi_add(dev, &rocker_port->napi_rx, rocker_port_poll_rx, NAPI_POLL_WEIGHT); diff --git a/drivers/net/ethernet/ti/cpsw.c b/drivers/net/ethernet/ti/cpsw.c index 48b92c9de12a..15322c08de80 100644 --- a/drivers/net/ethernet/ti/cpsw.c +++ b/drivers/net/ethernet/ti/cpsw.c @@ -2469,7 +2469,7 @@ static int cpsw_probe(struct platform_device *pdev) ndev->netdev_ops = &cpsw_netdev_ops; ndev->ethtool_ops = &cpsw_ethtool_ops; netif_napi_add(ndev, &priv->napi_rx, cpsw_rx_poll, CPSW_POLL_WEIGHT); - netif_napi_add(ndev, &priv->napi_tx, cpsw_tx_poll, CPSW_POLL_WEIGHT); + netif_tx_napi_add(ndev, &priv->napi_tx, cpsw_tx_poll, CPSW_POLL_WEIGHT); /* register the network device */ SET_NETDEV_DEV(ndev, &pdev->dev); diff --git a/drivers/net/ethernet/ti/netcp_core.c b/drivers/net/ethernet/ti/netcp_core.c index 37b9b39192ec..e5e20e734f21 100644 --- a/drivers/net/ethernet/ti/netcp_core.c +++ b/drivers/net/ethernet/ti/netcp_core.c @@ -1990,7 +1990,7 @@ static int netcp_create_interface(struct netcp_device *netcp_device, /* NAPI register */ netif_napi_add(ndev, &netcp->rx_napi, netcp_rx_poll, NETCP_NAPI_WEIGHT); - netif_napi_add(ndev, &netcp->tx_napi, netcp_tx_poll, NETCP_NAPI_WEIGHT); + netif_tx_napi_add(ndev, &netcp->tx_napi, netcp_tx_poll, NETCP_NAPI_WEIGHT); /* Register the network device */ ndev->dev_id = 0; diff --git a/drivers/net/wireless/ath/wil6210/netdev.c b/drivers/net/wireless/ath/wil6210/netdev.c index e3b3c8fb4605..56aaa2d4fb0e 100644 --- a/drivers/net/wireless/ath/wil6210/netdev.c +++ b/drivers/net/wireless/ath/wil6210/netdev.c @@ -183,7 +183,7 @@ void *wil_if_alloc(struct device *dev) netif_napi_add(ndev, &wil->napi_rx, wil6210_netdev_poll_rx, WIL6210_NAPI_BUDGET); - netif_napi_add(ndev, &wil->napi_tx, wil6210_netdev_poll_tx, + netif_tx_napi_add(ndev, &wil->napi_tx, wil6210_netdev_poll_tx, WIL6210_NAPI_BUDGET); netif_tx_stop_all_queues(ndev); diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 2020a89df12b..838935d1cdbb 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -326,7 +326,8 @@ enum { NAPI_STATE_SCHED, /* Poll is scheduled */ NAPI_STATE_DISABLE, /* Disable pending */ NAPI_STATE_NPSVC, /* Netpoll - don't dequeue from poll_list */ - NAPI_STATE_HASHED, /* In NAPI hash */ + NAPI_STATE_HASHED, /* In NAPI hash (busy polling possible) */ + NAPI_STATE_NO_BUSY_POLL,/* Do not add in napi_hash, no busy polling */ }; enum gro_result { @@ -1938,6 +1939,26 @@ static inline void *netdev_priv(const struct net_device *dev) void netif_napi_add(struct net_device *dev, struct napi_struct *napi, int (*poll)(struct napi_struct *, int), int weight); +/** + * netif_tx_napi_add - initialize a napi context + * @dev: network device + * @napi: napi context + * @poll: polling function + * @weight: default weight + * + * This variant of netif_napi_add() should be used from drivers using NAPI + * to exclusively poll a TX queue. + * This will avoid we add it into napi_hash[], thus polluting this hash table. + */ +static inline void netif_tx_napi_add(struct net_device *dev, + struct napi_struct *napi, + int (*poll)(struct napi_struct *, int), + int weight) +{ + set_bit(NAPI_STATE_NO_BUSY_POLL, &napi->state); + netif_napi_add(dev, napi, poll, weight); +} + /** * netif_napi_del - remove a napi context * @napi: napi context diff --git a/net/core/dev.c b/net/core/dev.c index 83b48747928c..ff58a8bc5e3c 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -4737,7 +4737,8 @@ EXPORT_SYMBOL(sk_busy_loop); void napi_hash_add(struct napi_struct *napi) { - if (test_and_set_bit(NAPI_STATE_HASHED, &napi->state)) + if (test_bit(NAPI_STATE_NO_BUSY_POLL, &napi->state) || + test_and_set_bit(NAPI_STATE_HASHED, &napi->state)) return; spin_lock(&napi_hash_lock); -- cgit v1.2.3 From 6180d9de61a5c461f9e3efef5417a844701dbbb2 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 18 Nov 2015 06:31:01 -0800 Subject: net: move napi_hash[] into read mostly section We do not often add/delete a napi context. Moving napi_hash[] into read mostly section avoids potential false sharing. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/linux/hashtable.h | 4 ++++ net/core/dev.c | 2 +- 2 files changed, 5 insertions(+), 1 deletion(-) (limited to 'net/core/dev.c') diff --git a/include/linux/hashtable.h b/include/linux/hashtable.h index 519b6e2d769e..661e5c2a8e2a 100644 --- a/include/linux/hashtable.h +++ b/include/linux/hashtable.h @@ -16,6 +16,10 @@ struct hlist_head name[1 << (bits)] = \ { [0 ... ((1 << (bits)) - 1)] = HLIST_HEAD_INIT } +#define DEFINE_READ_MOSTLY_HASHTABLE(name, bits) \ + struct hlist_head name[1 << (bits)] __read_mostly = \ + { [0 ... ((1 << (bits)) - 1)] = HLIST_HEAD_INIT } + #define DECLARE_HASHTABLE(name, bits) \ struct hlist_head name[1 << (bits)] diff --git a/net/core/dev.c b/net/core/dev.c index ff58a8bc5e3c..02dfbd91a8e4 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -184,7 +184,7 @@ EXPORT_SYMBOL(dev_base_lock); static DEFINE_SPINLOCK(napi_hash_lock); static unsigned int napi_gen_id = NR_CPUS; -static DEFINE_HASHTABLE(napi_hash, 8); +static DEFINE_READ_MOSTLY_HASHTABLE(napi_hash, 8); static seqcount_t devnet_rename_seq; -- cgit v1.2.3 From 34cbe27e811c591c854a39c0dee1b461bb796953 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 18 Nov 2015 06:31:02 -0800 Subject: net: napi_hash_del() returns a boolean status napi_hash_del() will soon be used from both drivers (if they want) or core networking stack. Callers are responsibles to ensure an RCU grace period is respected before freeing napi structure : napi_hash_del() can signal if this RCU grace period is needed or not. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/linux/netdevice.h | 5 +++-- net/core/dev.c | 10 +++++++--- 2 files changed, 10 insertions(+), 5 deletions(-) (limited to 'net/core/dev.c') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 838935d1cdbb..e5c33b29471b 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -474,9 +474,10 @@ void napi_hash_add(struct napi_struct *napi); * @napi: napi context * * Warning: caller must observe rcu grace period - * before freeing memory containing @napi + * before freeing memory containing @napi, if + * this function returns true. */ -void napi_hash_del(struct napi_struct *napi); +bool napi_hash_del(struct napi_struct *napi); /** * napi_disable - prevent NAPI from scheduling diff --git a/net/core/dev.c b/net/core/dev.c index 02dfbd91a8e4..59dddac1c2e7 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -4760,14 +4760,18 @@ EXPORT_SYMBOL_GPL(napi_hash_add); /* Warning : caller is responsible to make sure rcu grace period * is respected before freeing memory containing @napi */ -void napi_hash_del(struct napi_struct *napi) +bool napi_hash_del(struct napi_struct *napi) { + bool rcu_sync_needed = false; + spin_lock(&napi_hash_lock); - if (test_and_clear_bit(NAPI_STATE_HASHED, &napi->state)) + if (test_and_clear_bit(NAPI_STATE_HASHED, &napi->state)) { + rcu_sync_needed = true; hlist_del_rcu(&napi->napi_hash_node); - + } spin_unlock(&napi_hash_lock); + return rcu_sync_needed; } EXPORT_SYMBOL_GPL(napi_hash_del); -- cgit v1.2.3 From 93d05d4a320cb16712bb3d57a9658f395d8cecb9 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 18 Nov 2015 06:31:03 -0800 Subject: net: provide generic busy polling to all NAPI drivers NAPI drivers no longer need to observe a particular protocol to benefit from busy polling (CONFIG_NET_RX_BUSY_POLL=y) napi_hash_add() and napi_hash_del() are automatically called from core networking stack, respectively from netif_napi_add() and netif_napi_del() This patch depends on free_netdev() and netif_napi_del() being called from process context, which seems to be the norm. Drivers might still prefer to call napi_hash_del() on their own, since they might combine all the rcu grace periods into a single one, knowing their NAPI structures lifetime, while core networking stack has no idea of a possible combining. Once this patch proves to not bring serious regressions, we will cleanup drivers to either remove napi_hash_del() or provide appropriate rcu grace periods combining. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c | 2 -- drivers/net/ethernet/broadcom/bnxt/bnxt.c | 2 -- drivers/net/ethernet/chelsio/cxgb4/sge.c | 1 - drivers/net/ethernet/cisco/enic/enic_main.c | 2 -- drivers/net/ethernet/emulex/benet/be_main.c | 1 - drivers/net/ethernet/intel/ixgbe/ixgbe_lib.c | 1 - drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c | 3 --- drivers/net/ethernet/mellanox/mlx4/en_cq.c | 6 ++---- drivers/net/ethernet/mellanox/mlx5/core/en_main.c | 1 - drivers/net/ethernet/myricom/myri10ge/myri10ge.c | 1 - drivers/net/ethernet/sfc/efx.c | 1 - drivers/net/virtio_net.c | 1 - include/linux/netdevice.h | 7 +++++++ net/core/dev.c | 7 +++++++ 14 files changed, 16 insertions(+), 20 deletions(-) (limited to 'net/core/dev.c') diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c index ab9222924bd9..d9add7c02e42 100644 --- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c +++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c @@ -46,7 +46,6 @@ static void bnx2x_add_all_napi_cnic(struct bnx2x *bp) for_each_rx_queue_cnic(bp, i) { netif_napi_add(bp->dev, &bnx2x_fp(bp, i, napi), bnx2x_poll, NAPI_POLL_WEIGHT); - napi_hash_add(&bnx2x_fp(bp, i, napi)); } } @@ -58,7 +57,6 @@ static void bnx2x_add_all_napi(struct bnx2x *bp) for_each_eth_queue(bp, i) { netif_napi_add(bp->dev, &bnx2x_fp(bp, i, napi), bnx2x_poll, NAPI_POLL_WEIGHT); - napi_hash_add(&bnx2x_fp(bp, i, napi)); } } diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c index db15c5ee09c5..f2d0dc9b1c41 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c @@ -4227,12 +4227,10 @@ static void bnxt_init_napi(struct bnxt *bp) bnapi = bp->bnapi[i]; netif_napi_add(bp->dev, &bnapi->napi, bnxt_poll, 64); - napi_hash_add(&bnapi->napi); } } else { bnapi = bp->bnapi[0]; netif_napi_add(bp->dev, &bnapi->napi, bnxt_poll, 64); - napi_hash_add(&bnapi->napi); } } diff --git a/drivers/net/ethernet/chelsio/cxgb4/sge.c b/drivers/net/ethernet/chelsio/cxgb4/sge.c index f650f295f264..48d8fbb1c220 100644 --- a/drivers/net/ethernet/chelsio/cxgb4/sge.c +++ b/drivers/net/ethernet/chelsio/cxgb4/sge.c @@ -2527,7 +2527,6 @@ int t4_sge_alloc_rxq(struct adapter *adap, struct sge_rspq *iq, bool fwevtq, goto err; netif_napi_add(dev, &iq->napi, napi_rx_handler, 64); - napi_hash_add(&iq->napi); iq->cur_desc = iq->desc; iq->cidx = 0; iq->gen = 1; diff --git a/drivers/net/ethernet/cisco/enic/enic_main.c b/drivers/net/ethernet/cisco/enic/enic_main.c index b36643ef0593..b2182d3ba3cc 100644 --- a/drivers/net/ethernet/cisco/enic/enic_main.c +++ b/drivers/net/ethernet/cisco/enic/enic_main.c @@ -2458,13 +2458,11 @@ static int enic_dev_init(struct enic *enic) switch (vnic_dev_get_intr_mode(enic->vdev)) { default: netif_napi_add(netdev, &enic->napi[0], enic_poll, 64); - napi_hash_add(&enic->napi[0]); break; case VNIC_DEV_INTR_MODE_MSIX: for (i = 0; i < enic->rq_count; i++) { netif_napi_add(netdev, &enic->napi[i], enic_poll_msix_rq, NAPI_POLL_WEIGHT); - napi_hash_add(&enic->napi[i]); } for (i = 0; i < enic->wq_count; i++) netif_napi_add(netdev, &enic->napi[enic_cq_wq(enic, i)], diff --git a/drivers/net/ethernet/emulex/benet/be_main.c b/drivers/net/ethernet/emulex/benet/be_main.c index c29d62496ad9..4cab8879f5ae 100644 --- a/drivers/net/ethernet/emulex/benet/be_main.c +++ b/drivers/net/ethernet/emulex/benet/be_main.c @@ -2630,7 +2630,6 @@ static int be_evt_queues_create(struct be_adapter *adapter) eqo->affinity_mask); netif_napi_add(adapter->netdev, &eqo->napi, be_poll, BE_NAPI_WEIGHT); - napi_hash_add(&eqo->napi); } return 0; } diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_lib.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_lib.c index f3168bcc7d87..e771e764daa3 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_lib.c +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_lib.c @@ -844,7 +844,6 @@ static int ixgbe_alloc_q_vector(struct ixgbe_adapter *adapter, /* initialize NAPI */ netif_napi_add(adapter->netdev, &q_vector->napi, ixgbe_poll, 64); - napi_hash_add(&q_vector->napi); #ifdef CONFIG_NET_RX_BUSY_POLL /* initialize busy poll */ diff --git a/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c b/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c index 592ff237d692..2955186cd4f6 100644 --- a/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c +++ b/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c @@ -2483,9 +2483,6 @@ static int ixgbevf_alloc_q_vectors(struct ixgbevf_adapter *adapter) q_vector->v_idx = q_idx; netif_napi_add(adapter->netdev, &q_vector->napi, ixgbevf_poll, 64); -#ifdef CONFIG_NET_RX_BUSY_POLL - napi_hash_add(&q_vector->napi); -#endif adapter->q_vector[q_idx] = q_vector; } diff --git a/drivers/net/ethernet/mellanox/mlx4/en_cq.c b/drivers/net/ethernet/mellanox/mlx4/en_cq.c index 3a6176fea78d..af975a2b74c6 100644 --- a/drivers/net/ethernet/mellanox/mlx4/en_cq.c +++ b/drivers/net/ethernet/mellanox/mlx4/en_cq.c @@ -155,13 +155,11 @@ int mlx4_en_activate_cq(struct mlx4_en_priv *priv, struct mlx4_en_cq *cq, cq->mcq.comp = cq->is_tx ? mlx4_en_tx_irq : mlx4_en_rx_irq; cq->mcq.event = mlx4_en_cq_event; - if (cq->is_tx) { + if (cq->is_tx) netif_tx_napi_add(cq->dev, &cq->napi, mlx4_en_poll_tx_cq, NAPI_POLL_WEIGHT); - } else { + else netif_napi_add(cq->dev, &cq->napi, mlx4_en_poll_rx_cq, 64); - napi_hash_add(&cq->napi); - } napi_enable(&cq->napi); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c index ffb1f9c1b973..f6a8cc787603 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c @@ -982,7 +982,6 @@ static int mlx5e_open_channel(struct mlx5e_priv *priv, int ix, mlx5e_build_channeltc_to_txq_map(priv, ix); netif_napi_add(netdev, &c->napi, mlx5e_napi_poll, 64); - napi_hash_add(&c->napi); err = mlx5e_open_tx_cqs(c, cparam); if (err) diff --git a/drivers/net/ethernet/myricom/myri10ge/myri10ge.c b/drivers/net/ethernet/myricom/myri10ge/myri10ge.c index acf866147d65..270c9eeb7ab6 100644 --- a/drivers/net/ethernet/myricom/myri10ge/myri10ge.c +++ b/drivers/net/ethernet/myricom/myri10ge/myri10ge.c @@ -3814,7 +3814,6 @@ static int myri10ge_alloc_slices(struct myri10ge_priv *mgp) ss->dev = mgp->dev; netif_napi_add(ss->dev, &ss->napi, myri10ge_poll, myri10ge_napi_weight); - napi_hash_add(&ss->napi); } return 0; abort: diff --git a/drivers/net/ethernet/sfc/efx.c b/drivers/net/ethernet/sfc/efx.c index a3c42a376741..4e82bcfbe3e0 100644 --- a/drivers/net/ethernet/sfc/efx.c +++ b/drivers/net/ethernet/sfc/efx.c @@ -2059,7 +2059,6 @@ static void efx_init_napi_channel(struct efx_channel *channel) channel->napi_dev = efx->net_dev; netif_napi_add(channel->napi_dev, &channel->napi_str, efx_poll, napi_weight); - napi_hash_add(&channel->napi_str); efx_channel_busy_poll_init(channel); } diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c index d1d14cecf450..b1ae4cbf2453 100644 --- a/drivers/net/virtio_net.c +++ b/drivers/net/virtio_net.c @@ -1610,7 +1610,6 @@ static int virtnet_alloc_queues(struct virtnet_info *vi) vi->rq[i].pages = NULL; netif_napi_add(vi->dev, &vi->rq[i].napi, virtnet_poll, napi_weight); - napi_hash_add(&vi->rq[i].napi); sg_init_table(vi->rq[i].sg, ARRAY_SIZE(vi->rq[i].sg)); ewma_pkt_len_init(&vi->rq[i].mrg_avg_pkt_len); diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index e5c33b29471b..7d2d1d7aaec7 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -466,6 +466,9 @@ static inline void napi_complete(struct napi_struct *n) * @napi: napi context * * generate a new napi_id and store a @napi under it in napi_hash + * Used for busy polling (CONFIG_NET_RX_BUSY_POLL) + * Note: This is normally automatically done from netif_napi_add(), + * so might disappear in a future linux version. */ void napi_hash_add(struct napi_struct *napi); @@ -476,6 +479,10 @@ void napi_hash_add(struct napi_struct *napi); * Warning: caller must observe rcu grace period * before freeing memory containing @napi, if * this function returns true. + * Note: core networking stack automatically calls it + * from netif_napi_del() + * Drivers might want to call this helper to combine all + * the needed rcu grace periods into a single one. */ bool napi_hash_del(struct napi_struct *napi); diff --git a/net/core/dev.c b/net/core/dev.c index 59dddac1c2e7..41cef3e3f558 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -4807,6 +4807,7 @@ void netif_napi_add(struct net_device *dev, struct napi_struct *napi, napi->poll_owner = -1; #endif set_bit(NAPI_STATE_SCHED, &napi->state); + napi_hash_add(napi); } EXPORT_SYMBOL(netif_napi_add); @@ -4826,8 +4827,12 @@ void napi_disable(struct napi_struct *n) } EXPORT_SYMBOL(napi_disable); +/* Must be called in process context */ void netif_napi_del(struct napi_struct *napi) { + might_sleep(); + if (napi_hash_del(napi)) + synchronize_net(); list_del_init(&napi->dev_list); napi_free_frags(napi); @@ -7227,11 +7232,13 @@ EXPORT_SYMBOL(alloc_netdev_mqs); * This function does the last stage of destroying an allocated device * interface. The reference to the device object is released. * If this is the last reference then it will be freed. + * Must be called in process context. */ void free_netdev(struct net_device *dev) { struct napi_struct *p, *n; + might_sleep(); netif_free_tx_queues(dev); #ifdef CONFIG_SYSFS kvfree(dev->_rx); -- cgit v1.2.3 From e2f9dc3bd213792ac006e83f50a5453f23b8c354 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 19 Nov 2015 12:11:23 -0800 Subject: net: avoid NULL deref in napi_get_frags() napi_alloc_skb() can return NULL. We should not crash should this happen. Fixes: 93f93a440415 ("net: move skb_mark_napi_id() into core networking stack") Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/dev.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'net/core/dev.c') diff --git a/net/core/dev.c b/net/core/dev.c index 41cef3e3f558..5df6cbce727c 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -4390,8 +4390,10 @@ struct sk_buff *napi_get_frags(struct napi_struct *napi) if (!skb) { skb = napi_alloc_skb(napi, GRO_MAX_HEAD); - napi->skb = skb; - skb_mark_napi_id(skb, napi); + if (skb) { + napi->skb = skb; + skb_mark_napi_id(skb, napi); + } } return skb; } -- cgit v1.2.3 From b03804e7c3ad41c265c0ca21ddb306b252b4f99f Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Thu, 3 Dec 2015 12:12:03 +0100 Subject: net: Check CHANGEUPPER notifier return value switchdev drivers reflect the newly requested topology to hardware when CHANGEUPPER is received, after software links were already formed. However, the operation can fail and user will not be notified, as the return value of the notifier is not checked. Add this check and rollback software links if necessary. Signed-off-by: Ido Schimmel Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- net/core/dev.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'net/core/dev.c') diff --git a/net/core/dev.c b/net/core/dev.c index 5df6cbce727c..939cd1b1da15 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -5490,8 +5490,12 @@ static int __netdev_upper_dev_link(struct net_device *dev, goto rollback_lower_mesh; } - call_netdevice_notifiers_info(NETDEV_CHANGEUPPER, dev, - &changeupper_info.info); + ret = call_netdevice_notifiers_info(NETDEV_CHANGEUPPER, dev, + &changeupper_info.info); + ret = notifier_to_errno(ret); + if (ret) + goto rollback_lower_mesh; + return 0; rollback_lower_mesh: -- cgit v1.2.3 From 6dffb0447c25476f499d205dfceb1972e8dae919 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Thu, 3 Dec 2015 12:12:10 +0100 Subject: net: propagate upper priv via netdev_master_upper_dev_link Eliminate netdev_master_upper_dev_link_private and pass priv directly as a parameter of netdev_master_upper_dev_link. Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- drivers/net/bonding/bond_main.c | 2 +- drivers/net/team/team.c | 2 +- drivers/net/vrf.c | 2 +- include/linux/netdevice.h | 6 ++---- net/batman-adv/hard-interface.c | 3 ++- net/bridge/br_if.c | 2 +- net/core/dev.c | 18 ++++++------------ net/openvswitch/vport-netdev.c | 2 +- 8 files changed, 15 insertions(+), 22 deletions(-) (limited to 'net/core/dev.c') diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c index 9e0f8a7ef8b1..924015729b2d 100644 --- a/drivers/net/bonding/bond_main.c +++ b/drivers/net/bonding/bond_main.c @@ -1204,7 +1204,7 @@ static int bond_master_upper_dev_link(struct net_device *bond_dev, { int err; - err = netdev_master_upper_dev_link_private(slave_dev, bond_dev, slave); + err = netdev_master_upper_dev_link(slave_dev, bond_dev, slave); if (err) return err; slave_dev->flags |= IFF_SLAVE; diff --git a/drivers/net/team/team.c b/drivers/net/team/team.c index d2f3ee832c47..b37f8d14dca0 100644 --- a/drivers/net/team/team.c +++ b/drivers/net/team/team.c @@ -1083,7 +1083,7 @@ static int team_upper_dev_link(struct net_device *dev, { int err; - err = netdev_master_upper_dev_link(port_dev, dev); + err = netdev_master_upper_dev_link(port_dev, dev, NULL); if (err) return err; port_dev->priv_flags |= IFF_TEAM_PORT; diff --git a/drivers/net/vrf.c b/drivers/net/vrf.c index c2d54c4ed556..59c5bddeaedd 100644 --- a/drivers/net/vrf.c +++ b/drivers/net/vrf.c @@ -624,7 +624,7 @@ static int do_vrf_add_slave(struct net_device *dev, struct net_device *port_dev) goto out_fail; } - ret = netdev_master_upper_dev_link(port_dev, dev); + ret = netdev_master_upper_dev_link(port_dev, dev, NULL); if (ret < 0) goto out_unregister; diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 1506be58c59a..939b8f3de810 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -3619,10 +3619,8 @@ struct net_device *netdev_master_upper_dev_get(struct net_device *dev); struct net_device *netdev_master_upper_dev_get_rcu(struct net_device *dev); int netdev_upper_dev_link(struct net_device *dev, struct net_device *upper_dev); int netdev_master_upper_dev_link(struct net_device *dev, - struct net_device *upper_dev); -int netdev_master_upper_dev_link_private(struct net_device *dev, - struct net_device *upper_dev, - void *private); + struct net_device *upper_dev, + void *upper_priv); void netdev_upper_dev_unlink(struct net_device *dev, struct net_device *upper_dev); void netdev_adjacent_rename_links(struct net_device *dev, char *oldname); diff --git a/net/batman-adv/hard-interface.c b/net/batman-adv/hard-interface.c index f11345e163d7..a7f4f1085dbb 100644 --- a/net/batman-adv/hard-interface.c +++ b/net/batman-adv/hard-interface.c @@ -464,7 +464,8 @@ int batadv_hardif_enable_interface(struct batadv_hard_iface *hard_iface, hard_iface->soft_iface = soft_iface; bat_priv = netdev_priv(hard_iface->soft_iface); - ret = netdev_master_upper_dev_link(hard_iface->net_dev, soft_iface); + ret = netdev_master_upper_dev_link(hard_iface->net_dev, + soft_iface, NULL); if (ret) goto err_dev; diff --git a/net/bridge/br_if.c b/net/bridge/br_if.c index ec02f5869a78..781abc34667a 100644 --- a/net/bridge/br_if.c +++ b/net/bridge/br_if.c @@ -493,7 +493,7 @@ int br_add_if(struct net_bridge *br, struct net_device *dev) dev->priv_flags |= IFF_BRIDGE_PORT; - err = netdev_master_upper_dev_link(dev, br->dev); + err = netdev_master_upper_dev_link(dev, br->dev, NULL); if (err) goto err5; diff --git a/net/core/dev.c b/net/core/dev.c index 939cd1b1da15..27d052bb78bc 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -5421,7 +5421,7 @@ static void __netdev_adjacent_dev_unlink_neighbour(struct net_device *dev, static int __netdev_upper_dev_link(struct net_device *dev, struct net_device *upper_dev, bool master, - void *private) + void *upper_priv) { struct netdev_notifier_changeupper_info changeupper_info; struct netdev_adjacent *i, *j, *to_i, *to_j; @@ -5452,7 +5452,7 @@ static int __netdev_upper_dev_link(struct net_device *dev, if (ret) return ret; - ret = __netdev_adjacent_dev_link_neighbour(dev, upper_dev, private, + ret = __netdev_adjacent_dev_link_neighbour(dev, upper_dev, upper_priv, master); if (ret) return ret; @@ -5557,6 +5557,7 @@ EXPORT_SYMBOL(netdev_upper_dev_link); * netdev_master_upper_dev_link - Add a master link to the upper device * @dev: device * @upper_dev: new upper device + * @upper_priv: upper device private * * Adds a link to device which is upper to this one. In this case, only * one master upper device can be linked, although other non-master devices @@ -5565,20 +5566,13 @@ EXPORT_SYMBOL(netdev_upper_dev_link); * counts are adjusted and the function returns zero. */ int netdev_master_upper_dev_link(struct net_device *dev, - struct net_device *upper_dev) + struct net_device *upper_dev, + void *upper_priv) { - return __netdev_upper_dev_link(dev, upper_dev, true, NULL); + return __netdev_upper_dev_link(dev, upper_dev, true, upper_priv); } EXPORT_SYMBOL(netdev_master_upper_dev_link); -int netdev_master_upper_dev_link_private(struct net_device *dev, - struct net_device *upper_dev, - void *private) -{ - return __netdev_upper_dev_link(dev, upper_dev, true, private); -} -EXPORT_SYMBOL(netdev_master_upper_dev_link_private); - /** * netdev_upper_dev_unlink - Removes a link to upper device * @dev: device diff --git a/net/openvswitch/vport-netdev.c b/net/openvswitch/vport-netdev.c index b327368a3848..3ee3df1edeae 100644 --- a/net/openvswitch/vport-netdev.c +++ b/net/openvswitch/vport-netdev.c @@ -105,7 +105,7 @@ struct vport *ovs_netdev_link(struct vport *vport, const char *name) rtnl_lock(); err = netdev_master_upper_dev_link(vport->dev, - get_dpdev(vport->dp)); + get_dpdev(vport->dp), NULL); if (err) goto error_unlock; -- cgit v1.2.3 From 29bf24afb29042f568fa67b1b0eee46796725ed2 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Thu, 3 Dec 2015 12:12:11 +0100 Subject: net: add possibility to pass information about upper device via notifier Sometimes the drivers and other code would find it handy to know some internal information about upper device being changed. So allow upper-code to pass information down to notifier listeners during linking. Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- drivers/net/bonding/bond_main.c | 2 +- drivers/net/team/team.c | 2 +- drivers/net/vrf.c | 2 +- include/linux/netdevice.h | 3 ++- net/batman-adv/hard-interface.c | 2 +- net/bridge/br_if.c | 2 +- net/core/dev.c | 11 +++++++---- net/openvswitch/vport-netdev.c | 2 +- 8 files changed, 15 insertions(+), 11 deletions(-) (limited to 'net/core/dev.c') diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c index 924015729b2d..fa3ed1d8a12d 100644 --- a/drivers/net/bonding/bond_main.c +++ b/drivers/net/bonding/bond_main.c @@ -1204,7 +1204,7 @@ static int bond_master_upper_dev_link(struct net_device *bond_dev, { int err; - err = netdev_master_upper_dev_link(slave_dev, bond_dev, slave); + err = netdev_master_upper_dev_link(slave_dev, bond_dev, slave, NULL); if (err) return err; slave_dev->flags |= IFF_SLAVE; diff --git a/drivers/net/team/team.c b/drivers/net/team/team.c index b37f8d14dca0..f7b6ff7948b8 100644 --- a/drivers/net/team/team.c +++ b/drivers/net/team/team.c @@ -1083,7 +1083,7 @@ static int team_upper_dev_link(struct net_device *dev, { int err; - err = netdev_master_upper_dev_link(port_dev, dev, NULL); + err = netdev_master_upper_dev_link(port_dev, dev, NULL, NULL); if (err) return err; port_dev->priv_flags |= IFF_TEAM_PORT; diff --git a/drivers/net/vrf.c b/drivers/net/vrf.c index 59c5bddeaedd..8944a49cda15 100644 --- a/drivers/net/vrf.c +++ b/drivers/net/vrf.c @@ -624,7 +624,7 @@ static int do_vrf_add_slave(struct net_device *dev, struct net_device *port_dev) goto out_fail; } - ret = netdev_master_upper_dev_link(port_dev, dev, NULL); + ret = netdev_master_upper_dev_link(port_dev, dev, NULL, NULL); if (ret < 0) goto out_unregister; diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 939b8f3de810..aea556c64f2c 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -2163,6 +2163,7 @@ struct netdev_notifier_changeupper_info { struct net_device *upper_dev; /* new upper dev */ bool master; /* is upper dev master */ bool linking; /* is the nofication for link or unlink */ + void *upper_info; /* upper dev info */ }; static inline void netdev_notifier_info_init(struct netdev_notifier_info *info, @@ -3620,7 +3621,7 @@ struct net_device *netdev_master_upper_dev_get_rcu(struct net_device *dev); int netdev_upper_dev_link(struct net_device *dev, struct net_device *upper_dev); int netdev_master_upper_dev_link(struct net_device *dev, struct net_device *upper_dev, - void *upper_priv); + void *upper_priv, void *upper_info); void netdev_upper_dev_unlink(struct net_device *dev, struct net_device *upper_dev); void netdev_adjacent_rename_links(struct net_device *dev, char *oldname); diff --git a/net/batman-adv/hard-interface.c b/net/batman-adv/hard-interface.c index a7f4f1085dbb..aa8867e1d983 100644 --- a/net/batman-adv/hard-interface.c +++ b/net/batman-adv/hard-interface.c @@ -465,7 +465,7 @@ int batadv_hardif_enable_interface(struct batadv_hard_iface *hard_iface, bat_priv = netdev_priv(hard_iface->soft_iface); ret = netdev_master_upper_dev_link(hard_iface->net_dev, - soft_iface, NULL); + soft_iface, NULL, NULL); if (ret) goto err_dev; diff --git a/net/bridge/br_if.c b/net/bridge/br_if.c index 781abc34667a..8d1d4a22c50d 100644 --- a/net/bridge/br_if.c +++ b/net/bridge/br_if.c @@ -493,7 +493,7 @@ int br_add_if(struct net_bridge *br, struct net_device *dev) dev->priv_flags |= IFF_BRIDGE_PORT; - err = netdev_master_upper_dev_link(dev, br->dev, NULL); + err = netdev_master_upper_dev_link(dev, br->dev, NULL, NULL); if (err) goto err5; diff --git a/net/core/dev.c b/net/core/dev.c index 27d052bb78bc..8ed886663c6d 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -5421,7 +5421,7 @@ static void __netdev_adjacent_dev_unlink_neighbour(struct net_device *dev, static int __netdev_upper_dev_link(struct net_device *dev, struct net_device *upper_dev, bool master, - void *upper_priv) + void *upper_priv, void *upper_info) { struct netdev_notifier_changeupper_info changeupper_info; struct netdev_adjacent *i, *j, *to_i, *to_j; @@ -5445,6 +5445,7 @@ static int __netdev_upper_dev_link(struct net_device *dev, changeupper_info.upper_dev = upper_dev; changeupper_info.master = master; changeupper_info.linking = true; + changeupper_info.upper_info = upper_info; ret = call_netdevice_notifiers_info(NETDEV_PRECHANGEUPPER, dev, &changeupper_info.info); @@ -5549,7 +5550,7 @@ rollback_mesh: int netdev_upper_dev_link(struct net_device *dev, struct net_device *upper_dev) { - return __netdev_upper_dev_link(dev, upper_dev, false, NULL); + return __netdev_upper_dev_link(dev, upper_dev, false, NULL, NULL); } EXPORT_SYMBOL(netdev_upper_dev_link); @@ -5558,6 +5559,7 @@ EXPORT_SYMBOL(netdev_upper_dev_link); * @dev: device * @upper_dev: new upper device * @upper_priv: upper device private + * @upper_info: upper info to be passed down via notifier * * Adds a link to device which is upper to this one. In this case, only * one master upper device can be linked, although other non-master devices @@ -5567,9 +5569,10 @@ EXPORT_SYMBOL(netdev_upper_dev_link); */ int netdev_master_upper_dev_link(struct net_device *dev, struct net_device *upper_dev, - void *upper_priv) + void *upper_priv, void *upper_info) { - return __netdev_upper_dev_link(dev, upper_dev, true, upper_priv); + return __netdev_upper_dev_link(dev, upper_dev, true, + upper_priv, upper_info); } EXPORT_SYMBOL(netdev_master_upper_dev_link); diff --git a/net/openvswitch/vport-netdev.c b/net/openvswitch/vport-netdev.c index 3ee3df1edeae..8f4dd4c39bfe 100644 --- a/net/openvswitch/vport-netdev.c +++ b/net/openvswitch/vport-netdev.c @@ -105,7 +105,7 @@ struct vport *ovs_netdev_link(struct vport *vport, const char *name) rtnl_lock(); err = netdev_master_upper_dev_link(vport->dev, - get_dpdev(vport->dp), NULL); + get_dpdev(vport->dp), NULL, NULL); if (err) goto error_unlock; -- cgit v1.2.3 From 04d482660a07039fc4e9a42bb3517db236d98f96 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Thu, 3 Dec 2015 12:12:15 +0100 Subject: net: introduce change lower state notifier When lower device like bonding slave, team/bridge port, etc changes its state, it is useful for others to notice this change. Currently this is implemented specificly for bonding as NETDEV_BONDING_INFO notifier. This patch aims to replace this specific usage and make this more generic to be used for all upper-lower devices. Introduce NETDEV_CHANGELOWERSTATE netdev notifier type and netdev_lower_state_changed() helper. Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- include/linux/netdevice.h | 8 ++++++++ net/core/dev.c | 20 ++++++++++++++++++++ 2 files changed, 28 insertions(+) (limited to 'net/core/dev.c') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 3ab90ea0ed03..ad69f237aa78 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -2158,6 +2158,7 @@ struct netdev_lag_upper_info { #define NETDEV_CHANGEINFODATA 0x0018 #define NETDEV_BONDING_INFO 0x0019 #define NETDEV_PRECHANGEUPPER 0x001A +#define NETDEV_CHANGELOWERSTATE 0x001B int register_netdevice_notifier(struct notifier_block *nb); int unregister_netdevice_notifier(struct notifier_block *nb); @@ -2179,6 +2180,11 @@ struct netdev_notifier_changeupper_info { void *upper_info; /* upper dev info */ }; +struct netdev_notifier_changelowerstate_info { + struct netdev_notifier_info info; /* must be first */ + void *lower_state_info; /* is lower dev state */ +}; + static inline void netdev_notifier_info_init(struct netdev_notifier_info *info, struct net_device *dev) { @@ -3640,6 +3646,8 @@ void netdev_upper_dev_unlink(struct net_device *dev, void netdev_adjacent_rename_links(struct net_device *dev, char *oldname); void *netdev_lower_dev_get_private(struct net_device *dev, struct net_device *lower_dev); +void netdev_lower_state_changed(struct net_device *lower_dev, + void *lower_state_info); /* RSS keys are 40 or 52 bytes long */ #define NETDEV_RSS_KEY_LEN 52 diff --git a/net/core/dev.c b/net/core/dev.c index 8ed886663c6d..d1706e88fbeb 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -5756,6 +5756,26 @@ int dev_get_nest_level(struct net_device *dev, } EXPORT_SYMBOL(dev_get_nest_level); +/** + * netdev_lower_change - Dispatch event about lower device state change + * @lower_dev: device + * @lower_state_info: state to dispatch + * + * Send NETDEV_CHANGELOWERSTATE to netdev notifiers with info. + * The caller must hold the RTNL lock. + */ +void netdev_lower_state_changed(struct net_device *lower_dev, + void *lower_state_info) +{ + struct netdev_notifier_changelowerstate_info changelowerstate_info; + + ASSERT_RTNL(); + changelowerstate_info.lower_state_info = lower_state_info; + call_netdevice_notifiers_info(NETDEV_CHANGELOWERSTATE, lower_dev, + &changelowerstate_info.info); +} +EXPORT_SYMBOL(netdev_lower_state_changed); + static void dev_change_rx_flags(struct net_device *dev, int flags) { const struct net_device_ops *ops = dev->netdev_ops; -- cgit v1.2.3 From b618aaa91b5870e7bd139987ac4b7bf0851142d0 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Fri, 4 Dec 2015 15:01:31 +0100 Subject: net: constify netif_is_* helpers net_device param As suggested by Eric, these helpers should have const dev param. Suggested-by: Eric Dumazet Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- include/linux/if_vlan.h | 2 +- include/linux/netdevice.h | 22 +++++++++++----------- net/core/dev.c | 2 +- 3 files changed, 13 insertions(+), 13 deletions(-) (limited to 'net/core/dev.c') diff --git a/include/linux/if_vlan.h b/include/linux/if_vlan.h index 67ce5bd3b56a..05f5879821b8 100644 --- a/include/linux/if_vlan.h +++ b/include/linux/if_vlan.h @@ -73,7 +73,7 @@ static inline struct vlan_ethhdr *vlan_eth_hdr(const struct sk_buff *skb) /* found in socket.c */ extern void vlan_ioctl_set(int (*hook)(struct net *, void __user *)); -static inline bool is_vlan_dev(struct net_device *dev) +static inline bool is_vlan_dev(const struct net_device *dev) { return dev->priv_flags & IFF_802_1Q_VLAN; } diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 3efe017fe419..1bb21ff0fa64 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -3661,7 +3661,7 @@ extern u8 netdev_rss_key[NETDEV_RSS_KEY_LEN]; void netdev_rss_key_fill(void *buffer, size_t len); int dev_get_nest_level(struct net_device *dev, - bool (*type_check)(struct net_device *dev)); + bool (*type_check)(const struct net_device *dev)); int skb_checksum_help(struct sk_buff *skb); struct sk_buff *__skb_gso_segment(struct sk_buff *skb, netdev_features_t features, bool tx_path); @@ -3858,32 +3858,32 @@ static inline void skb_gso_error_unwind(struct sk_buff *skb, __be16 protocol, skb->mac_len = mac_len; } -static inline bool netif_is_macvlan(struct net_device *dev) +static inline bool netif_is_macvlan(const struct net_device *dev) { return dev->priv_flags & IFF_MACVLAN; } -static inline bool netif_is_macvlan_port(struct net_device *dev) +static inline bool netif_is_macvlan_port(const struct net_device *dev) { return dev->priv_flags & IFF_MACVLAN_PORT; } -static inline bool netif_is_ipvlan(struct net_device *dev) +static inline bool netif_is_ipvlan(const struct net_device *dev) { return dev->priv_flags & IFF_IPVLAN_SLAVE; } -static inline bool netif_is_ipvlan_port(struct net_device *dev) +static inline bool netif_is_ipvlan_port(const struct net_device *dev) { return dev->priv_flags & IFF_IPVLAN_MASTER; } -static inline bool netif_is_bond_master(struct net_device *dev) +static inline bool netif_is_bond_master(const struct net_device *dev) { return dev->flags & IFF_MASTER && dev->priv_flags & IFF_BONDING; } -static inline bool netif_is_bond_slave(struct net_device *dev) +static inline bool netif_is_bond_slave(const struct net_device *dev) { return dev->flags & IFF_SLAVE && dev->priv_flags & IFF_BONDING; } @@ -3918,22 +3918,22 @@ static inline bool netif_is_ovs_master(const struct net_device *dev) return dev->priv_flags & IFF_OPENVSWITCH; } -static inline bool netif_is_team_master(struct net_device *dev) +static inline bool netif_is_team_master(const struct net_device *dev) { return dev->priv_flags & IFF_TEAM; } -static inline bool netif_is_team_port(struct net_device *dev) +static inline bool netif_is_team_port(const struct net_device *dev) { return dev->priv_flags & IFF_TEAM_PORT; } -static inline bool netif_is_lag_master(struct net_device *dev) +static inline bool netif_is_lag_master(const struct net_device *dev) { return netif_is_bond_master(dev) || netif_is_team_master(dev); } -static inline bool netif_is_lag_port(struct net_device *dev) +static inline bool netif_is_lag_port(const struct net_device *dev) { return netif_is_bond_slave(dev) || netif_is_team_port(dev); } diff --git a/net/core/dev.c b/net/core/dev.c index d1706e88fbeb..e5c395473eba 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -5734,7 +5734,7 @@ EXPORT_SYMBOL(netdev_lower_dev_get_private); int dev_get_nest_level(struct net_device *dev, - bool (*type_check)(struct net_device *dev)) + bool (*type_check)(const struct net_device *dev)) { struct net_device *lower = NULL; struct list_head *iter; -- cgit v1.2.3 From 2a56a1fec290bf0bc4676bbf4efdb3744953a3e7 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 7 Dec 2015 17:38:52 -0500 Subject: net: wrap sock->sk_cgrp_prioidx and ->sk_classid inside a struct Introduce sock->sk_cgrp_data which is a struct sock_cgroup_data. ->sk_cgroup_prioidx and ->sk_classid are moved into it. The struct and its accessors are defined in cgroup-defs.h. This is to prepare for overloading the fields with a cgroup pointer. This patch mostly performs equivalent conversions but the followings are noteworthy. * Equality test before updating classid is removed from sock_update_classid(). This shouldn't make any noticeable difference and a similar test will be implemented on the helper side later. * sock_update_netprioidx() now takes struct sock_cgroup_data and can be moved to netprio_cgroup.h without causing include dependency loop. Moved. * The dummy version of sock_update_netprioidx() converted to a static inline function while at it. Signed-off-by: Tejun Heo Signed-off-by: David S. Miller --- include/linux/cgroup-defs.h | 36 ++++++++++++++++++++++++++++++++++++ include/net/cls_cgroup.h | 11 +++++------ include/net/netprio_cgroup.h | 16 +++++++++++++--- include/net/sock.h | 11 +++-------- net/Kconfig | 6 ++++++ net/core/dev.c | 3 ++- net/core/netclassid_cgroup.c | 4 ++-- net/core/netprio_cgroup.c | 3 ++- net/core/scm.c | 4 ++-- net/core/sock.c | 15 ++------------- net/netfilter/nft_meta.c | 2 +- net/netfilter/xt_cgroup.c | 3 ++- 12 files changed, 76 insertions(+), 38 deletions(-) (limited to 'net/core/dev.c') diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h index 504d8591b6d3..ed128fed0335 100644 --- a/include/linux/cgroup-defs.h +++ b/include/linux/cgroup-defs.h @@ -542,4 +542,40 @@ static inline void cgroup_threadgroup_change_end(struct task_struct *tsk) {} #endif /* CONFIG_CGROUPS */ +#ifdef CONFIG_SOCK_CGROUP_DATA + +struct sock_cgroup_data { + u16 prioidx; + u32 classid; +}; + +static inline u16 sock_cgroup_prioidx(struct sock_cgroup_data *skcd) +{ + return skcd->prioidx; +} + +static inline u32 sock_cgroup_classid(struct sock_cgroup_data *skcd) +{ + return skcd->classid; +} + +static inline void sock_cgroup_set_prioidx(struct sock_cgroup_data *skcd, + u16 prioidx) +{ + skcd->prioidx = prioidx; +} + +static inline void sock_cgroup_set_classid(struct sock_cgroup_data *skcd, + u32 classid) +{ + skcd->classid = classid; +} + +#else /* CONFIG_SOCK_CGROUP_DATA */ + +struct sock_cgroup_data { +}; + +#endif /* CONFIG_SOCK_CGROUP_DATA */ + #endif /* _LINUX_CGROUP_DEFS_H */ diff --git a/include/net/cls_cgroup.h b/include/net/cls_cgroup.h index ccd6d8bffa4d..c0a92e2c286d 100644 --- a/include/net/cls_cgroup.h +++ b/include/net/cls_cgroup.h @@ -41,13 +41,12 @@ static inline u32 task_cls_classid(struct task_struct *p) return classid; } -static inline void sock_update_classid(struct sock *sk) +static inline void sock_update_classid(struct sock_cgroup_data *skcd) { u32 classid; classid = task_cls_classid(current); - if (classid != sk->sk_classid) - sk->sk_classid = classid; + sock_cgroup_set_classid(skcd, classid); } static inline u32 task_get_classid(const struct sk_buff *skb) @@ -64,17 +63,17 @@ static inline u32 task_get_classid(const struct sk_buff *skb) * softirqs always disables bh. */ if (in_serving_softirq()) { - /* If there is an sk_classid we'll use that. */ + /* If there is an sock_cgroup_classid we'll use that. */ if (!skb->sk) return 0; - classid = skb->sk->sk_classid; + classid = sock_cgroup_classid(&skb->sk->sk_cgrp_data); } return classid; } #else /* !CONFIG_CGROUP_NET_CLASSID */ -static inline void sock_update_classid(struct sock *sk) +static inline void sock_update_classid(struct sock_cgroup_data *skcd) { } diff --git a/include/net/netprio_cgroup.h b/include/net/netprio_cgroup.h index f2a9597ff53c..604190596cde 100644 --- a/include/net/netprio_cgroup.h +++ b/include/net/netprio_cgroup.h @@ -25,8 +25,6 @@ struct netprio_map { u32 priomap[]; }; -void sock_update_netprioidx(struct sock *sk); - static inline u32 task_netprioidx(struct task_struct *p) { struct cgroup_subsys_state *css; @@ -38,13 +36,25 @@ static inline u32 task_netprioidx(struct task_struct *p) rcu_read_unlock(); return idx; } + +static inline void sock_update_netprioidx(struct sock_cgroup_data *skcd) +{ + if (in_interrupt()) + return; + + sock_cgroup_set_prioidx(skcd, task_netprioidx(current)); +} + #else /* !CONFIG_CGROUP_NET_PRIO */ + static inline u32 task_netprioidx(struct task_struct *p) { return 0; } -#define sock_update_netprioidx(sk) +static inline void sock_update_netprioidx(struct sock_cgroup_data *skcd) +{ +} #endif /* CONFIG_CGROUP_NET_PRIO */ #endif /* _NET_CLS_CGROUP_H */ diff --git a/include/net/sock.h b/include/net/sock.h index a95bcf7d6efa..0ca22b014de1 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -59,6 +59,7 @@ #include #include #include +#include #include #include @@ -308,8 +309,7 @@ struct cg_proto; * @sk_send_head: front of stuff to transmit * @sk_security: used by security modules * @sk_mark: generic packet mark - * @sk_cgrp_prioidx: socket group's priority map index - * @sk_classid: this socket's cgroup classid + * @sk_cgrp_data: cgroup data for this cgroup * @sk_cgrp: this socket's cgroup-specific proto data * @sk_write_pending: a write to stream socket waits to start * @sk_state_change: callback to indicate change in the state of the sock @@ -443,12 +443,7 @@ struct sock { #ifdef CONFIG_SECURITY void *sk_security; #endif -#if IS_ENABLED(CONFIG_CGROUP_NET_PRIO) - u16 sk_cgrp_prioidx; -#endif -#ifdef CONFIG_CGROUP_NET_CLASSID - u32 sk_classid; -#endif + struct sock_cgroup_data sk_cgrp_data; struct cg_proto *sk_cgrp; void (*sk_state_change)(struct sock *sk); void (*sk_data_ready)(struct sock *sk); diff --git a/net/Kconfig b/net/Kconfig index 127da94ae25e..11f8c22af34d 100644 --- a/net/Kconfig +++ b/net/Kconfig @@ -250,9 +250,14 @@ config XPS depends on SMP default y +config SOCK_CGROUP_DATA + bool + default n + config CGROUP_NET_PRIO bool "Network priority cgroup" depends on CGROUPS + select SOCK_CGROUP_DATA ---help--- Cgroup subsystem for use in assigning processes to network priorities on a per-interface basis. @@ -260,6 +265,7 @@ config CGROUP_NET_PRIO config CGROUP_NET_CLASSID bool "Network classid cgroup" depends on CGROUPS + select SOCK_CGROUP_DATA ---help--- Cgroup subsystem for use as general purpose socket classid marker that is being used in cls_cgroup and for netfilter matching. diff --git a/net/core/dev.c b/net/core/dev.c index e5c395473eba..8f705fcedb94 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2929,7 +2929,8 @@ static void skb_update_prio(struct sk_buff *skb) struct netprio_map *map = rcu_dereference_bh(skb->dev->priomap); if (!skb->priority && skb->sk && map) { - unsigned int prioidx = skb->sk->sk_cgrp_prioidx; + unsigned int prioidx = + sock_cgroup_prioidx(&skb->sk->sk_cgrp_data); if (prioidx < map->priomap_len) skb->priority = map->priomap[prioidx]; diff --git a/net/core/netclassid_cgroup.c b/net/core/netclassid_cgroup.c index 2e4df84c34a1..e60ded46b3ac 100644 --- a/net/core/netclassid_cgroup.c +++ b/net/core/netclassid_cgroup.c @@ -62,8 +62,8 @@ static int update_classid_sock(const void *v, struct file *file, unsigned n) struct socket *sock = sock_from_file(file, &err); if (sock) - sock->sk->sk_classid = (u32)(unsigned long)v; - + sock_cgroup_set_classid(&sock->sk->sk_cgrp_data, + (unsigned long)v); return 0; } diff --git a/net/core/netprio_cgroup.c b/net/core/netprio_cgroup.c index 2b9159b7a28a..de42aa7f6c77 100644 --- a/net/core/netprio_cgroup.c +++ b/net/core/netprio_cgroup.c @@ -223,7 +223,8 @@ static int update_netprio(const void *v, struct file *file, unsigned n) int err; struct socket *sock = sock_from_file(file, &err); if (sock) - sock->sk->sk_cgrp_prioidx = (u32)(unsigned long)v; + sock_cgroup_set_prioidx(&sock->sk->sk_cgrp_data, + (unsigned long)v); return 0; } diff --git a/net/core/scm.c b/net/core/scm.c index 8a1741b14302..14596fb37172 100644 --- a/net/core/scm.c +++ b/net/core/scm.c @@ -289,8 +289,8 @@ void scm_detach_fds(struct msghdr *msg, struct scm_cookie *scm) /* Bump the usage count and install the file. */ sock = sock_from_file(fp[i], &err); if (sock) { - sock_update_netprioidx(sock->sk); - sock_update_classid(sock->sk); + sock_update_netprioidx(&sock->sk->sk_cgrp_data); + sock_update_classid(&sock->sk->sk_cgrp_data); } fd_install(new_fd, get_file(fp[i])); } diff --git a/net/core/sock.c b/net/core/sock.c index 7965ef487375..947741dc43fa 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -1393,17 +1393,6 @@ static void sk_prot_free(struct proto *prot, struct sock *sk) module_put(owner); } -#if IS_ENABLED(CONFIG_CGROUP_NET_PRIO) -void sock_update_netprioidx(struct sock *sk) -{ - if (in_interrupt()) - return; - - sk->sk_cgrp_prioidx = task_netprioidx(current); -} -EXPORT_SYMBOL_GPL(sock_update_netprioidx); -#endif - /** * sk_alloc - All socket objects are allocated here * @net: the applicable net namespace @@ -1432,8 +1421,8 @@ struct sock *sk_alloc(struct net *net, int family, gfp_t priority, sock_net_set(sk, net); atomic_set(&sk->sk_wmem_alloc, 1); - sock_update_classid(sk); - sock_update_netprioidx(sk); + sock_update_classid(&sk->sk_cgrp_data); + sock_update_netprioidx(&sk->sk_cgrp_data); } return sk; diff --git a/net/netfilter/nft_meta.c b/net/netfilter/nft_meta.c index 9dfaf4d55ee0..1915cab7f32d 100644 --- a/net/netfilter/nft_meta.c +++ b/net/netfilter/nft_meta.c @@ -174,7 +174,7 @@ void nft_meta_get_eval(const struct nft_expr *expr, sk = skb_to_full_sk(skb); if (!sk || !sk_fullsock(sk)) goto err; - *dest = sk->sk_classid; + *dest = sock_cgroup_classid(&sk->sk_cgrp_data); break; #endif default: diff --git a/net/netfilter/xt_cgroup.c b/net/netfilter/xt_cgroup.c index a1d126f29463..54eaeb45ce99 100644 --- a/net/netfilter/xt_cgroup.c +++ b/net/netfilter/xt_cgroup.c @@ -42,7 +42,8 @@ cgroup_mt(const struct sk_buff *skb, struct xt_action_param *par) if (skb->sk == NULL || !sk_fullsock(skb->sk)) return false; - return (info->id == skb->sk->sk_classid) ^ info->invert; + return (info->id == sock_cgroup_classid(&skb->sk->sk_cgrp_data)) ^ + info->invert; } static struct xt_match cgroup_mt_reg __read_mostly = { -- cgit v1.2.3 From a188222b6ed29404ac2d4232d35d1fe0e77af370 Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Mon, 14 Dec 2015 11:19:43 -0800 Subject: net: Rename NETIF_F_ALL_CSUM to NETIF_F_CSUM_MASK The name NETIF_F_ALL_CSUM is a misnomer. This does not correspond to the set of features for offloading all checksums. This is a mask of the checksum offload related features bits. It is incorrect to set both NETIF_F_HW_CSUM and NETIF_F_IP_CSUM or NETIF_F_IPV6 at the same time for features of a device. This patch: - Changes instances of NETIF_F_ALL_CSUM to NETIF_F_CSUM_MASK (where NETIF_F_ALL_CSUM is being used as a mask). - Changes bonding, sfc/efx, ipvlan, macvlan, vlan, and team drivers to use NEITF_F_HW_CSUM in features list instead of NETIF_F_ALL_CSUM. Signed-off-by: Tom Herbert Signed-off-by: David S. Miller --- drivers/net/bonding/bond_main.c | 7 +++---- drivers/net/ethernet/emulex/benet/be_main.c | 2 +- drivers/net/ethernet/ibm/ibmveth.c | 5 +++-- drivers/net/ethernet/intel/fm10k/fm10k_netdev.c | 2 +- drivers/net/ethernet/intel/i40e/i40e_main.c | 2 +- drivers/net/ethernet/intel/ixgbe/ixgbe_main.c | 2 +- drivers/net/ethernet/jme.c | 2 +- drivers/net/ethernet/marvell/sky2.c | 2 +- drivers/net/ethernet/netronome/nfp/nfp_net_common.c | 4 ++-- drivers/net/ethernet/oki-semi/pch_gbe/pch_gbe_param.c | 2 +- drivers/net/ethernet/sfc/efx.c | 2 +- drivers/net/ethernet/stmicro/stmmac/stmmac_main.c | 4 ++-- drivers/net/ipvlan/ipvlan_main.c | 2 +- drivers/net/macvlan.c | 2 +- drivers/net/macvtap.c | 2 +- drivers/net/team/team.c | 3 +-- drivers/net/usb/r8152.c | 2 +- drivers/staging/lustre/lnet/klnds/socklnd/socklnd_lib.c | 2 +- include/linux/netdev_features.h | 7 ++++++- include/linux/netdevice.h | 6 +++--- include/net/vxlan.h | 2 +- net/8021q/vlan_dev.c | 2 +- net/core/dev.c | 10 +++++----- net/core/ethtool.c | 2 +- net/ipv4/tcp.c | 4 ++-- 25 files changed, 43 insertions(+), 39 deletions(-) (limited to 'net/core/dev.c') diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c index fe0e7a6f4d72..cab99fd44c8e 100644 --- a/drivers/net/bonding/bond_main.c +++ b/drivers/net/bonding/bond_main.c @@ -1067,12 +1067,12 @@ static netdev_features_t bond_fix_features(struct net_device *dev, return features; } -#define BOND_VLAN_FEATURES (NETIF_F_ALL_CSUM | NETIF_F_SG | \ +#define BOND_VLAN_FEATURES (NETIF_F_HW_CSUM | NETIF_F_SG | \ NETIF_F_FRAGLIST | NETIF_F_ALL_TSO | \ NETIF_F_HIGHDMA | NETIF_F_LRO) -#define BOND_ENC_FEATURES (NETIF_F_ALL_CSUM | NETIF_F_SG | NETIF_F_RXCSUM |\ - NETIF_F_ALL_TSO) +#define BOND_ENC_FEATURES (NETIF_F_HW_CSUM | NETIF_F_SG | \ + NETIF_F_RXCSUM | NETIF_F_ALL_TSO) static void bond_compute_features(struct bonding *bond) { @@ -4182,7 +4182,6 @@ void bond_setup(struct net_device *bond_dev) NETIF_F_HW_VLAN_CTAG_RX | NETIF_F_HW_VLAN_CTAG_FILTER; - bond_dev->hw_features &= ~(NETIF_F_ALL_CSUM & ~NETIF_F_HW_CSUM); bond_dev->hw_features |= NETIF_F_GSO_ENCAP_ALL; bond_dev->features |= bond_dev->hw_features; } diff --git a/drivers/net/ethernet/emulex/benet/be_main.c b/drivers/net/ethernet/emulex/benet/be_main.c index 4cab8879f5ae..34e324f20d80 100644 --- a/drivers/net/ethernet/emulex/benet/be_main.c +++ b/drivers/net/ethernet/emulex/benet/be_main.c @@ -5289,7 +5289,7 @@ static netdev_features_t be_features_check(struct sk_buff *skb, skb->inner_protocol != htons(ETH_P_TEB) || skb_inner_mac_header(skb) - skb_transport_header(skb) != sizeof(struct udphdr) + sizeof(struct vxlanhdr)) - return features & ~(NETIF_F_ALL_CSUM | NETIF_F_GSO_MASK); + return features & ~(NETIF_F_CSUM_MASK | NETIF_F_GSO_MASK); return features; } diff --git a/drivers/net/ethernet/ibm/ibmveth.c b/drivers/net/ethernet/ibm/ibmveth.c index 7af870a3c549..6691b5a45b9d 100644 --- a/drivers/net/ethernet/ibm/ibmveth.c +++ b/drivers/net/ethernet/ibm/ibmveth.c @@ -763,7 +763,7 @@ static netdev_features_t ibmveth_fix_features(struct net_device *dev, */ if (!(features & NETIF_F_RXCSUM)) - features &= ~NETIF_F_ALL_CSUM; + features &= ~NETIF_F_CSUM_MASK; return features; } @@ -928,7 +928,8 @@ static int ibmveth_set_features(struct net_device *dev, rc1 = ibmveth_set_csum_offload(dev, rx_csum); if (rc1 && !adapter->rx_csum) dev->features = - features & ~(NETIF_F_ALL_CSUM | NETIF_F_RXCSUM); + features & ~(NETIF_F_CSUM_MASK | + NETIF_F_RXCSUM); } if (large_send != adapter->large_send) { diff --git a/drivers/net/ethernet/intel/fm10k/fm10k_netdev.c b/drivers/net/ethernet/intel/fm10k/fm10k_netdev.c index d9854d39576d..83ddf362ea77 100644 --- a/drivers/net/ethernet/intel/fm10k/fm10k_netdev.c +++ b/drivers/net/ethernet/intel/fm10k/fm10k_netdev.c @@ -1357,7 +1357,7 @@ static netdev_features_t fm10k_features_check(struct sk_buff *skb, if (!skb->encapsulation || fm10k_tx_encap_offload(skb)) return features; - return features & ~(NETIF_F_ALL_CSUM | NETIF_F_GSO_MASK); + return features & ~(NETIF_F_CSUM_MASK | NETIF_F_GSO_MASK); } static const struct net_device_ops fm10k_netdev_ops = { diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c b/drivers/net/ethernet/intel/i40e/i40e_main.c index a63d980f478e..c284e4341c7c 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_main.c +++ b/drivers/net/ethernet/intel/i40e/i40e_main.c @@ -8766,7 +8766,7 @@ static netdev_features_t i40e_features_check(struct sk_buff *skb, if (skb->encapsulation && (skb_inner_mac_header(skb) - skb_transport_header(skb) > I40E_MAX_TUNNEL_HDR_LEN)) - return features & ~(NETIF_F_ALL_CSUM | NETIF_F_GSO_MASK); + return features & ~(NETIF_F_CSUM_MASK | NETIF_F_GSO_MASK); return features; } diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c index 9f27001cac1f..fca35aa90d0f 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c @@ -8598,7 +8598,7 @@ ixgbe_features_check(struct sk_buff *skb, struct net_device *dev, if (unlikely(skb_inner_mac_header(skb) - skb_transport_header(skb) > IXGBE_MAX_TUNNEL_HDR_LEN)) - return features & ~NETIF_F_ALL_CSUM; + return features & ~NETIF_F_CSUM_MASK; return features; } diff --git a/drivers/net/ethernet/jme.c b/drivers/net/ethernet/jme.c index 060dd3922974..b1de7afd4116 100644 --- a/drivers/net/ethernet/jme.c +++ b/drivers/net/ethernet/jme.c @@ -2753,7 +2753,7 @@ static netdev_features_t jme_fix_features(struct net_device *netdev, netdev_features_t features) { if (netdev->mtu > 1900) - features &= ~(NETIF_F_ALL_TSO | NETIF_F_ALL_CSUM); + features &= ~(NETIF_F_ALL_TSO | NETIF_F_CSUM_MASK); return features; } diff --git a/drivers/net/ethernet/marvell/sky2.c b/drivers/net/ethernet/marvell/sky2.c index 5606a043063e..ec0a22119e09 100644 --- a/drivers/net/ethernet/marvell/sky2.c +++ b/drivers/net/ethernet/marvell/sky2.c @@ -4380,7 +4380,7 @@ static netdev_features_t sky2_fix_features(struct net_device *dev, */ if (dev->mtu > ETH_DATA_LEN && hw->chip_id == CHIP_ID_YUKON_EC_U) { netdev_info(dev, "checksum offload not possible with jumbo frames\n"); - features &= ~(NETIF_F_TSO|NETIF_F_SG|NETIF_F_ALL_CSUM); + features &= ~(NETIF_F_TSO | NETIF_F_SG | NETIF_F_CSUM_MASK); } /* Some hardware requires receive checksum for RSS to work. */ diff --git a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c index 038ac6b14a60..7060539d276a 100644 --- a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c +++ b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c @@ -2071,7 +2071,7 @@ nfp_net_features_check(struct sk_buff *skb, struct net_device *dev, l4_hdr = ipv6_hdr(skb)->nexthdr; break; default: - return features & ~(NETIF_F_ALL_CSUM | NETIF_F_GSO_MASK); + return features & ~(NETIF_F_CSUM_MASK | NETIF_F_GSO_MASK); } if (skb->inner_protocol_type != ENCAP_TYPE_ETHER || @@ -2080,7 +2080,7 @@ nfp_net_features_check(struct sk_buff *skb, struct net_device *dev, (l4_hdr == IPPROTO_UDP && (skb_inner_mac_header(skb) - skb_transport_header(skb) != sizeof(struct udphdr) + sizeof(struct vxlanhdr)))) - return features & ~(NETIF_F_ALL_CSUM | NETIF_F_GSO_MASK); + return features & ~(NETIF_F_CSUM_MASK | NETIF_F_GSO_MASK); return features; } diff --git a/drivers/net/ethernet/oki-semi/pch_gbe/pch_gbe_param.c b/drivers/net/ethernet/oki-semi/pch_gbe/pch_gbe_param.c index 08d4be616064..e097e6baaac4 100644 --- a/drivers/net/ethernet/oki-semi/pch_gbe/pch_gbe_param.c +++ b/drivers/net/ethernet/oki-semi/pch_gbe/pch_gbe_param.c @@ -500,7 +500,7 @@ void pch_gbe_check_options(struct pch_gbe_adapter *adapter) val = XsumTX; pch_gbe_validate_option(&val, &opt, adapter); if (!val) - dev->features &= ~NETIF_F_ALL_CSUM; + dev->features &= ~NETIF_F_CSUM_MASK; } { /* Flow Control */ static const struct pch_gbe_option opt = { diff --git a/drivers/net/ethernet/sfc/efx.c b/drivers/net/ethernet/sfc/efx.c index b405349a570c..1fe13c733c1e 100644 --- a/drivers/net/ethernet/sfc/efx.c +++ b/drivers/net/ethernet/sfc/efx.c @@ -3131,7 +3131,7 @@ static int efx_pci_probe(struct pci_dev *pci_dev, if (efx->type->offload_features & NETIF_F_V6_CSUM) net_dev->features |= NETIF_F_TSO6; /* Mask for features that also apply to VLAN devices */ - net_dev->vlan_features |= (NETIF_F_ALL_CSUM | NETIF_F_SG | + net_dev->vlan_features |= (NETIF_F_HW_CSUM | NETIF_F_SG | NETIF_F_HIGHDMA | NETIF_F_ALL_TSO | NETIF_F_RXCSUM); /* All offloads can be toggled */ diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c index 3c6549aee11d..0b0fea73a7a7 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c @@ -2402,7 +2402,7 @@ static netdev_features_t stmmac_fix_features(struct net_device *dev, features &= ~NETIF_F_RXCSUM; if (!priv->plat->tx_coe) - features &= ~NETIF_F_ALL_CSUM; + features &= ~NETIF_F_CSUM_MASK; /* Some GMAC devices have a bugged Jumbo frame support that * needs to have the Tx COE disabled for oversized frames @@ -2410,7 +2410,7 @@ static netdev_features_t stmmac_fix_features(struct net_device *dev, * the TX csum insertionin the TDES and not use SF. */ if (priv->plat->bugged_jumbo && (dev->mtu > ETH_DATA_LEN)) - features &= ~NETIF_F_ALL_CSUM; + features &= ~NETIF_F_CSUM_MASK; return features; } diff --git a/drivers/net/ipvlan/ipvlan_main.c b/drivers/net/ipvlan/ipvlan_main.c index a9268db4e349..f94392d07126 100644 --- a/drivers/net/ipvlan/ipvlan_main.c +++ b/drivers/net/ipvlan/ipvlan_main.c @@ -88,7 +88,7 @@ static struct lock_class_key ipvlan_netdev_xmit_lock_key; static struct lock_class_key ipvlan_netdev_addr_lock_key; #define IPVLAN_FEATURES \ - (NETIF_F_SG | NETIF_F_ALL_CSUM | NETIF_F_HIGHDMA | NETIF_F_FRAGLIST | \ + (NETIF_F_SG | NETIF_F_HW_CSUM | NETIF_F_HIGHDMA | NETIF_F_FRAGLIST | \ NETIF_F_GSO | NETIF_F_TSO | NETIF_F_UFO | NETIF_F_GSO_ROBUST | \ NETIF_F_TSO_ECN | NETIF_F_TSO6 | NETIF_F_GRO | NETIF_F_RXCSUM | \ NETIF_F_HW_VLAN_CTAG_FILTER | NETIF_F_HW_VLAN_STAG_FILTER) diff --git a/drivers/net/macvlan.c b/drivers/net/macvlan.c index 06c8bfeaccd6..ae3b486fb663 100644 --- a/drivers/net/macvlan.c +++ b/drivers/net/macvlan.c @@ -762,7 +762,7 @@ static struct lock_class_key macvlan_netdev_addr_lock_key; NETIF_F_GSO_ROBUST) #define MACVLAN_FEATURES \ - (NETIF_F_SG | NETIF_F_ALL_CSUM | NETIF_F_HIGHDMA | NETIF_F_FRAGLIST | \ + (NETIF_F_SG | NETIF_F_HW_CSUM | NETIF_F_HIGHDMA | NETIF_F_FRAGLIST | \ NETIF_F_GSO | NETIF_F_TSO | NETIF_F_UFO | NETIF_F_LRO | \ NETIF_F_TSO_ECN | NETIF_F_TSO6 | NETIF_F_GRO | NETIF_F_RXCSUM | \ NETIF_F_HW_VLAN_CTAG_FILTER | NETIF_F_HW_VLAN_STAG_FILTER) diff --git a/drivers/net/macvtap.c b/drivers/net/macvtap.c index 0fc521941c71..d636d051fac8 100644 --- a/drivers/net/macvtap.c +++ b/drivers/net/macvtap.c @@ -388,7 +388,7 @@ static rx_handler_result_t macvtap_handle_frame(struct sk_buff **pskb) * check, we either support them all or none. */ if (skb->ip_summed == CHECKSUM_PARTIAL && - !(features & NETIF_F_ALL_CSUM) && + !(features & NETIF_F_CSUM_MASK) && skb_checksum_help(skb)) goto drop; skb_queue_tail(&q->sk.sk_receive_queue, skb); diff --git a/drivers/net/team/team.c b/drivers/net/team/team.c index 059c0f60a2b2..915f60fce186 100644 --- a/drivers/net/team/team.c +++ b/drivers/net/team/team.c @@ -981,7 +981,7 @@ static void team_port_disable(struct team *team, team_lower_state_changed(port); } -#define TEAM_VLAN_FEATURES (NETIF_F_ALL_CSUM | NETIF_F_SG | \ +#define TEAM_VLAN_FEATURES (NETIF_F_HW_CSUM | NETIF_F_SG | \ NETIF_F_FRAGLIST | NETIF_F_ALL_TSO | \ NETIF_F_HIGHDMA | NETIF_F_LRO) @@ -2091,7 +2091,6 @@ static void team_setup(struct net_device *dev) NETIF_F_HW_VLAN_CTAG_RX | NETIF_F_HW_VLAN_CTAG_FILTER; - dev->hw_features &= ~(NETIF_F_ALL_CSUM & ~NETIF_F_HW_CSUM); dev->features |= dev->hw_features; } diff --git a/drivers/net/usb/r8152.c b/drivers/net/usb/r8152.c index d9427ca3dba7..34642a9583e0 100644 --- a/drivers/net/usb/r8152.c +++ b/drivers/net/usb/r8152.c @@ -1986,7 +1986,7 @@ rtl8152_features_check(struct sk_buff *skb, struct net_device *dev, int offset = skb_transport_offset(skb); if ((mss || skb->ip_summed == CHECKSUM_PARTIAL) && offset > max_offset) - features &= ~(NETIF_F_ALL_CSUM | NETIF_F_GSO_MASK); + features &= ~(NETIF_F_CSUM_MASK | NETIF_F_GSO_MASK); else if ((skb->len + sizeof(struct tx_desc)) > agg_buf_sz) features &= ~NETIF_F_GSO_MASK; diff --git a/drivers/staging/lustre/lnet/klnds/socklnd/socklnd_lib.c b/drivers/staging/lustre/lnet/klnds/socklnd/socklnd_lib.c index 679785b0209c..9de4f23910d8 100644 --- a/drivers/staging/lustre/lnet/klnds/socklnd/socklnd_lib.c +++ b/drivers/staging/lustre/lnet/klnds/socklnd/socklnd_lib.c @@ -69,7 +69,7 @@ ksocknal_lib_zc_capable(ksock_conn_t *conn) /* ZC if the socket supports scatter/gather and doesn't need software * checksums */ - return ((caps & NETIF_F_SG) != 0 && (caps & NETIF_F_ALL_CSUM) != 0); + return ((caps & NETIF_F_SG) != 0 && (caps & NETIF_F_CSUM_MASK) != 0); } int diff --git a/include/linux/netdev_features.h b/include/linux/netdev_features.h index 6395f8309393..2c4e94ab88da 100644 --- a/include/linux/netdev_features.h +++ b/include/linux/netdev_features.h @@ -149,7 +149,12 @@ enum { #define NETIF_F_GEN_CSUM NETIF_F_HW_CSUM #define NETIF_F_V4_CSUM (NETIF_F_GEN_CSUM | NETIF_F_IP_CSUM) #define NETIF_F_V6_CSUM (NETIF_F_GEN_CSUM | NETIF_F_IPV6_CSUM) -#define NETIF_F_ALL_CSUM (NETIF_F_V4_CSUM | NETIF_F_V6_CSUM) + +/* List of IP checksum features. Note that NETIF_HW_CSUM should not be + * set in features when NETIF_F_IP_CSUM or NETIF_F_IPV6_CSUM are set-- + * this would be contradictory + */ +#define NETIF_F_CSUM_MASK (NETIF_F_V4_CSUM | NETIF_F_V6_CSUM) #define NETIF_F_ALL_TSO (NETIF_F_TSO | NETIF_F_TSO6 | NETIF_F_TSO_ECN) diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 1bb21ff0fa64..a54223a113b1 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -3763,12 +3763,12 @@ static inline netdev_features_t netdev_intersect_features(netdev_features_t f1, netdev_features_t f2) { if (f1 & NETIF_F_GEN_CSUM) - f1 |= (NETIF_F_ALL_CSUM & ~NETIF_F_GEN_CSUM); + f1 |= (NETIF_F_CSUM_MASK & ~NETIF_F_GEN_CSUM); if (f2 & NETIF_F_GEN_CSUM) - f2 |= (NETIF_F_ALL_CSUM & ~NETIF_F_GEN_CSUM); + f2 |= (NETIF_F_CSUM_MASK & ~NETIF_F_GEN_CSUM); f1 &= f2; if (f1 & NETIF_F_GEN_CSUM) - f1 &= ~(NETIF_F_ALL_CSUM & ~NETIF_F_GEN_CSUM); + f1 &= ~(NETIF_F_CSUM_MASK & ~NETIF_F_GEN_CSUM); return f1; } diff --git a/include/net/vxlan.h b/include/net/vxlan.h index c1c899c3a51b..b5a1aec1a167 100644 --- a/include/net/vxlan.h +++ b/include/net/vxlan.h @@ -232,7 +232,7 @@ static inline netdev_features_t vxlan_features_check(struct sk_buff *skb, skb->inner_protocol != htons(ETH_P_TEB) || (skb_inner_mac_header(skb) - skb_transport_header(skb) != sizeof(struct udphdr) + sizeof(struct vxlanhdr)))) - return features & ~(NETIF_F_ALL_CSUM | NETIF_F_GSO_MASK); + return features & ~(NETIF_F_CSUM_MASK | NETIF_F_GSO_MASK); return features; } diff --git a/net/8021q/vlan_dev.c b/net/8021q/vlan_dev.c index 45b74e875381..ad5e2fd1012c 100644 --- a/net/8021q/vlan_dev.c +++ b/net/8021q/vlan_dev.c @@ -543,7 +543,7 @@ static int vlan_dev_init(struct net_device *dev) (1<<__LINK_STATE_DORMANT))) | (1<<__LINK_STATE_PRESENT); - dev->hw_features = NETIF_F_ALL_CSUM | NETIF_F_SG | + dev->hw_features = NETIF_F_HW_CSUM | NETIF_F_SG | NETIF_F_FRAGLIST | NETIF_F_GSO_SOFTWARE | NETIF_F_HIGHDMA | NETIF_F_SCTP_CRC | NETIF_F_ALL_FCOE; diff --git a/net/core/dev.c b/net/core/dev.c index 8f705fcedb94..5a3b5a404642 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2645,7 +2645,7 @@ static netdev_features_t harmonize_features(struct sk_buff *skb, if (skb->ip_summed != CHECKSUM_NONE && !can_checksum_protocol(features, type)) { - features &= ~NETIF_F_ALL_CSUM; + features &= ~NETIF_F_CSUM_MASK; } else if (illegal_highdma(skb->dev, skb)) { features &= ~NETIF_F_SG; } @@ -2792,7 +2792,7 @@ static struct sk_buff *validate_xmit_skb(struct sk_buff *skb, struct net_device else skb_set_transport_header(skb, skb_checksum_start_offset(skb)); - if (!(features & NETIF_F_ALL_CSUM) && + if (!(features & NETIF_F_CSUM_MASK) && skb_checksum_help(skb)) goto out_kfree_skb; } @@ -7572,15 +7572,15 @@ netdev_features_t netdev_increment_features(netdev_features_t all, netdev_features_t one, netdev_features_t mask) { if (mask & NETIF_F_GEN_CSUM) - mask |= NETIF_F_ALL_CSUM; + mask |= NETIF_F_CSUM_MASK; mask |= NETIF_F_VLAN_CHALLENGED; - all |= one & (NETIF_F_ONE_FOR_ALL|NETIF_F_ALL_CSUM) & mask; + all |= one & (NETIF_F_ONE_FOR_ALL | NETIF_F_CSUM_MASK) & mask; all &= one | ~NETIF_F_ALL_FOR_ALL; /* If one device supports hw checksumming, set for all. */ if (all & NETIF_F_GEN_CSUM) - all &= ~(NETIF_F_ALL_CSUM & ~NETIF_F_GEN_CSUM); + all &= ~(NETIF_F_CSUM_MASK & ~NETIF_F_GEN_CSUM); return all; } diff --git a/net/core/ethtool.c b/net/core/ethtool.c index 4a0cab85d67d..09948a726347 100644 --- a/net/core/ethtool.c +++ b/net/core/ethtool.c @@ -235,7 +235,7 @@ static netdev_features_t ethtool_get_feature_mask(u32 eth_cmd) switch (eth_cmd) { case ETHTOOL_GTXCSUM: case ETHTOOL_STXCSUM: - return NETIF_F_ALL_CSUM | NETIF_F_SCTP_CRC; + return NETIF_F_CSUM_MASK | NETIF_F_SCTP_CRC; case ETHTOOL_GRXCSUM: case ETHTOOL_SRXCSUM: return NETIF_F_RXCSUM; diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index c82cca18c90f..cf7ef7be79f0 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -1018,7 +1018,7 @@ int tcp_sendpage(struct sock *sk, struct page *page, int offset, ssize_t res; if (!(sk->sk_route_caps & NETIF_F_SG) || - !(sk->sk_route_caps & NETIF_F_ALL_CSUM)) + !(sk->sk_route_caps & NETIF_F_CSUM_MASK)) return sock_no_sendpage(sk->sk_socket, page, offset, size, flags); @@ -1175,7 +1175,7 @@ new_segment: /* * Check whether we can use HW checksum. */ - if (sk->sk_route_caps & NETIF_F_ALL_CSUM) + if (sk->sk_route_caps & NETIF_F_CSUM_MASK) skb->ip_summed = CHECKSUM_PARTIAL; skb_entail(sk, skb); -- cgit v1.2.3 From c8cd0989bd151fda87bbf10887b3df18021284bc Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Mon, 14 Dec 2015 11:19:44 -0800 Subject: net: Eliminate NETIF_F_GEN_CSUM and NETIF_F_V[46]_CSUM These netif flags are unnecessary convolutions. It is more straightforward to just use NETIF_F_HW_CSUM, NETIF_F_IP_CSUM, and NETIF_F_IPV6_CSUM directly. This patch also: - Cleans up can_checksum_protocol - Simplifies netdev_intersect_features Signed-off-by: Tom Herbert Signed-off-by: David S. Miller --- drivers/net/ethernet/sfc/efx.c | 2 +- drivers/net/macvlan.c | 2 +- include/linux/if_vlan.h | 2 +- include/linux/netdev_features.h | 9 +++---- include/linux/netdevice.h | 40 ++++++++++++++++++++------------ net/core/dev.c | 12 +++++----- net/core/pktgen.c | 4 ++-- net/ipv4/ip_output.c | 2 +- net/ipv4/netfilter/nf_nat_l3proto_ipv4.c | 3 ++- net/ipv4/udp.c | 3 ++- net/ipv4/udp_offload.c | 5 ++-- net/ipv6/ip6_output.c | 2 +- net/ipv6/netfilter/nf_nat_l3proto_ipv6.c | 3 ++- 13 files changed, 50 insertions(+), 39 deletions(-) (limited to 'net/core/dev.c') diff --git a/drivers/net/ethernet/sfc/efx.c b/drivers/net/ethernet/sfc/efx.c index 1fe13c733c1e..6f697438545d 100644 --- a/drivers/net/ethernet/sfc/efx.c +++ b/drivers/net/ethernet/sfc/efx.c @@ -3128,7 +3128,7 @@ static int efx_pci_probe(struct pci_dev *pci_dev, net_dev->features |= (efx->type->offload_features | NETIF_F_SG | NETIF_F_HIGHDMA | NETIF_F_TSO | NETIF_F_RXCSUM); - if (efx->type->offload_features & NETIF_F_V6_CSUM) + if (efx->type->offload_features & (NETIF_F_IPV6_CSUM | NETIF_F_HW_CSUM)) net_dev->features |= NETIF_F_TSO6; /* Mask for features that also apply to VLAN devices */ net_dev->vlan_features |= (NETIF_F_HW_CSUM | NETIF_F_SG | diff --git a/drivers/net/macvlan.c b/drivers/net/macvlan.c index ae3b486fb663..6a57a005e0ca 100644 --- a/drivers/net/macvlan.c +++ b/drivers/net/macvlan.c @@ -758,7 +758,7 @@ static struct lock_class_key macvlan_netdev_xmit_lock_key; static struct lock_class_key macvlan_netdev_addr_lock_key; #define ALWAYS_ON_FEATURES \ - (NETIF_F_SG | NETIF_F_GEN_CSUM | NETIF_F_GSO_SOFTWARE | NETIF_F_LLTX | \ + (NETIF_F_SG | NETIF_F_HW_CSUM | NETIF_F_GSO_SOFTWARE | NETIF_F_LLTX | \ NETIF_F_GSO_ROBUST) #define MACVLAN_FEATURES \ diff --git a/include/linux/if_vlan.h b/include/linux/if_vlan.h index 05f5879821b8..a5f6ce6b578c 100644 --- a/include/linux/if_vlan.h +++ b/include/linux/if_vlan.h @@ -621,7 +621,7 @@ static inline netdev_features_t vlan_features_check(const struct sk_buff *skb, NETIF_F_SG | NETIF_F_HIGHDMA | NETIF_F_FRAGLIST | - NETIF_F_GEN_CSUM | + NETIF_F_HW_CSUM | NETIF_F_HW_VLAN_CTAG_TX | NETIF_F_HW_VLAN_STAG_TX); diff --git a/include/linux/netdev_features.h b/include/linux/netdev_features.h index 2c4e94ab88da..d9654f0eecb3 100644 --- a/include/linux/netdev_features.h +++ b/include/linux/netdev_features.h @@ -146,15 +146,12 @@ enum { #define NETIF_F_GSO_SOFTWARE (NETIF_F_TSO | NETIF_F_TSO_ECN | \ NETIF_F_TSO6 | NETIF_F_UFO) -#define NETIF_F_GEN_CSUM NETIF_F_HW_CSUM -#define NETIF_F_V4_CSUM (NETIF_F_GEN_CSUM | NETIF_F_IP_CSUM) -#define NETIF_F_V6_CSUM (NETIF_F_GEN_CSUM | NETIF_F_IPV6_CSUM) - -/* List of IP checksum features. Note that NETIF_HW_CSUM should not be +/* List of IP checksum features. Note that NETIF_F_ HW_CSUM should not be * set in features when NETIF_F_IP_CSUM or NETIF_F_IPV6_CSUM are set-- * this would be contradictory */ -#define NETIF_F_CSUM_MASK (NETIF_F_V4_CSUM | NETIF_F_V6_CSUM) +#define NETIF_F_CSUM_MASK (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM | \ + NETIF_F_HW_CSUM) #define NETIF_F_ALL_TSO (NETIF_F_TSO | NETIF_F_TSO6 | NETIF_F_TSO_ECN) diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index a54223a113b1..283984b67cd9 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -3691,13 +3691,24 @@ __be16 skb_network_protocol(struct sk_buff *skb, int *depth); static inline bool can_checksum_protocol(netdev_features_t features, __be16 protocol) { - return ((features & NETIF_F_GEN_CSUM) || - ((features & NETIF_F_V4_CSUM) && - protocol == htons(ETH_P_IP)) || - ((features & NETIF_F_V6_CSUM) && - protocol == htons(ETH_P_IPV6)) || - ((features & NETIF_F_FCOE_CRC) && - protocol == htons(ETH_P_FCOE))); + if (protocol == htons(ETH_P_FCOE)) + return !!(features & NETIF_F_FCOE_CRC); + + /* Assume this is an IP checksum (not SCTP CRC) */ + + if (features & NETIF_F_HW_CSUM) { + /* Can checksum everything */ + return true; + } + + switch (protocol) { + case htons(ETH_P_IP): + return !!(features & NETIF_F_IP_CSUM); + case htons(ETH_P_IPV6): + return !!(features & NETIF_F_IPV6_CSUM); + default: + return false; + } } #ifdef CONFIG_BUG @@ -3762,15 +3773,14 @@ void linkwatch_run_queue(void); static inline netdev_features_t netdev_intersect_features(netdev_features_t f1, netdev_features_t f2) { - if (f1 & NETIF_F_GEN_CSUM) - f1 |= (NETIF_F_CSUM_MASK & ~NETIF_F_GEN_CSUM); - if (f2 & NETIF_F_GEN_CSUM) - f2 |= (NETIF_F_CSUM_MASK & ~NETIF_F_GEN_CSUM); - f1 &= f2; - if (f1 & NETIF_F_GEN_CSUM) - f1 &= ~(NETIF_F_CSUM_MASK & ~NETIF_F_GEN_CSUM); + if ((f1 ^ f2) & NETIF_F_HW_CSUM) { + if (f1 & NETIF_F_HW_CSUM) + f1 |= (NETIF_F_IP_CSUM|NETIF_F_IP_CSUM); + else + f2 |= (NETIF_F_IP_CSUM|NETIF_F_IP_CSUM); + } - return f1; + return f1 & f2; } static inline netdev_features_t netdev_get_wanted_features( diff --git a/net/core/dev.c b/net/core/dev.c index 5a3b5a404642..45b013f27625 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -6467,9 +6467,9 @@ static netdev_features_t netdev_fix_features(struct net_device *dev, /* UFO needs SG and checksumming */ if (features & NETIF_F_UFO) { /* maybe split UFO into V4 and V6? */ - if (!((features & NETIF_F_GEN_CSUM) || - (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM)) - == (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) { + if (!(features & NETIF_F_HW_CSUM) && + ((features & (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM)) != + (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM))) { netdev_dbg(dev, "Dropping NETIF_F_UFO since no checksum offload features.\n"); features &= ~NETIF_F_UFO; @@ -7571,7 +7571,7 @@ static int dev_cpu_callback(struct notifier_block *nfb, netdev_features_t netdev_increment_features(netdev_features_t all, netdev_features_t one, netdev_features_t mask) { - if (mask & NETIF_F_GEN_CSUM) + if (mask & NETIF_F_HW_CSUM) mask |= NETIF_F_CSUM_MASK; mask |= NETIF_F_VLAN_CHALLENGED; @@ -7579,8 +7579,8 @@ netdev_features_t netdev_increment_features(netdev_features_t all, all &= one | ~NETIF_F_ALL_FOR_ALL; /* If one device supports hw checksumming, set for all. */ - if (all & NETIF_F_GEN_CSUM) - all &= ~(NETIF_F_CSUM_MASK & ~NETIF_F_GEN_CSUM); + if (all & NETIF_F_HW_CSUM) + all &= ~(NETIF_F_CSUM_MASK & ~NETIF_F_HW_CSUM); return all; } diff --git a/net/core/pktgen.c b/net/core/pktgen.c index de8d5cc5eb24..2be144498bcf 100644 --- a/net/core/pktgen.c +++ b/net/core/pktgen.c @@ -2898,7 +2898,7 @@ static struct sk_buff *fill_packet_ipv4(struct net_device *odev, if (!(pkt_dev->flags & F_UDPCSUM)) { skb->ip_summed = CHECKSUM_NONE; - } else if (odev->features & NETIF_F_V4_CSUM) { + } else if (odev->features & (NETIF_F_HW_CSUM | NETIF_F_IP_CSUM)) { skb->ip_summed = CHECKSUM_PARTIAL; skb->csum = 0; udp4_hwcsum(skb, iph->saddr, iph->daddr); @@ -3032,7 +3032,7 @@ static struct sk_buff *fill_packet_ipv6(struct net_device *odev, if (!(pkt_dev->flags & F_UDPCSUM)) { skb->ip_summed = CHECKSUM_NONE; - } else if (odev->features & NETIF_F_V6_CSUM) { + } else if (odev->features & (NETIF_F_HW_CSUM | NETIF_F_IPV6_CSUM)) { skb->ip_summed = CHECKSUM_PARTIAL; skb->csum_start = skb_transport_header(skb) - skb->head; skb->csum_offset = offsetof(struct udphdr, check); diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index e0b94cd843d7..568e2bc0d93d 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -911,7 +911,7 @@ static int __ip_append_data(struct sock *sk, */ if (transhdrlen && length + fragheaderlen <= mtu && - rt->dst.dev->features & NETIF_F_V4_CSUM && + rt->dst.dev->features & (NETIF_F_HW_CSUM | NETIF_F_IP_CSUM) && !(flags & MSG_MORE) && !exthdrlen) csummode = CHECKSUM_PARTIAL; diff --git a/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c b/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c index 5075b7ecd26d..61c7cc22ea68 100644 --- a/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c +++ b/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c @@ -132,7 +132,8 @@ static void nf_nat_ipv4_csum_recalc(struct sk_buff *skb, if (skb->ip_summed != CHECKSUM_PARTIAL) { if (!(rt->rt_flags & RTCF_LOCAL) && - (!skb->dev || skb->dev->features & NETIF_F_V4_CSUM)) { + (!skb->dev || skb->dev->features & + (NETIF_F_IP_CSUM | NETIF_F_HW_CSUM))) { skb->ip_summed = CHECKSUM_PARTIAL; skb->csum_start = skb_headroom(skb) + skb_network_offset(skb) + diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 0c7b0e61b917..8841e984f8bf 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -772,7 +772,8 @@ void udp_set_csum(bool nocheck, struct sk_buff *skb, else if (skb_is_gso(skb)) uh->check = ~udp_v4_check(len, saddr, daddr, 0); else if (skb_dst(skb) && skb_dst(skb)->dev && - (skb_dst(skb)->dev->features & NETIF_F_V4_CSUM)) { + (skb_dst(skb)->dev->features & + (NETIF_F_IP_CSUM | NETIF_F_HW_CSUM))) { BUG_ON(skb->ip_summed == CHECKSUM_PARTIAL); diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c index f9386160cbee..130042660181 100644 --- a/net/ipv4/udp_offload.c +++ b/net/ipv4/udp_offload.c @@ -60,8 +60,9 @@ static struct sk_buff *__skb_udp_tunnel_segment(struct sk_buff *skb, /* Try to offload checksum if possible */ offload_csum = !!(need_csum && - (skb->dev->features & - (is_ipv6 ? NETIF_F_V6_CSUM : NETIF_F_V4_CSUM))); + ((skb->dev->features & NETIF_F_HW_CSUM) || + (skb->dev->features & (is_ipv6 ? + NETIF_F_IPV6_CSUM : NETIF_F_IP_CSUM)))); /* segment inner packet. */ enc_features = skb->dev->hw_enc_features & features; diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index e6a7bd15b9b7..2f748452b4aa 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -1322,7 +1322,7 @@ emsgsize: headersize == sizeof(struct ipv6hdr) && length < mtu - headersize && !(flags & MSG_MORE) && - rt->dst.dev->features & NETIF_F_V6_CSUM) + rt->dst.dev->features & (NETIF_F_IPV6_CSUM | NETIF_F_HW_CSUM)) csummode = CHECKSUM_PARTIAL; if (sk->sk_type == SOCK_DGRAM || sk->sk_type == SOCK_RAW) { diff --git a/net/ipv6/netfilter/nf_nat_l3proto_ipv6.c b/net/ipv6/netfilter/nf_nat_l3proto_ipv6.c index 238e70c3f7b7..6ce309928841 100644 --- a/net/ipv6/netfilter/nf_nat_l3proto_ipv6.c +++ b/net/ipv6/netfilter/nf_nat_l3proto_ipv6.c @@ -136,7 +136,8 @@ static void nf_nat_ipv6_csum_recalc(struct sk_buff *skb, if (skb->ip_summed != CHECKSUM_PARTIAL) { if (!(rt->rt6i_flags & RTF_LOCAL) && - (!skb->dev || skb->dev->features & NETIF_F_V6_CSUM)) { + (!skb->dev || skb->dev->features & + (NETIF_F_IPV6_CSUM | NETIF_F_HW_CSUM))) { skb->ip_summed = CHECKSUM_PARTIAL; skb->csum_start = skb_headroom(skb) + skb_network_offset(skb) + -- cgit v1.2.3 From 6ae23ad36253a8033c5714c52b691b84456487c5 Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Mon, 14 Dec 2015 11:19:46 -0800 Subject: net: Add driver helper functions to determine checksum offloadability Add skb_csum_offload_chk driver helper function to determine if a device with limited checksum offload capabilities is able to offload the checksum for a given packet. This patch includes: - The skb_csum_offload_chk function. Returns true if checksum is offloadable, else false. Optionally, in the case that the checksum is not offloable, the function can call skb_checksum_help to resolve the checksum. skb_csum_offload_chk also returns whether the checksum refers to an encapsulated checksum. - Definition of skb_csum_offl_spec structure that caller uses to indicate rules about what it can offload (e.g. IPv4/v6, TCP/UDP only, whether encapsulated checksums can be offloaded, whether checksum with IPv6 extension headers can be offloaded). - Ancilary functions called skb_csum_offload_chk_help, skb_csum_off_chk_help_cmn, skb_csum_off_chk_help_cmn_v4_only. Signed-off-by: Tom Herbert Signed-off-by: David S. Miller --- include/linux/netdevice.h | 78 ++++++++++++++++++++++++++ net/core/dev.c | 136 ++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 214 insertions(+) (limited to 'net/core/dev.c') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 283984b67cd9..9fb6395967de 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -2522,6 +2522,71 @@ static inline void skb_gro_remcsum_cleanup(struct sk_buff *skb, remcsum_unadjust((__sum16 *)ptr, grc->delta); } +struct skb_csum_offl_spec { + __u16 ipv4_okay:1, + ipv6_okay:1, + encap_okay:1, + ip_options_okay:1, + ext_hdrs_okay:1, + tcp_okay:1, + udp_okay:1, + sctp_okay:1, + vlan_okay:1, + no_encapped_ipv6:1, + no_not_encapped:1; +}; + +bool __skb_csum_offload_chk(struct sk_buff *skb, + const struct skb_csum_offl_spec *spec, + bool *csum_encapped, + bool csum_help); + +static inline bool skb_csum_offload_chk(struct sk_buff *skb, + const struct skb_csum_offl_spec *spec, + bool *csum_encapped, + bool csum_help) +{ + if (skb->ip_summed != CHECKSUM_PARTIAL) + return false; + + return __skb_csum_offload_chk(skb, spec, csum_encapped, csum_help); +} + +static inline bool skb_csum_offload_chk_help(struct sk_buff *skb, + const struct skb_csum_offl_spec *spec) +{ + bool csum_encapped; + + return skb_csum_offload_chk(skb, spec, &csum_encapped, true); +} + +static inline bool skb_csum_off_chk_help_cmn(struct sk_buff *skb) +{ + static const struct skb_csum_offl_spec csum_offl_spec = { + .ipv4_okay = 1, + .ip_options_okay = 1, + .ipv6_okay = 1, + .vlan_okay = 1, + .tcp_okay = 1, + .udp_okay = 1, + }; + + return skb_csum_offload_chk_help(skb, &csum_offl_spec); +} + +static inline bool skb_csum_off_chk_help_cmn_v4_only(struct sk_buff *skb) +{ + static const struct skb_csum_offl_spec csum_offl_spec = { + .ipv4_okay = 1, + .ip_options_okay = 1, + .tcp_okay = 1, + .udp_okay = 1, + .vlan_okay = 1, + }; + + return skb_csum_offload_chk_help(skb, &csum_offl_spec); +} + static inline int dev_hard_header(struct sk_buff *skb, struct net_device *dev, unsigned short type, const void *daddr, const void *saddr, @@ -3711,6 +3776,19 @@ static inline bool can_checksum_protocol(netdev_features_t features, } } +/* Map an ethertype into IP protocol if possible */ +static inline int eproto_to_ipproto(int eproto) +{ + switch (eproto) { + case htons(ETH_P_IP): + return IPPROTO_IP; + case htons(ETH_P_IPV6): + return IPPROTO_IPV6; + default: + return -1; + } +} + #ifdef CONFIG_BUG void netdev_rx_csum_fault(struct net_device *dev); #else diff --git a/net/core/dev.c b/net/core/dev.c index 45b013f27625..914b4a24c654 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -138,6 +138,7 @@ #include #include #include +#include #include "net-sysfs.h" @@ -2471,6 +2472,141 @@ out: } EXPORT_SYMBOL(skb_checksum_help); +/* skb_csum_offload_check - Driver helper function to determine if a device + * with limited checksum offload capabilities is able to offload the checksum + * for a given packet. + * + * Arguments: + * skb - sk_buff for the packet in question + * spec - contains the description of what device can offload + * csum_encapped - returns true if the checksum being offloaded is + * encpasulated. That is it is checksum for the transport header + * in the inner headers. + * checksum_help - when set indicates that helper function should + * call skb_checksum_help if offload checks fail + * + * Returns: + * true: Packet has passed the checksum checks and should be offloadable to + * the device (a driver may still need to check for additional + * restrictions of its device) + * false: Checksum is not offloadable. If checksum_help was set then + * skb_checksum_help was called to resolve checksum for non-GSO + * packets and when IP protocol is not SCTP + */ +bool __skb_csum_offload_chk(struct sk_buff *skb, + const struct skb_csum_offl_spec *spec, + bool *csum_encapped, + bool csum_help) +{ + struct iphdr *iph; + struct ipv6hdr *ipv6; + void *nhdr; + int protocol; + u8 ip_proto; + + if (skb->protocol == htons(ETH_P_8021Q) || + skb->protocol == htons(ETH_P_8021AD)) { + if (!spec->vlan_okay) + goto need_help; + } + + /* We check whether the checksum refers to a transport layer checksum in + * the outermost header or an encapsulated transport layer checksum that + * corresponds to the inner headers of the skb. If the checksum is for + * something else in the packet we need help. + */ + if (skb_checksum_start_offset(skb) == skb_transport_offset(skb)) { + /* Non-encapsulated checksum */ + protocol = eproto_to_ipproto(vlan_get_protocol(skb)); + nhdr = skb_network_header(skb); + *csum_encapped = false; + if (spec->no_not_encapped) + goto need_help; + } else if (skb->encapsulation && spec->encap_okay && + skb_checksum_start_offset(skb) == + skb_inner_transport_offset(skb)) { + /* Encapsulated checksum */ + *csum_encapped = true; + switch (skb->inner_protocol_type) { + case ENCAP_TYPE_ETHER: + protocol = eproto_to_ipproto(skb->inner_protocol); + break; + case ENCAP_TYPE_IPPROTO: + protocol = skb->inner_protocol; + break; + } + nhdr = skb_inner_network_header(skb); + } else { + goto need_help; + } + + switch (protocol) { + case IPPROTO_IP: + if (!spec->ipv4_okay) + goto need_help; + iph = nhdr; + ip_proto = iph->protocol; + if (iph->ihl != 5 && !spec->ip_options_okay) + goto need_help; + break; + case IPPROTO_IPV6: + if (!spec->ipv6_okay) + goto need_help; + if (spec->no_encapped_ipv6 && *csum_encapped) + goto need_help; + ipv6 = nhdr; + nhdr += sizeof(*ipv6); + ip_proto = ipv6->nexthdr; + break; + default: + goto need_help; + } + +ip_proto_again: + switch (ip_proto) { + case IPPROTO_TCP: + if (!spec->tcp_okay || + skb->csum_offset != offsetof(struct tcphdr, check)) + goto need_help; + break; + case IPPROTO_UDP: + if (!spec->udp_okay || + skb->csum_offset != offsetof(struct udphdr, check)) + goto need_help; + break; + case IPPROTO_SCTP: + if (!spec->sctp_okay || + skb->csum_offset != offsetof(struct sctphdr, checksum)) + goto cant_help; + break; + case NEXTHDR_HOP: + case NEXTHDR_ROUTING: + case NEXTHDR_DEST: { + u8 *opthdr = nhdr; + + if (protocol != IPPROTO_IPV6 || !spec->ext_hdrs_okay) + goto need_help; + + ip_proto = opthdr[0]; + nhdr += (opthdr[1] + 1) << 3; + + goto ip_proto_again; + } + default: + goto need_help; + } + + /* Passed the tests for offloading checksum */ + return true; + +need_help: + if (csum_help && !skb_shinfo(skb)->gso_size) + skb_checksum_help(skb); +cant_help: + return false; +} +EXPORT_SYMBOL(__skb_csum_offload_chk); + __be16 skb_network_protocol(struct sk_buff *skb, int *depth) { __be16 type = skb->protocol; -- cgit v1.2.3 From 1f211a1b929c804100e138c5d3d656992cfd5622 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Thu, 7 Jan 2016 22:29:47 +0100 Subject: net, sched: add clsact qdisc This work adds a generalization of the ingress qdisc as a qdisc holding only classifiers. The clsact qdisc works on ingress, but also on egress. In both cases, it's execution happens without taking the qdisc lock, and the main difference for the egress part compared to prior version of [1] is that this can be applied with _any_ underlying real egress qdisc (also classless ones). Besides solving the use-case of [1], that is, allowing for more programmability on assigning skb->priority for the mqprio case that is supported by most popular 10G+ NICs, it also opens up a lot more flexibility for other tc applications. The main work on classification can already be done at clsact egress time if the use-case allows and state stored for later retrieval f.e. again in skb->priority with major/minors (which is checked by most classful qdiscs before consulting tc_classify()) and/or in other skb fields like skb->tc_index for some light-weight post-processing to get to the eventual classid in case of a classful qdisc. Another use case is that the clsact egress part allows to have a central egress counterpart to the ingress classifiers, so that classifiers can easily share state (e.g. in cls_bpf via eBPF maps) for ingress and egress. Currently, default setups like mq + pfifo_fast would require for this to use, for example, prio qdisc instead (to get a tc_classify() run) and to duplicate the egress classifier for each queue. With clsact, it allows for leaving the setup as is, it can additionally assign skb->priority to put the skb in one of pfifo_fast's bands and it can share state with maps. Moreover, we can access the skb's dst entry (f.e. to retrieve tclassid) w/o the need to perform a skb_dst_force() to hold on to it any longer. In lwt case, we can also use this facility to setup dst metadata via cls_bpf (bpf_skb_set_tunnel_key()) without needing a real egress qdisc just for that (case of IFF_NO_QUEUE devices, for example). The realization can be done without any changes to the scheduler core framework. All it takes is that we have two a-priori defined minors/child classes, where we can mux between ingress and egress classifier list (dev->ingress_cl_list and dev->egress_cl_list, latter stored close to dev->_tx to avoid extra cacheline miss for moderate loads). The egress part is a bit similar modelled to handle_ing() and patched to a noop in case the functionality is not used. Both handlers are now called sch_handle_ingress() and sch_handle_egress(), code sharing among the two doesn't seem practical as there are various minor differences in both paths, so that making them conditional in a single handler would rather slow things down. Full compatibility to ingress qdisc is provided as well. Since both piggyback on TC_H_CLSACT, only one of them (ingress/clsact) can exist per netdevice, and thus ingress qdisc specific behaviour can be retained for user space. This means, either a user does 'tc qdisc add dev foo ingress' and configures ingress qdisc as usual, or the 'tc qdisc add dev foo clsact' alternative, where both, ingress and egress classifier can be configured as in the below example. ingress qdisc supports attaching classifier to any minor number whereas clsact has two fixed minors for muxing between the lists, therefore to not break user space setups, they are better done as two separate qdiscs. I decided to extend the sch_ingress module with clsact functionality so that commonly used code can be reused, the module is being aliased with sch_clsact so that it can be auto-loaded properly. Alternative would have been to add a flag when initializing ingress to alter its behaviour plus aliasing to a different name (as it's more than just ingress). However, the first would end up, based on the flag, choosing the new/old behaviour by calling different function implementations to handle each anyway, the latter would require to register ingress qdisc once again under different alias. So, this really begs to provide a minimal, cleaner approach to have Qdisc_ops and Qdisc_class_ops by its own that share callbacks used by both. Example, adding qdisc: # tc qdisc add dev foo clsact # tc qdisc show dev foo qdisc mq 0: root qdisc pfifo_fast 0: parent :1 bands 3 priomap 1 2 2 2 1 2 0 0 1 1 1 1 1 1 1 1 qdisc pfifo_fast 0: parent :2 bands 3 priomap 1 2 2 2 1 2 0 0 1 1 1 1 1 1 1 1 qdisc pfifo_fast 0: parent :3 bands 3 priomap 1 2 2 2 1 2 0 0 1 1 1 1 1 1 1 1 qdisc pfifo_fast 0: parent :4 bands 3 priomap 1 2 2 2 1 2 0 0 1 1 1 1 1 1 1 1 qdisc clsact ffff: parent ffff:fff1 Adding filters (deleting, etc works analogous by specifying ingress/egress): # tc filter add dev foo ingress bpf da obj bar.o sec ingress # tc filter add dev foo egress bpf da obj bar.o sec egress # tc filter show dev foo ingress filter protocol all pref 49152 bpf filter protocol all pref 49152 bpf handle 0x1 bar.o:[ingress] direct-action # tc filter show dev foo egress filter protocol all pref 49152 bpf filter protocol all pref 49152 bpf handle 0x1 bar.o:[egress] direct-action A 'tc filter show dev foo' or 'tc filter show dev foo parent ffff:' will show an empty list for clsact. Either using the parent names (ingress/egress) or specifying the full major/minor will then show the related filter lists. Prior work on a mqprio prequeue() facility [1] was done mainly by John Fastabend. [1] http://patchwork.ozlabs.org/patch/512949/ Signed-off-by: Daniel Borkmann Acked-by: John Fastabend Signed-off-by: David S. Miller --- include/linux/netdevice.h | 4 +- include/linux/rtnetlink.h | 5 +++ include/uapi/linux/pkt_sched.h | 4 ++ net/Kconfig | 3 ++ net/core/dev.c | 82 +++++++++++++++++++++++++++++++++++---- net/sched/Kconfig | 14 +++++-- net/sched/cls_bpf.c | 2 +- net/sched/sch_ingress.c | 88 +++++++++++++++++++++++++++++++++++++++++- 8 files changed, 186 insertions(+), 16 deletions(-) (limited to 'net/core/dev.c') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 8d8e5ca951b4..2285596e7045 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1739,7 +1739,9 @@ struct net_device { #ifdef CONFIG_XPS struct xps_dev_maps __rcu *xps_maps; #endif - +#ifdef CONFIG_NET_CLS_ACT + struct tcf_proto __rcu *egress_cl_list; +#endif #ifdef CONFIG_NET_SWITCHDEV u32 offload_fwd_mark; #endif diff --git a/include/linux/rtnetlink.h b/include/linux/rtnetlink.h index 4be5048b1fbe..c006cc900c44 100644 --- a/include/linux/rtnetlink.h +++ b/include/linux/rtnetlink.h @@ -84,6 +84,11 @@ void net_inc_ingress_queue(void); void net_dec_ingress_queue(void); #endif +#ifdef CONFIG_NET_EGRESS +void net_inc_egress_queue(void); +void net_dec_egress_queue(void); +#endif + extern void rtnetlink_init(void); extern void __rtnl_unlock(void); diff --git a/include/uapi/linux/pkt_sched.h b/include/uapi/linux/pkt_sched.h index 8d2530daca9f..8cb18b44968e 100644 --- a/include/uapi/linux/pkt_sched.h +++ b/include/uapi/linux/pkt_sched.h @@ -72,6 +72,10 @@ struct tc_estimator { #define TC_H_UNSPEC (0U) #define TC_H_ROOT (0xFFFFFFFFU) #define TC_H_INGRESS (0xFFFFFFF1U) +#define TC_H_CLSACT TC_H_INGRESS + +#define TC_H_MIN_INGRESS 0xFFF2U +#define TC_H_MIN_EGRESS 0xFFF3U /* Need to corrospond to iproute2 tc/tc_core.h "enum link_layer" */ enum tc_link_layer { diff --git a/net/Kconfig b/net/Kconfig index 11f8c22af34d..174354618f8a 100644 --- a/net/Kconfig +++ b/net/Kconfig @@ -48,6 +48,9 @@ config COMPAT_NETLINK_MESSAGES config NET_INGRESS bool +config NET_EGRESS + bool + menu "Networking options" source "net/packet/Kconfig" diff --git a/net/core/dev.c b/net/core/dev.c index 914b4a24c654..0ca95d5d7af0 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1676,6 +1676,22 @@ void net_dec_ingress_queue(void) EXPORT_SYMBOL_GPL(net_dec_ingress_queue); #endif +#ifdef CONFIG_NET_EGRESS +static struct static_key egress_needed __read_mostly; + +void net_inc_egress_queue(void) +{ + static_key_slow_inc(&egress_needed); +} +EXPORT_SYMBOL_GPL(net_inc_egress_queue); + +void net_dec_egress_queue(void) +{ + static_key_slow_dec(&egress_needed); +} +EXPORT_SYMBOL_GPL(net_dec_egress_queue); +#endif + static struct static_key netstamp_needed __read_mostly; #ifdef HAVE_JUMP_LABEL /* We are not allowed to call static_key_slow_dec() from irq context @@ -3007,7 +3023,6 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q, bool contended; int rc; - qdisc_pkt_len_init(skb); qdisc_calculate_pkt_len(skb, q); /* * Heuristic to force contended enqueues to serialize on a @@ -3100,6 +3115,49 @@ int dev_loopback_xmit(struct net *net, struct sock *sk, struct sk_buff *skb) } EXPORT_SYMBOL(dev_loopback_xmit); +#ifdef CONFIG_NET_EGRESS +static struct sk_buff * +sch_handle_egress(struct sk_buff *skb, int *ret, struct net_device *dev) +{ + struct tcf_proto *cl = rcu_dereference_bh(dev->egress_cl_list); + struct tcf_result cl_res; + + if (!cl) + return skb; + + /* skb->tc_verd and qdisc_skb_cb(skb)->pkt_len were already set + * earlier by the caller. + */ + qdisc_bstats_cpu_update(cl->q, skb); + + switch (tc_classify(skb, cl, &cl_res, false)) { + case TC_ACT_OK: + case TC_ACT_RECLASSIFY: + skb->tc_index = TC_H_MIN(cl_res.classid); + break; + case TC_ACT_SHOT: + qdisc_qstats_cpu_drop(cl->q); + *ret = NET_XMIT_DROP; + goto drop; + case TC_ACT_STOLEN: + case TC_ACT_QUEUED: + *ret = NET_XMIT_SUCCESS; +drop: + kfree_skb(skb); + return NULL; + case TC_ACT_REDIRECT: + /* No need to push/pop skb's mac_header here on egress! */ + skb_do_redirect(skb); + *ret = NET_XMIT_SUCCESS; + return NULL; + default: + break; + } + + return skb; +} +#endif /* CONFIG_NET_EGRESS */ + static inline int get_xps_queue(struct net_device *dev, struct sk_buff *skb) { #ifdef CONFIG_XPS @@ -3226,6 +3284,17 @@ static int __dev_queue_xmit(struct sk_buff *skb, void *accel_priv) skb_update_prio(skb); + qdisc_pkt_len_init(skb); +#ifdef CONFIG_NET_CLS_ACT + skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_EGRESS); +# ifdef CONFIG_NET_EGRESS + if (static_key_false(&egress_needed)) { + skb = sch_handle_egress(skb, &rc, dev); + if (!skb) + goto out; + } +# endif +#endif /* If device/qdisc don't need skb->dst, release it right now while * its hot in this cpu cache. */ @@ -3247,9 +3316,6 @@ static int __dev_queue_xmit(struct sk_buff *skb, void *accel_priv) txq = netdev_pick_tx(dev, skb, accel_priv); q = rcu_dereference_bh(txq->qdisc); -#ifdef CONFIG_NET_CLS_ACT - skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_EGRESS); -#endif trace_net_dev_queue(skb); if (q->enqueue) { rc = __dev_xmit_skb(skb, q, dev, txq); @@ -3806,9 +3872,9 @@ int (*br_fdb_test_addr_hook)(struct net_device *dev, EXPORT_SYMBOL_GPL(br_fdb_test_addr_hook); #endif -static inline struct sk_buff *handle_ing(struct sk_buff *skb, - struct packet_type **pt_prev, - int *ret, struct net_device *orig_dev) +static inline struct sk_buff * +sch_handle_ingress(struct sk_buff *skb, struct packet_type **pt_prev, int *ret, + struct net_device *orig_dev) { #ifdef CONFIG_NET_CLS_ACT struct tcf_proto *cl = rcu_dereference_bh(skb->dev->ingress_cl_list); @@ -4002,7 +4068,7 @@ another_round: skip_taps: #ifdef CONFIG_NET_INGRESS if (static_key_false(&ingress_needed)) { - skb = handle_ing(skb, &pt_prev, &ret, orig_dev); + skb = sch_handle_ingress(skb, &pt_prev, &ret, orig_dev); if (!skb) goto out; diff --git a/net/sched/Kconfig b/net/sched/Kconfig index daa33432b716..82830824fb1f 100644 --- a/net/sched/Kconfig +++ b/net/sched/Kconfig @@ -310,15 +310,21 @@ config NET_SCH_PIE If unsure, say N. config NET_SCH_INGRESS - tristate "Ingress Qdisc" + tristate "Ingress/classifier-action Qdisc" depends on NET_CLS_ACT select NET_INGRESS + select NET_EGRESS ---help--- - Say Y here if you want to use classifiers for incoming packets. + Say Y here if you want to use classifiers for incoming and/or outgoing + packets. This qdisc doesn't do anything else besides running classifiers, + which can also have actions attached to them. In case of outgoing packets, + classifiers that this qdisc holds are executed in the transmit path + before real enqueuing to an egress qdisc happens. + If unsure, say Y. - To compile this code as a module, choose M here: the - module will be called sch_ingress. + To compile this code as a module, choose M here: the module will be + called sch_ingress with alias of sch_clsact. config NET_SCH_PLUG tristate "Plug network traffic until release (PLUG)" diff --git a/net/sched/cls_bpf.c b/net/sched/cls_bpf.c index b3c8bb4aeef5..8dc84300ee79 100644 --- a/net/sched/cls_bpf.c +++ b/net/sched/cls_bpf.c @@ -291,7 +291,7 @@ static int cls_bpf_prog_from_efd(struct nlattr **tb, struct cls_bpf_prog *prog, prog->bpf_name = name; prog->filter = fp; - if (fp->dst_needed) + if (fp->dst_needed && !(tp->q->flags & TCQ_F_INGRESS)) netif_keep_dst(qdisc_dev(tp->q)); return 0; diff --git a/net/sched/sch_ingress.c b/net/sched/sch_ingress.c index e7c648fa9dc3..10adbc617905 100644 --- a/net/sched/sch_ingress.c +++ b/net/sched/sch_ingress.c @@ -1,4 +1,5 @@ -/* net/sched/sch_ingress.c - Ingress qdisc +/* net/sched/sch_ingress.c - Ingress and clsact qdisc + * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License * as published by the Free Software Foundation; either version @@ -98,17 +99,100 @@ static struct Qdisc_ops ingress_qdisc_ops __read_mostly = { .owner = THIS_MODULE, }; +static unsigned long clsact_get(struct Qdisc *sch, u32 classid) +{ + switch (TC_H_MIN(classid)) { + case TC_H_MIN(TC_H_MIN_INGRESS): + case TC_H_MIN(TC_H_MIN_EGRESS): + return TC_H_MIN(classid); + default: + return 0; + } +} + +static unsigned long clsact_bind_filter(struct Qdisc *sch, + unsigned long parent, u32 classid) +{ + return clsact_get(sch, classid); +} + +static struct tcf_proto __rcu **clsact_find_tcf(struct Qdisc *sch, + unsigned long cl) +{ + struct net_device *dev = qdisc_dev(sch); + + switch (cl) { + case TC_H_MIN(TC_H_MIN_INGRESS): + return &dev->ingress_cl_list; + case TC_H_MIN(TC_H_MIN_EGRESS): + return &dev->egress_cl_list; + default: + return NULL; + } +} + +static int clsact_init(struct Qdisc *sch, struct nlattr *opt) +{ + net_inc_ingress_queue(); + net_inc_egress_queue(); + + sch->flags |= TCQ_F_CPUSTATS; + + return 0; +} + +static void clsact_destroy(struct Qdisc *sch) +{ + struct net_device *dev = qdisc_dev(sch); + + tcf_destroy_chain(&dev->ingress_cl_list); + tcf_destroy_chain(&dev->egress_cl_list); + + net_dec_ingress_queue(); + net_dec_egress_queue(); +} + +static const struct Qdisc_class_ops clsact_class_ops = { + .leaf = ingress_leaf, + .get = clsact_get, + .put = ingress_put, + .walk = ingress_walk, + .tcf_chain = clsact_find_tcf, + .bind_tcf = clsact_bind_filter, + .unbind_tcf = ingress_put, +}; + +static struct Qdisc_ops clsact_qdisc_ops __read_mostly = { + .cl_ops = &clsact_class_ops, + .id = "clsact", + .init = clsact_init, + .destroy = clsact_destroy, + .dump = ingress_dump, + .owner = THIS_MODULE, +}; + static int __init ingress_module_init(void) { - return register_qdisc(&ingress_qdisc_ops); + int ret; + + ret = register_qdisc(&ingress_qdisc_ops); + if (!ret) { + ret = register_qdisc(&clsact_qdisc_ops); + if (ret) + unregister_qdisc(&ingress_qdisc_ops); + } + + return ret; } static void __exit ingress_module_exit(void) { unregister_qdisc(&ingress_qdisc_ops); + unregister_qdisc(&clsact_qdisc_ops); } module_init(ingress_module_init); module_exit(ingress_module_exit); +MODULE_ALIAS("sch_clsact"); MODULE_LICENSE("GPL"); -- cgit v1.2.3