From fd2bc4195d5107f88c1b90e1ec935888ccbfc5c0 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Tue, 3 Oct 2023 20:57:20 +0300 Subject: xfrm: generalize xdo_dev_state_update_curlft to allow statistics update In order to allow drivers to fill all statistics, change the name of xdo_dev_state_update_curlft to be xdo_dev_state_update_stats. Acked-by: Steffen Klassert Signed-off-by: Leon Romanovsky Signed-off-by: Saeed Mahameed --- include/linux/netdevice.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux/netdevice.h') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 118c40258d07..9538576dbebc 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1062,7 +1062,7 @@ struct xfrmdev_ops { bool (*xdo_dev_offload_ok) (struct sk_buff *skb, struct xfrm_state *x); void (*xdo_dev_state_advance_esn) (struct xfrm_state *x); - void (*xdo_dev_state_update_curlft) (struct xfrm_state *x); + void (*xdo_dev_state_update_stats) (struct xfrm_state *x); int (*xdo_dev_policy_add) (struct xfrm_policy *x, struct netlink_ext_ack *extack); void (*xdo_dev_policy_delete) (struct xfrm_policy *x); void (*xdo_dev_policy_free) (struct xfrm_policy *x); -- cgit v1.2.3 From d160c66cda0ac8614adc53a5b5b0e6d6f1a05a5b Mon Sep 17 00:00:00 2001 From: Amit Cohen Date: Mon, 5 Feb 2024 12:30:22 +0200 Subject: net: Do not return value from init_dummy_netdev() init_dummy_netdev() always returns zero and all the callers do not check the returned value. Set the function to not return value, as it is not really used today. Signed-off-by: Amit Cohen Reviewed-by: Ido Schimmel Reviewed-by: Jiri Pirko Reviewed-by: Simon Horman Link: https://lore.kernel.org/r/20240205103022.440946-1-amcohen@nvidia.com Signed-off-by: Jakub Kicinski --- include/linux/netdevice.h | 2 +- net/core/dev.c | 4 +--- 2 files changed, 2 insertions(+), 4 deletions(-) (limited to 'include/linux/netdevice.h') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 118c40258d07..1845dd5043b4 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -3198,7 +3198,7 @@ static inline void unregister_netdevice(struct net_device *dev) int netdev_refcnt_read(const struct net_device *dev); void free_netdev(struct net_device *dev); void netdev_freemem(struct net_device *dev); -int init_dummy_netdev(struct net_device *dev); +void init_dummy_netdev(struct net_device *dev); struct net_device *netdev_get_xmit_slave(struct net_device *dev, struct sk_buff *skb, diff --git a/net/core/dev.c b/net/core/dev.c index 27ba057d06c4..e52e2888cccd 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -10345,7 +10345,7 @@ EXPORT_SYMBOL(register_netdevice); * that need to tie several hardware interfaces to a single NAPI * poll scheduler due to HW limitations. */ -int init_dummy_netdev(struct net_device *dev) +void init_dummy_netdev(struct net_device *dev) { /* Clear everything. Note we don't initialize spinlocks * are they aren't supposed to be taken by any of the @@ -10373,8 +10373,6 @@ int init_dummy_netdev(struct net_device *dev) * because users of this 'device' dont need to change * its refcount. */ - - return 0; } EXPORT_SYMBOL_GPL(init_dummy_netdev); -- cgit v1.2.3 From 4d2bb0bfe8741a8778e0053f31a4e0f0cba80e8b Mon Sep 17 00:00:00 2001 From: Lorenzo Bianconi Date: Mon, 12 Feb 2024 10:50:55 +0100 Subject: xdp: rely on skb pointer reference in do_xdp_generic and netif_receive_generic_xdp Rely on skb pointer reference instead of the skb pointer in do_xdp_generic and netif_receive_generic_xdp routine signatures. This is a preliminary patch to add multi-buff support for xdp running in generic mode where we will need to reallocate the skb to avoid linearization and we will need to make it visible to do_xdp_generic() caller. Acked-by: Jesper Dangaard Brouer Reviewed-by: Toke Hoiland-Jorgensen Signed-off-by: Lorenzo Bianconi Link: https://lore.kernel.org/r/c09415b1f48c8620ef4d76deed35050a7bddf7c2.1707729884.git.lorenzo@kernel.org Signed-off-by: Jakub Kicinski --- drivers/net/tun.c | 4 ++-- include/linux/netdevice.h | 2 +- net/core/dev.c | 16 +++++++++------- 3 files changed, 12 insertions(+), 10 deletions(-) (limited to 'include/linux/netdevice.h') diff --git a/drivers/net/tun.c b/drivers/net/tun.c index b472f2c972d8..bc80fc1d576e 100644 --- a/drivers/net/tun.c +++ b/drivers/net/tun.c @@ -1926,7 +1926,7 @@ static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile, rcu_read_lock(); xdp_prog = rcu_dereference(tun->xdp_prog); if (xdp_prog) { - ret = do_xdp_generic(xdp_prog, skb); + ret = do_xdp_generic(xdp_prog, &skb); if (ret != XDP_PASS) { rcu_read_unlock(); local_bh_enable(); @@ -2516,7 +2516,7 @@ build: skb_record_rx_queue(skb, tfile->queue_index); if (skb_xdp) { - ret = do_xdp_generic(xdp_prog, skb); + ret = do_xdp_generic(xdp_prog, &skb); if (ret != XDP_PASS) { ret = 0; goto out; diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 07cefa32eafa..a3f9c95da51e 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -3958,7 +3958,7 @@ static inline void dev_consume_skb_any(struct sk_buff *skb) u32 bpf_prog_run_generic_xdp(struct sk_buff *skb, struct xdp_buff *xdp, struct bpf_prog *xdp_prog); void generic_xdp_tx(struct sk_buff *skb, struct bpf_prog *xdp_prog); -int do_xdp_generic(struct bpf_prog *xdp_prog, struct sk_buff *skb); +int do_xdp_generic(struct bpf_prog *xdp_prog, struct sk_buff **pskb); int netif_rx(struct sk_buff *skb); int __netif_rx(struct sk_buff *skb); diff --git a/net/core/dev.c b/net/core/dev.c index e19bdf1421e0..ffeb0e0279fe 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -4936,10 +4936,11 @@ u32 bpf_prog_run_generic_xdp(struct sk_buff *skb, struct xdp_buff *xdp, return act; } -static u32 netif_receive_generic_xdp(struct sk_buff *skb, +static u32 netif_receive_generic_xdp(struct sk_buff **pskb, struct xdp_buff *xdp, struct bpf_prog *xdp_prog) { + struct sk_buff *skb = *pskb; u32 act = XDP_DROP; /* Reinjected packets coming from act_mirred or similar should @@ -5020,24 +5021,24 @@ void generic_xdp_tx(struct sk_buff *skb, struct bpf_prog *xdp_prog) static DEFINE_STATIC_KEY_FALSE(generic_xdp_needed_key); -int do_xdp_generic(struct bpf_prog *xdp_prog, struct sk_buff *skb) +int do_xdp_generic(struct bpf_prog *xdp_prog, struct sk_buff **pskb) { if (xdp_prog) { struct xdp_buff xdp; u32 act; int err; - act = netif_receive_generic_xdp(skb, &xdp, xdp_prog); + act = netif_receive_generic_xdp(pskb, &xdp, xdp_prog); if (act != XDP_PASS) { switch (act) { case XDP_REDIRECT: - err = xdp_do_generic_redirect(skb->dev, skb, + err = xdp_do_generic_redirect((*pskb)->dev, *pskb, &xdp, xdp_prog); if (err) goto out_redir; break; case XDP_TX: - generic_xdp_tx(skb, xdp_prog); + generic_xdp_tx(*pskb, xdp_prog); break; } return XDP_DROP; @@ -5045,7 +5046,7 @@ int do_xdp_generic(struct bpf_prog *xdp_prog, struct sk_buff *skb) } return XDP_PASS; out_redir: - kfree_skb_reason(skb, SKB_DROP_REASON_XDP); + kfree_skb_reason(*pskb, SKB_DROP_REASON_XDP); return XDP_DROP; } EXPORT_SYMBOL_GPL(do_xdp_generic); @@ -5368,7 +5369,8 @@ another_round: int ret2; migrate_disable(); - ret2 = do_xdp_generic(rcu_dereference(skb->dev->xdp_prog), skb); + ret2 = do_xdp_generic(rcu_dereference(skb->dev->xdp_prog), + &skb); migrate_enable(); if (ret2 != XDP_PASS) { -- cgit v1.2.3 From 4d42b37def70327b2bb19f823d42289aed2cd7c7 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 13 Feb 2024 06:32:36 +0000 Subject: net: convert dev->reg_state to u8 Prepares things so that dev->reg_state reads can be lockless, by adding WRITE_ONCE() on write side. READ_ONCE()/WRITE_ONCE() do not support bitfields. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/linux/netdevice.h | 23 ++++++++++++++--------- net/core/dev.c | 8 ++++---- 2 files changed, 18 insertions(+), 13 deletions(-) (limited to 'include/linux/netdevice.h') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index a3f9c95da51e..631124655107 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1815,6 +1815,15 @@ enum netdev_stat_type { NETDEV_PCPU_STAT_DSTATS, /* struct pcpu_dstats */ }; +enum netdev_reg_state { + NETREG_UNINITIALIZED = 0, + NETREG_REGISTERED, /* completed register_netdevice */ + NETREG_UNREGISTERING, /* called unregister_netdevice */ + NETREG_UNREGISTERED, /* completed unregister todo */ + NETREG_RELEASED, /* called free_netdev */ + NETREG_DUMMY, /* dummy device for NAPI poll */ +}; + /** * struct net_device - The DEVICE structure. * @@ -2372,13 +2381,7 @@ struct net_device { struct list_head link_watch_list; - enum { NETREG_UNINITIALIZED=0, - NETREG_REGISTERED, /* completed register_netdevice */ - NETREG_UNREGISTERING, /* called unregister_netdevice */ - NETREG_UNREGISTERED, /* completed unregister todo */ - NETREG_RELEASED, /* called free_netdev */ - NETREG_DUMMY, /* dummy device for NAPI poll */ - } reg_state:8; + u8 reg_state; bool dismantle; @@ -5254,7 +5257,9 @@ static inline const char *netdev_name(const struct net_device *dev) static inline const char *netdev_reg_state(const struct net_device *dev) { - switch (dev->reg_state) { + u8 reg_state = READ_ONCE(dev->reg_state); + + switch (reg_state) { case NETREG_UNINITIALIZED: return " (uninitialized)"; case NETREG_REGISTERED: return ""; case NETREG_UNREGISTERING: return " (unregistering)"; @@ -5263,7 +5268,7 @@ static inline const char *netdev_reg_state(const struct net_device *dev) case NETREG_DUMMY: return " (dummy)"; } - WARN_ONCE(1, "%s: unknown reg_state %d\n", dev->name, dev->reg_state); + WARN_ONCE(1, "%s: unknown reg_state %d\n", dev->name, reg_state); return " (unknown)"; } diff --git a/net/core/dev.c b/net/core/dev.c index 720bd6838212..9c95cae9d6ab 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -10339,7 +10339,7 @@ int register_netdevice(struct net_device *dev) ret = netdev_register_kobject(dev); write_lock(&dev_base_lock); - dev->reg_state = ret ? NETREG_UNREGISTERED : NETREG_REGISTERED; + WRITE_ONCE(dev->reg_state, ret ? NETREG_UNREGISTERED : NETREG_REGISTERED); write_unlock(&dev_base_lock); if (ret) goto err_uninit_notify; @@ -10630,7 +10630,7 @@ void netdev_run_todo(void) } write_lock(&dev_base_lock); - dev->reg_state = NETREG_UNREGISTERED; + WRITE_ONCE(dev->reg_state, NETREG_UNREGISTERED); write_unlock(&dev_base_lock); linkwatch_sync_dev(dev); } @@ -11050,7 +11050,7 @@ void free_netdev(struct net_device *dev) } BUG_ON(dev->reg_state != NETREG_UNREGISTERED); - dev->reg_state = NETREG_RELEASED; + WRITE_ONCE(dev->reg_state, NETREG_RELEASED); /* will free via device release */ put_device(&dev->dev); @@ -11140,7 +11140,7 @@ void unregister_netdevice_many_notify(struct list_head *head, /* And unlink it from device chain. */ write_lock(&dev_base_lock); unlist_netdevice(dev, false); - dev->reg_state = NETREG_UNREGISTERING; + WRITE_ONCE(dev->reg_state, NETREG_UNREGISTERING); write_unlock(&dev_base_lock); } flush_all_backlogs(); -- cgit v1.2.3 From 6a2968ee1ee2cc6fce30f6f5724442b34b1483b3 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 13 Feb 2024 06:32:42 +0000 Subject: net: add netdev_set_operstate() helper dev_base_lock is going away, add netdev_set_operstate() helper so that hsr does not have to know core internals. Remove dev_base_lock acquisition from rfc2863_policy() v3: use an "unsigned int" for dev->operstate, so that try_cmpxchg() can work on all arches. ( https://lore.kernel.org/oe-kbuild-all/202402081918.OLyGaea3-lkp@intel.com/ ) Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/linux/netdevice.h | 2 +- include/linux/rtnetlink.h | 2 ++ net/core/link_watch.c | 9 ++------- net/core/rtnetlink.c | 22 +++++++++++++++------- net/hsr/hsr_device.c | 22 ++++++---------------- 5 files changed, 26 insertions(+), 31 deletions(-) (limited to 'include/linux/netdevice.h') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 631124655107..697370706a82 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -2258,7 +2258,7 @@ struct net_device { const struct tlsdev_ops *tlsdev_ops; #endif - unsigned char operstate; + unsigned int operstate; unsigned char link_mode; unsigned char if_port; diff --git a/include/linux/rtnetlink.h b/include/linux/rtnetlink.h index 21780608cf47..cdfc897f1e3c 100644 --- a/include/linux/rtnetlink.h +++ b/include/linux/rtnetlink.h @@ -172,4 +172,6 @@ rtnl_notify_needed(const struct net *net, u16 nlflags, u32 group) return (nlflags & NLM_F_ECHO) || rtnl_has_listeners(net, group); } +void netdev_set_operstate(struct net_device *dev, int newstate); + #endif /* __LINUX_RTNETLINK_H */ diff --git a/net/core/link_watch.c b/net/core/link_watch.c index 1b93e054c9a3..8ec35194bfcb 100644 --- a/net/core/link_watch.c +++ b/net/core/link_watch.c @@ -33,7 +33,7 @@ static DECLARE_DELAYED_WORK(linkwatch_work, linkwatch_event); static LIST_HEAD(lweventlist); static DEFINE_SPINLOCK(lweventlist_lock); -static unsigned char default_operstate(const struct net_device *dev) +static unsigned int default_operstate(const struct net_device *dev) { if (netif_testing(dev)) return IF_OPER_TESTING; @@ -62,16 +62,13 @@ static unsigned char default_operstate(const struct net_device *dev) return IF_OPER_UP; } - static void rfc2863_policy(struct net_device *dev) { - unsigned char operstate = default_operstate(dev); + unsigned int operstate = default_operstate(dev); if (operstate == READ_ONCE(dev->operstate)) return; - write_lock(&dev_base_lock); - switch(dev->link_mode) { case IF_LINK_MODE_TESTING: if (operstate == IF_OPER_UP) @@ -88,8 +85,6 @@ static void rfc2863_policy(struct net_device *dev) } WRITE_ONCE(dev->operstate, operstate); - - write_unlock(&dev_base_lock); } diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 43d92de8601c..e484ba44f23b 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -842,9 +842,22 @@ int rtnl_put_cacheinfo(struct sk_buff *skb, struct dst_entry *dst, u32 id, } EXPORT_SYMBOL_GPL(rtnl_put_cacheinfo); +void netdev_set_operstate(struct net_device *dev, int newstate) +{ + unsigned int old = READ_ONCE(dev->operstate); + + do { + if (old == newstate) + return; + } while (!try_cmpxchg(&dev->operstate, &old, newstate)); + + netdev_state_change(dev); +} +EXPORT_SYMBOL(netdev_set_operstate); + static void set_operstate(struct net_device *dev, unsigned char transition) { - unsigned char operstate = dev->operstate; + unsigned char operstate = READ_ONCE(dev->operstate); switch (transition) { case IF_OPER_UP: @@ -866,12 +879,7 @@ static void set_operstate(struct net_device *dev, unsigned char transition) break; } - if (READ_ONCE(dev->operstate) != operstate) { - write_lock(&dev_base_lock); - WRITE_ONCE(dev->operstate, operstate); - write_unlock(&dev_base_lock); - netdev_state_change(dev); - } + netdev_set_operstate(dev, operstate); } static unsigned int rtnl_dev_get_flags(const struct net_device *dev) diff --git a/net/hsr/hsr_device.c b/net/hsr/hsr_device.c index be0e43f46556..5ef6d437db72 100644 --- a/net/hsr/hsr_device.c +++ b/net/hsr/hsr_device.c @@ -28,29 +28,19 @@ static bool is_slave_up(struct net_device *dev) return dev && is_admin_up(dev) && netif_oper_up(dev); } -static void __hsr_set_operstate(struct net_device *dev, int transition) -{ - write_lock(&dev_base_lock); - if (READ_ONCE(dev->operstate) != transition) { - WRITE_ONCE(dev->operstate, transition); - write_unlock(&dev_base_lock); - netdev_state_change(dev); - } else { - write_unlock(&dev_base_lock); - } -} - static void hsr_set_operstate(struct hsr_port *master, bool has_carrier) { - if (!is_admin_up(master->dev)) { - __hsr_set_operstate(master->dev, IF_OPER_DOWN); + struct net_device *dev = master->dev; + + if (!is_admin_up(dev)) { + netdev_set_operstate(dev, IF_OPER_DOWN); return; } if (has_carrier) - __hsr_set_operstate(master->dev, IF_OPER_UP); + netdev_set_operstate(dev, IF_OPER_UP); else - __hsr_set_operstate(master->dev, IF_OPER_LOWERLAYERDOWN); + netdev_set_operstate(dev, IF_OPER_LOWERLAYERDOWN); } static bool hsr_check_carrier(struct hsr_port *master) -- cgit v1.2.3 From 1b3ef46cb7f2618cc0b507393220a69810f6da12 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 13 Feb 2024 06:32:45 +0000 Subject: net: remove dev_base_lock dev_base_lock is not needed anymore, all remaining users also hold RTNL. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/linux/netdevice.h | 2 -- net/core/dev.c | 39 ++++----------------------------------- 2 files changed, 4 insertions(+), 37 deletions(-) (limited to 'include/linux/netdevice.h') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 697370706a82..c541550b0e6e 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -3077,8 +3077,6 @@ int call_netdevice_notifiers(unsigned long val, struct net_device *dev); int call_netdevice_notifiers_info(unsigned long val, struct netdev_notifier_info *info); -extern rwlock_t dev_base_lock; /* Device list lock */ - #define for_each_netdev(net, d) \ list_for_each_entry(d, &(net)->dev_base_head, dev_list) #define for_each_netdev_reverse(net, d) \ diff --git a/net/core/dev.c b/net/core/dev.c index 02cf9fd68da6..d8dd293a7a27 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -168,28 +168,6 @@ static int call_netdevice_notifiers_extack(unsigned long val, struct net_device *dev, struct netlink_ext_ack *extack); -/* - * The @dev_base_head list is protected by @dev_base_lock and the rtnl - * semaphore. - * - * Pure readers hold dev_base_lock for reading, or rcu_read_lock() - * - * Writers must hold the rtnl semaphore while they loop through the - * dev_base_head list, and hold dev_base_lock for writing when they do the - * actual updates. This allows pure readers to access the list even - * while a writer is preparing to update it. - * - * To put it another way, dev_base_lock is held for writing only to - * protect against pure readers; the rtnl semaphore provides the - * protection against other writers. - * - * See, for example usages, register_netdevice() and - * unregister_netdevice(), which must be called with the rtnl - * semaphore held. - */ -DEFINE_RWLOCK(dev_base_lock); -EXPORT_SYMBOL(dev_base_lock); - static DEFINE_MUTEX(ifalias_mutex); /* protects napi_hash addition/deletion and napi_gen_id */ @@ -395,12 +373,10 @@ static void list_netdevice(struct net_device *dev) ASSERT_RTNL(); - write_lock(&dev_base_lock); list_add_tail_rcu(&dev->dev_list, &net->dev_base_head); netdev_name_node_add(net, dev->name_node); hlist_add_head_rcu(&dev->index_hlist, dev_index_hash(net, dev->ifindex)); - write_unlock(&dev_base_lock); netdev_for_each_altname(dev, name_node) netdev_name_node_add(net, name_node); @@ -427,11 +403,9 @@ static void unlist_netdevice(struct net_device *dev) netdev_name_node_del(name_node); /* Unlink dev from the device chain */ - write_lock(&dev_base_lock); list_del_rcu(&dev->dev_list); netdev_name_node_del(dev->name_node); hlist_del_rcu(&dev->index_hlist); - write_unlock(&dev_base_lock); dev_base_seq_inc(dev_net(dev)); } @@ -752,9 +726,9 @@ EXPORT_SYMBOL_GPL(dev_fill_forward_path); * @net: the applicable net namespace * @name: name to find * - * Find an interface by name. Must be called under RTNL semaphore - * or @dev_base_lock. If the name is found a pointer to the device - * is returned. If the name is not found then %NULL is returned. The + * Find an interface by name. Must be called under RTNL semaphore. + * If the name is found a pointer to the device is returned. + * If the name is not found then %NULL is returned. The * reference counters are not incremented so the caller must be * careful with locks. */ @@ -835,8 +809,7 @@ EXPORT_SYMBOL(netdev_get_by_name); * Search for an interface by index. Returns %NULL if the device * is not found or a pointer to the device. The device has not * had its reference counter increased so the caller must be careful - * about locking. The caller must hold either the RTNL semaphore - * or @dev_base_lock. + * about locking. The caller must hold the RTNL semaphore. */ struct net_device *__dev_get_by_index(struct net *net, int ifindex) @@ -1241,15 +1214,11 @@ rollback: netdev_adjacent_rename_links(dev, oldname); - write_lock(&dev_base_lock); netdev_name_node_del(dev->name_node); - write_unlock(&dev_base_lock); synchronize_net(); - write_lock(&dev_base_lock); netdev_name_node_add(net, dev->name_node); - write_unlock(&dev_base_lock); ret = call_netdevice_notifiers(NETDEV_CHANGENAME, dev); ret = notifier_to_errno(ret); -- cgit v1.2.3 From 8afc7a78d55de726b2747d7775c54def79509ec5 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 22 Feb 2024 10:50:10 +0000 Subject: ipv6: prepare inet6_fill_ifinfo() for RCU protection We want to use RCU protection instead of RTNL for inet6_fill_ifinfo(). Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/linux/netdevice.h | 6 ++++-- net/core/dev.c | 4 ++-- net/ipv6/addrconf.c | 11 +++++++---- 3 files changed, 13 insertions(+), 8 deletions(-) (limited to 'include/linux/netdevice.h') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index f07c8374f29c..09023e44db4e 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -4354,8 +4354,10 @@ static inline bool netif_testing(const struct net_device *dev) */ static inline bool netif_oper_up(const struct net_device *dev) { - return (dev->operstate == IF_OPER_UP || - dev->operstate == IF_OPER_UNKNOWN /* backward compat */); + unsigned int operstate = READ_ONCE(dev->operstate); + + return operstate == IF_OPER_UP || + operstate == IF_OPER_UNKNOWN /* backward compat */; } /** diff --git a/net/core/dev.c b/net/core/dev.c index 0628d8ff1ed9..275fd5259a4a 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -8632,12 +8632,12 @@ unsigned int dev_get_flags(const struct net_device *dev) { unsigned int flags; - flags = (dev->flags & ~(IFF_PROMISC | + flags = (READ_ONCE(dev->flags) & ~(IFF_PROMISC | IFF_ALLMULTI | IFF_RUNNING | IFF_LOWER_UP | IFF_DORMANT)) | - (dev->gflags & (IFF_PROMISC | + (READ_ONCE(dev->gflags) & (IFF_PROMISC | IFF_ALLMULTI)); if (netif_running(dev)) { diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index a56dad307fe3..daa81556d118 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -6062,6 +6062,7 @@ static int inet6_fill_ifinfo(struct sk_buff *skb, struct inet6_dev *idev, struct net_device *dev = idev->dev; struct ifinfomsg *hdr; struct nlmsghdr *nlh; + int ifindex, iflink; void *protoinfo; nlh = nlmsg_put(skb, portid, seq, event, sizeof(*hdr), flags); @@ -6072,16 +6073,18 @@ static int inet6_fill_ifinfo(struct sk_buff *skb, struct inet6_dev *idev, hdr->ifi_family = AF_INET6; hdr->__ifi_pad = 0; hdr->ifi_type = dev->type; - hdr->ifi_index = dev->ifindex; + ifindex = READ_ONCE(dev->ifindex); + hdr->ifi_index = ifindex; hdr->ifi_flags = dev_get_flags(dev); hdr->ifi_change = 0; + iflink = dev_get_iflink(dev); if (nla_put_string(skb, IFLA_IFNAME, dev->name) || (dev->addr_len && nla_put(skb, IFLA_ADDRESS, dev->addr_len, dev->dev_addr)) || - nla_put_u32(skb, IFLA_MTU, dev->mtu) || - (dev->ifindex != dev_get_iflink(dev) && - nla_put_u32(skb, IFLA_LINK, dev_get_iflink(dev))) || + nla_put_u32(skb, IFLA_MTU, READ_ONCE(dev->mtu)) || + (ifindex != iflink && + nla_put_u32(skb, IFLA_LINK, iflink)) || nla_put_u8(skb, IFLA_OPERSTATE, netif_running(dev) ? READ_ONCE(dev->operstate) : IF_OPER_DOWN)) goto nla_put_failure; -- cgit v1.2.3 From 0d60d8df6f493bb46bf5db40d39dd60a1bafdd4e Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 23 Feb 2024 12:32:08 +0000 Subject: dpll: rely on rcu for netdev_dpll_pin() This fixes a possible UAF in if_nlmsg_size(), which can run without RTNL. Add rcu protection to "struct dpll_pin" Move netdev_dpll_pin() from netdevice.h to dpll.h to decrease name pollution. Note: This looks possible to no longer acquire RTNL in netdev_dpll_pin_assign() later in net-next. v2: do not force rcu_read_lock() in rtnl_dpll_pin_size() (Jiri Pirko) Fixes: 5f1842692880 ("netdev: expose DPLL pin handle for netdevice") Signed-off-by: Eric Dumazet Cc: Arkadiusz Kubalewski Cc: Vadim Fedorenko Reviewed-by: Jiri Pirko Link: https://lore.kernel.org/r/20240223123208.3543319-1-edumazet@google.com Signed-off-by: Jakub Kicinski --- drivers/dpll/dpll_core.c | 2 +- drivers/dpll/dpll_core.h | 2 ++ include/linux/dpll.h | 11 +++++++++++ include/linux/netdevice.h | 11 +---------- net/core/dev.c | 2 +- 5 files changed, 16 insertions(+), 12 deletions(-) (limited to 'include/linux/netdevice.h') diff --git a/drivers/dpll/dpll_core.c b/drivers/dpll/dpll_core.c index 5152bd1b0daf..4c2bb27c99fe 100644 --- a/drivers/dpll/dpll_core.c +++ b/drivers/dpll/dpll_core.c @@ -564,7 +564,7 @@ void dpll_pin_put(struct dpll_pin *pin) xa_destroy(&pin->parent_refs); xa_erase(&dpll_pin_xa, pin->id); dpll_pin_prop_free(&pin->prop); - kfree(pin); + kfree_rcu(pin, rcu); } mutex_unlock(&dpll_lock); } diff --git a/drivers/dpll/dpll_core.h b/drivers/dpll/dpll_core.h index 717f715015c7..2b6d8ef1cdf3 100644 --- a/drivers/dpll/dpll_core.h +++ b/drivers/dpll/dpll_core.h @@ -47,6 +47,7 @@ struct dpll_device { * @prop: pin properties copied from the registerer * @rclk_dev_name: holds name of device when pin can recover clock from it * @refcount: refcount + * @rcu: rcu_head for kfree_rcu() **/ struct dpll_pin { u32 id; @@ -57,6 +58,7 @@ struct dpll_pin { struct xarray parent_refs; struct dpll_pin_properties prop; refcount_t refcount; + struct rcu_head rcu; }; /** diff --git a/include/linux/dpll.h b/include/linux/dpll.h index 9cf896ea1d41..4ec2fe9caf5a 100644 --- a/include/linux/dpll.h +++ b/include/linux/dpll.h @@ -10,6 +10,8 @@ #include #include #include +#include +#include struct dpll_device; struct dpll_pin; @@ -167,4 +169,13 @@ int dpll_device_change_ntf(struct dpll_device *dpll); int dpll_pin_change_ntf(struct dpll_pin *pin); +static inline struct dpll_pin *netdev_dpll_pin(const struct net_device *dev) +{ +#if IS_ENABLED(CONFIG_DPLL) + return rcu_dereference_rtnl(dev->dpll_pin); +#else + return NULL; +#endif +} + #endif diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index ef7bfbb98497..a9c973b92294 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -2469,7 +2469,7 @@ struct net_device { struct devlink_port *devlink_port; #if IS_ENABLED(CONFIG_DPLL) - struct dpll_pin *dpll_pin; + struct dpll_pin __rcu *dpll_pin; #endif #if IS_ENABLED(CONFIG_PAGE_POOL) /** @page_pools: page pools created for this netdevice */ @@ -4035,15 +4035,6 @@ bool netdev_port_same_parent_id(struct net_device *a, struct net_device *b); void netdev_dpll_pin_set(struct net_device *dev, struct dpll_pin *dpll_pin); void netdev_dpll_pin_clear(struct net_device *dev); -static inline struct dpll_pin *netdev_dpll_pin(const struct net_device *dev) -{ -#if IS_ENABLED(CONFIG_DPLL) - return dev->dpll_pin; -#else - return NULL; -#endif -} - struct sk_buff *validate_xmit_skb_list(struct sk_buff *skb, struct net_device *dev, bool *again); struct sk_buff *dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev, struct netdev_queue *txq, int *ret); diff --git a/net/core/dev.c b/net/core/dev.c index 73a021973007..0230391c78f7 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -9078,7 +9078,7 @@ static void netdev_dpll_pin_assign(struct net_device *dev, struct dpll_pin *dpll { #if IS_ENABLED(CONFIG_DPLL) rtnl_lock(); - dev->dpll_pin = dpll_pin; + rcu_assign_pointer(dev->dpll_pin, dpll_pin); rtnl_unlock(); #endif } -- cgit v1.2.3 From eb2c11b27c58a62b5027b77f702c15cd0ca38f7d Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Wed, 28 Feb 2024 17:06:56 +0100 Subject: net: bql: fix building with BQL disabled It is now possible to disable BQL, but that causes the cpsw driver to break: drivers/net/ethernet/ti/am65-cpsw-nuss.c:297:28: error: no member named 'dql' in 'struct netdev_queue' 297 | dql_avail(&netif_txq->dql), There is already a helper function in net/sch_generic.h that could be used to help here. Move its implementation into the common linux/netdevice.h along with the other bql interfaces and change both users over to the new interface. Fixes: ea7f3cfaa588 ("net: bql: allow the config to be disabled") Signed-off-by: Arnd Bergmann Signed-off-by: David S. Miller --- drivers/net/ethernet/ti/am65-cpsw-nuss.c | 2 +- include/linux/netdevice.h | 10 ++++++++++ include/net/sch_generic.h | 7 +------ 3 files changed, 12 insertions(+), 7 deletions(-) (limited to 'include/linux/netdevice.h') diff --git a/drivers/net/ethernet/ti/am65-cpsw-nuss.c b/drivers/net/ethernet/ti/am65-cpsw-nuss.c index 9d2f4ac783e4..2939a21ca74f 100644 --- a/drivers/net/ethernet/ti/am65-cpsw-nuss.c +++ b/drivers/net/ethernet/ti/am65-cpsw-nuss.c @@ -294,7 +294,7 @@ static void am65_cpsw_nuss_ndo_host_tx_timeout(struct net_device *ndev, txqueue, netif_tx_queue_stopped(netif_txq), jiffies_to_msecs(jiffies - trans_start), - dql_avail(&netif_txq->dql), + netdev_queue_dql_avail(netif_txq), k3_cppi_desc_pool_avail(tx_chn->desc_pool)); if (netif_tx_queue_stopped(netif_txq)) { diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index a9c973b92294..735a9386fcf8 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -3499,6 +3499,16 @@ static inline void netdev_queue_set_dql_min_limit(struct netdev_queue *dev_queue #endif } +static inline int netdev_queue_dql_avail(const struct netdev_queue *txq) +{ +#ifdef CONFIG_BQL + /* Non-BQL migrated drivers will return 0, too. */ + return dql_avail(&txq->dql); +#else + return 0; +#endif +} + /** * netdev_txq_bql_enqueue_prefetchw - prefetch bql data for write * @dev_queue: pointer to transmit queue diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h index 934fdb977551..cefe0c4bdae3 100644 --- a/include/net/sch_generic.h +++ b/include/net/sch_generic.h @@ -238,12 +238,7 @@ static inline bool qdisc_may_bulk(const struct Qdisc *qdisc) static inline int qdisc_avail_bulklimit(const struct netdev_queue *txq) { -#ifdef CONFIG_BQL - /* Non-BQL migrated drivers will return 0, too. */ - return dql_avail(&txq->dql); -#else - return 0; -#endif + return netdev_queue_dql_avail(txq); } struct Qdisc_class_ops { -- cgit v1.2.3 From 289e922582af5b4721ba02e86bde4d9ba918158a Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Mon, 4 Mar 2024 17:35:32 -0800 Subject: dpll: move all dpll<>netdev helpers to dpll code Older versions of GCC really want to know the full definition of the type involved in rcu_assign_pointer(). struct dpll_pin is defined in a local header, net/core can't reach it. Move all the netdev <> dpll code into dpll, where the type is known. Otherwise we'd need multiple function calls to jump between the compilation units. This is the same problem the commit under fixes was trying to address, but with rcu_assign_pointer() not rcu_dereference(). Some of the exports are not needed, networking core can't be a module, we only need exports for the helpers used by drivers. Reported-by: Geert Uytterhoeven Link: https://lore.kernel.org/all/35a869c8-52e8-177-1d4d-e57578b99b6@linux-m68k.org/ Fixes: 640f41ed33b5 ("dpll: fix build failure due to rcu_dereference_check() on unknown type") Reviewed-by: Jiri Pirko Reviewed-by: Eric Dumazet Link: https://lore.kernel.org/r/20240305013532.694866-1-kuba@kernel.org Signed-off-by: Jakub Kicinski --- Documentation/driver-api/dpll.rst | 2 +- drivers/dpll/dpll_core.c | 25 +++++++++++++---- drivers/dpll/dpll_netlink.c | 38 ++++++++++++++++---------- drivers/net/ethernet/intel/ice/ice_dpll.c | 4 +-- drivers/net/ethernet/mellanox/mlx5/core/dpll.c | 4 +-- include/linux/dpll.h | 26 +++++++++--------- include/linux/netdevice.h | 4 --- net/core/dev.c | 22 --------------- net/core/rtnetlink.c | 4 +-- 9 files changed, 64 insertions(+), 65 deletions(-) (limited to 'include/linux/netdevice.h') diff --git a/Documentation/driver-api/dpll.rst b/Documentation/driver-api/dpll.rst index e3d593841aa7..ea8d16600e16 100644 --- a/Documentation/driver-api/dpll.rst +++ b/Documentation/driver-api/dpll.rst @@ -545,7 +545,7 @@ In such scenario, dpll device input signal shall be also configurable to drive dpll with signal recovered from the PHY netdevice. This is done by exposing a pin to the netdevice - attaching pin to the netdevice itself with -``netdev_dpll_pin_set(struct net_device *dev, struct dpll_pin *dpll_pin)``. +``dpll_netdev_pin_set(struct net_device *dev, struct dpll_pin *dpll_pin)``. Exposed pin id handle ``DPLL_A_PIN_ID`` is then identifiable by the user as it is attached to rtnetlink respond to get ``RTM_NEWLINK`` command in nested attribute ``IFLA_DPLL_PIN``. diff --git a/drivers/dpll/dpll_core.c b/drivers/dpll/dpll_core.c index 241db366b2c7..7f686d179fc9 100644 --- a/drivers/dpll/dpll_core.c +++ b/drivers/dpll/dpll_core.c @@ -42,11 +42,6 @@ struct dpll_pin_registration { void *priv; }; -struct dpll_pin *netdev_dpll_pin(const struct net_device *dev) -{ - return rcu_dereference_rtnl(dev->dpll_pin); -} - struct dpll_device *dpll_device_get_by_id(int id) { if (xa_get_mark(&dpll_device_xa, id, DPLL_REGISTERED)) @@ -513,6 +508,26 @@ err_pin_prop: return ERR_PTR(ret); } +static void dpll_netdev_pin_assign(struct net_device *dev, struct dpll_pin *dpll_pin) +{ + rtnl_lock(); + rcu_assign_pointer(dev->dpll_pin, dpll_pin); + rtnl_unlock(); +} + +void dpll_netdev_pin_set(struct net_device *dev, struct dpll_pin *dpll_pin) +{ + WARN_ON(!dpll_pin); + dpll_netdev_pin_assign(dev, dpll_pin); +} +EXPORT_SYMBOL(dpll_netdev_pin_set); + +void dpll_netdev_pin_clear(struct net_device *dev) +{ + dpll_netdev_pin_assign(dev, NULL); +} +EXPORT_SYMBOL(dpll_netdev_pin_clear); + /** * dpll_pin_get - find existing or create new dpll pin * @clock_id: clock_id of creator diff --git a/drivers/dpll/dpll_netlink.c b/drivers/dpll/dpll_netlink.c index 4ca9ad16cd95..b57355e0c214 100644 --- a/drivers/dpll/dpll_netlink.c +++ b/drivers/dpll/dpll_netlink.c @@ -8,6 +8,7 @@ */ #include #include +#include #include #include "dpll_core.h" #include "dpll_netlink.h" @@ -47,18 +48,6 @@ dpll_msg_add_dev_parent_handle(struct sk_buff *msg, u32 id) return 0; } -/** - * dpll_msg_pin_handle_size - get size of pin handle attribute for given pin - * @pin: pin pointer - * - * Return: byte size of pin handle attribute for given pin. - */ -size_t dpll_msg_pin_handle_size(struct dpll_pin *pin) -{ - return pin ? nla_total_size(4) : 0; /* DPLL_A_PIN_ID */ -} -EXPORT_SYMBOL_GPL(dpll_msg_pin_handle_size); - /** * dpll_msg_add_pin_handle - attach pin handle attribute to a given message * @msg: pointer to sk_buff message to attach a pin handle @@ -68,7 +57,7 @@ EXPORT_SYMBOL_GPL(dpll_msg_pin_handle_size); * * 0 - success * * -EMSGSIZE - no space in message to attach pin handle */ -int dpll_msg_add_pin_handle(struct sk_buff *msg, struct dpll_pin *pin) +static int dpll_msg_add_pin_handle(struct sk_buff *msg, struct dpll_pin *pin) { if (!pin) return 0; @@ -76,7 +65,28 @@ int dpll_msg_add_pin_handle(struct sk_buff *msg, struct dpll_pin *pin) return -EMSGSIZE; return 0; } -EXPORT_SYMBOL_GPL(dpll_msg_add_pin_handle); + +static struct dpll_pin *dpll_netdev_pin(const struct net_device *dev) +{ + return rcu_dereference_rtnl(dev->dpll_pin); +} + +/** + * dpll_netdev_pin_handle_size - get size of pin handle attribute of a netdev + * @dev: netdev from which to get the pin + * + * Return: byte size of pin handle attribute, or 0 if @dev has no pin. + */ +size_t dpll_netdev_pin_handle_size(const struct net_device *dev) +{ + return dpll_netdev_pin(dev) ? nla_total_size(4) : 0; /* DPLL_A_PIN_ID */ +} + +int dpll_netdev_add_pin_handle(struct sk_buff *msg, + const struct net_device *dev) +{ + return dpll_msg_add_pin_handle(msg, dpll_netdev_pin(dev)); +} static int dpll_msg_add_mode(struct sk_buff *msg, struct dpll_device *dpll, diff --git a/drivers/net/ethernet/intel/ice/ice_dpll.c b/drivers/net/ethernet/intel/ice/ice_dpll.c index adfa1f2a80a6..c59e972dbaae 100644 --- a/drivers/net/ethernet/intel/ice/ice_dpll.c +++ b/drivers/net/ethernet/intel/ice/ice_dpll.c @@ -1597,7 +1597,7 @@ static void ice_dpll_deinit_rclk_pin(struct ice_pf *pf) } if (WARN_ON_ONCE(!vsi || !vsi->netdev)) return; - netdev_dpll_pin_clear(vsi->netdev); + dpll_netdev_pin_clear(vsi->netdev); dpll_pin_put(rclk->pin); } @@ -1641,7 +1641,7 @@ ice_dpll_init_rclk_pins(struct ice_pf *pf, struct ice_dpll_pin *pin, } if (WARN_ON((!vsi || !vsi->netdev))) return -EINVAL; - netdev_dpll_pin_set(vsi->netdev, pf->dplls.rclk.pin); + dpll_netdev_pin_set(vsi->netdev, pf->dplls.rclk.pin); return 0; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/dpll.c b/drivers/net/ethernet/mellanox/mlx5/core/dpll.c index 928bf24d4b12..d74a5aaf4268 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/dpll.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/dpll.c @@ -261,7 +261,7 @@ static void mlx5_dpll_netdev_dpll_pin_set(struct mlx5_dpll *mdpll, { if (mdpll->tracking_netdev) return; - netdev_dpll_pin_set(netdev, mdpll->dpll_pin); + dpll_netdev_pin_set(netdev, mdpll->dpll_pin); mdpll->tracking_netdev = netdev; } @@ -269,7 +269,7 @@ static void mlx5_dpll_netdev_dpll_pin_clear(struct mlx5_dpll *mdpll) { if (!mdpll->tracking_netdev) return; - netdev_dpll_pin_clear(mdpll->tracking_netdev); + dpll_netdev_pin_clear(mdpll->tracking_netdev); mdpll->tracking_netdev = NULL; } diff --git a/include/linux/dpll.h b/include/linux/dpll.h index c60591308ae8..e37344f6a231 100644 --- a/include/linux/dpll.h +++ b/include/linux/dpll.h @@ -122,15 +122,24 @@ struct dpll_pin_properties { }; #if IS_ENABLED(CONFIG_DPLL) -size_t dpll_msg_pin_handle_size(struct dpll_pin *pin); -int dpll_msg_add_pin_handle(struct sk_buff *msg, struct dpll_pin *pin); +void dpll_netdev_pin_set(struct net_device *dev, struct dpll_pin *dpll_pin); +void dpll_netdev_pin_clear(struct net_device *dev); + +size_t dpll_netdev_pin_handle_size(const struct net_device *dev); +int dpll_netdev_add_pin_handle(struct sk_buff *msg, + const struct net_device *dev); #else -static inline size_t dpll_msg_pin_handle_size(struct dpll_pin *pin) +static inline void +dpll_netdev_pin_set(struct net_device *dev, struct dpll_pin *dpll_pin) { } +static inline void dpll_netdev_pin_clear(struct net_device *dev) { } + +static inline size_t dpll_netdev_pin_handle_size(const struct net_device *dev) { return 0; } -static inline int dpll_msg_add_pin_handle(struct sk_buff *msg, struct dpll_pin *pin) +static inline int +dpll_netdev_add_pin_handle(struct sk_buff *msg, const struct net_device *dev) { return 0; } @@ -169,13 +178,4 @@ int dpll_device_change_ntf(struct dpll_device *dpll); int dpll_pin_change_ntf(struct dpll_pin *pin); -#if !IS_ENABLED(CONFIG_DPLL) -static inline struct dpll_pin *netdev_dpll_pin(const struct net_device *dev) -{ - return NULL; -} -#else -struct dpll_pin *netdev_dpll_pin(const struct net_device *dev); -#endif - #endif diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 735a9386fcf8..78a09af89e39 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -79,8 +79,6 @@ struct xdp_buff; struct xdp_frame; struct xdp_metadata_ops; struct xdp_md; -/* DPLL specific */ -struct dpll_pin; typedef u32 xdp_features_t; @@ -4042,8 +4040,6 @@ int dev_get_mac_address(struct sockaddr *sa, struct net *net, char *dev_name); int dev_get_port_parent_id(struct net_device *dev, struct netdev_phys_item_id *ppid, bool recurse); bool netdev_port_same_parent_id(struct net_device *a, struct net_device *b); -void netdev_dpll_pin_set(struct net_device *dev, struct dpll_pin *dpll_pin); -void netdev_dpll_pin_clear(struct net_device *dev); struct sk_buff *validate_xmit_skb_list(struct sk_buff *skb, struct net_device *dev, bool *again); struct sk_buff *dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev, diff --git a/net/core/dev.c b/net/core/dev.c index 0230391c78f7..76e6438f4858 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -9074,28 +9074,6 @@ bool netdev_port_same_parent_id(struct net_device *a, struct net_device *b) } EXPORT_SYMBOL(netdev_port_same_parent_id); -static void netdev_dpll_pin_assign(struct net_device *dev, struct dpll_pin *dpll_pin) -{ -#if IS_ENABLED(CONFIG_DPLL) - rtnl_lock(); - rcu_assign_pointer(dev->dpll_pin, dpll_pin); - rtnl_unlock(); -#endif -} - -void netdev_dpll_pin_set(struct net_device *dev, struct dpll_pin *dpll_pin) -{ - WARN_ON(!dpll_pin); - netdev_dpll_pin_assign(dev, dpll_pin); -} -EXPORT_SYMBOL(netdev_dpll_pin_set); - -void netdev_dpll_pin_clear(struct net_device *dev) -{ - netdev_dpll_pin_assign(dev, NULL); -} -EXPORT_SYMBOL(netdev_dpll_pin_clear); - /** * dev_change_proto_down - set carrier according to proto_down. * diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index ae86f751efc3..bd50e9fe3234 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -1057,7 +1057,7 @@ static size_t rtnl_dpll_pin_size(const struct net_device *dev) { size_t size = nla_total_size(0); /* nest IFLA_DPLL_PIN */ - size += dpll_msg_pin_handle_size(netdev_dpll_pin(dev)); + size += dpll_netdev_pin_handle_size(dev); return size; } @@ -1792,7 +1792,7 @@ static int rtnl_fill_dpll_pin(struct sk_buff *skb, if (!dpll_pin_nest) return -EMSGSIZE; - ret = dpll_msg_add_pin_handle(skb, netdev_dpll_pin(dev)); + ret = dpll_netdev_add_pin_handle(skb, dev); if (ret < 0) goto nest_cancel; -- cgit v1.2.3 From 2658b5a8a4eee5fad378d0bde2f221deacbc58f1 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 6 Mar 2024 16:00:14 +0000 Subject: net: introduce struct net_hotdata Instead of spreading networking critical fields all over the places, add a custom net_hotdata structure so that we can precisely control its layout. In this first patch, move : - gro_normal_batch used in rx (GRO stack) - offload_base used in rx and tx (GRO and TSO stacks) Signed-off-by: Eric Dumazet Acked-by: Soheil Hassas Yeganeh Reviewed-by: David Ahern Link: https://lore.kernel.org/r/20240306160031.874438-2-edumazet@google.com Signed-off-by: Jakub Kicinski --- include/linux/netdevice.h | 1 - include/net/gro.h | 5 ++--- include/net/hotdata.h | 15 +++++++++++++++ net/core/Makefile | 1 + net/core/gro.c | 15 ++++++--------- net/core/gso.c | 4 ++-- net/core/hotdata.c | 9 +++++++++ net/core/sysctl_net_core.c | 3 ++- 8 files changed, 37 insertions(+), 16 deletions(-) create mode 100644 include/net/hotdata.h create mode 100644 net/core/hotdata.c (limited to 'include/linux/netdevice.h') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 2767467138a0..6643452af543 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -4796,7 +4796,6 @@ void dev_get_tstats64(struct net_device *dev, struct rtnl_link_stats64 *s); extern int netdev_max_backlog; extern int dev_rx_weight; extern int dev_tx_weight; -extern int gro_normal_batch; enum { NESTED_SYNC_IMM_BIT, diff --git a/include/net/gro.h b/include/net/gro.h index 2b58671a6549..d6fc8fbd3730 100644 --- a/include/net/gro.h +++ b/include/net/gro.h @@ -9,6 +9,7 @@ #include #include #include +#include struct napi_gro_cb { union { @@ -446,7 +447,7 @@ static inline void gro_normal_one(struct napi_struct *napi, struct sk_buff *skb, { list_add_tail(&skb->list, &napi->rx_list); napi->rx_count += segs; - if (napi->rx_count >= READ_ONCE(gro_normal_batch)) + if (napi->rx_count >= READ_ONCE(net_hotdata.gro_normal_batch)) gro_normal_list(napi); } @@ -493,6 +494,4 @@ static inline void inet6_get_iif_sdif(const struct sk_buff *skb, int *iif, int * #endif } -extern struct list_head offload_base; - #endif /* _NET_IPV6_GRO_H */ diff --git a/include/net/hotdata.h b/include/net/hotdata.h new file mode 100644 index 000000000000..6ed32e4e34aa --- /dev/null +++ b/include/net/hotdata.h @@ -0,0 +1,15 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +#ifndef _NET_HOTDATA_H +#define _NET_HOTDATA_H + +#include + +/* Read mostly data used in network fast paths. */ +struct net_hotdata { + struct list_head offload_base; + int gro_normal_batch; +}; + +extern struct net_hotdata net_hotdata; + +#endif /* _NET_HOTDATA_H */ diff --git a/net/core/Makefile b/net/core/Makefile index 821aec06abf1..6e6548011fae 100644 --- a/net/core/Makefile +++ b/net/core/Makefile @@ -18,6 +18,7 @@ obj-y += dev.o dev_addr_lists.o dst.o netevent.o \ obj-$(CONFIG_NETDEV_ADDR_LIST_TEST) += dev_addr_lists_test.o obj-y += net-sysfs.o +obj-y += hotdata.o obj-$(CONFIG_PAGE_POOL) += page_pool.o page_pool_user.o obj-$(CONFIG_PROC_FS) += net-procfs.o obj-$(CONFIG_NET_PKTGEN) += pktgen.o diff --git a/net/core/gro.c b/net/core/gro.c index 6a0edbd826a1..ee30d4f0c038 100644 --- a/net/core/gro.c +++ b/net/core/gro.c @@ -10,9 +10,6 @@ #define GRO_MAX_HEAD (MAX_HEADER + 128) static DEFINE_SPINLOCK(offload_lock); -struct list_head offload_base __read_mostly = LIST_HEAD_INIT(offload_base); -/* Maximum number of GRO_NORMAL skbs to batch up for list-RX */ -int gro_normal_batch __read_mostly = 8; /** * dev_add_offload - register offload handlers @@ -31,7 +28,7 @@ void dev_add_offload(struct packet_offload *po) struct packet_offload *elem; spin_lock(&offload_lock); - list_for_each_entry(elem, &offload_base, list) { + list_for_each_entry(elem, &net_hotdata.offload_base, list) { if (po->priority < elem->priority) break; } @@ -55,7 +52,7 @@ EXPORT_SYMBOL(dev_add_offload); */ static void __dev_remove_offload(struct packet_offload *po) { - struct list_head *head = &offload_base; + struct list_head *head = &net_hotdata.offload_base; struct packet_offload *po1; spin_lock(&offload_lock); @@ -235,9 +232,9 @@ done: static void napi_gro_complete(struct napi_struct *napi, struct sk_buff *skb) { + struct list_head *head = &net_hotdata.offload_base; struct packet_offload *ptype; __be16 type = skb->protocol; - struct list_head *head = &offload_base; int err = -ENOENT; BUILD_BUG_ON(sizeof(struct napi_gro_cb) > sizeof(skb->cb)); @@ -444,7 +441,7 @@ static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff { u32 bucket = skb_get_hash_raw(skb) & (GRO_HASH_BUCKETS - 1); struct gro_list *gro_list = &napi->gro_hash[bucket]; - struct list_head *head = &offload_base; + struct list_head *head = &net_hotdata.offload_base; struct packet_offload *ptype; __be16 type = skb->protocol; struct sk_buff *pp = NULL; @@ -550,7 +547,7 @@ normal: struct packet_offload *gro_find_receive_by_type(__be16 type) { - struct list_head *offload_head = &offload_base; + struct list_head *offload_head = &net_hotdata.offload_base; struct packet_offload *ptype; list_for_each_entry_rcu(ptype, offload_head, list) { @@ -564,7 +561,7 @@ EXPORT_SYMBOL(gro_find_receive_by_type); struct packet_offload *gro_find_complete_by_type(__be16 type) { - struct list_head *offload_head = &offload_base; + struct list_head *offload_head = &net_hotdata.offload_base; struct packet_offload *ptype; list_for_each_entry_rcu(ptype, offload_head, list) { diff --git a/net/core/gso.c b/net/core/gso.c index 9e1803bfc9c6..bcd156372f4d 100644 --- a/net/core/gso.c +++ b/net/core/gso.c @@ -17,7 +17,7 @@ struct sk_buff *skb_eth_gso_segment(struct sk_buff *skb, struct packet_offload *ptype; rcu_read_lock(); - list_for_each_entry_rcu(ptype, &offload_base, list) { + list_for_each_entry_rcu(ptype, &net_hotdata.offload_base, list) { if (ptype->type == type && ptype->callbacks.gso_segment) { segs = ptype->callbacks.gso_segment(skb, features); break; @@ -48,7 +48,7 @@ struct sk_buff *skb_mac_gso_segment(struct sk_buff *skb, __skb_pull(skb, vlan_depth); rcu_read_lock(); - list_for_each_entry_rcu(ptype, &offload_base, list) { + list_for_each_entry_rcu(ptype, &net_hotdata.offload_base, list) { if (ptype->type == type && ptype->callbacks.gso_segment) { segs = ptype->callbacks.gso_segment(skb, features); break; diff --git a/net/core/hotdata.c b/net/core/hotdata.c new file mode 100644 index 000000000000..abb8ad19d59a --- /dev/null +++ b/net/core/hotdata.c @@ -0,0 +1,9 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +#include +#include +#include + +struct net_hotdata net_hotdata __cacheline_aligned = { + .offload_base = LIST_HEAD_INIT(net_hotdata.offload_base), + .gro_normal_batch = 8, +}; diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c index 986f15e5d6c4..0eb1242eabbe 100644 --- a/net/core/sysctl_net_core.c +++ b/net/core/sysctl_net_core.c @@ -23,6 +23,7 @@ #include #include #include +#include #include "dev.h" @@ -632,7 +633,7 @@ static struct ctl_table net_core_table[] = { }, { .procname = "gro_normal_batch", - .data = &gro_normal_batch, + .data = &net_hotdata.gro_normal_batch, .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_dointvec_minmax, -- cgit v1.2.3 From 0b91fa4bfb1caedd01cb6eb3b733cbc77c9edb0e Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 6 Mar 2024 16:00:17 +0000 Subject: net: move ptype_all into net_hotdata ptype_all is used in rx/tx fast paths. Move it to net_hotdata for better cache locality. Signed-off-by: Eric Dumazet Acked-by: Soheil Hassas Yeganeh Reviewed-by: David Ahern Link: https://lore.kernel.org/r/20240306160031.874438-5-edumazet@google.com Signed-off-by: Jakub Kicinski --- include/linux/netdevice.h | 1 - include/net/hotdata.h | 1 + net/core/dev.c | 16 +++++++--------- net/core/hotdata.c | 1 + net/core/net-procfs.c | 7 ++++--- 5 files changed, 13 insertions(+), 13 deletions(-) (limited to 'include/linux/netdevice.h') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 6643452af543..b18ac8072f18 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -5306,7 +5306,6 @@ static inline const char *netdev_reg_state(const struct net_device *dev) #define PTYPE_HASH_SIZE (16) #define PTYPE_HASH_MASK (PTYPE_HASH_SIZE - 1) -extern struct list_head ptype_all __read_mostly; extern struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly; extern struct net_device *blackhole_netdev; diff --git a/include/net/hotdata.h b/include/net/hotdata.h index 149e56528537..d462cb8f16ba 100644 --- a/include/net/hotdata.h +++ b/include/net/hotdata.h @@ -7,6 +7,7 @@ /* Read mostly data used in network fast paths. */ struct net_hotdata { struct list_head offload_base; + struct list_head ptype_all; int gro_normal_batch; int netdev_budget; int netdev_budget_usecs; diff --git a/net/core/dev.c b/net/core/dev.c index f2f44303c035..e1493e558407 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -161,7 +161,6 @@ static DEFINE_SPINLOCK(ptype_lock); struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly; -struct list_head ptype_all __read_mostly; /* Taps */ static int netif_rx_internal(struct sk_buff *skb); static int call_netdevice_notifiers_extack(unsigned long val, @@ -540,7 +539,7 @@ static inline void netdev_set_addr_lockdep_class(struct net_device *dev) static inline struct list_head *ptype_head(const struct packet_type *pt) { if (pt->type == htons(ETH_P_ALL)) - return pt->dev ? &pt->dev->ptype_all : &ptype_all; + return pt->dev ? &pt->dev->ptype_all : &net_hotdata.ptype_all; else return pt->dev ? &pt->dev->ptype_specific : &ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK]; @@ -2226,7 +2225,8 @@ static inline bool skb_loop_sk(struct packet_type *ptype, struct sk_buff *skb) */ bool dev_nit_active(struct net_device *dev) { - return !list_empty(&ptype_all) || !list_empty(&dev->ptype_all); + return !list_empty(&net_hotdata.ptype_all) || + !list_empty(&dev->ptype_all); } EXPORT_SYMBOL_GPL(dev_nit_active); @@ -2237,10 +2237,9 @@ EXPORT_SYMBOL_GPL(dev_nit_active); void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev) { - struct packet_type *ptype; + struct list_head *ptype_list = &net_hotdata.ptype_all; + struct packet_type *ptype, *pt_prev = NULL; struct sk_buff *skb2 = NULL; - struct packet_type *pt_prev = NULL; - struct list_head *ptype_list = &ptype_all; rcu_read_lock(); again: @@ -2286,7 +2285,7 @@ again: pt_prev = ptype; } - if (ptype_list == &ptype_all) { + if (ptype_list == &net_hotdata.ptype_all) { ptype_list = &dev->ptype_all; goto again; } @@ -5387,7 +5386,7 @@ another_round: if (pfmemalloc) goto skip_taps; - list_for_each_entry_rcu(ptype, &ptype_all, list) { + list_for_each_entry_rcu(ptype, &net_hotdata.ptype_all, list) { if (pt_prev) ret = deliver_skb(skb, pt_prev, orig_dev); pt_prev = ptype; @@ -11749,7 +11748,6 @@ static int __init net_dev_init(void) if (netdev_kobject_init()) goto out; - INIT_LIST_HEAD(&ptype_all); for (i = 0; i < PTYPE_HASH_SIZE; i++) INIT_LIST_HEAD(&ptype_base[i]); diff --git a/net/core/hotdata.c b/net/core/hotdata.c index 087c4c84987d..29fcfe89fd9a 100644 --- a/net/core/hotdata.c +++ b/net/core/hotdata.c @@ -7,6 +7,7 @@ struct net_hotdata net_hotdata __cacheline_aligned = { .offload_base = LIST_HEAD_INIT(net_hotdata.offload_base), + .ptype_all = LIST_HEAD_INIT(net_hotdata.ptype_all), .gro_normal_batch = 8, .netdev_budget = 300, diff --git a/net/core/net-procfs.c b/net/core/net-procfs.c index 2e4e96d30ee1..a97eceb84e61 100644 --- a/net/core/net-procfs.c +++ b/net/core/net-procfs.c @@ -3,6 +3,7 @@ #include #include #include +#include #include "dev.h" @@ -183,7 +184,7 @@ static void *ptype_get_idx(struct seq_file *seq, loff_t pos) } } - list_for_each_entry_rcu(pt, &ptype_all, list) { + list_for_each_entry_rcu(pt, &net_hotdata.ptype_all, list) { if (i == pos) return pt; ++i; @@ -231,13 +232,13 @@ static void *ptype_seq_next(struct seq_file *seq, void *v, loff_t *pos) } } - nxt = ptype_all.next; + nxt = net_hotdata.ptype_all.next; goto ptype_all; } if (pt->type == htons(ETH_P_ALL)) { ptype_all: - if (nxt != &ptype_all) + if (nxt != &net_hotdata.ptype_all) goto found; hash = 0; nxt = ptype_base[0].next; -- cgit v1.2.3 From edbc666cdcbf4a80ada4311c272a2078af87b880 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 6 Mar 2024 16:00:18 +0000 Subject: net: move netdev_max_backlog to net_hotdata netdev_max_backlog is used in rx fat path. Move it to net_hodata for better cache locality. Signed-off-by: Eric Dumazet Acked-by: Soheil Hassas Yeganeh Reviewed-by: David Ahern Link: https://lore.kernel.org/r/20240306160031.874438-6-edumazet@google.com Signed-off-by: Jakub Kicinski --- include/linux/netdevice.h | 1 - include/net/hotdata.h | 1 + net/core/dev.c | 8 +++----- net/core/gro_cells.c | 3 ++- net/core/hotdata.c | 2 ++ net/core/sysctl_net_core.c | 2 +- net/xfrm/espintcp.c | 4 +++- net/xfrm/xfrm_input.c | 3 ++- 8 files changed, 14 insertions(+), 10 deletions(-) (limited to 'include/linux/netdevice.h') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index b18ac8072f18..c9a671b7bb37 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -4793,7 +4793,6 @@ void dev_fetch_sw_netstats(struct rtnl_link_stats64 *s, const struct pcpu_sw_netstats __percpu *netstats); void dev_get_tstats64(struct net_device *dev, struct rtnl_link_stats64 *s); -extern int netdev_max_backlog; extern int dev_rx_weight; extern int dev_tx_weight; diff --git a/include/net/hotdata.h b/include/net/hotdata.h index d462cb8f16ba..dc50b200a94b 100644 --- a/include/net/hotdata.h +++ b/include/net/hotdata.h @@ -12,6 +12,7 @@ struct net_hotdata { int netdev_budget; int netdev_budget_usecs; int tstamp_prequeue; + int max_backlog; }; extern struct net_hotdata net_hotdata; diff --git a/net/core/dev.c b/net/core/dev.c index e1493e558407..e23edbaff392 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -4404,9 +4404,6 @@ EXPORT_SYMBOL(__dev_direct_xmit); * Receiver routines *************************************************************************/ -int netdev_max_backlog __read_mostly = 1000; -EXPORT_SYMBOL(netdev_max_backlog); - unsigned int sysctl_skb_defer_max __read_mostly = 64; int weight_p __read_mostly = 64; /* old backlog weight */ int dev_weight_rx_bias __read_mostly = 1; /* bias for backlog weight */ @@ -4713,7 +4710,7 @@ static bool skb_flow_limit(struct sk_buff *skb, unsigned int qlen) struct softnet_data *sd; unsigned int old_flow, new_flow; - if (qlen < (READ_ONCE(netdev_max_backlog) >> 1)) + if (qlen < (READ_ONCE(net_hotdata.max_backlog) >> 1)) return false; sd = this_cpu_ptr(&softnet_data); @@ -4761,7 +4758,8 @@ static int enqueue_to_backlog(struct sk_buff *skb, int cpu, if (!netif_running(skb->dev)) goto drop; qlen = skb_queue_len(&sd->input_pkt_queue); - if (qlen <= READ_ONCE(netdev_max_backlog) && !skb_flow_limit(skb, qlen)) { + if (qlen <= READ_ONCE(net_hotdata.max_backlog) && + !skb_flow_limit(skb, qlen)) { if (qlen) { enqueue: __skb_queue_tail(&sd->input_pkt_queue, skb); diff --git a/net/core/gro_cells.c b/net/core/gro_cells.c index ed5ec5de47f6..ff8e5b64bf6b 100644 --- a/net/core/gro_cells.c +++ b/net/core/gro_cells.c @@ -3,6 +3,7 @@ #include #include #include +#include struct gro_cell { struct sk_buff_head napi_skbs; @@ -26,7 +27,7 @@ int gro_cells_receive(struct gro_cells *gcells, struct sk_buff *skb) cell = this_cpu_ptr(gcells->cells); - if (skb_queue_len(&cell->napi_skbs) > READ_ONCE(netdev_max_backlog)) { + if (skb_queue_len(&cell->napi_skbs) > READ_ONCE(net_hotdata.max_backlog)) { drop: dev_core_stats_rx_dropped_inc(dev); kfree_skb(skb); diff --git a/net/core/hotdata.c b/net/core/hotdata.c index 29fcfe89fd9a..35ed5a83ecc7 100644 --- a/net/core/hotdata.c +++ b/net/core/hotdata.c @@ -15,4 +15,6 @@ struct net_hotdata net_hotdata __cacheline_aligned = { .netdev_budget_usecs = 2 * USEC_PER_SEC / HZ, .tstamp_prequeue = 1, + .max_backlog = 1000, }; +EXPORT_SYMBOL(net_hotdata); diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c index bddd07da0998..8eaeeb289914 100644 --- a/net/core/sysctl_net_core.c +++ b/net/core/sysctl_net_core.c @@ -440,7 +440,7 @@ static struct ctl_table net_core_table[] = { }, { .procname = "netdev_max_backlog", - .data = &netdev_max_backlog, + .data = &net_hotdata.max_backlog, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec diff --git a/net/xfrm/espintcp.c b/net/xfrm/espintcp.c index d3b3f9e720b3..fe82e2d07300 100644 --- a/net/xfrm/espintcp.c +++ b/net/xfrm/espintcp.c @@ -10,6 +10,7 @@ #if IS_ENABLED(CONFIG_IPV6) #include #endif +#include static void handle_nonesp(struct espintcp_ctx *ctx, struct sk_buff *skb, struct sock *sk) @@ -169,7 +170,8 @@ int espintcp_queue_out(struct sock *sk, struct sk_buff *skb) { struct espintcp_ctx *ctx = espintcp_getctx(sk); - if (skb_queue_len(&ctx->out_queue) >= READ_ONCE(netdev_max_backlog)) + if (skb_queue_len(&ctx->out_queue) >= + READ_ONCE(net_hotdata.max_backlog)) return -ENOBUFS; __skb_queue_tail(&ctx->out_queue, skb); diff --git a/net/xfrm/xfrm_input.c b/net/xfrm/xfrm_input.c index bd4ce21d76d7..161f535c8b94 100644 --- a/net/xfrm/xfrm_input.c +++ b/net/xfrm/xfrm_input.c @@ -21,6 +21,7 @@ #include #include #include +#include #include "xfrm_inout.h" @@ -764,7 +765,7 @@ int xfrm_trans_queue_net(struct net *net, struct sk_buff *skb, trans = this_cpu_ptr(&xfrm_trans_tasklet); - if (skb_queue_len(&trans->queue) >= READ_ONCE(netdev_max_backlog)) + if (skb_queue_len(&trans->queue) >= READ_ONCE(net_hotdata.max_backlog)) return -ENOBUFS; BUILD_BUG_ON(sizeof(struct xfrm_trans_cb) > sizeof(skb->cb)); -- cgit v1.2.3 From 26722dc74bf08fd79564cbcad1e5f3e2aa3bf9cc Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 6 Mar 2024 16:00:21 +0000 Subject: net: move dev_tx_weight to net_hotdata dev_tx_weight is used in tx fast path. Move it to net_hotdata for better cache locality. Signed-off-by: Eric Dumazet Acked-by: Soheil Hassas Yeganeh Reviewed-by: David Ahern Link: https://lore.kernel.org/r/20240306160031.874438-9-edumazet@google.com Signed-off-by: Jakub Kicinski --- include/linux/netdevice.h | 1 - include/net/hotdata.h | 1 + net/core/dev.c | 1 - net/core/hotdata.c | 1 + net/core/sysctl_net_core.c | 2 +- net/sched/sch_generic.c | 3 ++- 6 files changed, 5 insertions(+), 4 deletions(-) (limited to 'include/linux/netdevice.h') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index c9a671b7bb37..ad4b031098ff 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -4794,7 +4794,6 @@ void dev_fetch_sw_netstats(struct rtnl_link_stats64 *s, void dev_get_tstats64(struct net_device *dev, struct rtnl_link_stats64 *s); extern int dev_rx_weight; -extern int dev_tx_weight; enum { NESTED_SYNC_IMM_BIT, diff --git a/include/net/hotdata.h b/include/net/hotdata.h index d86d02f156fc..ffea9cc263e5 100644 --- a/include/net/hotdata.h +++ b/include/net/hotdata.h @@ -21,6 +21,7 @@ struct net_hotdata { int netdev_budget_usecs; int tstamp_prequeue; int max_backlog; + int dev_tx_weight; }; extern struct net_hotdata net_hotdata; diff --git a/net/core/dev.c b/net/core/dev.c index e23edbaff392..5d6bd481103f 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -4409,7 +4409,6 @@ int weight_p __read_mostly = 64; /* old backlog weight */ int dev_weight_rx_bias __read_mostly = 1; /* bias for backlog weight */ int dev_weight_tx_bias __read_mostly = 1; /* bias for output_queue quota */ int dev_rx_weight __read_mostly = 64; -int dev_tx_weight __read_mostly = 64; /* Called with irq disabled */ static inline void ____napi_schedule(struct softnet_data *sd, diff --git a/net/core/hotdata.c b/net/core/hotdata.c index 35ed5a83ecc7..ec8c3b48e8fe 100644 --- a/net/core/hotdata.c +++ b/net/core/hotdata.c @@ -16,5 +16,6 @@ struct net_hotdata net_hotdata __cacheline_aligned = { .tstamp_prequeue = 1, .max_backlog = 1000, + .dev_tx_weight = 64, }; EXPORT_SYMBOL(net_hotdata); diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c index 8eaeeb289914..a30016a8660e 100644 --- a/net/core/sysctl_net_core.c +++ b/net/core/sysctl_net_core.c @@ -302,7 +302,7 @@ static int proc_do_dev_weight(struct ctl_table *table, int write, if (!ret && write) { weight = READ_ONCE(weight_p); WRITE_ONCE(dev_rx_weight, weight * dev_weight_rx_bias); - WRITE_ONCE(dev_tx_weight, weight * dev_weight_tx_bias); + WRITE_ONCE(net_hotdata.dev_tx_weight, weight * dev_weight_tx_bias); } mutex_unlock(&dev_weight_mutex); diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c index 9b3e9262040b..ff5336493777 100644 --- a/net/sched/sch_generic.c +++ b/net/sched/sch_generic.c @@ -27,6 +27,7 @@ #include #include #include +#include #include #include #include @@ -409,7 +410,7 @@ static inline bool qdisc_restart(struct Qdisc *q, int *packets) void __qdisc_run(struct Qdisc *q) { - int quota = READ_ONCE(dev_tx_weight); + int quota = READ_ONCE(net_hotdata.dev_tx_weight); int packets; while (qdisc_restart(q, &packets)) { -- cgit v1.2.3 From 71c0de9bac9c1dda503322c86be4924f055dc6c9 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 6 Mar 2024 16:00:22 +0000 Subject: net: move dev_rx_weight to net_hotdata dev_rx_weight is read from process_backlog(). Move it to net_hotdata for better cache locality. Signed-off-by: Eric Dumazet Acked-by: Soheil Hassas Yeganeh Reviewed-by: David Ahern Link: https://lore.kernel.org/r/20240306160031.874438-10-edumazet@google.com Signed-off-by: Jakub Kicinski --- include/linux/netdevice.h | 2 -- include/net/hotdata.h | 1 + net/core/dev.c | 3 +-- net/core/hotdata.c | 1 + net/core/sysctl_net_core.c | 2 +- 5 files changed, 4 insertions(+), 5 deletions(-) (limited to 'include/linux/netdevice.h') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index ad4b031098ff..dd641297e807 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -4793,8 +4793,6 @@ void dev_fetch_sw_netstats(struct rtnl_link_stats64 *s, const struct pcpu_sw_netstats __percpu *netstats); void dev_get_tstats64(struct net_device *dev, struct rtnl_link_stats64 *s); -extern int dev_rx_weight; - enum { NESTED_SYNC_IMM_BIT, NESTED_SYNC_TODO_BIT, diff --git a/include/net/hotdata.h b/include/net/hotdata.h index ffea9cc263e5..e6595ed2c3be 100644 --- a/include/net/hotdata.h +++ b/include/net/hotdata.h @@ -22,6 +22,7 @@ struct net_hotdata { int tstamp_prequeue; int max_backlog; int dev_tx_weight; + int dev_rx_weight; }; extern struct net_hotdata net_hotdata; diff --git a/net/core/dev.c b/net/core/dev.c index 5d6bd481103f..40ba02e04bcb 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -4408,7 +4408,6 @@ unsigned int sysctl_skb_defer_max __read_mostly = 64; int weight_p __read_mostly = 64; /* old backlog weight */ int dev_weight_rx_bias __read_mostly = 1; /* bias for backlog weight */ int dev_weight_tx_bias __read_mostly = 1; /* bias for output_queue quota */ -int dev_rx_weight __read_mostly = 64; /* Called with irq disabled */ static inline void ____napi_schedule(struct softnet_data *sd, @@ -5978,7 +5977,7 @@ static int process_backlog(struct napi_struct *napi, int quota) net_rps_action_and_irq_enable(sd); } - napi->weight = READ_ONCE(dev_rx_weight); + napi->weight = READ_ONCE(net_hotdata.dev_rx_weight); while (again) { struct sk_buff *skb; diff --git a/net/core/hotdata.c b/net/core/hotdata.c index ec8c3b48e8fe..c8a7a451c18a 100644 --- a/net/core/hotdata.c +++ b/net/core/hotdata.c @@ -17,5 +17,6 @@ struct net_hotdata net_hotdata __cacheline_aligned = { .tstamp_prequeue = 1, .max_backlog = 1000, .dev_tx_weight = 64, + .dev_rx_weight = 64, }; EXPORT_SYMBOL(net_hotdata); diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c index a30016a8660e..8a4c698dad9c 100644 --- a/net/core/sysctl_net_core.c +++ b/net/core/sysctl_net_core.c @@ -301,7 +301,7 @@ static int proc_do_dev_weight(struct ctl_table *table, int write, ret = proc_dointvec(table, write, buffer, lenp, ppos); if (!ret && write) { weight = READ_ONCE(weight_p); - WRITE_ONCE(dev_rx_weight, weight * dev_weight_rx_bias); + WRITE_ONCE(net_hotdata.dev_rx_weight, weight * dev_weight_rx_bias); WRITE_ONCE(net_hotdata.dev_tx_weight, weight * dev_weight_tx_bias); } mutex_unlock(&dev_weight_mutex); -- cgit v1.2.3 From 490a79faf95e705ba0ffd9ebf04a624b379e53c9 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 6 Mar 2024 16:00:30 +0000 Subject: net: introduce include/net/rps.h Move RPS related structures and helpers from include/linux/netdevice.h and include/net/sock.h to a new include file. Signed-off-by: Eric Dumazet Acked-by: Soheil Hassas Yeganeh Reviewed-by: David Ahern Link: https://lore.kernel.org/r/20240306160031.874438-18-edumazet@google.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/intel/ice/ice_arfs.c | 1 + drivers/net/ethernet/mellanox/mlx4/en_netdev.c | 1 + drivers/net/ethernet/mellanox/mlx5/core/en_arfs.c | 1 + drivers/net/ethernet/sfc/rx_common.c | 1 + drivers/net/ethernet/sfc/siena/rx_common.c | 1 + drivers/net/tun.c | 1 + include/linux/netdevice.h | 82 -------------- include/net/rps.h | 127 ++++++++++++++++++++++ include/net/sock.h | 35 ------ net/core/dev.c | 1 + net/core/net-sysfs.c | 1 + net/core/sysctl_net_core.c | 1 + net/ipv4/af_inet.c | 1 + net/ipv4/tcp.c | 1 + net/ipv6/af_inet6.c | 1 + net/sctp/socket.c | 1 + 16 files changed, 140 insertions(+), 117 deletions(-) create mode 100644 include/net/rps.h (limited to 'include/linux/netdevice.h') diff --git a/drivers/net/ethernet/intel/ice/ice_arfs.c b/drivers/net/ethernet/intel/ice/ice_arfs.c index cca0e753f38f..7cee365cc7d1 100644 --- a/drivers/net/ethernet/intel/ice/ice_arfs.c +++ b/drivers/net/ethernet/intel/ice/ice_arfs.c @@ -2,6 +2,7 @@ /* Copyright (C) 2018-2020, Intel Corporation. */ #include "ice.h" +#include /** * ice_is_arfs_active - helper to check is aRFS is active diff --git a/drivers/net/ethernet/mellanox/mlx4/en_netdev.c b/drivers/net/ethernet/mellanox/mlx4/en_netdev.c index d7da62cda821..5d3fde63b273 100644 --- a/drivers/net/ethernet/mellanox/mlx4/en_netdev.c +++ b/drivers/net/ethernet/mellanox/mlx4/en_netdev.c @@ -42,6 +42,7 @@ #include #include #include +#include #include #include diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_arfs.c b/drivers/net/ethernet/mellanox/mlx5/core/en_arfs.c index e66f486faafe..c7f542d0b8f0 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_arfs.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_arfs.c @@ -34,6 +34,7 @@ #include #include #include +#include #include "en.h" #define ARFS_HASH_SHIFT BITS_PER_BYTE diff --git a/drivers/net/ethernet/sfc/rx_common.c b/drivers/net/ethernet/sfc/rx_common.c index fac227d372db..dcd901eccfc8 100644 --- a/drivers/net/ethernet/sfc/rx_common.c +++ b/drivers/net/ethernet/sfc/rx_common.c @@ -11,6 +11,7 @@ #include "net_driver.h" #include #include +#include #include "efx.h" #include "nic.h" #include "rx_common.h" diff --git a/drivers/net/ethernet/sfc/siena/rx_common.c b/drivers/net/ethernet/sfc/siena/rx_common.c index 4579f43484c3..219fb358a646 100644 --- a/drivers/net/ethernet/sfc/siena/rx_common.c +++ b/drivers/net/ethernet/sfc/siena/rx_common.c @@ -11,6 +11,7 @@ #include "net_driver.h" #include #include +#include #include "efx.h" #include "nic.h" #include "rx_common.h" diff --git a/drivers/net/tun.c b/drivers/net/tun.c index 8d258e263f54..0b3f21cba552 100644 --- a/drivers/net/tun.c +++ b/drivers/net/tun.c @@ -78,6 +78,7 @@ #include #include #include +#include #include #include diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index dd641297e807..416a800d72ba 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -225,12 +225,6 @@ struct net_device_core_stats { #include #include -#ifdef CONFIG_RPS -#include -extern struct static_key_false rps_needed; -extern struct static_key_false rfs_needed; -#endif - struct neighbour; struct neigh_parms; struct sk_buff; @@ -730,86 +724,10 @@ static inline void netdev_queue_numa_node_write(struct netdev_queue *q, int node #endif } -#ifdef CONFIG_RPS -/* - * This structure holds an RPS map which can be of variable length. The - * map is an array of CPUs. - */ -struct rps_map { - unsigned int len; - struct rcu_head rcu; - u16 cpus[]; -}; -#define RPS_MAP_SIZE(_num) (sizeof(struct rps_map) + ((_num) * sizeof(u16))) - -/* - * The rps_dev_flow structure contains the mapping of a flow to a CPU, the - * tail pointer for that CPU's input queue at the time of last enqueue, and - * a hardware filter index. - */ -struct rps_dev_flow { - u16 cpu; - u16 filter; - unsigned int last_qtail; -}; -#define RPS_NO_FILTER 0xffff - -/* - * The rps_dev_flow_table structure contains a table of flow mappings. - */ -struct rps_dev_flow_table { - unsigned int mask; - struct rcu_head rcu; - struct rps_dev_flow flows[]; -}; -#define RPS_DEV_FLOW_TABLE_SIZE(_num) (sizeof(struct rps_dev_flow_table) + \ - ((_num) * sizeof(struct rps_dev_flow))) - -/* - * The rps_sock_flow_table contains mappings of flows to the last CPU - * on which they were processed by the application (set in recvmsg). - * Each entry is a 32bit value. Upper part is the high-order bits - * of flow hash, lower part is CPU number. - * rps_cpu_mask is used to partition the space, depending on number of - * possible CPUs : rps_cpu_mask = roundup_pow_of_two(nr_cpu_ids) - 1 - * For example, if 64 CPUs are possible, rps_cpu_mask = 0x3f, - * meaning we use 32-6=26 bits for the hash. - */ -struct rps_sock_flow_table { - u32 mask; - - u32 ents[] ____cacheline_aligned_in_smp; -}; -#define RPS_SOCK_FLOW_TABLE_SIZE(_num) (offsetof(struct rps_sock_flow_table, ents[_num])) - -#define RPS_NO_CPU 0xffff - -extern u32 rps_cpu_mask; -extern struct rps_sock_flow_table __rcu *rps_sock_flow_table; - -static inline void rps_record_sock_flow(struct rps_sock_flow_table *table, - u32 hash) -{ - if (table && hash) { - unsigned int index = hash & table->mask; - u32 val = hash & ~rps_cpu_mask; - - /* We only give a hint, preemption can change CPU under us */ - val |= raw_smp_processor_id(); - - /* The following WRITE_ONCE() is paired with the READ_ONCE() - * here, and another one in get_rps_cpu(). - */ - if (READ_ONCE(table->ents[index]) != val) - WRITE_ONCE(table->ents[index], val); - } -} - #ifdef CONFIG_RFS_ACCEL bool rps_may_expire_flow(struct net_device *dev, u16 rxq_index, u32 flow_id, u16 filter_id); #endif -#endif /* CONFIG_RPS */ /* XPS map type and offset of the xps map within net_device->xps_maps[]. */ enum xps_map_type { diff --git a/include/net/rps.h b/include/net/rps.h new file mode 100644 index 000000000000..6081d817d245 --- /dev/null +++ b/include/net/rps.h @@ -0,0 +1,127 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +#ifndef _NET_RPS_H +#define _NET_RPS_H + +#include +#include +#include + +#ifdef CONFIG_RPS + +extern struct static_key_false rps_needed; +extern struct static_key_false rfs_needed; + +/* + * This structure holds an RPS map which can be of variable length. The + * map is an array of CPUs. + */ +struct rps_map { + unsigned int len; + struct rcu_head rcu; + u16 cpus[]; +}; +#define RPS_MAP_SIZE(_num) (sizeof(struct rps_map) + ((_num) * sizeof(u16))) + +/* + * The rps_dev_flow structure contains the mapping of a flow to a CPU, the + * tail pointer for that CPU's input queue at the time of last enqueue, and + * a hardware filter index. + */ +struct rps_dev_flow { + u16 cpu; + u16 filter; + unsigned int last_qtail; +}; +#define RPS_NO_FILTER 0xffff + +/* + * The rps_dev_flow_table structure contains a table of flow mappings. + */ +struct rps_dev_flow_table { + unsigned int mask; + struct rcu_head rcu; + struct rps_dev_flow flows[]; +}; +#define RPS_DEV_FLOW_TABLE_SIZE(_num) (sizeof(struct rps_dev_flow_table) + \ + ((_num) * sizeof(struct rps_dev_flow))) + +/* + * The rps_sock_flow_table contains mappings of flows to the last CPU + * on which they were processed by the application (set in recvmsg). + * Each entry is a 32bit value. Upper part is the high-order bits + * of flow hash, lower part is CPU number. + * rps_cpu_mask is used to partition the space, depending on number of + * possible CPUs : rps_cpu_mask = roundup_pow_of_two(nr_cpu_ids) - 1 + * For example, if 64 CPUs are possible, rps_cpu_mask = 0x3f, + * meaning we use 32-6=26 bits for the hash. + */ +struct rps_sock_flow_table { + u32 mask; + + u32 ents[] ____cacheline_aligned_in_smp; +}; +#define RPS_SOCK_FLOW_TABLE_SIZE(_num) (offsetof(struct rps_sock_flow_table, ents[_num])) + +#define RPS_NO_CPU 0xffff + +extern u32 rps_cpu_mask; +extern struct rps_sock_flow_table __rcu *rps_sock_flow_table; + +static inline void rps_record_sock_flow(struct rps_sock_flow_table *table, + u32 hash) +{ + unsigned int index = hash & table->mask; + u32 val = hash & ~rps_cpu_mask; + + /* We only give a hint, preemption can change CPU under us */ + val |= raw_smp_processor_id(); + + /* The following WRITE_ONCE() is paired with the READ_ONCE() + * here, and another one in get_rps_cpu(). + */ + if (READ_ONCE(table->ents[index]) != val) + WRITE_ONCE(table->ents[index], val); +} + +#endif /* CONFIG_RPS */ + +static inline void sock_rps_record_flow_hash(__u32 hash) +{ +#ifdef CONFIG_RPS + struct rps_sock_flow_table *sock_flow_table; + + if (!hash) + return; + rcu_read_lock(); + sock_flow_table = rcu_dereference(rps_sock_flow_table); + if (sock_flow_table) + rps_record_sock_flow(sock_flow_table, hash); + rcu_read_unlock(); +#endif +} + +static inline void sock_rps_record_flow(const struct sock *sk) +{ +#ifdef CONFIG_RPS + if (static_branch_unlikely(&rfs_needed)) { + /* Reading sk->sk_rxhash might incur an expensive cache line + * miss. + * + * TCP_ESTABLISHED does cover almost all states where RFS + * might be useful, and is cheaper [1] than testing : + * IPv4: inet_sk(sk)->inet_daddr + * IPv6: ipv6_addr_any(&sk->sk_v6_daddr) + * OR an additional socket flag + * [1] : sk_state and sk_prot are in the same cache line. + */ + if (sk->sk_state == TCP_ESTABLISHED) { + /* This READ_ONCE() is paired with the WRITE_ONCE() + * from sock_rps_save_rxhash() and sock_rps_reset_rxhash(). + */ + sock_rps_record_flow_hash(READ_ONCE(sk->sk_rxhash)); + } + } +#endif +} + +#endif /* _NET_RPS_H */ diff --git a/include/net/sock.h b/include/net/sock.h index 09a0cde8bf52..b5e00702acc1 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -1117,41 +1117,6 @@ static inline void sk_incoming_cpu_update(struct sock *sk) WRITE_ONCE(sk->sk_incoming_cpu, cpu); } -static inline void sock_rps_record_flow_hash(__u32 hash) -{ -#ifdef CONFIG_RPS - struct rps_sock_flow_table *sock_flow_table; - - rcu_read_lock(); - sock_flow_table = rcu_dereference(rps_sock_flow_table); - rps_record_sock_flow(sock_flow_table, hash); - rcu_read_unlock(); -#endif -} - -static inline void sock_rps_record_flow(const struct sock *sk) -{ -#ifdef CONFIG_RPS - if (static_branch_unlikely(&rfs_needed)) { - /* Reading sk->sk_rxhash might incur an expensive cache line - * miss. - * - * TCP_ESTABLISHED does cover almost all states where RFS - * might be useful, and is cheaper [1] than testing : - * IPv4: inet_sk(sk)->inet_daddr - * IPv6: ipv6_addr_any(&sk->sk_v6_daddr) - * OR an additional socket flag - * [1] : sk_state and sk_prot are in the same cache line. - */ - if (sk->sk_state == TCP_ESTABLISHED) { - /* This READ_ONCE() is paired with the WRITE_ONCE() - * from sock_rps_save_rxhash() and sock_rps_reset_rxhash(). - */ - sock_rps_record_flow_hash(READ_ONCE(sk->sk_rxhash)); - } - } -#endif -} static inline void sock_rps_save_rxhash(struct sock *sk, const struct sk_buff *skb) diff --git a/net/core/dev.c b/net/core/dev.c index 40ba02e04bcb..bcf49b0393d2 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -155,6 +155,7 @@ #include #include #include +#include #include "dev.h" #include "net-sysfs.h" diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c index af238026ac3c..5560083774b1 100644 --- a/net/core/net-sysfs.c +++ b/net/core/net-sysfs.c @@ -24,6 +24,7 @@ #include #include #include +#include #include "dev.h" #include "net-sysfs.h" diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c index 8a4c698dad9c..4b93e27404e8 100644 --- a/net/core/sysctl_net_core.c +++ b/net/core/sysctl_net_core.c @@ -24,6 +24,7 @@ #include #include #include +#include #include "dev.h" diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index 6f1cfd176e7b..55bd72997b31 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -119,6 +119,7 @@ #endif #include #include +#include #include diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 7e1b848398d0..c5b83875411a 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -279,6 +279,7 @@ #include #include #include +#include /* Track pending CMSGs. */ enum { diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c index b90d46533cdc..8041dc181bd4 100644 --- a/net/ipv6/af_inet6.c +++ b/net/ipv6/af_inet6.c @@ -64,6 +64,7 @@ #include #include #include +#include #include #include diff --git a/net/sctp/socket.c b/net/sctp/socket.c index 6b9fcdb0952a..c67679a41044 100644 --- a/net/sctp/socket.c +++ b/net/sctp/socket.c @@ -67,6 +67,7 @@ #include #include #include +#include /* Forward declarations for internal helper functions. */ static bool sctp_writeable(const struct sock *sk); -- cgit v1.2.3 From ab63a2387cb906d43b72a8effb611bbaecb2d0cd Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Wed, 6 Mar 2024 11:55:07 -0800 Subject: netdev: add per-queue statistics The ethtool-nl family does a good job exposing various protocol related and IEEE/IETF statistics which used to get dumped under ethtool -S, with creative names. Queue stats don't have a netlink API, yet, and remain a lion's share of ethtool -S output for new drivers. Not only is that bad because the names differ driver to driver but it's also bug-prone. Intuitively drivers try to report only the stats for active queues, but querying ethtool stats involves multiple system calls, and the number of stats is read separately from the stats themselves. Worse still when user space asks for values of the stats, it doesn't inform the kernel how big the buffer is. If number of stats increases in the meantime kernel will overflow user buffer. Add a netlink API for dumping queue stats. Queue information is exposed via the netdev-genl family, so add the stats there. Support per-queue and sum-for-device dumps. Latter will be useful when subsequent patches add more interesting common stats than just bytes and packets. The API does not currently distinguish between HW and SW stats. The expectation is that the source of the stats will either not matter much (good packets) or be obvious (skb alloc errors). Acked-by: Stanislav Fomichev Reviewed-by: Amritha Nambiar Reviewed-by: Xuan Zhuo Link: https://lore.kernel.org/r/20240306195509.1502746-2-kuba@kernel.org Signed-off-by: Jakub Kicinski --- Documentation/netlink/specs/netdev.yaml | 84 +++++++++++++ Documentation/networking/statistics.rst | 15 +++ include/linux/netdevice.h | 3 + include/net/netdev_queues.h | 54 ++++++++ include/uapi/linux/netdev.h | 19 +++ net/core/netdev-genl-gen.c | 12 ++ net/core/netdev-genl-gen.h | 2 + net/core/netdev-genl.c | 213 ++++++++++++++++++++++++++++++++ tools/include/uapi/linux/netdev.h | 19 +++ 9 files changed, 421 insertions(+) (limited to 'include/linux/netdevice.h') diff --git a/Documentation/netlink/specs/netdev.yaml b/Documentation/netlink/specs/netdev.yaml index 3addac970680..a1e48c3c84c9 100644 --- a/Documentation/netlink/specs/netdev.yaml +++ b/Documentation/netlink/specs/netdev.yaml @@ -74,6 +74,10 @@ definitions: name: queue-type type: enum entries: [ rx, tx ] + - + name: qstats-scope + type: flags + entries: [ queue ] attribute-sets: - @@ -265,6 +269,66 @@ attribute-sets: doc: ID of the NAPI instance which services this queue. type: u32 + - + name: qstats + doc: | + Get device statistics, scoped to a device or a queue. + These statistics extend (and partially duplicate) statistics available + in struct rtnl_link_stats64. + Value of the `scope` attribute determines how statistics are + aggregated. When aggregated for the entire device the statistics + represent the total number of events since last explicit reset of + the device (i.e. not a reconfiguration like changing queue count). + When reported per-queue, however, the statistics may not add + up to the total number of events, will only be reported for currently + active objects, and will likely report the number of events since last + reconfiguration. + attributes: + - + name: ifindex + doc: ifindex of the netdevice to which stats belong. + type: u32 + checks: + min: 1 + - + name: queue-type + doc: Queue type as rx, tx, for queue-id. + type: u32 + enum: queue-type + - + name: queue-id + doc: Queue ID, if stats are scoped to a single queue instance. + type: u32 + - + name: scope + doc: | + What object type should be used to iterate over the stats. + type: uint + enum: qstats-scope + - + name: rx-packets + doc: | + Number of wire packets successfully received and passed to the stack. + For drivers supporting XDP, XDP is considered the first layer + of the stack, so packets consumed by XDP are still counted here. + type: uint + value: 8 # reserve some attr ids in case we need more metadata later + - + name: rx-bytes + doc: Successfully received bytes, see `rx-packets`. + type: uint + - + name: tx-packets + doc: | + Number of wire packets successfully sent. Packet is considered to be + successfully sent once it is in device memory (usually this means + the device has issued a DMA completion for the packet). + type: uint + - + name: tx-bytes + doc: Successfully sent bytes, see `tx-packets`. + type: uint + operations: list: - @@ -405,6 +469,26 @@ operations: attributes: - ifindex reply: *napi-get-op + - + name: qstats-get + doc: | + Get / dump fine grained statistics. Which statistics are reported + depends on the device and the driver, and whether the driver stores + software counters per-queue. + attribute-set: qstats + dump: + request: + attributes: + - scope + reply: + attributes: + - ifindex + - queue-type + - queue-id + - rx-packets + - rx-bytes + - tx-packets + - tx-bytes mcast-groups: list: diff --git a/Documentation/networking/statistics.rst b/Documentation/networking/statistics.rst index 551b3cc29a41..75e017dfa825 100644 --- a/Documentation/networking/statistics.rst +++ b/Documentation/networking/statistics.rst @@ -41,6 +41,15 @@ If `-s` is specified once the detailed errors won't be shown. `ip` supports JSON formatting via the `-j` option. +Queue statistics +~~~~~~~~~~~~~~~~ + +Queue statistics are accessible via the netdev netlink family. + +Currently no widely distributed CLI exists to access those statistics. +Kernel development tools (ynl) can be used to experiment with them, +see `Documentation/userspace-api/netlink/intro-specs.rst`. + Protocol-specific statistics ---------------------------- @@ -147,6 +156,12 @@ Statistics are reported both in the responses to link information requests (`RTM_GETLINK`) and statistic requests (`RTM_GETSTATS`, when `IFLA_STATS_LINK_64` bit is set in the `.filter_mask` of the request). +netdev (netlink) +~~~~~~~~~~~~~~~~ + +`netdev` generic netlink family allows accessing page pool and per queue +statistics. + ethtool ------- diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 416a800d72ba..4230c7f3b959 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1955,6 +1955,7 @@ enum netdev_reg_state { * * @sysfs_rx_queue_group: Space for optional per-rx queue attributes * @rtnl_link_ops: Rtnl_link_ops + * @stat_ops: Optional ops for queue-aware statistics * * @gso_max_size: Maximum size of generic segmentation offload * @tso_max_size: Device (as in HW) limit on the max TSO request size @@ -2335,6 +2336,8 @@ struct net_device { const struct rtnl_link_ops *rtnl_link_ops; + const struct netdev_stat_ops *stat_ops; + /* for setting kernel sock attribute on TCP connection setup */ #define GSO_MAX_SEGS 65535u #define GSO_LEGACY_MAX_SIZE 65536u diff --git a/include/net/netdev_queues.h b/include/net/netdev_queues.h index 8b8ed4e13d74..d633347eeda5 100644 --- a/include/net/netdev_queues.h +++ b/include/net/netdev_queues.h @@ -4,6 +4,60 @@ #include +struct netdev_queue_stats_rx { + u64 bytes; + u64 packets; +}; + +struct netdev_queue_stats_tx { + u64 bytes; + u64 packets; +}; + +/** + * struct netdev_stat_ops - netdev ops for fine grained stats + * @get_queue_stats_rx: get stats for a given Rx queue + * @get_queue_stats_tx: get stats for a given Tx queue + * @get_base_stats: get base stats (not belonging to any live instance) + * + * Query stats for a given object. The values of the statistics are undefined + * on entry (specifically they are *not* zero-initialized). Drivers should + * assign values only to the statistics they collect. Statistics which are not + * collected must be left undefined. + * + * Queue objects are not necessarily persistent, and only currently active + * queues are queried by the per-queue callbacks. This means that per-queue + * statistics will not generally add up to the total number of events for + * the device. The @get_base_stats callback allows filling in the delta + * between events for currently live queues and overall device history. + * When the statistics for the entire device are queried, first @get_base_stats + * is issued to collect the delta, and then a series of per-queue callbacks. + * Only statistics which are set in @get_base_stats will be reported + * at the device level, meaning that unlike in queue callbacks, setting + * a statistic to zero in @get_base_stats is a legitimate thing to do. + * This is because @get_base_stats has a second function of designating which + * statistics are in fact correct for the entire device (e.g. when history + * for some of the events is not maintained, and reliable "total" cannot + * be provided). + * + * Device drivers can assume that when collecting total device stats, + * the @get_base_stats and subsequent per-queue calls are performed + * "atomically" (without releasing the rtnl_lock). + * + * Device drivers are encouraged to reset the per-queue statistics when + * number of queues change. This is because the primary use case for + * per-queue statistics is currently to detect traffic imbalance. + */ +struct netdev_stat_ops { + void (*get_queue_stats_rx)(struct net_device *dev, int idx, + struct netdev_queue_stats_rx *stats); + void (*get_queue_stats_tx)(struct net_device *dev, int idx, + struct netdev_queue_stats_tx *stats); + void (*get_base_stats)(struct net_device *dev, + struct netdev_queue_stats_rx *rx, + struct netdev_queue_stats_tx *tx); +}; + /** * DOC: Lockless queue stopping / waking helpers. * diff --git a/include/uapi/linux/netdev.h b/include/uapi/linux/netdev.h index 93cb411adf72..639ffa04c172 100644 --- a/include/uapi/linux/netdev.h +++ b/include/uapi/linux/netdev.h @@ -70,6 +70,10 @@ enum netdev_queue_type { NETDEV_QUEUE_TYPE_TX, }; +enum netdev_qstats_scope { + NETDEV_QSTATS_SCOPE_QUEUE = 1, +}; + enum { NETDEV_A_DEV_IFINDEX = 1, NETDEV_A_DEV_PAD, @@ -132,6 +136,20 @@ enum { NETDEV_A_QUEUE_MAX = (__NETDEV_A_QUEUE_MAX - 1) }; +enum { + NETDEV_A_QSTATS_IFINDEX = 1, + NETDEV_A_QSTATS_QUEUE_TYPE, + NETDEV_A_QSTATS_QUEUE_ID, + NETDEV_A_QSTATS_SCOPE, + NETDEV_A_QSTATS_RX_PACKETS = 8, + NETDEV_A_QSTATS_RX_BYTES, + NETDEV_A_QSTATS_TX_PACKETS, + NETDEV_A_QSTATS_TX_BYTES, + + __NETDEV_A_QSTATS_MAX, + NETDEV_A_QSTATS_MAX = (__NETDEV_A_QSTATS_MAX - 1) +}; + enum { NETDEV_CMD_DEV_GET = 1, NETDEV_CMD_DEV_ADD_NTF, @@ -144,6 +162,7 @@ enum { NETDEV_CMD_PAGE_POOL_STATS_GET, NETDEV_CMD_QUEUE_GET, NETDEV_CMD_NAPI_GET, + NETDEV_CMD_QSTATS_GET, __NETDEV_CMD_MAX, NETDEV_CMD_MAX = (__NETDEV_CMD_MAX - 1) diff --git a/net/core/netdev-genl-gen.c b/net/core/netdev-genl-gen.c index be7f2ebd61b2..8d8ace9ef87f 100644 --- a/net/core/netdev-genl-gen.c +++ b/net/core/netdev-genl-gen.c @@ -68,6 +68,11 @@ static const struct nla_policy netdev_napi_get_dump_nl_policy[NETDEV_A_NAPI_IFIN [NETDEV_A_NAPI_IFINDEX] = NLA_POLICY_MIN(NLA_U32, 1), }; +/* NETDEV_CMD_QSTATS_GET - dump */ +static const struct nla_policy netdev_qstats_get_nl_policy[NETDEV_A_QSTATS_SCOPE + 1] = { + [NETDEV_A_QSTATS_SCOPE] = NLA_POLICY_MASK(NLA_UINT, 0x1), +}; + /* Ops table for netdev */ static const struct genl_split_ops netdev_nl_ops[] = { { @@ -138,6 +143,13 @@ static const struct genl_split_ops netdev_nl_ops[] = { .maxattr = NETDEV_A_NAPI_IFINDEX, .flags = GENL_CMD_CAP_DUMP, }, + { + .cmd = NETDEV_CMD_QSTATS_GET, + .dumpit = netdev_nl_qstats_get_dumpit, + .policy = netdev_qstats_get_nl_policy, + .maxattr = NETDEV_A_QSTATS_SCOPE, + .flags = GENL_CMD_CAP_DUMP, + }, }; static const struct genl_multicast_group netdev_nl_mcgrps[] = { diff --git a/net/core/netdev-genl-gen.h b/net/core/netdev-genl-gen.h index a47f2bcbe4fa..4db40fd5b4a9 100644 --- a/net/core/netdev-genl-gen.h +++ b/net/core/netdev-genl-gen.h @@ -28,6 +28,8 @@ int netdev_nl_queue_get_dumpit(struct sk_buff *skb, struct netlink_callback *cb); int netdev_nl_napi_get_doit(struct sk_buff *skb, struct genl_info *info); int netdev_nl_napi_get_dumpit(struct sk_buff *skb, struct netlink_callback *cb); +int netdev_nl_qstats_get_dumpit(struct sk_buff *skb, + struct netlink_callback *cb); enum { NETDEV_NLGRP_MGMT, diff --git a/net/core/netdev-genl.c b/net/core/netdev-genl.c index 918b109e0cf4..7fa75e13dc6d 100644 --- a/net/core/netdev-genl.c +++ b/net/core/netdev-genl.c @@ -8,6 +8,7 @@ #include #include #include +#include #include #include "netdev-genl-gen.h" @@ -460,6 +461,218 @@ int netdev_nl_queue_get_dumpit(struct sk_buff *skb, struct netlink_callback *cb) return err; } +#define NETDEV_STAT_NOT_SET (~0ULL) + +static void netdev_nl_stats_add(void *_sum, const void *_add, size_t size) +{ + const u64 *add = _add; + u64 *sum = _sum; + + while (size) { + if (*add != NETDEV_STAT_NOT_SET && *sum != NETDEV_STAT_NOT_SET) + *sum += *add; + sum++; + add++; + size -= 8; + } +} + +static int netdev_stat_put(struct sk_buff *rsp, unsigned int attr_id, u64 value) +{ + if (value == NETDEV_STAT_NOT_SET) + return 0; + return nla_put_uint(rsp, attr_id, value); +} + +static int +netdev_nl_stats_write_rx(struct sk_buff *rsp, struct netdev_queue_stats_rx *rx) +{ + if (netdev_stat_put(rsp, NETDEV_A_QSTATS_RX_PACKETS, rx->packets) || + netdev_stat_put(rsp, NETDEV_A_QSTATS_RX_BYTES, rx->bytes)) + return -EMSGSIZE; + return 0; +} + +static int +netdev_nl_stats_write_tx(struct sk_buff *rsp, struct netdev_queue_stats_tx *tx) +{ + if (netdev_stat_put(rsp, NETDEV_A_QSTATS_TX_PACKETS, tx->packets) || + netdev_stat_put(rsp, NETDEV_A_QSTATS_TX_BYTES, tx->bytes)) + return -EMSGSIZE; + return 0; +} + +static int +netdev_nl_stats_queue(struct net_device *netdev, struct sk_buff *rsp, + u32 q_type, int i, const struct genl_info *info) +{ + const struct netdev_stat_ops *ops = netdev->stat_ops; + struct netdev_queue_stats_rx rx; + struct netdev_queue_stats_tx tx; + void *hdr; + + hdr = genlmsg_iput(rsp, info); + if (!hdr) + return -EMSGSIZE; + if (nla_put_u32(rsp, NETDEV_A_QSTATS_IFINDEX, netdev->ifindex) || + nla_put_u32(rsp, NETDEV_A_QSTATS_QUEUE_TYPE, q_type) || + nla_put_u32(rsp, NETDEV_A_QSTATS_QUEUE_ID, i)) + goto nla_put_failure; + + switch (q_type) { + case NETDEV_QUEUE_TYPE_RX: + memset(&rx, 0xff, sizeof(rx)); + ops->get_queue_stats_rx(netdev, i, &rx); + if (!memchr_inv(&rx, 0xff, sizeof(rx))) + goto nla_cancel; + if (netdev_nl_stats_write_rx(rsp, &rx)) + goto nla_put_failure; + break; + case NETDEV_QUEUE_TYPE_TX: + memset(&tx, 0xff, sizeof(tx)); + ops->get_queue_stats_tx(netdev, i, &tx); + if (!memchr_inv(&tx, 0xff, sizeof(tx))) + goto nla_cancel; + if (netdev_nl_stats_write_tx(rsp, &tx)) + goto nla_put_failure; + break; + } + + genlmsg_end(rsp, hdr); + return 0; + +nla_cancel: + genlmsg_cancel(rsp, hdr); + return 0; +nla_put_failure: + genlmsg_cancel(rsp, hdr); + return -EMSGSIZE; +} + +static int +netdev_nl_stats_by_queue(struct net_device *netdev, struct sk_buff *rsp, + const struct genl_info *info, + struct netdev_nl_dump_ctx *ctx) +{ + const struct netdev_stat_ops *ops = netdev->stat_ops; + int i, err; + + if (!(netdev->flags & IFF_UP)) + return 0; + + i = ctx->rxq_idx; + while (ops->get_queue_stats_rx && i < netdev->real_num_rx_queues) { + err = netdev_nl_stats_queue(netdev, rsp, NETDEV_QUEUE_TYPE_RX, + i, info); + if (err) + return err; + ctx->rxq_idx = i++; + } + i = ctx->txq_idx; + while (ops->get_queue_stats_tx && i < netdev->real_num_tx_queues) { + err = netdev_nl_stats_queue(netdev, rsp, NETDEV_QUEUE_TYPE_TX, + i, info); + if (err) + return err; + ctx->txq_idx = i++; + } + + ctx->rxq_idx = 0; + ctx->txq_idx = 0; + return 0; +} + +static int +netdev_nl_stats_by_netdev(struct net_device *netdev, struct sk_buff *rsp, + const struct genl_info *info) +{ + struct netdev_queue_stats_rx rx_sum, rx; + struct netdev_queue_stats_tx tx_sum, tx; + const struct netdev_stat_ops *ops; + void *hdr; + int i; + + ops = netdev->stat_ops; + /* Netdev can't guarantee any complete counters */ + if (!ops->get_base_stats) + return 0; + + memset(&rx_sum, 0xff, sizeof(rx_sum)); + memset(&tx_sum, 0xff, sizeof(tx_sum)); + + ops->get_base_stats(netdev, &rx_sum, &tx_sum); + + /* The op was there, but nothing reported, don't bother */ + if (!memchr_inv(&rx_sum, 0xff, sizeof(rx_sum)) && + !memchr_inv(&tx_sum, 0xff, sizeof(tx_sum))) + return 0; + + hdr = genlmsg_iput(rsp, info); + if (!hdr) + return -EMSGSIZE; + if (nla_put_u32(rsp, NETDEV_A_QSTATS_IFINDEX, netdev->ifindex)) + goto nla_put_failure; + + for (i = 0; i < netdev->real_num_rx_queues; i++) { + memset(&rx, 0xff, sizeof(rx)); + if (ops->get_queue_stats_rx) + ops->get_queue_stats_rx(netdev, i, &rx); + netdev_nl_stats_add(&rx_sum, &rx, sizeof(rx)); + } + for (i = 0; i < netdev->real_num_tx_queues; i++) { + memset(&tx, 0xff, sizeof(tx)); + if (ops->get_queue_stats_tx) + ops->get_queue_stats_tx(netdev, i, &tx); + netdev_nl_stats_add(&tx_sum, &tx, sizeof(tx)); + } + + if (netdev_nl_stats_write_rx(rsp, &rx_sum) || + netdev_nl_stats_write_tx(rsp, &tx_sum)) + goto nla_put_failure; + + genlmsg_end(rsp, hdr); + return 0; + +nla_put_failure: + genlmsg_cancel(rsp, hdr); + return -EMSGSIZE; +} + +int netdev_nl_qstats_get_dumpit(struct sk_buff *skb, + struct netlink_callback *cb) +{ + struct netdev_nl_dump_ctx *ctx = netdev_dump_ctx(cb); + const struct genl_info *info = genl_info_dump(cb); + struct net *net = sock_net(skb->sk); + struct net_device *netdev; + unsigned int scope; + int err = 0; + + scope = 0; + if (info->attrs[NETDEV_A_QSTATS_SCOPE]) + scope = nla_get_uint(info->attrs[NETDEV_A_QSTATS_SCOPE]); + + rtnl_lock(); + for_each_netdev_dump(net, netdev, ctx->ifindex) { + if (!netdev->stat_ops) + continue; + + switch (scope) { + case 0: + err = netdev_nl_stats_by_netdev(netdev, skb, info); + break; + case NETDEV_QSTATS_SCOPE_QUEUE: + err = netdev_nl_stats_by_queue(netdev, skb, info, ctx); + break; + } + if (err < 0) + break; + } + rtnl_unlock(); + + return err; +} + static int netdev_genl_netdevice_event(struct notifier_block *nb, unsigned long event, void *ptr) { diff --git a/tools/include/uapi/linux/netdev.h b/tools/include/uapi/linux/netdev.h index 93cb411adf72..639ffa04c172 100644 --- a/tools/include/uapi/linux/netdev.h +++ b/tools/include/uapi/linux/netdev.h @@ -70,6 +70,10 @@ enum netdev_queue_type { NETDEV_QUEUE_TYPE_TX, }; +enum netdev_qstats_scope { + NETDEV_QSTATS_SCOPE_QUEUE = 1, +}; + enum { NETDEV_A_DEV_IFINDEX = 1, NETDEV_A_DEV_PAD, @@ -132,6 +136,20 @@ enum { NETDEV_A_QUEUE_MAX = (__NETDEV_A_QUEUE_MAX - 1) }; +enum { + NETDEV_A_QSTATS_IFINDEX = 1, + NETDEV_A_QSTATS_QUEUE_TYPE, + NETDEV_A_QSTATS_QUEUE_ID, + NETDEV_A_QSTATS_SCOPE, + NETDEV_A_QSTATS_RX_PACKETS = 8, + NETDEV_A_QSTATS_RX_BYTES, + NETDEV_A_QSTATS_TX_PACKETS, + NETDEV_A_QSTATS_TX_BYTES, + + __NETDEV_A_QSTATS_MAX, + NETDEV_A_QSTATS_MAX = (__NETDEV_A_QSTATS_MAX - 1) +}; + enum { NETDEV_CMD_DEV_GET = 1, NETDEV_CMD_DEV_ADD_NTF, @@ -144,6 +162,7 @@ enum { NETDEV_CMD_PAGE_POOL_STATS_GET, NETDEV_CMD_QUEUE_GET, NETDEV_CMD_NAPI_GET, + NETDEV_CMD_QSTATS_GET, __NETDEV_CMD_MAX, NETDEV_CMD_MAX = (__NETDEV_CMD_MAX - 1) -- cgit v1.2.3 From e5b7aefe38f7f6258935d8a10c36552dd957048a Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 8 Mar 2024 10:22:30 +0000 Subject: net: gro: move two declarations to include/net/gro.h Move gro_find_receive_by_type() and gro_find_complete_by_type() to include/net/gro.h where they belong. Also use _NET_GRO_H instead of _NET_IPV6_GRO_H to protect include/net/gro.h from multiple inclusions. Signed-off-by: Eric Dumazet Reviewed-by: Simon Horman Link: https://lore.kernel.org/r/20240308102230.296224-1-edumazet@google.com Signed-off-by: Jakub Kicinski --- include/linux/netdevice.h | 2 -- include/net/gro.h | 9 ++++++--- 2 files changed, 6 insertions(+), 5 deletions(-) (limited to 'include/linux/netdevice.h') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 4230c7f3b959..c6f6ac779b34 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -3901,8 +3901,6 @@ void napi_gro_flush(struct napi_struct *napi, bool flush_old); struct sk_buff *napi_get_frags(struct napi_struct *napi); void napi_get_frags_check(struct napi_struct *napi); gro_result_t napi_gro_frags(struct napi_struct *napi); -struct packet_offload *gro_find_receive_by_type(__be16 type); -struct packet_offload *gro_find_complete_by_type(__be16 type); static inline void napi_free_frags(struct napi_struct *napi) { diff --git a/include/net/gro.h b/include/net/gro.h index d6fc8fbd3730..50f1e403dbbb 100644 --- a/include/net/gro.h +++ b/include/net/gro.h @@ -1,7 +1,7 @@ /* SPDX-License-Identifier: GPL-2.0-or-later */ -#ifndef _NET_IPV6_GRO_H -#define _NET_IPV6_GRO_H +#ifndef _NET_GRO_H +#define _NET_GRO_H #include #include @@ -494,4 +494,7 @@ static inline void inet6_get_iif_sdif(const struct sk_buff *skb, int *iif, int * #endif } -#endif /* _NET_IPV6_GRO_H */ +struct packet_offload *gro_find_receive_by_type(__be16 type); +struct packet_offload *gro_find_complete_by_type(__be16 type); + +#endif /* _NET_GRO_H */ -- cgit v1.2.3 From f6e0a4984c2e7244689ea87b62b433bed9d07e94 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 14 Mar 2024 20:08:45 +0000 Subject: net: move dev->state into net_device_read_txrx group dev->state can be read in rx and tx fast paths. netif_running() which needs dev->state is called from - enqueue_to_backlog() [RX path] - __dev_direct_xmit() [TX path] Fixes: 43a71cd66b9c ("net-device: reorganize net_device fast path variables") Signed-off-by: Eric Dumazet Cc: Coco Li Reviewed-by: Jiri Pirko Link: https://lore.kernel.org/r/20240314200845.3050179-1-edumazet@google.com Signed-off-by: Paolo Abeni --- Documentation/networking/net_cachelines/net_device.rst | 2 +- include/linux/netdevice.h | 2 +- net/core/dev.c | 3 ++- 3 files changed, 4 insertions(+), 3 deletions(-) (limited to 'include/linux/netdevice.h') diff --git a/Documentation/networking/net_cachelines/net_device.rst b/Documentation/networking/net_cachelines/net_device.rst index dceb49d56a91..70c4fb9d4e5c 100644 --- a/Documentation/networking/net_cachelines/net_device.rst +++ b/Documentation/networking/net_cachelines/net_device.rst @@ -13,7 +13,7 @@ struct_dev_ifalias* ifalias unsigned_long mem_end unsigned_long mem_start unsigned_long base_addr -unsigned_long state +unsigned_long state read_mostly read_mostly netif_running(dev) struct_list_head dev_list struct_list_head napi_list struct_list_head unreg_list diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index c6f6ac779b34..cb37817d6382 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -2072,6 +2072,7 @@ struct net_device { struct pcpu_sw_netstats __percpu *tstats; struct pcpu_dstats __percpu *dstats; }; + unsigned long state; unsigned int flags; unsigned short hard_header_len; netdev_features_t features; @@ -2117,7 +2118,6 @@ struct net_device { * part of the usual set specified in Space.c. */ - unsigned long state; struct list_head dev_list; struct list_head napi_list; diff --git a/net/core/dev.c b/net/core/dev.c index 722787c32755..303a6ff46e4e 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -11665,11 +11665,12 @@ static void __init net_dev_struct_check(void) /* TXRX read-mostly hotpath */ CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_txrx, lstats); + CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_txrx, state); CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_txrx, flags); CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_txrx, hard_header_len); CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_txrx, features); CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_txrx, ip6_ptr); - CACHELINE_ASSERT_GROUP_SIZE(struct net_device, net_device_read_txrx, 38); + CACHELINE_ASSERT_GROUP_SIZE(struct net_device, net_device_read_txrx, 46); /* RX read-mostly hotpath */ CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, ptype_specific); -- cgit v1.2.3