From 723de3ebef03bc14bd72531f00f9094337654009 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Fri, 26 Jan 2024 12:14:49 -0800 Subject: net: free altname using an RCU callback We had to add another synchronize_rcu() in recent fix. Bite the bullet and add an rcu_head to netdev_name_node, free from RCU. Note that name_node does not hold any reference on dev to which it points, but there must be a synchronize_rcu() on device removal path, so we should be fine. Signed-off-by: Jakub Kicinski Reviewed-by: Jiri Pirko Reviewed-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/dev.c | 27 ++++++++++++++++----------- 1 file changed, 16 insertions(+), 11 deletions(-) (limited to 'net/core/dev.c') diff --git a/net/core/dev.c b/net/core/dev.c index cb2dab0feee0..b53b9c94de40 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -341,13 +341,22 @@ int netdev_name_node_alt_create(struct net_device *dev, const char *name) return 0; } -static void __netdev_name_node_alt_destroy(struct netdev_name_node *name_node) +static void netdev_name_node_alt_free(struct rcu_head *head) { - list_del(&name_node->list); + struct netdev_name_node *name_node = + container_of(head, struct netdev_name_node, rcu); + kfree(name_node->name); netdev_name_node_free(name_node); } +static void __netdev_name_node_alt_destroy(struct netdev_name_node *name_node) +{ + netdev_name_node_del(name_node); + list_del(&name_node->list); + call_rcu(&name_node->rcu, netdev_name_node_alt_free); +} + int netdev_name_node_alt_destroy(struct net_device *dev, const char *name) { struct netdev_name_node *name_node; @@ -362,10 +371,7 @@ int netdev_name_node_alt_destroy(struct net_device *dev, const char *name) if (name_node == dev->name_node || name_node->dev != dev) return -EINVAL; - netdev_name_node_del(name_node); - synchronize_rcu(); __netdev_name_node_alt_destroy(name_node); - return 0; } @@ -373,8 +379,10 @@ static void netdev_name_node_alt_flush(struct net_device *dev) { struct netdev_name_node *name_node, *tmp; - list_for_each_entry_safe(name_node, tmp, &dev->name_node->list, list) - __netdev_name_node_alt_destroy(name_node); + list_for_each_entry_safe(name_node, tmp, &dev->name_node->list, list) { + list_del(&name_node->list); + netdev_name_node_alt_free(&name_node->rcu); + } } /* Device list insertion */ @@ -11576,11 +11584,8 @@ static void __net_exit default_device_exit_net(struct net *net) snprintf(fb_name, IFNAMSIZ, "dev%%d"); netdev_for_each_altname_safe(dev, name_node, tmp) - if (netdev_name_in_use(&init_net, name_node->name)) { - netdev_name_node_del(name_node); - synchronize_rcu(); + if (netdev_name_in_use(&init_net, name_node->name)) __netdev_name_node_alt_destroy(name_node); - } err = dev_change_net_namespace(dev, &init_net, fb_name); if (err) { -- cgit v1.2.3 From ffabe98cb576097b77d404d39e8b3df03caa986a Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 2 Feb 2024 10:11:06 +0000 Subject: net: make dev_unreg_count global We can use a global dev_unreg_count counter instead of a per netns one. As a bonus we can factorize the changes done on it for bulk device removals. Signed-off-by: Eric Dumazet Signed-off-by: David S. 
Miller --- include/linux/rtnetlink.h | 1 + include/net/net_namespace.h | 2 -- net/core/dev.c | 12 +++++++++--- net/core/rtnetlink.c | 11 +---------- 4 files changed, 11 insertions(+), 15 deletions(-) (limited to 'net/core/dev.c') diff --git a/include/linux/rtnetlink.h b/include/linux/rtnetlink.h index 410529fca18b..21780608cf47 100644 --- a/include/linux/rtnetlink.h +++ b/include/linux/rtnetlink.h @@ -47,6 +47,7 @@ extern int rtnl_lock_killable(void); extern bool refcount_dec_and_rtnl_lock(refcount_t *r); extern wait_queue_head_t netdev_unregistering_wq; +extern atomic_t dev_unreg_count; extern struct rw_semaphore pernet_ops_rwsem; extern struct rw_semaphore net_rwsem; diff --git a/include/net/net_namespace.h b/include/net/net_namespace.h index 13b3a4e29fdb..cd0c2eedbb5e 100644 --- a/include/net/net_namespace.h +++ b/include/net/net_namespace.h @@ -67,8 +67,6 @@ struct net { */ spinlock_t rules_mod_lock; - atomic_t dev_unreg_count; - unsigned int dev_base_seq; /* protected by rtnl_mutex */ u32 ifindex; diff --git a/net/core/dev.c b/net/core/dev.c index b53b9c94de40..27ba057d06c4 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -9698,11 +9698,11 @@ static void dev_index_release(struct net *net, int ifindex) /* Delayed registration/unregisteration */ LIST_HEAD(net_todo_list); DECLARE_WAIT_QUEUE_HEAD(netdev_unregistering_wq); +atomic_t dev_unreg_count = ATOMIC_INIT(0); static void net_set_todo(struct net_device *dev) { list_add_tail(&dev->todo_list, &net_todo_list); - atomic_inc(&dev_net(dev)->dev_unreg_count); } static netdev_features_t netdev_sync_upper_features(struct net_device *lower, @@ -10529,6 +10529,7 @@ void netdev_run_todo(void) { struct net_device *dev, *tmp; struct list_head list; + int cnt; #ifdef CONFIG_LOCKDEP struct list_head unlink_list; @@ -10565,6 +10566,7 @@ void netdev_run_todo(void) linkwatch_sync_dev(dev); } + cnt = 0; while (!list_empty(&list)) { dev = netdev_wait_allrefs_any(&list); list_del(&dev->todo_list); @@ -10582,12 +10584,13 @@ void netdev_run_todo(void) if (dev->needs_free_netdev) free_netdev(dev); - if (atomic_dec_and_test(&dev_net(dev)->dev_unreg_count)) - wake_up(&netdev_unregistering_wq); + cnt++; /* Free network device */ kobject_put(&dev->dev.kobj); } + if (cnt && atomic_sub_and_test(cnt, &dev_unreg_count)) + wake_up(&netdev_unregistering_wq); } /* Convert net_device_stats to rtnl_link_stats64. rtnl_link_stats64 has @@ -11034,6 +11037,7 @@ void unregister_netdevice_many_notify(struct list_head *head, { struct net_device *dev, *tmp; LIST_HEAD(close_head); + int cnt = 0; BUG_ON(dev_boot_phase); ASSERT_RTNL(); @@ -11130,7 +11134,9 @@ void unregister_netdevice_many_notify(struct list_head *head, list_for_each_entry(dev, head, unreg_list) { netdev_put(dev, &dev->dev_registered_tracker); net_set_todo(dev); + cnt++; } + atomic_add(cnt, &dev_unreg_count); list_del(head); } diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index f6f29eb03ec2..31f433950c8d 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -483,24 +483,15 @@ EXPORT_SYMBOL_GPL(__rtnl_link_unregister); */ static void rtnl_lock_unregistering_all(void) { - struct net *net; - bool unregistering; DEFINE_WAIT_FUNC(wait, woken_wake_function); add_wait_queue(&netdev_unregistering_wq, &wait); for (;;) { - unregistering = false; rtnl_lock(); /* We held write locked pernet_ops_rwsem, and parallel * setup_net() and cleanup_net() are not possible. 
*/ - for_each_net(net) { - if (atomic_read(&net->dev_unreg_count) > 0) { - unregistering = true; - break; - } - } - if (!unregistering) + if (!atomic_read(&dev_unreg_count)) break; __rtnl_unlock(); -- cgit v1.2.3 From d160c66cda0ac8614adc53a5b5b0e6d6f1a05a5b Mon Sep 17 00:00:00 2001 From: Amit Cohen Date: Mon, 5 Feb 2024 12:30:22 +0200 Subject: net: Do not return value from init_dummy_netdev() init_dummy_netdev() always returns zero and all the callers do not check the returned value. Set the function to not return value, as it is not really used today. Signed-off-by: Amit Cohen Reviewed-by: Ido Schimmel Reviewed-by: Jiri Pirko Reviewed-by: Simon Horman Link: https://lore.kernel.org/r/20240205103022.440946-1-amcohen@nvidia.com Signed-off-by: Jakub Kicinski --- include/linux/netdevice.h | 2 +- net/core/dev.c | 4 +--- 2 files changed, 2 insertions(+), 4 deletions(-) (limited to 'net/core/dev.c') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 118c40258d07..1845dd5043b4 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -3198,7 +3198,7 @@ static inline void unregister_netdevice(struct net_device *dev) int netdev_refcnt_read(const struct net_device *dev); void free_netdev(struct net_device *dev); void netdev_freemem(struct net_device *dev); -int init_dummy_netdev(struct net_device *dev); +void init_dummy_netdev(struct net_device *dev); struct net_device *netdev_get_xmit_slave(struct net_device *dev, struct sk_buff *skb, diff --git a/net/core/dev.c b/net/core/dev.c index 27ba057d06c4..e52e2888cccd 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -10345,7 +10345,7 @@ EXPORT_SYMBOL(register_netdevice); * that need to tie several hardware interfaces to a single NAPI * poll scheduler due to HW limitations. */ -int init_dummy_netdev(struct net_device *dev) +void init_dummy_netdev(struct net_device *dev) { /* Clear everything. Note we don't initialize spinlocks * are they aren't supposed to be taken by any of the @@ -10373,8 +10373,6 @@ int init_dummy_netdev(struct net_device *dev) * because users of this 'device' dont need to change * its refcount. */ - - return 0; } EXPORT_SYMBOL_GPL(init_dummy_netdev); -- cgit v1.2.3 From 13d381b440ed84ec4cc92975de035efb1a9e5f7e Mon Sep 17 00:00:00 2001 From: Stefan Roesch Date: Tue, 6 Feb 2024 09:30:03 -0700 Subject: net: split off __napi_busy_poll from napi_busy_poll This splits off the key part of the napi_busy_poll function into its own function, __napi_busy_poll, and changes the prefer_busy_poll bool to be flag based to allow passing in more flags in the future. This is done in preparation for an additional napi_busy_poll() function, that doesn't take the rcu_read_lock(). The new function is introduced in the next patch. 
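The bool-to-flags conversion follows a standard C pattern: the public entry point keeps its boolean parameter for compatibility and translates it into one bit of a flags word, which the new internal helper consumes, so further options can be added without touching existing callers. A minimal, self-contained sketch of the pattern, assuming illustrative names rather than the kernel's:

#include <stdbool.h>
#include <stdio.h>

enum {
	F_PREFER_BUSY_POLL = 1,	/* behavior formerly selected by a bool */
	F_END_ON_RESCHED   = 2,	/* spare bit for future callers */
};

/* New internal core consumes a flags word. */
static void __poll_loop(unsigned int flags)
{
	if (flags & F_PREFER_BUSY_POLL)
		printf("prefer busy polling over interrupts\n");
	if (flags & F_END_ON_RESCHED)
		printf("abort instead of rescheduling\n");
}

/* Existing public API keeps its bool and maps it onto a flag bit. */
static void poll_loop(bool prefer_busy_poll)
{
	__poll_loop(prefer_busy_poll ? F_PREFER_BUSY_POLL : 0);
}

int main(void)
{
	poll_loop(true);		/* old-style caller, unchanged */
	__poll_loop(F_END_ON_RESCHED);	/* new-style caller */
	return 0;
}

Existing callers compile unchanged, while new entry points can combine bits freely, which is exactly what the napi_busy_loop_rcu() variant added in the next commit relies on.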
Signed-off-by: Stefan Roesch Link: https://lore.kernel.org/r/20230608163839.2891748-2-shr@devkernel.io Signed-off-by: Jens Axboe Signed-off-by: Jakub Kicinski --- net/core/dev.c | 42 ++++++++++++++++++++++++++++-------------- 1 file changed, 28 insertions(+), 14 deletions(-) (limited to 'net/core/dev.c') diff --git a/net/core/dev.c b/net/core/dev.c index cb2dab0feee0..1eaed657f2c2 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -6177,8 +6177,12 @@ static void __busy_poll_stop(struct napi_struct *napi, bool skip_schedule) clear_bit(NAPI_STATE_SCHED, &napi->state); } -static void busy_poll_stop(struct napi_struct *napi, void *have_poll_lock, bool prefer_busy_poll, - u16 budget) +enum { + NAPI_F_PREFER_BUSY_POLL = 1, +}; + +static void busy_poll_stop(struct napi_struct *napi, void *have_poll_lock, + unsigned flags, u16 budget) { bool skip_schedule = false; unsigned long timeout; @@ -6198,7 +6202,7 @@ static void busy_poll_stop(struct napi_struct *napi, void *have_poll_lock, bool local_bh_disable(); - if (prefer_busy_poll) { + if (flags & NAPI_F_PREFER_BUSY_POLL) { napi->defer_hard_irqs_count = READ_ONCE(napi->dev->napi_defer_hard_irqs); timeout = READ_ONCE(napi->dev->gro_flush_timeout); if (napi->defer_hard_irqs_count && timeout) { @@ -6222,23 +6226,23 @@ static void busy_poll_stop(struct napi_struct *napi, void *have_poll_lock, bool local_bh_enable(); } -void napi_busy_loop(unsigned int napi_id, - bool (*loop_end)(void *, unsigned long), - void *loop_end_arg, bool prefer_busy_poll, u16 budget) +static void __napi_busy_loop(unsigned int napi_id, + bool (*loop_end)(void *, unsigned long), + void *loop_end_arg, unsigned flags, u16 budget) { unsigned long start_time = loop_end ? busy_loop_current_time() : 0; int (*napi_poll)(struct napi_struct *napi, int budget); void *have_poll_lock = NULL; struct napi_struct *napi; + WARN_ON_ONCE(!rcu_read_lock_held()); + restart: napi_poll = NULL; - rcu_read_lock(); - napi = napi_by_id(napi_id); if (!napi) - goto out; + return; if (!IS_ENABLED(CONFIG_PREEMPT_RT)) preempt_disable(); @@ -6254,14 +6258,14 @@ restart: */ if (val & (NAPIF_STATE_DISABLE | NAPIF_STATE_SCHED | NAPIF_STATE_IN_BUSY_POLL)) { - if (prefer_busy_poll) + if (flags & NAPI_F_PREFER_BUSY_POLL) set_bit(NAPI_STATE_PREFER_BUSY_POLL, &napi->state); goto count; } if (cmpxchg(&napi->state, val, val | NAPIF_STATE_IN_BUSY_POLL | NAPIF_STATE_SCHED) != val) { - if (prefer_busy_poll) + if (flags & NAPI_F_PREFER_BUSY_POLL) set_bit(NAPI_STATE_PREFER_BUSY_POLL, &napi->state); goto count; } @@ -6282,11 +6286,12 @@ count: if (unlikely(need_resched())) { if (napi_poll) - busy_poll_stop(napi, have_poll_lock, prefer_busy_poll, budget); + busy_poll_stop(napi, have_poll_lock, flags, budget); if (!IS_ENABLED(CONFIG_PREEMPT_RT)) preempt_enable(); rcu_read_unlock(); cond_resched(); + rcu_read_lock(); if (loop_end(loop_end_arg, start_time)) return; goto restart; @@ -6294,10 +6299,19 @@ count: cpu_relax(); } if (napi_poll) - busy_poll_stop(napi, have_poll_lock, prefer_busy_poll, budget); + busy_poll_stop(napi, have_poll_lock, flags, budget); if (!IS_ENABLED(CONFIG_PREEMPT_RT)) preempt_enable(); -out: +} + +void napi_busy_loop(unsigned int napi_id, + bool (*loop_end)(void *, unsigned long), + void *loop_end_arg, bool prefer_busy_poll, u16 budget) +{ + unsigned flags = prefer_busy_poll ? 
NAPI_F_PREFER_BUSY_POLL : 0; + + rcu_read_lock(); + __napi_busy_loop(napi_id, loop_end, loop_end_arg, flags, budget); rcu_read_unlock(); } EXPORT_SYMBOL(napi_busy_loop); -- cgit v1.2.3 From b4e8ae5c8c41355791a99fdf2fcac16deace1e79 Mon Sep 17 00:00:00 2001 From: Stefan Roesch Date: Tue, 6 Feb 2024 09:30:04 -0700 Subject: net: add napi_busy_loop_rcu() This adds the napi_busy_loop_rcu() function. This function assumes that the calling function is already holding the rcu read lock and napi_busy_loop() does not need to take the rcu read lock. Add a NAPI_F_NO_SCHED flag, which tells __napi_busy_loop() to abort if we need to reschedule rather than drop the RCU read lock and reschedule. Signed-off-by: Stefan Roesch Link: https://lore.kernel.org/r/20230608163839.2891748-3-shr@devkernel.io Signed-off-by: Jens Axboe Signed-off-by: Jakub Kicinski --- include/net/busy_poll.h | 4 ++++ net/core/dev.c | 15 +++++++++++++++ 2 files changed, 19 insertions(+) (limited to 'net/core/dev.c') diff --git a/include/net/busy_poll.h b/include/net/busy_poll.h index 4dabeb6c76d3..9b09acac538e 100644 --- a/include/net/busy_poll.h +++ b/include/net/busy_poll.h @@ -48,6 +48,10 @@ void napi_busy_loop(unsigned int napi_id, bool (*loop_end)(void *, unsigned long), void *loop_end_arg, bool prefer_busy_poll, u16 budget); +void napi_busy_loop_rcu(unsigned int napi_id, + bool (*loop_end)(void *, unsigned long), + void *loop_end_arg, bool prefer_busy_poll, u16 budget); + #else /* CONFIG_NET_RX_BUSY_POLL */ static inline unsigned long net_busy_loop_on(void) { diff --git a/net/core/dev.c b/net/core/dev.c index 1eaed657f2c2..ffa394f3e796 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -6179,6 +6179,7 @@ static void __busy_poll_stop(struct napi_struct *napi, bool skip_schedule) enum { NAPI_F_PREFER_BUSY_POLL = 1, + NAPI_F_END_ON_RESCHED = 2, }; static void busy_poll_stop(struct napi_struct *napi, void *have_poll_lock, @@ -6285,6 +6286,8 @@ count: break; if (unlikely(need_resched())) { + if (flags & NAPI_F_END_ON_RESCHED) + break; if (napi_poll) busy_poll_stop(napi, have_poll_lock, flags, budget); if (!IS_ENABLED(CONFIG_PREEMPT_RT)) @@ -6304,6 +6307,18 @@ count: preempt_enable(); } +void napi_busy_loop_rcu(unsigned int napi_id, + bool (*loop_end)(void *, unsigned long), + void *loop_end_arg, bool prefer_busy_poll, u16 budget) +{ + unsigned flags = NAPI_F_END_ON_RESCHED; + + if (prefer_busy_poll) + flags |= NAPI_F_PREFER_BUSY_POLL; + + __napi_busy_loop(napi_id, loop_end, loop_end_arg, flags, budget); +} + void napi_busy_loop(unsigned int napi_id, bool (*loop_end)(void *, unsigned long), void *loop_end_arg, bool prefer_busy_poll, u16 budget) -- cgit v1.2.3 From 4cd582ffa5a9a5d58e5bac9c5e55ca8eeabffddc Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 9 Feb 2024 15:30:57 +0000 Subject: net: use synchronize_net() in dev_change_name() dev_change_name() holds RTNL, we better use synchronize_net() instead of plain synchronize_rcu(). Signed-off-by: Eric Dumazet Signed-off-by: David S. 
Miller --- net/core/dev.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/core/dev.c') diff --git a/net/core/dev.c b/net/core/dev.c index 31f2c97d1990..7cf15d2bf78d 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1239,7 +1239,7 @@ rollback: netdev_name_node_del(dev->name_node); write_unlock(&dev_base_lock); - synchronize_rcu(); + synchronize_net(); write_lock(&dev_base_lock); netdev_name_node_add(net, dev->name_node); -- cgit v1.2.3 From 2b0cfa6e49566c8fa6759734cf821aa6e8271a9e Mon Sep 17 00:00:00 2001 From: Lorenzo Bianconi Date: Mon, 12 Feb 2024 10:50:54 +0100 Subject: net: add generic percpu page_pool allocator Introduce generic percpu page_pools allocator. Moreover add page_pool_create_percpu() and cpuid filed in page_pool struct in order to recycle the page in the page_pool "hot" cache if napi_pp_put_page() is running on the same cpu. This is a preliminary patch to add xdp multi-buff support for xdp running in generic mode. Acked-by: Jesper Dangaard Brouer Reviewed-by: Toke Hoiland-Jorgensen Signed-off-by: Lorenzo Bianconi Link: https://lore.kernel.org/r/80bc4285228b6f4220cd03de1999d86e46e3fcbd.1707729884.git.lorenzo@kernel.org Signed-off-by: Jakub Kicinski --- include/net/page_pool/types.h | 3 +++ net/core/dev.c | 45 +++++++++++++++++++++++++++++++++++++++++++ net/core/page_pool.c | 23 ++++++++++++++++++---- net/core/skbuff.c | 5 +++-- 4 files changed, 70 insertions(+), 6 deletions(-) (limited to 'net/core/dev.c') diff --git a/include/net/page_pool/types.h b/include/net/page_pool/types.h index 76481c465375..3828396ae60c 100644 --- a/include/net/page_pool/types.h +++ b/include/net/page_pool/types.h @@ -128,6 +128,7 @@ struct page_pool_stats { struct page_pool { struct page_pool_params_fast p; + int cpuid; bool has_init_callback; long frag_users; @@ -203,6 +204,8 @@ struct page *page_pool_alloc_pages(struct page_pool *pool, gfp_t gfp); struct page *page_pool_alloc_frag(struct page_pool *pool, unsigned int *offset, unsigned int size, gfp_t gfp); struct page_pool *page_pool_create(const struct page_pool_params *params); +struct page_pool *page_pool_create_percpu(const struct page_pool_params *params, + int cpuid); struct xdp_mem_info; diff --git a/net/core/dev.c b/net/core/dev.c index 7cf15d2bf78d..e19bdf1421e0 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -153,6 +153,8 @@ #include #include #include +#include +#include #include "dev.h" #include "net-sysfs.h" @@ -450,6 +452,12 @@ static RAW_NOTIFIER_HEAD(netdev_chain); DEFINE_PER_CPU_ALIGNED(struct softnet_data, softnet_data); EXPORT_PER_CPU_SYMBOL(softnet_data); +/* Page_pool has a lockless array/stack to alloc/recycle pages. + * PP consumers must pay attention to run APIs in the appropriate context + * (e.g. NAPI context). 
+ */ +static DEFINE_PER_CPU_ALIGNED(struct page_pool *, system_page_pool); + #ifdef CONFIG_LOCKDEP /* * register_netdevice() inits txq->_xmit_lock and sets lockdep class @@ -11724,6 +11732,27 @@ static void __init net_dev_struct_check(void) * */ +/* We allocate 256 pages for each CPU if PAGE_SHIFT is 12 */ +#define SYSTEM_PERCPU_PAGE_POOL_SIZE ((1 << 20) / PAGE_SIZE) + +static int net_page_pool_create(int cpuid) +{ +#if IS_ENABLED(CONFIG_PAGE_POOL) + struct page_pool_params page_pool_params = { + .pool_size = SYSTEM_PERCPU_PAGE_POOL_SIZE, + .nid = NUMA_NO_NODE, + }; + struct page_pool *pp_ptr; + + pp_ptr = page_pool_create_percpu(&page_pool_params, cpuid); + if (IS_ERR(pp_ptr)) + return -ENOMEM; + + per_cpu(system_page_pool, cpuid) = pp_ptr; +#endif + return 0; +} + /* * This is called single threaded during boot, so no need * to take the rtnl semaphore. @@ -11776,6 +11805,9 @@ static int __init net_dev_init(void) init_gro_hash(&sd->backlog); sd->backlog.poll = process_backlog; sd->backlog.weight = weight_p; + + if (net_page_pool_create(i)) + goto out; } dev_boot_phase = 0; @@ -11803,6 +11835,19 @@ static int __init net_dev_init(void) WARN_ON(rc < 0); rc = 0; out: + if (rc < 0) { + for_each_possible_cpu(i) { + struct page_pool *pp_ptr; + + pp_ptr = per_cpu(system_page_pool, i); + if (!pp_ptr) + continue; + + page_pool_destroy(pp_ptr); + per_cpu(system_page_pool, i) = NULL; + } + } + return rc; } diff --git a/net/core/page_pool.c b/net/core/page_pool.c index 4933762e5a6b..89c835fcf094 100644 --- a/net/core/page_pool.c +++ b/net/core/page_pool.c @@ -171,13 +171,16 @@ static void page_pool_producer_unlock(struct page_pool *pool, } static int page_pool_init(struct page_pool *pool, - const struct page_pool_params *params) + const struct page_pool_params *params, + int cpuid) { unsigned int ring_qsize = 1024; /* Default */ memcpy(&pool->p, ¶ms->fast, sizeof(pool->p)); memcpy(&pool->slow, ¶ms->slow, sizeof(pool->slow)); + pool->cpuid = cpuid; + /* Validate only known flags were used */ if (pool->p.flags & ~(PP_FLAG_ALL)) return -EINVAL; @@ -253,10 +256,12 @@ static void page_pool_uninit(struct page_pool *pool) } /** - * page_pool_create() - create a page pool. + * page_pool_create_percpu() - create a page pool for a given cpu. 
* @params: parameters, see struct page_pool_params + * @cpuid: cpu identifier */ -struct page_pool *page_pool_create(const struct page_pool_params *params) +struct page_pool * +page_pool_create_percpu(const struct page_pool_params *params, int cpuid) { struct page_pool *pool; int err; @@ -265,7 +270,7 @@ struct page_pool *page_pool_create(const struct page_pool_params *params) if (!pool) return ERR_PTR(-ENOMEM); - err = page_pool_init(pool, params); + err = page_pool_init(pool, params, cpuid); if (err < 0) goto err_free; @@ -282,6 +287,16 @@ err_free: kfree(pool); return ERR_PTR(err); } +EXPORT_SYMBOL(page_pool_create_percpu); + +/** + * page_pool_create() - create a page pool + * @params: parameters, see struct page_pool_params + */ +struct page_pool *page_pool_create(const struct page_pool_params *params) +{ + return page_pool_create_percpu(params, -1); +} EXPORT_SYMBOL(page_pool_create); static void page_pool_return_page(struct page_pool *pool, struct page *page); diff --git a/net/core/skbuff.c b/net/core/skbuff.c index edbbef563d4d..9e5eb47b4025 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -923,9 +923,10 @@ bool napi_pp_put_page(struct page *page, bool napi_safe) */ if (napi_safe || in_softirq()) { const struct napi_struct *napi = READ_ONCE(pp->p.napi); + unsigned int cpuid = smp_processor_id(); - allow_direct = napi && - READ_ONCE(napi->list_owner) == smp_processor_id(); + allow_direct = napi && READ_ONCE(napi->list_owner) == cpuid; + allow_direct |= (pp->cpuid == cpuid); } /* Driver set this to memory recycling info. Reset it on recycle. -- cgit v1.2.3 From 4d2bb0bfe8741a8778e0053f31a4e0f0cba80e8b Mon Sep 17 00:00:00 2001 From: Lorenzo Bianconi Date: Mon, 12 Feb 2024 10:50:55 +0100 Subject: xdp: rely on skb pointer reference in do_xdp_generic and netif_receive_generic_xdp Rely on skb pointer reference instead of the skb pointer in do_xdp_generic and netif_receive_generic_xdp routine signatures. This is a preliminary patch to add multi-buff support for xdp running in generic mode where we will need to reallocate the skb to avoid linearization and we will need to make it visible to do_xdp_generic() caller. 
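Passing struct sk_buff **pskb is the usual C idiom for a callee that may replace the caller's buffer: after the call, users must dereference the double pointer rather than any stale local copy, which is why the tun driver and the core receive path are converted in the same patch. A small sketch of the idiom, using simplified types that are not the kernel's:

#include <stdlib.h>
#include <string.h>

struct buf {
	size_t cap;
	unsigned char data[];	/* stands in for the skb payload */
};

/*
 * May swap *pbuf for a larger allocation. On success the caller's
 * pointer is updated in place; on failure the old buffer survives.
 */
static int buf_grow(struct buf **pbuf, size_t ncap)
{
	struct buf *old = *pbuf, *nbuf;

	if (ncap <= old->cap)
		return 0;

	nbuf = malloc(sizeof(*nbuf) + ncap);
	if (!nbuf)
		return -1;

	nbuf->cap = ncap;
	memcpy(nbuf->data, old->data, old->cap);
	free(old);
	*pbuf = nbuf;		/* replacement is visible to the caller */
	return 0;
}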
Acked-by: Jesper Dangaard Brouer Reviewed-by: Toke Hoiland-Jorgensen Signed-off-by: Lorenzo Bianconi Link: https://lore.kernel.org/r/c09415b1f48c8620ef4d76deed35050a7bddf7c2.1707729884.git.lorenzo@kernel.org Signed-off-by: Jakub Kicinski --- drivers/net/tun.c | 4 ++-- include/linux/netdevice.h | 2 +- net/core/dev.c | 16 +++++++++------- 3 files changed, 12 insertions(+), 10 deletions(-) (limited to 'net/core/dev.c') diff --git a/drivers/net/tun.c b/drivers/net/tun.c index b472f2c972d8..bc80fc1d576e 100644 --- a/drivers/net/tun.c +++ b/drivers/net/tun.c @@ -1926,7 +1926,7 @@ static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile, rcu_read_lock(); xdp_prog = rcu_dereference(tun->xdp_prog); if (xdp_prog) { - ret = do_xdp_generic(xdp_prog, skb); + ret = do_xdp_generic(xdp_prog, &skb); if (ret != XDP_PASS) { rcu_read_unlock(); local_bh_enable(); @@ -2516,7 +2516,7 @@ build: skb_record_rx_queue(skb, tfile->queue_index); if (skb_xdp) { - ret = do_xdp_generic(xdp_prog, skb); + ret = do_xdp_generic(xdp_prog, &skb); if (ret != XDP_PASS) { ret = 0; goto out; diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 07cefa32eafa..a3f9c95da51e 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -3958,7 +3958,7 @@ static inline void dev_consume_skb_any(struct sk_buff *skb) u32 bpf_prog_run_generic_xdp(struct sk_buff *skb, struct xdp_buff *xdp, struct bpf_prog *xdp_prog); void generic_xdp_tx(struct sk_buff *skb, struct bpf_prog *xdp_prog); -int do_xdp_generic(struct bpf_prog *xdp_prog, struct sk_buff *skb); +int do_xdp_generic(struct bpf_prog *xdp_prog, struct sk_buff **pskb); int netif_rx(struct sk_buff *skb); int __netif_rx(struct sk_buff *skb); diff --git a/net/core/dev.c b/net/core/dev.c index e19bdf1421e0..ffeb0e0279fe 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -4936,10 +4936,11 @@ u32 bpf_prog_run_generic_xdp(struct sk_buff *skb, struct xdp_buff *xdp, return act; } -static u32 netif_receive_generic_xdp(struct sk_buff *skb, +static u32 netif_receive_generic_xdp(struct sk_buff **pskb, struct xdp_buff *xdp, struct bpf_prog *xdp_prog) { + struct sk_buff *skb = *pskb; u32 act = XDP_DROP; /* Reinjected packets coming from act_mirred or similar should @@ -5020,24 +5021,24 @@ void generic_xdp_tx(struct sk_buff *skb, struct bpf_prog *xdp_prog) static DEFINE_STATIC_KEY_FALSE(generic_xdp_needed_key); -int do_xdp_generic(struct bpf_prog *xdp_prog, struct sk_buff *skb) +int do_xdp_generic(struct bpf_prog *xdp_prog, struct sk_buff **pskb) { if (xdp_prog) { struct xdp_buff xdp; u32 act; int err; - act = netif_receive_generic_xdp(skb, &xdp, xdp_prog); + act = netif_receive_generic_xdp(pskb, &xdp, xdp_prog); if (act != XDP_PASS) { switch (act) { case XDP_REDIRECT: - err = xdp_do_generic_redirect(skb->dev, skb, + err = xdp_do_generic_redirect((*pskb)->dev, *pskb, &xdp, xdp_prog); if (err) goto out_redir; break; case XDP_TX: - generic_xdp_tx(skb, xdp_prog); + generic_xdp_tx(*pskb, xdp_prog); break; } return XDP_DROP; @@ -5045,7 +5046,7 @@ int do_xdp_generic(struct bpf_prog *xdp_prog, struct sk_buff *skb) } return XDP_PASS; out_redir: - kfree_skb_reason(skb, SKB_DROP_REASON_XDP); + kfree_skb_reason(*pskb, SKB_DROP_REASON_XDP); return XDP_DROP; } EXPORT_SYMBOL_GPL(do_xdp_generic); @@ -5368,7 +5369,8 @@ another_round: int ret2; migrate_disable(); - ret2 = do_xdp_generic(rcu_dereference(skb->dev->xdp_prog), skb); + ret2 = do_xdp_generic(rcu_dereference(skb->dev->xdp_prog), + &skb); migrate_enable(); if (ret2 != XDP_PASS) { -- cgit v1.2.3 From 
e6d5dbdd20aa6a86974af51deb9414cd2e7794cb Mon Sep 17 00:00:00 2001 From: Lorenzo Bianconi Date: Mon, 12 Feb 2024 10:50:56 +0100 Subject: xdp: add multi-buff support for xdp running in generic mode Similar to native xdp, do not always linearize the skb in netif_receive_generic_xdp routine but create a non-linear xdp_buff to be processed by the eBPF program. This allow to add multi-buffer support for xdp running in generic mode. Acked-by: Jesper Dangaard Brouer Reviewed-by: Toke Hoiland-Jorgensen Signed-off-by: Lorenzo Bianconi Link: https://lore.kernel.org/r/1044d6412b1c3e95b40d34993fd5f37cd2f319fd.1707729884.git.lorenzo@kernel.org Signed-off-by: Jakub Kicinski --- include/linux/skbuff.h | 2 ++ net/core/dev.c | 70 +++++++++++++++++++++++++++----------- net/core/skbuff.c | 91 ++++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 144 insertions(+), 19 deletions(-) (limited to 'net/core/dev.c') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 2dde34c29203..def3d8689c3d 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -3446,6 +3446,8 @@ static inline void skb_frag_ref(struct sk_buff *skb, int f) __skb_frag_ref(&skb_shinfo(skb)->frags[f]); } +int skb_cow_data_for_xdp(struct page_pool *pool, struct sk_buff **pskb, + struct bpf_prog *prog); bool napi_pp_put_page(struct page *page, bool napi_safe); static inline void diff --git a/net/core/dev.c b/net/core/dev.c index ffeb0e0279fe..2d02ca8a3da5 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -4874,6 +4874,12 @@ u32 bpf_prog_run_generic_xdp(struct sk_buff *skb, struct xdp_buff *xdp, xdp_init_buff(xdp, frame_sz, &rxqueue->xdp_rxq); xdp_prepare_buff(xdp, hard_start, skb_headroom(skb) - mac_len, skb_headlen(skb) + mac_len, true); + if (skb_is_nonlinear(skb)) { + skb_shinfo(skb)->xdp_frags_size = skb->data_len; + xdp_buff_set_frags_flag(xdp); + } else { + xdp_buff_clear_frags_flag(xdp); + } orig_data_end = xdp->data_end; orig_data = xdp->data; @@ -4903,6 +4909,14 @@ u32 bpf_prog_run_generic_xdp(struct sk_buff *skb, struct xdp_buff *xdp, skb->len += off; /* positive on grow, negative on shrink */ } + /* XDP frag metadata (e.g. nr_frags) are updated in eBPF helpers + * (e.g. bpf_xdp_adjust_tail), we need to update data_len here. + */ + if (xdp_buff_has_frags(xdp)) + skb->data_len = skb_shinfo(skb)->xdp_frags_size; + else + skb->data_len = 0; + /* check if XDP changed eth hdr such SKB needs update */ eth = (struct ethhdr *)xdp->data; if ((orig_eth_type != eth->h_proto) || @@ -4936,12 +4950,35 @@ u32 bpf_prog_run_generic_xdp(struct sk_buff *skb, struct xdp_buff *xdp, return act; } +static int +netif_skb_check_for_xdp(struct sk_buff **pskb, struct bpf_prog *prog) +{ + struct sk_buff *skb = *pskb; + int err, hroom, troom; + + if (!skb_cow_data_for_xdp(this_cpu_read(system_page_pool), pskb, prog)) + return 0; + + /* In case we have to go down the path and also linearize, + * then lets do the pskb_expand_head() work just once here. + */ + hroom = XDP_PACKET_HEADROOM - skb_headroom(skb); + troom = skb->tail + skb->data_len - skb->end; + err = pskb_expand_head(skb, + hroom > 0 ? ALIGN(hroom, NET_SKB_PAD) : 0, + troom > 0 ? troom + 128 : 0, GFP_ATOMIC); + if (err) + return err; + + return skb_linearize(skb); +} + static u32 netif_receive_generic_xdp(struct sk_buff **pskb, struct xdp_buff *xdp, struct bpf_prog *xdp_prog) { struct sk_buff *skb = *pskb; - u32 act = XDP_DROP; + u32 mac_len, act = XDP_DROP; /* Reinjected packets coming from act_mirred or similar should * not get XDP generic processing. 
@@ -4949,41 +4986,36 @@ static u32 netif_receive_generic_xdp(struct sk_buff **pskb, if (skb_is_redirected(skb)) return XDP_PASS; - /* XDP packets must be linear and must have sufficient headroom - * of XDP_PACKET_HEADROOM bytes. This is the guarantee that also - * native XDP provides, thus we need to do it here as well. + /* XDP packets must have sufficient headroom of XDP_PACKET_HEADROOM + * bytes. This is the guarantee that also native XDP provides, + * thus we need to do it here as well. */ + mac_len = skb->data - skb_mac_header(skb); + __skb_push(skb, mac_len); + if (skb_cloned(skb) || skb_is_nonlinear(skb) || skb_headroom(skb) < XDP_PACKET_HEADROOM) { - int hroom = XDP_PACKET_HEADROOM - skb_headroom(skb); - int troom = skb->tail + skb->data_len - skb->end; - - /* In case we have to go down the path and also linearize, - * then lets do the pskb_expand_head() work just once here. - */ - if (pskb_expand_head(skb, - hroom > 0 ? ALIGN(hroom, NET_SKB_PAD) : 0, - troom > 0 ? troom + 128 : 0, GFP_ATOMIC)) - goto do_drop; - if (skb_linearize(skb)) + if (netif_skb_check_for_xdp(pskb, xdp_prog)) goto do_drop; } - act = bpf_prog_run_generic_xdp(skb, xdp, xdp_prog); + __skb_pull(*pskb, mac_len); + + act = bpf_prog_run_generic_xdp(*pskb, xdp, xdp_prog); switch (act) { case XDP_REDIRECT: case XDP_TX: case XDP_PASS: break; default: - bpf_warn_invalid_xdp_action(skb->dev, xdp_prog, act); + bpf_warn_invalid_xdp_action((*pskb)->dev, xdp_prog, act); fallthrough; case XDP_ABORTED: - trace_xdp_exception(skb->dev, xdp_prog, act); + trace_xdp_exception((*pskb)->dev, xdp_prog, act); fallthrough; case XDP_DROP: do_drop: - kfree_skb(skb); + kfree_skb(*pskb); break; } diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 9e5eb47b4025..bdb94749f05d 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -895,6 +895,97 @@ static bool is_pp_page(struct page *page) return (page->pp_magic & ~0x3UL) == PP_SIGNATURE; } +static int skb_pp_cow_data(struct page_pool *pool, struct sk_buff **pskb, + unsigned int headroom) +{ +#if IS_ENABLED(CONFIG_PAGE_POOL) + u32 size, truesize, len, max_head_size, off; + struct sk_buff *skb = *pskb, *nskb; + int err, i, head_off; + void *data; + + /* XDP does not support fraglist so we need to linearize + * the skb. 
+ */ + if (skb_has_frag_list(skb)) + return -EOPNOTSUPP; + + max_head_size = SKB_WITH_OVERHEAD(PAGE_SIZE - headroom); + if (skb->len > max_head_size + MAX_SKB_FRAGS * PAGE_SIZE) + return -ENOMEM; + + size = min_t(u32, skb->len, max_head_size); + truesize = SKB_HEAD_ALIGN(size) + headroom; + data = page_pool_dev_alloc_va(pool, &truesize); + if (!data) + return -ENOMEM; + + nskb = napi_build_skb(data, truesize); + if (!nskb) { + page_pool_free_va(pool, data, true); + return -ENOMEM; + } + + skb_reserve(nskb, headroom); + skb_copy_header(nskb, skb); + skb_mark_for_recycle(nskb); + + err = skb_copy_bits(skb, 0, nskb->data, size); + if (err) { + consume_skb(nskb); + return err; + } + skb_put(nskb, size); + + head_off = skb_headroom(nskb) - skb_headroom(skb); + skb_headers_offset_update(nskb, head_off); + + off = size; + len = skb->len - off; + for (i = 0; i < MAX_SKB_FRAGS && off < skb->len; i++) { + struct page *page; + u32 page_off; + + size = min_t(u32, len, PAGE_SIZE); + truesize = size; + + page = page_pool_dev_alloc(pool, &page_off, &truesize); + if (!data) { + consume_skb(nskb); + return -ENOMEM; + } + + skb_add_rx_frag(nskb, i, page, page_off, size, truesize); + err = skb_copy_bits(skb, off, page_address(page) + page_off, + size); + if (err) { + consume_skb(nskb); + return err; + } + + len -= size; + off += size; + } + + consume_skb(skb); + *pskb = nskb; + + return 0; +#else + return -EOPNOTSUPP; +#endif +} + +int skb_cow_data_for_xdp(struct page_pool *pool, struct sk_buff **pskb, + struct bpf_prog *prog) +{ + if (!prog->aux->xdp_has_frags) + return -EINVAL; + + return skb_pp_cow_data(pool, pskb, XDP_PACKET_HEADROOM); +} +EXPORT_SYMBOL(skb_cow_data_for_xdp); + #if IS_ENABLED(CONFIG_PAGE_POOL) bool napi_pp_put_page(struct page *page, bool napi_safe) { -- cgit v1.2.3 From 1c07dbb0cccfe85060b6eb089db3d6bfeb6aaf31 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 13 Feb 2024 06:32:33 +0000 Subject: net: annotate data-races around dev->name_assign_type name_assign_type_show() runs locklessly, we should annotate accesses to dev->name_assign_type. Alternative would be to grab devnet_rename_sem semaphore from name_assign_type_show(), but this would not bring more accuracy. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/dev.c | 6 +++--- net/core/net-sysfs.c | 4 ++-- 2 files changed, 5 insertions(+), 5 deletions(-) (limited to 'net/core/dev.c') diff --git a/net/core/dev.c b/net/core/dev.c index 2d02ca8a3da5..720bd6838212 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1228,13 +1228,13 @@ int dev_change_name(struct net_device *dev, const char *newname) dev->flags & IFF_UP ? 
" (while UP)" : ""); old_assign_type = dev->name_assign_type; - dev->name_assign_type = NET_NAME_RENAMED; + WRITE_ONCE(dev->name_assign_type, NET_NAME_RENAMED); rollback: ret = device_rename(&dev->dev, dev->name); if (ret) { memcpy(dev->name, oldname, IFNAMSIZ); - dev->name_assign_type = old_assign_type; + WRITE_ONCE(dev->name_assign_type, old_assign_type); up_write(&devnet_rename_sem); return ret; } @@ -1263,7 +1263,7 @@ rollback: down_write(&devnet_rename_sem); memcpy(dev->name, oldname, IFNAMSIZ); memcpy(oldname, newname, IFNAMSIZ); - dev->name_assign_type = old_assign_type; + WRITE_ONCE(dev->name_assign_type, old_assign_type); old_assign_type = NET_NAME_RENAMED; goto rollback; } else { diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c index a09d507c5b03..f4c2b8267495 100644 --- a/net/core/net-sysfs.c +++ b/net/core/net-sysfs.c @@ -125,7 +125,7 @@ static DEVICE_ATTR_RO(iflink); static ssize_t format_name_assign_type(const struct net_device *dev, char *buf) { - return sysfs_emit(buf, fmt_dec, dev->name_assign_type); + return sysfs_emit(buf, fmt_dec, READ_ONCE(dev->name_assign_type)); } static ssize_t name_assign_type_show(struct device *dev, @@ -135,7 +135,7 @@ static ssize_t name_assign_type_show(struct device *dev, struct net_device *ndev = to_net_dev(dev); ssize_t ret = -EINVAL; - if (ndev->name_assign_type != NET_NAME_UNKNOWN) + if (READ_ONCE(ndev->name_assign_type) != NET_NAME_UNKNOWN) ret = netdev_show(dev, attr, buf, format_name_assign_type); return ret; -- cgit v1.2.3 From 4d42b37def70327b2bb19f823d42289aed2cd7c7 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 13 Feb 2024 06:32:36 +0000 Subject: net: convert dev->reg_state to u8 Prepares things so that dev->reg_state reads can be lockless, by adding WRITE_ONCE() on write side. READ_ONCE()/WRITE_ONCE() do not support bitfields. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/linux/netdevice.h | 23 ++++++++++++++--------- net/core/dev.c | 8 ++++---- 2 files changed, 18 insertions(+), 13 deletions(-) (limited to 'net/core/dev.c') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index a3f9c95da51e..631124655107 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1815,6 +1815,15 @@ enum netdev_stat_type { NETDEV_PCPU_STAT_DSTATS, /* struct pcpu_dstats */ }; +enum netdev_reg_state { + NETREG_UNINITIALIZED = 0, + NETREG_REGISTERED, /* completed register_netdevice */ + NETREG_UNREGISTERING, /* called unregister_netdevice */ + NETREG_UNREGISTERED, /* completed unregister todo */ + NETREG_RELEASED, /* called free_netdev */ + NETREG_DUMMY, /* dummy device for NAPI poll */ +}; + /** * struct net_device - The DEVICE structure. 
* @@ -2372,13 +2381,7 @@ struct net_device { struct list_head link_watch_list; - enum { NETREG_UNINITIALIZED=0, - NETREG_REGISTERED, /* completed register_netdevice */ - NETREG_UNREGISTERING, /* called unregister_netdevice */ - NETREG_UNREGISTERED, /* completed unregister todo */ - NETREG_RELEASED, /* called free_netdev */ - NETREG_DUMMY, /* dummy device for NAPI poll */ - } reg_state:8; + u8 reg_state; bool dismantle; @@ -5254,7 +5257,9 @@ static inline const char *netdev_name(const struct net_device *dev) static inline const char *netdev_reg_state(const struct net_device *dev) { - switch (dev->reg_state) { + u8 reg_state = READ_ONCE(dev->reg_state); + + switch (reg_state) { case NETREG_UNINITIALIZED: return " (uninitialized)"; case NETREG_REGISTERED: return ""; case NETREG_UNREGISTERING: return " (unregistering)"; @@ -5263,7 +5268,7 @@ static inline const char *netdev_reg_state(const struct net_device *dev) case NETREG_DUMMY: return " (dummy)"; } - WARN_ONCE(1, "%s: unknown reg_state %d\n", dev->name, dev->reg_state); + WARN_ONCE(1, "%s: unknown reg_state %d\n", dev->name, reg_state); return " (unknown)"; } diff --git a/net/core/dev.c b/net/core/dev.c index 720bd6838212..9c95cae9d6ab 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -10339,7 +10339,7 @@ int register_netdevice(struct net_device *dev) ret = netdev_register_kobject(dev); write_lock(&dev_base_lock); - dev->reg_state = ret ? NETREG_UNREGISTERED : NETREG_REGISTERED; + WRITE_ONCE(dev->reg_state, ret ? NETREG_UNREGISTERED : NETREG_REGISTERED); write_unlock(&dev_base_lock); if (ret) goto err_uninit_notify; @@ -10630,7 +10630,7 @@ void netdev_run_todo(void) } write_lock(&dev_base_lock); - dev->reg_state = NETREG_UNREGISTERED; + WRITE_ONCE(dev->reg_state, NETREG_UNREGISTERED); write_unlock(&dev_base_lock); linkwatch_sync_dev(dev); } @@ -11050,7 +11050,7 @@ void free_netdev(struct net_device *dev) } BUG_ON(dev->reg_state != NETREG_UNREGISTERED); - dev->reg_state = NETREG_RELEASED; + WRITE_ONCE(dev->reg_state, NETREG_RELEASED); /* will free via device release */ put_device(&dev->dev); @@ -11140,7 +11140,7 @@ void unregister_netdevice_many_notify(struct list_head *head, /* And unlink it from device chain. */ write_lock(&dev_base_lock); unlist_netdevice(dev, false); - dev->reg_state = NETREG_UNREGISTERING; + WRITE_ONCE(dev->reg_state, NETREG_UNREGISTERING); write_unlock(&dev_base_lock); } flush_all_backlogs(); -- cgit v1.2.3 From c7d52737e7ebd31cc5fef46380d94b58becf9479 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 13 Feb 2024 06:32:38 +0000 Subject: net-sysfs: use dev_addr_sem to remove races in address_show() Using dev_base_lock is not preventing from reading garbage. Use dev_addr_sem instead. v4: place dev_addr_sem extern in net/core/dev.h (Jakub Kicinski) Link: https://lore.kernel.org/netdev/20240212175845.10f6680a@kernel.org/ Signed-off-by: Eric Dumazet Signed-off-by: David S. 
Miller --- net/core/dev.c | 2 +- net/core/dev.h | 3 +++ net/core/net-sysfs.c | 10 +++++++--- 3 files changed, 11 insertions(+), 4 deletions(-) (limited to 'net/core/dev.c') diff --git a/net/core/dev.c b/net/core/dev.c index 9c95cae9d6ab..26f93446b743 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -8993,7 +8993,7 @@ int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa, } EXPORT_SYMBOL(dev_set_mac_address); -static DECLARE_RWSEM(dev_addr_sem); +DECLARE_RWSEM(dev_addr_sem); int dev_set_mac_address_user(struct net_device *dev, struct sockaddr *sa, struct netlink_ext_ack *extack) diff --git a/net/core/dev.h b/net/core/dev.h index a43dfe3de50e..45892267848d 100644 --- a/net/core/dev.h +++ b/net/core/dev.h @@ -3,6 +3,7 @@ #define _NET_CORE_DEV_H #include +#include struct net; struct net_device; @@ -46,6 +47,8 @@ extern int weight_p; extern int dev_weight_rx_bias; extern int dev_weight_tx_bias; +extern struct rw_semaphore dev_addr_sem; + /* rtnl helpers */ extern struct list_head net_todo_list; void netdev_run_todo(void); diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c index 678e4be69082..23ef2df549c3 100644 --- a/net/core/net-sysfs.c +++ b/net/core/net-sysfs.c @@ -142,17 +142,21 @@ static ssize_t name_assign_type_show(struct device *dev, } static DEVICE_ATTR_RO(name_assign_type); -/* use same locking rules as GIFHWADDR ioctl's */ +/* use same locking rules as GIFHWADDR ioctl's (dev_get_mac_address()) */ static ssize_t address_show(struct device *dev, struct device_attribute *attr, char *buf) { struct net_device *ndev = to_net_dev(dev); ssize_t ret = -EINVAL; - read_lock(&dev_base_lock); + down_read(&dev_addr_sem); + + rcu_read_lock(); if (dev_isalive(ndev)) ret = sysfs_format_mac(buf, ndev->dev_addr, ndev->addr_len); - read_unlock(&dev_base_lock); + rcu_read_unlock(); + + up_read(&dev_addr_sem); return ret; } static DEVICE_ATTR_RO(address); -- cgit v1.2.3 From e51b962438741f5482c82fb225c1d59136f0fd87 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 13 Feb 2024 06:32:44 +0000 Subject: net: remove dev_base_lock from register_netdevice() and friends. RTNL already protects writes to dev->reg_state, we no longer need to hold dev_base_lock to protect the readers. unlist_netdevice() second argument can be removed. Signed-off-by: Eric Dumazet Signed-off-by: David S. 
Miller --- net/core/dev.c | 20 +++++++------------- 1 file changed, 7 insertions(+), 13 deletions(-) (limited to 'net/core/dev.c') diff --git a/net/core/dev.c b/net/core/dev.c index 26f93446b743..02cf9fd68da6 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -414,7 +414,7 @@ static void list_netdevice(struct net_device *dev) /* Device list removal * caller must respect a RCU grace period before freeing/reusing dev */ -static void unlist_netdevice(struct net_device *dev, bool lock) +static void unlist_netdevice(struct net_device *dev) { struct netdev_name_node *name_node; struct net *net = dev_net(dev); @@ -427,13 +427,11 @@ static void unlist_netdevice(struct net_device *dev, bool lock) netdev_name_node_del(name_node); /* Unlink dev from the device chain */ - if (lock) - write_lock(&dev_base_lock); + write_lock(&dev_base_lock); list_del_rcu(&dev->dev_list); netdev_name_node_del(dev->name_node); hlist_del_rcu(&dev->index_hlist); - if (lock) - write_unlock(&dev_base_lock); + write_unlock(&dev_base_lock); dev_base_seq_inc(dev_net(dev)); } @@ -10338,9 +10336,9 @@ int register_netdevice(struct net_device *dev) goto err_ifindex_release; ret = netdev_register_kobject(dev); - write_lock(&dev_base_lock); + WRITE_ONCE(dev->reg_state, ret ? NETREG_UNREGISTERED : NETREG_REGISTERED); - write_unlock(&dev_base_lock); + if (ret) goto err_uninit_notify; @@ -10629,9 +10627,7 @@ void netdev_run_todo(void) continue; } - write_lock(&dev_base_lock); WRITE_ONCE(dev->reg_state, NETREG_UNREGISTERED); - write_unlock(&dev_base_lock); linkwatch_sync_dev(dev); } @@ -11138,10 +11134,8 @@ void unregister_netdevice_many_notify(struct list_head *head, list_for_each_entry(dev, head, unreg_list) { /* And unlink it from device chain. */ - write_lock(&dev_base_lock); - unlist_netdevice(dev, false); + unlist_netdevice(dev); WRITE_ONCE(dev->reg_state, NETREG_UNREGISTERING); - write_unlock(&dev_base_lock); } flush_all_backlogs(); @@ -11323,7 +11317,7 @@ int __dev_change_net_namespace(struct net_device *dev, struct net *net, dev_close(dev); /* And unlink it from device chain */ - unlist_netdevice(dev, true); + unlist_netdevice(dev); synchronize_net(); -- cgit v1.2.3 From 1b3ef46cb7f2618cc0b507393220a69810f6da12 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 13 Feb 2024 06:32:45 +0000 Subject: net: remove dev_base_lock dev_base_lock is not needed anymore, all remaining users also hold RTNL. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/linux/netdevice.h | 2 -- net/core/dev.c | 39 ++++----------------------------------- 2 files changed, 4 insertions(+), 37 deletions(-) (limited to 'net/core/dev.c') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 697370706a82..c541550b0e6e 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -3077,8 +3077,6 @@ int call_netdevice_notifiers(unsigned long val, struct net_device *dev); int call_netdevice_notifiers_info(unsigned long val, struct netdev_notifier_info *info); -extern rwlock_t dev_base_lock; /* Device list lock */ - #define for_each_netdev(net, d) \ list_for_each_entry(d, &(net)->dev_base_head, dev_list) #define for_each_netdev_reverse(net, d) \ diff --git a/net/core/dev.c b/net/core/dev.c index 02cf9fd68da6..d8dd293a7a27 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -168,28 +168,6 @@ static int call_netdevice_notifiers_extack(unsigned long val, struct net_device *dev, struct netlink_ext_ack *extack); -/* - * The @dev_base_head list is protected by @dev_base_lock and the rtnl - * semaphore. 
- * - * Pure readers hold dev_base_lock for reading, or rcu_read_lock() - * - * Writers must hold the rtnl semaphore while they loop through the - * dev_base_head list, and hold dev_base_lock for writing when they do the - * actual updates. This allows pure readers to access the list even - * while a writer is preparing to update it. - * - * To put it another way, dev_base_lock is held for writing only to - * protect against pure readers; the rtnl semaphore provides the - * protection against other writers. - * - * See, for example usages, register_netdevice() and - * unregister_netdevice(), which must be called with the rtnl - * semaphore held. - */ -DEFINE_RWLOCK(dev_base_lock); -EXPORT_SYMBOL(dev_base_lock); - static DEFINE_MUTEX(ifalias_mutex); /* protects napi_hash addition/deletion and napi_gen_id */ @@ -395,12 +373,10 @@ static void list_netdevice(struct net_device *dev) ASSERT_RTNL(); - write_lock(&dev_base_lock); list_add_tail_rcu(&dev->dev_list, &net->dev_base_head); netdev_name_node_add(net, dev->name_node); hlist_add_head_rcu(&dev->index_hlist, dev_index_hash(net, dev->ifindex)); - write_unlock(&dev_base_lock); netdev_for_each_altname(dev, name_node) netdev_name_node_add(net, name_node); @@ -427,11 +403,9 @@ static void unlist_netdevice(struct net_device *dev) netdev_name_node_del(name_node); /* Unlink dev from the device chain */ - write_lock(&dev_base_lock); list_del_rcu(&dev->dev_list); netdev_name_node_del(dev->name_node); hlist_del_rcu(&dev->index_hlist); - write_unlock(&dev_base_lock); dev_base_seq_inc(dev_net(dev)); } @@ -752,9 +726,9 @@ EXPORT_SYMBOL_GPL(dev_fill_forward_path); * @net: the applicable net namespace * @name: name to find * - * Find an interface by name. Must be called under RTNL semaphore - * or @dev_base_lock. If the name is found a pointer to the device - * is returned. If the name is not found then %NULL is returned. The + * Find an interface by name. Must be called under RTNL semaphore. + * If the name is found a pointer to the device is returned. + * If the name is not found then %NULL is returned. The * reference counters are not incremented so the caller must be * careful with locks. */ @@ -835,8 +809,7 @@ EXPORT_SYMBOL(netdev_get_by_name); * Search for an interface by index. Returns %NULL if the device * is not found or a pointer to the device. The device has not * had its reference counter increased so the caller must be careful - * about locking. The caller must hold either the RTNL semaphore - * or @dev_base_lock. + * about locking. The caller must hold the RTNL semaphore. */ struct net_device *__dev_get_by_index(struct net *net, int ifindex) @@ -1241,15 +1214,11 @@ rollback: netdev_adjacent_rename_links(dev, oldname); - write_lock(&dev_base_lock); netdev_name_node_del(dev->name_node); - write_unlock(&dev_base_lock); synchronize_net(); - write_lock(&dev_base_lock); netdev_name_node_add(net, dev->name_node); - write_unlock(&dev_base_lock); ret = call_netdevice_notifiers(NETDEV_CHANGENAME, dev); ret = notifier_to_errno(ret); -- cgit v1.2.3 From f853fa5c54e7a0364a52125074dedeaf2c7ddace Mon Sep 17 00:00:00 2001 From: Lorenzo Bianconi Date: Fri, 16 Feb 2024 10:25:43 +0100 Subject: net: page_pool: fix recycle stats for system page_pool allocator Use global percpu page_pool_recycle_stats counter for system page_pool allocator instead of allocating a separate percpu variable for each (also percpu) page pool instance. 
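The fix reduces to a flag-gated ownership rule: a system pool points at one statically defined stats object, any other pool allocates its own, and the teardown path repeats the same test so the shared object is never freed. A condensed sketch of that rule, with plain (non-percpu) types for illustration:

#include <stdlib.h>

#define F_SYSTEM_POOL	0x4	/* mirrors the new PP_FLAG_SYSTEM_POOL bit */

struct recycle_stats {
	unsigned long cached;
	unsigned long ring;
};

/* Single object shared by every system pool; never freed. */
static struct recycle_stats system_stats;

struct pool {
	unsigned int flags;
	struct recycle_stats *stats;
};

static int pool_stats_init(struct pool *p)
{
	if (p->flags & F_SYSTEM_POOL) {
		p->stats = &system_stats;	/* borrow, do not allocate */
		return 0;
	}
	p->stats = calloc(1, sizeof(*p->stats));
	return p->stats ? 0 : -1;
}

static void pool_stats_uninit(struct pool *p)
{
	/* Mirror the init-time test: never free the shared object. */
	if (!(p->flags & F_SYSTEM_POOL))
		free(p->stats);
	p->stats = NULL;
}

Without the shared object, each of the per-CPU system pools would allocate its own percpu counters, one set per CPU per pool instance.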
Reviewed-by: Toke Hoiland-Jorgensen Signed-off-by: Lorenzo Bianconi Reviewed-by: Alexander Lobakin Link: https://lore.kernel.org/r/87f572425e98faea3da45f76c3c68815c01a20ee.1708075412.git.lorenzo@kernel.org Signed-off-by: Jakub Kicinski --- include/net/page_pool/types.h | 5 +++-- net/core/dev.c | 1 + net/core/page_pool.c | 22 +++++++++++++++++----- 3 files changed, 21 insertions(+), 7 deletions(-) (limited to 'net/core/dev.c') diff --git a/include/net/page_pool/types.h b/include/net/page_pool/types.h index 3590fbe6e3f1..5e43a08d3231 100644 --- a/include/net/page_pool/types.h +++ b/include/net/page_pool/types.h @@ -18,8 +18,9 @@ * Please note DMA-sync-for-CPU is still * device driver responsibility */ -#define PP_FLAG_ALL (PP_FLAG_DMA_MAP |\ - PP_FLAG_DMA_SYNC_DEV) +#define PP_FLAG_SYSTEM_POOL BIT(2) /* Global system page_pool */ +#define PP_FLAG_ALL (PP_FLAG_DMA_MAP | PP_FLAG_DMA_SYNC_DEV | \ + PP_FLAG_SYSTEM_POOL) /* * Fast allocation side cache array/stack diff --git a/net/core/dev.c b/net/core/dev.c index cc9c2eda65ac..c588808be77f 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -11738,6 +11738,7 @@ static int net_page_pool_create(int cpuid) #if IS_ENABLED(CONFIG_PAGE_POOL) struct page_pool_params page_pool_params = { .pool_size = SYSTEM_PERCPU_PAGE_POOL_SIZE, + .flags = PP_FLAG_SYSTEM_POOL, .nid = NUMA_NO_NODE, }; struct page_pool *pp_ptr; diff --git a/net/core/page_pool.c b/net/core/page_pool.c index e8b9399d8e32..d706fe5548df 100644 --- a/net/core/page_pool.c +++ b/net/core/page_pool.c @@ -31,6 +31,8 @@ #define BIAS_MAX (LONG_MAX >> 1) #ifdef CONFIG_PAGE_POOL_STATS +static DEFINE_PER_CPU(struct page_pool_recycle_stats, pp_system_recycle_stats); + /* alloc_stat_inc is intended to be used in softirq context */ #define alloc_stat_inc(pool, __stat) (pool->alloc_stats.__stat++) /* recycle_stat_inc is safe to use when preemption is possible. */ @@ -220,14 +222,23 @@ static int page_pool_init(struct page_pool *pool, pool->has_init_callback = !!pool->slow.init_callback; #ifdef CONFIG_PAGE_POOL_STATS - pool->recycle_stats = alloc_percpu(struct page_pool_recycle_stats); - if (!pool->recycle_stats) - return -ENOMEM; + if (!(pool->p.flags & PP_FLAG_SYSTEM_POOL)) { + pool->recycle_stats = alloc_percpu(struct page_pool_recycle_stats); + if (!pool->recycle_stats) + return -ENOMEM; + } else { + /* For system page pool instance we use a singular stats object + * instead of allocating a separate percpu variable for each + * (also percpu) page pool instance. + */ + pool->recycle_stats = &pp_system_recycle_stats; + } #endif if (ptr_ring_init(&pool->ring, ring_qsize, GFP_KERNEL) < 0) { #ifdef CONFIG_PAGE_POOL_STATS - free_percpu(pool->recycle_stats); + if (!(pool->p.flags & PP_FLAG_SYSTEM_POOL)) + free_percpu(pool->recycle_stats); #endif return -ENOMEM; } @@ -251,7 +262,8 @@ static void page_pool_uninit(struct page_pool *pool) put_device(pool->p.dev); #ifdef CONFIG_PAGE_POOL_STATS - free_percpu(pool->recycle_stats); + if (!(pool->p.flags & PP_FLAG_SYSTEM_POOL)) + free_percpu(pool->recycle_stats); #endif } -- cgit v1.2.3 From e353ea9ce471331c13edffd5977eadd602d1bb80 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 22 Feb 2024 10:50:08 +0000 Subject: rtnetlink: prepare nla_put_iflink() to run under RCU We want to be able to run rtnl_fill_ifinfo() under RCU protection instead of RTNL in the future. This patch prepares dev_get_iflink() and nla_put_iflink() to run either with RTNL or RCU held. Signed-off-by: Eric Dumazet Signed-off-by: David S. 
Miller --- drivers/infiniband/ulp/ipoib/ipoib_main.c | 4 ++-- drivers/net/can/vxcan.c | 2 +- drivers/net/ethernet/qualcomm/rmnet/rmnet_vnd.c | 2 +- drivers/net/ipvlan/ipvlan_main.c | 2 +- drivers/net/macsec.c | 2 +- drivers/net/macvlan.c | 2 +- drivers/net/netkit.c | 2 +- drivers/net/veth.c | 2 +- drivers/net/wireless/virtual/virt_wifi.c | 2 +- net/8021q/vlan_dev.c | 4 ++-- net/core/dev.c | 2 +- net/core/rtnetlink.c | 6 +++--- net/dsa/user.c | 2 +- net/ieee802154/6lowpan/core.c | 2 +- net/ipv6/ip6_tunnel.c | 2 +- net/xfrm/xfrm_interface_core.c | 2 +- 16 files changed, 20 insertions(+), 20 deletions(-) (limited to 'net/core/dev.c') diff --git a/drivers/infiniband/ulp/ipoib/ipoib_main.c b/drivers/infiniband/ulp/ipoib/ipoib_main.c index 7a5be705d718..6f2a688fccbf 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_main.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_main.c @@ -1272,10 +1272,10 @@ static int ipoib_get_iflink(const struct net_device *dev) /* parent interface */ if (!test_bit(IPOIB_FLAG_SUBINTERFACE, &priv->flags)) - return dev->ifindex; + return READ_ONCE(dev->ifindex); /* child/vlan interface */ - return priv->parent->ifindex; + return READ_ONCE(priv->parent->ifindex); } static u32 ipoib_addr_hash(struct ipoib_neigh_hash *htbl, u8 *daddr) diff --git a/drivers/net/can/vxcan.c b/drivers/net/can/vxcan.c index 98c669ad5141..f7fabba707ea 100644 --- a/drivers/net/can/vxcan.c +++ b/drivers/net/can/vxcan.c @@ -119,7 +119,7 @@ static int vxcan_get_iflink(const struct net_device *dev) rcu_read_lock(); peer = rcu_dereference(priv->peer); - iflink = peer ? peer->ifindex : 0; + iflink = peer ? READ_ONCE(peer->ifindex) : 0; rcu_read_unlock(); return iflink; diff --git a/drivers/net/ethernet/qualcomm/rmnet/rmnet_vnd.c b/drivers/net/ethernet/qualcomm/rmnet/rmnet_vnd.c index 046b5f7d8e7c..9d2a9562c96f 100644 --- a/drivers/net/ethernet/qualcomm/rmnet/rmnet_vnd.c +++ b/drivers/net/ethernet/qualcomm/rmnet/rmnet_vnd.c @@ -98,7 +98,7 @@ static int rmnet_vnd_get_iflink(const struct net_device *dev) { struct rmnet_priv *priv = netdev_priv(dev); - return priv->real_dev->ifindex; + return READ_ONCE(priv->real_dev->ifindex); } static int rmnet_vnd_init(struct net_device *dev) diff --git a/drivers/net/ipvlan/ipvlan_main.c b/drivers/net/ipvlan/ipvlan_main.c index df7c43a109e1..5920f7e63352 100644 --- a/drivers/net/ipvlan/ipvlan_main.c +++ b/drivers/net/ipvlan/ipvlan_main.c @@ -349,7 +349,7 @@ static int ipvlan_get_iflink(const struct net_device *dev) { struct ipvl_dev *ipvlan = netdev_priv(dev); - return ipvlan->phy_dev->ifindex; + return READ_ONCE(ipvlan->phy_dev->ifindex); } static const struct net_device_ops ipvlan_netdev_ops = { diff --git a/drivers/net/macsec.c b/drivers/net/macsec.c index 7f5426285c61..4b5513c9c2be 100644 --- a/drivers/net/macsec.c +++ b/drivers/net/macsec.c @@ -3753,7 +3753,7 @@ static void macsec_get_stats64(struct net_device *dev, static int macsec_get_iflink(const struct net_device *dev) { - return macsec_priv(dev)->real_dev->ifindex; + return READ_ONCE(macsec_priv(dev)->real_dev->ifindex); } static const struct net_device_ops macsec_netdev_ops = { diff --git a/drivers/net/macvlan.c b/drivers/net/macvlan.c index a3cc665757e8..0cec2783a3e7 100644 --- a/drivers/net/macvlan.c +++ b/drivers/net/macvlan.c @@ -1158,7 +1158,7 @@ static int macvlan_dev_get_iflink(const struct net_device *dev) { struct macvlan_dev *vlan = netdev_priv(dev); - return vlan->lowerdev->ifindex; + return READ_ONCE(vlan->lowerdev->ifindex); } static const struct ethtool_ops macvlan_ethtool_ops = { diff --git 
a/drivers/net/netkit.c b/drivers/net/netkit.c index 39171380ccf2..a4d2e76a8d58 100644 --- a/drivers/net/netkit.c +++ b/drivers/net/netkit.c @@ -145,7 +145,7 @@ static int netkit_get_iflink(const struct net_device *dev) rcu_read_lock(); peer = rcu_dereference(nk->peer); if (peer) - iflink = peer->ifindex; + iflink = READ_ONCE(peer->ifindex); rcu_read_unlock(); return iflink; } diff --git a/drivers/net/veth.c b/drivers/net/veth.c index 500b9dfccd08..dd5aa8ab65a8 100644 --- a/drivers/net/veth.c +++ b/drivers/net/veth.c @@ -1461,7 +1461,7 @@ static int veth_get_iflink(const struct net_device *dev) rcu_read_lock(); peer = rcu_dereference(priv->peer); - iflink = peer ? peer->ifindex : 0; + iflink = peer ? READ_ONCE(peer->ifindex) : 0; rcu_read_unlock(); return iflink; diff --git a/drivers/net/wireless/virtual/virt_wifi.c b/drivers/net/wireless/virtual/virt_wifi.c index ba14d83353a4..6a84ec58d618 100644 --- a/drivers/net/wireless/virtual/virt_wifi.c +++ b/drivers/net/wireless/virtual/virt_wifi.c @@ -453,7 +453,7 @@ static int virt_wifi_net_device_get_iflink(const struct net_device *dev) { struct virt_wifi_netdev_priv *priv = netdev_priv(dev); - return priv->lowerdev->ifindex; + return READ_ONCE(priv->lowerdev->ifindex); } static const struct net_device_ops virt_wifi_ops = { diff --git a/net/8021q/vlan_dev.c b/net/8021q/vlan_dev.c index df5552518251..39876eff51d2 100644 --- a/net/8021q/vlan_dev.c +++ b/net/8021q/vlan_dev.c @@ -762,9 +762,9 @@ static void vlan_dev_netpoll_cleanup(struct net_device *dev) static int vlan_dev_get_iflink(const struct net_device *dev) { - struct net_device *real_dev = vlan_dev_priv(dev)->real_dev; + const struct net_device *real_dev = vlan_dev_priv(dev)->real_dev; - return real_dev->ifindex; + return READ_ONCE(real_dev->ifindex); } static int vlan_dev_fill_forward_path(struct net_device_path_ctx *ctx, diff --git a/net/core/dev.c b/net/core/dev.c index c588808be77f..0628d8ff1ed9 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -641,7 +641,7 @@ int dev_get_iflink(const struct net_device *dev) if (dev->netdev_ops && dev->netdev_ops->ndo_get_iflink) return dev->netdev_ops->ndo_get_iflink(dev); - return dev->ifindex; + return READ_ONCE(dev->ifindex); } EXPORT_SYMBOL(dev_get_iflink); diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index c54dbe05c4c5..060543fe7919 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -1611,10 +1611,10 @@ static int put_master_ifindex(struct sk_buff *skb, struct net_device *dev) static int nla_put_iflink(struct sk_buff *skb, const struct net_device *dev, bool force) { - int ifindex = dev_get_iflink(dev); + int iflink = dev_get_iflink(dev); - if (force || dev->ifindex != ifindex) - return nla_put_u32(skb, IFLA_LINK, ifindex); + if (force || READ_ONCE(dev->ifindex) != iflink) + return nla_put_u32(skb, IFLA_LINK, iflink); return 0; } diff --git a/net/dsa/user.c b/net/dsa/user.c index 4d53c76a9840..9c42a6edcdc8 100644 --- a/net/dsa/user.c +++ b/net/dsa/user.c @@ -352,7 +352,7 @@ void dsa_user_mii_bus_init(struct dsa_switch *ds) /* user device handling ****************************************************/ static int dsa_user_get_iflink(const struct net_device *dev) { - return dsa_user_to_conduit(dev)->ifindex; + return READ_ONCE(dsa_user_to_conduit(dev)->ifindex); } static int dsa_user_open(struct net_device *dev) diff --git a/net/ieee802154/6lowpan/core.c b/net/ieee802154/6lowpan/core.c index e643f52663f9..77b4e92027c5 100644 --- a/net/ieee802154/6lowpan/core.c +++ b/net/ieee802154/6lowpan/core.c @@ -93,7 +93,7 @@ static int 
lowpan_neigh_construct(struct net_device *dev, struct neighbour *n) static int lowpan_get_iflink(const struct net_device *dev) { - return lowpan_802154_dev(dev)->wdev->ifindex; + return READ_ONCE(lowpan_802154_dev(dev)->wdev->ifindex); } static const struct net_device_ops lowpan_netdev_ops = { diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c index 44406c28445d..5fd07581efaf 100644 --- a/net/ipv6/ip6_tunnel.c +++ b/net/ipv6/ip6_tunnel.c @@ -1756,7 +1756,7 @@ int ip6_tnl_get_iflink(const struct net_device *dev) { struct ip6_tnl *t = netdev_priv(dev); - return t->parms.link; + return READ_ONCE(t->parms.link); } EXPORT_SYMBOL(ip6_tnl_get_iflink); diff --git a/net/xfrm/xfrm_interface_core.c b/net/xfrm/xfrm_interface_core.c index dafefef3cf51..717855b9acf1 100644 --- a/net/xfrm/xfrm_interface_core.c +++ b/net/xfrm/xfrm_interface_core.c @@ -727,7 +727,7 @@ static int xfrmi_get_iflink(const struct net_device *dev) { struct xfrm_if *xi = netdev_priv(dev); - return xi->p.link; + return READ_ONCE(xi->p.link); } static const struct net_device_ops xfrmi_netdev_ops = { -- cgit v1.2.3 From 8afc7a78d55de726b2747d7775c54def79509ec5 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 22 Feb 2024 10:50:10 +0000 Subject: ipv6: prepare inet6_fill_ifinfo() for RCU protection We want to use RCU protection instead of RTNL for inet6_fill_ifinfo(). Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/linux/netdevice.h | 6 ++++-- net/core/dev.c | 4 ++-- net/ipv6/addrconf.c | 11 +++++++---- 3 files changed, 13 insertions(+), 8 deletions(-) (limited to 'net/core/dev.c') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index f07c8374f29c..09023e44db4e 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -4354,8 +4354,10 @@ static inline bool netif_testing(const struct net_device *dev) */ static inline bool netif_oper_up(const struct net_device *dev) { - return (dev->operstate == IF_OPER_UP || - dev->operstate == IF_OPER_UNKNOWN /* backward compat */); + unsigned int operstate = READ_ONCE(dev->operstate); + + return operstate == IF_OPER_UP || + operstate == IF_OPER_UNKNOWN /* backward compat */; } /** diff --git a/net/core/dev.c b/net/core/dev.c index 0628d8ff1ed9..275fd5259a4a 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -8632,12 +8632,12 @@ unsigned int dev_get_flags(const struct net_device *dev) { unsigned int flags; - flags = (dev->flags & ~(IFF_PROMISC | + flags = (READ_ONCE(dev->flags) & ~(IFF_PROMISC | IFF_ALLMULTI | IFF_RUNNING | IFF_LOWER_UP | IFF_DORMANT)) | - (dev->gflags & (IFF_PROMISC | + (READ_ONCE(dev->gflags) & (IFF_PROMISC | IFF_ALLMULTI)); if (netif_running(dev)) { diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index a56dad307fe3..daa81556d118 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -6062,6 +6062,7 @@ static int inet6_fill_ifinfo(struct sk_buff *skb, struct inet6_dev *idev, struct net_device *dev = idev->dev; struct ifinfomsg *hdr; struct nlmsghdr *nlh; + int ifindex, iflink; void *protoinfo; nlh = nlmsg_put(skb, portid, seq, event, sizeof(*hdr), flags); @@ -6072,16 +6073,18 @@ static int inet6_fill_ifinfo(struct sk_buff *skb, struct inet6_dev *idev, hdr->ifi_family = AF_INET6; hdr->__ifi_pad = 0; hdr->ifi_type = dev->type; - hdr->ifi_index = dev->ifindex; + ifindex = READ_ONCE(dev->ifindex); + hdr->ifi_index = ifindex; hdr->ifi_flags = dev_get_flags(dev); hdr->ifi_change = 0; + iflink = dev_get_iflink(dev); if (nla_put_string(skb, IFLA_IFNAME, dev->name) || (dev->addr_len && nla_put(skb, 
IFLA_ADDRESS, dev->addr_len, dev->dev_addr)) || - nla_put_u32(skb, IFLA_MTU, dev->mtu) || - (dev->ifindex != dev_get_iflink(dev) && - nla_put_u32(skb, IFLA_LINK, dev_get_iflink(dev))) || + nla_put_u32(skb, IFLA_MTU, READ_ONCE(dev->mtu)) || + (ifindex != iflink && + nla_put_u32(skb, IFLA_LINK, iflink)) || nla_put_u8(skb, IFLA_OPERSTATE, netif_running(dev) ? READ_ONCE(dev->operstate) : IF_OPER_DOWN)) goto nla_put_failure; -- cgit v1.2.3 From 0d60d8df6f493bb46bf5db40d39dd60a1bafdd4e Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 23 Feb 2024 12:32:08 +0000 Subject: dpll: rely on rcu for netdev_dpll_pin() This fixes a possible UAF in if_nlmsg_size(), which can run without RTNL. Add RCU protection to "struct dpll_pin". Move netdev_dpll_pin() from netdevice.h to dpll.h to decrease name pollution. Note: it looks possible to stop acquiring RTNL in netdev_dpll_pin_assign() later in net-next. v2: do not force rcu_read_lock() in rtnl_dpll_pin_size() (Jiri Pirko) Fixes: 5f1842692880 ("netdev: expose DPLL pin handle for netdevice") Signed-off-by: Eric Dumazet Cc: Arkadiusz Kubalewski Cc: Vadim Fedorenko Reviewed-by: Jiri Pirko Link: https://lore.kernel.org/r/20240223123208.3543319-1-edumazet@google.com Signed-off-by: Jakub Kicinski --- drivers/dpll/dpll_core.c | 2 +- drivers/dpll/dpll_core.h | 2 ++ include/linux/dpll.h | 11 +++++++++++ include/linux/netdevice.h | 11 +---------- net/core/dev.c | 2 +- 5 files changed, 16 insertions(+), 12 deletions(-) (limited to 'net/core/dev.c') diff --git a/drivers/dpll/dpll_core.c b/drivers/dpll/dpll_core.c index 5152bd1b0daf..4c2bb27c99fe 100644 --- a/drivers/dpll/dpll_core.c +++ b/drivers/dpll/dpll_core.c @@ -564,7 +564,7 @@ void dpll_pin_put(struct dpll_pin *pin) xa_destroy(&pin->parent_refs); xa_erase(&dpll_pin_xa, pin->id); dpll_pin_prop_free(&pin->prop); - kfree(pin); + kfree_rcu(pin, rcu); } mutex_unlock(&dpll_lock); } diff --git a/drivers/dpll/dpll_core.h b/drivers/dpll/dpll_core.h index 717f715015c7..2b6d8ef1cdf3 100644 --- a/drivers/dpll/dpll_core.h +++ b/drivers/dpll/dpll_core.h @@ -47,6 +47,7 @@ struct dpll_device { * @prop: pin properties copied from the registerer * @rclk_dev_name: holds name of device when pin can recover clock from it * @refcount: refcount + * @rcu: rcu_head for kfree_rcu() **/ struct dpll_pin { u32 id; @@ -57,6 +58,7 @@ struct dpll_pin { struct xarray parent_refs; struct dpll_pin_properties prop; refcount_t refcount; + struct rcu_head rcu; }; /** diff --git a/include/linux/dpll.h b/include/linux/dpll.h index 9cf896ea1d41..4ec2fe9caf5a 100644 --- a/include/linux/dpll.h +++ b/include/linux/dpll.h @@ -10,6 +10,8 @@ #include #include #include +#include +#include struct dpll_device; struct dpll_pin; @@ -167,4 +169,13 @@ int dpll_device_change_ntf(struct dpll_device *dpll); int dpll_pin_change_ntf(struct dpll_pin *pin); +static inline struct dpll_pin *netdev_dpll_pin(const struct net_device *dev) +{ +#if IS_ENABLED(CONFIG_DPLL) + return rcu_dereference_rtnl(dev->dpll_pin); +#else + return NULL; +#endif +} + #endif diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index ef7bfbb98497..a9c973b92294 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -2469,7 +2469,7 @@ struct net_device { struct devlink_port *devlink_port; #if IS_ENABLED(CONFIG_DPLL) - struct dpll_pin *dpll_pin; + struct dpll_pin __rcu *dpll_pin; #endif #if IS_ENABLED(CONFIG_PAGE_POOL) /** @page_pools: page pools created for this netdevice */ @@ -4035,15 +4035,6 @@ bool netdev_port_same_parent_id(struct net_device *a, struct
net_device *b); void netdev_dpll_pin_set(struct net_device *dev, struct dpll_pin *dpll_pin); void netdev_dpll_pin_clear(struct net_device *dev); -static inline struct dpll_pin *netdev_dpll_pin(const struct net_device *dev) -{ -#if IS_ENABLED(CONFIG_DPLL) - return dev->dpll_pin; -#else - return NULL; -#endif -} - struct sk_buff *validate_xmit_skb_list(struct sk_buff *skb, struct net_device *dev, bool *again); struct sk_buff *dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev, struct netdev_queue *txq, int *ret); diff --git a/net/core/dev.c b/net/core/dev.c index 73a021973007..0230391c78f7 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -9078,7 +9078,7 @@ static void netdev_dpll_pin_assign(struct net_device *dev, struct dpll_pin *dpll { #if IS_ENABLED(CONFIG_DPLL) rtnl_lock(); - dev->dpll_pin = dpll_pin; + rcu_assign_pointer(dev->dpll_pin, dpll_pin); rtnl_unlock(); #endif } -- cgit v1.2.3 From 1200097fa8f0d8e8ddfe5c554d8fa2bc03b2df92 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 27 Feb 2024 21:01:04 +0000 Subject: net: call skb_defer_free_flush() from __napi_busy_loop() skb_defer_free_flush() is currently called from net_rx_action() and napi_threaded_poll(). We should also call it from __napi_busy_loop(); otherwise there is a risk that the percpu queue can grow until an IPI is forced from skb_attempt_defer_free(), adding a latency spike. Signed-off-by: Eric Dumazet Cc: Samiullah Khawaja Acked-by: Stanislav Fomichev Reviewed-by: Jiri Pirko Link: https://lore.kernel.org/r/20240227210105.3815474-1-edumazet@google.com Signed-off-by: Jakub Kicinski --- net/core/dev.c | 43 ++++++++++++++++++++++--------------------- 1 file changed, 22 insertions(+), 21 deletions(-) (limited to 'net/core/dev.c') diff --git a/net/core/dev.c b/net/core/dev.c index 275fd5259a4a..053fac78305c 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -6173,6 +6173,27 @@ struct napi_struct *napi_by_id(unsigned int napi_id) return NULL; } +static void skb_defer_free_flush(struct softnet_data *sd) +{ + struct sk_buff *skb, *next; + + /* Paired with WRITE_ONCE() in skb_attempt_defer_free() */ + if (!READ_ONCE(sd->defer_list)) + return; + + spin_lock(&sd->defer_lock); + skb = sd->defer_list; + sd->defer_list = NULL; + sd->defer_count = 0; + spin_unlock(&sd->defer_lock); + + while (skb != NULL) { + next = skb->next; + napi_consume_skb(skb, 1); + skb = next; + } +} + #if defined(CONFIG_NET_RX_BUSY_POLL) static void __busy_poll_stop(struct napi_struct *napi, bool skip_schedule) @@ -6297,6 +6318,7 @@ count: if (work > 0) __NET_ADD_STATS(dev_net(napi->dev), LINUX_MIB_BUSYPOLLRXPACKETS, work); + skb_defer_free_flush(this_cpu_ptr(&softnet_data)); local_bh_enable(); if (!loop_end || loop_end(loop_end_arg, start_time)) @@ -6726,27 +6748,6 @@ static int napi_thread_wait(struct napi_struct *napi) return -1; } -static void skb_defer_free_flush(struct softnet_data *sd) -{ - struct sk_buff *skb, *next; - - /* Paired with WRITE_ONCE() in skb_attempt_defer_free() */ - if (!READ_ONCE(sd->defer_list)) - return; - - spin_lock(&sd->defer_lock); - skb = sd->defer_list; - sd->defer_list = NULL; - sd->defer_count = 0; - spin_unlock(&sd->defer_lock); - - while (skb != NULL) { - next = skb->next; - napi_consume_skb(skb, 1); - skb = next; - } -} - static int napi_threaded_poll(void *data) { struct napi_struct *napi = data; -- cgit v1.2.3 From 3e2f544dd8a33b2f650b32920b9bef103da2a7cd Mon Sep 17 00:00:00 2001 From: Breno Leitao Date: Wed, 28 Feb 2024 03:31:21 -0800 Subject: net: get stats64 if device driver is configured If the
network driver is relying on the net core to do stats allocation, then we want to use dev_get_tstats64() instead of netdev_stats_to_stats64(), since there are per-cpu stats that need to be taken into consideration. This will also simplify the drivers with regard to statistics. Once the driver sets NETDEV_PCPU_STAT_TSTATS, it does not need to allocate the stats itself, nor does it need to set `.ndo_get_stats64 = dev_get_tstats64` for the generic stats collection function anymore. Signed-off-by: Breno Leitao Reviewed-by: Simon Horman Signed-off-by: Paolo Abeni --- net/core/dev.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'net/core/dev.c') diff --git a/net/core/dev.c b/net/core/dev.c index 053fac78305c..34b39c03e97d 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -10702,6 +10702,8 @@ struct rtnl_link_stats64 *dev_get_stats(struct net_device *dev, ops->ndo_get_stats64(dev, storage); } else if (ops->ndo_get_stats) { netdev_stats_to_stats64(storage, ops->ndo_get_stats(dev)); + } else if (dev->pcpu_stat_type == NETDEV_PCPU_STAT_TSTATS) { + dev_get_tstats64(dev, storage); } else { netdev_stats_to_stats64(storage, &dev->stats); } -- cgit v1.2.3 From 590e92cdc835fcf435d8611f2477fff0e16877c7 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 29 Feb 2024 11:40:15 +0000 Subject: inet: prepare inet_base_seq() to run without RTNL In the following patch, inet_base_seq() will no longer be called with RTNL held. Add READ_ONCE()/WRITE_ONCE() annotations in dev_base_seq_inc() and inet_base_seq(). Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/dev.c | 5 +++-- net/ipv4/devinet.c | 2 +- 2 files changed, 4 insertions(+), 3 deletions(-) (limited to 'net/core/dev.c') diff --git a/net/core/dev.c b/net/core/dev.c index 4868e6734509..fe054cbd41e9 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -180,8 +180,9 @@ static DECLARE_RWSEM(devnet_rename_sem); static inline void dev_base_seq_inc(struct net *net) { - while (++net->dev_base_seq == 0) - ; + unsigned int val = net->dev_base_seq + 1; + + WRITE_ONCE(net->dev_base_seq, val ?: 1); } static inline struct hlist_head *dev_name_hash(struct net *net, const char *name) diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c index 550b775cbbf3..2afe78dfc3c2 100644 --- a/net/ipv4/devinet.c +++ b/net/ipv4/devinet.c @@ -1837,7 +1837,7 @@ done: static u32 inet_base_seq(const struct net *net) { u32 res = atomic_read(&net->ipv4.dev_addr_genid) + - net->dev_base_seq; + READ_ONCE(net->dev_base_seq); /* Must not return 0 (see nl_dump_check_consistent()). * Chose a value far away from 0. -- cgit v1.2.3 From 289e922582af5b4721ba02e86bde4d9ba918158a Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Mon, 4 Mar 2024 17:35:32 -0800 Subject: dpll: move all dpll<>netdev helpers to dpll code Older versions of GCC really want to know the full definition of the type involved in rcu_assign_pointer(). struct dpll_pin is defined in a local header, which net/core can't reach. Move all the netdev <> dpll code into dpll, where the type is known. Otherwise we'd need multiple function calls to jump between the compilation units. This is the same problem the commit under Fixes was trying to address, but with rcu_assign_pointer(), not rcu_dereference(). Some of the exports are not needed; the networking core can't be a module, so we only need exports for the helpers used by drivers.
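To make the incomplete-type problem concrete, here is a minimal sketch (not the kernel code; struct foo and struct holder are hypothetical stand-ins for struct dpll_pin and struct net_device):

/* public header: only a forward declaration of the type is visible */
struct foo;

struct holder {
	struct foo __rcu *ptr;
};

void holder_set(struct holder *h, struct foo *f);

/* foo.c: the one translation unit with the complete definition */
#include <linux/rcupdate.h>

struct foo {
	struct rcu_head rcu;
	int val;
};

void holder_set(struct holder *h, struct foo *f)
{
	/* safe here: the full definition of struct foo is in scope, so
	 * even older GCC accepts the rcu_assign_pointer() expansion */
	rcu_assign_pointer(h->ptr, f);
}

Callers in other compilation units then make a single function call instead of expanding the macro against an opaque type.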
Reported-by: Geert Uytterhoeven Link: https://lore.kernel.org/all/35a869c8-52e8-177-1d4d-e57578b99b6@linux-m68k.org/ Fixes: 640f41ed33b5 ("dpll: fix build failure due to rcu_dereference_check() on unknown type") Reviewed-by: Jiri Pirko Reviewed-by: Eric Dumazet Link: https://lore.kernel.org/r/20240305013532.694866-1-kuba@kernel.org Signed-off-by: Jakub Kicinski --- Documentation/driver-api/dpll.rst | 2 +- drivers/dpll/dpll_core.c | 25 +++++++++++++---- drivers/dpll/dpll_netlink.c | 38 ++++++++++++++++---------- drivers/net/ethernet/intel/ice/ice_dpll.c | 4 +-- drivers/net/ethernet/mellanox/mlx5/core/dpll.c | 4 +-- include/linux/dpll.h | 26 +++++++++--------- include/linux/netdevice.h | 4 --- net/core/dev.c | 22 --------------- net/core/rtnetlink.c | 4 +-- 9 files changed, 64 insertions(+), 65 deletions(-) (limited to 'net/core/dev.c') diff --git a/Documentation/driver-api/dpll.rst b/Documentation/driver-api/dpll.rst index e3d593841aa7..ea8d16600e16 100644 --- a/Documentation/driver-api/dpll.rst +++ b/Documentation/driver-api/dpll.rst @@ -545,7 +545,7 @@ In such scenario, dpll device input signal shall be also configurable to drive dpll with signal recovered from the PHY netdevice. This is done by exposing a pin to the netdevice - attaching pin to the netdevice itself with -``netdev_dpll_pin_set(struct net_device *dev, struct dpll_pin *dpll_pin)``. +``dpll_netdev_pin_set(struct net_device *dev, struct dpll_pin *dpll_pin)``. Exposed pin id handle ``DPLL_A_PIN_ID`` is then identifiable by the user as it is attached to rtnetlink respond to get ``RTM_NEWLINK`` command in nested attribute ``IFLA_DPLL_PIN``. diff --git a/drivers/dpll/dpll_core.c b/drivers/dpll/dpll_core.c index 241db366b2c7..7f686d179fc9 100644 --- a/drivers/dpll/dpll_core.c +++ b/drivers/dpll/dpll_core.c @@ -42,11 +42,6 @@ struct dpll_pin_registration { void *priv; }; -struct dpll_pin *netdev_dpll_pin(const struct net_device *dev) -{ - return rcu_dereference_rtnl(dev->dpll_pin); -} - struct dpll_device *dpll_device_get_by_id(int id) { if (xa_get_mark(&dpll_device_xa, id, DPLL_REGISTERED)) @@ -513,6 +508,26 @@ err_pin_prop: return ERR_PTR(ret); } +static void dpll_netdev_pin_assign(struct net_device *dev, struct dpll_pin *dpll_pin) +{ + rtnl_lock(); + rcu_assign_pointer(dev->dpll_pin, dpll_pin); + rtnl_unlock(); +} + +void dpll_netdev_pin_set(struct net_device *dev, struct dpll_pin *dpll_pin) +{ + WARN_ON(!dpll_pin); + dpll_netdev_pin_assign(dev, dpll_pin); +} +EXPORT_SYMBOL(dpll_netdev_pin_set); + +void dpll_netdev_pin_clear(struct net_device *dev) +{ + dpll_netdev_pin_assign(dev, NULL); +} +EXPORT_SYMBOL(dpll_netdev_pin_clear); + /** * dpll_pin_get - find existing or create new dpll pin * @clock_id: clock_id of creator diff --git a/drivers/dpll/dpll_netlink.c b/drivers/dpll/dpll_netlink.c index 4ca9ad16cd95..b57355e0c214 100644 --- a/drivers/dpll/dpll_netlink.c +++ b/drivers/dpll/dpll_netlink.c @@ -8,6 +8,7 @@ */ #include #include +#include #include #include "dpll_core.h" #include "dpll_netlink.h" @@ -47,18 +48,6 @@ dpll_msg_add_dev_parent_handle(struct sk_buff *msg, u32 id) return 0; } -/** - * dpll_msg_pin_handle_size - get size of pin handle attribute for given pin - * @pin: pin pointer - * - * Return: byte size of pin handle attribute for given pin. - */ -size_t dpll_msg_pin_handle_size(struct dpll_pin *pin) -{ - return pin ? 
nla_total_size(4) : 0; /* DPLL_A_PIN_ID */ -} -EXPORT_SYMBOL_GPL(dpll_msg_pin_handle_size); - /** * dpll_msg_add_pin_handle - attach pin handle attribute to a given message * @msg: pointer to sk_buff message to attach a pin handle @@ -68,7 +57,7 @@ EXPORT_SYMBOL_GPL(dpll_msg_pin_handle_size); * * 0 - success * * -EMSGSIZE - no space in message to attach pin handle */ -int dpll_msg_add_pin_handle(struct sk_buff *msg, struct dpll_pin *pin) +static int dpll_msg_add_pin_handle(struct sk_buff *msg, struct dpll_pin *pin) { if (!pin) return 0; @@ -76,7 +65,28 @@ int dpll_msg_add_pin_handle(struct sk_buff *msg, struct dpll_pin *pin) return -EMSGSIZE; return 0; } -EXPORT_SYMBOL_GPL(dpll_msg_add_pin_handle); + +static struct dpll_pin *dpll_netdev_pin(const struct net_device *dev) +{ + return rcu_dereference_rtnl(dev->dpll_pin); +} + +/** + * dpll_netdev_pin_handle_size - get size of pin handle attribute of a netdev + * @dev: netdev from which to get the pin + * + * Return: byte size of pin handle attribute, or 0 if @dev has no pin. + */ +size_t dpll_netdev_pin_handle_size(const struct net_device *dev) +{ + return dpll_netdev_pin(dev) ? nla_total_size(4) : 0; /* DPLL_A_PIN_ID */ +} + +int dpll_netdev_add_pin_handle(struct sk_buff *msg, + const struct net_device *dev) +{ + return dpll_msg_add_pin_handle(msg, dpll_netdev_pin(dev)); +} static int dpll_msg_add_mode(struct sk_buff *msg, struct dpll_device *dpll, diff --git a/drivers/net/ethernet/intel/ice/ice_dpll.c b/drivers/net/ethernet/intel/ice/ice_dpll.c index adfa1f2a80a6..c59e972dbaae 100644 --- a/drivers/net/ethernet/intel/ice/ice_dpll.c +++ b/drivers/net/ethernet/intel/ice/ice_dpll.c @@ -1597,7 +1597,7 @@ static void ice_dpll_deinit_rclk_pin(struct ice_pf *pf) } if (WARN_ON_ONCE(!vsi || !vsi->netdev)) return; - netdev_dpll_pin_clear(vsi->netdev); + dpll_netdev_pin_clear(vsi->netdev); dpll_pin_put(rclk->pin); } @@ -1641,7 +1641,7 @@ ice_dpll_init_rclk_pins(struct ice_pf *pf, struct ice_dpll_pin *pin, } if (WARN_ON((!vsi || !vsi->netdev))) return -EINVAL; - netdev_dpll_pin_set(vsi->netdev, pf->dplls.rclk.pin); + dpll_netdev_pin_set(vsi->netdev, pf->dplls.rclk.pin); return 0; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/dpll.c b/drivers/net/ethernet/mellanox/mlx5/core/dpll.c index 928bf24d4b12..d74a5aaf4268 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/dpll.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/dpll.c @@ -261,7 +261,7 @@ static void mlx5_dpll_netdev_dpll_pin_set(struct mlx5_dpll *mdpll, { if (mdpll->tracking_netdev) return; - netdev_dpll_pin_set(netdev, mdpll->dpll_pin); + dpll_netdev_pin_set(netdev, mdpll->dpll_pin); mdpll->tracking_netdev = netdev; } @@ -269,7 +269,7 @@ static void mlx5_dpll_netdev_dpll_pin_clear(struct mlx5_dpll *mdpll) { if (!mdpll->tracking_netdev) return; - netdev_dpll_pin_clear(mdpll->tracking_netdev); + dpll_netdev_pin_clear(mdpll->tracking_netdev); mdpll->tracking_netdev = NULL; } diff --git a/include/linux/dpll.h b/include/linux/dpll.h index c60591308ae8..e37344f6a231 100644 --- a/include/linux/dpll.h +++ b/include/linux/dpll.h @@ -122,15 +122,24 @@ struct dpll_pin_properties { }; #if IS_ENABLED(CONFIG_DPLL) -size_t dpll_msg_pin_handle_size(struct dpll_pin *pin); -int dpll_msg_add_pin_handle(struct sk_buff *msg, struct dpll_pin *pin); +void dpll_netdev_pin_set(struct net_device *dev, struct dpll_pin *dpll_pin); +void dpll_netdev_pin_clear(struct net_device *dev); + +size_t dpll_netdev_pin_handle_size(const struct net_device *dev); +int dpll_netdev_add_pin_handle(struct sk_buff *msg, + const struct 
net_device *dev); #else -static inline size_t dpll_msg_pin_handle_size(struct dpll_pin *pin) +static inline void +dpll_netdev_pin_set(struct net_device *dev, struct dpll_pin *dpll_pin) { } +static inline void dpll_netdev_pin_clear(struct net_device *dev) { } + +static inline size_t dpll_netdev_pin_handle_size(const struct net_device *dev) { return 0; } -static inline int dpll_msg_add_pin_handle(struct sk_buff *msg, struct dpll_pin *pin) +static inline int +dpll_netdev_add_pin_handle(struct sk_buff *msg, const struct net_device *dev) { return 0; } @@ -169,13 +178,4 @@ int dpll_device_change_ntf(struct dpll_device *dpll); int dpll_pin_change_ntf(struct dpll_pin *pin); -#if !IS_ENABLED(CONFIG_DPLL) -static inline struct dpll_pin *netdev_dpll_pin(const struct net_device *dev) -{ - return NULL; -} -#else -struct dpll_pin *netdev_dpll_pin(const struct net_device *dev); -#endif - #endif diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 735a9386fcf8..78a09af89e39 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -79,8 +79,6 @@ struct xdp_buff; struct xdp_frame; struct xdp_metadata_ops; struct xdp_md; -/* DPLL specific */ -struct dpll_pin; typedef u32 xdp_features_t; @@ -4042,8 +4040,6 @@ int dev_get_mac_address(struct sockaddr *sa, struct net *net, char *dev_name); int dev_get_port_parent_id(struct net_device *dev, struct netdev_phys_item_id *ppid, bool recurse); bool netdev_port_same_parent_id(struct net_device *a, struct net_device *b); -void netdev_dpll_pin_set(struct net_device *dev, struct dpll_pin *dpll_pin); -void netdev_dpll_pin_clear(struct net_device *dev); struct sk_buff *validate_xmit_skb_list(struct sk_buff *skb, struct net_device *dev, bool *again); struct sk_buff *dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev, diff --git a/net/core/dev.c b/net/core/dev.c index 0230391c78f7..76e6438f4858 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -9074,28 +9074,6 @@ bool netdev_port_same_parent_id(struct net_device *a, struct net_device *b) } EXPORT_SYMBOL(netdev_port_same_parent_id); -static void netdev_dpll_pin_assign(struct net_device *dev, struct dpll_pin *dpll_pin) -{ -#if IS_ENABLED(CONFIG_DPLL) - rtnl_lock(); - rcu_assign_pointer(dev->dpll_pin, dpll_pin); - rtnl_unlock(); -#endif -} - -void netdev_dpll_pin_set(struct net_device *dev, struct dpll_pin *dpll_pin) -{ - WARN_ON(!dpll_pin); - netdev_dpll_pin_assign(dev, dpll_pin); -} -EXPORT_SYMBOL(netdev_dpll_pin_set); - -void netdev_dpll_pin_clear(struct net_device *dev) -{ - netdev_dpll_pin_assign(dev, NULL); -} -EXPORT_SYMBOL(netdev_dpll_pin_clear); - /** * dev_change_proto_down - set carrier according to proto_down. 
* diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index ae86f751efc3..bd50e9fe3234 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -1057,7 +1057,7 @@ static size_t rtnl_dpll_pin_size(const struct net_device *dev) { size_t size = nla_total_size(0); /* nest IFLA_DPLL_PIN */ - size += dpll_msg_pin_handle_size(netdev_dpll_pin(dev)); + size += dpll_netdev_pin_handle_size(dev); return size; } @@ -1792,7 +1792,7 @@ static int rtnl_fill_dpll_pin(struct sk_buff *skb, if (!dpll_pin_nest) return -EMSGSIZE; - ret = dpll_msg_add_pin_handle(skb, netdev_dpll_pin(dev)); + ret = dpll_netdev_add_pin_handle(skb, dev); if (ret < 0) goto nest_cancel; -- cgit v1.2.3 From ae6e22f7b7f0702015d86cfa036492b94be92f04 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 6 Mar 2024 16:00:15 +0000 Subject: net: move netdev_budget and netdev_budget_usecs to net_hotdata netdev_budget and netdev_budget_usecs are used in the rx path (net_rx_action()). Move them into net_hotdata for better cache locality. Signed-off-by: Eric Dumazet Acked-by: Soheil Hassas Yeganeh Reviewed-by: David Ahern Link: https://lore.kernel.org/r/20240306160031.874438-3-edumazet@google.com Signed-off-by: Jakub Kicinski --- include/net/hotdata.h | 2 ++ net/core/dev.c | 7 ++----- net/core/dev.h | 2 -- net/core/hotdata.c | 6 ++++++ net/core/sysctl_net_core.c | 4 ++-- 5 files changed, 12 insertions(+), 9 deletions(-) (limited to 'net/core/dev.c') diff --git a/include/net/hotdata.h b/include/net/hotdata.h index 6ed32e4e34aa..72170223385e 100644 --- a/include/net/hotdata.h +++ b/include/net/hotdata.h @@ -8,6 +8,8 @@ struct net_hotdata { struct list_head offload_base; int gro_normal_batch; + int netdev_budget; + int netdev_budget_usecs; }; extern struct net_hotdata net_hotdata; diff --git a/net/core/dev.c b/net/core/dev.c index bf933eeaa688..9ccb48618dba 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -4410,9 +4410,6 @@ EXPORT_SYMBOL(netdev_max_backlog); int netdev_tstamp_prequeue __read_mostly = 1; unsigned int sysctl_skb_defer_max __read_mostly = 64; -int netdev_budget __read_mostly = 300; -/* Must be at least 2 jiffes to guarantee 1 jiffy timeout */ -unsigned int __read_mostly netdev_budget_usecs = 2 * USEC_PER_SEC / HZ; int weight_p __read_mostly = 64; /* old backlog weight */ int dev_weight_rx_bias __read_mostly = 1; /* bias for backlog weight */ int dev_weight_tx_bias __read_mostly = 1; /* bias for output_queue quota */ @@ -6790,8 +6787,8 @@ static __latent_entropy void net_rx_action(struct softirq_action *h) { struct softnet_data *sd = this_cpu_ptr(&softnet_data); unsigned long time_limit = jiffies + - usecs_to_jiffies(READ_ONCE(netdev_budget_usecs)); - int budget = READ_ONCE(netdev_budget); + usecs_to_jiffies(READ_ONCE(net_hotdata.netdev_budget_usecs)); + int budget = READ_ONCE(net_hotdata.netdev_budget); LIST_HEAD(list); LIST_HEAD(repoll); diff --git a/net/core/dev.h b/net/core/dev.h index 45892267848d..9a6170530850 100644 --- a/net/core/dev.h +++ b/net/core/dev.h @@ -38,8 +38,6 @@ int dev_addr_init(struct net_device *dev); void dev_addr_check(struct net_device *dev); /* sysctls not referred to from outside net/core/ */ -extern int netdev_budget; -extern unsigned int netdev_budget_usecs; extern unsigned int sysctl_skb_defer_max; extern int netdev_tstamp_prequeue; extern int netdev_unregister_timeout_secs; diff --git a/net/core/hotdata.c b/net/core/hotdata.c index abb8ad19d59a..907d69120397 100644 --- a/net/core/hotdata.c +++ b/net/core/hotdata.c @@ -1,9 +1,15 @@ // SPDX-License-Identifier: GPL-2.0-or-later #include #include +#include
#include + struct net_hotdata net_hotdata __cacheline_aligned = { .offload_base = LIST_HEAD_INIT(net_hotdata.offload_base), .gro_normal_batch = 8, + + .netdev_budget = 300, + /* Must be at least 2 jiffes to guarantee 1 jiffy timeout */ + .netdev_budget_usecs = 2 * USEC_PER_SEC / HZ, }; diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c index 0eb1242eabbe..a9c2d798b219 100644 --- a/net/core/sysctl_net_core.c +++ b/net/core/sysctl_net_core.c @@ -577,7 +577,7 @@ static struct ctl_table net_core_table[] = { #endif { .procname = "netdev_budget", - .data = &netdev_budget, + .data = &net_hotdata.netdev_budget, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec @@ -600,7 +600,7 @@ static struct ctl_table net_core_table[] = { }, { .procname = "netdev_budget_usecs", - .data = &netdev_budget_usecs, + .data = &net_hotdata.netdev_budget_usecs, .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_dointvec_minmax, -- cgit v1.2.3 From f59b5416c396ac4910dd7a0cdf26cbb0e1faf529 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 6 Mar 2024 16:00:16 +0000 Subject: net: move netdev_tstamp_prequeue into net_hotdata netdev_tstamp_prequeue is used in rx path. Move it to net_hotdata for better cache locality. Signed-off-by: Eric Dumazet Acked-by: Soheil Hassas Yeganeh Reviewed-by: David Ahern Link: https://lore.kernel.org/r/20240306160031.874438-4-edumazet@google.com Signed-off-by: Jakub Kicinski --- include/net/hotdata.h | 1 + net/core/dev.c | 10 +++++----- net/core/dev.h | 1 - net/core/hotdata.c | 2 ++ net/core/sysctl_net_core.c | 2 +- 5 files changed, 9 insertions(+), 7 deletions(-) (limited to 'net/core/dev.c') diff --git a/include/net/hotdata.h b/include/net/hotdata.h index 72170223385e..149e56528537 100644 --- a/include/net/hotdata.h +++ b/include/net/hotdata.h @@ -10,6 +10,7 @@ struct net_hotdata { int gro_normal_batch; int netdev_budget; int netdev_budget_usecs; + int tstamp_prequeue; }; extern struct net_hotdata net_hotdata; diff --git a/net/core/dev.c b/net/core/dev.c index 9ccb48618dba..f2f44303c035 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -4408,7 +4408,6 @@ EXPORT_SYMBOL(__dev_direct_xmit); int netdev_max_backlog __read_mostly = 1000; EXPORT_SYMBOL(netdev_max_backlog); -int netdev_tstamp_prequeue __read_mostly = 1; unsigned int sysctl_skb_defer_max __read_mostly = 64; int weight_p __read_mostly = 64; /* old backlog weight */ int dev_weight_rx_bias __read_mostly = 1; /* bias for backlog weight */ @@ -5052,7 +5051,7 @@ static int netif_rx_internal(struct sk_buff *skb) { int ret; - net_timestamp_check(READ_ONCE(netdev_tstamp_prequeue), skb); + net_timestamp_check(READ_ONCE(net_hotdata.tstamp_prequeue), skb); trace_netif_rx(skb); @@ -5344,7 +5343,7 @@ static int __netif_receive_skb_core(struct sk_buff **pskb, bool pfmemalloc, int ret = NET_RX_DROP; __be16 type; - net_timestamp_check(!READ_ONCE(netdev_tstamp_prequeue), skb); + net_timestamp_check(!READ_ONCE(net_hotdata.tstamp_prequeue), skb); trace_netif_receive_skb(skb); @@ -5728,7 +5727,7 @@ static int netif_receive_skb_internal(struct sk_buff *skb) { int ret; - net_timestamp_check(READ_ONCE(netdev_tstamp_prequeue), skb); + net_timestamp_check(READ_ONCE(net_hotdata.tstamp_prequeue), skb); if (skb_defer_rx_timestamp(skb)) return NET_RX_SUCCESS; @@ -5758,7 +5757,8 @@ void netif_receive_skb_list_internal(struct list_head *head) INIT_LIST_HEAD(&sublist); list_for_each_entry_safe(skb, next, head, list) { - net_timestamp_check(READ_ONCE(netdev_tstamp_prequeue), skb); + 
net_timestamp_check(READ_ONCE(net_hotdata.tstamp_prequeue), + skb); skb_list_del_init(skb); if (!skb_defer_rx_timestamp(skb)) list_add_tail(&skb->list, &sublist); diff --git a/net/core/dev.h b/net/core/dev.h index 9a6170530850..2bcaf8eee50c 100644 --- a/net/core/dev.h +++ b/net/core/dev.h @@ -39,7 +39,6 @@ void dev_addr_check(struct net_device *dev); /* sysctls not referred to from outside net/core/ */ extern unsigned int sysctl_skb_defer_max; -extern int netdev_tstamp_prequeue; extern int netdev_unregister_timeout_secs; extern int weight_p; extern int dev_weight_rx_bias; diff --git a/net/core/hotdata.c b/net/core/hotdata.c index 907d69120397..087c4c84987d 100644 --- a/net/core/hotdata.c +++ b/net/core/hotdata.c @@ -12,4 +12,6 @@ struct net_hotdata net_hotdata __cacheline_aligned = { .netdev_budget = 300, /* Must be at least 2 jiffes to guarantee 1 jiffy timeout */ .netdev_budget_usecs = 2 * USEC_PER_SEC / HZ, + + .tstamp_prequeue = 1, }; diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c index a9c2d798b219..bddd07da0998 100644 --- a/net/core/sysctl_net_core.c +++ b/net/core/sysctl_net_core.c @@ -499,7 +499,7 @@ static struct ctl_table net_core_table[] = { #endif { .procname = "netdev_tstamp_prequeue", - .data = &netdev_tstamp_prequeue, + .data = &net_hotdata.tstamp_prequeue, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec -- cgit v1.2.3 From 0b91fa4bfb1caedd01cb6eb3b733cbc77c9edb0e Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 6 Mar 2024 16:00:17 +0000 Subject: net: move ptype_all into net_hotdata ptype_all is used in rx/tx fast paths. Move it to net_hotdata for better cache locality. Signed-off-by: Eric Dumazet Acked-by: Soheil Hassas Yeganeh Reviewed-by: David Ahern Link: https://lore.kernel.org/r/20240306160031.874438-5-edumazet@google.com Signed-off-by: Jakub Kicinski --- include/linux/netdevice.h | 1 - include/net/hotdata.h | 1 + net/core/dev.c | 16 +++++++--------- net/core/hotdata.c | 1 + net/core/net-procfs.c | 7 ++++--- 5 files changed, 13 insertions(+), 13 deletions(-) (limited to 'net/core/dev.c') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 6643452af543..b18ac8072f18 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -5306,7 +5306,6 @@ static inline const char *netdev_reg_state(const struct net_device *dev) #define PTYPE_HASH_SIZE (16) #define PTYPE_HASH_MASK (PTYPE_HASH_SIZE - 1) -extern struct list_head ptype_all __read_mostly; extern struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly; extern struct net_device *blackhole_netdev; diff --git a/include/net/hotdata.h b/include/net/hotdata.h index 149e56528537..d462cb8f16ba 100644 --- a/include/net/hotdata.h +++ b/include/net/hotdata.h @@ -7,6 +7,7 @@ /* Read mostly data used in network fast paths. 
*/ struct net_hotdata { struct list_head offload_base; + struct list_head ptype_all; int gro_normal_batch; int netdev_budget; int netdev_budget_usecs; diff --git a/net/core/dev.c b/net/core/dev.c index f2f44303c035..e1493e558407 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -161,7 +161,6 @@ static DEFINE_SPINLOCK(ptype_lock); struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly; -struct list_head ptype_all __read_mostly; /* Taps */ static int netif_rx_internal(struct sk_buff *skb); static int call_netdevice_notifiers_extack(unsigned long val, @@ -540,7 +539,7 @@ static inline void netdev_set_addr_lockdep_class(struct net_device *dev) static inline struct list_head *ptype_head(const struct packet_type *pt) { if (pt->type == htons(ETH_P_ALL)) - return pt->dev ? &pt->dev->ptype_all : &ptype_all; + return pt->dev ? &pt->dev->ptype_all : &net_hotdata.ptype_all; else return pt->dev ? &pt->dev->ptype_specific : &ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK]; @@ -2226,7 +2225,8 @@ static inline bool skb_loop_sk(struct packet_type *ptype, struct sk_buff *skb) */ bool dev_nit_active(struct net_device *dev) { - return !list_empty(&ptype_all) || !list_empty(&dev->ptype_all); + return !list_empty(&net_hotdata.ptype_all) || + !list_empty(&dev->ptype_all); } EXPORT_SYMBOL_GPL(dev_nit_active); @@ -2237,10 +2237,9 @@ EXPORT_SYMBOL_GPL(dev_nit_active); void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev) { - struct packet_type *ptype; + struct list_head *ptype_list = &net_hotdata.ptype_all; + struct packet_type *ptype, *pt_prev = NULL; struct sk_buff *skb2 = NULL; - struct packet_type *pt_prev = NULL; - struct list_head *ptype_list = &ptype_all; rcu_read_lock(); again: @@ -2286,7 +2285,7 @@ again: pt_prev = ptype; } - if (ptype_list == &ptype_all) { + if (ptype_list == &net_hotdata.ptype_all) { ptype_list = &dev->ptype_all; goto again; } @@ -5387,7 +5386,7 @@ another_round: if (pfmemalloc) goto skip_taps; - list_for_each_entry_rcu(ptype, &ptype_all, list) { + list_for_each_entry_rcu(ptype, &net_hotdata.ptype_all, list) { if (pt_prev) ret = deliver_skb(skb, pt_prev, orig_dev); pt_prev = ptype; @@ -11749,7 +11748,6 @@ static int __init net_dev_init(void) if (netdev_kobject_init()) goto out; - INIT_LIST_HEAD(&ptype_all); for (i = 0; i < PTYPE_HASH_SIZE; i++) INIT_LIST_HEAD(&ptype_base[i]); diff --git a/net/core/hotdata.c b/net/core/hotdata.c index 087c4c84987d..29fcfe89fd9a 100644 --- a/net/core/hotdata.c +++ b/net/core/hotdata.c @@ -7,6 +7,7 @@ struct net_hotdata net_hotdata __cacheline_aligned = { .offload_base = LIST_HEAD_INIT(net_hotdata.offload_base), + .ptype_all = LIST_HEAD_INIT(net_hotdata.ptype_all), .gro_normal_batch = 8, .netdev_budget = 300, diff --git a/net/core/net-procfs.c b/net/core/net-procfs.c index 2e4e96d30ee1..a97eceb84e61 100644 --- a/net/core/net-procfs.c +++ b/net/core/net-procfs.c @@ -3,6 +3,7 @@ #include #include #include +#include #include "dev.h" @@ -183,7 +184,7 @@ static void *ptype_get_idx(struct seq_file *seq, loff_t pos) } } - list_for_each_entry_rcu(pt, &ptype_all, list) { + list_for_each_entry_rcu(pt, &net_hotdata.ptype_all, list) { if (i == pos) return pt; ++i; @@ -231,13 +232,13 @@ static void *ptype_seq_next(struct seq_file *seq, void *v, loff_t *pos) } } - nxt = ptype_all.next; + nxt = net_hotdata.ptype_all.next; goto ptype_all; } if (pt->type == htons(ETH_P_ALL)) { ptype_all: - if (nxt != &ptype_all) + if (nxt != &net_hotdata.ptype_all) goto found; hash = 0; nxt = ptype_base[0].next; -- cgit v1.2.3 From edbc666cdcbf4a80ada4311c272a2078af87b880 
Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 6 Mar 2024 16:00:18 +0000 Subject: net: move netdev_max_backlog to net_hotdata netdev_max_backlog is used in the rx fast path. Move it to net_hotdata for better cache locality. Signed-off-by: Eric Dumazet Acked-by: Soheil Hassas Yeganeh Reviewed-by: David Ahern Link: https://lore.kernel.org/r/20240306160031.874438-6-edumazet@google.com Signed-off-by: Jakub Kicinski --- include/linux/netdevice.h | 1 - include/net/hotdata.h | 1 + net/core/dev.c | 8 +++----- net/core/gro_cells.c | 3 ++- net/core/hotdata.c | 2 ++ net/core/sysctl_net_core.c | 2 +- net/xfrm/espintcp.c | 4 +++- net/xfrm/xfrm_input.c | 3 ++- 8 files changed, 14 insertions(+), 10 deletions(-) (limited to 'net/core/dev.c') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index b18ac8072f18..c9a671b7bb37 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -4793,7 +4793,6 @@ void dev_fetch_sw_netstats(struct rtnl_link_stats64 *s, const struct pcpu_sw_netstats __percpu *netstats); void dev_get_tstats64(struct net_device *dev, struct rtnl_link_stats64 *s); -extern int netdev_max_backlog; extern int dev_rx_weight; extern int dev_tx_weight; diff --git a/include/net/hotdata.h b/include/net/hotdata.h index d462cb8f16ba..dc50b200a94b 100644 --- a/include/net/hotdata.h +++ b/include/net/hotdata.h @@ -12,6 +12,7 @@ struct net_hotdata { int netdev_budget; int netdev_budget_usecs; int tstamp_prequeue; + int max_backlog; }; extern struct net_hotdata net_hotdata; diff --git a/net/core/dev.c b/net/core/dev.c index e1493e558407..e23edbaff392 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -4404,9 +4404,6 @@ EXPORT_SYMBOL(__dev_direct_xmit); * Receiver routines *************************************************************************/ -int netdev_max_backlog __read_mostly = 1000; -EXPORT_SYMBOL(netdev_max_backlog); - unsigned int sysctl_skb_defer_max __read_mostly = 64; int weight_p __read_mostly = 64; /* old backlog weight */ int dev_weight_rx_bias __read_mostly = 1; /* bias for backlog weight */ @@ -4713,7 +4710,7 @@ static bool skb_flow_limit(struct sk_buff *skb, unsigned int qlen) struct softnet_data *sd; unsigned int old_flow, new_flow; - if (qlen < (READ_ONCE(netdev_max_backlog) >> 1)) + if (qlen < (READ_ONCE(net_hotdata.max_backlog) >> 1)) return false; sd = this_cpu_ptr(&softnet_data); @@ -4761,7 +4758,8 @@ static int enqueue_to_backlog(struct sk_buff *skb, int cpu, if (!netif_running(skb->dev)) goto drop; qlen = skb_queue_len(&sd->input_pkt_queue); - if (qlen <= READ_ONCE(netdev_max_backlog) && !skb_flow_limit(skb, qlen)) { + if (qlen <= READ_ONCE(net_hotdata.max_backlog) && + !skb_flow_limit(skb, qlen)) { if (qlen) { enqueue: __skb_queue_tail(&sd->input_pkt_queue, skb); diff --git a/net/core/gro_cells.c b/net/core/gro_cells.c index ed5ec5de47f6..ff8e5b64bf6b 100644 --- a/net/core/gro_cells.c +++ b/net/core/gro_cells.c @@ -3,6 +3,7 @@ #include #include #include +#include struct gro_cell { struct sk_buff_head napi_skbs; @@ -26,7 +27,7 @@ int gro_cells_receive(struct gro_cells *gcells, struct sk_buff *skb) cell = this_cpu_ptr(gcells->cells); - if (skb_queue_len(&cell->napi_skbs) > READ_ONCE(netdev_max_backlog)) { + if (skb_queue_len(&cell->napi_skbs) > READ_ONCE(net_hotdata.max_backlog)) { drop: dev_core_stats_rx_dropped_inc(dev); kfree_skb(skb); diff --git a/net/core/hotdata.c b/net/core/hotdata.c index 29fcfe89fd9a..35ed5a83ecc7 100644 --- a/net/core/hotdata.c +++ b/net/core/hotdata.c @@ -15,4 +15,6 @@ struct net_hotdata net_hotdata
__cacheline_aligned = { .netdev_budget_usecs = 2 * USEC_PER_SEC / HZ, .tstamp_prequeue = 1, + .max_backlog = 1000, }; +EXPORT_SYMBOL(net_hotdata); diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c index bddd07da0998..8eaeeb289914 100644 --- a/net/core/sysctl_net_core.c +++ b/net/core/sysctl_net_core.c @@ -440,7 +440,7 @@ static struct ctl_table net_core_table[] = { }, { .procname = "netdev_max_backlog", - .data = &netdev_max_backlog, + .data = &net_hotdata.max_backlog, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec diff --git a/net/xfrm/espintcp.c b/net/xfrm/espintcp.c index d3b3f9e720b3..fe82e2d07300 100644 --- a/net/xfrm/espintcp.c +++ b/net/xfrm/espintcp.c @@ -10,6 +10,7 @@ #if IS_ENABLED(CONFIG_IPV6) #include #endif +#include static void handle_nonesp(struct espintcp_ctx *ctx, struct sk_buff *skb, struct sock *sk) @@ -169,7 +170,8 @@ int espintcp_queue_out(struct sock *sk, struct sk_buff *skb) { struct espintcp_ctx *ctx = espintcp_getctx(sk); - if (skb_queue_len(&ctx->out_queue) >= READ_ONCE(netdev_max_backlog)) + if (skb_queue_len(&ctx->out_queue) >= + READ_ONCE(net_hotdata.max_backlog)) return -ENOBUFS; __skb_queue_tail(&ctx->out_queue, skb); diff --git a/net/xfrm/xfrm_input.c b/net/xfrm/xfrm_input.c index bd4ce21d76d7..161f535c8b94 100644 --- a/net/xfrm/xfrm_input.c +++ b/net/xfrm/xfrm_input.c @@ -21,6 +21,7 @@ #include #include #include +#include #include "xfrm_inout.h" @@ -764,7 +765,7 @@ int xfrm_trans_queue_net(struct net *net, struct sk_buff *skb, trans = this_cpu_ptr(&xfrm_trans_tasklet); - if (skb_queue_len(&trans->queue) >= READ_ONCE(netdev_max_backlog)) + if (skb_queue_len(&trans->queue) >= READ_ONCE(net_hotdata.max_backlog)) return -ENOBUFS; BUILD_BUG_ON(sizeof(struct xfrm_trans_cb) > sizeof(skb->cb)); -- cgit v1.2.3 From 26722dc74bf08fd79564cbcad1e5f3e2aa3bf9cc Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 6 Mar 2024 16:00:21 +0000 Subject: net: move dev_tx_weight to net_hotdata dev_tx_weight is used in tx fast path. Move it to net_hotdata for better cache locality. 
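For context, this value is consumed as a per-invocation packet quota by the qdisc run loop; a simplified sketch of __qdisc_run() (TCQ_F_NOLOCK and need_resched() handling omitted):

void qdisc_run_sketch(struct Qdisc *q)
{
	int quota = READ_ONCE(net_hotdata.dev_tx_weight);
	int packets;

	while (qdisc_restart(q, &packets)) {
		/* each restart may transmit a whole GSO batch */
		quota -= packets;
		if (quota <= 0) {
			/* budget exhausted: defer the rest to the TX softirq */
			__netif_schedule(q);
			break;
		}
	}
}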
Signed-off-by: Eric Dumazet Acked-by: Soheil Hassas Yeganeh Reviewed-by: David Ahern Link: https://lore.kernel.org/r/20240306160031.874438-9-edumazet@google.com Signed-off-by: Jakub Kicinski --- include/linux/netdevice.h | 1 - include/net/hotdata.h | 1 + net/core/dev.c | 1 - net/core/hotdata.c | 1 + net/core/sysctl_net_core.c | 2 +- net/sched/sch_generic.c | 3 ++- 6 files changed, 5 insertions(+), 4 deletions(-) (limited to 'net/core/dev.c') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index c9a671b7bb37..ad4b031098ff 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -4794,7 +4794,6 @@ void dev_fetch_sw_netstats(struct rtnl_link_stats64 *s, void dev_get_tstats64(struct net_device *dev, struct rtnl_link_stats64 *s); extern int dev_rx_weight; -extern int dev_tx_weight; enum { NESTED_SYNC_IMM_BIT, diff --git a/include/net/hotdata.h b/include/net/hotdata.h index d86d02f156fc..ffea9cc263e5 100644 --- a/include/net/hotdata.h +++ b/include/net/hotdata.h @@ -21,6 +21,7 @@ struct net_hotdata { int netdev_budget_usecs; int tstamp_prequeue; int max_backlog; + int dev_tx_weight; }; extern struct net_hotdata net_hotdata; diff --git a/net/core/dev.c b/net/core/dev.c index e23edbaff392..5d6bd481103f 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -4409,7 +4409,6 @@ int weight_p __read_mostly = 64; /* old backlog weight */ int dev_weight_rx_bias __read_mostly = 1; /* bias for backlog weight */ int dev_weight_tx_bias __read_mostly = 1; /* bias for output_queue quota */ int dev_rx_weight __read_mostly = 64; -int dev_tx_weight __read_mostly = 64; /* Called with irq disabled */ static inline void ____napi_schedule(struct softnet_data *sd, diff --git a/net/core/hotdata.c b/net/core/hotdata.c index 35ed5a83ecc7..ec8c3b48e8fe 100644 --- a/net/core/hotdata.c +++ b/net/core/hotdata.c @@ -16,5 +16,6 @@ struct net_hotdata net_hotdata __cacheline_aligned = { .tstamp_prequeue = 1, .max_backlog = 1000, + .dev_tx_weight = 64, }; EXPORT_SYMBOL(net_hotdata); diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c index 8eaeeb289914..a30016a8660e 100644 --- a/net/core/sysctl_net_core.c +++ b/net/core/sysctl_net_core.c @@ -302,7 +302,7 @@ static int proc_do_dev_weight(struct ctl_table *table, int write, if (!ret && write) { weight = READ_ONCE(weight_p); WRITE_ONCE(dev_rx_weight, weight * dev_weight_rx_bias); - WRITE_ONCE(dev_tx_weight, weight * dev_weight_tx_bias); + WRITE_ONCE(net_hotdata.dev_tx_weight, weight * dev_weight_tx_bias); } mutex_unlock(&dev_weight_mutex); diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c index 9b3e9262040b..ff5336493777 100644 --- a/net/sched/sch_generic.c +++ b/net/sched/sch_generic.c @@ -27,6 +27,7 @@ #include #include #include +#include #include #include #include @@ -409,7 +410,7 @@ static inline bool qdisc_restart(struct Qdisc *q, int *packets) void __qdisc_run(struct Qdisc *q) { - int quota = READ_ONCE(dev_tx_weight); + int quota = READ_ONCE(net_hotdata.dev_tx_weight); int packets; while (qdisc_restart(q, &packets)) { -- cgit v1.2.3 From 71c0de9bac9c1dda503322c86be4924f055dc6c9 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 6 Mar 2024 16:00:22 +0000 Subject: net: move dev_rx_weight to net_hotdata dev_rx_weight is read from process_backlog(). Move it to net_hotdata for better cache locality. 
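Its rx counterpart becomes the weight, i.e. the per-poll packet quota, of the backlog NAPI; a condensed sketch of the process_backlog() core, with the queue parameter standing in for sd->process_queue and the RPS/IPI handling omitted:

static int backlog_poll_sketch(struct sk_buff_head *queue, int quota)
{
	struct sk_buff *skb;
	int work = 0;

	/* quota was loaded from READ_ONCE(net_hotdata.dev_rx_weight) */
	while (work < quota && (skb = __skb_dequeue(queue)) != NULL) {
		__netif_receive_skb(skb);
		work++;
	}
	return work;	/* work < quota means the backlog drained */
}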
Signed-off-by: Eric Dumazet Acked-by: Soheil Hassas Yeganeh Reviewed-by: David Ahern Link: https://lore.kernel.org/r/20240306160031.874438-10-edumazet@google.com Signed-off-by: Jakub Kicinski --- include/linux/netdevice.h | 2 -- include/net/hotdata.h | 1 + net/core/dev.c | 3 +-- net/core/hotdata.c | 1 + net/core/sysctl_net_core.c | 2 +- 5 files changed, 4 insertions(+), 5 deletions(-) (limited to 'net/core/dev.c') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index ad4b031098ff..dd641297e807 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -4793,8 +4793,6 @@ void dev_fetch_sw_netstats(struct rtnl_link_stats64 *s, const struct pcpu_sw_netstats __percpu *netstats); void dev_get_tstats64(struct net_device *dev, struct rtnl_link_stats64 *s); -extern int dev_rx_weight; - enum { NESTED_SYNC_IMM_BIT, NESTED_SYNC_TODO_BIT, diff --git a/include/net/hotdata.h b/include/net/hotdata.h index ffea9cc263e5..e6595ed2c3be 100644 --- a/include/net/hotdata.h +++ b/include/net/hotdata.h @@ -22,6 +22,7 @@ struct net_hotdata { int tstamp_prequeue; int max_backlog; int dev_tx_weight; + int dev_rx_weight; }; extern struct net_hotdata net_hotdata; diff --git a/net/core/dev.c b/net/core/dev.c index 5d6bd481103f..40ba02e04bcb 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -4408,7 +4408,6 @@ unsigned int sysctl_skb_defer_max __read_mostly = 64; int weight_p __read_mostly = 64; /* old backlog weight */ int dev_weight_rx_bias __read_mostly = 1; /* bias for backlog weight */ int dev_weight_tx_bias __read_mostly = 1; /* bias for output_queue quota */ -int dev_rx_weight __read_mostly = 64; /* Called with irq disabled */ static inline void ____napi_schedule(struct softnet_data *sd, @@ -5978,7 +5977,7 @@ static int process_backlog(struct napi_struct *napi, int quota) net_rps_action_and_irq_enable(sd); } - napi->weight = READ_ONCE(dev_rx_weight); + napi->weight = READ_ONCE(net_hotdata.dev_rx_weight); while (again) { struct sk_buff *skb; diff --git a/net/core/hotdata.c b/net/core/hotdata.c index ec8c3b48e8fe..c8a7a451c18a 100644 --- a/net/core/hotdata.c +++ b/net/core/hotdata.c @@ -17,5 +17,6 @@ struct net_hotdata net_hotdata __cacheline_aligned = { .tstamp_prequeue = 1, .max_backlog = 1000, .dev_tx_weight = 64, + .dev_rx_weight = 64, }; EXPORT_SYMBOL(net_hotdata); diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c index a30016a8660e..8a4c698dad9c 100644 --- a/net/core/sysctl_net_core.c +++ b/net/core/sysctl_net_core.c @@ -301,7 +301,7 @@ static int proc_do_dev_weight(struct ctl_table *table, int write, ret = proc_dointvec(table, write, buffer, lenp, ppos); if (!ret && write) { weight = READ_ONCE(weight_p); - WRITE_ONCE(dev_rx_weight, weight * dev_weight_rx_bias); + WRITE_ONCE(net_hotdata.dev_rx_weight, weight * dev_weight_rx_bias); WRITE_ONCE(net_hotdata.dev_tx_weight, weight * dev_weight_tx_bias); } mutex_unlock(&dev_weight_mutex); -- cgit v1.2.3 From 490a79faf95e705ba0ffd9ebf04a624b379e53c9 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 6 Mar 2024 16:00:30 +0000 Subject: net: introduce include/net/rps.h Move RPS related structures and helpers from include/linux/netdevice.h and include/net/sock.h to a new include file. 
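The entry encoding documented in the new header (see the rps_sock_flow_table comment below) can be made concrete with a worked example, assuming 64 possible CPUs so that rps_cpu_mask = 0x3f and 26 bits remain for the hash:

static inline u32 rps_entry_sketch(u32 hash, unsigned int cpu)
{
	const u32 cpu_mask = 0x3f;	/* roundup_pow_of_two(64) - 1 */

	/* upper 26 bits: flow-hash prefix; lower 6 bits: last CPU */
	return (hash & ~cpu_mask) | (cpu & cpu_mask);
}

/* rps_entry_sketch(0xdeadbeef, 5) == 0xdeadbec5:
 * hash prefix 0xdeadbec0 combined with CPU number 5 */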
Signed-off-by: Eric Dumazet Acked-by: Soheil Hassas Yeganeh Reviewed-by: David Ahern Link: https://lore.kernel.org/r/20240306160031.874438-18-edumazet@google.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/intel/ice/ice_arfs.c | 1 + drivers/net/ethernet/mellanox/mlx4/en_netdev.c | 1 + drivers/net/ethernet/mellanox/mlx5/core/en_arfs.c | 1 + drivers/net/ethernet/sfc/rx_common.c | 1 + drivers/net/ethernet/sfc/siena/rx_common.c | 1 + drivers/net/tun.c | 1 + include/linux/netdevice.h | 82 -------------- include/net/rps.h | 127 ++++++++++++++++++++++ include/net/sock.h | 35 ------ net/core/dev.c | 1 + net/core/net-sysfs.c | 1 + net/core/sysctl_net_core.c | 1 + net/ipv4/af_inet.c | 1 + net/ipv4/tcp.c | 1 + net/ipv6/af_inet6.c | 1 + net/sctp/socket.c | 1 + 16 files changed, 140 insertions(+), 117 deletions(-) create mode 100644 include/net/rps.h (limited to 'net/core/dev.c') diff --git a/drivers/net/ethernet/intel/ice/ice_arfs.c b/drivers/net/ethernet/intel/ice/ice_arfs.c index cca0e753f38f..7cee365cc7d1 100644 --- a/drivers/net/ethernet/intel/ice/ice_arfs.c +++ b/drivers/net/ethernet/intel/ice/ice_arfs.c @@ -2,6 +2,7 @@ /* Copyright (C) 2018-2020, Intel Corporation. */ #include "ice.h" +#include /** * ice_is_arfs_active - helper to check is aRFS is active diff --git a/drivers/net/ethernet/mellanox/mlx4/en_netdev.c b/drivers/net/ethernet/mellanox/mlx4/en_netdev.c index d7da62cda821..5d3fde63b273 100644 --- a/drivers/net/ethernet/mellanox/mlx4/en_netdev.c +++ b/drivers/net/ethernet/mellanox/mlx4/en_netdev.c @@ -42,6 +42,7 @@ #include #include #include +#include #include #include diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_arfs.c b/drivers/net/ethernet/mellanox/mlx5/core/en_arfs.c index e66f486faafe..c7f542d0b8f0 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_arfs.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_arfs.c @@ -34,6 +34,7 @@ #include #include #include +#include #include "en.h" #define ARFS_HASH_SHIFT BITS_PER_BYTE diff --git a/drivers/net/ethernet/sfc/rx_common.c b/drivers/net/ethernet/sfc/rx_common.c index fac227d372db..dcd901eccfc8 100644 --- a/drivers/net/ethernet/sfc/rx_common.c +++ b/drivers/net/ethernet/sfc/rx_common.c @@ -11,6 +11,7 @@ #include "net_driver.h" #include #include +#include #include "efx.h" #include "nic.h" #include "rx_common.h" diff --git a/drivers/net/ethernet/sfc/siena/rx_common.c b/drivers/net/ethernet/sfc/siena/rx_common.c index 4579f43484c3..219fb358a646 100644 --- a/drivers/net/ethernet/sfc/siena/rx_common.c +++ b/drivers/net/ethernet/sfc/siena/rx_common.c @@ -11,6 +11,7 @@ #include "net_driver.h" #include #include +#include #include "efx.h" #include "nic.h" #include "rx_common.h" diff --git a/drivers/net/tun.c b/drivers/net/tun.c index 8d258e263f54..0b3f21cba552 100644 --- a/drivers/net/tun.c +++ b/drivers/net/tun.c @@ -78,6 +78,7 @@ #include #include #include +#include #include #include diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index dd641297e807..416a800d72ba 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -225,12 +225,6 @@ struct net_device_core_stats { #include #include -#ifdef CONFIG_RPS -#include -extern struct static_key_false rps_needed; -extern struct static_key_false rfs_needed; -#endif - struct neighbour; struct neigh_parms; struct sk_buff; @@ -730,86 +724,10 @@ static inline void netdev_queue_numa_node_write(struct netdev_queue *q, int node #endif } -#ifdef CONFIG_RPS -/* - * This structure holds an RPS map which can be of variable length. 
The - * map is an array of CPUs. - */ -struct rps_map { - unsigned int len; - struct rcu_head rcu; - u16 cpus[]; -}; -#define RPS_MAP_SIZE(_num) (sizeof(struct rps_map) + ((_num) * sizeof(u16))) - -/* - * The rps_dev_flow structure contains the mapping of a flow to a CPU, the - * tail pointer for that CPU's input queue at the time of last enqueue, and - * a hardware filter index. - */ -struct rps_dev_flow { - u16 cpu; - u16 filter; - unsigned int last_qtail; -}; -#define RPS_NO_FILTER 0xffff - -/* - * The rps_dev_flow_table structure contains a table of flow mappings. - */ -struct rps_dev_flow_table { - unsigned int mask; - struct rcu_head rcu; - struct rps_dev_flow flows[]; -}; -#define RPS_DEV_FLOW_TABLE_SIZE(_num) (sizeof(struct rps_dev_flow_table) + \ - ((_num) * sizeof(struct rps_dev_flow))) - -/* - * The rps_sock_flow_table contains mappings of flows to the last CPU - * on which they were processed by the application (set in recvmsg). - * Each entry is a 32bit value. Upper part is the high-order bits - * of flow hash, lower part is CPU number. - * rps_cpu_mask is used to partition the space, depending on number of - * possible CPUs : rps_cpu_mask = roundup_pow_of_two(nr_cpu_ids) - 1 - * For example, if 64 CPUs are possible, rps_cpu_mask = 0x3f, - * meaning we use 32-6=26 bits for the hash. - */ -struct rps_sock_flow_table { - u32 mask; - - u32 ents[] ____cacheline_aligned_in_smp; -}; -#define RPS_SOCK_FLOW_TABLE_SIZE(_num) (offsetof(struct rps_sock_flow_table, ents[_num])) - -#define RPS_NO_CPU 0xffff - -extern u32 rps_cpu_mask; -extern struct rps_sock_flow_table __rcu *rps_sock_flow_table; - -static inline void rps_record_sock_flow(struct rps_sock_flow_table *table, - u32 hash) -{ - if (table && hash) { - unsigned int index = hash & table->mask; - u32 val = hash & ~rps_cpu_mask; - - /* We only give a hint, preemption can change CPU under us */ - val |= raw_smp_processor_id(); - - /* The following WRITE_ONCE() is paired with the READ_ONCE() - * here, and another one in get_rps_cpu(). - */ - if (READ_ONCE(table->ents[index]) != val) - WRITE_ONCE(table->ents[index], val); - } -} - #ifdef CONFIG_RFS_ACCEL bool rps_may_expire_flow(struct net_device *dev, u16 rxq_index, u32 flow_id, u16 filter_id); #endif -#endif /* CONFIG_RPS */ /* XPS map type and offset of the xps map within net_device->xps_maps[]. */ enum xps_map_type { diff --git a/include/net/rps.h b/include/net/rps.h new file mode 100644 index 000000000000..6081d817d245 --- /dev/null +++ b/include/net/rps.h @@ -0,0 +1,127 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +#ifndef _NET_RPS_H +#define _NET_RPS_H + +#include +#include +#include + +#ifdef CONFIG_RPS + +extern struct static_key_false rps_needed; +extern struct static_key_false rfs_needed; + +/* + * This structure holds an RPS map which can be of variable length. The + * map is an array of CPUs. + */ +struct rps_map { + unsigned int len; + struct rcu_head rcu; + u16 cpus[]; +}; +#define RPS_MAP_SIZE(_num) (sizeof(struct rps_map) + ((_num) * sizeof(u16))) + +/* + * The rps_dev_flow structure contains the mapping of a flow to a CPU, the + * tail pointer for that CPU's input queue at the time of last enqueue, and + * a hardware filter index. + */ +struct rps_dev_flow { + u16 cpu; + u16 filter; + unsigned int last_qtail; +}; +#define RPS_NO_FILTER 0xffff + +/* + * The rps_dev_flow_table structure contains a table of flow mappings. 
+ */ +struct rps_dev_flow_table { + unsigned int mask; + struct rcu_head rcu; + struct rps_dev_flow flows[]; +}; +#define RPS_DEV_FLOW_TABLE_SIZE(_num) (sizeof(struct rps_dev_flow_table) + \ + ((_num) * sizeof(struct rps_dev_flow))) + +/* + * The rps_sock_flow_table contains mappings of flows to the last CPU + * on which they were processed by the application (set in recvmsg). + * Each entry is a 32bit value. Upper part is the high-order bits + * of flow hash, lower part is CPU number. + * rps_cpu_mask is used to partition the space, depending on number of + * possible CPUs : rps_cpu_mask = roundup_pow_of_two(nr_cpu_ids) - 1 + * For example, if 64 CPUs are possible, rps_cpu_mask = 0x3f, + * meaning we use 32-6=26 bits for the hash. + */ +struct rps_sock_flow_table { + u32 mask; + + u32 ents[] ____cacheline_aligned_in_smp; +}; +#define RPS_SOCK_FLOW_TABLE_SIZE(_num) (offsetof(struct rps_sock_flow_table, ents[_num])) + +#define RPS_NO_CPU 0xffff + +extern u32 rps_cpu_mask; +extern struct rps_sock_flow_table __rcu *rps_sock_flow_table; + +static inline void rps_record_sock_flow(struct rps_sock_flow_table *table, + u32 hash) +{ + unsigned int index = hash & table->mask; + u32 val = hash & ~rps_cpu_mask; + + /* We only give a hint, preemption can change CPU under us */ + val |= raw_smp_processor_id(); + + /* The following WRITE_ONCE() is paired with the READ_ONCE() + * here, and another one in get_rps_cpu(). + */ + if (READ_ONCE(table->ents[index]) != val) + WRITE_ONCE(table->ents[index], val); +} + +#endif /* CONFIG_RPS */ + +static inline void sock_rps_record_flow_hash(__u32 hash) +{ +#ifdef CONFIG_RPS + struct rps_sock_flow_table *sock_flow_table; + + if (!hash) + return; + rcu_read_lock(); + sock_flow_table = rcu_dereference(rps_sock_flow_table); + if (sock_flow_table) + rps_record_sock_flow(sock_flow_table, hash); + rcu_read_unlock(); +#endif +} + +static inline void sock_rps_record_flow(const struct sock *sk) +{ +#ifdef CONFIG_RPS + if (static_branch_unlikely(&rfs_needed)) { + /* Reading sk->sk_rxhash might incur an expensive cache line + * miss. + * + * TCP_ESTABLISHED does cover almost all states where RFS + * might be useful, and is cheaper [1] than testing : + * IPv4: inet_sk(sk)->inet_daddr + * IPv6: ipv6_addr_any(&sk->sk_v6_daddr) + * OR an additional socket flag + * [1] : sk_state and sk_prot are in the same cache line. + */ + if (sk->sk_state == TCP_ESTABLISHED) { + /* This READ_ONCE() is paired with the WRITE_ONCE() + * from sock_rps_save_rxhash() and sock_rps_reset_rxhash(). + */ + sock_rps_record_flow_hash(READ_ONCE(sk->sk_rxhash)); + } + } +#endif +} + +#endif /* _NET_RPS_H */ diff --git a/include/net/sock.h b/include/net/sock.h index 09a0cde8bf52..b5e00702acc1 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -1117,41 +1117,6 @@ static inline void sk_incoming_cpu_update(struct sock *sk) WRITE_ONCE(sk->sk_incoming_cpu, cpu); } -static inline void sock_rps_record_flow_hash(__u32 hash) -{ -#ifdef CONFIG_RPS - struct rps_sock_flow_table *sock_flow_table; - - rcu_read_lock(); - sock_flow_table = rcu_dereference(rps_sock_flow_table); - rps_record_sock_flow(sock_flow_table, hash); - rcu_read_unlock(); -#endif -} - -static inline void sock_rps_record_flow(const struct sock *sk) -{ -#ifdef CONFIG_RPS - if (static_branch_unlikely(&rfs_needed)) { - /* Reading sk->sk_rxhash might incur an expensive cache line - * miss. 
- * - * TCP_ESTABLISHED does cover almost all states where RFS - * might be useful, and is cheaper [1] than testing : - * IPv4: inet_sk(sk)->inet_daddr - * IPv6: ipv6_addr_any(&sk->sk_v6_daddr) - * OR an additional socket flag - * [1] : sk_state and sk_prot are in the same cache line. - */ - if (sk->sk_state == TCP_ESTABLISHED) { - /* This READ_ONCE() is paired with the WRITE_ONCE() - * from sock_rps_save_rxhash() and sock_rps_reset_rxhash(). - */ - sock_rps_record_flow_hash(READ_ONCE(sk->sk_rxhash)); - } - } -#endif -} static inline void sock_rps_save_rxhash(struct sock *sk, const struct sk_buff *skb) diff --git a/net/core/dev.c b/net/core/dev.c index 40ba02e04bcb..bcf49b0393d2 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -155,6 +155,7 @@ #include #include #include +#include <net/rps.h> #include "dev.h" #include "net-sysfs.h" diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c index af238026ac3c..5560083774b1 100644 --- a/net/core/net-sysfs.c +++ b/net/core/net-sysfs.c @@ -24,6 +24,7 @@ #include #include #include +#include <net/rps.h> #include "dev.h" #include "net-sysfs.h" diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c index 8a4c698dad9c..4b93e27404e8 100644 --- a/net/core/sysctl_net_core.c +++ b/net/core/sysctl_net_core.c @@ -24,6 +24,7 @@ #include #include #include +#include <net/rps.h> #include "dev.h" diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index 6f1cfd176e7b..55bd72997b31 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -119,6 +119,7 @@ #endif #include #include +#include <net/rps.h> #include diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 7e1b848398d0..c5b83875411a 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -279,6 +279,7 @@ #include #include #include +#include <net/rps.h> /* Track pending CMSGs. */ enum { diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c index b90d46533cdc..8041dc181bd4 100644 --- a/net/ipv6/af_inet6.c +++ b/net/ipv6/af_inet6.c @@ -64,6 +64,7 @@ #include #include #include +#include <net/rps.h> #include #include diff --git a/net/sctp/socket.c b/net/sctp/socket.c index 6b9fcdb0952a..c67679a41044 100644 --- a/net/sctp/socket.c +++ b/net/sctp/socket.c @@ -67,6 +67,7 @@ #include #include #include +#include <net/rps.h> /* Forward declarations for internal helper functions. */ static bool sctp_writeable(const struct sock *sk); -- cgit v1.2.3 From ce7f49ab741591d83e33e56948bac2f12de6e14e Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 6 Mar 2024 16:00:31 +0000 Subject: net: move rps_sock_flow_table to net_hotdata rps_sock_flow_table and rps_cpu_mask are used in fast path. Move them to net_hotdata for better cache locality.
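For readers following along: each rps_sock_flow_table entry packs the CPU recorded at recvmsg() time into its low bits and the flow hash into the remaining high bits, exactly as the table comment in rps.h describes. A minimal userspace sketch of that bit partition (NR_CPU_IDS, record_flow() and the sample hash value are illustrative stand-ins, not kernel names):

	#include <stdio.h>
	#include <stdint.h>

	#define NR_CPU_IDS	64			/* assumed number of possible CPUs */
	#define RPS_CPU_MASK	(NR_CPU_IDS - 1)	/* 0x3f: 6 CPU bits, 26 hash bits */

	/* What rps_record_sock_flow() stores on recvmsg(): hash high bits plus CPU. */
	static uint32_t record_flow(uint32_t hash, uint32_t cpu)
	{
		return (hash & ~(uint32_t)RPS_CPU_MASK) | (cpu & RPS_CPU_MASK);
	}

	int main(void)
	{
		uint32_t hash = 0xdeadbeef;
		uint32_t ent = record_flow(hash, 5);

		/* What get_rps_cpu() does on rx: trust the entry only if its high
		 * bits still match the packet hash; otherwise another flow hashed
		 * into the same slot and the hint is stale.
		 */
		if (((ent ^ hash) & ~(uint32_t)RPS_CPU_MASK) == 0)
			printf("steer flow %#x to CPU %u\n",
			       (unsigned)hash, (unsigned)(ent & RPS_CPU_MASK));
		return 0;
	}

Packing both values into a single u32 is what lets the rx path validate and steer with one word-sized load, which in turn is why these accesses need only the READ_ONCE()/WRITE_ONCE() pairing seen in the hunks below rather than a lock.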
Signed-off-by: Eric Dumazet Acked-by: Soheil Hassas Yeganeh Reviewed-by: David Ahern Link: https://lore.kernel.org/r/20240306160031.874438-19-edumazet@google.com Signed-off-by: Jakub Kicinski --- include/net/hotdata.h | 4 ++++ include/net/rps.h | 8 +++----- net/core/dev.c | 12 +++--------- net/core/sysctl_net_core.c | 9 ++++++--- 4 files changed, 16 insertions(+), 17 deletions(-) (limited to 'net/core/dev.c') diff --git a/include/net/hotdata.h b/include/net/hotdata.h index b0b847585f7e..003667a1efd6 100644 --- a/include/net/hotdata.h +++ b/include/net/hotdata.h @@ -27,6 +27,10 @@ struct net_hotdata { struct kmem_cache *skbuff_cache; struct kmem_cache *skbuff_fclone_cache; struct kmem_cache *skb_small_head_cache; +#ifdef CONFIG_RPS + struct rps_sock_flow_table __rcu *rps_sock_flow_table; + u32 rps_cpu_mask; +#endif int gro_normal_batch; int netdev_budget; int netdev_budget_usecs; diff --git a/include/net/rps.h b/include/net/rps.h index 6081d817d245..7660243e905b 100644 --- a/include/net/rps.h +++ b/include/net/rps.h @@ -5,6 +5,7 @@ #include <linux/types.h> #include <linux/static_key.h> #include <net/sock.h> +#include <net/hotdata.h> #ifdef CONFIG_RPS @@ -64,14 +65,11 @@ struct rps_sock_flow_table { #define RPS_NO_CPU 0xffff -extern u32 rps_cpu_mask; -extern struct rps_sock_flow_table __rcu *rps_sock_flow_table; - static inline void rps_record_sock_flow(struct rps_sock_flow_table *table, u32 hash) { unsigned int index = hash & table->mask; - u32 val = hash & ~rps_cpu_mask; + u32 val = hash & ~net_hotdata.rps_cpu_mask; /* We only give a hint, preemption can change CPU under us */ val |= raw_smp_processor_id(); @@ -93,7 +91,7 @@ static inline void sock_rps_record_flow_hash(__u32 hash) if (!hash) return; rcu_read_lock(); - sock_flow_table = rcu_dereference(rps_sock_flow_table); + sock_flow_table = rcu_dereference(net_hotdata.rps_sock_flow_table); if (sock_flow_table) rps_record_sock_flow(sock_flow_table, hash); rcu_read_unlock(); diff --git a/net/core/dev.c b/net/core/dev.c index bcf49b0393d2..0766a245816b 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -4450,12 +4450,6 @@ static inline void ____napi_schedule(struct softnet_data *sd, #ifdef CONFIG_RPS -/* One global table that all flow-based protocols share. */ -struct rps_sock_flow_table __rcu *rps_sock_flow_table __read_mostly; -EXPORT_SYMBOL(rps_sock_flow_table); -u32 rps_cpu_mask __read_mostly; -EXPORT_SYMBOL(rps_cpu_mask); - struct static_key_false rps_needed __read_mostly; EXPORT_SYMBOL(rps_needed); struct static_key_false rfs_needed __read_mostly; @@ -4547,7 +4541,7 @@ static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb, if (!hash) goto done; - sock_flow_table = rcu_dereference(rps_sock_flow_table); + sock_flow_table = rcu_dereference(net_hotdata.rps_sock_flow_table); if (flow_table && sock_flow_table) { struct rps_dev_flow *rflow; u32 next_cpu; @@ -4557,10 +4551,10 @@ static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb, * This READ_ONCE() pairs with WRITE_ONCE() from rps_record_sock_flow().
*/ ident = READ_ONCE(sock_flow_table->ents[hash & sock_flow_table->mask]); - if ((ident ^ hash) & ~rps_cpu_mask) + if ((ident ^ hash) & ~net_hotdata.rps_cpu_mask) goto try_rps; - next_cpu = ident & rps_cpu_mask; + next_cpu = ident & net_hotdata.rps_cpu_mask; /* OK, now we know there is a match, * we can look at the local (per receive queue) flow table diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c index 4b93e27404e8..6973dda3abda 100644 --- a/net/core/sysctl_net_core.c +++ b/net/core/sysctl_net_core.c @@ -140,7 +140,8 @@ static int rps_sock_flow_sysctl(struct ctl_table *table, int write, mutex_lock(&sock_flow_mutex); - orig_sock_table = rcu_dereference_protected(rps_sock_flow_table, + orig_sock_table = rcu_dereference_protected( + net_hotdata.rps_sock_flow_table, lockdep_is_held(&sock_flow_mutex)); size = orig_size = orig_sock_table ? orig_sock_table->mask + 1 : 0; @@ -161,7 +162,8 @@ static int rps_sock_flow_sysctl(struct ctl_table *table, int write, mutex_unlock(&sock_flow_mutex); return -ENOMEM; } - rps_cpu_mask = roundup_pow_of_two(nr_cpu_ids) - 1; + net_hotdata.rps_cpu_mask = + roundup_pow_of_two(nr_cpu_ids) - 1; sock_table->mask = size - 1; } else sock_table = orig_sock_table; @@ -172,7 +174,8 @@ static int rps_sock_flow_sysctl(struct ctl_table *table, int write, sock_table = NULL; if (sock_table != orig_sock_table) { - rcu_assign_pointer(rps_sock_flow_table, sock_table); + rcu_assign_pointer(net_hotdata.rps_sock_flow_table, + sock_table); if (sock_table) { static_branch_inc(&rps_needed); static_branch_inc(&rfs_needed); -- cgit v1.2.3 From 6ebfad33161afacb3e1e59ed1c2feefef70f9f97 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 14 Mar 2024 14:18:16 +0000 Subject: packet: annotate data-races around ignore_outgoing ignore_outgoing is read locklessly from dev_queue_xmit_nit() and packet_getsockopt() Add appropriate READ_ONCE()/WRITE_ONCE() annotations. 
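The annotation pattern itself is mechanical: the writer (which holds a lock) becomes WRITE_ONCE() and the lockless reader becomes READ_ONCE(), so the compiler can neither tear nor fuse the accesses and KCSAN knows the race is intentional. A small userspace sketch of the same pattern (the volatile-based macros and the ignore_outgoing variable here are simplified stand-ins, not the kernel definitions):

	#include <pthread.h>
	#include <stdio.h>

	/* Simplified models of the kernel's READ_ONCE()/WRITE_ONCE():
	 * volatile accesses force exactly one load/store per use, which is
	 * all a racy one-byte flag like this needs.
	 */
	#define READ_ONCE(x)		(*(const volatile __typeof__(x) *)&(x))
	#define WRITE_ONCE(x, val)	(*(volatile __typeof__(x) *)&(x) = (val))

	static unsigned char ignore_outgoing;	/* models po->prot_hook.ignore_outgoing */

	static void *setter(void *arg)		/* models packet_setsockopt() */
	{
		(void)arg;
		WRITE_ONCE(ignore_outgoing, 1);
		return NULL;
	}

	int main(void)				/* models dev_queue_xmit_nit() reading the flag */
	{
		pthread_t t;

		pthread_create(&t, NULL, setter, NULL);
		while (!READ_ONCE(ignore_outgoing))
			;	/* each iteration issues a fresh load */
		pthread_join(t, NULL);
		puts("flag observed");
		return 0;
	}

The value itself may still be observed slightly stale, which is fine here; the annotations only rule out compiler mischief and document the race as benign, as the KCSAN report below shows.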
syzbot reported: BUG: KCSAN: data-race in dev_queue_xmit_nit / packet_setsockopt write to 0xffff888107804542 of 1 bytes by task 22618 on cpu 0: packet_setsockopt+0xd83/0xfd0 net/packet/af_packet.c:4003 do_sock_setsockopt net/socket.c:2311 [inline] __sys_setsockopt+0x1d8/0x250 net/socket.c:2334 __do_sys_setsockopt net/socket.c:2343 [inline] __se_sys_setsockopt net/socket.c:2340 [inline] __x64_sys_setsockopt+0x66/0x80 net/socket.c:2340 do_syscall_64+0xd3/0x1d0 entry_SYSCALL_64_after_hwframe+0x6d/0x75 read to 0xffff888107804542 of 1 bytes by task 27 on cpu 1: dev_queue_xmit_nit+0x82/0x620 net/core/dev.c:2248 xmit_one net/core/dev.c:3527 [inline] dev_hard_start_xmit+0xcc/0x3f0 net/core/dev.c:3547 __dev_queue_xmit+0xf24/0x1dd0 net/core/dev.c:4335 dev_queue_xmit include/linux/netdevice.h:3091 [inline] batadv_send_skb_packet+0x264/0x300 net/batman-adv/send.c:108 batadv_send_broadcast_skb+0x24/0x30 net/batman-adv/send.c:127 batadv_iv_ogm_send_to_if net/batman-adv/bat_iv_ogm.c:392 [inline] batadv_iv_ogm_emit net/batman-adv/bat_iv_ogm.c:420 [inline] batadv_iv_send_outstanding_bat_ogm_packet+0x3f0/0x4b0 net/batman-adv/bat_iv_ogm.c:1700 process_one_work kernel/workqueue.c:3254 [inline] process_scheduled_works+0x465/0x990 kernel/workqueue.c:3335 worker_thread+0x526/0x730 kernel/workqueue.c:3416 kthread+0x1d1/0x210 kernel/kthread.c:388 ret_from_fork+0x4b/0x60 arch/x86/kernel/process.c:147 ret_from_fork_asm+0x1a/0x30 arch/x86/entry/entry_64.S:243 value changed: 0x00 -> 0x01 Reported by Kernel Concurrency Sanitizer on: CPU: 1 PID: 27 Comm: kworker/u8:1 Tainted: G W 6.8.0-syzkaller-08073-g480e035fc4c7 #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 02/29/2024 Workqueue: bat_events batadv_iv_send_outstanding_bat_ogm_packet Fixes: fa788d986a3a ("packet: add sockopt to ignore outgoing packets") Reported-by: syzbot+c669c1136495a2e7c31f@syzkaller.appspotmail.com Closes: https://lore.kernel.org/netdev/CANn89i+Z7MfbkBLOv=p7KZ7=K1rKHO4P1OL5LYDCtBiyqsa9oQ@mail.gmail.com/T/#t Signed-off-by: Eric Dumazet Cc: Willem de Bruijn Reviewed-by: Willem de Bruijn Reviewed-by: Jason Xing Signed-off-by: David S. 
Miller --- net/core/dev.c | 2 +- net/packet/af_packet.c | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'net/core/dev.c') diff --git a/net/core/dev.c b/net/core/dev.c index 0766a245816b..722787c32755 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2245,7 +2245,7 @@ void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev) rcu_read_lock(); again: list_for_each_entry_rcu(ptype, ptype_list, list) { - if (ptype->ignore_outgoing) + if (READ_ONCE(ptype->ignore_outgoing)) continue; /* Never send packets back to the socket diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index 61270826b9ac..7cfc7d301508 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -4000,7 +4000,7 @@ packet_setsockopt(struct socket *sock, int level, int optname, sockptr_t optval, if (val < 0 || val > 1) return -EINVAL; - po->prot_hook.ignore_outgoing = !!val; + WRITE_ONCE(po->prot_hook.ignore_outgoing, !!val); return 0; } case PACKET_TX_HAS_OFF: @@ -4134,7 +4134,7 @@ static int packet_getsockopt(struct socket *sock, int level, int optname, 0); break; case PACKET_IGNORE_OUTGOING: - val = po->prot_hook.ignore_outgoing; + val = READ_ONCE(po->prot_hook.ignore_outgoing); break; case PACKET_ROLLOVER_STATS: if (!po->rollover) -- cgit v1.2.3 From f6e0a4984c2e7244689ea87b62b433bed9d07e94 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 14 Mar 2024 20:08:45 +0000 Subject: net: move dev->state into net_device_read_txrx group dev->state can be read in rx and tx fast paths. netif_running() which needs dev->state is called from - enqueue_to_backlog() [RX path] - __dev_direct_xmit() [TX path] Fixes: 43a71cd66b9c ("net-device: reorganize net_device fast path variables") Signed-off-by: Eric Dumazet Cc: Coco Li Reviewed-by: Jiri Pirko Link: https://lore.kernel.org/r/20240314200845.3050179-1-edumazet@google.com Signed-off-by: Paolo Abeni --- Documentation/networking/net_cachelines/net_device.rst | 2 +- include/linux/netdevice.h | 2 +- net/core/dev.c | 3 ++- 3 files changed, 4 insertions(+), 3 deletions(-) (limited to 'net/core/dev.c') diff --git a/Documentation/networking/net_cachelines/net_device.rst b/Documentation/networking/net_cachelines/net_device.rst index dceb49d56a91..70c4fb9d4e5c 100644 --- a/Documentation/networking/net_cachelines/net_device.rst +++ b/Documentation/networking/net_cachelines/net_device.rst @@ -13,7 +13,7 @@ struct_dev_ifalias* ifalias unsigned_long mem_end unsigned_long mem_start unsigned_long base_addr -unsigned_long state +unsigned_long state read_mostly read_mostly netif_running(dev) struct_list_head dev_list struct_list_head napi_list struct_list_head unreg_list diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index c6f6ac779b34..cb37817d6382 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -2072,6 +2072,7 @@ struct net_device { struct pcpu_sw_netstats __percpu *tstats; struct pcpu_dstats __percpu *dstats; }; + unsigned long state; unsigned int flags; unsigned short hard_header_len; netdev_features_t features; @@ -2117,7 +2118,6 @@ struct net_device { * part of the usual set specified in Space.c. 
*/ - unsigned long state; struct list_head dev_list; struct list_head napi_list; diff --git a/net/core/dev.c b/net/core/dev.c index 722787c32755..303a6ff46e4e 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -11665,11 +11665,12 @@ static void __init net_dev_struct_check(void) /* TXRX read-mostly hotpath */ CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_txrx, lstats); + CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_txrx, state); CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_txrx, flags); CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_txrx, hard_header_len); CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_txrx, features); CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_txrx, ip6_ptr); - CACHELINE_ASSERT_GROUP_SIZE(struct net_device, net_device_read_txrx, 38); + CACHELINE_ASSERT_GROUP_SIZE(struct net_device, net_device_read_txrx, 46); /* RX read-mostly hotpath */ CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, ptype_specific); -- cgit v1.2.3 From d6dbbb11247c71203785a2c9da474c36f4b19eae Mon Sep 17 00:00:00 2001 From: Yan Zhai Date: Tue, 19 Mar 2024 13:44:37 -0700 Subject: net: report RCU QS on threaded NAPI repolling NAPI threads can keep polling packets under load. Currently it is only calling cond_resched() before repolling, but it is not sufficient to clear out the holdout of RCU tasks, which prevent BPF tracing programs from detaching for long period. This can be reproduced easily with following set up: ip netns add test1 ip netns add test2 ip -n test1 link add veth1 type veth peer name veth2 netns test2 ip -n test1 link set veth1 up ip -n test1 link set lo up ip -n test2 link set veth2 up ip -n test2 link set lo up ip -n test1 addr add 192.168.1.2/31 dev veth1 ip -n test1 addr add 1.1.1.1/32 dev lo ip -n test2 addr add 192.168.1.3/31 dev veth2 ip -n test2 addr add 2.2.2.2/31 dev lo ip -n test1 route add default via 192.168.1.3 ip -n test2 route add default via 192.168.1.2 for i in `seq 10 210`; do for j in `seq 10 210`; do ip netns exec test2 iptables -I INPUT -s 3.3.$i.$j -p udp --dport 5201 done done ip netns exec test2 ethtool -K veth2 gro on ip netns exec test2 bash -c 'echo 1 > /sys/class/net/veth2/threaded' ip netns exec test1 ethtool -K veth1 tso off Then run an iperf3 client/server and a bpftrace script can trigger it: ip netns exec test2 iperf3 -s -B 2.2.2.2 >/dev/null& ip netns exec test1 iperf3 -c 2.2.2.2 -B 1.1.1.1 -u -l 1500 -b 3g -t 100 >/dev/null& bpftrace -e 'kfunc:__napi_poll{@=count();} interval:s:1{exit();}' Report RCU quiescent states periodically will resolve the issue. Fixes: 29863d41bb6e ("net: implement threaded-able napi poll loop support") Reviewed-by: Jesper Dangaard Brouer Signed-off-by: Yan Zhai Acked-by: Paul E. McKenney Acked-by: Jesper Dangaard Brouer Link: https://lore.kernel.org/r/4c3b0d3f32d3b18949d75b18e5e1d9f13a24f025.1710877680.git.yan@cloudflare.com Signed-off-by: Jakub Kicinski --- net/core/dev.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'net/core/dev.c') diff --git a/net/core/dev.c b/net/core/dev.c index 303a6ff46e4e..9a67003e49db 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -6743,6 +6743,8 @@ static int napi_threaded_poll(void *data) void *have; while (!napi_thread_wait(napi)) { + unsigned long last_qs = jiffies; + for (;;) { bool repoll = false; @@ -6767,6 +6769,7 @@ static int napi_threaded_poll(void *data) if (!repoll) break; + rcu_softirq_qs_periodic(last_qs); cond_resched(); } } -- cgit v1.2.3
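The one-line fix above works because rcu_softirq_qs_periodic() only reports a quiescent state once enough time has passed since last_qs, so the common repoll iteration stays cheap. A userspace model of that rate-limited pattern (now_ms(), report_qs_periodic() and the ~100 ms period are illustrative assumptions standing in for jiffies, the RCU helper and its internal threshold):

	#include <stdio.h>
	#include <time.h>
	#include <stdbool.h>

	static long now_ms(void)		/* models jiffies with a millisecond clock */
	{
		struct timespec ts;

		clock_gettime(CLOCK_MONOTONIC, &ts);
		return ts.tv_sec * 1000 + ts.tv_nsec / 1000000;
	}

	/* Models rcu_softirq_qs_periodic(): cheap check on every call, the
	 * expensive report only when the period has elapsed.
	 */
	static void report_qs_periodic(long *last_qs)
	{
		if (now_ms() - *last_qs >= 100) {	/* assumed ~100 ms period */
			puts("quiescent state reported");	/* models rcu_softirq_qs() */
			*last_qs = now_ms();
		}
	}

	int main(void)
	{
		long start = now_ms();
		long last_qs = start;	/* the patch samples jiffies once on loop entry */

		/* Busy repoll loop, like napi_threaded_poll() under constant load;
		 * we bound it by wall time so the demo terminates.
		 */
		while (now_ms() - start < 350) {
			/* ... per-iteration poll work (__napi_poll() in the kernel) ... */
			report_qs_periodic(&last_qs);
		}
		return 0;
	}

cond_resched() alone was not enough because a NAPI kthread that always has work never sleeps, so RCU Tasks could wait on it indefinitely; the periodic explicit report breaks exactly that holdout.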