From 12a686c2e761f1f1f6e6e2117a9ab9c6de2ac8a7 Mon Sep 17 00:00:00 2001 From: Adam Li Date: Mon, 26 Feb 2024 02:24:52 +0000 Subject: net: make SK_MEMORY_PCPU_RESERV tunable This patch adds /proc/sys/net/core/mem_pcpu_rsv sysctl file, to make SK_MEMORY_PCPU_RESERVE tunable. Commit 3cd3399dd7a8 ("net: implement per-cpu reserves for memory_allocated") introduced per-cpu forward alloc cache: "Implement a per-cpu cache of +1/-1 MB, to reduce number of changes to sk->sk_prot->memory_allocated, which would otherwise be cause of false sharing." sk_prot->memory_allocated points to global atomic variable: atomic_long_t tcp_memory_allocated ____cacheline_aligned_in_smp; If increasing the per-cpu cache size from 1MB to e.g. 16MB, changes to sk->sk_prot->memory_allocated can be further reduced. Performance may be improved on systems with many cores. Signed-off-by: Adam Li Reviewed-by: Christoph Lameter (Ampere) Signed-off-by: David S. Miller --- net/core/sysctl_net_core.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'net/core/sysctl_net_core.c') diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c index 0f0cb1465e08..986f15e5d6c4 100644 --- a/net/core/sysctl_net_core.c +++ b/net/core/sysctl_net_core.c @@ -30,6 +30,7 @@ static int int_3600 = 3600; static int min_sndbuf = SOCK_MIN_SNDBUF; static int min_rcvbuf = SOCK_MIN_RCVBUF; static int max_skb_frags = MAX_SKB_FRAGS; +static int min_mem_pcpu_rsv = SK_MEMORY_PCPU_RESERVE; static int net_msg_warn; /* Unused, but still a sysctl */ @@ -407,6 +408,14 @@ static struct ctl_table net_core_table[] = { .proc_handler = proc_dointvec_minmax, .extra1 = &min_rcvbuf, }, + { + .procname = "mem_pcpu_rsv", + .data = &sysctl_mem_pcpu_rsv, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &min_mem_pcpu_rsv, + }, { .procname = "dev_weight", .data = &weight_p, -- cgit v1.2.3 From 2658b5a8a4eee5fad378d0bde2f221deacbc58f1 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 6 Mar 
2024 16:00:14 +0000 Subject: net: introduce struct net_hotdata Instead of spreading networking critical fields all over the places, add a custom net_hotdata structure so that we can precisely control its layout. In this first patch, move : - gro_normal_batch used in rx (GRO stack) - offload_base used in rx and tx (GRO and TSO stacks) Signed-off-by: Eric Dumazet Acked-by: Soheil Hassas Yeganeh Reviewed-by: David Ahern Link: https://lore.kernel.org/r/20240306160031.874438-2-edumazet@google.com Signed-off-by: Jakub Kicinski --- include/linux/netdevice.h | 1 - include/net/gro.h | 5 ++--- include/net/hotdata.h | 15 +++++++++++++++ net/core/Makefile | 1 + net/core/gro.c | 15 ++++++--------- net/core/gso.c | 4 ++-- net/core/hotdata.c | 9 +++++++++ net/core/sysctl_net_core.c | 3 ++- 8 files changed, 37 insertions(+), 16 deletions(-) create mode 100644 include/net/hotdata.h create mode 100644 net/core/hotdata.c (limited to 'net/core/sysctl_net_core.c') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 2767467138a0..6643452af543 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -4796,7 +4796,6 @@ void dev_get_tstats64(struct net_device *dev, struct rtnl_link_stats64 *s); extern int netdev_max_backlog; extern int dev_rx_weight; extern int dev_tx_weight; -extern int gro_normal_batch; enum { NESTED_SYNC_IMM_BIT, diff --git a/include/net/gro.h b/include/net/gro.h index 2b58671a6549..d6fc8fbd3730 100644 --- a/include/net/gro.h +++ b/include/net/gro.h @@ -9,6 +9,7 @@ #include #include #include +#include struct napi_gro_cb { union { @@ -446,7 +447,7 @@ static inline void gro_normal_one(struct napi_struct *napi, struct sk_buff *skb, { list_add_tail(&skb->list, &napi->rx_list); napi->rx_count += segs; - if (napi->rx_count >= READ_ONCE(gro_normal_batch)) + if (napi->rx_count >= READ_ONCE(net_hotdata.gro_normal_batch)) gro_normal_list(napi); } @@ -493,6 +494,4 @@ static inline void inet6_get_iif_sdif(const struct sk_buff *skb, int *iif, 
int * #endif } -extern struct list_head offload_base; - #endif /* _NET_IPV6_GRO_H */ diff --git a/include/net/hotdata.h b/include/net/hotdata.h new file mode 100644 index 000000000000..6ed32e4e34aa --- /dev/null +++ b/include/net/hotdata.h @@ -0,0 +1,15 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +#ifndef _NET_HOTDATA_H +#define _NET_HOTDATA_H + +#include + +/* Read mostly data used in network fast paths. */ +struct net_hotdata { + struct list_head offload_base; + int gro_normal_batch; +}; + +extern struct net_hotdata net_hotdata; + +#endif /* _NET_HOTDATA_H */ diff --git a/net/core/Makefile b/net/core/Makefile index 821aec06abf1..6e6548011fae 100644 --- a/net/core/Makefile +++ b/net/core/Makefile @@ -18,6 +18,7 @@ obj-y += dev.o dev_addr_lists.o dst.o netevent.o \ obj-$(CONFIG_NETDEV_ADDR_LIST_TEST) += dev_addr_lists_test.o obj-y += net-sysfs.o +obj-y += hotdata.o obj-$(CONFIG_PAGE_POOL) += page_pool.o page_pool_user.o obj-$(CONFIG_PROC_FS) += net-procfs.o obj-$(CONFIG_NET_PKTGEN) += pktgen.o diff --git a/net/core/gro.c b/net/core/gro.c index 6a0edbd826a1..ee30d4f0c038 100644 --- a/net/core/gro.c +++ b/net/core/gro.c @@ -10,9 +10,6 @@ #define GRO_MAX_HEAD (MAX_HEADER + 128) static DEFINE_SPINLOCK(offload_lock); -struct list_head offload_base __read_mostly = LIST_HEAD_INIT(offload_base); -/* Maximum number of GRO_NORMAL skbs to batch up for list-RX */ -int gro_normal_batch __read_mostly = 8; /** * dev_add_offload - register offload handlers @@ -31,7 +28,7 @@ void dev_add_offload(struct packet_offload *po) struct packet_offload *elem; spin_lock(&offload_lock); - list_for_each_entry(elem, &offload_base, list) { + list_for_each_entry(elem, &net_hotdata.offload_base, list) { if (po->priority < elem->priority) break; } @@ -55,7 +52,7 @@ EXPORT_SYMBOL(dev_add_offload); */ static void __dev_remove_offload(struct packet_offload *po) { - struct list_head *head = &offload_base; + struct list_head *head = &net_hotdata.offload_base; struct packet_offload *po1; 
spin_lock(&offload_lock); @@ -235,9 +232,9 @@ done: static void napi_gro_complete(struct napi_struct *napi, struct sk_buff *skb) { + struct list_head *head = &net_hotdata.offload_base; struct packet_offload *ptype; __be16 type = skb->protocol; - struct list_head *head = &offload_base; int err = -ENOENT; BUILD_BUG_ON(sizeof(struct napi_gro_cb) > sizeof(skb->cb)); @@ -444,7 +441,7 @@ static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff { u32 bucket = skb_get_hash_raw(skb) & (GRO_HASH_BUCKETS - 1); struct gro_list *gro_list = &napi->gro_hash[bucket]; - struct list_head *head = &offload_base; + struct list_head *head = &net_hotdata.offload_base; struct packet_offload *ptype; __be16 type = skb->protocol; struct sk_buff *pp = NULL; @@ -550,7 +547,7 @@ normal: struct packet_offload *gro_find_receive_by_type(__be16 type) { - struct list_head *offload_head = &offload_base; + struct list_head *offload_head = &net_hotdata.offload_base; struct packet_offload *ptype; list_for_each_entry_rcu(ptype, offload_head, list) { @@ -564,7 +561,7 @@ EXPORT_SYMBOL(gro_find_receive_by_type); struct packet_offload *gro_find_complete_by_type(__be16 type) { - struct list_head *offload_head = &offload_base; + struct list_head *offload_head = &net_hotdata.offload_base; struct packet_offload *ptype; list_for_each_entry_rcu(ptype, offload_head, list) { diff --git a/net/core/gso.c b/net/core/gso.c index 9e1803bfc9c6..bcd156372f4d 100644 --- a/net/core/gso.c +++ b/net/core/gso.c @@ -17,7 +17,7 @@ struct sk_buff *skb_eth_gso_segment(struct sk_buff *skb, struct packet_offload *ptype; rcu_read_lock(); - list_for_each_entry_rcu(ptype, &offload_base, list) { + list_for_each_entry_rcu(ptype, &net_hotdata.offload_base, list) { if (ptype->type == type && ptype->callbacks.gso_segment) { segs = ptype->callbacks.gso_segment(skb, features); break; @@ -48,7 +48,7 @@ struct sk_buff *skb_mac_gso_segment(struct sk_buff *skb, __skb_pull(skb, vlan_depth); rcu_read_lock(); - 
list_for_each_entry_rcu(ptype, &offload_base, list) { + list_for_each_entry_rcu(ptype, &net_hotdata.offload_base, list) { if (ptype->type == type && ptype->callbacks.gso_segment) { segs = ptype->callbacks.gso_segment(skb, features); break; diff --git a/net/core/hotdata.c b/net/core/hotdata.c new file mode 100644 index 000000000000..abb8ad19d59a --- /dev/null +++ b/net/core/hotdata.c @@ -0,0 +1,9 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +#include +#include +#include + +struct net_hotdata net_hotdata __cacheline_aligned = { + .offload_base = LIST_HEAD_INIT(net_hotdata.offload_base), + .gro_normal_batch = 8, +}; diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c index 986f15e5d6c4..0eb1242eabbe 100644 --- a/net/core/sysctl_net_core.c +++ b/net/core/sysctl_net_core.c @@ -23,6 +23,7 @@ #include #include #include +#include #include "dev.h" @@ -632,7 +633,7 @@ static struct ctl_table net_core_table[] = { }, { .procname = "gro_normal_batch", - .data = &gro_normal_batch, + .data = &net_hotdata.gro_normal_batch, .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_dointvec_minmax, -- cgit v1.2.3 From ae6e22f7b7f0702015d86cfa036492b94be92f04 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 6 Mar 2024 16:00:15 +0000 Subject: net: move netdev_budget and netdev_budget_usecs to net_hotdata netdev_budget and netdev_budget_usecs are used in rx path (net_rx_action()) Move them into net_hotdata for better cache locality. 
Signed-off-by: Eric Dumazet Acked-by: Soheil Hassas Yeganeh Reviewed-by: David Ahern Link: https://lore.kernel.org/r/20240306160031.874438-3-edumazet@google.com Signed-off-by: Jakub Kicinski --- include/net/hotdata.h | 2 ++ net/core/dev.c | 7 ++----- net/core/dev.h | 2 -- net/core/hotdata.c | 6 ++++++ net/core/sysctl_net_core.c | 4 ++-- 5 files changed, 12 insertions(+), 9 deletions(-) (limited to 'net/core/sysctl_net_core.c') diff --git a/include/net/hotdata.h b/include/net/hotdata.h index 6ed32e4e34aa..72170223385e 100644 --- a/include/net/hotdata.h +++ b/include/net/hotdata.h @@ -8,6 +8,8 @@ struct net_hotdata { struct list_head offload_base; int gro_normal_batch; + int netdev_budget; + int netdev_budget_usecs; }; extern struct net_hotdata net_hotdata; diff --git a/net/core/dev.c b/net/core/dev.c index bf933eeaa688..9ccb48618dba 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -4410,9 +4410,6 @@ EXPORT_SYMBOL(netdev_max_backlog); int netdev_tstamp_prequeue __read_mostly = 1; unsigned int sysctl_skb_defer_max __read_mostly = 64; -int netdev_budget __read_mostly = 300; -/* Must be at least 2 jiffes to guarantee 1 jiffy timeout */ -unsigned int __read_mostly netdev_budget_usecs = 2 * USEC_PER_SEC / HZ; int weight_p __read_mostly = 64; /* old backlog weight */ int dev_weight_rx_bias __read_mostly = 1; /* bias for backlog weight */ int dev_weight_tx_bias __read_mostly = 1; /* bias for output_queue quota */ @@ -6790,8 +6787,8 @@ static __latent_entropy void net_rx_action(struct softirq_action *h) { struct softnet_data *sd = this_cpu_ptr(&softnet_data); unsigned long time_limit = jiffies + - usecs_to_jiffies(READ_ONCE(netdev_budget_usecs)); - int budget = READ_ONCE(netdev_budget); + usecs_to_jiffies(READ_ONCE(net_hotdata.netdev_budget_usecs)); + int budget = READ_ONCE(net_hotdata.netdev_budget); LIST_HEAD(list); LIST_HEAD(repoll); diff --git a/net/core/dev.h b/net/core/dev.h index 45892267848d..9a6170530850 100644 --- a/net/core/dev.h +++ b/net/core/dev.h @@ -38,8 
+38,6 @@ int dev_addr_init(struct net_device *dev); void dev_addr_check(struct net_device *dev); /* sysctls not referred to from outside net/core/ */ -extern int netdev_budget; -extern unsigned int netdev_budget_usecs; extern unsigned int sysctl_skb_defer_max; extern int netdev_tstamp_prequeue; extern int netdev_unregister_timeout_secs; diff --git a/net/core/hotdata.c b/net/core/hotdata.c index abb8ad19d59a..907d69120397 100644 --- a/net/core/hotdata.c +++ b/net/core/hotdata.c @@ -1,9 +1,15 @@ // SPDX-License-Identifier: GPL-2.0-or-later #include #include +#include #include + struct net_hotdata net_hotdata __cacheline_aligned = { .offload_base = LIST_HEAD_INIT(net_hotdata.offload_base), .gro_normal_batch = 8, + + .netdev_budget = 300, + /* Must be at least 2 jiffes to guarantee 1 jiffy timeout */ + .netdev_budget_usecs = 2 * USEC_PER_SEC / HZ, }; diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c index 0eb1242eabbe..a9c2d798b219 100644 --- a/net/core/sysctl_net_core.c +++ b/net/core/sysctl_net_core.c @@ -577,7 +577,7 @@ static struct ctl_table net_core_table[] = { #endif { .procname = "netdev_budget", - .data = &netdev_budget, + .data = &net_hotdata.netdev_budget, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec @@ -600,7 +600,7 @@ static struct ctl_table net_core_table[] = { }, { .procname = "netdev_budget_usecs", - .data = &netdev_budget_usecs, + .data = &net_hotdata.netdev_budget_usecs, .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_dointvec_minmax, -- cgit v1.2.3 From f59b5416c396ac4910dd7a0cdf26cbb0e1faf529 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 6 Mar 2024 16:00:16 +0000 Subject: net: move netdev_tstamp_prequeue into net_hotdata netdev_tstamp_prequeue is used in rx path. Move it to net_hotdata for better cache locality. 
Signed-off-by: Eric Dumazet Acked-by: Soheil Hassas Yeganeh Reviewed-by: David Ahern Link: https://lore.kernel.org/r/20240306160031.874438-4-edumazet@google.com Signed-off-by: Jakub Kicinski --- include/net/hotdata.h | 1 + net/core/dev.c | 10 +++++----- net/core/dev.h | 1 - net/core/hotdata.c | 2 ++ net/core/sysctl_net_core.c | 2 +- 5 files changed, 9 insertions(+), 7 deletions(-) (limited to 'net/core/sysctl_net_core.c') diff --git a/include/net/hotdata.h b/include/net/hotdata.h index 72170223385e..149e56528537 100644 --- a/include/net/hotdata.h +++ b/include/net/hotdata.h @@ -10,6 +10,7 @@ struct net_hotdata { int gro_normal_batch; int netdev_budget; int netdev_budget_usecs; + int tstamp_prequeue; }; extern struct net_hotdata net_hotdata; diff --git a/net/core/dev.c b/net/core/dev.c index 9ccb48618dba..f2f44303c035 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -4408,7 +4408,6 @@ EXPORT_SYMBOL(__dev_direct_xmit); int netdev_max_backlog __read_mostly = 1000; EXPORT_SYMBOL(netdev_max_backlog); -int netdev_tstamp_prequeue __read_mostly = 1; unsigned int sysctl_skb_defer_max __read_mostly = 64; int weight_p __read_mostly = 64; /* old backlog weight */ int dev_weight_rx_bias __read_mostly = 1; /* bias for backlog weight */ @@ -5052,7 +5051,7 @@ static int netif_rx_internal(struct sk_buff *skb) { int ret; - net_timestamp_check(READ_ONCE(netdev_tstamp_prequeue), skb); + net_timestamp_check(READ_ONCE(net_hotdata.tstamp_prequeue), skb); trace_netif_rx(skb); @@ -5344,7 +5343,7 @@ static int __netif_receive_skb_core(struct sk_buff **pskb, bool pfmemalloc, int ret = NET_RX_DROP; __be16 type; - net_timestamp_check(!READ_ONCE(netdev_tstamp_prequeue), skb); + net_timestamp_check(!READ_ONCE(net_hotdata.tstamp_prequeue), skb); trace_netif_receive_skb(skb); @@ -5728,7 +5727,7 @@ static int netif_receive_skb_internal(struct sk_buff *skb) { int ret; - net_timestamp_check(READ_ONCE(netdev_tstamp_prequeue), skb); + net_timestamp_check(READ_ONCE(net_hotdata.tstamp_prequeue), skb); 
if (skb_defer_rx_timestamp(skb)) return NET_RX_SUCCESS; @@ -5758,7 +5757,8 @@ void netif_receive_skb_list_internal(struct list_head *head) INIT_LIST_HEAD(&sublist); list_for_each_entry_safe(skb, next, head, list) { - net_timestamp_check(READ_ONCE(netdev_tstamp_prequeue), skb); + net_timestamp_check(READ_ONCE(net_hotdata.tstamp_prequeue), + skb); skb_list_del_init(skb); if (!skb_defer_rx_timestamp(skb)) list_add_tail(&skb->list, &sublist); diff --git a/net/core/dev.h b/net/core/dev.h index 9a6170530850..2bcaf8eee50c 100644 --- a/net/core/dev.h +++ b/net/core/dev.h @@ -39,7 +39,6 @@ void dev_addr_check(struct net_device *dev); /* sysctls not referred to from outside net/core/ */ extern unsigned int sysctl_skb_defer_max; -extern int netdev_tstamp_prequeue; extern int netdev_unregister_timeout_secs; extern int weight_p; extern int dev_weight_rx_bias; diff --git a/net/core/hotdata.c b/net/core/hotdata.c index 907d69120397..087c4c84987d 100644 --- a/net/core/hotdata.c +++ b/net/core/hotdata.c @@ -12,4 +12,6 @@ struct net_hotdata net_hotdata __cacheline_aligned = { .netdev_budget = 300, /* Must be at least 2 jiffes to guarantee 1 jiffy timeout */ .netdev_budget_usecs = 2 * USEC_PER_SEC / HZ, + + .tstamp_prequeue = 1, }; diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c index a9c2d798b219..bddd07da0998 100644 --- a/net/core/sysctl_net_core.c +++ b/net/core/sysctl_net_core.c @@ -499,7 +499,7 @@ static struct ctl_table net_core_table[] = { #endif { .procname = "netdev_tstamp_prequeue", - .data = &netdev_tstamp_prequeue, + .data = &net_hotdata.tstamp_prequeue, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec -- cgit v1.2.3 From edbc666cdcbf4a80ada4311c272a2078af87b880 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 6 Mar 2024 16:00:18 +0000 Subject: net: move netdev_max_backlog to net_hotdata netdev_max_backlog is used in rx fast path. Move it to net_hotdata for better cache locality. 
Signed-off-by: Eric Dumazet Acked-by: Soheil Hassas Yeganeh Reviewed-by: David Ahern Link: https://lore.kernel.org/r/20240306160031.874438-6-edumazet@google.com Signed-off-by: Jakub Kicinski --- include/linux/netdevice.h | 1 - include/net/hotdata.h | 1 + net/core/dev.c | 8 +++----- net/core/gro_cells.c | 3 ++- net/core/hotdata.c | 2 ++ net/core/sysctl_net_core.c | 2 +- net/xfrm/espintcp.c | 4 +++- net/xfrm/xfrm_input.c | 3 ++- 8 files changed, 14 insertions(+), 10 deletions(-) (limited to 'net/core/sysctl_net_core.c') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index b18ac8072f18..c9a671b7bb37 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -4793,7 +4793,6 @@ void dev_fetch_sw_netstats(struct rtnl_link_stats64 *s, const struct pcpu_sw_netstats __percpu *netstats); void dev_get_tstats64(struct net_device *dev, struct rtnl_link_stats64 *s); -extern int netdev_max_backlog; extern int dev_rx_weight; extern int dev_tx_weight; diff --git a/include/net/hotdata.h b/include/net/hotdata.h index d462cb8f16ba..dc50b200a94b 100644 --- a/include/net/hotdata.h +++ b/include/net/hotdata.h @@ -12,6 +12,7 @@ struct net_hotdata { int netdev_budget; int netdev_budget_usecs; int tstamp_prequeue; + int max_backlog; }; extern struct net_hotdata net_hotdata; diff --git a/net/core/dev.c b/net/core/dev.c index e1493e558407..e23edbaff392 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -4404,9 +4404,6 @@ EXPORT_SYMBOL(__dev_direct_xmit); * Receiver routines *************************************************************************/ -int netdev_max_backlog __read_mostly = 1000; -EXPORT_SYMBOL(netdev_max_backlog); - unsigned int sysctl_skb_defer_max __read_mostly = 64; int weight_p __read_mostly = 64; /* old backlog weight */ int dev_weight_rx_bias __read_mostly = 1; /* bias for backlog weight */ @@ -4713,7 +4710,7 @@ static bool skb_flow_limit(struct sk_buff *skb, unsigned int qlen) struct softnet_data *sd; unsigned int old_flow, new_flow; - 
if (qlen < (READ_ONCE(netdev_max_backlog) >> 1)) + if (qlen < (READ_ONCE(net_hotdata.max_backlog) >> 1)) return false; sd = this_cpu_ptr(&softnet_data); @@ -4761,7 +4758,8 @@ static int enqueue_to_backlog(struct sk_buff *skb, int cpu, if (!netif_running(skb->dev)) goto drop; qlen = skb_queue_len(&sd->input_pkt_queue); - if (qlen <= READ_ONCE(netdev_max_backlog) && !skb_flow_limit(skb, qlen)) { + if (qlen <= READ_ONCE(net_hotdata.max_backlog) && + !skb_flow_limit(skb, qlen)) { if (qlen) { enqueue: __skb_queue_tail(&sd->input_pkt_queue, skb); diff --git a/net/core/gro_cells.c b/net/core/gro_cells.c index ed5ec5de47f6..ff8e5b64bf6b 100644 --- a/net/core/gro_cells.c +++ b/net/core/gro_cells.c @@ -3,6 +3,7 @@ #include #include #include +#include struct gro_cell { struct sk_buff_head napi_skbs; @@ -26,7 +27,7 @@ int gro_cells_receive(struct gro_cells *gcells, struct sk_buff *skb) cell = this_cpu_ptr(gcells->cells); - if (skb_queue_len(&cell->napi_skbs) > READ_ONCE(netdev_max_backlog)) { + if (skb_queue_len(&cell->napi_skbs) > READ_ONCE(net_hotdata.max_backlog)) { drop: dev_core_stats_rx_dropped_inc(dev); kfree_skb(skb); diff --git a/net/core/hotdata.c b/net/core/hotdata.c index 29fcfe89fd9a..35ed5a83ecc7 100644 --- a/net/core/hotdata.c +++ b/net/core/hotdata.c @@ -15,4 +15,6 @@ struct net_hotdata net_hotdata __cacheline_aligned = { .netdev_budget_usecs = 2 * USEC_PER_SEC / HZ, .tstamp_prequeue = 1, + .max_backlog = 1000, }; +EXPORT_SYMBOL(net_hotdata); diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c index bddd07da0998..8eaeeb289914 100644 --- a/net/core/sysctl_net_core.c +++ b/net/core/sysctl_net_core.c @@ -440,7 +440,7 @@ static struct ctl_table net_core_table[] = { }, { .procname = "netdev_max_backlog", - .data = &netdev_max_backlog, + .data = &net_hotdata.max_backlog, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec diff --git a/net/xfrm/espintcp.c b/net/xfrm/espintcp.c index d3b3f9e720b3..fe82e2d07300 100644 --- 
a/net/xfrm/espintcp.c +++ b/net/xfrm/espintcp.c @@ -10,6 +10,7 @@ #if IS_ENABLED(CONFIG_IPV6) #include #endif +#include static void handle_nonesp(struct espintcp_ctx *ctx, struct sk_buff *skb, struct sock *sk) @@ -169,7 +170,8 @@ int espintcp_queue_out(struct sock *sk, struct sk_buff *skb) { struct espintcp_ctx *ctx = espintcp_getctx(sk); - if (skb_queue_len(&ctx->out_queue) >= READ_ONCE(netdev_max_backlog)) + if (skb_queue_len(&ctx->out_queue) >= + READ_ONCE(net_hotdata.max_backlog)) return -ENOBUFS; __skb_queue_tail(&ctx->out_queue, skb); diff --git a/net/xfrm/xfrm_input.c b/net/xfrm/xfrm_input.c index bd4ce21d76d7..161f535c8b94 100644 --- a/net/xfrm/xfrm_input.c +++ b/net/xfrm/xfrm_input.c @@ -21,6 +21,7 @@ #include #include #include +#include #include "xfrm_inout.h" @@ -764,7 +765,7 @@ int xfrm_trans_queue_net(struct net *net, struct sk_buff *skb, trans = this_cpu_ptr(&xfrm_trans_tasklet); - if (skb_queue_len(&trans->queue) >= READ_ONCE(netdev_max_backlog)) + if (skb_queue_len(&trans->queue) >= READ_ONCE(net_hotdata.max_backlog)) return -ENOBUFS; BUILD_BUG_ON(sizeof(struct xfrm_trans_cb) > sizeof(skb->cb)); -- cgit v1.2.3 From 26722dc74bf08fd79564cbcad1e5f3e2aa3bf9cc Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 6 Mar 2024 16:00:21 +0000 Subject: net: move dev_tx_weight to net_hotdata dev_tx_weight is used in tx fast path. Move it to net_hotdata for better cache locality. 
Signed-off-by: Eric Dumazet Acked-by: Soheil Hassas Yeganeh Reviewed-by: David Ahern Link: https://lore.kernel.org/r/20240306160031.874438-9-edumazet@google.com Signed-off-by: Jakub Kicinski --- include/linux/netdevice.h | 1 - include/net/hotdata.h | 1 + net/core/dev.c | 1 - net/core/hotdata.c | 1 + net/core/sysctl_net_core.c | 2 +- net/sched/sch_generic.c | 3 ++- 6 files changed, 5 insertions(+), 4 deletions(-) (limited to 'net/core/sysctl_net_core.c') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index c9a671b7bb37..ad4b031098ff 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -4794,7 +4794,6 @@ void dev_fetch_sw_netstats(struct rtnl_link_stats64 *s, void dev_get_tstats64(struct net_device *dev, struct rtnl_link_stats64 *s); extern int dev_rx_weight; -extern int dev_tx_weight; enum { NESTED_SYNC_IMM_BIT, diff --git a/include/net/hotdata.h b/include/net/hotdata.h index d86d02f156fc..ffea9cc263e5 100644 --- a/include/net/hotdata.h +++ b/include/net/hotdata.h @@ -21,6 +21,7 @@ struct net_hotdata { int netdev_budget_usecs; int tstamp_prequeue; int max_backlog; + int dev_tx_weight; }; extern struct net_hotdata net_hotdata; diff --git a/net/core/dev.c b/net/core/dev.c index e23edbaff392..5d6bd481103f 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -4409,7 +4409,6 @@ int weight_p __read_mostly = 64; /* old backlog weight */ int dev_weight_rx_bias __read_mostly = 1; /* bias for backlog weight */ int dev_weight_tx_bias __read_mostly = 1; /* bias for output_queue quota */ int dev_rx_weight __read_mostly = 64; -int dev_tx_weight __read_mostly = 64; /* Called with irq disabled */ static inline void ____napi_schedule(struct softnet_data *sd, diff --git a/net/core/hotdata.c b/net/core/hotdata.c index 35ed5a83ecc7..ec8c3b48e8fe 100644 --- a/net/core/hotdata.c +++ b/net/core/hotdata.c @@ -16,5 +16,6 @@ struct net_hotdata net_hotdata __cacheline_aligned = { .tstamp_prequeue = 1, .max_backlog = 1000, + .dev_tx_weight = 64, }; 
EXPORT_SYMBOL(net_hotdata); diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c index 8eaeeb289914..a30016a8660e 100644 --- a/net/core/sysctl_net_core.c +++ b/net/core/sysctl_net_core.c @@ -302,7 +302,7 @@ static int proc_do_dev_weight(struct ctl_table *table, int write, if (!ret && write) { weight = READ_ONCE(weight_p); WRITE_ONCE(dev_rx_weight, weight * dev_weight_rx_bias); - WRITE_ONCE(dev_tx_weight, weight * dev_weight_tx_bias); + WRITE_ONCE(net_hotdata.dev_tx_weight, weight * dev_weight_tx_bias); } mutex_unlock(&dev_weight_mutex); diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c index 9b3e9262040b..ff5336493777 100644 --- a/net/sched/sch_generic.c +++ b/net/sched/sch_generic.c @@ -27,6 +27,7 @@ #include #include #include +#include #include #include #include @@ -409,7 +410,7 @@ static inline bool qdisc_restart(struct Qdisc *q, int *packets) void __qdisc_run(struct Qdisc *q) { - int quota = READ_ONCE(dev_tx_weight); + int quota = READ_ONCE(net_hotdata.dev_tx_weight); int packets; while (qdisc_restart(q, &packets)) { -- cgit v1.2.3 From 71c0de9bac9c1dda503322c86be4924f055dc6c9 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 6 Mar 2024 16:00:22 +0000 Subject: net: move dev_rx_weight to net_hotdata dev_rx_weight is read from process_backlog(). Move it to net_hotdata for better cache locality. 
Signed-off-by: Eric Dumazet Acked-by: Soheil Hassas Yeganeh Reviewed-by: David Ahern Link: https://lore.kernel.org/r/20240306160031.874438-10-edumazet@google.com Signed-off-by: Jakub Kicinski --- include/linux/netdevice.h | 2 -- include/net/hotdata.h | 1 + net/core/dev.c | 3 +-- net/core/hotdata.c | 1 + net/core/sysctl_net_core.c | 2 +- 5 files changed, 4 insertions(+), 5 deletions(-) (limited to 'net/core/sysctl_net_core.c') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index ad4b031098ff..dd641297e807 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -4793,8 +4793,6 @@ void dev_fetch_sw_netstats(struct rtnl_link_stats64 *s, const struct pcpu_sw_netstats __percpu *netstats); void dev_get_tstats64(struct net_device *dev, struct rtnl_link_stats64 *s); -extern int dev_rx_weight; - enum { NESTED_SYNC_IMM_BIT, NESTED_SYNC_TODO_BIT, diff --git a/include/net/hotdata.h b/include/net/hotdata.h index ffea9cc263e5..e6595ed2c3be 100644 --- a/include/net/hotdata.h +++ b/include/net/hotdata.h @@ -22,6 +22,7 @@ struct net_hotdata { int tstamp_prequeue; int max_backlog; int dev_tx_weight; + int dev_rx_weight; }; extern struct net_hotdata net_hotdata; diff --git a/net/core/dev.c b/net/core/dev.c index 5d6bd481103f..40ba02e04bcb 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -4408,7 +4408,6 @@ unsigned int sysctl_skb_defer_max __read_mostly = 64; int weight_p __read_mostly = 64; /* old backlog weight */ int dev_weight_rx_bias __read_mostly = 1; /* bias for backlog weight */ int dev_weight_tx_bias __read_mostly = 1; /* bias for output_queue quota */ -int dev_rx_weight __read_mostly = 64; /* Called with irq disabled */ static inline void ____napi_schedule(struct softnet_data *sd, @@ -5978,7 +5977,7 @@ static int process_backlog(struct napi_struct *napi, int quota) net_rps_action_and_irq_enable(sd); } - napi->weight = READ_ONCE(dev_rx_weight); + napi->weight = READ_ONCE(net_hotdata.dev_rx_weight); while (again) { struct sk_buff *skb; 
diff --git a/net/core/hotdata.c b/net/core/hotdata.c index ec8c3b48e8fe..c8a7a451c18a 100644 --- a/net/core/hotdata.c +++ b/net/core/hotdata.c @@ -17,5 +17,6 @@ struct net_hotdata net_hotdata __cacheline_aligned = { .tstamp_prequeue = 1, .max_backlog = 1000, .dev_tx_weight = 64, + .dev_rx_weight = 64, }; EXPORT_SYMBOL(net_hotdata); diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c index a30016a8660e..8a4c698dad9c 100644 --- a/net/core/sysctl_net_core.c +++ b/net/core/sysctl_net_core.c @@ -301,7 +301,7 @@ static int proc_do_dev_weight(struct ctl_table *table, int write, ret = proc_dointvec(table, write, buffer, lenp, ppos); if (!ret && write) { weight = READ_ONCE(weight_p); - WRITE_ONCE(dev_rx_weight, weight * dev_weight_rx_bias); + WRITE_ONCE(net_hotdata.dev_rx_weight, weight * dev_weight_rx_bias); WRITE_ONCE(net_hotdata.dev_tx_weight, weight * dev_weight_tx_bias); } mutex_unlock(&dev_weight_mutex); -- cgit v1.2.3 From 490a79faf95e705ba0ffd9ebf04a624b379e53c9 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 6 Mar 2024 16:00:30 +0000 Subject: net: introduce include/net/rps.h Move RPS related structures and helpers from include/linux/netdevice.h and include/net/sock.h to a new include file. 
Signed-off-by: Eric Dumazet Acked-by: Soheil Hassas Yeganeh Reviewed-by: David Ahern Link: https://lore.kernel.org/r/20240306160031.874438-18-edumazet@google.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/intel/ice/ice_arfs.c | 1 + drivers/net/ethernet/mellanox/mlx4/en_netdev.c | 1 + drivers/net/ethernet/mellanox/mlx5/core/en_arfs.c | 1 + drivers/net/ethernet/sfc/rx_common.c | 1 + drivers/net/ethernet/sfc/siena/rx_common.c | 1 + drivers/net/tun.c | 1 + include/linux/netdevice.h | 82 -------------- include/net/rps.h | 127 ++++++++++++++++++++++ include/net/sock.h | 35 ------ net/core/dev.c | 1 + net/core/net-sysfs.c | 1 + net/core/sysctl_net_core.c | 1 + net/ipv4/af_inet.c | 1 + net/ipv4/tcp.c | 1 + net/ipv6/af_inet6.c | 1 + net/sctp/socket.c | 1 + 16 files changed, 140 insertions(+), 117 deletions(-) create mode 100644 include/net/rps.h (limited to 'net/core/sysctl_net_core.c') diff --git a/drivers/net/ethernet/intel/ice/ice_arfs.c b/drivers/net/ethernet/intel/ice/ice_arfs.c index cca0e753f38f..7cee365cc7d1 100644 --- a/drivers/net/ethernet/intel/ice/ice_arfs.c +++ b/drivers/net/ethernet/intel/ice/ice_arfs.c @@ -2,6 +2,7 @@ /* Copyright (C) 2018-2020, Intel Corporation. 
*/ #include "ice.h" +#include /** * ice_is_arfs_active - helper to check is aRFS is active diff --git a/drivers/net/ethernet/mellanox/mlx4/en_netdev.c b/drivers/net/ethernet/mellanox/mlx4/en_netdev.c index d7da62cda821..5d3fde63b273 100644 --- a/drivers/net/ethernet/mellanox/mlx4/en_netdev.c +++ b/drivers/net/ethernet/mellanox/mlx4/en_netdev.c @@ -42,6 +42,7 @@ #include #include #include +#include #include #include diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_arfs.c b/drivers/net/ethernet/mellanox/mlx5/core/en_arfs.c index e66f486faafe..c7f542d0b8f0 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_arfs.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_arfs.c @@ -34,6 +34,7 @@ #include #include #include +#include #include "en.h" #define ARFS_HASH_SHIFT BITS_PER_BYTE diff --git a/drivers/net/ethernet/sfc/rx_common.c b/drivers/net/ethernet/sfc/rx_common.c index fac227d372db..dcd901eccfc8 100644 --- a/drivers/net/ethernet/sfc/rx_common.c +++ b/drivers/net/ethernet/sfc/rx_common.c @@ -11,6 +11,7 @@ #include "net_driver.h" #include #include +#include #include "efx.h" #include "nic.h" #include "rx_common.h" diff --git a/drivers/net/ethernet/sfc/siena/rx_common.c b/drivers/net/ethernet/sfc/siena/rx_common.c index 4579f43484c3..219fb358a646 100644 --- a/drivers/net/ethernet/sfc/siena/rx_common.c +++ b/drivers/net/ethernet/sfc/siena/rx_common.c @@ -11,6 +11,7 @@ #include "net_driver.h" #include #include +#include #include "efx.h" #include "nic.h" #include "rx_common.h" diff --git a/drivers/net/tun.c b/drivers/net/tun.c index 8d258e263f54..0b3f21cba552 100644 --- a/drivers/net/tun.c +++ b/drivers/net/tun.c @@ -78,6 +78,7 @@ #include #include #include +#include #include #include diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index dd641297e807..416a800d72ba 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -225,12 +225,6 @@ struct net_device_core_stats { #include #include -#ifdef CONFIG_RPS -#include -extern struct 
static_key_false rps_needed; -extern struct static_key_false rfs_needed; -#endif - struct neighbour; struct neigh_parms; struct sk_buff; @@ -730,86 +724,10 @@ static inline void netdev_queue_numa_node_write(struct netdev_queue *q, int node #endif } -#ifdef CONFIG_RPS -/* - * This structure holds an RPS map which can be of variable length. The - * map is an array of CPUs. - */ -struct rps_map { - unsigned int len; - struct rcu_head rcu; - u16 cpus[]; -}; -#define RPS_MAP_SIZE(_num) (sizeof(struct rps_map) + ((_num) * sizeof(u16))) - -/* - * The rps_dev_flow structure contains the mapping of a flow to a CPU, the - * tail pointer for that CPU's input queue at the time of last enqueue, and - * a hardware filter index. - */ -struct rps_dev_flow { - u16 cpu; - u16 filter; - unsigned int last_qtail; -}; -#define RPS_NO_FILTER 0xffff - -/* - * The rps_dev_flow_table structure contains a table of flow mappings. - */ -struct rps_dev_flow_table { - unsigned int mask; - struct rcu_head rcu; - struct rps_dev_flow flows[]; -}; -#define RPS_DEV_FLOW_TABLE_SIZE(_num) (sizeof(struct rps_dev_flow_table) + \ - ((_num) * sizeof(struct rps_dev_flow))) - -/* - * The rps_sock_flow_table contains mappings of flows to the last CPU - * on which they were processed by the application (set in recvmsg). - * Each entry is a 32bit value. Upper part is the high-order bits - * of flow hash, lower part is CPU number. - * rps_cpu_mask is used to partition the space, depending on number of - * possible CPUs : rps_cpu_mask = roundup_pow_of_two(nr_cpu_ids) - 1 - * For example, if 64 CPUs are possible, rps_cpu_mask = 0x3f, - * meaning we use 32-6=26 bits for the hash. 
- */ -struct rps_sock_flow_table { - u32 mask; - - u32 ents[] ____cacheline_aligned_in_smp; -}; -#define RPS_SOCK_FLOW_TABLE_SIZE(_num) (offsetof(struct rps_sock_flow_table, ents[_num])) - -#define RPS_NO_CPU 0xffff - -extern u32 rps_cpu_mask; -extern struct rps_sock_flow_table __rcu *rps_sock_flow_table; - -static inline void rps_record_sock_flow(struct rps_sock_flow_table *table, - u32 hash) -{ - if (table && hash) { - unsigned int index = hash & table->mask; - u32 val = hash & ~rps_cpu_mask; - - /* We only give a hint, preemption can change CPU under us */ - val |= raw_smp_processor_id(); - - /* The following WRITE_ONCE() is paired with the READ_ONCE() - * here, and another one in get_rps_cpu(). - */ - if (READ_ONCE(table->ents[index]) != val) - WRITE_ONCE(table->ents[index], val); - } -} - #ifdef CONFIG_RFS_ACCEL bool rps_may_expire_flow(struct net_device *dev, u16 rxq_index, u32 flow_id, u16 filter_id); #endif -#endif /* CONFIG_RPS */ /* XPS map type and offset of the xps map within net_device->xps_maps[]. */ enum xps_map_type { diff --git a/include/net/rps.h b/include/net/rps.h new file mode 100644 index 000000000000..6081d817d245 --- /dev/null +++ b/include/net/rps.h @@ -0,0 +1,127 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +#ifndef _NET_RPS_H +#define _NET_RPS_H + +#include +#include +#include + +#ifdef CONFIG_RPS + +extern struct static_key_false rps_needed; +extern struct static_key_false rfs_needed; + +/* + * This structure holds an RPS map which can be of variable length. The + * map is an array of CPUs. + */ +struct rps_map { + unsigned int len; + struct rcu_head rcu; + u16 cpus[]; +}; +#define RPS_MAP_SIZE(_num) (sizeof(struct rps_map) + ((_num) * sizeof(u16))) + +/* + * The rps_dev_flow structure contains the mapping of a flow to a CPU, the + * tail pointer for that CPU's input queue at the time of last enqueue, and + * a hardware filter index. 
+ */ +struct rps_dev_flow { + u16 cpu; + u16 filter; + unsigned int last_qtail; +}; +#define RPS_NO_FILTER 0xffff + +/* + * The rps_dev_flow_table structure contains a table of flow mappings. + */ +struct rps_dev_flow_table { + unsigned int mask; + struct rcu_head rcu; + struct rps_dev_flow flows[]; +}; +#define RPS_DEV_FLOW_TABLE_SIZE(_num) (sizeof(struct rps_dev_flow_table) + \ + ((_num) * sizeof(struct rps_dev_flow))) + +/* + * The rps_sock_flow_table contains mappings of flows to the last CPU + * on which they were processed by the application (set in recvmsg). + * Each entry is a 32bit value. Upper part is the high-order bits + * of flow hash, lower part is CPU number. + * rps_cpu_mask is used to partition the space, depending on number of + * possible CPUs : rps_cpu_mask = roundup_pow_of_two(nr_cpu_ids) - 1 + * For example, if 64 CPUs are possible, rps_cpu_mask = 0x3f, + * meaning we use 32-6=26 bits for the hash. + */ +struct rps_sock_flow_table { + u32 mask; + + u32 ents[] ____cacheline_aligned_in_smp; +}; +#define RPS_SOCK_FLOW_TABLE_SIZE(_num) (offsetof(struct rps_sock_flow_table, ents[_num])) + +#define RPS_NO_CPU 0xffff + +extern u32 rps_cpu_mask; +extern struct rps_sock_flow_table __rcu *rps_sock_flow_table; + +static inline void rps_record_sock_flow(struct rps_sock_flow_table *table, + u32 hash) +{ + unsigned int index = hash & table->mask; + u32 val = hash & ~rps_cpu_mask; + + /* We only give a hint, preemption can change CPU under us */ + val |= raw_smp_processor_id(); + + /* The following WRITE_ONCE() is paired with the READ_ONCE() + * here, and another one in get_rps_cpu(). 
+ */ + if (READ_ONCE(table->ents[index]) != val) + WRITE_ONCE(table->ents[index], val); +} + +#endif /* CONFIG_RPS */ + +static inline void sock_rps_record_flow_hash(__u32 hash) +{ +#ifdef CONFIG_RPS + struct rps_sock_flow_table *sock_flow_table; + + if (!hash) + return; + rcu_read_lock(); + sock_flow_table = rcu_dereference(rps_sock_flow_table); + if (sock_flow_table) + rps_record_sock_flow(sock_flow_table, hash); + rcu_read_unlock(); +#endif +} + +static inline void sock_rps_record_flow(const struct sock *sk) +{ +#ifdef CONFIG_RPS + if (static_branch_unlikely(&rfs_needed)) { + /* Reading sk->sk_rxhash might incur an expensive cache line + * miss. + * + * TCP_ESTABLISHED does cover almost all states where RFS + * might be useful, and is cheaper [1] than testing : + * IPv4: inet_sk(sk)->inet_daddr + * IPv6: ipv6_addr_any(&sk->sk_v6_daddr) + * OR an additional socket flag + * [1] : sk_state and sk_prot are in the same cache line. + */ + if (sk->sk_state == TCP_ESTABLISHED) { + /* This READ_ONCE() is paired with the WRITE_ONCE() + * from sock_rps_save_rxhash() and sock_rps_reset_rxhash(). + */ + sock_rps_record_flow_hash(READ_ONCE(sk->sk_rxhash)); + } + } +#endif +} + +#endif /* _NET_RPS_H */ diff --git a/include/net/sock.h b/include/net/sock.h index 09a0cde8bf52..b5e00702acc1 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -1117,41 +1117,6 @@ static inline void sk_incoming_cpu_update(struct sock *sk) WRITE_ONCE(sk->sk_incoming_cpu, cpu); } -static inline void sock_rps_record_flow_hash(__u32 hash) -{ -#ifdef CONFIG_RPS - struct rps_sock_flow_table *sock_flow_table; - - rcu_read_lock(); - sock_flow_table = rcu_dereference(rps_sock_flow_table); - rps_record_sock_flow(sock_flow_table, hash); - rcu_read_unlock(); -#endif -} - -static inline void sock_rps_record_flow(const struct sock *sk) -{ -#ifdef CONFIG_RPS - if (static_branch_unlikely(&rfs_needed)) { - /* Reading sk->sk_rxhash might incur an expensive cache line - * miss. 
- * - * TCP_ESTABLISHED does cover almost all states where RFS - * might be useful, and is cheaper [1] than testing : - * IPv4: inet_sk(sk)->inet_daddr - * IPv6: ipv6_addr_any(&sk->sk_v6_daddr) - * OR an additional socket flag - * [1] : sk_state and sk_prot are in the same cache line. - */ - if (sk->sk_state == TCP_ESTABLISHED) { - /* This READ_ONCE() is paired with the WRITE_ONCE() - * from sock_rps_save_rxhash() and sock_rps_reset_rxhash(). - */ - sock_rps_record_flow_hash(READ_ONCE(sk->sk_rxhash)); - } - } -#endif -} static inline void sock_rps_save_rxhash(struct sock *sk, const struct sk_buff *skb) diff --git a/net/core/dev.c b/net/core/dev.c index 40ba02e04bcb..bcf49b0393d2 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -155,6 +155,7 @@ #include #include #include +#include <net/rps.h> #include "dev.h" #include "net-sysfs.h" diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c index af238026ac3c..5560083774b1 100644 --- a/net/core/net-sysfs.c +++ b/net/core/net-sysfs.c @@ -24,6 +24,7 @@ #include #include #include +#include <net/rps.h> #include "dev.h" #include "net-sysfs.h" diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c index 8a4c698dad9c..4b93e27404e8 100644 --- a/net/core/sysctl_net_core.c +++ b/net/core/sysctl_net_core.c @@ -24,6 +24,7 @@ #include #include #include +#include <net/rps.h> #include "dev.h" diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index 6f1cfd176e7b..55bd72997b31 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -119,6 +119,7 @@ #endif #include #include +#include <net/rps.h> #include diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 7e1b848398d0..c5b83875411a 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -279,6 +279,7 @@ #include #include #include +#include <net/rps.h> /* Track pending CMSGs.
*/ enum { diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c index b90d46533cdc..8041dc181bd4 100644 --- a/net/ipv6/af_inet6.c +++ b/net/ipv6/af_inet6.c @@ -64,6 +64,7 @@ #include #include #include +#include <net/rps.h> #include #include diff --git a/net/sctp/socket.c b/net/sctp/socket.c index 6b9fcdb0952a..c67679a41044 100644 --- a/net/sctp/socket.c +++ b/net/sctp/socket.c @@ -67,6 +67,7 @@ #include #include #include +#include <net/rps.h> /* Forward declarations for internal helper functions. */ static bool sctp_writeable(const struct sock *sk); -- cgit v1.2.3 From ce7f49ab741591d83e33e56948bac2f12de6e14e Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 6 Mar 2024 16:00:31 +0000 Subject: net: move rps_sock_flow_table to net_hotdata rps_sock_flow_table and rps_cpu_mask are used in fast path. Move them to net_hotdata for better cache locality. Signed-off-by: Eric Dumazet Acked-by: Soheil Hassas Yeganeh Reviewed-by: David Ahern Link: https://lore.kernel.org/r/20240306160031.874438-19-edumazet@google.com Signed-off-by: Jakub Kicinski --- include/net/hotdata.h | 4 ++++ include/net/rps.h | 8 +++----- net/core/dev.c | 12 +++--------- net/core/sysctl_net_core.c | 9 ++++++--- 4 files changed, 16 insertions(+), 17 deletions(-) (limited to 'net/core/sysctl_net_core.c') diff --git a/include/net/hotdata.h b/include/net/hotdata.h index b0b847585f7e..003667a1efd6 100644 --- a/include/net/hotdata.h +++ b/include/net/hotdata.h @@ -27,6 +27,10 @@ struct net_hotdata { struct kmem_cache *skbuff_cache; struct kmem_cache *skbuff_fclone_cache; struct kmem_cache *skb_small_head_cache; +#ifdef CONFIG_RPS + struct rps_sock_flow_table __rcu *rps_sock_flow_table; + u32 rps_cpu_mask; +#endif int gro_normal_batch; int netdev_budget; int netdev_budget_usecs; diff --git a/include/net/rps.h b/include/net/rps.h index 6081d817d245..7660243e905b 100644 --- a/include/net/rps.h +++ b/include/net/rps.h @@ -5,6 +5,7 @@ #include #include #include +#include <net/hotdata.h> #ifdef CONFIG_RPS @@ -64,14 +65,11 @@ struct
rps_sock_flow_table { #define RPS_NO_CPU 0xffff -extern u32 rps_cpu_mask; -extern struct rps_sock_flow_table __rcu *rps_sock_flow_table; - static inline void rps_record_sock_flow(struct rps_sock_flow_table *table, u32 hash) { unsigned int index = hash & table->mask; - u32 val = hash & ~rps_cpu_mask; + u32 val = hash & ~net_hotdata.rps_cpu_mask; /* We only give a hint, preemption can change CPU under us */ val |= raw_smp_processor_id(); @@ -93,7 +91,7 @@ static inline void sock_rps_record_flow_hash(__u32 hash) if (!hash) return; rcu_read_lock(); - sock_flow_table = rcu_dereference(rps_sock_flow_table); + sock_flow_table = rcu_dereference(net_hotdata.rps_sock_flow_table); if (sock_flow_table) rps_record_sock_flow(sock_flow_table, hash); rcu_read_unlock(); diff --git a/net/core/dev.c b/net/core/dev.c index bcf49b0393d2..0766a245816b 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -4450,12 +4450,6 @@ static inline void ____napi_schedule(struct softnet_data *sd, #ifdef CONFIG_RPS -/* One global table that all flow-based protocols share. */ -struct rps_sock_flow_table __rcu *rps_sock_flow_table __read_mostly; -EXPORT_SYMBOL(rps_sock_flow_table); -u32 rps_cpu_mask __read_mostly; -EXPORT_SYMBOL(rps_cpu_mask); - struct static_key_false rps_needed __read_mostly; EXPORT_SYMBOL(rps_needed); struct static_key_false rfs_needed __read_mostly; @@ -4547,7 +4541,7 @@ static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb, if (!hash) goto done; - sock_flow_table = rcu_dereference(rps_sock_flow_table); + sock_flow_table = rcu_dereference(net_hotdata.rps_sock_flow_table); if (flow_table && sock_flow_table) { struct rps_dev_flow *rflow; u32 next_cpu; @@ -4557,10 +4551,10 @@ static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb, * This READ_ONCE() pairs with WRITE_ONCE() from rps_record_sock_flow(). 
*/ ident = READ_ONCE(sock_flow_table->ents[hash & sock_flow_table->mask]); - if ((ident ^ hash) & ~rps_cpu_mask) + if ((ident ^ hash) & ~net_hotdata.rps_cpu_mask) goto try_rps; - next_cpu = ident & rps_cpu_mask; + next_cpu = ident & net_hotdata.rps_cpu_mask; /* OK, now we know there is a match, * we can look at the local (per receive queue) flow table diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c index 4b93e27404e8..6973dda3abda 100644 --- a/net/core/sysctl_net_core.c +++ b/net/core/sysctl_net_core.c @@ -140,7 +140,8 @@ static int rps_sock_flow_sysctl(struct ctl_table *table, int write, mutex_lock(&sock_flow_mutex); - orig_sock_table = rcu_dereference_protected(rps_sock_flow_table, + orig_sock_table = rcu_dereference_protected( + net_hotdata.rps_sock_flow_table, lockdep_is_held(&sock_flow_mutex)); size = orig_size = orig_sock_table ? orig_sock_table->mask + 1 : 0; @@ -161,7 +162,8 @@ static int rps_sock_flow_sysctl(struct ctl_table *table, int write, mutex_unlock(&sock_flow_mutex); return -ENOMEM; } - rps_cpu_mask = roundup_pow_of_two(nr_cpu_ids) - 1; + net_hotdata.rps_cpu_mask = + roundup_pow_of_two(nr_cpu_ids) - 1; sock_table->mask = size - 1; } else sock_table = orig_sock_table; @@ -172,7 +174,8 @@ static int rps_sock_flow_sysctl(struct ctl_table *table, int write, sock_table = NULL; if (sock_table != orig_sock_table) { - rcu_assign_pointer(rps_sock_flow_table, sock_table); + rcu_assign_pointer(net_hotdata.rps_sock_flow_table, + sock_table); if (sock_table) { static_branch_inc(&rps_needed); static_branch_inc(&rfs_needed); -- cgit v1.2.3