From fec5e652e58fa6017b2c9e06466cb2a6538de5b4 Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Fri, 16 Apr 2010 16:01:27 -0700 Subject: rfs: Receive Flow Steering This patch implements receive flow steering (RFS). RFS steers received packets for layer 3 and 4 processing to the CPU where the application for the corresponding flow is running. RFS is an extension of Receive Packet Steering (RPS). The basic idea of RFS is that when an application calls recvmsg (or sendmsg) the application's running CPU is stored in a hash table that is indexed by the connection's rxhash which is stored in the socket structure. The rxhash is passed in skb's received on the connection from netif_receive_skb. For each received packet, the associated rxhash is used to look up the CPU in the hash table, if a valid CPU is set then the packet is steered to that CPU using the RPS mechanisms. The convolution of the simple approach is that it would potentially allow OOO packets. If threads are thrashing around CPUs or multiple threads are trying to read from the same sockets, a quickly changing CPU value in the hash table could cause rampant OOO packets-- we consider this a non-starter. To avoid OOO packets, this solution implements two types of hash tables: rps_sock_flow_table and rps_dev_flow_table. rps_sock_table is a global hash table. Each entry is just a CPU number and it is populated in recvmsg and sendmsg as described above. This table contains the "desired" CPUs for flows. rps_dev_flow_table is specific to each device queue. Each entry contains a CPU and a tail queue counter. The CPU is the "current" CPU for a matching flow. The tail queue counter holds the value of a tail queue counter for the associated CPU's backlog queue at the time of last enqueue for a flow matching the entry. Each backlog queue has a queue head counter which is incremented on dequeue, and so a queue tail counter is computed as queue head count + queue length. When a packet is enqueued on a backlog queue, the current value of the queue tail counter is saved in the hash entry of the rps_dev_flow_table. And now the trick: when selecting the CPU for RPS (get_rps_cpu) the rps_sock_flow table and the rps_dev_flow table for the RX queue are consulted. When the desired CPU for the flow (found in the rps_sock_flow table) does not match the current CPU (found in the rps_dev_flow table), the current CPU is changed to the desired CPU if one of the following is true: - The current CPU is unset (equal to RPS_NO_CPU) - Current CPU is offline - The current CPU's queue head counter >= queue tail counter in the rps_dev_flow table. This checks if the queue tail has advanced beyond the last packet that was enqueued using this table entry. This guarantees that all packets queued using this entry have been dequeued, thus preserving in order delivery. Making each queue have its own rps_dev_flow table has two advantages: 1) the tail queue counters will be written on each receive, so keeping the table local to interrupting CPU s good for locality. 2) this allows lockless access to the table-- the CPU number and queue tail counter need to be accessed together under mutual exclusion from netif_receive_skb, we assume that this is only called from device napi_poll which is non-reentrant. This patch implements RFS for TCP and connected UDP sockets. It should be usable for other flow oriented protocols. There are two configuration parameters for RFS. The "rps_flow_entries" kernel init parameter sets the number of entries in the rps_sock_flow_table, the per rxqueue sysfs entry "rps_flow_cnt" contains the number of entries in the rps_dev_flow table for the rxqueue. Both are rounded to power of two. The obvious benefit of RFS (over just RPS) is that it achieves CPU locality between the receive processing for a flow and the applications processing; this can result in increased performance (higher pps, lower latency). The benefits of RFS are dependent on cache hierarchy, application load, and other factors. On simple benchmarks, we don't necessarily see improvement and sometimes see degradation. However, for more complex benchmarks and for applications where cache pressure is much higher this technique seems to perform very well. Below are some benchmark results which show the potential benfit of this patch. The netperf test has 500 instances of netperf TCP_RR test with 1 byte req. and resp. The RPC test is an request/response test similar in structure to netperf RR test ith 100 threads on each host, but does more work in userspace that netperf. e1000e on 8 core Intel No RFS or RPS 104K tps at 30% CPU No RFS (best RPS config): 290K tps at 63% CPU RFS 303K tps at 61% CPU RPC test tps CPU% 50/90/99% usec latency Latency StdDev No RFS/RPS 103K 48% 757/900/3185 4472.35 RPS only: 174K 73% 415/993/2468 491.66 RFS 223K 73% 379/651/1382 315.61 Signed-off-by: Tom Herbert Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/sysctl_net_core.c | 68 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 68 insertions(+) (limited to 'net/core/sysctl_net_core.c') diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c index b7b6b8208f75..dcc7d25996ab 100644 --- a/net/core/sysctl_net_core.c +++ b/net/core/sysctl_net_core.c @@ -11,12 +11,72 @@ #include #include #include +#include #include #include #include #include +#ifdef CONFIG_RPS +static int rps_sock_flow_sysctl(ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + unsigned int orig_size, size; + int ret, i; + ctl_table tmp = { + .data = &size, + .maxlen = sizeof(size), + .mode = table->mode + }; + struct rps_sock_flow_table *orig_sock_table, *sock_table; + static DEFINE_MUTEX(sock_flow_mutex); + + mutex_lock(&sock_flow_mutex); + + orig_sock_table = rps_sock_flow_table; + size = orig_size = orig_sock_table ? orig_sock_table->mask + 1 : 0; + + ret = proc_dointvec(&tmp, write, buffer, lenp, ppos); + + if (write) { + if (size) { + if (size > 1<<30) { + /* Enforce limit to prevent overflow */ + mutex_unlock(&sock_flow_mutex); + return -EINVAL; + } + size = roundup_pow_of_two(size); + if (size != orig_size) { + sock_table = + vmalloc(RPS_SOCK_FLOW_TABLE_SIZE(size)); + if (!sock_table) { + mutex_unlock(&sock_flow_mutex); + return -ENOMEM; + } + + sock_table->mask = size - 1; + } else + sock_table = orig_sock_table; + + for (i = 0; i < size; i++) + sock_table->ents[i] = RPS_NO_CPU; + } else + sock_table = NULL; + + if (sock_table != orig_sock_table) { + rcu_assign_pointer(rps_sock_flow_table, sock_table); + synchronize_rcu(); + vfree(orig_sock_table); + } + } + + mutex_unlock(&sock_flow_mutex); + + return ret; +} +#endif /* CONFIG_RPS */ + static struct ctl_table net_core_table[] = { #ifdef CONFIG_NET { @@ -82,6 +142,14 @@ static struct ctl_table net_core_table[] = { .mode = 0644, .proc_handler = proc_dointvec }, +#ifdef CONFIG_RPS + { + .procname = "rps_sock_flow_entries", + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = rps_sock_flow_sysctl + }, +#endif #endif /* CONFIG_NET */ { .procname = "netdev_budget", -- cgit v1.2.3 From 3b098e2d7c693796cc4dffb07caa249fc0f70771 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sat, 15 May 2010 23:57:10 -0700 Subject: net: Consistent skb timestamping With RPS inclusion, skb timestamping is not consistent in RX path. If netif_receive_skb() is used, its deferred after RPS dispatch. If netif_rx() is used, its done before RPS dispatch. This can give strange tcpdump timestamps results. I think timestamping should be done as soon as possible in the receive path, to get meaningful values (ie timestamps taken at the time packet was delivered by NIC driver to our stack), even if NAPI already can defer timestamping a bit (RPS can help to reduce the gap) Tom Herbert prefer to sample timestamps after RPS dispatch. In case sampling is expensive (HPET/acpi_pm on x86), this makes sense. Let admins switch from one mode to another, using a new sysctl, /proc/sys/net/core/netdev_tstamp_prequeue Its default value (1), means timestamps are taken as soon as possible, before backlog queueing, giving accurate timestamps. Setting a 0 value permits to sample timestamps when processing backlog, after RPS dispatch, to lower the load of the pre-RPS cpu. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- Documentation/sysctl/net.txt | 10 +++++++++ include/linux/netdevice.h | 1 + net/core/dev.c | 50 +++++++++++++++++++++++++++----------------- net/core/sysctl_net_core.c | 7 +++++++ 4 files changed, 49 insertions(+), 19 deletions(-) (limited to 'net/core/sysctl_net_core.c') diff --git a/Documentation/sysctl/net.txt b/Documentation/sysctl/net.txt index df38ef046f8d..cbd05ffc606b 100644 --- a/Documentation/sysctl/net.txt +++ b/Documentation/sysctl/net.txt @@ -84,6 +84,16 @@ netdev_max_backlog Maximum number of packets, queued on the INPUT side, when the interface receives packets faster than kernel can process them. +netdev_tstamp_prequeue +---------------------- + +If set to 0, RX packet timestamps can be sampled after RPS processing, when +the target CPU processes packets. It might give some delay on timestamps, but +permit to distribute the load on several cpus. + +If set to 1 (default), timestamps are sampled as soon as possible, before +queueing. + optmem_max ---------- diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 69022d47d6f2..c1b2341897c2 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -2100,6 +2100,7 @@ extern const struct net_device_stats *dev_get_stats(struct net_device *dev); extern void dev_txq_stats_fold(const struct net_device *dev, struct net_device_stats *stats); extern int netdev_max_backlog; +extern int netdev_tstamp_prequeue; extern int weight_p; extern int netdev_set_master(struct net_device *dev, struct net_device *master); extern int skb_checksum_help(struct sk_buff *skb); diff --git a/net/core/dev.c b/net/core/dev.c index 5cbba0927a8e..988e42912e72 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1454,7 +1454,7 @@ void net_disable_timestamp(void) } EXPORT_SYMBOL(net_disable_timestamp); -static inline void net_timestamp(struct sk_buff *skb) +static inline void net_timestamp_set(struct sk_buff *skb) { if (atomic_read(&netstamp_needed)) __net_timestamp(skb); @@ -1462,6 +1462,12 @@ static inline void net_timestamp(struct sk_buff *skb) skb->tstamp.tv64 = 0; } +static inline void net_timestamp_check(struct sk_buff *skb) +{ + if (!skb->tstamp.tv64 && atomic_read(&netstamp_needed)) + __net_timestamp(skb); +} + /** * dev_forward_skb - loopback an skb to another netif * @@ -1508,9 +1514,9 @@ static void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev) #ifdef CONFIG_NET_CLS_ACT if (!(skb->tstamp.tv64 && (G_TC_FROM(skb->tc_verd) & AT_INGRESS))) - net_timestamp(skb); + net_timestamp_set(skb); #else - net_timestamp(skb); + net_timestamp_set(skb); #endif rcu_read_lock(); @@ -2201,6 +2207,7 @@ EXPORT_SYMBOL(dev_queue_xmit); =======================================================================*/ int netdev_max_backlog __read_mostly = 1000; +int netdev_tstamp_prequeue __read_mostly = 1; int netdev_budget __read_mostly = 300; int weight_p __read_mostly = 64; /* old backlog weight */ @@ -2465,8 +2472,8 @@ int netif_rx(struct sk_buff *skb) if (netpoll_rx(skb)) return NET_RX_DROP; - if (!skb->tstamp.tv64) - net_timestamp(skb); + if (netdev_tstamp_prequeue) + net_timestamp_check(skb); #ifdef CONFIG_RPS { @@ -2791,8 +2798,8 @@ static int __netif_receive_skb(struct sk_buff *skb) int ret = NET_RX_DROP; __be16 type; - if (!skb->tstamp.tv64) - net_timestamp(skb); + if (!netdev_tstamp_prequeue) + net_timestamp_check(skb); if (vlan_tx_tag_present(skb) && vlan_hwaccel_do_receive(skb)) return NET_RX_SUCCESS; @@ -2910,23 +2917,28 @@ out: */ int netif_receive_skb(struct sk_buff *skb) { + if (netdev_tstamp_prequeue) + net_timestamp_check(skb); + #ifdef CONFIG_RPS - struct rps_dev_flow voidflow, *rflow = &voidflow; - int cpu, ret; + { + struct rps_dev_flow voidflow, *rflow = &voidflow; + int cpu, ret; - rcu_read_lock(); + rcu_read_lock(); + + cpu = get_rps_cpu(skb->dev, skb, &rflow); - cpu = get_rps_cpu(skb->dev, skb, &rflow); + if (cpu >= 0) { + ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail); + rcu_read_unlock(); + } else { + rcu_read_unlock(); + ret = __netif_receive_skb(skb); + } - if (cpu >= 0) { - ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail); - rcu_read_unlock(); - } else { - rcu_read_unlock(); - ret = __netif_receive_skb(skb); + return ret; } - - return ret; #else return __netif_receive_skb(skb); #endif diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c index dcc7d25996ab..01eee5d984be 100644 --- a/net/core/sysctl_net_core.c +++ b/net/core/sysctl_net_core.c @@ -121,6 +121,13 @@ static struct ctl_table net_core_table[] = { .mode = 0644, .proc_handler = proc_dointvec }, + { + .procname = "netdev_tstamp_prequeue", + .data = &netdev_tstamp_prequeue, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec + }, { .procname = "message_cost", .data = &net_ratelimit_state.interval, -- cgit v1.2.3