From e341694e3eb57fcda9f1adc7bfea42fe080d8d7a Mon Sep 17 00:00:00 2001 From: Thomas Graf Date: Sat, 2 Aug 2014 11:47:45 +0200 Subject: netlink: Convert netlink_lookup() to use RCU protected hash table Heavy Netlink users such as Open vSwitch spend a considerable amount of time in netlink_lookup() due to the read-lock on nl_table_lock. Use of RCU relieves the lock contention. Makes use of the new resizable hash table to avoid locking on the lookup. The hash table will grow if entries exceeds 75% of table size up to a total table size of 64K. It will automatically shrink if usage falls below 30%. Also splits nl_table_lock into a separate mutex to protect hash table mutations and allow synchronize_rcu() to sleep while waiting for readers during expansion and shrinking. Before: 9.16% kpktgend_0 [openvswitch] [k] masked_flow_lookup 6.42% kpktgend_0 [pktgen] [k] mod_cur_headers 6.26% kpktgend_0 [pktgen] [k] pktgen_thread_worker 6.23% kpktgend_0 [kernel.kallsyms] [k] memset 4.79% kpktgend_0 [kernel.kallsyms] [k] netlink_lookup 4.37% kpktgend_0 [kernel.kallsyms] [k] memcpy 3.60% kpktgend_0 [openvswitch] [k] ovs_flow_extract 2.69% kpktgend_0 [kernel.kallsyms] [k] jhash2 After: 15.26% kpktgend_0 [openvswitch] [k] masked_flow_lookup 8.12% kpktgend_0 [pktgen] [k] pktgen_thread_worker 7.92% kpktgend_0 [pktgen] [k] mod_cur_headers 5.11% kpktgend_0 [kernel.kallsyms] [k] memset 4.11% kpktgend_0 [openvswitch] [k] ovs_flow_extract 4.06% kpktgend_0 [kernel.kallsyms] [k] _raw_spin_lock 3.90% kpktgend_0 [kernel.kallsyms] [k] jhash2 [...] 0.67% kpktgend_0 [kernel.kallsyms] [k] netlink_lookup Signed-off-by: Thomas Graf Reviewed-by: Nikolay Aleksandrov Signed-off-by: David S. Miller --- net/netlink/diag.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) (limited to 'net/netlink/diag.c') diff --git a/net/netlink/diag.c b/net/netlink/diag.c index 1af29624b92f..7301850eb56f 100644 --- a/net/netlink/diag.c +++ b/net/netlink/diag.c @@ -4,6 +4,7 @@ #include #include #include +#include #include "af_netlink.h" @@ -101,16 +102,20 @@ static int __netlink_diag_dump(struct sk_buff *skb, struct netlink_callback *cb, int protocol, int s_num) { struct netlink_table *tbl = &nl_table[protocol]; - struct nl_portid_hash *hash = &tbl->hash; + struct rhashtable *ht = &tbl->hash; + const struct bucket_table *htbl = rht_dereference(ht->tbl, ht); struct net *net = sock_net(skb->sk); struct netlink_diag_req *req; + struct netlink_sock *nlsk; struct sock *sk; int ret = 0, num = 0, i; req = nlmsg_data(cb->nlh); - for (i = 0; i <= hash->mask; i++) { - sk_for_each(sk, &hash->table[i]) { + for (i = 0; i < htbl->size; i++) { + rht_for_each_entry(nlsk, htbl->buckets[i], ht, node) { + sk = (struct sock *)nlsk; + if (!net_eq(sock_net(sk), net)) continue; if (num < s_num) { -- cgit v1.2.3 From 6c8f7e70837468da4e658080d4448930fb597e1b Mon Sep 17 00:00:00 2001 From: Thomas Graf Date: Thu, 7 Aug 2014 00:18:47 +0100 Subject: netlink: hold nl_sock_hash_lock during diag dump Although RCU protection would be possible during diag dump, doing so allows for concurrent table mutations which can render the in-table offset between individual Netlink messages invalid and thus cause legitimate sockets to be skipped in the dump. Since the diag dump is relatively low volume and consistency is more important than performance, the table mutex is held during dump. Reported-by: Andrey Wagin Signed-off-by: Thomas Graf Fixes: e341694e3eb57fc ("netlink: Convert netlink_lookup() to use RCU protected hash table") Signed-off-by: David S. Miller --- net/netlink/af_netlink.c | 1 + net/netlink/af_netlink.h | 1 + net/netlink/diag.c | 3 +++ 3 files changed, 5 insertions(+) (limited to 'net/netlink/diag.c') diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index 479a344563d8..a324b4b34c90 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -104,6 +104,7 @@ static atomic_t nl_table_users = ATOMIC_INIT(0); /* Protects netlink socket hash table mutations */ DEFINE_MUTEX(nl_sk_hash_lock); +EXPORT_SYMBOL_GPL(nl_sk_hash_lock); static int lockdep_nl_sk_hash_is_held(void) { diff --git a/net/netlink/af_netlink.h b/net/netlink/af_netlink.h index 60f631fb7087..b20a1731759b 100644 --- a/net/netlink/af_netlink.h +++ b/net/netlink/af_netlink.h @@ -73,5 +73,6 @@ struct netlink_table { extern struct netlink_table *nl_table; extern rwlock_t nl_table_lock; +extern struct mutex nl_sk_hash_lock; #endif diff --git a/net/netlink/diag.c b/net/netlink/diag.c index 7301850eb56f..de8c74a3c061 100644 --- a/net/netlink/diag.c +++ b/net/netlink/diag.c @@ -170,6 +170,7 @@ static int netlink_diag_dump(struct sk_buff *skb, struct netlink_callback *cb) req = nlmsg_data(cb->nlh); + mutex_lock(&nl_sk_hash_lock); read_lock(&nl_table_lock); if (req->sdiag_protocol == NDIAG_PROTO_ALL) { @@ -183,6 +184,7 @@ static int netlink_diag_dump(struct sk_buff *skb, struct netlink_callback *cb) } else { if (req->sdiag_protocol >= MAX_LINKS) { read_unlock(&nl_table_lock); + mutex_unlock(&nl_sk_hash_lock); return -ENOENT; } @@ -190,6 +192,7 @@ static int netlink_diag_dump(struct sk_buff *skb, struct netlink_callback *cb) } read_unlock(&nl_table_lock); + mutex_unlock(&nl_sk_hash_lock); return skb->len; } -- cgit v1.2.3