summaryrefslogtreecommitdiff
path: root/net/ipv4/inet_hashtables.c
diff options
context:
space:
mode:
authorDavid S. Miller <davem@davemloft.net>2016-02-11 03:54:23 -0500
committerDavid S. Miller <davem@davemloft.net>2016-02-11 03:54:23 -0500
commitfd1914b2901badd942f008ce57bf4a938d29fde4 (patch)
tree2764c897bbba3f7604ce942dbe62d17352036d58 /net/ipv4/inet_hashtables.c
parent30c1de08dda9202699c1ddc7fd658693faf93bf2 (diff)
parent4b2a6aed2115cd72faaffc92e03d6516e8113904 (diff)
Merge branch 'tcp-fast-so_reuseport'
Craig Gallek says: ==================== Faster SO_REUSEPORT for TCP This patch series complements an earlier series (6a5ef90c58da) which added faster SO_REUSEPORT lookup for UDP sockets by extending the feature to TCP sockets. It uses the same array-based data structure which allows for socket selection after finding the first listening socket that matches an incoming packet. Prior to this feature, every socket in the reuseport group needed to be found and examined before a selection could be made. With this series the SO_ATTACH_REUSEPORT_CBPF and SO_ATTACH_REUSEPORT_EBPF socket options now work for TCP sockets as well. The test at the end of the series includes an example of how to use these options to select a reuseport socket based on the cpu core id handling the incoming packet. There are several refactoring patches that precede the feature implementation. Only the last two patches in this series should result in any behavioral changes. v4 - Fix build issue when compiling IPv6 as a module. This required moving the ipv6_rcv_saddr_equal into an object that is included as a built-in object. I included this change in the second patch which adds inet6_hash since that is where ipv6_rcv_saddr_equal will later be called from non-module code. v3: - Another warning in the first patch caught by a build bot. Return 0 in the no-op UDP hash function. v2: - In the first patched I missed a couple of hash functions that should now be returning int instead of void. I missed these the first time through as it only generated a warning and not an error :\ ==================== Signed-off-by: David S. Miller <davem@davemloft.net>
Diffstat (limited to 'net/ipv4/inet_hashtables.c')
-rw-r--r--net/ipv4/inet_hashtables.c67
1 files changed, 63 insertions, 4 deletions
diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c
index ccc5980797fc..c0f9942de924 100644
--- a/net/ipv4/inet_hashtables.c
+++ b/net/ipv4/inet_hashtables.c
@@ -20,10 +20,12 @@
#include <linux/wait.h>
#include <linux/vmalloc.h>
+#include <net/addrconf.h>
#include <net/inet_connection_sock.h>
#include <net/inet_hashtables.h>
#include <net/secure_seq.h>
#include <net/ip.h>
+#include <net/sock_reuseport.h>
static u32 inet_ehashfn(const struct net *net, const __be32 laddr,
const __u16 lport, const __be32 faddr,
@@ -205,6 +207,7 @@ static inline int compute_score(struct sock *sk, struct net *net,
struct sock *__inet_lookup_listener(struct net *net,
struct inet_hashinfo *hashinfo,
+ struct sk_buff *skb, int doff,
const __be32 saddr, __be16 sport,
const __be32 daddr, const unsigned short hnum,
const int dif)
@@ -214,6 +217,7 @@ struct sock *__inet_lookup_listener(struct net *net,
unsigned int hash = inet_lhashfn(net, hnum);
struct inet_listen_hashbucket *ilb = &hashinfo->listening_hash[hash];
int score, hiscore, matches = 0, reuseport = 0;
+ bool select_ok = true;
u32 phash = 0;
rcu_read_lock();
@@ -229,6 +233,15 @@ begin:
if (reuseport) {
phash = inet_ehashfn(net, daddr, hnum,
saddr, sport);
+ if (select_ok) {
+ struct sock *sk2;
+ sk2 = reuseport_select_sock(sk, phash,
+ skb, doff);
+ if (sk2) {
+ result = sk2;
+ goto found;
+ }
+ }
matches = 1;
}
} else if (score == hiscore && reuseport) {
@@ -246,11 +259,13 @@ begin:
if (get_nulls_value(node) != hash + LISTENING_NULLS_BASE)
goto begin;
if (result) {
+found:
if (unlikely(!atomic_inc_not_zero(&result->sk_refcnt)))
result = NULL;
else if (unlikely(compute_score(result, net, hnum, daddr,
dif) < hiscore)) {
sock_put(result);
+ select_ok = false;
goto begin;
}
}
@@ -449,32 +464,74 @@ bool inet_ehash_nolisten(struct sock *sk, struct sock *osk)
}
EXPORT_SYMBOL_GPL(inet_ehash_nolisten);
-void __inet_hash(struct sock *sk, struct sock *osk)
+static int inet_reuseport_add_sock(struct sock *sk,
+ struct inet_listen_hashbucket *ilb,
+ int (*saddr_same)(const struct sock *sk1,
+ const struct sock *sk2,
+ bool match_wildcard))
+{
+ struct sock *sk2;
+ struct hlist_nulls_node *node;
+ kuid_t uid = sock_i_uid(sk);
+
+ sk_nulls_for_each_rcu(sk2, node, &ilb->head) {
+ if (sk2 != sk &&
+ sk2->sk_family == sk->sk_family &&
+ ipv6_only_sock(sk2) == ipv6_only_sock(sk) &&
+ sk2->sk_bound_dev_if == sk->sk_bound_dev_if &&
+ sk2->sk_reuseport && uid_eq(uid, sock_i_uid(sk2)) &&
+ saddr_same(sk, sk2, false))
+ return reuseport_add_sock(sk, sk2);
+ }
+
+ /* Initial allocation may have already happened via setsockopt */
+ if (!rcu_access_pointer(sk->sk_reuseport_cb))
+ return reuseport_alloc(sk);
+ return 0;
+}
+
+int __inet_hash(struct sock *sk, struct sock *osk,
+ int (*saddr_same)(const struct sock *sk1,
+ const struct sock *sk2,
+ bool match_wildcard))
{
struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo;
struct inet_listen_hashbucket *ilb;
+ int err = 0;
if (sk->sk_state != TCP_LISTEN) {
inet_ehash_nolisten(sk, osk);
- return;
+ return 0;
}
WARN_ON(!sk_unhashed(sk));
ilb = &hashinfo->listening_hash[inet_sk_listen_hashfn(sk)];
spin_lock(&ilb->lock);
+ if (sk->sk_reuseport) {
+ err = inet_reuseport_add_sock(sk, ilb, saddr_same);
+ if (err)
+ goto unlock;
+ }
__sk_nulls_add_node_rcu(sk, &ilb->head);
sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
+unlock:
spin_unlock(&ilb->lock);
+
+ return err;
}
EXPORT_SYMBOL(__inet_hash);
-void inet_hash(struct sock *sk)
+int inet_hash(struct sock *sk)
{
+ int err = 0;
+
if (sk->sk_state != TCP_CLOSE) {
local_bh_disable();
- __inet_hash(sk, NULL);
+ err = __inet_hash(sk, NULL, ipv4_rcv_saddr_equal);
local_bh_enable();
}
+
+ return err;
}
EXPORT_SYMBOL_GPL(inet_hash);
@@ -493,6 +550,8 @@ void inet_unhash(struct sock *sk)
lock = inet_ehash_lockp(hashinfo, sk->sk_hash);
spin_lock_bh(lock);
+ if (rcu_access_pointer(sk->sk_reuseport_cb))
+ reuseport_detach_sock(sk);
done = __sk_nulls_del_node_init_rcu(sk);
if (done)
sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);