From 58d607d3e52f2b15902f58a1161da9fb3b0f6d47 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 15 Sep 2015 15:24:20 -0700 Subject: tcp: provide skb->hash to synack packets In commit b73c3d0e4f0e ("net: Save TX flow hash in sock and set in skbuf on xmit"), Tom provided a l4 hash to most outgoing TCP packets. We'd like to provide one as well for SYNACK packets, so that all packets of a given flow share same txhash, to later enable bonding driver to also use skb->hash to perform slave selection. Note that a SYNACK retransmit shuffles the tx hash, as Tom did in commit 265f94ff54d62 ("net: Recompute sk_txhash on negative routing advice") for established sockets. This has nice effect making TCP flows resilient to some kind of black holes, even at connection establish phase. Signed-off-by: Eric Dumazet Cc: Tom Herbert Cc: Mahesh Bandewar Acked-by: Tom Herbert Signed-off-by: David S. Miller --- net/ipv4/tcp_ipv4.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/ipv4/tcp_ipv4.c') diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 93898e093d4e..d671d742a239 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -1276,8 +1276,8 @@ struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb, newinet->mc_index = inet_iif(skb); newinet->mc_ttl = ip_hdr(skb)->ttl; newinet->rcv_tos = ip_hdr(skb)->tos; + newsk->sk_txhash = tcp_rsk(req)->txhash; inet_csk(newsk)->icsk_ext_hdr_len = 0; - sk_set_txhash(newsk); if (inet_opt) inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen; newinet->inet_id = newtp->write_seq ^ jiffies; -- cgit v1.2.3 From d8ed625044cdede8661324074aaad7459a1e3c7a Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 22 Sep 2015 20:44:17 -0700 Subject: tcp: factorize sk_txhash init Neal suggested to move sk_txhash init into tcp_create_openreq_child(), called both from IPv4 and IPv6. This opportunity was missed in commit 58d607d3e52f ("tcp: provide skb->hash to synack packets") Signed-off-by: Eric Dumazet Signed-off-by: Neal Cardwell Signed-off-by: David S. Miller --- net/ipv4/tcp_ipv4.c | 1 - net/ipv4/tcp_minisocks.c | 1 + net/ipv6/tcp_ipv6.c | 2 -- 3 files changed, 1 insertion(+), 3 deletions(-) (limited to 'net/ipv4/tcp_ipv4.c') diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index d671d742a239..7e2646542312 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -1276,7 +1276,6 @@ struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb, newinet->mc_index = inet_iif(skb); newinet->mc_ttl = ip_hdr(skb)->ttl; newinet->rcv_tos = ip_hdr(skb)->tos; - newsk->sk_txhash = tcp_rsk(req)->txhash; inet_csk(newsk)->icsk_ext_hdr_len = 0; if (inet_opt) inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen; diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index 10933d01b982..85830bb92d04 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -471,6 +471,7 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req, tcp_enable_early_retrans(newtp); newtp->tlp_high_seq = 0; newtp->lsndtime = treq->snt_synack.stamp_jiffies; + newsk->sk_txhash = treq->txhash; newtp->last_oow_ack_time = 0; newtp->total_retrans = req->num_retrans; diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index f9c0e2640671..a004e0b0b3e9 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -1090,8 +1090,6 @@ static struct sock *tcp_v6_syn_recv_sock(struct sock *sk, struct sk_buff *skb, newsk->sk_v6_rcv_saddr = ireq->ir_v6_loc_addr; newsk->sk_bound_dev_if = ireq->ir_iif; - newsk->sk_txhash = tcp_rsk(req)->txhash; - /* Now IPv6 options... First: no IPv4 options. -- cgit v1.2.3 From b40cf18ef7961b6d67732e234780586590510ce1 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 25 Sep 2015 07:39:08 -0700 Subject: tcp: constify listener socket in tcp_v[46]_init_req() Soon, listener socket spinlock will no longer be held, add const arguments to tcp_v[46]_init_req() to make clear these functions can not mess socket fields. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/tcp.h | 3 ++- net/ipv4/tcp_ipv4.c | 3 ++- net/ipv6/tcp_ipv6.c | 9 +++++---- 3 files changed, 9 insertions(+), 6 deletions(-) (limited to 'net/ipv4/tcp_ipv4.c') diff --git a/include/net/tcp.h b/include/net/tcp.h index 5cf9672c13e2..c006255a0df1 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -1705,7 +1705,8 @@ struct tcp_request_sock_ops { const struct sock *sk, const struct sk_buff *skb); #endif - void (*init_req)(struct request_sock *req, struct sock *sk, + void (*init_req)(struct request_sock *req, + const struct sock *sk_listener, struct sk_buff *skb); #ifdef CONFIG_SYN_COOKIES __u32 (*cookie_init_seq)(struct sock *sk, const struct sk_buff *skb, diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 7e2646542312..9d968ca7b669 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -1168,7 +1168,8 @@ static bool tcp_v4_inbound_md5_hash(struct sock *sk, } #endif -static void tcp_v4_init_req(struct request_sock *req, struct sock *sk_listener, +static void tcp_v4_init_req(struct request_sock *req, + const struct sock *sk_listener, struct sk_buff *skb) { struct inet_request_sock *ireq = inet_rsk(req); diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index a004e0b0b3e9..9016797445a2 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -663,22 +663,23 @@ static bool tcp_v6_inbound_md5_hash(struct sock *sk, const struct sk_buff *skb) } #endif -static void tcp_v6_init_req(struct request_sock *req, struct sock *sk, +static void tcp_v6_init_req(struct request_sock *req, + const struct sock *sk_listener, struct sk_buff *skb) { struct inet_request_sock *ireq = inet_rsk(req); - struct ipv6_pinfo *np = inet6_sk(sk); + const struct ipv6_pinfo *np = inet6_sk(sk_listener); ireq->ir_v6_rmt_addr = ipv6_hdr(skb)->saddr; ireq->ir_v6_loc_addr = ipv6_hdr(skb)->daddr; /* So that link locals have meaning */ - if (!sk->sk_bound_dev_if && + if (!sk_listener->sk_bound_dev_if && ipv6_addr_type(&ireq->ir_v6_rmt_addr) & IPV6_ADDR_LINKLOCAL) ireq->ir_iif = tcp_v6_iif(skb); if (!TCP_SKB_CB(skb)->tcp_tw_isn && - (ipv6_opt_accepted(sk, skb, &TCP_SKB_CB(skb)->header.h6) || + (ipv6_opt_accepted(sk_listener, skb, &TCP_SKB_CB(skb)->header.h6) || np->rxopt.bits.rxinfo || np->rxopt.bits.rxoinfo || np->rxopt.bits.rxhlim || np->rxopt.bits.rxohlim || np->repflow)) { -- cgit v1.2.3 From b83e3deb974ca2c11e21256fe602e517afb83247 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 25 Sep 2015 07:39:15 -0700 Subject: tcp: md5: constify tcp_md5_do_lookup() socket argument When TCP new listener is done, these functions will be called without socket lock being held. Make sure they don't change anything. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/tcp.h | 10 +++++----- net/ipv4/tcp_ipv4.c | 6 +++--- net/ipv6/tcp_ipv6.c | 4 ++-- 3 files changed, 10 insertions(+), 10 deletions(-) (limited to 'net/ipv4/tcp_ipv4.c') diff --git a/include/net/tcp.h b/include/net/tcp.h index d37ad0c3ea9c..45bc3c63c3fd 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -1372,16 +1372,16 @@ int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr, int family, const u8 *newkey, u8 newkeylen, gfp_t gfp); int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family); -struct tcp_md5sig_key *tcp_v4_md5_lookup(struct sock *sk, +struct tcp_md5sig_key *tcp_v4_md5_lookup(const struct sock *sk, const struct sock *addr_sk); #ifdef CONFIG_TCP_MD5SIG -struct tcp_md5sig_key *tcp_md5_do_lookup(struct sock *sk, +struct tcp_md5sig_key *tcp_md5_do_lookup(const struct sock *sk, const union tcp_md5_addr *addr, int family); #define tcp_twsk_md5_key(twsk) ((twsk)->tw_md5_key) #else -static inline struct tcp_md5sig_key *tcp_md5_do_lookup(struct sock *sk, +static inline struct tcp_md5sig_key *tcp_md5_do_lookup(const struct sock *sk, const union tcp_md5_addr *addr, int family) { @@ -1684,7 +1684,7 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops, /* TCP af-specific functions */ struct tcp_sock_af_ops { #ifdef CONFIG_TCP_MD5SIG - struct tcp_md5sig_key *(*md5_lookup) (struct sock *sk, + struct tcp_md5sig_key *(*md5_lookup) (const struct sock *sk, const struct sock *addr_sk); int (*calc_md5_hash)(char *location, const struct tcp_md5sig_key *md5, @@ -1699,7 +1699,7 @@ struct tcp_sock_af_ops { struct tcp_request_sock_ops { u16 mss_clamp; #ifdef CONFIG_TCP_MD5SIG - struct tcp_md5sig_key *(*req_md5_lookup)(struct sock *sk, + struct tcp_md5sig_key *(*req_md5_lookup)(const struct sock *sk, const struct sock *addr_sk); int (*calc_md5_hash) (char *location, const struct tcp_md5sig_key *md5, diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 9d968ca7b669..1c1009d783f5 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -865,7 +865,7 @@ static void tcp_v4_reqsk_destructor(struct request_sock *req) */ /* Find the Key structure for an address. */ -struct tcp_md5sig_key *tcp_md5_do_lookup(struct sock *sk, +struct tcp_md5sig_key *tcp_md5_do_lookup(const struct sock *sk, const union tcp_md5_addr *addr, int family) { @@ -877,7 +877,7 @@ struct tcp_md5sig_key *tcp_md5_do_lookup(struct sock *sk, /* caller either holds rcu_read_lock() or socket lock */ md5sig = rcu_dereference_check(tp->md5sig_info, sock_owned_by_user(sk) || - lockdep_is_held(&sk->sk_lock.slock)); + lockdep_is_held((spinlock_t *)&sk->sk_lock.slock)); if (!md5sig) return NULL; #if IS_ENABLED(CONFIG_IPV6) @@ -894,7 +894,7 @@ struct tcp_md5sig_key *tcp_md5_do_lookup(struct sock *sk, } EXPORT_SYMBOL(tcp_md5_do_lookup); -struct tcp_md5sig_key *tcp_v4_md5_lookup(struct sock *sk, +struct tcp_md5sig_key *tcp_v4_md5_lookup(const struct sock *sk, const struct sock *addr_sk) { const union tcp_md5_addr *addr; diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 9016797445a2..1071131a6c53 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -476,13 +476,13 @@ static void tcp_v6_reqsk_destructor(struct request_sock *req) } #ifdef CONFIG_TCP_MD5SIG -static struct tcp_md5sig_key *tcp_v6_md5_do_lookup(struct sock *sk, +static struct tcp_md5sig_key *tcp_v6_md5_do_lookup(const struct sock *sk, const struct in6_addr *addr) { return tcp_md5_do_lookup(sk, (union tcp_md5_addr *)addr, AF_INET6); } -static struct tcp_md5sig_key *tcp_v6_md5_lookup(struct sock *sk, +static struct tcp_md5sig_key *tcp_v6_md5_lookup(const struct sock *sk, const struct sock *addr_sk) { return tcp_v6_md5_do_lookup(sk, &addr_sk->sk_v6_daddr); -- cgit v1.2.3 From 0f935dbedc49a5044ebff08b47eef35a2f2bbe92 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 25 Sep 2015 07:39:21 -0700 Subject: tcp: constify tcp_v{4|6}_send_synack() socket argument This documents fact that listener lock might not be held at the time SYNACK are sent. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/tcp.h | 2 +- net/ipv4/tcp_ipv4.c | 2 +- net/ipv6/tcp_ipv6.c | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) (limited to 'net/ipv4/tcp_ipv4.c') diff --git a/include/net/tcp.h b/include/net/tcp.h index 19f23590baa0..868c53532169 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -1717,7 +1717,7 @@ struct tcp_request_sock_ops { const struct request_sock *req, bool *strict); __u32 (*init_seq)(const struct sk_buff *skb); - int (*send_synack)(struct sock *sk, struct dst_entry *dst, + int (*send_synack)(const struct sock *sk, struct dst_entry *dst, struct flowi *fl, struct request_sock *req, u16 queue_mapping, struct tcp_fastopen_cookie *foc); void (*queue_hash_add)(struct sock *sk, struct request_sock *req, diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 1c1009d783f5..a23ba7daecbf 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -818,7 +818,7 @@ static void tcp_v4_reqsk_send_ack(struct sock *sk, struct sk_buff *skb, * This still operates on a request_sock only, not on a big * socket. */ -static int tcp_v4_send_synack(struct sock *sk, struct dst_entry *dst, +static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst, struct flowi *fl, struct request_sock *req, u16 queue_mapping, diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 1071131a6c53..16fb299dcab8 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -434,7 +434,7 @@ out: } -static int tcp_v6_send_synack(struct sock *sk, struct dst_entry *dst, +static int tcp_v6_send_synack(const struct sock *sk, struct dst_entry *dst, struct flowi *fl, struct request_sock *req, u16 queue_mapping, -- cgit v1.2.3 From a00e74442bac5ad19a929d097370da7e07540ea6 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 29 Sep 2015 07:42:39 -0700 Subject: tcp/dccp: constify send_synack and send_reset socket argument None of these functions need to change the socket, make it const. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/request_sock.h | 4 ++-- net/dccp/dccp.h | 2 +- net/dccp/ipv4.c | 2 +- net/dccp/ipv6.c | 2 +- net/dccp/minisocks.c | 2 +- net/ipv4/tcp_ipv4.c | 4 ++-- net/ipv6/tcp_ipv6.c | 12 ++++++------ 7 files changed, 14 insertions(+), 14 deletions(-) (limited to 'net/ipv4/tcp_ipv4.c') diff --git a/include/net/request_sock.h b/include/net/request_sock.h index 181f97f9fe1c..90247ec7955b 100644 --- a/include/net/request_sock.h +++ b/include/net/request_sock.h @@ -34,9 +34,9 @@ struct request_sock_ops { char *slab_name; int (*rtx_syn_ack)(const struct sock *sk, struct request_sock *req); - void (*send_ack)(struct sock *sk, struct sk_buff *skb, + void (*send_ack)(const struct sock *sk, struct sk_buff *skb, struct request_sock *req); - void (*send_reset)(struct sock *sk, + void (*send_reset)(const struct sock *sk, struct sk_buff *skb); void (*destructor)(struct request_sock *req); void (*syn_ack_timeout)(const struct request_sock *req); diff --git a/net/dccp/dccp.h b/net/dccp/dccp.h index 31e96df500d1..8ed1df2771bd 100644 --- a/net/dccp/dccp.h +++ b/net/dccp/dccp.h @@ -229,7 +229,7 @@ void dccp_v4_send_check(struct sock *sk, struct sk_buff *skb); int dccp_retransmit_skb(struct sock *sk); void dccp_send_ack(struct sock *sk); -void dccp_reqsk_send_ack(struct sock *sk, struct sk_buff *skb, +void dccp_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb, struct request_sock *rsk); void dccp_send_sync(struct sock *sk, const u64 seq, diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c index a46ae9c69ccf..00a14fa4270a 100644 --- a/net/dccp/ipv4.c +++ b/net/dccp/ipv4.c @@ -527,7 +527,7 @@ out: return err; } -static void dccp_v4_ctl_send_reset(struct sock *sk, struct sk_buff *rxskb) +static void dccp_v4_ctl_send_reset(const struct sock *sk, struct sk_buff *rxskb) { int err; const struct iphdr *rxiph; diff --git a/net/dccp/ipv6.c b/net/dccp/ipv6.c index 4fa199dc69a3..aa719e700961 100644 --- a/net/dccp/ipv6.c +++ b/net/dccp/ipv6.c @@ -234,7 +234,7 @@ static void dccp_v6_reqsk_destructor(struct request_sock *req) kfree_skb(inet_rsk(req)->pktopts); } -static void dccp_v6_ctl_send_reset(struct sock *sk, struct sk_buff *rxskb) +static void dccp_v6_ctl_send_reset(const struct sock *sk, struct sk_buff *rxskb) { const struct ipv6hdr *rxip6h; struct sk_buff *skb; diff --git a/net/dccp/minisocks.c b/net/dccp/minisocks.c index 838f524cf11a..9bfd0dc1e6cb 100644 --- a/net/dccp/minisocks.c +++ b/net/dccp/minisocks.c @@ -236,7 +236,7 @@ int dccp_child_process(struct sock *parent, struct sock *child, EXPORT_SYMBOL_GPL(dccp_child_process); -void dccp_reqsk_send_ack(struct sock *sk, struct sk_buff *skb, +void dccp_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb, struct request_sock *rsk) { DCCP_BUG("DCCP-ACK packets are never sent in LISTEN/RESPOND state"); diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index a23ba7daecbf..4300d0132b9f 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -576,7 +576,7 @@ EXPORT_SYMBOL(tcp_v4_send_check); * Exception: precedence violation. We do not implement it in any case. */ -static void tcp_v4_send_reset(struct sock *sk, struct sk_buff *skb) +static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb) { const struct tcphdr *th = tcp_hdr(skb); struct { @@ -795,7 +795,7 @@ static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb) inet_twsk_put(tw); } -static void tcp_v4_reqsk_send_ack(struct sock *sk, struct sk_buff *skb, +static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb, struct request_sock *req) { /* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 16fb299dcab8..c47e5c87a2a8 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -70,8 +70,8 @@ #include #include -static void tcp_v6_send_reset(struct sock *sk, struct sk_buff *skb); -static void tcp_v6_reqsk_send_ack(struct sock *sk, struct sk_buff *skb, +static void tcp_v6_send_reset(const struct sock *sk, struct sk_buff *skb); +static void tcp_v6_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb, struct request_sock *req); static int tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb); @@ -724,7 +724,7 @@ static const struct tcp_request_sock_ops tcp_request_sock_ipv6_ops = { .queue_hash_add = inet6_csk_reqsk_queue_hash_add, }; -static void tcp_v6_send_response(struct sock *sk, struct sk_buff *skb, u32 seq, +static void tcp_v6_send_response(const struct sock *sk, struct sk_buff *skb, u32 seq, u32 ack, u32 win, u32 tsval, u32 tsecr, int oif, struct tcp_md5sig_key *key, int rst, u8 tclass, u32 label) @@ -823,7 +823,7 @@ static void tcp_v6_send_response(struct sock *sk, struct sk_buff *skb, u32 seq, kfree_skb(buff); } -static void tcp_v6_send_reset(struct sock *sk, struct sk_buff *skb) +static void tcp_v6_send_reset(const struct sock *sk, struct sk_buff *skb) { const struct tcphdr *th = tcp_hdr(skb); u32 seq = 0, ack_seq = 0; @@ -894,7 +894,7 @@ release_sk1: #endif } -static void tcp_v6_send_ack(struct sock *sk, struct sk_buff *skb, u32 seq, +static void tcp_v6_send_ack(const struct sock *sk, struct sk_buff *skb, u32 seq, u32 ack, u32 win, u32 tsval, u32 tsecr, int oif, struct tcp_md5sig_key *key, u8 tclass, u32 label) @@ -917,7 +917,7 @@ static void tcp_v6_timewait_ack(struct sock *sk, struct sk_buff *skb) inet_twsk_put(tw); } -static void tcp_v6_reqsk_send_ack(struct sock *sk, struct sk_buff *skb, +static void tcp_v6_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb, struct request_sock *req) { /* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV -- cgit v1.2.3 From bda07a64c09c44ced789dbb815c71854f0c59839 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 29 Sep 2015 07:42:40 -0700 Subject: tcp: remove unused len argument from tcp_rcv_state_process() Once we realize tcp_rcv_synsent_state_process() does not use its 'len' argument and we get rid of it, then it becomes clear this argument is no longer used in tcp_rcv_state_process() Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/tcp.h | 2 +- net/ipv4/tcp_input.c | 6 +++--- net/ipv4/tcp_ipv4.c | 2 +- net/ipv4/tcp_minisocks.c | 3 +-- net/ipv6/tcp_ipv6.c | 2 +- 5 files changed, 7 insertions(+), 8 deletions(-) (limited to 'net/ipv4/tcp_ipv4.c') diff --git a/include/net/tcp.h b/include/net/tcp.h index cdbf63d3c5cf..1cfdedbe47e1 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -366,7 +366,7 @@ void tcp_write_timer_handler(struct sock *sk); void tcp_delack_timer_handler(struct sock *sk); int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg); int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, - const struct tcphdr *th, unsigned int len); + const struct tcphdr *th); void tcp_rcv_established(struct sock *sk, struct sk_buff *skb, const struct tcphdr *th, unsigned int len); void tcp_rcv_space_adjust(struct sock *sk); diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 4964d53907e9..dcbddf12f4b3 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -5472,7 +5472,7 @@ static bool tcp_rcv_fastopen_synack(struct sock *sk, struct sk_buff *synack, } static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb, - const struct tcphdr *th, unsigned int len) + const struct tcphdr *th) { struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); @@ -5699,7 +5699,7 @@ reset_and_undo: */ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, - const struct tcphdr *th, unsigned int len) + const struct tcphdr *th) { struct tcp_sock *tp = tcp_sk(sk); struct inet_connection_sock *icsk = inet_csk(sk); @@ -5749,7 +5749,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, goto discard; case TCP_SYN_SENT: - queued = tcp_rcv_synsent_state_process(sk, skb, th, len); + queued = tcp_rcv_synsent_state_process(sk, skb, th); if (queued >= 0) return queued; diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 4300d0132b9f..7e5ae1e01009 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -1420,7 +1420,7 @@ int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb) } else sock_rps_save_rxhash(sk, skb); - if (tcp_rcv_state_process(sk, skb, tcp_hdr(skb), skb->len)) { + if (tcp_rcv_state_process(sk, skb, tcp_hdr(skb))) { rsk = sk; goto reset; } diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index e4fe62b6b106..9c7c61cf7462 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -821,8 +821,7 @@ int tcp_child_process(struct sock *parent, struct sock *child, int state = child->sk_state; if (!sock_owned_by_user(child)) { - ret = tcp_rcv_state_process(child, skb, tcp_hdr(skb), - skb->len); + ret = tcp_rcv_state_process(child, skb, tcp_hdr(skb)); /* Wakeup parent, send SIGIO */ if (state == TCP_SYN_RECV && child->sk_state != state) parent->sk_data_ready(parent); diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index c47e5c87a2a8..b6e473f0f62e 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -1272,7 +1272,7 @@ static int tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb) } else sock_rps_save_rxhash(sk, skb); - if (tcp_rcv_state_process(sk, skb, tcp_hdr(skb), skb->len)) + if (tcp_rcv_state_process(sk, skb, tcp_hdr(skb))) goto reset; if (opt_skb) goto ipv6_pktoptions; -- cgit v1.2.3 From 72ab4a86f7a260d4c2a320b49662da107ce77a81 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 29 Sep 2015 07:42:41 -0700 Subject: tcp: remove tcp_rcv_state_process() tcp_hdr argument Factorize code to get tcp header from skb. It makes no sense to duplicate code in callers. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/tcp.h | 3 +-- net/ipv4/tcp_input.c | 4 ++-- net/ipv4/tcp_ipv4.c | 2 +- net/ipv4/tcp_minisocks.c | 2 +- net/ipv6/tcp_ipv6.c | 2 +- 5 files changed, 6 insertions(+), 7 deletions(-) (limited to 'net/ipv4/tcp_ipv4.c') diff --git a/include/net/tcp.h b/include/net/tcp.h index 1cfdedbe47e1..1fe0bd458cb4 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -365,8 +365,7 @@ void tcp_wfree(struct sk_buff *skb); void tcp_write_timer_handler(struct sock *sk); void tcp_delack_timer_handler(struct sock *sk); int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg); -int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, - const struct tcphdr *th); +int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb); void tcp_rcv_established(struct sock *sk, struct sk_buff *skb, const struct tcphdr *th, unsigned int len); void tcp_rcv_space_adjust(struct sock *sk); diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index dcbddf12f4b3..67b27aee8d28 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -5698,11 +5698,11 @@ reset_and_undo: * address independent. */ -int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, - const struct tcphdr *th) +int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb) { struct tcp_sock *tp = tcp_sk(sk); struct inet_connection_sock *icsk = inet_csk(sk); + const struct tcphdr *th = tcp_hdr(skb); struct request_sock *req; int queued = 0; bool acceptable; diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 7e5ae1e01009..67c0dc8bddbf 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -1420,7 +1420,7 @@ int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb) } else sock_rps_save_rxhash(sk, skb); - if (tcp_rcv_state_process(sk, skb, tcp_hdr(skb))) { + if (tcp_rcv_state_process(sk, skb)) { rsk = sk; goto reset; } diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index 9c7c61cf7462..139668cc2347 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -821,7 +821,7 @@ int tcp_child_process(struct sock *parent, struct sock *child, int state = child->sk_state; if (!sock_owned_by_user(child)) { - ret = tcp_rcv_state_process(child, skb, tcp_hdr(skb)); + ret = tcp_rcv_state_process(child, skb); /* Wakeup parent, send SIGIO */ if (state == TCP_SYN_RECV && child->sk_state != state) parent->sk_data_ready(parent); diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index b6e473f0f62e..334d548a0cf6 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -1272,7 +1272,7 @@ static int tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb) } else sock_rps_save_rxhash(sk, skb); - if (tcp_rcv_state_process(sk, skb, tcp_hdr(skb))) + if (tcp_rcv_state_process(sk, skb)) goto reset; if (opt_skb) goto ipv6_pktoptions; -- cgit v1.2.3 From 0c27171e66d94f9121fc00e87407ca7103bb6649 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 29 Sep 2015 07:42:48 -0700 Subject: tcp/dccp: constify syn_recv_sock() method sock argument We'll soon no longer hold listener socket lock, these functions do not modify the socket in any way. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/inet_connection_sock.h | 2 +- include/net/tcp.h | 2 +- net/dccp/dccp.h | 2 +- net/dccp/ipv4.c | 3 ++- net/dccp/ipv6.c | 5 +++-- net/ipv4/tcp_ipv4.c | 2 +- net/ipv6/tcp_ipv6.c | 5 +++-- 7 files changed, 12 insertions(+), 9 deletions(-) (limited to 'net/ipv4/tcp_ipv4.c') diff --git a/include/net/inet_connection_sock.h b/include/net/inet_connection_sock.h index 187cef7e56d5..ee54f21a8113 100644 --- a/include/net/inet_connection_sock.h +++ b/include/net/inet_connection_sock.h @@ -41,7 +41,7 @@ struct inet_connection_sock_af_ops { int (*rebuild_header)(struct sock *sk); void (*sk_rx_dst_set)(struct sock *sk, const struct sk_buff *skb); int (*conn_request)(struct sock *sk, struct sk_buff *skb); - struct sock *(*syn_recv_sock)(struct sock *sk, struct sk_buff *skb, + struct sock *(*syn_recv_sock)(const struct sock *sk, struct sk_buff *skb, struct request_sock *req, struct dst_entry *dst); u16 net_header_len; diff --git a/include/net/tcp.h b/include/net/tcp.h index 85995c1291d0..a1d2f5d6a430 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -454,7 +454,7 @@ struct sock *tcp_create_openreq_child(const struct sock *sk, struct request_sock *req, struct sk_buff *skb); void tcp_ca_openreq_child(struct sock *sk, const struct dst_entry *dst); -struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb, +struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb, struct request_sock *req, struct dst_entry *dst); int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb); diff --git a/net/dccp/dccp.h b/net/dccp/dccp.h index 2409619b7043..e1f823451565 100644 --- a/net/dccp/dccp.h +++ b/net/dccp/dccp.h @@ -276,7 +276,7 @@ struct sock *dccp_create_openreq_child(const struct sock *sk, int dccp_v4_do_rcv(struct sock *sk, struct sk_buff *skb); -struct sock *dccp_v4_request_recv_sock(struct sock *sk, struct sk_buff *skb, +struct sock *dccp_v4_request_recv_sock(const struct sock *sk, struct sk_buff *skb, struct request_sock *req, struct dst_entry *dst); struct sock *dccp_check_req(struct sock *sk, struct sk_buff *skb, diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c index 00a14fa4270a..5b7818c63cec 100644 --- a/net/dccp/ipv4.c +++ b/net/dccp/ipv4.c @@ -390,7 +390,8 @@ static inline u64 dccp_v4_init_sequence(const struct sk_buff *skb) * * This is the equivalent of TCP's tcp_v4_syn_recv_sock */ -struct sock *dccp_v4_request_recv_sock(struct sock *sk, struct sk_buff *skb, +struct sock *dccp_v4_request_recv_sock(const struct sock *sk, + struct sk_buff *skb, struct request_sock *req, struct dst_entry *dst) { diff --git a/net/dccp/ipv6.c b/net/dccp/ipv6.c index 0966bc08d362..e8753aa3b7a4 100644 --- a/net/dccp/ipv6.c +++ b/net/dccp/ipv6.c @@ -408,13 +408,14 @@ drop: return -1; } -static struct sock *dccp_v6_request_recv_sock(struct sock *sk, +static struct sock *dccp_v6_request_recv_sock(const struct sock *sk, struct sk_buff *skb, struct request_sock *req, struct dst_entry *dst) { struct inet_request_sock *ireq = inet_rsk(req); - struct ipv6_pinfo *newnp, *np = inet6_sk(sk); + struct ipv6_pinfo *newnp; + const struct ipv6_pinfo *np = inet6_sk(sk); struct inet_sock *newinet; struct dccp6_sock *newdp6; struct sock *newsk; diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 67c0dc8bddbf..ee0239e190cf 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -1242,7 +1242,7 @@ EXPORT_SYMBOL(tcp_v4_conn_request); * The three way handshake has completed - we got a valid synack - * now create the new socket. */ -struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb, +struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb, struct request_sock *req, struct dst_entry *dst) { diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 092a23ef1feb..2330c7be6323 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -986,12 +986,13 @@ drop: return 0; /* don't send reset */ } -static struct sock *tcp_v6_syn_recv_sock(struct sock *sk, struct sk_buff *skb, +static struct sock *tcp_v6_syn_recv_sock(const struct sock *sk, struct sk_buff *skb, struct request_sock *req, struct dst_entry *dst) { struct inet_request_sock *ireq; - struct ipv6_pinfo *newnp, *np = inet6_sk(sk); + struct ipv6_pinfo *newnp; + const struct ipv6_pinfo *np = inet6_sk(sk); struct tcp6_sock *newtcp6sk; struct inet_sock *newinet; struct tcp_sock *newtp; -- cgit v1.2.3 From f964629e3338d9e5a78c9b354380d5a1e2fa4617 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 29 Sep 2015 07:42:50 -0700 Subject: tcp: constify tcp_v{4|6}_route_req() sock argument These functions do not change the listener socket. Goal is to make sure tcp_conn_request() is not messing with listener in a racy way. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/tcp.h | 2 +- net/ipv4/tcp_ipv4.c | 3 ++- net/ipv6/tcp_ipv6.c | 3 ++- 3 files changed, 5 insertions(+), 3 deletions(-) (limited to 'net/ipv4/tcp_ipv4.c') diff --git a/include/net/tcp.h b/include/net/tcp.h index 5aa6672c6f5b..2c7dfe52f473 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -1711,7 +1711,7 @@ struct tcp_request_sock_ops { __u32 (*cookie_init_seq)(const struct sk_buff *skb, __u16 *mss); #endif - struct dst_entry *(*route_req)(struct sock *sk, struct flowi *fl, + struct dst_entry *(*route_req)(const struct sock *sk, struct flowi *fl, const struct request_sock *req, bool *strict); __u32 (*init_seq)(const struct sk_buff *skb); diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index ee0239e190cf..f551e9e862db 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -1180,7 +1180,8 @@ static void tcp_v4_init_req(struct request_sock *req, ireq->opt = tcp_v4_save_options(skb); } -static struct dst_entry *tcp_v4_route_req(struct sock *sk, struct flowi *fl, +static struct dst_entry *tcp_v4_route_req(const struct sock *sk, + struct flowi *fl, const struct request_sock *req, bool *strict) { diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 2330c7be6323..97bc26e0cd0f 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -689,7 +689,8 @@ static void tcp_v6_init_req(struct request_sock *req, } } -static struct dst_entry *tcp_v6_route_req(struct sock *sk, struct flowi *fl, +static struct dst_entry *tcp_v6_route_req(const struct sock *sk, + struct flowi *fl, const struct request_sock *req, bool *strict) { -- cgit v1.2.3 From 0536fcc039a8926ec12ec587f41a83f7acafeb82 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 29 Sep 2015 07:42:52 -0700 Subject: tcp: prepare fastopen code for upcoming listener changes While auditing TCP stack for upcoming 'lockless' listener changes, I found I had to change fastopen_init_queue() to properly init the object before publishing it. Otherwise an other cpu could try to lock the spinlock before it gets properly initialized. Instead of adding appropriate barriers, just remove dynamic memory allocations : - Structure is 28 bytes on 64bit arches. Using additional 8 bytes for holding a pointer seems overkill. - Two listeners can share same cache line and performance would suffer. If we really want to save few bytes, we would instead dynamically allocate whole struct request_sock_queue in the future. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/linux/tcp.h | 22 ++++------------------ include/net/request_sock.h | 7 ++----- net/core/request_sock.c | 9 ++++++++- net/ipv4/af_inet.c | 10 +++------- net/ipv4/inet_connection_sock.c | 17 ++++++++--------- net/ipv4/tcp.c | 14 ++------------ net/ipv4/tcp_fastopen.c | 10 +++++----- net/ipv4/tcp_ipv4.c | 2 +- net/ipv6/tcp_ipv6.c | 4 ++-- 9 files changed, 35 insertions(+), 60 deletions(-) (limited to 'net/ipv4/tcp_ipv4.c') diff --git a/include/linux/tcp.h b/include/linux/tcp.h index fcb573be75d9..e442e6e9a365 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -382,25 +382,11 @@ static inline bool tcp_passive_fastopen(const struct sock *sk) tcp_sk(sk)->fastopen_rsk != NULL); } -extern void tcp_sock_destruct(struct sock *sk); - -static inline int fastopen_init_queue(struct sock *sk, int backlog) +static inline void fastopen_queue_tune(struct sock *sk, int backlog) { - struct request_sock_queue *queue = - &inet_csk(sk)->icsk_accept_queue; - - if (queue->fastopenq == NULL) { - queue->fastopenq = kzalloc( - sizeof(struct fastopen_queue), - sk->sk_allocation); - if (queue->fastopenq == NULL) - return -ENOMEM; - - sk->sk_destruct = tcp_sock_destruct; - spin_lock_init(&queue->fastopenq->lock); - } - queue->fastopenq->max_qlen = backlog; - return 0; + struct request_sock_queue *queue = &inet_csk(sk)->icsk_accept_queue; + + queue->fastopenq.max_qlen = backlog; } static inline void tcp_saved_syn_free(struct tcp_sock *tp) diff --git a/include/net/request_sock.h b/include/net/request_sock.h index c146b5284786..d2544de329bd 100644 --- a/include/net/request_sock.h +++ b/include/net/request_sock.h @@ -180,11 +180,8 @@ struct request_sock_queue { struct request_sock *rskq_accept_tail; u8 rskq_defer_accept; struct listen_sock *listen_opt; - struct fastopen_queue *fastopenq; /* This is non-NULL iff TFO has been - * enabled on this listener. Check - * max_qlen != 0 in fastopen_queue - * to determine if TFO is enabled - * right at this moment. + struct fastopen_queue fastopenq; /* Check max_qlen != 0 to determine + * if TFO is enabled. */ /* temporary alignment, our goal is to get rid of this lock */ diff --git a/net/core/request_sock.c b/net/core/request_sock.c index b42f0e26f89e..e22cfa4ed25f 100644 --- a/net/core/request_sock.c +++ b/net/core/request_sock.c @@ -59,6 +59,13 @@ int reqsk_queue_alloc(struct request_sock_queue *queue, get_random_bytes(&lopt->hash_rnd, sizeof(lopt->hash_rnd)); spin_lock_init(&queue->syn_wait_lock); + + spin_lock_init(&queue->fastopenq.lock); + queue->fastopenq.rskq_rst_head = NULL; + queue->fastopenq.rskq_rst_tail = NULL; + queue->fastopenq.qlen = 0; + queue->fastopenq.max_qlen = 0; + queue->rskq_accept_head = NULL; lopt->nr_table_entries = nr_table_entries; lopt->max_qlen_log = ilog2(nr_table_entries); @@ -174,7 +181,7 @@ void reqsk_fastopen_remove(struct sock *sk, struct request_sock *req, struct sock *lsk = req->rsk_listener; struct fastopen_queue *fastopenq; - fastopenq = inet_csk(lsk)->icsk_accept_queue.fastopenq; + fastopenq = &inet_csk(lsk)->icsk_accept_queue.fastopenq; tcp_sk(sk)->fastopen_rsk = NULL; spin_lock_bh(&fastopenq->lock); diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index 8a556643b874..3af85eecbe11 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -219,17 +219,13 @@ int inet_listen(struct socket *sock, int backlog) * shutdown() (rather than close()). */ if ((sysctl_tcp_fastopen & TFO_SERVER_ENABLE) != 0 && - !inet_csk(sk)->icsk_accept_queue.fastopenq) { + !inet_csk(sk)->icsk_accept_queue.fastopenq.max_qlen) { if ((sysctl_tcp_fastopen & TFO_SERVER_WO_SOCKOPT1) != 0) - err = fastopen_init_queue(sk, backlog); + fastopen_queue_tune(sk, backlog); else if ((sysctl_tcp_fastopen & TFO_SERVER_WO_SOCKOPT2) != 0) - err = fastopen_init_queue(sk, + fastopen_queue_tune(sk, ((uint)sysctl_tcp_fastopen) >> 16); - else - err = 0; - if (err) - goto out; tcp_fastopen_init_key_once(true); } diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index 694a5e8f4f9f..e1527882a578 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -335,9 +335,8 @@ struct sock *inet_csk_accept(struct sock *sk, int flags, int *err) sk_acceptq_removed(sk); if (sk->sk_protocol == IPPROTO_TCP && - tcp_rsk(req)->tfo_listener && - queue->fastopenq) { - spin_lock_bh(&queue->fastopenq->lock); + tcp_rsk(req)->tfo_listener) { + spin_lock_bh(&queue->fastopenq.lock); if (tcp_rsk(req)->tfo_listener) { /* We are still waiting for the final ACK from 3WHS * so can't free req now. Instead, we set req->sk to @@ -348,7 +347,7 @@ struct sock *inet_csk_accept(struct sock *sk, int flags, int *err) req->sk = NULL; req = NULL; } - spin_unlock_bh(&queue->fastopenq->lock); + spin_unlock_bh(&queue->fastopenq.lock); } out: release_sock(sk); @@ -886,12 +885,12 @@ void inet_csk_listen_stop(struct sock *sk) sk_acceptq_removed(sk); reqsk_put(req); } - if (queue->fastopenq) { + if (queue->fastopenq.rskq_rst_head) { /* Free all the reqs queued in rskq_rst_head. */ - spin_lock_bh(&queue->fastopenq->lock); - acc_req = queue->fastopenq->rskq_rst_head; - queue->fastopenq->rskq_rst_head = NULL; - spin_unlock_bh(&queue->fastopenq->lock); + spin_lock_bh(&queue->fastopenq.lock); + acc_req = queue->fastopenq.rskq_rst_head; + queue->fastopenq.rskq_rst_head = NULL; + spin_unlock_bh(&queue->fastopenq.lock); while ((req = acc_req) != NULL) { acc_req = req->dl_next; reqsk_put(req); diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index b8b8fa184f75..3c96fa87ff9e 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -2253,13 +2253,6 @@ int tcp_disconnect(struct sock *sk, int flags) } EXPORT_SYMBOL(tcp_disconnect); -void tcp_sock_destruct(struct sock *sk) -{ - inet_sock_destruct(sk); - - kfree(inet_csk(sk)->icsk_accept_queue.fastopenq); -} - static inline bool tcp_can_repair_sock(const struct sock *sk) { return ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN) && @@ -2581,7 +2574,7 @@ static int do_tcp_setsockopt(struct sock *sk, int level, TCPF_LISTEN))) { tcp_fastopen_init_key_once(true); - err = fastopen_init_queue(sk, val); + fastopen_queue_tune(sk, val); } else { err = -EINVAL; } @@ -2849,10 +2842,7 @@ static int do_tcp_getsockopt(struct sock *sk, int level, break; case TCP_FASTOPEN: - if (icsk->icsk_accept_queue.fastopenq) - val = icsk->icsk_accept_queue.fastopenq->max_qlen; - else - val = 0; + val = icsk->icsk_accept_queue.fastopenq.max_qlen; break; case TCP_TIMESTAMP: diff --git a/net/ipv4/tcp_fastopen.c b/net/ipv4/tcp_fastopen.c index db43c6286cf7..f69f436fcbcc 100644 --- a/net/ipv4/tcp_fastopen.c +++ b/net/ipv4/tcp_fastopen.c @@ -142,9 +142,9 @@ static struct sock *tcp_fastopen_create_child(struct sock *sk, if (!child) return NULL; - spin_lock(&queue->fastopenq->lock); - queue->fastopenq->qlen++; - spin_unlock(&queue->fastopenq->lock); + spin_lock(&queue->fastopenq.lock); + queue->fastopenq.qlen++; + spin_unlock(&queue->fastopenq.lock); /* Initialize the child socket. Have to fix some values to take * into account the child is a Fast Open socket and is created @@ -237,8 +237,8 @@ static bool tcp_fastopen_queue_check(struct sock *sk) * between qlen overflow causing Fast Open to be disabled * temporarily vs a server not supporting Fast Open at all. */ - fastopenq = inet_csk(sk)->icsk_accept_queue.fastopenq; - if (!fastopenq || fastopenq->max_qlen == 0) + fastopenq = &inet_csk(sk)->icsk_accept_queue.fastopenq; + if (fastopenq->max_qlen == 0) return false; if (fastopenq->qlen >= fastopenq->max_qlen) { diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index f551e9e862db..64ece718d66c 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -2186,7 +2186,7 @@ static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i) const struct tcp_sock *tp = tcp_sk(sk); const struct inet_connection_sock *icsk = inet_csk(sk); const struct inet_sock *inet = inet_sk(sk); - struct fastopen_queue *fastopenq = icsk->icsk_accept_queue.fastopenq; + const struct fastopen_queue *fastopenq = &icsk->icsk_accept_queue.fastopenq; __be32 dest = inet->inet_daddr; __be32 src = inet->inet_rcv_saddr; __u16 destp = ntohs(inet->inet_dport); diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 97bc26e0cd0f..0ac64f47f882 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -1672,7 +1672,7 @@ static void get_tcp6_sock(struct seq_file *seq, struct sock *sp, int i) const struct inet_sock *inet = inet_sk(sp); const struct tcp_sock *tp = tcp_sk(sp); const struct inet_connection_sock *icsk = inet_csk(sp); - struct fastopen_queue *fastopenq = icsk->icsk_accept_queue.fastopenq; + const struct fastopen_queue *fastopenq = &icsk->icsk_accept_queue.fastopenq; dest = &sp->sk_v6_daddr; src = &sp->sk_v6_rcv_saddr; @@ -1716,7 +1716,7 @@ static void get_tcp6_sock(struct seq_file *seq, struct sock *sp, int i) (icsk->icsk_ack.quick << 1) | icsk->icsk_ack.pingpong, tp->snd_cwnd, sp->sk_state == TCP_LISTEN ? - (fastopenq ? fastopenq->max_qlen : 0) : + fastopenq->max_qlen : (tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh) ); } -- cgit v1.2.3 From 38cb52455c2c3e8b5751350a3fb32e43e82e129a Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 2 Oct 2015 11:43:26 -0700 Subject: tcp: call sk_mark_napi_id() on the child, not the listener This fixes a typo : We want to store the NAPI id on child socket. Presumably nobody really uses busy polling, on short lived flows. Fixes: 3d97379a67486 ("tcp: move sk_mark_napi_id() at the right place") Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/tcp_ipv4.c | 2 +- net/ipv6/tcp_ipv6.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'net/ipv4/tcp_ipv4.c') diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 64ece718d66c..2fb0945b9d83 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -1411,7 +1411,7 @@ int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb) if (nsk != sk) { sock_rps_save_rxhash(nsk, skb); - sk_mark_napi_id(sk, skb); + sk_mark_napi_id(nsk, skb); if (tcp_child_process(sk, nsk, skb)) { rsk = nsk; goto reset; diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 2ae95e1d03e1..e463583c39ee 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -1265,7 +1265,7 @@ static int tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb) */ if (nsk != sk) { sock_rps_save_rxhash(nsk, skb); - sk_mark_napi_id(sk, skb); + sk_mark_napi_id(nsk, skb); if (tcp_child_process(sk, nsk, skb)) goto reset; if (opt_skb) -- cgit v1.2.3 From ba8e275a457397ab06f3567cf7bef0d78a43ae7e Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 2 Oct 2015 11:43:28 -0700 Subject: tcp: cleanup tcp_v[46]_inbound_md5_hash() We'll soon have to call tcp_v[46]_inbound_md5_hash() twice. Also add const attribute to the socket, as it might be the unlocked listener for SYN packets. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/tcp_ipv4.c | 16 ++++++---------- net/ipv6/tcp_ipv6.c | 10 ++++++---- 2 files changed, 12 insertions(+), 14 deletions(-) (limited to 'net/ipv4/tcp_ipv4.c') diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 2fb0945b9d83..56f8c6395966 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -1112,10 +1112,13 @@ clear_hash_noput: } EXPORT_SYMBOL(tcp_v4_md5_hash_skb); +#endif + /* Called with rcu_read_lock() */ -static bool tcp_v4_inbound_md5_hash(struct sock *sk, +static bool tcp_v4_inbound_md5_hash(const struct sock *sk, const struct sk_buff *skb) { +#ifdef CONFIG_TCP_MD5SIG /* * This gets called for each TCP segment that arrives * so we want to be efficient. @@ -1165,8 +1168,9 @@ static bool tcp_v4_inbound_md5_hash(struct sock *sk, return true; } return false; -} #endif + return false; +} static void tcp_v4_init_req(struct request_sock *req, const struct sock *sk_listener, @@ -1607,16 +1611,8 @@ process: if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb)) goto discard_and_relse; -#ifdef CONFIG_TCP_MD5SIG - /* - * We really want to reject the packet as early as possible - * if: - * o We're expecting an MD5'd packet and this is no MD5 tcp option - * o There is an MD5 option and we're not expecting one - */ if (tcp_v4_inbound_md5_hash(sk, skb)) goto discard_and_relse; -#endif nf_reset(skb); diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index e463583c39ee..65e797dba504 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -622,8 +622,12 @@ clear_hash_noput: return 1; } -static bool tcp_v6_inbound_md5_hash(struct sock *sk, const struct sk_buff *skb) +#endif + +static bool tcp_v6_inbound_md5_hash(const struct sock *sk, + const struct sk_buff *skb) { +#ifdef CONFIG_TCP_MD5SIG const __u8 *hash_location = NULL; struct tcp_md5sig_key *hash_expected; const struct ipv6hdr *ip6h = ipv6_hdr(skb); @@ -660,9 +664,9 @@ static bool tcp_v6_inbound_md5_hash(struct sock *sk, const struct sk_buff *skb) &ip6h->daddr, ntohs(th->dest)); return true; } +#endif return false; } -#endif static void tcp_v6_init_req(struct request_sock *req, const struct sock *sk_listener, @@ -1408,10 +1412,8 @@ process: tcp_v6_fill_cb(skb, hdr, th); -#ifdef CONFIG_TCP_MD5SIG if (tcp_v6_inbound_md5_hash(sk, skb)) goto discard_and_relse; -#endif if (sk_filter(sk, skb)) goto discard_and_relse; -- cgit v1.2.3 From aa3a0c8ce651b5e16124866b0a10d1b90b9ef022 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 2 Oct 2015 11:43:30 -0700 Subject: tcp: get_openreq[46]() changes When request sockets are no longer in a per listener hash table but on regular TCP ehash, we need to access listener uid through req->rsk_listener get_openreq6() also gets a const for its request socket argument. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/tcp.h | 1 - net/ipv4/tcp_ipv4.c | 8 ++++---- net/ipv6/tcp_ipv6.c | 7 ++++--- 3 files changed, 8 insertions(+), 8 deletions(-) (limited to 'net/ipv4/tcp_ipv4.c') diff --git a/include/net/tcp.h b/include/net/tcp.h index 2c7dfe52f473..a26341d2ad67 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -1637,7 +1637,6 @@ struct tcp_iter_state { enum tcp_seq_states state; struct sock *syn_wait_sk; int bucket, offset, sbucket, num; - kuid_t uid; loff_t last_pos; }; diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 56f8c6395966..a33101616215 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -1871,7 +1871,6 @@ get_sk: spin_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock); if (reqsk_queue_len(&icsk->icsk_accept_queue)) { start_req: - st->uid = sock_i_uid(sk); st->syn_wait_sk = sk; st->state = TCP_SEQ_STATE_OPENREQ; st->sbucket = 0; @@ -2151,7 +2150,7 @@ void tcp_proc_unregister(struct net *net, struct tcp_seq_afinfo *afinfo) EXPORT_SYMBOL(tcp_proc_unregister); static void get_openreq4(const struct request_sock *req, - struct seq_file *f, int i, kuid_t uid) + struct seq_file *f, int i) { const struct inet_request_sock *ireq = inet_rsk(req); long delta = req->rsk_timer.expires - jiffies; @@ -2168,7 +2167,8 @@ static void get_openreq4(const struct request_sock *req, 1, /* timers active (only the expire timer) */ jiffies_delta_to_clock_t(delta), req->num_timeout, - from_kuid_munged(seq_user_ns(f), uid), + from_kuid_munged(seq_user_ns(f), + sock_i_uid(req->rsk_listener)), 0, /* non standard timer */ 0, /* open_requests have no inode */ 0, @@ -2278,7 +2278,7 @@ static int tcp4_seq_show(struct seq_file *seq, void *v) get_tcp4_sock(v, seq, st->num); break; case TCP_SEQ_STATE_OPENREQ: - get_openreq4(v, seq, st->num, st->uid); + get_openreq4(v, seq, st->num); break; } out: diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 65e797dba504..cadb44a2d34e 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -1635,7 +1635,7 @@ static void tcp_v6_destroy_sock(struct sock *sk) #ifdef CONFIG_PROC_FS /* Proc filesystem TCPv6 sock list dumping. */ static void get_openreq6(struct seq_file *seq, - struct request_sock *req, int i, kuid_t uid) + const struct request_sock *req, int i) { long ttd = req->rsk_timer.expires - jiffies; const struct in6_addr *src = &inet_rsk(req)->ir_v6_loc_addr; @@ -1659,7 +1659,8 @@ static void get_openreq6(struct seq_file *seq, 1, /* timers active (only the expire timer) */ jiffies_to_clock_t(ttd), req->num_timeout, - from_kuid_munged(seq_user_ns(seq), uid), + from_kuid_munged(seq_user_ns(seq), + sock_i_uid(req->rsk_listener)), 0, /* non standard timer */ 0, /* open_requests have no inode */ 0, req); @@ -1773,7 +1774,7 @@ static int tcp6_seq_show(struct seq_file *seq, void *v) get_tcp6_sock(seq, v, st->num); break; case TCP_SEQ_STATE_OPENREQ: - get_openreq6(seq, v, st->num, st->uid); + get_openreq6(seq, v, st->num); break; } out: -- cgit v1.2.3 From 079096f103faca2dd87342cca6f23d4b34da8871 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 2 Oct 2015 11:43:32 -0700 Subject: tcp/dccp: install syn_recv requests into ehash table In this patch, we insert request sockets into TCP/DCCP regular ehash table (where ESTABLISHED and TIMEWAIT sockets are) instead of using the per listener hash table. ACK packets find SYN_RECV pseudo sockets without having to find and lock the listener. In nominal conditions, this halves pressure on listener lock. Note that this will allow for SO_REUSEPORT refinements, so that we can select a listener using cpu/numa affinities instead of the prior 'consistent hash', since only SYN packets will apply this selection logic. We will shrink listen_sock in the following patch to ease code review. Signed-off-by: Eric Dumazet Cc: Ying Cai Cc: Willem de Bruijn Signed-off-by: David S. Miller --- include/net/inet_connection_sock.h | 4 -- include/net/inet_hashtables.h | 1 + include/net/request_sock.h | 4 -- include/net/tcp.h | 3 - net/core/request_sock.c | 28 +-------- net/dccp/ipv4.c | 64 +++++++------------- net/dccp/ipv6.c | 72 +++++++---------------- net/ipv4/inet_connection_sock.c | 103 +++++++------------------------- net/ipv4/inet_diag.c | 96 +++--------------------------- net/ipv4/inet_hashtables.c | 14 ++++- net/ipv4/syncookies.c | 4 ++ net/ipv4/tcp_input.c | 2 +- net/ipv4/tcp_ipv4.c | 117 +++++++++++-------------------------- net/ipv6/inet6_connection_sock.c | 67 --------------------- net/ipv6/tcp_ipv6.c | 82 ++++++++++++-------------- 15 files changed, 160 insertions(+), 501 deletions(-) (limited to 'net/ipv4/tcp_ipv4.c') diff --git a/include/net/inet_connection_sock.h b/include/net/inet_connection_sock.h index b2e2e30befa9..730aa034cd3d 100644 --- a/include/net/inet_connection_sock.h +++ b/include/net/inet_connection_sock.h @@ -258,10 +258,6 @@ inet_csk_rto_backoff(const struct inet_connection_sock *icsk, struct sock *inet_csk_accept(struct sock *sk, int flags, int *err); -struct request_sock *inet_csk_search_req(struct sock *sk, - const __be16 rport, - const __be32 raddr, - const __be32 laddr); int inet_csk_bind_conflict(const struct sock *sk, const struct inet_bind_bucket *tb, bool relax); int inet_csk_get_port(struct sock *sk, unsigned short snum); diff --git a/include/net/inet_hashtables.h b/include/net/inet_hashtables.h index 3fb778d7c875..6683ada25fef 100644 --- a/include/net/inet_hashtables.h +++ b/include/net/inet_hashtables.h @@ -205,6 +205,7 @@ void inet_put_port(struct sock *sk); void inet_hashinfo_init(struct inet_hashinfo *h); +int inet_ehash_insert(struct sock *sk, struct sock *osk); void __inet_hash_nolisten(struct sock *sk, struct sock *osk); void __inet_hash(struct sock *sk, struct sock *osk); void inet_hash(struct sock *sk); diff --git a/include/net/request_sock.h b/include/net/request_sock.h index 97c1ba61ed2d..e1850923c4f5 100644 --- a/include/net/request_sock.h +++ b/include/net/request_sock.h @@ -266,8 +266,4 @@ static inline int reqsk_queue_is_full(const struct request_sock_queue *queue) return reqsk_queue_len(queue) >> queue->listen_opt->max_qlen_log; } -void reqsk_queue_hash_req(struct request_sock_queue *queue, - u32 hash, struct request_sock *req, - unsigned long timeout); - #endif /* _REQUEST_SOCK_H */ diff --git a/include/net/tcp.h b/include/net/tcp.h index a26341d2ad67..225e9561af35 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -1618,7 +1618,6 @@ static inline bool tcp_stream_is_thin(struct tcp_sock *tp) /* /proc */ enum tcp_seq_states { TCP_SEQ_STATE_LISTENING, - TCP_SEQ_STATE_OPENREQ, TCP_SEQ_STATE_ESTABLISHED, }; @@ -1717,8 +1716,6 @@ struct tcp_request_sock_ops { int (*send_synack)(const struct sock *sk, struct dst_entry *dst, struct flowi *fl, struct request_sock *req, u16 queue_mapping, struct tcp_fastopen_cookie *foc); - void (*queue_hash_add)(struct sock *sk, struct request_sock *req, - const unsigned long timeout); }; #ifdef CONFIG_SYN_COOKIES diff --git a/net/core/request_sock.c b/net/core/request_sock.c index 5ca624cea04c..a4b305d8ca2b 100644 --- a/net/core/request_sock.c +++ b/net/core/request_sock.c @@ -99,35 +99,9 @@ static inline struct listen_sock *reqsk_queue_yank_listen_sk( void reqsk_queue_destroy(struct request_sock_queue *queue) { - /* make all the listen_opt local to us */ struct listen_sock *lopt = reqsk_queue_yank_listen_sk(queue); - if (reqsk_queue_len(queue) != 0) { - unsigned int i; - - for (i = 0; i < lopt->nr_table_entries; i++) { - struct request_sock *req; - - spin_lock_bh(&queue->syn_wait_lock); - while ((req = lopt->syn_table[i]) != NULL) { - lopt->syn_table[i] = req->dl_next; - /* Because of following del_timer_sync(), - * we must release the spinlock here - * or risk a dead lock. - */ - spin_unlock_bh(&queue->syn_wait_lock); - atomic_dec(&queue->qlen); - if (del_timer_sync(&req->rsk_timer)) - reqsk_put(req); - reqsk_put(req); - spin_lock_bh(&queue->syn_wait_lock); - } - spin_unlock_bh(&queue->syn_wait_lock); - } - } - - if (WARN_ON(reqsk_queue_len(queue) != 0)) - pr_err("qlen %u\n", reqsk_queue_len(queue)); + /* cleaning is done by req timers */ kvfree(lopt); } diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c index 5b7818c63cec..8910c9567719 100644 --- a/net/dccp/ipv4.c +++ b/net/dccp/ipv4.c @@ -444,36 +444,6 @@ put_and_exit: } EXPORT_SYMBOL_GPL(dccp_v4_request_recv_sock); -static struct sock *dccp_v4_hnd_req(struct sock *sk, struct sk_buff *skb) -{ - const struct dccp_hdr *dh = dccp_hdr(skb); - const struct iphdr *iph = ip_hdr(skb); - struct sock *nsk; - /* Find possible connection requests. */ - struct request_sock *req = inet_csk_search_req(sk, dh->dccph_sport, - iph->saddr, iph->daddr); - if (req) { - nsk = dccp_check_req(sk, skb, req); - if (!nsk) - reqsk_put(req); - return nsk; - } - nsk = inet_lookup_established(sock_net(sk), &dccp_hashinfo, - iph->saddr, dh->dccph_sport, - iph->daddr, dh->dccph_dport, - inet_iif(skb)); - if (nsk != NULL) { - if (nsk->sk_state != DCCP_TIME_WAIT) { - bh_lock_sock(nsk); - return nsk; - } - inet_twsk_put(inet_twsk(nsk)); - return NULL; - } - - return sk; -} - static struct dst_entry* dccp_v4_route_skb(struct net *net, struct sock *sk, struct sk_buff *skb) { @@ -705,18 +675,6 @@ int dccp_v4_do_rcv(struct sock *sk, struct sk_buff *skb) * NOTE: the check for the packet types is done in * dccp_rcv_state_process */ - if (sk->sk_state == DCCP_LISTEN) { - struct sock *nsk = dccp_v4_hnd_req(sk, skb); - - if (nsk == NULL) - goto discard; - - if (nsk != sk) { - if (dccp_child_process(sk, nsk, skb)) - goto reset; - return 0; - } - } if (dccp_rcv_state_process(sk, skb, dh, skb->len)) goto reset; @@ -724,7 +682,6 @@ int dccp_v4_do_rcv(struct sock *sk, struct sk_buff *skb) reset: dccp_v4_ctl_send_reset(sk, skb); -discard: kfree_skb(skb); return 0; } @@ -868,6 +825,27 @@ static int dccp_v4_rcv(struct sk_buff *skb) goto no_dccp_socket; } + if (sk->sk_state == DCCP_NEW_SYN_RECV) { + struct request_sock *req = inet_reqsk(sk); + struct sock *nsk = NULL; + + sk = req->rsk_listener; + if (sk->sk_state == DCCP_LISTEN) + nsk = dccp_check_req(sk, skb, req); + if (!nsk) { + reqsk_put(req); + goto discard_it; + } + if (nsk == sk) { + sock_hold(sk); + reqsk_put(req); + } else if (dccp_child_process(sk, nsk, skb)) { + dccp_v4_ctl_send_reset(sk, skb); + goto discard_it; + } else { + return 0; + } + } /* * RFC 4340, sec. 9.2.1: Minimum Checksum Coverage * o if MinCsCov = 0, only packets with CsCov = 0 are accepted diff --git a/net/dccp/ipv6.c b/net/dccp/ipv6.c index e8753aa3b7a4..1361a3f45df7 100644 --- a/net/dccp/ipv6.c +++ b/net/dccp/ipv6.c @@ -290,37 +290,6 @@ static struct request_sock_ops dccp6_request_sock_ops = { .syn_ack_timeout = dccp_syn_ack_timeout, }; -static struct sock *dccp_v6_hnd_req(struct sock *sk,struct sk_buff *skb) -{ - const struct dccp_hdr *dh = dccp_hdr(skb); - const struct ipv6hdr *iph = ipv6_hdr(skb); - struct request_sock *req; - struct sock *nsk; - - req = inet6_csk_search_req(sk, dh->dccph_sport, &iph->saddr, - &iph->daddr, inet6_iif(skb)); - if (req) { - nsk = dccp_check_req(sk, skb, req); - if (!nsk) - reqsk_put(req); - return nsk; - } - nsk = __inet6_lookup_established(sock_net(sk), &dccp_hashinfo, - &iph->saddr, dh->dccph_sport, - &iph->daddr, ntohs(dh->dccph_dport), - inet6_iif(skb)); - if (nsk != NULL) { - if (nsk->sk_state != DCCP_TIME_WAIT) { - bh_lock_sock(nsk); - return nsk; - } - inet_twsk_put(inet_twsk(nsk)); - return NULL; - } - - return sk; -} - static int dccp_v6_conn_request(struct sock *sk, struct sk_buff *skb) { struct request_sock *req; @@ -398,7 +367,7 @@ static int dccp_v6_conn_request(struct sock *sk, struct sk_buff *skb) if (dccp_v6_send_response(sk, req)) goto drop_and_free; - inet6_csk_reqsk_queue_hash_add(sk, req, DCCP_TIMEOUT_INIT); + inet_csk_reqsk_queue_hash_add(sk, req, DCCP_TIMEOUT_INIT); return 0; drop_and_free: @@ -641,24 +610,6 @@ static int dccp_v6_do_rcv(struct sock *sk, struct sk_buff *skb) * NOTE: the check for the packet types is done in * dccp_rcv_state_process */ - if (sk->sk_state == DCCP_LISTEN) { - struct sock *nsk = dccp_v6_hnd_req(sk, skb); - - if (nsk == NULL) - goto discard; - /* - * Queue it on the new socket if the new socket is active, - * otherwise we just shortcircuit this and continue with - * the new socket.. - */ - if (nsk != sk) { - if (dccp_child_process(sk, nsk, skb)) - goto reset; - if (opt_skb != NULL) - __kfree_skb(opt_skb); - return 0; - } - } if (dccp_rcv_state_process(sk, skb, dccp_hdr(skb), skb->len)) goto reset; @@ -732,6 +683,27 @@ static int dccp_v6_rcv(struct sk_buff *skb) goto no_dccp_socket; } + if (sk->sk_state == DCCP_NEW_SYN_RECV) { + struct request_sock *req = inet_reqsk(sk); + struct sock *nsk = NULL; + + sk = req->rsk_listener; + if (sk->sk_state == DCCP_LISTEN) + nsk = dccp_check_req(sk, skb, req); + if (!nsk) { + reqsk_put(req); + goto discard_it; + } + if (nsk == sk) { + sock_hold(sk); + reqsk_put(req); + } else if (dccp_child_process(sk, nsk, skb)) { + dccp_v6_ctl_send_reset(sk, skb); + goto discard_it; + } else { + return 0; + } + } /* * RFC 4340, sec. 9.2.1: Minimum Checksum Coverage * o if MinCsCov = 0, only packets with CsCov = 0 are accepted diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index e62f04775c93..80904df02187 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -476,65 +476,12 @@ no_route: } EXPORT_SYMBOL_GPL(inet_csk_route_child_sock); -static inline u32 inet_synq_hash(const __be32 raddr, const __be16 rport, - const u32 rnd, const u32 synq_hsize) -{ - return jhash_2words((__force u32)raddr, (__force u32)rport, rnd) & (synq_hsize - 1); -} - #if IS_ENABLED(CONFIG_IPV6) #define AF_INET_FAMILY(fam) ((fam) == AF_INET) #else #define AF_INET_FAMILY(fam) true #endif -/* Note: this is temporary : - * req sock will no longer be in listener hash table -*/ -struct request_sock *inet_csk_search_req(struct sock *sk, - const __be16 rport, - const __be32 raddr, - const __be32 laddr) -{ - struct inet_connection_sock *icsk = inet_csk(sk); - struct listen_sock *lopt = icsk->icsk_accept_queue.listen_opt; - struct request_sock *req; - u32 hash = inet_synq_hash(raddr, rport, lopt->hash_rnd, - lopt->nr_table_entries); - - spin_lock(&icsk->icsk_accept_queue.syn_wait_lock); - for (req = lopt->syn_table[hash]; req != NULL; req = req->dl_next) { - const struct inet_request_sock *ireq = inet_rsk(req); - - if (ireq->ir_rmt_port == rport && - ireq->ir_rmt_addr == raddr && - ireq->ir_loc_addr == laddr && - AF_INET_FAMILY(req->rsk_ops->family)) { - atomic_inc(&req->rsk_refcnt); - WARN_ON(req->sk); - break; - } - } - spin_unlock(&icsk->icsk_accept_queue.syn_wait_lock); - - return req; -} -EXPORT_SYMBOL_GPL(inet_csk_search_req); - -void inet_csk_reqsk_queue_hash_add(struct sock *sk, struct request_sock *req, - unsigned long timeout) -{ - struct inet_connection_sock *icsk = inet_csk(sk); - struct listen_sock *lopt = icsk->icsk_accept_queue.listen_opt; - const u32 h = inet_synq_hash(inet_rsk(req)->ir_rmt_addr, - inet_rsk(req)->ir_rmt_port, - lopt->hash_rnd, lopt->nr_table_entries); - - reqsk_queue_hash_req(&icsk->icsk_accept_queue, h, req, timeout); - inet_csk_reqsk_queue_added(sk); -} -EXPORT_SYMBOL_GPL(inet_csk_reqsk_queue_hash_add); - /* Only thing we need from tcp.h */ extern int sysctl_tcp_synack_retries; @@ -571,26 +518,20 @@ int inet_rtx_syn_ack(const struct sock *parent, struct request_sock *req) } EXPORT_SYMBOL(inet_rtx_syn_ack); -/* return true if req was found in the syn_table[] */ +/* return true if req was found in the ehash table */ static bool reqsk_queue_unlink(struct request_sock_queue *queue, struct request_sock *req) { - struct listen_sock *lopt = queue->listen_opt; - struct request_sock **prev; - bool found = false; + struct inet_hashinfo *hashinfo = req_to_sk(req)->sk_prot->h.hashinfo; + spinlock_t *lock; + bool found; - spin_lock(&queue->syn_wait_lock); + lock = inet_ehash_lockp(hashinfo, req->rsk_hash); - for (prev = &lopt->syn_table[req->rsk_hash]; *prev != NULL; - prev = &(*prev)->dl_next) { - if (*prev == req) { - *prev = req->dl_next; - found = true; - break; - } - } + spin_lock(lock); + found = __sk_nulls_del_node_init_rcu(req_to_sk(req)); + spin_unlock(lock); - spin_unlock(&queue->syn_wait_lock); if (timer_pending(&req->rsk_timer) && del_timer_sync(&req->rsk_timer)) reqsk_put(req); return found; @@ -616,10 +557,8 @@ static void reqsk_timer_handler(unsigned long data) int max_retries, thresh; u8 defer_accept; - if (sk_listener->sk_state != TCP_LISTEN || !lopt) { - reqsk_put(req); - return; - } + if (sk_listener->sk_state != TCP_LISTEN || !lopt) + goto drop; max_retries = icsk->icsk_syn_retries ? : sysctl_tcp_synack_retries; thresh = max_retries; @@ -669,36 +608,36 @@ static void reqsk_timer_handler(unsigned long data) mod_timer_pinned(&req->rsk_timer, jiffies + timeo); return; } +drop: inet_csk_reqsk_queue_drop(sk_listener, req); reqsk_put(req); } -void reqsk_queue_hash_req(struct request_sock_queue *queue, - u32 hash, struct request_sock *req, - unsigned long timeout) +static void reqsk_queue_hash_req(struct request_sock *req, + unsigned long timeout) { - struct listen_sock *lopt = queue->listen_opt; - req->num_retrans = 0; req->num_timeout = 0; req->sk = NULL; setup_timer(&req->rsk_timer, reqsk_timer_handler, (unsigned long)req); mod_timer_pinned(&req->rsk_timer, jiffies + timeout); - req->rsk_hash = hash; + inet_ehash_insert(req_to_sk(req), NULL); /* before letting lookups find us, make sure all req fields * are committed to memory and refcnt initialized. */ smp_wmb(); atomic_set(&req->rsk_refcnt, 2); +} - spin_lock(&queue->syn_wait_lock); - req->dl_next = lopt->syn_table[hash]; - lopt->syn_table[hash] = req; - spin_unlock(&queue->syn_wait_lock); +void inet_csk_reqsk_queue_hash_add(struct sock *sk, struct request_sock *req, + unsigned long timeout) +{ + reqsk_queue_hash_req(req, timeout); + inet_csk_reqsk_queue_added(sk); } -EXPORT_SYMBOL(reqsk_queue_hash_req); +EXPORT_SYMBOL_GPL(inet_csk_reqsk_queue_hash_add); /** * inet_csk_clone_lock - clone an inet socket, and lock its clone diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c index 0ac1d68dc8a6..ab9f8a66615d 100644 --- a/net/ipv4/inet_diag.c +++ b/net/ipv4/inet_diag.c @@ -730,91 +730,21 @@ static void twsk_build_assert(void) #endif } -static int inet_diag_dump_reqs(struct sk_buff *skb, struct sock *sk, - struct netlink_callback *cb, - const struct inet_diag_req_v2 *r, - const struct nlattr *bc) -{ - struct inet_connection_sock *icsk = inet_csk(sk); - struct inet_sock *inet = inet_sk(sk); - struct inet_diag_entry entry; - int j, s_j, reqnum, s_reqnum; - struct listen_sock *lopt; - int err = 0; - - s_j = cb->args[3]; - s_reqnum = cb->args[4]; - - if (s_j > 0) - s_j--; - - entry.family = sk->sk_family; - - spin_lock(&icsk->icsk_accept_queue.syn_wait_lock); - - lopt = icsk->icsk_accept_queue.listen_opt; - if (!lopt || !reqsk_queue_len(&icsk->icsk_accept_queue)) - goto out; - - if (bc) { - entry.sport = inet->inet_num; - entry.userlocks = sk->sk_userlocks; - } - - for (j = s_j; j < lopt->nr_table_entries; j++) { - struct request_sock *req, *head = lopt->syn_table[j]; - - reqnum = 0; - for (req = head; req; reqnum++, req = req->dl_next) { - struct inet_request_sock *ireq = inet_rsk(req); - - if (reqnum < s_reqnum) - continue; - if (r->id.idiag_dport != ireq->ir_rmt_port && - r->id.idiag_dport) - continue; - - if (bc) { - /* Note: entry.sport and entry.userlocks are already set */ - entry_fill_addrs(&entry, req_to_sk(req)); - entry.dport = ntohs(ireq->ir_rmt_port); - - if (!inet_diag_bc_run(bc, &entry)) - continue; - } - - err = inet_req_diag_fill(req_to_sk(req), skb, - NETLINK_CB(cb->skb).portid, - cb->nlh->nlmsg_seq, - NLM_F_MULTI, cb->nlh); - if (err < 0) { - cb->args[3] = j + 1; - cb->args[4] = reqnum; - goto out; - } - } - - s_reqnum = 0; - } - -out: - spin_unlock(&icsk->icsk_accept_queue.syn_wait_lock); - - return err; -} - void inet_diag_dump_icsk(struct inet_hashinfo *hashinfo, struct sk_buff *skb, struct netlink_callback *cb, const struct inet_diag_req_v2 *r, struct nlattr *bc) { struct net *net = sock_net(skb->sk); int i, num, s_i, s_num; + u32 idiag_states = r->idiag_states; + if (idiag_states & TCPF_SYN_RECV) + idiag_states |= TCPF_NEW_SYN_RECV; s_i = cb->args[1]; s_num = num = cb->args[2]; if (cb->args[0] == 0) { - if (!(r->idiag_states & (TCPF_LISTEN | TCPF_SYN_RECV))) + if (!(idiag_states & TCPF_LISTEN)) goto skip_listen_ht; for (i = s_i; i < INET_LHTABLE_SIZE; i++) { @@ -844,21 +774,11 @@ void inet_diag_dump_icsk(struct inet_hashinfo *hashinfo, struct sk_buff *skb, r->id.idiag_sport) goto next_listen; - if (!(r->idiag_states & TCPF_LISTEN) || - r->id.idiag_dport || + if (r->id.idiag_dport || cb->args[3] > 0) - goto syn_recv; - - if (inet_csk_diag_dump(sk, skb, cb, r, bc) < 0) { - spin_unlock_bh(&ilb->lock); - goto done; - } - -syn_recv: - if (!(r->idiag_states & TCPF_SYN_RECV)) goto next_listen; - if (inet_diag_dump_reqs(skb, sk, cb, r, bc) < 0) { + if (inet_csk_diag_dump(sk, skb, cb, r, bc) < 0) { spin_unlock_bh(&ilb->lock); goto done; } @@ -879,7 +799,7 @@ skip_listen_ht: s_i = num = s_num = 0; } - if (!(r->idiag_states & ~(TCPF_LISTEN | TCPF_SYN_RECV))) + if (!(idiag_states & ~TCPF_LISTEN)) goto out; for (i = s_i; i <= hashinfo->ehash_mask; i++) { @@ -906,7 +826,7 @@ skip_listen_ht: goto next_normal; state = (sk->sk_state == TCP_TIME_WAIT) ? inet_twsk(sk)->tw_substate : sk->sk_state; - if (!(r->idiag_states & (1 << state))) + if (!(idiag_states & (1 << state))) goto next_normal; if (r->sdiag_family != AF_UNSPEC && sk->sk_family != r->sdiag_family) diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c index 56742e995dd3..bed8886a4b6c 100644 --- a/net/ipv4/inet_hashtables.c +++ b/net/ipv4/inet_hashtables.c @@ -398,14 +398,18 @@ static u32 inet_sk_port_offset(const struct sock *sk) inet->inet_dport); } -void __inet_hash_nolisten(struct sock *sk, struct sock *osk) +/* insert a socket into ehash, and eventually remove another one + * (The another one can be a SYN_RECV or TIMEWAIT + */ +int inet_ehash_insert(struct sock *sk, struct sock *osk) { struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo; struct hlist_nulls_head *list; struct inet_ehash_bucket *head; spinlock_t *lock; + int ret = 0; - WARN_ON(!sk_unhashed(sk)); + WARN_ON_ONCE(!sk_unhashed(sk)); sk->sk_hash = sk_ehashfn(sk); head = inet_ehash_bucket(hashinfo, sk->sk_hash); @@ -419,6 +423,12 @@ void __inet_hash_nolisten(struct sock *sk, struct sock *osk) sk_nulls_del_node_init_rcu(osk); } spin_unlock(lock); + return ret; +} + +void __inet_hash_nolisten(struct sock *sk, struct sock *osk) +{ + inet_ehash_insert(sk, osk); sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1); } EXPORT_SYMBOL_GPL(__inet_hash_nolisten); diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c index 6b97b5f6457c..729ceb5f63c6 100644 --- a/net/ipv4/syncookies.c +++ b/net/ipv4/syncookies.c @@ -284,6 +284,10 @@ bool cookie_ecn_ok(const struct tcp_options_received *tcp_opt, } EXPORT_SYMBOL(cookie_ecn_ok); +/* On input, sk is a listener. + * Output is listener if incoming packet would not create a child + * NULL if memory could not be allocated. + */ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb) { struct ip_options *opt = &TCP_SKB_CB(skb)->header.h4.opt; diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 8b0ce73c2049..a56912772354 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -6241,7 +6241,7 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops, goto drop_and_free; tcp_rsk(req)->tfo_listener = false; - af_ops->queue_hash_add(sk, req, TCP_TIMEOUT_INIT); + inet_csk_reqsk_queue_hash_add(sk, req, TCP_TIMEOUT_INIT); } tcp_reqsk_record_syn(sk, req, skb); diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index a33101616215..bfe9d39ee87d 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -1224,7 +1224,6 @@ static const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = { .route_req = tcp_v4_route_req, .init_seq = tcp_v4_init_sequence, .send_synack = tcp_v4_send_synack, - .queue_hash_add = inet_csk_reqsk_queue_hash_add, }; int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb) @@ -1343,34 +1342,11 @@ put_and_exit: } EXPORT_SYMBOL(tcp_v4_syn_recv_sock); -static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb) +static struct sock *tcp_v4_cookie_check(struct sock *sk, struct sk_buff *skb) { +#ifdef CONFIG_SYN_COOKIES const struct tcphdr *th = tcp_hdr(skb); - const struct iphdr *iph = ip_hdr(skb); - struct request_sock *req; - struct sock *nsk; - - req = inet_csk_search_req(sk, th->source, iph->saddr, iph->daddr); - if (req) { - nsk = tcp_check_req(sk, skb, req, false); - if (!nsk || nsk == sk) - reqsk_put(req); - return nsk; - } - - nsk = inet_lookup_established(sock_net(sk), &tcp_hashinfo, iph->saddr, - th->source, iph->daddr, th->dest, inet_iif(skb)); - - if (nsk) { - if (nsk->sk_state != TCP_TIME_WAIT) { - bh_lock_sock(nsk); - return nsk; - } - inet_twsk_put(inet_twsk(nsk)); - return NULL; - } -#ifdef CONFIG_SYN_COOKIES if (!th->syn) sk = cookie_v4_check(sk, skb); #endif @@ -1409,10 +1385,10 @@ int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb) goto csum_err; if (sk->sk_state == TCP_LISTEN) { - struct sock *nsk = tcp_v4_hnd_req(sk, skb); + struct sock *nsk = tcp_v4_cookie_check(sk, skb); + if (!nsk) goto discard; - if (nsk != sk) { sock_rps_save_rxhash(nsk, skb); sk_mark_napi_id(nsk, skb); @@ -1603,6 +1579,29 @@ process: if (sk->sk_state == TCP_TIME_WAIT) goto do_time_wait; + if (sk->sk_state == TCP_NEW_SYN_RECV) { + struct request_sock *req = inet_reqsk(sk); + struct sock *nsk = NULL; + + sk = req->rsk_listener; + if (tcp_v4_inbound_md5_hash(sk, skb)) + goto discard_and_relse; + if (sk->sk_state == TCP_LISTEN) + nsk = tcp_check_req(sk, skb, req, false); + if (!nsk) { + reqsk_put(req); + goto discard_it; + } + if (nsk == sk) { + sock_hold(sk); + reqsk_put(req); + } else if (tcp_child_process(sk, nsk, skb)) { + tcp_v4_send_reset(nsk, skb); + goto discard_it; + } else { + return 0; + } + } if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) { NET_INC_STATS_BH(net, LINUX_MIB_TCPMINTTLDROP); goto discard_and_relse; @@ -1830,35 +1829,7 @@ static void *listening_get_next(struct seq_file *seq, void *cur) ++st->num; ++st->offset; - if (st->state == TCP_SEQ_STATE_OPENREQ) { - struct request_sock *req = cur; - - icsk = inet_csk(st->syn_wait_sk); - req = req->dl_next; - while (1) { - while (req) { - if (req->rsk_ops->family == st->family) { - cur = req; - goto out; - } - req = req->dl_next; - } - if (++st->sbucket >= icsk->icsk_accept_queue.listen_opt->nr_table_entries) - break; -get_req: - req = icsk->icsk_accept_queue.listen_opt->syn_table[st->sbucket]; - } - sk = sk_nulls_next(st->syn_wait_sk); - st->state = TCP_SEQ_STATE_LISTENING; - spin_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock); - } else { - icsk = inet_csk(sk); - spin_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock); - if (reqsk_queue_len(&icsk->icsk_accept_queue)) - goto start_req; - spin_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock); - sk = sk_nulls_next(sk); - } + sk = sk_nulls_next(sk); get_sk: sk_nulls_for_each_from(sk, node) { if (!net_eq(sock_net(sk), net)) @@ -1868,15 +1839,6 @@ get_sk: goto out; } icsk = inet_csk(sk); - spin_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock); - if (reqsk_queue_len(&icsk->icsk_accept_queue)) { -start_req: - st->syn_wait_sk = sk; - st->state = TCP_SEQ_STATE_OPENREQ; - st->sbucket = 0; - goto get_req; - } - spin_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock); } spin_unlock_bh(&ilb->lock); st->offset = 0; @@ -2008,7 +1970,6 @@ static void *tcp_seek_last_pos(struct seq_file *seq) void *rc = NULL; switch (st->state) { - case TCP_SEQ_STATE_OPENREQ: case TCP_SEQ_STATE_LISTENING: if (st->bucket >= INET_LHTABLE_SIZE) break; @@ -2067,7 +2028,6 @@ static void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos) } switch (st->state) { - case TCP_SEQ_STATE_OPENREQ: case TCP_SEQ_STATE_LISTENING: rc = listening_get_next(seq, v); if (!rc) { @@ -2092,11 +2052,6 @@ static void tcp_seq_stop(struct seq_file *seq, void *v) struct tcp_iter_state *st = seq->private; switch (st->state) { - case TCP_SEQ_STATE_OPENREQ: - if (v) { - struct inet_connection_sock *icsk = inet_csk(st->syn_wait_sk); - spin_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock); - } case TCP_SEQ_STATE_LISTENING: if (v != SEQ_START_TOKEN) spin_unlock_bh(&tcp_hashinfo.listening_hash[st->bucket].lock); @@ -2269,18 +2224,12 @@ static int tcp4_seq_show(struct seq_file *seq, void *v) } st = seq->private; - switch (st->state) { - case TCP_SEQ_STATE_LISTENING: - case TCP_SEQ_STATE_ESTABLISHED: - if (sk->sk_state == TCP_TIME_WAIT) - get_timewait4_sock(v, seq, st->num); - else - get_tcp4_sock(v, seq, st->num); - break; - case TCP_SEQ_STATE_OPENREQ: + if (sk->sk_state == TCP_TIME_WAIT) + get_timewait4_sock(v, seq, st->num); + else if (sk->sk_state == TCP_NEW_SYN_RECV) get_openreq4(v, seq, st->num); - break; - } + else + get_tcp4_sock(v, seq, st->num); out: seq_pad(seq, '\n'); return 0; diff --git a/net/ipv6/inet6_connection_sock.c b/net/ipv6/inet6_connection_sock.c index ea915aa5e4e2..5d1c7cee2cb2 100644 --- a/net/ipv6/inet6_connection_sock.c +++ b/net/ipv6/inet6_connection_sock.c @@ -94,73 +94,6 @@ struct dst_entry *inet6_csk_route_req(const struct sock *sk, } EXPORT_SYMBOL(inet6_csk_route_req); -/* - * request_sock (formerly open request) hash tables. - */ -static u32 inet6_synq_hash(const struct in6_addr *raddr, const __be16 rport, - const u32 rnd, const u32 synq_hsize) -{ - u32 c; - - c = jhash_3words((__force u32)raddr->s6_addr32[0], - (__force u32)raddr->s6_addr32[1], - (__force u32)raddr->s6_addr32[2], - rnd); - - c = jhash_2words((__force u32)raddr->s6_addr32[3], - (__force u32)rport, - c); - - return c & (synq_hsize - 1); -} - -struct request_sock *inet6_csk_search_req(struct sock *sk, - const __be16 rport, - const struct in6_addr *raddr, - const struct in6_addr *laddr, - const int iif) -{ - struct inet_connection_sock *icsk = inet_csk(sk); - struct listen_sock *lopt = icsk->icsk_accept_queue.listen_opt; - struct request_sock *req; - u32 hash = inet6_synq_hash(raddr, rport, lopt->hash_rnd, - lopt->nr_table_entries); - - spin_lock(&icsk->icsk_accept_queue.syn_wait_lock); - for (req = lopt->syn_table[hash]; req != NULL; req = req->dl_next) { - const struct inet_request_sock *ireq = inet_rsk(req); - - if (ireq->ir_rmt_port == rport && - req->rsk_ops->family == AF_INET6 && - ipv6_addr_equal(&ireq->ir_v6_rmt_addr, raddr) && - ipv6_addr_equal(&ireq->ir_v6_loc_addr, laddr) && - (!ireq->ir_iif || ireq->ir_iif == iif)) { - atomic_inc(&req->rsk_refcnt); - WARN_ON(req->sk != NULL); - break; - } - } - spin_unlock(&icsk->icsk_accept_queue.syn_wait_lock); - - return req; -} -EXPORT_SYMBOL_GPL(inet6_csk_search_req); - -void inet6_csk_reqsk_queue_hash_add(struct sock *sk, - struct request_sock *req, - const unsigned long timeout) -{ - struct inet_connection_sock *icsk = inet_csk(sk); - struct listen_sock *lopt = icsk->icsk_accept_queue.listen_opt; - const u32 h = inet6_synq_hash(&inet_rsk(req)->ir_v6_rmt_addr, - inet_rsk(req)->ir_rmt_port, - lopt->hash_rnd, lopt->nr_table_entries); - - reqsk_queue_hash_req(&icsk->icsk_accept_queue, h, req, timeout); - inet_csk_reqsk_queue_added(sk); -} -EXPORT_SYMBOL_GPL(inet6_csk_reqsk_queue_hash_add); - void inet6_csk_addr2sockaddr(struct sock *sk, struct sockaddr *uaddr) { struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *) uaddr; diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index cadb44a2d34e..a215614cfb2b 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -727,7 +727,6 @@ static const struct tcp_request_sock_ops tcp_request_sock_ipv6_ops = { .route_req = tcp_v6_route_req, .init_seq = tcp_v6_init_sequence, .send_synack = tcp_v6_send_synack, - .queue_hash_add = inet6_csk_reqsk_queue_hash_add, }; static void tcp_v6_send_response(const struct sock *sk, struct sk_buff *skb, u32 seq, @@ -938,37 +937,11 @@ static void tcp_v6_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb, } -static struct sock *tcp_v6_hnd_req(struct sock *sk, struct sk_buff *skb) +static struct sock *tcp_v6_cookie_check(struct sock *sk, struct sk_buff *skb) { +#ifdef CONFIG_SYN_COOKIES const struct tcphdr *th = tcp_hdr(skb); - struct request_sock *req; - struct sock *nsk; - - /* Find possible connection requests. */ - req = inet6_csk_search_req(sk, th->source, - &ipv6_hdr(skb)->saddr, - &ipv6_hdr(skb)->daddr, tcp_v6_iif(skb)); - if (req) { - nsk = tcp_check_req(sk, skb, req, false); - if (!nsk || nsk == sk) - reqsk_put(req); - return nsk; - } - nsk = __inet6_lookup_established(sock_net(sk), &tcp_hashinfo, - &ipv6_hdr(skb)->saddr, th->source, - &ipv6_hdr(skb)->daddr, ntohs(th->dest), - tcp_v6_iif(skb)); - - if (nsk) { - if (nsk->sk_state != TCP_TIME_WAIT) { - bh_lock_sock(nsk); - return nsk; - } - inet_twsk_put(inet_twsk(nsk)); - return NULL; - } -#ifdef CONFIG_SYN_COOKIES if (!th->syn) sk = cookie_v6_check(sk, skb); #endif @@ -1258,15 +1231,11 @@ static int tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb) goto csum_err; if (sk->sk_state == TCP_LISTEN) { - struct sock *nsk = tcp_v6_hnd_req(sk, skb); + struct sock *nsk = tcp_v6_cookie_check(sk, skb); + if (!nsk) goto discard; - /* - * Queue it on the new socket if the new socket is active, - * otherwise we just shortcircuit this and continue with - * the new socket.. - */ if (nsk != sk) { sock_rps_save_rxhash(nsk, skb); sk_mark_napi_id(nsk, skb); @@ -1402,6 +1371,33 @@ process: if (sk->sk_state == TCP_TIME_WAIT) goto do_time_wait; + if (sk->sk_state == TCP_NEW_SYN_RECV) { + struct request_sock *req = inet_reqsk(sk); + struct sock *nsk = NULL; + + sk = req->rsk_listener; + tcp_v6_fill_cb(skb, hdr, th); + if (tcp_v6_inbound_md5_hash(sk, skb)) { + reqsk_put(req); + goto discard_it; + } + if (sk->sk_state == TCP_LISTEN) + nsk = tcp_check_req(sk, skb, req, false); + if (!nsk) { + reqsk_put(req); + goto discard_it; + } + if (nsk == sk) { + sock_hold(sk); + reqsk_put(req); + tcp_v6_restore_cb(skb); + } else if (tcp_child_process(sk, nsk, skb)) { + tcp_v6_send_reset(nsk, skb); + goto discard_it; + } else { + return 0; + } + } if (hdr->hop_limit < inet6_sk(sk)->min_hopcount) { NET_INC_STATS_BH(net, LINUX_MIB_TCPMINTTLDROP); goto discard_and_relse; @@ -1765,18 +1761,12 @@ static int tcp6_seq_show(struct seq_file *seq, void *v) } st = seq->private; - switch (st->state) { - case TCP_SEQ_STATE_LISTENING: - case TCP_SEQ_STATE_ESTABLISHED: - if (sk->sk_state == TCP_TIME_WAIT) - get_timewait6_sock(seq, v, st->num); - else - get_tcp6_sock(seq, v, st->num); - break; - case TCP_SEQ_STATE_OPENREQ: + if (sk->sk_state == TCP_TIME_WAIT) + get_timewait6_sock(seq, v, st->num); + else if (sk->sk_state == TCP_NEW_SYN_RECV) get_openreq6(seq, v, st->num); - break; - } + else + get_tcp6_sock(seq, v, st->num); out: return 0; } -- cgit v1.2.3 From ca6fb06518836ef9b65dc0aac02ff97704d52a05 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 2 Oct 2015 11:43:35 -0700 Subject: tcp: attach SYNACK messages to request sockets instead of listener If a listen backlog is very big (to avoid syncookies), then the listener sk->sk_wmem_alloc is the main source of false sharing, as we need to touch it twice per SYNACK re-transmit and TX completion. (One SYN packet takes listener lock once, but up to 6 SYNACK are generated) By attaching the skb to the request socket, we remove this source of contention. Tested: listen(fd, 10485760); // single listener (no SO_REUSEPORT) 16 RX/TX queue NIC Sustain a SYNFLOOD attack of ~320,000 SYN per second, Sending ~1,400,000 SYNACK per second. Perf profiles now show listener spinlock being next bottleneck. 20.29% [kernel] [k] queued_spin_lock_slowpath 10.06% [kernel] [k] __inet_lookup_established 5.12% [kernel] [k] reqsk_timer_handler 3.22% [kernel] [k] get_next_timer_interrupt 3.00% [kernel] [k] tcp_make_synack 2.77% [kernel] [k] ipt_do_table 2.70% [kernel] [k] run_timer_softirq 2.50% [kernel] [k] ip_finish_output 2.04% [kernel] [k] cascade Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/tcp.h | 6 ++++-- net/ipv4/inet_connection_sock.c | 2 +- net/ipv4/tcp_fastopen.c | 4 ++-- net/ipv4/tcp_input.c | 23 ++++++++++++----------- net/ipv4/tcp_ipv4.c | 5 +++-- net/ipv4/tcp_output.c | 22 +++++++++++++++------- net/ipv6/tcp_ipv6.c | 5 +++-- net/sched/sch_fq.c | 12 +++++++----- 8 files changed, 47 insertions(+), 32 deletions(-) (limited to 'net/ipv4/tcp_ipv4.c') diff --git a/include/net/tcp.h b/include/net/tcp.h index 225e9561af35..a6be56d5f0e3 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -462,7 +462,8 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len); int tcp_connect(struct sock *sk); struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst, struct request_sock *req, - struct tcp_fastopen_cookie *foc); + struct tcp_fastopen_cookie *foc, + bool attach_req); int tcp_disconnect(struct sock *sk, int flags); void tcp_finish_connect(struct sock *sk, struct sk_buff *skb); @@ -1715,7 +1716,8 @@ struct tcp_request_sock_ops { __u32 (*init_seq)(const struct sk_buff *skb); int (*send_synack)(const struct sock *sk, struct dst_entry *dst, struct flowi *fl, struct request_sock *req, - u16 queue_mapping, struct tcp_fastopen_cookie *foc); + u16 queue_mapping, struct tcp_fastopen_cookie *foc, + bool attach_req); }; #ifdef CONFIG_SYN_COOKIES diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index 80904df02187..099e0ea9242a 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -628,7 +628,7 @@ static void reqsk_queue_hash_req(struct request_sock *req, * are committed to memory and refcnt initialized. */ smp_wmb(); - atomic_set(&req->rsk_refcnt, 2); + atomic_set(&req->rsk_refcnt, 2 + 1); } void inet_csk_reqsk_queue_hash_add(struct sock *sk, struct request_sock *req, diff --git a/net/ipv4/tcp_fastopen.c b/net/ipv4/tcp_fastopen.c index f69f436fcbcc..410ac481fda0 100644 --- a/net/ipv4/tcp_fastopen.c +++ b/net/ipv4/tcp_fastopen.c @@ -161,13 +161,13 @@ static struct sock *tcp_fastopen_create_child(struct sock *sk, tp->snd_wnd = ntohs(tcp_hdr(skb)->window); /* Activate the retrans timer so that SYNACK can be retransmitted. - * The request socket is not added to the SYN table of the parent + * The request socket is not added to the ehash * because it's been added to the accept queue directly. */ inet_csk_reset_xmit_timer(child, ICSK_TIME_RETRANS, TCP_TIMEOUT_INIT, TCP_RTO_MAX); - atomic_set(&req->rsk_refcnt, 1); + atomic_set(&req->rsk_refcnt, 2); /* Add the child socket directly into the accept queue */ inet_csk_reqsk_queue_add(sk, req, child); diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index a56912772354..27108757c310 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -6120,8 +6120,6 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops, struct request_sock *req; bool want_cookie = false; struct flowi fl; - int err; - /* TW buckets are converted to open requests without * limitations, they conserve resources and peer is @@ -6230,21 +6228,24 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops, tcp_rsk(req)->snt_isn = isn; tcp_rsk(req)->txhash = net_tx_rndhash(); tcp_openreq_init_rwin(req, sk, dst); - if (!want_cookie) + if (!want_cookie) { fastopen_sk = tcp_try_fastopen(sk, skb, req, &foc, dst); - err = af_ops->send_synack(fastopen_sk ?: sk, dst, &fl, req, - skb_get_queue_mapping(skb), &foc); + tcp_reqsk_record_syn(sk, req, skb); + } if (fastopen_sk) { + af_ops->send_synack(fastopen_sk, dst, &fl, req, + skb_get_queue_mapping(skb), &foc, false); sock_put(fastopen_sk); } else { - if (err || want_cookie) - goto drop_and_free; - tcp_rsk(req)->tfo_listener = false; - inet_csk_reqsk_queue_hash_add(sk, req, TCP_TIMEOUT_INIT); + if (!want_cookie) + inet_csk_reqsk_queue_hash_add(sk, req, TCP_TIMEOUT_INIT); + af_ops->send_synack(sk, dst, &fl, req, + skb_get_queue_mapping(skb), &foc, !want_cookie); + if (want_cookie) + goto drop_and_free; } - tcp_reqsk_record_syn(sk, req, skb); - + reqsk_put(req); return 0; drop_and_release: diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index bfe9d39ee87d..ac2ea73e9aaf 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -822,7 +822,8 @@ static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst, struct flowi *fl, struct request_sock *req, u16 queue_mapping, - struct tcp_fastopen_cookie *foc) + struct tcp_fastopen_cookie *foc, + bool attach_req) { const struct inet_request_sock *ireq = inet_rsk(req); struct flowi4 fl4; @@ -833,7 +834,7 @@ static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst, if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL) return -1; - skb = tcp_make_synack(sk, dst, req, foc); + skb = tcp_make_synack(sk, dst, req, foc, attach_req); if (skb) { __tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr); diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 09bb082ca1a7..55ed3266b05f 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -2947,7 +2947,8 @@ int tcp_send_synack(struct sock *sk) */ struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst, struct request_sock *req, - struct tcp_fastopen_cookie *foc) + struct tcp_fastopen_cookie *foc, + bool attach_req) { struct inet_request_sock *ireq = inet_rsk(req); const struct tcp_sock *tp = tcp_sk(sk); @@ -2959,11 +2960,7 @@ struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst, u16 user_mss; int mss; - /* sk is a const pointer, because we want to express multiple cpus - * might call us concurrently. - * sock_wmalloc() will change sk->sk_wmem_alloc in an atomic way. - */ - skb = sock_wmalloc((struct sock *)sk, MAX_TCP_HEADER, 1, GFP_ATOMIC); + skb = alloc_skb(MAX_TCP_HEADER, GFP_ATOMIC); if (unlikely(!skb)) { dst_release(dst); return NULL; @@ -2971,6 +2968,17 @@ struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst, /* Reserve space for headers. */ skb_reserve(skb, MAX_TCP_HEADER); + if (attach_req) { + skb->destructor = sock_edemux; + sock_hold(req_to_sk(req)); + skb->sk = req_to_sk(req); + } else { + /* sk is a const pointer, because we want to express multiple + * cpu might call us concurrently. + * sk->sk_wmem_alloc in an atomic, we can promote to rw. + */ + skb_set_owner_w(skb, (struct sock *)sk); + } skb_dst_set(skb, dst); mss = dst_metric_advmss(dst); @@ -3510,7 +3518,7 @@ int tcp_rtx_synack(const struct sock *sk, struct request_sock *req) int res; tcp_rsk(req)->txhash = net_tx_rndhash(); - res = af_ops->send_synack(sk, NULL, &fl, req, 0, NULL); + res = af_ops->send_synack(sk, NULL, &fl, req, 0, NULL, true); if (!res) { TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_RETRANSSEGS); NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPSYNRETRANS); diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index a215614cfb2b..3d18571811c5 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -438,7 +438,8 @@ static int tcp_v6_send_synack(const struct sock *sk, struct dst_entry *dst, struct flowi *fl, struct request_sock *req, u16 queue_mapping, - struct tcp_fastopen_cookie *foc) + struct tcp_fastopen_cookie *foc, + bool attach_req) { struct inet_request_sock *ireq = inet_rsk(req); struct ipv6_pinfo *np = inet6_sk(sk); @@ -451,7 +452,7 @@ static int tcp_v6_send_synack(const struct sock *sk, struct dst_entry *dst, IPPROTO_TCP)) == NULL) goto done; - skb = tcp_make_synack(sk, dst, req, foc); + skb = tcp_make_synack(sk, dst, req, foc, attach_req); if (skb) { __tcp_v6_send_check(skb, &ireq->ir_v6_loc_addr, diff --git a/net/sched/sch_fq.c b/net/sched/sch_fq.c index f377702d4b91..3386cce4751e 100644 --- a/net/sched/sch_fq.c +++ b/net/sched/sch_fq.c @@ -224,13 +224,15 @@ static struct fq_flow *fq_classify(struct sk_buff *skb, struct fq_sched_data *q) if (unlikely((skb->priority & TC_PRIO_MAX) == TC_PRIO_CONTROL)) return &q->internal; - /* SYNACK messages are attached to a listener socket. - * 1) They are not part of a 'flow' yet - * 2) We do not want to rate limit them (eg SYNFLOOD attack), + /* SYNACK messages are attached to a TCP_NEW_SYN_RECV request socket + * 1) request sockets are not full blown, + * they do not contain sk_pacing_rate + * 2) They are not part of a 'flow' yet + * 3) We do not want to rate limit them (eg SYNFLOOD attack), * especially if the listener set SO_MAX_PACING_RATE - * 3) We pretend they are orphaned + * 4) We pretend they are orphaned */ - if (!sk || sk->sk_state == TCP_LISTEN) { + if (!sk || sk->sk_state == TCP_NEW_SYN_RECV) { unsigned long hash = skb_get_hash(skb) & q->orphan_mask; /* By forcing low order bit to 1, we make sure to not -- cgit v1.2.3 From e994b2f0fb9229aeff5eea9541320bd7b2ca8714 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 2 Oct 2015 11:43:39 -0700 Subject: tcp: do not lock listener to process SYN packets Everything should now be ready to finally allow SYN packets processing without holding listener lock. Tested: 3.5 Mpps SYNFLOOD. Plenty of cpu cycles available. Next bottleneck is the refcount taken on listener, that could be avoided if we remove SLAB_DESTROY_BY_RCU strict semantic for listeners, and use regular RCU. 13.18% [kernel] [k] __inet_lookup_listener 9.61% [kernel] [k] tcp_conn_request 8.16% [kernel] [k] sha_transform 5.30% [kernel] [k] inet_reqsk_alloc 4.22% [kernel] [k] sock_put 3.74% [kernel] [k] tcp_make_synack 2.88% [kernel] [k] ipt_do_table 2.56% [kernel] [k] memcpy_erms 2.53% [kernel] [k] sock_wfree 2.40% [kernel] [k] tcp_v4_rcv 2.08% [kernel] [k] fib_table_lookup 1.84% [kernel] [k] tcp_openreq_init_rwin Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/tcp_ipv4.c | 11 +++++++++-- net/ipv6/tcp_ipv6.c | 11 +++++++++-- 2 files changed, 18 insertions(+), 4 deletions(-) (limited to 'net/ipv4/tcp_ipv4.c') diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index ac2ea73e9aaf..34310748a365 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -1355,7 +1355,7 @@ static struct sock *tcp_v4_cookie_check(struct sock *sk, struct sk_buff *skb) } /* The socket must have it's spinlock held when we get - * here. + * here, unless it is a TCP_LISTEN socket. * * We have a potential double-lock case here, so even when * doing backlog processing we use the BH locking scheme. @@ -1619,9 +1619,15 @@ process: if (sk_filter(sk, skb)) goto discard_and_relse; - sk_incoming_cpu_update(sk); skb->dev = NULL; + if (sk->sk_state == TCP_LISTEN) { + ret = tcp_v4_do_rcv(sk, skb); + goto put_and_return; + } + + sk_incoming_cpu_update(sk); + bh_lock_sock_nested(sk); tcp_sk(sk)->segs_in += max_t(u16, 1, skb_shinfo(skb)->gso_segs); ret = 0; @@ -1636,6 +1642,7 @@ process: } bh_unlock_sock(sk); +put_and_return: sock_put(sk); return ret; diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 3d18571811c5..33334f0c217d 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -1161,7 +1161,7 @@ out: } /* The socket must have it's spinlock held when we get - * here. + * here, unless it is a TCP_LISTEN socket. * * We have a potential double-lock case here, so even when * doing backlog processing we use the BH locking scheme. @@ -1415,9 +1415,15 @@ process: if (sk_filter(sk, skb)) goto discard_and_relse; - sk_incoming_cpu_update(sk); skb->dev = NULL; + if (sk->sk_state == TCP_LISTEN) { + ret = tcp_v6_do_rcv(sk, skb); + goto put_and_return; + } + + sk_incoming_cpu_update(sk); + bh_lock_sock_nested(sk); tcp_sk(sk)->segs_in += max_t(u16, 1, skb_shinfo(skb)->gso_segs); ret = 0; @@ -1432,6 +1438,7 @@ process: } bh_unlock_sock(sk); +put_and_return: sock_put(sk); return ret ? -1 : 0; -- cgit v1.2.3 From ed53d0ab761f5c71d77c8dc05fd19c0a851200db Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 8 Oct 2015 19:33:23 -0700 Subject: net: shrink struct sock and request_sock by 8 bytes One 32bit hole is following skc_refcnt, use it. skc_incoming_cpu can also be an union for request_sock rcv_wnd. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/request_sock.h | 5 ++--- include/net/sock.h | 14 +++++++++----- net/ipv4/syncookies.c | 4 ++-- net/ipv4/tcp_input.c | 2 +- net/ipv4/tcp_ipv4.c | 2 +- net/ipv4/tcp_minisocks.c | 18 +++++++++--------- net/ipv4/tcp_output.c | 2 +- net/ipv6/syncookies.c | 4 ++-- net/ipv6/tcp_ipv6.c | 2 +- 9 files changed, 28 insertions(+), 25 deletions(-) (limited to 'net/ipv4/tcp_ipv4.c') diff --git a/include/net/request_sock.h b/include/net/request_sock.h index 6b818b77d5e5..2e73748956d5 100644 --- a/include/net/request_sock.h +++ b/include/net/request_sock.h @@ -51,15 +51,14 @@ struct request_sock { #define rsk_refcnt __req_common.skc_refcnt #define rsk_hash __req_common.skc_hash #define rsk_listener __req_common.skc_listener +#define rsk_window_clamp __req_common.skc_window_clamp +#define rsk_rcv_wnd __req_common.skc_rcv_wnd struct request_sock *dl_next; u16 mss; u8 num_retrans; /* number of retransmits */ u8 cookie_ts:1; /* syncookie: encode tcpopts in timestamp */ u8 num_timeout:7; /* number of timeouts */ - /* The following two fields can be easily recomputed I think -AK */ - u32 window_clamp; /* window clamp at creation time */ - u32 rcv_wnd; /* rcv_wnd offered first time */ u32 ts_recent; struct timer_list rsk_timer; const struct request_sock_ops *rsk_ops; diff --git a/include/net/sock.h b/include/net/sock.h index 65712409464b..19cfe1fc911c 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -226,11 +226,18 @@ struct sock_common { struct hlist_nulls_node skc_nulls_node; }; int skc_tx_queue_mapping; - int skc_incoming_cpu; + union { + int skc_incoming_cpu; + u32 skc_rcv_wnd; + }; atomic_t skc_refcnt; /* private: */ int skc_dontcopy_end[0]; + union { + u32 skc_rxhash; + u32 skc_window_clamp; + }; /* public: */ }; @@ -287,7 +294,6 @@ struct cg_proto; * @sk_rcvlowat: %SO_RCVLOWAT setting * @sk_rcvtimeo: %SO_RCVTIMEO setting * @sk_sndtimeo: %SO_SNDTIMEO setting - * @sk_rxhash: flow hash received from netif layer * @sk_txhash: computed flow hash for use on transmit * @sk_filter: socket filtering instructions * @sk_timer: sock cleanup timer @@ -346,6 +352,7 @@ struct sock { #define sk_cookie __sk_common.skc_cookie #define sk_incoming_cpu __sk_common.skc_incoming_cpu #define sk_flags __sk_common.skc_flags +#define sk_rxhash __sk_common.skc_rxhash socket_lock_t sk_lock; struct sk_buff_head sk_receive_queue; @@ -365,9 +372,6 @@ struct sock { } sk_backlog; #define sk_rmem_alloc sk_backlog.rmem_alloc int sk_forward_alloc; -#ifdef CONFIG_RPS - __u32 sk_rxhash; -#endif __u32 sk_txhash; #ifdef CONFIG_NET_RX_BUSY_POLL diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c index 2dbb11331f6c..4c0892badb8b 100644 --- a/net/ipv4/syncookies.c +++ b/net/ipv4/syncookies.c @@ -382,10 +382,10 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb) } /* Try to redo what tcp_v4_send_synack did. */ - req->window_clamp = tp->window_clamp ? :dst_metric(&rt->dst, RTAX_WINDOW); + req->rsk_window_clamp = tp->window_clamp ? :dst_metric(&rt->dst, RTAX_WINDOW); tcp_select_initial_window(tcp_full_space(sk), req->mss, - &req->rcv_wnd, &req->window_clamp, + &req->rsk_rcv_wnd, &req->rsk_window_clamp, ireq->wscale_ok, &rcv_wscale, dst_metric(&rt->dst, RTAX_INITRWND)); diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index ddadb318e850..3b35c3f4d268 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -6022,7 +6022,7 @@ static void tcp_openreq_init(struct request_sock *req, { struct inet_request_sock *ireq = inet_rsk(req); - req->rcv_wnd = 0; /* So that tcp_send_synack() knows! */ + req->rsk_rcv_wnd = 0; /* So that tcp_send_synack() knows! */ req->cookie_ts = 0; tcp_rsk(req)->rcv_isn = TCP_SKB_CB(skb)->seq; tcp_rsk(req)->rcv_nxt = TCP_SKB_CB(skb)->seq + 1; diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 34310748a365..ddb198392c7f 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -803,7 +803,7 @@ static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb, */ tcp_v4_send_ack(skb, (sk->sk_state == TCP_LISTEN) ? tcp_rsk(req)->snt_isn + 1 : tcp_sk(sk)->snd_nxt, - tcp_rsk(req)->rcv_nxt, req->rcv_wnd, + tcp_rsk(req)->rcv_nxt, req->rsk_rcv_wnd, tcp_time_stamp, req->ts_recent, 0, diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index 1079e6ad77fe..41828bdc5d32 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -381,18 +381,18 @@ void tcp_openreq_init_rwin(struct request_sock *req, window_clamp = READ_ONCE(tp->window_clamp); /* Set this up on the first call only */ - req->window_clamp = window_clamp ? : dst_metric(dst, RTAX_WINDOW); + req->rsk_window_clamp = window_clamp ? : dst_metric(dst, RTAX_WINDOW); /* limit the window selection if the user enforce a smaller rx buffer */ if (sk_listener->sk_userlocks & SOCK_RCVBUF_LOCK && - (req->window_clamp > full_space || req->window_clamp == 0)) - req->window_clamp = full_space; + (req->rsk_window_clamp > full_space || req->rsk_window_clamp == 0)) + req->rsk_window_clamp = full_space; /* tcp_full_space because it is guaranteed to be the first packet */ tcp_select_initial_window(full_space, mss - (ireq->tstamp_ok ? TCPOLEN_TSTAMP_ALIGNED : 0), - &req->rcv_wnd, - &req->window_clamp, + &req->rsk_rcv_wnd, + &req->rsk_window_clamp, ireq->wscale_ok, &rcv_wscale, dst_metric(dst, RTAX_INITRWND)); @@ -512,9 +512,9 @@ struct sock *tcp_create_openreq_child(const struct sock *sk, if (sysctl_tcp_fack) tcp_enable_fack(newtp); } - newtp->window_clamp = req->window_clamp; - newtp->rcv_ssthresh = req->rcv_wnd; - newtp->rcv_wnd = req->rcv_wnd; + newtp->window_clamp = req->rsk_window_clamp; + newtp->rcv_ssthresh = req->rsk_rcv_wnd; + newtp->rcv_wnd = req->rsk_rcv_wnd; newtp->rx_opt.wscale_ok = ireq->wscale_ok; if (newtp->rx_opt.wscale_ok) { newtp->rx_opt.snd_wscale = ireq->snd_wscale; @@ -707,7 +707,7 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb, /* RFC793: "first check sequence number". */ if (paws_reject || !tcp_in_window(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq, - tcp_rsk(req)->rcv_nxt, tcp_rsk(req)->rcv_nxt + req->rcv_wnd)) { + tcp_rsk(req)->rcv_nxt, tcp_rsk(req)->rcv_nxt + req->rsk_rcv_wnd)) { /* Out of window: send ACK and drop. */ if (!(flg & TCP_FLAG_RST)) req->rsk_ops->send_ack(sk, skb, req); diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 55ed3266b05f..6e79fcb0addb 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -3023,7 +3023,7 @@ struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst, th->ack_seq = htonl(tcp_rsk(req)->rcv_nxt); /* RFC1323: The window in SYN & SYN/ACK segments is never scaled. */ - th->window = htons(min(req->rcv_wnd, 65535U)); + th->window = htons(min(req->rsk_rcv_wnd, 65535U)); tcp_options_write((__be32 *)(th + 1), NULL, &opts); th->doff = (tcp_header_size >> 2); TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_OUTSEGS); diff --git a/net/ipv6/syncookies.c b/net/ipv6/syncookies.c index f610b5310b17..bb8f2fa1c7fb 100644 --- a/net/ipv6/syncookies.c +++ b/net/ipv6/syncookies.c @@ -235,9 +235,9 @@ struct sock *cookie_v6_check(struct sock *sk, struct sk_buff *skb) goto out_free; } - req->window_clamp = tp->window_clamp ? :dst_metric(dst, RTAX_WINDOW); + req->rsk_window_clamp = tp->window_clamp ? :dst_metric(dst, RTAX_WINDOW); tcp_select_initial_window(tcp_full_space(sk), req->mss, - &req->rcv_wnd, &req->window_clamp, + &req->rsk_rcv_wnd, &req->rsk_window_clamp, ireq->wscale_ok, &rcv_wscale, dst_metric(dst, RTAX_INITRWND)); diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 33334f0c217d..2887c8474b65 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -931,7 +931,7 @@ static void tcp_v6_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb, */ tcp_v6_send_ack(sk, skb, (sk->sk_state == TCP_LISTEN) ? tcp_rsk(req)->snt_isn + 1 : tcp_sk(sk)->snd_nxt, - tcp_rsk(req)->rcv_nxt, req->rcv_wnd, + tcp_rsk(req)->rcv_nxt, req->rsk_rcv_wnd, tcp_time_stamp, req->ts_recent, sk->sk_bound_dev_if, tcp_v6_md5_do_lookup(sk, &ipv6_hdr(skb)->daddr), 0, 0); -- cgit v1.2.3 From 4bdc3d66147b3a623b32216a45431d0cff005f50 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 13 Oct 2015 17:12:54 -0700 Subject: tcp/dccp: fix behavior of stale SYN_RECV request sockets When a TCP/DCCP listener is closed, its pending SYN_RECV request sockets become stale, meaning 3WHS can not complete. But current behavior is wrong : incoming packets finding such stale sockets are dropped. We need instead to cleanup the request socket and perform another lookup : - Incoming ACK will give a RST answer, - SYN rtx might find another listener if available. - We expedite cleanup of request sockets and old listener socket. Fixes: 079096f103fa ("tcp/dccp: install syn_recv requests into ehash table") Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/dccp/ipv4.c | 15 +++++++-------- net/dccp/ipv6.c | 15 +++++++-------- net/ipv4/tcp_ipv4.c | 7 ++++++- net/ipv6/tcp_ipv6.c | 7 ++++++- 4 files changed, 26 insertions(+), 18 deletions(-) (limited to 'net/ipv4/tcp_ipv4.c') diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c index 8e99681c8189..0dcf1963b323 100644 --- a/net/dccp/ipv4.c +++ b/net/dccp/ipv4.c @@ -799,15 +799,10 @@ static int dccp_v4_rcv(struct sk_buff *skb) DCCP_SKB_CB(skb)->dccpd_ack_seq); } - /* Step 2: - * Look up flow ID in table and get corresponding socket */ +lookup: sk = __inet_lookup_skb(&dccp_hashinfo, skb, dh->dccph_sport, dh->dccph_dport); - /* - * Step 2: - * If no socket ... - */ - if (sk == NULL) { + if (!sk) { dccp_pr_debug("failed to look up flow ID in table and " "get corresponding socket\n"); goto no_dccp_socket; @@ -830,8 +825,12 @@ static int dccp_v4_rcv(struct sk_buff *skb) struct sock *nsk = NULL; sk = req->rsk_listener; - if (sk->sk_state == DCCP_LISTEN) + if (likely(sk->sk_state == DCCP_LISTEN)) { nsk = dccp_check_req(sk, skb, req); + } else { + inet_csk_reqsk_queue_drop(sk, req); + goto lookup; + } if (!nsk) { reqsk_put(req); goto discard_it; diff --git a/net/dccp/ipv6.c b/net/dccp/ipv6.c index aed314f8c7c6..68831931b1fe 100644 --- a/net/dccp/ipv6.c +++ b/net/dccp/ipv6.c @@ -656,16 +656,11 @@ static int dccp_v6_rcv(struct sk_buff *skb) else DCCP_SKB_CB(skb)->dccpd_ack_seq = dccp_hdr_ack_seq(skb); - /* Step 2: - * Look up flow ID in table and get corresponding socket */ +lookup: sk = __inet6_lookup_skb(&dccp_hashinfo, skb, dh->dccph_sport, dh->dccph_dport, inet6_iif(skb)); - /* - * Step 2: - * If no socket ... - */ - if (sk == NULL) { + if (!sk) { dccp_pr_debug("failed to look up flow ID in table and " "get corresponding socket\n"); goto no_dccp_socket; @@ -688,8 +683,12 @@ static int dccp_v6_rcv(struct sk_buff *skb) struct sock *nsk = NULL; sk = req->rsk_listener; - if (sk->sk_state == DCCP_LISTEN) + if (likely(sk->sk_state == DCCP_LISTEN)) { nsk = dccp_check_req(sk, skb, req); + } else { + inet_csk_reqsk_queue_drop(sk, req); + goto lookup; + } if (!nsk) { reqsk_put(req); goto discard_it; diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index ddb198392c7f..1ff0923df715 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -1572,6 +1572,7 @@ int tcp_v4_rcv(struct sk_buff *skb) TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph); TCP_SKB_CB(skb)->sacked = 0; +lookup: sk = __inet_lookup_skb(&tcp_hashinfo, skb, th->source, th->dest); if (!sk) goto no_tcp_socket; @@ -1587,8 +1588,12 @@ process: sk = req->rsk_listener; if (tcp_v4_inbound_md5_hash(sk, skb)) goto discard_and_relse; - if (sk->sk_state == TCP_LISTEN) + if (likely(sk->sk_state == TCP_LISTEN)) { nsk = tcp_check_req(sk, skb, req, false); + } else { + inet_csk_reqsk_queue_drop(sk, req); + goto lookup; + } if (!nsk) { reqsk_put(req); goto discard_it; diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 2887c8474b65..7ce1c57199d1 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -1363,6 +1363,7 @@ static int tcp_v6_rcv(struct sk_buff *skb) th = tcp_hdr(skb); hdr = ipv6_hdr(skb); +lookup: sk = __inet6_lookup_skb(&tcp_hashinfo, skb, th->source, th->dest, inet6_iif(skb)); if (!sk) @@ -1382,8 +1383,12 @@ process: reqsk_put(req); goto discard_it; } - if (sk->sk_state == TCP_LISTEN) + if (likely(sk->sk_state == TCP_LISTEN)) { nsk = tcp_check_req(sk, skb, req, false); + } else { + inet_csk_reqsk_queue_drop(sk, req); + goto lookup; + } if (!nsk) { reqsk_put(req); goto discard_it; -- cgit v1.2.3 From ef84d8ce5a36d0c4a6454e7e9dff54d19f96a25f Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 14 Oct 2015 11:16:26 -0700 Subject: Revert "inet: fix double request socket freeing" This reverts commit c69736696cf3742b37d850289dc0d7ead177bb14. At the time of above commit, tcp_req_err() and dccp_req_err() were dead code, as SYN_RECV request sockets were not yet in ehash table. Real bug was fixed later in a different commit. We need to revert to not leak a refcount on request socket. inet_csk_reqsk_queue_drop_and_put() will be added in following commit to make clean inet_csk_reqsk_queue_drop() does not release the reference owned by caller. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/dccp/ipv4.c | 2 +- net/ipv4/tcp_ipv4.c | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'net/ipv4/tcp_ipv4.c') diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c index 0dcf1963b323..644af510d932 100644 --- a/net/dccp/ipv4.c +++ b/net/dccp/ipv4.c @@ -208,7 +208,6 @@ void dccp_req_err(struct sock *sk, u64 seq) if (!between48(seq, dccp_rsk(req)->dreq_iss, dccp_rsk(req)->dreq_gss)) { NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS); - reqsk_put(req); } else { /* * Still in RESPOND, just remove it silently. @@ -218,6 +217,7 @@ void dccp_req_err(struct sock *sk, u64 seq) */ inet_csk_reqsk_queue_drop(req->rsk_listener, req); } + reqsk_put(req); } EXPORT_SYMBOL(dccp_req_err); diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 1ff0923df715..aad2298de7ad 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -324,7 +324,6 @@ void tcp_req_err(struct sock *sk, u32 seq) if (seq != tcp_rsk(req)->snt_isn) { NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS); - reqsk_put(req); } else { /* * Still in SYN_RECV, just remove it silently. @@ -332,9 +331,10 @@ void tcp_req_err(struct sock *sk, u32 seq) * created socket, and POSIX does not want network * errors returned from accept(). */ - NET_INC_STATS_BH(net, LINUX_MIB_LISTENDROPS); inet_csk_reqsk_queue_drop(req->rsk_listener, req); + NET_INC_STATS_BH(net, LINUX_MIB_LISTENDROPS); } + reqsk_put(req); } EXPORT_SYMBOL(tcp_req_err); -- cgit v1.2.3 From f03f2e154f52fdaa982de7e2c386737679963dc9 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 14 Oct 2015 11:16:27 -0700 Subject: tcp/dccp: add inet_csk_reqsk_queue_drop_and_put() helper Let's reduce the confusion about inet_csk_reqsk_queue_drop() : In many cases we also need to release reference on request socket, so add a helper to do this, reducing code size and complexity. Fixes: 4bdc3d66147b ("tcp/dccp: fix behavior of stale SYN_RECV request sockets") Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/inet_connection_sock.h | 1 + net/dccp/ipv4.c | 2 +- net/dccp/ipv6.c | 2 +- net/ipv4/inet_connection_sock.c | 10 ++++++++-- net/ipv4/tcp_ipv4.c | 2 +- net/ipv6/tcp_ipv6.c | 2 +- 6 files changed, 13 insertions(+), 6 deletions(-) (limited to 'net/ipv4/tcp_ipv4.c') diff --git a/include/net/inet_connection_sock.h b/include/net/inet_connection_sock.h index fd645c49e71e..e84ea9f2498f 100644 --- a/include/net/inet_connection_sock.h +++ b/include/net/inet_connection_sock.h @@ -299,6 +299,7 @@ static inline int inet_csk_reqsk_queue_is_full(const struct sock *sk) } void inet_csk_reqsk_queue_drop(struct sock *sk, struct request_sock *req); +void inet_csk_reqsk_queue_drop_and_put(struct sock *sk, struct request_sock *req); void inet_csk_destroy_sock(struct sock *sk); void inet_csk_prepare_forced_close(struct sock *sk); diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c index 644af510d932..59bc180b02d8 100644 --- a/net/dccp/ipv4.c +++ b/net/dccp/ipv4.c @@ -828,7 +828,7 @@ lookup: if (likely(sk->sk_state == DCCP_LISTEN)) { nsk = dccp_check_req(sk, skb, req); } else { - inet_csk_reqsk_queue_drop(sk, req); + inet_csk_reqsk_queue_drop_and_put(sk, req); goto lookup; } if (!nsk) { diff --git a/net/dccp/ipv6.c b/net/dccp/ipv6.c index 68831931b1fe..d9cc731f2619 100644 --- a/net/dccp/ipv6.c +++ b/net/dccp/ipv6.c @@ -686,7 +686,7 @@ lookup: if (likely(sk->sk_state == DCCP_LISTEN)) { nsk = dccp_check_req(sk, skb, req); } else { - inet_csk_reqsk_queue_drop(sk, req); + inet_csk_reqsk_queue_drop_and_put(sk, req); goto lookup; } if (!nsk) { diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index ba9ec9a0d0ce..b85c720956a9 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -546,6 +546,13 @@ void inet_csk_reqsk_queue_drop(struct sock *sk, struct request_sock *req) } EXPORT_SYMBOL(inet_csk_reqsk_queue_drop); +void inet_csk_reqsk_queue_drop_and_put(struct sock *sk, struct request_sock *req) +{ + inet_csk_reqsk_queue_drop(sk, req); + reqsk_put(req); +} +EXPORT_SYMBOL(inet_csk_reqsk_queue_drop_and_put); + static void reqsk_timer_handler(unsigned long data) { struct request_sock *req = (struct request_sock *)data; @@ -608,8 +615,7 @@ static void reqsk_timer_handler(unsigned long data) return; } drop: - inet_csk_reqsk_queue_drop(sk_listener, req); - reqsk_put(req); + inet_csk_reqsk_queue_drop_and_put(sk_listener, req); } static void reqsk_queue_hash_req(struct request_sock *req, diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index aad2298de7ad..9c68cf3762c4 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -1591,7 +1591,7 @@ process: if (likely(sk->sk_state == TCP_LISTEN)) { nsk = tcp_check_req(sk, skb, req, false); } else { - inet_csk_reqsk_queue_drop(sk, req); + inet_csk_reqsk_queue_drop_and_put(sk, req); goto lookup; } if (!nsk) { diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 7ce1c57199d1..acb06f86f372 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -1386,7 +1386,7 @@ process: if (likely(sk->sk_state == TCP_LISTEN)) { nsk = tcp_check_req(sk, skb, req, false); } else { - inet_csk_reqsk_queue_drop(sk, req); + inet_csk_reqsk_queue_drop_and_put(sk, req); goto lookup; } if (!nsk) { -- cgit v1.2.3 From dc6ef6be52154490c5c03f742e28bc781cc751b2 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 16 Oct 2015 13:00:01 -0700 Subject: tcp: do not set queue_mapping on SYNACK At the time of commit fff326990789 ("tcp: reflect SYN queue_mapping into SYNACK packets") we had little ways to cope with SYN floods. We no longer need to reflect incoming skb queue mappings, and instead can pick a TX queue based on cpu cooking the SYNACK, with normal XPS affinities. Note that all SYNACK retransmits were picking TX queue 0, this no longer is a win given that SYNACK rtx are now distributed on all cpus. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/tcp.h | 2 +- net/ipv4/ip_output.c | 1 - net/ipv4/tcp_input.c | 4 ++-- net/ipv4/tcp_ipv4.c | 2 -- net/ipv4/tcp_output.c | 2 +- net/ipv6/tcp_ipv6.c | 2 -- 6 files changed, 4 insertions(+), 9 deletions(-) (limited to 'net/ipv4/tcp_ipv4.c') diff --git a/include/net/tcp.h b/include/net/tcp.h index a6be56d5f0e3..eed94fc355c1 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -1716,7 +1716,7 @@ struct tcp_request_sock_ops { __u32 (*init_seq)(const struct sk_buff *skb); int (*send_synack)(const struct sock *sk, struct dst_entry *dst, struct flowi *fl, struct request_sock *req, - u16 queue_mapping, struct tcp_fastopen_cookie *foc, + struct tcp_fastopen_cookie *foc, bool attach_req); }; diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index 67404e1fe7d4..50e29737b584 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -1596,7 +1596,6 @@ void ip_send_unicast_reply(struct sock *sk, struct sk_buff *skb, arg->csumoffset) = csum_fold(csum_add(nskb->csum, arg->csum)); nskb->ip_summed = CHECKSUM_NONE; - skb_set_queue_mapping(nskb, skb_get_queue_mapping(skb)); ip_push_pending_frames(sk, &fl4); } out: diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 3b35c3f4d268..944eaca69115 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -6236,7 +6236,7 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops, } if (fastopen_sk) { af_ops->send_synack(fastopen_sk, dst, &fl, req, - skb_get_queue_mapping(skb), &foc, false); + &foc, false); /* Add the child socket directly into the accept queue */ inet_csk_reqsk_queue_add(sk, req, fastopen_sk); sk->sk_data_ready(sk); @@ -6247,7 +6247,7 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops, if (!want_cookie) inet_csk_reqsk_queue_hash_add(sk, req, TCP_TIMEOUT_INIT); af_ops->send_synack(sk, dst, &fl, req, - skb_get_queue_mapping(skb), &foc, !want_cookie); + &foc, !want_cookie); if (want_cookie) goto drop_and_free; } diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 9c68cf3762c4..30dd45c1f568 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -821,7 +821,6 @@ static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb, static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst, struct flowi *fl, struct request_sock *req, - u16 queue_mapping, struct tcp_fastopen_cookie *foc, bool attach_req) { @@ -839,7 +838,6 @@ static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst, if (skb) { __tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr); - skb_set_queue_mapping(skb, queue_mapping); err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr, ireq->ir_rmt_addr, ireq->opt); diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 6e79fcb0addb..19adedb8c5cc 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -3518,7 +3518,7 @@ int tcp_rtx_synack(const struct sock *sk, struct request_sock *req) int res; tcp_rsk(req)->txhash = net_tx_rndhash(); - res = af_ops->send_synack(sk, NULL, &fl, req, 0, NULL, true); + res = af_ops->send_synack(sk, NULL, &fl, req, NULL, true); if (!res) { TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_RETRANSSEGS); NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPSYNRETRANS); diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index acb06f86f372..f495d189f5e0 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -437,7 +437,6 @@ out: static int tcp_v6_send_synack(const struct sock *sk, struct dst_entry *dst, struct flowi *fl, struct request_sock *req, - u16 queue_mapping, struct tcp_fastopen_cookie *foc, bool attach_req) { @@ -462,7 +461,6 @@ static int tcp_v6_send_synack(const struct sock *sk, struct dst_entry *dst, if (np->repflow && ireq->pktopts) fl6->flowlabel = ip6_flowlabel(ipv6_hdr(ireq->pktopts)); - skb_set_queue_mapping(skb, queue_mapping); err = ip6_xmit(sk, skb, fl6, np->opt, np->tclass); err = net_xmit_eval(err); } -- cgit v1.2.3 From 5e0724d027f0548511a2165a209572d48fe7a4c8 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 22 Oct 2015 08:20:46 -0700 Subject: tcp/dccp: fix hashdance race for passive sessions Multiple cpus can process duplicates of incoming ACK messages matching a SYN_RECV request socket. This is a rare event under normal operations, but definitely can happen. Only one must win the race, otherwise corruption would occur. To fix this without adding new atomic ops, we use logic in inet_ehash_nolisten() to detect the request was present in the same ehash bucket where we try to insert the new child. If request socket was not found, we have to undo the child creation. This actually removes a spin_lock()/spin_unlock() pair in reqsk_queue_unlink() for the fast path. Fixes: e994b2f0fb92 ("tcp: do not lock listener to process SYN packets") Fixes: 079096f103fa ("tcp/dccp: install syn_recv requests into ehash table") Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/inet_connection_sock.h | 7 ++++++- include/net/inet_hashtables.h | 4 ++-- include/net/tcp.h | 4 +++- net/dccp/dccp.h | 4 +++- net/dccp/ipv4.c | 6 ++++-- net/dccp/ipv6.c | 9 ++++++--- net/dccp/minisocks.c | 14 +++++++------- net/ipv4/inet_connection_sock.c | 33 +++++++++++++++++++++++++------- net/ipv4/inet_hashtables.c | 39 ++++++++++++++++++++++++-------------- net/ipv4/syncookies.c | 4 +++- net/ipv4/tcp_fastopen.c | 4 +++- net/ipv4/tcp_ipv4.c | 6 ++++-- net/ipv4/tcp_minisocks.c | 11 ++++------- net/ipv6/tcp_ipv6.c | 9 ++++++--- 14 files changed, 102 insertions(+), 52 deletions(-) (limited to 'net/ipv4/tcp_ipv4.c') diff --git a/include/net/inet_connection_sock.h b/include/net/inet_connection_sock.h index 63615709839d..481fe1c9044c 100644 --- a/include/net/inet_connection_sock.h +++ b/include/net/inet_connection_sock.h @@ -43,7 +43,9 @@ struct inet_connection_sock_af_ops { int (*conn_request)(struct sock *sk, struct sk_buff *skb); struct sock *(*syn_recv_sock)(const struct sock *sk, struct sk_buff *skb, struct request_sock *req, - struct dst_entry *dst); + struct dst_entry *dst, + struct request_sock *req_unhash, + bool *own_req); u16 net_header_len; u16 net_frag_header_len; u16 sockaddr_len; @@ -272,6 +274,9 @@ void inet_csk_reqsk_queue_add(struct sock *sk, struct request_sock *req, struct sock *child); void inet_csk_reqsk_queue_hash_add(struct sock *sk, struct request_sock *req, unsigned long timeout); +struct sock *inet_csk_complete_hashdance(struct sock *sk, struct sock *child, + struct request_sock *req, + bool own_req); static inline void inet_csk_reqsk_queue_added(struct sock *sk) { diff --git a/include/net/inet_hashtables.h b/include/net/inet_hashtables.h index 6683ada25fef..de2e3ade6102 100644 --- a/include/net/inet_hashtables.h +++ b/include/net/inet_hashtables.h @@ -205,8 +205,8 @@ void inet_put_port(struct sock *sk); void inet_hashinfo_init(struct inet_hashinfo *h); -int inet_ehash_insert(struct sock *sk, struct sock *osk); -void __inet_hash_nolisten(struct sock *sk, struct sock *osk); +bool inet_ehash_insert(struct sock *sk, struct sock *osk); +bool inet_ehash_nolisten(struct sock *sk, struct sock *osk); void __inet_hash(struct sock *sk, struct sock *osk); void inet_hash(struct sock *sk); void inet_unhash(struct sock *sk); diff --git a/include/net/tcp.h b/include/net/tcp.h index 11e320412216..f80e74c5ad18 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -457,7 +457,9 @@ struct sock *tcp_create_openreq_child(const struct sock *sk, void tcp_ca_openreq_child(struct sock *sk, const struct dst_entry *dst); struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb, struct request_sock *req, - struct dst_entry *dst); + struct dst_entry *dst, + struct request_sock *req_unhash, + bool *own_req); int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb); int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len); int tcp_connect(struct sock *sk); diff --git a/net/dccp/dccp.h b/net/dccp/dccp.h index 923f5a180134..b0e28d24e1a7 100644 --- a/net/dccp/dccp.h +++ b/net/dccp/dccp.h @@ -278,7 +278,9 @@ int dccp_v4_do_rcv(struct sock *sk, struct sk_buff *skb); struct sock *dccp_v4_request_recv_sock(const struct sock *sk, struct sk_buff *skb, struct request_sock *req, - struct dst_entry *dst); + struct dst_entry *dst, + struct request_sock *req_unhash, + bool *own_req); struct sock *dccp_check_req(struct sock *sk, struct sk_buff *skb, struct request_sock *req); diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c index 59bc180b02d8..5684e14932bd 100644 --- a/net/dccp/ipv4.c +++ b/net/dccp/ipv4.c @@ -393,7 +393,9 @@ static inline u64 dccp_v4_init_sequence(const struct sk_buff *skb) struct sock *dccp_v4_request_recv_sock(const struct sock *sk, struct sk_buff *skb, struct request_sock *req, - struct dst_entry *dst) + struct dst_entry *dst, + struct request_sock *req_unhash, + bool *own_req) { struct inet_request_sock *ireq; struct inet_sock *newinet; @@ -426,7 +428,7 @@ struct sock *dccp_v4_request_recv_sock(const struct sock *sk, if (__inet_inherit_port(sk, newsk) < 0) goto put_and_exit; - __inet_hash_nolisten(newsk, NULL); + *own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash)); return newsk; diff --git a/net/dccp/ipv6.c b/net/dccp/ipv6.c index d9cc731f2619..ef4e48ce9143 100644 --- a/net/dccp/ipv6.c +++ b/net/dccp/ipv6.c @@ -380,7 +380,9 @@ drop: static struct sock *dccp_v6_request_recv_sock(const struct sock *sk, struct sk_buff *skb, struct request_sock *req, - struct dst_entry *dst) + struct dst_entry *dst, + struct request_sock *req_unhash, + bool *own_req) { struct inet_request_sock *ireq = inet_rsk(req); struct ipv6_pinfo *newnp; @@ -393,7 +395,8 @@ static struct sock *dccp_v6_request_recv_sock(const struct sock *sk, /* * v6 mapped */ - newsk = dccp_v4_request_recv_sock(sk, skb, req, dst); + newsk = dccp_v4_request_recv_sock(sk, skb, req, dst, + req_unhash, own_req); if (newsk == NULL) return NULL; @@ -511,7 +514,7 @@ static struct sock *dccp_v6_request_recv_sock(const struct sock *sk, dccp_done(newsk); goto out; } - __inet_hash(newsk, NULL); + *own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash)); return newsk; diff --git a/net/dccp/minisocks.c b/net/dccp/minisocks.c index d10aace43672..1994f8af646b 100644 --- a/net/dccp/minisocks.c +++ b/net/dccp/minisocks.c @@ -143,6 +143,7 @@ struct sock *dccp_check_req(struct sock *sk, struct sk_buff *skb, { struct sock *child = NULL; struct dccp_request_sock *dreq = dccp_rsk(req); + bool own_req; /* Check for retransmitted REQUEST */ if (dccp_hdr(skb)->dccph_type == DCCP_PKT_REQUEST) { @@ -182,14 +183,13 @@ struct sock *dccp_check_req(struct sock *sk, struct sk_buff *skb, if (dccp_parse_options(sk, dreq, skb)) goto drop; - child = inet_csk(sk)->icsk_af_ops->syn_recv_sock(sk, skb, req, NULL); - if (child == NULL) + child = inet_csk(sk)->icsk_af_ops->syn_recv_sock(sk, skb, req, NULL, + req, &own_req); + if (!child) goto listen_overflow; - inet_csk_reqsk_queue_drop(sk, req); - inet_csk_reqsk_queue_add(sk, req, child); -out: - return child; + return inet_csk_complete_hashdance(sk, child, req, own_req); + listen_overflow: dccp_pr_debug("listen_overflow!\n"); DCCP_SKB_CB(skb)->dccpd_reset_code = DCCP_RESET_CODE_TOO_BUSY; @@ -198,7 +198,7 @@ drop: req->rsk_ops->send_reset(sk, skb); inet_csk_reqsk_queue_drop(sk, req); - goto out; + return NULL; } EXPORT_SYMBOL_GPL(dccp_check_req); diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index 8430bc8ccd58..1feb15f23de8 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -523,15 +523,15 @@ static bool reqsk_queue_unlink(struct request_sock_queue *queue, struct request_sock *req) { struct inet_hashinfo *hashinfo = req_to_sk(req)->sk_prot->h.hashinfo; - spinlock_t *lock; - bool found; + bool found = false; - lock = inet_ehash_lockp(hashinfo, req->rsk_hash); - - spin_lock(lock); - found = __sk_nulls_del_node_init_rcu(req_to_sk(req)); - spin_unlock(lock); + if (sk_hashed(req_to_sk(req))) { + spinlock_t *lock = inet_ehash_lockp(hashinfo, req->rsk_hash); + spin_lock(lock); + found = __sk_nulls_del_node_init_rcu(req_to_sk(req)); + spin_unlock(lock); + } if (timer_pending(&req->rsk_timer) && del_timer_sync(&req->rsk_timer)) reqsk_put(req); return found; @@ -811,6 +811,25 @@ void inet_csk_reqsk_queue_add(struct sock *sk, struct request_sock *req, } EXPORT_SYMBOL(inet_csk_reqsk_queue_add); +struct sock *inet_csk_complete_hashdance(struct sock *sk, struct sock *child, + struct request_sock *req, bool own_req) +{ + if (own_req) { + inet_csk_reqsk_queue_drop(sk, req); + reqsk_queue_removed(&inet_csk(sk)->icsk_accept_queue, req); + inet_csk_reqsk_queue_add(sk, req, child); + /* Warning: caller must not call reqsk_put(req); + * child stole last reference on it. + */ + return child; + } + /* Too bad, another child took ownership of the request, undo. */ + bh_unlock_sock(child); + sock_put(child); + return NULL; +} +EXPORT_SYMBOL(inet_csk_complete_hashdance); + /* * This routine closes sockets which have been at least partially * opened, but not yet accepted. diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c index 958728a22001..ccc5980797fc 100644 --- a/net/ipv4/inet_hashtables.c +++ b/net/ipv4/inet_hashtables.c @@ -407,13 +407,13 @@ static u32 inet_sk_port_offset(const struct sock *sk) /* insert a socket into ehash, and eventually remove another one * (The another one can be a SYN_RECV or TIMEWAIT */ -int inet_ehash_insert(struct sock *sk, struct sock *osk) +bool inet_ehash_insert(struct sock *sk, struct sock *osk) { struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo; struct hlist_nulls_head *list; struct inet_ehash_bucket *head; spinlock_t *lock; - int ret = 0; + bool ret = true; WARN_ON_ONCE(!sk_unhashed(sk)); @@ -423,30 +423,41 @@ int inet_ehash_insert(struct sock *sk, struct sock *osk) lock = inet_ehash_lockp(hashinfo, sk->sk_hash); spin_lock(lock); - __sk_nulls_add_node_rcu(sk, list); if (osk) { - WARN_ON(sk->sk_hash != osk->sk_hash); - sk_nulls_del_node_init_rcu(osk); + WARN_ON_ONCE(sk->sk_hash != osk->sk_hash); + ret = sk_nulls_del_node_init_rcu(osk); } + if (ret) + __sk_nulls_add_node_rcu(sk, list); spin_unlock(lock); return ret; } -void __inet_hash_nolisten(struct sock *sk, struct sock *osk) +bool inet_ehash_nolisten(struct sock *sk, struct sock *osk) { - inet_ehash_insert(sk, osk); - sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1); + bool ok = inet_ehash_insert(sk, osk); + + if (ok) { + sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1); + } else { + percpu_counter_inc(sk->sk_prot->orphan_count); + sk->sk_state = TCP_CLOSE; + sock_set_flag(sk, SOCK_DEAD); + inet_csk_destroy_sock(sk); + } + return ok; } -EXPORT_SYMBOL_GPL(__inet_hash_nolisten); +EXPORT_SYMBOL_GPL(inet_ehash_nolisten); void __inet_hash(struct sock *sk, struct sock *osk) { struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo; struct inet_listen_hashbucket *ilb; - if (sk->sk_state != TCP_LISTEN) - return __inet_hash_nolisten(sk, osk); - + if (sk->sk_state != TCP_LISTEN) { + inet_ehash_nolisten(sk, osk); + return; + } WARN_ON(!sk_unhashed(sk)); ilb = &hashinfo->listening_hash[inet_sk_listen_hashfn(sk)]; @@ -567,7 +578,7 @@ ok: inet_bind_hash(sk, tb, port); if (sk_unhashed(sk)) { inet_sk(sk)->inet_sport = htons(port); - __inet_hash_nolisten(sk, (struct sock *)tw); + inet_ehash_nolisten(sk, (struct sock *)tw); } if (tw) inet_twsk_bind_unhash(tw, hinfo); @@ -584,7 +595,7 @@ ok: tb = inet_csk(sk)->icsk_bind_hash; spin_lock_bh(&head->lock); if (sk_head(&tb->owners) == sk && !sk->sk_bind_node.next) { - __inet_hash_nolisten(sk, NULL); + inet_ehash_nolisten(sk, NULL); spin_unlock_bh(&head->lock); return 0; } else { diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c index 4c0892badb8b..4cbe9f0a4281 100644 --- a/net/ipv4/syncookies.c +++ b/net/ipv4/syncookies.c @@ -221,8 +221,10 @@ struct sock *tcp_get_cookie_sock(struct sock *sk, struct sk_buff *skb, { struct inet_connection_sock *icsk = inet_csk(sk); struct sock *child; + bool own_req; - child = icsk->icsk_af_ops->syn_recv_sock(sk, skb, req, dst); + child = icsk->icsk_af_ops->syn_recv_sock(sk, skb, req, dst, + NULL, &own_req); if (child) { atomic_set(&req->rsk_refcnt, 1); sock_rps_save_rxhash(child, skb); diff --git a/net/ipv4/tcp_fastopen.c b/net/ipv4/tcp_fastopen.c index 93396bf7b475..55be6ac70cff 100644 --- a/net/ipv4/tcp_fastopen.c +++ b/net/ipv4/tcp_fastopen.c @@ -133,12 +133,14 @@ static struct sock *tcp_fastopen_create_child(struct sock *sk, struct request_sock_queue *queue = &inet_csk(sk)->icsk_accept_queue; struct sock *child; u32 end_seq; + bool own_req; req->num_retrans = 0; req->num_timeout = 0; req->sk = NULL; - child = inet_csk(sk)->icsk_af_ops->syn_recv_sock(sk, skb, req, NULL); + child = inet_csk(sk)->icsk_af_ops->syn_recv_sock(sk, skb, req, NULL, + NULL, &own_req); if (!child) return NULL; diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 30dd45c1f568..1c2648bbac4b 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -1247,7 +1247,9 @@ EXPORT_SYMBOL(tcp_v4_conn_request); */ struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb, struct request_sock *req, - struct dst_entry *dst) + struct dst_entry *dst, + struct request_sock *req_unhash, + bool *own_req) { struct inet_request_sock *ireq; struct inet_sock *newinet; @@ -1323,7 +1325,7 @@ struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb, if (__inet_inherit_port(sk, newsk) < 0) goto put_and_exit; - __inet_hash_nolisten(newsk, NULL); + *own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash)); return newsk; diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index 1fd5d413a664..3575dd1e5b67 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -580,6 +580,7 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb, const struct tcphdr *th = tcp_hdr(skb); __be32 flg = tcp_flag_word(th) & (TCP_FLAG_RST|TCP_FLAG_SYN|TCP_FLAG_ACK); bool paws_reject = false; + bool own_req; tmp_opt.saw_tstamp = 0; if (th->doff > (sizeof(struct tcphdr)>>2)) { @@ -767,18 +768,14 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb, * ESTABLISHED STATE. If it will be dropped after * socket is created, wait for troubles. */ - child = inet_csk(sk)->icsk_af_ops->syn_recv_sock(sk, skb, req, NULL); + child = inet_csk(sk)->icsk_af_ops->syn_recv_sock(sk, skb, req, NULL, + req, &own_req); if (!child) goto listen_overflow; sock_rps_save_rxhash(child, skb); tcp_synack_rtt_meas(child, req); - inet_csk_reqsk_queue_drop(sk, req); - inet_csk_reqsk_queue_add(sk, req, child); - /* Warning: caller must not call reqsk_put(req); - * child stole last reference on it. - */ - return child; + return inet_csk_complete_hashdance(sk, child, req, own_req); listen_overflow: if (!sysctl_tcp_abort_on_overflow) { diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index f495d189f5e0..714bc5ad096e 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -965,7 +965,9 @@ drop: static struct sock *tcp_v6_syn_recv_sock(const struct sock *sk, struct sk_buff *skb, struct request_sock *req, - struct dst_entry *dst) + struct dst_entry *dst, + struct request_sock *req_unhash, + bool *own_req) { struct inet_request_sock *ireq; struct ipv6_pinfo *newnp; @@ -984,7 +986,8 @@ static struct sock *tcp_v6_syn_recv_sock(const struct sock *sk, struct sk_buff * * v6 mapped */ - newsk = tcp_v4_syn_recv_sock(sk, skb, req, dst); + newsk = tcp_v4_syn_recv_sock(sk, skb, req, dst, + req_unhash, own_req); if (!newsk) return NULL; @@ -1145,7 +1148,7 @@ static struct sock *tcp_v6_syn_recv_sock(const struct sock *sk, struct sk_buff * tcp_done(newsk); goto out; } - __inet_hash(newsk, NULL); + *own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash)); return newsk; -- cgit v1.2.3 From 805c4bc05705fb2b71ec970960b456eee9900953 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 5 Nov 2015 11:07:13 -0800 Subject: tcp: fix req->saved_syn race For the reasons explained in commit ce1050089c96 ("tcp/dccp: fix ireq->pktopts race"), we need to make sure we do not access req->saved_syn unless we own the request sock. This fixes races for listeners using TCP_SAVE_SYN option. Fixes: e994b2f0fb92 ("tcp: do not lock listener to process SYN packets") Fixes: 079096f103fa ("tcp/dccp: install syn_recv requests into ehash table") Signed-off-by: Eric Dumazet Reported-by: Ying Cai Signed-off-by: David S. Miller --- include/linux/tcp.h | 7 +++++++ net/ipv4/tcp_ipv4.c | 2 ++ net/ipv4/tcp_minisocks.c | 3 --- net/ipv6/tcp_ipv6.c | 20 ++++++++++++-------- 4 files changed, 21 insertions(+), 11 deletions(-) (limited to 'net/ipv4/tcp_ipv4.c') diff --git a/include/linux/tcp.h b/include/linux/tcp.h index c906f4534581..b386361ba3e8 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -397,6 +397,13 @@ static inline void fastopen_queue_tune(struct sock *sk, int backlog) queue->fastopenq.max_qlen = min_t(unsigned int, backlog, somaxconn); } +static inline void tcp_move_syn(struct tcp_sock *tp, + struct request_sock *req) +{ + tp->saved_syn = req->saved_syn; + req->saved_syn = NULL; +} + static inline void tcp_saved_syn_free(struct tcp_sock *tp) { kfree(tp->saved_syn); diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 1c2648bbac4b..59aff63b1776 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -1326,6 +1326,8 @@ struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb, if (__inet_inherit_port(sk, newsk) < 0) goto put_and_exit; *own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash)); + if (*own_req) + tcp_move_syn(newtp, req_unhash); return newsk; diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index 3575dd1e5b67..ac6b1961ffeb 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -551,9 +551,6 @@ struct sock *tcp_create_openreq_child(const struct sock *sk, newtp->rack.mstamp.v64 = 0; newtp->rack.advanced = 0; - newtp->saved_syn = req->saved_syn; - req->saved_syn = NULL; - TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_PASSIVEOPENS); } return newsk; diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index ea2f4d5440b5..c509e5562429 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -1140,14 +1140,18 @@ static struct sock *tcp_v6_syn_recv_sock(const struct sock *sk, struct sk_buff * goto out; } *own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash)); - /* Clone pktoptions received with SYN, if we own the req */ - if (*own_req && ireq->pktopts) { - newnp->pktoptions = skb_clone(ireq->pktopts, - sk_gfp_atomic(sk, GFP_ATOMIC)); - consume_skb(ireq->pktopts); - ireq->pktopts = NULL; - if (newnp->pktoptions) - skb_set_owner_r(newnp->pktoptions, newsk); + if (*own_req) { + tcp_move_syn(newtp, req_unhash); + + /* Clone pktoptions received with SYN, if we own the req */ + if (ireq->pktopts) { + newnp->pktoptions = skb_clone(ireq->pktopts, + sk_gfp_atomic(sk, GFP_ATOMIC)); + consume_skb(ireq->pktopts); + ireq->pktopts = NULL; + if (newnp->pktoptions) + skb_set_owner_r(newnp->pktoptions, newsk); + } } return newsk; -- cgit v1.2.3 From 49a496c97d035f2eab7cef4894dd46202184fc81 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 5 Nov 2015 12:50:19 -0800 Subject: tcp: use correct req pointer in tcp_move_syn() calls I mistakenly took wrong request sock pointer when calling tcp_move_syn() @req_unhash is either a copy of @req, or a NULL value for FastOpen connexions (as we do not expect to unhash the temporary request sock from ehash table) Fixes: 805c4bc05705 ("tcp: fix req->saved_syn race") Signed-off-by: Eric Dumazet Cc: Ying Cai Signed-off-by: David S. Miller --- net/ipv4/tcp_ipv4.c | 2 +- net/ipv6/tcp_ipv6.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'net/ipv4/tcp_ipv4.c') diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 59aff63b1776..950e28c0cdf2 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -1327,7 +1327,7 @@ struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb, goto put_and_exit; *own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash)); if (*own_req) - tcp_move_syn(newtp, req_unhash); + tcp_move_syn(newtp, req); return newsk; diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index c509e5562429..5baa8e754e41 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -1141,7 +1141,7 @@ static struct sock *tcp_v6_syn_recv_sock(const struct sock *sk, struct sk_buff * } *own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash)); if (*own_req) { - tcp_move_syn(newtp, req_unhash); + tcp_move_syn(newtp, req); /* Clone pktoptions received with SYN, if we own the req */ if (ireq->pktopts) { -- cgit v1.2.3 From 00fd38d938db3f1ab1c486549afc450cb7e751b1 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 12 Nov 2015 08:43:18 -0800 Subject: tcp: ensure proper barriers in lockless contexts Some functions access TCP sockets without holding a lock and might output non consistent data, depending on compiler and or architecture. tcp_diag_get_info(), tcp_get_info(), tcp_poll(), get_tcp4_sock() ... Introduce sk_state_load() and sk_state_store() to fix the issues, and more clearly document where this lack of locking is happening. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/sock.h | 25 +++++++++++++++++++++++++ net/ipv4/inet_connection_sock.c | 4 ++-- net/ipv4/tcp.c | 21 +++++++++++---------- net/ipv4/tcp_diag.c | 2 +- net/ipv4/tcp_ipv4.c | 14 ++++++++------ net/ipv6/tcp_ipv6.c | 19 +++++++++++++++---- 6 files changed, 62 insertions(+), 23 deletions(-) (limited to 'net/ipv4/tcp_ipv4.c') diff --git a/include/net/sock.h b/include/net/sock.h index bbf7c2cf15b4..7f89e4ba18d1 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -2226,6 +2226,31 @@ static inline bool sk_listener(const struct sock *sk) return (1 << sk->sk_state) & (TCPF_LISTEN | TCPF_NEW_SYN_RECV); } +/** + * sk_state_load - read sk->sk_state for lockless contexts + * @sk: socket pointer + * + * Paired with sk_state_store(). Used in places we do not hold socket lock : + * tcp_diag_get_info(), tcp_get_info(), tcp_poll(), get_tcp4_sock() ... + */ +static inline int sk_state_load(const struct sock *sk) +{ + return smp_load_acquire(&sk->sk_state); +} + +/** + * sk_state_store - update sk->sk_state + * @sk: socket pointer + * @newstate: new state + * + * Paired with sk_state_load(). Should be used in contexts where + * state change might impact lockless readers. + */ +static inline void sk_state_store(struct sock *sk, int newstate) +{ + smp_store_release(&sk->sk_state, newstate); +} + void sock_enable_timestamp(struct sock *sk, int flag); int sock_get_timestamp(struct sock *, struct timeval __user *); int sock_get_timestampns(struct sock *, struct timespec __user *); diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index 1feb15f23de8..46b9c887bede 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -563,7 +563,7 @@ static void reqsk_timer_handler(unsigned long data) int max_retries, thresh; u8 defer_accept; - if (sk_listener->sk_state != TCP_LISTEN) + if (sk_state_load(sk_listener) != TCP_LISTEN) goto drop; max_retries = icsk->icsk_syn_retries ? : sysctl_tcp_synack_retries; @@ -749,7 +749,7 @@ int inet_csk_listen_start(struct sock *sk, int backlog) * It is OK, because this socket enters to hash table only * after validation is complete. */ - sk->sk_state = TCP_LISTEN; + sk_state_store(sk, TCP_LISTEN); if (!sk->sk_prot->get_port(sk, inet->inet_num)) { inet->inet_sport = htons(inet->inet_num); diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 0cfa7c0c1e80..c1728771cf89 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -451,11 +451,14 @@ unsigned int tcp_poll(struct file *file, struct socket *sock, poll_table *wait) unsigned int mask; struct sock *sk = sock->sk; const struct tcp_sock *tp = tcp_sk(sk); + int state; sock_rps_record_flow(sk); sock_poll_wait(file, sk_sleep(sk), wait); - if (sk->sk_state == TCP_LISTEN) + + state = sk_state_load(sk); + if (state == TCP_LISTEN) return inet_csk_listen_poll(sk); /* Socket is not locked. We are protected from async events @@ -492,14 +495,14 @@ unsigned int tcp_poll(struct file *file, struct socket *sock, poll_table *wait) * NOTE. Check for TCP_CLOSE is added. The goal is to prevent * blocking on fresh not-connected or disconnected socket. --ANK */ - if (sk->sk_shutdown == SHUTDOWN_MASK || sk->sk_state == TCP_CLOSE) + if (sk->sk_shutdown == SHUTDOWN_MASK || state == TCP_CLOSE) mask |= POLLHUP; if (sk->sk_shutdown & RCV_SHUTDOWN) mask |= POLLIN | POLLRDNORM | POLLRDHUP; /* Connected or passive Fast Open socket? */ - if (sk->sk_state != TCP_SYN_SENT && - (sk->sk_state != TCP_SYN_RECV || tp->fastopen_rsk)) { + if (state != TCP_SYN_SENT && + (state != TCP_SYN_RECV || tp->fastopen_rsk)) { int target = sock_rcvlowat(sk, 0, INT_MAX); if (tp->urg_seq == tp->copied_seq && @@ -507,9 +510,6 @@ unsigned int tcp_poll(struct file *file, struct socket *sock, poll_table *wait) tp->urg_data) target++; - /* Potential race condition. If read of tp below will - * escape above sk->sk_state, we can be illegally awaken - * in SYN_* states. */ if (tp->rcv_nxt - tp->copied_seq >= target) mask |= POLLIN | POLLRDNORM; @@ -1934,7 +1934,7 @@ void tcp_set_state(struct sock *sk, int state) /* Change state AFTER socket is unhashed to avoid closed * socket sitting in hash tables. */ - sk->sk_state = state; + sk_state_store(sk, state); #ifdef STATE_TRACE SOCK_DEBUG(sk, "TCP sk=%p, State %s -> %s\n", sk, statename[oldstate], statename[state]); @@ -2644,7 +2644,8 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info) if (sk->sk_type != SOCK_STREAM) return; - info->tcpi_state = sk->sk_state; + info->tcpi_state = sk_state_load(sk); + info->tcpi_ca_state = icsk->icsk_ca_state; info->tcpi_retransmits = icsk->icsk_retransmits; info->tcpi_probes = icsk->icsk_probes_out; @@ -2672,7 +2673,7 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info) info->tcpi_snd_mss = tp->mss_cache; info->tcpi_rcv_mss = icsk->icsk_ack.rcv_mss; - if (sk->sk_state == TCP_LISTEN) { + if (info->tcpi_state == TCP_LISTEN) { info->tcpi_unacked = sk->sk_ack_backlog; info->tcpi_sacked = sk->sk_max_ack_backlog; } else { diff --git a/net/ipv4/tcp_diag.c b/net/ipv4/tcp_diag.c index 479f34946177..b31604086edd 100644 --- a/net/ipv4/tcp_diag.c +++ b/net/ipv4/tcp_diag.c @@ -21,7 +21,7 @@ static void tcp_diag_get_info(struct sock *sk, struct inet_diag_msg *r, { struct tcp_info *info = _info; - if (sk->sk_state == TCP_LISTEN) { + if (sk_state_load(sk) == TCP_LISTEN) { r->idiag_rqueue = sk->sk_ack_backlog; r->idiag_wqueue = sk->sk_max_ack_backlog; } else if (sk->sk_type == SOCK_STREAM) { diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 950e28c0cdf2..ba09016d1bfd 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -2158,6 +2158,7 @@ static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i) __u16 destp = ntohs(inet->inet_dport); __u16 srcp = ntohs(inet->inet_sport); int rx_queue; + int state; if (icsk->icsk_pending == ICSK_TIME_RETRANS || icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS || @@ -2175,17 +2176,18 @@ static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i) timer_expires = jiffies; } - if (sk->sk_state == TCP_LISTEN) + state = sk_state_load(sk); + if (state == TCP_LISTEN) rx_queue = sk->sk_ack_backlog; else - /* - * because we dont lock socket, we might find a transient negative value + /* Because we don't lock the socket, + * we might find a transient negative value. */ rx_queue = max_t(int, tp->rcv_nxt - tp->copied_seq, 0); seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX " "%08X %5u %8d %lu %d %pK %lu %lu %u %u %d", - i, src, srcp, dest, destp, sk->sk_state, + i, src, srcp, dest, destp, state, tp->write_seq - tp->snd_una, rx_queue, timer_active, @@ -2199,8 +2201,8 @@ static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i) jiffies_to_clock_t(icsk->icsk_ack.ato), (icsk->icsk_ack.quick << 1) | icsk->icsk_ack.pingpong, tp->snd_cwnd, - sk->sk_state == TCP_LISTEN ? - (fastopenq ? fastopenq->max_qlen : 0) : + state == TCP_LISTEN ? + fastopenq->max_qlen : (tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh)); } diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 5baa8e754e41..c5429a636f1a 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -1690,6 +1690,8 @@ static void get_tcp6_sock(struct seq_file *seq, struct sock *sp, int i) const struct tcp_sock *tp = tcp_sk(sp); const struct inet_connection_sock *icsk = inet_csk(sp); const struct fastopen_queue *fastopenq = &icsk->icsk_accept_queue.fastopenq; + int rx_queue; + int state; dest = &sp->sk_v6_daddr; src = &sp->sk_v6_rcv_saddr; @@ -1710,6 +1712,15 @@ static void get_tcp6_sock(struct seq_file *seq, struct sock *sp, int i) timer_expires = jiffies; } + state = sk_state_load(sp); + if (state == TCP_LISTEN) + rx_queue = sp->sk_ack_backlog; + else + /* Because we don't lock the socket, + * we might find a transient negative value. + */ + rx_queue = max_t(int, tp->rcv_nxt - tp->copied_seq, 0); + seq_printf(seq, "%4d: %08X%08X%08X%08X:%04X %08X%08X%08X%08X:%04X " "%02X %08X:%08X %02X:%08lX %08X %5u %8d %lu %d %pK %lu %lu %u %u %d\n", @@ -1718,9 +1729,9 @@ static void get_tcp6_sock(struct seq_file *seq, struct sock *sp, int i) src->s6_addr32[2], src->s6_addr32[3], srcp, dest->s6_addr32[0], dest->s6_addr32[1], dest->s6_addr32[2], dest->s6_addr32[3], destp, - sp->sk_state, - tp->write_seq-tp->snd_una, - (sp->sk_state == TCP_LISTEN) ? sp->sk_ack_backlog : (tp->rcv_nxt - tp->copied_seq), + state, + tp->write_seq - tp->snd_una, + rx_queue, timer_active, jiffies_delta_to_clock_t(timer_expires - jiffies), icsk->icsk_retransmits, @@ -1732,7 +1743,7 @@ static void get_tcp6_sock(struct seq_file *seq, struct sock *sp, int i) jiffies_to_clock_t(icsk->icsk_ack.ato), (icsk->icsk_ack.quick << 1) | icsk->icsk_ack.pingpong, tp->snd_cwnd, - sp->sk_state == TCP_LISTEN ? + state == TCP_LISTEN ? fastopenq->max_qlen : (tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh) ); -- cgit v1.2.3 From 1b8e6a01e19f001e9f93b39c32387961c91ed3cc Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 18 Nov 2015 12:40:13 -0800 Subject: tcp: md5: fix lockdep annotation When a passive TCP is created, we eventually call tcp_md5_do_add() with sk pointing to the child. It is not owner by the user yet (we will add this socket into listener accept queue a bit later anyway) But we do own the spinlock, so amend the lockdep annotation to avoid following splat : [ 8451.090932] net/ipv4/tcp_ipv4.c:923 suspicious rcu_dereference_protected() usage! [ 8451.090932] [ 8451.090932] other info that might help us debug this: [ 8451.090932] [ 8451.090934] [ 8451.090934] rcu_scheduler_active = 1, debug_locks = 1 [ 8451.090936] 3 locks held by socket_sockopt_/214795: [ 8451.090936] #0: (rcu_read_lock){.+.+..}, at: [] __netif_receive_skb_core+0x151/0xe90 [ 8451.090947] #1: (rcu_read_lock){.+.+..}, at: [] ip_local_deliver_finish+0x43/0x2b0 [ 8451.090952] #2: (slock-AF_INET){+.-...}, at: [] sk_clone_lock+0x1c5/0x500 [ 8451.090958] [ 8451.090958] stack backtrace: [ 8451.090960] CPU: 7 PID: 214795 Comm: socket_sockopt_ [ 8451.091215] Call Trace: [ 8451.091216] [] dump_stack+0x55/0x76 [ 8451.091229] [] lockdep_rcu_suspicious+0xeb/0x110 [ 8451.091235] [] tcp_md5_do_add+0x1bf/0x1e0 [ 8451.091239] [] tcp_v4_syn_recv_sock+0x1f1/0x4c0 [ 8451.091242] [] ? tcp_v4_md5_hash_skb+0x167/0x190 [ 8451.091246] [] tcp_check_req+0x3c8/0x500 [ 8451.091249] [] ? tcp_v4_inbound_md5_hash+0x11e/0x190 [ 8451.091253] [] tcp_v4_rcv+0x3c0/0x9f0 [ 8451.091256] [] ? ip_local_deliver_finish+0x43/0x2b0 [ 8451.091260] [] ip_local_deliver_finish+0xb6/0x2b0 [ 8451.091263] [] ? ip_local_deliver_finish+0x43/0x2b0 [ 8451.091267] [] ip_local_deliver+0x48/0x80 [ 8451.091270] [] ip_rcv_finish+0x160/0x700 [ 8451.091273] [] ip_rcv+0x29e/0x3d0 [ 8451.091277] [] __netif_receive_skb_core+0xb47/0xe90 Fixes: a8afca0329988 ("tcp: md5: protects md5sig_info with RCU") Signed-off-by: Eric Dumazet Reported-by: Willem de Bruijn Signed-off-by: David S. Miller --- net/ipv4/tcp_ipv4.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net/ipv4/tcp_ipv4.c') diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index ba09016d1bfd..db003438aaf5 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -921,7 +921,8 @@ int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr, } md5sig = rcu_dereference_protected(tp->md5sig_info, - sock_owned_by_user(sk)); + sock_owned_by_user(sk) || + lockdep_is_held(&sk->sk_lock.slock)); if (!md5sig) { md5sig = kmalloc(sizeof(*md5sig), gfp); if (!md5sig) -- cgit v1.2.3