From 58d607d3e52f2b15902f58a1161da9fb3b0f6d47 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 15 Sep 2015 15:24:20 -0700 Subject: tcp: provide skb->hash to synack packets In commit b73c3d0e4f0e ("net: Save TX flow hash in sock and set in skbuf on xmit"), Tom provided an L4 hash to most outgoing TCP packets. We'd like to provide one as well for SYNACK packets, so that all packets of a given flow share the same txhash, later enabling the bonding driver to also use skb->hash to perform slave selection. Note that a SYNACK retransmit shuffles the tx hash, as Tom did in commit 265f94ff54d62 ("net: Recompute sk_txhash on negative routing advice") for established sockets. This has the nice effect of making TCP flows resilient to some kinds of black holes, even during the connection establishment phase. Signed-off-by: Eric Dumazet Cc: Tom Herbert Cc: Mahesh Bandewar Acked-by: Tom Herbert Signed-off-by: David S. Miller --- net/ipv4/tcp_output.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'net/ipv4/tcp_output.c') diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index f9a8a12b62ee..d0ad3554c333 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -2987,6 +2987,7 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst, rcu_read_lock(); md5 = tcp_rsk(req)->af_specific->req_md5_lookup(sk, req_to_sk(req)); #endif + skb_set_hash(skb, tcp_rsk(req)->txhash, PKT_HASH_TYPE_L4); tcp_header_size = tcp_synack_options(sk, req, mss, skb, &opts, md5, foc) + sizeof(*th); @@ -3505,6 +3506,7 @@ int tcp_rtx_synack(struct sock *sk, struct request_sock *req) struct flowi fl; int res; + tcp_rsk(req)->txhash = net_tx_rndhash(); res = af_ops->send_synack(sk, NULL, &fl, req, 0, NULL); if (!res) { TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_RETRANSSEGS); -- cgit v1.2.3 From f9b9958229638245b5709f27c76c199a465f1496 Mon Sep 17 00:00:00 2001 From: Yuchung Cheng Date: Fri, 18 Sep 2015 11:40:33 -0700 Subject: tcp: send loss probe after 1s if no RTT available This patch makes TLP use a 1 sec timer by default when RTT is not available due to SYN/ACK retransmission or SYN cookies. Prior to this change, the lack of an RTT prevented TLP, so the first data packets sent could only be recovered by fast recovery or the RTO. If fast recovery fails to trigger, the RTO is 3 seconds when the SYN/ACK is retransmitted. With this patch we can trigger fast recovery in 1 sec instead. Note that we need to check Fast Open more carefully. A Fast Open connection could be (accepted then) closed before it receives the final ACK of the 3WHS, so the state is FIN_WAIT_1. Without the new check, TLP will retransmit the FIN instead of the SYN/ACK. Signed-off-by: Yuchung Cheng Signed-off-by: Nandita Dukkipati Signed-off-by: Neal Cardwell Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/tcp_output.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'net/ipv4/tcp_output.c') diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index d0ad3554c333..4cd0b50d4e46 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -2165,7 +2165,7 @@ bool tcp_schedule_loss_probe(struct sock *sk) /* Don't do any loss probe on a Fast Open connection before 3WHS * finishes. */ - if (sk->sk_state == TCP_SYN_RECV) + if (tp->fastopen_rsk) return false; /* TLP is only scheduled when next timer event is RTO. */ @@ -2175,7 +2175,7 @@ bool tcp_schedule_loss_probe(struct sock *sk) /* Schedule a loss probe in 2*RTT for SACK capable connections * in Open state, that are either limited by cwnd or application. 
*/ - if (sysctl_tcp_early_retrans < 3 || !tp->srtt_us || !tp->packets_out || + if (sysctl_tcp_early_retrans < 3 || !tp->packets_out || !tcp_is_sack(tp) || inet_csk(sk)->icsk_ca_state != TCP_CA_Open) return false; @@ -2184,9 +2184,10 @@ bool tcp_schedule_loss_probe(struct sock *sk) return false; /* Probe timeout is at least 1.5*rtt + TCP_DELACK_MAX to account - * for delayed ack when there's one outstanding packet. + * for delayed ack when there's one outstanding packet. If no RTT + * sample is available then probe after TCP_TIMEOUT_INIT. */ - timeout = rtt << 1; + timeout = rtt << 1 ? : TCP_TIMEOUT_INIT; if (tp->packets_out == 1) timeout = max_t(u32, timeout, (rtt + (rtt >> 1) + TCP_DELACK_MAX)); -- cgit v1.2.3 From 37bfbdda0b036a3720924e04c0171d9038159c2c Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 25 Sep 2015 07:39:17 -0700 Subject: tcp: remove tcp_synack_options() socket argument We do not use the socket in this function. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/tcp_output.c | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) (limited to 'net/ipv4/tcp_output.c') diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 4cd0b50d4e46..87392cb51b11 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -612,12 +612,11 @@ static unsigned int tcp_syn_options(struct sock *sk, struct sk_buff *skb, } /* Set up TCP options for SYN-ACKs. */ -static unsigned int tcp_synack_options(struct sock *sk, - struct request_sock *req, - unsigned int mss, struct sk_buff *skb, - struct tcp_out_options *opts, - const struct tcp_md5sig_key *md5, - struct tcp_fastopen_cookie *foc) +static unsigned int tcp_synack_options(struct request_sock *req, + unsigned int mss, struct sk_buff *skb, + struct tcp_out_options *opts, + const struct tcp_md5sig_key *md5, - struct tcp_fastopen_cookie *foc) { struct inet_request_sock *ireq = inet_rsk(req); unsigned int remaining = MAX_TCP_OPTION_SPACE; @@ -2989,8 +2988,8 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst, md5 = tcp_rsk(req)->af_specific->req_md5_lookup(sk, req_to_sk(req)); #endif skb_set_hash(skb, tcp_rsk(req)->txhash, PKT_HASH_TYPE_L4); - tcp_header_size = tcp_synack_options(sk, req, mss, skb, &opts, md5, - foc) + sizeof(*th); + tcp_header_size = tcp_synack_options(req, mss, skb, &opts, md5, foc) + + sizeof(*th); skb_push(skb, tcp_header_size); skb_reset_transport_header(skb); -- cgit v1.2.3 From 6ac705b1805863b1899e85f641bb265f9e6e9d99 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 25 Sep 2015 07:39:18 -0700 Subject: tcp: remove tcp_ecn_make_synack() socket argument SYNACK packets might be sent without holding the socket lock. For DCTCP/ECN's sake, we should call INET_ECN_xmit() while the socket lock is owned, and only when we init/change congestion control. This also fixes a bug when the congestion control module is changed from dctcp to another one on a listener: we now clear the ECN bits properly. Signed-off-by: Eric Dumazet Signed-off-by: David S. 
Miller --- net/ipv4/tcp_cong.c | 12 ++++++++++-- net/ipv4/tcp_output.c | 10 +++------- 2 files changed, 13 insertions(+), 9 deletions(-) (limited to 'net/ipv4/tcp_output.c') diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c index 93c4dc3ab23f..882caa4e72bc 100644 --- a/net/ipv4/tcp_cong.c +++ b/net/ipv4/tcp_cong.c @@ -173,6 +173,10 @@ out: */ if (ca->get_info) memset(icsk->icsk_ca_priv, 0, sizeof(icsk->icsk_ca_priv)); + if (ca->flags & TCP_CONG_NEEDS_ECN) + INET_ECN_xmit(sk); + else + INET_ECN_dontxmit(sk); } void tcp_init_congestion_control(struct sock *sk) @@ -181,6 +185,10 @@ void tcp_init_congestion_control(struct sock *sk) if (icsk->icsk_ca_ops->init) icsk->icsk_ca_ops->init(sk); + if (tcp_ca_needs_ecn(sk)) + INET_ECN_xmit(sk); + else + INET_ECN_dontxmit(sk); } static void tcp_reinit_congestion_control(struct sock *sk, @@ -192,8 +200,8 @@ static void tcp_reinit_congestion_control(struct sock *sk, icsk->icsk_ca_ops = ca; icsk->icsk_ca_setsockopt = 1; - if (sk->sk_state != TCP_CLOSE && icsk->icsk_ca_ops->init) - icsk->icsk_ca_ops->init(sk); + if (sk->sk_state != TCP_CLOSE) + tcp_init_congestion_control(sk); } /* Manage refcounts on socket close. */ diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 87392cb51b11..ba6194152d39 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -357,14 +357,10 @@ static void tcp_ecn_clear_syn(struct sock *sk, struct sk_buff *skb) } static void -tcp_ecn_make_synack(const struct request_sock *req, struct tcphdr *th, - struct sock *sk) +tcp_ecn_make_synack(const struct request_sock *req, struct tcphdr *th) { - if (inet_rsk(req)->ecn_ok) { + if (inet_rsk(req)->ecn_ok) th->ece = 1; - if (tcp_ca_needs_ecn(sk)) - INET_ECN_xmit(sk); - } } /* Set up ECN state for a packet on a ESTABLISHED socket that is about to @@ -2998,7 +2994,7 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst, memset(th, 0, sizeof(struct tcphdr)); th->syn = 1; th->ack = 1; - tcp_ecn_make_synack(req, th, sk); + tcp_ecn_make_synack(req, th); th->source = htons(ireq->ir_num); th->dest = ireq->ir_rmt_port; /* Setting of flags are superfluous here for callers (and ECE is -- cgit v1.2.3 From 5d062de7f8ea1ca7c635957ff1144fba815ba34c Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 25 Sep 2015 07:39:19 -0700 Subject: tcp: constify tcp_make_synack() socket argument listener socket is not locked when tcp_make_synack() is called. We better make sure no field is written. There is one exception : Since SYNACK packets are attached to the listener at this moment (or SYN_RECV child in case of Fast Open), sock_wmalloc() needs to update sk->sk_wmem_alloc, but this is done using atomic operations so this is safe. Signed-off-by: Eric Dumazet Signed-off-by: David S. 
Miller --- include/net/tcp.h | 2 +- net/ipv4/tcp_output.c | 24 +++++++++++++++--------- 2 files changed, 16 insertions(+), 10 deletions(-) (limited to 'net/ipv4/tcp_output.c') diff --git a/include/net/tcp.h b/include/net/tcp.h index 45bc3c63c3fd..19f23590baa0 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -461,7 +461,7 @@ struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb, int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb); int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len); int tcp_connect(struct sock *sk); -struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst, +struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst, struct request_sock *req, struct tcp_fastopen_cookie *foc); int tcp_disconnect(struct sock *sk, int flags); diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index ba6194152d39..9eb67a8933f1 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -2944,20 +2944,25 @@ int tcp_send_synack(struct sock *sk) * Allocate one skb and build a SYNACK packet. * @dst is consumed : Caller should not use it again. */ -struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst, +struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst, struct request_sock *req, struct tcp_fastopen_cookie *foc) { - struct tcp_out_options opts; struct inet_request_sock *ireq = inet_rsk(req); - struct tcp_sock *tp = tcp_sk(sk); - struct tcphdr *th; - struct sk_buff *skb; + const struct tcp_sock *tp = tcp_sk(sk); struct tcp_md5sig_key *md5 = NULL; + struct tcp_out_options opts; + struct sk_buff *skb; int tcp_header_size; + struct tcphdr *th; + u16 user_mss; int mss; - skb = sock_wmalloc(sk, MAX_TCP_HEADER, 1, GFP_ATOMIC); + /* sk is a const pointer, because we want to express multiple cpus + * might call us concurrently. + * sock_wmalloc() will change sk->sk_wmem_alloc in an atomic way. + */ + skb = sock_wmalloc((struct sock *)sk, MAX_TCP_HEADER, 1, GFP_ATOMIC); if (unlikely(!skb)) { dst_release(dst); return NULL; @@ -2968,8 +2973,9 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst, skb_dst_set(skb, dst); mss = dst_metric_advmss(dst); - if (tp->rx_opt.user_mss && tp->rx_opt.user_mss < mss) - mss = tp->rx_opt.user_mss; + user_mss = READ_ONCE(tp->rx_opt.user_mss); + if (user_mss && user_mss < mss) + mss = user_mss; memset(&opts, 0, sizeof(opts)); #ifdef CONFIG_SYN_COOKIES @@ -3009,7 +3015,7 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst, /* RFC1323: The window in SYN & SYN/ACK segments is never scaled. */ th->window = htons(min(req->rcv_wnd, 65535U)); - tcp_options_write((__be32 *)(th + 1), tp, &opts); + tcp_options_write((__be32 *)(th + 1), NULL, &opts); th->doff = (tcp_header_size >> 2); TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_OUTSEGS); -- cgit v1.2.3 From ea3bea3a1d38aab1542176b2ff11a99ce3db9656 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 25 Sep 2015 07:39:23 -0700 Subject: tcp/dccp: constify rtx_synack() and friends This is done to make sure we do not change listener socket while sending SYNACK packets while socket lock is not held. Signed-off-by: Eric Dumazet Signed-off-by: David S. 
Miller --- include/net/request_sock.h | 2 +- include/net/tcp.h | 2 +- net/dccp/ipv4.c | 2 +- net/dccp/ipv6.c | 2 +- net/ipv4/tcp_output.c | 2 +- 5 files changed, 5 insertions(+), 5 deletions(-) (limited to 'net/ipv4/tcp_output.c') diff --git a/include/net/request_sock.h b/include/net/request_sock.h index 87935cad2f7b..ff7ce1e53ed4 100644 --- a/include/net/request_sock.h +++ b/include/net/request_sock.h @@ -32,7 +32,7 @@ struct request_sock_ops { int obj_size; struct kmem_cache *slab; char *slab_name; - int (*rtx_syn_ack)(struct sock *sk, + int (*rtx_syn_ack)(const struct sock *sk, struct request_sock *req); void (*send_ack)(struct sock *sk, struct sk_buff *skb, struct request_sock *req); diff --git a/include/net/tcp.h b/include/net/tcp.h index 868c53532169..6630ab180f5c 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -1676,7 +1676,7 @@ int tcp4_proc_init(void); void tcp4_proc_exit(void); #endif -int tcp_rtx_synack(struct sock *sk, struct request_sock *req); +int tcp_rtx_synack(const struct sock *sk, struct request_sock *req); int tcp_conn_request(struct request_sock_ops *rsk_ops, const struct tcp_request_sock_ops *af_ops, struct sock *sk, struct sk_buff *skb); diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c index ccf4c5629b3c..a46ae9c69ccf 100644 --- a/net/dccp/ipv4.c +++ b/net/dccp/ipv4.c @@ -498,7 +498,7 @@ static struct dst_entry* dccp_v4_route_skb(struct net *net, struct sock *sk, return &rt->dst; } -static int dccp_v4_send_response(struct sock *sk, struct request_sock *req) +static int dccp_v4_send_response(const struct sock *sk, struct request_sock *req) { int err = -1; struct sk_buff *skb; diff --git a/net/dccp/ipv6.c b/net/dccp/ipv6.c index 5165571f397a..4fa199dc69a3 100644 --- a/net/dccp/ipv6.c +++ b/net/dccp/ipv6.c @@ -181,7 +181,7 @@ out: } -static int dccp_v6_send_response(struct sock *sk, struct request_sock *req) +static int dccp_v6_send_response(const struct sock *sk, struct request_sock *req) { struct inet_request_sock *ireq = inet_rsk(req); struct ipv6_pinfo *np = inet6_sk(sk); diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 9eb67a8933f1..53ce6cf55598 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -3502,7 +3502,7 @@ void tcp_send_probe0(struct sock *sk) TCP_RTO_MAX); } -int tcp_rtx_synack(struct sock *sk, struct request_sock *req) +int tcp_rtx_synack(const struct sock *sk, struct request_sock *req) { const struct tcp_request_sock_ops *af_ops = tcp_rsk(req)->af_specific; struct flowi fl; -- cgit v1.2.3 From d2e1339f40db753286ca0a92c92a847e08c5d2de Mon Sep 17 00:00:00 2001 From: Bendik Rønning Opstad Date: Wed, 23 Sep 2015 18:49:53 +0200 Subject: tcp: Fix CWV being too strict on thin streams MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Application limited streams such as thin streams, that transmit small amounts of payload in relatively few packets per RTT, can be prevented from growing the CWND when in congestion avoidance. This leads to increased sojourn times for data segments in streams that often transmit time-dependent data. Currently, a connection is considered CWND limited only after having successfully transmitted at least one packet with new data, while at the same time failing to transmit some unsent data from the output queue because the CWND is full. 
Applications that produce small amounts of data may be left in a state where it is never considered to be CWND limited, because all unsent data is successfully transmitted each time an incoming ACK opens up for more data to be transmitted in the send window. Fix by always testing whether the CWND is fully used after successful packet transmissions, such that a connection is considered CWND limited whenever the CWND has been filled. This is the correct behavior as specified in RFC2861 (section 3.1). Cc: Andreas Petlund Cc: Carsten Griwodz Cc: Jonas Markussen Cc: Kenneth Klette Jonassen Cc: Mads Johannessen Signed-off-by: Bendik Rønning Opstad Acked-by: Eric Dumazet Tested-by: Eric Dumazet Acked-by: Neal Cardwell Tested-by: Neal Cardwell Signed-off-by: David S. Miller --- net/ipv4/tcp_output.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net/ipv4/tcp_output.c') diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 9e53dd9bfcad..09bb082ca1a7 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -1822,7 +1822,7 @@ static bool tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb, /* Ok, it looks like it is advisable to defer. */ - if (cong_win < send_win && cong_win < skb->len) + if (cong_win < send_win && cong_win <= skb->len) *is_cwnd_limited = true; return true; @@ -2055,7 +2055,6 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, cwnd_quota = tcp_cwnd_test(tp, skb); if (!cwnd_quota) { - is_cwnd_limited = true; if (push_one == 2) /* Force out a loss probe pkt. */ cwnd_quota = 1; @@ -2137,6 +2136,7 @@ repair: /* Send one loss probe per tail loss episode. */ if (push_one != 2) tcp_schedule_loss_probe(sk); + is_cwnd_limited |= (tcp_packets_in_flight(tp) >= tp->snd_cwnd); tcp_cwnd_validate(sk, is_cwnd_limited); return false; } -- cgit v1.2.3 From ca6fb06518836ef9b65dc0aac02ff97704d52a05 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 2 Oct 2015 11:43:35 -0700 Subject: tcp: attach SYNACK messages to request sockets instead of listener If a listen backlog is very big (to avoid syncookies), then the listener sk->sk_wmem_alloc is the main source of false sharing, as we need to touch it twice per SYNACK re-transmit and TX completion. (One SYN packet takes listener lock once, but up to 6 SYNACK are generated) By attaching the skb to the request socket, we remove this source of contention. Tested: listen(fd, 10485760); // single listener (no SO_REUSEPORT) 16 RX/TX queue NIC Sustain a SYNFLOOD attack of ~320,000 SYN per second, Sending ~1,400,000 SYNACK per second. Perf profiles now show listener spinlock being next bottleneck. 20.29% [kernel] [k] queued_spin_lock_slowpath 10.06% [kernel] [k] __inet_lookup_established 5.12% [kernel] [k] reqsk_timer_handler 3.22% [kernel] [k] get_next_timer_interrupt 3.00% [kernel] [k] tcp_make_synack 2.77% [kernel] [k] ipt_do_table 2.70% [kernel] [k] run_timer_softirq 2.50% [kernel] [k] ip_finish_output 2.04% [kernel] [k] cascade Signed-off-by: Eric Dumazet Signed-off-by: David S. 
Miller --- include/net/tcp.h | 6 ++++-- net/ipv4/inet_connection_sock.c | 2 +- net/ipv4/tcp_fastopen.c | 4 ++-- net/ipv4/tcp_input.c | 23 ++++++++++++----------- net/ipv4/tcp_ipv4.c | 5 +++-- net/ipv4/tcp_output.c | 22 +++++++++++++++------- net/ipv6/tcp_ipv6.c | 5 +++-- net/sched/sch_fq.c | 12 +++++++----- 8 files changed, 47 insertions(+), 32 deletions(-) (limited to 'net/ipv4/tcp_output.c') diff --git a/include/net/tcp.h b/include/net/tcp.h index 225e9561af35..a6be56d5f0e3 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -462,7 +462,8 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len); int tcp_connect(struct sock *sk); struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst, struct request_sock *req, - struct tcp_fastopen_cookie *foc); + struct tcp_fastopen_cookie *foc, + bool attach_req); int tcp_disconnect(struct sock *sk, int flags); void tcp_finish_connect(struct sock *sk, struct sk_buff *skb); @@ -1715,7 +1716,8 @@ struct tcp_request_sock_ops { __u32 (*init_seq)(const struct sk_buff *skb); int (*send_synack)(const struct sock *sk, struct dst_entry *dst, struct flowi *fl, struct request_sock *req, - u16 queue_mapping, struct tcp_fastopen_cookie *foc); + u16 queue_mapping, struct tcp_fastopen_cookie *foc, + bool attach_req); }; #ifdef CONFIG_SYN_COOKIES diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index 80904df02187..099e0ea9242a 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -628,7 +628,7 @@ static void reqsk_queue_hash_req(struct request_sock *req, * are committed to memory and refcnt initialized. */ smp_wmb(); - atomic_set(&req->rsk_refcnt, 2); + atomic_set(&req->rsk_refcnt, 2 + 1); } void inet_csk_reqsk_queue_hash_add(struct sock *sk, struct request_sock *req, diff --git a/net/ipv4/tcp_fastopen.c b/net/ipv4/tcp_fastopen.c index f69f436fcbcc..410ac481fda0 100644 --- a/net/ipv4/tcp_fastopen.c +++ b/net/ipv4/tcp_fastopen.c @@ -161,13 +161,13 @@ static struct sock *tcp_fastopen_create_child(struct sock *sk, tp->snd_wnd = ntohs(tcp_hdr(skb)->window); /* Activate the retrans timer so that SYNACK can be retransmitted. - * The request socket is not added to the SYN table of the parent + * The request socket is not added to the ehash * because it's been added to the accept queue directly. 
*/ inet_csk_reset_xmit_timer(child, ICSK_TIME_RETRANS, TCP_TIMEOUT_INIT, TCP_RTO_MAX); - atomic_set(&req->rsk_refcnt, 1); + atomic_set(&req->rsk_refcnt, 2); /* Add the child socket directly into the accept queue */ inet_csk_reqsk_queue_add(sk, req, child); diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index a56912772354..27108757c310 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -6120,8 +6120,6 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops, struct request_sock *req; bool want_cookie = false; struct flowi fl; - int err; - /* TW buckets are converted to open requests without * limitations, they conserve resources and peer is @@ -6230,21 +6228,24 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops, tcp_rsk(req)->snt_isn = isn; tcp_rsk(req)->txhash = net_tx_rndhash(); tcp_openreq_init_rwin(req, sk, dst); - if (!want_cookie) + if (!want_cookie) { fastopen_sk = tcp_try_fastopen(sk, skb, req, &foc, dst); - err = af_ops->send_synack(fastopen_sk ?: sk, dst, &fl, req, - skb_get_queue_mapping(skb), &foc); + tcp_reqsk_record_syn(sk, req, skb); + } if (fastopen_sk) { + af_ops->send_synack(fastopen_sk, dst, &fl, req, + skb_get_queue_mapping(skb), &foc, false); sock_put(fastopen_sk); } else { - if (err || want_cookie) - goto drop_and_free; - tcp_rsk(req)->tfo_listener = false; - inet_csk_reqsk_queue_hash_add(sk, req, TCP_TIMEOUT_INIT); + if (!want_cookie) + inet_csk_reqsk_queue_hash_add(sk, req, TCP_TIMEOUT_INIT); + af_ops->send_synack(sk, dst, &fl, req, + skb_get_queue_mapping(skb), &foc, !want_cookie); + if (want_cookie) + goto drop_and_free; } - tcp_reqsk_record_syn(sk, req, skb); - + reqsk_put(req); return 0; drop_and_release: diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index bfe9d39ee87d..ac2ea73e9aaf 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -822,7 +822,8 @@ static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst, struct flowi *fl, struct request_sock *req, u16 queue_mapping, - struct tcp_fastopen_cookie *foc) + struct tcp_fastopen_cookie *foc, + bool attach_req) { const struct inet_request_sock *ireq = inet_rsk(req); struct flowi4 fl4; @@ -833,7 +834,7 @@ static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst, if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL) return -1; - skb = tcp_make_synack(sk, dst, req, foc); + skb = tcp_make_synack(sk, dst, req, foc, attach_req); if (skb) { __tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr); diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 09bb082ca1a7..55ed3266b05f 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -2947,7 +2947,8 @@ int tcp_send_synack(struct sock *sk) */ struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst, struct request_sock *req, - struct tcp_fastopen_cookie *foc) + struct tcp_fastopen_cookie *foc, + bool attach_req) { struct inet_request_sock *ireq = inet_rsk(req); const struct tcp_sock *tp = tcp_sk(sk); @@ -2959,11 +2960,7 @@ struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst, u16 user_mss; int mss; - /* sk is a const pointer, because we want to express multiple cpus - * might call us concurrently. - * sock_wmalloc() will change sk->sk_wmem_alloc in an atomic way. 
- */ - skb = sock_wmalloc((struct sock *)sk, MAX_TCP_HEADER, 1, GFP_ATOMIC); + skb = alloc_skb(MAX_TCP_HEADER, GFP_ATOMIC); if (unlikely(!skb)) { dst_release(dst); return NULL; @@ -2971,6 +2968,17 @@ struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst, /* Reserve space for headers. */ skb_reserve(skb, MAX_TCP_HEADER); + if (attach_req) { + skb->destructor = sock_edemux; + sock_hold(req_to_sk(req)); + skb->sk = req_to_sk(req); + } else { + /* sk is a const pointer, because we want to express multiple + * cpu might call us concurrently. + * sk->sk_wmem_alloc in an atomic, we can promote to rw. + */ + skb_set_owner_w(skb, (struct sock *)sk); + } skb_dst_set(skb, dst); mss = dst_metric_advmss(dst); @@ -3510,7 +3518,7 @@ int tcp_rtx_synack(const struct sock *sk, struct request_sock *req) int res; tcp_rsk(req)->txhash = net_tx_rndhash(); - res = af_ops->send_synack(sk, NULL, &fl, req, 0, NULL); + res = af_ops->send_synack(sk, NULL, &fl, req, 0, NULL, true); if (!res) { TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_RETRANSSEGS); NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPSYNRETRANS); diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index a215614cfb2b..3d18571811c5 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -438,7 +438,8 @@ static int tcp_v6_send_synack(const struct sock *sk, struct dst_entry *dst, struct flowi *fl, struct request_sock *req, u16 queue_mapping, - struct tcp_fastopen_cookie *foc) + struct tcp_fastopen_cookie *foc, + bool attach_req) { struct inet_request_sock *ireq = inet_rsk(req); struct ipv6_pinfo *np = inet6_sk(sk); @@ -451,7 +452,7 @@ static int tcp_v6_send_synack(const struct sock *sk, struct dst_entry *dst, IPPROTO_TCP)) == NULL) goto done; - skb = tcp_make_synack(sk, dst, req, foc); + skb = tcp_make_synack(sk, dst, req, foc, attach_req); if (skb) { __tcp_v6_send_check(skb, &ireq->ir_v6_loc_addr, diff --git a/net/sched/sch_fq.c b/net/sched/sch_fq.c index f377702d4b91..3386cce4751e 100644 --- a/net/sched/sch_fq.c +++ b/net/sched/sch_fq.c @@ -224,13 +224,15 @@ static struct fq_flow *fq_classify(struct sk_buff *skb, struct fq_sched_data *q) if (unlikely((skb->priority & TC_PRIO_MAX) == TC_PRIO_CONTROL)) return &q->internal; - /* SYNACK messages are attached to a listener socket. - * 1) They are not part of a 'flow' yet - * 2) We do not want to rate limit them (eg SYNFLOOD attack), + /* SYNACK messages are attached to a TCP_NEW_SYN_RECV request socket + * 1) request sockets are not full blown, + * they do not contain sk_pacing_rate + * 2) They are not part of a 'flow' yet + * 3) We do not want to rate limit them (eg SYNFLOOD attack), * especially if the listener set SO_MAX_PACING_RATE - * 3) We pretend they are orphaned + * 4) We pretend they are orphaned */ - if (!sk || sk->sk_state == TCP_LISTEN) { + if (!sk || sk->sk_state == TCP_NEW_SYN_RECV) { unsigned long hash = skb_get_hash(skb) & q->orphan_mask; /* By forcing low order bit to 1, we make sure to not -- cgit v1.2.3 From ed53d0ab761f5c71d77c8dc05fd19c0a851200db Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 8 Oct 2015 19:33:23 -0700 Subject: net: shrink struct sock and request_sock by 8 bytes One 32bit hole is following skc_refcnt, use it. skc_incoming_cpu can also be an union for request_sock rcv_wnd. Signed-off-by: Eric Dumazet Signed-off-by: David S. 
Miller --- include/net/request_sock.h | 5 ++--- include/net/sock.h | 14 +++++++++----- net/ipv4/syncookies.c | 4 ++-- net/ipv4/tcp_input.c | 2 +- net/ipv4/tcp_ipv4.c | 2 +- net/ipv4/tcp_minisocks.c | 18 +++++++++--------- net/ipv4/tcp_output.c | 2 +- net/ipv6/syncookies.c | 4 ++-- net/ipv6/tcp_ipv6.c | 2 +- 9 files changed, 28 insertions(+), 25 deletions(-) (limited to 'net/ipv4/tcp_output.c') diff --git a/include/net/request_sock.h b/include/net/request_sock.h index 6b818b77d5e5..2e73748956d5 100644 --- a/include/net/request_sock.h +++ b/include/net/request_sock.h @@ -51,15 +51,14 @@ struct request_sock { #define rsk_refcnt __req_common.skc_refcnt #define rsk_hash __req_common.skc_hash #define rsk_listener __req_common.skc_listener +#define rsk_window_clamp __req_common.skc_window_clamp +#define rsk_rcv_wnd __req_common.skc_rcv_wnd struct request_sock *dl_next; u16 mss; u8 num_retrans; /* number of retransmits */ u8 cookie_ts:1; /* syncookie: encode tcpopts in timestamp */ u8 num_timeout:7; /* number of timeouts */ - /* The following two fields can be easily recomputed I think -AK */ - u32 window_clamp; /* window clamp at creation time */ - u32 rcv_wnd; /* rcv_wnd offered first time */ u32 ts_recent; struct timer_list rsk_timer; const struct request_sock_ops *rsk_ops; diff --git a/include/net/sock.h b/include/net/sock.h index 65712409464b..19cfe1fc911c 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -226,11 +226,18 @@ struct sock_common { struct hlist_nulls_node skc_nulls_node; }; int skc_tx_queue_mapping; - int skc_incoming_cpu; + union { + int skc_incoming_cpu; + u32 skc_rcv_wnd; + }; atomic_t skc_refcnt; /* private: */ int skc_dontcopy_end[0]; + union { + u32 skc_rxhash; + u32 skc_window_clamp; + }; /* public: */ }; @@ -287,7 +294,6 @@ struct cg_proto; * @sk_rcvlowat: %SO_RCVLOWAT setting * @sk_rcvtimeo: %SO_RCVTIMEO setting * @sk_sndtimeo: %SO_SNDTIMEO setting - * @sk_rxhash: flow hash received from netif layer * @sk_txhash: computed flow hash for use on transmit * @sk_filter: socket filtering instructions * @sk_timer: sock cleanup timer @@ -346,6 +352,7 @@ struct sock { #define sk_cookie __sk_common.skc_cookie #define sk_incoming_cpu __sk_common.skc_incoming_cpu #define sk_flags __sk_common.skc_flags +#define sk_rxhash __sk_common.skc_rxhash socket_lock_t sk_lock; struct sk_buff_head sk_receive_queue; @@ -365,9 +372,6 @@ struct sock { } sk_backlog; #define sk_rmem_alloc sk_backlog.rmem_alloc int sk_forward_alloc; -#ifdef CONFIG_RPS - __u32 sk_rxhash; -#endif __u32 sk_txhash; #ifdef CONFIG_NET_RX_BUSY_POLL diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c index 2dbb11331f6c..4c0892badb8b 100644 --- a/net/ipv4/syncookies.c +++ b/net/ipv4/syncookies.c @@ -382,10 +382,10 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb) } /* Try to redo what tcp_v4_send_synack did. */ - req->window_clamp = tp->window_clamp ? :dst_metric(&rt->dst, RTAX_WINDOW); + req->rsk_window_clamp = tp->window_clamp ? 
:dst_metric(&rt->dst, RTAX_WINDOW); tcp_select_initial_window(tcp_full_space(sk), req->mss, - &req->rcv_wnd, &req->window_clamp, + &req->rsk_rcv_wnd, &req->rsk_window_clamp, ireq->wscale_ok, &rcv_wscale, dst_metric(&rt->dst, RTAX_INITRWND)); diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index ddadb318e850..3b35c3f4d268 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -6022,7 +6022,7 @@ static void tcp_openreq_init(struct request_sock *req, { struct inet_request_sock *ireq = inet_rsk(req); - req->rcv_wnd = 0; /* So that tcp_send_synack() knows! */ + req->rsk_rcv_wnd = 0; /* So that tcp_send_synack() knows! */ req->cookie_ts = 0; tcp_rsk(req)->rcv_isn = TCP_SKB_CB(skb)->seq; tcp_rsk(req)->rcv_nxt = TCP_SKB_CB(skb)->seq + 1; diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 34310748a365..ddb198392c7f 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -803,7 +803,7 @@ static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb, */ tcp_v4_send_ack(skb, (sk->sk_state == TCP_LISTEN) ? tcp_rsk(req)->snt_isn + 1 : tcp_sk(sk)->snd_nxt, - tcp_rsk(req)->rcv_nxt, req->rcv_wnd, + tcp_rsk(req)->rcv_nxt, req->rsk_rcv_wnd, tcp_time_stamp, req->ts_recent, 0, diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index 1079e6ad77fe..41828bdc5d32 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -381,18 +381,18 @@ void tcp_openreq_init_rwin(struct request_sock *req, window_clamp = READ_ONCE(tp->window_clamp); /* Set this up on the first call only */ - req->window_clamp = window_clamp ? : dst_metric(dst, RTAX_WINDOW); + req->rsk_window_clamp = window_clamp ? : dst_metric(dst, RTAX_WINDOW); /* limit the window selection if the user enforce a smaller rx buffer */ if (sk_listener->sk_userlocks & SOCK_RCVBUF_LOCK && - (req->window_clamp > full_space || req->window_clamp == 0)) - req->window_clamp = full_space; + (req->rsk_window_clamp > full_space || req->rsk_window_clamp == 0)) + req->rsk_window_clamp = full_space; /* tcp_full_space because it is guaranteed to be the first packet */ tcp_select_initial_window(full_space, mss - (ireq->tstamp_ok ? TCPOLEN_TSTAMP_ALIGNED : 0), - &req->rcv_wnd, - &req->window_clamp, + &req->rsk_rcv_wnd, + &req->rsk_window_clamp, ireq->wscale_ok, &rcv_wscale, dst_metric(dst, RTAX_INITRWND)); @@ -512,9 +512,9 @@ struct sock *tcp_create_openreq_child(const struct sock *sk, if (sysctl_tcp_fack) tcp_enable_fack(newtp); } - newtp->window_clamp = req->window_clamp; - newtp->rcv_ssthresh = req->rcv_wnd; - newtp->rcv_wnd = req->rcv_wnd; + newtp->window_clamp = req->rsk_window_clamp; + newtp->rcv_ssthresh = req->rsk_rcv_wnd; + newtp->rcv_wnd = req->rsk_rcv_wnd; newtp->rx_opt.wscale_ok = ireq->wscale_ok; if (newtp->rx_opt.wscale_ok) { newtp->rx_opt.snd_wscale = ireq->snd_wscale; @@ -707,7 +707,7 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb, /* RFC793: "first check sequence number". */ if (paws_reject || !tcp_in_window(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq, - tcp_rsk(req)->rcv_nxt, tcp_rsk(req)->rcv_nxt + req->rcv_wnd)) { + tcp_rsk(req)->rcv_nxt, tcp_rsk(req)->rcv_nxt + req->rsk_rcv_wnd)) { /* Out of window: send ACK and drop. 
*/ if (!(flg & TCP_FLAG_RST)) req->rsk_ops->send_ack(sk, skb, req); diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 55ed3266b05f..6e79fcb0addb 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -3023,7 +3023,7 @@ struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst, th->ack_seq = htonl(tcp_rsk(req)->rcv_nxt); /* RFC1323: The window in SYN & SYN/ACK segments is never scaled. */ - th->window = htons(min(req->rcv_wnd, 65535U)); + th->window = htons(min(req->rsk_rcv_wnd, 65535U)); tcp_options_write((__be32 *)(th + 1), NULL, &opts); th->doff = (tcp_header_size >> 2); TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_OUTSEGS); diff --git a/net/ipv6/syncookies.c b/net/ipv6/syncookies.c index f610b5310b17..bb8f2fa1c7fb 100644 --- a/net/ipv6/syncookies.c +++ b/net/ipv6/syncookies.c @@ -235,9 +235,9 @@ struct sock *cookie_v6_check(struct sock *sk, struct sk_buff *skb) goto out_free; } - req->window_clamp = tp->window_clamp ? :dst_metric(dst, RTAX_WINDOW); + req->rsk_window_clamp = tp->window_clamp ? :dst_metric(dst, RTAX_WINDOW); tcp_select_initial_window(tcp_full_space(sk), req->mss, - &req->rcv_wnd, &req->window_clamp, + &req->rsk_rcv_wnd, &req->rsk_window_clamp, ireq->wscale_ok, &rcv_wscale, dst_metric(dst, RTAX_INITRWND)); diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 33334f0c217d..2887c8474b65 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -931,7 +931,7 @@ static void tcp_v6_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb, */ tcp_v6_send_ack(sk, skb, (sk->sk_state == TCP_LISTEN) ? tcp_rsk(req)->snt_isn + 1 : tcp_sk(sk)->snd_nxt, - tcp_rsk(req)->rcv_nxt, req->rcv_wnd, + tcp_rsk(req)->rcv_nxt, req->rsk_rcv_wnd, tcp_time_stamp, req->ts_recent, sk->sk_bound_dev_if, tcp_v6_md5_do_lookup(sk, &ipv6_hdr(skb)->daddr), 0, 0); -- cgit v1.2.3 From dc6ef6be52154490c5c03f742e28bc781cc751b2 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 16 Oct 2015 13:00:01 -0700 Subject: tcp: do not set queue_mapping on SYNACK At the time of commit fff326990789 ("tcp: reflect SYN queue_mapping into SYNACK packets") we had little ways to cope with SYN floods. We no longer need to reflect incoming skb queue mappings, and instead can pick a TX queue based on cpu cooking the SYNACK, with normal XPS affinities. Note that all SYNACK retransmits were picking TX queue 0, this no longer is a win given that SYNACK rtx are now distributed on all cpus. Signed-off-by: Eric Dumazet Signed-off-by: David S. 
Miller --- include/net/tcp.h | 2 +- net/ipv4/ip_output.c | 1 - net/ipv4/tcp_input.c | 4 ++-- net/ipv4/tcp_ipv4.c | 2 -- net/ipv4/tcp_output.c | 2 +- net/ipv6/tcp_ipv6.c | 2 -- 6 files changed, 4 insertions(+), 9 deletions(-) (limited to 'net/ipv4/tcp_output.c') diff --git a/include/net/tcp.h b/include/net/tcp.h index a6be56d5f0e3..eed94fc355c1 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -1716,7 +1716,7 @@ struct tcp_request_sock_ops { __u32 (*init_seq)(const struct sk_buff *skb); int (*send_synack)(const struct sock *sk, struct dst_entry *dst, struct flowi *fl, struct request_sock *req, - u16 queue_mapping, struct tcp_fastopen_cookie *foc, + struct tcp_fastopen_cookie *foc, bool attach_req); }; diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index 67404e1fe7d4..50e29737b584 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -1596,7 +1596,6 @@ void ip_send_unicast_reply(struct sock *sk, struct sk_buff *skb, arg->csumoffset) = csum_fold(csum_add(nskb->csum, arg->csum)); nskb->ip_summed = CHECKSUM_NONE; - skb_set_queue_mapping(nskb, skb_get_queue_mapping(skb)); ip_push_pending_frames(sk, &fl4); } out: diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 3b35c3f4d268..944eaca69115 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -6236,7 +6236,7 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops, } if (fastopen_sk) { af_ops->send_synack(fastopen_sk, dst, &fl, req, - skb_get_queue_mapping(skb), &foc, false); + &foc, false); /* Add the child socket directly into the accept queue */ inet_csk_reqsk_queue_add(sk, req, fastopen_sk); sk->sk_data_ready(sk); @@ -6247,7 +6247,7 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops, if (!want_cookie) inet_csk_reqsk_queue_hash_add(sk, req, TCP_TIMEOUT_INIT); af_ops->send_synack(sk, dst, &fl, req, - skb_get_queue_mapping(skb), &foc, !want_cookie); + &foc, !want_cookie); if (want_cookie) goto drop_and_free; } diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 9c68cf3762c4..30dd45c1f568 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -821,7 +821,6 @@ static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb, static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst, struct flowi *fl, struct request_sock *req, - u16 queue_mapping, struct tcp_fastopen_cookie *foc, bool attach_req) { @@ -839,7 +838,6 @@ static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst, if (skb) { __tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr); - skb_set_queue_mapping(skb, queue_mapping); err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr, ireq->ir_rmt_addr, ireq->opt); diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 6e79fcb0addb..19adedb8c5cc 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -3518,7 +3518,7 @@ int tcp_rtx_synack(const struct sock *sk, struct request_sock *req) int res; tcp_rsk(req)->txhash = net_tx_rndhash(); - res = af_ops->send_synack(sk, NULL, &fl, req, 0, NULL, true); + res = af_ops->send_synack(sk, NULL, &fl, req, NULL, true); if (!res) { TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_RETRANSSEGS); NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPSYNRETRANS); diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index acb06f86f372..f495d189f5e0 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -437,7 +437,6 @@ out: static int tcp_v6_send_synack(const struct sock *sk, struct dst_entry *dst, struct flowi *fl, struct request_sock *req, - u16 queue_mapping, struct 
tcp_fastopen_cookie *foc, bool attach_req) { @@ -462,7 +461,6 @@ static int tcp_v6_send_synack(const struct sock *sk, struct dst_entry *dst, if (np->repflow && ireq->pktopts) fl6->flowlabel = ip6_flowlabel(ipv6_hdr(ireq->pktopts)); - skb_set_queue_mapping(skb, queue_mapping); err = ip6_xmit(sk, skb, fl6, np->opt, np->tclass); err = net_xmit_eval(err); } -- cgit v1.2.3 From af82f4e84866ecd360a53f770d6217637116e6c1 Mon Sep 17 00:00:00 2001 From: Yuchung Cheng Date: Fri, 16 Oct 2015 21:57:43 -0700 Subject: tcp: remove tcp_mark_lost_retrans() Remove the existing lost retransmit detection because RACK subsumes it completely. This also stops the overloading the ack_seq field of the skb control block. Signed-off-by: Yuchung Cheng Signed-off-by: Neal Cardwell Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/linux/tcp.h | 2 -- net/ipv4/tcp_input.c | 65 --------------------------------------------------- net/ipv4/tcp_output.c | 6 ----- 3 files changed, 73 deletions(-) (limited to 'net/ipv4/tcp_output.c') diff --git a/include/linux/tcp.h b/include/linux/tcp.h index 90edef5508f9..8c54863dfc38 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -283,8 +283,6 @@ struct tcp_sock { int lost_cnt_hint; u32 retransmit_high; /* L-bits may be on up to this seqno */ - u32 lost_retrans_low; /* Sent seq after any rxmit (lowest) */ - u32 prior_ssthresh; /* ssthresh saved at recovery start */ u32 high_seq; /* snd_nxt at onset of congestion */ diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index eedb25db3947..5a776897a8c7 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -1048,70 +1048,6 @@ static bool tcp_is_sackblock_valid(struct tcp_sock *tp, bool is_dsack, return !before(start_seq, end_seq - tp->max_window); } -/* Check for lost retransmit. This superb idea is borrowed from "ratehalving". - * Event "B". Later note: FACK people cheated me again 8), we have to account - * for reordering! Ugly, but should help. - * - * Search retransmitted skbs from write_queue that were sent when snd_nxt was - * less than what is now known to be received by the other end (derived from - * highest SACK block). Also calculate the lowest snd_nxt among the remaining - * retransmitted skbs to avoid some costly processing per ACKs. - */ -static void tcp_mark_lost_retrans(struct sock *sk, int *flag) -{ - const struct inet_connection_sock *icsk = inet_csk(sk); - struct tcp_sock *tp = tcp_sk(sk); - struct sk_buff *skb; - int cnt = 0; - u32 new_low_seq = tp->snd_nxt; - u32 received_upto = tcp_highest_sack_seq(tp); - - if (!tcp_is_fack(tp) || !tp->retrans_out || - !after(received_upto, tp->lost_retrans_low) || - icsk->icsk_ca_state != TCP_CA_Recovery) - return; - - tcp_for_write_queue(skb, sk) { - u32 ack_seq = TCP_SKB_CB(skb)->ack_seq; - - if (skb == tcp_send_head(sk)) - break; - if (cnt == tp->retrans_out) - break; - if (!after(TCP_SKB_CB(skb)->end_seq, tp->snd_una)) - continue; - - if (!(TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_RETRANS)) - continue; - - /* TODO: We would like to get rid of tcp_is_fack(tp) only - * constraint here (see above) but figuring out that at - * least tp->reordering SACK blocks reside between ack_seq - * and received_upto is not easy task to do cheaply with - * the available datastructures. - * - * Whether FACK should check here for tp->reordering segs - * in-between one could argue for either way (it would be - * rather simple to implement as we could count fack_count - * during the walk and do tp->fackets_out - fack_count). 
- */ - if (after(received_upto, ack_seq)) { - TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS; - tp->retrans_out -= tcp_skb_pcount(skb); - *flag |= FLAG_LOST_RETRANS; - tcp_skb_mark_lost_uncond_verify(tp, skb); - NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPLOSTRETRANSMIT); - } else { - if (before(ack_seq, new_low_seq)) - new_low_seq = ack_seq; - cnt += tcp_skb_pcount(skb); - } - } - - if (tp->retrans_out) - tp->lost_retrans_low = new_low_seq; -} - static bool tcp_check_dsack(struct sock *sk, const struct sk_buff *ack_skb, struct tcp_sack_block_wire *sp, int num_sacks, u32 prior_snd_una) @@ -1838,7 +1774,6 @@ advance_sp: ((inet_csk(sk)->icsk_ca_state != TCP_CA_Loss) || tp->undo_marker)) tcp_update_reordering(sk, tp->fackets_out - state->reord, 0); - tcp_mark_lost_retrans(sk, &state->flag); tcp_verify_left_out(tp); out: diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 19adedb8c5cc..f6f7f9b4901b 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -2655,8 +2655,6 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb) net_dbg_ratelimited("retrans_out leaked\n"); } #endif - if (!tp->retrans_out) - tp->lost_retrans_low = tp->snd_nxt; TCP_SKB_CB(skb)->sacked |= TCPCB_RETRANS; tp->retrans_out += tcp_skb_pcount(skb); @@ -2664,10 +2662,6 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb) if (!tp->retrans_stamp) tp->retrans_stamp = tcp_skb_timestamp(skb); - /* snd_nxt is stored to detect loss of retransmitted segment, - * see tcp_input.c tcp_sacktag_write_queue(). - */ - TCP_SKB_CB(skb)->ack_seq = tp->snd_nxt; } else if (err != -EBUSY) { NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPRETRANSFAIL); } -- cgit v1.2.3 From e2e8009ff72ad2a795b67785f3238af152146368 Mon Sep 17 00:00:00 2001 From: Renato Westphal Date: Mon, 19 Oct 2015 18:51:34 -0200 Subject: tcp: remove improper preemption check in tcp_xmit_probe_skb() Commit e520af48c7e5a introduced the following bug when setting the TCP_REPAIR sockoption: [ 2860.657036] BUG: using __this_cpu_add() in preemptible [00000000] code: daemon/12164 [ 2860.657045] caller is __this_cpu_preempt_check+0x13/0x20 [ 2860.657049] CPU: 1 PID: 12164 Comm: daemon Not tainted 4.2.3 #1 [ 2860.657051] Hardware name: Dell Inc. PowerEdge R210 II/0JP7TR, BIOS 2.0.5 03/13/2012 [ 2860.657054] ffffffff81c7f071 ffff880231e9fdf8 ffffffff8185d765 0000000000000002 [ 2860.657058] 0000000000000001 ffff880231e9fe28 ffffffff8146ed91 ffff880231e9fe18 [ 2860.657062] ffffffff81cd1a5d ffff88023534f200 ffff8800b9811000 ffff880231e9fe38 [ 2860.657065] Call Trace: [ 2860.657072] [] dump_stack+0x4f/0x7b [ 2860.657075] [] check_preemption_disabled+0xe1/0xf0 [ 2860.657078] [] __this_cpu_preempt_check+0x13/0x20 [ 2860.657082] [] tcp_xmit_probe_skb+0xc7/0x100 [ 2860.657085] [] tcp_send_window_probe+0x2d/0x30 [ 2860.657089] [] do_tcp_setsockopt.isra.29+0x74c/0x830 [ 2860.657093] [] tcp_setsockopt+0x2c/0x30 [ 2860.657097] [] sock_common_setsockopt+0x14/0x20 [ 2860.657100] [] SyS_setsockopt+0x71/0xc0 [ 2860.657104] [] entry_SYSCALL_64_fastpath+0x16/0x75 Since tcp_xmit_probe_skb() can be called from process context, use NET_INC_STATS() instead of NET_INC_STATS_BH(). Fixes: e520af48c7e5 ("tcp: add TCPWinProbe and TCPKeepAlive SNMP counters") Signed-off-by: Renato Westphal Signed-off-by: David S. 
Miller --- net/ipv4/tcp_output.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/ipv4/tcp_output.c') diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 1100ffe4a722..3dbee0d83b15 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -3405,7 +3405,7 @@ static int tcp_xmit_probe_skb(struct sock *sk, int urgent, int mib) */ tcp_init_nondata_skb(skb, tp->snd_una - !urgent, TCPHDR_ACK); skb_mstamp_get(&skb->skb_mstamp); - NET_INC_STATS_BH(sock_net(sk), mib); + NET_INC_STATS(sock_net(sk), mib); return tcp_transmit_skb(sk, skb, 0, GFP_ATOMIC); } -- cgit v1.2.3 From 9e17f8a475fca81950fdddc08df428ed66cf441f Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sun, 1 Nov 2015 15:36:55 -0800 Subject: net: make skb_set_owner_w() more robust skb_set_owner_w() is called from various places that assume skb->sk always points to a full blown socket (as it changes sk->sk_wmem_alloc). We'd like to attach skbs to request sockets, and in the future to timewait sockets as well. For these kinds of pseudo sockets, we need to take a traditional refcount and use sock_edemux() as the destructor. It is now time to un-inline skb_set_owner_w(), as it has become too big. Fixes: ca6fb0651883 ("tcp: attach SYNACK messages to request sockets instead of listener") Signed-off-by: Eric Dumazet Bisected-by: Haiyang Zhang Signed-off-by: David S. Miller --- include/net/sock.h | 17 ++--------------- net/core/sock.c | 22 ++++++++++++++++++++++ net/ipv4/tcp_output.c | 4 +--- 3 files changed, 25 insertions(+), 18 deletions(-) (limited to 'net/ipv4/tcp_output.c') diff --git a/include/net/sock.h b/include/net/sock.h index aeed5c95f3ca..f570e75e3da9 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -1951,6 +1951,8 @@ static inline void skb_set_hash_from_sk(struct sk_buff *skb, struct sock *sk) } } +void skb_set_owner_w(struct sk_buff *skb, struct sock *sk); + /* * Queue a received datagram if it will fit. Stream and sequenced * protocols can't normally use this as they need to fit buffers in @@ -1959,21 +1961,6 @@ static inline void skb_set_hash_from_sk(struct sk_buff *skb, struct sock *sk) * Inlined as it's very short and called for pretty much every * packet ever received. 
*/ - -static inline void skb_set_owner_w(struct sk_buff *skb, struct sock *sk) -{ - skb_orphan(skb); - skb->sk = sk; - skb->destructor = sock_wfree; - skb_set_hash_from_sk(skb, sk); - /* - * We used to take a refcount on sk, but following operation - * is enough to guarantee sk_free() wont free this sock until - * all in-flight packets are completed - */ - atomic_add(skb->truesize, &sk->sk_wmem_alloc); -} - static inline void skb_set_owner_r(struct sk_buff *skb, struct sock *sk) { skb_orphan(skb); diff --git a/net/core/sock.c b/net/core/sock.c index 0ef30aa90132..7529eb9463be 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -1656,6 +1656,28 @@ void sock_wfree(struct sk_buff *skb) } EXPORT_SYMBOL(sock_wfree); +void skb_set_owner_w(struct sk_buff *skb, struct sock *sk) +{ + skb_orphan(skb); + skb->sk = sk; +#ifdef CONFIG_INET + if (unlikely(!sk_fullsock(sk))) { + skb->destructor = sock_edemux; + sock_hold(sk); + return; + } +#endif + skb->destructor = sock_wfree; + skb_set_hash_from_sk(skb, sk); + /* + * We used to take a refcount on sk, but following operation + * is enough to guarantee sk_free() wont free this sock until + * all in-flight packets are completed + */ + atomic_add(skb->truesize, &sk->sk_wmem_alloc); +} +EXPORT_SYMBOL(skb_set_owner_w); + void skb_orphan_partial(struct sk_buff *skb) { /* TCP stack sets skb->ooo_okay based on sk_wmem_alloc, diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index f4f9793eb025..cb7ca569052c 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -2963,9 +2963,7 @@ struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst, skb_reserve(skb, MAX_TCP_HEADER); if (attach_req) { - skb->destructor = sock_edemux; - sock_hold(req_to_sk(req)); - skb->sk = req_to_sk(req); + skb_set_owner_w(skb, req_to_sk(req)); } else { /* sk is a const pointer, because we want to express multiple * cpu might call us concurrently. -- cgit v1.2.3
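A side note on the loss-probe timeout patch earlier in this series: the line "timeout = rtt << 1 ? : TCP_TIMEOUT_INIT;" uses GCC's binary conditional extension, so the doubled RTT is used whenever it is non-zero, and the 1-second TCP_TIMEOUT_INIT is used only when no RTT sample exists. The small user-space sketch below only illustrates that selection; the names PROBE_TIMEOUT_FALLBACK and probe_timeout are stand-ins for illustration, not kernel definitions.

#include <stdio.h>

/* Stand-in for the kernel's TCP_TIMEOUT_INIT (1 second), expressed here in
 * milliseconds rather than jiffies. */
#define PROBE_TIMEOUT_FALLBACK 1000u

/* Mirrors the selection in tcp_schedule_loss_probe(): use 2*RTT when an RTT
 * sample exists, otherwise fall back to the initial timeout.  "x ? : y" is
 * the GCC/Clang extension for "x ? x : y" with x evaluated once. */
static unsigned int probe_timeout(unsigned int rtt_ms)
{
	return (rtt_ms << 1) ? : PROBE_TIMEOUT_FALLBACK;
}

int main(void)
{
	printf("rtt=50ms -> timeout=%ums\n", probe_timeout(50)); /* 100: doubled RTT */
	printf("rtt=0    -> timeout=%ums\n", probe_timeout(0));  /* 1000: fallback */
	return 0;
}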