From ee9952831cfd0bbe834f4a26489d7dce74582e37 Mon Sep 17 00:00:00 2001 From: Pavel Emelyanov Date: Thu, 19 Apr 2012 03:40:39 +0000 Subject: tcp: Initial repair mode This includes (according the the previous description): * TCP_REPAIR sockoption This one just puts the socket in/out of the repair mode. Allowed for CAP_NET_ADMIN and for closed/establised sockets only. When repair mode is turned off and the socket happens to be in the established state the window probe is sent to the peer to 'unlock' the connection. * TCP_REPAIR_QUEUE sockoption This one sets the queue which we're about to repair. The 'no-queue' is set by default. * TCP_QUEUE_SEQ socoption Sets the write_seq/rcv_nxt of a selected repaired queue. Allowed for TCP_CLOSE-d sockets only. When the socket changes its state the other seq-s are changed by the kernel according to the protocol rules (most of the existing code is actually reused). * Ability to forcibly bind a socket to a port The sk->sk_reuse is set to SK_FORCE_REUSE. * Immediate connect modification The connect syscall initializes the connection, then directly jumps to the code which finalizes it. * Silent close modification The close just aborts the connection (similar to SO_LINGER with 0 time) but without sending any FIN/RST-s to peer. Signed-off-by: Pavel Emelyanov Signed-off-by: David S. Miller --- include/linux/tcp.h | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) (limited to 'include/linux/tcp.h') diff --git a/include/linux/tcp.h b/include/linux/tcp.h index b6c62d294380..4e90e6ae79df 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -106,6 +106,16 @@ enum { #define TCP_THIN_LINEAR_TIMEOUTS 16 /* Use linear timeouts for thin streams*/ #define TCP_THIN_DUPACK 17 /* Fast retrans. after 1 dupack */ #define TCP_USER_TIMEOUT 18 /* How long for loss retry before timeout */ +#define TCP_REPAIR 19 /* TCP sock is under repair right now */ +#define TCP_REPAIR_QUEUE 20 +#define TCP_QUEUE_SEQ 21 + +enum { + TCP_NO_QUEUE, + TCP_RECV_QUEUE, + TCP_SEND_QUEUE, + TCP_QUEUES_NR, +}; /* for TCP_INFO socket option */ #define TCPI_OPT_TIMESTAMPS 1 @@ -353,7 +363,9 @@ struct tcp_sock { u8 nonagle : 4,/* Disable Nagle algorithm? */ thin_lto : 1,/* Use linear timeouts for thin streams */ thin_dupack : 1,/* Fast retransmit on first dupack */ - unused : 2; + repair : 1, + unused : 1; + u8 repair_queue; /* RTT measurement */ u32 srtt; /* smoothed round trip time << 3 */ -- cgit v1.2.3 From b139ba4e90dccbf4cd4efb112af96a5c9e0b098c Mon Sep 17 00:00:00 2001 From: Pavel Emelyanov Date: Thu, 19 Apr 2012 03:41:57 +0000 Subject: tcp: Repair connection-time negotiated parameters There are options, which are set up on a socket while performing TCP handshake. Need to resurrect them on a socket while repairing. A new sockoption accepts a buffer and parses it. The buffer should be CODE:VALUE sequence of bytes, where CODE is standard option code and VALUE is the respective value. Only 4 options should be handled on repaired socket. To read 3 out of 4 of these options the TCP_INFO sockoption can be used. An ability to get the last one (the mss_clamp) was added by the previous patch. Now the restore. Three of these options -- timestamp_ok, mss_clamp and snd_wscale -- are just restored on a coket. The sack_ok flags has 2 issues. First, whether or not to do sacks at all. This flag is just read and set back. No other sack info is saved or restored, since according to the standart and the code dropping all sack-ed segments is OK, the sender will resubmit them again, so after the repair we will probably experience a pause in connection. Next, the fack bit. It's just set back on a socket if the respective sysctl is set. No collected stats about packets flow is preserved. As far as I see (plz, correct me if I'm wrong) the fack-based congestion algorithm survives dropping all of the stats and repairs itself eventually, probably losing the performance for that period. Signed-off-by: Pavel Emelyanov Signed-off-by: David S. Miller --- include/linux/tcp.h | 1 + net/ipv4/tcp.c | 71 +++++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 72 insertions(+) (limited to 'include/linux/tcp.h') diff --git a/include/linux/tcp.h b/include/linux/tcp.h index 4e90e6ae79df..986593685566 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -109,6 +109,7 @@ enum { #define TCP_REPAIR 19 /* TCP sock is under repair right now */ #define TCP_REPAIR_QUEUE 20 #define TCP_QUEUE_SEQ 21 +#define TCP_REPAIR_OPTIONS 22 enum { TCP_NO_QUEUE, diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index b4e690ddb08c..3ce3bd031f33 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -2218,6 +2218,68 @@ static inline int tcp_can_repair_sock(struct sock *sk) ((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_ESTABLISHED)); } +static int tcp_repair_options_est(struct tcp_sock *tp, char __user *optbuf, unsigned int len) +{ + /* + * Options are stored in CODE:VALUE form where CODE is 8bit and VALUE + * fits the respective TCPOLEN_ size + */ + + while (len > 0) { + u8 opcode; + + if (get_user(opcode, optbuf)) + return -EFAULT; + + optbuf++; + len--; + + switch (opcode) { + case TCPOPT_MSS: { + u16 in_mss; + + if (len < sizeof(in_mss)) + return -ENODATA; + if (get_user(in_mss, optbuf)) + return -EFAULT; + + tp->rx_opt.mss_clamp = in_mss; + + optbuf += sizeof(in_mss); + len -= sizeof(in_mss); + break; + } + case TCPOPT_WINDOW: { + u8 wscale; + + if (len < sizeof(wscale)) + return -ENODATA; + if (get_user(wscale, optbuf)) + return -EFAULT; + + if (wscale > 14) + return -EFBIG; + + tp->rx_opt.snd_wscale = wscale; + + optbuf += sizeof(wscale); + len -= sizeof(wscale); + break; + } + case TCPOPT_SACK_PERM: + tp->rx_opt.sack_ok |= TCP_SACK_SEEN; + if (sysctl_tcp_fack) + tcp_enable_fack(tp); + break; + case TCPOPT_TIMESTAMP: + tp->rx_opt.tstamp_ok = 1; + break; + } + } + + return 0; +} + /* * Socket option code for TCP. */ @@ -2426,6 +2488,15 @@ static int do_tcp_setsockopt(struct sock *sk, int level, err = -EINVAL; break; + case TCP_REPAIR_OPTIONS: + if (!tp->repair) + err = -EINVAL; + else if (sk->sk_state == TCP_ESTABLISHED) + err = tcp_repair_options_est(tp, optval, optlen); + else + err = -EPERM; + break; + case TCP_CORK: /* When set indicates to always queue non-full frames. * Later the user clears this option and we transmit -- cgit v1.2.3 From de248a75c35e0208294cf304b112916254b69184 Mon Sep 17 00:00:00 2001 From: Pavel Emelyanov Date: Wed, 25 Apr 2012 23:43:04 +0000 Subject: tcp repair: Fix unaligned access when repairing options (v2) Don't pick __u8/__u16 values directly from raw pointers, but instead use an array of structures of code:value pairs. This is OK, since the buffer we take options from is not an skb memory, but a user-to-kernel one. For those options which don't require any value now, require this to be zero (for potential future extension of this API). v2: Changed tcp_repair_opt to use two __u32-s as spotted by David Laight. Signed-off-by: Pavel Emelyanov Signed-off-by: David S. Miller --- include/linux/tcp.h | 5 +++++ net/ipv4/tcp.c | 60 +++++++++++++++++++---------------------------------- 2 files changed, 26 insertions(+), 39 deletions(-) (limited to 'include/linux/tcp.h') diff --git a/include/linux/tcp.h b/include/linux/tcp.h index 986593685566..278af9ea42d4 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -111,6 +111,11 @@ enum { #define TCP_QUEUE_SEQ 21 #define TCP_REPAIR_OPTIONS 22 +struct tcp_repair_opt { + __u32 opt_code; + __u32 opt_val; +}; + enum { TCP_NO_QUEUE, TCP_RECV_QUEUE, diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index de6a238f0e1d..9670af341931 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -2283,60 +2283,40 @@ static inline int tcp_can_repair_sock(struct sock *sk) ((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_ESTABLISHED)); } -static int tcp_repair_options_est(struct tcp_sock *tp, char __user *optbuf, unsigned int len) +static int tcp_repair_options_est(struct tcp_sock *tp, + struct tcp_repair_opt __user *optbuf, unsigned int len) { - /* - * Options are stored in CODE:VALUE form where CODE is 8bit and VALUE - * fits the respective TCPOLEN_ size - */ + struct tcp_repair_opt opt; - while (len > 0) { - u8 opcode; - - if (get_user(opcode, optbuf)) + while (len >= sizeof(opt)) { + if (copy_from_user(&opt, optbuf, sizeof(opt))) return -EFAULT; optbuf++; - len--; - - switch (opcode) { - case TCPOPT_MSS: { - u16 in_mss; + len -= sizeof(opt); - if (len < sizeof(in_mss)) - return -ENODATA; - if (get_user(in_mss, optbuf)) - return -EFAULT; - - tp->rx_opt.mss_clamp = in_mss; - - optbuf += sizeof(in_mss); - len -= sizeof(in_mss); + switch (opt.opt_code) { + case TCPOPT_MSS: + tp->rx_opt.mss_clamp = opt.opt_val; break; - } - case TCPOPT_WINDOW: { - u8 wscale; - - if (len < sizeof(wscale)) - return -ENODATA; - if (get_user(wscale, optbuf)) - return -EFAULT; - - if (wscale > 14) + case TCPOPT_WINDOW: + if (opt.opt_val > 14) return -EFBIG; - tp->rx_opt.snd_wscale = wscale; - - optbuf += sizeof(wscale); - len -= sizeof(wscale); + tp->rx_opt.snd_wscale = opt.opt_val; break; - } case TCPOPT_SACK_PERM: + if (opt.opt_val != 0) + return -EINVAL; + tp->rx_opt.sack_ok |= TCP_SACK_SEEN; if (sysctl_tcp_fack) tcp_enable_fack(tp); break; case TCPOPT_TIMESTAMP: + if (opt.opt_val != 0) + return -EINVAL; + tp->rx_opt.tstamp_ok = 1; break; } @@ -2557,7 +2537,9 @@ static int do_tcp_setsockopt(struct sock *sk, int level, if (!tp->repair) err = -EINVAL; else if (sk->sk_state == TCP_ESTABLISHED) - err = tcp_repair_options_est(tp, optval, optlen); + err = tcp_repair_options_est(tp, + (struct tcp_repair_opt __user *)optval, + optlen); else err = -EPERM; break; -- cgit v1.2.3 From eed530b6c67624db3f2cf477bac7c4d005d8f7ba Mon Sep 17 00:00:00 2001 From: Yuchung Cheng Date: Wed, 2 May 2012 13:30:03 +0000 Subject: tcp: early retransmit MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This patch implements RFC 5827 early retransmit (ER) for TCP. It reduces DUPACK threshold (dupthresh) if outstanding packets are less than 4 to recover losses by fast recovery instead of timeout. While the algorithm is simple, small but frequent network reordering makes this feature dangerous: the connection repeatedly enter false recovery and degrade performance. Therefore we implement a mitigation suggested in the appendix of the RFC that delays entering fast recovery by a small interval, i.e., RTT/4. Currently ER is conservative and is disabled for the rest of the connection after the first reordering event. A large scale web server experiment on the performance impact of ER is summarized in section 6 of the paper "Proportional Rate Reduction for TCP”, IMC 2011. http://conferences.sigcomm.org/imc/2011/docs/p155.pdf Note that Linux has a similar feature called THIN_DUPACK. The differences are THIN_DUPACK do not mitigate reorderings and is only used after slow start. Currently ER is disabled if THIN_DUPACK is enabled. I would be happy to merge THIN_DUPACK feature with ER if people think it's a good idea. ER is enabled by sysctl_tcp_early_retrans: 0: Disables ER 1: Reduce dupthresh to packets_out - 1 when outstanding packets < 4. 2: (Default) reduce dupthresh like mode 1. In addition, delay entering fast recovery by RTT/4. Note: mode 2 is implemented in the third part of this patch series. Signed-off-by: Yuchung Cheng Acked-by: Neal Cardwell Signed-off-by: David S. Miller --- Documentation/networking/ip-sysctl.txt | 14 ++++++++++++++ include/linux/tcp.h | 1 + include/net/tcp.h | 15 +++++++++++++++ net/ipv4/sysctl_net_ipv4.c | 10 ++++++++++ net/ipv4/tcp.c | 3 +++ net/ipv4/tcp_input.c | 15 +++++++++++++++ net/ipv4/tcp_minisocks.c | 1 + 7 files changed, 59 insertions(+) (limited to 'include/linux/tcp.h') diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt index 9b569a2d9c60..34916e792d9d 100644 --- a/Documentation/networking/ip-sysctl.txt +++ b/Documentation/networking/ip-sysctl.txt @@ -190,6 +190,20 @@ tcp_cookie_size - INTEGER tcp_dsack - BOOLEAN Allows TCP to send "duplicate" SACKs. +tcp_early_retrans - INTEGER + Enable Early Retransmit (ER), per RFC 5827. ER lowers the threshold + for triggering fast retransmit when the amount of outstanding data is + small and when no previously unsent data can be transmitted (such + that limited transmit could be used). + Possible values: + 0 disables ER + 1 enables ER + 2 enables ER but delays fast recovery and fast retransmit + by a fourth of RTT. This mitigates connection falsely + recovers when network has a small degree of reordering + (less than 3 packets). + Default: 2 + tcp_ecn - INTEGER Enable Explicit Congestion Notification (ECN) in TCP. ECN is only used when both ends of the TCP flow support it. It is useful to diff --git a/include/linux/tcp.h b/include/linux/tcp.h index 278af9ea42d4..7859b416d46e 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -372,6 +372,7 @@ struct tcp_sock { repair : 1, unused : 1; u8 repair_queue; + u8 do_early_retrans:1;/* Enable RFC5827 early-retransmit */ /* RTT measurement */ u32 srtt; /* smoothed round trip time << 3 */ diff --git a/include/net/tcp.h b/include/net/tcp.h index 0fb84de6da36..685437a16c97 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -252,6 +252,7 @@ extern int sysctl_tcp_max_ssthresh; extern int sysctl_tcp_cookie_size; extern int sysctl_tcp_thin_linear_timeouts; extern int sysctl_tcp_thin_dupack; +extern int sysctl_tcp_early_retrans; extern atomic_long_t tcp_memory_allocated; extern struct percpu_counter tcp_sockets_allocated; @@ -797,6 +798,20 @@ static inline void tcp_enable_fack(struct tcp_sock *tp) tp->rx_opt.sack_ok |= TCP_FACK_ENABLED; } +/* TCP early-retransmit (ER) is similar to but more conservative than + * the thin-dupack feature. Enable ER only if thin-dupack is disabled. + */ +static inline void tcp_enable_early_retrans(struct tcp_sock *tp) +{ + tp->do_early_retrans = sysctl_tcp_early_retrans && + !sysctl_tcp_thin_dupack && sysctl_tcp_reordering == 3; +} + +static inline void tcp_disable_early_retrans(struct tcp_sock *tp) +{ + tp->do_early_retrans = 0; +} + static inline unsigned int tcp_left_out(const struct tcp_sock *tp) { return tp->sacked_out + tp->lost_out; diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index 33417f84e07f..ef32956ed655 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -27,6 +27,7 @@ #include static int zero; +static int two = 2; static int tcp_retr1_max = 255; static int ip_local_port_range_min[] = { 1, 1 }; static int ip_local_port_range_max[] = { 65535, 65535 }; @@ -676,6 +677,15 @@ static struct ctl_table ipv4_table[] = { .mode = 0644, .proc_handler = proc_dointvec }, + { + .procname = "tcp_early_retrans", + .data = &sysctl_tcp_early_retrans, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &zero, + .extra2 = &two, + }, { .procname = "udp_mem", .data = &sysctl_udp_mem, diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 9670af341931..6802c89bc44d 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -395,6 +395,7 @@ void tcp_init_sock(struct sock *sk) tp->mss_cache = TCP_MSS_DEFAULT; tp->reordering = sysctl_tcp_reordering; + tcp_enable_early_retrans(tp); icsk->icsk_ca_ops = &tcp_init_congestion_ops; sk->sk_state = TCP_CLOSE; @@ -2495,6 +2496,8 @@ static int do_tcp_setsockopt(struct sock *sk, int level, err = -EINVAL; else tp->thin_dupack = val; + if (tp->thin_dupack) + tcp_disable_early_retrans(tp); break; case TCP_REPAIR: diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index be8e09d2c6b1..e042cabb695e 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -99,6 +99,7 @@ int sysctl_tcp_thin_dupack __read_mostly; int sysctl_tcp_moderate_rcvbuf __read_mostly = 1; int sysctl_tcp_abc __read_mostly; +int sysctl_tcp_early_retrans __read_mostly = 2; #define FLAG_DATA 0x01 /* Incoming frame contained data. */ #define FLAG_WIN_UPDATE 0x02 /* Incoming ACK was a window update. */ @@ -906,6 +907,7 @@ static void tcp_init_metrics(struct sock *sk) if (dst_metric(dst, RTAX_REORDERING) && tp->reordering != dst_metric(dst, RTAX_REORDERING)) { tcp_disable_fack(tp); + tcp_disable_early_retrans(tp); tp->reordering = dst_metric(dst, RTAX_REORDERING); } @@ -988,6 +990,9 @@ static void tcp_update_reordering(struct sock *sk, const int metric, #endif tcp_disable_fack(tp); } + + if (metric > 0) + tcp_disable_early_retrans(tp); } /* This must be called before lost_out is incremented */ @@ -2492,6 +2497,16 @@ static int tcp_time_to_recover(struct sock *sk) tcp_is_sack(tp) && !tcp_send_head(sk)) return 1; + /* Trick#6: TCP early retransmit, per RFC5827. To avoid spurious + * retransmissions due to small network reorderings, we implement + * Mitigation A.3 in the RFC and delay the retransmission for a short + * interval if appropriate. + */ + if (tp->do_early_retrans && !tp->retrans_out && tp->sacked_out && + (tp->packets_out == (tp->sacked_out + 1) && tp->packets_out < 4) && + !tcp_may_send_now(sk)) + return 1; + return 0; } diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index 3cabafb5cdd1..6f6a91832826 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -482,6 +482,7 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req, newtp->sacked_out = 0; newtp->fackets_out = 0; newtp->snd_ssthresh = TCP_INFINITE_SSTHRESH; + tcp_enable_early_retrans(newtp); /* So many TCP implementations out there (incorrectly) count the * initial SYN frame in their delayed-ACK and congestion control -- cgit v1.2.3 From 750ea2bafa55aaed208b2583470ecd7122225634 Mon Sep 17 00:00:00 2001 From: Yuchung Cheng Date: Wed, 2 May 2012 13:30:04 +0000 Subject: tcp: early retransmit: delayed fast retransmit Implementing the advanced early retransmit (sysctl_tcp_early_retrans==2). Delays the fast retransmit by an interval of RTT/4. We borrow the RTO timer to implement the delay. If we receive another ACK or send a new packet, the timer is cancelled and restored to original RTO value offset by time elapsed. When the delayed-ER timer fires, we enter fast recovery and perform fast retransmit. Signed-off-by: Yuchung Cheng Acked-by: Neal Cardwell Signed-off-by: David S. Miller --- include/linux/tcp.h | 3 ++- include/net/tcp.h | 3 +++ net/ipv4/tcp_input.c | 69 +++++++++++++++++++++++++++++++++++++++++++++------ net/ipv4/tcp_output.c | 5 ++-- net/ipv4/tcp_timer.c | 5 ++++ 5 files changed, 74 insertions(+), 11 deletions(-) (limited to 'include/linux/tcp.h') diff --git a/include/linux/tcp.h b/include/linux/tcp.h index 7859b416d46e..d9b42c5be088 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -372,7 +372,8 @@ struct tcp_sock { repair : 1, unused : 1; u8 repair_queue; - u8 do_early_retrans:1;/* Enable RFC5827 early-retransmit */ + u8 do_early_retrans:1,/* Enable RFC5827 early-retransmit */ + early_retrans_delayed:1; /* Delayed ER timer installed */ /* RTT measurement */ u32 srtt; /* smoothed round trip time << 3 */ diff --git a/include/net/tcp.h b/include/net/tcp.h index 685437a16c97..5283aa4bfa23 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -500,6 +500,8 @@ extern void tcp_send_delayed_ack(struct sock *sk); /* tcp_input.c */ extern void tcp_cwnd_application_limited(struct sock *sk); +extern void tcp_resume_early_retransmit(struct sock *sk); +extern void tcp_rearm_rto(struct sock *sk); /* tcp_timer.c */ extern void tcp_init_xmit_timers(struct sock *); @@ -805,6 +807,7 @@ static inline void tcp_enable_early_retrans(struct tcp_sock *tp) { tp->do_early_retrans = sysctl_tcp_early_retrans && !sysctl_tcp_thin_dupack && sysctl_tcp_reordering == 3; + tp->early_retrans_delayed = 0; } static inline void tcp_disable_early_retrans(struct tcp_sock *tp) diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index e042cabb695e..7096790e06bf 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -2344,6 +2344,27 @@ static inline int tcp_dupack_heuristics(const struct tcp_sock *tp) return tcp_is_fack(tp) ? tp->fackets_out : tp->sacked_out + 1; } +static bool tcp_pause_early_retransmit(struct sock *sk, int flag) +{ + struct tcp_sock *tp = tcp_sk(sk); + unsigned long delay; + + /* Delay early retransmit and entering fast recovery for + * max(RTT/4, 2msec) unless ack has ECE mark, no RTT samples + * available, or RTO is scheduled to fire first. + */ + if (sysctl_tcp_early_retrans < 2 || (flag & FLAG_ECE) || !tp->srtt) + return false; + + delay = max_t(unsigned long, (tp->srtt >> 5), msecs_to_jiffies(2)); + if (!time_after(inet_csk(sk)->icsk_timeout, (jiffies + delay))) + return false; + + inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, delay, TCP_RTO_MAX); + tp->early_retrans_delayed = 1; + return true; +} + static inline int tcp_skb_timedout(const struct sock *sk, const struct sk_buff *skb) { @@ -2451,7 +2472,7 @@ static inline int tcp_head_timedout(const struct sock *sk) * Main question: may we further continue forward transmission * with the same cwnd? */ -static int tcp_time_to_recover(struct sock *sk) +static int tcp_time_to_recover(struct sock *sk, int flag) { struct tcp_sock *tp = tcp_sk(sk); __u32 packets_out; @@ -2505,7 +2526,7 @@ static int tcp_time_to_recover(struct sock *sk) if (tp->do_early_retrans && !tp->retrans_out && tp->sacked_out && (tp->packets_out == (tp->sacked_out + 1) && tp->packets_out < 4) && !tcp_may_send_now(sk)) - return 1; + return !tcp_pause_early_retransmit(sk, flag); return 0; } @@ -3172,7 +3193,7 @@ static void tcp_fastretrans_alert(struct sock *sk, int pkts_acked, if (icsk->icsk_ca_state <= TCP_CA_Disorder) tcp_try_undo_dsack(sk); - if (!tcp_time_to_recover(sk)) { + if (!tcp_time_to_recover(sk, flag)) { tcp_try_to_open(sk, flag); return; } @@ -3271,16 +3292,47 @@ static void tcp_cong_avoid(struct sock *sk, u32 ack, u32 in_flight) /* Restart timer after forward progress on connection. * RFC2988 recommends to restart timer to now+rto. */ -static void tcp_rearm_rto(struct sock *sk) +void tcp_rearm_rto(struct sock *sk) { - const struct tcp_sock *tp = tcp_sk(sk); + struct tcp_sock *tp = tcp_sk(sk); if (!tp->packets_out) { inet_csk_clear_xmit_timer(sk, ICSK_TIME_RETRANS); } else { - inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, - inet_csk(sk)->icsk_rto, TCP_RTO_MAX); + u32 rto = inet_csk(sk)->icsk_rto; + /* Offset the time elapsed after installing regular RTO */ + if (tp->early_retrans_delayed) { + struct sk_buff *skb = tcp_write_queue_head(sk); + const u32 rto_time_stamp = TCP_SKB_CB(skb)->when + rto; + s32 delta = (s32)(rto_time_stamp - tcp_time_stamp); + /* delta may not be positive if the socket is locked + * when the delayed ER timer fires and is rescheduled. + */ + if (delta > 0) + rto = delta; + } + inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, rto, + TCP_RTO_MAX); } + tp->early_retrans_delayed = 0; +} + +/* This function is called when the delayed ER timer fires. TCP enters + * fast recovery and performs fast-retransmit. + */ +void tcp_resume_early_retransmit(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + + tcp_rearm_rto(sk); + + /* Stop if ER is disabled after the delayed ER timer is scheduled */ + if (!tp->do_early_retrans) + return; + + tcp_enter_recovery(sk, false); + tcp_update_scoreboard(sk, 1); + tcp_xmit_retransmit_queue(sk); } /* If we get here, the whole TSO packet has not been acked. */ @@ -3729,6 +3781,9 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) if (after(ack, tp->snd_nxt)) goto invalid_ack; + if (tp->early_retrans_delayed) + tcp_rearm_rto(sk); + if (after(ack, prior_snd_una)) flag |= FLAG_SND_UNA_ADVANCED; diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 834e89fc541b..d94733009923 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -78,9 +78,8 @@ static void tcp_event_new_data_sent(struct sock *sk, const struct sk_buff *skb) tp->frto_counter = 3; tp->packets_out += tcp_skb_pcount(skb); - if (!prior_packets) - inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, - inet_csk(sk)->icsk_rto, TCP_RTO_MAX); + if (!prior_packets || tp->early_retrans_delayed) + tcp_rearm_rto(sk); } /* SND.NXT, if window was not shrunk. diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index 34d4a02c2f16..e911e6c523ec 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c @@ -319,6 +319,11 @@ void tcp_retransmit_timer(struct sock *sk) struct tcp_sock *tp = tcp_sk(sk); struct inet_connection_sock *icsk = inet_csk(sk); + if (tp->early_retrans_delayed) { + tcp_resume_early_retransmit(sk); + return; + } + if (!tp->packets_out) goto out; -- cgit v1.2.3 From c0a788c451e1f88b3fb5a85113b87dbc98c7abe4 Mon Sep 17 00:00:00 2001 From: Kyle McMartin Date: Mon, 7 May 2012 15:33:04 -0400 Subject: net: Fix tcp_build_and_update_options comment in struct tcp_sock Noticed this comment didn't get updated when tcp_build_and_update_options was refactored. Signed-off-by: Kyle McMartin Signed-off-by: Jiri Kosina --- include/linux/tcp.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux/tcp.h') diff --git a/include/linux/tcp.h b/include/linux/tcp.h index b6c62d294380..288d201c3b59 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -406,7 +406,7 @@ struct tcp_sock { struct sk_buff_head out_of_order_queue; /* Out of order segments go here */ - /* SACKs data, these 2 need to be together (see tcp_build_and_update_options) */ + /* SACKs data, these 2 need to be together (see tcp_options_write) */ struct tcp_sack_block duplicate_sack[1]; /* D-SACK block */ struct tcp_sack_block selective_acks[4]; /* The SACKS themselves*/ -- cgit v1.2.3