From ee9952831cfd0bbe834f4a26489d7dce74582e37 Mon Sep 17 00:00:00 2001
From: Pavel Emelyanov <xemul@parallels.com>
Date: Thu, 19 Apr 2012 03:40:39 +0000
Subject: tcp: Initial repair mode

This includes (according the the previous description):

* TCP_REPAIR sockoption

This one just puts the socket in/out of the repair mode.
Allowed for CAP_NET_ADMIN and for closed/establised sockets only.
When repair mode is turned off and the socket happens to be in
the established state the window probe is sent to the peer to
'unlock' the connection.

* TCP_REPAIR_QUEUE sockoption

This one sets the queue which we're about to repair. The
'no-queue' is set by default.

* TCP_QUEUE_SEQ socoption

Sets the write_seq/rcv_nxt of a selected repaired queue.
Allowed for TCP_CLOSE-d sockets only. When the socket changes
its state the other seq-s are changed by the kernel according
to the protocol rules (most of the existing code is actually
reused).

* Ability to forcibly bind a socket to a port

The sk->sk_reuse is set to SK_FORCE_REUSE.

* Immediate connect modification

The connect syscall initializes the connection, then directly jumps
to the code which finalizes it.

* Silent close modification

The close just aborts the connection (similar to SO_LINGER with 0
time) but without sending any FIN/RST-s to peer.

Signed-off-by: Pavel Emelyanov <xemul@parallels.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/tcp.h | 14 +++++++++++++-
 1 file changed, 13 insertions(+), 1 deletion(-)

(limited to 'include/linux/tcp.h')

diff --git a/include/linux/tcp.h b/include/linux/tcp.h
index b6c62d294380..4e90e6ae79df 100644
--- a/include/linux/tcp.h
+++ b/include/linux/tcp.h
@@ -106,6 +106,16 @@ enum {
 #define TCP_THIN_LINEAR_TIMEOUTS 16      /* Use linear timeouts for thin streams*/
 #define TCP_THIN_DUPACK         17      /* Fast retrans. after 1 dupack */
 #define TCP_USER_TIMEOUT	18	/* How long for loss retry before timeout */
+#define TCP_REPAIR		19	/* TCP sock is under repair right now */
+#define TCP_REPAIR_QUEUE	20
+#define TCP_QUEUE_SEQ		21
+
+enum {
+	TCP_NO_QUEUE,
+	TCP_RECV_QUEUE,
+	TCP_SEND_QUEUE,
+	TCP_QUEUES_NR,
+};
 
 /* for TCP_INFO socket option */
 #define TCPI_OPT_TIMESTAMPS	1
@@ -353,7 +363,9 @@ struct tcp_sock {
 	u8	nonagle     : 4,/* Disable Nagle algorithm?             */
 		thin_lto    : 1,/* Use linear timeouts for thin streams */
 		thin_dupack : 1,/* Fast retransmit on first dupack      */
-		unused      : 2;
+		repair      : 1,
+		unused      : 1;
+	u8	repair_queue;
 
 /* RTT measurement */
 	u32	srtt;		/* smoothed round trip time << 3	*/
-- 
cgit v1.2.3


From b139ba4e90dccbf4cd4efb112af96a5c9e0b098c Mon Sep 17 00:00:00 2001
From: Pavel Emelyanov <xemul@parallels.com>
Date: Thu, 19 Apr 2012 03:41:57 +0000
Subject: tcp: Repair connection-time negotiated parameters

There are options, which are set up on a socket while performing
TCP handshake. Need to resurrect them on a socket while repairing.
A new sockoption accepts a buffer and parses it. The buffer should
be CODE:VALUE sequence of bytes, where CODE is standard option
code and VALUE is the respective value.

Only 4 options should be handled on repaired socket.

To read 3 out of 4 of these options the TCP_INFO sockoption can be
used. An ability to get the last one (the mss_clamp) was added by
the previous patch.

Now the restore. Three of these options -- timestamp_ok, mss_clamp
and snd_wscale -- are just restored on a coket.

The sack_ok flags has 2 issues. First, whether or not to do sacks
at all. This flag is just read and set back. No other sack  info is
saved or restored, since according to the standart and the code
dropping all sack-ed segments is OK, the sender will resubmit them
again, so after the repair we will probably experience a pause in
connection. Next, the fack bit. It's just set back on a socket if
the respective sysctl is set. No collected stats about packets flow
is preserved. As far as I see (plz, correct me if I'm wrong) the
fack-based congestion algorithm survives dropping all of the stats
and repairs itself eventually, probably losing the performance for
that period.

Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/tcp.h |  1 +
 net/ipv4/tcp.c      | 71 +++++++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 72 insertions(+)

(limited to 'include/linux/tcp.h')

diff --git a/include/linux/tcp.h b/include/linux/tcp.h
index 4e90e6ae79df..986593685566 100644
--- a/include/linux/tcp.h
+++ b/include/linux/tcp.h
@@ -109,6 +109,7 @@ enum {
 #define TCP_REPAIR		19	/* TCP sock is under repair right now */
 #define TCP_REPAIR_QUEUE	20
 #define TCP_QUEUE_SEQ		21
+#define TCP_REPAIR_OPTIONS	22
 
 enum {
 	TCP_NO_QUEUE,
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index b4e690ddb08c..3ce3bd031f33 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -2218,6 +2218,68 @@ static inline int tcp_can_repair_sock(struct sock *sk)
 		((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_ESTABLISHED));
 }
 
+static int tcp_repair_options_est(struct tcp_sock *tp, char __user *optbuf, unsigned int len)
+{
+	/*
+	 * Options are stored in CODE:VALUE form where CODE is 8bit and VALUE
+	 * fits the respective TCPOLEN_ size
+	 */
+
+	while (len > 0) {
+		u8 opcode;
+
+		if (get_user(opcode, optbuf))
+			return -EFAULT;
+
+		optbuf++;
+		len--;
+
+		switch (opcode) {
+		case TCPOPT_MSS: {
+			u16 in_mss;
+
+			if (len < sizeof(in_mss))
+				return -ENODATA;
+			if (get_user(in_mss, optbuf))
+				return -EFAULT;
+
+			tp->rx_opt.mss_clamp = in_mss;
+
+			optbuf += sizeof(in_mss);
+			len -= sizeof(in_mss);
+			break;
+		}
+		case TCPOPT_WINDOW: {
+			u8 wscale;
+
+			if (len < sizeof(wscale))
+				return -ENODATA;
+			if (get_user(wscale, optbuf))
+				return -EFAULT;
+
+			if (wscale > 14)
+				return -EFBIG;
+
+			tp->rx_opt.snd_wscale = wscale;
+
+			optbuf += sizeof(wscale);
+			len -= sizeof(wscale);
+			break;
+		}
+		case TCPOPT_SACK_PERM:
+			tp->rx_opt.sack_ok |= TCP_SACK_SEEN;
+			if (sysctl_tcp_fack)
+				tcp_enable_fack(tp);
+			break;
+		case TCPOPT_TIMESTAMP:
+			tp->rx_opt.tstamp_ok = 1;
+			break;
+		}
+	}
+
+	return 0;
+}
+
 /*
  *	Socket option code for TCP.
  */
@@ -2426,6 +2488,15 @@ static int do_tcp_setsockopt(struct sock *sk, int level,
 			err = -EINVAL;
 		break;
 
+	case TCP_REPAIR_OPTIONS:
+		if (!tp->repair)
+			err = -EINVAL;
+		else if (sk->sk_state == TCP_ESTABLISHED)
+			err = tcp_repair_options_est(tp, optval, optlen);
+		else
+			err = -EPERM;
+		break;
+
 	case TCP_CORK:
 		/* When set indicates to always queue non-full frames.
 		 * Later the user clears this option and we transmit
-- 
cgit v1.2.3


From de248a75c35e0208294cf304b112916254b69184 Mon Sep 17 00:00:00 2001
From: Pavel Emelyanov <xemul@parallels.com>
Date: Wed, 25 Apr 2012 23:43:04 +0000
Subject: tcp repair: Fix unaligned access when repairing options (v2)

Don't pick __u8/__u16 values directly from raw pointers, but instead use
an array of structures of code:value pairs. This is OK, since the buffer
we take options from is not an skb memory, but a user-to-kernel one.

For those options which don't require any value now, require this to be
zero (for potential future extension of this API).

v2: Changed tcp_repair_opt to use two __u32-s as spotted by David Laight.

Signed-off-by: Pavel Emelyanov <xemul@parallels.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/tcp.h |  5 +++++
 net/ipv4/tcp.c      | 60 +++++++++++++++++++----------------------------------
 2 files changed, 26 insertions(+), 39 deletions(-)

(limited to 'include/linux/tcp.h')

diff --git a/include/linux/tcp.h b/include/linux/tcp.h
index 986593685566..278af9ea42d4 100644
--- a/include/linux/tcp.h
+++ b/include/linux/tcp.h
@@ -111,6 +111,11 @@ enum {
 #define TCP_QUEUE_SEQ		21
 #define TCP_REPAIR_OPTIONS	22
 
+struct tcp_repair_opt {
+	__u32	opt_code;
+	__u32	opt_val;
+};
+
 enum {
 	TCP_NO_QUEUE,
 	TCP_RECV_QUEUE,
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index de6a238f0e1d..9670af341931 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -2283,60 +2283,40 @@ static inline int tcp_can_repair_sock(struct sock *sk)
 		((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_ESTABLISHED));
 }
 
-static int tcp_repair_options_est(struct tcp_sock *tp, char __user *optbuf, unsigned int len)
+static int tcp_repair_options_est(struct tcp_sock *tp,
+		struct tcp_repair_opt __user *optbuf, unsigned int len)
 {
-	/*
-	 * Options are stored in CODE:VALUE form where CODE is 8bit and VALUE
-	 * fits the respective TCPOLEN_ size
-	 */
+	struct tcp_repair_opt opt;
 
-	while (len > 0) {
-		u8 opcode;
-
-		if (get_user(opcode, optbuf))
+	while (len >= sizeof(opt)) {
+		if (copy_from_user(&opt, optbuf, sizeof(opt)))
 			return -EFAULT;
 
 		optbuf++;
-		len--;
-
-		switch (opcode) {
-		case TCPOPT_MSS: {
-			u16 in_mss;
+		len -= sizeof(opt);
 
-			if (len < sizeof(in_mss))
-				return -ENODATA;
-			if (get_user(in_mss, optbuf))
-				return -EFAULT;
-
-			tp->rx_opt.mss_clamp = in_mss;
-
-			optbuf += sizeof(in_mss);
-			len -= sizeof(in_mss);
+		switch (opt.opt_code) {
+		case TCPOPT_MSS:
+			tp->rx_opt.mss_clamp = opt.opt_val;
 			break;
-		}
-		case TCPOPT_WINDOW: {
-			u8 wscale;
-
-			if (len < sizeof(wscale))
-				return -ENODATA;
-			if (get_user(wscale, optbuf))
-				return -EFAULT;
-
-			if (wscale > 14)
+		case TCPOPT_WINDOW:
+			if (opt.opt_val > 14)
 				return -EFBIG;
 
-			tp->rx_opt.snd_wscale = wscale;
-
-			optbuf += sizeof(wscale);
-			len -= sizeof(wscale);
+			tp->rx_opt.snd_wscale = opt.opt_val;
 			break;
-		}
 		case TCPOPT_SACK_PERM:
+			if (opt.opt_val != 0)
+				return -EINVAL;
+
 			tp->rx_opt.sack_ok |= TCP_SACK_SEEN;
 			if (sysctl_tcp_fack)
 				tcp_enable_fack(tp);
 			break;
 		case TCPOPT_TIMESTAMP:
+			if (opt.opt_val != 0)
+				return -EINVAL;
+
 			tp->rx_opt.tstamp_ok = 1;
 			break;
 		}
@@ -2557,7 +2537,9 @@ static int do_tcp_setsockopt(struct sock *sk, int level,
 		if (!tp->repair)
 			err = -EINVAL;
 		else if (sk->sk_state == TCP_ESTABLISHED)
-			err = tcp_repair_options_est(tp, optval, optlen);
+			err = tcp_repair_options_est(tp,
+					(struct tcp_repair_opt __user *)optval,
+					optlen);
 		else
 			err = -EPERM;
 		break;
-- 
cgit v1.2.3


From eed530b6c67624db3f2cf477bac7c4d005d8f7ba Mon Sep 17 00:00:00 2001
From: Yuchung Cheng <ycheng@google.com>
Date: Wed, 2 May 2012 13:30:03 +0000
Subject: tcp: early retransmit
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This patch implements RFC 5827 early retransmit (ER) for TCP.
It reduces DUPACK threshold (dupthresh) if outstanding packets are
less than 4 to recover losses by fast recovery instead of timeout.

While the algorithm is simple, small but frequent network reordering
makes this feature dangerous: the connection repeatedly enter
false recovery and degrade performance. Therefore we implement
a mitigation suggested in the appendix of the RFC that delays
entering fast recovery by a small interval, i.e., RTT/4. Currently
ER is conservative and is disabled for the rest of the connection
after the first reordering event. A large scale web server
experiment on the performance impact of ER is summarized in
section 6 of the paper "Proportional Rate Reduction for TCP”,
IMC 2011. http://conferences.sigcomm.org/imc/2011/docs/p155.pdf

Note that Linux has a similar feature called THIN_DUPACK. The
differences are THIN_DUPACK do not mitigate reorderings and is only
used after slow start. Currently ER is disabled if THIN_DUPACK is
enabled. I would be happy to merge THIN_DUPACK feature with ER if
people think it's a good idea.

ER is enabled by sysctl_tcp_early_retrans:
  0: Disables ER

  1: Reduce dupthresh to packets_out - 1 when outstanding packets < 4.

  2: (Default) reduce dupthresh like mode 1. In addition, delay
     entering fast recovery by RTT/4.

Note: mode 2 is implemented in the third part of this patch series.

Signed-off-by: Yuchung Cheng <ycheng@google.com>
Acked-by: Neal Cardwell <ncardwell@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 Documentation/networking/ip-sysctl.txt | 14 ++++++++++++++
 include/linux/tcp.h                    |  1 +
 include/net/tcp.h                      | 15 +++++++++++++++
 net/ipv4/sysctl_net_ipv4.c             | 10 ++++++++++
 net/ipv4/tcp.c                         |  3 +++
 net/ipv4/tcp_input.c                   | 15 +++++++++++++++
 net/ipv4/tcp_minisocks.c               |  1 +
 7 files changed, 59 insertions(+)

(limited to 'include/linux/tcp.h')

diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt
index 9b569a2d9c60..34916e792d9d 100644
--- a/Documentation/networking/ip-sysctl.txt
+++ b/Documentation/networking/ip-sysctl.txt
@@ -190,6 +190,20 @@ tcp_cookie_size - INTEGER
 tcp_dsack - BOOLEAN
 	Allows TCP to send "duplicate" SACKs.
 
+tcp_early_retrans - INTEGER
+	Enable Early Retransmit (ER), per RFC 5827. ER lowers the threshold
+	for triggering fast retransmit when the amount of outstanding data is
+	small and when no previously unsent data can be transmitted (such
+	that limited transmit could be used).
+	Possible values:
+		0 disables ER
+		1 enables ER
+		2 enables ER but delays fast recovery and fast retransmit
+		  by a fourth of RTT. This mitigates connection falsely
+		  recovers when network has a small degree of reordering
+		  (less than 3 packets).
+	Default: 2
+
 tcp_ecn - INTEGER
 	Enable Explicit Congestion Notification (ECN) in TCP. ECN is only
 	used when both ends of the TCP flow support it. It is useful to
diff --git a/include/linux/tcp.h b/include/linux/tcp.h
index 278af9ea42d4..7859b416d46e 100644
--- a/include/linux/tcp.h
+++ b/include/linux/tcp.h
@@ -372,6 +372,7 @@ struct tcp_sock {
 		repair      : 1,
 		unused      : 1;
 	u8	repair_queue;
+	u8	do_early_retrans:1;/* Enable RFC5827 early-retransmit  */
 
 /* RTT measurement */
 	u32	srtt;		/* smoothed round trip time << 3	*/
diff --git a/include/net/tcp.h b/include/net/tcp.h
index 0fb84de6da36..685437a16c97 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -252,6 +252,7 @@ extern int sysctl_tcp_max_ssthresh;
 extern int sysctl_tcp_cookie_size;
 extern int sysctl_tcp_thin_linear_timeouts;
 extern int sysctl_tcp_thin_dupack;
+extern int sysctl_tcp_early_retrans;
 
 extern atomic_long_t tcp_memory_allocated;
 extern struct percpu_counter tcp_sockets_allocated;
@@ -797,6 +798,20 @@ static inline void tcp_enable_fack(struct tcp_sock *tp)
 	tp->rx_opt.sack_ok |= TCP_FACK_ENABLED;
 }
 
+/* TCP early-retransmit (ER) is similar to but more conservative than
+ * the thin-dupack feature.  Enable ER only if thin-dupack is disabled.
+ */
+static inline void tcp_enable_early_retrans(struct tcp_sock *tp)
+{
+	tp->do_early_retrans = sysctl_tcp_early_retrans &&
+		!sysctl_tcp_thin_dupack && sysctl_tcp_reordering == 3;
+}
+
+static inline void tcp_disable_early_retrans(struct tcp_sock *tp)
+{
+	tp->do_early_retrans = 0;
+}
+
 static inline unsigned int tcp_left_out(const struct tcp_sock *tp)
 {
 	return tp->sacked_out + tp->lost_out;
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index 33417f84e07f..ef32956ed655 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -27,6 +27,7 @@
 #include <net/tcp_memcontrol.h>
 
 static int zero;
+static int two = 2;
 static int tcp_retr1_max = 255;
 static int ip_local_port_range_min[] = { 1, 1 };
 static int ip_local_port_range_max[] = { 65535, 65535 };
@@ -676,6 +677,15 @@ static struct ctl_table ipv4_table[] = {
 		.mode           = 0644,
 		.proc_handler   = proc_dointvec
 	},
+	{
+		.procname	= "tcp_early_retrans",
+		.data		= &sysctl_tcp_early_retrans,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_minmax,
+		.extra1		= &zero,
+		.extra2		= &two,
+	},
 	{
 		.procname	= "udp_mem",
 		.data		= &sysctl_udp_mem,
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 9670af341931..6802c89bc44d 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -395,6 +395,7 @@ void tcp_init_sock(struct sock *sk)
 	tp->mss_cache = TCP_MSS_DEFAULT;
 
 	tp->reordering = sysctl_tcp_reordering;
+	tcp_enable_early_retrans(tp);
 	icsk->icsk_ca_ops = &tcp_init_congestion_ops;
 
 	sk->sk_state = TCP_CLOSE;
@@ -2495,6 +2496,8 @@ static int do_tcp_setsockopt(struct sock *sk, int level,
 			err = -EINVAL;
 		else
 			tp->thin_dupack = val;
+			if (tp->thin_dupack)
+				tcp_disable_early_retrans(tp);
 		break;
 
 	case TCP_REPAIR:
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index be8e09d2c6b1..e042cabb695e 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -99,6 +99,7 @@ int sysctl_tcp_thin_dupack __read_mostly;
 
 int sysctl_tcp_moderate_rcvbuf __read_mostly = 1;
 int sysctl_tcp_abc __read_mostly;
+int sysctl_tcp_early_retrans __read_mostly = 2;
 
 #define FLAG_DATA		0x01 /* Incoming frame contained data.		*/
 #define FLAG_WIN_UPDATE		0x02 /* Incoming ACK was a window update.	*/
@@ -906,6 +907,7 @@ static void tcp_init_metrics(struct sock *sk)
 	if (dst_metric(dst, RTAX_REORDERING) &&
 	    tp->reordering != dst_metric(dst, RTAX_REORDERING)) {
 		tcp_disable_fack(tp);
+		tcp_disable_early_retrans(tp);
 		tp->reordering = dst_metric(dst, RTAX_REORDERING);
 	}
 
@@ -988,6 +990,9 @@ static void tcp_update_reordering(struct sock *sk, const int metric,
 #endif
 		tcp_disable_fack(tp);
 	}
+
+	if (metric > 0)
+		tcp_disable_early_retrans(tp);
 }
 
 /* This must be called before lost_out is incremented */
@@ -2492,6 +2497,16 @@ static int tcp_time_to_recover(struct sock *sk)
 	    tcp_is_sack(tp) && !tcp_send_head(sk))
 		return 1;
 
+	/* Trick#6: TCP early retransmit, per RFC5827.  To avoid spurious
+	 * retransmissions due to small network reorderings, we implement
+	 * Mitigation A.3 in the RFC and delay the retransmission for a short
+	 * interval if appropriate.
+	 */
+	if (tp->do_early_retrans && !tp->retrans_out && tp->sacked_out &&
+	    (tp->packets_out == (tp->sacked_out + 1) && tp->packets_out < 4) &&
+	    !tcp_may_send_now(sk))
+		return 1;
+
 	return 0;
 }
 
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index 3cabafb5cdd1..6f6a91832826 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -482,6 +482,7 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req,
 		newtp->sacked_out = 0;
 		newtp->fackets_out = 0;
 		newtp->snd_ssthresh = TCP_INFINITE_SSTHRESH;
+		tcp_enable_early_retrans(newtp);
 
 		/* So many TCP implementations out there (incorrectly) count the
 		 * initial SYN frame in their delayed-ACK and congestion control
-- 
cgit v1.2.3


From 750ea2bafa55aaed208b2583470ecd7122225634 Mon Sep 17 00:00:00 2001
From: Yuchung Cheng <ycheng@google.com>
Date: Wed, 2 May 2012 13:30:04 +0000
Subject: tcp: early retransmit: delayed fast retransmit

Implementing the advanced early retransmit (sysctl_tcp_early_retrans==2).
Delays the fast retransmit by an interval of RTT/4. We borrow the
RTO timer to implement the delay. If we receive another ACK or send
a new packet, the timer is cancelled and restored to original RTO
value offset by time elapsed.  When the delayed-ER timer fires,
we enter fast recovery and perform fast retransmit.

Signed-off-by: Yuchung Cheng <ycheng@google.com>
Acked-by: Neal Cardwell <ncardwell@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/tcp.h   |  3 ++-
 include/net/tcp.h     |  3 +++
 net/ipv4/tcp_input.c  | 69 +++++++++++++++++++++++++++++++++++++++++++++------
 net/ipv4/tcp_output.c |  5 ++--
 net/ipv4/tcp_timer.c  |  5 ++++
 5 files changed, 74 insertions(+), 11 deletions(-)

(limited to 'include/linux/tcp.h')

diff --git a/include/linux/tcp.h b/include/linux/tcp.h
index 7859b416d46e..d9b42c5be088 100644
--- a/include/linux/tcp.h
+++ b/include/linux/tcp.h
@@ -372,7 +372,8 @@ struct tcp_sock {
 		repair      : 1,
 		unused      : 1;
 	u8	repair_queue;
-	u8	do_early_retrans:1;/* Enable RFC5827 early-retransmit  */
+	u8	do_early_retrans:1,/* Enable RFC5827 early-retransmit  */
+		early_retrans_delayed:1; /* Delayed ER timer installed */
 
 /* RTT measurement */
 	u32	srtt;		/* smoothed round trip time << 3	*/
diff --git a/include/net/tcp.h b/include/net/tcp.h
index 685437a16c97..5283aa4bfa23 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -500,6 +500,8 @@ extern void tcp_send_delayed_ack(struct sock *sk);
 
 /* tcp_input.c */
 extern void tcp_cwnd_application_limited(struct sock *sk);
+extern void tcp_resume_early_retransmit(struct sock *sk);
+extern void tcp_rearm_rto(struct sock *sk);
 
 /* tcp_timer.c */
 extern void tcp_init_xmit_timers(struct sock *);
@@ -805,6 +807,7 @@ static inline void tcp_enable_early_retrans(struct tcp_sock *tp)
 {
 	tp->do_early_retrans = sysctl_tcp_early_retrans &&
 		!sysctl_tcp_thin_dupack && sysctl_tcp_reordering == 3;
+	tp->early_retrans_delayed = 0;
 }
 
 static inline void tcp_disable_early_retrans(struct tcp_sock *tp)
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index e042cabb695e..7096790e06bf 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -2344,6 +2344,27 @@ static inline int tcp_dupack_heuristics(const struct tcp_sock *tp)
 	return tcp_is_fack(tp) ? tp->fackets_out : tp->sacked_out + 1;
 }
 
+static bool tcp_pause_early_retransmit(struct sock *sk, int flag)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	unsigned long delay;
+
+	/* Delay early retransmit and entering fast recovery for
+	 * max(RTT/4, 2msec) unless ack has ECE mark, no RTT samples
+	 * available, or RTO is scheduled to fire first.
+	 */
+	if (sysctl_tcp_early_retrans < 2 || (flag & FLAG_ECE) || !tp->srtt)
+		return false;
+
+	delay = max_t(unsigned long, (tp->srtt >> 5), msecs_to_jiffies(2));
+	if (!time_after(inet_csk(sk)->icsk_timeout, (jiffies + delay)))
+		return false;
+
+	inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, delay, TCP_RTO_MAX);
+	tp->early_retrans_delayed = 1;
+	return true;
+}
+
 static inline int tcp_skb_timedout(const struct sock *sk,
 				   const struct sk_buff *skb)
 {
@@ -2451,7 +2472,7 @@ static inline int tcp_head_timedout(const struct sock *sk)
  * Main question: may we further continue forward transmission
  * with the same cwnd?
  */
-static int tcp_time_to_recover(struct sock *sk)
+static int tcp_time_to_recover(struct sock *sk, int flag)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
 	__u32 packets_out;
@@ -2505,7 +2526,7 @@ static int tcp_time_to_recover(struct sock *sk)
 	if (tp->do_early_retrans && !tp->retrans_out && tp->sacked_out &&
 	    (tp->packets_out == (tp->sacked_out + 1) && tp->packets_out < 4) &&
 	    !tcp_may_send_now(sk))
-		return 1;
+		return !tcp_pause_early_retransmit(sk, flag);
 
 	return 0;
 }
@@ -3172,7 +3193,7 @@ static void tcp_fastretrans_alert(struct sock *sk, int pkts_acked,
 		if (icsk->icsk_ca_state <= TCP_CA_Disorder)
 			tcp_try_undo_dsack(sk);
 
-		if (!tcp_time_to_recover(sk)) {
+		if (!tcp_time_to_recover(sk, flag)) {
 			tcp_try_to_open(sk, flag);
 			return;
 		}
@@ -3271,16 +3292,47 @@ static void tcp_cong_avoid(struct sock *sk, u32 ack, u32 in_flight)
 /* Restart timer after forward progress on connection.
  * RFC2988 recommends to restart timer to now+rto.
  */
-static void tcp_rearm_rto(struct sock *sk)
+void tcp_rearm_rto(struct sock *sk)
 {
-	const struct tcp_sock *tp = tcp_sk(sk);
+	struct tcp_sock *tp = tcp_sk(sk);
 
 	if (!tp->packets_out) {
 		inet_csk_clear_xmit_timer(sk, ICSK_TIME_RETRANS);
 	} else {
-		inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
-					  inet_csk(sk)->icsk_rto, TCP_RTO_MAX);
+		u32 rto = inet_csk(sk)->icsk_rto;
+		/* Offset the time elapsed after installing regular RTO */
+		if (tp->early_retrans_delayed) {
+			struct sk_buff *skb = tcp_write_queue_head(sk);
+			const u32 rto_time_stamp = TCP_SKB_CB(skb)->when + rto;
+			s32 delta = (s32)(rto_time_stamp - tcp_time_stamp);
+			/* delta may not be positive if the socket is locked
+			 * when the delayed ER timer fires and is rescheduled.
+			 */
+			if (delta > 0)
+				rto = delta;
+		}
+		inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, rto,
+					  TCP_RTO_MAX);
 	}
+	tp->early_retrans_delayed = 0;
+}
+
+/* This function is called when the delayed ER timer fires. TCP enters
+ * fast recovery and performs fast-retransmit.
+ */
+void tcp_resume_early_retransmit(struct sock *sk)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+
+	tcp_rearm_rto(sk);
+
+	/* Stop if ER is disabled after the delayed ER timer is scheduled */
+	if (!tp->do_early_retrans)
+		return;
+
+	tcp_enter_recovery(sk, false);
+	tcp_update_scoreboard(sk, 1);
+	tcp_xmit_retransmit_queue(sk);
 }
 
 /* If we get here, the whole TSO packet has not been acked. */
@@ -3729,6 +3781,9 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
 	if (after(ack, tp->snd_nxt))
 		goto invalid_ack;
 
+	if (tp->early_retrans_delayed)
+		tcp_rearm_rto(sk);
+
 	if (after(ack, prior_snd_una))
 		flag |= FLAG_SND_UNA_ADVANCED;
 
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 834e89fc541b..d94733009923 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -78,9 +78,8 @@ static void tcp_event_new_data_sent(struct sock *sk, const struct sk_buff *skb)
 		tp->frto_counter = 3;
 
 	tp->packets_out += tcp_skb_pcount(skb);
-	if (!prior_packets)
-		inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
-					  inet_csk(sk)->icsk_rto, TCP_RTO_MAX);
+	if (!prior_packets || tp->early_retrans_delayed)
+		tcp_rearm_rto(sk);
 }
 
 /* SND.NXT, if window was not shrunk.
diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
index 34d4a02c2f16..e911e6c523ec 100644
--- a/net/ipv4/tcp_timer.c
+++ b/net/ipv4/tcp_timer.c
@@ -319,6 +319,11 @@ void tcp_retransmit_timer(struct sock *sk)
 	struct tcp_sock *tp = tcp_sk(sk);
 	struct inet_connection_sock *icsk = inet_csk(sk);
 
+	if (tp->early_retrans_delayed) {
+		tcp_resume_early_retransmit(sk);
+		return;
+	}
+
 	if (!tp->packets_out)
 		goto out;
 
-- 
cgit v1.2.3


From c0a788c451e1f88b3fb5a85113b87dbc98c7abe4 Mon Sep 17 00:00:00 2001
From: Kyle McMartin <kyle@redhat.com>
Date: Mon, 7 May 2012 15:33:04 -0400
Subject: net: Fix tcp_build_and_update_options comment in struct tcp_sock

Noticed this comment didn't get updated when
tcp_build_and_update_options was refactored.

Signed-off-by: Kyle McMartin <kyle@redhat.com>
Signed-off-by: Jiri Kosina <jkosina@suse.cz>
---
 include/linux/tcp.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux/tcp.h')

diff --git a/include/linux/tcp.h b/include/linux/tcp.h
index b6c62d294380..288d201c3b59 100644
--- a/include/linux/tcp.h
+++ b/include/linux/tcp.h
@@ -406,7 +406,7 @@ struct tcp_sock {
 
 	struct sk_buff_head	out_of_order_queue; /* Out of order segments go here */
 
-	/* SACKs data, these 2 need to be together (see tcp_build_and_update_options) */
+	/* SACKs data, these 2 need to be together (see tcp_options_write) */
 	struct tcp_sack_block duplicate_sack[1]; /* D-SACK block */
 	struct tcp_sack_block selective_acks[4]; /* The SACKS themselves*/
 
-- 
cgit v1.2.3