From 64edc2736e23994e0334b70c5ff08dc33e2ebbd9 Mon Sep 17 00:00:00 2001 From: Ilpo Järvinen Date: Sat, 20 Sep 2008 21:18:32 -0700 Subject: tcp: Partial hint clearing has again become meaningless MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Ie., the difference between partial and all clearing doesn't exists anymore since the SACK optimizations got dropped by an sacktag rewrite. Signed-off-by: Ilpo Järvinen Signed-off-by: David S. Miller --- net/ipv4/tcp_output.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net/ipv4/tcp_output.c') diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 8165f5aa8c71..11490958a096 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -750,7 +750,7 @@ int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len, BUG_ON(len > skb->len); - tcp_clear_retrans_hints_partial(tp); + tcp_clear_all_retrans_hints(tp); nsize = skb_headlen(skb) - len; if (nsize < 0) nsize = 0; @@ -1823,7 +1823,7 @@ static void tcp_retrans_try_collapse(struct sock *sk, struct sk_buff *skb, tp->packets_out -= tcp_skb_pcount(next_skb); /* changed transmit queue under us so clear hints */ - tcp_clear_retrans_hints_partial(tp); + tcp_clear_all_retrans_hints(tp); sk_wmem_free_skb(sk, next_skb); } -- cgit v1.2.3 From 006f582c73f4eda35e06fd323193c3df43fb3459 Mon Sep 17 00:00:00 2001 From: Ilpo Järvinen Date: Sat, 20 Sep 2008 21:20:20 -0700 Subject: tcp: convert retransmit_cnt_hint to seqno MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Main benefit in this is that we can then freely point the retransmit_skb_hint to anywhere we want to because there's no longer need to know what would be the count changes involve, and since this is really used only as a terminator, unnecessary work is one time walk at most, and if some retransmissions are necessary after that point later on, the walk is not full waste of time anyway. Since retransmit_high must be kept valid, all lost markers must ensure that. Now I also have learned how those "holes" in the rexmittable skbs can appear, mtu probe does them. So I removed the misleading comment as well. Signed-off-by: Ilpo Järvinen Signed-off-by: David S. 
Miller --- include/linux/tcp.h | 2 +- include/net/tcp.h | 2 ++ net/ipv4/tcp_input.c | 34 ++++++++++++++++++++-------------- net/ipv4/tcp_output.c | 25 +++++++------------------ 4 files changed, 30 insertions(+), 33 deletions(-) (limited to 'net/ipv4/tcp_output.c') diff --git a/include/linux/tcp.h b/include/linux/tcp.h index 2e2557388e36..d7637c4b2840 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -358,7 +358,7 @@ struct tcp_sock { */ int lost_cnt_hint; - int retransmit_cnt_hint; + u32 retransmit_high; /* L-bits may be on up to this seqno */ u32 lost_retrans_low; /* Sent seq after any rxmit (lowest) */ diff --git a/include/net/tcp.h b/include/net/tcp.h index b71676326950..d0e90c50722b 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -472,6 +472,8 @@ extern void tcp_send_delayed_ack(struct sock *sk); /* tcp_input.c */ extern void tcp_cwnd_application_limited(struct sock *sk); +extern void tcp_skb_mark_lost_uncond_verify(struct tcp_sock *tp, + struct sk_buff *skb); /* tcp_timer.c */ extern void tcp_init_xmit_timers(struct sock *); diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 12512336dbd8..d271cc825005 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -979,17 +979,17 @@ static void tcp_update_reordering(struct sock *sk, const int metric, } } -/* RFC: This is from the original, I doubt that this is necessary at all: - * clear xmit_retrans hint if seq of this skb is beyond hint. How could we - * retransmitted past LOST markings in the first place? I'm not fully sure - * about undo and end of connection cases, which can cause R without L? - */ +/* This must be called before lost_out is incremented */ static void tcp_verify_retransmit_hint(struct tcp_sock *tp, struct sk_buff *skb) { - if ((tp->retransmit_skb_hint != NULL) && + if ((tp->retransmit_skb_hint == NULL) || before(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(tp->retransmit_skb_hint)->seq)) - tp->retransmit_skb_hint = NULL; + tp->retransmit_skb_hint = skb; + + if (!tp->lost_out || + after(TCP_SKB_CB(skb)->end_seq, tp->retransmit_high)) + tp->retransmit_high = TCP_SKB_CB(skb)->end_seq; } static void tcp_skb_mark_lost(struct tcp_sock *tp, struct sk_buff *skb) @@ -1002,6 +1002,16 @@ static void tcp_skb_mark_lost(struct tcp_sock *tp, struct sk_buff *skb) } } +void tcp_skb_mark_lost_uncond_verify(struct tcp_sock *tp, struct sk_buff *skb) +{ + tcp_verify_retransmit_hint(tp, skb); + + if (!(TCP_SKB_CB(skb)->sacked & (TCPCB_LOST|TCPCB_SACKED_ACKED))) { + tp->lost_out += tcp_skb_pcount(skb); + TCP_SKB_CB(skb)->sacked |= TCPCB_LOST; + } +} + /* This procedure tags the retransmission queue when SACKs arrive. * * We have three tag bits: SACKED(S), RETRANS(R) and LOST(L). 
@@ -1178,13 +1188,7 @@ static void tcp_mark_lost_retrans(struct sock *sk) TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS; tp->retrans_out -= tcp_skb_pcount(skb); - /* clear lost hint */ - tp->retransmit_skb_hint = NULL; - - if (!(TCP_SKB_CB(skb)->sacked & (TCPCB_LOST|TCPCB_SACKED_ACKED))) { - tp->lost_out += tcp_skb_pcount(skb); - TCP_SKB_CB(skb)->sacked |= TCPCB_LOST; - } + tcp_skb_mark_lost_uncond_verify(tp, skb); NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPLOSTRETRANSMIT); } else { if (before(ack_seq, new_low_seq)) @@ -1890,6 +1894,7 @@ static void tcp_enter_frto_loss(struct sock *sk, int allowed_segments, int flag) if (!(TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)) { TCP_SKB_CB(skb)->sacked |= TCPCB_LOST; tp->lost_out += tcp_skb_pcount(skb); + tp->retransmit_high = TCP_SKB_CB(skb)->end_seq; } } tcp_verify_left_out(tp); @@ -1974,6 +1979,7 @@ void tcp_enter_loss(struct sock *sk, int how) TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_ACKED; TCP_SKB_CB(skb)->sacked |= TCPCB_LOST; tp->lost_out += tcp_skb_pcount(skb); + tp->retransmit_high = TCP_SKB_CB(skb)->end_seq; } } tcp_verify_left_out(tp); diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 11490958a096..cfae61b40c44 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -1838,7 +1838,7 @@ void tcp_simple_retransmit(struct sock *sk) struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *skb; unsigned int mss = tcp_current_mss(sk, 0); - int lost = 0; + u32 prior_lost = tp->lost_out; tcp_for_write_queue(skb, sk) { if (skb == tcp_send_head(sk)) @@ -1849,17 +1849,13 @@ void tcp_simple_retransmit(struct sock *sk) TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS; tp->retrans_out -= tcp_skb_pcount(skb); } - if (!(TCP_SKB_CB(skb)->sacked & TCPCB_LOST)) { - TCP_SKB_CB(skb)->sacked |= TCPCB_LOST; - tp->lost_out += tcp_skb_pcount(skb); - lost = 1; - } + tcp_skb_mark_lost_uncond_verify(tp, skb); } } tcp_clear_all_retrans_hints(tp); - if (!lost) + if (prior_lost == tp->lost_out) return; if (tcp_is_reno(tp)) @@ -2009,15 +2005,11 @@ void tcp_xmit_retransmit_queue(struct sock *sk) const struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *skb; - int packet_cnt; - if (tp->retransmit_skb_hint) { + if (tp->retransmit_skb_hint) skb = tp->retransmit_skb_hint; - packet_cnt = tp->retransmit_cnt_hint; - } else { + else skb = tcp_write_queue_head(sk); - packet_cnt = 0; - } /* First pass: retransmit lost packets. 
*/ if (tp->lost_out) { @@ -2028,7 +2020,6 @@ void tcp_xmit_retransmit_queue(struct sock *sk) break; /* we could do better than to assign each time */ tp->retransmit_skb_hint = skb; - tp->retransmit_cnt_hint = packet_cnt; /* Assume this retransmit will generate * only one packet for congestion window @@ -2039,6 +2030,8 @@ void tcp_xmit_retransmit_queue(struct sock *sk) */ if (tcp_packets_in_flight(tp) >= tp->snd_cwnd) return; + if (!before(TCP_SKB_CB(skb)->seq, tp->retransmit_high)) + break; if (sacked & TCPCB_LOST) { if (!(sacked & (TCPCB_SACKED_ACKED|TCPCB_SACKED_RETRANS))) { @@ -2059,10 +2052,6 @@ void tcp_xmit_retransmit_queue(struct sock *sk) inet_csk(sk)->icsk_rto, TCP_RTO_MAX); } - - packet_cnt += tcp_skb_pcount(skb); - if (packet_cnt >= tp->lost_out) - break; } } } -- cgit v1.2.3 From b5afe7bc71a1689376c9b547376d17568469f3b3 Mon Sep 17 00:00:00 2001 From: Ilpo Järvinen Date: Sat, 20 Sep 2008 21:21:54 -0700 Subject: tcp: add tcp_can_forward_retransmit MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Ilpo Järvinen Signed-off-by: David S. Miller --- net/ipv4/tcp_output.c | 46 ++++++++++++++++++++++++++++------------------ 1 file changed, 28 insertions(+), 18 deletions(-) (limited to 'net/ipv4/tcp_output.c') diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index cfae61b40c44..957c4e3d2176 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -1992,6 +1992,33 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb) return err; } +static int tcp_can_forward_retransmit(struct sock *sk) +{ + const struct inet_connection_sock *icsk = inet_csk(sk); + struct tcp_sock *tp = tcp_sk(sk); + + /* Forward retransmissions are possible only during Recovery. */ + if (icsk->icsk_ca_state != TCP_CA_Recovery) + return 0; + + /* No forward retransmissions in Reno are possible. */ + if (tcp_is_reno(tp)) + return 0; + + /* Yeah, we have to make difficult choice between forward transmission + * and retransmission... Both ways have their merits... + * + * For now we do not retransmit anything, while we have some new + * segments to send. In the other cases, follow rule 3 for + * NextSeg() specified in RFC3517. + */ + + if (tcp_may_send_now(sk)) + return 0; + + return 1; +} + /* This gets called after a retransmit timeout, and the initially * retransmitted data is acknowledged. It tries to continue * resending the rest of the retransmit queue, until either @@ -2057,24 +2084,7 @@ void tcp_xmit_retransmit_queue(struct sock *sk) } /* OK, demanded retransmission is finished. */ - - /* Forward retransmissions are possible only during Recovery. */ - if (icsk->icsk_ca_state != TCP_CA_Recovery) - return; - - /* No forward retransmissions in Reno are possible. */ - if (tcp_is_reno(tp)) - return; - - /* Yeah, we have to make difficult choice between forward transmission - * and retransmission... Both ways have their merits... - * - * For now we do not retransmit anything, while we have some new - * segments to send. In the other cases, follow rule 3 for - * NextSeg() specified in RFC3517. 
- */ - - if (tcp_may_send_now(sk)) + if (!tcp_can_forward_retransmit(sk)) return; /* If nothing is SACKed, highest_sack in the loop won't be valid */ -- cgit v1.2.3 From 34638570b58290e8cb875fb24dcbe836ffeb6cb8 Mon Sep 17 00:00:00 2001 From: Ilpo Järvinen Date: Sat, 20 Sep 2008 21:22:17 -0700 Subject: tcp: remove obsolete validity concern MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Ilpo Järvinen Signed-off-by: David S. Miller --- net/ipv4/tcp_output.c | 4 ---- 1 file changed, 4 deletions(-) (limited to 'net/ipv4/tcp_output.c') diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 957c4e3d2176..6f2a3f4a1af3 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -2087,10 +2087,6 @@ void tcp_xmit_retransmit_queue(struct sock *sk) if (!tcp_can_forward_retransmit(sk)) return; - /* If nothing is SACKed, highest_sack in the loop won't be valid */ - if (!tp->sacked_out) - return; - if (tp->forward_skb_hint) skb = tp->forward_skb_hint; else -- cgit v1.2.3 From 61eb55f4db7eaf5fb2d5ec12981a8cda755bb0e1 Mon Sep 17 00:00:00 2001 From: Ilpo Järvinen Date: Sat, 20 Sep 2008 21:22:59 -0700 Subject: tcp: Reorganize skb tagbit checks MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Ilpo Järvinen Signed-off-by: David S. Miller --- net/ipv4/tcp_output.c | 38 +++++++++++++++++++------------------- 1 file changed, 19 insertions(+), 19 deletions(-) (limited to 'net/ipv4/tcp_output.c') diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 6f2a3f4a1af3..2f24ecc37067 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -2032,6 +2032,7 @@ void tcp_xmit_retransmit_queue(struct sock *sk) const struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *skb; + int mib_idx; if (tp->retransmit_skb_hint) skb = tp->retransmit_skb_hint; @@ -2059,27 +2060,26 @@ void tcp_xmit_retransmit_queue(struct sock *sk) return; if (!before(TCP_SKB_CB(skb)->seq, tp->retransmit_high)) break; + if (sacked & (TCPCB_SACKED_ACKED|TCPCB_SACKED_RETRANS)) + continue; - if (sacked & TCPCB_LOST) { - if (!(sacked & (TCPCB_SACKED_ACKED|TCPCB_SACKED_RETRANS))) { - int mib_idx; - - if (tcp_retransmit_skb(sk, skb)) { - tp->retransmit_skb_hint = NULL; - return; - } - if (icsk->icsk_ca_state != TCP_CA_Loss) - mib_idx = LINUX_MIB_TCPFASTRETRANS; - else - mib_idx = LINUX_MIB_TCPSLOWSTARTRETRANS; - NET_INC_STATS_BH(sock_net(sk), mib_idx); - - if (skb == tcp_write_queue_head(sk)) - inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, - inet_csk(sk)->icsk_rto, - TCP_RTO_MAX); - } + if (!(sacked & TCPCB_LOST)) + continue; + + if (tcp_retransmit_skb(sk, skb)) { + tp->retransmit_skb_hint = NULL; + return; } + if (icsk->icsk_ca_state != TCP_CA_Loss) + mib_idx = LINUX_MIB_TCPFASTRETRANS; + else + mib_idx = LINUX_MIB_TCPSLOWSTARTRETRANS; + NET_INC_STATS_BH(sock_net(sk), mib_idx); + + if (skb == tcp_write_queue_head(sk)) + inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, + inet_csk(sk)->icsk_rto, + TCP_RTO_MAX); } } -- cgit v1.2.3 From 08ebd1721ab8fd362e90ae17b461c07b23fa2824 Mon Sep 17 00:00:00 2001 From: Ilpo Järvinen Date: Sat, 20 Sep 2008 21:23:49 -0700 Subject: tcp: remove tp->lost_out guard to make joining diff nicer MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The validity of the retransmit_high must then be ensured if no L'ed skb exits! 
This makes a minor change to behavior, we now have to iterate the head to find out that the loop terminates. Signed-off-by: Ilpo Järvinen Signed-off-by: David S. Miller --- net/ipv4/tcp_output.c | 75 ++++++++++++++++++++++++++------------------------- 1 file changed, 38 insertions(+), 37 deletions(-) (limited to 'net/ipv4/tcp_output.c') diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 2f24ecc37067..9f44be633ef6 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -2034,53 +2034,54 @@ void tcp_xmit_retransmit_queue(struct sock *sk) struct sk_buff *skb; int mib_idx; + if (!tp->lost_out) + tp->retransmit_high = tp->snd_una; + if (tp->retransmit_skb_hint) skb = tp->retransmit_skb_hint; else skb = tcp_write_queue_head(sk); /* First pass: retransmit lost packets. */ - if (tp->lost_out) { - tcp_for_write_queue_from(skb, sk) { - __u8 sacked = TCP_SKB_CB(skb)->sacked; + tcp_for_write_queue_from(skb, sk) { + __u8 sacked = TCP_SKB_CB(skb)->sacked; - if (skb == tcp_send_head(sk)) - break; - /* we could do better than to assign each time */ - tp->retransmit_skb_hint = skb; - - /* Assume this retransmit will generate - * only one packet for congestion window - * calculation purposes. This works because - * tcp_retransmit_skb() will chop up the - * packet to be MSS sized and all the - * packet counting works out. - */ - if (tcp_packets_in_flight(tp) >= tp->snd_cwnd) - return; - if (!before(TCP_SKB_CB(skb)->seq, tp->retransmit_high)) - break; - if (sacked & (TCPCB_SACKED_ACKED|TCPCB_SACKED_RETRANS)) - continue; + if (skb == tcp_send_head(sk)) + break; + /* we could do better than to assign each time */ + tp->retransmit_skb_hint = skb; + + /* Assume this retransmit will generate + * only one packet for congestion window + * calculation purposes. This works because + * tcp_retransmit_skb() will chop up the + * packet to be MSS sized and all the + * packet counting works out. + */ + if (tcp_packets_in_flight(tp) >= tp->snd_cwnd) + return; + if (!before(TCP_SKB_CB(skb)->seq, tp->retransmit_high)) + break; + if (sacked & (TCPCB_SACKED_ACKED|TCPCB_SACKED_RETRANS)) + continue; - if (!(sacked & TCPCB_LOST)) - continue; + if (!(sacked & TCPCB_LOST)) + continue; - if (tcp_retransmit_skb(sk, skb)) { - tp->retransmit_skb_hint = NULL; - return; - } - if (icsk->icsk_ca_state != TCP_CA_Loss) - mib_idx = LINUX_MIB_TCPFASTRETRANS; - else - mib_idx = LINUX_MIB_TCPSLOWSTARTRETRANS; - NET_INC_STATS_BH(sock_net(sk), mib_idx); - - if (skb == tcp_write_queue_head(sk)) - inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, - inet_csk(sk)->icsk_rto, - TCP_RTO_MAX); + if (tcp_retransmit_skb(sk, skb)) { + tp->retransmit_skb_hint = NULL; + return; } + if (icsk->icsk_ca_state != TCP_CA_Loss) + mib_idx = LINUX_MIB_TCPFASTRETRANS; + else + mib_idx = LINUX_MIB_TCPSLOWSTARTRETRANS; + NET_INC_STATS_BH(sock_net(sk), mib_idx); + + if (skb == tcp_write_queue_head(sk)) + inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, + inet_csk(sk)->icsk_rto, + TCP_RTO_MAX); } /* OK, demanded retransmission is finished. */ -- cgit v1.2.3 From 0e1c54c2a405494281e0639aacc90db03b50ae77 Mon Sep 17 00:00:00 2001 From: Ilpo Järvinen Date: Sat, 20 Sep 2008 21:24:21 -0700 Subject: tcp: reorganize retransmit code loops MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Both loops are quite similar, so they can be combined with little effort. As a result, forward_skb_hint becomes obsolete as well. Signed-off-by: Ilpo Järvinen Signed-off-by: David S. 
Miller --- include/linux/tcp.h | 1 - include/net/tcp.h | 1 - net/ipv4/tcp_output.c | 79 +++++++++++++++++++++------------------------------ 3 files changed, 33 insertions(+), 48 deletions(-) (limited to 'net/ipv4/tcp_output.c') diff --git a/include/linux/tcp.h b/include/linux/tcp.h index d7637c4b2840..767290628292 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -342,7 +342,6 @@ struct tcp_sock { struct sk_buff* lost_skb_hint; struct sk_buff *scoreboard_skb_hint; struct sk_buff *retransmit_skb_hint; - struct sk_buff *forward_skb_hint; struct sk_buff_head out_of_order_queue; /* Out of order segments go here */ diff --git a/include/net/tcp.h b/include/net/tcp.h index d0e90c50722b..220f54cf42ec 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -1042,7 +1042,6 @@ static inline void tcp_clear_all_retrans_hints(struct tcp_sock *tp) tp->lost_skb_hint = NULL; tp->scoreboard_skb_hint = NULL; tp->retransmit_skb_hint = NULL; - tp->forward_skb_hint = NULL; } /* MD5 Signature */ diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 9f44be633ef6..b5b4ddcdda41 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -2032,7 +2032,9 @@ void tcp_xmit_retransmit_queue(struct sock *sk) const struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *skb; + struct sk_buff *hole = NULL; int mib_idx; + int fwd_rexmitting = 0; if (!tp->lost_out) tp->retransmit_high = tp->snd_una; @@ -2049,7 +2051,8 @@ void tcp_xmit_retransmit_queue(struct sock *sk) if (skb == tcp_send_head(sk)) break; /* we could do better than to assign each time */ - tp->retransmit_skb_hint = skb; + if (hole == NULL) + tp->retransmit_skb_hint = skb; /* Assume this retransmit will generate * only one packet for congestion window @@ -2060,65 +2063,49 @@ void tcp_xmit_retransmit_queue(struct sock *sk) */ if (tcp_packets_in_flight(tp) >= tp->snd_cwnd) return; - if (!before(TCP_SKB_CB(skb)->seq, tp->retransmit_high)) - break; - if (sacked & (TCPCB_SACKED_ACKED|TCPCB_SACKED_RETRANS)) - continue; - - if (!(sacked & TCPCB_LOST)) - continue; - - if (tcp_retransmit_skb(sk, skb)) { - tp->retransmit_skb_hint = NULL; - return; - } - if (icsk->icsk_ca_state != TCP_CA_Loss) - mib_idx = LINUX_MIB_TCPFASTRETRANS; - else - mib_idx = LINUX_MIB_TCPSLOWSTARTRETRANS; - NET_INC_STATS_BH(sock_net(sk), mib_idx); - - if (skb == tcp_write_queue_head(sk)) - inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, - inet_csk(sk)->icsk_rto, - TCP_RTO_MAX); - } - - /* OK, demanded retransmission is finished. 
*/ - if (!tcp_can_forward_retransmit(sk)) - return; - if (tp->forward_skb_hint) - skb = tp->forward_skb_hint; - else - skb = tcp_write_queue_head(sk); + if (fwd_rexmitting) { +begin_fwd: + if (!before(TCP_SKB_CB(skb)->seq, tcp_highest_sack_seq(tp))) + break; + mib_idx = LINUX_MIB_TCPFORWARDRETRANS; - tcp_for_write_queue_from(skb, sk) { - if (skb == tcp_send_head(sk)) - break; - tp->forward_skb_hint = skb; + } else if (!before(TCP_SKB_CB(skb)->seq, tp->retransmit_high)) { + if (!tcp_can_forward_retransmit(sk)) + break; + /* Backtrack if necessary to non-L'ed skb */ + if (hole != NULL) { + skb = hole; + hole = NULL; + } + fwd_rexmitting = 1; + goto begin_fwd; - if (!before(TCP_SKB_CB(skb)->seq, tcp_highest_sack_seq(tp))) - break; + } else if (!(sacked & TCPCB_LOST)) { + if (hole == NULL && !(sacked & TCPCB_SACKED_RETRANS)) + hole = skb; + continue; - if (tcp_packets_in_flight(tp) >= tp->snd_cwnd) - break; + } else { + if (icsk->icsk_ca_state != TCP_CA_Loss) + mib_idx = LINUX_MIB_TCPFASTRETRANS; + else + mib_idx = LINUX_MIB_TCPSLOWSTARTRETRANS; + } - if (TCP_SKB_CB(skb)->sacked & TCPCB_TAGBITS) + if (sacked & (TCPCB_SACKED_ACKED|TCPCB_SACKED_RETRANS)) continue; - /* Ok, retransmit it. */ if (tcp_retransmit_skb(sk, skb)) { - tp->forward_skb_hint = NULL; - break; + tp->retransmit_skb_hint = NULL; + return; } + NET_INC_STATS_BH(sock_net(sk), mib_idx); if (skb == tcp_write_queue_head(sk)) inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, inet_csk(sk)->icsk_rto, TCP_RTO_MAX); - - NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPFORWARDRETRANS); } } -- cgit v1.2.3 From f0ceb0ed86b4792a4ed9d3438f5f7572e48f9803 Mon Sep 17 00:00:00 2001 From: Ilpo Järvinen Date: Sat, 20 Sep 2008 21:24:49 -0700 Subject: tcp: remove retransmit_skb_hint clearing from failure MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This doesn't much sense here afaict, probably never has. Since fragmenting and collapsing deal the hints by themselves, there should be very little reason for the rexmit loop to do that. Signed-off-by: Ilpo Järvinen Signed-off-by: David S. Miller --- net/ipv4/tcp_output.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'net/ipv4/tcp_output.c') diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index b5b4ddcdda41..f900fae8b87e 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -2096,10 +2096,8 @@ begin_fwd: if (sacked & (TCPCB_SACKED_ACKED|TCPCB_SACKED_RETRANS)) continue; - if (tcp_retransmit_skb(sk, skb)) { - tp->retransmit_skb_hint = NULL; + if (tcp_retransmit_skb(sk, skb)) return; - } NET_INC_STATS_BH(sock_net(sk), mib_idx); if (skb == tcp_write_queue_head(sk)) -- cgit v1.2.3 From ef9da47c7cc64d69526331f315e76b5680d4048f Mon Sep 17 00:00:00 2001 From: Ilpo Järvinen Date: Sat, 20 Sep 2008 21:25:15 -0700 Subject: tcp: don't clear retransmit_skb_hint when not necessary MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Most importantly avoid doing it with cumulative ACK. Not clearing means that we no longer need n^2 processing in resolution of each fast recovery. Signed-off-by: Ilpo Järvinen Signed-off-by: David S. 
Miller --- include/net/tcp.h | 7 ++++++- net/ipv4/tcp_input.c | 4 +++- net/ipv4/tcp_output.c | 8 +++++--- 3 files changed, 14 insertions(+), 5 deletions(-) (limited to 'net/ipv4/tcp_output.c') diff --git a/include/net/tcp.h b/include/net/tcp.h index 220f54cf42ec..ea815723d414 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -1037,10 +1037,15 @@ static inline void tcp_mib_init(struct net *net) } /* from STCP */ -static inline void tcp_clear_all_retrans_hints(struct tcp_sock *tp) +static inline void tcp_clear_retrans_hints_partial(struct tcp_sock *tp) { tp->lost_skb_hint = NULL; tp->scoreboard_skb_hint = NULL; +} + +static inline void tcp_clear_all_retrans_hints(struct tcp_sock *tp) +{ + tcp_clear_retrans_hints_partial(tp); tp->retransmit_skb_hint = NULL; } diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index d017aed6edd7..44a4fffc2cc3 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -2925,7 +2925,9 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets) tcp_unlink_write_queue(skb, sk); sk_wmem_free_skb(sk, skb); - tcp_clear_all_retrans_hints(tp); + tcp_clear_retrans_hints_partial(tp); + if (skb == tp->retransmit_skb_hint) + tp->retransmit_skb_hint = NULL; } if (skb && (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)) diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index f900fae8b87e..239cea7b6c0e 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -750,7 +750,7 @@ int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len, BUG_ON(len > skb->len); - tcp_clear_all_retrans_hints(tp); + tcp_clear_retrans_hints_partial(tp); nsize = skb_headlen(skb) - len; if (nsize < 0) nsize = 0; @@ -1823,7 +1823,9 @@ static void tcp_retrans_try_collapse(struct sock *sk, struct sk_buff *skb, tp->packets_out -= tcp_skb_pcount(next_skb); /* changed transmit queue under us so clear hints */ - tcp_clear_all_retrans_hints(tp); + tcp_clear_retrans_hints_partial(tp); + if (next_skb == tp->retransmit_skb_hint) + tp->retransmit_skb_hint = skb; sk_wmem_free_skb(sk, next_skb); } @@ -1853,7 +1855,7 @@ void tcp_simple_retransmit(struct sock *sk) } } - tcp_clear_all_retrans_hints(tp); + tcp_clear_retrans_hints_partial(tp); if (prior_lost == tp->lost_out) return; -- cgit v1.2.3 From 618d9f25548ba6fc3a9cd2ce5cd56f4f015b0635 Mon Sep 17 00:00:00 2001 From: Ilpo Järvinen Date: Sat, 20 Sep 2008 21:26:22 -0700 Subject: tcp: back retransmit_high when it over-estimated MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit If lost skb is sacked, we might have nothing to retransmit as high as the retransmit_high is pointing to, so place it lower to avoid unnecessary walking. This is mainly for the case where high L'ed skbs gets sacked. Signed-off-by: Ilpo Järvinen Signed-off-by: David S. 
Miller --- net/ipv4/tcp_output.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) (limited to 'net/ipv4/tcp_output.c') diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 239cea7b6c0e..8f9793a37b61 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -2035,16 +2035,22 @@ void tcp_xmit_retransmit_queue(struct sock *sk) struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *skb; struct sk_buff *hole = NULL; + u32 last_lost; int mib_idx; int fwd_rexmitting = 0; if (!tp->lost_out) tp->retransmit_high = tp->snd_una; - if (tp->retransmit_skb_hint) + if (tp->retransmit_skb_hint) { skb = tp->retransmit_skb_hint; - else + last_lost = TCP_SKB_CB(skb)->end_seq; + if (after(last_lost, tp->retransmit_high)) + last_lost = tp->retransmit_high; + } else { skb = tcp_write_queue_head(sk); + last_lost = tp->snd_una; + } /* First pass: retransmit lost packets. */ tcp_for_write_queue_from(skb, sk) { @@ -2073,6 +2079,7 @@ begin_fwd: mib_idx = LINUX_MIB_TCPFORWARDRETRANS; } else if (!before(TCP_SKB_CB(skb)->seq, tp->retransmit_high)) { + tp->retransmit_high = last_lost; if (!tcp_can_forward_retransmit(sk)) break; /* Backtrack if necessary to non-L'ed skb */ @@ -2089,6 +2096,7 @@ begin_fwd: continue; } else { + last_lost = TCP_SKB_CB(skb)->end_seq; if (icsk->icsk_ca_state != TCP_CA_Loss) mib_idx = LINUX_MIB_TCPFASTRETRANS; else -- cgit v1.2.3 From f5fff5dc8a7a3f395b0525c02ba92c95d42b7390 Mon Sep 17 00:00:00 2001 From: Tom Quetchenbach Date: Sun, 21 Sep 2008 00:21:51 -0700 Subject: tcp: advertise MSS requested by user I'm trying to use the TCP_MAXSEG option to setsockopt() to set the MSS for both sides of a bidirectional connection. man tcp says: "If this option is set before connection establishment, it also changes the MSS value announced to the other end in the initial packet." However, the kernel only uses the MTU/route cache to set the advertised MSS. That means if I set the MSS to, say, 500 before calling connect(), I will send at most 500-byte packets, but I will still receive 1500-byte packets in reply. This is a bug, either in the kernel or the documentation. This patch (applies to latest net-2.6) reduces the advertised value to that requested by the user as long as setsockopt() is called before connect() or accept(). This seems like the behavior that one would expect as well as that which is documented. I've tried to make sure that things that depend on the advertised MSS are set correctly. Signed-off-by: Tom Quetchenbach Signed-off-by: David S. 
Miller --- net/ipv4/tcp_ipv4.c | 4 ++++ net/ipv4/tcp_output.c | 13 ++++++++++--- 2 files changed, 14 insertions(+), 3 deletions(-) (limited to 'net/ipv4/tcp_output.c') diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 3dfbc21e555a..44aef1c1f373 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -1364,6 +1364,10 @@ struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb, tcp_mtup_init(newsk); tcp_sync_mss(newsk, dst_mtu(dst)); newtp->advmss = dst_metric(dst, RTAX_ADVMSS); + if (tcp_sk(sk)->rx_opt.user_mss && + tcp_sk(sk)->rx_opt.user_mss < newtp->advmss) + newtp->advmss = tcp_sk(sk)->rx_opt.user_mss; + tcp_initialize_rcv_mss(newsk); #ifdef CONFIG_TCP_MD5SIG diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 8f9793a37b61..c3d58ee3e16f 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -2232,6 +2232,7 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst, struct sk_buff *skb; struct tcp_md5sig_key *md5; __u8 *md5_hash_location; + int mss; skb = sock_wmalloc(sk, MAX_TCP_HEADER + 15, 1, GFP_ATOMIC); if (skb == NULL) @@ -2242,13 +2243,17 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst, skb->dst = dst_clone(dst); + mss = dst_metric(dst, RTAX_ADVMSS); + if (tp->rx_opt.user_mss && tp->rx_opt.user_mss < mss) + mss = tp->rx_opt.user_mss; + if (req->rcv_wnd == 0) { /* ignored for retransmitted syns */ __u8 rcv_wscale; /* Set this up on the first call only */ req->window_clamp = tp->window_clamp ? : dst_metric(dst, RTAX_WINDOW); /* tcp_full_space because it is guaranteed to be the first packet */ tcp_select_initial_window(tcp_full_space(sk), - dst_metric(dst, RTAX_ADVMSS) - (ireq->tstamp_ok ? TCPOLEN_TSTAMP_ALIGNED : 0), + mss - (ireq->tstamp_ok ? TCPOLEN_TSTAMP_ALIGNED : 0), &req->rcv_wnd, &req->window_clamp, ireq->wscale_ok, @@ -2258,8 +2263,7 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst, memset(&opts, 0, sizeof(opts)); TCP_SKB_CB(skb)->when = tcp_time_stamp; - tcp_header_size = tcp_synack_options(sk, req, - dst_metric(dst, RTAX_ADVMSS), + tcp_header_size = tcp_synack_options(sk, req, mss, skb, &opts, &md5) + sizeof(struct tcphdr); @@ -2333,6 +2337,9 @@ static void tcp_connect_init(struct sock *sk) if (!tp->window_clamp) tp->window_clamp = dst_metric(dst, RTAX_WINDOW); tp->advmss = dst_metric(dst, RTAX_ADVMSS); + if (tp->rx_opt.user_mss && tp->rx_opt.user_mss < tp->advmss) + tp->advmss = tp->rx_opt.user_mss; + tcp_initialize_rcv_mss(sk); tcp_select_initial_window(tcp_full_space(sk), -- cgit v1.2.3 From 77d40a0952b16e020ce07c4cf9fb22024448275b Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Tue, 23 Sep 2008 01:29:23 -0700 Subject: tcp: Fix order of tests in tcp_retransmit_skb() tcp_write_queue_next() must only be made if we know that tcp_skb_is_last() evaluates to false. Signed-off-by: David S. Miller --- net/ipv4/tcp_output.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/ipv4/tcp_output.c') diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index c3d58ee3e16f..a8499ef3234a 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -1932,8 +1932,8 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb) /* Collapse two adjacent packets if worthwhile and we can. 
*/ if (!(TCP_SKB_CB(skb)->flags & TCPCB_FLAG_SYN) && (skb->len < (cur_mss >> 1)) && - (tcp_write_queue_next(sk, skb) != tcp_send_head(sk)) && (!tcp_skb_is_last(sk, skb)) && + (tcp_write_queue_next(sk, skb) != tcp_send_head(sk)) && (skb_shinfo(skb)->nr_frags == 0 && skb_shinfo(tcp_write_queue_next(sk, skb))->nr_frags == 0) && (tcp_skb_pcount(skb) == 1 && -- cgit v1.2.3 From a3116ac5c216fc3c145906a46df9ce542ff7dcf2 Mon Sep 17 00:00:00 2001 From: KOVACS Krisztian Date: Wed, 1 Oct 2008 07:46:49 -0700 Subject: tcp: Port redirection support for TCP Current TCP code relies on the local port of the listening socket being the same as the destination address of the incoming connection. Port redirection used by many transparent proxying techniques obviously breaks this, so we have to store the original destination port address. This patch extends struct inet_request_sock and stores the incoming destination port value there. It also modifies the handshake code to use that value as the source port when sending reply packets. Signed-off-by: KOVACS Krisztian Signed-off-by: David S. Miller --- include/net/inet_sock.h | 2 +- include/net/tcp.h | 1 + net/ipv4/inet_connection_sock.c | 2 ++ net/ipv4/syncookies.c | 1 + net/ipv4/tcp_output.c | 2 +- 5 files changed, 6 insertions(+), 2 deletions(-) (limited to 'net/ipv4/tcp_output.c') diff --git a/include/net/inet_sock.h b/include/net/inet_sock.h index dced3f64f975..de0ecc71cf03 100644 --- a/include/net/inet_sock.h +++ b/include/net/inet_sock.h @@ -61,8 +61,8 @@ struct inet_request_sock { struct request_sock req; #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) u16 inet6_rsk_offset; - /* 2 bytes hole, try to pack */ #endif + __be16 loc_port; __be32 loc_addr; __be32 rmt_addr; __be16 rmt_port; diff --git a/include/net/tcp.h b/include/net/tcp.h index 12c9b4fec040..f6cc34143154 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -976,6 +976,7 @@ static inline void tcp_openreq_init(struct request_sock *req, ireq->acked = 0; ireq->ecn_ok = 0; ireq->rmt_port = tcp_hdr(skb)->source; + ireq->loc_port = tcp_hdr(skb)->dest; } extern void tcp_enter_memory_pressure(struct sock *sk); diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index 432c570c9f5f..21fcc5a9045f 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -516,6 +516,8 @@ struct sock *inet_csk_clone(struct sock *sk, const struct request_sock *req, newicsk->icsk_bind_hash = NULL; inet_sk(newsk)->dport = inet_rsk(req)->rmt_port; + inet_sk(newsk)->num = ntohs(inet_rsk(req)->loc_port); + inet_sk(newsk)->sport = inet_rsk(req)->loc_port; newsk->sk_write_space = sk_stream_write_space; newicsk->icsk_retransmits = 0; diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c index 929302b2ba94..d346c22aa6ae 100644 --- a/net/ipv4/syncookies.c +++ b/net/ipv4/syncookies.c @@ -297,6 +297,7 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb, treq->rcv_isn = ntohl(th->seq) - 1; treq->snt_isn = cookie; req->mss = mss; + ireq->loc_port = th->dest; ireq->rmt_port = th->source; ireq->loc_addr = ip_hdr(skb)->daddr; ireq->rmt_addr = ip_hdr(skb)->saddr; diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index a8499ef3234a..493553c71d32 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -2275,7 +2275,7 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst, th->syn = 1; th->ack = 1; TCP_ECN_make_synack(req, th); - th->source = inet_sk(sk)->sport; + th->source = ireq->loc_port; th->dest = ireq->rmt_port; /* 
Setting of flags are superfluous here for callers (and ECE is * not even correctly set) -- cgit v1.2.3 From 33f5f57eeb0c6386fdd85f9c690dc8d700ba7928 Mon Sep 17 00:00:00 2001 From: Ilpo Järvinen Date: Tue, 7 Oct 2008 14:43:06 -0700 Subject: tcp: kill pointless urg_mode MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit It all started from me noticing that this urgent check in tcp_clean_rtx_queue is unnecessarily inside the loop. Then I took a longer look to it and found out that the users of urg_mode can trivially do without, well almost, there was one gotcha. Bonus: those funny people who use urg with >= 2^31 write_seq - snd_una could now rejoice too (that's the only purpose for the between being there, otherwise a simple compare would have done the thing). Not that I assume that the rest of the tcp code happily lives with such mind-boggling numbers :-). Alas, it turned out to be impossible to set wmem to such numbers anyway, yes I really tried a big sendfile after setting some wmem but nothing happened :-). ...Tcp_wmem is int and so is sk_sndbuf... So I hacked a bit variable to long and found out that it seems to work... :-) Signed-off-by: Ilpo Järvinen Signed-off-by: David S. Miller --- include/linux/tcp.h | 9 ++++----- net/ipv4/tcp.c | 4 +--- net/ipv4/tcp_input.c | 11 ++++++----- net/ipv4/tcp_minisocks.c | 1 + net/ipv4/tcp_output.c | 18 ++++++++++++------ 5 files changed, 24 insertions(+), 19 deletions(-) (limited to 'net/ipv4/tcp_output.c') diff --git a/include/linux/tcp.h b/include/linux/tcp.h index 767290628292..fe77e1499ab7 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -312,8 +312,11 @@ struct tcp_sock { u32 retrans_out; /* Retransmitted packets out */ u16 urg_data; /* Saved octet of OOB data and control flags */ - u8 urg_mode; /* In urgent mode */ u8 ecn_flags; /* ECN status bits. */ + u8 reordering; /* Packet reordering metric. */ + u32 snd_up; /* Urgent pointer */ + + u8 keepalive_probes; /* num of allowed keep alive probes */ /* * Options received (usually on last packet, some only on SYN packets). */ @@ -361,8 +364,6 @@ struct tcp_sock { u32 lost_retrans_low; /* Sent seq after any rxmit (lowest) */ - u8 reordering; /* Packet reordering metric. 
*/ - u8 keepalive_probes; /* num of allowed keep alive probes */ u32 prior_ssthresh; /* ssthresh saved at recovery start */ u32 high_seq; /* snd_nxt at onset of congestion */ @@ -374,8 +375,6 @@ struct tcp_sock { u32 total_retrans; /* Total retransmits for entire connection */ u32 urg_seq; /* Seq of received urgent pointer */ - u32 snd_up; /* Urgent pointer */ - unsigned int keepalive_time; /* time before keep alive takes place */ unsigned int keepalive_intvl; /* time interval between keep alive probes */ diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 7d3fe571d15f..eccb7165a80c 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -497,10 +497,8 @@ static inline void skb_entail(struct sock *sk, struct sk_buff *skb) static inline void tcp_mark_urg(struct tcp_sock *tp, int flags, struct sk_buff *skb) { - if (flags & MSG_OOB) { - tp->urg_mode = 1; + if (flags & MSG_OOB) tp->snd_up = tp->write_seq; - } } static inline void tcp_push(struct sock *sk, int flags, int mss_now, diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 3b76bce769dd..c19f429dc443 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -2836,7 +2836,8 @@ static u32 tcp_tso_acked(struct sock *sk, struct sk_buff *skb) * is before the ack sequence we can discard it as it's confirmed to have * arrived at the other end. */ -static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets) +static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets, + u32 prior_snd_una) { struct tcp_sock *tp = tcp_sk(sk); const struct inet_connection_sock *icsk = inet_csk(sk); @@ -2903,9 +2904,6 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets) if (sacked & TCPCB_LOST) tp->lost_out -= acked_pcount; - if (unlikely(tp->urg_mode && !before(end_seq, tp->snd_up))) - tp->urg_mode = 0; - tp->packets_out -= acked_pcount; pkts_acked += acked_pcount; @@ -2935,6 +2933,9 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets) tp->lost_skb_hint = NULL; } + if (likely(between(tp->snd_up, prior_snd_una, tp->snd_una))) + tp->snd_up = tp->snd_una; + if (skb && (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)) flag |= FLAG_SACK_RENEGING; @@ -3311,7 +3312,7 @@ static int tcp_ack(struct sock *sk, struct sk_buff *skb, int flag) goto no_queue; /* See if we can take anything off of the retransmit queue. 
*/ - flag |= tcp_clean_rtx_queue(sk, prior_fackets); + flag |= tcp_clean_rtx_queue(sk, prior_fackets, prior_snd_una); if (tp->frto_counter) frto_cwnd = tcp_process_frto(sk, flag); diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index f976fc57892c..779f2e9d0689 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -395,6 +395,7 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req, newtp->pred_flags = 0; newtp->rcv_wup = newtp->copied_seq = newtp->rcv_nxt = treq->rcv_isn + 1; newtp->snd_sml = newtp->snd_una = newtp->snd_nxt = treq->snt_isn + 1; + newtp->snd_up = treq->snt_isn + 1; tcp_prequeue_init(newtp); diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 493553c71d32..990a58493235 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -345,6 +345,11 @@ static void tcp_init_nondata_skb(struct sk_buff *skb, u32 seq, u8 flags) TCP_SKB_CB(skb)->end_seq = seq; } +static inline int tcp_urg_mode(const struct tcp_sock *tp) +{ + return tp->snd_una != tp->snd_up; +} + #define OPTION_SACK_ADVERTISE (1 << 0) #define OPTION_TS (1 << 1) #define OPTION_MD5 (1 << 2) @@ -646,7 +651,8 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it, th->check = 0; th->urg_ptr = 0; - if (unlikely(tp->urg_mode && + /* The urg_mode check is necessary during a below snd_una win probe */ + if (unlikely(tcp_urg_mode(tp) && between(tp->snd_up, tcb->seq + 1, tcb->seq + 0xFFFF))) { th->urg_ptr = htons(tp->snd_up - tcb->seq); th->urg = 1; @@ -1012,7 +1018,7 @@ unsigned int tcp_sync_mss(struct sock *sk, u32 pmtu) /* Compute the current effective MSS, taking SACKs and IP options, * and even PMTU discovery events into account. * - * LARGESEND note: !urg_mode is overkill, only frames up to snd_up + * LARGESEND note: !tcp_urg_mode is overkill, only frames up to snd_up * cannot be large. However, taking into account rare use of URG, this * is not a big flaw. */ @@ -1029,7 +1035,7 @@ unsigned int tcp_current_mss(struct sock *sk, int large_allowed) mss_now = tp->mss_cache; - if (large_allowed && sk_can_gso(sk) && !tp->urg_mode) + if (large_allowed && sk_can_gso(sk) && !tcp_urg_mode(tp)) doing_tso = 1; if (dst) { @@ -1193,7 +1199,7 @@ static inline int tcp_nagle_test(struct tcp_sock *tp, struct sk_buff *skb, /* Don't use the nagle rule for urgent data (or for the final FIN). * Nagle can be ignored during F-RTO too (see RFC4138). */ - if (tp->urg_mode || (tp->frto_counter == 2) || + if (tcp_urg_mode(tp) || (tp->frto_counter == 2) || (TCP_SKB_CB(skb)->flags & TCPCB_FLAG_FIN)) return 1; @@ -2358,6 +2364,7 @@ static void tcp_connect_init(struct sock *sk) tcp_init_wl(tp, tp->write_seq, 0); tp->snd_una = tp->write_seq; tp->snd_sml = tp->write_seq; + tp->snd_up = tp->write_seq; tp->rcv_nxt = 0; tp->rcv_wup = 0; tp->copied_seq = 0; @@ -2567,8 +2574,7 @@ int tcp_write_wakeup(struct sock *sk) tcp_event_new_data_sent(sk, skb); return err; } else { - if (tp->urg_mode && - between(tp->snd_up, tp->snd_una + 1, tp->snd_una + 0xFFFF)) + if (between(tp->snd_up, tp->snd_una + 1, tp->snd_una + 0xFFFF)) tcp_xmit_probe_skb(sk, 1); return tcp_xmit_probe_skb(sk, 0); } -- cgit v1.2.3
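
The series above leans on two mechanisms that are easier to see outside the patches themselves. The sketches below are illustrative stand-alone programs, not kernel code; any addresses, ports, values and helper names in them are invented for the example.

First, retransmit_high is a raw 32-bit sequence number, so the "stop once we are at or past it" test in tcp_xmit_retransmit_queue() only works because the comparisons are wrap-safe. This sketch mirrors the before()/after() helpers from include/net/tcp.h and the termination predicate used in the first retransmit pass:

#include <stdint.h>
#include <stdio.h>

/* Wrap-safe sequence comparison, as defined in include/net/tcp.h:
 * seq1 is "before" seq2 when the signed 32-bit difference is negative,
 * which remains correct across the 2^32 wrap-around.
 */
static inline int before(uint32_t seq1, uint32_t seq2)
{
        return (int32_t)(seq1 - seq2) < 0;
}
#define after(seq2, seq1) before(seq1, seq2)

/* Termination predicate from the first retransmit pass: once a segment
 * starts at or past retransmit_high, no later segment can still carry
 * an L (lost) mark, so the walk may stop.
 */
static int past_retransmit_high(uint32_t skb_seq, uint32_t retransmit_high)
{
        return !before(skb_seq, retransmit_high);
}

int main(void)
{
        uint32_t retransmit_high = 0x00000100;  /* hypothetical terminator seqno */

        /* A segment just below the wrap point is still "before" the
         * terminator even though it is numerically much larger.
         */
        printf("%d\n", past_retransmit_high(0xfffffff0u, retransmit_high)); /* 0 */
        printf("%d\n", past_retransmit_high(0x00000100u, retransmit_high)); /* 1 */
        return 0;
}

Second, the "advertise MSS requested by user" change is simplest to observe from user space: with the patch applied, setting TCP_MAXSEG before connect() clamps not only the segments we send but also the MSS advertised in our SYN, so the peer's segments shrink as well. A minimal sketch, assuming a reachable peer (192.0.2.1:80 is a placeholder address and error handling is trimmed):

#include <arpa/inet.h>
#include <netinet/in.h>
#include <netinet/tcp.h>
#include <stdio.h>
#include <string.h>
#include <sys/socket.h>
#include <unistd.h>

int main(void)
{
        int fd = socket(AF_INET, SOCK_STREAM, 0);
        int mss = 500;                  /* requested MSS, as in the commit message */
        socklen_t len = sizeof(mss);
        struct sockaddr_in peer;

        if (fd < 0)
                return 1;

        /* Must be done before connect() so the SYN advertises the clamped MSS. */
        if (setsockopt(fd, IPPROTO_TCP, TCP_MAXSEG, &mss, sizeof(mss)) < 0)
                perror("setsockopt(TCP_MAXSEG)");

        memset(&peer, 0, sizeof(peer));
        peer.sin_family = AF_INET;
        peer.sin_port = htons(80);                        /* placeholder port */
        inet_pton(AF_INET, "192.0.2.1", &peer.sin_addr);  /* placeholder address */

        if (connect(fd, (struct sockaddr *)&peer, sizeof(peer)) < 0)
                perror("connect");

        /* The effective MSS for the connection can be read back afterwards. */
        if (getsockopt(fd, IPPROTO_TCP, TCP_MAXSEG, &mss, &len) == 0)
                printf("effective MSS: %d\n", mss);

        close(fd);
        return 0;
}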