From 2349262397b89a421adfd142aad2a7dd33710f26 Mon Sep 17 00:00:00 2001
From: Yuchung Cheng <ycheng@google.com>
Date: Wed, 30 Mar 2016 14:54:20 -0700
Subject: tcp: remove cwnd moderation after recovery

For non-SACK connections, cwnd is lowered to inflight plus 3 packets
when the recovery ends. This is an optional feature in the NewReno
RFC 2582 to reduce the potential burst when cwnd is "re-opened"
after recovery and inflight is low.

This feature is questionably effective because of PRR: when
the recovery ends (i.e., snd_una == high_seq) NewReno holds the
CA_Recovery state for another round trip to prevent false fast
retransmits. But if the inflight is low, PRR will overwrite the
moderated cwnd in tcp_cwnd_reduction() later regardlessly. So if a
receiver responds bogus ACKs (i.e., acking future data) to speed up
transfer after recovery, it can only induce a burst up to a window
worth of data packets by acking up to SND.NXT. A restart from (short)
idle or receiving streched ACKs can both cause such bursts as well.

On the other hand, if the recovery ends because the sender
detects the losses were spurious (e.g., reordering). This feature
unconditionally lowers a reverted cwnd even though nothing
was lost.

By principle loss recovery module should not update cwnd. Further
pacing is much more effective to reduce burst. Hence this patch
removes the cwnd moderation feature.

v2 changes: revised commit message on bogus ACKs and burst, and
            missing signature

Signed-off-by: Matt Mathis <mattmathis@google.com>
Signed-off-by: Neal Cardwell <ncardwell@google.com>
Signed-off-by: Soheil Hassas Yeganeh <soheil@google.com>
Signed-off-by: Yuchung Cheng <ycheng@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp_input.c | 11 -----------
 1 file changed, 11 deletions(-)

(limited to 'net/ipv4/tcp_input.c')

diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index e6e65f79ade8..f87b84a75691 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -2252,16 +2252,6 @@ static void tcp_update_scoreboard(struct sock *sk, int fast_rexmit)
 	}
 }
 
-/* CWND moderation, preventing bursts due to too big ACKs
- * in dubious situations.
- */
-static inline void tcp_moderate_cwnd(struct tcp_sock *tp)
-{
-	tp->snd_cwnd = min(tp->snd_cwnd,
-			   tcp_packets_in_flight(tp) + tcp_max_burst(tp));
-	tp->snd_cwnd_stamp = tcp_time_stamp;
-}
-
 static bool tcp_tsopt_ecr_before(const struct tcp_sock *tp, u32 when)
 {
 	return tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr &&
@@ -2410,7 +2400,6 @@ static bool tcp_try_undo_recovery(struct sock *sk)
 		/* Hold old state until something *above* high_seq
 		 * is ACKed. For Reno it is MUST to prevent false
 		 * fast retransmits (RFC2582). SACK TCP is safe. */
-		tcp_moderate_cwnd(tp);
 		if (!tcp_any_retrans_done(sk))
 			tp->retrans_stamp = 0;
 		return true;
-- 
cgit v1.2.3


From 6b084928baac562ed61866f540a96120e9c9ddb7 Mon Sep 17 00:00:00 2001
From: Soheil Hassas Yeganeh <soheil@google.com>
Date: Sat, 2 Apr 2016 23:08:08 -0400
Subject: tcp: use one bit in TCP_SKB_CB to mark ACK timestamps

Currently, to avoid a cache line miss for accessing skb_shinfo,
tcp_ack_tstamp skips socket that do not have
SOF_TIMESTAMPING_TX_ACK bit set in sk_tsflags. This is
implemented based on an implicit assumption that the
SOF_TIMESTAMPING_TX_ACK is set via socket options for the
duration that ACK timestamps are needed.

To implement per-write timestamps, this check should be
removed and replaced with a per-packet alternative that
quickly skips packets missing ACK timestamps marks without
a cache-line miss.

To enable per-packet marking without a cache line miss, use
one bit in TCP_SKB_CB to mark a whether a SKB might need a
ack tx timestamp or not. Further checks in tcp_ack_tstamp are not
modified and work as before.

Signed-off-by: Soheil Hassas Yeganeh <soheil@google.com>
Acked-by: Willem de Bruijn <willemb@google.com>
Acked-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/tcp.h    | 3 ++-
 net/ipv4/tcp.c       | 2 ++
 net/ipv4/tcp_input.c | 2 +-
 3 files changed, 5 insertions(+), 2 deletions(-)

(limited to 'net/ipv4/tcp_input.c')

diff --git a/include/net/tcp.h b/include/net/tcp.h
index f8bb4a4ed3d1..a23282996ca9 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -754,7 +754,8 @@ struct tcp_skb_cb {
 				TCPCB_REPAIRED)
 
 	__u8		ip_dsfield;	/* IPv4 tos or IPv6 dsfield	*/
-	/* 1 byte hole */
+	__u8		txstamp_ack:1,	/* Record TX timestamp for ack? */
+			unused:7;
 	__u32		ack_seq;	/* Sequence number ACK'd	*/
 	union {
 		struct inet_skb_parm	h4;
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 08b8b960a8ed..ce3c9eb901a0 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -432,10 +432,12 @@ static void tcp_tx_timestamp(struct sock *sk, struct sk_buff *skb)
 {
 	if (sk->sk_tsflags) {
 		struct skb_shared_info *shinfo = skb_shinfo(skb);
+		struct tcp_skb_cb *tcb = TCP_SKB_CB(skb);
 
 		sock_tx_timestamp(sk, &shinfo->tx_flags);
 		if (shinfo->tx_flags & SKBTX_ANY_TSTAMP)
 			shinfo->tskey = TCP_SKB_CB(skb)->seq + skb->len - 1;
+		tcb->txstamp_ack = !!(shinfo->tx_flags & SKBTX_ACK_TSTAMP);
 	}
 }
 
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index f87b84a75691..a26e2d262358 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -3082,7 +3082,7 @@ static void tcp_ack_tstamp(struct sock *sk, struct sk_buff *skb,
 	const struct skb_shared_info *shinfo;
 
 	/* Avoid cache line misses to get skb_shinfo() and shinfo->tx_flags */
-	if (likely(!(sk->sk_tsflags & SOF_TIMESTAMPING_TX_ACK)))
+	if (likely(!TCP_SKB_CB(skb)->txstamp_ack))
 		return;
 
 	shinfo = skb_shinfo(skb);
-- 
cgit v1.2.3


From 532182cd610782db8c18230c2747626562032205 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Fri, 1 Apr 2016 08:52:19 -0700
Subject: tcp: increment sk_drops for dropped rx packets

Now ss can report sk_drops, we can instruct TCP to increment
this per socket counter when it drops an incoming frame, to refine
monitoring and debugging.

Following patch takes care of listeners drops.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/sock.h   |  7 +++++++
 net/ipv4/tcp_input.c | 33 ++++++++++++++++++++-------------
 net/ipv4/tcp_ipv4.c  |  1 +
 net/ipv6/tcp_ipv6.c  |  1 +
 4 files changed, 29 insertions(+), 13 deletions(-)

(limited to 'net/ipv4/tcp_input.c')

diff --git a/include/net/sock.h b/include/net/sock.h
index 7ad73db9dde2..310c4367ea83 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -2012,6 +2012,13 @@ sock_skb_set_dropcount(const struct sock *sk, struct sk_buff *skb)
 	SOCK_SKB_CB(skb)->dropcount = atomic_read(&sk->sk_drops);
 }
 
+static inline void sk_drops_add(struct sock *sk, const struct sk_buff *skb)
+{
+	int segs = max_t(u16, 1, skb_shinfo(skb)->gso_segs);
+
+	atomic_add(segs, &sk->sk_drops);
+}
+
 void __sock_recv_timestamp(struct msghdr *msg, struct sock *sk,
 			   struct sk_buff *skb);
 void __sock_recv_wifi_status(struct msghdr *msg, struct sock *sk,
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index a26e2d262358..0ffcd07e3409 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -4307,6 +4307,12 @@ static bool tcp_try_coalesce(struct sock *sk,
 	return true;
 }
 
+static void tcp_drop(struct sock *sk, struct sk_buff *skb)
+{
+	sk_drops_add(sk, skb);
+	__kfree_skb(skb);
+}
+
 /* This one checks to see if we can put data from the
  * out_of_order queue into the receive_queue.
  */
@@ -4331,7 +4337,7 @@ static void tcp_ofo_queue(struct sock *sk)
 		__skb_unlink(skb, &tp->out_of_order_queue);
 		if (!after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt)) {
 			SOCK_DEBUG(sk, "ofo packet was already received\n");
-			__kfree_skb(skb);
+			tcp_drop(sk, skb);
 			continue;
 		}
 		SOCK_DEBUG(sk, "ofo requeuing : rcv_next %X seq %X - %X\n",
@@ -4383,7 +4389,7 @@ static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb)
 
 	if (unlikely(tcp_try_rmem_schedule(sk, skb, skb->truesize))) {
 		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPOFODROP);
-		__kfree_skb(skb);
+		tcp_drop(sk, skb);
 		return;
 	}
 
@@ -4447,7 +4453,7 @@ static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb)
 		if (!after(end_seq, TCP_SKB_CB(skb1)->end_seq)) {
 			/* All the bits are present. Drop. */
 			NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPOFOMERGE);
-			__kfree_skb(skb);
+			tcp_drop(sk, skb);
 			skb = NULL;
 			tcp_dsack_set(sk, seq, end_seq);
 			goto add_sack;
@@ -4486,7 +4492,7 @@ static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb)
 		tcp_dsack_extend(sk, TCP_SKB_CB(skb1)->seq,
 				 TCP_SKB_CB(skb1)->end_seq);
 		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPOFOMERGE);
-		__kfree_skb(skb1);
+		tcp_drop(sk, skb1);
 	}
 
 add_sack:
@@ -4569,12 +4575,13 @@ err:
 static void tcp_data_queue(struct sock *sk, struct sk_buff *skb)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
-	int eaten = -1;
 	bool fragstolen = false;
+	int eaten = -1;
 
-	if (TCP_SKB_CB(skb)->seq == TCP_SKB_CB(skb)->end_seq)
-		goto drop;
-
+	if (TCP_SKB_CB(skb)->seq == TCP_SKB_CB(skb)->end_seq) {
+		__kfree_skb(skb);
+		return;
+	}
 	skb_dst_drop(skb);
 	__skb_pull(skb, tcp_hdr(skb)->doff * 4);
 
@@ -4656,7 +4663,7 @@ out_of_window:
 		tcp_enter_quickack_mode(sk);
 		inet_csk_schedule_ack(sk);
 drop:
-		__kfree_skb(skb);
+		tcp_drop(sk, skb);
 		return;
 	}
 
@@ -5233,7 +5240,7 @@ syn_challenge:
 	return true;
 
 discard:
-	__kfree_skb(skb);
+	tcp_drop(sk, skb);
 	return false;
 }
 
@@ -5451,7 +5458,7 @@ csum_error:
 	TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_INERRS);
 
 discard:
-	__kfree_skb(skb);
+	tcp_drop(sk, skb);
 }
 EXPORT_SYMBOL(tcp_rcv_established);
 
@@ -5682,7 +5689,7 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
 						  TCP_DELACK_MAX, TCP_RTO_MAX);
 
 discard:
-			__kfree_skb(skb);
+			tcp_drop(sk, skb);
 			return 0;
 		} else {
 			tcp_send_ack(sk);
@@ -6043,7 +6050,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb)
 
 	if (!queued) {
 discard:
-		__kfree_skb(skb);
+		tcp_drop(sk, skb);
 	}
 	return 0;
 }
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index e5f924b29946..059a98f5e7e1 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -1694,6 +1694,7 @@ discard_it:
 	return 0;
 
 discard_and_relse:
+	sk_drops_add(sk, skb);
 	if (refcounted)
 		sock_put(sk);
 	goto discard_it;
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index f0422e782731..5fa8fea394c9 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -1484,6 +1484,7 @@ discard_it:
 	return 0;
 
 discard_and_relse:
+	sk_drops_add(sk, skb);
 	if (refcounted)
 		sock_put(sk);
 	goto discard_it;
-- 
cgit v1.2.3


From 9caad864151e525929d323de96cad382da49c3b2 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Fri, 1 Apr 2016 08:52:20 -0700
Subject: tcp: increment sk_drops for listeners

Goal: packets dropped by a listener are accounted for.

This adds tcp_listendrop() helper, and clears sk_drops in sk_clone_lock()
so that children do not inherit their parent drop count.

Note that we no longer increment LINUX_MIB_LISTENDROPS counter when
sending a SYNCOOKIE, since the SYN packet generated a SYNACK.
We already have a separate LINUX_MIB_SYNCOOKIESSENT

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/tcp.h    | 13 +++++++++++++
 net/core/sock.c      |  1 +
 net/ipv4/tcp_input.c |  8 +++++---
 net/ipv4/tcp_ipv4.c  |  6 +++---
 net/ipv6/tcp_ipv6.c  |  4 ++--
 5 files changed, 24 insertions(+), 8 deletions(-)

(limited to 'net/ipv4/tcp_input.c')

diff --git a/include/net/tcp.h b/include/net/tcp.h
index a23282996ca9..74d3ed5eb219 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -1836,4 +1836,17 @@ static inline void tcp_segs_in(struct tcp_sock *tp, const struct sk_buff *skb)
 		tp->data_segs_in += segs_in;
 }
 
+/*
+ * TCP listen path runs lockless.
+ * We forced "struct sock" to be const qualified to make sure
+ * we don't modify one of its field by mistake.
+ * Here, we increment sk_drops which is an atomic_t, so we can safely
+ * make sock writable again.
+ */
+static inline void tcp_listendrop(const struct sock *sk)
+{
+	atomic_inc(&((struct sock *)sk)->sk_drops);
+	NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS);
+}
+
 #endif	/* _TCP_H */
diff --git a/net/core/sock.c b/net/core/sock.c
index 7a6a063b28b3..2f517ea56786 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -1525,6 +1525,7 @@ struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
 		newsk->sk_dst_cache	= NULL;
 		newsk->sk_wmem_queued	= 0;
 		newsk->sk_forward_alloc = 0;
+		atomic_set(&newsk->sk_drops, 0);
 		newsk->sk_send_head	= NULL;
 		newsk->sk_userlocks	= sk->sk_userlocks & ~SOCK_BINDPORT_LOCK;
 
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 0ffcd07e3409..983f04c11177 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -6339,8 +6339,10 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops,
 			inet_csk_reqsk_queue_hash_add(sk, req, TCP_TIMEOUT_INIT);
 		af_ops->send_synack(sk, dst, &fl, req,
 				    &foc, !want_cookie);
-		if (want_cookie)
-			goto drop_and_free;
+		if (want_cookie) {
+			reqsk_free(req);
+			return 0;
+		}
 	}
 	reqsk_put(req);
 	return 0;
@@ -6350,7 +6352,7 @@ drop_and_release:
 drop_and_free:
 	reqsk_free(req);
 drop:
-	NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS);
+	tcp_listendrop(sk);
 	return 0;
 }
 EXPORT_SYMBOL(tcp_conn_request);
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 059a98f5e7e1..f3ce0afe70aa 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -329,7 +329,7 @@ void tcp_req_err(struct sock *sk, u32 seq, bool abort)
 		 * errors returned from accept().
 		 */
 		inet_csk_reqsk_queue_drop(req->rsk_listener, req);
-		NET_INC_STATS_BH(net, LINUX_MIB_LISTENDROPS);
+		tcp_listendrop(req->rsk_listener);
 	}
 	reqsk_put(req);
 }
@@ -1246,7 +1246,7 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
 				&tcp_request_sock_ipv4_ops, sk, skb);
 
 drop:
-	NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS);
+	tcp_listendrop(sk);
 	return 0;
 }
 EXPORT_SYMBOL(tcp_v4_conn_request);
@@ -1348,7 +1348,7 @@ exit_overflow:
 exit_nonewsk:
 	dst_release(dst);
 exit:
-	NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS);
+	tcp_listendrop(sk);
 	return NULL;
 put_and_exit:
 	inet_csk_prepare_forced_close(newsk);
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 5fa8fea394c9..7cde1b6fdda3 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -964,7 +964,7 @@ static int tcp_v6_conn_request(struct sock *sk, struct sk_buff *skb)
 				&tcp_request_sock_ipv6_ops, sk, skb);
 
 drop:
-	NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS);
+	tcp_listendrop(sk);
 	return 0; /* don't send reset */
 }
 
@@ -1169,7 +1169,7 @@ out_overflow:
 out_nonewsk:
 	dst_release(dst);
 out:
-	NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS);
+	tcp_listendrop(sk);
 	return NULL;
 }
 
-- 
cgit v1.2.3