summaryrefslogtreecommitdiff
path: root/net/ipv4/tcp_input.c
diff options
context:
space:
mode:
Diffstat (limited to 'net/ipv4/tcp_input.c')
-rw-r--r--net/ipv4/tcp_input.c417
1 files changed, 295 insertions, 122 deletions
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index f501ac048366..728f5b3d3c64 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -359,7 +359,7 @@ static void tcp_grow_window(struct sock *sk, const struct sk_buff *skb)
/* Check #1 */
if (tp->rcv_ssthresh < tp->window_clamp &&
(int)tp->rcv_ssthresh < tcp_space(sk) &&
- !sk_under_memory_pressure(sk)) {
+ !tcp_under_memory_pressure(sk)) {
int incr;
/* Check #2. Increase window, if skb with such overhead
@@ -446,7 +446,7 @@ static void tcp_clamp_window(struct sock *sk)
if (sk->sk_rcvbuf < sysctl_tcp_rmem[2] &&
!(sk->sk_userlocks & SOCK_RCVBUF_LOCK) &&
- !sk_under_memory_pressure(sk) &&
+ !tcp_under_memory_pressure(sk) &&
sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0)) {
sk->sk_rcvbuf = min(atomic_read(&sk->sk_rmem_alloc),
sysctl_tcp_rmem[2]);
@@ -866,7 +866,7 @@ static void tcp_update_reordering(struct sock *sk, const int metric,
/* This must be called before lost_out is incremented */
static void tcp_verify_retransmit_hint(struct tcp_sock *tp, struct sk_buff *skb)
{
- if ((tp->retransmit_skb_hint == NULL) ||
+ if (!tp->retransmit_skb_hint ||
before(TCP_SKB_CB(skb)->seq,
TCP_SKB_CB(tp->retransmit_skb_hint)->seq))
tp->retransmit_skb_hint = skb;
@@ -1130,7 +1130,12 @@ static bool tcp_check_dsack(struct sock *sk, const struct sk_buff *ack_skb,
struct tcp_sacktag_state {
int reord;
int fack_count;
- long rtt_us; /* RTT measured by SACKing never-retransmitted data */
+ /* Timestamps for earliest and latest never-retransmitted segment
+ * that was SACKed. RTO needs the earliest RTT to stay conservative,
+ * but congestion control should still get an accurate delay signal.
+ */
+ struct skb_mstamp first_sackt;
+ struct skb_mstamp last_sackt;
int flag;
};
@@ -1233,14 +1238,9 @@ static u8 tcp_sacktag_one(struct sock *sk,
state->reord);
if (!after(end_seq, tp->high_seq))
state->flag |= FLAG_ORIG_SACK_ACKED;
- /* Pick the earliest sequence sacked for RTT */
- if (state->rtt_us < 0) {
- struct skb_mstamp now;
-
- skb_mstamp_get(&now);
- state->rtt_us = skb_mstamp_us_delta(&now,
- xmit_time);
- }
+ if (state->first_sackt.v64 == 0)
+ state->first_sackt = *xmit_time;
+ state->last_sackt = *xmit_time;
}
if (sacked & TCPCB_LOST) {
@@ -1256,7 +1256,7 @@ static u8 tcp_sacktag_one(struct sock *sk,
fack_count += pcount;
/* Lost marker hint past SACKed? Tweak RFC3517 cnt */
- if (!tcp_is_fack(tp) && (tp->lost_skb_hint != NULL) &&
+ if (!tcp_is_fack(tp) && tp->lost_skb_hint &&
before(start_seq, TCP_SKB_CB(tp->lost_skb_hint)->seq))
tp->lost_cnt_hint += pcount;
@@ -1316,16 +1316,12 @@ static bool tcp_shifted_skb(struct sock *sk, struct sk_buff *skb,
* code can come after this skb later on it's better to keep
* setting gso_size to something.
*/
- if (!skb_shinfo(prev)->gso_size) {
- skb_shinfo(prev)->gso_size = mss;
- skb_shinfo(prev)->gso_type = sk->sk_gso_type;
- }
+ if (!TCP_SKB_CB(prev)->tcp_gso_size)
+ TCP_SKB_CB(prev)->tcp_gso_size = mss;
/* CHECKME: To clear or not to clear? Mimics normal skb currently */
- if (tcp_skb_pcount(skb) <= 1) {
- skb_shinfo(skb)->gso_size = 0;
- skb_shinfo(skb)->gso_type = 0;
- }
+ if (tcp_skb_pcount(skb) <= 1)
+ TCP_SKB_CB(skb)->tcp_gso_size = 0;
/* Difference in this won't matter, both ACKed by the same cumul. ACK */
TCP_SKB_CB(prev)->sacked |= (TCP_SKB_CB(skb)->sacked & TCPCB_EVER_RETRANS);
@@ -1535,7 +1531,7 @@ static struct sk_buff *tcp_sacktag_walk(struct sk_buff *skb, struct sock *sk,
if (!before(TCP_SKB_CB(skb)->seq, end_seq))
break;
- if ((next_dup != NULL) &&
+ if (next_dup &&
before(TCP_SKB_CB(skb)->seq, next_dup->end_seq)) {
in_sack = tcp_match_skb_to_sack(sk, skb,
next_dup->start_seq,
@@ -1551,7 +1547,7 @@ static struct sk_buff *tcp_sacktag_walk(struct sk_buff *skb, struct sock *sk,
if (in_sack <= 0) {
tmp = tcp_shift_skb_data(sk, skb, state,
start_seq, end_seq, dup_sack);
- if (tmp != NULL) {
+ if (tmp) {
if (tmp != skb) {
skb = tmp;
continue;
@@ -1614,7 +1610,7 @@ static struct sk_buff *tcp_maybe_skipping_dsack(struct sk_buff *skb,
struct tcp_sacktag_state *state,
u32 skip_to_seq)
{
- if (next_dup == NULL)
+ if (!next_dup)
return skb;
if (before(next_dup->start_seq, skip_to_seq)) {
@@ -1634,7 +1630,7 @@ static int tcp_sack_cache_ok(const struct tcp_sock *tp, const struct tcp_sack_bl
static int
tcp_sacktag_write_queue(struct sock *sk, const struct sk_buff *ack_skb,
- u32 prior_snd_una, long *sack_rtt_us)
+ u32 prior_snd_una, struct tcp_sacktag_state *state)
{
struct tcp_sock *tp = tcp_sk(sk);
const unsigned char *ptr = (skb_transport_header(ack_skb) +
@@ -1642,7 +1638,6 @@ tcp_sacktag_write_queue(struct sock *sk, const struct sk_buff *ack_skb,
struct tcp_sack_block_wire *sp_wire = (struct tcp_sack_block_wire *)(ptr+2);
struct tcp_sack_block sp[TCP_NUM_SACKS];
struct tcp_sack_block *cache;
- struct tcp_sacktag_state state;
struct sk_buff *skb;
int num_sacks = min(TCP_NUM_SACKS, (ptr[1] - TCPOLEN_SACK_BASE) >> 3);
int used_sacks;
@@ -1650,9 +1645,8 @@ tcp_sacktag_write_queue(struct sock *sk, const struct sk_buff *ack_skb,
int i, j;
int first_sack_index;
- state.flag = 0;
- state.reord = tp->packets_out;
- state.rtt_us = -1L;
+ state->flag = 0;
+ state->reord = tp->packets_out;
if (!tp->sacked_out) {
if (WARN_ON(tp->fackets_out))
@@ -1663,7 +1657,7 @@ tcp_sacktag_write_queue(struct sock *sk, const struct sk_buff *ack_skb,
found_dup_sack = tcp_check_dsack(sk, ack_skb, sp_wire,
num_sacks, prior_snd_una);
if (found_dup_sack)
- state.flag |= FLAG_DSACKING_ACK;
+ state->flag |= FLAG_DSACKING_ACK;
/* Eliminate too old ACKs, but take into
* account more or less fresh ones, they can
@@ -1728,7 +1722,7 @@ tcp_sacktag_write_queue(struct sock *sk, const struct sk_buff *ack_skb,
}
skb = tcp_write_queue_head(sk);
- state.fack_count = 0;
+ state->fack_count = 0;
i = 0;
if (!tp->sacked_out) {
@@ -1762,10 +1756,10 @@ tcp_sacktag_write_queue(struct sock *sk, const struct sk_buff *ack_skb,
/* Head todo? */
if (before(start_seq, cache->start_seq)) {
- skb = tcp_sacktag_skip(skb, sk, &state,
+ skb = tcp_sacktag_skip(skb, sk, state,
start_seq);
skb = tcp_sacktag_walk(skb, sk, next_dup,
- &state,
+ state,
start_seq,
cache->start_seq,
dup_sack);
@@ -1776,21 +1770,21 @@ tcp_sacktag_write_queue(struct sock *sk, const struct sk_buff *ack_skb,
goto advance_sp;
skb = tcp_maybe_skipping_dsack(skb, sk, next_dup,
- &state,
+ state,
cache->end_seq);
/* ...tail remains todo... */
if (tcp_highest_sack_seq(tp) == cache->end_seq) {
/* ...but better entrypoint exists! */
skb = tcp_highest_sack(sk);
- if (skb == NULL)
+ if (!skb)
break;
- state.fack_count = tp->fackets_out;
+ state->fack_count = tp->fackets_out;
cache++;
goto walk;
}
- skb = tcp_sacktag_skip(skb, sk, &state, cache->end_seq);
+ skb = tcp_sacktag_skip(skb, sk, state, cache->end_seq);
/* Check overlap against next cached too (past this one already) */
cache++;
continue;
@@ -1798,14 +1792,14 @@ tcp_sacktag_write_queue(struct sock *sk, const struct sk_buff *ack_skb,
if (!before(start_seq, tcp_highest_sack_seq(tp))) {
skb = tcp_highest_sack(sk);
- if (skb == NULL)
+ if (!skb)
break;
- state.fack_count = tp->fackets_out;
+ state->fack_count = tp->fackets_out;
}
- skb = tcp_sacktag_skip(skb, sk, &state, start_seq);
+ skb = tcp_sacktag_skip(skb, sk, state, start_seq);
walk:
- skb = tcp_sacktag_walk(skb, sk, next_dup, &state,
+ skb = tcp_sacktag_walk(skb, sk, next_dup, state,
start_seq, end_seq, dup_sack);
advance_sp:
@@ -1820,14 +1814,12 @@ advance_sp:
for (j = 0; j < used_sacks; j++)
tp->recv_sack_cache[i++] = sp[j];
- tcp_mark_lost_retrans(sk);
-
- tcp_verify_left_out(tp);
-
- if ((state.reord < tp->fackets_out) &&
+ if ((state->reord < tp->fackets_out) &&
((inet_csk(sk)->icsk_ca_state != TCP_CA_Loss) || tp->undo_marker))
- tcp_update_reordering(sk, tp->fackets_out - state.reord, 0);
+ tcp_update_reordering(sk, tp->fackets_out - state->reord, 0);
+ tcp_mark_lost_retrans(sk);
+ tcp_verify_left_out(tp);
out:
#if FASTRETRANS_DEBUG > 0
@@ -1836,8 +1828,7 @@ out:
WARN_ON((int)tp->retrans_out < 0);
WARN_ON((int)tcp_packets_in_flight(tp) < 0);
#endif
- *sack_rtt_us = state.rtt_us;
- return state.flag;
+ return state->flag;
}
/* Limits sacked_out so that sum with lost_out isn't ever larger than
@@ -1926,14 +1917,13 @@ void tcp_enter_loss(struct sock *sk)
const struct inet_connection_sock *icsk = inet_csk(sk);
struct tcp_sock *tp = tcp_sk(sk);
struct sk_buff *skb;
- bool new_recovery = false;
+ bool new_recovery = icsk->icsk_ca_state < TCP_CA_Recovery;
bool is_reneg; /* is receiver reneging on SACKs? */
/* Reduce ssthresh if it has not yet been made inside this window. */
if (icsk->icsk_ca_state <= TCP_CA_Disorder ||
!after(tp->high_seq, tp->snd_una) ||
(icsk->icsk_ca_state == TCP_CA_Loss && !icsk->icsk_retransmits)) {
- new_recovery = true;
tp->prior_ssthresh = tcp_current_ssthresh(sk);
tp->snd_ssthresh = icsk->icsk_ca_ops->ssthresh(sk);
tcp_ca_event(sk, CA_EVENT_LOSS);
@@ -2257,7 +2247,7 @@ static void tcp_mark_head_lost(struct sock *sk, int packets, int mark_head)
(oldcnt >= packets))
break;
- mss = skb_shinfo(skb)->gso_size;
+ mss = tcp_skb_mss(skb);
err = tcp_fragment(sk, skb, (packets - oldcnt) * mss,
mss, GFP_ATOMIC);
if (err < 0)
@@ -2557,6 +2547,7 @@ void tcp_enter_cwr(struct sock *sk)
tcp_set_ca_state(sk, TCP_CA_CWR);
}
}
+EXPORT_SYMBOL(tcp_enter_cwr);
static void tcp_try_keep_open(struct sock *sk)
{
@@ -2700,16 +2691,21 @@ static void tcp_process_loss(struct sock *sk, int flag, bool is_dupack)
struct tcp_sock *tp = tcp_sk(sk);
bool recovered = !before(tp->snd_una, tp->high_seq);
+ if ((flag & FLAG_SND_UNA_ADVANCED) &&
+ tcp_try_undo_loss(sk, false))
+ return;
+
if (tp->frto) { /* F-RTO RFC5682 sec 3.1 (sack enhanced version). */
/* Step 3.b. A timeout is spurious if not all data are
* lost, i.e., never-retransmitted data are (s)acked.
*/
- if (tcp_try_undo_loss(sk, flag & FLAG_ORIG_SACK_ACKED))
+ if ((flag & FLAG_ORIG_SACK_ACKED) &&
+ tcp_try_undo_loss(sk, true))
return;
- if (after(tp->snd_nxt, tp->high_seq) &&
- (flag & FLAG_DATA_SACKED || is_dupack)) {
- tp->frto = 0; /* Loss was real: 2nd part of step 3.a */
+ if (after(tp->snd_nxt, tp->high_seq)) {
+ if (flag & FLAG_DATA_SACKED || is_dupack)
+ tp->frto = 0; /* Step 3.a. loss was real */
} else if (flag & FLAG_SND_UNA_ADVANCED && !recovered) {
tp->high_seq = tp->snd_nxt;
__tcp_push_pending_frames(sk, tcp_current_mss(sk),
@@ -2734,8 +2730,6 @@ static void tcp_process_loss(struct sock *sk, int flag, bool is_dupack)
else if (flag & FLAG_SND_UNA_ADVANCED)
tcp_reset_reno_sack(tp);
}
- if (tcp_try_undo_loss(sk, false))
- return;
tcp_xmit_retransmit_queue(sk);
}
@@ -3054,7 +3048,8 @@ static void tcp_ack_tstamp(struct sock *sk, struct sk_buff *skb,
* arrived at the other end.
*/
static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
- u32 prior_snd_una, long sack_rtt_us)
+ u32 prior_snd_una,
+ struct tcp_sacktag_state *sack)
{
const struct inet_connection_sock *icsk = inet_csk(sk);
struct skb_mstamp first_ackt, last_ackt, now;
@@ -3062,8 +3057,9 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
u32 prior_sacked = tp->sacked_out;
u32 reord = tp->packets_out;
bool fully_acked = true;
- long ca_seq_rtt_us = -1L;
+ long sack_rtt_us = -1L;
long seq_rtt_us = -1L;
+ long ca_rtt_us = -1L;
struct sk_buff *skb;
u32 pkts_acked = 0;
bool rtt_update;
@@ -3099,17 +3095,15 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
if (sacked & TCPCB_SACKED_RETRANS)
tp->retrans_out -= acked_pcount;
flag |= FLAG_RETRANS_DATA_ACKED;
- } else {
+ } else if (!(sacked & TCPCB_SACKED_ACKED)) {
last_ackt = skb->skb_mstamp;
WARN_ON_ONCE(last_ackt.v64 == 0);
if (!first_ackt.v64)
first_ackt = last_ackt;
- if (!(sacked & TCPCB_SACKED_ACKED)) {
- reord = min(pkts_acked, reord);
- if (!after(scb->end_seq, tp->high_seq))
- flag |= FLAG_ORIG_SACK_ACKED;
- }
+ reord = min(pkts_acked, reord);
+ if (!after(scb->end_seq, tp->high_seq))
+ flag |= FLAG_ORIG_SACK_ACKED;
}
if (sacked & TCPCB_SACKED_ACKED)
@@ -3154,15 +3148,16 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
skb_mstamp_get(&now);
if (likely(first_ackt.v64)) {
seq_rtt_us = skb_mstamp_us_delta(&now, &first_ackt);
- ca_seq_rtt_us = skb_mstamp_us_delta(&now, &last_ackt);
+ ca_rtt_us = skb_mstamp_us_delta(&now, &last_ackt);
+ }
+ if (sack->first_sackt.v64) {
+ sack_rtt_us = skb_mstamp_us_delta(&now, &sack->first_sackt);
+ ca_rtt_us = skb_mstamp_us_delta(&now, &sack->last_sackt);
}
rtt_update = tcp_ack_update_rtt(sk, flag, seq_rtt_us, sack_rtt_us);
if (flag & FLAG_ACKED) {
- const struct tcp_congestion_ops *ca_ops
- = inet_csk(sk)->icsk_ca_ops;
-
tcp_rearm_rto(sk);
if (unlikely(icsk->icsk_mtup.probe_size &&
!after(tp->mtu_probe.probe_seq_end, tp->snd_una))) {
@@ -3185,11 +3180,6 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
tp->fackets_out -= min(pkts_acked, tp->fackets_out);
- if (ca_ops->pkts_acked) {
- long rtt_us = min_t(ulong, ca_seq_rtt_us, sack_rtt_us);
- ca_ops->pkts_acked(sk, pkts_acked, rtt_us);
- }
-
} else if (skb && rtt_update && sack_rtt_us >= 0 &&
sack_rtt_us > skb_mstamp_us_delta(&now, &skb->skb_mstamp)) {
/* Do not re-arm RTO if the sack RTT is measured from data sent
@@ -3199,6 +3189,9 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
tcp_rearm_rto(sk);
}
+ if (icsk->icsk_ca_ops->pkts_acked)
+ icsk->icsk_ca_ops->pkts_acked(sk, pkts_acked, ca_rtt_us);
+
#if FASTRETRANS_DEBUG > 0
WARN_ON((int)tp->sacked_out < 0);
WARN_ON((int)tp->lost_out < 0);
@@ -3239,7 +3232,7 @@ static void tcp_ack_probe(struct sock *sk)
* This function is not for random using!
*/
} else {
- unsigned long when = inet_csk_rto_backoff(icsk, TCP_RTO_MAX);
+ unsigned long when = tcp_probe0_when(sk, TCP_RTO_MAX);
inet_csk_reset_xmit_timer(sk, ICSK_TIME_PROBE0,
when, TCP_RTO_MAX);
@@ -3282,6 +3275,28 @@ static inline bool tcp_may_update_window(const struct tcp_sock *tp,
(ack_seq == tp->snd_wl1 && nwin > tp->snd_wnd);
}
+/* If we update tp->snd_una, also update tp->bytes_acked */
+static void tcp_snd_una_update(struct tcp_sock *tp, u32 ack)
+{
+ u32 delta = ack - tp->snd_una;
+
+ u64_stats_update_begin(&tp->syncp);
+ tp->bytes_acked += delta;
+ u64_stats_update_end(&tp->syncp);
+ tp->snd_una = ack;
+}
+
+/* If we update tp->rcv_nxt, also update tp->bytes_received */
+static void tcp_rcv_nxt_update(struct tcp_sock *tp, u32 seq)
+{
+ u32 delta = seq - tp->rcv_nxt;
+
+ u64_stats_update_begin(&tp->syncp);
+ tp->bytes_received += delta;
+ u64_stats_update_end(&tp->syncp);
+ tp->rcv_nxt = seq;
+}
+
/* Update our send window.
*
* Window update algorithm, described in RFC793/RFC1122 (used in linux-2.2
@@ -3317,11 +3332,41 @@ static int tcp_ack_update_window(struct sock *sk, const struct sk_buff *skb, u32
}
}
- tp->snd_una = ack;
+ tcp_snd_una_update(tp, ack);
return flag;
}
+/* Return true if we're currently rate-limiting out-of-window ACKs and
+ * thus shouldn't send a dupack right now. We rate-limit dupacks in
+ * response to out-of-window SYNs or ACKs to mitigate ACK loops or DoS
+ * attacks that send repeated SYNs or ACKs for the same connection. To
+ * do this, we do not send a duplicate SYNACK or ACK if the remote
+ * endpoint is sending out-of-window SYNs or pure ACKs at a high rate.
+ */
+bool tcp_oow_rate_limited(struct net *net, const struct sk_buff *skb,
+ int mib_idx, u32 *last_oow_ack_time)
+{
+ /* Data packets without SYNs are not likely part of an ACK loop. */
+ if ((TCP_SKB_CB(skb)->seq != TCP_SKB_CB(skb)->end_seq) &&
+ !tcp_hdr(skb)->syn)
+ goto not_rate_limited;
+
+ if (*last_oow_ack_time) {
+ s32 elapsed = (s32)(tcp_time_stamp - *last_oow_ack_time);
+
+ if (0 <= elapsed && elapsed < sysctl_tcp_invalid_ratelimit) {
+ NET_INC_STATS_BH(net, mib_idx);
+ return true; /* rate-limited: don't send yet! */
+ }
+ }
+
+ *last_oow_ack_time = tcp_time_stamp;
+
+not_rate_limited:
+ return false; /* not rate-limited: go ahead, send dupack now! */
+}
+
/* RFC 5961 7 [ACK Throttling] */
static void tcp_send_challenge_ack(struct sock *sk, const struct sk_buff *skb)
{
@@ -3415,6 +3460,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
{
struct inet_connection_sock *icsk = inet_csk(sk);
struct tcp_sock *tp = tcp_sk(sk);
+ struct tcp_sacktag_state sack_state;
u32 prior_snd_una = tp->snd_una;
u32 ack_seq = TCP_SKB_CB(skb)->seq;
u32 ack = TCP_SKB_CB(skb)->ack_seq;
@@ -3423,7 +3469,8 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
int prior_packets = tp->packets_out;
const int prior_unsacked = tp->packets_out - tp->sacked_out;
int acked = 0; /* Number of packets newly acked */
- long sack_rtt_us = -1L;
+
+ sack_state.first_sackt.v64 = 0;
/* We very likely will need to access write queue head. */
prefetchw(sk->sk_write_queue.next);
@@ -3469,7 +3516,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
* Note, we use the fact that SND.UNA>=SND.WL2.
*/
tcp_update_wl(tp, ack_seq);
- tp->snd_una = ack;
+ tcp_snd_una_update(tp, ack);
flag |= FLAG_WIN_UPDATE;
tcp_in_ack_event(sk, CA_ACK_WIN_UPDATE);
@@ -3487,7 +3534,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
if (TCP_SKB_CB(skb)->sacked)
flag |= tcp_sacktag_write_queue(sk, skb, prior_snd_una,
- &sack_rtt_us);
+ &sack_state);
if (tcp_ecn_rcv_ecn_echo(tp, tcp_hdr(skb))) {
flag |= FLAG_ECE;
@@ -3512,7 +3559,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
/* See if we can take anything off of the retransmit queue. */
acked = tp->packets_out;
flag |= tcp_clean_rtx_queue(sk, prior_fackets, prior_snd_una,
- sack_rtt_us);
+ &sack_state);
acked -= tp->packets_out;
/* Advance cwnd if state allows */
@@ -3564,7 +3611,7 @@ old_ack:
*/
if (TCP_SKB_CB(skb)->sacked) {
flag |= tcp_sacktag_write_queue(sk, skb, prior_snd_una,
- &sack_rtt_us);
+ &sack_state);
tcp_fastretrans_alert(sk, acked, prior_unsacked,
is_dupack, flag);
}
@@ -3573,6 +3620,23 @@ old_ack:
return 0;
}
+static void tcp_parse_fastopen_option(int len, const unsigned char *cookie,
+ bool syn, struct tcp_fastopen_cookie *foc,
+ bool exp_opt)
+{
+ /* Valid only in SYN or SYN-ACK with an even length. */
+ if (!foc || !syn || len < 0 || (len & 1))
+ return;
+
+ if (len >= TCP_FASTOPEN_COOKIE_MIN &&
+ len <= TCP_FASTOPEN_COOKIE_MAX)
+ memcpy(foc->val, cookie, len);
+ else if (len != 0)
+ len = -1;
+ foc->len = len;
+ foc->exp = exp_opt;
+}
+
/* Look for tcp options. Normally only called on SYN and SYNACK packets.
* But, this can also be called on packets in the established flow when
* the fast version below fails.
@@ -3662,21 +3726,22 @@ void tcp_parse_options(const struct sk_buff *skb,
*/
break;
#endif
+ case TCPOPT_FASTOPEN:
+ tcp_parse_fastopen_option(
+ opsize - TCPOLEN_FASTOPEN_BASE,
+ ptr, th->syn, foc, false);
+ break;
+
case TCPOPT_EXP:
/* Fast Open option shares code 254 using a
- * 16 bits magic number. It's valid only in
- * SYN or SYN-ACK with an even size.
+ * 16 bits magic number.
*/
- if (opsize < TCPOLEN_EXP_FASTOPEN_BASE ||
- get_unaligned_be16(ptr) != TCPOPT_FASTOPEN_MAGIC ||
- foc == NULL || !th->syn || (opsize & 1))
- break;
- foc->len = opsize - TCPOLEN_EXP_FASTOPEN_BASE;
- if (foc->len >= TCP_FASTOPEN_COOKIE_MIN &&
- foc->len <= TCP_FASTOPEN_COOKIE_MAX)
- memcpy(foc->val, ptr + 2, foc->len);
- else if (foc->len != 0)
- foc->len = -1;
+ if (opsize >= TCPOLEN_EXP_FASTOPEN_BASE &&
+ get_unaligned_be16(ptr) ==
+ TCPOPT_FASTOPEN_MAGIC)
+ tcp_parse_fastopen_option(opsize -
+ TCPOLEN_EXP_FASTOPEN_BASE,
+ ptr + 2, th->syn, foc, true);
break;
}
@@ -4190,7 +4255,7 @@ static void tcp_ofo_queue(struct sock *sk)
tail = skb_peek_tail(&sk->sk_receive_queue);
eaten = tail && tcp_try_coalesce(sk, tail, skb, &fragstolen);
- tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
+ tcp_rcv_nxt_update(tp, TCP_SKB_CB(skb)->end_seq);
if (!eaten)
__skb_queue_tail(&sk->sk_receive_queue, skb);
if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)
@@ -4358,7 +4423,7 @@ static int __must_check tcp_queue_rcv(struct sock *sk, struct sk_buff *skb, int
__skb_pull(skb, hdrlen);
eaten = (tail &&
tcp_try_coalesce(sk, tail, skb, fragstolen)) ? 1 : 0;
- tcp_sk(sk)->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
+ tcp_rcv_nxt_update(tcp_sk(sk), TCP_SKB_CB(skb)->end_seq);
if (!eaten) {
__skb_queue_tail(&sk->sk_receive_queue, skb);
skb_set_owner_r(skb, sk);
@@ -4445,13 +4510,15 @@ static void tcp_data_queue(struct sock *sk, struct sk_buff *skb)
if (eaten <= 0) {
queue_and_out:
- if (eaten < 0 &&
- tcp_try_rmem_schedule(sk, skb, skb->truesize))
- goto drop;
-
+ if (eaten < 0) {
+ if (skb_queue_len(&sk->sk_receive_queue) == 0)
+ sk_forced_mem_schedule(sk, skb->truesize);
+ else if (tcp_try_rmem_schedule(sk, skb, skb->truesize))
+ goto drop;
+ }
eaten = tcp_queue_rcv(sk, skb, 0, &fragstolen);
}
- tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
+ tcp_rcv_nxt_update(tp, TCP_SKB_CB(skb)->end_seq);
if (skb->len)
tcp_event_data_recv(sk, skb);
if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)
@@ -4640,7 +4707,7 @@ static void tcp_collapse_ofo_queue(struct sock *sk)
struct sk_buff *head;
u32 start, end;
- if (skb == NULL)
+ if (!skb)
return;
start = TCP_SKB_CB(skb)->seq;
@@ -4719,7 +4786,7 @@ static int tcp_prune_queue(struct sock *sk)
if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf)
tcp_clamp_window(sk);
- else if (sk_under_memory_pressure(sk))
+ else if (tcp_under_memory_pressure(sk))
tp->rcv_ssthresh = min(tp->rcv_ssthresh, 4U * tp->advmss);
tcp_collapse_ofo_queue(sk);
@@ -4763,7 +4830,7 @@ static bool tcp_should_expand_sndbuf(const struct sock *sk)
return false;
/* If we are under global TCP memory pressure, do not expand. */
- if (sk_under_memory_pressure(sk))
+ if (tcp_under_memory_pressure(sk))
return false;
/* If we are under soft global TCP memory pressure, do not expand. */
@@ -4799,6 +4866,8 @@ static void tcp_check_space(struct sock *sk)
{
if (sock_flag(sk, SOCK_QUEUE_SHRUNK)) {
sock_reset_flag(sk, SOCK_QUEUE_SHRUNK);
+ /* pairs with tcp_poll() */
+ smp_mb__after_atomic();
if (sk->sk_socket &&
test_bit(SOCK_NOSPACE, &sk->sk_socket->flags))
tcp_new_space(sk);
@@ -5095,7 +5164,7 @@ void tcp_rcv_established(struct sock *sk, struct sk_buff *skb,
{
struct tcp_sock *tp = tcp_sk(sk);
- if (unlikely(sk->sk_rx_dst == NULL))
+ if (unlikely(!sk->sk_rx_dst))
inet_csk(sk)->icsk_af_ops->sk_rx_dst_set(sk, skb);
/*
* Header prediction.
@@ -5197,7 +5266,7 @@ void tcp_rcv_established(struct sock *sk, struct sk_buff *skb,
tcp_rcv_rtt_measure_ts(sk, skb);
__skb_pull(skb, tcp_header_len);
- tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
+ tcp_rcv_nxt_update(tp, TCP_SKB_CB(skb)->end_seq);
NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPHPHITSTOUSER);
eaten = 1;
}
@@ -5292,7 +5361,7 @@ void tcp_finish_connect(struct sock *sk, struct sk_buff *skb)
tcp_set_state(sk, TCP_ESTABLISHED);
- if (skb != NULL) {
+ if (skb) {
icsk->icsk_af_ops->sk_rx_dst_set(sk, skb);
security_inet_conn_established(sk, skb);
}
@@ -5330,8 +5399,8 @@ static bool tcp_rcv_fastopen_synack(struct sock *sk, struct sk_buff *synack,
{
struct tcp_sock *tp = tcp_sk(sk);
struct sk_buff *data = tp->syn_data ? tcp_write_queue_head(sk) : NULL;
- u16 mss = tp->rx_opt.mss_clamp;
- bool syn_drop;
+ u16 mss = tp->rx_opt.mss_clamp, try_exp = 0;
+ bool syn_drop = false;
if (mss == tp->rx_opt.user_mss) {
struct tcp_options_received opt;
@@ -5343,16 +5412,25 @@ static bool tcp_rcv_fastopen_synack(struct sock *sk, struct sk_buff *synack,
mss = opt.mss_clamp;
}
- if (!tp->syn_fastopen) /* Ignore an unsolicited cookie */
+ if (!tp->syn_fastopen) {
+ /* Ignore an unsolicited cookie */
cookie->len = -1;
+ } else if (tp->total_retrans) {
+ /* SYN timed out and the SYN-ACK neither has a cookie nor
+ * acknowledges data. Presumably the remote received only
+ * the retransmitted (regular) SYNs: either the original
+ * SYN-data or the corresponding SYN-ACK was dropped.
+ */
+ syn_drop = (cookie->len < 0 && data);
+ } else if (cookie->len < 0 && !tp->syn_data) {
+ /* We requested a cookie but didn't get it. If we did not use
+ * the (old) exp opt format then try so next time (try_exp=1).
+ * Otherwise we go back to use the RFC7413 opt (try_exp=2).
+ */
+ try_exp = tp->syn_fastopen_exp ? 2 : 1;
+ }
- /* The SYN-ACK neither has cookie nor acknowledges the data. Presumably
- * the remote receives only the retransmitted (regular) SYNs: either
- * the original SYN-data or the corresponding SYN-ACK is lost.
- */
- syn_drop = (cookie->len <= 0 && data && tp->total_retrans);
-
- tcp_fastopen_cache_set(sk, mss, cookie, syn_drop);
+ tcp_fastopen_cache_set(sk, mss, cookie, syn_drop, try_exp);
if (data) { /* Retransmit unacked data in SYN */
tcp_for_write_queue_from(data, sk) {
@@ -5661,11 +5739,11 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
}
req = tp->fastopen_rsk;
- if (req != NULL) {
+ if (req) {
WARN_ON_ONCE(sk->sk_state != TCP_SYN_RECV &&
sk->sk_state != TCP_FIN_WAIT1);
- if (tcp_check_req(sk, skb, req, NULL, true) == NULL)
+ if (!tcp_check_req(sk, skb, req, true))
goto discard;
}
@@ -5751,7 +5829,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
* ACK we have received, this would have acknowledged
* our SYNACK so stop the SYNACK timer.
*/
- if (req != NULL) {
+ if (req) {
/* Return RST if ack_seq is invalid.
* Note that RFC793 only says to generate a
* DUPACK for it but for TCP Fast Open it seems
@@ -5913,6 +5991,97 @@ static void tcp_ecn_create_request(struct request_sock *req,
inet_rsk(req)->ecn_ok = 1;
}
+static void tcp_openreq_init(struct request_sock *req,
+ const struct tcp_options_received *rx_opt,
+ struct sk_buff *skb, const struct sock *sk)
+{
+ struct inet_request_sock *ireq = inet_rsk(req);
+
+ req->rcv_wnd = 0; /* So that tcp_send_synack() knows! */
+ req->cookie_ts = 0;
+ tcp_rsk(req)->rcv_isn = TCP_SKB_CB(skb)->seq;
+ tcp_rsk(req)->rcv_nxt = TCP_SKB_CB(skb)->seq + 1;
+ tcp_rsk(req)->snt_synack = tcp_time_stamp;
+ tcp_rsk(req)->last_oow_ack_time = 0;
+ req->mss = rx_opt->mss_clamp;
+ req->ts_recent = rx_opt->saw_tstamp ? rx_opt->rcv_tsval : 0;
+ ireq->tstamp_ok = rx_opt->tstamp_ok;
+ ireq->sack_ok = rx_opt->sack_ok;
+ ireq->snd_wscale = rx_opt->snd_wscale;
+ ireq->wscale_ok = rx_opt->wscale_ok;
+ ireq->acked = 0;
+ ireq->ecn_ok = 0;
+ ireq->ir_rmt_port = tcp_hdr(skb)->source;
+ ireq->ir_num = ntohs(tcp_hdr(skb)->dest);
+ ireq->ir_mark = inet_request_mark(sk, skb);
+}
+
+struct request_sock *inet_reqsk_alloc(const struct request_sock_ops *ops,
+ struct sock *sk_listener)
+{
+ struct request_sock *req = reqsk_alloc(ops, sk_listener);
+
+ if (req) {
+ struct inet_request_sock *ireq = inet_rsk(req);
+
+ kmemcheck_annotate_bitfield(ireq, flags);
+ ireq->opt = NULL;
+ atomic64_set(&ireq->ir_cookie, 0);
+ ireq->ireq_state = TCP_NEW_SYN_RECV;
+ write_pnet(&ireq->ireq_net, sock_net(sk_listener));
+ ireq->ireq_family = sk_listener->sk_family;
+ }
+
+ return req;
+}
+EXPORT_SYMBOL(inet_reqsk_alloc);
+
+/*
+ * Return true if a syncookie should be sent
+ */
+static bool tcp_syn_flood_action(struct sock *sk,
+ const struct sk_buff *skb,
+ const char *proto)
+{
+ const char *msg = "Dropping request";
+ bool want_cookie = false;
+ struct listen_sock *lopt;
+
+#ifdef CONFIG_SYN_COOKIES
+ if (sysctl_tcp_syncookies) {
+ msg = "Sending cookies";
+ want_cookie = true;
+ NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPREQQFULLDOCOOKIES);
+ } else
+#endif
+ NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPREQQFULLDROP);
+
+ lopt = inet_csk(sk)->icsk_accept_queue.listen_opt;
+ if (!lopt->synflood_warned && sysctl_tcp_syncookies != 2) {
+ lopt->synflood_warned = 1;
+ pr_info("%s: Possible SYN flooding on port %d. %s. Check SNMP counters.\n",
+ proto, ntohs(tcp_hdr(skb)->dest), msg);
+ }
+ return want_cookie;
+}
+
+static void tcp_reqsk_record_syn(const struct sock *sk,
+ struct request_sock *req,
+ const struct sk_buff *skb)
+{
+ if (tcp_sk(sk)->save_syn) {
+ u32 len = skb_network_header_len(skb) + tcp_hdrlen(skb);
+ u32 *copy;
+
+ copy = kmalloc(len + sizeof(u32), GFP_ATOMIC);
+ if (copy) {
+ copy[0] = len;
+ memcpy(&copy[1], skb_network_header(skb), len);
+ req->saved_syn = copy;
+ }
+ }
+}
+
int tcp_conn_request(struct request_sock_ops *rsk_ops,
const struct tcp_request_sock_ops *af_ops,
struct sock *sk, struct sk_buff *skb)
@@ -5950,7 +6119,7 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops,
goto drop;
}
- req = inet_reqsk_alloc(rsk_ops);
+ req = inet_reqsk_alloc(rsk_ops, sk);
if (!req)
goto drop;
@@ -5967,6 +6136,9 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops,
tmp_opt.tstamp_ok = tmp_opt.saw_tstamp;
tcp_openreq_init(req, &tmp_opt, skb, sk);
+ /* Note: tcp_v6_init_req() might override ir_iif for link locals */
+ inet_rsk(req)->ir_iif = sk->sk_bound_dev_if;
+
af_ops->init_req(req, sk, skb);
if (security_inet_conn_request(sk, skb, req))
@@ -6039,9 +6211,10 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops,
if (err || want_cookie)
goto drop_and_free;
- tcp_rsk(req)->listener = NULL;
+ tcp_rsk(req)->tfo_listener = false;
af_ops->queue_hash_add(sk, req, TCP_TIMEOUT_INIT);
}
+ tcp_reqsk_record_syn(sk, req, skb);
return 0;