diff options
Diffstat (limited to 'net/ipv4/tcp_input.c')
-rw-r--r-- | net/ipv4/tcp_input.c | 223 |
1 files changed, 160 insertions, 63 deletions
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 2bc8e27a163d..57ae96a04220 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -77,7 +77,7 @@ int sysctl_tcp_window_scaling __read_mostly = 1; int sysctl_tcp_sack __read_mostly = 1; int sysctl_tcp_fack __read_mostly = 1; int sysctl_tcp_reordering __read_mostly = TCP_FASTRETRANS_THRESH; -int sysctl_tcp_ecn __read_mostly; +int sysctl_tcp_ecn __read_mostly = 2; int sysctl_tcp_dsack __read_mostly = 1; int sysctl_tcp_app_win __read_mostly = 31; int sysctl_tcp_adv_win_scale __read_mostly = 2; @@ -140,7 +140,7 @@ static void tcp_measure_rcv_mss(struct sock *sk, const struct sk_buff *skb) * "len" is invariant segment length, including TCP header. */ len += skb->data - skb_transport_header(skb); - if (len >= TCP_MIN_RCVMSS + sizeof(struct tcphdr) || + if (len >= TCP_MSS_DEFAULT + sizeof(struct tcphdr) || /* If PSH is not set, packet should be * full sized, provided peer TCP is not badly broken. * This observation (if it is correct 8)) allows @@ -411,7 +411,7 @@ void tcp_initialize_rcv_mss(struct sock *sk) unsigned int hint = min_t(unsigned int, tp->advmss, tp->mss_cache); hint = min(hint, tp->rcv_wnd / 2); - hint = min(hint, TCP_MIN_RCVMSS); + hint = min(hint, TCP_MSS_DEFAULT); hint = max(hint, TCP_MIN_MSS); inet_csk(sk)->icsk_ack.rcv_mss = hint; @@ -597,16 +597,6 @@ static void tcp_event_data_recv(struct sock *sk, struct sk_buff *skb) tcp_grow_window(sk, skb); } -static u32 tcp_rto_min(struct sock *sk) -{ - struct dst_entry *dst = __sk_dst_get(sk); - u32 rto_min = TCP_RTO_MIN; - - if (dst && dst_metric_locked(dst, RTAX_RTO_MIN)) - rto_min = dst_metric_rtt(dst, RTAX_RTO_MIN); - return rto_min; -} - /* Called to compute a smoothed rtt estimate. The data fed to this * routine either comes from timestamps, or from segments that were * known _not_ to have been retransmitted [see Karn/Partridge @@ -695,7 +685,7 @@ static inline void tcp_set_rto(struct sock *sk) * is invisible. Actually, Linux-2.4 also generates erratic * ACKs in some circumstances. */ - inet_csk(sk)->icsk_rto = (tp->srtt >> 3) + tp->rttvar; + inet_csk(sk)->icsk_rto = __tcp_set_rto(tp); /* 2. Fixups made earlier cannot be right. * If we do not estimate RTO correctly without them, @@ -706,8 +696,7 @@ static inline void tcp_set_rto(struct sock *sk) /* NOTE: clamping at TCP_RTO_MIN is not required, current algo * guarantees that rto is higher. */ - if (inet_csk(sk)->icsk_rto > TCP_RTO_MAX) - inet_csk(sk)->icsk_rto = TCP_RTO_MAX; + tcp_bound_rto(sk); } /* Save metrics learned by this TCP session. @@ -772,7 +761,7 @@ void tcp_update_metrics(struct sock *sk) set_dst_metric_rtt(dst, RTAX_RTTVAR, var); } - if (tp->snd_ssthresh >= 0xFFFF) { + if (tcp_in_initial_slowstart(tp)) { /* Slow start still did not finish. */ if (dst_metric(dst, RTAX_SSTHRESH) && !dst_metric_locked(dst, RTAX_SSTHRESH) && @@ -928,6 +917,8 @@ static void tcp_init_metrics(struct sock *sk) tcp_set_rto(sk); if (inet_csk(sk)->icsk_rto < TCP_TIMEOUT_INIT && !tp->rx_opt.saw_tstamp) goto reset; + +cwnd: tp->snd_cwnd = tcp_init_cwnd(tp, dst); tp->snd_cwnd_stamp = tcp_time_stamp; return; @@ -942,6 +933,7 @@ reset: tp->mdev = tp->mdev_max = tp->rttvar = TCP_TIMEOUT_INIT; inet_csk(sk)->icsk_rto = TCP_TIMEOUT_INIT; } + goto cwnd; } static void tcp_update_reordering(struct sock *sk, const int metric, @@ -2308,7 +2300,7 @@ static inline int tcp_fackets_out(struct tcp_sock *tp) * they differ. Since neither occurs due to loss, TCP should really * ignore them. */ -static inline int tcp_dupack_heurestics(struct tcp_sock *tp) +static inline int tcp_dupack_heuristics(struct tcp_sock *tp) { return tcp_is_fack(tp) ? tp->fackets_out : tp->sacked_out + 1; } @@ -2433,7 +2425,7 @@ static int tcp_time_to_recover(struct sock *sk) return 1; /* Not-A-Trick#2 : Classic rule... */ - if (tcp_dupack_heurestics(tp) > tp->reordering) + if (tcp_dupack_heuristics(tp) > tp->reordering) return 1; /* Trick#3 : when we use RFC2988 timer restart, fast @@ -3706,7 +3698,7 @@ old_ack: * the fast version below fails. */ void tcp_parse_options(struct sk_buff *skb, struct tcp_options_received *opt_rx, - int estab) + u8 **hvpp, int estab, struct dst_entry *dst) { unsigned char *ptr; struct tcphdr *th = tcp_hdr(skb); @@ -3745,7 +3737,8 @@ void tcp_parse_options(struct sk_buff *skb, struct tcp_options_received *opt_rx, break; case TCPOPT_WINDOW: if (opsize == TCPOLEN_WINDOW && th->syn && - !estab && sysctl_tcp_window_scaling) { + !estab && sysctl_tcp_window_scaling && + !dst_feature(dst, RTAX_FEATURE_NO_WSCALE)) { __u8 snd_wscale = *(__u8 *)ptr; opt_rx->wscale_ok = 1; if (snd_wscale > 14) { @@ -3761,7 +3754,8 @@ void tcp_parse_options(struct sk_buff *skb, struct tcp_options_received *opt_rx, case TCPOPT_TIMESTAMP: if ((opsize == TCPOLEN_TIMESTAMP) && ((estab && opt_rx->tstamp_ok) || - (!estab && sysctl_tcp_timestamps))) { + (!estab && sysctl_tcp_timestamps && + !dst_feature(dst, RTAX_FEATURE_NO_TSTAMP)))) { opt_rx->saw_tstamp = 1; opt_rx->rcv_tsval = get_unaligned_be32(ptr); opt_rx->rcv_tsecr = get_unaligned_be32(ptr + 4); @@ -3769,7 +3763,8 @@ void tcp_parse_options(struct sk_buff *skb, struct tcp_options_received *opt_rx, break; case TCPOPT_SACK_PERM: if (opsize == TCPOLEN_SACK_PERM && th->syn && - !estab && sysctl_tcp_sack) { + !estab && sysctl_tcp_sack && + !dst_feature(dst, RTAX_FEATURE_NO_SACK)) { opt_rx->sack_ok = 1; tcp_sack_reset(opt_rx); } @@ -3790,7 +3785,30 @@ void tcp_parse_options(struct sk_buff *skb, struct tcp_options_received *opt_rx, */ break; #endif - } + case TCPOPT_COOKIE: + /* This option is variable length. + */ + switch (opsize) { + case TCPOLEN_COOKIE_BASE: + /* not yet implemented */ + break; + case TCPOLEN_COOKIE_PAIR: + /* not yet implemented */ + break; + case TCPOLEN_COOKIE_MIN+0: + case TCPOLEN_COOKIE_MIN+2: + case TCPOLEN_COOKIE_MIN+4: + case TCPOLEN_COOKIE_MIN+6: + case TCPOLEN_COOKIE_MAX: + /* 16-bit multiple */ + opt_rx->cookie_plus = opsize; + *hvpp = ptr; + default: + /* ignore option */ + break; + }; + break; + }; ptr += opsize-2; length -= opsize; @@ -3818,17 +3836,20 @@ static int tcp_parse_aligned_timestamp(struct tcp_sock *tp, struct tcphdr *th) * If it is wrong it falls back on tcp_parse_options(). */ static int tcp_fast_parse_options(struct sk_buff *skb, struct tcphdr *th, - struct tcp_sock *tp) + struct tcp_sock *tp, u8 **hvpp) { - if (th->doff == sizeof(struct tcphdr) >> 2) { + /* In the spirit of fast parsing, compare doff directly to constant + * values. Because equality is used, short doff can be ignored here. + */ + if (th->doff == (sizeof(*th) / 4)) { tp->rx_opt.saw_tstamp = 0; return 0; } else if (tp->rx_opt.tstamp_ok && - th->doff == (sizeof(struct tcphdr)>>2)+(TCPOLEN_TSTAMP_ALIGNED>>2)) { + th->doff == ((sizeof(*th) + TCPOLEN_TSTAMP_ALIGNED) / 4)) { if (tcp_parse_aligned_timestamp(tp, th)) return 1; } - tcp_parse_options(skb, &tp->rx_opt, 1); + tcp_parse_options(skb, &tp->rx_opt, hvpp, 1, NULL); return 1; } @@ -4083,8 +4104,10 @@ static inline int tcp_sack_extend(struct tcp_sack_block *sp, u32 seq, static void tcp_dsack_set(struct sock *sk, u32 seq, u32 end_seq) { struct tcp_sock *tp = tcp_sk(sk); + struct dst_entry *dst = __sk_dst_get(sk); - if (tcp_is_sack(tp) && sysctl_tcp_dsack) { + if (tcp_is_sack(tp) && sysctl_tcp_dsack && + !dst_feature(dst, RTAX_FEATURE_NO_DSACK)) { int mib_idx; if (before(seq, tp->rcv_nxt)) @@ -4113,13 +4136,15 @@ static void tcp_dsack_extend(struct sock *sk, u32 seq, u32 end_seq) static void tcp_send_dupack(struct sock *sk, struct sk_buff *skb) { struct tcp_sock *tp = tcp_sk(sk); + struct dst_entry *dst = __sk_dst_get(sk); if (TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq && before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) { NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_DELAYEDACKLOST); tcp_enter_quickack_mode(sk); - if (tcp_is_sack(tp) && sysctl_tcp_dsack) { + if (tcp_is_sack(tp) && sysctl_tcp_dsack && + !dst_feature(dst, RTAX_FEATURE_NO_DSACK)) { u32 end_seq = TCP_SKB_CB(skb)->end_seq; if (after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt)) @@ -4433,7 +4458,7 @@ drop: } __skb_queue_head(&tp->out_of_order_queue, skb); } else { - struct sk_buff *skb1 = tp->out_of_order_queue.prev; + struct sk_buff *skb1 = skb_peek_tail(&tp->out_of_order_queue); u32 seq = TCP_SKB_CB(skb)->seq; u32 end_seq = TCP_SKB_CB(skb)->end_seq; @@ -4450,15 +4475,18 @@ drop: } /* Find place to insert this segment. */ - do { + while (1) { if (!after(TCP_SKB_CB(skb1)->seq, seq)) break; - } while ((skb1 = skb1->prev) != - (struct sk_buff *)&tp->out_of_order_queue); + if (skb_queue_is_first(&tp->out_of_order_queue, skb1)) { + skb1 = NULL; + break; + } + skb1 = skb_queue_prev(&tp->out_of_order_queue, skb1); + } /* Do skb overlap to previous one? */ - if (skb1 != (struct sk_buff *)&tp->out_of_order_queue && - before(seq, TCP_SKB_CB(skb1)->end_seq)) { + if (skb1 && before(seq, TCP_SKB_CB(skb1)->end_seq)) { if (!after(end_seq, TCP_SKB_CB(skb1)->end_seq)) { /* All the bits are present. Drop. */ __kfree_skb(skb); @@ -4470,15 +4498,26 @@ drop: tcp_dsack_set(sk, seq, TCP_SKB_CB(skb1)->end_seq); } else { - skb1 = skb1->prev; + if (skb_queue_is_first(&tp->out_of_order_queue, + skb1)) + skb1 = NULL; + else + skb1 = skb_queue_prev( + &tp->out_of_order_queue, + skb1); } } - __skb_queue_after(&tp->out_of_order_queue, skb1, skb); + if (!skb1) + __skb_queue_head(&tp->out_of_order_queue, skb); + else + __skb_queue_after(&tp->out_of_order_queue, skb1, skb); /* And clean segments covered by new one as whole. */ - while ((skb1 = skb->next) != - (struct sk_buff *)&tp->out_of_order_queue && - after(end_seq, TCP_SKB_CB(skb1)->seq)) { + while (!skb_queue_is_last(&tp->out_of_order_queue, skb)) { + skb1 = skb_queue_next(&tp->out_of_order_queue, skb); + + if (!after(end_seq, TCP_SKB_CB(skb1)->seq)) + break; if (before(end_seq, TCP_SKB_CB(skb1)->end_seq)) { tcp_dsack_extend(sk, TCP_SKB_CB(skb1)->seq, end_seq); @@ -4499,7 +4538,10 @@ add_sack: static struct sk_buff *tcp_collapse_one(struct sock *sk, struct sk_buff *skb, struct sk_buff_head *list) { - struct sk_buff *next = skb->next; + struct sk_buff *next = NULL; + + if (!skb_queue_is_last(list, skb)) + next = skb_queue_next(list, skb); __skb_unlink(skb, list); __kfree_skb(skb); @@ -4510,6 +4552,9 @@ static struct sk_buff *tcp_collapse_one(struct sock *sk, struct sk_buff *skb, /* Collapse contiguous sequence of skbs head..tail with * sequence numbers start..end. + * + * If tail is NULL, this means until the end of the list. + * * Segments with FIN/SYN are not collapsed (only because this * simplifies code) */ @@ -4518,15 +4563,23 @@ tcp_collapse(struct sock *sk, struct sk_buff_head *list, struct sk_buff *head, struct sk_buff *tail, u32 start, u32 end) { - struct sk_buff *skb; + struct sk_buff *skb, *n; + bool end_of_skbs; /* First, check that queue is collapsible and find * the point where collapsing can be useful. */ - for (skb = head; skb != tail;) { + skb = head; +restart: + end_of_skbs = true; + skb_queue_walk_from_safe(list, skb, n) { + if (skb == tail) + break; /* No new bits? It is possible on ofo queue. */ if (!before(start, TCP_SKB_CB(skb)->end_seq)) { skb = tcp_collapse_one(sk, skb, list); - continue; + if (!skb) + break; + goto restart; } /* The first skb to collapse is: @@ -4536,16 +4589,24 @@ tcp_collapse(struct sock *sk, struct sk_buff_head *list, */ if (!tcp_hdr(skb)->syn && !tcp_hdr(skb)->fin && (tcp_win_from_space(skb->truesize) > skb->len || - before(TCP_SKB_CB(skb)->seq, start) || - (skb->next != tail && - TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb->next)->seq))) + before(TCP_SKB_CB(skb)->seq, start))) { + end_of_skbs = false; break; + } + + if (!skb_queue_is_last(list, skb)) { + struct sk_buff *next = skb_queue_next(list, skb); + if (next != tail && + TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(next)->seq) { + end_of_skbs = false; + break; + } + } /* Decided to skip this, advance start seq. */ start = TCP_SKB_CB(skb)->end_seq; - skb = skb->next; } - if (skb == tail || tcp_hdr(skb)->syn || tcp_hdr(skb)->fin) + if (end_of_skbs || tcp_hdr(skb)->syn || tcp_hdr(skb)->fin) return; while (before(start, end)) { @@ -4590,7 +4651,8 @@ tcp_collapse(struct sock *sk, struct sk_buff_head *list, } if (!before(start, TCP_SKB_CB(skb)->end_seq)) { skb = tcp_collapse_one(sk, skb, list); - if (skb == tail || + if (!skb || + skb == tail || tcp_hdr(skb)->syn || tcp_hdr(skb)->fin) return; @@ -4617,17 +4679,21 @@ static void tcp_collapse_ofo_queue(struct sock *sk) head = skb; for (;;) { - skb = skb->next; + struct sk_buff *next = NULL; + + if (!skb_queue_is_last(&tp->out_of_order_queue, skb)) + next = skb_queue_next(&tp->out_of_order_queue, skb); + skb = next; /* Segment is terminated when we see gap or when * we are at the end of all the queue. */ - if (skb == (struct sk_buff *)&tp->out_of_order_queue || + if (!skb || after(TCP_SKB_CB(skb)->seq, end) || before(TCP_SKB_CB(skb)->end_seq, start)) { tcp_collapse(sk, &tp->out_of_order_queue, head, skb, start, end); head = skb; - if (skb == (struct sk_buff *)&tp->out_of_order_queue) + if (!skb) break; /* Start new segment */ start = TCP_SKB_CB(skb)->seq; @@ -4688,10 +4754,11 @@ static int tcp_prune_queue(struct sock *sk) tp->rcv_ssthresh = min(tp->rcv_ssthresh, 4U * tp->advmss); tcp_collapse_ofo_queue(sk); - tcp_collapse(sk, &sk->sk_receive_queue, - sk->sk_receive_queue.next, - (struct sk_buff *)&sk->sk_receive_queue, - tp->copied_seq, tp->rcv_nxt); + if (!skb_queue_empty(&sk->sk_receive_queue)) + tcp_collapse(sk, &sk->sk_receive_queue, + skb_peek(&sk->sk_receive_queue), + NULL, + tp->copied_seq, tp->rcv_nxt); sk_mem_reclaim(sk); if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf) @@ -4811,11 +4878,11 @@ static void __tcp_ack_snd_check(struct sock *sk, int ofo_possible) struct tcp_sock *tp = tcp_sk(sk); /* More than one full frame received... */ - if (((tp->rcv_nxt - tp->rcv_wup) > inet_csk(sk)->icsk_ack.rcv_mss + if (((tp->rcv_nxt - tp->rcv_wup) > inet_csk(sk)->icsk_ack.rcv_mss && /* ... and right edge of window advances far enough. * (tcp_recvmsg() will send ACK otherwise). Or... */ - && __tcp_select_window(sk) >= tp->rcv_wnd) || + __tcp_select_window(sk) >= tp->rcv_wnd) || /* We ACK each frame or... */ tcp_in_quickack_mode(sk) || /* We have out of order data. */ @@ -5036,10 +5103,12 @@ out: static int tcp_validate_incoming(struct sock *sk, struct sk_buff *skb, struct tcphdr *th, int syn_inerr) { + u8 *hash_location; struct tcp_sock *tp = tcp_sk(sk); /* RFC1323: H1. Apply PAWS check first. */ - if (tcp_fast_parse_options(skb, th, tp) && tp->rx_opt.saw_tstamp && + if (tcp_fast_parse_options(skb, th, tp, &hash_location) && + tp->rx_opt.saw_tstamp && tcp_paws_discard(sk, skb)) { if (!th->rst) { NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_PAWSESTABREJECTED); @@ -5327,11 +5396,14 @@ discard: static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb, struct tcphdr *th, unsigned len) { - struct tcp_sock *tp = tcp_sk(sk); + u8 *hash_location; struct inet_connection_sock *icsk = inet_csk(sk); + struct tcp_sock *tp = tcp_sk(sk); + struct dst_entry *dst = __sk_dst_get(sk); + struct tcp_cookie_values *cvp = tp->cookie_values; int saved_clamp = tp->rx_opt.mss_clamp; - tcp_parse_options(skb, &tp->rx_opt, 0); + tcp_parse_options(skb, &tp->rx_opt, &hash_location, 0, dst); if (th->ack) { /* rfc793: @@ -5428,6 +5500,31 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb, * Change state from SYN-SENT only after copied_seq * is initialized. */ tp->copied_seq = tp->rcv_nxt; + + if (cvp != NULL && + cvp->cookie_pair_size > 0 && + tp->rx_opt.cookie_plus > 0) { + int cookie_size = tp->rx_opt.cookie_plus + - TCPOLEN_COOKIE_BASE; + int cookie_pair_size = cookie_size + + cvp->cookie_desired; + + /* A cookie extension option was sent and returned. + * Note that each incoming SYNACK replaces the + * Responder cookie. The initial exchange is most + * fragile, as protection against spoofing relies + * entirely upon the sequence and timestamp (above). + * This replacement strategy allows the correct pair to + * pass through, while any others will be filtered via + * Responder verification later. + */ + if (sizeof(cvp->cookie_pair) >= cookie_pair_size) { + memcpy(&cvp->cookie_pair[cvp->cookie_desired], + hash_location, cookie_size); + cvp->cookie_pair_size = cookie_pair_size; + } + } + smp_mb(); tcp_set_state(sk, TCP_ESTABLISHED); |