From 41063e9dd11956f2d285e12e4342e1d232ba0ea2 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Tue, 19 Jun 2012 21:22:05 -0700 Subject: ipv4: Early TCP socket demux. Input packet processing for local sockets involves two major demuxes. One for the route and one for the socket. But we can optimize this down to one demux for certain kinds of local sockets. Currently we only do this for established TCP sockets, but it could at least in theory be expanded to other kinds of connections. If a TCP socket is established then it's identity is fully specified. This means that whatever input route was used during the three-way handshake must work equally well for the rest of the connection since the keys will not change. Once we move to established state, we cache the receive packet's input route to use later. Like the existing cached route in sk->sk_dst_cache used for output packets, we have to check for route invalidations using dst->obsolete and dst->ops->check(). Early demux occurs outside of a socket locked section, so when a route invalidation occurs we defer the fixup of sk->sk_rx_dst until we are actually inside of established state packet processing and thus have the socket locked. Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) (limited to 'net/ipv4/tcp_input.c') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index b224eb8bce8b..8416f8a68e65 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -5518,6 +5518,18 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb, struct tcp_sock *tp = tcp_sk(sk); int res; + if (sk->sk_rx_dst) { + struct dst_entry *dst = sk->sk_rx_dst; + if (unlikely(dst->obsolete)) { + if (dst->ops->check(dst, 0) == NULL) { + dst_release(dst); + sk->sk_rx_dst = NULL; + } + } + } + if (unlikely(sk->sk_rx_dst == NULL)) + sk->sk_rx_dst = dst_clone(skb_dst(skb)); + /* * Header prediction. * The code loosely follows the one in the famous @@ -5729,8 +5741,10 @@ void tcp_finish_connect(struct sock *sk, struct sk_buff *skb) tcp_set_state(sk, TCP_ESTABLISHED); - if (skb != NULL) + if (skb != NULL) { + sk->sk_rx_dst = dst_clone(skb_dst(skb)); security_inet_conn_established(sk, skb); + } /* Make sure socket is routed, for correct metrics. */ icsk->icsk_af_ops->rebuild_header(sk); -- cgit v1.2.3 From 5110effee8fde2edfacac9cd12a9960ab2dc39ea Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Mon, 2 Jul 2012 02:21:03 -0700 Subject: net: Do delayed neigh confirmation. When a dst_confirm() happens, mark the confirmation as pending in the dst. Then on the next packet out, when we have the neigh in-hand, do the update. This removes the dependency in dst_confirm() of dst's having an attached neigh. While we're here, remove the explicit 'dst' NULL check, all except 2 or 3 call sites ensure it's not NULL. So just fix those cases up. Signed-off-by: David S. Miller --- include/net/dst.h | 29 +++++++++++++++++++++-------- include/net/neighbour.h | 15 --------------- net/core/dst.c | 3 ++- net/ipv4/ip_output.c | 2 +- net/ipv4/tcp_input.c | 19 +++++++++++++------ net/ipv6/ip6_output.c | 2 +- 6 files changed, 38 insertions(+), 32 deletions(-) (limited to 'net/ipv4/tcp_input.c') diff --git a/include/net/dst.h b/include/net/dst.h index f0bf3b8d5911..84e7a3ff968d 100644 --- a/include/net/dst.h +++ b/include/net/dst.h @@ -51,7 +51,7 @@ struct dst_entry { int (*input)(struct sk_buff *); int (*output)(struct sk_buff *); - int flags; + unsigned short flags; #define DST_HOST 0x0001 #define DST_NOXFRM 0x0002 #define DST_NOPOLICY 0x0004 @@ -62,6 +62,8 @@ struct dst_entry { #define DST_FAKE_RTABLE 0x0080 #define DST_XFRM_TUNNEL 0x0100 + unsigned short pending_confirm; + short error; short obsolete; unsigned short header_len; /* more space at head required */ @@ -371,7 +373,8 @@ static inline struct dst_entry *skb_dst_pop(struct sk_buff *skb) extern int dst_discard(struct sk_buff *skb); extern void *dst_alloc(struct dst_ops *ops, struct net_device *dev, - int initial_ref, int initial_obsolete, int flags); + int initial_ref, int initial_obsolete, + unsigned short flags); extern void __dst_free(struct dst_entry *dst); extern struct dst_entry *dst_destroy(struct dst_entry *dst); @@ -395,14 +398,24 @@ static inline void dst_rcu_free(struct rcu_head *head) static inline void dst_confirm(struct dst_entry *dst) { - if (dst) { - struct neighbour *n; + dst->pending_confirm = 1; +} - rcu_read_lock(); - n = dst_get_neighbour_noref(dst); - neigh_confirm(n); - rcu_read_unlock(); +static inline int dst_neigh_output(struct dst_entry *dst, struct neighbour *n, + struct sk_buff *skb) +{ + struct hh_cache *hh; + + if (unlikely(dst->pending_confirm)) { + n->confirmed = jiffies; + dst->pending_confirm = 0; } + + hh = &n->hh; + if ((n->nud_state & NUD_CONNECTED) && hh->hh_len) + return neigh_hh_output(hh, skb); + else + return n->output(n, skb); } static inline struct neighbour *dst_neigh_lookup(const struct dst_entry *dst, const void *daddr) diff --git a/include/net/neighbour.h b/include/net/neighbour.h index e1d18bdeebb8..344d8988842a 100644 --- a/include/net/neighbour.h +++ b/include/net/neighbour.h @@ -309,12 +309,6 @@ static inline struct neighbour * neigh_clone(struct neighbour *neigh) #define neigh_hold(n) atomic_inc(&(n)->refcnt) -static inline void neigh_confirm(struct neighbour *neigh) -{ - if (neigh) - neigh->confirmed = jiffies; -} - static inline int neigh_event_send(struct neighbour *neigh, struct sk_buff *skb) { unsigned long now = jiffies; @@ -358,15 +352,6 @@ static inline int neigh_hh_output(struct hh_cache *hh, struct sk_buff *skb) return dev_queue_xmit(skb); } -static inline int neigh_output(struct neighbour *n, struct sk_buff *skb) -{ - struct hh_cache *hh = &n->hh; - if ((n->nud_state & NUD_CONNECTED) && hh->hh_len) - return neigh_hh_output(hh, skb); - else - return n->output(n, skb); -} - static inline struct neighbour * __neigh_lookup(struct neigh_table *tbl, const void *pkey, struct net_device *dev, int creat) { diff --git a/net/core/dst.c b/net/core/dst.c index 43d94cedbf7c..a6e19a23a745 100644 --- a/net/core/dst.c +++ b/net/core/dst.c @@ -152,7 +152,7 @@ EXPORT_SYMBOL(dst_discard); const u32 dst_default_metrics[RTAX_MAX]; void *dst_alloc(struct dst_ops *ops, struct net_device *dev, - int initial_ref, int initial_obsolete, int flags) + int initial_ref, int initial_obsolete, unsigned short flags) { struct dst_entry *dst; @@ -188,6 +188,7 @@ void *dst_alloc(struct dst_ops *ops, struct net_device *dev, dst->__use = 0; dst->lastuse = jiffies; dst->flags = flags; + dst->pending_confirm = 0; dst->next = NULL; if (!(flags & DST_NOCOUNT)) dst_entries_add(ops, 1); diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index 6e9a266a0535..cc52679790b2 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -198,7 +198,7 @@ static inline int ip_finish_output2(struct sk_buff *skb) if (unlikely(!neigh)) neigh = __neigh_create(&arp_tbl, &nexthop, dev, false); if (neigh) { - int res = neigh_output(neigh, skb); + int res = dst_neigh_output(dst, neigh, skb); rcu_read_unlock_bh(); return res; diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 8416f8a68e65..ca0d0e7c9778 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -740,13 +740,13 @@ void tcp_update_metrics(struct sock *sk) if (sysctl_tcp_nometrics_save) return; - dst_confirm(dst); - if (dst && (dst->flags & DST_HOST)) { const struct inet_connection_sock *icsk = inet_csk(sk); int m; unsigned long rtt; + dst_confirm(dst); + if (icsk->icsk_backoff || !tp->srtt) { /* This session failed to estimate rtt. Why? * Probably, no packets returned in time. @@ -3869,9 +3869,11 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) tcp_cong_avoid(sk, ack, prior_in_flight); } - if ((flag & FLAG_FORWARD_PROGRESS) || !(flag & FLAG_NOT_DUP)) - dst_confirm(__sk_dst_get(sk)); - + if ((flag & FLAG_FORWARD_PROGRESS) || !(flag & FLAG_NOT_DUP)) { + struct dst_entry *dst = __sk_dst_get(sk); + if (dst) + dst_confirm(dst); + } return 1; no_queue: @@ -6140,9 +6142,14 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, case TCP_FIN_WAIT1: if (tp->snd_una == tp->write_seq) { + struct dst_entry *dst; + tcp_set_state(sk, TCP_FIN_WAIT2); sk->sk_shutdown |= SEND_SHUTDOWN; - dst_confirm(__sk_dst_get(sk)); + + dst = __sk_dst_get(sk); + if (dst) + dst_confirm(dst); if (!sock_flag(sk, SOCK_DEAD)) /* Wake up lingering close() */ diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index a233a7ccbc3a..c94e4aabe11b 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -125,7 +125,7 @@ static int ip6_finish_output2(struct sk_buff *skb) rcu_read_lock(); neigh = dst_get_neighbour_noref(dst); if (neigh) { - int res = neigh_output(neigh, skb); + int res = dst_neigh_output(dst, neigh, skb); rcu_read_unlock(); return res; -- cgit v1.2.3 From 4aabd8ef8c43677cfee3e1e36c5a79edddb41942 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Mon, 9 Jul 2012 16:07:30 -0700 Subject: tcp: Move dynamnic metrics handling into seperate file. Signed-off-by: David S. Miller --- include/net/tcp.h | 4 ++ net/ipv4/Makefile | 2 +- net/ipv4/tcp_input.c | 188 +----------------------------------------------- net/ipv4/tcp_metrics.c | 192 +++++++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 199 insertions(+), 187 deletions(-) create mode 100644 net/ipv4/tcp_metrics.c (limited to 'net/ipv4/tcp_input.c') diff --git a/include/net/tcp.h b/include/net/tcp.h index 53fb7d814170..98ca797001a2 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -388,6 +388,8 @@ extern void tcp_enter_frto(struct sock *sk); extern void tcp_enter_loss(struct sock *sk, int how); extern void tcp_clear_retrans(struct tcp_sock *tp); extern void tcp_update_metrics(struct sock *sk); +extern void tcp_init_metrics(struct sock *sk); +extern void tcp_disable_fack(struct tcp_sock *tp); extern void tcp_close(struct sock *sk, long timeout); extern void tcp_init_sock(struct sock *sk); extern unsigned int tcp_poll(struct file * file, struct socket *sock, @@ -556,6 +558,8 @@ static inline u32 __tcp_set_rto(const struct tcp_sock *tp) return (tp->srtt >> 3) + tp->rttvar; } +extern void tcp_set_rto(struct sock *sk); + static inline void __tcp_fast_path_on(struct tcp_sock *tp, u32 snd_wnd) { tp->pred_flags = htonl((tp->tcp_header_len << 26) | diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile index ff75d3bbcd6a..5a23e8b37106 100644 --- a/net/ipv4/Makefile +++ b/net/ipv4/Makefile @@ -7,7 +7,7 @@ obj-y := route.o inetpeer.o protocol.o \ ip_output.o ip_sockglue.o inet_hashtables.o \ inet_timewait_sock.o inet_connection_sock.o \ tcp.o tcp_input.o tcp_output.o tcp_timer.o tcp_ipv4.o \ - tcp_minisocks.o tcp_cong.o \ + tcp_minisocks.o tcp_cong.o tcp_metrics.o \ datagram.o raw.o udp.o udplite.o \ arp.o icmp.o devinet.o af_inet.o igmp.o \ fib_frontend.o fib_semantics.o fib_trie.o \ diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index ca0d0e7c9778..055ac49b8b40 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -93,7 +93,6 @@ int sysctl_tcp_rfc1337 __read_mostly; int sysctl_tcp_max_orphans __read_mostly = NR_FILE; int sysctl_tcp_frto __read_mostly = 2; int sysctl_tcp_frto_response __read_mostly; -int sysctl_tcp_nometrics_save __read_mostly; int sysctl_tcp_thin_dupack __read_mostly; @@ -701,7 +700,7 @@ static void tcp_rtt_estimator(struct sock *sk, const __u32 mrtt) /* Calculate rto without backoff. This is the second half of Van Jacobson's * routine referred to above. */ -static inline void tcp_set_rto(struct sock *sk) +void tcp_set_rto(struct sock *sk) { const struct tcp_sock *tp = tcp_sk(sk); /* Old crap is replaced with new one. 8) @@ -728,109 +727,6 @@ static inline void tcp_set_rto(struct sock *sk) tcp_bound_rto(sk); } -/* Save metrics learned by this TCP session. - This function is called only, when TCP finishes successfully - i.e. when it enters TIME-WAIT or goes from LAST-ACK to CLOSE. - */ -void tcp_update_metrics(struct sock *sk) -{ - struct tcp_sock *tp = tcp_sk(sk); - struct dst_entry *dst = __sk_dst_get(sk); - - if (sysctl_tcp_nometrics_save) - return; - - if (dst && (dst->flags & DST_HOST)) { - const struct inet_connection_sock *icsk = inet_csk(sk); - int m; - unsigned long rtt; - - dst_confirm(dst); - - if (icsk->icsk_backoff || !tp->srtt) { - /* This session failed to estimate rtt. Why? - * Probably, no packets returned in time. - * Reset our results. - */ - if (!(dst_metric_locked(dst, RTAX_RTT))) - dst_metric_set(dst, RTAX_RTT, 0); - return; - } - - rtt = dst_metric_rtt(dst, RTAX_RTT); - m = rtt - tp->srtt; - - /* If newly calculated rtt larger than stored one, - * store new one. Otherwise, use EWMA. Remember, - * rtt overestimation is always better than underestimation. - */ - if (!(dst_metric_locked(dst, RTAX_RTT))) { - if (m <= 0) - set_dst_metric_rtt(dst, RTAX_RTT, tp->srtt); - else - set_dst_metric_rtt(dst, RTAX_RTT, rtt - (m >> 3)); - } - - if (!(dst_metric_locked(dst, RTAX_RTTVAR))) { - unsigned long var; - if (m < 0) - m = -m; - - /* Scale deviation to rttvar fixed point */ - m >>= 1; - if (m < tp->mdev) - m = tp->mdev; - - var = dst_metric_rtt(dst, RTAX_RTTVAR); - if (m >= var) - var = m; - else - var -= (var - m) >> 2; - - set_dst_metric_rtt(dst, RTAX_RTTVAR, var); - } - - if (tcp_in_initial_slowstart(tp)) { - /* Slow start still did not finish. */ - if (dst_metric(dst, RTAX_SSTHRESH) && - !dst_metric_locked(dst, RTAX_SSTHRESH) && - (tp->snd_cwnd >> 1) > dst_metric(dst, RTAX_SSTHRESH)) - dst_metric_set(dst, RTAX_SSTHRESH, tp->snd_cwnd >> 1); - if (!dst_metric_locked(dst, RTAX_CWND) && - tp->snd_cwnd > dst_metric(dst, RTAX_CWND)) - dst_metric_set(dst, RTAX_CWND, tp->snd_cwnd); - } else if (tp->snd_cwnd > tp->snd_ssthresh && - icsk->icsk_ca_state == TCP_CA_Open) { - /* Cong. avoidance phase, cwnd is reliable. */ - if (!dst_metric_locked(dst, RTAX_SSTHRESH)) - dst_metric_set(dst, RTAX_SSTHRESH, - max(tp->snd_cwnd >> 1, tp->snd_ssthresh)); - if (!dst_metric_locked(dst, RTAX_CWND)) - dst_metric_set(dst, RTAX_CWND, - (dst_metric(dst, RTAX_CWND) + - tp->snd_cwnd) >> 1); - } else { - /* Else slow start did not finish, cwnd is non-sense, - ssthresh may be also invalid. - */ - if (!dst_metric_locked(dst, RTAX_CWND)) - dst_metric_set(dst, RTAX_CWND, - (dst_metric(dst, RTAX_CWND) + - tp->snd_ssthresh) >> 1); - if (dst_metric(dst, RTAX_SSTHRESH) && - !dst_metric_locked(dst, RTAX_SSTHRESH) && - tp->snd_ssthresh > dst_metric(dst, RTAX_SSTHRESH)) - dst_metric_set(dst, RTAX_SSTHRESH, tp->snd_ssthresh); - } - - if (!dst_metric_locked(dst, RTAX_REORDERING)) { - if (dst_metric(dst, RTAX_REORDERING) < tp->reordering && - tp->reordering != sysctl_tcp_reordering) - dst_metric_set(dst, RTAX_REORDERING, tp->reordering); - } - } -} - __u32 tcp_init_cwnd(const struct tcp_sock *tp, const struct dst_entry *dst) { __u32 cwnd = (dst ? dst_metric(dst, RTAX_INITCWND) : 0); @@ -867,7 +763,7 @@ void tcp_enter_cwr(struct sock *sk, const int set_ssthresh) * Packet counting of FACK is based on in-order assumptions, therefore TCP * disables it when reordering is detected */ -static void tcp_disable_fack(struct tcp_sock *tp) +void tcp_disable_fack(struct tcp_sock *tp) { /* RFC3517 uses different metric in lost marker => reset on change */ if (tcp_is_fack(tp)) @@ -881,86 +777,6 @@ static void tcp_dsack_seen(struct tcp_sock *tp) tp->rx_opt.sack_ok |= TCP_DSACK_SEEN; } -/* Initialize metrics on socket. */ - -static void tcp_init_metrics(struct sock *sk) -{ - struct tcp_sock *tp = tcp_sk(sk); - struct dst_entry *dst = __sk_dst_get(sk); - - if (dst == NULL) - goto reset; - - dst_confirm(dst); - - if (dst_metric_locked(dst, RTAX_CWND)) - tp->snd_cwnd_clamp = dst_metric(dst, RTAX_CWND); - if (dst_metric(dst, RTAX_SSTHRESH)) { - tp->snd_ssthresh = dst_metric(dst, RTAX_SSTHRESH); - if (tp->snd_ssthresh > tp->snd_cwnd_clamp) - tp->snd_ssthresh = tp->snd_cwnd_clamp; - } else { - /* ssthresh may have been reduced unnecessarily during. - * 3WHS. Restore it back to its initial default. - */ - tp->snd_ssthresh = TCP_INFINITE_SSTHRESH; - } - if (dst_metric(dst, RTAX_REORDERING) && - tp->reordering != dst_metric(dst, RTAX_REORDERING)) { - tcp_disable_fack(tp); - tcp_disable_early_retrans(tp); - tp->reordering = dst_metric(dst, RTAX_REORDERING); - } - - if (dst_metric(dst, RTAX_RTT) == 0 || tp->srtt == 0) - goto reset; - - /* Initial rtt is determined from SYN,SYN-ACK. - * The segment is small and rtt may appear much - * less than real one. Use per-dst memory - * to make it more realistic. - * - * A bit of theory. RTT is time passed after "normal" sized packet - * is sent until it is ACKed. In normal circumstances sending small - * packets force peer to delay ACKs and calculation is correct too. - * The algorithm is adaptive and, provided we follow specs, it - * NEVER underestimate RTT. BUT! If peer tries to make some clever - * tricks sort of "quick acks" for time long enough to decrease RTT - * to low value, and then abruptly stops to do it and starts to delay - * ACKs, wait for troubles. - */ - if (dst_metric_rtt(dst, RTAX_RTT) > tp->srtt) { - tp->srtt = dst_metric_rtt(dst, RTAX_RTT); - tp->rtt_seq = tp->snd_nxt; - } - if (dst_metric_rtt(dst, RTAX_RTTVAR) > tp->mdev) { - tp->mdev = dst_metric_rtt(dst, RTAX_RTTVAR); - tp->mdev_max = tp->rttvar = max(tp->mdev, tcp_rto_min(sk)); - } - tcp_set_rto(sk); -reset: - if (tp->srtt == 0) { - /* RFC6298: 5.7 We've failed to get a valid RTT sample from - * 3WHS. This is most likely due to retransmission, - * including spurious one. Reset the RTO back to 3secs - * from the more aggressive 1sec to avoid more spurious - * retransmission. - */ - tp->mdev = tp->mdev_max = tp->rttvar = TCP_TIMEOUT_FALLBACK; - inet_csk(sk)->icsk_rto = TCP_TIMEOUT_FALLBACK; - } - /* Cut cwnd down to 1 per RFC5681 if SYN or SYN-ACK has been - * retransmitted. In light of RFC6298 more aggressive 1sec - * initRTO, we only reset cwnd when more than 1 SYN/SYN-ACK - * retransmission has occurred. - */ - if (tp->total_retrans > 1) - tp->snd_cwnd = 1; - else - tp->snd_cwnd = tcp_init_cwnd(tp, dst); - tp->snd_cwnd_stamp = tcp_time_stamp; -} - static void tcp_update_reordering(struct sock *sk, const int metric, const int ts) { diff --git a/net/ipv4/tcp_metrics.c b/net/ipv4/tcp_metrics.c new file mode 100644 index 000000000000..2793ecf928d3 --- /dev/null +++ b/net/ipv4/tcp_metrics.c @@ -0,0 +1,192 @@ +#include +#include + +#include +#include +#include +#include + +int sysctl_tcp_nometrics_save __read_mostly; + +/* Save metrics learned by this TCP session. This function is called + * only, when TCP finishes successfully i.e. when it enters TIME-WAIT + * or goes from LAST-ACK to CLOSE. + */ +void tcp_update_metrics(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct dst_entry *dst = __sk_dst_get(sk); + + if (sysctl_tcp_nometrics_save) + return; + + if (dst && (dst->flags & DST_HOST)) { + const struct inet_connection_sock *icsk = inet_csk(sk); + int m; + unsigned long rtt; + + dst_confirm(dst); + + if (icsk->icsk_backoff || !tp->srtt) { + /* This session failed to estimate rtt. Why? + * Probably, no packets returned in time. + * Reset our results. + */ + if (!(dst_metric_locked(dst, RTAX_RTT))) + dst_metric_set(dst, RTAX_RTT, 0); + return; + } + + rtt = dst_metric_rtt(dst, RTAX_RTT); + m = rtt - tp->srtt; + + /* If newly calculated rtt larger than stored one, + * store new one. Otherwise, use EWMA. Remember, + * rtt overestimation is always better than underestimation. + */ + if (!(dst_metric_locked(dst, RTAX_RTT))) { + if (m <= 0) + set_dst_metric_rtt(dst, RTAX_RTT, tp->srtt); + else + set_dst_metric_rtt(dst, RTAX_RTT, rtt - (m >> 3)); + } + + if (!(dst_metric_locked(dst, RTAX_RTTVAR))) { + unsigned long var; + if (m < 0) + m = -m; + + /* Scale deviation to rttvar fixed point */ + m >>= 1; + if (m < tp->mdev) + m = tp->mdev; + + var = dst_metric_rtt(dst, RTAX_RTTVAR); + if (m >= var) + var = m; + else + var -= (var - m) >> 2; + + set_dst_metric_rtt(dst, RTAX_RTTVAR, var); + } + + if (tcp_in_initial_slowstart(tp)) { + /* Slow start still did not finish. */ + if (dst_metric(dst, RTAX_SSTHRESH) && + !dst_metric_locked(dst, RTAX_SSTHRESH) && + (tp->snd_cwnd >> 1) > dst_metric(dst, RTAX_SSTHRESH)) + dst_metric_set(dst, RTAX_SSTHRESH, tp->snd_cwnd >> 1); + if (!dst_metric_locked(dst, RTAX_CWND) && + tp->snd_cwnd > dst_metric(dst, RTAX_CWND)) + dst_metric_set(dst, RTAX_CWND, tp->snd_cwnd); + } else if (tp->snd_cwnd > tp->snd_ssthresh && + icsk->icsk_ca_state == TCP_CA_Open) { + /* Cong. avoidance phase, cwnd is reliable. */ + if (!dst_metric_locked(dst, RTAX_SSTHRESH)) + dst_metric_set(dst, RTAX_SSTHRESH, + max(tp->snd_cwnd >> 1, tp->snd_ssthresh)); + if (!dst_metric_locked(dst, RTAX_CWND)) + dst_metric_set(dst, RTAX_CWND, + (dst_metric(dst, RTAX_CWND) + + tp->snd_cwnd) >> 1); + } else { + /* Else slow start did not finish, cwnd is non-sense, + ssthresh may be also invalid. + */ + if (!dst_metric_locked(dst, RTAX_CWND)) + dst_metric_set(dst, RTAX_CWND, + (dst_metric(dst, RTAX_CWND) + + tp->snd_ssthresh) >> 1); + if (dst_metric(dst, RTAX_SSTHRESH) && + !dst_metric_locked(dst, RTAX_SSTHRESH) && + tp->snd_ssthresh > dst_metric(dst, RTAX_SSTHRESH)) + dst_metric_set(dst, RTAX_SSTHRESH, tp->snd_ssthresh); + } + + if (!dst_metric_locked(dst, RTAX_REORDERING)) { + if (dst_metric(dst, RTAX_REORDERING) < tp->reordering && + tp->reordering != sysctl_tcp_reordering) + dst_metric_set(dst, RTAX_REORDERING, tp->reordering); + } + } +} + +/* Initialize metrics on socket. */ + +void tcp_init_metrics(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct dst_entry *dst = __sk_dst_get(sk); + + if (dst == NULL) + goto reset; + + dst_confirm(dst); + + if (dst_metric_locked(dst, RTAX_CWND)) + tp->snd_cwnd_clamp = dst_metric(dst, RTAX_CWND); + if (dst_metric(dst, RTAX_SSTHRESH)) { + tp->snd_ssthresh = dst_metric(dst, RTAX_SSTHRESH); + if (tp->snd_ssthresh > tp->snd_cwnd_clamp) + tp->snd_ssthresh = tp->snd_cwnd_clamp; + } else { + /* ssthresh may have been reduced unnecessarily during. + * 3WHS. Restore it back to its initial default. + */ + tp->snd_ssthresh = TCP_INFINITE_SSTHRESH; + } + if (dst_metric(dst, RTAX_REORDERING) && + tp->reordering != dst_metric(dst, RTAX_REORDERING)) { + tcp_disable_fack(tp); + tcp_disable_early_retrans(tp); + tp->reordering = dst_metric(dst, RTAX_REORDERING); + } + + if (dst_metric(dst, RTAX_RTT) == 0 || tp->srtt == 0) + goto reset; + + /* Initial rtt is determined from SYN,SYN-ACK. + * The segment is small and rtt may appear much + * less than real one. Use per-dst memory + * to make it more realistic. + * + * A bit of theory. RTT is time passed after "normal" sized packet + * is sent until it is ACKed. In normal circumstances sending small + * packets force peer to delay ACKs and calculation is correct too. + * The algorithm is adaptive and, provided we follow specs, it + * NEVER underestimate RTT. BUT! If peer tries to make some clever + * tricks sort of "quick acks" for time long enough to decrease RTT + * to low value, and then abruptly stops to do it and starts to delay + * ACKs, wait for troubles. + */ + if (dst_metric_rtt(dst, RTAX_RTT) > tp->srtt) { + tp->srtt = dst_metric_rtt(dst, RTAX_RTT); + tp->rtt_seq = tp->snd_nxt; + } + if (dst_metric_rtt(dst, RTAX_RTTVAR) > tp->mdev) { + tp->mdev = dst_metric_rtt(dst, RTAX_RTTVAR); + tp->mdev_max = tp->rttvar = max(tp->mdev, tcp_rto_min(sk)); + } + tcp_set_rto(sk); +reset: + if (tp->srtt == 0) { + /* RFC6298: 5.7 We've failed to get a valid RTT sample from + * 3WHS. This is most likely due to retransmission, + * including spurious one. Reset the RTO back to 3secs + * from the more aggressive 1sec to avoid more spurious + * retransmission. + */ + tp->mdev = tp->mdev_max = tp->rttvar = TCP_TIMEOUT_FALLBACK; + inet_csk(sk)->icsk_rto = TCP_TIMEOUT_FALLBACK; + } + /* Cut cwnd down to 1 per RFC5681 if SYN or SYN-ACK has been + * retransmitted. In light of RFC6298 more aggressive 1sec + * initRTO, we only reset cwnd when more than 1 SYN/SYN-ACK + * retransmission has occurred. + */ + if (tp->total_retrans > 1) + tp->snd_cwnd = 1; + else + tp->snd_cwnd = tcp_init_cwnd(tp, dst); + tp->snd_cwnd_stamp = tcp_time_stamp; +} -- cgit v1.2.3 From a6df1ae9383697c4eb1365176002f154982325ad Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 16 Jul 2012 01:41:36 +0000 Subject: tcp: add OFO snmp counters Add three SNMP TCP counters, to better track TCP behavior at global stage (netstat -s), when packets are received Out Of Order (OFO) TCPOFOQueue : Number of packets queued in OFO queue TCPOFODrop : Number of packets meant to be queued in OFO but dropped because socket rcvbuf limit hit. TCPOFOMerge : Number of packets in OFO that were merged with other packets. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/linux/snmp.h | 5 ++++- net/ipv4/proc.c | 3 +++ net/ipv4/tcp_input.c | 7 +++++-- 3 files changed, 12 insertions(+), 3 deletions(-) (limited to 'net/ipv4/tcp_input.c') diff --git a/include/linux/snmp.h b/include/linux/snmp.h index 2e68f5ba0389..6e4c51123828 100644 --- a/include/linux/snmp.h +++ b/include/linux/snmp.h @@ -233,7 +233,10 @@ enum LINUX_MIB_TCPREQQFULLDOCOOKIES, /* TCPReqQFullDoCookies */ LINUX_MIB_TCPREQQFULLDROP, /* TCPReqQFullDrop */ LINUX_MIB_TCPRETRANSFAIL, /* TCPRetransFail */ - LINUX_MIB_TCPRCVCOALESCE, /* TCPRcvCoalesce */ + LINUX_MIB_TCPRCVCOALESCE, /* TCPRcvCoalesce */ + LINUX_MIB_TCPOFOQUEUE, /* TCPOFOQueue */ + LINUX_MIB_TCPOFODROP, /* TCPOFODrop */ + LINUX_MIB_TCPOFOMERGE, /* TCPOFOMerge */ __LINUX_MIB_MAX }; diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c index 8af0d44e4e22..dae25e7622cf 100644 --- a/net/ipv4/proc.c +++ b/net/ipv4/proc.c @@ -258,6 +258,9 @@ static const struct snmp_mib snmp4_net_list[] = { SNMP_MIB_ITEM("TCPReqQFullDrop", LINUX_MIB_TCPREQQFULLDROP), SNMP_MIB_ITEM("TCPRetransFail", LINUX_MIB_TCPRETRANSFAIL), SNMP_MIB_ITEM("TCPRcvCoalesce", LINUX_MIB_TCPRCVCOALESCE), + SNMP_MIB_ITEM("TCPOFOQueue", LINUX_MIB_TCPOFOQUEUE), + SNMP_MIB_ITEM("TCPOFODrop", LINUX_MIB_TCPOFODROP), + SNMP_MIB_ITEM("TCPOFOMerge", LINUX_MIB_TCPOFOMERGE), SNMP_MIB_SENTINEL }; diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 055ac49b8b40..cc4e12f1f2f7 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -4397,8 +4397,8 @@ static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb) TCP_ECN_check_ce(tp, skb); - if (tcp_try_rmem_schedule(sk, skb->truesize)) { - /* TODO: should increment a counter */ + if (unlikely(tcp_try_rmem_schedule(sk, skb->truesize))) { + NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPOFODROP); __kfree_skb(skb); return; } @@ -4407,6 +4407,7 @@ static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb) tp->pred_flags = 0; inet_csk_schedule_ack(sk); + NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPOFOQUEUE); SOCK_DEBUG(sk, "out of order segment: rcv_next %X seq %X - %X\n", tp->rcv_nxt, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq); @@ -4460,6 +4461,7 @@ static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb) if (skb1 && before(seq, TCP_SKB_CB(skb1)->end_seq)) { if (!after(end_seq, TCP_SKB_CB(skb1)->end_seq)) { /* All the bits are present. Drop. */ + NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPOFOMERGE); __kfree_skb(skb); skb = NULL; tcp_dsack_set(sk, seq, end_seq); @@ -4498,6 +4500,7 @@ static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb) __skb_unlink(skb1, &tp->out_of_order_queue); tcp_dsack_extend(sk, TCP_SKB_CB(skb1)->seq, TCP_SKB_CB(skb1)->end_seq); + NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPOFOMERGE); __kfree_skb(skb1); } -- cgit v1.2.3 From 282f23c6ee343126156dd41218b22ece96d747e3 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 17 Jul 2012 10:13:05 +0200 Subject: tcp: implement RFC 5961 3.2 Implement the RFC 5691 mitigation against Blind Reset attack using RST bit. Idea is to validate incoming RST sequence, to match RCV.NXT value, instead of previouly accepted window : (RCV.NXT <= SEG.SEQ < RCV.NXT+RCV.WND) If sequence is in window but not an exact match, send a "challenge ACK", so that the other part can resend an RST with the appropriate sequence. Add a new sysctl, tcp_challenge_ack_limit, to limit number of challenge ACK sent per second. Add a new SNMP counter to count number of challenge acks sent. (netstat -s | grep TCPChallengeACK) Signed-off-by: Eric Dumazet Cc: Kiran Kumar Kella Signed-off-by: David S. Miller --- Documentation/networking/ip-sysctl.txt | 5 +++++ include/linux/snmp.h | 1 + include/net/tcp.h | 1 + net/ipv4/proc.c | 1 + net/ipv4/sysctl_net_ipv4.c | 7 +++++++ net/ipv4/tcp_input.c | 31 ++++++++++++++++++++++++++++++- 6 files changed, 45 insertions(+), 1 deletion(-) (limited to 'net/ipv4/tcp_input.c') diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt index e20c17a7d34e..e1e021594cff 100644 --- a/Documentation/networking/ip-sysctl.txt +++ b/Documentation/networking/ip-sysctl.txt @@ -565,6 +565,11 @@ tcp_limit_output_bytes - INTEGER reduce the size of individual GSO packet (64KB being the max) Default: 131072 +tcp_challenge_ack_limit - INTEGER + Limits number of Challenge ACK sent per second, as recommended + in RFC 5961 (Improving TCP's Robustness to Blind In-Window Attacks) + Default: 100 + UDP variables: udp_mem - vector of 3 INTEGERs: min, pressure, max diff --git a/include/linux/snmp.h b/include/linux/snmp.h index 6e4c51123828..673e0e928b2b 100644 --- a/include/linux/snmp.h +++ b/include/linux/snmp.h @@ -237,6 +237,7 @@ enum LINUX_MIB_TCPOFOQUEUE, /* TCPOFOQueue */ LINUX_MIB_TCPOFODROP, /* TCPOFODrop */ LINUX_MIB_TCPOFOMERGE, /* TCPOFOMerge */ + LINUX_MIB_TCPCHALLENGEACK, /* TCPChallengeACK */ __LINUX_MIB_MAX }; diff --git a/include/net/tcp.h b/include/net/tcp.h index 439984b9af49..85c5090bfe25 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -254,6 +254,7 @@ extern int sysctl_tcp_thin_linear_timeouts; extern int sysctl_tcp_thin_dupack; extern int sysctl_tcp_early_retrans; extern int sysctl_tcp_limit_output_bytes; +extern int sysctl_tcp_challenge_ack_limit; extern atomic_long_t tcp_memory_allocated; extern struct percpu_counter tcp_sockets_allocated; diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c index dae25e7622cf..3e8e78f12a38 100644 --- a/net/ipv4/proc.c +++ b/net/ipv4/proc.c @@ -261,6 +261,7 @@ static const struct snmp_mib snmp4_net_list[] = { SNMP_MIB_ITEM("TCPOFOQueue", LINUX_MIB_TCPOFOQUEUE), SNMP_MIB_ITEM("TCPOFODrop", LINUX_MIB_TCPOFODROP), SNMP_MIB_ITEM("TCPOFOMerge", LINUX_MIB_TCPOFOMERGE), + SNMP_MIB_ITEM("TCPChallengeACK", LINUX_MIB_TCPCHALLENGEACK), SNMP_MIB_SENTINEL }; diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index 70730f7aeafe..3f6a1e762e9c 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -605,6 +605,13 @@ static struct ctl_table ipv4_table[] = { .mode = 0644, .proc_handler = proc_dointvec }, + { + .procname = "tcp_challenge_ack_limit", + .data = &sysctl_tcp_challenge_ack_limit, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec + }, #ifdef CONFIG_NET_DMA { .procname = "tcp_dma_copybreak", diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index cc4e12f1f2f7..c841a8990377 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -88,6 +88,9 @@ int sysctl_tcp_app_win __read_mostly = 31; int sysctl_tcp_adv_win_scale __read_mostly = 1; EXPORT_SYMBOL(sysctl_tcp_adv_win_scale); +/* rfc5961 challenge ack rate limiting */ +int sysctl_tcp_challenge_ack_limit = 100; + int sysctl_tcp_stdurg __read_mostly; int sysctl_tcp_rfc1337 __read_mostly; int sysctl_tcp_max_orphans __read_mostly = NR_FILE; @@ -5247,6 +5250,23 @@ out: } #endif /* CONFIG_NET_DMA */ +static void tcp_send_challenge_ack(struct sock *sk) +{ + /* unprotected vars, we dont care of overwrites */ + static u32 challenge_timestamp; + static unsigned int challenge_count; + u32 now = jiffies / HZ; + + if (now != challenge_timestamp) { + challenge_timestamp = now; + challenge_count = 0; + } + if (++challenge_count <= sysctl_tcp_challenge_ack_limit) { + NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPCHALLENGEACK); + tcp_send_ack(sk); + } +} + /* Does PAWS and seqno based validation of an incoming segment, flags will * play significant role here. */ @@ -5283,7 +5303,16 @@ static int tcp_validate_incoming(struct sock *sk, struct sk_buff *skb, /* Step 2: check RST bit */ if (th->rst) { - tcp_reset(sk); + /* RFC 5961 3.2 : + * If sequence number exactly matches RCV.NXT, then + * RESET the connection + * else + * Send a challenge ACK + */ + if (TCP_SKB_CB(skb)->seq == tp->rcv_nxt) + tcp_reset(sk); + else + tcp_send_challenge_ack(sk); goto discard; } -- cgit v1.2.3 From 0c24604b68fc7810d429d6c3657b6f148270e528 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 17 Jul 2012 01:41:30 +0000 Subject: tcp: implement RFC 5961 4.2 Implement the RFC 5691 mitigation against Blind Reset attack using SYN bit. Section 4.2 of RFC 5961 advises to send a Challenge ACK and drop incoming packet, instead of resetting the session. Add a new SNMP counter to count number of challenge acks sent in response to SYN packets. (netstat -s | grep TCPSYNChallenge) Remove obsolete TCPAbortOnSyn, since we no longer abort a TCP session because of a SYN flag. Signed-off-by: Eric Dumazet Cc: Kiran Kumar Kella Signed-off-by: David S. Miller --- include/linux/snmp.h | 2 +- net/ipv4/proc.c | 2 +- net/ipv4/tcp_input.c | 32 +++++++++++++++----------------- 3 files changed, 17 insertions(+), 19 deletions(-) (limited to 'net/ipv4/tcp_input.c') diff --git a/include/linux/snmp.h b/include/linux/snmp.h index 673e0e928b2b..e5fcbd079e4a 100644 --- a/include/linux/snmp.h +++ b/include/linux/snmp.h @@ -208,7 +208,6 @@ enum LINUX_MIB_TCPDSACKOFOSENT, /* TCPDSACKOfoSent */ LINUX_MIB_TCPDSACKRECV, /* TCPDSACKRecv */ LINUX_MIB_TCPDSACKOFORECV, /* TCPDSACKOfoRecv */ - LINUX_MIB_TCPABORTONSYN, /* TCPAbortOnSyn */ LINUX_MIB_TCPABORTONDATA, /* TCPAbortOnData */ LINUX_MIB_TCPABORTONCLOSE, /* TCPAbortOnClose */ LINUX_MIB_TCPABORTONMEMORY, /* TCPAbortOnMemory */ @@ -238,6 +237,7 @@ enum LINUX_MIB_TCPOFODROP, /* TCPOFODrop */ LINUX_MIB_TCPOFOMERGE, /* TCPOFOMerge */ LINUX_MIB_TCPCHALLENGEACK, /* TCPChallengeACK */ + LINUX_MIB_TCPSYNCHALLENGE, /* TCPSYNChallenge */ __LINUX_MIB_MAX }; diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c index 3e8e78f12a38..2a5240b2ea61 100644 --- a/net/ipv4/proc.c +++ b/net/ipv4/proc.c @@ -232,7 +232,6 @@ static const struct snmp_mib snmp4_net_list[] = { SNMP_MIB_ITEM("TCPDSACKOfoSent", LINUX_MIB_TCPDSACKOFOSENT), SNMP_MIB_ITEM("TCPDSACKRecv", LINUX_MIB_TCPDSACKRECV), SNMP_MIB_ITEM("TCPDSACKOfoRecv", LINUX_MIB_TCPDSACKOFORECV), - SNMP_MIB_ITEM("TCPAbortOnSyn", LINUX_MIB_TCPABORTONSYN), SNMP_MIB_ITEM("TCPAbortOnData", LINUX_MIB_TCPABORTONDATA), SNMP_MIB_ITEM("TCPAbortOnClose", LINUX_MIB_TCPABORTONCLOSE), SNMP_MIB_ITEM("TCPAbortOnMemory", LINUX_MIB_TCPABORTONMEMORY), @@ -262,6 +261,7 @@ static const struct snmp_mib snmp4_net_list[] = { SNMP_MIB_ITEM("TCPOFODrop", LINUX_MIB_TCPOFODROP), SNMP_MIB_ITEM("TCPOFOMerge", LINUX_MIB_TCPOFOMERGE), SNMP_MIB_ITEM("TCPChallengeACK", LINUX_MIB_TCPCHALLENGEACK), + SNMP_MIB_ITEM("TCPSYNChallenge", LINUX_MIB_TCPSYNCHALLENGE), SNMP_MIB_SENTINEL }; diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index c841a8990377..8aaec5536111 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -5270,8 +5270,8 @@ static void tcp_send_challenge_ack(struct sock *sk) /* Does PAWS and seqno based validation of an incoming segment, flags will * play significant role here. */ -static int tcp_validate_incoming(struct sock *sk, struct sk_buff *skb, - const struct tcphdr *th, int syn_inerr) +static bool tcp_validate_incoming(struct sock *sk, struct sk_buff *skb, + const struct tcphdr *th, int syn_inerr) { const u8 *hash_location; struct tcp_sock *tp = tcp_sk(sk); @@ -5323,20 +5323,22 @@ static int tcp_validate_incoming(struct sock *sk, struct sk_buff *skb, /* step 3: check security and precedence [ignored] */ - /* step 4: Check for a SYN in window. */ - if (th->syn && !before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) { + /* step 4: Check for a SYN + * RFC 5691 4.2 : Send a challenge ack + */ + if (th->syn) { if (syn_inerr) TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_INERRS); - NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPABORTONSYN); - tcp_reset(sk); - return -1; + NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPSYNCHALLENGE); + tcp_send_challenge_ack(sk); + goto discard; } - return 1; + return true; discard: __kfree_skb(skb); - return 0; + return false; } /* @@ -5366,7 +5368,6 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb, const struct tcphdr *th, unsigned int len) { struct tcp_sock *tp = tcp_sk(sk); - int res; if (sk->sk_rx_dst) { struct dst_entry *dst = sk->sk_rx_dst; @@ -5555,9 +5556,8 @@ slow_path: * Standard slow path. */ - res = tcp_validate_incoming(sk, skb, th, 1); - if (res <= 0) - return -res; + if (!tcp_validate_incoming(sk, skb, th, 1)) + return 0; step5: if (th->ack && tcp_ack(sk, skb, FLAG_SLOWPATH) < 0) @@ -5877,7 +5877,6 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, struct tcp_sock *tp = tcp_sk(sk); struct inet_connection_sock *icsk = inet_csk(sk); int queued = 0; - int res; tp->rx_opt.saw_tstamp = 0; @@ -5932,9 +5931,8 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, return 0; } - res = tcp_validate_incoming(sk, skb, th, 0); - if (res <= 0) - return -res; + if (!tcp_validate_incoming(sk, skb, th, 0)) + return 0; /* step 5: check the ACK field */ if (th->ack) { -- cgit v1.2.3 From e371589917011efe6ff8c7dfb4e9e81934ac5855 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 17 Jul 2012 12:29:30 +0000 Subject: tcp: refine SYN handling in tcp_validate_incoming Followup of commit 0c24604b68fc (tcp: implement RFC 5961 4.2) As reported by Vijay Subramanian, we should send a challenge ACK instead of a dup ack if a SYN flag is set on a packet received out of window. This permits the ratelimiting to work as intended, and to increase correct SNMP counters. Suggested-by: Vijay Subramanian Signed-off-by: Eric Dumazet Acked-by: Vijay Subramanian Cc: Kiran Kumar Kella Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'net/ipv4/tcp_input.c') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 8aaec5536111..fdd49f1b7a52 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -5296,8 +5296,11 @@ static bool tcp_validate_incoming(struct sock *sk, struct sk_buff *skb, * an acknowledgment should be sent in reply (unless the RST * bit is set, if so drop the segment and return)". */ - if (!th->rst) + if (!th->rst) { + if (th->syn) + goto syn_challenge; tcp_send_dupack(sk, skb); + } goto discard; } @@ -5327,6 +5330,7 @@ static bool tcp_validate_incoming(struct sock *sk, struct sk_buff *skb, * RFC 5691 4.2 : Send a challenge ack */ if (th->syn) { +syn_challenge: if (syn_inerr) TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_INERRS); NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPSYNCHALLENGE); -- cgit v1.2.3 From 2100c8d2d9db23c0a09901a782bb4e3b21bee298 Mon Sep 17 00:00:00 2001 From: Yuchung Cheng Date: Thu, 19 Jul 2012 06:43:05 +0000 Subject: net-tcp: Fast Open base This patch impelements the common code for both the client and server. 1. TCP Fast Open option processing. Since Fast Open does not have an option number assigned by IANA yet, it shares the experiment option code 254 by implementing draft-ietf-tcpm-experimental-options with a 16 bits magic number 0xF989. This enables global experiments without clashing the scarce(2) experimental options available for TCP. When the draft status becomes standard (maybe), the client should switch to the new option number assigned while the server supports both numbers for transistion. 2. The new sysctl tcp_fastopen 3. A place holder init function Signed-off-by: Yuchung Cheng Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- include/linux/tcp.h | 10 ++++++++++ include/net/tcp.h | 9 ++++++++- net/ipv4/Makefile | 2 +- net/ipv4/syncookies.c | 2 +- net/ipv4/sysctl_net_ipv4.c | 7 +++++++ net/ipv4/tcp_fastopen.c | 11 +++++++++++ net/ipv4/tcp_input.c | 26 ++++++++++++++++++++++---- net/ipv4/tcp_ipv4.c | 2 +- net/ipv4/tcp_minisocks.c | 4 ++-- net/ipv4/tcp_output.c | 25 +++++++++++++++++++++---- net/ipv6/syncookies.c | 2 +- net/ipv6/tcp_ipv6.c | 2 +- 12 files changed, 86 insertions(+), 16 deletions(-) create mode 100644 net/ipv4/tcp_fastopen.c (limited to 'net/ipv4/tcp_input.c') diff --git a/include/linux/tcp.h b/include/linux/tcp.h index 1888169e07c7..12948f543839 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -243,6 +243,16 @@ static inline unsigned int tcp_optlen(const struct sk_buff *skb) return (tcp_hdr(skb)->doff - 5) * 4; } +/* TCP Fast Open */ +#define TCP_FASTOPEN_COOKIE_MIN 4 /* Min Fast Open Cookie size in bytes */ +#define TCP_FASTOPEN_COOKIE_MAX 16 /* Max Fast Open Cookie size in bytes */ + +/* TCP Fast Open Cookie as stored in memory */ +struct tcp_fastopen_cookie { + s8 len; + u8 val[TCP_FASTOPEN_COOKIE_MAX]; +}; + /* This defines a selective acknowledgement block. */ struct tcp_sack_block_wire { __be32 start_seq; diff --git a/include/net/tcp.h b/include/net/tcp.h index 85c5090bfe25..5aed3718fde8 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -170,6 +170,11 @@ extern void tcp_time_wait(struct sock *sk, int state, int timeo); #define TCPOPT_TIMESTAMP 8 /* Better RTT estimations/PAWS */ #define TCPOPT_MD5SIG 19 /* MD5 Signature (RFC2385) */ #define TCPOPT_COOKIE 253 /* Cookie extension (experimental) */ +#define TCPOPT_EXP 254 /* Experimental */ +/* Magic number to be after the option value for sharing TCP + * experimental options. See draft-ietf-tcpm-experimental-options-00.txt + */ +#define TCPOPT_FASTOPEN_MAGIC 0xF989 /* * TCP option lengths @@ -180,6 +185,7 @@ extern void tcp_time_wait(struct sock *sk, int state, int timeo); #define TCPOLEN_SACK_PERM 2 #define TCPOLEN_TIMESTAMP 10 #define TCPOLEN_MD5SIG 18 +#define TCPOLEN_EXP_FASTOPEN_BASE 4 #define TCPOLEN_COOKIE_BASE 2 /* Cookie-less header extension */ #define TCPOLEN_COOKIE_PAIR 3 /* Cookie pair header extension */ #define TCPOLEN_COOKIE_MIN (TCPOLEN_COOKIE_BASE+TCP_COOKIE_MIN) @@ -222,6 +228,7 @@ extern int sysctl_tcp_retries1; extern int sysctl_tcp_retries2; extern int sysctl_tcp_orphan_retries; extern int sysctl_tcp_syncookies; +extern int sysctl_tcp_fastopen; extern int sysctl_tcp_retrans_collapse; extern int sysctl_tcp_stdurg; extern int sysctl_tcp_rfc1337; @@ -418,7 +425,7 @@ extern int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, size_t len, int nonblock, int flags, int *addr_len); extern void tcp_parse_options(const struct sk_buff *skb, struct tcp_options_received *opt_rx, const u8 **hvpp, - int estab); + int estab, struct tcp_fastopen_cookie *foc); extern const u8 *tcp_parse_md5sig_option(const struct tcphdr *th); /* diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile index a677d804e53e..ae2ccf2890e4 100644 --- a/net/ipv4/Makefile +++ b/net/ipv4/Makefile @@ -7,7 +7,7 @@ obj-y := route.o inetpeer.o protocol.o \ ip_output.o ip_sockglue.o inet_hashtables.o \ inet_timewait_sock.o inet_connection_sock.o \ tcp.o tcp_input.o tcp_output.o tcp_timer.o tcp_ipv4.o \ - tcp_minisocks.o tcp_cong.o tcp_metrics.o \ + tcp_minisocks.o tcp_cong.o tcp_metrics.o tcp_fastopen.o \ datagram.o raw.o udp.o udplite.o \ arp.o icmp.o devinet.o af_inet.o igmp.o \ fib_frontend.o fib_semantics.o fib_trie.o \ diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c index eab2a7fb15d1..650e1528e1e6 100644 --- a/net/ipv4/syncookies.c +++ b/net/ipv4/syncookies.c @@ -293,7 +293,7 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb, /* check for timestamp cookie support */ memset(&tcp_opt, 0, sizeof(tcp_opt)); - tcp_parse_options(skb, &tcp_opt, &hash_location, 0); + tcp_parse_options(skb, &tcp_opt, &hash_location, 0, NULL); if (!cookie_check_timestamp(&tcp_opt, &ecn_ok)) goto out; diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index 3f6a1e762e9c..5840c3255721 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -366,6 +366,13 @@ static struct ctl_table ipv4_table[] = { .proc_handler = proc_dointvec }, #endif + { + .procname = "tcp_fastopen", + .data = &sysctl_tcp_fastopen, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, { .procname = "tcp_tw_recycle", .data = &tcp_death_row.sysctl_tw_recycle, diff --git a/net/ipv4/tcp_fastopen.c b/net/ipv4/tcp_fastopen.c new file mode 100644 index 000000000000..a7f729c409d7 --- /dev/null +++ b/net/ipv4/tcp_fastopen.c @@ -0,0 +1,11 @@ +#include +#include + +int sysctl_tcp_fastopen; + +static int __init tcp_fastopen_init(void) +{ + return 0; +} + +late_initcall(tcp_fastopen_init); diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index fdd49f1b7a52..a06bb8959e7e 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -3732,7 +3732,8 @@ old_ack: * the fast version below fails. */ void tcp_parse_options(const struct sk_buff *skb, struct tcp_options_received *opt_rx, - const u8 **hvpp, int estab) + const u8 **hvpp, int estab, + struct tcp_fastopen_cookie *foc) { const unsigned char *ptr; const struct tcphdr *th = tcp_hdr(skb); @@ -3839,8 +3840,25 @@ void tcp_parse_options(const struct sk_buff *skb, struct tcp_options_received *o break; } break; - } + case TCPOPT_EXP: + /* Fast Open option shares code 254 using a + * 16 bits magic number. It's valid only in + * SYN or SYN-ACK with an even size. + */ + if (opsize < TCPOLEN_EXP_FASTOPEN_BASE || + get_unaligned_be16(ptr) != TCPOPT_FASTOPEN_MAGIC || + foc == NULL || !th->syn || (opsize & 1)) + break; + foc->len = opsize - TCPOLEN_EXP_FASTOPEN_BASE; + if (foc->len >= TCP_FASTOPEN_COOKIE_MIN && + foc->len <= TCP_FASTOPEN_COOKIE_MAX) + memcpy(foc->val, ptr + 2, foc->len); + else if (foc->len != 0) + foc->len = -1; + break; + + } ptr += opsize-2; length -= opsize; } @@ -3882,7 +3900,7 @@ static bool tcp_fast_parse_options(const struct sk_buff *skb, if (tcp_parse_aligned_timestamp(tp, th)) return true; } - tcp_parse_options(skb, &tp->rx_opt, hvpp, 1); + tcp_parse_options(skb, &tp->rx_opt, hvpp, 1, NULL); return true; } @@ -5637,7 +5655,7 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb, struct tcp_cookie_values *cvp = tp->cookie_values; int saved_clamp = tp->rx_opt.mss_clamp; - tcp_parse_options(skb, &tp->rx_opt, &hash_location, 0); + tcp_parse_options(skb, &tp->rx_opt, &hash_location, 0, NULL); if (th->ack) { /* rfc793: diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index d7d2fa50f07f..01aa77a97020 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -1307,7 +1307,7 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb) tcp_clear_options(&tmp_opt); tmp_opt.mss_clamp = TCP_MSS_DEFAULT; tmp_opt.user_mss = tp->rx_opt.user_mss; - tcp_parse_options(skb, &tmp_opt, &hash_location, 0); + tcp_parse_options(skb, &tmp_opt, &hash_location, 0, NULL); if (tmp_opt.cookie_plus > 0 && tmp_opt.saw_tstamp && diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index c66f2ede160e..5912ac3fd240 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -97,7 +97,7 @@ tcp_timewait_state_process(struct inet_timewait_sock *tw, struct sk_buff *skb, tmp_opt.saw_tstamp = 0; if (th->doff > (sizeof(*th) >> 2) && tcptw->tw_ts_recent_stamp) { - tcp_parse_options(skb, &tmp_opt, &hash_location, 0); + tcp_parse_options(skb, &tmp_opt, &hash_location, 0, NULL); if (tmp_opt.saw_tstamp) { tmp_opt.ts_recent = tcptw->tw_ts_recent; @@ -534,7 +534,7 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb, tmp_opt.saw_tstamp = 0; if (th->doff > (sizeof(struct tcphdr)>>2)) { - tcp_parse_options(skb, &tmp_opt, &hash_location, 0); + tcp_parse_options(skb, &tmp_opt, &hash_location, 0, NULL); if (tmp_opt.saw_tstamp) { tmp_opt.ts_recent = req->ts_recent; diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 15a7c7bc3e58..4849be76ccd6 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -385,15 +385,17 @@ static inline bool tcp_urg_mode(const struct tcp_sock *tp) #define OPTION_MD5 (1 << 2) #define OPTION_WSCALE (1 << 3) #define OPTION_COOKIE_EXTENSION (1 << 4) +#define OPTION_FAST_OPEN_COOKIE (1 << 8) struct tcp_out_options { - u8 options; /* bit field of OPTION_* */ + u16 options; /* bit field of OPTION_* */ + u16 mss; /* 0 to disable */ u8 ws; /* window scale, 0 to disable */ u8 num_sack_blocks; /* number of SACK blocks to include */ u8 hash_size; /* bytes in hash_location */ - u16 mss; /* 0 to disable */ - __u32 tsval, tsecr; /* need to include OPTION_TS */ __u8 *hash_location; /* temporary pointer, overloaded */ + __u32 tsval, tsecr; /* need to include OPTION_TS */ + struct tcp_fastopen_cookie *fastopen_cookie; /* Fast open cookie */ }; /* The sysctl int routines are generic, so check consistency here. @@ -442,7 +444,7 @@ static u8 tcp_cookie_size_check(u8 desired) static void tcp_options_write(__be32 *ptr, struct tcp_sock *tp, struct tcp_out_options *opts) { - u8 options = opts->options; /* mungable copy */ + u16 options = opts->options; /* mungable copy */ /* Having both authentication and cookies for security is redundant, * and there's certainly not enough room. Instead, the cookie-less @@ -564,6 +566,21 @@ static void tcp_options_write(__be32 *ptr, struct tcp_sock *tp, tp->rx_opt.dsack = 0; } + + if (unlikely(OPTION_FAST_OPEN_COOKIE & options)) { + struct tcp_fastopen_cookie *foc = opts->fastopen_cookie; + + *ptr++ = htonl((TCPOPT_EXP << 24) | + ((TCPOLEN_EXP_FASTOPEN_BASE + foc->len) << 16) | + TCPOPT_FASTOPEN_MAGIC); + + memcpy(ptr, foc->val, foc->len); + if ((foc->len & 3) == 2) { + u8 *align = ((u8 *)ptr) + foc->len; + align[0] = align[1] = TCPOPT_NOP; + } + ptr += (foc->len + 3) >> 2; + } } /* Compute TCP options for SYN packets. This is not the final diff --git a/net/ipv6/syncookies.c b/net/ipv6/syncookies.c index 7bf3cc427c28..bb46061c813a 100644 --- a/net/ipv6/syncookies.c +++ b/net/ipv6/syncookies.c @@ -177,7 +177,7 @@ struct sock *cookie_v6_check(struct sock *sk, struct sk_buff *skb) /* check for timestamp cookie support */ memset(&tcp_opt, 0, sizeof(tcp_opt)); - tcp_parse_options(skb, &tcp_opt, &hash_location, 0); + tcp_parse_options(skb, &tcp_opt, &hash_location, 0, NULL); if (!cookie_check_timestamp(&tcp_opt, &ecn_ok)) goto out; diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index c9dabdd832d7..0302ec3fecfc 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -1033,7 +1033,7 @@ static int tcp_v6_conn_request(struct sock *sk, struct sk_buff *skb) tcp_clear_options(&tmp_opt); tmp_opt.mss_clamp = IPV6_MIN_MTU - sizeof(struct tcphdr) - sizeof(struct ipv6hdr); tmp_opt.user_mss = tp->rx_opt.user_mss; - tcp_parse_options(skb, &tmp_opt, &hash_location, 0); + tcp_parse_options(skb, &tmp_opt, &hash_location, 0, NULL); if (tmp_opt.cookie_plus > 0 && tmp_opt.saw_tstamp && -- cgit v1.2.3 From 8e4178c1c7b52f7c99f5fd22ef7af6b2bff409e3 Mon Sep 17 00:00:00 2001 From: Yuchung Cheng Date: Thu, 19 Jul 2012 06:43:08 +0000 Subject: net-tcp: Fast Open client - receiving SYN-ACK On receiving the SYN-ACK after SYN-data, the client needs to a) update the cached MSS and cookie (if included in SYN-ACK) b) retransmit the data not yet acknowledged by the SYN-ACK in the final ACK of the handshake. Signed-off-by: Yuchung Cheng Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 40 +++++++++++++++++++++++++++++++++++----- 1 file changed, 35 insertions(+), 5 deletions(-) (limited to 'net/ipv4/tcp_input.c') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index a06bb8959e7e..38b6a811edfc 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -5646,6 +5646,34 @@ void tcp_finish_connect(struct sock *sk, struct sk_buff *skb) } } +static bool tcp_rcv_fastopen_synack(struct sock *sk, struct sk_buff *synack, + struct tcp_fastopen_cookie *cookie) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct sk_buff *data = tcp_write_queue_head(sk); + u16 mss = tp->rx_opt.mss_clamp; + + if (mss == tp->rx_opt.user_mss) { + struct tcp_options_received opt; + const u8 *hash_location; + + /* Get original SYNACK MSS value if user MSS sets mss_clamp */ + tcp_clear_options(&opt); + opt.user_mss = opt.mss_clamp = 0; + tcp_parse_options(synack, &opt, &hash_location, 0, NULL); + mss = opt.mss_clamp; + } + + tcp_fastopen_cache_set(sk, mss, cookie); + + if (data) { /* Retransmit unacked data in SYN */ + tcp_retransmit_skb(sk, data); + tcp_rearm_rto(sk); + return true; + } + return false; +} + static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb, const struct tcphdr *th, unsigned int len) { @@ -5653,9 +5681,10 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb, struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); struct tcp_cookie_values *cvp = tp->cookie_values; + struct tcp_fastopen_cookie foc = { .len = -1 }; int saved_clamp = tp->rx_opt.mss_clamp; - tcp_parse_options(skb, &tp->rx_opt, &hash_location, 0, NULL); + tcp_parse_options(skb, &tp->rx_opt, &hash_location, 0, &foc); if (th->ack) { /* rfc793: @@ -5665,11 +5694,9 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb, * If SEG.ACK =< ISS, or SEG.ACK > SND.NXT, send * a reset (unless the RST bit is set, if so drop * the segment and return)" - * - * We do not send data with SYN, so that RFC-correct - * test reduces to: */ - if (TCP_SKB_CB(skb)->ack_seq != tp->snd_nxt) + if (!after(TCP_SKB_CB(skb)->ack_seq, tp->snd_una) || + after(TCP_SKB_CB(skb)->ack_seq, tp->snd_nxt)) goto reset_and_undo; if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr && @@ -5781,6 +5808,9 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb, tcp_finish_connect(sk, skb); + if (tp->syn_fastopen && tcp_rcv_fastopen_synack(sk, skb, &foc)) + return -1; + if (sk->sk_write_pending || icsk->icsk_accept_queue.rskq_defer_accept || icsk->icsk_ack.pingpong) { -- cgit v1.2.3 From aab4874355679c70f93993cf3b3fd74643b9ac33 Mon Sep 17 00:00:00 2001 From: Yuchung Cheng Date: Thu, 19 Jul 2012 06:43:10 +0000 Subject: net-tcp: Fast Open client - detecting SYN-data drops On paths with firewalls dropping SYN with data or experimental TCP options, Fast Open connections will have experience SYN timeout and bad performance. The solution is to track such incidents in the cookie cache and disables Fast Open temporarily. Since only the original SYN includes data and/or Fast Open option, the SYN-ACK has some tell-tale sign (tcp_rcv_fastopen_synack()) to detect such drops. If a path has recurring Fast Open SYN drops, Fast Open is disabled for 2^(recurring_losses) minutes starting from four minutes up to roughly one and half day. sendmsg with MSG_FASTOPEN flag will succeed but it behaves as connect() then write(). Signed-off-by: Yuchung Cheng Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/tcp.h | 6 ++++-- net/ipv4/tcp_input.c | 10 +++++++++- net/ipv4/tcp_metrics.c | 16 +++++++++++++--- net/ipv4/tcp_output.c | 13 +++++++++++-- 4 files changed, 37 insertions(+), 8 deletions(-) (limited to 'net/ipv4/tcp_input.c') diff --git a/include/net/tcp.h b/include/net/tcp.h index c0258100d70c..e07878d246aa 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -409,9 +409,11 @@ extern bool tcp_peer_is_proven(struct request_sock *req, struct dst_entry *dst, extern bool tcp_remember_stamp(struct sock *sk); extern bool tcp_tw_remember_stamp(struct inet_timewait_sock *tw); extern void tcp_fastopen_cache_get(struct sock *sk, u16 *mss, - struct tcp_fastopen_cookie *cookie); + struct tcp_fastopen_cookie *cookie, + int *syn_loss, unsigned long *last_syn_loss); extern void tcp_fastopen_cache_set(struct sock *sk, u16 mss, - struct tcp_fastopen_cookie *cookie); + struct tcp_fastopen_cookie *cookie, + bool syn_lost); extern void tcp_fetch_timewait_stamp(struct sock *sk, struct dst_entry *dst); extern void tcp_disable_fack(struct tcp_sock *tp); extern void tcp_close(struct sock *sk, long timeout); diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 38b6a811edfc..c49a4fc175bd 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -5652,6 +5652,7 @@ static bool tcp_rcv_fastopen_synack(struct sock *sk, struct sk_buff *synack, struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *data = tcp_write_queue_head(sk); u16 mss = tp->rx_opt.mss_clamp; + bool syn_drop; if (mss == tp->rx_opt.user_mss) { struct tcp_options_received opt; @@ -5664,7 +5665,14 @@ static bool tcp_rcv_fastopen_synack(struct sock *sk, struct sk_buff *synack, mss = opt.mss_clamp; } - tcp_fastopen_cache_set(sk, mss, cookie); + /* The SYN-ACK neither has cookie nor acknowledges the data. Presumably + * the remote receives only the retransmitted (regular) SYNs: either + * the original SYN-data or the corresponding SYN-ACK is lost. + */ + syn_drop = (cookie->len <= 0 && data && + inet_csk(sk)->icsk_retransmits); + + tcp_fastopen_cache_set(sk, mss, cookie, syn_drop); if (data) { /* Retransmit unacked data in SYN */ tcp_retransmit_skb(sk, data); diff --git a/net/ipv4/tcp_metrics.c b/net/ipv4/tcp_metrics.c index d02ff3777785..99779ae44f64 100644 --- a/net/ipv4/tcp_metrics.c +++ b/net/ipv4/tcp_metrics.c @@ -32,6 +32,8 @@ enum tcp_metric_index { struct tcp_fastopen_metrics { u16 mss; + u16 syn_loss:10; /* Recurring Fast Open SYN losses */ + unsigned long last_syn_loss; /* Last Fast Open SYN loss */ struct tcp_fastopen_cookie cookie; }; @@ -125,6 +127,7 @@ static void tcpm_suck_dst(struct tcp_metrics_block *tm, struct dst_entry *dst) tm->tcpm_ts = 0; tm->tcpm_ts_stamp = 0; tm->tcpm_fastopen.mss = 0; + tm->tcpm_fastopen.syn_loss = 0; tm->tcpm_fastopen.cookie.len = 0; } @@ -644,7 +647,8 @@ bool tcp_tw_remember_stamp(struct inet_timewait_sock *tw) static DEFINE_SEQLOCK(fastopen_seqlock); void tcp_fastopen_cache_get(struct sock *sk, u16 *mss, - struct tcp_fastopen_cookie *cookie) + struct tcp_fastopen_cookie *cookie, + int *syn_loss, unsigned long *last_syn_loss) { struct tcp_metrics_block *tm; @@ -659,14 +663,15 @@ void tcp_fastopen_cache_get(struct sock *sk, u16 *mss, if (tfom->mss) *mss = tfom->mss; *cookie = tfom->cookie; + *syn_loss = tfom->syn_loss; + *last_syn_loss = *syn_loss ? tfom->last_syn_loss : 0; } while (read_seqretry(&fastopen_seqlock, seq)); } rcu_read_unlock(); } - void tcp_fastopen_cache_set(struct sock *sk, u16 mss, - struct tcp_fastopen_cookie *cookie) + struct tcp_fastopen_cookie *cookie, bool syn_lost) { struct tcp_metrics_block *tm; @@ -679,6 +684,11 @@ void tcp_fastopen_cache_set(struct sock *sk, u16 mss, tfom->mss = mss; if (cookie->len > 0) tfom->cookie = *cookie; + if (syn_lost) { + ++tfom->syn_loss; + tfom->last_syn_loss = jiffies; + } else + tfom->syn_loss = 0; write_sequnlock_bh(&fastopen_seqlock); } rcu_read_unlock(); diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 88693281da4c..c5cfd5ec3184 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -2860,10 +2860,19 @@ static int tcp_send_syn_data(struct sock *sk, struct sk_buff *syn) { struct tcp_sock *tp = tcp_sk(sk); struct tcp_fastopen_request *fo = tp->fastopen_req; - int space, i, err = 0, iovlen = fo->data->msg_iovlen; + int syn_loss = 0, space, i, err = 0, iovlen = fo->data->msg_iovlen; struct sk_buff *syn_data = NULL, *data; + unsigned long last_syn_loss = 0; + + tcp_fastopen_cache_get(sk, &tp->rx_opt.mss_clamp, &fo->cookie, + &syn_loss, &last_syn_loss); + /* Recurring FO SYN losses: revert to regular handshake temporarily */ + if (syn_loss > 1 && + time_before(jiffies, last_syn_loss + (60*HZ << syn_loss))) { + fo->cookie.len = -1; + goto fallback; + } - tcp_fastopen_cache_get(sk, &tp->rx_opt.mss_clamp, &fo->cookie); if (fo->cookie.len <= 0) goto fallback; -- cgit v1.2.3 From 67da22d23fa6f3324e03bcd0580b914b2e4afbf3 Mon Sep 17 00:00:00 2001 From: Yuchung Cheng Date: Thu, 19 Jul 2012 06:43:11 +0000 Subject: net-tcp: Fast Open client - cookie-less mode In trusted networks, e.g., intranet, data-center, the client does not need to use Fast Open cookie to mitigate DoS attacks. In cookie-less mode, sendmsg() with MSG_FASTOPEN flag will send SYN-data regardless of cookie availability. Signed-off-by: Yuchung Cheng Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- Documentation/networking/ip-sysctl.txt | 2 ++ include/linux/tcp.h | 1 + include/net/tcp.h | 1 + net/ipv4/tcp_input.c | 8 ++++++-- net/ipv4/tcp_output.c | 6 +++++- 5 files changed, 15 insertions(+), 3 deletions(-) (limited to 'net/ipv4/tcp_input.c') diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt index 03964e088180..5f3ef7f7fcec 100644 --- a/Documentation/networking/ip-sysctl.txt +++ b/Documentation/networking/ip-sysctl.txt @@ -476,6 +476,8 @@ tcp_fastopen - INTEGER The values (bitmap) are: 1: Enables sending data in the opening SYN on the client + 5: Enables sending data in the opening SYN on the client regardless + of cookie availability. Default: 0 diff --git a/include/linux/tcp.h b/include/linux/tcp.h index 1edf96afab44..9febfb685c33 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -387,6 +387,7 @@ struct tcp_sock { u8 repair_queue; u8 do_early_retrans:1,/* Enable RFC5827 early-retransmit */ early_retrans_delayed:1, /* Delayed ER timer installed */ + syn_data:1, /* SYN includes data */ syn_fastopen:1; /* SYN includes Fast Open option */ /* RTT measurement */ diff --git a/include/net/tcp.h b/include/net/tcp.h index e07878d246aa..bc7c134ec054 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -214,6 +214,7 @@ extern void tcp_time_wait(struct sock *sk, int state, int timeo); /* Bit Flags for sysctl_tcp_fastopen */ #define TFO_CLIENT_ENABLE 1 +#define TFO_CLIENT_NO_COOKIE 4 /* Data in SYN w/o cookie option */ extern struct inet_timewait_death_row tcp_death_row; diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index c49a4fc175bd..e67d685a6c0e 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -5650,7 +5650,7 @@ static bool tcp_rcv_fastopen_synack(struct sock *sk, struct sk_buff *synack, struct tcp_fastopen_cookie *cookie) { struct tcp_sock *tp = tcp_sk(sk); - struct sk_buff *data = tcp_write_queue_head(sk); + struct sk_buff *data = tp->syn_data ? tcp_write_queue_head(sk) : NULL; u16 mss = tp->rx_opt.mss_clamp; bool syn_drop; @@ -5665,6 +5665,9 @@ static bool tcp_rcv_fastopen_synack(struct sock *sk, struct sk_buff *synack, mss = opt.mss_clamp; } + if (!tp->syn_fastopen) /* Ignore an unsolicited cookie */ + cookie->len = -1; + /* The SYN-ACK neither has cookie nor acknowledges the data. Presumably * the remote receives only the retransmitted (regular) SYNs: either * the original SYN-data or the corresponding SYN-ACK is lost. @@ -5816,7 +5819,8 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb, tcp_finish_connect(sk, skb); - if (tp->syn_fastopen && tcp_rcv_fastopen_synack(sk, skb, &foc)) + if ((tp->syn_fastopen || tp->syn_data) && + tcp_rcv_fastopen_synack(sk, skb, &foc)) return -1; if (sk->sk_write_pending || diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index c5cfd5ec3184..27a32acfdb62 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -2864,6 +2864,7 @@ static int tcp_send_syn_data(struct sock *sk, struct sk_buff *syn) struct sk_buff *syn_data = NULL, *data; unsigned long last_syn_loss = 0; + tp->rx_opt.mss_clamp = tp->advmss; /* If MSS is not cached */ tcp_fastopen_cache_get(sk, &tp->rx_opt.mss_clamp, &fo->cookie, &syn_loss, &last_syn_loss); /* Recurring FO SYN losses: revert to regular handshake temporarily */ @@ -2873,7 +2874,9 @@ static int tcp_send_syn_data(struct sock *sk, struct sk_buff *syn) goto fallback; } - if (fo->cookie.len <= 0) + if (sysctl_tcp_fastopen & TFO_CLIENT_NO_COOKIE) + fo->cookie.len = -1; + else if (fo->cookie.len <= 0) goto fallback; /* MSS for SYN-data is based on cached MSS and bounded by PMTU and @@ -2916,6 +2919,7 @@ static int tcp_send_syn_data(struct sock *sk, struct sk_buff *syn) fo->copied = data->len; if (tcp_transmit_skb(sk, syn_data, 0, sk->sk_allocation) == 0) { + tp->syn_data = (fo->copied > 0); NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPFASTOPENACTIVE); goto done; } -- cgit v1.2.3 From 67b95bd78f0de85793bf30835913f6ef784a39b6 Mon Sep 17 00:00:00 2001 From: Vijay Subramanian Date: Thu, 19 Jul 2012 21:32:18 +0000 Subject: tcp: Return bool instead of int where appropriate Applied to a set of static inline functions in tcp_input.c Signed-off-by: Vijay Subramanian Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) (limited to 'net/ipv4/tcp_input.c') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index e67d685a6c0e..21d7f8f3a7a5 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -2521,7 +2521,7 @@ static void tcp_cwnd_down(struct sock *sk, int flag) /* Nothing was retransmitted or returned timestamp is less * than timestamp of the first retransmission. */ -static inline int tcp_packet_delayed(const struct tcp_sock *tp) +static inline bool tcp_packet_delayed(const struct tcp_sock *tp) { return !tp->retrans_stamp || (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr && @@ -2582,7 +2582,7 @@ static void tcp_undo_cwr(struct sock *sk, const bool undo_ssthresh) tp->snd_cwnd_stamp = tcp_time_stamp; } -static inline int tcp_may_undo(const struct tcp_sock *tp) +static inline bool tcp_may_undo(const struct tcp_sock *tp) { return tp->undo_marker && (!tp->undo_retrans || tcp_packet_delayed(tp)); } @@ -3371,13 +3371,13 @@ static void tcp_ack_probe(struct sock *sk) } } -static inline int tcp_ack_is_dubious(const struct sock *sk, const int flag) +static inline bool tcp_ack_is_dubious(const struct sock *sk, const int flag) { return !(flag & FLAG_NOT_DUP) || (flag & FLAG_CA_ALERT) || inet_csk(sk)->icsk_ca_state != TCP_CA_Open; } -static inline int tcp_may_raise_cwnd(const struct sock *sk, const int flag) +static inline bool tcp_may_raise_cwnd(const struct sock *sk, const int flag) { const struct tcp_sock *tp = tcp_sk(sk); return (!(flag & FLAG_ECE) || tp->snd_cwnd < tp->snd_ssthresh) && @@ -3387,7 +3387,7 @@ static inline int tcp_may_raise_cwnd(const struct sock *sk, const int flag) /* Check that window update is acceptable. * The function assumes that snd_una<=ack<=snd_next. */ -static inline int tcp_may_update_window(const struct tcp_sock *tp, +static inline bool tcp_may_update_window(const struct tcp_sock *tp, const u32 ack, const u32 ack_seq, const u32 nwin) { @@ -4006,7 +4006,7 @@ static int tcp_disordered_ack(const struct sock *sk, const struct sk_buff *skb) (s32)(tp->rx_opt.ts_recent - tp->rx_opt.rcv_tsval) <= (inet_csk(sk)->icsk_rto * 1024) / HZ); } -static inline int tcp_paws_discard(const struct sock *sk, +static inline bool tcp_paws_discard(const struct sock *sk, const struct sk_buff *skb) { const struct tcp_sock *tp = tcp_sk(sk); @@ -4028,7 +4028,7 @@ static inline int tcp_paws_discard(const struct sock *sk, * (borrowed from freebsd) */ -static inline int tcp_sequence(const struct tcp_sock *tp, u32 seq, u32 end_seq) +static inline bool tcp_sequence(const struct tcp_sock *tp, u32 seq, u32 end_seq) { return !before(end_seq, tp->rcv_wup) && !after(seq, tp->rcv_nxt + tcp_receive_window(tp)); @@ -5214,7 +5214,7 @@ static __sum16 __tcp_checksum_complete_user(struct sock *sk, return result; } -static inline int tcp_checksum_complete_user(struct sock *sk, +static inline bool tcp_checksum_complete_user(struct sock *sk, struct sk_buff *skb) { return !skb_csum_unnecessary(skb) && -- cgit v1.2.3 From 92101b3b2e3178087127709a556b091dae314e9e Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Mon, 23 Jul 2012 16:29:00 -0700 Subject: ipv4: Prepare for change of rt->rt_iif encoding. Use inet_iif() consistently, and for TCP record the input interface of cached RX dst in inet sock. rt->rt_iif is going to be encoded differently, so that we can legitimately cache input routes in the FIB info more aggressively. When the input interface is "use SKB device index" the rt->rt_iif will be set to zero. This forces us to move the TCP RX dst cache installation into the ipv4 specific code, and as well it should since doing the route caching for ipv6 is pointless at the moment since it is not inspected in the ipv6 input paths yet. Also, remove the unlikely on dst->obsolete, all ipv4 dsts have obsolete set to a non-zero value to force invocation of the check callback. Signed-off-by: David S. Miller --- include/net/inet_sock.h | 1 + net/dccp/ipv4.c | 2 +- net/ipv4/icmp.c | 2 +- net/ipv4/ip_sockglue.c | 5 ++--- net/ipv4/route.c | 2 +- net/ipv4/tcp_input.c | 12 ------------ net/ipv4/tcp_ipv4.c | 24 ++++++++++++++++++------ net/sched/cls_route.c | 2 +- net/sched/em_meta.c | 2 +- net/sctp/protocol.c | 2 +- 10 files changed, 27 insertions(+), 27 deletions(-) (limited to 'net/ipv4/tcp_input.c') diff --git a/include/net/inet_sock.h b/include/net/inet_sock.h index 924d7b98ab60..613cfa401672 100644 --- a/include/net/inet_sock.h +++ b/include/net/inet_sock.h @@ -172,6 +172,7 @@ struct inet_sock { int uc_index; int mc_index; __be32 mc_addr; + int rx_dst_ifindex; struct ip_mc_socklist __rcu *mc_list; struct inet_cork_full cork; }; diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c index 25428d0c50c9..176ecdba4a22 100644 --- a/net/dccp/ipv4.c +++ b/net/dccp/ipv4.c @@ -481,7 +481,7 @@ static struct dst_entry* dccp_v4_route_skb(struct net *net, struct sock *sk, struct rtable *rt; const struct iphdr *iph = ip_hdr(skb); struct flowi4 fl4 = { - .flowi4_oif = skb_rtable(skb)->rt_iif, + .flowi4_oif = inet_iif(skb), .daddr = iph->saddr, .saddr = iph->daddr, .flowi4_tos = RT_CONN_FLAGS(sk), diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c index f2a06beffbd3..f2eccd531746 100644 --- a/net/ipv4/icmp.c +++ b/net/ipv4/icmp.c @@ -571,7 +571,7 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info) rcu_read_lock(); if (rt_is_input_route(rt) && net->ipv4.sysctl_icmp_errors_use_inbound_ifaddr) - dev = dev_get_by_index_rcu(net, rt->rt_iif); + dev = dev_get_by_index_rcu(net, inet_iif(skb_in)); if (dev) saddr = inet_select_addr(dev, 0, RT_SCOPE_LINK); diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c index de29f46f68b0..5eea4a811042 100644 --- a/net/ipv4/ip_sockglue.c +++ b/net/ipv4/ip_sockglue.c @@ -1027,10 +1027,9 @@ e_inval: void ipv4_pktinfo_prepare(struct sk_buff *skb) { struct in_pktinfo *pktinfo = PKTINFO_SKB_CB(skb); - const struct rtable *rt = skb_rtable(skb); - if (rt) { - pktinfo->ipi_ifindex = rt->rt_iif; + if (skb_rtable(skb)) { + pktinfo->ipi_ifindex = inet_iif(skb); pktinfo->ipi_spec_dst.s_addr = fib_compute_spec_dst(skb); } else { pktinfo->ipi_ifindex = 0; diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 34017be87c85..f6be78119396 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -848,7 +848,7 @@ void ip_rt_send_redirect(struct sk_buff *skb) if (log_martians && peer->rate_tokens == ip_rt_redirect_number) net_warn_ratelimited("host %pI4/if%d ignores redirects for %pI4 to %pI4\n", - &ip_hdr(skb)->saddr, rt->rt_iif, + &ip_hdr(skb)->saddr, inet_iif(skb), &ip_hdr(skb)->daddr, &rt->rt_gateway); #endif } diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 21d7f8f3a7a5..3e07a64ca44e 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -5391,18 +5391,6 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb, { struct tcp_sock *tp = tcp_sk(sk); - if (sk->sk_rx_dst) { - struct dst_entry *dst = sk->sk_rx_dst; - if (unlikely(dst->obsolete)) { - if (dst->ops->check(dst, 0) == NULL) { - dst_release(dst); - sk->sk_rx_dst = NULL; - } - } - } - if (unlikely(sk->sk_rx_dst == NULL)) - sk->sk_rx_dst = dst_clone(skb_dst(skb)); - /* * Header prediction. * The code loosely follows the one in the famous diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index bc5432e3c778..3e30548ac32a 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -1618,6 +1618,20 @@ int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb) if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */ sock_rps_save_rxhash(sk, skb); + if (sk->sk_rx_dst) { + struct dst_entry *dst = sk->sk_rx_dst; + if (dst->ops->check(dst, 0) == NULL) { + dst_release(dst); + sk->sk_rx_dst = NULL; + } + } + if (unlikely(sk->sk_rx_dst == NULL)) { + struct inet_sock *icsk = inet_sk(sk); + struct rtable *rt = skb_rtable(skb); + + sk->sk_rx_dst = dst_clone(&rt->dst); + icsk->rx_dst_ifindex = inet_iif(skb); + } if (tcp_rcv_established(sk, skb, tcp_hdr(skb), skb->len)) { rsk = sk; goto reset; @@ -1700,14 +1714,12 @@ void tcp_v4_early_demux(struct sk_buff *skb) skb->destructor = sock_edemux; if (sk->sk_state != TCP_TIME_WAIT) { struct dst_entry *dst = sk->sk_rx_dst; + struct inet_sock *icsk = inet_sk(sk); if (dst) dst = dst_check(dst, 0); - if (dst) { - struct rtable *rt = (struct rtable *) dst; - - if (rt->rt_iif == dev->ifindex) - skb_dst_set_noref(skb, dst); - } + if (dst && + icsk->rx_dst_ifindex == dev->ifindex) + skb_dst_set_noref(skb, dst); } } } diff --git a/net/sched/cls_route.c b/net/sched/cls_route.c index 36fec4227401..44f405cb9aaf 100644 --- a/net/sched/cls_route.c +++ b/net/sched/cls_route.c @@ -143,7 +143,7 @@ static int route4_classify(struct sk_buff *skb, const struct tcf_proto *tp, if (head == NULL) goto old_method; - iif = ((struct rtable *)dst)->rt_iif; + iif = inet_iif(skb); h = route4_fastmap_hash(id, iif); if (id == head->fastmap[h].id && diff --git a/net/sched/em_meta.c b/net/sched/em_meta.c index 4790c696cbce..4ab6e3325573 100644 --- a/net/sched/em_meta.c +++ b/net/sched/em_meta.c @@ -264,7 +264,7 @@ META_COLLECTOR(int_rtiif) if (unlikely(skb_rtable(skb) == NULL)) *err = -1; else - dst->value = skb_rtable(skb)->rt_iif; + dst->value = inet_iif(skb); } /************************************************************************** diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c index 9c90811d1134..1f89c4e69645 100644 --- a/net/sctp/protocol.c +++ b/net/sctp/protocol.c @@ -568,7 +568,7 @@ static void sctp_v4_get_saddr(struct sctp_sock *sk, /* What interface did this skb arrive on? */ static int sctp_v4_skb_iif(const struct sk_buff *skb) { - return skb_rtable(skb)->rt_iif; + return inet_iif(skb); } /* Was this packet marked by Explicit Congestion Notification? */ -- cgit v1.2.3 From 505fbcf035c245a1a42cd80184feecf61ee868dc Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 27 Jul 2012 06:23:40 +0000 Subject: ipv4: fix TCP early demux commit 92101b3b2e317 (ipv4: Prepare for change of rt->rt_iif encoding.) invalidated TCP early demux, because rx_dst_ifindex is not properly initialized and checked. Also remove the use of inet_iif(skb) in favor or skb->skb_iif Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 1 + net/ipv4/tcp_ipv4.c | 14 ++++++-------- net/ipv4/tcp_minisocks.c | 1 + 3 files changed, 8 insertions(+), 8 deletions(-) (limited to 'net/ipv4/tcp_input.c') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 3e07a64ca44e..aa659e825054 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -5603,6 +5603,7 @@ void tcp_finish_connect(struct sock *sk, struct sk_buff *skb) if (skb != NULL) { sk->sk_rx_dst = dst_clone(skb_dst(skb)); + inet_sk(sk)->rx_dst_ifindex = skb->skb_iif; security_inet_conn_established(sk, skb); } diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index b6b07c93924c..2fbd9921253f 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -1620,17 +1620,15 @@ int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb) sock_rps_save_rxhash(sk, skb); if (sk->sk_rx_dst) { struct dst_entry *dst = sk->sk_rx_dst; - if (dst->ops->check(dst, 0) == NULL) { + if (inet_sk(sk)->rx_dst_ifindex != skb->skb_iif || + dst->ops->check(dst, 0) == NULL) { dst_release(dst); sk->sk_rx_dst = NULL; } } if (unlikely(sk->sk_rx_dst == NULL)) { - struct inet_sock *icsk = inet_sk(sk); - struct rtable *rt = skb_rtable(skb); - - sk->sk_rx_dst = dst_clone(&rt->dst); - icsk->rx_dst_ifindex = inet_iif(skb); + sk->sk_rx_dst = dst_clone(skb_dst(skb)); + inet_sk(sk)->rx_dst_ifindex = skb->skb_iif; } if (tcp_rcv_established(sk, skb, tcp_hdr(skb), skb->len)) { rsk = sk; @@ -1709,11 +1707,11 @@ void tcp_v4_early_demux(struct sk_buff *skb) skb->destructor = sock_edemux; if (sk->sk_state != TCP_TIME_WAIT) { struct dst_entry *dst = sk->sk_rx_dst; - struct inet_sock *icsk = inet_sk(sk); + if (dst) dst = dst_check(dst, 0); if (dst && - icsk->rx_dst_ifindex == skb->skb_iif) + inet_sk(sk)->rx_dst_ifindex == skb->skb_iif) skb_dst_set_noref(skb, dst); } } diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index 5912ac3fd240..3f1cc2028edd 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -388,6 +388,7 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req, struct tcp_cookie_values *oldcvp = oldtp->cookie_values; newsk->sk_rx_dst = dst_clone(skb_dst(skb)); + inet_sk(newsk)->rx_dst_ifindex = skb->skb_iif; /* TCP Cookie Transactions require space for the cookie pair, * as it differs for each connection. There is no need to -- cgit v1.2.3 From 59ea33a68a9083ac98515e4861c00e71efdc49a1 Mon Sep 17 00:00:00 2001 From: Jiri Kosina Date: Fri, 27 Jul 2012 10:38:50 +0000 Subject: tcp: perform DMA to userspace only if there is a task waiting for it Back in 2006, commit 1a2449a87b ("[I/OAT]: TCP recv offload to I/OAT") added support for receive offloading to IOAT dma engine if available. The code in tcp_rcv_established() tries to perform early DMA copy if applicable. It however does so without checking whether the userspace task is actually expecting the data in the buffer. This is not a problem under normal circumstances, but there is a corner case where this doesn't work -- and that's when MSG_TRUNC flag to recvmsg() is used. If the IOAT dma engine is not used, the code properly checks whether there is a valid ucopy.task and the socket is owned by userspace, but misses the check in the dmaengine case. This problem can be observed in real trivially -- for example 'tbench' is a good reproducer, as it makes a heavy use of MSG_TRUNC. On systems utilizing IOAT, you will soon find tbench waiting indefinitely in sk_wait_data(), as they have been already early-copied in tcp_rcv_established() using dma engine. This patch introduces the same check we are performing in the simple iovec copy case to the IOAT case as well. It fixes the indefinite recvmsg(MSG_TRUNC) hangs. Signed-off-by: Jiri Kosina Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'net/ipv4/tcp_input.c') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index aa659e825054..a356e1fecf9a 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -5475,7 +5475,9 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb, if (tp->copied_seq == tp->rcv_nxt && len - tcp_header_len <= tp->ucopy.len) { #ifdef CONFIG_NET_DMA - if (tcp_dma_try_early_copy(sk, skb, tcp_header_len)) { + if (tp->ucopy.task == current && + sock_owned_by_user(sk) && + tcp_dma_try_early_copy(sk, skb, tcp_header_len)) { copied_early = 1; eaten = 1; } -- cgit v1.2.3 From 404e0a8b6a55d5e1cd138c6deb1bca9abdf75d8c Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sun, 29 Jul 2012 23:20:37 +0000 Subject: net: ipv4: fix RCU races on dst refcounts commit c6cffba4ffa2 (ipv4: Fix input route performance regression.) added various fatal races with dst refcounts. crashes happen on tcp workloads if routes are added/deleted at the same time. The dst_free() calls from free_fib_info_rcu() are clearly racy. We need instead regular dst refcounting (dst_release()) and make sure dst_release() is aware of RCU grace periods : Add DST_RCU_FREE flag so that dst_release() respects an RCU grace period before dst destruction for cached dst Introduce a new inet_sk_rx_dst_set() helper, using atomic_inc_not_zero() to make sure we dont increase a zero refcount (On a dst currently waiting an rcu grace period before destruction) rt_cache_route() must take a reference on the new cached route, and release it if was not able to install it. With this patch, my machines survive various benchmarks. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/dst.h | 7 +------ include/net/inet_sock.h | 13 +++++++++++++ net/core/dst.c | 26 +++++++++++++++++++++----- net/decnet/dn_route.c | 6 ++++++ net/ipv4/fib_semantics.c | 4 ++-- net/ipv4/route.c | 16 ++++------------ net/ipv4/tcp_input.c | 3 +-- net/ipv4/tcp_ipv4.c | 12 ++++++------ net/ipv4/tcp_minisocks.c | 3 +-- 9 files changed, 55 insertions(+), 35 deletions(-) (limited to 'net/ipv4/tcp_input.c') diff --git a/include/net/dst.h b/include/net/dst.h index baf597890064..31a9fd39edb6 100644 --- a/include/net/dst.h +++ b/include/net/dst.h @@ -61,6 +61,7 @@ struct dst_entry { #define DST_NOPEER 0x0040 #define DST_FAKE_RTABLE 0x0080 #define DST_XFRM_TUNNEL 0x0100 +#define DST_RCU_FREE 0x0200 unsigned short pending_confirm; @@ -382,12 +383,6 @@ static inline void dst_free(struct dst_entry *dst) __dst_free(dst); } -static inline void dst_rcu_free(struct rcu_head *head) -{ - struct dst_entry *dst = container_of(head, struct dst_entry, rcu_head); - dst_free(dst); -} - static inline void dst_confirm(struct dst_entry *dst) { dst->pending_confirm = 1; diff --git a/include/net/inet_sock.h b/include/net/inet_sock.h index 613cfa401672..e3fd34c83ac9 100644 --- a/include/net/inet_sock.h +++ b/include/net/inet_sock.h @@ -249,4 +249,17 @@ static inline __u8 inet_sk_flowi_flags(const struct sock *sk) return flags; } +static inline void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb) +{ + struct dst_entry *dst = skb_dst(skb); + + if (atomic_inc_not_zero(&dst->__refcnt)) { + if (!(dst->flags & DST_RCU_FREE)) + dst->flags |= DST_RCU_FREE; + + sk->sk_rx_dst = dst; + inet_sk(sk)->rx_dst_ifindex = skb->skb_iif; + } +} + #endif /* _INET_SOCK_H */ diff --git a/net/core/dst.c b/net/core/dst.c index 069d51d29414..d9e33ebe170f 100644 --- a/net/core/dst.c +++ b/net/core/dst.c @@ -258,6 +258,15 @@ again: } EXPORT_SYMBOL(dst_destroy); +static void dst_rcu_destroy(struct rcu_head *head) +{ + struct dst_entry *dst = container_of(head, struct dst_entry, rcu_head); + + dst = dst_destroy(dst); + if (dst) + __dst_free(dst); +} + void dst_release(struct dst_entry *dst) { if (dst) { @@ -265,10 +274,14 @@ void dst_release(struct dst_entry *dst) newrefcnt = atomic_dec_return(&dst->__refcnt); WARN_ON(newrefcnt < 0); - if (unlikely(dst->flags & DST_NOCACHE) && !newrefcnt) { - dst = dst_destroy(dst); - if (dst) - __dst_free(dst); + if (unlikely(dst->flags & (DST_NOCACHE | DST_RCU_FREE)) && !newrefcnt) { + if (dst->flags & DST_RCU_FREE) { + call_rcu_bh(&dst->rcu_head, dst_rcu_destroy); + } else { + dst = dst_destroy(dst); + if (dst) + __dst_free(dst); + } } } } @@ -320,11 +333,14 @@ EXPORT_SYMBOL(__dst_destroy_metrics_generic); */ void skb_dst_set_noref(struct sk_buff *skb, struct dst_entry *dst) { + bool hold; + WARN_ON(!rcu_read_lock_held() && !rcu_read_lock_bh_held()); /* If dst not in cache, we must take a reference, because * dst_release() will destroy dst as soon as its refcount becomes zero */ - if (unlikely(dst->flags & DST_NOCACHE)) { + hold = (dst->flags & (DST_NOCACHE | DST_RCU_FREE)) == DST_NOCACHE; + if (unlikely(hold)) { dst_hold(dst); skb_dst_set(skb, dst); } else { diff --git a/net/decnet/dn_route.c b/net/decnet/dn_route.c index 85a3604c87c8..26719779ad8e 100644 --- a/net/decnet/dn_route.c +++ b/net/decnet/dn_route.c @@ -184,6 +184,12 @@ static __inline__ unsigned int dn_hash(__le16 src, __le16 dst) return dn_rt_hash_mask & (unsigned int)tmp; } +static inline void dst_rcu_free(struct rcu_head *head) +{ + struct dst_entry *dst = container_of(head, struct dst_entry, rcu_head); + dst_free(dst); +} + static inline void dnrt_free(struct dn_route *rt) { call_rcu_bh(&rt->dst.rcu_head, dst_rcu_free); diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c index da0cc2e6b250..e55171f184f9 100644 --- a/net/ipv4/fib_semantics.c +++ b/net/ipv4/fib_semantics.c @@ -172,9 +172,9 @@ static void free_fib_info_rcu(struct rcu_head *head) if (nexthop_nh->nh_exceptions) free_nh_exceptions(nexthop_nh); if (nexthop_nh->nh_rth_output) - dst_free(&nexthop_nh->nh_rth_output->dst); + dst_release(&nexthop_nh->nh_rth_output->dst); if (nexthop_nh->nh_rth_input) - dst_free(&nexthop_nh->nh_rth_input->dst); + dst_release(&nexthop_nh->nh_rth_input->dst); } endfor_nexthops(fi); release_net(fi->fib_net); diff --git a/net/ipv4/route.c b/net/ipv4/route.c index fc1a81ca79a7..d6eabcfe8a90 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -1199,11 +1199,6 @@ restart: fnhe->fnhe_stamp = jiffies; } -static inline void rt_free(struct rtable *rt) -{ - call_rcu_bh(&rt->dst.rcu_head, dst_rcu_free); -} - static void rt_cache_route(struct fib_nh *nh, struct rtable *rt) { struct rtable *orig, *prev, **p = &nh->nh_rth_output; @@ -1213,17 +1208,14 @@ static void rt_cache_route(struct fib_nh *nh, struct rtable *rt) orig = *p; + rt->dst.flags |= DST_RCU_FREE; + dst_hold(&rt->dst); prev = cmpxchg(p, orig, rt); if (prev == orig) { if (orig) - rt_free(orig); + dst_release(&orig->dst); } else { - /* Routes we intend to cache in the FIB nexthop have - * the DST_NOCACHE bit clear. However, if we are - * unsuccessful at storing this route into the cache - * we really need to set it. - */ - rt->dst.flags |= DST_NOCACHE; + dst_release(&rt->dst); } } diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index a356e1fecf9a..9be30b039ae3 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -5604,8 +5604,7 @@ void tcp_finish_connect(struct sock *sk, struct sk_buff *skb) tcp_set_state(sk, TCP_ESTABLISHED); if (skb != NULL) { - sk->sk_rx_dst = dst_clone(skb_dst(skb)); - inet_sk(sk)->rx_dst_ifindex = skb->skb_iif; + inet_sk_rx_dst_set(sk, skb); security_inet_conn_established(sk, skb); } diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 2fbd9921253f..7f91e5ac8277 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -1617,19 +1617,19 @@ int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb) #endif if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */ + struct dst_entry *dst = sk->sk_rx_dst; + sock_rps_save_rxhash(sk, skb); - if (sk->sk_rx_dst) { - struct dst_entry *dst = sk->sk_rx_dst; + if (dst) { if (inet_sk(sk)->rx_dst_ifindex != skb->skb_iif || dst->ops->check(dst, 0) == NULL) { dst_release(dst); sk->sk_rx_dst = NULL; } } - if (unlikely(sk->sk_rx_dst == NULL)) { - sk->sk_rx_dst = dst_clone(skb_dst(skb)); - inet_sk(sk)->rx_dst_ifindex = skb->skb_iif; - } + if (unlikely(sk->sk_rx_dst == NULL)) + inet_sk_rx_dst_set(sk, skb); + if (tcp_rcv_established(sk, skb, tcp_hdr(skb), skb->len)) { rsk = sk; goto reset; diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index 3f1cc2028edd..232a90c3ec86 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -387,8 +387,7 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req, struct tcp_sock *oldtp = tcp_sk(sk); struct tcp_cookie_values *oldcvp = oldtp->cookie_values; - newsk->sk_rx_dst = dst_clone(skb_dst(skb)); - inet_sk(newsk)->rx_dst_ifindex = skb->skb_iif; + inet_sk_rx_dst_set(newsk, skb); /* TCP Cookie Transactions require space for the cookie pair, * as it differs for each connection. There is no need to -- cgit v1.2.3 From c76562b6709fee5eff8a6a779be41c0bce661fd7 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Tue, 31 Jul 2012 16:44:41 -0700 Subject: netvm: prevent a stream-specific deadlock This patch series is based on top of "Swap-over-NBD without deadlocking v15" as it depends on the same reservation of PF_MEMALLOC reserves logic. When a user or administrator requires swap for their application, they create a swap partition and file, format it with mkswap and activate it with swapon. In diskless systems this is not an option so if swap if required then swapping over the network is considered. The two likely scenarios are when blade servers are used as part of a cluster where the form factor or maintenance costs do not allow the use of disks and thin clients. The Linux Terminal Server Project recommends the use of the Network Block Device (NBD) for swap but this is not always an option. There is no guarantee that the network attached storage (NAS) device is running Linux or supports NBD. However, it is likely that it supports NFS so there are users that want support for swapping over NFS despite any performance concern. Some distributions currently carry patches that support swapping over NFS but it would be preferable to support it in the mainline kernel. Patch 1 avoids a stream-specific deadlock that potentially affects TCP. Patch 2 is a small modification to SELinux to avoid using PFMEMALLOC reserves. Patch 3 adds three helpers for filesystems to handle swap cache pages. For example, page_file_mapping() returns page->mapping for file-backed pages and the address_space of the underlying swap file for swap cache pages. Patch 4 adds two address_space_operations to allow a filesystem to pin all metadata relevant to a swapfile in memory. Upon successful activation, the swapfile is marked SWP_FILE and the address space operation ->direct_IO is used for writing and ->readpage for reading in swap pages. Patch 5 notes that patch 3 is bolting filesystem-specific-swapfile-support onto the side and that the default handlers have different information to what is available to the filesystem. This patch refactors the code so that there are generic handlers for each of the new address_space operations. Patch 6 adds an API to allow a vector of kernel addresses to be translated to struct pages and pinned for IO. Patch 7 adds support for using highmem pages for swap by kmapping the pages before calling the direct_IO handler. Patch 8 updates NFS to use the helpers from patch 3 where necessary. Patch 9 avoids setting PF_private on PG_swapcache pages within NFS. Patch 10 implements the new swapfile-related address_space operations for NFS and teaches the direct IO handler how to manage kernel addresses. Patch 11 prevents page allocator recursions in NFS by using GFP_NOIO where appropriate. Patch 12 fixes a NULL pointer dereference that occurs when using swap-over-NFS. With the patches applied, it is possible to mount a swapfile that is on an NFS filesystem. Swap performance is not great with a swap stress test taking roughly twice as long to complete than if the swap device was backed by NBD. This patch: netvm: prevent a stream-specific deadlock It could happen that all !SOCK_MEMALLOC sockets have buffered so much data that we're over the global rmem limit. This will prevent SOCK_MEMALLOC buffers from receiving data, which will prevent userspace from running, which is needed to reduce the buffered data. Fix this by exempting the SOCK_MEMALLOC sockets from the rmem limit. Once this change it applied, it is important that sockets that set SOCK_MEMALLOC do not clear the flag until the socket is being torn down. If this happens, a warning is generated and the tokens reclaimed to avoid accounting errors until the bug is fixed. [davem@davemloft.net: Warning about clearing SOCK_MEMALLOC] Signed-off-by: Peter Zijlstra Signed-off-by: Mel Gorman Acked-by: David S. Miller Acked-by: Rik van Riel Cc: Trond Myklebust Cc: Neil Brown Cc: Christoph Hellwig Cc: Mike Christie Cc: Eric B Munson Cc: Sebastian Andrzej Siewior Cc: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/net/sock.h | 8 +++++--- net/caif/caif_socket.c | 2 +- net/core/sock.c | 14 +++++++++++++- net/ipv4/tcp_input.c | 21 +++++++++++---------- net/sctp/ulpevent.c | 3 ++- 5 files changed, 32 insertions(+), 16 deletions(-) (limited to 'net/ipv4/tcp_input.c') diff --git a/include/net/sock.h b/include/net/sock.h index 43a470d40d76..b3730239bf18 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -1329,12 +1329,14 @@ static inline bool sk_wmem_schedule(struct sock *sk, int size) __sk_mem_schedule(sk, size, SK_MEM_SEND); } -static inline bool sk_rmem_schedule(struct sock *sk, int size) +static inline bool +sk_rmem_schedule(struct sock *sk, struct sk_buff *skb, unsigned int size) { if (!sk_has_account(sk)) return true; - return size <= sk->sk_forward_alloc || - __sk_mem_schedule(sk, size, SK_MEM_RECV); + return size<= sk->sk_forward_alloc || + __sk_mem_schedule(sk, size, SK_MEM_RECV) || + skb_pfmemalloc(skb); } static inline void sk_mem_reclaim(struct sock *sk) diff --git a/net/caif/caif_socket.c b/net/caif/caif_socket.c index 78f1cdad5b33..095259f83902 100644 --- a/net/caif/caif_socket.c +++ b/net/caif/caif_socket.c @@ -141,7 +141,7 @@ static int caif_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) err = sk_filter(sk, skb); if (err) return err; - if (!sk_rmem_schedule(sk, skb->truesize) && rx_flow_is_on(cf_sk)) { + if (!sk_rmem_schedule(sk, skb, skb->truesize) && rx_flow_is_on(cf_sk)) { set_rx_flow_off(cf_sk); net_dbg_ratelimited("sending flow OFF due to rmem_schedule\n"); caif_flow_ctrl(sk, CAIF_MODEMCMD_FLOW_OFF_REQ); diff --git a/net/core/sock.c b/net/core/sock.c index 32fdcd2d6e8f..6b654b3ddfda 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -295,6 +295,18 @@ void sk_clear_memalloc(struct sock *sk) sock_reset_flag(sk, SOCK_MEMALLOC); sk->sk_allocation &= ~__GFP_MEMALLOC; static_key_slow_dec(&memalloc_socks); + + /* + * SOCK_MEMALLOC is allowed to ignore rmem limits to ensure forward + * progress of swapping. However, if SOCK_MEMALLOC is cleared while + * it has rmem allocations there is a risk that the user of the + * socket cannot make forward progress due to exceeding the rmem + * limits. By rights, sk_clear_memalloc() should only be called + * on sockets being torn down but warn and reset the accounting if + * that assumption breaks. + */ + if (WARN_ON(sk->sk_forward_alloc)) + sk_mem_reclaim(sk); } EXPORT_SYMBOL_GPL(sk_clear_memalloc); @@ -396,7 +408,7 @@ int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) if (err) return err; - if (!sk_rmem_schedule(sk, skb->truesize)) { + if (!sk_rmem_schedule(sk, skb, skb->truesize)) { atomic_inc(&sk->sk_drops); return -ENOBUFS; } diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index a356e1fecf9a..00b91b4b8665 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -4351,19 +4351,20 @@ static void tcp_ofo_queue(struct sock *sk) static bool tcp_prune_ofo_queue(struct sock *sk); static int tcp_prune_queue(struct sock *sk); -static int tcp_try_rmem_schedule(struct sock *sk, unsigned int size) +static int tcp_try_rmem_schedule(struct sock *sk, struct sk_buff *skb, + unsigned int size) { if (atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf || - !sk_rmem_schedule(sk, size)) { + !sk_rmem_schedule(sk, skb, size)) { if (tcp_prune_queue(sk) < 0) return -1; - if (!sk_rmem_schedule(sk, size)) { + if (!sk_rmem_schedule(sk, skb, size)) { if (!tcp_prune_ofo_queue(sk)) return -1; - if (!sk_rmem_schedule(sk, size)) + if (!sk_rmem_schedule(sk, skb, size)) return -1; } } @@ -4418,7 +4419,7 @@ static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb) TCP_ECN_check_ce(tp, skb); - if (unlikely(tcp_try_rmem_schedule(sk, skb->truesize))) { + if (unlikely(tcp_try_rmem_schedule(sk, skb, skb->truesize))) { NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPOFODROP); __kfree_skb(skb); return; @@ -4552,17 +4553,17 @@ static int __must_check tcp_queue_rcv(struct sock *sk, struct sk_buff *skb, int int tcp_send_rcvq(struct sock *sk, struct msghdr *msg, size_t size) { - struct sk_buff *skb; + struct sk_buff *skb = NULL; struct tcphdr *th; bool fragstolen; - if (tcp_try_rmem_schedule(sk, size + sizeof(*th))) - goto err; - skb = alloc_skb(size + sizeof(*th), sk->sk_allocation); if (!skb) goto err; + if (tcp_try_rmem_schedule(sk, skb, size + sizeof(*th))) + goto err_free; + th = (struct tcphdr *)skb_put(skb, sizeof(*th)); skb_reset_transport_header(skb); memset(th, 0, sizeof(*th)); @@ -4633,7 +4634,7 @@ static void tcp_data_queue(struct sock *sk, struct sk_buff *skb) if (eaten <= 0) { queue_and_out: if (eaten < 0 && - tcp_try_rmem_schedule(sk, skb->truesize)) + tcp_try_rmem_schedule(sk, skb, skb->truesize)) goto drop; eaten = tcp_queue_rcv(sk, skb, 0, &fragstolen); diff --git a/net/sctp/ulpevent.c b/net/sctp/ulpevent.c index 33d894776192..10c018a5b9fe 100644 --- a/net/sctp/ulpevent.c +++ b/net/sctp/ulpevent.c @@ -702,7 +702,8 @@ struct sctp_ulpevent *sctp_ulpevent_make_rcvmsg(struct sctp_association *asoc, if (rx_count >= asoc->base.sk->sk_rcvbuf) { if ((asoc->base.sk->sk_userlocks & SOCK_RCVBUF_LOCK) || - (!sk_rmem_schedule(asoc->base.sk, chunk->skb->truesize))) + (!sk_rmem_schedule(asoc->base.sk, chunk->skb, + chunk->skb->truesize))) goto fail; } -- cgit v1.2.3