From e42e24c3cc072088756d84ef07b492ac2a3ae2e5 Mon Sep 17 00:00:00 2001 From: Matvejchikov Ilya Date: Mon, 24 Jul 2017 16:02:12 +0400 Subject: tcp: remove redundant argument from tcp_rcv_established() The last (4th) argument of tcp_rcv_established() is redundant as it always equals skb->len, and the skb itself is always passed as the 2nd argument. There is no reason to have it. Signed-off-by: Ilya V. Matveychikov Signed-off-by: David S. Miller --- net/ipv6/tcp_ipv6.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/ipv6/tcp_ipv6.c') diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 2521690d62d6..90a32576c3d0 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -1296,7 +1296,7 @@ static int tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb) } } - tcp_rcv_established(sk, skb, tcp_hdr(skb), skb->len); + tcp_rcv_established(sk, skb, tcp_hdr(skb)); if (opt_skb) goto ipv6_pktoptions; return 0; -- cgit v1.2.3 From 3a3a4e3054137c5ff5d4d306ec834f6d25d7f95b Mon Sep 17 00:00:00 2001 From: Julia Lawall Date: Fri, 28 Jul 2017 22:18:57 +0200 Subject: ipv6: constify inet6_protocol structures The inet6_protocol structure is only passed as the first argument to inet6_add_protocol or inet6_del_protocol, both of which declare that argument const. Thus the inet6_protocol structure itself can be const. Also drop __read_mostly where present on the newly const structures. Done with the help of Coccinelle. Signed-off-by: Julia Lawall Signed-off-by: David S. Miller --- net/ipv6/ip6_gre.c | 2 +- net/ipv6/tcp_ipv6.c | 2 +- net/ipv6/udp.c | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) (limited to 'net/ipv6/tcp_ipv6.c') diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c index 67ff2aaf5dcb..33865d67bcb4 100644 --- a/net/ipv6/ip6_gre.c +++ b/net/ipv6/ip6_gre.c @@ -1080,7 +1080,7 @@ static void ip6gre_fb_tunnel_init(struct net_device *dev) } -static struct inet6_protocol ip6gre_protocol __read_mostly = { +static const struct inet6_protocol ip6gre_protocol = { .handler = gre_rcv, .err_handler = ip6gre_err, .flags = INET6_PROTO_NOPOLICY|INET6_PROTO_FINAL, diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 90a32576c3d0..2968a33cca7d 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -1945,7 +1945,7 @@ struct proto tcpv6_prot = { .diag_destroy = tcp_abort, }; -static struct inet6_protocol tcpv6_protocol = { +static const struct inet6_protocol tcpv6_protocol = { .early_demux = tcp_v6_early_demux, .early_demux_handler = tcp_v6_early_demux, .handler = tcp_v6_rcv, diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index 4a3e65626e8b..5f8b8d766c63 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -1448,7 +1448,7 @@ int compat_udpv6_getsockopt(struct sock *sk, int level, int optname, } #endif -static struct inet6_protocol udpv6_protocol = { +static const struct inet6_protocol udpv6_protocol = { .early_demux = udp_v6_early_demux, .early_demux_handler = udp_v6_early_demux, .handler = udpv6_rcv, -- cgit v1.2.3 From e7942d0633c47c791ece6afa038be9cf977226de Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Sun, 30 Jul 2017 03:57:18 +0200 Subject: tcp: remove prequeue support prequeue is a tcp receive optimization that moves part of rx processing from bh to process context. This only works if the socket being processed belongs to a process that is blocked in recv on that socket. In practice, this rarely happens anymore because nowadays servers tend to use an event-driven (epoll) model.
Even normal client applications (web browsers) commonly use many tcp connections in parallel. This has a measurable impact only in netperf (which uses plain recv and thus allows prequeue use) from host to a locally running VM (~4%); however, there were no changes when using netperf between two physical hosts with ixgbe interfaces. Signed-off-by: Florian Westphal Signed-off-by: David S. Miller --- include/linux/tcp.h | 9 ---- include/net/tcp.h | 11 ----- net/ipv4/tcp.c | 105 ----------------------------------------------- net/ipv4/tcp_input.c | 62 ---------------------------- net/ipv4/tcp_ipv4.c | 61 +--------------------------- net/ipv4/tcp_minisocks.c | 1 - net/ipv4/tcp_timer.c | 12 ------ net/ipv6/tcp_ipv6.c | 3 +- 8 files changed, 2 insertions(+), 262 deletions(-) (limited to 'net/ipv6/tcp_ipv6.c') diff --git a/include/linux/tcp.h b/include/linux/tcp.h index 542ca1ae02c4..32fb37cfb0d1 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -192,15 +192,6 @@ struct tcp_sock { struct list_head tsq_node; /* anchor in tsq_tasklet.head list */ - /* Data for direct copy to user */ - struct { - struct sk_buff_head prequeue; - struct task_struct *task; - struct msghdr *msg; - int memory; - int len; - } ucopy; - u32 snd_wl1; /* Sequence for window update */ u32 snd_wnd; /* The window we expect to receive */ u32 max_window; /* Maximal window ever seen from peer */ diff --git a/include/net/tcp.h b/include/net/tcp.h index 12d68335acd4..93f115cfc8f8 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -1244,17 +1244,6 @@ static inline bool tcp_checksum_complete(struct sk_buff *skb) __tcp_checksum_complete(skb); } -/* Prequeue for VJ style copy to user, combined with checksumming. */ - -static inline void tcp_prequeue_init(struct tcp_sock *tp) -{ - tp->ucopy.task = NULL; - tp->ucopy.len = 0; - tp->ucopy.memory = 0; - skb_queue_head_init(&tp->ucopy.prequeue); -} - -bool tcp_prequeue(struct sock *sk, struct sk_buff *skb); bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb); int tcp_filter(struct sock *sk, struct sk_buff *skb); diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 71ce33decd97..62018ea6f45f 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -400,7 +400,6 @@ void tcp_init_sock(struct sock *sk) tp->out_of_order_queue = RB_ROOT; tcp_init_xmit_timers(sk); - tcp_prequeue_init(tp); INIT_LIST_HEAD(&tp->tsq_node); icsk->icsk_rto = TCP_TIMEOUT_INIT; @@ -1525,20 +1524,6 @@ static void tcp_cleanup_rbuf(struct sock *sk, int copied) tcp_send_ack(sk); } -static void tcp_prequeue_process(struct sock *sk) -{ - struct sk_buff *skb; - struct tcp_sock *tp = tcp_sk(sk); - - NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPPREQUEUED); - - while ((skb = __skb_dequeue(&tp->ucopy.prequeue)) != NULL) - sk_backlog_rcv(sk, skb); - - /* Clear memory counter. 
*/ - tp->ucopy.memory = 0; -} - static struct sk_buff *tcp_recv_skb(struct sock *sk, u32 seq, u32 *off) { struct sk_buff *skb; @@ -1671,7 +1656,6 @@ int tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock, int err; int target; /* Read at least this many bytes */ long timeo; - struct task_struct *user_recv = NULL; struct sk_buff *skb, *last; u32 urg_hole = 0; @@ -1806,51 +1790,6 @@ int tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock, tcp_cleanup_rbuf(sk, copied); - if (!sysctl_tcp_low_latency && tp->ucopy.task == user_recv) { - /* Install new reader */ - if (!user_recv && !(flags & (MSG_TRUNC | MSG_PEEK))) { - user_recv = current; - tp->ucopy.task = user_recv; - tp->ucopy.msg = msg; - } - - tp->ucopy.len = len; - - WARN_ON(tp->copied_seq != tp->rcv_nxt && - !(flags & (MSG_PEEK | MSG_TRUNC))); - - /* Ugly... If prequeue is not empty, we have to - * process it before releasing socket, otherwise - * order will be broken at second iteration. - * More elegant solution is required!!! - * - * Look: we have the following (pseudo)queues: - * - * 1. packets in flight - * 2. backlog - * 3. prequeue - * 4. receive_queue - * - * Each queue can be processed only if the next ones - * are empty. At this point we have empty receive_queue. - * But prequeue _can_ be not empty after 2nd iteration, - * when we jumped to start of loop because backlog - * processing added something to receive_queue. - * We cannot release_sock(), because backlog contains - * packets arrived _after_ prequeued ones. - * - * Shortly, algorithm is clear --- to process all - * the queues in order. We could make it more directly, - * requeueing packets from backlog to prequeue, if - * is not empty. It is more elegant, but eats cycles, - * unfortunately. - */ - if (!skb_queue_empty(&tp->ucopy.prequeue)) - goto do_prequeue; - - /* __ Set realtime policy in scheduler __ */ - } - if (copied >= target) { /* Do not sleep, just process backlog. */ release_sock(sk); @@ -1859,31 +1798,6 @@ int tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock, sk_wait_data(sk, &timeo, last); } - if (user_recv) { - int chunk; - - /* __ Restore normal policy in scheduler __ */ - - chunk = len - tp->ucopy.len; - if (chunk != 0) { - NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPDIRECTCOPYFROMBACKLOG, chunk); - len -= chunk; - copied += chunk; - } - - if (tp->rcv_nxt == tp->copied_seq && - !skb_queue_empty(&tp->ucopy.prequeue)) { -do_prequeue: - tcp_prequeue_process(sk); - - chunk = len - tp->ucopy.len; - if (chunk != 0) { - NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPDIRECTCOPYFROMPREQUEUE, chunk); - len -= chunk; - copied += chunk; - } - } - } if ((flags & MSG_PEEK) && (peek_seq - copied - urg_hole != tp->copied_seq)) { net_dbg_ratelimited("TCP(%s:%d): Application bug, race in MSG_PEEK\n", @@ -1955,25 +1869,6 @@ skip_copy: break; } while (len > 0); - if (user_recv) { - if (!skb_queue_empty(&tp->ucopy.prequeue)) { - int chunk; - - tp->ucopy.len = copied > 0 ? len : 0; - - tcp_prequeue_process(sk); - - if (copied > 0 && (chunk = len - tp->ucopy.len) != 0) { - NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPDIRECTCOPYFROMPREQUEUE, chunk); - len -= chunk; - copied += chunk; - } - } - - tp->ucopy.task = NULL; - tp->ucopy.len = 0; - } - /* According to UNIX98, msg_name/msg_namelen are ignored * on connected socket. 
I was just happy when found this 8) --ANK */ diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index adc3f3e9468c..770ce6cb3eca 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -4611,22 +4611,6 @@ static void tcp_data_queue(struct sock *sk, struct sk_buff *skb) goto out_of_window; /* Ok. In sequence. In window. */ - if (tp->ucopy.task == current && - tp->copied_seq == tp->rcv_nxt && tp->ucopy.len && - sock_owned_by_user(sk) && !tp->urg_data) { - int chunk = min_t(unsigned int, skb->len, - tp->ucopy.len); - - __set_current_state(TASK_RUNNING); - - if (!skb_copy_datagram_msg(skb, 0, tp->ucopy.msg, chunk)) { - tp->ucopy.len -= chunk; - tp->copied_seq += chunk; - eaten = (chunk == skb->len); - tcp_rcv_space_adjust(sk); - } - } - if (eaten <= 0) { queue_and_out: if (eaten < 0) { @@ -5186,26 +5170,6 @@ static void tcp_urg(struct sock *sk, struct sk_buff *skb, const struct tcphdr *t } } -static int tcp_copy_to_iovec(struct sock *sk, struct sk_buff *skb, int hlen) -{ - struct tcp_sock *tp = tcp_sk(sk); - int chunk = skb->len - hlen; - int err; - - if (skb_csum_unnecessary(skb)) - err = skb_copy_datagram_msg(skb, hlen, tp->ucopy.msg, chunk); - else - err = skb_copy_and_csum_datagram_msg(skb, hlen, tp->ucopy.msg); - - if (!err) { - tp->ucopy.len -= chunk; - tp->copied_seq += chunk; - tcp_rcv_space_adjust(sk); - } - - return err; -} - /* Accept RST for rcv_nxt - 1 after a FIN. * When tcp connections are abruptly terminated from Mac OSX (via ^C), a * FIN is sent followed by a RST packet. The RST is sent with the same @@ -5446,32 +5410,6 @@ void tcp_rcv_established(struct sock *sk, struct sk_buff *skb, int eaten = 0; bool fragstolen = false; - if (tp->ucopy.task == current && - tp->copied_seq == tp->rcv_nxt && - len - tcp_header_len <= tp->ucopy.len && - sock_owned_by_user(sk)) { - __set_current_state(TASK_RUNNING); - - if (!tcp_copy_to_iovec(sk, skb, tcp_header_len)) { - /* Predicted packet is in window by definition. - * seq == rcv_nxt and rcv_wup <= rcv_nxt. - * Hence, check seq<=rcv_wup reduces to: - */ - if (tcp_header_len == - (sizeof(struct tcphdr) + - TCPOLEN_TSTAMP_ALIGNED) && - tp->rcv_nxt == tp->rcv_wup) - tcp_store_ts_recent(tp); - - tcp_rcv_rtt_measure_ts(sk, skb); - - __skb_pull(skb, tcp_header_len); - tcp_rcv_nxt_update(tp, TCP_SKB_CB(skb)->end_seq); - NET_INC_STATS(sock_net(sk), - LINUX_MIB_TCPHPHITSTOUSER); - eaten = 1; - } - } if (!eaten) { if (tcp_checksum_complete(skb)) goto csum_error; diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 3a19ea28339f..a68eb4577d36 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -1541,61 +1541,6 @@ void tcp_v4_early_demux(struct sk_buff *skb) } } -/* Packet is added to VJ-style prequeue for processing in process - * context, if a reader task is waiting. Apparently, this exciting - * idea (VJ's mail "Re: query about TCP header on tcp-ip" of 07 Sep 93) - * failed somewhere. Latency? Burstiness? Well, at least now we will - * see, why it failed. 8)8) --ANK - * - */ -bool tcp_prequeue(struct sock *sk, struct sk_buff *skb) -{ - struct tcp_sock *tp = tcp_sk(sk); - - if (sysctl_tcp_low_latency || !tp->ucopy.task) - return false; - - if (skb->len <= tcp_hdrlen(skb) && - skb_queue_len(&tp->ucopy.prequeue) == 0) - return false; - - /* Before escaping RCU protected region, we need to take care of skb - * dst. Prequeue is only enabled for established sockets. 
- * For such sockets, we might need the skb dst only to set sk->sk_rx_dst - * Instead of doing full sk_rx_dst validity here, let's perform - * an optimistic check. - */ - if (likely(sk->sk_rx_dst)) - skb_dst_drop(skb); - else - skb_dst_force_safe(skb); - - __skb_queue_tail(&tp->ucopy.prequeue, skb); - tp->ucopy.memory += skb->truesize; - if (skb_queue_len(&tp->ucopy.prequeue) >= 32 || - tp->ucopy.memory + atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf) { - struct sk_buff *skb1; - - BUG_ON(sock_owned_by_user(sk)); - __NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPPREQUEUEDROPPED, - skb_queue_len(&tp->ucopy.prequeue)); - - while ((skb1 = __skb_dequeue(&tp->ucopy.prequeue)) != NULL) - sk_backlog_rcv(sk, skb1); - - tp->ucopy.memory = 0; - } else if (skb_queue_len(&tp->ucopy.prequeue) == 1) { - wake_up_interruptible_sync_poll(sk_sleep(sk), - POLLIN | POLLRDNORM | POLLRDBAND); - if (!inet_csk_ack_scheduled(sk)) - inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK, - (3 * tcp_rto_min(sk)) / 4, - TCP_RTO_MAX); - } - return true; -} -EXPORT_SYMBOL(tcp_prequeue); - bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb) { u32 limit = sk->sk_rcvbuf + sk->sk_sndbuf; @@ -1770,8 +1715,7 @@ process: tcp_segs_in(tcp_sk(sk), skb); ret = 0; if (!sock_owned_by_user(sk)) { - if (!tcp_prequeue(sk, skb)) - ret = tcp_v4_do_rcv(sk, skb); + ret = tcp_v4_do_rcv(sk, skb); } else if (tcp_add_backlog(sk, skb)) { goto discard_and_relse; } @@ -1936,9 +1880,6 @@ void tcp_v4_destroy_sock(struct sock *sk) } #endif - /* Clean prequeue, it must be empty really */ - __skb_queue_purge(&tp->ucopy.prequeue); - /* Clean up a referenced TCP bind bucket. */ if (inet_csk(sk)->icsk_bind_hash) inet_put_port(sk); diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index 0ff83c1637d8..188a6f31356d 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -445,7 +445,6 @@ struct sock *tcp_create_openreq_child(const struct sock *sk, newtp->snd_sml = newtp->snd_una = newtp->snd_nxt = newtp->snd_up = treq->snt_isn + 1; - tcp_prequeue_init(newtp); INIT_LIST_HEAD(&newtp->tsq_node); tcp_init_wl(newtp, treq->rcv_isn); diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index c0feeeef962a..f753f9d2fee3 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c @@ -239,7 +239,6 @@ static int tcp_write_timeout(struct sock *sk) /* Called with BH disabled */ void tcp_delack_timer_handler(struct sock *sk) { - struct tcp_sock *tp = tcp_sk(sk); struct inet_connection_sock *icsk = inet_csk(sk); sk_mem_reclaim_partial(sk); @@ -254,17 +253,6 @@ void tcp_delack_timer_handler(struct sock *sk) } icsk->icsk_ack.pending &= ~ICSK_ACK_TIMER; - if (!skb_queue_empty(&tp->ucopy.prequeue)) { - struct sk_buff *skb; - - __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPSCHEDULERFAILED); - - while ((skb = __skb_dequeue(&tp->ucopy.prequeue)) != NULL) - sk_backlog_rcv(sk, skb); - - tp->ucopy.memory = 0; - } - if (inet_csk_ack_scheduled(sk)) { if (!icsk->icsk_ack.pingpong) { /* Delayed ACK missed: inflate ATO. 
*/ diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 2968a33cca7d..39ee8e7fc4bd 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -1505,8 +1505,7 @@ process: tcp_segs_in(tcp_sk(sk), skb); ret = 0; if (!sock_owned_by_user(sk)) { - if (!tcp_prequeue(sk, skb)) - ret = tcp_v6_do_rcv(sk, skb); + ret = tcp_v6_do_rcv(sk, skb); } else if (tcp_add_backlog(sk, skb)) { goto discard_and_relse; } -- cgit v1.2.3 From 39294c3df2a8dc5ec64513633446511f7e842acd Mon Sep 17 00:00:00 2001 From: Julia Lawall Date: Tue, 1 Aug 2017 18:27:28 +0200 Subject: Revert "ipv6: constify inet6_protocol structures" This reverts commit 3a3a4e3054137c5ff5d4d306ec834f6d25d7f95b. inet6_add_protocol and inet6_del_protocol include casts that remove the effect of the const annotation on their parameter, leading to possible runtime crashes. Reported-by: Eric Dumazet Signed-off-by: Julia Lawall Signed-off-by: David S. Miller --- net/ipv6/ip6_gre.c | 2 +- net/ipv6/tcp_ipv6.c | 2 +- net/ipv6/udp.c | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) (limited to 'net/ipv6/tcp_ipv6.c') diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c index 33865d67bcb4..67ff2aaf5dcb 100644 --- a/net/ipv6/ip6_gre.c +++ b/net/ipv6/ip6_gre.c @@ -1080,7 +1080,7 @@ static void ip6gre_fb_tunnel_init(struct net_device *dev) } -static const struct inet6_protocol ip6gre_protocol = { +static struct inet6_protocol ip6gre_protocol __read_mostly = { .handler = gre_rcv, .err_handler = ip6gre_err, .flags = INET6_PROTO_NOPOLICY|INET6_PROTO_FINAL, diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 39ee8e7fc4bd..ced5dcf37465 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -1944,7 +1944,7 @@ struct proto tcpv6_prot = { .diag_destroy = tcp_abort, }; -static const struct inet6_protocol tcpv6_protocol = { +static struct inet6_protocol tcpv6_protocol = { .early_demux = tcp_v6_early_demux, .early_demux_handler = tcp_v6_early_demux, .handler = tcp_v6_rcv, diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index 5f8b8d766c63..4a3e65626e8b 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -1448,7 +1448,7 @@ int compat_udpv6_getsockopt(struct sock *sk, int level, int optname, } #endif -static const struct inet6_protocol udpv6_protocol = { +static struct inet6_protocol udpv6_protocol = { .early_demux = udp_v6_early_demux, .early_demux_handler = udp_v6_early_demux, .handler = udpv6_rcv, -- cgit v1.2.3 From 4297a0ef085729af98adab9131d128c576ed3044 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Mon, 7 Aug 2017 08:44:21 -0700 Subject: net: ipv6: add second dif to inet6 socket lookups Add a second device index, sdif, to inet6 socket lookups. sdif is the index for ingress devices enslaved to an l3mdev. It allows the lookups to consider the enslaved device as well as the L3 domain when searching for a socket. TCP moves the data in the cb. Prior to tcp_v6_rcv (e.g., early demux), the ingress index is obtained from IP6CB using inet6_sdif; after tcp_v6_rcv, tcp_v6_sdif is used. Signed-off-by: David Ahern Signed-off-by: David S. 
Miller --- include/net/inet6_hashtables.h | 22 +++++++++++++--------- include/net/tcp.h | 10 ++++++++++ net/dccp/ipv6.c | 4 ++-- net/ipv6/inet6_hashtables.c | 28 +++++++++++++++++----------- net/ipv6/tcp_ipv6.c | 13 ++++++++----- net/ipv6/udp.c | 7 ++++--- net/netfilter/xt_TPROXY.c | 4 ++-- 7 files changed, 56 insertions(+), 32 deletions(-) (limited to 'net/ipv6/tcp_ipv6.c') diff --git a/include/net/inet6_hashtables.h b/include/net/inet6_hashtables.h index b87becacd9d3..6e91e38a31da 100644 --- a/include/net/inet6_hashtables.h +++ b/include/net/inet6_hashtables.h @@ -49,7 +49,8 @@ struct sock *__inet6_lookup_established(struct net *net, const struct in6_addr *saddr, const __be16 sport, const struct in6_addr *daddr, - const u16 hnum, const int dif); + const u16 hnum, const int dif, + const int sdif); struct sock *inet6_lookup_listener(struct net *net, struct inet_hashinfo *hashinfo, @@ -57,7 +58,8 @@ struct sock *inet6_lookup_listener(struct net *net, const struct in6_addr *saddr, const __be16 sport, const struct in6_addr *daddr, - const unsigned short hnum, const int dif); + const unsigned short hnum, + const int dif, const int sdif); static inline struct sock *__inet6_lookup(struct net *net, struct inet_hashinfo *hashinfo, @@ -66,24 +68,25 @@ static inline struct sock *__inet6_lookup(struct net *net, const __be16 sport, const struct in6_addr *daddr, const u16 hnum, - const int dif, + const int dif, const int sdif, bool *refcounted) { struct sock *sk = __inet6_lookup_established(net, hashinfo, saddr, - sport, daddr, hnum, dif); + sport, daddr, hnum, + dif, sdif); *refcounted = true; if (sk) return sk; *refcounted = false; return inet6_lookup_listener(net, hashinfo, skb, doff, saddr, sport, - daddr, hnum, dif); + daddr, hnum, dif, sdif); } static inline struct sock *__inet6_lookup_skb(struct inet_hashinfo *hashinfo, struct sk_buff *skb, int doff, const __be16 sport, const __be16 dport, - int iif, + int iif, int sdif, bool *refcounted) { struct sock *sk = skb_steal_sock(skb); @@ -95,7 +98,7 @@ static inline struct sock *__inet6_lookup_skb(struct inet_hashinfo *hashinfo, return __inet6_lookup(dev_net(skb_dst(skb)->dev), hashinfo, skb, doff, &ipv6_hdr(skb)->saddr, sport, &ipv6_hdr(skb)->daddr, ntohs(dport), - iif, refcounted); + iif, sdif, refcounted); } struct sock *inet6_lookup(struct net *net, struct inet_hashinfo *hashinfo, @@ -107,13 +110,14 @@ struct sock *inet6_lookup(struct net *net, struct inet_hashinfo *hashinfo, int inet6_hash(struct sock *sk); #endif /* IS_ENABLED(CONFIG_IPV6) */ -#define INET6_MATCH(__sk, __net, __saddr, __daddr, __ports, __dif) \ +#define INET6_MATCH(__sk, __net, __saddr, __daddr, __ports, __dif, __sdif) \ (((__sk)->sk_portpair == (__ports)) && \ ((__sk)->sk_family == AF_INET6) && \ ipv6_addr_equal(&(__sk)->sk_v6_daddr, (__saddr)) && \ ipv6_addr_equal(&(__sk)->sk_v6_rcv_saddr, (__daddr)) && \ (!(__sk)->sk_bound_dev_if || \ - ((__sk)->sk_bound_dev_if == (__dif))) && \ + ((__sk)->sk_bound_dev_if == (__dif)) || \ + ((__sk)->sk_bound_dev_if == (__sdif))) && \ net_eq(sock_net(__sk), (__net))) #endif /* _INET6_HASHTABLES_H */ diff --git a/include/net/tcp.h b/include/net/tcp.h index 2b89f1ab8552..999f3efe572b 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -827,6 +827,16 @@ static inline int tcp_v6_iif(const struct sk_buff *skb) return l3_slave ? 
skb->skb_iif : TCP_SKB_CB(skb)->header.h6.iif; } + +/* TCP_SKB_CB reference means this can not be used from early demux */ +static inline int tcp_v6_sdif(const struct sk_buff *skb) +{ +#if IS_ENABLED(CONFIG_NET_L3_MASTER_DEV) + if (skb && ipv6_l3mdev_skb(TCP_SKB_CB(skb)->header.h6.flags)) + return TCP_SKB_CB(skb)->header.h6.iif; +#endif + return 0; +} #endif /* TCP_SKB_CB reference means this can not be used from early demux */ diff --git a/net/dccp/ipv6.c b/net/dccp/ipv6.c index 1b58eac8aad3..47a7b59b355e 100644 --- a/net/dccp/ipv6.c +++ b/net/dccp/ipv6.c @@ -89,7 +89,7 @@ static void dccp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, sk = __inet6_lookup_established(net, &dccp_hashinfo, &hdr->daddr, dh->dccph_dport, &hdr->saddr, ntohs(dh->dccph_sport), - inet6_iif(skb)); + inet6_iif(skb), 0); if (!sk) { __ICMP6_INC_STATS(net, __in6_dev_get(skb->dev), @@ -687,7 +687,7 @@ static int dccp_v6_rcv(struct sk_buff *skb) lookup: sk = __inet6_lookup_skb(&dccp_hashinfo, skb, __dccp_hdr_len(dh), dh->dccph_sport, dh->dccph_dport, - inet6_iif(skb), &refcounted); + inet6_iif(skb), 0, &refcounted); if (!sk) { dccp_pr_debug("failed to look up flow ID in table and " "get corresponding socket\n"); diff --git a/net/ipv6/inet6_hashtables.c b/net/ipv6/inet6_hashtables.c index b13b8f93079d..b01858f5deb1 100644 --- a/net/ipv6/inet6_hashtables.c +++ b/net/ipv6/inet6_hashtables.c @@ -56,7 +56,7 @@ struct sock *__inet6_lookup_established(struct net *net, const __be16 sport, const struct in6_addr *daddr, const u16 hnum, - const int dif) + const int dif, const int sdif) { struct sock *sk; const struct hlist_nulls_node *node; @@ -73,12 +73,12 @@ begin: sk_nulls_for_each_rcu(sk, node, &head->chain) { if (sk->sk_hash != hash) continue; - if (!INET6_MATCH(sk, net, saddr, daddr, ports, dif)) + if (!INET6_MATCH(sk, net, saddr, daddr, ports, dif, sdif)) continue; if (unlikely(!refcount_inc_not_zero(&sk->sk_refcnt))) goto out; - if (unlikely(!INET6_MATCH(sk, net, saddr, daddr, ports, dif))) { + if (unlikely(!INET6_MATCH(sk, net, saddr, daddr, ports, dif, sdif))) { sock_gen_put(sk); goto begin; } @@ -96,7 +96,7 @@ EXPORT_SYMBOL(__inet6_lookup_established); static inline int compute_score(struct sock *sk, struct net *net, const unsigned short hnum, const struct in6_addr *daddr, - const int dif, bool exact_dif) + const int dif, const int sdif, bool exact_dif) { int score = -1; @@ -110,9 +110,13 @@ static inline int compute_score(struct sock *sk, struct net *net, score++; } if (sk->sk_bound_dev_if || exact_dif) { - if (sk->sk_bound_dev_if != dif) + bool dev_match = (sk->sk_bound_dev_if == dif || + sk->sk_bound_dev_if == sdif); + + if (exact_dif && !dev_match) return -1; - score++; + if (sk->sk_bound_dev_if && dev_match) + score++; } if (sk->sk_incoming_cpu == raw_smp_processor_id()) score++; @@ -126,7 +130,7 @@ struct sock *inet6_lookup_listener(struct net *net, struct sk_buff *skb, int doff, const struct in6_addr *saddr, const __be16 sport, const struct in6_addr *daddr, - const unsigned short hnum, const int dif) + const unsigned short hnum, const int dif, const int sdif) { unsigned int hash = inet_lhashfn(net, hnum); struct inet_listen_hashbucket *ilb = &hashinfo->listening_hash[hash]; @@ -136,7 +140,7 @@ struct sock *inet6_lookup_listener(struct net *net, u32 phash = 0; sk_for_each(sk, &ilb->head) { - score = compute_score(sk, net, hnum, daddr, dif, exact_dif); + score = compute_score(sk, net, hnum, daddr, dif, sdif, exact_dif); if (score > hiscore) { reuseport = sk->sk_reuseport; if (reuseport) { @@ -171,7 +175,7 
@@ struct sock *inet6_lookup(struct net *net, struct inet_hashinfo *hashinfo, bool refcounted; sk = __inet6_lookup(net, hashinfo, skb, doff, saddr, sport, daddr, - ntohs(dport), dif, &refcounted); + ntohs(dport), dif, 0, &refcounted); if (sk && !refcounted && !refcount_inc_not_zero(&sk->sk_refcnt)) sk = NULL; return sk; @@ -187,8 +191,9 @@ static int __inet6_check_established(struct inet_timewait_death_row *death_row, const struct in6_addr *daddr = &sk->sk_v6_rcv_saddr; const struct in6_addr *saddr = &sk->sk_v6_daddr; const int dif = sk->sk_bound_dev_if; - const __portpair ports = INET_COMBINED_PORTS(inet->inet_dport, lport); struct net *net = sock_net(sk); + const int sdif = l3mdev_master_ifindex_by_index(net, dif); + const __portpair ports = INET_COMBINED_PORTS(inet->inet_dport, lport); const unsigned int hash = inet6_ehashfn(net, daddr, lport, saddr, inet->inet_dport); struct inet_ehash_bucket *head = inet_ehash_bucket(hinfo, hash); @@ -203,7 +208,8 @@ static int __inet6_check_established(struct inet_timewait_death_row *death_row, if (sk2->sk_hash != hash) continue; - if (likely(INET6_MATCH(sk2, net, saddr, daddr, ports, dif))) { + if (likely(INET6_MATCH(sk2, net, saddr, daddr, ports, + dif, sdif))) { if (sk2->sk_state == TCP_TIME_WAIT) { tw = inet_twsk(sk2); if (twsk_unique(sk, sk2, twp)) diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index ced5dcf37465..f776ec4ecf6d 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -350,7 +350,7 @@ static void tcp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, sk = __inet6_lookup_established(net, &tcp_hashinfo, &hdr->daddr, th->dest, &hdr->saddr, ntohs(th->source), - skb->dev->ifindex); + skb->dev->ifindex, inet6_sdif(skb)); if (!sk) { __ICMP6_INC_STATS(net, __in6_dev_get(skb->dev), @@ -918,7 +918,8 @@ static void tcp_v6_send_reset(const struct sock *sk, struct sk_buff *skb) &tcp_hashinfo, NULL, 0, &ipv6h->saddr, th->source, &ipv6h->daddr, - ntohs(th->source), tcp_v6_iif(skb)); + ntohs(th->source), tcp_v6_iif(skb), + tcp_v6_sdif(skb)); if (!sk1) goto out; @@ -1397,6 +1398,7 @@ static void tcp_v6_fill_cb(struct sk_buff *skb, const struct ipv6hdr *hdr, static int tcp_v6_rcv(struct sk_buff *skb) { + int sdif = inet6_sdif(skb); const struct tcphdr *th; const struct ipv6hdr *hdr; bool refcounted; @@ -1430,7 +1432,7 @@ static int tcp_v6_rcv(struct sk_buff *skb) lookup: sk = __inet6_lookup_skb(&tcp_hashinfo, skb, __tcp_hdrlen(th), - th->source, th->dest, inet6_iif(skb), + th->source, th->dest, inet6_iif(skb), sdif, &refcounted); if (!sk) goto no_tcp_socket; @@ -1563,7 +1565,8 @@ do_time_wait: skb, __tcp_hdrlen(th), &ipv6_hdr(skb)->saddr, th->source, &ipv6_hdr(skb)->daddr, - ntohs(th->dest), tcp_v6_iif(skb)); + ntohs(th->dest), tcp_v6_iif(skb), + sdif); if (sk2) { struct inet_timewait_sock *tw = inet_twsk(sk); inet_twsk_deschedule_put(tw); @@ -1610,7 +1613,7 @@ static void tcp_v6_early_demux(struct sk_buff *skb) sk = __inet6_lookup_established(dev_net(skb->dev), &tcp_hashinfo, &hdr->saddr, th->source, &hdr->daddr, ntohs(th->dest), - inet6_iif(skb)); + inet6_iif(skb), inet6_sdif(skb)); if (sk) { skb->sk = sk; skb->destructor = sock_edemux; diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index d96a877798a7..19afcaf4a22e 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -897,7 +897,7 @@ discard: static struct sock *__udp6_lib_demux_lookup(struct net *net, __be16 loc_port, const struct in6_addr *loc_addr, __be16 rmt_port, const struct in6_addr *rmt_addr, - int dif) + int dif, int sdif) { unsigned short hnum = ntohs(loc_port); unsigned 
int hash2 = udp6_portaddr_hash(net, loc_addr, hnum); @@ -908,7 +908,7 @@ static struct sock *__udp6_lib_demux_lookup(struct net *net, udp_portaddr_for_each_entry_rcu(sk, &hslot2->head) { if (sk->sk_state == TCP_ESTABLISHED && - INET6_MATCH(sk, net, rmt_addr, loc_addr, ports, dif)) + INET6_MATCH(sk, net, rmt_addr, loc_addr, ports, dif, sdif)) return sk; /* Only check first socket in chain */ break; @@ -923,6 +923,7 @@ static void udp_v6_early_demux(struct sk_buff *skb) struct sock *sk; struct dst_entry *dst; int dif = skb->dev->ifindex; + int sdif = inet6_sdif(skb); if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct udphdr))) @@ -934,7 +935,7 @@ static void udp_v6_early_demux(struct sk_buff *skb) sk = __udp6_lib_demux_lookup(net, uh->dest, &ipv6_hdr(skb)->daddr, uh->source, &ipv6_hdr(skb)->saddr, - dif); + dif, sdif); else return; diff --git a/net/netfilter/xt_TPROXY.c b/net/netfilter/xt_TPROXY.c index 94fb0fd0c667..ade4c10c28c6 100644 --- a/net/netfilter/xt_TPROXY.c +++ b/net/netfilter/xt_TPROXY.c @@ -195,7 +195,7 @@ nf_tproxy_get_sock_v6(struct net *net, struct sk_buff *skb, int thoff, void *hp, thoff + __tcp_hdrlen(tcph), saddr, sport, daddr, ntohs(dport), - in->ifindex); + in->ifindex, 0); if (sk && !refcount_inc_not_zero(&sk->sk_refcnt)) sk = NULL; @@ -208,7 +208,7 @@ nf_tproxy_get_sock_v6(struct net *net, struct sk_buff *skb, int thoff, void *hp, case NFT_LOOKUP_ESTABLISHED: sk = __inet6_lookup_established(net, &tcp_hashinfo, saddr, sport, daddr, ntohs(dport), - in->ifindex); + in->ifindex, 0); break; default: BUG(); -- cgit v1.2.3 From d624d276d1ddacbcb12ad96832ce0c7b82cd25db Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 14 Aug 2017 17:44:43 -0700 Subject: tcp: fix possible deadlock in TCP stack vs BPF filter Filtering of the ACK packet was not done at the right place: at that point, we have already allocated a child and put it into the accept queue. We absolutely need to call tcp_child_process() to release its spinlock, or we will deadlock at accept() or close() time. Found by the syzkaller team (thanks a lot!). Fixes: 8fac365f63c8 ("tcp: Add a tcp_filter hook before handle ack packet") Signed-off-by: Eric Dumazet Reported-by: Dmitry Vyukov Cc: Chenbo Feng Signed-off-by: David S. 
Miller --- net/ipv4/tcp_ipv4.c | 4 ++-- net/ipv6/tcp_ipv6.c | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'net/ipv6/tcp_ipv6.c') diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index a20e7f03d5f7..e9252c7df809 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -1722,6 +1722,8 @@ process: */ sock_hold(sk); refcounted = true; + if (tcp_filter(sk, skb)) + goto discard_and_relse; nsk = tcp_check_req(sk, skb, req, false); if (!nsk) { reqsk_put(req); @@ -1729,8 +1731,6 @@ process: } if (nsk == sk) { reqsk_put(req); - } else if (tcp_filter(sk, skb)) { - goto discard_and_relse; } else if (tcp_child_process(sk, nsk, skb)) { tcp_v4_send_reset(nsk, skb); goto discard_and_relse; diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 2521690d62d6..206210125fd7 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -1456,6 +1456,8 @@ process: } sock_hold(sk); refcounted = true; + if (tcp_filter(sk, skb)) + goto discard_and_relse; nsk = tcp_check_req(sk, skb, req, false); if (!nsk) { reqsk_put(req); @@ -1464,8 +1466,6 @@ process: if (nsk == sk) { reqsk_put(req); tcp_v6_restore_cb(skb); - } else if (tcp_filter(sk, skb)) { - goto discard_and_relse; } else if (tcp_child_process(sk, nsk, skb)) { tcp_v6_send_reset(nsk, skb); goto discard_and_relse; -- cgit v1.2.3 From 98aaa913b4ed250324429f0a9e6d5f77a3b5276c Mon Sep 17 00:00:00 2001 From: Mike Maloney Date: Tue, 22 Aug 2017 17:08:48 -0400 Subject: tcp: Extend SOF_TIMESTAMPING_RX_SOFTWARE to TCP recvmsg When SOF_TIMESTAMPING_RX_SOFTWARE is enabled for tcp sockets, return the timestamp corresponding to the highest sequence number data returned. Previously, skb->tstamp was overwritten when a TCP packet was placed in the out of order queue. While the packet is in the ooo queue, save the timestamp in the TCP_SKB_CB. This space is shared with the gso_* options, which are only used on the tx path, and a previously unused 4 byte hole. When skbs are coalesced either in the sk_receive_queue or the out_of_order_queue, always choose the timestamp of the appended skb to maintain the invariant of returning the timestamp of the last byte in the recvmsg buffer. Signed-off-by: Mike Maloney Acked-by: Willem de Bruijn Signed-off-by: David S. Miller --- include/net/tcp.h | 9 +++++++- net/ipv4/tcp.c | 65 ++++++++++++++++++++++++++++++++++++++++++++++++++++ net/ipv4/tcp_input.c | 35 ++++++++++++++++++++++++---- net/ipv4/tcp_ipv4.c | 2 ++ net/ipv6/tcp_ipv6.c | 2 ++ 5 files changed, 108 insertions(+), 5 deletions(-) (limited to 'net/ipv6/tcp_ipv6.c') diff --git a/include/net/tcp.h b/include/net/tcp.h index a995004ae946..c614ff135b66 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -774,6 +774,12 @@ struct tcp_skb_cb { u16 tcp_gso_segs; u16 tcp_gso_size; }; + + /* Used to stash the receive timestamp while this skb is in the + * out of order queue, as skb->tstamp is overwritten by the + * rbnode. + */ + ktime_t swtstamp; }; __u8 tcp_flags; /* TCP header flags. (tcp[13]) */ @@ -790,7 +796,8 @@ struct tcp_skb_cb { __u8 ip_dsfield; /* IPv4 tos or IPv6 dsfield */ __u8 txstamp_ack:1, /* Record TX timestamp for ack? */ eor:1, /* Is skb MSG_EOR marked? 
*/ - unused:6; + has_rxtstamp:1, /* SKB has a RX timestamp */ + unused:5; __u32 ack_seq; /* Sequence number ACK'd */ union { struct { diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index d25e3bcca66b..0cce4472b4a1 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -269,6 +269,7 @@ #include #include #include +#include #include #include @@ -1695,6 +1696,61 @@ int tcp_peek_len(struct socket *sock) } EXPORT_SYMBOL(tcp_peek_len); +static void tcp_update_recv_tstamps(struct sk_buff *skb, + struct scm_timestamping *tss) +{ + if (skb->tstamp) + tss->ts[0] = ktime_to_timespec(skb->tstamp); + else + tss->ts[0] = (struct timespec) {0}; + + if (skb_hwtstamps(skb)->hwtstamp) + tss->ts[2] = ktime_to_timespec(skb_hwtstamps(skb)->hwtstamp); + else + tss->ts[2] = (struct timespec) {0}; +} + +/* Similar to __sock_recv_timestamp, but does not require an skb */ +void tcp_recv_timestamp(struct msghdr *msg, const struct sock *sk, + struct scm_timestamping *tss) +{ + struct timeval tv; + bool has_timestamping = false; + + if (tss->ts[0].tv_sec || tss->ts[0].tv_nsec) { + if (sock_flag(sk, SOCK_RCVTSTAMP)) { + if (sock_flag(sk, SOCK_RCVTSTAMPNS)) { + put_cmsg(msg, SOL_SOCKET, SCM_TIMESTAMPNS, + sizeof(tss->ts[0]), &tss->ts[0]); + } else { + tv.tv_sec = tss->ts[0].tv_sec; + tv.tv_usec = tss->ts[0].tv_nsec / 1000; + + put_cmsg(msg, SOL_SOCKET, SCM_TIMESTAMP, + sizeof(tv), &tv); + } + } + + if (sk->sk_tsflags & SOF_TIMESTAMPING_SOFTWARE) + has_timestamping = true; + else + tss->ts[0] = (struct timespec) {0}; + } + + if (tss->ts[2].tv_sec || tss->ts[2].tv_nsec) { + if (sk->sk_tsflags & SOF_TIMESTAMPING_RAW_HARDWARE) + has_timestamping = true; + else + tss->ts[2] = (struct timespec) {0}; + } + + if (has_timestamping) { + tss->ts[1] = (struct timespec) {0}; + put_cmsg(msg, SOL_SOCKET, SCM_TIMESTAMPING, + sizeof(*tss), tss); + } +} + /* * This routine copies from a sock struct into the user buffer. * @@ -1716,6 +1772,8 @@ int tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock, long timeo; struct sk_buff *skb, *last; u32 urg_hole = 0; + struct scm_timestamping tss; + bool has_tss = false; if (unlikely(flags & MSG_ERRQUEUE)) return inet_recv_error(sk, msg, len, addr_len); @@ -1911,6 +1969,10 @@ skip_copy: if (used + offset < skb->len) continue; + if (TCP_SKB_CB(skb)->has_rxtstamp) { + tcp_update_recv_tstamps(skb, &tss); + has_tss = true; + } if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) goto found_fin_ok; if (!(flags & MSG_PEEK)) @@ -1929,6 +1991,9 @@ skip_copy: * on connected socket. I was just happy when found this 8) --ANK */ + if (has_tss) + tcp_recv_timestamp(msg, sk, &tss); + /* Clean up data we have read: This will do ACK frames. 
*/ tcp_cleanup_rbuf(sk, copied); diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index d3421ee9a10a..568ccfd6dd37 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -4246,9 +4246,15 @@ static void tcp_sack_remove(struct tcp_sock *tp) tp->rx_opt.num_sacks = num_sacks; } +enum tcp_queue { + OOO_QUEUE, + RCV_QUEUE, +}; + /** * tcp_try_coalesce - try to merge skb to prior one * @sk: socket + * @dest: destination queue * @to: prior buffer * @from: buffer to add in queue * @fragstolen: pointer to boolean @@ -4260,6 +4266,7 @@ static void tcp_sack_remove(struct tcp_sock *tp) * Returns true if caller should free @from instead of queueing it */ static bool tcp_try_coalesce(struct sock *sk, + enum tcp_queue dest, struct sk_buff *to, struct sk_buff *from, bool *fragstolen) @@ -4281,6 +4288,15 @@ static bool tcp_try_coalesce(struct sock *sk, TCP_SKB_CB(to)->end_seq = TCP_SKB_CB(from)->end_seq; TCP_SKB_CB(to)->ack_seq = TCP_SKB_CB(from)->ack_seq; TCP_SKB_CB(to)->tcp_flags |= TCP_SKB_CB(from)->tcp_flags; + + if (TCP_SKB_CB(from)->has_rxtstamp) { + TCP_SKB_CB(to)->has_rxtstamp = true; + if (dest == OOO_QUEUE) + TCP_SKB_CB(to)->swtstamp = TCP_SKB_CB(from)->swtstamp; + else + to->tstamp = from->tstamp; + } + return true; } @@ -4315,6 +4331,9 @@ static void tcp_ofo_queue(struct sock *sk) } p = rb_next(p); rb_erase(&skb->rbnode, &tp->out_of_order_queue); + /* Replace tstamp which was stomped by rbnode */ + if (TCP_SKB_CB(skb)->has_rxtstamp) + skb->tstamp = TCP_SKB_CB(skb)->swtstamp; if (unlikely(!after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt))) { SOCK_DEBUG(sk, "ofo packet was already received\n"); @@ -4326,7 +4345,8 @@ static void tcp_ofo_queue(struct sock *sk) TCP_SKB_CB(skb)->end_seq); tail = skb_peek_tail(&sk->sk_receive_queue); - eaten = tail && tcp_try_coalesce(sk, tail, skb, &fragstolen); + eaten = tail && tcp_try_coalesce(sk, RCV_QUEUE, + tail, skb, &fragstolen); tcp_rcv_nxt_update(tp, TCP_SKB_CB(skb)->end_seq); fin = TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN; if (!eaten) @@ -4380,6 +4400,10 @@ static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb) return; } + /* Stash tstamp to avoid being stomped on by rbnode */ + if (TCP_SKB_CB(skb)->has_rxtstamp) + TCP_SKB_CB(skb)->swtstamp = skb->tstamp; + inet_csk_schedule_ack(sk); NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPOFOQUEUE); @@ -4405,7 +4429,8 @@ static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb) /* In the typical case, we are adding an skb to the end of the list. * Use of ooo_last_skb avoids the O(Log(N)) rbtree lookup. */ - if (tcp_try_coalesce(sk, tp->ooo_last_skb, skb, &fragstolen)) { + if (tcp_try_coalesce(sk, OOO_QUEUE, tp->ooo_last_skb, + skb, &fragstolen)) { coalesce_done: tcp_grow_window(sk, skb); kfree_skb_partial(skb, fragstolen); @@ -4455,7 +4480,8 @@ coalesce_done: __kfree_skb(skb1); goto merge_right; } - } else if (tcp_try_coalesce(sk, skb1, skb, &fragstolen)) { + } else if (tcp_try_coalesce(sk, OOO_QUEUE, skb1, + skb, &fragstolen)) { goto coalesce_done; } p = &parent->rb_right; @@ -4506,7 +4532,8 @@ static int __must_check tcp_queue_rcv(struct sock *sk, struct sk_buff *skb, int __skb_pull(skb, hdrlen); eaten = (tail && - tcp_try_coalesce(sk, tail, skb, fragstolen)) ? 1 : 0; + tcp_try_coalesce(sk, RCV_QUEUE, tail, + skb, fragstolen)) ? 
1 : 0; tcp_rcv_nxt_update(tcp_sk(sk), TCP_SKB_CB(skb)->end_seq); if (!eaten) { __skb_queue_tail(&sk->sk_receive_queue, skb); diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 5af8b809dfbc..a63486afa7a7 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -1637,6 +1637,8 @@ int tcp_v4_rcv(struct sk_buff *skb) TCP_SKB_CB(skb)->tcp_tw_isn = 0; TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph); TCP_SKB_CB(skb)->sacked = 0; + TCP_SKB_CB(skb)->has_rxtstamp = + skb->tstamp || skb_hwtstamps(skb)->hwtstamp; lookup: sk = __inet_lookup_skb(&tcp_hashinfo, skb, __tcp_hdrlen(th), th->source, diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index d79a1af3252e..abba3bc2a3d9 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -1394,6 +1394,8 @@ static void tcp_v6_fill_cb(struct sk_buff *skb, const struct ipv6hdr *hdr, TCP_SKB_CB(skb)->tcp_tw_isn = 0; TCP_SKB_CB(skb)->ip_dsfield = ipv6_get_dsfield(hdr); TCP_SKB_CB(skb)->sacked = 0; + TCP_SKB_CB(skb)->has_rxtstamp = + skb->tstamp || skb_hwtstamps(skb)->hwtstamp; } static int tcp_v6_rcv(struct sk_buff *skb) -- cgit v1.2.3 From a8e3bb347d8c32c25830af55466e08979df4e2be Mon Sep 17 00:00:00 2001 From: David Ahern Date: Mon, 28 Aug 2017 15:14:20 -0700 Subject: net: Add comment that early_demux can change via sysctl Patches trying to constify inet{6}_protocol have been reverted twice: 39294c3df2a8 ("Revert "ipv6: constify inet6_protocol structures"") to revert 3a3a4e3054137, and then 03157937fe0b5 ("Revert "ipv4: make net_protocol const"") to revert aa8db499ea67. Add a comment that the structures cannot be const because the early_demux field can change based on a sysctl. Signed-off-by: David Ahern Signed-off-by: David S. Miller --- net/ipv4/af_inet.c | 6 ++++++ net/ipv6/tcp_ipv6.c | 3 +++ net/ipv6/udp.c | 3 +++ 3 files changed, 12 insertions(+) (limited to 'net/ipv6/tcp_ipv6.c') diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index d678820e4306..e31108e5ef79 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -1596,6 +1596,9 @@ static const struct net_protocol igmp_protocol = { }; #endif +/* thinking of making this const? Don't. + * early_demux can change based on sysctl. + */ static struct net_protocol tcp_protocol = { .early_demux = tcp_v4_early_demux, .early_demux_handler = tcp_v4_early_demux, @@ -1606,6 +1609,9 @@ static struct net_protocol tcp_protocol = { .icmp_strict_tag_validation = 1, }; +/* thinking of making this const? Don't. + * early_demux can change based on sysctl. + */ static struct net_protocol udp_protocol = { .early_demux = udp_v4_early_demux, .early_demux_handler = udp_v4_early_demux, diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index abba3bc2a3d9..38f76d8b231e 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -1949,6 +1949,9 @@ struct proto tcpv6_prot = { .diag_destroy = tcp_abort, }; +/* thinking of making this const? Don't. + * early_demux can change based on sysctl. + */ static struct inet6_protocol tcpv6_protocol = { .early_demux = tcp_v6_early_demux, .early_demux_handler = tcp_v6_early_demux, diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index 2a15f1bb6ef8..976f30391356 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -1472,6 +1472,9 @@ int compat_udpv6_getsockopt(struct sock *sk, int level, int optname, } #endif +/* thinking of making this const? Don't. + * early_demux can change based on sysctl. 
+ */ static struct inet6_protocol udpv6_protocol = { .early_demux = udp_v6_early_demux, .early_demux_handler = udp_v6_early_demux, -- cgit v1.2.3 From 1f3b359f1004bd34b7b0bad70b93e3c7af92a37b Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 8 Sep 2017 12:44:47 -0700 Subject: tcp: fix a request socket leak While the cited commit fixed a possible deadlock, it added a leak of the request socket, since reqsk_put() must be called if the BPF filter decides the ACK packet must be dropped. Fixes: d624d276d1dd ("tcp: fix possible deadlock in TCP stack vs BPF filter") Signed-off-by: Eric Dumazet Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- net/ipv4/tcp_ipv4.c | 6 +++--- net/ipv6/tcp_ipv6.c | 6 +++--- 2 files changed, 6 insertions(+), 6 deletions(-) (limited to 'net/ipv6/tcp_ipv6.c') diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index a63486afa7a7..d9416b5162bc 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -1669,9 +1669,9 @@ process: */ sock_hold(sk); refcounted = true; - if (tcp_filter(sk, skb)) - goto discard_and_relse; - nsk = tcp_check_req(sk, skb, req, false); + nsk = NULL; + if (!tcp_filter(sk, skb)) + nsk = tcp_check_req(sk, skb, req, false); if (!nsk) { reqsk_put(req); goto discard_and_relse; diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 38f76d8b231e..64d94afa427f 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -1460,9 +1460,9 @@ process: } sock_hold(sk); refcounted = true; - if (tcp_filter(sk, skb)) - goto discard_and_relse; - nsk = tcp_check_req(sk, skb, req, false); + nsk = NULL; + if (!tcp_filter(sk, skb)) + nsk = tcp_check_req(sk, skb, req, false); if (!nsk) { reqsk_put(req); goto discard_and_relse; -- cgit v1.2.3 From 275757e6bae15a8621130907a78096afd1e15d2c Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Mon, 16 Oct 2017 16:36:52 -0500 Subject: ipv6: mark expected switch fall-throughs In preparation for enabling -Wimplicit-fallthrough, mark switch cases where the code is expected to fall through. Notice that in some cases I placed the "fall through" comment on its own line, which is what GCC expects to find. Signed-off-by: Gustavo A. R. Silva Signed-off-by: David S. 
Miller --- net/ipv6/ah6.c | 1 + net/ipv6/exthdrs.c | 1 + net/ipv6/icmp.c | 6 ++---- net/ipv6/ip6_fib.c | 4 ++++ net/ipv6/ip6_tunnel.c | 1 + net/ipv6/ip6mr.c | 1 + net/ipv6/netfilter/nf_nat_l3proto_ipv6.c | 3 ++- net/ipv6/raw.c | 4 ++++ net/ipv6/tcp_ipv6.c | 3 ++- net/ipv6/xfrm6_policy.c | 1 + 10 files changed, 19 insertions(+), 6 deletions(-) (limited to 'net/ipv6/tcp_ipv6.c') diff --git a/net/ipv6/ah6.c b/net/ipv6/ah6.c index 7802b72196f3..37bb33fbc742 100644 --- a/net/ipv6/ah6.c +++ b/net/ipv6/ah6.c @@ -271,6 +271,7 @@ static int ipv6_clear_mutable_options(struct ipv6hdr *iph, int len, int dir) case NEXTHDR_DEST: if (dir == XFRM_POLICY_OUT) ipv6_rearrange_destopt(iph, exthdr.opth); + /* fall through */ case NEXTHDR_HOP: if (!zero_out_mutable_opts(exthdr.opth)) { net_dbg_ratelimited("overrun %sopts\n", diff --git a/net/ipv6/exthdrs.c b/net/ipv6/exthdrs.c index 95516138e861..7835dea930b4 100644 --- a/net/ipv6/exthdrs.c +++ b/net/ipv6/exthdrs.c @@ -89,6 +89,7 @@ static bool ip6_tlvopt_unknown(struct sk_buff *skb, int optoff) */ if (ipv6_addr_is_multicast(&ipv6_hdr(skb)->daddr)) break; + /* fall through */ case 2: /* send ICMP PARM PROB regardless and drop packet */ icmpv6_param_prob(skb, ICMPV6_UNK_OPTION, optoff); return false; diff --git a/net/ipv6/icmp.c b/net/ipv6/icmp.c index 4e52d52a6752..6ae5dd3f4d0d 100644 --- a/net/ipv6/icmp.c +++ b/net/ipv6/icmp.c @@ -864,10 +864,8 @@ static int icmpv6_rcv(struct sk_buff *skb) goto discard_it; hdr = icmp6_hdr(skb); - /* - * Drop through to notify - */ - + /* to notify */ + /* fall through */ case ICMPV6_DEST_UNREACH: case ICMPV6_TIME_EXCEED: case ICMPV6_PARAMPROB: diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index 548af48212fc..1ada9672d198 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -1780,6 +1780,7 @@ static int fib6_walk_continue(struct fib6_walker *w) } w->state = FWS_L; #endif + /* fall through */ case FWS_L: left = rcu_dereference_protected(fn->left, 1); if (left) { @@ -1788,6 +1789,7 @@ static int fib6_walk_continue(struct fib6_walker *w) continue; } w->state = FWS_R; + /* fall through */ case FWS_R: right = rcu_dereference_protected(fn->right, 1); if (right) { @@ -1797,6 +1799,7 @@ static int fib6_walk_continue(struct fib6_walker *w) } w->state = FWS_C; w->leaf = rcu_dereference_protected(fn->leaf, 1); + /* fall through */ case FWS_C: if (w->leaf && fn->fn_flags & RTN_RTINFO) { int err; @@ -1815,6 +1818,7 @@ static int fib6_walk_continue(struct fib6_walker *w) } skip: w->state = FWS_U; + /* fall through */ case FWS_U: if (fn == w->root) return 0; diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c index 825548680b1e..4212879ff35e 100644 --- a/net/ipv6/ip6_tunnel.c +++ b/net/ipv6/ip6_tunnel.c @@ -593,6 +593,7 @@ ip4ip6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, case NDISC_REDIRECT: rel_type = ICMP_REDIRECT; rel_code = ICMP_REDIR_HOST; + /* fall through */ default: return 0; } diff --git a/net/ipv6/ip6mr.c b/net/ipv6/ip6mr.c index f5500f5444e9..59fad81e5f7a 100644 --- a/net/ipv6/ip6mr.c +++ b/net/ipv6/ip6mr.c @@ -1722,6 +1722,7 @@ int ip6_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, uns case MRT6_ADD_MFC: case MRT6_DEL_MFC: parent = -1; + /* fall through */ case MRT6_ADD_MFC_PROXY: case MRT6_DEL_MFC_PROXY: if (optlen < sizeof(mfc)) diff --git a/net/ipv6/netfilter/nf_nat_l3proto_ipv6.c b/net/ipv6/netfilter/nf_nat_l3proto_ipv6.c index 46d6dba50698..1d2fb9267d6f 100644 --- a/net/ipv6/netfilter/nf_nat_l3proto_ipv6.c +++ b/net/ipv6/netfilter/nf_nat_l3proto_ipv6.c @@ -290,7 +290,8 @@ 
nf_nat_ipv6_fn(void *priv, struct sk_buff *skb, else return NF_ACCEPT; } - /* Fall thru... (Only ICMPs can be IP_CT_IS_REPLY) */ + /* Only ICMPs can be IP_CT_IS_REPLY: */ + /* fall through */ case IP_CT_NEW: /* Seen it before? This can happen for loopback, retrans, * or local packets. diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c index e4462b0ff801..761a473a07c5 100644 --- a/net/ipv6/raw.c +++ b/net/ipv6/raw.c @@ -1055,6 +1055,7 @@ static int rawv6_setsockopt(struct sock *sk, int level, int optname, if (optname == IPV6_CHECKSUM || optname == IPV6_HDRINCL) break; + /* fall through */ default: return ipv6_setsockopt(sk, level, optname, optval, optlen); } @@ -1077,6 +1078,7 @@ static int compat_rawv6_setsockopt(struct sock *sk, int level, int optname, if (optname == IPV6_CHECKSUM || optname == IPV6_HDRINCL) break; + /* fall through */ default: return compat_ipv6_setsockopt(sk, level, optname, optval, optlen); @@ -1138,6 +1140,7 @@ static int rawv6_getsockopt(struct sock *sk, int level, int optname, if (optname == IPV6_CHECKSUM || optname == IPV6_HDRINCL) break; + /* fall through */ default: return ipv6_getsockopt(sk, level, optname, optval, optlen); } @@ -1160,6 +1163,7 @@ static int compat_rawv6_getsockopt(struct sock *sk, int level, int optname, if (optname == IPV6_CHECKSUM || optname == IPV6_HDRINCL) break; + /* fall through */ default: return compat_ipv6_getsockopt(sk, level, optname, optval, optlen); diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 64d94afa427f..ae83615b7f6d 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -1577,8 +1577,9 @@ do_time_wait: refcounted = false; goto process; } - /* Fall through to ACK */ } + /* to ACK */ + /* fall through */ case TCP_TW_ACK: tcp_v6_timewait_ack(sk, skb); break; diff --git a/net/ipv6/xfrm6_policy.c b/net/ipv6/xfrm6_policy.c index 11d1314ab6c5..4ed9f8cc3b6a 100644 --- a/net/ipv6/xfrm6_policy.c +++ b/net/ipv6/xfrm6_policy.c @@ -152,6 +152,7 @@ _decode_session6(struct sk_buff *skb, struct flowi *fl, int reverse) switch (nexthdr) { case NEXTHDR_FRAGMENT: onlyproto = 1; + /* fall through */ case NEXTHDR_ROUTING: case NEXTHDR_HOP: case NEXTHDR_DEST: -- cgit v1.2.3 From c24b14c46bb88d844275de5c4024c8745ae89d42 Mon Sep 17 00:00:00 2001 From: Song Liu Date: Mon, 23 Oct 2017 09:20:24 -0700 Subject: tcp: add tracepoint trace_tcp_send_reset New tracepoint trace_tcp_send_reset is added and called from tcp_v4_send_reset(), tcp_v6_send_reset() and tcp_send_active_reset(). Signed-off-by: Song Liu Signed-off-by: David S. Miller --- include/trace/events/tcp.h | 11 +++++++++++ net/core/net-traces.c | 2 ++ net/ipv4/tcp_ipv4.c | 6 +++++- net/ipv4/tcp_output.c | 5 +++++ net/ipv6/tcp_ipv6.c | 10 ++++++++-- 5 files changed, 31 insertions(+), 3 deletions(-) (limited to 'net/ipv6/tcp_ipv6.c') diff --git a/include/trace/events/tcp.h b/include/trace/events/tcp.h index 2b6fe72c6781..3e57e1ae1c6b 100644 --- a/include/trace/events/tcp.h +++ b/include/trace/events/tcp.h @@ -77,6 +77,17 @@ DEFINE_EVENT(tcp_event_sk_skb, tcp_retransmit_skb, TP_ARGS(sk, skb) ); +/* + * skb of trace_tcp_send_reset is the skb that caused RST. 
In case of + * active reset, skb should be NULL */ +DEFINE_EVENT(tcp_event_sk_skb, tcp_send_reset, + + TP_PROTO(const struct sock *sk, const struct sk_buff *skb), + + TP_ARGS(sk, skb) +); + #endif /* _TRACE_TCP_H */ /* This part must be outside protection */ diff --git a/net/core/net-traces.c b/net/core/net-traces.c index f4e4fa2db505..8dcd9b0be04a 100644 --- a/net/core/net-traces.c +++ b/net/core/net-traces.c @@ -49,3 +49,5 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(br_fdb_update); EXPORT_TRACEPOINT_SYMBOL_GPL(kfree_skb); EXPORT_TRACEPOINT_SYMBOL_GPL(napi_poll); + +EXPORT_TRACEPOINT_SYMBOL_GPL(tcp_send_reset); diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index e22439f05e46..eb3f3b8e1e4b 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -85,6 +85,8 @@ #include #include +#include + #ifdef CONFIG_TCP_MD5SIG static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key, __be32 daddr, __be32 saddr, const struct tcphdr *th); @@ -701,8 +703,10 @@ static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb) * routing might fail in this case. No choice here, if we choose to force * input interface, we will misroute in case of asymmetric route. */ - if (sk) + if (sk) { arg.bound_dev_if = sk->sk_bound_dev_if; + trace_tcp_send_reset(sk, skb); + } BUILD_BUG_ON(offsetof(struct sock, sk_bound_dev_if) != offsetof(struct inet_timewait_sock, tw_bound_dev_if)); diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 988733f289c8..1f01f4c9c738 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -3084,6 +3084,11 @@ void tcp_send_active_reset(struct sock *sk, gfp_t priority) /* Send it off. */ if (tcp_transmit_skb(sk, skb, 0, priority)) NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTFAILED); + + /* skb of trace_tcp_send_reset() keeps the skb that caused RST, + * skb here is different to the troublesome skb, so use NULL + */ + trace_tcp_send_reset(sk, NULL); } /* Send a crossed SYN-ACK during socket establishment. diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index ae83615b7f6d..0e2529958b52 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -69,6 +69,8 @@ #include #include +#include + static void tcp_v6_send_reset(const struct sock *sk, struct sk_buff *skb); static void tcp_v6_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb, struct request_sock *req); @@ -890,7 +892,7 @@ static void tcp_v6_send_reset(const struct sock *sk, struct sk_buff *skb) int genhash; struct sock *sk1 = NULL; #endif - int oif; + int oif = 0; if (th->rst) return; @@ -939,7 +941,11 @@ static void tcp_v6_send_reset(const struct sock *sk, struct sk_buff *skb) ack_seq = ntohl(th->seq) + th->syn + th->fin + skb->len - (th->doff << 2); - oif = sk ? sk->sk_bound_dev_if : 0; + if (sk) { + oif = sk->sk_bound_dev_if; + trace_tcp_send_reset(sk, skb); + } + tcp_v6_send_response(sk, skb, seq, ack_seq, 0, 0, 0, oif, key, 1, 0, 0); #ifdef CONFIG_TCP_MD5SIG -- cgit v1.2.3 From 356d1833b638bd465672aefeb71def3ab93fc17d Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 7 Nov 2017 00:29:28 -0800 Subject: tcp: Namespace-ify sysctl_tcp_rmem and sysctl_tcp_wmem Note that when a new netns is created, it inherits its sysctl_tcp_rmem and sysctl_tcp_wmem from the initial netns. This change is needed so that we can refine TCP rcvbuf autotuning to take RTT into consideration. Signed-off-by: Eric Dumazet Cc: Wei Wang Signed-off-by: David S. 
Miller --- include/net/netns/ipv4.h | 2 ++ include/net/tcp.h | 2 -- net/ipv4/sysctl_net_ipv4.c | 32 ++++++++++++++++---------------- net/ipv4/tcp.c | 21 ++++++++------------- net/ipv4/tcp_input.c | 14 ++++++++------ net/ipv4/tcp_ipv4.c | 13 ++++++++++--- net/ipv4/tcp_output.c | 2 +- net/ipv6/tcp_ipv6.c | 4 ++-- 8 files changed, 47 insertions(+), 43 deletions(-) (limited to 'net/ipv6/tcp_ipv6.c') diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h index 379550f8124a..5e12975fc658 100644 --- a/include/net/netns/ipv4.h +++ b/include/net/netns/ipv4.h @@ -155,6 +155,8 @@ struct netns_ipv4 { int sysctl_tcp_invalid_ratelimit; int sysctl_tcp_pacing_ss_ratio; int sysctl_tcp_pacing_ca_ratio; + int sysctl_tcp_wmem[3]; + int sysctl_tcp_rmem[3]; struct inet_timewait_death_row tcp_death_row; int sysctl_max_syn_backlog; int sysctl_tcp_fastopen; diff --git a/include/net/tcp.h b/include/net/tcp.h index babfd4da1515..2f2c69ad31b2 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -242,8 +242,6 @@ void tcp_time_wait(struct sock *sk, int state, int timeo); /* sysctl variables for tcp */ extern int sysctl_tcp_max_orphans; extern long sysctl_tcp_mem[3]; -extern int sysctl_tcp_wmem[3]; -extern int sysctl_tcp_rmem[3]; #define TCP_RACK_LOSS_DETECTION 0x1 /* Use RACK to detect losses */ #define TCP_RACK_STATIC_REO_WND 0x2 /* Use static RACK reo wnd */ diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index a82b44038308..ef0ff3357a44 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -440,22 +440,6 @@ static struct ctl_table ipv4_table[] = { .mode = 0644, .proc_handler = proc_doulongvec_minmax, }, - { - .procname = "tcp_wmem", - .data = &sysctl_tcp_wmem, - .maxlen = sizeof(sysctl_tcp_wmem), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = &one, - }, - { - .procname = "tcp_rmem", - .data = &sysctl_tcp_rmem, - .maxlen = sizeof(sysctl_tcp_rmem), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = &one, - }, { .procname = "tcp_low_latency", .data = &sysctl_tcp_low_latency, @@ -1164,6 +1148,22 @@ static struct ctl_table ipv4_net_table[] = { .extra1 = &zero, .extra2 = &thousand, }, + { + .procname = "tcp_wmem", + .data = &init_net.ipv4.sysctl_tcp_wmem, + .maxlen = sizeof(init_net.ipv4.sysctl_tcp_wmem), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &one, + }, + { + .procname = "tcp_rmem", + .data = &init_net.ipv4.sysctl_tcp_rmem, + .maxlen = sizeof(init_net.ipv4.sysctl_tcp_rmem), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &one, + }, { } }; diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index c4cb19ed4628..bc71a27d5ad9 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -289,12 +289,7 @@ struct percpu_counter tcp_orphan_count; EXPORT_SYMBOL_GPL(tcp_orphan_count); long sysctl_tcp_mem[3] __read_mostly; -int sysctl_tcp_wmem[3] __read_mostly; -int sysctl_tcp_rmem[3] __read_mostly; - EXPORT_SYMBOL(sysctl_tcp_mem); -EXPORT_SYMBOL(sysctl_tcp_rmem); -EXPORT_SYMBOL(sysctl_tcp_wmem); atomic_long_t tcp_memory_allocated; /* Current allocated memory. 
---
 include/net/netns/ipv4.h   |  2 ++
 include/net/tcp.h          |  2 --
 net/ipv4/sysctl_net_ipv4.c | 32 ++++++++++++++++----------------
 net/ipv4/tcp.c             | 21 ++++++++-------------
 net/ipv4/tcp_input.c       | 14 ++++++++------
 net/ipv4/tcp_ipv4.c        | 13 ++++++++++---
 net/ipv4/tcp_output.c      |  2 +-
 net/ipv6/tcp_ipv6.c        |  4 ++--
 8 files changed, 47 insertions(+), 43 deletions(-)

(limited to 'net/ipv6/tcp_ipv6.c')

diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h
index 379550f8124a..5e12975fc658 100644
--- a/include/net/netns/ipv4.h
+++ b/include/net/netns/ipv4.h
@@ -155,6 +155,8 @@ struct netns_ipv4 {
 	int sysctl_tcp_invalid_ratelimit;
 	int sysctl_tcp_pacing_ss_ratio;
 	int sysctl_tcp_pacing_ca_ratio;
+	int sysctl_tcp_wmem[3];
+	int sysctl_tcp_rmem[3];
 	struct inet_timewait_death_row tcp_death_row;
 	int sysctl_max_syn_backlog;
 	int sysctl_tcp_fastopen;
diff --git a/include/net/tcp.h b/include/net/tcp.h
index babfd4da1515..2f2c69ad31b2 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -242,8 +242,6 @@ void tcp_time_wait(struct sock *sk, int state, int timeo);

 /* sysctl variables for tcp */
 extern int sysctl_tcp_max_orphans;
 extern long sysctl_tcp_mem[3];
-extern int sysctl_tcp_wmem[3];
-extern int sysctl_tcp_rmem[3];

 #define TCP_RACK_LOSS_DETECTION  0x1 /* Use RACK to detect losses */
 #define TCP_RACK_STATIC_REO_WND  0x2 /* Use static RACK reo wnd */
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index a82b44038308..ef0ff3357a44 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -440,22 +440,6 @@ static struct ctl_table ipv4_table[] = {
 		.mode		= 0644,
 		.proc_handler	= proc_doulongvec_minmax,
 	},
-	{
-		.procname	= "tcp_wmem",
-		.data		= &sysctl_tcp_wmem,
-		.maxlen		= sizeof(sysctl_tcp_wmem),
-		.mode		= 0644,
-		.proc_handler	= proc_dointvec_minmax,
-		.extra1		= &one,
-	},
-	{
-		.procname	= "tcp_rmem",
-		.data		= &sysctl_tcp_rmem,
-		.maxlen		= sizeof(sysctl_tcp_rmem),
-		.mode		= 0644,
-		.proc_handler	= proc_dointvec_minmax,
-		.extra1		= &one,
-	},
 	{
 		.procname	= "tcp_low_latency",
 		.data		= &sysctl_tcp_low_latency,
@@ -1164,6 +1148,22 @@ static struct ctl_table ipv4_net_table[] = {
 		.extra1		= &zero,
 		.extra2		= &thousand,
 	},
+	{
+		.procname	= "tcp_wmem",
+		.data		= &init_net.ipv4.sysctl_tcp_wmem,
+		.maxlen		= sizeof(init_net.ipv4.sysctl_tcp_wmem),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_minmax,
+		.extra1		= &one,
+	},
+	{
+		.procname	= "tcp_rmem",
+		.data		= &init_net.ipv4.sysctl_tcp_rmem,
+		.maxlen		= sizeof(init_net.ipv4.sysctl_tcp_rmem),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_minmax,
+		.extra1		= &one,
+	},
 	{ }
 };

diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index c4cb19ed4628..bc71a27d5ad9 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -289,12 +289,7 @@ struct percpu_counter tcp_orphan_count;
 EXPORT_SYMBOL_GPL(tcp_orphan_count);

 long sysctl_tcp_mem[3] __read_mostly;
-int sysctl_tcp_wmem[3] __read_mostly;
-int sysctl_tcp_rmem[3] __read_mostly;
-
 EXPORT_SYMBOL(sysctl_tcp_mem);
-EXPORT_SYMBOL(sysctl_tcp_rmem);
-EXPORT_SYMBOL(sysctl_tcp_wmem);

 atomic_long_t tcp_memory_allocated;	/* Current allocated memory. */
 EXPORT_SYMBOL(tcp_memory_allocated);
@@ -456,8 +451,8 @@ void tcp_init_sock(struct sock *sk)

 	icsk->icsk_sync_mss = tcp_sync_mss;

-	sk->sk_sndbuf = sysctl_tcp_wmem[1];
-	sk->sk_rcvbuf = sysctl_tcp_rmem[1];
+	sk->sk_sndbuf = sock_net(sk)->ipv4.sysctl_tcp_wmem[1];
+	sk->sk_rcvbuf = sock_net(sk)->ipv4.sysctl_tcp_rmem[1];

 	sk_sockets_allocated_inc(sk);
 }
@@ -3636,13 +3631,13 @@ void __init tcp_init(void)
 	max_wshare = min(4UL*1024*1024, limit);
 	max_rshare = min(6UL*1024*1024, limit);

-	sysctl_tcp_wmem[0] = SK_MEM_QUANTUM;
-	sysctl_tcp_wmem[1] = 16*1024;
-	sysctl_tcp_wmem[2] = max(64*1024, max_wshare);
+	init_net.ipv4.sysctl_tcp_wmem[0] = SK_MEM_QUANTUM;
+	init_net.ipv4.sysctl_tcp_wmem[1] = 16*1024;
+	init_net.ipv4.sysctl_tcp_wmem[2] = max(64*1024, max_wshare);

-	sysctl_tcp_rmem[0] = SK_MEM_QUANTUM;
-	sysctl_tcp_rmem[1] = 87380;
-	sysctl_tcp_rmem[2] = max(87380, max_rshare);
+	init_net.ipv4.sysctl_tcp_rmem[0] = SK_MEM_QUANTUM;
+	init_net.ipv4.sysctl_tcp_rmem[1] = 87380;
+	init_net.ipv4.sysctl_tcp_rmem[2] = max(87380, max_rshare);

 	pr_info("Hash tables configured (established %u bind %u)\n",
 		tcp_hashinfo.ehash_mask + 1, tcp_hashinfo.bhash_size);
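Moving the two entries into ipv4_net_table gives every network namespace its
own /proc/sys/net/ipv4/tcp_rmem and tcp_wmem, initialized from init_net's
values. A userspace sketch of the resulting behaviour (it requires
CAP_SYS_ADMIN to unshare; error handling is trimmed):

	#define _GNU_SOURCE
	#include <sched.h>
	#include <stdio.h>

	static void show(const char *tag)
	{
		char buf[128];
		FILE *f = fopen("/proc/sys/net/ipv4/tcp_rmem", "r");

		if (f && fgets(buf, sizeof(buf), f))
			printf("%s: %s", tag, buf);
		if (f)
			fclose(f);
	}

	int main(void)
	{
		show("init_net");
		if (unshare(CLONE_NEWNET))	/* enter a fresh netns */
			return 1;
		/* The new netns starts with a copy of init_net's triplet
		 * (the tcp_sk_init() memcpy below) and can now be tuned
		 * independently of the initial namespace. */
		show("child netns");
		return 0;
	}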
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index b54ee09cbcf7..9ceaa1fdc3ab 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -320,7 +320,7 @@ static void tcp_sndbuf_expand(struct sock *sk)
 	sndmem *= nr_segs * per_mss;

 	if (sk->sk_sndbuf < sndmem)
-		sk->sk_sndbuf = min(sndmem, sysctl_tcp_wmem[2]);
+		sk->sk_sndbuf = min(sndmem, sock_net(sk)->ipv4.sysctl_tcp_wmem[2]);
 }

 /* 2. Tuning advertised window (window_clamp, rcv_ssthresh)
@@ -354,7 +354,7 @@ static int __tcp_grow_window(const struct sock *sk, const struct sk_buff *skb)
 	struct tcp_sock *tp = tcp_sk(sk);
 	/* Optimize this! */
 	int truesize = tcp_win_from_space(sk, skb->truesize) >> 1;
-	int window = tcp_win_from_space(sk, sysctl_tcp_rmem[2]) >> 1;
+	int window = tcp_win_from_space(sk, sock_net(sk)->ipv4.sysctl_tcp_rmem[2]) >> 1;

 	while (tp->rcv_ssthresh <= window) {
 		if (truesize <= skb->len)
@@ -409,7 +409,7 @@ static void tcp_fixup_rcvbuf(struct sock *sk)
 		rcvmem <<= 2;

 	if (sk->sk_rcvbuf < rcvmem)
-		sk->sk_rcvbuf = min(rcvmem, sysctl_tcp_rmem[2]);
+		sk->sk_rcvbuf = min(rcvmem, sock_net(sk)->ipv4.sysctl_tcp_rmem[2]);
 }

 /* 4. Try to fixup all. It is made immediately after connection enters
@@ -457,15 +457,16 @@ static void tcp_clamp_window(struct sock *sk)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
 	struct inet_connection_sock *icsk = inet_csk(sk);
+	struct net *net = sock_net(sk);

 	icsk->icsk_ack.quick = 0;

-	if (sk->sk_rcvbuf < sysctl_tcp_rmem[2] &&
+	if (sk->sk_rcvbuf < net->ipv4.sysctl_tcp_rmem[2] &&
 	    !(sk->sk_userlocks & SOCK_RCVBUF_LOCK) &&
 	    !tcp_under_memory_pressure(sk) &&
 	    sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0)) {
 		sk->sk_rcvbuf = min(atomic_read(&sk->sk_rmem_alloc),
-				    sysctl_tcp_rmem[2]);
+				    net->ipv4.sysctl_tcp_rmem[2]);
 	}
 	if (atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf)
 		tp->rcv_ssthresh = min(tp->window_clamp, 2U * tp->advmss);
@@ -623,7 +624,8 @@ void tcp_rcv_space_adjust(struct sock *sk)
 		while (tcp_win_from_space(sk, rcvmem) < tp->advmss)
 			rcvmem += 128;

-		rcvbuf = min(rcvwin / tp->advmss * rcvmem, sysctl_tcp_rmem[2]);
+		rcvbuf = min(rcvwin / tp->advmss * rcvmem,
+			     sock_net(sk)->ipv4.sysctl_tcp_rmem[2]);

 		if (rcvbuf > sk->sk_rcvbuf) {
 			sk->sk_rcvbuf = rcvbuf;
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 0162c577bb9c..1eac84b8044e 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -2409,8 +2409,8 @@ struct proto tcp_prot = {
 	.memory_allocated	= &tcp_memory_allocated,
 	.memory_pressure	= &tcp_memory_pressure,
 	.sysctl_mem		= sysctl_tcp_mem,
-	.sysctl_wmem		= sysctl_tcp_wmem,
-	.sysctl_rmem		= sysctl_tcp_rmem,
+	.sysctl_wmem_offset	= offsetof(struct net, ipv4.sysctl_tcp_wmem),
+	.sysctl_rmem_offset	= offsetof(struct net, ipv4.sysctl_tcp_rmem),
 	.max_header		= MAX_TCP_HEADER,
 	.obj_size		= sizeof(struct tcp_sock),
 	.slab_flags		= SLAB_TYPESAFE_BY_RCU,
@@ -2509,7 +2509,14 @@ static int __net_init tcp_sk_init(struct net *net)
 	net->ipv4.sysctl_tcp_invalid_ratelimit = HZ/2;
 	net->ipv4.sysctl_tcp_pacing_ss_ratio = 200;
 	net->ipv4.sysctl_tcp_pacing_ca_ratio = 120;
-
+	if (net != &init_net) {
+		memcpy(net->ipv4.sysctl_tcp_rmem,
+		       init_net.ipv4.sysctl_tcp_rmem,
+		       sizeof(init_net.ipv4.sysctl_tcp_rmem));
+		memcpy(net->ipv4.sysctl_tcp_wmem,
+		       init_net.ipv4.sysctl_tcp_wmem,
+		       sizeof(init_net.ipv4.sysctl_tcp_wmem));
+	}
 	net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE;
 	spin_lock_init(&net->ipv4.tcp_fastopen_ctx_lock);
 	net->ipv4.sysctl_tcp_fastopen_blackhole_timeout = 60 * 60;
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index a9d917e4dad5..9b98d35aa0d8 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -220,7 +220,7 @@ void tcp_select_initial_window(const struct sock *sk, int __space, __u32 mss,
 		(*rcv_wscale) = 0;
 	if (wscale_ok) {
 		/* Set window scaling on max possible window */
-		space = max_t(u32, space, sysctl_tcp_rmem[2]);
+		space = max_t(u32, space, sock_net(sk)->ipv4.sysctl_tcp_rmem[2]);
 		space = max_t(u32, space, sysctl_rmem_max);
 		space = min_t(u32, space, *window_clamp);
 		while (space > U16_MAX && (*rcv_wscale) < TCP_MAX_WSCALE) {
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 0e2529958b52..6bb98c93edfe 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -1940,8 +1940,8 @@ struct proto tcpv6_prot = {
 	.memory_pressure	= &tcp_memory_pressure,
 	.orphan_count		= &tcp_orphan_count,
 	.sysctl_mem		= sysctl_tcp_mem,
-	.sysctl_wmem		= sysctl_tcp_wmem,
-	.sysctl_rmem		= sysctl_tcp_rmem,
+	.sysctl_wmem_offset	= offsetof(struct net, ipv4.sysctl_tcp_wmem),
+	.sysctl_rmem_offset	= offsetof(struct net, ipv4.sysctl_tcp_rmem),
 	.max_header		= MAX_TCP_HEADER,
 	.obj_size		= sizeof(struct tcp6_sock),
 	.slab_flags		= SLAB_TYPESAFE_BY_RCU,
-- cgit v1.2.3
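One non-obvious piece of the commit above: struct proto can no longer hold a
pointer to the buffer-size arrays, because each namespace now has its own
copy, so tcp_prot and tcpv6_prot store the arrays' offset within struct net
instead. Generic socket code can then recover the right per-netns triplet
from any net pointer. A simplified sketch of that lookup (the helper name is
ours, not the kernel's; it only illustrates the offsetof() mechanism the
patch installs):

	/* Resolve a proto's per-netns rmem triplet from the stored offset;
	 * mirrors the offsetof(struct net, ipv4.sysctl_tcp_rmem) values set
	 * in tcp_prot and tcpv6_prot above. */
	static inline int *proto_rmem(struct net *net, const struct proto *prot)
	{
		return (int *)((char *)net + prot->sysctl_rmem_offset);
	}

	/* proto_rmem(sock_net(sk), sk->sk_prot)[1] then yields the same value
	 * as sock_net(sk)->ipv4.sysctl_tcp_rmem[1] for a TCP socket. */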