From a399a8053164ec8bcb06fed52be9941a26ecde11 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 8 Aug 2012 21:13:53 +0000 Subject: time: jiffies_delta_to_clock_t() helper to the rescue MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Various /proc/net files sometimes report crazy timer values, expressed in clock_t units. This happens when an expired timer delta (expires - jiffies) is passed to jiffies_to_clock_t(). This function has an overflow in : return div_u64((u64)x * TICK_NSEC, NSEC_PER_SEC / USER_HZ); commit cbbc719fccdb8cb (time: Change jiffies_to_clock_t() argument type to unsigned long) only got around the problem. As we cant output negative values in /proc/net/tcp without breaking various tools, I suggest adding a jiffies_delta_to_clock_t() wrapper that caps the negative delta to a 0 value. Signed-off-by: Eric Dumazet Reported-by: Maciej Żenczykowski Cc: Thomas Gleixner Cc: Paul Gortmaker Cc: Andrew Morton Cc: hank Signed-off-by: David S. Miller --- net/ipv6/tcp_ipv6.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) (limited to 'net/ipv6/tcp_ipv6.c') diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index c66b90f71c9b..aa41b0e6b163 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -1875,7 +1875,7 @@ static void get_tcp6_sock(struct seq_file *seq, struct sock *sp, int i) tp->write_seq-tp->snd_una, (sp->sk_state == TCP_LISTEN) ? sp->sk_ack_backlog : (tp->rcv_nxt - tp->copied_seq), timer_active, - jiffies_to_clock_t(timer_expires - jiffies), + jiffies_delta_to_clock_t(timer_expires - jiffies), icsk->icsk_retransmits, sock_i_uid(sp), icsk->icsk_probes_out, @@ -1895,10 +1895,7 @@ static void get_timewait6_sock(struct seq_file *seq, const struct in6_addr *dest, *src; __u16 destp, srcp; const struct inet6_timewait_sock *tw6 = inet6_twsk((struct sock *)tw); - int ttd = tw->tw_ttd - jiffies; - - if (ttd < 0) - ttd = 0; + long delta = tw->tw_ttd - jiffies; dest = &tw6->tw_v6_daddr; src = &tw6->tw_v6_rcv_saddr; @@ -1914,7 +1911,7 @@ static void get_timewait6_sock(struct seq_file *seq, dest->s6_addr32[0], dest->s6_addr32[1], dest->s6_addr32[2], dest->s6_addr32[3], destp, tw->tw_substate, 0, 0, - 3, jiffies_to_clock_t(ttd), 0, 0, 0, 0, + 3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0, atomic_read(&tw->tw_refcnt), tw); } -- cgit v1.2.3 From a7cb5a49bf64ba64864ae16a6be028f8b0d3cc06 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Thu, 24 May 2012 01:10:10 -0600 Subject: userns: Print out socket uids in a user namespace aware fashion. Cc: Alexey Kuznetsov Cc: James Morris Cc: Hideaki YOSHIFUJI Cc: Patrick McHardy Cc: Arnaldo Carvalho de Melo Cc: Sridhar Samudrala Acked-by: Vlad Yasevich Acked-by: David S. Miller Acked-by: Serge Hallyn Signed-off-by: Eric W. Biederman --- include/net/tcp.h | 3 ++- init/Kconfig | 6 ------ net/appletalk/atalk_proc.c | 3 ++- net/ipv4/ping.c | 4 +++- net/ipv4/raw.c | 4 +++- net/ipv4/tcp_ipv4.c | 6 +++--- net/ipv4/udp.c | 4 +++- net/ipv6/raw.c | 3 ++- net/ipv6/tcp_ipv6.c | 6 +++--- net/ipv6/udp.c | 3 ++- net/ipx/ipx_proc.c | 3 ++- net/key/af_key.c | 2 +- net/llc/llc_proc.c | 2 +- net/packet/af_packet.c | 2 +- net/phonet/socket.c | 6 ++++-- net/sctp/proc.c | 6 ++++-- 16 files changed, 36 insertions(+), 27 deletions(-) (limited to 'net/ipv6/tcp_ipv6.c') diff --git a/include/net/tcp.h b/include/net/tcp.h index e19124b84cd2..91e746736a8f 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -1509,7 +1509,8 @@ struct tcp_iter_state { sa_family_t family; enum tcp_seq_states state; struct sock *syn_wait_sk; - int bucket, offset, sbucket, num, uid; + int bucket, offset, sbucket, num; + kuid_t uid; loff_t last_pos; }; diff --git a/init/Kconfig b/init/Kconfig index 80fae193e3a1..25a6ebb50c64 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -942,10 +942,7 @@ config UIDGID_CONVERTED depends on PROC_EVENTS = n # Networking - depends on PACKET = n depends on NET_9P = n - depends on IPX = n - depends on PHONET = n depends on NET_CLS_FLOW = n depends on NETFILTER_XT_MATCH_OWNER = n depends on NETFILTER_XT_MATCH_RECENT = n @@ -953,14 +950,11 @@ config UIDGID_CONVERTED depends on NETFILTER_NETLINK_LOG = n depends on INET = n depends on IPV6 = n - depends on IP_SCTP = n depends on AF_RXRPC = n - depends on LLC2 = n depends on NET_KEY = n depends on INET_DIAG = n depends on DNS_RESOLVER = n depends on AX25 = n - depends on ATALK = n # Filesystems depends on USB_GADGETFS = n diff --git a/net/appletalk/atalk_proc.c b/net/appletalk/atalk_proc.c index b5b1a221c242..c30f3a0717fb 100644 --- a/net/appletalk/atalk_proc.c +++ b/net/appletalk/atalk_proc.c @@ -183,7 +183,8 @@ static int atalk_seq_socket_show(struct seq_file *seq, void *v) ntohs(at->dest_net), at->dest_node, at->dest_port, sk_wmem_alloc_get(s), sk_rmem_alloc_get(s), - s->sk_state, SOCK_INODE(s->sk_socket)->i_uid); + s->sk_state, + from_kuid_munged(seq_user_ns(seq), sock_i_uid(s))); out: return 0; } diff --git a/net/ipv4/ping.c b/net/ipv4/ping.c index 6232d476f37e..bee5eeb676f8 100644 --- a/net/ipv4/ping.c +++ b/net/ipv4/ping.c @@ -845,7 +845,9 @@ static void ping_format_sock(struct sock *sp, struct seq_file *f, bucket, src, srcp, dest, destp, sp->sk_state, sk_wmem_alloc_get(sp), sk_rmem_alloc_get(sp), - 0, 0L, 0, sock_i_uid(sp), 0, sock_i_ino(sp), + 0, 0L, 0, + from_kuid_munged(seq_user_ns(f), sock_i_uid(sp)), + 0, sock_i_ino(sp), atomic_read(&sp->sk_refcnt), sp, atomic_read(&sp->sk_drops), len); } diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c index ff0f071969ea..f2425785d40a 100644 --- a/net/ipv4/raw.c +++ b/net/ipv4/raw.c @@ -992,7 +992,9 @@ static void raw_sock_seq_show(struct seq_file *seq, struct sock *sp, int i) i, src, srcp, dest, destp, sp->sk_state, sk_wmem_alloc_get(sp), sk_rmem_alloc_get(sp), - 0, 0L, 0, sock_i_uid(sp), 0, sock_i_ino(sp), + 0, 0L, 0, + from_kuid_munged(seq_user_ns(seq), sock_i_uid(sp)), + 0, sock_i_ino(sp), atomic_read(&sp->sk_refcnt), sp, atomic_read(&sp->sk_drops)); } diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 42b2a6a73092..642be8a4c6a3 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -2382,7 +2382,7 @@ void tcp_proc_unregister(struct net *net, struct tcp_seq_afinfo *afinfo) EXPORT_SYMBOL(tcp_proc_unregister); static void get_openreq4(const struct sock *sk, const struct request_sock *req, - struct seq_file *f, int i, int uid, int *len) + struct seq_file *f, int i, kuid_t uid, int *len) { const struct inet_request_sock *ireq = inet_rsk(req); int ttd = req->expires - jiffies; @@ -2399,7 +2399,7 @@ static void get_openreq4(const struct sock *sk, const struct request_sock *req, 1, /* timers active (only the expire timer) */ jiffies_to_clock_t(ttd), req->retrans, - uid, + from_kuid_munged(seq_user_ns(f), uid), 0, /* non standard timer */ 0, /* open_requests have no inode */ atomic_read(&sk->sk_refcnt), @@ -2450,7 +2450,7 @@ static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i, int *len) timer_active, jiffies_to_clock_t(timer_expires - jiffies), icsk->icsk_retransmits, - sock_i_uid(sk), + from_kuid_munged(seq_user_ns(f), sock_i_uid(sk)), icsk->icsk_probes_out, sock_i_ino(sk), atomic_read(&sk->sk_refcnt), sk, diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index b4c3582a991f..53b89817c008 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -2110,7 +2110,9 @@ static void udp4_format_sock(struct sock *sp, struct seq_file *f, bucket, src, srcp, dest, destp, sp->sk_state, sk_wmem_alloc_get(sp), sk_rmem_alloc_get(sp), - 0, 0L, 0, sock_i_uid(sp), 0, sock_i_ino(sp), + 0, 0L, 0, + from_kuid_munged(seq_user_ns(f), sock_i_uid(sp)), + 0, sock_i_ino(sp), atomic_read(&sp->sk_refcnt), sp, atomic_read(&sp->sk_drops), len); } diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c index ef0579d5bca6..7af88ef01657 100644 --- a/net/ipv6/raw.c +++ b/net/ipv6/raw.c @@ -1251,7 +1251,8 @@ static void raw6_sock_seq_show(struct seq_file *seq, struct sock *sp, int i) sk_wmem_alloc_get(sp), sk_rmem_alloc_get(sp), 0, 0L, 0, - sock_i_uid(sp), 0, + from_kuid_munged(seq_user_ns(seq), sock_i_uid(sp)), + 0, sock_i_ino(sp), atomic_read(&sp->sk_refcnt), sp, atomic_read(&sp->sk_drops)); } diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index c66b90f71c9b..4b5b335ebde1 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -1803,7 +1803,7 @@ static void tcp_v6_destroy_sock(struct sock *sk) #ifdef CONFIG_PROC_FS /* Proc filesystem TCPv6 sock list dumping. */ static void get_openreq6(struct seq_file *seq, - const struct sock *sk, struct request_sock *req, int i, int uid) + const struct sock *sk, struct request_sock *req, int i, kuid_t uid) { int ttd = req->expires - jiffies; const struct in6_addr *src = &inet6_rsk(req)->loc_addr; @@ -1827,7 +1827,7 @@ static void get_openreq6(struct seq_file *seq, 1, /* timers active (only the expire timer) */ jiffies_to_clock_t(ttd), req->retrans, - uid, + from_kuid_munged(seq_user_ns(seq), uid), 0, /* non standard timer */ 0, /* open_requests have no inode */ 0, req); @@ -1877,7 +1877,7 @@ static void get_tcp6_sock(struct seq_file *seq, struct sock *sp, int i) timer_active, jiffies_to_clock_t(timer_expires - jiffies), icsk->icsk_retransmits, - sock_i_uid(sp), + from_kuid_munged(seq_user_ns(seq), sock_i_uid(sp)), icsk->icsk_probes_out, sock_i_ino(sp), atomic_read(&sp->sk_refcnt), sp, diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index 99d0077b56b8..bbdff07eebe1 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -1458,7 +1458,8 @@ static void udp6_sock_seq_show(struct seq_file *seq, struct sock *sp, int bucket sk_wmem_alloc_get(sp), sk_rmem_alloc_get(sp), 0, 0L, 0, - sock_i_uid(sp), 0, + from_kuid_munged(seq_user_ns(seq), sock_i_uid(sp)), + 0, sock_i_ino(sp), atomic_read(&sp->sk_refcnt), sp, atomic_read(&sp->sk_drops)); diff --git a/net/ipx/ipx_proc.c b/net/ipx/ipx_proc.c index f8ba30dfecae..02ff7f2f60d4 100644 --- a/net/ipx/ipx_proc.c +++ b/net/ipx/ipx_proc.c @@ -217,7 +217,8 @@ static int ipx_seq_socket_show(struct seq_file *seq, void *v) seq_printf(seq, "%08X %08X %02X %03d\n", sk_wmem_alloc_get(s), sk_rmem_alloc_get(s), - s->sk_state, SOCK_INODE(s->sk_socket)->i_uid); + s->sk_state, + from_kuid_munged(seq_user_ns(seq), sock_i_uid(s))); out: return 0; } diff --git a/net/key/af_key.c b/net/key/af_key.c index 34e418508a67..0481d4b51476 100644 --- a/net/key/af_key.c +++ b/net/key/af_key.c @@ -3661,7 +3661,7 @@ static int pfkey_seq_show(struct seq_file *f, void *v) atomic_read(&s->sk_refcnt), sk_rmem_alloc_get(s), sk_wmem_alloc_get(s), - sock_i_uid(s), + from_kuid_munged(seq_user_ns(f), sock_i_uid(s)), sock_i_ino(s) ); return 0; diff --git a/net/llc/llc_proc.c b/net/llc/llc_proc.c index a1839c004357..7b4799cfbf8d 100644 --- a/net/llc/llc_proc.c +++ b/net/llc/llc_proc.c @@ -151,7 +151,7 @@ static int llc_seq_socket_show(struct seq_file *seq, void *v) sk_wmem_alloc_get(sk), sk_rmem_alloc_get(sk) - llc->copied_seq, sk->sk_state, - sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_uid : -1, + from_kuid_munged(seq_user_ns(seq), sock_i_uid(sk)), llc->link); out: return 0; diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index ceaca7c134a0..d147317ce9ea 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -3846,7 +3846,7 @@ static int packet_seq_show(struct seq_file *seq, void *v) po->ifindex, po->running, atomic_read(&s->sk_rmem_alloc), - sock_i_uid(s), + from_kuid_munged(seq_user_ns(seq), sock_i_uid(s)), sock_i_ino(s)); } diff --git a/net/phonet/socket.c b/net/phonet/socket.c index 0acc943f713a..b7e982782255 100644 --- a/net/phonet/socket.c +++ b/net/phonet/socket.c @@ -612,7 +612,8 @@ static int pn_sock_seq_show(struct seq_file *seq, void *v) sk->sk_protocol, pn->sobject, pn->dobject, pn->resource, sk->sk_state, sk_wmem_alloc_get(sk), sk_rmem_alloc_get(sk), - sock_i_uid(sk), sock_i_ino(sk), + from_kuid_munged(seq_user_ns(seq), sock_i_uid(sk)), + sock_i_ino(sk), atomic_read(&sk->sk_refcnt), sk, atomic_read(&sk->sk_drops), &len); } @@ -796,7 +797,8 @@ static int pn_res_seq_show(struct seq_file *seq, void *v) struct sock *sk = *psk; seq_printf(seq, "%02X %5d %lu%n", - (int) (psk - pnres.sk), sock_i_uid(sk), + (int) (psk - pnres.sk), + from_kuid_munged(seq_user_ns(seq), sock_i_uid(sk)), sock_i_ino(sk), &len); } seq_printf(seq, "%*s\n", 63 - len, ""); diff --git a/net/sctp/proc.c b/net/sctp/proc.c index 1e2eee88c3ea..dc12febc977a 100644 --- a/net/sctp/proc.c +++ b/net/sctp/proc.c @@ -216,7 +216,8 @@ static int sctp_eps_seq_show(struct seq_file *seq, void *v) seq_printf(seq, "%8pK %8pK %-3d %-3d %-4d %-5d %5d %5lu ", ep, sk, sctp_sk(sk)->type, sk->sk_state, hash, epb->bind_addr.port, - sock_i_uid(sk), sock_i_ino(sk)); + from_kuid_munged(seq_user_ns(seq), sock_i_uid(sk)), + sock_i_ino(sk)); sctp_seq_dump_local_addrs(seq, epb); seq_printf(seq, "\n"); @@ -324,7 +325,8 @@ static int sctp_assocs_seq_show(struct seq_file *seq, void *v) assoc->assoc_id, assoc->sndbuf_used, atomic_read(&assoc->rmem_alloc), - sock_i_uid(sk), sock_i_ino(sk), + from_kuid_munged(seq_user_ns(seq), sock_i_uid(sk)), + sock_i_ino(sk), epb->bind_addr.port, assoc->peer.port); seq_printf(seq, " "); -- cgit v1.2.3 From 8336886f786fdacbc19b719c1f7ea91eb70706d4 Mon Sep 17 00:00:00 2001 From: Jerry Chu Date: Fri, 31 Aug 2012 12:29:12 +0000 Subject: tcp: TCP Fast Open Server - support TFO listeners This patch builds on top of the previous patch to add the support for TFO listeners. This includes - 1. allocating, properly initializing, and managing the per listener fastopen_queue structure when TFO is enabled 2. changes to the inet_csk_accept code to support TFO. E.g., the request_sock can no longer be freed upon accept(), not until 3WHS finishes 3. allowing a TCP_SYN_RECV socket to properly poll() and sendmsg() if it's a TFO socket 4. properly closing a TFO listener, and a TFO socket before 3WHS finishes 5. supporting TCP_FASTOPEN socket option 6. modifying tcp_check_req() to use to check a TFO socket as well as request_sock 7. supporting TCP's TFO cookie option 8. adding a new SYN-ACK retransmit handler to use the timer directly off the TFO socket rather than the listener socket. Note that TFO server side will not retransmit anything other than SYN-ACK until the 3WHS is completed. The patch also contains an important function "reqsk_fastopen_remove()" to manage the somewhat complex relation between a listener, its request_sock, and the corresponding child socket. See the comment above the function for the detail. Signed-off-by: H.K. Jerry Chu Cc: Yuchung Cheng Cc: Neal Cardwell Cc: Eric Dumazet Cc: Tom Herbert Signed-off-by: David S. Miller --- include/net/request_sock.h | 13 ------ include/net/tcp.h | 6 ++- net/core/request_sock.c | 95 +++++++++++++++++++++++++++++++++++++++++ net/ipv4/af_inet.c | 28 +++++++++++- net/ipv4/inet_connection_sock.c | 57 ++++++++++++++++++++++--- net/ipv4/syncookies.c | 1 + net/ipv4/tcp.c | 49 ++++++++++++++++++--- net/ipv4/tcp_ipv4.c | 4 +- net/ipv4/tcp_minisocks.c | 61 +++++++++++++++++++++----- net/ipv4/tcp_output.c | 21 ++++++--- net/ipv4/tcp_timer.c | 39 ++++++++++++++++- net/ipv6/syncookies.c | 1 + net/ipv6/tcp_ipv6.c | 5 ++- 13 files changed, 330 insertions(+), 50 deletions(-) (limited to 'net/ipv6/tcp_ipv6.c') diff --git a/include/net/request_sock.h b/include/net/request_sock.h index c3cdd6c9f448..b01d8dd9ee7c 100644 --- a/include/net/request_sock.h +++ b/include/net/request_sock.h @@ -226,19 +226,6 @@ static inline struct request_sock *reqsk_queue_remove(struct request_sock_queue return req; } -static inline struct sock *reqsk_queue_get_child(struct request_sock_queue *queue, - struct sock *parent) -{ - struct request_sock *req = reqsk_queue_remove(queue); - struct sock *child = req->sk; - - WARN_ON(child == NULL); - - sk_acceptq_removed(parent); - __reqsk_free(req); - return child; -} - static inline int reqsk_queue_removed(struct request_sock_queue *queue, struct request_sock *req) { diff --git a/include/net/tcp.h b/include/net/tcp.h index 9f8821e3293a..1421b02a7905 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -424,7 +424,8 @@ extern enum tcp_tw_status tcp_timewait_state_process(struct inet_timewait_sock * const struct tcphdr *th); extern struct sock * tcp_check_req(struct sock *sk,struct sk_buff *skb, struct request_sock *req, - struct request_sock **prev); + struct request_sock **prev, + bool fastopen); extern int tcp_child_process(struct sock *parent, struct sock *child, struct sk_buff *skb); extern bool tcp_use_frto(struct sock *sk); @@ -478,7 +479,8 @@ extern int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, extern int tcp_connect(struct sock *sk); extern struct sk_buff * tcp_make_synack(struct sock *sk, struct dst_entry *dst, struct request_sock *req, - struct request_values *rvp); + struct request_values *rvp, + struct tcp_fastopen_cookie *foc); extern int tcp_disconnect(struct sock *sk, int flags); void tcp_connect_init(struct sock *sk); diff --git a/net/core/request_sock.c b/net/core/request_sock.c index 9b570a6a33c5..c31d9e8668c3 100644 --- a/net/core/request_sock.c +++ b/net/core/request_sock.c @@ -15,6 +15,7 @@ #include #include #include +#include #include #include @@ -130,3 +131,97 @@ void reqsk_queue_destroy(struct request_sock_queue *queue) kfree(lopt); } +/* + * This function is called to set a Fast Open socket's "fastopen_rsk" field + * to NULL when a TFO socket no longer needs to access the request_sock. + * This happens only after 3WHS has been either completed or aborted (e.g., + * RST is received). + * + * Before TFO, a child socket is created only after 3WHS is completed, + * hence it never needs to access the request_sock. things get a lot more + * complex with TFO. A child socket, accepted or not, has to access its + * request_sock for 3WHS processing, e.g., to retransmit SYN-ACK pkts, + * until 3WHS is either completed or aborted. Afterwards the req will stay + * until either the child socket is accepted, or in the rare case when the + * listener is closed before the child is accepted. + * + * In short, a request socket is only freed after BOTH 3WHS has completed + * (or aborted) and the child socket has been accepted (or listener closed). + * When a child socket is accepted, its corresponding req->sk is set to + * NULL since it's no longer needed. More importantly, "req->sk == NULL" + * will be used by the code below to determine if a child socket has been + * accepted or not, and the check is protected by the fastopenq->lock + * described below. + * + * Note that fastopen_rsk is only accessed from the child socket's context + * with its socket lock held. But a request_sock (req) can be accessed by + * both its child socket through fastopen_rsk, and a listener socket through + * icsk_accept_queue.rskq_accept_head. To protect the access a simple spin + * lock per listener "icsk->icsk_accept_queue.fastopenq->lock" is created. + * only in the rare case when both the listener and the child locks are held, + * e.g., in inet_csk_listen_stop() do we not need to acquire the lock. + * The lock also protects other fields such as fastopenq->qlen, which is + * decremented by this function when fastopen_rsk is no longer needed. + * + * Note that another solution was to simply use the existing socket lock + * from the listener. But first socket lock is difficult to use. It is not + * a simple spin lock - one must consider sock_owned_by_user() and arrange + * to use sk_add_backlog() stuff. But what really makes it infeasible is the + * locking hierarchy violation. E.g., inet_csk_listen_stop() may try to + * acquire a child's lock while holding listener's socket lock. A corner + * case might also exist in tcp_v4_hnd_req() that will trigger this locking + * order. + * + * When a TFO req is created, it needs to sock_hold its listener to prevent + * the latter data structure from going away. + * + * This function also sets "treq->listener" to NULL and unreference listener + * socket. treq->listener is used by the listener so it is protected by the + * fastopenq->lock in this function. + */ +void reqsk_fastopen_remove(struct sock *sk, struct request_sock *req, + bool reset) +{ + struct sock *lsk = tcp_rsk(req)->listener; + struct fastopen_queue *fastopenq = + inet_csk(lsk)->icsk_accept_queue.fastopenq; + + BUG_ON(!spin_is_locked(&sk->sk_lock.slock) && !sock_owned_by_user(sk)); + + tcp_sk(sk)->fastopen_rsk = NULL; + spin_lock_bh(&fastopenq->lock); + fastopenq->qlen--; + tcp_rsk(req)->listener = NULL; + if (req->sk) /* the child socket hasn't been accepted yet */ + goto out; + + if (!reset || lsk->sk_state != TCP_LISTEN) { + /* If the listener has been closed don't bother with the + * special RST handling below. + */ + spin_unlock_bh(&fastopenq->lock); + sock_put(lsk); + reqsk_free(req); + return; + } + /* Wait for 60secs before removing a req that has triggered RST. + * This is a simple defense against TFO spoofing attack - by + * counting the req against fastopen.max_qlen, and disabling + * TFO when the qlen exceeds max_qlen. + * + * For more details see CoNext'11 "TCP Fast Open" paper. + */ + req->expires = jiffies + 60*HZ; + if (fastopenq->rskq_rst_head == NULL) + fastopenq->rskq_rst_head = req; + else + fastopenq->rskq_rst_tail->dl_next = req; + + req->dl_next = NULL; + fastopenq->rskq_rst_tail = req; + fastopenq->qlen++; +out: + spin_unlock_bh(&fastopenq->lock); + sock_put(lsk); + return; +} diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index 6681ccf5c3ee..4f70ef0b946d 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -149,6 +149,11 @@ void inet_sock_destruct(struct sock *sk) pr_err("Attempt to release alive inet socket %p\n", sk); return; } + if (sk->sk_type == SOCK_STREAM) { + struct fastopen_queue *fastopenq = + inet_csk(sk)->icsk_accept_queue.fastopenq; + kfree(fastopenq); + } WARN_ON(atomic_read(&sk->sk_rmem_alloc)); WARN_ON(atomic_read(&sk->sk_wmem_alloc)); @@ -212,6 +217,26 @@ int inet_listen(struct socket *sock, int backlog) * we can only allow the backlog to be adjusted. */ if (old_state != TCP_LISTEN) { + /* Check special setups for testing purpose to enable TFO w/o + * requiring TCP_FASTOPEN sockopt. + * Note that only TCP sockets (SOCK_STREAM) will reach here. + * Also fastopenq may already been allocated because this + * socket was in TCP_LISTEN state previously but was + * shutdown() (rather than close()). + */ + if ((sysctl_tcp_fastopen & TFO_SERVER_ENABLE) != 0 && + inet_csk(sk)->icsk_accept_queue.fastopenq == NULL) { + if ((sysctl_tcp_fastopen & TFO_SERVER_WO_SOCKOPT1) != 0) + err = fastopen_init_queue(sk, backlog); + else if ((sysctl_tcp_fastopen & + TFO_SERVER_WO_SOCKOPT2) != 0) + err = fastopen_init_queue(sk, + ((uint)sysctl_tcp_fastopen) >> 16); + else + err = 0; + if (err) + goto out; + } err = inet_csk_listen_start(sk, backlog); if (err) goto out; @@ -701,7 +726,8 @@ int inet_accept(struct socket *sock, struct socket *newsock, int flags) sock_rps_record_flow(sk2); WARN_ON(!((1 << sk2->sk_state) & - (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT | TCPF_CLOSE))); + (TCPF_ESTABLISHED | TCPF_SYN_RECV | + TCPF_CLOSE_WAIT | TCPF_CLOSE))); sock_graft(sk2, newsock); diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index 7f75f21d7b83..8464b79c493f 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -283,7 +283,9 @@ static int inet_csk_wait_for_connect(struct sock *sk, long timeo) struct sock *inet_csk_accept(struct sock *sk, int flags, int *err) { struct inet_connection_sock *icsk = inet_csk(sk); + struct request_sock_queue *queue = &icsk->icsk_accept_queue; struct sock *newsk; + struct request_sock *req; int error; lock_sock(sk); @@ -296,7 +298,7 @@ struct sock *inet_csk_accept(struct sock *sk, int flags, int *err) goto out_err; /* Find already established connection */ - if (reqsk_queue_empty(&icsk->icsk_accept_queue)) { + if (reqsk_queue_empty(queue)) { long timeo = sock_rcvtimeo(sk, flags & O_NONBLOCK); /* If this is a non blocking socket don't sleep */ @@ -308,14 +310,32 @@ struct sock *inet_csk_accept(struct sock *sk, int flags, int *err) if (error) goto out_err; } - - newsk = reqsk_queue_get_child(&icsk->icsk_accept_queue, sk); - WARN_ON(newsk->sk_state == TCP_SYN_RECV); + req = reqsk_queue_remove(queue); + newsk = req->sk; + + sk_acceptq_removed(sk); + if (sk->sk_type == SOCK_STREAM && queue->fastopenq != NULL) { + spin_lock_bh(&queue->fastopenq->lock); + if (tcp_rsk(req)->listener) { + /* We are still waiting for the final ACK from 3WHS + * so can't free req now. Instead, we set req->sk to + * NULL to signify that the child socket is taken + * so reqsk_fastopen_remove() will free the req + * when 3WHS finishes (or is aborted). + */ + req->sk = NULL; + req = NULL; + } + spin_unlock_bh(&queue->fastopenq->lock); + } out: release_sock(sk); + if (req) + __reqsk_free(req); return newsk; out_err: newsk = NULL; + req = NULL; *err = error; goto out; } @@ -720,13 +740,14 @@ EXPORT_SYMBOL_GPL(inet_csk_listen_start); void inet_csk_listen_stop(struct sock *sk) { struct inet_connection_sock *icsk = inet_csk(sk); + struct request_sock_queue *queue = &icsk->icsk_accept_queue; struct request_sock *acc_req; struct request_sock *req; inet_csk_delete_keepalive_timer(sk); /* make all the listen_opt local to us */ - acc_req = reqsk_queue_yank_acceptq(&icsk->icsk_accept_queue); + acc_req = reqsk_queue_yank_acceptq(queue); /* Following specs, it would be better either to send FIN * (and enter FIN-WAIT-1, it is normal close) @@ -736,7 +757,7 @@ void inet_csk_listen_stop(struct sock *sk) * To be honest, we are not able to make either * of the variants now. --ANK */ - reqsk_queue_destroy(&icsk->icsk_accept_queue); + reqsk_queue_destroy(queue); while ((req = acc_req) != NULL) { struct sock *child = req->sk; @@ -754,6 +775,19 @@ void inet_csk_listen_stop(struct sock *sk) percpu_counter_inc(sk->sk_prot->orphan_count); + if (sk->sk_type == SOCK_STREAM && tcp_rsk(req)->listener) { + BUG_ON(tcp_sk(child)->fastopen_rsk != req); + BUG_ON(sk != tcp_rsk(req)->listener); + + /* Paranoid, to prevent race condition if + * an inbound pkt destined for child is + * blocked by sock lock in tcp_v4_rcv(). + * Also to satisfy an assertion in + * tcp_v4_destroy_sock(). + */ + tcp_sk(child)->fastopen_rsk = NULL; + sock_put(sk); + } inet_csk_destroy_sock(child); bh_unlock_sock(child); @@ -763,6 +797,17 @@ void inet_csk_listen_stop(struct sock *sk) sk_acceptq_removed(sk); __reqsk_free(req); } + if (queue->fastopenq != NULL) { + /* Free all the reqs queued in rskq_rst_head. */ + spin_lock_bh(&queue->fastopenq->lock); + acc_req = queue->fastopenq->rskq_rst_head; + queue->fastopenq->rskq_rst_head = NULL; + spin_unlock_bh(&queue->fastopenq->lock); + while ((req = acc_req) != NULL) { + acc_req = req->dl_next; + __reqsk_free(req); + } + } WARN_ON(sk->sk_ack_backlog); } EXPORT_SYMBOL_GPL(inet_csk_listen_stop); diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c index 650e1528e1e6..ba48e799b031 100644 --- a/net/ipv4/syncookies.c +++ b/net/ipv4/syncookies.c @@ -319,6 +319,7 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb, ireq->tstamp_ok = tcp_opt.saw_tstamp; req->ts_recent = tcp_opt.saw_tstamp ? tcp_opt.rcv_tsval : 0; treq->snt_synack = tcp_opt.saw_tstamp ? tcp_opt.rcv_tsecr : 0; + treq->listener = NULL; /* We throwed the options of the initial SYN away, so we hope * the ACK carries the same options again (see RFC1122 4.2.3.8) diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 2109ff4a1daf..df83d744e380 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -486,8 +486,9 @@ unsigned int tcp_poll(struct file *file, struct socket *sock, poll_table *wait) if (sk->sk_shutdown & RCV_SHUTDOWN) mask |= POLLIN | POLLRDNORM | POLLRDHUP; - /* Connected? */ - if ((1 << sk->sk_state) & ~(TCPF_SYN_SENT | TCPF_SYN_RECV)) { + /* Connected or passive Fast Open socket? */ + if (sk->sk_state != TCP_SYN_SENT && + (sk->sk_state != TCP_SYN_RECV || tp->fastopen_rsk != NULL)) { int target = sock_rcvlowat(sk, 0, INT_MAX); if (tp->urg_seq == tp->copied_seq && @@ -840,10 +841,15 @@ static ssize_t do_tcp_sendpages(struct sock *sk, struct page **pages, int poffse ssize_t copied; long timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT); - /* Wait for a connection to finish. */ - if ((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)) + /* Wait for a connection to finish. One exception is TCP Fast Open + * (passive side) where data is allowed to be sent before a connection + * is fully established. + */ + if (((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)) && + !tcp_passive_fastopen(sk)) { if ((err = sk_stream_wait_connect(sk, &timeo)) != 0) goto out_err; + } clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags); @@ -1042,10 +1048,15 @@ int tcp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT); - /* Wait for a connection to finish. */ - if ((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)) + /* Wait for a connection to finish. One exception is TCP Fast Open + * (passive side) where data is allowed to be sent before a connection + * is fully established. + */ + if (((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)) && + !tcp_passive_fastopen(sk)) { if ((err = sk_stream_wait_connect(sk, &timeo)) != 0) goto do_error; + } if (unlikely(tp->repair)) { if (tp->repair_queue == TCP_RECV_QUEUE) { @@ -2144,6 +2155,10 @@ void tcp_close(struct sock *sk, long timeout) * they look as CLOSING or LAST_ACK for Linux) * Probably, I missed some more holelets. * --ANK + * XXX (TFO) - To start off we don't support SYN+ACK+FIN + * in a single packet! (May consider it later but will + * probably need API support or TCP_CORK SYN-ACK until + * data is written and socket is closed.) */ tcp_send_fin(sk); } @@ -2215,8 +2230,16 @@ adjudge_to_death: } } - if (sk->sk_state == TCP_CLOSE) + if (sk->sk_state == TCP_CLOSE) { + struct request_sock *req = tcp_sk(sk)->fastopen_rsk; + /* We could get here with a non-NULL req if the socket is + * aborted (e.g., closed with unread data) before 3WHS + * finishes. + */ + if (req != NULL) + reqsk_fastopen_remove(sk, req, false); inet_csk_destroy_sock(sk); + } /* Otherwise, socket is reprieved until protocol close. */ out: @@ -2688,6 +2711,14 @@ static int do_tcp_setsockopt(struct sock *sk, int level, else icsk->icsk_user_timeout = msecs_to_jiffies(val); break; + + case TCP_FASTOPEN: + if (val >= 0 && ((1 << sk->sk_state) & (TCPF_CLOSE | + TCPF_LISTEN))) + err = fastopen_init_queue(sk, val); + else + err = -EINVAL; + break; default: err = -ENOPROTOOPT; break; @@ -3501,11 +3532,15 @@ EXPORT_SYMBOL(tcp_cookie_generator); void tcp_done(struct sock *sk) { + struct request_sock *req = tcp_sk(sk)->fastopen_rsk; + if (sk->sk_state == TCP_SYN_SENT || sk->sk_state == TCP_SYN_RECV) TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_ATTEMPTFAILS); tcp_set_state(sk, TCP_CLOSE); tcp_clear_xmit_timers(sk); + if (req != NULL) + reqsk_fastopen_remove(sk, req, false); sk->sk_shutdown = SHUTDOWN_MASK; diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 36f02f954ac1..bb148dee1edd 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -839,7 +839,7 @@ static int tcp_v4_send_synack(struct sock *sk, struct dst_entry *dst, if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL) return -1; - skb = tcp_make_synack(sk, dst, req, rvp); + skb = tcp_make_synack(sk, dst, req, rvp, NULL); if (skb) { __tcp_v4_send_check(skb, ireq->loc_addr, ireq->rmt_addr); @@ -1554,7 +1554,7 @@ static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb) struct request_sock *req = inet_csk_search_req(sk, &prev, th->source, iph->saddr, iph->daddr); if (req) - return tcp_check_req(sk, skb, req, prev); + return tcp_check_req(sk, skb, req, prev, false); nsk = inet_lookup_established(sock_net(sk), &tcp_hashinfo, iph->saddr, th->source, iph->daddr, th->dest, inet_iif(skb)); diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index 6ff7f10dce9d..e965319d610b 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -507,6 +507,7 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req, newicsk->icsk_ack.last_seg_size = skb->len - newtp->tcp_header_len; newtp->rx_opt.mss_clamp = req->mss; TCP_ECN_openreq_child(newtp, req); + newtp->fastopen_rsk = NULL; TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_PASSIVEOPENS); } @@ -515,13 +516,18 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req, EXPORT_SYMBOL(tcp_create_openreq_child); /* - * Process an incoming packet for SYN_RECV sockets represented - * as a request_sock. + * Process an incoming packet for SYN_RECV sockets represented as a + * request_sock. Normally sk is the listener socket but for TFO it + * points to the child socket. + * + * XXX (TFO) - The current impl contains a special check for ack + * validation and inside tcp_v4_reqsk_send_ack(). Can we do better? */ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb, struct request_sock *req, - struct request_sock **prev) + struct request_sock **prev, + bool fastopen) { struct tcp_options_received tmp_opt; const u8 *hash_location; @@ -530,6 +536,8 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb, __be32 flg = tcp_flag_word(th) & (TCP_FLAG_RST|TCP_FLAG_SYN|TCP_FLAG_ACK); bool paws_reject = false; + BUG_ON(fastopen == (sk->sk_state == TCP_LISTEN)); + tmp_opt.saw_tstamp = 0; if (th->doff > (sizeof(struct tcphdr)>>2)) { tcp_parse_options(skb, &tmp_opt, &hash_location, 0, NULL); @@ -565,6 +573,9 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb, * * Enforce "SYN-ACK" according to figure 8, figure 6 * of RFC793, fixed by RFC1122. + * + * Note that even if there is new data in the SYN packet + * they will be thrown away too. */ req->rsk_ops->rtx_syn_ack(sk, req, NULL); return NULL; @@ -622,9 +633,12 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb, * sent (the segment carries an unacceptable ACK) ... * a reset is sent." * - * Invalid ACK: reset will be sent by listening socket + * Invalid ACK: reset will be sent by listening socket. + * Note that the ACK validity check for a Fast Open socket is done + * elsewhere and is checked directly against the child socket rather + * than req because user data may have been sent out. */ - if ((flg & TCP_FLAG_ACK) && + if ((flg & TCP_FLAG_ACK) && !fastopen && (TCP_SKB_CB(skb)->ack_seq != tcp_rsk(req)->snt_isn + 1 + tcp_s_data_size(tcp_sk(sk)))) return sk; @@ -637,7 +651,7 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb, /* RFC793: "first check sequence number". */ if (paws_reject || !tcp_in_window(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq, - tcp_rsk(req)->rcv_isn + 1, tcp_rsk(req)->rcv_isn + 1 + req->rcv_wnd)) { + tcp_rsk(req)->rcv_nxt, tcp_rsk(req)->rcv_nxt + req->rcv_wnd)) { /* Out of window: send ACK and drop. */ if (!(flg & TCP_FLAG_RST)) req->rsk_ops->send_ack(sk, skb, req); @@ -648,7 +662,7 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb, /* In sequence, PAWS is OK. */ - if (tmp_opt.saw_tstamp && !after(TCP_SKB_CB(skb)->seq, tcp_rsk(req)->rcv_isn + 1)) + if (tmp_opt.saw_tstamp && !after(TCP_SKB_CB(skb)->seq, tcp_rsk(req)->rcv_nxt)) req->ts_recent = tmp_opt.rcv_tsval; if (TCP_SKB_CB(skb)->seq == tcp_rsk(req)->rcv_isn) { @@ -667,10 +681,19 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb, /* ACK sequence verified above, just make sure ACK is * set. If ACK not set, just silently drop the packet. + * + * XXX (TFO) - if we ever allow "data after SYN", the + * following check needs to be removed. */ if (!(flg & TCP_FLAG_ACK)) return NULL; + /* For Fast Open no more processing is needed (sk is the + * child socket). + */ + if (fastopen) + return sk; + /* While TCP_DEFER_ACCEPT is active, drop bare ACK. */ if (req->retrans < inet_csk(sk)->icsk_accept_queue.rskq_defer_accept && TCP_SKB_CB(skb)->end_seq == tcp_rsk(req)->rcv_isn + 1) { @@ -706,11 +729,21 @@ listen_overflow: } embryonic_reset: - NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_EMBRYONICRSTS); - if (!(flg & TCP_FLAG_RST)) + if (!(flg & TCP_FLAG_RST)) { + /* Received a bad SYN pkt - for TFO We try not to reset + * the local connection unless it's really necessary to + * avoid becoming vulnerable to outside attack aiming at + * resetting legit local connections. + */ req->rsk_ops->send_reset(sk, skb); - - inet_csk_reqsk_queue_drop(sk, req, prev); + } else if (fastopen) { /* received a valid RST pkt */ + reqsk_fastopen_remove(sk, req, true); + tcp_reset(sk); + } + if (!fastopen) { + inet_csk_reqsk_queue_drop(sk, req, prev); + NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_EMBRYONICRSTS); + } return NULL; } EXPORT_SYMBOL(tcp_check_req); @@ -719,6 +752,12 @@ EXPORT_SYMBOL(tcp_check_req); * Queue segment on the new socket if the new socket is active, * otherwise we just shortcircuit this and continue with * the new socket. + * + * For the vast majority of cases child->sk_state will be TCP_SYN_RECV + * when entering. But other states are possible due to a race condition + * where after __inet_lookup_established() fails but before the listener + * locked is obtained, other packets cause the same connection to + * be created. */ int tcp_child_process(struct sock *parent, struct sock *child, diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index d04632673a9e..9383b51f3efc 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -702,7 +702,8 @@ static unsigned int tcp_synack_options(struct sock *sk, unsigned int mss, struct sk_buff *skb, struct tcp_out_options *opts, struct tcp_md5sig_key **md5, - struct tcp_extend_values *xvp) + struct tcp_extend_values *xvp, + struct tcp_fastopen_cookie *foc) { struct inet_request_sock *ireq = inet_rsk(req); unsigned int remaining = MAX_TCP_OPTION_SPACE; @@ -747,7 +748,15 @@ static unsigned int tcp_synack_options(struct sock *sk, if (unlikely(!ireq->tstamp_ok)) remaining -= TCPOLEN_SACKPERM_ALIGNED; } - + if (foc != NULL) { + u32 need = TCPOLEN_EXP_FASTOPEN_BASE + foc->len; + need = (need + 3) & ~3U; /* Align to 32 bits */ + if (remaining >= need) { + opts->options |= OPTION_FAST_OPEN_COOKIE; + opts->fastopen_cookie = foc; + remaining -= need; + } + } /* Similar rationale to tcp_syn_options() applies here, too. * If the options fit, the same options should fit now! */ @@ -2658,7 +2667,8 @@ int tcp_send_synack(struct sock *sk) */ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst, struct request_sock *req, - struct request_values *rvp) + struct request_values *rvp, + struct tcp_fastopen_cookie *foc) { struct tcp_out_options opts; struct tcp_extend_values *xvp = tcp_xv(rvp); @@ -2718,7 +2728,7 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst, #endif TCP_SKB_CB(skb)->when = tcp_time_stamp; tcp_header_size = tcp_synack_options(sk, req, mss, - skb, &opts, &md5, xvp) + skb, &opts, &md5, xvp, foc) + sizeof(*th); skb_push(skb, tcp_header_size); @@ -2772,7 +2782,8 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst, } th->seq = htonl(TCP_SKB_CB(skb)->seq); - th->ack_seq = htonl(tcp_rsk(req)->rcv_isn + 1); + /* XXX data is queued and acked as is. No buffer/window check */ + th->ack_seq = htonl(tcp_rsk(req)->rcv_nxt); /* RFC1323: The window in SYN & SYN/ACK segments is never scaled. */ th->window = htons(min(req->rcv_wnd, 65535U)); diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index b774a03bd1dc..fc04711e80c8 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c @@ -304,6 +304,35 @@ static void tcp_probe_timer(struct sock *sk) } } +/* + * Timer for Fast Open socket to retransmit SYNACK. Note that the + * sk here is the child socket, not the parent (listener) socket. + */ +static void tcp_fastopen_synack_timer(struct sock *sk) +{ + struct inet_connection_sock *icsk = inet_csk(sk); + int max_retries = icsk->icsk_syn_retries ? : + sysctl_tcp_synack_retries + 1; /* add one more retry for fastopen */ + struct request_sock *req; + + req = tcp_sk(sk)->fastopen_rsk; + req->rsk_ops->syn_ack_timeout(sk, req); + + if (req->retrans >= max_retries) { + tcp_write_err(sk); + return; + } + /* XXX (TFO) - Unlike regular SYN-ACK retransmit, we ignore error + * returned from rtx_syn_ack() to make it more persistent like + * regular retransmit because if the child socket has been accepted + * it's not good to give up too easily. + */ + req->rsk_ops->rtx_syn_ack(sk, req, NULL); + req->retrans++; + inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, + TCP_TIMEOUT_INIT << req->retrans, TCP_RTO_MAX); +} + /* * The TCP retransmit timer. */ @@ -317,7 +346,15 @@ void tcp_retransmit_timer(struct sock *sk) tcp_resume_early_retransmit(sk); return; } - + if (tp->fastopen_rsk) { + BUG_ON(sk->sk_state != TCP_SYN_RECV && + sk->sk_state != TCP_FIN_WAIT1); + tcp_fastopen_synack_timer(sk); + /* Before we receive ACK to our SYN-ACK don't retransmit + * anything else (e.g., data or FIN segments). + */ + return; + } if (!tp->packets_out) goto out; diff --git a/net/ipv6/syncookies.c b/net/ipv6/syncookies.c index bb46061c813a..182ab9a85d6c 100644 --- a/net/ipv6/syncookies.c +++ b/net/ipv6/syncookies.c @@ -190,6 +190,7 @@ struct sock *cookie_v6_check(struct sock *sk, struct sk_buff *skb) ireq = inet_rsk(req); ireq6 = inet6_rsk(req); treq = tcp_rsk(req); + treq->listener = NULL; if (security_inet_conn_request(sk, skb, req)) goto out_free; diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index f99b81d53cca..09078b9bc6f6 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -475,7 +475,7 @@ static int tcp_v6_send_synack(struct sock *sk, struct dst_entry *dst, if (!dst && (dst = inet6_csk_route_req(sk, fl6, req)) == NULL) goto done; - skb = tcp_make_synack(sk, dst, req, rvp); + skb = tcp_make_synack(sk, dst, req, rvp, NULL); if (skb) { __tcp_v6_send_check(skb, &treq->loc_addr, &treq->rmt_addr); @@ -987,7 +987,7 @@ static struct sock *tcp_v6_hnd_req(struct sock *sk,struct sk_buff *skb) &ipv6_hdr(skb)->saddr, &ipv6_hdr(skb)->daddr, inet6_iif(skb)); if (req) - return tcp_check_req(sk, skb, req, prev); + return tcp_check_req(sk, skb, req, prev, false); nsk = __inet6_lookup_established(sock_net(sk), &tcp_hashinfo, &ipv6_hdr(skb)->saddr, th->source, @@ -1179,6 +1179,7 @@ have_isn: want_cookie) goto drop_and_free; + tcp_rsk(req)->listener = NULL; inet6_csk_reqsk_queue_hash_add(sk, req, TCP_TIMEOUT_INIT); return 0; -- cgit v1.2.3 From 623df484a777f3c00c1ea3d6a7565b8d8ac688a1 Mon Sep 17 00:00:00 2001 From: Neal Cardwell Date: Sat, 22 Sep 2012 04:18:54 +0000 Subject: tcp: extract code to compute SYNACK RTT In preparation for adding another spot where we compute the SYNACK RTT, extract this code so that it can be shared. Signed-off-by: Neal Cardwell Signed-off-by: David S. Miller --- include/net/tcp.h | 9 +++++++++ net/ipv4/tcp_ipv4.c | 4 +--- net/ipv6/tcp_ipv6.c | 4 +--- 3 files changed, 11 insertions(+), 6 deletions(-) (limited to 'net/ipv6/tcp_ipv6.c') diff --git a/include/net/tcp.h b/include/net/tcp.h index a8cb00c0c6d9..a718d0e3d8e7 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -1137,6 +1137,15 @@ static inline void tcp_openreq_init(struct request_sock *req, ireq->loc_port = tcp_hdr(skb)->dest; } +/* Compute time elapsed between SYNACK and the ACK completing 3WHS */ +static inline void tcp_synack_rtt_meas(struct sock *sk, + struct request_sock *req) +{ + if (tcp_rsk(req)->snt_synack) + tcp_valid_rtt_meas(sk, + tcp_time_stamp - tcp_rsk(req)->snt_synack); +} + extern void tcp_enter_memory_pressure(struct sock *sk); static inline int keepalive_intvl_when(const struct tcp_sock *tp) diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index e64abed249cc..1e66f7fb4fe6 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -1733,9 +1733,7 @@ struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb, newtp->advmss = tcp_sk(sk)->rx_opt.user_mss; tcp_initialize_rcv_mss(newsk); - if (tcp_rsk(req)->snt_synack) - tcp_valid_rtt_meas(newsk, - tcp_time_stamp - tcp_rsk(req)->snt_synack); + tcp_synack_rtt_meas(newsk, req); newtp->total_retrans = req->retrans; #ifdef CONFIG_TCP_MD5SIG diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index f3bfb8bbfdec..cfeeeb7fcf54 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -1348,9 +1348,7 @@ static struct sock * tcp_v6_syn_recv_sock(struct sock *sk, struct sk_buff *skb, newtp->advmss = tcp_sk(sk)->rx_opt.user_mss; tcp_initialize_rcv_mss(newsk); - if (tcp_rsk(req)->snt_synack) - tcp_valid_rtt_meas(newsk, - tcp_time_stamp - tcp_rsk(req)->snt_synack); + tcp_synack_rtt_meas(newsk, req); newtp->total_retrans = req->retrans; newinet->inet_daddr = newinet->inet_saddr = LOOPBACK4_IPV6; -- cgit v1.2.3 From 016818d076871c4ee34db1e8d74dc17ac1de626a Mon Sep 17 00:00:00 2001 From: Neal Cardwell Date: Sat, 22 Sep 2012 04:18:55 +0000 Subject: tcp: TCP Fast Open Server - take SYNACK RTT after completing 3WHS When taking SYNACK RTT samples for servers using TCP Fast Open, fix the code to ensure that we only call tcp_valid_rtt_meas() after we receive the ACK that completes the 3-way handshake. Previously we were always taking an RTT sample in tcp_v4_syn_recv_sock(). However, for TCP Fast Open connections tcp_v4_conn_req_fastopen() calls tcp_v4_syn_recv_sock() at the time we receive the SYN. So for TFO we must wait until tcp_rcv_state_process() to take the RTT sample. To fix this, we wait until after TFO calls tcp_v4_syn_recv_sock() before we set the snt_synack timestamp, since tcp_synack_rtt_meas() already ensures that we only take a SYNACK RTT sample if snt_synack is non-zero. To be careful, we only take a snt_synack timestamp when a SYNACK transmit or retransmit succeeds. Signed-off-by: Neal Cardwell Signed-off-by: David S. Miller --- include/net/tcp.h | 1 + net/ipv4/tcp_input.c | 1 + net/ipv4/tcp_ipv4.c | 12 +++++++++--- net/ipv6/tcp_ipv6.c | 2 +- 4 files changed, 12 insertions(+), 4 deletions(-) (limited to 'net/ipv6/tcp_ipv6.c') diff --git a/include/net/tcp.h b/include/net/tcp.h index a718d0e3d8e7..6feeccd83dd7 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -1125,6 +1125,7 @@ static inline void tcp_openreq_init(struct request_sock *req, req->cookie_ts = 0; tcp_rsk(req)->rcv_isn = TCP_SKB_CB(skb)->seq; tcp_rsk(req)->rcv_nxt = TCP_SKB_CB(skb)->seq + 1; + tcp_rsk(req)->snt_synack = 0; req->mss = rx_opt->mss_clamp; req->ts_recent = rx_opt->saw_tstamp ? rx_opt->rcv_tsval : 0; ireq->tstamp_ok = rx_opt->tstamp_ok; diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index e2bec815ff23..bb326684495b 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -5983,6 +5983,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, * need req so release it. */ if (req) { + tcp_synack_rtt_meas(sk, req); reqsk_fastopen_remove(sk, req, false); } else { /* Make sure socket is routed, for diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 1e66f7fb4fe6..0a7e020f16b5 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -868,6 +868,8 @@ static int tcp_v4_send_synack(struct sock *sk, struct dst_entry *dst, ireq->rmt_addr, ireq->opt); err = net_xmit_eval(err); + if (!tcp_rsk(req)->snt_synack && !err) + tcp_rsk(req)->snt_synack = tcp_time_stamp; } return err; @@ -1382,6 +1384,7 @@ static int tcp_v4_conn_req_fastopen(struct sock *sk, struct request_sock_queue *queue = &inet_csk(sk)->icsk_accept_queue; const struct inet_request_sock *ireq = inet_rsk(req); struct sock *child; + int err; req->retrans = 0; req->sk = NULL; @@ -1393,8 +1396,11 @@ static int tcp_v4_conn_req_fastopen(struct sock *sk, kfree_skb(skb_synack); return -1; } - ip_build_and_send_pkt(skb_synack, sk, ireq->loc_addr, - ireq->rmt_addr, ireq->opt); + err = ip_build_and_send_pkt(skb_synack, sk, ireq->loc_addr, + ireq->rmt_addr, ireq->opt); + err = net_xmit_eval(err); + if (!err) + tcp_rsk(req)->snt_synack = tcp_time_stamp; /* XXX (TFO) - is it ok to ignore error and continue? */ spin_lock(&queue->fastopenq->lock); @@ -1612,7 +1618,6 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb) isn = tcp_v4_init_sequence(skb); } tcp_rsk(req)->snt_isn = isn; - tcp_rsk(req)->snt_synack = tcp_time_stamp; if (dst == NULL) { dst = inet_csk_route_req(sk, &fl4, req); @@ -1650,6 +1655,7 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb) if (err || want_cookie) goto drop_and_free; + tcp_rsk(req)->snt_synack = tcp_time_stamp; tcp_rsk(req)->listener = NULL; /* Add the request_sock to the SYN table */ inet_csk_reqsk_queue_hash_add(sk, req, TCP_TIMEOUT_INIT); diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index cfeeeb7fcf54..d6212d6bc8d8 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -1169,7 +1169,6 @@ static int tcp_v6_conn_request(struct sock *sk, struct sk_buff *skb) } have_isn: tcp_rsk(req)->snt_isn = isn; - tcp_rsk(req)->snt_synack = tcp_time_stamp; if (security_inet_conn_request(sk, skb, req)) goto drop_and_release; @@ -1180,6 +1179,7 @@ have_isn: want_cookie) goto drop_and_free; + tcp_rsk(req)->snt_synack = tcp_time_stamp; tcp_rsk(req)->listener = NULL; inet6_csk_reqsk_queue_hash_add(sk, req, TCP_TIMEOUT_INIT); return 0; -- cgit v1.2.3 From 861b650101eb0c627d171eb18de81dddb93d395e Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 27 Sep 2012 02:14:53 +0000 Subject: tcp: gro: add checksuming helpers skb with CHECKSUM_NONE cant currently be handled by GRO, and we notice this deep in GRO stack in tcp[46]_gro_receive() But there are cases where GRO can be a benefit, even with a lack of checksums. This preliminary work is needed to add GRO support to tunnels. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/tcp_ipv4.c | 19 ++++++++++++++++--- net/ipv6/tcp_ipv6.c | 20 +++++++++++++++++--- 2 files changed, 33 insertions(+), 6 deletions(-) (limited to 'net/ipv6/tcp_ipv6.c') diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 385eb79cf6aa..75735c9a6a9d 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -2803,6 +2803,8 @@ void tcp4_proc_exit(void) struct sk_buff **tcp4_gro_receive(struct sk_buff **head, struct sk_buff *skb) { const struct iphdr *iph = skb_gro_network_header(skb); + __wsum wsum; + __sum16 sum; switch (skb->ip_summed) { case CHECKSUM_COMPLETE: @@ -2811,11 +2813,22 @@ struct sk_buff **tcp4_gro_receive(struct sk_buff **head, struct sk_buff *skb) skb->ip_summed = CHECKSUM_UNNECESSARY; break; } - - /* fall through */ - case CHECKSUM_NONE: +flush: NAPI_GRO_CB(skb)->flush = 1; return NULL; + + case CHECKSUM_NONE: + wsum = csum_tcpudp_nofold(iph->saddr, iph->daddr, + skb_gro_len(skb), IPPROTO_TCP, 0); + sum = csum_fold(skb_checksum(skb, + skb_gro_offset(skb), + skb_gro_len(skb), + wsum)); + if (sum) + goto flush; + + skb->ip_summed = CHECKSUM_UNNECESSARY; + break; } return tcp_gro_receive(head, skb); diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index d6212d6bc8d8..49c890386ce9 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -763,6 +763,8 @@ static struct sk_buff **tcp6_gro_receive(struct sk_buff **head, struct sk_buff *skb) { const struct ipv6hdr *iph = skb_gro_network_header(skb); + __wsum wsum; + __sum16 sum; switch (skb->ip_summed) { case CHECKSUM_COMPLETE: @@ -771,11 +773,23 @@ static struct sk_buff **tcp6_gro_receive(struct sk_buff **head, skb->ip_summed = CHECKSUM_UNNECESSARY; break; } - - /* fall through */ - case CHECKSUM_NONE: +flush: NAPI_GRO_CB(skb)->flush = 1; return NULL; + + case CHECKSUM_NONE: + wsum = ~csum_unfold(csum_ipv6_magic(&iph->saddr, &iph->daddr, + skb_gro_len(skb), + IPPROTO_TCP, 0)); + sum = csum_fold(skb_checksum(skb, + skb_gro_offset(skb), + skb_gro_len(skb), + wsum)); + if (sum) + goto flush; + + skb->ip_summed = CHECKSUM_UNNECESSARY; + break; } return tcp_gro_receive(head, skb); -- cgit v1.2.3 From 4c67525849e0b7f4bd4fab2487ec9e43ea52ef29 Mon Sep 17 00:00:00 2001 From: Alexey Kuznetsov Date: Fri, 12 Oct 2012 04:34:17 +0000 Subject: tcp: resets are misrouted After commit e2446eaa ("tcp_v4_send_reset: binding oif to iif in no sock case").. tcp resets are always lost, when routing is asymmetric. Yes, backing out that patch will result in misrouting of resets for dead connections which used interface binding when were alive, but we actually cannot do anything here. What's died that's died and correct handling normal unbound connections is obviously a priority. Comment to comment: > This has few benefits: > 1. tcp_v6_send_reset already did that. It was done to route resets for IPv6 link local addresses. It was a mistake to do so for global addresses. The patch fixes this as well. Actually, the problem appears to be even more serious than guaranteed loss of resets. As reported by Sergey Soloviev , those misrouted resets create a lot of arp traffic and huge amount of unresolved arp entires putting down to knees NAT firewalls which use asymmetric routing. Signed-off-by: Alexey Kuznetsov --- net/ipv4/tcp_ipv4.c | 7 ++++--- net/ipv6/tcp_ipv6.c | 3 ++- 2 files changed, 6 insertions(+), 4 deletions(-) (limited to 'net/ipv6/tcp_ipv6.c') diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 75735c9a6a9d..ef998b008a57 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -708,10 +708,11 @@ static void tcp_v4_send_reset(struct sock *sk, struct sk_buff *skb) arg.csumoffset = offsetof(struct tcphdr, check) / 2; arg.flags = (sk && inet_sk(sk)->transparent) ? IP_REPLY_ARG_NOSRCCHECK : 0; /* When socket is gone, all binding information is lost. - * routing might fail in this case. using iif for oif to - * make sure we can deliver it + * routing might fail in this case. No choice here, if we choose to force + * input interface, we will misroute in case of asymmetric route. */ - arg.bound_dev_if = sk ? sk->sk_bound_dev_if : inet_iif(skb); + if (sk) + arg.bound_dev_if = sk->sk_bound_dev_if; net = dev_net(skb_dst(skb)->dev); arg.tos = ip_hdr(skb)->tos; diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 49c890386ce9..26175bffbaa0 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -877,7 +877,8 @@ static void tcp_v6_send_response(struct sk_buff *skb, u32 seq, u32 ack, u32 win, __tcp_v6_send_check(buff, &fl6.saddr, &fl6.daddr); fl6.flowi6_proto = IPPROTO_TCP; - fl6.flowi6_oif = inet6_iif(skb); + if (ipv6_addr_type(&fl6.daddr) & IPV6_ADDR_LINKLOCAL) + fl6.flowi6_oif = inet6_iif(skb); fl6.fl6_dport = t1->dest; fl6.fl6_sport = t1->source; security_skb_classify_flow(skb, flowi6_to_flowi(&fl6)); -- cgit v1.2.3