author | James Morris <james.l.morris@oracle.com> | 2017-11-29 12:47:41 +1100
committer | James Morris <james.l.morris@oracle.com> | 2017-11-29 12:47:41 +1100
commit | cf40a76e7d5874bb25f4404eecc58a2e033af885 (patch)
tree | 8fd81cbea03c87b3d41d7ae5b1d11eadd35d6ef5 /net/ipv4/tcp.c
parent | ab5348c9c23cd253f5902980d2d8fe067dc24c82 (diff)
parent | 4fbd8d194f06c8a3fd2af1ce560ddb31f7ec8323 (diff)
Merge tag 'v4.15-rc1' into next-seccomp
Linux 4.15-rc1
Diffstat (limited to 'net/ipv4/tcp.c')
-rw-r--r-- | net/ipv4/tcp.c | 454
1 file changed, 290 insertions, 164 deletions
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 71ce33decd97..bf97317e6c97 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -269,6 +269,8 @@
 #include <linux/err.h>
 #include <linux/time.h>
 #include <linux/slab.h>
+#include <linux/errqueue.h>
+#include <linux/static_key.h>

 #include <net/icmp.h>
 #include <net/inet_common.h>
@@ -281,24 +283,22 @@
 #include <asm/ioctls.h>
 #include <net/busy_poll.h>

-int sysctl_tcp_min_tso_segs __read_mostly = 2;
-
-int sysctl_tcp_autocorking __read_mostly = 1;
+#include <trace/events/tcp.h>

 struct percpu_counter tcp_orphan_count;
 EXPORT_SYMBOL_GPL(tcp_orphan_count);

 long sysctl_tcp_mem[3] __read_mostly;
-int sysctl_tcp_wmem[3] __read_mostly;
-int sysctl_tcp_rmem[3] __read_mostly;
-
 EXPORT_SYMBOL(sysctl_tcp_mem);
-EXPORT_SYMBOL(sysctl_tcp_rmem);
-EXPORT_SYMBOL(sysctl_tcp_wmem);

 atomic_long_t tcp_memory_allocated;	/* Current allocated memory. */
 EXPORT_SYMBOL(tcp_memory_allocated);

+#if IS_ENABLED(CONFIG_SMC)
+DEFINE_STATIC_KEY_FALSE(tcp_have_smc);
+EXPORT_SYMBOL(tcp_have_smc);
+#endif
+
 /*
  * Current number of TCP sockets.
  */
@@ -388,6 +388,19 @@ static int retrans_to_secs(u8 retrans, int timeout, int rto_max)
 	return period;
 }

+static u64 tcp_compute_delivery_rate(const struct tcp_sock *tp)
+{
+	u32 rate = READ_ONCE(tp->rate_delivered);
+	u32 intv = READ_ONCE(tp->rate_interval_us);
+	u64 rate64 = 0;
+
+	if (rate && intv) {
+		rate64 = (u64)rate * tp->mss_cache * USEC_PER_SEC;
+		do_div(rate64, intv);
+	}
+	return rate64;
+}
+
 /* Address-family independent initialization for a tcp_sock.
  *
  * NOTE: A lot of things set to zero explicitly by call to
@@ -399,9 +412,10 @@ void tcp_init_sock(struct sock *sk)
 	struct tcp_sock *tp = tcp_sk(sk);

 	tp->out_of_order_queue = RB_ROOT;
+	sk->tcp_rtx_queue = RB_ROOT;
 	tcp_init_xmit_timers(sk);
-	tcp_prequeue_init(tp);
 	INIT_LIST_HEAD(&tp->tsq_node);
+	INIT_LIST_HEAD(&tp->tsorted_sent_queue);

 	icsk->icsk_rto = TCP_TIMEOUT_INIT;
 	tp->mdev_us = jiffies_to_usecs(TCP_TIMEOUT_INIT);
@@ -428,6 +442,7 @@ void tcp_init_sock(struct sock *sk)
 	tcp_assign_congestion_control(sk);

 	tp->tsoffset = 0;
+	tp->rack.reo_wnd_steps = 1;

 	sk->sk_state = TCP_CLOSE;

@@ -436,15 +451,29 @@ void tcp_init_sock(struct sock *sk)

 	icsk->icsk_sync_mss = tcp_sync_mss;

-	sk->sk_sndbuf = sysctl_tcp_wmem[1];
-	sk->sk_rcvbuf = sysctl_tcp_rmem[1];
+	sk->sk_sndbuf = sock_net(sk)->ipv4.sysctl_tcp_wmem[1];
+	sk->sk_rcvbuf = sock_net(sk)->ipv4.sysctl_tcp_rmem[1];

 	sk_sockets_allocated_inc(sk);
 }
 EXPORT_SYMBOL(tcp_init_sock);

-static void tcp_tx_timestamp(struct sock *sk, u16 tsflags, struct sk_buff *skb)
+void tcp_init_transfer(struct sock *sk, int bpf_op)
 {
+	struct inet_connection_sock *icsk = inet_csk(sk);
+
+	tcp_mtup_init(sk);
+	icsk->icsk_af_ops->rebuild_header(sk);
+	tcp_init_metrics(sk);
+	tcp_call_bpf(sk, bpf_op);
+	tcp_init_congestion_control(sk);
+	tcp_init_buffer_space(sk);
+}
+
+static void tcp_tx_timestamp(struct sock *sk, u16 tsflags)
+{
+	struct sk_buff *skb = tcp_write_queue_tail(sk);
+
 	if (tsflags && skb) {
 		struct skb_shared_info *shinfo = skb_shinfo(skb);
 		struct tcp_skb_cb *tcb = TCP_SKB_CB(skb);
@@ -662,7 +691,7 @@ static bool tcp_should_autocork(struct sock *sk, struct sk_buff *skb,
 				int size_goal)
 {
 	return skb->len < size_goal &&
-	       sysctl_tcp_autocorking &&
+	       sock_net(sk)->ipv4.sysctl_tcp_autocorking &&
 	       skb != tcp_write_queue_head(sk) &&
 	       refcount_read(&sk->sk_wmem_alloc) > skb->truesize;
 }
@@ -673,10 +702,9 @@ static void tcp_push(struct sock *sk, int flags, int mss_now,
 	struct tcp_sock *tp = tcp_sk(sk);
 	struct sk_buff *skb;

-	if (!tcp_send_head(sk))
-		return;
-
 	skb = tcp_write_queue_tail(sk);
+	if (!skb)
+		return;

 	if (!(flags & MSG_MORE) || forced_push(tp))
 		tcp_mark_push(tp, skb);
@@ -856,6 +884,7 @@ struct sk_buff *sk_stream_alloc_skb(struct sock *sk, int size, gfp_t gfp,
 			 * available to the caller, no more, no less.
 			 */
 			skb->reserved_tailroom = skb->end - skb->tail - size;
+			INIT_LIST_HEAD(&skb->tcp_tsorted_anchor);
 			return skb;
 		}
 		__kfree_skb(skb);
@@ -935,14 +964,14 @@ ssize_t do_tcp_sendpages(struct sock *sk, struct page *page, int offset,
 		int copy, i;
 		bool can_coalesce;

-		if (!tcp_send_head(sk) || (copy = size_goal - skb->len) <= 0 ||
+		if (!skb || (copy = size_goal - skb->len) <= 0 ||
 		    !tcp_skb_can_collapse_to(skb)) {
 new_segment:
 			if (!sk_stream_memory_free(sk))
 				goto wait_for_sndbuf;

 			skb = sk_stream_alloc_skb(sk, 0, sk->sk_allocation,
-					skb_queue_empty(&sk->sk_write_queue));
+					tcp_rtx_and_write_queues_empty(sk));
 			if (!skb)
 				goto wait_for_memory;

@@ -1014,7 +1043,7 @@ wait_for_memory:

 out:
 	if (copied) {
-		tcp_tx_timestamp(sk, sk->sk_tsflags, tcp_write_queue_tail(sk));
+		tcp_tx_timestamp(sk, sk->sk_tsflags);
 		if (!(flags & MSG_SENDPAGE_NOTLAST))
 			tcp_push(sk, flags, mss_now, tp->nonagle, size_goal);
 	}
@@ -1034,23 +1063,29 @@ out_err:
 }
 EXPORT_SYMBOL_GPL(do_tcp_sendpages);

-int tcp_sendpage(struct sock *sk, struct page *page, int offset,
-		 size_t size, int flags)
+int tcp_sendpage_locked(struct sock *sk, struct page *page, int offset,
+			size_t size, int flags)
 {
-	ssize_t res;
-
 	if (!(sk->sk_route_caps & NETIF_F_SG) || !sk_check_csum_caps(sk))
-		return sock_no_sendpage(sk->sk_socket, page, offset, size,
-					flags);
-
-	lock_sock(sk);
+		return sock_no_sendpage_locked(sk, page, offset, size, flags);

 	tcp_rate_check_app_limited(sk);	/* is sending application-limited? */

-	res = do_tcp_sendpages(sk, page, offset, size, flags);
+	return do_tcp_sendpages(sk, page, offset, size, flags);
+}
+EXPORT_SYMBOL_GPL(tcp_sendpage_locked);
+
+int tcp_sendpage(struct sock *sk, struct page *page, int offset,
+		 size_t size, int flags)
+{
+	int ret;
+
+	lock_sock(sk);
+	ret = tcp_sendpage_locked(sk, page, offset, size, flags);
 	release_sock(sk);
-	return res;
+
+	return ret;
 }
 EXPORT_SYMBOL(tcp_sendpage);
@@ -1107,7 +1142,7 @@ static int tcp_sendmsg_fastopen(struct sock *sk, struct msghdr *msg,
 	struct sockaddr *uaddr = msg->msg_name;
 	int err, flags;

-	if (!(sysctl_tcp_fastopen & TFO_CLIENT_ENABLE) ||
+	if (!(sock_net(sk)->ipv4.sysctl_tcp_fastopen & TFO_CLIENT_ENABLE) ||
 	    (uaddr && msg->msg_namelen >= sizeof(uaddr->sa_family) &&
 	     uaddr->sa_family == AF_UNSPEC))
 		return -EOPNOTSUPP;
@@ -1144,9 +1179,10 @@ static int tcp_sendmsg_fastopen(struct sock *sk, struct msghdr *msg,
 	return err;
 }

-int tcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t size)
+int tcp_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t size)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
+	struct ubuf_info *uarg = NULL;
 	struct sk_buff *skb;
 	struct sockcm_cookie sockc;
 	int flags, err, copied = 0;
@@ -1155,9 +1191,25 @@ int tcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t size)
 	bool sg;
 	long timeo;

-	lock_sock(sk);
-
 	flags = msg->msg_flags;
+
+	if (flags & MSG_ZEROCOPY && size) {
+		if (sk->sk_state != TCP_ESTABLISHED) {
+			err = -EINVAL;
+			goto out_err;
+		}
+
+		skb = tcp_write_queue_tail(sk);
+		uarg = sock_zerocopy_realloc(sk, size, skb_zcopy(skb));
+		if (!uarg) {
+			err = -ENOBUFS;
+			goto out_err;
+		}
+
+		if (!(sk_check_csum_caps(sk) && sk->sk_route_caps & NETIF_F_SG))
+			uarg->zerocopy = 0;
+	}
+
 	if (unlikely(flags & MSG_FASTOPEN || inet_sk(sk)->defer_connect)) {
 		err = tcp_sendmsg_fastopen(sk, msg, &copied_syn, size);
 		if (err == -EINPROGRESS && copied_syn > 0)
@@ -1223,7 +1275,7 @@ restart:
 		int max = size_goal;

 		skb = tcp_write_queue_tail(sk);
-		if (tcp_send_head(sk)) {
+		if (skb) {
 			if (skb->ip_summed == CHECKSUM_NONE)
 				max = mss_now;
 			copy = max - skb->len;
@@ -1243,7 +1295,7 @@ new_segment:
 				process_backlog = false;
 				goto restart;
 			}
-			first_skb = skb_queue_empty(&sk->sk_write_queue);
+			first_skb = tcp_rtx_and_write_queues_empty(sk);
 			skb = sk_stream_alloc_skb(sk,
 						  select_size(sk, sg, first_skb),
 						  sk->sk_allocation,
@@ -1281,7 +1333,7 @@ new_segment:
 			err = skb_add_data_nocache(sk, skb, &msg->msg_iter, copy);
 			if (err)
 				goto do_fault;
-		} else {
+		} else if (!uarg || !uarg->zerocopy) {
 			bool merge = true;
 			int i = skb_shinfo(skb)->nr_frags;
 			struct page_frag *pfrag = sk_page_frag(sk);
@@ -1319,6 +1371,13 @@ new_segment:
 				page_ref_inc(pfrag->page);
 			}
 			pfrag->offset += copy;
+		} else {
+			err = skb_zerocopy_iter_stream(sk, skb, msg, copy, uarg);
+			if (err == -EMSGSIZE || err == -EEXIST)
+				goto new_segment;
+			if (err < 0)
+				goto do_error;
+			copy = err;
 		}

 		if (!copied)
@@ -1361,11 +1420,11 @@ wait_for_memory:

 out:
 	if (copied) {
-		tcp_tx_timestamp(sk, sockc.tsflags, tcp_write_queue_tail(sk));
+		tcp_tx_timestamp(sk, sockc.tsflags);
 		tcp_push(sk, flags, mss_now, tp->nonagle, size_goal);
 	}
 out_nopush:
-	release_sock(sk);
+	sock_zerocopy_put(uarg);
 	return copied + copied_syn;

 do_fault:
@@ -1382,6 +1441,7 @@ do_error:
 	if (copied + copied_syn)
 		goto out;
 out_err:
+	sock_zerocopy_put_abort(uarg);
 	err = sk_stream_error(sk, flags, err);
 	/* make sure we wake any epoll edge trigger waiter */
 	if (unlikely(skb_queue_len(&sk->sk_write_queue) == 0 &&
@@ -1389,9 +1449,20 @@ out_err:
 		sk->sk_write_space(sk);
 		tcp_chrono_stop(sk, TCP_CHRONO_SNDBUF_LIMITED);
 	}
-	release_sock(sk);
 	return err;
 }
+EXPORT_SYMBOL_GPL(tcp_sendmsg_locked);
+
+int tcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t size)
+{
+	int ret;
+
+	lock_sock(sk);
+	ret = tcp_sendmsg_locked(sk, msg, size);
+	release_sock(sk);
+
+	return ret;
+}
 EXPORT_SYMBOL(tcp_sendmsg);

 /*
@@ -1450,6 +1521,13 @@ static int tcp_peek_sndq(struct sock *sk, struct msghdr *msg, int len)

 	/* XXX -- need to support SO_PEEK_OFF */

+	skb_rbtree_walk(skb, &sk->tcp_rtx_queue) {
+		err = skb_copy_datagram_msg(skb, 0, msg, skb->len);
+		if (err)
+			return err;
+		copied += skb->len;
+	}
+
 	skb_queue_walk(&sk->sk_write_queue, skb) {
 		err = skb_copy_datagram_msg(skb, 0, msg, skb->len);
 		if (err)
@@ -1525,20 +1603,6 @@ static void tcp_cleanup_rbuf(struct sock *sk, int copied)
 		tcp_send_ack(sk);
 }

-static void tcp_prequeue_process(struct sock *sk)
-{
-	struct sk_buff *skb;
-	struct tcp_sock *tp = tcp_sk(sk);
-
-	NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPPREQUEUED);
-
-	while ((skb = __skb_dequeue(&tp->ucopy.prequeue)) != NULL)
-		sk_backlog_rcv(sk, skb);
-
-	/* Clear memory counter. */
-	tp->ucopy.memory = 0;
-}
-
 static struct sk_buff *tcp_recv_skb(struct sock *sk, u32 seq, u32 *off)
 {
 	struct sk_buff *skb;
@@ -1652,6 +1716,61 @@ int tcp_peek_len(struct socket *sock)
 }
 EXPORT_SYMBOL(tcp_peek_len);

+static void tcp_update_recv_tstamps(struct sk_buff *skb,
+				    struct scm_timestamping *tss)
+{
+	if (skb->tstamp)
+		tss->ts[0] = ktime_to_timespec(skb->tstamp);
+	else
+		tss->ts[0] = (struct timespec) {0};
+
+	if (skb_hwtstamps(skb)->hwtstamp)
+		tss->ts[2] = ktime_to_timespec(skb_hwtstamps(skb)->hwtstamp);
+	else
+		tss->ts[2] = (struct timespec) {0};
+}
+
+/* Similar to __sock_recv_timestamp, but does not require an skb */
+void tcp_recv_timestamp(struct msghdr *msg, const struct sock *sk,
+			struct scm_timestamping *tss)
+{
+	struct timeval tv;
+	bool has_timestamping = false;
+
+	if (tss->ts[0].tv_sec || tss->ts[0].tv_nsec) {
+		if (sock_flag(sk, SOCK_RCVTSTAMP)) {
+			if (sock_flag(sk, SOCK_RCVTSTAMPNS)) {
+				put_cmsg(msg, SOL_SOCKET, SCM_TIMESTAMPNS,
+					 sizeof(tss->ts[0]), &tss->ts[0]);
+			} else {
+				tv.tv_sec = tss->ts[0].tv_sec;
+				tv.tv_usec = tss->ts[0].tv_nsec / 1000;
+
+				put_cmsg(msg, SOL_SOCKET, SCM_TIMESTAMP,
+					 sizeof(tv), &tv);
+			}
+		}
+
+		if (sk->sk_tsflags & SOF_TIMESTAMPING_SOFTWARE)
+			has_timestamping = true;
+		else
+			tss->ts[0] = (struct timespec) {0};
+	}
+
+	if (tss->ts[2].tv_sec || tss->ts[2].tv_nsec) {
+		if (sk->sk_tsflags & SOF_TIMESTAMPING_RAW_HARDWARE)
+			has_timestamping = true;
+		else
+			tss->ts[2] = (struct timespec) {0};
+	}
+
+	if (has_timestamping) {
+		tss->ts[1] = (struct timespec) {0};
+		put_cmsg(msg, SOL_SOCKET, SCM_TIMESTAMPING,
+			 sizeof(*tss), tss);
+	}
+}
+
 /*
  *	This routine copies from a sock struct into the user buffer.
  *
@@ -1671,9 +1790,10 @@ int tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock,
 	int err;
 	int target;		/* Read at least this many bytes */
 	long timeo;
-	struct task_struct *user_recv = NULL;
 	struct sk_buff *skb, *last;
 	u32 urg_hole = 0;
+	struct scm_timestamping tss;
+	bool has_tss = false;

 	if (unlikely(flags & MSG_ERRQUEUE))
 		return inet_recv_error(sk, msg, len, addr_len);
@@ -1806,51 +1926,6 @@ int tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock,

 		tcp_cleanup_rbuf(sk, copied);

-		if (!sysctl_tcp_low_latency && tp->ucopy.task == user_recv) {
-			/* Install new reader */
-			if (!user_recv && !(flags & (MSG_TRUNC | MSG_PEEK))) {
-				user_recv = current;
-				tp->ucopy.task = user_recv;
-				tp->ucopy.msg = msg;
-			}
-
-			tp->ucopy.len = len;
-
-			WARN_ON(tp->copied_seq != tp->rcv_nxt &&
-				!(flags & (MSG_PEEK | MSG_TRUNC)));
-
-			/* Ugly... If prequeue is not empty, we have to
-			 * process it before releasing socket, otherwise
-			 * order will be broken at second iteration.
-			 * More elegant solution is required!!!
-			 *
-			 * Look: we have the following (pseudo)queues:
-			 *
-			 * 1. packets in flight
-			 * 2. backlog
-			 * 3. prequeue
-			 * 4. receive_queue
-			 *
-			 * Each queue can be processed only if the next ones
-			 * are empty. At this point we have empty receive_queue.
-			 * But prequeue _can_ be not empty after 2nd iteration,
-			 * when we jumped to start of loop because backlog
-			 * processing added something to receive_queue.
-			 * We cannot release_sock(), because backlog contains
-			 * packets arrived _after_ prequeued ones.
-			 *
-			 * Shortly, algorithm is clear --- to process all
-			 * the queues in order. We could make it more directly,
-			 * requeueing packets from backlog to prequeue, if
-			 * is not empty. It is more elegant, but eats cycles,
-			 * unfortunately.
-			 */
-			if (!skb_queue_empty(&tp->ucopy.prequeue))
-				goto do_prequeue;
-
-			/* __ Set realtime policy in scheduler __ */
-		}
-
 		if (copied >= target) {
 			/* Do not sleep, just process backlog. */
 			release_sock(sk);
@@ -1859,31 +1934,6 @@ int tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock,
 			sk_wait_data(sk, &timeo, last);
 		}

-		if (user_recv) {
-			int chunk;
-
-			/* __ Restore normal policy in scheduler __ */
-
-			chunk = len - tp->ucopy.len;
-			if (chunk != 0) {
-				NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPDIRECTCOPYFROMBACKLOG, chunk);
-				len -= chunk;
-				copied += chunk;
-			}
-
-			if (tp->rcv_nxt == tp->copied_seq &&
-			    !skb_queue_empty(&tp->ucopy.prequeue)) {
-do_prequeue:
-				tcp_prequeue_process(sk);
-
-				chunk = len - tp->ucopy.len;
-				if (chunk != 0) {
-					NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPDIRECTCOPYFROMPREQUEUE, chunk);
-					len -= chunk;
-					copied += chunk;
-				}
-			}
-		}
 		if ((flags & MSG_PEEK) &&
 		    (peek_seq - copied - urg_hole != tp->copied_seq)) {
 			net_dbg_ratelimited("TCP(%s:%d): Application bug, race in MSG_PEEK\n",
@@ -1941,6 +1991,10 @@ skip_copy:
 		if (used + offset < skb->len)
 			continue;

+		if (TCP_SKB_CB(skb)->has_rxtstamp) {
+			tcp_update_recv_tstamps(skb, &tss);
+			has_tss = true;
+		}
 		if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)
 			goto found_fin_ok;
 		if (!(flags & MSG_PEEK))
@@ -1955,29 +2009,13 @@ skip_copy:
 			break;
 	} while (len > 0);

-	if (user_recv) {
-		if (!skb_queue_empty(&tp->ucopy.prequeue)) {
-			int chunk;
-
-			tp->ucopy.len = copied > 0 ? len : 0;
-
-			tcp_prequeue_process(sk);
-
-			if (copied > 0 && (chunk = len - tp->ucopy.len) != 0) {
-				NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPDIRECTCOPYFROMPREQUEUE, chunk);
-				len -= chunk;
-				copied += chunk;
-			}
-		}
-
-		tp->ucopy.task = NULL;
-		tp->ucopy.len = 0;
-	}
-
 	/* According to UNIX98, msg_name/msg_namelen are ignored
 	 * on connected socket. I was just happy when found this 8) --ANK
 	 */

+	if (has_tss)
+		tcp_recv_timestamp(msg, sk, &tss);
+
 	/* Clean up data we have read: This will do ACK frames. */
 	tcp_cleanup_rbuf(sk, copied);

@@ -2002,6 +2040,8 @@ void tcp_set_state(struct sock *sk, int state)
 {
 	int oldstate = sk->sk_state;

+	trace_tcp_set_state(sk, oldstate, state);
+
 	switch (state) {
 	case TCP_ESTABLISHED:
 		if (oldstate != TCP_ESTABLISHED)
@@ -2289,6 +2329,37 @@ static inline bool tcp_need_reset(int state)
 	       TCPF_FIN_WAIT2 | TCPF_SYN_RECV);
 }

+static void tcp_rtx_queue_purge(struct sock *sk)
+{
+	struct rb_node *p = rb_first(&sk->tcp_rtx_queue);
+
+	while (p) {
+		struct sk_buff *skb = rb_to_skb(p);
+
+		p = rb_next(p);
+		/* Since we are deleting whole queue, no need to
+		 * list_del(&skb->tcp_tsorted_anchor)
+		 */
+		tcp_rtx_queue_unlink(skb, sk);
+		sk_wmem_free_skb(sk, skb);
+	}
+}
+
+void tcp_write_queue_purge(struct sock *sk)
+{
+	struct sk_buff *skb;
+
+	tcp_chrono_stop(sk, TCP_CHRONO_BUSY);
+	while ((skb = __skb_dequeue(&sk->sk_write_queue)) != NULL) {
+		tcp_skb_tsorted_anchor_cleanup(skb);
+		sk_wmem_free_skb(sk, skb);
+	}
+	tcp_rtx_queue_purge(sk);
+	INIT_LIST_HEAD(&tcp_sk(sk)->tsorted_sent_queue);
+	sk_mem_reclaim(sk);
+	tcp_clear_all_retrans_hints(tcp_sk(sk));
+}
+
 int tcp_disconnect(struct sock *sk, int flags)
 {
 	struct inet_sock *inet = inet_sk(sk);
@@ -2347,7 +2418,6 @@ int tcp_disconnect(struct sock *sk, int flags)
 	 * issue in __tcp_select_window()
 	 */
 	icsk->icsk_ack.rcv_mss = TCP_MIN_MSS;
-	tcp_init_send_head(sk);
 	memset(&tp->rx_opt, 0, sizeof(tp->rx_opt));
 	__sk_dst_reset(sk);
 	dst_release(sk->sk_rx_dst);
@@ -2439,8 +2509,6 @@ static int tcp_repair_options_est(struct sock *sk,
 				return -EINVAL;

 			tp->rx_opt.sack_ok |= TCP_SACK_SEEN;
-			if (sysctl_tcp_fack)
-				tcp_enable_fack(tp);
 			break;
 		case TCPOPT_TIMESTAMP:
 			if (opt.opt_val != 0)
@@ -2481,7 +2549,7 @@ static int do_tcp_setsockopt(struct sock *sk, int level,
 		name[val] = 0;

 		lock_sock(sk);
-		err = tcp_set_congestion_control(sk, name, true);
+		err = tcp_set_congestion_control(sk, name, true, true);
 		release_sock(sk);
 		return err;
 	}
@@ -2503,6 +2571,17 @@ static int do_tcp_setsockopt(struct sock *sk, int level,
 		release_sock(sk);
 		return err;
 	}
+	case TCP_FASTOPEN_KEY: {
+		__u8 key[TCP_FASTOPEN_KEY_LENGTH];
+
+		if (optlen != sizeof(key))
+			return -EINVAL;
+
+		if (copy_from_user(key, optval, optlen))
+			return -EFAULT;
+
+		return tcp_fastopen_reset_cipher(net, sk, key, sizeof(key));
+	}
 	default:
 		/* fallthru */
 		break;
@@ -2734,7 +2813,7 @@ static int do_tcp_setsockopt(struct sock *sk, int level,
 	case TCP_FASTOPEN:
 		if (val >= 0 && ((1 << sk->sk_state) & (TCPF_CLOSE |
 		    TCPF_LISTEN))) {
-			tcp_fastopen_init_key_once(true);
+			tcp_fastopen_init_key_once(net);

 			fastopen_queue_tune(sk, val);
 		} else {
@@ -2744,7 +2823,7 @@ static int do_tcp_setsockopt(struct sock *sk, int level,
 	case TCP_FASTOPEN_CONNECT:
 		if (val > 1 || val < 0) {
 			err = -EINVAL;
-		} else if (sysctl_tcp_fastopen & TFO_CLIENT_ENABLE) {
+		} else if (net->ipv4.sysctl_tcp_fastopen & TFO_CLIENT_ENABLE) {
 			if (sk->sk_state == TCP_CLOSE)
 				tp->fastopen_connect = val;
 			else
@@ -2753,6 +2832,14 @@ static int do_tcp_setsockopt(struct sock *sk, int level,
 			err = -EOPNOTSUPP;
 		}
 		break;
+	case TCP_FASTOPEN_NO_COOKIE:
+		if (val > 1 || val < 0)
+			err = -EINVAL;
+		else if (!((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN)))
+			err = -EINVAL;
+		else
+			tp->fastopen_no_cookie = val;
+		break;
 	case TCP_TIMESTAMP:
 		if (!tp->repair)
 			err = -EPERM;
@@ -2823,7 +2910,7 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info)
 {
 	const struct tcp_sock *tp = tcp_sk(sk); /* iff sk_type == SOCK_STREAM */
 	const struct inet_connection_sock *icsk = inet_csk(sk);
-	u32 now, intv;
+	u32 now;
 	u64 rate64;
 	bool slow;
 	u32 rate;
@@ -2890,7 +2977,6 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info)
 	info->tcpi_lost = tp->lost_out;
 	info->tcpi_retrans = tp->retrans_out;
-	info->tcpi_fackets = tp->fackets_out;

 	now = tcp_jiffies32;
 	info->tcpi_last_data_sent = jiffies_to_msecs(now - tp->lsndtime);
@@ -2922,13 +3008,9 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info)
 	info->tcpi_data_segs_out = tp->data_segs_out;

 	info->tcpi_delivery_rate_app_limited = tp->rate_app_limited ? 1 : 0;
-	rate = READ_ONCE(tp->rate_delivered);
-	intv = READ_ONCE(tp->rate_interval_us);
-	if (rate && intv) {
-		rate64 = (u64)rate * tp->mss_cache * USEC_PER_SEC;
-		do_div(rate64, intv);
+	rate64 = tcp_compute_delivery_rate(tp);
+	if (rate64)
 		info->tcpi_delivery_rate = rate64;
-	}

 	unlock_sock_fast(sk, slow);
 }
 EXPORT_SYMBOL_GPL(tcp_get_info);
@@ -2938,8 +3020,12 @@ struct sk_buff *tcp_get_timestamping_opt_stats(const struct sock *sk)
 	const struct tcp_sock *tp = tcp_sk(sk);
 	struct sk_buff *stats;
 	struct tcp_info info;
+	u64 rate64;
+	u32 rate;

-	stats = alloc_skb(5 * nla_total_size_64bit(sizeof(u64)), GFP_ATOMIC);
+	stats = alloc_skb(7 * nla_total_size_64bit(sizeof(u64)) +
+			  3 * nla_total_size(sizeof(u32)) +
+			  2 * nla_total_size(sizeof(u8)), GFP_ATOMIC);
 	if (!stats)
 		return NULL;

@@ -2954,6 +3040,20 @@ struct sk_buff *tcp_get_timestamping_opt_stats(const struct sock *sk)
 			  tp->data_segs_out, TCP_NLA_PAD);
 	nla_put_u64_64bit(stats, TCP_NLA_TOTAL_RETRANS,
 			  tp->total_retrans, TCP_NLA_PAD);
+
+	rate = READ_ONCE(sk->sk_pacing_rate);
+	rate64 = rate != ~0U ? rate : ~0ULL;
+	nla_put_u64_64bit(stats, TCP_NLA_PACING_RATE, rate64, TCP_NLA_PAD);
+
+	rate64 = tcp_compute_delivery_rate(tp);
+	nla_put_u64_64bit(stats, TCP_NLA_DELIVERY_RATE, rate64, TCP_NLA_PAD);
+
+	nla_put_u32(stats, TCP_NLA_SND_CWND, tp->snd_cwnd);
+	nla_put_u32(stats, TCP_NLA_REORDERING, tp->reordering);
+	nla_put_u32(stats, TCP_NLA_MIN_RTT, tcp_min_rtt(tp));
+
+	nla_put_u8(stats, TCP_NLA_RECUR_RETRANS, inet_csk(sk)->icsk_retransmits);
+	nla_put_u8(stats, TCP_NLA_DELIVERY_RATE_APP_LMT, !!tp->rate_app_limited);

 	return stats;
 }
@@ -3075,6 +3175,28 @@ static int do_tcp_getsockopt(struct sock *sk, int level,
 			return -EFAULT;
 		return 0;

+	case TCP_FASTOPEN_KEY: {
+		__u8 key[TCP_FASTOPEN_KEY_LENGTH];
+		struct tcp_fastopen_context *ctx;
+
+		if (get_user(len, optlen))
+			return -EFAULT;
+
+		rcu_read_lock();
+		ctx = rcu_dereference(icsk->icsk_accept_queue.fastopenq.ctx);
+		if (ctx)
+			memcpy(key, ctx->key, sizeof(key));
+		else
+			len = 0;
+		rcu_read_unlock();
+
+		len = min_t(unsigned int, len, sizeof(key));
+		if (put_user(len, optlen))
+			return -EFAULT;
+		if (copy_to_user(optval, key, len))
+			return -EFAULT;
+		return 0;
+	}
 	case TCP_THIN_LINEAR_TIMEOUTS:
 		val = tp->thin_lto;
 		break;
@@ -3137,6 +3259,10 @@ static int do_tcp_getsockopt(struct sock *sk, int level,
 		val = tp->fastopen_connect;
 		break;

+	case TCP_FASTOPEN_NO_COOKIE:
+		val = tp->fastopen_no_cookie;
+		break;
+
 	case TCP_TIMESTAMP:
 		val = tcp_time_stamp_raw() + tp->tsoffset;
 		break;
@@ -3502,13 +3628,13 @@ void __init tcp_init(void)
 	max_wshare = min(4UL*1024*1024, limit);
 	max_rshare = min(6UL*1024*1024, limit);

-	sysctl_tcp_wmem[0] = SK_MEM_QUANTUM;
-	sysctl_tcp_wmem[1] = 16*1024;
-	sysctl_tcp_wmem[2] = max(64*1024, max_wshare);
+	init_net.ipv4.sysctl_tcp_wmem[0] = SK_MEM_QUANTUM;
+	init_net.ipv4.sysctl_tcp_wmem[1] = 16*1024;
+	init_net.ipv4.sysctl_tcp_wmem[2] = max(64*1024, max_wshare);

-	sysctl_tcp_rmem[0] = SK_MEM_QUANTUM;
-	sysctl_tcp_rmem[1] = 87380;
-	sysctl_tcp_rmem[2] = max(87380, max_rshare);
+	init_net.ipv4.sysctl_tcp_rmem[0] = SK_MEM_QUANTUM;
+	init_net.ipv4.sysctl_tcp_rmem[1] = 87380;
+	init_net.ipv4.sysctl_tcp_rmem[2] = max(87380, max_rshare);

 	pr_info("Hash tables configured (established %u bind %u)\n",
 		tcp_hashinfo.ehash_mask + 1, tcp_hashinfo.bhash_size);