diff options
Diffstat (limited to 'net/ipv4/tcp.c')
-rw-r--r-- | net/ipv4/tcp.c | 95 |
1 files changed, 77 insertions, 18 deletions
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 15d47d5e7951..6f0caf9a866d 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -1734,7 +1734,7 @@ int tcp_mmap(struct file *file, struct socket *sock, return -EPERM; vma->vm_flags &= ~(VM_MAYWRITE | VM_MAYEXEC); - /* Instruct vm_insert_page() to not down_read(mmap_sem) */ + /* Instruct vm_insert_page() to not mmap_read_lock(mm) */ vma->vm_flags |= VM_MIXEDMAP; vma->vm_ops = &tcp_vm_ops; @@ -1742,14 +1742,48 @@ int tcp_mmap(struct file *file, struct socket *sock, } EXPORT_SYMBOL(tcp_mmap); +static int tcp_zerocopy_vm_insert_batch(struct vm_area_struct *vma, + struct page **pages, + unsigned long pages_to_map, + unsigned long *insert_addr, + u32 *length_with_pending, + u32 *seq, + struct tcp_zerocopy_receive *zc) +{ + unsigned long pages_remaining = pages_to_map; + int bytes_mapped; + int ret; + + ret = vm_insert_pages(vma, *insert_addr, pages, &pages_remaining); + bytes_mapped = PAGE_SIZE * (pages_to_map - pages_remaining); + /* Even if vm_insert_pages fails, it may have partially succeeded in + * mapping (some but not all of the pages). + */ + *seq += bytes_mapped; + *insert_addr += bytes_mapped; + if (ret) { + /* But if vm_insert_pages did fail, we have to unroll some state + * we speculatively touched before. + */ + const int bytes_not_mapped = PAGE_SIZE * pages_remaining; + *length_with_pending -= bytes_not_mapped; + zc->recv_skip_hint += bytes_not_mapped; + } + return ret; +} + static int tcp_zerocopy_receive(struct sock *sk, struct tcp_zerocopy_receive *zc) { unsigned long address = (unsigned long)zc->address; u32 length = 0, seq, offset, zap_len; + #define PAGE_BATCH_SIZE 8 + struct page *pages[PAGE_BATCH_SIZE]; const skb_frag_t *frags = NULL; struct vm_area_struct *vma; struct sk_buff *skb = NULL; + unsigned long pg_idx = 0; + unsigned long curr_addr; struct tcp_sock *tp; int inq; int ret; @@ -1762,16 +1796,17 @@ static int tcp_zerocopy_receive(struct sock *sk, sock_rps_record_flow(sk); - down_read(¤t->mm->mmap_sem); + tp = tcp_sk(sk); + + mmap_read_lock(current->mm); vma = find_vma(current->mm, address); if (!vma || vma->vm_start > address || vma->vm_ops != &tcp_vm_ops) { - up_read(¤t->mm->mmap_sem); + mmap_read_unlock(current->mm); return -EINVAL; } zc->length = min_t(unsigned long, zc->length, vma->vm_end - address); - tp = tcp_sk(sk); seq = tp->copied_seq; inq = tcp_inq(sk); zc->length = min_t(u32, zc->length, inq); @@ -1783,8 +1818,20 @@ static int tcp_zerocopy_receive(struct sock *sk, zc->recv_skip_hint = zc->length; } ret = 0; + curr_addr = address; while (length + PAGE_SIZE <= zc->length) { if (zc->recv_skip_hint < PAGE_SIZE) { + /* If we're here, finish the current batch. */ + if (pg_idx) { + ret = tcp_zerocopy_vm_insert_batch(vma, pages, + pg_idx, + &curr_addr, + &length, + &seq, zc); + if (ret) + goto out; + pg_idx = 0; + } if (skb) { if (zc->recv_skip_hint > 0) break; @@ -1793,7 +1840,6 @@ static int tcp_zerocopy_receive(struct sock *sk, } else { skb = tcp_recv_skb(sk, seq, &offset); } - zc->recv_skip_hint = skb->len - offset; offset -= skb_headlen(skb); if ((int)offset < 0 || skb_has_frag_list(skb)) @@ -1817,17 +1863,27 @@ static int tcp_zerocopy_receive(struct sock *sk, zc->recv_skip_hint -= remaining; break; } - ret = vm_insert_page(vma, address + length, - skb_frag_page(frags)); - if (ret) - break; + pages[pg_idx] = skb_frag_page(frags); + pg_idx++; length += PAGE_SIZE; - seq += PAGE_SIZE; zc->recv_skip_hint -= PAGE_SIZE; frags++; + if (pg_idx == PAGE_BATCH_SIZE) { + ret = tcp_zerocopy_vm_insert_batch(vma, pages, pg_idx, + &curr_addr, &length, + &seq, zc); + if (ret) + goto out; + pg_idx = 0; + } + } + if (pg_idx) { + ret = tcp_zerocopy_vm_insert_batch(vma, pages, pg_idx, + &curr_addr, &length, &seq, + zc); } out: - up_read(¤t->mm->mmap_sem); + mmap_read_unlock(current->mm); if (length) { WRITE_ONCE(tp->copied_seq, seq); tcp_rcv_space_adjust(sk); @@ -2635,6 +2691,9 @@ int tcp_disconnect(struct sock *sk, int flags) tp->window_clamp = 0; tp->delivered = 0; tp->delivered_ce = 0; + if (icsk->icsk_ca_ops->release) + icsk->icsk_ca_ops->release(sk); + memset(icsk->icsk_ca_priv, 0, sizeof(icsk->icsk_ca_priv)); tcp_set_ca_state(sk, TCP_CA_Open); tp->is_sack_reneg = 0; tcp_clear_retrans(tp); @@ -3190,10 +3249,7 @@ static int do_tcp_setsockopt(struct sock *sk, int level, #ifdef CONFIG_TCP_MD5SIG case TCP_MD5SIG: case TCP_MD5SIG_EXT: - if ((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN)) - err = tp->af_specific->md5_parse(sk, optname, optval, optlen); - else - err = -EINVAL; + err = tp->af_specific->md5_parse(sk, optname, optval, optlen); break; #endif case TCP_USER_TIMEOUT: @@ -3977,11 +4033,14 @@ EXPORT_SYMBOL(tcp_md5_hash_skb_data); int tcp_md5_hash_key(struct tcp_md5sig_pool *hp, const struct tcp_md5sig_key *key) { + u8 keylen = READ_ONCE(key->keylen); /* paired with WRITE_ONCE() in tcp_md5_do_add */ struct scatterlist sg; - sg_init_one(&sg, key->key, key->keylen); - ahash_request_set_crypt(hp->md5_req, &sg, NULL, key->keylen); - return crypto_ahash_update(hp->md5_req); + sg_init_one(&sg, key->key, keylen); + ahash_request_set_crypt(hp->md5_req, &sg, NULL, keylen); + + /* We use data_race() because tcp_md5_do_add() might change key->key under us */ + return data_race(crypto_ahash_update(hp->md5_req)); } EXPORT_SYMBOL(tcp_md5_hash_key); |