From 7eced5ab5a7366ee7ca5360b3eca9d220c2b2887 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Sun, 3 Jun 2018 12:06:57 +0200 Subject: netfilter: nf_tables: add NFT_LOGLEVEL_* enumeration and use it This is internal, not exposed through uapi, and although it maps with userspace LOG_*, with the introduction of LOGLEVEL_AUDIT we are incurring in namespace pollution. This patch adds the NFT_LOGLEVEL_ enumeration and use it from nft_log. Fixes: 1a893b44de45 ("netfilter: nf_tables: Add audit support to log statement") Signed-off-by: Pablo Neira Ayuso Acked-by: Phil Sutter Signed-off-by: David S. Miller --- net/netfilter/nft_log.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/netfilter/nft_log.c b/net/netfilter/nft_log.c index 7eef1cffbf1b..655187bed5d8 100644 --- a/net/netfilter/nft_log.c +++ b/net/netfilter/nft_log.c @@ -111,7 +111,7 @@ static void nft_log_eval(const struct nft_expr *expr, const struct nft_log *priv = nft_expr_priv(expr); if (priv->loginfo.type == NF_LOG_TYPE_LOG && - priv->loginfo.u.log.level == LOGLEVEL_AUDIT) { + priv->loginfo.u.log.level == NFT_LOGLEVEL_AUDIT) { nft_log_eval_audit(pkt); return; } @@ -166,9 +166,9 @@ static int nft_log_init(const struct nft_ctx *ctx, li->u.log.level = ntohl(nla_get_be32(tb[NFTA_LOG_LEVEL])); } else { - li->u.log.level = LOGLEVEL_WARNING; + li->u.log.level = NFT_LOGLEVEL_WARNING; } - if (li->u.log.level > LOGLEVEL_AUDIT) { + if (li->u.log.level > NFT_LOGLEVEL_AUDIT) { err = -EINVAL; goto err1; } @@ -196,7 +196,7 @@ static int nft_log_init(const struct nft_ctx *ctx, break; } - if (li->u.log.level == LOGLEVEL_AUDIT) + if (li->u.log.level == NFT_LOGLEVEL_AUDIT) return 0; err = nf_logger_find_get(ctx->family, li->type); @@ -220,7 +220,7 @@ static void nft_log_destroy(const struct nft_ctx *ctx, if (priv->prefix != nft_log_null_prefix) kfree(priv->prefix); - if (li->u.log.level == LOGLEVEL_AUDIT) + if (li->u.log.level == NFT_LOGLEVEL_AUDIT) return; nf_logger_put(ctx->family, li->type); -- cgit v1.2.3 From fd3a88625844907151737fc3b4201676effa6d27 Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Wed, 6 Jun 2018 11:23:01 -0400 Subject: net: in virtio_net_hdr only add VLAN_HLEN to csum_start if payload holds vlan Tun, tap, virtio, packet and uml vector all use struct virtio_net_hdr to communicate packet metadata to userspace. For skbuffs with vlan, the first two return the packet as it may have existed on the wire, inserting the VLAN tag in the user buffer. Then virtio_net_hdr.csum_start needs to be adjusted by VLAN_HLEN bytes. Commit f09e2249c4f5 ("macvtap: restore vlan header on user read") added this feature to macvtap. Commit 3ce9b20f1971 ("macvtap: Fix csum_start when VLAN tags are present") then fixed up csum_start. Virtio, packet and uml do not insert the vlan header in the user buffer. When introducing virtio_net_hdr_from_skb to deduplicate filling in the virtio_net_hdr, the variant from macvtap which adds VLAN_HLEN was applied uniformly, breaking csum offset for packets with vlan on virtio and packet. Make insertion of VLAN_HLEN optional. Convert the callers to pass it when needed. Fixes: e858fae2b0b8f4 ("virtio_net: use common code for virtio_net_hdr and skb GSO conversion") Fixes: 1276f24eeef2 ("packet: use common code for virtio_net_hdr and skb GSO conversion") Signed-off-by: Willem de Bruijn Signed-off-by: David S. Miller --- arch/um/drivers/vector_transports.c | 3 ++- drivers/net/tap.c | 5 ++++- drivers/net/tun.c | 3 ++- drivers/net/virtio_net.c | 3 ++- include/linux/virtio_net.h | 11 ++++------- net/packet/af_packet.c | 4 ++-- 6 files changed, 16 insertions(+), 13 deletions(-) (limited to 'net') diff --git a/arch/um/drivers/vector_transports.c b/arch/um/drivers/vector_transports.c index 9065047f844b..77e4ebc206ae 100644 --- a/arch/um/drivers/vector_transports.c +++ b/arch/um/drivers/vector_transports.c @@ -120,7 +120,8 @@ static int raw_form_header(uint8_t *header, skb, vheader, virtio_legacy_is_little_endian(), - false + false, + 0 ); return 0; diff --git a/drivers/net/tap.c b/drivers/net/tap.c index 9b6cb780affe..f0f7cd977667 100644 --- a/drivers/net/tap.c +++ b/drivers/net/tap.c @@ -774,13 +774,16 @@ static ssize_t tap_put_user(struct tap_queue *q, int total; if (q->flags & IFF_VNET_HDR) { + int vlan_hlen = skb_vlan_tag_present(skb) ? VLAN_HLEN : 0; struct virtio_net_hdr vnet_hdr; + vnet_hdr_len = READ_ONCE(q->vnet_hdr_sz); if (iov_iter_count(iter) < vnet_hdr_len) return -EINVAL; if (virtio_net_hdr_from_skb(skb, &vnet_hdr, - tap_is_little_endian(q), true)) + tap_is_little_endian(q), true, + vlan_hlen)) BUG(); if (copy_to_iter(&vnet_hdr, sizeof(vnet_hdr), iter) != diff --git a/drivers/net/tun.c b/drivers/net/tun.c index 85e14adf5207..a192a017cc68 100644 --- a/drivers/net/tun.c +++ b/drivers/net/tun.c @@ -2089,7 +2089,8 @@ static ssize_t tun_put_user(struct tun_struct *tun, return -EINVAL; if (virtio_net_hdr_from_skb(skb, &gso, - tun_is_little_endian(tun), true)) { + tun_is_little_endian(tun), true, + vlan_hlen)) { struct skb_shared_info *sinfo = skb_shinfo(skb); pr_err("unexpected GSO type: " "0x%x, gso_size %d, hdr_len %d\n", diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c index 2aaa18ec7d46..1619ee3070b6 100644 --- a/drivers/net/virtio_net.c +++ b/drivers/net/virtio_net.c @@ -1411,7 +1411,8 @@ static int xmit_skb(struct send_queue *sq, struct sk_buff *skb) hdr = skb_vnet_hdr(skb); if (virtio_net_hdr_from_skb(skb, &hdr->hdr, - virtio_is_little_endian(vi->vdev), false)) + virtio_is_little_endian(vi->vdev), false, + 0)) BUG(); if (vi->mergeable_rx_bufs) diff --git a/include/linux/virtio_net.h b/include/linux/virtio_net.h index f144216febc6..9397628a1967 100644 --- a/include/linux/virtio_net.h +++ b/include/linux/virtio_net.h @@ -58,7 +58,8 @@ static inline int virtio_net_hdr_to_skb(struct sk_buff *skb, static inline int virtio_net_hdr_from_skb(const struct sk_buff *skb, struct virtio_net_hdr *hdr, bool little_endian, - bool has_data_valid) + bool has_data_valid, + int vlan_hlen) { memset(hdr, 0, sizeof(*hdr)); /* no info leak */ @@ -83,12 +84,8 @@ static inline int virtio_net_hdr_from_skb(const struct sk_buff *skb, if (skb->ip_summed == CHECKSUM_PARTIAL) { hdr->flags = VIRTIO_NET_HDR_F_NEEDS_CSUM; - if (skb_vlan_tag_present(skb)) - hdr->csum_start = __cpu_to_virtio16(little_endian, - skb_checksum_start_offset(skb) + VLAN_HLEN); - else - hdr->csum_start = __cpu_to_virtio16(little_endian, - skb_checksum_start_offset(skb)); + hdr->csum_start = __cpu_to_virtio16(little_endian, + skb_checksum_start_offset(skb) + vlan_hlen); hdr->csum_offset = __cpu_to_virtio16(little_endian, skb->csum_offset); } else if (has_data_valid && diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index 54ce66f68482..ee018564b2b4 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -2005,7 +2005,7 @@ static int packet_rcv_vnet(struct msghdr *msg, const struct sk_buff *skb, return -EINVAL; *len -= sizeof(vnet_hdr); - if (virtio_net_hdr_from_skb(skb, &vnet_hdr, vio_le(), true)) + if (virtio_net_hdr_from_skb(skb, &vnet_hdr, vio_le(), true, 0)) return -EINVAL; return memcpy_to_msg(msg, (void *)&vnet_hdr, sizeof(vnet_hdr)); @@ -2272,7 +2272,7 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev, if (do_vnet) { if (virtio_net_hdr_from_skb(skb, h.raw + macoff - sizeof(struct virtio_net_hdr), - vio_le(), true)) { + vio_le(), true, 0)) { spin_lock(&sk->sk_receive_queue.lock); goto drop_n_account; } -- cgit v1.2.3 From 000ade8016400d93b4d7c89970d96b8c14773d45 Mon Sep 17 00:00:00 2001 From: Sultan Alsawaf Date: Wed, 6 Jun 2018 15:56:54 -0700 Subject: ip_tunnel: Fix name string concatenate in __ip_tunnel_create() By passing a limit of 2 bytes to strncat, strncat is limited to writing fewer bytes than what it's supposed to append to the name here. Since the bounds are checked on the line above this, just remove the string bounds checks entirely since they're unneeded. Signed-off-by: Sultan Alsawaf Signed-off-by: David S. Miller --- net/ipv4/ip_tunnel.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/ipv4/ip_tunnel.c b/net/ipv4/ip_tunnel.c index 38d906baf1df..c4f5602308ed 100644 --- a/net/ipv4/ip_tunnel.c +++ b/net/ipv4/ip_tunnel.c @@ -261,8 +261,8 @@ static struct net_device *__ip_tunnel_create(struct net *net, } else { if (strlen(ops->kind) > (IFNAMSIZ - 3)) goto failed; - strlcpy(name, ops->kind, IFNAMSIZ); - strncat(name, "%d", 2); + strcpy(name, ops->kind); + strcat(name, "%d"); } ASSERT_RTNL(); -- cgit v1.2.3 From 8d97ca6b6755bf7ef57d323642ca9ee80d689782 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Thu, 7 Jun 2018 10:29:29 -0700 Subject: bpfilter: fix OUTPUT_FORMAT CONFIG_OUTPUT_FORMAT is x86 only macro. Used objdump to extract elf file format. Fixes: d2ba09c17a06 ("net: add skeleton of bpfilter kernel module") Reported-by: David S. Miller Signed-off-by: Alexei Starovoitov Signed-off-by: David S. Miller --- net/bpfilter/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/bpfilter/Makefile b/net/bpfilter/Makefile index aafa72001fcd..e0bbe7583e58 100644 --- a/net/bpfilter/Makefile +++ b/net/bpfilter/Makefile @@ -21,7 +21,7 @@ endif # which bpfilter_kern.c passes further into umh blob loader at run-time quiet_cmd_copy_umh = GEN $@ cmd_copy_umh = echo ':' > $(obj)/.bpfilter_umh.o.cmd; \ - $(OBJCOPY) -I binary -O $(CONFIG_OUTPUT_FORMAT) \ + $(OBJCOPY) -I binary -O `$(OBJDUMP) -f $<|grep format|cut -d' ' -f8` \ -B `$(OBJDUMP) -f $<|grep architecture|cut -d, -f1|cut -d' ' -f2` \ --rename-section .data=.init.rodata $< $@ -- cgit v1.2.3 From a5a16e43529b5040760ebf9bd9b056dd34861f93 Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Thu, 7 Jun 2018 15:37:34 +0200 Subject: xsk: Fix umem fill/completion queue mmap on 32-bit MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit With gcc-4.1.2 on 32-bit: net/xdp/xsk.c:663: warning: integer constant is too large for ‘long’ type net/xdp/xsk.c:665: warning: integer constant is too large for ‘long’ type Add the missing "ULL" suffixes to the large XDP_UMEM_PGOFF_*_RING values to fix this. net/xdp/xsk.c:663: warning: comparison is always false due to limited range of data type net/xdp/xsk.c:665: warning: comparison is always false due to limited range of data type "unsigned long" is 32-bit on 32-bit systems, hence the offset is truncated, and can never be equal to any of the XDP_UMEM_PGOFF_*_RING values. Use loff_t (and the required cast) to fix this. Fixes: 423f38329d267969 ("xsk: add umem fill queue support and mmap") Fixes: fe2308328cd2f26e ("xsk: add umem completion queue support and mmap") Signed-off-by: Geert Uytterhoeven Acked-by: Björn Töpel Signed-off-by: Daniel Borkmann --- include/uapi/linux/if_xdp.h | 4 ++-- net/xdp/xsk.c | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/include/uapi/linux/if_xdp.h b/include/uapi/linux/if_xdp.h index 1fa0e977ea8d..caed8b1614ff 100644 --- a/include/uapi/linux/if_xdp.h +++ b/include/uapi/linux/if_xdp.h @@ -63,8 +63,8 @@ struct xdp_statistics { /* Pgoff for mmaping the rings */ #define XDP_PGOFF_RX_RING 0 #define XDP_PGOFF_TX_RING 0x80000000 -#define XDP_UMEM_PGOFF_FILL_RING 0x100000000 -#define XDP_UMEM_PGOFF_COMPLETION_RING 0x180000000 +#define XDP_UMEM_PGOFF_FILL_RING 0x100000000ULL +#define XDP_UMEM_PGOFF_COMPLETION_RING 0x180000000ULL /* Rx/Tx descriptor */ struct xdp_desc { diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c index c6ed2454f7ce..36919a254ba3 100644 --- a/net/xdp/xsk.c +++ b/net/xdp/xsk.c @@ -643,7 +643,7 @@ static int xsk_getsockopt(struct socket *sock, int level, int optname, static int xsk_mmap(struct file *file, struct socket *sock, struct vm_area_struct *vma) { - unsigned long offset = vma->vm_pgoff << PAGE_SHIFT; + loff_t offset = (loff_t)vma->vm_pgoff << PAGE_SHIFT; unsigned long size = vma->vm_end - vma->vm_start; struct xdp_sock *xs = xdp_sk(sock->sk); struct xsk_queue *q = NULL; -- cgit v1.2.3 From c09290c5637692a9bfe7740e4c5e693efff12810 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Fri, 8 Jun 2018 00:06:01 +0200 Subject: bpf, xdp: fix crash in xdp_umem_unaccount_pages syzkaller was able to trigger the following panic for AF_XDP: BUG: KASAN: null-ptr-deref in atomic64_sub include/asm-generic/atomic-instrumented.h:144 [inline] BUG: KASAN: null-ptr-deref in atomic_long_sub include/asm-generic/atomic-long.h:199 [inline] BUG: KASAN: null-ptr-deref in xdp_umem_unaccount_pages.isra.4+0x3d/0x80 net/xdp/xdp_umem.c:135 Write of size 8 at addr 0000000000000060 by task syz-executor246/4527 CPU: 1 PID: 4527 Comm: syz-executor246 Not tainted 4.17.0+ #89 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 Call Trace: __dump_stack lib/dump_stack.c:77 [inline] dump_stack+0x1b9/0x294 lib/dump_stack.c:113 kasan_report_error mm/kasan/report.c:352 [inline] kasan_report.cold.7+0x6d/0x2fe mm/kasan/report.c:412 check_memory_region_inline mm/kasan/kasan.c:260 [inline] check_memory_region+0x13e/0x1b0 mm/kasan/kasan.c:267 kasan_check_write+0x14/0x20 mm/kasan/kasan.c:278 atomic64_sub include/asm-generic/atomic-instrumented.h:144 [inline] atomic_long_sub include/asm-generic/atomic-long.h:199 [inline] xdp_umem_unaccount_pages.isra.4+0x3d/0x80 net/xdp/xdp_umem.c:135 xdp_umem_reg net/xdp/xdp_umem.c:334 [inline] xdp_umem_create+0xd6c/0x10f0 net/xdp/xdp_umem.c:349 xsk_setsockopt+0x443/0x550 net/xdp/xsk.c:531 __sys_setsockopt+0x1bd/0x390 net/socket.c:1935 __do_sys_setsockopt net/socket.c:1946 [inline] __se_sys_setsockopt net/socket.c:1943 [inline] __x64_sys_setsockopt+0xbe/0x150 net/socket.c:1943 do_syscall_64+0x1b1/0x800 arch/x86/entry/common.c:287 entry_SYSCALL_64_after_hwframe+0x49/0xbe In xdp_umem_reg() the call to xdp_umem_account_pages() passed with CAP_IPC_LOCK where we didn't need to end up charging rlimit on memlock for the current user and therefore umem->user continues to be NULL. Later on through fault injection syzkaller triggered a failure in either umem->pgs or umem->pages allocation such that we bail out and undo accounting in xdp_umem_unaccount_pages() where we eventually hit the panic since it tries to deref the umem->user. The code is pretty close to mm_account_pinned_pages() and mm_unaccount_pinned_pages() pair and potentially could reuse it even in a later cleanup, and it appears that the initial commit c0c77d8fb787 ("xsk: add user memory registration support sockopt") got this right while later follow-up introduced the bug via a49049ea2576 ("xsk: simplified umem setup"). Fixes: a49049ea2576 ("xsk: simplified umem setup") Reported-by: syzbot+979217770b09ebf5c407@syzkaller.appspotmail.com Signed-off-by: Daniel Borkmann Signed-off-by: Alexei Starovoitov --- net/xdp/xdp_umem.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/xdp/xdp_umem.c b/net/xdp/xdp_umem.c index 7eb4948a38d2..b9ef487c4618 100644 --- a/net/xdp/xdp_umem.c +++ b/net/xdp/xdp_umem.c @@ -132,8 +132,10 @@ static void xdp_umem_unpin_pages(struct xdp_umem *umem) static void xdp_umem_unaccount_pages(struct xdp_umem *umem) { - atomic_long_sub(umem->npgs, &umem->user->locked_vm); - free_uid(umem->user); + if (umem->user) { + atomic_long_sub(umem->npgs, &umem->user->locked_vm); + free_uid(umem->user); + } } static void xdp_umem_release(struct xdp_umem *umem) -- cgit v1.2.3 From 66e58e0ef80a56a1d7857b6ce121141563cdd93e Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Thu, 7 Jun 2018 15:31:14 -0700 Subject: bpfilter: fix race in pipe access syzbot reported the following crash [ 338.293946] bpfilter: read fail -512 [ 338.304515] kasan: GPF could be caused by NULL-ptr deref or user memory access [ 338.311863] general protection fault: 0000 [#1] SMP KASAN [ 338.344360] RIP: 0010:__vfs_write+0x4a6/0x960 [ 338.426363] Call Trace: [ 338.456967] __kernel_write+0x10c/0x380 [ 338.460928] __bpfilter_process_sockopt+0x1d8/0x35b [ 338.487103] bpfilter_mbox_request+0x4d/0xb0 [ 338.491492] bpfilter_ip_get_sockopt+0x6b/0x90 This can happen when multiple cpus trying to talk to user mode process via bpfilter_mbox_request(). One cpu grabs the mutex while another goes to sleep on the same mutex. Then former cpu sees that umh pipe is down and shuts down the pipes. Later cpu finally acquires the mutex and crashes on freed pipe. Fix the race by using info.pid as an indicator that umh and pipes are healthy and check it after acquiring the mutex. Fixes: d2ba09c17a06 ("net: add skeleton of bpfilter kernel module") Reported-by: syzbot+7ade6c94abb2774c0fee@syzkaller.appspotmail.com Signed-off-by: Alexei Starovoitov Signed-off-by: David S. Miller --- net/bpfilter/bpfilter_kern.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/bpfilter/bpfilter_kern.c b/net/bpfilter/bpfilter_kern.c index b13d058f8c34..09522573f611 100644 --- a/net/bpfilter/bpfilter_kern.c +++ b/net/bpfilter/bpfilter_kern.c @@ -24,17 +24,19 @@ static void shutdown_umh(struct umh_info *info) { struct task_struct *tsk; + if (!info->pid) + return; tsk = pid_task(find_vpid(info->pid), PIDTYPE_PID); if (tsk) force_sig(SIGKILL, tsk); fput(info->pipe_to_umh); fput(info->pipe_from_umh); + info->pid = 0; } static void __stop_umh(void) { - if (IS_ENABLED(CONFIG_INET) && - bpfilter_process_sockopt) { + if (IS_ENABLED(CONFIG_INET)) { bpfilter_process_sockopt = NULL; shutdown_umh(&info); } @@ -55,7 +57,7 @@ static int __bpfilter_process_sockopt(struct sock *sk, int optname, struct mbox_reply reply; loff_t pos; ssize_t n; - int ret; + int ret = -EFAULT; req.is_set = is_set; req.pid = current->pid; @@ -63,6 +65,8 @@ static int __bpfilter_process_sockopt(struct sock *sk, int optname, req.addr = (long)optval; req.len = optlen; mutex_lock(&bpfilter_lock); + if (!info.pid) + goto out; n = __kernel_write(info.pipe_to_umh, &req, sizeof(req), &pos); if (n != sizeof(req)) { pr_err("write fail %zd\n", n); -- cgit v1.2.3 From 8d499533e0bc02d44283dbdab03142b599b8ba16 Mon Sep 17 00:00:00 2001 From: Davide Caratti Date: Fri, 8 Jun 2018 05:02:31 +0200 Subject: net/sched: act_simple: fix parsing of TCA_DEF_DATA use nla_strlcpy() to avoid copying data beyond the length of TCA_DEF_DATA netlink attribute, in case it is less than SIMP_MAX_DATA and it does not end with '\0' character. v2: fix errors in the commit message, thanks Hangbin Liu Fixes: fa1b1cff3d06 ("net_cls_act: Make act_simple use of netlink policy.") Signed-off-by: Davide Caratti Reviewed-by: Simon Horman Signed-off-by: David S. Miller --- net/sched/act_simple.c | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) (limited to 'net') diff --git a/net/sched/act_simple.c b/net/sched/act_simple.c index 9618b4a83cee..98c4afe7c15b 100644 --- a/net/sched/act_simple.c +++ b/net/sched/act_simple.c @@ -53,22 +53,22 @@ static void tcf_simp_release(struct tc_action *a) kfree(d->tcfd_defdata); } -static int alloc_defdata(struct tcf_defact *d, char *defdata) +static int alloc_defdata(struct tcf_defact *d, const struct nlattr *defdata) { d->tcfd_defdata = kzalloc(SIMP_MAX_DATA, GFP_KERNEL); if (unlikely(!d->tcfd_defdata)) return -ENOMEM; - strlcpy(d->tcfd_defdata, defdata, SIMP_MAX_DATA); + nla_strlcpy(d->tcfd_defdata, defdata, SIMP_MAX_DATA); return 0; } -static void reset_policy(struct tcf_defact *d, char *defdata, +static void reset_policy(struct tcf_defact *d, const struct nlattr *defdata, struct tc_defact *p) { spin_lock_bh(&d->tcf_lock); d->tcf_action = p->action; memset(d->tcfd_defdata, 0, SIMP_MAX_DATA); - strlcpy(d->tcfd_defdata, defdata, SIMP_MAX_DATA); + nla_strlcpy(d->tcfd_defdata, defdata, SIMP_MAX_DATA); spin_unlock_bh(&d->tcf_lock); } @@ -87,7 +87,6 @@ static int tcf_simp_init(struct net *net, struct nlattr *nla, struct tcf_defact *d; bool exists = false; int ret = 0, err; - char *defdata; if (nla == NULL) return -EINVAL; @@ -110,8 +109,6 @@ static int tcf_simp_init(struct net *net, struct nlattr *nla, return -EINVAL; } - defdata = nla_data(tb[TCA_DEF_DATA]); - if (!exists) { ret = tcf_idr_create(tn, parm->index, est, a, &act_simp_ops, bind, false); @@ -119,7 +116,7 @@ static int tcf_simp_init(struct net *net, struct nlattr *nla, return ret; d = to_defact(*a); - ret = alloc_defdata(d, defdata); + ret = alloc_defdata(d, tb[TCA_DEF_DATA]); if (ret < 0) { tcf_idr_release(*a, bind); return ret; @@ -133,7 +130,7 @@ static int tcf_simp_init(struct net *net, struct nlattr *nla, if (!ovr) return -EEXIST; - reset_policy(d, defdata, parm); + reset_policy(d, tb[TCA_DEF_DATA], parm); } if (ret == ACT_P_CREATED) -- cgit v1.2.3 From 6c206b20092a3623184cff9470dba75d21507874 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Fri, 8 Jun 2018 11:35:40 +0200 Subject: udp: fix rx queue len reported by diag and proc interface After commit 6b229cf77d68 ("udp: add batching to udp_rmem_release()") the sk_rmem_alloc field does not measure exactly anymore the receive queue length, because we batch the rmem release. The issue is really apparent only after commit 0d4a6608f68c ("udp: do rmem bulk free even if the rx sk queue is empty"): the user space can easily check for an empty socket with not-0 queue length reported by the 'ss' tool or the procfs interface. We need to use a custom UDP helper to report the correct queue length, taking into account the forward allocation deficit. Reported-by: trevor.francis@46labs.com Fixes: 6b229cf77d68 ("UDP: add batching to udp_rmem_release()") Signed-off-by: Paolo Abeni Signed-off-by: David S. Miller --- include/net/transp_v6.h | 11 +++++++++-- include/net/udp.h | 5 +++++ net/ipv4/udp.c | 2 +- net/ipv4/udp_diag.c | 2 +- net/ipv6/datagram.c | 6 +++--- net/ipv6/udp.c | 3 ++- 6 files changed, 21 insertions(+), 8 deletions(-) (limited to 'net') diff --git a/include/net/transp_v6.h b/include/net/transp_v6.h index c4f5caaf3778..f6a3543e5247 100644 --- a/include/net/transp_v6.h +++ b/include/net/transp_v6.h @@ -45,8 +45,15 @@ int ip6_datagram_send_ctl(struct net *net, struct sock *sk, struct msghdr *msg, struct flowi6 *fl6, struct ipcm6_cookie *ipc6, struct sockcm_cookie *sockc); -void ip6_dgram_sock_seq_show(struct seq_file *seq, struct sock *sp, - __u16 srcp, __u16 destp, int bucket); +void __ip6_dgram_sock_seq_show(struct seq_file *seq, struct sock *sp, + __u16 srcp, __u16 destp, int rqueue, int bucket); +static inline void +ip6_dgram_sock_seq_show(struct seq_file *seq, struct sock *sp, __u16 srcp, + __u16 destp, int bucket) +{ + __ip6_dgram_sock_seq_show(seq, sp, srcp, destp, sk_rmem_alloc_get(sp), + bucket); +} #define LOOPBACK4_IPV6 cpu_to_be32(0x7f000006) diff --git a/include/net/udp.h b/include/net/udp.h index 7ba0ed252c52..b1ea8b0f5e6a 100644 --- a/include/net/udp.h +++ b/include/net/udp.h @@ -247,6 +247,11 @@ static inline __be16 udp_flow_src_port(struct net *net, struct sk_buff *skb, return htons((((u64) hash * (max - min)) >> 32) + min); } +static inline int udp_rqueue_get(struct sock *sk) +{ + return sk_rmem_alloc_get(sk) - READ_ONCE(udp_sk(sk)->forward_deficit); +} + /* net/ipv4/udp.c */ void udp_destruct_sock(struct sock *sk); void skb_consume_udp(struct sock *sk, struct sk_buff *skb, int len); diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 3365362cac88..9bb27df4dac5 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -2772,7 +2772,7 @@ static void udp4_format_sock(struct sock *sp, struct seq_file *f, " %02X %08X:%08X %02X:%08lX %08X %5u %8d %lu %d %pK %d", bucket, src, srcp, dest, destp, sp->sk_state, sk_wmem_alloc_get(sp), - sk_rmem_alloc_get(sp), + udp_rqueue_get(sp), 0, 0L, 0, from_kuid_munged(seq_user_ns(f), sock_i_uid(sp)), 0, sock_i_ino(sp), diff --git a/net/ipv4/udp_diag.c b/net/ipv4/udp_diag.c index d0390d844ac8..d9ad986c7b2c 100644 --- a/net/ipv4/udp_diag.c +++ b/net/ipv4/udp_diag.c @@ -163,7 +163,7 @@ static int udp_diag_dump_one(struct sk_buff *in_skb, const struct nlmsghdr *nlh, static void udp_diag_get_info(struct sock *sk, struct inet_diag_msg *r, void *info) { - r->idiag_rqueue = sk_rmem_alloc_get(sk); + r->idiag_rqueue = udp_rqueue_get(sk); r->idiag_wqueue = sk_wmem_alloc_get(sk); } diff --git a/net/ipv6/datagram.c b/net/ipv6/datagram.c index a02ad100f0d7..2ee08b6a86a4 100644 --- a/net/ipv6/datagram.c +++ b/net/ipv6/datagram.c @@ -1019,8 +1019,8 @@ exit_f: } EXPORT_SYMBOL_GPL(ip6_datagram_send_ctl); -void ip6_dgram_sock_seq_show(struct seq_file *seq, struct sock *sp, - __u16 srcp, __u16 destp, int bucket) +void __ip6_dgram_sock_seq_show(struct seq_file *seq, struct sock *sp, + __u16 srcp, __u16 destp, int rqueue, int bucket) { const struct in6_addr *dest, *src; @@ -1036,7 +1036,7 @@ void ip6_dgram_sock_seq_show(struct seq_file *seq, struct sock *sp, dest->s6_addr32[2], dest->s6_addr32[3], destp, sp->sk_state, sk_wmem_alloc_get(sp), - sk_rmem_alloc_get(sp), + rqueue, 0, 0L, 0, from_kuid_munged(seq_user_ns(seq), sock_i_uid(sp)), 0, diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index 164afd31aebf..e6645cae403e 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -1523,7 +1523,8 @@ int udp6_seq_show(struct seq_file *seq, void *v) struct inet_sock *inet = inet_sk(v); __u16 srcp = ntohs(inet->inet_sport); __u16 destp = ntohs(inet->inet_dport); - ip6_dgram_sock_seq_show(seq, v, srcp, destp, bucket); + __ip6_dgram_sock_seq_show(seq, v, srcp, destp, + udp_rqueue_get(v), bucket); } return 0; } -- cgit v1.2.3 From 873aca2ee86e533665a73292c4e308ded1e9bafe Mon Sep 17 00:00:00 2001 From: Petr Machata Date: Fri, 8 Jun 2018 15:11:47 +0200 Subject: net: bridge: Fix locking in br_fdb_find_port() Callers of br_fdb_find() need to hold the hash lock, which br_fdb_find_port() doesn't do. However, since br_fdb_find_port() is not doing any actual FDB manipulation, the hash lock is not really needed at all. So convert to br_fdb_find_rcu(), surrounded by rcu_read_lock() / _unlock() pair. The device pointer copied from inside the FDB entry is then kept alive by the RTNL lock, which br_fdb_find_port() asserts. Fixes: 4d4fd36126d6 ("net: bridge: Publish bridge accessor functions") Signed-off-by: Petr Machata Acked-by: Nikolay Aleksandrov Signed-off-by: David S. Miller --- net/bridge/br_fdb.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/bridge/br_fdb.c b/net/bridge/br_fdb.c index b19e3104afd6..502f66349530 100644 --- a/net/bridge/br_fdb.c +++ b/net/bridge/br_fdb.c @@ -135,9 +135,11 @@ struct net_device *br_fdb_find_port(const struct net_device *br_dev, return NULL; br = netdev_priv(br_dev); - f = br_fdb_find(br, addr, vid); + rcu_read_lock(); + f = br_fdb_find_rcu(br, addr, vid); if (f && f->dst) dev = f->dst->dev; + rcu_read_unlock(); return dev; } -- cgit v1.2.3 From 6d8c50dcb029872b298eea68cc6209c866fd3e14 Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Thu, 7 Jun 2018 13:39:49 -0700 Subject: socket: close race condition between sock_close() and sockfs_setattr() fchownat() doesn't even hold refcnt of fd until it figures out fd is really needed (otherwise is ignored) and releases it after it resolves the path. This means sock_close() could race with sockfs_setattr(), which leads to a NULL pointer dereference since typically we set sock->sk to NULL in ->release(). As pointed out by Al, this is unique to sockfs. So we can fix this in socket layer by acquiring inode_lock in sock_close() and checking against NULL in sockfs_setattr(). sock_release() is called in many places, only the sock_close() path matters here. And fortunately, this should not affect normal sock_close() as it is only called when the last fd refcnt is gone. It only affects sock_close() with a parallel sockfs_setattr() in progress, which is not common. Fixes: 86741ec25462 ("net: core: Add a UID field to struct sock.") Reported-by: shankarapailoor Cc: Tetsuo Handa Cc: Lorenzo Colitti Cc: Al Viro Signed-off-by: Cong Wang Signed-off-by: David S. Miller --- net/socket.c | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/socket.c b/net/socket.c index af57d85bcb48..8a109012608a 100644 --- a/net/socket.c +++ b/net/socket.c @@ -541,7 +541,10 @@ static int sockfs_setattr(struct dentry *dentry, struct iattr *iattr) if (!err && (iattr->ia_valid & ATTR_UID)) { struct socket *sock = SOCKET_I(d_inode(dentry)); - sock->sk->sk_uid = iattr->ia_uid; + if (sock->sk) + sock->sk->sk_uid = iattr->ia_uid; + else + err = -ENOENT; } return err; @@ -590,12 +593,16 @@ EXPORT_SYMBOL(sock_alloc); * an inode not a file. */ -void sock_release(struct socket *sock) +static void __sock_release(struct socket *sock, struct inode *inode) { if (sock->ops) { struct module *owner = sock->ops->owner; + if (inode) + inode_lock(inode); sock->ops->release(sock); + if (inode) + inode_unlock(inode); sock->ops = NULL; module_put(owner); } @@ -609,6 +616,11 @@ void sock_release(struct socket *sock) } sock->file = NULL; } + +void sock_release(struct socket *sock) +{ + __sock_release(sock, NULL); +} EXPORT_SYMBOL(sock_release); void __sock_tx_timestamp(__u16 tsflags, __u8 *tx_flags) @@ -1171,7 +1183,7 @@ static int sock_mmap(struct file *file, struct vm_area_struct *vma) static int sock_close(struct inode *inode, struct file *filp) { - sock_release(SOCKET_I(inode)); + __sock_release(SOCKET_I(inode), inode); return 0; } -- cgit v1.2.3 From 867f816badc01e6da655028810d468c9f935b37c Mon Sep 17 00:00:00 2001 From: Soheil Hassas Yeganeh Date: Fri, 8 Jun 2018 22:47:10 -0400 Subject: tcp: limit sk_rcvlowat by the maximum receive buffer The user-provided value to setsockopt(SO_RCVLOWAT) can be larger than the maximum possible receive buffer. Such values mute POLLIN signals on the socket which can stall progress on the socket. Limit the user-provided value to half of the maximum receive buffer, i.e., half of sk_rcvbuf when the receive buffer size is set by the user, or otherwise half of sysctl_tcp_rmem[2]. Fixes: d1361840f8c5 ("tcp: fix SO_RCVLOWAT and RCVBUF autotuning") Signed-off-by: Soheil Hassas Yeganeh Signed-off-by: Eric Dumazet Reviewed-by: Neal Cardwell Acked-by: Willem de Bruijn Signed-off-by: David S. Miller --- net/ipv4/tcp.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 2741953adaba..141acd92e58a 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -1694,6 +1694,13 @@ EXPORT_SYMBOL(tcp_peek_len); /* Make sure sk_rcvbuf is big enough to satisfy SO_RCVLOWAT hint */ int tcp_set_rcvlowat(struct sock *sk, int val) { + int cap; + + if (sk->sk_userlocks & SOCK_RCVBUF_LOCK) + cap = sk->sk_rcvbuf >> 1; + else + cap = sock_net(sk)->ipv4.sysctl_tcp_rmem[2] >> 1; + val = min(val, cap); sk->sk_rcvlowat = val ? : 1; /* Check if we need to signal EPOLLIN right now */ @@ -1702,12 +1709,7 @@ int tcp_set_rcvlowat(struct sock *sk, int val) if (sk->sk_userlocks & SOCK_RCVBUF_LOCK) return 0; - /* val comes from user space and might be close to INT_MAX */ val <<= 1; - if (val < 0) - val = INT_MAX; - - val = min(val, sock_net(sk)->ipv4.sysctl_tcp_rmem[2]); if (val > sk->sk_rcvbuf) { sk->sk_rcvbuf = val; tcp_sk(sk)->window_clamp = tcp_win_from_space(sk, val); -- cgit v1.2.3