From e4e5aefc113510c03d34e182ab30bc0cc196675c Mon Sep 17 00:00:00 2001 From: Magnus Karlsson Date: Mon, 4 May 2020 15:33:51 +0200 Subject: xsk: Change two variable names for increased clarity Change two variable names so that it is clearer what they represent. The first one is xsk_list that in fact only contains the list of AF_XDP sockets with a Tx component. Change this to xsk_tx_list for improved clarity. The second variable is size in the ring structure. One might think that this is the size of the ring, but it is in fact the size of the umem, copied into the ring structure to improve performance. Rename this variable to umem_size to avoid any confusion. Signed-off-by: Magnus Karlsson Signed-off-by: Daniel Borkmann Acked-by: Jonathan Lemon Link: https://lore.kernel.org/bpf/1588599232-24897-2-git-send-email-magnus.karlsson@intel.com --- net/xdp/xdp_umem.c | 14 +++++++------- net/xdp/xsk.c | 8 ++++---- net/xdp/xsk_queue.c | 4 ++-- net/xdp/xsk_queue.h | 8 ++++---- 4 files changed, 17 insertions(+), 17 deletions(-) (limited to 'net') diff --git a/net/xdp/xdp_umem.c b/net/xdp/xdp_umem.c index ed7a6060f73c..7211f4572760 100644 --- a/net/xdp/xdp_umem.c +++ b/net/xdp/xdp_umem.c @@ -30,9 +30,9 @@ void xdp_add_sk_umem(struct xdp_umem *umem, struct xdp_sock *xs) if (!xs->tx) return; - spin_lock_irqsave(&umem->xsk_list_lock, flags); - list_add_rcu(&xs->list, &umem->xsk_list); - spin_unlock_irqrestore(&umem->xsk_list_lock, flags); + spin_lock_irqsave(&umem->xsk_tx_list_lock, flags); + list_add_rcu(&xs->list, &umem->xsk_tx_list); + spin_unlock_irqrestore(&umem->xsk_tx_list_lock, flags); } void xdp_del_sk_umem(struct xdp_umem *umem, struct xdp_sock *xs) @@ -42,9 +42,9 @@ void xdp_del_sk_umem(struct xdp_umem *umem, struct xdp_sock *xs) if (!xs->tx) return; - spin_lock_irqsave(&umem->xsk_list_lock, flags); + spin_lock_irqsave(&umem->xsk_tx_list_lock, flags); list_del_rcu(&xs->list); - spin_unlock_irqrestore(&umem->xsk_list_lock, flags); + spin_unlock_irqrestore(&umem->xsk_tx_list_lock, flags); } /* The umem is stored both in the _rx struct and the _tx struct as we do @@ -395,8 +395,8 @@ static int xdp_umem_reg(struct xdp_umem *umem, struct xdp_umem_reg *mr) umem->pgs = NULL; umem->user = NULL; umem->flags = mr->flags; - INIT_LIST_HEAD(&umem->xsk_list); - spin_lock_init(&umem->xsk_list_lock); + INIT_LIST_HEAD(&umem->xsk_tx_list); + spin_lock_init(&umem->xsk_tx_list_lock); refcount_set(&umem->users, 1); diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c index f6e6609f70a3..45ffd67b367d 100644 --- a/net/xdp/xsk.c +++ b/net/xdp/xsk.c @@ -75,7 +75,7 @@ void xsk_set_tx_need_wakeup(struct xdp_umem *umem) return; rcu_read_lock(); - list_for_each_entry_rcu(xs, &umem->xsk_list, list) { + list_for_each_entry_rcu(xs, &umem->xsk_tx_list, list) { xs->tx->ring->flags |= XDP_RING_NEED_WAKEUP; } rcu_read_unlock(); @@ -102,7 +102,7 @@ void xsk_clear_tx_need_wakeup(struct xdp_umem *umem) return; rcu_read_lock(); - list_for_each_entry_rcu(xs, &umem->xsk_list, list) { + list_for_each_entry_rcu(xs, &umem->xsk_tx_list, list) { xs->tx->ring->flags &= ~XDP_RING_NEED_WAKEUP; } rcu_read_unlock(); @@ -305,7 +305,7 @@ void xsk_umem_consume_tx_done(struct xdp_umem *umem) struct xdp_sock *xs; rcu_read_lock(); - list_for_each_entry_rcu(xs, &umem->xsk_list, list) { + list_for_each_entry_rcu(xs, &umem->xsk_tx_list, list) { __xskq_cons_release(xs->tx); xs->sk.sk_write_space(&xs->sk); } @@ -318,7 +318,7 @@ bool xsk_umem_consume_tx(struct xdp_umem *umem, struct xdp_desc *desc) struct xdp_sock *xs; rcu_read_lock(); - list_for_each_entry_rcu(xs, 
&umem->xsk_list, list) { + list_for_each_entry_rcu(xs, &umem->xsk_tx_list, list) { if (!xskq_cons_peek_desc(xs->tx, desc, umem)) continue; diff --git a/net/xdp/xsk_queue.c b/net/xdp/xsk_queue.c index c90e9c1e3c63..57fb81bd593c 100644 --- a/net/xdp/xsk_queue.c +++ b/net/xdp/xsk_queue.c @@ -9,12 +9,12 @@ #include "xsk_queue.h" -void xskq_set_umem(struct xsk_queue *q, u64 size, u64 chunk_mask) +void xskq_set_umem(struct xsk_queue *q, u64 umem_size, u64 chunk_mask) { if (!q) return; - q->size = size; + q->umem_size = umem_size; q->chunk_mask = chunk_mask; } diff --git a/net/xdp/xsk_queue.h b/net/xdp/xsk_queue.h index b50bb5c76da5..648733ec24ac 100644 --- a/net/xdp/xsk_queue.h +++ b/net/xdp/xsk_queue.h @@ -30,7 +30,7 @@ struct xdp_umem_ring { struct xsk_queue { u64 chunk_mask; - u64 size; + u64 umem_size; u32 ring_mask; u32 nentries; u32 cached_prod; @@ -123,7 +123,7 @@ static inline bool xskq_cons_is_valid_unaligned(struct xsk_queue *q, u64 base_addr = xsk_umem_extract_addr(addr); addr = xsk_umem_add_offset_to_addr(addr); - if (base_addr >= q->size || addr >= q->size || + if (base_addr >= q->umem_size || addr >= q->umem_size || xskq_cons_crosses_non_contig_pg(umem, addr, length)) { q->invalid_descs++; return false; @@ -134,7 +134,7 @@ static inline bool xskq_cons_is_valid_unaligned(struct xsk_queue *q, static inline bool xskq_cons_is_valid_addr(struct xsk_queue *q, u64 addr) { - if (addr >= q->size) { + if (addr >= q->umem_size) { q->invalid_descs++; return false; } @@ -379,7 +379,7 @@ static inline u64 xskq_nb_invalid_descs(struct xsk_queue *q) return q ? q->invalid_descs : 0; } -void xskq_set_umem(struct xsk_queue *q, u64 size, u64 chunk_mask); +void xskq_set_umem(struct xsk_queue *q, u64 umem_size, u64 chunk_mask); struct xsk_queue *xskq_create(u32 nentries, bool umem_queue); void xskq_destroy(struct xsk_queue *q_ops); -- cgit v1.2.3 From 07bf2d97d1f37e7ac8d7be2d84ff108d43556a1d Mon Sep 17 00:00:00 2001 From: Magnus Karlsson Date: Mon, 4 May 2020 15:33:52 +0200 Subject: xsk: Remove unnecessary member in xdp_umem Remove the unnecessary address member from struct xdp_umem, as it is only used during umem registration. There is no need to carry it around, as it is used neither at run-time nor when unregistering the umem. 
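To illustrate the idea with a condensed before/after (simplified from the xdp_umem.c hunks below, not a literal quote of the diff): registration-only data moves out of the long-lived struct and into the call chain:

    /* before: the mmap address was stored for the umem's whole lifetime */
    umem->address = (unsigned long)addr;
    ...
    err = xdp_umem_pin_pages(umem);

    /* after: it is passed only to the one function that needs it */
    err = xdp_umem_pin_pages(umem, (unsigned long)addr);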
Signed-off-by: Magnus Karlsson Signed-off-by: Daniel Borkmann Acked-by: Jonathan Lemon Link: https://lore.kernel.org/bpf/1588599232-24897-3-git-send-email-magnus.karlsson@intel.com --- include/net/xdp_sock.h | 1 - net/xdp/xdp_umem.c | 7 +++---- 2 files changed, 3 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/include/net/xdp_sock.h b/include/net/xdp_sock.h index b72f1f4c3b15..67191ccaab85 100644 --- a/include/net/xdp_sock.h +++ b/include/net/xdp_sock.h @@ -50,7 +50,6 @@ struct xdp_umem { u32 headroom; u32 chunk_size_nohr; struct user_struct *user; - unsigned long address; refcount_t users; struct work_struct work; struct page **pgs; diff --git a/net/xdp/xdp_umem.c b/net/xdp/xdp_umem.c index 7211f4572760..37ace3bc0d48 100644 --- a/net/xdp/xdp_umem.c +++ b/net/xdp/xdp_umem.c @@ -279,7 +279,7 @@ void xdp_put_umem(struct xdp_umem *umem) } } -static int xdp_umem_pin_pages(struct xdp_umem *umem) +static int xdp_umem_pin_pages(struct xdp_umem *umem, unsigned long address) { unsigned int gup_flags = FOLL_WRITE; long npgs; @@ -291,7 +291,7 @@ static int xdp_umem_pin_pages(struct xdp_umem *umem) return -ENOMEM; down_read(&current->mm->mmap_sem); - npgs = pin_user_pages(umem->address, umem->npgs, + npgs = pin_user_pages(address, umem->npgs, gup_flags | FOLL_LONGTERM, &umem->pgs[0], NULL); up_read(&current->mm->mmap_sem); @@ -385,7 +385,6 @@ static int xdp_umem_reg(struct xdp_umem *umem, struct xdp_umem_reg *mr) if (headroom >= chunk_size - XDP_PACKET_HEADROOM) return -EINVAL; - umem->address = (unsigned long)addr; umem->chunk_mask = unaligned_chunks ? XSK_UNALIGNED_BUF_ADDR_MASK : ~((u64)chunk_size - 1); umem->size = size; @@ -404,7 +403,7 @@ static int xdp_umem_reg(struct xdp_umem *umem, struct xdp_umem_reg *mr) if (err) return err; - err = xdp_umem_pin_pages(umem); + err = xdp_umem_pin_pages(umem, (unsigned long)addr); if (err) goto out_account; -- cgit v1.2.3 From cb0721c7e200750907bb8ef59b12646a5cb2dadf Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Fri, 8 May 2020 10:46:10 -0700 Subject: net: Refactor arguments of inet{,6}_bind The intent is to add an additional bind parameter in the next commit. Instead of adding another argument, let's convert all existing flag arguments into an extendable bit field. No functional changes. Signed-off-by: Stanislav Fomichev Signed-off-by: Daniel Borkmann Acked-by: Andrey Ignatov Acked-by: Martin KaFai Lau Link: https://lore.kernel.org/bpf/20200508174611.228805-4-sdf@google.com --- include/net/inet_common.h | 6 +++++- include/net/ipv6_stubs.h | 2 +- net/core/filter.c | 6 ++++-- net/ipv4/af_inet.c | 10 +++++----- net/ipv6/af_inet6.c | 10 +++++----- 5 files changed, 20 insertions(+), 14 deletions(-) (limited to 'net') diff --git a/include/net/inet_common.h b/include/net/inet_common.h index ae2ba897675c..c38f4f7d660a 100644 --- a/include/net/inet_common.h +++ b/include/net/inet_common.h @@ -35,8 +35,12 @@ int inet_shutdown(struct socket *sock, int how); int inet_listen(struct socket *sock, int backlog); void inet_sock_destruct(struct sock *sk); int inet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len); +/* Don't allocate port at this moment, defer to connect. */ +#define BIND_FORCE_ADDRESS_NO_PORT (1 << 0) +/* Grab and release socket lock. 
*/ +#define BIND_WITH_LOCK (1 << 1) int __inet_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len, - bool force_bind_address_no_port, bool with_lock); + u32 flags); int inet_getname(struct socket *sock, struct sockaddr *uaddr, int peer); int inet_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg); diff --git a/include/net/ipv6_stubs.h b/include/net/ipv6_stubs.h index a5f7c12c326a..6e622dd3122e 100644 --- a/include/net/ipv6_stubs.h +++ b/include/net/ipv6_stubs.h @@ -63,7 +63,7 @@ extern const struct ipv6_stub *ipv6_stub __read_mostly; /* A stub used by bpf helpers. Similarly ugly as ipv6_stub */ struct ipv6_bpf_stub { int (*inet6_bind)(struct sock *sk, struct sockaddr *uaddr, int addr_len, - bool force_bind_address_no_port, bool with_lock); + u32 flags); struct sock *(*udp6_lib_lookup)(struct net *net, const struct in6_addr *saddr, __be16 sport, const struct in6_addr *daddr, __be16 dport, diff --git a/net/core/filter.c b/net/core/filter.c index dfaf5df13722..fa9ddab5dd1f 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -4538,7 +4538,8 @@ BPF_CALL_3(bpf_bind, struct bpf_sock_addr_kern *, ctx, struct sockaddr *, addr, return err; if (((struct sockaddr_in *)addr)->sin_port != htons(0)) return err; - return __inet_bind(sk, addr, addr_len, true, false); + return __inet_bind(sk, addr, addr_len, + BIND_FORCE_ADDRESS_NO_PORT); #if IS_ENABLED(CONFIG_IPV6) } else if (addr->sa_family == AF_INET6) { if (addr_len < SIN6_LEN_RFC2133) @@ -4548,7 +4549,8 @@ BPF_CALL_3(bpf_bind, struct bpf_sock_addr_kern *, ctx, struct sockaddr *, addr, /* ipv6_bpf_stub cannot be NULL, since it's called from * bpf_cgroup_inet6_connect hook and ipv6 is already loaded */ - return ipv6_bpf_stub->inet6_bind(sk, addr, addr_len, true, false); + return ipv6_bpf_stub->inet6_bind(sk, addr, addr_len, + BIND_FORCE_ADDRESS_NO_PORT); #endif /* CONFIG_IPV6 */ } #endif /* CONFIG_INET */ diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index 6177c4ba0037..68e74b1b0f26 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -450,12 +450,12 @@ int inet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) if (err) return err; - return __inet_bind(sk, uaddr, addr_len, false, true); + return __inet_bind(sk, uaddr, addr_len, BIND_WITH_LOCK); } EXPORT_SYMBOL(inet_bind); int __inet_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len, - bool force_bind_address_no_port, bool with_lock) + u32 flags) { struct sockaddr_in *addr = (struct sockaddr_in *)uaddr; struct inet_sock *inet = inet_sk(sk); @@ -506,7 +506,7 @@ int __inet_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len, * would be illegal to use them (multicast/broadcast) in * which case the sending device address is used. */ - if (with_lock) + if (flags & BIND_WITH_LOCK) lock_sock(sk); /* Check these errors (active socket, double bind). */ @@ -520,7 +520,7 @@ int __inet_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len, /* Make sure we are allowed to bind here. 
*/ if (snum || !(inet->bind_address_no_port || - force_bind_address_no_port)) { + (flags & BIND_FORCE_ADDRESS_NO_PORT))) { if (sk->sk_prot->get_port(sk, snum)) { inet->inet_saddr = inet->inet_rcv_saddr = 0; err = -EADDRINUSE; goto out_release_sock; } @@ -543,7 +543,7 @@ int __inet_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len, sk_dst_reset(sk); err = 0; out_release_sock: - if (with_lock) + if (flags & BIND_WITH_LOCK) release_sock(sk); out: return err; diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c index 345baa0a754f..552c2592b81c 100644 --- a/net/ipv6/af_inet6.c +++ b/net/ipv6/af_inet6.c @@ -273,7 +273,7 @@ out_rcu_unlock: } static int __inet6_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len, - bool force_bind_address_no_port, bool with_lock) + u32 flags) { struct sockaddr_in6 *addr = (struct sockaddr_in6 *)uaddr; struct inet_sock *inet = inet_sk(sk); @@ -297,7 +297,7 @@ static int __inet6_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len, !ns_capable(net->user_ns, CAP_NET_BIND_SERVICE)) return -EACCES; - if (with_lock) + if (flags & BIND_WITH_LOCK) lock_sock(sk); /* Check these errors (active socket, double bind). */ @@ -400,7 +400,7 @@ static int __inet6_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len, /* Make sure we are allowed to bind here. */ if (snum || !(inet->bind_address_no_port || - force_bind_address_no_port)) { + (flags & BIND_FORCE_ADDRESS_NO_PORT))) { if (sk->sk_prot->get_port(sk, snum)) { sk->sk_ipv6only = saved_ipv6only; inet_reset_saddr(sk); @@ -423,7 +423,7 @@ static int __inet6_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len, inet->inet_dport = 0; inet->inet_daddr = 0; out: - if (with_lock) + if (flags & BIND_WITH_LOCK) release_sock(sk); return err; out_unlock: @@ -451,7 +451,7 @@ int inet6_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) if (err) return err; - return __inet6_bind(sk, uaddr, addr_len, false, true); + return __inet6_bind(sk, uaddr, addr_len, BIND_WITH_LOCK); } EXPORT_SYMBOL(inet6_bind); -- cgit v1.2.3 From 8086fbaf49345f988deec539ec8e182b02914401 Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Fri, 8 May 2020 10:46:11 -0700 Subject: bpf: Allow any port in bpf_bind helper We want to have tighter control over which ports we bind to in the BPF_CGROUP_INET{4,6}_CONNECT hooks, even if it means connect() becomes slightly more expensive. The expensive part comes from the fact that we now need to call inet_csk_get_port(), which verifies that the port is not used and allocates an entry in the hash table for it. Since we can't rely on "snum || !bind_address_no_port" to prevent us from calling the POST_BIND hook anymore, let's add another bind flag to indicate that the call site is a BPF program. 
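Concretely, the bpf_bind() helper side then composes the flags roughly like this (condensed from the net/core/filter.c hunk in this patch):

    u32 flags = BIND_FROM_BPF;

    if (((struct sockaddr_in *)addr)->sin_port == htons(0))
            flags |= BIND_FORCE_ADDRESS_NO_PORT;
    return __inet_bind(sk, addr, addr_len, flags);

__inet_bind() then skips the POST_BIND hook whenever BIND_FROM_BPF is set.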
v5: * fix wrong AF_INET (should be AF_INET6) in the bpf program for v6 v3: * More bpf_bind documentation refinements (Martin KaFai Lau) * Add UDP tests as well (Martin KaFai Lau) * Don't start the thread, just do socket+bind+listen (Martin KaFai Lau) v2: * Update documentation (Andrey Ignatov) * Pass BIND_FORCE_ADDRESS_NO_PORT conditionally (Andrey Ignatov) Signed-off-by: Stanislav Fomichev Signed-off-by: Daniel Borkmann Acked-by: Andrey Ignatov Acked-by: Martin KaFai Lau Link: https://lore.kernel.org/bpf/20200508174611.228805-5-sdf@google.com --- include/net/inet_common.h | 2 + include/uapi/linux/bpf.h | 9 +- net/core/filter.c | 18 ++-- net/ipv4/af_inet.c | 10 +- net/ipv6/af_inet6.c | 12 ++- tools/include/uapi/linux/bpf.h | 9 +- .../selftests/bpf/prog_tests/connect_force_port.c | 115 +++++++++++++++++++++ .../selftests/bpf/progs/connect_force_port4.c | 28 +++++ .../selftests/bpf/progs/connect_force_port6.c | 28 +++++ 9 files changed, 203 insertions(+), 28 deletions(-) create mode 100644 tools/testing/selftests/bpf/prog_tests/connect_force_port.c create mode 100644 tools/testing/selftests/bpf/progs/connect_force_port4.c create mode 100644 tools/testing/selftests/bpf/progs/connect_force_port6.c (limited to 'net') diff --git a/include/net/inet_common.h b/include/net/inet_common.h index c38f4f7d660a..cb2818862919 100644 --- a/include/net/inet_common.h +++ b/include/net/inet_common.h @@ -39,6 +39,8 @@ int inet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len); #define BIND_FORCE_ADDRESS_NO_PORT (1 << 0) /* Grab and release socket lock. */ #define BIND_WITH_LOCK (1 << 1) +/* Called from BPF program. */ +#define BIND_FROM_BPF (1 << 2) int __inet_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len, u32 flags); int inet_getname(struct socket *sock, struct sockaddr *uaddr, diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index b3643e27e264..6e5e7caa3739 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -1994,10 +1994,11 @@ union bpf_attr { * * This helper works for IPv4 and IPv6, TCP and UDP sockets. The * domain (*addr*\ **->sa_family**) must be **AF_INET** (or - * **AF_INET6**). Looking for a free port to bind to can be - * expensive, therefore binding to port is not permitted by the - * helper: *addr*\ **->sin_port** (or **sin6_port**, respectively) - * must be set to zero. + * **AF_INET6**). It's advised to pass zero port (**sin_port** + * or **sin6_port**) which triggers IP_BIND_ADDRESS_NO_PORT-like + * behavior and lets the kernel efficiently pick up an unused + * port as long as 4-tuple is unique. Passing non-zero port might + * lead to degraded performance. * Return * 0 on success, or a negative error in case of failure. * diff --git a/net/core/filter.c b/net/core/filter.c index fa9ddab5dd1f..da0634979f53 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -4525,32 +4525,28 @@ BPF_CALL_3(bpf_bind, struct bpf_sock_addr_kern *, ctx, struct sockaddr *, addr, { #ifdef CONFIG_INET struct sock *sk = ctx->sk; + u32 flags = BIND_FROM_BPF; int err; - /* Binding to port can be expensive so it's prohibited in the helper. - * Only binding to IP is supported. 
- */ err = -EINVAL; if (addr_len < offsetofend(struct sockaddr, sa_family)) return err; if (addr->sa_family == AF_INET) { if (addr_len < sizeof(struct sockaddr_in)) return err; - if (((struct sockaddr_in *)addr)->sin_port != htons(0)) - return err; - return __inet_bind(sk, addr, addr_len, - BIND_FORCE_ADDRESS_NO_PORT); + if (((struct sockaddr_in *)addr)->sin_port == htons(0)) + flags |= BIND_FORCE_ADDRESS_NO_PORT; + return __inet_bind(sk, addr, addr_len, flags); #if IS_ENABLED(CONFIG_IPV6) } else if (addr->sa_family == AF_INET6) { if (addr_len < SIN6_LEN_RFC2133) return err; - if (((struct sockaddr_in6 *)addr)->sin6_port != htons(0)) - return err; + if (((struct sockaddr_in6 *)addr)->sin6_port == htons(0)) + flags |= BIND_FORCE_ADDRESS_NO_PORT; /* ipv6_bpf_stub cannot be NULL, since it's called from * bpf_cgroup_inet6_connect hook and ipv6 is already loaded */ - return ipv6_bpf_stub->inet6_bind(sk, addr, addr_len, - BIND_FORCE_ADDRESS_NO_PORT); + return ipv6_bpf_stub->inet6_bind(sk, addr, addr_len, flags); #endif /* CONFIG_IPV6 */ } #endif /* CONFIG_INET */ diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index 68e74b1b0f26..fcf0d12a407a 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -526,10 +526,12 @@ int __inet_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len, err = -EADDRINUSE; goto out_release_sock; } - err = BPF_CGROUP_RUN_PROG_INET4_POST_BIND(sk); - if (err) { - inet->inet_saddr = inet->inet_rcv_saddr = 0; - goto out_release_sock; + if (!(flags & BIND_FROM_BPF)) { + err = BPF_CGROUP_RUN_PROG_INET4_POST_BIND(sk); + if (err) { + inet->inet_saddr = inet->inet_rcv_saddr = 0; + goto out_release_sock; + } } } diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c index 552c2592b81c..771a462a8322 100644 --- a/net/ipv6/af_inet6.c +++ b/net/ipv6/af_inet6.c @@ -407,11 +407,13 @@ static int __inet6_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len, err = -EADDRINUSE; goto out; } - err = BPF_CGROUP_RUN_PROG_INET6_POST_BIND(sk); - if (err) { - sk->sk_ipv6only = saved_ipv6only; - inet_reset_saddr(sk); - goto out; + if (!(flags & BIND_FROM_BPF)) { + err = BPF_CGROUP_RUN_PROG_INET6_POST_BIND(sk); + if (err) { + sk->sk_ipv6only = saved_ipv6only; + inet_reset_saddr(sk); + goto out; + } } } diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index b3643e27e264..6e5e7caa3739 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -1994,10 +1994,11 @@ union bpf_attr { * * This helper works for IPv4 and IPv6, TCP and UDP sockets. The * domain (*addr*\ **->sa_family**) must be **AF_INET** (or - * **AF_INET6**). Looking for a free port to bind to can be - * expensive, therefore binding to port is not permitted by the - * helper: *addr*\ **->sin_port** (or **sin6_port**, respectively) - * must be set to zero. + * **AF_INET6**). It's advised to pass zero port (**sin_port** + * or **sin6_port**) which triggers IP_BIND_ADDRESS_NO_PORT-like + * behavior and lets the kernel efficiently pick up an unused + * port as long as 4-tuple is unique. Passing non-zero port might + * lead to degraded performance. * Return * 0 on success, or a negative error in case of failure. 
* diff --git a/tools/testing/selftests/bpf/prog_tests/connect_force_port.c b/tools/testing/selftests/bpf/prog_tests/connect_force_port.c new file mode 100644 index 000000000000..47fbb20cb6a6 --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/connect_force_port.c @@ -0,0 +1,115 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include <test_progs.h> +#include "cgroup_helpers.h" +#include "network_helpers.h" + +static int verify_port(int family, int fd, int expected) +{ + struct sockaddr_storage addr; + socklen_t len = sizeof(addr); + __u16 port; + + if (getsockname(fd, (struct sockaddr *)&addr, &len)) { + log_err("Failed to get server addr"); + return -1; + } + + if (family == AF_INET) + port = ((struct sockaddr_in *)&addr)->sin_port; + else + port = ((struct sockaddr_in6 *)&addr)->sin6_port; + + if (ntohs(port) != expected) { + log_err("Unexpected port %d, expected %d", ntohs(port), + expected); + return -1; + } + + return 0; +} + +static int run_test(int cgroup_fd, int server_fd, int family, int type) +{ + struct bpf_prog_load_attr attr = { + .prog_type = BPF_PROG_TYPE_CGROUP_SOCK_ADDR, + }; + struct bpf_object *obj; + int expected_port; + int prog_fd; + int err; + int fd; + + if (family == AF_INET) { + attr.file = "./connect_force_port4.o"; + attr.expected_attach_type = BPF_CGROUP_INET4_CONNECT; + expected_port = 22222; + } else { + attr.file = "./connect_force_port6.o"; + attr.expected_attach_type = BPF_CGROUP_INET6_CONNECT; + expected_port = 22223; + } + + err = bpf_prog_load_xattr(&attr, &obj, &prog_fd); + if (err) { + log_err("Failed to load BPF object"); + return -1; + } + + err = bpf_prog_attach(prog_fd, cgroup_fd, attr.expected_attach_type, + 0); + if (err) { + log_err("Failed to attach BPF program"); + goto close_bpf_object; + } + + fd = connect_to_fd(family, type, server_fd); + if (fd < 0) { + err = -1; + goto close_bpf_object; + } + + err = verify_port(family, fd, expected_port); + + close(fd); + +close_bpf_object: + bpf_object__close(obj); + return err; +} + +void test_connect_force_port(void) +{ + int server_fd, cgroup_fd; + + cgroup_fd = test__join_cgroup("/connect_force_port"); + if (CHECK_FAIL(cgroup_fd < 0)) + return; + + server_fd = start_server(AF_INET, SOCK_STREAM); + if (CHECK_FAIL(server_fd < 0)) + goto close_cgroup_fd; + CHECK_FAIL(run_test(cgroup_fd, server_fd, AF_INET, SOCK_STREAM)); + close(server_fd); + + server_fd = start_server(AF_INET6, SOCK_STREAM); + if (CHECK_FAIL(server_fd < 0)) + goto close_cgroup_fd; + CHECK_FAIL(run_test(cgroup_fd, server_fd, AF_INET6, SOCK_STREAM)); + close(server_fd); + + server_fd = start_server(AF_INET, SOCK_DGRAM); + if (CHECK_FAIL(server_fd < 0)) + goto close_cgroup_fd; + CHECK_FAIL(run_test(cgroup_fd, server_fd, AF_INET, SOCK_DGRAM)); + close(server_fd); + + server_fd = start_server(AF_INET6, SOCK_DGRAM); + if (CHECK_FAIL(server_fd < 0)) + goto close_cgroup_fd; + CHECK_FAIL(run_test(cgroup_fd, server_fd, AF_INET6, SOCK_DGRAM)); + close(server_fd); + +close_cgroup_fd: + close(cgroup_fd); +} diff --git a/tools/testing/selftests/bpf/progs/connect_force_port4.c b/tools/testing/selftests/bpf/progs/connect_force_port4.c new file mode 100644 index 000000000000..1b8eb34b2db0 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/connect_force_port4.c @@ -0,0 +1,28 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <string.h> + +#include <linux/bpf.h> +#include <linux/in.h> +#include <linux/in6.h> +#include <sys/socket.h> + +#include <bpf/bpf_helpers.h> +#include <bpf/bpf_endian.h> + +char _license[] SEC("license") = "GPL"; +int _version SEC("version") = 1; + +SEC("cgroup/connect4") +int _connect4(struct bpf_sock_addr *ctx) +{ + struct sockaddr_in sa = {}; 
+ + sa.sin_family = AF_INET; + sa.sin_port = bpf_htons(22222); + sa.sin_addr.s_addr = bpf_htonl(0x7f000001); /* 127.0.0.1 */ + + if (bpf_bind(ctx, (struct sockaddr *)&sa, sizeof(sa)) != 0) + return 0; + + return 1; +} diff --git a/tools/testing/selftests/bpf/progs/connect_force_port6.c b/tools/testing/selftests/bpf/progs/connect_force_port6.c new file mode 100644 index 000000000000..ae6f7d750b4c --- /dev/null +++ b/tools/testing/selftests/bpf/progs/connect_force_port6.c @@ -0,0 +1,28 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <string.h> + +#include <linux/bpf.h> +#include <linux/in.h> +#include <linux/in6.h> +#include <sys/socket.h> + +#include <bpf/bpf_helpers.h> +#include <bpf/bpf_endian.h> + +char _license[] SEC("license") = "GPL"; +int _version SEC("version") = 1; + +SEC("cgroup/connect6") +int _connect6(struct bpf_sock_addr *ctx) +{ + struct sockaddr_in6 sa = {}; + + sa.sin6_family = AF_INET6; + sa.sin6_port = bpf_htons(22223); + sa.sin6_addr.s6_addr32[3] = bpf_htonl(1); /* ::1 */ + + if (bpf_bind(ctx, (struct sockaddr *)&sa, sizeof(sa)) != 0) + return 0; + + return 1; +} -- cgit v1.2.3 From 138d0be35b141e09f6b267c6ae4094318d4e4491 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Sat, 9 May 2020 10:59:10 -0700 Subject: net: bpf: Add netlink and ipv6_route bpf_iter targets This patch adds netlink and ipv6_route targets, using the same seq_ops (except show() and minor changes for stop()) as for /proc/net/{netlink,ipv6_route}. The net namespace for these targets is the current net namespace at file open stage, similar to /proc/net/{netlink,ipv6_route}, which reference count the net namespace at seq_file open stage. Since modules are not supported for now, ipv6_route is supported only if IPV6 is built-in, i.e., not compiled as a module. The restriction can be lifted once modules are properly supported for bpf_iter. Signed-off-by: Yonghong Song Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20200509175910.2476329-1-yhs@fb.com --- fs/proc/proc_net.c | 19 +++++++++++ include/linux/proc_fs.h | 3 ++ net/ipv6/ip6_fib.c | 65 ++++++++++++++++++++++++++++++++++-- net/ipv6/route.c | 37 ++++++++++++++++++++ net/netlink/af_netlink.c | 87 ++++++++++++++++++++++++++++++++++++++++++++++-- 5 files changed, 207 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/fs/proc/proc_net.c b/fs/proc/proc_net.c index 4888c5224442..dba63b2429f0 100644 --- a/fs/proc/proc_net.c +++ b/fs/proc/proc_net.c @@ -98,6 +98,25 @@ static const struct proc_ops proc_net_seq_ops = { .proc_release = seq_release_net, }; +int bpf_iter_init_seq_net(void *priv_data) +{ +#ifdef CONFIG_NET_NS + struct seq_net_private *p = priv_data; + + p->net = get_net(current->nsproxy->net_ns); +#endif + return 0; +} + +void bpf_iter_fini_seq_net(void *priv_data) +{ +#ifdef CONFIG_NET_NS + struct seq_net_private *p = priv_data; + + put_net(p->net); +#endif +} + struct proc_dir_entry *proc_create_net_data(const char *name, umode_t mode, struct proc_dir_entry *parent, const struct seq_operations *ops, unsigned int state_size, void *data) diff --git a/include/linux/proc_fs.h b/include/linux/proc_fs.h index 45c05fd9c99d..03953c59807d 100644 --- a/include/linux/proc_fs.h +++ b/include/linux/proc_fs.h @@ -105,6 +105,9 @@ struct proc_dir_entry *proc_create_net_single_write(const char *name, umode_t mo void *data); extern struct pid *tgid_pidfd_to_pid(const struct file *file); +extern int bpf_iter_init_seq_net(void *priv_data); +extern void bpf_iter_fini_seq_net(void *priv_data); + #ifdef CONFIG_PROC_PID_ARCH_STATUS /* * The architecture which selects CONFIG_PROC_PID_ARCH_STATUS must diff --git 
a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index 46ed56719476..a1fcc0ca21af 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -2467,7 +2467,7 @@ void fib6_gc_cleanup(void) } #ifdef CONFIG_PROC_FS -static int ipv6_route_seq_show(struct seq_file *seq, void *v) +static int ipv6_route_native_seq_show(struct seq_file *seq, void *v) { struct fib6_info *rt = v; struct ipv6_route_iter *iter = seq->private; @@ -2625,7 +2625,7 @@ static bool ipv6_route_iter_active(struct ipv6_route_iter *iter) return w->node && !(w->state == FWS_U && w->node == w->root); } -static void ipv6_route_seq_stop(struct seq_file *seq, void *v) +static void ipv6_route_native_seq_stop(struct seq_file *seq, void *v) __releases(RCU_BH) { struct net *net = seq_file_net(seq); @@ -2637,6 +2637,67 @@ static void ipv6_route_seq_stop(struct seq_file *seq, void *v) rcu_read_unlock_bh(); } +#if IS_BUILTIN(CONFIG_IPV6) && defined(CONFIG_BPF_SYSCALL) +struct bpf_iter__ipv6_route { + __bpf_md_ptr(struct bpf_iter_meta *, meta); + __bpf_md_ptr(struct fib6_info *, rt); +}; + +static int ipv6_route_prog_seq_show(struct bpf_prog *prog, + struct bpf_iter_meta *meta, + void *v) +{ + struct bpf_iter__ipv6_route ctx; + + ctx.meta = meta; + ctx.rt = v; + return bpf_iter_run_prog(prog, &ctx); +} + +static int ipv6_route_seq_show(struct seq_file *seq, void *v) +{ + struct ipv6_route_iter *iter = seq->private; + struct bpf_iter_meta meta; + struct bpf_prog *prog; + int ret; + + meta.seq = seq; + prog = bpf_iter_get_info(&meta, false); + if (!prog) + return ipv6_route_native_seq_show(seq, v); + + ret = ipv6_route_prog_seq_show(prog, &meta, v); + iter->w.leaf = NULL; + + return ret; +} + +static void ipv6_route_seq_stop(struct seq_file *seq, void *v) +{ + struct bpf_iter_meta meta; + struct bpf_prog *prog; + + if (!v) { + meta.seq = seq; + prog = bpf_iter_get_info(&meta, true); + if (prog) + (void)ipv6_route_prog_seq_show(prog, &meta, v); + } + + ipv6_route_native_seq_stop(seq, v); +} +#else +static int ipv6_route_seq_show(struct seq_file *seq, void *v) +{ + return ipv6_route_native_seq_show(seq, v); +} + +static void ipv6_route_seq_stop(struct seq_file *seq, void *v) +{ + ipv6_route_native_seq_stop(seq, v); +} +#endif + const struct seq_operations ipv6_route_seq_ops = { .start = ipv6_route_seq_start, .next = ipv6_route_seq_next, diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 3912aac7854d..25f6d3e619d0 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -6393,6 +6393,30 @@ void __init ip6_route_init_special_entries(void) #endif } +#if IS_BUILTIN(CONFIG_IPV6) +#if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS) +DEFINE_BPF_ITER_FUNC(ipv6_route, struct bpf_iter_meta *meta, struct fib6_info *rt) + +static int __init bpf_iter_register(void) +{ + struct bpf_iter_reg reg_info = { + .target = "ipv6_route", + .seq_ops = &ipv6_route_seq_ops, + .init_seq_private = bpf_iter_init_seq_net, + .fini_seq_private = bpf_iter_fini_seq_net, + .seq_priv_size = sizeof(struct ipv6_route_iter), + }; + + return bpf_iter_reg_target(&reg_info); +} + +static void bpf_iter_unregister(void) +{ + bpf_iter_unreg_target("ipv6_route"); +} +#endif +#endif + int __init ip6_route_init(void) { int ret; @@ -6455,6 +6479,14 @@ int __init ip6_route_init(void) if (ret) goto out_register_late_subsys; +#if IS_BUILTIN(CONFIG_IPV6) +#if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS) + ret = bpf_iter_register(); + if (ret) + goto out_register_late_subsys; +#endif +#endif + for_each_possible_cpu(cpu) { struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu); 
@@ -6487,6 +6519,11 @@ out_kmem_cache: void ip6_route_cleanup(void) { +#if IS_BUILTIN(CONFIG_IPV6) +#if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS) + bpf_iter_unregister(); +#endif +#endif unregister_netdevice_notifier(&ip6_route_dev_notifier); unregister_pernet_subsys(&ip6_route_net_late_ops); fib6_rules_cleanup(); diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index 5ded01ca8b20..33cda9baa979 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -2596,7 +2596,7 @@ static void *netlink_seq_next(struct seq_file *seq, void *v, loff_t *pos) return __netlink_seq_next(seq); } -static void netlink_seq_stop(struct seq_file *seq, void *v) +static void netlink_native_seq_stop(struct seq_file *seq, void *v) { struct nl_seq_iter *iter = seq->private; @@ -2607,7 +2607,7 @@ static void netlink_seq_stop(struct seq_file *seq, void *v) } -static int netlink_seq_show(struct seq_file *seq, void *v) +static int netlink_native_seq_show(struct seq_file *seq, void *v) { if (v == SEQ_START_TOKEN) { seq_puts(seq, @@ -2634,6 +2634,68 @@ static int netlink_seq_show(struct seq_file *seq, void *v) return 0; } +#ifdef CONFIG_BPF_SYSCALL +struct bpf_iter__netlink { + __bpf_md_ptr(struct bpf_iter_meta *, meta); + __bpf_md_ptr(struct netlink_sock *, sk); +}; + +DEFINE_BPF_ITER_FUNC(netlink, struct bpf_iter_meta *meta, struct netlink_sock *sk) + +static int netlink_prog_seq_show(struct bpf_prog *prog, + struct bpf_iter_meta *meta, + void *v) +{ + struct bpf_iter__netlink ctx; + + meta->seq_num--; /* skip SEQ_START_TOKEN */ + ctx.meta = meta; + ctx.sk = nlk_sk((struct sock *)v); + return bpf_iter_run_prog(prog, &ctx); +} + +static int netlink_seq_show(struct seq_file *seq, void *v) +{ + struct bpf_iter_meta meta; + struct bpf_prog *prog; + + meta.seq = seq; + prog = bpf_iter_get_info(&meta, false); + if (!prog) + return netlink_native_seq_show(seq, v); + + if (v != SEQ_START_TOKEN) + return netlink_prog_seq_show(prog, &meta, v); + + return 0; +} + +static void netlink_seq_stop(struct seq_file *seq, void *v) +{ + struct bpf_iter_meta meta; + struct bpf_prog *prog; + + if (!v) { + meta.seq = seq; + prog = bpf_iter_get_info(&meta, true); + if (prog) + (void)netlink_prog_seq_show(prog, &meta, v); + } + + netlink_native_seq_stop(seq, v); +} +#else +static int netlink_seq_show(struct seq_file *seq, void *v) +{ + return netlink_native_seq_show(seq, v); +} + +static void netlink_seq_stop(struct seq_file *seq, void *v) +{ + netlink_native_seq_stop(seq, v); +} +#endif + static const struct seq_operations netlink_seq_ops = { .start = netlink_seq_start, .next = netlink_seq_next, @@ -2740,6 +2802,21 @@ static const struct rhashtable_params netlink_rhashtable_params = { .automatic_shrinking = true, }; +#if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS) +static int __init bpf_iter_register(void) +{ + struct bpf_iter_reg reg_info = { + .target = "netlink", + .seq_ops = &netlink_seq_ops, + .init_seq_private = bpf_iter_init_seq_net, + .fini_seq_private = bpf_iter_fini_seq_net, + .seq_priv_size = sizeof(struct nl_seq_iter), + }; + + return bpf_iter_reg_target(&reg_info); +} +#endif + static int __init netlink_proto_init(void) { int i; @@ -2748,6 +2825,12 @@ static int __init netlink_proto_init(void) if (err != 0) goto out; +#if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS) + err = bpf_iter_register(); + if (err) + goto out; +#endif + BUILD_BUG_ON(sizeof(struct netlink_skb_parms) > sizeof_field(struct sk_buff, cb)); nl_table = kcalloc(MAX_LINKS, sizeof(*nl_table), GFP_KERNEL); -- 
cgit v1.2.3 From 15172a46fa2796c1a1358a36babd31274716ed41 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Wed, 13 May 2020 11:02:19 -0700 Subject: bpf: net: Refactor bpf_iter target registration Currently bpf_iter_reg_target takes parameters from the target and allocates memory to save them. This is really not necessary, especially since in the future we may grow the information passed from targets to the bpf_iter manager. The patch refactors the code so that the target reg_info becomes static and the bpf_iter manager can just take a reference to it. Signed-off-by: Yonghong Song Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20200513180219.2949605-1-yhs@fb.com --- include/linux/bpf.h | 2 +- kernel/bpf/bpf_iter.c | 36 +++++++++++++++++------------------- kernel/bpf/map_iter.c | 18 +++++++++--------- kernel/bpf/task_iter.c | 30 ++++++++++++++++-------------- net/ipv6/route.c | 18 +++++++++--------- net/netlink/af_netlink.c | 18 +++++++++--------- 6 files changed, 61 insertions(+), 61 deletions(-) (limited to 'net') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index ab94dfd8826f..6fa773e2d1bf 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1153,7 +1153,7 @@ struct bpf_iter_meta { u64 seq_num; }; -int bpf_iter_reg_target(struct bpf_iter_reg *reg_info); +int bpf_iter_reg_target(const struct bpf_iter_reg *reg_info); void bpf_iter_unreg_target(const char *target); bool bpf_iter_prog_supported(struct bpf_prog *prog); int bpf_iter_link_attach(const union bpf_attr *attr, struct bpf_prog *prog); diff --git a/kernel/bpf/bpf_iter.c b/kernel/bpf/bpf_iter.c index 0a45a6cdfabd..051fb8cab62a 100644 --- a/kernel/bpf/bpf_iter.c +++ b/kernel/bpf/bpf_iter.c @@ -8,11 +8,7 @@ struct bpf_iter_target_info { struct list_head list; - const char *target; - const struct seq_operations *seq_ops; - bpf_iter_init_seq_priv_t init_seq_private; - bpf_iter_fini_seq_priv_t fini_seq_private; - u32 seq_priv_size; + const struct bpf_iter_reg *reg_info; u32 btf_id; /* cached value */ }; @@ -222,8 +218,8 @@ static int iter_release(struct inode *inode, struct file *file) iter_priv = container_of(seq->private, struct bpf_iter_priv_data, target_private); - if (iter_priv->tinfo->fini_seq_private) - iter_priv->tinfo->fini_seq_private(seq->private); + if (iter_priv->tinfo->reg_info->fini_seq_private) + iter_priv->tinfo->reg_info->fini_seq_private(seq->private); bpf_prog_put(iter_priv->prog); seq->private = iter_priv; @@ -238,7 +234,12 @@ const struct file_operations bpf_iter_fops = { .release = iter_release, }; -int bpf_iter_reg_target(struct bpf_iter_reg *reg_info) +/* The argument reg_info will be cached in bpf_iter_target_info. + * The common practice is to declare target reg_info as + * a const static variable and passed as an argument to + * bpf_iter_reg_target(). 
+ */ +int bpf_iter_reg_target(const struct bpf_iter_reg *reg_info) { struct bpf_iter_target_info *tinfo; @@ -246,11 +247,7 @@ int bpf_iter_reg_target(struct bpf_iter_reg *reg_info) if (!tinfo) return -ENOMEM; - tinfo->target = reg_info->target; - tinfo->seq_ops = reg_info->seq_ops; - tinfo->init_seq_private = reg_info->init_seq_private; - tinfo->fini_seq_private = reg_info->fini_seq_private; - tinfo->seq_priv_size = reg_info->seq_priv_size; + tinfo->reg_info = reg_info; INIT_LIST_HEAD(&tinfo->list); mutex_lock(&targets_mutex); @@ -267,7 +264,7 @@ void bpf_iter_unreg_target(const char *target) mutex_lock(&targets_mutex); list_for_each_entry(tinfo, &targets, list) { - if (!strcmp(target, tinfo->target)) { + if (!strcmp(target, tinfo->reg_info->target)) { list_del(&tinfo->list); kfree(tinfo); found = true; @@ -303,7 +300,7 @@ bool bpf_iter_prog_supported(struct bpf_prog *prog) supported = true; break; } - if (!strcmp(attach_fname + prefix_len, tinfo->target)) { + if (!strcmp(attach_fname + prefix_len, tinfo->reg_info->target)) { cache_btf_id(tinfo, prog); supported = true; break; @@ -431,15 +428,16 @@ static int prepare_seq_file(struct file *file, struct bpf_iter_link *link) tinfo = link->tinfo; total_priv_dsize = offsetof(struct bpf_iter_priv_data, target_private) + - tinfo->seq_priv_size; - priv_data = __seq_open_private(file, tinfo->seq_ops, total_priv_dsize); + tinfo->reg_info->seq_priv_size; + priv_data = __seq_open_private(file, tinfo->reg_info->seq_ops, + total_priv_dsize); if (!priv_data) { err = -ENOMEM; goto release_prog; } - if (tinfo->init_seq_private) { - err = tinfo->init_seq_private(priv_data->target_private); + if (tinfo->reg_info->init_seq_private) { + err = tinfo->reg_info->init_seq_private(priv_data->target_private); if (err) goto release_seq_file; } diff --git a/kernel/bpf/map_iter.c b/kernel/bpf/map_iter.c index 8162e0c00b9f..c6216a5fe56e 100644 --- a/kernel/bpf/map_iter.c +++ b/kernel/bpf/map_iter.c @@ -81,17 +81,17 @@ static const struct seq_operations bpf_map_seq_ops = { .show = bpf_map_seq_show, }; +static const struct bpf_iter_reg bpf_map_reg_info = { + .target = "bpf_map", + .seq_ops = &bpf_map_seq_ops, + .init_seq_private = NULL, + .fini_seq_private = NULL, + .seq_priv_size = sizeof(struct bpf_iter_seq_map_info), +}; + static int __init bpf_map_iter_init(void) { - struct bpf_iter_reg reg_info = { - .target = "bpf_map", - .seq_ops = &bpf_map_seq_ops, - .init_seq_private = NULL, - .fini_seq_private = NULL, - .seq_priv_size = sizeof(struct bpf_iter_seq_map_info), - }; - - return bpf_iter_reg_target(&reg_info); + return bpf_iter_reg_target(&bpf_map_reg_info); } late_initcall(bpf_map_iter_init); diff --git a/kernel/bpf/task_iter.c b/kernel/bpf/task_iter.c index aeed662d8451..bd7bfd83d9e0 100644 --- a/kernel/bpf/task_iter.c +++ b/kernel/bpf/task_iter.c @@ -306,22 +306,24 @@ static const struct seq_operations task_file_seq_ops = { .show = task_file_seq_show, }; +static const struct bpf_iter_reg task_reg_info = { + .target = "task", + .seq_ops = &task_seq_ops, + .init_seq_private = init_seq_pidns, + .fini_seq_private = fini_seq_pidns, + .seq_priv_size = sizeof(struct bpf_iter_seq_task_info), +}; + +static const struct bpf_iter_reg task_file_reg_info = { + .target = "task_file", + .seq_ops = &task_file_seq_ops, + .init_seq_private = init_seq_pidns, + .fini_seq_private = fini_seq_pidns, + .seq_priv_size = sizeof(struct bpf_iter_seq_task_file_info), +}; + static int __init task_iter_init(void) { - struct bpf_iter_reg task_file_reg_info = { - .target = "task_file", - .seq_ops = 
&task_file_seq_ops, - .init_seq_private = init_seq_pidns, - .fini_seq_private = fini_seq_pidns, - .seq_priv_size = sizeof(struct bpf_iter_seq_task_file_info), - }; - struct bpf_iter_reg task_reg_info = { - .target = "task", - .seq_ops = &task_seq_ops, - .init_seq_private = init_seq_pidns, - .fini_seq_private = fini_seq_pidns, - .seq_priv_size = sizeof(struct bpf_iter_seq_task_info), - }; int ret; ret = bpf_iter_reg_target(&task_reg_info); diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 25f6d3e619d0..6ad2fa51a23a 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -6397,17 +6397,17 @@ void __init ip6_route_init_special_entries(void) #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS) DEFINE_BPF_ITER_FUNC(ipv6_route, struct bpf_iter_meta *meta, struct fib6_info *rt) +static const struct bpf_iter_reg ipv6_route_reg_info = { + .target = "ipv6_route", + .seq_ops = &ipv6_route_seq_ops, + .init_seq_private = bpf_iter_init_seq_net, + .fini_seq_private = bpf_iter_fini_seq_net, + .seq_priv_size = sizeof(struct ipv6_route_iter), +}; + static int __init bpf_iter_register(void) { - struct bpf_iter_reg reg_info = { - .target = "ipv6_route", - .seq_ops = &ipv6_route_seq_ops, - .init_seq_private = bpf_iter_init_seq_net, - .fini_seq_private = bpf_iter_fini_seq_net, - .seq_priv_size = sizeof(struct ipv6_route_iter), - }; - - return bpf_iter_reg_target(&reg_info); + return bpf_iter_reg_target(&ipv6_route_reg_info); } static void bpf_iter_unregister(void) diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index 33cda9baa979..839827227e98 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -2803,17 +2803,17 @@ static const struct rhashtable_params netlink_rhashtable_params = { }; #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS) +static const struct bpf_iter_reg netlink_reg_info = { + .target = "netlink", + .seq_ops = &netlink_seq_ops, + .init_seq_private = bpf_iter_init_seq_net, + .fini_seq_private = bpf_iter_fini_seq_net, + .seq_priv_size = sizeof(struct nl_seq_iter), +}; + static int __init bpf_iter_register(void) { - struct bpf_iter_reg reg_info = { - .target = "netlink", - .seq_ops = &netlink_seq_ops, - .init_seq_private = bpf_iter_init_seq_net, - .fini_seq_private = bpf_iter_fini_seq_net, - .seq_priv_size = sizeof(struct nl_seq_iter), - }; - - return bpf_iter_reg_target(&reg_info); + return bpf_iter_reg_target(&netlink_reg_info); } #endif -- cgit v1.2.3 From ab2ee4fcb9d61fd57db70db694adbcf54662bd80 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Wed, 13 May 2020 11:02:20 -0700 Subject: bpf: Change func bpf_iter_unreg_target() signature Change func bpf_iter_unreg_target() parameter from target name to target reg_info, similar to bpf_iter_reg_target(). 
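A caller-visible example of the change, taken from the net/ipv6/route.c hunk below:

    - bpf_iter_unreg_target("ipv6_route");
    + bpf_iter_unreg_target(&ipv6_route_reg_info);

Passing the reg_info also lets the core match targets by pointer comparison instead of a strcmp() on the target name.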
Signed-off-by: Yonghong Song Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20200513180220.2949737-1-yhs@fb.com --- include/linux/bpf.h | 2 +- kernel/bpf/bpf_iter.c | 4 ++-- net/ipv6/route.c | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 6fa773e2d1bf..534174eca86b 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1154,7 +1154,7 @@ struct bpf_iter_meta { }; int bpf_iter_reg_target(const struct bpf_iter_reg *reg_info); -void bpf_iter_unreg_target(const char *target); +void bpf_iter_unreg_target(const struct bpf_iter_reg *reg_info); bool bpf_iter_prog_supported(struct bpf_prog *prog); int bpf_iter_link_attach(const union bpf_attr *attr, struct bpf_prog *prog); int bpf_iter_new_fd(struct bpf_link *link); diff --git a/kernel/bpf/bpf_iter.c b/kernel/bpf/bpf_iter.c index 051fb8cab62a..644f8626b2c0 100644 --- a/kernel/bpf/bpf_iter.c +++ b/kernel/bpf/bpf_iter.c @@ -257,14 +257,14 @@ int bpf_iter_reg_target(const struct bpf_iter_reg *reg_info) return 0; } -void bpf_iter_unreg_target(const char *target) +void bpf_iter_unreg_target(const struct bpf_iter_reg *reg_info) { struct bpf_iter_target_info *tinfo; bool found = false; mutex_lock(&targets_mutex); list_for_each_entry(tinfo, &targets, list) { - if (!strcmp(target, tinfo->reg_info->target)) { + if (reg_info == tinfo->reg_info) { list_del(&tinfo->list); kfree(tinfo); found = true; diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 6ad2fa51a23a..22bf4e36c093 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -6412,7 +6412,7 @@ static int __init bpf_iter_register(void) static void bpf_iter_unregister(void) { - bpf_iter_unreg_target("ipv6_route"); + bpf_iter_unreg_target(&ipv6_route_reg_info); } #endif #endif -- cgit v1.2.3 From 3c32cc1bceba8a1755dc35cd97516f6c67856844 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Wed, 13 May 2020 11:02:21 -0700 Subject: bpf: Enable bpf_iter targets registering ctx argument types Commit b121b341e598 ("bpf: Add PTR_TO_BTF_ID_OR_NULL support") adds a field btf_id_or_null_non0_off to the bpf_prog->aux structure to indicate that the first ctx argument is PTR_TO_BTF_ID reg_type and all others are PTR_TO_BTF_ID_OR_NULL. This approach does not really scale if we have other, different reg types in the future, e.g., a pointer to a buffer. This patch enables bpf_iter targets to register ctx argument reg types which may be different from the default one. For example, for pointers to structures, the default reg_type is PTR_TO_BTF_ID for a tracing program. The target can register a particular pointer type as PTR_TO_BTF_ID_OR_NULL, which the verifier can then use to enforce proper NULL checks on accesses. 
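For example, a target can now declare a per-argument reg_type in its registration; a condensed form of the kernel/bpf/task_iter.c hunk below:

    static const struct bpf_iter_reg task_reg_info = {
            .target = "task",
            ...
            .ctx_arg_info_size = 1,
            .ctx_arg_info = {
                    { offsetof(struct bpf_iter__task, task),
                      PTR_TO_BTF_ID_OR_NULL },
            },
    };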
Signed-off-by: Yonghong Song Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20200513180221.2949882-1-yhs@fb.com --- include/linux/bpf.h | 12 +++++++++++- include/net/ip6_fib.h | 7 +++++++ kernel/bpf/bpf_iter.c | 5 +++++ kernel/bpf/btf.c | 15 ++++++++++----- kernel/bpf/map_iter.c | 5 +++++ kernel/bpf/task_iter.c | 12 ++++++++++++ kernel/bpf/verifier.c | 1 - net/ipv6/ip6_fib.c | 5 ----- net/ipv6/route.c | 5 +++++ net/netlink/af_netlink.c | 5 +++++ 10 files changed, 60 insertions(+), 12 deletions(-) (limited to 'net') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 534174eca86b..c45d198ac38c 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -643,6 +643,12 @@ struct bpf_jit_poke_descriptor { u16 reason; }; +/* reg_type info for ctx arguments */ +struct bpf_ctx_arg_aux { + u32 offset; + enum bpf_reg_type reg_type; +}; + struct bpf_prog_aux { atomic64_t refcnt; u32 used_map_cnt; @@ -654,12 +660,13 @@ struct bpf_prog_aux { u32 func_cnt; /* used by non-func prog as the number of func progs */ u32 func_idx; /* 0 for non-func prog, the index in func array for func prog */ u32 attach_btf_id; /* in-kernel BTF type id to attach to */ + u32 ctx_arg_info_size; + const struct bpf_ctx_arg_aux *ctx_arg_info; struct bpf_prog *linked_prog; bool verifier_zext; /* Zero extensions has been inserted by verifier. */ bool offload_requested; bool attach_btf_trace; /* true if attaching to BTF-enabled raw tp */ bool func_proto_unreliable; - bool btf_id_or_null_non0_off; enum bpf_tramp_prog_type trampoline_prog_type; struct bpf_trampoline *trampoline; struct hlist_node tramp_hlist; @@ -1139,12 +1146,15 @@ int bpf_obj_get_user(const char __user *pathname, int flags); typedef int (*bpf_iter_init_seq_priv_t)(void *private_data); typedef void (*bpf_iter_fini_seq_priv_t)(void *private_data); +#define BPF_ITER_CTX_ARG_MAX 2 struct bpf_iter_reg { const char *target; const struct seq_operations *seq_ops; bpf_iter_init_seq_priv_t init_seq_private; bpf_iter_fini_seq_priv_t fini_seq_private; u32 seq_priv_size; + u32 ctx_arg_info_size; + struct bpf_ctx_arg_aux ctx_arg_info[BPF_ITER_CTX_ARG_MAX]; }; struct bpf_iter_meta { diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h index 80262d2980f5..870b646c5797 100644 --- a/include/net/ip6_fib.h +++ b/include/net/ip6_fib.h @@ -540,6 +540,13 @@ static inline bool fib6_metric_locked(struct fib6_info *f6i, int metric) return !!(f6i->fib6_metrics->metrics[RTAX_LOCK - 1] & (1 << metric)); } +#if IS_BUILTIN(CONFIG_IPV6) && defined(CONFIG_BPF_SYSCALL) +struct bpf_iter__ipv6_route { + __bpf_md_ptr(struct bpf_iter_meta *, meta); + __bpf_md_ptr(struct fib6_info *, rt); +}; +#endif + #ifdef CONFIG_IPV6_MULTIPLE_TABLES static inline bool fib6_has_custom_rules(const struct net *net) { diff --git a/kernel/bpf/bpf_iter.c b/kernel/bpf/bpf_iter.c index 644f8626b2c0..dd612b80b9fe 100644 --- a/kernel/bpf/bpf_iter.c +++ b/kernel/bpf/bpf_iter.c @@ -308,6 +308,11 @@ bool bpf_iter_prog_supported(struct bpf_prog *prog) } mutex_unlock(&targets_mutex); + if (supported) { + prog->aux->ctx_arg_info_size = tinfo->reg_info->ctx_arg_info_size; + prog->aux->ctx_arg_info = tinfo->reg_info->ctx_arg_info; + } + return supported; } diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index dcd233139294..58c9af1d4808 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -3694,7 +3694,7 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type, struct bpf_verifier_log *log = info->log; const struct btf_param *args; u32 nr_args, arg; - int 
ret; + int i, ret; if (off % 8) { bpf_log(log, "func '%s' offset %d is not multiple of 8\n", @@ -3790,10 +3790,15 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type, return true; /* this is a pointer to another type */ - if (off != 0 && prog->aux->btf_id_or_null_non0_off) - info->reg_type = PTR_TO_BTF_ID_OR_NULL; - else - info->reg_type = PTR_TO_BTF_ID; + info->reg_type = PTR_TO_BTF_ID; + for (i = 0; i < prog->aux->ctx_arg_info_size; i++) { + const struct bpf_ctx_arg_aux *ctx_arg_info = &prog->aux->ctx_arg_info[i]; + + if (ctx_arg_info->offset == off) { + info->reg_type = ctx_arg_info->reg_type; + break; + } + } if (tgt_prog) { ret = btf_translate_to_vmlinux(log, btf, t, tgt_prog->type, arg); diff --git a/kernel/bpf/map_iter.c b/kernel/bpf/map_iter.c index c6216a5fe56e..c69071e334bf 100644 --- a/kernel/bpf/map_iter.c +++ b/kernel/bpf/map_iter.c @@ -87,6 +87,11 @@ static const struct bpf_iter_reg bpf_map_reg_info = { .init_seq_private = NULL, .fini_seq_private = NULL, .seq_priv_size = sizeof(struct bpf_iter_seq_map_info), + .ctx_arg_info_size = 1, + .ctx_arg_info = { + { offsetof(struct bpf_iter__bpf_map, map), + PTR_TO_BTF_ID_OR_NULL }, + }, }; static int __init bpf_map_iter_init(void) diff --git a/kernel/bpf/task_iter.c b/kernel/bpf/task_iter.c index bd7bfd83d9e0..a9b7264dda08 100644 --- a/kernel/bpf/task_iter.c +++ b/kernel/bpf/task_iter.c @@ -312,6 +312,11 @@ static const struct bpf_iter_reg task_reg_info = { .init_seq_private = init_seq_pidns, .fini_seq_private = fini_seq_pidns, .seq_priv_size = sizeof(struct bpf_iter_seq_task_info), + .ctx_arg_info_size = 1, + .ctx_arg_info = { + { offsetof(struct bpf_iter__task, task), + PTR_TO_BTF_ID_OR_NULL }, + }, }; static const struct bpf_iter_reg task_file_reg_info = { @@ -320,6 +325,13 @@ static const struct bpf_iter_reg task_file_reg_info = { .init_seq_private = init_seq_pidns, .fini_seq_private = fini_seq_pidns, .seq_priv_size = sizeof(struct bpf_iter_seq_task_file_info), + .ctx_arg_info_size = 2, + .ctx_arg_info = { + { offsetof(struct bpf_iter__task_file, task), + PTR_TO_BTF_ID_OR_NULL }, + { offsetof(struct bpf_iter__task_file, file), + PTR_TO_BTF_ID_OR_NULL }, + }, }; static int __init task_iter_init(void) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 2a1826c76bb6..a3f2af756fd6 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -10652,7 +10652,6 @@ static int check_attach_btf_id(struct bpf_verifier_env *env) prog->aux->attach_func_proto = t; if (!bpf_iter_prog_supported(prog)) return -EINVAL; - prog->aux->btf_id_or_null_non0_off = true; ret = btf_distill_func_proto(&env->log, btf, t, tname, &fmodel); return ret; diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index a1fcc0ca21af..250ff52c674e 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -2638,11 +2638,6 @@ static void ipv6_route_native_seq_stop(struct seq_file *seq, void *v) } #if IS_BUILTIN(CONFIG_IPV6) && defined(CONFIG_BPF_SYSCALL) -struct bpf_iter__ipv6_route { - __bpf_md_ptr(struct bpf_iter_meta *, meta); - __bpf_md_ptr(struct fib6_info *, rt); -}; - static int ipv6_route_prog_seq_show(struct bpf_prog *prog, struct bpf_iter_meta *meta, void *v) diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 22bf4e36c093..22e56465f14d 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -6403,6 +6403,11 @@ static const struct bpf_iter_reg ipv6_route_reg_info = { .init_seq_private = bpf_iter_init_seq_net, .fini_seq_private = bpf_iter_fini_seq_net, .seq_priv_size = sizeof(struct ipv6_route_iter), + .ctx_arg_info_size = 1, 
+ .ctx_arg_info = { + { offsetof(struct bpf_iter__ipv6_route, rt), + PTR_TO_BTF_ID_OR_NULL }, + }, }; static int __init bpf_iter_register(void) diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index 839827227e98..4f2c3b14ddbf 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -2809,6 +2809,11 @@ static const struct bpf_iter_reg netlink_reg_info = { .init_seq_private = bpf_iter_init_seq_net, .fini_seq_private = bpf_iter_fini_seq_net, .seq_priv_size = sizeof(struct nl_seq_iter), + .ctx_arg_info_size = 1, + .ctx_arg_info = { + { offsetof(struct bpf_iter__netlink, sk), + PTR_TO_BTF_ID_OR_NULL }, + }, }; static int __init bpf_iter_register(void) -- cgit v1.2.3 From 7aebfa1b3885b5aa29fcb4a596d0485ac463bbe8 Mon Sep 17 00:00:00 2001 From: Andrey Ignatov Date: Wed, 13 May 2020 18:50:27 -0700 Subject: bpf: Support narrow loads from bpf_sock_addr.user_port bpf_sock_addr.user_port supports only 4-byte loads, and it leads to ugly code in BPF programs, like: volatile __u32 user_port = ctx->user_port; __u16 port = bpf_ntohs(user_port); The volatile is needed since otherwise clang may optimize the load to be 2-byte, which is rejected by the verifier. Add support for 1- and 2-byte loads the same way as it's supported for other fields in bpf_sock_addr, like user_ip4, msg_src_ip4, etc. Signed-off-by: Andrey Ignatov Signed-off-by: Alexei Starovoitov Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/c1e983f4c17573032601d0b2b1f9d1274f24bc16.1589420814.git.rdna@fb.com --- include/uapi/linux/bpf.h | 2 +- net/core/filter.c | 15 +++++++-------- tools/include/uapi/linux/bpf.h | 2 +- 3 files changed, 9 insertions(+), 10 deletions(-) (limited to 'net') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index bfb31c1be219..85cfdffde182 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -3728,7 +3728,7 @@ struct bpf_sock_addr { __u32 user_ip6[4]; /* Allows 1,2,4,8-byte read and 4,8-byte write. * Stored in network byte order. */ - __u32 user_port; /* Allows 4-byte read and write. + __u32 user_port; /* Allows 1,2,4-byte read and 4-byte write. 
* Stored in network byte order */ __u32 family; /* Allows 4-byte read, but no write */ diff --git a/net/core/filter.c b/net/core/filter.c index da0634979f53..1fe8c0c2d408 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -7029,6 +7029,7 @@ static bool sock_addr_is_valid_access(int off, int size, case bpf_ctx_range(struct bpf_sock_addr, msg_src_ip4): case bpf_ctx_range_till(struct bpf_sock_addr, msg_src_ip6[0], msg_src_ip6[3]): + case bpf_ctx_range(struct bpf_sock_addr, user_port): if (type == BPF_READ) { bpf_ctx_record_field_size(info, size_default); @@ -7059,10 +7060,6 @@ static bool sock_addr_is_valid_access(int off, int size, return false; } break; - case bpf_ctx_range(struct bpf_sock_addr, user_port): - if (size != size_default) - return false; - break; case offsetof(struct bpf_sock_addr, sk): if (type != BPF_READ) return false; @@ -7958,8 +7955,8 @@ static u32 sock_addr_convert_ctx_access(enum bpf_access_type type, struct bpf_insn *insn_buf, struct bpf_prog *prog, u32 *target_size) { + int off, port_size = sizeof_field(struct sockaddr_in6, sin6_port); struct bpf_insn *insn = insn_buf; - int off; switch (si->off) { case offsetof(struct bpf_sock_addr, user_family): @@ -7994,9 +7991,11 @@ static u32 sock_addr_convert_ctx_access(enum bpf_access_type type, offsetof(struct sockaddr_in6, sin6_port)); BUILD_BUG_ON(sizeof_field(struct sockaddr_in, sin_port) != sizeof_field(struct sockaddr_in6, sin6_port)); - SOCK_ADDR_LOAD_OR_STORE_NESTED_FIELD(struct bpf_sock_addr_kern, - struct sockaddr_in6, uaddr, - sin6_port, tmp_reg); + /* Account for sin6_port being smaller than user_port. */ + port_size = min(port_size, BPF_LDST_BYTES(si)); + SOCK_ADDR_LOAD_OR_STORE_NESTED_FIELD_SIZE_OFF( + struct bpf_sock_addr_kern, struct sockaddr_in6, uaddr, + sin6_port, bytes_to_bpf_size(port_size), 0, tmp_reg); break; case offsetof(struct bpf_sock_addr, family): diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index bfb31c1be219..85cfdffde182 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -3728,7 +3728,7 @@ struct bpf_sock_addr { __u32 user_ip6[4]; /* Allows 1,2,4,8-byte read and 4,8-byte write. * Stored in network byte order. */ - __u32 user_port; /* Allows 4-byte read and write. + __u32 user_port; /* Allows 1,2,4-byte read and 4-byte write. * Stored in network byte order */ __u32 family; /* Allows 4-byte read, but no write */ -- cgit v1.2.3 From d56c2f95adb3d401bf982b6cf8fc4bb6d2f7acdd Mon Sep 17 00:00:00 2001 From: Andrey Ignatov Date: Thu, 14 May 2020 13:03:45 -0700 Subject: bpf: Allow sk lookup helpers in cgroup skb Currently sk lookup helpers are allowed in tc, xdp, sk skb, and cgroup sock_addr programs. But they would be useful in cgroup skb as well so that, for example, a cgroup skb ingress program can look up the peer socket a packet comes from on the same host and decide whether to allow or deny this packet based on the properties of that socket, e.g. the cgroup that the peer socket belongs to (see the sketch after the list below). Allow the following sk lookup helpers in cgroup skb: * bpf_sk_lookup_tcp; * bpf_sk_lookup_udp; * bpf_sk_release; * bpf_skc_lookup_tcp. 
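A hypothetical ingress program using the newly allowed helpers could look roughly like the sketch below (the tuple setup is elided and all names are illustrative, not part of this patch):

    SEC("cgroup_skb/ingress")
    int ingress(struct __sk_buff *skb)
    {
            struct bpf_sock_tuple tuple = {};
            struct bpf_sock *sk;

            /* ... fill tuple.ipv4 from the skb headers (elided) ... */
            sk = bpf_sk_lookup_tcp(skb, &tuple, sizeof(tuple.ipv4),
                                   BPF_F_CURRENT_NETNS, 0);
            if (!sk)
                    return 1; /* no local peer socket, allow */
            /* ... inspect properties of sk and decide ... */
            bpf_sk_release(sk);
            return 1; /* 1 = allow, 0 = deny */
    }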
Signed-off-by: Andrey Ignatov
Signed-off-by: Alexei Starovoitov
Acked-by: Yonghong Song
Link: https://lore.kernel.org/bpf/f8c7ee280f1582b586629436d777b6db00597d63.1589486450.git.rdna@fb.com
---
 net/core/filter.c | 8 ++++++++
 1 file changed, 8 insertions(+)

(limited to 'net')

diff --git a/net/core/filter.c b/net/core/filter.c
index 1fe8c0c2d408..9c3eada5c86c 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -6159,6 +6159,14 @@ cg_skb_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 		return &bpf_skb_cgroup_id_proto;
 #endif
 #ifdef CONFIG_INET
+	case BPF_FUNC_sk_lookup_tcp:
+		return &bpf_sk_lookup_tcp_proto;
+	case BPF_FUNC_sk_lookup_udp:
+		return &bpf_sk_lookup_udp_proto;
+	case BPF_FUNC_sk_release:
+		return &bpf_sk_release_proto;
+	case BPF_FUNC_skc_lookup_tcp:
+		return &bpf_skc_lookup_tcp_proto;
 	case BPF_FUNC_tcp_sock:
 		return &bpf_tcp_sock_proto;
 	case BPF_FUNC_get_listener_sock:
-- cgit v1.2.3

From 06d3e4c9f11afc849dc201ecf9ef7a43eeb1dddd Mon Sep 17 00:00:00 2001
From: Andrey Ignatov
Date: Thu, 14 May 2020 13:03:46 -0700
Subject: bpf: Allow skb_ancestor_cgroup_id helper in cgroup skb

cgroup skb programs already can use bpf_skb_cgroup_id. Allow
bpf_skb_ancestor_cgroup_id as well so that container policies can be
implemented for a container that can have sub-cgroups dynamically
created, but policies should still be implemented based on cgroup id
of container itself not on an id of a sub-cgroup.

Signed-off-by: Andrey Ignatov
Signed-off-by: Alexei Starovoitov
Acked-by: Yonghong Song
Link: https://lore.kernel.org/bpf/8874194d6041eba190356453ea9f6071edf5f658.1589486450.git.rdna@fb.com
---
 net/core/filter.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'net')

diff --git a/net/core/filter.c b/net/core/filter.c
index 9c3eada5c86c..a47dc5b9dad4 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -6157,6 +6157,8 @@ cg_skb_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 #ifdef CONFIG_SOCK_CGROUP_DATA
 	case BPF_FUNC_skb_cgroup_id:
 		return &bpf_skb_cgroup_id_proto;
+	case BPF_FUNC_skb_ancestor_cgroup_id:
+		return &bpf_skb_ancestor_cgroup_id_proto;
 #endif
 #ifdef CONFIG_INET
 	case BPF_FUNC_sk_lookup_tcp:
-- cgit v1.2.3

From f307fa2cb4c935f7f1ff0aeb880c7b44fb9a642b Mon Sep 17 00:00:00 2001
From: Andrey Ignatov
Date: Thu, 14 May 2020 13:03:47 -0700
Subject: bpf: Introduce bpf_sk_{,ancestor_}cgroup_id helpers

With having ability to lookup sockets in cgroup skb programs it
becomes useful to access cgroup id of retrieved sockets so that
policies can be implemented based on origin cgroup of such socket.

For example, a container running in a cgroup can have cgroup skb
ingress program that can lookup peer socket that is sending packets to
a process inside the container and decide whether those packets should
be allowed or denied based on cgroup id of the peer.

More specifically such ingress program can implement intra-host policy
"allow incoming packets only from this same container and not from any
other container on same host" w/o relying on source IP addresses since
quite often it can be the case that containers share same IP address
on the host.

Introduce two new helpers for this use-case: bpf_sk_cgroup_id() and
bpf_sk_ancestor_cgroup_id().

These helpers are similar to existing bpf_skb_{,ancestor_}cgroup_id
helpers with the only difference that sk is used to get cgroup id
instead of skb, and share code with them.

See documentation in UAPI for more details.
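
For illustration only, not part of the patch: the intra-host policy
described above might look roughly like this inside the lookup sketch
shown earlier in this series, where ALLOWED_CGROUP_ID and
CONTAINER_LEVEL are hypothetical values the loader would supply (for
instance via a map or a patched global), not anything this patch
defines:

	__u64 peer_cgid;

	sk = bpf_sk_lookup_tcp(skb, &tuple, sizeof(tuple.ipv4),
			       BPF_F_CURRENT_NETNS, 0);
	if (!sk)
		return 0;			/* unknown peer: drop */

	peer_cgid = bpf_sk_cgroup_id(sk);
	/* Or, to treat any sub-cgroup of the container as "same":
	 * peer_cgid = bpf_sk_ancestor_cgroup_id(sk, CONTAINER_LEVEL);
	 */
	bpf_sk_release(sk);

	return peer_cgid == ALLOWED_CGROUP_ID;	/* 1 = allow, 0 = drop */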
Signed-off-by: Andrey Ignatov
Signed-off-by: Alexei Starovoitov
Acked-by: Yonghong Song
Link: https://lore.kernel.org/bpf/f5884981249ce911f63e9b57ecd5d7d19154ff39.1589486450.git.rdna@fb.com
---
 include/uapi/linux/bpf.h       | 36 ++++++++++++++++++++++++-
 net/core/filter.c              | 60 +++++++++++++++++++++++++++++++++++-------
 tools/include/uapi/linux/bpf.h | 36 ++++++++++++++++++++++++-
 3 files changed, 121 insertions(+), 11 deletions(-)

(limited to 'net')

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 85cfdffde182..146c742f1d49 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -3121,6 +3121,38 @@ union bpf_attr {
 *		0 on success, or a negative error in case of failure:
 *
 *		**-EOVERFLOW** if an overflow happened: The same object will be tried again.
+ *
+ * u64 bpf_sk_cgroup_id(struct bpf_sock *sk)
+ *	Description
+ *		Return the cgroup v2 id of the socket *sk*.
+ *
+ *		*sk* must be a non-**NULL** pointer to a full socket, e.g. one
+ *		returned from **bpf_sk_lookup_xxx**\ (),
+ *		**bpf_sk_fullsock**\ (), etc. The format of returned id is
+ *		same as in **bpf_skb_cgroup_id**\ ().
+ *
+ *		This helper is available only if the kernel was compiled with
+ *		the **CONFIG_SOCK_CGROUP_DATA** configuration option.
+ *	Return
+ *		The id is returned or 0 in case the id could not be retrieved.
+ *
+ * u64 bpf_sk_ancestor_cgroup_id(struct bpf_sock *sk, int ancestor_level)
+ *	Description
+ *		Return id of cgroup v2 that is ancestor of cgroup associated
+ *		with the *sk* at the *ancestor_level*. The root cgroup is at
+ *		*ancestor_level* zero and each step down the hierarchy
+ *		increments the level. If *ancestor_level* == level of cgroup
+ *		associated with *sk*, then return value will be same as that
+ *		of **bpf_sk_cgroup_id**\ ().
+ *
+ *		The helper is useful to implement policies based on cgroups
+ *		that are upper in hierarchy than immediate cgroup associated
+ *		with *sk*.
+ *
+ *		The format of returned id and helper limitations are same as in
+ *		**bpf_sk_cgroup_id**\ ().
+ *	Return
+ *		The id is returned or 0 in case the id could not be retrieved.
 */
 #define __BPF_FUNC_MAPPER(FN)		\
 	FN(unspec),			\
@@ -3250,7 +3282,9 @@ union bpf_attr {
 	FN(sk_assign),			\
 	FN(ktime_get_boot_ns),		\
 	FN(seq_printf),			\
-	FN(seq_write),
+	FN(seq_write),			\
+	FN(sk_cgroup_id),		\
+	FN(sk_ancestor_cgroup_id),
 
 /* integer value in 'imm' field of BPF_CALL instruction selects which helper
  * function eBPF program intends to call
diff --git a/net/core/filter.c b/net/core/filter.c
index a47dc5b9dad4..5815902bb617 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -4003,16 +4003,22 @@ static const struct bpf_func_proto bpf_skb_under_cgroup_proto = {
 };
 
 #ifdef CONFIG_SOCK_CGROUP_DATA
+static inline u64 __bpf_sk_cgroup_id(struct sock *sk)
+{
+	struct cgroup *cgrp;
+
+	cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
+	return cgroup_id(cgrp);
+}
+
 BPF_CALL_1(bpf_skb_cgroup_id, const struct sk_buff *, skb)
 {
 	struct sock *sk = skb_to_full_sk(skb);
-	struct cgroup *cgrp;
 
 	if (!sk || !sk_fullsock(sk))
 		return 0;
 
-	cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
-	return cgroup_id(cgrp);
+	return __bpf_sk_cgroup_id(sk);
 }
 
 static const struct bpf_func_proto bpf_skb_cgroup_id_proto = {
@@ -4022,16 +4028,12 @@ static const struct bpf_func_proto bpf_skb_cgroup_id_proto = {
 	.arg1_type      = ARG_PTR_TO_CTX,
 };
 
-BPF_CALL_2(bpf_skb_ancestor_cgroup_id, const struct sk_buff *, skb, int,
-	   ancestor_level)
+static inline u64 __bpf_sk_ancestor_cgroup_id(struct sock *sk,
+					      int ancestor_level)
 {
-	struct sock *sk = skb_to_full_sk(skb);
 	struct cgroup *ancestor;
 	struct cgroup *cgrp;
 
-	if (!sk || !sk_fullsock(sk))
-		return 0;
-
 	cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
 	ancestor = cgroup_ancestor(cgrp, ancestor_level);
 	if (!ancestor)
@@ -4040,6 +4042,17 @@ BPF_CALL_2(bpf_skb_ancestor_cgroup_id, const struct sk_buff *, skb, int,
 	return cgroup_id(ancestor);
 }
 
+BPF_CALL_2(bpf_skb_ancestor_cgroup_id, const struct sk_buff *, skb, int,
+	   ancestor_level)
+{
+	struct sock *sk = skb_to_full_sk(skb);
+
+	if (!sk || !sk_fullsock(sk))
+		return 0;
+
+	return __bpf_sk_ancestor_cgroup_id(sk, ancestor_level);
+}
+
 static const struct bpf_func_proto bpf_skb_ancestor_cgroup_id_proto = {
 	.func           = bpf_skb_ancestor_cgroup_id,
 	.gpl_only       = false,
@@ -4047,6 +4060,31 @@ static const struct bpf_func_proto bpf_skb_ancestor_cgroup_id_proto = {
 	.arg1_type      = ARG_PTR_TO_CTX,
 	.arg2_type      = ARG_ANYTHING,
 };
+
+BPF_CALL_1(bpf_sk_cgroup_id, struct sock *, sk)
+{
+	return __bpf_sk_cgroup_id(sk);
+}
+
+static const struct bpf_func_proto bpf_sk_cgroup_id_proto = {
+	.func           = bpf_sk_cgroup_id,
+	.gpl_only       = false,
+	.ret_type       = RET_INTEGER,
+	.arg1_type      = ARG_PTR_TO_SOCKET,
+};
+
+BPF_CALL_2(bpf_sk_ancestor_cgroup_id, struct sock *, sk, int, ancestor_level)
+{
+	return __bpf_sk_ancestor_cgroup_id(sk, ancestor_level);
+}
+
+static const struct bpf_func_proto bpf_sk_ancestor_cgroup_id_proto = {
+	.func           = bpf_sk_ancestor_cgroup_id,
+	.gpl_only       = false,
+	.ret_type       = RET_INTEGER,
+	.arg1_type      = ARG_PTR_TO_SOCKET,
+	.arg2_type      = ARG_ANYTHING,
+};
 #endif
 
 static unsigned long bpf_xdp_copy(void *dst_buff, const void *src_buff,
@@ -6159,6 +6197,10 @@ cg_skb_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 		return &bpf_skb_cgroup_id_proto;
 	case BPF_FUNC_skb_ancestor_cgroup_id:
 		return &bpf_skb_ancestor_cgroup_id_proto;
+	case BPF_FUNC_sk_cgroup_id:
+		return &bpf_sk_cgroup_id_proto;
+	case BPF_FUNC_sk_ancestor_cgroup_id:
+		return &bpf_sk_ancestor_cgroup_id_proto;
 #endif
 #ifdef CONFIG_INET
 	case BPF_FUNC_sk_lookup_tcp:
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index 85cfdffde182..146c742f1d49 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -3121,6 +3121,38 @@ union bpf_attr {
 *		0 on success, or a negative error in case of failure:
 *
 *		**-EOVERFLOW** if an overflow happened: The same object will be tried again.
+ *
+ * u64 bpf_sk_cgroup_id(struct bpf_sock *sk)
+ *	Description
+ *		Return the cgroup v2 id of the socket *sk*.
+ *
+ *		*sk* must be a non-**NULL** pointer to a full socket, e.g. one
+ *		returned from **bpf_sk_lookup_xxx**\ (),
+ *		**bpf_sk_fullsock**\ (), etc. The format of returned id is
+ *		same as in **bpf_skb_cgroup_id**\ ().
+ *
+ *		This helper is available only if the kernel was compiled with
+ *		the **CONFIG_SOCK_CGROUP_DATA** configuration option.
+ *	Return
+ *		The id is returned or 0 in case the id could not be retrieved.
+ *
+ * u64 bpf_sk_ancestor_cgroup_id(struct bpf_sock *sk, int ancestor_level)
+ *	Description
+ *		Return id of cgroup v2 that is ancestor of cgroup associated
+ *		with the *sk* at the *ancestor_level*. The root cgroup is at
+ *		*ancestor_level* zero and each step down the hierarchy
+ *		increments the level. If *ancestor_level* == level of cgroup
+ *		associated with *sk*, then return value will be same as that
+ *		of **bpf_sk_cgroup_id**\ ().
+ *
+ *		The helper is useful to implement policies based on cgroups
+ *		that are upper in hierarchy than immediate cgroup associated
+ *		with *sk*.
+ *
+ *		The format of returned id and helper limitations are same as in
+ *		**bpf_sk_cgroup_id**\ ().
+ *	Return
+ *		The id is returned or 0 in case the id could not be retrieved.
 */
 #define __BPF_FUNC_MAPPER(FN)		\
 	FN(unspec),			\
@@ -3250,7 +3282,9 @@ union bpf_attr {
 	FN(sk_assign),			\
 	FN(ktime_get_boot_ns),		\
 	FN(seq_printf),			\
-	FN(seq_write),
+	FN(seq_write),			\
+	FN(sk_cgroup_id),		\
+	FN(sk_ancestor_cgroup_id),
 
 /* integer value in 'imm' field of BPF_CALL instruction selects which helper
  * function eBPF program intends to call
-- cgit v1.2.3
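
For illustration only: with the narrow-load support added for
bpf_sock_addr.user_port earlier in this series, the workaround quoted
in that commit message is no longer needed. A minimal sketch, assuming
the usual libbpf headers (bpf_helpers.h, bpf_endian.h) and a made-up
program name and policy:

	#include <linux/bpf.h>
	#include <bpf/bpf_helpers.h>
	#include <bpf/bpf_endian.h>

	SEC("cgroup/connect4")
	int connect4_example(struct bpf_sock_addr *ctx)
	{
		/* Clang may compile this down to a 2-byte context load,
		 * which the verifier now accepts; previously a volatile
		 * 4-byte load was required to keep it from being narrowed.
		 */
		__u16 port = bpf_ntohs(ctx->user_port);

		/* Hypothetical policy: only permit connects to port 443. */
		return port == 443;	/* 1 = allow, 0 = reject */
	}

	char _license[] SEC("license") = "GPL";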