From 18ebe16d10496db795a9930c320c45653207a548 Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Thu, 1 Oct 2020 18:09:52 -0700 Subject: bpf, sockmap: Add skb_adjust_room to pop bytes off ingress payload This implements a new helper skb_adjust_room() so users can push/pop extra bytes from a BPF_SK_SKB_STREAM_VERDICT program. Some protocols may include headers and other information that we may not want to include when doing a redirect from a BPF_SK_SKB_STREAM_VERDICT program. One use case is to redirect TLS packets into a receive socket that doesn't expect TLS data. In TLS case the first 13B or so contain the protocol header. With KTLS the payload is decrypted so we should be able to redirect this to a receiving socket, but the receiving socket may not be expecting to receive a TLS header and discard the data. Using the above helper we can pop the header off and put an appropriate header on the payload. This allows for creating a proxy between protocols without extra hops through the stack or userspace. So in order to fix this case add skb_adjust_room() so users can strip the header. After this the user can strip the header and an unmodified receiver thread will work correctly when data is redirected into the ingress path of a sock. Signed-off-by: John Fastabend Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/160160099197.7052.8443193973242831692.stgit@john-Precision-5820-Tower --- net/core/filter.c | 46 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 46 insertions(+) (limited to 'net/core') diff --git a/net/core/filter.c b/net/core/filter.c index 3fb6adad1957..05df73780dd3 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -76,6 +76,7 @@ #include #include #include +#include static const struct bpf_func_proto * bpf_sk_base_func_proto(enum bpf_func_id func_id); @@ -3479,6 +3480,48 @@ static u32 __bpf_skb_max_len(const struct sk_buff *skb) SKB_MAX_ALLOC; } +BPF_CALL_4(sk_skb_adjust_room, struct sk_buff *, skb, s32, len_diff, + u32, mode, u64, flags) +{ + u32 len_diff_abs = abs(len_diff); + bool shrink = len_diff < 0; + int ret = 0; + + if (unlikely(flags || mode)) + return -EINVAL; + if (unlikely(len_diff_abs > 0xfffU)) + return -EFAULT; + + if (!shrink) { + ret = skb_cow(skb, len_diff); + if (unlikely(ret < 0)) + return ret; + __skb_push(skb, len_diff_abs); + memset(skb->data, 0, len_diff_abs); + } else { + if (unlikely(!pskb_may_pull(skb, len_diff_abs))) + return -ENOMEM; + __skb_pull(skb, len_diff_abs); + } + bpf_compute_data_end_sk_skb(skb); + if (tls_sw_has_ctx_rx(skb->sk)) { + struct strp_msg *rxm = strp_msg(skb); + + rxm->full_len += len_diff; + } + return ret; +} + +static const struct bpf_func_proto sk_skb_adjust_room_proto = { + .func = sk_skb_adjust_room, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_ANYTHING, + .arg3_type = ARG_ANYTHING, + .arg4_type = ARG_ANYTHING, +}; + BPF_CALL_4(bpf_skb_adjust_room, struct sk_buff *, skb, s32, len_diff, u32, mode, u64, flags) { @@ -6745,6 +6788,7 @@ bool bpf_helper_changes_pkt_data(void *func) func == bpf_skb_change_tail || func == sk_skb_change_tail || func == bpf_skb_adjust_room || + func == sk_skb_adjust_room || func == bpf_skb_pull_data || func == sk_skb_pull_data || func == bpf_clone_redirect || @@ -7218,6 +7262,8 @@ sk_skb_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &sk_skb_change_tail_proto; case BPF_FUNC_skb_change_head: return &sk_skb_change_head_proto; + case BPF_FUNC_skb_adjust_room: + return &sk_skb_adjust_room_proto; case BPF_FUNC_get_socket_cookie: return &bpf_get_socket_cookie_proto; case BPF_FUNC_get_socket_uid: -- cgit v1.2.3 From eca43ee6c46db92dd850ce659316b0680d70e137 Mon Sep 17 00:00:00 2001 From: "Nikita V. Shirokov" Date: Fri, 9 Oct 2020 07:03:25 +0000 Subject: bpf: Add tcp_notsent_lowat bpf setsockopt Adding support for TCP_NOTSENT_LOWAT sockoption (https://lwn.net/Articles/560082/) in tcp bpf programs. Signed-off-by: Nikita V. Shirokov Signed-off-by: Daniel Borkmann Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20201009070325.226855-1-tehnerd@tehnerd.com --- include/uapi/linux/bpf.h | 2 +- net/core/filter.c | 4 ++++ tools/include/uapi/linux/bpf.h | 2 +- tools/testing/selftests/bpf/progs/connect4_prog.c | 19 +++++++++++++++++++ 4 files changed, 25 insertions(+), 2 deletions(-) (limited to 'net/core') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index d83561e8cd2c..42d2df799397 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -1698,7 +1698,7 @@ union bpf_attr { * **TCP_CONGESTION**, **TCP_BPF_IW**, * **TCP_BPF_SNDCWND_CLAMP**, **TCP_SAVE_SYN**, * **TCP_KEEPIDLE**, **TCP_KEEPINTVL**, **TCP_KEEPCNT**, - * **TCP_SYNCNT**, **TCP_USER_TIMEOUT**. + * **TCP_SYNCNT**, **TCP_USER_TIMEOUT**, **TCP_NOTSENT_LOWAT**. * * **IPPROTO_IP**, which supports *optname* **IP_TOS**. * * **IPPROTO_IPV6**, which supports *optname* **IPV6_TCLASS**. * Return diff --git a/net/core/filter.c b/net/core/filter.c index 05df73780dd3..5da44b11e1ec 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -4827,6 +4827,10 @@ static int _bpf_setsockopt(struct sock *sk, int level, int optname, else icsk->icsk_user_timeout = val; break; + case TCP_NOTSENT_LOWAT: + tp->notsent_lowat = val; + sk->sk_write_space(sk); + break; default: ret = -EINVAL; } diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index d83561e8cd2c..42d2df799397 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -1698,7 +1698,7 @@ union bpf_attr { * **TCP_CONGESTION**, **TCP_BPF_IW**, * **TCP_BPF_SNDCWND_CLAMP**, **TCP_SAVE_SYN**, * **TCP_KEEPIDLE**, **TCP_KEEPINTVL**, **TCP_KEEPCNT**, - * **TCP_SYNCNT**, **TCP_USER_TIMEOUT**. + * **TCP_SYNCNT**, **TCP_USER_TIMEOUT**, **TCP_NOTSENT_LOWAT**. * * **IPPROTO_IP**, which supports *optname* **IP_TOS**. * * **IPPROTO_IPV6**, which supports *optname* **IPV6_TCLASS**. * Return diff --git a/tools/testing/selftests/bpf/progs/connect4_prog.c b/tools/testing/selftests/bpf/progs/connect4_prog.c index b1b2773c0b9d..a943d394fd3a 100644 --- a/tools/testing/selftests/bpf/progs/connect4_prog.c +++ b/tools/testing/selftests/bpf/progs/connect4_prog.c @@ -23,6 +23,10 @@ #define TCP_CA_NAME_MAX 16 #endif +#ifndef TCP_NOTSENT_LOWAT +#define TCP_NOTSENT_LOWAT 25 +#endif + #ifndef IFNAMSIZ #define IFNAMSIZ 16 #endif @@ -128,6 +132,18 @@ static __inline int set_keepalive(struct bpf_sock_addr *ctx) return 0; } +static __inline int set_notsent_lowat(struct bpf_sock_addr *ctx) +{ + int lowat = 65535; + + if (ctx->type == SOCK_STREAM) { + if (bpf_setsockopt(ctx, SOL_TCP, TCP_NOTSENT_LOWAT, &lowat, sizeof(lowat))) + return 1; + } + + return 0; +} + SEC("cgroup/connect4") int connect_v4_prog(struct bpf_sock_addr *ctx) { @@ -148,6 +164,9 @@ int connect_v4_prog(struct bpf_sock_addr *ctx) if (set_keepalive(ctx)) return 0; + if (set_notsent_lowat(ctx)) + return 0; + if (ctx->type != SOCK_STREAM && ctx->type != SOCK_DGRAM) return 0; else if (ctx->type == SOCK_STREAM) -- cgit v1.2.3 From 9aa1206e8f48222f35a0c809f33b2f4aaa1e2661 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Sun, 11 Oct 2020 01:40:02 +0200 Subject: bpf: Add redirect_peer helper Add an efficient ingress to ingress netns switch that can be used out of tc BPF programs in order to redirect traffic from host ns ingress into a container veth device ingress without having to go via CPU backlog queue [0]. For local containers this can also be utilized and path via CPU backlog queue only needs to be taken once, not twice. On a high level this borrows from ipvlan which does similar switch in __netif_receive_skb_core() and then iterates via another_round. This helps to reduce latency for mentioned use cases. Pod to remote pod with redirect(), TCP_RR [1]: # percpu_netperf 10.217.1.33 RT_LATENCY: 122.450 (per CPU: 122.666 122.401 122.333 122.401 ) MEAN_LATENCY: 121.210 (per CPU: 121.100 121.260 121.320 121.160 ) STDDEV_LATENCY: 120.040 (per CPU: 119.420 119.910 125.460 115.370 ) MIN_LATENCY: 46.500 (per CPU: 47.000 47.000 47.000 45.000 ) P50_LATENCY: 118.500 (per CPU: 118.000 119.000 118.000 119.000 ) P90_LATENCY: 127.500 (per CPU: 127.000 128.000 127.000 128.000 ) P99_LATENCY: 130.750 (per CPU: 131.000 131.000 129.000 132.000 ) TRANSACTION_RATE: 32666.400 (per CPU: 8152.200 8169.842 8174.439 8169.897 ) Pod to remote pod with redirect_peer(), TCP_RR: # percpu_netperf 10.217.1.33 RT_LATENCY: 44.449 (per CPU: 43.767 43.127 45.279 45.622 ) MEAN_LATENCY: 45.065 (per CPU: 44.030 45.530 45.190 45.510 ) STDDEV_LATENCY: 84.823 (per CPU: 66.770 97.290 84.380 90.850 ) MIN_LATENCY: 33.500 (per CPU: 33.000 33.000 34.000 34.000 ) P50_LATENCY: 43.250 (per CPU: 43.000 43.000 43.000 44.000 ) P90_LATENCY: 46.750 (per CPU: 46.000 47.000 47.000 47.000 ) P99_LATENCY: 52.750 (per CPU: 51.000 54.000 53.000 53.000 ) TRANSACTION_RATE: 90039.500 (per CPU: 22848.186 23187.089 22085.077 21919.130 ) [0] https://linuxplumbersconf.org/event/7/contributions/674/attachments/568/1002/plumbers_2020_cilium_load_balancer.pdf [1] https://github.com/borkmann/netperf_scripts/blob/master/percpu_netperf Signed-off-by: Daniel Borkmann Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20201010234006.7075-3-daniel@iogearbox.net --- drivers/net/veth.c | 9 +++++++ include/linux/netdevice.h | 4 ++++ include/uapi/linux/bpf.h | 17 +++++++++++++ net/core/dev.c | 15 +++++++++--- net/core/filter.c | 54 ++++++++++++++++++++++++++++++++++++------ tools/include/uapi/linux/bpf.h | 17 +++++++++++++ 6 files changed, 106 insertions(+), 10 deletions(-) (limited to 'net/core') diff --git a/drivers/net/veth.c b/drivers/net/veth.c index 091e5b4ba042..8c737668008a 100644 --- a/drivers/net/veth.c +++ b/drivers/net/veth.c @@ -420,6 +420,14 @@ static int veth_select_rxq(struct net_device *dev) return smp_processor_id() % dev->real_num_rx_queues; } +static struct net_device *veth_peer_dev(struct net_device *dev) +{ + struct veth_priv *priv = netdev_priv(dev); + + /* Callers must be under RCU read side. */ + return rcu_dereference(priv->peer); +} + static int veth_xdp_xmit(struct net_device *dev, int n, struct xdp_frame **frames, u32 flags, bool ndo_xmit) @@ -1224,6 +1232,7 @@ static const struct net_device_ops veth_netdev_ops = { .ndo_set_rx_headroom = veth_set_rx_headroom, .ndo_bpf = veth_xdp, .ndo_xdp_xmit = veth_ndo_xdp_xmit, + .ndo_get_peer_dev = veth_peer_dev, }; #define VETH_FEATURES (NETIF_F_SG | NETIF_F_FRAGLIST | NETIF_F_HW_CSUM | \ diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 28cfa53daf72..0533f86018dd 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1277,6 +1277,9 @@ struct netdev_net_notifier { * int (*ndo_tunnel_ctl)(struct net_device *dev, struct ip_tunnel_parm *p, * int cmd); * Add, change, delete or get information on an IPv4 tunnel. + * struct net_device *(*ndo_get_peer_dev)(struct net_device *dev); + * If a device is paired with a peer device, return the peer instance. + * The caller must be under RCU read context. */ struct net_device_ops { int (*ndo_init)(struct net_device *dev); @@ -1484,6 +1487,7 @@ struct net_device_ops { struct devlink_port * (*ndo_get_devlink_port)(struct net_device *dev); int (*ndo_tunnel_ctl)(struct net_device *dev, struct ip_tunnel_parm *p, int cmd); + struct net_device * (*ndo_get_peer_dev)(struct net_device *dev); }; /** diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 4272cc53d478..b97bc5abb3b8 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -3719,6 +3719,22 @@ union bpf_attr { * never return NULL. * Return * A pointer pointing to the kernel percpu variable on this cpu. + * + * long bpf_redirect_peer(u32 ifindex, u64 flags) + * Description + * Redirect the packet to another net device of index *ifindex*. + * This helper is somewhat similar to **bpf_redirect**\ (), except + * that the redirection happens to the *ifindex*' peer device and + * the netns switch takes place from ingress to ingress without + * going through the CPU's backlog queue. + * + * The *flags* argument is reserved and must be 0. The helper is + * currently only supported for tc BPF program types at the ingress + * hook and for veth device types. The peer device must reside in a + * different network namespace. + * Return + * The helper returns **TC_ACT_REDIRECT** on success or + * **TC_ACT_SHOT** on error. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -3876,6 +3892,7 @@ union bpf_attr { FN(redirect_neigh), \ FN(bpf_per_cpu_ptr), \ FN(bpf_this_cpu_ptr), \ + FN(redirect_peer), \ /* */ /* integer value in 'imm' field of BPF_CALL instruction selects which helper diff --git a/net/core/dev.c b/net/core/dev.c index 9d55bf5d1a65..7dd015823593 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -4930,7 +4930,7 @@ EXPORT_SYMBOL_GPL(br_fdb_test_addr_hook); static inline struct sk_buff * sch_handle_ingress(struct sk_buff *skb, struct packet_type **pt_prev, int *ret, - struct net_device *orig_dev) + struct net_device *orig_dev, bool *another) { #ifdef CONFIG_NET_CLS_ACT struct mini_Qdisc *miniq = rcu_dereference_bh(skb->dev->miniq_ingress); @@ -4974,7 +4974,11 @@ sch_handle_ingress(struct sk_buff *skb, struct packet_type **pt_prev, int *ret, * redirecting to another netdev */ __skb_push(skb, skb->mac_len); - skb_do_redirect(skb); + if (skb_do_redirect(skb) == -EAGAIN) { + __skb_pull(skb, skb->mac_len); + *another = true; + break; + } return NULL; case TC_ACT_CONSUMED: return NULL; @@ -5163,7 +5167,12 @@ another_round: skip_taps: #ifdef CONFIG_NET_INGRESS if (static_branch_unlikely(&ingress_needed_key)) { - skb = sch_handle_ingress(skb, &pt_prev, &ret, orig_dev); + bool another = false; + + skb = sch_handle_ingress(skb, &pt_prev, &ret, orig_dev, + &another); + if (another) + goto another_round; if (!skb) goto out; diff --git a/net/core/filter.c b/net/core/filter.c index 5da44b11e1ec..fab951c6be57 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -2380,8 +2380,9 @@ out: /* Internal, non-exposed redirect flags. */ enum { - BPF_F_NEIGH = (1ULL << 1), -#define BPF_F_REDIRECT_INTERNAL (BPF_F_NEIGH) + BPF_F_NEIGH = (1ULL << 1), + BPF_F_PEER = (1ULL << 2), +#define BPF_F_REDIRECT_INTERNAL (BPF_F_NEIGH | BPF_F_PEER) }; BPF_CALL_3(bpf_clone_redirect, struct sk_buff *, skb, u32, ifindex, u64, flags) @@ -2430,19 +2431,35 @@ EXPORT_PER_CPU_SYMBOL_GPL(bpf_redirect_info); int skb_do_redirect(struct sk_buff *skb) { struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info); + struct net *net = dev_net(skb->dev); struct net_device *dev; u32 flags = ri->flags; - dev = dev_get_by_index_rcu(dev_net(skb->dev), ri->tgt_index); + dev = dev_get_by_index_rcu(net, ri->tgt_index); ri->tgt_index = 0; - if (unlikely(!dev)) { - kfree_skb(skb); - return -EINVAL; + ri->flags = 0; + if (unlikely(!dev)) + goto out_drop; + if (flags & BPF_F_PEER) { + const struct net_device_ops *ops = dev->netdev_ops; + + if (unlikely(!ops->ndo_get_peer_dev || + !skb_at_tc_ingress(skb))) + goto out_drop; + dev = ops->ndo_get_peer_dev(dev); + if (unlikely(!dev || + !is_skb_forwardable(dev, skb) || + net_eq(net, dev_net(dev)))) + goto out_drop; + skb->dev = dev; + return -EAGAIN; } - return flags & BPF_F_NEIGH ? __bpf_redirect_neigh(skb, dev) : __bpf_redirect(skb, dev, flags); +out_drop: + kfree_skb(skb); + return -EINVAL; } BPF_CALL_2(bpf_redirect, u32, ifindex, u64, flags) @@ -2466,6 +2483,27 @@ static const struct bpf_func_proto bpf_redirect_proto = { .arg2_type = ARG_ANYTHING, }; +BPF_CALL_2(bpf_redirect_peer, u32, ifindex, u64, flags) +{ + struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info); + + if (unlikely(flags)) + return TC_ACT_SHOT; + + ri->flags = BPF_F_PEER; + ri->tgt_index = ifindex; + + return TC_ACT_REDIRECT; +} + +static const struct bpf_func_proto bpf_redirect_peer_proto = { + .func = bpf_redirect_peer, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_ANYTHING, + .arg2_type = ARG_ANYTHING, +}; + BPF_CALL_2(bpf_redirect_neigh, u32, ifindex, u64, flags) { struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info); @@ -7053,6 +7091,8 @@ tc_cls_act_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_redirect_proto; case BPF_FUNC_redirect_neigh: return &bpf_redirect_neigh_proto; + case BPF_FUNC_redirect_peer: + return &bpf_redirect_peer_proto; case BPF_FUNC_get_route_realm: return &bpf_get_route_realm_proto; case BPF_FUNC_get_hash_recalc: diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 4272cc53d478..b97bc5abb3b8 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -3719,6 +3719,22 @@ union bpf_attr { * never return NULL. * Return * A pointer pointing to the kernel percpu variable on this cpu. + * + * long bpf_redirect_peer(u32 ifindex, u64 flags) + * Description + * Redirect the packet to another net device of index *ifindex*. + * This helper is somewhat similar to **bpf_redirect**\ (), except + * that the redirection happens to the *ifindex*' peer device and + * the netns switch takes place from ingress to ingress without + * going through the CPU's backlog queue. + * + * The *flags* argument is reserved and must be 0. The helper is + * currently only supported for tc BPF program types at the ingress + * hook and for veth device types. The peer device must reside in a + * different network namespace. + * Return + * The helper returns **TC_ACT_REDIRECT** on success or + * **TC_ACT_SHOT** on error. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -3876,6 +3892,7 @@ union bpf_attr { FN(redirect_neigh), \ FN(bpf_per_cpu_ptr), \ FN(bpf_this_cpu_ptr), \ + FN(redirect_peer), \ /* */ /* integer value in 'imm' field of BPF_CALL instruction selects which helper -- cgit v1.2.3 From d1c362e1dd68a421cf9033404cf141a4ab734a5d Mon Sep 17 00:00:00 2001 From: Toke Høiland-Jørgensen Date: Fri, 9 Oct 2020 20:42:34 +0200 Subject: bpf: Always return target ifindex in bpf_fib_lookup MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The bpf_fib_lookup() helper performs a neighbour lookup for the destination IP and returns BPF_FIB_LKUP_NO_NEIGH if this fails, with the expectation that the BPF program will pass the packet up the stack in this case. However, with the addition of bpf_redirect_neigh() that can be used instead to perform the neighbour lookup, at the cost of a bit of duplicated work. For that we still need the target ifindex, and since bpf_fib_lookup() already has that at the time it performs the neighbour lookup, there is really no reason why it can't just return it in any case. So let's just always return the ifindex if the FIB lookup itself succeeds. Signed-off-by: Toke Høiland-Jørgensen Signed-off-by: Daniel Borkmann Cc: David Ahern Link: https://lore.kernel.org/bpf/20201009184234.134214-1-toke@redhat.com --- net/core/filter.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net/core') diff --git a/net/core/filter.c b/net/core/filter.c index fab951c6be57..96ec4cbbf4e0 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -5234,7 +5234,6 @@ static int bpf_fib_set_fwd_params(struct bpf_fib_lookup *params, memcpy(params->smac, dev->dev_addr, ETH_ALEN); params->h_vlan_TCI = 0; params->h_vlan_proto = 0; - params->ifindex = dev->ifindex; return 0; } @@ -5331,6 +5330,7 @@ static int bpf_ipv4_fib_lookup(struct net *net, struct bpf_fib_lookup *params, dev = nhc->nhc_dev; params->rt_metric = res.fi->fib_priority; + params->ifindex = dev->ifindex; /* xdp and cls_bpf programs are run in RCU-bh so * rcu_read_lock_bh is not needed here @@ -5456,6 +5456,7 @@ static int bpf_ipv6_fib_lookup(struct net *net, struct bpf_fib_lookup *params, dev = res.nh->fib_nh_dev; params->rt_metric = res.f6i->fib6_metric; + params->ifindex = dev->ifindex; /* xdp and cls_bpf programs are run in RCU-bh so rcu_read_lock_bh is * not needed here. -- cgit v1.2.3 From cfea28f890cf292d5fe90680db64b68086ef25ba Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Fri, 9 Oct 2020 11:36:16 -0700 Subject: bpf, sockmap: Skb verdict SK_PASS to self already checked rmem limits For sk_skb case where skb_verdict program returns SK_PASS to continue to pass packet up the stack, the memory limits were already checked before enqueuing in skb_queue_tail from TCP side. So, lets remove the extra checks here. The theory is if the TCP stack believes we have memory to receive the packet then lets trust the stack and not double check the limits. In fact the accounting here can cause a drop if sk_rmem_alloc has increased after the stack accepted this packet, but before the duplicate check here. And worse if this happens because TCP stack already believes the data has been received there is no retransmit. Fixes: 51199405f9672 ("bpf: skb_verdict, support SK_PASS on RX BPF path") Signed-off-by: John Fastabend Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/160226857664.5692.668205469388498375.stgit@john-Precision-5820-Tower --- net/core/skmsg.c | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) (limited to 'net/core') diff --git a/net/core/skmsg.c b/net/core/skmsg.c index 4b5f7c8fecd1..040ae1d75b65 100644 --- a/net/core/skmsg.c +++ b/net/core/skmsg.c @@ -771,6 +771,7 @@ EXPORT_SYMBOL_GPL(sk_psock_tls_strp_read); static void sk_psock_verdict_apply(struct sk_psock *psock, struct sk_buff *skb, int verdict) { + struct tcp_skb_cb *tcp; struct sock *sk_other; switch (verdict) { @@ -780,16 +781,12 @@ static void sk_psock_verdict_apply(struct sk_psock *psock, !sk_psock_test_state(psock, SK_PSOCK_TX_ENABLED)) { goto out_free; } - if (atomic_read(&sk_other->sk_rmem_alloc) <= - sk_other->sk_rcvbuf) { - struct tcp_skb_cb *tcp = TCP_SKB_CB(skb); - tcp->bpf.flags |= BPF_F_INGRESS; - skb_queue_tail(&psock->ingress_skb, skb); - schedule_work(&psock->work); - break; - } - goto out_free; + tcp = TCP_SKB_CB(skb); + tcp->bpf.flags |= BPF_F_INGRESS; + skb_queue_tail(&psock->ingress_skb, skb); + schedule_work(&psock->work); + break; case __SK_REDIRECT: sk_psock_skb_redirect(skb); break; -- cgit v1.2.3 From 9ecbfb06a078c4911fb444203e8e41d93d22f886 Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Fri, 9 Oct 2020 11:36:37 -0700 Subject: bpf, sockmap: On receive programs try to fast track SK_PASS ingress When we receive an skb and the ingress skb verdict program returns SK_PASS we currently set the ingress flag and put it on the workqueue so it can be turned into a sk_msg and put on the sk_msg ingress queue. Then finally telling userspace with data_ready hook. Here we observe that if the workqueue is empty then we can try to convert into a sk_msg type and call data_ready directly without bouncing through a workqueue. Its a common pattern to have a recv verdict program for visibility that always returns SK_PASS. In this case unless there is an ENOMEM error or we overrun the socket we can avoid the workqueue completely only using it when we fall back to error cases caused by memory pressure. By doing this we eliminate another case where data may be dropped if errors occur on memory limits in workqueue. Fixes: 51199405f9672 ("bpf: skb_verdict, support SK_PASS on RX BPF path") Signed-off-by: John Fastabend Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/160226859704.5692.12929678876744977669.stgit@john-Precision-5820-Tower --- net/core/skmsg.c | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) (limited to 'net/core') diff --git a/net/core/skmsg.c b/net/core/skmsg.c index 040ae1d75b65..4b160d97b7f9 100644 --- a/net/core/skmsg.c +++ b/net/core/skmsg.c @@ -773,6 +773,7 @@ static void sk_psock_verdict_apply(struct sk_psock *psock, { struct tcp_skb_cb *tcp; struct sock *sk_other; + int err = -EIO; switch (verdict) { case __SK_PASS: @@ -784,8 +785,20 @@ static void sk_psock_verdict_apply(struct sk_psock *psock, tcp = TCP_SKB_CB(skb); tcp->bpf.flags |= BPF_F_INGRESS; - skb_queue_tail(&psock->ingress_skb, skb); - schedule_work(&psock->work); + + /* If the queue is empty then we can submit directly + * into the msg queue. If its not empty we have to + * queue work otherwise we may get OOO data. Otherwise, + * if sk_psock_skb_ingress errors will be handled by + * retrying later from workqueue. + */ + if (skb_queue_empty(&psock->ingress_skb)) { + err = sk_psock_skb_ingress(psock, skb); + } + if (err < 0) { + skb_queue_tail(&psock->ingress_skb, skb); + schedule_work(&psock->work); + } break; case __SK_REDIRECT: sk_psock_skb_redirect(skb); -- cgit v1.2.3 From 29545f4977cf7c7b721aa4d43418632af1d6836d Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Fri, 9 Oct 2020 11:36:57 -0700 Subject: bpf, sockmap: Remove skb_set_owner_w wmem will be taken later from sendpage The skb_set_owner_w is unnecessary here. The sendpage call will create a fresh skb and set the owner correctly from workqueue. Its also not entirely harmless because it consumes cycles, but also impacts resource accounting by increasing sk_wmem_alloc. This is charging the socket we are going to send to for the skb, but we will put it on the workqueue for some time before this happens so we are artifically inflating sk_wmem_alloc for this period. Further, we don't know how many skbs will be used to send the packet or how it will be broken up when sent over the new socket so charging it with one big sum is also not correct when the workqueue may break it up if facing memory pressure. Seeing we don't know how/when this is going to be sent drop the early accounting. A later patch will do proper accounting charged on receive socket for the case where skbs get enqueued on the workqueue. Fixes: 604326b41a6fb ("bpf, sockmap: convert to generic sk_msg interface") Signed-off-by: John Fastabend Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/160226861708.5692.17964237936462425136.stgit@john-Precision-5820-Tower --- net/core/skmsg.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'net/core') diff --git a/net/core/skmsg.c b/net/core/skmsg.c index 4b160d97b7f9..7389d5d7e7f8 100644 --- a/net/core/skmsg.c +++ b/net/core/skmsg.c @@ -728,8 +728,6 @@ static void sk_psock_skb_redirect(struct sk_buff *skb) (ingress && atomic_read(&sk_other->sk_rmem_alloc) <= sk_other->sk_rcvbuf)) { - if (!ingress) - skb_set_owner_w(skb, sk_other); skb_queue_tail(&psock_other->ingress_skb, skb); schedule_work(&psock_other->work); } else { -- cgit v1.2.3 From 9047f19e7ccba5073d983dd3882b5f46086b578a Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Fri, 9 Oct 2020 11:37:17 -0700 Subject: bpf, sockmap: Remove dropped data on errors in redirect case In the sk_skb redirect case we didn't handle the case where we overrun the sk_rmem_alloc entry on ingress redirect or sk_wmem_alloc on egress. Because we didn't have anything implemented we simply dropped the skb. This meant data could be dropped if socket memory accounting was in place. This fixes the above dropped data case by moving the memory checks later in the code where we actually do the send or recv. This pushes those checks into the workqueue and allows us to return an EAGAIN error which in turn allows us to try again later from the workqueue. Fixes: 51199405f9672 ("bpf: skb_verdict, support SK_PASS on RX BPF path") Signed-off-by: John Fastabend Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/160226863689.5692.13861422742592309285.stgit@john-Precision-5820-Tower --- net/core/skmsg.c | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) (limited to 'net/core') diff --git a/net/core/skmsg.c b/net/core/skmsg.c index 7389d5d7e7f8..880b84baab5e 100644 --- a/net/core/skmsg.c +++ b/net/core/skmsg.c @@ -433,10 +433,12 @@ static int sk_psock_skb_ingress(struct sk_psock *psock, struct sk_buff *skb) static int sk_psock_handle_skb(struct sk_psock *psock, struct sk_buff *skb, u32 off, u32 len, bool ingress) { - if (ingress) - return sk_psock_skb_ingress(psock, skb); - else + if (!ingress) { + if (!sock_writeable(psock->sk)) + return -EAGAIN; return skb_send_sock_locked(psock->sk, skb, off, len); + } + return sk_psock_skb_ingress(psock, skb); } static void sk_psock_backlog(struct work_struct *work) @@ -709,30 +711,28 @@ static void sk_psock_skb_redirect(struct sk_buff *skb) { struct sk_psock *psock_other; struct sock *sk_other; - bool ingress; sk_other = tcp_skb_bpf_redirect_fetch(skb); + /* This error is a buggy BPF program, it returned a redirect + * return code, but then didn't set a redirect interface. + */ if (unlikely(!sk_other)) { kfree_skb(skb); return; } psock_other = sk_psock(sk_other); + /* This error indicates the socket is being torn down or had another + * error that caused the pipe to break. We can't send a packet on + * a socket that is in this state so we drop the skb. + */ if (!psock_other || sock_flag(sk_other, SOCK_DEAD) || !sk_psock_test_state(psock_other, SK_PSOCK_TX_ENABLED)) { kfree_skb(skb); return; } - ingress = tcp_skb_bpf_ingress(skb); - if ((!ingress && sock_writeable(sk_other)) || - (ingress && - atomic_read(&sk_other->sk_rmem_alloc) <= - sk_other->sk_rcvbuf)) { - skb_queue_tail(&psock_other->ingress_skb, skb); - schedule_work(&psock_other->work); - } else { - kfree_skb(skb); - } + skb_queue_tail(&psock_other->ingress_skb, skb); + schedule_work(&psock_other->work); } static void sk_psock_tls_verdict_apply(struct sk_buff *skb, int verdict) -- cgit v1.2.3 From 10d58d006356a075a7b056e0f6502db416d1a261 Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Fri, 9 Oct 2020 11:37:35 -0700 Subject: bpf, sockmap: Remove skb_orphan and let normal skb_kfree do cleanup Calling skb_orphan() is unnecessary in the strp rcv handler because the skb is from a skb_clone() in __strp_recv. So it never has a destructor or a sk assigned. Plus its confusing to read because it might hint to the reader that the skb could have an sk assigned which is not true. Even if we did have an sk assigned it would be cleaner to simply wait for the upcoming kfree_skb(). Additionally, move the comment about strparser clone up so its closer to the logic it is describing and add to it so that it is more complete. Fixes: 604326b41a6fb ("bpf, sockmap: convert to generic sk_msg interface") Signed-off-by: John Fastabend Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/160226865548.5692.9098315689984599579.stgit@john-Precision-5820-Tower --- net/core/skmsg.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'net/core') diff --git a/net/core/skmsg.c b/net/core/skmsg.c index 880b84baab5e..3e78f2a80747 100644 --- a/net/core/skmsg.c +++ b/net/core/skmsg.c @@ -686,15 +686,16 @@ static int sk_psock_bpf_run(struct sk_psock *psock, struct bpf_prog *prog, { int ret; + /* strparser clones the skb before handing it to a upper layer, + * meaning we have the same data, but sk is NULL. We do want an + * sk pointer though when we run the BPF program. So we set it + * here and then NULL it to ensure we don't trigger a BUG_ON() + * in skb/sk operations later if kfree_skb is called with a + * valid skb->sk pointer and no destructor assigned. + */ skb->sk = psock->sk; bpf_compute_data_end_sk_skb(skb); ret = bpf_prog_run_pin_on_cpu(prog, skb); - /* strparser clones the skb before handing it to a upper layer, - * meaning skb_orphan has been called. We NULL sk on the way out - * to ensure we don't trigger a BUG_ON() in skb/sk operations - * later and because we are not charging the memory of this skb - * to any socket yet. - */ skb->sk = NULL; return ret; } @@ -824,7 +825,6 @@ static void sk_psock_strp_read(struct strparser *strp, struct sk_buff *skb) } prog = READ_ONCE(psock->progs.skb_verdict); if (likely(prog)) { - skb_orphan(skb); tcp_skb_bpf_redirect_clear(skb); ret = sk_psock_bpf_run(psock, prog, skb); ret = sk_psock_map_verd(ret, tcp_skb_bpf_redirect_fetch(skb)); -- cgit v1.2.3 From 0b17ad25d8d102ffb83dcc99ebd24719194ebf4f Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Fri, 9 Oct 2020 11:37:55 -0700 Subject: bpf, sockmap: Add memory accounting so skbs on ingress lists are visible Move skb->sk assignment out of sk_psock_bpf_run() and into individual callers. Then we can use proper skb_set_owner_r() call to assign a sk to a skb. This improves things by also charging the truesize against the sockets sk_rmem_alloc counter. With this done we get some accounting in place to ensure the memory associated with skbs on the workqueue are still being accounted for somewhere. Finally, by using skb_set_owner_r the destructor is setup so we can just let the normal skb_kfree logic recover the memory. Combined with previous patch dropping skb_orphan() we now can recover from memory pressure and maintain accounting. Note, we will charge the skbs against their originating socket even if being redirected into another socket. Once the skb completes the redirect op the kfree_skb will give the memory back. This is important because if we charged the socket we are redirecting to (like it was done before this series) the sock_writeable() test could fail because of the skb trying to be sent is already charged against the socket. Also TLS case is special. Here we wait until we have decided not to simply PASS the packet up the stack. In the case where we PASS the packet up the stack we already have an skb which is accounted for on the TLS socket context. For the parser case we continue to just set/clear skb->sk this is because the skb being used here may be combined with other skbs or turned into multiple skbs depending on the parser logic. For example the parser could request a payload length greater than skb->len so that the strparser needs to collect multiple skbs. At any rate the final result will be handled in the strparser recv callback. Fixes: 604326b41a6fb ("bpf, sockmap: convert to generic sk_msg interface") Signed-off-by: John Fastabend Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/160226867513.5692.10579573214635925960.stgit@john-Precision-5820-Tower --- net/core/skmsg.c | 31 +++++++++++++++---------------- 1 file changed, 15 insertions(+), 16 deletions(-) (limited to 'net/core') diff --git a/net/core/skmsg.c b/net/core/skmsg.c index 3e78f2a80747..881a5b290946 100644 --- a/net/core/skmsg.c +++ b/net/core/skmsg.c @@ -684,20 +684,8 @@ EXPORT_SYMBOL_GPL(sk_psock_msg_verdict); static int sk_psock_bpf_run(struct sk_psock *psock, struct bpf_prog *prog, struct sk_buff *skb) { - int ret; - - /* strparser clones the skb before handing it to a upper layer, - * meaning we have the same data, but sk is NULL. We do want an - * sk pointer though when we run the BPF program. So we set it - * here and then NULL it to ensure we don't trigger a BUG_ON() - * in skb/sk operations later if kfree_skb is called with a - * valid skb->sk pointer and no destructor assigned. - */ - skb->sk = psock->sk; bpf_compute_data_end_sk_skb(skb); - ret = bpf_prog_run_pin_on_cpu(prog, skb); - skb->sk = NULL; - return ret; + return bpf_prog_run_pin_on_cpu(prog, skb); } static struct sk_psock *sk_psock_from_strp(struct strparser *strp) @@ -736,10 +724,11 @@ static void sk_psock_skb_redirect(struct sk_buff *skb) schedule_work(&psock_other->work); } -static void sk_psock_tls_verdict_apply(struct sk_buff *skb, int verdict) +static void sk_psock_tls_verdict_apply(struct sk_buff *skb, struct sock *sk, int verdict) { switch (verdict) { case __SK_REDIRECT: + skb_set_owner_r(skb, sk); sk_psock_skb_redirect(skb); break; case __SK_PASS: @@ -757,11 +746,17 @@ int sk_psock_tls_strp_read(struct sk_psock *psock, struct sk_buff *skb) rcu_read_lock(); prog = READ_ONCE(psock->progs.skb_verdict); if (likely(prog)) { + /* We skip full set_owner_r here because if we do a SK_PASS + * or SK_DROP we can skip skb memory accounting and use the + * TLS context. + */ + skb->sk = psock->sk; tcp_skb_bpf_redirect_clear(skb); ret = sk_psock_bpf_run(psock, prog, skb); ret = sk_psock_map_verd(ret, tcp_skb_bpf_redirect_fetch(skb)); + skb->sk = NULL; } - sk_psock_tls_verdict_apply(skb, ret); + sk_psock_tls_verdict_apply(skb, psock->sk, ret); rcu_read_unlock(); return ret; } @@ -823,6 +818,7 @@ static void sk_psock_strp_read(struct strparser *strp, struct sk_buff *skb) kfree_skb(skb); goto out; } + skb_set_owner_r(skb, sk); prog = READ_ONCE(psock->progs.skb_verdict); if (likely(prog)) { tcp_skb_bpf_redirect_clear(skb); @@ -847,8 +843,11 @@ static int sk_psock_strp_parse(struct strparser *strp, struct sk_buff *skb) rcu_read_lock(); prog = READ_ONCE(psock->progs.skb_parser); - if (likely(prog)) + if (likely(prog)) { + skb->sk = psock->sk; ret = sk_psock_bpf_run(psock, prog, skb); + skb->sk = NULL; + } rcu_read_unlock(); return ret; } -- cgit v1.2.3 From 743df8b7749fb5a289fc0c7ac94ec15533596839 Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Sat, 10 Oct 2020 22:09:07 -0700 Subject: bpf, sockmap: Check skb_verdict and skb_parser programs explicitly We are about to allow skb_verdict to run without skb_parser programs as a first step change code to check each program type specifically. This should be a mechanical change without any impact to actual result. Signed-off-by: John Fastabend Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/160239294756.8495.5796595770890272219.stgit@john-Precision-5820-Tower --- net/core/sock_map.c | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) (limited to 'net/core') diff --git a/net/core/sock_map.c b/net/core/sock_map.c index e83a80e8f13b..a2ed5b6223b9 100644 --- a/net/core/sock_map.c +++ b/net/core/sock_map.c @@ -230,16 +230,16 @@ static int sock_map_link(struct bpf_map *map, struct sk_psock_progs *progs, { struct bpf_prog *msg_parser, *skb_parser, *skb_verdict; struct sk_psock *psock; - bool skb_progs; int ret; skb_verdict = READ_ONCE(progs->skb_verdict); skb_parser = READ_ONCE(progs->skb_parser); - skb_progs = skb_parser && skb_verdict; - if (skb_progs) { + if (skb_verdict) { skb_verdict = bpf_prog_inc_not_zero(skb_verdict); if (IS_ERR(skb_verdict)) return PTR_ERR(skb_verdict); + } + if (skb_parser) { skb_parser = bpf_prog_inc_not_zero(skb_parser); if (IS_ERR(skb_parser)) { bpf_prog_put(skb_verdict); @@ -264,7 +264,8 @@ static int sock_map_link(struct bpf_map *map, struct sk_psock_progs *progs, if (psock) { if ((msg_parser && READ_ONCE(psock->progs.msg_parser)) || - (skb_progs && READ_ONCE(psock->progs.skb_parser))) { + (skb_parser && READ_ONCE(psock->progs.skb_parser)) || + (skb_verdict && READ_ONCE(psock->progs.skb_verdict))) { sk_psock_put(sk, psock); ret = -EBUSY; goto out_progs; @@ -285,7 +286,7 @@ static int sock_map_link(struct bpf_map *map, struct sk_psock_progs *progs, goto out_drop; write_lock_bh(&sk->sk_callback_lock); - if (skb_progs && !psock->parser.enabled) { + if (skb_parser && skb_verdict && !psock->parser.enabled) { ret = sk_psock_init_strp(sk, psock); if (ret) { write_unlock_bh(&sk->sk_callback_lock); @@ -303,10 +304,10 @@ out_progs: if (msg_parser) bpf_prog_put(msg_parser); out: - if (skb_progs) { + if (skb_verdict) bpf_prog_put(skb_verdict); + if (skb_parser) bpf_prog_put(skb_parser); - } return ret; } -- cgit v1.2.3 From ef5659280eb13e8ac31c296f58cfdfa1684ac06b Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Sat, 10 Oct 2020 22:09:38 -0700 Subject: bpf, sockmap: Allow skipping sk_skb parser program Currently, we often run with a nop parser namely one that just does this, 'return skb->len'. This happens when either our verdict program can handle streaming data or it is only looking at socket data such as IP addresses and other metadata associated with the flow. The second case is common for a L3/L4 proxy for instance. So lets allow loading programs without the parser then we can skip the stream parser logic and avoid having to add a BPF program that is effectively a nop. Signed-off-by: John Fastabend Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/160239297866.8495.13345662302749219672.stgit@john-Precision-5820-Tower --- include/linux/skmsg.h | 2 ++ net/core/skmsg.c | 78 +++++++++++++++++++++++++++++++++++++++++++++++++++ net/core/sock_map.c | 22 ++++++++++----- 3 files changed, 95 insertions(+), 7 deletions(-) (limited to 'net/core') diff --git a/include/linux/skmsg.h b/include/linux/skmsg.h index 3119928fc103..fec0c5ac1c4f 100644 --- a/include/linux/skmsg.h +++ b/include/linux/skmsg.h @@ -308,6 +308,8 @@ struct sk_psock *sk_psock_init(struct sock *sk, int node); int sk_psock_init_strp(struct sock *sk, struct sk_psock *psock); void sk_psock_start_strp(struct sock *sk, struct sk_psock *psock); void sk_psock_stop_strp(struct sock *sk, struct sk_psock *psock); +void sk_psock_start_verdict(struct sock *sk, struct sk_psock *psock); +void sk_psock_stop_verdict(struct sock *sk, struct sk_psock *psock); int sk_psock_msg_verdict(struct sock *sk, struct sk_psock *psock, struct sk_msg *msg); diff --git a/net/core/skmsg.c b/net/core/skmsg.c index 881a5b290946..654182ecf87b 100644 --- a/net/core/skmsg.c +++ b/net/core/skmsg.c @@ -627,6 +627,8 @@ void sk_psock_drop(struct sock *sk, struct sk_psock *psock) rcu_assign_sk_user_data(sk, NULL); if (psock->progs.skb_parser) sk_psock_stop_strp(sk, psock); + else if (psock->progs.skb_verdict) + sk_psock_stop_verdict(sk, psock); write_unlock_bh(&sk->sk_callback_lock); sk_psock_clear_state(psock, SK_PSOCK_TX_ENABLED); @@ -871,6 +873,57 @@ static void sk_psock_strp_data_ready(struct sock *sk) rcu_read_unlock(); } +static int sk_psock_verdict_recv(read_descriptor_t *desc, struct sk_buff *skb, + unsigned int offset, size_t orig_len) +{ + struct sock *sk = (struct sock *)desc->arg.data; + struct sk_psock *psock; + struct bpf_prog *prog; + int ret = __SK_DROP; + int len = skb->len; + + /* clone here so sk_eat_skb() in tcp_read_sock does not drop our data */ + skb = skb_clone(skb, GFP_ATOMIC); + if (!skb) { + desc->error = -ENOMEM; + return 0; + } + + rcu_read_lock(); + psock = sk_psock(sk); + if (unlikely(!psock)) { + len = 0; + kfree_skb(skb); + goto out; + } + skb_set_owner_r(skb, sk); + prog = READ_ONCE(psock->progs.skb_verdict); + if (likely(prog)) { + tcp_skb_bpf_redirect_clear(skb); + ret = sk_psock_bpf_run(psock, prog, skb); + ret = sk_psock_map_verd(ret, tcp_skb_bpf_redirect_fetch(skb)); + } + sk_psock_verdict_apply(psock, skb, ret); +out: + rcu_read_unlock(); + return len; +} + +static void sk_psock_verdict_data_ready(struct sock *sk) +{ + struct socket *sock = sk->sk_socket; + read_descriptor_t desc; + + if (unlikely(!sock || !sock->ops || !sock->ops->read_sock)) + return; + + desc.arg.data = sk; + desc.error = 0; + desc.count = 1; + + sock->ops->read_sock(sk, &desc, sk_psock_verdict_recv); +} + static void sk_psock_write_space(struct sock *sk) { struct sk_psock *psock; @@ -900,6 +953,19 @@ int sk_psock_init_strp(struct sock *sk, struct sk_psock *psock) return strp_init(&psock->parser.strp, sk, &cb); } +void sk_psock_start_verdict(struct sock *sk, struct sk_psock *psock) +{ + struct sk_psock_parser *parser = &psock->parser; + + if (parser->enabled) + return; + + parser->saved_data_ready = sk->sk_data_ready; + sk->sk_data_ready = sk_psock_verdict_data_ready; + sk->sk_write_space = sk_psock_write_space; + parser->enabled = true; +} + void sk_psock_start_strp(struct sock *sk, struct sk_psock *psock) { struct sk_psock_parser *parser = &psock->parser; @@ -925,3 +991,15 @@ void sk_psock_stop_strp(struct sock *sk, struct sk_psock *psock) strp_stop(&parser->strp); parser->enabled = false; } + +void sk_psock_stop_verdict(struct sock *sk, struct sk_psock *psock) +{ + struct sk_psock_parser *parser = &psock->parser; + + if (!parser->enabled) + return; + + sk->sk_data_ready = parser->saved_data_ready; + parser->saved_data_ready = NULL; + parser->enabled = false; +} diff --git a/net/core/sock_map.c b/net/core/sock_map.c index a2ed5b6223b9..df09c39a4dd2 100644 --- a/net/core/sock_map.c +++ b/net/core/sock_map.c @@ -148,8 +148,8 @@ static void sock_map_add_link(struct sk_psock *psock, static void sock_map_del_link(struct sock *sk, struct sk_psock *psock, void *link_raw) { + bool strp_stop = false, verdict_stop = false; struct sk_psock_link *link, *tmp; - bool strp_stop = false; spin_lock_bh(&psock->link_lock); list_for_each_entry_safe(link, tmp, &psock->link, list) { @@ -159,14 +159,19 @@ static void sock_map_del_link(struct sock *sk, map); if (psock->parser.enabled && stab->progs.skb_parser) strp_stop = true; + if (psock->parser.enabled && stab->progs.skb_verdict) + verdict_stop = true; list_del(&link->list); sk_psock_free_link(link); } } spin_unlock_bh(&psock->link_lock); - if (strp_stop) { + if (strp_stop || verdict_stop) { write_lock_bh(&sk->sk_callback_lock); - sk_psock_stop_strp(sk, psock); + if (strp_stop) + sk_psock_stop_strp(sk, psock); + else + sk_psock_stop_verdict(sk, psock); write_unlock_bh(&sk->sk_callback_lock); } } @@ -288,16 +293,19 @@ static int sock_map_link(struct bpf_map *map, struct sk_psock_progs *progs, write_lock_bh(&sk->sk_callback_lock); if (skb_parser && skb_verdict && !psock->parser.enabled) { ret = sk_psock_init_strp(sk, psock); - if (ret) { - write_unlock_bh(&sk->sk_callback_lock); - goto out_drop; - } + if (ret) + goto out_unlock_drop; psock_set_prog(&psock->progs.skb_verdict, skb_verdict); psock_set_prog(&psock->progs.skb_parser, skb_parser); sk_psock_start_strp(sk, psock); + } else if (!skb_parser && skb_verdict && !psock->parser.enabled) { + psock_set_prog(&psock->progs.skb_verdict, skb_verdict); + sk_psock_start_verdict(sk,psock); } write_unlock_bh(&sk->sk_callback_lock); return 0; +out_unlock_drop: + write_unlock_bh(&sk->sk_callback_lock); out_drop: sk_psock_put(sk, psock); out_progs: -- cgit v1.2.3