From 13d2618b48f15966d1adfe1ff6a1985f5eef40ba Mon Sep 17 00:00:00 2001 From: Tomas Glozar Date: Fri, 28 Jul 2023 08:44:11 +0200 Subject: bpf: sockmap: Remove preempt_disable in sock_map_sk_acquire Disabling preemption in sock_map_sk_acquire conflicts with GFP_ATOMIC allocation later in sk_psock_init_link on PREEMPT_RT kernels, since GFP_ATOMIC might sleep on RT (see bpf: Make BPF and PREEMPT_RT co-exist patchset notes for details). This causes calling bpf_map_update_elem on BPF_MAP_TYPE_SOCKMAP maps to BUG (sleeping function called from invalid context) on RT kernels. preempt_disable was introduced together with lock_sk and rcu_read_lock in commit 99ba2b5aba24e ("bpf: sockhash, disallow bpf_tcp_close and update in parallel"), probably to match disabled migration of BPF programs, and is no longer necessary. Remove preempt_disable to fix BUG in sock_map_update_common on RT. Signed-off-by: Tomas Glozar Reviewed-by: Jakub Sitnicki Link: https://lore.kernel.org/all/20200224140131.461979697@linutronix.de/ Fixes: 99ba2b5aba24 ("bpf: sockhash, disallow bpf_tcp_close and update in parallel") Reviewed-by: John Fastabend Link: https://lore.kernel.org/r/20230728064411.305576-1-tglozar@redhat.com Signed-off-by: Paolo Abeni --- net/core/sock_map.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'net/core/sock_map.c') diff --git a/net/core/sock_map.c b/net/core/sock_map.c index 19538d628714..08ab108206bf 100644 --- a/net/core/sock_map.c +++ b/net/core/sock_map.c @@ -115,7 +115,6 @@ static void sock_map_sk_acquire(struct sock *sk) __acquires(&sk->sk_lock.slock) { lock_sock(sk); - preempt_disable(); rcu_read_lock(); } @@ -123,7 +122,6 @@ static void sock_map_sk_release(struct sock *sk) __releases(&sk->sk_lock.slock) { rcu_read_unlock(); - preempt_enable(); release_sock(sk); } -- cgit v1.2.3 From 7e96ec0e6605b69bb21bbf6c0ff9051e656ec2b1 Mon Sep 17 00:00:00 2001 From: Xu Kuohai Date: Fri, 4 Aug 2023 03:37:37 -0400 Subject: bpf, sockmap: Fix map type error in sock_map_del_link sock_map_del_link() operates on both SOCKMAP and SOCKHASH, although both types have member named "progs", the offset of "progs" member in these two types is different, so "progs" should be accessed with the real map type. Fixes: 604326b41a6f ("bpf, sockmap: convert to generic sk_msg interface") Signed-off-by: Xu Kuohai Reviewed-by: John Fastabend Link: https://lore.kernel.org/r/20230804073740.194770-2-xukuohai@huaweicloud.com Signed-off-by: Martin KaFai Lau --- net/core/sock_map.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'net/core/sock_map.c') diff --git a/net/core/sock_map.c b/net/core/sock_map.c index 08ab108206bf..8f07fea39d9e 100644 --- a/net/core/sock_map.c +++ b/net/core/sock_map.c @@ -146,13 +146,13 @@ static void sock_map_del_link(struct sock *sk, list_for_each_entry_safe(link, tmp, &psock->link, list) { if (link->link_raw == link_raw) { struct bpf_map *map = link->map; - struct bpf_stab *stab = container_of(map, struct bpf_stab, - map); - if (psock->saved_data_ready && stab->progs.stream_parser) + struct sk_psock_progs *progs = sock_map_progs(map); + + if (psock->saved_data_ready && progs->stream_parser) strp_stop = true; - if (psock->saved_data_ready && stab->progs.stream_verdict) + if (psock->saved_data_ready && progs->stream_verdict) verdict_stop = true; - if (psock->saved_data_ready && stab->progs.skb_verdict) + if (psock->saved_data_ready && progs->skb_verdict) verdict_stop = true; list_del(&link->list); sk_psock_free_link(link); -- cgit v1.2.3 From 35d2b7ffffc1d9b3dc6c761010aa3338da49165b Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Tue, 29 Aug 2023 22:35:17 -0700 Subject: bpf, sockmap: Fix preempt_rt splat when using raw_spin_lock_t Sockmap and sockhash maps are a collection of psocks that are objects representing a socket plus a set of metadata needed to manage the BPF programs associated with the socket. These maps use the stab->lock to protect from concurrent operations on the maps, e.g. trying to insert to objects into the array at the same time in the same slot. Additionally, a sockhash map has a bucket lock to protect iteration and insert/delete into the hash entry. Each psock has a psock->link which is a linked list of all the maps that a psock is attached to. This allows a psock (socket) to be included in multiple sockmap and sockhash maps. This linked list is protected the psock->link_lock. They _must_ be nested correctly to avoid deadlock: lock(stab->lock) : do BPF map operations and psock insert/delete lock(psock->link_lock) : add map to psock linked list of maps unlock(psock->link_lock) unlock(stab->lock) For non PREEMPT_RT kernels both raw_spin_lock_t and spin_lock_t are guaranteed to not sleep. But, with PREEMPT_RT kernels the spin_lock_t variants may sleep. In the current code we have many patterns like this: rcu_critical_section: raw_spin_lock(stab->lock) spin_lock(psock->link_lock) <- may sleep ouch spin_unlock(psock->link_lock) raw_spin_unlock(stab->lock) rcu_critical_section Nesting spin_lock() inside a raw_spin_lock() violates locking rules for PREEMPT_RT kernels. And additionally we do alloc(GFP_ATOMICS) inside the stab->lock, but those might sleep on PREEMPT_RT kernels. The result is splats like this: ./test_progs -t sockmap_basic [ 33.344330] bpf_testmod: loading out-of-tree module taints kernel. [ 33.441933] [ 33.442089] ============================= [ 33.442421] [ BUG: Invalid wait context ] [ 33.442763] 6.5.0-rc5-01731-gec0ded2e0282 #4958 Tainted: G O [ 33.443320] ----------------------------- [ 33.443624] test_progs/2073 is trying to lock: [ 33.443960] ffff888102a1c290 (&psock->link_lock){....}-{3:3}, at: sock_map_update_common+0x2c2/0x3d0 [ 33.444636] other info that might help us debug this: [ 33.444991] context-{5:5} [ 33.445183] 3 locks held by test_progs/2073: [ 33.445498] #0: ffff88811a208d30 (sk_lock-AF_INET){+.+.}-{0:0}, at: sock_map_update_elem_sys+0xff/0x330 [ 33.446159] #1: ffffffff842539e0 (rcu_read_lock){....}-{1:3}, at: sock_map_update_elem_sys+0xf5/0x330 [ 33.446809] #2: ffff88810d687240 (&stab->lock){+...}-{2:2}, at: sock_map_update_common+0x177/0x3d0 [ 33.447445] stack backtrace: [ 33.447655] CPU: 10 PID To fix observe we can't readily remove the allocations (for that we would need to use/create something similar to bpf_map_alloc). So convert raw_spin_lock_t to spin_lock_t. We note that sock_map_update that would trigger the allocate and potential sleep is only allowed through sys_bpf ops and via sock_ops which precludes hw interrupts and low level atomic sections in RT preempt kernel. On non RT preempt kernel there are no changes here and spin locks sections and alloc(GFP_ATOMIC) are still not sleepable. Signed-off-by: John Fastabend Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20230830053517.166611-1-john.fastabend@gmail.com --- net/core/sock_map.c | 36 ++++++++++++++++++------------------ 1 file changed, 18 insertions(+), 18 deletions(-) (limited to 'net/core/sock_map.c') diff --git a/net/core/sock_map.c b/net/core/sock_map.c index 8f07fea39d9e..cb11750b1df5 100644 --- a/net/core/sock_map.c +++ b/net/core/sock_map.c @@ -18,7 +18,7 @@ struct bpf_stab { struct bpf_map map; struct sock **sks; struct sk_psock_progs progs; - raw_spinlock_t lock; + spinlock_t lock; }; #define SOCK_CREATE_FLAG_MASK \ @@ -44,7 +44,7 @@ static struct bpf_map *sock_map_alloc(union bpf_attr *attr) return ERR_PTR(-ENOMEM); bpf_map_init_from_attr(&stab->map, attr); - raw_spin_lock_init(&stab->lock); + spin_lock_init(&stab->lock); stab->sks = bpf_map_area_alloc((u64) stab->map.max_entries * sizeof(struct sock *), @@ -411,7 +411,7 @@ static int __sock_map_delete(struct bpf_stab *stab, struct sock *sk_test, struct sock *sk; int err = 0; - raw_spin_lock_bh(&stab->lock); + spin_lock_bh(&stab->lock); sk = *psk; if (!sk_test || sk_test == sk) sk = xchg(psk, NULL); @@ -421,7 +421,7 @@ static int __sock_map_delete(struct bpf_stab *stab, struct sock *sk_test, else err = -EINVAL; - raw_spin_unlock_bh(&stab->lock); + spin_unlock_bh(&stab->lock); return err; } @@ -487,7 +487,7 @@ static int sock_map_update_common(struct bpf_map *map, u32 idx, psock = sk_psock(sk); WARN_ON_ONCE(!psock); - raw_spin_lock_bh(&stab->lock); + spin_lock_bh(&stab->lock); osk = stab->sks[idx]; if (osk && flags == BPF_NOEXIST) { ret = -EEXIST; @@ -501,10 +501,10 @@ static int sock_map_update_common(struct bpf_map *map, u32 idx, stab->sks[idx] = sk; if (osk) sock_map_unref(osk, &stab->sks[idx]); - raw_spin_unlock_bh(&stab->lock); + spin_unlock_bh(&stab->lock); return 0; out_unlock: - raw_spin_unlock_bh(&stab->lock); + spin_unlock_bh(&stab->lock); if (psock) sk_psock_put(sk, psock); out_free: @@ -835,7 +835,7 @@ struct bpf_shtab_elem { struct bpf_shtab_bucket { struct hlist_head head; - raw_spinlock_t lock; + spinlock_t lock; }; struct bpf_shtab { @@ -910,7 +910,7 @@ static void sock_hash_delete_from_link(struct bpf_map *map, struct sock *sk, * is okay since it's going away only after RCU grace period. * However, we need to check whether it's still present. */ - raw_spin_lock_bh(&bucket->lock); + spin_lock_bh(&bucket->lock); elem_probe = sock_hash_lookup_elem_raw(&bucket->head, elem->hash, elem->key, map->key_size); if (elem_probe && elem_probe == elem) { @@ -918,7 +918,7 @@ static void sock_hash_delete_from_link(struct bpf_map *map, struct sock *sk, sock_map_unref(elem->sk, elem); sock_hash_free_elem(htab, elem); } - raw_spin_unlock_bh(&bucket->lock); + spin_unlock_bh(&bucket->lock); } static long sock_hash_delete_elem(struct bpf_map *map, void *key) @@ -932,7 +932,7 @@ static long sock_hash_delete_elem(struct bpf_map *map, void *key) hash = sock_hash_bucket_hash(key, key_size); bucket = sock_hash_select_bucket(htab, hash); - raw_spin_lock_bh(&bucket->lock); + spin_lock_bh(&bucket->lock); elem = sock_hash_lookup_elem_raw(&bucket->head, hash, key, key_size); if (elem) { hlist_del_rcu(&elem->node); @@ -940,7 +940,7 @@ static long sock_hash_delete_elem(struct bpf_map *map, void *key) sock_hash_free_elem(htab, elem); ret = 0; } - raw_spin_unlock_bh(&bucket->lock); + spin_unlock_bh(&bucket->lock); return ret; } @@ -1000,7 +1000,7 @@ static int sock_hash_update_common(struct bpf_map *map, void *key, hash = sock_hash_bucket_hash(key, key_size); bucket = sock_hash_select_bucket(htab, hash); - raw_spin_lock_bh(&bucket->lock); + spin_lock_bh(&bucket->lock); elem = sock_hash_lookup_elem_raw(&bucket->head, hash, key, key_size); if (elem && flags == BPF_NOEXIST) { ret = -EEXIST; @@ -1026,10 +1026,10 @@ static int sock_hash_update_common(struct bpf_map *map, void *key, sock_map_unref(elem->sk, elem); sock_hash_free_elem(htab, elem); } - raw_spin_unlock_bh(&bucket->lock); + spin_unlock_bh(&bucket->lock); return 0; out_unlock: - raw_spin_unlock_bh(&bucket->lock); + spin_unlock_bh(&bucket->lock); sk_psock_put(sk, psock); out_free: sk_psock_free_link(link); @@ -1115,7 +1115,7 @@ static struct bpf_map *sock_hash_alloc(union bpf_attr *attr) for (i = 0; i < htab->buckets_num; i++) { INIT_HLIST_HEAD(&htab->buckets[i].head); - raw_spin_lock_init(&htab->buckets[i].lock); + spin_lock_init(&htab->buckets[i].lock); } return &htab->map; @@ -1147,11 +1147,11 @@ static void sock_hash_free(struct bpf_map *map) * exists, psock exists and holds a ref to socket. That * lets us to grab a socket ref too. */ - raw_spin_lock_bh(&bucket->lock); + spin_lock_bh(&bucket->lock); hlist_for_each_entry(elem, &bucket->head, node) sock_hold(elem->sk); hlist_move_list(&bucket->head, &unlink_list); - raw_spin_unlock_bh(&bucket->lock); + spin_unlock_bh(&bucket->lock); /* Process removed entries out of atomic context to * block for socket lock before deleting the psock's -- cgit v1.2.3