From 31e03207119a535d0b0e3b3a7f91983aeb2cb14d Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Tue, 23 Jan 2024 09:08:52 -0800 Subject: af_unix: Annotate data-race of gc_in_progress in wait_for_unix_gc(). gc_in_progress is changed under spin_lock(&unix_gc_lock), but wait_for_unix_gc() reads it locklessly. Let's use READ_ONCE(). Fixes: 5f23b734963e ("net: Fix soft lockups/OOM issues w/ unix garbage collector") Signed-off-by: Kuniyuki Iwashima Link: https://lore.kernel.org/r/20240123170856.41348-2-kuniyu@amazon.com Signed-off-by: Jakub Kicinski --- net/unix/garbage.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/unix/garbage.c') diff --git a/net/unix/garbage.c b/net/unix/garbage.c index 2405f0f9af31..af3d2221cf6a 100644 --- a/net/unix/garbage.c +++ b/net/unix/garbage.c @@ -198,7 +198,7 @@ void wait_for_unix_gc(void) if (READ_ONCE(unix_tot_inflight) > UNIX_INFLIGHT_TRIGGER_GC && !READ_ONCE(gc_in_progress)) unix_gc(); - wait_event(unix_gc_wait, gc_in_progress == false); + wait_event(unix_gc_wait, !READ_ONCE(gc_in_progress)); } /* The external entry point: unix_gc() */ -- cgit v1.2.3 From 97af84a6bba2ab2b9c704c08e67de3b5ea551bb2 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Tue, 23 Jan 2024 09:08:53 -0800 Subject: af_unix: Do not use atomic ops for unix_sk(sk)->inflight. When touching unix_sk(sk)->inflight, we are always under spin_lock(&unix_gc_lock). Let's convert unix_sk(sk)->inflight to the normal unsigned long. Signed-off-by: Kuniyuki Iwashima Reviewed-by: Simon Horman Link: https://lore.kernel.org/r/20240123170856.41348-3-kuniyu@amazon.com Signed-off-by: Jakub Kicinski --- include/net/af_unix.h | 2 +- net/unix/af_unix.c | 4 ++-- net/unix/garbage.c | 17 ++++++++--------- net/unix/scm.c | 8 +++++--- 4 files changed, 16 insertions(+), 15 deletions(-) (limited to 'net/unix/garbage.c') diff --git a/include/net/af_unix.h b/include/net/af_unix.h index 49c4640027d8..ac38b63db554 100644 --- a/include/net/af_unix.h +++ b/include/net/af_unix.h @@ -61,7 +61,7 @@ struct unix_sock { struct mutex iolock, bindlock; struct sock *peer; struct list_head link; - atomic_long_t inflight; + unsigned long inflight; spinlock_t lock; unsigned long gc_flags; #define UNIX_GC_CANDIDATE 0 diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index ac1f2bc18fc9..1e9378036dcc 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -993,11 +993,11 @@ static struct sock *unix_create1(struct net *net, struct socket *sock, int kern, sk->sk_write_space = unix_write_space; sk->sk_max_ack_backlog = net->unx.sysctl_max_dgram_qlen; sk->sk_destruct = unix_sock_destructor; - u = unix_sk(sk); + u = unix_sk(sk); + u->inflight = 0; u->path.dentry = NULL; u->path.mnt = NULL; spin_lock_init(&u->lock); - atomic_long_set(&u->inflight, 0); INIT_LIST_HEAD(&u->link); mutex_init(&u->iolock); /* single task reading lock */ mutex_init(&u->bindlock); /* single task binding lock */ diff --git a/net/unix/garbage.c b/net/unix/garbage.c index af3d2221cf6a..051f96a3ab02 100644 --- a/net/unix/garbage.c +++ b/net/unix/garbage.c @@ -166,17 +166,18 @@ static void scan_children(struct sock *x, void (*func)(struct unix_sock *), static void dec_inflight(struct unix_sock *usk) { - atomic_long_dec(&usk->inflight); + usk->inflight--; } static void inc_inflight(struct unix_sock *usk) { - atomic_long_inc(&usk->inflight); + usk->inflight++; } static void inc_inflight_move_tail(struct unix_sock *u) { - atomic_long_inc(&u->inflight); + u->inflight++; + /* If this still might be part of a cycle, move it to the end * of the 
list, so that it's checked even if it was already * passed over @@ -237,14 +238,12 @@ void unix_gc(void) */ list_for_each_entry_safe(u, next, &gc_inflight_list, link) { long total_refs; - long inflight_refs; total_refs = file_count(u->sk.sk_socket->file); - inflight_refs = atomic_long_read(&u->inflight); - BUG_ON(inflight_refs < 1); - BUG_ON(total_refs < inflight_refs); - if (total_refs == inflight_refs) { + BUG_ON(!u->inflight); + BUG_ON(total_refs < u->inflight); + if (total_refs == u->inflight) { list_move_tail(&u->link, &gc_candidates); __set_bit(UNIX_GC_CANDIDATE, &u->gc_flags); __set_bit(UNIX_GC_MAYBE_CYCLE, &u->gc_flags); @@ -271,7 +270,7 @@ void unix_gc(void) /* Move cursor to after the current position. */ list_move(&cursor, &u->link); - if (atomic_long_read(&u->inflight) > 0) { + if (u->inflight) { list_move_tail(&u->link, ¬_cycle_list); __clear_bit(UNIX_GC_MAYBE_CYCLE, &u->gc_flags); scan_children(&u->sk, inc_inflight_move_tail, NULL); diff --git a/net/unix/scm.c b/net/unix/scm.c index 822ce0d0d791..e92f2fad6410 100644 --- a/net/unix/scm.c +++ b/net/unix/scm.c @@ -53,12 +53,13 @@ void unix_inflight(struct user_struct *user, struct file *fp) if (s) { struct unix_sock *u = unix_sk(s); - if (atomic_long_inc_return(&u->inflight) == 1) { + if (!u->inflight) { BUG_ON(!list_empty(&u->link)); list_add_tail(&u->link, &gc_inflight_list); } else { BUG_ON(list_empty(&u->link)); } + u->inflight++; /* Paired with READ_ONCE() in wait_for_unix_gc() */ WRITE_ONCE(unix_tot_inflight, unix_tot_inflight + 1); } @@ -75,10 +76,11 @@ void unix_notinflight(struct user_struct *user, struct file *fp) if (s) { struct unix_sock *u = unix_sk(s); - BUG_ON(!atomic_long_read(&u->inflight)); + BUG_ON(!u->inflight); BUG_ON(list_empty(&u->link)); - if (atomic_long_dec_and_test(&u->inflight)) + u->inflight--; + if (!u->inflight) list_del_init(&u->link); /* Paired with READ_ONCE() in wait_for_unix_gc() */ WRITE_ONCE(unix_tot_inflight, unix_tot_inflight - 1); -- cgit v1.2.3 From 5b17307bd0789edea0675d524a2b277b93bbde62 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Tue, 23 Jan 2024 09:08:54 -0800 Subject: af_unix: Return struct unix_sock from unix_get_socket(). Currently, unix_get_socket() returns struct sock, but after calling it, we always cast it to unix_sk(). Let's return struct unix_sock from unix_get_socket(). 
Signed-off-by: Kuniyuki Iwashima Acked-by: Pavel Begunkov Reviewed-by: Simon Horman Link: https://lore.kernel.org/r/20240123170856.41348-4-kuniyu@amazon.com Signed-off-by: Jakub Kicinski --- include/net/af_unix.h | 2 +- net/unix/garbage.c | 19 +++++++------------ net/unix/scm.c | 19 +++++++------------ 3 files changed, 15 insertions(+), 25 deletions(-) (limited to 'net/unix/garbage.c') diff --git a/include/net/af_unix.h b/include/net/af_unix.h index ac38b63db554..2c98ef95017b 100644 --- a/include/net/af_unix.h +++ b/include/net/af_unix.h @@ -14,7 +14,7 @@ void unix_destruct_scm(struct sk_buff *skb); void io_uring_destruct_scm(struct sk_buff *skb); void unix_gc(void); void wait_for_unix_gc(void); -struct sock *unix_get_socket(struct file *filp); +struct unix_sock *unix_get_socket(struct file *filp); struct sock *unix_peer_get(struct sock *sk); #define UNIX_HASH_MOD (256 - 1) diff --git a/net/unix/garbage.c b/net/unix/garbage.c index 051f96a3ab02..d5ef46a75f1f 100644 --- a/net/unix/garbage.c +++ b/net/unix/garbage.c @@ -105,20 +105,15 @@ static void scan_inflight(struct sock *x, void (*func)(struct unix_sock *), while (nfd--) { /* Get the socket the fd matches if it indeed does so */ - struct sock *sk = unix_get_socket(*fp++); + struct unix_sock *u = unix_get_socket(*fp++); - if (sk) { - struct unix_sock *u = unix_sk(sk); + /* Ignore non-candidates, they could have been added + * to the queues after starting the garbage collection + */ + if (u && test_bit(UNIX_GC_CANDIDATE, &u->gc_flags)) { + hit = true; - /* Ignore non-candidates, they could - * have been added to the queues after - * starting the garbage collection - */ - if (test_bit(UNIX_GC_CANDIDATE, &u->gc_flags)) { - hit = true; - - func(u); - } + func(u); } } if (hit && hitlist != NULL) { diff --git a/net/unix/scm.c b/net/unix/scm.c index e92f2fad6410..b5ae5ab16777 100644 --- a/net/unix/scm.c +++ b/net/unix/scm.c @@ -21,9 +21,8 @@ EXPORT_SYMBOL(gc_inflight_list); DEFINE_SPINLOCK(unix_gc_lock); EXPORT_SYMBOL(unix_gc_lock); -struct sock *unix_get_socket(struct file *filp) +struct unix_sock *unix_get_socket(struct file *filp) { - struct sock *u_sock = NULL; struct inode *inode = file_inode(filp); /* Socket ? */ @@ -34,10 +33,10 @@ struct sock *unix_get_socket(struct file *filp) /* PF_UNIX ? */ if (s && ops && ops->family == PF_UNIX) - u_sock = s; + return unix_sk(s); } - return u_sock; + return NULL; } EXPORT_SYMBOL(unix_get_socket); @@ -46,13 +45,11 @@ EXPORT_SYMBOL(unix_get_socket); */ void unix_inflight(struct user_struct *user, struct file *fp) { - struct sock *s = unix_get_socket(fp); + struct unix_sock *u = unix_get_socket(fp); spin_lock(&unix_gc_lock); - if (s) { - struct unix_sock *u = unix_sk(s); - + if (u) { if (!u->inflight) { BUG_ON(!list_empty(&u->link)); list_add_tail(&u->link, &gc_inflight_list); @@ -69,13 +66,11 @@ void unix_inflight(struct user_struct *user, struct file *fp) void unix_notinflight(struct user_struct *user, struct file *fp) { - struct sock *s = unix_get_socket(fp); + struct unix_sock *u = unix_get_socket(fp); spin_lock(&unix_gc_lock); - if (s) { - struct unix_sock *u = unix_sk(s); - + if (u) { BUG_ON(!u->inflight); BUG_ON(list_empty(&u->link)); -- cgit v1.2.3 From 8b90a9f819dc2a06baae4ec1a64d875e53b824ec Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Tue, 23 Jan 2024 09:08:55 -0800 Subject: af_unix: Run GC on only one CPU. If more than 16000 inflight AF_UNIX sockets exist and the garbage collector is not running, unix_(dgram|stream)_sendmsg() call unix_gc(). 
Also, they wait for unix_gc() to complete. In unix_gc(), all inflight AF_UNIX sockets are traversed at least once, and more if they are the GC candidate. Thus, sendmsg() significantly slows down with too many inflight AF_UNIX sockets. There is a small window to invoke multiple unix_gc() instances, which will then be blocked by the same spinlock except for one. Let's convert unix_gc() to use struct work so that it will not consume CPUs unnecessarily. Note WRITE_ONCE(gc_in_progress, true) is moved before running GC. If we leave the WRITE_ONCE() as is and use the following test to call flush_work(), a process might not call it. CPU 0 CPU 1 --- --- start work and call __unix_gc() if (work_pending(&unix_gc_work) || <-- false READ_ONCE(gc_in_progress)) <-- false flush_work(); <-- missed! WRITE_ONCE(gc_in_progress, true) Signed-off-by: Kuniyuki Iwashima Link: https://lore.kernel.org/r/20240123170856.41348-5-kuniyu@amazon.com Signed-off-by: Jakub Kicinski --- net/unix/garbage.c | 54 +++++++++++++++++++++++++++--------------------------- 1 file changed, 27 insertions(+), 27 deletions(-) (limited to 'net/unix/garbage.c') diff --git a/net/unix/garbage.c b/net/unix/garbage.c index d5ef46a75f1f..3f599db59f9d 100644 --- a/net/unix/garbage.c +++ b/net/unix/garbage.c @@ -86,7 +86,6 @@ /* Internal data structures and random procedures: */ static LIST_HEAD(gc_candidates); -static DECLARE_WAIT_QUEUE_HEAD(unix_gc_wait); static void scan_inflight(struct sock *x, void (*func)(struct unix_sock *), struct sk_buff_head *hitlist) @@ -182,23 +181,8 @@ static void inc_inflight_move_tail(struct unix_sock *u) } static bool gc_in_progress; -#define UNIX_INFLIGHT_TRIGGER_GC 16000 - -void wait_for_unix_gc(void) -{ - /* If number of inflight sockets is insane, - * force a garbage collect right now. - * Paired with the WRITE_ONCE() in unix_inflight(), - * unix_notinflight() and gc_in_progress(). - */ - if (READ_ONCE(unix_tot_inflight) > UNIX_INFLIGHT_TRIGGER_GC && - !READ_ONCE(gc_in_progress)) - unix_gc(); - wait_event(unix_gc_wait, !READ_ONCE(gc_in_progress)); -} -/* The external entry point: unix_gc() */ -void unix_gc(void) +static void __unix_gc(struct work_struct *work) { struct sk_buff *next_skb, *skb; struct unix_sock *u; @@ -209,13 +193,6 @@ void unix_gc(void) spin_lock(&unix_gc_lock); - /* Avoid a recursive GC. */ - if (gc_in_progress) - goto out; - - /* Paired with READ_ONCE() in wait_for_unix_gc(). */ - WRITE_ONCE(gc_in_progress, true); - /* First, select candidates for garbage collection. Only * in-flight sockets are considered, and from those only ones * which don't have any external reference. @@ -322,8 +299,31 @@ void unix_gc(void) /* Paired with READ_ONCE() in wait_for_unix_gc(). */ WRITE_ONCE(gc_in_progress, false); - wake_up(&unix_gc_wait); - - out: spin_unlock(&unix_gc_lock); } + +static DECLARE_WORK(unix_gc_work, __unix_gc); + +void unix_gc(void) +{ + WRITE_ONCE(gc_in_progress, true); + queue_work(system_unbound_wq, &unix_gc_work); +} + +#define UNIX_INFLIGHT_TRIGGER_GC 16000 + +void wait_for_unix_gc(void) +{ + /* If number of inflight sockets is insane, + * force a garbage collect right now. + * + * Paired with the WRITE_ONCE() in unix_inflight(), + * unix_notinflight(), and __unix_gc(). 
+ */ + if (READ_ONCE(unix_tot_inflight) > UNIX_INFLIGHT_TRIGGER_GC && + !READ_ONCE(gc_in_progress)) + unix_gc(); + + if (READ_ONCE(gc_in_progress)) + flush_work(&unix_gc_work); +} -- cgit v1.2.3 From d9f21b3613337b55cc9d4a6ead484dca68475143 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Tue, 23 Jan 2024 09:08:56 -0800 Subject: af_unix: Try to run GC async. If more than 16000 inflight AF_UNIX sockets exist and the garbage collector is not running, unix_(dgram|stream)_sendmsg() call unix_gc(). Also, they wait for unix_gc() to complete. In unix_gc(), all inflight AF_UNIX sockets are traversed at least once, and more if they are the GC candidate. Thus, sendmsg() significantly slows down with too many inflight AF_UNIX sockets. However, if a process sends data with no AF_UNIX FD, the sendmsg() call does not need to wait for GC. After this change, only the process that meets the condition below will be blocked under such a situation. 1) cmsg contains AF_UNIX socket 2) more than 32 AF_UNIX sent by the same user are still inflight Note that even a sendmsg() call that does not meet the condition but has AF_UNIX FD will be blocked later in unix_scm_to_skb() by the spinlock, but we allow that as a bonus for sane users. The results below are the time spent in unix_dgram_sendmsg() sending 1 byte of data with no FD 4096 times on a host where 32K inflight AF_UNIX sockets exist. Without series: the sane sendmsg() needs to wait gc unreasonably. $ sudo /usr/share/bcc/tools/funclatency -p 11165 unix_dgram_sendmsg Tracing 1 functions for "unix_dgram_sendmsg"... Hit Ctrl-C to end. ^C nsecs : count distribution [...] 524288 -> 1048575 : 0 | | 1048576 -> 2097151 : 3881 |****************************************| 2097152 -> 4194303 : 214 |** | 4194304 -> 8388607 : 1 | | avg = 1825567 nsecs, total: 7477526027 nsecs, count: 4096 With series: the sane sendmsg() can finish much faster. $ sudo /usr/share/bcc/tools/funclatency -p 8702 unix_dgram_sendmsg Tracing 1 functions for "unix_dgram_sendmsg"... Hit Ctrl-C to end. ^C nsecs : count distribution [...] 
128 -> 255 : 0 | | 256 -> 511 : 4092 |****************************************| 512 -> 1023 : 2 | | 1024 -> 2047 : 0 | | 2048 -> 4095 : 0 | | 4096 -> 8191 : 1 | | 8192 -> 16383 : 1 | | avg = 410 nsecs, total: 1680510 nsecs, count: 4096 Signed-off-by: Kuniyuki Iwashima Link: https://lore.kernel.org/r/20240123170856.41348-6-kuniyu@amazon.com Signed-off-by: Jakub Kicinski --- include/net/af_unix.h | 12 ++++++++++-- include/net/scm.h | 1 + net/core/scm.c | 5 +++++ net/unix/af_unix.c | 6 ++++-- net/unix/garbage.c | 10 +++++++++- 5 files changed, 29 insertions(+), 5 deletions(-) (limited to 'net/unix/garbage.c') diff --git a/include/net/af_unix.h b/include/net/af_unix.h index 2c98ef95017b..f045bbd9017d 100644 --- a/include/net/af_unix.h +++ b/include/net/af_unix.h @@ -8,13 +8,21 @@ #include #include +#if IS_ENABLED(CONFIG_UNIX) +struct unix_sock *unix_get_socket(struct file *filp); +#else +static inline struct unix_sock *unix_get_socket(struct file *filp) +{ + return NULL; +} +#endif + void unix_inflight(struct user_struct *user, struct file *fp); void unix_notinflight(struct user_struct *user, struct file *fp); void unix_destruct_scm(struct sk_buff *skb); void io_uring_destruct_scm(struct sk_buff *skb); void unix_gc(void); -void wait_for_unix_gc(void); -struct unix_sock *unix_get_socket(struct file *filp); +void wait_for_unix_gc(struct scm_fp_list *fpl); struct sock *unix_peer_get(struct sock *sk); #define UNIX_HASH_MOD (256 - 1) diff --git a/include/net/scm.h b/include/net/scm.h index cf68acec4d70..92276a2c5543 100644 --- a/include/net/scm.h +++ b/include/net/scm.h @@ -25,6 +25,7 @@ struct scm_creds { struct scm_fp_list { short count; + short count_unix; short max; struct user_struct *user; struct file *fp[SCM_MAX_FD]; diff --git a/net/core/scm.c b/net/core/scm.c index d0e0852a24d5..9cd4b0a01cd6 100644 --- a/net/core/scm.c +++ b/net/core/scm.c @@ -36,6 +36,7 @@ #include #include #include +#include /* @@ -85,6 +86,7 @@ static int scm_fp_copy(struct cmsghdr *cmsg, struct scm_fp_list **fplp) return -ENOMEM; *fplp = fpl; fpl->count = 0; + fpl->count_unix = 0; fpl->max = SCM_MAX_FD; fpl->user = NULL; } @@ -109,6 +111,9 @@ static int scm_fp_copy(struct cmsghdr *cmsg, struct scm_fp_list **fplp) fput(file); return -EINVAL; } + if (unix_get_socket(file)) + fpl->count_unix++; + *fpp++ = file; fpl->count++; } diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 1e9378036dcc..1720419d93d6 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -1923,11 +1923,12 @@ static int unix_dgram_sendmsg(struct socket *sock, struct msghdr *msg, long timeo; int err; - wait_for_unix_gc(); err = scm_send(sock, msg, &scm, false); if (err < 0) return err; + wait_for_unix_gc(scm.fp); + err = -EOPNOTSUPP; if (msg->msg_flags&MSG_OOB) goto out; @@ -2199,11 +2200,12 @@ static int unix_stream_sendmsg(struct socket *sock, struct msghdr *msg, bool fds_sent = false; int data_len; - wait_for_unix_gc(); err = scm_send(sock, msg, &scm, false); if (err < 0) return err; + wait_for_unix_gc(scm.fp); + err = -EOPNOTSUPP; if (msg->msg_flags & MSG_OOB) { #if IS_ENABLED(CONFIG_AF_UNIX_OOB) diff --git a/net/unix/garbage.c b/net/unix/garbage.c index 3f599db59f9d..4046c606f0e6 100644 --- a/net/unix/garbage.c +++ b/net/unix/garbage.c @@ -311,8 +311,9 @@ void unix_gc(void) } #define UNIX_INFLIGHT_TRIGGER_GC 16000 +#define UNIX_INFLIGHT_SANE_USER (SCM_MAX_FD * 8) -void wait_for_unix_gc(void) +void wait_for_unix_gc(struct scm_fp_list *fpl) { /* If number of inflight sockets is insane, * force a garbage collect right now. 
@@ -324,6 +325,13 @@ void wait_for_unix_gc(void) !READ_ONCE(gc_in_progress)) unix_gc(); + /* Penalise users who want to send AF_UNIX sockets + * but whose sockets have not been received yet. + */ + if (!fpl || !fpl->count_unix || + READ_ONCE(fpl->user->unix_inflight) < UNIX_INFLIGHT_SANE_USER) + return; + if (READ_ONCE(gc_in_progress)) flush_work(&unix_gc_work); } -- cgit v1.2.3 From d0f6dc26346863e1f4a23117f5468614e54df064 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Mon, 29 Jan 2024 11:04:33 -0800 Subject: af_unix: Replace BUG_ON() with WARN_ON_ONCE(). This is a prep patch for the last patch in this series so that checkpatch will not warn about BUG_ON(). Signed-off-by: Kuniyuki Iwashima Acked-by: Jens Axboe Link: https://lore.kernel.org/r/20240129190435.57228-2-kuniyu@amazon.com Signed-off-by: Jakub Kicinski --- net/unix/garbage.c | 8 ++++---- net/unix/scm.c | 8 ++++---- 2 files changed, 8 insertions(+), 8 deletions(-) (limited to 'net/unix/garbage.c') diff --git a/net/unix/garbage.c b/net/unix/garbage.c index 4046c606f0e6..af676bb8fb67 100644 --- a/net/unix/garbage.c +++ b/net/unix/garbage.c @@ -145,7 +145,7 @@ static void scan_children(struct sock *x, void (*func)(struct unix_sock *), /* An embryo cannot be in-flight, so it's safe * to use the list link. */ - BUG_ON(!list_empty(&u->link)); + WARN_ON_ONCE(!list_empty(&u->link)); list_add_tail(&u->link, &embryos); } spin_unlock(&x->sk_receive_queue.lock); @@ -213,8 +213,8 @@ static void __unix_gc(struct work_struct *work) total_refs = file_count(u->sk.sk_socket->file); - BUG_ON(!u->inflight); - BUG_ON(total_refs < u->inflight); + WARN_ON_ONCE(!u->inflight); + WARN_ON_ONCE(total_refs < u->inflight); if (total_refs == u->inflight) { list_move_tail(&u->link, &gc_candidates); __set_bit(UNIX_GC_CANDIDATE, &u->gc_flags); @@ -294,7 +294,7 @@ static void __unix_gc(struct work_struct *work) list_move_tail(&u->link, &gc_inflight_list); /* All candidates should have been detached by now. */ - BUG_ON(!list_empty(&gc_candidates)); + WARN_ON_ONCE(!list_empty(&gc_candidates)); /* Paired with READ_ONCE() in wait_for_unix_gc(). */ WRITE_ONCE(gc_in_progress, false); diff --git a/net/unix/scm.c b/net/unix/scm.c index b5ae5ab16777..505e56cf02a2 100644 --- a/net/unix/scm.c +++ b/net/unix/scm.c @@ -51,10 +51,10 @@ void unix_inflight(struct user_struct *user, struct file *fp) if (u) { if (!u->inflight) { - BUG_ON(!list_empty(&u->link)); + WARN_ON_ONCE(!list_empty(&u->link)); list_add_tail(&u->link, &gc_inflight_list); } else { - BUG_ON(list_empty(&u->link)); + WARN_ON_ONCE(list_empty(&u->link)); } u->inflight++; /* Paired with READ_ONCE() in wait_for_unix_gc() */ @@ -71,8 +71,8 @@ void unix_notinflight(struct user_struct *user, struct file *fp) spin_lock(&unix_gc_lock); if (u) { - BUG_ON(!u->inflight); - BUG_ON(list_empty(&u->link)); + WARN_ON_ONCE(!u->inflight); + WARN_ON_ONCE(list_empty(&u->link)); u->inflight--; if (!u->inflight) -- cgit v1.2.3 From 11498715f266a3fb4caabba9dd575636cbcaa8f1 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Mon, 29 Jan 2024 11:04:34 -0800 Subject: af_unix: Remove io_uring code for GC. Since commit 705318a99a13 ("io_uring/af_unix: disable sending io_uring over sockets"), io_uring's unix socket cannot be passed via SCM_RIGHTS, so it does not contribute to cyclic reference and no longer be candidate for garbage collection. Also, commit 6e5e6d274956 ("io_uring: drop any code related to SCM_RIGHTS") cleaned up SCM_RIGHTS code in io_uring. 
Let's do it in AF_UNIX as well by reverting commit 0091bfc81741 ("io_uring/af_unix: defer registered files gc to io_uring release") and commit 10369080454d ("net: reclaim skb->scm_io_uring bit"). Signed-off-by: Kuniyuki Iwashima Acked-by: Jens Axboe Link: https://lore.kernel.org/r/20240129190435.57228-3-kuniyu@amazon.com Signed-off-by: Jakub Kicinski --- include/net/af_unix.h | 1 - net/unix/garbage.c | 25 ++----------------------- net/unix/scm.c | 6 ------ 3 files changed, 2 insertions(+), 30 deletions(-) (limited to 'net/unix/garbage.c') diff --git a/include/net/af_unix.h b/include/net/af_unix.h index f045bbd9017d..9e39b2ec4524 100644 --- a/include/net/af_unix.h +++ b/include/net/af_unix.h @@ -20,7 +20,6 @@ static inline struct unix_sock *unix_get_socket(struct file *filp) void unix_inflight(struct user_struct *user, struct file *fp); void unix_notinflight(struct user_struct *user, struct file *fp); void unix_destruct_scm(struct sk_buff *skb); -void io_uring_destruct_scm(struct sk_buff *skb); void unix_gc(void); void wait_for_unix_gc(struct scm_fp_list *fpl); struct sock *unix_peer_get(struct sock *sk); diff --git a/net/unix/garbage.c b/net/unix/garbage.c index af676bb8fb67..ce5b5f87b16e 100644 --- a/net/unix/garbage.c +++ b/net/unix/garbage.c @@ -184,12 +184,10 @@ static bool gc_in_progress; static void __unix_gc(struct work_struct *work) { - struct sk_buff *next_skb, *skb; - struct unix_sock *u; - struct unix_sock *next; struct sk_buff_head hitlist; - struct list_head cursor; + struct unix_sock *u, *next; LIST_HEAD(not_cycle_list); + struct list_head cursor; spin_lock(&unix_gc_lock); @@ -269,30 +267,11 @@ static void __unix_gc(struct work_struct *work) spin_unlock(&unix_gc_lock); - /* We need io_uring to clean its registered files, ignore all io_uring - * originated skbs. It's fine as io_uring doesn't keep references to - * other io_uring instances and so killing all other files in the cycle - * will put all io_uring references forcing it to go through normal - * release.path eventually putting registered files. - */ - skb_queue_walk_safe(&hitlist, skb, next_skb) { - if (skb->destructor == io_uring_destruct_scm) { - __skb_unlink(skb, &hitlist); - skb_queue_tail(&skb->sk->sk_receive_queue, skb); - } - } - /* Here we are. Hitlist is filled. Die. */ __skb_queue_purge(&hitlist); spin_lock(&unix_gc_lock); - /* There could be io_uring registered files, just push them back to - * the inflight list - */ - list_for_each_entry_safe(u, next, &gc_candidates, link) - list_move_tail(&u->link, &gc_inflight_list); - /* All candidates should have been detached by now. */ WARN_ON_ONCE(!list_empty(&gc_candidates)); diff --git a/net/unix/scm.c b/net/unix/scm.c index 505e56cf02a2..db65b0ab5947 100644 --- a/net/unix/scm.c +++ b/net/unix/scm.c @@ -148,9 +148,3 @@ void unix_destruct_scm(struct sk_buff *skb) sock_wfree(skb); } EXPORT_SYMBOL(unix_destruct_scm); - -void io_uring_destruct_scm(struct sk_buff *skb) -{ - unix_destruct_scm(skb); -} -EXPORT_SYMBOL(io_uring_destruct_scm); -- cgit v1.2.3 From 99a7a5b9943ea2d05fb0dee38e4ae2290477ed83 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Mon, 29 Jan 2024 11:04:35 -0800 Subject: af_unix: Remove CONFIG_UNIX_SCM. Originally, the code related to garbage collection was all in garbage.c. Commit f4e65870e5ce ("net: split out functions related to registering inflight socket files") moved some functions to scm.c for io_uring and added CONFIG_UNIX_SCM just in case AF_UNIX was built as module. 
However, since commit 97154bcf4d1b ("af_unix: Kconfig: make CONFIG_UNIX bool"), AF_UNIX is no longer built separately. Also, io_uring does not support SCM_RIGHTS now. Let's move the functions back to garbage.c Signed-off-by: Kuniyuki Iwashima Acked-by: Jens Axboe Link: https://lore.kernel.org/r/20240129190435.57228-4-kuniyu@amazon.com Signed-off-by: Jakub Kicinski --- include/net/af_unix.h | 7 ++- net/Makefile | 2 +- net/unix/Kconfig | 5 -- net/unix/Makefile | 2 - net/unix/af_unix.c | 63 ++++++++++++++++++++- net/unix/garbage.c | 73 +++++++++++++++++++++++- net/unix/scm.c | 150 -------------------------------------------------- net/unix/scm.h | 10 ---- 8 files changed, 137 insertions(+), 175 deletions(-) delete mode 100644 net/unix/scm.c delete mode 100644 net/unix/scm.h (limited to 'net/unix/garbage.c') diff --git a/include/net/af_unix.h b/include/net/af_unix.h index 9e39b2ec4524..54e346152eb1 100644 --- a/include/net/af_unix.h +++ b/include/net/af_unix.h @@ -17,19 +17,20 @@ static inline struct unix_sock *unix_get_socket(struct file *filp) } #endif +extern spinlock_t unix_gc_lock; +extern unsigned int unix_tot_inflight; + void unix_inflight(struct user_struct *user, struct file *fp); void unix_notinflight(struct user_struct *user, struct file *fp); -void unix_destruct_scm(struct sk_buff *skb); void unix_gc(void); void wait_for_unix_gc(struct scm_fp_list *fpl); + struct sock *unix_peer_get(struct sock *sk); #define UNIX_HASH_MOD (256 - 1) #define UNIX_HASH_SIZE (256 * 2) #define UNIX_HASH_BITS 8 -extern unsigned int unix_tot_inflight; - struct unix_address { refcount_t refcnt; int len; diff --git a/net/Makefile b/net/Makefile index b06b5539e7a6..65bb8c72a35e 100644 --- a/net/Makefile +++ b/net/Makefile @@ -17,7 +17,7 @@ obj-$(CONFIG_NETFILTER) += netfilter/ obj-$(CONFIG_INET) += ipv4/ obj-$(CONFIG_TLS) += tls/ obj-$(CONFIG_XFRM) += xfrm/ -obj-$(CONFIG_UNIX_SCM) += unix/ +obj-$(CONFIG_UNIX) += unix/ obj-y += ipv6/ obj-$(CONFIG_PACKET) += packet/ obj-$(CONFIG_NET_KEY) += key/ diff --git a/net/unix/Kconfig b/net/unix/Kconfig index 28b232f281ab..8b5d04210d7c 100644 --- a/net/unix/Kconfig +++ b/net/unix/Kconfig @@ -16,11 +16,6 @@ config UNIX Say Y unless you know what you are doing. -config UNIX_SCM - bool - depends on UNIX - default y - config AF_UNIX_OOB bool depends on UNIX diff --git a/net/unix/Makefile b/net/unix/Makefile index 20491825b4d0..4ddd125c4642 100644 --- a/net/unix/Makefile +++ b/net/unix/Makefile @@ -11,5 +11,3 @@ unix-$(CONFIG_BPF_SYSCALL) += unix_bpf.o obj-$(CONFIG_UNIX_DIAG) += unix_diag.o unix_diag-y := diag.o - -obj-$(CONFIG_UNIX_SCM) += scm.o diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 1720419d93d6..1cfbc586adb4 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -118,8 +118,6 @@ #include #include -#include "scm.h" - static atomic_long_t unix_nr_socks; static struct hlist_head bsd_socket_buckets[UNIX_HASH_SIZE / 2]; static spinlock_t bsd_socket_locks[UNIX_HASH_SIZE / 2]; @@ -1790,6 +1788,52 @@ out: return err; } +/* The "user->unix_inflight" variable is protected by the garbage + * collection lock, and we just read it locklessly here. If you go + * over the limit, there might be a tiny race in actually noticing + * it across threads. Tough. 
+ */ +static inline bool too_many_unix_fds(struct task_struct *p) +{ + struct user_struct *user = current_user(); + + if (unlikely(READ_ONCE(user->unix_inflight) > task_rlimit(p, RLIMIT_NOFILE))) + return !capable(CAP_SYS_RESOURCE) && !capable(CAP_SYS_ADMIN); + return false; +} + +static int unix_attach_fds(struct scm_cookie *scm, struct sk_buff *skb) +{ + int i; + + if (too_many_unix_fds(current)) + return -ETOOMANYREFS; + + /* Need to duplicate file references for the sake of garbage + * collection. Otherwise a socket in the fps might become a + * candidate for GC while the skb is not yet queued. + */ + UNIXCB(skb).fp = scm_fp_dup(scm->fp); + if (!UNIXCB(skb).fp) + return -ENOMEM; + + for (i = scm->fp->count - 1; i >= 0; i--) + unix_inflight(scm->fp->user, scm->fp->fp[i]); + + return 0; +} + +static void unix_detach_fds(struct scm_cookie *scm, struct sk_buff *skb) +{ + int i; + + scm->fp = UNIXCB(skb).fp; + UNIXCB(skb).fp = NULL; + + for (i = scm->fp->count - 1; i >= 0; i--) + unix_notinflight(scm->fp->user, scm->fp->fp[i]); +} + static void unix_peek_fds(struct scm_cookie *scm, struct sk_buff *skb) { scm->fp = scm_fp_dup(UNIXCB(skb).fp); @@ -1837,6 +1881,21 @@ static void unix_peek_fds(struct scm_cookie *scm, struct sk_buff *skb) spin_unlock(&unix_gc_lock); } +static void unix_destruct_scm(struct sk_buff *skb) +{ + struct scm_cookie scm; + + memset(&scm, 0, sizeof(scm)); + scm.pid = UNIXCB(skb).pid; + if (UNIXCB(skb).fp) + unix_detach_fds(&scm, skb); + + /* Alas, it calls VFS */ + /* So fscking what? fput() had been SMP-safe since the last Summer */ + scm_destroy(&scm); + sock_wfree(skb); +} + static int unix_scm_to_skb(struct scm_cookie *scm, struct sk_buff *skb, bool send_fds) { int err = 0; diff --git a/net/unix/garbage.c b/net/unix/garbage.c index ce5b5f87b16e..9b8473dd79a4 100644 --- a/net/unix/garbage.c +++ b/net/unix/garbage.c @@ -81,11 +81,80 @@ #include #include -#include "scm.h" +struct unix_sock *unix_get_socket(struct file *filp) +{ + struct inode *inode = file_inode(filp); + + /* Socket ? */ + if (S_ISSOCK(inode->i_mode) && !(filp->f_mode & FMODE_PATH)) { + struct socket *sock = SOCKET_I(inode); + const struct proto_ops *ops; + struct sock *sk = sock->sk; -/* Internal data structures and random procedures: */ + ops = READ_ONCE(sock->ops); + /* PF_UNIX ? */ + if (sk && ops && ops->family == PF_UNIX) + return unix_sk(sk); + } + + return NULL; +} + +DEFINE_SPINLOCK(unix_gc_lock); +unsigned int unix_tot_inflight; static LIST_HEAD(gc_candidates); +static LIST_HEAD(gc_inflight_list); + +/* Keep the number of times in flight count for the file + * descriptor if it is for an AF_UNIX socket. 
+ */ +void unix_inflight(struct user_struct *user, struct file *filp) +{ + struct unix_sock *u = unix_get_socket(filp); + + spin_lock(&unix_gc_lock); + + if (u) { + if (!u->inflight) { + WARN_ON_ONCE(!list_empty(&u->link)); + list_add_tail(&u->link, &gc_inflight_list); + } else { + WARN_ON_ONCE(list_empty(&u->link)); + } + u->inflight++; + + /* Paired with READ_ONCE() in wait_for_unix_gc() */ + WRITE_ONCE(unix_tot_inflight, unix_tot_inflight + 1); + } + + WRITE_ONCE(user->unix_inflight, user->unix_inflight + 1); + + spin_unlock(&unix_gc_lock); +} + +void unix_notinflight(struct user_struct *user, struct file *filp) +{ + struct unix_sock *u = unix_get_socket(filp); + + spin_lock(&unix_gc_lock); + + if (u) { + WARN_ON_ONCE(!u->inflight); + WARN_ON_ONCE(list_empty(&u->link)); + + u->inflight--; + if (!u->inflight) + list_del_init(&u->link); + + /* Paired with READ_ONCE() in wait_for_unix_gc() */ + WRITE_ONCE(unix_tot_inflight, unix_tot_inflight - 1); + } + + WRITE_ONCE(user->unix_inflight, user->unix_inflight - 1); + + spin_unlock(&unix_gc_lock); +} static void scan_inflight(struct sock *x, void (*func)(struct unix_sock *), struct sk_buff_head *hitlist) diff --git a/net/unix/scm.c b/net/unix/scm.c deleted file mode 100644 index db65b0ab5947..000000000000 --- a/net/unix/scm.c +++ /dev/null @@ -1,150 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "scm.h" - -unsigned int unix_tot_inflight; -EXPORT_SYMBOL(unix_tot_inflight); - -LIST_HEAD(gc_inflight_list); -EXPORT_SYMBOL(gc_inflight_list); - -DEFINE_SPINLOCK(unix_gc_lock); -EXPORT_SYMBOL(unix_gc_lock); - -struct unix_sock *unix_get_socket(struct file *filp) -{ - struct inode *inode = file_inode(filp); - - /* Socket ? */ - if (S_ISSOCK(inode->i_mode) && !(filp->f_mode & FMODE_PATH)) { - struct socket *sock = SOCKET_I(inode); - const struct proto_ops *ops = READ_ONCE(sock->ops); - struct sock *s = sock->sk; - - /* PF_UNIX ? */ - if (s && ops && ops->family == PF_UNIX) - return unix_sk(s); - } - - return NULL; -} -EXPORT_SYMBOL(unix_get_socket); - -/* Keep the number of times in flight count for the file - * descriptor if it is for an AF_UNIX socket. - */ -void unix_inflight(struct user_struct *user, struct file *fp) -{ - struct unix_sock *u = unix_get_socket(fp); - - spin_lock(&unix_gc_lock); - - if (u) { - if (!u->inflight) { - WARN_ON_ONCE(!list_empty(&u->link)); - list_add_tail(&u->link, &gc_inflight_list); - } else { - WARN_ON_ONCE(list_empty(&u->link)); - } - u->inflight++; - /* Paired with READ_ONCE() in wait_for_unix_gc() */ - WRITE_ONCE(unix_tot_inflight, unix_tot_inflight + 1); - } - WRITE_ONCE(user->unix_inflight, user->unix_inflight + 1); - spin_unlock(&unix_gc_lock); -} - -void unix_notinflight(struct user_struct *user, struct file *fp) -{ - struct unix_sock *u = unix_get_socket(fp); - - spin_lock(&unix_gc_lock); - - if (u) { - WARN_ON_ONCE(!u->inflight); - WARN_ON_ONCE(list_empty(&u->link)); - - u->inflight--; - if (!u->inflight) - list_del_init(&u->link); - /* Paired with READ_ONCE() in wait_for_unix_gc() */ - WRITE_ONCE(unix_tot_inflight, unix_tot_inflight - 1); - } - WRITE_ONCE(user->unix_inflight, user->unix_inflight - 1); - spin_unlock(&unix_gc_lock); -} - -/* - * The "user->unix_inflight" variable is protected by the garbage - * collection lock, and we just read it locklessly here. If you go - * over the limit, there might be a tiny race in actually noticing - * it across threads. Tough. 
- */ -static inline bool too_many_unix_fds(struct task_struct *p) -{ - struct user_struct *user = current_user(); - - if (unlikely(READ_ONCE(user->unix_inflight) > task_rlimit(p, RLIMIT_NOFILE))) - return !capable(CAP_SYS_RESOURCE) && !capable(CAP_SYS_ADMIN); - return false; -} - -int unix_attach_fds(struct scm_cookie *scm, struct sk_buff *skb) -{ - int i; - - if (too_many_unix_fds(current)) - return -ETOOMANYREFS; - - /* - * Need to duplicate file references for the sake of garbage - * collection. Otherwise a socket in the fps might become a - * candidate for GC while the skb is not yet queued. - */ - UNIXCB(skb).fp = scm_fp_dup(scm->fp); - if (!UNIXCB(skb).fp) - return -ENOMEM; - - for (i = scm->fp->count - 1; i >= 0; i--) - unix_inflight(scm->fp->user, scm->fp->fp[i]); - return 0; -} -EXPORT_SYMBOL(unix_attach_fds); - -void unix_detach_fds(struct scm_cookie *scm, struct sk_buff *skb) -{ - int i; - - scm->fp = UNIXCB(skb).fp; - UNIXCB(skb).fp = NULL; - - for (i = scm->fp->count-1; i >= 0; i--) - unix_notinflight(scm->fp->user, scm->fp->fp[i]); -} -EXPORT_SYMBOL(unix_detach_fds); - -void unix_destruct_scm(struct sk_buff *skb) -{ - struct scm_cookie scm; - - memset(&scm, 0, sizeof(scm)); - scm.pid = UNIXCB(skb).pid; - if (UNIXCB(skb).fp) - unix_detach_fds(&scm, skb); - - /* Alas, it calls VFS */ - /* So fscking what? fput() had been SMP-safe since the last Summer */ - scm_destroy(&scm); - sock_wfree(skb); -} -EXPORT_SYMBOL(unix_destruct_scm); diff --git a/net/unix/scm.h b/net/unix/scm.h deleted file mode 100644 index 5a255a477f16..000000000000 --- a/net/unix/scm.h +++ /dev/null @@ -1,10 +0,0 @@ -#ifndef NET_UNIX_SCM_H -#define NET_UNIX_SCM_H - -extern struct list_head gc_inflight_list; -extern spinlock_t unix_gc_lock; - -int unix_attach_fds(struct scm_cookie *scm, struct sk_buff *skb); -void unix_detach_fds(struct scm_cookie *scm, struct sk_buff *skb); - -#endif -- cgit v1.2.3