From: Lee Jones <lee@kernel.org>
To: lee@kernel.org, "David S. Miller" <davem@davemloft.net>,
Eric Dumazet <edumazet@google.com>,
Jakub Kicinski <kuba@kernel.org>, Paolo Abeni <pabeni@redhat.com>,
Kuniyuki Iwashima <kuniyu@amazon.com>,
Jens Axboe <axboe@kernel.dk>, Sasha Levin <sashal@kernel.org>,
Michal Luczaj <mhal@rbox.co>, Rao Shoaib <Rao.Shoaib@oracle.com>,
Simon Horman <horms@kernel.org>,
linux-kernel@vger.kernel.org, netdev@vger.kernel.org
Cc: stable@vger.kernel.org
Subject: [PATCH v6.6 03/26] af_unix: Try to run GC async.
Date: Wed, 21 May 2025 14:45:11 +0000 [thread overview]
Message-ID: <20250521144803.2050504-4-lee@kernel.org> (raw)
In-Reply-To: <20250521144803.2050504-1-lee@kernel.org>
From: Kuniyuki Iwashima <kuniyu@amazon.com>
[ Upstream commit d9f21b3613337b55cc9d4a6ead484dca68475143 ]
If more than 16000 inflight AF_UNIX sockets exist and the garbage
collector is not running, unix_(dgram|stream)_sendmsg() call unix_gc().
Also, they wait for unix_gc() to complete.
In unix_gc(), all inflight AF_UNIX sockets are traversed at least once,
and more than once if they are GC candidates. Thus, sendmsg() slows down
significantly when there are too many inflight AF_UNIX sockets.
However, if a process sends data with no AF_UNIX FD, the sendmsg() call
does not need to wait for GC. After this change, only a process that
meets both conditions below will be blocked in such a situation.
1) cmsg contains an AF_UNIX socket
2) at least UNIX_INFLIGHT_SANE_USER (SCM_MAX_FD * 8) AF_UNIX sockets sent by the same user are still inflight
Note that even a sendmsg() call that does not meet the conditions but has
an AF_UNIX FD will be blocked later in unix_scm_to_skb() by the spinlock,
but we allow that as a bonus for sane users.
The results below show the time spent in unix_dgram_sendmsg() when sending
1 byte of data with no FD 4096 times on a host where 32K inflight AF_UNIX
sockets exist.
Without the series: a sane sendmsg() is unreasonably forced to wait for GC.
$ sudo /usr/share/bcc/tools/funclatency -p 11165 unix_dgram_sendmsg
Tracing 1 functions for "unix_dgram_sendmsg"... Hit Ctrl-C to end.
^C
nsecs : count distribution
[...]
524288 -> 1048575 : 0 | |
1048576 -> 2097151 : 3881 |****************************************|
2097152 -> 4194303 : 214 |** |
4194304 -> 8388607 : 1 | |
avg = 1825567 nsecs, total: 7477526027 nsecs, count: 4096
With the series: a sane sendmsg() can finish much faster.
$ sudo /usr/share/bcc/tools/funclatency -p 8702 unix_dgram_sendmsg
Tracing 1 functions for "unix_dgram_sendmsg"... Hit Ctrl-C to end.
^C
nsecs : count distribution
[...]
128 -> 255 : 0 | |
256 -> 511 : 4092 |****************************************|
512 -> 1023 : 2 | |
1024 -> 2047 : 0 | |
2048 -> 4095 : 0 | |
4096 -> 8191 : 1 | |
8192 -> 16383 : 1 | |
avg = 410 nsecs, total: 1680510 nsecs, count: 4096
Signed-off-by: Kuniyuki Iwashima <kuniyu@amazon.com>
Link: https://lore.kernel.org/r/20240123170856.41348-6-kuniyu@amazon.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
(cherry picked from commit d9f21b3613337b55cc9d4a6ead484dca68475143)
Signed-off-by: Lee Jones <lee@kernel.org>
---
include/net/af_unix.h | 12 ++++++++++--
include/net/scm.h | 1 +
net/core/scm.c | 5 +++++
net/unix/af_unix.c | 6 ++++--
net/unix/garbage.c | 10 +++++++++-
5 files changed, 29 insertions(+), 5 deletions(-)
diff --git a/include/net/af_unix.h b/include/net/af_unix.h
index 7a00d7ed527b6..865e2f7bd67cf 100644
--- a/include/net/af_unix.h
+++ b/include/net/af_unix.h
@@ -8,13 +8,21 @@
#include <linux/refcount.h>
#include <net/sock.h>
+#if IS_ENABLED(CONFIG_UNIX)
+struct unix_sock *unix_get_socket(struct file *filp);
+#else
+static inline struct unix_sock *unix_get_socket(struct file *filp)
+{
+ return NULL;
+}
+#endif
+
void unix_inflight(struct user_struct *user, struct file *fp);
void unix_notinflight(struct user_struct *user, struct file *fp);
void unix_destruct_scm(struct sk_buff *skb);
void io_uring_destruct_scm(struct sk_buff *skb);
void unix_gc(void);
-void wait_for_unix_gc(void);
-struct unix_sock *unix_get_socket(struct file *filp);
+void wait_for_unix_gc(struct scm_fp_list *fpl);
struct sock *unix_peer_get(struct sock *sk);
#define UNIX_HASH_MOD (256 - 1)
diff --git a/include/net/scm.h b/include/net/scm.h
index e8c76b4be2fe7..1ff6a28550644 100644
--- a/include/net/scm.h
+++ b/include/net/scm.h
@@ -24,6 +24,7 @@ struct scm_creds {
struct scm_fp_list {
short count;
+ short count_unix;
short max;
struct user_struct *user;
struct file *fp[SCM_MAX_FD];
diff --git a/net/core/scm.c b/net/core/scm.c
index 737917c7ac627..574607b1c2d96 100644
--- a/net/core/scm.c
+++ b/net/core/scm.c
@@ -36,6 +36,7 @@
#include <net/compat.h>
#include <net/scm.h>
#include <net/cls_cgroup.h>
+#include <net/af_unix.h>
/*
@@ -85,6 +86,7 @@ static int scm_fp_copy(struct cmsghdr *cmsg, struct scm_fp_list **fplp)
return -ENOMEM;
*fplp = fpl;
fpl->count = 0;
+ fpl->count_unix = 0;
fpl->max = SCM_MAX_FD;
fpl->user = NULL;
}
@@ -109,6 +111,9 @@ static int scm_fp_copy(struct cmsghdr *cmsg, struct scm_fp_list **fplp)
fput(file);
return -EINVAL;
}
+ if (unix_get_socket(file))
+ fpl->count_unix++;
+
*fpp++ = file;
fpl->count++;
}
diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
index ab23c8d72122b..bb92b1ed94aaf 100644
--- a/net/unix/af_unix.c
+++ b/net/unix/af_unix.c
@@ -1885,11 +1885,12 @@ static int unix_dgram_sendmsg(struct socket *sock, struct msghdr *msg,
long timeo;
int err;
- wait_for_unix_gc();
err = scm_send(sock, msg, &scm, false);
if (err < 0)
return err;
+ wait_for_unix_gc(scm.fp);
+
err = -EOPNOTSUPP;
if (msg->msg_flags&MSG_OOB)
goto out;
@@ -2157,11 +2158,12 @@ static int unix_stream_sendmsg(struct socket *sock, struct msghdr *msg,
bool fds_sent = false;
int data_len;
- wait_for_unix_gc();
err = scm_send(sock, msg, &scm, false);
if (err < 0)
return err;
+ wait_for_unix_gc(scm.fp);
+
err = -EOPNOTSUPP;
if (msg->msg_flags & MSG_OOB) {
#if IS_ENABLED(CONFIG_AF_UNIX_OOB)
diff --git a/net/unix/garbage.c b/net/unix/garbage.c
index a2a8543613a52..96cc6b7674333 100644
--- a/net/unix/garbage.c
+++ b/net/unix/garbage.c
@@ -335,8 +335,9 @@ void unix_gc(void)
}
#define UNIX_INFLIGHT_TRIGGER_GC 16000
+#define UNIX_INFLIGHT_SANE_USER (SCM_MAX_FD * 8)
-void wait_for_unix_gc(void)
+void wait_for_unix_gc(struct scm_fp_list *fpl)
{
/* If number of inflight sockets is insane,
* force a garbage collect right now.
@@ -348,6 +349,13 @@ void wait_for_unix_gc(void)
!READ_ONCE(gc_in_progress))
unix_gc();
+ /* Penalise users who want to send AF_UNIX sockets
+ * but whose sockets have not been received yet.
+ */
+ if (!fpl || !fpl->count_unix ||
+ READ_ONCE(fpl->user->unix_inflight) < UNIX_INFLIGHT_SANE_USER)
+ return;
+
if (READ_ONCE(gc_in_progress))
flush_work(&unix_gc_work);
}
--
2.49.0.1112.g889b7c5bd8-goog
next prev parent reply other threads:[~2025-05-21 14:49 UTC|newest]
Thread overview: 54+ messages / expand[flat|nested] mbox.gz Atom feed top
2025-05-21 14:45 [PATCH v6.6 00/26] af_unix: Align with upstream to avoid a potential UAF Lee Jones
2025-05-21 14:45 ` [PATCH v6.6 01/26] af_unix: Return struct unix_sock from unix_get_socket() Lee Jones
2025-05-22 2:03 ` Sasha Levin
2025-05-21 14:45 ` [PATCH v6.6 02/26] af_unix: Run GC on only one CPU Lee Jones
2025-05-22 2:04 ` Sasha Levin
2025-05-21 14:45 ` Lee Jones [this message]
2025-05-22 2:04 ` [PATCH v6.6 03/26] af_unix: Try to run GC async Sasha Levin
2025-05-21 14:45 ` [PATCH v6.6 04/26] af_unix: Replace BUG_ON() with WARN_ON_ONCE() Lee Jones
2025-05-22 2:08 ` Sasha Levin
2025-05-21 14:45 ` [PATCH v6.6 05/26] af_unix: Remove io_uring code for GC Lee Jones
2025-05-22 2:07 ` Sasha Levin
2025-05-21 14:45 ` [PATCH v6.6 06/26] af_unix: Remove CONFIG_UNIX_SCM Lee Jones
2025-05-22 2:03 ` Sasha Levin
2025-05-21 14:45 ` [PATCH v6.6 07/26] af_unix: Allocate struct unix_vertex for each inflight AF_UNIX fd Lee Jones
2025-05-22 2:08 ` Sasha Levin
2025-05-21 14:45 ` [PATCH v6.6 08/26] af_unix: Allocate struct unix_edge " Lee Jones
2025-05-22 2:06 ` Sasha Levin
2025-05-21 14:45 ` [PATCH v6.6 09/26] af_unix: Link struct unix_edge when queuing skb Lee Jones
2025-05-22 2:05 ` Sasha Levin
2025-05-21 14:45 ` [PATCH v6.6 10/26] af_unix: Bulk update unix_tot_inflight/unix_inflight " Lee Jones
2025-05-22 2:03 ` Sasha Levin
2025-05-21 14:45 ` [PATCH v6.6 11/26] af_unix: Iterate all vertices by DFS Lee Jones
2025-05-22 2:04 ` Sasha Levin
2025-05-21 14:45 ` [PATCH v6.6 12/26] af_unix: Detect Strongly Connected Components Lee Jones
2025-05-22 2:06 ` Sasha Levin
2025-05-21 14:45 ` [PATCH v6.6 13/26] af_unix: Save listener for embryo socket Lee Jones
2025-05-22 2:08 ` Sasha Levin
2025-05-21 14:45 ` [PATCH v6.6 14/26] af_unix: Fix up unix_edge.successor " Lee Jones
2025-05-22 2:04 ` Sasha Levin
2025-05-21 14:45 ` [PATCH v6.6 15/26] af_unix: Save O(n) setup of Tarjan's algo Lee Jones
2025-05-22 2:04 ` Sasha Levin
2025-05-21 14:45 ` [PATCH v6.6 16/26] af_unix: Skip GC if no cycle exists Lee Jones
2025-05-22 2:07 ` Sasha Levin
2025-05-21 14:45 ` [PATCH v6.6 17/26] af_unix: Avoid Tarjan's algorithm if unnecessary Lee Jones
2025-05-22 2:04 ` Sasha Levin
2025-05-21 14:45 ` [PATCH v6.6 18/26] af_unix: Assign a unique index to SCC Lee Jones
2025-05-22 2:07 ` Sasha Levin
2025-05-21 14:45 ` [PATCH v6.6 19/26] af_unix: Detect dead SCC Lee Jones
2025-05-22 2:05 ` Sasha Levin
2025-05-21 14:45 ` [PATCH v6.6 20/26] af_unix: Replace garbage collection algorithm Lee Jones
2025-05-22 2:04 ` Sasha Levin
2025-05-21 14:45 ` [PATCH v6.6 21/26] af_unix: Remove lock dance in unix_peek_fds() Lee Jones
2025-05-22 2:07 ` Sasha Levin
2025-05-21 14:45 ` [PATCH v6.6 22/26] af_unix: Try not to hold unix_gc_lock during accept() Lee Jones
2025-05-22 2:07 ` Sasha Levin
2025-05-21 14:45 ` [PATCH v6.6 23/26] af_unix: Don't access successor in unix_del_edges() during GC Lee Jones
2025-05-22 2:08 ` Sasha Levin
2025-05-21 14:45 ` [PATCH v6.6 24/26] af_unix: Add dead flag to struct scm_fp_list Lee Jones
2025-05-22 2:05 ` Sasha Levin
2025-05-21 14:45 ` [PATCH v6.6 25/26] af_unix: Fix garbage collection of embryos carrying OOB with SCM_RIGHTS Lee Jones
2025-05-22 2:04 ` Sasha Levin
2025-05-21 14:45 ` [PATCH v6.6 26/26] af_unix: Fix uninit-value in __unix_walk_scc() Lee Jones
2025-05-22 2:07 ` Sasha Levin
2025-05-29 12:26 ` [PATCH v6.6 00/26] af_unix: Align with upstream to avoid a potential UAF Greg KH
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20250521144803.2050504-4-lee@kernel.org \
--to=lee@kernel.org \
--cc=Rao.Shoaib@oracle.com \
--cc=axboe@kernel.dk \
--cc=davem@davemloft.net \
--cc=edumazet@google.com \
--cc=horms@kernel.org \
--cc=kuba@kernel.org \
--cc=kuniyu@amazon.com \
--cc=linux-kernel@vger.kernel.org \
--cc=mhal@rbox.co \
--cc=netdev@vger.kernel.org \
--cc=pabeni@redhat.com \
--cc=sashal@kernel.org \
--cc=stable@vger.kernel.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.