* [PATCH v1 net] af_unix: Give up GC if MSG_PEEK intervened.
@ 2026-03-08 3:04 Kuniyuki Iwashima
2026-03-08 19:19 ` Kuniyuki Iwashima
0 siblings, 1 reply; 2+ messages in thread
From: Kuniyuki Iwashima @ 2026-03-08 3:04 UTC (permalink / raw)
To: David S. Miller, Eric Dumazet, Jakub Kicinski, Paolo Abeni
Cc: Simon Horman, Kuniyuki Iwashima, Kuniyuki Iwashima, netdev,
Igor Ushakov
Igor Ushakov reported that GC purged the receive queue of
an alive socket due to a race with MSG_PEEK with a nice repro.
This is the exact same issue previously fixed by commit
cbcf01128d0a ("af_unix: fix garbage collect vs MSG_PEEK").
After GC was replaced with the current algorithm, the cited
commit removed the lock dance in unix_peek_fds() and
reintroduced the same issue.
The problem is that MSG_PEEK bumps a file refcount without
interacting with GC.
Consider an SCC containing sk-A and sk-B, where sk-A is
close()d but can be recv()ed via sk-B.
The bad thing happens if sk-A is recv()ed with MSG_PEEK from
sk-B and sk-B is close()d while GC is checking unix_vertex_dead()
for sk-A and sk-B.
GC thread User thread
--------- -----------
unix_vertex_dead(sk-A)
-> true <------.
\
`------ recv(sk-B, MSG_PEEK)
invalidate !! -> sk-A's file refcount : 1 -> 2
close(sk-B)
-> sk-B's file refcount : 2 -> 1
unix_vertex_dead(sk-B)
-> true
Initially, sk-A's file refcount is 1 by the inflight fd in sk-B
recvq. GC thinks sk-A is dead because the file refcount is the
same as the number of its inflight fds.
However, sk-A's file refcount is bumped silently by MSG_PEEK,
which invalidates the previous evaluation.
At this moment, sk-B's file refcount is 2; one by the open fd,
and one by the inflight fd in sk-A. The subsequent close()
releases one refcount by the former.
Finally, GC incorrectly concludes that both sk-A and sk-B are dead.
One option is to restore the lock dance in unix_peek_fds(), but
we can resolve this more elegantly thanks to the new algorithm.
We actually do not need to synchronise MSG_PEEK with the dead
SCC detection. Even if the sequence above occurs, we can just
give up garbage-collecting the SCC if we can detect the race.
Let's notify GC when MSG_PEEK occurs and let it defer the SCC
to the next run.
This way no locking is needed on the MSG_PEEK side, and we can
avoid imposing a penalty on every MSG_PEEK unnecessarily.
Note that we can retry within unix_scc_dead() if MSG_PEEK is
detected, but we do not do so to avoid hung task splat from
abusive MSG_PEEK calls.
Fixes: 118f457da9ed ("af_unix: Remove lock dance in unix_peek_fds().")
Reported-by: Igor Ushakov <sysroot314@gmail.com>
Signed-off-by: Kuniyuki Iwashima <kuniyu@google.com>
---
net/unix/af_unix.c | 2 ++
net/unix/af_unix.h | 1 +
net/unix/garbage.c | 71 ++++++++++++++++++++++++++++------------------
3 files changed, 46 insertions(+), 28 deletions(-)
diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
index 7eaa5b187fef..b23c33df8b46 100644
--- a/net/unix/af_unix.c
+++ b/net/unix/af_unix.c
@@ -1958,6 +1958,8 @@ static void unix_detach_fds(struct scm_cookie *scm, struct sk_buff *skb)
static void unix_peek_fds(struct scm_cookie *scm, struct sk_buff *skb)
{
scm->fp = scm_fp_dup(UNIXCB(skb).fp);
+
+ unix_peek_fpl(scm->fp);
}
static void unix_destruct_scm(struct sk_buff *skb)
diff --git a/net/unix/af_unix.h b/net/unix/af_unix.h
index c4f1b2da363d..8119dbeef3a3 100644
--- a/net/unix/af_unix.h
+++ b/net/unix/af_unix.h
@@ -29,6 +29,7 @@ void unix_del_edges(struct scm_fp_list *fpl);
void unix_update_edges(struct unix_sock *receiver);
int unix_prepare_fpl(struct scm_fp_list *fpl);
void unix_destroy_fpl(struct scm_fp_list *fpl);
+void unix_peek_fpl(struct scm_fp_list *fpl);
void unix_schedule_gc(struct user_struct *user);
/* SOCK_DIAG */
diff --git a/net/unix/garbage.c b/net/unix/garbage.c
index 816e8fa2b062..468fb0ee463f 100644
--- a/net/unix/garbage.c
+++ b/net/unix/garbage.c
@@ -318,6 +318,21 @@ void unix_destroy_fpl(struct scm_fp_list *fpl)
unix_free_vertices(fpl);
}
+static bool gc_in_progress;
+static bool unix_fd_peeked;
+
+void unix_peek_fpl(struct scm_fp_list *fpl)
+{
+ if (!fpl->count_unix)
+ return;
+
+ if (!READ_ONCE(gc_in_progress))
+ return;
+
+ /* Invalidate the final refcnt check in unix_vertex_dead(). */
+ WRITE_ONCE(unix_fd_peeked, true);
+}
+
static bool unix_vertex_dead(struct unix_vertex *vertex)
{
struct unix_edge *edge;
@@ -351,6 +366,32 @@ static bool unix_vertex_dead(struct unix_vertex *vertex)
return true;
}
+static LIST_HEAD(unix_visited_vertices);
+static unsigned long unix_vertex_grouped_index = UNIX_VERTEX_INDEX_MARK2;
+
+static bool unix_scc_dead(struct list_head *scc, bool fast)
+{
+ struct unix_vertex *vertex;
+ bool scc_dead = true;
+
+ WRITE_ONCE(unix_fd_peeked, false);
+
+ list_for_each_entry_reverse(vertex, scc, scc_entry) {
+ /* Don't restart DFS from this vertex. */
+ list_move_tail(&vertex->entry, &unix_visited_vertices);
+
+ /* Mark vertex as off-stack for __unix_walk_scc(). */
+ if (!fast)
+ vertex->index = unix_vertex_grouped_index;
+
+ if (scc_dead)
+ scc_dead = unix_vertex_dead(vertex);
+ }
+
+ /* If MSG_PEEK intervened, defer this SCC to the next round. */
+ return scc_dead && !READ_ONCE(unix_fd_peeked);
+}
+
static void unix_collect_skb(struct list_head *scc, struct sk_buff_head *hitlist)
{
struct unix_vertex *vertex;
@@ -404,9 +445,6 @@ static bool unix_scc_cyclic(struct list_head *scc)
return false;
}
-static LIST_HEAD(unix_visited_vertices);
-static unsigned long unix_vertex_grouped_index = UNIX_VERTEX_INDEX_MARK2;
-
static unsigned long __unix_walk_scc(struct unix_vertex *vertex,
unsigned long *last_index,
struct sk_buff_head *hitlist)
@@ -474,9 +512,7 @@ static unsigned long __unix_walk_scc(struct unix_vertex *vertex,
}
if (vertex->index == vertex->scc_index) {
- struct unix_vertex *v;
struct list_head scc;
- bool scc_dead = true;
/* SCC finalised.
*
@@ -485,18 +521,7 @@ static unsigned long __unix_walk_scc(struct unix_vertex *vertex,
*/
__list_cut_position(&scc, &vertex_stack, &vertex->scc_entry);
- list_for_each_entry_reverse(v, &scc, scc_entry) {
- /* Don't restart DFS from this vertex in unix_walk_scc(). */
- list_move_tail(&v->entry, &unix_visited_vertices);
-
- /* Mark vertex as off-stack. */
- v->index = unix_vertex_grouped_index;
-
- if (scc_dead)
- scc_dead = unix_vertex_dead(v);
- }
-
- if (scc_dead) {
+ if (unix_scc_dead(&scc, false)) {
unix_collect_skb(&scc, hitlist);
} else {
if (unix_vertex_max_scc_index < vertex->scc_index)
@@ -550,19 +575,11 @@ static void unix_walk_scc_fast(struct sk_buff_head *hitlist)
while (!list_empty(&unix_unvisited_vertices)) {
struct unix_vertex *vertex;
struct list_head scc;
- bool scc_dead = true;
vertex = list_first_entry(&unix_unvisited_vertices, typeof(*vertex), entry);
list_add(&scc, &vertex->scc_entry);
- list_for_each_entry_reverse(vertex, &scc, scc_entry) {
- list_move_tail(&vertex->entry, &unix_visited_vertices);
-
- if (scc_dead)
- scc_dead = unix_vertex_dead(vertex);
- }
-
- if (scc_dead) {
+ if (unix_scc_dead(&scc, true)) {
cyclic_sccs--;
unix_collect_skb(&scc, hitlist);
}
@@ -577,8 +594,6 @@ static void unix_walk_scc_fast(struct sk_buff_head *hitlist)
cyclic_sccs ? UNIX_GRAPH_CYCLIC : UNIX_GRAPH_NOT_CYCLIC);
}
-static bool gc_in_progress;
-
static void unix_gc(struct work_struct *work)
{
struct sk_buff_head hitlist;
--
2.53.0.473.g4a7958ca14-goog
^ permalink raw reply related [flat|nested] 2+ messages in thread
* Re: [PATCH v1 net] af_unix: Give up GC if MSG_PEEK intervened.
2026-03-08 3:04 [PATCH v1 net] af_unix: Give up GC if MSG_PEEK intervened Kuniyuki Iwashima
@ 2026-03-08 19:19 ` Kuniyuki Iwashima
0 siblings, 0 replies; 2+ messages in thread
From: Kuniyuki Iwashima @ 2026-03-08 19:19 UTC (permalink / raw)
To: David S. Miller, Eric Dumazet, Jakub Kicinski, Paolo Abeni
Cc: Simon Horman, Kuniyuki Iwashima, netdev, Igor Ushakov
On Sat, Mar 7, 2026 at 7:04 PM Kuniyuki Iwashima <kuniyu@google.com> wrote:
>
> Igor Ushakov reported that GC purged the receive queue of
> an alive socket due to a race with MSG_PEEK with a nice repro.
>
> This is the exact same issue previously fixed by commit
> cbcf01128d0a ("af_unix: fix garbage collect vs MSG_PEEK").
>
> After GC was replaced with the current algorithm, the cited
> commit removed the lock dance in unix_peek_fds() and
> reintroduced the same issue.
>
> The problem is that MSG_PEEK bumps a file refcount without
> interacting with GC.
>
> Consider an SCC containing sk-A and sk-B, where sk-A is
> close()d but can be recv()ed via sk-B.
>
> The bad thing happens if sk-A is recv()ed with MSG_PEEK from
> sk-B and sk-B is close()d while GC is checking unix_vertex_dead()
> for sk-A and sk-B.
>
> GC thread User thread
> --------- -----------
> unix_vertex_dead(sk-A)
> -> true <------.
> \
> `------ recv(sk-B, MSG_PEEK)
> invalidate !! -> sk-A's file refcount : 1 -> 2
>
> close(sk-B)
> -> sk-B's file refcount : 2 -> 1
> unix_vertex_dead(sk-B)
> -> true
>
> Initially, sk-A's file refcount is 1 by the inflight fd in sk-B
> recvq. GC thinks sk-A is dead because the file refcount is the
> same as the number of its inflight fds.
>
> However, sk-A's file refcount is bumped silently by MSG_PEEK,
> which invalidates the previous evaluation.
>
> At this moment, sk-B's file refcount is 2; one by the open fd,
> and one by the inflight fd in sk-A. The subsequent close()
> releases one refcount by the former.
>
> Finally, GC incorrectly concludes that both sk-A and sk-B are dead.
>
> One option is to restore the lock dance in unix_peek_fds(), but
> we can resolve this more elegantly thanks to the new algorithm.
>
> We actually do not need to synchronise MSG_PEEK with the dead
> SCC detection. Even if the sequence above occurs, we can just
> give up garbage-collecting the SCC if we can detect the race.
>
> Let's notify GC when MSG_PEEK occurs and let it defer the SCC
> to the next run.
>
> This way no locking is needed on the MSG_PEEK side, and we can
> avoid imposing a penalty on every MSG_PEEK unnecessarily.
>
> Note that we can retry within unix_scc_dead() if MSG_PEEK is
> detected, but we do not do so to avoid hung task splat from
> abusive MSG_PEEK calls.
>
> Fixes: 118f457da9ed ("af_unix: Remove lock dance in unix_peek_fds().")
> Reported-by: Igor Ushakov <sysroot314@gmail.com>
> Signed-off-by: Kuniyuki Iwashima <kuniyu@google.com>
> ---
> net/unix/af_unix.c | 2 ++
> net/unix/af_unix.h | 1 +
> net/unix/garbage.c | 71 ++++++++++++++++++++++++++++------------------
> 3 files changed, 46 insertions(+), 28 deletions(-)
>
> diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
> index 7eaa5b187fef..b23c33df8b46 100644
> --- a/net/unix/af_unix.c
> +++ b/net/unix/af_unix.c
> @@ -1958,6 +1958,8 @@ static void unix_detach_fds(struct scm_cookie *scm, struct sk_buff *skb)
> static void unix_peek_fds(struct scm_cookie *scm, struct sk_buff *skb)
> {
> scm->fp = scm_fp_dup(UNIXCB(skb).fp);
> +
> + unix_peek_fpl(scm->fp);
> }
>
> static void unix_destruct_scm(struct sk_buff *skb)
> diff --git a/net/unix/af_unix.h b/net/unix/af_unix.h
> index c4f1b2da363d..8119dbeef3a3 100644
> --- a/net/unix/af_unix.h
> +++ b/net/unix/af_unix.h
> @@ -29,6 +29,7 @@ void unix_del_edges(struct scm_fp_list *fpl);
> void unix_update_edges(struct unix_sock *receiver);
> int unix_prepare_fpl(struct scm_fp_list *fpl);
> void unix_destroy_fpl(struct scm_fp_list *fpl);
> +void unix_peek_fpl(struct scm_fp_list *fpl);
> void unix_schedule_gc(struct user_struct *user);
>
> /* SOCK_DIAG */
> diff --git a/net/unix/garbage.c b/net/unix/garbage.c
> index 816e8fa2b062..468fb0ee463f 100644
> --- a/net/unix/garbage.c
> +++ b/net/unix/garbage.c
> @@ -318,6 +318,21 @@ void unix_destroy_fpl(struct scm_fp_list *fpl)
> unix_free_vertices(fpl);
> }
>
> +static bool gc_in_progress;
> +static bool unix_fd_peeked;
> +
> +void unix_peek_fpl(struct scm_fp_list *fpl)
> +{
> + if (!fpl->count_unix)
> + return;
> +
> + if (!READ_ONCE(gc_in_progress))
> + return;
> +
> + /* Invalidate the final refcnt check in unix_vertex_dead(). */
> + WRITE_ONCE(unix_fd_peeked, true);
> +}
> +
> static bool unix_vertex_dead(struct unix_vertex *vertex)
> {
> struct unix_edge *edge;
> @@ -351,6 +366,32 @@ static bool unix_vertex_dead(struct unix_vertex *vertex)
> return true;
> }
>
> +static LIST_HEAD(unix_visited_vertices);
> +static unsigned long unix_vertex_grouped_index = UNIX_VERTEX_INDEX_MARK2;
> +
> +static bool unix_scc_dead(struct list_head *scc, bool fast)
> +{
> + struct unix_vertex *vertex;
> + bool scc_dead = true;
> +
> + WRITE_ONCE(unix_fd_peeked, false);
> +
> + list_for_each_entry_reverse(vertex, scc, scc_entry) {
> + /* Don't restart DFS from this vertex. */
> + list_move_tail(&vertex->entry, &unix_visited_vertices);
> +
> + /* Mark vertex as off-stack for __unix_walk_scc(). */
> + if (!fast)
> + vertex->index = unix_vertex_grouped_index;
> +
> + if (scc_dead)
> + scc_dead = unix_vertex_dead(vertex);
> + }
> +
> + /* If MSG_PEEK intervened, defer this SCC to the next round. */
> + return scc_dead && !READ_ONCE(unix_fd_peeked);
I'll add a more prominent comment here, and explain in the commit message
why a memory barrier for WRITE_ONCE(unix_fd_peeked, true)
is not needed, as suggested by Linus.
pw-bot: cr
^ permalink raw reply [flat|nested] 2+ messages in thread
end of thread, other threads:[~2026-03-08 19:19 UTC | newest]
Thread overview: 2+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2026-03-08 3:04 [PATCH v1 net] af_unix: Give up GC if MSG_PEEK intervened Kuniyuki Iwashima
2026-03-08 19:19 ` Kuniyuki Iwashima
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox