public inbox for netdev@vger.kernel.org
 help / color / mirror / Atom feed
* [PATCH v1 net] af_unix: Give up GC if MSG_PEEK intervened.
@ 2026-03-08  3:04 Kuniyuki Iwashima
  2026-03-08 19:19 ` Kuniyuki Iwashima
  0 siblings, 1 reply; 2+ messages in thread
From: Kuniyuki Iwashima @ 2026-03-08  3:04 UTC (permalink / raw)
  To: David S. Miller, Eric Dumazet, Jakub Kicinski, Paolo Abeni
  Cc: Simon Horman, Kuniyuki Iwashima, Kuniyuki Iwashima, netdev,
	Igor Ushakov

Igor Ushakov reported that GC purged the receive queue of
an alive socket due to a race with MSG_PEEK with a nice repro.

This is the exact same issue previously fixed by commit
cbcf01128d0a ("af_unix: fix garbage collect vs MSG_PEEK").

After GC was replaced with the current SCC-based algorithm, commit
118f457da9ed ("af_unix: Remove lock dance in unix_peek_fds().")
removed the lock dance in unix_peek_fds() and reintroduced the
same issue.

The problem is that MSG_PEEK bumps a file refcount without
interacting with GC.

Consider an SCC containing sk-A and sk-B, where sk-A is
close()d but can be recv()ed via sk-B.

The bad thing happens if sk-A is recv()ed with MSG_PEEK from
sk-B and sk-B is close()d while GC is checking unix_vertex_dead()
for sk-A and sk-B.

  GC thread                    User thread
  ---------                    -----------
  unix_vertex_dead(sk-A)
  -> true   <------.
                    \
                     `------   recv(sk-B, MSG_PEEK)
              invalidate !!    -> sk-A's file refcount : 1 -> 2

                               close(sk-B)
                               -> sk-B's file refcount : 2 -> 1
  unix_vertex_dead(sk-B)
  -> true

Initially, sk-A's file refcount is 1 by the inflight fd in sk-B
recvq.  GC thinks sk-A is dead because the file refcount is the
same as the number of its inflight fds.

However, sk-A's file refcount is bumped silently by MSG_PEEK,
which invalidates the previous evaluation.

At this moment, sk-B's file refcount is 2; one by the open fd,
and one by the inflight fd in sk-A.  The subsequent close()
releases one refcount by the former.

Finally, GC incorrectly concludes that both sk-A and sk-B are dead.

One option is to restore the lock dance in unix_peek_fds(), but
we can resolve this more elegantly thanks to the new algorithm.

We actually do not need to synchronise MSG_PEEK with the dead
SCC detection.  Even if the sequence above occurs, we can just
give up garbage-collecting the SCC if we can detect the race.

Let's notify GC when MSG_PEEK occurs and let it defer the SCC
to the next run.

This way no locking is needed on the MSG_PEEK side, and we can
avoid imposing a penalty on every MSG_PEEK unnecessarily.

Note that we can retry within unix_scc_dead() if MSG_PEEK is
detected, but we do not do so to avoid hung task splat from
abusive MSG_PEEK calls.

Fixes: 118f457da9ed ("af_unix: Remove lock dance in unix_peek_fds().")
Reported-by: Igor Ushakov <sysroot314@gmail.com>
Signed-off-by: Kuniyuki Iwashima <kuniyu@google.com>
---
 net/unix/af_unix.c |  2 ++
 net/unix/af_unix.h |  1 +
 net/unix/garbage.c | 71 ++++++++++++++++++++++++++++------------------
 3 files changed, 46 insertions(+), 28 deletions(-)

diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
index 7eaa5b187fef..b23c33df8b46 100644
--- a/net/unix/af_unix.c
+++ b/net/unix/af_unix.c
@@ -1958,6 +1958,8 @@ static void unix_detach_fds(struct scm_cookie *scm, struct sk_buff *skb)
 static void unix_peek_fds(struct scm_cookie *scm, struct sk_buff *skb)
 {
 	scm->fp = scm_fp_dup(UNIXCB(skb).fp);
+
+	unix_peek_fpl(scm->fp);
 }
 
 static void unix_destruct_scm(struct sk_buff *skb)
diff --git a/net/unix/af_unix.h b/net/unix/af_unix.h
index c4f1b2da363d..8119dbeef3a3 100644
--- a/net/unix/af_unix.h
+++ b/net/unix/af_unix.h
@@ -29,6 +29,7 @@ void unix_del_edges(struct scm_fp_list *fpl);
 void unix_update_edges(struct unix_sock *receiver);
 int unix_prepare_fpl(struct scm_fp_list *fpl);
 void unix_destroy_fpl(struct scm_fp_list *fpl);
+void unix_peek_fpl(struct scm_fp_list *fpl);
 void unix_schedule_gc(struct user_struct *user);
 
 /* SOCK_DIAG */
diff --git a/net/unix/garbage.c b/net/unix/garbage.c
index 816e8fa2b062..468fb0ee463f 100644
--- a/net/unix/garbage.c
+++ b/net/unix/garbage.c
@@ -318,6 +318,21 @@ void unix_destroy_fpl(struct scm_fp_list *fpl)
 	unix_free_vertices(fpl);
 }
 
+static bool gc_in_progress;
+static bool unix_fd_peeked;
+
+void unix_peek_fpl(struct scm_fp_list *fpl)
+{
+	if (!fpl->count_unix)
+		return;
+
+	if (!READ_ONCE(gc_in_progress))
+		return;
+
+	/* Invalidate the final refcnt check in unix_vertex_dead(). */
+	WRITE_ONCE(unix_fd_peeked, true);
+}
+
 static bool unix_vertex_dead(struct unix_vertex *vertex)
 {
 	struct unix_edge *edge;
@@ -351,6 +366,32 @@ static bool unix_vertex_dead(struct unix_vertex *vertex)
 	return true;
 }
 
+static LIST_HEAD(unix_visited_vertices);
+static unsigned long unix_vertex_grouped_index = UNIX_VERTEX_INDEX_MARK2;
+
+static bool unix_scc_dead(struct list_head *scc, bool fast)
+{
+	struct unix_vertex *vertex;
+	bool scc_dead = true;
+
+	WRITE_ONCE(unix_fd_peeked, false);
+
+	list_for_each_entry_reverse(vertex, scc, scc_entry) {
+		/* Don't restart DFS from this vertex. */
+		list_move_tail(&vertex->entry, &unix_visited_vertices);
+
+		/* Mark vertex as off-stack for __unix_walk_scc(). */
+		if (!fast)
+			vertex->index = unix_vertex_grouped_index;
+
+		if (scc_dead)
+			scc_dead = unix_vertex_dead(vertex);
+	}
+
+	/* If MSG_PEEK intervened, defer this SCC to the next round. */
+	return scc_dead && !READ_ONCE(unix_fd_peeked);
+}
+
 static void unix_collect_skb(struct list_head *scc, struct sk_buff_head *hitlist)
 {
 	struct unix_vertex *vertex;
@@ -404,9 +445,6 @@ static bool unix_scc_cyclic(struct list_head *scc)
 	return false;
 }
 
-static LIST_HEAD(unix_visited_vertices);
-static unsigned long unix_vertex_grouped_index = UNIX_VERTEX_INDEX_MARK2;
-
 static unsigned long __unix_walk_scc(struct unix_vertex *vertex,
 				     unsigned long *last_index,
 				     struct sk_buff_head *hitlist)
@@ -474,9 +512,7 @@ static unsigned long __unix_walk_scc(struct unix_vertex *vertex,
 	}
 
 	if (vertex->index == vertex->scc_index) {
-		struct unix_vertex *v;
 		struct list_head scc;
-		bool scc_dead = true;
 
 		/* SCC finalised.
 		 *
@@ -485,18 +521,7 @@ static unsigned long __unix_walk_scc(struct unix_vertex *vertex,
 		 */
 		__list_cut_position(&scc, &vertex_stack, &vertex->scc_entry);
 
-		list_for_each_entry_reverse(v, &scc, scc_entry) {
-			/* Don't restart DFS from this vertex in unix_walk_scc(). */
-			list_move_tail(&v->entry, &unix_visited_vertices);
-
-			/* Mark vertex as off-stack. */
-			v->index = unix_vertex_grouped_index;
-
-			if (scc_dead)
-				scc_dead = unix_vertex_dead(v);
-		}
-
-		if (scc_dead) {
+		if (unix_scc_dead(&scc, false)) {
 			unix_collect_skb(&scc, hitlist);
 		} else {
 			if (unix_vertex_max_scc_index < vertex->scc_index)
@@ -550,19 +575,11 @@ static void unix_walk_scc_fast(struct sk_buff_head *hitlist)
 	while (!list_empty(&unix_unvisited_vertices)) {
 		struct unix_vertex *vertex;
 		struct list_head scc;
-		bool scc_dead = true;
 
 		vertex = list_first_entry(&unix_unvisited_vertices, typeof(*vertex), entry);
 		list_add(&scc, &vertex->scc_entry);
 
-		list_for_each_entry_reverse(vertex, &scc, scc_entry) {
-			list_move_tail(&vertex->entry, &unix_visited_vertices);
-
-			if (scc_dead)
-				scc_dead = unix_vertex_dead(vertex);
-		}
-
-		if (scc_dead) {
+		if (unix_scc_dead(&scc, true)) {
 			cyclic_sccs--;
 			unix_collect_skb(&scc, hitlist);
 		}
@@ -577,8 +594,6 @@ static void unix_walk_scc_fast(struct sk_buff_head *hitlist)
 		   cyclic_sccs ? UNIX_GRAPH_CYCLIC : UNIX_GRAPH_NOT_CYCLIC);
 }
 
-static bool gc_in_progress;
-
 static void unix_gc(struct work_struct *work)
 {
 	struct sk_buff_head hitlist;
-- 
2.53.0.473.g4a7958ca14-goog


^ permalink raw reply related	[flat|nested] 2+ messages in thread

* Re: [PATCH v1 net] af_unix: Give up GC if MSG_PEEK intervened.
  2026-03-08  3:04 [PATCH v1 net] af_unix: Give up GC if MSG_PEEK intervened Kuniyuki Iwashima
@ 2026-03-08 19:19 ` Kuniyuki Iwashima
  0 siblings, 0 replies; 2+ messages in thread
From: Kuniyuki Iwashima @ 2026-03-08 19:19 UTC (permalink / raw)
  To: David S. Miller, Eric Dumazet, Jakub Kicinski, Paolo Abeni
  Cc: Simon Horman, Kuniyuki Iwashima, netdev, Igor Ushakov

On Sat, Mar 7, 2026 at 7:04 PM Kuniyuki Iwashima <kuniyu@google.com> wrote:
>
> Igor Ushakov reported that GC purged the receive queue of
> an alive socket due to a race with MSG_PEEK with a nice repro.
>
> This is the exact same issue previously fixed by commit
> cbcf01128d0a ("af_unix: fix garbage collect vs MSG_PEEK").
>
> After GC was replaced with the current SCC-based algorithm, commit
> 118f457da9ed ("af_unix: Remove lock dance in unix_peek_fds().")
> removed the lock dance in unix_peek_fds() and reintroduced the
> same issue.
>
> The problem is that MSG_PEEK bumps a file refcount without
> interacting with GC.
>
> Consider an SCC containing sk-A and sk-B, where sk-A is
> close()d but can be recv()ed via sk-B.
>
> The bad thing happens if sk-A is recv()ed with MSG_PEEK from
> sk-B and sk-B is close()d while GC is checking unix_vertex_dead()
> for sk-A and sk-B.
>
>   GC thread                    User thread
>   ---------                    -----------
>   unix_vertex_dead(sk-A)
>   -> true   <------.
>                     \
>                      `------   recv(sk-B, MSG_PEEK)
>               invalidate !!    -> sk-A's file refcount : 1 -> 2
>
>                                close(sk-B)
>                                -> sk-B's file refcount : 2 -> 1
>   unix_vertex_dead(sk-B)
>   -> true
>
> Initially, sk-A's file refcount is 1 by the inflight fd in sk-B
> recvq.  GC thinks sk-A is dead because the file refcount is the
> same as the number of its inflight fds.
>
> However, sk-A's file refcount is bumped silently by MSG_PEEK,
> which invalidates the previous evaluation.
>
> At this moment, sk-B's file refcount is 2; one by the open fd,
> and one by the inflight fd in sk-A.  The subsequent close()
> releases one refcount by the former.
>
> Finally, GC incorrectly concludes that both sk-A and sk-B are dead.
>
> One option is to restore the lock dance in unix_peek_fds(), but
> we can resolve this more elegantly thanks to the new algorithm.
>
> We actually do not need to synchronise MSG_PEEK with the dead
> SCC detection.  Even if the sequence above occurs, we can just
> give up garbage-collecting the SCC if we can detect the race.
>
> Let's notify GC when MSG_PEEK occurs and let it defer the SCC
> to the next run.
>
> This way no locking is needed on the MSG_PEEK side, and we can
> avoid imposing a penalty on every MSG_PEEK unnecessarily.
>
> Note that we can retry within unix_scc_dead() if MSG_PEEK is
> detected, but we do not do so to avoid hung task splat from
> abusive MSG_PEEK calls.
>
> Fixes: 118f457da9ed ("af_unix: Remove lock dance in unix_peek_fds().")
> Reported-by: Igor Ushakov <sysroot314@gmail.com>
> Signed-off-by: Kuniyuki Iwashima <kuniyu@google.com>
> ---
>  net/unix/af_unix.c |  2 ++
>  net/unix/af_unix.h |  1 +
>  net/unix/garbage.c | 71 ++++++++++++++++++++++++++++------------------
>  3 files changed, 46 insertions(+), 28 deletions(-)
>
> diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
> index 7eaa5b187fef..b23c33df8b46 100644
> --- a/net/unix/af_unix.c
> +++ b/net/unix/af_unix.c
> @@ -1958,6 +1958,8 @@ static void unix_detach_fds(struct scm_cookie *scm, struct sk_buff *skb)
>  static void unix_peek_fds(struct scm_cookie *scm, struct sk_buff *skb)
>  {
>         scm->fp = scm_fp_dup(UNIXCB(skb).fp);
> +
> +       unix_peek_fpl(scm->fp);
>  }
>
>  static void unix_destruct_scm(struct sk_buff *skb)
> diff --git a/net/unix/af_unix.h b/net/unix/af_unix.h
> index c4f1b2da363d..8119dbeef3a3 100644
> --- a/net/unix/af_unix.h
> +++ b/net/unix/af_unix.h
> @@ -29,6 +29,7 @@ void unix_del_edges(struct scm_fp_list *fpl);
>  void unix_update_edges(struct unix_sock *receiver);
>  int unix_prepare_fpl(struct scm_fp_list *fpl);
>  void unix_destroy_fpl(struct scm_fp_list *fpl);
> +void unix_peek_fpl(struct scm_fp_list *fpl);
>  void unix_schedule_gc(struct user_struct *user);
>
>  /* SOCK_DIAG */
> diff --git a/net/unix/garbage.c b/net/unix/garbage.c
> index 816e8fa2b062..468fb0ee463f 100644
> --- a/net/unix/garbage.c
> +++ b/net/unix/garbage.c
> @@ -318,6 +318,21 @@ void unix_destroy_fpl(struct scm_fp_list *fpl)
>         unix_free_vertices(fpl);
>  }
>
> +static bool gc_in_progress;
> +static bool unix_fd_peeked;
> +
> +void unix_peek_fpl(struct scm_fp_list *fpl)
> +{
> +       if (!fpl->count_unix)
> +               return;
> +
> +       if (!READ_ONCE(gc_in_progress))
> +               return;
> +
> +       /* Invalidate the final refcnt check in unix_vertex_dead(). */
> +       WRITE_ONCE(unix_fd_peeked, true);
> +}
> +
>  static bool unix_vertex_dead(struct unix_vertex *vertex)
>  {
>         struct unix_edge *edge;
> @@ -351,6 +366,32 @@ static bool unix_vertex_dead(struct unix_vertex *vertex)
>         return true;
>  }
>
> +static LIST_HEAD(unix_visited_vertices);
> +static unsigned long unix_vertex_grouped_index = UNIX_VERTEX_INDEX_MARK2;
> +
> +static bool unix_scc_dead(struct list_head *scc, bool fast)
> +{
> +       struct unix_vertex *vertex;
> +       bool scc_dead = true;
> +
> +       WRITE_ONCE(unix_fd_peeked, false);
> +
> +       list_for_each_entry_reverse(vertex, scc, scc_entry) {
> +               /* Don't restart DFS from this vertex. */
> +               list_move_tail(&vertex->entry, &unix_visited_vertices);
> +
> +               /* Mark vertex as off-stack for __unix_walk_scc(). */
> +               if (!fast)
> +                       vertex->index = unix_vertex_grouped_index;
> +
> +               if (scc_dead)
> +                       scc_dead = unix_vertex_dead(vertex);
> +       }
> +
> +       /* If MSG_PEEK intervened, defer this SCC to the next round. */
> +       return scc_dead && !READ_ONCE(unix_fd_peeked);

I'll add more bold comment here and the commit message
why memory barrier for WRITE_ONCE(unix_fd_peeked, true)
is not needed, as suggested by Linus.

pw-bot: cr

^ permalink raw reply	[flat|nested] 2+ messages in thread

end of thread, other threads:[~2026-03-08 19:19 UTC | newest]

Thread overview: 2+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2026-03-08  3:04 [PATCH v1 net] af_unix: Give up GC if MSG_PEEK intervened Kuniyuki Iwashima
2026-03-08 19:19 ` Kuniyuki Iwashima

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox