netdev.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [PATCH net] net: lockless sock_i_ino()
@ 2025-09-02 18:36 Eric Dumazet
  2025-09-03  5:26 ` Kuniyuki Iwashima
                   ` (3 more replies)
  0 siblings, 4 replies; 7+ messages in thread
From: Eric Dumazet @ 2025-09-02 18:36 UTC (permalink / raw)
  To: David S . Miller, Jakub Kicinski, Paolo Abeni
  Cc: Simon Horman, netdev, eric.dumazet, Eric Dumazet,
	syzbot+50603c05bbdf4dfdaffa, Sebastian Andrzej Siewior,
	Kuniyuki Iwashima

Followup of commit c51da3f7a161 ("net: remove sock_i_uid()")

A recent syzbot report was the trigger for this change.

Over the years, we had many problems caused by the
read_lock[_bh](&sk->sk_callback_lock) in sock_i_uid().

We could fix smc_diag_dump_proto() or make a more radical move:

Instead of waiting for new syzbot reports, cache the socket
inode number in sk->sk_ino, so that we no longer
need to acquire sk->sk_callback_lock in sock_i_ino().

This makes socket dumps faster (one less cache line miss,
and two atomic ops avoided).

Prior art:

commit 25a9c8a4431c ("netlink: Add __sock_i_ino() for __netlink_diag_dump().")
commit 4f9bf2a2f5aa ("tcp: Don't acquire inet_listen_hashbucket::lock with disabled BH.")
commit efc3dbc37412 ("rds: Make rds_sock_lock BH rather than IRQ safe.")

Fixes: d2d6422f8bd1 ("x86: Allow to enable PREEMPT_RT.")
Reported-by: syzbot+50603c05bbdf4dfdaffa@syzkaller.appspotmail.com
Closes: https://lore.kernel.org/netdev/68b73804.050a0220.3db4df.01d8.GAE@google.com/T/#u
Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Cc: Kuniyuki Iwashima <kuniyu@google.com>
---
 include/net/sock.h   | 17 +++++++++++++----
 net/core/sock.c      | 22 ----------------------
 net/mptcp/protocol.c |  1 -
 net/netlink/diag.c   |  2 +-
 4 files changed, 14 insertions(+), 28 deletions(-)

diff --git a/include/net/sock.h b/include/net/sock.h
index c8a4b283df6f..fb13322a11fc 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -285,6 +285,7 @@ struct sk_filter;
   *	@sk_ack_backlog: current listen backlog
   *	@sk_max_ack_backlog: listen backlog set in listen()
   *	@sk_uid: user id of owner
+  *	@sk_ino: inode number (zero if orphaned)
   *	@sk_prefer_busy_poll: prefer busypolling over softirq processing
   *	@sk_busy_poll_budget: napi processing budget when busypolling
   *	@sk_priority: %SO_PRIORITY setting
@@ -518,6 +519,7 @@ struct sock {
 	u32			sk_ack_backlog;
 	u32			sk_max_ack_backlog;
 	kuid_t			sk_uid;
+	unsigned long		sk_ino;
 	spinlock_t		sk_peer_lock;
 	int			sk_bind_phc;
 	struct pid		*sk_peer_pid;
@@ -2056,6 +2058,10 @@ static inline int sk_rx_queue_get(const struct sock *sk)
 static inline void sk_set_socket(struct sock *sk, struct socket *sock)
 {
 	sk->sk_socket = sock;
+	if (sock) {
+		WRITE_ONCE(sk->sk_uid, SOCK_INODE(sock)->i_uid);
+		WRITE_ONCE(sk->sk_ino, SOCK_INODE(sock)->i_ino);
+	}
 }
 
 static inline wait_queue_head_t *sk_sleep(struct sock *sk)
@@ -2077,6 +2083,7 @@ static inline void sock_orphan(struct sock *sk)
 	sk_set_socket(sk, NULL);
 	sk->sk_wq  = NULL;
 	/* Note: sk_uid is unchanged. */
+	WRITE_ONCE(sk->sk_ino, 0);
 	write_unlock_bh(&sk->sk_callback_lock);
 }
 
@@ -2087,20 +2094,22 @@ static inline void sock_graft(struct sock *sk, struct socket *parent)
 	rcu_assign_pointer(sk->sk_wq, &parent->wq);
 	parent->sk = sk;
 	sk_set_socket(sk, parent);
-	WRITE_ONCE(sk->sk_uid, SOCK_INODE(parent)->i_uid);
 	security_sock_graft(sk, parent);
 	write_unlock_bh(&sk->sk_callback_lock);
 }
 
+static inline unsigned long sock_i_ino(const struct sock *sk)
+{
+	/* Paired with WRITE_ONCE() in sock_graft() and sock_orphan() */
+	return READ_ONCE(sk->sk_ino);
+}
+
 static inline kuid_t sk_uid(const struct sock *sk)
 {
 	/* Paired with WRITE_ONCE() in sockfs_setattr() */
 	return READ_ONCE(sk->sk_uid);
 }
 
-unsigned long __sock_i_ino(struct sock *sk);
-unsigned long sock_i_ino(struct sock *sk);
-
 static inline kuid_t sock_net_uid(const struct net *net, const struct sock *sk)
 {
 	return sk ? sk_uid(sk) : make_kuid(net->user_ns, 0);
diff --git a/net/core/sock.c b/net/core/sock.c
index 7c26ec8dce63..158bddd23134 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -2780,28 +2780,6 @@ void sock_pfree(struct sk_buff *skb)
 EXPORT_SYMBOL(sock_pfree);
 #endif /* CONFIG_INET */
 
-unsigned long __sock_i_ino(struct sock *sk)
-{
-	unsigned long ino;
-
-	read_lock(&sk->sk_callback_lock);
-	ino = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_ino : 0;
-	read_unlock(&sk->sk_callback_lock);
-	return ino;
-}
-EXPORT_SYMBOL(__sock_i_ino);
-
-unsigned long sock_i_ino(struct sock *sk)
-{
-	unsigned long ino;
-
-	local_bh_disable();
-	ino = __sock_i_ino(sk);
-	local_bh_enable();
-	return ino;
-}
-EXPORT_SYMBOL(sock_i_ino);
-
 /*
  * Allocate a skb from the socket's send buffer.
  */
diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c
index 9a287b75c1b3..e6fd97b21e9e 100644
--- a/net/mptcp/protocol.c
+++ b/net/mptcp/protocol.c
@@ -3554,7 +3554,6 @@ void mptcp_sock_graft(struct sock *sk, struct socket *parent)
 	write_lock_bh(&sk->sk_callback_lock);
 	rcu_assign_pointer(sk->sk_wq, &parent->wq);
 	sk_set_socket(sk, parent);
-	WRITE_ONCE(sk->sk_uid, SOCK_INODE(parent)->i_uid);
 	write_unlock_bh(&sk->sk_callback_lock);
 }
 
diff --git a/net/netlink/diag.c b/net/netlink/diag.c
index 61981e01fd6f..b8e58132e8af 100644
--- a/net/netlink/diag.c
+++ b/net/netlink/diag.c
@@ -168,7 +168,7 @@ static int __netlink_diag_dump(struct sk_buff *skb, struct netlink_callback *cb,
 				 NETLINK_CB(cb->skb).portid,
 				 cb->nlh->nlmsg_seq,
 				 NLM_F_MULTI,
-				 __sock_i_ino(sk)) < 0) {
+				 sock_i_ino(sk)) < 0) {
 			ret = 1;
 			break;
 		}
-- 
2.51.0.338.gd7d06c2dae-goog


^ permalink raw reply related	[flat|nested] 7+ messages in thread

* Re: [PATCH net] net: lockless sock_i_ino()
  2025-09-02 18:36 [PATCH net] net: lockless sock_i_ino() Eric Dumazet
@ 2025-09-03  5:26 ` Kuniyuki Iwashima
  2025-09-03  7:41 ` Sebastian Andrzej Siewior
                   ` (2 subsequent siblings)
  3 siblings, 0 replies; 7+ messages in thread
From: Kuniyuki Iwashima @ 2025-09-03  5:26 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: David S . Miller, Jakub Kicinski, Paolo Abeni, Simon Horman,
	netdev, eric.dumazet, syzbot+50603c05bbdf4dfdaffa,
	Sebastian Andrzej Siewior

On Tue, Sep 2, 2025 at 11:36 AM Eric Dumazet <edumazet@google.com> wrote:
>
> Followup of commit c51da3f7a161 ("net: remove sock_i_uid()")
>
> A recent syzbot report was the trigger for this change.
>
> Over the years, we had many problems caused by the
> read_lock[_bh](&sk->sk_callback_lock) in sock_i_uid().
>
> We could fix smc_diag_dump_proto() or make a more radical move:
>
> Instead of waiting for new syzbot reports, cache the socket
> inode number in sk->sk_ino, so that we no longer
> need to acquire sk->sk_callback_lock in sock_i_ino().
>
> This makes socket dumps faster (one less cache line miss,
> and two atomic ops avoided).
>
> Prior art:
>
> commit 25a9c8a4431c ("netlink: Add __sock_i_ino() for __netlink_diag_dump().")
> commit 4f9bf2a2f5aa ("tcp: Don't acquire inet_listen_hashbucket::lock with disabled BH.")
> commit efc3dbc37412 ("rds: Make rds_sock_lock BH rather than IRQ safe.")
>
> Fixes: d2d6422f8bd1 ("x86: Allow to enable PREEMPT_RT.")
> Reported-by: syzbot+50603c05bbdf4dfdaffa@syzkaller.appspotmail.com
> Closes: https://lore.kernel.org/netdev/68b73804.050a0220.3db4df.01d8.GAE@google.com/T/#u
> Signed-off-by: Eric Dumazet <edumazet@google.com>

Reviewed-by: Kuniyuki Iwashima <kuniyu@google.com>

Thanks!

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [PATCH net] net: lockless sock_i_ino()
  2025-09-02 18:36 [PATCH net] net: lockless sock_i_ino() Eric Dumazet
  2025-09-03  5:26 ` Kuniyuki Iwashima
@ 2025-09-03  7:41 ` Sebastian Andrzej Siewior
  2025-09-03 23:30 ` patchwork-bot+netdevbpf
  2025-09-15 18:16 ` Andrei Vagin
  3 siblings, 0 replies; 7+ messages in thread
From: Sebastian Andrzej Siewior @ 2025-09-03  7:41 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: David S . Miller, Jakub Kicinski, Paolo Abeni, Simon Horman,
	netdev, eric.dumazet, syzbot+50603c05bbdf4dfdaffa,
	Kuniyuki Iwashima

On 2025-09-02 18:36:03 [+0000], Eric Dumazet wrote:
> Followup of commit c51da3f7a161 ("net: remove sock_i_uid()")
> 
> A recent syzbot report was the trigger for this change.
> 
> Over the years, we had many problems caused by the
> read_lock[_bh](&sk->sk_callback_lock) in sock_i_uid().
> 
> We could fix smc_diag_dump_proto() or make a more radical move:
> 
> Instead of waiting for new syzbot reports, cache the socket
> inode number in sk->sk_ino, so that we no longer
> need to acquire sk->sk_callback_lock in sock_i_ino().
> 
> This makes socket dumps faster (one less cache line miss,
> and two atomic ops avoided).
> 
> Prior art:
> 
> commit 25a9c8a4431c ("netlink: Add __sock_i_ino() for __netlink_diag_dump().")
> commit 4f9bf2a2f5aa ("tcp: Don't acquire inet_listen_hashbucket::lock with disabled BH.")
> commit efc3dbc37412 ("rds: Make rds_sock_lock BH rather than IRQ safe.")
> 
> Fixes: d2d6422f8bd1 ("x86: Allow to enable PREEMPT_RT.")
> Reported-by: syzbot+50603c05bbdf4dfdaffa@syzkaller.appspotmail.com
> Closes: https://lore.kernel.org/netdev/68b73804.050a0220.3db4df.01d8.GAE@google.com/T/#u
> Signed-off-by: Eric Dumazet <edumazet@google.com>
> Cc: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
> Cc: Kuniyuki Iwashima <kuniyu@google.com>

Reviewed-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>

Sebastian

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [PATCH net] net: lockless sock_i_ino()
  2025-09-02 18:36 [PATCH net] net: lockless sock_i_ino() Eric Dumazet
  2025-09-03  5:26 ` Kuniyuki Iwashima
  2025-09-03  7:41 ` Sebastian Andrzej Siewior
@ 2025-09-03 23:30 ` patchwork-bot+netdevbpf
  2025-09-15 18:16 ` Andrei Vagin
  3 siblings, 0 replies; 7+ messages in thread
From: patchwork-bot+netdevbpf @ 2025-09-03 23:30 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: davem, kuba, pabeni, horms, netdev, eric.dumazet,
	syzbot+50603c05bbdf4dfdaffa, bigeasy, kuniyu

Hello:

This patch was applied to netdev/net.git (main)
by Jakub Kicinski <kuba@kernel.org>:

On Tue,  2 Sep 2025 18:36:03 +0000 you wrote:
> Followup of commit c51da3f7a161 ("net: remove sock_i_uid()")
> 
> A recent syzbot report was the trigger for this change.
> 
> Over the years, we had many problems caused by the
> read_lock[_bh](&sk->sk_callback_lock) in sock_i_uid().
> 
> [...]

Here is the summary with links:
  - [net] net: lockless sock_i_ino()
    https://git.kernel.org/netdev/net/c/5d6b58c932ec

You are awesome, thank you!
-- 
Deet-doot-dot, I am a bot.
https://korg.docs.kernel.org/patchwork/pwbot.html



^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [PATCH net] net: lockless sock_i_ino()
  2025-09-02 18:36 [PATCH net] net: lockless sock_i_ino() Eric Dumazet
                   ` (2 preceding siblings ...)
  2025-09-03 23:30 ` patchwork-bot+netdevbpf
@ 2025-09-15 18:16 ` Andrei Vagin
  2025-09-15 18:51   ` Eric Dumazet
  3 siblings, 1 reply; 7+ messages in thread
From: Andrei Vagin @ 2025-09-15 18:16 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: David S . Miller, Jakub Kicinski, Paolo Abeni, Simon Horman,
	netdev, eric.dumazet, syzbot+50603c05bbdf4dfdaffa,
	Sebastian Andrzej Siewior, Kuniyuki Iwashima

On Tue, Sep 02, 2025 at 06:36:03PM +0000, Eric Dumazet wrote:
> @@ -2056,6 +2058,10 @@ static inline int sk_rx_queue_get(const struct sock *sk)
>  static inline void sk_set_socket(struct sock *sk, struct socket *sock)
>  {
>  	sk->sk_socket = sock;
> +	if (sock) {
> +		WRITE_ONCE(sk->sk_uid, SOCK_INODE(sock)->i_uid);
> +		WRITE_ONCE(sk->sk_ino, SOCK_INODE(sock)->i_ino);
> +	}

Hi Eric.

This change breaks CRIU [1]. The issue is that socket_diag reports two
sockets with the same inode number. It seems inet_csk_clone_lock copies
sk->sk_ino to child sockets, but sk_set_socket doesn’t reset it to zero
when sock is NULL.

[1] https://github.com/checkpoint-restore/criu/issues/2744

Thanks,
Andrei

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [PATCH net] net: lockless sock_i_ino()
  2025-09-15 18:16 ` Andrei Vagin
@ 2025-09-15 18:51   ` Eric Dumazet
  2025-09-15 19:01     ` Andrei Vagin
  0 siblings, 1 reply; 7+ messages in thread
From: Eric Dumazet @ 2025-09-15 18:51 UTC (permalink / raw)
  To: Andrei Vagin
  Cc: David S . Miller, Jakub Kicinski, Paolo Abeni, Simon Horman,
	netdev, eric.dumazet, syzbot+50603c05bbdf4dfdaffa,
	Sebastian Andrzej Siewior, Kuniyuki Iwashima

On Mon, Sep 15, 2025 at 11:16 AM Andrei Vagin <avagin@google.com> wrote:
>
> On Tue, Sep 02, 2025 at 06:36:03PM +0000, Eric Dumazet wrote:
> > @@ -2056,6 +2058,10 @@ static inline int sk_rx_queue_get(const struct sock *sk)
> >  static inline void sk_set_socket(struct sock *sk, struct socket *sock)
> >  {
> >       sk->sk_socket = sock;
> > +     if (sock) {
> > +             WRITE_ONCE(sk->sk_uid, SOCK_INODE(sock)->i_uid);
> > +             WRITE_ONCE(sk->sk_ino, SOCK_INODE(sock)->i_ino);
> > +     }
>
> Hi Eric.
>
> This change breaks CRIU [1]. The issue is that socket_diag reports two
> sockets with the same inode number. It seems inet_csk_clone_lock copies
> sk->sk_ino to child sockets, but sk_set_socket doesn’t reset it to zero
> when sock is NULL.

Hi Andrei, thanks for the report.

Could you test this patch ?

diff --git a/include/net/sock.h b/include/net/sock.h
index 0fd465935334160eeda7c1ea608f5d6161f02cb1..36e11b2afb223bf18ff0596d634e885cca549d0f
100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -2063,6 +2063,9 @@ static inline void sk_set_socket(struct sock
*sk, struct socket *sock)
        if (sock) {
                WRITE_ONCE(sk->sk_uid, SOCK_INODE(sock)->i_uid);
                WRITE_ONCE(sk->sk_ino, SOCK_INODE(sock)->i_ino);
+       } else {
+               /* Note: sk_uid is unchanged. */
+               WRITE_ONCE(sk->sk_ino, 0);
        }
 }

@@ -2084,8 +2087,6 @@ static inline void sock_orphan(struct sock *sk)
        sock_set_flag(sk, SOCK_DEAD);
        sk_set_socket(sk, NULL);
        sk->sk_wq  = NULL;
-       /* Note: sk_uid is unchanged. */
-       WRITE_ONCE(sk->sk_ino, 0);
        write_unlock_bh(&sk->sk_callback_lock);
 }

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [PATCH net] net: lockless sock_i_ino()
  2025-09-15 18:51   ` Eric Dumazet
@ 2025-09-15 19:01     ` Andrei Vagin
  0 siblings, 0 replies; 7+ messages in thread
From: Andrei Vagin @ 2025-09-15 19:01 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: David S . Miller, Jakub Kicinski, Paolo Abeni, Simon Horman,
	netdev, eric.dumazet, syzbot+50603c05bbdf4dfdaffa,
	Sebastian Andrzej Siewior, Kuniyuki Iwashima

On Mon, Sep 15, 2025 at 11:51 AM Eric Dumazet <edumazet@google.com> wrote:
>
> On Mon, Sep 15, 2025 at 11:16 AM Andrei Vagin <avagin@google.com> wrote:
> >
> > On Tue, Sep 02, 2025 at 06:36:03PM +0000, Eric Dumazet wrote:
> > > @@ -2056,6 +2058,10 @@ static inline int sk_rx_queue_get(const struct sock *sk)
> > >  static inline void sk_set_socket(struct sock *sk, struct socket *sock)
> > >  {
> > >       sk->sk_socket = sock;
> > > +     if (sock) {
> > > +             WRITE_ONCE(sk->sk_uid, SOCK_INODE(sock)->i_uid);
> > > +             WRITE_ONCE(sk->sk_ino, SOCK_INODE(sock)->i_ino);
> > > +     }
> >
> > Hi Eric.
> >
> > This change breaks CRIU [1]. The issue is that socket_diag reports two
> > sockets with the same inode number. It seems inet_csk_clone_lock copies
> > sk->sk_ino to child sockets, but sk_set_socket doesn’t reset it to zero
> > when sock is NULL.
>
> Hi Andrei, thanks for the report.
>
> Could you test this patch ?
>
> diff --git a/include/net/sock.h b/include/net/sock.h
> index 0fd465935334160eeda7c1ea608f5d6161f02cb1..36e11b2afb223bf18ff0596d634e885cca549d0f
> 100644
> --- a/include/net/sock.h
> +++ b/include/net/sock.h
> @@ -2063,6 +2063,9 @@ static inline void sk_set_socket(struct sock
> *sk, struct socket *sock)
>         if (sock) {
>                 WRITE_ONCE(sk->sk_uid, SOCK_INODE(sock)->i_uid);
>                 WRITE_ONCE(sk->sk_ino, SOCK_INODE(sock)->i_ino);
> +       } else {
> +               /* Note: sk_uid is unchanged. */
> +               WRITE_ONCE(sk->sk_ino, 0);

It fixes the issue. Thanks!

^ permalink raw reply	[flat|nested] 7+ messages in thread

end of thread, other threads:[~2025-09-15 19:01 UTC | newest]

Thread overview: 7+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2025-09-02 18:36 [PATCH net] net: lockless sock_i_ino() Eric Dumazet
2025-09-03  5:26 ` Kuniyuki Iwashima
2025-09-03  7:41 ` Sebastian Andrzej Siewior
2025-09-03 23:30 ` patchwork-bot+netdevbpf
2025-09-15 18:16 ` Andrei Vagin
2025-09-15 18:51   ` Eric Dumazet
2025-09-15 19:01     ` Andrei Vagin

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).