netdev.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [can/j1939] unregister_netdevice: waiting for vcan0 to become free. Usage count = 2
@ 2025-11-20 10:11 Tetsuo Handa
  2025-11-21  9:06 ` Oleksij Rempel
  0 siblings, 1 reply; 8+ messages in thread
From: Tetsuo Handa @ 2025-11-20 10:11 UTC (permalink / raw)
  To: linux-can, Network Development, Marc Kleine-Budde, Oleksij Rempel

Hello.

I am using a debug printk() patch for j1939_priv which records/counts where
refcount for j1939_priv has changed, and syzbot succeeded to record/count a
j1939_priv leak in next-20251119
( https://syzkaller.appspot.com/bug?extid=881d65229ca4f9ae8c84 ).

The output from the debug printk() patch is shown below. I think that
understanding what actions have been taken on this j1939_priv object will
help you finding the cause of j1939_priv leak bug.

Regards.



unregister_netdevice: waiting for vcan0 to become free. Usage count = 2
Call trace for vcan0@ffff888031c9c000 +1 at
     j1939_priv_create net/can/j1939/main.c:150 [inline]
     j1939_netdev_start+0x1de/0xa30 net/can/j1939/main.c:282
     j1939_sk_bind+0x926/0xca0 net/can/j1939/socket.c:498
     __sys_bind_socket net/socket.c:1878 [inline]
     __sys_bind+0x2c6/0x3e0 net/socket.c:1909
Call trace for vcan0@ffff888031c9c000 +1 at
     j1939_priv_get net/can/j1939/main.c:191 [inline]
     j1939_can_rx_register net/can/j1939/main.c:199 [inline]
     j1939_netdev_start+0x615/0xa30 net/can/j1939/main.c:305
     j1939_sk_bind+0x926/0xca0 net/can/j1939/socket.c:498
     __sys_bind_socket net/socket.c:1878 [inline]
     __sys_bind+0x2c6/0x3e0 net/socket.c:1909
Call trace for vcan0@ffff888031c9c000 +1 at
     j1939_sk_bind+0xa02/0xca0 net/can/j1939/socket.c:510
     __sys_bind_socket net/socket.c:1878 [inline]
     __sys_bind+0x2c6/0x3e0 net/socket.c:1909
Call trace for vcan0@ffff888031c9c000 +1 at
     j1939_jsk_add net/can/j1939/socket.c:81 [inline]
     j1939_sk_bind+0x769/0xca0 net/can/j1939/socket.c:530
     __sys_bind_socket net/socket.c:1878 [inline]
     __sys_bind+0x2c6/0x3e0 net/socket.c:1909
Call trace for vcan0@ffff888031c9c000 +2 at
     j1939_session_new+0x127/0x450 net/can/j1939/transport.c:1503
     j1939_tp_send+0x338/0x8c0 net/can/j1939/transport.c:2018
     j1939_sk_send_loop net/can/j1939/socket.c:1159 [inline]
     j1939_sk_sendmsg+0xaf8/0x1350 net/can/j1939/socket.c:1282
     sock_sendmsg_nosec net/socket.c:727 [inline]
     __sock_sendmsg+0x21c/0x270 net/socket.c:746
Call trace for vcan0@ffff888031c9c000 +2 at
     j1939_priv_get net/can/j1939/main.c:191 [inline]
     j1939_can_recv+0x17f/0xa30 net/can/j1939/main.c:54
     deliver net/can/af_can.c:575 [inline]
     can_rcv_filter+0x357/0x7d0 net/can/af_can.c:609
     can_receive+0x312/0x450 net/can/af_can.c:666
     can_rcv+0x145/0x270 net/can/af_can.c:690
     __netif_receive_skb_one_core net/core/dev.c:6130 [inline]
     __netif_receive_skb+0x164/0x380 net/core/dev.c:6243
     process_backlog+0x622/0x1530 net/core/dev.c:6595
     __napi_poll+0xae/0x320 net/core/dev.c:7659
     napi_poll net/core/dev.c:7722 [inline]
     net_rx_action+0x672/0xe50 net/core/dev.c:7874
     handle_softirqs+0x27d/0x880 kernel/softirq.c:626
Call trace for vcan0@ffff888031c9c000 +1 at
     j1939_session_new+0x127/0x450 net/can/j1939/transport.c:1503
     j1939_session_fresh_new net/can/j1939/transport.c:1543 [inline]
     j1939_xtp_rx_rts_session_new net/can/j1939/transport.c:1628 [inline]
     j1939_xtp_rx_rts+0xd16/0x18b0 net/can/j1939/transport.c:1749
     j1939_tp_cmd_recv net/can/j1939/transport.c:2071 [inline]
     j1939_tp_recv+0xb24/0x1040 net/can/j1939/transport.c:2158
     j1939_can_recv+0x6a0/0xa30 net/can/j1939/main.c:108
     deliver net/can/af_can.c:575 [inline]
     can_rcv_filter+0x357/0x7d0 net/can/af_can.c:609
     can_receive+0x312/0x450 net/can/af_can.c:666
     can_rcv+0x145/0x270 net/can/af_can.c:690
     __netif_receive_skb_one_core net/core/dev.c:6130 [inline]
     __netif_receive_skb+0x164/0x380 net/core/dev.c:6243
     process_backlog+0x622/0x1530 net/core/dev.c:6595
     __napi_poll+0xae/0x320 net/core/dev.c:7659
     napi_poll net/core/dev.c:7722 [inline]
     net_rx_action+0x672/0xe50 net/core/dev.c:7874
     handle_softirqs+0x27d/0x880 kernel/softirq.c:626
Call trace for vcan0@ffff888031c9c000 -2 at
     j1939_priv_put+0x23/0x370 net/can/j1939/main.c:184
     j1939_can_recv+0x6e0/0xa30 net/can/j1939/main.c:115
     deliver net/can/af_can.c:575 [inline]
     can_rcv_filter+0x357/0x7d0 net/can/af_can.c:609
     can_receive+0x312/0x450 net/can/af_can.c:666
     can_rcv+0x145/0x270 net/can/af_can.c:690
     __netif_receive_skb_one_core net/core/dev.c:6130 [inline]
     __netif_receive_skb+0x164/0x380 net/core/dev.c:6243
     process_backlog+0x622/0x1530 net/core/dev.c:6595
     __napi_poll+0xae/0x320 net/core/dev.c:7659
     napi_poll net/core/dev.c:7722 [inline]
     net_rx_action+0x672/0xe50 net/core/dev.c:7874
     handle_softirqs+0x27d/0x880 kernel/softirq.c:626
Call trace for vcan0@ffff888031c9c000 -2 at
     j1939_priv_put+0x23/0x370 net/can/j1939/main.c:184
     j1939_session_destroy net/can/j1939/transport.c:285 [inline]
     __j1939_session_release net/can/j1939/transport.c:294 [inline]
     kref_put include/linux/kref.h:65 [inline]
     j1939_session_put+0x2f0/0x460 net/can/j1939/transport.c:299
     j1939_tp_rxtimer+0x177/0x3d0 net/can/j1939/transport.c:1266
     __run_hrtimer kernel/time/hrtimer.c:1777 [inline]
     __hrtimer_run_queues+0x51c/0xc70 kernel/time/hrtimer.c:1841
     hrtimer_run_softirq+0x187/0x2b0 kernel/time/hrtimer.c:1858
     handle_softirqs+0x27d/0x880 kernel/softirq.c:626
Call trace for vcan0@ffff888031c9c000 -1 at
     j1939_priv_put+0x23/0x370 net/can/j1939/main.c:184
     j1939_jsk_del net/can/j1939/socket.c:94 [inline]
     j1939_sk_release+0x408/0x7c0 net/can/j1939/socket.c:649
     __sock_release net/socket.c:662 [inline]
     sock_close+0xc3/0x240 net/socket.c:1459
     __fput+0x44c/0xa70 fs/file_table.c:468
     task_work_run+0x1d4/0x260 kernel/task_work.c:233
     resume_user_mode_work include/linux/resume_user_mode.h:50 [inline]
     __exit_to_user_mode_loop kernel/entry/common.c:44 [inline]
     exit_to_user_mode_loop+0xff/0x4f0 kernel/entry/common.c:75
     __exit_to_user_mode_prepare include/linux/irq-entry-common.h:226 [inline]
     syscall_exit_to_user_mode_prepare include/linux/irq-entry-common.h:256 [inline]
     syscall_exit_to_user_mode_work include/linux/entry-common.h:159 [inline]
     syscall_exit_to_user_mode include/linux/entry-common.h:194 [inline]
     do_syscall_64+0x2e9/0xfa0 arch/x86/entry/syscall_64.c:100
     entry_SYSCALL_64_after_hwframe+0x77/0x7f
Call trace for vcan0@ffff888031c9c000 -1 at
     j1939_priv_put+0x23/0x370 net/can/j1939/main.c:184
     j1939_can_rx_unregister net/can/j1939/main.c:221 [inline]
     __j1939_rx_release net/can/j1939/main.c:230 [inline]
     kref_put_mutex include/linux/kref.h:86 [inline]
     j1939_netdev_stop+0xa6/0x190 net/can/j1939/main.c:325
     j1939_sk_release+0x471/0x7c0 net/can/j1939/socket.c:654
     __sock_release net/socket.c:662 [inline]
     sock_close+0xc3/0x240 net/socket.c:1459
     __fput+0x44c/0xa70 fs/file_table.c:468
     task_work_run+0x1d4/0x260 kernel/task_work.c:233
     resume_user_mode_work include/linux/resume_user_mode.h:50 [inline]
     __exit_to_user_mode_loop kernel/entry/common.c:44 [inline]
     exit_to_user_mode_loop+0xff/0x4f0 kernel/entry/common.c:75
     __exit_to_user_mode_prepare include/linux/irq-entry-common.h:226 [inline]
     syscall_exit_to_user_mode_prepare include/linux/irq-entry-common.h:256 [inline]
     syscall_exit_to_user_mode_work include/linux/entry-common.h:159 [inline]
     syscall_exit_to_user_mode include/linux/entry-common.h:194 [inline]
     do_syscall_64+0x2e9/0xfa0 arch/x86/entry/syscall_64.c:100
     entry_SYSCALL_64_after_hwframe+0x77/0x7f
Call trace for vcan0@ffff888031c9c000 -1 at
     j1939_priv_put+0x23/0x370 net/can/j1939/main.c:184
     j1939_sk_release+0x471/0x7c0 net/can/j1939/socket.c:654
     __sock_release net/socket.c:662 [inline]
     sock_close+0xc3/0x240 net/socket.c:1459
     __fput+0x44c/0xa70 fs/file_table.c:468
     task_work_run+0x1d4/0x260 kernel/task_work.c:233
     resume_user_mode_work include/linux/resume_user_mode.h:50 [inline]
     __exit_to_user_mode_loop kernel/entry/common.c:44 [inline]
     exit_to_user_mode_loop+0xff/0x4f0 kernel/entry/common.c:75
     __exit_to_user_mode_prepare include/linux/irq-entry-common.h:226 [inline]
     syscall_exit_to_user_mode_prepare include/linux/irq-entry-common.h:256 [inline]
     syscall_exit_to_user_mode_work include/linux/entry-common.h:159 [inline]
     syscall_exit_to_user_mode include/linux/entry-common.h:194 [inline]
     do_syscall_64+0x2e9/0xfa0 arch/x86/entry/syscall_64.c:100
     entry_SYSCALL_64_after_hwframe+0x77/0x7f
Call trace for vcan0@ffff888031c9c000 -1 at
     j1939_priv_put+0x23/0x370 net/can/j1939/main.c:184
     j1939_sk_sock_destruct+0x52/0x90 net/can/j1939/socket.c:386
     __sk_destruct+0x85/0x880 net/core/sock.c:2350
     rcu_do_batch kernel/rcu/tree.c:2605 [inline]
     rcu_core+0xcab/0x1770 kernel/rcu/tree.c:2861
     handle_softirqs+0x27d/0x880 kernel/softirq.c:626
balance for vcan0@j1939_priv is 1

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [can/j1939] unregister_netdevice: waiting for vcan0 to become free. Usage count = 2
  2025-11-20 10:11 [can/j1939] unregister_netdevice: waiting for vcan0 to become free. Usage count = 2 Tetsuo Handa
@ 2025-11-21  9:06 ` Oleksij Rempel
  2025-11-21  9:33   ` Tetsuo Handa
  0 siblings, 1 reply; 8+ messages in thread
From: Oleksij Rempel @ 2025-11-21  9:06 UTC (permalink / raw)
  To: Tetsuo Handa; +Cc: linux-can, Network Development, Marc Kleine-Budde

Hello Tetsuo,

On Thu, Nov 20, 2025 at 07:11:22PM +0900, Tetsuo Handa wrote:
> Hello.
> 
> I am using a debug printk() patch for j1939_priv which records/counts where
> refcount for j1939_priv has changed, and syzbot succeeded to record/count a
> j1939_priv leak in next-20251119
> ( https://syzkaller.appspot.com/bug?extid=881d65229ca4f9ae8c84 ).
> 
> The output from the debug printk() patch is shown below. I think that
> understanding what actions have been taken on this j1939_priv object will
> help you finding the cause of j1939_priv leak bug.

Hm, looks like we have a race where new session is created in
j1939_xtp_rx_rts(), just at the moment where we call
j1939_can_rx_unregister().

Haw about following change:

--- a/net/can/j1939/main.c
+++ b/net/can/j1939/main.c
@@ -214,6 +214,7 @@ static void __j1939_rx_release(struct kref *kref)
                                               rx_kref);
 
        j1939_can_rx_unregister(priv);
+       j1939_cancel_active_session(priv, NULL);
        j1939_ecu_unmap_all(priv);
        j1939_priv_set(priv->ndev, NULL);
        mutex_unlock(&j1939_netdev_lock);

Best Regards,
Oleksij
-- 
Pengutronix e.K.                           |                             |
Steuerwalder Str. 21                       | http://www.pengutronix.de/  |
31137 Hildesheim, Germany                  | Phone: +49-5121-206917-0    |
Amtsgericht Hildesheim, HRA 2686           | Fax:   +49-5121-206917-5555 |

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [can/j1939] unregister_netdevice: waiting for vcan0 to become free. Usage count = 2
  2025-11-21  9:06 ` Oleksij Rempel
@ 2025-11-21  9:33   ` Tetsuo Handa
  2025-11-21 10:00     ` Oleksij Rempel
  0 siblings, 1 reply; 8+ messages in thread
From: Tetsuo Handa @ 2025-11-21  9:33 UTC (permalink / raw)
  To: Oleksij Rempel; +Cc: linux-can, Network Development, Marc Kleine-Budde

On 2025/11/21 18:06, Oleksij Rempel wrote:
> Hm, looks like we have a race where new session is created in
> j1939_xtp_rx_rts(), just at the moment where we call
> j1939_can_rx_unregister().
> 
> Haw about following change:
> 
> --- a/net/can/j1939/main.c
> +++ b/net/can/j1939/main.c
> @@ -214,6 +214,7 @@ static void __j1939_rx_release(struct kref *kref)
>                                                rx_kref);
>  
>         j1939_can_rx_unregister(priv);
> +       j1939_cancel_active_session(priv, NULL);
>         j1939_ecu_unmap_all(priv);
>         j1939_priv_set(priv->ndev, NULL);
>         mutex_unlock(&j1939_netdev_lock);
> 

Well, j1939_cancel_active_session(priv, NULL) is already called from
j1939_netdev_notify(NETDEV_UNREGISTER). Unless a session is recreated
after NETDEV_UNREGISTER event was handled, I can't imagine such race.

We can see that there are three j1939_session_new() calls but only
two j1939_session_destroy() calls. There might be a refcount leak on
j1939_session which prevents j1939_priv from dropping final refcount.

  Call trace for vcan0@ffff888031c9c000 +2 at
       j1939_session_new+0x127/0x450 net/can/j1939/transport.c:1503
       j1939_tp_send+0x338/0x8c0 net/can/j1939/transport.c:2018

  Call trace for vcan0@ffff888031c9c000 +1 at
       j1939_session_new+0x127/0x450 net/can/j1939/transport.c:1503
       j1939_session_fresh_new net/can/j1939/transport.c:1543 [inline]
       j1939_xtp_rx_rts_session_new net/can/j1939/transport.c:1628 [inline]
       j1939_xtp_rx_rts+0xd16/0x18b0 net/can/j1939/transport.c:1749

  Call trace for vcan0@ffff888031c9c000 -2 at
       j1939_priv_put+0x23/0x370 net/can/j1939/main.c:184
       j1939_session_destroy net/can/j1939/transport.c:285 [inline]
       __j1939_session_release net/can/j1939/transport.c:294 [inline]
       kref_put include/linux/kref.h:65 [inline]

Do we want to update
https://git.kernel.org/pub/scm/linux/kernel/git/next/linux-next.git/commit/net/can/j1939?id=5ac798f79b48065b0284216c7a0057271185a882
in order to also try tracing refcount for j1939_session ?


^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [can/j1939] unregister_netdevice: waiting for vcan0 to become free. Usage count = 2
  2025-11-21  9:33   ` Tetsuo Handa
@ 2025-11-21 10:00     ` Oleksij Rempel
  2025-11-21 10:19       ` Tetsuo Handa
  0 siblings, 1 reply; 8+ messages in thread
From: Oleksij Rempel @ 2025-11-21 10:00 UTC (permalink / raw)
  To: Tetsuo Handa; +Cc: linux-can, Network Development, Marc Kleine-Budde

On Fri, Nov 21, 2025 at 06:33:02PM +0900, Tetsuo Handa wrote:
> On 2025/11/21 18:06, Oleksij Rempel wrote:
> > Hm, looks like we have a race where new session is created in
> > j1939_xtp_rx_rts(), just at the moment where we call
> > j1939_can_rx_unregister().
> > 
> > Haw about following change:
> > 
> > --- a/net/can/j1939/main.c
> > +++ b/net/can/j1939/main.c
> > @@ -214,6 +214,7 @@ static void __j1939_rx_release(struct kref *kref)
> >                                                rx_kref);
> >  
> >         j1939_can_rx_unregister(priv);
> > +       j1939_cancel_active_session(priv, NULL);
> >         j1939_ecu_unmap_all(priv);
> >         j1939_priv_set(priv->ndev, NULL);
> >         mutex_unlock(&j1939_netdev_lock);
> > 
> 
> Well, j1939_cancel_active_session(priv, NULL) is already called from
> j1939_netdev_notify(NETDEV_UNREGISTER). Unless a session is recreated
> after NETDEV_UNREGISTER event was handled, I can't imagine such race.
> 
> We can see that there are three j1939_session_new() calls but only
> two j1939_session_destroy() calls. There might be a refcount leak on
> j1939_session which prevents j1939_priv from dropping final refcount.
> 
>   Call trace for vcan0@ffff888031c9c000 +2 at
>        j1939_session_new+0x127/0x450 net/can/j1939/transport.c:1503
>        j1939_tp_send+0x338/0x8c0 net/can/j1939/transport.c:2018
> 
>   Call trace for vcan0@ffff888031c9c000 +1 at
>        j1939_session_new+0x127/0x450 net/can/j1939/transport.c:1503
>        j1939_session_fresh_new net/can/j1939/transport.c:1543 [inline]
>        j1939_xtp_rx_rts_session_new net/can/j1939/transport.c:1628 [inline]
>        j1939_xtp_rx_rts+0xd16/0x18b0 net/can/j1939/transport.c:1749
> 
>   Call trace for vcan0@ffff888031c9c000 -2 at
>        j1939_priv_put+0x23/0x370 net/can/j1939/main.c:184
>        j1939_session_destroy net/can/j1939/transport.c:285 [inline]
>        __j1939_session_release net/can/j1939/transport.c:294 [inline]
>        kref_put include/linux/kref.h:65 [inline]
> 
> Do we want to update
> https://git.kernel.org/pub/scm/linux/kernel/git/next/linux-next.git/commit/net/can/j1939?id=5ac798f79b48065b0284216c7a0057271185a882
> in order to also try tracing refcount for j1939_session ?

Ack.

-- 
Pengutronix e.K.                           |                             |
Steuerwalder Str. 21                       | http://www.pengutronix.de/  |
31137 Hildesheim, Germany                  | Phone: +49-5121-206917-0    |
Amtsgericht Hildesheim, HRA 2686           | Fax:   +49-5121-206917-5555 |

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [can/j1939] unregister_netdevice: waiting for vcan0 to become free. Usage count = 2
  2025-11-21 10:00     ` Oleksij Rempel
@ 2025-11-21 10:19       ` Tetsuo Handa
  2025-11-21 10:31         ` Oleksij Rempel
  0 siblings, 1 reply; 8+ messages in thread
From: Tetsuo Handa @ 2025-11-21 10:19 UTC (permalink / raw)
  To: Oleksij Rempel; +Cc: linux-can, Network Development, Marc Kleine-Budde

On 2025/11/21 19:00, Oleksij Rempel wrote:
>> Do we want to update
>> https://git.kernel.org/pub/scm/linux/kernel/git/next/linux-next.git/commit/net/can/j1939?id=5ac798f79b48065b0284216c7a0057271185a882
>> in order to also try tracing refcount for j1939_session ?
> 
> Ack.
> 

I see.

By the way, I am thinking

diff --git a/net/can/j1939/socket.c b/net/can/j1939/socket.c
index 88e7160d4248..f12679446990 100644
--- a/net/can/j1939/socket.c
+++ b/net/can/j1939/socket.c
@@ -477,16 +477,22 @@ static int j1939_sk_bind(struct socket *sock, struct sockaddr *uaddr, int len)
 		struct net_device *ndev;
 
 		ndev = dev_get_by_index(net, addr->can_ifindex);
 		if (!ndev) {
 			ret = -ENODEV;
 			goto out_release_sock;
 		}
 
+		if (ndev->reg_state != NETREG_REGISTERED) {
+			dev_put(ndev);
+			ret = -ENODEV;
+			goto out_release_sock;
+		}
+
 		can_ml = can_get_ml_priv(ndev);
 		if (!can_ml) {
 			dev_put(ndev);
 			ret = -ENODEV;
 			goto out_release_sock;
 		}
 
 		if (!(ndev->flags & IFF_UP)) {

as an alternative approach for
https://lkml.kernel.org/r/9a3f9a95-1f58-4d67-9ab4-1ca360f86f79@I-love.SAKURA.ne.jp
because I consider that getting a new refcount on net_device should be avoided
when NETDEV_UNREGISTER event has already started.

Maybe we can do similar thing for j1939_session in order to avoid getting a new
refcount on j1939_priv when NETDEV_UNREGISTER event has already started.

diff --git a/net/can/j1939/transport.c b/net/can/j1939/transport.c
index fbf5c8001c9d..b22568fecba5 100644
--- a/net/can/j1939/transport.c
+++ b/net/can/j1939/transport.c
@@ -1492,6 +1492,8 @@ static struct j1939_session *j1939_session_new(struct j1939_priv *priv,
 	struct j1939_session *session;
 	struct j1939_sk_buff_cb *skcb;
 
+	if (priv->ndev->reg_state != NETREG_REGISTERED)
+		return NULL;
 	session = kzalloc(sizeof(*session), gfp_any());
 	if (!session)
 		return NULL;



^ permalink raw reply related	[flat|nested] 8+ messages in thread

* Re: [can/j1939] unregister_netdevice: waiting for vcan0 to become free. Usage count = 2
  2025-11-21 10:19       ` Tetsuo Handa
@ 2025-11-21 10:31         ` Oleksij Rempel
  2025-11-22  7:00           ` Tetsuo Handa
  0 siblings, 1 reply; 8+ messages in thread
From: Oleksij Rempel @ 2025-11-21 10:31 UTC (permalink / raw)
  To: Tetsuo Handa; +Cc: linux-can, Network Development, Marc Kleine-Budde

On Fri, Nov 21, 2025 at 07:19:24PM +0900, Tetsuo Handa wrote:
> On 2025/11/21 19:00, Oleksij Rempel wrote:
> >> Do we want to update
> >> https://git.kernel.org/pub/scm/linux/kernel/git/next/linux-next.git/commit/net/can/j1939?id=5ac798f79b48065b0284216c7a0057271185a882
> >> in order to also try tracing refcount for j1939_session ?
> > 
> > Ack.
> > 
> 
> I see.
> 
> By the way, I am thinking
> 
> diff --git a/net/can/j1939/socket.c b/net/can/j1939/socket.c
> index 88e7160d4248..f12679446990 100644
> --- a/net/can/j1939/socket.c
> +++ b/net/can/j1939/socket.c
> @@ -477,16 +477,22 @@ static int j1939_sk_bind(struct socket *sock, struct sockaddr *uaddr, int len)
>  		struct net_device *ndev;
>  
>  		ndev = dev_get_by_index(net, addr->can_ifindex);
>  		if (!ndev) {
>  			ret = -ENODEV;
>  			goto out_release_sock;
>  		}
>  
> +		if (ndev->reg_state != NETREG_REGISTERED) {
> +			dev_put(ndev);
> +			ret = -ENODEV;
> +			goto out_release_sock;
> +		}
> +
>  		can_ml = can_get_ml_priv(ndev);
>  		if (!can_ml) {
>  			dev_put(ndev);
>  			ret = -ENODEV;
>  			goto out_release_sock;
>  		}
>  
>  		if (!(ndev->flags & IFF_UP)) {
> 
> as an alternative approach for
> https://lkml.kernel.org/r/9a3f9a95-1f58-4d67-9ab4-1ca360f86f79@I-love.SAKURA.ne.jp
> because I consider that getting a new refcount on net_device should be avoided
> when NETDEV_UNREGISTER event has already started.
> 
> Maybe we can do similar thing for j1939_session in order to avoid getting a new
> refcount on j1939_priv when NETDEV_UNREGISTER event has already started.
> 
> diff --git a/net/can/j1939/transport.c b/net/can/j1939/transport.c
> index fbf5c8001c9d..b22568fecba5 100644
> --- a/net/can/j1939/transport.c
> +++ b/net/can/j1939/transport.c
> @@ -1492,6 +1492,8 @@ static struct j1939_session *j1939_session_new(struct j1939_priv *priv,
>  	struct j1939_session *session;
>  	struct j1939_sk_buff_cb *skcb;
>  
> +	if (priv->ndev->reg_state != NETREG_REGISTERED)
> +		return NULL;
>  	session = kzalloc(sizeof(*session), gfp_any());
>  	if (!session)
>  		return NULL;
> 

Yes, it make sense to proactively prevent the session. Good idea.

-- 
Pengutronix e.K.                           |                             |
Steuerwalder Str. 21                       | http://www.pengutronix.de/  |
31137 Hildesheim, Germany                  | Phone: +49-5121-206917-0    |
Amtsgericht Hildesheim, HRA 2686           | Fax:   +49-5121-206917-5555 |

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [can/j1939] unregister_netdevice: waiting for vcan0 to become free. Usage count = 2
  2025-11-21 10:31         ` Oleksij Rempel
@ 2025-11-22  7:00           ` Tetsuo Handa
  2025-11-22 13:03             ` Tetsuo Handa
  0 siblings, 1 reply; 8+ messages in thread
From: Tetsuo Handa @ 2025-11-22  7:00 UTC (permalink / raw)
  To: Oleksij Rempel; +Cc: linux-can, Network Development, Marc Kleine-Budde

On 2025/11/21 19:31, Oleksij Rempel wrote:
>> diff --git a/net/can/j1939/transport.c b/net/can/j1939/transport.c
>> index fbf5c8001c9d..b22568fecba5 100644
>> --- a/net/can/j1939/transport.c
>> +++ b/net/can/j1939/transport.c
>> @@ -1492,6 +1492,8 @@ static struct j1939_session *j1939_session_new(struct j1939_priv *priv,
>>  	struct j1939_session *session;
>>  	struct j1939_sk_buff_cb *skcb;
>>  
>> +	if (priv->ndev->reg_state != NETREG_REGISTERED)
>> +		return NULL;
>>  	session = kzalloc(sizeof(*session), gfp_any());
>>  	if (!session)
>>  		return NULL;
>>
> 
> Yes, it make sense to proactively prevent the session. Good idea.
> 

Done updating my patch. But it turned out that your guess was correct before
syzbot tries my patch. A sample j1939 program (generated by Google AI search)
succeeded to trigger this race if below delay injection patch is applied.

--- a/net/can/j1939/transport.c
+++ b/net/can/j1939/transport.c
@@ -1492,6 +1494,9 @@ static struct j1939_session *j1939_session_new(struct j1939_priv *priv,
        struct j1939_session *session;
        struct j1939_sk_buff_cb *skcb;

+       printk("j1939_session_new() delay start\n");
+       mdelay(5000); // <= Run "ip link del dev vcan0 type vcan" here.
+       printk("j1939_session_new() delay end\n");
        session = kzalloc(sizeof(*session), gfp_any());
        if (!session)
                return NULL;

So, not only we need to make sure that all existing j1939_session are destroyed
but also we need to make sure that no new j1939_session is created if underlying
net_device is no longer in NETREG_REGISTERED state.

F.Y.I. Below change did not help. Also, although j1939_netdev_notify(NETDEV_UNREGISTER)
is called periodically, j1939_cancel_active_session(priv, NULL) is called only once
because j1939_sk_netdev_event_unregister(priv) calls j1939_priv_set(priv->ndev, NULL)
 from __j1939_rx_release(). There is currently no mechanism to retry when we hit
the race above.

--- a/net/can/j1939/main.c
+++ b/net/can/j1939/main.c
@@ -214,6 +214,7 @@ static void __j1939_rx_release(struct kref *kref)
                                               rx_kref);
 
        j1939_can_rx_unregister(priv);
+       j1939_cancel_active_session(priv, NULL);
        j1939_ecu_unmap_all(priv);
        j1939_priv_set(priv->ndev, NULL);
        mutex_unlock(&j1939_netdev_lock);


^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [can/j1939] unregister_netdevice: waiting for vcan0 to become free. Usage count = 2
  2025-11-22  7:00           ` Tetsuo Handa
@ 2025-11-22 13:03             ` Tetsuo Handa
  0 siblings, 0 replies; 8+ messages in thread
From: Tetsuo Handa @ 2025-11-22 13:03 UTC (permalink / raw)
  To: Oleksij Rempel; +Cc: linux-can, Network Development, Marc Kleine-Budde

On 2025/11/22 16:00, Tetsuo Handa wrote:
> So, not only we need to make sure that all existing j1939_session are destroyed
> but also we need to make sure that no new j1939_session is created if underlying
> net_device is no longer in NETREG_REGISTERED state.

For your testing, here is a delay injection patch and a complete reproducer.

---------- delay injection patch start ----------
diff --git a/net/can/j1939/transport.c b/net/can/j1939/transport.c
index fbf5c8001c9d..601a32397f72 100644
--- a/net/can/j1939/transport.c
+++ b/net/can/j1939/transport.c
@@ -1492,6 +1492,9 @@ static struct j1939_session *j1939_session_new(struct j1939_priv *priv,
 	struct j1939_session *session;
 	struct j1939_sk_buff_cb *skcb;
 
+	pr_info("%s() delay start\n", __func__);
+	mdelay(5000);
+	pr_info("%s() delay end\n", __func__);
 	session = kzalloc(sizeof(*session), gfp_any());
 	if (!session)
 		return NULL;
---------- delay injection patch end ----------

---------- j1939_example.c start ----------
#define _GNU_SOURCE
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <sys/socket.h>
#include <sys/ioctl.h>
#include <sched.h>
#include <linux/can.h>
#include <linux/can/j1939.h>
#include <net/if.h>
#include <errno.h>

#define IF_NAME "vcan0"
#define SRC_ADDR 0x20 // SA
#define DST_ADDR 0x30 // DA
#define PGN_TX   0x12300 // Sender PGN
#define PGN_RX   0x12300 // Receiver PGN

static void sender_task(int sock_s);
static void receiver_task(int sock_r);

int main(int argc, char *argv[])
{
	int sock_s, sock_r;
	struct sockaddr_can addr_s, addr_r;
	struct ifreq ifr;

	// Create a new namespace.
	if (unshare(CLONE_NEWNET)) {
		perror("unshare failed");
		return 1;
	}

	// Create vcan0 in that namespace.
	system("/usr/sbin/ip link add dev vcan0 type vcan");
	system("/usr/sbin/ip link set up vcan0");

	sock_s = socket(PF_CAN, SOCK_DGRAM, CAN_J1939);
	sock_r = socket(PF_CAN, SOCK_DGRAM, CAN_J1939);

	if (sock_s < 0 || sock_r < 0) {
		perror("socket creation failed");
		return 1;
	}

	strcpy(ifr.ifr_name, IF_NAME);
	if (ioctl(sock_s, SIOCGIFINDEX, &ifr) < 0) {
		perror("ioctl SIOCGIFINDEX failed");
		return 1;
	}

	addr_s.can_family = AF_CAN;
	addr_s.can_ifindex = ifr.ifr_ifindex;
	addr_s.can_addr.j1939.name = J1939_NO_NAME;
	addr_s.can_addr.j1939.addr = SRC_ADDR;
	addr_s.can_addr.j1939.pgn = J1939_NO_PGN;
	// Delete vcan0 in that namespace while bind() on vcan0 is in progress.
	if (fork() == 0) {
		sleep(1);
		system("/usr/sbin/ip link del dev vcan0 type vcan");
		_exit(0);
	}
	// Delay is injected by the kernel side.
	if (bind(sock_s, (struct sockaddr *)&addr_s, sizeof(addr_s)) < 0) {
		perror("sender bind failed");
		return 1;
	}

	addr_r.can_family = AF_CAN;
	addr_r.can_ifindex = ifr.ifr_ifindex;
	addr_r.can_addr.j1939.name = J1939_NO_NAME;
	addr_r.can_addr.j1939.addr = DST_ADDR;
	addr_r.can_addr.j1939.pgn = PGN_RX;
	if (bind(sock_r, (struct sockaddr *)&addr_r, sizeof(addr_r)) < 0) {
		perror("receiver bind failed");
		return 1;
	}

	printf("J1939 sockets set up on %s\n", IF_NAME);
	printf("Sender (SA 0x%02X) and Receiver (PGN 0x%05X) ready.\n", SRC_ADDR, PGN_RX);
	sender_task(sock_s);
	receiver_task(sock_r);
	return 0;
}

static void sender_task(int sock_s) {
	struct sockaddr_can addr_dest;
	socklen_t len_dest = sizeof(addr_dest);
	char data[] = "Hello J1939 Localhost!";

	addr_dest.can_family = AF_CAN;
	addr_dest.can_ifindex = 0;
	addr_dest.can_addr.j1939.name = J1939_NO_NAME;
	addr_dest.can_addr.j1939.addr = DST_ADDR;
	addr_dest.can_addr.j1939.pgn = PGN_TX;
	printf("Sending message: \"%s\" (Length: %lu)\n", data, (unsigned long)strlen(data) + 1);
	if (sendto(sock_s, data, strlen(data) + 1, 0, (struct sockaddr *)&addr_dest, len_dest) < 0) {
		perror("sendto failed");
	} else {
		printf("Message sent successfully.\n");
	}
}

static void receiver_task(int sock_r) {
	struct sockaddr_can addr_src;
	socklen_t len_src = sizeof(addr_src);
	char buffer[256];
	ssize_t bytes_received;

	printf("Waiting for messages...\n");
	bytes_received = recvfrom(sock_r, buffer, sizeof(buffer) - 1, 0, (struct sockaddr *)&addr_src, &len_src);
	if (bytes_received < 0) {
		perror("recvfrom failed");
	} else {
		buffer[bytes_received] = '\0';
		printf("Received %zd bytes from SA 0x%02X, PGN 0x%05X: \"%s\"\n",
		       bytes_received, addr_src.can_addr.j1939.addr, addr_src.can_addr.j1939.pgn, buffer);
	}
}
---------- j1939_example.c end ----------


^ permalink raw reply related	[flat|nested] 8+ messages in thread

end of thread, other threads:[~2025-11-22 13:03 UTC | newest]

Thread overview: 8+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2025-11-20 10:11 [can/j1939] unregister_netdevice: waiting for vcan0 to become free. Usage count = 2 Tetsuo Handa
2025-11-21  9:06 ` Oleksij Rempel
2025-11-21  9:33   ` Tetsuo Handa
2025-11-21 10:00     ` Oleksij Rempel
2025-11-21 10:19       ` Tetsuo Handa
2025-11-21 10:31         ` Oleksij Rempel
2025-11-22  7:00           ` Tetsuo Handa
2025-11-22 13:03             ` Tetsuo Handa

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).