public inbox for netdev@vger.kernel.org
 help / color / mirror / Atom feed
* [PATCH net v1] net/ipv6: mcast: fix circular locking dependency in __ipv6_dev_mc_inc()
@ 2026-03-17 11:12 Jiayuan Chen
  2026-03-19  1:15 ` Jakub Kicinski
  2026-03-19 12:33 ` [net,v1] " Paolo Abeni
  0 siblings, 2 replies; 9+ messages in thread
From: Jiayuan Chen @ 2026-03-17 11:12 UTC (permalink / raw)
  To: netdev
  Cc: Jiayuan Chen, syzbot+afbcf622635e98bf40d2, Jiayuan Chen,
	David S. Miller, David Ahern, Eric Dumazet, Jakub Kicinski,
	Paolo Abeni, Simon Horman, Taehee Yoo, linux-kernel

From: Jiayuan Chen <jiayuan.chen@shopee.com>

syzbot reported a possible circular locking dependency:

  fs_reclaim --> sk_lock-AF_INET6 --> &idev->mc_lock

  CPU0                              CPU1
  ----                              ----
  lock(&idev->mc_lock)
                                    lock(sk_lock-AF_INET6)
                                    lock(&idev->mc_lock)  // blocked
  kzalloc(GFP_KERNEL)
    fs_reclaim
      ...nbd I/O...
        sk_lock-AF_INET6            // blocked -> DEADLOCK

__ipv6_dev_mc_inc() does GFP_KERNEL allocation inside mc_lock via
mca_alloc(). This can enter memory reclaim, which through nbd block
I/O may need sk_lock-AF_INET6. But sk_lock -> mc_lock already exists
via setsockopt -> __ipv6_sock_mc_join, so we have a deadlock.

Before commit 63ed8de4be81 ("mld: add mc_lock for protecting
per-interface mld data"), only RTNL was held during the allocation.
The lock ordering was always RTNL -> sk_lock (the nbd path doesn't
involve RTNL), so there was no circular dependency.

Split mca_alloc() into mca_alloc() + mca_init(): mca_alloc() does the
GFP_KERNEL allocation before mc_lock, mca_init() initializes under
mc_lock. If the address already exists, the pre-allocated memory is
simply freed. Also move inet6_ifmcaddr_notify() outside mc_lock since
it also does GFP_KERNEL allocation.

Fixes: 63ed8de4be81 ("mld: add mc_lock for protecting per-interface mld data")
Reported-by: syzbot+afbcf622635e98bf40d2@syzkaller.appspotmail.com
Closes: https://lore.kernel.org/all/69b7dc76.050a0220.248e02.0113.GAE@google.com/T/
Cc: Jiayuan Chen <jiayuan.chen@linux.dev>
Signed-off-by: Jiayuan Chen <jiayuan.chen@shopee.com>
---
 net/ipv6/mcast.c | 36 +++++++++++++++++++-----------------
 1 file changed, 19 insertions(+), 17 deletions(-)

diff --git a/net/ipv6/mcast.c b/net/ipv6/mcast.c
index 3330adcf26db..2dfa7ed54d17 100644
--- a/net/ipv6/mcast.c
+++ b/net/ipv6/mcast.c
@@ -860,18 +860,16 @@ static void ma_put(struct ifmcaddr6 *mc)
 	}
 }
 
-static struct ifmcaddr6 *mca_alloc(struct inet6_dev *idev,
-				   const struct in6_addr *addr,
-				   unsigned int mode)
+static struct ifmcaddr6 *mca_alloc(void)
 {
-	struct ifmcaddr6 *mc;
+	return kzalloc_obj(struct ifmcaddr6);
+}
 
+static void mca_init(struct inet6_dev *idev, const struct in6_addr *addr,
+		     unsigned int mode, struct ifmcaddr6 *mc)
+{
 	mc_assert_locked(idev);
 
-	mc = kzalloc_obj(*mc);
-	if (!mc)
-		return NULL;
-
 	INIT_DELAYED_WORK(&mc->mca_work, mld_mca_work);
 
 	mc->mca_addr = *addr;
@@ -887,8 +885,6 @@ static struct ifmcaddr6 *mca_alloc(struct inet6_dev *idev,
 	if (ipv6_addr_is_ll_all_nodes(&mc->mca_addr) ||
 	    IPV6_ADDR_MC_SCOPE(&mc->mca_addr) < IPV6_ADDR_SCOPE_LINKLOCAL)
 		mc->mca_flags |= MAF_NOREPORT;
-
-	return mc;
 }
 
 static void inet6_ifmcaddr_notify(struct net_device *dev,
@@ -932,6 +928,7 @@ static void inet6_ifmcaddr_notify(struct net_device *dev,
 static int __ipv6_dev_mc_inc(struct net_device *dev,
 			     const struct in6_addr *addr, unsigned int mode)
 {
+	struct ifmcaddr6 *mc_alloced;
 	struct inet6_dev *idev;
 	struct ifmcaddr6 *mc;
 
@@ -940,10 +937,17 @@ static int __ipv6_dev_mc_inc(struct net_device *dev,
 	if (!idev)
 		return -EINVAL;
 
+	mc_alloced = mca_alloc();
+	if (!mc_alloced) {
+		in6_dev_put(idev);
+		return -ENOMEM;
+	}
+
 	mutex_lock(&idev->mc_lock);
 
 	if (READ_ONCE(idev->dead)) {
 		mutex_unlock(&idev->mc_lock);
+		kfree(mc_alloced);
 		in6_dev_put(idev);
 		return -ENODEV;
 	}
@@ -953,26 +957,24 @@ static int __ipv6_dev_mc_inc(struct net_device *dev,
 			mc->mca_users++;
 			ip6_mc_add_src(idev, &mc->mca_addr, mode, 0, NULL, 0);
 			mutex_unlock(&idev->mc_lock);
+			kfree(mc_alloced);
 			in6_dev_put(idev);
 			return 0;
 		}
 	}
 
-	mc = mca_alloc(idev, addr, mode);
-	if (!mc) {
-		mutex_unlock(&idev->mc_lock);
-		in6_dev_put(idev);
-		return -ENOMEM;
-	}
+	mca_init(idev, addr, mode, mc_alloced);
+	mc = mc_alloced;
 
 	rcu_assign_pointer(mc->next, idev->mc_list);
 	rcu_assign_pointer(idev->mc_list, mc);
 
 	mld_del_delrec(idev, mc);
 	igmp6_group_added(mc);
-	inet6_ifmcaddr_notify(dev, mc, RTM_NEWMULTICAST);
 	mutex_unlock(&idev->mc_lock);
 
+	inet6_ifmcaddr_notify(dev, mc, RTM_NEWMULTICAST);
+
 	return 0;
 }
 
-- 
2.43.0


^ permalink raw reply related	[flat|nested] 9+ messages in thread

* Re: [PATCH net v1] net/ipv6: mcast: fix circular locking dependency in __ipv6_dev_mc_inc()
  2026-03-17 11:12 [PATCH net v1] net/ipv6: mcast: fix circular locking dependency in __ipv6_dev_mc_inc() Jiayuan Chen
@ 2026-03-19  1:15 ` Jakub Kicinski
  2026-03-19  3:04   ` Jiayuan Chen
  2026-03-19 12:33 ` [net,v1] " Paolo Abeni
  1 sibling, 1 reply; 9+ messages in thread
From: Jakub Kicinski @ 2026-03-19  1:15 UTC (permalink / raw)
  To: Jiayuan Chen
  Cc: netdev, Jiayuan Chen, syzbot+afbcf622635e98bf40d2,
	David S. Miller, David Ahern, Eric Dumazet, Paolo Abeni,
	Simon Horman, Taehee Yoo, linux-kernel

On Tue, 17 Mar 2026 19:12:07 +0800 Jiayuan Chen wrote:
> syzbot reported a possible circular locking dependency:
> 
>   fs_reclaim --> sk_lock-AF_INET6 --> &idev->mc_lock
> 
>   CPU0                              CPU1
>   ----                              ----
>   lock(&idev->mc_lock)
>                                     lock(sk_lock-AF_INET6)
>                                     lock(&idev->mc_lock)  // blocked
>   kzalloc(GFP_KERNEL)
>     fs_reclaim
>       ...nbd I/O...
>         sk_lock-AF_INET6            // blocked -> DEADLOCK
> 
> __ipv6_dev_mc_inc() does GFP_KERNEL allocation inside mc_lock via
> mca_alloc(). This can enter memory reclaim, which through nbd block
> I/O may need sk_lock-AF_INET6. But sk_lock -> mc_lock already exists
> via setsockopt -> __ipv6_sock_mc_join, so we have a deadlock.
> 
> Before commit 63ed8de4be81 ("mld: add mc_lock for protecting
> per-interface mld data"), only RTNL was held during the allocation.
> The lock ordering was always RTNL -> sk_lock (the nbd path doesn't
> involve RTNL), so there was no circular dependency.
> 
> Split mca_alloc() into mca_alloc() + mca_init(): mca_alloc() does the
> GFP_KERNEL allocation before mc_lock, mca_init() initializes under
> mc_lock. If the address already exists, the pre-allocated memory is
> simply freed. Also move inet6_ifmcaddr_notify() outside mc_lock since
> it also does GFP_KERNEL allocation.

Moving the allocation seems fine, but also having to move the
notification, potentially letting the notification go out of order
makes me wonder if we aren't better off adding helpers for taking this
lock which also call memalloc_noio_{save,restore} ?

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH net v1] net/ipv6: mcast: fix circular locking dependency in __ipv6_dev_mc_inc()
  2026-03-19  1:15 ` Jakub Kicinski
@ 2026-03-19  3:04   ` Jiayuan Chen
  2026-03-19  3:26     ` Jakub Kicinski
  0 siblings, 1 reply; 9+ messages in thread
From: Jiayuan Chen @ 2026-03-19  3:04 UTC (permalink / raw)
  To: Jakub Kicinski
  Cc: netdev, Jiayuan Chen, syzbot+afbcf622635e98bf40d2,
	David S. Miller, David Ahern, Eric Dumazet, Paolo Abeni,
	Simon Horman, Taehee Yoo, linux-kernel


On 3/19/26 9:15 AM, Jakub Kicinski wrote:
> On Tue, 17 Mar 2026 19:12:07 +0800 Jiayuan Chen wrote:
>> syzbot reported a possible circular locking dependency:
>>
>>    fs_reclaim --> sk_lock-AF_INET6 --> &idev->mc_lock
>>
>>    CPU0                              CPU1
>>    ----                              ----
>>    lock(&idev->mc_lock)
>>                                      lock(sk_lock-AF_INET6)
>>                                      lock(&idev->mc_lock)  // blocked
>>    kzalloc(GFP_KERNEL)
>>      fs_reclaim
>>        ...nbd I/O...
>>          sk_lock-AF_INET6            // blocked -> DEADLOCK
>>
>> __ipv6_dev_mc_inc() does GFP_KERNEL allocation inside mc_lock via
>> mca_alloc(). This can enter memory reclaim, which through nbd block
>> I/O may need sk_lock-AF_INET6. But sk_lock -> mc_lock already exists
>> via setsockopt -> __ipv6_sock_mc_join, so we have a deadlock.
>>
>> Before commit 63ed8de4be81 ("mld: add mc_lock for protecting
>> per-interface mld data"), only RTNL was held during the allocation.
>> The lock ordering was always RTNL -> sk_lock (the nbd path doesn't
>> involve RTNL), so there was no circular dependency.
>>
>> Split mca_alloc() into mca_alloc() + mca_init(): mca_alloc() does the
>> GFP_KERNEL allocation before mc_lock, mca_init() initializes under
>> mc_lock. If the address already exists, the pre-allocated memory is
>> simply freed. Also move inet6_ifmcaddr_notify() outside mc_lock since
>> it also does GFP_KERNEL allocation.
> Moving the allocation seems fine, but also having to move the
> notification, potentially letting the notification go out of order
> makes me wonder if we aren't better off adding helpers for taking this
> lock which also call memalloc_noio_{save,restore} ?
Yeah, using memalloc_noio helpers is simpler. I checked and there
are about 18 places taking mc_lock, so having a common mc_lock()/mc_unlock()
wrapper that does the noio save/restore covers them all (if necessary).

The only thing that feels a bit odd is using memalloc_noio in the networking
subsystem. It makes sense in block/fs to protect itself from recursion.

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH net v1] net/ipv6: mcast: fix circular locking dependency in __ipv6_dev_mc_inc()
  2026-03-19  3:04   ` Jiayuan Chen
@ 2026-03-19  3:26     ` Jakub Kicinski
  2026-03-19  4:12       ` Jiayuan Chen
  2026-03-19 12:44       ` Paolo Abeni
  0 siblings, 2 replies; 9+ messages in thread
From: Jakub Kicinski @ 2026-03-19  3:26 UTC (permalink / raw)
  To: Jiayuan Chen
  Cc: netdev, Jiayuan Chen, syzbot+afbcf622635e98bf40d2,
	David S. Miller, David Ahern, Eric Dumazet, Paolo Abeni,
	Simon Horman, Taehee Yoo, linux-kernel

On Thu, 19 Mar 2026 11:04:24 +0800 Jiayuan Chen wrote:
> >> Split mca_alloc() into mca_alloc() + mca_init(): mca_alloc() does the
> >> GFP_KERNEL allocation before mc_lock, mca_init() initializes under
> >> mc_lock. If the address already exists, the pre-allocated memory is
> >> simply freed. Also move inet6_ifmcaddr_notify() outside mc_lock since
> >> it also does GFP_KERNEL allocation.  
> > Moving the allocation seems fine, but also having to move the
> > notification, potentially letting the notification go out of order
> > makes me wonder if we aren't better off adding helpers for taking this
> > lock which also call memalloc_noio_{save,restore} ?  
> Yeah, using memalloc_noio helpers is simpler. I checked and there
> are about 18 places taking mc_lock, so having a common mc_lock()/mc_unlock()
> wrapper that does the noio save/restore covers them all (if necessary).
> 
> The only thing that feels a bit odd is using memalloc_noio in the networking
> subsystem. It makes sense in block/fs to protect itself from recursion.

Totally agree that it feels a bit odd that we have to worry about IO,
but unless we can figure out a way to prevent nbd sockets from getting
here all our solutions are dealing with noio in networking code :(
IMHO it's better to acknowledge this with the explicit memalloc_noio 
so future developers don't break things again with a mis-placed
allocation.

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH net v1] net/ipv6: mcast: fix circular locking dependency in __ipv6_dev_mc_inc()
  2026-03-19  3:26     ` Jakub Kicinski
@ 2026-03-19  4:12       ` Jiayuan Chen
  2026-03-19 12:44       ` Paolo Abeni
  1 sibling, 0 replies; 9+ messages in thread
From: Jiayuan Chen @ 2026-03-19  4:12 UTC (permalink / raw)
  To: Jakub Kicinski
  Cc: netdev, Jiayuan Chen, syzbot+afbcf622635e98bf40d2,
	David S. Miller, David Ahern, Eric Dumazet, Paolo Abeni,
	Simon Horman, Taehee Yoo, linux-kernel


On 3/19/26 11:26 AM, Jakub Kicinski wrote:
>> Yeah, using memalloc_noio helpers is simpler. I checked and there
>> are about 18 places taking mc_lock, so having a common mc_lock()/mc_unlock()
>> wrapper that does the noio save/restore covers them all (if necessary).
>>
>> The only thing that feels a bit odd is using memalloc_noio in the networking
>> subsystem. It makes sense in block/fs to protect itself from recursion.
> Totally agree that it feels a bit odd that we have to worry about IO,
> but unless we can figure out a way to prevent nbd sockets from getting
> here all our solutions are dealing with noio in networking code 🙁
> IMHO it's better to acknowledge this with the explicit memalloc_noio
> so future developers don't break things again with a mis-placed
> allocation.


Makes sense. I checked and there are about 18 mc_lock sites, and more
than 10 of them have GFP_KERNEL allocations under the lock. If we want
to fix the mc_lock + GFP_KERNEL combination properly, it makes sense to
convert all of them to noio helpers.

But if that feels like too much for this fix, I can just focus on the one
reported by syzbot.


^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [net,v1] net/ipv6: mcast: fix circular locking dependency in __ipv6_dev_mc_inc()
  2026-03-17 11:12 [PATCH net v1] net/ipv6: mcast: fix circular locking dependency in __ipv6_dev_mc_inc() Jiayuan Chen
  2026-03-19  1:15 ` Jakub Kicinski
@ 2026-03-19 12:33 ` Paolo Abeni
  1 sibling, 0 replies; 9+ messages in thread
From: Paolo Abeni @ 2026-03-19 12:33 UTC (permalink / raw)
  To: jiayuan.chen
  Cc: ap420073, dsahern, horms, kuba, jiayuan.chen, edumazet, pabeni,
	linux-kernel, davem, netdev, syzbot+afbcf622635e98bf40d2

This is an AI-generated review of your patch. The human sending this
email has considered the AI review valid, or at least plausible.

For local reproduction steps see https://netdev-ai.bots.linux.dev/ai-local.html
---
net/ipv6: mcast: fix circular locking dependency in __ipv6_dev_mc_inc()

This commit fixes a circular locking dependency deadlock by moving the
GFP_KERNEL allocation outside the mc_lock critical section. The fix
splits mca_alloc() into allocation and initialization phases and also
moves inet6_ifmcaddr_notify() outside the lock.

Link: https://lore.kernel.org/all/69b7dc76.050a0220.248e02.0113.GAE@google.com/T/

> diff --git a/net/ipv6/mcast.c b/net/ipv6/mcast.c
> --- a/net/ipv6/mcast.c
> +++ b/net/ipv6/mcast.c

[ ... ]

> @@ -953,25 +957,23 @@ static int __ipv6_dev_mc_inc(struct net_device *dev,
>  			mc->mca_users++;
>  			ip6_mc_add_src(idev, &mc->mca_addr, mode, 0, NULL, 0);
>  			mutex_unlock(&idev->mc_lock);
> +			kfree(mc_alloced);
>  			in6_dev_put(idev);
>  			return 0;
>  		}
>  	}
>
> -	mc = mca_alloc(idev, addr, mode);
> -	if (!mc) {
> -		mutex_unlock(&idev->mc_lock);
> -		in6_dev_put(idev);
> -		return -ENOMEM;
> -	}
> +	mca_init(idev, addr, mode, mc_alloced);
> +	mc = mc_alloced;
>
>  	rcu_assign_pointer(mc->next, idev->mc_list);
>  	rcu_assign_pointer(idev->mc_list, mc);
>
>  	mld_del_delrec(idev, mc);
>  	igmp6_group_added(mc);
> +	mutex_unlock(&idev->mc_lock);
> +
>  	inet6_ifmcaddr_notify(dev, mc, RTM_NEWMULTICAST);
> -	mutex_unlock(&idev->mc_lock);
                ^^^^

Can this lead to a use-after-free on mc?

After mutex_unlock(&idev->mc_lock), the mc object has been published to
idev->mc_list with mca_refcnt=1. At this point, a concurrent
ipv6_mc_destroy_dev() could acquire mc_lock, iterate mc_list, and call
ma_put() on mc:

net/ipv6/mcast.c:ipv6_mc_destroy_dev() {
    mutex_lock(&idev->mc_lock);
    while ((mc = rcu_dereference_protected(idev->mc_list, ...)) != NULL) {
        ...
        ma_put(mc);
    }
    mutex_unlock(&idev->mc_lock);
}

Since mca_refcnt starts at 1 (set by mca_init), ma_put() would decrement
it to 0 and call kfree_rcu(mc, rcu):

net/ipv6/mcast.c:ma_put() {
    if (refcount_dec_and_test(&mc->mca_refcnt)) {
        in6_dev_put(mc->idev);
        kfree_rcu(mc, rcu);
    }
}

Then inet6_ifmcaddr_notify() calls nlmsg_new(..., GFP_KERNEL) which can
sleep and create an RCU quiescent state, allowing the kfree_rcu callback
to execute and free mc before inet6_fill_ifmcaddr() accesses mc->idev,
mc->idev->dev->ifindex, and mc->mca_addr.

Should the fix take an additional refcount on mc (refcount_inc(&mc->mca_refcnt))
before releasing mc_lock, then drop it with ma_put(mc) after
inet6_ifmcaddr_notify() returns?

>
>  	return 0;
>  }


^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH net v1] net/ipv6: mcast: fix circular locking dependency in __ipv6_dev_mc_inc()
  2026-03-19  3:26     ` Jakub Kicinski
  2026-03-19  4:12       ` Jiayuan Chen
@ 2026-03-19 12:44       ` Paolo Abeni
  2026-03-19 15:36         ` Wouter Verhelst
  2026-03-23  6:54         ` Kuniyuki Iwashima
  1 sibling, 2 replies; 9+ messages in thread
From: Paolo Abeni @ 2026-03-19 12:44 UTC (permalink / raw)
  To: Jakub Kicinski, Jiayuan Chen, Josef Bacik
  Cc: netdev, Jiayuan Chen, syzbot+afbcf622635e98bf40d2,
	David S. Miller, David Ahern, Eric Dumazet, Simon Horman,
	Taehee Yoo, linux-kernel, nbd

Adding Josef.

On 3/19/26 4:26 AM, Jakub Kicinski wrote:
> On Thu, 19 Mar 2026 11:04:24 +0800 Jiayuan Chen wrote:
>>>> Split mca_alloc() into mca_alloc() + mca_init(): mca_alloc() does the
>>>> GFP_KERNEL allocation before mc_lock, mca_init() initializes under
>>>> mc_lock. If the address already exists, the pre-allocated memory is
>>>> simply freed. Also move inet6_ifmcaddr_notify() outside mc_lock since
>>>> it also does GFP_KERNEL allocation.  
>>> Moving the allocation seems fine, but also having to move the
>>> notification, potentially letting the notification go out of order
>>> makes me wonder if we aren't better off adding helpers for taking this
>>> lock which also call memalloc_noio_{save,restore} ?  
>> Yeah, using memalloc_noio helpers is simpler. I checked and there
>> are about 18 places taking mc_lock, so having a common mc_lock()/mc_unlock()
>> wrapper that does the noio save/restore covers them all (if necessary).
>>
>> The only thing that feels a bit odd is using memalloc_noio in the networking
>> subsystem. It makes sense in block/fs to protect itself from recursion.
> 
> Totally agree that it feels a bit odd that we have to worry about IO,
> but unless we can figure out a way to prevent nbd sockets from getting
> here all our solutions are dealing with noio in networking code :(
> IMHO it's better to acknowledge this with the explicit memalloc_noio 
> so future developers don't break things again with a mis-placed
> allocation.

I think a problem here is that the nbd socket is still exposed to
user-space, while in use by the block device. I fear that the more
syzkaller will learn new tricks, the more we will have to had strange
noio all around the networking code.

I *think* we could prevent this kind of races with something alike the
following:
- nbd sets a DOIO sk flag on the sockets it uses.
- the socket layer prevents socketopts()/ioctl() entirely on DOIO sk

I'm not sure if that could break nbd users, but allowing the user-space
to mess with the socket used for backing a block device looks very
dangerous.

/P


^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH net v1] net/ipv6: mcast: fix circular locking dependency in __ipv6_dev_mc_inc()
  2026-03-19 12:44       ` Paolo Abeni
@ 2026-03-19 15:36         ` Wouter Verhelst
  2026-03-23  6:54         ` Kuniyuki Iwashima
  1 sibling, 0 replies; 9+ messages in thread
From: Wouter Verhelst @ 2026-03-19 15:36 UTC (permalink / raw)
  To: Paolo Abeni
  Cc: Jakub Kicinski, Jiayuan Chen, Josef Bacik, netdev, Jiayuan Chen,
	syzbot+afbcf622635e98bf40d2, David S. Miller, David Ahern,
	Eric Dumazet, Simon Horman, Taehee Yoo, linux-kernel, nbd

Hi Paolo,

On Thu, Mar 19, 2026 at 01:44:00PM +0100, Paolo Abeni wrote:
> Adding Josef.
> 
> On 3/19/26 4:26 AM, Jakub Kicinski wrote:
> > On Thu, 19 Mar 2026 11:04:24 +0800 Jiayuan Chen wrote:
> >>>> Split mca_alloc() into mca_alloc() + mca_init(): mca_alloc() does the
> >>>> GFP_KERNEL allocation before mc_lock, mca_init() initializes under
> >>>> mc_lock. If the address already exists, the pre-allocated memory is
> >>>> simply freed. Also move inet6_ifmcaddr_notify() outside mc_lock since
> >>>> it also does GFP_KERNEL allocation.  
> >>> Moving the allocation seems fine, but also having to move the
> >>> notification, potentially letting the notification go out of order
> >>> makes me wonder if we aren't better off adding helpers for taking this
> >>> lock which also call memalloc_noio_{save,restore} ?  
> >> Yeah, using memalloc_noio helpers is simpler. I checked and there
> >> are about 18 places taking mc_lock, so having a common mc_lock()/mc_unlock()
> >> wrapper that does the noio save/restore covers them all (if necessary).
> >>
> >> The only thing that feels a bit odd is using memalloc_noio in the networking
> >> subsystem. It makes sense in block/fs to protect itself from recursion.
> > 
> > Totally agree that it feels a bit odd that we have to worry about IO,
> > but unless we can figure out a way to prevent nbd sockets from getting
> > here all our solutions are dealing with noio in networking code :(
> > IMHO it's better to acknowledge this with the explicit memalloc_noio 
> > so future developers don't break things again with a mis-placed
> > allocation.
> 
> I think a problem here is that the nbd socket is still exposed to
> user-space, while in use by the block device. I fear that the more
> syzkaller will learn new tricks, the more we will have to had strange
> noio all around the networking code.
> 
> I *think* we could prevent this kind of races with something alike the
> following:
> - nbd sets a DOIO sk flag on the sockets it uses.
> - the socket layer prevents socketopts()/ioctl() entirely on DOIO sk
> 
> I'm not sure if that could break nbd users, but allowing the user-space
> to mess with the socket used for backing a block device looks very
> dangerous.

I didn't see the rest of the thread, but.

There are two ways of configuring the nbd device, through nbd-client:

- The "old" way has userspace do the initial negotiation, then calls
  NBD_DO_IT on the NBD device with the socket to tell the kernel to go
  ahead and use the socket
- The "new" way has userspace do the initial negotiation, then sends a
  netlink message with the socket to tell the kernel to go ahead and use
  the socket.

So, really, it's the other way around: userspace isn't playing with the
socket, the kernel is playing with a socket that originated in
userspace.

If userspace can be trusted to configure a socket for the kernel, then
surely that shouldn't be a problem? ;-)

(apologies if I didn't understand the problem correctly)

-- 
"I never had a C in history!"
"Yeah, but there was so much less of it when you were my age!"
 -- Joe Brockmeier recounting a conversation with his father, cfgmgmtcamp 2026, Ghent

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH net v1] net/ipv6: mcast: fix circular locking dependency in __ipv6_dev_mc_inc()
  2026-03-19 12:44       ` Paolo Abeni
  2026-03-19 15:36         ` Wouter Verhelst
@ 2026-03-23  6:54         ` Kuniyuki Iwashima
  1 sibling, 0 replies; 9+ messages in thread
From: Kuniyuki Iwashima @ 2026-03-23  6:54 UTC (permalink / raw)
  To: pabeni
  Cc: ap420073, davem, dsahern, edumazet, horms, jiayuan.chen,
	jiayuan.chen, josef, kuba, linux-kernel, nbd, netdev,
	syzbot+afbcf622635e98bf40d2

From: Paolo Abeni <pabeni@redhat.com>
Date: Thu, 19 Mar 2026 13:44:00 +0100
> Adding Josef.
> 
> On 3/19/26 4:26 AM, Jakub Kicinski wrote:
> > On Thu, 19 Mar 2026 11:04:24 +0800 Jiayuan Chen wrote:
> >>>> Split mca_alloc() into mca_alloc() + mca_init(): mca_alloc() does the
> >>>> GFP_KERNEL allocation before mc_lock, mca_init() initializes under
> >>>> mc_lock. If the address already exists, the pre-allocated memory is
> >>>> simply freed. Also move inet6_ifmcaddr_notify() outside mc_lock since
> >>>> it also does GFP_KERNEL allocation.  
> >>> Moving the allocation seems fine, but also having to move the
> >>> notification, potentially letting the notification go out of order
> >>> makes me wonder if we aren't better off adding helpers for taking this
> >>> lock which also call memalloc_noio_{save,restore} ?  
> >> Yeah, using memalloc_noio helpers is simpler. I checked and there
> >> are about 18 places taking mc_lock, so having a common mc_lock()/mc_unlock()
> >> wrapper that does the noio save/restore covers them all (if necessary).
> >>
> >> The only thing that feels a bit odd is using memalloc_noio in the networking
> >> subsystem. It makes sense in block/fs to protect itself from recursion.
> > 
> > Totally agree that it feels a bit odd that we have to worry about IO,
> > but unless we can figure out a way to prevent nbd sockets from getting
> > here all our solutions are dealing with noio in networking code :(
> > IMHO it's better to acknowledge this with the explicit memalloc_noio 
> > so future developers don't break things again with a mis-placed
> > allocation.
> 
> I think a problem here is that the nbd socket is still exposed to
> user-space, while in use by the block device.

Right, and this also prevents us from changing lockdep keys
(due to lock_sock_fast()).


> I fear that the more
> syzkaller will learn new tricks, the more we will have to had strange
> noio all around the networking code.

I agree.  We have 100+ unpublished reports for this variant, and
I started looking into them last week.


> 
> I *think* we could prevent this kind of races with something alike the
> following:
> - nbd sets a DOIO sk flag on the sockets it uses.
> - the socket layer prevents socketopts()/ioctl() entirely on DOIO sk

NBD sets sk->sk_allocation, so if we use it around, there should
be no real race, but still lockdep would not be happy.

The problem is sendmsg() and shutdown() invoked from NBD (recmsg()
is okay which is not under tx_lock), and my idea is to implemnt the
trylock variant for lock_sock() and use it for TCP. ( AF_UNIX does
not have the race because it does not use lock_sock() in sendmsg()
and shutdown() )

sendmsg() will be retried with -ERESTARTSYS where needed thanks to
was_interrupted().  While sendmsg() for disconnect and shutdown()
could be missed, but probably not a big deal (the peer will get
RST when it send something) ?

If this looks good, I'll send a formal patch.

---8<---
diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c
index fe63f3c55d0d..9003baf52be8 100644
--- a/drivers/block/nbd.c
+++ b/drivers/block/nbd.c
@@ -45,6 +45,8 @@
 #include <linux/nbd.h>
 #include <linux/nbd-netlink.h>
 #include <net/genetlink.h>
+#include <net/tcp.h>
+#include <net/inet_common.h>
 
 #define CREATE_TRACE_POINTS
 #include <trace/events/nbd.h>
@@ -302,6 +304,19 @@ static int nbd_disconnected(struct nbd_config *config)
 		test_bit(NBD_RT_DISCONNECT_REQUESTED, &config->runtime_flags);
 }
 
+static void nbd_sock_shutdown(struct sock *sk)
+{
+	if (sk_is_stream_unix(sk)) {
+		kernel_sock_shutdown(sk->sk_socket, SHUT_RDWR);
+		return;
+	}
+
+	if (lock_sock_try(sk)) {
+		inet_shutdown_locked(sk->sk_socket, SHUT_RDWR);
+		release_sock(sk);
+	}
+}
+
 static void nbd_mark_nsock_dead(struct nbd_device *nbd, struct nbd_sock *nsock,
 				int notify)
 {
@@ -315,7 +330,8 @@ static void nbd_mark_nsock_dead(struct nbd_device *nbd, struct nbd_sock *nsock,
 		}
 	}
 	if (!nsock->dead) {
-		kernel_sock_shutdown(nsock->sock, SHUT_RDWR);
+		nbd_sock_shutdown(nsock->sock->sk);
+
 		if (atomic_dec_return(&nbd->config->live_connections) == 0) {
 			if (test_and_clear_bit(NBD_RT_DISCONNECT_REQUESTED,
 					       &nbd->config->runtime_flags)) {
@@ -548,6 +564,22 @@ static enum blk_eh_timer_return nbd_xmit_timeout(struct request *req)
 	return BLK_EH_DONE;
 }
 
+static int nbd_sock_sendmsg(struct socket *sock, struct msghdr *msg)
+{
+	struct sock *sk = sock->sk;
+	int err = -ERESTARTSYS;
+
+	if (sk_is_stream_unix(sk))
+		return sock_sendmsg(sock, msg);
+
+	if (lock_sock_try(sk)) {
+		err = tcp_sendmsg_locked(sk, msg, msg_data_left(msg));
+		release_sock(sk);
+	}
+
+	return err;
+}
+
 static int __sock_xmit(struct nbd_device *nbd, struct socket *sock, int send,
 		       struct iov_iter *iter, int msg_flags, int *sent)
 {
@@ -573,7 +605,7 @@ static int __sock_xmit(struct nbd_device *nbd, struct socket *sock, int send,
 			msg.msg_flags = msg_flags | MSG_NOSIGNAL;
 
 			if (send)
-				result = sock_sendmsg(sock, &msg);
+				result = nbd_sock_sendmsg(sock, &msg);
 			else
 				result = sock_recvmsg(sock, &msg, msg.msg_flags);
 
@@ -1228,6 +1260,13 @@ static struct socket *nbd_get_socket(struct nbd_device *nbd, unsigned long fd,
 		return NULL;
 	}
 
+	if (READ_ONCE(sock->sk->sk_state) != TCP_ESTABLISHED) {
+		dev_err(disk_to_dev(nbd->disk), "Unsupported socket: not connected yet.\n");
+		*err = -ENOTCONN;
+		sockfd_put(sock);
+		return NULL;
+	}
+
 	if (sock->ops->shutdown == sock_no_shutdown) {
 		dev_err(disk_to_dev(nbd->disk), "Unsupported socket: shutdown callout must be supported.\n");
 		*err = -EINVAL;
diff --git a/include/net/inet_common.h b/include/net/inet_common.h
index 5dd2bf24449e..c085c39573c9 100644
--- a/include/net/inet_common.h
+++ b/include/net/inet_common.h
@@ -38,6 +38,7 @@ void inet_splice_eof(struct socket *sock);
 int inet_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
 		 int flags);
 int inet_shutdown(struct socket *sock, int how);
+int inet_shutdown_locked(struct socket *sock, int how);
 int inet_listen(struct socket *sock, int backlog);
 int __inet_listen_sk(struct sock *sk, int backlog);
 void inet_sock_destruct(struct sock *sk);
diff --git a/include/net/sock.h b/include/net/sock.h
index 6c9a83016e95..203a60661fce 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -1710,6 +1710,24 @@ static inline void lock_sock(struct sock *sk)
 }
 
 void __lock_sock(struct sock *sk);
+
+static inline bool lock_sock_try(struct sock *sk)
+{
+	if (!spin_trylock_bh(&sk->sk_lock.slock))
+		return false;
+
+	if (sk->sk_lock.owned) {
+		spin_unlock_bh(&sk->sk_lock.slock);
+		return false;
+	}
+
+	sk->sk_lock.owned = 1;
+	spin_unlock_bh(&sk->sk_lock.slock);
+
+	mutex_acquire(&sk->sk_lock.dep_map, 0, 1, _RET_IP_);
+	return true;
+}
+
 void __release_sock(struct sock *sk);
 void release_sock(struct sock *sk);
 
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index 8036e76aa1e4..bde8ddcc28f0 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -896,21 +896,11 @@ int inet_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
 }
 EXPORT_SYMBOL(inet_recvmsg);
 
-int inet_shutdown(struct socket *sock, int how)
+static int __inet_shutdown(struct socket *sock, int how)
 {
 	struct sock *sk = sock->sk;
 	int err = 0;
 
-	/* This should really check to make sure
-	 * the socket is a TCP socket. (WHY AC...)
-	 */
-	how++; /* maps 0->1 has the advantage of making bit 1 rcvs and
-		       1->2 bit 2 snds.
-		       2->3 */
-	if ((how & ~SHUTDOWN_MASK) || !how)	/* MAXINT->0 */
-		return -EINVAL;
-
-	lock_sock(sk);
 	if (sock->state == SS_CONNECTING) {
 		if ((1 << sk->sk_state) &
 		    (TCPF_SYN_SENT | TCPF_SYN_RECV | TCPF_CLOSE))
@@ -947,11 +937,45 @@ int inet_shutdown(struct socket *sock, int how)
 
 	/* Wake up anyone sleeping in poll. */
 	sk->sk_state_change(sk);
+
+	return err;
+}
+
+int inet_shutdown(struct socket *sock, int how)
+{
+	struct sock *sk = sock->sk;
+	int err;
+
+	/* maps 0->1 has the advantage of making bit 1 rcvs and
+	 *      1->2 bit 2 snds.
+	 *      2->3
+	 */
+	how++;
+
+	if ((how & ~SHUTDOWN_MASK) || !how)
+		return -EINVAL;
+
+	lock_sock(sk);
+	err = __inet_shutdown(sock, how);
 	release_sock(sk);
+
 	return err;
 }
 EXPORT_SYMBOL(inet_shutdown);
 
+int inet_shutdown_locked(struct socket *sock, int how)
+{
+	sock_owned_by_me(sock->sk);
+
+	how++;
+
+	if ((how & ~SHUTDOWN_MASK) || !how)
+		return -EINVAL;
+
+	return __inet_shutdown(sock, how);
+}
+EXPORT_SYMBOL_GPL(inet_shutdown_locked);
+
 /*
  *	ioctl() calls you can issue on an INET socket. Most of these are
  *	device configuration and stuff and very rarely used. Some ioctls
---8<---

^ permalink raw reply related	[flat|nested] 9+ messages in thread

end of thread, other threads:[~2026-03-23  6:54 UTC | newest]

Thread overview: 9+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2026-03-17 11:12 [PATCH net v1] net/ipv6: mcast: fix circular locking dependency in __ipv6_dev_mc_inc() Jiayuan Chen
2026-03-19  1:15 ` Jakub Kicinski
2026-03-19  3:04   ` Jiayuan Chen
2026-03-19  3:26     ` Jakub Kicinski
2026-03-19  4:12       ` Jiayuan Chen
2026-03-19 12:44       ` Paolo Abeni
2026-03-19 15:36         ` Wouter Verhelst
2026-03-23  6:54         ` Kuniyuki Iwashima
2026-03-19 12:33 ` [net,v1] " Paolo Abeni

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox