netdev.vger.kernel.org archive mirror
* [PATCH net v2] net: ip: order the reuseport socket in __inet_hash
@ 2025-08-01  9:09 Menglong Dong
  2025-08-01  9:46 ` Eric Dumazet
  2025-08-11  5:27 ` kernel test robot
  0 siblings, 2 replies; 9+ messages in thread
From: Menglong Dong @ 2025-08-01  9:09 UTC (permalink / raw)
  To: edumazet, kuniyu, kraig
  Cc: ncardwell, davem, dsahern, kuba, pabeni, horms, netdev,
	linux-kernel

Currently, the socket lookup in inet_lhash2_lookup() terminates as soon as it
finds a reuseport socket, so the returned socket may not be the best match.

For example, socket1 and socket2 both listen on "0.0.0.0:1234", but socket1
is also bound to "eth0". If socket1 is created first and socket2 second, all
connections go to socket2, which is not expected, as socket1 has higher
priority.

This can cause unexpected behavior when TCP MD5 keys are used, as described
in Documentation/networking/vrf.rst -> Applications.

Therefore, we compute a score for each reuseport socket and insert it into
the list in order in __inet_hash(). Sockets with a higher score are added
closer to the head.
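
For illustration, a minimal user-space sketch of the setup described above
(illustrative only: the device name, port and lack of error handling are
placeholders, and SO_BINDTODEVICE needs CAP_NET_RAW):

#include <arpa/inet.h>
#include <netinet/in.h>
#include <string.h>
#include <sys/socket.h>
#include <unistd.h>

/* Create a SO_REUSEPORT listener on 0.0.0.0:1234, optionally bound to a device. */
static int listener(const char *dev)
{
	struct sockaddr_in addr = {
		.sin_family = AF_INET,
		.sin_port = htons(1234),
		.sin_addr.s_addr = htonl(INADDR_ANY),
	};
	int one = 1;
	int fd = socket(AF_INET, SOCK_STREAM, 0);

	setsockopt(fd, SOL_SOCKET, SO_REUSEPORT, &one, sizeof(one));
	if (dev)
		setsockopt(fd, SOL_SOCKET, SO_BINDTODEVICE, dev, strlen(dev) + 1);
	bind(fd, (struct sockaddr *)&addr, sizeof(addr));
	listen(fd, 16);
	return fd;
}

int main(void)
{
	int sk1 = listener("eth0");	/* created first, bound to eth0 */
	int sk2 = listener(NULL);	/* created second, wildcard */

	/* Connect to port 1234 via eth0 from another host and check
	 * which listener accept()s the connection; sk1 is expected.
	 */
	pause();
	close(sk1);
	close(sk2);
	return 0;
}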

Link: https://lore.kernel.org/netdev/20250731123309.184496-1-dongml2@chinatelecom.cn/
Signed-off-by: Menglong Dong <dongml2@chinatelecom.cn>
---
v2:
- As Kuniyuki advised, sort the reuseport sockets in __inet_hash() to keep
  the reuseport lookup O(1)
---
 include/linux/rculist_nulls.h | 34 ++++++++++++++++++++++++
 include/net/sock.h            |  5 ++++
 net/ipv4/inet_hashtables.c    | 49 ++++++++++++++++++++++++++++++++---
 3 files changed, 84 insertions(+), 4 deletions(-)

diff --git a/include/linux/rculist_nulls.h b/include/linux/rculist_nulls.h
index 89186c499dd4..da500f4ae142 100644
--- a/include/linux/rculist_nulls.h
+++ b/include/linux/rculist_nulls.h
@@ -52,6 +52,13 @@ static inline void hlist_nulls_del_init_rcu(struct hlist_nulls_node *n)
 #define hlist_nulls_next_rcu(node) \
 	(*((struct hlist_nulls_node __rcu __force **)&(node)->next))
 
+/**
+ * hlist_nulls_pprev_rcu - returns the element of the list after @node.
+ * @node: element of the list.
+ */
+#define hlist_nulls_pprev_rcu(node) \
+	(*((struct hlist_nulls_node __rcu __force **)&(node)->pprev))
+
 /**
  * hlist_nulls_del_rcu - deletes entry from hash list without re-initialization
  * @n: the element to delete from the hash list.
@@ -145,6 +152,33 @@ static inline void hlist_nulls_add_tail_rcu(struct hlist_nulls_node *n,
 	}
 }
 
+/**
+ * hlist_nulls_add_before_rcu
+ * @n: the new element to add to the hash list.
+ * @next: the existing element to add the new element before.
+ *
+ * Description:
+ * Adds the specified element to the specified hlist
+ * before the specified node while permitting racing traversals.
+ *
+ * The caller must take whatever precautions are necessary
+ * (such as holding appropriate locks) to avoid racing
+ * with another list-mutation primitive, such as hlist_nulls_add_head_rcu()
+ * or hlist_nulls_del_rcu(), running on this same list.
+ * However, it is perfectly legal to run concurrently with
+ * the _rcu list-traversal primitives, such as
+ * hlist_nulls_for_each_entry_rcu(), used to prevent memory-consistency
+ * problems on Alpha CPUs.
+ */
+static inline void hlist_nulls_add_before_rcu(struct hlist_nulls_node *n,
+					      struct hlist_nulls_node *next)
+{
+	WRITE_ONCE(n->pprev, next->pprev);
+	n->next = next;
+	rcu_assign_pointer(hlist_nulls_pprev_rcu(n), n);
+	WRITE_ONCE(next->pprev, &n->next);
+}
+
 /* after that hlist_nulls_del will work */
 static inline void hlist_nulls_add_fake(struct hlist_nulls_node *n)
 {
diff --git a/include/net/sock.h b/include/net/sock.h
index c8a4b283df6f..42aa1919eeee 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -885,6 +885,11 @@ static inline void __sk_nulls_add_node_tail_rcu(struct sock *sk, struct hlist_nu
 	hlist_nulls_add_tail_rcu(&sk->sk_nulls_node, list);
 }
 
+static inline void __sk_nulls_add_node_before_rcu(struct sock *sk, struct sock *next)
+{
+	hlist_nulls_add_before_rcu(&sk->sk_nulls_node, &next->sk_nulls_node);
+}
+
 static inline void sk_nulls_add_node_rcu(struct sock *sk, struct hlist_nulls_head *list)
 {
 	sock_hold(sk);
diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c
index ceeeec9b7290..80d8bec41a58 100644
--- a/net/ipv4/inet_hashtables.c
+++ b/net/ipv4/inet_hashtables.c
@@ -334,6 +334,26 @@ static inline int compute_score(struct sock *sk, const struct net *net,
 	return score;
 }
 
+static inline int compute_reuseport_score(struct sock *sk)
+{
+	int score = 0;
+
+	if (sk->sk_bound_dev_if)
+		score += 2;
+
+	if (sk->sk_family == PF_INET)
+		score += 10;
+
+	/* the priority of sk_incoming_cpu should be lower than sk_bound_dev_if,
+	 * as it's optional in compute_score(). Thank God, this is the only
+	 * variable condition, which we can't judge now.
+	 */
+	if (READ_ONCE(sk->sk_incoming_cpu))
+		score++;
+
+	return score;
+}
+
 /**
  * inet_lookup_reuseport() - execute reuseport logic on AF_INET socket if necessary.
  * @net: network namespace.
@@ -739,6 +759,27 @@ static int inet_reuseport_add_sock(struct sock *sk,
 	return reuseport_alloc(sk, inet_rcv_saddr_any(sk));
 }
 
+static void inet_hash_reuseport(struct sock *sk, struct hlist_nulls_head *head)
+{
+	const struct hlist_nulls_node *node;
+	int score, curscore;
+	struct sock *sk2;
+
+	curscore = compute_reuseport_score(sk);
+	/* lookup the socket to insert before */
+	sk_nulls_for_each_rcu(sk2, node, head) {
+		if (!sk2->sk_reuseport)
+			continue;
+		score = compute_reuseport_score(sk2);
+		if (score <= curscore) {
+			__sk_nulls_add_node_before_rcu(sk, sk2);
+			return;
+		}
+	}
+
+	__sk_nulls_add_node_tail_rcu(sk, head);
+}
+
 int __inet_hash(struct sock *sk, struct sock *osk)
 {
 	struct inet_hashinfo *hashinfo = tcp_get_hashinfo(sk);
@@ -761,11 +802,11 @@ int __inet_hash(struct sock *sk, struct sock *osk)
 			goto unlock;
 	}
 	sock_set_flag(sk, SOCK_RCU_FREE);
-	if (IS_ENABLED(CONFIG_IPV6) && sk->sk_reuseport &&
-		sk->sk_family == AF_INET6)
-		__sk_nulls_add_node_tail_rcu(sk, &ilb2->nulls_head);
-	else
+	if (!sk->sk_reuseport)
 		__sk_nulls_add_node_rcu(sk, &ilb2->nulls_head);
+	else
+		inet_hash_reuseport(sk, &ilb2->nulls_head);
+
 	sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
 unlock:
 	spin_unlock(&ilb2->lock);
-- 
2.50.1



* Re: [PATCH net v2] net: ip: order the reuseport socket in __inet_hash
  2025-08-01  9:09 [PATCH net v2] net: ip: order the reuseport socket in __inet_hash Menglong Dong
@ 2025-08-01  9:46 ` Eric Dumazet
  2025-08-01 10:42   ` Menglong Dong
  2025-08-11  5:27 ` kernel test robot
  1 sibling, 1 reply; 9+ messages in thread
From: Eric Dumazet @ 2025-08-01  9:46 UTC (permalink / raw)
  To: Menglong Dong
  Cc: kuniyu, kraig, ncardwell, davem, dsahern, kuba, pabeni, horms,
	netdev, linux-kernel

On Fri, Aug 1, 2025 at 2:09 AM Menglong Dong <menglong8.dong@gmail.com> wrote:
>
> For now, the socket lookup will terminate if the socket is reuse port in
> inet_lhash2_lookup(), which makes the socket is not the best match.
>
> For example, we have socket1 and socket2 both listen on "0.0.0.0:1234",
> but socket1 bind on "eth0". We create socket1 first, and then socket2.
> Then, all connections will goto socket2, which is not expected, as socket1
> has higher priority.
>
> This can cause unexpected behavior if TCP MD5 keys is used, as described
> in Documentation/networking/vrf.rst -> Applications.
>
> Therefore, we compute a score for the reuseport socket and add it to the
> list with order in __inet_hash(). Sockets with high score will be added
> to the head.
>
> Link: https://lore.kernel.org/netdev/20250731123309.184496-1-dongml2@chinatelecom.cn/
> Signed-off-by: Menglong Dong <dongml2@chinatelecom.cn>

You forgot a Fixes: tag, and a selftest.

> ---
> v2:
> - As Kuniyuki advised, sort the reuseport socket in __inet_hash() to keep
>   the lookup for reuseport O(1)

Keeping the list sorted is difficult; we would have to intercept
SO_BINDTODEVICE, SO_BINDTOIFINDEX, and SO_INCOMING_CPU.

This also makes the patch risky to backport to stable versions,
because it is complex and possibly buggy.

Therefore I prefer your first approach.

> ---
>  include/linux/rculist_nulls.h | 34 ++++++++++++++++++++++++
>  include/net/sock.h            |  5 ++++
>  net/ipv4/inet_hashtables.c    | 49 ++++++++++++++++++++++++++++++++---
>  3 files changed, 84 insertions(+), 4 deletions(-)
>
> diff --git a/include/linux/rculist_nulls.h b/include/linux/rculist_nulls.h
> index 89186c499dd4..da500f4ae142 100644
> --- a/include/linux/rculist_nulls.h
> +++ b/include/linux/rculist_nulls.h
> @@ -52,6 +52,13 @@ static inline void hlist_nulls_del_init_rcu(struct hlist_nulls_node *n)
>  #define hlist_nulls_next_rcu(node) \
>         (*((struct hlist_nulls_node __rcu __force **)&(node)->next))
>
> +/**
> + * hlist_nulls_pprev_rcu - returns the element of the list after @node.
> + * @node: element of the list.
> + */
> +#define hlist_nulls_pprev_rcu(node) \
> +       (*((struct hlist_nulls_node __rcu __force **)&(node)->pprev))
> +
>  /**
>   * hlist_nulls_del_rcu - deletes entry from hash list without re-initialization
>   * @n: the element to delete from the hash list.
> @@ -145,6 +152,33 @@ static inline void hlist_nulls_add_tail_rcu(struct hlist_nulls_node *n,
>         }
>  }
>
> +/**
> + * hlist_nulls_add_before_rcu
> + * @n: the new element to add to the hash list.
> + * @next: the existing element to add the new element before.
> + *
> + * Description:
> + * Adds the specified element to the specified hlist
> + * before the specified node while permitting racing traversals.
> + *
> + * The caller must take whatever precautions are necessary
> + * (such as holding appropriate locks) to avoid racing
> + * with another list-mutation primitive, such as hlist_nulls_add_head_rcu()
> + * or hlist_nulls_del_rcu(), running on this same list.
> + * However, it is perfectly legal to run concurrently with
> + * the _rcu list-traversal primitives, such as
> + * hlist_nulls_for_each_entry_rcu(), used to prevent memory-consistency
> + * problems on Alpha CPUs.
> + */
> +static inline void hlist_nulls_add_before_rcu(struct hlist_nulls_node *n,
> +                                             struct hlist_nulls_node *next)
> +{
> +       WRITE_ONCE(n->pprev, next->pprev);
I do not think WRITE_ONCE() is necessary here: @n is private to this CPU,
and the following rcu_assign_pointer() provides the needed barrier.
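
For reference, a sketch of how the helper might look with that change
(illustrative only, not the posted patch):

static inline void hlist_nulls_add_before_rcu(struct hlist_nulls_node *n,
					      struct hlist_nulls_node *next)
{
	/* @n is not yet visible to other CPUs, so plain stores suffice;
	 * rcu_assign_pointer() below still orders the initialization of
	 * @n before it is published through *n->pprev.
	 */
	n->pprev = next->pprev;
	n->next = next;
	rcu_assign_pointer(hlist_nulls_pprev_rcu(n), n);
	WRITE_ONCE(next->pprev, &n->next);
}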

> +       n->next = next;
> +       rcu_assign_pointer(hlist_nulls_pprev_rcu(n), n);
> +       WRITE_ONCE(next->pprev, &n->next);
> +}
> +
>  /* after that hlist_nulls_del will work */
>  static inline void hlist_nulls_add_fake(struct hlist_nulls_node *n)
>  {
> diff --git a/include/net/sock.h b/include/net/sock.h
> index c8a4b283df6f..42aa1919eeee 100644
> --- a/include/net/sock.h
> +++ b/include/net/sock.h
> @@ -885,6 +885,11 @@ static inline void __sk_nulls_add_node_tail_rcu(struct sock *sk, struct hlist_nu
>         hlist_nulls_add_tail_rcu(&sk->sk_nulls_node, list);
>  }
>
> +static inline void __sk_nulls_add_node_before_rcu(struct sock *sk, struct sock *next)
> +{
> +       hlist_nulls_add_before_rcu(&sk->sk_nulls_node, &next->sk_nulls_node);
> +}
> +
>  static inline void sk_nulls_add_node_rcu(struct sock *sk, struct hlist_nulls_head *list)
>  {
>         sock_hold(sk);
> diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c
> index ceeeec9b7290..80d8bec41a58 100644
> --- a/net/ipv4/inet_hashtables.c
> +++ b/net/ipv4/inet_hashtables.c
> @@ -334,6 +334,26 @@ static inline int compute_score(struct sock *sk, const struct net *net,
>         return score;
>  }
>
> +static inline int compute_reuseport_score(struct sock *sk)
> +{
> +       int score = 0;
> +
> +       if (sk->sk_bound_dev_if)
> +               score += 2;
> +
> +       if (sk->sk_family == PF_INET)
> +               score += 10;
> +
> +       /* the priority of sk_incoming_cpu should be lower than sk_bound_dev_if,
> +        * as it's optional in compute_score(). Thank God, this is the only

Please do not bring God here.

> +        * variable condition, which we can't judge now.
> +        */
> +       if (READ_ONCE(sk->sk_incoming_cpu))
> +               score++;
> +
> +       return score;
> +}
> +
>  /**
>   * inet_lookup_reuseport() - execute reuseport logic on AF_INET socket if necessary.
>   * @net: network namespace.
> @@ -739,6 +759,27 @@ static int inet_reuseport_add_sock(struct sock *sk,
>         return reuseport_alloc(sk, inet_rcv_saddr_any(sk));
>  }
>
> +static void inet_hash_reuseport(struct sock *sk, struct hlist_nulls_head *head)
> +{
> +       const struct hlist_nulls_node *node;
> +       int score, curscore;
> +       struct sock *sk2;
> +
> +       curscore = compute_reuseport_score(sk);
> +       /* lookup the socket to insert before */
> +       sk_nulls_for_each_rcu(sk2, node, head) {
> +               if (!sk2->sk_reuseport)
> +                       continue;
> +               score = compute_reuseport_score(sk2);
> +               if (score <= curscore) {
> +                       __sk_nulls_add_node_before_rcu(sk, sk2);
> +                       return;
> +               }
> +       }
> +
> +       __sk_nulls_add_node_tail_rcu(sk, head);
> +}
> +
>  int __inet_hash(struct sock *sk, struct sock *osk)
>  {
>         struct inet_hashinfo *hashinfo = tcp_get_hashinfo(sk);
> @@ -761,11 +802,11 @@ int __inet_hash(struct sock *sk, struct sock *osk)
>                         goto unlock;
>         }
>         sock_set_flag(sk, SOCK_RCU_FREE);
> -       if (IS_ENABLED(CONFIG_IPV6) && sk->sk_reuseport &&
> -               sk->sk_family == AF_INET6)
> -               __sk_nulls_add_node_tail_rcu(sk, &ilb2->nulls_head);
> -       else
> +       if (!sk->sk_reuseport)
>                 __sk_nulls_add_node_rcu(sk, &ilb2->nulls_head);
> +       else
> +               inet_hash_reuseport(sk, &ilb2->nulls_head);
> +
>         sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
>  unlock:
>         spin_unlock(&ilb2->lock);
> --
> 2.50.1
>


* Re: [PATCH net v2] net: ip: order the reuseport socket in __inet_hash
  2025-08-01  9:46 ` Eric Dumazet
@ 2025-08-01 10:42   ` Menglong Dong
  2025-08-01 16:46     ` Kuniyuki Iwashima
  0 siblings, 1 reply; 9+ messages in thread
From: Menglong Dong @ 2025-08-01 10:42 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: kuniyu, kraig, ncardwell, davem, dsahern, kuba, pabeni, horms,
	netdev, linux-kernel

On Fri, Aug 1, 2025 at 5:46 PM Eric Dumazet <edumazet@google.com> wrote:
>
> On Fri, Aug 1, 2025 at 2:09 AM Menglong Dong <menglong8.dong@gmail.com> wrote:
> >
> > For now, the socket lookup will terminate if the socket is reuse port in
> > inet_lhash2_lookup(), which makes the socket is not the best match.
> >
> > For example, we have socket1 and socket2 both listen on "0.0.0.0:1234",
> > but socket1 bind on "eth0". We create socket1 first, and then socket2.
> > Then, all connections will goto socket2, which is not expected, as socket1
> > has higher priority.
> >
> > This can cause unexpected behavior if TCP MD5 keys is used, as described
> > in Documentation/networking/vrf.rst -> Applications.
> >
> > Therefore, we compute a score for the reuseport socket and add it to the
> > list with order in __inet_hash(). Sockets with high score will be added
> > to the head.
> >
> > Link: https://lore.kernel.org/netdev/20250731123309.184496-1-dongml2@chinatelecom.cn/
> > Signed-off-by: Menglong Dong <dongml2@chinatelecom.cn>
>
> You forgot a Fixes: tag, and a selftest.

I was not sure whether it should have a Fixes tag; I'll add it in the next
version. Kuniyuki's test case is nice. Should I put the selftest in the
commit log?

>
> > ---
> > v2:
> > - As Kuniyuki advised, sort the reuseport socket in __inet_hash() to keep
> >   the lookup for reuseport O(1)
>
> Keeping sorted the list is difficult, we would have to intercept
> SO_BINDTODEVICE, SO_BINDTOIFINDEX, SO_INCOMING_CPU.
>
> This also makes the patch risky to backport to stable versions,
> because it is complex and possibly buggy.
>
> Therefore I prefer your first approach.

Kuniyuki also has a similar patch:
https://lore.kernel.org/netdev/CADxym3ZY7Lm9mgv83e2db7o3ZZMcLDa=vDf6nJSs1m0_tUk5Bg@mail.gmail.com/T/#m56ee67b2fdf85ce568fd1339def92c53232d5b49

Would his be better and more stable? Kuniyuki says the first approach
kills the O(1) lookup for reuseport sockets :/

Anyway, I'll send a v3 with the first approach, along with the Fixes tag
and selftests.

Thanks!
Menglong Dong

>
> > ---
> >  include/linux/rculist_nulls.h | 34 ++++++++++++++++++++++++
> >  include/net/sock.h            |  5 ++++
> >  net/ipv4/inet_hashtables.c    | 49 ++++++++++++++++++++++++++++++++---
> >  3 files changed, 84 insertions(+), 4 deletions(-)
> >
> > diff --git a/include/linux/rculist_nulls.h b/include/linux/rculist_nulls.h
> > index 89186c499dd4..da500f4ae142 100644
> > --- a/include/linux/rculist_nulls.h
> > +++ b/include/linux/rculist_nulls.h
> > @@ -52,6 +52,13 @@ static inline void hlist_nulls_del_init_rcu(struct hlist_nulls_node *n)
> >  #define hlist_nulls_next_rcu(node) \
> >         (*((struct hlist_nulls_node __rcu __force **)&(node)->next))
> >
> > +/**
> > + * hlist_nulls_pprev_rcu - returns the element of the list after @node.
> > + * @node: element of the list.
> > + */
> > +#define hlist_nulls_pprev_rcu(node) \
> > +       (*((struct hlist_nulls_node __rcu __force **)&(node)->pprev))
> > +
> >  /**
> >   * hlist_nulls_del_rcu - deletes entry from hash list without re-initialization
> >   * @n: the element to delete from the hash list.
> > @@ -145,6 +152,33 @@ static inline void hlist_nulls_add_tail_rcu(struct hlist_nulls_node *n,
> >         }
> >  }
> >
> > +/**
> > + * hlist_nulls_add_before_rcu
> > + * @n: the new element to add to the hash list.
> > + * @next: the existing element to add the new element before.
> > + *
> > + * Description:
> > + * Adds the specified element to the specified hlist
> > + * before the specified node while permitting racing traversals.
> > + *
> > + * The caller must take whatever precautions are necessary
> > + * (such as holding appropriate locks) to avoid racing
> > + * with another list-mutation primitive, such as hlist_nulls_add_head_rcu()
> > + * or hlist_nulls_del_rcu(), running on this same list.
> > + * However, it is perfectly legal to run concurrently with
> > + * the _rcu list-traversal primitives, such as
> > + * hlist_nulls_for_each_entry_rcu(), used to prevent memory-consistency
> > + * problems on Alpha CPUs.
> > + */
> > +static inline void hlist_nulls_add_before_rcu(struct hlist_nulls_node *n,
> > +                                             struct hlist_nulls_node *next)
> > +{
> > +       WRITE_ONCE(n->pprev, next->pprev);
> I do not think WRITE_ONCE() is necessary here, @n is private to this cpu,
> and following rcu_assign_pointer() has the needed barrier.
>
> > +       n->next = next;
> > +       rcu_assign_pointer(hlist_nulls_pprev_rcu(n), n);
> > +       WRITE_ONCE(next->pprev, &n->next);
> > +}
> > +
> >  /* after that hlist_nulls_del will work */
> >  static inline void hlist_nulls_add_fake(struct hlist_nulls_node *n)
> >  {
> > diff --git a/include/net/sock.h b/include/net/sock.h
> > index c8a4b283df6f..42aa1919eeee 100644
> > --- a/include/net/sock.h
> > +++ b/include/net/sock.h
> > @@ -885,6 +885,11 @@ static inline void __sk_nulls_add_node_tail_rcu(struct sock *sk, struct hlist_nu
> >         hlist_nulls_add_tail_rcu(&sk->sk_nulls_node, list);
> >  }
> >
> > +static inline void __sk_nulls_add_node_before_rcu(struct sock *sk, struct sock *next)
> > +{
> > +       hlist_nulls_add_before_rcu(&sk->sk_nulls_node, &next->sk_nulls_node);
> > +}
> > +
> >  static inline void sk_nulls_add_node_rcu(struct sock *sk, struct hlist_nulls_head *list)
> >  {
> >         sock_hold(sk);
> > diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c
> > index ceeeec9b7290..80d8bec41a58 100644
> > --- a/net/ipv4/inet_hashtables.c
> > +++ b/net/ipv4/inet_hashtables.c
> > @@ -334,6 +334,26 @@ static inline int compute_score(struct sock *sk, const struct net *net,
> >         return score;
> >  }
> >
> > +static inline int compute_reuseport_score(struct sock *sk)
> > +{
> > +       int score = 0;
> > +
> > +       if (sk->sk_bound_dev_if)
> > +               score += 2;
> > +
> > +       if (sk->sk_family == PF_INET)
> > +               score += 10;
> > +
> > +       /* the priority of sk_incoming_cpu should be lower than sk_bound_dev_if,
> > +        * as it's optional in compute_score(). Thank God, this is the only
>
> Please do not bring God here.
>
> > +        * variable condition, which we can't judge now.
> > +        */
> > +       if (READ_ONCE(sk->sk_incoming_cpu))
> > +               score++;
> > +
> > +       return score;
> > +}
> > +
> >  /**
> >   * inet_lookup_reuseport() - execute reuseport logic on AF_INET socket if necessary.
> >   * @net: network namespace.
> > @@ -739,6 +759,27 @@ static int inet_reuseport_add_sock(struct sock *sk,
> >         return reuseport_alloc(sk, inet_rcv_saddr_any(sk));
> >  }
> >
> > +static void inet_hash_reuseport(struct sock *sk, struct hlist_nulls_head *head)
> > +{
> > +       const struct hlist_nulls_node *node;
> > +       int score, curscore;
> > +       struct sock *sk2;
> > +
> > +       curscore = compute_reuseport_score(sk);
> > +       /* lookup the socket to insert before */
> > +       sk_nulls_for_each_rcu(sk2, node, head) {
> > +               if (!sk2->sk_reuseport)
> > +                       continue;
> > +               score = compute_reuseport_score(sk2);
> > +               if (score <= curscore) {
> > +                       __sk_nulls_add_node_before_rcu(sk, sk2);
> > +                       return;
> > +               }
> > +       }
> > +
> > +       __sk_nulls_add_node_tail_rcu(sk, head);
> > +}
> > +
> >  int __inet_hash(struct sock *sk, struct sock *osk)
> >  {
> >         struct inet_hashinfo *hashinfo = tcp_get_hashinfo(sk);
> > @@ -761,11 +802,11 @@ int __inet_hash(struct sock *sk, struct sock *osk)
> >                         goto unlock;
> >         }
> >         sock_set_flag(sk, SOCK_RCU_FREE);
> > -       if (IS_ENABLED(CONFIG_IPV6) && sk->sk_reuseport &&
> > -               sk->sk_family == AF_INET6)
> > -               __sk_nulls_add_node_tail_rcu(sk, &ilb2->nulls_head);
> > -       else
> > +       if (!sk->sk_reuseport)
> >                 __sk_nulls_add_node_rcu(sk, &ilb2->nulls_head);
> > +       else
> > +               inet_hash_reuseport(sk, &ilb2->nulls_head);
> > +
> >         sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
> >  unlock:
> >         spin_unlock(&ilb2->lock);
> > --
> > 2.50.1
> >


* Re: [PATCH net v2] net: ip: order the reuseport socket in __inet_hash
  2025-08-01 10:42   ` Menglong Dong
@ 2025-08-01 16:46     ` Kuniyuki Iwashima
  2025-08-02  0:59       ` Menglong Dong
  0 siblings, 1 reply; 9+ messages in thread
From: Kuniyuki Iwashima @ 2025-08-01 16:46 UTC (permalink / raw)
  To: Menglong Dong
  Cc: Eric Dumazet, kraig, ncardwell, davem, dsahern, kuba, pabeni,
	horms, netdev, linux-kernel

On Fri, Aug 1, 2025 at 3:42 AM Menglong Dong <menglong8.dong@gmail.com> wrote:
>
> On Fri, Aug 1, 2025 at 5:46 PM Eric Dumazet <edumazet@google.com> wrote:
> >
> > On Fri, Aug 1, 2025 at 2:09 AM Menglong Dong <menglong8.dong@gmail.com> wrote:
> > >
> > > For now, the socket lookup will terminate if the socket is reuse port in
> > > inet_lhash2_lookup(), which makes the socket is not the best match.
> > >
> > > For example, we have socket1 and socket2 both listen on "0.0.0.0:1234",
> > > but socket1 bind on "eth0". We create socket1 first, and then socket2.
> > > Then, all connections will goto socket2, which is not expected, as socket1
> > > has higher priority.
> > >
> > > This can cause unexpected behavior if TCP MD5 keys is used, as described
> > > in Documentation/networking/vrf.rst -> Applications.
> > >
> > > Therefore, we compute a score for the reuseport socket and add it to the
> > > list with order in __inet_hash(). Sockets with high score will be added
> > > to the head.
> > >
> > > Link: https://lore.kernel.org/netdev/20250731123309.184496-1-dongml2@chinatelecom.cn/
> > > Signed-off-by: Menglong Dong <dongml2@chinatelecom.cn>
> >
> > You forgot a Fixes: tag, and a selftest.
>
> I was not sure if it should be a Fixes, I'll add it in the next version.
> Kuniyuki's test case is nice. Should I put the selftests in the
> commit log?

The python example is handy and makes the issue easy to understand,
so feel free to add it to the commit log if needed.

But please add a separate patch with a test under
tools/testing/selftests/net/.

It will help us avoid introducing regressions in the future, as the
tests are run for each patch by NIPA CI.


>
> >
> > > ---
> > > v2:
> > > - As Kuniyuki advised, sort the reuseport socket in __inet_hash() to keep
> > >   the lookup for reuseport O(1)
> >
> > Keeping sorted the list is difficult, we would have to intercept
> > SO_BINDTODEVICE, SO_BINDTOIFINDEX, SO_INCOMING_CPU.
> >
> > This also makes the patch risky to backport to stable versions,
> > because it is complex and possibly buggy.
> >
> > Therefore I prefer your first approach.
>
> Kuniyuki also has a similar patch:
> https://lore.kernel.org/netdev/CADxym3ZY7Lm9mgv83e2db7o3ZZMcLDa=vDf6nJSs1m0_tUk5Bg@mail.gmail.com/T/#m56ee67b2fdf85ce568fd1339def92c53232d5b49
>
> Will his be better and stable? Kuniyuki say the first approach
> kill the O(1) lookup for reuseport socket :/

At least your compute_reuseport_score() is wrong: so_incoming_cpu is
not considered when grouping reuseport sockets, it does not take
wildcard addresses and ipv6_only into account, etc.

And I agree this is net-next material, a bit risky to backport.

Once net-next is open, I'll follow up to restore the O(1)
lookup with a few more patches to handle the corner cases
that I mentioned in the v1 thread.

>
> Anyway, I'll send a V3 with the first approach, and with
> the Fixes + selftests

nit: The subject prefix should start with "tcp:" as UDP
and SCTP do not seem to have this issue.


>
> Thanks!
> Menglong Dong
>
> >
> > > ---
> > >  include/linux/rculist_nulls.h | 34 ++++++++++++++++++++++++
> > >  include/net/sock.h            |  5 ++++
> > >  net/ipv4/inet_hashtables.c    | 49 ++++++++++++++++++++++++++++++++---
> > >  3 files changed, 84 insertions(+), 4 deletions(-)
> > >
> > > diff --git a/include/linux/rculist_nulls.h b/include/linux/rculist_nulls.h
> > > index 89186c499dd4..da500f4ae142 100644
> > > --- a/include/linux/rculist_nulls.h
> > > +++ b/include/linux/rculist_nulls.h
> > > @@ -52,6 +52,13 @@ static inline void hlist_nulls_del_init_rcu(struct hlist_nulls_node *n)
> > >  #define hlist_nulls_next_rcu(node) \
> > >         (*((struct hlist_nulls_node __rcu __force **)&(node)->next))
> > >
> > > +/**
> > > + * hlist_nulls_pprev_rcu - returns the element of the list after @node.
> > > + * @node: element of the list.
> > > + */
> > > +#define hlist_nulls_pprev_rcu(node) \
> > > +       (*((struct hlist_nulls_node __rcu __force **)&(node)->pprev))
> > > +
> > >  /**
> > >   * hlist_nulls_del_rcu - deletes entry from hash list without re-initialization
> > >   * @n: the element to delete from the hash list.
> > > @@ -145,6 +152,33 @@ static inline void hlist_nulls_add_tail_rcu(struct hlist_nulls_node *n,
> > >         }
> > >  }
> > >
> > > +/**
> > > + * hlist_nulls_add_before_rcu
> > > + * @n: the new element to add to the hash list.
> > > + * @next: the existing element to add the new element before.
> > > + *
> > > + * Description:
> > > + * Adds the specified element to the specified hlist
> > > + * before the specified node while permitting racing traversals.
> > > + *
> > > + * The caller must take whatever precautions are necessary
> > > + * (such as holding appropriate locks) to avoid racing
> > > + * with another list-mutation primitive, such as hlist_nulls_add_head_rcu()
> > > + * or hlist_nulls_del_rcu(), running on this same list.
> > > + * However, it is perfectly legal to run concurrently with
> > > + * the _rcu list-traversal primitives, such as
> > > + * hlist_nulls_for_each_entry_rcu(), used to prevent memory-consistency
> > > + * problems on Alpha CPUs.
> > > + */
> > > +static inline void hlist_nulls_add_before_rcu(struct hlist_nulls_node *n,
> > > +                                             struct hlist_nulls_node *next)
> > > +{
> > > +       WRITE_ONCE(n->pprev, next->pprev);
> > I do not think WRITE_ONCE() is necessary here, @n is private to this cpu,
> > and following rcu_assign_pointer() has the needed barrier.
> >
> > > +       n->next = next;
> > > +       rcu_assign_pointer(hlist_nulls_pprev_rcu(n), n);
> > > +       WRITE_ONCE(next->pprev, &n->next);
> > > +}
> > > +
> > >  /* after that hlist_nulls_del will work */
> > >  static inline void hlist_nulls_add_fake(struct hlist_nulls_node *n)
> > >  {
> > > diff --git a/include/net/sock.h b/include/net/sock.h
> > > index c8a4b283df6f..42aa1919eeee 100644
> > > --- a/include/net/sock.h
> > > +++ b/include/net/sock.h
> > > @@ -885,6 +885,11 @@ static inline void __sk_nulls_add_node_tail_rcu(struct sock *sk, struct hlist_nu
> > >         hlist_nulls_add_tail_rcu(&sk->sk_nulls_node, list);
> > >  }
> > >
> > > +static inline void __sk_nulls_add_node_before_rcu(struct sock *sk, struct sock *next)
> > > +{
> > > +       hlist_nulls_add_before_rcu(&sk->sk_nulls_node, &next->sk_nulls_node);
> > > +}
> > > +
> > >  static inline void sk_nulls_add_node_rcu(struct sock *sk, struct hlist_nulls_head *list)
> > >  {
> > >         sock_hold(sk);
> > > diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c
> > > index ceeeec9b7290..80d8bec41a58 100644
> > > --- a/net/ipv4/inet_hashtables.c
> > > +++ b/net/ipv4/inet_hashtables.c
> > > @@ -334,6 +334,26 @@ static inline int compute_score(struct sock *sk, const struct net *net,
> > >         return score;
> > >  }
> > >
> > > +static inline int compute_reuseport_score(struct sock *sk)
> > > +{
> > > +       int score = 0;
> > > +
> > > +       if (sk->sk_bound_dev_if)
> > > +               score += 2;
> > > +
> > > +       if (sk->sk_family == PF_INET)
> > > +               score += 10;
> > > +
> > > +       /* the priority of sk_incoming_cpu should be lower than sk_bound_dev_if,
> > > +        * as it's optional in compute_score(). Thank God, this is the only
> >
> > Please do not bring God here.
> >
> > > +        * variable condition, which we can't judge now.
> > > +        */
> > > +       if (READ_ONCE(sk->sk_incoming_cpu))
> > > +               score++;
> > > +
> > > +       return score;
> > > +}
> > > +
> > >  /**
> > >   * inet_lookup_reuseport() - execute reuseport logic on AF_INET socket if necessary.
> > >   * @net: network namespace.
> > > @@ -739,6 +759,27 @@ static int inet_reuseport_add_sock(struct sock *sk,
> > >         return reuseport_alloc(sk, inet_rcv_saddr_any(sk));
> > >  }
> > >
> > > +static void inet_hash_reuseport(struct sock *sk, struct hlist_nulls_head *head)
> > > +{
> > > +       const struct hlist_nulls_node *node;
> > > +       int score, curscore;
> > > +       struct sock *sk2;
> > > +
> > > +       curscore = compute_reuseport_score(sk);
> > > +       /* lookup the socket to insert before */
> > > +       sk_nulls_for_each_rcu(sk2, node, head) {
> > > +               if (!sk2->sk_reuseport)
> > > +                       continue;
> > > +               score = compute_reuseport_score(sk2);
> > > +               if (score <= curscore) {
> > > +                       __sk_nulls_add_node_before_rcu(sk, sk2);
> > > +                       return;
> > > +               }
> > > +       }
> > > +
> > > +       __sk_nulls_add_node_tail_rcu(sk, head);
> > > +}
> > > +
> > >  int __inet_hash(struct sock *sk, struct sock *osk)
> > >  {
> > >         struct inet_hashinfo *hashinfo = tcp_get_hashinfo(sk);
> > > @@ -761,11 +802,11 @@ int __inet_hash(struct sock *sk, struct sock *osk)
> > >                         goto unlock;
> > >         }
> > >         sock_set_flag(sk, SOCK_RCU_FREE);
> > > -       if (IS_ENABLED(CONFIG_IPV6) && sk->sk_reuseport &&
> > > -               sk->sk_family == AF_INET6)
> > > -               __sk_nulls_add_node_tail_rcu(sk, &ilb2->nulls_head);
> > > -       else
> > > +       if (!sk->sk_reuseport)
> > >                 __sk_nulls_add_node_rcu(sk, &ilb2->nulls_head);
> > > +       else
> > > +               inet_hash_reuseport(sk, &ilb2->nulls_head);
> > > +
> > >         sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
> > >  unlock:
> > >         spin_unlock(&ilb2->lock);
> > > --
> > > 2.50.1
> > >


* Re: [PATCH net v2] net: ip: order the reuseport socket in __inet_hash
  2025-08-01 16:46     ` Kuniyuki Iwashima
@ 2025-08-02  0:59       ` Menglong Dong
  0 siblings, 0 replies; 9+ messages in thread
From: Menglong Dong @ 2025-08-02  0:59 UTC (permalink / raw)
  To: Kuniyuki Iwashima
  Cc: Eric Dumazet, kraig, ncardwell, davem, dsahern, kuba, pabeni,
	horms, netdev, linux-kernel

On Sat, Aug 2, 2025 at 12:46 AM Kuniyuki Iwashima <kuniyu@google.com> wrote:
>
> On Fri, Aug 1, 2025 at 3:42 AM Menglong Dong <menglong8.dong@gmail.com> wrote:
> >
> > On Fri, Aug 1, 2025 at 5:46 PM Eric Dumazet <edumazet@google.com> wrote:
> > >
> > > On Fri, Aug 1, 2025 at 2:09 AM Menglong Dong <menglong8.dong@gmail.com> wrote:
> > > >
> > > > For now, the socket lookup will terminate if the socket is reuse port in
> > > > inet_lhash2_lookup(), which makes the socket is not the best match.
> > > >
> > > > For example, we have socket1 and socket2 both listen on "0.0.0.0:1234",
> > > > but socket1 bind on "eth0". We create socket1 first, and then socket2.
> > > > Then, all connections will goto socket2, which is not expected, as socket1
> > > > has higher priority.
> > > >
> > > > This can cause unexpected behavior if TCP MD5 keys is used, as described
> > > > in Documentation/networking/vrf.rst -> Applications.
> > > >
> > > > Therefore, we compute a score for the reuseport socket and add it to the
> > > > list with order in __inet_hash(). Sockets with high score will be added
> > > > to the head.
> > > >
> > > > Link: https://lore.kernel.org/netdev/20250731123309.184496-1-dongml2@chinatelecom.cn/
> > > > Signed-off-by: Menglong Dong <dongml2@chinatelecom.cn>
> > >
> > > You forgot a Fixes: tag, and a selftest.
> >
> > I was not sure if it should be a Fixes, I'll add it in the next version.
> > Kuniyuki's test case is nice. Should I put the selftests in the
> > commit log?
>
> The python example is handy and easy to understand the
> issue, so feel free to add it to the commit log if needed.
>
> But please add a separate patch to add a test under
> tools/testing/selftest/net/.
>
> It will help us not introduce regression in the future as it's
> run for each patch by NIPA CI.

OK! I'll add a test case under tools/testing/selftests/net/.

>
>
> >
> > >
> > > > ---
> > > > v2:
> > > > - As Kuniyuki advised, sort the reuseport socket in __inet_hash() to keep
> > > >   the lookup for reuseport O(1)
> > >
> > > Keeping sorted the list is difficult, we would have to intercept
> > > SO_BINDTODEVICE, SO_BINDTOIFINDEX, SO_INCOMING_CPU.
> > >
> > > This also makes the patch risky to backport to stable versions,
> > > because it is complex and possibly buggy.
> > >
> > > Therefore I prefer your first approach.
> >
> > Kuniyuki also has a similar patch:
> > https://lore.kernel.org/netdev/CADxym3ZY7Lm9mgv83e2db7o3ZZMcLDa=vDf6nJSs1m0_tUk5Bg@mail.gmail.com/T/#m56ee67b2fdf85ce568fd1339def92c53232d5b49
> >
> > Will his be better and stable? Kuniyuki say the first approach
> > kill the O(1) lookup for reuseport socket :/
>
> At least your compute_reuseport_score() is wrong;
> so_incoming_cpu is not considered to group reuseport
> sockets, it does not take wildcard and ipv6_only into
> account, etc..

Should ipv6_only be considered? If socketA is ipv6_only and socketB is
not, should an IPv6 connection select socketA? I don't see such logic
in compute_score() :/

>
> And I agree this is net-next material, a bit risky to backport.
>
> Once net-next is open, I'll follow up to restore the O(1)
> lookup with a few more patches to handle corner cases
> that I mentioned in v1 thread.
>
> >
> > Anyway, I'll send a V3 with the first approach, and with
> > the Fixes + selftests
>
> nit: The subject prefix should start with "tcp:" as UDP
> and SCTP do not seem to have this issue.

Ok!

>
>
> >
> > Thanks!
> > Menglong Dong
> >
> > >
> > > > ---
> > > >  include/linux/rculist_nulls.h | 34 ++++++++++++++++++++++++
> > > >  include/net/sock.h            |  5 ++++
> > > >  net/ipv4/inet_hashtables.c    | 49 ++++++++++++++++++++++++++++++++---
> > > >  3 files changed, 84 insertions(+), 4 deletions(-)
> > > >
> > > > diff --git a/include/linux/rculist_nulls.h b/include/linux/rculist_nulls.h
> > > > index 89186c499dd4..da500f4ae142 100644
> > > > --- a/include/linux/rculist_nulls.h
> > > > +++ b/include/linux/rculist_nulls.h
> > > > @@ -52,6 +52,13 @@ static inline void hlist_nulls_del_init_rcu(struct hlist_nulls_node *n)
> > > >  #define hlist_nulls_next_rcu(node) \
> > > >         (*((struct hlist_nulls_node __rcu __force **)&(node)->next))
> > > >
> > > > +/**
> > > > + * hlist_nulls_pprev_rcu - returns the element of the list after @node.
> > > > + * @node: element of the list.
> > > > + */
> > > > +#define hlist_nulls_pprev_rcu(node) \
> > > > +       (*((struct hlist_nulls_node __rcu __force **)&(node)->pprev))
> > > > +
> > > >  /**
> > > >   * hlist_nulls_del_rcu - deletes entry from hash list without re-initialization
> > > >   * @n: the element to delete from the hash list.
> > > > @@ -145,6 +152,33 @@ static inline void hlist_nulls_add_tail_rcu(struct hlist_nulls_node *n,
> > > >         }
> > > >  }
> > > >
> > > > +/**
> > > > + * hlist_nulls_add_before_rcu
> > > > + * @n: the new element to add to the hash list.
> > > > + * @next: the existing element to add the new element before.
> > > > + *
> > > > + * Description:
> > > > + * Adds the specified element to the specified hlist
> > > > + * before the specified node while permitting racing traversals.
> > > > + *
> > > > + * The caller must take whatever precautions are necessary
> > > > + * (such as holding appropriate locks) to avoid racing
> > > > + * with another list-mutation primitive, such as hlist_nulls_add_head_rcu()
> > > > + * or hlist_nulls_del_rcu(), running on this same list.
> > > > + * However, it is perfectly legal to run concurrently with
> > > > + * the _rcu list-traversal primitives, such as
> > > > + * hlist_nulls_for_each_entry_rcu(), used to prevent memory-consistency
> > > > + * problems on Alpha CPUs.
> > > > + */
> > > > +static inline void hlist_nulls_add_before_rcu(struct hlist_nulls_node *n,
> > > > +                                             struct hlist_nulls_node *next)
> > > > +{
> > > > +       WRITE_ONCE(n->pprev, next->pprev);
> > > I do not think WRITE_ONCE() is necessary here, @n is private to this cpu,
> > > and following rcu_assign_pointer() has the needed barrier.
> > >
> > > > +       n->next = next;
> > > > +       rcu_assign_pointer(hlist_nulls_pprev_rcu(n), n);
> > > > +       WRITE_ONCE(next->pprev, &n->next);
> > > > +}
> > > > +
> > > >  /* after that hlist_nulls_del will work */
> > > >  static inline void hlist_nulls_add_fake(struct hlist_nulls_node *n)
> > > >  {
> > > > diff --git a/include/net/sock.h b/include/net/sock.h
> > > > index c8a4b283df6f..42aa1919eeee 100644
> > > > --- a/include/net/sock.h
> > > > +++ b/include/net/sock.h
> > > > @@ -885,6 +885,11 @@ static inline void __sk_nulls_add_node_tail_rcu(struct sock *sk, struct hlist_nu
> > > >         hlist_nulls_add_tail_rcu(&sk->sk_nulls_node, list);
> > > >  }
> > > >
> > > > +static inline void __sk_nulls_add_node_before_rcu(struct sock *sk, struct sock *next)
> > > > +{
> > > > +       hlist_nulls_add_before_rcu(&sk->sk_nulls_node, &next->sk_nulls_node);
> > > > +}
> > > > +
> > > >  static inline void sk_nulls_add_node_rcu(struct sock *sk, struct hlist_nulls_head *list)
> > > >  {
> > > >         sock_hold(sk);
> > > > diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c
> > > > index ceeeec9b7290..80d8bec41a58 100644
> > > > --- a/net/ipv4/inet_hashtables.c
> > > > +++ b/net/ipv4/inet_hashtables.c
> > > > @@ -334,6 +334,26 @@ static inline int compute_score(struct sock *sk, const struct net *net,
> > > >         return score;
> > > >  }
> > > >
> > > > +static inline int compute_reuseport_score(struct sock *sk)
> > > > +{
> > > > +       int score = 0;
> > > > +
> > > > +       if (sk->sk_bound_dev_if)
> > > > +               score += 2;
> > > > +
> > > > +       if (sk->sk_family == PF_INET)
> > > > +               score += 10;
> > > > +
> > > > +       /* the priority of sk_incoming_cpu should be lower than sk_bound_dev_if,
> > > > +        * as it's optional in compute_score(). Thank God, this is the only
> > >
> > > Please do not bring God here.
> > >
> > > > +        * variable condition, which we can't judge now.
> > > > +        */
> > > > +       if (READ_ONCE(sk->sk_incoming_cpu))
> > > > +               score++;
> > > > +
> > > > +       return score;
> > > > +}
> > > > +
> > > >  /**
> > > >   * inet_lookup_reuseport() - execute reuseport logic on AF_INET socket if necessary.
> > > >   * @net: network namespace.
> > > > @@ -739,6 +759,27 @@ static int inet_reuseport_add_sock(struct sock *sk,
> > > >         return reuseport_alloc(sk, inet_rcv_saddr_any(sk));
> > > >  }
> > > >
> > > > +static void inet_hash_reuseport(struct sock *sk, struct hlist_nulls_head *head)
> > > > +{
> > > > +       const struct hlist_nulls_node *node;
> > > > +       int score, curscore;
> > > > +       struct sock *sk2;
> > > > +
> > > > +       curscore = compute_reuseport_score(sk);
> > > > +       /* lookup the socket to insert before */
> > > > +       sk_nulls_for_each_rcu(sk2, node, head) {
> > > > +               if (!sk2->sk_reuseport)
> > > > +                       continue;
> > > > +               score = compute_reuseport_score(sk2);
> > > > +               if (score <= curscore) {
> > > > +                       __sk_nulls_add_node_before_rcu(sk, sk2);
> > > > +                       return;
> > > > +               }
> > > > +       }
> > > > +
> > > > +       __sk_nulls_add_node_tail_rcu(sk, head);
> > > > +}
> > > > +
> > > >  int __inet_hash(struct sock *sk, struct sock *osk)
> > > >  {
> > > >         struct inet_hashinfo *hashinfo = tcp_get_hashinfo(sk);
> > > > @@ -761,11 +802,11 @@ int __inet_hash(struct sock *sk, struct sock *osk)
> > > >                         goto unlock;
> > > >         }
> > > >         sock_set_flag(sk, SOCK_RCU_FREE);
> > > > -       if (IS_ENABLED(CONFIG_IPV6) && sk->sk_reuseport &&
> > > > -               sk->sk_family == AF_INET6)
> > > > -               __sk_nulls_add_node_tail_rcu(sk, &ilb2->nulls_head);
> > > > -       else
> > > > +       if (!sk->sk_reuseport)
> > > >                 __sk_nulls_add_node_rcu(sk, &ilb2->nulls_head);
> > > > +       else
> > > > +               inet_hash_reuseport(sk, &ilb2->nulls_head);
> > > > +
> > > >         sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
> > > >  unlock:
> > > >         spin_unlock(&ilb2->lock);
> > > > --
> > > > 2.50.1
> > > >


* Re: [PATCH net v2] net: ip: order the reuseport socket in __inet_hash
  2025-08-01  9:09 [PATCH net v2] net: ip: order the reuseport socket in __inet_hash Menglong Dong
  2025-08-01  9:46 ` Eric Dumazet
@ 2025-08-11  5:27 ` kernel test robot
  2025-08-16  2:18   ` [LTP] " Wei Gao
  1 sibling, 1 reply; 9+ messages in thread
From: kernel test robot @ 2025-08-11  5:27 UTC (permalink / raw)
  To: Menglong Dong
  Cc: oe-lkp, lkp, Menglong Dong, rcu, netdev, ltp, edumazet, kuniyu,
	kraig, ncardwell, davem, dsahern, kuba, pabeni, horms,
	linux-kernel, oliver.sang



Hello,

kernel test robot noticed "BUG:KASAN:slab-use-after-free_in__inet_hash" on:

commit: 859ca60b71ef223e210d3d003a225d9ca70879fd ("[PATCH net v2] net: ip: order the reuseport socket in __inet_hash")
url: https://github.com/intel-lab-lkp/linux/commits/Menglong-Dong/net-ip-order-the-reuseport-socket-in-__inet_hash/20250801-171131
base: https://git.kernel.org/cgit/linux/kernel/git/davem/net.git 01051012887329ea78eaca19b1d2eac4c9f601b5
patch link: https://lore.kernel.org/all/20250801090949.129941-1-dongml2@chinatelecom.cn/
patch subject: [PATCH net v2] net: ip: order the reuseport socket in __inet_hash

in testcase: ltp
version: ltp-x86_64-6505f9e29-1_20250802
with following parameters:

	disk: 1HDD
	fs: ext4
	test: fs_perms_simple



config: x86_64-rhel-9.4-ltp
compiler: gcc-12
test machine: 4 threads 1 sockets Intel(R) Core(TM) i3-3220 CPU @ 3.30GHz (Ivy Bridge) with 8G memory

(please refer to attached dmesg/kmsg for entire log/backtrace)



If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <oliver.sang@intel.com>
| Closes: https://lore.kernel.org/oe-lkp/202508110750.a66a4225-lkp@intel.com


kern :err : [  128.186735] BUG: KASAN: slab-use-after-free in __inet_hash (net/ipv4/inet_hashtables.c:749 net/ipv4/inet_hashtables.c:800) 
kern  :err   : [  128.186868] Read of size 2 at addr ffff8882125c5f10 by task isc-net-0001/3160

kern  :err   : [  128.187050] CPU: 2 UID: 108 PID: 3160 Comm: isc-net-0001 Tainted: G S                  6.16.0-06590-g859ca60b71ef #1 PREEMPT(voluntary)
kern  :err   : [  128.187056] Tainted: [S]=CPU_OUT_OF_SPEC
kern  :err   : [  128.187058] Hardware name: Hewlett-Packard p6-1451cx/2ADA, BIOS 8.15 02/05/2013
kern  :err   : [  128.187060] Call Trace:
kern  :err   : [  128.187063]  <TASK>
kern :err : [  128.187065] dump_stack_lvl (lib/dump_stack.c:123 (discriminator 1)) 
kern :err : [  128.187072] print_address_description+0x2c/0x390 
kern :err : [  128.187079] ? __inet_hash (net/ipv4/inet_hashtables.c:749 net/ipv4/inet_hashtables.c:800) 
kern :err : [  128.187084] print_report (mm/kasan/report.c:483) 
kern :err : [  128.187088] ? kasan_addr_to_slab (mm/kasan/common.c:37) 
kern :err : [  128.187092] ? __inet_hash (net/ipv4/inet_hashtables.c:749 net/ipv4/inet_hashtables.c:800) 
kern :err : [  128.187096] kasan_report (mm/kasan/report.c:597) 
kern :err : [  128.187101] ? __inet_hash (net/ipv4/inet_hashtables.c:749 net/ipv4/inet_hashtables.c:800) 
kern :err : [  128.187106] __inet_hash (net/ipv4/inet_hashtables.c:749 net/ipv4/inet_hashtables.c:800) 
kern :err : [  128.187111] inet_csk_listen_start (net/ipv4/inet_connection_sock.c:1356) 
kern :err : [  128.187115] __inet_listen_sk (net/ipv4/af_inet.c:219) 
kern :err : [  128.187120] ? __pfx___inet_listen_sk (net/ipv4/af_inet.c:192) 
kern :err : [  128.187123] ? _raw_spin_lock_bh (arch/x86/include/asm/atomic.h:107 include/linux/atomic/atomic-arch-fallback.h:2170 include/linux/atomic/atomic-instrumented.h:1302 include/asm-generic/qspinlock.h:111 include/linux/spinlock.h:187 include/linux/spinlock_api_smp.h:127 kernel/locking/spinlock.c:178) 
kern :err : [  128.187128] ? __pfx__raw_spin_lock_bh (kernel/locking/spinlock.c:177) 
kern :err : [  128.187134] inet_listen (net/ipv4/af_inet.c:240) 
kern :err : [  128.187138] __sys_listen (include/linux/file.h:62 include/linux/file.h:83 net/socket.c:1918) 
kern :err : [  128.187144] __x64_sys_listen (net/socket.c:1930) 
kern :err : [  128.187148] ? __x64_sys_getsockname (net/socket.c:2145) 
kern :err : [  128.187152] do_syscall_64 (arch/x86/entry/syscall_64.c:63 arch/x86/entry/syscall_64.c:94) 
kern :err : [  128.187155] ? do_syscall_64 (arch/x86/entry/syscall_64.c:63 arch/x86/entry/syscall_64.c:94) 
kern :err : [  128.187159] ? do_sock_setsockopt (net/socket.c:2313) 
kern :err : [  128.187163] ? __x64_sys_bind (net/socket.c:1892) 
kern :err : [  128.187167] ? do_syscall_64 (arch/x86/entry/syscall_64.c:63 arch/x86/entry/syscall_64.c:94) 
kern :err : [  128.187169] ? alloc_fd (fs/file.c:612) 
kern :err : [  128.187174] ? fdget (include/linux/file.h:57 fs/file.c:1176 fs/file.c:1181) 
kern :err : [  128.187178] ? fput (arch/x86/include/asm/atomic64_64.h:79 include/linux/atomic/atomic-arch-fallback.h:2913 include/linux/atomic/atomic-arch-fallback.h:3364 include/linux/atomic/atomic-long.h:698 include/linux/atomic/atomic-instrumented.h:3767 include/linux/file_ref.h:157 fs/file_table.c:544) 
kern :err : [  128.187181] ? __sys_setsockopt (include/linux/file.h:63 include/linux/file.h:83 net/socket.c:2361) 
kern :err : [  128.187185] ? __x64_sys_setsockopt (net/socket.c:2372) 
kern :err : [  128.187188] ? do_syscall_64 (arch/x86/entry/syscall_64.c:63 arch/x86/entry/syscall_64.c:94) 
kern :err : [  128.187191] ? __x64_sys_openat (fs/open.c:1461) 
kern :err : [  128.187194] ? __pfx___x64_sys_openat (fs/open.c:1461) 
kern :err : [  128.187198] ? __x64_sys_setsockopt (net/socket.c:2372) 
kern :err : [  128.187201] ? count_memcg_events (arch/x86/include/asm/atomic.h:23 include/linux/atomic/atomic-arch-fallback.h:457 include/linux/atomic/atomic-instrumented.h:33 mm/memcontrol.c:560 mm/memcontrol.c:585 mm/memcontrol.c:564 mm/memcontrol.c:848) 
kern :err : [  128.187206] ? do_syscall_64 (arch/x86/entry/syscall_64.c:63 arch/x86/entry/syscall_64.c:94) 
kern :err : [  128.187209] ? handle_mm_fault (mm/memory.c:6272 mm/memory.c:6425) 
kern :err : [  128.187213] ? do_user_addr_fault (arch/x86/include/asm/atomic.h:93 include/linux/atomic/atomic-arch-fallback.h:949 include/linux/atomic/atomic-instrumented.h:401 include/linux/refcount.h:389 include/linux/refcount.h:432 include/linux/mmap_lock.h:142 include/linux/mmap_lock.h:237 arch/x86/mm/fault.c:1338) 
kern :err : [  128.187218] ? exc_page_fault (arch/x86/include/asm/irqflags.h:37 arch/x86/include/asm/irqflags.h:114 arch/x86/mm/fault.c:1484 arch/x86/mm/fault.c:1532) 
kern :err : [  128.187223] entry_SYSCALL_64_after_hwframe (arch/x86/entry/entry_64.S:130) 
kern  :err   : [  128.187227] RIP: 0033:0x7fe51b028897
kern :err : [ 128.187231] Code: f0 ff ff 77 06 c3 0f 1f 44 00 00 48 8b 15 61 75 0c 00 f7 d8 64 89 02 b8 ff ff ff ff c3 66 0f 1f 44 00 00 b8 32 00 00 00 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 8b 0d 39 75 0c 00 f7 d8 64 89 01 48
All code
========
   0:	f0 ff                	lock (bad)
   2:	ff 77 06             	push   0x6(%rdi)
   5:	c3                   	ret
   6:	0f 1f 44 00 00       	nopl   0x0(%rax,%rax,1)
   b:	48 8b 15 61 75 0c 00 	mov    0xc7561(%rip),%rdx        # 0xc7573
  12:	f7 d8                	neg    %eax
  14:	64 89 02             	mov    %eax,%fs:(%rdx)
  17:	b8 ff ff ff ff       	mov    $0xffffffff,%eax
  1c:	c3                   	ret
  1d:	66 0f 1f 44 00 00    	nopw   0x0(%rax,%rax,1)
  23:	b8 32 00 00 00       	mov    $0x32,%eax
  28:	0f 05                	syscall
  2a:*	48 3d 01 f0 ff ff    	cmp    $0xfffffffffffff001,%rax		<-- trapping instruction
  30:	73 01                	jae    0x33
  32:	c3                   	ret
  33:	48 8b 0d 39 75 0c 00 	mov    0xc7539(%rip),%rcx        # 0xc7573
  3a:	f7 d8                	neg    %eax
  3c:	64 89 01             	mov    %eax,%fs:(%rcx)
  3f:	48                   	rex.W

Code starting with the faulting instruction
===========================================
   0:	48 3d 01 f0 ff ff    	cmp    $0xfffffffffffff001,%rax
   6:	73 01                	jae    0x9
   8:	c3                   	ret
   9:	48 8b 0d 39 75 0c 00 	mov    0xc7539(%rip),%rcx        # 0xc7549
  10:	f7 d8                	neg    %eax
  12:	64 89 01             	mov    %eax,%fs:(%rcx)
  15:	48                   	rex.W
kern  :err   : [  128.187235] RSP: 002b:00007fe5169fe0f8 EFLAGS: 00000217 ORIG_RAX: 0000000000000032
kern  :err   : [  128.187239] RAX: ffffffffffffffda RBX: 00007fe516a1d760 RCX: 00007fe51b028897
kern  :err   : [  128.187241] RDX: 0000000000000002 RSI: 000000000000000a RDI: 000000000000002c
kern  :err   : [  128.187243] RBP: 0000000000000000 R08: 0000000000008000 R09: 00000000ffffffff
kern  :err   : [  128.187245] R10: 00007fe5169fe024 R11: 0000000000000217 R12: 00007fe51bbd1d70
kern  :err   : [  128.187248] R13: 000000000000000a R14: 00007fe5182de000 R15: 00007fe516a1d5d0
kern  :err   : [  128.187252]  </TASK>

kern  :err   : [  128.192052] Allocated by task 2436:
kern :warn : [  128.192126] kasan_save_stack (mm/kasan/common.c:48) 
kern :warn : [  128.192209] kasan_save_track (arch/x86/include/asm/current.h:25 mm/kasan/common.c:60 mm/kasan/common.c:69) 
kern :warn : [  128.192289] __kasan_slab_alloc (mm/kasan/common.c:319 mm/kasan/common.c:345) 
kern :warn : [  128.192373] kmem_cache_alloc_noprof (mm/slub.c:4148 mm/slub.c:4197 mm/slub.c:4204) 
kern :warn : [  128.192466] sk_prot_alloc (net/core/sock.c:2233 (discriminator 2)) 
kern :warn : [  128.192545] sk_alloc (net/core/sock.c:2295) 
kern :warn : [  128.192615] inet_create (net/ipv4/af_inet.c:1733 (discriminator 2)) 
kern :warn : [  128.192717] __sock_create (net/socket.c:1590) 
kern :warn : [  128.192796] __sys_socket (net/socket.c:1686 net/socket.c:1669 net/socket.c:1731) 
kern :warn : [  128.192874] __x64_sys_socket (net/socket.c:1743) 
kern :warn : [  128.192956] do_syscall_64 (arch/x86/entry/syscall_64.c:63 arch/x86/entry/syscall_64.c:94) 
kern :warn : [  128.193034] entry_SYSCALL_64_after_hwframe (arch/x86/entry/entry_64.S:130) 

kern  :err   : [  128.193176] Freed by task 0:
kern :warn : [  128.193240] kasan_save_stack (mm/kasan/common.c:48) 
kern :warn : [  128.193321] kasan_save_track (arch/x86/include/asm/current.h:25 mm/kasan/common.c:60 mm/kasan/common.c:69) 
kern :warn : [  128.193401] kasan_save_free_info (mm/kasan/generic.c:579) 
kern :warn : [  128.193487] __kasan_slab_free (mm/kasan/common.c:271) 
kern :warn : [  128.193569] slab_free_after_rcu_debug (mm/slub.c:4693) 
kern :warn : [  128.193663] rcu_do_batch (arch/x86/include/asm/preempt.h:27 kernel/rcu/tree.c:2583) 
kern :warn : [  128.193740] rcu_core (kernel/rcu/tree.c:2834) 
kern :warn : [  128.193812] handle_softirqs (arch/x86/include/asm/jump_label.h:36 include/trace/events/irq.h:142 kernel/softirq.c:580) 
kern :warn : [  128.193894] __irq_exit_rcu (kernel/softirq.c:614 kernel/softirq.c:453 kernel/softirq.c:680) 
kern :warn : [  128.193977] sysvec_apic_timer_interrupt (arch/x86/kernel/apic/apic.c:1050 arch/x86/kernel/apic/apic.c:1050) 
kern :warn : [  128.194074] asm_sysvec_apic_timer_interrupt (arch/x86/include/asm/idtentry.h:574) 

kern  :err   : [  128.194217] Last potentially related work creation:
kern :warn : [  128.194312] kasan_save_stack (mm/kasan/common.c:48) 
kern :warn : [  128.194393] kasan_record_aux_stack (mm/kasan/generic.c:548) 
kern :warn : [  128.194481] kmem_cache_free (mm/slub.c:2344 mm/slub.c:4643 mm/slub.c:4745) 
kern :warn : [  128.194563] __sk_destruct (net/core/sock.c:2279 net/core/sock.c:2373) 
kern :warn : [  128.194642] rcu_do_batch (arch/x86/include/asm/preempt.h:27 kernel/rcu/tree.c:2583) 
kern :warn : [  128.194719] rcu_core (kernel/rcu/tree.c:2834) 
kern :warn : [  128.194791] handle_softirqs (arch/x86/include/asm/jump_label.h:36 include/trace/events/irq.h:142 kernel/softirq.c:580) 
kern :warn : [  128.194873] __irq_exit_rcu (kernel/softirq.c:614 kernel/softirq.c:453 kernel/softirq.c:680) 
kern :warn : [  128.194955] sysvec_apic_timer_interrupt (arch/x86/kernel/apic/apic.c:1050 arch/x86/kernel/apic/apic.c:1050) 
kern :warn : [  128.195052] asm_sysvec_apic_timer_interrupt (arch/x86/include/asm/idtentry.h:574) 

kern  :err   : [  128.195194] Second to last potentially related work creation:
kern :warn : [  128.195303] kasan_save_stack (mm/kasan/common.c:48) 
kern :warn : [  128.195383] kasan_record_aux_stack (mm/kasan/generic.c:548) 
kern :warn : [  128.195472] __call_rcu_common+0xc8/0x980 
kern :warn : [  128.195571] inet_release (net/ipv4/af_inet.c:436) 
kern :warn : [  128.195648] __sock_release (net/socket.c:650) 
kern :warn : [  128.195727] sock_close (net/socket.c:1441) 
kern :warn : [  128.195799] __fput (fs/file_table.c:468) 
kern :warn : [  128.195869] fput_close_sync (fs/file_table.c:571) 
kern :warn : [  128.195951] __x64_sys_close (fs/open.c:1590 fs/open.c:1572 fs/open.c:1572) 
kern :warn : [  128.196032] do_syscall_64 (arch/x86/entry/syscall_64.c:63 arch/x86/entry/syscall_64.c:94) 
kern :warn : [  128.196109] entry_SYSCALL_64_after_hwframe (arch/x86/entry/entry_64.S:130) 

kern  :err   : [  128.196250] The buggy address belongs to the object at ffff8882125c5f00
which belongs to the cache TCP of size 2304
kern  :err   : [  128.196468] The buggy address is located 16 bytes inside of
freed 2304-byte region [ffff8882125c5f00, ffff8882125c6800)

kern  :err   : [  128.196733] The buggy address belongs to the physical page:
kern  :warn  : [  128.196839] page: refcount:0 mapcount:0 mapping:0000000000000000 index:0xffff8882125c5580 pfn:0x2125c0
kern  :warn  : [  128.197008] head: order:3 mapcount:0 entire_mapcount:0 nr_pages_mapped:0 pincount:0
kern  :warn  : [  128.197148] memcg:ffff888217e99e01
kern  :warn  : [  128.197221] anon flags: 0x17ffffc0000040(head|node=0|zone=2|lastcpupid=0x1fffff)
kern  :warn  : [  128.197358] page_type: f5(slab)
kern  :warn  : [  128.197429] raw: 0017ffffc0000040 ffff88810221c640 0000000000000000 0000000000000001


The kernel config and materials to reproduce are available at:
https://download.01.org/0day-ci/archive/20250811/202508110750.a66a4225-lkp@intel.com



-- 
0-DAY CI Kernel Test Service
https://github.com/intel/lkp-tests/wiki


^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [LTP] [PATCH net v2] net: ip: order the reuseport socket in __inet_hash
  2025-08-11  5:27 ` kernel test robot
@ 2025-08-16  2:18   ` Wei Gao
  2025-08-16  2:35     ` Kuniyuki Iwashima
  0 siblings, 1 reply; 9+ messages in thread
From: Wei Gao @ 2025-08-16  2:18 UTC (permalink / raw)
  To: kernel test robot
  Cc: Menglong Dong, kuniyu, kraig, lkp, netdev, dsahern, linux-kernel,
	rcu, edumazet, horms, oe-lkp, kuba, pabeni, ncardwell, davem, ltp,
	Menglong Dong

On Mon, Aug 11, 2025 at 01:27:12PM +0800, kernel test robot wrote:
> 
> 
> Hello,
> 
> kernel test robot noticed "BUG:KASAN:slab-use-after-free_in__inet_hash" on:
> 
> commit: 859ca60b71ef223e210d3d003a225d9ca70879fd ("[PATCH net v2] net: ip: order the reuseport socket in __inet_hash")
> url: https://github.com/intel-lab-lkp/linux/commits/Menglong-Dong/net-ip-order-the-reuseport-socket-in-__inet_hash/20250801-171131
> base: https://git.kernel.org/cgit/linux/kernel/git/davem/net.git 01051012887329ea78eaca19b1d2eac4c9f601b5
> patch link: https://lore.kernel.org/all/20250801090949.129941-1-dongml2@chinatelecom.cn/
> patch subject: [PATCH net v2] net: ip: order the reuseport socket in __inet_hash
> 
> in testcase: ltp
> version: ltp-x86_64-6505f9e29-1_20250802
> with following parameters:
> 
> 	disk: 1HDD
> 	fs: ext4
> 	test: fs_perms_simple
> 
> 
> 
> config: x86_64-rhel-9.4-ltp
> compiler: gcc-12
> test machine: 4 threads 1 sockets Intel(R) Core(TM) i3-3220 CPU @ 3.30GHz (Ivy Bridge) with 8G memory
> 
> (please refer to attached dmesg/kmsg for entire log/backtrace)
> 
> 
> 
> If you fix the issue in a separate patch/commit (i.e. not just a new version of
> the same patch/commit), kindly add following tags
> | Reported-by: kernel test robot <oliver.sang@intel.com>
> | Closes: https://lore.kernel.org/oe-lkp/202508110750.a66a4225-lkp@intel.com
> 
> 
> kern :err : [  128.186735] BUG: KASAN: slab-use-after-free in __inet_hash (net/ipv4/inet_hashtables.c:749 net/ipv4/inet_hashtables.c:800) 

This KASAN error is not related to the LTP case; I guess it was triggered
by a network-related process calling bind() or similar. I propose the
following patch to fix the KASAN error; please correct me if I got
anything wrong, thanks.

From: Wei Gao <wegao@suse.com>
Date: Sat, 16 Aug 2025 09:32:56 +0800
Subject: [PATCH v1] net: Fix BUG:KASAN:slab-use-after-free_in__inet_hash

Reported-by: kernel test robot <oliver.sang@intel.com>
Closes: https://lore.kernel.org/oe-lkp/202508110750.a66a4225-lkp@intel.com
Signed-off-by: Wei Gao <wegao@suse.com>
---
 include/linux/rculist_nulls.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/include/linux/rculist_nulls.h b/include/linux/rculist_nulls.h
index da500f4ae142..5def9009c507 100644
--- a/include/linux/rculist_nulls.h
+++ b/include/linux/rculist_nulls.h
@@ -57,7 +57,7 @@ static inline void hlist_nulls_del_init_rcu(struct hlist_nulls_node *n)
  * @node: element of the list.
  */
 #define hlist_nulls_pprev_rcu(node) \
-       (*((struct hlist_nulls_node __rcu __force **)&(node)->pprev))
+       (*((struct hlist_nulls_node __rcu __force **)(node)->pprev))

 /**
  * hlist_nulls_del_rcu - deletes entry from hash list without re-initialization
@@ -175,7 +175,7 @@ static inline void hlist_nulls_add_before_rcu(struct hlist_nulls_node *n,
 {
        WRITE_ONCE(n->pprev, next->pprev);
        n->next = next;
-       rcu_assign_pointer(hlist_nulls_pprev_rcu(n), n);
+       rcu_assign_pointer(hlist_nulls_pprev_rcu(next), n);
        WRITE_ONCE(next->pprev, &n->next);
 }

--
2.43.0
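
For reference, a quick illustration of why the publish step has to write
through *(next->pprev) -- the slot in the predecessor (or in the bucket
head) that currently points at next -- rather than into n->pprev itself.
This is only a simplified user-space sketch under stated assumptions: no
RCU, no nulls markers, no __rcu annotations, and struct node, struct
bucket, add_head(), add_before() and the node ids are invented for the
example, not kernel APIs. It is similar in spirit to the existing
hlist_add_before_rcu() in include/linux/rculist.h, which also publishes
the new node through the pprev slot rather than into it.

/* Minimal sketch, not kernel code: simplified stand-in for an
 * hlist_nulls-style "insert before" with the fixed publish step.
 */
#include <stdio.h>

struct node {
	struct node *next;
	struct node **pprev;	/* address of the pointer that points at this node */
	int id;
};

struct bucket {
	struct node *first;
};

static void add_head(struct node *n, struct bucket *b)
{
	n->next = b->first;
	n->pprev = &b->first;
	if (b->first)
		b->first->pprev = &n->next;
	b->first = n;
}

/* Corrected "add before": publish n through the predecessor's slot,
 * roughly what rcu_assign_pointer(hlist_nulls_pprev_rcu(next), n)
 * does after the fix, minus the RCU release barrier.
 */
static void add_before(struct node *n, struct node *next)
{
	n->pprev = next->pprev;
	n->next = next;
	*(next->pprev) = n;	/* predecessor (or bucket head) now sees n */
	next->pprev = &n->next;
}

static void dump(struct bucket *b)
{
	for (struct node *p = b->first; p; p = p->next)
		printf("%d ", p->id);
	printf("\n");
}

int main(void)
{
	struct bucket b = { 0 };
	struct node a = { .id = 1 }, c = { .id = 2 }, d = { .id = 3 };

	add_head(&a, &b);	/* bucket: 1 */
	add_before(&c, &a);	/* bucket: 2 1 */
	add_before(&d, &a);	/* bucket: 2 3 1 */
	dump(&b);		/* prints "2 3 1" */
	return 0;
}

With the unfixed helper, n was instead stored into n->pprev itself, so
the bucket head kept pointing at next while next->pprev already pointed
into n. One plausible consequence is that a socket unhashed and freed on
close stays reachable from the forward chain, which would line up with
the slab-use-after-free KASAN reported in __inet_hash().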


^ permalink raw reply related	[flat|nested] 9+ messages in thread

* Re: [LTP] [PATCH net v2] net: ip: order the reuseport socket in __inet_hash
  2025-08-16  2:18   ` [LTP] " Wei Gao
@ 2025-08-16  2:35     ` Kuniyuki Iwashima
  2025-08-16  7:27       ` Wei Gao
  0 siblings, 1 reply; 9+ messages in thread
From: Kuniyuki Iwashima @ 2025-08-16  2:35 UTC (permalink / raw)
  To: Wei Gao
  Cc: kernel test robot, Menglong Dong, kraig, lkp, netdev, dsahern,
	linux-kernel, rcu, edumazet, horms, oe-lkp, kuba, pabeni,
	ncardwell, davem, ltp, Menglong Dong

On Fri, Aug 15, 2025 at 7:18 PM Wei Gao <wegao@suse.com> wrote:
>
> On Mon, Aug 11, 2025 at 01:27:12PM +0800, kernel test robot wrote:
> >
> >
> > Hello,
> >
> > kernel test robot noticed "BUG:KASAN:slab-use-after-free_in__inet_hash" on:
> >
> > commit: 859ca60b71ef223e210d3d003a225d9ca70879fd ("[PATCH net v2] net: ip: order the reuseport socket in __inet_hash")
> > url: https://github.com/intel-lab-lkp/linux/commits/Menglong-Dong/net-ip-order-the-reuseport-socket-in-__inet_hash/20250801-171131
> > base: https://git.kernel.org/cgit/linux/kernel/git/davem/net.git 01051012887329ea78eaca19b1d2eac4c9f601b5
> > patch link: https://lore.kernel.org/all/20250801090949.129941-1-dongml2@chinatelecom.cn/
> > patch subject: [PATCH net v2] net: ip: order the reuseport socket in __inet_hash
> >
> > in testcase: ltp
> > version: ltp-x86_64-6505f9e29-1_20250802
> > with following parameters:
> >
> >       disk: 1HDD
> >       fs: ext4
> >       test: fs_perms_simple
> >
> >
> >
> > config: x86_64-rhel-9.4-ltp
> > compiler: gcc-12
> > test machine: 4 threads 1 sockets Intel(R) Core(TM) i3-3220 CPU @ 3.30GHz (Ivy Bridge) with 8G memory
> >
> > (please refer to attached dmesg/kmsg for entire log/backtrace)
> >
> >
> >
> > If you fix the issue in a separate patch/commit (i.e. not just a new version of
> > the same patch/commit), kindly add following tags
> > | Reported-by: kernel test robot <oliver.sang@intel.com>
> > | Closes: https://lore.kernel.org/oe-lkp/202508110750.a66a4225-lkp@intel.com
> >
> >
> > kern :err : [  128.186735] BUG: KASAN: slab-use-after-free in __inet_hash (net/ipv4/inet_hashtables.c:749 net/ipv4/inet_hashtables.c:800)
>
> This KASAN error is not related to the LTP case; I guess it was triggered
> by a network-related process calling bind() or similar. I propose the
> following patch to fix the KASAN error; please correct me if I got
> anything wrong, thanks.

Note that the report was for the patch posted on the mailing list;
the patch has not been applied to net-next.git or net.git.


>
> From: Wei Gao <wegao@suse.com>
> Date: Sat, 16 Aug 2025 09:32:56 +0800
> Subject: [PATCH v1] net: Fix BUG:KASAN:slab-use-after-free_in__inet_hash
>
> Reported-by: kernel test robot <oliver.sang@intel.com>
> Closes: https://lore.kernel.org/oe-lkp/202508110750.a66a4225-lkp@intel.com
> Signed-off-by: Wei Gao <wegao@suse.com>
> ---
>  include/linux/rculist_nulls.h | 4 ++--
>  1 file changed, 2 insertions(+), 2 deletions(-)
>
> diff --git a/include/linux/rculist_nulls.h b/include/linux/rculist_nulls.h
> index da500f4ae142..5def9009c507 100644
> --- a/include/linux/rculist_nulls.h
> +++ b/include/linux/rculist_nulls.h
> @@ -57,7 +57,7 @@ static inline void hlist_nulls_del_init_rcu(struct hlist_nulls_node *n)
>   * @node: element of the list.
>   */
>  #define hlist_nulls_pprev_rcu(node) \
> -       (*((struct hlist_nulls_node __rcu __force **)&(node)->pprev))
> +       (*((struct hlist_nulls_node __rcu __force **)(node)->pprev))
>
>  /**
>   * hlist_nulls_del_rcu - deletes entry from hash list without re-initialization
> @@ -175,7 +175,7 @@ static inline void hlist_nulls_add_before_rcu(struct hlist_nulls_node *n,
>  {
>         WRITE_ONCE(n->pprev, next->pprev);
>         n->next = next;
> -       rcu_assign_pointer(hlist_nulls_pprev_rcu(n), n);
> +       rcu_assign_pointer(hlist_nulls_pprev_rcu(next), n);
>         WRITE_ONCE(next->pprev, &n->next);
>  }
>
> --
> 2.43.0
>

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [LTP] [PATCH net v2] net: ip: order the reuseport socket in __inet_hash
  2025-08-16  2:35     ` Kuniyuki Iwashima
@ 2025-08-16  7:27       ` Wei Gao
  0 siblings, 0 replies; 9+ messages in thread
From: Wei Gao @ 2025-08-16  7:27 UTC (permalink / raw)
  To: Kuniyuki Iwashima
  Cc: kernel test robot, Menglong Dong, kraig, lkp, netdev, dsahern,
	linux-kernel, rcu, edumazet, horms, oe-lkp, kuba, pabeni,
	ncardwell, davem, ltp, Menglong Dong

On Fri, Aug 15, 2025 at 07:35:10PM -0700, Kuniyuki Iwashima wrote:
> On Fri, Aug 15, 2025 at 7:18 PM Wei Gao <wegao@suse.com> wrote:
> >
> > On Mon, Aug 11, 2025 at 01:27:12PM +0800, kernel test robot wrote:
> > >
> > >
> > > Hello,
> > >
> > > kernel test robot noticed "BUG:KASAN:slab-use-after-free_in__inet_hash" on:
> > >
> > > commit: 859ca60b71ef223e210d3d003a225d9ca70879fd ("[PATCH net v2] net: ip: order the reuseport socket in __inet_hash")
> > > url: https://github.com/intel-lab-lkp/linux/commits/Menglong-Dong/net-ip-order-the-reuseport-socket-in-__inet_hash/20250801-171131
> > > base: https://git.kernel.org/cgit/linux/kernel/git/davem/net.git 01051012887329ea78eaca19b1d2eac4c9f601b5
> > > patch link: https://lore.kernel.org/all/20250801090949.129941-1-dongml2@chinatelecom.cn/
> > > patch subject: [PATCH net v2] net: ip: order the reuseport socket in __inet_hash
> > >
> > > in testcase: ltp
> > > version: ltp-x86_64-6505f9e29-1_20250802
> > > with following parameters:
> > >
> > >       disk: 1HDD
> > >       fs: ext4
> > >       test: fs_perms_simple
> > >
> > >
> > >
> > > config: x86_64-rhel-9.4-ltp
> > > compiler: gcc-12
> > > test machine: 4 threads 1 sockets Intel(R) Core(TM) i3-3220 CPU @ 3.30GHz (Ivy Bridge) with 8G memory
> > >
> > > (please refer to attached dmesg/kmsg for entire log/backtrace)
> > >
> > >
> > >
> > > If you fix the issue in a separate patch/commit (i.e. not just a new version of
> > > the same patch/commit), kindly add following tags
> > > | Reported-by: kernel test robot <oliver.sang@intel.com>
> > > | Closes: https://lore.kernel.org/oe-lkp/202508110750.a66a4225-lkp@intel.com
> > >
> > >
> > > kern :err : [  128.186735] BUG: KASAN: slab-use-after-free in __inet_hash (net/ipv4/inet_hashtables.c:749 net/ipv4/inet_hashtables.c:800)
> >
> > This KASAN error is not related to the LTP case; I guess it was triggered
> > by a network-related process calling bind() or similar. I propose the
> > following patch to fix the KASAN error; please correct me if I got
> > anything wrong, thanks.
> 
> Note that the report was for the patch posted on the mailing list;
> the patch has not been applied to net-next.git or net.git.
Thanks for the note.
This email was also sent to the LTP list, which is how I saw it, and since
I'm interested in this KASAN problem, I'm trying to fix it.
> 
> 
> >
> > From: Wei Gao <wegao@suse.com>
> > Date: Sat, 16 Aug 2025 09:32:56 +0800
> > Subject: [PATCH v1] net: Fix BUG:KASAN:slab-use-after-free_in__inet_hash
> >
> > Reported-by: kernel test robot <oliver.sang@intel.com>
> > Closes: https://lore.kernel.org/oe-lkp/202508110750.a66a4225-lkp@intel.com
> > Signed-off-by: Wei Gao <wegao@suse.com>
> > ---
> >  include/linux/rculist_nulls.h | 4 ++--
> >  1 file changed, 2 insertions(+), 2 deletions(-)
> >
> > diff --git a/include/linux/rculist_nulls.h b/include/linux/rculist_nulls.h
> > index da500f4ae142..5def9009c507 100644
> > --- a/include/linux/rculist_nulls.h
> > +++ b/include/linux/rculist_nulls.h
> > @@ -57,7 +57,7 @@ static inline void hlist_nulls_del_init_rcu(struct hlist_nulls_node *n)
> >   * @node: element of the list.
> >   */
> >  #define hlist_nulls_pprev_rcu(node) \
> > -       (*((struct hlist_nulls_node __rcu __force **)&(node)->pprev))
> > +       (*((struct hlist_nulls_node __rcu __force **)(node)->pprev))
> >
> >  /**
> >   * hlist_nulls_del_rcu - deletes entry from hash list without re-initialization
> > @@ -175,7 +175,7 @@ static inline void hlist_nulls_add_before_rcu(struct hlist_nulls_node *n,
> >  {
> >         WRITE_ONCE(n->pprev, next->pprev);
> >         n->next = next;
> > -       rcu_assign_pointer(hlist_nulls_pprev_rcu(n), n);
> > +       rcu_assign_pointer(hlist_nulls_pprev_rcu(next), n);
> >         WRITE_ONCE(next->pprev, &n->next);
> >  }
> >
> > --
> > 2.43.0
> >

^ permalink raw reply	[flat|nested] 9+ messages in thread

end of thread, other threads:[~2025-08-16  7:27 UTC | newest]

Thread overview: 9+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2025-08-01  9:09 [PATCH net v2] net: ip: order the reuseport socket in __inet_hash Menglong Dong
2025-08-01  9:46 ` Eric Dumazet
2025-08-01 10:42   ` Menglong Dong
2025-08-01 16:46     ` Kuniyuki Iwashima
2025-08-02  0:59       ` Menglong Dong
2025-08-11  5:27 ` kernel test robot
2025-08-16  2:18   ` [LTP] " Wei Gao
2025-08-16  2:35     ` Kuniyuki Iwashima
2025-08-16  7:27       ` Wei Gao

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).