Netdev List
 help / color / mirror / Atom feed
* [PATCH net-next 1/2] ip_gre: Make GRE and GRETAP devices always NETIF_F_LLTX
From: Peilin Ye @ 2022-04-21 22:44 UTC (permalink / raw)
  To: David S. Miller, Jakub Kicinski, Hideaki YOSHIFUJI, David Ahern,
	Paolo Abeni
  Cc: Peilin Ye, Cong Wang, Eric Dumazet, netdev, linux-kernel,
	Peilin Ye
In-Reply-To: <cover.1650580763.git.peilin.ye@bytedance.com>

From: Peilin Ye <peilin.ye@bytedance.com>

Recently we made o_seqno atomic_t.  Stop special-casing TUNNEL_SEQ, and
always mark GRE[TAP] devices as NETIF_F_LLTX, since we no longer need
the TX lock (&txq->_xmit_lock).

Depends on patch "ip_gre, ip6_gre: Fix race condition on o_seqno in
collect_md mode".

Signed-off-by: Peilin Ye <peilin.ye@bytedance.com>
---
 net/ipv4/ip_gre.c | 50 +++++++++++++++++++++--------------------------
 1 file changed, 22 insertions(+), 28 deletions(-)

diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c
index 8cf86e42c1d1..a81c2964f70b 100644
--- a/net/ipv4/ip_gre.c
+++ b/net/ipv4/ip_gre.c
@@ -748,6 +748,7 @@ static netdev_tx_t gre_tap_xmit(struct sk_buff *skb,
 static void ipgre_link_update(struct net_device *dev, bool set_mtu)
 {
 	struct ip_tunnel *tunnel = netdev_priv(dev);
+	__be16 flags;
 	int len;
 
 	len = tunnel->tun_hlen;
@@ -763,19 +764,15 @@ static void ipgre_link_update(struct net_device *dev, bool set_mtu)
 	if (set_mtu)
 		dev->mtu = max_t(int, dev->mtu - len, 68);
 
-	if (!(tunnel->parms.o_flags & TUNNEL_SEQ)) {
-		if (!(tunnel->parms.o_flags & TUNNEL_CSUM) ||
-		    tunnel->encap.type == TUNNEL_ENCAP_NONE) {
-			dev->features |= NETIF_F_GSO_SOFTWARE;
-			dev->hw_features |= NETIF_F_GSO_SOFTWARE;
-		} else {
-			dev->features &= ~NETIF_F_GSO_SOFTWARE;
-			dev->hw_features &= ~NETIF_F_GSO_SOFTWARE;
-		}
-		dev->features |= NETIF_F_LLTX;
-	} else {
+	flags = tunnel->parms.o_flags;
+
+	if (flags & TUNNEL_SEQ ||
+	    (flags & TUNNEL_CSUM && tunnel->encap.type != TUNNEL_ENCAP_NONE)) {
+		dev->features &= ~NETIF_F_GSO_SOFTWARE;
 		dev->hw_features &= ~NETIF_F_GSO_SOFTWARE;
-		dev->features &= ~(NETIF_F_LLTX | NETIF_F_GSO_SOFTWARE);
+	} else {
+		dev->features |= NETIF_F_GSO_SOFTWARE;
+		dev->hw_features |= NETIF_F_GSO_SOFTWARE;
 	}
 }
 
@@ -949,6 +946,7 @@ static void ipgre_tunnel_setup(struct net_device *dev)
 static void __gre_tunnel_init(struct net_device *dev)
 {
 	struct ip_tunnel *tunnel;
+	__be16 flags;
 
 	tunnel = netdev_priv(dev);
 	tunnel->tun_hlen = gre_calc_hlen(tunnel->parms.o_flags);
@@ -957,25 +955,21 @@ static void __gre_tunnel_init(struct net_device *dev)
 	tunnel->hlen = tunnel->tun_hlen + tunnel->encap_hlen;
 	dev->needed_headroom = tunnel->hlen + sizeof(tunnel->parms.iph);
 
-	dev->features		|= GRE_FEATURES;
+	dev->features		|= GRE_FEATURES | NETIF_F_LLTX;
 	dev->hw_features	|= GRE_FEATURES;
 
-	if (!(tunnel->parms.o_flags & TUNNEL_SEQ)) {
-		/* TCP offload with GRE SEQ is not supported, nor
-		 * can we support 2 levels of outer headers requiring
-		 * an update.
-		 */
-		if (!(tunnel->parms.o_flags & TUNNEL_CSUM) ||
-		    (tunnel->encap.type == TUNNEL_ENCAP_NONE)) {
-			dev->features    |= NETIF_F_GSO_SOFTWARE;
-			dev->hw_features |= NETIF_F_GSO_SOFTWARE;
-		}
+	flags = tunnel->parms.o_flags;
 
-		/* Can use a lockless transmit, unless we generate
-		 * output sequences
-		 */
-		dev->features |= NETIF_F_LLTX;
-	}
+	/* TCP offload with GRE SEQ is not supported, nor can we support 2
+	 * levels of outer headers requiring an update.
+	 */
+	if (flags & TUNNEL_SEQ)
+		return;
+	if (flags & TUNNEL_CSUM && tunnel->encap.type != TUNNEL_ENCAP_NONE)
+		return;
+
+	dev->features |= NETIF_F_GSO_SOFTWARE;
+	dev->hw_features |= NETIF_F_GSO_SOFTWARE;
 }
 
 static int ipgre_tunnel_init(struct net_device *dev)
-- 
2.20.1


^ permalink raw reply related

* [PATCH net-next 2/2] ip6_gre: Make IP6GRE and IP6GRETAP devices always NETIF_F_LLTX
From: Peilin Ye @ 2022-04-21 22:45 UTC (permalink / raw)
  To: David S. Miller, Jakub Kicinski, Hideaki YOSHIFUJI, David Ahern,
	Paolo Abeni
  Cc: Peilin Ye, Cong Wang, Eric Dumazet, netdev, linux-kernel,
	Peilin Ye
In-Reply-To: <cover.1650580763.git.peilin.ye@bytedance.com>

From: Peilin Ye <peilin.ye@bytedance.com>

Recently we made o_seqno atomic_t.  Stop special-casing TUNNEL_SEQ, and
always mark IP6GRE[TAP] devices as NETIF_F_LLTX, since we no longer need
the TX lock (&txq->_xmit_lock).

Signed-off-by: Peilin Ye <peilin.ye@bytedance.com>
---
 net/ipv6/ip6_gre.c | 34 +++++++++++++---------------------
 1 file changed, 13 insertions(+), 21 deletions(-)

diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c
index 5136959b3dc5..4e37f7c29900 100644
--- a/net/ipv6/ip6_gre.c
+++ b/net/ipv6/ip6_gre.c
@@ -382,11 +382,6 @@ static struct ip6_tnl *ip6gre_tunnel_locate(struct net *net,
 		goto failed_free;
 
 	ip6gre_tnl_link_config(nt, 1);
-
-	/* Can use a lockless transmit, unless we generate output sequences */
-	if (!(nt->parms.o_flags & TUNNEL_SEQ))
-		dev->features |= NETIF_F_LLTX;
-
 	ip6gre_tunnel_link(ign, nt);
 	return nt;
 
@@ -1445,26 +1440,23 @@ static void ip6gre_tunnel_setup(struct net_device *dev)
 static void ip6gre_tnl_init_features(struct net_device *dev)
 {
 	struct ip6_tnl *nt = netdev_priv(dev);
+	__be16 flags;
 
-	dev->features		|= GRE6_FEATURES;
+	dev->features		|= GRE6_FEATURES | NETIF_F_LLTX;
 	dev->hw_features	|= GRE6_FEATURES;
 
-	if (!(nt->parms.o_flags & TUNNEL_SEQ)) {
-		/* TCP offload with GRE SEQ is not supported, nor
-		 * can we support 2 levels of outer headers requiring
-		 * an update.
-		 */
-		if (!(nt->parms.o_flags & TUNNEL_CSUM) ||
-		    nt->encap.type == TUNNEL_ENCAP_NONE) {
-			dev->features    |= NETIF_F_GSO_SOFTWARE;
-			dev->hw_features |= NETIF_F_GSO_SOFTWARE;
-		}
+	flags = nt->parms.o_flags;
 
-		/* Can use a lockless transmit, unless we generate
-		 * output sequences
-		 */
-		dev->features |= NETIF_F_LLTX;
-	}
+	/* TCP offload with GRE SEQ is not supported, nor can we support 2
+	 * levels of outer headers requiring an update.
+	 */
+	if (flags & TUNNEL_SEQ)
+		return;
+	if (flags & TUNNEL_CSUM && nt->encap.type != TUNNEL_ENCAP_NONE)
+		return;
+
+	dev->features |= NETIF_F_GSO_SOFTWARE;
+	dev->hw_features |= NETIF_F_GSO_SOFTWARE;
 }
 
 static int ip6gre_tunnel_init_common(struct net_device *dev)
-- 
2.20.1


^ permalink raw reply related

* Re: [net-next v1] net: Add a second bind table hashed by port and address
From: Eric Dumazet @ 2022-04-21 22:50 UTC (permalink / raw)
  To: Joanne Koong; +Cc: netdev, Martin KaFai Lau, David Miller, Jakub Kicinski
In-Reply-To: <20220421221449.1817041-1-joannelkoong@gmail.com>

On Thu, Apr 21, 2022 at 3:16 PM Joanne Koong <joannelkoong@gmail.com> wrote:
>
> We currently have one tcp bind table (bhash) which hashes by port
> number only. In the socket bind path, we check for bind conflicts by
> traversing the specified port's inet_bind2_bucket while holding the
> bucket's spinlock (see inet_csk_get_port() and inet_csk_bind_conflict()).
>
> In instances where there are tons of sockets hashed to the same port
> at different addresses, checking for a bind conflict is time-intensive
> and can cause softirq cpu lockups, as well as stops new tcp connections
> since __inet_inherit_port() also contests for the spinlock.
>
> This patch proposes adding a second bind table, bhash2, that hashes by
> port and ip address. Searching the bhash2 table leads to significantly
> faster conflict resolution and less time holding the spinlock.
> When experimentally testing this on a local server, the results for how
> long a bind request takes were as follows:
>
> when there are ~24k sockets already bound to the port -
>
> ipv4:
> before - 0.002317 seconds
> with bhash2 - 0.000018 seconds
>
> ipv6:
> before - 0.002431 seconds
> with bhash2 - 0.000021 seconds


Hi Joanne

Do you have a test for this ? Are you using 24k IPv6 addresses on the host ?

I fear we add some extra code and cost for quite an unusual configuration.

Thanks.

>
> when there are ~12 million sockets already bound to the port -
>
> ipv4:
> before - 7.498583 seconds
> with bhash2 - 0.000021 seconds
>
> ipv6:
> before - 7.813554 seconds
> with bhash2 - 0.000029 seconds
>
> Signed-off-by: Joanne Koong <joannelkoong@gmail.com>
> ---
>  include/net/inet_connection_sock.h |   3 +
>  include/net/inet_hashtables.h      |  56 ++++++-
>  include/net/sock.h                 |  14 ++
>  net/dccp/proto.c                   |  14 +-
>  net/ipv4/inet_connection_sock.c    | 227 +++++++++++++++++++++--------
>  net/ipv4/inet_hashtables.c         | 188 ++++++++++++++++++++++--
>  net/ipv4/tcp.c                     |  14 +-
>  7 files changed, 438 insertions(+), 78 deletions(-)
>
> diff --git a/include/net/inet_connection_sock.h b/include/net/inet_connection_sock.h
> index 3908296d103f..d89a78d10294 100644
> --- a/include/net/inet_connection_sock.h
> +++ b/include/net/inet_connection_sock.h
> @@ -25,6 +25,7 @@
>  #undef INET_CSK_CLEAR_TIMERS
>
>  struct inet_bind_bucket;
> +struct inet_bind2_bucket;
>  struct tcp_congestion_ops;
>
>  /*
> @@ -57,6 +58,7 @@ struct inet_connection_sock_af_ops {
>   *
>   * @icsk_accept_queue:    FIFO of established children
>   * @icsk_bind_hash:       Bind node
> + * @icsk_bind2_hash:      Bind node in the bhash2 table
>   * @icsk_timeout:         Timeout
>   * @icsk_retransmit_timer: Resend (no ack)
>   * @icsk_rto:             Retransmit timeout
> @@ -84,6 +86,7 @@ struct inet_connection_sock {
>         struct inet_sock          icsk_inet;
>         struct request_sock_queue icsk_accept_queue;
>         struct inet_bind_bucket   *icsk_bind_hash;
> +       struct inet_bind2_bucket  *icsk_bind2_hash;
>         unsigned long             icsk_timeout;
>         struct timer_list         icsk_retransmit_timer;
>         struct timer_list         icsk_delack_timer;
> diff --git a/include/net/inet_hashtables.h b/include/net/inet_hashtables.h
> index f72ec113ae56..143a33d815c2 100644
> --- a/include/net/inet_hashtables.h
> +++ b/include/net/inet_hashtables.h
> @@ -90,11 +90,30 @@ struct inet_bind_bucket {
>         struct hlist_head       owners;
>  };
>
> +struct inet_bind2_bucket {
> +       possible_net_t          ib_net;
> +       int                     l3mdev;
> +       unsigned short          port;
> +       union {
> +#if IS_ENABLED(CONFIG_IPV6)
> +               struct in6_addr         v6_rcv_saddr;
> +#endif
> +               __be32                  rcv_saddr;
> +       };
> +       struct hlist_node       node;           /* Node in the inet2_bind_hashbucket chain */
> +       struct hlist_head       owners;         /* List of sockets hashed to this bucket */
> +};
> +
>  static inline struct net *ib_net(struct inet_bind_bucket *ib)
>  {
>         return read_pnet(&ib->ib_net);
>  }
>
> +static inline struct net *ib2_net(struct inet_bind2_bucket *ib)
> +{
> +       return read_pnet(&ib->ib_net);
> +}
> +
>  #define inet_bind_bucket_for_each(tb, head) \
>         hlist_for_each_entry(tb, head, node)
>
> @@ -103,6 +122,15 @@ struct inet_bind_hashbucket {
>         struct hlist_head       chain;
>  };
>
> +/* This is synchronized using the inet_bind_hashbucket's spinlock.
> + * Instead of having separate spinlocks, the inet_bind2_hashbucket can share
> + * the inet_bind_hashbucket's given that in every case where the bhash2 table
> + * is useful, a lookup in the bhash table also occurs.
> + */
> +struct inet_bind2_hashbucket {
> +       struct hlist_head       chain;
> +};
> +
>  /* Sockets can be hashed in established or listening table.
>   * We must use different 'nulls' end-of-chain value for all hash buckets :
>   * A socket might transition from ESTABLISH to LISTEN state without
> @@ -138,6 +166,11 @@ struct inet_hashinfo {
>          */
>         struct kmem_cache               *bind_bucket_cachep;
>         struct inet_bind_hashbucket     *bhash;
> +       /* The 2nd binding table hashed by port and address.
> +        * This is used primarily for expediting the resolution of bind conflicts.
> +        */
> +       struct kmem_cache               *bind2_bucket_cachep;
> +       struct inet_bind2_hashbucket    *bhash2;
>         unsigned int                    bhash_size;
>
>         /* The 2nd listener table hashed by local port and address */
> @@ -221,6 +254,27 @@ inet_bind_bucket_create(struct kmem_cache *cachep, struct net *net,
>  void inet_bind_bucket_destroy(struct kmem_cache *cachep,
>                               struct inet_bind_bucket *tb);
>
> +static inline bool check_bind_bucket_match(struct inet_bind_bucket *tb, struct net *net,
> +                                          const unsigned short port, int l3mdev)
> +{
> +       return net_eq(ib_net(tb), net) && tb->port == port && tb->l3mdev == l3mdev;
> +}
> +
> +struct inet_bind2_bucket *
> +inet_bind2_bucket_create(struct kmem_cache *cachep, struct net *net,
> +                        struct inet_bind2_hashbucket *head, const unsigned short port,
> +                        int l3mdev, const struct sock *sk);
> +
> +void inet_bind2_bucket_destroy(struct kmem_cache *cachep, struct inet_bind2_bucket *tb);
> +
> +struct inet_bind2_bucket *
> +inet_bind2_bucket_find(struct inet_hashinfo *hinfo, struct net *net, const unsigned short port,
> +                      int l3mdev, struct sock *sk, struct inet_bind2_hashbucket **head);
> +
> +bool check_bind2_bucket_match_nulladdr(struct inet_bind2_bucket *tb, struct net *net,
> +                                      const unsigned short port, int l3mdev,
> +                                      const struct sock *sk);
> +
>  static inline u32 inet_bhashfn(const struct net *net, const __u16 lport,
>                                const u32 bhash_size)
>  {
> @@ -228,7 +282,7 @@ static inline u32 inet_bhashfn(const struct net *net, const __u16 lport,
>  }
>
>  void inet_bind_hash(struct sock *sk, struct inet_bind_bucket *tb,
> -                   const unsigned short snum);
> +                   struct inet_bind2_bucket *tb2, const unsigned short snum);
>
>  /* These can have wildcards, don't try too hard. */
>  static inline u32 inet_lhashfn(const struct net *net, const unsigned short num)
> diff --git a/include/net/sock.h b/include/net/sock.h
> index c4b91fc19b9c..a2198d5674f6 100644
> --- a/include/net/sock.h
> +++ b/include/net/sock.h
> @@ -352,6 +352,7 @@ struct sk_filter;
>    *    @sk_txtime_report_errors: set report errors mode for SO_TXTIME
>    *    @sk_txtime_unused: unused txtime flags
>    *    @ns_tracker: tracker for netns reference
> +  *    @sk_bind2_node: bind node in the bhash2 table
>    */
>  struct sock {
>         /*
> @@ -542,6 +543,7 @@ struct sock {
>  #endif
>         struct rcu_head         sk_rcu;
>         netns_tracker           ns_tracker;
> +       struct hlist_node       sk_bind2_node;
>  };
>
>  enum sk_pacing {
> @@ -822,6 +824,16 @@ static inline void sk_add_bind_node(struct sock *sk,
>         hlist_add_head(&sk->sk_bind_node, list);
>  }
>
> +static inline void __sk_del_bind2_node(struct sock *sk)
> +{
> +       __hlist_del(&sk->sk_bind2_node);
> +}
> +
> +static inline void sk_add_bind2_node(struct sock *sk, struct hlist_head *list)
> +{
> +       hlist_add_head(&sk->sk_bind2_node, list);
> +}
> +
>  #define sk_for_each(__sk, list) \
>         hlist_for_each_entry(__sk, list, sk_node)
>  #define sk_for_each_rcu(__sk, list) \
> @@ -839,6 +851,8 @@ static inline void sk_add_bind_node(struct sock *sk,
>         hlist_for_each_entry_safe(__sk, tmp, list, sk_node)
>  #define sk_for_each_bound(__sk, list) \
>         hlist_for_each_entry(__sk, list, sk_bind_node)
> +#define sk_for_each_bound_bhash2(__sk, list) \
> +       hlist_for_each_entry(__sk, list, sk_bind2_node)
>
>  /**
>   * sk_for_each_entry_offset_rcu - iterate over a list at a given struct offset
> diff --git a/net/dccp/proto.c b/net/dccp/proto.c
> index a976b4d29892..e65768370170 100644
> --- a/net/dccp/proto.c
> +++ b/net/dccp/proto.c
> @@ -1121,6 +1121,12 @@ static int __init dccp_init(void)
>                                   SLAB_HWCACHE_ALIGN | SLAB_ACCOUNT, NULL);
>         if (!dccp_hashinfo.bind_bucket_cachep)
>                 goto out_free_hashinfo2;
> +       dccp_hashinfo.bind2_bucket_cachep =
> +               kmem_cache_create("dccp_bind2_bucket",
> +                                 sizeof(struct inet_bind2_bucket), 0,
> +                                 SLAB_HWCACHE_ALIGN | SLAB_ACCOUNT, NULL);
> +       if (!dccp_hashinfo.bind2_bucket_cachep)
> +               goto out_free_bind_bucket_cachep;
>
>         /*
>          * Size and allocate the main established and bind bucket
> @@ -1151,7 +1157,7 @@ static int __init dccp_init(void)
>
>         if (!dccp_hashinfo.ehash) {
>                 DCCP_CRIT("Failed to allocate DCCP established hash table");
> -               goto out_free_bind_bucket_cachep;
> +               goto out_free_bind2_bucket_cachep;
>         }
>
>         for (i = 0; i <= dccp_hashinfo.ehash_mask; i++)
> @@ -1170,6 +1176,8 @@ static int __init dccp_init(void)
>                         continue;
>                 dccp_hashinfo.bhash = (struct inet_bind_hashbucket *)
>                         __get_free_pages(GFP_ATOMIC|__GFP_NOWARN, bhash_order);
> +               dccp_hashinfo.bhash2 = (struct inet_bind2_hashbucket *)
> +                       __get_free_pages(GFP_ATOMIC | __GFP_NOWARN, bhash_order);
>         } while (!dccp_hashinfo.bhash && --bhash_order >= 0);
>
>         if (!dccp_hashinfo.bhash) {
> @@ -1180,6 +1188,7 @@ static int __init dccp_init(void)
>         for (i = 0; i < dccp_hashinfo.bhash_size; i++) {
>                 spin_lock_init(&dccp_hashinfo.bhash[i].lock);
>                 INIT_HLIST_HEAD(&dccp_hashinfo.bhash[i].chain);
> +               INIT_HLIST_HEAD(&dccp_hashinfo.bhash2[i].chain);
>         }
>
>         rc = dccp_mib_init();
> @@ -1214,6 +1223,8 @@ static int __init dccp_init(void)
>         inet_ehash_locks_free(&dccp_hashinfo);
>  out_free_dccp_ehash:
>         free_pages((unsigned long)dccp_hashinfo.ehash, ehash_order);
> +out_free_bind2_bucket_cachep:
> +       kmem_cache_destroy(dccp_hashinfo.bind2_bucket_cachep);
>  out_free_bind_bucket_cachep:
>         kmem_cache_destroy(dccp_hashinfo.bind_bucket_cachep);
>  out_free_hashinfo2:
> @@ -1222,6 +1233,7 @@ static int __init dccp_init(void)
>         dccp_hashinfo.bhash = NULL;
>         dccp_hashinfo.ehash = NULL;
>         dccp_hashinfo.bind_bucket_cachep = NULL;
> +       dccp_hashinfo.bind2_bucket_cachep = NULL;
>         return rc;
>  }
>
> diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
> index 1e5b53c2bb26..482935f0c8f6 100644
> --- a/net/ipv4/inet_connection_sock.c
> +++ b/net/ipv4/inet_connection_sock.c
> @@ -117,6 +117,30 @@ bool inet_rcv_saddr_any(const struct sock *sk)
>         return !sk->sk_rcv_saddr;
>  }
>
> +static bool use_bhash2_on_bind(const struct sock *sk)
> +{
> +#if IS_ENABLED(CONFIG_IPV6)
> +       int addr_type;
> +
> +       if (sk->sk_family == AF_INET6) {
> +               addr_type = ipv6_addr_type(&sk->sk_v6_rcv_saddr);
> +               return addr_type != IPV6_ADDR_ANY && addr_type != IPV6_ADDR_MAPPED;
> +       }
> +#endif
> +       return sk->sk_rcv_saddr != htonl(INADDR_ANY);
> +}
> +
> +static u32 get_bhash2_nulladdr_hash(const struct sock *sk, struct net *net, int port)
> +{
> +#if IS_ENABLED(CONFIG_IPV6)
> +       struct in6_addr nulladdr = {};
> +
> +       if (sk->sk_family == AF_INET6)
> +               return ipv6_portaddr_hash(net, &nulladdr, port);
> +#endif
> +       return ipv4_portaddr_hash(net, 0, port);
> +}
> +
>  void inet_get_local_port_range(struct net *net, int *low, int *high)
>  {
>         unsigned int seq;
> @@ -130,16 +154,58 @@ void inet_get_local_port_range(struct net *net, int *low, int *high)
>  }
>  EXPORT_SYMBOL(inet_get_local_port_range);
>
> -static int inet_csk_bind_conflict(const struct sock *sk,
> -                                 const struct inet_bind_bucket *tb,
> -                                 bool relax, bool reuseport_ok)
> +static bool bind_conflict_exist(const struct sock *sk, struct sock *sk2,
> +                               kuid_t sk_uid, bool relax, bool reuseport_cb_ok,
> +                               bool reuseport_ok)
> +{
> +       if (sk != sk2 && (!sk->sk_bound_dev_if || !sk2->sk_bound_dev_if ||
> +                         sk->sk_bound_dev_if == sk2->sk_bound_dev_if)) {
> +               if (sk->sk_reuse && sk2->sk_reuse && sk2->sk_state != TCP_LISTEN) {
> +                       if (!relax || (!reuseport_ok && sk->sk_reuseport && sk2->sk_reuseport &&
> +                                      reuseport_cb_ok && (sk2->sk_state == TCP_TIME_WAIT ||
> +                                                          uid_eq(sk_uid, sock_i_uid(sk2)))))
> +                               return true;
> +               } else if (!reuseport_ok || !sk->sk_reuseport || !sk2->sk_reuseport ||
> +                          !reuseport_cb_ok || (sk2->sk_state != TCP_TIME_WAIT &&
> +                                               !uid_eq(sk_uid, sock_i_uid(sk2)))) {
> +                       return true;
> +               }
> +       }
> +       return false;
> +}
> +
> +static bool check_bhash2_conflict(const struct sock *sk, struct inet_bind2_bucket *tb2,
> +                                 kuid_t sk_uid, bool relax, bool reuseport_cb_ok,
> +                                 bool reuseport_ok)
>  {
>         struct sock *sk2;
> -       bool reuseport_cb_ok;
> -       bool reuse = sk->sk_reuse;
> -       bool reuseport = !!sk->sk_reuseport;
> -       struct sock_reuseport *reuseport_cb;
> +
> +       sk_for_each_bound_bhash2(sk2, &tb2->owners) {
> +               if (sk->sk_family == AF_INET && ipv6_only_sock(sk2))
> +                       continue;
> +
> +               if (bind_conflict_exist(sk, sk2, sk_uid, relax,
> +                                       reuseport_cb_ok, reuseport_ok))
> +                       return true;
> +       }
> +       return false;
> +}
> +
> +/* This should be called only when the corresponding inet_bind_bucket spinlock is held */
> +static int inet_csk_bind_conflict(const struct sock *sk, int port,
> +                                 struct inet_bind_bucket *tb,
> +                                 struct inet_bind2_bucket *tb2, /* may be null */
> +                                 bool relax, bool reuseport_ok)
> +{
> +       struct inet_hashinfo *hinfo = sk->sk_prot->h.hashinfo;
>         kuid_t uid = sock_i_uid((struct sock *)sk);
> +       struct sock_reuseport *reuseport_cb;
> +       struct inet_bind2_hashbucket *head2;
> +       bool reuseport_cb_ok;
> +       struct sock *sk2;
> +       struct net *net;
> +       int l3mdev;
> +       u32 hash;
>
>         rcu_read_lock();
>         reuseport_cb = rcu_dereference(sk->sk_reuseport_cb);
> @@ -150,36 +216,40 @@ static int inet_csk_bind_conflict(const struct sock *sk,
>         /*
>          * Unlike other sk lookup places we do not check
>          * for sk_net here, since _all_ the socks listed
> -        * in tb->owners list belong to the same net - the
> -        * one this bucket belongs to.
> +        * in tb->owners and tb2->owners list belong
> +        * to the same net
>          */
>
> -       sk_for_each_bound(sk2, &tb->owners) {
> -               if (sk != sk2 &&
> -                   (!sk->sk_bound_dev_if ||
> -                    !sk2->sk_bound_dev_if ||
> -                    sk->sk_bound_dev_if == sk2->sk_bound_dev_if)) {
> -                       if (reuse && sk2->sk_reuse &&
> -                           sk2->sk_state != TCP_LISTEN) {
> -                               if ((!relax ||
> -                                    (!reuseport_ok &&
> -                                     reuseport && sk2->sk_reuseport &&
> -                                     reuseport_cb_ok &&
> -                                     (sk2->sk_state == TCP_TIME_WAIT ||
> -                                      uid_eq(uid, sock_i_uid(sk2))))) &&
> -                                   inet_rcv_saddr_equal(sk, sk2, true))
> -                                       break;
> -                       } else if (!reuseport_ok ||
> -                                  !reuseport || !sk2->sk_reuseport ||
> -                                  !reuseport_cb_ok ||
> -                                  (sk2->sk_state != TCP_TIME_WAIT &&
> -                                   !uid_eq(uid, sock_i_uid(sk2)))) {
> -                               if (inet_rcv_saddr_equal(sk, sk2, true))
> -                                       break;
> -                       }
> -               }
> +       if (!use_bhash2_on_bind(sk)) {
> +               sk_for_each_bound(sk2, &tb->owners)
> +                       if (bind_conflict_exist(sk, sk2, uid, relax,
> +                                               reuseport_cb_ok, reuseport_ok) &&
> +                           inet_rcv_saddr_equal(sk, sk2, true))
> +                               return true;
> +
> +               return false;
>         }
> -       return sk2 != NULL;
> +
> +       if (tb2 && check_bhash2_conflict(sk, tb2, uid, relax, reuseport_cb_ok, reuseport_ok))
> +               return true;
> +
> +       net = sock_net(sk);
> +
> +       /* check there's no conflict with an existing IPV6_ADDR_ANY (if ipv6) or
> +        * INADDR_ANY (if ipv4) socket.
> +        */
> +       hash = get_bhash2_nulladdr_hash(sk, net, port);
> +       head2 = &hinfo->bhash2[hash & (hinfo->bhash_size - 1)];
> +
> +       l3mdev = inet_sk_bound_l3mdev(sk);
> +       inet_bind_bucket_for_each(tb2, &head2->chain)
> +               if (check_bind2_bucket_match_nulladdr(tb2, net, port, l3mdev, sk))
> +                       break;
> +
> +       if (tb2 && check_bhash2_conflict(sk, tb2, uid, relax, reuseport_cb_ok, reuseport_ok))
> +               return true;
> +
> +       return false;
>  }
>
>  /*
> @@ -187,16 +257,20 @@ static int inet_csk_bind_conflict(const struct sock *sk,
>   * inet_bind_hashbucket lock held.
>   */
>  static struct inet_bind_hashbucket *
> -inet_csk_find_open_port(struct sock *sk, struct inet_bind_bucket **tb_ret, int *port_ret)
> +inet_csk_find_open_port(struct sock *sk, struct inet_bind_bucket **tb_ret,
> +                       struct inet_bind2_bucket **tb2_ret,
> +                       struct inet_bind2_hashbucket **head2_ret, int *port_ret)
>  {
>         struct inet_hashinfo *hinfo = sk->sk_prot->h.hashinfo;
> -       int port = 0;
> +       struct inet_bind2_hashbucket *head2;
>         struct inet_bind_hashbucket *head;
>         struct net *net = sock_net(sk);
> -       bool relax = false;
>         int i, low, high, attempt_half;
> +       struct inet_bind2_bucket *tb2;
>         struct inet_bind_bucket *tb;
>         u32 remaining, offset;
> +       bool relax = false;
> +       int port = 0;
>         int l3mdev;
>
>         l3mdev = inet_sk_bound_l3mdev(sk);
> @@ -235,10 +309,11 @@ inet_csk_find_open_port(struct sock *sk, struct inet_bind_bucket **tb_ret, int *
>                 head = &hinfo->bhash[inet_bhashfn(net, port,
>                                                   hinfo->bhash_size)];
>                 spin_lock_bh(&head->lock);
> +               tb2 = inet_bind2_bucket_find(hinfo, net, port, l3mdev, sk, &head2);
>                 inet_bind_bucket_for_each(tb, &head->chain)
> -                       if (net_eq(ib_net(tb), net) && tb->l3mdev == l3mdev &&
> -                           tb->port == port) {
> -                               if (!inet_csk_bind_conflict(sk, tb, relax, false))
> +                       if (check_bind_bucket_match(tb, net, port, l3mdev)) {
> +                               if (!inet_csk_bind_conflict(sk, port, tb, tb2,
> +                                                           relax, false))
>                                         goto success;
>                                 goto next_port;
>                         }
> @@ -268,6 +343,8 @@ inet_csk_find_open_port(struct sock *sk, struct inet_bind_bucket **tb_ret, int *
>  success:
>         *port_ret = port;
>         *tb_ret = tb;
> +       *tb2_ret = tb2;
> +       *head2_ret = head2;
>         return head;
>  }
>
> @@ -363,54 +440,77 @@ int inet_csk_get_port(struct sock *sk, unsigned short snum)
>  {
>         bool reuse = sk->sk_reuse && sk->sk_state != TCP_LISTEN;
>         struct inet_hashinfo *hinfo = sk->sk_prot->h.hashinfo;
> -       int ret = 1, port = snum;
> +       bool bhash_created = false, bhash2_created = false;
> +       struct inet_bind2_bucket *tb2 = NULL;
> +       struct inet_bind2_hashbucket *head2;
> +       struct inet_bind_bucket *tb = NULL;
>         struct inet_bind_hashbucket *head;
>         struct net *net = sock_net(sk);
> -       struct inet_bind_bucket *tb = NULL;
> +       int ret = 1, port = snum;
> +       bool found_port = false;
>         int l3mdev;
>
>         l3mdev = inet_sk_bound_l3mdev(sk);
>
>         if (!port) {
> -               head = inet_csk_find_open_port(sk, &tb, &port);
> +               head = inet_csk_find_open_port(sk, &tb, &tb2, &head2, &port);
>                 if (!head)
>                         return ret;
> +               if (tb && tb2)
> +                       goto success;
> +               found_port = true;
> +       } else {
> +               head = &hinfo->bhash[inet_bhashfn(net, port,
> +                                                 hinfo->bhash_size)];
> +               spin_lock_bh(&head->lock);
> +               inet_bind_bucket_for_each(tb, &head->chain)
> +                       if (check_bind_bucket_match(tb, net, port, l3mdev))
> +                               break;
> +
> +               tb2 = inet_bind2_bucket_find(hinfo, net, port, l3mdev, sk, &head2);
> +       }
> +
> +       if (!tb) {
> +               tb = inet_bind_bucket_create(hinfo->bind_bucket_cachep, net, head,
> +                                            port, l3mdev);
>                 if (!tb)
> -                       goto tb_not_found;
> -               goto success;
> +                       goto fail_unlock;
> +               bhash_created = true;
> +       }
> +
> +       if (!tb2) {
> +               tb2 = inet_bind2_bucket_create(hinfo->bind2_bucket_cachep,
> +                                              net, head2, port, l3mdev, sk);
> +               if (!tb2)
> +                       goto fail_unlock;
> +               bhash2_created = true;
>         }
> -       head = &hinfo->bhash[inet_bhashfn(net, port,
> -                                         hinfo->bhash_size)];
> -       spin_lock_bh(&head->lock);
> -       inet_bind_bucket_for_each(tb, &head->chain)
> -               if (net_eq(ib_net(tb), net) && tb->l3mdev == l3mdev &&
> -                   tb->port == port)
> -                       goto tb_found;
> -tb_not_found:
> -       tb = inet_bind_bucket_create(hinfo->bind_bucket_cachep,
> -                                    net, head, port, l3mdev);
> -       if (!tb)
> -               goto fail_unlock;
> -tb_found:
> -       if (!hlist_empty(&tb->owners)) {
> +
> +       /* If we had to find an open port, we already checked for conflicts */
> +       if (!found_port && !hlist_empty(&tb->owners)) {
>                 if (sk->sk_reuse == SK_FORCE_REUSE)
>                         goto success;
> -
>                 if ((tb->fastreuse > 0 && reuse) ||
>                     sk_reuseport_match(tb, sk))
>                         goto success;
> -               if (inet_csk_bind_conflict(sk, tb, true, true))
> +               if (inet_csk_bind_conflict(sk, port, tb, tb2, true, true))
>                         goto fail_unlock;
>         }
>  success:
>         inet_csk_update_fastreuse(tb, sk);
> -
>         if (!inet_csk(sk)->icsk_bind_hash)
> -               inet_bind_hash(sk, tb, port);
> +               inet_bind_hash(sk, tb, tb2, port);
>         WARN_ON(inet_csk(sk)->icsk_bind_hash != tb);
> +       WARN_ON(inet_csk(sk)->icsk_bind2_hash != tb2);
>         ret = 0;
>
>  fail_unlock:
> +       if (ret) {
> +               if (bhash_created)
> +                       inet_bind_bucket_destroy(hinfo->bind_bucket_cachep, tb);
> +               if (bhash2_created)
> +                       inet_bind2_bucket_destroy(hinfo->bind2_bucket_cachep, tb2);
> +       }
>         spin_unlock_bh(&head->lock);
>         return ret;
>  }
> @@ -957,6 +1057,7 @@ struct sock *inet_csk_clone_lock(const struct sock *sk,
>
>                 inet_sk_set_state(newsk, TCP_SYN_RECV);
>                 newicsk->icsk_bind_hash = NULL;
> +               newicsk->icsk_bind2_hash = NULL;
>
>                 inet_sk(newsk)->inet_dport = inet_rsk(req)->ir_rmt_port;
>                 inet_sk(newsk)->inet_num = inet_rsk(req)->ir_num;
> diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c
> index 17440840a791..9f0bece06609 100644
> --- a/net/ipv4/inet_hashtables.c
> +++ b/net/ipv4/inet_hashtables.c
> @@ -81,6 +81,41 @@ struct inet_bind_bucket *inet_bind_bucket_create(struct kmem_cache *cachep,
>         return tb;
>  }
>
> +struct inet_bind2_bucket *inet_bind2_bucket_create(struct kmem_cache *cachep,
> +                                                  struct net *net,
> +                                                  struct inet_bind2_hashbucket *head,
> +                                                  const unsigned short port,
> +                                                  int l3mdev,
> +                                                  const struct sock *sk)
> +{
> +       struct inet_bind2_bucket *tb = kmem_cache_alloc(cachep, GFP_ATOMIC);
> +
> +       if (tb) {
> +               write_pnet(&tb->ib_net, net);
> +               tb->l3mdev    = l3mdev;
> +               tb->port      = port;
> +#if IS_ENABLED(CONFIG_IPV6)
> +               if (sk->sk_family == AF_INET6)
> +                       tb->v6_rcv_saddr = sk->sk_v6_rcv_saddr;
> +               else
> +#endif
> +                       tb->rcv_saddr = sk->sk_rcv_saddr;
> +               INIT_HLIST_HEAD(&tb->owners);
> +               hlist_add_head(&tb->node, &head->chain);
> +       }
> +       return tb;
> +}
> +
> +static bool bind2_bucket_addr_match(struct inet_bind2_bucket *tb2, struct sock *sk)
> +{
> +#if IS_ENABLED(CONFIG_IPV6)
> +       if (sk->sk_family == AF_INET6)
> +               return ipv6_addr_equal(&tb2->v6_rcv_saddr,
> +                                      &sk->sk_v6_rcv_saddr);
> +#endif
> +       return tb2->rcv_saddr == sk->sk_rcv_saddr;
> +}
> +
>  /*
>   * Caller must hold hashbucket lock for this tb with local BH disabled
>   */
> @@ -92,12 +127,25 @@ void inet_bind_bucket_destroy(struct kmem_cache *cachep, struct inet_bind_bucket
>         }
>  }
>
> +/* Caller must hold the lock for the corresponding hashbucket in the bhash table
> + * with local BH disabled
> + */
> +void inet_bind2_bucket_destroy(struct kmem_cache *cachep, struct inet_bind2_bucket *tb)
> +{
> +       if (hlist_empty(&tb->owners)) {
> +               __hlist_del(&tb->node);
> +               kmem_cache_free(cachep, tb);
> +       }
> +}
> +
>  void inet_bind_hash(struct sock *sk, struct inet_bind_bucket *tb,
> -                   const unsigned short snum)
> +                   struct inet_bind2_bucket *tb2, const unsigned short snum)
>  {
>         inet_sk(sk)->inet_num = snum;
>         sk_add_bind_node(sk, &tb->owners);
>         inet_csk(sk)->icsk_bind_hash = tb;
> +       sk_add_bind2_node(sk, &tb2->owners);
> +       inet_csk(sk)->icsk_bind2_hash = tb2;
>  }
>
>  /*
> @@ -109,6 +157,7 @@ static void __inet_put_port(struct sock *sk)
>         const int bhash = inet_bhashfn(sock_net(sk), inet_sk(sk)->inet_num,
>                         hashinfo->bhash_size);
>         struct inet_bind_hashbucket *head = &hashinfo->bhash[bhash];
> +       struct inet_bind2_bucket *tb2;
>         struct inet_bind_bucket *tb;
>
>         spin_lock(&head->lock);
> @@ -117,6 +166,13 @@ static void __inet_put_port(struct sock *sk)
>         inet_csk(sk)->icsk_bind_hash = NULL;
>         inet_sk(sk)->inet_num = 0;
>         inet_bind_bucket_destroy(hashinfo->bind_bucket_cachep, tb);
> +
> +       if (inet_csk(sk)->icsk_bind2_hash) {
> +               tb2 = inet_csk(sk)->icsk_bind2_hash;
> +               __sk_del_bind2_node(sk);
> +               inet_csk(sk)->icsk_bind2_hash = NULL;
> +               inet_bind2_bucket_destroy(hashinfo->bind2_bucket_cachep, tb2);
> +       }
>         spin_unlock(&head->lock);
>  }
>
> @@ -133,14 +189,19 @@ int __inet_inherit_port(const struct sock *sk, struct sock *child)
>         struct inet_hashinfo *table = sk->sk_prot->h.hashinfo;
>         unsigned short port = inet_sk(child)->inet_num;
>         const int bhash = inet_bhashfn(sock_net(sk), port,
> -                       table->bhash_size);
> +                                      table->bhash_size);
>         struct inet_bind_hashbucket *head = &table->bhash[bhash];
> +       struct inet_bind2_hashbucket *head_bhash2;
> +       bool created_inet_bind_bucket = false;
> +       struct net *net = sock_net(sk);
> +       struct inet_bind2_bucket *tb2;
>         struct inet_bind_bucket *tb;
>         int l3mdev;
>
>         spin_lock(&head->lock);
>         tb = inet_csk(sk)->icsk_bind_hash;
> -       if (unlikely(!tb)) {
> +       tb2 = inet_csk(sk)->icsk_bind2_hash;
> +       if (unlikely(!tb || !tb2)) {
>                 spin_unlock(&head->lock);
>                 return -ENOENT;
>         }
> @@ -153,25 +214,45 @@ int __inet_inherit_port(const struct sock *sk, struct sock *child)
>                  * as that of the child socket. We have to look up or
>                  * create a new bind bucket for the child here. */
>                 inet_bind_bucket_for_each(tb, &head->chain) {
> -                       if (net_eq(ib_net(tb), sock_net(sk)) &&
> -                           tb->l3mdev == l3mdev && tb->port == port)
> +                       if (check_bind_bucket_match(tb, net, port, l3mdev))
>                                 break;
>                 }
>                 if (!tb) {
>                         tb = inet_bind_bucket_create(table->bind_bucket_cachep,
> -                                                    sock_net(sk), head, port,
> -                                                    l3mdev);
> +                                                    net, head, port, l3mdev);
>                         if (!tb) {
>                                 spin_unlock(&head->lock);
>                                 return -ENOMEM;
>                         }
> +                       created_inet_bind_bucket = true;
>                 }
>                 inet_csk_update_fastreuse(tb, child);
> +
> +               goto bhash2_find;
> +       } else if (!bind2_bucket_addr_match(tb2, child)) {
> +               l3mdev = inet_sk_bound_l3mdev(sk);
> +
> +bhash2_find:
> +               tb2 = inet_bind2_bucket_find(table, net, port, l3mdev, child,
> +                                            &head_bhash2);
> +               if (!tb2) {
> +                       tb2 = inet_bind2_bucket_create(table->bind2_bucket_cachep,
> +                                                      net, head_bhash2, port, l3mdev,
> +                                                      child);
> +                       if (!tb2)
> +                               goto error;
> +               }
>         }
> -       inet_bind_hash(child, tb, port);
> +       inet_bind_hash(child, tb, tb2, port);
>         spin_unlock(&head->lock);
>
>         return 0;
> +
> +error:
> +       if (created_inet_bind_bucket)
> +               inet_bind_bucket_destroy(table->bind_bucket_cachep, tb);
> +       spin_unlock(&head->lock);
> +       return -ENOMEM;
>  }
>  EXPORT_SYMBOL_GPL(__inet_inherit_port);
>
> @@ -722,6 +803,71 @@ void inet_unhash(struct sock *sk)
>  }
>  EXPORT_SYMBOL_GPL(inet_unhash);
>
> +static inline bool check_bind2_bucket_match(struct inet_bind2_bucket *tb, struct net *net,
> +                                           unsigned short port, int l3mdev, struct sock *sk)
> +{
> +#if IS_ENABLED(CONFIG_IPV6)
> +       if (sk->sk_family == AF_INET6)
> +               return net_eq(ib2_net(tb), net) && tb->port == port && tb->l3mdev == l3mdev &&
> +                       ipv6_addr_equal(&tb->v6_rcv_saddr, &sk->sk_v6_rcv_saddr);
> +       else
> +#endif
> +               return net_eq(ib2_net(tb), net) && tb->port == port && tb->l3mdev == l3mdev &&
> +                       tb->rcv_saddr == sk->sk_rcv_saddr;
> +}
> +
> +bool check_bind2_bucket_match_nulladdr(struct inet_bind2_bucket *tb, struct net *net,
> +                                      const unsigned short port, int l3mdev, const struct sock *sk)
> +{
> +#if IS_ENABLED(CONFIG_IPV6)
> +       struct in6_addr nulladdr = {};
> +
> +       if (sk->sk_family == AF_INET6)
> +               return net_eq(ib2_net(tb), net) && tb->port == port && tb->l3mdev == l3mdev &&
> +                       ipv6_addr_equal(&tb->v6_rcv_saddr, &nulladdr);
> +       else
> +#endif
> +               return net_eq(ib2_net(tb), net) && tb->port == port && tb->l3mdev == l3mdev &&
> +                       tb->rcv_saddr == 0;
> +}
> +
> +static struct inet_bind2_hashbucket *
> +inet_bhashfn_portaddr(struct inet_hashinfo *hinfo, const struct sock *sk,
> +                     const struct net *net, unsigned short port)
> +{
> +       u32 hash;
> +
> +#if IS_ENABLED(CONFIG_IPV6)
> +       if (sk->sk_family == AF_INET6)
> +               hash = ipv6_portaddr_hash(net, &sk->sk_v6_rcv_saddr, port);
> +       else
> +#endif
> +               hash = ipv4_portaddr_hash(net, sk->sk_rcv_saddr, port);
> +       return &hinfo->bhash2[hash & (hinfo->bhash_size - 1)];
> +}
> +
> +/* This should only be called when the spinlock for the socket's corresponding
> + * bind_hashbucket is held
> + */
> +struct inet_bind2_bucket *
> +inet_bind2_bucket_find(struct inet_hashinfo *hinfo, struct net *net, const unsigned short port,
> +                      int l3mdev, struct sock *sk, struct inet_bind2_hashbucket **head)
> +{
> +       struct inet_bind2_bucket *bhash2 = NULL;
> +       struct inet_bind2_hashbucket *h;
> +
> +       h = inet_bhashfn_portaddr(hinfo, sk, net, port);
> +       inet_bind_bucket_for_each(bhash2, &h->chain) {
> +               if (check_bind2_bucket_match(bhash2, net, port, l3mdev, sk))
> +                       break;
> +       }
> +
> +       if (head)
> +               *head = h;
> +
> +       return bhash2;
> +}
> +
>  /* RFC 6056 3.3.4.  Algorithm 4: Double-Hash Port Selection Algorithm
>   * Note that we use 32bit integers (vs RFC 'short integers')
>   * because 2^16 is not a multiple of num_ephemeral and this
> @@ -740,10 +886,13 @@ int __inet_hash_connect(struct inet_timewait_death_row *death_row,
>  {
>         struct inet_hashinfo *hinfo = death_row->hashinfo;
>         struct inet_timewait_sock *tw = NULL;
> +       struct inet_bind2_hashbucket *head2;
>         struct inet_bind_hashbucket *head;
>         int port = inet_sk(sk)->inet_num;
>         struct net *net = sock_net(sk);
> +       struct inet_bind2_bucket *tb2;
>         struct inet_bind_bucket *tb;
> +       bool tb_created = false;
>         u32 remaining, offset;
>         int ret, i, low, high;
>         int l3mdev;
> @@ -797,8 +946,7 @@ int __inet_hash_connect(struct inet_timewait_death_row *death_row,
>                  * the established check is already unique enough.
>                  */
>                 inet_bind_bucket_for_each(tb, &head->chain) {
> -                       if (net_eq(ib_net(tb), net) && tb->l3mdev == l3mdev &&
> -                           tb->port == port) {
> +                       if (check_bind_bucket_match(tb, net, port, l3mdev)) {
>                                 if (tb->fastreuse >= 0 ||
>                                     tb->fastreuseport >= 0)
>                                         goto next_port;
> @@ -816,6 +964,7 @@ int __inet_hash_connect(struct inet_timewait_death_row *death_row,
>                         spin_unlock_bh(&head->lock);
>                         return -ENOMEM;
>                 }
> +               tb_created = true;
>                 tb->fastreuse = -1;
>                 tb->fastreuseport = -1;
>                 goto ok;
> @@ -831,6 +980,17 @@ int __inet_hash_connect(struct inet_timewait_death_row *death_row,
>         return -EADDRNOTAVAIL;
>
>  ok:
> +       /* Find the corresponding tb2 bucket since we need to
> +        * add the socket to the bhash2 table as well
> +        */
> +       tb2 = inet_bind2_bucket_find(hinfo, net, port, l3mdev, sk, &head2);
> +       if (!tb2) {
> +               tb2 = inet_bind2_bucket_create(hinfo->bind2_bucket_cachep, net,
> +                                              head2, port, l3mdev, sk);
> +               if (!tb2)
> +                       goto error;
> +       }
> +
>         /* If our first attempt found a candidate, skip next candidate
>          * in 1/16 of cases to add some noise.
>          */
> @@ -839,7 +999,7 @@ int __inet_hash_connect(struct inet_timewait_death_row *death_row,
>         WRITE_ONCE(table_perturb[index], READ_ONCE(table_perturb[index]) + i + 2);
>
>         /* Head lock still held and bh's disabled */
> -       inet_bind_hash(sk, tb, port);
> +       inet_bind_hash(sk, tb, tb2, port);
>         if (sk_unhashed(sk)) {
>                 inet_sk(sk)->inet_sport = htons(port);
>                 inet_ehash_nolisten(sk, (struct sock *)tw, NULL);
> @@ -851,6 +1011,12 @@ int __inet_hash_connect(struct inet_timewait_death_row *death_row,
>                 inet_twsk_deschedule_put(tw);
>         local_bh_enable();
>         return 0;
> +
> +error:
> +       if (tb_created)
> +               inet_bind_bucket_destroy(hinfo->bind_bucket_cachep, tb);
> +       spin_unlock_bh(&head->lock);
> +       return -ENOMEM;
>  }
>
>  /*
> diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
> index cf18fbcbf123..5a143c9afd20 100644
> --- a/net/ipv4/tcp.c
> +++ b/net/ipv4/tcp.c
> @@ -4627,6 +4627,12 @@ void __init tcp_init(void)
>                                   SLAB_HWCACHE_ALIGN | SLAB_PANIC |
>                                   SLAB_ACCOUNT,
>                                   NULL);
> +       tcp_hashinfo.bind2_bucket_cachep =
> +               kmem_cache_create("tcp_bind2_bucket",
> +                                 sizeof(struct inet_bind2_bucket), 0,
> +                                 SLAB_HWCACHE_ALIGN | SLAB_PANIC |
> +                                 SLAB_ACCOUNT,
> +                                 NULL);
>
>         /* Size and allocate the main established and bind bucket
>          * hash tables.
> @@ -4649,8 +4655,9 @@ void __init tcp_init(void)
>         if (inet_ehash_locks_alloc(&tcp_hashinfo))
>                 panic("TCP: failed to alloc ehash_locks");
>         tcp_hashinfo.bhash =
> -               alloc_large_system_hash("TCP bind",
> -                                       sizeof(struct inet_bind_hashbucket),
> +               alloc_large_system_hash("TCP bind bhash tables",
> +                                       sizeof(struct inet_bind_hashbucket) +
> +                                       sizeof(struct inet_bind2_hashbucket),
>                                         tcp_hashinfo.ehash_mask + 1,
>                                         17, /* one slot per 128 KB of memory */
>                                         0,
> @@ -4659,9 +4666,12 @@ void __init tcp_init(void)
>                                         0,
>                                         64 * 1024);
>         tcp_hashinfo.bhash_size = 1U << tcp_hashinfo.bhash_size;
> +       tcp_hashinfo.bhash2 =
> +               (struct inet_bind2_hashbucket *)(tcp_hashinfo.bhash + tcp_hashinfo.bhash_size);
>         for (i = 0; i < tcp_hashinfo.bhash_size; i++) {
>                 spin_lock_init(&tcp_hashinfo.bhash[i].lock);
>                 INIT_HLIST_HEAD(&tcp_hashinfo.bhash[i].chain);
> +               INIT_HLIST_HEAD(&tcp_hashinfo.bhash2[i].chain);
>         }
>
>
> --
> 2.30.2
>

^ permalink raw reply

* Re: [PATCH v2 bpf 07/11] samples/bpf: fix uin64_t format literals
From: Alexander Lobakin @ 2022-04-21 22:55 UTC (permalink / raw)
  To: David Laight
  Cc: Alexander Lobakin, Alexei Starovoitov, Daniel Borkmann,
	Andrii Nakryiko, Maciej Fijalkowski, Song Liu,
	Kumar Kartikeya Dwivedi, bpf@vger.kernel.org,
	netdev@vger.kernel.org, linux-kernel@vger.kernel.org
In-Reply-To: <ca0733f123bf498a831324c4692a0df8@AcuMS.aculab.com>

From: David Laight <David.Laight@ACULAB.COM>
Date: Thu, 21 Apr 2022 07:46:05 +0000

> From: Alexander Lobakin
> > Sent: 21 April 2022 01:40
> >
> > There's a couple places where uin64_t is being passed as an %lu
> > format argument. That type is defined as unsigned long on 64-bit
> > systems and as unsigned long long on 32-bit, so neither %lu nor
> > %llu are not universal.
> > One of the options is %PRIu64, but since it's always 8-byte long,
> > just cast it to the _proper_ __u64 and print as %llu.
>
> Is __u64 guaranteed to be 'unsigned long long' ? No reason why it should be.
> I think you need to cast to (unsigned long long).

__u64 can be unsigned long only if an architecture uses int-l64.h
instead of int-ll64.h. This is currently possible for Alpha and
PPC64 when __SANE_USERSPACE_TYPES__ is not defined -- I guess you
know what that flag does.
I messed up a bit and didn't notice that samples/bpf/Makefile
defines this flag only for MIPS. IMO it should be defined in here
unconditionally, but I guess it's out of bpf-fixes scope, so I'll
go with unsigned long long in v3 (got to resend with no PGP crap
anyway lol).

>
> 	David

--- 8< ---

> Registered Address Lakeside, Bramley Road, Mount Farm, Milton Keynes, MK1 1PT, UK
> Registration No: 1397386 (Wales)

Thanks,
Al


^ permalink raw reply

* [PATCH net 0/2] Fix Ocelot VLAN regressions introduced by FDB isolation
From: Vladimir Oltean @ 2022-04-21 23:01 UTC (permalink / raw)
  To: netdev
  Cc: Jakub Kicinski, David S. Miller, Paolo Abeni, Florian Fainelli,
	Andrew Lunn, Vivien Didelot, Vladimir Oltean, Claudiu Manoil,
	Alexandre Belloni, UNGLinuxDriver

There are 2 regressions in the VLAN handling code of the ocelot/felix
DSA driver which can be seen when running the bridge_vlan_aware.sh
selftest. These manifest in the form of valid VLAN configurations being
rejected by the driver with incorrect extack messages.

First regression occurs when we attempt to install an egress-untagged
bridge VLAN to a bridge port that was brought up *while* it was under a
bridge (not before).

The second regression occurs when a port leaves a VLAN-aware bridge and
then re-joins it.

Both issues can be traced back to the recent commit which added VLAN
based FDB isolation in the ocelot driver.

Vladimir Oltean (2):
  net: mscc: ocelot: ignore VID 0 added by 8021q module
  net: mscc: ocelot: don't add VID 0 to ocelot->vlans when leaving
    VLAN-aware bridge

 drivers/net/ethernet/mscc/ocelot.c | 14 ++++++++++++--
 1 file changed, 12 insertions(+), 2 deletions(-)

-- 
2.25.1


^ permalink raw reply

* [PATCH net 1/2] net: mscc: ocelot: ignore VID 0 added by 8021q module
From: Vladimir Oltean @ 2022-04-21 23:01 UTC (permalink / raw)
  To: netdev
  Cc: Jakub Kicinski, David S. Miller, Paolo Abeni, Florian Fainelli,
	Andrew Lunn, Vivien Didelot, Vladimir Oltean, Claudiu Manoil,
	Alexandre Belloni, UNGLinuxDriver
In-Reply-To: <20220421230105.3570690-1-vladimir.oltean@nxp.com>

Both the felix DSA driver and ocelot switchdev driver declare
dev->features & NETIF_F_HW_VLAN_CTAG_FILTER under certain circumstances*,
so the 8021q module will add VID 0 to our RX filter when the port goes
up, to ensure 802.1p traffic is not dropped.

We treat VID 0 as a special value (OCELOT_STANDALONE_PVID) which
deliberately does not have a struct ocelot_bridge_vlan associated with
it. Instead, this gets programmed to the VLAN table in ocelot_vlan_init().

If we allow external calls to modify VID 0, we reach the following
situation:

 # ip link add br0 type bridge vlan_filtering 1 && ip link set br0 up
 # ip link set swp0 master br0
 # ip link set swp0 up # this adds VID 0 to ocelot->vlans with untagged=false
bridge vlan
port              vlan-id
swp0              1 PVID Egress Untagged # the bridge also adds VID 1
br0               1 PVID Egress Untagged
 # bridge vlan add dev swp0 vid 100 untagged
Error: mscc_ocelot_switch_lib: Port with egress-tagged VLANs cannot have more than one egress-untagged (native) VLAN.

This configuration should have been accepted, because
ocelot_port_manage_port_tag() should select OCELOT_PORT_TAG_NATIVE.
Yet it isn't, because we have an entry in ocelot->vlans which says
VID 0 should be egress-tagged, something the hardware can't do.

Fix this by suppressing additions/deletions on VID 0 and managing this
VLAN exclusively using OCELOT_STANDALONE_PVID.

*DSA toggles it when the port becomes VLAN-aware by joining a VLAN-aware
bridge. Ocelot declares it unconditionally for some reason.

Fixes: 54c319846086 ("net: mscc: ocelot: enforce FDB isolation when VLAN-unaware")
Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com>
---
 drivers/net/ethernet/mscc/ocelot.c | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/drivers/net/ethernet/mscc/ocelot.c b/drivers/net/ethernet/mscc/ocelot.c
index ee9c607d62a7..951c4529f6cd 100644
--- a/drivers/net/ethernet/mscc/ocelot.c
+++ b/drivers/net/ethernet/mscc/ocelot.c
@@ -629,6 +629,13 @@ int ocelot_vlan_add(struct ocelot *ocelot, int port, u16 vid, bool pvid,
 {
 	int err;
 
+	/* Ignore VID 0 added to our RX filter by the 8021q module, since
+	 * that collides with OCELOT_STANDALONE_PVID and changes it from
+	 * egress-untagged to egress-tagged.
+	 */
+	if (!vid)
+		return 0;
+
 	err = ocelot_vlan_member_add(ocelot, port, vid, untagged);
 	if (err)
 		return err;
@@ -651,6 +658,9 @@ int ocelot_vlan_del(struct ocelot *ocelot, int port, u16 vid)
 	bool del_pvid = false;
 	int err;
 
+	if (!vid)
+		return 0;
+
 	if (ocelot_port->pvid_vlan && ocelot_port->pvid_vlan->vid == vid)
 		del_pvid = true;
 
-- 
2.25.1


^ permalink raw reply related

* [PATCH net 2/2] net: mscc: ocelot: don't add VID 0 to ocelot->vlans when leaving VLAN-aware bridge
From: Vladimir Oltean @ 2022-04-21 23:01 UTC (permalink / raw)
  To: netdev
  Cc: Jakub Kicinski, David S. Miller, Paolo Abeni, Florian Fainelli,
	Andrew Lunn, Vivien Didelot, Vladimir Oltean, Claudiu Manoil,
	Alexandre Belloni, UNGLinuxDriver
In-Reply-To: <20220421230105.3570690-1-vladimir.oltean@nxp.com>

DSA, through dsa_port_bridge_leave(), first notifies the port of the
fact that it left a bridge, then, if that bridge was VLAN-aware, it
notifies the port of the change in VLAN awareness state, towards
VLAN-unaware mode.

So ocelot_port_vlan_filtering() can be called when ocelot_port->bridge
is NULL, and this makes ocelot_add_vlan_unaware_pvid() create a struct
ocelot_bridge_vlan with a vid of 0 and an "untagged" setting of true on
that port.

In a way this structure correctly reflects the reality, but by design,
VID 0 (OCELOT_STANDALONE_PVID) was not meant to be kept in the bridge
VLAN list of the driver, but managed separately.

Having OCELOT_STANDALONE_PVID in ocelot->vlans makes us trip up on
several sanity checks that did not expect to have this VID there.
For example, after we leave a VLAN-aware bridge and we re-join it, we
can no longer program egress-tagged VLANs to hardware:

 # ip link add br0 type bridge vlan_filtering 1 && ip link set br0 up
 # ip link set swp0 master br0
 # ip link set swp0 nomaster
 # ip link set swp0 master br0
 # bridge vlan add dev swp0 vid 100
Error: mscc_ocelot_switch_lib: Port with more than one egress-untagged VLAN cannot have egress-tagged VLANs.

But this configuration is in fact supported by the hardware, since we
could use OCELOT_PORT_TAG_NATIVE. According to its comment:

/* all VLANs except the native VLAN and VID 0 are egress-tagged */

yet when assessing the eligibility for this mode, we do not check for
VID 0 in ocelot_port_uses_native_vlan(), instead we just ensure that
ocelot_port_num_untagged_vlans() == 1. This is simply because VID 0
doesn't have a bridge VLAN structure.

The way I identify the problem is that ocelot_port_vlan_filtering(false)
only means to call ocelot_add_vlan_unaware_pvid() when we dynamically
turn off VLAN awareness for a bridge we are under, and the PVID changes
from the bridge PVID to a reserved PVID based on the bridge number.

Since OCELOT_STANDALONE_PVID is statically added to the VLAN table
during ocelot_vlan_init() and never removed afterwards, calling
ocelot_add_vlan_unaware_pvid() for it is not intended and does not serve
any purpose.

Fix the issue by avoiding the call to ocelot_add_vlan_unaware_pvid(vid=0)
when we're resetting VLAN awareness after leaving the bridge, to become
a standalone port.

Fixes: 54c319846086 ("net: mscc: ocelot: enforce FDB isolation when VLAN-unaware")
Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com>
---
 drivers/net/ethernet/mscc/ocelot.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/mscc/ocelot.c b/drivers/net/ethernet/mscc/ocelot.c
index 951c4529f6cd..ca71b62a44dc 100644
--- a/drivers/net/ethernet/mscc/ocelot.c
+++ b/drivers/net/ethernet/mscc/ocelot.c
@@ -551,7 +551,7 @@ int ocelot_port_vlan_filtering(struct ocelot *ocelot, int port,
 	struct ocelot_vcap_block *block = &ocelot->block[VCAP_IS1];
 	struct ocelot_port *ocelot_port = ocelot->ports[port];
 	struct ocelot_vcap_filter *filter;
-	int err;
+	int err = 0;
 	u32 val;
 
 	list_for_each_entry(filter, &block->rules, list) {
@@ -570,7 +570,7 @@ int ocelot_port_vlan_filtering(struct ocelot *ocelot, int port,
 	if (vlan_aware)
 		err = ocelot_del_vlan_unaware_pvid(ocelot, port,
 						   ocelot_port->bridge);
-	else
+	else if (ocelot_port->bridge)
 		err = ocelot_add_vlan_unaware_pvid(ocelot, port,
 						   ocelot_port->bridge);
 	if (err)
-- 
2.25.1


^ permalink raw reply related

* Re: [PATCH v2 bpf 00/11] bpf: random unpopular userspace fixes (32 bit et al)
From: Toke Høiland-Jørgensen @ 2022-04-21 23:17 UTC (permalink / raw)
  To: Alexander Lobakin, Alexei Starovoitov
  Cc: Alexander Lobakin, Daniel Borkmann, Andrii Nakryiko,
	Maciej Fijalkowski, Song Liu, Kumar Kartikeya Dwivedi, bpf,
	Network Development, LKML
In-Reply-To: <20220421223201.322686-1-alobakin@pm.me>

Alexander Lobakin <alobakin@pm.me> writes:

> From: Alexei Starovoitov <alexei.starovoitov@gmail.com>
> Date: Wed, 20 Apr 2022 17:40:34 -0700
>
>> On Wed, Apr 20, 2022 at 5:38 PM Alexander Lobakin <alobakin@pm.me> wrote:
>>
>> Again?
>>
>> -----BEGIN PGP MESSAGE-----
>> Version: ProtonMail
>>
>> wcFMA165ASBBe6s8AQ/8C9y4TqXgASA5xBT7UIf2GyTQRjKWcy/6kT1dkjkF
>> FldAOhehhgLYjLJzNAIkecOQfz/XNapW3GdrQDq11pq9Bzs1SJJekGXlHVIW
>>
>> Sorry I'm tossing the series out of patchwork.
>
> Oh sorry, I was hoping upgrading Bridge would help >_<
>
> Let me know if you're reading this particular message in your inbox
> finely. Toke guessed it precisely regarding the per-recipient lists
> -- Proton by default saves every address I've ever sent mails to to
> Contacts and then tries to fetch PGP public keys for each contact.
> Again, for some reason, for a couple addresses, including
> ast@kernel.org, it managed to fetch something, but that something
> was sorta broken. So at the end I've been having broken PGP for
> the address I've never manually set or ev
> en wanted PGP.
> If it's still messed, I'll contact support then. Sorry again for
> this.

Heh, yeah, now that I was in the direct Cc list, I got your message in
encrypted form as well. So, erm, I'm reading it "fine" now that I
figured out how to get my MUA to decrypt it. Probably not what you want
for patch submissions, though... :P

-Toke

^ permalink raw reply

* Re: [PATCH net] net: dsa: flood multicast to CPU when slave has IFF_PROMISC
From: Florian Fainelli @ 2022-04-21 23:20 UTC (permalink / raw)
  To: Vladimir Oltean, netdev
  Cc: Jakub Kicinski, David S. Miller, Paolo Abeni, Andrew Lunn,
	Vivien Didelot, Vladimir Oltean
In-Reply-To: <20220421224222.3563522-1-vladimir.oltean@nxp.com>

On 4/21/22 15:42, Vladimir Oltean wrote:
> Certain DSA switches can eliminate flooding to the CPU when none of the
> ports have the IFF_ALLMULTI or IFF_PROMISC flags set. This is done by
> synthesizing a call to dsa_port_bridge_flags() for the CPU port, a call
> which normally comes from the bridge driver via switchdev.
> 
> The bridge port flags and IFF_PROMISC|IFF_ALLMULTI have slightly
> different semantics, and due to inattention/lack of proper testing, the
> IFF_PROMISC flag allows unknown unicast to be flooded to the CPU, but
> not unknown multicast.
> 
> This must be fixed by setting both BR_FLOOD (unicast) and BR_MCAST_FLOOD
> in the synthesized dsa_port_bridge_flags() call, since IFF_PROMISC means
> that packets should not be filtered regardless of their MAC DA.
> 
> Fixes: 7569459a52c9 ("net: dsa: manage flooding on the CPU ports")
> Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com>

Reviewed-by: Florian Fainelli <f.fainelli@gmail.com>
-- 
Florian

^ permalink raw reply

* Re: [PATCH] cgroup: don't queue css_release_work if one already pending
From: Tejun Heo @ 2022-04-21 23:43 UTC (permalink / raw)
  To: Tadeusz Struk
  Cc: Michal Koutný, cgroups, Zefan Li, Johannes Weiner,
	Christian Brauner, Alexei Starovoitov, Daniel Borkmann,
	Andrii Nakryiko, Martin KaFai Lau, Song Liu, Yonghong Song,
	John Fastabend, KP Singh, netdev, bpf, stable, linux-kernel,
	syzbot+e42ae441c3b10acf9e9d
In-Reply-To: <584183e2-2473-6185-e07d-f478da118b87@linaro.org>

Hello,

On Thu, Apr 14, 2022 at 10:51:18AM -0700, Tadeusz Struk wrote:
> What happened was, the write triggered:
> cgroup_subtree_control_write()->cgroup_apply_control()->cgroup_apply_control_enable()->css_create()
> 
> which, allocates and initializes the css, then fails in cgroup_idr_alloc(),
> bails out and calls queue_rcu_work(cgroup_destroy_wq, &css->destroy_rwork);

Yes, but this css hasn't been installed yet.

> then cgroup_subtree_control_write() bails out to out_unlock:, which then goes:
> 
> cgroup_kn_unlock()->cgroup_put()->css_put()->percpu_ref_put(&css->refcnt)->percpu_ref_put_many(ref)

And this is a different css. cgroup->self which isn't connected to the half
built css which got destroyed in css_create().

So, I have a bit of difficulty following this scenario. The way that the
current code uses destroy_work is definitely nasty and it'd probably be a
good idea to separate out the different use cases, but let's first
understand what's failing.

Thanks.

-- 
tejun

^ permalink raw reply

* Re: IPv6 multicast with VRF
From: David Ahern @ 2022-04-21 23:44 UTC (permalink / raw)
  To: Vladimir Oltean; +Cc: netdev
In-Reply-To: <20220421092429.waykidesd7de4q3o@skbuf>

On 4/21/22 3:24 AM, Vladimir Oltean wrote:
>>>  ip -6 route get ff02::1%eth0
>>> Error: inet6 prefix is expected rather than "ff02::1%eth0".
>>
>> ip -6 ro get oif eth0 ff02::1
>>
>> (too many syntax differences between tools)
> 
> Could you explain why specifying the oif is needed here? If I don't do

multicast and linklocal are local to a device, so you need to specify
which interface to use.

> it, I still can't find the route. Either that, or what would an
> application need to do to find the route from the VRF FIB?

applications bind their sockets to a VRF device or a port device.


^ permalink raw reply

* Re: [PATCH] cgroup: don't queue css_release_work if one already pending
From: Tejun Heo @ 2022-04-22  0:00 UTC (permalink / raw)
  To: Michal Koutný
  Cc: Tadeusz Struk, cgroups, Zefan Li, Johannes Weiner,
	Christian Brauner, Alexei Starovoitov, Daniel Borkmann,
	Andrii Nakryiko, Martin KaFai Lau, Song Liu, Yonghong Song,
	John Fastabend, KP Singh, netdev, bpf, stable, linux-kernel,
	syzbot+e42ae441c3b10acf9e9d
In-Reply-To: <20220414164409.GA5404@blackbody.suse.cz>

On Thu, Apr 14, 2022 at 06:44:09PM +0200, Michal Koutný wrote:
> I suspect the double-queuing is a result of the fact that there exists
> only the single reference to the css->refcnt. I.e. it's
> percpu_ref_kill_and_confirm()'d and released both at the same time.
> 
> (Normally (when not killing the last reference), css->destroy_work reuse
> is not a problem because of the sequenced chain
> css_killed_work_fn()->css_put()->css_release().)

If this is the case, we need to hold an extra reference to be put by the
css_killed_work_fn(), right?

Thanks.

-- 
tejun

^ permalink raw reply

* Re: [PATCH] WireGuard: restrict packet handling to non-isolated CPUs.
From: Jason A. Donenfeld @ 2022-04-22  0:02 UTC (permalink / raw)
  To: Charles-Francois Natali
  Cc: wireguard, netdev, linux-crypto, Daniel Jordan, Steffen Klassert
In-Reply-To: <20220405212129.2270-1-cf.natali@gmail.com>

netdev@ - Original thread is at
https://lore.kernel.org/wireguard/20220405212129.2270-1-cf.natali@gmail.com/

Hi Charles-François,

On Tue, Apr 05, 2022 at 10:21:29PM +0100, Charles-Francois Natali wrote:
> WireGuard currently uses round-robin to dispatch the handling of
> packets, handling them on all online CPUs, including isolated ones
> (isolcpus).
> 
> This is unfortunate because it causes significant latency on isolated
> CPUs - see e.g. below over 240 usec:
> 
> kworker/47:1-2373323 [047] 243644.756405: funcgraph_entry: |
> process_one_work() { kworker/47:1-2373323 [047] 243644.756406:
> funcgraph_entry: | wg_packet_decrypt_worker() { [...]
> kworker/47:1-2373323 [047] 243644.756647: funcgraph_exit: 0.591 us | }
> kworker/47:1-2373323 [047] 243644.756647: funcgraph_exit: ! 242.655 us
> | }
> 
> Instead, restrict to non-isolated CPUs.

Huh, interesting... I haven't seen this feature before. What's the
intended use case? To never run _anything_ on those cores except
processes you choose? To run some things but not intensive things? Is it
sort of a RT-lite?

I took a look in padata/pcrypt and it doesn't look like they're
examining the housekeeping mask at all. Grepping for
housekeeping_cpumask doesn't appear to show many results in things like
workqueues, but rather in core scheduling stuff. So I'm not quite sure
what to make of this patch.

I suspect the thing to do might be to patch both wireguard and padata,
and send a patch series to me, the padata people, and
netdev@vger.kernel.org, and we can all hash this out together.

Regarding your patch, is there a way to make that a bit more succinct,
without introducing all of those helper functions? It seems awfully
verbose for something that seems like a matter of replacing the online
mask with the housekeeping mask.

Jason

^ permalink raw reply

* Re: [PATCH] WireGuard: restrict packet handling to non-isolated CPUs.
From: Stephen Hemminger @ 2022-04-22  0:40 UTC (permalink / raw)
  To: Jason A. Donenfeld
  Cc: Charles-Francois Natali, wireguard, netdev, linux-crypto,
	Daniel Jordan, Steffen Klassert
In-Reply-To: <YmHwjdfZJJ2DeLTK@zx2c4.com>

On Fri, 22 Apr 2022 02:02:21 +0200
"Jason A. Donenfeld" <Jason@zx2c4.com> wrote:

> netdev@ - Original thread is at
> https://lore.kernel.org/wireguard/20220405212129.2270-1-cf.natali@gmail.com/
> 
> Hi Charles-François,
> 
> On Tue, Apr 05, 2022 at 10:21:29PM +0100, Charles-Francois Natali wrote:
> > WireGuard currently uses round-robin to dispatch the handling of
> > packets, handling them on all online CPUs, including isolated ones
> > (isolcpus).
> > 
> > This is unfortunate because it causes significant latency on isolated
> > CPUs - see e.g. below over 240 usec:
> > 
> > kworker/47:1-2373323 [047] 243644.756405: funcgraph_entry: |
> > process_one_work() { kworker/47:1-2373323 [047] 243644.756406:
> > funcgraph_entry: | wg_packet_decrypt_worker() { [...]
> > kworker/47:1-2373323 [047] 243644.756647: funcgraph_exit: 0.591 us | }
> > kworker/47:1-2373323 [047] 243644.756647: funcgraph_exit: ! 242.655 us
> > | }
> > 
> > Instead, restrict to non-isolated CPUs.  
> 
> Huh, interesting... I haven't seen this feature before. What's the
> intended use case? To never run _anything_ on those cores except
> processes you choose? To run some things but not intensive things? Is it
> sort of a RT-lite?
> 
> I took a look in padata/pcrypt and it doesn't look like they're
> examining the housekeeping mask at all. Grepping for
> housekeeping_cpumask doesn't appear to show many results in things like
> workqueues, but rather in core scheduling stuff. So I'm not quite sure
> what to make of this patch.
> 
> I suspect the thing to do might be to patch both wireguard and padata,
> and send a patch series to me, the padata people, and
> netdev@vger.kernel.org, and we can all hash this out together.
> 
> Regarding your patch, is there a way to make that a bit more succinct,
> without introducing all of those helper functions? It seems awfully
> verbose for something that seems like a matter of replacing the online
> mask with the housekeeping mask.
> 
> Jason

Applications like DPDK that do polling often use isolcpus or cgroups
to keep unwanted rabble off of their cpus.  Having wireguard use those
cpus seems bad.

^ permalink raw reply

* [PATCH v2 bpf-next 2/2] selftests/bpf: handle batch operations for map-in-map bpf-maps
From: Takshak Chahande @ 2022-04-22  0:50 UTC (permalink / raw)
  To: netdev, bpf; +Cc: andrii, ast, kernel-team, ctakshak, ndixit, kafai, andriin
In-Reply-To: <20220422005044.4099919-1-ctakshak@fb.com>

This patch adds up test cases that handles 4 combinations:
a) outer map: BPF_MAP_TYPE_ARRAY_OF_MAPS
   inner maps: BPF_MAP_TYPE_ARRAY and BPF_MAP_TYPE_HASH
b) outer map: BPF_MAP_TYPE_HASH_OF_MAPS
   inner maps: BPF_MAP_TYPE_ARRAY and BPF_MAP_TYPE_HASH

Changes v1 => v2:
1. Fixed no format arguments error (Andrii)

Signed-off-by: Takshak Chahande <ctakshak@fb.com>
---
 .../bpf/map_tests/map_in_map_batch_ops.c      | 232 ++++++++++++++++++
 1 file changed, 232 insertions(+)
 create mode 100644 tools/testing/selftests/bpf/map_tests/map_in_map_batch_ops.c

diff --git a/tools/testing/selftests/bpf/map_tests/map_in_map_batch_ops.c b/tools/testing/selftests/bpf/map_tests/map_in_map_batch_ops.c
new file mode 100644
index 000000000000..9b705ec9562e
--- /dev/null
+++ b/tools/testing/selftests/bpf/map_tests/map_in_map_batch_ops.c
@@ -0,0 +1,232 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <stdio.h>
+#include <errno.h>
+#include <string.h>
+#include <unistd.h>
+
+#include <bpf/bpf.h>
+#include <bpf/libbpf.h>
+
+#include <test_maps.h>
+
+#define OUTER_MAP_ENTRIES 10
+
+static __u32 get_map_id_from_fd(int map_fd)
+{
+	struct bpf_map_info map_info = {};
+	uint32_t info_len = sizeof(map_info);
+	int ret;
+
+	ret = bpf_obj_get_info_by_fd(map_fd, &map_info, &info_len);
+	CHECK(ret < 0, "Finding map info failed", "error:%s\n",
+	      strerror(errno));
+
+	return map_info.id;
+}
+
+/* This creates number of OUTER_MAP_ENTRIES maps that will be stored
+ * in outer map and return the created map_fds
+ */
+static void create_inner_maps(enum bpf_map_type map_type,
+			      __u32 *inner_map_fds)
+{
+	int map_fd, map_index, ret;
+	__u32 map_key = 0, map_id;
+	char map_name[15];
+
+	for (map_index = 0; map_index < OUTER_MAP_ENTRIES; map_index++) {
+		memset(map_name, 0, sizeof(map_name));
+		sprintf(map_name, "inner_map_fd_%d", map_index);
+		map_fd = bpf_map_create(map_type, map_name, sizeof(__u32),
+					sizeof(__u32), 1, NULL);
+		CHECK(map_fd < 0,
+		      "inner bpf_map_create() failed",
+		      "map_type=(%d) map_name(%s), error:%s\n",
+		      map_type, map_name, strerror(errno));
+
+		/* keep track of the inner map fd as it is required
+		 * to add records in outer map
+		 */
+		inner_map_fds[map_index] = map_fd;
+
+		/* Add entry into this created map
+		 * eg: map1 key = 0, value = map1's map id
+		 *     map2 key = 0, value = map2's map id
+		 */
+		map_id = get_map_id_from_fd(map_fd);
+		ret = bpf_map_update_elem(map_fd, &map_key, &map_id, 0);
+		CHECK(ret != 0,
+		      "bpf_map_update_elem failed",
+		      "map_type=(%d) map_name(%s), error:%s\n",
+		      map_type, map_name, strerror(errno));
+	}
+}
+
+static int create_outer_map(enum bpf_map_type map_type, __u32 inner_map_fd)
+{
+	int outer_map_fd;
+
+	LIBBPF_OPTS(bpf_map_create_opts, attr);
+	attr.inner_map_fd = inner_map_fd;
+	outer_map_fd = bpf_map_create(map_type, "outer_map", sizeof(__u32),
+				      sizeof(__u32), OUTER_MAP_ENTRIES,
+				      &attr);
+	CHECK(outer_map_fd < 0,
+	      "outer bpf_map_create()",
+	      "map_type=(%d), error:%s\n",
+	      map_type, strerror(errno));
+
+	return outer_map_fd;
+}
+
+static void validate_fetch_results(int outer_map_fd, __u32 *inner_map_fds,
+				   __u32 *fetched_keys, __u32 *fetched_values,
+				   __u32 max_entries_fetched)
+{
+	__u32 inner_map_key, inner_map_value;
+	int inner_map_fd, entry, err;
+	__u32 outer_map_value;
+
+	for (entry = 0; entry < max_entries_fetched; ++entry) {
+		outer_map_value = fetched_values[entry];
+		inner_map_fd = bpf_map_get_fd_by_id(outer_map_value);
+		CHECK(inner_map_fd < 0,
+		      "Failed to get inner map fd",
+		      "from id(%d), error=%s\n",
+		      outer_map_value, strerror(errno));
+		err = bpf_map_get_next_key(inner_map_fd, NULL, &inner_map_key);
+		CHECK(err != 0,
+		      "Failed to get inner map key",
+		      "error=%s\n", strerror(errno));
+
+		err = bpf_map_lookup_elem(inner_map_fd, &inner_map_key,
+					  &inner_map_value);
+		CHECK(err != 0,
+		      "Failed to get inner map value",
+		      "for key(%d), error=%s\n",
+		      inner_map_key, strerror(errno));
+
+		/* Actual value validation */
+		CHECK(outer_map_value != inner_map_value,
+		      "Failed to validate inner map value",
+		      "fetched(%d) and lookedup(%d)!\n",
+		      outer_map_value, inner_map_value);
+	}
+}
+
+static void fetch_and_validate(int outer_map_fd,
+			       __u32 *inner_map_fds,
+			       struct bpf_map_batch_opts *opts,
+			       __u32 batch_size, bool delete_entries)
+{
+	__u32 *fetched_keys, *fetched_values, fetched_entries = 0;
+	__u32 next_batch_key = 0, step_size = 5;
+	int err, retries = 0, max_retries = 3;
+	__u32 value_size = sizeof(__u32);
+
+	fetched_keys = calloc(batch_size, value_size);
+	fetched_values = calloc(batch_size, value_size);
+
+	while (fetched_entries < batch_size) {
+		err = delete_entries
+		      ? bpf_map_lookup_and_delete_batch(outer_map_fd,
+				      fetched_entries ? &next_batch_key : NULL,
+				      &next_batch_key,
+				      fetched_keys + fetched_entries,
+				      fetched_values + fetched_entries,
+				      &step_size, opts)
+		      : bpf_map_lookup_batch(outer_map_fd,
+				      fetched_entries ? &next_batch_key : NULL,
+				      &next_batch_key,
+				      fetched_keys + fetched_entries,
+				      fetched_values + fetched_entries,
+				      &step_size, opts);
+		CHECK((err < 0 && (errno != ENOENT && errno != ENOSPC)),
+		      "lookup with steps failed",
+		      "error: %s\n", strerror(errno));
+
+		fetched_entries += step_size;
+		/* retry for max_retries if ENOSPC */
+		if (errno == ENOSPC)
+			++retries;
+
+		if (retries >= max_retries)
+			break;
+	}
+
+	CHECK((fetched_entries != batch_size && err != ENOSPC),
+	      "Unable to fetch expected entries !",
+	      "fetched_entries(%d) and batch_size(%d) error: (%d):%s\n",
+	      fetched_entries, batch_size, errno, strerror(errno));
+
+	/* validate the fetched entries */
+	validate_fetch_results(outer_map_fd, inner_map_fds, fetched_keys,
+			       fetched_values, fetched_entries);
+	printf("batch_op is successful for batch_size(%d)\n", batch_size);
+
+	free(fetched_keys);
+	free(fetched_values);
+}
+
+static void _map_in_map_batch_ops(enum bpf_map_type outer_map_type,
+				  enum bpf_map_type inner_map_type)
+{
+	__u32 *outer_map_keys, *inner_map_fds;
+	__u32 max_entries = OUTER_MAP_ENTRIES;
+	__u32 value_size = sizeof(__u32);
+	int batch_size[2] = {5, 10};
+	__u32 map_index, op_index;
+	int outer_map_fd, ret;
+	DECLARE_LIBBPF_OPTS(bpf_map_batch_opts, opts,
+			    .elem_flags = 0,
+			    .flags = 0,
+	);
+
+	outer_map_keys = calloc(max_entries, value_size);
+	inner_map_fds = calloc(max_entries, value_size);
+	create_inner_maps(inner_map_type, inner_map_fds);
+
+	outer_map_fd = create_outer_map(outer_map_type, *inner_map_fds);
+	/* create outer map keys */
+	for (map_index = 0; map_index < max_entries; map_index++)
+		outer_map_keys[map_index] =
+			((outer_map_type == BPF_MAP_TYPE_ARRAY_OF_MAPS)
+			 ? 9 : 1000) - map_index;
+
+	/* batch operation - map_update */
+	ret = bpf_map_update_batch(outer_map_fd, outer_map_keys,
+				   inner_map_fds, &max_entries, &opts);
+	CHECK(ret != 0,
+	      "Failed to update the outer map batch ops",
+	      "error=%s\n", strerror(errno));
+
+	/* batch operation - map_lookup */
+	for (op_index = 0; op_index < 2; ++op_index)
+		fetch_and_validate(outer_map_fd, inner_map_fds, &opts,
+				   batch_size[op_index], false);
+
+	/* batch operation - map_lookup_delete */
+	if (outer_map_type == BPF_MAP_TYPE_HASH_OF_MAPS)
+		fetch_and_validate(outer_map_fd, inner_map_fds, &opts,
+				   max_entries, true /*delete*/);
+
+	free(inner_map_fds);
+	free(outer_map_keys);
+}
+
+void test_map_in_map_batch_ops_array(void)
+{
+	_map_in_map_batch_ops(BPF_MAP_TYPE_ARRAY_OF_MAPS, BPF_MAP_TYPE_ARRAY);
+	printf("%s:PASS with inner ARRAY map\n", __func__);
+	_map_in_map_batch_ops(BPF_MAP_TYPE_ARRAY_OF_MAPS, BPF_MAP_TYPE_HASH);
+	printf("%s:PASS with inner HASH map\n", __func__);
+}
+
+void test_map_in_map_batch_ops_hash(void)
+{
+	_map_in_map_batch_ops(BPF_MAP_TYPE_HASH_OF_MAPS, BPF_MAP_TYPE_ARRAY);
+	printf("%s:PASS with inner ARRAY map\n", __func__);
+	_map_in_map_batch_ops(BPF_MAP_TYPE_HASH_OF_MAPS, BPF_MAP_TYPE_HASH);
+	printf("%s:PASS with inner HASH map\n", __func__);
+}
-- 
2.30.2


^ permalink raw reply related

* [PATCH v2 bpf-next 1/2] bpf: Extend batch operations for map-in-map bpf-maps
From: Takshak Chahande @ 2022-04-22  0:50 UTC (permalink / raw)
  To: netdev, bpf; +Cc: andrii, ast, kernel-team, ctakshak, ndixit, kafai, andriin

This patch extends batch operations support for map-in-map map-types:
BPF_MAP_TYPE_HASH_OF_MAPS and BPF_MAP_TYPE_ARRAY_OF_MAPS

A usecase where outer HASH map holds hundred of VIP entries and its
associated reuse-ports per VIP stored in REUSEPORT_SOCKARRAY type
inner map, needs to do batch operation for performance gain.

This patch leverages the exiting generic functions for most of the batch
operations. As map-in-map's value contains the actual reference of the inner map,
for BPF_MAP_TYPE_HASH_OF_MAPS type, it needed an extra step to fetch the
map_id from the reference value.

selftests are added in next patch

Changes v1 => v2:
- Change is in selftests associated with this

Signed-off-by: Takshak Chahande <ctakshak@fb.com>
---
 kernel/bpf/arraymap.c |  2 ++
 kernel/bpf/hashtab.c  | 12 ++++++++++--
 2 files changed, 12 insertions(+), 2 deletions(-)

diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c
index 7f145aefbff8..f0852b6617cc 100644
--- a/kernel/bpf/arraymap.c
+++ b/kernel/bpf/arraymap.c
@@ -1344,6 +1344,8 @@ const struct bpf_map_ops array_of_maps_map_ops = {
 	.map_fd_put_ptr = bpf_map_fd_put_ptr,
 	.map_fd_sys_lookup_elem = bpf_map_fd_sys_lookup_elem,
 	.map_gen_lookup = array_of_map_gen_lookup,
+	.map_lookup_batch = generic_map_lookup_batch,
+	.map_update_batch = generic_map_update_batch,
 	.map_check_btf = map_check_no_btf,
 	.map_btf_name = "bpf_array",
 	.map_btf_id = &array_of_maps_map_btf_id,
diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
index c68fbebc8c00..fd537bfba84c 100644
--- a/kernel/bpf/hashtab.c
+++ b/kernel/bpf/hashtab.c
@@ -139,7 +139,7 @@ static inline bool htab_use_raw_lock(const struct bpf_htab *htab)
 
 static void htab_init_buckets(struct bpf_htab *htab)
 {
-	unsigned i;
+	unsigned int i;
 
 	for (i = 0; i < htab->n_buckets; i++) {
 		INIT_HLIST_NULLS_HEAD(&htab->buckets[i].head, i);
@@ -1594,7 +1594,7 @@ __htab_map_lookup_and_delete_batch(struct bpf_map *map,
 	void __user *uvalues = u64_to_user_ptr(attr->batch.values);
 	void __user *ukeys = u64_to_user_ptr(attr->batch.keys);
 	void __user *ubatch = u64_to_user_ptr(attr->batch.in_batch);
-	u32 batch, max_count, size, bucket_size;
+	u32 batch, max_count, size, bucket_size, map_id;
 	struct htab_elem *node_to_free = NULL;
 	u64 elem_map_flags, map_flags;
 	struct hlist_nulls_head *head;
@@ -1719,6 +1719,13 @@ __htab_map_lookup_and_delete_batch(struct bpf_map *map,
 			}
 		} else {
 			value = l->key + roundup_key_size;
+			if (map->map_type == BPF_MAP_TYPE_HASH_OF_MAPS) {
+				struct bpf_map **inner_map = value;
+				 /* Actual value is the id of the inner map */
+				map_id = map->ops->map_fd_sys_lookup_elem(*inner_map);
+				value = &map_id;
+			}
+
 			if (elem_map_flags & BPF_F_LOCK)
 				copy_map_value_locked(map, dst_val, value,
 						      true);
@@ -2425,6 +2432,7 @@ const struct bpf_map_ops htab_of_maps_map_ops = {
 	.map_fd_sys_lookup_elem = bpf_map_fd_sys_lookup_elem,
 	.map_gen_lookup = htab_of_map_gen_lookup,
 	.map_check_btf = map_check_no_btf,
+	BATCH_OPS(htab),
 	.map_btf_name = "bpf_htab",
 	.map_btf_id = &htab_of_maps_map_btf_id,
 };
-- 
2.30.2


^ permalink raw reply related

* Re: [PATCH v5 net-next 4/4] rtnetlink: return EINVAL when request cannot succeed
From: Vinicius Costa Gomes @ 2022-04-22  1:27 UTC (permalink / raw)
  To: Stephen Hemminger, Florent Fourcot
  Cc: netdev, cong.wang, edumazet, Brian Baboch
In-Reply-To: <20220415122601.0b793cb9@hermes.local>

Stephen Hemminger <stephen@networkplumber.org> writes:

> On Fri, 15 Apr 2022 18:53:30 +0200
> Florent Fourcot <florent.fourcot@wifirst.fr> wrote:
>
>> A request without interface name/interface index/interface group cannot
>> work. We should return EINVAL
>> 
>> Signed-off-by: Florent Fourcot <florent.fourcot@wifirst.fr>
>> Signed-off-by: Brian Baboch <brian.baboch@wifirst.fr>
>> ---
>>  net/core/rtnetlink.c | 2 +-
>>  1 file changed, 1 insertion(+), 1 deletion(-)
>> 
>> diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
>> index 73f2cbc440c9..b943336908a7 100644
>> --- a/net/core/rtnetlink.c
>> +++ b/net/core/rtnetlink.c
>> @@ -3457,7 +3457,7 @@ static int __rtnl_newlink(struct sk_buff *skb, struct nlmsghdr *nlh,
>>  			return rtnl_group_changelink(skb, net,
>>  						nla_get_u32(tb[IFLA_GROUP]),
>>  						ifm, extack, tb);
>> -		return -ENODEV;
>> +		return -EINVAL;
>
> Sometimes changing errno can be viewed as ABI change and break applications.

It seems that this is already breaking applications. iproute2 ip-link
depends on the returned error:

https://git.kernel.org/pub/scm/network/iproute2/iproute2.git/tree/ip/iplink.c#n221


Cheers,
-- 
Vinicius

^ permalink raw reply

* [PATCH] net/wireless: add debugfs exit function
From: Bernard Zhao @ 2022-04-22  1:28 UTC (permalink / raw)
  To: Jakub Kicinski, Kalle Valo, David S. Miller, Paolo Abeni,
	Matthias Brugger, linux-wireless, netdev, linux-arm-kernel,
	linux-mediatek, linux-kernel
  Cc: bernard, Bernard Zhao

This patch add exit debugfs function to mt7601u.
Debugfs need to be cleanup when module is unloaded or load fail.

Signed-off-by: Bernard Zhao <zhaojunkui2008@126.com>
---
 drivers/net/wireless/mediatek/mt7601u/debugfs.c | 9 +++++++--
 drivers/net/wireless/mediatek/mt7601u/init.c    | 1 +
 drivers/net/wireless/mediatek/mt7601u/mt7601u.h | 1 +
 3 files changed, 9 insertions(+), 2 deletions(-)

diff --git a/drivers/net/wireless/mediatek/mt7601u/debugfs.c b/drivers/net/wireless/mediatek/mt7601u/debugfs.c
index 20669eacb66e..5ae27aae685b 100644
--- a/drivers/net/wireless/mediatek/mt7601u/debugfs.c
+++ b/drivers/net/wireless/mediatek/mt7601u/debugfs.c
@@ -9,6 +9,8 @@
 #include "mt7601u.h"
 #include "eeprom.h"
 
+static struct dentry *dir;
+
 static int
 mt76_reg_set(void *data, u64 val)
 {
@@ -124,8 +126,6 @@ DEFINE_SHOW_ATTRIBUTE(mt7601u_eeprom_param);
 
 void mt7601u_init_debugfs(struct mt7601u_dev *dev)
 {
-	struct dentry *dir;
-
 	dir = debugfs_create_dir("mt7601u", dev->hw->wiphy->debugfsdir);
 	if (!dir)
 		return;
@@ -138,3 +138,8 @@ void mt7601u_init_debugfs(struct mt7601u_dev *dev)
 	debugfs_create_file("ampdu_stat", 0400, dir, dev, &mt7601u_ampdu_stat_fops);
 	debugfs_create_file("eeprom_param", 0400, dir, dev, &mt7601u_eeprom_param_fops);
 }
+
+void mt7601u_exit_debugfs(struct mt7601u_dev *dev)
+{
+	debugfs_remove(dir);
+}
diff --git a/drivers/net/wireless/mediatek/mt7601u/init.c b/drivers/net/wireless/mediatek/mt7601u/init.c
index 5d9e952b2966..eacdd5785fa6 100644
--- a/drivers/net/wireless/mediatek/mt7601u/init.c
+++ b/drivers/net/wireless/mediatek/mt7601u/init.c
@@ -427,6 +427,7 @@ void mt7601u_cleanup(struct mt7601u_dev *dev)
 	mt7601u_stop_hardware(dev);
 	mt7601u_dma_cleanup(dev);
 	mt7601u_mcu_cmd_deinit(dev);
+	mt7601u_exit_debugfs(dev);
 }
 
 struct mt7601u_dev *mt7601u_alloc_device(struct device *pdev)
diff --git a/drivers/net/wireless/mediatek/mt7601u/mt7601u.h b/drivers/net/wireless/mediatek/mt7601u/mt7601u.h
index a122f1dd38f6..a77bfef0d39f 100644
--- a/drivers/net/wireless/mediatek/mt7601u/mt7601u.h
+++ b/drivers/net/wireless/mediatek/mt7601u/mt7601u.h
@@ -279,6 +279,7 @@ struct mt7601u_rxwi;
 extern const struct ieee80211_ops mt7601u_ops;
 
 void mt7601u_init_debugfs(struct mt7601u_dev *dev);
+void mt7601u_exit_debugfs(struct mt7601u_dev *dev);
 
 u32 mt7601u_rr(struct mt7601u_dev *dev, u32 offset);
 void mt7601u_wr(struct mt7601u_dev *dev, u32 offset, u32 val);
-- 
2.33.1


^ permalink raw reply related

* Re: [PATCH net-next] net/af_packet: add VLAN support for AF_PACKET SOCK_RAW GSO
From: Hangbin Liu @ 2022-04-22  2:08 UTC (permalink / raw)
  To: Willem de Bruijn
  Cc: netdev, Michael S . Tsirkin, Jason Wang, David S . Miller,
	Jakub Kicinski, Paolo Abeni, Maxim Mikityanskiy, virtualization,
	Balazs Nemeth, Mike Pattrick, Eric Dumazet
In-Reply-To: <CA+FuTSckEJVUH1Q2vBxGbfPgVteyDVmTfjJC6hBj=qRP+JcAxA@mail.gmail.com>

Hi Willem,
On Thu, Apr 21, 2022 at 10:15:16AM -0400, Willem de Bruijn wrote:
> Your approach does sound simpler than the above. Thanks for looking
> into that alternative, though.

Sorry I have to bring this topic back. I just remembered that
tpacket_snd() already called skb_probe_transport_header() before calling
virtio_net_hdr_* functions. e.g.

- tpacket_snd()
  - tpacket_fill_skb()
    - packet_parse_headers()
      - skb_probe_transport_header()
  - if (po->has_vnet_hdr)
    - virtio_net_hdr_to_skb()
    - virtio_net_hdr_set_proto()

While for packet_snd()

- packet_snd()
  - if (has_vnet_hdr)
    - virtio_net_hdr_to_skb()
    - virtio_net_hdr_set_proto()
  - packet_parse_headers()
    - skb_probe_transport_header()

If we split skb_probe_transport_header() from packet_parse_headers() and
move it before calling virtio_net_hdr_* function in packet_snd(). Should
we do the same for tpacket_snd(), i.e. move skb_probe_transport_header()
after the virtio_net_hdr_* function?

I think it really doesn't matter whether calls skb_probe_transport_header()
before or after virtio_net_hdr_* functions if we could set the skb->protocol
and network header correctly. Because

- skb_probe_transport_header()
  - skb_flow_dissect_flow_keys_basic()
    - __skb_flow_dissect()

In __skb_flow_dissect()
```
 * @data: raw buffer pointer to the packet, if NULL use skb->data
 * @proto: protocol for which to get the flow, if @data is NULL use skb->protocol
 * @nhoff: network header offset, if @data is NULL use skb_network_offset(skb)
 * @hlen: packet header length, if @data is NULL use skb_headlen(skb)
```

So when data is NULL, we need to make sure the protocol, network header offset,
packet header length are correct.

Before this patch, the VLAN packet network header offset is incorrect when calls
skb_probe_transport_header(). After the fix, this issue should be gone
and we can call skb_probe_transport_header() safely.

So my conclusion is. There is no need to split packet_parse_headers(). Move
packet_parse_headers() before calling virtio_net_hdr_* function in packet_snd()
should be safe.

Please pardon me if I didn't make something clear.
Let's me know what do you think.

Thanks
Hangbin

^ permalink raw reply

* Re: [PATCH] batman-adv: remove unnecessary type castings
From: kernel test robot @ 2022-04-22  2:20 UTC (permalink / raw)
  To: Yu Zhe, mareklindner, sw, a, sven, davem, kuba, pabeni
  Cc: kbuild-all, b.a.t.m.a.n, netdev, linux-kernel, liqiong,
	kernel-janitors, Yu Zhe
In-Reply-To: <20220421154829.9775-1-yuzhe@nfschina.com>

Hi Yu,

Thank you for the patch! Perhaps something to improve:

[auto build test WARNING on linus/master]
[also build test WARNING on v5.18-rc3 next-20220421]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch]

url:    https://github.com/intel-lab-lkp/linux/commits/Yu-Zhe/batman-adv-remove-unnecessary-type-castings/20220421-235254
base:   https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git b253435746d9a4a701b5f09211b9c14d3370d0da
config: m68k-defconfig (https://download.01.org/0day-ci/archive/20220422/202204221027.ETcMYyKP-lkp@intel.com/config)
compiler: m68k-linux-gcc (GCC) 11.2.0
reproduce (this is a W=1 build):
        wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross
        chmod +x ~/bin/make.cross
        # https://github.com/intel-lab-lkp/linux/commit/2474b41c585e849d3546e0aba8f3c862735a04ff
        git remote add linux-review https://github.com/intel-lab-lkp/linux
        git fetch --no-tags linux-review Yu-Zhe/batman-adv-remove-unnecessary-type-castings/20220421-235254
        git checkout 2474b41c585e849d3546e0aba8f3c862735a04ff
        # save the config file
        mkdir build_dir && cp config build_dir/.config
        COMPILER_INSTALL_PATH=$HOME/0day COMPILER=gcc-11.2.0 make.cross W=1 O=build_dir ARCH=m68k SHELL=/bin/bash net/batman-adv/

If you fix the issue, kindly add following tag as appropriate
Reported-by: kernel test robot <lkp@intel.com>

All warnings (new ones prefixed by >>):

   net/batman-adv/bridge_loop_avoidance.c: In function 'batadv_choose_claim':
>> net/batman-adv/bridge_loop_avoidance.c:68:42: warning: initialization discards 'const' qualifier from pointer target type [-Wdiscarded-qualifiers]
      68 |         struct batadv_bla_claim *claim = data;
         |                                          ^~~~


vim +/const +68 net/batman-adv/bridge_loop_avoidance.c

    53	
    54	static void batadv_bla_periodic_work(struct work_struct *work);
    55	static void
    56	batadv_bla_send_announce(struct batadv_priv *bat_priv,
    57				 struct batadv_bla_backbone_gw *backbone_gw);
    58	
    59	/**
    60	 * batadv_choose_claim() - choose the right bucket for a claim.
    61	 * @data: data to hash
    62	 * @size: size of the hash table
    63	 *
    64	 * Return: the hash index of the claim
    65	 */
    66	static inline u32 batadv_choose_claim(const void *data, u32 size)
    67	{
  > 68		struct batadv_bla_claim *claim = data;
    69		u32 hash = 0;
    70	
    71		hash = jhash(&claim->addr, sizeof(claim->addr), hash);
    72		hash = jhash(&claim->vid, sizeof(claim->vid), hash);
    73	
    74		return hash % size;
    75	}
    76	

-- 
0-DAY CI Kernel Test Service
https://01.org/lkp

^ permalink raw reply

* Re: [PATCH net-next 1/3] ipv4: Don't reset ->flowi4_scope in ip_rt_fix_tos().
From: David Ahern @ 2022-04-22  2:30 UTC (permalink / raw)
  To: Guillaume Nault, David Miller, Jakub Kicinski, Paolo Abeni
  Cc: netdev, Hideaki YOSHIFUJI, dccp
In-Reply-To: <c3fdfe3353158c9b9da14602619fb82db5e77f27.1650470610.git.gnault@redhat.com>

On 4/20/22 5:21 PM, Guillaume Nault wrote:
> All callers already initialise ->flowi4_scope with RT_SCOPE_UNIVERSE,
> either by manual field assignment, memset(0) of the whole structure or
> implicit structure initialisation of on-stack variables
> (RT_SCOPE_UNIVERSE actually equals 0).
> 
> Therefore, we don't need to always initialise ->flowi4_scope in
> ip_rt_fix_tos(). We only need to reduce the scope to RT_SCOPE_LINK when
> the special RTO_ONLINK flag is present in the tos.
> 
> This will allow some code simplification, like removing
> ip_rt_fix_tos(). Also, the long term idea is to remove RTO_ONLINK
> entirely by properly initialising ->flowi4_scope, instead of
> overloading ->flowi4_tos with a special flag. Eventually, this will
> allow to convert ->flowi4_tos to dscp_t.
> 
> Signed-off-by: Guillaume Nault <gnault@redhat.com>
> ---
> It's important for the correctness of this patch that all callers
> initialise ->flowi4_scope to 0 (in one way or another). Auditing all of
> them is long, although each case is pretty trivial.
> 
> If it helps, I can send a patch series that converts implicit
> initialisation of ->flowi4_scope with an explicit assignment to
> RT_SCOPE_UNIVERSE. This would also have the advantage of making it
> clear to future readers that ->flowi4_scope _has_ to be initialised. I
> haven't sent such patch series to not overwhelm reviewers with trivial
> and not technically-required changes (there are 40+ places to modify,
> scattered over 30+ different files). But if anyone prefers explicit
> initialisation everywhere, then just let me know and I'll send such
> patches.

There are a handful of places that open code the initialization of the
flow struct. I *think* I found all of them in 40867d74c374.

> ---
>  net/ipv4/route.c | 4 ++--
>  1 file changed, 2 insertions(+), 2 deletions(-)
> 
> diff --git a/net/ipv4/route.c b/net/ipv4/route.c
> index e839d424b861..d8f82c0ac132 100644
> --- a/net/ipv4/route.c
> +++ b/net/ipv4/route.c
> @@ -503,8 +503,8 @@ static void ip_rt_fix_tos(struct flowi4 *fl4)
>  	__u8 tos = RT_FL_TOS(fl4);
>  
>  	fl4->flowi4_tos = tos & IPTOS_RT_MASK;
> -	fl4->flowi4_scope = tos & RTO_ONLINK ?
> -			    RT_SCOPE_LINK : RT_SCOPE_UNIVERSE;
> +	if (tos & RTO_ONLINK)
> +		fl4->flowi4_scope = RT_SCOPE_LINK;
>  }
>  
>  static void __build_flow_key(const struct net *net, struct flowi4 *fl4,

Reviewed-by: David Ahern <dsahern@kernel.org>

^ permalink raw reply

* Re: [PATCH net-next 2/3] ipv4: Avoid using RTO_ONLINK with ip_route_connect().
From: David Ahern @ 2022-04-22  2:32 UTC (permalink / raw)
  To: Guillaume Nault, David Miller, Jakub Kicinski, Paolo Abeni
  Cc: netdev, Hideaki YOSHIFUJI, dccp
In-Reply-To: <492f91626cab774d7dda27147629c3d56537f847.1650470610.git.gnault@redhat.com>

On 4/20/22 5:21 PM, Guillaume Nault wrote:
> Now that ip_rt_fix_tos() doesn't reset ->flowi4_scope unconditionally,
> we don't have to rely on the RTO_ONLINK bit to properly set the scope
> of a flowi4 structure. We can just set ->flowi4_scope explicitly and
> avoid using RTO_ONLINK in ->flowi4_tos.
> 
> This patch converts callers of ip_route_connect(). Instead of setting
> the tos parameter with RT_CONN_FLAGS(sk), as all callers do, we can:
> 
>   1- Drop the tos parameter from ip_route_connect(): its value was
>      entirely based on sk, which is also passed as parameter.
> 
>   2- Set ->flowi4_scope depending on the SOCK_LOCALROUTE socket option
>      instead of always initialising it with RT_SCOPE_UNIVERSE (let's
>      define ip_sock_rt_scope() for this purpose).
> 
>   3- Avoid overloading ->flowi4_tos with RTO_ONLINK: since the scope is
>      now properly initialised, we don't need to tell ip_rt_fix_tos() to
>      adjust ->flowi4_scope for us. So let's define ip_sock_rt_tos(),
>      which is the same as RT_CONN_FLAGS() but without the RTO_ONLINK
>      bit overload.
> 
> Note:
>   In the original ip_route_connect() code, __ip_route_output_key()
>   might clear the RTO_ONLINK bit of fl4->flowi4_tos (because of
>   ip_rt_fix_tos()). Therefore flowi4_update_output() had to reuse the
>   original tos variable. Now that we don't set RTO_ONLINK any more,
>   this is not a problem and we can use fl4->flowi4_tos in
>   flowi4_update_output().
> 
> Signed-off-by: Guillaume Nault <gnault@redhat.com>
> ---
>  include/net/route.h | 36 ++++++++++++++++++++++++------------
>  net/dccp/ipv4.c     |  5 ++---
>  net/ipv4/af_inet.c  |  6 +++---
>  net/ipv4/datagram.c |  7 +++----
>  net/ipv4/tcp_ipv4.c |  5 ++---
>  5 files changed, 34 insertions(+), 25 deletions(-)
> 

Reviewed-by: David Ahern <dsahern@kernel.org>



^ permalink raw reply

* Re: [PATCH] batman-adv: remove unnecessary type castings
From: kernel test robot @ 2022-04-22  2:41 UTC (permalink / raw)
  To: Yu Zhe, mareklindner, sw, a, sven, davem, kuba, pabeni
  Cc: kbuild-all, b.a.t.m.a.n, netdev, linux-kernel, liqiong,
	kernel-janitors, Yu Zhe
In-Reply-To: <20220421154829.9775-1-yuzhe@nfschina.com>

Hi Yu,

Thank you for the patch! Perhaps something to improve:

[auto build test WARNING on linus/master]
[also build test WARNING on v5.18-rc3 next-20220421]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch]

url:    https://github.com/intel-lab-lkp/linux/commits/Yu-Zhe/batman-adv-remove-unnecessary-type-castings/20220421-235254
base:   https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git b253435746d9a4a701b5f09211b9c14d3370d0da
config: x86_64-randconfig-a013 (https://download.01.org/0day-ci/archive/20220422/202204221051.PRtLc0f7-lkp@intel.com/config)
compiler: gcc-11 (Debian 11.2.0-20) 11.2.0
reproduce (this is a W=1 build):
        # https://github.com/intel-lab-lkp/linux/commit/2474b41c585e849d3546e0aba8f3c862735a04ff
        git remote add linux-review https://github.com/intel-lab-lkp/linux
        git fetch --no-tags linux-review Yu-Zhe/batman-adv-remove-unnecessary-type-castings/20220421-235254
        git checkout 2474b41c585e849d3546e0aba8f3c862735a04ff
        # save the config file
        mkdir build_dir && cp config build_dir/.config
        make W=1 O=build_dir ARCH=x86_64 SHELL=/bin/bash net/batman-adv/

If you fix the issue, kindly add following tag as appropriate
Reported-by: kernel test robot <lkp@intel.com>

All warnings (new ones prefixed by >>):

   net/batman-adv/bridge_loop_avoidance.c: In function 'batadv_choose_claim':
>> net/batman-adv/bridge_loop_avoidance.c:68:42: warning: initialization discards 'const' qualifier from pointer target type [-Wdiscarded-qualifiers]
      68 |         struct batadv_bla_claim *claim = data;
         |                                          ^~~~
--
   net/batman-adv/translation-table.c: In function 'batadv_choose_tt':
>> net/batman-adv/translation-table.c:109:12: warning: assignment discards 'const' qualifier from pointer target type [-Wdiscarded-qualifiers]
     109 |         tt = data;
         |            ^


vim +/const +68 net/batman-adv/bridge_loop_avoidance.c

    53	
    54	static void batadv_bla_periodic_work(struct work_struct *work);
    55	static void
    56	batadv_bla_send_announce(struct batadv_priv *bat_priv,
    57				 struct batadv_bla_backbone_gw *backbone_gw);
    58	
    59	/**
    60	 * batadv_choose_claim() - choose the right bucket for a claim.
    61	 * @data: data to hash
    62	 * @size: size of the hash table
    63	 *
    64	 * Return: the hash index of the claim
    65	 */
    66	static inline u32 batadv_choose_claim(const void *data, u32 size)
    67	{
  > 68		struct batadv_bla_claim *claim = data;
    69		u32 hash = 0;
    70	
    71		hash = jhash(&claim->addr, sizeof(claim->addr), hash);
    72		hash = jhash(&claim->vid, sizeof(claim->vid), hash);
    73	
    74		return hash % size;
    75	}
    76	

-- 
0-DAY CI Kernel Test Service
https://01.org/lkp

^ permalink raw reply

* Re: [PATCH -next] libbpf: Add additional null-pointer checking in make_parent_dir
From: cuigaosheng @ 2022-04-22  2:45 UTC (permalink / raw)
  To: Andrii Nakryiko
  Cc: Alexei Starovoitov, Daniel Borkmann, Andrii Nakryiko, Martin Lau,
	Song Liu, Yonghong Song, john fastabend, KP Singh, Networking,
	bpf, open list, gongruiqi1, wangweiyang2
In-Reply-To: <CAEf4Bza3inoAHsS0w=nKXNgxyFqzPXJVyDHq03Foody6Vgp7=Q@mail.gmail.com>

I don't understand why we don't check path for NULL, bpf_link__pin is an external
interface, It will be called by external functions and provide input parameters,
for example in samples/bpf/hbm.c:
> 201 link = bpf_program__attach_cgroup(bpf_prog, cg1); 202 if 
> (libbpf_get_error(link)) { 203 fprintf(stderr, "ERROR: 
> bpf_program__attach_cgroup failed\n"); 204 goto err; 205 } 206 207 
> sprintf(cg_pin_path, "/sys/fs/bpf/hbm%d", cg_id); 208 rc = 
> bpf_link__pin(link, cg_pin_path); 209 if (rc < 0) { 210 printf("ERROR: 
> bpf_link__pin failed: %d\n", rc); 211 goto err; 212 }
if cg_pin_path is NULL, strdup(NULL) will trigger a segmentation fault in
make_parent_dir, I think we should avoid this and add null-pointer checking
for path, just like check_path:
> 7673 static int check_path(const char *path) 7674 { 7675 char *cp, 
> errmsg[STRERR_BUFSIZE]; 7676 struct statfs st_fs; 7677 char *dname, 
> *dir; 7678 int err = 0; 7679 7680 if (path == NULL) 7681 return 
> -EINVAL; 7682 7683 dname = strdup(path); 7684 if (dname == NULL) 7685 
> return -ENOMEM; 7686 7687 dir = dirname(dname); 7688 if (statfs(dir, 
> &st_fs)) { 7689 cp = libbpf_strerror_r(errno, errmsg, sizeof(errmsg)); 
> 7690 pr_warn("failed to statfs %s: %s\n", dir, cp); 7691 err = -errno; 
> 7692 } 7693 free(dname); 7694 7695 if (!err && st_fs.f_type != 
> BPF_FS_MAGIC) { 7696 pr_warn("specified path %s is not on BPF FS\n", 
> path); 7697 err = -EINVAL; 7698 } 7699 7700 return err; 7701 }

Thanks.

在 2022/4/22 0:55, Andrii Nakryiko 写道:
> On Thu, Apr 21, 2022 at 6:01 AM Gaosheng Cui <cuigaosheng1@huawei.com> wrote:
>> The make_parent_dir is called without null-pointer checking for path,
>> such as bpf_link__pin. To ensure there is no null-pointer dereference
>> in make_parent_dir, so make_parent_dir requires additional null-pointer
>> checking for path.
>>
>> Signed-off-by: Gaosheng Cui <cuigaosheng1@huawei.com>
>> ---
>>   tools/lib/bpf/libbpf.c | 3 +++
>>   1 file changed, 3 insertions(+)
>>
>> diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c
>> index b53e51884f9e..5786e6184bf5 100644
>> --- a/tools/lib/bpf/libbpf.c
>> +++ b/tools/lib/bpf/libbpf.c
>> @@ -7634,6 +7634,9 @@ static int make_parent_dir(const char *path)
>>          char *dname, *dir;
>>          int err = 0;
>>
>> +       if (path == NULL)
>> +               return -EINVAL;
>> +
> API contract is that path shouldn't be NULL. Just like we don't check
> link, obj, prog for NULL in every single API, I don't think we need to
> do it here, unless I'm missing something?
>
>>          dname = strdup(path);
>>          if (dname == NULL)
>>                  return -ENOMEM;
>> --
>> 2.25.1
>>
> .

^ permalink raw reply

* Re: [PATCH] batman-adv: remove unnecessary type castings
From: kernel test robot @ 2022-04-22  2:51 UTC (permalink / raw)
  To: Yu Zhe, mareklindner, sw, a, sven, davem, kuba, pabeni
  Cc: llvm, kbuild-all, b.a.t.m.a.n, netdev, linux-kernel, liqiong,
	kernel-janitors, Yu Zhe
In-Reply-To: <20220421154829.9775-1-yuzhe@nfschina.com>

Hi Yu,

Thank you for the patch! Yet something to improve:

[auto build test ERROR on linus/master]
[also build test ERROR on v5.18-rc3 next-20220421]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch]

url:    https://github.com/intel-lab-lkp/linux/commits/Yu-Zhe/batman-adv-remove-unnecessary-type-castings/20220421-235254
base:   https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git b253435746d9a4a701b5f09211b9c14d3370d0da
config: i386-randconfig-a013 (https://download.01.org/0day-ci/archive/20220422/202204221034.hfPA4RPW-lkp@intel.com/config)
compiler: clang version 15.0.0 (https://github.com/llvm/llvm-project 5bd87350a5ae429baf8f373cb226a57b62f87280)
reproduce (this is a W=1 build):
        wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross
        chmod +x ~/bin/make.cross
        # https://github.com/intel-lab-lkp/linux/commit/2474b41c585e849d3546e0aba8f3c862735a04ff
        git remote add linux-review https://github.com/intel-lab-lkp/linux
        git fetch --no-tags linux-review Yu-Zhe/batman-adv-remove-unnecessary-type-castings/20220421-235254
        git checkout 2474b41c585e849d3546e0aba8f3c862735a04ff
        # save the config file
        mkdir build_dir && cp config build_dir/.config
        COMPILER_INSTALL_PATH=$HOME/0day COMPILER=clang make.cross W=1 O=build_dir ARCH=i386 SHELL=/bin/bash net/batman-adv/

If you fix the issue, kindly add following tag as appropriate
Reported-by: kernel test robot <lkp@intel.com>

All errors (new ones prefixed by >>):

>> net/batman-adv/bridge_loop_avoidance.c:68:27: error: initializing 'struct batadv_bla_claim *' with an expression of type 'const void *' discards qualifiers [-Werror,-Wincompatible-pointer-types-discards-qualifiers]
           struct batadv_bla_claim *claim = data;
                                    ^       ~~~~
   1 error generated.
--
>> net/batman-adv/translation-table.c:109:5: error: assigning to 'struct batadv_tt_common_entry *' from 'const void *' discards qualifiers [-Werror,-Wincompatible-pointer-types-discards-qualifiers]
           tt = data;
              ^ ~~~~
   1 error generated.


vim +68 net/batman-adv/bridge_loop_avoidance.c

    53	
    54	static void batadv_bla_periodic_work(struct work_struct *work);
    55	static void
    56	batadv_bla_send_announce(struct batadv_priv *bat_priv,
    57				 struct batadv_bla_backbone_gw *backbone_gw);
    58	
    59	/**
    60	 * batadv_choose_claim() - choose the right bucket for a claim.
    61	 * @data: data to hash
    62	 * @size: size of the hash table
    63	 *
    64	 * Return: the hash index of the claim
    65	 */
    66	static inline u32 batadv_choose_claim(const void *data, u32 size)
    67	{
  > 68		struct batadv_bla_claim *claim = data;
    69		u32 hash = 0;
    70	
    71		hash = jhash(&claim->addr, sizeof(claim->addr), hash);
    72		hash = jhash(&claim->vid, sizeof(claim->vid), hash);
    73	
    74		return hash % size;
    75	}
    76	

-- 
0-DAY CI Kernel Test Service
https://01.org/lkp

^ permalink raw reply


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox