* [PATCH net-next v3 0/2] udp: Fix multicast performance issues.
@ 2014-07-16  3:28 David Held
From: David Held @ 2014-07-16  3:28 UTC (permalink / raw)
  To: netdev; +Cc: davem, eric.dumazet, willemb

Fix performance issues when listening on many multicast sockets bound
to different addresses but the same port. Instead of always using
hash1, fall back to a hash2 lookup when the hash1 chain is long.
Patch 1 is a general cleanup and simplification which also makes the
main implementation in Patch 2 simpler.

Eric's recent change 63c6f81cdde5 already avoided this issue in early
demux; this series makes regular delivery fast as well.

v1->v2
 - updated hash collision detection

v2->v3
 - avoid flushing under lock unnecessarily at ARRAY_SIZE boundary
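
For readers skimming the series, a rough standalone illustration of why a
port-only hash hurts here (toy hash function and table size, not the
kernel's udp_table or udp4_portaddr_hash()):

#include <stdio.h>
#include <stdint.h>

#define TABLE_SIZE 256          /* stand-in for udp_table.mask + 1 */
#define NSOCKS     500
#define PORT       5000

static unsigned int hash_port(uint16_t port)
{
	return port % TABLE_SIZE;
}

static unsigned int hash_addr_port(uint32_t addr, uint16_t port)
{
	/* toy mix; the kernel uses the jhash-based udp4_portaddr_hash() */
	return ((addr * 2654435761u) ^ port) % TABLE_SIZE;
}

int main(void)
{
	unsigned int hash1[TABLE_SIZE] = {0}, hash2[TABLE_SIZE] = {0};
	unsigned int i, max1 = 0, max2 = 0;

	for (i = 0; i < NSOCKS; i++) {
		uint32_t group = 0xe0000100u + i;       /* 224.0.1.0 + i */

		hash1[hash_port(PORT)]++;               /* port-only bucket */
		hash2[hash_addr_port(group, PORT)]++;   /* addr+port bucket */
	}
	for (i = 0; i < TABLE_SIZE; i++) {
		if (hash1[i] > max1)
			max1 = hash1[i];
		if (hash2[i] > max2)
			max2 = hash2[i];
	}
	printf("longest port-only chain: %u\n", max1);
	printf("longest addr+port chain: %u\n", max2);
	return 0;
}

With these toy numbers the port-only table ends up with a single 500-entry
chain while no addr+port bucket holds more than a couple of sockets, which
is the imbalance the hslot->count check in patch 2 detects and avoids.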


* [PATCH net-next v3 1/2] udp: Simplify __udp*_lib_mcast_deliver.
From: David Held @ 2014-07-16  3:28 UTC (permalink / raw)
  To: netdev; +Cc: davem, eric.dumazet, willemb, David Held

Switch to using sk_nulls_for_each which shortens the code and makes it
easier to update.

Signed-off-by: David Held <drheld@google.com>
---
 net/ipv4/udp.c | 48 ++++++++++----------------------
 net/ipv6/udp.c | 88 +++++++++++++++++++++++-----------------------------------
 2 files changed, 49 insertions(+), 87 deletions(-)

diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index f6dfe52..1347a24 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -594,26 +594,6 @@ static inline bool __udp_is_mcast_sock(struct net *net, struct sock *sk,
 	return true;
 }
 
-static inline struct sock *udp_v4_mcast_next(struct net *net, struct sock *sk,
-					     __be16 loc_port, __be32 loc_addr,
-					     __be16 rmt_port, __be32 rmt_addr,
-					     int dif)
-{
-	struct hlist_nulls_node *node;
-	unsigned short hnum = ntohs(loc_port);
-
-	sk_nulls_for_each_from(sk, node) {
-		if (__udp_is_mcast_sock(net, sk,
-					loc_port, loc_addr,
-					rmt_port, rmt_addr,
-					dif, hnum))
-			goto found;
-	}
-	sk = NULL;
-found:
-	return sk;
-}
-
 /*
  * This routine is called by the ICMP module when it gets some
  * sort of error condition.  If err < 0 then the socket should
@@ -1664,23 +1644,23 @@ static int __udp4_lib_mcast_deliver(struct net *net, struct sk_buff *skb,
 				    struct udp_table *udptable)
 {
 	struct sock *sk, *stack[256 / sizeof(struct sock *)];
-	struct udp_hslot *hslot = udp_hashslot(udptable, net, ntohs(uh->dest));
-	int dif;
+	struct hlist_nulls_node *node;
+	unsigned short hnum = ntohs(uh->dest);
+	struct udp_hslot *hslot = udp_hashslot(udptable, net, hnum);
+	int dif = skb->dev->ifindex;
 	unsigned int i, count = 0;
 
 	spin_lock(&hslot->lock);
-	sk = sk_nulls_head(&hslot->head);
-	dif = skb->dev->ifindex;
-	sk = udp_v4_mcast_next(net, sk, uh->dest, daddr, uh->source, saddr, dif);
-	while (sk) {
-		stack[count++] = sk;
-		sk = udp_v4_mcast_next(net, sk_nulls_next(sk), uh->dest,
-				       daddr, uh->source, saddr, dif);
-		if (unlikely(count == ARRAY_SIZE(stack))) {
-			if (!sk)
-				break;
-			flush_stack(stack, count, skb, ~0);
-			count = 0;
+	sk_nulls_for_each(sk, node, &hslot->head) {
+		if (__udp_is_mcast_sock(net, sk,
+					uh->dest, daddr,
+					uh->source, saddr,
+					dif, hnum)) {
+			if (unlikely(count == ARRAY_SIZE(stack))) {
+				flush_stack(stack, count, skb, ~0);
+				count = 0;
+			}
+			stack[count++] = sk;
 		}
 	}
 	/*
diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
index c2bd28f..6bcfbb8 100644
--- a/net/ipv6/udp.c
+++ b/net/ipv6/udp.c
@@ -698,43 +698,26 @@ drop:
 	return -1;
 }
 
-static struct sock *udp_v6_mcast_next(struct net *net, struct sock *sk,
-				      __be16 loc_port, const struct in6_addr *loc_addr,
-				      __be16 rmt_port, const struct in6_addr *rmt_addr,
-				      int dif)
+static bool __udp_v6_is_mcast_sock(struct net *net, struct sock *sk,
+				   __be16 loc_port, const struct in6_addr *loc_addr,
+				   __be16 rmt_port, const struct in6_addr *rmt_addr,
+				   int dif, unsigned short hnum)
 {
-	struct hlist_nulls_node *node;
-	unsigned short num = ntohs(loc_port);
-
-	sk_nulls_for_each_from(sk, node) {
-		struct inet_sock *inet = inet_sk(sk);
-
-		if (!net_eq(sock_net(sk), net))
-			continue;
-
-		if (udp_sk(sk)->udp_port_hash == num &&
-		    sk->sk_family == PF_INET6) {
-			if (inet->inet_dport) {
-				if (inet->inet_dport != rmt_port)
-					continue;
-			}
-			if (!ipv6_addr_any(&sk->sk_v6_daddr) &&
-			    !ipv6_addr_equal(&sk->sk_v6_daddr, rmt_addr))
-				continue;
-
-			if (sk->sk_bound_dev_if && sk->sk_bound_dev_if != dif)
-				continue;
+	struct inet_sock *inet = inet_sk(sk);
 
-			if (!ipv6_addr_any(&sk->sk_v6_rcv_saddr)) {
-				if (!ipv6_addr_equal(&sk->sk_v6_rcv_saddr, loc_addr))
-					continue;
-			}
-			if (!inet6_mc_check(sk, loc_addr, rmt_addr))
-				continue;
-			return sk;
-		}
-	}
-	return NULL;
+	if (!net_eq(sock_net(sk), net))
+		return false;
+
+	if (udp_sk(sk)->udp_port_hash != hnum ||
+	    sk->sk_family != PF_INET6 ||
+	    (inet->inet_dport && inet->inet_dport != rmt_port) ||
+	    (!ipv6_addr_any(&sk->sk_v6_daddr) &&
+		    !ipv6_addr_equal(&sk->sk_v6_daddr, rmt_addr)) ||
+	    (sk->sk_bound_dev_if && sk->sk_bound_dev_if != dif))
+		return false;
+	if (!inet6_mc_check(sk, loc_addr, rmt_addr))
+		return false;
+	return true;
 }
 
 static void flush_stack(struct sock **stack, unsigned int count,
@@ -783,28 +766,27 @@ static int __udp6_lib_mcast_deliver(struct net *net, struct sk_buff *skb,
 {
 	struct sock *sk, *stack[256 / sizeof(struct sock *)];
 	const struct udphdr *uh = udp_hdr(skb);
-	struct udp_hslot *hslot = udp_hashslot(udptable, net, ntohs(uh->dest));
-	int dif;
+	struct hlist_nulls_node *node;
+	unsigned short hnum = ntohs(uh->dest);
+	struct udp_hslot *hslot = udp_hashslot(udptable, net, hnum);
+	int dif = inet6_iif(skb);
 	unsigned int i, count = 0;
 
 	spin_lock(&hslot->lock);
-	sk = sk_nulls_head(&hslot->head);
-	dif = inet6_iif(skb);
-	sk = udp_v6_mcast_next(net, sk, uh->dest, daddr, uh->source, saddr, dif);
-	while (sk) {
-		/* If zero checksum and no_check is not on for
-		 * the socket then skip it.
-		 */
-		if (uh->check || udp_sk(sk)->no_check6_rx)
+	sk_nulls_for_each(sk, node, &hslot->head) {
+		if (__udp_v6_is_mcast_sock(net, sk,
+					   uh->dest, daddr,
+					   uh->source, saddr,
+					   dif, hnum) &&
+		    /* If zero checksum and no_check is not on for
+		     * the socket then skip it.
+		     */
+		    (uh->check || udp_sk(sk)->no_check6_rx)) {
+			if (unlikely(count == ARRAY_SIZE(stack))) {
+				flush_stack(stack, count, skb, ~0);
+				count = 0;
+			}
 			stack[count++] = sk;
-
-		sk = udp_v6_mcast_next(net, sk_nulls_next(sk), uh->dest, daddr,
-				       uh->source, saddr, dif);
-		if (unlikely(count == ARRAY_SIZE(stack))) {
-			if (!sk)
-				break;
-			flush_stack(stack, count, skb, ~0);
-			count = 0;
 		}
 	}
 	/*
-- 
2.0.0.526.g5318336
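
The shape of the new loop, and the v2->v3 note about the ARRAY_SIZE
boundary, may be easier to see in a standalone sketch; match() and
process_batch() below are toy stand-ins for __udp_is_mcast_sock() and
flush_stack(), not kernel code:

#include <stdio.h>

#define STACK_SIZE 4

static int match(int v)
{
	return v % 2 == 0;              /* pretend even values "match" */
}

static void process_batch(const int *stack, unsigned int count)
{
	printf("flushing %u entries\n", count);
}

int main(void)
{
	int items[] = { 1, 2, 4, 5, 6, 8, 9, 10, 12, 14, 16 };
	int stack[STACK_SIZE];
	unsigned int i, count = 0;

	for (i = 0; i < sizeof(items) / sizeof(items[0]); i++) {
		if (!match(items[i]))
			continue;
		/* flush only when a new match needs room; filling the last
		 * slot no longer forces a flush while the "lock" is held */
		if (count == STACK_SIZE) {
			process_batch(stack, count);
			count = 0;
		}
		stack[count++] = items[i];
	}
	if (count)              /* leftovers, like the final flush_stack() */
		process_batch(stack, count);
	return 0;
}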


* [PATCH net-next v3 2/2] udp: Use hash2 for long hash1 chains in __udp*_lib_mcast_deliver.
From: David Held @ 2014-07-16  3:28 UTC (permalink / raw)
  To: netdev; +Cc: davem, eric.dumazet, willemb, David Held

Many multicast sources can have the same port which can result in a very
large list when hashing by port only. Hash by address and port instead
if this is the case. This makes multicast more similar to unicast.

On a 24-core machine receiving from 500 multicast sockets on the same
port, before this patch 80% of system CPU was used up by spin locking
and only ~25% of packets were successfully delivered.

With this patch, all packets are delivered and kernel overhead is ~8%
system CPU on spinlocks.
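
One detail that is only visible in the code below: a receiving socket may
be bound either to the multicast group address or to the wildcard address,
so the hash2 path has to scan two slots. A standalone sketch of that
control flow, with slot_of() as a toy stand-in for
udp4_portaddr_hash() & udp_table.mask:

#include <stdio.h>
#include <stdint.h>

#define MASK 0xff                       /* toy table mask */

static unsigned int slot_of(uint32_t addr, uint16_t port)
{
	return ((addr * 2654435761u) ^ port) & MASK;    /* toy hash */
}

static void scan_slot(unsigned int slot)
{
	/* the real code walks the chain under hslot->lock here */
	printf("scanning hash2 slot %u\n", slot);
}

int main(void)
{
	uint32_t group = 0xe0000101u;   /* 224.0.1.1 */
	uint32_t any = 0;               /* INADDR_ANY */
	uint16_t port = 5000;
	unsigned int hash2 = slot_of(group, port);
	unsigned int hash2_any = slot_of(any, port);

	scan_slot(hash2);               /* sockets bound to the group */
	if (hash2_any != hash2)         /* skip a duplicate scan */
		scan_slot(hash2_any);   /* sockets bound to 0.0.0.0 */
	return 0;
}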

Signed-off-by: David Held <drheld@google.com>
---
 include/net/sock.h | 14 ++++++++++++++
 net/ipv4/udp.c     | 31 +++++++++++++++++++++----------
 net/ipv6/udp.c     | 30 ++++++++++++++++++++----------
 3 files changed, 55 insertions(+), 20 deletions(-)

diff --git a/include/net/sock.h b/include/net/sock.h
index cb84b2f..6734cab 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -660,6 +660,20 @@ static inline void sk_add_bind_node(struct sock *sk,
 #define sk_for_each_bound(__sk, list) \
 	hlist_for_each_entry(__sk, list, sk_bind_node)
 
+/**
+ * sk_nulls_for_each_entry_offset - iterate over a list at a given struct offset
+ * @tpos:	the type * to use as a loop cursor.
+ * @pos:	the &struct hlist_node to use as a loop cursor.
+ * @head:	the head for your list.
+ * @offset:	offset of hlist_node within the struct.
+ *
+ */
+#define sk_nulls_for_each_entry_offset(tpos, pos, head, offset)		       \
+	for (pos = (head)->first;					       \
+	     (!is_a_nulls(pos)) &&					       \
+		({ tpos = (typeof(*tpos) *)((void *)pos - offset); 1;});       \
+	     pos = pos->next)
+
 static inline struct user_namespace *sk_user_ns(struct sock *sk)
 {
 	/* Careful only use this in a context where these parameters
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 1347a24..21ca57d 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -1616,6 +1616,8 @@ static void flush_stack(struct sock **stack, unsigned int count,
 
 		if (skb1 && udp_queue_rcv_skb(sk, skb1) <= 0)
 			skb1 = NULL;
+
+		sock_put(sk);
 	}
 	if (unlikely(skb1))
 		kfree_skb(skb1);
@@ -1648,10 +1650,20 @@ static int __udp4_lib_mcast_deliver(struct net *net, struct sk_buff *skb,
 	unsigned short hnum = ntohs(uh->dest);
 	struct udp_hslot *hslot = udp_hashslot(udptable, net, hnum);
 	int dif = skb->dev->ifindex;
-	unsigned int i, count = 0;
+	unsigned int count = 0, offset = offsetof(typeof(*sk), sk_nulls_node);
+	unsigned int hash2 = 0, hash2_any = 0, use_hash2 = (hslot->count > 10);
+
+	if (use_hash2) {
+		hash2_any = udp4_portaddr_hash(net, htonl(INADDR_ANY), hnum) &
+			    udp_table.mask;
+		hash2 = udp4_portaddr_hash(net, daddr, hnum) & udp_table.mask;
+start_lookup:
+		hslot = &udp_table.hash2[hash2];
+		offset = offsetof(typeof(*sk), __sk_common.skc_portaddr_node);
+	}
 
 	spin_lock(&hslot->lock);
-	sk_nulls_for_each(sk, node, &hslot->head) {
+	sk_nulls_for_each_entry_offset(sk, node, &hslot->head, offset) {
 		if (__udp_is_mcast_sock(net, sk,
 					uh->dest, daddr,
 					uh->source, saddr,
@@ -1661,24 +1673,23 @@ static int __udp4_lib_mcast_deliver(struct net *net, struct sk_buff *skb,
 				count = 0;
 			}
 			stack[count++] = sk;
+			sock_hold(sk);
 		}
 	}
-	/*
-	 * before releasing chain lock, we must take a reference on sockets
-	 */
-	for (i = 0; i < count; i++)
-		sock_hold(stack[i]);
 
 	spin_unlock(&hslot->lock);
 
+	/* Also lookup *:port if we are using hash2 and haven't done so yet. */
+	if (use_hash2 && hash2 != hash2_any) {
+		hash2 = hash2_any;
+		goto start_lookup;
+	}
+
 	/*
 	 * do the slow work with no lock held
 	 */
 	if (count) {
 		flush_stack(stack, count, skb, count - 1);
-
-		for (i = 0; i < count; i++)
-			sock_put(stack[i]);
 	} else {
 		kfree_skb(skb);
 	}
diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
index 6bcfbb8..21f4fe9 100644
--- a/net/ipv6/udp.c
+++ b/net/ipv6/udp.c
@@ -741,6 +741,7 @@ static void flush_stack(struct sock **stack, unsigned int count,
 
 		if (skb1 && udpv6_queue_rcv_skb(sk, skb1) <= 0)
 			skb1 = NULL;
+		sock_put(sk);
 	}
 	if (unlikely(skb1))
 		kfree_skb(skb1);
@@ -770,10 +771,20 @@ static int __udp6_lib_mcast_deliver(struct net *net, struct sk_buff *skb,
 	unsigned short hnum = ntohs(uh->dest);
 	struct udp_hslot *hslot = udp_hashslot(udptable, net, hnum);
 	int dif = inet6_iif(skb);
-	unsigned int i, count = 0;
+	unsigned int count = 0, offset = offsetof(typeof(*sk), sk_nulls_node);
+	unsigned int hash2 = 0, hash2_any = 0, use_hash2 = (hslot->count > 10);
+
+	if (use_hash2) {
+		hash2_any = udp6_portaddr_hash(net, &in6addr_any, hnum) &
+			    udp_table.mask;
+		hash2 = udp6_portaddr_hash(net, daddr, hnum) & udp_table.mask;
+start_lookup:
+		hslot = &udp_table.hash2[hash2];
+		offset = offsetof(typeof(*sk), __sk_common.skc_portaddr_node);
+	}
 
 	spin_lock(&hslot->lock);
-	sk_nulls_for_each(sk, node, &hslot->head) {
+	sk_nulls_for_each_entry_offset(sk, node, &hslot->head, offset) {
 		if (__udp_v6_is_mcast_sock(net, sk,
 					   uh->dest, daddr,
 					   uh->source, saddr,
@@ -787,21 +798,20 @@ static int __udp6_lib_mcast_deliver(struct net *net, struct sk_buff *skb,
 				count = 0;
 			}
 			stack[count++] = sk;
+			sock_hold(sk);
 		}
 	}
-	/*
-	 * before releasing the lock, we must take reference on sockets
-	 */
-	for (i = 0; i < count; i++)
-		sock_hold(stack[i]);
 
 	spin_unlock(&hslot->lock);
 
+	/* Also lookup *:port if we are using hash2 and haven't done so yet. */
+	if (use_hash2 && hash2 != hash2_any) {
+		hash2 = hash2_any;
+		goto start_lookup;
+	}
+
 	if (count) {
 		flush_stack(stack, count, skb, count - 1);
-
-		for (i = 0; i < count; i++)
-			sock_put(stack[i]);
 	} else {
 		kfree_skb(skb);
 	}
-- 
2.0.0.526.g5318336
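
The new sk_nulls_for_each_entry_offset() macro is what lets the same loop
walk either the hash1 chain (linked through sk_nulls_node) or a hash2
chain (linked through __sk_common.skc_portaddr_node). A plain
NULL-terminated stand-in for the same offsetof() trick (the real macro
walks an hlist_nulls chain and tests is_a_nulls() instead, but uses the
same GNU statement-expression construct):

#include <stddef.h>
#include <stdio.h>

struct node { struct node *next; };

struct sock_like {
	int port;
	struct node hash1_node;         /* stands in for sk_nulls_node */
	struct node hash2_node;         /* stands in for skc_portaddr_node */
};

#define for_each_entry_offset(tpos, pos, head, offset)                  \
	for (pos = (head);                                               \
	     pos && ({ tpos = (void *)((char *)pos - (offset)); 1; });   \
	     pos = pos->next)

int main(void)
{
	struct sock_like a = { .port = 1111 }, b = { .port = 2222 };
	struct node *hash1_head, *hash2_head, *pos;
	struct sock_like *sk;
	size_t offset;

	/* link the same two sockets on both chains, through different members */
	a.hash1_node.next = &b.hash1_node;
	b.hash1_node.next = NULL;
	b.hash2_node.next = &a.hash2_node;
	a.hash2_node.next = NULL;
	hash1_head = &a.hash1_node;
	hash2_head = &b.hash2_node;

	offset = offsetof(struct sock_like, hash1_node);
	for_each_entry_offset(sk, pos, hash1_head, offset)
		printf("hash1 walk: port %d\n", sk->port);

	offset = offsetof(struct sock_like, hash2_node);
	for_each_entry_offset(sk, pos, hash2_head, offset)
		printf("hash2 walk: port %d\n", sk->port);

	return 0;
}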


* Re: [PATCH net-next v3 1/2] udp: Simplify __udp*_lib_mcast_deliver.
From: Eric Dumazet @ 2014-07-16  7:07 UTC (permalink / raw)
  To: David Held; +Cc: netdev, davem, willemb

On Tue, 2014-07-15 at 23:28 -0400, David Held wrote:
> Switch to using sk_nulls_for_each which shortens the code and makes it
> easier to update.
> 
> Signed-off-by: David Held <drheld@google.com>
> ---
>  net/ipv4/udp.c | 48 ++++++++++----------------------
>  net/ipv6/udp.c | 88 +++++++++++++++++++++++-----------------------------------
>  2 files changed, 49 insertions(+), 87 deletions(-)

Acked-by: Eric Dumazet <edumazet@google.com>


* Re: [PATCH net-next v3 2/2] udp: Use hash2 for long hash1 chains in __udp*_lib_mcast_deliver.
From: Eric Dumazet @ 2014-07-16  7:13 UTC (permalink / raw)
  To: David Held; +Cc: netdev, davem, willemb

On Tue, 2014-07-15 at 23:28 -0400, David Held wrote:
> Many multicast sources can have the same port which can result in a very
> large list when hashing by port only. Hash by address and port instead
> if this is the case. This makes multicast more similar to unicast.
> 
> On a 24-core machine receiving from 500 multicast sockets on the same
> port, before this patch 80% of system CPU was used up by spin locking
> and only ~25% of packets were successfully delivered.
> 
> With this patch, all packets are delivered and kernel overhead is ~8%
> system CPU on spinlocks.
> 
> Signed-off-by: David Held <drheld@google.com>
> ---

Signed-off-by: Eric Dumazet <edumazet@google.com>

Thanks David.


* Re: [PATCH net-next v3 0/2] udp: Fix multicast performance issues.
From: David Miller @ 2014-07-17  6:30 UTC (permalink / raw)
  To: drheld; +Cc: netdev, eric.dumazet, willemb

From: David Held <drheld@google.com>
Date: Tue, 15 Jul 2014 23:28:30 -0400

> Fix performance issues when listening on many multicast sockets bound
> to different addresses but the same port. Instead of always using
> hash1, fall back to a hash2 lookup when the hash1 chain is long.
> Patch 1 is a general cleanup and simplification which also makes the
> main implementation in Patch 2 simpler.
> 
> Eric's recent change 63c6f81cdde5 already avoided this issue in early
> demux; this series makes regular delivery fast as well.
> 
> v1->v2
>  - updated hash collision detection
> 
> v2->v3
>  - avoid flushing under lock unnecessarily at ARRAY_SIZE boundary

Series applied, thanks a lot David.

