netdev.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [PATCH 4/5 v2] soreuseport: TCP/IPv6 implementation
@ 2013-01-21  0:07 Tom Herbert
  2013-01-29 12:16 ` [PATCH net-next] ipv6: Fix inet6_csk_bind_conflict so it builds with user namespaces enabled Eric W. Biederman
  0 siblings, 1 reply; 3+ messages in thread
From: Tom Herbert @ 2013-01-21  0:07 UTC (permalink / raw)
  To: netdev, davem; +Cc: netdev, eric.dumazet

Motivation for soreuseport would be something like a web server
binding to port 80 running with multiple threads, where each thread
might have it's own listener socket.  This could be done as an
alternative to other models: 1) have one listener thread which
dispatches completed connections to workers. 2) accept on a single
listener socket from multiple threads.  In case #1 the listener thread
can easily become the bottleneck with high connection turn-over rate.
In case #2, the proportion of connections accepted per thread tends
to be uneven under high connection load (assuming simple event loop:
while (1) { accept(); process() }, wakeup does not promote fairness
among the sockets.  We have seen the  disproportion to be as high
as 3:1 ratio between thread accepting most connections and the one
accepting the fewest.  With so_reusport the distribution is
uniform.

Signed-off-by: Tom Herbert <therbert@google.com>
---
 include/net/inet6_hashtables.h         |    5 ++++-
 include/net/netfilter/nf_tproxy_core.h |    1 +
 net/ipv6/inet6_connection_sock.c       |   19 ++++++++++++++-----
 net/ipv6/inet6_hashtables.c            |   19 ++++++++++++++++---
 net/ipv6/tcp_ipv6.c                    |    4 +++-
 5 files changed, 38 insertions(+), 10 deletions(-)

diff --git a/include/net/inet6_hashtables.h b/include/net/inet6_hashtables.h
index 9e34c87..7ca75cb 100644
--- a/include/net/inet6_hashtables.h
+++ b/include/net/inet6_hashtables.h
@@ -71,6 +71,8 @@ extern struct sock *__inet6_lookup_established(struct net *net,
 
 extern struct sock *inet6_lookup_listener(struct net *net,
 					  struct inet_hashinfo *hashinfo,
+					  const struct in6_addr *saddr,
+					  const __be16 sport,
 					  const struct in6_addr *daddr,
 					  const unsigned short hnum,
 					  const int dif);
@@ -88,7 +90,8 @@ static inline struct sock *__inet6_lookup(struct net *net,
 	if (sk)
 		return sk;
 
-	return inet6_lookup_listener(net, hashinfo, daddr, hnum, dif);
+	return inet6_lookup_listener(net, hashinfo, saddr, sport,
+				     daddr, hnum, dif);
 }
 
 static inline struct sock *__inet6_lookup_skb(struct inet_hashinfo *hashinfo,
diff --git a/include/net/netfilter/nf_tproxy_core.h b/include/net/netfilter/nf_tproxy_core.h
index 1937964..36d9379 100644
--- a/include/net/netfilter/nf_tproxy_core.h
+++ b/include/net/netfilter/nf_tproxy_core.h
@@ -152,6 +152,7 @@ nf_tproxy_get_sock_v6(struct net *net, const u8 protocol,
 			break;
 		case NFT_LOOKUP_LISTENER:
 			sk = inet6_lookup_listener(net, &tcp_hashinfo,
+						   saddr, sport,
 						   daddr, ntohs(dport),
 						   in->ifindex);
 
diff --git a/net/ipv6/inet6_connection_sock.c b/net/ipv6/inet6_connection_sock.c
index 3064785..e4297a3 100644
--- a/net/ipv6/inet6_connection_sock.c
+++ b/net/ipv6/inet6_connection_sock.c
@@ -32,6 +32,9 @@ int inet6_csk_bind_conflict(const struct sock *sk,
 {
 	const struct sock *sk2;
 	const struct hlist_node *node;
+	int reuse = sk->sk_reuse;
+	int reuseport = sk->sk_reuseport;
+	int uid = sock_i_uid((struct sock *)sk);
 
 	/* We must walk the whole port owner list in this case. -DaveM */
 	/*
@@ -42,11 +45,17 @@ int inet6_csk_bind_conflict(const struct sock *sk,
 		if (sk != sk2 &&
 		    (!sk->sk_bound_dev_if ||
 		     !sk2->sk_bound_dev_if ||
-		     sk->sk_bound_dev_if == sk2->sk_bound_dev_if) &&
-		    (!sk->sk_reuse || !sk2->sk_reuse ||
-		     sk2->sk_state == TCP_LISTEN) &&
-		     ipv6_rcv_saddr_equal(sk, sk2))
-			break;
+		     sk->sk_bound_dev_if == sk2->sk_bound_dev_if)) {
+			if ((!reuse || !sk2->sk_reuse ||
+			     sk2->sk_state == TCP_LISTEN) &&
+			    (!reuseport || !sk2->sk_reuseport ||
+			     (sk2->sk_state != TCP_TIME_WAIT &&
+			      !uid_eq(uid,
+				      sock_i_uid((struct sock *)sk2))))) {
+				if (ipv6_rcv_saddr_equal(sk, sk2))
+					break;
+			}
+		}
 	}
 
 	return node != NULL;
diff --git a/net/ipv6/inet6_hashtables.c b/net/ipv6/inet6_hashtables.c
index dea17fd..32b4a16 100644
--- a/net/ipv6/inet6_hashtables.c
+++ b/net/ipv6/inet6_hashtables.c
@@ -158,25 +158,38 @@ static inline int compute_score(struct sock *sk, struct net *net,
 }
 
 struct sock *inet6_lookup_listener(struct net *net,
-		struct inet_hashinfo *hashinfo, const struct in6_addr *daddr,
+		struct inet_hashinfo *hashinfo, const struct in6_addr *saddr,
+		const __be16 sport, const struct in6_addr *daddr,
 		const unsigned short hnum, const int dif)
 {
 	struct sock *sk;
 	const struct hlist_nulls_node *node;
 	struct sock *result;
-	int score, hiscore;
+	int score, hiscore, matches = 0, reuseport = 0;
+	u32 phash = 0;
 	unsigned int hash = inet_lhashfn(net, hnum);
 	struct inet_listen_hashbucket *ilb = &hashinfo->listening_hash[hash];
 
 	rcu_read_lock();
 begin:
 	result = NULL;
-	hiscore = -1;
+	hiscore = 0;
 	sk_nulls_for_each(sk, node, &ilb->head) {
 		score = compute_score(sk, net, hnum, daddr, dif);
 		if (score > hiscore) {
 			hiscore = score;
 			result = sk;
+			reuseport = sk->sk_reuseport;
+			if (reuseport) {
+				phash = inet6_ehashfn(net, daddr, hnum,
+						      saddr, sport);
+				matches = 1;
+			}
+		} else if (score == hiscore && reuseport) {
+			matches++;
+			if (((u64)phash * matches) >> 32 == 0)
+				result = sk;
+			phash = next_pseudo_random32(phash);
 		}
 	}
 	/*
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 3701c3c..06087e5 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -834,7 +834,8 @@ static void tcp_v6_send_reset(struct sock *sk, struct sk_buff *skb)
 		 * no RST generated if md5 hash doesn't match.
 		 */
 		sk1 = inet6_lookup_listener(dev_net(skb_dst(skb)->dev),
-					   &tcp_hashinfo, &ipv6h->daddr,
+					   &tcp_hashinfo, &ipv6h->saddr,
+					   th->source, &ipv6h->daddr,
 					   ntohs(th->source), inet6_iif(skb));
 		if (!sk1)
 			return;
@@ -1598,6 +1599,7 @@ do_time_wait:
 		struct sock *sk2;
 
 		sk2 = inet6_lookup_listener(dev_net(skb->dev), &tcp_hashinfo,
+					    &ipv6_hdr(skb)->saddr, th->source,
 					    &ipv6_hdr(skb)->daddr,
 					    ntohs(th->dest), inet6_iif(skb));
 		if (sk2 != NULL) {
-- 
1.7.7.3

^ permalink raw reply related	[flat|nested] 3+ messages in thread

* [PATCH net-next] ipv6: Fix inet6_csk_bind_conflict so it builds with user namespaces enabled
  2013-01-21  0:07 [PATCH 4/5 v2] soreuseport: TCP/IPv6 implementation Tom Herbert
@ 2013-01-29 12:16 ` Eric W. Biederman
  2013-01-29 20:20   ` David Miller
  0 siblings, 1 reply; 3+ messages in thread
From: Eric W. Biederman @ 2013-01-29 12:16 UTC (permalink / raw)
  To: davem; +Cc: netdev, netdev, eric.dumazet, Tom Herbert


When attempting to build linux-next with user namespaces enabled I ran
into this fun build error.

  CC      net/ipv6/inet6_connection_sock.o
.../net/ipv6/inet6_connection_sock.c: In function ‘inet6_csk_bind_conflict’:
.../net/ipv6/inet6_connection_sock.c:37:12: error: incompatible types when initializing type ‘int’ using
 type ‘kuid_t’
.../net/ipv6/inet6_connection_sock.c:54:30: error: incompatible type for argument 1 of ‘uid_eq’
.../include/linux/uidgid.h:48:20: note: expected ‘kuid_t’ but argument is of type ‘int’
make[3]: *** [net/ipv6/inet6_connection_sock.o] Error 1
make[2]: *** [net/ipv6] Error 2
make[2]: *** Waiting for unfinished jobs....

Using kuid_t instead of int to hold the uid fixes this.

Cc: Tom Herbert <therbert@google.com>
Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
---
 net/ipv6/inet6_connection_sock.c |    2 +-
 1 files changed, 1 insertions(+), 1 deletions(-)

diff --git a/net/ipv6/inet6_connection_sock.c b/net/ipv6/inet6_connection_sock.c
index e4297a3..b386a2c 100644
--- a/net/ipv6/inet6_connection_sock.c
+++ b/net/ipv6/inet6_connection_sock.c
@@ -34,7 +34,7 @@ int inet6_csk_bind_conflict(const struct sock *sk,
 	const struct hlist_node *node;
 	int reuse = sk->sk_reuse;
 	int reuseport = sk->sk_reuseport;
-	int uid = sock_i_uid((struct sock *)sk);
+	kuid_t uid = sock_i_uid((struct sock *)sk);
 
 	/* We must walk the whole port owner list in this case. -DaveM */
 	/*
-- 
1.7.5.4

^ permalink raw reply related	[flat|nested] 3+ messages in thread

* Re: [PATCH net-next] ipv6: Fix inet6_csk_bind_conflict so it builds with user namespaces enabled
  2013-01-29 12:16 ` [PATCH net-next] ipv6: Fix inet6_csk_bind_conflict so it builds with user namespaces enabled Eric W. Biederman
@ 2013-01-29 20:20   ` David Miller
  0 siblings, 0 replies; 3+ messages in thread
From: David Miller @ 2013-01-29 20:20 UTC (permalink / raw)
  To: ebiederm; +Cc: netdev, netdev, eric.dumazet, therbert

From: ebiederm@xmission.com (Eric W. Biederman)
Date: Tue, 29 Jan 2013 04:16:18 -0800

> 
> When attempting to build linux-next with user namespaces enabled I ran
> into this fun build error.
> 
>   CC      net/ipv6/inet6_connection_sock.o
> .../net/ipv6/inet6_connection_sock.c: In function ‘inet6_csk_bind_conflict’:
> .../net/ipv6/inet6_connection_sock.c:37:12: error: incompatible types when initializing type ‘int’ using
>  type ‘kuid_t’
> .../net/ipv6/inet6_connection_sock.c:54:30: error: incompatible type for argument 1 of ‘uid_eq’
> .../include/linux/uidgid.h:48:20: note: expected ‘kuid_t’ but argument is of type ‘int’
> make[3]: *** [net/ipv6/inet6_connection_sock.o] Error 1
> make[2]: *** [net/ipv6] Error 2
> make[2]: *** Waiting for unfinished jobs....
> 
> Using kuid_t instead of int to hold the uid fixes this.
> 
> Cc: Tom Herbert <therbert@google.com>
> Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>

Applied, thanks Eric.

^ permalink raw reply	[flat|nested] 3+ messages in thread

end of thread, other threads:[~2013-01-29 20:21 UTC | newest]

Thread overview: 3+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2013-01-21  0:07 [PATCH 4/5 v2] soreuseport: TCP/IPv6 implementation Tom Herbert
2013-01-29 12:16 ` [PATCH net-next] ipv6: Fix inet6_csk_bind_conflict so it builds with user namespaces enabled Eric W. Biederman
2013-01-29 20:20   ` David Miller

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).