Netdev List

Netdev List
 help / color / mirror / Atom feed

* [RFC PATCH 4/4] inet: use second hash in inet_csk_get_port
From: Alexandru Copot @ 2012-05-30  7:36 UTC (permalink / raw)
  To: davem
  Cc: gerrit, kuznet, jmorris, yoshfuji, kaber, netdev, Alexandru Copot,
	Daniel Baluta, Lucian Grijincu
In-Reply-To: <1338363410-6562-1-git-send-email-alex.mihai.c@gmail.com>

This results in a massive improvement when there are many sockets
bound to the same port, but different addresses for both bind() and
listen() system calls (both call inet_csk_get_port).

Tests were run with 16000 subinterfaces each with a distinct
IPv4 address. The sockets are first bound to the same port and
then put on listen().

* Without patch and without SO_REUSEADDR:
    * bind:   1.543 s
    * listen: 3.050 s

* Without patch and with SO_REUSEADDR set:
    * bind:   0.066 s
    * listen: 3.050 s

* With patch and SO_REUSEADDR set / without SO_REUSEADDR:
    * bind:   0.066 s
    * listen: 0.095 s

Signed-off-by: Alexandru Copot <alex.mihai.c@gmail.com>
Cc: Daniel Baluta <dbaluta@ixiacom.com>
Cc: Lucian Grijincu <lucian.grijincu@gmail.com>
---
 include/net/inet_hashtables.h   |   48 +++++++++++++++
 net/ipv4/inet_connection_sock.c |   63 ++++++++------------
 net/ipv4/inet_hashtables.c      |  125 ++++++++++++++++++++++++++++++++++++++-
 net/ipv6/inet6_hashtables.c     |   95 +++++++++++++++++++++++++++++
 4 files changed, 292 insertions(+), 39 deletions(-)

diff --git a/include/net/inet_hashtables.h b/include/net/inet_hashtables.h
index bc06168..2f589bb 100644
--- a/include/net/inet_hashtables.h
+++ b/include/net/inet_hashtables.h
@@ -81,6 +81,15 @@ struct inet_bind_bucket {
 	struct net		*ib_net;
 #endif
 	unsigned short		port;
+	union {
+		struct in6_addr ib_addr_ipv6;
+		struct {
+			__be32	_1;
+			__be32	_2;
+			__be32	_3;
+			__be32	ib_addr_ipv4;
+		};
+	};
 	signed short		fastreuse;
 	int			num_owners;
 	struct hlist_node	node;
@@ -226,6 +235,7 @@ static inline void inet_ehash_locks_free(struct inet_hashinfo *hashinfo)
 
 extern struct inet_bind_bucket *
 	    inet_bind_bucket_create(struct kmem_cache *cachep,
+				    struct sock *sk,
 				    struct net *net,
 				    struct inet_bind_hashbucket *head,
 				    struct inet_bind_hashbucket *portaddr_head,
@@ -257,6 +267,14 @@ static inline struct inet_bind_hashbucket *
 	return &hinfo->portaddr_bhash[h & (hinfo->portaddr_bhash_size - 1)];
 }
 
+
+struct inet_bind_bucket *
+inet4_find_bind_buckets(struct sock *sk,
+			unsigned short port,
+			struct inet_bind_hashbucket **p_bhead,
+			struct inet_bind_hashbucket **p_portaddr_bhead);
+
+
 #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
 static inline unsigned int inet6_portaddr_bhashfn(struct net *net,
 						  const struct in6_addr *addr6,
@@ -283,6 +301,14 @@ static inline struct inet_bind_hashbucket *
 	unsigned int h = inet6_portaddr_bhashfn(net, addr6, port);
 	return &hinfo->portaddr_bhash[h & (hinfo->portaddr_bhash_size - 1)];
 }
+
+
+struct inet_bind_bucket *
+	inet6_find_bind_buckets(struct sock *sk,
+				unsigned short port,
+				struct inet_bind_hashbucket **p_bhead,
+				struct inet_bind_hashbucket **p_portaddr_bhead);
+
 #endif
 
 
@@ -306,6 +332,28 @@ static inline struct inet_bind_hashbucket *
 	return inet4_portaddr_hashbucket(hinfo, net, INADDR_ANY, port);
 }
 
+/* Per-family lookup of the bind bucket for (sk, port); NULL when not found. */
+static inline struct inet_bind_bucket *
+	inet_find_bind_buckets(struct sock *sk,
+			       unsigned short port,
+			       struct inet_bind_hashbucket **p_bhead,
+			       struct inet_bind_hashbucket **p_portaddr_bhead)
+{
+	switch (sk->sk_family) {
+	case AF_INET:
+		return inet4_find_bind_buckets(sk, port, p_bhead,
+				p_portaddr_bhead);
+#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
+	case AF_INET6:
+		return inet6_find_bind_buckets(sk, port, p_bhead,
+				p_portaddr_bhead);
+#endif
+	}
+	WARN(1, "unrecognised sk->sk_family in inet_find_bind_buckets");
+	return NULL;
+}
+
+
 extern void inet_bind_hash(struct sock *sk, struct inet_bind_bucket *tb,
 			   const unsigned short snum);
 
diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
index 336531a..bd92466 100644
--- a/net/ipv4/inet_connection_sock.c
+++ b/net/ipv4/inet_connection_sock.c
@@ -100,8 +100,7 @@ EXPORT_SYMBOL_GPL(inet_csk_bind_conflict);
 int inet_csk_get_port(struct sock *sk, unsigned short snum)
 {
 	struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo;
-	struct inet_bind_hashbucket *head;
-	struct hlist_node *node;
+	struct inet_bind_hashbucket *head, *portaddr_bhead;
 	struct inet_bind_bucket *tb;
 	int ret, attempts = 5;
 	struct net *net = sock_net(sk);
@@ -120,31 +119,26 @@ again:
 		do {
 			if (inet_is_reserved_local_port(rover))
 				goto next_nolock;
-			head = &hashinfo->bhash[inet_bhashfn(net, rover,
-					hashinfo->bhash_size)];
-			spin_lock(&head->lock);
-			inet_bind_bucket_for_each(tb, node, &head->chain)
-				if (net_eq(ib_net(tb), net) && tb->port == rover) {
-					if (tb->fastreuse > 0 &&
-					    sk->sk_reuse &&
-					    sk->sk_state != TCP_LISTEN &&
-					    (tb->num_owners < smallest_size || smallest_size == -1)) {
-						smallest_size = tb->num_owners;
-						smallest_rover = rover;
-						if (atomic_read(&hashinfo->bsockets) > (high - low) + 1 &&
-						    !inet_csk(sk)->icsk_af_ops->bind_conflict(sk, tb, false)) {
-							snum = smallest_rover;
-							goto tb_found;
-						}
-					}
-					if (!inet_csk(sk)->icsk_af_ops->bind_conflict(sk, tb, false)) {
-						snum = rover;
-						goto tb_found;
-					}
-					goto next;
+
+			tb = inet_find_bind_buckets(sk, rover, &head, &portaddr_bhead);
+			if (!tb)
+				break;
+			if (tb->fastreuse > 0 && sk->sk_reuse &&
+			    sk->sk_state != TCP_LISTEN &&
+			    (tb->num_owners < smallest_size || smallest_size == -1)) {
+				smallest_size = tb->num_owners;
+				smallest_rover = rover;
+				if (atomic_read(&hashinfo->bsockets) > (high - low) + 1 &&
+				    !inet_csk(sk)->icsk_af_ops->bind_conflict(sk, tb, false)) {
+					snum = smallest_rover;
+					goto tb_found;
 				}
-			break;
-		next:
+			}
+			if (!inet_csk(sk)->icsk_af_ops->bind_conflict(sk, tb, false)) {
+				snum = rover;
+				goto tb_found;
+			}
+			spin_unlock(&portaddr_bhead->lock);
 			spin_unlock(&head->lock);
 		next_nolock:
 			if (++rover > high)
@@ -171,12 +165,9 @@ again:
 		snum = rover;
 	} else {
 have_snum:
-		head = &hashinfo->bhash[inet_bhashfn(net, snum,
-				hashinfo->bhash_size)];
-		spin_lock(&head->lock);
-		inet_bind_bucket_for_each(tb, node, &head->chain)
-			if (net_eq(ib_net(tb), net) && tb->port == snum)
-				goto tb_found;
+		tb = inet_find_bind_buckets(sk, snum, &head, &portaddr_bhead);
+		if (tb)
+			goto tb_found;
 	}
 	tb = NULL;
 	goto tb_not_found;
@@ -194,6 +185,7 @@ tb_found:
 			if (inet_csk(sk)->icsk_af_ops->bind_conflict(sk, tb, true)) {
 				if (sk->sk_reuse && sk->sk_state != TCP_LISTEN &&
 				    smallest_size != -1 && --attempts >= 0) {
+					spin_unlock(&portaddr_bhead->lock);
 					spin_unlock(&head->lock);
 					goto again;
 				}
@@ -205,12 +197,8 @@ tb_found:
 tb_not_found:
 	ret = 1;
 	if (!tb) {
-		struct inet_bind_hashbucket *portaddr_head;
-		portaddr_head = inet_portaddr_hashbucket(hashinfo, sk, snum);
-		spin_lock(&portaddr_head->lock);
 		tb = inet_bind_bucket_create(hashinfo->bind_bucket_cachep,
-				net, head, portaddr_head, snum);
-		spin_unlock(&portaddr_head->lock);
+				sk, net, head, portaddr_bhead, snum);
 		if (!tb)
 			goto fail_unlock;
 	}
@@ -229,6 +217,7 @@ success:
 	ret = 0;
 
 fail_unlock:
+	spin_unlock(&portaddr_bhead->lock);
 	spin_unlock(&head->lock);
 fail:
 	local_bh_enable();
diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c
index edb2a4e..26c7f9d 100644
--- a/net/ipv4/inet_hashtables.c
+++ b/net/ipv4/inet_hashtables.c
@@ -29,6 +29,7 @@
  * The bindhash mutex for snum's hash chain must be held here.
  */
 struct inet_bind_bucket *inet_bind_bucket_create(struct kmem_cache *cachep,
+						 struct sock *sk,
 						 struct net *net,
 						 struct inet_bind_hashbucket *head,
 						 struct inet_bind_hashbucket *portaddr_head,
@@ -37,6 +38,32 @@ struct inet_bind_bucket *inet_bind_bucket_create(struct kmem_cache *cachep,
 	struct inet_bind_bucket *tb = kmem_cache_alloc(cachep, GFP_ATOMIC);
 
 	if (tb != NULL) {
+		switch (sk->sk_family) {
+		case AF_INET:
+			/* ::ffff:x.y.z.t is the IPv4-mapped IPv6 address for
+			 * IPv4 address x.y.z.t, but only if it's not the any addr */
+			if (INADDR_ANY == sk_rcv_saddr(sk))
+				memset(&tb->ib_addr_ipv6, 0, sizeof(struct in6_addr));
+			else
+				ipv6_addr_set(&tb->ib_addr_ipv6, 0, 0,
+					      htonl(0x0000FFFF),
+					      sk_rcv_saddr(sk));
+
+			/* if no alignment problems appear, the IPv4 address
+			 * should be written to ib_addr_ipv6. If this gets
+			 * triggered check the inet_bind_bucket structure. */
+			WARN_ON(tb->ib_addr_ipv4 != sk_rcv_saddr(sk));
+			break;
+#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
+		case AF_INET6:
+			memcpy(&tb->ib_addr_ipv6, &inet6_sk(sk)->rcv_saddr,
+					sizeof(struct in6_addr));
+			break;
+#endif
+		default:
+			WARN(1, "unrecognised sk_family in inet_bind_bucket_create");
+		}
+
 		write_pnet(&tb->ib_net, hold_net(net));
 		tb->port      = snum;
 		tb->fastreuse = 0;
@@ -142,8 +169,10 @@ int __inet_inherit_port(struct sock *sk, struct sock *child)
 				break;
 		}
 		if (!node) {
+			/* tb is not the new bucket yet: hash on the inherited port */
+			portaddr_head = inet_portaddr_hashbucket(table, sk, port);
 			tb = inet_bind_bucket_create(table->bind_bucket_cachep,
-						     sock_net(sk), head,
+						     sk, sock_net(sk), head,
 						     portaddr_head, port);
 			if (!tb) {
 				spin_unlock(&head->lock);
@@ -521,7 +550,7 @@ int __inet_hash_connect(struct inet_timewait_death_row *death_row,
 			portaddr_head = inet_portaddr_hashbucket(hinfo, sk, port);
 			spin_lock(&portaddr_head->lock);
 			tb = inet_bind_bucket_create(hinfo->bind_bucket_cachep,
-					net, head, portaddr_head, port);
+					sk, net, head, portaddr_head, port);
 			spin_unlock(&portaddr_head->lock);
 
 			if (!tb) {
@@ -584,6 +613,98 @@ out:
 	}
 }
 
+struct inet_bind_bucket *
+inet4_find_bind_buckets(struct sock *sk,
+			unsigned short port,
+			struct inet_bind_hashbucket **p_bhead,
+			struct inet_bind_hashbucket **p_portaddr_bhead)
+{
+	struct net *net = sock_net(sk);
+	struct inet_hashinfo *hinfo = sk->sk_prot->h.hashinfo;
+	struct inet_bind_bucket *tb = NULL;
+	struct hlist_node *node;
+	/* chains hashed by port, by (port, sk addr) and by (port, ANY) */
+	struct inet_bind_hashbucket *bhead, *portaddr_bhead, *portaddrany_bhead;
+	bhead = &hinfo->bhash[inet_bhashfn(net, port, hinfo->bhash_size)];
+	portaddr_bhead = inet4_portaddr_hashbucket(hinfo, net,
+				sk_rcv_saddr(sk), port);
+	portaddrany_bhead = inet4_portaddr_hashbucket(hinfo, net,
+						INADDR_ANY, port);
+
+	*p_portaddr_bhead = portaddr_bhead;
+	*p_bhead = bhead;
+
+	/*
+	 * prevent deadlocks by always taking locks in a fixed order:
+	 * - always take the port-only lock first. This is done because in some
+	 *   other places this is the lock taken, being followed in only some
+	 *   cases by the portaddr lock.
+	 * - between portaddr and portaddrany always choose the one with the
+	 *   lower address. Unlock ordering is not important, as long as the
+	 *   locking order is consistent.
+	 * - make sure to not take the same lock twice
+	 */
+	spin_lock(&bhead->lock);
+	if (portaddr_bhead > portaddrany_bhead) {
+		spin_lock(&portaddrany_bhead->lock);
+		spin_lock(&portaddr_bhead->lock);
+	} else if (portaddr_bhead < portaddrany_bhead) {
+		spin_lock(&portaddr_bhead->lock);
+		spin_lock(&portaddrany_bhead->lock);
+	} else {
+		spin_lock(&portaddr_bhead->lock);
+	}
+
+	if (sk_rcv_saddr(sk) != INADDR_ANY) {
+		struct inet_bind_hashbucket *_head;
+
+		_head = portaddr_bhead;
+		if (bhead->count < portaddr_bhead->count) {
+			_head = bhead;
+			inet_bind_bucket_for_each(tb, node, &_head->chain)
+				if ((net_eq(ib_net(tb), net)) &&
+				    (tb->port == port) &&
+				    (tb->ib_addr_ipv4 == sk_rcv_saddr(sk)))
+					goto found;
+		} else {
+			inet_portaddr_bind_bucket_for_each(tb, node, &_head->chain)
+				if ((net_eq(ib_net(tb), net)) &&
+				    (tb->port == port) &&
+				    (tb->ib_addr_ipv4 == sk_rcv_saddr(sk)))
+					goto found;
+		}
+		_head = portaddrany_bhead;
+		if (bhead->count < portaddrany_bhead->count) {
+			_head = bhead;
+			inet_bind_bucket_for_each(tb, node, &_head->chain)
+				if ((ib_net(tb) == net) &&
+				    (tb->port == port) &&
+				    (tb->ib_addr_ipv4 == INADDR_ANY))
+					goto found;
+		} else {
+			inet_portaddr_bind_bucket_for_each(tb, node, &_head->chain)
+				if ((ib_net(tb) == net) &&
+				    (tb->port == port) &&
+				    (tb->ib_addr_ipv4 == INADDR_ANY))
+					goto found;
+		}
+	} else {
+		inet_bind_bucket_for_each(tb, node, &bhead->chain)
+			if ((ib_net(tb) == net) && (tb->port == port))
+				goto found;
+	}
+
+	tb = NULL;
+found:
+	if (portaddr_bhead != portaddrany_bhead)
+		spin_unlock(&portaddrany_bhead->lock);
+
+	/* the other locks remain taken, as the caller
+	 * may want to change the hash tables */
+	return tb;
+}
+
+
 /*
  * Bind a port for a connect operation and hash it.
  */
diff --git a/net/ipv6/inet6_hashtables.c b/net/ipv6/inet6_hashtables.c
index 73f1a00..62f1eff 100644
--- a/net/ipv6/inet6_hashtables.c
+++ b/net/ipv6/inet6_hashtables.c
@@ -294,6 +294,101 @@ static inline u32 inet6_sk_port_offset(const struct sock *sk)
 					  inet->inet_dport);
 }
 
+
+struct inet_bind_bucket *
+inet6_find_bind_buckets(struct sock *sk,
+			unsigned short port,
+			struct inet_bind_hashbucket **p_bhead,
+			struct inet_bind_hashbucket **p_portaddr_bhead)
+{
+	struct net *net = sock_net(sk);
+	struct inet_hashinfo *hinfo = sk->sk_prot->h.hashinfo;
+	struct inet_bind_bucket *tb = NULL;
+	struct hlist_node *node;
+	/* chains hashed by port, by (port, sk addr) and by (port, ANY) */
+	struct inet_bind_hashbucket *bhead, *portaddr_bhead, *portaddrany_bhead;
+	bhead = &hinfo->bhash[inet_bhashfn(net, port, hinfo->bhash_size)];
+	portaddr_bhead = inet6_portaddr_hashbucket(hinfo, net,
+				inet6_rcv_saddr(sk), port);
+	portaddrany_bhead = inet6_portaddr_hashbucket(hinfo, net,
+				&in6addr_any, port);
+
+	*p_portaddr_bhead = portaddr_bhead;
+	*p_bhead = bhead;
+
+	/*
+	 * prevent deadlocks by always taking locks in a fixed order:
+	 * - always take the port-only lock first. This is done because in some
+	 *   other places this is the lock taken, being followed in only some
+	 *   cases by the portaddr lock.
+	 * - between portaddr and portaddrany always choose the one with the
+	 *   lower address. Unlock ordering is not important, as long as the
+	 *   locking order is consistent.
+	 * - make sure to not take the same lock twice
+	 */
+	spin_lock(&bhead->lock);
+	if (portaddr_bhead > portaddrany_bhead) {
+		spin_lock(&portaddrany_bhead->lock);
+		spin_lock(&portaddr_bhead->lock);
+	} else if (portaddr_bhead < portaddrany_bhead) {
+		spin_lock(&portaddr_bhead->lock);
+		spin_lock(&portaddrany_bhead->lock);
+	} else {
+		spin_lock(&portaddr_bhead->lock);
+	}
+
+	if (!ipv6_addr_any(inet6_rcv_saddr(sk))) {
+		struct inet_bind_hashbucket *_head;
+
+		_head = portaddr_bhead;
+		if (bhead->count < portaddr_bhead->count) {
+			_head = bhead;
+			inet_bind_bucket_for_each(tb, node, &_head->chain)
+				if ((net_eq(ib_net(tb), net)) &&
+				    (tb->port == port) &&
+				    ipv6_addr_equal(&tb->ib_addr_ipv6,
+						    inet6_rcv_saddr(sk)))
+					goto found;
+		} else {
+			inet_portaddr_bind_bucket_for_each(tb, node, &_head->chain)
+				if ((net_eq(ib_net(tb), net)) &&
+				    (tb->port == port) &&
+				    ipv6_addr_equal(&tb->ib_addr_ipv6,
+						    inet6_rcv_saddr(sk)))
+					goto found;
+		}
+		_head = portaddrany_bhead;
+		if (bhead->count < portaddrany_bhead->count) {
+			_head = bhead;
+			inet_bind_bucket_for_each(tb, node, &_head->chain)
+				if ((ib_net(tb) == net) &&
+				    (tb->port == port) &&
+				    ipv6_addr_any(&tb->ib_addr_ipv6))
+					goto found;
+		} else {
+			inet_portaddr_bind_bucket_for_each(tb, node, &_head->chain)
+				if ((ib_net(tb) == net) &&
+				    (tb->port == port) &&
+				    ipv6_addr_any(&tb->ib_addr_ipv6))
+					goto found;
+		}
+	} else {
+		inet_bind_bucket_for_each(tb, node, &bhead->chain)
+			if ((ib_net(tb) == net) && (tb->port == port))
+				goto found;
+	}
+
+	tb = NULL;
+found:
+	if (portaddr_bhead != portaddrany_bhead)
+		spin_unlock(&portaddrany_bhead->lock);
+
+	/* the other locks remain taken, as the caller
+	 * may want to change the hash tables */
+	return tb;
+}
+
+
 int inet6_hash_connect(struct inet_timewait_death_row *death_row,
 		       struct sock *sk)
 {
-- 
1.7.10.2

^ permalink raw reply related

* [RFC PATCH 3/4] inet: add/remove inet buckets in the second bind hash
From: Alexandru Copot @ 2012-05-30  7:36 UTC (permalink / raw)
  To: davem
  Cc: gerrit, kuznet, jmorris, yoshfuji, kaber, netdev, Alexandru Copot,
	Daniel Baluta, Lucian Grijincu
In-Reply-To: <1338363410-6562-1-git-send-email-alex.mihai.c@gmail.com>

Signed-off-by: Alexandru Copot <alex.mihai.c@gmail.com>
Cc: Daniel Baluta <dbaluta@ixiacom.com>
Cc: Lucian Grijincu <lucian.grijincu@gmail.com>
---
 include/net/inet_hashtables.h    |   77 +++++++++++++++++++++++++++++++++++---
 include/net/inet_timewait_sock.h |    3 +-
 net/ipv4/inet_connection_sock.c  |   13 +++++--
 net/ipv4/inet_hashtables.c       |   34 ++++++++++++++---
 net/ipv4/inet_timewait_sock.c    |   15 +++++---
 5 files changed, 122 insertions(+), 20 deletions(-)

diff --git a/include/net/inet_hashtables.h b/include/net/inet_hashtables.h
index a6d0db2..bc06168 100644
--- a/include/net/inet_hashtables.h
+++ b/include/net/inet_hashtables.h
@@ -225,13 +225,15 @@ static inline void inet_ehash_locks_free(struct inet_hashinfo *hashinfo)
 }
 
 extern struct inet_bind_bucket *
-		    inet_bind_bucket_create(struct kmem_cache *cachep,
-					    struct net *net,
-					    struct inet_bind_hashbucket *head,
-					    const unsigned short snum);
+	    inet_bind_bucket_create(struct kmem_cache *cachep,
+				    struct net *net,
+				    struct inet_bind_hashbucket *head,
+				    struct inet_bind_hashbucket *portaddr_head,
+				    const unsigned short snum);
 extern void inet_bind_bucket_destroy(struct kmem_cache *cachep,
 				     struct inet_bind_bucket *tb,
-				     struct inet_bind_hashbucket *head);
+				     struct inet_bind_hashbucket *head,
+				     struct inet_bind_hashbucket *portaddr_head);
 
 static inline int inet_bhashfn(struct net *net,
 		const __u16 lport, const int bhash_size)
@@ -239,6 +241,71 @@ static inline int inet_bhashfn(struct net *net,
 	return (lport + net_hash_mix(net)) & (bhash_size - 1);
 }
 
+static inline unsigned int inet4_portaddr_bhashfn(struct net *net, __be32 saddr,
+						  unsigned int port)
+{
+	return jhash_1word(saddr, net_hash_mix(net)) ^ port;
+}
+
+static inline struct inet_bind_hashbucket *
+		inet4_portaddr_hashbucket(struct inet_hashinfo *hinfo,
+					  struct net *net,
+					  __be32 saddr,
+					  unsigned int port)
+{
+	unsigned int h = inet4_portaddr_bhashfn(net, saddr, port);
+	return &hinfo->portaddr_bhash[h & (hinfo->portaddr_bhash_size - 1)];
+}
+
+#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
+static inline unsigned int inet6_portaddr_bhashfn(struct net *net,
+						  const struct in6_addr *addr6,
+						  unsigned int port)
+{
+	unsigned int hash, mix = net_hash_mix(net);
+
+	if (ipv6_addr_any(addr6))
+		hash = jhash_1word(0, mix);
+	else if (ipv6_addr_v4mapped(addr6))
+		hash = jhash_1word(addr6->s6_addr32[3], mix);
+	else
+		hash = jhash2(addr6->s6_addr32, 4, mix);
+
+	return hash ^ port;
+}
+
+static inline struct inet_bind_hashbucket *
+		inet6_portaddr_hashbucket(struct inet_hashinfo *hinfo,
+					  struct net *net,
+					  const struct in6_addr *addr6,
+					  unsigned int port)
+{
+	unsigned int h = inet6_portaddr_bhashfn(net, addr6, port);
+	return &hinfo->portaddr_bhash[h & (hinfo->portaddr_bhash_size - 1)];
+}
+#endif
+
+
+static inline struct inet_bind_hashbucket *
+		inet_portaddr_hashbucket(struct inet_hashinfo *hinfo,
+					 struct sock  *sk,
+					 unsigned int port)
+{
+	struct net *net = sock_net(sk);
+	switch (sk->sk_family) {
+	case AF_INET:
+		return inet4_portaddr_hashbucket(hinfo, net,
+				inet_sk(sk)->inet_rcv_saddr, port);
+#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
+	case AF_INET6:
+		return inet6_portaddr_hashbucket(hinfo, net,
+				&inet6_sk(sk)->rcv_saddr, port);
+#endif
+	}
+	WARN(1, "unrecognised sk->sk_family in inet_portaddr_hashbucket");
+	return inet4_portaddr_hashbucket(hinfo, net, INADDR_ANY, port);
+}
+
 extern void inet_bind_hash(struct sock *sk, struct inet_bind_bucket *tb,
 			   const unsigned short snum);
 
diff --git a/include/net/inet_timewait_sock.h b/include/net/inet_timewait_sock.h
index 725e903..d60d8a9 100644
--- a/include/net/inet_timewait_sock.h
+++ b/include/net/inet_timewait_sock.h
@@ -199,7 +199,8 @@ extern int inet_twsk_unhash(struct inet_timewait_sock *tw);
 
 extern int inet_twsk_bind_unhash(struct inet_timewait_sock *tw,
 				 struct inet_hashinfo *hashinfo,
-				 struct inet_bind_hashbucket *head);
+				 struct inet_bind_hashbucket *head,
+				 struct inet_bind_hashbucket *portaddr_head);
 
 extern struct inet_timewait_sock *inet_twsk_alloc(const struct sock *sk,
 						  const int state);
diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
index 95e61596..336531a 100644
--- a/net/ipv4/inet_connection_sock.c
+++ b/net/ipv4/inet_connection_sock.c
@@ -204,9 +204,16 @@ tb_found:
 	}
 tb_not_found:
 	ret = 1;
-	if (!tb && (tb = inet_bind_bucket_create(hashinfo->bind_bucket_cachep,
-					net, head, snum)) == NULL)
-		goto fail_unlock;
+	if (!tb) {
+		struct inet_bind_hashbucket *portaddr_head;
+		portaddr_head = inet_portaddr_hashbucket(hashinfo, sk, snum);
+		spin_lock(&portaddr_head->lock);
+		tb = inet_bind_bucket_create(hashinfo->bind_bucket_cachep,
+				net, head, portaddr_head, snum);
+		spin_unlock(&portaddr_head->lock);
+		if (!tb)
+			goto fail_unlock;
+	}
 	if (hlist_empty(&tb->owners)) {
 		if (sk->sk_reuse && sk->sk_state != TCP_LISTEN)
 			tb->fastreuse = 1;
diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c
index c1f6f28..edb2a4e 100644
--- a/net/ipv4/inet_hashtables.c
+++ b/net/ipv4/inet_hashtables.c
@@ -31,6 +31,7 @@
 struct inet_bind_bucket *inet_bind_bucket_create(struct kmem_cache *cachep,
 						 struct net *net,
 						 struct inet_bind_hashbucket *head,
+						 struct inet_bind_hashbucket *portaddr_head,
 						 const unsigned short snum)
 {
 	struct inet_bind_bucket *tb = kmem_cache_alloc(cachep, GFP_ATOMIC);
@@ -43,6 +44,8 @@ struct inet_bind_bucket *inet_bind_bucket_create(struct kmem_cache *cachep,
 		INIT_HLIST_HEAD(&tb->owners);
 		hlist_add_head(&tb->node, &head->chain);
 		head->count++;
+		hlist_add_head(&tb->portaddr_node, &portaddr_head->chain);
+		portaddr_head->count++;
 	}
 	return tb;
 }
@@ -51,11 +54,14 @@ struct inet_bind_bucket *inet_bind_bucket_create(struct kmem_cache *cachep,
  * Caller must hold hashbucket lock for this tb with local BH disabled
  */
 void inet_bind_bucket_destroy(struct kmem_cache *cachep, struct inet_bind_bucket *tb,
-			      struct inet_bind_hashbucket *head)
+			      struct inet_bind_hashbucket *head,
+			      struct inet_bind_hashbucket *portaddr_head)
 {
 	if (hlist_empty(&tb->owners)) {
 		head->count--;
 		__hlist_del(&tb->node);
+		portaddr_head->count--;
+		__hlist_del(&tb->portaddr_node);
 		release_net(ib_net(tb));
 		kmem_cache_free(cachep, tb);
 	}
@@ -83,17 +89,22 @@ static void __inet_put_port(struct sock *sk)
 	const int bhash = inet_bhashfn(sock_net(sk), inet_sk(sk)->inet_num,
 			hashinfo->bhash_size);
 	struct inet_bind_hashbucket *head = &hashinfo->bhash[bhash];
+	struct inet_bind_hashbucket *portaddr_head =
+		inet_portaddr_hashbucket(hashinfo, sk, inet_sk(sk)->inet_num);
 	struct inet_bind_bucket *tb;
 
 	atomic_dec(&hashinfo->bsockets);
 
 	spin_lock(&head->lock);
+	spin_lock(&portaddr_head->lock);
 	tb = inet_csk(sk)->icsk_bind_hash;
 	__sk_del_bind_node(sk);
 	tb->num_owners--;
 	inet_csk(sk)->icsk_bind_hash = NULL;
 	inet_sk(sk)->inet_num = 0;
-	inet_bind_bucket_destroy(hashinfo->bind_bucket_cachep, tb, head);
+	inet_bind_bucket_destroy(hashinfo->bind_bucket_cachep, tb,
+				 head, portaddr_head);
+	spin_unlock(&portaddr_head->lock);
 	spin_unlock(&head->lock);
 }
 
@@ -112,6 +123,8 @@ int __inet_inherit_port(struct sock *sk, struct sock *child)
 	const int bhash = inet_bhashfn(sock_net(sk), port,
 			table->bhash_size);
 	struct inet_bind_hashbucket *head = &table->bhash[bhash];
+	struct inet_bind_hashbucket *portaddr_head =
+		inet_portaddr_hashbucket(table, sk, port);
 	struct inet_bind_bucket *tb;
 
 	spin_lock(&head->lock);
@@ -130,7 +143,8 @@ int __inet_inherit_port(struct sock *sk, struct sock *child)
 		}
 		if (!node) {
 			tb = inet_bind_bucket_create(table->bind_bucket_cachep,
-						     sock_net(sk), head, port);
+						     sock_net(sk), head,
+						     portaddr_head, port);
 			if (!tb) {
 				spin_unlock(&head->lock);
 				return -ENOMEM;
@@ -462,7 +476,7 @@ int __inet_hash_connect(struct inet_timewait_death_row *death_row,
 {
 	struct inet_hashinfo *hinfo = death_row->hashinfo;
 	const unsigned short snum = inet_sk(sk)->inet_num;
-	struct inet_bind_hashbucket *head;
+	struct inet_bind_hashbucket *head, *portaddr_head;
 	struct inet_bind_bucket *tb;
 	int ret;
 	struct net *net = sock_net(sk);
@@ -504,8 +518,12 @@ int __inet_hash_connect(struct inet_timewait_death_row *death_row,
 				}
 			}
 
+			portaddr_head = inet_portaddr_hashbucket(hinfo, sk, port);
+			spin_lock(&portaddr_head->lock);
 			tb = inet_bind_bucket_create(hinfo->bind_bucket_cachep,
-					net, head, port);
+					net, head, portaddr_head, port);
+			spin_unlock(&portaddr_head->lock);
+
 			if (!tb) {
 				spin_unlock(&head->lock);
 				break;
@@ -529,8 +547,12 @@ ok:
 			inet_sk(sk)->inet_sport = htons(port);
 			twrefcnt += hash(sk, tw);
 		}
+		portaddr_head = inet_portaddr_hashbucket(hinfo, sk, port);
+		spin_lock(&portaddr_head->lock);
 		if (tw)
-			twrefcnt += inet_twsk_bind_unhash(tw, hinfo, head);
+			twrefcnt += inet_twsk_bind_unhash(tw, hinfo,
+							  head, portaddr_head);
+		spin_unlock(&portaddr_head->lock);
 		spin_unlock(&head->lock);
 
 		if (tw) {
diff --git a/net/ipv4/inet_timewait_sock.c b/net/ipv4/inet_timewait_sock.c
index 5b7bcd0..29f8061 100644
--- a/net/ipv4/inet_timewait_sock.c
+++ b/net/ipv4/inet_timewait_sock.c
@@ -50,7 +50,8 @@ int inet_twsk_unhash(struct inet_timewait_sock *tw)
  */
 int inet_twsk_bind_unhash(struct inet_timewait_sock *tw,
 			  struct inet_hashinfo *hashinfo,
-			  struct inet_bind_hashbucket *head)
+			  struct inet_bind_hashbucket *head,
+			  struct inet_bind_hashbucket *portaddr_head)
 {
 	struct inet_bind_bucket *tb = tw->tw_tb;
 
@@ -59,7 +60,8 @@ int inet_twsk_bind_unhash(struct inet_timewait_sock *tw,
 
 	__hlist_del(&tw->tw_bind_node);
 	tw->tw_tb = NULL;
-	inet_bind_bucket_destroy(hashinfo->bind_bucket_cachep, tb, head);
+	inet_bind_bucket_destroy(hashinfo->bind_bucket_cachep, tb,
+				 head, portaddr_head);
 	/*
 	 * We cannot call inet_twsk_put() ourself under lock,
 	 * caller must call it for us.
@@ -71,7 +73,7 @@ int inet_twsk_bind_unhash(struct inet_timewait_sock *tw,
 static void __inet_twsk_kill(struct inet_timewait_sock *tw,
 			     struct inet_hashinfo *hashinfo)
 {
-	struct inet_bind_hashbucket *bhead;
+	struct inet_bind_hashbucket *bhead, *portaddr_bhead;
 	int refcnt;
 	/* Unlink from established hashes. */
 	spinlock_t *lock = inet_ehash_lockp(hashinfo, tw->tw_hash);
@@ -83,9 +85,12 @@ static void __inet_twsk_kill(struct inet_timewait_sock *tw,
 	/* Disassociate with bind bucket. */
 	bhead = &hashinfo->bhash[inet_bhashfn(twsk_net(tw), tw->tw_num,
 			hashinfo->bhash_size)];
-
+	portaddr_bhead = inet_portaddr_hashbucket(hashinfo, (struct sock *)tw,
+						  tw->tw_num);
 	spin_lock(&bhead->lock);
-	refcnt += inet_twsk_bind_unhash(tw, hashinfo, bhead);
+	spin_lock(&portaddr_bhead->lock);
+	refcnt += inet_twsk_bind_unhash(tw, hashinfo, bhead, portaddr_bhead);
+	spin_unlock(&portaddr_bhead->lock);
 	spin_unlock(&bhead->lock);
 
 #ifdef SOCK_REFCNT_DEBUG
-- 
1.7.10.2

^ permalink raw reply related

* [RFC PATCH 2/4] inet: add a second bind hash
From: Alexandru Copot @ 2012-05-30  7:36 UTC (permalink / raw)
  To: davem
  Cc: gerrit, kuznet, jmorris, yoshfuji, kaber, netdev, Alexandru Copot,
	Daniel Baluta, Lucian Grijincu
In-Reply-To: <1338363410-6562-1-git-send-email-alex.mihai.c@gmail.com>

Add a second bind hash table which hashes by bound port and address.

Signed-off-by: Alexandru Copot <alex.mihai.c@gmail.com>
Cc: Daniel Baluta <dbaluta@ixiacom.com>
Cc: Lucian Grijincu <lucian.grijincu@gmail.com>
---
 include/net/inet_hashtables.h |   13 ++++++++++---
 net/dccp/proto.c              |   36 ++++++++++++++++++++++++++++++++++--
 net/ipv4/tcp.c                |   16 ++++++++++++++++
 3 files changed, 60 insertions(+), 5 deletions(-)

diff --git a/include/net/inet_hashtables.h b/include/net/inet_hashtables.h
index 8c6addc..a6d0db2 100644
--- a/include/net/inet_hashtables.h
+++ b/include/net/inet_hashtables.h
@@ -84,6 +84,7 @@ struct inet_bind_bucket {
 	signed short		fastreuse;
 	int			num_owners;
 	struct hlist_node	node;
+	struct hlist_node	portaddr_node;
 	struct hlist_head	owners;
 };
 
@@ -94,6 +95,8 @@ static inline struct net *ib_net(struct inet_bind_bucket *ib)
 
 #define inet_bind_bucket_for_each(tb, pos, head) \
 	hlist_for_each_entry(tb, pos, head, node)
+#define inet_portaddr_bind_bucket_for_each(tb, pos, head) \
+	hlist_for_each_entry(tb, pos, head, portaddr_node)
 
 struct inet_bind_hashbucket {
 	spinlock_t		lock;
@@ -129,13 +132,17 @@ struct inet_hashinfo {
 	unsigned int			ehash_mask;
 	unsigned int			ehash_locks_mask;
 
-	/* Ok, let's try this, I give up, we do need a local binding
-	 * TCP hash as well as the others for fast bind/connect.
+	/*
+	 * bhash:		hashes the buckets by port.
+	 * portaddr_bhash:	hashes bind buckets by bound port and address.
+	 *			When bhash gets too large, we try to lookup on
+	 *			portaddr_bhash.
 	 */
 	struct inet_bind_hashbucket	*bhash;
+	struct inet_bind_hashbucket	*portaddr_bhash;
 
 	unsigned int			bhash_size;
-	/* 4 bytes hole on 64 bit */
+	unsigned int			portaddr_bhash_size;
 
 	struct kmem_cache		*bind_bucket_cachep;
 
diff --git a/net/dccp/proto.c b/net/dccp/proto.c
index e777beb..298f5c1 100644
--- a/net/dccp/proto.c
+++ b/net/dccp/proto.c
@@ -1109,7 +1109,7 @@ EXPORT_SYMBOL_GPL(dccp_debug);
 static int __init dccp_init(void)
 {
 	unsigned long goal;
-	int ehash_order, bhash_order, i;
+	int ehash_order, bhash_order, portaddr_bhash_order, i;
 	int rc;
 
 	BUILD_BUG_ON(sizeof(struct dccp_skb_cb) >
@@ -1189,9 +1189,34 @@ static int __init dccp_init(void)
 		INIT_HLIST_HEAD(&dccp_hashinfo.bhash[i].chain);
 	}
 
+	portaddr_bhash_order = bhash_order;
+	/* Start at bhash's order; shrink while oversized or allocation fails. */
+	do {
+		dccp_hashinfo.portaddr_bhash_size =
+			(1UL << portaddr_bhash_order) *
+			PAGE_SIZE / sizeof(struct inet_bind_hashbucket);
+		if ((dccp_hashinfo.portaddr_bhash_size > (64 * 1024)) &&
+				portaddr_bhash_order > 0)
+			continue;
+		dccp_hashinfo.portaddr_bhash = (struct inet_bind_hashbucket *)
+			__get_free_pages(GFP_ATOMIC|__GFP_NOWARN,
+					 portaddr_bhash_order);
+	} while (!dccp_hashinfo.portaddr_bhash && --portaddr_bhash_order >= 0);
+
+	if (!dccp_hashinfo.portaddr_bhash) {
+		DCCP_CRIT("Failed to allocate DCCP portaddr bind hash table");
+		goto out_free_dccp_bhash;
+	}
+
+	for (i = 0; i < dccp_hashinfo.portaddr_bhash_size; i++) {
+		dccp_hashinfo.portaddr_bhash[i].count = 0;
+		spin_lock_init(&dccp_hashinfo.portaddr_bhash[i].lock);
+		INIT_HLIST_HEAD(&dccp_hashinfo.portaddr_bhash[i].chain);
+	}
+
 	rc = dccp_mib_init();
 	if (rc)
-		goto out_free_dccp_bhash;
+		goto out_free_dccp_portaddr_bhash;
 
 	rc = dccp_ackvec_init();
 	if (rc)
@@ -1215,6 +1240,10 @@ out_ackvec_exit:
 	dccp_ackvec_exit();
 out_free_dccp_mib:
 	dccp_mib_exit();
+out_free_dccp_portaddr_bhash:
+	free_pages((unsigned long)dccp_hashinfo.portaddr_bhash,
+		   portaddr_bhash_order);
+	dccp_hashinfo.portaddr_bhash = NULL;
 out_free_dccp_bhash:
 	free_pages((unsigned long)dccp_hashinfo.bhash, bhash_order);
 out_free_dccp_locks:
@@ -1239,6 +1268,9 @@ static void __exit dccp_fini(void)
 	free_pages((unsigned long)dccp_hashinfo.bhash,
 		   get_order(dccp_hashinfo.bhash_size *
 			     sizeof(struct inet_bind_hashbucket)));
+	free_pages((unsigned long)dccp_hashinfo.portaddr_bhash,
+		   get_order(dccp_hashinfo.portaddr_bhash_size *
+			     sizeof(struct inet_bind_hashbucket)));
 	free_pages((unsigned long)dccp_hashinfo.ehash,
 		   get_order((dccp_hashinfo.ehash_mask + 1) *
 			     sizeof(struct inet_ehash_bucket)));
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 52cdf67..7dd3e19 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -3538,6 +3538,22 @@ void __init tcp_init(void)
 		INIT_HLIST_HEAD(&tcp_hashinfo.bhash[i].chain);
 	}
 
+	tcp_hashinfo.portaddr_bhash =
+		alloc_large_system_hash("TCP portaddr_bind",
+					sizeof(struct inet_bind_hashbucket),
+					tcp_hashinfo.bhash_size,
+					(totalram_pages >= 128 * 1024) ?
+					13 : 15,
+					0,
+					&tcp_hashinfo.portaddr_bhash_size,
+					NULL,
+					64 * 1024);
+	tcp_hashinfo.portaddr_bhash_size = 1U << tcp_hashinfo.portaddr_bhash_size;
+	for (i = 0; i < tcp_hashinfo.portaddr_bhash_size; i++) {
+		tcp_hashinfo.portaddr_bhash[i].count = 0;
+		spin_lock_init(&tcp_hashinfo.portaddr_bhash[i].lock);
+		INIT_HLIST_HEAD(&tcp_hashinfo.portaddr_bhash[i].chain);
+	}
 
 	cnt = tcp_hashinfo.ehash_mask + 1;
 
-- 
1.7.10.2

^ permalink raw reply related

* [RFC PATCH 1/4] inet: add counter to inet_bind_hashbucket
From: Alexandru Copot @ 2012-05-30  7:36 UTC (permalink / raw)
  To: davem
  Cc: gerrit, kuznet, jmorris, yoshfuji, kaber, netdev, Alexandru Copot,
	Daniel Baluta, Lucian Grijincu
In-Reply-To: <1338363410-6562-1-git-send-email-alex.mihai.c@gmail.com>

The counter will be used by the upcoming INET lookup algorithm to
choose the shortest chain after secondary hash is added.

Signed-off-by: Alexandru Copot <alex.mihai.c@gmail.com>
Cc: Daniel Baluta <dbaluta@ixiacom.com>
Cc: Lucian Grijincu <lucian.grijincu@gmail.com>
---
 include/net/inet_hashtables.h    |    4 +++-
 include/net/inet_timewait_sock.h |    4 +++-
 net/dccp/proto.c                 |    1 +
 net/ipv4/inet_hashtables.c       |    9 ++++++---
 net/ipv4/inet_timewait_sock.c    |    7 ++++---
 net/ipv4/tcp.c                   |    1 +
 6 files changed, 18 insertions(+), 8 deletions(-)

diff --git a/include/net/inet_hashtables.h b/include/net/inet_hashtables.h
index 808fc5f..8c6addc 100644
--- a/include/net/inet_hashtables.h
+++ b/include/net/inet_hashtables.h
@@ -98,6 +98,7 @@ static inline struct net *ib_net(struct inet_bind_bucket *ib)
 struct inet_bind_hashbucket {
 	spinlock_t		lock;
 	struct hlist_head	chain;
+	unsigned int		count;
 };
 
 /*
@@ -222,7 +223,8 @@ extern struct inet_bind_bucket *
 					    struct inet_bind_hashbucket *head,
 					    const unsigned short snum);
 extern void inet_bind_bucket_destroy(struct kmem_cache *cachep,
-				     struct inet_bind_bucket *tb);
+				     struct inet_bind_bucket *tb,
+				     struct inet_bind_hashbucket *head);
 
 static inline int inet_bhashfn(struct net *net,
 		const __u16 lport, const int bhash_size)
diff --git a/include/net/inet_timewait_sock.h b/include/net/inet_timewait_sock.h
index ba52c83..725e903 100644
--- a/include/net/inet_timewait_sock.h
+++ b/include/net/inet_timewait_sock.h
@@ -30,6 +30,7 @@
 #include <linux/atomic.h>
 
 struct inet_hashinfo;
+struct inet_bind_hashbucket;
 
 #define INET_TWDR_RECYCLE_SLOTS_LOG	5
 #define INET_TWDR_RECYCLE_SLOTS		(1 << INET_TWDR_RECYCLE_SLOTS_LOG)
@@ -197,7 +198,8 @@ extern void inet_twsk_put(struct inet_timewait_sock *tw);
 extern int inet_twsk_unhash(struct inet_timewait_sock *tw);
 
 extern int inet_twsk_bind_unhash(struct inet_timewait_sock *tw,
-				 struct inet_hashinfo *hashinfo);
+				 struct inet_hashinfo *hashinfo,
+				 struct inet_bind_hashbucket *head);
 
 extern struct inet_timewait_sock *inet_twsk_alloc(const struct sock *sk,
 						  const int state);
diff --git a/net/dccp/proto.c b/net/dccp/proto.c
index 6c7c78b..e777beb 100644
--- a/net/dccp/proto.c
+++ b/net/dccp/proto.c
@@ -1184,6 +1184,7 @@ static int __init dccp_init(void)
 	}
 
 	for (i = 0; i < dccp_hashinfo.bhash_size; i++) {
+		dccp_hashinfo.bhash[i].count = 0;
 		spin_lock_init(&dccp_hashinfo.bhash[i].lock);
 		INIT_HLIST_HEAD(&dccp_hashinfo.bhash[i].chain);
 	}
diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c
index 7880af9..c1f6f28 100644
--- a/net/ipv4/inet_hashtables.c
+++ b/net/ipv4/inet_hashtables.c
@@ -42,6 +42,7 @@ struct inet_bind_bucket *inet_bind_bucket_create(struct kmem_cache *cachep,
 		tb->num_owners = 0;
 		INIT_HLIST_HEAD(&tb->owners);
 		hlist_add_head(&tb->node, &head->chain);
+		head->count++;
 	}
 	return tb;
 }
@@ -49,9 +50,11 @@ struct inet_bind_bucket *inet_bind_bucket_create(struct kmem_cache *cachep,
 /*
  * Caller must hold hashbucket lock for this tb with local BH disabled
  */
-void inet_bind_bucket_destroy(struct kmem_cache *cachep, struct inet_bind_bucket *tb)
+void inet_bind_bucket_destroy(struct kmem_cache *cachep, struct inet_bind_bucket *tb,
+			      struct inet_bind_hashbucket *head)
 {
 	if (hlist_empty(&tb->owners)) {
+		head->count--;
 		__hlist_del(&tb->node);
 		release_net(ib_net(tb));
 		kmem_cache_free(cachep, tb);
@@ -90,7 +93,7 @@ static void __inet_put_port(struct sock *sk)
 	tb->num_owners--;
 	inet_csk(sk)->icsk_bind_hash = NULL;
 	inet_sk(sk)->inet_num = 0;
-	inet_bind_bucket_destroy(hashinfo->bind_bucket_cachep, tb);
+	inet_bind_bucket_destroy(hashinfo->bind_bucket_cachep, tb, head);
 	spin_unlock(&head->lock);
 }
 
@@ -527,7 +530,7 @@ ok:
 			twrefcnt += hash(sk, tw);
 		}
 		if (tw)
-			twrefcnt += inet_twsk_bind_unhash(tw, hinfo);
+			twrefcnt += inet_twsk_bind_unhash(tw, hinfo, head);
 		spin_unlock(&head->lock);
 
 		if (tw) {
diff --git a/net/ipv4/inet_timewait_sock.c b/net/ipv4/inet_timewait_sock.c
index 2784db3..5b7bcd0 100644
--- a/net/ipv4/inet_timewait_sock.c
+++ b/net/ipv4/inet_timewait_sock.c
@@ -49,7 +49,8 @@ int inet_twsk_unhash(struct inet_timewait_sock *tw)
  *	Returns 1 if caller should call inet_twsk_put() after lock release.
  */
 int inet_twsk_bind_unhash(struct inet_timewait_sock *tw,
-			  struct inet_hashinfo *hashinfo)
+			  struct inet_hashinfo *hashinfo,
+			  struct inet_bind_hashbucket *head)
 {
 	struct inet_bind_bucket *tb = tw->tw_tb;
 
@@ -58,7 +59,7 @@ int inet_twsk_bind_unhash(struct inet_timewait_sock *tw,
 
 	__hlist_del(&tw->tw_bind_node);
 	tw->tw_tb = NULL;
-	inet_bind_bucket_destroy(hashinfo->bind_bucket_cachep, tb);
+	inet_bind_bucket_destroy(hashinfo->bind_bucket_cachep, tb, head);
 	/*
 	 * We cannot call inet_twsk_put() ourself under lock,
 	 * caller must call it for us.
@@ -84,7 +85,7 @@ static void __inet_twsk_kill(struct inet_timewait_sock *tw,
 			hashinfo->bhash_size)];
 
 	spin_lock(&bhead->lock);
-	refcnt += inet_twsk_bind_unhash(tw, hashinfo);
+	refcnt += inet_twsk_bind_unhash(tw, hashinfo, bhead);
 	spin_unlock(&bhead->lock);
 
 #ifdef SOCK_REFCNT_DEBUG
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index bb485fc..52cdf67 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -3533,6 +3533,7 @@ void __init tcp_init(void)
 					64 * 1024);
 	tcp_hashinfo.bhash_size = 1U << tcp_hashinfo.bhash_size;
 	for (i = 0; i < tcp_hashinfo.bhash_size; i++) {
+		tcp_hashinfo.bhash[i].count = 0;
 		spin_lock_init(&tcp_hashinfo.bhash[i].lock);
 		INIT_HLIST_HEAD(&tcp_hashinfo.bhash[i].chain);
 	}
-- 
1.7.10.2

^ permalink raw reply related

* [RFC PATCH 0/4] inet: add second hash table
From: Alexandru Copot @ 2012-05-30  7:36 UTC (permalink / raw)
  To: davem
  Cc: gerrit, kuznet, jmorris, yoshfuji, kaber, netdev, Alexandru Copot,
	Daniel Baluta, Lucian Grijincu

This patchset implements all the operations needed to use a second
(port,address) bind hash table for inet. It uses a similar approach
as the UDP implementation.

The performance improvements for port allocation are very good and
detailed in the last message.

This is based on a series of patches written by Lucian Grijincu at Ixia.

Signed-off-by: Alexandru Copot <alex.mihai.c@gmail.com>
Cc: Daniel Baluta <dbaluta@ixiacom.com>
Cc: Lucian Grijincu <lucian.grijincu@gmail.com>
---
Alexandru Copot (4):
      inet: add counter to inet_bind_hashbucket
      inet: add a second bind hash
      inet: add/remove inet buckets in the second bind hash
      inet: use second hash in inet_csk_get_port

 include/net/inet_hashtables.h    |  140 +++++++++++++++++++++++++++++++--
 include/net/inet_timewait_sock.h |    5 +-
 net/dccp/proto.c                 |   37 ++++++++-
 net/ipv4/inet_connection_sock.c  |   66 ++++++++--------
 net/ipv4/inet_hashtables.c       |  158 ++++++++++++++++++++++++++++++++++++--
 net/ipv4/inet_timewait_sock.c    |   16 ++--
 net/ipv4/tcp.c                   |   17 ++++
 net/ipv6/inet6_hashtables.c      |   95 +++++++++++++++++++++++
 8 files changed, 477 insertions(+), 57 deletions(-)

^ permalink raw reply

* Re: Difficulties to get 1Gbps on be2net ethernet card
From: Jean-Michel Hautbois @ 2012-05-30  7:25 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: netdev
In-Reply-To: <1338361587.2760.94.camel@edumazet-glaptop>

2012/5/30 Eric Dumazet <eric.dumazet@gmail.com>:
> On Wed, 2012-05-30 at 08:51 +0200, Jean-Michel Hautbois wrote:
>> 2012/5/30 Eric Dumazet <eric.dumazet@gmail.com>:
>> > On Wed, 2012-05-30 at 08:28 +0200, Jean-Michel Hautbois wrote:
>> >
>> >> If this can help, setting tx queue length to 5000 seems to make the
>> >> problem disappear.
>> >
>> > Then you should have drops at Qdisc layer (before your change to 5000)
>> >
>> > tc -s -d qdisc
>> >
>> >> I didn't specified it : MTU is 4096, UDP packets are 4000 bytes.
>> >
>>
>> Yes :
>> qdisc mq 0: dev eth1 root
>>  Sent 5710049154383 bytes 1413544639 pkt (dropped 73078, overlimits 0
>> requeues 281540)
>>  backlog 0b 0p requeues 281540
>>
>> Why ? With a 2.6.26 kernel it works well with a tx queue length of 1000.
>
> If you send big bursts of packets, then you need a large enough queue.
>
> Maybe your kernel is now faster than before and queue fills faster, or
> TX ring is smaller ?
>
> ethtool -g eth0
>
> Note that everybody tries to reduce dumb queue sizes because of latencies.
>

TX ring is not the same :
On 3.2 :
$> ethtool -g eth1
Ring parameters for eth1:
Pre-set maximums:
RX:             1024
RX Mini:        0
RX Jumbo:       0
TX:             2048
Current hardware settings:
RX:             1024
RX Mini:        0
RX Jumbo:       0
TX:             2048


On 2.6.26 :
$>ethtool -g eth1
Ring parameters for eth1:
Pre-set maximums:
RX:             1024
RX Mini:        0
RX Jumbo:       0
TX:             2048
Current hardware settings:
RX:             1003
RX Mini:        0
RX Jumbo:       0
TX:             0

I can't set TX ring using ethtool -G eth1 tx N : operation not supported
I am not really impacted by latency, but the lower the better.

JM

^ permalink raw reply

* Re: [PATCH] iwlwifi: fix typo 'IWL_WATCHHDOG_DISABLED'
From: Johannes Berg @ 2012-05-30  7:18 UTC (permalink / raw)
  To: Paul Bolle
  Cc: Wey-Yi Guy, Intel Linux Wireless, John W. Linville,
	linux-wireless, netdev, linux-kernel
In-Reply-To: <1338361709.1780.3.camel@x61.thuisdomein>

On Wed, 2012-05-30 at 09:08 +0200, Paul Bolle wrote:
> Commit 7c5ba4a830cbb730770129b0004e2a06e47dbac5 ("iwlwifi: move queue
> watchdog into transport") introduced the named constant
> 'IWL_WATCHHDOG_DISABLED'. Rename it to 'IWL_WATCHDOG_DISABLED'.

Thanks, I've picked this up. Since we're in the merge window John isn't
taking new patches right now so it'll be a while until it shows up.

johannes

^ permalink raw reply

* Re: [PATCH 19/22] net/smsc911x: Repair broken failure paths
From: Linus Walleij @ 2012-05-30  7:16 UTC (permalink / raw)
  To: Lee Jones
  Cc: linux-arm-kernel, arnd, linus.walleij, grant.likely, cjb, broonie,
	sameo, netdev
In-Reply-To: <1338353260-10097-20-git-send-email-lee.jones@linaro.org>

On Wed, May 30, 2012 at 12:47 PM, Lee Jones <lee.jones@linaro.org> wrote:

> Current failure paths attempt to free resources which we failed to request
> and disable resources which we failed to enable ones. This leads to kernel
> oops/panic. This patch does some simple re-ordering to prevent this from
> happening.
>
> Cc: netdev@vger.kernel.org
> Signed-off-by: Lee Jones <lee.jones@linaro.org>

Acked-by: Linus Walleij <linus.walleij@linaro.org>

Thanks,
Linus Walleij

^ permalink raw reply

* [PATCH] iwlwifi: fix typo 'IWL_WATCHHDOG_DISABLED'
From: Paul Bolle @ 2012-05-30  7:08 UTC (permalink / raw)
  To: Johannes Berg, Wey-Yi Guy, Intel Linux Wireless, John W. Linville
  Cc: linux-wireless, netdev, linux-kernel

Commit 7c5ba4a830cbb730770129b0004e2a06e47dbac5 ("iwlwifi: move queue
watchdog into transport") introduced the named constant
'IWL_WATCHHDOG_DISABLED'. Rename it to 'IWL_WATCHDOG_DISABLED'.

Signed-off-by: Paul Bolle <pebolle@tiscali.nl>
---
Just something I noticed while researching another issue. Compile tested
(by compiling iwl-1000.o, iwl-5000.o and iwl-agn.o).

 drivers/net/wireless/iwlwifi/iwl-1000.c   |    2 +-
 drivers/net/wireless/iwlwifi/iwl-5000.c   |    2 +-
 drivers/net/wireless/iwlwifi/iwl-agn.c    |    2 +-
 drivers/net/wireless/iwlwifi/iwl-config.h |    2 +-
 4 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/drivers/net/wireless/iwlwifi/iwl-1000.c b/drivers/net/wireless/iwlwifi/iwl-1000.c
index 2629a66..2b7f615 100644
--- a/drivers/net/wireless/iwlwifi/iwl-1000.c
+++ b/drivers/net/wireless/iwlwifi/iwl-1000.c
@@ -64,7 +64,7 @@ static const struct iwl_base_params iwl1000_base_params = {
 	.support_ct_kill_exit = true,
 	.plcp_delta_threshold = IWL_MAX_PLCP_ERR_EXT_LONG_THRESHOLD_DEF,
 	.chain_noise_scale = 1000,
-	.wd_timeout = IWL_WATCHHDOG_DISABLED,
+	.wd_timeout = IWL_WATCHDOG_DISABLED,
 	.max_event_log_size = 128,
 };
 
diff --git a/drivers/net/wireless/iwlwifi/iwl-5000.c b/drivers/net/wireless/iwlwifi/iwl-5000.c
index 8e26bc8..a355ade 100644
--- a/drivers/net/wireless/iwlwifi/iwl-5000.c
+++ b/drivers/net/wireless/iwlwifi/iwl-5000.c
@@ -62,7 +62,7 @@ static const struct iwl_base_params iwl5000_base_params = {
 	.led_compensation = 51,
 	.plcp_delta_threshold = IWL_MAX_PLCP_ERR_LONG_THRESHOLD_DEF,
 	.chain_noise_scale = 1000,
-	.wd_timeout = IWL_WATCHHDOG_DISABLED,
+	.wd_timeout = IWL_WATCHDOG_DISABLED,
 	.max_event_log_size = 512,
 	.no_idle_support = true,
 };
diff --git a/drivers/net/wireless/iwlwifi/iwl-agn.c b/drivers/net/wireless/iwlwifi/iwl-agn.c
index ec36e2b..7eaa979 100644
--- a/drivers/net/wireless/iwlwifi/iwl-agn.c
+++ b/drivers/net/wireless/iwlwifi/iwl-agn.c
@@ -1539,7 +1539,7 @@ static struct iwl_op_mode *iwl_op_mode_dvm_start(struct iwl_trans *trans,
 		trans_cfg.queue_watchdog_timeout =
 			priv->cfg->base_params->wd_timeout;
 	else
-		trans_cfg.queue_watchdog_timeout = IWL_WATCHHDOG_DISABLED;
+		trans_cfg.queue_watchdog_timeout = IWL_WATCHDOG_DISABLED;
 	trans_cfg.command_names = iwl_dvm_cmd_strings;
 
 	ucode_flags = fw->ucode_capa.flags;
diff --git a/drivers/net/wireless/iwlwifi/iwl-config.h b/drivers/net/wireless/iwlwifi/iwl-config.h
index 67b28aa..5f54742 100644
--- a/drivers/net/wireless/iwlwifi/iwl-config.h
+++ b/drivers/net/wireless/iwlwifi/iwl-config.h
@@ -113,7 +113,7 @@ enum iwl_led_mode {
 #define IWL_MAX_PLCP_ERR_THRESHOLD_DISABLE	0
 
 /* TX queue watchdog timeouts in mSecs */
-#define IWL_WATCHHDOG_DISABLED	0
+#define IWL_WATCHDOG_DISABLED	0
 #define IWL_DEF_WD_TIMEOUT	2000
 #define IWL_LONG_WD_TIMEOUT	10000
 #define IWL_MAX_WD_TIMEOUT	120000
-- 
1.7.7.6

^ permalink raw reply related

* Re: Difficulties to get 1Gbps on be2net ethernet card
From: Eric Dumazet @ 2012-05-30  7:06 UTC (permalink / raw)
  To: Jean-Michel Hautbois; +Cc: netdev
In-Reply-To: <CAL8zT=h2hoTWDMpTe8P-xfm-USXD6spx8fFVmm1UXyOfuuqhZw@mail.gmail.com>

On Wed, 2012-05-30 at 08:51 +0200, Jean-Michel Hautbois wrote:
> 2012/5/30 Eric Dumazet <eric.dumazet@gmail.com>:
> > On Wed, 2012-05-30 at 08:28 +0200, Jean-Michel Hautbois wrote:
> >
> >> If this can help, setting tx queue length to 5000 seems to make the
> >> problem disappear.
> >
> > Then you should have drops at Qdisc layer (before your change to 5000)
> >
> > tc -s -d qdisc
> >
> >> I didn't specified it : MTU is 4096, UDP packets are 4000 bytes.
> >
> 
> Yes :
> qdisc mq 0: dev eth1 root
>  Sent 5710049154383 bytes 1413544639 pkt (dropped 73078, overlimits 0
> requeues 281540)
>  backlog 0b 0p requeues 281540
> 
> Why ? With a 2.6.26 kernel it works well with a tx queue length of 1000.

If you send big bursts of packets, then you need a large enough queue.

Maybe your kernel is now faster than before and queue fills faster, or
TX ring is smaller ?

ethtool -g eth0

Note that everybody tries to reduce dumb queue sizes because of latencies.

^ permalink raw reply

* Re: [PATCH] net: sock: validate data_len before allocating skb in sock_alloc_send_pskb()
From: David Miller @ 2012-05-30  7:02 UTC (permalink / raw)
  To: eric.dumazet; +Cc: jasowang, netdev, linux-kernel, stable, mst
In-Reply-To: <1338360383.2760.84.camel@edumazet-glaptop>

From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Wed, 30 May 2012 08:46:23 +0200

> Why doing this test in the while (1) block, it should be done before the
> loop...
> 
> Or even in the caller, note net/unix/af_unix.c does this right.
> 
>         if (len > SKB_MAX_ALLOC)
>                 data_len = min_t(size_t,
>                                  len - SKB_MAX_ALLOC,
>                                  MAX_SKB_FRAGS * PAGE_SIZE);
> 
>         skb = sock_alloc_send_pskb(sk, len - data_len, data_len,
>                                    msg->msg_flags & MSG_DONTWAIT, &err);

My impression is that the callers should be fixed too.  It makes no sense
to penalize the call sites that get this right.

And yes, if we do check it in sock_alloc_send_pskb() it should be done
at function entry, not inside the loop.

^ permalink raw reply

* Re: Difficulties to get 1Gbps on be2net ethernet card
From: Jean-Michel Hautbois @ 2012-05-30  6:51 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: netdev
In-Reply-To: <1338360536.2760.85.camel@edumazet-glaptop>

2012/5/30 Eric Dumazet <eric.dumazet@gmail.com>:
> On Wed, 2012-05-30 at 08:28 +0200, Jean-Michel Hautbois wrote:
>
>> If this can help, setting tx queue length to 5000 seems to make the
>> problem disappear.
>
> Then you should have drops at Qdisc layer (before your change to 5000)
>
> tc -s -d qdisc
>
>> I didn't specified it : MTU is 4096, UDP packets are 4000 bytes.
>

Yes :
qdisc mq 0: dev eth1 root
 Sent 5710049154383 bytes 1413544639 pkt (dropped 73078, overlimits 0
requeues 281540)
 backlog 0b 0p requeues 281540

Why ? With a 2.6.26 kernel it works well with a tx queue length of 1000.

^ permalink raw reply

* Re: Difficulties to get 1Gbps on be2net ethernet card
From: Eric Dumazet @ 2012-05-30  6:48 UTC (permalink / raw)
  To: Jean-Michel Hautbois; +Cc: netdev
In-Reply-To: <CAL8zT=jKhRjVCdYRCerMzimxPAhP5Gi+JBBfuKjG-rfg-LMoVw@mail.gmail.com>

On Wed, 2012-05-30 at 08:28 +0200, Jean-Michel Hautbois wrote:

> If this can help, setting tx queue length to 5000 seems to make the
> problem disappear.

Then you should have drops at Qdisc layer (before your change to 5000)

tc -s -d qdisc

> I didn't specified it : MTU is 4096, UDP packets are 4000 bytes.

^ permalink raw reply

* Re: [PATCH] net: sock: validate data_len before allocating skb in sock_alloc_send_pskb()
From: Eric Dumazet @ 2012-05-30  6:46 UTC (permalink / raw)
  To: Jason Wang; +Cc: netdev, davem, linux-kernel, stable, mst
In-Reply-To: <20120530054702.6146.8503.stgit@amd-6168-8-1.englab.nay.redhat.com>

On Wed, 2012-05-30 at 13:47 +0800, Jason Wang wrote:
> We need to validate the number of pages consumed by data_len, otherwise frags
> array could be overflowed by userspace. So this patch validates data_len and
> returns -EMSGSIZE when data_len may occupy more frags than MAX_SKB_FRAGS.
> 
> Cc: stable@vger.kernel.org [2.6.27+]
> Signed-off-by: Jason Wang <jasowang@redhat.com>
> ---
>  net/core/sock.c |    8 ++++++--
>  1 files changed, 6 insertions(+), 2 deletions(-)
> 
> diff --git a/net/core/sock.c b/net/core/sock.c
> index 653f8c0..4ad5fa5 100644
> --- a/net/core/sock.c
> +++ b/net/core/sock.c
> @@ -1599,6 +1599,7 @@ struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len,
>  
>  	timeo = sock_sndtimeo(sk, noblock);
>  	while (1) {
> +		int npages;
>  		err = sock_error(sk);
>  		if (err != 0)
>  			goto failure;
> @@ -1607,17 +1608,20 @@ struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len,
>  		if (sk->sk_shutdown & SEND_SHUTDOWN)
>  			goto failure;
>  
> +		err = -EMSGSIZE;
> +		npages = (data_len + (PAGE_SIZE - 1)) >> PAGE_SHIFT;
> +		if (npages > MAX_SKB_FRAGS)
> +			goto failure;
> +
>  		if (atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) {
>  			skb = alloc_skb(header_len, gfp_mask);
>  			if (skb) {
> -				int npages;
>  				int i;
>  
>  				/* No pages, we're done... */
>  				if (!data_len)
>  					break;
>  
> -				npages = (data_len + (PAGE_SIZE - 1)) >> PAGE_SHIFT;
>  				skb->truesize += data_len;
>  				skb_shinfo(skb)->nr_frags = npages;
>  				for (i = 0; i < npages; i++) {
> 


Why doing this test in the while (1) block, it should be done before the
loop...

Or even in the caller, note net/unix/af_unix.c does this right.

        if (len > SKB_MAX_ALLOC)
                data_len = min_t(size_t,
                                 len - SKB_MAX_ALLOC,
                                 MAX_SKB_FRAGS * PAGE_SIZE);

        skb = sock_alloc_send_pskb(sk, len - data_len, data_len,
                                   msg->msg_flags & MSG_DONTWAIT, &err);

^ permalink raw reply

* Re: [RFC PATCH 2/2] tcp: Early SYN limit and SYN cookie handling to mitigate SYN floods
From: Eric Dumazet @ 2012-05-30  6:41 UTC (permalink / raw)
  To: Andi Kleen
  Cc: Jesper Dangaard Brouer, Jesper Dangaard Brouer, netdev,
	Christoph Paasch, David S. Miller, Martin Topholm,
	Florian Westphal, opurdila, Hans Schillstrom, Tom Herbert
In-Reply-To: <m2bol6lqxo.fsf@firstfloor.org>

On Tue, 2012-05-29 at 12:37 -0700, Andi Kleen wrote:

> So basically handling syncookie lockless? 
> 
> Makes sense. Syncookies is a bit obsolete these days of course, due
> to the lack of options. But may be still useful for this.
> 
> Obviously you'll need to clean up the patch and support IPv6,
> but the basic idea looks good to me.

Also TCP Fast Open should be a good way to make the SYN flood no longer
effective.

Yuchung Cheng and Jerry Chu should upstream this code in a very near
future.

Another way to mitigate SYN scalability issues before the full RCU
solution I was cooking is to either :

1) Use a hardware filter (like on Intel NICS) to force all SYN packets
going to one queue (so that they are all serviced on one CPU)

2) Tweak RPS (__skb_get_rxhash()) so that SYN packets rxhash is not
dependent on src port/address, to get same effect (All SYN packets
processed by one cpu). Note this only addresses the SYN flood problem, not
the general 3WHS scalability one, since if real connection is
established, the third packet (ACK from client) will have the 'real'
rxhash and will be processed by another cpu.

(Of course, RPS must be enabled to benefit from this)

Untested patch to get the idea :

 include/net/flow_keys.h   |    1 +
 net/core/dev.c            |    8 ++++++++
 net/core/flow_dissector.c |    9 +++++++++
 3 files changed, 18 insertions(+)

diff --git a/include/net/flow_keys.h b/include/net/flow_keys.h
index 80461c1..b5bae21 100644
--- a/include/net/flow_keys.h
+++ b/include/net/flow_keys.h
@@ -10,6 +10,7 @@ struct flow_keys {
 		__be16 port16[2];
 	};
 	u8 ip_proto;
+	u8 tcpflags;
 };
 
 extern bool skb_flow_dissect(const struct sk_buff *skb, struct flow_keys *flow);
diff --git a/net/core/dev.c b/net/core/dev.c
index cd09819..c9c039e 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -135,6 +135,7 @@
 #include <linux/net_tstamp.h>
 #include <linux/static_key.h>
 #include <net/flow_keys.h>
+#include <net/tcp.h>
 
 #include "net-sysfs.h"
 
@@ -2614,6 +2615,12 @@ void __skb_get_rxhash(struct sk_buff *skb)
 		return;
 
 	if (keys.ports) {
+		if ((keys.tcpflags & (TCPHDR_SYN | TCPHDR_ACK)) == TCPHDR_SYN) {
+			hash = jhash_2words((__force u32)keys.dst,
+					    (__force u32)keys.port16[1],
+					    hashrnd);
+			goto end;
+		}
 		if ((__force u16)keys.port16[1] < (__force u16)keys.port16[0])
 			swap(keys.port16[0], keys.port16[1]);
 		skb->l4_rxhash = 1;
@@ -2626,6 +2633,7 @@ void __skb_get_rxhash(struct sk_buff *skb)
 	hash = jhash_3words((__force u32)keys.dst,
 			    (__force u32)keys.src,
 			    (__force u32)keys.ports, hashrnd);
+end:
 	if (!hash)
 		hash = 1;
 
diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c
index a225089..cd4aedf 100644
--- a/net/core/flow_dissector.c
+++ b/net/core/flow_dissector.c
@@ -137,6 +137,15 @@ ipv6:
 		ports = skb_header_pointer(skb, nhoff, sizeof(_ports), &_ports);
 		if (ports)
 			flow->ports = *ports;
+		if (ip_proto == IPPROTO_TCP) {
+			__u8 *tcpflags, _tcpflags;
+
+			tcpflags = skb_header_pointer(skb, nhoff + 13,
+						      sizeof(_tcpflags),
+						      &_tcpflags);
+			if (tcpflags)
+				flow->tcpflags = *tcpflags;
+		}
 	}
 
 	return true;

^ permalink raw reply related

* Re: Difficulties to get 1Gbps on be2net ethernet card
From: Jean-Michel Hautbois @ 2012-05-30  6:28 UTC (permalink / raw)
  To: netdev
In-Reply-To: <CAL8zT=hHAc5-wGMKgumV=K9jG6tDxzej9Dffe9UYQ24gdWQRtw@mail.gmail.com>

2012/5/29 Jean-Michel Hautbois <jhautbois@gmail.com>:
> Hi list,
>
> I am using a NC553i ethernet card connected on a HP 10GbE Flex-10.
> I am sending UDP multicast packets from one blade to another (HP
> ProLiant BL460c G7) which has strictly the same HW.
>
> I have lots of packet loss from Tx to Rx, and I can't understand why.
> I suspected TX coalescing but since 3.4 I can't set this parameter
> (and adaptive-tx is on by default).
> I have tried the same test with a debian lenny (2.6.26 kernel and HP
> drivers) and it works very well (adaptive-tx is off).
>
> Here is the netstat (from Tx point of view) :
>
> $> netstat -s eth1 > before ; sleep 10 ; netstat -s eth1 > after
> $> beforeafter before after
> Ip:
>    280769 total packets received
>    4 with invalid addresses
>    0 forwarded
>    0 incoming packets discarded
>    275063 incoming packets delivered
>    305430 requests sent out
>    0 dropped because of missing route
> Icmp:
>    0 ICMP messages received
>    0 input ICMP message failed.
>    ICMP input histogram:
>        destination unreachable: 0
>        echo requests: 0
>    0 ICMP messages sent
>    0 ICMP messages failed
>    ICMP output histogram:
>        destination unreachable: 0
>        echo replies: 0
> IcmpMsg:
>        InType3: 0
>        InType8: 0
>        OutType0: 0
>        OutType3: 0
> Tcp:
>    18 active connections openings
>    18 passive connection openings
>    0 failed connection attempts
>    0 connection resets received
>    0 connections established
>    3681 segments received
>    3650 segments send out
>    0 segments retransmited
>    0 bad segments received.
>    0 resets sent
> Udp:
>    12626 packets received
>    0 packets to unknown port received.
>    0 packet receive errors
>    259025 packets sent
> UdpLite:
> TcpExt:
>    0 invalid SYN cookies received
>    0 packets pruned from receive queue because of socket buffer overrun
>    14 TCP sockets finished time wait in fast timer
>    0 packets rejects in established connections because of timestamp
>    61 delayed acks sent
>    0 delayed acks further delayed because of locked socket
>    Quick ack mode was activated 0 times
>    2924 packets directly queued to recvmsg prequeue.
>    32 bytes directly in process context from backlog
>    48684 bytes directly received in process context from prequeue
>    232 packet headers predicted
>    1991 packets header predicted and directly queued to user
>    132 acknowledgments not containing data payload received
>    2230 predicted acknowledgments
>    0 times recovered from packet loss by selective acknowledgements
>    0 congestion windows recovered without slow start after partial ack
>    0 TCP data loss events
>    0 timeouts after SACK recovery
>    0 fast retransmits
>    0 forward retransmits
>    0 retransmits in slow start
>    0 other TCP timeouts
>    1 times receiver scheduled too late for direct processing
>    0 packets collapsed in receive queue due to low socket buffer
>    0 DSACKs sent for old packets
>    0 DSACKs received
>    0 connections reset due to unexpected data
>    0 connections reset due to early user close
>    0 connections aborted due to timeout
>    0 times unabled to send RST due to no memory
>    TCPSackShifted: 0
>    TCPSackMerged: 0
>    TCPSackShiftFallback: 0
>    TCPBacklogDrop: 0
>    TCPDeferAcceptDrop: 0
> IpExt:
>    InMcastPkts: -652745397
>    OutMcastPkts: 301498
>    InBcastPkts: 13
>    InOctets: -2004227752
>    OutOctets: -2096666083
>    InMcastOctets: 1058181285
>    OutMcastOctets: -1510963815
>    InBcastOctets: 1014
>
> And ethtool diff :
> $> ethtool -S eth1 > before ; sleep 10 ; ethtool -S eth1 > after
> $> beforeafter before after
> NIC statistics:
>     rx_crc_errors: 0
>     rx_alignment_symbol_errors: 0
>     rx_pause_frames: 0
>     rx_control_frames: 0
>     rx_in_range_errors: 0
>     rx_out_range_errors: 0
>     rx_frame_too_long: 0
>     rx_address_mismatch_drops: 6
>     rx_dropped_too_small: 0
>     rx_dropped_too_short: 0
>     rx_dropped_header_too_small: 0
>     rx_dropped_tcp_length: 0
>     rx_dropped_runt: 0
>     rxpp_fifo_overflow_drop: 0
>     rx_input_fifo_overflow_drop: 0
>     rx_ip_checksum_errs: 0
>     rx_tcp_checksum_errs: 0
>     rx_udp_checksum_errs: 0
>     tx_pauseframes: 0
>     tx_controlframes: 0
>     rx_priority_pause_frames: 0
>     pmem_fifo_overflow_drop: 0
>     jabber_events: 0
>     rx_drops_no_pbuf: 0
>     rx_drops_no_erx_descr: 0
>     rx_drops_no_tpre_descr: 0
>     rx_drops_too_many_frags: 0
>     forwarded_packets: 0
>     rx_drops_mtu: 0
>     eth_red_drops: 0
>     be_on_die_temperature: 0
>     rxq0: rx_bytes: 0
>     rxq0: rx_pkts: 0
>     rxq0: rx_compl: 0
>     rxq0: rx_mcast_pkts: 0
>     rxq0: rx_post_fail: 0
>     rxq0: rx_drops_no_skbs: 0
>     rxq0: rx_drops_no_frags: 0
>     txq0: tx_compl: 257113
>     txq0: tx_bytes: 1038623935
>     txq0: tx_pkts: 257113
>     txq0: tx_reqs: 257113
>     txq0: tx_wrbs: 514226
>     txq0: tx_stops: 10
>
> As you can see, there is 10 tx_stops in 10 seconds (it varies, can be 3 to 15).
> Any thoughts ?
>
> Regards,
> JM

If this can help, setting tx queue length to 5000 seems to make the
problem disappear.
I didn't specified it : MTU is 4096, UDP packets are 4000 bytes.

JM

^ permalink raw reply

* [PATCH] net: sock: validate data_len before allocating skb in sock_alloc_send_pskb()
From: Jason Wang @ 2012-05-30  5:47 UTC (permalink / raw)
  To: netdev, davem, linux-kernel; +Cc: stable, mst

We need to validate the number of pages consumed by data_len, otherwise frags
array could be overflowed by userspace. So this patch validates data_len and
returns -EMSGSIZE when data_len may occupy more frags than MAX_SKB_FRAGS.

Cc: stable@vger.kernel.org [2.6.27+]
Signed-off-by: Jason Wang <jasowang@redhat.com>
---
 net/core/sock.c |    8 ++++++--
 1 files changed, 6 insertions(+), 2 deletions(-)

diff --git a/net/core/sock.c b/net/core/sock.c
index 653f8c0..4ad5fa5 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -1599,6 +1599,7 @@ struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len,
 
 	timeo = sock_sndtimeo(sk, noblock);
 	while (1) {
+		int npages;
 		err = sock_error(sk);
 		if (err != 0)
 			goto failure;
@@ -1607,17 +1608,20 @@ struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len,
 		if (sk->sk_shutdown & SEND_SHUTDOWN)
 			goto failure;
 
+		err = -EMSGSIZE;
+		npages = (data_len + (PAGE_SIZE - 1)) >> PAGE_SHIFT;
+		if (npages > MAX_SKB_FRAGS)
+			goto failure;
+
 		if (atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) {
 			skb = alloc_skb(header_len, gfp_mask);
 			if (skb) {
-				int npages;
 				int i;
 
 				/* No pages, we're done... */
 				if (!data_len)
 					break;
 
-				npages = (data_len + (PAGE_SIZE - 1)) >> PAGE_SHIFT;
 				skb->truesize += data_len;
 				skb_shinfo(skb)->nr_frags = npages;
 				for (i = 0; i < npages; i++) {

^ permalink raw reply related

* Re: [PATCH v3] drop_monitor: convert to modular building
From: Eric Dumazet @ 2012-05-30  5:29 UTC (permalink / raw)
  To: Neil Horman; +Cc: David Miller, netdev, bhutchings
In-Reply-To: <20120529193348.GB9258@hmsreliant.think-freely.org>

On Tue, 2012-05-29 at 15:33 -0400, Neil Horman wrote:

> Eric,
> 	Just FYI, I sent a series upstream to implement autoloading of generic
> netlink families.  Please be aware that I've tested these with a hacked
> version of dropwatch, and it works great, but with the normal version of
> dropwatch, the drop_monitor module still doesn't autoload.  This is due to libnl
> not explicitly requesting a family when genl_ctrl_family_resolve is called.
> Instead of trying to load the module, it dumps the existing registered families
> via a NLM_F_DUMP message.  I'm working on updating libnl to correct this
> currently and will cc you on the patch.

Excellent, thanks Neil

^ permalink raw reply

* [PATCH 19/22] net/smsc911x: Repair broken failure paths
From: Lee Jones @ 2012-05-30  4:47 UTC (permalink / raw)
  To: linux-arm-kernel, arnd, linus.walleij, grant.likely, cjb, broonie,
	sameo
  Cc: Lee Jones, netdev
In-Reply-To: <1338353260-10097-1-git-send-email-lee.jones@linaro.org>

Current failure paths attempt to free resources which we failed to request
and disable resources which we failed to enable. This leads to kernel
oops/panic. This patch does some simple re-ordering to prevent this from
happening.

Cc: netdev@vger.kernel.org
Signed-off-by: Lee Jones <lee.jones@linaro.org>
---
 drivers/net/ethernet/smsc/smsc911x.c |    7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/drivers/net/ethernet/smsc/smsc911x.c b/drivers/net/ethernet/smsc/smsc911x.c
index dab9c6f..1466e5d 100644
--- a/drivers/net/ethernet/smsc/smsc911x.c
+++ b/drivers/net/ethernet/smsc/smsc911x.c
@@ -2390,11 +2390,11 @@ static int __devinit smsc911x_drv_probe(struct platform_device *pdev)
 
 	retval = smsc911x_request_resources(pdev);
 	if (retval)
-		goto out_return_resources;
+		goto out_request_resources_fail;
 
 	retval = smsc911x_enable_resources(pdev);
 	if (retval)
-		goto out_disable_resources;
+		goto out_enable_resources_fail;
 
 	if (pdata->ioaddr == NULL) {
 		SMSC_WARN(pdata, probe, "Error smsc911x base address invalid");
@@ -2501,8 +2501,9 @@ out_free_irq:
 	free_irq(dev->irq, dev);
 out_disable_resources:
 	(void)smsc911x_disable_resources(pdev);
-out_return_resources:
+out_enable_resources_fail:
 	smsc911x_free_resources(pdev);
+out_request_resources_fail:
 	platform_set_drvdata(pdev, NULL);
 	iounmap(pdata->ioaddr);
 	free_netdev(dev);
-- 
1.7.9.5

^ permalink raw reply related

* Re: [RFC PATCH 0/2] Faster/parallel SYN handling to mitigate SYN floods
From: Eric Dumazet @ 2012-05-30  4:45 UTC (permalink / raw)
  To: Jesper Dangaard Brouer
  Cc: christoph.paasch, netdev, David S. Miller, Martin Topholm,
	Florian Westphal, opurdila, Hans Schillstrom, Andi Kleen
In-Reply-To: <1338322661.7747.17.camel@localhost>

On Tue, 2012-05-29 at 22:17 +0200, Jesper Dangaard Brouer wrote:
> On Mon, 2012-05-28 at 18:14 +0200, Christoph Paasch wrote:
> 

> > Concerning (1):
> > I think, there are places where you may have trouble because you don't
> > hold the lock.
> > E.g., in tcp_make_synack (called by tcp_v4_send_synack from your
> > tcp_v4_syn_conn_limit) there is:
> > 
> > if (sk->sk_userlocks & SOCK_RCVBUF_LOCK &&
> > 	(req->window_clamp > tcp_full_space(sk) ||
> > 	 req->window_clamp == 0))
> > 	req->window_clamp = tcp_full_space(sk);
> > 
> > Thus, tcp_full_space(sk) may have different values between the check and
> > setting req->window_clamp.
> 
> This should be simply solved by using a local stack variable, for
> storing the result from tcp_full_space(sk).  It's likely that GCC already
> does this behind our back.
> 

That's not the proper way to handle that situation.

A local stack variable makes no such guarantee. You need ACCESS_ONCE().

This is exactly the kind of thing that RCU takes care of.

^ permalink raw reply

* [RFC PATCH v1 3/3] ixgbe: add setlink, getlink support to ixgbe and ixgbevf
From: John Fastabend @ 2012-05-30  3:07 UTC (permalink / raw)
  To: krkumar2, hadi, shemminger, mst, buytenh, eilong
  Cc: sri, gregory.v.rose, netdev, bhutchings, jeffrey.t.kirsher,
	eric.w.multanen
In-Reply-To: <20120530030531.7443.72024.stgit@jf-dev1-dcblab>

This adds support for the net device ops to manage the embedded
hardware bridge on ixgbe devices. With this patch the bridge
mode can be toggled between VEB and VEPA to support stacking
macvlan devices or using the embedded switch without any SW
component in 802.1Qbg/br environments.

Additionally, this adds source address pruning to the ixgbevf
driver to prune any frames sent back from a reflective relay on
the switch. This is required because the existing hardware does
not support this. Without it frames get pushed into the stack
with its own src mac which is invalid per 802.1Qbg VEPA
definition.

Signed-off-by: John Fastabend <john.r.fastabend@intel.com>
---

 drivers/net/ethernet/intel/ixgbe/ixgbe_main.c     |  100 ++++++++++++++++++++-
 drivers/net/ethernet/intel/ixgbe/ixgbe_sriov.c    |    3 +
 drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c |   10 ++
 3 files changed, 110 insertions(+), 3 deletions(-)

diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
index bf20457..55dfb06 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
@@ -3144,7 +3144,6 @@ static void ixgbe_configure_virtualization(struct ixgbe_adapter *adapter)
 	IXGBE_WRITE_REG(hw, IXGBE_VFRE(reg_offset ^ 1), 0);
 	IXGBE_WRITE_REG(hw, IXGBE_VFTE(reg_offset), (1 << vf_shift));
 	IXGBE_WRITE_REG(hw, IXGBE_VFTE(reg_offset ^ 1), 0);
-	IXGBE_WRITE_REG(hw, IXGBE_PFDTXGSWC, IXGBE_PFDTXGSWC_VT_LBEN);
 
 	/* Map PF MAC address in RAR Entry 0 to first pool following VFs */
 	hw->mac.ops.set_vmdq(hw, 0, adapter->num_vfs);
@@ -3158,8 +3157,6 @@ static void ixgbe_configure_virtualization(struct ixgbe_adapter *adapter)
 	gcr_ext |= IXGBE_GCR_EXT_VT_MODE_64;
 	IXGBE_WRITE_REG(hw, IXGBE_GCR_EXT, gcr_ext);
 
-	/* enable Tx loopback for VF/PF communication */
-	IXGBE_WRITE_REG(hw, IXGBE_PFDTXGSWC, IXGBE_PFDTXGSWC_VT_LBEN);
 	/* Enable MAC Anti-Spoofing */
 	hw->mac.ops.set_mac_anti_spoofing(hw,
 					   (adapter->num_vfs != 0),
@@ -6844,6 +6841,101 @@ static int ixgbe_ndo_fdb_dump(struct sk_buff *skb,
 	return idx;
 }
 
+static int ixgbe_ndo_bridge_setlink(struct net_device *dev,
+				    struct nlmsghdr *nlh)
+{
+	struct ixgbe_adapter *adapter = netdev_priv(dev);
+	struct nlattr *attr, *bridge;
+	int rem;
+
+	if (!(adapter->flags & IXGBE_FLAG_SRIOV_ENABLED))
+		return -EOPNOTSUPP;
+
+	bridge = nlmsg_find_attr(nlh, sizeof(struct ifinfomsg), IFLA_BRIDGE);
+
+	nla_for_each_nested(attr, bridge, rem) {
+		__u16 mode;
+		u32 reg = 0;
+
+		if (nla_type(attr) != IFLA_BRIDGE_MODE)
+			continue;
+
+		mode = nla_get_u16(attr);
+		if (mode == BRIDGE_MODE_VEPA)
+			reg = 0;
+		else if (mode == BRIDGE_MODE_VEB)
+			reg = IXGBE_PFDTXGSWC_VT_LBEN;
+		else
+			return -EINVAL;
+
+		IXGBE_WRITE_REG(&adapter->hw, IXGBE_PFDTXGSWC, reg);
+
+		e_info(drv, "enabling bridge mode: %s\n",
+			mode == BRIDGE_MODE_VEPA ? "VEPA" : "VEB");
+	}
+
+	return 0;
+}
+
+static int ixgbe_ndo_bridge_getlink(struct sk_buff *skb, u32 pid, u32 seq,
+				    struct net_device *dev)
+{
+	struct ixgbe_adapter *adapter = netdev_priv(dev);
+	struct nlmsghdr *nlh;
+	struct ifinfomsg *ifm;
+	struct nlattr *bridge;
+	u8 operstate = netif_running(dev) ? dev->operstate : IF_OPER_DOWN;
+	u16 bridge_mode;
+
+	if (!(adapter->flags & IXGBE_FLAG_SRIOV_ENABLED))
+		return 0;
+
+	nlh = nlmsg_put(skb, pid, seq, RTM_NEWLINK, sizeof(*ifm), NLM_F_MULTI);
+	if (nlh == NULL)
+		return -EMSGSIZE;
+
+	ifm = nlmsg_data(nlh);
+	ifm->ifi_family = AF_BRIDGE;
+	ifm->__ifi_pad = 0;
+	ifm->ifi_type = dev->type;
+	ifm->ifi_index = dev->ifindex;
+	ifm->ifi_flags = dev_get_flags(dev);
+	ifm->ifi_change = 0;
+
+
+	if (nla_put_string(skb, IFLA_IFNAME, dev->name) ||
+	    nla_put_u32(skb, IFLA_MTU, dev->mtu) ||
+	    nla_put_u8(skb, IFLA_OPERSTATE, operstate) ||
+	    (dev->master &&
+	     nla_put_u32(skb, IFLA_MASTER, dev->master->ifindex)) ||
+	    (dev->addr_len &&
+	     nla_put(skb, IFLA_ADDRESS, dev->addr_len, dev->dev_addr)) ||
+	    (dev->ifindex != dev->iflink &&
+	     nla_put_u32(skb, IFLA_LINK, dev->iflink)))
+		goto nla_put_failure;
+
+	if (IXGBE_READ_REG(&adapter->hw, IXGBE_PFDTXGSWC) & 1)
+		bridge_mode = BRIDGE_MODE_VEB;
+	else
+		bridge_mode = BRIDGE_MODE_VEPA;
+
+	bridge = nla_nest_start(skb, IFLA_BRIDGE);
+	if (!bridge)
+		goto nla_put_failure;
+
+	if (nla_put_u16(skb, IFLA_BRIDGE_FLAGS, BRIDGE_FLAGS_SELF) ||
+	    nla_put_u16(skb, IFLA_BRIDGE_MODE, bridge_mode)) {
+		nla_nest_cancel(skb, bridge);
+		goto nla_put_failure;
+	}
+	nla_nest_end(skb, bridge);
+
+	return nlmsg_end(skb, nlh);
+nla_put_failure:
+	nlmsg_cancel(skb, nlh);
+	return -EMSGSIZE;
+}
+
 static const struct net_device_ops ixgbe_netdev_ops = {
 	.ndo_open		= ixgbe_open,
 	.ndo_stop		= ixgbe_close,
@@ -6883,6 +6975,8 @@ static const struct net_device_ops ixgbe_netdev_ops = {
 	.ndo_fdb_add		= ixgbe_ndo_fdb_add,
 	.ndo_fdb_del		= ixgbe_ndo_fdb_del,
 	.ndo_fdb_dump		= ixgbe_ndo_fdb_dump,
+	.ndo_bridge_setlink	= ixgbe_ndo_bridge_setlink,
+	.ndo_bridge_getlink	= ixgbe_ndo_bridge_getlink,
 };
 
 static void __devinit ixgbe_probe_vf(struct ixgbe_adapter *adapter,
diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_sriov.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_sriov.c
index 2d971d1..bd932c6 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_sriov.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_sriov.c
@@ -135,6 +135,9 @@ void ixgbe_enable_sriov(struct ixgbe_adapter *adapter,
 		}
 	}
 
+	/* Initialize default switching mode VEB */
+	IXGBE_WRITE_REG(hw, IXGBE_PFDTXGSWC, IXGBE_PFDTXGSWC_VT_LBEN);
+
 	/* If call to enable VFs succeeded then allocate memory
 	 * for per VF control structures.
 	 */
diff --git a/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c b/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c
index f69ec42..f8d6f04 100644
--- a/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c
+++ b/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c
@@ -575,6 +575,16 @@ static bool ixgbevf_clean_rx_irq(struct ixgbevf_q_vector *q_vector,
 		}
 		skb->protocol = eth_type_trans(skb, adapter->netdev);
 
+		/* Workaround hardware that can't do proper VEPA multicast
+		 * source pruning.
+		 */
+		if ((skb->pkt_type & (PACKET_BROADCAST | PACKET_MULTICAST)) &&
+		    !(compare_ether_addr(adapter->netdev->dev_addr,
+					eth_hdr(skb)->h_source))) {
+			dev_kfree_skb_irq(skb);
+			goto next_desc;
+		}
+
 		ixgbevf_receive_skb(q_vector, skb, staterr, rx_ring, rx_desc);
 
 next_desc:

^ permalink raw reply related

* [RFC PATCH v1 2/3] net: add VEPA, VEB bridge mode
From: John Fastabend @ 2012-05-30  3:07 UTC (permalink / raw)
  To: krkumar2, hadi, shemminger, mst, buytenh, eilong
  Cc: sri, gregory.v.rose, netdev, bhutchings, jeffrey.t.kirsher,
	eric.w.multanen
In-Reply-To: <20120530030531.7443.72024.stgit@jf-dev1-dcblab>

Hardware switches may support enabling and disabling the
loopback switch which puts the device in a VEPA mode defined
in the IEEE 802.1Qbg specification. In this mode frames are
not switched in the hardware but sent directly to the switch.
SR-IOV capable NICs will likely support this mode; I am
aware of at least two such devices.

This patch adds an additional IFLA_BRIDGE_MODE attribute
that can be set and dumped via the PF_BRIDGE:{SET|GET}LINK
operations. Also anticipating bridge attributes that may
be common for both embedded bridges and software bridges
this adds a flags attribute IFLA_BRIDGE_FLAGS currently
used to determine if the IFLA_BRIDGE command or event is
being generated to/from an embedded bridge or software
bridge. Finally, the event generation is pulled out of
the bridge module and into rtnetlink proper.

For example using the macvlan driver in VEPA mode on top of
an embedded switch requires putting the embedded switch into
a VEPA mode to get the expected results.

	--------  --------
        | VEPA |  | VEPA |       <-- macvlan vepa edge relays
        --------  --------
           |        |
           |        |
        ------------------
        |      VEPA      |       <-- embedded switch in NIC
        ------------------
                |
                |
        -------------------
        | external switch |      <-- shiny new physical
	-------------------          switch with VEPA support

A packet sent from the macvlan VEPA at the top could be
loopbacked on the embedded switch and never seen by the
external switch. So in order for this to work the embedded
switch needs to be set in the VEPA state via the above
described commands.

CC: Lennert Buytenhek <buytenh@wantstofly.org>
CC: Stephen Hemminger <shemminger@vyatta.com>
Signed-off-by: John Fastabend <john.r.fastabend@intel.com>
---

 include/linux/if_link.h |   16 ++++++++++
 net/bridge/br_netlink.c |    2 -
 net/core/rtnetlink.c    |   73 ++++++++++++++++++++++++++++++++++++++++++-----
 3 files changed, 82 insertions(+), 9 deletions(-)

diff --git a/include/linux/if_link.h b/include/linux/if_link.h
index f715750..30489e5 100644
--- a/include/linux/if_link.h
+++ b/include/linux/if_link.h
@@ -139,6 +139,7 @@ enum {
 	IFLA_NET_NS_FD,
 	IFLA_EXT_MASK,		/* Extended info mask, VFs, etc */
 	IFLA_PROMISCUITY,	/* Promiscuity count: > 0 means acts PROMISC */
+	IFLA_BRIDGE,		/* Bridge attributes */
 #define IFLA_PROMISCUITY IFLA_PROMISCUITY
 	__IFLA_MAX
 };
@@ -396,4 +397,19 @@ struct ifla_port_vsi {
 	__u8 pad[3];
 };
 
+/* Bridge Flags */
+#define BRIDGE_FLAGS_MASTER	0	/* Bridge command to/from master */
+#define BRIDGE_FLAGS_SELF	1	/* Bridge command to/from lowerdev */
+
+#define BRIDGE_MODE_VEB		0	/* Default loopback mode */
+#define BRIDGE_MODE_VEPA	1	/* 802.1Qbg defined VEPA mode */
+
+/* Bridge management nested attributes */
+enum {
+	IFLA_BRIDGE_FLAGS,
+	IFLA_BRIDGE_MODE,
+	__IFLA_BRIDGE_MAX,
+};
+#define IFLA_BRIDGE_MAX (__IFLA_BRIDGE_MAX - 1)
+
 #endif /* _LINUX_IF_LINK_H */
diff --git a/net/bridge/br_netlink.c b/net/bridge/br_netlink.c
index f207234..8edbe0d 100644
--- a/net/bridge/br_netlink.c
+++ b/net/bridge/br_netlink.c
@@ -166,8 +166,6 @@ int br_setlink(struct net_device *dev, struct nlmsghdr *nlh)
 	br_port_state_selection(p->br);
 	spin_unlock_bh(&p->br->lock);
 
-	br_ifinfo_notify(RTM_NEWLINK, p);
-
 	return 0;
 }
 
diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index de6c371..9cd50ab 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -1121,6 +1121,7 @@ const struct nla_policy ifla_policy[IFLA_MAX+1] = {
 	[IFLA_AF_SPEC]		= { .type = NLA_NESTED },
 	[IFLA_EXT_MASK]		= { .type = NLA_U32 },
 	[IFLA_PROMISCUITY]	= { .type = NLA_U32 },
+	[IFLA_BRIDGE]		= { .type = NLA_NESTED },
 };
 EXPORT_SYMBOL(ifla_policy);
 
@@ -1158,6 +1159,11 @@ static const struct nla_policy ifla_port_policy[IFLA_PORT_MAX+1] = {
 	[IFLA_PORT_RESPONSE]	= { .type = NLA_U16, },
 };
 
+static const struct nla_policy bridge_policy[IFLA_BRIDGE_MAX + 1] = {
+	[IFLA_BRIDGE_FLAGS]	= { .type = NLA_U16 },
+	[IFLA_BRIDGE_MODE]	= { .type = NLA_U16 },
+};
+
 struct net *rtnl_link_get_net(struct net *src_net, struct nlattr *tb[])
 {
 	struct net *net;
@@ -2280,13 +2286,60 @@ static int rtnl_bridge_getlink(struct sk_buff *skb, struct netlink_callback *cb)
 	return skb->len;
 }
 
+static inline size_t bridge_nlmsg_size(void)
+{
+	return NLMSG_ALIGN(sizeof(struct ifinfomsg))
+		+ nla_total_size(IFNAMSIZ)	/* IFLA_IFNAME */
+		+ nla_total_size(MAX_ADDR_LEN)	/* IFLA_ADDRESS */
+		+ nla_total_size(sizeof(u32))	/* IFLA_MASTER */
+		+ nla_total_size(sizeof(u32))	/* IFLA_MTU */
+		+ nla_total_size(sizeof(u32))	/* IFLA_LINK */
+		+ nla_total_size(sizeof(u32))	/* IFLA_OPERSTATE */
+		+ nla_total_size(sizeof(u8))	/* IFLA_PROTINFO */
+		+ nla_total_size(sizeof(struct nlattr))	/* IFLA_BRIDGE */
+		+ nla_total_size(sizeof(u16))	/* IFLA_BRIDGE_FLAGS */
+		+ nla_total_size(sizeof(u16));	/* IFLA_BRIDGE_MODE */
+}
+
+static int rtnl_bridge_notify(struct net_device *dev, u16 flags)
+{
+	struct net *net = dev_net(dev);
+	struct net_device *master = dev->master;
+	struct sk_buff *skb;
+	int err = -EOPNOTSUPP;
+
+	skb = nlmsg_new(bridge_nlmsg_size(), GFP_ATOMIC);
+	if (!skb) {
+		err = -ENOMEM;
+		goto errout;
+	}
+
+	if (!flags && master && master->netdev_ops->ndo_bridge_getlink)
+		err = master->netdev_ops->ndo_bridge_getlink(skb, 0, 0, dev);
+	else if (dev->netdev_ops->ndo_bridge_getlink)
+		err = dev->netdev_ops->ndo_bridge_getlink(skb, 0, 0, dev);
+
+	if (err < 0)
+		goto errout;
+
+	rtnl_notify(skb, net, 0, RTNLGRP_LINK, NULL, GFP_ATOMIC);
+	return 0;
+errout:
+	WARN_ON(err == -EMSGSIZE);
+	kfree_skb(skb);
+	rtnl_set_sk_err(net, RTNLGRP_LINK, err);
+	return err;
+}
+
 static int rtnl_bridge_setlink(struct sk_buff *skb, struct nlmsghdr *nlh,
 			       void *arg)
 {
 	struct net *net = sock_net(skb->sk);
 	struct ifinfomsg *ifm;
 	struct net_device *dev;
-	int err = -EINVAL;
+	struct nlattr *bridge, *attr;
+	int rem, err = -EOPNOTSUPP;
+	u16 flags = 0;
 
 	if (nlmsg_len(nlh) < sizeof(*ifm))
 		return -EINVAL;
@@ -2301,16 +2354,22 @@ static int rtnl_bridge_setlink(struct sk_buff *skb, struct nlmsghdr *nlh,
 		return -ENODEV;
 	}
 
-	if (dev->master && dev->master->netdev_ops->ndo_bridge_setlink) {
-		err = dev->master->netdev_ops->ndo_bridge_setlink(dev, nlh);
-		if (err)
-			goto out;
+	bridge = nlmsg_find_attr(nlh, sizeof(struct ifinfomsg), IFLA_BRIDGE);
+	nla_for_each_nested(attr, bridge, rem) {
+		if (nla_type(attr) == IFLA_BRIDGE_FLAGS)
+			flags = nla_get_u16(attr);
 	}
 
-	if (dev->netdev_ops->ndo_bridge_setlink)
+	if (!flags && dev->master &&
+	    dev->master->netdev_ops->ndo_bridge_setlink)
+		err = dev->master->netdev_ops->ndo_bridge_setlink(dev, nlh);
+	else if ((flags & BRIDGE_FLAGS_SELF) &&
+		   dev->netdev_ops->ndo_bridge_setlink)
 		err = dev->netdev_ops->ndo_bridge_setlink(dev, nlh);
 
-out:
+	/* Generate event to notify upper layer of bridge change */
+	if (!err)
+		err = rtnl_bridge_notify(dev, flags);
 	return err;
 }
 

^ permalink raw reply related

* [RFC PATCH v1 1/3] net: create generic bridge ops
From: John Fastabend @ 2012-05-30  3:07 UTC (permalink / raw)
  To: krkumar2, hadi, shemminger, mst, buytenh, eilong
  Cc: sri, gregory.v.rose, netdev, bhutchings, jeffrey.t.kirsher,
	eric.w.multanen
In-Reply-To: <20120530030531.7443.72024.stgit@jf-dev1-dcblab>

The PF_BRIDGE:RTM_{GET|SET}LINK nlmsg family and type are
currently embedded in the ./net/bridge module. This prohibits
them from being used by other bridging devices. One example
of this is hardware that has embedded bridging components.

In order to use these nlmsg types more generically this patch
adds two net_device_ops hooks. One to set link bridge attributes
and another to dump the current bridge attributes.

	ndo_bridge_setlink()
	ndo_bridge_getlink()

This avoids adding many ndo_ops to the net_device but does
require drivers do more nlmsg handling.

CC: Lennert Buytenhek <buytenh@wantstofly.org>
CC: Stephen Hemminger <shemminger@vyatta.com>
Signed-off-by: John Fastabend <john.r.fastabend@intel.com>
---

 include/linux/netdevice.h |   10 ++++++
 net/bridge/br_device.c    |    2 +
 net/bridge/br_netlink.c   |   73 ++++++++----------------------------------
 net/bridge/br_private.h   |    3 ++
 net/core/rtnetlink.c      |   78 +++++++++++++++++++++++++++++++++++++++++++++
 5 files changed, 106 insertions(+), 60 deletions(-)

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index e7fd468..a307d77 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -916,6 +916,10 @@ struct netdev_fcoe_hbainfo {
  *		       struct net_device *dev, int idx)
  *	Used to add FDB entries to dump requests. Implementers should add
  *	entries to skb and update idx with the number of entries.
+ *
+ * int (*ndo_bridge_setlink)(struct net_device *dev, struct nlmsghdr *nlh)
+ * int (*ndo_bridge_getlink)(struct sk_buff *skb, u32 pid, u32 seq,
+ *			     struct net_device *dev)
  */
 struct net_device_ops {
 	int			(*ndo_init)(struct net_device *dev);
@@ -1025,6 +1029,12 @@ struct net_device_ops {
 						struct netlink_callback *cb,
 						struct net_device *dev,
 						int idx);
+
+	int			(*ndo_bridge_setlink)(struct net_device *dev,
+						      struct nlmsghdr *nlh);
+	int			(*ndo_bridge_getlink)(struct sk_buff *skb,
+						      u32 pid, u32 seq,
+						      struct net_device *dev);
 };
 
 /*
diff --git a/net/bridge/br_device.c b/net/bridge/br_device.c
index 929e48aed..e942180 100644
--- a/net/bridge/br_device.c
+++ b/net/bridge/br_device.c
@@ -320,6 +320,8 @@ static const struct net_device_ops br_netdev_ops = {
 	.ndo_fdb_add		 = br_fdb_add,
 	.ndo_fdb_del		 = br_fdb_delete,
 	.ndo_fdb_dump		 = br_fdb_dump,
+	.ndo_bridge_getlink	 = br_getlink,
+	.ndo_bridge_setlink	 = br_setlink,
 };
 
 static void br_dev_free(struct net_device *dev)
diff --git a/net/bridge/br_netlink.c b/net/bridge/br_netlink.c
index 2080485..f207234 100644
--- a/net/bridge/br_netlink.c
+++ b/net/bridge/br_netlink.c
@@ -111,54 +111,33 @@ errout:
 /*
  * Dump information about all ports, in response to GETLINK
  */
-static int br_dump_ifinfo(struct sk_buff *skb, struct netlink_callback *cb)
+int br_getlink(struct sk_buff *skb, u32 pid, u32 seq,
+	       struct net_device *dev)
 {
-	struct net *net = sock_net(skb->sk);
-	struct net_device *dev;
-	int idx;
-
-	idx = 0;
-	rcu_read_lock();
-	for_each_netdev_rcu(net, dev) {
-		struct net_bridge_port *port = br_port_get_rcu(dev);
-
-		/* not a bridge port */
-		if (!port || idx < cb->args[0])
-			goto skip;
-
-		if (br_fill_ifinfo(skb, port,
-				   NETLINK_CB(cb->skb).pid,
-				   cb->nlh->nlmsg_seq, RTM_NEWLINK,
-				   NLM_F_MULTI) < 0)
-			break;
-skip:
-		++idx;
-	}
-	rcu_read_unlock();
-	cb->args[0] = idx;
+	int err = 0;
+	struct net_bridge_port *port = br_port_get_rcu(dev);
+
+	/* not a bridge port */
+	if (!port)
+		goto out;
 
-	return skb->len;
+	err = br_fill_ifinfo(skb, port, pid, seq, RTM_NEWLINK, NLM_F_MULTI);
+out:
+	return err;
 }
 
 /*
  * Change state of port (ie from forwarding to blocking etc)
  * Used by spanning tree in user space.
  */
-static int br_rtm_setlink(struct sk_buff *skb,  struct nlmsghdr *nlh, void *arg)
+int br_setlink(struct net_device *dev, struct nlmsghdr *nlh)
 {
-	struct net *net = sock_net(skb->sk);
 	struct ifinfomsg *ifm;
 	struct nlattr *protinfo;
-	struct net_device *dev;
 	struct net_bridge_port *p;
 	u8 new_state;
 
-	if (nlmsg_len(nlh) < sizeof(*ifm))
-		return -EINVAL;
-
 	ifm = nlmsg_data(nlh);
-	if (ifm->ifi_family != AF_BRIDGE)
-		return -EPFNOSUPPORT;
 
 	protinfo = nlmsg_find_attr(nlh, sizeof(*ifm), IFLA_PROTINFO);
 	if (!protinfo || nla_len(protinfo) < sizeof(u8))
@@ -168,10 +147,6 @@ static int br_rtm_setlink(struct sk_buff *skb,  struct nlmsghdr *nlh, void *arg)
 	if (new_state > BR_STATE_BLOCKING)
 		return -EINVAL;
 
-	dev = __dev_get_by_index(net, ifm->ifi_index);
-	if (!dev)
-		return -ENODEV;
-
 	p = br_port_get_rtnl(dev);
 	if (!p)
 		return -EINVAL;
@@ -218,29 +193,7 @@ static struct rtnl_link_ops br_link_ops __read_mostly = {
 
 int __init br_netlink_init(void)
 {
-	int err;
-
-	err = rtnl_link_register(&br_link_ops);
-	if (err < 0)
-		goto err1;
-
-	err = __rtnl_register(PF_BRIDGE, RTM_GETLINK, NULL,
-			      br_dump_ifinfo, NULL);
-	if (err)
-		goto err2;
-	err = __rtnl_register(PF_BRIDGE, RTM_SETLINK,
-			      br_rtm_setlink, NULL, NULL);
-	if (err)
-		goto err3;
-
-	return 0;
-
-err3:
-	rtnl_unregister_all(PF_BRIDGE);
-err2:
-	rtnl_link_unregister(&br_link_ops);
-err1:
-	return err;
+	return rtnl_link_register(&br_link_ops);
 }
 
 void __exit br_netlink_fini(void)
diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h
index 1a8ad4f..659907c 100644
--- a/net/bridge/br_private.h
+++ b/net/bridge/br_private.h
@@ -552,6 +552,9 @@ extern int (*br_fdb_test_addr_hook)(struct net_device *dev, unsigned char *addr)
 extern int br_netlink_init(void);
 extern void br_netlink_fini(void);
 extern void br_ifinfo_notify(int event, struct net_bridge_port *port);
+extern int br_setlink(struct net_device *dev, struct nlmsghdr *nlmsg);
+extern int br_getlink(struct sk_buff *skb, u32 pid, u32 seq,
+		      struct net_device *dev);
 
 #ifdef CONFIG_SYSFS
 /* br_sysfs_if.c */
diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index 21318d1..de6c371 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -2239,6 +2239,81 @@ static int rtnl_fdb_dump(struct sk_buff *skb, struct netlink_callback *cb)
 	return skb->len;
 }
 
+static int rtnl_bridge_getlink(struct sk_buff *skb, struct netlink_callback *cb)
+{
+	struct net *net = sock_net(skb->sk);
+	struct net_device *dev;
+	int idx = 0;
+	u32 pid = NETLINK_CB(cb->skb).pid;
+	u32 seq = cb->nlh->nlmsg_seq;
+
+	rcu_read_lock();
+	for_each_netdev_rcu(net, dev) {
+		const struct net_device_ops *ops = dev->netdev_ops;
+		struct net_device *master = dev->master;
+
+		if (idx < cb->args[0])
+			continue;
+
+		if (master && master->netdev_ops->ndo_bridge_getlink) {
+			const struct net_device_ops *bops = master->netdev_ops;
+			int err = bops->ndo_bridge_getlink(skb, pid, seq, dev);
+
+			if (err < 0)
+				break;
+			else
+				idx++;
+		}
+
+		if (ops->ndo_bridge_getlink) {
+			int err = ops->ndo_bridge_getlink(skb, pid, seq, dev);
+
+			if (err < 0)
+				break;
+			else
+				idx++;
+		}
+	}
+	rcu_read_unlock();
+	cb->args[0] = idx;
+
+	return skb->len;
+}
+
+static int rtnl_bridge_setlink(struct sk_buff *skb, struct nlmsghdr *nlh,
+			       void *arg)
+{
+	struct net *net = sock_net(skb->sk);
+	struct ifinfomsg *ifm;
+	struct net_device *dev;
+	int err = -EINVAL;
+
+	if (nlmsg_len(nlh) < sizeof(*ifm))
+		return -EINVAL;
+
+	ifm = nlmsg_data(nlh);
+	if (ifm->ifi_family != AF_BRIDGE)
+		return -EPFNOSUPPORT;
+
+	dev = __dev_get_by_index(net, ifm->ifi_index);
+	if (!dev) {
+		pr_info("PF_BRIDGE: RTM_SETLINK with unknown ifindex\n");
+		return -ENODEV;
+	}
+
+	if (dev->master && dev->master->netdev_ops->ndo_bridge_setlink) {
+		err = dev->master->netdev_ops->ndo_bridge_setlink(dev, nlh);
+		if (err)
+			goto out;
+	}
+
+	if (dev->netdev_ops->ndo_bridge_setlink)
+		err = dev->netdev_ops->ndo_bridge_setlink(dev, nlh);
+
+out:
+	return err;
+}
+
 /* Protected by RTNL sempahore.  */
 static struct rtattr **rta_buf;
 static int rtattr_max;
@@ -2415,5 +2490,8 @@ void __init rtnetlink_init(void)
 	rtnl_register(PF_BRIDGE, RTM_NEWNEIGH, rtnl_fdb_add, NULL, NULL);
 	rtnl_register(PF_BRIDGE, RTM_DELNEIGH, rtnl_fdb_del, NULL, NULL);
 	rtnl_register(PF_BRIDGE, RTM_GETNEIGH, NULL, rtnl_fdb_dump, NULL);
+
+	rtnl_register(PF_BRIDGE, RTM_GETLINK, NULL, rtnl_bridge_getlink, NULL);
+	rtnl_register(PF_BRIDGE, RTM_SETLINK, rtnl_bridge_setlink, NULL, NULL);
 }
 

^ permalink raw reply related

* [RFC PATCH v1 0/3] Expose switching attributes via PF_BRIDGE
From: John Fastabend @ 2012-05-30  3:06 UTC (permalink / raw)
  To: krkumar2, hadi, shemminger, mst, buytenh, eilong
  Cc: sri, gregory.v.rose, netdev, bhutchings, jeffrey.t.kirsher,
	eric.w.multanen

This series decouples the remaining netlink PF_BRIDGE messages
from the bridging module and moves them into rtnetlink proper.
By doing this we can use these netlink messages to handle any
type of bridge and extend the attributes as needed.

I hope this resolves some of the concerns with the DSA patch
below and expect the attached series can be extended to
support the DSA infrastructure as needed:

http://patchwork.ozlabs.org/patch/16578/

Also this should resolve a patch here that tried to expose
the switching modes but did so using a device specific hook:

http://lists.openwall.net/netdev/2012/04/16/10

I've used a hacked version of the 'bridge' tool Stephen
Hemminger submitted as an RFC some months back to test this;
the output looks like this:

[root@jf-dev1-dcblab iproute2]# ./br/br bridge show
eth2: bridge mode: VEB		embedded
eth3: bridge mode: VEB		embedded
[root@jf-dev1-dcblab iproute2]# ./br/br bridge mode dev eth2 mode vepa
[root@jf-dev1-dcblab iproute2]# ./br/br bridge show
eth2: bridge mode: VEPA		embedded
eth3: bridge mode: VEB		embedded

I could have just added a ndo op and IFLA_XXX message to
set the switching mode but IMHO this is not going to scale
as more bridging functionality becomes offloaded. The DSA
example is a case where we already have a fully offloaded
switch. Any solution we come up with should support both
embedded switches and SW switches.

Any comments would be appreciated. Thanks!

---

John Fastabend (3):
      ixgbe: add setlink, getlink support to ixgbe and ixgbevf
      net: add VEPA, VEB bridge mode
      net: create generic bridge ops


 drivers/net/ethernet/intel/ixgbe/ixgbe_main.c     |  100 +++++++++++++++
 drivers/net/ethernet/intel/ixgbe/ixgbe_sriov.c    |    3 
 drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c |   10 ++
 include/linux/if_link.h                           |   16 ++
 include/linux/netdevice.h                         |   10 ++
 net/bridge/br_device.c                            |    2 
 net/bridge/br_netlink.c                           |   75 ++---------
 net/bridge/br_private.h                           |    3 
 net/core/rtnetlink.c                              |  137 +++++++++++++++++++++
 9 files changed, 291 insertions(+), 65 deletions(-)

-- 
Signature

^ permalink raw reply

* Re: [PATCH 0/3] net: implement auto-loading of generic netlink modules
From: David Miller @ 2012-05-30  2:34 UTC (permalink / raw)
  To: nhorman; +Cc: netdev, eric.dumazet, jchapman
In-Reply-To: <1338319842-18395-1-git-send-email-nhorman@tuxdriver.com>

From: Neil Horman <nhorman@tuxdriver.com>
Date: Tue, 29 May 2012 15:30:39 -0400

> 
> Eric D. recently noted that the drop_monitor module didn't autoload when the
> dropwatch user space utility started.  Looking into this I noted that there's no
> formal macro set to define module aliases that can be used by a request module
> call in the generic netlink family lookup path.  Currently the
> net-pf-*-proto-*-type-<n> format is used, but the macros which form this expect
> <n> to be a well defined integer, which generic netlink doesn't use for family
> definitions.  So this series creates a new macro that creates a
> net-pf-*-proto-*-name format where name can be any arbitrary string, allowing us
> to append family-<x> where x is a generic netlink family name.  With these
> macros, we can easily autoload modules that register generic netlink families.
> 
> Signed-off-by: Neil Horman <nhorman@tuxdriver.com>

Looks great, all applied, thanks Neil.

^ permalink raw reply

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox