Netdev List
 help / color / mirror / Atom feed
* [RFC PATCH 4/4] inet: use second hash in inet_csk_get_port
From: Alexandru Copot @ 2012-05-30  7:36 UTC (permalink / raw)
  To: davem
  Cc: gerrit, kuznet, jmorris, yoshfuji, kaber, netdev, Alexandru Copot,
	Daniel Baluta, Lucian Grijincu
In-Reply-To: <1338363410-6562-1-git-send-email-alex.mihai.c@gmail.com>

This results in a massive improvement for both the bind() and listen()
system calls (both call inet_csk_get_port) when there are many sockets
bound to the same port but to different addresses.

Tests were run with 16000 subinterfaces, each with a distinct
IPv4 address. The sockets are first bound to the same port and
then put into the listening state with listen().

* Without patch and without SO_REUSEADDR:
    * bind:   1.543 s
    * listen: 3.050 s

* Without patch and with SO_REUSEADDR set:
    * bind:   0.066 s
    * listen: 3.050 s

* With patch and SO_REUSEADDR set / without SO_REUSEADDR:
    * bind:   0.066 s
    * listen: 0.095 s

Signed-off-by: Alexandru Copot <alex.mihai.c@gmail.com>
Cc: Daniel Baluta <dbaluta@ixiacom.com>
Cc: Lucian Grijincu <lucian.grijincu@gmail.com>
---
 include/net/inet_hashtables.h   |   48 +++++++++++++++
 net/ipv4/inet_connection_sock.c |   63 ++++++++------------
 net/ipv4/inet_hashtables.c      |  125 ++++++++++++++++++++++++++++++++++++++-
 net/ipv6/inet6_hashtables.c     |   95 +++++++++++++++++++++++++++++
 4 files changed, 292 insertions(+), 39 deletions(-)

diff --git a/include/net/inet_hashtables.h b/include/net/inet_hashtables.h
index bc06168..2f589bb 100644
--- a/include/net/inet_hashtables.h
+++ b/include/net/inet_hashtables.h
@@ -81,6 +81,15 @@ struct inet_bind_bucket {
 	struct net		*ib_net;
 #endif
 	unsigned short		port;
+	union {
+		struct in6_addr ib_addr_ipv6;
+		struct {
+			__be32	_1;
+			__be32	_2;
+			__be32	_3;
+			__be32	ib_addr_ipv4;
+		};
+	};
 	signed short		fastreuse;
 	int			num_owners;
 	struct hlist_node	node;
@@ -226,6 +235,7 @@ static inline void inet_ehash_locks_free(struct inet_hashinfo *hashinfo)
 
 extern struct inet_bind_bucket *
 	    inet_bind_bucket_create(struct kmem_cache *cachep,
+				    struct sock *sk,
 				    struct net *net,
 				    struct inet_bind_hashbucket *head,
 				    struct inet_bind_hashbucket *portaddr_head,
@@ -257,6 +267,14 @@ static inline struct inet_bind_hashbucket *
 	return &hinfo->portaddr_bhash[h & (hinfo->portaddr_bhash_size - 1)];
 }
 
+
+struct inet_bind_bucket *
+inet4_find_bind_buckets(struct sock *sk,
+			unsigned short port,
+			struct inet_bind_hashbucket **p_bhead,
+			struct inet_bind_hashbucket **p_portaddr_bhead);
+
+
 #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
 static inline unsigned int inet6_portaddr_bhashfn(struct net *net,
 						  const struct in6_addr *addr6,
@@ -283,6 +301,14 @@ static inline struct inet_bind_hashbucket *
 	unsigned int h = inet6_portaddr_bhashfn(net, addr6, port);
 	return &hinfo->portaddr_bhash[h & (hinfo->portaddr_bhash_size - 1)];
 }
+
+
+struct inet_bind_bucket *
+	inet6_find_bind_buckets(struct sock *sk,
+				unsigned short port,
+				struct inet_bind_hashbucket **p_bhead,
+				struct inet_bind_hashbucket **p_portaddr_bhead);
+
 #endif
 
 
@@ -306,6 +332,28 @@ static inline struct inet_bind_hashbucket *
 	return inet4_portaddr_hashbucket(hinfo, net, INADDR_ANY, port);
 }
 
+
+static inline struct inet_bind_bucket *
+	inet_find_bind_buckets(struct sock *sk,
+			       unsigned short port,
+			       struct inet_bind_hashbucket **p_bhead,
+			       struct inet_bind_hashbucket **p_portaddr_bhead)
+{
+	switch (sk->sk_family) {
+	case AF_INET:
+		return inet4_find_bind_buckets(sk, port, p_bhead,
+				p_portaddr_bhead);
+#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
+	case AF_INET6:
+		return inet6_find_bind_buckets(sk, port, p_bhead,
+				p_portaddr_bhead);
+#endif
+	}
+	WARN(1, "unrecognised sk->sk_family in inet_portaddr_hashbucket");
+	return NULL;
+}
+
+
 extern void inet_bind_hash(struct sock *sk, struct inet_bind_bucket *tb,
 			   const unsigned short snum);
 
diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
index 336531a..bd92466 100644
--- a/net/ipv4/inet_connection_sock.c
+++ b/net/ipv4/inet_connection_sock.c
@@ -100,8 +100,7 @@ EXPORT_SYMBOL_GPL(inet_csk_bind_conflict);
 int inet_csk_get_port(struct sock *sk, unsigned short snum)
 {
 	struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo;
-	struct inet_bind_hashbucket *head;
-	struct hlist_node *node;
+	struct inet_bind_hashbucket *head, *portaddr_bhead;
 	struct inet_bind_bucket *tb;
 	int ret, attempts = 5;
 	struct net *net = sock_net(sk);
@@ -120,31 +119,26 @@ again:
 		do {
 			if (inet_is_reserved_local_port(rover))
 				goto next_nolock;
-			head = &hashinfo->bhash[inet_bhashfn(net, rover,
-					hashinfo->bhash_size)];
-			spin_lock(&head->lock);
-			inet_bind_bucket_for_each(tb, node, &head->chain)
-				if (net_eq(ib_net(tb), net) && tb->port == rover) {
-					if (tb->fastreuse > 0 &&
-					    sk->sk_reuse &&
-					    sk->sk_state != TCP_LISTEN &&
-					    (tb->num_owners < smallest_size || smallest_size == -1)) {
-						smallest_size = tb->num_owners;
-						smallest_rover = rover;
-						if (atomic_read(&hashinfo->bsockets) > (high - low) + 1 &&
-						    !inet_csk(sk)->icsk_af_ops->bind_conflict(sk, tb, false)) {
-							snum = smallest_rover;
-							goto tb_found;
-						}
-					}
-					if (!inet_csk(sk)->icsk_af_ops->bind_conflict(sk, tb, false)) {
-						snum = rover;
-						goto tb_found;
-					}
-					goto next;
+
+			tb = inet_find_bind_buckets(sk, rover, &head, &portaddr_bhead);
+			if (!tb)
+				break;
+			if (tb->fastreuse > 0 && sk->sk_reuse &&
+			    sk->sk_state != TCP_LISTEN &&
+			    (tb->num_owners < smallest_size || smallest_size == -1)) {
+				smallest_size = tb->num_owners;
+				smallest_rover = rover;
+				if (atomic_read(&hashinfo->bsockets) > (high - low) + 1 &&
+				    !inet_csk(sk)->icsk_af_ops->bind_conflict(sk, tb, false)) {
+					snum = smallest_rover;
+					goto tb_found;
 				}
-			break;
-		next:
+			}
+			if (!inet_csk(sk)->icsk_af_ops->bind_conflict(sk, tb, false)) {
+				snum = rover;
+				goto tb_found;
+			}
+			spin_unlock(&portaddr_bhead->lock);
 			spin_unlock(&head->lock);
 		next_nolock:
 			if (++rover > high)
@@ -171,12 +165,9 @@ again:
 		snum = rover;
 	} else {
 have_snum:
-		head = &hashinfo->bhash[inet_bhashfn(net, snum,
-				hashinfo->bhash_size)];
-		spin_lock(&head->lock);
-		inet_bind_bucket_for_each(tb, node, &head->chain)
-			if (net_eq(ib_net(tb), net) && tb->port == snum)
-				goto tb_found;
+		tb = inet_find_bind_buckets(sk, snum, &head, &portaddr_bhead);
+		if (tb)
+			goto tb_found;
 	}
 	tb = NULL;
 	goto tb_not_found;
@@ -194,6 +185,7 @@ tb_found:
 			if (inet_csk(sk)->icsk_af_ops->bind_conflict(sk, tb, true)) {
 				if (sk->sk_reuse && sk->sk_state != TCP_LISTEN &&
 				    smallest_size != -1 && --attempts >= 0) {
+					spin_unlock(&portaddr_bhead->lock);
 					spin_unlock(&head->lock);
 					goto again;
 				}
@@ -205,12 +197,8 @@ tb_found:
 tb_not_found:
 	ret = 1;
 	if (!tb) {
-		struct inet_bind_hashbucket *portaddr_head;
-		portaddr_head = inet_portaddr_hashbucket(hashinfo, sk, snum);
-		spin_lock(&portaddr_head->lock);
 		tb = inet_bind_bucket_create(hashinfo->bind_bucket_cachep,
-				net, head, portaddr_head, snum);
-		spin_unlock(&portaddr_head->lock);
+				sk, net, head, portaddr_bhead, snum);
 		if (!tb)
 			goto fail_unlock;
 	}
@@ -229,6 +217,7 @@ success:
 	ret = 0;
 
 fail_unlock:
+	spin_unlock(&portaddr_bhead->lock);
 	spin_unlock(&head->lock);
 fail:
 	local_bh_enable();
diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c
index edb2a4e..26c7f9d 100644
--- a/net/ipv4/inet_hashtables.c
+++ b/net/ipv4/inet_hashtables.c
@@ -29,6 +29,7 @@
  * The bindhash mutex for snum's hash chain must be held here.
  */
 struct inet_bind_bucket *inet_bind_bucket_create(struct kmem_cache *cachep,
+						 struct sock *sk,
 						 struct net *net,
 						 struct inet_bind_hashbucket *head,
 						 struct inet_bind_hashbucket *portaddr_head,
@@ -37,6 +38,32 @@ struct inet_bind_bucket *inet_bind_bucket_create(struct kmem_cache *cachep,
 	struct inet_bind_bucket *tb = kmem_cache_alloc(cachep, GFP_ATOMIC);
 
 	if (tb != NULL) {
+		switch (sk->sk_family) {
+		case AF_INET:
+			/* ::ffff:x.y.z.t is the IPv4-mapped IPv6 address for
+			 * IPv4 address x.y.z.t, but only if it's not the any addr */
+			if (INADDR_ANY == sk_rcv_saddr(sk))
+				memset(&tb->ib_addr_ipv6, 0, sizeof(struct in6_addr));
+			else
+				ipv6_addr_set(&tb->ib_addr_ipv6, 0, 0,
+					      htonl(0x0000FFFF),
+					      sk_rcv_saddr(sk));
+
+			/* if no alignment problems appear, the IPv4 address
+			 * should be written to ib_addr_ipv6. If this gets
+			 * triggered check the inet_bind_bucket structure. */
+			WARN_ON(tb->ib_addr_ipv4 != sk_rcv_saddr(sk));
+			break;
+#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
+		case AF_INET6:
+			memcpy(&tb->ib_addr_ipv6, &inet6_sk(sk)->rcv_saddr,
+					sizeof(struct in6_addr));
+			break;
+#endif
+		default:
+			WARN(1, "unrecognised sk_family in inet_bind_bucket_create");
+		}
+
 		write_pnet(&tb->ib_net, hold_net(net));
 		tb->port      = snum;
 		tb->fastreuse = 0;
@@ -142,8 +169,10 @@ int __inet_inherit_port(struct sock *sk, struct sock *child)
 				break;
 		}
 		if (!node) {
+			portaddr_head = inet_portaddr_hashbucket(table, sk, tb->port);
+
 			tb = inet_bind_bucket_create(table->bind_bucket_cachep,
-						     sock_net(sk), head,
+						     sk, sock_net(sk), head,
 						     portaddr_head, port);
 			if (!tb) {
 				spin_unlock(&head->lock);
@@ -521,7 +550,7 @@ int __inet_hash_connect(struct inet_timewait_death_row *death_row,
 			portaddr_head = inet_portaddr_hashbucket(hinfo, sk, port);
 			spin_lock(&portaddr_head->lock);
 			tb = inet_bind_bucket_create(hinfo->bind_bucket_cachep,
-					net, head, portaddr_head, port);
+					sk, net, head, portaddr_head, port);
 			spin_unlock(&portaddr_head->lock);
 
 			if (!tb) {
@@ -584,6 +613,98 @@ out:
 	}
 }
 
+struct inet_bind_bucket *
+inet4_find_bind_buckets(struct sock *sk,
+			unsigned short port,
+			struct inet_bind_hashbucket **p_bhead,
+			struct inet_bind_hashbucket **p_portaddr_bhead)
+{
+	struct net *net = sock_net(sk);
+	struct inet_hashinfo *hinfo = sk->sk_prot->h.hashinfo;
+	struct inet_bind_bucket *tb = NULL;
+	struct hlist_node *node;
+
+	struct inet_bind_hashbucket *bhead, *portaddr_bhead, *portaddrany_bhead;
+	bhead = &hinfo->bhash[inet_bhashfn(net, port, hinfo->bhash_size)];
+	portaddr_bhead = inet4_portaddr_hashbucket(hinfo, net,
+				sk_rcv_saddr(sk), port);
+	portaddrany_bhead = inet4_portaddr_hashbucket(hinfo, net,
+						INADDR_ANY, port);
+
+	*p_portaddr_bhead = portaddr_bhead;
+	*p_bhead = bhead;
+
+	/*
+	 * prevent deadlocks by always taking locks in a fixed order:
+	 * - always take the port-only lock first. This is done because in some
+	 *   other places this is the lock taken, being followed in only some
+	 *   cases by the portaddr lock.
+	 * - between portaddr and portaddrany always choose the one with the
+	 *   lower address. Unlock ordering is not important, as long as the
+	 *   locking order is consistent.
+	 * - make sure to not take the same lock twice
+	 */
+	spin_lock(&bhead->lock);
+	if (portaddr_bhead > portaddrany_bhead) {
+		spin_lock(&portaddrany_bhead->lock);
+		spin_lock(&portaddr_bhead->lock);
+	} else if (portaddr_bhead < portaddrany_bhead) {
+		spin_lock(&portaddr_bhead->lock);
+		spin_lock(&portaddrany_bhead->lock);
+	} else {
+		spin_lock(&portaddr_bhead->lock);
+	}
+
+	if (sk_rcv_saddr(sk) != INADDR_ANY) {
+		struct inet_bind_hashbucket *_head;
+
+		_head = portaddr_bhead;
+		if (bhead->count < portaddr_bhead->count) {
+			_head = bhead;
+			inet_bind_bucket_for_each(tb, node, &_head->chain)
+				if ((net_eq(ib_net(tb), net)) &&
+				    (tb->port == port) &&
+				    (tb->ib_addr_ipv4 == sk_rcv_saddr(sk)))
+					goto found;
+		} else {
+			inet_portaddr_bind_bucket_for_each(tb, node, &_head->chain)
+				if ((net_eq(ib_net(tb), net)) &&
+				    (tb->port == port) &&
+				    (tb->ib_addr_ipv4 == sk_rcv_saddr(sk)))
+					goto found;
+		}
+		_head = portaddrany_bhead;
+		if (bhead->count < portaddrany_bhead->count) {
+			_head = bhead;
+			inet_bind_bucket_for_each(tb, node, &_head->chain)
+				if ((ib_net(tb) == net) &&
+				    (tb->port == port) &&
+				    (tb->ib_addr_ipv4 == INADDR_ANY))
+					goto found;
+		} else {
+			inet_portaddr_bind_bucket_for_each(tb, node, &_head->chain)
+				if ((ib_net(tb) == net) &&
+				    (tb->port == port) &&
+				    (tb->ib_addr_ipv4 == INADDR_ANY))
+					goto found;
+		}
+	} else {
+		inet_bind_bucket_for_each(tb, node, &bhead->chain)
+			if ((ib_net(tb) == net) && (tb->port == port))
+				goto found;
+	}
+
+	tb = NULL;
+found:
+	if (portaddr_bhead != portaddrany_bhead)
+		spin_unlock(&portaddrany_bhead->lock);
+
+	/* the other locks remain taken, as the caller
+	 * may want to change the hash tables */
+	return tb;
+}
+
+
 /*
  * Bind a port for a connect operation and hash it.
  */
diff --git a/net/ipv6/inet6_hashtables.c b/net/ipv6/inet6_hashtables.c
index 73f1a00..62f1eff 100644
--- a/net/ipv6/inet6_hashtables.c
+++ b/net/ipv6/inet6_hashtables.c
@@ -294,6 +294,101 @@ static inline u32 inet6_sk_port_offset(const struct sock *sk)
 					  inet->inet_dport);
 }
 
+
+struct inet_bind_bucket *
+inet6_find_bind_buckets(struct sock *sk,
+			unsigned short port,
+			struct inet_bind_hashbucket **p_bhead,
+			struct inet_bind_hashbucket **p_portaddr_bhead)
+{
+	struct net *net = sock_net(sk);
+	struct inet_hashinfo *hinfo = sk->sk_prot->h.hashinfo;
+	struct inet_bind_bucket *tb = NULL;
+	struct hlist_node *node;
+
+	struct inet_bind_hashbucket *bhead, *portaddr_bhead, *portaddrany_bhead;
+	bhead = &hinfo->bhash[inet_bhashfn(net, port, hinfo->bhash_size)];
+	portaddr_bhead = inet6_portaddr_hashbucket(hinfo, net,
+				inet6_rcv_saddr(sk), port);
+	portaddrany_bhead = inet6_portaddr_hashbucket(hinfo, net,
+				&in6addr_any, port);
+
+	*p_portaddr_bhead = portaddr_bhead;
+	*p_bhead = bhead;
+
+	/*
+	 * prevent deadlocks by always taking locks in a fixed order:
+	 * - always take the port-only lock first. This is done because in some
+	 *   other places this is the lock taken, being followed in only some
+	 *   cases by the portaddr lock.
+	 * - between portaddr and portaddrany always choose the one with the
+	 *   lower address. Unlock ordering is not important, as long as the
+	 *   locking order is consistent.
+	 * - make sure to not take the same lock twice
+	 */
+	spin_lock(&bhead->lock);
+	if (portaddr_bhead > portaddrany_bhead) {
+		spin_lock(&portaddrany_bhead->lock);
+		spin_lock(&portaddr_bhead->lock);
+	} else if (portaddr_bhead < portaddrany_bhead) {
+		spin_lock(&portaddr_bhead->lock);
+		spin_lock(&portaddrany_bhead->lock);
+	} else {
+		spin_lock(&portaddr_bhead->lock);
+	}
+
+	if (ipv6_addr_any(inet6_rcv_saddr(sk))) {
+		struct inet_bind_hashbucket *_head;
+
+		_head = portaddr_bhead;
+		if (bhead->count < portaddr_bhead->count) {
+			_head = bhead;
+			inet_bind_bucket_for_each(tb, node, &_head->chain)
+				if ((net_eq(ib_net(tb), net)) &&
+				    (tb->port == port) &&
+				    ipv6_addr_equal(&tb->ib_addr_ipv6,
+						    inet6_rcv_saddr(sk)))
+					goto found;
+		} else {
+			inet_portaddr_bind_bucket_for_each(tb, node, &_head->chain)
+				if ((net_eq(ib_net(tb), net)) &&
+				    (tb->port == port) &&
+				    ipv6_addr_equal(&tb->ib_addr_ipv6,
+						    inet6_rcv_saddr(sk)))
+					goto found;
+		}
+		_head = portaddrany_bhead;
+		if (bhead->count < portaddrany_bhead->count) {
+			_head = bhead;
+			inet_bind_bucket_for_each(tb, node, &_head->chain)
+				if ((ib_net(tb) == net) &&
+				    (tb->port == port) &&
+				    ipv6_addr_any(&tb->ib_addr_ipv6))
+					goto found;
+		} else {
+			inet_portaddr_bind_bucket_for_each(tb, node, &_head->chain)
+				if ((ib_net(tb) == net) &&
+				    (tb->port == port) &&
+				    ipv6_addr_any(&tb->ib_addr_ipv6))
+					goto found;
+		}
+	} else {
+		inet_bind_bucket_for_each(tb, node, &bhead->chain)
+			if ((ib_net(tb) == net) && (tb->port == port))
+				goto found;
+	}
+
+	tb = NULL;
+found:
+	if (portaddr_bhead != portaddrany_bhead)
+		spin_unlock(&portaddrany_bhead->lock);
+
+	/* the other locks remain taken, as the caller
+	 * may want to change the hash tables */
+	return tb;
+}
+
+
 int inet6_hash_connect(struct inet_timewait_death_row *death_row,
 		       struct sock *sk)
 {
-- 
1.7.10.2

^ permalink raw reply related

* Re: [RFC PATCH 2/2] tcp: Early SYN limit and SYN cookie handling to mitigate SYN floods
From: Jesper Dangaard Brouer @ 2012-05-30  7:45 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: Andi Kleen, netdev, Christoph Paasch, David S. Miller,
	Martin Topholm, Florian Westphal, opurdila, Hans Schillstrom,
	Tom Herbert
In-Reply-To: <1338360073.2760.81.camel@edumazet-glaptop>

On Wed, 2012-05-30 at 08:41 +0200, Eric Dumazet wrote:
> On Tue, 2012-05-29 at 12:37 -0700, Andi Kleen wrote:
> 
> > So basically handling syncookie lockless? 
> > 
> > Makes sense. Syncookies is a bit obsolete these days of course, due
> > to the lack of options. But may be still useful for this.
> > 
> > Obviously you'll need to clean up the patch and support IPv6,
> > but the basic idea looks good to me.
> 
> Also TCP Fast Open should be a good way to make the SYN flood no more
> effective.

Sounds interesting, but TCP Fast Open is primarily concerned with
enabling data exchange during SYN establishment.  I don't see any
indication that they have implemented parallel SYN handling.

Implementing parallel SYN handling, should also benefit their work.
After studying this code path, I also see great performance benefit in
also optimizing the normal 3WHS on sock's in sk_state == LISTEN.
Perhaps we should split up the code path for LISTEN vs. ESTABLISHED, as
they are very entangled at the moment AFAIKS.

> Yuchung Cheng and Jerry Chu should upstream this code in a very near
> future.

Looking forward to see the code, and the fallout discussions, on
transferring data on SYN packets.


> Another way to mitigate SYN scalability issues before the full RCU
> solution I was cooking is to either :
> 
> 1) Use a hardware filter (like on Intel NICS) to force all SYN packets
> going to one queue (so that they are all serviced on one CPU)
> 
> 2) Tweak RPS (__skb_get_rxhash()) so that SYN packets rxhash is not
> dependent on src port/address, to get same effect (All SYN packets
> processed by one cpu). Note this only address the SYN flood problem, not
> the general 3WHS scalability one, since if real connection is
> established, the third packet (ACK from client) will have the 'real'
> rxhash and will be processed by another cpu.

I don't like the idea of overloading one CPU with SYN packets, as the
attacker can still cause a DoS on new connections.

My "unlocked" parallel SYN cookie approach should favor established
connections, as they are allowed to run under a BH lock, and thus don't
let new SYN packets in (on this CPU) until the established-connection
packet is finished.  Unless I have misunderstood something... I think I
have: established connections have their own/separate struct sock, and
thus this is another slock spinlock, right? (Well, let Eric bash me for
this ;-))

[...cut...]

^ permalink raw reply

* Re: [RFC PATCH 0/4] inet: add second hash table
From: Eric Dumazet @ 2012-05-30  7:57 UTC (permalink / raw)
  To: Alexandru Copot
  Cc: davem, gerrit, kuznet, jmorris, yoshfuji, kaber, netdev,
	Daniel Baluta, Lucian Grijincu
In-Reply-To: <1338363410-6562-1-git-send-email-alex.mihai.c@gmail.com>

On Wed, 2012-05-30 at 10:36 +0300, Alexandru Copot wrote:
> This patchset implements all the operations needed to use a second
> (port,address) bind hash table for inet. It uses a similar approach
> as the UDP implementation.
> 
> The performance improvements for port allocation are very good and
> detailed in the last message.
> 
> This is based on a series of patches written by Lucian Grijincu at Ixia.
> 
> Signed-off-by: Alexandru Copot <alex.mihai.c@gmail.com>
> Cc: Daniel Baluta <dbaluta@ixiacom.com>
> Cc: Lucian Grijincu <lucian.grijincu@gmail.com>
> ---
> Alexandru Copot (4):
>       inet: add counter to inet_bind_hashbucket
>       inet: add a second bind hash
>       inet: add/remove inet buckets in the second bind hash
>       inet: use second hash in inet_csk_get_port
> 
>  include/net/inet_hashtables.h    |  140 +++++++++++++++++++++++++++++++--
>  include/net/inet_timewait_sock.h |    5 +-
>  net/dccp/proto.c                 |   37 ++++++++-
>  net/ipv4/inet_connection_sock.c  |   66 ++++++++--------
>  net/ipv4/inet_hashtables.c       |  158 ++++++++++++++++++++++++++++++++++++--
>  net/ipv4/inet_timewait_sock.c    |   16 ++--
>  net/ipv4/tcp.c                   |   17 ++++
>  net/ipv6/inet6_hashtables.c      |   95 +++++++++++++++++++++++
>  8 files changed, 477 insertions(+), 57 deletions(-)


It's a huge change (with many details to look at), for a need that is
yet to be understood.

What sensible workload needs this at all ?

^ permalink raw reply

* Re: [RFC PATCH 1/4] inet: add counter to inet_bind_hashbucket
From: Eric Dumazet @ 2012-05-30  8:00 UTC (permalink / raw)
  To: Alexandru Copot
  Cc: davem, gerrit, kuznet, jmorris, yoshfuji, kaber, netdev,
	Daniel Baluta, Lucian Grijincu
In-Reply-To: <1338363410-6562-2-git-send-email-alex.mihai.c@gmail.com>

On Wed, 2012-05-30 at 10:36 +0300, Alexandru Copot wrote:
> The counter will be used by the upcoming INET lookup algorithm to
> choose the shortest chain after secondary hash is added.
> 
> Signed-off-by: Alexandru Copot <alex.mihai.c@gmail.com>
> Cc: Daniel Baluta <dbaluta@ixiacom.com>
> Cc: Lucian Grijincu <lucian.grijincu@gmail.com>
> ---
>  include/net/inet_hashtables.h    |    4 +++-
>  include/net/inet_timewait_sock.h |    4 +++-
>  net/dccp/proto.c                 |    1 +
>  net/ipv4/inet_hashtables.c       |    9 ++++++---
>  net/ipv4/inet_timewait_sock.c    |    7 ++++---
>  net/ipv4/tcp.c                   |    1 +
>  6 files changed, 18 insertions(+), 8 deletions(-)
> 
> diff --git a/include/net/inet_hashtables.h b/include/net/inet_hashtables.h
> index 808fc5f..8c6addc 100644
> --- a/include/net/inet_hashtables.h
> +++ b/include/net/inet_hashtables.h
> @@ -98,6 +98,7 @@ static inline struct net *ib_net(struct inet_bind_bucket *ib)
>  struct inet_bind_hashbucket {
>  	spinlock_t		lock;
>  	struct hlist_head	chain;
> +	unsigned int		count;
>  };
>  

Are you still using a 32-bit kernel?

Better use:

struct inet_bind_hashbucket {
	spinlock_t		lock;
	unsigned int		count;
 	struct hlist_head	chain;
};

^ permalink raw reply

* Re: [RFC PATCH 2/2] tcp: Early SYN limit and SYN cookie handling to mitigate SYN floods
From: Hans Schillstrom @ 2012-05-30  8:03 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: Andi Kleen, Jesper Dangaard Brouer, Jesper Dangaard Brouer,
	netdev@vger.kernel.org, Christoph Paasch, David S. Miller,
	Martin Topholm, Florian Westphal, opurdila@ixiacom.com,
	Tom Herbert
In-Reply-To: <1338360073.2760.81.camel@edumazet-glaptop>

On Wednesday 30 May 2012 08:41:13 Eric Dumazet wrote:
> On Tue, 2012-05-29 at 12:37 -0700, Andi Kleen wrote:
> 
> > So basically handling syncookie lockless? 
> > 
> > Makes sense. Syncookies is a bit obsolete these days of course, due
> > to the lack of options. But may be still useful for this.
> > 
> > Obviously you'll need to clean up the patch and support IPv6,
> > but the basic idea looks good to me.
> 
> Also TCP Fast Open should be a good way to make the SYN flood no more
> effective.
> 
> Yuchung Cheng and Jerry Chu should upstream this code in a very near
> future.
> 
> Another way to mitigate SYN scalability issues before the full RCU
> solution I was cooking is to either :
> 
> 1) Use a hardware filter (like on Intel NICS) to force all SYN packets
> going to one queue (so that they are all serviced on one CPU)

We have this option running right now, and it gave slightly higher values.
The upside is only one core is running at 100% load.

To be able to process more SYNs, an attempt was made to spread them with
RPS to 2 other cores, which gave 60% more SYNs per sec,
i.e. the SYN filter in the NIC sending all IRQs to one core gave ~52k SYN pkts/sec;
adding RPS and sending SYNs to two other cores gave ~80k SYN pkts/sec.
Adding more than two cores didn't help that much.

> 2) Tweak RPS (__skb_get_rxhash()) so that SYN packets rxhash is not
> dependent on src port/address, to get same effect (All SYN packets
> processed by one cpu). Note this only address the SYN flood problem, not
> the general 3WHS scalability one, since if real connection is
> established, the third packet (ACK from client) will have the 'real'
> rxhash and will be processed by another cpu.

Neither the NIC's SYN filter nor this scales that well.

> (Of course, RPS must be enabled to benefit from this)
> 
> Untested patch to get the idea :
> 
>  include/net/flow_keys.h   |    1 +
>  net/core/dev.c            |    8 ++++++++
>  net/core/flow_dissector.c |    9 +++++++++
>  3 files changed, 18 insertions(+)
> 
> diff --git a/include/net/flow_keys.h b/include/net/flow_keys.h
> index 80461c1..b5bae21 100644
> --- a/include/net/flow_keys.h
> +++ b/include/net/flow_keys.h
> @@ -10,6 +10,7 @@ struct flow_keys {
>  		__be16 port16[2];
>  	};
>  	u8 ip_proto;
> +	u8 tcpflags;
>  };
>  
>  extern bool skb_flow_dissect(const struct sk_buff *skb, struct flow_keys *flow);
> diff --git a/net/core/dev.c b/net/core/dev.c
> index cd09819..c9c039e 100644
> --- a/net/core/dev.c
> +++ b/net/core/dev.c
> @@ -135,6 +135,7 @@
>  #include <linux/net_tstamp.h>
>  #include <linux/static_key.h>
>  #include <net/flow_keys.h>
> +#include <net/tcp.h>
>  
>  #include "net-sysfs.h"
>  
> @@ -2614,6 +2615,12 @@ void __skb_get_rxhash(struct sk_buff *skb)
>  		return;
>  
>  	if (keys.ports) {
> +		if ((keys.tcpflags & (TCPHDR_SYN | TCPHDR_ACK)) == TCPHDR_SYN) {
> +			hash = jhash_2words((__force u32)keys.dst,
> +					    (__force u32)keys.port16[1],
> +					    hashrnd);
> +			goto end;
> +		}
>  		if ((__force u16)keys.port16[1] < (__force u16)keys.port16[0])
>  			swap(keys.port16[0], keys.port16[1]);
>  		skb->l4_rxhash = 1;
> @@ -2626,6 +2633,7 @@ void __skb_get_rxhash(struct sk_buff *skb)
>  	hash = jhash_3words((__force u32)keys.dst,
>  			    (__force u32)keys.src,
>  			    (__force u32)keys.ports, hashrnd);
> +end:
>  	if (!hash)
>  		hash = 1;
>  
> diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c
> index a225089..cd4aedf 100644
> --- a/net/core/flow_dissector.c
> +++ b/net/core/flow_dissector.c
> @@ -137,6 +137,15 @@ ipv6:
>  		ports = skb_header_pointer(skb, nhoff, sizeof(_ports), &_ports);
>  		if (ports)
>  			flow->ports = *ports;
> +		if (ip_proto == IPPROTO_TCP) {
> +			__u8 *tcpflags, _tcpflags;
> +
> +			tcpflags = skb_header_pointer(skb, nhoff + 13,
> +						      sizeof(_tcpflags),
> +						      &_tcpflags);
> +			if (tcpflags)
> +				flow->tcpflags = *tcpflags;
> +		}
>  	}
>  
>  	return true;
> 
> 
> 

-- 
Regards
Hans Schillstrom <hans.schillstrom@ericsson.com>

^ permalink raw reply

* Re: [RFC PATCH 2/2] tcp: Early SYN limit and SYN cookie handling to mitigate SYN floods
From: Eric Dumazet @ 2012-05-30  8:15 UTC (permalink / raw)
  To: Jesper Dangaard Brouer
  Cc: Andi Kleen, netdev, Christoph Paasch, David S. Miller,
	Martin Topholm, Florian Westphal, opurdila, Hans Schillstrom,
	Tom Herbert
In-Reply-To: <1338363926.7747.55.camel@localhost>

On Wed, 2012-05-30 at 09:45 +0200, Jesper Dangaard Brouer wrote:

> Sounds interesting, but TCP Fast Open is primarily concerned with
> enabling data exchange during SYN establishment.  I don't see any
> indication that they have implemented parallel SYN handling.
> 

Not at all; TCP Fast Open's main goal is to allow connection establishment
with a single packet (thus removing one RTT). This also removes the
whole idea of having half-sockets (in SYN_RCV state).

Then, allowing DATA in the SYN packet is an extra bonus, only if the
whole request can fit in the packet (it is unlikely for typical http
requests)


> Implementing parallel SYN handling, should also benefit their work.

Why do you think I am working on this ? Hint : I am a Google coworker.

> After studying this code path, I also see great performance benefit in
> also optimizing the normal 3WHS on sock's in sk_state == LISTEN.
> Perhaps we should split up the code path for LISTEN vs. ESTABLISHED, as
> they are very entangled at the moment AFAIKS.
> 
> > Yuchung Cheng and Jerry Chu should upstream this code in a very near
> > future.
> 
> Looking forward to see the code, and the fallout discussions, on
> transferring data on SYN packets.
> 

Problem is this code will be delayed if we change net-next code in this
area, because we'll have to rebase and retest everything.

> 
> > Another way to mitigate SYN scalability issues before the full RCU
> > solution I was cooking is to either :
> > 
> > 1) Use a hardware filter (like on Intel NICS) to force all SYN packets
> > going to one queue (so that they are all serviced on one CPU)
> > 
> > 2) Tweak RPS (__skb_get_rxhash()) so that SYN packets rxhash is not
> > dependent on src port/address, to get same effect (All SYN packets
> > processed by one cpu). Note this only address the SYN flood problem, not
> > the general 3WHS scalability one, since if real connection is
> > established, the third packet (ACK from client) will have the 'real'
> > rxhash and will be processed by another cpu.
> 
> I don't like the idea of overloading one CPU with SYN packets. As the
> attacker can still cause a DoS on new connections.
> 

One CPU can handle more than one million SYN per second, while 32 cpus
fighting on socket lock can not handle 1 % of this load.

If Intel chose to implement this hardware filter in their NICs, it's for a
good reason.


> My "unlocked" parallel SYN cookie approach, should favor established
> connections, as they are allowed to run under a BH lock, and thus don't
> let new SYN packets in (on this CPU), until the establish conn packet is
> finished.  Unless I have misunderstood something... I think I have,
> established connections have their own/seperate struck sock, and thus
> this is another slock spinlock, right?. (Well let Eric bash me for
> this ;-))

It seems you forgot I have patches to have full parallelism, not only
the SYNCOOKIE hack.

I am still polishing them; it's a _long_ process, especially if the network
tree changes a lot.

If you believe you can beat me on this, please let me know so that I can
switch to other tasks.

^ permalink raw reply

* Re: [RFC PATCH 2/2] tcp: Early SYN limit and SYN cookie handling to mitigate SYN floods
From: Eric Dumazet @ 2012-05-30  8:24 UTC (permalink / raw)
  To: Hans Schillstrom
  Cc: Andi Kleen, Jesper Dangaard Brouer, Jesper Dangaard Brouer,
	netdev@vger.kernel.org, Christoph Paasch, David S. Miller,
	Martin Topholm, Florian Westphal, Tom Herbert
In-Reply-To: <201205301013.10797.hans.schillstrom@ericsson.com>

On Wed, 2012-05-30 at 10:03 +0200, Hans Schillstrom wrote:

> We have this option running right now, and it gave slightly higher values.
> The upside is only one core is running at 100% load.
> 
> To be able to process more SYN an attempt was made to spread them with RPS to 
> 2 other cores gave 60% more SYN:s per sec
> i.e. syn filter in NIC sending all irq:s to one core gave ~ 52k syn. pkts/sec
> adding RPS and sending syn to two other core:s gave ~80k  syn. pkts/sec
> Adding more cores than two didn't help that much.

When you say 52.000 pkt/s, is that for fully established sockets, or
SYNFLOOD ?

19.23 us to handle _one_ SYN message seems pretty wrong to me, if there
is no contention on listener socket.

^ permalink raw reply

* Re: Strange latency spikes/TX network stalls on Sun Fire X4150(x86) and e1000e
From: Eric Dumazet @ 2012-05-30  8:40 UTC (permalink / raw)
  To: Hiroaki SHIMODA
  Cc: Tom Herbert, Denys Fedoryshchenko, netdev, e1000-devel,
	jeffrey.t.kirsher, jesse.brandeburg, davem
In-Reply-To: <20120530090602.6204d857.shimoda.hiroaki@gmail.com>

On Wed, 2012-05-30 at 09:06 +0900, Hiroaki SHIMODA wrote:
> While reading the bql code, I have some questions.
> 
> 1) dql_completed() and dql_queued() can be called concurrently,
>    so dql->num_queued could change while processing
>    dql_completed().
>    Is it intentional to refer num_queued from "dql->" each time ?
> 

Not sure it can cause problems, but doing the read once is indeed a good
plan.

> 2) From the comment in the code
>    *   - The queue was over-limit in the previous interval and
>    *     when enqueuing it was possible that all queued data
>    *     had been consumed.
> 
>    and
> 
>    * Queue was not starved, check if the limit can be decreased.
>    * A decrease is only considered if the queue has been busy in
>    * the whole interval (the check above). 
> 
>    the calculation of all_prev_completed should take into account
>    completed == dql->prev_num_queued case ?
>    On current implementation, limit shrinks easily and some NIC
>    hit TX stalls.
>    To mitigate TX stalls, should we fix all_prev_completed rather
>    than individual driver ?
> 

Not sure what you mean

> 3) limit calculation fails to consider integer wrap around in
>    one place ?
> 

Yes

> Here is the patch what I meant.
> 
> diff --git a/lib/dynamic_queue_limits.c b/lib/dynamic_queue_limits.c
> @@ -11,22 +11,27 @@
>  #include <linux/dynamic_queue_limits.h>
>  
>  #define POSDIFF(A, B) ((A) > (B) ? (A) - (B) : 0)
> +#define POSDIFFI(A, B) ((int)((A) - (B)) > 0 ? (A) - (B) : 0)
> +#define AFTER_EQ(A, B) ((int)((A) - (B)) >= 0)
>  
>  /* Records completed count and recalculates the queue limit */
>  void dql_completed(struct dql *dql, unsigned int count)
>  {
>  	unsigned int inprogress, prev_inprogress, limit;
> -	unsigned int ovlimit, all_prev_completed, completed;
> +	unsigned int ovlimit, completed, num_queued;
> +	bool all_prev_completed;
> +
> +	num_queued = dql->num_queued;


I suggest :

	num_queued = ACCESS_ONCE(dql->num_queued);
	
Otherwise the compiler is free to do whatever it wants.

^ permalink raw reply

* Re: [RFC PATCH 0/2] Faster/parallel SYN handling to mitigate SYN floods
From: Jesper Dangaard Brouer @ 2012-05-30  8:44 UTC (permalink / raw)
  To: christoph.paasch
  Cc: netdev, Eric Dumazet, David S. Miller, Martin Topholm,
	Florian Westphal, opurdila, Hans Schillstrom, Andi Kleen
In-Reply-To: <4FC53353.2050801@uclouvain.be>

On Tue, 2012-05-29 at 22:36 +0200, Christoph Paasch wrote:
[...cut...]

> >> Concerning (2):
> >>
> >> Imagine, a SYN coming in, when the reqsk-queue is not yet full. A
> >> request-sock will be added to the reqsk-queue. Then, a retransmission of
> >> this SYN comes in and the queue got full by the time. This time
> >> tcp_v4_syn_conn_limit will do syn-cookies and thus generate a different
> >> seq-number for the SYN/ACK.
> > 
> > I have addressed your issue, by checking the reqsk_queue in
> > tcp_v4_syn_conn_limit() before allocating a new req via
> > inet_reqsk_alloc().
> > If I find an existing reqsk, I choose to drop it, so the SYN cookie
> > SYN-ACK takes precedence, as the path/handling of the last ACK doesn't
> > find this reqsk. This is done under the lock.
> 
> Then the receiver will receive two SYN/ACK's for the same SYN with
> different sequence-numbers. As the "SYN cookie SYN-ACK" will arrive
> second, it will be discarded and seq-numbers from the first one will be
> taken on the client-side.

I thought that the retransmitted SYN packet was caused by the SYN-ACK
not reaching the client?

> Then, the connection will never establish, as both sides "agreed" on
> different sequence numbers.
> 
> I would say, you have to handle the retransmitted SYN as in
> tcp_v4_hnd_req by calling tcp_check_req.

Choosing that code path, should be easy by simply returning 0 (no_limit)
from my function tcp_v4_syn_conn_limit(), to indicate that the normal
slow code path should be chosen.

I guess this will not pose a big attack angle, as the entries in
reqsk_queue will be fairly small.

^ permalink raw reply

* Re: [RFC PATCH 0/2] Faster/parallel SYN handling to mitigate SYN floods
From: Eric Dumazet @ 2012-05-30  8:50 UTC (permalink / raw)
  To: Jesper Dangaard Brouer
  Cc: christoph.paasch, netdev, David S. Miller, Martin Topholm,
	Florian Westphal, opurdila, Hans Schillstrom, Andi Kleen
In-Reply-To: <1338367497.7747.72.camel@localhost>

On Wed, 2012-05-30 at 10:44 +0200, Jesper Dangaard Brouer wrote:

> Choosing that code path, should be easy by simply returning 0 (no_limit)
> from my function tcp_v4_syn_conn_limit(), to indicate that the normal
> slow code path should be chosen.
> 
> I guess this will not pose a big attack angle, as the entries in
> reqsk_queue will be fairly small.

Not sure what you mean.

I know some people have 64K entries in it.

(sk_ack_backlog / sk_max_ack_backlog being 16bits, 
listen(fd, 65536 + 1) can give unexpected results)

^ permalink raw reply

* Re: [RFC PATCH 0/2] Faster/parallel SYN handling to mitigate SYN floods
From: Christoph Paasch @ 2012-05-30  8:53 UTC (permalink / raw)
  To: Jesper Dangaard Brouer
  Cc: netdev, Eric Dumazet, David S. Miller, Martin Topholm,
	Florian Westphal, opurdila, Hans Schillstrom, Andi Kleen
In-Reply-To: <1338367497.7747.72.camel@localhost>

On 05/30/2012 10:44 AM, Jesper Dangaard Brouer wrote:
>> > 
>> > Then the receiver will receive two SYN/ACK's for the same SYN with
>> > different sequence-numbers. As the "SYN cookie SYN-ACK" will arrive
>> > second, it will be discarded and seq-numbers from the first one will be
>> > taken on the client-side.
> I thought that the retransmitted SYN packet, were caused by the SYN-ACK
> didn't reach the client?

Or, if the SYN/ACK got somehow delayed in the network and the
SYN-retransmission timer on the client-side fires before the SYN/ACK
reaches the client.


Christoph


-- 
Christoph Paasch
PhD Student

IP Networking Lab --- http://inl.info.ucl.ac.be
MultiPath TCP in the Linux Kernel --- http://mptcp.info.ucl.ac.be
Université Catholique de Louvain
-- 

^ permalink raw reply

* Re: [PATCH] l2tp: fix oops in L2TP IP sockets for connect() AF_UNSPEC case
From: James Chapman @ 2012-05-30  8:53 UTC (permalink / raw)
  To: David Miller; +Cc: netdev, levinsasha928
In-Reply-To: <20120529.172008.875375243438479060.davem@davemloft.net>

On 29/05/12 22:20, David Miller wrote:
> From: James Chapman <jchapman@katalix.com>
> Date: Tue, 29 May 2012 14:30:42 +0100
> 
>> An application may call connect() to disconnect a socket using an
>> address with family AF_UNSPEC. The L2TP IP sockets were not handling
>> this case when the socket is not bound and an attempt to connect()
>> using AF_UNSPEC in such cases would result in an oops. This patch
>> addresses the problem by protecting the sk_prot->disconnect() call
>> against trying to unhash the socket before it is bound.
>>
>> The L2TP IPv4 and IPv6 sockets have the same problem. Both are fixed
>> by this patch.
>>
>> The patch also adds more checks that the sockaddr supplied to bind()
>> and connect() calls is valid.
>>
>>  RIP: 0010:[<ffffffff82e133b0>]  [<ffffffff82e133b0>] inet_unhash+0x50/0xd0
>>  RSP: 0018:ffff88001989be28  EFLAGS: 00010293
>>  Stack:
>>   ffff8800407a8000 0000000000000000 ffff88001989be78 ffffffff82e3a249
>>   ffffffff82e3a050 ffff88001989bec8 ffff88001989be88 ffff8800407a8000
>>   0000000000000010 ffff88001989bec8 ffff88001989bea8 ffffffff82e42639
>>  Call Trace:
>>  [<ffffffff82e3a249>] udp_disconnect+0x1f9/0x290
>>  [<ffffffff82e42639>] inet_dgram_connect+0x29/0x80
>>  [<ffffffff82d012fc>] sys_connect+0x9c/0x100
>>
>> Reported-by: Sasha Levin <levinsasha928@gmail.com>
>> Signed-off-by: James Chapman <jchapman@katalix.com>
> 
> Applied and queued up for -stable, thanks James.

The patch doesn't apply to stable due to recent l2tp_ip changes (IPv6
support) already merged. I'll spin a version for -stable.


-- 
James Chapman
Katalix Systems Ltd
http://www.katalix.com
Catalysts for your Embedded Linux software development

^ permalink raw reply

* Re: [PATCH] l2tp: fix oops in L2TP IP sockets for connect() AF_UNSPEC case
From: David Miller @ 2012-05-30  9:05 UTC (permalink / raw)
  To: jchapman; +Cc: netdev, levinsasha928
In-Reply-To: <4FC5E022.6020609@katalix.com>

From: James Chapman <jchapman@katalix.com>
Date: Wed, 30 May 2012 09:53:54 +0100

> The patch doesn't apply to stable due to recent l2tp_ip changes (IPv6
> support) already merged. I'll spin a version for -stable.

That would be helpful, please do.

^ permalink raw reply

* [PATCH stable] l2tp: fix oops in L2TP IP sockets for connect() AF_UNSPEC case
From: James Chapman @ 2012-05-30  9:13 UTC (permalink / raw)
  To: netdev; +Cc: levinsasha928, James Chapman

An application may call connect() to disconnect a socket using an
address with family AF_UNSPEC. The L2TP IP sockets were not handling
this case when the socket is not bound and an attempt to connect()
using AF_UNSPEC in such cases would result in an oops. This patch
addresses the problem by protecting the sk_prot->disconnect() call
against trying to unhash the socket before it is bound.

The patch also adds more checks that the sockaddr supplied to bind()
and connect() calls is valid.

 RIP: 0010:[<ffffffff82e133b0>]  [<ffffffff82e133b0>] inet_unhash+0x50/0xd0
 RSP: 0018:ffff88001989be28  EFLAGS: 00010293
 Stack:
  ffff8800407a8000 0000000000000000 ffff88001989be78 ffffffff82e3a249
  ffffffff82e3a050 ffff88001989bec8 ffff88001989be88 ffff8800407a8000
  0000000000000010 ffff88001989bec8 ffff88001989bea8 ffffffff82e42639
 Call Trace:
 [<ffffffff82e3a249>] udp_disconnect+0x1f9/0x290
 [<ffffffff82e42639>] inet_dgram_connect+0x29/0x80
 [<ffffffff82d012fc>] sys_connect+0x9c/0x100

Reported-by: Sasha Levin <levinsasha928@gmail.com>
Signed-off-by: James Chapman <jchapman@katalix.com>

---
A version of this patch is already applied to the net tree.

 net/l2tp/l2tp_ip.c |   30 ++++++++++++++++++++++++------
 1 files changed, 24 insertions(+), 6 deletions(-)

diff --git a/net/l2tp/l2tp_ip.c b/net/l2tp/l2tp_ip.c
index 6274f0b..cc8ad7b 100644
--- a/net/l2tp/l2tp_ip.c
+++ b/net/l2tp/l2tp_ip.c
@@ -251,9 +251,16 @@ static int l2tp_ip_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len)
 {
 	struct inet_sock *inet = inet_sk(sk);
 	struct sockaddr_l2tpip *addr = (struct sockaddr_l2tpip *) uaddr;
-	int ret = -EINVAL;
+	int ret;
 	int chk_addr_ret;
 
+	if (!sock_flag(sk, SOCK_ZAPPED))
+		return -EINVAL;
+	if (addr_len < sizeof(struct sockaddr_l2tpip))
+		return -EINVAL;
+	if (addr->l2tp_family != AF_INET)
+		return -EINVAL;
+
 	ret = -EADDRINUSE;
 	read_lock_bh(&l2tp_ip_lock);
 	if (__l2tp_ip_bind_lookup(&init_net, addr->l2tp_addr.s_addr, sk->sk_bound_dev_if, addr->l2tp_conn_id))
@@ -284,6 +291,8 @@ static int l2tp_ip_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len)
 	sk_del_node_init(sk);
 	write_unlock_bh(&l2tp_ip_lock);
 	ret = 0;
+	sock_reset_flag(sk, SOCK_ZAPPED);
+
 out:
 	release_sock(sk);
 
@@ -304,13 +313,14 @@ static int l2tp_ip_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len
 	__be32 saddr;
 	int oif, rc;
 
-	rc = -EINVAL;
+	if (sock_flag(sk, SOCK_ZAPPED)) /* Must bind first - autobinding does not work */
+		return -EINVAL;
+
 	if (addr_len < sizeof(*lsa))
-		goto out;
+		return -EINVAL;
 
-	rc = -EAFNOSUPPORT;
 	if (lsa->l2tp_family != AF_INET)
-		goto out;
+		return -EAFNOSUPPORT;
 
 	lock_sock(sk);
 
@@ -364,6 +374,14 @@ out:
 	return rc;
 }
 
+static int l2tp_ip_disconnect(struct sock *sk, int flags)
+{
+	if (sock_flag(sk, SOCK_ZAPPED))
+		return 0;
+
+	return udp_disconnect(sk, flags);
+}
+
 static int l2tp_ip_getname(struct socket *sock, struct sockaddr *uaddr,
 			   int *uaddr_len, int peer)
 {
@@ -599,7 +617,7 @@ static struct proto l2tp_ip_prot = {
 	.close		   = l2tp_ip_close,
 	.bind		   = l2tp_ip_bind,
 	.connect	   = l2tp_ip_connect,
-	.disconnect	   = udp_disconnect,
+	.disconnect	   = l2tp_ip_disconnect,
 	.ioctl		   = udp_ioctl,
 	.destroy	   = l2tp_ip_destroy_sock,
 	.setsockopt	   = ip_setsockopt,
-- 
1.7.0.4

^ permalink raw reply related

* Re: [RFC PATCH 2/2] tcp: Early SYN limit and SYN cookie handling to mitigate SYN floods
From: Jesper Dangaard Brouer @ 2012-05-30  9:24 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: Andi Kleen, netdev, Christoph Paasch, David S. Miller,
	Martin Topholm, Florian Westphal, Hans Schillstrom,
	Martin Topholm
In-Reply-To: <1338365702.2760.112.camel@edumazet-glaptop>

On Wed, 2012-05-30 at 10:15 +0200, Eric Dumazet wrote:
> On Wed, 2012-05-30 at 09:45 +0200, Jesper Dangaard Brouer wrote:
> 
> > Sounds interesting, but TCP Fast Open is primarily concerned with
> > enabling data exchange during SYN establishment.  I don't see any
> > indication that they have implemented parallel SYN handling.
> > 
> 
> Not at all, TCP fast open main goal is to allow connection establishment
> with a single packet (thus removing one RTT). This also removes the
> whole idea of having half-sockets (in SYN_RCV state)
> 
> Then, allowing DATA in the SYN packet is an extra bonus, only if the
> whole request can fit in the packet (it is unlikely for typical http
> requests)
> 
> 
> > Implementing parallel SYN handling, should also benefit their work.
> 
> Why do you think I am working on this ? Hint : I am a Google coworker.

I did know you work for Google, but didn't know you were actively working
on parallel SYN handling.  Your previous quote, "eventually in a short
time", indicated to me that I should solve the issue myself first, and
then we would replace my code with your full solution later.


> > After studying this code path, I also see great performance benefit in
> > also optimizing the normal 3WHS on sock's in sk_state == LISTEN.
> > Perhaps we should split up the code path for LISTEN vs. ESTABLISHED, as
> > they are very entangled at the moment AFAIKS.
> > 
> > > Yuchung Cheng and Jerry Chu should upstream this code in a very near
> > > future.
> > 
> > Looking forward to see the code, and the fallout discussions, on
> > transferring data on SYN packets.
> > 
> 
> Problem is this code will be delayed if we change net-next code in this
> area, because we'll have to rebase and retest everything.

Okay, don't want to delay your work.  We can wait merging my cleanup
patches, and I can take the pain of rebasing them after your work is
merged.  And then we will see if my performance patches have gotten
obsolete.

I'm going to post some updated v2 patches, just because I know some
people that are desperate for a quick solution to their DDoS issues, and
are willing to patch their kernels for production.

 
> > > Another way to mitigate SYN scalability issues before the full RCU
> > > solution I was cooking is to either :
> > > 
> > > 1) Use a hardware filter (like on Intel NICS) to force all SYN packets
> > > going to one queue (so that they are all serviced on one CPU)
> > > 
> > > 2) Tweak RPS (__skb_get_rxhash()) so that SYN packets rxhash is not
> > > dependent on src port/address, to get same effect (All SYN packets
> > > processed by one cpu). Note this only address the SYN flood problem, not
> > > the general 3WHS scalability one, since if real connection is
> > > established, the third packet (ACK from client) will have the 'real'
> > > rxhash and will be processed by another cpu.
> > 
> > I don't like the idea of overloading one CPU with SYN packets. As the
> > attacker can still cause a DoS on new connections.
> > 
> 
> One CPU can handle more than one million SYN per second, while 32 cpus
> fighting on socket lock can not handle 1 % of this load.

I'm not sure one CPU can handle 1Mpps on this particular path.  And Hans
has some other measurements, although I'm assuming he has small CPUs.
But if you are working on the real solution, we don't need to discuss
this :-)


> If Intel chose to implement this hardware filter in their NIC, its for a
> good reason.
> 
> 
> > My "unlocked" parallel SYN cookie approach, should favor established
> > connections, as they are allowed to run under a BH lock, and thus don't
> > let new SYN packets in (on this CPU), until the establish conn packet is
> > finished.  Unless I have misunderstood something... I think I have,
> > established connections have their own/separate struct sock, and thus
> > this is another slock spinlock, right?. (Well let Eric bash me for
> > this ;-))
> 
> It seems you forgot I have patches to have full parallelism, not only
> the SYNCOOKIE hack.

I'm very much looking forward to this :-)

> I am still polishing them, its a _long_ process, especially if network
> tree changes a lot.
> 
> If you believe you can beat me on this, please let me know so that I can
> switch to other tasks.

I don't dare to go into that battle with the network ninja, I surrender.
DaveM, Eric's patches take precedence over mine...

/me Crawling back into my cave, and switching to boring bugzilla cases of
backporting kernel patches instead...

-- 
Best regards,
  Jesper Dangaard Brouer
  MSc.CS, Sr. Network Kernel Developer at Red Hat
  Author of http://www.iptv-analyzer.org
  LinkedIn: http://www.linkedin.com/in/brouer

^ permalink raw reply

* Re: Difficulties to get 1Gbps on be2net ethernet card
From: Jean-Michel Hautbois @ 2012-05-30  9:40 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: netdev
In-Reply-To: <CAL8zT=jHWhDdv-DyXL8XnL8qRc7o-jZfA7ADU-qt54hB5aLyCw@mail.gmail.com>

2012/5/30 Jean-Michel Hautbois <jhautbois@gmail.com>:
> 2012/5/30 Eric Dumazet <eric.dumazet@gmail.com>:
>> On Wed, 2012-05-30 at 08:51 +0200, Jean-Michel Hautbois wrote:
>>> 2012/5/30 Eric Dumazet <eric.dumazet@gmail.com>:
>>> > On Wed, 2012-05-30 at 08:28 +0200, Jean-Michel Hautbois wrote:
>>> >
>>> >> If this can help, setting tx queue length to 5000 seems to make the
>>> >> problem disappear.
>>> >
>>> > Then you should have drops at Qdisc layer (before your change to 5000)
>>> >
>>> > tc -s -d qdisc
>>> >
>>> >> I didn't specified it : MTU is 4096, UDP packets are 4000 bytes.
>>> >
>>>
>>> Yes :
>>> qdisc mq 0: dev eth1 root
>>>  Sent 5710049154383 bytes 1413544639 pkt (dropped 73078, overlimits 0
>>> requeues 281540)
>>>  backlog 0b 0p requeues 281540
>>>
>>> Why ? With a 2.6.26 kernel it works well with a tx queue length of 1000.
>>
>> If you send big bursts of packets, then you need a large enough queue.
>>
>> Maybe your kernel is now faster than before and queue fills faster, or
>> TX ring is smaller ?
>>
>> ethtool -g eth0
>>
>> Note that everybody try to reduce dumb queue sizes because of latencies.
>>
>
> TX ring is not the same :
> On 3.2 :
> $> ethtool -g eth1
> Ring parameters for eth1:
> Pre-set maximums:
> RX:             1024
> RX Mini:        0
> RX Jumbo:       0
> TX:             2048
> Current hardware settings:
> RX:             1024
> RX Mini:        0
> RX Jumbo:       0
> TX:             2048
>
>
> On 2.6.26 :
> $>ethtool -g eth1
> Ring parameters for eth1:
> Pre-set maximums:
> RX:             1024
> RX Mini:        0
> RX Jumbo:       0
> TX:             2048
> Current hardware settings:
> RX:             1003
> RX Mini:        0
> RX Jumbo:       0
> TX:             0
>
> I can't set TX ring using ethtool -G eth1 tx N : operation not supported
> I am not really impacted by latency, but the lower the better.
>
> JM

I used vmstat in order to see the differences between the two kernels.
The main difference is the number of interrupts per second.
I have an average of 87500 on 3.2 and 7500 on 2.6, 10 times lower !
I suspect the be2net driver to be the main cause, and I checked the
/proc/interrupts file in order to be sure.

I have for eth1-tx on 2.6.26 about 2200 interrupts per second and 23000 on 3.2.
BTW, it is named eth1-q0 on 3.2 (and tx and rx are the same IRQ)
whereas there is eth1-rx0 and eth1-tx on 2.6.26.

JM

^ permalink raw reply

* Re: [RFC PATCH 2/2] tcp: Early SYN limit and SYN cookie handling to mitigate SYN floods
From: Eric Dumazet @ 2012-05-30  9:46 UTC (permalink / raw)
  To: Jesper Dangaard Brouer
  Cc: Andi Kleen, netdev, Christoph Paasch, David S. Miller,
	Martin Topholm, Florian Westphal, Hans Schillstrom
In-Reply-To: <1338369863.7747.96.camel@localhost>

On Wed, 2012-05-30 at 11:24 +0200, Jesper Dangaard Brouer wrote:

> I don't dare to go into that battle with the network ninja, I surrender.
> DaveM, Eric's patches take precedence over mine...
> 
> /me Crawing back into my cave, and switching to boring bugzilla cases of
> backporting kernel patches instead...
> 

Hey, I only wanted to say that we were working on the same area and that
we should expect conflicts.

In the long term, we want a scalable listener solution, but I can
understand if some customers want an immediate solution (SYN flood
mitigation)

^ permalink raw reply

* Re: Difficulties to get 1Gbps on be2net ethernet card
From: Eric Dumazet @ 2012-05-30  9:56 UTC (permalink / raw)
  To: Jean-Michel Hautbois; +Cc: netdev
In-Reply-To: <CAL8zT=gJT1Frn_44SU0CrpZjPxwC_VuHFE4k9jvOGNmFomzhHA@mail.gmail.com>

On Wed, 2012-05-30 at 11:40 +0200, Jean-Michel Hautbois wrote:

> I used vmstat in order to see the differences between the two kernels.
> The main difference is the number of interrupts per second.
> I have an average of 87500 on 3.2 and 7500 on 2.6, 10 times lower !
> I suspect the be2net driver to be the main cause, and I checkes the
> /proc/interrupts file in order to be sure.
> 
> I have for eth1-tx on 2.6.26 about 2200 interrupts per second and 23000 on 3.2.
> BTW, it is named eth1-q0 on 3.2 (and tx and rx are the same IRQ)
> whereas there is eth1-rx0 and eth1-tx on 2.6.26.
> 

Might be different coalescing params :

ethtool -c eth1

^ permalink raw reply

* RE: Difficulties to get 1Gbps on be2net ethernet card
From: Sathya.Perla @ 2012-05-30 10:04 UTC (permalink / raw)
  To: jhautbois, eric.dumazet; +Cc: netdev
In-Reply-To: <CAL8zT=gJT1Frn_44SU0CrpZjPxwC_VuHFE4k9jvOGNmFomzhHA@mail.gmail.com>

>-----Original Message-----
>From: netdev-owner@vger.kernel.org [mailto:netdev-owner@vger.kernel.org] On
>Behalf Of Jean-Michel Hautbois
>
>2012/5/30 Jean-Michel Hautbois <jhautbois@gmail.com>:
>
>I used vmstat in order to see the differences between the two kernels.
>The main difference is the number of interrupts per second.
>I have an average of 87500 on 3.2 and 7500 on 2.6, 10 times lower !
>I suspect the be2net driver to be the main cause, and I checkes the
>/proc/interrupts file in order to be sure.
>
>I have for eth1-tx on 2.6.26 about 2200 interrupts per second and 23000 on 3.2.
>BTW, it is named eth1-q0 on 3.2 (and tx and rx are the same IRQ)
>whereas there is eth1-rx0 and eth1-tx on 2.6.26.

Yes, there is an issue with be2net interrupt mitigation in the recent code with
RX and TX on the same Evt-Q (commit 10ef9ab4). The high interrupt rate happens when a TX blast is
done while RX is relatively silent on a queue pair. Interrupt rate due to TX completions is not being
mitigated.

I have a fix and will send it out soon..

thanks,
-Sathya

^ permalink raw reply

* Re: Difficulties to get 1Gbps on be2net ethernet card
From: Jean-Michel Hautbois @ 2012-05-30 10:06 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: netdev
In-Reply-To: <1338371774.2760.134.camel@edumazet-glaptop>

2012/5/30 Eric Dumazet <eric.dumazet@gmail.com>:
> On Wed, 2012-05-30 at 11:40 +0200, Jean-Michel Hautbois wrote:
>
>> I used vmstat in order to see the differences between the two kernels.
>> The main difference is the number of interrupts per second.
>> I have an average of 87500 on 3.2 and 7500 on 2.6, 10 times lower !
>> I suspect the be2net driver to be the main cause, and I checkes the
>> /proc/interrupts file in order to be sure.
>>
>> I have for eth1-tx on 2.6.26 about 2200 interrupts per second and 23000 on 3.2.
>> BTW, it is named eth1-q0 on 3.2 (and tx and rx are the same IRQ)
>> whereas there is eth1-rx0 and eth1-tx on 2.6.26.
>>
>
> Might be different coalescing params :
>
> ethtool -c eth1
>

Yes, as stated in my first e-mail, this is different, in 2.6.26 the
adaptive-tx coalescing is off, while it is on for 3.4 (sorry, I said
3.2 before but it is 3.4).
But I can't change this setting since commit 10ef9ab...
JM

^ permalink raw reply

* Re: Difficulties to get 1Gbps on be2net ethernet card
From: Jean-Michel Hautbois @ 2012-05-30 10:07 UTC (permalink / raw)
  To: Sathya.Perla; +Cc: eric.dumazet, netdev
In-Reply-To: <3367B80B08154D42A3B2BC708B5D41F647C678B73F@EXMAIL.ad.emulex.com>

2012/5/30  <Sathya.Perla@emulex.com>:
>>-----Original Message-----
>>From: netdev-owner@vger.kernel.org [mailto:netdev-owner@vger.kernel.org] On
>>Behalf Of Jean-Michel Hautbois
>>
>>2012/5/30 Jean-Michel Hautbois <jhautbois@gmail.com>:
>>
>>I used vmstat in order to see the differences between the two kernels.
>>The main difference is the number of interrupts per second.
>>I have an average of 87500 on 3.2 and 7500 on 2.6, 10 times lower !
>>I suspect the be2net driver to be the main cause, and I checkes the
>>/proc/interrupts file in order to be sure.
>>
>>I have for eth1-tx on 2.6.26 about 2200 interrupts per second and 23000 on 3.2.
>>BTW, it is named eth1-q0 on 3.2 (and tx and rx are the same IRQ)
>>whereas there is eth1-rx0 and eth1-tx on 2.6.26.
>
> Yes, there is an issue with be2net interrupt mitigation in the recent code with
> RX and TX on the same Evt-Q (commit 10ef9ab4). The high interrupt rate happens when a TX blast is
> done while RX is relatively silent on a queue pair. Interrupt rate due to TX completions is not being
> mitigated.
>
> I have a fix and will send it out soon..
>
> thanks,
> -Sathya

Hi Sathya !
Thanks for this information !
I had the correct diagnostic :). I am waiting for your fix.

Regards,
JM

^ permalink raw reply

* Re: [PATCH RFC] virtio-net: remove useless disable on freeze
From: Rusty Russell @ 2012-05-30 10:11 UTC (permalink / raw)
  To: Michael S. Tsirkin, netdev; +Cc: Amit Shah, linux-kernel, kvm, virtualization
In-Reply-To: <20120528125325.GA22576@redhat.com>

On Mon, 28 May 2012 15:53:25 +0300, "Michael S. Tsirkin" <mst@redhat.com> wrote:
> On Wed, Apr 04, 2012 at 12:19:54PM +0300, Michael S. Tsirkin wrote:
> > disable_cb is just an optimization: it
> > can not guarantee that there are no callbacks.
> > 
> > I didn't yet figure out whether a callback
> > in freeze will trigger a bug, but disable_cb
> > won't address it in any case. So let's remove
> > the useless calls as a first step.
> > 
> > Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
> 
> Looks like this isn't in the 3.5 pull request -
> just lost in the shuffle?
> disable_cb is advisory so can't be relied upon.

I always (try to?) reply as I accept patches.

This one did slip by, but it's harmless so no need to push AFAICT.

Applied.

Thanks!
Rusty.

^ permalink raw reply

* [PATCH v5 0/7] ARM: davinci: add support for the am1808 based enbw_cmc board
From: Heiko Schocher @ 2012-05-30 10:18 UTC (permalink / raw)
  To: davinci-linux-open-source-VycZQUHpC/PFrsHnngEfi1aTQe2KTcn/
  Cc: Heiko Schocher, linux-arm-kernel-IAPFreCvJWM7uuMidbF8XUB+6BGkLq7r,
	devicetree-discuss-uLR06cmDAlY/bJ5BZ2RsiQ,
	linux-mtd-IAPFreCvJWM7uuMidbF8XUB+6BGkLq7r,
	linux-i2c-u79uwXL29TY76Z2rM5mHXA, netdev-u79uwXL29TY76Z2rM5mHXA,
	David Woodhouse, Ben Dooks, Wolfram Sang, Sekhar Nori,
	Kevin Hilman, Wolfgang Denk, Sergei Shtylyov, Grant Likely

This patch series adds support for the davinci am1808 based
enbw_cmc board.

changes for v2:
Post this patch series now as v2, reworked according to the
comments I got on the RFC series.

changes for v3:
- Interrupt Controller:
  - comment from Sergei Shtylyov:
    - rename "compatible" prop to "ti,cp_intc"
    - cp_intc_init() is now also for the of case
      the name of the init function (it calls the
      "new" __cp_intc_init() function, which was
      the "old" cp_intc_init()). Through this
      rework the changes for OF is better visible.
      As the OF case uses the irq_domain rework from
      Grant Likely, maybe the none OF case can use
      this also, but this should be tested on a hw ...

changes for v4:
- Interrupt Controller:
  - split in two patches as Nori Sekhar suggested
    one for the irq_domain change
    one for DT support
  - add comment from Grant Likely for the DT part:
    remove if/else clause, not needed.
    Make use of DT runtime configurable
    The non OF case is not tested!

changes for v5:
- Interrupt Controller:
  add comments from Sergei Shtylyov:
  - s/intc/cp_intc in commit subject
  - Codingstyle fixes
  add comment from Grant Likely:
  - rename "compatible" prop to "ti,cp-intc"
  - call irq_domain_add also in the non DT case
    (was fixed in v4)
  - switched from using d->irq to d->hwirq for the hardware
    irq number in irq_chip hooks
- I2C DT support:
  add comments from Grant Likely:
  - do not change value of dev->dev->platform_data, instead
    hold a copy in davinci_i2c_dev.
      
Got no comments to the following points, I noted in the
RFC series, so posting this patchseries with them:

- ARM: davinci: configure davinci aemif chipselects through OF
  not moved to mfd, as mentioned in this discussion:
  http://davinci-linux-open-source.1494791.n2.nabble.com/PATCH-arm-davinci-configure-davinci-aemif-chipselects-through-OF-td7059739.html
  instead use a phandle in the DTS, so drivers which
  uses the davinci aemif, can call davinci_aemif_setup_timing_of()

  This is just thought as an RFC ... The enbw_cmc board
  support not really need to setup this bus timings, as
  they are setup in U-Boot ... but I want to post this,
  as I think, it is a nice to have, and I am not really
  sure, if this has to be a MFD device (If so, all bus
  interfaces for other SoCs should be converted also to
  MFD devices) ... as an example how this can be used
  I add this to the davinci nand controller OF support
  patch, in this patchserie.

- ARM: davinci: mux: add OF support
  I want to get rid of the pin setup code in board code ...
  This patch introduces a davinci_cfg_reg_of() function,
  which davinci drivers can call, if they found a
  "pinmux-handle", so used in the following drivers in
  this patchserie:

  drivers/net/ethernet/ti/davinci_emac
  drivers/i2c/busses/i2c-davinci.c
  drivers/mtd/nand/davinci_nand.c

  This is removed for v4 serie, as Nori Sekhar suggested.

- post this board support with USB support, even though
  USB is only working with the 10 ms "workaround", posted here:
  http://comments.gmane.org/gmane.linux.usb.general/54505
  I see this issue also on the AM1808 TMDXEXP1808L evalboard.

  change for v4:
  The 10 ms delay is no longer needed, see discussion here:

  http://www.spinics.net/lists/linux-usb/msg64232.html

  shows the way to go ...

- MMC and USB are not using OF support yet; ideas on how to port
  this are welcome. I need board-specific callbacks for USB and
  MMC; how can this be solved with OF support?

Signed-off-by: Heiko Schocher <hs-ynQEQJNshbs@public.gmane.org>
Cc: linux-arm-kernel-IAPFreCvJWM7uuMidbF8XUB+6BGkLq7r@public.gmane.org
Cc: devicetree-discuss-uLR06cmDAlY/bJ5BZ2RsiQ@public.gmane.org
Cc: davinci-linux-open-source-VycZQUHpC/PFrsHnngEfi1aTQe2KTcn/@public.gmane.org
Cc: linux-mtd-IAPFreCvJWM7uuMidbF8XUB+6BGkLq7r@public.gmane.org
Cc: linux-i2c-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
Cc: netdev-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
Cc: David Woodhouse <dwmw2-wEGCiKHe2LqWVfeAwA7xHQ@public.gmane.org>
Cc: Ben Dooks <ben-linux-elnMNo+KYs3YtjvyW6yDsg@public.gmane.org>
Cc: Wolfram Sang <w.sang-bIcnvbaLZ9MEGnE8C9+IrQ@public.gmane.org>
Cc: Sekhar Nori <nsekhar-l0cyMroinI0@public.gmane.org>
Cc: Kevin Hilman <khilman-l0cyMroinI0@public.gmane.org>
Cc: Wolfgang Denk <wd-ynQEQJNshbs@public.gmane.org>
Cc: Sergei Shtylyov <sshtylyov-Igf4POYTYCDQT0dZR+AlfA@public.gmane.org>
Cc: Grant Likely <grant.likely-s3s/WqlpOiPyB63q8FvJNQ@public.gmane.org>

Heiko Schocher (7):
  ARM: davinci, cp_intc: Add irq domain support
  ARM: davinci, cp_intc: Add OF support for TI interrupt controller
  ARM: davinci: configure davinci aemif chipselects through OF
  ARM: davinci: net: davinci_emac: add OF support
  ARM: davinci: i2c: add OF support
  ARM: mtd: nand: davinci: add OF support for davinci nand controller
  ARM: davinci: add support for the am1808 based enbw_cmc board

 .../devicetree/bindings/arm/davinci/aemif.txt      |  119 +++++++
 .../devicetree/bindings/arm/davinci/i2c.txt        |   31 ++
 .../devicetree/bindings/arm/davinci/intc.txt       |   27 ++
 .../devicetree/bindings/arm/davinci/nand.txt       |   72 ++++
 .../devicetree/bindings/net/davinci_emac.txt       |   41 +++
 arch/arm/boot/dts/enbw_cmc.dts                     |  172 +++++++++
 arch/arm/configs/enbw_cmc_defconfig                |  123 +++++++
 arch/arm/mach-davinci/Kconfig                      |    9 +
 arch/arm/mach-davinci/Makefile                     |    1 +
 arch/arm/mach-davinci/aemif.c                      |   86 +++++-
 arch/arm/mach-davinci/board-enbw-cmc.c             |  374 ++++++++++++++++++++
 arch/arm/mach-davinci/cp_intc.c                    |   83 ++++-
 arch/arm/mach-davinci/include/mach/aemif.h         |    1 +
 arch/arm/mach-davinci/include/mach/uncompress.h    |    1 +
 drivers/i2c/busses/i2c-davinci.c                   |   49 +++-
 drivers/mtd/nand/davinci_nand.c                    |   80 ++++-
 drivers/net/ethernet/ti/davinci_emac.c             |   87 +++++-
 17 files changed, 1335 insertions(+), 21 deletions(-)
 create mode 100644 Documentation/devicetree/bindings/arm/davinci/aemif.txt
 create mode 100644 Documentation/devicetree/bindings/arm/davinci/i2c.txt
 create mode 100644 Documentation/devicetree/bindings/arm/davinci/intc.txt
 create mode 100644 Documentation/devicetree/bindings/arm/davinci/nand.txt
 create mode 100644 Documentation/devicetree/bindings/net/davinci_emac.txt
 create mode 100644 arch/arm/boot/dts/enbw_cmc.dts
 create mode 100644 arch/arm/configs/enbw_cmc_defconfig
 create mode 100644 arch/arm/mach-davinci/board-enbw-cmc.c

-- 
1.7.7.6

^ permalink raw reply

* [PATCH v5 7/7] ARM: davinci: add support for the am1808 based enbw_cmc board
From: Heiko Schocher @ 2012-05-30 10:19 UTC (permalink / raw)
  To: davinci-linux-open-source-VycZQUHpC/PFrsHnngEfi1aTQe2KTcn/
  Cc: Heiko Schocher, linux-arm-kernel-IAPFreCvJWM7uuMidbF8XUB+6BGkLq7r,
	devicetree-discuss-uLR06cmDAlY/bJ5BZ2RsiQ,
	linux-mtd-IAPFreCvJWM7uuMidbF8XUB+6BGkLq7r,
	linux-i2c-u79uwXL29TY76Z2rM5mHXA, netdev-u79uwXL29TY76Z2rM5mHXA,
	David Woodhouse, Ben Dooks, Wolfram Sang, Sekhar Nori,
	Kevin Hilman, Wolfgang Denk, Scott Wood, Sylwester Nawrocki
In-Reply-To: <1338373143-7467-1-git-send-email-hs-ynQEQJNshbs@public.gmane.org>

- AM1808 based board
- 64 MiB DDR ram
- 2 MiB Nor flash
- 128 MiB NAND flash
- use internal RTC
- I2C support
- hwmon lm75 support
- UBI/UBIFS support
- MMC support
- USB OTG support

Signed-off-by: Heiko Schocher <hs-ynQEQJNshbs@public.gmane.org>
Cc: linux-arm-kernel-IAPFreCvJWM7uuMidbF8XUB+6BGkLq7r@public.gmane.org
Cc: devicetree-discuss-uLR06cmDAlY/bJ5BZ2RsiQ@public.gmane.org
Cc: davinci-linux-open-source-VycZQUHpC/PFrsHnngEfi1aTQe2KTcn/@public.gmane.org
Cc: linux-mtd-IAPFreCvJWM7uuMidbF8XUB+6BGkLq7r@public.gmane.org
Cc: linux-i2c-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
Cc: netdev-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
Cc: David Woodhouse <dwmw2-wEGCiKHe2LqWVfeAwA7xHQ@public.gmane.org>
Cc: Ben Dooks <ben-linux-elnMNo+KYs3YtjvyW6yDsg@public.gmane.org>
Cc: Wolfram Sang <w.sang-bIcnvbaLZ9MEGnE8C9+IrQ@public.gmane.org>
Cc: Sekhar Nori <nsekhar-l0cyMroinI0@public.gmane.org>
Cc: Kevin Hilman <khilman-l0cyMroinI0@public.gmane.org>
Cc: Wolfgang Denk <wd-ynQEQJNshbs@public.gmane.org>
Cc: Scott Wood <scottwood-KZfg59tc24xl57MIdRCFDg@public.gmane.org>
Cc: Sylwester Nawrocki <s.nawrocki-Sze3O3UU22JBDgjK7y7TUQ@public.gmane.org>

---
- post this board support with USB support, even though
  USB is only working with the 10 ms "workaround", posted here:
  http://comments.gmane.org/gmane.linux.usb.general/54505
  I see this issue also on the AM1808 TMDXEXP1808L evalboard.
- MMC and USB are not using OF support yet, ideas how to port
  this are welcome. I need for USB and MMC boards board
  specific callbacks, how to solve this with OF support?

- changes for v2:
  - changes in the nand node due to comments from Scott Wood:
    - add "ti,davinci-" prefix
    - Dashes are preferred to underscores
    - rename "nandflash" to "nand"
    - introduce new "ti,davinci" specific properties for setting
      up ecc_mode, ecc_bits, options and bbt options, instead
      using linux defines
  - changes for i2c due to comments from Sylwester Nawrocki:
    - use "cell-index" instead "id"
    - OF_DEV_AUXDATA in the machine code, instead pre-define
      platform device name
  - add comment from Grant Likely for i2c:
    - removed "id" resp. "cell-index" completely
    - fixed documentation
    - use of_match_ptr()
    - use devm_kzalloc() for allocating platform data mem
    - fixed a whitespace issue
  - add net comments from Grant Likely:
    - add prefix "ti,davinci-" to davinci specific property names
    - remove version property
    - use compatible name "ti,davinci-dm6460-emac"
  - add comment from Grant Likely:
    - rename compatible node
    - do not use cell-index
    - CONFIG_OF required for this board
    TODO:
    - create a generic board support file, as I got no
      answer to my ping to grant, maybe this could be done
      in a second step?
- changes for v3:
  - add comments from Sergei Shtylyov:
    - rename compatible" prop to "ti,cp_intc"
    - cp_intc_init now used for Interrupt controller init
- changes for v4:
  add comment from Nori Sekhar:
  - rename davinci emac compatible property to "ti,davinci-dm6467-emac"
  - remove "pinmux-handle" property as discussed here:
    http://www.spinics.net/lists/arm-kernel/msg175701.html
    with Nori Sekhar

- changes for v5:
  add comments from Grant Likely:
  - rename "compatible" prop to "ti,cp-intc"

 arch/arm/boot/dts/enbw_cmc.dts                  |  172 +++++++++++
 arch/arm/configs/enbw_cmc_defconfig             |  123 ++++++++
 arch/arm/mach-davinci/Kconfig                   |    9 +
 arch/arm/mach-davinci/Makefile                  |    1 +
 arch/arm/mach-davinci/board-enbw-cmc.c          |  374 +++++++++++++++++++++++
 arch/arm/mach-davinci/include/mach/uncompress.h |    1 +
 6 files changed, 680 insertions(+), 0 deletions(-)
 create mode 100644 arch/arm/boot/dts/enbw_cmc.dts
 create mode 100644 arch/arm/configs/enbw_cmc_defconfig
 create mode 100644 arch/arm/mach-davinci/board-enbw-cmc.c

diff --git a/arch/arm/boot/dts/enbw_cmc.dts b/arch/arm/boot/dts/enbw_cmc.dts
new file mode 100644
index 0000000..19c7559
--- /dev/null
+++ b/arch/arm/boot/dts/enbw_cmc.dts
@@ -0,0 +1,172 @@
+/*
+ * Device Tree for the EnBW CMC platform
+ *
+ * Copyright 2011 DENX Software Engineering GmbH
+ * Heiko Schocher <hs-ynQEQJNshbs@public.gmane.org>
+ *
+ * This program is free software; you can redistribute  it and/or modify it
+ * under  the terms of  the GNU General  Public License as published by the
+ * Free Software Foundation;  either version 2 of the  License, or (at your
+ * option) any later version.
+ */
+/dts-v1/;
+/include/ "skeleton.dtsi"
+
+/ {
+	model = "EnBW CMC";
+	compatible = "enbw,cmc";
+
+	aliases {
+		ethernet0 = &eth0;
+	};
+
+	arm {
+		#address-cells = <1>;
+		#size-cells = <1>;
+		ranges = <0 0xfffee000 0x00020000>;
+		intc: interrupt-controller@1 {
+			compatible = "ti,cp-intc";
+			interrupt-controller;
+			#interrupt-cells = <1>;
+			ti,intc-size = <101>;
+			reg = <0x0 0x2000>;
+		};
+	};
+	soc@1c00000 {
+		compatible = "ti,da850";
+		#address-cells = <1>;
+		#size-cells = <1>;
+		ranges = <0x0 0x01c00000 0x400000>;
+
+		serial0: serial@1c42000 {
+			compatible = "ti,da850", "ns16550a";
+			reg = <0x42000 0x100>;
+			clock-frequency = <150000000>;
+			reg-shift = <2>;
+			interrupts = <25>;
+			interrupt-parent = <&intc>;
+		};
+		serial1: serial@1d0c000 {
+			compatible = "ti,da850", "ns16550a";
+			reg = <0x10c000 0x100>;
+			clock-frequency = <150000000>;
+			reg-shift = <2>;
+			interrupts = <53>;
+			interrupt-parent = <&intc>;
+		};
+		serial2: serial@1d0d000 {
+			compatible = "ti,da850", "ns16550a";
+			reg = <0x10d000 0x100>;
+			clock-frequency = <150000000>;
+			reg-shift = <2>;
+			interrupts = <61>;
+			interrupt-parent = <&intc>;
+		};
+
+		eth0: emac@1e20000 {
+			compatible = "ti,davinci-dm6467-emac";
+			reg = <0x220000 0x4000>;
+			ti,davinci-ctrl-reg-offset = <0x3000>;
+			ti,davinci-ctrl-mod-reg-offset = <0x2000>;
+			ti,davinci-ctrl-ram-offset = <0>;
+			ti,davinci-ctrl-ram-size = <0x2000>;
+			local-mac-address = [ 00 00 00 00 00 00 ];
+			interrupts = <33
+					34
+					35
+					36
+					>;
+			interrupt-parent = <&intc>;
+		};
+
+		i2c@1c22000 {
+			compatible = "ti,davinci-i2c";
+			reg = <0x22000 0x1000>;
+			clock-frequency = <100000>;
+			interrupts = <15>;
+			interrupt-parent = <&intc>;
+			#address-cells = <1>;
+			#size-cells = <0>;
+
+			dtt@48 {
+				compatible = "national,lm75";
+				reg = <0x48>;
+			};
+		};
+	};
+	onchipram@80000000 {
+		compatible = "ti,davinci-onchipram";
+		#address-cells = <1>;
+		#size-cells = <1>;
+		ranges = <0x0 0x80000000 0x20000>;
+	};
+	aemif@60000000 {
+		compatible = "ti,davinci-aemif";
+		#address-cells = <2>;
+		#size-cells = <1>;
+		reg = <0x68000000 0x80000>;
+		ranges = <2 0 0x60000000 0x02000000
+			  3 0 0x62000000 0x02000000
+			  4 0 0x64000000 0x02000000
+			  5 0 0x66000000 0x02000000
+			  6 0 0x68000000 0x02000000>;
+		cs2@68000000 {
+			compatible = "ti,davinci-cs";
+			#address-cells = <1>;
+			#size-cells = <1>;
+			/* all timings in nanoseconds */
+			cs = <2>;
+			asize = <1>;
+			ta = <0>;
+			rhold = <7>;
+			rstrobe = <42>;
+			rsetup = <14>;
+			whold = <7>;
+			wstrobe = <42>;
+			wsetup = <14>;
+			ew = <0>;
+			ss = <0>;
+		};
+		flash@2,0 {
+			compatible = "cfi-flash";
+			reg = <2 0x0 0x400000>;
+			#address-cells = <1>;
+			#size-cells = <1>;
+			bank-width = <2>;
+			device-width = <2>;
+		};
+		nand_cs: cs3@68000000 {
+			compatible = "ti,davinci-cs";
+			#address-cells = <1>;
+			#size-cells = <1>;
+			/* all timings in nanoseconds */
+			cs = <3>;
+			asize = <0>;
+			ta = <0>;
+			rhold = <7>;
+			rstrobe = <42>;
+			rsetup = <7>;
+			whold = <7>;
+			wstrobe = <14>;
+			wsetup = <7>;
+			ew = <0>;
+			ss = <0>;
+		};
+		nand@3,0 {
+			compatible = "ti,davinci-nand";
+			reg = <3 0x0 0x807ff
+				6 0x0 0x8000>;
+			#address-cells = <1>;
+			#size-cells = <1>;
+			ti,davinci-chipselect = <1>;
+			ti,davinci-mask-ale = <0>;
+			ti,davinci-mask-cle = <0>;
+			ti,davinci-mask-chipsel = <0>;
+			ti,davinci-ecc-mode = "hw";
+			ti,davinci-ecc-bits = <4>;
+			ti,davinci-nand-use-bbt;
+			timing-handle = <&nand_cs>;
+		};
+
+	};
+};
diff --git a/arch/arm/configs/enbw_cmc_defconfig b/arch/arm/configs/enbw_cmc_defconfig
new file mode 100644
index 0000000..9d98e7f
--- /dev/null
+++ b/arch/arm/configs/enbw_cmc_defconfig
@@ -0,0 +1,123 @@
+CONFIG_EXPERIMENTAL=y
+# CONFIG_SWAP is not set
+CONFIG_SYSVIPC=y
+CONFIG_POSIX_MQUEUE=y
+CONFIG_IKCONFIG=y
+CONFIG_IKCONFIG_PROC=y
+CONFIG_LOG_BUF_SHIFT=14
+CONFIG_BLK_DEV_INITRD=y
+CONFIG_EXPERT=y
+CONFIG_MODULES=y
+CONFIG_MODULE_UNLOAD=y
+CONFIG_MODULE_FORCE_UNLOAD=y
+CONFIG_MODVERSIONS=y
+# CONFIG_BLK_DEV_BSG is not set
+CONFIG_PARTITION_ADVANCED=y
+# CONFIG_IOSCHED_DEADLINE is not set
+# CONFIG_IOSCHED_CFQ is not set
+CONFIG_ARCH_DAVINCI=y
+CONFIG_ARCH_DAVINCI_DA850=y
+# CONFIG_MACH_DAVINCI_DA850_EVM is not set
+CONFIG_GPIO_PCA953X=y
+CONFIG_NO_HZ=y
+CONFIG_HIGH_RES_TIMERS=y
+CONFIG_PREEMPT=y
+CONFIG_AEABI=y
+# CONFIG_OABI_COMPAT is not set
+CONFIG_USE_OF=y
+CONFIG_NET=y
+CONFIG_PACKET=y
+CONFIG_UNIX=y
+CONFIG_INET=y
+CONFIG_IP_PNP=y
+CONFIG_IP_PNP_DHCP=y
+# CONFIG_INET_LRO is not set
+CONFIG_IPV6=y
+CONFIG_NETFILTER=y
+# CONFIG_WIRELESS is not set
+CONFIG_UEVENT_HELPER_PATH="/sbin/hotplug"
+# CONFIG_FW_LOADER is not set
+CONFIG_MTD=y
+CONFIG_MTD_CMDLINE_PARTS=y
+CONFIG_MTD_CHAR=y
+CONFIG_MTD_BLKDEVS=y
+CONFIG_MTD_CFI=y
+CONFIG_MTD_CFI_INTELEXT=y
+CONFIG_MTD_CFI_AMDSTD=y
+CONFIG_MTD_PHYSMAP=y
+CONFIG_MTD_PHYSMAP_OF=y
+CONFIG_MTD_NAND=y
+CONFIG_MTD_NAND_DAVINCI=y
+CONFIG_MTD_UBI=y
+CONFIG_BLK_DEV_LOOP=y
+CONFIG_BLK_DEV_RAM=y
+CONFIG_BLK_DEV_RAM_COUNT=1
+CONFIG_BLK_DEV_RAM_SIZE=32768
+CONFIG_EEPROM_AT24=y
+CONFIG_SCSI=y
+CONFIG_BLK_DEV_SD=y
+CONFIG_NETDEVICES=y
+CONFIG_MII=y
+CONFIG_TI_DAVINCI_EMAC=y
+# CONFIG_WLAN is not set
+CONFIG_INPUT_POLLDEV=y
+# CONFIG_INPUT_MOUSEDEV is not set
+CONFIG_INPUT_EVDEV=y
+CONFIG_INPUT_EVBUG=y
+# CONFIG_INPUT_KEYBOARD is not set
+# CONFIG_INPUT_MOUSE is not set
+# CONFIG_SERIO is not set
+# CONFIG_VT is not set
+CONFIG_SERIAL_8250=y
+CONFIG_SERIAL_8250_CONSOLE=y
+CONFIG_SERIAL_8250_NR_UARTS=3
+CONFIG_SERIAL_8250_RUNTIME_UARTS=3
+CONFIG_SERIAL_OF_PLATFORM=y
+CONFIG_HW_RANDOM=y
+CONFIG_I2C=y
+CONFIG_I2C_CHARDEV=y
+CONFIG_I2C_DAVINCI=y
+CONFIG_GPIO_SYSFS=y
+CONFIG_GPIO_PCF857X=y
+CONFIG_SENSORS_LM75=y
+CONFIG_WATCHDOG=y
+CONFIG_WATCHDOG_CORE=y
+CONFIG_DAVINCI_WATCHDOG=y
+# CONFIG_HID_SUPPORT is not set
+CONFIG_USB=y
+CONFIG_USB_ANNOUNCE_NEW_DEVICES=y
+CONFIG_USB_MUSB_HDRC=y
+CONFIG_USB_MUSB_DA8XX=y
+CONFIG_USB_STORAGE=y
+CONFIG_USB_UAS=y
+CONFIG_USB_LIBUSUAL=y
+CONFIG_USB_GADGET=y
+CONFIG_USB_FUSB300=y
+CONFIG_USB_ETH=y
+CONFIG_MMC=y
+CONFIG_MMC_DAVINCI=y
+CONFIG_RTC_CLASS=y
+CONFIG_RTC_DRV_OMAP=y
+CONFIG_EXT2_FS=y
+CONFIG_EXT3_FS=y
+CONFIG_AUTOFS4_FS=y
+CONFIG_MSDOS_FS=y
+CONFIG_VFAT_FS=y
+CONFIG_TMPFS=y
+CONFIG_UBIFS_FS=y
+CONFIG_CRAMFS=y
+CONFIG_NFS_FS=y
+CONFIG_NFS_V3=y
+CONFIG_ROOT_NFS=y
+CONFIG_NLS_CODEPAGE_437=y
+CONFIG_NLS_ASCII=y
+CONFIG_NLS_ISO8859_1=y
+CONFIG_NLS_UTF8=y
+CONFIG_DEBUG_FS=y
+CONFIG_TIMER_STATS=y
+CONFIG_DEBUG_RT_MUTEXES=y
+CONFIG_DEBUG_MUTEXES=y
+# CONFIG_CRYPTO_ANSI_CPRNG is not set
+# CONFIG_CRYPTO_HW is not set
+CONFIG_CRC_CCITT=m
+CONFIG_CRC_T10DIF=m
diff --git a/arch/arm/mach-davinci/Kconfig b/arch/arm/mach-davinci/Kconfig
index 32d837d..4cb0469 100644
--- a/arch/arm/mach-davinci/Kconfig
+++ b/arch/arm/mach-davinci/Kconfig
@@ -202,6 +202,15 @@ config DA850_WL12XX
 	  Say Y if you want to use a wl1271 expansion card connected to the
 	  AM18x EVM.
 
+config MACH_ENBW_CMC
+	bool "EnBW Communication Module Compact"
+	default ARCH_DAVINCI_DA850
+	depends on ARCH_DAVINCI_DA850
+	select OF
+	help
+	  Say Y here to select the EnBW Communication Module Compact
+	  board.
+
 config GPIO_PCA953X
 	default MACH_DAVINCI_DA850_EVM
 
diff --git a/arch/arm/mach-davinci/Makefile b/arch/arm/mach-davinci/Makefile
index 2db78bd..12f3166 100644
--- a/arch/arm/mach-davinci/Makefile
+++ b/arch/arm/mach-davinci/Makefile
@@ -34,6 +34,7 @@ obj-$(CONFIG_MACH_DAVINCI_DA850_EVM)	+= board-da850-evm.o
 obj-$(CONFIG_MACH_TNETV107X)		+= board-tnetv107x-evm.o
 obj-$(CONFIG_MACH_MITYOMAPL138)		+= board-mityomapl138.o
 obj-$(CONFIG_MACH_OMAPL138_HAWKBOARD)	+= board-omapl138-hawk.o
+obj-$(CONFIG_MACH_ENBW_CMC)		+= board-enbw-cmc.o
 
 # Power Management
 obj-$(CONFIG_CPU_FREQ)			+= cpufreq.o
diff --git a/arch/arm/mach-davinci/board-enbw-cmc.c b/arch/arm/mach-davinci/board-enbw-cmc.c
new file mode 100644
index 0000000..fcec14f
--- /dev/null
+++ b/arch/arm/mach-davinci/board-enbw-cmc.c
@@ -0,0 +1,374 @@
+/*
+ * EnBW Communication Module Compact board
+ * Copyright 2011 DENX Software Engineering GmbH
+ * Author: Heiko Schocher <hs-ynQEQJNshbs@public.gmane.org>
+ *
+ * based on:
+ * TI DA850/OMAP-L138 EVM board
+ *
+ * Copyright (C) 2009 Texas Instruments Incorporated - http://www.ti.com/
+ *
+ * Derived from: arch/arm/mach-davinci/board-da850-evm.c
+ * Original Copyrights follow:
+ *
+ * 2007, 2009 (c) MontaVista Software, Inc. This file is licensed under
+ * the terms of the GNU General Public License version 2. This program
+ * is licensed "as is" without any warranty of any kind, whether express
+ * or implied.
+ */
+#include <linux/console.h>
+#include <linux/gpio.h>
+#include <linux/gpio_keys.h>
+#include <linux/i2c.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/mtd/mtd.h>
+#include <linux/mtd/nand.h>
+#include <linux/mtd/partitions.h>
+#include <linux/mtd/physmap.h>
+#include <linux/of.h>
+#include <linux/of_net.h>
+#include <linux/of_address.h>
+#include <linux/of_platform.h>
+#include <linux/phy.h>
+#include <linux/phy_fixed.h>
+#include <linux/platform_device.h>
+#include <linux/spi/spi.h>
+#include <linux/spi/flash.h>
+#include <asm/mach-types.h>
+#include <asm/mach/arch.h>
+#include <mach/aemif.h>
+#include <mach/cp_intc.h>
+#include <mach/da8xx.h>
+#include <mach/mux.h>
+#include <mach/nand.h>
+#include <mach/spi.h>
+
+#define ENBW_CMC_MMCSD_CD_PIN          GPIO_TO_PIN(3, 13)
+
+/*
+ * USB1 VBUS is controlled by GPIO7[12], over-current is reported on GPIO7[8].
+ */
+#define DA850_USB_VBUS_PIN	GPIO_TO_PIN(7, 12)
+#define ON_BD_USB_OVC		GPIO_TO_PIN(7, 8)
+
+#if defined(CONFIG_USB_OHCI_HCD)
+static irqreturn_t enbw_cmc_usb_ocic_irq(int irq, void *dev_id);
+static da8xx_ocic_handler_t enbw_cmc_usb_ocic_handler;
+
+static int enbw_cmc_usb_set_power(unsigned port, int on)
+{
+	gpio_set_value(DA850_USB_VBUS_PIN, on);
+	return 0;
+}
+
+static int enbw_cmc_usb_get_power(unsigned port)
+{
+	return gpio_get_value(DA850_USB_VBUS_PIN);
+}
+
+static int enbw_cmc_usb_get_oci(unsigned port)
+{
+	return !gpio_get_value(ON_BD_USB_OVC);
+}
+
+static irqreturn_t enbw_cmc_usb_ocic_irq(int, void *);
+
+static int enbw_cmc_usb_ocic_notify(da8xx_ocic_handler_t handler)
+{
+	int irq         = gpio_to_irq(ON_BD_USB_OVC);
+	int error       = 0;
+
+	if (handler != NULL) {
+		enbw_cmc_usb_ocic_handler = handler;
+
+		error = request_irq(irq, enbw_cmc_usb_ocic_irq,
+					IRQF_DISABLED | IRQF_TRIGGER_RISING |
+					IRQF_TRIGGER_FALLING,
+					"OHCI over-current indicator", NULL);
+		if (error)
+			pr_err("%s: could not request IRQ to watch "
+				"over-current indicator changes\n", __func__);
+	} else {
+		free_irq(irq, NULL);
+	}
+	return error;
+}
+
+static struct da8xx_ohci_root_hub enbw_cmc_usb11_pdata = {
+	.set_power      = enbw_cmc_usb_set_power,
+	.get_power      = enbw_cmc_usb_get_power,
+	.get_oci        = enbw_cmc_usb_get_oci,
+	.ocic_notify    = enbw_cmc_usb_ocic_notify,
+	.potpgt         = (10 + 1) / 2,  /* 10 ms max */
+};
+
+static irqreturn_t enbw_cmc_usb_ocic_irq(int irq, void *dev_id)
+{
+	enbw_cmc_usb_ocic_handler(&enbw_cmc_usb11_pdata, 1);
+	return IRQ_HANDLED;
+}
+#endif
+
+/* Set up USB PHY clocks; register USB 2.0 and, with OHCI, the USB 1.1 port */
+static __init void enbw_cmc_usb_init(void)
+{
+	int ret;
+	u32 cfgchip2;
+
+	/* Set up USB clock/mode in the CFGCHIP2 register. */
+	cfgchip2 = __raw_readl(DA8XX_SYSCFG0_VIRT(DA8XX_CFGCHIP2_REG));
+
+	/* USB2.0 PHY reference clock is AUXCLK with 24MHz */
+	cfgchip2 &= ~CFGCHIP2_REFFREQ;
+	cfgchip2 |=  CFGCHIP2_REFFREQ_24MHZ;
+
+	/*
+	 * Select internal reference clock for USB 2.0 PHY
+	 * and use it as a clock source for USB 1.1 PHY
+	 * (this is the default setting anyway).
+	 */
+	cfgchip2 &= ~CFGCHIP2_USB1PHYCLKMUX;
+	cfgchip2 |=  CFGCHIP2_USB2PHYCLKMUX;
+
+	cfgchip2 &= ~CFGCHIP2_OTGMODE;
+	cfgchip2 |=  CFGCHIP2_SESENDEN | CFGCHIP2_VBDTCTEN;
+
+	__raw_writel(cfgchip2, DA8XX_SYSCFG0_VIRT(DA8XX_CFGCHIP2_REG));
+
+	/*
+	 * SP2525A @ 5V supplies 500mA,
+	 * with the power on to power good time of 10 ms.
+	 */
+	ret = da8xx_register_usb20(500, 10);
+	if (ret)
+		pr_warning("%s: USB 2.0 registration failed: %d\n",
+			   __func__, ret);
+
+#if defined(CONFIG_USB_OHCI_HCD)
+	/* VBUS enable: must remain an output, .set_power() toggles it */
+	ret = gpio_request_one(DA850_USB_VBUS_PIN,
+			GPIOF_OUT_INIT_LOW, "USB 1.1 VBUS");
+	if (ret < 0) {
+		pr_err("%s: failed to request GPIO for USB 1.1 port "
+			"power control: %d\n", __func__, ret);
+		return;
+	}
+
+	/* over-current indicator: an input, read inverted by .get_oci() */
+	ret = gpio_request(ON_BD_USB_OVC, "ON_BD_USB_OVC");
+	if (ret) {
+		pr_err("%s: failed to request GPIO for USB 1.1 port "
+			"over-current indicator: %d\n", __func__, ret);
+		gpio_free(DA850_USB_VBUS_PIN);
+		return;
+	}
+	gpio_direction_input(ON_BD_USB_OVC);
+
+	ret = da8xx_register_usb11(&enbw_cmc_usb11_pdata);
+	if (ret) {
+		pr_warning("%s: USB 1.1 registration failed: %d\n",
+			   __func__, ret);
+		gpio_free(ON_BD_USB_OVC);
+		gpio_free(DA850_USB_VBUS_PIN);
+	}
+#endif
+}
+
+static int enbw_cmc_mmc_get_ro(int index)
+{
+	return 0;
+}
+
+static int enbw_cmc_mmc_get_cd(int index)
+{
+	return gpio_get_value(ENBW_CMC_MMCSD_CD_PIN) ? 1 : 0;
+}
+
+static struct davinci_mmc_config enbw_cmc_mmc_config = {
+	.get_ro		= enbw_cmc_mmc_get_ro,
+	.get_cd		= enbw_cmc_mmc_get_cd,
+	.wires		= 4,
+	.max_freq	= 50000000,
+	.caps		= MMC_CAP_MMC_HIGHSPEED | MMC_CAP_SD_HIGHSPEED,
+	.version	= MMC_CTLR_VERSION_2,
+};
+
+static int __init enbw_cmc_config_emac(void)
+{
+	void __iomem *cfg_chip3_base;
+	u32 val;
+	struct davinci_soc_info *soc_info = &davinci_soc_info;
+
+	if (!machine_is_enbw_cmc())
+		return 0;
+
+	cfg_chip3_base = DA8XX_SYSCFG0_VIRT(DA8XX_CFGCHIP3_REG);
+	val = __raw_readl(cfg_chip3_base);
+	val &= ~BIT(8);
+	pr_info("EMAC: MII PHY configured, RMII PHY will not be"
+						" functional\n");
+
+	/* configure the CFGCHIP3 register for MII */
+	__raw_writel(val, cfg_chip3_base);
+
+	/* use complete info from OF */
+	soc_info->emac_pdata = NULL;
+
+	return 0;
+}
+device_initcall(enbw_cmc_config_emac);
+
+static const s16 da850_dma0_rsv_chans[][2] = {
+	/* (offset, number) */
+	{-1, -1}
+};
+
+static const s16 da850_dma0_rsv_slots[][2] = {
+	/* (offset, number) */
+	{-1, -1}
+};
+
+static const s16 da850_dma1_rsv_chans[][2] = {
+	/* (offset, number) */
+	{-1, -1}
+};
+
+static const s16 da850_dma1_rsv_slots[][2] = {
+	/* (offset, number) */
+	{-1, -1}
+};
+
+static struct edma_rsv_info da850_edma_cc0_rsv = {
+	.rsv_chans	= da850_dma0_rsv_chans,
+	.rsv_slots	= da850_dma0_rsv_slots,
+};
+
+static struct edma_rsv_info da850_edma_cc1_rsv = {
+	.rsv_chans	= da850_dma1_rsv_chans,
+	.rsv_slots	= da850_dma1_rsv_slots,
+};
+
+static struct edma_rsv_info *da850_edma_rsv[2] = {
+	&da850_edma_cc0_rsv,
+	&da850_edma_cc1_rsv,
+};
+
+#ifdef CONFIG_CPU_FREQ
+static __init int da850_evm_init_cpufreq(void)
+{
+	switch (system_rev & 0xF) {
+	case 3:
+		da850_max_speed = 456000;
+		break;
+	case 2:
+		da850_max_speed = 408000;
+		break;
+	case 1:
+		da850_max_speed = 372000;
+		break;
+	}
+
+	return da850_register_cpufreq("pll0_sysclk3");
+}
+#else
+static __init int da850_evm_init_cpufreq(void) { return 0; }
+#endif
+
+static struct of_dev_auxdata enbw_cmc_auxdata_lookup[] __initdata = {
+	OF_DEV_AUXDATA("ti,davinci-wdt", 0x01c21000, "ti,davinci-wdt", NULL),
+	OF_DEV_AUXDATA("ti,davinci-i2c", 0x01c22000, "i2c_davinci.1", NULL),
+	OF_DEV_AUXDATA("ti,davinci-i2c", 0x01e28000, "i2c_davinci.2", NULL),
+	OF_DEV_AUXDATA("ti,davinci-dm6467-emac", 0x01e20000, "davinci_emac.1",
+			NULL),
+	{} /* Empty terminated list */
+};
+
+static const struct of_device_id enbw_cmc_bus_match_table[] = {
+	{ .compatible = "simple-bus", },
+	{ .compatible = "ti,da850", },
+	{ .compatible = "ti,davinci-onchipram", },
+	{ .compatible = "ti,davinci-aemif", },
+	{} /* Empty terminated list */
+};
+
+/* Populate the DT devices, then register the remaining DA850 peripherals. */
+static __init void enbw_cmc_init(void)
+{
+	int ret;
+
+	of_platform_populate(NULL, enbw_cmc_bus_match_table,
+		enbw_cmc_auxdata_lookup, NULL);
+
+	ret = da8xx_register_watchdog();
+	if (ret)
+		pr_warning("enbw_cmc_init: watchdog registration failed: %d\n",
+				ret);
+
+	ret = da850_register_edma(da850_edma_rsv);
+	if (ret)
+		pr_warning("enbw_cmc_init: edma registration failed: %d\n",
+				ret);
+
+	/* shut down uart 0, this port is not used on the board */
+	__raw_writel(0, IO_ADDRESS(DA8XX_UART0_BASE) + 0x30);
+
+	ret = da8xx_register_rtc();
+	if (ret)
+		pr_warning("enbw_cmc_init: rtc setup failed: %d\n", ret);
+
+	ret = da850_evm_init_cpufreq();
+	if (ret)
+		pr_warning("enbw_cmc_init: cpufreq registration failed: %d\n",
+				ret);
+
+	ret = da8xx_register_cpuidle();
+	if (ret)
+		pr_warning("enbw_cmc_init: cpuidle registration failed: %d\n",
+				ret);
+
+	/* MMC/SD card detect input, sampled by enbw_cmc_mmc_get_cd() */
+	ret = gpio_request(ENBW_CMC_MMCSD_CD_PIN, "MMC CD");
+	if (ret)
+		pr_warning("enbw_cmc_init: can not open GPIO %d\n",
+				ENBW_CMC_MMCSD_CD_PIN);
+	gpio_direction_input(ENBW_CMC_MMCSD_CD_PIN);
+
+	ret = da850_register_mmcsd1(&enbw_cmc_mmc_config);
+	if (ret)
+		pr_warning("enbw_cmc_init: mmcsd1 registration failed:"
+				" %d\n", ret);
+
+	enbw_cmc_usb_init();
+}
+
+#ifdef CONFIG_SERIAL_8250_CONSOLE
+static int __init enbw_cmc_console_init(void)
+{
+	if (!machine_is_enbw_cmc())
+		return 0;
+
+	return add_preferred_console("ttyS", 2, "115200");
+}
+console_initcall(enbw_cmc_console_init);
+#endif
+
+static void __init enbw_cmc_map_io(void)
+{
+	da850_init();
+}
+
+static const char *enbw_cmc_board_compat[] __initconst = {
+	"enbw,cmc",
+	NULL
+};
+
+MACHINE_START(ENBW_CMC, "EnBW CMC")
+	.map_io		= enbw_cmc_map_io,
+	.init_irq	= cp_intc_init,
+	.timer		= &davinci_timer,
+	.init_machine	= enbw_cmc_init,
+	.dt_compat	= enbw_cmc_board_compat,
+	.dma_zone_size	= SZ_128M,
+	.restart	= da8xx_restart,
+MACHINE_END
diff --git a/arch/arm/mach-davinci/include/mach/uncompress.h b/arch/arm/mach-davinci/include/mach/uncompress.h
index da2fb2c..6119543 100644
--- a/arch/arm/mach-davinci/include/mach/uncompress.h
+++ b/arch/arm/mach-davinci/include/mach/uncompress.h
@@ -98,6 +98,7 @@ static inline void __arch_decomp_setup(unsigned long arch_id)
 		DEBUG_LL_DA8XX(davinci_da850_evm,	2);
 		DEBUG_LL_DA8XX(mityomapl138,		1);
 		DEBUG_LL_DA8XX(omapl138_hawkboard,	2);
+		DEBUG_LL_DA8XX(enbw_cmc,		2);
 
 		/* TNETV107x boards */
 		DEBUG_LL_TNETV107X(tnetv107x,		1);
-- 
1.7.7.6

^ permalink raw reply related

* [PATCH v5 4/7] ARM: davinci: net: davinci_emac: add OF support
From: Heiko Schocher @ 2012-05-30 10:19 UTC (permalink / raw)
  To: davinci-linux-open-source
  Cc: Heiko Schocher, linux-arm-kernel, devicetree-discuss, netdev,
	Grant Likely, Sekhar Nori, Wolfgang Denk, Anatoly Sivov
In-Reply-To: <1338373143-7467-1-git-send-email-hs@denx.de>

Add OF (device tree) support for the davinci_emac driver.

Signed-off-by: Heiko Schocher <hs@denx.de>
Cc: davinci-linux-open-source@linux.davincidsp.com
Cc: linux-arm-kernel@lists.infradead.org
Cc: devicetree-discuss@lists.ozlabs.org
Cc: netdev@vger.kernel.org
Cc: Grant Likely <grant.likely@secretlab.ca>
Cc: Sekhar Nori <nsekhar@ti.com>
Cc: Wolfgang Denk <wd@denx.de>
Cc: Anatoly Sivov <mm05@mail.ru>

---
- changes for v2:
  - add comment from Anatoly Sivov
    - fix typo in davinci_emac.txt
  - add comment from Grant Likely:
    - add prefix "ti,davinci-" to davinci specific property names
    - remove version property
    - use compatible name "ti,davinci-dm6460-emac"
    - use devm_kzalloc()
    - use of_match_ptr()
    - document all new properties
    - remove of_address_to_resource() and do not overwrite
      resource table
    - whitespace fixes
    - remove hw_ram_addr as it is not used in current
      board code
- no changes for v3
- changes for v4:
  add comments from Nori Sekhar:
  - move devicetree documentation to:
    Documentation/devicetree/bindings/net/davinci_emac.txt
  - fix typo in it
  - rename compatible property to "ti,davinci-dm6467-emac"
  - remove pinmux-handle
  - set version directly in pdata->version
- no changes for v5

 .../devicetree/bindings/net/davinci_emac.txt       |   41 +++++++++
 drivers/net/ethernet/ti/davinci_emac.c             |   87 +++++++++++++++++++-
 2 files changed, 127 insertions(+), 1 deletions(-)
 create mode 100644 Documentation/devicetree/bindings/net/davinci_emac.txt

diff --git a/Documentation/devicetree/bindings/net/davinci_emac.txt b/Documentation/devicetree/bindings/net/davinci_emac.txt
new file mode 100644
index 0000000..48b259e
--- /dev/null
+++ b/Documentation/devicetree/bindings/net/davinci_emac.txt
@@ -0,0 +1,41 @@
+* Texas Instruments Davinci EMAC
+
+This file describes the contents of the device node
+for the davinci_emac interface.
+
+Required properties:
+- compatible: "ti,davinci-dm6467-emac";
+- reg: Offset and length of the register set for the device
+- ti,davinci-ctrl-reg-offset: offset to control register
+- ti,davinci-ctrl-mod-reg-offset: offset to control module register
+- ti,davinci-ctrl-ram-offset: offset to control module ram
+- ti,davinci-ctrl-ram-size: size of control module ram
+- ti,davinci-rmii-en: use RMII
+- ti,davinci-no-bd-ram: nonzero if the emac controller has no BD RAM
+- phy-handle: Contains a phandle to an Ethernet PHY.
+              If absent, the davinci_emac driver defaults to 100/FULL.
+- interrupts: interrupt mapping for the davinci emac interrupts sources:
+              4 sources: <Receive Threshold Interrupt
+			  Receive Interrupt
+			  Transmit Interrupt
+			  Miscellaneous Interrupt>
+
+Optional properties:
+- local-mac-address : 6 bytes, mac address
+
+Example (enbw_cmc board):
+	eth0: emac@1e20000 {
+		compatible = "ti,davinci-dm6467-emac";
+		reg = <0x220000 0x4000>;
+		ti,davinci-ctrl-reg-offset = <0x3000>;
+		ti,davinci-ctrl-mod-reg-offset = <0x2000>;
+		ti,davinci-ctrl-ram-offset = <0>;
+		ti,davinci-ctrl-ram-size = <0x2000>;
+		local-mac-address = [ 00 00 00 00 00 00 ];
+		interrupts = <33
+				34
+				35
+				36
+				>;
+		interrupt-parent = <&intc>;
+	};
diff --git a/drivers/net/ethernet/ti/davinci_emac.c b/drivers/net/ethernet/ti/davinci_emac.c
index 4da93a5..645618d 100644
--- a/drivers/net/ethernet/ti/davinci_emac.c
+++ b/drivers/net/ethernet/ti/davinci_emac.c
@@ -58,6 +58,12 @@
 #include <linux/io.h>
 #include <linux/uaccess.h>
 #include <linux/davinci_emac.h>
+#include <linux/of.h>
+#include <linux/of_address.h>
+#include <linux/of_irq.h>
+#include <linux/of_net.h>
+
+#include <mach/mux.h>
 
 #include <asm/irq.h>
 #include <asm/page.h>
@@ -339,6 +345,9 @@ struct emac_priv {
 	u32 rx_addr_type;
 	atomic_t cur_tx;
 	const char *phy_id;
+#ifdef CONFIG_OF
+	struct device_node *phy_node;
+#endif
 	struct phy_device *phydev;
 	spinlock_t lock;
 	/*platform specific members*/
@@ -1762,6 +1771,75 @@ static const struct net_device_ops emac_netdev_ops = {
 #endif
 };
 
+#ifdef CONFIG_OF
+/* Build the EMAC platform data from the device's DT node, if there is one */
+static struct emac_platform_data
+	*davinci_emac_of_get_pdata(struct platform_device *pdev,
+	struct emac_priv *priv)
+{
+	struct device_node *np = pdev->dev.of_node;
+	struct emac_platform_data *pdata = pdev->dev.platform_data;
+	const u8 *mac_addr;
+	u32 data;
+	int ret;
+
+	/* no DT node: hand back the board's pdata as is (NULL if none) */
+	if (!np)
+		return pdata;
+
+	if (!pdata) {
+		pdata = devm_kzalloc(&pdev->dev, sizeof(*pdata), GFP_KERNEL);
+		if (!pdata)
+			return NULL;
+	}
+	pdata->version = EMAC_VERSION_2;
+
+	mac_addr = of_get_mac_address(np);
+	if (mac_addr)
+		memcpy(pdata->mac_addr, mac_addr, ETH_ALEN);
+
+	/* a property that cannot be read leaves the pdata field untouched */
+	ret = of_property_read_u32(np, "ti,davinci-ctrl-reg-offset", &data);
+	if (!ret)
+		pdata->ctrl_reg_offset = data;
+
+	ret = of_property_read_u32(np, "ti,davinci-ctrl-mod-reg-offset",
+		&data);
+	if (!ret)
+		pdata->ctrl_mod_reg_offset = data;
+
+	ret = of_property_read_u32(np, "ti,davinci-ctrl-ram-offset", &data);
+	if (!ret)
+		pdata->ctrl_ram_offset = data;
+
+	ret = of_property_read_u32(np, "ti,davinci-ctrl-ram-size", &data);
+	if (!ret)
+		pdata->ctrl_ram_size = data;
+
+	ret = of_property_read_u32(np, "ti,davinci-rmii-en", &data);
+	if (!ret)
+		pdata->rmii_en = data;
+
+	ret = of_property_read_u32(np, "ti,davinci-no-bd-ram", &data);
+	if (!ret)
+		pdata->no_bd_ram = data;
+
+	/* no phy-handle: empty phy_id (binding: driver defaults to 100/FULL) */
+	priv->phy_node = of_parse_phandle(np, "phy-handle", 0);
+	if (!priv->phy_node)
+		pdata->phy_id = "";
+
+	pdev->dev.platform_data = pdata;
+	return pdata;
+}
+#else
+static struct emac_platform_data
+	*davinci_emac_of_get_pdata(struct platform_device *pdev,
+	struct emac_priv *priv)
+{
+	return pdev->dev.platform_data;
+}
+#endif
 /**
  * davinci_emac_probe: EMAC device probe
  * @pdev: The DaVinci EMAC device that we are removing
@@ -1804,7 +1882,7 @@ static int __devinit davinci_emac_probe(struct platform_device *pdev)
 
 	spin_lock_init(&priv->lock);
 
-	pdata = pdev->dev.platform_data;
+	pdata = davinci_emac_of_get_pdata(pdev, priv);
 	if (!pdata) {
 		dev_err(&pdev->dev, "no platform data\n");
 		rc = -ENODEV;
@@ -2015,6 +2093,12 @@ static const struct dev_pm_ops davinci_emac_pm_ops = {
 	.resume		= davinci_emac_resume,
 };
 
+static const struct of_device_id davinci_emac_of_match[] = {
+	{.compatible = "ti,davinci-dm6467-emac", },
+	{},
+};
+MODULE_DEVICE_TABLE(of, davinci_emac_of_match);
+
 /**
  * davinci_emac_driver: EMAC platform driver structure
  */
@@ -2023,6 +2107,7 @@ static struct platform_driver davinci_emac_driver = {
 		.name	 = "davinci_emac",
 		.owner	 = THIS_MODULE,
 		.pm	 = &davinci_emac_pm_ops,
+		.of_match_table = of_match_ptr(davinci_emac_of_match),
 	},
 	.probe = davinci_emac_probe,
 	.remove = __devexit_p(davinci_emac_remove),
-- 
1.7.7.6

^ permalink raw reply related


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox