Netdev List
 help / color / mirror / Atom feed
* Re: [PATCH 2/6][INET]: Consolidate inet(6)_hash_connect.
From: Arnaldo Carvalho de Melo @ 2008-01-31 15:42 UTC (permalink / raw)
  To: Arnaldo Carvalho de Melo, Pavel Emelyanov, David Miller,
	Linux Netdev List, devel
In-Reply-To: <20080131133954.GU1819@ghostprotocols.net>

Em Thu, Jan 31, 2008 at 11:39:55AM -0200, Arnaldo Carvalho de Melo escreveu:
> Em Thu, Jan 31, 2008 at 04:18:51PM +0300, Pavel Emelyanov escreveu:
> > Arnaldo Carvalho de Melo wrote:
> > > Em Thu, Jan 31, 2008 at 03:32:09PM +0300, Pavel Emelyanov escreveu:
> > >> These two functions are the same except for what they call
> > >> to "check_established" and "hash" for a socket.
> > >>
> > >> This saves half-a-kilo for ipv4 and ipv6.
> > > 
> > > Good stuff!
> > > 
> > > Yesterday I was perusing tcp_hash and I think we could have the hashinfo
> > > pointer stored perhaps in sk->sk_prot.
> > > 
> > > That way we would be able to kill tcp_hash(), inet_put_port() could
> > > receive just sk, etc.
> > 
> > But each proto will still have its own hashfn, so proto's 
> > callbacks will be called to hash/unhash sockets, so this will 
> > give us just one extra dereference. No?
> > 
> > > What do you think?
> > 
> > Hmmm... Even raw_hash, etc may become simpler. On the other hand
> > maybe this is a good idea, but I'm not very common with this code
> > yet to foresee such things in advance... I think that we should
> > try to prepare a patch and look, but if you have smth ready, then
> > it's better to review your stuff first.
> 
> gimme some minutes

A bit more than minutes tho, but here it is, I'm testing it now.

Take a look and if testing is ok I'll submit it with a proper
description.

- Arnaldo

diff --git a/include/net/inet6_hashtables.h b/include/net/inet6_hashtables.h
index fdff630..62a5b69 100644
--- a/include/net/inet6_hashtables.h
+++ b/include/net/inet6_hashtables.h
@@ -49,7 +49,7 @@ static inline int inet6_sk_ehashfn(const struct sock *sk)
 	return inet6_ehashfn(laddr, lport, faddr, fport);
 }
 
-extern void __inet6_hash(struct inet_hashinfo *hashinfo, struct sock *sk);
+extern void __inet6_hash(struct sock *sk);
 
 /*
  * Sockets in TCP_CLOSE state are _always_ taken out of the hash, so
diff --git a/include/net/inet_connection_sock.h b/include/net/inet_connection_sock.h
index 133cf30..f00f057 100644
--- a/include/net/inet_connection_sock.h
+++ b/include/net/inet_connection_sock.h
@@ -29,7 +29,6 @@
 #undef INET_CSK_CLEAR_TIMERS
 
 struct inet_bind_bucket;
-struct inet_hashinfo;
 struct tcp_congestion_ops;
 
 /*
@@ -59,6 +58,8 @@ struct inet_connection_sock_af_ops {
 				int level, int optname,
 				char __user *optval, int __user *optlen);
 	void	    (*addr2sockaddr)(struct sock *sk, struct sockaddr *);
+	int	    (*bind_conflict)(const struct sock *sk,
+				     const struct inet_bind_bucket *tb);
 };
 
 /** inet_connection_sock - INET connection oriented sock
@@ -244,10 +245,7 @@ extern struct request_sock *inet_csk_search_req(const struct sock *sk,
 						const __be32 laddr);
 extern int inet_csk_bind_conflict(const struct sock *sk,
 				  const struct inet_bind_bucket *tb);
-extern int inet_csk_get_port(struct inet_hashinfo *hashinfo,
-			     struct sock *sk, unsigned short snum,
-			     int (*bind_conflict)(const struct sock *sk,
-						  const struct inet_bind_bucket *tb));
+extern int inet_csk_get_port(struct sock *sk, unsigned short snum);
 
 extern struct dst_entry* inet_csk_route_req(struct sock *sk,
 					    const struct request_sock *req);
diff --git a/include/net/inet_hashtables.h b/include/net/inet_hashtables.h
index c23c4ed..48ac620 100644
--- a/include/net/inet_hashtables.h
+++ b/include/net/inet_hashtables.h
@@ -221,9 +221,9 @@ static inline int inet_sk_listen_hashfn(const struct sock *sk)
 }
 
 /* Caller must disable local BH processing. */
-static inline void __inet_inherit_port(struct inet_hashinfo *table,
-				       struct sock *sk, struct sock *child)
+static inline void __inet_inherit_port(struct sock *sk, struct sock *child)
 {
+	struct inet_hashinfo *table = sk->sk_prot->hashinfo;
 	const int bhash = inet_bhashfn(inet_sk(child)->num, table->bhash_size);
 	struct inet_bind_hashbucket *head = &table->bhash[bhash];
 	struct inet_bind_bucket *tb;
@@ -235,15 +235,14 @@ static inline void __inet_inherit_port(struct inet_hashinfo *table,
 	spin_unlock(&head->lock);
 }
 
-static inline void inet_inherit_port(struct inet_hashinfo *table,
-				     struct sock *sk, struct sock *child)
+static inline void inet_inherit_port(struct sock *sk, struct sock *child)
 {
 	local_bh_disable();
-	__inet_inherit_port(table, sk, child);
+	__inet_inherit_port(sk, child);
 	local_bh_enable();
 }
 
-extern void inet_put_port(struct inet_hashinfo *table, struct sock *sk);
+extern void inet_put_port(struct sock *sk);
 
 extern void inet_listen_wlock(struct inet_hashinfo *hashinfo);
 
@@ -266,41 +265,9 @@ static inline void inet_listen_unlock(struct inet_hashinfo *hashinfo)
 		wake_up(&hashinfo->lhash_wait);
 }
 
-extern void __inet_hash(struct inet_hashinfo *hashinfo, struct sock *sk);
-extern void __inet_hash_nolisten(struct inet_hashinfo *hinfo, struct sock *sk);
-
-static inline void inet_hash(struct inet_hashinfo *hashinfo, struct sock *sk)
-{
-	if (sk->sk_state != TCP_CLOSE) {
-		local_bh_disable();
-		__inet_hash(hashinfo, sk);
-		local_bh_enable();
-	}
-}
-
-static inline void inet_unhash(struct inet_hashinfo *hashinfo, struct sock *sk)
-{
-	rwlock_t *lock;
-
-	if (sk_unhashed(sk))
-		goto out;
-
-	if (sk->sk_state == TCP_LISTEN) {
-		local_bh_disable();
-		inet_listen_wlock(hashinfo);
-		lock = &hashinfo->lhash_lock;
-	} else {
-		lock = inet_ehash_lockp(hashinfo, sk->sk_hash);
-		write_lock_bh(lock);
-	}
-
-	if (__sk_del_node_init(sk))
-		sock_prot_inuse_add(sk->sk_prot, -1);
-	write_unlock_bh(lock);
-out:
-	if (sk->sk_state == TCP_LISTEN)
-		wake_up(&hashinfo->lhash_wait);
-}
+extern void __inet_hash_nolisten(struct sock *sk);
+extern void inet_hash(struct sock *sk);
+extern void inet_unhash(struct sock *sk);
 
 extern struct sock *__inet_lookup_listener(struct net *net,
 					   struct inet_hashinfo *hashinfo,
@@ -425,7 +392,7 @@ extern int __inet_hash_connect(struct inet_timewait_death_row *death_row,
 		struct sock *sk,
 		int (*check_established)(struct inet_timewait_death_row *,
 			struct sock *, __u16, struct inet_timewait_sock **),
-		void (*hash)(struct inet_hashinfo *, struct sock *));
+			       void (*hash)(struct sock *sk));
 extern int inet_hash_connect(struct inet_timewait_death_row *death_row,
 			     struct sock *sk);
 #endif /* _INET_HASHTABLES_H */
diff --git a/include/net/sock.h b/include/net/sock.h
index e3fb4c0..8a7889b 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -496,6 +496,7 @@ extern int sk_wait_data(struct sock *sk, long *timeo);
 
 struct request_sock_ops;
 struct timewait_sock_ops;
+struct inet_hashinfo;
 
 /* Networking protocol blocks we attach to sockets.
  * socket layer -> transport layer interface
@@ -578,6 +579,8 @@ struct proto {
 	struct request_sock_ops	*rsk_prot;
 	struct timewait_sock_ops *twsk_prot;
 
+	struct inet_hashinfo	*hashinfo;
+
 	struct module		*owner;
 
 	char			name[32];
diff --git a/net/dccp/dccp.h b/net/dccp/dccp.h
index ebe59d9..287a62b 100644
--- a/net/dccp/dccp.h
+++ b/net/dccp/dccp.h
@@ -271,8 +271,6 @@ extern struct sk_buff	*dccp_make_response(struct sock *sk,
 
 extern int	   dccp_connect(struct sock *sk);
 extern int	   dccp_disconnect(struct sock *sk, int flags);
-extern void	   dccp_hash(struct sock *sk);
-extern void	   dccp_unhash(struct sock *sk);
 extern int	   dccp_getsockopt(struct sock *sk, int level, int optname,
 				   char __user *optval, int __user *optlen);
 extern int	   dccp_setsockopt(struct sock *sk, int level, int optname,
diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c
index c982ad8..474075a 100644
--- a/net/dccp/ipv4.c
+++ b/net/dccp/ipv4.c
@@ -38,12 +38,6 @@
  */
 static struct socket *dccp_v4_ctl_socket;
 
-static int dccp_v4_get_port(struct sock *sk, const unsigned short snum)
-{
-	return inet_csk_get_port(&dccp_hashinfo, sk, snum,
-				 inet_csk_bind_conflict);
-}
-
 int dccp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
 {
 	struct inet_sock *inet = inet_sk(sk);
@@ -408,8 +402,8 @@ struct sock *dccp_v4_request_recv_sock(struct sock *sk, struct sk_buff *skb,
 
 	dccp_sync_mss(newsk, dst_mtu(dst));
 
-	__inet_hash_nolisten(&dccp_hashinfo, newsk);
-	__inet_inherit_port(&dccp_hashinfo, sk, newsk);
+	__inet_hash_nolisten(newsk);
+	__inet_inherit_port(sk, newsk);
 
 	return newsk;
 
@@ -898,6 +892,7 @@ static struct inet_connection_sock_af_ops dccp_ipv4_af_ops = {
 	.getsockopt	   = ip_getsockopt,
 	.addr2sockaddr	   = inet_csk_addr2sockaddr,
 	.sockaddr_len	   = sizeof(struct sockaddr_in),
+	.bind_conflict	   = inet_csk_bind_conflict,
 #ifdef CONFIG_COMPAT
 	.compat_setsockopt = compat_ip_setsockopt,
 	.compat_getsockopt = compat_ip_getsockopt,
@@ -937,10 +932,10 @@ static struct proto dccp_v4_prot = {
 	.sendmsg		= dccp_sendmsg,
 	.recvmsg		= dccp_recvmsg,
 	.backlog_rcv		= dccp_v4_do_rcv,
-	.hash			= dccp_hash,
-	.unhash			= dccp_unhash,
+	.hash			= inet_hash,
+	.unhash			= inet_unhash,
 	.accept			= inet_csk_accept,
-	.get_port		= dccp_v4_get_port,
+	.get_port		= inet_csk_get_port,
 	.shutdown		= dccp_shutdown,
 	.destroy		= dccp_destroy_sock,
 	.orphan_count		= &dccp_orphan_count,
@@ -948,6 +943,7 @@ static struct proto dccp_v4_prot = {
 	.obj_size		= sizeof(struct dccp_sock),
 	.rsk_prot		= &dccp_request_sock_ops,
 	.twsk_prot		= &dccp_timewait_sock_ops,
+	.hashinfo		= &dccp_hashinfo,
 #ifdef CONFIG_COMPAT
 	.compat_setsockopt	= compat_dccp_setsockopt,
 	.compat_getsockopt	= compat_dccp_getsockopt,
diff --git a/net/dccp/ipv6.c b/net/dccp/ipv6.c
index ed0a005..490333d 100644
--- a/net/dccp/ipv6.c
+++ b/net/dccp/ipv6.c
@@ -39,21 +39,15 @@ static struct socket *dccp_v6_ctl_socket;
 static struct inet_connection_sock_af_ops dccp_ipv6_mapped;
 static struct inet_connection_sock_af_ops dccp_ipv6_af_ops;
 
-static int dccp_v6_get_port(struct sock *sk, unsigned short snum)
-{
-	return inet_csk_get_port(&dccp_hashinfo, sk, snum,
-				 inet6_csk_bind_conflict);
-}
-
 static void dccp_v6_hash(struct sock *sk)
 {
 	if (sk->sk_state != DCCP_CLOSED) {
 		if (inet_csk(sk)->icsk_af_ops == &dccp_ipv6_mapped) {
-			dccp_hash(sk);
+			inet_hash(sk);
 			return;
 		}
 		local_bh_disable();
-		__inet6_hash(&dccp_hashinfo, sk);
+		__inet6_hash(sk);
 		local_bh_enable();
 	}
 }
@@ -630,8 +624,8 @@ static struct sock *dccp_v6_request_recv_sock(struct sock *sk,
 
 	newinet->daddr = newinet->saddr = newinet->rcv_saddr = LOOPBACK4_IPV6;
 
-	__inet6_hash(&dccp_hashinfo, newsk);
-	inet_inherit_port(&dccp_hashinfo, sk, newsk);
+	__inet6_hash(newsk);
+	inet_inherit_port(sk, newsk);
 
 	return newsk;
 
@@ -1054,6 +1048,7 @@ static struct inet_connection_sock_af_ops dccp_ipv6_af_ops = {
 	.getsockopt	   = ipv6_getsockopt,
 	.addr2sockaddr	   = inet6_csk_addr2sockaddr,
 	.sockaddr_len	   = sizeof(struct sockaddr_in6),
+	.bind_conflict	   = inet6_csk_bind_conflict,
 #ifdef CONFIG_COMPAT
 	.compat_setsockopt = compat_ipv6_setsockopt,
 	.compat_getsockopt = compat_ipv6_getsockopt,
@@ -1123,9 +1118,9 @@ static struct proto dccp_v6_prot = {
 	.recvmsg	   = dccp_recvmsg,
 	.backlog_rcv	   = dccp_v6_do_rcv,
 	.hash		   = dccp_v6_hash,
-	.unhash		   = dccp_unhash,
+	.unhash		   = inet_unhash,
 	.accept		   = inet_csk_accept,
-	.get_port	   = dccp_v6_get_port,
+	.get_port	   = inet_csk_get_port,
 	.shutdown	   = dccp_shutdown,
 	.destroy	   = dccp_v6_destroy_sock,
 	.orphan_count	   = &dccp_orphan_count,
@@ -1133,6 +1128,7 @@ static struct proto dccp_v6_prot = {
 	.obj_size	   = sizeof(struct dccp6_sock),
 	.rsk_prot	   = &dccp6_request_sock_ops,
 	.twsk_prot	   = &dccp6_timewait_sock_ops,
+	.hashinfo	   = &dccp_hashinfo,
 #ifdef CONFIG_COMPAT
 	.compat_setsockopt = compat_dccp_setsockopt,
 	.compat_getsockopt = compat_dccp_getsockopt,
diff --git a/net/dccp/proto.c b/net/dccp/proto.c
index 0bed4a6..e3f5d37 100644
--- a/net/dccp/proto.c
+++ b/net/dccp/proto.c
@@ -78,7 +78,7 @@ void dccp_set_state(struct sock *sk, const int state)
 		sk->sk_prot->unhash(sk);
 		if (inet_csk(sk)->icsk_bind_hash != NULL &&
 		    !(sk->sk_userlocks & SOCK_BINDPORT_LOCK))
-			inet_put_port(&dccp_hashinfo, sk);
+			inet_put_port(sk);
 		/* fall through */
 	default:
 		if (oldstate == DCCP_OPEN)
@@ -173,20 +173,6 @@ const char *dccp_state_name(const int state)
 
 EXPORT_SYMBOL_GPL(dccp_state_name);
 
-void dccp_hash(struct sock *sk)
-{
-	inet_hash(&dccp_hashinfo, sk);
-}
-
-EXPORT_SYMBOL_GPL(dccp_hash);
-
-void dccp_unhash(struct sock *sk)
-{
-	inet_unhash(&dccp_hashinfo, sk);
-}
-
-EXPORT_SYMBOL_GPL(dccp_unhash);
-
 int dccp_init_sock(struct sock *sk, const __u8 ctl_sock_initialized)
 {
 	struct dccp_sock *dp = dccp_sk(sk);
@@ -268,7 +254,7 @@ int dccp_destroy_sock(struct sock *sk)
 
 	/* Clean up a referenced DCCP bind bucket. */
 	if (inet_csk(sk)->icsk_bind_hash != NULL)
-		inet_put_port(&dccp_hashinfo, sk);
+		inet_put_port(sk);
 
 	kfree(dp->dccps_service_list);
 	dp->dccps_service_list = NULL;
diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
index de5a41d..b189278 100644
--- a/net/ipv4/inet_connection_sock.c
+++ b/net/ipv4/inet_connection_sock.c
@@ -78,11 +78,9 @@ EXPORT_SYMBOL_GPL(inet_csk_bind_conflict);
 /* Obtain a reference to a local port for the given sock,
  * if snum is zero it means select any available local port.
  */
-int inet_csk_get_port(struct inet_hashinfo *hashinfo,
-		      struct sock *sk, unsigned short snum,
-		      int (*bind_conflict)(const struct sock *sk,
-					   const struct inet_bind_bucket *tb))
+int inet_csk_get_port(struct sock *sk, unsigned short snum)
 {
+	struct inet_hashinfo *hashinfo = sk->sk_prot->hashinfo;
 	struct inet_bind_hashbucket *head;
 	struct hlist_node *node;
 	struct inet_bind_bucket *tb;
@@ -142,7 +140,7 @@ tb_found:
 			goto success;
 		} else {
 			ret = 1;
-			if (bind_conflict(sk, tb))
+			if (inet_csk(sk)->icsk_af_ops->bind_conflict(sk, tb))
 				goto fail_unlock;
 		}
 	}
diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c
index 48d4500..90f422c 100644
--- a/net/ipv4/inet_hashtables.c
+++ b/net/ipv4/inet_hashtables.c
@@ -66,8 +66,9 @@ void inet_bind_hash(struct sock *sk, struct inet_bind_bucket *tb,
 /*
  * Get rid of any references to a local port held by the given sock.
  */
-static void __inet_put_port(struct inet_hashinfo *hashinfo, struct sock *sk)
+static void __inet_put_port(struct sock *sk)
 {
+	struct inet_hashinfo *hashinfo = sk->sk_prot->hashinfo;
 	const int bhash = inet_bhashfn(inet_sk(sk)->num, hashinfo->bhash_size);
 	struct inet_bind_hashbucket *head = &hashinfo->bhash[bhash];
 	struct inet_bind_bucket *tb;
@@ -81,10 +82,10 @@ static void __inet_put_port(struct inet_hashinfo *hashinfo, struct sock *sk)
 	spin_unlock(&head->lock);
 }
 
-void inet_put_port(struct inet_hashinfo *hashinfo, struct sock *sk)
+void inet_put_port(struct sock *sk)
 {
 	local_bh_disable();
-	__inet_put_port(hashinfo, sk);
+	__inet_put_port(sk);
 	local_bh_enable();
 }
 
@@ -317,8 +318,9 @@ static inline u32 inet_sk_port_offset(const struct sock *sk)
 					  inet->dport);
 }
 
-void __inet_hash_nolisten(struct inet_hashinfo *hashinfo, struct sock *sk)
+void __inet_hash_nolisten(struct sock *sk)
 {
+	struct inet_hashinfo *hashinfo = sk->sk_prot->hashinfo;
 	struct hlist_head *list;
 	rwlock_t *lock;
 	struct inet_ehash_bucket *head;
@@ -337,13 +339,14 @@ void __inet_hash_nolisten(struct inet_hashinfo *hashinfo, struct sock *sk)
 }
 EXPORT_SYMBOL_GPL(__inet_hash_nolisten);
 
-void __inet_hash(struct inet_hashinfo *hashinfo, struct sock *sk)
+static void __inet_hash(struct sock *sk)
 {
+	struct inet_hashinfo *hashinfo = sk->sk_prot->hashinfo;
 	struct hlist_head *list;
 	rwlock_t *lock;
 
 	if (sk->sk_state != TCP_LISTEN) {
-		__inet_hash_nolisten(hashinfo, sk);
+		__inet_hash_nolisten(sk);
 		return;
 	}
 
@@ -357,13 +360,48 @@ void __inet_hash(struct inet_hashinfo *hashinfo, struct sock *sk)
 	write_unlock(lock);
 	wake_up(&hashinfo->lhash_wait);
 }
-EXPORT_SYMBOL_GPL(__inet_hash);
+
+void inet_hash(struct sock *sk)
+{
+	if (sk->sk_state != TCP_CLOSE) {
+		local_bh_disable();
+		__inet_hash(sk);
+		local_bh_enable();
+	}
+}
+EXPORT_SYMBOL_GPL(inet_hash);
+
+void inet_unhash(struct sock *sk)
+{
+	rwlock_t *lock;
+	struct inet_hashinfo *hashinfo = sk->sk_prot->hashinfo;
+
+	if (sk_unhashed(sk))
+		goto out;
+
+	if (sk->sk_state == TCP_LISTEN) {
+		local_bh_disable();
+		inet_listen_wlock(hashinfo);
+		lock = &hashinfo->lhash_lock;
+	} else {
+		lock = inet_ehash_lockp(hashinfo, sk->sk_hash);
+		write_lock_bh(lock);
+	}
+
+	if (__sk_del_node_init(sk))
+		sock_prot_inuse_add(sk->sk_prot, -1);
+	write_unlock_bh(lock);
+out:
+	if (sk->sk_state == TCP_LISTEN)
+		wake_up(&hashinfo->lhash_wait);
+}
+EXPORT_SYMBOL_GPL(inet_unhash);
 
 int __inet_hash_connect(struct inet_timewait_death_row *death_row,
 		struct sock *sk,
 		int (*check_established)(struct inet_timewait_death_row *,
 			struct sock *, __u16, struct inet_timewait_sock **),
-		void (*hash)(struct inet_hashinfo *, struct sock *))
+		void (*hash)(struct sock *sk))
 {
 	struct inet_hashinfo *hinfo = death_row->hashinfo;
 	const unsigned short snum = inet_sk(sk)->num;
@@ -427,7 +465,7 @@ ok:
 		inet_bind_hash(sk, tb, port);
 		if (sk_unhashed(sk)) {
 			inet_sk(sk)->sport = htons(port);
-			hash(hinfo, sk);
+			hash(sk);
 		}
 		spin_unlock(&head->lock);
 
@@ -444,7 +482,7 @@ ok:
 	tb  = inet_csk(sk)->icsk_bind_hash;
 	spin_lock_bh(&head->lock);
 	if (sk_head(&tb->owners) == sk && !sk->sk_bind_node.next) {
-		hash(hinfo, sk);
+		hash(sk);
 		spin_unlock_bh(&head->lock);
 		return 0;
 	} else {
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index a0d373b..071e83a 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -1669,7 +1669,7 @@ void tcp_set_state(struct sock *sk, int state)
 		sk->sk_prot->unhash(sk);
 		if (inet_csk(sk)->icsk_bind_hash &&
 		    !(sk->sk_userlocks & SOCK_BINDPORT_LOCK))
-			inet_put_port(&tcp_hashinfo, sk);
+			inet_put_port(sk);
 		/* fall through */
 	default:
 		if (oldstate==TCP_ESTABLISHED)
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 77c1939..63414ea 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -108,22 +108,6 @@ struct inet_hashinfo __cacheline_aligned tcp_hashinfo = {
 	.lhash_wait  = __WAIT_QUEUE_HEAD_INITIALIZER(tcp_hashinfo.lhash_wait),
 };
 
-static int tcp_v4_get_port(struct sock *sk, unsigned short snum)
-{
-	return inet_csk_get_port(&tcp_hashinfo, sk, snum,
-				 inet_csk_bind_conflict);
-}
-
-static void tcp_v4_hash(struct sock *sk)
-{
-	inet_hash(&tcp_hashinfo, sk);
-}
-
-void tcp_unhash(struct sock *sk)
-{
-	inet_unhash(&tcp_hashinfo, sk);
-}
-
 static inline __u32 tcp_v4_init_sequence(struct sk_buff *skb)
 {
 	return secure_tcp_sequence_number(ip_hdr(skb)->daddr,
@@ -1478,8 +1462,8 @@ struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
 	}
 #endif
 
-	__inet_hash_nolisten(&tcp_hashinfo, newsk);
-	__inet_inherit_port(&tcp_hashinfo, sk, newsk);
+	__inet_hash_nolisten(newsk);
+	__inet_inherit_port(sk, newsk);
 
 	return newsk;
 
@@ -1827,6 +1811,7 @@ struct inet_connection_sock_af_ops ipv4_specific = {
 	.getsockopt	   = ip_getsockopt,
 	.addr2sockaddr	   = inet_csk_addr2sockaddr,
 	.sockaddr_len	   = sizeof(struct sockaddr_in),
+	.bind_conflict	   = inet_csk_bind_conflict,
 #ifdef CONFIG_COMPAT
 	.compat_setsockopt = compat_ip_setsockopt,
 	.compat_getsockopt = compat_ip_getsockopt,
@@ -1926,7 +1911,7 @@ int tcp_v4_destroy_sock(struct sock *sk)
 
 	/* Clean up a referenced TCP bind bucket. */
 	if (inet_csk(sk)->icsk_bind_hash)
-		inet_put_port(&tcp_hashinfo, sk);
+		inet_put_port(sk);
 
 	/*
 	 * If sendmsg cached page exists, toss it.
@@ -2435,9 +2420,9 @@ struct proto tcp_prot = {
 	.getsockopt		= tcp_getsockopt,
 	.recvmsg		= tcp_recvmsg,
 	.backlog_rcv		= tcp_v4_do_rcv,
-	.hash			= tcp_v4_hash,
-	.unhash			= tcp_unhash,
-	.get_port		= tcp_v4_get_port,
+	.hash			= inet_hash,
+	.unhash			= inet_unhash,
+	.get_port		= inet_csk_get_port,
 	.enter_memory_pressure	= tcp_enter_memory_pressure,
 	.sockets_allocated	= &tcp_sockets_allocated,
 	.orphan_count		= &tcp_orphan_count,
@@ -2450,6 +2435,7 @@ struct proto tcp_prot = {
 	.obj_size		= sizeof(struct tcp_sock),
 	.twsk_prot		= &tcp_timewait_sock_ops,
 	.rsk_prot		= &tcp_request_sock_ops,
+	.hashinfo		= &tcp_hashinfo,
 #ifdef CONFIG_COMPAT
 	.compat_setsockopt	= compat_tcp_setsockopt,
 	.compat_getsockopt	= compat_tcp_getsockopt,
@@ -2467,7 +2453,6 @@ void __init tcp_v4_init(struct net_proto_family *ops)
 EXPORT_SYMBOL(ipv4_specific);
 EXPORT_SYMBOL(tcp_hashinfo);
 EXPORT_SYMBOL(tcp_prot);
-EXPORT_SYMBOL(tcp_unhash);
 EXPORT_SYMBOL(tcp_v4_conn_request);
 EXPORT_SYMBOL(tcp_v4_connect);
 EXPORT_SYMBOL(tcp_v4_do_rcv);
diff --git a/net/ipv6/inet6_hashtables.c b/net/ipv6/inet6_hashtables.c
index d325a99..43f3993 100644
--- a/net/ipv6/inet6_hashtables.c
+++ b/net/ipv6/inet6_hashtables.c
@@ -22,9 +22,9 @@
 #include <net/inet6_hashtables.h>
 #include <net/ip.h>
 
-void __inet6_hash(struct inet_hashinfo *hashinfo,
-				struct sock *sk)
+void __inet6_hash(struct sock *sk)
 {
+	struct inet_hashinfo *hashinfo = sk->sk_prot->hashinfo;
 	struct hlist_head *list;
 	rwlock_t *lock;
 
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 59d0029..12750f2 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -86,12 +86,6 @@ static struct tcp_sock_af_ops tcp_sock_ipv6_specific;
 static struct tcp_sock_af_ops tcp_sock_ipv6_mapped_specific;
 #endif
 
-static int tcp_v6_get_port(struct sock *sk, unsigned short snum)
-{
-	return inet_csk_get_port(&tcp_hashinfo, sk, snum,
-				 inet6_csk_bind_conflict);
-}
-
 static void tcp_v6_hash(struct sock *sk)
 {
 	if (sk->sk_state != TCP_CLOSE) {
@@ -100,7 +94,7 @@ static void tcp_v6_hash(struct sock *sk)
 			return;
 		}
 		local_bh_disable();
-		__inet6_hash(&tcp_hashinfo, sk);
+		__inet6_hash(sk);
 		local_bh_enable();
 	}
 }
@@ -1504,8 +1498,8 @@ static struct sock * tcp_v6_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
 	}
 #endif
 
-	__inet6_hash(&tcp_hashinfo, newsk);
-	inet_inherit_port(&tcp_hashinfo, sk, newsk);
+	__inet6_hash(newsk);
+	inet_inherit_port(sk, newsk);
 
 	return newsk;
 
@@ -1833,6 +1827,7 @@ static struct inet_connection_sock_af_ops ipv6_specific = {
 	.getsockopt	   = ipv6_getsockopt,
 	.addr2sockaddr	   = inet6_csk_addr2sockaddr,
 	.sockaddr_len	   = sizeof(struct sockaddr_in6),
+	.bind_conflict	   = inet6_csk_bind_conflict,
 #ifdef CONFIG_COMPAT
 	.compat_setsockopt = compat_ipv6_setsockopt,
 	.compat_getsockopt = compat_ipv6_getsockopt,
@@ -1864,6 +1859,7 @@ static struct inet_connection_sock_af_ops ipv6_mapped = {
 	.getsockopt	   = ipv6_getsockopt,
 	.addr2sockaddr	   = inet6_csk_addr2sockaddr,
 	.sockaddr_len	   = sizeof(struct sockaddr_in6),
+	.bind_conflict	   = inet6_csk_bind_conflict,
 #ifdef CONFIG_COMPAT
 	.compat_setsockopt = compat_ipv6_setsockopt,
 	.compat_getsockopt = compat_ipv6_getsockopt,
@@ -2127,8 +2123,8 @@ struct proto tcpv6_prot = {
 	.recvmsg		= tcp_recvmsg,
 	.backlog_rcv		= tcp_v6_do_rcv,
 	.hash			= tcp_v6_hash,
-	.unhash			= tcp_unhash,
-	.get_port		= tcp_v6_get_port,
+	.unhash			= inet_unhash,
+	.get_port		= inet_csk_get_port,
 	.enter_memory_pressure	= tcp_enter_memory_pressure,
 	.sockets_allocated	= &tcp_sockets_allocated,
 	.memory_allocated	= &tcp_memory_allocated,
@@ -2141,6 +2137,7 @@ struct proto tcpv6_prot = {
 	.obj_size		= sizeof(struct tcp6_sock),
 	.twsk_prot		= &tcp6_timewait_sock_ops,
 	.rsk_prot		= &tcp6_request_sock_ops,
+	.hashinfo		= &tcp_hashinfo,
 #ifdef CONFIG_COMPAT
 	.compat_setsockopt	= compat_tcp_setsockopt,
 	.compat_getsockopt	= compat_tcp_getsockopt,

^ permalink raw reply related

* Re: rtl8150: use default MTU of 1500
From: Petko Manolov @ 2008-01-31 15:42 UTC (permalink / raw)
  To: Lennert Buytenhek; +Cc: netdev, jgarzik
In-Reply-To: <20080130193742.GA13631@xi.wantstofly.org>

On Wed, 30 Jan 2008, Lennert Buytenhek wrote:

> The RTL8150 driver uses an MTU of 1540 by default, which causes a
> bunch of problems -- it prevents booting from NFS root, for one.

Agreed, although it is a bit strange how this particular bug has sneaked 
up for so long...


cheers,
Petko



> Signed-off-by: Lennert Buytenhek <buytenh@wantstofly.org>
> Cc: Petko Manolov <petkan@nucleusys.com>
>
> --- linux-2.6.24-git7.orig/drivers/net/usb/rtl8150.c	2008-01-24 23:58:37.000000000 +0100
> +++ linux-2.6.24-git7/drivers/net/usb/rtl8150.c	2008-01-30 20:29:00.000000000 +0100
> @@ -925,9 +925,8 @@
> 	netdev->hard_start_xmit = rtl8150_start_xmit;
> 	netdev->set_multicast_list = rtl8150_set_multicast;
> 	netdev->set_mac_address = rtl8150_set_mac_address;
> 	netdev->get_stats = rtl8150_netdev_stats;
> -	netdev->mtu = RTL8150_MTU;
> 	SET_ETHTOOL_OPS(netdev, &ops);
> 	dev->intr_interval = 100;	/* 100ms */
>
> 	if (!alloc_all_urbs(dev)) {
>

^ permalink raw reply

* Re: e1000 full-duplex TCP performance well below wire speed
From: Bruce Allen @ 2008-01-31 15:54 UTC (permalink / raw)
  To: Bill Fink
  Cc: SANGTAE HA, Linux Kernel Mailing List, netdev, Stephen Hemminger
In-Reply-To: <20080131064533.ef0ae932.billfink@mindspring.com>

Hi Bill,

> I see similar results on my test systems

Thanks for this report and for confirming our observations.  Could you 
please confirm that a single-port bidirectional UDP link runs at wire 
speed?  This helps to localize the problem to the TCP stack or interaction 
of the TCP stack with the e1000 driver and hardware.

Cheers,
 	Bruce

^ permalink raw reply

* Re: e1000 full-duplex TCP performance well below wire speed
From: Bruce Allen @ 2008-01-31 15:57 UTC (permalink / raw)
  To: David Acker
  Cc: Bill Fink, SANGTAE HA, Linux Kernel Mailing List, netdev,
	Stephen Hemminger
In-Reply-To: <47A1E026.2070805@roinet.com>

Hi David,

> Could this be an issue with pause frames?  At a previous job I remember 
> having issues with a similar configuration using two broadcom sb1250 3 
> gigE port devices. If I ran bidirectional tests on a single pair of 
> ports connected via cross over, it was slower than when I gave each 
> direction its own pair of ports.  The problem turned out to be that 
> pause frame generation and handling was not configured correctly.

We had PAUSE frames turned off for our testing.  The idea is to let TCP 
do the flow and congestion control.

The problem with PAUSE+TCP is that it can cause head-of-line blocking, 
where a single oversubscribed output port on a switch can PAUSE a large 
number of flows on other paths.

Cheers,
 	Bruce

^ permalink raw reply

* Re: rtl8150: use default MTU of 1500
From: Lennert Buytenhek @ 2008-01-31 16:05 UTC (permalink / raw)
  To: Petko Manolov; +Cc: netdev, jgarzik
In-Reply-To: <alpine.DEB.1.00.0801311741400.4476@bender.nucleusys.com>

On Thu, Jan 31, 2008 at 05:42:34PM +0200, Petko Manolov wrote:

> > The RTL8150 driver uses an MTU of 1540 by default, which causes a
> > bunch of problems -- it prevents booting from NFS root, for one.
> 
> Agreed, although it is a bit strange how this particular bug has
> sneaked up for so long...

I posted this patch sometime in 2006, and you asked me a question
about it then (why we don't just set RTL8150_MTU to 1500 -- the
answer would be that RTL8150_MTU is used in a couple more places
in the driver, including for allocing skbuffs), but I failed to
follow up to that question at the time, which is why I assume it got
dropped.

I have been carrying the patch in my own tree since then, and only
noticed recently that the patch never made it upstream.


cheers,
Lennert


> >Signed-off-by: Lennert Buytenhek <buytenh@wantstofly.org>
> >Cc: Petko Manolov <petkan@nucleusys.com>
> >
> >--- linux-2.6.24-git7.orig/drivers/net/usb/rtl8150.c	2008-01-24 
> >23:58:37.000000000 +0100
> >+++ linux-2.6.24-git7/drivers/net/usb/rtl8150.c	2008-01-30 
> >20:29:00.000000000 +0100
> >@@ -925,9 +925,8 @@
> >	netdev->hard_start_xmit = rtl8150_start_xmit;
> >	netdev->set_multicast_list = rtl8150_set_multicast;
> >	netdev->set_mac_address = rtl8150_set_mac_address;
> >	netdev->get_stats = rtl8150_netdev_stats;
> >-	netdev->mtu = RTL8150_MTU;
> >	SET_ETHTOOL_OPS(netdev, &ops);
> >	dev->intr_interval = 100;	/* 100ms */
> >
> >	if (!alloc_all_urbs(dev)) {
> >

^ permalink raw reply

* Re: e1000 full-duplex TCP performance well below wire speed
From: Carsten Aulbert @ 2008-01-31 16:09 UTC (permalink / raw)
  To: Andi Kleen
  Cc: Bruce Allen, Brandeburg, Jesse, netdev, Henning Fehrmann,
	Bruce Allen
In-Reply-To: <p738x265p7p.fsf@bingen.suse.de>

Hi Andi,

Andi Kleen wrote:
> Another issue with full duplex TCP not mentioned yet is that if TSO is used 
> the output  will be somewhat bursty and might cause problems with the 
> TCP ACK clock of the other direction because the ACKs would need 
> to squeeze in between full TSO bursts.
> 
> You could try disabling TSO with ethtool.

I just tried that:

https://n0.aei.uni-hannover.de/wiki/index.php/NetworkTestNetperf3

It seems that the numbers do get better (sweet-spot seems to be MTU6000 
with 914 MBit/s and 927 MBit/s), however for other settings the results 
vary a lot so I'm not sure how large the statistical fluctuations are.

Next, I'll test whether it makes sense to enlarge the ring buffers.

Thanks

Carsten

^ permalink raw reply

* Re: [PATCH] [1/1] Deprecate tcp_tw_{reuse,recycle}
From: Ben Greear @ 2008-01-31 16:41 UTC (permalink / raw)
  To: Andi Kleen; +Cc: netdev
In-Reply-To: <200801310755.02110.ak@suse.de>

Andi Kleen wrote:
>> I believe the problem was that all of my ports were used up with
>> TIME_WAIT sockets and so it couldn't create more.  My test
>> case was similar to this:
>>     
>
> Ah that's simple to solve then :- use more IP addresses and bind 
> to them in RR in your user program.
>
> Arguably the Linux TCP code should be able to do this by itself
> when enough IP addresses are available, but it's not very hard
> to do in user space using bind(2)
>
> BTW it's also an very unusual case -- in most cases there are more
> remote IP addresses
>   
This could be done, but it does decrease our options for testing certain 
scenarios.
>> So, is there a better way to max out the connections per second without 
>> having to use tcp_tw_recycle?
>>     
>
> Well did you profile where the bottle necks were?
>
> Perhaps also just increase the memory allowed for TCP sockets.
>   
I may be missing something, but I believe the issue is that the sockets 
wait around a while (maybe 30 seconds
or so) in TIME_WAIT state.  So, even if we use all 64k of the local port 
range, that will limit us to about 2000 new sockets
per second, as we have to wait for old ones to transition out of TIME_WAIT.

I guess I could probably decrease TIME_WAIT, but then all of my 
connections would be affected, not just the
ones on the ports creating very large numbers of connections per 
second.  From 'man tcp', it does not seem
I can set the TIME_WAIT on a per-socket basis.

I don't know exactly how the tcp_tw_recycle works, but it seems like it 
could be made to only
take effect when all local ports are used up in TIME_WAIT.  It could 
then recycle the oldest one
as a new socket is requested.  For any normal program, it would be very 
unlikely to ever need to
recycle in this case because there would be enough free IP/port pairs 
available.  But, for weird things
like my own, at least it could be made to work w/out hacking the global 
TIME_WAIT.

Thanks,
Ben

-- 
Ben Greear <greearb@candelatech.com> 
Candela Technologies Inc  http://www.candelatech.com



^ permalink raw reply

* Re: [PATCH 0/6] preparations to enable netdevice notifiers inside a namespace (resend)
From: Benjamin Thery @ 2008-01-31 16:47 UTC (permalink / raw)
  To: Daniel Lezcano; +Cc: Linux Containers, netdev, David Miller, Denis V. Lunev
In-Reply-To: <47A1E21C.40909-GANU6spQydw@public.gmane.org>

On Jan 31, 2008 3:58 PM, Daniel Lezcano <daniel.lezcano-GANU6spQydw@public.gmane.org> wrote:

> Denis V. Lunev wrote:
> > Here are some preparations and cleanups to enable network device/inet
> > address notifiers inside a namespace.
> >
> > This set of patches has been originally sent last Friday. One cleanup
> > patch from the original series is dropped as wrong, thanks to Daniel
> > Lezcano.
>
> Can you explain please.


I think Denis refers to the patch called "3/7 Prohibit assignment of
0.0.0.0 as interface address", which he dropped because it was
inappropriate, no?

-- Benjamin

--
> To unsubscribe from this list: send the line "unsubscribe netdev" in
> the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
>

^ permalink raw reply

* Re: [PATCH] [1/1] Deprecate tcp_tw_{reuse,recycle}
From: Andi Kleen @ 2008-01-31 16:49 UTC (permalink / raw)
  To: Ben Greear; +Cc: Andi Kleen, netdev
In-Reply-To: <47A1FA42.4070503@candelatech.com>

On Thu, Jan 31, 2008 at 08:41:38AM -0800, Ben Greear wrote:
> I don't know exactly how the tcp_tw_recycle works, but it seems like it 
> could be made to only
> take effect when all local ports are used up in TIME_WAIT.  

TIME-WAIT does not actually use up local ports; it uses up remote ports
because it is done on the LISTEN socket which has always a fixed
local port. And it has no idea how many ports the other end has left.

-Andi

^ permalink raw reply

* [PATCH 1/1]: Add support for aes-ctr to ipsec
From: Joy Latten @ 2008-01-31 16:59 UTC (permalink / raw)
  To: netdev; +Cc: davem, herbert

Very sorry, re-posting as first patch was incomplete.

The below patch allows IPsec to use CTR mode with
AES encryption algorithm. Tested this using setkey
in ipsec-tools.

regards,
Joy


Signed-off-by: Joy Latten <latten@austin.ibm.com>

--

diff -urpN net-2.6.25/include/linux/pfkeyv2.h net-2.6.25.patch/include/linux/pfkeyv2.h
--- net-2.6.25/include/linux/pfkeyv2.h	2008-01-29 11:48:00.000000000 -0600
+++ net-2.6.25.patch/include/linux/pfkeyv2.h	2008-01-29 13:43:59.000000000 -0600
@@ -298,6 +298,7 @@ struct sadb_x_sec_ctx {
 #define SADB_X_EALG_BLOWFISHCBC		7
 #define SADB_EALG_NULL			11
 #define SADB_X_EALG_AESCBC		12
+#define SADB_X_EALG_AESCTR		13
 #define SADB_X_EALG_CAMELLIACBC		22
 #define SADB_EALG_MAX                   253 /* last EALG */
 /* private allocations should use 249-255 (RFC2407) */
diff -urpN net-2.6.25/net/xfrm/xfrm_algo.c net-2.6.25.patch/net/xfrm/xfrm_algo.c
--- net-2.6.25/net/xfrm/xfrm_algo.c	2008-01-29 11:48:03.000000000 -0600
+++ net-2.6.25.patch/net/xfrm/xfrm_algo.c	2008-01-29 13:42:43.000000000 -0600
@@ -300,6 +300,23 @@ static struct xfrm_algo_desc ealg_list[]
 		.sadb_alg_maxbits = 256
 	}
 },
+{
+	.name = "rfc3686(ctr(aes))",
+
+	.uinfo = {
+		.encr = {
+			.blockbits = 128,
+			.defkeybits = 160, /* 128-bit key + 32-bit nonce */
+		}
+	},
+
+	.desc = {
+		.sadb_alg_id = SADB_X_EALG_AESCTR,
+		.sadb_alg_ivlen	= 8,
+		.sadb_alg_minbits = 128,
+		.sadb_alg_maxbits = 256
+	}
+},
 };
 
 static struct xfrm_algo_desc calg_list[] = {

^ permalink raw reply

* RE: e1000 full-duplex TCP performance well below wire speed
From: Brandeburg, Jesse @ 2008-01-31 17:20 UTC (permalink / raw)
  To: Carsten Aulbert; +Cc: Bruce Allen, netdev, Henning Fehrmann, Bruce Allen
In-Reply-To: <47A1E553.8010006@aei.mpg.de>

Carsten Aulbert wrote:
> We are using MSI, /proc/interrupts look like:
> n0003:~# cat /proc/interrupts
> 378:   17234866          0          0          0   PCI-MSI-edge     
> eth1 
> 379:     129826          0          0          0   PCI-MSI-edge 
> eth0

> (sorry for the line break).
> 
> What we don't understand is why only core0 gets the interrupts, since
> the affinity is set to f:
> # cat /proc/irq/378/smp_affinity
> f

without CONFIG_IRQBALANCE set, and no irqbalance daemon running, this is
expected.  Seems it is also dependent upon your system hardware.
 
> Right now, irqbalance is not running, though I can give it a shot if
> people think this will make a difference.

probably won't make much of a difference if you only have a single
interrupt source generating interrupts.  If you are using both adapters
simultaneously, please use smp_affinity or turn on irqbalance.
 
>> I would suggest you try TCP_RR with a command line something like
>> this: netperf -t TCP_RR -H <hostname> -C -c -- -b 4 -r 64K
> 
> I did that and the results can be found here:
> https://n0.aei.uni-hannover.de/wiki/index.php/NetworkTest

seems something went wrong and all you ran was the 1 byte tests, where
it should have been 64K both directions (request/response).
 
> The results with netperf running like
> netperf -t TCP_STREAM -H <host> -l 20
> can be found here:
> https://n0.aei.uni-hannover.de/wiki/index.php/NetworkTestNetperf1

 
> I reran the tests with
> netperf -t <test> -H <host> -l 20 -c -C
> or in the case of TCP_RR with the suggested burst settings -b 4 -r 64k

I get: 
TCP REQUEST/RESPONSE TEST from 0.0.0.0 (0.0.0.0) port 0 AF_INET to foo
(134.134.3.121) port 0 AF_INET : first burst 4
Local /Remote
Socket Size   Request Resp.  Elapsed Trans.   CPU    CPU    S.dem
S.dem
Send   Recv   Size    Size   Time    Rate     local  remote local
remote
bytes  bytes  bytes   bytes  secs.   per sec  % S    % S    us/Tr
us/Tr

16384  87380  65536   65536  10.00   1565.34  14.17  27.18  362.220
347.243
16384  87380 
 
>> Yes, InterruptThrottleRate=8000 means there will be no more than 8000
>> ints/second from that adapter, and if interrupts are generated faster
>> than that they are "aggregated."
>> 
>> Interestingly since you are interested in ultra low latency, and may
>> be willing to give up some cpu for it during bulk transfers you
>> should try InterruptThrottleRate=1 (can generate up to 70000 ints/s)
>> 
> 
> On the web page you'll see that there are about 4000 interrupts/s for
> most tests and up to 20,000/s for the TCP_RR test. Shall I change the
> throttle rate?

that's the auto-tuning, I suggest just InterruptThrottleRate=4000 or
8000 if all you're concerned about is bulk traffic performance.
 
>>>> just for completeness can you post the dump of ethtool -e eth0 and
>>>> lspci -vvv?
>>> Yup, we'll give that info also.
> 
> n0002:~# ethtool -e eth1
> Offset          Values
> ------          ------
> 0x0000          00 30 48 93 94 2d 20 0d 46 f7 57 00 ff ff ff ff
> 0x0010          ff ff ff ff 6b 02 9a 10 d9 15 9a 10 86 80 df 80
> 0x0020          00 00 00 20 54 7e 00 00 00 10 da 00 04 00 00 27
> 0x0030          c9 6c 50 31 32 07 0b 04 84 29 00 00 00 c0 06 07
> 0x0040          08 10 00 00 04 0f ff 7f 01 4d ff ff ff ff ff ff
> 0x0050          14 00 1d 00 14 00 1d 00 af aa 1e 00 00 00 1d 00
> 0x0060          00 01 00 40 1e 12 ff ff ff ff ff ff ff ff ff ff
> 0x0070          ff ff ff ff ff ff ff ff ff ff ff ff ff ff cf 2f

this looks fine.
 
> lspci -vvv for this card:
> 0e:00.0 Ethernet controller: Intel Corporation 82573L Gigabit Ethernet
> Controller
>          Subsystem: Super Micro Computer Inc Unknown device 109a
>          Control: I/O+ Mem+ BusMaster+ SpecCycle- MemWINV- VGASnoop-
> ParErr- Stepping- SERR+ FastB2B-
>          Status: Cap+ 66MHz- UDF- FastB2B- ParErr- DEVSEL=fast
> >TAbort- <TAbort- <MAbort- >SERR- <PERR-
>          Latency: 0, Cache Line Size: 64 bytes
>          Interrupt: pin A routed to IRQ 378
>          Region 0: Memory at ee200000 (32-bit, non-prefetchable)
>          [size=128K] Region 2: I/O ports at 5000 [size=32]
>          Capabilities: [c8] Power Management version 2
>                  Flags: PMEClk- DSI+ D1- D2- AuxCurrent=0mA
> PME(D0+,D1-,D2-,D3hot+,D3cold+)
>                  Status: D0 PME-Enable- DSel=0 DScale=1 PME-
>          Capabilities: [d0] Message Signalled Interrupts: Mask- 64bit+
> Queue=0/0 Enable+
>                  Address: 00000000fee0f00c  Data: 41b9
>          Capabilities: [e0] Express Endpoint IRQ 0
>                  Device: Supported: MaxPayload 256 bytes, PhantFunc 0,
> ExtTag-
>                  Device: Latency L0s <512ns, L1 <64us
>                  Device: AtnBtn- AtnInd- PwrInd-
>                  Device: Errors: Correctable- Non-Fatal- Fatal-
>                  Unsupported- Device: RlxdOrd+ ExtTag- PhantFunc-
>                  AuxPwr- NoSnoop+ Device: MaxPayload 128 bytes,
>                  MaxReadReq 512 bytes Link: Supported Speed 2.5Gb/s,
> Width x1, ASPM unknown, 
> Port 0
>                  Link: Latency L0s <128ns, L1 <64us
>                  Link: ASPM Disabled RCB 64 bytes CommClk- ExtSynch-
>                  Link: Speed 2.5Gb/s, Width x1
>          Capabilities: [100] Advanced Error Reporting
>          Capabilities: [140] Device Serial Number
> 2d-94-93-ff-ff-48-30-00 

this also looks good: no ASPM, MSI enabled.
 

^ permalink raw reply

* Re: [PATCH] Disable TSO for non standard qdiscs
From: Stephen Hemminger @ 2008-01-31 17:23 UTC (permalink / raw)
  To: Andi Kleen; +Cc: netdev
In-Reply-To: <20080131124632.GA25299@basil.nowhere.org>

On Thu, 31 Jan 2008 13:46:32 +0100
Andi Kleen <andi@firstfloor.org> wrote:

> 
> TSO interacts badly with many queueing disciplines because they rely on 
> reordering packets from different streams and the large TSO packets can 
> make this difficult. This patch disables TSO for sockets that send over 
> devices with non standard queueing disciplines. That's anything but noop 
> or pfifo_fast and pfifo right now.
> 
> Longer term other queueing disciplines could be checked if they
> are also ok with TSO. If yes they can set the TCQ_F_GSO_OK flag too.
> 
> It is still enabled for the standard pfifo_fast because that will never
> reorder packets with the same type-of-service. This means 99+% of all users
> will still be able to use TSO just fine.
> 
> The status is only set up at socket creation so a shifted route
> will not reenable TSO on an existing socket. I don't think that's a 
> problem though.
> 
> Signed-off-by: Andi Kleen <ak@suse.de>
> 


Fix the broken qdisc instead.

-- 
Stephen Hemminger <stephen.hemminger@vyatta.com>

^ permalink raw reply

* Re: e1000 full-duplex TCP performance well below wire speed
From: Carsten Aulbert @ 2008-01-31 17:27 UTC (permalink / raw)
  To: Brandeburg, Jesse; +Cc: Bruce Allen, netdev, Henning Fehrmann, Bruce Allen
In-Reply-To: <36D9DB17C6DE9E40B059440DB8D95F52044F8EC9@orsmsx418.amr.corp.intel.com>

Hi all,

Brandeburg, Jesse wrote:
>>> I would suggest you try TCP_RR with a command line something like
>>> this: netperf -t TCP_RR -H <hostname> -C -c -- -b 4 -r 64K
>> I did that and the results can be found here:
>> https://n0.aei.uni-hannover.de/wiki/index.php/NetworkTest
> 
> seems something went wrong and all you ran was the 1 byte tests, where
> it should have been 64K both directions (request/response).
>  

Yes, shell-quoting got me there. I'll re-run the tests, so please don't 
look at the TCP_RR results too closely. I think I'll be able to run 
maybe one or two more tests today, rest will follow tomorrow.

Thanks for bearing with me

Carsten

PS: Am I right that the TCP_RR tests should only be run on a single node 
at a time, not on both ends simultaneously?

^ permalink raw reply

* Re: e1000 full-duplex TCP performance well below wire speed
From: Bill Fink @ 2008-01-31 17:36 UTC (permalink / raw)
  To: Bruce Allen
  Cc: SANGTAE HA, Linux Kernel Mailing List, netdev, Stephen Hemminger
In-Reply-To: <Pine.LNX.4.63.0801310951060.10967@trinity.phys.uwm.edu>

Hi Bruce,

On Thu, 31 Jan 2008, Bruce Allen wrote:

> > I see similar results on my test systems
> 
> Thanks for this report and for confirming our observations.  Could you 
> please confirm that a single-port bidirectional UDP link runs at wire 
> speed?  This helps to localize the problem to the TCP stack or interaction 
> of the TCP stack with the e1000 driver and hardware.

Yes, a single-port bidirectional UDP test gets full GigE line rate
in both directions with no packet loss.

[bill@chance4 ~]$ nuttcp -f-beta -Itx -u -Ru -w2m 192.168.6.79 & nuttcp -f-beta -Irx -r -u -Ru -w2m 192.168.6.79
tx:  1187.0078 MB /  10.04 sec =  992.0550 Mbps 19 %TX 7 %RX 0 / 151937 drop/pkt 0.00 %loss
rx:  1187.1016 MB /  10.03 sec =  992.3408 Mbps 19 %TX 7 %RX 0 / 151949 drop/pkt 0.00 %loss

						-Bill

^ permalink raw reply

* RE: e1000 full-duplex TCP performance well below wire speed
From: Brandeburg, Jesse @ 2008-01-31 17:33 UTC (permalink / raw)
  To: Carsten Aulbert; +Cc: Bruce Allen, netdev, Henning Fehrmann, Bruce Allen
In-Reply-To: <47A204E5.7060406@aei.mpg.de>

Carsten Aulbert wrote:
> PS: Am I right that the TCP_RR tests should only be run on a single
> node at a time, not on both ends simultaneously?

yes, they are a request/response test, and so perform the bidirectional
test with a single node starting the test.

^ permalink raw reply

* Re: [PATCH 0/6] preparations to enable netdevice notifiers inside a namespace (resend)
From: Daniel Lezcano @ 2008-01-31 17:46 UTC (permalink / raw)
  To: Benjamin Thery; +Cc: Denis V. Lunev, Linux Containers, netdev
In-Reply-To: <939d53060801310847y31c4542do271edd48e44745c7@mail.gmail.com>

Benjamin Thery wrote:
> On Jan 31, 2008 3:58 PM, Daniel Lezcano <daniel.lezcano@free.fr> wrote:
> 
>> Denis V. Lunev wrote:
>>> Here are some preparations and cleanups to enable network device/inet
>>> address notifiers inside a namespace.
>>>
>>> This set of patches has been originally sent last Friday. One cleanup
>>> patch from the original series is dropped as wrong, thanks to Daniel
>>> Lezcano.
>> Can you explain please.
> 
> 
> I think Denis refers to the patch called "3/7 Prohibit assignment of
> 0.0.0.0 as interface address",
> which he dropped because it was inappropriate, no?

Yes, you are right, Denis explained it to me in a private email. I think I 
really need to sleep a little more :)

^ permalink raw reply

* Re: e1000 full-duplex TCP performance well below wire speed
From: Rick Jones @ 2008-01-31 17:55 UTC (permalink / raw)
  To: Carsten Aulbert
  Cc: Brandeburg, Jesse, Bruce Allen, netdev, Henning Fehrmann,
	Bruce Allen
In-Reply-To: <47A1B294.8080609@aei.mpg.de>

> netperf was used without any special tuning parameters. Usually we start 
> two processes on two hosts which start (almost) simultaneously, last for 
> 20-60 seconds and simply use UDP_STREAM (works well) and TCP_STREAM, i.e.
> 
> on 192.168.0.202: netperf -H 192.168.2.203 -t TCP_STREAM -l 20
> on 192.168.0.203: netperf -H 192.168.2.202 -t TCP_STREAM -l 20
> 
> 192.168.0.20[23] here is on eth0 which cannot do jumbo frames, thus we 
> use the .2. part for eth1 for a range of mtus.
> 
> The server is started on both nodes with the start-stop-daemon and no 
> special parameters I'm aware of.


So long as you are relying on external (netperf relative) means to 
report the throughput, those command lines would be fine.  I wouldn't be 
comfortable relying on the sum of the netperf-reported throughputs with 
those command lines though.  Netperf2 has no test synchronization, so two 
separate commands, particularly those initiated on different systems, 
are subject to skew errors.  99 times out of ten they might be epsilon, 
but I get a _little_ paranoid there.

There are three alternatives:

1) use netperf4.  not as convenient for "quick" testing at present, but 
it has explicit test synchronization, so  you "know" that the numbers 
presented are from when all connections were actively transferring data

2) use the aforementioned "burst" TCP_RR test.  This is then a single 
netperf with data flowing both ways on a single connection so no issue 
of skew, but perhaps an issue of being one connection and so one process 
on each end.

3) start both tests from the same system and follow the suggestions 
contained in :

<http://www.netperf.org/svn/netperf2/tags/netperf-2.4.4/doc/netperf.html>

particularly:

<http://www.netperf.org/svn/netperf2/tags/netperf-2.4.4/doc/netperf.html#Using-Netperf-to-Measure-Aggregate-Performance>

and use a combination of TCP_STREAM and TCP_MAERTS (STREAM backwards) tests.

happy benchmarking,

rick jones

^ permalink raw reply

* [NET_SCHED 00/04]: External SFQ classifiers/flow classifier
From: Patrick McHardy @ 2008-01-31 17:58 UTC (permalink / raw)
  To: davem; +Cc: netdev, shemminger, Patrick McHardy

These patches add support for external classifiers to SFQ and add a
new "flow" classifier, which can do hashing based on user-specified
keys or deterministic mapping of keys to classes. Additionally there
is a patch to make the SFQ queues visible as classes to verify that
the hash is indeed doing something useful and a patch to constify
struct tcf_ext_map, which I had queued in the same tree.

Please apply, thanks.


 include/linux/pkt_cls.h   |   50 ++++
 include/linux/pkt_sched.h |    5 +
 include/net/pkt_cls.h     |    6 +-
 net/sched/Kconfig         |   11 +
 net/sched/Makefile        |    1 +
 net/sched/cls_api.c       |    6 +-
 net/sched/cls_basic.c     |    2 +-
 net/sched/cls_flow.c      |  660 +++++++++++++++++++++++++++++++++++++++++++++
 net/sched/cls_fw.c        |    2 +-
 net/sched/cls_route.c     |    2 +-
 net/sched/cls_tcindex.c   |    2 +-
 net/sched/cls_u32.c       |    2 +-
 net/sched/sch_sfq.c       |  134 +++++++++-
 13 files changed, 868 insertions(+), 15 deletions(-)
 create mode 100644 net/sched/cls_flow.c

Patrick McHardy (4):
      [NET_SCHED]: Constify struct tcf_ext_map
      [NET_SCHED]: sch_sfq: add support for external classifiers
      [NET_SCHED]: sch_sfq: make internal queues visible as classes
      [NET_SCHED]: Add flow classifier

^ permalink raw reply

* [NET_SCHED 01/04]: Constify struct tcf_ext_map
From: Patrick McHardy @ 2008-01-31 17:58 UTC (permalink / raw)
  To: davem; +Cc: netdev, shemminger, Patrick McHardy
In-Reply-To: <20080131175758.25151.20370.sendpatchset@localhost.localdomain>

[NET_SCHED]: Constify struct tcf_ext_map

Signed-off-by: Patrick McHardy <kaber@trash.net>

---
commit 12e33ddf57910b685501df10bd92223ea9b98fd6
tree 1ce47c7b6b6b968940f3dc28f9d7839e78c85089
parent 8af03e782cae1e0a0f530ddd22301cdd12cf9dc0
author Patrick McHardy <kaber@trash.net> Wed, 30 Jan 2008 21:59:26 +0100
committer Patrick McHardy <kaber@trash.net> Thu, 31 Jan 2008 18:52:55 +0100

 include/net/pkt_cls.h   |    6 +++---
 net/sched/cls_api.c     |    6 +++---
 net/sched/cls_basic.c   |    2 +-
 net/sched/cls_fw.c      |    2 +-
 net/sched/cls_route.c   |    2 +-
 net/sched/cls_tcindex.c |    2 +-
 net/sched/cls_u32.c     |    2 +-
 7 files changed, 11 insertions(+), 11 deletions(-)

diff --git a/include/net/pkt_cls.h b/include/net/pkt_cls.h
index 8716eb7..d349c66 100644
--- a/include/net/pkt_cls.h
+++ b/include/net/pkt_cls.h
@@ -131,14 +131,14 @@ tcf_exts_exec(struct sk_buff *skb, struct tcf_exts *exts,
 
 extern int tcf_exts_validate(struct tcf_proto *tp, struct nlattr **tb,
 	                     struct nlattr *rate_tlv, struct tcf_exts *exts,
-	                     struct tcf_ext_map *map);
+	                     const struct tcf_ext_map *map);
 extern void tcf_exts_destroy(struct tcf_proto *tp, struct tcf_exts *exts);
 extern void tcf_exts_change(struct tcf_proto *tp, struct tcf_exts *dst,
 	                     struct tcf_exts *src);
 extern int tcf_exts_dump(struct sk_buff *skb, struct tcf_exts *exts,
-	                 struct tcf_ext_map *map);
+	                 const struct tcf_ext_map *map);
 extern int tcf_exts_dump_stats(struct sk_buff *skb, struct tcf_exts *exts,
-	                       struct tcf_ext_map *map);
+	                       const struct tcf_ext_map *map);
 
 /**
  * struct tcf_pkt_info - packet information
diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c
index 3377ca0..0fbedca 100644
--- a/net/sched/cls_api.c
+++ b/net/sched/cls_api.c
@@ -482,7 +482,7 @@ EXPORT_SYMBOL(tcf_exts_destroy);
 
 int tcf_exts_validate(struct tcf_proto *tp, struct nlattr **tb,
 		  struct nlattr *rate_tlv, struct tcf_exts *exts,
-		  struct tcf_ext_map *map)
+		  const struct tcf_ext_map *map)
 {
 	memset(exts, 0, sizeof(*exts));
 
@@ -535,7 +535,7 @@ void tcf_exts_change(struct tcf_proto *tp, struct tcf_exts *dst,
 EXPORT_SYMBOL(tcf_exts_change);
 
 int tcf_exts_dump(struct sk_buff *skb, struct tcf_exts *exts,
-	      struct tcf_ext_map *map)
+		  const struct tcf_ext_map *map)
 {
 #ifdef CONFIG_NET_CLS_ACT
 	if (map->action && exts->action) {
@@ -571,7 +571,7 @@ EXPORT_SYMBOL(tcf_exts_dump);
 
 
 int tcf_exts_dump_stats(struct sk_buff *skb, struct tcf_exts *exts,
-			struct tcf_ext_map *map)
+			const struct tcf_ext_map *map)
 {
 #ifdef CONFIG_NET_CLS_ACT
 	if (exts->action)
diff --git a/net/sched/cls_basic.c b/net/sched/cls_basic.c
index bfb4342..956915c 100644
--- a/net/sched/cls_basic.c
+++ b/net/sched/cls_basic.c
@@ -35,7 +35,7 @@ struct basic_filter
 	struct list_head	link;
 };
 
-static struct tcf_ext_map basic_ext_map = {
+static const struct tcf_ext_map basic_ext_map = {
 	.action = TCA_BASIC_ACT,
 	.police = TCA_BASIC_POLICE
 };
diff --git a/net/sched/cls_fw.c b/net/sched/cls_fw.c
index 436a6e7..b0f90e5 100644
--- a/net/sched/cls_fw.c
+++ b/net/sched/cls_fw.c
@@ -47,7 +47,7 @@ struct fw_filter
 	struct tcf_exts		exts;
 };
 
-static struct tcf_ext_map fw_ext_map = {
+static const struct tcf_ext_map fw_ext_map = {
 	.action = TCA_FW_ACT,
 	.police = TCA_FW_POLICE
 };
diff --git a/net/sched/cls_route.c b/net/sched/cls_route.c
index f7e7d39..784dcb8 100644
--- a/net/sched/cls_route.c
+++ b/net/sched/cls_route.c
@@ -62,7 +62,7 @@ struct route4_filter
 
 #define ROUTE4_FAILURE ((struct route4_filter*)(-1L))
 
-static struct tcf_ext_map route_ext_map = {
+static const struct tcf_ext_map route_ext_map = {
 	.police = TCA_ROUTE4_POLICE,
 	.action = TCA_ROUTE4_ACT
 };
diff --git a/net/sched/cls_tcindex.c b/net/sched/cls_tcindex.c
index ee60b2d..7a7bff5 100644
--- a/net/sched/cls_tcindex.c
+++ b/net/sched/cls_tcindex.c
@@ -55,7 +55,7 @@ struct tcindex_data {
 	int fall_through;	/* 0: only classify if explicit match */
 };
 
-static struct tcf_ext_map tcindex_ext_map = {
+static const struct tcf_ext_map tcindex_ext_map = {
 	.police = TCA_TCINDEX_POLICE,
 	.action = TCA_TCINDEX_ACT
 };
diff --git a/net/sched/cls_u32.c b/net/sched/cls_u32.c
index e8a7756..b18fa95 100644
--- a/net/sched/cls_u32.c
+++ b/net/sched/cls_u32.c
@@ -82,7 +82,7 @@ struct tc_u_common
 	u32			hgenerator;
 };
 
-static struct tcf_ext_map u32_ext_map = {
+static const struct tcf_ext_map u32_ext_map = {
 	.action = TCA_U32_ACT,
 	.police = TCA_U32_POLICE
 };

^ permalink raw reply related

* [NET_SCHED 02/04]: sch_sfq: add support for external classifiers
From: Patrick McHardy @ 2008-01-31 17:58 UTC (permalink / raw)
  To: davem; +Cc: netdev, shemminger, Patrick McHardy
In-Reply-To: <20080131175758.25151.20370.sendpatchset@localhost.localdomain>

[NET_SCHED]: sch_sfq: add support for external classifiers

Add support for external classifiers to allow using different flow hash
functions similar to ESFQ. When no classifier is attached the built-in
hash is used as before.

Signed-off-by: Patrick McHardy <kaber@trash.net>

---
commit 6049892cc4acca9af393e134e4cdaf6b3e1ccad9
tree 9a8347d45808de2aef14486e5792fcab58baf3fe
parent 12e33ddf57910b685501df10bd92223ea9b98fd6
author Patrick McHardy <kaber@trash.net> Wed, 30 Jan 2008 21:59:27 +0100
committer Patrick McHardy <kaber@trash.net> Thu, 31 Jan 2008 18:52:55 +0100

 net/sched/sch_sfq.c |   95 +++++++++++++++++++++++++++++++++++++++++++++++++--
 1 files changed, 91 insertions(+), 4 deletions(-)

diff --git a/net/sched/sch_sfq.c b/net/sched/sch_sfq.c
index 91af539..d818d19 100644
--- a/net/sched/sch_sfq.c
+++ b/net/sched/sch_sfq.c
@@ -95,6 +95,7 @@ struct sfq_sched_data
 	int		limit;
 
 /* Variables */
+	struct tcf_proto *filter_list;
 	struct timer_list perturb_timer;
 	u32		perturbation;
 	sfq_index	tail;		/* Index of current slot in round */
@@ -155,6 +156,39 @@ static unsigned sfq_hash(struct sfq_sched_data *q, struct sk_buff *skb)
 	return sfq_fold_hash(q, h, h2);
 }
 
+static unsigned int sfq_classify(struct sk_buff *skb, struct Qdisc *sch,
+				 int *qerr)
+{
+	struct sfq_sched_data *q = qdisc_priv(sch);
+	struct tcf_result res;
+	int result;
+
+	if (TC_H_MAJ(skb->priority) == sch->handle &&
+	    TC_H_MIN(skb->priority) > 0 &&
+	    TC_H_MIN(skb->priority) <= SFQ_HASH_DIVISOR)
+		return TC_H_MIN(skb->priority);
+
+	if (!q->filter_list)
+		return sfq_hash(q, skb) + 1;
+
+	*qerr = NET_XMIT_BYPASS;
+	result = tc_classify(skb, q->filter_list, &res);
+	if (result >= 0) {
+#ifdef CONFIG_NET_CLS_ACT
+		switch (result) {
+		case TC_ACT_STOLEN:
+		case TC_ACT_QUEUED:
+			*qerr = NET_XMIT_SUCCESS;
+		case TC_ACT_SHOT:
+			return 0;
+		}
+#endif
+		if (TC_H_MIN(res.classid) <= SFQ_HASH_DIVISOR)
+			return TC_H_MIN(res.classid);
+	}
+	return 0;
+}
+
 static inline void sfq_link(struct sfq_sched_data *q, sfq_index x)
 {
 	sfq_index p, n;
@@ -245,8 +279,18 @@ static int
 sfq_enqueue(struct sk_buff *skb, struct Qdisc *sch)
 {
 	struct sfq_sched_data *q = qdisc_priv(sch);
-	unsigned hash = sfq_hash(q, skb);
+	unsigned int hash;
 	sfq_index x;
+	int ret;
+
+	hash = sfq_classify(skb, sch, &ret);
+	if (hash == 0) {
+		if (ret == NET_XMIT_BYPASS)
+			sch->qstats.drops++;
+		kfree_skb(skb);
+		return ret;
+	}
+	hash--;
 
 	x = q->ht[hash];
 	if (x == SFQ_DEPTH) {
@@ -289,8 +333,18 @@ static int
 sfq_requeue(struct sk_buff *skb, struct Qdisc *sch)
 {
 	struct sfq_sched_data *q = qdisc_priv(sch);
-	unsigned hash = sfq_hash(q, skb);
+	unsigned int hash;
 	sfq_index x;
+	int ret;
+
+	hash = sfq_classify(skb, sch, &ret);
+	if (hash == 0) {
+		if (ret == NET_XMIT_BYPASS)
+			sch->qstats.drops++;
+		kfree_skb(skb);
+		return ret;
+	}
+	hash--;
 
 	x = q->ht[hash];
 	if (x == SFQ_DEPTH) {
@@ -465,6 +519,8 @@ static int sfq_init(struct Qdisc *sch, struct nlattr *opt)
 static void sfq_destroy(struct Qdisc *sch)
 {
 	struct sfq_sched_data *q = qdisc_priv(sch);
+
+	tcf_destroy_chain(q->filter_list);
 	del_timer(&q->perturb_timer);
 }
 
@@ -490,9 +546,40 @@ nla_put_failure:
 	return -1;
 }
 
+static int sfq_change_class(struct Qdisc *sch, u32 classid, u32 parentid,
+			    struct nlattr **tca, unsigned long *arg)
+{
+	return -EOPNOTSUPP;
+}
+
+static unsigned long sfq_get(struct Qdisc *sch, u32 classid)
+{
+	return 0;
+}
+
+static struct tcf_proto **sfq_find_tcf(struct Qdisc *sch, unsigned long cl)
+{
+	struct sfq_sched_data *q = qdisc_priv(sch);
+
+	if (cl)
+		return NULL;
+	return &q->filter_list;
+}
+
+static void sfq_walk(struct Qdisc *sch, struct qdisc_walker *arg)
+{
+	return;
+}
+
+static const struct Qdisc_class_ops sfq_class_ops = {
+	.get		=	sfq_get,
+	.change		=	sfq_change_class,
+	.tcf_chain	=	sfq_find_tcf,
+	.walk		=	sfq_walk,
+};
+
 static struct Qdisc_ops sfq_qdisc_ops __read_mostly = {
-	.next		=	NULL,
-	.cl_ops		=	NULL,
+	.cl_ops		=	&sfq_class_ops,
 	.id		=	"sfq",
 	.priv_size	=	sizeof(struct sfq_sched_data),
 	.enqueue	=	sfq_enqueue,

^ permalink raw reply related

* [NET_SCHED 03/04]: sch_sfq: make internal queues visible as classes
From: Patrick McHardy @ 2008-01-31 17:58 UTC (permalink / raw)
  To: davem; +Cc: netdev, shemminger, Patrick McHardy
In-Reply-To: <20080131175758.25151.20370.sendpatchset@localhost.localdomain>

[NET_SCHED]: sch_sfq: make internal queues visible as classes

Add support for dumping statistics and make internal queues visible
as classes.

Signed-off-by: Patrick McHardy <kaber@trash.net>

---
commit 7a281f8ef334a35d699682315e9f80a3e006376c
tree 0a2cbd55e22f1913e9cf0cc28da2956952110243
parent 6049892cc4acca9af393e134e4cdaf6b3e1ccad9
author Patrick McHardy <kaber@trash.net> Wed, 30 Jan 2008 21:59:29 +0100
committer Patrick McHardy <kaber@trash.net> Thu, 31 Jan 2008 18:52:56 +0100

 include/linux/pkt_sched.h |    5 +++++
 net/sched/sch_sfq.c       |   41 ++++++++++++++++++++++++++++++++++++++++-
 2 files changed, 45 insertions(+), 1 deletions(-)

diff --git a/include/linux/pkt_sched.h b/include/linux/pkt_sched.h
index 3276135..dbb7ac3 100644
--- a/include/linux/pkt_sched.h
+++ b/include/linux/pkt_sched.h
@@ -150,6 +150,11 @@ struct tc_sfq_qopt
 	unsigned	flows;		/* Maximal number of flows  */
 };
 
+struct tc_sfq_xstats
+{
+	__s32		allot;
+};
+
 /*
  *  NOTE: limit, divisor and flows are hardwired to code at the moment.
  *
diff --git a/net/sched/sch_sfq.c b/net/sched/sch_sfq.c
index d818d19..a20e2ef 100644
--- a/net/sched/sch_sfq.c
+++ b/net/sched/sch_sfq.c
@@ -566,15 +566,54 @@ static struct tcf_proto **sfq_find_tcf(struct Qdisc *sch, unsigned long cl)
 	return &q->filter_list;
 }
 
+static int sfq_dump_class(struct Qdisc *sch, unsigned long cl,
+			  struct sk_buff *skb, struct tcmsg *tcm)
+{
+	tcm->tcm_handle |= TC_H_MIN(cl);
+	return 0;
+}
+
+static int sfq_dump_class_stats(struct Qdisc *sch, unsigned long cl,
+				struct gnet_dump *d)
+{
+	struct sfq_sched_data *q = qdisc_priv(sch);
+	sfq_index idx = q->ht[cl-1];
+	struct gnet_stats_queue qs = { .qlen = q->qs[idx].qlen };
+	struct tc_sfq_xstats xstats = { .allot = q->allot[idx] };
+
+	if (gnet_stats_copy_queue(d, &qs) < 0)
+		return -1;
+	return gnet_stats_copy_app(d, &xstats, sizeof(xstats));
+}
+
 static void sfq_walk(struct Qdisc *sch, struct qdisc_walker *arg)
 {
-	return;
+	struct sfq_sched_data *q = qdisc_priv(sch);
+	unsigned int i;
+
+	if (arg->stop)
+		return;
+
+	for (i = 0; i < SFQ_HASH_DIVISOR; i++) {
+		if (q->ht[i] == SFQ_DEPTH ||
+		    arg->count < arg->skip) {
+			arg->count++;
+			continue;
+		}
+		if (arg->fn(sch, i + 1, arg) < 0) {
+			arg->stop = 1;
+			break;
+		}
+		arg->count++;
+	}
 }
 
 static const struct Qdisc_class_ops sfq_class_ops = {
 	.get		=	sfq_get,
 	.change		=	sfq_change_class,
 	.tcf_chain	=	sfq_find_tcf,
+	.dump		=	sfq_dump_class,
+	.dump_stats	=	sfq_dump_class_stats,
 	.walk		=	sfq_walk,
 };
 

^ permalink raw reply related

* [NET_SCHED 04/04]: Add flow classifier
From: Patrick McHardy @ 2008-01-31 17:58 UTC (permalink / raw)
  To: davem; +Cc: netdev, shemminger, Patrick McHardy
In-Reply-To: <20080131175758.25151.20370.sendpatchset@localhost.localdomain>

[NET_SCHED]: Add flow classifier

Add new "flow" classifier, which is meant to extend the SFQ hashing
capabilities without hard-coding new hash functions and also allows
deterministic mappings of keys to classes, replacing some out of tree
iptables patches like IPCLASSIFY (maps IPs to classes), IPMARK (maps
IPs to marks, with fw filters to classes), ...

Some examples:

- Classic SFQ hash:

  tc filter add ... flow hash \
  	keys src,dst,proto,proto-src,proto-dst divisor 1024

- Classic SFQ hash, but using information from conntrack to work properly in
  combination with NAT:

  tc filter add ... flow hash \
  	keys nfct-src,nfct-dst,proto,nfct-proto-src,nfct-proto-dst divisor 1024

- Map destination IPs of 192.168.0.0/24 to classids 1-256:

  tc filter add ... flow map \
  	key dst addend -192.168.0.0 divisor 256

- alternatively:

  tc filter add ... flow map \
  	key dst and 0xff

- similar, but reverse ordered:

  tc filter add ... flow map \
  	key dst and 0xff xor 0xff

Perturbation is currently not supported because we can't reliably kill the
timer on destruction.

Signed-off-by: Patrick McHardy <kaber@trash.net>

---
commit 91a3a09ce63cba8df30ac42133a40dd64c0a7259
tree 2572feb8ffd88e6abf9270d2137af2a4cf7f542a
parent 7a281f8ef334a35d699682315e9f80a3e006376c
author Patrick McHardy <kaber@trash.net> Wed, 30 Jan 2008 21:59:31 +0100
committer Patrick McHardy <kaber@trash.net> Thu, 31 Jan 2008 18:52:56 +0100

 include/linux/pkt_cls.h |   50 ++++
 net/sched/Kconfig       |   11 +
 net/sched/Makefile      |    1 
 net/sched/cls_flow.c    |  660 +++++++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 722 insertions(+), 0 deletions(-)

diff --git a/include/linux/pkt_cls.h b/include/linux/pkt_cls.h
index 30b8571..1c1dba9 100644
--- a/include/linux/pkt_cls.h
+++ b/include/linux/pkt_cls.h
@@ -328,6 +328,56 @@ enum
 
 #define TCA_TCINDEX_MAX     (__TCA_TCINDEX_MAX - 1)
 
+/* Flow filter */
+
+enum
+{
+	FLOW_KEY_SRC,
+	FLOW_KEY_DST,
+	FLOW_KEY_PROTO,
+	FLOW_KEY_PROTO_SRC,
+	FLOW_KEY_PROTO_DST,
+	FLOW_KEY_IIF,
+	FLOW_KEY_PRIORITY,
+	FLOW_KEY_MARK,
+	FLOW_KEY_NFCT,
+	FLOW_KEY_NFCT_SRC,
+	FLOW_KEY_NFCT_DST,
+	FLOW_KEY_NFCT_PROTO_SRC,
+	FLOW_KEY_NFCT_PROTO_DST,
+	FLOW_KEY_RTCLASSID,
+	FLOW_KEY_SKUID,
+	FLOW_KEY_SKGID,
+	__FLOW_KEY_MAX,
+};
+
+#define FLOW_KEY_MAX	(__FLOW_KEY_MAX - 1)
+
+enum
+{
+	FLOW_MODE_MAP,
+	FLOW_MODE_HASH,
+};
+
+enum
+{
+	TCA_FLOW_UNSPEC,
+	TCA_FLOW_KEYS,
+	TCA_FLOW_MODE,
+	TCA_FLOW_BASECLASS,
+	TCA_FLOW_RSHIFT,
+	TCA_FLOW_ADDEND,
+	TCA_FLOW_MASK,
+	TCA_FLOW_XOR,
+	TCA_FLOW_DIVISOR,
+	TCA_FLOW_ACT,
+	TCA_FLOW_POLICE,
+	TCA_FLOW_EMATCHES,
+	__TCA_FLOW_MAX
+};
+
+#define TCA_FLOW_MAX	(__TCA_FLOW_MAX - 1)
+
 /* Basic filter */
 
 enum
diff --git a/net/sched/Kconfig b/net/sched/Kconfig
index 87af7c9..bccf42b 100644
--- a/net/sched/Kconfig
+++ b/net/sched/Kconfig
@@ -307,6 +307,17 @@ config NET_CLS_RSVP6
 	  To compile this code as a module, choose M here: the
 	  module will be called cls_rsvp6.
 
+config NET_CLS_FLOW
+	tristate "Flow classifier"
+	select NET_CLS
+	---help---
+	  If you say Y here, you will be able to classify packets based on
+	  a configurable combination of packet keys. This is mostly useful
+	  in combination with SFQ.
+
+	  To compile this code as a module, choose M here: the
+	  module will be called cls_flow.
+
 config NET_EMATCH
 	bool "Extended Matches"
 	select NET_CLS
diff --git a/net/sched/Makefile b/net/sched/Makefile
index 81ecbe8..1d2b0f7 100644
--- a/net/sched/Makefile
+++ b/net/sched/Makefile
@@ -35,6 +35,7 @@ obj-$(CONFIG_NET_CLS_RSVP)	+= cls_rsvp.o
 obj-$(CONFIG_NET_CLS_TCINDEX)	+= cls_tcindex.o
 obj-$(CONFIG_NET_CLS_RSVP6)	+= cls_rsvp6.o
 obj-$(CONFIG_NET_CLS_BASIC)	+= cls_basic.o
+obj-$(CONFIG_NET_CLS_FLOW)	+= cls_flow.o
 obj-$(CONFIG_NET_EMATCH)	+= ematch.o
 obj-$(CONFIG_NET_EMATCH_CMP)	+= em_cmp.o
 obj-$(CONFIG_NET_EMATCH_NBYTE)	+= em_nbyte.o
diff --git a/net/sched/cls_flow.c b/net/sched/cls_flow.c
new file mode 100644
index 0000000..5a7f6a3
--- /dev/null
+++ b/net/sched/cls_flow.c
@@ -0,0 +1,660 @@
+/*
+ * net/sched/cls_flow.c		Generic flow classifier
+ *
+ * Copyright (c) 2007, 2008 Patrick McHardy <kaber@trash.net>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ */
+
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/list.h>
+#include <linux/jhash.h>
+#include <linux/random.h>
+#include <linux/pkt_cls.h>
+#include <linux/skbuff.h>
+#include <linux/in.h>
+#include <linux/ip.h>
+#include <linux/ipv6.h>
+
+#include <net/pkt_cls.h>
+#include <net/ip.h>
+#include <net/route.h>
+#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE)
+#include <net/netfilter/nf_conntrack.h>
+#endif
+
+struct flow_head {
+	struct list_head	filters;
+};
+
+struct flow_filter {
+	struct list_head	list;
+	struct tcf_exts		exts;
+	struct tcf_ematch_tree	ematches;
+	u32			handle;
+
+	u32			nkeys;
+	u32			keymask;
+	u32			mode;
+	u32			mask;
+	u32			xor;
+	u32			rshift;
+	u32			addend;
+	u32			divisor;
+	u32			baseclass;
+};
+
+static u32 flow_hashrnd __read_mostly;
+static int flow_hashrnd_initted __read_mostly;
+
+static const struct tcf_ext_map flow_ext_map = {
+	.action	= TCA_FLOW_ACT,
+	.police	= TCA_FLOW_POLICE,
+};
+
+static inline u32 addr_fold(void *addr)
+{
+	unsigned long a = (unsigned long)addr;
+
+	return (a & 0xFFFFFFFF) ^ (BITS_PER_LONG > 32 ? a >> 32 : 0);
+}
+
+static u32 flow_get_src(const struct sk_buff *skb)
+{
+	switch (skb->protocol) {
+	case __constant_htons(ETH_P_IP):
+		return ntohl(ip_hdr(skb)->saddr);
+	case __constant_htons(ETH_P_IPV6):
+		return ntohl(ipv6_hdr(skb)->saddr.s6_addr32[3]);
+	default:
+		return addr_fold(skb->sk);
+	}
+}
+
+static u32 flow_get_dst(const struct sk_buff *skb)
+{
+	switch (skb->protocol) {
+	case __constant_htons(ETH_P_IP):
+		return ntohl(ip_hdr(skb)->daddr);
+	case __constant_htons(ETH_P_IPV6):
+		return ntohl(ipv6_hdr(skb)->daddr.s6_addr32[3]);
+	default:
+		return addr_fold(skb->dst) ^ (__force u16)skb->protocol;
+	}
+}
+
+static u32 flow_get_proto(const struct sk_buff *skb)
+{
+	switch (skb->protocol) {
+	case __constant_htons(ETH_P_IP):
+		return ip_hdr(skb)->protocol;
+	case __constant_htons(ETH_P_IPV6):
+		return ipv6_hdr(skb)->nexthdr;
+	default:
+		return 0;
+	}
+}
+
+static int has_ports(u8 protocol)
+{
+	switch (protocol) {
+	case IPPROTO_TCP:
+	case IPPROTO_UDP:
+	case IPPROTO_UDPLITE:
+	case IPPROTO_SCTP:
+	case IPPROTO_DCCP:
+	case IPPROTO_ESP:
+		return 1;
+	default:
+		return 0;
+	}
+}
+
+static u32 flow_get_proto_src(const struct sk_buff *skb)
+{
+	u32 res = 0;
+
+	switch (skb->protocol) {
+	case __constant_htons(ETH_P_IP): {
+		struct iphdr *iph = ip_hdr(skb);
+
+		if (!(iph->frag_off&htons(IP_MF|IP_OFFSET)) &&
+		    has_ports(iph->protocol))
+			res = ntohs(*(__be16 *)((void *)iph + iph->ihl * 4));
+		break;
+	}
+	case __constant_htons(ETH_P_IPV6): {
+		struct ipv6hdr *iph = ipv6_hdr(skb);
+
+		if (has_ports(iph->nexthdr))
+			res = ntohs(*(__be16 *)&iph[1]);
+		break;
+	}
+	default:
+		res = addr_fold(skb->sk);
+	}
+
+	return res;
+}
+
+static u32 flow_get_proto_dst(const struct sk_buff *skb)
+{
+	u32 res = 0;
+
+	switch (skb->protocol) {
+	case __constant_htons(ETH_P_IP): {
+		struct iphdr *iph = ip_hdr(skb);
+
+		if (!(iph->frag_off&htons(IP_MF|IP_OFFSET)) &&
+		    has_ports(iph->protocol))
+			res = ntohs(*(__be16 *)((void *)iph + iph->ihl * 4 + 2));
+		break;
+	}
+	case __constant_htons(ETH_P_IPV6): {
+		struct ipv6hdr *iph = ipv6_hdr(skb);
+
+		if (has_ports(iph->nexthdr))
+			res = ntohs(*(__be16 *)((void *)&iph[1] + 2));
+		break;
+	}
+	default:
+		res = addr_fold(skb->dst) ^ (__force u16)skb->protocol;
+	}
+
+	return res;
+}
+
+static u32 flow_get_iif(const struct sk_buff *skb)
+{
+	return skb->iif;
+}
+
+static u32 flow_get_priority(const struct sk_buff *skb)
+{
+	return skb->priority;
+}
+
+static u32 flow_get_mark(const struct sk_buff *skb)
+{
+	return skb->mark;
+}
+
+static u32 flow_get_nfct(const struct sk_buff *skb)
+{
+#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE)
+	return addr_fold(skb->nfct);
+#else
+	return 0;
+#endif
+}
+
+#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE)
+#define CTTUPLE(skb, member)						\
+({									\
+	enum ip_conntrack_info ctinfo;					\
+	struct nf_conn *ct = nf_ct_get(skb, &ctinfo);			\
+	if (ct == NULL)							\
+		goto fallback;						\
+	ct->tuplehash[CTINFO2DIR(ctinfo)].tuple.member;			\
+})
+#else
+#define CTTUPLE(skb, member)						\
+({									\
+	goto fallback;							\
+	0;								\
+})
+#endif
+
+static u32 flow_get_nfct_src(const struct sk_buff *skb)
+{
+	switch (skb->protocol) {
+	case __constant_htons(ETH_P_IP):
+		return ntohl(CTTUPLE(skb, src.u3.ip));
+	case __constant_htons(ETH_P_IPV6):
+		return ntohl(CTTUPLE(skb, src.u3.ip6[3]));
+	}
+fallback:
+	return flow_get_src(skb);
+}
+
+static u32 flow_get_nfct_dst(const struct sk_buff *skb)
+{
+	switch (skb->protocol) {
+	case __constant_htons(ETH_P_IP):
+		return ntohl(CTTUPLE(skb, dst.u3.ip));
+	case __constant_htons(ETH_P_IPV6):
+		return ntohl(CTTUPLE(skb, dst.u3.ip6[3]));
+	}
+fallback:
+	return flow_get_dst(skb);
+}
+
+static u32 flow_get_nfct_proto_src(const struct sk_buff *skb)
+{
+	return ntohs(CTTUPLE(skb, src.u.all));
+fallback:
+	return flow_get_proto_src(skb);
+}
+
+static u32 flow_get_nfct_proto_dst(const struct sk_buff *skb)
+{
+	return ntohs(CTTUPLE(skb, dst.u.all));
+fallback:
+	return flow_get_proto_dst(skb);
+}
+
+static u32 flow_get_rtclassid(const struct sk_buff *skb)
+{
+#ifdef CONFIG_NET_CLS_ROUTE
+	if (skb->dst)
+		return skb->dst->tclassid;
+#endif
+	return 0;
+}
+
+static u32 flow_get_skuid(const struct sk_buff *skb)
+{
+	if (skb->sk && skb->sk->sk_socket && skb->sk->sk_socket->file)
+		return skb->sk->sk_socket->file->f_uid;
+	return 0;
+}
+
+static u32 flow_get_skgid(const struct sk_buff *skb)
+{
+	if (skb->sk && skb->sk->sk_socket && skb->sk->sk_socket->file)
+		return skb->sk->sk_socket->file->f_gid;
+	return 0;
+}
+
+static u32 flow_key_get(const struct sk_buff *skb, int key)
+{
+	switch (key) {
+	case FLOW_KEY_SRC:
+		return flow_get_src(skb);
+	case FLOW_KEY_DST:
+		return flow_get_dst(skb);
+	case FLOW_KEY_PROTO:
+		return flow_get_proto(skb);
+	case FLOW_KEY_PROTO_SRC:
+		return flow_get_proto_src(skb);
+	case FLOW_KEY_PROTO_DST:
+		return flow_get_proto_dst(skb);
+	case FLOW_KEY_IIF:
+		return flow_get_iif(skb);
+	case FLOW_KEY_PRIORITY:
+		return flow_get_priority(skb);
+	case FLOW_KEY_MARK:
+		return flow_get_mark(skb);
+	case FLOW_KEY_NFCT:
+		return flow_get_nfct(skb);
+	case FLOW_KEY_NFCT_SRC:
+		return flow_get_nfct_src(skb);
+	case FLOW_KEY_NFCT_DST:
+		return flow_get_nfct_dst(skb);
+	case FLOW_KEY_NFCT_PROTO_SRC:
+		return flow_get_nfct_proto_src(skb);
+	case FLOW_KEY_NFCT_PROTO_DST:
+		return flow_get_nfct_proto_dst(skb);
+	case FLOW_KEY_RTCLASSID:
+		return flow_get_rtclassid(skb);
+	case FLOW_KEY_SKUID:
+		return flow_get_skuid(skb);
+	case FLOW_KEY_SKGID:
+		return flow_get_skgid(skb);
+	default:
+		WARN_ON(1);
+		return 0;
+	}
+}
+
+static int flow_classify(struct sk_buff *skb, struct tcf_proto *tp,
+			 struct tcf_result *res)
+{
+	struct flow_head *head = tp->root;
+	struct flow_filter *f;
+	u32 keymask;
+	u32 classid;
+	unsigned int n, key;
+	int r;
+
+	list_for_each_entry(f, &head->filters, list) {
+		u32 keys[f->nkeys];
+
+		if (!tcf_em_tree_match(skb, &f->ematches, NULL))
+			continue;
+
+		keymask = f->keymask;
+
+		for (n = 0; n < f->nkeys; n++) {
+			key = ffs(keymask) - 1;
+			keymask &= ~(1 << key);
+			keys[n] = flow_key_get(skb, key);
+		}
+
+		if (f->mode == FLOW_MODE_HASH)
+			classid = jhash2(keys, f->nkeys, flow_hashrnd);
+		else {
+			classid = keys[0];
+			classid = (classid & f->mask) ^ f->xor;
+			classid = (classid >> f->rshift) + f->addend;
+		}
+
+		if (f->divisor)
+			classid %= f->divisor;
+
+		res->class   = 0;
+		res->classid = TC_H_MAKE(f->baseclass, f->baseclass + classid);
+
+		r = tcf_exts_exec(skb, &f->exts, res);
+		if (r < 0)
+			continue;
+		return r;
+	}
+	return -1;
+}
+
+static const struct nla_policy flow_policy[TCA_FLOW_MAX + 1] = {
+	[TCA_FLOW_KEYS]		= { .type = NLA_U32 },
+	[TCA_FLOW_MODE]		= { .type = NLA_U32 },
+	[TCA_FLOW_BASECLASS]	= { .type = NLA_U32 },
+	[TCA_FLOW_RSHIFT]	= { .type = NLA_U32 },
+	[TCA_FLOW_ADDEND]	= { .type = NLA_U32 },
+	[TCA_FLOW_MASK]		= { .type = NLA_U32 },
+	[TCA_FLOW_XOR]		= { .type = NLA_U32 },
+	[TCA_FLOW_DIVISOR]	= { .type = NLA_U32 },
+	[TCA_FLOW_ACT]		= { .type = NLA_NESTED },
+	[TCA_FLOW_POLICE]	= { .type = NLA_NESTED },
+	[TCA_FLOW_EMATCHES]	= { .type = NLA_NESTED },
+};
+
+/*
+ * Create a new flow filter or update an existing one from netlink attributes.
+ *
+ * @tp:     classifier instance; tp->root holds the flow_head filter list
+ * @base:   not used by this function
+ * @handle: filter handle; required for a new filter, and if non-zero it
+ *          must equal the handle of the filter being updated
+ * @tca:    top-level attributes; tca[TCA_OPTIONS] carries the nested
+ *          TCA_FLOW_* attributes
+ * @arg:    in: existing filter cast to unsigned long, or 0 to create one;
+ *          out: the created/updated filter
+ *
+ * Returns 0 on success or a negative errno.  The scalar filter fields are
+ * written under tcf_tree_lock().
+ */
+static int flow_change(struct tcf_proto *tp, unsigned long base,
+		       u32 handle, struct nlattr **tca,
+		       unsigned long *arg)
+{
+	struct flow_head *head = tp->root;
+	struct flow_filter *f;
+	struct nlattr *opt = tca[TCA_OPTIONS];
+	struct nlattr *tb[TCA_FLOW_MAX + 1];
+	struct tcf_exts e;
+	struct tcf_ematch_tree t;
+	unsigned int nkeys = 0;
+	u32 baseclass = 0;
+	u32 keymask = 0;
+	u32 rshift = 0;
+	u32 mode;
+	int err;
+
+	if (opt == NULL)
+		return -EINVAL;
+
+	err = nla_parse_nested(tb, TCA_FLOW_MAX, opt, flow_policy);
+	if (err < 0)
+		return err;
+
+	if (tb[TCA_FLOW_BASECLASS]) {
+		baseclass = nla_get_u32(tb[TCA_FLOW_BASECLASS]);
+		if (TC_H_MIN(baseclass) == 0)
+			return -EINVAL;
+	}
+
+	if (tb[TCA_FLOW_KEYS]) {
+		keymask = nla_get_u32(tb[TCA_FLOW_KEYS]);
+		/* reject key bits this version does not know about */
+		if (fls(keymask) - 1 > FLOW_KEY_MAX)
+			return -EOPNOTSUPP;
+
+		nkeys = hweight32(keymask);
+		if (nkeys == 0)
+			return -EINVAL;
+	}
+
+	if (tb[TCA_FLOW_RSHIFT]) {
+		/*
+		 * flow_classify() evaluates "classid >> rshift" on a u32;
+		 * a shift count of 32 or more is undefined behaviour, so
+		 * refuse it here instead of storing it in the filter.
+		 */
+		rshift = nla_get_u32(tb[TCA_FLOW_RSHIFT]);
+		if (rshift >= 32)
+			return -EINVAL;
+	}
+
+	err = tcf_exts_validate(tp, tb, tca[TCA_RATE], &e, &flow_ext_map);
+	if (err < 0)
+		return err;
+
+	err = tcf_em_tree_validate(tp, tb[TCA_FLOW_EMATCHES], &t);
+	if (err < 0)
+		goto err1;
+
+	f = (struct flow_filter *)*arg;
+	if (f != NULL) {
+		/* update of an existing filter */
+		err = -EINVAL;
+		if (f->handle != handle && handle)
+			goto err2;
+
+		mode = f->mode;
+		if (tb[TCA_FLOW_MODE])
+			mode = nla_get_u32(tb[TCA_FLOW_MODE]);
+		/* only hash mode can combine more than one key */
+		if (mode != FLOW_MODE_HASH && nkeys > 1)
+			goto err2;
+	} else {
+		/* new filter: handle and keys are mandatory */
+		err = -EINVAL;
+		if (!handle)
+			goto err2;
+		if (!tb[TCA_FLOW_KEYS])
+			goto err2;
+
+		mode = FLOW_MODE_MAP;
+		if (tb[TCA_FLOW_MODE])
+			mode = nla_get_u32(tb[TCA_FLOW_MODE]);
+		if (mode != FLOW_MODE_HASH && nkeys > 1)
+			goto err2;
+
+		/* default the major to the qdisc handle and the minor to 1 */
+		if (TC_H_MAJ(baseclass) == 0)
+			baseclass = TC_H_MAKE(tp->q->handle, baseclass);
+		if (TC_H_MIN(baseclass) == 0)
+			baseclass = TC_H_MAKE(baseclass, 1);
+
+		err = -ENOBUFS;
+		f = kzalloc(sizeof(*f), GFP_KERNEL);
+		if (f == NULL)
+			goto err2;
+
+		f->handle = handle;
+		f->mask	  = ~0U;
+	}
+
+	tcf_exts_change(tp, &f->exts, &e);
+	tcf_em_tree_change(tp, &f->ematches, &t);
+
+	tcf_tree_lock(tp);
+
+	if (tb[TCA_FLOW_KEYS]) {
+		f->keymask = keymask;
+		f->nkeys   = nkeys;
+	}
+
+	f->mode = mode;
+
+	if (tb[TCA_FLOW_MASK])
+		f->mask = nla_get_u32(tb[TCA_FLOW_MASK]);
+	if (tb[TCA_FLOW_XOR])
+		f->xor = nla_get_u32(tb[TCA_FLOW_XOR]);
+	if (tb[TCA_FLOW_RSHIFT])
+		f->rshift = rshift;
+	if (tb[TCA_FLOW_ADDEND])
+		f->addend = nla_get_u32(tb[TCA_FLOW_ADDEND]);
+
+	if (tb[TCA_FLOW_DIVISOR])
+		f->divisor = nla_get_u32(tb[TCA_FLOW_DIVISOR]);
+	if (baseclass)
+		f->baseclass = baseclass;
+
+	/* *arg still holds the caller's value: 0 means f was just allocated */
+	if (*arg == 0)
+		list_add_tail(&f->list, &head->filters);
+
+	tcf_tree_unlock(tp);
+
+	*arg = (unsigned long)f;
+	return 0;
+
+err2:
+	/* release the validated ematch tree */
+	tcf_em_tree_destroy(tp, &t);
+err1:
+	/* release the validated extensions */
+	tcf_exts_destroy(tp, &e);
+	return err;
+}
+
+static void flow_destroy_filter(struct tcf_proto *tp, struct flow_filter *f)
+{
+	tcf_exts_destroy(tp, &f->exts);
+	tcf_em_tree_destroy(tp, &f->ematches);
+	kfree(f);
+}
+
+static int flow_delete(struct tcf_proto *tp, unsigned long arg)
+{
+	struct flow_filter *f = (struct flow_filter *)arg;
+
+	tcf_tree_lock(tp);
+	list_del(&f->list);
+	tcf_tree_unlock(tp);
+	flow_destroy_filter(tp, f);
+	return 0;
+}
+
+static int flow_init(struct tcf_proto *tp)
+{
+	struct flow_head *head;
+
+	if (!flow_hashrnd_initted) {
+		get_random_bytes(&flow_hashrnd, 4);
+		flow_hashrnd_initted = 1;
+	}
+
+	head = kzalloc(sizeof(*head), GFP_KERNEL);
+	if (head == NULL)
+		return -ENOBUFS;
+	INIT_LIST_HEAD(&head->filters);
+	tp->root = head;
+	return 0;
+}
+
+static void flow_destroy(struct tcf_proto *tp)
+{
+	struct flow_head *head = tp->root;
+	struct flow_filter *f, *next;
+
+	list_for_each_entry_safe(f, next, &head->filters, list) {
+		list_del(&f->list);
+		flow_destroy_filter(tp, f);
+	}
+	kfree(head);
+}
+
+static unsigned long flow_get(struct tcf_proto *tp, u32 handle)
+{
+	struct flow_head *head = tp->root;
+	struct flow_filter *f;
+
+	list_for_each_entry(f, &head->filters, list)
+		if (f->handle == handle)
+			return (unsigned long)f;
+	return 0;
+}
+
+static void flow_put(struct tcf_proto *tp, unsigned long f)
+{
+	return;
+}
+
+static int flow_dump(struct tcf_proto *tp, unsigned long fh,
+		     struct sk_buff *skb, struct tcmsg *t)
+{
+	struct flow_filter *f = (struct flow_filter *)fh;
+	struct nlattr *nest;
+
+	if (f == NULL)
+		return skb->len;
+
+	t->tcm_handle = f->handle;
+
+	nest = nla_nest_start(skb, TCA_OPTIONS);
+	if (nest == NULL)
+		goto nla_put_failure;
+
+	NLA_PUT_U32(skb, TCA_FLOW_KEYS, f->keymask);
+	NLA_PUT_U32(skb, TCA_FLOW_MODE, f->mode);
+
+	if (f->mask != ~0 || f->xor != 0) {
+		NLA_PUT_U32(skb, TCA_FLOW_MASK, f->mask);
+		NLA_PUT_U32(skb, TCA_FLOW_XOR, f->xor);
+	}
+	if (f->rshift)
+		NLA_PUT_U32(skb, TCA_FLOW_RSHIFT, f->rshift);
+	if (f->addend)
+		NLA_PUT_U32(skb, TCA_FLOW_ADDEND, f->addend);
+
+	if (f->divisor)
+		NLA_PUT_U32(skb, TCA_FLOW_DIVISOR, f->divisor);
+	if (f->baseclass)
+		NLA_PUT_U32(skb, TCA_FLOW_BASECLASS, f->baseclass);
+
+	if (tcf_exts_dump(skb, &f->exts, &flow_ext_map) < 0)
+		goto nla_put_failure;
+
+	if (f->ematches.hdr.nmatches &&
+	    tcf_em_tree_dump(skb, &f->ematches, TCA_FLOW_EMATCHES) < 0)
+		goto nla_put_failure;
+
+	nla_nest_end(skb, nest);
+
+	if (tcf_exts_dump_stats(skb, &f->exts, &flow_ext_map) < 0)
+		goto nla_put_failure;
+
+	return skb->len;
+
+nla_put_failure:
+	nlmsg_trim(skb, nest);
+	return -1;
+}
+
+static void flow_walk(struct tcf_proto *tp, struct tcf_walker *arg)
+{
+	struct flow_head *head = tp->root;
+	struct flow_filter *f;
+
+	list_for_each_entry(f, &head->filters, list) {
+		if (arg->count < arg->skip)
+			goto skip;
+		if (arg->fn(tp, (unsigned long)f, arg) < 0) {
+			arg->stop = 1;
+			break;
+		}
+skip:
+		arg->count++;
+	}
+}
+
+static struct tcf_proto_ops cls_flow_ops __read_mostly = {
+	.kind		= "flow",
+	.classify	= flow_classify,
+	.init		= flow_init,
+	.destroy	= flow_destroy,
+	.change		= flow_change,
+	.delete		= flow_delete,
+	.get		= flow_get,
+	.put		= flow_put,
+	.dump		= flow_dump,
+	.walk		= flow_walk,
+	.owner		= THIS_MODULE,
+};
+
+static int __init cls_flow_init(void)
+{
+	return register_tcf_proto_ops(&cls_flow_ops);
+}
+
+static void __exit cls_flow_exit(void)
+{
+	unregister_tcf_proto_ops(&cls_flow_ops);
+}
+
+module_init(cls_flow_init);
+module_exit(cls_flow_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>");
+MODULE_DESCRIPTION("TC flow classifier");

^ permalink raw reply related

* Re: [PATCH] Disable TSO for non standard qdiscs
From: Andi Kleen @ 2008-01-31 18:33 UTC (permalink / raw)
  To: Stephen Hemminger; +Cc: Andi Kleen, netdev
In-Reply-To: <20080131092327.75b9c369@extreme>

> Fix the broken qdisc instead.

What do you mean? I don't think the qdiscs are broken.
I cannot think of any way how e.g. TBF can do anything useful
with large TSO packets.

-Andi

^ permalink raw reply

* [IPROUTE 01/02]: Add support for SFQ xstats
From: Patrick McHardy @ 2008-01-31 17:58 UTC (permalink / raw)
  To: Stephen Hemminger; +Cc: Linux Netdev List

[-- Attachment #1: Type: text/plain, Size: 0 bytes --]



[-- Attachment #2: 01.diff --]
[-- Type: text/x-patch, Size: 1604 bytes --]

[IPROUTE]: Add support for SFQ xstats

Signed-off-by: Patrick McHardy <kaber@trash.net>

---
commit 196870f762ee393438c42115425f4af69e5b5186
tree 5650c1f93cc58886f8f97a0e55e374c157b96e2e
parent 54bb35c69cec6c730a4ac95530a1d2ca6670f73b
author Patrick McHardy <kaber@trash.net> Thu, 31 Jan 2008 15:10:07 +0100
committer Patrick McHardy <kaber@trash.net> Thu, 31 Jan 2008 15:10:07 +0100

 include/linux/pkt_sched.h |    5 +++++
 tc/q_sfq.c                |   17 +++++++++++++++++
 2 files changed, 22 insertions(+), 0 deletions(-)

diff --git a/include/linux/pkt_sched.h b/include/linux/pkt_sched.h
index 3276135..4ccd684 100644
--- a/include/linux/pkt_sched.h
+++ b/include/linux/pkt_sched.h
@@ -150,6 +150,11 @@ struct tc_sfq_qopt
 	unsigned	flows;		/* Maximal number of flows  */
 };
 
+struct tc_sfq_xstats
+{
+	__u32		allot;
+};
+
 /*
  *  NOTE: limit, divisor and flows are hardwired to code at the moment.
  *
diff --git a/tc/q_sfq.c b/tc/q_sfq.c
index 05385cf..ce4dade 100644
--- a/tc/q_sfq.c
+++ b/tc/q_sfq.c
@@ -100,8 +100,25 @@ static int sfq_print_opt(struct qdisc_util *qu, FILE *f, struct rtattr *opt)
 	return 0;
 }
 
+/*
+ * Print the SFQ extended statistics carried in @xstats to @f.
+ *
+ * Returns 0 when @xstats is NULL (nothing to print) or after printing,
+ * and -1 when the attribute payload is smaller than struct tc_sfq_xstats.
+ */
+static int sfq_print_xstats(struct qdisc_util *qu, FILE *f,
+			    struct rtattr *xstats)
+{
+	struct tc_sfq_xstats *st;
+
+	if (xstats == NULL)
+		return 0;
+	if (RTA_PAYLOAD(xstats) < sizeof(*st))
+		return -1;
+	st = RTA_DATA(xstats);
+
+	/* allot is declared as __u32, so use an unsigned conversion */
+	fprintf(f, " allot %u ", st->allot);
+	fprintf(f, "\n");
+	return 0;
+}
+
 struct qdisc_util sfq_qdisc_util = {
 	.id		= "sfq",
 	.parse_qopt	= sfq_parse_opt,
 	.print_qopt	= sfq_print_opt,
+	.print_xstats	= sfq_print_xstats,
 };

^ permalink raw reply related

* [IPROUTE 02/02]: Add flow classifier support
From: Patrick McHardy @ 2008-01-31 17:58 UTC (permalink / raw)
  To: Stephen Hemminger; +Cc: Linux Netdev List

[-- Attachment #1: Type: text/plain, Size: 0 bytes --]



[-- Attachment #2: 02.diff --]
[-- Type: text/x-patch, Size: 10761 bytes --]

[IPROUTE]: Add flow classifier support

Signed-off-by: Patrick McHardy <kaber@trash.net>

---
commit ac3df2d7e37826b06cc9093f50d829a9da1873a4
tree b33a2b29abdcea0267fe7a357d282a4c2f67124b
parent 196870f762ee393438c42115425f4af69e5b5186
author Patrick McHardy <kaber@trash.net> Thu, 31 Jan 2008 18:52:47 +0100
committer Patrick McHardy <kaber@trash.net> Thu, 31 Jan 2008 18:52:47 +0100

 include/linux/pkt_cls.h |   50 +++++++
 tc/Makefile             |    1 
 tc/f_flow.c             |  347 +++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 398 insertions(+), 0 deletions(-)

diff --git a/include/linux/pkt_cls.h b/include/linux/pkt_cls.h
index afb79d0..16869c2 100644
--- a/include/linux/pkt_cls.h
+++ b/include/linux/pkt_cls.h
@@ -328,6 +328,56 @@ enum
 
 #define TCA_TCINDEX_MAX     (__TCA_TCINDEX_MAX - 1)
 
+/* Flow filter */
+
+enum
+{
+	FLOW_KEY_SRC,
+	FLOW_KEY_DST,
+	FLOW_KEY_PROTO,
+	FLOW_KEY_PROTO_SRC,
+	FLOW_KEY_PROTO_DST,
+	FLOW_KEY_IIF,
+	FLOW_KEY_PRIORITY,
+	FLOW_KEY_MARK,
+	FLOW_KEY_NFCT,
+	FLOW_KEY_NFCT_SRC,
+	FLOW_KEY_NFCT_DST,
+	FLOW_KEY_NFCT_PROTO_SRC,
+	FLOW_KEY_NFCT_PROTO_DST,
+	FLOW_KEY_RTCLASSID,
+	FLOW_KEY_SKUID,
+	FLOW_KEY_SKGID,
+	__FLOW_KEY_MAX,
+};
+
+#define FLOW_KEY_MAX	(__FLOW_KEY_MAX - 1)
+
+enum
+{
+	FLOW_MODE_MAP,
+	FLOW_MODE_HASH,
+};
+
+enum
+{
+	TCA_FLOW_UNSPEC,
+	TCA_FLOW_KEYS,
+	TCA_FLOW_MODE,
+	TCA_FLOW_BASECLASS,
+	TCA_FLOW_RSHIFT,
+	TCA_FLOW_ADDEND,
+	TCA_FLOW_MASK,
+	TCA_FLOW_XOR,
+	TCA_FLOW_DIVISOR,
+	TCA_FLOW_ACT,
+	TCA_FLOW_POLICE,
+	TCA_FLOW_EMATCHES,
+	__TCA_FLOW_MAX
+};
+
+#define TCA_FLOW_MAX	(__TCA_FLOW_MAX - 1)
+
 /* Basic filter */
 
 enum
diff --git a/tc/Makefile b/tc/Makefile
index 0facc88..7ece958 100644
--- a/tc/Makefile
+++ b/tc/Makefile
@@ -18,6 +18,7 @@ TCMODULES += f_u32.o
 TCMODULES += f_route.o
 TCMODULES += f_fw.o
 TCMODULES += f_basic.o
+TCMODULES += f_flow.o
 TCMODULES += q_dsmark.o
 TCMODULES += q_gred.o
 TCMODULES += f_tcindex.o
diff --git a/tc/f_flow.c b/tc/f_flow.c
new file mode 100644
index 0000000..eca05cd
--- /dev/null
+++ b/tc/f_flow.c
@@ -0,0 +1,347 @@
+/*
+ * f_flow.c		Flow filter
+ *
+ * 		This program is free software; you can redistribute it and/or
+ * 		modify it under the terms of the GNU General Public License
+ * 		as published by the Free Software Foundation; either version
+ * 		2 of the License, or (at your option) any later version.
+ *
+ * Authors:	Patrick McHardy <kaber@trash.net>
+ */
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <string.h>
+#include <errno.h>
+
+#include "utils.h"
+#include "tc_util.h"
+#include "m_ematch.h"
+
+static void explain(void)
+{
+	fprintf(stderr,
+"Usage: ... flow ...\n"
+"\n"
+" [mapping mode]: map key KEY [ OPS ] ...\n"
+" [hashing mode]: hash keys KEY-LIST ...\n"
+"\n"
+"                 [ divisor NUM ] [ baseclass ID ] [ match EMATCH_TREE ]\n"
+"                 [ police POLICE_SPEC ] [ action ACTION_SPEC ]\n"
+"\n"
+"KEY-LIST := [ KEY-LIST , ] KEY\n"
+"KEY      := [ src | dst | proto | proto-src | proto-dst | iif | priority | \n"
+"              mark | nfct | nfct-src | nfct-dst | nfct-proto-src | \n"
+"              nfct-proto-dst | rt-classid | sk-uid | sk-gid ]\n"
+"OPS      := [ or NUM | and NUM | xor NUM | rshift NUM | addend NUM ]\n"
+"ID       := X:Y\n"
+	);
+}
+
+static const char *flow_keys[FLOW_KEY_MAX+1] = {
+	[FLOW_KEY_SRC]			= "src",
+	[FLOW_KEY_DST]			= "dst",
+	[FLOW_KEY_PROTO]		= "proto",
+	[FLOW_KEY_PROTO_SRC]		= "proto-src",
+	[FLOW_KEY_PROTO_DST]		= "proto-dst",
+	[FLOW_KEY_IIF]			= "iif",
+	[FLOW_KEY_PRIORITY]		= "priority",
+	[FLOW_KEY_MARK]			= "mark",
+	[FLOW_KEY_NFCT]			= "nfct",
+	[FLOW_KEY_NFCT_SRC]		= "nfct-src",
+	[FLOW_KEY_NFCT_DST]		= "nfct-dst",
+	[FLOW_KEY_NFCT_PROTO_SRC]	= "nfct-proto-src",
+	[FLOW_KEY_NFCT_PROTO_DST]	= "nfct-proto-dst",
+	[FLOW_KEY_RTCLASSID]		= "rt-classid",
+	[FLOW_KEY_SKUID]		= "sk-uid",
+	[FLOW_KEY_SKGID]		= "sk-gid",
+};
+
+/*
+ * Parse a comma separated list of flow key names into a key bitmask.
+ *
+ * @keys:  out: bitmask with bit number FLOW_KEY_* set for every key named
+ * @nkeys: out: number of distinct keys set in @keys
+ * @argv:  key list; modified in place (each ',' is overwritten with NUL)
+ *
+ * Returns 0 on success, -1 (after printing a message) if a name matches
+ * no entry of flow_keys[].
+ */
+static int flow_parse_keys(__u32 *keys, __u32 *nkeys, char *argv)
+{
+	char *s, *sep;
+	unsigned int i;
+
+	*keys = 0;
+	*nkeys = 0;
+	s = argv;
+	while (s != NULL) {
+		sep = strchr(s, ',');
+		if (sep)
+			*sep = '\0';
+
+		for (i = 0; i <= FLOW_KEY_MAX; i++) {
+			if (matches(s, flow_keys[i]) == 0) {
+				/*
+				 * Count every key only once, so that *nkeys
+				 * always equals the number of bits set in
+				 * *keys even if a name is repeated.
+				 */
+				if (!(*keys & (1U << i))) {
+					*keys |= 1U << i;
+					(*nkeys)++;
+				}
+				break;
+			}
+		}
+		if (i > FLOW_KEY_MAX) {
+			fprintf(stderr, "Unknown flow key \"%s\"\n", s);
+			return -1;
+		}
+		s = sep ? sep + 1 : NULL;
+	}
+	return 0;
+}
+
+static void transfer_bitop(__u32 *mask, __u32 *xor, __u32 m, __u32 x)
+{
+	*xor = x ^ (*xor & m);
+	*mask &= m;
+}
+
+/*
+ * Parse the argument of the "addend" option.
+ *
+ * The argument is either a number or, when one of the address keys is
+ * selected in @keys, an address accepted by get_addr(); for an address the
+ * value is ntohl(addr.data[0]) for AF_INET and ntohl(addr.data[3]) for
+ * AF_INET6.  A leading '-' negates the result.
+ *
+ * @addend: out: parsed value
+ * @argv:   argument string
+ * @keys:   bitmask of the selected keys (bit number FLOW_KEY_* per key)
+ *
+ * Returns 0 on success, -1 if @argv could not be parsed.
+ */
+static int get_addend(__u32 *addend, char *argv, __u32 keys)
+{
+	/*
+	 * @keys is a bitmask, so the FLOW_KEY_* enum values have to be
+	 * turned into bits before they can be tested against it.
+	 */
+	const __u32 addr_keys = (1U << FLOW_KEY_SRC) |
+				(1U << FLOW_KEY_DST) |
+				(1U << FLOW_KEY_NFCT_SRC) |
+				(1U << FLOW_KEY_NFCT_DST);
+	inet_prefix addr;
+	int sign = 0;
+	__u32 tmp;
+
+	if (*argv == '-') {
+		sign = 1;
+		argv++;
+	}
+
+	/* a plain number is always acceptable */
+	if (get_u32(&tmp, argv, 0) == 0)
+		goto out;
+
+	if ((keys & addr_keys) &&
+	    get_addr(&addr, argv, AF_UNSPEC) == 0) {
+		switch (addr.family) {
+		case AF_INET:
+			tmp = ntohl(addr.data[0]);
+			goto out;
+		case AF_INET6:
+			tmp = ntohl(addr.data[3]);
+			goto out;
+		}
+	}
+
+	return -1;
+out:
+	if (sign)
+		tmp = -tmp;
+	*addend = tmp;
+	return 0;
+}
+
+static int flow_parse_opt(struct filter_util *fu, char *handle,
+			  int argc, char **argv, struct nlmsghdr *n)
+{
+	struct tc_police tp;
+	struct tcmsg *t = NLMSG_DATA(n);
+	struct rtattr *tail;
+	__u32 mask = ~0U, xor = 0;
+	__u32 keys = 0, nkeys = 0;
+	__u32 mode = FLOW_MODE_MAP;
+	__u32 tmp;
+
+	memset(&tp, 0, sizeof(tp));
+
+	if (handle) {
+		if (get_u32(&t->tcm_handle, handle, 0)) {
+			fprintf(stderr, "Illegal \"handle\"\n");
+			return -1;
+		}
+	}
+
+	tail = NLMSG_TAIL(n);
+	addattr_l(n, 4096, TCA_OPTIONS, NULL, 0);
+
+	while (argc > 0) {
+		if (matches(*argv, "map") == 0) {
+			mode = FLOW_MODE_MAP;
+		} else if (matches(*argv, "hash") == 0) {
+			mode = FLOW_MODE_HASH;
+		} else if (matches(*argv, "keys") == 0) {
+			NEXT_ARG();
+			if (flow_parse_keys(&keys, &nkeys, *argv))
+				return -1;
+			addattr32(n, 4096, TCA_FLOW_KEYS, keys);
+		} else if (matches(*argv, "and") == 0) {
+			NEXT_ARG();
+			if (get_u32(&tmp, *argv, 0)) {
+				fprintf(stderr, "Illegal \"mask\"\n");
+				return -1;
+			}
+			transfer_bitop(&mask, &xor, tmp, 0);
+		} else if (matches(*argv, "or") == 0) {
+			NEXT_ARG();
+			if (get_u32(&tmp, *argv, 0)) {
+				fprintf(stderr, "Illegal \"or\"\n");
+				return -1;
+			}
+			transfer_bitop(&mask, &xor, ~tmp, tmp);
+		} else if (matches(*argv, "xor") == 0) {
+			NEXT_ARG();
+			if (get_u32(&tmp, *argv, 0)) {
+				fprintf(stderr, "Illegal \"xor\"\n");
+				return -1;
+			}
+			transfer_bitop(&mask, &xor, ~0, tmp);
+		} else if (matches(*argv, "rshift") == 0) {
+			NEXT_ARG();
+			if (get_u32(&tmp, *argv, 0)) {
+				fprintf(stderr, "Illegal \"rshift\"\n");
+				return -1;
+			}
+			addattr32(n, 4096, TCA_FLOW_RSHIFT, tmp);
+		} else if (matches(*argv, "addend") == 0) {
+			NEXT_ARG();
+			if (get_addend(&tmp, *argv, keys)) {
+				fprintf(stderr, "Illegal \"addend\"\n");
+				return -1;
+			}
+			addattr32(n, 4096, TCA_FLOW_ADDEND, tmp);
+		} else if (matches(*argv, "divisor") == 0) {
+			NEXT_ARG();
+			if (get_u32(&tmp, *argv, 0)) {
+				fprintf(stderr, "Illegal \"divisor\"\n");
+				return -1;
+			}
+			addattr32(n, 4096, TCA_FLOW_DIVISOR, tmp);
+		} else if (matches(*argv, "baseclass") == 0) {
+			NEXT_ARG();
+			if (get_tc_classid(&tmp, *argv) || TC_H_MIN(tmp) == 0) {
+				fprintf(stderr, "Illegal \"baseclass\"\n");
+				return -1;
+			}
+			addattr32(n, 4096, TCA_FLOW_BASECLASS, tmp);
+		} else if (matches(*argv, "police") == 0) {
+			NEXT_ARG();
+			if (parse_police(&argc, &argv, TCA_FLOW_POLICE, n)) {
+				fprintf(stderr, "Illegal \"police\"\n");
+				return -1;
+			}
+			continue;
+		} else if (matches(*argv, "action") == 0) {
+			NEXT_ARG();
+			if (parse_action(&argc, &argv, TCA_FLOW_ACT, n)) {
+				fprintf(stderr, "Illegal \"action\"\n");
+				return -1;
+			}
+			continue;
+		} else if (matches(*argv, "match") == 0) {
+			NEXT_ARG();
+			if (parse_ematch(&argc, &argv, TCA_FLOW_EMATCHES, n)) {
+				fprintf(stderr, "Illegal \"ematch\"\n");
+				return -1;
+			}
+			continue;
+		} else if (matches(*argv, "help") == 0) {
+			explain();
+			return -1;
+		} else {
+			fprintf(stderr, "What is \"%s\"?\n", *argv);
+			explain();
+			return -1;
+		}
+		argv++, argc--;
+	}
+
+	if (nkeys > 1 && mode != FLOW_MODE_HASH) {
+		fprintf(stderr, "Invalid mode \"map\" for multiple keys\n");
+		return -1;
+	}
+	addattr32(n, 4096, TCA_FLOW_MODE, mode);
+
+	if (mask != ~0 || xor != 0) {
+		addattr32(n, 4096, TCA_FLOW_MASK, mask);
+		addattr32(n, 4096, TCA_FLOW_XOR, xor);
+	}
+
+	tail->rta_len = (void *)NLMSG_TAIL(n) - (void *)tail;
+	return 0;
+}
+
+static int flow_print_opt(struct filter_util *fu, FILE *f, struct rtattr *opt,
+			  __u32 handle)
+{
+	struct rtattr *tb[TCA_FLOW_MAX+1];
+	SPRINT_BUF(b1);
+	unsigned int i;
+	__u32 mask = ~0, val = 0;
+
+	if (opt == NULL)
+		return -EINVAL;
+
+	parse_rtattr_nested(tb, TCA_FLOW_MAX, opt);
+
+	fprintf(f, "handle 0x%x ", handle);
+
+	if (tb[TCA_FLOW_MODE]) {
+		__u32 mode = *(__u32 *)RTA_DATA(tb[TCA_FLOW_MODE]);
+
+		switch (mode) {
+		case FLOW_MODE_MAP:
+			fprintf(f, "map ");
+			break;
+		case FLOW_MODE_HASH:
+			fprintf(f, "hash ");
+			break;
+		}
+	}
+
+	if (tb[TCA_FLOW_KEYS]) {
+		__u32 keymask = *(__u32 *)RTA_DATA(tb[TCA_FLOW_KEYS]);
+		char *sep = "";
+
+		fprintf(f, "keys ");
+		for (i = 0; i <= FLOW_KEY_MAX; i++) {
+			if (keymask & (1 << i)) {
+				fprintf(f, "%s%s", sep, flow_keys[i]);
+				sep = ",";
+			}
+		}
+		fprintf(f, " ");
+	}
+
+	if (tb[TCA_FLOW_MASK])
+		mask = *(__u32 *)RTA_DATA(tb[TCA_FLOW_MASK]);
+	if (tb[TCA_FLOW_XOR])
+		val = *(__u32 *)RTA_DATA(tb[TCA_FLOW_XOR]);
+
+	if (mask != ~0 || val != 0) {
+		__u32 or = (mask & val) ^ val;
+		__u32 xor = mask & val;
+
+		if (mask != ~0)
+			fprintf(f, "and 0x%.8x ", mask);
+		if (xor != 0)
+			fprintf(f, "xor 0x%.8x ", xor);
+		if (or != 0)
+			fprintf(f, "or 0x%.8x ", or);
+	}
+
+	if (tb[TCA_FLOW_RSHIFT])
+		fprintf(f, "rshift %u ",
+			*(__u32 *)RTA_DATA(tb[TCA_FLOW_RSHIFT]));
+	if (tb[TCA_FLOW_ADDEND])
+		fprintf(f, "addend 0x%x ",
+			*(__u32 *)RTA_DATA(tb[TCA_FLOW_ADDEND]));
+
+	if (tb[TCA_FLOW_DIVISOR])
+		fprintf(f, "divisor %u ",
+			*(__u32 *)RTA_DATA(tb[TCA_FLOW_DIVISOR]));
+	if (tb[TCA_FLOW_BASECLASS])
+		fprintf(f, "baseclass %s ",
+			sprint_tc_classid(*(__u32 *)RTA_DATA(tb[TCA_FLOW_BASECLASS]), b1));
+
+	if (tb[TCA_FLOW_EMATCHES])
+		print_ematch(f, tb[TCA_FLOW_EMATCHES]);
+	if (tb[TCA_FLOW_POLICE])
+		tc_print_police(f, tb[TCA_FLOW_POLICE]);
+	if (tb[TCA_FLOW_ACT]) {
+		fprintf(f, "\n");
+		tc_print_action(f, tb[TCA_FLOW_ACT]);
+	}
+	return 0;
+}
+
+struct filter_util flow_filter_util = {
+	.id		= "flow",
+	.parse_fopt	= flow_parse_opt,
+	.print_fopt	= flow_print_opt,
+};

^ permalink raw reply related


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox