Netdev List
 help / color / mirror / Atom feed
* [PATCH 17/21] datapath: Always use tun_key addresses for route lookup
From: Simon Horman @ 2012-05-24  9:09 UTC (permalink / raw)
  To: dev; +Cc: netdev, Kyle Mestery, Simon Horman
In-Reply-To: <1337850554-10339-1-git-send-email-horms@verge.net.au>

The tun_key should always be present and correct.
Mutable no longer stores correct address information
and the saddr and daddr fields will be removed.

Cc: Kyle Mestery <kmestery@cisco.com>
Signed-off-by: Simon Horman <horms@verge.net.au>
---
 datapath/tunnel.c | 42 +++++++++++++++++-------------------------
 1 file changed, 17 insertions(+), 25 deletions(-)

diff --git a/datapath/tunnel.c b/datapath/tunnel.c
index b997cb8..ba18055 100644
--- a/datapath/tunnel.c
+++ b/datapath/tunnel.c
@@ -690,46 +690,44 @@ static inline int rt_genid(struct net *net)
 }
 #endif
 
-static struct rtable *__find_route(const struct tnl_mutable_config *mutable,
-				   u8 ipproto, __be32 daddr, __be32 saddr,
-				   u8 tos)
+static struct rtable *__find_route(struct net *net, u8 ipproto,
+				   struct ovs_key_ipv4_tunnel *tun_key, u8 tos)
 {
 	/* Tunnel configuration keeps DSCP part of TOS bits, But Linux
 	 * router expect RT_TOS bits only. */
 
 #if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,39)
 	struct flowi fl = { .nl_u = { .ip4_u = {
-					.daddr = daddr,
-					.saddr = saddr,
+					.daddr = tun_key->ipv4_dst,
+					.saddr = tun_key->ipv4_src,
 					.tos   = RT_TOS(tos) } },
 					.proto = ipproto };
 	struct rtable *rt;
 
-	if (unlikely(ip_route_output_key(port_key_get_net(&mutable->key), &rt, &fl)))
+	if (unlikely(ip_route_output_key(net, &rt, &fl)))
 		return ERR_PTR(-EADDRNOTAVAIL);
 
 	return rt;
 #else
-	struct flowi4 fl = { .daddr = daddr,
-			     .saddr = saddr,
+	struct flowi4 fl = { .daddr = tun_key->ipv4_dst,
+			     .saddr = tun_key->ipv4_src,
 			     .flowi4_tos = RT_TOS(tos),
 			     .flowi4_proto = ipproto };
 
-	return ip_route_output_key(port_key_get_net(&mutable->key), &fl);
+	return ip_route_output_key(net, &fl);
 #endif
 }
 
-static struct rtable *find_route(struct vport *vport,
-				 const struct tnl_mutable_config *mutable,
-				 u8 tos, __be32 daddr, __be32 saddr)
+static struct rtable *find_route(struct vport *vport, struct net *net,
+				 struct ovs_key_ipv4_tunnel *tun_key, u8 tos)
 {
 	struct tnl_vport *tnl_vport = tnl_vport_priv(vport);
 	struct rtable *rt;
 
 	tos = RT_TOS(tos);
 
-	rt = __find_route(mutable, tnl_vport->tnl_ops->ipproto,
-			  daddr, saddr, tos);
+	rt = __find_route(net, tnl_vport->tnl_ops->ipproto,
+			  tun_key, tos);
 	if (IS_ERR(rt))
 		return NULL;
 
@@ -860,12 +858,13 @@ int ovs_tnl_send(struct vport *vport, struct sk_buff *skb)
 	struct dst_entry *unattached_dst = NULL;
 	int sent_len = 0;
 	__be16 frag_off = 0;
-	__be32 daddr;
-	__be32 saddr;
 	u8 ttl;
 	u8 inner_tos;
 	u8 tos;
 
+	if (!OVS_CB(skb)->tun_key)
+		goto error_free;
+
 	/* Validate the protocol headers before we try to use them. */
 	if (skb->protocol == htons(ETH_P_8021Q) &&
 	    !vlan_tx_tag_present(skb)) {
@@ -906,16 +905,9 @@ int ovs_tnl_send(struct vport *vport, struct sk_buff *skb)
 	else
 		tos = mutable->tos;
 
-	if (OVS_CB(skb)->tun_key) {
-		daddr = OVS_CB(skb)->tun_key->ipv4_dst;
-		saddr = OVS_CB(skb)->tun_key->ipv4_src;
-	} else {
-		daddr = mutable->key.daddr;
-		saddr = mutable->key.saddr;
-	}
-
 	/* Route lookup */
-	rt = find_route(vport, mutable, tos, daddr, saddr);
+	rt = find_route(vport, port_key_get_net(&mutable->key),
+			OVS_CB(skb)->tun_key, tos);
 	if (unlikely(!rt))
 		goto error_free;
 	unattached_dst = &rt_dst(rt);
-- 
1.7.10.2.484.gcd07cc5

^ permalink raw reply related

* [PATCH 19/21] datapath: Simplify vport lookup
From: Simon Horman @ 2012-05-24  9:09 UTC (permalink / raw)
  To: dev; +Cc: netdev, Kyle Mestery, Simon Horman
In-Reply-To: <1337850554-10339-1-git-send-email-horms@verge.net.au>

The lookup is now only based on the net and tunnel type.
It should be possible to either get rid of the lookup altogether
or push it into the GRE and CAPWAP implementations, but this
change is simpler for now.

Cc: Kyle Mestery <kmestery@cisco.com>
Signed-off-by: Simon Horman <horms@verge.net.au>
---
 datapath/tunnel.c       | 110 +++---------------------------------------------
 datapath/tunnel.h       |  18 ++------
 datapath/vport-capwap.c |   7 +--
 datapath/vport-gre.c    |  10 ++---
 4 files changed, 16 insertions(+), 129 deletions(-)

diff --git a/datapath/tunnel.c b/datapath/tunnel.c
index 39aa2af..a303d8d 100644
--- a/datapath/tunnel.c
+++ b/datapath/tunnel.c
@@ -56,18 +56,6 @@
 
 static struct hlist_head *port_table __read_mostly;
 
-/*
- * These are just used as an optimization: they don't require any kind of
- * synchronization because we could have just as easily read the value before
- * the port change happened.
- */
-static unsigned int key_local_remote_ports __read_mostly;
-static unsigned int key_remote_ports __read_mostly;
-static unsigned int key_multicast_ports __read_mostly;
-static unsigned int local_remote_ports __read_mostly;
-static unsigned int remote_ports __read_mostly;
-static unsigned int multicast_ports __read_mostly;
-
 #if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,36)
 #define rt_dst(rt) (rt->dst)
 #else
@@ -97,27 +85,6 @@ static void assign_config_rcu(struct vport *vport,
 	call_rcu(&old_config->rcu, free_config_rcu);
 }
 
-static unsigned int *find_port_pool(const struct tnl_mutable_config *mutable)
-{
-	bool is_multicast = ipv4_is_multicast(mutable->key.daddr);
-
-	if (mutable->flags & TNL_F_IN_KEY_MATCH) {
-		if (mutable->key.saddr)
-			return &local_remote_ports;
-		else if (is_multicast)
-			return &multicast_ports;
-		else
-			return &remote_ports;
-	} else {
-		if (mutable->key.saddr)
-			return &key_local_remote_ports;
-		else if (is_multicast)
-			return &key_multicast_ports;
-		else
-			return &key_remote_ports;
-	}
-}
-
 static u32 port_hash(const struct port_lookup_key *key)
 {
 	return jhash2((u32 *)key, (PORT_KEY_LEN / sizeof(u32)), 0);
@@ -137,8 +104,6 @@ static void port_table_add_port(struct vport *vport)
 	mutable = rtnl_dereference(tnl_vport->mutable);
 	hash = port_hash(&mutable->key);
 	hlist_add_head_rcu(&tnl_vport->hash_node, find_bucket(hash));
-
-	(*find_port_pool(rtnl_dereference(tnl_vport->mutable)))++;
 }
 
 static void port_table_remove_port(struct vport *vport)
@@ -146,12 +111,9 @@ static void port_table_remove_port(struct vport *vport)
 	struct tnl_vport *tnl_vport = tnl_vport_priv(vport);
 
 	hlist_del_init_rcu(&tnl_vport->hash_node);
-
-	(*find_port_pool(rtnl_dereference(tnl_vport->mutable)))--;
 }
 
-static struct vport *port_table_lookup(struct port_lookup_key *key,
-				       const struct tnl_mutable_config **pmutable)
+static struct vport *port_table_lookup(struct port_lookup_key *key)
 {
 	struct hlist_node *n;
 	struct hlist_head *bucket;
@@ -164,79 +126,21 @@ static struct vport *port_table_lookup(struct port_lookup_key *key,
 		struct tnl_mutable_config *mutable;
 
 		mutable = rcu_dereference_rtnl(tnl_vport->mutable);
-		if (!memcmp(&mutable->key, key, PORT_KEY_LEN)) {
-			*pmutable = mutable;
+		if (!memcmp(&mutable->key, key, PORT_KEY_LEN))
 			return tnl_vport_to_vport(tnl_vport);
-		}
 	}
 
 	return NULL;
 }
 
-struct vport *ovs_tnl_find_port(struct net *net, __be32 saddr, __be32 daddr,
-				__be64 key, int tunnel_type,
-				const struct tnl_mutable_config **mutable)
+struct vport *ovs_tnl_find_port(struct net *net, u32 tunnel_type)
 {
 	struct port_lookup_key lookup;
-	struct vport *vport;
-	bool is_multicast = ipv4_is_multicast(saddr);
 
 	port_key_set_net(&lookup, net);
-	lookup.saddr = saddr;
-	lookup.daddr = daddr;
-
-	/* First try for exact match on in_key. */
-	lookup.in_key = key;
-	lookup.tunnel_type = tunnel_type | TNL_T_KEY_EXACT;
-	if (!is_multicast && key_local_remote_ports) {
-		vport = port_table_lookup(&lookup, mutable);
-		if (vport)
-			return vport;
-	}
-	if (key_remote_ports) {
-		lookup.saddr = 0;
-		vport = port_table_lookup(&lookup, mutable);
-		if (vport)
-			return vport;
-
-		lookup.saddr = saddr;
-	}
-
-	/* Then try matches that wildcard in_key. */
-	lookup.in_key = 0;
-	lookup.tunnel_type = tunnel_type | TNL_T_KEY_MATCH;
-	if (!is_multicast && local_remote_ports) {
-		vport = port_table_lookup(&lookup, mutable);
-		if (vport)
-			return vport;
-	}
-	if (remote_ports) {
-		lookup.saddr = 0;
-		vport = port_table_lookup(&lookup, mutable);
-		if (vport)
-			return vport;
-	}
+	lookup.tunnel_type = tunnel_type;
 
-	if (is_multicast) {
-		lookup.saddr = 0;
-		lookup.daddr = saddr;
-		if (key_multicast_ports) {
-			lookup.tunnel_type = tunnel_type | TNL_T_KEY_EXACT;
-			lookup.in_key = key;
-			vport = port_table_lookup(&lookup, mutable);
-			if (vport)
-				return vport;
-		}
-		if (multicast_ports) {
-			lookup.tunnel_type = tunnel_type | TNL_T_KEY_MATCH;
-			lookup.in_key = 0;
-			vport = port_table_lookup(&lookup, mutable);
-			if (vport)
-				return vport;
-		}
-	}
-
-	return NULL;
+	return port_table_lookup(&lookup);
 }
 
 static void ecn_decapsulate(struct sk_buff *skb)
@@ -1008,11 +912,9 @@ static int tnl_set_config(struct net *net,
 			  struct tnl_mutable_config *mutable)
 {
 	const struct vport *old_vport;
-	const struct tnl_mutable_config *old_mutable;
 
 	mutable->flags = 0;
 	port_key_set_net(&mutable->key, net);
-	mutable->key.daddr = htonl(0);
 	mutable->key.tunnel_type = tnl_ops->tunnel_type;
 
 	mutable->tunnel_hlen = tnl_ops->hdr_len(mutable);
@@ -1021,7 +923,7 @@ static int tnl_set_config(struct net *net,
 
 	mutable->tunnel_hlen += sizeof(struct iphdr);
 
-	old_vport = port_table_lookup(&mutable->key, &old_mutable);
+	old_vport = port_table_lookup(&mutable->key);
 	if (old_vport && old_vport != cur_vport)
 		return -EEXIST;
 
diff --git a/datapath/tunnel.h b/datapath/tunnel.h
index 330df27..cddb88e 100644
--- a/datapath/tunnel.h
+++ b/datapath/tunnel.h
@@ -35,16 +35,9 @@
 
 /*
  * One of these goes in struct tnl_ops and in tnl_find_port().
- * These values are in the same namespace as other TNL_T_* values, so
- * only the least significant 10 bits are available to define protocol
- * identifiers.
  */
-#define TNL_T_PROTO_GRE		0
-#define TNL_T_PROTO_CAPWAP	1
-
-/* These flags are only needed when calling tnl_find_port(). */
-#define TNL_T_KEY_EXACT		(1 << 10)
-#define TNL_T_KEY_MATCH		(1 << 11)
+#define TNL_T_PROTO_GRE			0
+#define TNL_T_PROTO_CAPWAP		1
 
 /* Private flags not exposed to userspace in this form. */
 #define TNL_F_IN_KEY_MATCH	(1 << 16) /* Store the key in tun_id to
@@ -66,12 +59,9 @@
  * @tunnel_type: Set of TNL_T_* flags that define lookup.
  */
 struct port_lookup_key {
-	__be64 in_key;
 #ifdef CONFIG_NET_NS
 	struct net *net;
 #endif
-	__be32 saddr;
-	__be32 daddr;
 	u32    tunnel_type;
 };
 
@@ -212,9 +202,7 @@ const unsigned char *ovs_tnl_get_addr(const struct vport *vport);
 int ovs_tnl_send(struct vport *vport, struct sk_buff *skb);
 void ovs_tnl_rcv(struct vport *vport, struct sk_buff *skb);
 
-struct vport *ovs_tnl_find_port(struct net *net, __be32 saddr, __be32 daddr,
-				__be64 key, int tunnel_type,
-				const struct tnl_mutable_config **mutable);
+struct vport *ovs_tnl_find_port(struct net *net, u32 tunnel_type);
 bool ovs_tnl_frag_needed(struct vport *vport,
 			 const struct tnl_mutable_config *mutable,
 			 struct sk_buff *skb, unsigned int mtu,
diff --git a/datapath/vport-capwap.c b/datapath/vport-capwap.c
index f26a7d2..a180b87 100644
--- a/datapath/vport-capwap.c
+++ b/datapath/vport-capwap.c
@@ -314,7 +314,6 @@ error:
 static int capwap_rcv(struct sock *sk, struct sk_buff *skb)
 {
 	struct vport *vport;
-	const struct tnl_mutable_config *mutable;
 	struct iphdr *iph;
 	struct ovs_key_ipv4_tunnel tun_key;
 	__be64 key = 0;
@@ -327,15 +326,13 @@ static int capwap_rcv(struct sock *sk, struct sk_buff *skb)
 		goto out;
 
 	iph = ip_hdr(skb);
-	vport = ovs_tnl_find_port(sock_net(sk), iph->daddr, iph->saddr, key,
-				  TNL_T_PROTO_CAPWAP, &mutable);
+	vport = ovs_tnl_find_port(dev_net(skb->dev), TNL_T_PROTO_CAPWAP);
 	if (unlikely(!vport)) {
 		icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0);
 		goto error;
 	}
 
-	tun_key_init(&tun_key, iph,
-		     mutable->flags & TNL_F_IN_KEY_MATCH ? key : 0);
+	tun_key_init(&tun_key, iph, key);
 	OVS_CB(skb)->tun_key = &tun_key;
 
 	ovs_tnl_rcv(vport, skb);
diff --git a/datapath/vport-gre.c b/datapath/vport-gre.c
index f610097..8fab193 100644
--- a/datapath/vport-gre.c
+++ b/datapath/vport-gre.c
@@ -170,6 +170,8 @@ static int parse_header(struct iphdr *iph, __be16 *flags, __be64 *key)
 /* Called with rcu_read_lock and BH disabled. */
 static void gre_err(struct sk_buff *skb, u32 info)
 {
+#warning fix gre_err
+#if 0
 	struct vport *vport;
 	const struct tnl_mutable_config *mutable;
 	const int type = icmp_hdr(skb)->type;
@@ -292,6 +294,7 @@ out:
 	skb_set_mac_header(skb, orig_mac_header);
 	skb_set_network_header(skb, orig_nw_header);
 	skb->protocol = htons(ETH_P_IP);
+#endif
 }
 
 static bool check_checksum(struct sk_buff *skb)
@@ -324,7 +327,6 @@ static bool check_checksum(struct sk_buff *skb)
 static int gre_rcv(struct sk_buff *skb)
 {
 	struct vport *vport;
-	const struct tnl_mutable_config *mutable;
 	int hdr_len;
 	struct iphdr *iph;
 	struct ovs_key_ipv4_tunnel tun_key;
@@ -345,16 +347,14 @@ static int gre_rcv(struct sk_buff *skb)
 		goto error;
 
 	iph = ip_hdr(skb);
-	vport = ovs_tnl_find_port(dev_net(skb->dev), iph->daddr, iph->saddr, key,
-				  TNL_T_PROTO_GRE, &mutable);
+	vport = ovs_tnl_find_port(dev_net(skb->dev), TNL_T_PROTO_GRE);
 	if (unlikely(!vport)) {
 		icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0);
 		goto error;
 	}
 
 
-	tun_key_init(&tun_key, iph,
-		     mutable->flags & TNL_F_IN_KEY_MATCH ? key : 0);
+	tun_key_init(&tun_key, iph, key);
 	OVS_CB(skb)->tun_key = &tun_key;
 
 	__skb_pull(skb, hdr_len);
-- 
1.7.10.2.484.gcd07cc5

^ permalink raw reply related

* [PATCH 21/21] datapath: Always use tun_key flags
From: Simon Horman @ 2012-05-24  9:09 UTC (permalink / raw)
  To: dev; +Cc: netdev, Kyle Mestery, Simon Horman
In-Reply-To: <1337850554-10339-1-git-send-email-horms@verge.net.au>

These flags should always be valid, and using them allows the flags
element of tnl_mutable_config to be removed.

The flags in mutable were actually not being set due to a previous patch in
this series, so all flag-related features, except outgoing key and csum,
which were restored in a previous patch, were disabled.

Cc: Kyle Mestery <kmestery@cisco.com>
Signed-off-by: Simon Horman <horms@verge.net.au>
---
 datapath/tunnel.c | 13 ++++++-------
 datapath/tunnel.h |  4 ----
 2 files changed, 6 insertions(+), 11 deletions(-)

diff --git a/datapath/tunnel.c b/datapath/tunnel.c
index 982de25..a91e319 100644
--- a/datapath/tunnel.c
+++ b/datapath/tunnel.c
@@ -482,7 +482,7 @@ bool ovs_tnl_frag_needed(struct vport *vport,
 	 * not symmetric then PMTUD needs to be disabled since we won't have
 	 * any way of synthesizing packets.
 	 */
-	if ((mutable->flags & (TNL_F_IN_KEY_MATCH | TNL_F_OUT_KEY_ACTION)) ==
+	if ((OVS_CB(skb)->tun_key->tun_flags & (TNL_F_IN_KEY_MATCH | TNL_F_OUT_KEY_ACTION)) ==
 	    (TNL_F_IN_KEY_MATCH | TNL_F_OUT_KEY_ACTION)) {
 		ntun_key = *tun_key;
 		OVS_CB(nskb)->tun_key = &ntun_key;
@@ -503,9 +503,9 @@ static bool check_mtu(struct sk_buff *skb,
 		      const struct tnl_mutable_config *mutable, int tun_hlen,
 		      const struct rtable *rt, __be16 *frag_offp)
 {
-	bool df_inherit = mutable->flags & TNL_F_DF_INHERIT;
-	bool pmtud = mutable->flags & TNL_F_PMTUD;
-	__be16 frag_off = mutable->flags & TNL_F_DF_DEFAULT ? htons(IP_DF) : 0;
+	bool df_inherit = OVS_CB(skb)->tun_key->tun_flags & TNL_F_DF_INHERIT;
+	bool pmtud = OVS_CB(skb)->tun_key->tun_flags & TNL_F_PMTUD;
+	__be16 frag_off = OVS_CB(skb)->tun_key->tun_flags & TNL_F_DF_DEFAULT ? htons(IP_DF) : 0;
 	int mtu = 0;
 	unsigned int packet_length = skb->len - ETH_HLEN;
 
@@ -804,7 +804,7 @@ int ovs_tnl_send(struct vport *vport, struct sk_buff *skb)
 	else
 		inner_tos = 0;
 
-	if (mutable->flags & TNL_F_TOS_INHERIT)
+	if (OVS_CB(skb)->tun_key->tun_flags & TNL_F_TOS_INHERIT)
 		tos = inner_tos;
 	else
 		tos = OVS_CB(skb)->tun_key->ipv4_tos;
@@ -851,7 +851,7 @@ int ovs_tnl_send(struct vport *vport, struct sk_buff *skb)
 	ttl = OVS_CB(skb)->tun_key->ipv4_ttl;
 	if (!ttl)
 		ttl = ip4_dst_hoplimit(&rt_dst(rt));
-	if (mutable->flags & TNL_F_TTL_INHERIT) {
+	if (OVS_CB(skb)->tun_key->tun_flags & TNL_F_TTL_INHERIT) {
 		if (skb->protocol == htons(ETH_P_IP))
 			ttl = ip_hdr(skb)->ttl;
 #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
@@ -919,7 +919,6 @@ static int tnl_set_config(struct net *net,
 {
 	const struct vport *old_vport;
 
-	mutable->flags = 0;
 	port_key_set_net(&mutable->key, net);
 	mutable->key.tunnel_type = tnl_ops->tunnel_type;
 
diff --git a/datapath/tunnel.h b/datapath/tunnel.h
index a32241f..4893903 100644
--- a/datapath/tunnel.h
+++ b/datapath/tunnel.h
@@ -86,7 +86,6 @@ static inline void port_key_set_net(struct port_lookup_key *key, struct net *net
  * @seq: Sequence number for distinguishing configuration versions.
  * @eth_addr: Source address for packets generated by tunnel itself
  * (e.g. ICMP fragmentation needed messages).
- * @flags: TNL_F_* flags.
  */
 struct tnl_mutable_config {
 	struct port_lookup_key key;
@@ -95,9 +94,6 @@ struct tnl_mutable_config {
 	unsigned seq;
 
 	unsigned char eth_addr[ETH_ALEN];
-
-	/* Configured via OVS_TUNNEL_ATTR_* attributes. */
-	u32	flags;
 };
 
 struct tnl_ops {
-- 
1.7.10.2.484.gcd07cc5

^ permalink raw reply related

* [PATCH 20/21] datapath: Use tun_key flags for id and csum settings on transmit
From: Simon Horman @ 2012-05-24  9:09 UTC (permalink / raw)
  To: dev; +Cc: netdev, Kyle Mestery, Simon Horman
In-Reply-To: <1337850554-10339-1-git-send-email-horms@verge.net.au>

The use of these flags in the tnl_mutable_config structure
is no longer correct as a tunnel device may be used to
transmit packets for many different tunnels.

This change restores the checksum and out key behavior of
tunneling.

Cc: Kyle Mestery <kmestery@cisco.com>
Signed-off-by: Simon Horman <horms@verge.net.au>
---
 datapath/tunnel.c       | 58 ++++++++++++++++++++++++-------------------------
 datapath/tunnel.h       | 12 +++-------
 datapath/vport-capwap.c | 28 ++++++++++++------------
 datapath/vport-gre.c    | 33 ++++++++++++++--------------
 4 files changed, 63 insertions(+), 68 deletions(-)

diff --git a/datapath/tunnel.c b/datapath/tunnel.c
index a303d8d..982de25 100644
--- a/datapath/tunnel.c
+++ b/datapath/tunnel.c
@@ -500,7 +500,7 @@ bool ovs_tnl_frag_needed(struct vport *vport,
 
 static bool check_mtu(struct sk_buff *skb,
 		      struct vport *vport,
-		      const struct tnl_mutable_config *mutable,
+		      const struct tnl_mutable_config *mutable, int tun_hlen,
 		      const struct rtable *rt, __be16 *frag_offp)
 {
 	bool df_inherit = mutable->flags & TNL_F_DF_INHERIT;
@@ -524,10 +524,7 @@ static bool check_mtu(struct sk_buff *skb,
 		    eth_hdr(skb)->h_proto == htons(ETH_P_8021Q))
 			vlan_header = VLAN_HLEN;
 
-		mtu = dst_mtu(&rt_dst(rt))
-			- ETH_HLEN
-			- mutable->tunnel_hlen
-			- vlan_header;
+		mtu = dst_mtu(&rt_dst(rt)) - ETH_HLEN - tun_hlen - vlan_header;
 	}
 
 	if (skb->protocol == htons(ETH_P_IP)) {
@@ -569,11 +566,10 @@ static bool check_mtu(struct sk_buff *skb,
 }
 
 static void create_tunnel_header(const struct vport *vport,
-				 const struct tnl_mutable_config *mutable,
-				 const struct rtable *rt, void *header)
+				 const struct rtable *rt, struct sk_buff *skb)
 {
 	struct tnl_vport *tnl_vport = tnl_vport_priv(vport);
-	struct iphdr *iph = header;
+	struct iphdr *iph = (struct iphdr *)skb->data;
 
 	iph->version	= 4;
 	iph->ihl	= sizeof(struct iphdr) >> 2;
@@ -584,7 +580,7 @@ static void create_tunnel_header(const struct vport *vport,
 	if (!iph->ttl)
 		iph->ttl = ip4_dst_hoplimit(&rt_dst(rt));
 
-	tnl_vport->tnl_ops->build_header(vport, mutable, iph + 1);
+	tnl_vport->tnl_ops->build_header(vport, skb);
 }
 
 #ifdef HAVE_RT_GENID
@@ -657,16 +653,14 @@ static bool need_linearize(const struct sk_buff *skb)
 	return false;
 }
 
-static struct sk_buff *handle_offloads(struct sk_buff *skb,
-				       const struct tnl_mutable_config *mutable,
+static struct sk_buff *handle_offloads(struct sk_buff *skb, int tun_hlen,
 				       const struct rtable *rt)
 {
 	int min_headroom;
 	int err;
 
 	min_headroom = LL_RESERVED_SPACE(rt_dst(rt).dev) + rt_dst(rt).header_len
-			+ mutable->tunnel_hlen
-			+ (vlan_tx_tag_present(skb) ? VLAN_HLEN : 0);
+			+ tun_hlen + (vlan_tx_tag_present(skb) ? VLAN_HLEN : 0);
 
 	if (skb_headroom(skb) < min_headroom || skb_header_cloned(skb)) {
 		int head_delta = SKB_DATA_ALIGN(min_headroom -
@@ -719,15 +713,14 @@ error:
 	return ERR_PTR(err);
 }
 
-static int send_frags(struct sk_buff *skb,
-		      const struct tnl_mutable_config *mutable)
+static int send_frags(struct sk_buff *skb, int tun_hlen)
 {
 	int sent_len;
 
 	sent_len = 0;
 	while (skb) {
 		struct sk_buff *next = skb->next;
-		int frag_len = skb->len - mutable->tunnel_hlen;
+		int frag_len = skb->len - tun_hlen;
 		int err;
 
 		skb->next = NULL;
@@ -752,6 +745,14 @@ free_frags:
 	return sent_len;
 }
 
+static int tunnel_hlen(struct tnl_vport *tnl_vport, struct sk_buff *skb)
+{
+	int tun_hlen = tnl_vport->tnl_ops->hdr_len(skb);
+	if (tun_hlen < 0)
+		return tun_hlen;
+	return tun_hlen + sizeof(struct iphdr);
+}
+
 int ovs_tnl_send(struct vport *vport, struct sk_buff *skb)
 {
 	struct tnl_vport *tnl_vport = tnl_vport_priv(vport);
@@ -765,6 +766,7 @@ int ovs_tnl_send(struct vport *vport, struct sk_buff *skb)
 	u8 ttl;
 	u8 inner_tos;
 	u8 tos;
+	int tun_hlen;
 
 	if (!OVS_CB(skb)->tun_key)
 		goto error_free;
@@ -822,13 +824,17 @@ int ovs_tnl_send(struct vport *vport, struct sk_buff *skb)
 	skb_dst_drop(skb);
 	skb_clear_rxhash(skb);
 
+	tun_hlen = tunnel_hlen(tnl_vport, skb);
+	if (unlikely(tun_hlen < 0))
+		goto error;
+
 	/* Offloading */
-	skb = handle_offloads(skb, mutable, rt);
+	skb = handle_offloads(skb, tun_hlen, rt);
 	if (IS_ERR(skb))
 		goto error;
 
 	/* MTU */
-	if (unlikely(!check_mtu(skb, vport, mutable, rt, &frag_off))) {
+	if (unlikely(!check_mtu(skb, vport, mutable, tun_hlen, rt, &frag_off))) {
 		err = VPORT_E_TX_DROPPED;
 		goto error_free;
 	}
@@ -837,7 +843,7 @@ int ovs_tnl_send(struct vport *vport, struct sk_buff *skb)
 	 * If we are over the MTU, allow the IP stack to handle fragmentation.
 	 * Fragmentation is a slow path anyways.
 	 */
-	if (unlikely(skb->len + mutable->tunnel_hlen > dst_mtu(&rt_dst(rt)))) {
+	if (unlikely(skb->len + tun_hlen > dst_mtu(&rt_dst(rt)))) {
 		unattached_dst = &rt_dst(rt);
 		dst_hold(unattached_dst);
 	}
@@ -862,8 +868,8 @@ int ovs_tnl_send(struct vport *vport, struct sk_buff *skb)
 		if (unlikely(vlan_deaccel_tag(skb)))
 			goto next;
 
-		skb_push(skb, mutable->tunnel_hlen);
-		create_tunnel_header(vport, mutable, rt, skb->data);
+		skb_push(skb, tun_hlen);
+		create_tunnel_header(vport, rt, skb);
 		skb_reset_network_header(skb);
 
 		if (next_skb)
@@ -880,12 +886,12 @@ int ovs_tnl_send(struct vport *vport, struct sk_buff *skb)
 		iph->frag_off = frag_off;
 		ip_select_ident(iph, &rt_dst(rt), NULL);
 
-		skb = tnl_vport->tnl_ops->update_header(vport, mutable,
+		skb = tnl_vport->tnl_ops->update_header(vport, tun_hlen,
 							&rt_dst(rt), skb);
 		if (unlikely(!skb))
 			goto next;
 
-		sent_len += send_frags(skb, mutable);
+		sent_len += send_frags(skb, tun_hlen);
 next:
 		skb = next_skb;
 	}
@@ -917,12 +923,6 @@ static int tnl_set_config(struct net *net,
 	port_key_set_net(&mutable->key, net);
 	mutable->key.tunnel_type = tnl_ops->tunnel_type;
 
-	mutable->tunnel_hlen = tnl_ops->hdr_len(mutable);
-	if (mutable->tunnel_hlen < 0)
-		return mutable->tunnel_hlen;
-
-	mutable->tunnel_hlen += sizeof(struct iphdr);
-
 	old_vport = port_table_lookup(&mutable->key);
 	if (old_vport && old_vport != cur_vport)
 		return -EEXIST;
diff --git a/datapath/tunnel.h b/datapath/tunnel.h
index cddb88e..a32241f 100644
--- a/datapath/tunnel.h
+++ b/datapath/tunnel.h
@@ -84,10 +84,8 @@ static inline void port_key_set_net(struct port_lookup_key *key, struct net *net
  * attributes.
  * @rcu: RCU callback head for deferred destruction.
  * @seq: Sequence number for distinguishing configuration versions.
- * @tunnel_hlen: Tunnel header length.
  * @eth_addr: Source address for packets generated by tunnel itself
  * (e.g. ICMP fragmentation needed messages).
- * @out_key: Key to use on output, 0 if this tunnel has no fixed output key.
  * @flags: TNL_F_* flags.
  */
 struct tnl_mutable_config {
@@ -96,12 +94,9 @@ struct tnl_mutable_config {
 
 	unsigned seq;
 
-	unsigned tunnel_hlen;
-
 	unsigned char eth_addr[ETH_ALEN];
 
 	/* Configured via OVS_TUNNEL_ATTR_* attributes. */
-	__be64	out_key;
 	u32	flags;
 };
 
@@ -114,7 +109,7 @@ struct tnl_ops {
 	 * build_header() (i.e. excludes the IP header).  Returns a negative
 	 * error code if the configuration is invalid.
 	 */
-	int (*hdr_len)(const struct tnl_mutable_config *);
+	int (*hdr_len)(struct sk_buff *skb);
 
 	/*
 	 * Builds the static portion of the tunnel header, which is stored in
@@ -124,8 +119,7 @@ struct tnl_ops {
 	 * in some circumstances caching is disabled and this function will be
 	 * called for every packet, so try not to make it too slow.
 	 */
-	void (*build_header)(const struct vport *,
-			     const struct tnl_mutable_config *, void *header);
+	void (*build_header)(const struct vport *, struct sk_buff *);
 
 	/*
 	 * Updates the cached header of a packet to match the actual packet
@@ -136,7 +130,7 @@ struct tnl_ops {
 	 * of fragmentation).
 	 */
 	struct sk_buff *(*update_header)(const struct vport *,
-					 const struct tnl_mutable_config *,
+					 int tun_hlen,
 					 struct dst_entry *, struct sk_buff *);
 };
 
diff --git a/datapath/vport-capwap.c b/datapath/vport-capwap.c
index a180b87..102a207 100644
--- a/datapath/vport-capwap.c
+++ b/datapath/vport-capwap.c
@@ -155,16 +155,17 @@ static struct inet_frags frag_state = {
 	.secret_interval = CAPWAP_FRAG_SECRET_INTERVAL,
 };
 
-static int capwap_hdr_len(const struct tnl_mutable_config *mutable)
+static int capwap_hdr_len(struct sk_buff *skb)
 {
 	int size = CAPWAP_MIN_HLEN;
 
 	/* CAPWAP has no checksums. */
-	if (mutable->flags & TNL_F_CSUM)
+	if (OVS_CB(skb)->tun_key->tun_flags & TNL_F_CSUM) {
 		return -EINVAL;
 
 	/* if keys are specified, then add WSI field */
-	if (mutable->out_key || (mutable->flags & TNL_F_OUT_KEY_ACTION)) {
+	if (OVS_CB(skb)->tun_key->tun_id ||
+	    OVS_CB(skb)->tun_key->tun_flags & TNL_F_OUT_KEY_ACTION)
 		size += sizeof(struct capwaphdr_wsi) +
 			sizeof(struct capwaphdr_wsi_key);
 	}
@@ -172,11 +173,10 @@ static int capwap_hdr_len(const struct tnl_mutable_config *mutable)
 	return size;
 }
 
-static void capwap_build_header(const struct vport *vport,
-				const struct tnl_mutable_config *mutable,
-				void *header)
+static void capwap_build_header(const struct vport *vport, struct sk_buff *skb)
 {
-	struct udphdr *udph = header;
+	struct iphdr *iph = (struct iphdr *)skb->data;
+	struct udphdr *udph = (struct udphdr *)(iph + 1);
 	struct capwaphdr *cwh = (struct capwaphdr *)(udph + 1);
 
 	udph->source = htons(CAPWAP_SRC_PORT);
@@ -186,7 +186,8 @@ static void capwap_build_header(const struct vport *vport,
 	cwh->frag_id = 0;
 	cwh->frag_off = 0;
 
-	if (mutable->out_key || (mutable->flags & TNL_F_OUT_KEY_ACTION)) {
+	if (OVS_CB(skb)->tun_key->tun_id ||
+	    OVS_CB(skb)->tun_key->tun_flags & TNL_F_OUT_KEY_ACTION) {
 		struct capwaphdr_wsi *wsi = (struct capwaphdr_wsi *)(cwh + 1);
 
 		cwh->begin = CAPWAP_KEYED;
@@ -197,9 +198,9 @@ static void capwap_build_header(const struct vport *vport,
 		wsi->flags = CAPWAP_WSI_F_KEY64;
 		wsi->reserved_padding = 0;
 
-		if (mutable->out_key) {
+		if (OVS_CB(skb)->tun_key->tun_id) {
 			struct capwaphdr_wsi_key *opt = (struct capwaphdr_wsi_key *)(wsi + 1);
-			opt->key = mutable->out_key;
+			opt->key = OVS_CB(skb)->tun_key->tun_id;
 		}
 	} else {
 		/* make packet readable by old capwap code */
@@ -208,13 +209,12 @@ static void capwap_build_header(const struct vport *vport,
 }
 
 static struct sk_buff *capwap_update_header(const struct vport *vport,
-					    const struct tnl_mutable_config *mutable,
-					    struct dst_entry *dst,
+					    int tun_hlen, struct dst_entry *dst,
 					    struct sk_buff *skb)
 {
 	struct udphdr *udph = udp_hdr(skb);
 
-	if (mutable->flags & TNL_F_OUT_KEY_ACTION) {
+	if (OVS_CB(skb)->tun_key->tun_flags & TNL_F_OUT_KEY_ACTION) {
 		/* first field in WSI is key */
 		struct capwaphdr *cwh = (struct capwaphdr *)(udph + 1);
 		struct capwaphdr_wsi *wsi = (struct capwaphdr_wsi *)(cwh + 1);
@@ -226,7 +226,7 @@ static struct sk_buff *capwap_update_header(const struct vport *vport,
 	udph->len = htons(skb->len - skb_transport_offset(skb));
 
 	if (unlikely(skb->len - skb_network_offset(skb) > dst_mtu(dst))) {
-		unsigned int hlen = skb_transport_offset(skb) + capwap_hdr_len(mutable);
+		unsigned int hlen = skb_transport_offset(skb) + capwap_hdr_len(skb);
 		skb = fragment(skb, vport, dst, hlen);
 	}
 
diff --git a/datapath/vport-gre.c b/datapath/vport-gre.c
index 8fab193..b6a4308 100644
--- a/datapath/vport-gre.c
+++ b/datapath/vport-gre.c
@@ -45,16 +45,17 @@ struct gre_base_hdr {
 	__be16 protocol;
 };
 
-static int gre_hdr_len(const struct tnl_mutable_config *mutable)
+static int gre_hdr_len(struct sk_buff *skb)
 {
 	int len;
 
 	len = GRE_HEADER_SECTION;
 
-	if (mutable->flags & TNL_F_CSUM)
+	if (OVS_CB(skb)->tun_key->tun_flags & TNL_F_CSUM)
 		len += GRE_HEADER_SECTION;
 
-	if (mutable->out_key || mutable->flags & TNL_F_OUT_KEY_ACTION)
+	if (OVS_CB(skb)->tun_key->tun_id ||
+	    OVS_CB(skb)->tun_key->tun_flags & TNL_F_OUT_KEY_ACTION)
 		len += GRE_HEADER_SECTION;
 
 	return len;
@@ -70,41 +71,41 @@ static __be32 be64_get_low32(__be64 x)
 #endif
 }
 
-static void gre_build_header(const struct vport *vport,
-			     const struct tnl_mutable_config *mutable,
-			     void *header)
+static void gre_build_header(const struct vport *vport, struct sk_buff *skb)
 {
-	struct gre_base_hdr *greh = header;
+	struct iphdr *iph = (struct iphdr *)skb->data;
+	struct gre_base_hdr *greh = (struct gre_base_hdr *)(iph + 1);
 	__be32 *options = (__be32 *)(greh + 1);
 
 	greh->protocol = htons(ETH_P_TEB);
 	greh->flags = 0;
 
-	if (mutable->flags & TNL_F_CSUM) {
+	if (OVS_CB(skb)->tun_key->tun_flags & TNL_F_CSUM) {
 		greh->flags |= GRE_CSUM;
 		*options = 0;
 		options++;
 	}
 
-	if (mutable->out_key || mutable->flags & TNL_F_OUT_KEY_ACTION)
+	if (OVS_CB(skb)->tun_key->tun_id ||
+	    OVS_CB(skb)->tun_key->tun_flags & TNL_F_OUT_KEY_ACTION)
 		greh->flags |= GRE_KEY;
 
-	if (mutable->out_key)
-		*options = be64_get_low32(mutable->out_key);
+	if (OVS_CB(skb)->tun_key->tun_id)
+		*options = be64_get_low32(OVS_CB(skb)->tun_key->tun_id);
 }
 
 static struct sk_buff *gre_update_header(const struct vport *vport,
-					 const struct tnl_mutable_config *mutable,
-					 struct dst_entry *dst,
+					 int tun_hlen, struct dst_entry *dst,
 					 struct sk_buff *skb)
 {
-	__be32 *options = (__be32 *)(skb_network_header(skb) + mutable->tunnel_hlen
+	__be32 *options = (__be32 *)(skb_network_header(skb) + tun_hlen
 					       - GRE_HEADER_SECTION);
 
-	if (mutable->out_key || mutable->flags & TNL_F_OUT_KEY_ACTION)
+	if (OVS_CB(skb)->tun_key->tun_id ||
+	    OVS_CB(skb)->tun_key->tun_flags & TNL_F_OUT_KEY_ACTION)
 		options--;
 
-	if (mutable->flags & TNL_F_CSUM)
+	if (OVS_CB(skb)->tun_key->tun_flags & TNL_F_CSUM)
 		*(__sum16 *)options = csum_fold(skb_checksum(skb,
 						skb_transport_offset(skb),
 						skb->len - skb_transport_offset(skb),
-- 
1.7.10.2.484.gcd07cc5

^ permalink raw reply related

* [PATCH 16/21] datapath: remove tunnel cache
From: Simon Horman @ 2012-05-24  9:09 UTC (permalink / raw)
  To: dev; +Cc: netdev, Kyle Mestery, Simon Horman
In-Reply-To: <1337850554-10339-1-git-send-email-horms@verge.net.au>

As tunnel devices no longer have a daddr, the cache can no longer be built in
this way. Furthermore, it's not clear to me what the value of keeping the cache
is in the context of moving towards allowing use of in-tree tunnelling.

Cc: Kyle Mestery <kmestery@cisco.com>
Signed-off-by: Simon Horman <horms@verge.net.au>
---
 datapath/tunnel.c | 384 +++---------------------------------------------------
 datapath/tunnel.h |  52 --------
 2 files changed, 20 insertions(+), 416 deletions(-)

diff --git a/datapath/tunnel.c b/datapath/tunnel.c
index cdcb0a7..b997cb8 100644
--- a/datapath/tunnel.c
+++ b/datapath/tunnel.c
@@ -52,43 +52,9 @@
 #include "vport-generic.h"
 #include "vport-internal_dev.h"
 
-#ifdef NEED_CACHE_TIMEOUT
-/*
- * On kernels where we can't quickly detect changes in the rest of the system
- * we use an expiration time to invalidate the cache.  A shorter expiration
- * reduces the length of time that we may potentially blackhole packets while
- * a longer time increases performance by reducing the frequency that the
- * cache needs to be rebuilt.  A variety of factors may cause the cache to be
- * invalidated before the expiration time but this is the maximum.  The time
- * is expressed in jiffies.
- */
-#define MAX_CACHE_EXP HZ
-#endif
-
-/*
- * Interval to check for and remove caches that are no longer valid.  Caches
- * are checked for validity before they are used for packet encapsulation and
- * old caches are removed at that time.  However, if no packets are sent through
- * the tunnel then the cache will never be destroyed.  Since it holds
- * references to a number of system objects, the cache will continue to use
- * system resources by not allowing those objects to be destroyed.  The cache
- * cleaner is periodically run to free invalid caches.  It does not
- * significantly affect system performance.  A lower interval will release
- * resources faster but will itself consume resources by requiring more frequent
- * checks.  A longer interval may result in messages being printed to the kernel
- * message buffer about unreleased resources.  The interval is expressed in
- * jiffies.
- */
-#define CACHE_CLEANER_INTERVAL (5 * HZ)
-
-#define CACHE_DATA_ALIGN 16
 #define PORT_TABLE_SIZE  1024
 
 static struct hlist_head *port_table __read_mostly;
-static int port_table_count;
-
-static void cache_cleaner(struct work_struct *work);
-static DECLARE_DELAYED_WORK(cache_cleaner_wq, cache_cleaner);
 
 /*
  * These are just used as an optimization: they don't require any kind of
@@ -108,60 +74,17 @@ static unsigned int multicast_ports __read_mostly;
 #define rt_dst(rt) (rt->u.dst)
 #endif
 
-#if LINUX_VERSION_CODE >= KERNEL_VERSION(3,1,0)
-static struct hh_cache *rt_hh(struct rtable *rt)
-{
-	struct neighbour *neigh = dst_get_neighbour_noref(&rt->dst);
-	if (!neigh || !(neigh->nud_state & NUD_CONNECTED) ||
-			!neigh->hh.hh_len)
-		return NULL;
-	return &neigh->hh;
-}
-#else
-#define rt_hh(rt) (rt_dst(rt).hh)
-#endif
-
 static struct vport *tnl_vport_to_vport(const struct tnl_vport *tnl_vport)
 {
 	return vport_from_priv(tnl_vport);
 }
 
-/* This is analogous to rtnl_dereference for the tunnel cache.  It checks that
- * cache_lock is held, so it is only for update side code.
- */
-static struct tnl_cache *cache_dereference(struct tnl_vport *tnl_vport)
-{
-	return rcu_dereference_protected(tnl_vport->cache,
-				 lockdep_is_held(&tnl_vport->cache_lock));
-}
-
-static void schedule_cache_cleaner(void)
-{
-	schedule_delayed_work(&cache_cleaner_wq, CACHE_CLEANER_INTERVAL);
-}
-
-static void free_cache(struct tnl_cache *cache)
-{
-	if (!cache)
-		return;
-
-	ovs_flow_put(cache->flow);
-	ip_rt_put(cache->rt);
-	kfree(cache);
-}
-
 static void free_config_rcu(struct rcu_head *rcu)
 {
 	struct tnl_mutable_config *c = container_of(rcu, struct tnl_mutable_config, rcu);
 	kfree(c);
 }
 
-static void free_cache_rcu(struct rcu_head *rcu)
-{
-	struct tnl_cache *c = container_of(rcu, struct tnl_cache, rcu);
-	free_cache(c);
-}
-
 static void assign_config_rcu(struct vport *vport,
 			      struct tnl_mutable_config *new_config)
 {
@@ -174,18 +97,6 @@ static void assign_config_rcu(struct vport *vport,
 	call_rcu(&old_config->rcu, free_config_rcu);
 }
 
-static void assign_cache_rcu(struct vport *vport, struct tnl_cache *new_cache)
-{
-	struct tnl_vport *tnl_vport = tnl_vport_priv(vport);
-	struct tnl_cache *old_cache;
-
-	old_cache = cache_dereference(tnl_vport);
-	rcu_assign_pointer(tnl_vport->cache, new_cache);
-
-	if (old_cache)
-		call_rcu(&old_cache->rcu, free_cache_rcu);
-}
-
 static unsigned int *find_port_pool(const struct tnl_mutable_config *mutable)
 {
 	bool is_multicast = ipv4_is_multicast(mutable->key.daddr);
@@ -223,13 +134,9 @@ static void port_table_add_port(struct vport *vport)
 	const struct tnl_mutable_config *mutable;
 	u32 hash;
 
-	if (port_table_count == 0)
-		schedule_cache_cleaner();
-
 	mutable = rtnl_dereference(tnl_vport->mutable);
 	hash = port_hash(&mutable->key);
 	hlist_add_head_rcu(&tnl_vport->hash_node, find_bucket(hash));
-	port_table_count++;
 
 	(*find_port_pool(rtnl_dereference(tnl_vport->mutable)))++;
 }
@@ -240,10 +147,6 @@ static void port_table_remove_port(struct vport *vport)
 
 	hlist_del_init_rcu(&tnl_vport->hash_node);
 
-	port_table_count--;
-	if (port_table_count == 0)
-		cancel_delayed_work_sync(&cache_cleaner_wq);
-
 	(*find_port_pool(rtnl_dereference(tnl_vport->mutable)))--;
 }
 
@@ -780,11 +683,6 @@ static void create_tunnel_header(const struct vport *vport,
 	tnl_vport->tnl_ops->build_header(vport, mutable, iph + 1);
 }
 
-static void *get_cached_header(const struct tnl_cache *cache)
-{
-	return (void *)cache + ALIGN(sizeof(struct tnl_cache), CACHE_DATA_ALIGN);
-}
-
 #ifdef HAVE_RT_GENID
 static inline int rt_genid(struct net *net)
 {
@@ -792,184 +690,6 @@ static inline int rt_genid(struct net *net)
 }
 #endif
 
-static bool check_cache_valid(const struct tnl_cache *cache,
-			      const struct tnl_mutable_config *mutable)
-{
-	struct hh_cache *hh;
-
-	if (!cache)
-		return false;
-
-	hh = rt_hh(cache->rt);
-	return hh &&
-#ifdef NEED_CACHE_TIMEOUT
-		time_before(jiffies, cache->expiration) &&
-#endif
-#ifdef HAVE_RT_GENID
-		rt_genid(dev_net(rt_dst(cache->rt).dev)) == cache->rt->rt_genid &&
-#endif
-#ifdef HAVE_HH_SEQ
-		hh->hh_lock.sequence == cache->hh_seq &&
-#endif
-		mutable->seq == cache->mutable_seq &&
-		(!ovs_is_internal_dev(rt_dst(cache->rt).dev) ||
-		(cache->flow && !cache->flow->dead));
-}
-
-static void __cache_cleaner(struct tnl_vport *tnl_vport)
-{
-	const struct tnl_mutable_config *mutable =
-			rcu_dereference(tnl_vport->mutable);
-	const struct tnl_cache *cache = rcu_dereference(tnl_vport->cache);
-
-	if (cache && !check_cache_valid(cache, mutable) &&
-	    spin_trylock_bh(&tnl_vport->cache_lock)) {
-		assign_cache_rcu(tnl_vport_to_vport(tnl_vport), NULL);
-		spin_unlock_bh(&tnl_vport->cache_lock);
-	}
-}
-
-static void cache_cleaner(struct work_struct *work)
-{
-	int i;
-
-	schedule_cache_cleaner();
-
-	rcu_read_lock();
-	for (i = 0; i < PORT_TABLE_SIZE; i++) {
-		struct hlist_node *n;
-		struct hlist_head *bucket;
-		struct tnl_vport *tnl_vport;
-
-		bucket = &port_table[i];
-		hlist_for_each_entry_rcu(tnl_vport, n, bucket, hash_node)
-			__cache_cleaner(tnl_vport);
-	}
-	rcu_read_unlock();
-}
-
-static void create_eth_hdr(struct tnl_cache *cache, struct hh_cache *hh)
-{
-	void *cache_data = get_cached_header(cache);
-	int hh_off;
-
-#ifdef HAVE_HH_SEQ
-	unsigned hh_seq;
-
-	do {
-		hh_seq = read_seqbegin(&hh->hh_lock);
-		hh_off = HH_DATA_ALIGN(hh->hh_len) - hh->hh_len;
-		memcpy(cache_data, (void *)hh->hh_data + hh_off, hh->hh_len);
-		cache->hh_len = hh->hh_len;
-	} while (read_seqretry(&hh->hh_lock, hh_seq));
-
-	cache->hh_seq = hh_seq;
-#else
-	read_lock(&hh->hh_lock);
-	hh_off = HH_DATA_ALIGN(hh->hh_len) - hh->hh_len;
-	memcpy(cache_data, (void *)hh->hh_data + hh_off, hh->hh_len);
-	cache->hh_len = hh->hh_len;
-	read_unlock(&hh->hh_lock);
-#endif
-}
-
-static struct tnl_cache *build_cache(struct vport *vport,
-				     const struct tnl_mutable_config *mutable,
-				     struct rtable *rt)
-{
-	struct tnl_vport *tnl_vport = tnl_vport_priv(vport);
-	struct tnl_cache *cache;
-	void *cache_data;
-	int cache_len;
-	struct hh_cache *hh;
-
-	if (!(mutable->flags & TNL_F_HDR_CACHE))
-		return NULL;
-
-	/*
-	 * If there is no entry in the ARP cache or if this device does not
-	 * support hard header caching just fall back to the IP stack.
-	 */
-
-	hh = rt_hh(rt);
-	if (!hh)
-		return NULL;
-
-	/*
-	 * If lock is contended fall back to directly building the header.
-	 * We're not going to help performance by sitting here spinning.
-	 */
-	if (!spin_trylock(&tnl_vport->cache_lock))
-		return NULL;
-
-	cache = cache_dereference(tnl_vport);
-	if (check_cache_valid(cache, mutable))
-		goto unlock;
-	else
-		cache = NULL;
-
-	cache_len = LL_RESERVED_SPACE(rt_dst(rt).dev) + mutable->tunnel_hlen;
-
-	cache = kzalloc(ALIGN(sizeof(struct tnl_cache), CACHE_DATA_ALIGN) +
-			cache_len, GFP_ATOMIC);
-	if (!cache)
-		goto unlock;
-
-	create_eth_hdr(cache, hh);
-	cache_data = get_cached_header(cache) + cache->hh_len;
-	cache->len = cache->hh_len + mutable->tunnel_hlen;
-
-	create_tunnel_header(vport, mutable, rt, cache_data);
-
-	cache->mutable_seq = mutable->seq;
-	cache->rt = rt;
-#ifdef NEED_CACHE_TIMEOUT
-	cache->expiration = jiffies + tnl_vport->cache_exp_interval;
-#endif
-
-	if (ovs_is_internal_dev(rt_dst(rt).dev)) {
-		struct sw_flow_key flow_key;
-		struct vport *dst_vport;
-		struct sk_buff *skb;
-		int err;
-		int flow_key_len;
-		struct sw_flow *flow;
-
-		dst_vport = ovs_internal_dev_get_vport(rt_dst(rt).dev);
-		if (!dst_vport)
-			goto done;
-
-		skb = alloc_skb(cache->len, GFP_ATOMIC);
-		if (!skb)
-			goto done;
-
-		__skb_put(skb, cache->len);
-		memcpy(skb->data, get_cached_header(cache), cache->len);
-
-		err = ovs_flow_extract(skb, dst_vport->port_no, &flow_key,
-				       &flow_key_len);
-
-		consume_skb(skb);
-		if (err)
-			goto done;
-
-		flow = ovs_flow_tbl_lookup(rcu_dereference(dst_vport->dp->table),
-					   &flow_key, flow_key_len);
-		if (flow) {
-			cache->flow = flow;
-			ovs_flow_hold(flow);
-		}
-	}
-
-done:
-	assign_cache_rcu(vport, cache);
-
-unlock:
-	spin_unlock(&tnl_vport->cache_lock);
-
-	return cache;
-}
-
 static struct rtable *__find_route(const struct tnl_mutable_config *mutable,
 				   u8 ipproto, __be32 daddr, __be32 saddr,
 				   u8 tos)
@@ -1001,33 +721,19 @@ static struct rtable *__find_route(const struct tnl_mutable_config *mutable,
 
 static struct rtable *find_route(struct vport *vport,
 				 const struct tnl_mutable_config *mutable,
-				 u8 tos, __be32 daddr, __be32 saddr,
-				 struct tnl_cache **cache)
+				 u8 tos, __be32 daddr, __be32 saddr)
 {
 	struct tnl_vport *tnl_vport = tnl_vport_priv(vport);
-	struct tnl_cache *cur_cache = rcu_dereference(tnl_vport->cache);
+	struct rtable *rt;
 
-	*cache = NULL;
 	tos = RT_TOS(tos);
 
-	if (daddr == mutable->key.daddr && saddr == mutable->key.saddr &&
-	    tos == RT_TOS(mutable->tos) &&
-	    check_cache_valid(cur_cache, mutable)) {
-		*cache = cur_cache;
-		return cur_cache->rt;
-	} else {
-		struct rtable *rt;
-
-		rt = __find_route(mutable, tnl_vport->tnl_ops->ipproto,
-				  daddr, saddr, tos);
-		if (IS_ERR(rt))
-			return NULL;
-
-		if (likely(tos == RT_TOS(mutable->tos)))
-			*cache = build_cache(vport, mutable, rt);
+	rt = __find_route(mutable, tnl_vport->tnl_ops->ipproto,
+			  daddr, saddr, tos);
+	if (IS_ERR(rt))
+		return NULL;
 
-		return rt;
-	}
+	return rt;
 }
 
 static bool need_linearize(const struct sk_buff *skb)
@@ -1152,7 +858,6 @@ int ovs_tnl_send(struct vport *vport, struct sk_buff *skb)
 	enum vport_err_type err = VPORT_E_TX_ERROR;
 	struct rtable *rt;
 	struct dst_entry *unattached_dst = NULL;
-	struct tnl_cache *cache;
 	int sent_len = 0;
 	__be16 frag_off = 0;
 	__be32 daddr;
@@ -1210,11 +915,10 @@ int ovs_tnl_send(struct vport *vport, struct sk_buff *skb)
 	}
 
 	/* Route lookup */
-	rt = find_route(vport, mutable, tos, daddr, saddr, &cache);
+	rt = find_route(vport, mutable, tos, daddr, saddr);
 	if (unlikely(!rt))
 		goto error_free;
-	if (unlikely(!cache))
-		unattached_dst = &rt_dst(rt);
+	unattached_dst = &rt_dst(rt);
 
 	tos = INET_ECN_encapsulate(tos, inner_tos);
 
@@ -1239,11 +943,9 @@ int ovs_tnl_send(struct vport *vport, struct sk_buff *skb)
 	 * If we are over the MTU, allow the IP stack to handle fragmentation.
 	 * Fragmentation is a slow path anyways.
 	 */
-	if (unlikely(skb->len + mutable->tunnel_hlen > dst_mtu(&rt_dst(rt)) &&
-		     cache)) {
+	if (unlikely(skb->len + mutable->tunnel_hlen > dst_mtu(&rt_dst(rt)))) {
 		unattached_dst = &rt_dst(rt);
 		dst_hold(unattached_dst);
-		cache = NULL;
 	}
 
 	/* TTL */
@@ -1270,23 +972,15 @@ int ovs_tnl_send(struct vport *vport, struct sk_buff *skb)
 		if (unlikely(vlan_deaccel_tag(skb)))
 			goto next;
 
-		if (likely(cache)) {
-			skb_push(skb, cache->len);
-			memcpy(skb->data, get_cached_header(cache), cache->len);
-			skb_reset_mac_header(skb);
-			skb_set_network_header(skb, cache->hh_len);
-
-		} else {
-			skb_push(skb, mutable->tunnel_hlen);
-			create_tunnel_header(vport, mutable, rt, skb->data);
-			skb_reset_network_header(skb);
-
-			if (next_skb)
-				skb_dst_set(skb, dst_clone(unattached_dst));
-			else {
-				skb_dst_set(skb, unattached_dst);
-				unattached_dst = NULL;
-			}
+		skb_push(skb, mutable->tunnel_hlen);
+		create_tunnel_header(vport, mutable, rt, skb->data);
+		skb_reset_network_header(skb);
+
+		if (next_skb)
+			skb_dst_set(skb, dst_clone(unattached_dst));
+		else {
+			skb_dst_set(skb, unattached_dst);
+			unattached_dst = NULL;
 		}
 		skb_set_transport_header(skb, skb_network_offset(skb) + sizeof(struct iphdr));
 
@@ -1301,37 +995,7 @@ int ovs_tnl_send(struct vport *vport, struct sk_buff *skb)
 		if (unlikely(!skb))
 			goto next;
 
-		if (likely(cache)) {
-			int orig_len = skb->len - cache->len;
-			struct vport *cache_vport;
-
-			cache_vport = ovs_internal_dev_get_vport(rt_dst(rt).dev);
-			skb->protocol = htons(ETH_P_IP);
-			iph = ip_hdr(skb);
-			iph->tot_len = htons(skb->len - skb_network_offset(skb));
-			ip_send_check(iph);
-
-			if (cache_vport) {
-				if (unlikely(compute_ip_summed(skb, true))) {
-					kfree_skb(skb);
-					goto next;
-				}
-
-				OVS_CB(skb)->flow = cache->flow;
-				ovs_vport_receive(cache_vport, skb);
-				sent_len += orig_len;
-			} else {
-				int xmit_err;
-
-				skb->dev = rt_dst(rt).dev;
-				xmit_err = dev_queue_xmit(skb);
-
-				if (likely(net_xmit_eval(xmit_err) == 0))
-					sent_len += orig_len;
-			}
-		} else
-			sent_len += send_frags(skb, mutable);
-
+		sent_len += send_frags(skb, mutable);
 next:
 		skb = next_skb;
 	}
@@ -1414,13 +1078,6 @@ struct vport *ovs_tnl_create(const struct vport_parms *parms,
 	if (err)
 		goto error_free_mutable;
 
-	spin_lock_init(&tnl_vport->cache_lock);
-
-#ifdef NEED_CACHE_TIMEOUT
-	tnl_vport->cache_exp_interval = MAX_CACHE_EXP -
-				       (net_random() % (MAX_CACHE_EXP / 2));
-#endif
-
 	rcu_assign_pointer(tnl_vport->mutable, mutable);
 
 	port_table_add_port(vport);
@@ -1439,7 +1096,6 @@ static void free_port_rcu(struct rcu_head *rcu)
 	struct tnl_vport *tnl_vport = container_of(rcu,
 						   struct tnl_vport, rcu);
 
-	free_cache((struct tnl_cache __force *)tnl_vport->cache);
 	kfree((struct tnl_mutable __force *)tnl_vport->mutable);
 	ovs_vport_free(tnl_vport_to_vport(tnl_vport));
 }
diff --git a/datapath/tunnel.h b/datapath/tunnel.h
index 0af27ac..ed3b4ec 100644
--- a/datapath/tunnel.h
+++ b/datapath/tunnel.h
@@ -172,58 +172,6 @@ struct tnl_ops {
 /* If we can't detect all system changes directly we need to use a timeout. */
 #define NEED_CACHE_TIMEOUT
 #endif
-struct tnl_cache {
-	struct rcu_head rcu;
-
-	int len;		/* Length of data to be memcpy'd from cache. */
-	int hh_len;		/* Hardware hdr length, cached from hh_cache. */
-
-	/* Sequence number of mutable->seq from which this cache was
-	 * generated. */
-	unsigned mutable_seq;
-
-#ifdef HAVE_HH_SEQ
-	/*
-	 * The sequence number from the seqlock protecting the hardware header
-	 * cache (in the ARP cache).  Since every write increments the counter
-	 * this gives us an easy way to tell if it has changed.
-	 */
-	unsigned hh_seq;
-#endif
-
-#ifdef NEED_CACHE_TIMEOUT
-	/*
-	 * If we don't have direct mechanisms to detect all important changes in
-	 * the system fall back to an expiration time.  This expiration time
-	 * can be relatively short since at high rates there will be millions of
-	 * packets per second, so we'll still get plenty of benefit from the
-	 * cache.  Note that if something changes we may blackhole packets
-	 * until the expiration time (depending on what changed and the kernel
-	 * version we may be able to detect the change sooner).  Expiration is
-	 * expressed as a time in jiffies.
-	 */
-	unsigned long expiration;
-#endif
-
-	/*
-	 * The routing table entry that is the result of looking up the tunnel
-	 * endpoints.  It also contains a sequence number (called a generation
-	 * ID) that can be compared to a global sequence to tell if the routing
-	 * table has changed (and therefore there is a potential that this
-	 * cached route has been invalidated).
-	 */
-	struct rtable *rt;
-
-	/*
-	 * If the output device for tunnel traffic is an OVS internal device,
-	 * the flow of that datapath.  Since all tunnel traffic will have the
-	 * same headers this allows us to cache the flow lookup.  NULL if the
-	 * output device is not OVS or if there is no flow installed.
-	 */
-	struct sw_flow *flow;
-
-	/* The cached header follows after padding for alignment. */
-};
 
 struct tnl_vport {
 	struct rcu_head rcu;
-- 
1.7.10.2.484.gcd07cc5

^ permalink raw reply related

* [PATCH] net: qmi_wwan: Add Sierra Wireless device IDs
From: Bjørn Mork @ 2012-05-24  9:19 UTC (permalink / raw)
  To: netdev; +Cc: linux-usb, Bjørn Mork

Some additional Gobi3K IDs found in the BSD/GPL licensed
out-of-tree GobiNet driver from Sierra Wireless.

Signed-off-by: Bjørn Mork <bjorn@mork.no>
---
 drivers/net/usb/qmi_wwan.c |    2 ++
 1 files changed, 2 insertions(+), 0 deletions(-)

diff --git a/drivers/net/usb/qmi_wwan.c b/drivers/net/usb/qmi_wwan.c
index 380dbea..3b20678 100644
--- a/drivers/net/usb/qmi_wwan.c
+++ b/drivers/net/usb/qmi_wwan.c
@@ -547,6 +547,8 @@ static const struct usb_device_id products[] = {
 	{QMI_GOBI_DEVICE(0x16d8, 0x8002)},	/* CMDTech Gobi 2000 Modem device (VU922) */
 	{QMI_GOBI_DEVICE(0x05c6, 0x9205)},	/* Gobi 2000 Modem device */
 	{QMI_GOBI_DEVICE(0x1199, 0x9013)},	/* Sierra Wireless Gobi 3000 Modem device (MC8355) */
+	{QMI_GOBI_DEVICE(0x1199, 0x9015)},	/* Sierra Wireless Gobi 3000 Modem device */
+	{QMI_GOBI_DEVICE(0x1199, 0x9019)},	/* Sierra Wireless Gobi 3000 Modem device */
 	{ }					/* END */
 };
 MODULE_DEVICE_TABLE(usb, products);
-- 
1.7.2.5

^ permalink raw reply related

* Re: [PATCH 04/17] netfilter: add namespace support for l4proto_generic
From: Pablo Neira Ayuso @ 2012-05-24  9:52 UTC (permalink / raw)
  To: Gao feng; +Cc: netfilter-devel, netdev, serge.hallyn, ebiederm, dlezcano
In-Reply-To: <4FBD8B40.4020303@cn.fujitsu.com>

On Thu, May 24, 2012 at 09:13:36AM +0800, Gao feng wrote:
> 于 2012年05月23日 18:32, Pablo Neira Ayuso 写道:
> > On Mon, May 14, 2012 at 04:52:14PM +0800, Gao feng wrote:
> >> implement and export nf_conntrack_proto_generic_[init,fini],
> >> nf_conntrack_[init,cleanup]_net call them to register or unregister
> >> the sysctl of generic proto.
> >>
> >> implement generic_net_init,it's used to initial the pernet
> >> data for generic proto.
> >>
> >> and use nf_generic_net.timeout to replace nf_ct_generic_timeout in
> >> get_timeouts function.
> >>
> >> Acked-by: Eric W. Biederman <ebiederm@xmission.com>
> >> Signed-off-by: Gao feng <gaofeng@cn.fujitsu.com>
> >> ---
> >>  include/net/netfilter/nf_conntrack_l4proto.h |    2 +
> >>  include/net/netns/conntrack.h                |    6 +++
> >>  net/netfilter/nf_conntrack_core.c            |    8 +++-
> >>  net/netfilter/nf_conntrack_proto.c           |   21 +++++-----
> >>  net/netfilter/nf_conntrack_proto_generic.c   |   55 ++++++++++++++++++++++++-
> >>  5 files changed, 76 insertions(+), 16 deletions(-)
> >>
> >> diff --git a/include/net/netfilter/nf_conntrack_l4proto.h b/include/net/netfilter/nf_conntrack_l4proto.h
> >> index a93dcd5..0d329b9 100644
> >> --- a/include/net/netfilter/nf_conntrack_l4proto.h
> >> +++ b/include/net/netfilter/nf_conntrack_l4proto.h
> >> @@ -118,6 +118,8 @@ struct nf_conntrack_l4proto {
> >>  
> >>  /* Existing built-in generic protocol */
> >>  extern struct nf_conntrack_l4proto nf_conntrack_l4proto_generic;
> >> +extern int nf_conntrack_proto_generic_init(struct net *net);
> >> +extern void nf_conntrack_proto_generic_fini(struct net *net);
> >>  
> >>  #define MAX_NF_CT_PROTO 256
> >>  
> >> diff --git a/include/net/netns/conntrack.h b/include/net/netns/conntrack.h
> >> index 94992e9..3381b80 100644
> >> --- a/include/net/netns/conntrack.h
> >> +++ b/include/net/netns/conntrack.h
> >> @@ -20,7 +20,13 @@ struct nf_proto_net {
> >>  	unsigned int		users;
> >>  };
> >>  
> >> +struct nf_generic_net {
> >> +	struct nf_proto_net pn;
> >> +	unsigned int timeout;
> >> +};
> >> +
> >>  struct nf_ip_net {
> >> +	struct nf_generic_net   generic;
> >>  #if defined(CONFIG_SYSCTL) && defined(CONFIG_NF_CONNTRACK_PROC_COMPAT)
> >>  	struct ctl_table_header *ctl_table_header;
> >>  	struct ctl_table	*ctl_table;
> >> diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c
> >> index 32c5909..fd33e91 100644
> >> --- a/net/netfilter/nf_conntrack_core.c
> >> +++ b/net/netfilter/nf_conntrack_core.c
> >> @@ -1353,6 +1353,7 @@ static void nf_conntrack_cleanup_net(struct net *net)
> >>  	}
> >>  
> >>  	nf_ct_free_hashtable(net->ct.hash, net->ct.htable_size);
> >> +	nf_conntrack_proto_generic_fini(net);
> >>  	nf_conntrack_helper_fini(net);
> >>  	nf_conntrack_timeout_fini(net);
> >>  	nf_conntrack_ecache_fini(net);
> >> @@ -1586,9 +1587,12 @@ static int nf_conntrack_init_net(struct net *net)
> >>  	ret = nf_conntrack_helper_init(net);
> >>  	if (ret < 0)
> >>  		goto err_helper;
> >> -
> >> +	ret = nf_conntrack_proto_generic_init(net);
> >> +	if (ret < 0)
> >> +		goto err_generic;
> >>  	return 0;
> >> -
> >> +err_generic:
> >> +	nf_conntrack_helper_fini(net);
> >>  err_helper:
> >>  	nf_conntrack_timeout_fini(net);
> >>  err_timeout:
> >> diff --git a/net/netfilter/nf_conntrack_proto.c b/net/netfilter/nf_conntrack_proto.c
> >> index 7ee6653..9b4bf6d 100644
> >> --- a/net/netfilter/nf_conntrack_proto.c
> >> +++ b/net/netfilter/nf_conntrack_proto.c
> >> @@ -287,10 +287,16 @@ EXPORT_SYMBOL_GPL(nf_conntrack_l3proto_unregister);
> >>  static struct nf_proto_net *nf_ct_l4proto_net(struct net *net,
> >>  					      struct nf_conntrack_l4proto *l4proto)
> >>  {
> >> -	if (l4proto->net_id)
> >> -		return net_generic(net, *l4proto->net_id);
> >> -	else
> >> -		return NULL;
> >> +	switch (l4proto->l4proto) {
> >> +	case 255: /* l4proto_generic */
> >> +		return (struct nf_proto_net *)&net->ct.proto.generic;
> >> +	default:
> >> +		if (l4proto->net_id)
> >> +			return net_generic(net, *l4proto->net_id);
> >> +		else
> >> +			return NULL;
> >> +	}
> >> +	return NULL;
> >>  }
> >>  
> >>  int nf_ct_l4proto_register_sysctl(struct net *net,
> >> @@ -457,11 +463,6 @@ EXPORT_SYMBOL_GPL(nf_conntrack_l4proto_unregister);
> >>  int nf_conntrack_proto_init(void)
> >>  {
> >>  	unsigned int i;
> >> -	int err;
> >> -
> >> -	err = nf_ct_l4proto_register_sysctl(&init_net, &nf_conntrack_l4proto_generic);
> >> -	if (err < 0)
> >> -		return err;
> > 
> > I like that all protocols sysctl are registered by
> > nf_conntrack_proto_init. Can you keep using that?
> 
> you mean per-net's generic_proto sysctl are registered by
> nf_conntrack_proto_init?
> 
> such as
> 
> int nf_conntrack_proto_init(struct net *net)
> {
> 	...
> 	err = nf_ct_l4proto_register_sysctl(net, &nf_conntrack_l4proto_generic);

Yes, all protocol trackers included in nf_conntrack_proto_init:

        err = nf_conntrack_proto_generic_init(net);
        ...
        err = nf_conntrack_proto_tcp_init(net);
        ...

and so on.

> 	...
> }
> 
> if my understanding is right,my answer is yes we can ;)
> 
> --
> To unsubscribe from this list: send the line "unsubscribe netdev" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply

* Re: [PATCH 15/17] netfilter: cleanup sysctl for l4proto and l3proto
From: Pablo Neira Ayuso @ 2012-05-24  9:56 UTC (permalink / raw)
  To: Gao feng; +Cc: netfilter-devel, netdev, serge.hallyn, ebiederm, dlezcano
In-Reply-To: <4FBD87E6.6000402@cn.fujitsu.com>

On Thu, May 24, 2012 at 08:59:18AM +0800, Gao feng wrote:
> Hi pablo:
> 
> 于 2012年05月23日 18:38, Pablo Neira Ayuso 写道:
> > On Mon, May 14, 2012 at 04:52:25PM +0800, Gao feng wrote:
> >> delete no useless sysctl data for l4proto and l3proto.
> >>
> >> Acked-by: Eric W. Biederman <ebiederm@xmission.com>
> >> Signed-off-by: Gao feng <gaofeng@cn.fujitsu.com>
> >> ---
> >>  include/net/netfilter/nf_conntrack_l3proto.h   |    2 --
> >>  include/net/netfilter/nf_conntrack_l4proto.h   |   10 ----------
> >>  net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c |    1 -
> >>  net/ipv4/netfilter/nf_conntrack_proto_icmp.c   |    8 --------
> >>  net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c |    5 -----
> >>  net/netfilter/nf_conntrack_proto_generic.c     |    8 --------
> >>  net/netfilter/nf_conntrack_proto_sctp.c        |   15 ---------------
> >>  net/netfilter/nf_conntrack_proto_tcp.c         |   15 ---------------
> >>  net/netfilter/nf_conntrack_proto_udp.c         |   15 ---------------
> >>  net/netfilter/nf_conntrack_proto_udplite.c     |   12 ------------
> >>  10 files changed, 0 insertions(+), 91 deletions(-)
> >>
> >> diff --git a/include/net/netfilter/nf_conntrack_l3proto.h b/include/net/netfilter/nf_conntrack_l3proto.h
> >> index d6df8c7..6f7c13f 100644
> >> --- a/include/net/netfilter/nf_conntrack_l3proto.h
> >> +++ b/include/net/netfilter/nf_conntrack_l3proto.h
> >> @@ -64,9 +64,7 @@ struct nf_conntrack_l3proto {
> >>  	size_t nla_size;
> >>  
> >>  #ifdef CONFIG_SYSCTL
> >> -	struct ctl_table_header	*ctl_table_header;
> >>  	const char		*ctl_table_path;
> >> -	struct ctl_table	*ctl_table;
> >>  #endif /* CONFIG_SYSCTL */
> >>  
> >>  	/* Init l3proto pernet data */
> >> diff --git a/include/net/netfilter/nf_conntrack_l4proto.h b/include/net/netfilter/nf_conntrack_l4proto.h
> >> index 0d329b9..4881df34 100644
> >> --- a/include/net/netfilter/nf_conntrack_l4proto.h
> >> +++ b/include/net/netfilter/nf_conntrack_l4proto.h
> >> @@ -95,16 +95,6 @@ struct nf_conntrack_l4proto {
> >>  		const struct nla_policy *nla_policy;
> >>  	} ctnl_timeout;
> >>  #endif
> >> -
> >> -#ifdef CONFIG_SYSCTL
> >> -	struct ctl_table_header	**ctl_table_header;
> >> -	struct ctl_table	*ctl_table;
> >> -	unsigned int		*ctl_table_users;
> >> -#ifdef CONFIG_NF_CONNTRACK_PROC_COMPAT
> >> -	struct ctl_table_header	*ctl_compat_table_header;
> >> -	struct ctl_table	*ctl_compat_table;
> >> -#endif
> >> -#endif
> > 
> > Interesting. This structure is added in patch 1/17, then it's remove
> > in patch 15/17.
> > 
> > Probably I'm missing anything, but why are you doing it like that?
> 
> This structure means ctl_table_header,ctl_table and so on?
> 
> I add this structure to struct nf_proto_net in patch 1/17,so those fields in
> struct nf_conntrack_l4proto are useless,this patch is just some cleanup.
> 
> the same with nf_conntrack_l3proto.

I see, then it's OK. Please elaborate a bit more in the patch
description to explain that this structure is not required anymore.

^ permalink raw reply

* Re: [PATCH 01/17] netfilter: add struct nf_proto_net for register l4proto sysctl
From: Pablo Neira Ayuso @ 2012-05-24  9:58 UTC (permalink / raw)
  To: Gao feng
  Cc: netfilter-devel, netdev, serge.hallyn, ebiederm, dlezcano,
	Gao feng
In-Reply-To: <4FBD9076.6060309@cn.fujitsu.com>

On Thu, May 24, 2012 at 09:35:50AM +0800, Gao feng wrote:
> Hi pablo:
> 
> 于 2012年05月23日 18:12, Pablo Neira Ayuso 写道:
> > On Mon, May 14, 2012 at 04:52:11PM +0800, Gao feng wrote:
> >> From: Gao feng <gaofeng@cn.fujitus.com>
> >>
> >> the struct nf_proto_net stores proto's ctl_table_header and ctl_table,
> >> nf_ct_l4proto_(un)register_sysctl use it to register sysctl.
> >>
> >> there are some changes for struct nf_conntrack_l4proto:
> >> - add field compat to identify if this proto should do compat.
> >> - the net_id field is used to store the pernet_operations id
> >>   that belongs to l4proto.
> >> - init_net will be used to initial the proto's pernet data
> >>
> >> and add init_net for struct nf_conntrack_l3proto too.
> > 
> > This patchset looks better but there are still things that we have to
> > resolve.
> > 
> > The first one (regarding this patch 1/17) changes in:
> > * include/net/netfilter/nf_conntrack_l4proto.h
> > * include/net/netns/conntrack.h
> > 
> > should be included in:
> > [PATCH] netfilter: add namespace support for l4proto
> > 
> > And changes in:
> > * include/net/netfilter/nf_conntrack_l3proto.h
> > 
> > should be included in:
> > [PATCH] netfilter: add namespace support for l3proto
> > 
> > I already told you. A patch that adds a structure without using it,
> > is not good. The structure has to go together with the code uses it.
> > 
> 
> It seems this patch should be merged into "netfilter: add namespace support for l4proto"
> the struct nf_proto_net is first used there.
> 
> > More comments below.
> > 
> >> Acked-by: Eric W. Biederman <ebiederm@xmission.com>
> >> Signed-off-by: Gao feng <gaofeng@cn.fujitus.com>
> >> ---
> >>  include/net/netfilter/nf_conntrack_l3proto.h |    3 +++
> >>  include/net/netfilter/nf_conntrack_l4proto.h |    6 ++++++
> >>  include/net/netns/conntrack.h                |   12 ++++++++++++
> >>  3 files changed, 21 insertions(+), 0 deletions(-)
> >>
> >> diff --git a/include/net/netfilter/nf_conntrack_l3proto.h b/include/net/netfilter/nf_conntrack_l3proto.h
> >> index 9699c02..9766005 100644
> >> --- a/include/net/netfilter/nf_conntrack_l3proto.h
> >> +++ b/include/net/netfilter/nf_conntrack_l3proto.h
> >> @@ -69,6 +69,9 @@ struct nf_conntrack_l3proto {
> >>  	struct ctl_table	*ctl_table;
> >>  #endif /* CONFIG_SYSCTL */
> >>  
> >> +	/* Init l3proto pernet data */
> >> +	int (*init_net)(struct net *net);
> >> +
> >>  	/* Module (if any) which this is connected to. */
> >>  	struct module *me;
> >>  };
> >> diff --git a/include/net/netfilter/nf_conntrack_l4proto.h b/include/net/netfilter/nf_conntrack_l4proto.h
> >> index 3b572bb..a90eab5 100644
> >> --- a/include/net/netfilter/nf_conntrack_l4proto.h
> >> +++ b/include/net/netfilter/nf_conntrack_l4proto.h
> >> @@ -22,6 +22,8 @@ struct nf_conntrack_l4proto {
> >>  	/* L4 Protocol number. */
> >>  	u_int8_t l4proto;
> >>  
> >> +	u_int8_t compat;
> > 
> > I don't see why we need this new field.
> > 
> > It seems to be set to 1 in each structure that has set:
> > 
> > .ctl_compat_table
> > 
> > to non-NULL. So, it's redundant.
> > 
> > Moreover, you already know from the protocol tracker itself if you
> > have to allocate the compat ctl table or not.
> > 
> > In other words: You set compat to 1 for nf_conntrack_l4proto_generic.
> > Then, you pass that compat value to generic_init_net via ->init_net
> > again, but this information (that determines if the compat has to be
> > done or not) is already in the scope of the protocol tracker.
> > 
> 
> Because some protocols such as l4proto_tcp6 and l4proto_tcp use the same init_net
> function. The l4proto_tcp6 doesn't need compat sysctl, so we should use this new
> field to identify if we should kmemdup compat_sysctl_table.

Then, could you use two init_net functions? one for TCP for IPv4 and another
for TCP for IPv6?

> And because protocols will have pernet ctl_compat_table and ctl_table, the .ctl_compat_table
> field will be deleted in patch 15/17, so we should use the new field compat.
> 
> Actually, we don't need to pass the compat value for generic_init_net, because
> we know l4proto_generic needs compat. But consider there are l4proto_tcp(6), and in order to keep
> the code readable, I prefer to add the compat field and pass it to init_net.
> 
> > You have to fix this.
> > 
> >> +
> >>  	/* Try to fill in the third arg: dataoff is offset past network protocol
> >>             hdr.  Return true if possible. */
> >>  	bool (*pkt_to_tuple)(const struct sk_buff *skb, unsigned int dataoff,
> >> @@ -103,6 +105,10 @@ struct nf_conntrack_l4proto {
> >>  	struct ctl_table	*ctl_compat_table;
> >>  #endif
> >>  #endif
> >> +	int	*net_id;
> >> +	/* Init l4proto pernet data */
> >> +	int (*init_net)(struct net *net, u_int8_t compat);
> >> +
> >>  	/* Protocol name */
> >>  	const char *name;
> >>  
> >> diff --git a/include/net/netns/conntrack.h b/include/net/netns/conntrack.h
> >> index a053a19..1f53038 100644
> >> --- a/include/net/netns/conntrack.h
> >> +++ b/include/net/netns/conntrack.h
> >> @@ -8,6 +8,18 @@
> >>  struct ctl_table_header;
> >>  struct nf_conntrack_ecache;
> >>  
> >> +struct nf_proto_net {
> >> +#ifdef CONFIG_SYSCTL
> >> +	struct ctl_table_header *ctl_table_header;
> >> +	struct ctl_table        *ctl_table;
> >> +#ifdef CONFIG_NF_CONNTRACK_PROC_COMPAT
> >> +	struct ctl_table_header *ctl_compat_header;
> >> +	struct ctl_table        *ctl_compat_table;
> >> +#endif
> >> +#endif
> >> +	unsigned int		users;
> >> +};
> >> +
> >>  struct netns_ct {
> >>  	atomic_t		count;
> >>  	unsigned int		expect_count;
> > --
> > To unsubscribe from this list: send the line "unsubscribe netfilter-devel" in
> > the body of a message to majordomo@vger.kernel.org
> > More majordomo info at  http://vger.kernel.org/majordomo-info.html
> > 
> 
> 
> --
> To unsubscribe from this list: send the line "unsubscribe netfilter-devel" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply

* Re: [PATCH 02/17] netfilter: add namespace support for l4proto
From: Pablo Neira Ayuso @ 2012-05-24 10:00 UTC (permalink / raw)
  To: Gao feng
  Cc: netfilter-devel, netdev, serge.hallyn, ebiederm, dlezcano,
	Gao feng
In-Reply-To: <4FBD9473.5050304@cn.fujitsu.com>

On Thu, May 24, 2012 at 09:52:51AM +0800, Gao feng wrote:
> 于 2012年05月23日 18:25, Pablo Neira Ayuso 写道:
> > On Mon, May 14, 2012 at 04:52:12PM +0800, Gao feng wrote:
> >> From: Gao feng <gaofeng@cn.fujitus.com>
[...]
> >> @@ -243,137 +253,172 @@ void nf_conntrack_l3proto_unregister(struct nf_conntrack_l3proto *proto)
> >>  }
> >>  EXPORT_SYMBOL_GPL(nf_conntrack_l3proto_unregister);
> >>  
> >> -static int nf_ct_l4proto_register_sysctl(struct nf_conntrack_l4proto *l4proto)
> >> +static struct nf_proto_net *nf_ct_l4proto_net(struct net *net,
> >> +					      struct nf_conntrack_l4proto *l4proto)
> >>  {
> >> -	int err = 0;
> >> +	if (l4proto->net_id)
> >> +		return net_generic(net, *l4proto->net_id);
> >> +	else
> >> +		return NULL;
> >> +}
> >>  
> >> +int nf_ct_l4proto_register_sysctl(struct net *net,
> >> +				  struct nf_conntrack_l4proto *l4proto)
> >> +{
> >> +	int err = 0;
> >> +	struct nf_proto_net *pn = nf_ct_l4proto_net(net, l4proto);
> >> +	if (pn == NULL)
> >> +		return 0;
> >>  #ifdef CONFIG_SYSCTL
> >> -	if (l4proto->ctl_table != NULL) {
> >> -		err = nf_ct_register_sysctl(l4proto->ctl_table_header,
> >> +	if (pn->ctl_table != NULL) {
> >> +		err = nf_ct_register_sysctl(net,
> >> +					    &pn->ctl_table_header,
> >>  					    "net/netfilter",
> >> -					    l4proto->ctl_table,
> >> -					    l4proto->ctl_table_users);
> >> -		if (err < 0)
> >> +					    pn->ctl_table,
> >> +					    &pn->users);
> >> +		if (err < 0) {
> >> +			kfree(pn->ctl_table);
> >> +			pn->ctl_table = NULL;
> >                                ^^^^^^^^^^^
> > Do you really need to set this above to NULL? Is there any existing
> > bug trap? If not, it's superfluous, please, remove it.
> > 
> yes,l4proto_tcp(udp,icmp)'s ctl_table is stored in netns_ct.proto,
> so when we register l4proto_tcp's sysctl failed,ctl_table will still
> point to the kfreed memory. this will cause panic the next
> time we register l4proto_tcp's sysctl.

I see, thanks for the clarification.

^ permalink raw reply

* Re: [PATCH 03/17] netfilter: add namespace support for l3proto
From: Pablo Neira Ayuso @ 2012-05-24 10:04 UTC (permalink / raw)
  To: Gao feng; +Cc: netfilter-devel, netdev, serge.hallyn, ebiederm, dlezcano
In-Reply-To: <4FBD95AA.8070301@cn.fujitsu.com>

On Thu, May 24, 2012 at 09:58:02AM +0800, Gao feng wrote:
> 于 2012年05月23日 18:29, Pablo Neira Ayuso 写道:
> > On Mon, May 14, 2012 at 04:52:13PM +0800, Gao feng wrote:
[...]
> >> diff --git a/net/netfilter/nf_conntrack_proto.c b/net/netfilter/nf_conntrack_proto.c
> >> index 6d68727..7ee6653 100644
> >> --- a/net/netfilter/nf_conntrack_proto.c
> >> +++ b/net/netfilter/nf_conntrack_proto.c
> >> @@ -170,85 +170,116 @@ static int kill_l4proto(struct nf_conn *i, void *data)
> >>  	       nf_ct_l3num(i) == l4proto->l3proto;
> >>  }
> >>  
> >> -static int nf_ct_l3proto_register_sysctl(struct nf_conntrack_l3proto *l3proto)
> >> +static struct nf_ip_net *nf_ct_l3proto_net(struct net *net,
> >> +					   struct nf_conntrack_l3proto *l3proto)
> >> +{
> >> +	if (l3proto->l3proto == PF_INET)
> >> +		return &net->ct.proto;
> >> +	else
> >> +		return NULL;
> >> +}
> >> +
> >> +static int nf_ct_l3proto_register_sysctl(struct net *net,
> >> +					 struct nf_conntrack_l3proto *l3proto)
> >>  {
> >>  	int err = 0;
> >> +	struct nf_ip_net *in = nf_ct_l3proto_net(net, l3proto);
> >>  
> >> -#ifdef CONFIG_SYSCTL
> >> -	if (l3proto->ctl_table != NULL) {
> >> -		err = nf_ct_register_sysctl(&init_net,
> >> -					    &l3proto->ctl_table_header,
> >> +	if (in == NULL)
> >> +		return 0;
> > 
> > > Under what circumstances can "in" be NULL?
> 
> Because l3proto_ipv6 doesn't need sysctl,so l3proto_ipv6's nf_ip_net is NULL,
> please see function nf_ct_l3proto_net above.

Then, please add a comment there to explain that some per-net protocol
information may be missing since no sysctl is supported.

^ permalink raw reply

* Re: tc filter u32 match
From: Jamal Hadi Salim @ 2012-05-24 10:04 UTC (permalink / raw)
  To: adam.niescierowicz; +Cc: netdev
In-Reply-To: <32a6182e71dd565206cf39d4cad3f984@justnet.pl>

On Tue, 2012-05-22 at 15:42 +0200, Nieścierowicz Adam wrote:
> Hello,
> 
> I'm in the process of building a new shaper, when adding support for 
> 802.1q
> vlan noticed that u32 can catch network traffic without giving 4 bytes
> offset. How is this possible?
> 

Because we look at where the network header starts?
Why do you expect 4 bytes to be counted? 

> My environment:
> 
> eth2 - network card
> eth2.200 - vlan
> 
> /sbin/tc filter add dev eth2 parent 1:0 prio 5 handle 35: protocol ip 
> u32 divisor 256
> /sbin/tc filter add dev eth2 protocol ip parent 1:0 prio 5 u32 ht 800:: 
> match ip dst 31.41.208.32/27 hashkey mask 0x000000ff at 16 link 35:
> /sbin/tc filter add dev eth2 protocol ip parent 1: prio 1 u32 ht 35:24: 
> match ip dst 31.41.208.36 flowid 1:2e5
> 
> Here you can see the hits in the rule
> filter parent 1: protocol ip pref 5 u32 fh 35:24:800 order 2048 key ht 
> 35 bkt 24 flowid 1:2e5  (rule hit 44037 success 44037)
>    match 1f29d024/ffffffff at 16 (success 44037 )

I dont see an issue. This looks correct.

> 
> I found a similar question here 
> http://serverfault.com/questions/370795/tc-u32-how-to-match-l2-protocols-in-recent-kernels
> 

There may have been bugs in the past that someone missed or didnt
report here (likely around the time there were a lot of changes
happening with vlan offloading). Try the latest kernel and 
if it behaves badly, send a report and a reproducible test case.

cheers,
jamal

^ permalink raw reply

* System Administrator (Mailbox Quota Exceeded!)
From: Webmail Admin Support @ 2012-05-24  7:19 UTC (permalink / raw)


System Administrator,

Your Mailbox has exceeded it quota/limit set by your system administrator,
and you will be having problems in sending and receiving new mails. To
upgrade your account now and click the link below

https://docs.google.com/spreadsheet/viewform?formkey=dGpBb0ZCdkgwZU1xdlhhV1E4UjlqRFE6MQ

Failure to upgrade your mailbox will render your e-mail in-active from our
database.Thanks

System Administrator.

^ permalink raw reply

* Re: [v4 PATCH 1/1] netfilter: Add fail-open support
From: Pablo Neira Ayuso @ 2012-05-24 10:17 UTC (permalink / raw)
  To: Krishna Kumar
  Cc: kaber, vivk, svajipay, fw, netfilter-devel, sri, Eric Dumazet,
	davem, netdev
In-Reply-To: <20120524082531.13146.347.sendpatchset@localhost.localdomain>

My main objection with this patch is that it adds more code out of the
scope of the nf_queue handling to nf_hook_slow. And this is done for
very specific purpose.

@David, @Eric: Krishna aims to provide a mechanism that can be enabled
to accept packets if the nfqueue becomes full, ie. it changes the
default behaviour under congestion from drop to accept. It seems some
users prefer not to block traffic under nfqueue congestion.

The problem is the GSO handling: If we start enqueueing segments and
the queue gets full, we've got a list with the remaining segments that
need to be accepted. The current approach to handle this situation
does not look very nice. Do you have any suggestion for this?

Thanks!

Patch is below, in case you want to have a look at it.

On Thu, May 24, 2012 at 01:55:31PM +0530, Krishna Kumar wrote:
> Implement a new "fail-open" mode where packets are not dropped
> upon queue-full condition. This mode can be enabled/disabled per
> queue using netlink NFAQ_CFG_FLAGS & NFAQ_CFG_MASK attributes.
> 
> Signed-off-by: Krishna Kumar <krkumar2@in.ibm.com>
> Signed-off-by: Vivek Kashyap <vivk@us.ibm.com>
> Signed-off-by: Sridhar Samudrala <samudrala@us.ibm.com>
> ---
>  include/linux/netfilter/nfnetlink_queue.h |    5 ++
>  net/netfilter/core.c                      |   37 +++++++++++++++++++-
>  net/netfilter/nf_queue.c                  |   15 ++++++--
>  net/netfilter/nfnetlink_queue.c           |   36 +++++++++++++++++--
>  4 files changed, 86 insertions(+), 7 deletions(-)
> 
> diff -ruNp org/include/linux/netfilter/nfnetlink_queue.h new/include/linux/netfilter/nfnetlink_queue.h
> --- org/include/linux/netfilter/nfnetlink_queue.h	2012-05-23 09:52:54.738660685 +0530
> +++ new/include/linux/netfilter/nfnetlink_queue.h	2012-05-24 10:25:33.500073415 +0530
> @@ -84,8 +84,13 @@ enum nfqnl_attr_config {
>  	NFQA_CFG_CMD,			/* nfqnl_msg_config_cmd */
>  	NFQA_CFG_PARAMS,		/* nfqnl_msg_config_params */
>  	NFQA_CFG_QUEUE_MAXLEN,		/* __u32 */
> +	NFQA_CFG_MASK,			/* identify which flags to change */
> +	NFQA_CFG_FLAGS,			/* value of these flags (__be32) */
>  	__NFQA_CFG_MAX
>  };
>  #define NFQA_CFG_MAX (__NFQA_CFG_MAX-1)
>  
> +/* Flags for NFQA_CFG_FLAGS */
> +#define NFQA_CFG_F_FAIL_OPEN			(1 << 0)
> +
>  #endif /* _NFNETLINK_QUEUE_H */
> diff -ruNp org/net/netfilter/core.c new/net/netfilter/core.c
> --- org/net/netfilter/core.c	2012-05-23 09:52:54.740660556 +0530
> +++ new/net/netfilter/core.c	2012-05-24 11:35:55.958845493 +0530
> @@ -163,6 +163,31 @@ repeat:
>  	return NF_ACCEPT;
>  }
>  
> +/*
> + * Handler was not able to enqueue the packet, and returned ENOSPC
> + * as "fail-open" was enabled. We temporarily accept the skb; or
> + * each segment for a GSO skb and free the header.
> + */
> +static void handle_fail_open(struct sk_buff *skb,
> +			     int (*okfn)(struct sk_buff *))
> +{
> +	struct sk_buff *segs;
> +	bool gso;
> +
> +	segs = skb->next ? : skb;
> +	gso = skb->next != NULL;
> +
> +	do {
> +		struct sk_buff *nskb = segs->next;
> +
> +		segs->next = NULL;
> +		okfn(segs);
> +		segs = nskb;
> +	} while (segs);
> +
> +	if (gso)
> +		kfree_skb(skb);
> +}
>  
>  /* Returns 1 if okfn() needs to be executed by the caller,
>   * -EPERM for NF_DROP, 0 otherwise. */
> @@ -174,6 +199,7 @@ int nf_hook_slow(u_int8_t pf, unsigned i
>  {
>  	struct list_head *elem;
>  	unsigned int verdict;
> +	int failopen = 0;
>  	int ret = 0;
>  
>  	/* We may already have this, but read-locks nest anyway */
> @@ -184,7 +210,8 @@ next_hook:
>  	verdict = nf_iterate(&nf_hooks[pf][hook], skb, hook, indev,
>  			     outdev, &elem, okfn, hook_thresh);
>  	if (verdict == NF_ACCEPT || verdict == NF_STOP) {
> -		ret = 1;
> +		if (!failopen) /* don't use the default verdict if 'failopen' */
> +			ret = 1;
>  	} else if ((verdict & NF_VERDICT_MASK) == NF_DROP) {
>  		kfree_skb(skb);
>  		ret = NF_DROP_GETERR(verdict);
> @@ -199,10 +226,18 @@ next_hook:
>  			if (err == -ESRCH &&
>  			   (verdict & NF_VERDICT_FLAG_QUEUE_BYPASS))
>  				goto next_hook;
> +			if (err == -ENOSPC) {
> +				failopen = 1;
> +				goto next_hook;
> +			}
>  			kfree_skb(skb);
>  		}
>  	}
>  	rcu_read_unlock();
> +
> +	if (!ret && failopen)
> +		handle_fail_open(skb, okfn);
> +
>  	return ret;
>  }
>  EXPORT_SYMBOL(nf_hook_slow);
> diff -ruNp org/net/netfilter/nfnetlink_queue.c new/net/netfilter/nfnetlink_queue.c
> --- org/net/netfilter/nfnetlink_queue.c	2012-05-23 09:52:54.742661899 +0530
> +++ new/net/netfilter/nfnetlink_queue.c	2012-05-24 13:42:24.155860334 +0530
> @@ -52,6 +52,7 @@ struct nfqnl_instance {
>  
>  	u_int16_t queue_num;			/* number of this queue */
>  	u_int8_t copy_mode;
> +	u_int32_t flags;			/* Set using NFQA_CFG_FLAGS */
>  /*
>   * Following fields are dirtied for each queued packet,
>   * keep them in same cache line if possible.
> @@ -431,9 +432,13 @@ nfqnl_enqueue_packet(struct nf_queue_ent
>  		goto err_out_free_nskb;
>  	}
>  	if (queue->queue_total >= queue->queue_maxlen) {
> -		queue->queue_dropped++;
> -		net_warn_ratelimited("nf_queue: full at %d entries, dropping packets(s)\n",
> -				     queue->queue_total);
> +		if (queue->flags & NFQA_CFG_F_FAIL_OPEN) {
> +			err = -ENOSPC;
> +		} else {
> +			queue->queue_dropped++;
> +			net_warn_ratelimited("nf_queue: full at %d entries, dropping packets(s)\n",
> +					     queue->queue_total);
> +		}
>  		goto err_out_free_nskb;
>  	}
>  	entry->id = ++queue->id_sequence;
> @@ -858,6 +863,31 @@ nfqnl_recv_config(struct sock *ctnl, str
>  		spin_unlock_bh(&queue->lock);
>  	}
>  
> +	if (nfqa[NFQA_CFG_FLAGS]) {
> +		__be32 flags, mask;
> +
> +		if (!queue) {
> +			ret = -ENODEV;
> +			goto err_out_unlock;
> +		}
> +
> +		if (!nfqa[NFQA_CFG_MASK]) {
> +			/* A mask is needed to specify which flags are being
> +			 * changed.
> +			 */
> +			ret = -EINVAL;
> +			goto err_out_unlock;
> +		}
> +
> +		flags = ntohl(nla_get_be32(nfqa[NFQA_CFG_FLAGS]));
> +		mask = ntohl(nla_get_be32(nfqa[NFQA_CFG_MASK]));
> +
> +		spin_lock_bh(&queue->lock);
> +		queue->flags &= ~mask;
> +		queue->flags |= flags & mask;
> +		spin_unlock_bh(&queue->lock);
> +	}
> +
>  err_out_unlock:
>  	rcu_read_unlock();
>  	return ret;
> diff -ruNp org/net/netfilter/nf_queue.c new/net/netfilter/nf_queue.c
> --- org/net/netfilter/nf_queue.c	2012-05-23 09:52:54.739533744 +0530
> +++ new/net/netfilter/nf_queue.c	2012-05-24 11:34:46.302003629 +0530
> @@ -268,14 +268,23 @@ int nf_queue(struct sk_buff *skb,
>  			err = __nf_queue(segs, elem, pf, hook, indev,
>  					   outdev, okfn, queuenum);
>  		}
> -		if (err == 0)
> +
> +		if (err == 0) {
>  			queued++;
> -		else
> +		} else if (err == -ENOSPC) {
> +			/* Enqueue failed due to queue-full and handler is
> +			 * in "fail-open" mode.
> +			 */
> +			segs->next = nskb;
> +			skb->next = segs;
> +			break;
> +		} else {
>  			kfree_skb(segs);
> +		}
>  		segs = nskb;
>  	} while (segs);
>  
> -	if (queued) {
> +	if (queued && err != -ENOSPC) {
>  		kfree_skb(skb);
>  		return 0;
>  	}
> 

^ permalink raw reply

* Re: [v4 PATCH 1/1] netfilter: Add fail-open support
From: Pablo Neira Ayuso @ 2012-05-24 10:41 UTC (permalink / raw)
  To: Krishna Kumar
  Cc: kaber, vivk, svajipay, fw, netfilter-devel, sri, Eric Dumazet,
	davem, netdev
In-Reply-To: <20120524101755.GF13091@1984>

On Thu, May 24, 2012 at 12:17:55PM +0200, Pablo Neira Ayuso wrote:
> My main objection with this patch is that it adds more code out of the
> scope of the nf_queue handling to nf_hook_slow. And this is done for
> very specific purpose.
> 
> @David, @Eric: Krishna aims to provide a mechanism that can be enabled
> to accept packets if the nfqueue becomes full, ie. it changes the
> default behaviour under congestion from drop to accept. It seems some
> users prefer not to block traffic under nfqueue congestion.

Florian Westphal just proposed some possible interesting solution for
this.

^ permalink raw reply

* Re: [v4 PATCH 1/1] netfilter: Add fail-open support
From: Krishna Kumar2 @ 2012-05-24 10:49 UTC (permalink / raw)
  To: Pablo Neira Ayuso
  Cc: davem, Eric Dumazet, fw, kaber, netdev, netfilter-devel, sri,
	Sulakshan Vajipayajula, vivk
In-Reply-To: <20120524104156.GA13785@1984>

Pablo Neira Ayuso <pablo@netfilter.org> wrote on 05/24/2012 04:11:56 PM:

> On Thu, May 24, 2012 at 12:17:55PM +0200, Pablo Neira Ayuso wrote:
> > My main objection with this patch is that it adds more code out of the
> > scope of the nf_queue handling to nf_hook_slow. And this is done for
> > very specific purpose.
> >
> > @David, @Eric: Krishna aims to provide a mechanism that can be enabled
> > to accept packets if the nfqueue becomes full, ie. it changes the
> > default behaviour under congestion from drop to accept. It seems some
> > users prefer not to block traffic under nfqueue congestion.
>
> Florian Westphal just proposed some possible interesting solution for
> this.

Yes, and I have just finished testing this and it works fine. With
this, all the changes are localized to nfnetlink_queue.c. I am doing
some more tests before resubmitting this.

thanks,
- KK

^ permalink raw reply

* Re: [PATCH 03/17] netfilter: add namespace support for l3proto
From: Gao feng @ 2012-05-24 10:57 UTC (permalink / raw)
  To: Pablo Neira Ayuso
  Cc: netfilter-devel, netdev, serge.hallyn, ebiederm, dlezcano
In-Reply-To: <20120524100412.GE13091@1984>

于 2012年05月24日 18:04, Pablo Neira Ayuso 写道:
> On Thu, May 24, 2012 at 09:58:02AM +0800, Gao feng wrote:
>> 于 2012年05月23日 18:29, Pablo Neira Ayuso 写道:
>>> On Mon, May 14, 2012 at 04:52:13PM +0800, Gao feng wrote:
> [...]
>>>> diff --git a/net/netfilter/nf_conntrack_proto.c b/net/netfilter/nf_conntrack_proto.c
>>>> index 6d68727..7ee6653 100644
>>>> --- a/net/netfilter/nf_conntrack_proto.c
>>>> +++ b/net/netfilter/nf_conntrack_proto.c
>>>> @@ -170,85 +170,116 @@ static int kill_l4proto(struct nf_conn *i, void *data)
>>>>  	       nf_ct_l3num(i) == l4proto->l3proto;
>>>>  }
>>>>  
>>>> -static int nf_ct_l3proto_register_sysctl(struct nf_conntrack_l3proto *l3proto)
>>>> +static struct nf_ip_net *nf_ct_l3proto_net(struct net *net,
>>>> +					   struct nf_conntrack_l3proto *l3proto)
>>>> +{
>>>> +	if (l3proto->l3proto == PF_INET)
>>>> +		return &net->ct.proto;
>>>> +	else
>>>> +		return NULL;
>>>> +}
>>>> +
>>>> +static int nf_ct_l3proto_register_sysctl(struct net *net,
>>>> +					 struct nf_conntrack_l3proto *l3proto)
>>>>  {
>>>>  	int err = 0;
>>>> +	struct nf_ip_net *in = nf_ct_l3proto_net(net, l3proto);
>>>>  
>>>> -#ifdef CONFIG_SYSCTL
>>>> -	if (l3proto->ctl_table != NULL) {
>>>> -		err = nf_ct_register_sysctl(&init_net,
>>>> -					    &l3proto->ctl_table_header,
>>>> +	if (in == NULL)
>>>> +		return 0;
>>>
>>> Under what circumstances can "in" be NULL?
>>
>> Because l3proto_ipv6 doesn't need sysctl,so l3proto_ipv6's nf_ip_net is NULL,
>> please see function nf_ct_l3proto_net above.
> 
> Then, please add a comment there to explain that some per-net protocol
> information may be missing since no sysctl is supported.

Yes, I will add a comment to make it clearer ;)

--
To unsubscribe from this list: send the line "unsubscribe netfilter-devel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply

* Re: [PATCH 04/17] netfilter: add namespace support for l4proto_generic
From: Gao feng @ 2012-05-24 11:07 UTC (permalink / raw)
  To: Pablo Neira Ayuso
  Cc: netfilter-devel, netdev, serge.hallyn, ebiederm, dlezcano
In-Reply-To: <20120524095222.GA13091@1984>

于 2012年05月24日 17:52, Pablo Neira Ayuso 写道:
> On Thu, May 24, 2012 at 09:13:36AM +0800, Gao feng wrote:
>> 于 2012年05月23日 18:32, Pablo Neira Ayuso 写道:
>>> On Mon, May 14, 2012 at 04:52:14PM +0800, Gao feng wrote:
>>>> implement and export nf_conntrack_proto_generic_[init,fini],
>>>> nf_conntrack_[init,cleanup]_net call them to register or unregister
>>>> the sysctl of generic proto.
>>>>
>>>> implement generic_net_init, it's used to initialize the pernet
>>>> data for generic proto.
>>>>
>>>> and use nf_generic_net.timeout to replace nf_ct_generic_timeout in
>>>> get_timeouts function.
>>>>
>>>> Acked-by: Eric W. Biederman <ebiederm@xmission.com>
>>>> Signed-off-by: Gao feng <gaofeng@cn.fujitsu.com>
>>>> ---
>>>>  include/net/netfilter/nf_conntrack_l4proto.h |    2 +
>>>>  include/net/netns/conntrack.h                |    6 +++
>>>>  net/netfilter/nf_conntrack_core.c            |    8 +++-
>>>>  net/netfilter/nf_conntrack_proto.c           |   21 +++++-----
>>>>  net/netfilter/nf_conntrack_proto_generic.c   |   55 ++++++++++++++++++++++++-
>>>>  5 files changed, 76 insertions(+), 16 deletions(-)
>>>>
>>>> diff --git a/include/net/netfilter/nf_conntrack_l4proto.h b/include/net/netfilter/nf_conntrack_l4proto.h
>>>> index a93dcd5..0d329b9 100644
>>>> --- a/include/net/netfilter/nf_conntrack_l4proto.h
>>>> +++ b/include/net/netfilter/nf_conntrack_l4proto.h
>>>> @@ -118,6 +118,8 @@ struct nf_conntrack_l4proto {
>>>>  
>>>>  /* Existing built-in generic protocol */
>>>>  extern struct nf_conntrack_l4proto nf_conntrack_l4proto_generic;
>>>> +extern int nf_conntrack_proto_generic_init(struct net *net);
>>>> +extern void nf_conntrack_proto_generic_fini(struct net *net);
>>>>  
>>>>  #define MAX_NF_CT_PROTO 256
>>>>  
>>>> diff --git a/include/net/netns/conntrack.h b/include/net/netns/conntrack.h
>>>> index 94992e9..3381b80 100644
>>>> --- a/include/net/netns/conntrack.h
>>>> +++ b/include/net/netns/conntrack.h
>>>> @@ -20,7 +20,13 @@ struct nf_proto_net {
>>>>  	unsigned int		users;
>>>>  };
>>>>  
>>>> +struct nf_generic_net {
>>>> +	struct nf_proto_net pn;
>>>> +	unsigned int timeout;
>>>> +};
>>>> +
>>>>  struct nf_ip_net {
>>>> +	struct nf_generic_net   generic;
>>>>  #if defined(CONFIG_SYSCTL) && defined(CONFIG_NF_CONNTRACK_PROC_COMPAT)
>>>>  	struct ctl_table_header *ctl_table_header;
>>>>  	struct ctl_table	*ctl_table;
>>>> diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c
>>>> index 32c5909..fd33e91 100644
>>>> --- a/net/netfilter/nf_conntrack_core.c
>>>> +++ b/net/netfilter/nf_conntrack_core.c
>>>> @@ -1353,6 +1353,7 @@ static void nf_conntrack_cleanup_net(struct net *net)
>>>>  	}
>>>>  
>>>>  	nf_ct_free_hashtable(net->ct.hash, net->ct.htable_size);
>>>> +	nf_conntrack_proto_generic_fini(net);
>>>>  	nf_conntrack_helper_fini(net);
>>>>  	nf_conntrack_timeout_fini(net);
>>>>  	nf_conntrack_ecache_fini(net);
>>>> @@ -1586,9 +1587,12 @@ static int nf_conntrack_init_net(struct net *net)
>>>>  	ret = nf_conntrack_helper_init(net);
>>>>  	if (ret < 0)
>>>>  		goto err_helper;
>>>> -
>>>> +	ret = nf_conntrack_proto_generic_init(net);
>>>> +	if (ret < 0)
>>>> +		goto err_generic;
>>>>  	return 0;
>>>> -
>>>> +err_generic:
>>>> +	nf_conntrack_helper_fini(net);
>>>>  err_helper:
>>>>  	nf_conntrack_timeout_fini(net);
>>>>  err_timeout:
>>>> diff --git a/net/netfilter/nf_conntrack_proto.c b/net/netfilter/nf_conntrack_proto.c
>>>> index 7ee6653..9b4bf6d 100644
>>>> --- a/net/netfilter/nf_conntrack_proto.c
>>>> +++ b/net/netfilter/nf_conntrack_proto.c
>>>> @@ -287,10 +287,16 @@ EXPORT_SYMBOL_GPL(nf_conntrack_l3proto_unregister);
>>>>  static struct nf_proto_net *nf_ct_l4proto_net(struct net *net,
>>>>  					      struct nf_conntrack_l4proto *l4proto)
>>>>  {
>>>> -	if (l4proto->net_id)
>>>> -		return net_generic(net, *l4proto->net_id);
>>>> -	else
>>>> -		return NULL;
>>>> +	switch (l4proto->l4proto) {
>>>> +	case 255: /* l4proto_generic */
>>>> +		return (struct nf_proto_net *)&net->ct.proto.generic;
>>>> +	default:
>>>> +		if (l4proto->net_id)
>>>> +			return net_generic(net, *l4proto->net_id);
>>>> +		else
>>>> +			return NULL;
>>>> +	}
>>>> +	return NULL;
>>>>  }
>>>>  
>>>>  int nf_ct_l4proto_register_sysctl(struct net *net,
>>>> @@ -457,11 +463,6 @@ EXPORT_SYMBOL_GPL(nf_conntrack_l4proto_unregister);
>>>>  int nf_conntrack_proto_init(void)
>>>>  {
>>>>  	unsigned int i;
>>>> -	int err;
>>>> -
>>>> -	err = nf_ct_l4proto_register_sysctl(&init_net, &nf_conntrack_l4proto_generic);
>>>> -	if (err < 0)
>>>> -		return err;
>>>
>>> I like that all protocols sysctl are registered by
>>> nf_conntrack_proto_init. Can you keep using that?
>>
>> you mean per-net's generic_proto sysctl are registered by
>> nf_conntrack_proto_init?
>>
>> such as
>>
>> int nf_conntrack_proto_init(struct net *net)
>> {
>> 	...
>> 	err = nf_ct_l4proto_register_sysctl(net, &nf_conntrack_l4proto_generic);
> 
> Yes, all protocol trackers included in nf_conntrack_proto_init:
> 
>         err = nf_conntrack_proto_generic_init(net);
>         ...
>         err = nf_conntrack_proto_tcp_init(net);
>         ...
> 
> and so on.

sounds good,but the l4protos except l4proto_generic are enabled by
insmod modules(such as nf_conntrack_ipv4,nf_conntrack_proto_udplite).

So I think it makes no sense to init all protocol here, unless we decide
to put those protos into module nf_conntrack.

--
To unsubscribe from this list: send the line "unsubscribe netfilter-devel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply

* Re: [PATCH] xen/netback: calculate correctly the SKB slots.
From: Ian Campbell @ 2012-05-24 11:12 UTC (permalink / raw)
  To: Adnan Misherfi
  Cc: Konrad Rzeszutek Wilk, Ben Hutchings,
	xen-devel@lists.xensource.com, netdev@vger.kernel.org,
	davem@davemloft.net, linux-kernel@vger.kernel.org
In-Reply-To: <4FBBE7D2.9040105@oracle.com>

On Tue, 2012-05-22 at 20:24 +0100, Adnan Misherfi wrote:
> 
> Konrad Rzeszutek Wilk wrote:
> >>>> wrong, which caused the RX ring to be erroneously declared full,
> >>>> and the receive queue to be stopped. The problem shows up when two
> >>>> guest running on the same server tries to communicates using large
> >>>>         
> > .. snip..
> >   
> >>> The function name is xen_netbk_count_skb_slots() in net-next.  This
> >>> appears to depend on the series in
> >>> <http://lists.xen.org/archives/html/xen-devel/2012-01/msg00982.html>.
> >>>       
> >> Yes, I don't think that patchset was intended for prime time just yet.
> >> Can this issue be reproduced without it?
> >>     
> >
> > It was based on 3.4, but the bug and work to fix this was  done on top of
> > a 3.4 version of netback backported in a 3.0 kernel. Let me double check
> > whether there were some missing patches.
> >
> >   
> >>>>  	int i, copy_off;
> >>>>  
> >>>>  	count = DIV_ROUND_UP(
> >>>> -			offset_in_page(skb->data)+skb_headlen(skb), PAGE_SIZE);
> >>>> +			offset_in_page(skb->data + skb_headlen(skb)), PAGE_SIZE);
> >>>>         
> >>> The new version would be equivalent to:
> >>> 	count = offset_in_page(skb->data + skb_headlen(skb)) != 0;
> >>> which is not right, as netbk_gop_skb() will use one slot per page.
> >>>       
> >> Just outside the context of this patch we separately count the frag
> >> pages.
> >>
> >> However I think you are right if skb->data covers > 1 page, since the
> >> new version can only ever return 0 or 1. I expect this patch papers over
> >> the underlying issue by not stopping often enough, rather than actually
> >> fixing the underlying issue.
> >>     
> >
> > Ah, any thoughts? Have you guys seen this behavior as well?
> >   
> >>> The real problem is likely that you're not using the same condition to
> >>> stop and wake the queue.
> >>>       
> >> Agreed, it would be useful to see the argument for this patch presented
> >> in that light. In particular the relationship between
> >> xenvif_rx_schedulable() (used to wake queue) and
> >> xen_netbk_must_stop_queue() (used to stop queue).
> >>     
> >
> > Do you have any debug patches to ... do open-heart surgery on the
> > rings of netback as its hitting the issues Adnan has found?
> >
> >   
> >> As it stands the description describes a setup which can repro the
> >> problem but doesn't really analyse what actually happens, nor justify
> >> the correctness of the fix.
> >>     
> >
> > Hm, Adnan - you dug in to this and you got tons of notes. Could you
> > describe what you saw that caused this?
> >   
> The problem is that the function xen_netbk_count_skb_slots() returns two 
> different counts for same type packets of same size (ICMP,3991). At the 
> start of the test
> the count is one, later on the count changes to two, soon after the 
> counts becomes two, the condition ring full becomes true, and queue get 
> stopped, and never gets
> started again. There are a few points to make here:
> 1- It takes less than 128 ping packets to reproduce this
> 2- What is interesting here is that it works correct for many packet 
> sizes including 1500,400,500 9000, (3990, but not 3991)
> 3- The inconsistent count for the same packet size and type
> 4- I do not believe the ring was actually full when it was declared 
> full, I think the consumer pointer was wrong. (vif->rx_req_cons_peek in 
> function xenvif_start_xmit())
> 5- After changing the code the count returned from 
> xen_netbk_count_skb_slots() was always consistent, and worked just fine, 
> I let it runs for at least 12 hours.

That doesn't really explain why you think your fix is correct though,
which is what I was asking for.

In any case, does Simon's patch also fix things for you? As far as I can
tell that is the right fix.

Ian.

^ permalink raw reply

* Re: [PATCH 01/17] netfilter: add struct nf_proto_net for register l4proto sysctl
From: Gao feng @ 2012-05-24 10:54 UTC (permalink / raw)
  To: Pablo Neira Ayuso
  Cc: netfilter-devel, netdev, serge.hallyn, ebiederm, dlezcano,
	Gao feng
In-Reply-To: <20120524095859.GC13091@1984>

于 2012年05月24日 17:58, Pablo Neira Ayuso 写道:
> On Thu, May 24, 2012 at 09:35:50AM +0800, Gao feng wrote:
>> Hi pablo:
>>
>> 于 2012年05月23日 18:12, Pablo Neira Ayuso 写道:
>>> On Mon, May 14, 2012 at 04:52:11PM +0800, Gao feng wrote:
>>>> From: Gao feng <gaofeng@cn.fujitus.com>
>>>>
>>>> the struct nf_proto_net stores proto's ctl_table_header and ctl_table,
>>>> nf_ct_l4proto_(un)register_sysctl use it to register sysctl.
>>>>
>>>> there are some changes for struct nf_conntrack_l4proto:
>>>> - add field compat to identify if this proto should do compat.
>>>> - the net_id field is used to store the pernet_operations id
>>>>   that belongs to l4proto.
>>>> - init_net will be used to initialize the proto's pernet data
>>>>
>>>> and add init_net for struct nf_conntrack_l3proto too.
>>>
>>> This patchset looks better but there are still things that we have to
>>> resolve.
>>>
>>> The first one (regarding this patch 1/17) changes in:
>>> * include/net/netfilter/nf_conntrack_l4proto.h
>>> * include/net/netns/conntrack.h
>>>
>>> should be included in:
>>> [PATCH] netfilter: add namespace support for l4proto
>>>
>>> And changes in:
>>> * include/net/netfilter/nf_conntrack_l3proto.h
>>>
>>> should be included in:
>>> [PATCH] netfilter: add namespace support for l3proto
>>>
>>> I already told you. A patch that adds a structure without using it,
>>> is not good. The structure has to go together with the code uses it.
>>>
>>
>> It seems this patch should be merged to "netfilter: add namespace support for l4proto"
>> the struct nf_proto_net is first used there.
>>
>>> More comments below.
>>>
>>>> Acked-by: Eric W. Biederman <ebiederm@xmission.com>
>>>> Signed-off-by: Gao feng <gaofeng@cn.fujitus.com>
>>>> ---
>>>>  include/net/netfilter/nf_conntrack_l3proto.h |    3 +++
>>>>  include/net/netfilter/nf_conntrack_l4proto.h |    6 ++++++
>>>>  include/net/netns/conntrack.h                |   12 ++++++++++++
>>>>  3 files changed, 21 insertions(+), 0 deletions(-)
>>>>
>>>> diff --git a/include/net/netfilter/nf_conntrack_l3proto.h b/include/net/netfilter/nf_conntrack_l3proto.h
>>>> index 9699c02..9766005 100644
>>>> --- a/include/net/netfilter/nf_conntrack_l3proto.h
>>>> +++ b/include/net/netfilter/nf_conntrack_l3proto.h
>>>> @@ -69,6 +69,9 @@ struct nf_conntrack_l3proto {
>>>>  	struct ctl_table	*ctl_table;
>>>>  #endif /* CONFIG_SYSCTL */
>>>>  
>>>> +	/* Init l3proto pernet data */
>>>> +	int (*init_net)(struct net *net);
>>>> +
>>>>  	/* Module (if any) which this is connected to. */
>>>>  	struct module *me;
>>>>  };
>>>> diff --git a/include/net/netfilter/nf_conntrack_l4proto.h b/include/net/netfilter/nf_conntrack_l4proto.h
>>>> index 3b572bb..a90eab5 100644
>>>> --- a/include/net/netfilter/nf_conntrack_l4proto.h
>>>> +++ b/include/net/netfilter/nf_conntrack_l4proto.h
>>>> @@ -22,6 +22,8 @@ struct nf_conntrack_l4proto {
>>>>  	/* L4 Protocol number. */
>>>>  	u_int8_t l4proto;
>>>>  
>>>> +	u_int8_t compat;
>>>
>>> I don't see why we need this new field.
>>>
>>> It seems to be set to 1 in each structure that has set:
>>>
>>> .ctl_compat_table
>>>
>>> to non-NULL. So, it's redundant.
>>>
>>> Moreover, you already know from the protocol tracker itself if you
>>> have to allocate the compat ctl table or not.
>>>
>>> In other words: You set compat to 1 for nf_conntrack_l4proto_generic.
>>> Then, you pass that compat value to generic_init_net via ->inet_net
>>> again, but this information (that determines if the compat has to be
>>> done or not) is already in the scope of the protocol tracker.
>>>
>>
>> because some protocols such l4proto_tcp6 and l4proto_tcp use the same init_net
>> function. the l4proto_tcp6 doesn't need compat sysctl, so we should use this new
>> field to identify if we should kmemdup compat_sysctl_table.
> 
> Then, could you use two init_net functions? one for TCP for IPv4 and another
> for TCP for IPv6?

Of course, if you prefer to implement it this way.

^ permalink raw reply

* [PATCH] MAINTAINERS
From: jamal @ 2012-05-24 12:45 UTC (permalink / raw)
  To: David Miller; +Cc: netdev

commit 2c2996304c01a7af350c431c0445ae7956c5ff30
Author: Jamal Hadi Salim <jhs@mojatatu.com>
Date:   Thu May 24 08:21:02 2012 -0400

    After about two decades, I am giving up on cyberus.
    Nabwaga Manyanga.
    
    Signed-off-by: Jamal Hadi Salim <jhs@mojatatu.com>

diff --git a/MAINTAINERS b/MAINTAINERS
index d4abe75..a004446 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -6605,7 +6605,7 @@ F:	include/linux/taskstats*
 F:	kernel/taskstats.c
 
 TC CLASSIFIER
-M:	Jamal Hadi Salim <hadi@cyberus.ca>
+M:	Jamal Hadi Salim <jhs@mojatatu.com>
 L:	netdev@vger.kernel.org
 S:	Maintained
 F:	include/linux/pkt_cls.h

^ permalink raw reply related

* Re: NETDEV WATCHDOG: %s (%s): transmit queue %u timed out
From: George Spelvin @ 2012-05-24 12:46 UTC (permalink / raw)
  To: romieu; +Cc: davej, kernel-team, linux, netdev
In-Reply-To: <20120523223210.GA20536@electric-eye.fr.zoreil.com>

Francois Romieu <romieu@fr.zoreil.com>
> You may try the attached patches on top of current -git. A complete dmesg
> will be welcome. So will an 'ethtool -d eth0' if the device stops working.
>
> You did not label the problem as a serious one. Does it mean that the
> driver automatically recovers?

Same timeout, at 6h20m uptime (dmesg attached). System is 3.4.0 +
some completely unrelated local patches (to serial port PPS drivers) +
your two patches.

System recovers, is operating normally.

Thank you for your efforts!


Initializing cgroup subsys cpu
Linux version 3.4.0-00020-g3579858 ($USER@$HOST) (gcc version 4.7.0 (Debian 4.7.0-9) ) #156 SMP Wed May 23 22:24:39 EDT 2012
Command line: auto BOOT_IMAGE=Amd64 ro root=905 libata.fua=1 acpi_enforce_resources=lax k10temp.force=1
BIOS-provided physical RAM map:
 BIOS-e820: 0000000000000000 - 0000000000093800 (usable)
 BIOS-e820: 0000000000093800 - 00000000000a0000 (reserved)
 BIOS-e820: 00000000000e4000 - 0000000000100000 (reserved)
 BIOS-e820: 0000000000100000 - 00000000dffa0000 (usable)
 BIOS-e820: 00000000dffc0000 - 00000000dffce000 (ACPI data)
 BIOS-e820: 00000000dffce000 - 00000000dfff0000 (ACPI NVS)
 BIOS-e820: 00000000dfff0000 - 00000000dfffe000 (reserved)
 BIOS-e820: 00000000fff00000 - 0000000100000000 (reserved)
 BIOS-e820: 0000000100000000 - 0000000220000000 (usable)
NX (Execute Disable) protection: active
DMI present.
DMI: MICRO-STAR INTERNATIONAL CO.,LTD MS-7376/MS-7376, BIOS V1.7 01/13/2009
e820 update range: 0000000000000000 - 0000000000010000 (usable) ==> (reserved)
e820 remove range: 00000000000a0000 - 0000000000100000 (usable)
No AGP bridge found
last_pfn = 0x220000 max_arch_pfn = 0x400000000
MTRR default type: uncachable
MTRR fixed ranges enabled:
  00000-9FFFF write-back
  A0000-EFFFF uncachable
  F0000-FFFFF write-protect
MTRR variable ranges enabled:
  0 base 000000000000 mask FFFF80000000 write-back
  1 base 000080000000 mask FFFFC0000000 write-back
  2 base 0000C0000000 mask FFFFE0000000 write-back
  3 disabled
  4 disabled
  5 disabled
  6 disabled
  7 disabled
TOM2: 0000000220000000 aka 8704M
x86 PAT enabled: cpu 0, old 0x7040600070406, new 0x7010600070106
e820 update range: 00000000e0000000 - 0000000100000000 (usable) ==> (reserved)
last_pfn = 0xdffa0 max_arch_pfn = 0x400000000
found SMP MP-table at [ffff8800000ff780] ff780
initial memory mapped : 0 - 20000000
Base memory trampoline at [ffff880000091000] 91000 size 8192
Using GB pages for direct mapping
init_memory_mapping: 0000000000000000-00000000dffa0000
 0000000000 - 00c0000000 page 1G
 00c0000000 - 00dfe00000 page 2M
 00dfe00000 - 00dffa0000 page 4k
kernel direct mapping tables up to dffa0000 @ 1fffd000-20000000
init_memory_mapping: 0000000100000000-0000000220000000
 0100000000 - 0200000000 page 1G
 0200000000 - 0220000000 page 2M
kernel direct mapping tables up to 220000000 @ dff9e000-dffa0000
ACPI: RSDP 00000000000f9e30 00014 (v00 ACPIAM)
ACPI: RSDT 00000000dffc0000 00038 (v01 011309 RSDT1044 20090113 MSFT 00000097)
ACPI: FACP 00000000dffc0200 00084 (v02 011309 FACP1044 20090113 MSFT 00000097)
ACPI: DSDT 00000000dffc0440 06E9B (v01  1ADNC 1ADNC001 00000001 INTL 20051117)
ACPI: FACS 00000000dffce000 00040
ACPI: APIC 00000000dffc0390 0006C (v01 011309 APIC1044 20090113 MSFT 00000097)
ACPI: MCFG 00000000dffc0400 0003C (v01 011309 OEMMCFG  20090113 MSFT 00000097)
ACPI: OEMB 00000000dffce040 00071 (v01 011309 OEMB1044 20090113 MSFT 00000097)
ACPI: HPET 00000000dffc72e0 00038 (v01 011309 OEMHPET  20090113 MSFT 00000097)
ACPI: Local APIC address 0xfee00000
 [ffffea0000000000-ffffea00087fffff] PMD -> [ffff880217600000-ffff88021f5fffff] on node 0
Zone PFN ranges:
  DMA      0x00000010 -> 0x00001000
  DMA32    0x00001000 -> 0x00100000
  Normal   0x00100000 -> 0x00220000
Movable zone start PFN for each node
Early memory PFN ranges
    0: 0x00000010 -> 0x00000093
    0: 0x00000100 -> 0x000dffa0
    0: 0x00100000 -> 0x00220000
On node 0 totalpages: 2096931
  DMA zone: 64 pages used for memmap
  DMA zone: 2 pages reserved
  DMA zone: 3905 pages, LIFO batch:0
  DMA32 zone: 16320 pages used for memmap
  DMA32 zone: 896992 pages, LIFO batch:31
  Normal zone: 18432 pages used for memmap
  Normal zone: 1161216 pages, LIFO batch:31
ACPI: PM-Timer IO Port: 0x808
ACPI: Local APIC address 0xfee00000
ACPI: LAPIC (acpi_id[0x01] lapic_id[0x00] enabled)
ACPI: LAPIC (acpi_id[0x02] lapic_id[0x01] enabled)
ACPI: LAPIC (acpi_id[0x03] lapic_id[0x02] enabled)
ACPI: LAPIC (acpi_id[0x04] lapic_id[0x03] enabled)
ACPI: IOAPIC (id[0x04] address[0xfec00000] gsi_base[0])
IOAPIC[0]: apic_id 4, version 33, address 0xfec00000, GSI 0-23
ACPI: INT_SRC_OVR (bus 0 bus_irq 0 global_irq 2 dfl dfl)
ACPI: INT_SRC_OVR (bus 0 bus_irq 9 global_irq 9 low level)
ACPI: IRQ0 used by override.
ACPI: IRQ2 used by override.
ACPI: IRQ9 used by override.
Using ACPI (MADT) for SMP configuration information
ACPI: HPET id: 0x8300 base: 0xfed00000
SMP: Allowing 4 CPUs, 0 hotplug CPUs
nr_irqs_gsi: 40
Allocating PCI resources starting at dfffe000 (gap: dfffe000:1ff02000)
setup_percpu: NR_CPUS:4 nr_cpumask_bits:4 nr_cpu_ids:4 nr_node_ids:1
PERCPU: Embedded 23 pages/cpu @ffff88021fc00000 s72704 r0 d21504 u524288
pcpu-alloc: s72704 r0 d21504 u524288 alloc=1*2097152
pcpu-alloc: [0] 0 1 2 3 
Built 1 zonelists in Zone order, mobility grouping on.  Total pages: 2062113
Kernel command line: auto BOOT_IMAGE=Amd64 ro root=905 libata.fua=1 acpi_enforce_resources=lax k10temp.force=1
PID hash table entries: 4096 (order: 3, 32768 bytes)
Dentry cache hash table entries: 1048576 (order: 11, 8388608 bytes)
Inode-cache hash table entries: 524288 (order: 10, 4194304 bytes)
Checking aperture...
No AGP bridge found
Node 0: aperture @ d4000000 size 32 MB
Aperture pointing to e820 RAM. Ignoring.
Your BIOS doesn't leave a aperture memory hole
Please enable the IOMMU option in the BIOS setup
This costs you 64 MB of RAM
Mapping aperture over 65536 KB of RAM @ d4000000
Memory: 8105124k/8912896k available (4126k kernel code, 525172k absent, 282600k reserved, 1952k data, 476k init)
SLUB: Genslabs=15, HWalign=64, Order=0-3, MinObjects=0, CPUs=4, Nodes=1
Hierarchical RCU implementation.
NR_IRQS:4352 nr_irqs:712 16
Console: colour VGA+ 80x50
console [tty0] enabled
hpet clockevent registered
Fast TSC calibration failed
TSC: Unable to calibrate against PIT
TSC: using HPET reference calibration
Detected 2500.164 MHz processor.
Calibrating delay loop (skipped), value calculated using timer frequency.. 5000.32 BogoMIPS (lpj=25001640)
pid_max: default: 32768 minimum: 301
Mount-cache hash table entries: 256
tseg: 0000000000
CPU: Physical Processor ID: 0
CPU: Processor Core ID: 0
mce: CPU supports 6 MCE banks
LVT offset 0 assigned for vector 0xf9
using AMD E400 aware idle routine
Freeing SMP alternatives: 12k freed
ACPI: Core revision 20120320
..TIMER: vector=0x30 apic1=0 pin1=2 apic2=-1 pin2=-1
CPU0: AMD Phenom(tm) 9850 Quad-Core Processor stepping 03
Performance Events: AMD PMU driver.
... version:                0
... bit width:              48
... generic registers:      4
... value mask:             0000ffffffffffff
... max period:             00007fffffffffff
... fixed-purpose events:   0
... event mask:             000000000000000f
MCE: In-kernel MCE decoding enabled.
NMI watchdog: enabled, takes one hw-pmu counter.
Booting Node   0, Processors  #1
NMI watchdog: enabled, takes one hw-pmu counter.
System has AMD C1E enabled
Switch to broadcast mode on CPU1
 #2
NMI watchdog: enabled, takes one hw-pmu counter.
Switch to broadcast mode on CPU2
 #3 Ok.
NMI watchdog: enabled, takes one hw-pmu counter.
Switch to broadcast mode on CPU3
Brought up 4 CPUs
Total of 4 processors activated (20001.31 BogoMIPS).
Switch to broadcast mode on CPU0
xor: automatically using best checksumming function: generic_sse
   generic_sse:  9968.800 MB/sec
xor: using function: generic_sse (9968.800 MB/sec)
NET: Registered protocol family 16
node 0 link 0: io port [1000, ffffff]
TOM: 00000000e0000000 aka 3584M
Fam 10h mmconf [mem 0xe0000000-0xefffffff]
node 0 link 0: mmio [e0000000, efffffff] ==> none
node 0 link 0: mmio [f0000000, ffffffff]
node 0 link 0: mmio [a0000, bffff]
node 0 link 0: mmio [e0000000, dfffffff] ==> none
TOM2: 0000000220000000 aka 8704M
bus: [00, 07] on node 0 link 0
bus: 00 index 0 [io  0x0000-0xffff]
bus: 00 index 1 [mem 0xf0000000-0xffffffff]
bus: 00 index 2 [mem 0x000a0000-0x000bffff]
bus: 00 index 3 [mem 0x220000000-0xfcffffffff]
ACPI: bus type pci registered
PCI: MMCONFIG for domain 0000 [bus 00-ff] at [mem 0xe0000000-0xefffffff] (base 0xe0000000)
PCI: not using MMCONFIG
PCI: Using configuration type 1 for base access
PCI: Using configuration type 1 for extended access
bio: create slab <bio-0> at 0
raid6: int64x1   2426 MB/s
raid6: int64x2   2127 MB/s
raid6: int64x4   2237 MB/s
raid6: int64x8   1474 MB/s
raid6: sse2x1    3546 MB/s
raid6: sse2x2    5429 MB/s
raid6: sse2x4    6551 MB/s
raid6: using algorithm sse2x4 (6551 MB/s)
ACPI: Added _OSI(Module Device)
ACPI: Added _OSI(Processor Device)
ACPI: Added _OSI(3.0 _SCP Extensions)
ACPI: Added _OSI(Processor Aggregator Device)
ACPI: EC: Detected MSI hardware, enabling workarounds.
ACPI: EC: Look up EC in DSDT
ACPI: Executed 3 blocks of module-level executable AML code
ACPI: Interpreter enabled
ACPI: (supports S0 S5)
ACPI: Using IOAPIC for interrupt routing
PCI: MMCONFIG for domain 0000 [bus 00-ff] at [mem 0xe0000000-0xefffffff] (base 0xe0000000)
PCI: MMCONFIG at [mem 0xe0000000-0xefffffff] reserved in ACPI motherboard resources
ACPI: No dock devices found.
PCI: Using host bridge windows from ACPI; if necessary, use "pci=nocrs" and report a bug
ACPI: PCI Root Bridge [PCI0] (domain 0000 [bus 00-ff])
pci_root PNP0A03:00: host bridge window [io  0x0000-0x0cf7]
pci_root PNP0A03:00: host bridge window [io  0x0d00-0xffff]
pci_root PNP0A03:00: host bridge window [mem 0x000a0000-0x000bffff]
pci_root PNP0A03:00: host bridge window [mem 0x000d0000-0x000dffff]
pci_root PNP0A03:00: host bridge window [mem 0xf0000000-0xfebfffff]
PCI host bridge to bus 0000:00
pci_bus 0000:00: root bus resource [io  0x0000-0x0cf7]
pci_bus 0000:00: root bus resource [io  0x0d00-0xffff]
pci_bus 0000:00: root bus resource [mem 0x000a0000-0x000bffff]
pci_bus 0000:00: root bus resource [mem 0x000d0000-0x000dffff]
pci_bus 0000:00: root bus resource [mem 0xf0000000-0xfebfffff]
pci 0000:00:00.0: [1002:5956] type 00 class 0x060000
pci 0000:00:00.0: reg 1c: [mem 0xe0000000-0xffffffff 64bit]
pci 0000:00:02.0: [1002:5978] type 01 class 0x060400
pci 0000:00:02.0: PME# supported from D0 D3hot D3cold
pci 0000:00:05.0: [1002:597b] type 01 class 0x060400
pci 0000:00:05.0: PME# supported from D0 D3hot D3cold
pci 0000:00:09.0: [1002:597e] type 01 class 0x060400
pci 0000:00:09.0: PME# supported from D0 D3hot D3cold
pci 0000:00:12.0: [1002:4380] type 00 class 0x010601
pci 0000:00:12.0: reg 10: [io  0x7000-0x7007]
pci 0000:00:12.0: reg 14: [io  0x6000-0x6003]
pci 0000:00:12.0: reg 18: [io  0x5000-0x5007]
pci 0000:00:12.0: reg 1c: [io  0x4000-0x4003]
pci 0000:00:12.0: reg 20: [io  0x3000-0x300f]
pci 0000:00:12.0: reg 24: [mem 0xfe5ff800-0xfe5ffbff]
pci 0000:00:13.0: [1002:4387] type 00 class 0x0c0310
pci 0000:00:13.0: reg 10: [mem 0xfe5fe000-0xfe5fefff]
pci 0000:00:13.1: [1002:4388] type 00 class 0x0c0310
pci 0000:00:13.1: reg 10: [mem 0xfe5fd000-0xfe5fdfff]
pci 0000:00:13.2: [1002:4389] type 00 class 0x0c0310
pci 0000:00:13.2: reg 10: [mem 0xfe5fc000-0xfe5fcfff]
pci 0000:00:13.3: [1002:438a] type 00 class 0x0c0310
pci 0000:00:13.3: reg 10: [mem 0xfe5fb000-0xfe5fbfff]
pci 0000:00:13.4: [1002:438b] type 00 class 0x0c0310
pci 0000:00:13.4: reg 10: [mem 0xfe5fa000-0xfe5fafff]
pci 0000:00:13.5: [1002:4386] type 00 class 0x0c0320
pci 0000:00:13.5: reg 10: [mem 0xfe5ff000-0xfe5ff0ff]
pci 0000:00:13.5: supports D1 D2
pci 0000:00:13.5: PME# supported from D0 D1 D2 D3hot
pci 0000:00:14.0: [1002:4385] type 00 class 0x0c0500
pci 0000:00:14.0: reg 10: [io  0x0b00-0x0b0f]
pci 0000:00:14.1: [1002:438c] type 00 class 0x01018a
pci 0000:00:14.1: reg 10: [io  0x0000-0x0007]
pci 0000:00:14.1: reg 14: [io  0x0000-0x0003]
pci 0000:00:14.1: reg 18: [io  0x0000-0x0007]
pci 0000:00:14.1: reg 1c: [io  0x0000-0x0003]
pci 0000:00:14.1: reg 20: [io  0xff00-0xff0f]
pci 0000:00:14.2: [1002:4383] type 00 class 0x040300
pci 0000:00:14.2: reg 10: [mem 0xfe5f4000-0xfe5f7fff 64bit]
pci 0000:00:14.2: PME# supported from D0 D3hot D3cold
pci 0000:00:14.3: [1002:438d] type 00 class 0x060100
pci 0000:00:14.4: [1002:4384] type 01 class 0x060401
pci 0000:00:18.0: [1022:1200] type 00 class 0x060000
pci 0000:00:18.1: [1022:1201] type 00 class 0x060000
pci 0000:00:18.2: [1022:1202] type 00 class 0x060000
pci 0000:00:18.3: [1022:1203] type 00 class 0x060000
pci 0000:00:18.4: [1022:1204] type 00 class 0x060000
pci 0000:01:00.0: [1002:5b60] type 00 class 0x030000
pci 0000:01:00.0: reg 10: [mem 0xfc000000-0xfdffffff 64bit pref]
pci 0000:01:00.0: reg 18: [mem 0xfe6f0000-0xfe6fffff 64bit]
pci 0000:01:00.0: reg 20: [io  0x8000-0x80ff]
pci 0000:01:00.0: reg 30: [mem 0xfe6c0000-0xfe6dffff pref]
pci 0000:01:00.0: supports D1 D2
pci 0000:01:00.1: [1002:5b70] type 00 class 0x038000
pci 0000:01:00.1: reg 10: [mem 0xfe6e0000-0xfe6effff 64bit]
pci 0000:01:00.1: supports D1 D2
pci 0000:00:02.0: PCI bridge to [bus 01-01]
pci 0000:00:02.0:   bridge window [io  0x8000-0x8fff]
pci 0000:00:02.0:   bridge window [mem 0xfe600000-0xfe6fffff]
pci 0000:00:02.0:   bridge window [mem 0xfc000000-0xfdffffff 64bit pref]
pci 0000:02:00.0: [10ec:8168] type 00 class 0x020000
pci 0000:02:00.0: reg 10: [io  0x9800-0x98ff]
pci 0000:02:00.0: reg 18: [mem 0xfe7ff000-0xfe7fffff 64bit]
pci 0000:02:00.0: reg 30: [mem 0xfe7c0000-0xfe7dffff pref]
pci 0000:02:00.0: supports D1 D2
pci 0000:02:00.0: PME# supported from D1 D2 D3hot D3cold
pci 0000:00:05.0: PCI bridge to [bus 02-02]
pci 0000:00:05.0:   bridge window [io  0x9000-0x9fff]
pci 0000:00:05.0:   bridge window [mem 0xfe700000-0xfe7fffff]
pci 0000:03:00.0: [105a:3f20] type 00 class 0x010400
pci 0000:03:00.0: reg 10: [io  0xa800-0xa87f]
pci 0000:03:00.0: reg 18: [io  0xa400-0xa4ff]
pci 0000:03:00.0: reg 1c: [mem 0xfe8ff000-0xfe8fffff]
pci 0000:03:00.0: reg 20: [mem 0xfe8c0000-0xfe8dffff]
pci 0000:03:00.0: reg 24: [mem 0xfe8fc000-0xfe8fdfff]
pci 0000:03:00.0: supports D1
pci 0000:00:09.0: PCI bridge to [bus 03-03]
pci 0000:00:09.0:   bridge window [io  0xa000-0xafff]
pci 0000:00:09.0:   bridge window [mem 0xfe800000-0xfe8fffff]
pci 0000:04:00.0: [1106:3044] type 00 class 0x0c0010
pci 0000:04:00.0: reg 10: [mem 0xfe9ff800-0xfe9fffff]
pci 0000:04:00.0: reg 14: [io  0xc800-0xc87f]
pci 0000:04:00.0: supports D2
pci 0000:04:00.0: PME# supported from D2 D3hot D3cold
pci 0000:04:02.0: [1415:9501] type 00 class 0x070006
pci 0000:04:02.0: reg 10: [io  0xc400-0xc41f]
pci 0000:04:02.0: reg 14: [mem 0xfe9fe000-0xfe9fefff]
pci 0000:04:02.0: reg 18: [io  0xc000-0xc01f]
pci 0000:04:02.0: reg 1c: [mem 0xfe9fd000-0xfe9fdfff]
pci 0000:04:02.0: supports D2
pci 0000:04:02.0: PME# supported from D0 D2 D3hot
pci 0000:04:02.1: [1415:9513] type 00 class 0x070101
pci 0000:04:02.1: reg 10: [io  0xb800-0xb807]
pci 0000:04:02.1: reg 14: [io  0xb400-0xb407]
pci 0000:04:02.1: reg 18: [io  0xb000-0xb01f]
pci 0000:04:02.1: reg 1c: [mem 0xfe9fc000-0xfe9fcfff]
pci 0000:04:02.1: supports D2
pci 0000:04:02.1: PME# supported from D0 D2 D3hot
pci 0000:04:03.0: [1011:0024] type 01 class 0x060400
pci 0000:00:14.4: PCI bridge to [bus 04-05] (subtractive decode)
pci 0000:00:14.4:   bridge window [io  0xb000-0xefff]
pci 0000:00:14.4:   bridge window [mem 0xfe900000-0xfebfffff]
pci 0000:00:14.4:   bridge window [io  0x0000-0x0cf7] (subtractive decode)
pci 0000:00:14.4:   bridge window [io  0x0d00-0xffff] (subtractive decode)
pci 0000:00:14.4:   bridge window [mem 0x000a0000-0x000bffff] (subtractive decode)
pci 0000:00:14.4:   bridge window [mem 0x000d0000-0x000dffff] (subtractive decode)
pci 0000:00:14.4:   bridge window [mem 0xf0000000-0xfebfffff] (subtractive decode)
pci 0000:05:04.0: [1011:0019] type 00 class 0x020000
pci 0000:05:04.0: reg 10: [io  0xe800-0xe87f]
pci 0000:05:04.0: reg 14: [mem 0xfebffc00-0xfebfffff]
pci 0000:05:04.0: reg 30: [mem 0xfeb80000-0xfebbffff pref]
pci 0000:05:05.0: [1011:0019] type 00 class 0x020000
pci 0000:05:05.0: reg 10: [io  0xe400-0xe47f]
pci 0000:05:05.0: reg 14: [mem 0xfebff800-0xfebffbff]
pci 0000:05:05.0: reg 30: [mem 0xfeb40000-0xfeb7ffff pref]
pci 0000:05:06.0: [1011:0019] type 00 class 0x020000
pci 0000:05:06.0: reg 10: [io  0xe000-0xe07f]
pci 0000:05:06.0: reg 14: [mem 0xfebff400-0xfebff7ff]
pci 0000:05:06.0: reg 30: [mem 0xfeb00000-0xfeb3ffff pref]
pci 0000:05:07.0: [1011:0019] type 00 class 0x020000
pci 0000:05:07.0: reg 10: [io  0xd800-0xd87f]
pci 0000:05:07.0: reg 14: [mem 0xfebff000-0xfebff3ff]
pci 0000:05:07.0: reg 30: [mem 0xfeac0000-0xfeafffff pref]
pci 0000:04:03.0: PCI bridge to [bus 05-05]
pci 0000:04:03.0:   bridge window [io  0xd000-0xefff]
pci 0000:04:03.0:   bridge window [mem 0xfea00000-0xfebfffff]
pci_bus 0000:00: on NUMA node 0
ACPI: PCI Interrupt Routing Table [\_SB_.PCI0._PRT]
ACPI: PCI Interrupt Routing Table [\_SB_.PCI0.PCE2._PRT]
ACPI: PCI Interrupt Routing Table [\_SB_.PCI0.PCE5._PRT]
ACPI: PCI Interrupt Routing Table [\_SB_.PCI0.PCE9._PRT]
ACPI: PCI Interrupt Routing Table [\_SB_.PCI0.P0PC._PRT]
 pci0000:00: Unable to request _OSC control (_OSC support mask: 0x19)
ACPI: PCI Interrupt Link [LNKA] (IRQs 3 4 5 7 10 11 12 14 *15)
ACPI: PCI Interrupt Link [LNKB] (IRQs 3 4 5 7 10 *11 12 14 15)
ACPI: PCI Interrupt Link [LNKC] (IRQs 3 4 5 7 *10 11 12 14 15)
ACPI: PCI Interrupt Link [LNKD] (IRQs 3 4 5 7 *10 11 12 14 15)
ACPI: PCI Interrupt Link [LNKE] (IRQs 3 4 5 *7 10 11 12 14 15)
ACPI: PCI Interrupt Link [LNKF] (IRQs *9)
ACPI: PCI Interrupt Link [LNKG] (IRQs 3 4 *5 7 10 11 12 14 15)
ACPI: PCI Interrupt Link [LNKH] (IRQs *3 4 5 7 10 11 12 14 15)
vgaarb: device added: PCI:0000:01:00.0,decodes=io+mem,owns=io+mem,locks=none
vgaarb: loaded
vgaarb: bridge control possible 0000:01:00.0
SCSI subsystem initialized
libata version 3.00 loaded.
usbcore: registered new interface driver usbfs
usbcore: registered new interface driver hub
usbcore: registered new device driver usb
pps_core: LinuxPPS API ver. 1 registered
pps_core: Software ver. 5.3.6 - Copyright 2005-2007 Rodolfo Giometti <giometti@linux.it>
PTP clock support registered
Advanced Linux Sound Architecture Driver Version 1.0.25.
PCI: Using ACPI for IRQ routing
PCI: pci_cache_line_size set to 64 bytes
pci 0000:00:00.0: no compatible bridge window for [mem 0xe0000000-0xffffffff 64bit]
reserve RAM buffer: 0000000000093800 - 000000000009ffff 
reserve RAM buffer: 00000000dffa0000 - 00000000dfffffff 
hpet0: at MMIO 0xfed00000, IRQs 2, 8, 0, 0
hpet0: 4 comparators, 32-bit 14.318180 MHz counter
Switching to clocksource hpet
pnp: PnP ACPI init
ACPI: bus type pnp registered
pnp 00:00: [bus 00-ff]
pnp 00:00: [io  0x0cf8-0x0cff]
pnp 00:00: [io  0x0000-0x0cf7 window]
pnp 00:00: [io  0x0d00-0xffff window]
pnp 00:00: [mem 0x000a0000-0x000bffff window]
pnp 00:00: [mem 0x000d0000-0x000dffff window]
pnp 00:00: [mem 0xe0000000-0xdfffffff window disabled]
pnp 00:00: [mem 0xf0000000-0xfebfffff window]
pnp 00:00: Plug and Play ACPI device, IDs PNP0a03 (active)
pnp 00:01: [dma 4]
pnp 00:01: [io  0x0000-0x000f]
pnp 00:01: [io  0x0081-0x0083]
pnp 00:01: [io  0x0087]
pnp 00:01: [io  0x0089-0x008b]
pnp 00:01: [io  0x008f]
pnp 00:01: [io  0x00c0-0x00df]
pnp 00:01: Plug and Play ACPI device, IDs PNP0200 (active)
pnp 00:02: [io  0x0070-0x0071]
pnp 00:02: [irq 8]
pnp 00:02: Plug and Play ACPI device, IDs PNP0b00 (active)
pnp 00:03: [io  0x0061]
pnp 00:03: Plug and Play ACPI device, IDs PNP0800 (active)
pnp 00:04: [io  0x00f0-0x00ff]
pnp 00:04: [irq 13]
pnp 00:04: Plug and Play ACPI device, IDs PNP0c04 (active)
pnp 00:05: [io  0x03f8-0x03ff]
pnp 00:05: [irq 4]
pnp 00:05: [dma 0 disabled]
pnp 00:05: Plug and Play ACPI device, IDs PNP0501 (active)
pnp 00:06: [io  0x03f0-0x03f5]
pnp 00:06: [io  0x03f7]
pnp 00:06: [irq 6]
pnp 00:06: [dma 2]
pnp 00:06: Plug and Play ACPI device, IDs PNP0700 (active)
pnp 00:07: [mem 0xfed00000-0xfed003ff]
pnp 00:07: Plug and Play ACPI device, IDs PNP0103 (active)
pnp 00:08: [mem 0xfec00000-0xfec00fff]
pnp 00:08: [mem 0xfee00000-0xfee00fff]
system 00:08: [mem 0xfec00000-0xfec00fff] could not be reserved
system 00:08: [mem 0xfee00000-0xfee00fff] has been reserved
system 00:08: Plug and Play ACPI device, IDs PNP0c02 (active)
pnp 00:09: [io  0x0010-0x001f]
pnp 00:09: [io  0x0022-0x003f]
pnp 00:09: [io  0x0062-0x0063]
pnp 00:09: [io  0x0065-0x006f]
pnp 00:09: [io  0x0072-0x007f]
pnp 00:09: [io  0x0080]
pnp 00:09: [io  0x0084-0x0086]
pnp 00:09: [io  0x0088]
pnp 00:09: [io  0x008c-0x008e]
pnp 00:09: [io  0x0090-0x009f]
pnp 00:09: [io  0x00a2-0x00bf]
pnp 00:09: [io  0x00b1]
pnp 00:09: [io  0x00e0-0x00ef]
pnp 00:09: [io  0x04d0-0x04d1]
pnp 00:09: [io  0x040b]
pnp 00:09: [io  0x04d6]
pnp 00:09: [io  0x0c00-0x0c01]
pnp 00:09: [io  0x0c14]
pnp 00:09: [io  0x0c50-0x0c51]
pnp 00:09: [io  0x0c52]
pnp 00:09: [io  0x0c6c]
pnp 00:09: [io  0x0c6f]
pnp 00:09: [io  0x0cd0-0x0cd1]
pnp 00:09: [io  0x0cd2-0x0cd3]
pnp 00:09: [io  0x0cd4-0x0cd5]
pnp 00:09: [io  0x0cd6-0x0cd7]
pnp 00:09: [io  0x0cd8-0x0cdf]
pnp 00:09: [io  0x0800-0x089f]
pnp 00:09: [io  0x0b10-0x0b1f]
pnp 00:09: [io  0x0000-0xffffffffffffffff disabled]
pnp 00:09: [io  0x0900-0x090f]
pnp 00:09: [io  0x0910-0x091f]
pnp 00:09: [io  0xfe00-0xfefe]
pnp 00:09: [mem 0xffb80000-0xffbfffff]
system 00:09: [io  0x04d0-0x04d1] has been reserved
system 00:09: [io  0x040b] has been reserved
system 00:09: [io  0x04d6] has been reserved
system 00:09: [io  0x0c00-0x0c01] has been reserved
system 00:09: [io  0x0c14] has been reserved
system 00:09: [io  0x0c50-0x0c51] has been reserved
system 00:09: [io  0x0c52] has been reserved
system 00:09: [io  0x0c6c] has been reserved
system 00:09: [io  0x0c6f] has been reserved
system 00:09: [io  0x0cd0-0x0cd1] has been reserved
system 00:09: [io  0x0cd2-0x0cd3] has been reserved
system 00:09: [io  0x0cd4-0x0cd5] has been reserved
system 00:09: [io  0x0cd6-0x0cd7] has been reserved
system 00:09: [io  0x0cd8-0x0cdf] has been reserved
system 00:09: [io  0x0800-0x089f] has been reserved
system 00:09: [io  0x0b10-0x0b1f] has been reserved
system 00:09: [io  0x0900-0x090f] has been reserved
system 00:09: [io  0x0910-0x091f] has been reserved
system 00:09: [io  0xfe00-0xfefe] has been reserved
system 00:09: [mem 0xffb80000-0xffbfffff] has been reserved
system 00:09: Plug and Play ACPI device, IDs PNP0c02 (active)
pnp 00:0a: [io  0x0060]
pnp 00:0a: [io  0x0064]
pnp 00:0a: [irq 1]
pnp 00:0a: Plug and Play ACPI device, IDs PNP0303 PNP030b (active)
pnp 00:0b: [irq 12]
pnp 00:0b: Plug and Play ACPI device, IDs PNP0f03 PNP0f13 (active)
pnp 00:0c: [io  0x0000-0xffffffffffffffff disabled]
pnp 00:0c: [io  0x0600-0x06df]
pnp 00:0c: [io  0x0ae0-0x0aef]
system 00:0c: [io  0x0600-0x06df] has been reserved
system 00:0c: [io  0x0ae0-0x0aef] has been reserved
system 00:0c: Plug and Play ACPI device, IDs PNP0c02 (active)
pnp 00:0d: [mem 0xe0000000-0xefffffff]
system 00:0d: [mem 0xe0000000-0xefffffff] has been reserved
system 00:0d: Plug and Play ACPI device, IDs PNP0c02 (active)
pnp 00:0e: [mem 0x00000000-0x0009ffff]
pnp 00:0e: [mem 0x000c0000-0x000cffff]
pnp 00:0e: [mem 0x000e0000-0x000fffff]
pnp 00:0e: [mem 0x00100000-0xdfffffff]
pnp 00:0e: [mem 0xfec00000-0xffffffff]
pnp 00:0e: disabling [mem 0x00000000-0x0009ffff] because it overlaps 0000:00:00.0 BAR 3 [mem 0x00000000-0x1fffffff 64bit]
pnp 00:0e: disabling [mem 0x000c0000-0x000cffff] because it overlaps 0000:00:00.0 BAR 3 [mem 0x00000000-0x1fffffff 64bit]
pnp 00:0e: disabling [mem 0x000e0000-0x000fffff] because it overlaps 0000:00:00.0 BAR 3 [mem 0x00000000-0x1fffffff 64bit]
pnp 00:0e: disabling [mem 0x00100000-0xdfffffff] because it overlaps 0000:00:00.0 BAR 3 [mem 0x00000000-0x1fffffff 64bit]
system 00:0e: [mem 0xfec00000-0xffffffff] could not be reserved
system 00:0e: Plug and Play ACPI device, IDs PNP0c01 (active)
pnp: PnP ACPI: found 15 devices
ACPI: ACPI bus type pnp unregistered
pci 0000:00:02.0: PCI bridge to [bus 01-01]
pci 0000:00:02.0:   bridge window [io  0x8000-0x8fff]
pci 0000:00:02.0:   bridge window [mem 0xfe600000-0xfe6fffff]
pci 0000:00:02.0:   bridge window [mem 0xfc000000-0xfdffffff 64bit pref]
pci 0000:00:05.0: PCI bridge to [bus 02-02]
pci 0000:00:05.0:   bridge window [io  0x9000-0x9fff]
pci 0000:00:05.0:   bridge window [mem 0xfe700000-0xfe7fffff]
pci 0000:00:09.0: PCI bridge to [bus 03-03]
pci 0000:00:09.0:   bridge window [io  0xa000-0xafff]
pci 0000:00:09.0:   bridge window [mem 0xfe800000-0xfe8fffff]
pci 0000:04:03.0: PCI bridge to [bus 05-05]
pci 0000:04:03.0:   bridge window [io  0xd000-0xefff]
pci 0000:04:03.0:   bridge window [mem 0xfea00000-0xfebfffff]
pci 0000:00:14.4: PCI bridge to [bus 04-05]
pci 0000:00:14.4:   bridge window [io  0xb000-0xefff]
pci 0000:00:14.4:   bridge window [mem 0xfe900000-0xfebfffff]
pci_bus 0000:00: resource 4 [io  0x0000-0x0cf7]
pci_bus 0000:00: resource 5 [io  0x0d00-0xffff]
pci_bus 0000:00: resource 6 [mem 0x000a0000-0x000bffff]
pci_bus 0000:00: resource 7 [mem 0x000d0000-0x000dffff]
pci_bus 0000:00: resource 8 [mem 0xf0000000-0xfebfffff]
pci_bus 0000:01: resource 0 [io  0x8000-0x8fff]
pci_bus 0000:01: resource 1 [mem 0xfe600000-0xfe6fffff]
pci_bus 0000:01: resource 2 [mem 0xfc000000-0xfdffffff 64bit pref]
pci_bus 0000:02: resource 0 [io  0x9000-0x9fff]
pci_bus 0000:02: resource 1 [mem 0xfe700000-0xfe7fffff]
pci_bus 0000:03: resource 0 [io  0xa000-0xafff]
pci_bus 0000:03: resource 1 [mem 0xfe800000-0xfe8fffff]
pci_bus 0000:04: resource 0 [io  0xb000-0xefff]
pci_bus 0000:04: resource 1 [mem 0xfe900000-0xfebfffff]
pci_bus 0000:04: resource 4 [io  0x0000-0x0cf7]
pci_bus 0000:04: resource 5 [io  0x0d00-0xffff]
pci_bus 0000:04: resource 6 [mem 0x000a0000-0x000bffff]
pci_bus 0000:04: resource 7 [mem 0x000d0000-0x000dffff]
pci_bus 0000:04: resource 8 [mem 0xf0000000-0xfebfffff]
pci_bus 0000:05: resource 0 [io  0xd000-0xefff]
pci_bus 0000:05: resource 1 [mem 0xfea00000-0xfebfffff]
NET: Registered protocol family 2
IP route cache hash table entries: 262144 (order: 9, 2097152 bytes)
TCP established hash table entries: 262144 (order: 10, 4194304 bytes)
TCP bind hash table entries: 65536 (order: 8, 1048576 bytes)
TCP: Hash tables configured (established 262144 bind 65536)
TCP: reno registered
UDP hash table entries: 4096 (order: 5, 131072 bytes)
UDP-Lite hash table entries: 4096 (order: 5, 131072 bytes)
NET: Registered protocol family 1
RPC: Registered named UNIX socket transport module.
RPC: Registered udp transport module.
RPC: Registered tcp transport module.
RPC: Registered tcp NFSv4.1 backchannel transport module.
pci 0000:01:00.0: Boot video device
PCI: CLS 64 bytes, default 64
PCI-DMA: Disabling AGP.
PCI-DMA: aperture base @ d4000000 size 65536 KB
PCI-DMA: using GART IOMMU.
PCI-DMA: Reserving 64MB of IOMMU area in the AGP aperture
LVT offset 1 assigned for vector 0x400
IBS: LVT offset 1 assigned
perf: AMD IBS detected (0x00000007)
Installing knfsd (copyright (C) 1996 okir@monad.swb.de).
msgmni has been set to 15959
alg: No test for stdrng (krng)
io scheduler noop registered
io scheduler deadline registered
io scheduler cfq registered (default)
ACPI: duty_cycle spans bit 4
ACPI: processor limited to max C-state 1
Serial: 8250/16550 driver, 5 ports, IRQ sharing enabled
serial8250: ttyS0 at I/O 0x3f8 (irq = 4) is a 16550A
00:05: ttyS0 at I/O 0x3f8 (irq = 4) is a 16550A
ttyS4: detected caps 00000700 should be 00000500
0000:04:02.0: ttyS4 at I/O 0xc400 (irq = 22) is a 16C950/954
ttyS1: detected caps 00000700 should be 00000500
0000:04:02.0: ttyS1 at I/O 0xc408 (irq = 22) is a 16C950/954
ttyS2: detected caps 00000700 should be 00000500
0000:04:02.0: ttyS2 at I/O 0xc410 (irq = 22) is a 16C950/954
ttyS3: detected caps 00000700 should be 00000500
0000:04:02.0: ttyS3 at I/O 0xc418 (irq = 22) is a 16C950/954
lp: driver loaded but no devices found
Linux agpgart interface v0.103
PCI parallel port detected: 1415:9513, I/O at 0xb800(0x0), IRQ 23
parport0: PC-style at 0xb800, irq 23 [PCSPP]
lp0: using parport0 (interrupt-driven).
Floppy drive(s): fd0 is 1.44M
FDC 0 is a post-1991 82077
loop: module loaded
Uniform Multi-Platform E-IDE driver
atiixp 0000:00:14.1: IDE controller (0x1002:0x438c rev 0x00)
atiixp 0000:00:14.1: not 100% native mode: will probe irqs later
    ide0: BM-DMA at 0xff00-0xff07
Probing IDE interface ide0...
Refined TSC clocksource calibration: 2500.174 MHz.
Switching to clocksource tsc
hda: _NEC DVD_RW ND-3540A, ATAPI CD/DVD-ROM drive
hda: host max PIO4 wanted PIO255(auto-tune) selected PIO4
hda: UDMA/33 mode selected
ide0 at 0x1f0-0x1f7,0x3f6 on irq 14
ide_generic: please use "probe_mask=0x3f" module parameter for probing all legacy ISA IDE ports
ide-gd driver 1.18
ide-cd driver 5.00
ide-cd: hda: ATAPI 48X DVD-ROM DVD-R CD-R/RW drive, 2048kB Cache
cdrom: Uniform CD-ROM driver Revision: 3.20
ahci 0000:00:12.0: version 3.0
ahci 0000:00:12.0: MSI K9A2 Platinum: enabling 64bit DMA
ahci 0000:00:12.0: AHCI 0001.0100 32 slots 4 ports 3 Gbps 0xf impl SATA mode
ahci 0000:00:12.0: flags: 64bit ncq sntf ilck pm led clo pmp pio slum part ccc 
scsi0 : ahci
scsi1 : ahci
scsi2 : ahci
scsi3 : ahci
ata1: SATA max UDMA/133 abar m1024@0xfe5ff800 port 0xfe5ff900 irq 22
ata2: SATA max UDMA/133 abar m1024@0xfe5ff800 port 0xfe5ff980 irq 22
ata3: SATA max UDMA/133 abar m1024@0xfe5ff800 port 0xfe5ffa00 irq 22
ata4: SATA max UDMA/133 abar m1024@0xfe5ff800 port 0xfe5ffa80 irq 22
ahci 0000:03:00.0: PDC42819 can only drive SATA devices with this driver
ahci 0000:03:00.0: AHCI 0001.0100 32 slots 4 ports 3 Gbps 0xf impl RAID mode
ahci 0000:03:00.0: flags: 64bit ncq sntf ilck pm led clo pmp pio slum part ccc 
scsi4 : ahci
scsi5 : ahci
scsi6 : ahci
scsi7 : ahci
ata5: SATA max UDMA/133 abar m8192@0xfe8fc000 port 0xfe8fc100 irq 17
ata6: SATA max UDMA/133 abar m8192@0xfe8fc000 port 0xfe8fc180 irq 17
ata7: SATA max UDMA/133 abar m8192@0xfe8fc000 port 0xfe8fc200 irq 17
ata8: SATA max UDMA/133 abar m8192@0xfe8fc000 port 0xfe8fc280 irq 17
Linux Tulip driver version 1.1.15-NAPI (Feb 27, 2007)
tulip0: EEPROM default media type Autosense
tulip0: Index #0 - Media MII (#11) described by a 21142 MII PHY (3) block
tulip0:  MII transceiver #1 config 3100 status 7869 advertising 01e1
net eth0: Digital DS21142/43 Tulip rev 65 at MMIO 0xfebffc00, 00:80:c8:b9:c1:d5, IRQ 21
tulip1: EEPROM default media type Autosense
tulip1: Index #0 - Media MII (#11) described by a 21142 MII PHY (3) block
tulip1:  MII transceiver #1 config 3100 status 7869 advertising 01e1
net eth1: Digital DS21142/43 Tulip rev 65 at MMIO 0xfebff800, 00:80:c8:b9:c1:d6, IRQ 22
tulip2: EEPROM default media type Autosense
tulip2: Index #0 - Media MII (#11) described by a 21142 MII PHY (3) block
tulip2:  MII transceiver #1 config 3100 status 7849 advertising 01e1
net eth2: Digital DS21142/43 Tulip rev 65 at MMIO 0xfebff400, 00:80:c8:b9:c1:d7, IRQ 23
tulip3: EEPROM default media type Autosense
tulip3: Index #0 - Media MII (#11) described by a 21142 MII PHY (3) block
tulip3:  MII transceiver #1 config 3100 status 7869 advertising 01e1
net eth3: Digital DS21142/43 Tulip rev 65 at MMIO 0xfebff000, 00:80:c8:b9:c1:d8, IRQ 20
r8169 Gigabit Ethernet driver 2.3LK-NAPI loaded
r8169 0000:02:00.0: irq 40 for MSI/MSI-X
r8169 0000:02:00.0: eth4: RTL8168b/8111b at 0xffffc90000020000, 00:21:85:16:51:7f, XID 18000000 IRQ 40
r8169 0000:02:00.0: eth4: jumbo features [frames: 4080 bytes, tx checksumming: ko]
ehci_hcd: USB 2.0 'Enhanced' Host Controller (EHCI) Driver
ehci_hcd 0000:00:13.5: EHCI Host Controller
ehci_hcd 0000:00:13.5: new USB bus registered, assigned bus number 1
ehci_hcd 0000:00:13.5: applying AMD SB600/SB700 USB freeze workaround
ehci_hcd 0000:00:13.5: debug port 1
ehci_hcd 0000:00:13.5: irq 19, io mem 0xfe5ff000
ehci_hcd 0000:00:13.5: USB 2.0 started, EHCI 1.00
hub 1-0:1.0: USB hub found
hub 1-0:1.0: 10 ports detected
ohci_hcd: USB 1.1 'Open' Host Controller (OHCI) Driver
ohci_hcd 0000:00:13.0: OHCI Host Controller
ohci_hcd 0000:00:13.0: new USB bus registered, assigned bus number 2
ohci_hcd 0000:00:13.0: irq 16, io mem 0xfe5fe000
hub 2-0:1.0: USB hub found
hub 2-0:1.0: 2 ports detected
ohci_hcd 0000:00:13.1: OHCI Host Controller
ohci_hcd 0000:00:13.1: new USB bus registered, assigned bus number 3
ohci_hcd 0000:00:13.1: irq 17, io mem 0xfe5fd000
hub 3-0:1.0: USB hub found
hub 3-0:1.0: 2 ports detected
ohci_hcd 0000:00:13.2: OHCI Host Controller
ohci_hcd 0000:00:13.2: new USB bus registered, assigned bus number 4
ohci_hcd 0000:00:13.2: irq 18, io mem 0xfe5fc000
hub 4-0:1.0: USB hub found
hub 4-0:1.0: 2 ports detected
ohci_hcd 0000:00:13.3: OHCI Host Controller
ohci_hcd 0000:00:13.3: new USB bus registered, assigned bus number 5
ohci_hcd 0000:00:13.3: irq 17, io mem 0xfe5fb000
ata3: SATA link up 3.0 Gbps (SStatus 123 SControl 300)
ata4: SATA link up 3.0 Gbps (SStatus 123 SControl 300)
ata7: SATA link down (SStatus 0 SControl 300)
ata2: SATA link up 1.5 Gbps (SStatus 113 SControl 300)
ata5: SATA link up 3.0 Gbps (SStatus 123 SControl 300)
ata1: SATA link up 3.0 Gbps (SStatus 123 SControl 300)
ata8: SATA link down (SStatus 0 SControl 300)
ata6: SATA link up 1.5 Gbps (SStatus 113 SControl 300)
ata6.00: ATA-8: Hitachi HDS5C3020ALA632, ML6OA580, max UDMA/133
ata3.00: ATA-8: ST3750330AS, SD1A, max UDMA/133
ata3.00: 1465149168 sectors, multi 16: LBA48 NCQ (depth 31/32)
ata3.00: SB600 AHCI: limiting to 255 sectors per cmd
ata4.00: ATA-8: ST3750330AS, SD1A, max UDMA/133
ata4.00: 1465149168 sectors, multi 16: LBA48 NCQ (depth 31/32)
ata4.00: SB600 AHCI: limiting to 255 sectors per cmd
ata6.00: 3907029168 sectors, multi 16: LBA48 NCQ (depth 31/32), AA
ata2.00: ATA-8: ST3750330AS, SD1A, max UDMA/133
ata2.00: 1465149168 sectors, multi 16: LBA48 NCQ (depth 31/32)
ata2.00: SB600 AHCI: limiting to 255 sectors per cmd
ata1.00: ATA-8: ST3750330AS, SD1A, max UDMA/133
ata1.00: 1465149168 sectors, multi 16: LBA48 NCQ (depth 31/32)
ata1.00: SB600 AHCI: limiting to 255 sectors per cmd
ata5.00: ATA-8: ST3750330AS, SD1A, max UDMA/133
ata5.00: 1465149168 sectors, multi 16: LBA48 NCQ (depth 31/32)
ata6.00: configured for UDMA/133
ata4.00: SB600 AHCI: limiting to 255 sectors per cmd
ata4.00: configured for UDMA/133
ata1.00: SB600 AHCI: limiting to 255 sectors per cmd
ata1.00: configured for UDMA/133
ata3.00: SB600 AHCI: limiting to 255 sectors per cmd
ata3.00: configured for UDMA/133
ata2.00: SB600 AHCI: limiting to 255 sectors per cmd
scsi 0:0:0:0: Direct-Access     ATA      ST3750330AS      SD1A PQ: 0 ANSI: 5
ata2.00: configured for UDMA/133
ata5.00: configured for UDMA/133
sd 0:0:0:0: [sda] 1465149168 512-byte logical blocks: (750 GB/698 GiB)
sd 0:0:0:0: [sda] Write Protect is off
sd 0:0:0:0: [sda] Mode Sense: 00 3a 00 00
sd 0:0:0:0: [sda] Write cache: enabled, read cache: enabled, doesn't support DPO or FUA
scsi 1:0:0:0: Direct-Access     ATA      ST3750330AS      SD1A PQ: 0 ANSI: 5
sd 1:0:0:0: [sdb] 1465149168 512-byte logical blocks: (750 GB/698 GiB)
sd 1:0:0:0: [sdb] Write Protect is off
sd 1:0:0:0: [sdb] Mode Sense: 00 3a 00 00
scsi 2:0:0:0: Direct-Access     ATA      ST3750330AS      SD1A PQ: 0 ANSI: 5
sd 1:0:0:0: [sdb] Write cache: enabled, read cache: enabled, doesn't support DPO or FUA
sd 2:0:0:0: [sdc] 1465149168 512-byte logical blocks: (750 GB/698 GiB)
sd 2:0:0:0: [sdc] Write Protect is off
sd 2:0:0:0: [sdc] Mode Sense: 00 3a 00 00
scsi 3:0:0:0: Direct-Access     ATA      ST3750330AS      SD1A PQ: 0 ANSI: 5
sd 2:0:0:0: [sdc] Write cache: enabled, read cache: enabled, doesn't support DPO or FUA
sd 3:0:0:0: [sdd] 1465149168 512-byte logical blocks: (750 GB/698 GiB)
sd 3:0:0:0: [sdd] Write Protect is off
sd 3:0:0:0: [sdd] Mode Sense: 00 3a 00 00
sd 3:0:0:0: [sdd] Write cache: enabled, read cache: enabled, doesn't support DPO or FUA
scsi 4:0:0:0: Direct-Access     ATA      ST3750330AS      SD1A PQ: 0 ANSI: 5
sd 4:0:0:0: [sde] 1465149168 512-byte logical blocks: (750 GB/698 GiB)
sd 4:0:0:0: [sde] Write Protect is off
sd 4:0:0:0: [sde] Mode Sense: 00 3a 00 00
scsi 5:0:0:0: Direct-Access     ATA      Hitachi HDS5C302 ML6O PQ: 0 ANSI: 5
sd 4:0:0:0: [sde] Write cache: enabled, read cache: enabled, doesn't support DPO or FUA
sd 5:0:0:0: [sdf] 3907029168 512-byte logical blocks: (2.00 TB/1.81 TiB)
sd 5:0:0:0: [sdf] Write Protect is off
sd 5:0:0:0: [sdf] Mode Sense: 00 3a 00 10
sd 5:0:0:0: [sdf] Write cache: enabled, read cache: enabled, supports DPO and FUA
 sdf: sdf1
sd 5:0:0:0: [sdf] Attached SCSI disk
 sdb: sdb1 sdb2 sdb3 sdb4
sd 1:0:0:0: [sdb] Attached SCSI disk
 sdc: sdc1 sdc2 sdc3 sdc4
hub 5-0:1.0: USB hub found
hub 5-0:1.0: 2 ports detected
ohci_hcd 0000:00:13.4: OHCI Host Controller
ohci_hcd 0000:00:13.4: new USB bus registered, assigned bus number 6
ohci_hcd 0000:00:13.4: irq 18, io mem 0xfe5fa000
sd 2:0:0:0: [sdc] Attached SCSI disk
 sdd: sdd1 sdd2 sdd3 sdd4
sd 3:0:0:0: [sdd] Attached SCSI disk
 sda: sda1 sda2 sda3 sda4
 sde: sde1 sde2 sde3 sde4
sd 0:0:0:0: [sda] Attached SCSI disk
sd 4:0:0:0: [sde] Attached SCSI disk
hub 6-0:1.0: USB hub found
hub 6-0:1.0: 2 ports detected
Initializing USB Mass Storage driver...
usbcore: registered new interface driver usb-storage
USB Mass Storage support registered.
usbcore: registered new interface driver usbserial
usbserial: USB Serial Driver core
usbcore: registered new interface driver pl2303
USB Serial support registered for pl2303
i8042: PNP: PS/2 Controller [PNP0303:PS2K,PNP0f03:PS2M] at 0x60,0x64 irq 1,12
serio: i8042 KBD port at 0x60,0x64 irq 1
serio: i8042 AUX port at 0x60,0x64 irq 12
mousedev: PS/2 mouse device common for all mice
rtc_cmos 00:02: RTC can wake from S4
rtc_cmos 00:02: rtc core: registered rtc_cmos as rtc0
rtc0: alarms up to one month, y3k, 114 bytes nvram, hpet irqs
i2c /dev entries driver
ACPI Warning: 0x0000000000000b00-0x0000000000000b07 SystemIO conflicts with Region \SOR1 1 (20120320/utaddress-251)
ACPI: This conflict may cause random problems and system instability
ACPI: If an ACPI driver is available for this device, you should use it instead of the native driver
piix4_smbus 0000:00:14.0: SMBus Host Controller at 0xb00, revision 0
input: AT Translated Set 2 keyboard as /devices/platform/i8042/serio0/input/input0
pps_ldisc: PPS line discipline registered
f71882fg: Found f71882fg chip at 0x600, revision 32
ACPI Warning: 0x0000000000000600-0x0000000000000607 SystemIO conflicts with Region \HMOR 1 (20120320/utaddress-251)
ACPI: This conflict may cause random problems and system instability
ACPI: If an ACPI driver is available for this device, you should use it instead of the native driver
f71882fg f71882fg.1536: Fan: 1 is in duty-cycle mode
f71882fg f71882fg.1536: Fan: 2 is in duty-cycle mode
f71882fg f71882fg.1536: Fan: 3 is in duty-cycle mode
f71882fg f71882fg.1536: Fan: 4 is in duty-cycle mode
k10temp 0000:00:18.3: unreliable CPU thermal sensor; check erratum 319
md: raid0 personality registered for level 0
md: raid1 personality registered for level 1
md: raid10 personality registered for level 10
md: raid6 personality registered for level 6
md: raid5 personality registered for level 5
md: raid4 personality registered for level 4
EDAC MC: Ver: 2.1.0
AMD64 EDAC driver v3.4.0
EDAC amd64: DRAM ECC disabled.
EDAC amd64: ECC disabled in the BIOS or no ECC capability, module will not load.
 Either enable ECC checking or force module loading by setting 'ecc_enable_override'.
 (Note that use of the override may cause unknown side effects.)
cpuidle: using governor ladder
cpuidle: using governor menu
usbcore: registered new interface driver usbhid
usbhid: USB HID core driver
GACT probability on
netem: version 1.3
u32 classifier
    Actions configured
Netfilter messages via NETLINK v0.30.
nf_conntrack version 0.5.0 (16384 buckets, 65536 max)
ctnetlink v0.93: registering with nfnetlink.
NF_TPROXY: Transparent proxy support initialized, version 4.1.0
NF_TPROXY: Copyright (c) 2006-2007 BalaBit IT Ltd.
ip_set: protocol 6
IPv4 over IPv4 tunneling driver
ip_tables: (C) 2000-2006 Netfilter Core Team
TCP: bic registered
TCP: cubic registered
TCP: westwood registered
TCP: highspeed registered
TCP: hybla registered
TCP: htcp registered
TCP: vegas registered
TCP: veno registered
TCP: scalable registered
TCP: lp registered
TCP: yeah registered
TCP: illinois registered
Initializing XFRM netlink socket
NET: Registered protocol family 10
ip6_tables: (C) 2000-2006 Netfilter Core Team
IPv6 over IPv4 tunneling driver
NET: Registered protocol family 17
NET: Registered protocol family 15
Bridge firewalling registered
8021q: 802.1Q VLAN Support v1.8
registered taskstats version 1
rtc_cmos 00:02: setting system clock to 2012-05-24 02:27:33 UTC (1337826453)
powernow-k8: Found 1 AMD Phenom(tm) 9850 Quad-Core Processor (4 cpu cores) (version 2.20.00)
[Firmware Bug]: powernow-k8: No compatible ACPI _PSS objects found.
[Firmware Bug]: powernow-k8: Try again with latest BIOS.
ALSA device list:
  #0: HDA ATI SB at 0xfe5f4000 irq 16
usb 6-2: new low-speed USB device number 2 using ohci_hcd
input: HID 0430:0100 as /devices/pci0000:00/0000:00:13.4/usb6/6-2/6-2:1.0/input/input1
generic-usb 0003:0430:0100.0001: input: USB HID v1.00 Mouse [HID 0430:0100] on usb-0000:00:13.4-2/input0
input: PS/2 Generic Mouse as /devices/platform/i8042/serio1/input/input2
md: Waiting for all devices to be available before autodetect
md: If you don't use raid, use raid=noautodetect
md: Autodetecting RAID arrays.
md: Scanned 20 and added 20 devices.
md: autorun ...
md: considering sde4 ...
md:  adding sde4 ...
md: sde3 has different UUID to sde4
md: sde2 has different UUID to sde4
md: sde1 has different UUID to sde4
md:  adding sda4 ...
md: sda3 has different UUID to sde4
md: sda2 has different UUID to sde4
md: sda1 has different UUID to sde4
md:  adding sdd4 ...
md: sdd3 has different UUID to sde4
md: sdd2 has different UUID to sde4
md: sdd1 has different UUID to sde4
md:  adding sdc4 ...
md: sdc3 has different UUID to sde4
md: sdc2 has different UUID to sde4
md: sdc1 has different UUID to sde4
md:  adding sdb4 ...
md: sdb3 has different UUID to sde4
md: sdb2 has different UUID to sde4
md: sdb1 has different UUID to sde4
md: created md6
md: bind<sdb4>
md: bind<sdc4>
md: bind<sdd4>
md: bind<sda4>
md: bind<sde4>
md: running: <sde4><sda4><sdd4><sdc4><sdb4>
bio: create slab <bio-1> at 1
md/raid:md6: device sde4 operational as raid disk 4
md/raid:md6: device sda4 operational as raid disk 0
md/raid:md6: device sdd4 operational as raid disk 3
md/raid:md6: device sdc4 operational as raid disk 2
md/raid:md6: device sdb4 operational as raid disk 1
md/raid:md6: allocated 5350kB
md/raid:md6: raid level 5 active with 5 out of 5 devices, algorithm 2
RAID conf printout:
 --- level:5 rd:5 wd:5
 disk 0, o:1, dev:sda4
 disk 1, o:1, dev:sdb4
 disk 2, o:1, dev:sdc4
 disk 3, o:1, dev:sdd4
 disk 4, o:1, dev:sde4
created bitmap (173 pages) for device md6
md6: bitmap initialized from disk: read 11/11 pages, set 0 of 354293 bits
md6: detected capacity change from 0 to 1486011498496
md: considering sde3 ...
md:  adding sde3 ...
md: sde2 has different UUID to sde3
md: sde1 has different UUID to sde3
md:  adding sda3 ...
md: sda2 has different UUID to sde3
md: sda1 has different UUID to sde3
md: sdd3 has different UUID to sde3
md: sdd2 has different UUID to sde3
md: sdd1 has different UUID to sde3
md: sdc3 has different UUID to sde3
md: sdc2 has different UUID to sde3
md: sdc1 has different UUID to sde3
md:  adding sdb3 ...
md: sdb2 has different UUID to sde3
md: sdb1 has different UUID to sde3
md: created md7
md: bind<sdb3>
md: bind<sda3>
md: bind<sde3>
md: running: <sde3><sda3><sdb3>
md/raid1:md7: active with 2 out of 2 mirrors
md7: detected capacity change from 0 to 6144196608
md: considering sde2 ...
RAID1 conf printout:
 --- wd:2 rd:2
 disk 0, wo:0, o:1, dev:sda3
 disk 1, wo:0, o:1, dev:sdb3
md:  adding sde2 ...
md: sde1 has different UUID to sde2
md:  adding sda2 ...
md: sda1 has different UUID to sde2
md: sdd3 has different UUID to sde2
md:  adding sdd2 ...
md: sdd1 has different UUID to sde2
md: sdc3 has different UUID to sde2
md:  adding sdc2 ...
md: sdc1 has different UUID to sde2
md:  adding sdb2 ...
md: sdb1 has different UUID to sde2
md: created md5
md: bind<sdb2>
md: bind<sdc2>
md: bind<sdd2>
md: bind<sda2>
md: bind<sde2>
md: running: <sde2><sda2><sdd2><sdc2><sdb2>
md/raid10:md5: active with 4 out of 4 devices
created bitmap (173 pages) for device md5
md5: bitmap initialized from disk: read 11/11 pages, set 0 of 354293 bits
md5: detected capacity change from 0 to 743005749248
md: considering sde1 ...
RAID10 conf printout:
 --- wd:4 rd:4
 disk 0, wo:0, o:1, dev:sda2
 disk 1, wo:0, o:1, dev:sdb2
 disk 2, wo:0, o:1, dev:sdc2
 disk 3, wo:0, o:1, dev:sdd2
md:  adding sde1 ...
md:  adding sda1 ...
md: sdd3 has different UUID to sde1
md:  adding sdd1 ...
md: sdc3 has different UUID to sde1
md:  adding sdc1 ...
md:  adding sdb1 ...
md: created md0
md: bind<sdb1>
md: bind<sdc1>
md: bind<sdd1>
md: bind<sda1>
md: bind<sde1>
md: running: <sde1><sda1><sdd1><sdc1><sdb1>
md/raid1:md0: active with 5 out of 5 mirrors
md0: detected capacity change from 0 to 1003356160
md: considering sdd3 ...
md:  adding sdd3 ...
md:  adding sdc3 ...
md: created md8
md: bind<sdc3>
md: bind<sdd3>
md: running: <sdd3><sdc3>
md/raid1:md8: active with 2 out of 2 mirrors
md8: detected capacity change from 0 to 6144196608
md: ... autorun DONE.
 md5: unknown partition table
kjournald starting.  Commit interval 5 seconds
EXT3-fs (md5): mounted filesystem with ordered data mode
VFS: Mounted root (ext3 filesystem) readonly on device 9:5.
Freeing unused kernel memory: 476k freed
mdadm: sending ioctl 1261 to a partition!
mdadm: sending ioctl 1261 to a partition!
 md8: unknown partition table
mdadm: sending ioctl 1261 to a partition!
mdadm: sending ioctl 1261 to a partition!
mdadm: sending ioctl 1261 to a partition!
mdadm: sending ioctl 1261 to a partition!
mdadm: sending ioctl 1261 to a partition!
mdadm: sending ioctl 1261 to a partition!
 md6: unknown partition table
mdadm: sending ioctl 1261 to a partition!
mdadm: sending ioctl 1261 to a partition!
 md7: unknown partition table
 md0:
udevd[1132]: renamed network interface eth4 to inside
udevd[1151]: renamed network interface eth0 to cable
udevd[1141]: renamed network interface eth1 to dmz
udevd[1155]: renamed network interface eth2 to t1
udevd[1156]: renamed network interface eth3 to spare
hda: UDMA/33 mode selected
ata1.00: SB600 AHCI: limiting to 255 sectors per cmd
ata1.00: SB600 AHCI: limiting to 255 sectors per cmd
ata1.00: configured for UDMA/133
ata1: EH complete
ata1.00: SB600 AHCI: limiting to 255 sectors per cmd
ata2.00: SB600 AHCI: limiting to 255 sectors per cmd
ata1.00: SB600 AHCI: limiting to 255 sectors per cmd
ata1.00: configured for UDMA/133
ata1: EH complete
ata2.00: SB600 AHCI: limiting to 255 sectors per cmd
ata2.00: configured for UDMA/133
ata2: EH complete
ata3.00: SB600 AHCI: limiting to 255 sectors per cmd
ata1.00: SB600 AHCI: limiting to 255 sectors per cmd
ata2.00: SB600 AHCI: limiting to 255 sectors per cmd
ata3.00: SB600 AHCI: limiting to 255 sectors per cmd
ata3.00: configured for UDMA/133
ata3: EH complete
ata1.00: SB600 AHCI: limiting to 255 sectors per cmd
ata1.00: configured for UDMA/133
ata1: EH complete
ata2.00: SB600 AHCI: limiting to 255 sectors per cmd
ata2.00: configured for UDMA/133
ata2: EH complete
ata4.00: SB600 AHCI: limiting to 255 sectors per cmd
ata3.00: SB600 AHCI: limiting to 255 sectors per cmd
ata2.00: SB600 AHCI: limiting to 255 sectors per cmd
ata4.00: SB600 AHCI: limiting to 255 sectors per cmd
ata4.00: configured for UDMA/133
ata4: EH complete
ata3.00: SB600 AHCI: limiting to 255 sectors per cmd
ata3.00: configured for UDMA/133
ata3: EH complete
ata2.00: SB600 AHCI: limiting to 255 sectors per cmd
ata2.00: configured for UDMA/133
ata2: EH complete
ata4.00: SB600 AHCI: limiting to 255 sectors per cmd
ata3.00: SB600 AHCI: limiting to 255 sectors per cmd
ata4.00: SB600 AHCI: limiting to 255 sectors per cmd
ata4.00: configured for UDMA/133
ata4: EH complete
ata3.00: SB600 AHCI: limiting to 255 sectors per cmd
ata3.00: configured for UDMA/133
ata3: EH complete
ata4.00: SB600 AHCI: limiting to 255 sectors per cmd
ata1.00: SB600 AHCI: limiting to 255 sectors per cmd
ata1.00: SB600 AHCI: limiting to 255 sectors per cmd
ata1.00: configured for UDMA/133
ata1: EH complete
ata2.00: SB600 AHCI: limiting to 255 sectors per cmd
ata4.00: SB600 AHCI: limiting to 255 sectors per cmd
ata4.00: configured for UDMA/133
ata4: EH complete
ata2.00: SB600 AHCI: limiting to 255 sectors per cmd
ata2.00: configured for UDMA/133
ata2: EH complete
ata3.00: SB600 AHCI: limiting to 255 sectors per cmd
ata3.00: SB600 AHCI: limiting to 255 sectors per cmd
ata3.00: configured for UDMA/133
ata3: EH complete
ata4.00: SB600 AHCI: limiting to 255 sectors per cmd
ata1.00: SB600 AHCI: limiting to 255 sectors per cmd
ata1.00: SB600 AHCI: limiting to 255 sectors per cmd
ata1.00: configured for UDMA/133
ata1: EH complete
ata2.00: SB600 AHCI: limiting to 255 sectors per cmd
ata2.00: SB600 AHCI: limiting to 255 sectors per cmd
ata2.00: configured for UDMA/133
ata2: EH complete
ata4.00: SB600 AHCI: limiting to 255 sectors per cmd
ata4.00: configured for UDMA/133
ata4: EH complete
ata3.00: SB600 AHCI: limiting to 255 sectors per cmd
ata3.00: SB600 AHCI: limiting to 255 sectors per cmd
ata3.00: configured for UDMA/133
ata3: EH complete
ata4.00: SB600 AHCI: limiting to 255 sectors per cmd
ata4.00: SB600 AHCI: limiting to 255 sectors per cmd
ata4.00: configured for UDMA/133
ata4: EH complete
ata5.00: configured for UDMA/133
ata5: EH complete
ata1.00: SB600 AHCI: limiting to 255 sectors per cmd
ata1.00: SB600 AHCI: limiting to 255 sectors per cmd
ata1.00: configured for UDMA/133
ata1: EH complete
ata2.00: SB600 AHCI: limiting to 255 sectors per cmd
ata2.00: SB600 AHCI: limiting to 255 sectors per cmd
ata2.00: configured for UDMA/133
ata2: EH complete
ata3.00: SB600 AHCI: limiting to 255 sectors per cmd
ata3.00: SB600 AHCI: limiting to 255 sectors per cmd
ata3.00: configured for UDMA/133
ata3: EH complete
ata4.00: SB600 AHCI: limiting to 255 sectors per cmd
ata4.00: SB600 AHCI: limiting to 255 sectors per cmd
ata4.00: configured for UDMA/133
ata4: EH complete
ata5.00: configured for UDMA/133
ata5: EH complete
ata6.00: configured for UDMA/133
ata6: EH complete
hda: UDMA/33 mode selected
ata1.00: SB600 AHCI: limiting to 255 sectors per cmd
ata1.00: SB600 AHCI: limiting to 255 sectors per cmd
ata1.00: configured for UDMA/133
ata1: EH complete
ata2.00: SB600 AHCI: limiting to 255 sectors per cmd
ata2.00: SB600 AHCI: limiting to 255 sectors per cmd
ata2.00: configured for UDMA/133
ata2: EH complete
ata3.00: SB600 AHCI: limiting to 255 sectors per cmd
ata3.00: SB600 AHCI: limiting to 255 sectors per cmd
ata3.00: configured for UDMA/133
ata3: EH complete
ata4.00: SB600 AHCI: limiting to 255 sectors per cmd
ata4.00: SB600 AHCI: limiting to 255 sectors per cmd
ata4.00: configured for UDMA/133
ata4: EH complete
ata5.00: configured for UDMA/133
ata5: EH complete
ata6.00: configured for UDMA/133
ata6: EH complete
Adding 6000188k swap on /dev/md7.  Priority:1 extents:1 across:6000188k 
Adding 6000188k swap on /dev/md8.  Priority:1 extents:1 across:6000188k 
scsi_verify_blk_ioctl: 562 callbacks suppressed
mdadm: sending ioctl 1261 to a partition!
mdadm: sending ioctl 1261 to a partition!
EXT3-fs (md5): using internal journal
mdadm: sending ioctl 1261 to a partition!
mdadm: sending ioctl 1261 to a partition!
mdadm: sending ioctl 1261 to a partition!
mdadm: sending ioctl 1261 to a partition!
kjournald starting.  Commit interval 5 seconds
EXT3-fs (md6): using internal journal
EXT3-fs (md6): mounted filesystem with writeback data mode
EXT4-fs (sdf1): mounted filesystem with writeback data mode. Opts: data=writeback
r8169 0000:02:00.0: inside: link down
r8169 0000:02:00.0: inside: link down
NOHZ: local_softirq_pending 08
ADDRCONF(NETDEV_UP): inside: link is not ready
r8169 0000:02:00.0: inside: link up
ADDRCONF(NETDEV_CHANGE): inside: link becomes ready
net dmz: Setting full-duplex based on MII#1 link partner capability of 45e1
net cable: Setting full-duplex based on MII#1 link partner capability of 41e1
cable: no IPv6 routers present
dmz: no IPv6 routers present
inside: no IPv6 routers present
postgres (4955): /proc/4955/oom_adj is deprecated, please use /proc/4955/oom_score_adj instead.
pps pps0: new PPS source serial3 at ID 0
pps pps0: source "/dev/ttyS3" added
pps pps1: new PPS source serial4 at ID 1
pps pps1: source "/dev/ttyS4" added
device cable entered promiscuous mode
device dmz entered promiscuous mode
UDP: bad checksum. From yyy.yy.yyy.yy:1215 to xx.xxx.xx.xxx:62901 ulen 38
UDP: short packet: From zzz.zzz.zz.zz:1400 49320/143 to xx.xxx.xx.xxx:6881
------------[ cut here ]------------
WARNING: at net/sched/sch_generic.c:256 dev_watchdog+0xe9/0x15c()
Hardware name: MS-7376
NETDEV WATCHDOG: inside (r8169): transmit queue 0 timed out
Pid: 0, comm: swapper/3 Not tainted 3.4.0-00020-g3579858 #156
Call Trace:
 <IRQ>  [<ffffffff813122ba>] ? dev_watchdog+0xe9/0x15c
 [<ffffffff810249a1>] ? warn_slowpath_common+0x71/0x85
 [<ffffffff813121d1>] ? netif_tx_lock+0x7a/0x7a
 [<ffffffff81024a19>] ? warn_slowpath_fmt+0x45/0x4a
 [<ffffffff813121be>] ? netif_tx_lock+0x67/0x7a
 [<ffffffff813122ba>] ? dev_watchdog+0xe9/0x15c
 [<ffffffff8102ce10>] ? run_timer_softirq+0x17e/0x20b
 [<ffffffff81028d91>] ? __do_softirq+0x80/0x102
 [<ffffffff8140550c>] ? call_softirq+0x1c/0x30
 [<ffffffff81003300>] ? do_softirq+0x2c/0x60
 [<ffffffff81028fc4>] ? irq_exit+0x3a/0x91
 [<ffffffff8100314d>] ? do_IRQ+0x81/0x97
 [<ffffffff81403b27>] ? common_interrupt+0x67/0x67
 <EOI>  [<ffffffff81007c94>] ? default_idle+0x1e/0x32
 [<ffffffff81007db8>] ? amd_e400_idle+0xb7/0xd4
 [<ffffffff81008461>] ? cpu_idle+0x58/0x98
---[ end trace 14c35d45980e4004 ]---
r8169 0000:02:00.0: inside: link up

^ permalink raw reply

* [RFC PATCH] tcp: Fast/early SYN handling to mitigate SYN floods
From: Jesper Dangaard Brouer @ 2012-05-24 13:01 UTC (permalink / raw)
  To: Eric Dumazet, David Miller; +Cc: Martin Topholm, netdev

Hi Eric,

I have been doing some TCP performance measurements with SYN flooding,
and have found that we don't handle this case well.

I have made a patch for fast/early SYN handling in tcp_v4_rcv() in
net/ipv4/tcp_ipv4.c.  This increases SYN performance from 130 kpps to
750 kpps (max of the generator), with idle CPU cycles.

Current locking:
 During a SYN flood (against a single port) all CPUs are spinning on
the same spinlock, namely bh_lock_sock_nested(sk), in tcp_ipv4.c.  The
lock dates back to a commit by DaveM in May 1999, see historic
commit[1].  It seems that TCP runs fully locked, per sock.

I need some help with locking, as the patch seems to work fine, with
NO-PREEMPT, but with PREEMPT enabled I start to see warnings (in
reqsk_queue_destroy) and oopses (in inet_csk_reqsk_queue_prune).

What am I missing?

[1] Historic commit: http://git.kernel.org/?p=linux/kernel/git/davem/netdev-vger-cvs.git;a=commitdiff;h=5744fad55cefbd6f079410500a507443d92d63ff

-- 
Best regards,
  Jesper Dangaard Brouer
  MSc.CS, Sr. Network Kernel Developer at Red Hat
  Author of http://www.iptv-analyzer.org
  LinkedIn: http://www.linkedin.com/in/brouer


[RFC PATCH] tcp: Fast/early SYN handling to mitigate SYN floods

TCP SYN handling is on the slow path via tcp_v4_rcv(), and is
performed while holding spinlock bh_lock_sock().

Real-life and testlab experiments show that the kernel chokes
when reaching 130Kpps SYN floods (powerful Nehalem 16 cores).
Measuring with perf reveals that it's caused by the
bh_lock_sock_nested() call in tcp_v4_rcv().

With this patch, the machine can handle 750Kpps (max of the SYN
flood generator) with cycles to spare.

Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com>
---

 net/ipv4/tcp_ipv4.c |   16 ++++++++++++++++
 1 files changed, 16 insertions(+), 0 deletions(-)

diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 2e76ffb..7d7e8e0 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -1718,6 +1718,22 @@ int tcp_v4_rcv(struct sk_buff *skb)
 	if (!sk)
 		goto no_tcp_socket;
 
+	/* Fast/early SYN handling, to mitigate SYN attacks */
+	if (sk->sk_state == TCP_LISTEN && th->syn && !th->ack && !th->fin) {
+		//bh_lock_sock_nested(sk); /* Don't think lock is needed */
+		/* Handles syn cookie, normally called from
+		 * tcp_rcv_state_process() */
+		tcp_v4_conn_request(sk, skb);
+		//bh_unlock_sock(sk);
+
+		/* Questions, do we (really) need to create a new sk,
+		 * as in tcp_v4_hnd_req() ?
+		 */
+		sock_put(sk);
+		kfree_skb(skb);
+		return 0;
+	}
+
 process:
 	if (sk->sk_state == TCP_TIME_WAIT)
 		goto do_time_wait;

^ permalink raw reply related

* Re: [RFC PATCH] tcp: Fast/early SYN handling to mitigate SYN floods
From: Hans Schillstrom @ 2012-05-24 13:20 UTC (permalink / raw)
  To: Jesper Dangaard Brouer; +Cc: Eric Dumazet, David Miller, Martin Topholm, netdev
In-Reply-To: <1337864467.13491.15.camel@localhost>

Hi Jesper
We are also working with this issue right now,

On Thursday 24 May 2012 15:01:07 Jesper Dangaard Brouer wrote:
> Hi Eric,
> 
> I have been doing some TCP performance measurements with SYN flooding,
> and have found that, we don't handle this case well.
> 
> I have made a patch for fast/early SYN handling in tcp_v4_rcv() in
> net/ipv4/tcp_ipv4.c.  This increases SYN performance from 130 kpps to
> 750 kpps (max of the generator), with idle CPU cycles.
> 
> Current locking:
>  During a SYN flood (against a single port) all CPUs are spinning on
> the same spinlock, namely bh_lock_sock_nested(sk), in tcp_ipv4.c.  The
> lock dates back to a commit by DaveM in May 1999, see historic
> commit[1].  It seem that TCP runs fully locked, per sock.
> 
> I need some help with locking, as the patch seems to work fine, with
> NO-PREEMPT, but with PREEMPT enabled I start to see warnings (in
> reqsk_queue_destroy) and oopses (in inet_csk_reqsk_queue_prune).
> 
> What am I missing?
> 
> [1] Historic commit: http://git.kernel.org/?p=linux/kernel/git/davem/netdev-vger-cvs.git;a=commitdiff;h=5744fad55cefbd6f079410500a507443d92d63ff
> 
> -- 
> Best regards,
>   Jesper Dangaard Brouer
>   MSc.CS, Sr. Network Kernel Developer at Red Hat
>   Author of http://www.iptv-analyzer.org
>   LinkedIn: http://www.linkedin.com/in/brouer
> 
> 
> [RFC PATCH] tcp: Fast/early SYN handling to mitigate SYN floods
> 
> TCP SYN handling is on the slow path via tcp_v4_rcv(), and is
> performed while holding spinlock bh_lock_sock().
> 
> Real-life and testlab experiments show, that the kernel choks
> when reaching 130Kpps SYN floods (powerful Nehalem 16 cores).
> Measuring with perf reveals, that its caused by
> bh_lock_sock_nested() call in tcp_v4_rcv().

I can confirm this too, and it doesn't scale with more cores

> 
> With this patch, the machine can handle 750Kpps (max of the SYN
> flood generator) with cycles to spare.
This looks great.

I'm also working on a solution that does not trash conntrack,
i.e. one that keeps conntrack working during a heavy SYN attack.

-- 
Regards
Hans Schillstrom 

^ permalink raw reply

* Re: [RFC PATCH] tcp: Fast/early SYN handling to mitigate SYN floods
From: Christoph Paasch @ 2012-05-24 13:26 UTC (permalink / raw)
  To: Jesper Dangaard Brouer; +Cc: Eric Dumazet, David Miller, Martin Topholm, netdev
In-Reply-To: <1337864467.13491.15.camel@localhost>

Hello,

On 05/24/2012 03:01 PM, Jesper Dangaard Brouer wrote:
> I have been doing some TCP performance measurements with SYN flooding,
> and have found that, we don't handle this case well.
> 
> I have made a patch for fast/early SYN handling in tcp_v4_rcv() in
> net/ipv4/tcp_ipv4.c.  This increases SYN performance from 130 kpps to
> 750 kpps (max of the generator), with idle CPU cycles.
> 
> Current locking:
>  During a SYN flood (against a single port) all CPUs are spinning on
> the same spinlock, namely bh_lock_sock_nested(sk), in tcp_ipv4.c.  The
> lock dates back to a commit by DaveM in May 1999, see historic
> commit[1].  It seem that TCP runs fully locked, per sock.
> 
> I need some help with locking, as the patch seems to work fine, with
> NO-PREEMPT, but with PREEMPT enabled I start to see warnings (in
> reqsk_queue_destroy) and oopses (in inet_csk_reqsk_queue_prune).
> 
> What am I missing?

For each retransmission of a SYN you will add a request-sock to the
syn_table, because you do not pass through tcp_v4_hnd_req(), which checks
this by calling inet_csk_search_req().

And your warning in reqsk_queue_destroy is because the access to the
request_sock_queue is no longer protected by a lock.


The request_sock_queue is a shared resource, which must be protected by a
lock. As you allow "parallel" SYN-processing, the queue will get corrupted.


Cheers,
Christoph


-- 
Christoph Paasch
PhD Student

IP Networking Lab --- http://inl.info.ucl.ac.be
MultiPath TCP in the Linux Kernel --- http://mptcp.info.ucl.ac.be
Université Catholique de Louvain
-- 

^ permalink raw reply


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox