Netdev List
 help / color / mirror / Atom feed
* [PATCH 3/5] netfilter: xtables: inclusion of xt_TEE
From: Jan Engelhardt @ 2010-03-31 10:38 UTC (permalink / raw)
  To: kaber; +Cc: netfilter-devel, netdev
In-Reply-To: <1270031934-15940-1-git-send-email-jengelh@medozas.de>

xt_TEE can be used to clone and reroute a packet. This can for
example be used to copy traffic at a router for logging purposes
to another dedicated machine.

References: http://www.gossamer-threads.com/lists/iptables/devel/68781
Signed-off-by: Jan Engelhardt <jengelh@medozas.de>
---
 include/linux/netfilter/Kbuild   |    1 +
 include/linux/netfilter/xt_TEE.h |    8 +
 net/ipv4/ip_output.c             |    1 +
 net/ipv6/ip6_output.c            |    1 +
 net/netfilter/Kconfig            |    7 +
 net/netfilter/Makefile           |    1 +
 net/netfilter/xt_TEE.c           |  272 ++++++++++++++++++++++++++++++++++++++
 7 files changed, 291 insertions(+), 0 deletions(-)
 create mode 100644 include/linux/netfilter/xt_TEE.h
 create mode 100644 net/netfilter/xt_TEE.c

diff --git a/include/linux/netfilter/Kbuild b/include/linux/netfilter/Kbuild
index a5a63e4..48767cd 100644
--- a/include/linux/netfilter/Kbuild
+++ b/include/linux/netfilter/Kbuild
@@ -16,6 +16,7 @@ header-y += xt_RATEEST.h
 header-y += xt_SECMARK.h
 header-y += xt_TCPMSS.h
 header-y += xt_TCPOPTSTRIP.h
+header-y += xt_TEE.h
 header-y += xt_TPROXY.h
 header-y += xt_comment.h
 header-y += xt_connbytes.h
diff --git a/include/linux/netfilter/xt_TEE.h b/include/linux/netfilter/xt_TEE.h
new file mode 100644
index 0000000..83fa768
--- /dev/null
+++ b/include/linux/netfilter/xt_TEE.h
@@ -0,0 +1,8 @@
+#ifndef _XT_TEE_TARGET_H
+#define _XT_TEE_TARGET_H
+
+struct xt_tee_tginfo {
+	union nf_inet_addr gw;
+};
+
+#endif /* _XT_TEE_TARGET_H */
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index f09135e..0abfdde 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -309,6 +309,7 @@ int ip_output(struct sk_buff *skb)
 			    ip_finish_output,
 			    !(IPCB(skb)->flags & IPSKB_REROUTED));
 }
+EXPORT_SYMBOL_GPL(ip_output);
 
 int ip_queue_xmit(struct sk_buff *skb, int ipfragok)
 {
diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c
index 7e10f62..307d8bf 100644
--- a/net/ipv6/ip6_output.c
+++ b/net/ipv6/ip6_output.c
@@ -176,6 +176,7 @@ int ip6_output(struct sk_buff *skb)
 			    ip6_finish_output,
 			    !(IP6CB(skb)->flags & IPSKB_REROUTED));
 }
+EXPORT_SYMBOL_GPL(ip6_output);
 
 /*
  *	xmit an sk_buff (used by TCP)
diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig
index 8055786..673a6c8 100644
--- a/net/netfilter/Kconfig
+++ b/net/netfilter/Kconfig
@@ -502,6 +502,13 @@ config NETFILTER_XT_TARGET_RATEEST
 
 	  To compile it as a module, choose M here.  If unsure, say N.
 
+config NETFILTER_XT_TARGET_TEE
+	tristate '"TEE" - packet cloning to alternate destination'
+	depends on NETFILTER_ADVANCED
+	---help---
+	This option adds a "TEE" target with which a packet can be cloned and
+	this clone be rerouted to another nexthop.
+
 config NETFILTER_XT_TARGET_TPROXY
 	tristate '"TPROXY" target support (EXPERIMENTAL)'
 	depends on EXPERIMENTAL
diff --git a/net/netfilter/Makefile b/net/netfilter/Makefile
index cd31afe..14e3a8f 100644
--- a/net/netfilter/Makefile
+++ b/net/netfilter/Makefile
@@ -59,6 +59,7 @@ obj-$(CONFIG_NETFILTER_XT_TARGET_SECMARK) += xt_SECMARK.o
 obj-$(CONFIG_NETFILTER_XT_TARGET_TPROXY) += xt_TPROXY.o
 obj-$(CONFIG_NETFILTER_XT_TARGET_TCPMSS) += xt_TCPMSS.o
 obj-$(CONFIG_NETFILTER_XT_TARGET_TCPOPTSTRIP) += xt_TCPOPTSTRIP.o
+obj-$(CONFIG_NETFILTER_XT_TARGET_TEE) += xt_TEE.o
 obj-$(CONFIG_NETFILTER_XT_TARGET_TRACE) += xt_TRACE.o
 
 # matches
diff --git a/net/netfilter/xt_TEE.c b/net/netfilter/xt_TEE.c
new file mode 100644
index 0000000..96dd746
--- /dev/null
+++ b/net/netfilter/xt_TEE.c
@@ -0,0 +1,272 @@
+/*
+ *	"TEE" target extension for Xtables
+ *	Copyright © Sebastian Claßen <sebastian.classen [at] freenet de>, 2007
+ *	Jan Engelhardt <jengelh [at] medozas de>, 2007 - 2010
+ *
+ *	based on ipt_ROUTE.c from Cédric de Launois
+ *	<delaunois@info.ucl.be>
+ *
+ *	This program is free software; you can redistribute it and/or
+ *	modify it under the terms of the GNU General Public License
+ *	version 2 or later, as published by the Free Software Foundation.
+ */
+#include <linux/ip.h>
+#include <linux/module.h>
+#include <linux/route.h>
+#include <linux/skbuff.h>
+#include <net/checksum.h>
+#include <net/icmp.h>
+#include <net/ip.h>
+#include <net/ipv6.h>
+#include <net/ip6_route.h>
+#include <net/route.h>
+#include <linux/netfilter/x_tables.h>
+#include <linux/netfilter/xt_TEE.h>
+
+#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE)
+#	define WITH_CONNTRACK 1
+#	include <net/netfilter/nf_conntrack.h>
+static struct nf_conn tee_track;
+#endif
+#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
+#	define WITH_IPV6 1
+#endif
+
+static const union nf_inet_addr tee_zero_address;
+
+/*
+ * Try to route the packet according to the routing keys specified in
+ * route_info. Keys are :
+ *  - ifindex :
+ *      0 if no oif preferred,
+ *      otherwise set to the index of the desired oif
+ *  - route_info->gateway :
+ *      0 if no gateway specified,
+ *      otherwise set to the next host to which the pkt must be routed
+ * If success, skb->dev is the output device to which the packet must
+ * be sent and skb->dst is not NULL
+ *
+ * RETURN: false - if an error occurred
+ *         true  - if the packet was successfully routed to the
+ *                 destination desired
+ */
+static bool
+tee_tg_route4(struct sk_buff *skb, const struct xt_tee_tginfo *info)
+{
+	const struct iphdr *iph = ip_hdr(skb);
+	struct rtable *rt;
+	struct flowi fl;
+	int err;
+
+	memset(&fl, 0, sizeof(fl));
+	fl.iif  = skb->skb_iif;
+	fl.mark = skb->mark;
+	fl.nl_u.ip4_u.daddr = info->gw.ip;
+	fl.nl_u.ip4_u.tos   = RT_TOS(iph->tos);
+	fl.nl_u.ip4_u.scope = RT_SCOPE_UNIVERSE;
+
+	/* Trying to route the packet using the standard routing table. */
+	err = ip_route_output_key(dev_net(skb->dev), &rt, &fl);
+	if (err != 0)
+		return false;
+
+	dst_release(skb_dst(skb));
+	skb_dst_set(skb, &rt->u.dst);
+	skb->dev      = rt->u.dst.dev;
+	skb->protocol = htons(ETH_P_IP);
+	IPCB(skb)->flags |= IPSKB_REROUTED;
+	return true;
+}
+
+/*
+ * To detect and deter routed packet loopback when using the --tee option, we
+ * take a page out of the raw.patch book: on the copied skb, we set up a fake
+ * ->nfct entry, pointing to the local &tee_track. We skip routing
+ * packets when we see they already have that ->nfct.
+ */
+static unsigned int
+tee_tg4(struct sk_buff *skb, const struct xt_target_param *par)
+{
+	const struct xt_tee_tginfo *info = par->targinfo;
+	struct iphdr *iph;
+
+#ifdef WITH_CONNTRACK
+	if (skb->nfct == &tee_track.ct_general)
+		/*
+		 * Loopback - a packet we already routed, is to be
+		 * routed another time. Avoid that, now.
+		 */
+		return NF_DROP;
+#endif
+	/*
+	 * Copy the skb, and route the copy. Will later return %XT_CONTINUE for
+	 * the original skb, which should continue on its way as if nothing has
+	 * happened. The copy should be independently delivered to the TEE
+	 * --gateway.
+	 */
+	skb = skb_copy(skb, GFP_ATOMIC);
+	if (skb == NULL)
+		return XT_CONTINUE;
+	/*
+	 * If we are in PREROUTING/INPUT, the checksum must be recalculated
+	 * since the length could have changed as a result of defragmentation.
+	 *
+	 * We also decrease the TTL to mitigate potential TEE loops
+	 * between two hosts.
+	 *
+	 * Set %IP_DF so that the original source is notified of a potentially
+	 * decreased MTU on the clone route. IPv6 does this too.
+	 */
+	iph = ip_hdr(skb);
+	iph->frag_off |= htons(IP_DF);
+	if (par->hooknum == NF_INET_PRE_ROUTING ||
+	    par->hooknum == NF_INET_LOCAL_IN)
+		--iph->ttl;
+	ip_send_check(iph);
+
+#ifdef WITH_CONNTRACK
+	nf_conntrack_put(skb->nfct);
+	skb->nfct     = &tee_track.ct_general;
+	skb->nfctinfo = IP_CT_NEW;
+	nf_conntrack_get(skb->nfct);
+#endif
+	/*
+	 * Xtables is not reentrant currently, so a choice has to be made:
+	 * 1. return absolute verdict for the original and let the cloned
+	 *    packet travel through the chains
+	 * 2. let the original continue travelling and not pass the clone
+	 *    to Xtables.
+	 * #2 is chosen. Normally, we would use ip_local_out for the clone.
+	 * Because iph->check is already correct and we don't pass it to
+	 * Xtables anyway, a shortcut to dst_output [forwards to ip_output] can
+	 * be taken. %IPSKB_REROUTED needs to be set so that ip_output does not
+	 * invoke POSTROUTING on the cloned packet.
+	 */
+	IPCB(skb)->flags |= IPSKB_REROUTED;
+	if (tee_tg_route4(skb, info))
+		ip_output(skb);
+
+	return XT_CONTINUE;
+}
+
+#ifdef WITH_IPV6
+static bool
+tee_tg_route6(struct sk_buff *skb, const struct xt_tee_tginfo *info)
+{
+	const struct ipv6hdr *iph = ipv6_hdr(skb);
+	struct dst_entry *dst;
+	struct flowi fl;
+
+	memset(&fl, 0, sizeof(fl));
+	fl.iif  = skb->skb_iif;
+	fl.mark = skb->mark;
+	fl.nl_u.ip6_u.daddr = info->gw.in6;
+	fl.nl_u.ip6_u.flowlabel = ((iph->flow_lbl[0] & 0xF) << 16) |
+				  (iph->flow_lbl[1] << 8) | iph->flow_lbl[2];
+
+	dst = ip6_route_output(dev_net(skb->dev), NULL, &fl);
+	if (dst == NULL)
+		return false;
+
+	dst_release(skb_dst(skb));
+	skb_dst_set(skb, dst);
+	skb->dev      = dst->dev;
+	skb->protocol = htons(ETH_P_IPV6);
+	IP6CB(skb)->flags |= IPSKB_REROUTED;
+	return true;
+}
+
+static unsigned int
+tee_tg6(struct sk_buff *skb, const struct xt_target_param *par)
+{
+	const struct xt_tee_tginfo *info = par->targinfo;
+
+#ifdef WITH_CONNTRACK
+	if (skb->nfct == &tee_track.ct_general)
+		return NF_DROP;
+#endif
+	if ((skb = skb_copy(skb, GFP_ATOMIC)) == NULL)
+		return XT_CONTINUE;
+
+#ifdef WITH_CONNTRACK
+	nf_conntrack_put(skb->nfct);
+	skb->nfct     = &tee_track.ct_general;
+	skb->nfctinfo = IP_CT_NEW;
+	nf_conntrack_get(skb->nfct);
+#endif
+	if (par->hooknum == NF_INET_PRE_ROUTING ||
+	    par->hooknum == NF_INET_LOCAL_IN) {
+		struct ipv6hdr *iph = ipv6_hdr(skb);
+		--iph->hop_limit;
+	}
+	IP6CB(skb)->flags |= IPSKB_REROUTED;
+	if (tee_tg_route6(skb, info))
+		ip6_output(skb);
+
+	return XT_CONTINUE;
+}
+#endif /* WITH_IPV6 */
+
+static int tee_tg_check(const struct xt_tgchk_param *par)
+{
+	const struct xt_tee_tginfo *info = par->targinfo;
+
+	/* 0.0.0.0 and :: not allowed */
+	return (memcmp(&info->gw, &tee_zero_address,
+	       sizeof(tee_zero_address)) == 0) ? -EINVAL : 0;
+}
+
+static struct xt_target tee_tg_reg[] __read_mostly = {
+	{
+		.name       = "TEE",
+		.revision   = 0,
+		.family     = NFPROTO_IPV4,
+		.target     = tee_tg4,
+		.targetsize = sizeof(struct xt_tee_tginfo),
+		.checkentry = tee_tg_check,
+		.me         = THIS_MODULE,
+	},
+#ifdef WITH_IPV6
+	{
+		.name       = "TEE",
+		.revision   = 0,
+		.family     = NFPROTO_IPV6,
+		.target     = tee_tg6,
+		.targetsize = sizeof(struct xt_tee_tginfo),
+		.checkentry = tee_tg_check,
+		.me         = THIS_MODULE,
+	},
+#endif
+};
+
+static int __init tee_tg_init(void)
+{
+#ifdef WITH_CONNTRACK
+	/*
+	 * Set up fake conntrack (stolen from raw.patch):
+	 * - to never be deleted, not in any hashes
+	 */
+	atomic_set(&tee_track.ct_general.use, 1);
+
+	/* - and make it look like a confirmed connection */
+	set_bit(IPS_CONFIRMED_BIT, &tee_track.status);
+
+	/* Initialize fake conntrack so that NAT will skip it */
+	tee_track.status |= IPS_NAT_DONE_MASK;
+#endif
+	return xt_register_targets(tee_tg_reg, ARRAY_SIZE(tee_tg_reg));
+}
+
+static void __exit tee_tg_exit(void)
+{
+	xt_unregister_targets(tee_tg_reg, ARRAY_SIZE(tee_tg_reg));
+}
+
+module_init(tee_tg_init);
+module_exit(tee_tg_exit);
+MODULE_AUTHOR("Sebastian Claßen <sebastian.classen@freenet.ag>");
+MODULE_AUTHOR("Jan Engelhardt <jengelh@medozas.de>");
+MODULE_DESCRIPTION("Xtables: Reroute packet copy");
+MODULE_LICENSE("GPL");
+MODULE_ALIAS("ipt_TEE");
+MODULE_ALIAS("ip6t_TEE");
-- 
1.7.0.2


^ permalink raw reply related

* [PATCH 2/5] net: ipv6: add IPSKB_REROUTED exclusion to NF_HOOK/POSTROUTING invocation
From: Jan Engelhardt @ 2010-03-31 10:38 UTC (permalink / raw)
  To: kaber; +Cc: netfilter-devel, netdev
In-Reply-To: <1270031934-15940-1-git-send-email-jengelh@medozas.de>

Similar to how IPv4's ip_output.c works, have ip6_output also check
the IPSKB_REROUTED flag. It will be set from xt_TEE for cloned packets
since Xtables can currently only deal with a single packet in flight
at a time.

Signed-off-by: Jan Engelhardt <jengelh@medozas.de>
---
 net/ipv6/ip6_output.c |    5 +++--
 1 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c
index f314ba4..7e10f62 100644
--- a/net/ipv6/ip6_output.c
+++ b/net/ipv6/ip6_output.c
@@ -172,8 +172,9 @@ int ip6_output(struct sk_buff *skb)
 		return 0;
 	}
 
-	return NF_HOOK(NFPROTO_IPV6, NF_INET_POST_ROUTING, skb, NULL, dev,
-		       ip6_finish_output);
+	return NF_HOOK_COND(NFPROTO_IPV6, NF_INET_POST_ROUTING, skb, NULL, dev,
+			    ip6_finish_output,
+			    !(IP6CB(skb)->flags & IPSKB_REROUTED));
 }
 
 /*
-- 
1.7.0.2


^ permalink raw reply related

* [PATCH 5/5] netfilter: xt_TEE: have cloned packet travel through Xtables too
From: Jan Engelhardt @ 2010-03-31 10:38 UTC (permalink / raw)
  To: kaber; +Cc: netfilter-devel, netdev
In-Reply-To: <1270031934-15940-1-git-send-email-jengelh@medozas.de>

Since Xtables is now reentrant/nestable, the cloned packet can also go
through Xtables and be subject to rules itself.

Signed-off-by: Jan Engelhardt <jengelh@medozas.de>
---
 net/ipv4/ip_output.c   |    1 -
 net/ipv6/ip6_output.c  |    1 -
 net/netfilter/xt_TEE.c |   18 ++----------------
 3 files changed, 2 insertions(+), 18 deletions(-)

diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index 0abfdde..f09135e 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -309,7 +309,6 @@ int ip_output(struct sk_buff *skb)
 			    ip_finish_output,
 			    !(IPCB(skb)->flags & IPSKB_REROUTED));
 }
-EXPORT_SYMBOL_GPL(ip_output);
 
 int ip_queue_xmit(struct sk_buff *skb, int ipfragok)
 {
diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c
index 307d8bf..7e10f62 100644
--- a/net/ipv6/ip6_output.c
+++ b/net/ipv6/ip6_output.c
@@ -176,7 +176,6 @@ int ip6_output(struct sk_buff *skb)
 			    ip6_finish_output,
 			    !(IP6CB(skb)->flags & IPSKB_REROUTED));
 }
-EXPORT_SYMBOL_GPL(ip6_output);
 
 /*
  *	xmit an sk_buff (used by TCP)
diff --git a/net/netfilter/xt_TEE.c b/net/netfilter/xt_TEE.c
index 96dd746..70078f1 100644
--- a/net/netfilter/xt_TEE.c
+++ b/net/netfilter/xt_TEE.c
@@ -130,21 +130,8 @@ tee_tg4(struct sk_buff *skb, const struct xt_target_param *par)
 	skb->nfctinfo = IP_CT_NEW;
 	nf_conntrack_get(skb->nfct);
 #endif
-	/*
-	 * Xtables is not reentrant currently, so a choice has to be made:
-	 * 1. return absolute verdict for the original and let the cloned
-	 *    packet travel through the chains
-	 * 2. let the original continue travelling and not pass the clone
-	 *    to Xtables.
-	 * #2 is chosen. Normally, we would use ip_local_out for the clone.
-	 * Because iph->check is already correct and we don't pass it to
-	 * Xtables anyway, a shortcut to dst_output [forwards to ip_output] can
-	 * be taken. %IPSKB_REROUTED needs to be set so that ip_output does not
-	 * invoke POSTROUTING on the cloned packet.
-	 */
-	IPCB(skb)->flags |= IPSKB_REROUTED;
 	if (tee_tg_route4(skb, info))
-		ip_output(skb);
+		ip_local_out(skb);
 
 	return XT_CONTINUE;
 }
@@ -199,9 +186,8 @@ tee_tg6(struct sk_buff *skb, const struct xt_target_param *par)
 		struct ipv6hdr *iph = ipv6_hdr(skb);
 		--iph->hop_limit;
 	}
-	IP6CB(skb)->flags |= IPSKB_REROUTED;
 	if (tee_tg_route6(skb, info))
-		ip6_output(skb);
+		ip6_local_out(skb);
 
 	return XT_CONTINUE;
 }
-- 
1.7.0.2


^ permalink raw reply related

* [PATCH 4/5] netfilter: xtables2: make ip_tables reentrant
From: Jan Engelhardt @ 2010-03-31 10:38 UTC (permalink / raw)
  To: kaber; +Cc: netfilter-devel, netdev
In-Reply-To: <1270031934-15940-1-git-send-email-jengelh@medozas.de>

Currently, the table traverser stores return addresses in the ruleset
itself (struct ip6t_entry->comefrom). This has a well-known drawback:
the jumpstack is overwritten on reentry, making it necessary for
targets to return absolute verdicts. Also, the ruleset (which might
be heavy memory-wise) needs to be replicated for each CPU that can
possibly invoke ip6t_do_table.

This patch decouples the jumpstack from struct ip6t_entry and instead
puts it into xt_table_info. Not being restricted by 'comefrom'
anymore, we can set up a stack as needed. By default, there is room
allocated for two entries into the traverser. The setting is
configurable at runtime through sysfs and will take effect when a
table is replaced by a new one.

arp_tables is not touched though, because there is just one/two
modules and further patches seek to collapse the table traverser
anyhow.

Signed-off-by: Jan Engelhardt <jengelh@medozas.de>
---
 include/linux/netfilter/x_tables.h |    7 +++
 net/ipv4/netfilter/arp_tables.c    |    6 ++-
 net/ipv4/netfilter/ip_tables.c     |   65 ++++++++++++++++--------------
 net/ipv6/netfilter/ip6_tables.c    |   56 ++++++++++---------------
 net/netfilter/x_tables.c           |   79 ++++++++++++++++++++++++++++++++++++
 5 files changed, 147 insertions(+), 66 deletions(-)

diff --git a/include/linux/netfilter/x_tables.h b/include/linux/netfilter/x_tables.h
index 1a65d45..62cc5ca 100644
--- a/include/linux/netfilter/x_tables.h
+++ b/include/linux/netfilter/x_tables.h
@@ -401,6 +401,13 @@ struct xt_table_info {
 	unsigned int hook_entry[NF_INET_NUMHOOKS];
 	unsigned int underflow[NF_INET_NUMHOOKS];
 
+	/*
+	 * Number of user chains. Since tables cannot have loops, at most
+	 * @stacksize jumps (number of user chains) can possibly be made.
+	 */
+	unsigned int stacksize;
+	unsigned int *stackptr;
+	void ***jumpstack;
 	/* ipt_entry tables: one per CPU */
 	/* Note : this field MUST be the last one, see XT_TABLE_INFO_SZ */
 	void *entries[1];
diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c
index e8e363d..07a6990 100644
--- a/net/ipv4/netfilter/arp_tables.c
+++ b/net/ipv4/netfilter/arp_tables.c
@@ -649,6 +649,9 @@ static int translate_table(struct xt_table_info *newinfo, void *entry0,
 		if (ret != 0)
 			break;
 		++i;
+		if (strcmp(arpt_get_target(iter)->u.user.name,
+		    XT_ERROR_TARGET) == 0)
+			++newinfo->stacksize;
 	}
 	duprintf("translate_table: ARPT_ENTRY_ITERATE gives %d\n", ret);
 	if (ret != 0)
@@ -1774,8 +1777,7 @@ struct xt_table *arpt_register_table(struct net *net,
 {
 	int ret;
 	struct xt_table_info *newinfo;
-	struct xt_table_info bootstrap
-		= { 0, 0, 0, { 0 }, { 0 }, { } };
+	struct xt_table_info bootstrap = {0};
 	void *loc_cpu_entry;
 	struct xt_table *new_table;
 
diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c
index 18c5b15..70900ec 100644
--- a/net/ipv4/netfilter/ip_tables.c
+++ b/net/ipv4/netfilter/ip_tables.c
@@ -321,8 +321,6 @@ ipt_do_table(struct sk_buff *skb,
 	     const struct net_device *out,
 	     struct xt_table *table)
 {
-#define tb_comefrom ((struct ipt_entry *)table_base)->comefrom
-
 	static const char nulldevname[IFNAMSIZ] __attribute__((aligned(sizeof(long))));
 	const struct iphdr *ip;
 	bool hotdrop = false;
@@ -330,7 +328,8 @@ ipt_do_table(struct sk_buff *skb,
 	unsigned int verdict = NF_DROP;
 	const char *indev, *outdev;
 	const void *table_base;
-	struct ipt_entry *e, *back;
+	struct ipt_entry *e, **jumpstack;
+	unsigned int *stackptr, origptr, cpu;
 	const struct xt_table_info *private;
 	struct xt_match_param mtpar;
 	struct xt_target_param tgpar;
@@ -356,19 +355,23 @@ ipt_do_table(struct sk_buff *skb,
 	IP_NF_ASSERT(table->valid_hooks & (1 << hook));
 	xt_info_rdlock_bh();
 	private = table->private;
-	table_base = private->entries[smp_processor_id()];
+	cpu        = smp_processor_id();
+	table_base = private->entries[cpu];
+	jumpstack  = (struct ipt_entry **)private->jumpstack[cpu];
+	stackptr   = &private->stackptr[cpu];
+	origptr    = *stackptr;
 
 	e = get_entry(table_base, private->hook_entry[hook]);
 
-	/* For return from builtin chain */
-	back = get_entry(table_base, private->underflow[hook]);
+	pr_devel("Entering %s(hook %u); sp at %u (UF %p)\n",
+		 table->name, hook, origptr,
+		 get_entry(table_base, private->underflow[hook]));
 
 	do {
 		const struct ipt_entry_target *t;
 		const struct xt_entry_match *ematch;
 
 		IP_NF_ASSERT(e);
-		IP_NF_ASSERT(back);
 		if (!ip_packet_match(ip, indev, outdev,
 		    &e->ip, mtpar.fragoff)) {
  no_match:
@@ -403,17 +406,28 @@ ipt_do_table(struct sk_buff *skb,
 					verdict = (unsigned)(-v) - 1;
 					break;
 				}
-				e = back;
-				back = get_entry(table_base, back->comefrom);
+				if (*stackptr == 0) {
+					e = get_entry(table_base,
+					    private->underflow[hook]);
+					pr_devel("Underflow (this is normal) "
+						 "to %p\n", e);
+				} else {
+					e = jumpstack[--*stackptr];
+					pr_devel("Pulled %p out from pos %u\n",
+						 e, *stackptr);
+					e = ipt_next_entry(e);
+				}
 				continue;
 			}
 			if (table_base + v != ipt_next_entry(e) &&
 			    !(e->ip.flags & IPT_F_GOTO)) {
-				/* Save old back ptr in next entry */
-				struct ipt_entry *next = ipt_next_entry(e);
-				next->comefrom = (void *)back - table_base;
-				/* set back pointer to next entry */
-				back = next;
+				if (*stackptr >= private->stacksize) {
+					verdict = NF_DROP;
+					break;
+				}
+				jumpstack[(*stackptr)++] = e;
+				pr_devel("Pushed %p into pos %u\n",
+					 e, *stackptr - 1);
 			}
 
 			e = get_entry(table_base, v);
@@ -426,18 +440,7 @@ ipt_do_table(struct sk_buff *skb,
 		tgpar.targinfo = t->data;
 
 
-#ifdef CONFIG_NETFILTER_DEBUG
-		tb_comefrom = 0xeeeeeeec;
-#endif
 		verdict = t->u.kernel.target->target(skb, &tgpar);
-#ifdef CONFIG_NETFILTER_DEBUG
-		if (tb_comefrom != 0xeeeeeeec && verdict == IPT_CONTINUE) {
-			printk("Target %s reentered!\n",
-			       t->u.kernel.target->name);
-			verdict = NF_DROP;
-		}
-		tb_comefrom = 0x57acc001;
-#endif
 		/* Target might have changed stuff. */
 		ip = ip_hdr(skb);
 		if (verdict == IPT_CONTINUE)
@@ -447,7 +450,9 @@ ipt_do_table(struct sk_buff *skb,
 			break;
 	} while (!hotdrop);
 	xt_info_rdunlock_bh();
-
+	pr_devel("Exiting %s; resetting sp from %u to %u\n",
+		 __func__, *stackptr, origptr);
+	*stackptr = origptr;
 #ifdef DEBUG_ALLOW_ALL
 	return NF_ACCEPT;
 #else
@@ -455,8 +460,6 @@ ipt_do_table(struct sk_buff *skb,
 		return NF_DROP;
 	else return verdict;
 #endif
-
-#undef tb_comefrom
 }
 
 /* Figures out from what hook each rule can be called: returns 0 if
@@ -838,6 +841,9 @@ translate_table(struct net *net, struct xt_table_info *newinfo, void *entry0,
 		if (ret != 0)
 			return ret;
 		++i;
+		if (strcmp(ipt_get_target(iter)->u.user.name,
+		    XT_ERROR_TARGET) == 0)
+			++newinfo->stacksize;
 	}
 
 	if (i != repl->num_entries) {
@@ -2086,8 +2092,7 @@ struct xt_table *ipt_register_table(struct net *net,
 {
 	int ret;
 	struct xt_table_info *newinfo;
-	struct xt_table_info bootstrap
-		= { 0, 0, 0, { 0 }, { 0 }, { } };
+	struct xt_table_info bootstrap = {0};
 	void *loc_cpu_entry;
 	struct xt_table *new_table;
 
diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c
index f2b815e..2a2770b 100644
--- a/net/ipv6/netfilter/ip6_tables.c
+++ b/net/ipv6/netfilter/ip6_tables.c
@@ -351,15 +351,14 @@ ip6t_do_table(struct sk_buff *skb,
 	      const struct net_device *out,
 	      struct xt_table *table)
 {
-#define tb_comefrom ((struct ip6t_entry *)table_base)->comefrom
-
 	static const char nulldevname[IFNAMSIZ] __attribute__((aligned(sizeof(long))));
 	bool hotdrop = false;
 	/* Initializing verdict to NF_DROP keeps gcc happy. */
 	unsigned int verdict = NF_DROP;
 	const char *indev, *outdev;
 	const void *table_base;
-	struct ip6t_entry *e, *back;
+	struct ip6t_entry *e, **jumpstack;
+	unsigned int *stackptr, origptr, cpu;
 	const struct xt_table_info *private;
 	struct xt_match_param mtpar;
 	struct xt_target_param tgpar;
@@ -383,19 +382,19 @@ ip6t_do_table(struct sk_buff *skb,
 
 	xt_info_rdlock_bh();
 	private = table->private;
-	table_base = private->entries[smp_processor_id()];
+	cpu        = smp_processor_id();
+	table_base = private->entries[cpu];
+	jumpstack  = (struct ip6t_entry **)private->jumpstack[cpu];
+	stackptr   = &private->stackptr[cpu];
+	origptr    = *stackptr;
 
 	e = get_entry(table_base, private->hook_entry[hook]);
 
-	/* For return from builtin chain */
-	back = get_entry(table_base, private->underflow[hook]);
-
 	do {
 		const struct ip6t_entry_target *t;
 		const struct xt_entry_match *ematch;
 
 		IP_NF_ASSERT(e);
-		IP_NF_ASSERT(back);
 		if (!ip6_packet_match(skb, indev, outdev, &e->ipv6,
 		    &mtpar.thoff, &mtpar.fragoff, &hotdrop)) {
  no_match:
@@ -432,17 +431,20 @@ ip6t_do_table(struct sk_buff *skb,
 					verdict = (unsigned)(-v) - 1;
 					break;
 				}
-				e = back;
-				back = get_entry(table_base, back->comefrom);
+				if (*stackptr == 0)
+					e = get_entry(table_base,
+					    private->underflow[hook]);
+				else
+					e = ip6t_next_entry(jumpstack[--*stackptr]);
 				continue;
 			}
 			if (table_base + v != ip6t_next_entry(e) &&
 			    !(e->ipv6.flags & IP6T_F_GOTO)) {
-				/* Save old back ptr in next entry */
-				struct ip6t_entry *next = ip6t_next_entry(e);
-				next->comefrom = (void *)back - table_base;
-				/* set back pointer to next entry */
-				back = next;
+				if (*stackptr >= private->stacksize) {
+					verdict = NF_DROP;
+					break;
+				}
+				jumpstack[(*stackptr)++] = e;
 			}
 
 			e = get_entry(table_base, v);
@@ -454,19 +456,7 @@ ip6t_do_table(struct sk_buff *skb,
 		tgpar.target   = t->u.kernel.target;
 		tgpar.targinfo = t->data;
 
-#ifdef CONFIG_NETFILTER_DEBUG
-		tb_comefrom = 0xeeeeeeec;
-#endif
 		verdict = t->u.kernel.target->target(skb, &tgpar);
-
-#ifdef CONFIG_NETFILTER_DEBUG
-		if (tb_comefrom != 0xeeeeeeec && verdict == IP6T_CONTINUE) {
-			printk("Target %s reentered!\n",
-			       t->u.kernel.target->name);
-			verdict = NF_DROP;
-		}
-		tb_comefrom = 0x57acc001;
-#endif
 		if (verdict == IP6T_CONTINUE)
 			e = ip6t_next_entry(e);
 		else
@@ -474,10 +464,8 @@ ip6t_do_table(struct sk_buff *skb,
 			break;
 	} while (!hotdrop);
 
-#ifdef CONFIG_NETFILTER_DEBUG
-	tb_comefrom = NETFILTER_LINK_POISON;
-#endif
 	xt_info_rdunlock_bh();
+	*stackptr = origptr;
 
 #ifdef DEBUG_ALLOW_ALL
 	return NF_ACCEPT;
@@ -486,8 +474,6 @@ ip6t_do_table(struct sk_buff *skb,
 		return NF_DROP;
 	else return verdict;
 #endif
-
-#undef tb_comefrom
 }
 
 /* Figures out from what hook each rule can be called: returns 0 if
@@ -869,6 +855,9 @@ translate_table(struct net *net, struct xt_table_info *newinfo, void *entry0,
 		if (ret != 0)
 			return ret;
 		++i;
+		if (strcmp(ip6t_get_target(iter)->u.user.name,
+		    XT_ERROR_TARGET) == 0)
+			++newinfo->stacksize;
 	}
 
 	if (i != repl->num_entries) {
@@ -2120,8 +2109,7 @@ struct xt_table *ip6t_register_table(struct net *net,
 {
 	int ret;
 	struct xt_table_info *newinfo;
-	struct xt_table_info bootstrap
-		= { 0, 0, 0, { 0 }, { 0 }, { } };
+	struct xt_table_info bootstrap = {0};
 	void *loc_cpu_entry;
 	struct xt_table *new_table;
 
diff --git a/net/netfilter/x_tables.c b/net/netfilter/x_tables.c
index 8e23d8f..2010b56 100644
--- a/net/netfilter/x_tables.c
+++ b/net/netfilter/x_tables.c
@@ -62,6 +62,11 @@ static const char *const xt_prefix[NFPROTO_NUMPROTO] = {
 	[NFPROTO_IPV6]   = "ip6",
 };
 
+/* Allow this many total (re)entries. */
+static unsigned int xt_jumpstack_multiplier = 2;
+module_param_named(jumpstack_multiplier, xt_jumpstack_multiplier,
+	uint, S_IRUGO | S_IWUSR);
+
 /* Registration hooks for targets. */
 int
 xt_register_target(struct xt_target *target)
@@ -680,6 +685,26 @@ void xt_free_table_info(struct xt_table_info *info)
 		else
 			vfree(info->entries[cpu]);
 	}
+
+	if (info->jumpstack != NULL) {
+		if (sizeof(void *) * info->stacksize > PAGE_SIZE) {
+			for_each_possible_cpu(cpu)
+				vfree(info->jumpstack[cpu]);
+		} else {
+			for_each_possible_cpu(cpu)
+				kfree(info->jumpstack[cpu]);
+		}
+	}
+
+	if (sizeof(void **) * nr_cpu_ids > PAGE_SIZE)
+		vfree(info->jumpstack);
+	else
+		kfree(info->jumpstack);
+	if (sizeof(unsigned int) * nr_cpu_ids > PAGE_SIZE)
+		vfree(info->stackptr);
+	else
+		kfree(info->stackptr);
+
 	kfree(info);
 }
 EXPORT_SYMBOL(xt_free_table_info);
@@ -724,6 +749,49 @@ EXPORT_SYMBOL_GPL(xt_compat_unlock);
 DEFINE_PER_CPU(struct xt_info_lock, xt_info_locks);
 EXPORT_PER_CPU_SYMBOL_GPL(xt_info_locks);
 
+static int xt_jumpstack_alloc(struct xt_table_info *i)
+{
+	unsigned int size;
+	int cpu;
+
+	size = sizeof(unsigned int) * nr_cpu_ids;
+	if (size > PAGE_SIZE)
+		i->stackptr = vmalloc(size);
+	else
+		i->stackptr = kmalloc(size, GFP_KERNEL);
+	if (i->stackptr == NULL)
+		return -ENOMEM;
+	memset(i->stackptr, 0, size);
+
+	size = sizeof(void **) * nr_cpu_ids;
+	if (size > PAGE_SIZE)
+		i->jumpstack = vmalloc(size);
+	else
+		i->jumpstack = kmalloc(size, GFP_KERNEL);
+	if (i->jumpstack == NULL)
+		return -ENOMEM;
+	memset(i->jumpstack, 0, size);
+
+	i->stacksize *= xt_jumpstack_multiplier;
+	size = sizeof(void *) * i->stacksize;
+	for_each_possible_cpu(cpu) {
+		if (size > PAGE_SIZE)
+			i->jumpstack[cpu] = vmalloc_node(size,
+				cpu_to_node(cpu));
+		else
+			i->jumpstack[cpu] = kmalloc_node(size,
+				GFP_KERNEL, cpu_to_node(cpu));
+		if (i->jumpstack[cpu] == NULL)
+			/*
+			 * Freeing will be done later on by the callers. The
+			 * chain is: xt_replace_table -> __do_replace ->
+			 * do_replace -> xt_free_table_info.
+			 */
+			return -ENOMEM;
+	}
+
+	return 0;
+}
 
 struct xt_table_info *
 xt_replace_table(struct xt_table *table,
@@ -732,6 +800,7 @@ xt_replace_table(struct xt_table *table,
 	      int *error)
 {
 	struct xt_table_info *private;
+	int ret;
 
 	/* Do the substitution. */
 	local_bh_disable();
@@ -746,6 +815,12 @@ xt_replace_table(struct xt_table *table,
 		return NULL;
 	}
 
+	ret = xt_jumpstack_alloc(newinfo);
+	if (ret < 0) {
+		*error = ret;
+		return NULL;
+	}
+
 	table->private = newinfo;
 	newinfo->initial_entries = private->initial_entries;
 
@@ -770,6 +845,10 @@ struct xt_table *xt_register_table(struct net *net,
 	struct xt_table_info *private;
 	struct xt_table *t, *table;
 
+	ret = xt_jumpstack_alloc(newinfo);
+	if (ret < 0)
+		return ERR_PTR(ret);
+
 	/* Don't add one object to multiple lists. */
 	table = kmemdup(input_table, sizeof(struct xt_table), GFP_KERNEL);
 	if (!table) {
-- 
1.7.0.2


^ permalink raw reply related

* [PATCH 1/5] netfilter: ipv6: move POSTROUTING invocation before fragmentation
From: Jan Engelhardt @ 2010-03-31 10:38 UTC (permalink / raw)
  To: kaber; +Cc: netfilter-devel, netdev
In-Reply-To: <1270031934-15940-1-git-send-email-jengelh@medozas.de>

Patrick McHardy notes: "We used to invoke IPv4 POST_ROUTING after
fragmentation as well just to defragment the packets in conntrack
immediately afterwards, but that got changed during the
netfilter-ipsec integration. Ideally IPv6 would behave like IPv4."

This patch makes it so. Sending an oversized frame (e.g. `ping6
-s64000 -c1 ::1`) will now show up in POSTROUTING as a single skb
rather than multiple ones.

Signed-off-by: Jan Engelhardt <jengelh@medozas.de>
---
 net/ipv6/ip6_output.c |   49 +++++++++++++++++++++++--------------------------
 1 files changed, 23 insertions(+), 26 deletions(-)

diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c
index 4535b7a..f314ba4 100644
--- a/net/ipv6/ip6_output.c
+++ b/net/ipv6/ip6_output.c
@@ -82,22 +82,6 @@ int ip6_local_out(struct sk_buff *skb)
 }
 EXPORT_SYMBOL_GPL(ip6_local_out);
 
-static int ip6_output_finish(struct sk_buff *skb)
-{
-	struct dst_entry *dst = skb_dst(skb);
-
-	if (dst->hh)
-		return neigh_hh_output(dst->hh, skb);
-	else if (dst->neighbour)
-		return dst->neighbour->output(skb);
-
-	IP6_INC_STATS_BH(dev_net(dst->dev),
-			 ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES);
-	kfree_skb(skb);
-	return -EINVAL;
-
-}
-
 /* dev_loopback_xmit for use with netfilter. */
 static int ip6_dev_loopback_xmit(struct sk_buff *newskb)
 {
@@ -111,8 +95,7 @@ static int ip6_dev_loopback_xmit(struct sk_buff *newskb)
 	return 0;
 }
 
-
-static int ip6_output2(struct sk_buff *skb)
+static int ip6_finish_output2(struct sk_buff *skb)
 {
 	struct dst_entry *dst = skb_dst(skb);
 	struct net_device *dev = dst->dev;
@@ -150,8 +133,15 @@ static int ip6_output2(struct sk_buff *skb)
 				skb->len);
 	}
 
-	return NF_HOOK(NFPROTO_IPV6, NF_INET_POST_ROUTING, skb, NULL, skb->dev,
-		       ip6_output_finish);
+	if (dst->hh)
+		return neigh_hh_output(dst->hh, skb);
+	else if (dst->neighbour)
+		return dst->neighbour->output(skb);
+
+	IP6_INC_STATS_BH(dev_net(dst->dev),
+			 ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES);
+	kfree_skb(skb);
+	return -EINVAL;
 }
 
 static inline int ip6_skb_dst_mtu(struct sk_buff *skb)
@@ -162,21 +152,28 @@ static inline int ip6_skb_dst_mtu(struct sk_buff *skb)
 	       skb_dst(skb)->dev->mtu : dst_mtu(skb_dst(skb));
 }
 
+static int ip6_finish_output(struct sk_buff *skb)
+{
+	if ((skb->len > ip6_skb_dst_mtu(skb) && !skb_is_gso(skb)) ||
+				dst_allfrag(skb_dst(skb)))
+		return ip6_fragment(skb, ip6_finish_output2);
+	else
+		return ip6_finish_output2(skb);
+}
+
 int ip6_output(struct sk_buff *skb)
 {
+	struct net_device *dev = skb_dst(skb)->dev;
 	struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
 	if (unlikely(idev->cnf.disable_ipv6)) {
-		IP6_INC_STATS(dev_net(skb_dst(skb)->dev), idev,
+		IP6_INC_STATS(dev_net(dev), idev,
 			      IPSTATS_MIB_OUTDISCARDS);
 		kfree_skb(skb);
 		return 0;
 	}
 
-	if ((skb->len > ip6_skb_dst_mtu(skb) && !skb_is_gso(skb)) ||
-				dst_allfrag(skb_dst(skb)))
-		return ip6_fragment(skb, ip6_output2);
-	else
-		return ip6_output2(skb);
+	return NF_HOOK(NFPROTO_IPV6, NF_INET_POST_ROUTING, skb, NULL, dev,
+		       ip6_finish_output);
 }
 
 /*
-- 
1.7.0.2


^ permalink raw reply related

* nf-next: TEE and nesting
From: Jan Engelhardt @ 2010-03-31 10:38 UTC (permalink / raw)
  To: kaber; +Cc: netfilter-devel, netdev


> Hi,
>
> next on the calendar is the xt_TEE submission.
>
>
> The following changes since commit b44672889c11e13e4f4dc0a8ee23f0e64f1e57c6:
>   Jan Engelhardt (1):
>         netfilter: xtables: merge registration structure to NFPROTO_UNSPEC
>
> are available in the git repository at:
>
>   git://dev.medozas.de/linux master
>
> Jan Engelhardt (5):
>       netfilter: ipv6: move POSTROUTING invocation before fragmentation
>       net: ipv6: add IPSKB_REROUTED exclusion to NF_HOOK/POSTROUTING invocation
>       netfilter: xtables: inclusion of xt_TEE
>       netfilter: xtables2: make ip_tables reentrant
>       netfilter: xt_TEE: have cloned packet travel through Xtables too

cc'd netdev due to larger changes to the IPv6 code.

^ permalink raw reply

* crucial fixes only for net-2.6 please
From: David Miller @ 2010-03-31 10:27 UTC (permalink / raw)
  To: netdev


I really don't want to see any more patches for net-2.6 unless they
fix a bug that eats someone's disk and then sends all of their users'
passwords to an arbitrary remote site.

I'm very serious.  Things should be going into net-2.6 at a very slow
trickle at this point.

Thanks.

^ permalink raw reply

* Re: [net-2.6] bonding: bond_xmit_roundrobin() fix
From: David Miller @ 2010-03-31 10:24 UTC (permalink / raw)
  To: eric.dumazet; +Cc: netdev, andy, lhh, fubar, bonding-devel


Applied, thanks Eric!

^ permalink raw reply

* [r8169] WARNING: at net/sched/sch_generic.c
From: Sergey Senozhatsky @ 2010-03-31 10:21 UTC (permalink / raw)
  To: netdev
  Cc: Francois Romieu, Neil Horman, Eric Dumazet, David S. Miller,
	linux-kernel

[-- Attachment #1: Type: text/plain, Size: 1969 bytes --]

Hello,
I have the following problem:

[  296.337510] ------------[ cut here ]------------
[  296.337523] WARNING: at net/sched/sch_generic.c:255 dev_watchdog+0xc1/0x125()
[  296.337527] Hardware name: F3JC                
[  296.337530] NETDEV WATCHDOG: eth0 (r8169): transmit queue 0 timed out
[  296.337533] Modules linked in: pktgen ipv6 snd_hwdep snd_hda_codec_si3054 snd_hda_codec_realtek sdhci_pci sdhci asus_laptop sparse_keymap mmc_core led_class snd_hda_intel
snd_hda_codec psmouse snd_pcm snd_timer snd soundcore snd_page_alloc serio_raw i2c_i801 rng_core evdev sg r8169 mii usbhid hid uhci_hcd ehci_hcd sr_mod cdrom sd_mod usbcore
ata_piix
[  296.337586] Pid: 0, comm: swapper Not tainted 2.6.34-rc3-dbg #74
[  296.337589] Call Trace:
[  296.337597]  [<c102e71f>] warn_slowpath_common+0x65/0x7c
[  296.337603]  [<c126e30c>] ? dev_watchdog+0xc1/0x125
[  296.337608]  [<c102e76a>] warn_slowpath_fmt+0x24/0x27
[  296.337613]  [<c126e30c>] dev_watchdog+0xc1/0x125
[  296.337620]  [<c1040039>] ? prepare_to_wait_exclusive+0x52/0x5b
[  296.337627]  [<c1037053>] ? run_timer_softirq+0x120/0x1eb
[  296.337632]  [<c10370a9>] run_timer_softirq+0x176/0x1eb
[  296.337637]  [<c1037053>] ? run_timer_softirq+0x120/0x1eb
[  296.337643]  [<c126e24b>] ? dev_watchdog+0x0/0x125
[  296.337650]  [<c10331c9>] __do_softirq+0x8d/0x117
[  296.337655]  [<c103327e>] do_softirq+0x2b/0x43
[  296.337660]  [<c10333a3>] irq_exit+0x38/0x75
[  296.337667]  [<c1015138>] smp_apic_timer_interrupt+0x6d/0x7b
[  296.337673]  [<c12cbada>] apic_timer_interrupt+0x36/0x3c
[  296.337679]  [<c104007b>] ? prepare_to_wait+0x39/0x57
[  296.337685]  [<c11dd835>] ? acpi_idle_enter_simple+0x119/0x144
[  296.337692]  [<c124d358>] cpuidle_idle_call+0x6d/0xa5
[  296.337697]  [<c1001b51>] cpu_idle+0x92/0xc1
[  296.337704]  [<c12c63d0>] start_secondary+0x1f3/0x1fa
[  296.337708] ---[ end trace cd4a1b50139837df ]---


Reproducing 100% with pktgen tests.


	Sergey

[-- Attachment #2: Type: application/pgp-signature, Size: 316 bytes --]

^ permalink raw reply

* [PATCH 4/4] flow: structurize flow cache
From: Timo Teras @ 2010-03-31 10:17 UTC (permalink / raw)
  To: netdev; +Cc: Herbert Xu, Timo Teras
In-Reply-To: <1270030626-16687-1-git-send-email-timo.teras@iki.fi>

Group all per-cpu data to one structure instead of having many
globals. Also prepare the internals so that we can have multiple
instances of the flow cache if needed.

Only the kmem_cache is left as a global as all flow caches share
the same element size, and benefit from using a common cache.

Signed-off-by: Timo Teras <timo.teras@iki.fi>
---
 net/core/flow.c |  223 +++++++++++++++++++++++++++++--------------------------
 1 files changed, 119 insertions(+), 104 deletions(-)

diff --git a/net/core/flow.c b/net/core/flow.c
index 9601587..1d27ca6 100644
--- a/net/core/flow.c
+++ b/net/core/flow.c
@@ -35,104 +35,105 @@ struct flow_cache_entry {
 	atomic_t		*object_ref;
 };
 
-atomic_t flow_cache_genid = ATOMIC_INIT(0);
-
-static u32 flow_hash_shift;
-#define flow_hash_size	(1 << flow_hash_shift)
-static DEFINE_PER_CPU(struct flow_cache_entry **, flow_tables) = { NULL };
-
-#define flow_table(cpu) (per_cpu(flow_tables, cpu))
-
-static struct kmem_cache *flow_cachep __read_mostly;
-
-static int flow_lwm, flow_hwm;
-
-struct flow_percpu_info {
-	int hash_rnd_recalc;
-	u32 hash_rnd;
-	int count;
+struct flow_cache_percpu {
+	struct flow_cache_entry **	hash_table;
+	int				hash_count;
+	u32				hash_rnd;
+	int				hash_rnd_recalc;
+	struct tasklet_struct		flush_tasklet;
 };
-static DEFINE_PER_CPU(struct flow_percpu_info, flow_hash_info) = { 0 };
-
-#define flow_hash_rnd_recalc(cpu) \
-	(per_cpu(flow_hash_info, cpu).hash_rnd_recalc)
-#define flow_hash_rnd(cpu) \
-	(per_cpu(flow_hash_info, cpu).hash_rnd)
-#define flow_count(cpu) \
-	(per_cpu(flow_hash_info, cpu).count)
-
-static struct timer_list flow_hash_rnd_timer;
-
-#define FLOW_HASH_RND_PERIOD	(10 * 60 * HZ)
 
 struct flow_flush_info {
-	atomic_t cpuleft;
-	struct completion completion;
+	struct flow_cache *		cache;
+	atomic_t			cpuleft;
+	struct completion		completion;
 };
-static DEFINE_PER_CPU(struct tasklet_struct, flow_flush_tasklets) = { NULL };
 
-#define flow_flush_tasklet(cpu) (&per_cpu(flow_flush_tasklets, cpu))
+struct flow_cache {
+	u32				hash_shift;
+	unsigned long			order;
+	struct flow_cache_percpu *	percpu;
+	struct notifier_block		hotcpu_notifier;
+	int				low_watermark;
+	int				high_watermark;
+	struct timer_list		rnd_timer;
+};
+
+atomic_t flow_cache_genid = ATOMIC_INIT(0);
+static struct flow_cache flow_cache_global;
+static struct kmem_cache *flow_cachep;
+
+#define flow_cache_hash_size(cache)	(1 << (cache)->hash_shift)
+#define FLOW_HASH_RND_PERIOD		(10 * 60 * HZ)
 
 static void flow_cache_new_hashrnd(unsigned long arg)
 {
+	struct flow_cache *fc = (void *) arg;
 	int i;
 
 	for_each_possible_cpu(i)
-		flow_hash_rnd_recalc(i) = 1;
+		per_cpu_ptr(fc->percpu, i)->hash_rnd_recalc = 1;
 
-	flow_hash_rnd_timer.expires = jiffies + FLOW_HASH_RND_PERIOD;
-	add_timer(&flow_hash_rnd_timer);
+	fc->rnd_timer.expires = jiffies + FLOW_HASH_RND_PERIOD;
+	add_timer(&fc->rnd_timer);
 }
 
-static void flow_entry_kill(int cpu, struct flow_cache_entry *fle)
+static void flow_entry_kill(struct flow_cache *fc,
+			    struct flow_cache_percpu *fcp,
+			    struct flow_cache_entry *fle)
 {
 	if (fle->object)
 		atomic_dec(fle->object_ref);
 	kmem_cache_free(flow_cachep, fle);
-	flow_count(cpu)--;
+	fcp->hash_count--;
 }
 
-static void __flow_cache_shrink(int cpu, int shrink_to)
+static void __flow_cache_shrink(struct flow_cache *fc,
+				struct flow_cache_percpu *fcp,
+				int shrink_to)
 {
 	struct flow_cache_entry *fle, **flp;
 	int i;
 
-	for (i = 0; i < flow_hash_size; i++) {
+	for (i = 0; i < flow_cache_hash_size(fc); i++) {
 		int k = 0;
 
-		flp = &flow_table(cpu)[i];
+		flp = &fcp->hash_table[i];
 		while ((fle = *flp) != NULL && k < shrink_to) {
 			k++;
 			flp = &fle->next;
 		}
 		while ((fle = *flp) != NULL) {
 			*flp = fle->next;
-			flow_entry_kill(cpu, fle);
+			flow_entry_kill(fc, fcp, fle);
 		}
 	}
 }
 
-static void flow_cache_shrink(int cpu)
+static void flow_cache_shrink(struct flow_cache *fc,
+			      struct flow_cache_percpu *fcp)
 {
-	int shrink_to = flow_lwm / flow_hash_size;
+	int shrink_to = fc->low_watermark / flow_cache_hash_size(fc);
 
-	__flow_cache_shrink(cpu, shrink_to);
+	__flow_cache_shrink(fc, fcp, shrink_to);
 }
 
-static void flow_new_hash_rnd(int cpu)
+static void flow_new_hash_rnd(struct flow_cache *fc,
+			      struct flow_cache_percpu *fcp)
 {
-	get_random_bytes(&flow_hash_rnd(cpu), sizeof(u32));
-	flow_hash_rnd_recalc(cpu) = 0;
-
-	__flow_cache_shrink(cpu, 0);
+	get_random_bytes(&fcp->hash_rnd, sizeof(u32));
+	fcp->hash_rnd_recalc = 0;
+	__flow_cache_shrink(fc, fcp, 0);
 }
 
-static u32 flow_hash_code(struct flowi *key, int cpu)
+static u32 flow_hash_code(struct flow_cache *fc,
+			  struct flow_cache_percpu *fcp,
+			  struct flowi *key)
 {
 	u32 *k = (u32 *) key;
 
-	return (jhash2(k, (sizeof(*key) / sizeof(u32)), flow_hash_rnd(cpu)) &
-		(flow_hash_size - 1));
+	return (jhash2(k, (sizeof(*key) / sizeof(u32)), fcp->hash_rnd)
+		& (flow_cache_hash_size(fc) - 1));
 }
 
 #if (BITS_PER_LONG == 64)
@@ -168,24 +169,25 @@ static int flow_key_compare(struct flowi *key1, struct flowi *key2)
 void *flow_cache_lookup(struct net *net, struct flowi *key, u16 family, u8 dir,
 			flow_resolve_t resolver)
 {
+	struct flow_cache *fc = &flow_cache_global;
+	struct flow_cache_percpu *fcp;
 	struct flow_cache_entry *fle, **head;
 	unsigned int hash;
-	int cpu;
 
 	local_bh_disable();
-	cpu = smp_processor_id();
+	fcp = per_cpu_ptr(fc->percpu, smp_processor_id());
 
 	fle = NULL;
 	/* Packet really early in init?  Making flow_cache_init a
 	 * pre-smp initcall would solve this.  --RR */
-	if (!flow_table(cpu))
+	if (!fcp->hash_table)
 		goto nocache;
 
-	if (flow_hash_rnd_recalc(cpu))
-		flow_new_hash_rnd(cpu);
-	hash = flow_hash_code(key, cpu);
+	if (fcp->hash_rnd_recalc)
+		flow_new_hash_rnd(fc, fcp);
+	hash = flow_hash_code(fc, fcp, key);
 
-	head = &flow_table(cpu)[hash];
+	head = &fcp->hash_table[hash];
 	for (fle = *head; fle; fle = fle->next) {
 		if (fle->family == family &&
 		    fle->dir == dir &&
@@ -204,8 +206,8 @@ void *flow_cache_lookup(struct net *net, struct flowi *key, u16 family, u8 dir,
 	}
 
 	if (!fle) {
-		if (flow_count(cpu) > flow_hwm)
-			flow_cache_shrink(cpu);
+		if (fcp->hash_count > fc->high_watermark)
+			flow_cache_shrink(fc, fcp);
 
 		fle = kmem_cache_alloc(flow_cachep, GFP_ATOMIC);
 		if (fle) {
@@ -215,7 +217,7 @@ void *flow_cache_lookup(struct net *net, struct flowi *key, u16 family, u8 dir,
 			fle->dir = dir;
 			memcpy(&fle->key, key, sizeof(*key));
 			fle->object = NULL;
-			flow_count(cpu)++;
+			fcp->hash_count++;
 		}
 	}
 
@@ -249,14 +251,15 @@ nocache:
 static void flow_cache_flush_tasklet(unsigned long data)
 {
 	struct flow_flush_info *info = (void *)data;
+	struct flow_cache *fc = info->cache;
+	struct flow_cache_percpu *fcp;
 	int i;
-	int cpu;
 
-	cpu = smp_processor_id();
-	for (i = 0; i < flow_hash_size; i++) {
+	fcp = per_cpu_ptr(fc->percpu, smp_processor_id());
+	for (i = 0; i < flow_cache_hash_size(fc); i++) {
 		struct flow_cache_entry *fle;
 
-		fle = flow_table(cpu)[i];
+		fle = fcp->hash_table[i];
 		for (; fle; fle = fle->next) {
 			unsigned genid = atomic_read(&flow_cache_genid);
 
@@ -272,7 +275,6 @@ static void flow_cache_flush_tasklet(unsigned long data)
 		complete(&info->completion);
 }
 
-static void flow_cache_flush_per_cpu(void *) __attribute__((__unused__));
 static void flow_cache_flush_per_cpu(void *data)
 {
 	struct flow_flush_info *info = data;
@@ -280,8 +282,7 @@ static void flow_cache_flush_per_cpu(void *data)
 	struct tasklet_struct *tasklet;
 
 	cpu = smp_processor_id();
-
-	tasklet = flow_flush_tasklet(cpu);
+	tasklet = &per_cpu_ptr(info->cache->percpu, cpu)->flush_tasklet;
 	tasklet->data = (unsigned long)info;
 	tasklet_schedule(tasklet);
 }
@@ -294,6 +295,7 @@ void flow_cache_flush(void)
 	/* Don't want cpus going down or up during this. */
 	get_online_cpus();
 	mutex_lock(&flow_flush_sem);
+	info.cache = &flow_cache_global;
 	atomic_set(&info.cpuleft, num_online_cpus());
 	init_completion(&info.completion);
 
@@ -307,62 +309,75 @@ void flow_cache_flush(void)
 	put_online_cpus();
 }
 
-static void __init flow_cache_cpu_prepare(int cpu)
+static void __init flow_cache_cpu_prepare(struct flow_cache *fc,
+					  struct flow_cache_percpu *fcp)
 {
-	struct tasklet_struct *tasklet;
-	unsigned long order;
-
-	for (order = 0;
-	     (PAGE_SIZE << order) <
-		     (sizeof(struct flow_cache_entry *)*flow_hash_size);
-	     order++)
-		/* NOTHING */;
-
-	flow_table(cpu) = (struct flow_cache_entry **)
-		__get_free_pages(GFP_KERNEL|__GFP_ZERO, order);
-	if (!flow_table(cpu))
-		panic("NET: failed to allocate flow cache order %lu\n", order);
-
-	flow_hash_rnd_recalc(cpu) = 1;
-	flow_count(cpu) = 0;
-
-	tasklet = flow_flush_tasklet(cpu);
-	tasklet_init(tasklet, flow_cache_flush_tasklet, 0);
+	fcp->hash_table = (struct flow_cache_entry **)
+		__get_free_pages(GFP_KERNEL|__GFP_ZERO, fc->order);
+	if (!fcp->hash_table)
+		panic("NET: failed to allocate flow cache order %lu\n", fc->order);
+
+	fcp->hash_rnd_recalc = 1;
+	fcp->hash_count = 0;
+	tasklet_init(&fcp->flush_tasklet, flow_cache_flush_tasklet, 0);
 }
 
 static int flow_cache_cpu(struct notifier_block *nfb,
 			  unsigned long action,
 			  void *hcpu)
 {
+	struct flow_cache *fc = container_of(nfb, struct flow_cache, hotcpu_notifier);
+	int cpu = (unsigned long) hcpu;
+	struct flow_cache_percpu *fcp = per_cpu_ptr(fc->percpu, cpu);
+
 	if (action == CPU_DEAD || action == CPU_DEAD_FROZEN)
-		__flow_cache_shrink((unsigned long)hcpu, 0);
+		__flow_cache_shrink(fc, fcp, 0);
 	return NOTIFY_OK;
 }
 
-static int __init flow_cache_init(void)
+static int flow_cache_init(struct flow_cache *fc)
 {
+	unsigned long order;
 	int i;
 
-	flow_cachep = kmem_cache_create("flow_cache",
-					sizeof(struct flow_cache_entry),
-					0, SLAB_PANIC,
-					NULL);
-	flow_hash_shift = 10;
-	flow_lwm = 2 * flow_hash_size;
-	flow_hwm = 4 * flow_hash_size;
+	fc->hash_shift = 10;
+	fc->low_watermark = 2 * flow_cache_hash_size(fc);
+	fc->high_watermark = 4 * flow_cache_hash_size(fc);
+
+	for (order = 0;
+	     (PAGE_SIZE << order) <
+		     (sizeof(struct flow_cache_entry *)*flow_cache_hash_size(fc));
+	     order++)
+		/* NOTHING */;
+	fc->order = order;
+	fc->percpu = alloc_percpu(struct flow_cache_percpu);
 
-	setup_timer(&flow_hash_rnd_timer, flow_cache_new_hashrnd, 0);
-	flow_hash_rnd_timer.expires = jiffies + FLOW_HASH_RND_PERIOD;
-	add_timer(&flow_hash_rnd_timer);
+	setup_timer(&fc->rnd_timer, flow_cache_new_hashrnd,
+		    (unsigned long) fc);
+	fc->rnd_timer.expires = jiffies + FLOW_HASH_RND_PERIOD;
+	add_timer(&fc->rnd_timer);
 
 	for_each_possible_cpu(i)
-		flow_cache_cpu_prepare(i);
+		flow_cache_cpu_prepare(fc, per_cpu_ptr(fc->percpu, i));
+
+	fc->hotcpu_notifier = (struct notifier_block){
+		.notifier_call = flow_cache_cpu,
+	};
+	register_hotcpu_notifier(&fc->hotcpu_notifier);
 
-	hotcpu_notifier(flow_cache_cpu, 0);
 	return 0;
 }
 
-module_init(flow_cache_init);
+static int __init flow_cache_init_global(void)
+{
+	flow_cachep = kmem_cache_create("flow_cache",
+					sizeof(struct flow_cache_entry),
+					0, SLAB_PANIC, NULL);
+
+	return flow_cache_init(&flow_cache_global);
+}
+
+module_init(flow_cache_init_global);
 
 EXPORT_SYMBOL(flow_cache_genid);
 EXPORT_SYMBOL(flow_cache_lookup);
-- 
1.6.3.3


^ permalink raw reply related

* [PATCH 3/4] xfrm: remove policy lock when accessing policy->walk.dead
From: Timo Teras @ 2010-03-31 10:17 UTC (permalink / raw)
  To: netdev; +Cc: Herbert Xu, Timo Teras
In-Reply-To: <1270030626-16687-1-git-send-email-timo.teras@iki.fi>

All of the code considers ->dead as a hint that the cached policy
needs to get refreshed. The read side can just drop the read lock
without any side effects.

The write side needs to make sure that it's written exactly once.
The only possible race is in xfrm_policy_kill(). This is fixed
by checking result of __xfrm_policy_unlink() when needed. It will
always succeed if the policy object is looked up from the hash
list (so some checks are removed), but it needs to be checked if
we are trying to unlink policy via a reference (appropriate
checks added).

Since policy->walk.dead is written exactly once, it no longer
needs to be protected with a write lock.

Signed-off-by: Timo Teras <timo.teras@iki.fi>
---
 net/xfrm/xfrm_policy.c |   31 +++++++++----------------------
 net/xfrm/xfrm_user.c   |    6 +-----
 2 files changed, 10 insertions(+), 27 deletions(-)

diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c
index 843e066..82789cf 100644
--- a/net/xfrm/xfrm_policy.c
+++ b/net/xfrm/xfrm_policy.c
@@ -156,7 +156,7 @@ static void xfrm_policy_timer(unsigned long data)
 
 	read_lock(&xp->lock);
 
-	if (xp->walk.dead)
+	if (unlikely(xp->walk.dead))
 		goto out;
 
 	dir = xfrm_policy_id2dir(xp->index);
@@ -297,17 +297,7 @@ static DECLARE_WORK(xfrm_policy_gc_work, xfrm_policy_gc_task);
 
 static void xfrm_policy_kill(struct xfrm_policy *policy)
 {
-	int dead;
-
-	write_lock_bh(&policy->lock);
-	dead = policy->walk.dead;
 	policy->walk.dead = 1;
-	write_unlock_bh(&policy->lock);
-
-	if (unlikely(dead)) {
-		WARN_ON(1);
-		return;
-	}
 
 	spin_lock_bh(&xfrm_policy_gc_lock);
 	hlist_add_head(&policy->bydst, &xfrm_policy_gc_list);
@@ -776,7 +766,6 @@ xfrm_policy_flush_secctx_check(struct net *net, u8 type, struct xfrm_audit *audi
 int xfrm_policy_flush(struct net *net, u8 type, struct xfrm_audit *audit_info)
 {
 	int dir, err = 0, cnt = 0;
-	struct xfrm_policy *dp;
 
 	write_lock_bh(&xfrm_policy_lock);
 
@@ -794,10 +783,9 @@ int xfrm_policy_flush(struct net *net, u8 type, struct xfrm_audit *audit_info)
 				     &net->xfrm.policy_inexact[dir], bydst) {
 			if (pol->type != type)
 				continue;
-			dp = __xfrm_policy_unlink(pol, dir);
+			__xfrm_policy_unlink(pol, dir);
 			write_unlock_bh(&xfrm_policy_lock);
-			if (dp)
-				cnt++;
+			cnt++;
 
 			xfrm_audit_policy_delete(pol, 1, audit_info->loginuid,
 						 audit_info->sessionid,
@@ -816,10 +804,9 @@ int xfrm_policy_flush(struct net *net, u8 type, struct xfrm_audit *audit_info)
 					     bydst) {
 				if (pol->type != type)
 					continue;
-				dp = __xfrm_policy_unlink(pol, dir);
+				__xfrm_policy_unlink(pol, dir);
 				write_unlock_bh(&xfrm_policy_lock);
-				if (dp)
-					cnt++;
+				cnt++;
 
 				xfrm_audit_policy_delete(pol, 1,
 							 audit_info->loginuid,
@@ -1132,6 +1119,9 @@ int xfrm_sk_policy_insert(struct sock *sk, int dir, struct xfrm_policy *pol)
 		__xfrm_policy_link(pol, XFRM_POLICY_MAX+dir);
 	}
 	if (old_pol)
+		/* Unlinking succeeds always. This is the only function
+		 * allowed to delete or replace socket policy.
+		 */
 		__xfrm_policy_unlink(old_pol, XFRM_POLICY_MAX+dir);
 	write_unlock_bh(&xfrm_policy_lock);
 
@@ -1737,11 +1727,8 @@ restart:
 			goto error;
 		}
 
-		for (pi = 0; pi < npols; pi++) {
-			read_lock_bh(&pols[pi]->lock);
+		for (pi = 0; pi < npols; pi++)
 			pol_dead |= pols[pi]->walk.dead;
-			read_unlock_bh(&pols[pi]->lock);
-		}
 
 		write_lock_bh(&policy->lock);
 		if (unlikely(pol_dead || stale_bundle(dst))) {
diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c
index da5ba86..a267fbd 100644
--- a/net/xfrm/xfrm_user.c
+++ b/net/xfrm/xfrm_user.c
@@ -1770,13 +1770,9 @@ static int xfrm_add_pol_expire(struct sk_buff *skb, struct nlmsghdr *nlh,
 	if (xp == NULL)
 		return -ENOENT;
 
-	read_lock(&xp->lock);
-	if (xp->walk.dead) {
-		read_unlock(&xp->lock);
+	if (unlikely(xp->walk.dead))
 		goto out;
-	}
 
-	read_unlock(&xp->lock);
 	err = 0;
 	if (up->hard) {
 		uid_t loginuid = NETLINK_CB(skb).loginuid;
-- 
1.6.3.3


^ permalink raw reply related

* [PATCH 2/4] xfrm_user: verify policy direction at XFRM_MSG_POLEXPIRE handler
From: Timo Teras @ 2010-03-31 10:17 UTC (permalink / raw)
  To: netdev; +Cc: Herbert Xu, Timo Teras
In-Reply-To: <1270030626-16687-1-git-send-email-timo.teras@iki.fi>

Add missing check for policy direction verification. This is
especially important since without this xfrm_user may end up
deleting per-socket policy which is not allowed.

Signed-off-by: Timo Teras <timo.teras@iki.fi>
---
 net/xfrm/xfrm_user.c |    4 ++++
 1 files changed, 4 insertions(+), 0 deletions(-)

diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c
index 6106b72..da5ba86 100644
--- a/net/xfrm/xfrm_user.c
+++ b/net/xfrm/xfrm_user.c
@@ -1741,6 +1741,10 @@ static int xfrm_add_pol_expire(struct sk_buff *skb, struct nlmsghdr *nlh,
 	if (err)
 		return err;
 
+	err = verify_policy_dir(p->dir);
+	if (err)
+		return err;
+
 	if (p->index)
 		xp = xfrm_policy_byid(net, mark, type, p->dir, p->index, 0, &err);
 	else {
-- 
1.6.3.3


^ permalink raw reply related

* [PATCH 1/4] xfrm: increment genid before bumping state genids
From: Timo Teras @ 2010-03-31 10:17 UTC (permalink / raw)
  To: netdev; +Cc: Herbert Xu, Timo Teras
In-Reply-To: <1270030626-16687-1-git-send-email-timo.teras@iki.fi>

__xfrm_state_bump_genids() is used to update the genid of all
matching xfrm_state's, so any bundle using the state would get
refreshed with the newly inserted state.

However, since __xfrm_state_bump_genids() is called before the
__xfrm_state_insert() which actually bumps the genid counter,
it is possible that the genid was not updated at all (if there
were no state inserts previously).

This is fixed by moving the genid incrementation to
__xfrm_state_bump_genids() so the older states are guaranteed
to get different genid.

Signed-off-by: Timo Teras <timo.teras@iki.fi>
---
 net/xfrm/xfrm_state.c |    3 ++-
 1 files changed, 2 insertions(+), 1 deletions(-)

diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c
index 17d5b96..b4efc28 100644
--- a/net/xfrm/xfrm_state.c
+++ b/net/xfrm/xfrm_state.c
@@ -923,7 +923,7 @@ static void __xfrm_state_insert(struct xfrm_state *x)
 	struct net *net = xs_net(x);
 	unsigned int h;
 
-	x->genid = ++xfrm_state_genid;
+	x->genid = xfrm_state_genid;
 
 	list_add(&x->km.all, &net->xfrm.state_all);
 
@@ -963,6 +963,7 @@ static void __xfrm_state_bump_genids(struct xfrm_state *xnew)
 	unsigned int h;
 	u32 mark = xnew->mark.v & xnew->mark.m;
 
+	xfrm_state_genid++;
 	h = xfrm_dst_hash(net, &xnew->id.daddr, &xnew->props.saddr, reqid, family);
 	hlist_for_each_entry(x, entry, net->xfrm.state_bydst+h, bydst) {
 		if (x->props.family	== family &&
-- 
1.6.3.3


^ permalink raw reply related

* [PATCH 0/4] xfrm fixes and flow structurization
From: Timo Teras @ 2010-03-31 10:17 UTC (permalink / raw)
  To: netdev; +Cc: Herbert Xu, Timo Teras
In-Reply-To: <1270030626-16687-1-git-send-email-timo.teras@iki.fi>

These are fixes and cleanups which should be good for merging in.
Patches 1 and 2 are new. Patches 3 and 4 were previously discussed
with Herbert.

Please review and consider committing.

Thanks.

Timo Teras (4):
  xfrm: increment genid before bumping state genids
  xfrm_user: verify policy direction at XFRM_MSG_POLEXPIRE handler
  xfrm: remove policy lock when accessing policy->walk.dead
  flow: structurize flow cache

 net/core/flow.c        |  223 +++++++++++++++++++++++++----------------------
 net/xfrm/xfrm_policy.c |   31 ++-----
 net/xfrm/xfrm_state.c  |    3 +-
 net/xfrm/xfrm_user.c   |   10 +-
 4 files changed, 135 insertions(+), 132 deletions(-)


^ permalink raw reply

* [PATCH 0/4] xfrm fixes and flow structurization
From: Timo Teras @ 2010-03-31 10:17 UTC (permalink / raw)
  To: netdev; +Cc: Herbert Xu, Timo Teras

These are fixes and cleanups which should be good for merging in.
Patches 1 and 2 are new. Patches 3 and 4 were previously discussed
with Herbert.

Please review and consider committing these.

Thanks.

Timo Teras (4):
  xfrm: increment genid before bumping state genids
  xfrm_user: verify policy direction at XFRM_MSG_POLEXPIRE handler
  xfrm: remove policy lock when accessing policy->walk.dead
  flow: structurize flow cache

 net/core/flow.c        |  223 +++++++++++++++++++++++++----------------------
 net/xfrm/xfrm_policy.c |   31 ++-----
 net/xfrm/xfrm_state.c  |    3 +-
 net/xfrm/xfrm_user.c   |   10 +-
 4 files changed, 135 insertions(+), 132 deletions(-)


^ permalink raw reply

* Re: [PATCH next-next-2.6 v2] virtio_net: missing sg_init_table
From: David Miller @ 2010-03-31 10:16 UTC (permalink / raw)
  To: mst; +Cc: mashirle, thomas, netdev, linux-kernel
In-Reply-To: <20100331092022.GA31911@redhat.com>

From: "Michael S. Tsirkin" <mst@redhat.com>
Date: Wed, 31 Mar 2010 12:20:22 +0300

> On Mon, Mar 29, 2010 at 06:19:15PM -0700, Shirley Ma wrote:
>> Add missing sg_init_table for sg_set_buf in virtio_net which
>> induced in defer skb patch.
>> 
>> Reported-by: Thomas Müller <thomas@mathtm.de>
>> Tested-by: Thomas Müller <thomas@mathtm.de>
>> Signed-off-by: Shirley Ma <xma@us.ibm.com>
> 
> I'm concerned that the 'big' path might cause a performance regression.
> Let's move sg into virtnet_info so that this needs to be only called
> once?

Yeah that might improve things.

Shirley's change is already in net-next-2.6 so anything implementing
this would need to be submitted relative to that.

^ permalink raw reply

* Re: [PATCH] MACB: Set PHY address in kernel parameters
From: Anders Darander @ 2010-03-31 10:11 UTC (permalink / raw)
  To: Jiri Pirko
  Cc: Haavard Skinnemoen, David S. Miller, Erik Waling, Patrick McHardy,
	Grant Likely, netdev, linux-kernel
In-Reply-To: <20100331100313.GA4053@psychotron.lab.eng.brq.redhat.com>

* Jiri Pirko <jpirko@redhat.com> [100331 12:03]:
> Wed, Mar 31, 2010 at 09:51:42AM CEST, anders.darander@gmail.com wrote:
> > 
> >-	phydev = phy_find_first(bp->mii_bus);
> >+	if (phy_addr >= PHY_MAX_ADDRESS)
> >+		phydev = phy_find_first(bp->mii_bus);
> >+	else
> >+		phydev = bp->mii_bus->phy_map[phy_addr];
> >+
> > 	if (!phydev) {
> > 		printk (KERN_ERR "%s: no PHY found\n", dev->name);
> > 		return -1;
> 
> This is really ugly :( Should be done differently, more general. I've been
> thinking about this for a while. Maybe the solution is to integrate the switch
> into DSA subsystem. See net/dsa. Not sure though...

A more general solution is always welcome...

However, although I've not tested it, I think that Marc Kleine-Budde
came up with the correct solution. That was using phy_mask, and set it
up in the *eth_data structure in the board setup code.

Thus, it seems that the needed functionality is readily available.
(Working in the embedded field, I've no problem with defining HW-related
things in the board setup code, as opposed to having them dynamically
defined).

Regards,
Anders


^ permalink raw reply

* Re: [Patch] fix packet loss and massive ping spikes with PPP multi-link
From: Ben McKeegan @ 2010-03-31 10:03 UTC (permalink / raw)
  To: netdev, linux-ppp
  Cc: Alan Cox, Alexander E. Patrakov, Richard Hartmann, linux-kernel
In-Reply-To: <20100326170428.6c1ad66c@lxorguk.ukuu.org.uk>

[-- Attachment #1: Type: text/plain, Size: 2775 bytes --]

>>> Making it runtime per link selectable would be nicer but thats a bit more
>>> work.
>> Doesn't it work already via echoing values to 
>> /sys/module/ppp/generic/parameters/ml_explode in the above code?
> 
> Thats runtime (and why I set 0600 in the permissions for the example) but
> not per link.
> 

I needed to do something similar a while back and I took a very 
different approach, which I think is more flexible.   Rather than 
implement a new round-robin scheduler I simply introduced a target 
minimum fragment size into the fragment size calculation, as a per 
bundle parameter that can be configured via a new ioctl.  This modifies 
the algorithm so that it tries to limit the number of fragments such 
that each fragment is at least the minimum size.  If the minimum size is 
greater than the packet size it will not be fragmented at all but will 
instead just get sent down the next available channel.

A pppd plugin generates the ioctl call allowing this to be tweaked per 
connection.  It is more flexible in that you can still have the larger 
packets fragmented if you wish.

We've used a variant of this patch on our ADSL LNS pool for a few years 
now with varying results.  We originally did it to save bandwidth as we 
have a per packet overhead and fragmenting tiny packets such as VoIP 
across a bundle of 4 lines made no sense at all.  We've experimented 
with higher minimum settings up to and above the link MTU, thus 
achieving the equivalent of Richard's patch.

In some cases this has improved performance, others it makes it worse. 
It depends a lot on the lines and traffic patterns, and it is certainly 
not a change we would wish to have on by default.  Any solution going 
into mainline kernel would need to be tunable per connection.  One of 
the issues seems to be with poor recovery from packet loss on low 
volume, highly delay sensitive traffic on large bundles of lines.  With 
Linux at both ends you are relying on received sequence numbers to 
detect loss.  When packets are being fragmented across all channels and 
a fragment is lost, the receiving system is able to spot the lost 
fragment fairly quickly.  Once you start sending some multilink frames 
down individual channels, it takes a lot longer for the receiver to 
notice the packet loss on an individual channel.  Until another fragment 
is successfully received on the lossy channel, the fragments of the 
incomplete frame sit in the queue clogging up the other channels (the 
receiver is attempting to preserve the original packet order and is 
still waiting for the lost fragment).

Original patch attached.   This almost certainly needs updating to take 
account of other more recent changes in the multilink algorithm but it may 
provide some inspiration.

Regards,
Ben.


[-- Attachment #2: mppp-min-frag-size.patch --]
[-- Type: text/x-diff, Size: 4255 bytes --]

diff -ubdr linux-2.6.16.16-l2tp/drivers/net/ppp_generic.c linux-2.6.16.16-l2tp-mppp/drivers/net/ppp_generic.c
--- linux-2.6.16.16-l2tp/drivers/net/ppp_generic.c	2006-05-11 02:56:24.000000000 +0100
+++ linux-2.6.16.16-l2tp-mppp/drivers/net/ppp_generic.c	2007-07-03 18:23:35.000000000 +0100
@@ -64,7 +64,7 @@
 
 #define MPHDRLEN	6	/* multilink protocol header length */
 #define MPHDRLEN_SSN	4	/* ditto with short sequence numbers */
-#define MIN_FRAG_SIZE	64
+#define MIN_FRAG_SIZE	256
 
 /*
  * An instance of /dev/ppp can be associated with either a ppp
@@ -120,6 +120,7 @@
 	unsigned long	last_recv;	/* jiffies when last pkt rcvd a0 */
 	struct net_device *dev;		/* network interface device a4 */
 #ifdef CONFIG_PPP_MULTILINK
+        int             minfragsize;    /* minimum size for a fragment */
 	int		nxchan;		/* next channel to send something on */
 	u32		nxseq;		/* next sequence number to send */
 	int		mrru;		/* MP: max reconst. receive unit */
@@ -767,6 +768,15 @@
 		ppp_recv_unlock(ppp);
 		err = 0;
 		break;
+
+	case PPPIOCSMINFRAG:
+	        if (get_user(val, p))
+	                break;
+	        ppp_recv_lock(ppp);
+	        ppp->minfragsize = val < 0 ? 0 : val;
+	        ppp_recv_unlock(ppp);
+	        err = 0;
+	        break;
 #endif /* CONFIG_PPP_MULTILINK */
 
 	default:
@@ -1254,7 +1264,7 @@
 	int len, fragsize;
 	int i, bits, hdrlen, mtu;
 	int flen;
-	int navail, nfree;
+	int navail, nfree, nfrag;
 	int nbigger;
 	unsigned char *p, *q;
 	struct list_head *list;
@@ -1285,7 +1295,7 @@
 	 * the channels are free.  This gives much better TCP
 	 * performance if we have a lot of channels.
 	 */
-	if (nfree == 0 || nfree < navail / 2)
+	if (nfree == 0 || (nfree < navail / 2 && ppp->minfragsize == 0))
 		return 0;	/* can't take now, leave it in xmit_pending */
 
 	/* Do protocol field compression (XXX this should be optional) */
@@ -1302,13 +1312,24 @@
 	 * how small they are (i.e. even 0 length) in order to minimize
 	 * the time that it will take to detect when a channel drops
 	 * a fragment.
+         * However, if ppp->minfragsize > 0 we try to avoid creating
+         * fragments smaller than ppp->minfragsize and thus do not
+         * always use all free channels
 	 */
+	if (ppp->minfragsize > 0) {
+	  nfrag= len / ppp->minfragsize;
+	  if (nfrag < 1)
+	        nfrag = 1;
+	  else if (nfrag > nfree)
+	        nfrag = nfree;
+	} else
+	        nfrag = nfree;
 	fragsize = len;
-	if (nfree > 1)
-		fragsize = DIV_ROUND_UP(fragsize, nfree);
+	if (nfrag > 1)
+	        fragsize = DIV_ROUND_UP(fragsize, nfrag);
 	/* nbigger channels get fragsize bytes, the rest get fragsize-1,
 	   except if nbigger==0, then they all get fragsize. */
-	nbigger = len % nfree;
+	nbigger = len % nfrag;
 
 	/* skip to the channel after the one we last used
 	   and start at that one */
@@ -1323,7 +1344,7 @@
 
 	/* create a fragment for each channel */
 	bits = B;
-	while (nfree > 0 || len > 0) {
+	while (len > 0 || (nfree > 0 && ppp->minfragsize == 0)) {
 		list = list->next;
 		if (list == &ppp->channels) {
 			i = 0;
@@ -1371,7 +1392,7 @@
 			mtu = 4;
 		if (flen > mtu)
 			flen = mtu;
-		if (flen == len && nfree == 0)
+		if (flen == len && (nfree == 0 || ppp->minfragsize != 0))
 			bits |= E;
 		frag = alloc_skb(flen + hdrlen + (flen == 0), GFP_ATOMIC);
 		if (frag == 0)
@@ -2435,6 +2456,7 @@
 	spin_lock_init(&ppp->rlock);
 	spin_lock_init(&ppp->wlock);
 #ifdef CONFIG_PPP_MULTILINK
+	ppp->minfragsize = MIN_FRAG_SIZE;
 	ppp->minseq = -1;
 	skb_queue_head_init(&ppp->mrq);
 #endif /* CONFIG_PPP_MULTILINK */
diff -ubdr linux-2.6.16.16-l2tp/include/linux/if_ppp.h linux-2.6.16.16-l2tp-mppp/include/linux/if_ppp.h
--- linux-2.6.16.16-l2tp/include/linux/if_ppp.h	2006-05-12 13:45:00.000000000 +0100
+++ linux-2.6.16.16-l2tp-mppp/include/linux/if_ppp.h	2007-07-03 18:15:27.000000000 +0100
@@ -162,6 +162,7 @@
 #define PPPIOCATTCHAN	_IOW('t', 56, int)	/* attach to ppp channel */
 #define PPPIOCGCHAN	_IOR('t', 55, int)	/* get ppp channel number */
 #define PPPIOCGL2TPSTATS _IOR('t', 54, struct pppol2tp_ioc_stats)
+#define PPPIOCSMINFRAG  _IOW('t', 53, int)  /* minimum fragment size */
 
 #define SIOCGPPPSTATS   (SIOCDEVPRIVATE + 0)
 #define SIOCGPPPVER     (SIOCDEVPRIVATE + 1)	/* NEVER change this!! */

^ permalink raw reply

* Re: [PATCH] MACB: Set PHY address in kernel parameters
From: Jiri Pirko @ 2010-03-31 10:03 UTC (permalink / raw)
  To: Anders Darander
  Cc: Haavard Skinnemoen, David S. Miller, Erik Waling, Patrick McHardy,
	Anders Darander, Grant Likely, netdev, linux-kernel
In-Reply-To: <1270021902-6556-1-git-send-email-anders.darander@gmail.com>

Wed, Mar 31, 2010 at 09:51:42AM CEST, anders.darander@gmail.com wrote:
>From: Anders Darander <ad@datarespons.se>
>
>Add the possibility to set the phy address. This is needed if an integrated
>switch is connected to the MAC, as it is often the case that the highest port
>is the one connected to the MAC of the MCU.
>
>E.g. in the case of the Micrel KSZ8873, port 3 is the one to connect to the
>MCU, thus, the MAC needs to connect to phy address 0x03, instead of the first
>phy found.
>
>Signed-off-by: Anders Darander <ad@datarespons.se>
>---
> drivers/net/macb.c |   14 +++++++++++++-
> 1 files changed, 13 insertions(+), 1 deletions(-)
>
>diff --git a/drivers/net/macb.c b/drivers/net/macb.c
>index c8a18a6..9b4e301 100644
>--- a/drivers/net/macb.c
>+++ b/drivers/net/macb.c
>@@ -53,6 +53,14 @@
> #define MACB_RX_INT_FLAGS	(MACB_BIT(RCOMP) | MACB_BIT(RXUBR)	\
> 				 | MACB_BIT(ISR_ROVR))
> 
>+/*
>+ * Setup PHY probeing
>+ */
>+
>+static int phy_addr = PHY_MAX_ADDR;
>+module_param(phy_addr, ushort, 0);
>+MODULE_PARAM_DESC(phy_addr, "PHY address connected to the MACB");
>+
> static void __macb_set_hwaddr(struct macb *bp)
> {
> 	u32 bottom;
>@@ -193,7 +201,11 @@ static int macb_mii_probe(struct net_device *dev)
> 	struct eth_platform_data *pdata;
> 	int ret;
> 
>-	phydev = phy_find_first(bp->mii_bus);
>+	if (phy_addr >= PHY_MAX_ADDRESS)
>+		phydev = phy_find_first(bp->mii_bus);
>+	else
>+		phydev = bp->mii_bus->phy_map[phy_addr];
>+
> 	if (!phydev) {
> 		printk (KERN_ERR "%s: no PHY found\n", dev->name);
> 		return -1;
>-- 
>1.7.0.3
>

This is really ugly :( It should be done differently, in a more general way.
I've been thinking about this for a while. Maybe the solution is to integrate
the switch into the DSA subsystem. See net/dsa. Not sure though...

Jirka

^ permalink raw reply

* Re: iproute u32 filter - server hang
From: Patrick McHardy @ 2010-03-31 10:01 UTC (permalink / raw)
  To: Paweł Staszewski; +Cc: Linux Network Development list
In-Reply-To: <4BB31D15.3000500@itcare.pl>

Paweł Staszewski wrote:
> W dniu 2010-03-31 11:46, Patrick McHardy pisze:
>>>>> tc filter add dev eth0 protocol ip parent 1:101 u32 match ip
>>>>> protocol 1
>>>>> 0xff flowid 1:101
>>>>>
>>>>> ping 212.77.100.101
>>>>> And after this server will stop responding to anything - without any
>>>>> error (hang).
>>>>>
>>>>>          
>>>> This is caused by hfsc_classify() looping endlessly since the filter
>>>> points to the originating class. hfsc_bind_tcf() is actually supposed
>>>> to prevent this, but it only prevents resolving the filter immediately
>>>> and we still run into the loop at runtime.
>>>>
>>>> This patch (based on how CBQ handles this) should abort classification
>>>> and fall back to the default class. It would be better to simply catch
>>>> this at configuration time, but that looks a bit more involved. I'll
>>>> try
>>>> to look into it this weekend.
>>>>
>>>>
>>>>
>>>>        
>>> I check this also with htb and the same problem like with hfsc.
>>> This rules also hang my server.
>>>      
>> Yes, HTB doesn't even catch loops when binding filters. As I said,
>> its a larger piece of work, for now please just try the patch I
>> sent.
>>    
> 
> Yes.
> Your patch fix this problem.

Thanks for testing, I'll let you know once I have a complete
patch for this problem.

^ permalink raw reply

* Re: iproute u32 filter - server hang
From: Paweł Staszewski @ 2010-03-31  9:59 UTC (permalink / raw)
  To: Patrick McHardy; +Cc: Linux Network Development list
In-Reply-To: <4BB31A08.90809@trash.net>

W dniu 2010-03-31 11:46, Patrick McHardy pisze:
> Paweł Staszewski wrote:
>    
>> W dniu 2010-03-31 11:34, Patrick McHardy pisze:
>>      
>>> Paweł Staszewski wrote:
>>>
>>>        
>>>> I find some problem with iproute2 and u32 filters
>>>>
>>>> To reproduce the problem (need to make one mistake in filter parent
>>>> declaration 1:101):
>>>>
>>>> ...
>>>> tc filter add dev eth0 protocol ip parent 1:101 u32 match ip protocol 1
>>>> 0xff flowid 1:101
>>>>
>>>> ping 212.77.100.101
>>>> And after this server will stop responding to anything - without any
>>>> error (hang).
>>>>
>>>>          
>>> This is caused by hfsc_classify() looping endlessly since the filter
>>> points to the originating class. hfsc_bind_tcf() is actually supposed
>>> to prevent this, but it only prevents resolving the filter immediately
>>> and we still run into the loop at runtime.
>>>
>>> This patch (based on how CBQ handles this) should abort classification
>>> and fall back to the default class. It would be better to simply catch
>>> this at configuration time, but that looks a bit more involved. I'll try
>>> to look into it this weekend.
>>>
>>>
>>>
>>>        
>> I check this also with htb and the same problem like with hfsc.
>> This rules also hang my server.
>>      
> Yes, HTB doesn't even catch loops when binding filters. As I said,
> its a larger piece of work, for now please just try the patch I
> sent.
>    

Yes.
Your patch fixes this problem.


Thanks
Paweł


> --
> To unsubscribe from this list: send the line "unsubscribe netdev" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
>
>
>    

^ permalink raw reply

* Re: [PATCH][V2] MACB: Set PHY address in kernel parameters
From: Marc Kleine-Budde @ 2010-03-31  9:48 UTC (permalink / raw)
  To: Anders Darander
  Cc: Haavard Skinnemoen, David S. Miller, Jiri Pirko, Erik Waling,
	Patrick McHardy, Grant Likely, netdev, linux-kernel
In-Reply-To: <20100331093956.GI4837@datarespons.se>

[-- Attachment #1: Type: text/plain, Size: 1066 bytes --]

Anders Darander wrote:
> * Marc Kleine-Budde <mkl@pengutronix.de> [100331 11:18]:
>> We're using phy_mask in one of our projects, it's still using 2.6.29,
>> though. I think it's worth testing if it's still working.
> 
> Well, that was obviously something I overlooked / misinterpreted when I
> tried to solve our problem!
> 
> As I understand it, phy_mask is a bitfield for setting which PHYs should
> be enabled... Then this should have worked OK for us!

Yes, it's a bitmask of PHYs that should be probed.

>> static struct at91_eth_data __initdata p298_macb_data = {
>>        .is_rmii        = 0,
>>        .phy_mask       = ~(1 << 8),
>> };
> 
>> at91_add_device_eth(&p298_macb_data);
> 
> I've not tested it, but it should probably have worked.

Cheers, Marc

-- 
Pengutronix e.K.                  | Marc Kleine-Budde           |
Industrial Linux Solutions        | Phone: +49-231-2826-924     |
Vertretung West/Dortmund          | Fax:   +49-5121-206917-5555 |
Amtsgericht Hildesheim, HRA 2686  | http://www.pengutronix.de   |


[-- Attachment #2: OpenPGP digital signature --]
[-- Type: application/pgp-signature, Size: 260 bytes --]

^ permalink raw reply

* Re: iproute u32 filter - server hang
From: Patrick McHardy @ 2010-03-31  9:46 UTC (permalink / raw)
  To: Paweł Staszewski; +Cc: Linux Network Development list
In-Reply-To: <4BB31908.5090601@itcare.pl>

Paweł Staszewski wrote:
> W dniu 2010-03-31 11:34, Patrick McHardy pisze:
>> Paweł Staszewski wrote:
>>   
>>> I find some problem with iproute2 and u32 filters
>>>
>>> To reproduce the problem (need to make one mistake in filter parent
>>> declaration 1:101):
>>>
>>> ...
>>> tc filter add dev eth0 protocol ip parent 1:101 u32 match ip protocol 1
>>> 0xff flowid 1:101
>>>
>>> ping 212.77.100.101
>>> And after this server will stop responding to anything - without any
>>> error (hang).
>>>      
>> This is caused by hfsc_classify() looping endlessly since the filter
>> points to the originating class. hfsc_bind_tcf() is actually supposed
>> to prevent this, but it only prevents resolving the filter immediately
>> and we still run into the loop at runtime.
>>
>> This patch (based on how CBQ handles this) should abort classification
>> and fall back to the default class. It would be better to simply catch
>> this at configuration time, but that looks a bit more involved. I'll try
>> to look into it this weekend.
>>
>>
>>    
> I check this also with htb and the same problem like with hfsc.
> This rules also hang my server.

Yes, HTB doesn't even catch loops when binding filters. As I said,
it's a larger piece of work; for now please just try the patch I
sent.

^ permalink raw reply

* Re: iproute u32 filter - server hang
From: Paweł Staszewski @ 2010-03-31  9:42 UTC (permalink / raw)
  To: Patrick McHardy; +Cc: Linux Network Development list
In-Reply-To: <4BB31731.9060104@trash.net>

W dniu 2010-03-31 11:34, Patrick McHardy pisze:
> Paweł Staszewski wrote:
>    
>> I find some problem with iproute2 and u32 filters
>>
>> To reproduce the problem (need to make one mistake in filter parent
>> declaration 1:101):
>>
>> ...
>> tc filter add dev eth0 protocol ip parent 1:101 u32 match ip protocol 1
>> 0xff flowid 1:101
>>
>> ping 212.77.100.101
>> And after this server will stop responding to anything - without any
>> error (hang).
>>      
> This is caused by hfsc_classify() looping endlessly since the filter
> points to the originating class. hfsc_bind_tcf() is actually supposed
> to prevent this, but it only prevents resolving the filter immediately
> and we still run into the loop at runtime.
>
> This patch (based on how CBQ handles this) should abort classification
> and fall back to the default class. It would be better to simply catch
> this at configuration time, but that looks a bit more involved. I'll try
> to look into it this weekend.
>
>
>    
I checked this with htb as well and see the same problem as with hfsc.
These rules also hang my server.

tc qdisc del dev eth4 root
tc qdisc add dev eth4 root handle 1: htb default 63
tc class add dev eth4 parent 1: classid 1:1 htb rate 100mbit ceil 100mbit
tc class add dev eth4 parent 1:1 classid 1:2 htb rate 1mbit ceil 1mbit
tc class add dev eth4 parent 1:1 classid 1:63 htb rate 99mbit ceil 99mbit
tc class add dev eth4 parent 1:1 classid 1:101 htb rate 8kbit ceil 1mbit
tc class add dev eth4 parent 1:101 classid 1:102 htb rate 8kbit ceil 1mbit
tc filter add dev eth4 protocol ip parent 1: u32 match ip dst 
212.77.100.101 flowid 1:101
tc filter add dev eth4 protocol ip parent 1:101 u32 match ip protocol 1 
0xff flowid 1:101





^ permalink raw reply

* Re: [PATCH][V2] MACB: Set PHY address in kernel parameters
From: Anders Darander @ 2010-03-31  9:39 UTC (permalink / raw)
  To: Marc Kleine-Budde
  Cc: Haavard Skinnemoen, David S. Miller, Jiri Pirko, Erik Waling,
	Patrick McHardy, Grant Likely, netdev, linux-kernel
In-Reply-To: <4BB31345.5050101@pengutronix.de>

* Marc Kleine-Budde <mkl@pengutronix.de> [100331 11:18]:
> We're using phy_mask in one of our projects, it's still using 2.6.29,
> though. I think it's worth testing if it's still working.

Well, that was obviously something I overlooked / misinterpreted when I
tried to solve our problem!

As I understand it, phy_mask is a bitfield for setting which PHYs should
be enabled... Then this should have worked OK for us!

> static struct at91_eth_data __initdata p298_macb_data = {
>        .is_rmii        = 0,
>        .phy_mask       = ~(1 << 8),
> };

> at91_add_device_eth(&p298_macb_data);

I've not tested it, but it should probably have worked.

Regards,
Anders

^ permalink raw reply


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox