Netdev List
 help / color / mirror / Atom feed
* [RFC PATCH net-next v2 1/1] ipv6: add support of ECMP
From: Nicolas Dichtel @ 2012-09-14  7:59 UTC (permalink / raw)
  To: yoshfuji; +Cc: bernat, netdev, davem, Nicolas Dichtel
In-Reply-To: <1347609548-14494-1-git-send-email-nicolas.dichtel@6wind.com>

This patch adds the support of equal cost multipath for IPv6.

The patch is based on a previous work from
Luc Saillard <luc.saillard@6wind.com>.

Signed-off-by: Nicolas Dichtel <nicolas.dichtel@6wind.com>
---
 include/net/ip6_fib.h |  13 ++++
 net/ipv6/Kconfig      |  33 ++++++++
 net/ipv6/ip6_fib.c    |  73 ++++++++++++++++++
 net/ipv6/route.c      | 209 +++++++++++++++++++++++++++++++++++++++++++++++++-
 4 files changed, 325 insertions(+), 3 deletions(-)

diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h
index cd64cf3..37e502a 100644
--- a/include/net/ip6_fib.h
+++ b/include/net/ip6_fib.h
@@ -47,6 +47,10 @@ struct fib6_config {
 	unsigned long	fc_expires;
 	struct nlattr	*fc_mx;
 	int		fc_mx_len;
+#ifdef CONFIG_IPV6_MULTIPATH
+	struct nlattr	*fc_mp;
+	int		fc_mp_len;
+#endif
 
 	struct nl_info	fc_nlinfo;
 };
@@ -98,6 +102,15 @@ struct rt6_info {
 	struct fib6_node		*rt6i_node;
 
 	struct in6_addr			rt6i_gateway;
+#ifdef CONFIG_IPV6_MULTIPATH
+	/*
+	 * siblings is a list of rt6_info that have the the same metric/weight,
+	 * destination, but not the same gateway. nsiblings is just a cache
+	 * to speed up lookup.
+	 */
+	unsigned int			rt6i_nsiblings;
+	struct list_head		rt6i_siblings;
+#endif
 
 	atomic_t			rt6i_ref;
 
diff --git a/net/ipv6/Kconfig b/net/ipv6/Kconfig
index 4f7fe72..e0c92dc 100644
--- a/net/ipv6/Kconfig
+++ b/net/ipv6/Kconfig
@@ -266,4 +266,37 @@ config IPV6_PIMSM_V2
 	  Support for IPv6 PIM multicast routing protocol PIM-SMv2.
 	  If unsure, say N.
 
+config IPV6_MULTIPATH
+	bool "IPv6: equal cost multipath for IPv6 routing"
+	depends on IPV6
+	default y
+	---help---
+	  Enable this option to support ECMP for IPv6.
+
+choice
+	prompt "IPv6: choose Multipath algorithm"
+	depends on IPV6_MULTIPATH
+	default IPV6_MULTIPATH_HASH
+	---help---
+	  Define the method to select route between each possible path.
+	  The recommanded algorithm (by RFC4311) is HASH method.
+
+	config IPV6_MULTIPATH_HASH
+	bool "IPv6: MULTIPATH hash/flow algorithm"
+	---help---
+	  Multipath routes are chosen according to hash of packet header to
+	  ensure a flow keeps the same route.
+	  This algorithm is recommanded by RFC4311.
+
+	config IPV6_MULTIPATH_RR
+	bool "IPv6: MULTIPATH round robin algorithm"
+	---help---
+	  Multipath routes are chosen according to Round Robin.
+
+	config IPV6_MULTIPATH_RANDOM
+	bool "IPv6: MULTIPATH random algorithm"
+	---help---
+	  Multipath routes are chosen in a random fashion.
+endchoice
+
 endif # IPV6
diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c
index 13690d6..3541e44 100644
--- a/net/ipv6/ip6_fib.c
+++ b/net/ipv6/ip6_fib.c
@@ -672,6 +672,10 @@ static int fib6_add_rt2node(struct fib6_node *fn, struct rt6_info *rt,
 			    iter->rt6i_idev == rt->rt6i_idev &&
 			    ipv6_addr_equal(&iter->rt6i_gateway,
 					    &rt->rt6i_gateway)) {
+#ifdef CONFIG_IPV6_MULTIPATH
+				if (rt->rt6i_nsiblings)
+					rt->rt6i_nsiblings = 0;
+#endif
 				if (!(iter->rt6i_flags & RTF_EXPIRES))
 					return -EEXIST;
 				if (!(rt->rt6i_flags & RTF_EXPIRES))
@@ -680,6 +684,23 @@ static int fib6_add_rt2node(struct fib6_node *fn, struct rt6_info *rt,
 					rt6_set_expires(iter, rt->dst.expires);
 				return -EEXIST;
 			}
+#ifdef CONFIG_IPV6_MULTIPATH
+			/* If we have the same destination and the same metric,
+			 * but not the same gateway, then the route we try to
+			 * add is sibling to this route, increment our counter
+			 * of siblings, and later we will add our route to the
+			 * list.
+			 * Only static routes (which don't have flag
+			 * RTF_EXPIRES) are used for ECMPv6.
+			 *
+			 * To avoid long list, we only had siblings if the
+			 * route have a gateway.
+			 */
+			if (rt->rt6i_flags & RTF_GATEWAY &&
+			    !(rt->rt6i_flags & RTF_EXPIRES) &&
+			    !(iter->rt6i_flags & RTF_EXPIRES))
+				rt->rt6i_nsiblings++;
+#endif
 		}
 
 		if (iter->rt6i_metric > rt->rt6i_metric)
@@ -692,6 +713,43 @@ static int fib6_add_rt2node(struct fib6_node *fn, struct rt6_info *rt,
 	if (ins == &fn->leaf)
 		fn->rr_ptr = NULL;
 
+#ifdef CONFIG_IPV6_MULTIPATH
+	/* Link this route to others same route. */
+	if (rt->rt6i_nsiblings) {
+		unsigned int rt6i_nsiblings;
+		struct rt6_info *sibling, *temp_sibling;
+
+		/* Find the first route that have the same metric */
+		sibling = fn->leaf;
+		while (sibling) {
+			if (sibling->rt6i_metric == rt->rt6i_metric) {
+				list_add_tail(&rt->rt6i_siblings,
+					      &sibling->rt6i_siblings);
+				break;
+			}
+			sibling = sibling->dst.rt6_next;
+		}
+		/* For each sibling in the list, increment the counter of
+		 * siblings. We can check if all the counter are equal.
+		 */
+		rt6i_nsiblings = 0;
+		list_for_each_entry_safe(sibling, temp_sibling,
+					 &rt->rt6i_siblings,
+					 rt6i_siblings) {
+			sibling->rt6i_nsiblings++;
+			if (unlikely(sibling->rt6i_nsiblings !=
+				     rt->rt6i_nsiblings)) {
+				pr_err("Wrong number of siblings for route %p (%d)\n",
+				       sibling, sibling->rt6i_nsiblings);
+			}
+			rt6i_nsiblings++;
+		}
+		if (unlikely(rt6i_nsiblings != rt->rt6i_nsiblings)) {
+			pr_err("Wrong number of siblings for route %p. I have %d routes, but count %d siblings\n",
+			       rt, rt6i_nsiblings, rt->rt6i_nsiblings);
+		}
+	}
+#endif
 	/*
 	 *	insert node
 	 */
@@ -1197,6 +1255,21 @@ static void fib6_del_route(struct fib6_node *fn, struct rt6_info **rtp,
 	if (fn->rr_ptr == rt)
 		fn->rr_ptr = NULL;
 
+#ifdef CONFIG_IPV6_MULTIPATH
+	/* Remove this entry from other siblings */
+	if (rt->rt6i_nsiblings) {
+		struct rt6_info *sibling, *next_sibling;
+
+		/* For each siblings, decrement the counter of siblings */
+		list_for_each_entry_safe(sibling, next_sibling,
+					 &rt->rt6i_siblings, rt6i_siblings) {
+			sibling->rt6i_nsiblings--;
+		}
+		rt->rt6i_nsiblings = 0;
+		list_del_init(&rt->rt6i_siblings);
+	}
+#endif
+
 	/* Adjust walkers */
 	read_lock(&fib6_walker_lock);
 	FOR_WALKERS(w) {
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index 399613b..431f7ad 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -57,6 +57,9 @@
 #include <net/xfrm.h>
 #include <net/netevent.h>
 #include <net/netlink.h>
+#ifdef CONFIG_IPV6_MULTIPATH
+#include <net/nexthop.h>
+#endif
 
 #include <asm/uaccess.h>
 
@@ -288,6 +291,10 @@ static inline struct rt6_info *ip6_dst_alloc(struct net *net,
 
 		memset(dst + 1, 0, sizeof(*rt) - sizeof(*dst));
 		rt6_init_peer(rt, table ? &table->tb6_peers : net->ipv6.peers);
+#ifdef CONFIG_IPV6_MULTIPATH
+		INIT_LIST_HEAD(&rt->rt6i_siblings);
+		rt->rt6i_nsiblings = 0;
+#endif
 	}
 	return rt;
 }
@@ -388,6 +395,124 @@ static bool rt6_need_strict(const struct in6_addr *daddr)
 		(IPV6_ADDR_MULTICAST | IPV6_ADDR_LINKLOCAL | IPV6_ADDR_LOOPBACK);
 }
 
+#ifdef CONFIG_IPV6_MULTIPATH
+/*
+ *	Multipath route selection.
+ */
+
+#ifdef CONFIG_IPV6_MULTIPATH_RANDOM
+/*
+ * Pseudo random candidate function
+ */
+static int rt6_info_hash_randomfn(unsigned int candidate_count)
+{
+	return random32() % candidate_count;
+}
+#endif
+
+#ifdef CONFIG_IPV6_MULTIPATH_RR
+/*
+ * Fake Round Robin candidate function
+ * If we want real RR, we need to add a counter in each route
+ */
+static int rt6_info_hash_falserr(unsigned int candidate_count)
+{
+	static unsigned int seed;
+	seed++;
+	return seed % candidate_count;
+}
+#endif
+
+#ifdef CONFIG_IPV6_MULTIPATH_HASH
+/*
+ * Pseudo random candidate using the src port, and other information
+ * Adapted from fib_info_hashfn()
+ */
+static int rt6_info_hash_nhsfn(unsigned int candidate_count,
+			       const struct flowi6 *fl6)
+{
+	unsigned int val = fl6->flowi6_proto;
+
+	val ^= fl6->daddr.s6_addr32[0];
+	val ^= fl6->daddr.s6_addr32[1];
+	val ^= fl6->daddr.s6_addr32[2];
+	val ^= fl6->daddr.s6_addr32[3];
+
+	val ^= fl6->saddr.s6_addr32[0];
+	val ^= fl6->saddr.s6_addr32[1];
+	val ^= fl6->saddr.s6_addr32[2];
+	val ^= fl6->saddr.s6_addr32[3];
+
+	/* Work only if this not encapsulated */
+	switch (fl6->flowi6_proto) {
+	case IPPROTO_UDP:
+	case IPPROTO_TCP:
+	case IPPROTO_SCTP:
+		val ^= fl6->fl6_sport;
+		val ^= fl6->fl6_dport;
+		break;
+
+	case IPPROTO_ICMPV6:
+		val ^= fl6->fl6_icmp_type;
+		val ^= fl6->fl6_icmp_code;
+		break;
+	}
+	/* RFC6438 recommands to use flowlabel */
+	val ^= fl6->flowlabel;
+
+	/* Perhaps, we need to tune, this function? */
+	val = val ^ (val >> 7) ^ (val >> 12);
+	return val % candidate_count;
+}
+#endif
+
+/*
+ * This function return an index used to select (at random, round robin, ...)
+ * a route between any siblings.
+ *
+ * Note: fl6 can be NULL
+ */
+static unsigned int rt6_info_hashfn(const struct rt6_info *rt,
+				    const struct flowi6 *fl6)
+{
+	int candidate_count = rt->rt6i_nsiblings + 1;
+
+#if defined(CONFIG_IPV6_MULTIPATH_RR)
+	return rt6_info_hash_falserr(candidate_count);
+#elif defined(CONFIG_IPV6_MULTIPATH_RANDOM)
+	return rt6_info_hash_randomfn(candidate_count);
+#elif defined(CONFIG_IPV6_MULTIPATH_HASH)
+	if (fl6 == NULL)
+		return 0;
+	return rt6_info_hash_nhsfn(candidate_count, fl6);
+#else
+	return 0;
+#endif
+}
+
+static struct rt6_info *rt6_multipath_select(struct rt6_info *match,
+					     struct flowi6 *fl6)
+{
+	struct rt6_info *sibling, *next_sibling;
+	int route_choosen;
+
+	route_choosen = rt6_info_hashfn(match, fl6);
+	/* Don't change the route, if route_choosen == 0
+	 * (siblings does not include ourself)
+	 */
+	if (route_choosen)
+		list_for_each_entry_safe(sibling, next_sibling,
+				&match->rt6i_siblings, rt6i_siblings) {
+			route_choosen--;
+			if (route_choosen == 0) {
+				match = sibling;
+				break;
+			}
+		}
+	return match;
+}
+#endif /* CONFIG_IPV6_MULTIPATH */
+
 /*
  *	Route lookup. Any table->tb6_lock is implied.
  */
@@ -705,6 +830,10 @@ static struct rt6_info *ip6_pol_route_lookup(struct net *net,
 restart:
 	rt = fn->leaf;
 	rt = rt6_device_match(net, rt, &fl6->saddr, fl6->flowi6_oif, flags);
+#ifdef CONFIG_IPV6_MULTIPATH
+	if (rt->rt6i_nsiblings && fl6->flowi6_oif == 0)
+		rt = rt6_multipath_select(rt, fl6);
+#endif
 	BACKTRACK(net, &fl6->saddr);
 out:
 	dst_use(&rt->dst, jiffies);
@@ -866,7 +995,10 @@ restart_2:
 
 restart:
 	rt = rt6_select(fn, oif, strict | reachable);
-
+#ifdef CONFIG_IPV6_MULTIPATH
+	if (rt->rt6i_nsiblings && oif == 0)
+		rt = rt6_multipath_select(rt, fl6);
+#endif
 	BACKTRACK(net, &fl6->saddr);
 	if (rt == net->ipv6.ip6_null_entry ||
 	    rt->rt6i_flags & RTF_CACHE)
@@ -2247,6 +2379,9 @@ static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = {
 	[RTA_IIF]		= { .type = NLA_U32 },
 	[RTA_PRIORITY]          = { .type = NLA_U32 },
 	[RTA_METRICS]           = { .type = NLA_NESTED },
+#ifdef CONFIG_IPV6_MULTIPATH
+	[RTA_MULTIPATH]		= { .len = sizeof(struct rtnexthop) },
+#endif
 };
 
 static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
@@ -2324,11 +2459,69 @@ static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
 	if (tb[RTA_TABLE])
 		cfg->fc_table = nla_get_u32(tb[RTA_TABLE]);
 
+#ifdef CONFIG_IPV6_MULTIPATH
+	if (tb[RTA_MULTIPATH]) {
+		cfg->fc_mp = nla_data(tb[RTA_MULTIPATH]);
+		cfg->fc_mp_len = nla_len(tb[RTA_MULTIPATH]);
+	}
+#endif
+
 	err = 0;
 errout:
 	return err;
 }
 
+#ifdef CONFIG_IPV6_MULTIPATH
+static int ip6_route_multipath(struct fib6_config *cfg, int add)
+{
+	struct fib6_config r_cfg;
+	struct rtnexthop *rtnh;
+	int remaining;
+	int attrlen;
+	int err = 0, last_err = 0;
+
+beginning:
+	rtnh = (struct rtnexthop *)cfg->fc_mp;
+	remaining = cfg->fc_mp_len;
+
+	/* Parse a Multipath Entry */
+	while (rtnh_ok(rtnh, remaining)) {
+		memcpy(&r_cfg, cfg, sizeof(*cfg));
+		if (rtnh->rtnh_ifindex)
+			r_cfg.fc_ifindex = rtnh->rtnh_ifindex;
+
+		attrlen = rtnh_attrlen(rtnh);
+		if (attrlen > 0) {
+			struct nlattr *nla, *attrs = rtnh_attrs(rtnh);
+
+			nla = nla_find(attrs, attrlen, RTA_GATEWAY);
+			if (nla) {
+				nla_memcpy(&r_cfg.fc_gateway, nla, 16);
+				r_cfg.fc_flags |= RTF_GATEWAY;
+			}
+		}
+		err = add ? ip6_route_add(&r_cfg) : ip6_route_del(&r_cfg);
+		if (err) {
+			last_err = err;
+			/* If we are trying to remove a route, do not stop the
+			 * loop when ip6_route_del() fails (because next hop is
+			 * already gone), we should try to remove all next hops.
+			 */
+			if (add) {
+				/* If add fails, we should try to delete all
+				 * next hops that have been already added.
+				 */
+				add = 0;
+				goto beginning;
+			}
+		}
+		rtnh = rtnh_next(rtnh, &remaining);
+	}
+
+	return last_err;
+}
+#endif /* CONFIG_IPV6_MULTIPATH */
+
 static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
 {
 	struct fib6_config cfg;
@@ -2338,7 +2531,12 @@ static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *a
 	if (err < 0)
 		return err;
 
-	return ip6_route_del(&cfg);
+#ifdef CONFIG_IPV6_MULTIPATH
+	if (cfg.fc_mp)
+		return ip6_route_multipath(&cfg, 0);
+	else
+#endif
+		return ip6_route_del(&cfg);
 }
 
 static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
@@ -2350,7 +2548,12 @@ static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *a
 	if (err < 0)
 		return err;
 
-	return ip6_route_add(&cfg);
+#ifdef CONFIG_IPV6_MULTIPATH
+	if (cfg.fc_mp)
+		return ip6_route_multipath(&cfg, 1);
+	else
+#endif
+		return ip6_route_add(&cfg);
 }
 
 static inline size_t rt6_nlmsg_size(void)
-- 
1.7.12

^ permalink raw reply related

* Re[2]:  [PATCH V3 0/8] ipvs: IPv6 fragment handling for  IPVS
From: Julian Anastasov @ 2012-09-14  8:23 UTC (permalink / raw)
  To: Hans Schillstrom
  Cc: Jesper Dangaard Brouer, Hans Schillstrom, netdev, Patrick McHardy,
	Pablo Neira Ayuso, lvs-devel, Thomas Graf, Wensong Zhang,
	netfilter-devel, Simon Horman
In-Reply-To: <ee5vnhe.0891bc4ff9e38f64c4c5c0da6c50af1c@obelix.schillstrom.com>


	Hello,

On Thu, 13 Sep 2012, Hans Schillstrom wrote:

> >	I also see that we should not send ICMP
> >errors (FRAG NEEDED/TOO BIG) in response to large
> >ICMP error packets but it is not related to your changes,
> >it needs separate change to all transmitters.
> 
> Just a curious question, have you seen such big ICMP packets in real life ?
> (if you have normal MTU:s) 

	No, may be it can happen when PMTU reaches minimum.

Regards

--
Julian Anastasov <ja@ssi.bg>

^ permalink raw reply

* [PATCH v2 0/7] Introduce of_get_child_by_name.
From: Srinivas KANDAGATLA @ 2012-09-14  8:17 UTC (permalink / raw)
  To: davem, robherring2, bergner, devicetree-discuss
  Cc: benh, linuxppc-dev, netdev, jassi.brar, afleming,
	srinivas.kandagatla, grant.likely

From: Srinivas Kandagatla <srinivas.kandagatla@st.com>

This patch series introduces of_get_child_by_name function to get a 
child node by its name in a given parent node and also removes code 
duplication in some of the existing driver code by using of_get_child_by_name.

Normally if a driver want to get a child node it would iterate all the nodes
of parent and compare its name and then get it.
This use case is becoming common as device trees are used more, so moving this 
functionality to a libary function makes sense.

Having of_get_child_by_name libary function would avoid code duplication,
errors and is more convenient.

Changes from v1:
	-rename of_get_child to of_get_child_by_name.
	-remove read lock in of_get_child_by_name.
	-make use of of_get_child_by_name in the existing kernel code.

Srinivas Kandagatla (7):
  dt: introduce of_get_child_by_name to get child node by name.
  dt/powerpc: Use of_get_child_by_name to get a named child.
  dt/powerpc/powernv: Use of_get_child_by_name to get a named child.
  dt/powerpc/sysdev: Use of_get_child_by_name to get a named child.
  dt/net/fsl_pq_mdio: Use of_get_child_by_name to get a named child.
  dt/s3c64xx/spi: Use of_get_child_by_name to get a named child.
  dt/tty/opal: Use of_get_child_by_name to get a named child.

 arch/powerpc/kernel/prom.c                   |    5 +----
 arch/powerpc/platforms/powernv/opal.c        |    6 ++----
 arch/powerpc/sysdev/qe_lib/qe.c              |    5 +----
 drivers/net/ethernet/freescale/fsl_pq_mdio.c |    5 +----
 drivers/of/base.c                            |   25 +++++++++++++++++++++++++
 drivers/spi/spi-s3c64xx.c                    |    4 +---
 drivers/tty/hvc/hvc_opal.c                   |    9 ++-------
 include/linux/of.h                           |    2 ++
 8 files changed, 35 insertions(+), 26 deletions(-)

^ permalink raw reply

* [PATCH v2 1/7] dt: introduce of_get_child_by_name to get child node by name.
From: Srinivas KANDAGATLA @ 2012-09-14  8:17 UTC (permalink / raw)
  To: davem, robherring2, bergner, devicetree-discuss
  Cc: benh, linuxppc-dev, netdev, jassi.brar, afleming,
	srinivas.kandagatla, grant.likely

From: Srinivas Kandagatla <srinivas.kandagatla@st.com>

This patch introduces of_get_child_by_name function to get a child node
by its name in a given parent node.

Without this patch each driver code has to iterate the parent and do
a string compare, However having of_get_child_by_name libary function would
avoid code duplication, errors and is more convenient.

Signed-off-by: Srinivas Kandagatla <srinivas.kandagatla@st.com>
---
 drivers/of/base.c  |   25 +++++++++++++++++++++++++
 include/linux/of.h |    2 ++
 2 files changed, 27 insertions(+), 0 deletions(-)

diff --git a/drivers/of/base.c b/drivers/of/base.c
index d4a1c9a..7391f8a 100644
--- a/drivers/of/base.c
+++ b/drivers/of/base.c
@@ -391,6 +391,31 @@ struct device_node *of_get_next_available_child(const struct device_node *node,
 EXPORT_SYMBOL(of_get_next_available_child);
 
 /**
+ *	of_get_child_by_name - Find the child node by name for a given parent
+ *	@node:	parent node
+ *	@name:	child name to look for.
+ *
+ *      This function looks for child node for given matching name
+ *
+ *	Returns a node pointer if found, with refcount incremented, use
+ *	of_node_put() on it when done.
+ *	Returns NULL if node is not found.
+ */
+
+struct device_node *of_get_child_by_name(const struct device_node *node,
+				const char *name)
+{
+	struct device_node *child;
+
+	for_each_child_of_node(node, child)
+		if (child->name && (of_node_cmp(child->name, name) == 0)
+			&& of_node_get(child))
+			break;
+	return child;
+}
+EXPORT_SYMBOL(of_get_child_by_name);
+
+/**
  *	of_find_node_by_path - Find a node matching a full OF path
  *	@path:	The full path to match
  *
diff --git a/include/linux/of.h b/include/linux/of.h
index 1b11632..7b8e3cd 100644
--- a/include/linux/of.h
+++ b/include/linux/of.h
@@ -192,6 +192,8 @@ extern struct device_node *of_get_next_child(const struct device_node *node,
 					     struct device_node *prev);
 extern struct device_node *of_get_next_available_child(
 	const struct device_node *node, struct device_node *prev);
+extern struct device_node *of_get_child_by_name(const struct device_node *node,
+					const char *name);
 
 #define for_each_child_of_node(parent, child) \
 	for (child = of_get_next_child(parent, NULL); child != NULL; \
-- 
1.7.0.4

^ permalink raw reply related

* [PATCH v2 5/7] dt/net/fsl_pq_mdio: Use of_get_child_by_name to get a named child.
From: Srinivas KANDAGATLA @ 2012-09-14  8:19 UTC (permalink / raw)
  To: afleming
  Cc: davem, netdev, devicetree-discuss, robherring2,
	srinivas.kandagatla, grant.likely

From: Srinivas Kandagatla <srinivas.kandagatla@st.com>

As follow-up to "dt: introduce of_get_child_by_name to get child node by
name." patch, This patch removes some of the code duplication in the
driver by replacing it with of_get_child_by_name instead.

Signed-off-by: Srinivas Kandagatla <srinivas.kandagatla@st.com>
---
 drivers/net/ethernet/freescale/fsl_pq_mdio.c |    5 +----
 1 files changed, 1 insertions(+), 4 deletions(-)

diff --git a/drivers/net/ethernet/freescale/fsl_pq_mdio.c b/drivers/net/ethernet/freescale/fsl_pq_mdio.c
index 9527b28..e710c60 100644
--- a/drivers/net/ethernet/freescale/fsl_pq_mdio.c
+++ b/drivers/net/ethernet/freescale/fsl_pq_mdio.c
@@ -355,10 +355,7 @@ static int fsl_pq_mdio_probe(struct platform_device *ofdev)
 		goto err_free_irqs;
 	}
 
-	for_each_child_of_node(np, tbi) {
-		if (!strncmp(tbi->type, "tbi-phy", 8))
-			break;
-	}
+	tbi = of_get_child_by_name(np, "tbi-phy");
 
 	if (tbi) {
 		const u32 *prop = of_get_property(tbi, "reg", NULL);
-- 
1.7.0.4

^ permalink raw reply related

* Re: [PATCH v2] netfilter: take care of timewait sockets
From: Sami Farin @ 2012-09-14  9:30 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: Florian Westphal, Sami Farin, netdev
In-Reply-To: <1346629684.2563.78.camel@edumazet-glaptop>

On Mon, Sep 03, 2012 at 01:48:04 +0200, Eric Dumazet wrote:
> Sami Farin reported crashes in xt_LOG because it assumes skb->sk is a
> full blown socket.
> 
> But with TCP early demux, we can have skb->sk pointing to a timewait
> socket.
> 
> Same fix is needed in netfnetlink_log

with ip_early_demux=0 and without these patches I did not get panics for
7 days, and with the patches and ip_early_demux=1 it has worked OK for 5 days,
thanks.

-- 
Do what you love because life is too short for anything else.

^ permalink raw reply

* Re: [RFC PATCH net-next v2 0/1] Add support of ECMPv6
From: Vincent Bernat @ 2012-09-14  9:40 UTC (permalink / raw)
  To: Nicolas Dichtel; +Cc: yoshfuji, netdev, davem
In-Reply-To: <1347609548-14494-1-git-send-email-nicolas.dichtel@6wind.com>

 ❦ 14 septembre 2012 09:59 CEST, Nicolas Dichtel <nicolas.dichtel@6wind.com> :

> Here is an example of a command to add an ECMP route:
> $ ip -6 route add 3ffe:304:124:2306::/64 \
> 	nexthop via fe80::230:1bff:feb4:e05c dev eth0 weight 1 \
> 	nexthop via fe80::230:1bff:feb4:dd4f dev eth0 weight 1

When displaying ECMP routes, the display is different than for IPv4: we
get two distinct routes instead of an ECMP route (with nexthop
keyword).

With IPv4:

193.252.X.X/26  proto zebra  metric 20 
	nexthop via 193.252.X.X  dev bae1 weight 1
	nexthop via 193.252.X.X  dev bae2 weight 1

With IPv6:

2a01:c9c0:X:X::/64 via fe80::215:17ff:fe85:76b9 dev bae1 metric 11 
2a01:c9c0:X:X::/64 via fe80::222:91ff:fe4e:b000 dev bae2 metric 11

If I capture the netlink message from the add command, put it in a file
and use "ip monitor file ...", I see this:

2a01:c9c0:X:X::/64
	nexthop via fe80::215:17ff:fe85:76b9  dev if12 weight 1
	nexthop via fe80::222:91ff:fe4e:b000  dev if11 weight 1

Therefore, the problem is not in iproute2 which knows how to display
those ECMP routes. I fear that this difference make support in routing
daemons more difficult.
-- 
Make the coupling between modules visible.
            - The Elements of Programming Style (Kernighan & Plauger)

^ permalink raw reply

* tulip Ethernet driver messes up USB keyboard
From: Marti Raudsepp @ 2012-09-14  9:49 UTC (permalink / raw)
  To: Kernel hackers, Linux USB list, Linux network list
In-Reply-To: <CABRT9RDdbqz6iscOz7BYwEQPOo8xfstfUZyBqVw44gmPsauLTg-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>

Hi lists,

After installing an old 100Mbit PCI Ethernet card to my machine, it has
complained a few times about spurious interrupts ("nobody cared") at a
random time of the day. After the oops is reported, my USB keyboard (HP
smart card keyboard) stops working properly -- it has lots of delay, it
misses some keystrokes and also causes "stuck keys". Strangely enough my USB
mouse continues to work without problems. Apparently the USB controller and
the Ethernet card share an interrupt line. Un-/replugging the keyboard does
not help.

So far I have simply rebooted to fix the issue. I haven't been able to
reproduce this on will despite my best attempts.

The Ethernet card is driven by the tulip driver and is listed as "ADMtek
NC100 Network Everywhere Fast Ethernet 10/100 (rev 11)" in lspci. My
motherboard uses the Intel H77 chipset (ASRock H77 Pro4/MVP)

Here are two slightly different examples of the oops report; the effect is
the same:

kernel: irq 16: nobody cared (try booting with the "irqpoll" option)
kernel: Pid: 0, comm: swapper/2 Not tainted 3.5.3-1-ARCH #1
kernel: Call Trace:
kernel:  <IRQ>  [<ffffffff810d4d3d>] __report_bad_irq+0x3d/0xe0
kernel:  [<ffffffff810d5033>] note_interrupt+0x1a3/0x1f0
kernel:  [<ffffffff810d292f>] handle_irq_event_percpu+0xbf/0x260
kernel:  [<ffffffff810d2b18>] handle_irq_event+0x48/0x70
kernel:  [<ffffffff810d5b4a>] handle_fasteoi_irq+0x5a/0x100
kernel:  [<ffffffff810160c2>] handle_irq+0x22/0x40
kernel:  [<ffffffff81484cea>] do_IRQ+0x5a/0xe0
kernel:  [<ffffffff8147c12a>] common_interrupt+0x6a/0x6a
kernel:  <EOI>  [<ffffffffa02d8f01>] ? acpi_idle_enter_c1+0xda/0x104
[processor]
kernel:  [<ffffffffa02d8edc>] ? acpi_idle_enter_c1+0xb5/0x104 [processor]
kernel:  [<ffffffff8134f159>] cpuidle_enter+0x19/0x20
kernel:  [<ffffffff8134f7a6>] cpuidle_idle_call+0xa6/0x330
kernel:  [<ffffffff8101daaf>] cpu_idle+0xbf/0x130
kernel:  [<ffffffff8146a19b>] start_secondary+0x203/0x20a
kernel: handlers:
kernel: [<ffffffffa012c0b0>] usb_hcd_irq [usbcore]
kernel: [<ffffffffa023b1c0>] tulip_interrupt [tulip]
kernel: Disabling IRQ #16

kernel: irq 16: nobody cared (try booting with the "irqpoll" option)
kernel: Pid: 0, comm: swapper/0 Not tainted 3.5.3-1-ARCH #1
kernel: Call Trace:
kernel:  <IRQ>  [<ffffffff810d4d3d>] __report_bad_irq+0x3d/0xe0
kernel:  [<ffffffff810d5033>] note_interrupt+0x1a3/0x1f0
kernel:  [<ffffffff810d292f>] handle_irq_event_percpu+0xbf/0x260
kernel:  [<ffffffff810d2b18>] handle_irq_event+0x48/0x70
kernel:  [<ffffffff810d5b4a>] handle_fasteoi_irq+0x5a/0x100
kernel:  [<ffffffff810160c2>] handle_irq+0x22/0x40
kernel:  [<ffffffff81484cea>] do_IRQ+0x5a/0xe0
kernel:  [<ffffffff8147c12a>] common_interrupt+0x6a/0x6a
kernel:  <EOI>  [<ffffffffa01b6f01>] ? acpi_idle_enter_c1+0xda/0x104
[processor]
kernel:  [<ffffffffa01b6edc>] ? acpi_idle_enter_c1+0xb5/0x104 [processor]
kernel:  [<ffffffff8134f159>] cpuidle_enter+0x19/0x20
kernel:  [<ffffffff8134f7a6>] cpuidle_idle_call+0xa6/0x330
kernel:  [<ffffffff8101daaf>] cpu_idle+0xbf/0x130
kernel:  [<ffffffff81456fbc>] rest_init+0x80/0x84
kernel:  [<ffffffff818bbc35>] start_kernel+0x3c1/0x3ce
kernel:  [<ffffffff818bb673>] ? repair_env_string+0x5e/0x5e
kernel:  [<ffffffff818bb356>] x86_64_start_reservations+0x131/0x135
kernel:  [<ffffffff818bb45a>] x86_64_start_kernel+0x100/0x10f
kernel: handlers:
kernel: [<ffffffffa012c0b0>] usb_hcd_irq [usbcore]
kernel: [<ffffffffa02b41c0>] tulip_interrupt [tulip]
kernel: Disabling IRQ #16

These are after a fresh reboot...

% lspci |grep Eth
03:00.0 Ethernet controller: ADMtek NC100 Network Everywhere Fast Ethernet
10/100 (rev 11)
04:00.0 Ethernet controller: Realtek Semiconductor Co., Ltd. RTL8111/8168B
PCI Express Gigabit Ethernet controller (rev 06)

% dmesg |egrep -i 'eth|tulip'
[    7.868120] tulip: Linux Tulip driver version 1.1.15 (Feb 27, 2007)
[    7.868146] tulip: tulip_init_one: Enabled WOL support for AN983B
[    7.870487] tulip0:  MII transceiver #1 config 1000 status 786d
advertising 05e1
[    7.893274] net eth0: ADMtek Comet rev 17 at Port 0xe000,
00:06:25:48:7f:16, IRQ 16
[    7.902994] systemd-udevd[162]: renamed network interface eth0 to eth1
[    7.989822] r8169 Gigabit Ethernet driver 2.3LK-NAPI loaded
[    7.990148] r8169 0000:04:00.0: eth0: RTL8168evl/8111evl at
0xffffc90005798000, bc:5f:f4:44:29:6c, XID 0c900800 IRQ 47
[    7.990150] r8169 0000:04:00.0: eth0: jumbo features [frames: 9200 bytes,
tx checksumming: ko]
[   16.032206] r8169 0000:04:00.0: eth0: link down
[   16.032568] IPv6: ADDRCONF(NETDEV_UP): eth0: link is not ready
[   18.287417] r8169 0000:04:00.0: eth0: link up
[   18.287761] IPv6: ADDRCONF(NETDEV_CHANGE): eth0: link becomes ready
[   18.437518] tulip 0000:03:00.0: eth1: tulip_stop_rxtx() failed (CSR5
0xfc664010 CSR6 0xff972113)
[   18.437530] net eth1: Setting full-duplex based on MII#1 link partner
capability of 45e1

Somewhat suspect is the "tulip_stop_rxtx() failed" line. I also get that
line every time I stop the interface (using ip link set eth1 down):
Sep 14 12:40:11 wrx kernel: tulip 0000:03:00.0: eth1: tulip_stop_rxtx()
failed (CSR5 0xfc664010 CSR6 0xff972113)
Sep 14 12:40:11 wrx kernel: net eth1: Setting full-duplex based on MII#1
link partner capability of 45e1
Sep 14 12:40:12 wrx NetworkManager[386]: <info> (eth1): carrier now OFF
(device state 100, deferring action for 4 seconds)
Sep 14 12:40:12 wrx kernel: tulip 0000:03:00.0: eth1: tulip_stop_rxtx()
failed (CSR5 0xfc07c013 CSR6 0xff970111)

% cat /proc/interrupts
           CPU0       CPU1       CPU2       CPU3
  0:         48          0          0          0   IO-APIC-edge      timer
  1:          2          1          0          0   IO-APIC-edge      i8042
  8:        164          5          2          0   IO-APIC-edge      rtc0
  9:          0          0          0          0   IO-APIC-fasteoi   acpi
 12:          3          0          1          0   IO-APIC-edge      i8042
 16:        141        346    8067481       2394   IO-APIC-fasteoi
ehci_hcd:usb1, eth1
 23:        319      18623       7406      31659   IO-APIC-fasteoi
ehci_hcd:usb2
 40:          0          0          0          0   PCI-MSI-edge      PCIe
PME
 41:          0          0          0          0   PCI-MSI-edge      PCIe
PME
 42:          0          0          0          0   PCI-MSI-edge      PCIe
PME
 43:      22382      61895      31816      10217   PCI-MSI-edge      ahci
 44:          0          0          0          0   PCI-MSI-edge      ahci
 45:          0          0          0          0   PCI-MSI-edge
xhci_hcd
 46:         12          1          0          0   PCI-MSI-edge      mei
 47:          6          2          2      32904   PCI-MSI-edge      eth0
 48:      75136      39014      54183      22566   PCI-MSI-edge      i915
 49:        267        295        254         57   PCI-MSI-edge
snd_hda_intel
NMI:      40906      21860      26792      17063   Non-maskable interrupts
LOC:     362056     362594     377195     364303   Local timer interrupts
SPU:          0          0          0          0   Spurious interrupts
PMI:      40906      21860      26792      17063   Performance monitoring
interrupts
IWI:          6          3          3          2   IRQ work interrupts
RTR:          3          0          0          0   APIC ICR read retries
RES:     241774      18622      10902       4227   Rescheduling interrupts
CAL:        108        116        119        139   Function call interrupts
TLB:      26279      11870      16395      25314   TLB shootdowns
TRM:          0          0          0          0   Thermal event interrupts
THR:          0          0          0          0   Threshold APIC interrupts
MCE:          0          0          0          0   Machine check exceptions
MCP:         11         11         11         11   Machine check polls
ERR:          0
MIS:          0


Regards,
Marti
--
To unsubscribe from this list: send the line "unsubscribe linux-usb" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply

* [PATCH] i825xx: znet: fix compiler warnings when building a 64-bit kernel
From: Mika Westerberg @ 2012-09-14 10:33 UTC (permalink / raw)
  To: netdev; +Cc: davem, Mika Westerberg

When building 64-bit kernel with this driver we get following warnings from
the compiler:

drivers/net/ethernet/i825xx/znet.c: In function ‘hardware_init’:
drivers/net/ethernet/i825xx/znet.c:863:29: warning: cast from pointer to integer of different size [-Wpointer-to-int-cast]
drivers/net/ethernet/i825xx/znet.c:870:29: warning: cast from pointer to integer of different size [-Wpointer-to-int-cast]

Fix these by calling isa_virt_to_bus() before passing the pointers to
set_dma_addr().

Signed-off-by: Mika Westerberg <mika.westerberg@linux.intel.com>
---
 drivers/net/ethernet/i825xx/znet.c |    4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/i825xx/znet.c b/drivers/net/ethernet/i825xx/znet.c
index bd1f1ef..e843624 100644
--- a/drivers/net/ethernet/i825xx/znet.c
+++ b/drivers/net/ethernet/i825xx/znet.c
@@ -860,14 +860,14 @@ static void hardware_init(struct net_device *dev)
 	disable_dma(znet->rx_dma); 		/* reset by an interrupting task. */
 	clear_dma_ff(znet->rx_dma);
 	set_dma_mode(znet->rx_dma, DMA_RX_MODE);
-	set_dma_addr(znet->rx_dma, (unsigned int) znet->rx_start);
+	set_dma_addr(znet->rx_dma, isa_virt_to_bus(znet->rx_start));
 	set_dma_count(znet->rx_dma, RX_BUF_SIZE);
 	enable_dma(znet->rx_dma);
 	/* Now set up the Tx channel. */
 	disable_dma(znet->tx_dma);
 	clear_dma_ff(znet->tx_dma);
 	set_dma_mode(znet->tx_dma, DMA_TX_MODE);
-	set_dma_addr(znet->tx_dma, (unsigned int) znet->tx_start);
+	set_dma_addr(znet->tx_dma, isa_virt_to_bus(znet->tx_start));
 	set_dma_count(znet->tx_dma, znet->tx_buf_len<<1);
 	enable_dma(znet->tx_dma);
 	release_dma_lock(flags);
-- 
1.7.10.4

^ permalink raw reply related

* [PATCH net] batman-adv: make batadv_test_bit() return 0 or 1 only
From: Antonio Quartulli @ 2012-09-14 10:40 UTC (permalink / raw)
  To: davem-fT/PcQaiUtIeIZ0/mPfg9Q
  Cc: netdev-u79uwXL29TY76Z2rM5mHXA,
	b.a.t.m.a.n-ZwoEplunGu2X36UT3dwllkB+6BGkLq7r

From: Linus Lüssing <linus.luessing-S0/GAf8tV78@public.gmane.org>

On some architectures test_bit() can return other values than 0 or 1:

With a generic x86 OpenWrt image in a kvm setup (batadv_)test_bit()
frequently returns -1 for me, leading to batadv_iv_ogm_update_seqnos()
wrongly signaling a protected seqno window.

This patch tries to fix this issue by making batadv_test_bit() return 0
or 1 only.

Signed-off-by: Linus Lüssing <linus.luessing-S0/GAf8tV78@public.gmane.org>
Acked-by: Sven Eckelmann <sven-KaDOiPu9UxWEi8DpZVb4nw@public.gmane.org>
Signed-off-by: Antonio Quartulli <ordex-GaUfNO9RBHfsrOwW+9ziJQ@public.gmane.org>
---
 net/batman-adv/bitarray.h | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/net/batman-adv/bitarray.h b/net/batman-adv/bitarray.h
index a081ce1..cebaae7 100644
--- a/net/batman-adv/bitarray.h
+++ b/net/batman-adv/bitarray.h
@@ -20,8 +20,8 @@
 #ifndef _NET_BATMAN_ADV_BITARRAY_H_
 #define _NET_BATMAN_ADV_BITARRAY_H_
 
-/* returns true if the corresponding bit in the given seq_bits indicates true
- * and curr_seqno is within range of last_seqno
+/* Returns 1 if the corresponding bit in the given seq_bits indicates true
+ * and curr_seqno is within range of last_seqno. Otherwise returns 0.
  */
 static inline int batadv_test_bit(const unsigned long *seq_bits,
 				  uint32_t last_seqno, uint32_t curr_seqno)
@@ -32,7 +32,7 @@ static inline int batadv_test_bit(const unsigned long *seq_bits,
 	if (diff < 0 || diff >= BATADV_TQ_LOCAL_WINDOW_SIZE)
 		return 0;
 	else
-		return  test_bit(diff, seq_bits);
+		return test_bit(diff, seq_bits) != 0;
 }
 
 /* turn corresponding bit on, so we can remember that we got the packet */
-- 
1.7.12

^ permalink raw reply related

* Re: [PATCH] netfilter: Allow xt_nat.c and x_tables.c to compiled in
From: Patrick McHardy @ 2012-09-14 11:54 UTC (permalink / raw)
  To: Eric W. Biederman; +Cc: David Miller, netdev, netfilter-devel
In-Reply-To: <87627hfi69.fsf@xmission.com>

On Thu, 13 Sep 2012, Eric W. Biederman wrote:

> xt_init in x_tables.c must be called before xt_nat_init in xt_nat.c
> Reorder the makefile so that x_tables.o comes before xt_nat.o in
> netfilter.o.
>
> This allows me to built a kernel with both of these modules compiled in.

Thanks, we've already fixed that, the patch is queued in Pablo's tree.


^ permalink raw reply

* Re: Oops with latest (netfilter) nf-next tree, when unloading iptable_nat
From: Pablo Neira Ayuso @ 2012-09-14 12:07 UTC (permalink / raw)
  To: Florian Westphal
  Cc: Jesper Dangaard Brouer, netfilter-devel, netdev, yongjun_wei,
	kaber
In-Reply-To: <20120912213627.GJ14750@breakpoint.cc>

On Wed, Sep 12, 2012 at 11:36:27PM +0200, Florian Westphal wrote:
> Jesper Dangaard Brouer <brouer@redhat.com> wrote:
> 
> [ CC'd Patrick ]
> 
> > I'm hitting this general protection fault, when unloading iptables_nat.
> > [  524.591067] Pid: 5842, comm: modprobe Not tainted 3.6.0-rc3-pablo-nf-next+ #1 Red Hat KVM
> > [  524.591067] RIP: 0010:[<ffffffffa002c2fd>]  [<ffffffffa002c2fd>] nf_nat_proto_clean+0x6d/0xc0 [nf_nat]
> > [  524.591067] RSP: 0018:ffff880073203e18  EFLAGS: 00010246
> > [  524.591067] RAX: 0000000000000000 RBX: ffff880077dff2c8 RCX: ffff8800797fab70
> > [  524.591067] RDX: dead000000200200 RSI: ffff880073203e88 RDI: ffffffffa002f208
> > [  524.591067] RBP: ffff880073203e28 R08: ffff880073202000 R09: 0000000000000000
> > [  524.591067] R10: dead000000200200 R11: dead000000100100 R12: ffffffff81c6dc00
> >  list corruption?   ^^^^^^^^^^^^^^^^      ^^^^^^^^^^^^^^^^
> 
> Yep, looks like it.
> 
> > [  524.591067]  [<ffffffffa002c290>] ? nf_nat_net_exit+0x50/0x50 [nf_nat]
> > [  524.591067]  [<ffffffff815614e3>] nf_ct_iterate_cleanup+0xc3/0x170
> > [  524.591067]  [<ffffffffa002c54a>] nf_nat_l3proto_unregister+0x8a/0x100 [nf_nat]
> > [  524.591067]  [<ffffffff812a0303>] ? compat_prepare_timeout+0x13/0xb0
> > [  524.591067]  [<ffffffffa0035848>] nf_nat_l3proto_ipv4_exit+0x10/0x23 [nf_nat_ipv4]
> 
> On module removal nf_nat_ipv4 calls nf_iterate_cleanup which invokes
> nf_nat_proto_clean() for each conntrack.  That will then call
> hlist_del_rcu(&nat->bysource) using eachs conntracks nat ext area.
> 
> Problem is that nf_nat_proto_clean() is called multiple times for the same
> conntrack:
> a) nf_ct_iterate_cleanup() returns each ct twice (origin, reply)
> b) we call it both for l3 and for l4 protocol ids
> 
> We barf in hlist_del_rcu the 2nd time because ->pprev is poisoned.
> 
> This was introduced with the ipv6 nat patches.
> 
> --- a/net/netfilter/nf_nat_core.c
> +++ b/net/netfilter/nf_nat_core.c
> @@ -487,7 +487,7 @@ static int nf_nat_proto_clean(struct nf_conn *i, void *data)
>  
>         if (clean->hash) {
>                 spin_lock_bh(&nf_nat_lock);
> -               hlist_del_rcu(&nat->bysource);
> +               hlist_del_init_rcu(&nat->bysource);
>                 spin_unlock_bh(&nf_nat_lock);
>         } else {
> 
> Would probably avoid it.  I guess it would be nicer to only call this
> once for each ct.
> 
> Patrick, any other idea?

I already discussed this with Florian (I've been having problems with
two out of three of my email accounts this week... so I couldn't reply
to this email in the mailing list).

We can add nf_nat_iterate_cleanup that can iterate over the NAT
hashtable to replace current usage of nf_ct_iterate_cleanup.

^ permalink raw reply

* Re: [PATCH] netfilter: Allow xt_nat.c and x_tables.c to compiled in
From: Pablo Neira Ayuso @ 2012-09-14 12:08 UTC (permalink / raw)
  To: Patrick McHardy; +Cc: Eric W. Biederman, David Miller, netdev, netfilter-devel
In-Reply-To: <Pine.GSO.4.63.1209141353470.11622@stinky-local.trash.net>

On Fri, Sep 14, 2012 at 01:54:22PM +0200, Patrick McHardy wrote:
> On Thu, 13 Sep 2012, Eric W. Biederman wrote:
> 
> >xt_init in x_tables.c must be called before xt_nat_init in xt_nat.c
> >Reorder the makefile so that x_tables.o comes before xt_nat.o in
> >netfilter.o.
> >
> >This allows me to built a kernel with both of these modules compiled in.
> 
> Thanks, we've already fixed that, the patch is queued in Pablo's tree.

It should hit Linus tree soon, David pulled the fix yesterday.

^ permalink raw reply

* Re: [PATCH] netfilter: Allow xt_nat.c and x_tables.c to compiled in
From: Eric Dumazet @ 2012-09-14 12:25 UTC (permalink / raw)
  To: Pablo Neira Ayuso
  Cc: Patrick McHardy, Eric W. Biederman, David Miller, netdev,
	netfilter-devel
In-Reply-To: <20120914120853.GA6056@1984>

On Fri, 2012-09-14 at 14:08 +0200, Pablo Neira Ayuso wrote:
> On Fri, Sep 14, 2012 at 01:54:22PM +0200, Patrick McHardy wrote:
> > On Thu, 13 Sep 2012, Eric W. Biederman wrote:
> > 
> > >xt_init in x_tables.c must be called before xt_nat_init in xt_nat.c
> > >Reorder the makefile so that x_tables.o comes before xt_nat.o in
> > >netfilter.o.
> > >
> > >This allows me to built a kernel with both of these modules compiled in.
> > 
> > Thanks, we've already fixed that, the patch is queued in Pablo's tree.
> 
> It should hit Linus tree soon, David pulled the fix yesterday.
> --

Little correction :

Its in net-next, Linus tree doesnt need this fix yet.

http://git.kernel.org/?p=linux/kernel/git/davem/net-next.git;a=commitdiff;h=00545bec9412d130c77f72a08d6c8b6ad21d4a1e




^ permalink raw reply

* Re: [PATCH] Xen backend support for paged out grant targets.
From: Konrad Rzeszutek Wilk @ 2012-09-14 12:44 UTC (permalink / raw)
  To: Ian Campbell
  Cc: Andres Lagar-Cavilla, Andres Lagar-Cavilla,
	xen-devel@xen.lists.org, David Vrabel, David Miller,
	linux-kernel@vger.kernel.org, netdev@vger.kernel.org
In-Reply-To: <1347607141.25803.73.camel@dagon.hellion.org.uk>

On Fri, Sep 14, 2012 at 08:19:01AM +0100, Ian Campbell wrote:
> On Thu, 2012-09-13 at 20:45 +0100, Andres Lagar-Cavilla wrote:
> > On Sep 13, 2012, at 2:11 PM, Ian Campbell wrote:
> > 
> > > On Thu, 2012-09-13 at 18:28 +0100, Andres Lagar-Cavilla wrote:
> > >> 
> > >> * Add placeholder in array of grant table error descriptions for
> > >> unrelated error code we jump over. 
> > > 
> > > Why not just define it, it's listed here:
> > > http://xenbits.xen.org/docs/unstable/hypercall/include,public,grant_table.h.html#Enum_grant_status
> > Well, a) we'd be defining something no one will be using (for the
> > moment)
> 
> Even if no one in the kernel is using it, having "placeholder" as an
> entry in GNTTABOP_error_msgs is just silly, even things which don't
> understand GNTST_address_too_big directly could end up looking it up
> here.
> 
> >  b) I would be signing-off on something unrelated.
> 
> Lets take this patch instead then.
> 
> 8<------------------------------------------------
> 
> >From cb9daaf3029accb6d5fef58b450a625b27190429 Mon Sep 17 00:00:00 2001
> From: Ian Campbell <ian.campbell@citrix.com>
> Date: Fri, 14 Sep 2012 08:10:06 +0100
> Subject: [PATCH] xen: resynchronise grant table status codes with upstream
> 
> Adds GNTST_address_too_big and GNTST_eagain.
> 
> Signed-off-by: Ian Campbell <ian.campbell@citrix.com>

applied.

> ---
>  include/xen/interface/grant_table.h |    8 ++++++--
>  1 file changed, 6 insertions(+), 2 deletions(-)
> 
> diff --git a/include/xen/interface/grant_table.h b/include/xen/interface/grant_table.h
> index a17d844..84a8fbf 100644
> --- a/include/xen/interface/grant_table.h
> +++ b/include/xen/interface/grant_table.h
> @@ -519,7 +519,9 @@ DEFINE_GUEST_HANDLE_STRUCT(gnttab_get_version);
>  #define GNTST_no_device_space  (-7) /* Out of space in I/O MMU.              */
>  #define GNTST_permission_denied (-8) /* Not enough privilege for operation.  */
>  #define GNTST_bad_page         (-9) /* Specified page was invalid for op.    */
> -#define GNTST_bad_copy_arg    (-10) /* copy arguments cross page boundary */
> +#define GNTST_bad_copy_arg    (-10) /* copy arguments cross page boundary.   */
> +#define GNTST_address_too_big (-11) /* transfer page address too large.      */
> +#define GNTST_eagain          (-12) /* Operation not done; try again.        */
>  
>  #define GNTTABOP_error_msgs {                   \
>      "okay",                                     \
> @@ -532,7 +534,9 @@ DEFINE_GUEST_HANDLE_STRUCT(gnttab_get_version);
>      "no spare translation slot in the I/O MMU", \
>      "permission denied",                        \
>      "bad page",                                 \
> -    "copy arguments cross page boundary"        \
> +    "copy arguments cross page boundary",       \
> +    "page address size too large",              \
> +    "operation not done; try again"             \
>  }
>  
>  #endif /* __XEN_PUBLIC_GRANT_TABLE_H__ */
> -- 
> 1.7.10.4
> 
> 

^ permalink raw reply

* Re: [PATCH] netfilter: Allow xt_nat.c and x_tables.c to compiled in
From: Pablo Neira Ayuso @ 2012-09-14 12:46 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: Patrick McHardy, Eric W. Biederman, David Miller, netdev,
	netfilter-devel
In-Reply-To: <1347625514.26523.2.camel@edumazet-glaptop>

On Fri, Sep 14, 2012 at 02:25:14PM +0200, Eric Dumazet wrote:
> On Fri, 2012-09-14 at 14:08 +0200, Pablo Neira Ayuso wrote:
> > On Fri, Sep 14, 2012 at 01:54:22PM +0200, Patrick McHardy wrote:
> > > On Thu, 13 Sep 2012, Eric W. Biederman wrote:
> > > 
> > > >xt_init in x_tables.c must be called before xt_nat_init in xt_nat.c
> > > >Reorder the makefile so that x_tables.o comes before xt_nat.o in
> > > >netfilter.o.
> > > >
> > > >This allows me to built a kernel with both of these modules compiled in.
> > > 
> > > Thanks, we've already fixed that, the patch is queued in Pablo's tree.
> > 
> > It should hit Linus tree soon, David pulled the fix yesterday.
> > --
> 
> Little correction :
> 
> Its in net-next, Linus tree doesnt need this fix yet.
> 
> http://git.kernel.org/?p=linux/kernel/git/davem/net-next.git;a=commitdiff;h=00545bec9412d130c77f72a08d6c8b6ad21d4a1e

You're right, thanks for the clarification.

^ permalink raw reply

* RE: [PATCH] sch_red: fix weighted average calculation
From: Dowdal, John @ 2012-09-14 13:01 UTC (permalink / raw)
  To: Eric Dumazet, Chemparathy, Cyril
  Cc: linux-kernel@vger.kernel.org, netdev@vger.kernel.org,
	davem@davemloft.net, david.ward@ll.mit.edu,
	paul.gortmaker@windriver.com
In-Reply-To: <1347544431.13103.1557.camel@edumazet-glaptop>

Eric, thank you for reviewing the code.  I now see the problem with the patch since backlog is an integer and qavg is a fixed point number at logW.  

We are considering another patch to update the comments to this code (with the actual C code change reverted) to stop the FPP by showing the derivation of the equation in the comments.  Does this sound good?


-----Original Message-----
From: Eric Dumazet [mailto:eric.dumazet@gmail.com] 
Sent: Thursday, September 13, 2012 9:54 AM
To: Chemparathy, Cyril
Cc: linux-kernel@vger.kernel.org; netdev@vger.kernel.org; davem@davemloft.net; david.ward@ll.mit.edu; Dowdal, John; paul.gortmaker@windriver.com
Subject: Re: [PATCH] sch_red: fix weighted average calculation

On Thu, 2012-09-13 at 09:43 -0400, Cyril Chemparathy wrote:
> This patch fixes an apparent bug in the running weighted average calculation
> used in the RED algorithm.
> 
> Going by the described formula:
> 	   qavg = qavg*(1-W) + backlog*W
> 	=> qavg = qavg + (backlog - qavg) * W
> 
> ... with W converted to a pre-calculated shift, this then becomes:
> 	qavg = qavg + (backlog - qavg) >> logW
> 
> ... giving the modified expression introduced by this patch.
> 
> Signed-off-by: John Dowdal <jdowdal@ti.com>
> ---
>  include/net/red.h |    2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)
> 
> diff --git a/include/net/red.h b/include/net/red.h
> index ef46058..05960a4 100644
> --- a/include/net/red.h
> +++ b/include/net/red.h
> @@ -287,7 +287,7 @@ static inline unsigned long red_calc_qavg_no_idle_time(const struct red_parms *p
>  	 *
>  	 * --ANK (980924)
>  	 */
> -	return v->qavg + (backlog - (v->qavg >> p->Wlog));
> +	return v->qavg + (backlog - v->qavg) >> p->Wlog;
>  }
>  
>  static inline unsigned long red_calc_qavg(const struct red_parms *p,

This is going to be a FPP (Frequently Posted Patch)

Current formulae is fine.

Thats because backlog, at start of red_calc_qavg_no_idle_time() is not
yet scaled by p->Wlog. v->avg is scaled, but not backlog.

Have you tested RED after your patch ?



^ permalink raw reply

* RE: [PATCH] sch_red: fix weighted average calculation
From: Eric Dumazet @ 2012-09-14 13:13 UTC (permalink / raw)
  To: Dowdal, John
  Cc: Chemparathy, Cyril, linux-kernel@vger.kernel.org,
	netdev@vger.kernel.org, davem@davemloft.net,
	david.ward@ll.mit.edu, paul.gortmaker@windriver.com
In-Reply-To: <2D29FFBF554947468515B8EF6C733BB72E564F65@DLEE09.ent.ti.com>

On Fri, 2012-09-14 at 13:01 +0000, Dowdal, John wrote:
> Eric, thank you for reviewing the code.  I now see the problem with
> the patch since backlog is an integer and qavg is a fixed point number
> at logW.  
> 
> We are considering another patch to update the comments to this code
> (with the actual C code change reverted) to stop the FPP by showing
> the derivation of the equation in the comments.  Does this sound good?

This would be good, please do so.

Thanks

^ permalink raw reply

* Re: Oops with latest (netfilter) nf-next tree, when unloading iptable_nat
From: Patrick McHardy @ 2012-09-14 13:15 UTC (permalink / raw)
  To: Pablo Neira Ayuso
  Cc: Florian Westphal, Jesper Dangaard Brouer, netfilter-devel, netdev,
	yongjun_wei
In-Reply-To: <20120914120750.GA5764@1984>

[-- Attachment #1: Type: TEXT/PLAIN, Size: 2900 bytes --]

On Fri, 14 Sep 2012, Pablo Neira Ayuso wrote:

> On Wed, Sep 12, 2012 at 11:36:27PM +0200, Florian Westphal wrote:
>> Jesper Dangaard Brouer <brouer@redhat.com> wrote:
>>
>> [ CC'd Patrick ]
>>
>>> I'm hitting this general protection fault, when unloading iptables_nat.
>>> [  524.591067] Pid: 5842, comm: modprobe Not tainted 3.6.0-rc3-pablo-nf-next+ #1 Red Hat KVM
>>> [  524.591067] RIP: 0010:[<ffffffffa002c2fd>]  [<ffffffffa002c2fd>] nf_nat_proto_clean+0x6d/0xc0 [nf_nat]
>>> [  524.591067] RSP: 0018:ffff880073203e18  EFLAGS: 00010246
>>> [  524.591067] RAX: 0000000000000000 RBX: ffff880077dff2c8 RCX: ffff8800797fab70
>>> [  524.591067] RDX: dead000000200200 RSI: ffff880073203e88 RDI: ffffffffa002f208
>>> [  524.591067] RBP: ffff880073203e28 R08: ffff880073202000 R09: 0000000000000000
>>> [  524.591067] R10: dead000000200200 R11: dead000000100100 R12: ffffffff81c6dc00
>>>  list corruption?   ^^^^^^^^^^^^^^^^      ^^^^^^^^^^^^^^^^
>>
>> Yep, looks like it.
>>
>>> [  524.591067]  [<ffffffffa002c290>] ? nf_nat_net_exit+0x50/0x50 [nf_nat]
>>> [  524.591067]  [<ffffffff815614e3>] nf_ct_iterate_cleanup+0xc3/0x170
>>> [  524.591067]  [<ffffffffa002c54a>] nf_nat_l3proto_unregister+0x8a/0x100 [nf_nat]
>>> [  524.591067]  [<ffffffff812a0303>] ? compat_prepare_timeout+0x13/0xb0
>>> [  524.591067]  [<ffffffffa0035848>] nf_nat_l3proto_ipv4_exit+0x10/0x23 [nf_nat_ipv4]
>>
>> On module removal nf_nat_ipv4 calls nf_iterate_cleanup which invokes
>> nf_nat_proto_clean() for each conntrack.  That will then call
>> hlist_del_rcu(&nat->bysource) using eachs conntracks nat ext area.
>>
>> Problem is that nf_nat_proto_clean() is called multiple times for the same
>> conntrack:
>> a) nf_ct_iterate_cleanup() returns each ct twice (origin, reply)
>> b) we call it both for l3 and for l4 protocol ids
>>
>> We barf in hlist_del_rcu the 2nd time because ->pprev is poisoned.
>>
>> This was introduced with the ipv6 nat patches.
>>
>> --- a/net/netfilter/nf_nat_core.c
>> +++ b/net/netfilter/nf_nat_core.c
>> @@ -487,7 +487,7 @@ static int nf_nat_proto_clean(struct nf_conn *i, void *data)
>>
>>         if (clean->hash) {
>>                 spin_lock_bh(&nf_nat_lock);
>> -               hlist_del_rcu(&nat->bysource);
>> +               hlist_del_init_rcu(&nat->bysource);
>>                 spin_unlock_bh(&nf_nat_lock);
>>         } else {
>>
>> Would probably avoid it.  I guess it would be nicer to only call this
>> once for each ct.
>>
>> Patrick, any other idea?
>
> I already discussed this with Florian (I've been having problems with
> two out of three of my email accounts this week... so I couldn't reply
> to this email in the mailing list).
>
> We can add nf_nat_iterate_cleanup that can iterate over the NAT
> hashtable to replace current usage of nf_ct_iterate_cleanup.

Lets just bail out when IPS_SRC_NAT_DONE is not set, that should also fix
it. Could you try this patch please?

[-- Attachment #2: Type: TEXT/PLAIN, Size: 480 bytes --]

diff --git a/net/netfilter/nf_nat_core.c b/net/netfilter/nf_nat_core.c
index 29d4452..8b5d220 100644
--- a/net/netfilter/nf_nat_core.c
+++ b/net/netfilter/nf_nat_core.c
@@ -481,6 +481,8 @@ static int nf_nat_proto_clean(struct nf_conn *i, void *data)
 
 	if (!nat)
 		return 0;
+	if (!(i->status & IPS_SRC_NAT_DONE))
+		return 0;
 	if ((clean->l3proto && nf_ct_l3num(i) != clean->l3proto) ||
 	    (clean->l4proto && nf_ct_protonum(i) != clean->l4proto))
 		return 0;

^ permalink raw reply related

* Re: NULL deref in bnx2 / crashes ? ( was: netconsole leads to stalled CPU task )
From: Cong Wang @ 2012-09-14 13:22 UTC (permalink / raw)
  To: Sylvain Munaut; +Cc: Eric Dumazet, netdev
In-Reply-To: <CAF6-1L6RJkV6u4Rc2uDMQGUG23EGRKrdWSzTiHEYO9Z9ioGtaw@mail.gmail.com>

[-- Attachment #1: Type: text/plain, Size: 904 bytes --]

On 09/14/2012 01:35 AM, Sylvain Munaut wrote:
> Hi,
>
>> Yes, but I have some worries of why it is needed.
>>
>> Isnt it covering a bug elsewhere ?
>
> That may very well be.
>
> Of the few test servers I have running the same kernel, I just found
> the one with netconsole active to be "stuck".
>
> Not frozen, but all user process are hanged up and it's spitting
> message about processes and CPU being "stuck". The trace is different
> in each case depending on what the process was actually doing at the
> time it got stuck.
>
> No message sent to the netconsole with the root cause and nothing was
> written in the logs ...
>

Yeah, in this case, kdump is your friend. :)

Anyway, I think Eric is right, the bug may be in other place. I am 
wondering if the attached patch could help? It seems in netpoll tx path, 
we miss the chance of calling ->ndo_select_queue().

Please give it a try.

Thanks!

[-- Attachment #2: netpoll-txq.diff --]
[-- Type: text/x-patch, Size: 1645 bytes --]

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index ae3153c0..72661f6 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -1403,6 +1403,9 @@ static inline void netdev_for_each_tx_queue(struct net_device *dev,
 		f(dev, &dev->_tx[i], arg);
 }
 
+extern struct netdev_queue *netdev_pick_tx(struct net_device *dev,
+					   struct sk_buff *skb);
+
 /*
  * Net namespace inlines
  */
diff --git a/net/core/dev.c b/net/core/dev.c
index b1e6d63..d76bf73 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -2381,8 +2381,8 @@ static inline int get_xps_queue(struct net_device *dev, struct sk_buff *skb)
 #endif
 }
 
-static struct netdev_queue *dev_pick_tx(struct net_device *dev,
-					struct sk_buff *skb)
+struct netdev_queue *netdev_pick_tx(struct net_device *dev,
+				    struct sk_buff *skb)
 {
 	int queue_index;
 	const struct net_device_ops *ops = dev->netdev_ops;
@@ -2556,7 +2556,7 @@ int dev_queue_xmit(struct sk_buff *skb)
 
 	skb_update_prio(skb);
 
-	txq = dev_pick_tx(dev, skb);
+	txq = netdev_pick_tx(dev, skb);
 	q = rcu_dereference_bh(txq->qdisc);
 
 #ifdef CONFIG_NET_CLS_ACT
diff --git a/net/core/netpoll.c b/net/core/netpoll.c
index dd67818..77a0388 100644
--- a/net/core/netpoll.c
+++ b/net/core/netpoll.c
@@ -328,7 +328,7 @@ void netpoll_send_skb_on_dev(struct netpoll *np, struct sk_buff *skb,
 	if (skb_queue_len(&npinfo->txq) == 0 && !netpoll_owner_active(dev)) {
 		struct netdev_queue *txq;
 
-		txq = netdev_get_tx_queue(dev, skb_get_queue_mapping(skb));
+		txq = netdev_pick_tx(dev, skb);
 
 		/* try until next clock tick */
 		for (tries = jiffies_to_usecs(1)/USEC_PER_POLL;

^ permalink raw reply related

* Re: [RFC PATCH net-next v2 0/1] Add support of ECMPv6
From: Nicolas Dichtel @ 2012-09-14 13:35 UTC (permalink / raw)
  To: Vincent Bernat; +Cc: yoshfuji, netdev, davem
In-Reply-To: <87392l7xjc.fsf@guybrush.luffy.cx>

Le 14/09/2012 11:40, Vincent Bernat a écrit :
>   ❦ 14 septembre 2012 09:59 CEST, Nicolas Dichtel <nicolas.dichtel@6wind.com> :
>
>> Here is an example of a command to add an ECMP route:
>> $ ip -6 route add 3ffe:304:124:2306::/64 \
>> 	nexthop via fe80::230:1bff:feb4:e05c dev eth0 weight 1 \
>> 	nexthop via fe80::230:1bff:feb4:dd4f dev eth0 weight 1
In fact, I use this command as a shortcut. You can also use:
$ ip -6 route add 3ffe:304:124:2306::/64 via fe80::230:1bff:feb4:dd4f dev eth0
$ ip -6 route append 3ffe:304:124:2406::/64 via fe80::230:1bff:feb4:dd4f dev eth0

and these commands will output two standard netlink messages, without 'nexthop' 
lines.

>
> When displaying ECMP routes, the display is different than for IPv4: we
> get two distinct routes instead of an ECMP route (with nexthop
> keyword).
Sure, this implementation stores each 'nexthop' like a standard route in the 
kernel table.

>
> With IPv4:
>
> 193.252.X.X/26  proto zebra  metric 20
> 	nexthop via 193.252.X.X  dev bae1 weight 1
> 	nexthop via 193.252.X.X  dev bae2 weight 1
>
> With IPv6:
>
> 2a01:c9c0:X:X::/64 via fe80::215:17ff:fe85:76b9 dev bae1 metric 11
> 2a01:c9c0:X:X::/64 via fe80::222:91ff:fe4e:b000 dev bae2 metric 11
>
> If I capture the netlink message from the add command, put it in a file
> and use "ip monitor file ...", I see this:
>
> 2a01:c9c0:X:X::/64
> 	nexthop via fe80::215:17ff:fe85:76b9  dev if12 weight 1
> 	nexthop via fe80::222:91ff:fe4e:b000  dev if11 weight 1
>
> Therefore, the problem is not in iproute2 which knows how to display
> those ECMP routes. I fear that this difference make support in routing
> daemons more difficult.
Hmm, can you elaborate? Our routing daemon, quagga, manage it without any problem.

^ permalink raw reply

* Re: [RFC PATCH net-next v2 0/1] Add support of ECMPv6
From: Nicolas Dichtel @ 2012-09-14 13:37 UTC (permalink / raw)
  To: Vincent Bernat; +Cc: yoshfuji, netdev, davem
In-Reply-To: <5053329F.6030109@6wind.com>

Le 14/09/2012 15:35, Nicolas Dichtel a écrit :
> Le 14/09/2012 11:40, Vincent Bernat a écrit :
>>   ❦ 14 septembre 2012 09:59 CEST, Nicolas Dichtel <nicolas.dichtel@6wind.com> :
>>
>>> Here is an example of a command to add an ECMP route:
>>> $ ip -6 route add 3ffe:304:124:2306::/64 \
>>>     nexthop via fe80::230:1bff:feb4:e05c dev eth0 weight 1 \
>>>     nexthop via fe80::230:1bff:feb4:dd4f dev eth0 weight 1
> In fact, I use this command as a shortcut. You can also use:
> $ ip -6 route add 3ffe:304:124:2306::/64 via fe80::230:1bff:feb4:dd4f dev eth0
> $ ip -6 route append 3ffe:304:124:2406::/64 via fe80::230:1bff:feb4:dd4f dev eth0
Note also that with these commands, there is no need to patch iproute2.

^ permalink raw reply

* Re: tulip Ethernet driver messes up USB keyboard
From: Sergey Vlasov @ 2012-09-14 14:24 UTC (permalink / raw)
  To: Marti Raudsepp; +Cc: Kernel hackers, Linux USB list, Linux network list
In-Reply-To: <CABRT9RCWU1RqpAay9iWPKwduejvkdk_CA2P_gV70iiEE2FS5HQ-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>

[-- Attachment #1: Type: text/plain, Size: 1924 bytes --]

On Fri, Sep 14, 2012 at 12:49:45PM +0300, Marti Raudsepp wrote:
> After installing an old 100Mbit PCI Ethernet card to my machine, it has
> complained a few times about spurious interrupts ("nobody cared") at a
> random time of the day. After the oops is reported, my USB keyboard (HP
> smart card keyboard) stops working properly -- it has lots of delay, it
> misses some keystrokes and also causes "stuck keys". Strangely enough my USB
> mouse continues to work without problems. Apparently the USB controller and
> the Ethernet card share an interrupt line. Un-/replugging the keyboard does
> not help.
> 
> So far I have simply rebooted to fix the issue. I haven't been able to
> reproduce this on will despite my best attempts.
> 
> The Ethernet card is driven by the tulip driver and is listed as "ADMtek
> NC100 Network Everywhere Fast Ethernet 10/100 (rev 11)" in lspci. My
> motherboard uses the Intel H77 chipset (ASRock H77 Pro4/MVP)

This board uses the ASMedia ASM1083 PCIe-to-PCI bridge (or maybe ASM1085 -
hard to see from photos available on the net, and the PCI IDs for these
chips are the same), which is known to have problems with interrupt
handling:

  https://lkml.org/lkml/2012/1/30/216

I experience the same problem with ASUS P8P67 LE (except in my case the
interrupt is not shared with anything, and the network card does not stop
working completely, just gives horrible performance with > 100ms pings).

Some people use "noirqdebug irqpoll" as a workaround:

  http://lkml.indiana.edu/hypermail/linux/kernel/1204.1/03451.html

But this workaround basically turns off the protection against stuck
interrupts in the kernel, so the system could potentially be busy handling
a stuck interrupt forever.

You can also try to move the PCI card to another slot, so that it would
use another interrupt, which might be not shared with USB or another
important device.

[-- Attachment #2: Digital signature --]
[-- Type: application/pgp-signature, Size: 190 bytes --]

^ permalink raw reply

* [PATCH] Xen backend support for paged out grant targets V4.
From: Andres Lagar-Cavilla @ 2012-09-14 14:26 UTC (permalink / raw)
  To: xen-devel
  Cc: Konrad Rzeszutek Wilk, Ian Campbell, David Vrabel, David Miller,
	linux-kernel, netdev, Andres Lagar-Cavilla

Since Xen-4.2, hvm domains may have portions of their memory paged out. When a
foreign domain (such as dom0) attempts to map these frames, the map will
initially fail. The hypervisor returns a suitable errno, and kicks an
asynchronous page-in operation carried out by a helper. The foreign domain is
expected to retry the mapping operation until it eventually succeeds. The
foreign domain is not put to sleep because itself could be the one running the
pager assist (typical scenario for dom0).

This patch adds support for this mechanism for backend drivers using grant
mapping and copying operations. Specifically, this covers the blkback and
gntdev drivers (which map foregin grants), and the netback driver (which copies
foreign grants).

* Add a retry method for grants that fail with GNTST_eagain (i.e. because the
  target foregin frame is paged out).
* Insert hooks with appropriate wrappers in the aforementioned drivers.

The retry loop is only invoked if the grant operation status is GNTST_eagain.
It guarantees to leave a new status code different from GNTST_eagain. Any other
status code results in identical code execution as before.

The retry loop performs 256 attempts with increasing time intervals through a
32 second period. It uses msleep to yield while waiting for the next retry.

V2 after feedback from David Vrabel:
* Explicit MAX_DELAY instead of wrap-around delay into zero
* Abstract GNTST_eagain check into core grant table code for netback module.

V3 after feedback from Ian Campbell:
* Add placeholder in array of grant table error descriptions for unrelated
  error code we jump over.
* Eliminate single map and retry macro in favor of a generic batch flavor.
* Some renaming.
* Bury most implementation in grant_table.c, cleaner interface.

V4 rebased on top of sync of Xen grant table interface headers.

Signed-off-by: Andres Lagar-Cavilla <andres@lagarcavilla.org>
---
 drivers/net/xen-netback/netback.c  |   11 ++------
 drivers/xen/grant-table.c          |   53 ++++++++++++++++++++++++++++++++++++
 drivers/xen/xenbus/xenbus_client.c |    6 ++--
 include/xen/grant_table.h          |   12 ++++++++
 4 files changed, 70 insertions(+), 12 deletions(-)

diff --git a/drivers/net/xen-netback/netback.c b/drivers/net/xen-netback/netback.c
index 682633b..05593d8 100644
--- a/drivers/net/xen-netback/netback.c
+++ b/drivers/net/xen-netback/netback.c
@@ -635,9 +635,7 @@ static void xen_netbk_rx_action(struct xen_netbk *netbk)
 		return;
 
 	BUG_ON(npo.copy_prod > ARRAY_SIZE(netbk->grant_copy_op));
-	ret = HYPERVISOR_grant_table_op(GNTTABOP_copy, &netbk->grant_copy_op,
-					npo.copy_prod);
-	BUG_ON(ret != 0);
+	gnttab_batch_copy(netbk->grant_copy_op, npo.copy_prod);
 
 	while ((skb = __skb_dequeue(&rxq)) != NULL) {
 		sco = (struct skb_cb_overlay *)skb->cb;
@@ -1460,18 +1458,15 @@ static void xen_netbk_tx_submit(struct xen_netbk *netbk)
 static void xen_netbk_tx_action(struct xen_netbk *netbk)
 {
 	unsigned nr_gops;
-	int ret;
 
 	nr_gops = xen_netbk_tx_build_gops(netbk);
 
 	if (nr_gops == 0)
 		return;
-	ret = HYPERVISOR_grant_table_op(GNTTABOP_copy,
-					netbk->tx_copy_ops, nr_gops);
-	BUG_ON(ret);
 
-	xen_netbk_tx_submit(netbk);
+	gnttab_batch_copy(netbk->tx_copy_ops, nr_gops);
 
+	xen_netbk_tx_submit(netbk);
 }
 
 static void xen_netbk_idx_release(struct xen_netbk *netbk, u16 pending_idx)
diff --git a/drivers/xen/grant-table.c b/drivers/xen/grant-table.c
index eea81cf..f5681c8 100644
--- a/drivers/xen/grant-table.c
+++ b/drivers/xen/grant-table.c
@@ -38,6 +38,7 @@
 #include <linux/vmalloc.h>
 #include <linux/uaccess.h>
 #include <linux/io.h>
+#include <linux/delay.h>
 #include <linux/hardirq.h>
 
 #include <xen/xen.h>
@@ -823,6 +824,52 @@ unsigned int gnttab_max_grant_frames(void)
 }
 EXPORT_SYMBOL_GPL(gnttab_max_grant_frames);
 
+/* Handling of paged out grant targets (GNTST_eagain) */
+#define MAX_DELAY 256
+static inline void
+gnttab_retry_eagain_gop(unsigned int cmd, void *gop, int16_t *status,
+						const char *func)
+{
+	unsigned delay = 1;
+
+	do {
+		BUG_ON(HYPERVISOR_grant_table_op(cmd, gop, 1));
+		if (*status == GNTST_eagain)
+			msleep(delay++);
+	} while ((*status == GNTST_eagain) && (delay < MAX_DELAY));
+
+	if (delay >= MAX_DELAY) {
+		printk(KERN_ERR "%s: %s eagain grant\n", func, current->comm);
+		*status = GNTST_bad_page;
+	}
+}
+
+void gnttab_batch_map(struct gnttab_map_grant_ref *batch, unsigned count)
+{
+	struct gnttab_map_grant_ref *op;
+
+	if (HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, batch, count))
+		BUG();
+	for (op = batch; op < batch + count; op++)
+		if (op->status == GNTST_eagain)
+			gnttab_retry_eagain_gop(GNTTABOP_map_grant_ref, op,
+									&op->status, __func__);
+}
+EXPORT_SYMBOL_GPL(gnttab_batch_map);
+
+void gnttab_batch_copy(struct gnttab_copy *batch, unsigned count)
+{
+	struct gnttab_copy *op;
+
+	if (HYPERVISOR_grant_table_op(GNTTABOP_copy, batch, count))
+		BUG();
+	for (op = batch; op < batch + count; op++)
+		if (op->status == GNTST_eagain)
+			gnttab_retry_eagain_gop(GNTTABOP_copy, op, &op->status,
+									__func__);
+}
+EXPORT_SYMBOL_GPL(gnttab_batch_copy);
+
 int gnttab_map_refs(struct gnttab_map_grant_ref *map_ops,
 		    struct gnttab_map_grant_ref *kmap_ops,
 		    struct page **pages, unsigned int count)
@@ -836,6 +883,12 @@ int gnttab_map_refs(struct gnttab_map_grant_ref *map_ops,
 	if (ret)
 		return ret;
 
+	/* Retry eagain maps */
+	for (i = 0; i < count; i++)
+		if (map_ops[i].status == GNTST_eagain)
+			gnttab_retry_eagain_gop(GNTTABOP_map_grant_ref, map_ops + i,
+                                    &map_ops[i].status, __func__);
+
 	if (xen_feature(XENFEAT_auto_translated_physmap))
 		return ret;
 
diff --git a/drivers/xen/xenbus/xenbus_client.c b/drivers/xen/xenbus/xenbus_client.c
index b3e146e..bcf3ba4 100644
--- a/drivers/xen/xenbus/xenbus_client.c
+++ b/drivers/xen/xenbus/xenbus_client.c
@@ -490,8 +490,7 @@ static int xenbus_map_ring_valloc_pv(struct xenbus_device *dev,
 
 	op.host_addr = arbitrary_virt_to_machine(pte).maddr;
 
-	if (HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, &op, 1))
-		BUG();
+	gnttab_batch_map(&op, 1);
 
 	if (op.status != GNTST_okay) {
 		free_vm_area(area);
@@ -572,8 +571,7 @@ int xenbus_map_ring(struct xenbus_device *dev, int gnt_ref,
 	gnttab_set_map_op(&op, (unsigned long)vaddr, GNTMAP_host_map, gnt_ref,
 			  dev->otherend_id);
 
-	if (HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, &op, 1))
-		BUG();
+	gnttab_batch_map(&op, 1);
 
 	if (op.status != GNTST_okay) {
 		xenbus_dev_fatal(dev, op.status,
diff --git a/include/xen/grant_table.h b/include/xen/grant_table.h
index 11e27c3..da9386e 100644
--- a/include/xen/grant_table.h
+++ b/include/xen/grant_table.h
@@ -189,4 +189,16 @@ int gnttab_map_refs(struct gnttab_map_grant_ref *map_ops,
 int gnttab_unmap_refs(struct gnttab_unmap_grant_ref *unmap_ops,
 		      struct page **pages, unsigned int count, bool clear_pte);
 
+/* Perform a batch of grant map/copy operations. Retry every batch slot
+ * for which the hypervisor returns GNTST_eagain. This is typically due 
+ * to paged out target frames.
+ *
+ * Will retry for 1, 2, ... 255 ms, i.e. 256 times during 32 seconds.
+ *
+ * Return value in each iand every status field of the batch guaranteed
+ * to not be GNTST_eagain. 
+ */
+void gnttab_batch_map(struct gnttab_map_grant_ref *batch, unsigned count);
+void gnttab_batch_copy(struct gnttab_copy *batch, unsigned count);
+
 #endif /* __ASM_GNTTAB_H__ */
-- 
1.7.9.5

^ permalink raw reply related

* Re: [PATCH] Xen backend support for paged out grant targets.
From: Andres Lagar-Cavilla @ 2012-09-14 14:27 UTC (permalink / raw)
  To: Konrad Rzeszutek Wilk
  Cc: Ian Campbell, Andres Lagar-Cavilla, xen-devel@xen.lists.org,
	David Vrabel, David Miller, linux-kernel@vger.kernel.org,
	netdev@vger.kernel.org
In-Reply-To: <20120914124411.GB25249@phenom.dumpdata.com>


On Sep 14, 2012, at 8:44 AM, Konrad Rzeszutek Wilk wrote:

> On Fri, Sep 14, 2012 at 08:19:01AM +0100, Ian Campbell wrote:
>> On Thu, 2012-09-13 at 20:45 +0100, Andres Lagar-Cavilla wrote:
>>> On Sep 13, 2012, at 2:11 PM, Ian Campbell wrote:
>>> 
>>>> On Thu, 2012-09-13 at 18:28 +0100, Andres Lagar-Cavilla wrote:
>>>>> 
>>>>> * Add placeholder in array of grant table error descriptions for
>>>>> unrelated error code we jump over. 
>>>> 
>>>> Why not just define it, it's listed here:
>>>> http://xenbits.xen.org/docs/unstable/hypercall/include,public,grant_table.h.html#Enum_grant_status
>>> Well, a) we'd be defining something no one will be using (for the
>>> moment)
>> 
>> Even if no one in the kernel is using it, having "placeholder" as an
>> entry in GNTTABOP_error_msgs is just silly, even things which don't
>> understand GNTST_address_too_big directly could end up looking it up
>> here.
>> 
>>> b) I would be signing-off on something unrelated.
>> 
>> Lets take this patch instead then.
>> 
>> 8<------------------------------------------------
>> 
>>> From cb9daaf3029accb6d5fef58b450a625b27190429 Mon Sep 17 00:00:00 2001
>> From: Ian Campbell <ian.campbell@citrix.com>
>> Date: Fri, 14 Sep 2012 08:10:06 +0100
>> Subject: [PATCH] xen: resynchronise grant table status codes with upstream
>> 
>> Adds GNTST_address_too_big and GNTST_eagain.
>> 
>> Signed-off-by: Ian Campbell <ian.campbell@citrix.com>
> 
> applied.
Thanks. Rebased on top of this and just sent.
Andres
> 
>> ---
>> include/xen/interface/grant_table.h |    8 ++++++--
>> 1 file changed, 6 insertions(+), 2 deletions(-)
>> 
>> diff --git a/include/xen/interface/grant_table.h b/include/xen/interface/grant_table.h
>> index a17d844..84a8fbf 100644
>> --- a/include/xen/interface/grant_table.h
>> +++ b/include/xen/interface/grant_table.h
>> @@ -519,7 +519,9 @@ DEFINE_GUEST_HANDLE_STRUCT(gnttab_get_version);
>> #define GNTST_no_device_space  (-7) /* Out of space in I/O MMU.              */
>> #define GNTST_permission_denied (-8) /* Not enough privilege for operation.  */
>> #define GNTST_bad_page         (-9) /* Specified page was invalid for op.    */
>> -#define GNTST_bad_copy_arg    (-10) /* copy arguments cross page boundary */
>> +#define GNTST_bad_copy_arg    (-10) /* copy arguments cross page boundary.   */
>> +#define GNTST_address_too_big (-11) /* transfer page address too large.      */
>> +#define GNTST_eagain          (-12) /* Operation not done; try again.        */
>> 
>> #define GNTTABOP_error_msgs {                   \
>>     "okay",                                     \
>> @@ -532,7 +534,9 @@ DEFINE_GUEST_HANDLE_STRUCT(gnttab_get_version);
>>     "no spare translation slot in the I/O MMU", \
>>     "permission denied",                        \
>>     "bad page",                                 \
>> -    "copy arguments cross page boundary"        \
>> +    "copy arguments cross page boundary",       \
>> +    "page address size too large",              \
>> +    "operation not done; try again"             \
>> }
>> 
>> #endif /* __XEN_PUBLIC_GRANT_TABLE_H__ */
>> -- 
>> 1.7.10.4
>> 
>> 

^ permalink raw reply


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox