Netdev List
 help / color / mirror / Atom feed
* [PATCH 3/5] ipv4: Remove rt_key_{src,dst,tos} from struct rtable.
From: David Miller @ 2012-07-01 12:02 UTC (permalink / raw)
  To: netdev


They are always used in contexts where they can be reconstituted,
or where the finally resolved rt->rt_{src,dst} is semantically
equivalent.

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/route.h     |    5 -----
 net/ipv4/route.c        |   39 +++++++++------------------------------
 net/ipv4/xfrm4_policy.c |    3 ---
 3 files changed, 9 insertions(+), 38 deletions(-)

diff --git a/include/net/route.h b/include/net/route.h
index 04c1b7f..c5f2955 100644
--- a/include/net/route.h
+++ b/include/net/route.h
@@ -45,14 +45,9 @@ struct fib_info;
 struct rtable {
 	struct dst_entry	dst;
 
-	/* Lookup key. */
-	__be32			rt_key_dst;
-	__be32			rt_key_src;
-
 	int			rt_genid;
 	unsigned int		rt_flags;
 	__u16			rt_type;
-	__u8			rt_key_tos;
 
 	__be32			rt_dst;	/* Path destination	*/
 	__be32			rt_src;	/* Path source		*/
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 4377e01..ec38892 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -1219,12 +1219,9 @@ static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
 #endif
 	rth->dst.output = ip_rt_bug;
 
-	rth->rt_key_dst	= daddr;
-	rth->rt_key_src	= saddr;
 	rth->rt_genid	= rt_genid(dev_net(dev));
 	rth->rt_flags	= RTCF_MULTICAST;
 	rth->rt_type	= RTN_MULTICAST;
-	rth->rt_key_tos	= tos;
 	rth->rt_dst	= daddr;
 	rth->rt_src	= saddr;
 	rth->rt_route_iif = dev->ifindex;
@@ -1344,12 +1341,9 @@ static int __mkroute_input(struct sk_buff *skb,
 		goto cleanup;
 	}
 
-	rth->rt_key_dst	= daddr;
-	rth->rt_key_src	= saddr;
 	rth->rt_genid = rt_genid(dev_net(rth->dst.dev));
 	rth->rt_flags = flags;
 	rth->rt_type = res->type;
-	rth->rt_key_tos	= tos;
 	rth->rt_dst	= daddr;
 	rth->rt_src	= saddr;
 	rth->rt_route_iif = in_dev->dev->ifindex;
@@ -1518,12 +1512,9 @@ local_input:
 	rth->dst.tclassid = itag;
 #endif
 
-	rth->rt_key_dst	= daddr;
-	rth->rt_key_src	= saddr;
 	rth->rt_genid = rt_genid(net);
 	rth->rt_flags 	= flags|RTCF_LOCAL;
 	rth->rt_type	= res.type;
-	rth->rt_key_tos	= tos;
 	rth->rt_dst	= daddr;
 	rth->rt_src	= saddr;
 	rth->rt_route_iif = dev->ifindex;
@@ -1626,9 +1617,7 @@ EXPORT_SYMBOL(ip_route_input);
 
 /* called with rcu_read_lock() */
 static struct rtable *__mkroute_output(const struct fib_result *res,
-				       const struct flowi4 *fl4,
-				       __be32 orig_daddr, __be32 orig_saddr,
-				       int orig_oif, __u8 orig_rtos,
+				       const struct flowi4 *fl4, int orig_oif,
 				       struct net_device *dev_out,
 				       unsigned int flags)
 {
@@ -1679,12 +1668,9 @@ static struct rtable *__mkroute_output(const struct fib_result *res,
 
 	rth->dst.output = ip_output;
 
-	rth->rt_key_dst	= orig_daddr;
-	rth->rt_key_src	= orig_saddr;
 	rth->rt_genid = rt_genid(dev_net(dev_out));
 	rth->rt_flags	= flags;
 	rth->rt_type	= type;
-	rth->rt_key_tos	= orig_rtos;
 	rth->rt_dst	= fl4->daddr;
 	rth->rt_src	= fl4->saddr;
 	rth->rt_route_iif = 0;
@@ -1738,8 +1724,6 @@ struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *fl4)
 	unsigned int flags = 0;
 	struct fib_result res;
 	struct rtable *rth;
-	__be32 orig_daddr;
-	__be32 orig_saddr;
 	int orig_oif;
 
 	res.fi		= NULL;
@@ -1748,8 +1732,6 @@ struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *fl4)
 	res.r		= NULL;
 #endif
 
-	orig_daddr = fl4->daddr;
-	orig_saddr = fl4->saddr;
 	orig_oif = fl4->flowi4_oif;
 
 	fl4->flowi4_iif = net->loopback_dev->ifindex;
@@ -1911,8 +1893,7 @@ struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *fl4)
 
 
 make_route:
-	rth = __mkroute_output(&res, fl4, orig_daddr, orig_saddr, orig_oif,
-			       tos, dev_out, flags);
+	rth = __mkroute_output(&res, fl4, orig_oif, dev_out, flags);
 	if (!IS_ERR(rth))
 		rth = rt_finalize(rth, NULL);
 
@@ -1973,9 +1954,6 @@ struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_or
 		if (new->dev)
 			dev_hold(new->dev);
 
-		rt->rt_key_dst = ort->rt_key_dst;
-		rt->rt_key_src = ort->rt_key_src;
-		rt->rt_key_tos = ort->rt_key_tos;
 		rt->rt_route_iif = ort->rt_route_iif;
 		rt->rt_iif = ort->rt_iif;
 		rt->rt_oif = ort->rt_oif;
@@ -2017,7 +1995,7 @@ struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4,
 }
 EXPORT_SYMBOL_GPL(ip_route_output_flow);
 
-static int rt_fill_info(struct net *net,
+static int rt_fill_info(struct net *net,  __be32 src, u8 tos,
 			struct sk_buff *skb, u32 pid, u32 seq, int event,
 			int nowait, unsigned int flags)
 {
@@ -2035,7 +2013,7 @@ static int rt_fill_info(struct net *net,
 	r->rtm_family	 = AF_INET;
 	r->rtm_dst_len	= 32;
 	r->rtm_src_len	= 0;
-	r->rtm_tos	= rt->rt_key_tos;
+	r->rtm_tos	= tos;
 	r->rtm_table	= RT_TABLE_MAIN;
 	if (nla_put_u32(skb, RTA_TABLE, RT_TABLE_MAIN))
 		goto nla_put_failure;
@@ -2048,9 +2026,9 @@ static int rt_fill_info(struct net *net,
 
 	if (nla_put_be32(skb, RTA_DST, rt->rt_dst))
 		goto nla_put_failure;
-	if (rt->rt_key_src) {
+	if (src) {
 		r->rtm_src_len = 32;
-		if (nla_put_be32(skb, RTA_SRC, rt->rt_key_src))
+		if (nla_put_be32(skb, RTA_SRC, src))
 			goto nla_put_failure;
 	}
 	if (rt->dst.dev &&
@@ -2062,7 +2040,7 @@ static int rt_fill_info(struct net *net,
 		goto nla_put_failure;
 #endif
 	if (!rt_is_input_route(rt) &&
-	    rt->rt_src != rt->rt_key_src) {
+	    rt->rt_src != src) {
 		if (nla_put_be32(skb, RTA_PREFSRC, rt->rt_src))
 			goto nla_put_failure;
 	}
@@ -2213,7 +2191,8 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, void
 	if (rtm->rtm_flags & RTM_F_NOTIFY)
 		rt->rt_flags |= RTCF_NOTIFY;
 
-	err = rt_fill_info(net, skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq,
+	err = rt_fill_info(net, src, rtm->rtm_tos, skb,
+			   NETLINK_CB(in_skb).pid, nlh->nlmsg_seq,
 			   RTM_NEWROUTE, 0, 0);
 	if (err <= 0)
 		goto errout_free;
diff --git a/net/ipv4/xfrm4_policy.c b/net/ipv4/xfrm4_policy.c
index 9815ea0..01808c5 100644
--- a/net/ipv4/xfrm4_policy.c
+++ b/net/ipv4/xfrm4_policy.c
@@ -79,9 +79,6 @@ static int xfrm4_fill_dst(struct xfrm_dst *xdst, struct net_device *dev,
 	struct rtable *rt = (struct rtable *)xdst->route;
 	const struct flowi4 *fl4 = &fl->u.ip4;
 
-	xdst->u.rt.rt_key_dst = fl4->daddr;
-	xdst->u.rt.rt_key_src = fl4->saddr;
-	xdst->u.rt.rt_key_tos = fl4->flowi4_tos;
 	xdst->u.rt.rt_route_iif = fl4->flowi4_iif;
 	xdst->u.rt.rt_iif = fl4->flowi4_iif;
 	xdst->u.rt.rt_oif = fl4->flowi4_oif;
-- 
1.7.10

^ permalink raw reply related

* [PATCH 4/5] ipv4: Kill 'rt_src' from 'struct rtable'
From: David Miller @ 2012-07-01 12:02 UTC (permalink / raw)
  To: netdev


Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/route.h     |    1 -
 net/ipv4/route.c        |   34 +++++++++++++++-------------------
 net/ipv4/xfrm4_policy.c |    1 -
 3 files changed, 15 insertions(+), 21 deletions(-)

diff --git a/include/net/route.h b/include/net/route.h
index c5f2955..ad387d2 100644
--- a/include/net/route.h
+++ b/include/net/route.h
@@ -50,7 +50,6 @@ struct rtable {
 	__u16			rt_type;
 
 	__be32			rt_dst;	/* Path destination	*/
-	__be32			rt_src;	/* Path source		*/
 	int			rt_route_iif;
 	int			rt_iif;
 	int			rt_oif;
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index ec38892..a43a39b 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -1223,7 +1223,6 @@ static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
 	rth->rt_flags	= RTCF_MULTICAST;
 	rth->rt_type	= RTN_MULTICAST;
 	rth->rt_dst	= daddr;
-	rth->rt_src	= saddr;
 	rth->rt_route_iif = dev->ifindex;
 	rth->rt_iif	= dev->ifindex;
 	rth->rt_oif	= 0;
@@ -1345,7 +1344,6 @@ static int __mkroute_input(struct sk_buff *skb,
 	rth->rt_flags = flags;
 	rth->rt_type = res->type;
 	rth->rt_dst	= daddr;
-	rth->rt_src	= saddr;
 	rth->rt_route_iif = in_dev->dev->ifindex;
 	rth->rt_iif 	= in_dev->dev->ifindex;
 	rth->rt_oif 	= 0;
@@ -1516,7 +1514,6 @@ local_input:
 	rth->rt_flags 	= flags|RTCF_LOCAL;
 	rth->rt_type	= res.type;
 	rth->rt_dst	= daddr;
-	rth->rt_src	= saddr;
 	rth->rt_route_iif = dev->ifindex;
 	rth->rt_iif	= dev->ifindex;
 	rth->rt_oif	= 0;
@@ -1672,7 +1669,6 @@ static struct rtable *__mkroute_output(const struct fib_result *res,
 	rth->rt_flags	= flags;
 	rth->rt_type	= type;
 	rth->rt_dst	= fl4->daddr;
-	rth->rt_src	= fl4->saddr;
 	rth->rt_route_iif = 0;
 	rth->rt_iif	= orig_oif ? : dev_out->ifindex;
 	rth->rt_oif	= orig_oif;
@@ -1963,7 +1959,6 @@ struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_or
 		rt->rt_flags = ort->rt_flags;
 		rt->rt_type = ort->rt_type;
 		rt->rt_dst = ort->rt_dst;
-		rt->rt_src = ort->rt_src;
 		rt->rt_gateway = ort->rt_gateway;
 		rt_transfer_peer(rt, ort);
 		rt->fi = ort->fi;
@@ -1995,7 +1990,7 @@ struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4,
 }
 EXPORT_SYMBOL_GPL(ip_route_output_flow);
 
-static int rt_fill_info(struct net *net,  __be32 src, u8 tos,
+static int rt_fill_info(struct net *net,  __be32 src, struct flowi4 *fl4,
 			struct sk_buff *skb, u32 pid, u32 seq, int event,
 			int nowait, unsigned int flags)
 {
@@ -2013,7 +2008,7 @@ static int rt_fill_info(struct net *net,  __be32 src, u8 tos,
 	r->rtm_family	 = AF_INET;
 	r->rtm_dst_len	= 32;
 	r->rtm_src_len	= 0;
-	r->rtm_tos	= tos;
+	r->rtm_tos	= fl4->flowi4_tos;
 	r->rtm_table	= RT_TABLE_MAIN;
 	if (nla_put_u32(skb, RTA_TABLE, RT_TABLE_MAIN))
 		goto nla_put_failure;
@@ -2040,11 +2035,11 @@ static int rt_fill_info(struct net *net,  __be32 src, u8 tos,
 		goto nla_put_failure;
 #endif
 	if (!rt_is_input_route(rt) &&
-	    rt->rt_src != src) {
-		if (nla_put_be32(skb, RTA_PREFSRC, rt->rt_src))
+	    fl4->saddr != src) {
+		if (nla_put_be32(skb, RTA_PREFSRC, fl4->saddr))
 			goto nla_put_failure;
 	}
-	if (rt->rt_dst != rt->rt_gateway &&
+	if (fl4->daddr != rt->rt_gateway &&
 	    nla_put_be32(skb, RTA_GATEWAY, rt->rt_gateway))
 		goto nla_put_failure;
 
@@ -2080,7 +2075,7 @@ static int rt_fill_info(struct net *net,  __be32 src, u8 tos,
 		if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) &&
 		    IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
 			int err = ipmr_get_route(net, skb,
-						 rt->rt_src, rt->rt_dst,
+						 fl4->saddr, fl4->daddr,
 						 r, nowait);
 			if (err <= 0) {
 				if (!nowait) {
@@ -2116,6 +2111,7 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, void
 	struct rtmsg *rtm;
 	struct nlattr *tb[RTA_MAX+1];
 	struct rtable *rt = NULL;
+	struct flowi4 fl4;
 	__be32 dst = 0;
 	__be32 src = 0;
 	u32 iif;
@@ -2150,6 +2146,13 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, void
 	iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
 	mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0;
 
+	memset(&fl4, 0, sizeof(fl4));
+	fl4.daddr = dst;
+	fl4.saddr = src;
+	fl4.flowi4_tos = rtm->rtm_tos;
+	fl4.flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0;
+	fl4.flowi4_mark = mark;
+
 	if (iif) {
 		struct net_device *dev;
 
@@ -2170,13 +2173,6 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, void
 		if (err == 0 && rt->dst.error)
 			err = -rt->dst.error;
 	} else {
-		struct flowi4 fl4 = {
-			.daddr = dst,
-			.saddr = src,
-			.flowi4_tos = rtm->rtm_tos,
-			.flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0,
-			.flowi4_mark = mark,
-		};
 		rt = ip_route_output_key(net, &fl4);
 
 		err = 0;
@@ -2191,7 +2187,7 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, void
 	if (rtm->rtm_flags & RTM_F_NOTIFY)
 		rt->rt_flags |= RTCF_NOTIFY;
 
-	err = rt_fill_info(net, src, rtm->rtm_tos, skb,
+	err = rt_fill_info(net, src, &fl4, skb,
 			   NETLINK_CB(in_skb).pid, nlh->nlmsg_seq,
 			   RTM_NEWROUTE, 0, 0);
 	if (err <= 0)
diff --git a/net/ipv4/xfrm4_policy.c b/net/ipv4/xfrm4_policy.c
index 01808c5..679f4fd 100644
--- a/net/ipv4/xfrm4_policy.c
+++ b/net/ipv4/xfrm4_policy.c
@@ -94,7 +94,6 @@ static int xfrm4_fill_dst(struct xfrm_dst *xdst, struct net_device *dev,
 	xdst->u.rt.rt_flags = rt->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST |
 					      RTCF_LOCAL);
 	xdst->u.rt.rt_type = rt->rt_type;
-	xdst->u.rt.rt_src = rt->rt_src;
 	xdst->u.rt.rt_dst = rt->rt_dst;
 	xdst->u.rt.rt_gateway = rt->rt_gateway;
 
-- 
1.7.10

^ permalink raw reply related

* [PATCH 5/5] ipv4: Remove 'rt_mark' from 'struct rtable'
From: David Miller @ 2012-07-01 12:03 UTC (permalink / raw)
  To: netdev


Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/route.h     |    1 -
 net/ipv4/ipmr.c         |    2 +-
 net/ipv4/route.c        |    9 ++-------
 net/ipv4/xfrm4_policy.c |    1 -
 4 files changed, 3 insertions(+), 10 deletions(-)

diff --git a/include/net/route.h b/include/net/route.h
index ad387d2..9face41 100644
--- a/include/net/route.h
+++ b/include/net/route.h
@@ -53,7 +53,6 @@ struct rtable {
 	int			rt_route_iif;
 	int			rt_iif;
 	int			rt_oif;
-	__u32			rt_mark;
 
 	/* Info on neighbour */
 	__be32			rt_gateway;
diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c
index b4ac39f..d472d1d 100644
--- a/net/ipv4/ipmr.c
+++ b/net/ipv4/ipmr.c
@@ -1797,7 +1797,7 @@ static struct mr_table *ipmr_rt_fib_lookup(struct net *net, struct sk_buff *skb)
 		.flowi4_tos = RT_TOS(iph->tos),
 		.flowi4_oif = rt->rt_oif,
 		.flowi4_iif = rt->rt_iif,
-		.flowi4_mark = rt->rt_mark,
+		.flowi4_mark = skb->mark,
 	};
 	struct mr_table *mrt;
 	int err;
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index a43a39b..f7ad8fd 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -1226,7 +1226,6 @@ static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
 	rth->rt_route_iif = dev->ifindex;
 	rth->rt_iif	= dev->ifindex;
 	rth->rt_oif	= 0;
-	rth->rt_mark    = skb->mark;
 	rth->rt_gateway	= daddr;
 	rth->rt_peer_genid = 0;
 	rt_init_peer(rth, dev_net(dev)->ipv4.peers);
@@ -1347,7 +1346,6 @@ static int __mkroute_input(struct sk_buff *skb,
 	rth->rt_route_iif = in_dev->dev->ifindex;
 	rth->rt_iif 	= in_dev->dev->ifindex;
 	rth->rt_oif 	= 0;
-	rth->rt_mark    = skb->mark;
 	rth->rt_gateway	= daddr;
 	rth->rt_peer_genid = 0;
 	rt_init_peer(rth, &res->table->tb_peers);
@@ -1517,7 +1515,6 @@ local_input:
 	rth->rt_route_iif = dev->ifindex;
 	rth->rt_iif	= dev->ifindex;
 	rth->rt_oif	= 0;
-	rth->rt_mark    = skb->mark;
 	rth->rt_gateway	= daddr;
 	rth->rt_peer_genid = 0;
 	rt_init_peer(rth, net->ipv4.peers);
@@ -1672,7 +1669,6 @@ static struct rtable *__mkroute_output(const struct fib_result *res,
 	rth->rt_route_iif = 0;
 	rth->rt_iif	= orig_oif ? : dev_out->ifindex;
 	rth->rt_oif	= orig_oif;
-	rth->rt_mark    = fl4->flowi4_mark;
 	rth->rt_gateway = fl4->daddr;
 	rth->rt_peer_genid = 0;
 	rt_init_peer(rth, (res->table ?
@@ -1953,7 +1949,6 @@ struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_or
 		rt->rt_route_iif = ort->rt_route_iif;
 		rt->rt_iif = ort->rt_iif;
 		rt->rt_oif = ort->rt_oif;
-		rt->rt_mark = ort->rt_mark;
 
 		rt->rt_genid = rt_genid(net);
 		rt->rt_flags = ort->rt_flags;
@@ -2046,8 +2041,8 @@ static int rt_fill_info(struct net *net,  __be32 src, struct flowi4 *fl4,
 	if (rtnetlink_put_metrics(skb, dst_metrics_ptr(&rt->dst)) < 0)
 		goto nla_put_failure;
 
-	if (rt->rt_mark &&
-	    nla_put_be32(skb, RTA_MARK, rt->rt_mark))
+	if (fl4->flowi4_mark &&
+	    nla_put_be32(skb, RTA_MARK, fl4->flowi4_mark))
 		goto nla_put_failure;
 
 	error = rt->dst.error;
diff --git a/net/ipv4/xfrm4_policy.c b/net/ipv4/xfrm4_policy.c
index 679f4fd..d59eaec 100644
--- a/net/ipv4/xfrm4_policy.c
+++ b/net/ipv4/xfrm4_policy.c
@@ -82,7 +82,6 @@ static int xfrm4_fill_dst(struct xfrm_dst *xdst, struct net_device *dev,
 	xdst->u.rt.rt_route_iif = fl4->flowi4_iif;
 	xdst->u.rt.rt_iif = fl4->flowi4_iif;
 	xdst->u.rt.rt_oif = fl4->flowi4_oif;
-	xdst->u.rt.rt_mark = fl4->flowi4_mark;
 
 	xdst->u.dst.dev = dev;
 	dev_hold(dev);
-- 
1.7.10

^ permalink raw reply related

* Re: [PATCH 0/5] rtcache remove respin
From: David Miller @ 2012-07-01 12:15 UTC (permalink / raw)
  To: netdev
In-Reply-To: <20120701.050243.908285695895815999.davem@davemloft.net>

From: David Miller <davem@davemloft.net>
Date: Sun, 01 Jul 2012 05:02:43 -0700 (PDT)

> On a SPARC T3-1:
> 
> 1) Output route lookup: ~2800 cycles
> 2) Input route lookups: ~3000 cycles (rpfilter=0)
>                         ~4300 cycles (rpfilter=1)

Out of curiosity I got rid of the local table and made all routes
go into the main table and those numbers above become:

1) Output route lookup: ~2500 cycles
2) Input route lookups: ~2800 cycles (rpfilter=0)
                        ~4100 cycles (rpfilter=1)

====================
diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h
index 3dc7c96..5a103a9 100644
--- a/include/net/ip_fib.h
+++ b/include/net/ip_fib.h
@@ -186,9 +186,7 @@ static inline struct fib_table *fib_get_table(struct net *net, u32 id)
 {
 	struct hlist_head *ptr;
 
-	ptr = id == RT_TABLE_LOCAL ?
-		&net->ipv4.fib_table_hash[TABLE_LOCAL_INDEX] :
-		&net->ipv4.fib_table_hash[TABLE_MAIN_INDEX];
+	ptr = &net->ipv4.fib_table_hash[TABLE_MAIN_INDEX];
 	return hlist_entry(ptr->first, struct fib_table, tb_hlist);
 }
 
@@ -202,10 +200,6 @@ static inline int fib_lookup(struct net *net, const struct flowi4 *flp,
 {
 	struct fib_table *table;
 
-	table = fib_get_table(net, RT_TABLE_LOCAL);
-	if (!fib_table_lookup(table, flp, res, FIB_LOOKUP_NOREF))
-		return 0;
-
 	table = fib_get_table(net, RT_TABLE_MAIN);
 	if (!fib_table_lookup(table, flp, res, FIB_LOOKUP_NOREF))
 		return 0;
diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c
index 44bf82e..bdc0231 100644
--- a/net/ipv4/devinet.c
+++ b/net/ipv4/devinet.c
@@ -160,7 +160,7 @@ struct net_device *__ip_dev_find(struct net *net, __be32 addr, bool devref)
 		/* Fallback to FIB local table so that communication
 		 * over loopback subnets work.
 		 */
-		local = fib_get_table(net, RT_TABLE_LOCAL);
+		local = fib_get_table(net, RT_TABLE_MAIN);
 		if (local &&
 		    !fib_table_lookup(local, &fl4, &res, FIB_LOOKUP_NOREF) &&
 		    res.type == RTN_LOCAL)
diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c
index a658fb4..0056542 100644
--- a/net/ipv4/fib_frontend.c
+++ b/net/ipv4/fib_frontend.c
@@ -52,16 +52,10 @@ static int __net_init fib4_rules_init(struct net *net)
 {
 	struct fib_table *local_table, *main_table;
 
-	local_table = fib_trie_table(RT_TABLE_LOCAL);
-	if (local_table == NULL)
-		return -ENOMEM;
-
 	main_table  = fib_trie_table(RT_TABLE_MAIN);
 	if (main_table == NULL)
 		goto fail;
 
-	hlist_add_head_rcu(&local_table->tb_hlist,
-				&net->ipv4.fib_table_hash[TABLE_LOCAL_INDEX]);
 	hlist_add_head_rcu(&main_table->tb_hlist,
 				&net->ipv4.fib_table_hash[TABLE_MAIN_INDEX]);
 	return 0;
@@ -155,7 +149,7 @@ static inline unsigned int __inet_dev_addr_type(struct net *net,
 	res.r = NULL;
 #endif
 
-	local_table = fib_get_table(net, RT_TABLE_LOCAL);
+	local_table = fib_get_table(net, RT_TABLE_MAIN);
 	if (local_table) {
 		ret = RTN_UNICAST;
 		rcu_read_lock();
@@ -702,11 +696,7 @@ static void fib_magic(int cmd, int type, __be32 dst, int dst_len, struct in_ifad
 		},
 	};
 
-	if (type == RTN_UNICAST)
-		tb = fib_new_table(net, RT_TABLE_MAIN);
-	else
-		tb = fib_new_table(net, RT_TABLE_LOCAL);
-
+	tb = fib_new_table(net, RT_TABLE_MAIN);
 	if (tb == NULL)
 		return;
 
diff --git a/net/ipv4/fib_rules.c b/net/ipv4/fib_rules.c
index b23fd95..41a53b0 100644
--- a/net/ipv4/fib_rules.c
+++ b/net/ipv4/fib_rules.c
@@ -284,9 +284,6 @@ static int fib_default_rules_init(struct fib_rules_ops *ops)
 {
 	int err;
 
-	err = fib_default_rule_add(ops, 0, RT_TABLE_LOCAL, 0);
-	if (err < 0)
-		return err;
 	err = fib_default_rule_add(ops, 0x7FFE, RT_TABLE_MAIN, 0);
 	if (err < 0)
 		return err;

^ permalink raw reply related

* Re: [PATCH net-next 06/10] {NET,IB}/mlx4: Add device managed flow steering firmware API
From: Or Gerlitz @ 2012-07-01 12:29 UTC (permalink / raw)
  To: David Miller; +Cc: roland, yevgenyp, oren, netdev, hadarh
In-Reply-To: <20120701.033055.489908836962064737.davem@davemloft.net>

On 7/1/2012 1:30 PM, David Miller wrote:
[...]
> @@ -136,6 +138,11 @@ module_param_array(port_type_array, int, &arr_argc, 0444);
>   MODULE_PARM_DESC(port_type_array, "Array of port types: HW_DEFAULT (0) is default "
>   				"1 for IB, 2 for Ethernet");
>   
> +static int mlx4_flow_steering_hash;
> +module_param_named(flow_steering_hash, mlx4_flow_steering_hash, int, 0444);
> +MODULE_PARM_DESC(flow_steering_hash,
> +		 "Flow steering hash function configuration. Config options: L2 = 0, L2_L3_L4 = 1 (default: L2).");
> +
>
> No module paramters, do it via ethtool or similar.
>

Dave,

We'll be handling the indentation, comments and other syntax issues you 
have pointed out
and submit V1, hopefully which would be clean from such errors.

Re the usage of the module param - as was pointed out in the change-log, 
please note that
this policy is **global** to the HCA, that is effects all the Ethernet 
(and down the road,
also IPoIB flow-steering, when we run a card with one IB port and one 
Eth port) net-devices
that relate to that device.

The module param is for mlx4_core and not for mlx4_en. In that respect, 
per net-device
ethtool directive wouldn't work, even when there is a dedicated firmware 
command that
allows changing this in run-time, which we don't have now.

My thinking was that once we have this command at hand, we can add 
sysfs/etc entry at
the mlx4_core level to provide a different hash function to the device. 
And for the
time being use the module param, makes sense?

Or.

^ permalink raw reply

* Re: [PATCH v6] sctp: be more restrictive in transport selection on bundled sacks
From: Neil Horman @ 2012-07-01 12:47 UTC (permalink / raw)
  To: David Miller; +Cc: netdev, vyasevich, linux-sctp
In-Reply-To: <20120630.173945.173993639982489712.davem@davemloft.net>

On Sat, Jun 30, 2012 at 05:39:45PM -0700, David Miller wrote:
> From: Neil Horman <nhorman@tuxdriver.com>
> Date: Sat, 30 Jun 2012 09:04:26 -0400
> 
> > It was noticed recently that when we send data on a transport, its possible that
> > we might bundle a sack that arrived on a different transport.  While this isn't
> > a major problem, it does go against the SHOULD requirement in section 6.4 of RFC
> > 2960:
> > 
> >  An endpoint SHOULD transmit reply chunks (e.g., SACK, HEARTBEAT ACK,
> >    etc.) to the same destination transport address from which it
> >    received the DATA or control chunk to which it is replying.  This
> >    rule should also be followed if the endpoint is bundling DATA chunks
> >    together with the reply chunk.
> > 
> > This patch seeks to correct that.  It restricts the bundling of sack operations
> > to only those transports which have moved the ctsn of the association forward
> > since the last sack.  By doing this we guarantee that we only bundle outbound
> > saks on a transport that has received a chunk since the last sack.  This brings
> > us into stricter compliance with the RFC.
> > 
> > Vlad had initially suggested that we strictly allow only sack bundling on the
> > transport that last moved the ctsn forward.  While this makes sense, I was
> > concerned that doing so prevented us from bundling in the case where we had
> > received chunks that moved the ctsn on multiple transports.  In those cases, the
> > RFC allows us to select any of the transports having received chunks to bundle
> > the sack on.  so I've modified the approach to allow for that, by adding a state
> > variable to each transport that tracks weather it has moved the ctsn since the
> > last sack.  This I think keeps our behavior (and performance), close enough to
> > our current profile that I think we can do this without a sysctl knob to
> > enable/disable it.
> > 
> > Signed-off-by: Neil Horman <nhorman@tuxdriver.com>
> > CC: Vlad Yaseivch <vyasevich@gmail.com>
> > CC: David S. Miller <davem@davemloft.net>
> > CC: linux-sctp@vger.kernel.org
> > Reported-by: Michele Baldessari <michele@redhat.com>
> > Reported-by: sorin serban <sserban@redhat.com>
> 
> Once this has Vlad's ACK I'll apply it.
> 
Thanks!
> There has to be a better way to handle this situation, wherein the
> responsible party has ACK'd the patch but I just ask for a few coding
> style fixups and whatnot.  As it stands now I have to twiddle my
> thumbs waiting for the new ACK.
> 
Perhaps we could modify the SubmittingPatches document to indicate that an
Acked-by from a subsystem maintainer implicitly confers authority on the
upstream receiver to request reasonable stylistic changes that don't affect the
functionality of the patch in the interests of maintaining coding conventions.

i.e. Since vlad applied an acked-by to v5 of this patch, that would give you the
right to carry that ack forward to v6 because you only asked for style changes.

Thoughts?
Neil

> 

^ permalink raw reply

* [PATCH net-next 00/11] default maximal number of RSS queues in mq drivers
From: Yuval Mintz @ 2012-07-01 13:18 UTC (permalink / raw)
  To: davem, netdev
  Cc: eilong, Yuval Mintz, Divy Le Ray, Or Gerlitz, Jon Mason,
	Anirban Chakraborty, Jitendra Kalsaria, Ron Mercer, Jeff Kirsher,
	Jon Mason, Andrew Gallatin, Sathya Perla, Subbu Seetharaman,
	Ajit Khaparde, Matt Carlson, Michael Chan, Eric Dumazet,
	Ben Hutchings

Different vendors support different number of RSS queues by default. Today,
there exists an ethtool API through which users can change the number of
channels their driver supports; This enables us to pursue the goal of using
a default number of RSS queues in various multi-queue drivers.

This patch intendeds to achieve the above default, by upper-limiting the number
of interrupts multi-queue drivers request (by default, not via the new API) 
with correlation to the number of cpus on the machine.

After examining multi-queue drivers that call alloc_etherdev_mq[s],
it became evident that most drivers allocate their devices using hard-coded
values. Changing those defaults directly will most likely cause a regression. 

However, (most) multi-queue driver look at the number of online cpus when 
requesting for interrupts. We assume that the number of interrupts the
driver manages to request is propagated across the driver, and the number
of RSS queues it configures is based upon it. 

This patch modifies said logic - if the number of cpus is large enough, use
a smaller default value instead. This serves 2 main purposes: 
 1. A step forward unity in the number of RSS queues of various drivers.
 2. It prevents wasteful requests for memory on machines with many cpus.

Notice no testing was made on this patch (other than on the bnx2x driver)
except for compilation test. This patch is based on RFC:
http://marc.info/?l=linux-netdev&m=134062509310404&w=2

Drivers identified as multi-queue, handled in this patch:

* mellanox mlx4
* neterion vxge
* qlogic   qlge
* chelsio  cxgb3, cxgb4
* myricom  myri10ge
* emulex   benet
* broadcom tg3, bnx2, bnx2x

The guys at Intel requested to change their drivers' behaviour on their own,
in order to support ethtool's [g|s]et_channels. Thus, igb, igbx2, and igbxevf
were not handled in this patch.

Driver identified as multi-queue, no reference to number of online cpus found,
and thus unhandled in this patch:

* neterion  s2io
* marvell   mv643xx
* freescale gianfar
* ibm       ehea
* ti        cpmac
* sun       niu
* sfc       efx
* chelsio   cxgb4vf

Cheers,
Yuval Mintz

Cc: Divy Le Ray <divy@chelsio.com>
Cc: Or Gerlitz <ogerlitz@mellanox.com>
Cc: Jon Mason <jdmason@kudzu.us>
Cc: Anirban Chakraborty <anirban.chakraborty@qlogic.com>
Cc: Jitendra Kalsaria <jitendra.kalsaria@qlogic.com>
Cc: Ron Mercer <ron.mercer@qlogic.com>
Cc: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
Cc: Jon Mason <mason@myri.com>
Cc: Andrew Gallatin <gallatin@myri.com>
Cc: Sathya Perla <sathya.perla@emulex.com>
Cc: Subbu Seetharaman <subbu.seetharaman@emulex.com>
Cc: Ajit Khaparde <ajit.khaparde@emulex.com>
Cc: Matt Carlson <mcarlson@broadcom.com>
Cc: Michael Chan <mchan@broadcom.com>
Cc: Eric Dumazet <eric.dumazet@gmail.com>
Cc: Ben Hutchings <bhutchings@solarflare.com>

^ permalink raw reply

* [PATCH net-next 02/11] mlx4: set maximal number of default RSS queues
From: Yuval Mintz @ 2012-07-01 13:18 UTC (permalink / raw)
  To: davem, netdev; +Cc: eilong, Yuval Mintz, Or Gerlitz
In-Reply-To: <1341148740-7375-1-git-send-email-yuvalmin@broadcom.com>

Signed-off-by: Yuval Mintz <yuvalmin@broadcom.com>
Signed-off-by: Eilon Greenstein <eilong@broadcom.com>

Cc: Or Gerlitz <ogerlitz@mellanox.com>
---
 drivers/net/ethernet/mellanox/mlx4/main.c |    5 +++--
 1 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx4/main.c b/drivers/net/ethernet/mellanox/mlx4/main.c
index a0313de..14d9c76 100644
--- a/drivers/net/ethernet/mellanox/mlx4/main.c
+++ b/drivers/net/ethernet/mellanox/mlx4/main.c
@@ -41,6 +41,7 @@
 #include <linux/slab.h>
 #include <linux/io-mapping.h>
 #include <linux/delay.h>
+#include <linux/netdevice.h>
 
 #include <linux/mlx4/device.h>
 #include <linux/mlx4/doorbell.h>
@@ -1539,8 +1540,8 @@ static void mlx4_enable_msi_x(struct mlx4_dev *dev)
 	struct mlx4_priv *priv = mlx4_priv(dev);
 	struct msix_entry *entries;
 	int nreq = min_t(int, dev->caps.num_ports *
-			 min_t(int, num_online_cpus() + 1, MAX_MSIX_P_PORT)
-				+ MSIX_LEGACY_SZ, MAX_MSIX);
+			 min_t(int, netif_get_num_default_rss_queues() + 1,
+			       MAX_MSIX_P_PORT) + MSIX_LEGACY_SZ, MAX_MSIX);
 	int err;
 	int i;
 
-- 
1.7.9.rc2

^ permalink raw reply related

* [PATCH net-next 01/11] net-next: Add netif_get_num_default_rss_queues
From: Yuval Mintz @ 2012-07-01 13:18 UTC (permalink / raw)
  To: davem, netdev; +Cc: eilong, Yuval Mintz
In-Reply-To: <1341148740-7375-1-git-send-email-yuvalmin@broadcom.com>

Most multi-queue networking driver consider the number of online cpus when
configuring RSS queues.
This patch adds a wrapper to the number of cpus, setting an upper limit on the
number of cpus a driver should consider (by default) when allocating resources
for his queues.

Signed-off-by: Yuval Mintz <yuvalmin@broadcom.com>
Signed-off-by: Eilon Greenstein <eilong@broadcom.com>
---
 include/linux/netdevice.h |    3 +++
 net/core/dev.c            |   11 +++++++++++
 2 files changed, 14 insertions(+), 0 deletions(-)

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 2c2ecea..ab0251d 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -2119,6 +2119,9 @@ static inline int netif_copy_real_num_queues(struct net_device *to_dev,
 #endif
 }
 
+#define DEFAULT_MAX_NUM_RSS_QUEUES	(8)
+extern int netif_get_num_default_rss_queues(void);
+
 /* Use this variant when it is known for sure that it
  * is executing from hardware interrupt context or with hardware interrupts
  * disabled.
diff --git a/net/core/dev.c b/net/core/dev.c
index ed674e2..69f7a1a 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -1793,6 +1793,17 @@ int netif_set_real_num_rx_queues(struct net_device *dev, unsigned int rxq)
 EXPORT_SYMBOL(netif_set_real_num_rx_queues);
 #endif
 
+/* netif_get_num_default_rss_queues - default number of RSS queues
+ *
+ * This routine should set an upper limit on the number of RSS queues
+ * used by default by multiqueue devices.
+ */
+int netif_get_num_default_rss_queues()
+{
+	return min_t(int, DEFAULT_MAX_NUM_RSS_QUEUES, num_online_cpus());
+}
+EXPORT_SYMBOL(netif_get_num_default_rss_queues);
+
 static inline void __netif_reschedule(struct Qdisc *q)
 {
 	struct softnet_data *sd;
-- 
1.7.9.rc2

^ permalink raw reply related

* [PATCH net-next 06/11] cxgb4: set maximal number of default RSS queues
From: Yuval Mintz @ 2012-07-01 13:18 UTC (permalink / raw)
  To: davem, netdev; +Cc: eilong, Yuval Mintz, Divy Le Ray
In-Reply-To: <1341148740-7375-1-git-send-email-yuvalmin@broadcom.com>

Signed-off-by: Yuval Mintz <yuvalmin@broadcom.com>
Signed-off-by: Eilon Greenstein <eilong@broadcom.com>

Cc: Divy Le Ray <divy@chelsio.com>
---
 drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c |    4 ++--
 1 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c
index e1f96fb..5ed49af 100644
--- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c
+++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c
@@ -3493,8 +3493,8 @@ static void __devinit cfg_queues(struct adapter *adap)
 	 */
 	if (n10g)
 		q10g = (MAX_ETH_QSETS - (adap->params.nports - n10g)) / n10g;
-	if (q10g > num_online_cpus())
-		q10g = num_online_cpus();
+	if (q10g > netif_get_num_default_rss_queues())
+		q10g = netif_get_num_default_rss_queues();
 
 	for_each_port(adap, i) {
 		struct port_info *pi = adap2pinfo(adap, i);
-- 
1.7.9.rc2

^ permalink raw reply related

* [PATCH net-next 03/11] vxge: set maximal number of default RSS queues
From: Yuval Mintz @ 2012-07-01 13:18 UTC (permalink / raw)
  To: davem, netdev; +Cc: eilong, Yuval Mintz, Jon Mason
In-Reply-To: <1341148740-7375-1-git-send-email-yuvalmin@broadcom.com>

Signed-off-by: Yuval Mintz <yuvalmin@broadcom.com>
Signed-off-by: Eilon Greenstein <eilong@broadcom.com>

Cc: Jon Mason <jdmason@kudzu.us>
---
 drivers/net/ethernet/neterion/vxge/vxge-main.c |    3 ++-
 1 files changed, 2 insertions(+), 1 deletions(-)

diff --git a/drivers/net/ethernet/neterion/vxge/vxge-main.c b/drivers/net/ethernet/neterion/vxge/vxge-main.c
index 2578eb1..2fd1edb 100644
--- a/drivers/net/ethernet/neterion/vxge/vxge-main.c
+++ b/drivers/net/ethernet/neterion/vxge/vxge-main.c
@@ -3687,7 +3687,8 @@ static int __devinit vxge_config_vpaths(
 			return 0;
 
 		if (!driver_config->g_no_cpus)
-			driver_config->g_no_cpus = num_online_cpus();
+			driver_config->g_no_cpus =
+				netif_get_num_default_rss_queues();
 
 		driver_config->vpath_per_dev = driver_config->g_no_cpus >> 1;
 		if (!driver_config->vpath_per_dev)
-- 
1.7.9.rc2

^ permalink raw reply related

* [PATCH net-next 05/11] cxgb3: set maximal number of default RSS queues
From: Yuval Mintz @ 2012-07-01 13:18 UTC (permalink / raw)
  To: davem, netdev; +Cc: eilong, Yuval Mintz, Divy Le Ray
In-Reply-To: <1341148740-7375-1-git-send-email-yuvalmin@broadcom.com>

Signed-off-by: Yuval Mintz <yuvalmin@broadcom.com>
Signed-off-by: Eilon Greenstein <eilong@broadcom.com>

Cc: Divy Le Ray <divy@chelsio.com>
---
 drivers/net/ethernet/chelsio/cxgb3/cxgb3_main.c |    2 +-
 1 files changed, 1 insertions(+), 1 deletions(-)

diff --git a/drivers/net/ethernet/chelsio/cxgb3/cxgb3_main.c b/drivers/net/ethernet/chelsio/cxgb3/cxgb3_main.c
index abb6ce7..9b08749 100644
--- a/drivers/net/ethernet/chelsio/cxgb3/cxgb3_main.c
+++ b/drivers/net/ethernet/chelsio/cxgb3/cxgb3_main.c
@@ -3050,7 +3050,7 @@ static struct pci_error_handlers t3_err_handler = {
 static void set_nqsets(struct adapter *adap)
 {
 	int i, j = 0;
-	int num_cpus = num_online_cpus();
+	int num_cpus = netif_get_num_default_rss_queues();
 	int hwports = adap->params.nports;
 	int nqsets = adap->msix_nvectors - 1;
 
-- 
1.7.9.rc2

^ permalink raw reply related

* [PATCH net-next 04/11] qlge: set maximal number of default RSS queues
From: Yuval Mintz @ 2012-07-01 13:18 UTC (permalink / raw)
  To: davem, netdev
  Cc: eilong, Yuval Mintz, Anirban Chakraborty, Jitendra Kalsaria,
	Ron Mercer
In-Reply-To: <1341148740-7375-1-git-send-email-yuvalmin@broadcom.com>

Signed-off-by: Yuval Mintz <yuvalmin@broadcom.com>
Signed-off-by: Eilon Greenstein <eilong@broadcom.com>

Cc: Anirban Chakraborty <anirban.chakraborty@qlogic.com>
Cc: Jitendra Kalsaria <jitendra.kalsaria@qlogic.com>
Cc: Ron Mercer <ron.mercer@qlogic.com>
---
 drivers/net/ethernet/qlogic/qlge/qlge_main.c |    2 +-
 1 files changed, 1 insertions(+), 1 deletions(-)

diff --git a/drivers/net/ethernet/qlogic/qlge/qlge_main.c b/drivers/net/ethernet/qlogic/qlge/qlge_main.c
index 09d8d33..3c3499d 100644
--- a/drivers/net/ethernet/qlogic/qlge/qlge_main.c
+++ b/drivers/net/ethernet/qlogic/qlge/qlge_main.c
@@ -4649,7 +4649,7 @@ static int __devinit qlge_probe(struct pci_dev *pdev,
 	int err = 0;
 
 	ndev = alloc_etherdev_mq(sizeof(struct ql_adapter),
-			min(MAX_CPUS, (int)num_online_cpus()));
+			min(MAX_CPUS, netif_get_num_default_rss_queues()));
 	if (!ndev)
 		return -ENOMEM;
 
-- 
1.7.9.rc2

^ permalink raw reply related

* [PATCH net-next 09/11] bnx2: set maximal number of default RSS queues
From: Yuval Mintz @ 2012-07-01 13:18 UTC (permalink / raw)
  To: davem, netdev; +Cc: eilong, Yuval Mintz, Michael Chan
In-Reply-To: <1341148740-7375-1-git-send-email-yuvalmin@broadcom.com>

Signed-off-by: Yuval Mintz <yuvalmin@broadcom.com>
Signed-off-by: Eilon Greenstein <eilong@broadcom.com>

Cc: Michael Chan <mchan@broadcom.com>
---
 drivers/net/ethernet/broadcom/bnx2.c |    2 +-
 1 files changed, 1 insertions(+), 1 deletions(-)

diff --git a/drivers/net/ethernet/broadcom/bnx2.c b/drivers/net/ethernet/broadcom/bnx2.c
index 9eb7624..1901da1 100644
--- a/drivers/net/ethernet/broadcom/bnx2.c
+++ b/drivers/net/ethernet/broadcom/bnx2.c
@@ -6250,7 +6250,7 @@ bnx2_enable_msix(struct bnx2 *bp, int msix_vecs)
 static int
 bnx2_setup_int_mode(struct bnx2 *bp, int dis_msi)
 {
-	int cpus = num_online_cpus();
+	int cpus = netif_get_num_default_rss_queues();
 	int msix_vecs;
 
 	if (!bp->num_req_rx_rings)
-- 
1.7.9.rc2

^ permalink raw reply related

* [PATCH net-next 08/11] tg3: set maximal number of default RSS queues
From: Yuval Mintz @ 2012-07-01 13:18 UTC (permalink / raw)
  To: davem, netdev; +Cc: eilong, Yuval Mintz, Matt Carlson
In-Reply-To: <1341148740-7375-1-git-send-email-yuvalmin@broadcom.com>

Signed-off-by: Yuval Mintz <yuvalmin@broadcom.com>
Signed-off-by: Eilon Greenstein <eilong@broadcom.com>

Cc: Matt Carlson <mcarlson@broadcom.com>
---
 drivers/net/ethernet/broadcom/tg3.c |    2 +-
 1 files changed, 1 insertions(+), 1 deletions(-)

diff --git a/drivers/net/ethernet/broadcom/tg3.c b/drivers/net/ethernet/broadcom/tg3.c
index e47ff8b..6cbab03 100644
--- a/drivers/net/ethernet/broadcom/tg3.c
+++ b/drivers/net/ethernet/broadcom/tg3.c
@@ -9908,7 +9908,7 @@ static bool tg3_enable_msix(struct tg3 *tp)
 	int i, rc;
 	struct msix_entry msix_ent[tp->irq_max];
 
-	tp->irq_cnt = num_online_cpus();
+	tp->irq_cnt = netif_get_num_default_rss_queues();
 	if (tp->irq_cnt > 1) {
 		/* We want as many rx rings enabled as there are cpus.
 		 * In multiqueue MSI-X mode, the first MSI-X vector
-- 
1.7.9.rc2

^ permalink raw reply related

* [PATCH net-next 07/11] myri10ge: set maximal number of default RSS queues
From: Yuval Mintz @ 2012-07-01 13:18 UTC (permalink / raw)
  To: davem, netdev; +Cc: eilong, Yuval Mintz, Jon Mason
In-Reply-To: <1341148740-7375-1-git-send-email-yuvalmin@broadcom.com>

Signed-off-by: Yuval Mintz <yuvalmin@broadcom.com>
Signed-off-by: Eilon Greenstein <eilong@broadcom.com>

Cc: Jon Mason <mason@myri.com>
---
 drivers/net/ethernet/myricom/myri10ge/myri10ge.c |    2 +-
 1 files changed, 1 insertions(+), 1 deletions(-)

diff --git a/drivers/net/ethernet/myricom/myri10ge/myri10ge.c b/drivers/net/ethernet/myricom/myri10ge/myri10ge.c
index 90153fc..fa85cf1 100644
--- a/drivers/net/ethernet/myricom/myri10ge/myri10ge.c
+++ b/drivers/net/ethernet/myricom/myri10ge/myri10ge.c
@@ -3775,7 +3775,7 @@ static void myri10ge_probe_slices(struct myri10ge_priv *mgp)
 
 	mgp->num_slices = 1;
 	msix_cap = pci_find_capability(pdev, PCI_CAP_ID_MSIX);
-	ncpus = num_online_cpus();
+	ncpus = netif_get_num_default_rss_queues();
 
 	if (myri10ge_max_slices == 1 || msix_cap == 0 ||
 	    (myri10ge_max_slices == -1 && ncpus < 2))
-- 
1.7.9.rc2

^ permalink raw reply related

* [PATCH net-next 10/11] bnx2x: set maximal number of default RSS queues
From: Yuval Mintz @ 2012-07-01 13:18 UTC (permalink / raw)
  To: davem, netdev; +Cc: eilong, Yuval Mintz
In-Reply-To: <1341148740-7375-1-git-send-email-yuvalmin@broadcom.com>

Signed-off-by: Yuval Mintz <yuvalmin@broadcom.com>
Signed-off-by: Eilon Greenstein <eilong@broadcom.com>
---
 drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.h |    3 ++-
 1 files changed, 2 insertions(+), 1 deletions(-)

diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.h b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.h
index daa894b..53659f3 100644
--- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.h
+++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.h
@@ -822,7 +822,8 @@ static inline int bnx2x_calc_num_queues(struct bnx2x *bp)
 {
 	return  num_queues ?
 		 min_t(int, num_queues, BNX2X_MAX_QUEUES(bp)) :
-		 min_t(int, num_online_cpus(), BNX2X_MAX_QUEUES(bp));
+		 min_t(int, netif_get_num_default_rss_queues(),
+		       BNX2X_MAX_QUEUES(bp));
 }
 
 static inline void bnx2x_clear_sge_mask_next_elems(struct bnx2x_fastpath *fp)
-- 
1.7.9.rc2

^ permalink raw reply related

* [PATCH net-next 11/11] be2net: set maximal number of default RSS queues
From: Yuval Mintz @ 2012-07-01 13:19 UTC (permalink / raw)
  To: davem, netdev
  Cc: eilong, Yuval Mintz, Sathya Perla, Subbu Seetharaman,
	Ajit Khaparde
In-Reply-To: <1341148740-7375-1-git-send-email-yuvalmin@broadcom.com>

Signed-off-by: Yuval Mintz <yuvalmin@broadcom.com>
Signed-off-by: Eilon Greenstein <eilong@broadcom.com>

Cc: Sathya Perla <sathya.perla@emulex.com>
Cc: Subbu Seetharaman <subbu.seetharaman@emulex.com>
Cc: Ajit Khaparde <ajit.khaparde@emulex.com>
---
 drivers/net/ethernet/emulex/benet/be_main.c |   10 ++++++----
 1 files changed, 6 insertions(+), 4 deletions(-)

diff --git a/drivers/net/ethernet/emulex/benet/be_main.c b/drivers/net/ethernet/emulex/benet/be_main.c
index edce7af..2141bd7 100644
--- a/drivers/net/ethernet/emulex/benet/be_main.c
+++ b/drivers/net/ethernet/emulex/benet/be_main.c
@@ -2172,12 +2172,14 @@ static void be_msix_disable(struct be_adapter *adapter)
 
 static uint be_num_rss_want(struct be_adapter *adapter)
 {
+	u32 num = 0;
 	if ((adapter->function_caps & BE_FUNCTION_CAPS_RSS) &&
 	     !sriov_want(adapter) && be_physfn(adapter) &&
-	     !be_is_mc(adapter))
-		return (adapter->be3_native) ? BE3_MAX_RSS_QS : BE2_MAX_RSS_QS;
-	else
-		return 0;
+	     !be_is_mc(adapter)) {
+		num = (adapter->be3_native) ? BE3_MAX_RSS_QS : BE2_MAX_RSS_QS;
+		num = min_t(u32, num, (u32)netif_get_num_default_rss_queues());
+	}
+	return num;
 }
 
 static void be_msix_enable(struct be_adapter *adapter)
-- 
1.7.9.rc2

^ permalink raw reply related

* Re: [BUG, regression, bisected] Marvell 88E8055 NIC (sky2) fails to detect link after resume from S3
From: Michal Zatloukal @ 2012-07-01 13:30 UTC (permalink / raw)
  To: Francois Romieu; +Cc: Stephen Hemminger, netdev
In-Reply-To: <20120630200236.GA30058@electric-eye.fr.zoreil.com>

Breaking news:
Fiddling with mmiotrace, I was able to get the "phy I/O error" on
previously-thought-good kernel (non-reliably; it took several
suspend-resume cycles), and also vice versa - in vty, I was able to
suspend and resume the "bad" kernel with the NIC still working (also
not reliably - in GUI, it still flaked out after first
suspend-resume).

Anyway, I've gathered traces of both good and bad behaviour, each from
the respective kernel (not that it seems to matter much). Now, what's
the recommended way of posting them? Each is over 3MB in size and
pastebin won't take them. :-(

MZ

On Sat, Jun 30, 2012 at 10:02 PM, Francois Romieu <romieu@fr.zoreil.com> wrote:
> Michal Zatloukal <myxal.mxl@gmail.com> :
> [...]
>> Is there something I can try?
>
> I have not used it for quite some time but comparing mmiotrace output
> (see Documentation/trace/mmiotrace.txt) before and after the regression
> commit may give some hint.
>
> Otherwise I would ask for help on linux-pm@vger.kernel.org
>
> --
> Ueimor

^ permalink raw reply

* Re: REGRESSION: 3.4.0->3.5.0-rc2 kernel WARNING on cable plug on Acer Aspire One, no network
From: Alex Villacís Lasso @ 2012-07-01 13:50 UTC (permalink / raw)
  To: Francois Romieu; +Cc: netdev
In-Reply-To: <20120611213815.GA8517@electric-eye.fr.zoreil.com>

El 11/06/12 16:38, Francois Romieu escribió:
> Alex Villacís Lasso <a_villacis@palosanto.com> :
> [...]
>> $ grep XID dmesg-3.5.0-rc2.txt
>> [   15.873858] r8169 0000:02:00.0: eth0: RTL8102e at 0xf7c0e000,
>> 00:1e:68:e5:5d:b1, XID 04a00000 IRQ 44
> The 8102e has not been touched by that many suspect patches but I do
> not see where the problem is :o(
>
> Can you peel off the r8169 patches between 3.4.0 and 3.5-rc ?
>
Still present in 3.5-rc5. Bisection still in progress.

^ permalink raw reply

* Re: AF_BUS socket address family
From: Alan Cox @ 2012-07-01 14:16 UTC (permalink / raw)
  To: David Miller; +Cc: vincent.sanders, netdev, linux-kernel
In-Reply-To: <20120630.173346.670086962166528527.davem@davemloft.net>

> The issue is that what to do when a receiver goes deaf is a policy
> issue.

Something all these protocols alredy recognize and have done for years.
The real issue for AF_BUS versus the current slow AF_UNIX approach is
that you really need to know *who* is blocked up.

That's no different to UDP multicast and needing to know how errored the
frame.

Alan

^ permalink raw reply

* Re: [PATCH net-next 00/11] default maximal number of RSS queues in mq drivers
From: Or Gerlitz @ 2012-07-01 15:01 UTC (permalink / raw)
  To: Yuval Mintz
  Cc: davem, netdev, eilong, Divy Le Ray, Jon Mason,
	Anirban Chakraborty, Jitendra Kalsaria, Ron Mercer, Jeff Kirsher,
	Jon Mason, Andrew Gallatin, Sathya Perla, Subbu Seetharaman,
	Ajit Khaparde, Matt Carlson, Michael Chan, Eric Dumazet,
	Ben Hutchings
In-Reply-To: <1341148740-7375-1-git-send-email-yuvalmin@broadcom.com>

On 7/1/2012 4:18 PM, Yuval Mintz wrote:
> Different vendors support different number of RSS queues by default. Today,
> there exists an ethtool API through which users can change the number of
> channels their driver supports; This enables us to pursue the goal of using
> a default number of RSS queues in various multi-queue drivers.

Yuval,

I assume this posting is actually v3 of "[RFC net-next (v2) 00/14] 
default maximal number of RSS queues in mq drivers" see 
http://marc.info/?l=linux-netdev&m=134062509310404&w=2?

correct? so what has been changed along v0/v1/v2 --> v3?  It would be 
very helpful to follow
if you make sure to mark the patches that makes Vn with "PATCH net-next 
Vn" and specify the changes
from Vn-1, Vn-2...V1, V0.

Or.

^ permalink raw reply

* Re: [PATCH net-next 09/10] net/mlx4_en: Manage flow steering rules with ethtool
From: Ben Hutchings @ 2012-07-01 16:00 UTC (permalink / raw)
  To: Or Gerlitz; +Cc: davem, roland, yevgenyp, oren, netdev, Hadar Hen Zion
In-Reply-To: <1341135823-29039-10-git-send-email-ogerlitz@mellanox.com>

On Sun, 2012-07-01 at 12:43 +0300, Or Gerlitz wrote:
> From: Hadar Hen Zion <hadarh@mellanox.co.il>
> 
> Implement the ethtool APIs for attaching L2/L3/L4 based flow steering
> rules to the netdevice RX rings. Added set_rxnfc callback and enhanced
> the existing get_rxnfc callback.
> 
> Signed-off-by: Hadar Hen Zion <hadarh@mellanox.co.il>
> Signed-off-by: Or Gerlitz <ogerlitz@mellanox.com>
> ---
>  drivers/net/ethernet/mellanox/mlx4/en_ethtool.c |  373 +++++++++++++++++++++++
>  drivers/net/ethernet/mellanox/mlx4/mlx4_en.h    |    7 +
>  2 files changed, 380 insertions(+), 0 deletions(-)
> 
> diff --git a/drivers/net/ethernet/mellanox/mlx4/en_ethtool.c b/drivers/net/ethernet/mellanox/mlx4/en_ethtool.c
> index 72901ce..30de264 100644
> --- a/drivers/net/ethernet/mellanox/mlx4/en_ethtool.c
> +++ b/drivers/net/ethernet/mellanox/mlx4/en_ethtool.c
> @@ -38,6 +38,10 @@
>  #include "mlx4_en.h"
>  #include "en_port.h"
>  
> +#define EN_ETHTOOL_QP_ATTACH (1ull << 63)
> +#define EN_ETHTOOL_MAC_MASK 0xffffffffffffULL
> +#define EN_ETHTOOL_SHORT_MASK cpu_to_be16(0xffff)
> +#define EN_ETHTOOL_WORD_MASK  cpu_to_be32(0xffffffff)
>  
>  static void
>  mlx4_en_get_drvinfo(struct net_device *dev, struct ethtool_drvinfo *drvinfo)
> @@ -599,16 +603,360 @@ static int mlx4_en_set_rxfh_indir(struct net_device *dev,
>  	return err;
>  }
>  
> +#define not_all_zeros_or_all_ones(field, type) \
> +	(field && (type)~field)
> +
> +static int mlx4_en_validate_flow(struct net_device *dev,
> +				 struct ethtool_rxnfc *cmd)
> +{
> +	struct ethtool_usrip4_spec *l3_mask;
> +	struct ethtool_tcpip4_spec *l4_mask;
> +	struct ethhdr *eth_mask;
> +	u64 full_mac = ~0ull;
> +	u64 zero_mac = 0;
> +
> +	if (cmd->fs.location >= MAX_NUM_OF_FS_RULES)
> +		return -EINVAL;
> +
> +	switch (cmd->fs.flow_type & ~FLOW_EXT) {
> +	case TCP_V4_FLOW:
> +	case UDP_V4_FLOW:
> +		if (cmd->fs.h_u.tcp_ip4_spec.tos)
> +			return -EOPNOTSUPP;

I suspect that your filter ignores TOS, rather than only matching TOS ==
0, so you should actually be checking the corresponding field in the
mask (fs.m_u).  The error code should be EINVAL.

> +		l4_mask = &cmd->fs.m_u.tcp_ip4_spec;
> +		/* don't allow mask which isn't all 0 or 1 */
> +		if (not_all_zeros_or_all_ones(l4_mask->ip4src, __be32) ||
> +		    not_all_zeros_or_all_ones(l4_mask->ip4dst, __be32) ||
> +		    not_all_zeros_or_all_ones(l4_mask->psrc, __be16) ||
> +		    not_all_zeros_or_all_ones(l4_mask->pdst, __be16))
> +			return -EOPNOTSUPP;

Again, here and in many further instances, the error code should be
EINVAL.

> +		break;
> +	case IP_USER_FLOW:
> +		l3_mask = &cmd->fs.m_u.usr_ip4_spec;
> +		if (cmd->fs.h_u.usr_ip4_spec.l4_4_bytes ||
> +		    cmd->fs.h_u.usr_ip4_spec.tos ||

I think this should be checking l4_4_bytes and tos in the mask.

> +		    cmd->fs.h_u.usr_ip4_spec.proto ||
> +		    cmd->fs.h_u.usr_ip4_spec.ip_ver != ETH_RX_NFC_IP4 ||
> +		    (!cmd->fs.h_u.usr_ip4_spec.ip4src &&
> +		     !cmd->fs.h_u.usr_ip4_spec.ip4dst) ||
> +		    not_all_zeros_or_all_ones(l3_mask->ip4src, __be32) ||
> +		    not_all_zeros_or_all_ones(l3_mask->ip4dst, __be32))
> +			return -EOPNOTSUPP;
> +		break;
> +	case ETHER_FLOW:
> +		eth_mask = &cmd->fs.m_u.ether_spec;
> +		if (memcmp(eth_mask->h_source, &zero_mac, ETH_ALEN))
> +			return -EOPNOTSUPP;
> +		if (!memcmp(eth_mask->h_dest, &zero_mac, ETH_ALEN))
> +			return -EOPNOTSUPP;

But in the next statement you test whether eth_mask->h_dest is either
all-zeroes or all-ones.  Is all-zeroes valid or not?  I suspect you
actually intend to reject the case where both h_dest and h_proto are
masked out.

> +		if (not_all_zeros_or_all_ones(eth_mask->h_proto, __be16) ||
> +		    (memcmp(eth_mask->h_dest, &zero_mac, ETH_ALEN) &&
> +		     memcmp(eth_mask->h_dest, &full_mac, ETH_ALEN)))
> +			return -EOPNOTSUPP;
> +		break;
> +	default:
> +		return -EOPNOTSUPP;
> +	}
> +
> +	if ((cmd->fs.flow_type & FLOW_EXT)) {
> +		if (cmd->fs.m_ext.vlan_etype ||
> +		    not_all_zeros_or_all_ones(cmd->fs.m_ext.vlan_tci,
> +					       __be16)) {
> +			return -EOPNOTSUPP;
> +		}
> +	}
> +
> +	return 0;
> +}
> +
> +static void add_ip_rule(struct mlx4_en_priv *priv,
> +			struct ethtool_rxnfc *cmd,
> +			struct list_head *list_h)
> +{
> +	struct mlx4_spec_list *spec_l3;
> +
> +	spec_l3 = kzalloc(sizeof *spec_l3, GFP_KERNEL);
> +	if (!spec_l3) {
> +		en_err(priv, "Fail to alloc ethtool rule.\n");
> +		return;
> +	}

This should return an error code as well - logging is not a substitue.

> +	spec_l3->id = MLX4_NET_TRANS_RULE_ID_IPV4;
> +	spec_l3->ipv4.src_ip = cmd->fs.h_u.usr_ip4_spec.ip4src;
> +	if (spec_l3->ipv4.src_ip)
> +		spec_l3->ipv4.src_ip_msk = EN_ETHTOOL_WORD_MASK;
> +	spec_l3->ipv4.dst_ip = cmd->fs.h_u.usr_ip4_spec.ip4dst;
> +	if (spec_l3->ipv4.dst_ip)
> +		spec_l3->ipv4.dst_ip_msk = EN_ETHTOOL_WORD_MASK;

The conditions should be using the mask (cmd->fs.m_u) not the value.

> +	list_add_tail(&spec_l3->list, list_h);
> +}
> +
> +static void add_tcp_udp_rule(struct mlx4_en_priv *priv,
> +			     struct ethtool_rxnfc *cmd,
> +			     struct list_head *list_h, int proto)
> +{
> +	struct mlx4_spec_list *spec_l3;
> +	struct mlx4_spec_list *spec_l4;
> +
> +	spec_l3 = kzalloc(sizeof *spec_l3, GFP_KERNEL);
> +	spec_l4 = kzalloc(sizeof *spec_l4, GFP_KERNEL);
> +	if (!spec_l4 || !spec_l3) {
> +		en_err(priv, "Fail to alloc ethtool rule.\n");

If one of them was successfully allocated, it will now be leaked.

> +		return;
> +	}
> +
> +	spec_l3->id = MLX4_NET_TRANS_RULE_ID_IPV4;
> +
> +	if (proto == TCP_V4_FLOW) {
> +		spec_l4->id = MLX4_NET_TRANS_RULE_ID_TCP;
> +		spec_l3->ipv4.src_ip = cmd->fs.h_u.tcp_ip4_spec.ip4src;
> +		spec_l3->ipv4.dst_ip = cmd->fs.h_u.tcp_ip4_spec.ip4dst;
> +		spec_l4->tcp_udp.src_port = cmd->fs.h_u.tcp_ip4_spec.psrc;
> +		spec_l4->tcp_udp.dst_port = cmd->fs.h_u.tcp_ip4_spec.pdst;
> +	} else {
> +		spec_l4->id = MLX4_NET_TRANS_RULE_ID_UDP;
> +		spec_l3->ipv4.src_ip = cmd->fs.h_u.udp_ip4_spec.ip4src;
> +		spec_l3->ipv4.dst_ip = cmd->fs.h_u.udp_ip4_spec.ip4dst;
> +		spec_l4->tcp_udp.src_port = cmd->fs.h_u.udp_ip4_spec.psrc;
> +		spec_l4->tcp_udp.dst_port = cmd->fs.h_u.udp_ip4_spec.pdst;
> +	}
> +
> +	if (spec_l3->ipv4.src_ip)
> +		spec_l3->ipv4.src_ip_msk = EN_ETHTOOL_WORD_MASK;
> +	if (spec_l3->ipv4.dst_ip)
> +		spec_l3->ipv4.dst_ip_msk = EN_ETHTOOL_WORD_MASK;
> +
> +	if (spec_l4->tcp_udp.src_port)
> +		spec_l4->tcp_udp.src_port_msk = EN_ETHTOOL_SHORT_MASK;
> +	if (spec_l4->tcp_udp.dst_port)
> +		spec_l4->tcp_udp.dst_port_msk = EN_ETHTOOL_SHORT_MASK;

All these conditions should be using the mask, not the value.

> +	list_add_tail(&spec_l3->list, list_h);
> +	list_add_tail(&spec_l4->list, list_h);
> +}
> +
> +static int mlx4_en_ethtool_to_net_trans_rule(struct net_device *dev,
> +					     struct ethtool_rxnfc *cmd,
> +					     struct list_head *rule_list_h)
> +{
> +	int err;
> +	u64 mac;
> +	__be64 be_mac;
> +	struct ethhdr *eth_spec;
> +	struct mlx4_en_priv *priv = netdev_priv(dev);
> +	struct mlx4_spec_list *spec_l2;
> +	__be64 mac_msk = cpu_to_be64(EN_ETHTOOL_MAC_MASK << 16);
> +
> +	err = mlx4_en_validate_flow(dev, cmd);
> +	if (err)
> +		return err;
> +
> +	spec_l2 = kzalloc(sizeof *spec_l2, GFP_KERNEL);
> +	if (!spec_l2)
> +		return -ENOMEM;
> +
> +	mac = priv->mac & EN_ETHTOOL_MAC_MASK;
> +	be_mac = cpu_to_be64(mac << 16);
> +
> +	spec_l2->id = MLX4_NET_TRANS_RULE_ID_ETH;
> +	memcpy(spec_l2->eth.dst_mac_msk, &mac_msk, ETH_ALEN);
> +	if ((cmd->fs.flow_type & ~FLOW_EXT) != ETHER_FLOW)
> +		memcpy(spec_l2->eth.dst_mac, &be_mac, ETH_ALEN);

Does the hardware require filtering by MAC address and not only by layer
3/4 addresses?

> +	if ((cmd->fs.flow_type & FLOW_EXT) && cmd->fs.m_ext.vlan_tci) {
> +		spec_l2->eth.vlan_id = cmd->fs.h_ext.vlan_tci;
> +		spec_l2->eth.vlan_id_msk = cpu_to_be16(0xfff);

This doesn't match mlx4_en_validate_flow(); you are replacing a mask of
0xffff with 0xfff.

> +	}
> +
> +	list_add_tail(&spec_l2->list, rule_list_h);
> +
> +	switch (cmd->fs.flow_type & ~FLOW_EXT) {
> +	case ETHER_FLOW:
> +		eth_spec = &cmd->fs.h_u.ether_spec;
> +		memcpy(&spec_l2->eth.dst_mac, eth_spec->h_dest, ETH_ALEN);
> +		spec_l2->eth.ether_type = eth_spec->h_proto;
> +		if (eth_spec->h_proto)
> +			spec_l2->eth.ether_type_enable = 1;
> +		break;
> +	case IP_USER_FLOW:
> +		add_ip_rule(priv, cmd, rule_list_h);
> +		break;
> +	case TCP_V4_FLOW:
> +		add_tcp_udp_rule(priv, cmd, rule_list_h, TCP_V4_FLOW);
> +		break;
> +	case UDP_V4_FLOW:
> +		add_tcp_udp_rule(priv, cmd, rule_list_h, UDP_V4_FLOW);
> +		break;

All those functions need to be able to return errors.

> +	}
> +	return 0;
> +}
> +
> +static int mlx4_en_flow_replace(struct net_device *dev,
> +				struct ethtool_rxnfc *cmd)
> +{
> +	int err;
> +	struct mlx4_en_priv *priv = netdev_priv(dev);
> +	struct ethtool_flow_id *loc_rule;
> +	struct mlx4_spec_list *spec, *tmp_spec;
> +	u32 qpn;
> +	u64 reg_id;
> +
> +	struct mlx4_net_trans_rule rule = {
> +		.queue_mode = MLX4_NET_TRANS_Q_FIFO,
> +		.exclusive = 0,
> +		.allow_loopback = 1,
> +		.promisc_mode = MLX4_FS_PROMISC_NONE,
> +	};
> +
> +	rule.port = priv->port;
> +	rule.priority = MLX4_DOMAIN_ETHTOOL | cmd->fs.location;
> +	INIT_LIST_HEAD(&rule.list);
> +
> +	/* Allow direct QP attaches if the EN_ETHTOOL_QP_ATTACH flag is set */
> +	if (cmd->fs.ring_cookie == RX_CLS_FLOW_DISC)
> +		return -EOPNOTSUPP;

EINVAL.

[...]
>  static int mlx4_en_get_rxnfc(struct net_device *dev, struct ethtool_rxnfc *cmd,
>  			     u32 *rule_locs)
>  {
>  	struct mlx4_en_priv *priv = netdev_priv(dev);
> +	struct mlx4_en_dev *mdev = priv->mdev;
>  	int err = 0;
> +	int i = 0, priority = 0;
> +
> +	if (mdev->dev->caps.steering_mode != MLX4_STEERING_MODE_DEVICE_MANAGED)
> +		return -EOPNOTSUPP;
>  
>  	switch (cmd->cmd) {
>  	case ETHTOOL_GRXRINGS:
>  		cmd->data = priv->rx_ring_num;
>  		break;
> +	case ETHTOOL_GRXCLSRLCNT:
> +		cmd->rule_cnt = mlx4_en_get_num_flows(priv);
> +		break;
> +	case ETHTOOL_GRXCLSRULE:
> +		err = mlx4_en_get_flow(dev, cmd, cmd->fs.location);
> +		break;
> +	case ETHTOOL_GRXCLSRLALL:
> +		while (!err || err == -ENOENT) {
> +			err = mlx4_en_get_flow(dev, cmd, i);
> +			if (!err)
> +				((u32 *)(rule_locs))[priority++] = i;

I don't see any range check against cmd->rule_cnt.

Why are you casting rule_locs?

Also, are the rules really prioritised by location, so that if rule 0
and 1 match a packet then only rule 0 is applied?  If they are actually
prioritised by the match type then you need to assign locations on the
driver side that reflect the actual priorities.

> +			i++;
> +		}
> +		if (priority)
> +			err = 0;
[...]

But if there are no rules defined, this is an error?  That's not right.
I think you should unconditionally set err = 0 here.

Ben.

-- 
Ben Hutchings, Staff Engineer, Solarflare
Not speaking for my employer; that's the marketing department's job.
They asked us to note that Solarflare product names are trademarked.

^ permalink raw reply

* Re: [PATCH net-next 09/10] net/mlx4_en: Manage flow steering rules with ethtool
From: Joe Perches @ 2012-07-01 16:38 UTC (permalink / raw)
  To: Ben Hutchings
  Cc: Or Gerlitz, davem, roland, yevgenyp, oren, netdev, Hadar Hen Zion
In-Reply-To: <1341158452.4852.107.camel@deadeye.wl.decadent.org.uk>

On Sun, 2012-07-01 at 17:00 +0100, Ben Hutchings wrote:
> On Sun, 2012-07-01 at 12:43 +0300, Or Gerlitz wrote:
> > From: Hadar Hen Zion <hadarh@mellanox.co.il>
[]
> > diff --git a/drivers/net/ethernet/mellanox/mlx4/en_ethtool.c b/drivers/net/ethernet/mellanox/mlx4/en_ethtool.c
[]
> > @@ -599,16 +603,360 @@ static int mlx4_en_set_rxfh_indir(struct net_device *dev,
> >  	return err;
> >  }
> >  
> > +#define not_all_zeros_or_all_ones(field, type) \
> > +	(field && (type)~field)

I think this macro is suboptimal because
negated names are easy to misuse.

I think type is also unnecessary and too
easy to mismatch or keep up to date with
field type changes.

Perhaps it's better as:

#define all_zeros_or_all_ones(field)		\
({						\
	field && (typeof(field))~field;		\
})

> > +
> > +static int mlx4_en_validate_flow(struct net_device *dev,
> > +				 struct ethtool_rxnfc *cmd)
> > +{
[]
> > +		/* don't allow mask which isn't all 0 or 1 */
> > +		if (not_all_zeros_or_all_ones(l4_mask->ip4src, __be32) ||
> > +		    not_all_zeros_or_all_ones(l4_mask->ip4dst, __be32) ||
> > +		    not_all_zeros_or_all_ones(l4_mask->psrc, __be16) ||
> > +		    not_all_zeros_or_all_ones(l4_mask->pdst, __be16))
> > +			return -EOPNOTSUPP;

		if (!all_zeros_or_all_ones(l4_mask->ip4src) ||
		    !all_zeros_or_all_ones(l4_mask->ip4dst) ||
		    !all_zeros_or_all_ones(l4_mask->psrc) ||
		    !all_zeros_or_all_ones(l4_mask->pdst))

^ permalink raw reply

* Re: [PATCH 00/12] Swap-over-NFS without deadlocking V8
From: Eric B Munson @ 2012-07-01 17:22 UTC (permalink / raw)
  To: Mel Gorman
  Cc: Andrew Morton, Linux-MM, Linux-Netdev, Linux-NFS, LKML,
	David Miller, Trond Myklebust, Neil Brown, Christoph Hellwig,
	Peter Zijlstra, Mike Christie, Sebastian Andrzej Siewior
In-Reply-To: <1340976805-5799-1-git-send-email-mgorman@suse.de>

[-- Attachment #1: Type: text/plain, Size: 4245 bytes --]

On Fri, 29 Jun 2012, Mel Gorman wrote:

> Changelog since V7
>   o Rebase to linux-next 20120629
>   o bi->page_dma instead of bi->page in intel driver
>   o Build fix for !CONFIG_NET					(sebastian)
>   o Restore PF_MEMALLOC flags correctly in all cases		(jlayton)
> 
> Changelog since V6
>   o Rebase to linux-next 20120622
> 
> Changelog since V5
>   o Rebase to v3.5-rc3
> 
> Changelog since V4
>   o Catch if SOCK_MEMALLOC flag is cleared with rmem tokens	(davem)
> 
> Changelog since V3
>   o Rebase to 3.4-rc5
>   o kmap pages for writing to swap				(akpm)
>   o Move forward declaration to reduce chance of duplication	(akpm)
> 
> Changelog since V2
>   o Nothing significant, just rebases. A radix tree lookup is replaced with
>     a linear search would be the biggest rebase artifact
> 
> This patch series is based on top of "Swap-over-NBD without deadlocking v14"
> as it depends on the same reservation of PF_MEMALLOC reserves logic.
> 
> When a user or administrator requires swap for their application, they
> create a swap partition and file, format it with mkswap and activate it with
> swapon. In diskless systems this is not an option so if swap if required
> then swapping over the network is considered.  The two likely scenarios
> are when blade servers are used as part of a cluster where the form factor
> or maintenance costs do not allow the use of disks and thin clients.
> 
> The Linux Terminal Server Project recommends the use of the Network
> Block Device (NBD) for swap but this is not always an option.  There is
> no guarantee that the network attached storage (NAS) device is running
> Linux or supports NBD. However, it is likely that it supports NFS so there
> are users that want support for swapping over NFS despite any performance
> concern. Some distributions currently carry patches that support swapping
> over NFS but it would be preferable to support it in the mainline kernel.
> 
> Patch 1 avoids a stream-specific deadlock that potentially affects TCP.
> 
> Patch 2 is a small modification to SELinux to avoid using PFMEMALLOC
> 	reserves.
> 
> Patch 3 adds three helpers for filesystems to handle swap cache pages.
> 	For example, page_file_mapping() returns page->mapping for
> 	file-backed pages and the address_space of the underlying
> 	swap file for swap cache pages.
> 
> Patch 4 adds two address_space_operations to allow a filesystem
> 	to pin all metadata relevant to a swapfile in memory. Upon
> 	successful activation, the swapfile is marked SWP_FILE and
> 	the address space operation ->direct_IO is used for writing
> 	and ->readpage for reading in swap pages.
> 
> Patch 5 notes that patch 3 is bolting
> 	filesystem-specific-swapfile-support onto the side and that
> 	the default handlers have different information to what
> 	is available to the filesystem. This patch refactors the
> 	code so that there are generic handlers for each of the new
> 	address_space operations.
> 
> Patch 6 adds an API to allow a vector of kernel addresses to be
> 	translated to struct pages and pinned for IO.
> 
> Patch 7 adds support for using highmem pages for swap by kmapping
> 	the pages before calling the direct_IO handler.
> 
> Patch 8 updates NFS to use the helpers from patch 3 where necessary.
> 
> Patch 9 avoids setting PF_private on PG_swapcache pages within NFS.
> 
> Patch 10 implements the new swapfile-related address_space operations
> 	for NFS and teaches the direct IO handler how to manage
> 	kernel addresses.
> 
> Patch 11 prevents page allocator recursions in NFS by using GFP_NOIO
> 	where appropriate.
> 
> Patch 12 fixes a NULL pointer dereference that occurs when using
> 	swap-over-NFS.
> 
> With the patches applied, it is possible to mount a swapfile that is on an
> NFS filesystem. Swap performance is not great with a swap stress test taking
> roughly twice as long to complete than if the swap device was backed by NBD.

To test this set I am using memory cgroups to force swap usage.  I am seeing
the cgroup controller killing my processes instead of using the nfs swapfile.

I am not yet sure if I am making a silly mistake or if something else is wrong.

Eric

[-- Attachment #2: Digital signature --]
[-- Type: application/pgp-signature, Size: 490 bytes --]

^ permalink raw reply


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox