Netdev List
 help / color / mirror / Atom feed
* [PATCH 3/5] ipv4: Remove rt_key_{src,dst,tos} from struct rtable.
From: David Miller @ 2012-07-01 12:02 UTC (permalink / raw)
  To: netdev


They are always used in contexts where they can be reconstituted,
or where the finally resolved rt->rt_{src,dst} is semantically
equivalent.

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/route.h     |    5 -----
 net/ipv4/route.c        |   39 +++++++++------------------------------
 net/ipv4/xfrm4_policy.c |    3 ---
 3 files changed, 9 insertions(+), 38 deletions(-)

diff --git a/include/net/route.h b/include/net/route.h
index 04c1b7f..c5f2955 100644
--- a/include/net/route.h
+++ b/include/net/route.h
@@ -45,14 +45,9 @@ struct fib_info;
 struct rtable {
 	struct dst_entry	dst;
 
-	/* Lookup key. */
-	__be32			rt_key_dst;
-	__be32			rt_key_src;
-
 	int			rt_genid;
 	unsigned int		rt_flags;
 	__u16			rt_type;
-	__u8			rt_key_tos;
 
 	__be32			rt_dst;	/* Path destination	*/
 	__be32			rt_src;	/* Path source		*/
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 4377e01..ec38892 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -1219,12 +1219,9 @@ static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
 #endif
 	rth->dst.output = ip_rt_bug;
 
-	rth->rt_key_dst	= daddr;
-	rth->rt_key_src	= saddr;
 	rth->rt_genid	= rt_genid(dev_net(dev));
 	rth->rt_flags	= RTCF_MULTICAST;
 	rth->rt_type	= RTN_MULTICAST;
-	rth->rt_key_tos	= tos;
 	rth->rt_dst	= daddr;
 	rth->rt_src	= saddr;
 	rth->rt_route_iif = dev->ifindex;
@@ -1344,12 +1341,9 @@ static int __mkroute_input(struct sk_buff *skb,
 		goto cleanup;
 	}
 
-	rth->rt_key_dst	= daddr;
-	rth->rt_key_src	= saddr;
 	rth->rt_genid = rt_genid(dev_net(rth->dst.dev));
 	rth->rt_flags = flags;
 	rth->rt_type = res->type;
-	rth->rt_key_tos	= tos;
 	rth->rt_dst	= daddr;
 	rth->rt_src	= saddr;
 	rth->rt_route_iif = in_dev->dev->ifindex;
@@ -1518,12 +1512,9 @@ local_input:
 	rth->dst.tclassid = itag;
 #endif
 
-	rth->rt_key_dst	= daddr;
-	rth->rt_key_src	= saddr;
 	rth->rt_genid = rt_genid(net);
 	rth->rt_flags 	= flags|RTCF_LOCAL;
 	rth->rt_type	= res.type;
-	rth->rt_key_tos	= tos;
 	rth->rt_dst	= daddr;
 	rth->rt_src	= saddr;
 	rth->rt_route_iif = dev->ifindex;
@@ -1626,9 +1617,7 @@ EXPORT_SYMBOL(ip_route_input);
 
 /* called with rcu_read_lock() */
 static struct rtable *__mkroute_output(const struct fib_result *res,
-				       const struct flowi4 *fl4,
-				       __be32 orig_daddr, __be32 orig_saddr,
-				       int orig_oif, __u8 orig_rtos,
+				       const struct flowi4 *fl4, int orig_oif,
 				       struct net_device *dev_out,
 				       unsigned int flags)
 {
@@ -1679,12 +1668,9 @@ static struct rtable *__mkroute_output(const struct fib_result *res,
 
 	rth->dst.output = ip_output;
 
-	rth->rt_key_dst	= orig_daddr;
-	rth->rt_key_src	= orig_saddr;
 	rth->rt_genid = rt_genid(dev_net(dev_out));
 	rth->rt_flags	= flags;
 	rth->rt_type	= type;
-	rth->rt_key_tos	= orig_rtos;
 	rth->rt_dst	= fl4->daddr;
 	rth->rt_src	= fl4->saddr;
 	rth->rt_route_iif = 0;
@@ -1738,8 +1724,6 @@ struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *fl4)
 	unsigned int flags = 0;
 	struct fib_result res;
 	struct rtable *rth;
-	__be32 orig_daddr;
-	__be32 orig_saddr;
 	int orig_oif;
 
 	res.fi		= NULL;
@@ -1748,8 +1732,6 @@ struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *fl4)
 	res.r		= NULL;
 #endif
 
-	orig_daddr = fl4->daddr;
-	orig_saddr = fl4->saddr;
 	orig_oif = fl4->flowi4_oif;
 
 	fl4->flowi4_iif = net->loopback_dev->ifindex;
@@ -1911,8 +1893,7 @@ struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *fl4)
 
 
 make_route:
-	rth = __mkroute_output(&res, fl4, orig_daddr, orig_saddr, orig_oif,
-			       tos, dev_out, flags);
+	rth = __mkroute_output(&res, fl4, orig_oif, dev_out, flags);
 	if (!IS_ERR(rth))
 		rth = rt_finalize(rth, NULL);
 
@@ -1973,9 +1954,6 @@ struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_or
 		if (new->dev)
 			dev_hold(new->dev);
 
-		rt->rt_key_dst = ort->rt_key_dst;
-		rt->rt_key_src = ort->rt_key_src;
-		rt->rt_key_tos = ort->rt_key_tos;
 		rt->rt_route_iif = ort->rt_route_iif;
 		rt->rt_iif = ort->rt_iif;
 		rt->rt_oif = ort->rt_oif;
@@ -2017,7 +1995,7 @@ struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4,
 }
 EXPORT_SYMBOL_GPL(ip_route_output_flow);
 
-static int rt_fill_info(struct net *net,
+static int rt_fill_info(struct net *net,  __be32 src, u8 tos,
 			struct sk_buff *skb, u32 pid, u32 seq, int event,
 			int nowait, unsigned int flags)
 {
@@ -2035,7 +2013,7 @@ static int rt_fill_info(struct net *net,
 	r->rtm_family	 = AF_INET;
 	r->rtm_dst_len	= 32;
 	r->rtm_src_len	= 0;
-	r->rtm_tos	= rt->rt_key_tos;
+	r->rtm_tos	= tos;
 	r->rtm_table	= RT_TABLE_MAIN;
 	if (nla_put_u32(skb, RTA_TABLE, RT_TABLE_MAIN))
 		goto nla_put_failure;
@@ -2048,9 +2026,9 @@ static int rt_fill_info(struct net *net,
 
 	if (nla_put_be32(skb, RTA_DST, rt->rt_dst))
 		goto nla_put_failure;
-	if (rt->rt_key_src) {
+	if (src) {
 		r->rtm_src_len = 32;
-		if (nla_put_be32(skb, RTA_SRC, rt->rt_key_src))
+		if (nla_put_be32(skb, RTA_SRC, src))
 			goto nla_put_failure;
 	}
 	if (rt->dst.dev &&
@@ -2062,7 +2040,7 @@ static int rt_fill_info(struct net *net,
 		goto nla_put_failure;
 #endif
 	if (!rt_is_input_route(rt) &&
-	    rt->rt_src != rt->rt_key_src) {
+	    rt->rt_src != src) {
 		if (nla_put_be32(skb, RTA_PREFSRC, rt->rt_src))
 			goto nla_put_failure;
 	}
@@ -2213,7 +2191,8 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, void
 	if (rtm->rtm_flags & RTM_F_NOTIFY)
 		rt->rt_flags |= RTCF_NOTIFY;
 
-	err = rt_fill_info(net, skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq,
+	err = rt_fill_info(net, src, rtm->rtm_tos, skb,
+			   NETLINK_CB(in_skb).pid, nlh->nlmsg_seq,
 			   RTM_NEWROUTE, 0, 0);
 	if (err <= 0)
 		goto errout_free;
diff --git a/net/ipv4/xfrm4_policy.c b/net/ipv4/xfrm4_policy.c
index 9815ea0..01808c5 100644
--- a/net/ipv4/xfrm4_policy.c
+++ b/net/ipv4/xfrm4_policy.c
@@ -79,9 +79,6 @@ static int xfrm4_fill_dst(struct xfrm_dst *xdst, struct net_device *dev,
 	struct rtable *rt = (struct rtable *)xdst->route;
 	const struct flowi4 *fl4 = &fl->u.ip4;
 
-	xdst->u.rt.rt_key_dst = fl4->daddr;
-	xdst->u.rt.rt_key_src = fl4->saddr;
-	xdst->u.rt.rt_key_tos = fl4->flowi4_tos;
 	xdst->u.rt.rt_route_iif = fl4->flowi4_iif;
 	xdst->u.rt.rt_iif = fl4->flowi4_iif;
 	xdst->u.rt.rt_oif = fl4->flowi4_oif;
-- 
1.7.10

^ permalink raw reply related

* [PATCH 2/5] ipv4: Kill ip_route_input_noref().
From: David Miller @ 2012-07-01 12:02 UTC (permalink / raw)
  To: netdev


The "noref" argument to ip_route_input_common() is now always ignored
because we do not cache routes, and in that case we must always grab
a reference to the resulting 'dst'.

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/route.h    |   16 ++--------------
 net/ipv4/arp.c         |    2 +-
 net/ipv4/ip_fragment.c |    4 ++--
 net/ipv4/ip_input.c    |    4 ++--
 net/ipv4/route.c       |    6 +++---
 net/ipv4/xfrm4_input.c |    4 ++--
 6 files changed, 12 insertions(+), 24 deletions(-)

diff --git a/include/net/route.h b/include/net/route.h
index 23d8026..04c1b7f 100644
--- a/include/net/route.h
+++ b/include/net/route.h
@@ -198,20 +198,8 @@ static inline struct rtable *ip_route_output_gre(struct net *net, struct flowi4
 	return ip_route_output_key(net, fl4);
 }
 
-extern int ip_route_input_common(struct sk_buff *skb, __be32 dst, __be32 src,
-				 u8 tos, struct net_device *devin, bool noref);
-
-static inline int ip_route_input(struct sk_buff *skb, __be32 dst, __be32 src,
-				 u8 tos, struct net_device *devin)
-{
-	return ip_route_input_common(skb, dst, src, tos, devin, false);
-}
-
-static inline int ip_route_input_noref(struct sk_buff *skb, __be32 dst, __be32 src,
-				       u8 tos, struct net_device *devin)
-{
-	return ip_route_input_common(skb, dst, src, tos, devin, true);
-}
+extern int ip_route_input(struct sk_buff *skb, __be32 dst, __be32 src,
+			  u8 tos, struct net_device *devin);
 
 extern void ipv4_update_pmtu(struct sk_buff *skb, struct net *net, u32 mtu,
 			     int oif, u32 mark, u8 protocol, int flow_flags);
diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c
index 2e560f0..c38293f 100644
--- a/net/ipv4/arp.c
+++ b/net/ipv4/arp.c
@@ -828,7 +828,7 @@ static int arp_process(struct sk_buff *skb)
 	}
 
 	if (arp->ar_op == htons(ARPOP_REQUEST) &&
-	    ip_route_input_noref(skb, tip, sip, 0, dev) == 0) {
+	    ip_route_input(skb, tip, sip, 0, dev) == 0) {
 
 		rt = skb_rtable(skb);
 		addr_type = rt->rt_type;
diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c
index 8d07c97..7ad88e5 100644
--- a/net/ipv4/ip_fragment.c
+++ b/net/ipv4/ip_fragment.c
@@ -258,8 +258,8 @@ static void ip_expire(unsigned long arg)
 		/* skb dst is stale, drop it, and perform route lookup again */
 		skb_dst_drop(head);
 		iph = ip_hdr(head);
-		err = ip_route_input_noref(head, iph->daddr, iph->saddr,
-					   iph->tos, head->dev);
+		err = ip_route_input(head, iph->daddr, iph->saddr,
+				     iph->tos, head->dev);
 		if (err)
 			goto out_rcu_unlock;
 
diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c
index b27d444..4ebc6fe 100644
--- a/net/ipv4/ip_input.c
+++ b/net/ipv4/ip_input.c
@@ -336,8 +336,8 @@ static int ip_rcv_finish(struct sk_buff *skb)
 	 *	how the packet travels inside Linux networking.
 	 */
 	if (!skb_dst(skb)) {
-		int err = ip_route_input_noref(skb, iph->daddr, iph->saddr,
-					       iph->tos, skb->dev);
+		int err = ip_route_input(skb, iph->daddr, iph->saddr,
+					 iph->tos, skb->dev);
 		if (unlikely(err)) {
 			if (err == -EXDEV)
 				NET_INC_STATS_BH(dev_net(skb->dev),
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index e7650c5..4377e01 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -1578,8 +1578,8 @@ martian_source_keep_err:
 	goto out;
 }
 
-int ip_route_input_common(struct sk_buff *skb, __be32 daddr, __be32 saddr,
-			   u8 tos, struct net_device *dev, bool noref)
+int ip_route_input(struct sk_buff *skb, __be32 daddr, __be32 saddr,
+		   u8 tos, struct net_device *dev)
 {
 	int res;
 
@@ -1622,7 +1622,7 @@ int ip_route_input_common(struct sk_buff *skb, __be32 daddr, __be32 saddr,
 	rcu_read_unlock();
 	return res;
 }
-EXPORT_SYMBOL(ip_route_input_common);
+EXPORT_SYMBOL(ip_route_input);
 
 /* called with rcu_read_lock() */
 static struct rtable *__mkroute_output(const struct fib_result *res,
diff --git a/net/ipv4/xfrm4_input.c b/net/ipv4/xfrm4_input.c
index 06814b6..58d23a5 100644
--- a/net/ipv4/xfrm4_input.c
+++ b/net/ipv4/xfrm4_input.c
@@ -27,8 +27,8 @@ static inline int xfrm4_rcv_encap_finish(struct sk_buff *skb)
 	if (skb_dst(skb) == NULL) {
 		const struct iphdr *iph = ip_hdr(skb);
 
-		if (ip_route_input_noref(skb, iph->daddr, iph->saddr,
-					 iph->tos, skb->dev))
+		if (ip_route_input(skb, iph->daddr, iph->saddr,
+				   iph->tos, skb->dev))
 			goto drop;
 	}
 	return dst_input(skb);
-- 
1.7.10

^ permalink raw reply related

* [PATCH 1/5] ipv4: Delete routing cache.
From: David Miller @ 2012-07-01 12:02 UTC (permalink / raw)
  To: netdev


Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/route.h     |    5 +-
 net/ipv4/fib_frontend.c |    5 -
 net/ipv4/icmp.c         |    5 +-
 net/ipv4/route.c        | 1053 +++--------------------------------------------
 4 files changed, 52 insertions(+), 1016 deletions(-)

diff --git a/include/net/route.h b/include/net/route.h
index 211e266..23d8026 100644
--- a/include/net/route.h
+++ b/include/net/route.h
@@ -144,10 +144,9 @@ extern struct ip_rt_acct __percpu *ip_rt_acct;
 
 struct in_device;
 extern int		ip_rt_init(void);
-extern void		ip_rt_redirect(__be32 old_gw, __be32 dst, __be32 new_gw,
-				       __be32 src, struct net_device *dev);
+extern void		ip_rt_redirect(struct sk_buff *skb, __be32 old_gw,
+				       __be32 new_gw);
 extern void		rt_cache_flush(struct net *net, int how);
-extern void		rt_cache_flush_batch(struct net *net);
 extern struct rtable *__ip_route_output_key(struct net *, struct flowi4 *flp);
 extern struct rtable *ip_route_output_flow(struct net *, struct flowi4 *flp,
 					   struct sock *sk);
diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c
index 3e11ea2..a658fb4 100644
--- a/net/ipv4/fib_frontend.c
+++ b/net/ipv4/fib_frontend.c
@@ -1065,11 +1065,6 @@ static int fib_netdev_event(struct notifier_block *this, unsigned long event, vo
 		rt_cache_flush(dev_net(dev), 0);
 		break;
 	case NETDEV_UNREGISTER_BATCH:
-		/* The batch unregister is only called on the first
-		 * device in the list of devices being unregistered.
-		 * Therefore we should not pass dev_net(dev) in here.
-		 */
-		rt_cache_flush_batch(NULL);
 		break;
 	}
 	return NOTIFY_DONE;
diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c
index 4bce5a2..7ad960a 100644
--- a/net/ipv4/icmp.c
+++ b/net/ipv4/icmp.c
@@ -774,9 +774,8 @@ static void icmp_redirect(struct sk_buff *skb)
 		 */
 	case ICMP_REDIR_HOST:
 	case ICMP_REDIR_HOSTTOS:
-		ip_rt_redirect(ip_hdr(skb)->saddr, iph->daddr,
-			       icmp_hdr(skb)->un.gateway,
-			       iph->saddr, skb->dev);
+		ip_rt_redirect(skb, ip_hdr(skb)->saddr,
+			       icmp_hdr(skb)->un.gateway);
 		break;
 	}
 
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 6a5afc7..e7650c5 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -133,10 +133,6 @@ static int ip_rt_gc_elasticity __read_mostly	= 8;
 static int ip_rt_mtu_expires __read_mostly	= 10 * 60 * HZ;
 static int ip_rt_min_pmtu __read_mostly		= 512 + 20 + 20;
 static int ip_rt_min_advmss __read_mostly	= 256;
-static int rt_chain_length_max __read_mostly	= 20;
-
-static struct delayed_work expires_work;
-static unsigned long expires_ljiffies;
 
 /*
  *	Interface to generic destination cache.
@@ -149,7 +145,6 @@ static void		 ipv4_dst_destroy(struct dst_entry *dst);
 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
 static void		 ipv4_link_failure(struct sk_buff *skb);
 static void		 ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
-static int rt_garbage_collect(struct dst_ops *ops);
 
 static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
 			    int how)
@@ -193,7 +188,6 @@ static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst, const vo
 static struct dst_ops ipv4_dst_ops = {
 	.family =		AF_INET,
 	.protocol =		cpu_to_be16(ETH_P_IP),
-	.gc =			rt_garbage_collect,
 	.check =		ipv4_dst_check,
 	.default_advmss =	ipv4_default_advmss,
 	.mtu =			ipv4_mtu,
@@ -229,184 +223,30 @@ const __u8 ip_tos2prio[16] = {
 };
 EXPORT_SYMBOL(ip_tos2prio);
 
-/*
- * Route cache.
- */
-
-/* The locking scheme is rather straight forward:
- *
- * 1) Read-Copy Update protects the buckets of the central route hash.
- * 2) Only writers remove entries, and they hold the lock
- *    as they look at rtable reference counts.
- * 3) Only readers acquire references to rtable entries,
- *    they do so with atomic increments and with the
- *    lock held.
- */
-
-struct rt_hash_bucket {
-	struct rtable __rcu	*chain;
-};
-
-#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) || \
-	defined(CONFIG_PROVE_LOCKING)
-/*
- * Instead of using one spinlock for each rt_hash_bucket, we use a table of spinlocks
- * The size of this table is a power of two and depends on the number of CPUS.
- * (on lockdep we have a quite big spinlock_t, so keep the size down there)
- */
-#ifdef CONFIG_LOCKDEP
-# define RT_HASH_LOCK_SZ	256
-#else
-# if NR_CPUS >= 32
-#  define RT_HASH_LOCK_SZ	4096
-# elif NR_CPUS >= 16
-#  define RT_HASH_LOCK_SZ	2048
-# elif NR_CPUS >= 8
-#  define RT_HASH_LOCK_SZ	1024
-# elif NR_CPUS >= 4
-#  define RT_HASH_LOCK_SZ	512
-# else
-#  define RT_HASH_LOCK_SZ	256
-# endif
-#endif
-
-static spinlock_t	*rt_hash_locks;
-# define rt_hash_lock_addr(slot) &rt_hash_locks[(slot) & (RT_HASH_LOCK_SZ - 1)]
-
-static __init void rt_hash_lock_init(void)
-{
-	int i;
-
-	rt_hash_locks = kmalloc(sizeof(spinlock_t) * RT_HASH_LOCK_SZ,
-			GFP_KERNEL);
-	if (!rt_hash_locks)
-		panic("IP: failed to allocate rt_hash_locks\n");
-
-	for (i = 0; i < RT_HASH_LOCK_SZ; i++)
-		spin_lock_init(&rt_hash_locks[i]);
-}
-#else
-# define rt_hash_lock_addr(slot) NULL
-
-static inline void rt_hash_lock_init(void)
-{
-}
-#endif
-
-static struct rt_hash_bucket 	*rt_hash_table __read_mostly;
-static unsigned int		rt_hash_mask __read_mostly;
-static unsigned int		rt_hash_log  __read_mostly;
-
 static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
 #define RT_CACHE_STAT_INC(field) __this_cpu_inc(rt_cache_stat.field)
 
-static inline unsigned int rt_hash(__be32 daddr, __be32 saddr, int idx,
-				   int genid)
-{
-	return jhash_3words((__force u32)daddr, (__force u32)saddr,
-			    idx, genid)
-		& rt_hash_mask;
-}
-
 static inline int rt_genid(struct net *net)
 {
 	return atomic_read(&net->ipv4.rt_genid);
 }
 
 #ifdef CONFIG_PROC_FS
-struct rt_cache_iter_state {
-	struct seq_net_private p;
-	int bucket;
-	int genid;
-};
-
-static struct rtable *rt_cache_get_first(struct seq_file *seq)
-{
-	struct rt_cache_iter_state *st = seq->private;
-	struct rtable *r = NULL;
-
-	for (st->bucket = rt_hash_mask; st->bucket >= 0; --st->bucket) {
-		if (!rcu_access_pointer(rt_hash_table[st->bucket].chain))
-			continue;
-		rcu_read_lock_bh();
-		r = rcu_dereference_bh(rt_hash_table[st->bucket].chain);
-		while (r) {
-			if (dev_net(r->dst.dev) == seq_file_net(seq) &&
-			    r->rt_genid == st->genid)
-				return r;
-			r = rcu_dereference_bh(r->dst.rt_next);
-		}
-		rcu_read_unlock_bh();
-	}
-	return r;
-}
-
-static struct rtable *__rt_cache_get_next(struct seq_file *seq,
-					  struct rtable *r)
-{
-	struct rt_cache_iter_state *st = seq->private;
-
-	r = rcu_dereference_bh(r->dst.rt_next);
-	while (!r) {
-		rcu_read_unlock_bh();
-		do {
-			if (--st->bucket < 0)
-				return NULL;
-		} while (!rcu_access_pointer(rt_hash_table[st->bucket].chain));
-		rcu_read_lock_bh();
-		r = rcu_dereference_bh(rt_hash_table[st->bucket].chain);
-	}
-	return r;
-}
-
-static struct rtable *rt_cache_get_next(struct seq_file *seq,
-					struct rtable *r)
-{
-	struct rt_cache_iter_state *st = seq->private;
-	while ((r = __rt_cache_get_next(seq, r)) != NULL) {
-		if (dev_net(r->dst.dev) != seq_file_net(seq))
-			continue;
-		if (r->rt_genid == st->genid)
-			break;
-	}
-	return r;
-}
-
-static struct rtable *rt_cache_get_idx(struct seq_file *seq, loff_t pos)
-{
-	struct rtable *r = rt_cache_get_first(seq);
-
-	if (r)
-		while (pos && (r = rt_cache_get_next(seq, r)))
-			--pos;
-	return pos ? NULL : r;
-}
-
 static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
 {
-	struct rt_cache_iter_state *st = seq->private;
 	if (*pos)
-		return rt_cache_get_idx(seq, *pos - 1);
-	st->genid = rt_genid(seq_file_net(seq));
+		return NULL;
 	return SEQ_START_TOKEN;
 }
 
 static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 {
-	struct rtable *r;
-
-	if (v == SEQ_START_TOKEN)
-		r = rt_cache_get_first(seq);
-	else
-		r = rt_cache_get_next(seq, v);
 	++*pos;
-	return r;
+	return NULL;
 }
 
 static void rt_cache_seq_stop(struct seq_file *seq, void *v)
 {
-	if (v && v != SEQ_START_TOKEN)
-		rcu_read_unlock_bh();
 }
 
 static int rt_cache_seq_show(struct seq_file *seq, void *v)
@@ -416,34 +256,6 @@ static int rt_cache_seq_show(struct seq_file *seq, void *v)
 			   "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
 			   "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
 			   "HHUptod\tSpecDst");
-	else {
-		struct rtable *r = v;
-		struct neighbour *n;
-		int len, HHUptod;
-
-		rcu_read_lock();
-		n = dst_get_neighbour_noref(&r->dst);
-		HHUptod = (n && (n->nud_state & NUD_CONNECTED)) ? 1 : 0;
-		rcu_read_unlock();
-
-		seq_printf(seq, "%s\t%08X\t%08X\t%8X\t%d\t%u\t%d\t"
-			      "%08X\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X%n",
-			r->dst.dev ? r->dst.dev->name : "*",
-			(__force u32)r->rt_dst,
-			(__force u32)r->rt_gateway,
-			r->rt_flags, atomic_read(&r->dst.__refcnt),
-			r->dst.__use, 0, (__force u32)r->rt_src,
-			dst_metric_advmss(&r->dst) + 40,
-			dst_metric(&r->dst, RTAX_WINDOW),
-			(int)((dst_metric(&r->dst, RTAX_RTT) >> 3) +
-			      dst_metric(&r->dst, RTAX_RTTVAR)),
-			r->rt_key_tos,
-			-1,
-			HHUptod,
-			0, &len);
-
-		seq_printf(seq, "%*s\n", 127 - len, "");
-	}
 	return 0;
 }
 
@@ -456,8 +268,7 @@ static const struct seq_operations rt_cache_seq_ops = {
 
 static int rt_cache_seq_open(struct inode *inode, struct file *file)
 {
-	return seq_open_net(inode, file, &rt_cache_seq_ops,
-			sizeof(struct rt_cache_iter_state));
+	return seq_open(file, &rt_cache_seq_ops);
 }
 
 static const struct file_operations rt_cache_seq_fops = {
@@ -465,7 +276,7 @@ static const struct file_operations rt_cache_seq_fops = {
 	.open	 = rt_cache_seq_open,
 	.read	 = seq_read,
 	.llseek	 = seq_lseek,
-	.release = seq_release_net,
+	.release = seq_release,
 };
 
 
@@ -655,263 +466,12 @@ static inline int ip_rt_proc_init(void)
 }
 #endif /* CONFIG_PROC_FS */
 
-static inline void rt_free(struct rtable *rt)
-{
-	call_rcu_bh(&rt->dst.rcu_head, dst_rcu_free);
-}
-
-static inline void rt_drop(struct rtable *rt)
-{
-	ip_rt_put(rt);
-	call_rcu_bh(&rt->dst.rcu_head, dst_rcu_free);
-}
-
-static inline int rt_fast_clean(struct rtable *rth)
-{
-	/* Kill broadcast/multicast entries very aggresively, if they
-	   collide in hash table with more useful entries */
-	return (rth->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) &&
-		rt_is_input_route(rth) && rth->dst.rt_next;
-}
-
-static inline int rt_valuable(struct rtable *rth)
-{
-	return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) ||
-		(rt_has_peer(rth) && rt_peer_ptr(rth)->pmtu_expires);
-}
-
-static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2)
-{
-	unsigned long age;
-	int ret = 0;
-
-	if (atomic_read(&rth->dst.__refcnt))
-		goto out;
-
-	age = jiffies - rth->dst.lastuse;
-	if ((age <= tmo1 && !rt_fast_clean(rth)) ||
-	    (age <= tmo2 && rt_valuable(rth)))
-		goto out;
-	ret = 1;
-out:	return ret;
-}
-
-/* Bits of score are:
- * 31: very valuable
- * 30: not quite useless
- * 29..0: usage counter
- */
-static inline u32 rt_score(struct rtable *rt)
-{
-	u32 score = jiffies - rt->dst.lastuse;
-
-	score = ~score & ~(3<<30);
-
-	if (rt_valuable(rt))
-		score |= (1<<31);
-
-	if (rt_is_output_route(rt) ||
-	    !(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST|RTCF_LOCAL)))
-		score |= (1<<30);
-
-	return score;
-}
-
-static inline bool rt_caching(const struct net *net)
-{
-	return net->ipv4.current_rt_cache_rebuild_count <=
-		net->ipv4.sysctl_rt_cache_rebuild_count;
-}
-
-static inline bool compare_hash_inputs(const struct rtable *rt1,
-				       const struct rtable *rt2)
-{
-	return ((((__force u32)rt1->rt_key_dst ^ (__force u32)rt2->rt_key_dst) |
-		((__force u32)rt1->rt_key_src ^ (__force u32)rt2->rt_key_src) |
-		(rt1->rt_route_iif ^ rt2->rt_route_iif)) == 0);
-}
-
-static inline int compare_keys(struct rtable *rt1, struct rtable *rt2)
-{
-	return (((__force u32)rt1->rt_key_dst ^ (__force u32)rt2->rt_key_dst) |
-		((__force u32)rt1->rt_key_src ^ (__force u32)rt2->rt_key_src) |
-		(rt1->rt_mark ^ rt2->rt_mark) |
-		(rt1->rt_key_tos ^ rt2->rt_key_tos) |
-		(rt1->rt_route_iif ^ rt2->rt_route_iif) |
-		(rt1->rt_oif ^ rt2->rt_oif)) == 0;
-}
-
-static inline int compare_netns(struct rtable *rt1, struct rtable *rt2)
-{
-	return net_eq(dev_net(rt1->dst.dev), dev_net(rt2->dst.dev));
-}
-
 static inline int rt_is_expired(struct rtable *rth)
 {
 	return rth->rt_genid != rt_genid(dev_net(rth->dst.dev));
 }
 
 /*
- * Perform a full scan of hash table and free all entries.
- * Can be called by a softirq or a process.
- * In the later case, we want to be reschedule if necessary
- */
-static void rt_do_flush(struct net *net, int process_context)
-{
-	unsigned int i;
-	struct rtable *rth, *next;
-
-	for (i = 0; i <= rt_hash_mask; i++) {
-		struct rtable __rcu **pprev;
-		struct rtable *list;
-
-		if (process_context && need_resched())
-			cond_resched();
-		rth = rcu_access_pointer(rt_hash_table[i].chain);
-		if (!rth)
-			continue;
-
-		spin_lock_bh(rt_hash_lock_addr(i));
-
-		list = NULL;
-		pprev = &rt_hash_table[i].chain;
-		rth = rcu_dereference_protected(*pprev,
-			lockdep_is_held(rt_hash_lock_addr(i)));
-
-		while (rth) {
-			next = rcu_dereference_protected(rth->dst.rt_next,
-				lockdep_is_held(rt_hash_lock_addr(i)));
-
-			if (!net ||
-			    net_eq(dev_net(rth->dst.dev), net)) {
-				rcu_assign_pointer(*pprev, next);
-				rcu_assign_pointer(rth->dst.rt_next, list);
-				list = rth;
-			} else {
-				pprev = &rth->dst.rt_next;
-			}
-			rth = next;
-		}
-
-		spin_unlock_bh(rt_hash_lock_addr(i));
-
-		for (; list; list = next) {
-			next = rcu_dereference_protected(list->dst.rt_next, 1);
-			rt_free(list);
-		}
-	}
-}
-
-/*
- * While freeing expired entries, we compute average chain length
- * and standard deviation, using fixed-point arithmetic.
- * This to have an estimation of rt_chain_length_max
- *  rt_chain_length_max = max(elasticity, AVG + 4*SD)
- * We use 3 bits for frational part, and 29 (or 61) for magnitude.
- */
-
-#define FRACT_BITS 3
-#define ONE (1UL << FRACT_BITS)
-
-/*
- * Given a hash chain and an item in this hash chain,
- * find if a previous entry has the same hash_inputs
- * (but differs on tos, mark or oif)
- * Returns 0 if an alias is found.
- * Returns ONE if rth has no alias before itself.
- */
-static int has_noalias(const struct rtable *head, const struct rtable *rth)
-{
-	const struct rtable *aux = head;
-
-	while (aux != rth) {
-		if (compare_hash_inputs(aux, rth))
-			return 0;
-		aux = rcu_dereference_protected(aux->dst.rt_next, 1);
-	}
-	return ONE;
-}
-
-static void rt_check_expire(void)
-{
-	static unsigned int rover;
-	unsigned int i = rover, goal;
-	struct rtable *rth;
-	struct rtable __rcu **rthp;
-	unsigned long samples = 0;
-	unsigned long sum = 0, sum2 = 0;
-	unsigned long delta;
-	u64 mult;
-
-	delta = jiffies - expires_ljiffies;
-	expires_ljiffies = jiffies;
-	mult = ((u64)delta) << rt_hash_log;
-	if (ip_rt_gc_timeout > 1)
-		do_div(mult, ip_rt_gc_timeout);
-	goal = (unsigned int)mult;
-	if (goal > rt_hash_mask)
-		goal = rt_hash_mask + 1;
-	for (; goal > 0; goal--) {
-		unsigned long tmo = ip_rt_gc_timeout;
-		unsigned long length;
-
-		i = (i + 1) & rt_hash_mask;
-		rthp = &rt_hash_table[i].chain;
-
-		if (need_resched())
-			cond_resched();
-
-		samples++;
-
-		if (rcu_dereference_raw(*rthp) == NULL)
-			continue;
-		length = 0;
-		spin_lock_bh(rt_hash_lock_addr(i));
-		while ((rth = rcu_dereference_protected(*rthp,
-					lockdep_is_held(rt_hash_lock_addr(i)))) != NULL) {
-			prefetch(rth->dst.rt_next);
-			if (rt_is_expired(rth) ||
-			    rt_may_expire(rth, tmo, ip_rt_gc_timeout)) {
-				*rthp = rth->dst.rt_next;
-				rt_free(rth);
-				continue;
-			}
-
-			/* We only count entries on a chain with equal
-			 * hash inputs once so that entries for
-			 * different QOS levels, and other non-hash
-			 * input attributes don't unfairly skew the
-			 * length computation
-			 */
-			tmo >>= 1;
-			rthp = &rth->dst.rt_next;
-			length += has_noalias(rt_hash_table[i].chain, rth);
-		}
-		spin_unlock_bh(rt_hash_lock_addr(i));
-		sum += length;
-		sum2 += length*length;
-	}
-	if (samples) {
-		unsigned long avg = sum / samples;
-		unsigned long sd = int_sqrt(sum2 / samples - avg*avg);
-		rt_chain_length_max = max_t(unsigned long,
-					ip_rt_gc_elasticity,
-					(avg + 4*sd) >> FRACT_BITS);
-	}
-	rover = i;
-}
-
-/*
- * rt_worker_func() is run in process context.
- * we call rt_check_expire() to scan part of the hash table
- */
-static void rt_worker_func(struct work_struct *work)
-{
-	rt_check_expire();
-	schedule_delayed_work(&expires_work, ip_rt_gc_interval);
-}
-
-/*
  * Perturbation of rt_genid by a small quantity [1..256]
  * Using 8 bits of shuffling ensure we can call rt_cache_invalidate()
  * many times (2^24) without giving recent rt_genid.
@@ -933,167 +493,6 @@ static void rt_cache_invalidate(struct net *net)
 void rt_cache_flush(struct net *net, int delay)
 {
 	rt_cache_invalidate(net);
-	if (delay >= 0)
-		rt_do_flush(net, !in_softirq());
-}
-
-/* Flush previous cache invalidated entries from the cache */
-void rt_cache_flush_batch(struct net *net)
-{
-	rt_do_flush(net, !in_softirq());
-}
-
-static void rt_emergency_hash_rebuild(struct net *net)
-{
-	net_warn_ratelimited("Route hash chain too long!\n");
-	rt_cache_invalidate(net);
-}
-
-/*
-   Short description of GC goals.
-
-   We want to build algorithm, which will keep routing cache
-   at some equilibrium point, when number of aged off entries
-   is kept approximately equal to newly generated ones.
-
-   Current expiration strength is variable "expire".
-   We try to adjust it dynamically, so that if networking
-   is idle expires is large enough to keep enough of warm entries,
-   and when load increases it reduces to limit cache size.
- */
-
-static int rt_garbage_collect(struct dst_ops *ops)
-{
-	static unsigned long expire = RT_GC_TIMEOUT;
-	static unsigned long last_gc;
-	static int rover;
-	static int equilibrium;
-	struct rtable *rth;
-	struct rtable __rcu **rthp;
-	unsigned long now = jiffies;
-	int goal;
-	int entries = dst_entries_get_fast(&ipv4_dst_ops);
-
-	/*
-	 * Garbage collection is pretty expensive,
-	 * do not make it too frequently.
-	 */
-
-	RT_CACHE_STAT_INC(gc_total);
-
-	if (now - last_gc < ip_rt_gc_min_interval &&
-	    entries < ip_rt_max_size) {
-		RT_CACHE_STAT_INC(gc_ignored);
-		goto out;
-	}
-
-	entries = dst_entries_get_slow(&ipv4_dst_ops);
-	/* Calculate number of entries, which we want to expire now. */
-	goal = entries - (ip_rt_gc_elasticity << rt_hash_log);
-	if (goal <= 0) {
-		if (equilibrium < ipv4_dst_ops.gc_thresh)
-			equilibrium = ipv4_dst_ops.gc_thresh;
-		goal = entries - equilibrium;
-		if (goal > 0) {
-			equilibrium += min_t(unsigned int, goal >> 1, rt_hash_mask + 1);
-			goal = entries - equilibrium;
-		}
-	} else {
-		/* We are in dangerous area. Try to reduce cache really
-		 * aggressively.
-		 */
-		goal = max_t(unsigned int, goal >> 1, rt_hash_mask + 1);
-		equilibrium = entries - goal;
-	}
-
-	if (now - last_gc >= ip_rt_gc_min_interval)
-		last_gc = now;
-
-	if (goal <= 0) {
-		equilibrium += goal;
-		goto work_done;
-	}
-
-	do {
-		int i, k;
-
-		for (i = rt_hash_mask, k = rover; i >= 0; i--) {
-			unsigned long tmo = expire;
-
-			k = (k + 1) & rt_hash_mask;
-			rthp = &rt_hash_table[k].chain;
-			spin_lock_bh(rt_hash_lock_addr(k));
-			while ((rth = rcu_dereference_protected(*rthp,
-					lockdep_is_held(rt_hash_lock_addr(k)))) != NULL) {
-				if (!rt_is_expired(rth) &&
-					!rt_may_expire(rth, tmo, expire)) {
-					tmo >>= 1;
-					rthp = &rth->dst.rt_next;
-					continue;
-				}
-				*rthp = rth->dst.rt_next;
-				rt_free(rth);
-				goal--;
-			}
-			spin_unlock_bh(rt_hash_lock_addr(k));
-			if (goal <= 0)
-				break;
-		}
-		rover = k;
-
-		if (goal <= 0)
-			goto work_done;
-
-		/* Goal is not achieved. We stop process if:
-
-		   - if expire reduced to zero. Otherwise, expire is halfed.
-		   - if table is not full.
-		   - if we are called from interrupt.
-		   - jiffies check is just fallback/debug loop breaker.
-		     We will not spin here for long time in any case.
-		 */
-
-		RT_CACHE_STAT_INC(gc_goal_miss);
-
-		if (expire == 0)
-			break;
-
-		expire >>= 1;
-
-		if (dst_entries_get_fast(&ipv4_dst_ops) < ip_rt_max_size)
-			goto out;
-	} while (!in_softirq() && time_before_eq(jiffies, now));
-
-	if (dst_entries_get_fast(&ipv4_dst_ops) < ip_rt_max_size)
-		goto out;
-	if (dst_entries_get_slow(&ipv4_dst_ops) < ip_rt_max_size)
-		goto out;
-	net_warn_ratelimited("dst cache overflow\n");
-	RT_CACHE_STAT_INC(gc_dst_overflow);
-	return 1;
-
-work_done:
-	expire += ip_rt_gc_min_interval;
-	if (expire > ip_rt_gc_timeout ||
-	    dst_entries_get_fast(&ipv4_dst_ops) < ipv4_dst_ops.gc_thresh ||
-	    dst_entries_get_slow(&ipv4_dst_ops) < ipv4_dst_ops.gc_thresh)
-		expire = ip_rt_gc_timeout;
-out:	return 0;
-}
-
-/*
- * Returns number of entries in a hash chain that have different hash_inputs
- */
-static int slow_chain_length(const struct rtable *head)
-{
-	int length = 0;
-	const struct rtable *rth = head;
-
-	while (rth) {
-		length += has_noalias(head, rth);
-		rth = rcu_dereference_protected(rth->dst.rt_next, 1);
-	}
-	return length >> FRACT_BITS;
 }
 
 static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst, const void *daddr)
@@ -1127,178 +526,17 @@ static int rt_bind_neighbour(struct rtable *rt)
 	return 0;
 }
 
-static struct rtable *rt_intern_hash(unsigned int hash, struct rtable *rt,
-				     struct sk_buff *skb, int ifindex)
+static struct rtable *rt_finalize(struct rtable *rt, struct sk_buff *skb)
 {
-	struct rtable	*rth, *cand;
-	struct rtable __rcu **rthp, **candp;
-	unsigned long	now;
-	u32 		min_score;
-	int		chain_length;
-	int attempts = !in_softirq();
-
-restart:
-	chain_length = 0;
-	min_score = ~(u32)0;
-	cand = NULL;
-	candp = NULL;
-	now = jiffies;
-
-	if (!rt_caching(dev_net(rt->dst.dev)) || (rt->dst.flags & DST_NOCACHE)) {
-		/*
-		 * If we're not caching, just tell the caller we
-		 * were successful and don't touch the route.  The
-		 * caller hold the sole reference to the cache entry, and
-		 * it will be released when the caller is done with it.
-		 * If we drop it here, the callers have no way to resolve routes
-		 * when we're not caching.  Instead, just point *rp at rt, so
-		 * the caller gets a single use out of the route
-		 * Note that we do rt_free on this new route entry, so that
-		 * once its refcount hits zero, we are still able to reap it
-		 * (Thanks Alexey)
-		 * Note: To avoid expensive rcu stuff for this uncached dst,
-		 * we set DST_NOCACHE so that dst_release() can free dst without
-		 * waiting a grace period.
-		 */
-
-		rt->dst.flags |= DST_NOCACHE;
-		if (rt->rt_type == RTN_UNICAST || rt_is_output_route(rt)) {
-			int err = rt_bind_neighbour(rt);
-			if (err) {
-				net_warn_ratelimited("Neighbour table failure & not caching routes\n");
-				ip_rt_put(rt);
-				return ERR_PTR(err);
-			}
-		}
-
-		goto skip_hashing;
-	}
-
-	rthp = &rt_hash_table[hash].chain;
-
-	spin_lock_bh(rt_hash_lock_addr(hash));
-	while ((rth = rcu_dereference_protected(*rthp,
-			lockdep_is_held(rt_hash_lock_addr(hash)))) != NULL) {
-		if (rt_is_expired(rth)) {
-			*rthp = rth->dst.rt_next;
-			rt_free(rth);
-			continue;
-		}
-		if (compare_keys(rth, rt) && compare_netns(rth, rt)) {
-			/* Put it first */
-			*rthp = rth->dst.rt_next;
-			/*
-			 * Since lookup is lockfree, the deletion
-			 * must be visible to another weakly ordered CPU before
-			 * the insertion at the start of the hash chain.
-			 */
-			rcu_assign_pointer(rth->dst.rt_next,
-					   rt_hash_table[hash].chain);
-			/*
-			 * Since lookup is lockfree, the update writes
-			 * must be ordered for consistency on SMP.
-			 */
-			rcu_assign_pointer(rt_hash_table[hash].chain, rth);
-
-			dst_use(&rth->dst, now);
-			spin_unlock_bh(rt_hash_lock_addr(hash));
-
-			rt_drop(rt);
-			if (skb)
-				skb_dst_set(skb, &rth->dst);
-			return rth;
-		}
-
-		if (!atomic_read(&rth->dst.__refcnt)) {
-			u32 score = rt_score(rth);
-
-			if (score <= min_score) {
-				cand = rth;
-				candp = rthp;
-				min_score = score;
-			}
-		}
-
-		chain_length++;
-
-		rthp = &rth->dst.rt_next;
-	}
-
-	if (cand) {
-		/* ip_rt_gc_elasticity used to be average length of chain
-		 * length, when exceeded gc becomes really aggressive.
-		 *
-		 * The second limit is less certain. At the moment it allows
-		 * only 2 entries per bucket. We will see.
-		 */
-		if (chain_length > ip_rt_gc_elasticity) {
-			*candp = cand->dst.rt_next;
-			rt_free(cand);
-		}
-	} else {
-		if (chain_length > rt_chain_length_max &&
-		    slow_chain_length(rt_hash_table[hash].chain) > rt_chain_length_max) {
-			struct net *net = dev_net(rt->dst.dev);
-			int num = ++net->ipv4.current_rt_cache_rebuild_count;
-			if (!rt_caching(net)) {
-				pr_warn("%s: %d rebuilds is over limit, route caching disabled\n",
-					rt->dst.dev->name, num);
-			}
-			rt_emergency_hash_rebuild(net);
-			spin_unlock_bh(rt_hash_lock_addr(hash));
-
-			hash = rt_hash(rt->rt_key_dst, rt->rt_key_src,
-					ifindex, rt_genid(net));
-			goto restart;
-		}
-	}
-
-	/* Try to bind route to arp only if it is output
-	   route or unicast forwarding path.
-	 */
 	if (rt->rt_type == RTN_UNICAST || rt_is_output_route(rt)) {
 		int err = rt_bind_neighbour(rt);
 		if (err) {
-			spin_unlock_bh(rt_hash_lock_addr(hash));
-
-			if (err != -ENOBUFS) {
-				rt_drop(rt);
-				return ERR_PTR(err);
-			}
-
-			/* Neighbour tables are full and nothing
-			   can be released. Try to shrink route cache,
-			   it is most likely it holds some neighbour records.
-			 */
-			if (attempts-- > 0) {
-				int saved_elasticity = ip_rt_gc_elasticity;
-				int saved_int = ip_rt_gc_min_interval;
-				ip_rt_gc_elasticity	= 1;
-				ip_rt_gc_min_interval	= 0;
-				rt_garbage_collect(&ipv4_dst_ops);
-				ip_rt_gc_min_interval	= saved_int;
-				ip_rt_gc_elasticity	= saved_elasticity;
-				goto restart;
-			}
-
-			net_warn_ratelimited("Neighbour table overflow\n");
-			rt_drop(rt);
-			return ERR_PTR(-ENOBUFS);
+			net_warn_ratelimited("Neighbour table failure\n");
+			ip_rt_put(rt);
+			return ERR_PTR(err);
 		}
 	}
 
-	rt->dst.rt_next = rt_hash_table[hash].chain;
-
-	/*
-	 * Since lookup is lockfree, we must make sure
-	 * previous writes to rt are committed to memory
-	 * before making rt visible to other CPUS.
-	 */
-	rcu_assign_pointer(rt_hash_table[hash].chain, rt);
-
-	spin_unlock_bh(rt_hash_lock_addr(hash));
-
-skip_hashing:
 	if (skb)
 		skb_dst_set(skb, &rt->dst);
 	return rt;
@@ -1370,26 +608,6 @@ void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
 }
 EXPORT_SYMBOL(__ip_select_ident);
 
-static void rt_del(unsigned int hash, struct rtable *rt)
-{
-	struct rtable __rcu **rthp;
-	struct rtable *aux;
-
-	rthp = &rt_hash_table[hash].chain;
-	spin_lock_bh(rt_hash_lock_addr(hash));
-	ip_rt_put(rt);
-	while ((aux = rcu_dereference_protected(*rthp,
-			lockdep_is_held(rt_hash_lock_addr(hash)))) != NULL) {
-		if (aux == rt || rt_is_expired(aux)) {
-			*rthp = aux->dst.rt_next;
-			rt_free(aux);
-			continue;
-		}
-		rthp = &aux->dst.rt_next;
-	}
-	spin_unlock_bh(rt_hash_lock_addr(hash));
-}
-
 static void check_peer_redir(struct dst_entry *dst, struct inet_peer *peer)
 {
 	struct rtable *rt = (struct rtable *) dst;
@@ -1417,16 +635,16 @@ static void check_peer_redir(struct dst_entry *dst, struct inet_peer *peer)
 }
 
 /* called in rcu_read_lock() section */
-void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
-		    __be32 saddr, struct net_device *dev)
+void ip_rt_redirect(struct sk_buff *skb, __be32 old_gw, __be32 new_gw)
 {
-	int s, i;
-	struct in_device *in_dev = __in_dev_get_rcu(dev);
-	__be32 skeys[2] = { saddr, 0 };
-	int    ikeys[2] = { dev->ifindex, 0 };
-	struct inet_peer *peer;
+	struct net_device *dev = skb->dev;
+	struct in_device *in_dev;
+	const struct iphdr *iph;
+	struct fib_result res;
+	struct flowi4 fl4;
 	struct net *net;
 
+	in_dev = __in_dev_get_rcu(dev);
 	if (!in_dev)
 		return;
 
@@ -1446,41 +664,32 @@ void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
 			goto reject_redirect;
 	}
 
-	for (s = 0; s < 2; s++) {
-		for (i = 0; i < 2; i++) {
-			unsigned int hash;
-			struct rtable __rcu **rthp;
-			struct rtable *rt;
-
-			hash = rt_hash(daddr, skeys[s], ikeys[i], rt_genid(net));
-
-			rthp = &rt_hash_table[hash].chain;
-
-			while ((rt = rcu_dereference(*rthp)) != NULL) {
-				rthp = &rt->dst.rt_next;
-
-				if (rt->rt_key_dst != daddr ||
-				    rt->rt_key_src != skeys[s] ||
-				    rt->rt_oif != ikeys[i] ||
-				    rt_is_input_route(rt) ||
-				    rt_is_expired(rt) ||
-				    !net_eq(dev_net(rt->dst.dev), net) ||
-				    rt->dst.error ||
-				    rt->dst.dev != dev ||
-				    rt->rt_gateway != old_gw)
-					continue;
-
-				peer = rt_get_peer_create(rt, rt->rt_dst);
-				if (peer) {
-					if (peer->redirect_learned.a4 != new_gw) {
-						peer->redirect_learned.a4 = new_gw;
-						atomic_inc(&__rt_peer_genid);
-					}
-					check_peer_redir(&rt->dst, peer);
-				}
+	iph = (const struct iphdr *) skb->data;
+
+	memset(&fl4, 0, sizeof(fl4));
+	fl4.daddr = iph->daddr;
+	fl4.saddr = iph->saddr;
+	fl4.flowi4_tos = RT_TOS(iph->tos);
+
+	rcu_read_lock();
+	if (fib_lookup(net, &fl4, &res) == 0) {
+		struct inet_peer_base *base;
+		struct inet_peer *peer;
+
+		base = (res.table ?
+			&res.table->tb_peers :
+			net->ipv4.peers);
+		peer = inet_getpeer_v4(base, iph->daddr, 1);
+		if (peer) {
+			if (peer->redirect_learned.a4 != new_gw) {
+				peer->redirect_learned.a4 = new_gw;
+				atomic_inc(&__rt_peer_genid);
 			}
+			inet_putpeer(peer);
 		}
 	}
+	rcu_read_unlock();
+
 	return;
 
 reject_redirect:
@@ -1521,10 +730,7 @@ static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
 			ip_rt_put(rt);
 			ret = NULL;
 		} else if (rt->rt_flags & RTCF_REDIRECTED) {
-			unsigned int hash = rt_hash(rt->rt_key_dst, rt->rt_key_src,
-						rt->rt_oif,
-						rt_genid(dev_net(dst->dev)));
-			rt_del(hash, rt);
+			ip_rt_put(rt);
 			ret = NULL;
 		} else if (rt_has_peer(rt)) {
 			struct inet_peer *peer = rt_peer_ptr(rt);
@@ -1967,7 +1173,7 @@ static struct rtable *rt_dst_alloc(struct net_device *dev,
 				   bool nopolicy, bool noxfrm)
 {
 	return dst_alloc(&ipv4_dst_ops, dev, 1, -1,
-			 DST_HOST |
+			 DST_HOST | DST_NOCACHE |
 			 (nopolicy ? DST_NOPOLICY : 0) |
 			 (noxfrm ? DST_NOXFRM : 0));
 }
@@ -1976,7 +1182,6 @@ static struct rtable *rt_dst_alloc(struct net_device *dev,
 static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
 				u8 tos, struct net_device *dev, int our)
 {
-	unsigned int hash;
 	struct rtable *rth;
 	struct in_device *in_dev = __in_dev_get_rcu(dev);
 	u32 itag = 0;
@@ -2041,8 +1246,7 @@ static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
 #endif
 	RT_CACHE_STAT_INC(in_slow_mc);
 
-	hash = rt_hash(daddr, saddr, dev->ifindex, rt_genid(dev_net(dev)));
-	rth = rt_intern_hash(hash, rth, skb, dev->ifindex);
+	rth = rt_finalize(rth, skb);
 	return IS_ERR(rth) ? PTR_ERR(rth) : 0;
 
 e_nobufs:
@@ -2176,7 +1380,6 @@ static int ip_mkroute_input(struct sk_buff *skb,
 {
 	struct rtable *rth = NULL;
 	int err;
-	unsigned int hash;
 
 #ifdef CONFIG_IP_ROUTE_MULTIPATH
 	if (res->fi && res->fi->fib_nhs > 1)
@@ -2188,10 +1391,7 @@ static int ip_mkroute_input(struct sk_buff *skb,
 	if (err)
 		return err;
 
-	/* put it into the cache */
-	hash = rt_hash(daddr, saddr, fl4->flowi4_iif,
-		       rt_genid(dev_net(rth->dst.dev)));
-	rth = rt_intern_hash(hash, rth, skb, fl4->flowi4_iif);
+	rth = rt_finalize(rth, skb);
 	if (IS_ERR(rth))
 		return PTR_ERR(rth);
 	return 0;
@@ -2217,7 +1417,6 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
 	unsigned int	flags = 0;
 	u32		itag = 0;
 	struct rtable	*rth;
-	unsigned int	hash;
 	int		err = -EINVAL;
 	struct net    *net = dev_net(dev);
 
@@ -2340,8 +1539,7 @@ local_input:
 		rth->dst.error= -err;
 		rth->rt_flags 	&= ~RTCF_LOCAL;
 	}
-	hash = rt_hash(daddr, saddr, fl4.flowi4_iif, rt_genid(net));
-	rth = rt_intern_hash(hash, rth, skb, fl4.flowi4_iif);
+	rth = rt_finalize(rth, skb);
 	err = 0;
 	if (IS_ERR(rth))
 		err = PTR_ERR(rth);
@@ -2383,47 +1581,10 @@ martian_source_keep_err:
 int ip_route_input_common(struct sk_buff *skb, __be32 daddr, __be32 saddr,
 			   u8 tos, struct net_device *dev, bool noref)
 {
-	struct rtable	*rth;
-	unsigned int	hash;
-	int iif = dev->ifindex;
-	struct net *net;
 	int res;
 
-	net = dev_net(dev);
-
 	rcu_read_lock();
 
-	if (!rt_caching(net))
-		goto skip_cache;
-
-	tos &= IPTOS_RT_MASK;
-	hash = rt_hash(daddr, saddr, iif, rt_genid(net));
-
-	for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
-	     rth = rcu_dereference(rth->dst.rt_next)) {
-		if ((((__force u32)rth->rt_key_dst ^ (__force u32)daddr) |
-		     ((__force u32)rth->rt_key_src ^ (__force u32)saddr) |
-		     (rth->rt_route_iif ^ iif) |
-		     (rth->rt_key_tos ^ tos)) == 0 &&
-		    rth->rt_mark == skb->mark &&
-		    net_eq(dev_net(rth->dst.dev), net) &&
-		    !rt_is_expired(rth)) {
-			ipv4_validate_peer(rth);
-			if (noref) {
-				dst_use_noref(&rth->dst, jiffies);
-				skb_dst_set_noref(skb, &rth->dst);
-			} else {
-				dst_use(&rth->dst, jiffies);
-				skb_dst_set(skb, &rth->dst);
-			}
-			RT_CACHE_STAT_INC(in_hit);
-			rcu_read_unlock();
-			return 0;
-		}
-		RT_CACHE_STAT_INC(in_hlist_search);
-	}
-
-skip_cache:
 	/* Multicast recognition logic is moved from route cache to here.
 	   The problem was that too many Ethernet cards have broken/missing
 	   hardware multicast filters :-( As result the host on multicasting
@@ -2568,10 +1729,9 @@ static struct rtable *__mkroute_output(const struct fib_result *res,
 
 /*
  * Major route resolver routine.
- * called with rcu_read_lock();
  */
 
-static struct rtable *ip_route_output_slow(struct net *net, struct flowi4 *fl4)
+struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *fl4)
 {
 	struct net_device *dev_out = NULL;
 	__u8 tos = RT_FL_TOS(fl4);
@@ -2753,58 +1913,13 @@ static struct rtable *ip_route_output_slow(struct net *net, struct flowi4 *fl4)
 make_route:
 	rth = __mkroute_output(&res, fl4, orig_daddr, orig_saddr, orig_oif,
 			       tos, dev_out, flags);
-	if (!IS_ERR(rth)) {
-		unsigned int hash;
-
-		hash = rt_hash(orig_daddr, orig_saddr, orig_oif,
-			       rt_genid(dev_net(dev_out)));
-		rth = rt_intern_hash(hash, rth, NULL, orig_oif);
-	}
+	if (!IS_ERR(rth))
+		rth = rt_finalize(rth, NULL);
 
 out:
 	rcu_read_unlock();
 	return rth;
 }
-
-struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *flp4)
-{
-	struct rtable *rth;
-	unsigned int hash;
-
-	if (!rt_caching(net))
-		goto slow_output;
-
-	hash = rt_hash(flp4->daddr, flp4->saddr, flp4->flowi4_oif, rt_genid(net));
-
-	rcu_read_lock_bh();
-	for (rth = rcu_dereference_bh(rt_hash_table[hash].chain); rth;
-		rth = rcu_dereference_bh(rth->dst.rt_next)) {
-		if (rth->rt_key_dst == flp4->daddr &&
-		    rth->rt_key_src == flp4->saddr &&
-		    rt_is_output_route(rth) &&
-		    rth->rt_oif == flp4->flowi4_oif &&
-		    rth->rt_mark == flp4->flowi4_mark &&
-		    !((rth->rt_key_tos ^ flp4->flowi4_tos) &
-			    (IPTOS_RT_MASK | RTO_ONLINK)) &&
-		    net_eq(dev_net(rth->dst.dev), net) &&
-		    !rt_is_expired(rth)) {
-			ipv4_validate_peer(rth);
-			dst_use(&rth->dst, jiffies);
-			RT_CACHE_STAT_INC(out_hit);
-			rcu_read_unlock_bh();
-			if (!flp4->saddr)
-				flp4->saddr = rth->rt_src;
-			if (!flp4->daddr)
-				flp4->daddr = rth->rt_dst;
-			return rth;
-		}
-		RT_CACHE_STAT_INC(out_hlist_search);
-	}
-	rcu_read_unlock_bh();
-
-slow_output:
-	return ip_route_output_slow(net, flp4);
-}
 EXPORT_SYMBOL_GPL(__ip_route_output_key);
 
 static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie)
@@ -3114,43 +2229,6 @@ errout_free:
 
 int ip_rt_dump(struct sk_buff *skb,  struct netlink_callback *cb)
 {
-	struct rtable *rt;
-	int h, s_h;
-	int idx, s_idx;
-	struct net *net;
-
-	net = sock_net(skb->sk);
-
-	s_h = cb->args[0];
-	if (s_h < 0)
-		s_h = 0;
-	s_idx = idx = cb->args[1];
-	for (h = s_h; h <= rt_hash_mask; h++, s_idx = 0) {
-		if (!rt_hash_table[h].chain)
-			continue;
-		rcu_read_lock_bh();
-		for (rt = rcu_dereference_bh(rt_hash_table[h].chain), idx = 0; rt;
-		     rt = rcu_dereference_bh(rt->dst.rt_next), idx++) {
-			if (!net_eq(dev_net(rt->dst.dev), net) || idx < s_idx)
-				continue;
-			if (rt_is_expired(rt))
-				continue;
-			skb_dst_set_noref(skb, &rt->dst);
-			if (rt_fill_info(net, skb, NETLINK_CB(cb->skb).pid,
-					 cb->nlh->nlmsg_seq, RTM_NEWROUTE,
-					 1, NLM_F_MULTI) <= 0) {
-				skb_dst_drop(skb);
-				rcu_read_unlock_bh();
-				goto done;
-			}
-			skb_dst_drop(skb);
-		}
-		rcu_read_unlock_bh();
-	}
-
-done:
-	cb->args[0] = h;
-	cb->args[1] = idx;
 	return skb->len;
 }
 
@@ -3384,22 +2462,6 @@ static __net_initdata struct pernet_operations ipv4_inetpeer_ops = {
 struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
 #endif /* CONFIG_IP_ROUTE_CLASSID */
 
-static __initdata unsigned long rhash_entries;
-static int __init set_rhash_entries(char *str)
-{
-	ssize_t ret;
-
-	if (!str)
-		return 0;
-
-	ret = kstrtoul(str, 0, &rhash_entries);
-	if (ret)
-		return 0;
-
-	return 1;
-}
-__setup("rhash_entries=", set_rhash_entries);
-
 int __init ip_rt_init(void)
 {
 	int rc = 0;
@@ -3422,31 +2484,12 @@ int __init ip_rt_init(void)
 	if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0)
 		panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n");
 
-	rt_hash_table = (struct rt_hash_bucket *)
-		alloc_large_system_hash("IP route cache",
-					sizeof(struct rt_hash_bucket),
-					rhash_entries,
-					(totalram_pages >= 128 * 1024) ?
-					15 : 17,
-					0,
-					&rt_hash_log,
-					&rt_hash_mask,
-					0,
-					rhash_entries ? 0 : 512 * 1024);
-	memset(rt_hash_table, 0, (rt_hash_mask + 1) * sizeof(struct rt_hash_bucket));
-	rt_hash_lock_init();
-
-	ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1);
-	ip_rt_max_size = (rt_hash_mask + 1) * 16;
+	ipv4_dst_ops.gc_thresh = ~0;
+	ip_rt_max_size = INT_MAX;
 
 	devinet_init();
 	ip_fib_init();
 
-	INIT_DELAYED_WORK_DEFERRABLE(&expires_work, rt_worker_func);
-	expires_ljiffies = jiffies;
-	schedule_delayed_work(&expires_work,
-		net_random() % ip_rt_gc_interval + ip_rt_gc_interval);
-
 	if (ip_rt_proc_init())
 		pr_err("Unable to create route proc files\n");
 #ifdef CONFIG_XFRM
-- 
1.7.10

^ permalink raw reply related

* [PATCH 0/5] rtcache remove respin
From: David Miller @ 2012-07-01 12:02 UTC (permalink / raw)
  To: netdev


It's been a while and there were of course a lot of merge hassles with
the most recent set I posted, so I respun these patches tonight
because I wanted to see the effects of the recent rpfilter hacks on an
rtcache-less system.

On a SPARC T3-1:

1) Output route lookup: ~2800 cycles
2) Input route lookups: ~3000 cycles (rpfilter=0)
                        ~4300 cycles (rpfilter=1)

Another nice part is how small struct rtable is after this patch set:

struct rtable {
	struct dst_entry        dst;
	int                     rt_genid;
	unsigned int            rt_flags;
	__u16                   rt_type;
	__be32                  rt_dst;
	int                     rt_route_iif;
	int                     rt_iif;
	int                     rt_oif;
	__be32                  rt_gateway;
	u32                     rt_peer_genid;
	unsigned long           _peer;
	struct fib_info         *fi;
};

which is about 208 bytes on sparc64.

Signed-off-by: David S. Miller <davem@davemloft.net>

^ permalink raw reply

* [net-next:master 294/295] drivers/net/ethernet/stmicro/stmmac/stmmac_main.c:287:3: error: implicit declaration of function 'phy_init_eee'
From: wfg @ 2012-07-01 11:35 UTC (permalink / raw)
  To: Giuseppe CAVALLARO; +Cc: David S. Miller, netdev

[-- Attachment #1: Type: text/plain, Size: 1958 bytes --]

Hi Giuseppe,

Here is a build dependency: the below commit used some functions
provided by the immediate next commit "phy: add the EEE support and
the way to access to the MMD registers."

---
Kernel build failed on

tree:   git://git.kernel.org/pub/scm/linux/kernel/git/davem/net-next.git master
head:   a59a4d1921664da63d801ba477950114c71c88c9
commit: d765955d2ae0b88781a0db3a5bacfe4241925e09 [294/295] stmmac: add the Energy Efficient Ethernet support
config: x86_64-allmodconfig

All related error/warning messages:

drivers/net/ethernet/stmicro/stmmac/stmmac_main.c: In function 'stmmac_eee_init':
drivers/net/ethernet/stmicro/stmmac/stmmac_main.c:287:3: error: implicit declaration of function 'phy_init_eee' [-Werror=implicit-function-declaration]
drivers/net/ethernet/stmicro/stmmac/stmmac_ethtool.c: In function 'stmmac_get_ethtool_stats':
drivers/net/ethernet/stmicro/stmmac/stmmac_ethtool.c:380:4: error: implicit declaration of function 'phy_get_eee_err' [-Werror=implicit-function-declaration]
drivers/net/ethernet/stmicro/stmmac/stmmac_ethtool.c: In function 'stmmac_ethtool_op_get_eee':
drivers/net/ethernet/stmicro/stmmac/stmmac_ethtool.c:494:2: error: implicit declaration of function 'phy_ethtool_get_eee' [-Werror=implicit-function-declaration]
drivers/net/ethernet/stmicro/stmmac/stmmac_ethtool.c: In function 'stmmac_ethtool_op_set_eee':
drivers/net/ethernet/stmicro/stmmac/stmmac_ethtool.c:519:2: error: implicit declaration of function 'phy_ethtool_set_eee' [-Werror=implicit-function-declaration]

drivers/net/ethernet/stmicro/stmmac/stmmac_main.c:287:
   284		/* MAC core supports the EEE feature. */
   285		if (priv->dma_cap.eee) {
   286			/* Check if the PHY supports EEE */
 > 287			if (phy_init_eee(priv->phydev, 1))
   288				goto out;
   289	
   290			priv->eee_active = 1;

---
0-DAY kernel build testing backend         Open Source Technology Centre
Fengguang Wu <wfg@linux.intel.com>                     Intel Corporation

[-- Attachment #2: 0001-stmmac-add-the-Energy-Efficient-Ethernet-support.patch --]
[-- Type: text/x-diff, Size: 36081 bytes --]

>From d765955d2ae0b88781a0db3a5bacfe4241925e09 Mon Sep 17 00:00:00 2001
From: Giuseppe CAVALLARO <peppe.cavallaro@st.com>
Date: Wed, 27 Jun 2012 21:14:37 +0000
Subject: [PATCH] stmmac: add the Energy Efficient Ethernet support

This patch adds the Energy Efficient Ethernet support to the stmmac.

Please see the driver's documentation for further details about this support
in the driver.

Thanks also goes to Rayagond Kokatanur for his first implementation.

Note:
 to clearly manage and expose the lpi interrupt status and eee ethtool
 stats I've had to do some modifications to the driver's design and I
 found really useful to move other parts of the code (e.g. mmc irq stat)
 in the main directly. So this means that some core has been reworked
 to introduce the EEE.

v1: initial patch
v2: fixed some sparse issues (typos)
v3: erroneously sent the v2 renamed as v3
v4:
	o Fixed the return value of the stmmac_eee_init as suggested by D.Miller
	o Totally reviewed the ethtool support for EEE
	o Added a new internal parameter to tune the SW timer for TX LPI.
v5: do not change any eee setting in case of the stmmac_ethtool_op_set_eee fails
    (it has to return -EOPNOTSUPP in that case).

Signed-off-by: Giuseppe Cavallaro <peppe.cavallaro@st.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/stmicro/stmmac/common.h       |   31 +++-
 drivers/net/ethernet/stmicro/stmmac/dwmac1000.h    |   20 +++
 .../net/ethernet/stmicro/stmmac/dwmac1000_core.c   |  101 +++++++++++-
 .../net/ethernet/stmicro/stmmac/dwmac100_core.c    |    4 +-
 drivers/net/ethernet/stmicro/stmmac/dwmac_dma.h    |    1 +
 drivers/net/ethernet/stmicro/stmmac/stmmac.h       |    8 +
 .../net/ethernet/stmicro/stmmac/stmmac_ethtool.c   |   57 +++++++
 drivers/net/ethernet/stmicro/stmmac/stmmac_main.c  |  166 +++++++++++++++++++-
 .../net/ethernet/stmicro/stmmac/stmmac_platform.c  |    2 +
 9 files changed, 372 insertions(+), 18 deletions(-)

diff --git a/drivers/net/ethernet/stmicro/stmmac/common.h b/drivers/net/ethernet/stmicro/stmmac/common.h
index bcd54d6..e2d0832 100644
--- a/drivers/net/ethernet/stmicro/stmmac/common.h
+++ b/drivers/net/ethernet/stmicro/stmmac/common.h
@@ -89,18 +89,28 @@ struct stmmac_extra_stats {
 	unsigned long tx_early_irq;
 	unsigned long fatal_bus_error_irq;
 	/* Extra info */
 	unsigned long threshold;
 	unsigned long tx_pkt_n;
 	unsigned long rx_pkt_n;
 	unsigned long poll_n;
 	unsigned long sched_timer_n;
 	unsigned long normal_irq_n;
+	unsigned long mmc_tx_irq_n;
+	unsigned long mmc_rx_irq_n;
+	unsigned long mmc_rx_csum_offload_irq_n;
+	/* EEE */
+	unsigned long irq_receive_pmt_irq_n;
+	unsigned long irq_tx_path_in_lpi_mode_n;
+	unsigned long irq_tx_path_exit_lpi_mode_n;
+	unsigned long irq_rx_path_in_lpi_mode_n;
+	unsigned long irq_rx_path_exit_lpi_mode_n;
+	unsigned long phy_eee_wakeup_error_n;
 };
 
 /* CSR Frequency Access Defines*/
 #define CSR_F_35M	35000000
 #define CSR_F_60M	60000000
 #define CSR_F_100M	100000000
 #define CSR_F_150M	150000000
 #define CSR_F_250M	250000000
 #define CSR_F_300M	300000000
@@ -156,18 +166,29 @@ enum rx_frame_status { /* IPC status */
 	llc_snap = 4,
 };
 
 enum tx_dma_irq_status {
 	tx_hard_error = 1,
 	tx_hard_error_bump_tc = 2,
 	handle_tx_rx = 3,
 };
 
+enum core_specific_irq_mask {
+	core_mmc_tx_irq = 1,
+	core_mmc_rx_irq = 2,
+	core_mmc_rx_csum_offload_irq = 4,
+	core_irq_receive_pmt_irq = 8,
+	core_irq_tx_path_in_lpi_mode = 16,
+	core_irq_tx_path_exit_lpi_mode = 32,
+	core_irq_rx_path_in_lpi_mode = 64,
+	core_irq_rx_path_exit_lpi_mode = 128,
+};
+
 /* DMA HW capabilities */
 struct dma_features {
 	unsigned int mbps_10_100;
 	unsigned int mbps_1000;
 	unsigned int half_duplex;
 	unsigned int hash_filter;
 	unsigned int multi_addr;
 	unsigned int pcs;
 	unsigned int sma_mdio;
@@ -202,18 +223,22 @@ struct dma_features {
 /* Power Down and WOL */
 #define PMT_NOT_SUPPORTED 0
 #define PMT_SUPPORTED 1
 
 /* Common MAC defines */
 #define MAC_CTRL_REG		0x00000000	/* MAC Control */
 #define MAC_ENABLE_TX		0x00000008	/* Transmitter Enable */
 #define MAC_RNABLE_RX		0x00000004	/* Receiver Enable */
 
+/* Default LPI timers */
+#define STMMAC_DEFAULT_LIT_LS_TIMER	0x3E8
+#define STMMAC_DEFAULT_TWT_LS_TIMER	0x0
+
 struct stmmac_desc_ops {
 	/* DMA RX descriptor ring initialization */
 	void (*init_rx_desc) (struct dma_desc *p, unsigned int ring_size,
 			      int disable_rx_ic);
 	/* DMA TX descriptor ring initialization */
 	void (*init_tx_desc) (struct dma_desc *p, unsigned int ring_size);
 
 	/* Invoked by the xmit function to prepare the tx descriptor */
 	void (*prepare_tx_desc) (struct dma_desc *p, int is_fs, int len,
@@ -272,31 +297,35 @@ struct stmmac_dma_ops {
 
 struct stmmac_ops {
 	/* MAC core initialization */
 	void (*core_init) (void __iomem *ioaddr) ____cacheline_aligned;
 	/* Enable and verify that the IPC module is supported */
 	int (*rx_ipc) (void __iomem *ioaddr);
 	/* Dump MAC registers */
 	void (*dump_regs) (void __iomem *ioaddr);
 	/* Handle extra events on specific interrupts hw dependent */
-	void (*host_irq_status) (void __iomem *ioaddr);
+	int (*host_irq_status) (void __iomem *ioaddr);
 	/* Multicast filter setting */
 	void (*set_filter) (struct net_device *dev, int id);
 	/* Flow control setting */
 	void (*flow_ctrl) (void __iomem *ioaddr, unsigned int duplex,
 			   unsigned int fc, unsigned int pause_time);
 	/* Set power management mode (e.g. magic frame) */
 	void (*pmt) (void __iomem *ioaddr, unsigned long mode);
 	/* Set/Get Unicast MAC addresses */
 	void (*set_umac_addr) (void __iomem *ioaddr, unsigned char *addr,
 			       unsigned int reg_n);
 	void (*get_umac_addr) (void __iomem *ioaddr, unsigned char *addr,
 			       unsigned int reg_n);
+	void (*set_eee_mode) (void __iomem *ioaddr);
+	void (*reset_eee_mode) (void __iomem *ioaddr);
+	void (*set_eee_timer) (void __iomem *ioaddr, int ls, int tw);
+	void (*set_eee_pls) (void __iomem *ioaddr, int link);
 };
 
 struct mac_link {
 	int port;
 	int duplex;
 	int speed;
 };
 
 struct mii_regs {
diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac1000.h b/drivers/net/ethernet/stmicro/stmmac/dwmac1000.h
index 23478bf..f90fcb5 100644
--- a/drivers/net/ethernet/stmicro/stmmac/dwmac1000.h
+++ b/drivers/net/ethernet/stmicro/stmmac/dwmac1000.h
@@ -30,18 +30,19 @@
 #define GMAC_MII_ADDR		0x00000010	/* MII Address */
 #define GMAC_MII_DATA		0x00000014	/* MII Data */
 #define GMAC_FLOW_CTRL		0x00000018	/* Flow Control */
 #define GMAC_VLAN_TAG		0x0000001c	/* VLAN Tag */
 #define GMAC_VERSION		0x00000020	/* GMAC CORE Version */
 #define GMAC_WAKEUP_FILTER	0x00000028	/* Wake-up Frame Filter */
 
 #define GMAC_INT_STATUS		0x00000038	/* interrupt status register */
 enum dwmac1000_irq_status {
+	lpiis_irq = 0x400,
 	time_stamp_irq = 0x0200,
 	mmc_rx_csum_offload_irq = 0x0080,
 	mmc_tx_irq = 0x0040,
 	mmc_rx_irq = 0x0020,
 	mmc_irq = 0x0010,
 	pmt_irq = 0x0008,
 	pcs_ane_irq = 0x0004,
 	pcs_link_irq = 0x0002,
 	rgmii_irq = 0x0001,
@@ -54,18 +55,37 @@ enum power_event {
 	pointer_reset = 0x80000000,
 	global_unicast = 0x00000200,
 	wake_up_rx_frame = 0x00000040,
 	magic_frame = 0x00000020,
 	wake_up_frame_en = 0x00000004,
 	magic_pkt_en = 0x00000002,
 	power_down = 0x00000001,
 };
 
+/* Energy Efficient Ethernet (EEE)
+ *
+ * LPI status, timer and control register offset
+ */
+#define LPI_CTRL_STATUS	0x0030
+#define LPI_TIMER_CTRL	0x0034
+
+/* LPI control and status defines */
+#define LPI_CTRL_STATUS_LPITXA	0x00080000	/* Enable LPI TX Automate */
+#define LPI_CTRL_STATUS_PLSEN	0x00040000	/* Enable PHY Link Status */
+#define LPI_CTRL_STATUS_PLS	0x00020000	/* PHY Link Status */
+#define LPI_CTRL_STATUS_LPIEN	0x00010000	/* LPI Enable */
+#define LPI_CTRL_STATUS_RLPIST	0x00000200	/* Receive LPI state */
+#define LPI_CTRL_STATUS_TLPIST	0x00000100	/* Transmit LPI state */
+#define LPI_CTRL_STATUS_RLPIEX	0x00000008	/* Receive LPI Exit */
+#define LPI_CTRL_STATUS_RLPIEN	0x00000004	/* Receive LPI Entry */
+#define LPI_CTRL_STATUS_TLPIEX	0x00000002	/* Transmit LPI Exit */
+#define LPI_CTRL_STATUS_TLPIEN	0x00000001	/* Transmit LPI Entry */
+
 /* GMAC HW ADDR regs */
 #define GMAC_ADDR_HIGH(reg)	(((reg > 15) ? 0x00000800 : 0x00000040) + \
 				(reg * 8))
 #define GMAC_ADDR_LOW(reg)	(((reg > 15) ? 0x00000804 : 0x00000044) + \
 				(reg * 8))
 #define GMAC_MAX_PERFECT_ADDRESSES	32
 
 #define GMAC_AN_CTRL	0x000000c0	/* AN control */
 #define GMAC_AN_STATUS	0x000000c4	/* AN status */
diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac1000_core.c b/drivers/net/ethernet/stmicro/stmmac/dwmac1000_core.c
index b5e4d02..bfe0226 100644
--- a/drivers/net/ethernet/stmicro/stmmac/dwmac1000_core.c
+++ b/drivers/net/ethernet/stmicro/stmmac/dwmac1000_core.c
@@ -188,50 +188,135 @@ static void dwmac1000_pmt(void __iomem *ioaddr, unsigned long mode)
 	if (mode & WAKE_UCAST) {
 		CHIP_DBG(KERN_DEBUG "GMAC: WOL on global unicast\n");
 		pmt |= global_unicast;
 	}
 
 	writel(pmt, ioaddr + GMAC_PMT);
 }
 
 
-static void dwmac1000_irq_status(void __iomem *ioaddr)
+static int dwmac1000_irq_status(void __iomem *ioaddr)
 {
 	u32 intr_status = readl(ioaddr + GMAC_INT_STATUS);
+	int status = 0;
 
 	/* Not used events (e.g. MMC interrupts) are not handled. */
-	if ((intr_status & mmc_tx_irq))
-		CHIP_DBG(KERN_DEBUG "GMAC: MMC tx interrupt: 0x%08x\n",
+	if ((intr_status & mmc_tx_irq)) {
+		CHIP_DBG(KERN_INFO "GMAC: MMC tx interrupt: 0x%08x\n",
 		    readl(ioaddr + GMAC_MMC_TX_INTR));
-	if (unlikely(intr_status & mmc_rx_irq))
-		CHIP_DBG(KERN_DEBUG "GMAC: MMC rx interrupt: 0x%08x\n",
+		status |= core_mmc_tx_irq;
+	}
+	if (unlikely(intr_status & mmc_rx_irq)) {
+		CHIP_DBG(KERN_INFO "GMAC: MMC rx interrupt: 0x%08x\n",
 		    readl(ioaddr + GMAC_MMC_RX_INTR));
-	if (unlikely(intr_status & mmc_rx_csum_offload_irq))
-		CHIP_DBG(KERN_DEBUG "GMAC: MMC rx csum offload: 0x%08x\n",
+		status |= core_mmc_rx_irq;
+	}
+	if (unlikely(intr_status & mmc_rx_csum_offload_irq)) {
+		CHIP_DBG(KERN_INFO "GMAC: MMC rx csum offload: 0x%08x\n",
 		    readl(ioaddr + GMAC_MMC_RX_CSUM_OFFLOAD));
+		status |= core_mmc_rx_csum_offload_irq;
+	}
 	if (unlikely(intr_status & pmt_irq)) {
-		CHIP_DBG(KERN_DEBUG "GMAC: received Magic frame\n");
+		CHIP_DBG(KERN_INFO "GMAC: received Magic frame\n");
 		/* clear the PMT bits 5 and 6 by reading the PMT
 		 * status register. */
 		readl(ioaddr + GMAC_PMT);
+		status |= core_irq_receive_pmt_irq;
 	}
+	/* MAC trx/rx EEE LPI entry/exit interrupts */
+	if (intr_status & lpiis_irq) {
+		/* Clean LPI interrupt by reading the Reg 12 */
+		u32 lpi_status = readl(ioaddr + LPI_CTRL_STATUS);
+
+		if (lpi_status & LPI_CTRL_STATUS_TLPIEN) {
+			CHIP_DBG(KERN_INFO "GMAC TX entered in LPI\n");
+			status |= core_irq_tx_path_in_lpi_mode;
+		}
+		if (lpi_status & LPI_CTRL_STATUS_TLPIEX) {
+			CHIP_DBG(KERN_INFO "GMAC TX exit from LPI\n");
+			status |= core_irq_tx_path_exit_lpi_mode;
+		}
+		if (lpi_status & LPI_CTRL_STATUS_RLPIEN) {
+			CHIP_DBG(KERN_INFO "GMAC RX entered in LPI\n");
+			status |= core_irq_rx_path_in_lpi_mode;
+		}
+		if (lpi_status & LPI_CTRL_STATUS_RLPIEX) {
+			CHIP_DBG(KERN_INFO "GMAC RX exit from LPI\n");
+			status |= core_irq_rx_path_exit_lpi_mode;
+		}
+	}
+
+	return status;
+}
+
+static void  dwmac1000_set_eee_mode(void __iomem *ioaddr)
+{
+	u32 value;
+
+	/* Enable the link status receive on RGMII, SGMII ore SMII
+	 * receive path and instruct the transmit to enter in LPI
+	 * state. */
+	value = readl(ioaddr + LPI_CTRL_STATUS);
+	value |= LPI_CTRL_STATUS_LPIEN | LPI_CTRL_STATUS_LPITXA;
+	writel(value, ioaddr + LPI_CTRL_STATUS);
+}
+
+static void  dwmac1000_reset_eee_mode(void __iomem *ioaddr)
+{
+	u32 value;
+
+	value = readl(ioaddr + LPI_CTRL_STATUS);
+	value &= ~(LPI_CTRL_STATUS_LPIEN | LPI_CTRL_STATUS_LPITXA);
+	writel(value, ioaddr + LPI_CTRL_STATUS);
+}
+
+static void  dwmac1000_set_eee_pls(void __iomem *ioaddr, int link)
+{
+	u32 value;
+
+	value = readl(ioaddr + LPI_CTRL_STATUS);
+
+	if (link)
+		value |= LPI_CTRL_STATUS_PLS;
+	else
+		value &= ~LPI_CTRL_STATUS_PLS;
+
+	writel(value, ioaddr + LPI_CTRL_STATUS);
+}
+
+static void  dwmac1000_set_eee_timer(void __iomem *ioaddr, int ls, int tw)
+{
+	int value = ((tw & 0xffff)) | ((ls & 0x7ff) << 16);
+
+	/* Program the timers in the LPI timer control register:
+	 * LS: minimum time (ms) for which the link
+	 *  status from PHY should be ok before transmitting
+	 *  the LPI pattern.
+	 * TW: minimum time (us) for which the core waits
+	 *  after it has stopped transmitting the LPI pattern.
+	 */
+	writel(value, ioaddr + LPI_TIMER_CTRL);
 }
 
 static const struct stmmac_ops dwmac1000_ops = {
 	.core_init = dwmac1000_core_init,
 	.rx_ipc = dwmac1000_rx_ipc_enable,
 	.dump_regs = dwmac1000_dump_regs,
 	.host_irq_status = dwmac1000_irq_status,
 	.set_filter = dwmac1000_set_filter,
 	.flow_ctrl = dwmac1000_flow_ctrl,
 	.pmt = dwmac1000_pmt,
 	.set_umac_addr = dwmac1000_set_umac_addr,
 	.get_umac_addr = dwmac1000_get_umac_addr,
+	.set_eee_mode =  dwmac1000_set_eee_mode,
+	.reset_eee_mode =  dwmac1000_reset_eee_mode,
+	.set_eee_timer =  dwmac1000_set_eee_timer,
+	.set_eee_pls =  dwmac1000_set_eee_pls,
 };
 
 struct mac_device_info *dwmac1000_setup(void __iomem *ioaddr)
 {
 	struct mac_device_info *mac;
 	u32 hwid = readl(ioaddr + GMAC_VERSION);
 
 	mac = kzalloc(sizeof(const struct mac_device_info), GFP_KERNEL);
 	if (!mac)
diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac100_core.c b/drivers/net/ethernet/stmicro/stmmac/dwmac100_core.c
index 19e0f4e..f83210e 100644
--- a/drivers/net/ethernet/stmicro/stmmac/dwmac100_core.c
+++ b/drivers/net/ethernet/stmicro/stmmac/dwmac100_core.c
@@ -66,21 +66,21 @@ static void dwmac100_dump_mac_regs(void __iomem *ioaddr)
 	pr_info("\tVLAN2 tag (offset 0x%x): 0x%08x\n", MAC_VLAN2,
 		readl(ioaddr + MAC_VLAN2));
 }
 
 static int dwmac100_rx_ipc_enable(void __iomem *ioaddr)
 {
 	return 0;
 }
 
-static void dwmac100_irq_status(void __iomem *ioaddr)
+static int dwmac100_irq_status(void __iomem *ioaddr)
 {
-	return;
+	return 0;
 }
 
 static void dwmac100_set_umac_addr(void __iomem *ioaddr, unsigned char *addr,
 				   unsigned int reg_n)
 {
 	stmmac_set_mac_addr(ioaddr, addr, MAC_ADDR_HIGH, MAC_ADDR_LOW);
 }
 
 static void dwmac100_get_umac_addr(void __iomem *ioaddr, unsigned char *addr,
diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac_dma.h b/drivers/net/ethernet/stmicro/stmmac/dwmac_dma.h
index 6e0360f..e678ce3 100644
--- a/drivers/net/ethernet/stmicro/stmmac/dwmac_dma.h
+++ b/drivers/net/ethernet/stmicro/stmmac/dwmac_dma.h
@@ -64,18 +64,19 @@
 #define DMA_INTR_ENA_TSE 0x00000002	/* Transmit Stopped */
 
 #define DMA_INTR_ABNORMAL	(DMA_INTR_ENA_AIE | DMA_INTR_ENA_FBE | \
 				DMA_INTR_ENA_UNE)
 
 /* DMA default interrupt mask */
 #define DMA_INTR_DEFAULT_MASK	(DMA_INTR_NORMAL | DMA_INTR_ABNORMAL)
 
 /* DMA Status register defines */
+#define DMA_STATUS_GLPII	0x40000000	/* GMAC LPI interrupt */
 #define DMA_STATUS_GPI		0x10000000	/* PMT interrupt */
 #define DMA_STATUS_GMI		0x08000000	/* MMC interrupt */
 #define DMA_STATUS_GLI		0x04000000	/* GMAC Line interface int */
 #define DMA_STATUS_GMI		0x08000000
 #define DMA_STATUS_GLI		0x04000000
 #define DMA_STATUS_EB_MASK	0x00380000	/* Error Bits Mask */
 #define DMA_STATUS_EB_TX_ABORT	0x00080000	/* Error Bits - TX Abort */
 #define DMA_STATUS_EB_RX_ABORT	0x00100000	/* Error Bits - RX Abort */
 #define DMA_STATUS_TS_MASK	0x00700000	/* Transmit Process State */
diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac.h b/drivers/net/ethernet/stmicro/stmmac/stmmac.h
index dc20c56..ab4c376 100644
--- a/drivers/net/ethernet/stmicro/stmmac/stmmac.h
+++ b/drivers/net/ethernet/stmicro/stmmac/stmmac.h
@@ -81,35 +81,43 @@ struct stmmac_priv {
 	struct plat_stmmacenet_data *plat;
 	struct stmmac_counters mmc;
 	struct dma_features dma_cap;
 	int hw_cap_support;
 #ifdef CONFIG_HAVE_CLK
 	struct clk *stmmac_clk;
 #endif
 	int clk_csr;
 	int synopsys_id;
+	struct timer_list eee_ctrl_timer;
+	bool tx_path_in_lpi_mode;
+	int lpi_irq;
+	int eee_enabled;
+	int eee_active;
+	int tx_lpi_timer;
 };
 
 extern int phyaddr;
 
 extern int stmmac_mdio_unregister(struct net_device *ndev);
 extern int stmmac_mdio_register(struct net_device *ndev);
 extern void stmmac_set_ethtool_ops(struct net_device *netdev);
 extern const struct stmmac_desc_ops enh_desc_ops;
 extern const struct stmmac_desc_ops ndesc_ops;
 int stmmac_freeze(struct net_device *ndev);
 int stmmac_restore(struct net_device *ndev);
 int stmmac_resume(struct net_device *ndev);
 int stmmac_suspend(struct net_device *ndev);
 int stmmac_dvr_remove(struct net_device *ndev);
 struct stmmac_priv *stmmac_dvr_probe(struct device *device,
 				     struct plat_stmmacenet_data *plat_dat,
 				     void __iomem *addr);
+void stmmac_disable_eee_mode(struct stmmac_priv *priv);
+bool stmmac_eee_init(struct stmmac_priv *priv);
 
 #ifdef CONFIG_HAVE_CLK
 static inline int stmmac_clk_enable(struct stmmac_priv *priv)
 {
 	if (!IS_ERR(priv->stmmac_clk))
 		return clk_prepare_enable(priv->stmmac_clk);
 
 	return 0;
 }
diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_ethtool.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_ethtool.c
index ce43184..76fd61a 100644
--- a/drivers/net/ethernet/stmicro/stmmac/stmmac_ethtool.c
+++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_ethtool.c
@@ -87,18 +87,28 @@ static const struct stmmac_stats stmmac_gstrings_stats[] = {
 	STMMAC_STAT(tx_early_irq),
 	STMMAC_STAT(fatal_bus_error_irq),
 	/* Extra info */
 	STMMAC_STAT(threshold),
 	STMMAC_STAT(tx_pkt_n),
 	STMMAC_STAT(rx_pkt_n),
 	STMMAC_STAT(poll_n),
 	STMMAC_STAT(sched_timer_n),
 	STMMAC_STAT(normal_irq_n),
+	STMMAC_STAT(normal_irq_n),
+	STMMAC_STAT(mmc_tx_irq_n),
+	STMMAC_STAT(mmc_rx_irq_n),
+	STMMAC_STAT(mmc_rx_csum_offload_irq_n),
+	STMMAC_STAT(irq_receive_pmt_irq_n),
+	STMMAC_STAT(irq_tx_path_in_lpi_mode_n),
+	STMMAC_STAT(irq_tx_path_exit_lpi_mode_n),
+	STMMAC_STAT(irq_rx_path_in_lpi_mode_n),
+	STMMAC_STAT(irq_rx_path_exit_lpi_mode_n),
+	STMMAC_STAT(phy_eee_wakeup_error_n),
 };
 #define STMMAC_STATS_LEN ARRAY_SIZE(stmmac_gstrings_stats)
 
 /* HW MAC Management counters (if supported) */
 #define STMMAC_MMC_STAT(m)	\
 	{ #m, FIELD_SIZEOF(struct stmmac_counters, m),	\
 	offsetof(struct stmmac_priv, mmc.m)}
 
 static const struct stmmac_stats stmmac_mmc[] = {
@@ -360,18 +370,23 @@ static void stmmac_get_ethtool_stats(struct net_device *dev,
 			for (i = 0; i < STMMAC_MMC_STATS_LEN; i++) {
 				char *p;
 				p = (char *)priv + stmmac_mmc[i].stat_offset;
 
 				data[j++] = (stmmac_mmc[i].sizeof_stat ==
 					     sizeof(u64)) ? (*(u64 *)p) :
 					     (*(u32 *)p);
 			}
 		}
+		if (priv->eee_enabled) {
+			int val = phy_get_eee_err(priv->phydev);
+			if (val)
+				priv->xstats.phy_eee_wakeup_error_n = val;
+		}
 	}
 	for (i = 0; i < STMMAC_STATS_LEN; i++) {
 		char *p = (char *)priv + stmmac_gstrings_stats[i].stat_offset;
 		data[j++] = (stmmac_gstrings_stats[i].sizeof_stat ==
 			     sizeof(u64)) ? (*(u64 *)p) : (*(u32 *)p);
 	}
 }
 
 static int stmmac_get_sset_count(struct net_device *netdev, int sset)
@@ -458,33 +473,75 @@ static int stmmac_set_wol(struct net_device *dev, struct ethtool_wolinfo *wol)
 	}
 
 	spin_lock_irq(&priv->lock);
 	priv->wolopts = wol->wolopts;
 	spin_unlock_irq(&priv->lock);
 
 	return 0;
 }
 
+static int stmmac_ethtool_op_get_eee(struct net_device *dev,
+				     struct ethtool_eee *edata)
+{
+	struct stmmac_priv *priv = netdev_priv(dev);
+
+	if (!priv->dma_cap.eee)
+		return -EOPNOTSUPP;
+
+	edata->eee_enabled = priv->eee_enabled;
+	edata->eee_active = priv->eee_active;
+	edata->tx_lpi_timer = priv->tx_lpi_timer;
+
+	return phy_ethtool_get_eee(priv->phydev, edata);
+}
+
+static int stmmac_ethtool_op_set_eee(struct net_device *dev,
+				     struct ethtool_eee *edata)
+{
+	struct stmmac_priv *priv = netdev_priv(dev);
+
+	priv->eee_enabled = edata->eee_enabled;
+
+	if (!priv->eee_enabled)
+		stmmac_disable_eee_mode(priv);
+	else {
+		/* We are asking for enabling the EEE but it is safe
+		 * to verify all by invoking the eee_init function.
+		 * In case of failure it will return an error.
+		 */
+		priv->eee_enabled = stmmac_eee_init(priv);
+		if (!priv->eee_enabled)
+			return -EOPNOTSUPP;
+
+		/* Do not change tx_lpi_timer in case of failure */
+		priv->tx_lpi_timer = edata->tx_lpi_timer;
+	}
+
+	return phy_ethtool_set_eee(priv->phydev, edata);
+}
+
 static const struct ethtool_ops stmmac_ethtool_ops = {
 	.begin = stmmac_check_if_running,
 	.get_drvinfo = stmmac_ethtool_getdrvinfo,
 	.get_settings = stmmac_ethtool_getsettings,
 	.set_settings = stmmac_ethtool_setsettings,
 	.get_msglevel = stmmac_ethtool_getmsglevel,
 	.set_msglevel = stmmac_ethtool_setmsglevel,
 	.get_regs = stmmac_ethtool_gregs,
 	.get_regs_len = stmmac_ethtool_get_regs_len,
 	.get_link = ethtool_op_get_link,
 	.get_pauseparam = stmmac_get_pauseparam,
 	.set_pauseparam = stmmac_set_pauseparam,
 	.get_ethtool_stats = stmmac_get_ethtool_stats,
 	.get_strings = stmmac_get_strings,
 	.get_wol = stmmac_get_wol,
 	.set_wol = stmmac_set_wol,
+	.get_eee = stmmac_ethtool_op_get_eee,
+	.set_eee = stmmac_ethtool_op_set_eee,
 	.get_sset_count	= stmmac_get_sset_count,
 	.get_ts_info = ethtool_op_get_ts_info,
 };
 
 void stmmac_set_ethtool_ops(struct net_device *netdev)
 {
 	SET_ETHTOOL_OPS(netdev, &stmmac_ethtool_ops);
 }
diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
index eba49cb..ea3bc09 100644
--- a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
+++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
@@ -127,18 +127,24 @@ MODULE_PARM_DESC(tmrate, "External timer freq. (default: 256Hz)");
 #define DMA_BUFFER_SIZE	BUF_SIZE_2KiB
 static int buf_sz = DMA_BUFFER_SIZE;
 module_param(buf_sz, int, S_IRUGO | S_IWUSR);
 MODULE_PARM_DESC(buf_sz, "DMA buffer size");
 
 static const u32 default_msg_level = (NETIF_MSG_DRV | NETIF_MSG_PROBE |
 				      NETIF_MSG_LINK | NETIF_MSG_IFUP |
 				      NETIF_MSG_IFDOWN | NETIF_MSG_TIMER);
 
+#define STMMAC_DEFAULT_LPI_TIMER	1000
+static int eee_timer = STMMAC_DEFAULT_LPI_TIMER;
+module_param(eee_timer, int, S_IRUGO | S_IWUSR);
+MODULE_PARM_DESC(eee_timer, "LPI tx expiration time in msec");
+#define STMMAC_LPI_TIMER(x) (jiffies + msecs_to_jiffies(x))
+
 static irqreturn_t stmmac_interrupt(int irq, void *dev_id);
 
 #ifdef CONFIG_STMMAC_DEBUG_FS
 static int stmmac_init_fs(struct net_device *dev);
 static void stmmac_exit_fs(void);
 #endif
 
 /**
  * stmmac_verify_args - verify the driver parameters.
@@ -155,18 +161,20 @@ static void stmmac_verify_args(void)
 		dma_txsize = DMA_TX_SIZE;
 	if (unlikely((buf_sz < DMA_BUFFER_SIZE) || (buf_sz > BUF_SIZE_16KiB)))
 		buf_sz = DMA_BUFFER_SIZE;
 	if (unlikely(flow_ctrl > 1))
 		flow_ctrl = FLOW_AUTO;
 	else if (likely(flow_ctrl < 0))
 		flow_ctrl = FLOW_OFF;
 	if (unlikely((pause < 0) || (pause > 0xffff)))
 		pause = PAUSE_TIME;
+	if (eee_timer < 0)
+		eee_timer = STMMAC_DEFAULT_LPI_TIMER;
 }
 
 static void stmmac_clk_csr_set(struct stmmac_priv *priv)
 {
 #ifdef CONFIG_HAVE_CLK
 	u32 clk_rate;
 
 	if (IS_ERR(priv->stmmac_clk))
 		return;
@@ -223,18 +231,97 @@ static inline u32 stmmac_tx_avail(struct stmmac_priv *priv)
 static inline void stmmac_hw_fix_mac_speed(struct stmmac_priv *priv)
 {
 	struct phy_device *phydev = priv->phydev;
 
 	if (likely(priv->plat->fix_mac_speed))
 		priv->plat->fix_mac_speed(priv->plat->bsp_priv,
 					  phydev->speed);
 }
 
+static void stmmac_enable_eee_mode(struct stmmac_priv *priv)
+{
+	/* Check and enter in LPI mode */
+	if ((priv->dirty_tx == priv->cur_tx) &&
+	    (priv->tx_path_in_lpi_mode == false))
+		priv->hw->mac->set_eee_mode(priv->ioaddr);
+}
+
+void stmmac_disable_eee_mode(struct stmmac_priv *priv)
+{
+	/* Exit and disable EEE in case of we are are in LPI state. */
+	priv->hw->mac->reset_eee_mode(priv->ioaddr);
+	del_timer_sync(&priv->eee_ctrl_timer);
+	priv->tx_path_in_lpi_mode = false;
+}
+
+/**
+ * stmmac_eee_ctrl_timer
+ * @arg : data hook
+ * Description:
+ *  If there is no data transfer and if we are not in LPI state,
+ *  then MAC Transmitter can be moved to LPI state.
+ */
+static void stmmac_eee_ctrl_timer(unsigned long arg)
+{
+	struct stmmac_priv *priv = (struct stmmac_priv *)arg;
+
+	stmmac_enable_eee_mode(priv);
+	mod_timer(&priv->eee_ctrl_timer, STMMAC_LPI_TIMER(eee_timer));
+}
+
+/**
+ * stmmac_eee_init
+ * @priv: private device pointer
+ * Description:
+ *  If the EEE support has been enabled while configuring the driver,
+ *  if the GMAC actually supports the EEE (from the HW cap reg) and the
+ *  phy can also manage EEE, so enable the LPI state and start the timer
+ *  to verify if the tx path can enter in LPI state.
+ */
+bool stmmac_eee_init(struct stmmac_priv *priv)
+{
+	bool ret = false;
+
+	/* MAC core supports the EEE feature. */
+	if (priv->dma_cap.eee) {
+		/* Check if the PHY supports EEE */
+		if (phy_init_eee(priv->phydev, 1))
+			goto out;
+
+		priv->eee_active = 1;
+		init_timer(&priv->eee_ctrl_timer);
+		priv->eee_ctrl_timer.function = stmmac_eee_ctrl_timer;
+		priv->eee_ctrl_timer.data = (unsigned long)priv;
+		priv->eee_ctrl_timer.expires = STMMAC_LPI_TIMER(eee_timer);
+		add_timer(&priv->eee_ctrl_timer);
+
+		priv->hw->mac->set_eee_timer(priv->ioaddr,
+					     STMMAC_DEFAULT_LIT_LS_TIMER,
+					     priv->tx_lpi_timer);
+
+		pr_info("stmmac: Energy-Efficient Ethernet initialized\n");
+
+		ret = true;
+	}
+out:
+	return ret;
+}
+
+static void stmmac_eee_adjust(struct stmmac_priv *priv)
+{
+	/* When the EEE has been already initialised we have to
+	 * modify the PLS bit in the LPI ctrl & status reg according
+	 * to the PHY link status. For this reason.
+	 */
+	if (priv->eee_enabled)
+		priv->hw->mac->set_eee_pls(priv->ioaddr, priv->phydev->link);
+}
+
 /**
  * stmmac_adjust_link
  * @dev: net device structure
  * Description: it adjusts the link parameters.
  */
 static void stmmac_adjust_link(struct net_device *dev)
 {
 	struct stmmac_priv *priv = netdev_priv(dev);
 	struct phy_device *phydev = priv->phydev;
@@ -243,18 +330,19 @@ static void stmmac_adjust_link(struct net_device *dev)
 	unsigned int fc = priv->flow_ctrl, pause_time = priv->pause;
 
 	if (phydev == NULL)
 		return;
 
 	DBG(probe, DEBUG, "stmmac_adjust_link: called.  address %d link %d\n",
 	    phydev->addr, phydev->link);
 
 	spin_lock_irqsave(&priv->lock, flags);
+
 	if (phydev->link) {
 		u32 ctrl = readl(priv->ioaddr + MAC_CTRL_REG);
 
 		/* Now we make sure that we can be in full duplex mode.
 		 * If not, we operate in half-duplex mode. */
 		if (phydev->duplex != priv->oldduplex) {
 			new_state = 1;
 			if (!(phydev->duplex))
 				ctrl &= ~priv->hw->link.duplex;
@@ -309,54 +397,57 @@ static void stmmac_adjust_link(struct net_device *dev)
 		new_state = 1;
 		priv->oldlink = 0;
 		priv->speed = 0;
 		priv->oldduplex = -1;
 	}
 
 	if (new_state && netif_msg_link(priv))
 		phy_print_status(phydev);
 
+	stmmac_eee_adjust(priv);
+
 	spin_unlock_irqrestore(&priv->lock, flags);
 
 	DBG(probe, DEBUG, "stmmac_adjust_link: exiting\n");
 }
 
 /**
  * stmmac_init_phy - PHY initialization
  * @dev: net device structure
  * Description: it initializes the driver's PHY state, and attaches the PHY
  * to the mac driver.
  *  Return value:
  *  0 on success
  */
 static int stmmac_init_phy(struct net_device *dev)
 {
 	struct stmmac_priv *priv = netdev_priv(dev);
 	struct phy_device *phydev;
-	char phy_id[MII_BUS_ID_SIZE + 3];
+	char phy_id_fmt[MII_BUS_ID_SIZE + 3];
 	char bus_id[MII_BUS_ID_SIZE];
 	int interface = priv->plat->interface;
 	priv->oldlink = 0;
 	priv->speed = 0;
 	priv->oldduplex = -1;
 
 	if (priv->plat->phy_bus_name)
 		snprintf(bus_id, MII_BUS_ID_SIZE, "%s-%x",
 				priv->plat->phy_bus_name, priv->plat->bus_id);
 	else
 		snprintf(bus_id, MII_BUS_ID_SIZE, "stmmac-%x",
 				priv->plat->bus_id);
 
-	snprintf(phy_id, MII_BUS_ID_SIZE + 3, PHY_ID_FMT, bus_id,
+	snprintf(phy_id_fmt, MII_BUS_ID_SIZE + 3, PHY_ID_FMT, bus_id,
 		 priv->plat->phy_addr);
-	pr_debug("stmmac_init_phy:  trying to attach to %s\n", phy_id);
+	pr_debug("stmmac_init_phy:  trying to attach to %s\n", phy_id_fmt);
 
-	phydev = phy_connect(dev, phy_id, &stmmac_adjust_link, 0, interface);
+	phydev = phy_connect(dev, phy_id_fmt, &stmmac_adjust_link, 0,
+			     interface);
 
 	if (IS_ERR(phydev)) {
 		pr_err("%s: Could not attach to PHY\n", dev->name);
 		return PTR_ERR(phydev);
 	}
 
 	/* Stop Advertising 1000BASE Capability if interface is not GMII */
 	if ((interface == PHY_INTERFACE_MODE_MII) ||
 	    (interface == PHY_INTERFACE_MODE_RMII))
@@ -683,18 +774,23 @@ static void stmmac_tx(struct stmmac_priv *priv)
 		     stmmac_tx_avail(priv) > STMMAC_TX_THRESH(priv))) {
 		netif_tx_lock(priv->dev);
 		if (netif_queue_stopped(priv->dev) &&
 		     stmmac_tx_avail(priv) > STMMAC_TX_THRESH(priv)) {
 			TX_DBG("%s: restart transmit\n", __func__);
 			netif_wake_queue(priv->dev);
 		}
 		netif_tx_unlock(priv->dev);
 	}
+
+	if ((priv->eee_enabled) && (!priv->tx_path_in_lpi_mode)) {
+		stmmac_enable_eee_mode(priv);
+		mod_timer(&priv->eee_ctrl_timer, STMMAC_LPI_TIMER(eee_timer));
+	}
 	spin_unlock(&priv->tx_lock);
 }
 
 static inline void stmmac_enable_irq(struct stmmac_priv *priv)
 {
 #ifdef CONFIG_STMMAC_TIMER
 	if (likely(priv->tm->enable))
 		priv->tm->timer_start(tmrate);
 	else
@@ -1021,18 +1117,29 @@ static int stmmac_open(struct net_device *dev)
 		ret = request_irq(priv->wol_irq, stmmac_interrupt,
 				  IRQF_SHARED, dev->name, dev);
 		if (unlikely(ret < 0)) {
 			pr_err("%s: ERROR: allocating the ext WoL IRQ %d "
 			       "(error: %d)\n",	__func__, priv->wol_irq, ret);
 			goto open_error_wolirq;
 		}
 	}
 
+	/* Request the IRQ lines */
+	if (priv->lpi_irq != -ENXIO) {
+		ret = request_irq(priv->lpi_irq, stmmac_interrupt, IRQF_SHARED,
+				  dev->name, dev);
+		if (unlikely(ret < 0)) {
+			pr_err("%s: ERROR: allocating the LPI IRQ %d (%d)\n",
+			       __func__, priv->lpi_irq, ret);
+			goto open_error_lpiirq;
+		}
+	}
+
 	/* Enable the MAC Rx/Tx */
 	stmmac_set_mac(priv->ioaddr, true);
 
 	/* Set the HW DMA mode and the COE */
 	stmmac_dma_operation_mode(priv);
 
 	/* Extra statistics */
 	memset(&priv->xstats, 0, sizeof(struct stmmac_extra_stats));
 	priv->xstats.threshold = tc;
@@ -1056,24 +1163,31 @@ static int stmmac_open(struct net_device *dev)
 	/* Dump DMA/MAC registers */
 	if (netif_msg_hw(priv)) {
 		priv->hw->mac->dump_regs(priv->ioaddr);
 		priv->hw->dma->dump_regs(priv->ioaddr);
 	}
 
 	if (priv->phydev)
 		phy_start(priv->phydev);
 
+	priv->tx_lpi_timer = STMMAC_DEFAULT_TWT_LS_TIMER;
+	priv->eee_enabled = stmmac_eee_init(priv);
+
 	napi_enable(&priv->napi);
 	skb_queue_head_init(&priv->rx_recycle);
 	netif_start_queue(dev);
 
 	return 0;
 
+open_error_lpiirq:
+	if (priv->wol_irq != dev->irq)
+		free_irq(priv->wol_irq, dev);
+
 open_error_wolirq:
 	free_irq(dev->irq, dev);
 
 open_error:
 #ifdef CONFIG_STMMAC_TIMER
 	kfree(priv->tm);
 #endif
 	if (priv->phydev)
 		phy_disconnect(priv->phydev);
@@ -1087,18 +1201,21 @@ open_error:
  *  stmmac_release - close entry point of the driver
  *  @dev : device pointer.
  *  Description:
  *  This is the stop entry point of the driver.
  */
 static int stmmac_release(struct net_device *dev)
 {
 	struct stmmac_priv *priv = netdev_priv(dev);
 
+	if (priv->eee_enabled)
+		del_timer_sync(&priv->eee_ctrl_timer);
+
 	/* Stop and disconnect the PHY */
 	if (priv->phydev) {
 		phy_stop(priv->phydev);
 		phy_disconnect(priv->phydev);
 		priv->phydev = NULL;
 	}
 
 	netif_stop_queue(dev);
 
@@ -1109,18 +1226,20 @@ static int stmmac_release(struct net_device *dev)
 		kfree(priv->tm);
 #endif
 	napi_disable(&priv->napi);
 	skb_queue_purge(&priv->rx_recycle);
 
 	/* Free the IRQ lines */
 	free_irq(dev->irq, dev);
 	if (priv->wol_irq != dev->irq)
 		free_irq(priv->wol_irq, dev);
+	if (priv->lpi_irq != -ENXIO)
+		free_irq(priv->lpi_irq, dev);
 
 	/* Stop TX/RX DMA and clear the descriptors */
 	priv->hw->dma->stop_tx(priv->ioaddr);
 	priv->hw->dma->stop_rx(priv->ioaddr);
 
 	/* Release and free the Rx/Tx resources */
 	free_dma_desc_resources(priv);
 
 	/* Disable the MAC Rx/Tx */
@@ -1158,18 +1277,21 @@ static netdev_tx_t stmmac_xmit(struct sk_buff *skb, struct net_device *dev)
 			/* This is a hard error, log it. */
 			pr_err("%s: BUG! Tx Ring full when queue awake\n",
 				__func__);
 		}
 		return NETDEV_TX_BUSY;
 	}
 
 	spin_lock(&priv->tx_lock);
 
+	if (priv->tx_path_in_lpi_mode)
+		stmmac_disable_eee_mode(priv);
+
 	entry = priv->cur_tx % txsize;
 
 #ifdef STMMAC_XMIT_DEBUG
 	if ((skb->len > ETH_FRAME_LEN) || nfrags)
 		pr_info("stmmac xmit:\n"
 		       "\tskb addr %p - len: %d - nopaged_len: %d\n"
 		       "\tn_frags: %d - ip_summed: %d - %s gso\n",
 		       skb, skb->len, nopaged_len, nfrags, skb->ip_summed,
 		       !skb_is_gso(skb) ? "isn't" : "is");
@@ -1534,22 +1656,49 @@ static irqreturn_t stmmac_interrupt(int irq, void *dev_id)
 {
 	struct net_device *dev = (struct net_device *)dev_id;
 	struct stmmac_priv *priv = netdev_priv(dev);
 
 	if (unlikely(!dev)) {
 		pr_err("%s: invalid dev pointer\n", __func__);
 		return IRQ_NONE;
 	}
 
-	if (priv->plat->has_gmac)
-		/* To handle GMAC own interrupts */
-		priv->hw->mac->host_irq_status((void __iomem *) dev->base_addr);
+	/* To handle GMAC own interrupts */
+	if (priv->plat->has_gmac) {
+		int status = priv->hw->mac->host_irq_status((void __iomem *)
+							    dev->base_addr);
+		if (unlikely(status)) {
+			if (status & core_mmc_tx_irq)
+				priv->xstats.mmc_tx_irq_n++;
+			if (status & core_mmc_rx_irq)
+				priv->xstats.mmc_rx_irq_n++;
+			if (status & core_mmc_rx_csum_offload_irq)
+				priv->xstats.mmc_rx_csum_offload_irq_n++;
+			if (status & core_irq_receive_pmt_irq)
+				priv->xstats.irq_receive_pmt_irq_n++;
+
+			/* For LPI we need to save the tx status */
+			if (status & core_irq_tx_path_in_lpi_mode) {
+				priv->xstats.irq_tx_path_in_lpi_mode_n++;
+				priv->tx_path_in_lpi_mode = true;
+			}
+			if (status & core_irq_tx_path_exit_lpi_mode) {
+				priv->xstats.irq_tx_path_exit_lpi_mode_n++;
+				priv->tx_path_in_lpi_mode = false;
+			}
+			if (status & core_irq_rx_path_in_lpi_mode)
+				priv->xstats.irq_rx_path_in_lpi_mode_n++;
+			if (status & core_irq_rx_path_exit_lpi_mode)
+				priv->xstats.irq_rx_path_exit_lpi_mode_n++;
+		}
+	}
 
+	/* To handle DMA interrupts */
 	stmmac_dma_interrupt(priv);
 
 	return IRQ_HANDLED;
 }
 
 #ifdef CONFIG_NET_POLL_CONTROLLER
 /* Polling receive - used by NETCONSOLE and other diagnostic tools
  * to allow network I/O with interrupts disabled. */
 static void stmmac_poll_controller(struct net_device *dev)
@@ -2149,18 +2298,21 @@ static int __init stmmac_cmdline_opt(char *str)
 		} else if (!strncmp(opt, "watchdog:", 9)) {
 			if (kstrtoint(opt + 9, 0, &watchdog))
 				goto err;
 		} else if (!strncmp(opt, "flow_ctrl:", 10)) {
 			if (kstrtoint(opt + 10, 0, &flow_ctrl))
 				goto err;
 		} else if (!strncmp(opt, "pause:", 6)) {
 			if (kstrtoint(opt + 6, 0, &pause))
 				goto err;
+		} else if (!strncmp(opt, "eee_timer:", 6)) {
+			if (kstrtoint(opt + 10, 0, &eee_timer))
+				goto err;
 #ifdef CONFIG_STMMAC_TIMER
 		} else if (!strncmp(opt, "tmrate:", 7)) {
 			if (kstrtoint(opt + 7, 0, &tmrate))
 				goto err;
 #endif
 		}
 	}
 	return 0;
 
diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c
index 20eb502..7d36163 100644
--- a/drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c
+++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c
@@ -150,18 +150,20 @@ static int stmmac_pltfr_probe(struct platform_device *pdev)
 	 * named as "eth_wake_irq"
 	 *
 	 * In case the wake up interrupt is not passed from the platform
 	 * so the driver will continue to use the mac irq (ndev->irq)
 	 */
 	priv->wol_irq = platform_get_irq_byname(pdev, "eth_wake_irq");
 	if (priv->wol_irq == -ENXIO)
 		priv->wol_irq = priv->dev->irq;
 
+	priv->lpi_irq = platform_get_irq_byname(pdev, "eth_lpi");
+
 	platform_set_drvdata(pdev, priv->dev);
 
 	pr_debug("STMMAC platform driver registration completed");
 
 	return 0;
 
 out_unmap:
 	iounmap(addr);
 	platform_set_drvdata(pdev, NULL);
-- 
1.7.10


^ permalink raw reply related

* Re: New commands to configure IOV features
From: Yuval Mintz @ 2012-07-01 11:09 UTC (permalink / raw)
  To: linux-pci; +Cc: Greg Rose, Ben Hutchings, netdev@vger.kernel.org
In-Reply-To: <20120626101903.0000791c@unknown>


>>>> I've tried to figure out if there was a standard interface
>>>> (ethtool/iproute) through which a user could configure the number
>>>> of vfs in his system.
>>>>
>>>> I've seen the RFC suggested in
>>>> http://markmail.org/thread/qblfcv7zbxsxp7q6, and
>>>> http://markmail.org/thread/fw54dcppmxuxoe6n, but failed to see any
>>>> later references to it (commits or further discussion on this
>>>> topic).
>>>>
>>>> How exactly are things standing with these RFCs? Were they
>>>> abandoned?
>>>
>>> The only way to configure the number of VFs continues to be through
>>> the max_vfs module parameter.  I've got a patch to do it through
>>> ethtool sitting on the back burner but due to other requirements of
>>> my day job I've not been able to work on it since last fall.
>>>
>>> - Greg
>>
>>
>> Hi Ben,
>>
>> If I want to pick the RFCs and add support for configuring the number
>> of VFs - do you think ethtool's the right place for such added
>> support?
>>
>> I'm asking since as far as I can see, ethtool (today) doesn't contain
>> any features related to virtual functions.
> 
> I think a PCI utility tool would be better, SR-IOV is not limited to
> network devices.  That's one of the reasons I dropped the RFC.  I
> haven't gotten back to the idea since then due to my day job keeping me
> pretty busy.
> 
> - Greg


I'm adding the linux-pci guys, since they (obviously) are the experts in
these topics.

Is there any PCI utility (or at least an existing kernel API) which is
suitable for adding this sort of functionality?

Thanks,
Yuval

^ permalink raw reply

* Re: [net-next.git 0/4] EEE for PAL and stmmac (V7)
From: David Miller @ 2012-07-01 10:37 UTC (permalink / raw)
  To: peppe.cavallaro; +Cc: netdev, eric.dumazet, rayagond, yuvalmin, bhutchings
In-Reply-To: <1340867678-18375-1-git-send-email-peppe.cavallaro@st.com>

From: Giuseppe CAVALLARO <peppe.cavallaro@st.com>
Date: Thu, 28 Jun 2012 09:14:34 +0200

> Giuseppe Cavallaro (4):
>   stmmac: do not use strict_strtoul but kstrtoint
>   stmmac: update the driver Documentation and add EEE
>   stmmac: add the Energy Efficient Ethernet support
>   phy: add the EEE support and the way to access to the MMD registers.

Series applied, thanks.

^ permalink raw reply

* Re: [PATCH v5] bonding support for IPv6 transmit hashing
From: David Miller @ 2012-07-01 10:33 UTC (permalink / raw)
  To: linux; +Cc: netdev
In-Reply-To: <c390ff662ede0f304398a88ca1a96e3773065c60.1341129744.git.linux@8192.net>

From: John Eaglesham <linux@8192.net>
Date: Sun,  1 Jul 2012 01:07:43 -0700

> +		v6hash =
> +			(ipv6h->saddr.s6_addr32[1] ^ ipv6h->daddr.s6_addr32[1]) ^
> +			(ipv6h->saddr.s6_addr32[2] ^ ipv6h->daddr.s6_addr32[2]) ^
> +			(ipv6h->saddr.s6_addr32[3] ^ ipv6h->daddr.s6_addr32[3]);

You completely ignored my feedback:

	This is rediculous, just put &ipv6h->saddr into a local
	pointer named 's' and then you won't have use such gymnastics
	to indent the code.

^ permalink raw reply

* Re: [PATCH net-next 03/10] net/mlx4_en: Re-design multicast attachments flow
From: David Miller @ 2012-07-01 10:32 UTC (permalink / raw)
  To: ogerlitz; +Cc: roland, yevgenyp, oren, netdev, yevgenyp, sbohrer, hadarh
In-Reply-To: <1341135823-29039-4-git-send-email-ogerlitz@mellanox.com>

From: Or Gerlitz <ogerlitz@mellanox.com>
Date: Sun,  1 Jul 2012 12:43:36 +0300

> +	/* Find all the entries that should be removed from dst,
> +	 * These are the entries that are not found in src */

Improperly formatted comment.

> +			new_mc = kmalloc(sizeof(struct mlx4_en_mc_list),
> +					     GFP_KERNEL);

Improperly indented second line.

^ permalink raw reply

* Re: [PATCH net-next 06/10] {NET,IB}/mlx4: Add device managed flow steering firmware API
From: David Miller @ 2012-07-01 10:30 UTC (permalink / raw)
  To: ogerlitz; +Cc: roland, yevgenyp, oren, netdev, hadarh
In-Reply-To: <1341135823-29039-7-git-send-email-ogerlitz@mellanox.com>

From: Or Gerlitz <ogerlitz@mellanox.com>
Date: Sun,  1 Jul 2012 12:43:39 +0300

> +		/* Enable Ethernet flow steering
> +		 * with udp unicast and tcp unicast */

Improperly formatted comment, do it like this:

		/* Enable Ethernet flow steering
		 * with udp unicast and tcp unicast
		 */

> +		/* Enable IPoIB flow steering
> +		 * with udp unicast and tcp unicast */

Likewise.

> @@ -136,6 +138,11 @@ module_param_array(port_type_array, int, &arr_argc, 0444);
>  MODULE_PARM_DESC(port_type_array, "Array of port types: HW_DEFAULT (0) is default "
>  				"1 for IB, 2 for Ethernet");
>  
> +static int mlx4_flow_steering_hash;
> +module_param_named(flow_steering_hash, mlx4_flow_steering_hash, int, 0444);
> +MODULE_PARM_DESC(flow_steering_hash,
> +		 "Flow steering hash function configuration. Config options: L2 = 0, L2_L3_L4 = 1 (default: L2).");
> +

No module paramters, do it via ethtool or similar.

> +		if (dev->caps.flags & MLX4_DEV_CAP_FLAG_VEP_UC_STEER &&
> +		    dev->caps.flags & MLX4_DEV_CAP_FLAG_VEP_MC_STEER) {
> +			dev->caps.steering_mode = MLX4_STEERING_MODE_B0;
> +
> +		} else {

Get rid of that pointless empty line

> +
> +			dev->caps.steering_mode = MLX4_STEERING_MODE_A0;

Likewise.
> +			/* Enable flow steering with
> +			   udp unicast and tcp unicast*/

Comment formatting.

> +static void trans_rule_ctrl_to_hw(struct mlx4_net_trans_rule *ctrl,
> +				struct mlx4_net_trans_rule_hw_ctrl *hw)

Second line improperly indented.

> +		memcpy(rule_hw->eth.dst_mac_msk, spec->eth.dst_mac_msk,
> +								ETH_ALEN);

Don't make us barf, other people have to read this stuff.  This looks
terrible, indent it properly.

> +		memcpy(rule_hw->eth.src_mac_msk, spec->eth.src_mac_msk,
> +								ETH_ALEN);

Likewise.

> +		mlx4_err(dev, "Fail to detach network rule. registration id = 0x%llx\n"
> +								, reg_id);

Please format this properly.

^ permalink raw reply

* Re: [PATCH net-next 09/10] net/mlx4_en: Manage flow steering rules with ethtool
From: David Miller @ 2012-07-01 10:23 UTC (permalink / raw)
  To: ogerlitz; +Cc: roland, yevgenyp, oren, netdev, hadarh
In-Reply-To: <1341135823-29039-10-git-send-email-ogerlitz@mellanox.com>

From: Or Gerlitz <ogerlitz@mellanox.com>
Date: Sun,  1 Jul 2012 12:43:42 +0300

> +		en_err(priv, "Fail to detach network rule at location %d. registration id = 0x%llx\n"
> +		       , cmd->fs.location, rule->id);

Put that first comma at the end of the first line, this is very ugly.

^ permalink raw reply

* Re: [PATCH net-next 07/10] net/mlx4_core: Add resource tracking for device managed flow steering rules
From: David Miller @ 2012-07-01 10:22 UTC (permalink / raw)
  To: ogerlitz; +Cc: roland, yevgenyp, oren, netdev, hadarh
In-Reply-To: <1341135823-29039-8-git-send-email-ogerlitz@mellanox.com>

From: Or Gerlitz <ogerlitz@mellanox.com>
Date: Sun,  1 Jul 2012 12:43:40 +0300

> +					err = mlx4_cmd(dev, base, 0, 0,
> +						MLX4_QP_FLOW_STEERING_DETACH,
> +						MLX4_CMD_TIME_CLASS_A,
> +						MLX4_CMD_NATIVE);

This is improperly formatted, line up the arguments with the first column
after the openning parenthesis on the first line.

^ permalink raw reply

* Re: [PATCH net-next 04/10] net/mlx4: Set steering mode according to device capabilities
From: David Miller @ 2012-07-01 10:20 UTC (permalink / raw)
  To: ogerlitz; +Cc: roland, yevgenyp, oren, netdev, hadarh
In-Reply-To: <1341135823-29039-5-git-send-email-ogerlitz@mellanox.com>

From: Or Gerlitz <ogerlitz@mellanox.com>
Date: Sun,  1 Jul 2012 12:43:37 +0300

> +				/* Add the default qp number as multicast
> +				* promisc */

This is not the correct way to format a comment, do it:

	/* Like
	 * this.
	 */

> +	if (dev->caps.flags & MLX4_DEV_CAP_FLAG_VEP_UC_STEER &&
> +	    dev->caps.flags & MLX4_DEV_CAP_FLAG_VEP_MC_STEER) {
> +		dev->caps.steering_mode = MLX4_STEERING_MODE_B0;
> +
> +	} else {

That empty line is extraneous and ugly, remove it.

> +/* Driver supports 2 diffrent device methods to manage traffic steering:
> +	- B0 steering mode - Common low level API for ib and (if supported) eth.
> +	- A0 steering mode - Limited low level API for eth. In case of IB,
> +			     B0 mode is in use.
> + */

Improperly formatted, do it like this:

/* Driver supports 2 diffrent device methods to manage traffic steering:
 *	- B0 steering mode - Common low level API for ib and (if supported) eth.
 *	- A0 steering mode - Limited low level API for eth. In case of IB,
 *			     B0 mode is in use.
 */

^ permalink raw reply

* Re: [PATCH net-next 01/10] net/mlx4_core: Change resource tracking mechanism to use red-black tree
From: David Miller @ 2012-07-01 10:17 UTC (permalink / raw)
  To: ogerlitz; +Cc: roland, yevgenyp, oren, netdev, hadarh
In-Reply-To: <1341135823-29039-2-git-send-email-ogerlitz@mellanox.com>

From: Or Gerlitz <ogerlitz@mellanox.com>
Date: Sun,  1 Jul 2012 12:43:34 +0300

> @@ -733,7 +776,7 @@ static int qp_res_start_move_to(struct mlx4_dev *dev, int slave, int qpn,
>  	int err = 0;
>  
>  	spin_lock_irq(mlx4_tlock(dev));
> -	r = radix_tree_lookup(&tracker->res_tree[RES_QP], qpn);
> +	r = (struct res_qp *)res_tracker_lookup(&tracker->res_tree[RES_QP], qpn);
>  	if (!r)
>  		err = -ENOENT;
>  	else if (r->com.owner != slave)

Casts are terrible, return "void *" from res_tracker_lookup() just as
radix_tree_lookup() does.

Also you have indentation problems all over this patch.  When you have
a multi-line function call the first non-whitespace character on the
second and subsequent lines should line up with the first column after
the openning parenthesis on the first line.

^ permalink raw reply

* [PATCH net-next 05/10] net/mlx4_core: Add firmware commands to support device managed flow steering
From: Or Gerlitz @ 2012-07-01  9:43 UTC (permalink / raw)
  To: davem; +Cc: roland, yevgenyp, oren, netdev, Hadar Hen Zion, Or Gerlitz
In-Reply-To: <1341135823-29039-1-git-send-email-ogerlitz@mellanox.com>

From: Hadar Hen Zion <hadarh@mellanox.co.il>

Add support for firmware commands to attach/detach a new device managed
steering mode. Such network steering rules allow the user to provide an
L2/L3/L4 flow specification to the firmware and have the device to steer
traffic that matches that specification to the provided QP.

Signed-off-by: Hadar Hen Zion <hadarh@mellanox.co.il>
Signed-off-by: Or Gerlitz <ogerlitz@mellanox.com>
---
 drivers/net/ethernet/mellanox/mlx4/cmd.c           |   19 +++++++++++++
 drivers/net/ethernet/mellanox/mlx4/mcg.c           |   29 ++++++++++++++++++++
 drivers/net/ethernet/mellanox/mlx4/mlx4.h          |   10 +++++++
 .../net/ethernet/mellanox/mlx4/resource_tracker.c  |   24 ++++++++++++++++
 include/linux/mlx4/cmd.h                           |    4 +++
 5 files changed, 86 insertions(+), 0 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx4/cmd.c b/drivers/net/ethernet/mellanox/mlx4/cmd.c
index 842c8ce..7e94987 100644
--- a/drivers/net/ethernet/mellanox/mlx4/cmd.c
+++ b/drivers/net/ethernet/mellanox/mlx4/cmd.c
@@ -1080,6 +1080,25 @@ static struct mlx4_cmd_info cmd_info[] = {
 		.verify = NULL,
 		.wrapper = NULL
 	},
+	/* flow steering commands */
+	{
+		.opcode = MLX4_QP_FLOW_STEERING_ATTACH,
+		.has_inbox = true,
+		.has_outbox = false,
+		.out_is_imm = true,
+		.encode_slave_id = false,
+		.verify = NULL,
+		.wrapper = mlx4_QP_FLOW_STEERING_ATTACH_wrapper
+	},
+	{
+		.opcode = MLX4_QP_FLOW_STEERING_DETACH,
+		.has_inbox = false,
+		.has_outbox = false,
+		.out_is_imm = false,
+		.encode_slave_id = false,
+		.verify = NULL,
+		.wrapper = mlx4_QP_FLOW_STEERING_DETACH_wrapper
+	},
 };
 
 static int mlx4_master_process_vhcr(struct mlx4_dev *dev, int slave,
diff --git a/drivers/net/ethernet/mellanox/mlx4/mcg.c b/drivers/net/ethernet/mellanox/mlx4/mcg.c
index 319c9d4..3c59a33 100644
--- a/drivers/net/ethernet/mellanox/mlx4/mcg.c
+++ b/drivers/net/ethernet/mellanox/mlx4/mcg.c
@@ -62,6 +62,35 @@ int mlx4_get_qp_per_mgm(struct mlx4_dev *dev)
 	return 4 * (mlx4_get_mgm_entry_size(dev) / 16 - 2);
 }
 
+static int mlx4_QP_FLOW_STEERING_ATTACH(struct mlx4_dev *dev,
+					struct mlx4_cmd_mailbox *mailbox,
+					u32 size,
+					u64 *reg_id)
+{
+	u64 imm;
+	int err = 0;
+
+	err = mlx4_cmd_imm(dev, mailbox->dma, &imm, size, 0,
+			   MLX4_QP_FLOW_STEERING_ATTACH, MLX4_CMD_TIME_CLASS_A,
+			   MLX4_CMD_NATIVE);
+	if (err)
+		return err;
+	*reg_id = imm;
+
+	return err;
+}
+
+static int mlx4_QP_FLOW_STEERING_DETACH(struct mlx4_dev *dev, u64 regid)
+{
+	int err = 0;
+
+	err = mlx4_cmd(dev, regid, 0, 0,
+		       MLX4_QP_FLOW_STEERING_DETACH, MLX4_CMD_TIME_CLASS_A,
+		       MLX4_CMD_NATIVE);
+
+	return err;
+}
+
 static int mlx4_READ_ENTRY(struct mlx4_dev *dev, int index,
 			   struct mlx4_cmd_mailbox *mailbox)
 {
diff --git a/drivers/net/ethernet/mellanox/mlx4/mlx4.h b/drivers/net/ethernet/mellanox/mlx4/mlx4.h
index a425a98..c07e882 100644
--- a/drivers/net/ethernet/mellanox/mlx4/mlx4.h
+++ b/drivers/net/ethernet/mellanox/mlx4/mlx4.h
@@ -1118,6 +1118,16 @@ int mlx4_QUERY_IF_STAT_wrapper(struct mlx4_dev *dev, int slave,
 			       struct mlx4_cmd_mailbox *inbox,
 			       struct mlx4_cmd_mailbox *outbox,
 			       struct mlx4_cmd_info *cmd);
+int mlx4_QP_FLOW_STEERING_ATTACH_wrapper(struct mlx4_dev *dev, int slave,
+					 struct mlx4_vhcr *vhcr,
+					 struct mlx4_cmd_mailbox *inbox,
+					 struct mlx4_cmd_mailbox *outbox,
+					 struct mlx4_cmd_info *cmd);
+int mlx4_QP_FLOW_STEERING_DETACH_wrapper(struct mlx4_dev *dev, int slave,
+					 struct mlx4_vhcr *vhcr,
+					 struct mlx4_cmd_mailbox *inbox,
+					 struct mlx4_cmd_mailbox *outbox,
+					 struct mlx4_cmd_info *cmd);
 
 int mlx4_get_mgm_entry_size(struct mlx4_dev *dev);
 int mlx4_get_qp_per_mgm(struct mlx4_dev *dev);
diff --git a/drivers/net/ethernet/mellanox/mlx4/resource_tracker.c b/drivers/net/ethernet/mellanox/mlx4/resource_tracker.c
index b8e8969..4301aa5 100644
--- a/drivers/net/ethernet/mellanox/mlx4/resource_tracker.c
+++ b/drivers/net/ethernet/mellanox/mlx4/resource_tracker.c
@@ -2740,6 +2740,30 @@ ex_put:
 	return err;
 }
 
+int mlx4_QP_FLOW_STEERING_ATTACH_wrapper(struct mlx4_dev *dev, int slave,
+					 struct mlx4_vhcr *vhcr,
+					 struct mlx4_cmd_mailbox *inbox,
+					 struct mlx4_cmd_mailbox *outbox,
+					 struct mlx4_cmd_info *cmd)
+{
+	return mlx4_cmd_imm(dev, inbox->dma, &vhcr->out_param,
+			    vhcr->in_modifier, 0,
+			    MLX4_QP_FLOW_STEERING_ATTACH,
+			    MLX4_CMD_TIME_CLASS_A,
+			    MLX4_CMD_NATIVE);
+}
+
+int mlx4_QP_FLOW_STEERING_DETACH_wrapper(struct mlx4_dev *dev, int slave,
+					 struct mlx4_vhcr *vhcr,
+					 struct mlx4_cmd_mailbox *inbox,
+					 struct mlx4_cmd_mailbox *outbox,
+					 struct mlx4_cmd_info *cmd)
+{
+	return mlx4_cmd(dev, vhcr->in_param, 0, 0,
+			MLX4_QP_FLOW_STEERING_DETACH, MLX4_CMD_TIME_CLASS_A,
+			MLX4_CMD_NATIVE);
+}
+
 enum {
 	BUSY_MAX_RETRIES = 10
 };
diff --git a/include/linux/mlx4/cmd.h b/include/linux/mlx4/cmd.h
index 1f3860a..2606951 100644
--- a/include/linux/mlx4/cmd.h
+++ b/include/linux/mlx4/cmd.h
@@ -154,6 +154,10 @@ enum {
 	/* set port opcode modifiers */
 	MLX4_SET_PORT_PRIO2TC = 0x8,
 	MLX4_SET_PORT_SCHEDULER  = 0x9,
+
+	/* register/delete flow steering network rules */
+	MLX4_QP_FLOW_STEERING_ATTACH = 0x65,
+	MLX4_QP_FLOW_STEERING_DETACH = 0x66,
 };
 
 enum {
-- 
1.7.1

^ permalink raw reply related

* [PATCH net-next 03/10] net/mlx4_en: Re-design multicast attachments flow
From: Or Gerlitz @ 2012-07-01  9:43 UTC (permalink / raw)
  To: davem
  Cc: roland, yevgenyp, oren, netdev, Yevgeny Petrilin, Shawn Bohrer,
	Hadar Hen Zion
In-Reply-To: <1341135823-29039-1-git-send-email-ogerlitz@mellanox.com>

From: Yevgeny Petrilin <yevgenyp@mellanox.co.il>

Currently, for every change in the net device multicast list, the driver
detaches all the addresses from the HW device, and then attaches the
updated list. This behavior is wrong from two aspects: first, it causes
a load of firmware commands and second, there is period of time where
the correct addresses are not attached, which turned into packet loss.

To improve - a copy of the multicast list is saved by the driver. For
every change in the multicast list, the multicast list copy is used
to find the delta between those two lists and add or remove multicast
addresses as needed. 

Reported-by: Shawn Bohrer <sbohrer@rgmadvisors.com>
Cc: Shawn Bohrer <sbohrer@rgmadvisors.com>
Signed-off-by: Hadar Hen Zion <hadarh@mellanox.co.il>
Signed-off-by: Yevgeny Petrilin <yevgenyp@mellanox.co.il>
---
 drivers/net/ethernet/mellanox/mlx4/en_netdev.c |  141 ++++++++++++++++++------
 drivers/net/ethernet/mellanox/mlx4/mlx4_en.h   |   16 +++-
 2 files changed, 122 insertions(+), 35 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx4/en_netdev.c b/drivers/net/ethernet/mellanox/mlx4/en_netdev.c
index 073b85b..35c64f2 100644
--- a/drivers/net/ethernet/mellanox/mlx4/en_netdev.c
+++ b/drivers/net/ethernet/mellanox/mlx4/en_netdev.c
@@ -170,33 +170,79 @@ static void mlx4_en_do_set_mac(struct work_struct *work)
 static void mlx4_en_clear_list(struct net_device *dev)
 {
 	struct mlx4_en_priv *priv = netdev_priv(dev);
+	struct mlx4_en_mc_list *tmp, *mc_to_del;
 
-	kfree(priv->mc_addrs);
-	priv->mc_addrs = NULL;
-	priv->mc_addrs_cnt = 0;
+	list_for_each_entry_safe(mc_to_del, tmp, &priv->mc_list, list) {
+		list_del(&mc_to_del->list);
+		kfree(mc_to_del);
+	}
 }
 
 static void mlx4_en_cache_mclist(struct net_device *dev)
 {
 	struct mlx4_en_priv *priv = netdev_priv(dev);
 	struct netdev_hw_addr *ha;
-	char *mc_addrs;
-	int mc_addrs_cnt = netdev_mc_count(dev);
-	int i;
+	struct mlx4_en_mc_list *tmp;
 
-	mc_addrs = kmalloc(mc_addrs_cnt * ETH_ALEN, GFP_ATOMIC);
-	if (!mc_addrs) {
-		en_err(priv, "failed to allocate multicast list\n");
-		return;
-	}
-	i = 0;
-	netdev_for_each_mc_addr(ha, dev)
-		memcpy(mc_addrs + i++ * ETH_ALEN, ha->addr, ETH_ALEN);
 	mlx4_en_clear_list(dev);
-	priv->mc_addrs = mc_addrs;
-	priv->mc_addrs_cnt = mc_addrs_cnt;
+	netdev_for_each_mc_addr(ha, dev) {
+		tmp = kzalloc(sizeof(struct mlx4_en_mc_list), GFP_ATOMIC);
+		if (!tmp) {
+			en_err(priv, "failed to allocate multicast list\n");
+			mlx4_en_clear_list(dev);
+			return;
+		}
+		memcpy(tmp->addr, ha->addr, ETH_ALEN);
+		list_add_tail(&tmp->list, &priv->mc_list);
+	}
 }
 
+static void update_mclist_flags(struct mlx4_en_priv *priv,
+				struct list_head *dst,
+				struct list_head *src)
+{
+	struct mlx4_en_mc_list *dst_tmp, *src_tmp, *new_mc;
+	bool found;
+
+	/* Find all the entries that should be removed from dst,
+	 * These are the entries that are not found in src */
+	list_for_each_entry(dst_tmp, dst, list) {
+		found = false;
+		list_for_each_entry(src_tmp, src, list) {
+			if (!memcmp(dst_tmp->addr, src_tmp->addr, ETH_ALEN)) {
+				found = true;
+				break;
+			}
+		}
+		if (!found)
+			dst_tmp->action = MCLIST_REM;
+	}
+
+	/* Add entries that exist in src but not in dst
+	 * mark them as need to add */
+	list_for_each_entry(src_tmp, src, list) {
+		found = false;
+		list_for_each_entry(dst_tmp, dst, list) {
+			if (!memcmp(dst_tmp->addr, src_tmp->addr, ETH_ALEN)) {
+				dst_tmp->action = MCLIST_NONE;
+				found = true;
+				break;
+			}
+		}
+		if (!found) {
+			new_mc = kmalloc(sizeof(struct mlx4_en_mc_list),
+					     GFP_KERNEL);
+			if (!new_mc) {
+				en_err(priv, "Failed to allocate current multicast list\n");
+				return;
+			}
+			memcpy(new_mc, src_tmp,
+			       sizeof(struct mlx4_en_mc_list));
+			new_mc->action = MCLIST_ADD;
+			list_add_tail(&new_mc->list, dst);
+		}
+	}
+}
 
 static void mlx4_en_set_multicast(struct net_device *dev)
 {
@@ -214,6 +260,7 @@ static void mlx4_en_do_set_multicast(struct work_struct *work)
 						 mcast_task);
 	struct mlx4_en_dev *mdev = priv->mdev;
 	struct net_device *dev = priv->dev;
+	struct mlx4_en_mc_list *mclist, *tmp;
 	u64 mcast_addr = 0;
 	u8 mc_list[16] = {0};
 	int err;
@@ -336,7 +383,6 @@ static void mlx4_en_do_set_multicast(struct work_struct *work)
 			priv->flags |= MLX4_EN_FLAG_MC_PROMISC;
 		}
 	} else {
-		int i;
 		/* Disable Multicast promisc */
 		if (priv->flags & MLX4_EN_FLAG_MC_PROMISC) {
 			err = mlx4_multicast_promisc_remove(mdev->dev, priv->base_qpn,
@@ -351,13 +397,6 @@ static void mlx4_en_do_set_multicast(struct work_struct *work)
 		if (err)
 			en_err(priv, "Failed disabling multicast filter\n");
 
-		/* Detach our qp from all the multicast addresses */
-		for (i = 0; i < priv->mc_addrs_cnt; i++) {
-			memcpy(&mc_list[10], priv->mc_addrs + i * ETH_ALEN, ETH_ALEN);
-			mc_list[5] = priv->port;
-			mlx4_multicast_detach(mdev->dev, &priv->rss_map.indir_qp,
-					      mc_list, MLX4_PROT_ETH);
-		}
 		/* Flush mcast filter and init it with broadcast address */
 		mlx4_SET_MCAST_FLTR(mdev->dev, priv->port, ETH_BCAST,
 				    1, MLX4_MCAST_CONFIG);
@@ -367,13 +406,8 @@ static void mlx4_en_do_set_multicast(struct work_struct *work)
 		netif_tx_lock_bh(dev);
 		mlx4_en_cache_mclist(dev);
 		netif_tx_unlock_bh(dev);
-		for (i = 0; i < priv->mc_addrs_cnt; i++) {
-			mcast_addr =
-			      mlx4_en_mac_to_u64(priv->mc_addrs + i * ETH_ALEN);
-			memcpy(&mc_list[10], priv->mc_addrs + i * ETH_ALEN, ETH_ALEN);
-			mc_list[5] = priv->port;
-			mlx4_multicast_attach(mdev->dev, &priv->rss_map.indir_qp,
-					      mc_list, 0, MLX4_PROT_ETH);
+		list_for_each_entry(mclist, &priv->mc_list, list) {
+			mcast_addr = mlx4_en_mac_to_u64(mclist->addr);
 			mlx4_SET_MCAST_FLTR(mdev->dev, priv->port,
 					    mcast_addr, 0, MLX4_MCAST_CONFIG);
 		}
@@ -381,6 +415,38 @@ static void mlx4_en_do_set_multicast(struct work_struct *work)
 					  0, MLX4_MCAST_ENABLE);
 		if (err)
 			en_err(priv, "Failed enabling multicast filter\n");
+
+		update_mclist_flags(priv, &priv->curr_list, &priv->mc_list);
+		list_for_each_entry_safe(mclist, tmp, &priv->curr_list, list) {
+			if (mclist->action == MCLIST_REM) {
+				/* detach this address and delete from list */
+				memcpy(&mc_list[10], mclist->addr, ETH_ALEN);
+				mc_list[5] = priv->port;
+				err = mlx4_multicast_detach(mdev->dev,
+							    &priv->rss_map.indir_qp,
+							    mc_list,
+							    MLX4_PROT_ETH);
+				if (err)
+					en_err(priv, "Fail to detach multicast address\n");
+
+				/* remove from list */
+				list_del(&mclist->list);
+				kfree(mclist);
+			}
+
+			if (mclist->action == MCLIST_ADD) {
+				/* attach the address */
+				memcpy(&mc_list[10], mclist->addr, ETH_ALEN);
+				mc_list[5] = priv->port;
+				err = mlx4_multicast_attach(mdev->dev,
+							    &priv->rss_map.indir_qp,
+							    mc_list, 0,
+							    MLX4_PROT_ETH);
+				if (err)
+					en_err(priv, "Fail to attach multicast address\n");
+
+			}
+		}
 	}
 out:
 	mutex_unlock(&mdev->state_lock);
@@ -605,6 +671,9 @@ int mlx4_en_start_port(struct net_device *dev)
 		return 0;
 	}
 
+	INIT_LIST_HEAD(&priv->mc_list);
+	INIT_LIST_HEAD(&priv->curr_list);
+
 	/* Calculate Rx buf size */
 	dev->mtu = min(dev->mtu, priv->max_mtu);
 	mlx4_en_calc_rx_buf(dev);
@@ -760,6 +829,7 @@ void mlx4_en_stop_port(struct net_device *dev)
 {
 	struct mlx4_en_priv *priv = netdev_priv(dev);
 	struct mlx4_en_dev *mdev = priv->mdev;
+	struct mlx4_en_mc_list *mclist, *tmp;
 	int i;
 	u8 mc_list[16] = {0};
 
@@ -781,13 +851,18 @@ void mlx4_en_stop_port(struct net_device *dev)
 	mc_list[5] = priv->port;
 	mlx4_multicast_detach(mdev->dev, &priv->rss_map.indir_qp, mc_list,
 			      MLX4_PROT_ETH);
-	for (i = 0; i < priv->mc_addrs_cnt; i++) {
-		memcpy(&mc_list[10], priv->mc_addrs + i * ETH_ALEN, ETH_ALEN);
+	list_for_each_entry(mclist, &priv->curr_list, list) {
+		memcpy(&mc_list[10], mclist->addr, ETH_ALEN);
 		mc_list[5] = priv->port;
 		mlx4_multicast_detach(mdev->dev, &priv->rss_map.indir_qp,
 				      mc_list, MLX4_PROT_ETH);
 	}
 	mlx4_en_clear_list(dev);
+	list_for_each_entry_safe(mclist, tmp, &priv->curr_list, list) {
+		list_del(&mclist->list);
+		kfree(mclist);
+	}
+
 	/* Flush multicast filter */
 	mlx4_SET_MCAST_FLTR(mdev->dev, priv->port, 0, 1, MLX4_MCAST_CONFIG);
 
diff --git a/drivers/net/ethernet/mellanox/mlx4/mlx4_en.h b/drivers/net/ethernet/mellanox/mlx4/mlx4_en.h
index 225c20d..1bb00cd 100644
--- a/drivers/net/ethernet/mellanox/mlx4/mlx4_en.h
+++ b/drivers/net/ethernet/mellanox/mlx4/mlx4_en.h
@@ -404,6 +404,18 @@ struct mlx4_en_perf_stats {
 #define NUM_PERF_COUNTERS		6
 };
 
+enum mlx4_en_mclist_act {
+	MCLIST_NONE,
+	MCLIST_REM,
+	MCLIST_ADD,
+};
+
+struct mlx4_en_mc_list {
+	struct list_head	list;
+	enum mlx4_en_mclist_act	action;
+	u8			addr[ETH_ALEN];
+};
+
 struct mlx4_en_frag_info {
 	u16 frag_size;
 	u16 frag_prefix_size;
@@ -489,8 +501,8 @@ struct mlx4_en_priv {
 	struct mlx4_en_pkt_stats pkstats;
 	struct mlx4_en_port_stats port_stats;
 	u64 stats_bitmap;
-	char *mc_addrs;
-	int mc_addrs_cnt;
+	struct list_head mc_list;
+	struct list_head curr_list;
 	struct mlx4_en_stat_out_mbox hw_stats;
 	int vids[128];
 	bool wol;
-- 
1.7.1

^ permalink raw reply related

* [PATCH net-next 06/10] {NET,IB}/mlx4: Add device managed flow steering firmware API
From: Or Gerlitz @ 2012-07-01  9:43 UTC (permalink / raw)
  To: davem; +Cc: roland, yevgenyp, oren, netdev, Hadar Hen Zion, Or Gerlitz
In-Reply-To: <1341135823-29039-1-git-send-email-ogerlitz@mellanox.com>

From: Hadar Hen Zion <hadarh@mellanox.co.il>

The driver is modified to support three operation modes.

If supported by firmware use the device managed flow steering
API, that which we call device managed steering mode. Else, if
the firmware supports the B0 steering mode use it, and finally,
if none of the above, use the A0 steering mode.

When the steering mode is device managed, the code is modified
such that L2 based rules set by the mlx4_en driver for Ethernet
unicast and multicast, and the IB stack multicast attach calls
done through the mlx4_ib driver are all routed to use the device
managed API.

When attaching rule using device managed flow steering API,
the firmware returns a 64 bit registration id, which is to be
provided during detach.

Set the firmware to use either standard L2 or L3/L4 optimized flow
steering configuration. Currently there's no dedicated firmware
command to set this, and it has to be provided to the device during
initialization, hence a module param was used here.

Signed-off-by: Hadar Hen Zion <hadarh@mellanox.co.il>
Signed-off-by: Or Gerlitz <ogerlitz@mellanox.com>
---
 drivers/infiniband/hw/mlx4/main.c                  |   62 +++-
 drivers/infiniband/hw/mlx4/mlx4_ib.h               |    1 +
 drivers/infiniband/hw/mlx4/qp.c                    |    1 +
 drivers/net/ethernet/mellanox/mlx4/en_netdev.c     |   21 +-
 drivers/net/ethernet/mellanox/mlx4/fw.c            |   90 ++++-
 drivers/net/ethernet/mellanox/mlx4/fw.h            |    3 +
 drivers/net/ethernet/mellanox/mlx4/main.c          |   63 +++-
 drivers/net/ethernet/mellanox/mlx4/mcg.c           |  365 +++++++++++++++++++-
 drivers/net/ethernet/mellanox/mlx4/mlx4.h          |   13 +
 drivers/net/ethernet/mellanox/mlx4/mlx4_en.h       |    2 +
 drivers/net/ethernet/mellanox/mlx4/port.c          |  102 ++++--
 drivers/net/ethernet/mellanox/mlx4/profile.c       |   12 +-
 .../net/ethernet/mellanox/mlx4/resource_tracker.c  |    6 +
 include/linux/mlx4/device.h                        |  108 ++++++-
 14 files changed, 766 insertions(+), 83 deletions(-)

diff --git a/drivers/infiniband/hw/mlx4/main.c b/drivers/infiniband/hw/mlx4/main.c
index 3530c41..8a3a203 100644
--- a/drivers/infiniband/hw/mlx4/main.c
+++ b/drivers/infiniband/hw/mlx4/main.c
@@ -718,26 +718,53 @@ int mlx4_ib_add_mc(struct mlx4_ib_dev *mdev, struct mlx4_ib_qp *mqp,
 	return ret;
 }
 
+struct mlx4_ib_steering {
+	struct list_head list;
+	u64 reg_id;
+	union ib_gid gid;
+};
+
 static int mlx4_ib_mcg_attach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid)
 {
 	int err;
 	struct mlx4_ib_dev *mdev = to_mdev(ibqp->device);
 	struct mlx4_ib_qp *mqp = to_mqp(ibqp);
+	u64 reg_id;
+	struct mlx4_ib_steering *ib_steering = NULL;
+
+	if (mdev->dev->caps.steering_mode ==
+	    MLX4_STEERING_MODE_DEVICE_MANAGED) {
+		ib_steering = kmalloc(sizeof(*ib_steering), GFP_KERNEL);
+		if (!ib_steering)
+			return -ENOMEM;
+	}
 
-	err = mlx4_multicast_attach(mdev->dev, &mqp->mqp, gid->raw,
-				    !!(mqp->flags & MLX4_IB_QP_BLOCK_MULTICAST_LOOPBACK),
-				    MLX4_PROT_IB_IPV6);
+	err = mlx4_multicast_attach(mdev->dev, &mqp->mqp, gid->raw, mqp->port,
+				    !!(mqp->flags &
+				       MLX4_IB_QP_BLOCK_MULTICAST_LOOPBACK),
+				    MLX4_PROT_IB_IPV6, &reg_id);
 	if (err)
-		return err;
+		goto err_malloc;
 
 	err = add_gid_entry(ibqp, gid);
 	if (err)
 		goto err_add;
 
+	if (ib_steering) {
+		memcpy(ib_steering->gid.raw, gid->raw, 16);
+		ib_steering->reg_id = reg_id;
+		mutex_lock(&mqp->mutex);
+		list_add(&ib_steering->list, &mqp->steering_rules);
+		mutex_unlock(&mqp->mutex);
+	}
 	return 0;
 
 err_add:
-	mlx4_multicast_detach(mdev->dev, &mqp->mqp, gid->raw, MLX4_PROT_IB_IPV6);
+	mlx4_multicast_detach(mdev->dev, &mqp->mqp, gid->raw,
+			      MLX4_PROT_IB_IPV6, reg_id);
+err_malloc:
+	kfree(ib_steering);
+
 	return err;
 }
 
@@ -765,9 +792,30 @@ static int mlx4_ib_mcg_detach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid)
 	u8 mac[6];
 	struct net_device *ndev;
 	struct mlx4_ib_gid_entry *ge;
+	u64 reg_id = 0;
+
+	if (mdev->dev->caps.steering_mode ==
+	    MLX4_STEERING_MODE_DEVICE_MANAGED) {
+		struct mlx4_ib_steering *ib_steering;
+
+		mutex_lock(&mqp->mutex);
+		list_for_each_entry(ib_steering, &mqp->steering_rules, list) {
+			if (!memcmp(ib_steering->gid.raw, gid->raw, 16)) {
+				list_del(&ib_steering->list);
+				break;
+			}
+		}
+		mutex_unlock(&mqp->mutex);
+		if (&ib_steering->list == &mqp->steering_rules) {
+			pr_err("Couldn't find reg_id for mgid. Steering rule is left attached\n");
+			return -EINVAL;
+		}
+		reg_id = ib_steering->reg_id;
+		kfree(ib_steering);
+	}
 
-	err = mlx4_multicast_detach(mdev->dev,
-				    &mqp->mqp, gid->raw, MLX4_PROT_IB_IPV6);
+	err = mlx4_multicast_detach(mdev->dev, &mqp->mqp, gid->raw,
+				    MLX4_PROT_IB_IPV6, reg_id);
 	if (err)
 		return err;
 
diff --git a/drivers/infiniband/hw/mlx4/mlx4_ib.h b/drivers/infiniband/hw/mlx4/mlx4_ib.h
index ff36655..42df4f7 100644
--- a/drivers/infiniband/hw/mlx4/mlx4_ib.h
+++ b/drivers/infiniband/hw/mlx4/mlx4_ib.h
@@ -163,6 +163,7 @@ struct mlx4_ib_qp {
 	u8			state;
 	int			mlx_type;
 	struct list_head	gid_list;
+	struct list_head	steering_rules;
 };
 
 struct mlx4_ib_srq {
diff --git a/drivers/infiniband/hw/mlx4/qp.c b/drivers/infiniband/hw/mlx4/qp.c
index 8d4ed24..6af19f6 100644
--- a/drivers/infiniband/hw/mlx4/qp.c
+++ b/drivers/infiniband/hw/mlx4/qp.c
@@ -495,6 +495,7 @@ static int create_qp_common(struct mlx4_ib_dev *dev, struct ib_pd *pd,
 	spin_lock_init(&qp->sq.lock);
 	spin_lock_init(&qp->rq.lock);
 	INIT_LIST_HEAD(&qp->gid_list);
+	INIT_LIST_HEAD(&qp->steering_rules);
 
 	qp->state	 = IB_QPS_RESET;
 	if (init_attr->sq_sig_type == IB_SIGNAL_ALL_WR)
diff --git a/drivers/net/ethernet/mellanox/mlx4/en_netdev.c b/drivers/net/ethernet/mellanox/mlx4/en_netdev.c
index f9a4a00..62f0601 100644
--- a/drivers/net/ethernet/mellanox/mlx4/en_netdev.c
+++ b/drivers/net/ethernet/mellanox/mlx4/en_netdev.c
@@ -460,7 +460,8 @@ static void mlx4_en_do_set_multicast(struct work_struct *work)
 				err = mlx4_multicast_detach(mdev->dev,
 							    &priv->rss_map.indir_qp,
 							    mc_list,
-							    MLX4_PROT_ETH);
+							    MLX4_PROT_ETH,
+							    mclist->reg_id);
 				if (err)
 					en_err(priv, "Fail to detach multicast address\n");
 
@@ -472,11 +473,14 @@ static void mlx4_en_do_set_multicast(struct work_struct *work)
 			if (mclist->action == MCLIST_ADD) {
 				/* attach the address */
 				memcpy(&mc_list[10], mclist->addr, ETH_ALEN);
+				/* needed for B0 steering support */
 				mc_list[5] = priv->port;
 				err = mlx4_multicast_attach(mdev->dev,
 							    &priv->rss_map.indir_qp,
-							    mc_list, 0,
-							    MLX4_PROT_ETH);
+							    mc_list,
+							    priv->port, 0,
+							    MLX4_PROT_ETH,
+							    &mclist->reg_id);
 				if (err)
 					en_err(priv, "Fail to attach multicast address\n");
 
@@ -824,9 +828,10 @@ int mlx4_en_start_port(struct net_device *dev)
 
 	/* Attach rx QP to bradcast address */
 	memset(&mc_list[10], 0xff, ETH_ALEN);
-	mc_list[5] = priv->port;
+	mc_list[5] = priv->port; /* needed for B0 steering support */
 	if (mlx4_multicast_attach(mdev->dev, &priv->rss_map.indir_qp, mc_list,
-				  0, MLX4_PROT_ETH))
+				  priv->port, 0, MLX4_PROT_ETH,
+				  &priv->broadcast_id))
 		mlx4_warn(mdev, "Failed Attaching Broadcast\n");
 
 	/* Must redo promiscuous mode setup. */
@@ -883,14 +888,14 @@ void mlx4_en_stop_port(struct net_device *dev)
 
 	/* Detach All multicasts */
 	memset(&mc_list[10], 0xff, ETH_ALEN);
-	mc_list[5] = priv->port;
+	mc_list[5] = priv->port; /* needed for B0 steering support */
 	mlx4_multicast_detach(mdev->dev, &priv->rss_map.indir_qp, mc_list,
-			      MLX4_PROT_ETH);
+			      MLX4_PROT_ETH, priv->broadcast_id);
 	list_for_each_entry(mclist, &priv->curr_list, list) {
 		memcpy(&mc_list[10], mclist->addr, ETH_ALEN);
 		mc_list[5] = priv->port;
 		mlx4_multicast_detach(mdev->dev, &priv->rss_map.indir_qp,
-				      mc_list, MLX4_PROT_ETH);
+				      mc_list, MLX4_PROT_ETH, mclist->reg_id);
 	}
 	mlx4_en_clear_list(dev);
 	list_for_each_entry_safe(mclist, tmp, &priv->curr_list, list) {
diff --git a/drivers/net/ethernet/mellanox/mlx4/fw.c b/drivers/net/ethernet/mellanox/mlx4/fw.c
index 40e048b..c7fb921 100644
--- a/drivers/net/ethernet/mellanox/mlx4/fw.c
+++ b/drivers/net/ethernet/mellanox/mlx4/fw.c
@@ -123,7 +123,8 @@ static void dump_dev_cap_flags2(struct mlx4_dev *dev, u64 flags)
 	static const char * const fname[] = {
 		[0] = "RSS support",
 		[1] = "RSS Toeplitz Hash Function support",
-		[2] = "RSS XOR Hash Function support"
+		[2] = "RSS XOR Hash Function support",
+		[3] = "Device manage flow steering support"
 	};
 	int i;
 
@@ -391,6 +392,8 @@ int mlx4_QUERY_DEV_CAP(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap)
 #define QUERY_DEV_CAP_RSVD_XRC_OFFSET		0x66
 #define QUERY_DEV_CAP_MAX_XRC_OFFSET		0x67
 #define QUERY_DEV_CAP_MAX_COUNTERS_OFFSET	0x68
+#define QUERY_DEV_CAP_FLOW_STEERING_RANGE_EN_OFFSET	0x76
+#define QUERY_DEV_CAP_FLOW_STEERING_MAX_QP_OFFSET	0x77
 #define QUERY_DEV_CAP_RDMARC_ENTRY_SZ_OFFSET	0x80
 #define QUERY_DEV_CAP_QPC_ENTRY_SZ_OFFSET	0x82
 #define QUERY_DEV_CAP_AUX_ENTRY_SZ_OFFSET	0x84
@@ -474,6 +477,12 @@ int mlx4_QUERY_DEV_CAP(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap)
 	dev_cap->num_ports = field & 0xf;
 	MLX4_GET(field, outbox, QUERY_DEV_CAP_MAX_MSG_SZ_OFFSET);
 	dev_cap->max_msg_sz = 1 << (field & 0x1f);
+	MLX4_GET(field, outbox, QUERY_DEV_CAP_FLOW_STEERING_RANGE_EN_OFFSET);
+	if (field & 0x80)
+		dev_cap->flags2 |= MLX4_DEV_CAP_FLAG2_FS_EN;
+	dev_cap->fs_log_max_ucast_qp_range_size = field & 0x1f;
+	MLX4_GET(field, outbox, QUERY_DEV_CAP_FLOW_STEERING_MAX_QP_OFFSET);
+	dev_cap->fs_max_num_qp_per_entry = field;
 	MLX4_GET(stat_rate, outbox, QUERY_DEV_CAP_RATE_SUPPORT_OFFSET);
 	dev_cap->stat_rate_support = stat_rate;
 	MLX4_GET(ext_flags, outbox, QUERY_DEV_CAP_EXT_FLAGS_OFFSET);
@@ -1061,6 +1070,15 @@ int mlx4_INIT_HCA(struct mlx4_dev *dev, struct mlx4_init_hca_param *param)
 #define	 INIT_HCA_LOG_MC_HASH_SZ_OFFSET	 (INIT_HCA_MCAST_OFFSET + 0x16)
 #define  INIT_HCA_UC_STEERING_OFFSET	 (INIT_HCA_MCAST_OFFSET + 0x18)
 #define	 INIT_HCA_LOG_MC_TABLE_SZ_OFFSET (INIT_HCA_MCAST_OFFSET + 0x1b)
+#define  INIT_HCA_DEVICE_MANAGED_FLOW_STEERING_EN	0x6
+#define  INIT_HCA_FS_PARAM_OFFSET         0x1d0
+#define  INIT_HCA_FS_BASE_OFFSET          (INIT_HCA_FS_PARAM_OFFSET + 0x00)
+#define  INIT_HCA_FS_LOG_ENTRY_SZ_OFFSET  (INIT_HCA_FS_PARAM_OFFSET + 0x12)
+#define  INIT_HCA_FS_LOG_TABLE_SZ_OFFSET  (INIT_HCA_FS_PARAM_OFFSET + 0x1b)
+#define  INIT_HCA_FS_ETH_BITS_OFFSET      (INIT_HCA_FS_PARAM_OFFSET + 0x21)
+#define  INIT_HCA_FS_ETH_NUM_ADDRS_OFFSET (INIT_HCA_FS_PARAM_OFFSET + 0x22)
+#define  INIT_HCA_FS_IB_BITS_OFFSET       (INIT_HCA_FS_PARAM_OFFSET + 0x25)
+#define  INIT_HCA_FS_IB_NUM_ADDRS_OFFSET  (INIT_HCA_FS_PARAM_OFFSET + 0x26)
 #define INIT_HCA_TPT_OFFSET		 0x0f0
 #define	 INIT_HCA_DMPT_BASE_OFFSET	 (INIT_HCA_TPT_OFFSET + 0x00)
 #define	 INIT_HCA_LOG_MPT_SZ_OFFSET	 (INIT_HCA_TPT_OFFSET + 0x0b)
@@ -1119,14 +1137,44 @@ int mlx4_INIT_HCA(struct mlx4_dev *dev, struct mlx4_init_hca_param *param)
 	MLX4_PUT(inbox, param->rdmarc_base,   INIT_HCA_RDMARC_BASE_OFFSET);
 	MLX4_PUT(inbox, param->log_rd_per_qp, INIT_HCA_LOG_RD_OFFSET);
 
-	/* multicast attributes */
+	/* steering attributes */
+	if (dev->caps.steering_mode ==
+	    MLX4_STEERING_MODE_DEVICE_MANAGED) {
+		*(inbox + INIT_HCA_FLAGS_OFFSET / 4) |=
+			cpu_to_be32(1 <<
+				    INIT_HCA_DEVICE_MANAGED_FLOW_STEERING_EN);
+
+		MLX4_PUT(inbox, param->mc_base, INIT_HCA_FS_BASE_OFFSET);
+		MLX4_PUT(inbox, param->log_mc_entry_sz,
+			 INIT_HCA_FS_LOG_ENTRY_SZ_OFFSET);
+		MLX4_PUT(inbox, param->log_mc_table_sz,
+			 INIT_HCA_FS_LOG_TABLE_SZ_OFFSET);
+		/* Enable Ethernet flow steering
+		 * with udp unicast and tcp unicast */
+		MLX4_PUT(inbox, param->fs_hash_enable_bits,
+			 INIT_HCA_FS_ETH_BITS_OFFSET);
+		MLX4_PUT(inbox, (u16) MLX4_FS_NUM_OF_L2_ADDR,
+			 INIT_HCA_FS_ETH_NUM_ADDRS_OFFSET);
+		/* Enable IPoIB flow steering
+		 * with udp unicast and tcp unicast */
+		MLX4_PUT(inbox, param->fs_hash_enable_bits,
+			 INIT_HCA_FS_IB_BITS_OFFSET);
+		MLX4_PUT(inbox, (u16) MLX4_FS_NUM_OF_L2_ADDR,
+			 INIT_HCA_FS_IB_NUM_ADDRS_OFFSET);
 
-	MLX4_PUT(inbox, param->mc_base,		INIT_HCA_MC_BASE_OFFSET);
-	MLX4_PUT(inbox, param->log_mc_entry_sz, INIT_HCA_LOG_MC_ENTRY_SZ_OFFSET);
-	MLX4_PUT(inbox, param->log_mc_hash_sz,  INIT_HCA_LOG_MC_HASH_SZ_OFFSET);
-	if (dev->caps.steering_mode == MLX4_STEERING_MODE_B0)
-		MLX4_PUT(inbox, (u8) (1 << 3),	INIT_HCA_UC_STEERING_OFFSET);
-	MLX4_PUT(inbox, param->log_mc_table_sz, INIT_HCA_LOG_MC_TABLE_SZ_OFFSET);
+	} else {
+
+		MLX4_PUT(inbox, param->mc_base,	INIT_HCA_MC_BASE_OFFSET);
+		MLX4_PUT(inbox, param->log_mc_entry_sz,
+			 INIT_HCA_LOG_MC_ENTRY_SZ_OFFSET);
+		MLX4_PUT(inbox, param->log_mc_hash_sz,
+			 INIT_HCA_LOG_MC_HASH_SZ_OFFSET);
+		MLX4_PUT(inbox, param->log_mc_table_sz,
+			 INIT_HCA_LOG_MC_TABLE_SZ_OFFSET);
+		if (dev->caps.steering_mode == MLX4_STEERING_MODE_B0)
+			MLX4_PUT(inbox, (u8) (1 << 3),
+				 INIT_HCA_UC_STEERING_OFFSET);
+	}
 
 	/* TPT attributes */
 
@@ -1188,15 +1236,25 @@ int mlx4_QUERY_HCA(struct mlx4_dev *dev,
 	MLX4_GET(param->rdmarc_base,   outbox, INIT_HCA_RDMARC_BASE_OFFSET);
 	MLX4_GET(param->log_rd_per_qp, outbox, INIT_HCA_LOG_RD_OFFSET);
 
-	/* multicast attributes */
+	/* steering attributes */
+	if (dev->caps.steering_mode ==
+	    MLX4_STEERING_MODE_DEVICE_MANAGED) {
 
-	MLX4_GET(param->mc_base,         outbox, INIT_HCA_MC_BASE_OFFSET);
-	MLX4_GET(param->log_mc_entry_sz, outbox,
-		 INIT_HCA_LOG_MC_ENTRY_SZ_OFFSET);
-	MLX4_GET(param->log_mc_hash_sz,  outbox,
-		 INIT_HCA_LOG_MC_HASH_SZ_OFFSET);
-	MLX4_GET(param->log_mc_table_sz, outbox,
-		 INIT_HCA_LOG_MC_TABLE_SZ_OFFSET);
+		MLX4_GET(param->mc_base, outbox, INIT_HCA_FS_BASE_OFFSET);
+		MLX4_GET(param->log_mc_entry_sz, outbox,
+			 INIT_HCA_FS_LOG_ENTRY_SZ_OFFSET);
+		MLX4_GET(param->log_mc_table_sz, outbox,
+			 INIT_HCA_FS_LOG_TABLE_SZ_OFFSET);
+	} else {
+
+		MLX4_GET(param->mc_base, outbox, INIT_HCA_MC_BASE_OFFSET);
+		MLX4_GET(param->log_mc_entry_sz, outbox,
+			 INIT_HCA_LOG_MC_ENTRY_SZ_OFFSET);
+		MLX4_GET(param->log_mc_hash_sz,  outbox,
+			 INIT_HCA_LOG_MC_HASH_SZ_OFFSET);
+		MLX4_GET(param->log_mc_table_sz, outbox,
+			 INIT_HCA_LOG_MC_TABLE_SZ_OFFSET);
+	}
 
 	/* TPT attributes */
 
diff --git a/drivers/net/ethernet/mellanox/mlx4/fw.h b/drivers/net/ethernet/mellanox/mlx4/fw.h
index 64c0399..83fcbbf 100644
--- a/drivers/net/ethernet/mellanox/mlx4/fw.h
+++ b/drivers/net/ethernet/mellanox/mlx4/fw.h
@@ -78,6 +78,8 @@ struct mlx4_dev_cap {
 	u16 wavelength[MLX4_MAX_PORTS + 1];
 	u64 trans_code[MLX4_MAX_PORTS + 1];
 	u16 stat_rate_support;
+	int fs_log_max_ucast_qp_range_size;
+	int fs_max_num_qp_per_entry;
 	u64 flags;
 	u64 flags2;
 	int reserved_uars;
@@ -165,6 +167,7 @@ struct mlx4_init_hca_param {
 	u8  log_mpt_sz;
 	u8  log_uar_sz;
 	u8  uar_page_sz; /* log pg sz in 4k chunks */
+	u8  fs_hash_enable_bits;
 };
 
 struct mlx4_init_ib_param {
diff --git a/drivers/net/ethernet/mellanox/mlx4/main.c b/drivers/net/ethernet/mellanox/mlx4/main.c
index 7df0517..cf5eb2c 100644
--- a/drivers/net/ethernet/mellanox/mlx4/main.c
+++ b/drivers/net/ethernet/mellanox/mlx4/main.c
@@ -90,7 +90,9 @@ module_param_named(log_num_mgm_entry_size,
 MODULE_PARM_DESC(log_num_mgm_entry_size, "log mgm size, that defines the num"
 					 " of qp per mcg, for example:"
 					 " 10 gives 248.range: 9<="
-					 " log_num_mgm_entry_size <= 12");
+					 " log_num_mgm_entry_size <= 12."
+					 " Not in use with device managed"
+					 " flow steering");
 
 #define MLX4_VF                                        (1 << 0)
 
@@ -136,6 +138,11 @@ module_param_array(port_type_array, int, &arr_argc, 0444);
 MODULE_PARM_DESC(port_type_array, "Array of port types: HW_DEFAULT (0) is default "
 				"1 for IB, 2 for Ethernet");
 
+static int mlx4_flow_steering_hash;
+module_param_named(flow_steering_hash, mlx4_flow_steering_hash, int, 0444);
+MODULE_PARM_DESC(flow_steering_hash,
+		 "Flow steering hash function configuration. Config options: L2 = 0, L2_L3_L4 = 1 (default: L2).");
+
 struct mlx4_port_config {
 	struct list_head list;
 	enum mlx4_port_type port_type[MLX4_MAX_PORTS + 1];
@@ -273,21 +280,29 @@ static int mlx4_dev_cap(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap)
 	dev->caps.max_gso_sz	     = dev_cap->max_gso_sz;
 	dev->caps.max_rss_tbl_sz     = dev_cap->max_rss_tbl_sz;
 
-	if (dev->caps.flags & MLX4_DEV_CAP_FLAG_VEP_UC_STEER &&
-	    dev->caps.flags & MLX4_DEV_CAP_FLAG_VEP_MC_STEER) {
-		dev->caps.steering_mode = MLX4_STEERING_MODE_B0;
-
+	if (dev_cap->flags2 & MLX4_DEV_CAP_FLAG2_FS_EN) {
+		dev->caps.steering_mode = MLX4_STEERING_MODE_DEVICE_MANAGED;
+		dev->caps.num_qp_per_mgm = dev_cap->fs_max_num_qp_per_entry;
+		dev->caps.fs_log_max_ucast_qp_range_size =
+			dev_cap->fs_log_max_ucast_qp_range_size;
 	} else {
-		dev->caps.steering_mode = MLX4_STEERING_MODE_A0;
+		if (dev->caps.flags & MLX4_DEV_CAP_FLAG_VEP_UC_STEER &&
+		    dev->caps.flags & MLX4_DEV_CAP_FLAG_VEP_MC_STEER) {
+			dev->caps.steering_mode = MLX4_STEERING_MODE_B0;
+
+		} else {
+
+			dev->caps.steering_mode = MLX4_STEERING_MODE_A0;
 
-		if (dev->caps.flags & MLX4_DEV_CAP_FLAG_VEP_UC_STEER ||
-		    dev->caps.flags & MLX4_DEV_CAP_FLAG_VEP_MC_STEER)
-			mlx4_warn(dev, "Must have UC_STEER and MC_STEER flags "
-				       "set to use B0 steering. Falling back to A0 steering mode.\n");
+			if (dev->caps.flags & MLX4_DEV_CAP_FLAG_VEP_UC_STEER ||
+			    dev->caps.flags & MLX4_DEV_CAP_FLAG_VEP_MC_STEER)
+				mlx4_warn(dev, "Must have UC_STEER and MC_STEER flags "
+						"set to use B0 steering. Falling back to A0 steering mode.\n");
+		}
+		dev->caps.num_qp_per_mgm = mlx4_get_qp_per_mgm(dev);
 	}
 	mlx4_dbg(dev, "Steering mode is: %s\n",
 		 mlx4_steering_mode_str(dev->caps.steering_mode));
-	dev->caps.num_qp_per_mgm = mlx4_get_qp_per_mgm(dev);
 
 	/* Sense port always allowed on supported devices for ConnectX1 and 2 */
 	if (dev->pdev->device != 0x1003)
@@ -982,9 +997,11 @@ static int mlx4_init_icm(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap,
 	}
 
 	/*
-	 * It's not strictly required, but for simplicity just map the
-	 * whole multicast group table now.  The table isn't very big
-	 * and it's a lot easier than trying to track ref counts.
+	 * For flow steering device managed mode it is required to use
+	 * mlx4_init_icm_table. For B0 steering mode it's not strictly
+	 * required, but for simplicity just map the whole multicast
+	 * group table now.  The table isn't very big and it's a lot
+	 * easier than trying to track ref counts.
 	 */
 	err = mlx4_init_icm_table(dev, &priv->mcg_table.table,
 				  init_hca->mc_base,
@@ -1220,7 +1237,25 @@ static int mlx4_init_hca(struct mlx4_dev *dev)
 			goto err_stop_fw;
 		}
 
+		priv->fs_hash_mode = mlx4_flow_steering_hash;
+
+		switch (priv->fs_hash_mode) {
+		case MLX4_FS_L2_HASH:
+			init_hca.fs_hash_enable_bits = 0;
+			break;
+
+		case MLX4_FS_L2_L3_L4_HASH:
+			/* Enable flow steering with
+			   udp unicast and tcp unicast*/
+			init_hca.fs_hash_enable_bits =
+				MLX4_FS_UDP_UC_EN | MLX4_FS_TCP_UC_EN;
+			break;
+		}
+
 		profile = default_profile;
+		if (dev->caps.steering_mode ==
+		    MLX4_STEERING_MODE_DEVICE_MANAGED)
+			profile.num_mcg = MLX4_FS_NUM_MCG;
 
 		icm_size = mlx4_make_profile(dev, &profile, &dev_cap,
 					     &init_hca);
diff --git a/drivers/net/ethernet/mellanox/mlx4/mcg.c b/drivers/net/ethernet/mellanox/mlx4/mcg.c
index 3c59a33..c567655 100644
--- a/drivers/net/ethernet/mellanox/mlx4/mcg.c
+++ b/drivers/net/ethernet/mellanox/mlx4/mcg.c
@@ -41,6 +41,7 @@
 
 #define MGM_QPN_MASK       0x00FFFFFF
 #define MGM_BLCK_LB_BIT    30
+#define MLX4_MAC_MASK	   0xffffffffffffULL
 
 static const u8 zero_gid[16];	/* automatically initialized to 0 */
 
@@ -54,7 +55,12 @@ struct mlx4_mgm {
 
 int mlx4_get_mgm_entry_size(struct mlx4_dev *dev)
 {
-	return min((1 << mlx4_log_num_mgm_entry_size), MLX4_MAX_MGM_ENTRY_SIZE);
+	if (dev->caps.steering_mode ==
+	    MLX4_STEERING_MODE_DEVICE_MANAGED)
+		return 1 << MLX4_FS_MGM_LOG_ENTRY_SIZE;
+	else
+		return min((1 << mlx4_log_num_mgm_entry_size),
+			   MLX4_MAX_MGM_ENTRY_SIZE);
 }
 
 int mlx4_get_qp_per_mgm(struct mlx4_dev *dev)
@@ -643,6 +649,311 @@ static int find_entry(struct mlx4_dev *dev, u8 port,
 	return err;
 }
 
+struct mlx4_net_trans_rule_hw_ctrl {
+	__be32 ctrl;
+	__be32 vf_vep_port;
+	__be32 qpn;
+	__be32 reserved;
+};
+
+static void trans_rule_ctrl_to_hw(struct mlx4_net_trans_rule *ctrl,
+				struct mlx4_net_trans_rule_hw_ctrl *hw)
+{
+	static const u8 __promisc_mode[] = {
+		[MLX4_FS_PROMISC_NONE]   = 0x0,
+		[MLX4_FS_PROMISC_UPLINK] = 0x1,
+		[MLX4_FS_PROMISC_FUNCTION_PORT] = 0x2,
+		[MLX4_FS_PROMISC_ALL_MULTI] = 0x3,
+	};
+
+	u32 dw = 0;
+
+	dw = ctrl->queue_mode == MLX4_NET_TRANS_Q_LIFO ? 1 : 0;
+	dw |= ctrl->exclusive ? (1 << 2) : 0;
+	dw |= ctrl->allow_loopback ? (1 << 3) : 0;
+	dw |= __promisc_mode[ctrl->promisc_mode] << 8;
+	dw |= ctrl->priority << 16;
+
+	hw->ctrl = cpu_to_be32(dw);
+	hw->vf_vep_port = cpu_to_be32(ctrl->port);
+	hw->qpn = cpu_to_be32(ctrl->qpn);
+}
+
+struct mlx4_net_trans_rule_hw_ib {
+	u8	size;
+	u8	rsvd1;
+	__be16	id;
+	u32	rsvd2;
+	__be32	qpn;
+	__be32	qpn_mask;
+	u8	dst_gid[16];
+	u8	dst_gid_msk[16];
+} __packed;
+
+struct mlx4_net_trans_rule_hw_eth {
+	u8	size;
+	u8	rsvd;
+	__be16	id;
+	u8	rsvd1[6];
+	u8	dst_mac[6];
+	u16	rsvd2;
+	u8	dst_mac_msk[6];
+	u16	rsvd3;
+	u8	src_mac[6];
+	u16	rsvd4;
+	u8	src_mac_msk[6];
+	u8      rsvd5;
+	u8      ether_type_enable;
+	__be16  ether_type;
+	__be16  vlan_id_msk;
+	__be16  vlan_id;
+} __packed;
+
+struct mlx4_net_trans_rule_hw_tcp_udp {
+	u8	size;
+	u8	rsvd;
+	__be16	id;
+	__be16	rsvd1[3];
+	__be16	dst_port;
+	__be16	rsvd2;
+	__be16	dst_port_msk;
+	__be16	rsvd3;
+	__be16	src_port;
+	__be16	rsvd4;
+	__be16	src_port_msk;
+} __packed;
+
+struct mlx4_net_trans_rule_hw_ipv4 {
+	u8	size;
+	u8	rsvd;
+	__be16	id;
+	__be32	rsvd1;
+	__be32	dst_ip;
+	__be32	dst_ip_msk;
+	__be32	src_ip;
+	__be32	src_ip_msk;
+} __packed;
+
+struct _rule_hw {
+	union {
+		struct {
+			u8 size;
+			u8 rsvd;
+			__be16 id;
+		};
+		struct mlx4_net_trans_rule_hw_eth eth;
+		struct mlx4_net_trans_rule_hw_ib ib;
+		struct mlx4_net_trans_rule_hw_ipv4 ipv4;
+		struct mlx4_net_trans_rule_hw_tcp_udp tcp_udp;
+	};
+};
+
+static int parse_trans_rule(struct mlx4_dev *dev, struct mlx4_spec_list *spec,
+			    struct _rule_hw *rule_hw)
+{
+	static const u16 __sw_id_hw[] = {
+		[MLX4_NET_TRANS_RULE_ID_ETH]     = 0xE001,
+		[MLX4_NET_TRANS_RULE_ID_IB]      = 0xE005,
+		[MLX4_NET_TRANS_RULE_ID_IPV6]    = 0xE003,
+		[MLX4_NET_TRANS_RULE_ID_IPV4]    = 0xE002,
+		[MLX4_NET_TRANS_RULE_ID_TCP]     = 0xE004,
+		[MLX4_NET_TRANS_RULE_ID_UDP]     = 0xE006
+	};
+
+	static const size_t __rule_hw_sz[] = {
+		[MLX4_NET_TRANS_RULE_ID_ETH] =
+			sizeof(struct mlx4_net_trans_rule_hw_eth),
+		[MLX4_NET_TRANS_RULE_ID_IB] =
+			sizeof(struct mlx4_net_trans_rule_hw_ib),
+		[MLX4_NET_TRANS_RULE_ID_IPV6] = 0,
+		[MLX4_NET_TRANS_RULE_ID_IPV4] =
+			sizeof(struct mlx4_net_trans_rule_hw_ipv4),
+		[MLX4_NET_TRANS_RULE_ID_TCP] =
+			sizeof(struct mlx4_net_trans_rule_hw_tcp_udp),
+		[MLX4_NET_TRANS_RULE_ID_UDP] =
+			sizeof(struct mlx4_net_trans_rule_hw_tcp_udp)
+	};
+	if (spec->id > MLX4_NET_TRANS_RULE_NUM) {
+		mlx4_err(dev, "Invalid network rule id. id = %d\n", spec->id);
+		return -EINVAL;
+	}
+	memset(rule_hw, 0, __rule_hw_sz[spec->id]);
+	rule_hw->id = cpu_to_be16(__sw_id_hw[spec->id]);
+	rule_hw->size = __rule_hw_sz[spec->id] >> 2;
+
+	switch (spec->id) {
+	case MLX4_NET_TRANS_RULE_ID_ETH:
+		memcpy(rule_hw->eth.dst_mac, spec->eth.dst_mac, ETH_ALEN);
+		memcpy(rule_hw->eth.dst_mac_msk, spec->eth.dst_mac_msk,
+								ETH_ALEN);
+		memcpy(rule_hw->eth.src_mac, spec->eth.src_mac, ETH_ALEN);
+		memcpy(rule_hw->eth.src_mac_msk, spec->eth.src_mac_msk,
+								ETH_ALEN);
+		if (spec->eth.ether_type_enable) {
+			rule_hw->eth.ether_type_enable = 1;
+			rule_hw->eth.ether_type = spec->eth.ether_type;
+		}
+		rule_hw->eth.vlan_id = spec->eth.vlan_id;
+		rule_hw->eth.vlan_id_msk = spec->eth.vlan_id_msk;
+		break;
+
+	case MLX4_NET_TRANS_RULE_ID_IB:
+		rule_hw->ib.qpn = spec->ib.r_qpn;
+		rule_hw->ib.qpn_mask = spec->ib.qpn_msk;
+		memcpy(&rule_hw->ib.dst_gid, &spec->ib.dst_gid, 16);
+		memcpy(&rule_hw->ib.dst_gid_msk, &spec->ib.dst_gid_msk, 16);
+		break;
+
+	case MLX4_NET_TRANS_RULE_ID_IPV6:
+		return -EOPNOTSUPP;
+
+	case MLX4_NET_TRANS_RULE_ID_IPV4:
+		rule_hw->ipv4.src_ip = spec->ipv4.src_ip;
+		rule_hw->ipv4.src_ip_msk = spec->ipv4.src_ip_msk;
+		rule_hw->ipv4.dst_ip = spec->ipv4.dst_ip;
+		rule_hw->ipv4.dst_ip_msk = spec->ipv4.dst_ip_msk;
+		break;
+
+	case MLX4_NET_TRANS_RULE_ID_TCP:
+	case MLX4_NET_TRANS_RULE_ID_UDP:
+		rule_hw->tcp_udp.dst_port = spec->tcp_udp.dst_port;
+		rule_hw->tcp_udp.dst_port_msk = spec->tcp_udp.dst_port_msk;
+		rule_hw->tcp_udp.src_port = spec->tcp_udp.src_port;
+		rule_hw->tcp_udp.src_port_msk = spec->tcp_udp.src_port_msk;
+		break;
+
+	default:
+		return -EINVAL;
+	}
+
+	return __rule_hw_sz[spec->id];
+}
+
+static void mlx4_err_rule(struct mlx4_dev *dev, char *str,
+			  struct mlx4_net_trans_rule *rule)
+{
+#define BUF_SIZE 256
+	struct mlx4_spec_list *cur;
+	char buf[BUF_SIZE];
+	int len = 0;
+
+	mlx4_err(dev, "%s", str);
+	len += snprintf(buf + len, BUF_SIZE - len,
+			"port = %d prio = 0x%x qp = 0x%x ",
+			rule->port, rule->priority, rule->qpn);
+
+	list_for_each_entry(cur, &rule->list, list) {
+		switch (cur->id) {
+		case MLX4_NET_TRANS_RULE_ID_ETH:
+			len += snprintf(buf + len, BUF_SIZE - len,
+					"dmac = %pM ", &cur->eth.dst_mac);
+			if (cur->eth.ether_type)
+				len += snprintf(buf + len, BUF_SIZE - len,
+						"ethertype = 0x%x ",
+						be16_to_cpu(cur->eth.ether_type));
+			if (cur->eth.vlan_id)
+				len += snprintf(buf + len, BUF_SIZE - len,
+						"vlan-id = %d ",
+						be16_to_cpu(cur->eth.vlan_id));
+			break;
+
+		case MLX4_NET_TRANS_RULE_ID_IPV4:
+			if (cur->ipv4.src_ip)
+				len += snprintf(buf + len, BUF_SIZE - len,
+						"src-ip = %pI4 ",
+						&cur->ipv4.src_ip);
+			if (cur->ipv4.dst_ip)
+				len += snprintf(buf + len, BUF_SIZE - len,
+						"dst-ip = %pI4 ",
+						&cur->ipv4.dst_ip);
+			break;
+
+		case MLX4_NET_TRANS_RULE_ID_TCP:
+		case MLX4_NET_TRANS_RULE_ID_UDP:
+			if (cur->tcp_udp.src_port)
+				len += snprintf(buf + len, BUF_SIZE - len,
+						"src-port = %d ",
+						be16_to_cpu(cur->tcp_udp.src_port));
+			if (cur->tcp_udp.dst_port)
+				len += snprintf(buf + len, BUF_SIZE - len,
+						"dst-port = %d ",
+						be16_to_cpu(cur->tcp_udp.dst_port));
+			break;
+
+		case MLX4_NET_TRANS_RULE_ID_IB:
+			len += snprintf(buf + len, BUF_SIZE - len,
+					"dst-gid = %pI6\n", cur->ib.dst_gid);
+			len += snprintf(buf + len, BUF_SIZE - len,
+					"dst-gid-mask = %pI6\n",
+					cur->ib.dst_gid_msk);
+			break;
+
+		case MLX4_NET_TRANS_RULE_ID_IPV6:
+			break;
+
+		default:
+			break;
+		}
+	}
+	len += snprintf(buf + len, BUF_SIZE - len, "\n");
+	mlx4_err(dev, "%s", buf);
+
+	if (len >= BUF_SIZE)
+		mlx4_err(dev, "Network rule error message was truncated, print buffer is too small.\n");
+}
+
+int mlx4_flow_attach(struct mlx4_dev *dev,
+		     struct mlx4_net_trans_rule *rule, u64 *reg_id)
+{
+	struct mlx4_cmd_mailbox *mailbox;
+	struct mlx4_spec_list *cur;
+	u32 size = 0;
+	int ret;
+
+	mailbox = mlx4_alloc_cmd_mailbox(dev);
+	if (IS_ERR(mailbox))
+		return PTR_ERR(mailbox);
+
+	memset(mailbox->buf, 0, sizeof(struct mlx4_net_trans_rule_hw_ctrl));
+	trans_rule_ctrl_to_hw(rule, mailbox->buf);
+
+	size += sizeof(struct mlx4_net_trans_rule_hw_ctrl);
+
+	list_for_each_entry(cur, &rule->list, list) {
+		ret = parse_trans_rule(dev, cur, mailbox->buf + size);
+		if (ret < 0) {
+			mlx4_free_cmd_mailbox(dev, mailbox);
+			return -EINVAL;
+		}
+		size += ret;
+	}
+
+	ret = mlx4_QP_FLOW_STEERING_ATTACH(dev, mailbox, size >> 2, reg_id);
+	if (ret == -ENOMEM)
+		mlx4_err_rule(dev,
+			      "mcg table is full. Fail to register network rule.\n",
+			      rule);
+	else if (ret)
+		mlx4_err_rule(dev, "Fail to register network rule.\n", rule);
+
+	mlx4_free_cmd_mailbox(dev, mailbox);
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(mlx4_flow_attach);
+
+int mlx4_flow_detach(struct mlx4_dev *dev, u64 reg_id)
+{
+	int err;
+
+	err = mlx4_QP_FLOW_STEERING_DETACH(dev, reg_id);
+	if (err)
+		mlx4_err(dev, "Fail to detach network rule. registration id = 0x%llx\n"
+								, reg_id);
+	return err;
+}
+EXPORT_SYMBOL_GPL(mlx4_flow_detach);
+
 int mlx4_qp_attach_common(struct mlx4_dev *dev, struct mlx4_qp *qp, u8 gid[16],
 			  int block_mcast_loopback, enum mlx4_protocol prot,
 			  enum mlx4_steer_type steer)
@@ -895,7 +1206,8 @@ static int mlx4_QP_ATTACH(struct mlx4_dev *dev, struct mlx4_qp *qp,
 }
 
 int mlx4_multicast_attach(struct mlx4_dev *dev, struct mlx4_qp *qp, u8 gid[16],
-			  int block_mcast_loopback, enum mlx4_protocol prot)
+			  u8 port, int block_mcast_loopback,
+			  enum mlx4_protocol prot, u64 *reg_id)
 {
 
 	switch (dev->caps.steering_mode) {
@@ -914,6 +1226,42 @@ int mlx4_multicast_attach(struct mlx4_dev *dev, struct mlx4_qp *qp, u8 gid[16],
 					     block_mcast_loopback, prot,
 					     MLX4_MC_STEER);
 
+	case MLX4_STEERING_MODE_DEVICE_MANAGED: {
+		struct mlx4_spec_list spec = { {NULL} };
+		__be64 mac_mask = cpu_to_be64(MLX4_MAC_MASK << 16);
+
+		struct mlx4_net_trans_rule rule = {
+			.queue_mode = MLX4_NET_TRANS_Q_FIFO,
+			.exclusive = 0,
+			.promisc_mode = MLX4_FS_PROMISC_NONE,
+			.priority = MLX4_DOMAIN_NIC,
+		};
+
+		rule.allow_loopback = ~block_mcast_loopback;
+		rule.port = port;
+		rule.qpn = qp->qpn;
+		INIT_LIST_HEAD(&rule.list);
+
+		switch (prot) {
+		case MLX4_PROT_ETH:
+			spec.id = MLX4_NET_TRANS_RULE_ID_ETH;
+			memcpy(spec.eth.dst_mac, &gid[10], ETH_ALEN);
+			memcpy(spec.eth.dst_mac_msk, &mac_mask, ETH_ALEN);
+			break;
+
+		case MLX4_PROT_IB_IPV6:
+			spec.id = MLX4_NET_TRANS_RULE_ID_IB;
+			memcpy(spec.ib.dst_gid, gid, 16);
+			memset(&spec.ib.dst_gid_msk, 0xff, 16);
+			break;
+		default:
+			return -EINVAL;
+		}
+		list_add_tail(&spec.list, &rule.list);
+
+		return  mlx4_flow_attach(dev, &rule, reg_id);
+	}
+
 	default:
 		return -EINVAL;
 	}
@@ -921,7 +1269,7 @@ int mlx4_multicast_attach(struct mlx4_dev *dev, struct mlx4_qp *qp, u8 gid[16],
 EXPORT_SYMBOL_GPL(mlx4_multicast_attach);
 
 int mlx4_multicast_detach(struct mlx4_dev *dev, struct mlx4_qp *qp, u8 gid[16],
-			  enum mlx4_protocol prot)
+			  enum mlx4_protocol prot, u64 reg_id)
 {
 	switch (dev->caps.steering_mode) {
 	case MLX4_STEERING_MODE_A0:
@@ -938,6 +1286,9 @@ int mlx4_multicast_detach(struct mlx4_dev *dev, struct mlx4_qp *qp, u8 gid[16],
 		return mlx4_qp_detach_common(dev, qp, gid, prot,
 					     MLX4_MC_STEER);
 
+	case MLX4_STEERING_MODE_DEVICE_MANAGED:
+		return mlx4_flow_detach(dev, reg_id);
+
 	default:
 		return -EINVAL;
 	}
@@ -1042,6 +1393,10 @@ int mlx4_init_mcg_table(struct mlx4_dev *dev)
 	struct mlx4_priv *priv = mlx4_priv(dev);
 	int err;
 
+	/* No need for mcg_table when fw managed the mcg table*/
+	if (dev->caps.steering_mode ==
+	    MLX4_STEERING_MODE_DEVICE_MANAGED)
+		return 0;
 	err = mlx4_bitmap_init(&priv->mcg_table.bitmap, dev->caps.num_amgms,
 			       dev->caps.num_amgms - 1, 0, 0);
 	if (err)
@@ -1054,5 +1409,7 @@ int mlx4_init_mcg_table(struct mlx4_dev *dev)
 
 void mlx4_cleanup_mcg_table(struct mlx4_dev *dev)
 {
-	mlx4_bitmap_cleanup(&mlx4_priv(dev)->mcg_table.bitmap);
+	if (dev->caps.steering_mode !=
+	    MLX4_STEERING_MODE_DEVICE_MANAGED)
+		mlx4_bitmap_cleanup(&mlx4_priv(dev)->mcg_table.bitmap);
 }
diff --git a/drivers/net/ethernet/mellanox/mlx4/mlx4.h b/drivers/net/ethernet/mellanox/mlx4/mlx4.h
index c07e882..0084967 100644
--- a/drivers/net/ethernet/mellanox/mlx4/mlx4.h
+++ b/drivers/net/ethernet/mellanox/mlx4/mlx4.h
@@ -54,6 +54,17 @@
 #define DRV_VERSION	"1.1"
 #define DRV_RELDATE	"Dec, 2011"
 
+#define MLX4_FS_UDP_UC_EN		(1 << 1)
+#define MLX4_FS_TCP_UC_EN		(1 << 2)
+#define MLX4_FS_NUM_OF_L2_ADDR		8
+#define MLX4_FS_MGM_LOG_ENTRY_SIZE	7
+#define MLX4_FS_NUM_MCG			(1 << 17)
+
+enum {
+	MLX4_FS_L2_HASH = 0,
+	MLX4_FS_L2_L3_L4_HASH,
+};
+
 #define MLX4_NUM_UP		8
 #define MLX4_NUM_TC		8
 #define MLX4_RATELIMIT_UNITS 3 /* 100 Mbps */
@@ -704,6 +715,7 @@ struct mlx4_set_port_rqp_calc_context {
 
 struct mlx4_mac_entry {
 	u64 mac;
+	u64 reg_id;
 };
 
 struct mlx4_port_info {
@@ -777,6 +789,7 @@ struct mlx4_priv {
 	struct mutex		bf_mutex;
 	struct io_mapping	*bf_mapping;
 	int			reserved_mtts;
+	int			fs_hash_mode;
 };
 
 static inline struct mlx4_priv *mlx4_priv(struct mlx4_dev *dev)
diff --git a/drivers/net/ethernet/mellanox/mlx4/mlx4_en.h b/drivers/net/ethernet/mellanox/mlx4/mlx4_en.h
index 1bb00cd..2d6dabe 100644
--- a/drivers/net/ethernet/mellanox/mlx4/mlx4_en.h
+++ b/drivers/net/ethernet/mellanox/mlx4/mlx4_en.h
@@ -414,6 +414,7 @@ struct mlx4_en_mc_list {
 	struct list_head	list;
 	enum mlx4_en_mclist_act	action;
 	u8			addr[ETH_ALEN];
+	u64			reg_id;
 };
 
 struct mlx4_en_frag_info {
@@ -503,6 +504,7 @@ struct mlx4_en_priv {
 	u64 stats_bitmap;
 	struct list_head mc_list;
 	struct list_head curr_list;
+	u64 broadcast_id;
 	struct mlx4_en_stat_out_mbox hw_stats;
 	int vids[128];
 	bool wol;
diff --git a/drivers/net/ethernet/mellanox/mlx4/port.c b/drivers/net/ethernet/mellanox/mlx4/port.c
index 58de723..a51d1b9 100644
--- a/drivers/net/ethernet/mellanox/mlx4/port.c
+++ b/drivers/net/ethernet/mellanox/mlx4/port.c
@@ -75,21 +75,54 @@ void mlx4_init_vlan_table(struct mlx4_dev *dev, struct mlx4_vlan_table *table)
 	table->total = 0;
 }
 
-static int mlx4_uc_steer_add(struct mlx4_dev *dev, u8 port, u64 mac, int *qpn)
+static int mlx4_uc_steer_add(struct mlx4_dev *dev, u8 port,
+			     u64 mac, int *qpn, u64 *reg_id)
 {
-	struct mlx4_qp qp;
-	u8 gid[16] = {0};
 	__be64 be_mac;
 	int err;
 
-	qp.qpn = *qpn;
-
-	mac &= 0xffffffffffffULL;
+	mac &= MLX4_MAC_MASK;
 	be_mac = cpu_to_be64(mac << 16);
-	memcpy(&gid[10], &be_mac, ETH_ALEN);
-	gid[5] = port;
 
-	err = mlx4_unicast_attach(dev, &qp, gid, 0, MLX4_PROT_ETH);
+	switch (dev->caps.steering_mode) {
+	case MLX4_STEERING_MODE_B0: {
+		struct mlx4_qp qp;
+		u8 gid[16] = {0};
+
+		qp.qpn = *qpn;
+		memcpy(&gid[10], &be_mac, ETH_ALEN);
+		gid[5] = port;
+
+		err = mlx4_unicast_attach(dev, &qp, gid, 0, MLX4_PROT_ETH);
+		break;
+	}
+	case MLX4_STEERING_MODE_DEVICE_MANAGED: {
+		struct mlx4_spec_list spec_eth = { {NULL} };
+		__be64 mac_mask = cpu_to_be64(MLX4_MAC_MASK << 16);
+
+		struct mlx4_net_trans_rule rule = {
+			.queue_mode = MLX4_NET_TRANS_Q_FIFO,
+			.exclusive = 0,
+			.allow_loopback = 1,
+			.promisc_mode = MLX4_FS_PROMISC_NONE,
+			.priority = MLX4_DOMAIN_NIC,
+		};
+
+		rule.port = port;
+		rule.qpn = *qpn;
+		INIT_LIST_HEAD(&rule.list);
+
+		spec_eth.id = MLX4_NET_TRANS_RULE_ID_ETH;
+		memcpy(spec_eth.eth.dst_mac, &be_mac, ETH_ALEN);
+		memcpy(spec_eth.eth.dst_mac_msk, &mac_mask, ETH_ALEN);
+		list_add_tail(&spec_eth.list, &rule.list);
+
+		err = mlx4_flow_attach(dev, &rule, reg_id);
+		break;
+	}
+	default:
+		return -EINVAL;
+	}
 	if (err)
 		mlx4_warn(dev, "Failed Attaching Unicast\n");
 
@@ -97,19 +130,30 @@ static int mlx4_uc_steer_add(struct mlx4_dev *dev, u8 port, u64 mac, int *qpn)
 }
 
 static void mlx4_uc_steer_release(struct mlx4_dev *dev, u8 port,
-				  u64 mac, int qpn)
+				  u64 mac, int qpn, u64 reg_id)
 {
-	struct mlx4_qp qp;
-	u8 gid[16] = {0};
-	__be64 be_mac;
+	switch (dev->caps.steering_mode) {
+	case MLX4_STEERING_MODE_B0: {
+		struct mlx4_qp qp;
+		u8 gid[16] = {0};
+		__be64 be_mac;
 
-	qp.qpn = qpn;
-	mac &= 0xffffffffffffULL;
-	be_mac = cpu_to_be64(mac << 16);
-	memcpy(&gid[10], &be_mac, ETH_ALEN);
-	gid[5] = port;
+		qp.qpn = qpn;
+		mac &= MLX4_MAC_MASK;
+		be_mac = cpu_to_be64(mac << 16);
+		memcpy(&gid[10], &be_mac, ETH_ALEN);
+		gid[5] = port;
 
-	mlx4_unicast_detach(dev, &qp, gid, MLX4_PROT_ETH);
+		mlx4_unicast_detach(dev, &qp, gid, MLX4_PROT_ETH);
+		break;
+	}
+	case MLX4_STEERING_MODE_DEVICE_MANAGED: {
+		mlx4_flow_detach(dev, reg_id);
+		break;
+	}
+	default:
+		mlx4_err(dev, "Invalid steering mode.\n");
+	}
 }
 
 static int validate_index(struct mlx4_dev *dev,
@@ -144,6 +188,7 @@ int mlx4_get_eth_qp(struct mlx4_dev *dev, u8 port, u64 mac, int *qpn)
 	struct mlx4_mac_entry *entry;
 	int index = 0;
 	int err = 0;
+	u64 reg_id;
 
 	mlx4_dbg(dev, "Registering MAC: 0x%llx for adding\n",
 			(unsigned long long) mac);
@@ -167,7 +212,7 @@ int mlx4_get_eth_qp(struct mlx4_dev *dev, u8 port, u64 mac, int *qpn)
 		goto qp_err;
 	}
 
-	err = mlx4_uc_steer_add(dev, port, mac, qpn);
+	err = mlx4_uc_steer_add(dev, port, mac, qpn, &reg_id);
 	if (err)
 		goto steer_err;
 
@@ -177,6 +222,7 @@ int mlx4_get_eth_qp(struct mlx4_dev *dev, u8 port, u64 mac, int *qpn)
 		goto alloc_err;
 	}
 	entry->mac = mac;
+	entry->reg_id = reg_id;
 	err = radix_tree_insert(&info->mac_tree, *qpn, entry);
 	if (err)
 		goto insert_err;
@@ -186,7 +232,7 @@ insert_err:
 	kfree(entry);
 
 alloc_err:
-	mlx4_uc_steer_release(dev, port, mac, *qpn);
+	mlx4_uc_steer_release(dev, port, mac, *qpn, reg_id);
 
 steer_err:
 	mlx4_qp_release_range(dev, *qpn, 1);
@@ -212,7 +258,8 @@ void mlx4_put_eth_qp(struct mlx4_dev *dev, u8 port, u64 mac, int qpn)
 			mlx4_dbg(dev, "Releasing qp: port %d, mac 0x%llx,"
 				 " qpn %d\n", port,
 				 (unsigned long long) mac, qpn);
-			mlx4_uc_steer_release(dev, port, entry->mac, qpn);
+			mlx4_uc_steer_release(dev, port, entry->mac,
+					      qpn, entry->reg_id);
 			mlx4_qp_release_range(dev, qpn, 1);
 			radix_tree_delete(&info->mac_tree, qpn);
 			kfree(entry);
@@ -363,11 +410,14 @@ int mlx4_replace_mac(struct mlx4_dev *dev, u8 port, int qpn, u64 new_mac)
 		entry = radix_tree_lookup(&info->mac_tree, qpn);
 		if (!entry)
 			return -EINVAL;
-		mlx4_uc_steer_release(dev, port, entry->mac, qpn);
+		mlx4_uc_steer_release(dev, port, entry->mac,
+				      qpn, entry->reg_id);
 		mlx4_unregister_mac(dev, port, entry->mac);
 		entry->mac = new_mac;
+		entry->reg_id = 0;
 		mlx4_register_mac(dev, port, new_mac);
-		err = mlx4_uc_steer_add(dev, port, entry->mac, &qpn);
+		err = mlx4_uc_steer_add(dev, port, entry->mac,
+					&qpn, &entry->reg_id);
 		return err;
 	}
 
diff --git a/drivers/net/ethernet/mellanox/mlx4/profile.c b/drivers/net/ethernet/mellanox/mlx4/profile.c
index b83bc92..9ee4725 100644
--- a/drivers/net/ethernet/mellanox/mlx4/profile.c
+++ b/drivers/net/ethernet/mellanox/mlx4/profile.c
@@ -237,13 +237,19 @@ u64 mlx4_make_profile(struct mlx4_dev *dev,
 			init_hca->mtt_base	 = profile[i].start;
 			break;
 		case MLX4_RES_MCG:
-			dev->caps.num_mgms	  = profile[i].num >> 1;
-			dev->caps.num_amgms	  = profile[i].num >> 1;
 			init_hca->mc_base	  = profile[i].start;
 			init_hca->log_mc_entry_sz =
 					ilog2(mlx4_get_mgm_entry_size(dev));
 			init_hca->log_mc_table_sz = profile[i].log_num;
-			init_hca->log_mc_hash_sz  = profile[i].log_num - 1;
+			if (dev->caps.steering_mode ==
+			    MLX4_STEERING_MODE_DEVICE_MANAGED) {
+				dev->caps.num_mgms = profile[i].num;
+			} else {
+				init_hca->log_mc_hash_sz =
+						profile[i].log_num - 1;
+				dev->caps.num_mgms = profile[i].num >> 1;
+				dev->caps.num_amgms = profile[i].num >> 1;
+			}
 			break;
 		default:
 			break;
diff --git a/drivers/net/ethernet/mellanox/mlx4/resource_tracker.c b/drivers/net/ethernet/mellanox/mlx4/resource_tracker.c
index 4301aa5..ce73772 100644
--- a/drivers/net/ethernet/mellanox/mlx4/resource_tracker.c
+++ b/drivers/net/ethernet/mellanox/mlx4/resource_tracker.c
@@ -2746,6 +2746,9 @@ int mlx4_QP_FLOW_STEERING_ATTACH_wrapper(struct mlx4_dev *dev, int slave,
 					 struct mlx4_cmd_mailbox *outbox,
 					 struct mlx4_cmd_info *cmd)
 {
+	if (dev->caps.steering_mode !=
+	    MLX4_STEERING_MODE_DEVICE_MANAGED)
+		return -EOPNOTSUPP;
 	return mlx4_cmd_imm(dev, inbox->dma, &vhcr->out_param,
 			    vhcr->in_modifier, 0,
 			    MLX4_QP_FLOW_STEERING_ATTACH,
@@ -2759,6 +2762,9 @@ int mlx4_QP_FLOW_STEERING_DETACH_wrapper(struct mlx4_dev *dev, int slave,
 					 struct mlx4_cmd_mailbox *outbox,
 					 struct mlx4_cmd_info *cmd)
 {
+	if (dev->caps.steering_mode !=
+	    MLX4_STEERING_MODE_DEVICE_MANAGED)
+		return -EOPNOTSUPP;
 	return mlx4_cmd(dev, vhcr->in_param, 0, 0,
 			MLX4_QP_FLOW_STEERING_DETACH, MLX4_CMD_TIME_CLASS_A,
 			MLX4_CMD_NATIVE);
diff --git a/include/linux/mlx4/device.h b/include/linux/mlx4/device.h
index 80891a4..a22a851 100644
--- a/include/linux/mlx4/device.h
+++ b/include/linux/mlx4/device.h
@@ -70,14 +70,17 @@ enum {
 	MLX4_MFUNC_EQE_MASK     = (MLX4_MFUNC_MAX_EQES - 1)
 };
 
-/* Driver supports 2 diffrent device methods to manage traffic steering:
+/* Driver supports 3 diffrent device methods to manage traffic steering:
+	-device managed - High level API for ib and eth flow steering. FW is
+			  managing flow steering tables.
 	- B0 steering mode - Common low level API for ib and (if supported) eth.
 	- A0 steering mode - Limited low level API for eth. In case of IB,
 			     B0 mode is in use.
  */
 enum {
 	MLX4_STEERING_MODE_A0,
-	MLX4_STEERING_MODE_B0
+	MLX4_STEERING_MODE_B0,
+	MLX4_STEERING_MODE_DEVICE_MANAGED
 };
 
 static inline const char *mlx4_steering_mode_str(int steering_mode)
@@ -88,6 +91,10 @@ static inline const char *mlx4_steering_mode_str(int steering_mode)
 
 	case MLX4_STEERING_MODE_B0:
 		return "B0 steering";
+
+	case MLX4_STEERING_MODE_DEVICE_MANAGED:
+		return "Device managed flow steering";
+
 	default:
 		return "Unrecognize steering mode";
 	}
@@ -125,7 +132,8 @@ enum {
 enum {
 	MLX4_DEV_CAP_FLAG2_RSS			= 1LL <<  0,
 	MLX4_DEV_CAP_FLAG2_RSS_TOP		= 1LL <<  1,
-	MLX4_DEV_CAP_FLAG2_RSS_XOR		= 1LL <<  2
+	MLX4_DEV_CAP_FLAG2_RSS_XOR		= 1LL <<  2,
+	MLX4_DEV_CAP_FLAG2_FS_EN		= 1LL <<  3
 };
 
 #define MLX4_ATTR_EXTENDED_PORT_INFO	cpu_to_be16(0xff90)
@@ -319,6 +327,7 @@ struct mlx4_caps {
 	int			reserved_mcgs;
 	int			num_qp_per_mgm;
 	int			steering_mode;
+	int			fs_log_max_ucast_qp_range_size;
 	int			num_pds;
 	int			reserved_pds;
 	int			max_xrcds;
@@ -647,9 +656,94 @@ int mlx4_unicast_attach(struct mlx4_dev *dev, struct mlx4_qp *qp, u8 gid[16],
 int mlx4_unicast_detach(struct mlx4_dev *dev, struct mlx4_qp *qp, u8 gid[16],
 			enum mlx4_protocol prot);
 int mlx4_multicast_attach(struct mlx4_dev *dev, struct mlx4_qp *qp, u8 gid[16],
-			  int block_mcast_loopback, enum mlx4_protocol protocol);
+			  u8 port, int block_mcast_loopback,
+			  enum mlx4_protocol protocol, u64 *reg_id);
 int mlx4_multicast_detach(struct mlx4_dev *dev, struct mlx4_qp *qp, u8 gid[16],
-			  enum mlx4_protocol protocol);
+			  enum mlx4_protocol protocol, u64 reg_id);
+
+enum {
+	MLX4_DOMAIN_UVERBS	= 0x1000,
+	MLX4_DOMAIN_ETHTOOL     = 0x2000,
+	MLX4_DOMAIN_RFS         = 0x3000,
+	MLX4_DOMAIN_NIC    = 0x5000,
+};
+
+enum mlx4_net_trans_rule_id {
+	MLX4_NET_TRANS_RULE_ID_ETH = 0,
+	MLX4_NET_TRANS_RULE_ID_IB,
+	MLX4_NET_TRANS_RULE_ID_IPV6,
+	MLX4_NET_TRANS_RULE_ID_IPV4,
+	MLX4_NET_TRANS_RULE_ID_TCP,
+	MLX4_NET_TRANS_RULE_ID_UDP,
+	MLX4_NET_TRANS_RULE_NUM, /* should be last */
+};
+
+enum mlx4_net_trans_promisc_mode {
+	MLX4_FS_PROMISC_NONE = 0,
+	MLX4_FS_PROMISC_UPLINK,
+	MLX4_FS_PROMISC_FUNCTION_PORT,
+	MLX4_FS_PROMISC_ALL_MULTI,
+};
+
+struct mlx4_spec_eth {
+	u8	dst_mac[6];
+	u8	dst_mac_msk[6];
+	u8	src_mac[6];
+	u8	src_mac_msk[6];
+	u8	ether_type_enable;
+	__be16	ether_type;
+	__be16	vlan_id_msk;
+	__be16	vlan_id;
+};
+
+struct mlx4_spec_tcp_udp {
+	__be16 dst_port;
+	__be16 dst_port_msk;
+	__be16 src_port;
+	__be16 src_port_msk;
+};
+
+struct mlx4_spec_ipv4 {
+	__be32 dst_ip;
+	__be32 dst_ip_msk;
+	__be32 src_ip;
+	__be32 src_ip_msk;
+};
+
+struct mlx4_spec_ib {
+	__be32	r_qpn;
+	__be32	qpn_msk;
+	u8	dst_gid[16];
+	u8	dst_gid_msk[16];
+};
+
+struct mlx4_spec_list {
+	struct	list_head list;
+	enum	mlx4_net_trans_rule_id id;
+	union {
+		struct mlx4_spec_eth eth;
+		struct mlx4_spec_ib ib;
+		struct mlx4_spec_ipv4 ipv4;
+		struct mlx4_spec_tcp_udp tcp_udp;
+	};
+};
+
+enum mlx4_net_trans_hw_rule_queue {
+	MLX4_NET_TRANS_Q_FIFO,
+	MLX4_NET_TRANS_Q_LIFO,
+};
+
+struct mlx4_net_trans_rule {
+	struct	list_head list;
+	enum	mlx4_net_trans_hw_rule_queue queue_mode;
+	bool	exclusive;
+	bool	allow_loopback;
+	enum	mlx4_net_trans_promisc_mode promisc_mode;
+	u8	port;
+	u16	priority;
+	u32	qpn;
+};
+
 int mlx4_multicast_promisc_add(struct mlx4_dev *dev, u32 qpn, u8 port);
 int mlx4_multicast_promisc_remove(struct mlx4_dev *dev, u32 qpn, u8 port);
 int mlx4_unicast_promisc_add(struct mlx4_dev *dev, u32 qpn, u8 port);
@@ -692,4 +786,8 @@ int mlx4_wol_write(struct mlx4_dev *dev, u64 config, int port);
 int mlx4_counter_alloc(struct mlx4_dev *dev, u32 *idx);
 void mlx4_counter_free(struct mlx4_dev *dev, u32 idx);
 
+int mlx4_flow_attach(struct mlx4_dev *dev,
+		     struct mlx4_net_trans_rule *rule, u64 *reg_id);
+int mlx4_flow_detach(struct mlx4_dev *dev, u64 reg_id);
+
 #endif /* MLX4_DEVICE_H */
-- 
1.7.1

^ permalink raw reply related

* [PATCH net-next 09/10] net/mlx4_en: Manage flow steering rules with ethtool
From: Or Gerlitz @ 2012-07-01  9:43 UTC (permalink / raw)
  To: davem; +Cc: roland, yevgenyp, oren, netdev, Hadar Hen Zion, Or Gerlitz
In-Reply-To: <1341135823-29039-1-git-send-email-ogerlitz@mellanox.com>

From: Hadar Hen Zion <hadarh@mellanox.co.il>

Implement the ethtool APIs for attaching L2/L3/L4 based flow steering
rules to the netdevice RX rings. Added set_rxnfc callback and enhanced
the existing get_rxnfc callback.

Signed-off-by: Hadar Hen Zion <hadarh@mellanox.co.il>
Signed-off-by: Or Gerlitz <ogerlitz@mellanox.com>
---
 drivers/net/ethernet/mellanox/mlx4/en_ethtool.c |  373 +++++++++++++++++++++++
 drivers/net/ethernet/mellanox/mlx4/mlx4_en.h    |    7 +
 2 files changed, 380 insertions(+), 0 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx4/en_ethtool.c b/drivers/net/ethernet/mellanox/mlx4/en_ethtool.c
index 72901ce..30de264 100644
--- a/drivers/net/ethernet/mellanox/mlx4/en_ethtool.c
+++ b/drivers/net/ethernet/mellanox/mlx4/en_ethtool.c
@@ -38,6 +38,10 @@
 #include "mlx4_en.h"
 #include "en_port.h"
 
+#define EN_ETHTOOL_QP_ATTACH (1ull << 63)
+#define EN_ETHTOOL_MAC_MASK 0xffffffffffffULL
+#define EN_ETHTOOL_SHORT_MASK cpu_to_be16(0xffff)
+#define EN_ETHTOOL_WORD_MASK  cpu_to_be32(0xffffffff)
 
 static void
 mlx4_en_get_drvinfo(struct net_device *dev, struct ethtool_drvinfo *drvinfo)
@@ -599,16 +603,360 @@ static int mlx4_en_set_rxfh_indir(struct net_device *dev,
 	return err;
 }
 
+#define not_all_zeros_or_all_ones(field, type) \
+	(field && (type)~field)
+
+static int mlx4_en_validate_flow(struct net_device *dev,
+				 struct ethtool_rxnfc *cmd)
+{
+	struct ethtool_usrip4_spec *l3_mask;
+	struct ethtool_tcpip4_spec *l4_mask;
+	struct ethhdr *eth_mask;
+	u64 full_mac = ~0ull;
+	u64 zero_mac = 0;
+
+	if (cmd->fs.location >= MAX_NUM_OF_FS_RULES)
+		return -EINVAL;
+
+	switch (cmd->fs.flow_type & ~FLOW_EXT) {
+	case TCP_V4_FLOW:
+	case UDP_V4_FLOW:
+		if (cmd->fs.h_u.tcp_ip4_spec.tos)
+			return -EOPNOTSUPP;
+		l4_mask = &cmd->fs.m_u.tcp_ip4_spec;
+		/* don't allow mask which isn't all 0 or 1 */
+		if (not_all_zeros_or_all_ones(l4_mask->ip4src, __be32) ||
+		    not_all_zeros_or_all_ones(l4_mask->ip4dst, __be32) ||
+		    not_all_zeros_or_all_ones(l4_mask->psrc, __be16) ||
+		    not_all_zeros_or_all_ones(l4_mask->pdst, __be16))
+			return -EOPNOTSUPP;
+		break;
+	case IP_USER_FLOW:
+		l3_mask = &cmd->fs.m_u.usr_ip4_spec;
+		if (cmd->fs.h_u.usr_ip4_spec.l4_4_bytes ||
+		    cmd->fs.h_u.usr_ip4_spec.tos ||
+		    cmd->fs.h_u.usr_ip4_spec.proto ||
+		    cmd->fs.h_u.usr_ip4_spec.ip_ver != ETH_RX_NFC_IP4 ||
+		    (!cmd->fs.h_u.usr_ip4_spec.ip4src &&
+		     !cmd->fs.h_u.usr_ip4_spec.ip4dst) ||
+		    not_all_zeros_or_all_ones(l3_mask->ip4src, __be32) ||
+		    not_all_zeros_or_all_ones(l3_mask->ip4dst, __be32))
+			return -EOPNOTSUPP;
+		break;
+	case ETHER_FLOW:
+		eth_mask = &cmd->fs.m_u.ether_spec;
+		if (memcmp(eth_mask->h_source, &zero_mac, ETH_ALEN))
+			return -EOPNOTSUPP;
+		if (!memcmp(eth_mask->h_dest, &zero_mac, ETH_ALEN))
+			return -EOPNOTSUPP;
+		if (not_all_zeros_or_all_ones(eth_mask->h_proto, __be16) ||
+		    (memcmp(eth_mask->h_dest, &zero_mac, ETH_ALEN) &&
+		     memcmp(eth_mask->h_dest, &full_mac, ETH_ALEN)))
+			return -EOPNOTSUPP;
+		break;
+	default:
+		return -EOPNOTSUPP;
+	}
+
+	if ((cmd->fs.flow_type & FLOW_EXT)) {
+		if (cmd->fs.m_ext.vlan_etype ||
+		    not_all_zeros_or_all_ones(cmd->fs.m_ext.vlan_tci,
+					       __be16)) {
+			return -EOPNOTSUPP;
+		}
+	}
+
+	return 0;
+}
+
+static void add_ip_rule(struct mlx4_en_priv *priv,
+			struct ethtool_rxnfc *cmd,
+			struct list_head *list_h)
+{
+	struct mlx4_spec_list *spec_l3;
+
+	spec_l3 = kzalloc(sizeof *spec_l3, GFP_KERNEL);
+	if (!spec_l3) {
+		en_err(priv, "Fail to alloc ethtool rule.\n");
+		return;
+	}
+
+	spec_l3->id = MLX4_NET_TRANS_RULE_ID_IPV4;
+	spec_l3->ipv4.src_ip = cmd->fs.h_u.usr_ip4_spec.ip4src;
+	if (spec_l3->ipv4.src_ip)
+		spec_l3->ipv4.src_ip_msk = EN_ETHTOOL_WORD_MASK;
+	spec_l3->ipv4.dst_ip = cmd->fs.h_u.usr_ip4_spec.ip4dst;
+	if (spec_l3->ipv4.dst_ip)
+		spec_l3->ipv4.dst_ip_msk = EN_ETHTOOL_WORD_MASK;
+	list_add_tail(&spec_l3->list, list_h);
+}
+
+static void add_tcp_udp_rule(struct mlx4_en_priv *priv,
+			     struct ethtool_rxnfc *cmd,
+			     struct list_head *list_h, int proto)
+{
+	struct mlx4_spec_list *spec_l3;
+	struct mlx4_spec_list *spec_l4;
+
+	spec_l3 = kzalloc(sizeof *spec_l3, GFP_KERNEL);
+	spec_l4 = kzalloc(sizeof *spec_l4, GFP_KERNEL);
+	if (!spec_l4 || !spec_l3) {
+		en_err(priv, "Fail to alloc ethtool rule.\n");
+		return;
+	}
+
+	spec_l3->id = MLX4_NET_TRANS_RULE_ID_IPV4;
+
+	if (proto == TCP_V4_FLOW) {
+		spec_l4->id = MLX4_NET_TRANS_RULE_ID_TCP;
+		spec_l3->ipv4.src_ip = cmd->fs.h_u.tcp_ip4_spec.ip4src;
+		spec_l3->ipv4.dst_ip = cmd->fs.h_u.tcp_ip4_spec.ip4dst;
+		spec_l4->tcp_udp.src_port = cmd->fs.h_u.tcp_ip4_spec.psrc;
+		spec_l4->tcp_udp.dst_port = cmd->fs.h_u.tcp_ip4_spec.pdst;
+	} else {
+		spec_l4->id = MLX4_NET_TRANS_RULE_ID_UDP;
+		spec_l3->ipv4.src_ip = cmd->fs.h_u.udp_ip4_spec.ip4src;
+		spec_l3->ipv4.dst_ip = cmd->fs.h_u.udp_ip4_spec.ip4dst;
+		spec_l4->tcp_udp.src_port = cmd->fs.h_u.udp_ip4_spec.psrc;
+		spec_l4->tcp_udp.dst_port = cmd->fs.h_u.udp_ip4_spec.pdst;
+	}
+
+	if (spec_l3->ipv4.src_ip)
+		spec_l3->ipv4.src_ip_msk = EN_ETHTOOL_WORD_MASK;
+	if (spec_l3->ipv4.dst_ip)
+		spec_l3->ipv4.dst_ip_msk = EN_ETHTOOL_WORD_MASK;
+
+	if (spec_l4->tcp_udp.src_port)
+		spec_l4->tcp_udp.src_port_msk = EN_ETHTOOL_SHORT_MASK;
+	if (spec_l4->tcp_udp.dst_port)
+		spec_l4->tcp_udp.dst_port_msk = EN_ETHTOOL_SHORT_MASK;
+
+	list_add_tail(&spec_l3->list, list_h);
+	list_add_tail(&spec_l4->list, list_h);
+}
+
+static int mlx4_en_ethtool_to_net_trans_rule(struct net_device *dev,
+					     struct ethtool_rxnfc *cmd,
+					     struct list_head *rule_list_h)
+{
+	int err;
+	u64 mac;
+	__be64 be_mac;
+	struct ethhdr *eth_spec;
+	struct mlx4_en_priv *priv = netdev_priv(dev);
+	struct mlx4_spec_list *spec_l2;
+	__be64 mac_msk = cpu_to_be64(EN_ETHTOOL_MAC_MASK << 16);
+
+	err = mlx4_en_validate_flow(dev, cmd);
+	if (err)
+		return err;
+
+	spec_l2 = kzalloc(sizeof *spec_l2, GFP_KERNEL);
+	if (!spec_l2)
+		return -ENOMEM;
+
+	mac = priv->mac & EN_ETHTOOL_MAC_MASK;
+	be_mac = cpu_to_be64(mac << 16);
+
+	spec_l2->id = MLX4_NET_TRANS_RULE_ID_ETH;
+	memcpy(spec_l2->eth.dst_mac_msk, &mac_msk, ETH_ALEN);
+	if ((cmd->fs.flow_type & ~FLOW_EXT) != ETHER_FLOW)
+		memcpy(spec_l2->eth.dst_mac, &be_mac, ETH_ALEN);
+
+	if ((cmd->fs.flow_type & FLOW_EXT) && cmd->fs.m_ext.vlan_tci) {
+		spec_l2->eth.vlan_id = cmd->fs.h_ext.vlan_tci;
+		spec_l2->eth.vlan_id_msk = cpu_to_be16(0xfff);
+	}
+
+	list_add_tail(&spec_l2->list, rule_list_h);
+
+	switch (cmd->fs.flow_type & ~FLOW_EXT) {
+	case ETHER_FLOW:
+		eth_spec = &cmd->fs.h_u.ether_spec;
+		memcpy(&spec_l2->eth.dst_mac, eth_spec->h_dest, ETH_ALEN);
+		spec_l2->eth.ether_type = eth_spec->h_proto;
+		if (eth_spec->h_proto)
+			spec_l2->eth.ether_type_enable = 1;
+		break;
+	case IP_USER_FLOW:
+		add_ip_rule(priv, cmd, rule_list_h);
+		break;
+	case TCP_V4_FLOW:
+		add_tcp_udp_rule(priv, cmd, rule_list_h, TCP_V4_FLOW);
+		break;
+	case UDP_V4_FLOW:
+		add_tcp_udp_rule(priv, cmd, rule_list_h, UDP_V4_FLOW);
+		break;
+	}
+	return 0;
+}
+
+static int mlx4_en_flow_replace(struct net_device *dev,
+				struct ethtool_rxnfc *cmd)
+{
+	int err;
+	struct mlx4_en_priv *priv = netdev_priv(dev);
+	struct ethtool_flow_id *loc_rule;
+	struct mlx4_spec_list *spec, *tmp_spec;
+	u32 qpn;
+	u64 reg_id;
+
+	struct mlx4_net_trans_rule rule = {
+		.queue_mode = MLX4_NET_TRANS_Q_FIFO,
+		.exclusive = 0,
+		.allow_loopback = 1,
+		.promisc_mode = MLX4_FS_PROMISC_NONE,
+	};
+
+	rule.port = priv->port;
+	rule.priority = MLX4_DOMAIN_ETHTOOL | cmd->fs.location;
+	INIT_LIST_HEAD(&rule.list);
+
+	/* Allow direct QP attaches if the EN_ETHTOOL_QP_ATTACH flag is set */
+	if (cmd->fs.ring_cookie == RX_CLS_FLOW_DISC)
+		return -EOPNOTSUPP;
+	else if (cmd->fs.ring_cookie & EN_ETHTOOL_QP_ATTACH) {
+		qpn = cmd->fs.ring_cookie & (EN_ETHTOOL_QP_ATTACH - 1);
+	} else {
+		if (cmd->fs.ring_cookie >= priv->rx_ring_num) {
+			en_warn(priv, "rxnfc: RX ring (%llu) doesn't exist.\n",
+				cmd->fs.ring_cookie);
+			return -EINVAL;
+		}
+		qpn = priv->rss_map.qps[cmd->fs.ring_cookie].qpn;
+		if (!qpn) {
+			en_warn(priv, "rxnfc: RX ring (%llu) is inactive.\n",
+				cmd->fs.ring_cookie);
+			return -EINVAL;
+		}
+	}
+	rule.qpn = qpn;
+	err = mlx4_en_ethtool_to_net_trans_rule(dev, cmd, &rule.list);
+	if (err)
+		goto out_free_list;
+
+	loc_rule = &priv->ethtool_rules[cmd->fs.location];
+	if (loc_rule->id) {
+		err = mlx4_flow_detach(priv->mdev->dev, loc_rule->id);
+		if (err) {
+			en_err(priv, "Fail to detach network rule at location %d. registration id = %llx\n"
+			       , cmd->fs.location, loc_rule->id);
+			goto out_free_list;
+		}
+		loc_rule->id = 0;
+		memset(&loc_rule->flow_spec, 0,
+		       sizeof(struct ethtool_rx_flow_spec));
+	}
+	err = mlx4_flow_attach(priv->mdev->dev, &rule, &reg_id);
+	if (err) {
+		en_err(priv, "Fail to attach network rule at location %d.\n"
+		       , cmd->fs.location);
+		goto out_free_list;
+	}
+	loc_rule->id = reg_id;
+	memcpy(&loc_rule->flow_spec, &cmd->fs,
+	       sizeof(struct ethtool_rx_flow_spec));
+
+out_free_list:
+	list_for_each_entry_safe(spec, tmp_spec, &rule.list, list) {
+		list_del(&spec->list);
+		kfree(spec);
+	}
+	return err;
+}
+
+static int mlx4_en_flow_detach(struct net_device *dev,
+			       struct ethtool_rxnfc *cmd)
+{
+	int err = 0;
+	struct ethtool_flow_id *rule;
+	struct mlx4_en_priv *priv = netdev_priv(dev);
+
+	if (cmd->fs.location >= MAX_NUM_OF_FS_RULES)
+		return -EINVAL;
+
+	rule = &priv->ethtool_rules[cmd->fs.location];
+	if (!rule->id) {
+		err =  -ENOENT;
+		goto out;
+	}
+
+	err = mlx4_flow_detach(priv->mdev->dev, rule->id);
+	if (err) {
+		en_err(priv, "Fail to detach network rule at location %d. registration id = 0x%llx\n"
+		       , cmd->fs.location, rule->id);
+		goto out;
+	}
+	rule->id = 0;
+	memset(&rule->flow_spec, 0, sizeof(struct ethtool_rx_flow_spec));
+out:
+	return err;
+
+}
+
+static int mlx4_en_get_flow(struct net_device *dev, struct ethtool_rxnfc *cmd,
+			    int loc)
+{
+	int err = 0;
+	struct ethtool_flow_id *rule;
+	struct mlx4_en_priv *priv = netdev_priv(dev);
+
+	if (loc < 0 || loc >= MAX_NUM_OF_FS_RULES)
+		return -EINVAL;
+
+	rule = &priv->ethtool_rules[loc];
+	if (rule->id)
+		memcpy(&cmd->fs, &rule->flow_spec,
+		       sizeof(struct ethtool_rx_flow_spec));
+	else
+		err = -ENOENT;
+
+	return err;
+}
+
+static int mlx4_en_get_num_flows(struct mlx4_en_priv *priv)
+{
+
+	int i, res = 0;
+	for (i = 0; i < MAX_NUM_OF_FS_RULES; i++) {
+		if (priv->ethtool_rules[i].id)
+			res++;
+	}
+	return res;
+
+}
+
 static int mlx4_en_get_rxnfc(struct net_device *dev, struct ethtool_rxnfc *cmd,
 			     u32 *rule_locs)
 {
 	struct mlx4_en_priv *priv = netdev_priv(dev);
+	struct mlx4_en_dev *mdev = priv->mdev;
 	int err = 0;
+	int i = 0, priority = 0;
+
+	if (mdev->dev->caps.steering_mode != MLX4_STEERING_MODE_DEVICE_MANAGED)
+		return -EOPNOTSUPP;
 
 	switch (cmd->cmd) {
 	case ETHTOOL_GRXRINGS:
 		cmd->data = priv->rx_ring_num;
 		break;
+	case ETHTOOL_GRXCLSRLCNT:
+		cmd->rule_cnt = mlx4_en_get_num_flows(priv);
+		break;
+	case ETHTOOL_GRXCLSRULE:
+		err = mlx4_en_get_flow(dev, cmd, cmd->fs.location);
+		break;
+	case ETHTOOL_GRXCLSRLALL:
+		while (!err || err == -ENOENT) {
+			err = mlx4_en_get_flow(dev, cmd, i);
+			if (!err)
+				((u32 *)(rule_locs))[priority++] = i;
+			i++;
+		}
+		if (priority)
+			err = 0;
+		break;
 	default:
 		err = -EOPNOTSUPP;
 		break;
@@ -617,6 +965,30 @@ static int mlx4_en_get_rxnfc(struct net_device *dev, struct ethtool_rxnfc *cmd,
 	return err;
 }
 
+static int mlx4_en_set_rxnfc(struct net_device *dev, struct ethtool_rxnfc *cmd)
+{
+	int err = 0;
+	struct mlx4_en_priv *priv = netdev_priv(dev);
+	struct mlx4_en_dev *mdev = priv->mdev;
+
+	if (mdev->dev->caps.steering_mode != MLX4_STEERING_MODE_DEVICE_MANAGED)
+		return -EOPNOTSUPP;
+
+	switch (cmd->cmd) {
+	case ETHTOOL_SRXCLSRLINS:
+		err = mlx4_en_flow_replace(dev, cmd);
+		break;
+	case ETHTOOL_SRXCLSRLDEL:
+		err = mlx4_en_flow_detach(dev, cmd);
+		break;
+	default:
+		en_warn(priv, "Unsupported ethtool command. (%d)\n", cmd->cmd);
+		return -EOPNOTSUPP;
+	}
+
+	return err;
+}
+
 const struct ethtool_ops mlx4_en_ethtool_ops = {
 	.get_drvinfo = mlx4_en_get_drvinfo,
 	.get_settings = mlx4_en_get_settings,
@@ -637,6 +1009,7 @@ const struct ethtool_ops mlx4_en_ethtool_ops = {
 	.get_ringparam = mlx4_en_get_ringparam,
 	.set_ringparam = mlx4_en_set_ringparam,
 	.get_rxnfc = mlx4_en_get_rxnfc,
+	.set_rxnfc = mlx4_en_set_rxnfc,
 	.get_rxfh_indir_size = mlx4_en_get_rxfh_indir_size,
 	.get_rxfh_indir = mlx4_en_get_rxfh_indir,
 	.set_rxfh_indir = mlx4_en_set_rxfh_indir,
diff --git a/drivers/net/ethernet/mellanox/mlx4/mlx4_en.h b/drivers/net/ethernet/mellanox/mlx4/mlx4_en.h
index 2d6dabe..8882e70 100644
--- a/drivers/net/ethernet/mellanox/mlx4/mlx4_en.h
+++ b/drivers/net/ethernet/mellanox/mlx4/mlx4_en.h
@@ -75,6 +75,7 @@
 #define STAMP_SHIFT		31
 #define STAMP_VAL		0x7fffffff
 #define STATS_DELAY		(HZ / 4)
+#define MAX_NUM_OF_FS_RULES	256
 
 /* Typical TSO descriptor with 16 gather entries is 352 bytes... */
 #define MAX_DESC_SIZE		512
@@ -435,6 +436,11 @@ struct mlx4_en_frag_info {
 
 #endif
 
+struct ethtool_flow_id {
+	struct ethtool_rx_flow_spec flow_spec;
+	u64 id;
+};
+
 struct mlx4_en_priv {
 	struct mlx4_en_dev *mdev;
 	struct mlx4_en_port_profile *prof;
@@ -444,6 +450,7 @@ struct mlx4_en_priv {
 	struct net_device_stats ret_stats;
 	struct mlx4_en_port_state port_state;
 	spinlock_t stats_lock;
+	struct ethtool_flow_id ethtool_rules[MAX_NUM_OF_FS_RULES];
 
 	unsigned long last_moder_packets[MAX_RX_RINGS];
 	unsigned long last_moder_tx_packets;
-- 
1.7.1

^ permalink raw reply related

* [PATCH net-next 10/10] net/mlx4_en: Add support for drop action through ethtool
From: Or Gerlitz @ 2012-07-01  9:43 UTC (permalink / raw)
  To: davem; +Cc: roland, yevgenyp, oren, netdev, Hadar Hen Zion, Or Gerlitz
In-Reply-To: <1341135823-29039-1-git-send-email-ogerlitz@mellanox.com>

From: Hadar Hen Zion <hadarh@mellanox.co.il>

The drop action is implemented by allocating a QP and keeping it in a reset state
such that the HW drops any packets which are steered to that QP. When a drop action
is requested, we attach the relevant flow to that QP.

Sign-off-by: Hadar Hen Zion <hadarh@mellanox.co.il>
Signed-off-by: Or Gerlitz <ogerlitz@mellanox.com>
---
 drivers/net/ethernet/mellanox/mlx4/en_ethtool.c |    2 +-
 drivers/net/ethernet/mellanox/mlx4/en_netdev.c  |    9 ++++++-
 drivers/net/ethernet/mellanox/mlx4/en_rx.c      |   30 +++++++++++++++++++++++
 drivers/net/ethernet/mellanox/mlx4/mlx4_en.h    |    3 ++
 4 files changed, 42 insertions(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx4/en_ethtool.c b/drivers/net/ethernet/mellanox/mlx4/en_ethtool.c
index 30de264..6c44047 100644
--- a/drivers/net/ethernet/mellanox/mlx4/en_ethtool.c
+++ b/drivers/net/ethernet/mellanox/mlx4/en_ethtool.c
@@ -814,7 +814,7 @@ static int mlx4_en_flow_replace(struct net_device *dev,
 
 	/* Allow direct QP attaches if the EN_ETHTOOL_QP_ATTACH flag is set */
 	if (cmd->fs.ring_cookie == RX_CLS_FLOW_DISC)
-		return -EOPNOTSUPP;
+		qpn = priv->drop_qp.qpn;
 	else if (cmd->fs.ring_cookie & EN_ETHTOOL_QP_ATTACH) {
 		qpn = cmd->fs.ring_cookie & (EN_ETHTOOL_QP_ATTACH - 1);
 	} else {
diff --git a/drivers/net/ethernet/mellanox/mlx4/en_netdev.c b/drivers/net/ethernet/mellanox/mlx4/en_netdev.c
index 46b58d8..ed121dc 100644
--- a/drivers/net/ethernet/mellanox/mlx4/en_netdev.c
+++ b/drivers/net/ethernet/mellanox/mlx4/en_netdev.c
@@ -793,6 +793,10 @@ int mlx4_en_start_port(struct net_device *dev)
 		goto mac_err;
 	}
 
+	err = mlx4_en_create_drop_qp(priv);
+	if (err)
+		goto rss_err;
+
 	/* Configure tx cq's and rings */
 	for (i = 0; i < priv->tx_ring_num; i++) {
 		/* Configure cq */
@@ -892,7 +896,8 @@ tx_err:
 		mlx4_en_deactivate_tx_ring(priv, &priv->tx_ring[tx_index]);
 		mlx4_en_deactivate_cq(priv, &priv->tx_cq[tx_index]);
 	}
-
+	mlx4_en_destroy_drop_qp(priv);
+rss_err:
 	mlx4_en_release_rss_steer(priv);
 mac_err:
 	mlx4_put_eth_qp(mdev->dev, priv->port, priv->mac, priv->base_qpn);
@@ -947,6 +952,8 @@ void mlx4_en_stop_port(struct net_device *dev)
 	/* Flush multicast filter */
 	mlx4_SET_MCAST_FLTR(mdev->dev, priv->port, 0, 1, MLX4_MCAST_CONFIG);
 
+	mlx4_en_destroy_drop_qp(priv);
+
 	/* Free TX Rings */
 	for (i = 0; i < priv->tx_ring_num; i++) {
 		mlx4_en_deactivate_tx_ring(priv, &priv->tx_ring[i]);
diff --git a/drivers/net/ethernet/mellanox/mlx4/en_rx.c b/drivers/net/ethernet/mellanox/mlx4/en_rx.c
index d49a7ac..a04cbf7 100644
--- a/drivers/net/ethernet/mellanox/mlx4/en_rx.c
+++ b/drivers/net/ethernet/mellanox/mlx4/en_rx.c
@@ -844,6 +844,36 @@ out:
 	return err;
 }
 
+int mlx4_en_create_drop_qp(struct mlx4_en_priv *priv)
+{
+	int err;
+	u32 qpn;
+
+	err = mlx4_qp_reserve_range(priv->mdev->dev, 1, 1, &qpn);
+	if (err) {
+		en_err(priv, "Failed reserving drop qpn\n");
+		return err;
+	}
+	err = mlx4_qp_alloc(priv->mdev->dev, qpn, &priv->drop_qp);
+	if (err) {
+		en_err(priv, "Failed allocating drop qp\n");
+		mlx4_qp_release_range(priv->mdev->dev, qpn, 1);
+		return err;
+	}
+
+	return 0;
+}
+
+void mlx4_en_destroy_drop_qp(struct mlx4_en_priv *priv)
+{
+	u32 qpn;
+
+	qpn = priv->drop_qp.qpn;
+	mlx4_qp_remove(priv->mdev->dev, &priv->drop_qp);
+	mlx4_qp_free(priv->mdev->dev, &priv->drop_qp);
+	mlx4_qp_release_range(priv->mdev->dev, qpn, 1);
+}
+
 /* Allocate rx qp's and configure them according to rss map */
 int mlx4_en_config_rss_steer(struct mlx4_en_priv *priv)
 {
diff --git a/drivers/net/ethernet/mellanox/mlx4/mlx4_en.h b/drivers/net/ethernet/mellanox/mlx4/mlx4_en.h
index 8882e70..a126321 100644
--- a/drivers/net/ethernet/mellanox/mlx4/mlx4_en.h
+++ b/drivers/net/ethernet/mellanox/mlx4/mlx4_en.h
@@ -500,6 +500,7 @@ struct mlx4_en_priv {
 	struct mlx4_en_rx_ring rx_ring[MAX_RX_RINGS];
 	struct mlx4_en_cq *tx_cq;
 	struct mlx4_en_cq rx_cq[MAX_RX_RINGS];
+	struct mlx4_qp drop_qp;
 	struct work_struct mcast_task;
 	struct work_struct mac_task;
 	struct work_struct watchdog_task;
@@ -586,6 +587,8 @@ void mlx4_en_unmap_buffer(struct mlx4_buf *buf);
 void mlx4_en_calc_rx_buf(struct net_device *dev);
 int mlx4_en_config_rss_steer(struct mlx4_en_priv *priv);
 void mlx4_en_release_rss_steer(struct mlx4_en_priv *priv);
+int mlx4_en_create_drop_qp(struct mlx4_en_priv *priv);
+void mlx4_en_destroy_drop_qp(struct mlx4_en_priv *priv);
 int mlx4_en_free_tx_buf(struct net_device *dev, struct mlx4_en_tx_ring *ring);
 void mlx4_en_rx_irq(struct mlx4_cq *mcq);
 
-- 
1.7.1

^ permalink raw reply related

* [PATCH net-next 07/10] net/mlx4_core: Add resource tracking for device managed flow steering rules
From: Or Gerlitz @ 2012-07-01  9:43 UTC (permalink / raw)
  To: davem; +Cc: roland, yevgenyp, oren, netdev, Hadar Hen Zion, Or Gerlitz
In-Reply-To: <1341135823-29039-1-git-send-email-ogerlitz@mellanox.com>

From: Hadar Hen Zion <hadarh@mellanox.co.il>

As with other device resources, the resource tracker is needed for supporting
device managed flow steering rules under SRIOV: make sure virtual functions
delete only rules created by them, and clean all rules attached by a crashed VF.

Signed-off-by: Hadar Hen Zion <hadarh@mellanox.co.il>
Signed-off-by: Or Gerlitz <ogerlitz@mellanox.com>
---
 drivers/net/ethernet/mellanox/mlx4/mlx4.h          |    1 +
 .../net/ethernet/mellanox/mlx4/resource_tracker.c  |  132 ++++++++++++++++++--
 2 files changed, 125 insertions(+), 8 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx4/mlx4.h b/drivers/net/ethernet/mellanox/mlx4/mlx4.h
index 0084967..d2c436b 100644
--- a/drivers/net/ethernet/mellanox/mlx4/mlx4.h
+++ b/drivers/net/ethernet/mellanox/mlx4/mlx4.h
@@ -149,6 +149,7 @@ enum mlx4_resource {
 	RES_VLAN,
 	RES_EQ,
 	RES_COUNTER,
+	RES_FS_RULE,
 	MLX4_NUM_OF_RESOURCE_TYPE
 };
 
diff --git a/drivers/net/ethernet/mellanox/mlx4/resource_tracker.c b/drivers/net/ethernet/mellanox/mlx4/resource_tracker.c
index ce73772..b4480de 100644
--- a/drivers/net/ethernet/mellanox/mlx4/resource_tracker.c
+++ b/drivers/net/ethernet/mellanox/mlx4/resource_tracker.c
@@ -190,6 +190,15 @@ struct res_xrcdn {
 	int			port;
 };
 
+enum res_fs_rule_states {
+	RES_FS_RULE_BUSY = RES_ANY_BUSY,
+	RES_FS_RULE_ALLOCATED,
+};
+
+struct res_fs_rule {
+	struct res_common	com;
+};
+
 static struct res_common *res_tracker_lookup(struct rb_root *root, u64 res_id)
 {
 	struct rb_node *node = root->rb_node;
@@ -245,6 +254,7 @@ static const char *ResourceType(enum mlx4_resource rt)
 	case RES_MAC: return  "RES_MAC";
 	case RES_EQ: return "RES_EQ";
 	case RES_COUNTER: return "RES_COUNTER";
+	case RES_FS_RULE: return "RES_FS_RULE";
 	case RES_XRCD: return "RES_XRCD";
 	default: return "Unknown resource type !!!";
 	};
@@ -516,6 +526,20 @@ static struct res_common *alloc_xrcdn_tr(int id)
 	return &ret->com;
 }
 
+static struct res_common *alloc_fs_rule_tr(u64 id)
+{
+	struct res_fs_rule *ret;
+
+	ret = kzalloc(sizeof *ret, GFP_KERNEL);
+	if (!ret)
+		return NULL;
+
+	ret->com.res_id = id;
+	ret->com.state = RES_FS_RULE_ALLOCATED;
+
+	return &ret->com;
+}
+
 static struct res_common *alloc_tr(u64 id, enum mlx4_resource type, int slave,
 				   int extra)
 {
@@ -549,6 +573,9 @@ static struct res_common *alloc_tr(u64 id, enum mlx4_resource type, int slave,
 	case RES_XRCD:
 		ret = alloc_xrcdn_tr(id);
 		break;
+	case RES_FS_RULE:
+		ret = alloc_fs_rule_tr(id);
+		break;
 	default:
 		return NULL;
 	}
@@ -681,6 +708,16 @@ static int remove_xrcdn_ok(struct res_xrcdn *res)
 	return 0;
 }
 
+static int remove_fs_rule_ok(struct res_fs_rule *res)
+{
+	if (res->com.state == RES_FS_RULE_BUSY)
+		return -EBUSY;
+	else if (res->com.state != RES_FS_RULE_ALLOCATED)
+		return -EPERM;
+
+	return 0;
+}
+
 static int remove_cq_ok(struct res_cq *res)
 {
 	if (res->com.state == RES_CQ_BUSY)
@@ -722,6 +759,8 @@ static int remove_ok(struct res_common *res, enum mlx4_resource type, int extra)
 		return remove_counter_ok((struct res_counter *)res);
 	case RES_XRCD:
 		return remove_xrcdn_ok((struct res_xrcdn *)res);
+	case RES_FS_RULE:
+		return remove_fs_rule_ok((struct res_fs_rule *)res);
 	default:
 		return -EINVAL;
 	}
@@ -2746,14 +2785,28 @@ int mlx4_QP_FLOW_STEERING_ATTACH_wrapper(struct mlx4_dev *dev, int slave,
 					 struct mlx4_cmd_mailbox *outbox,
 					 struct mlx4_cmd_info *cmd)
 {
+	int err;
+
 	if (dev->caps.steering_mode !=
 	    MLX4_STEERING_MODE_DEVICE_MANAGED)
 		return -EOPNOTSUPP;
-	return mlx4_cmd_imm(dev, inbox->dma, &vhcr->out_param,
-			    vhcr->in_modifier, 0,
-			    MLX4_QP_FLOW_STEERING_ATTACH,
-			    MLX4_CMD_TIME_CLASS_A,
-			    MLX4_CMD_NATIVE);
+
+	err = mlx4_cmd_imm(dev, inbox->dma, &vhcr->out_param,
+			   vhcr->in_modifier, 0,
+			   MLX4_QP_FLOW_STEERING_ATTACH, MLX4_CMD_TIME_CLASS_A,
+			   MLX4_CMD_NATIVE);
+	if (err)
+		return err;
+
+	err = add_res_range(dev, slave, vhcr->out_param, 1, RES_FS_RULE, 0);
+	if (err) {
+		mlx4_err(dev, "Fail to add flow steering resources.\n ");
+		/* detach rule*/
+		mlx4_cmd(dev, vhcr->out_param, 0, 0,
+			 MLX4_QP_FLOW_STEERING_ATTACH, MLX4_CMD_TIME_CLASS_A,
+			 MLX4_CMD_NATIVE);
+	}
+	return err;
 }
 
 int mlx4_QP_FLOW_STEERING_DETACH_wrapper(struct mlx4_dev *dev, int slave,
@@ -2762,12 +2815,22 @@ int mlx4_QP_FLOW_STEERING_DETACH_wrapper(struct mlx4_dev *dev, int slave,
 					 struct mlx4_cmd_mailbox *outbox,
 					 struct mlx4_cmd_info *cmd)
 {
+	int err;
+
 	if (dev->caps.steering_mode !=
 	    MLX4_STEERING_MODE_DEVICE_MANAGED)
 		return -EOPNOTSUPP;
-	return mlx4_cmd(dev, vhcr->in_param, 0, 0,
-			MLX4_QP_FLOW_STEERING_DETACH, MLX4_CMD_TIME_CLASS_A,
-			MLX4_CMD_NATIVE);
+
+	err = rem_res_range(dev, slave, vhcr->in_param, 1, RES_FS_RULE, 0);
+	if (err) {
+		mlx4_err(dev, "Fail to remove flow steering resources.\n ");
+		return err;
+	}
+
+	err = mlx4_cmd(dev, vhcr->in_param, 0, 0,
+		       MLX4_QP_FLOW_STEERING_DETACH, MLX4_CMD_TIME_CLASS_A,
+		       MLX4_CMD_NATIVE);
+	return err;
 }
 
 enum {
@@ -3179,6 +3242,58 @@ static void rem_slave_mtts(struct mlx4_dev *dev, int slave)
 	spin_unlock_irq(mlx4_tlock(dev));
 }
 
+static void rem_slave_fs_rule(struct mlx4_dev *dev, int slave)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	struct mlx4_resource_tracker *tracker =
+		&priv->mfunc.master.res_tracker;
+	struct list_head *fs_rule_list =
+		&tracker->slave_list[slave].res_list[RES_FS_RULE];
+	struct res_fs_rule *fs_rule;
+	struct res_fs_rule *tmp;
+	int state;
+	u64 base;
+	int err;
+
+	err = move_all_busy(dev, slave, RES_FS_RULE);
+	if (err)
+		mlx4_warn(dev, "rem_slave_fs_rule: Could not move all mtts to busy for slave %d\n",
+			  slave);
+
+	spin_lock_irq(mlx4_tlock(dev));
+	list_for_each_entry_safe(fs_rule, tmp, fs_rule_list, com.list) {
+		spin_unlock_irq(mlx4_tlock(dev));
+		if (fs_rule->com.owner == slave) {
+			base = fs_rule->com.res_id;
+			state = fs_rule->com.from_state;
+			while (state != 0) {
+				switch (state) {
+				case RES_FS_RULE_ALLOCATED:
+					/* detach rule */
+					err = mlx4_cmd(dev, base, 0, 0,
+						MLX4_QP_FLOW_STEERING_DETACH,
+						MLX4_CMD_TIME_CLASS_A,
+						MLX4_CMD_NATIVE);
+
+					spin_lock_irq(mlx4_tlock(dev));
+					rb_erase(&fs_rule->com.node,
+						 &tracker->res_tree[RES_FS_RULE]);
+					list_del(&fs_rule->com.list);
+					spin_unlock_irq(mlx4_tlock(dev));
+					kfree(fs_rule);
+					state = 0;
+					break;
+
+				default:
+					state = 0;
+				}
+			}
+		}
+		spin_lock_irq(mlx4_tlock(dev));
+	}
+	spin_unlock_irq(mlx4_tlock(dev));
+}
+
 static void rem_slave_eqs(struct mlx4_dev *dev, int slave)
 {
 	struct mlx4_priv *priv = mlx4_priv(dev);
@@ -3320,5 +3435,6 @@ void mlx4_delete_all_resources_for_slave(struct mlx4_dev *dev, int slave)
 	rem_slave_mtts(dev, slave);
 	rem_slave_counters(dev, slave);
 	rem_slave_xrcdns(dev, slave);
+	rem_slave_fs_rule(dev, slave);
 	mutex_unlock(&priv->mfunc.master.res_tracker.slave_list[slave].mutex);
 }
-- 
1.7.1

^ permalink raw reply related

* [PATCH net-next 04/10] net/mlx4: Set steering mode according to device capabilities
From: Or Gerlitz @ 2012-07-01  9:43 UTC (permalink / raw)
  To: davem; +Cc: roland, yevgenyp, oren, netdev, Hadar Hen Zion, Or Gerlitz
In-Reply-To: <1341135823-29039-1-git-send-email-ogerlitz@mellanox.com>

From: Hadar Hen Zion <hadarh@mellanox.co.il>

Instead of checking the firmware supported steering mode in various
places in the code, add a dedicated field in the mlx4 device capabilities
structure which is written once during the initialization flow and read
across the code.

This also set the grounds for add new steering modes. Currently two modes
are supported, and are named after the ConnectX HW versions A0 and B0.

A0 steering uses mac_index, vlan_index and priority to steer traffic
into pre-defined range of QPs.

B0 steering uses Ethernet L2 hashing rules and is enabled only
if the firmware supports both unicast and multicast B0 steering,

The current steering modes are relevant for Ethernet traffic only,
such that Infiniband steering remains untouched.

Signed-off-by: Hadar Hen Zion <hadarh@mellanox.co.il>
Signed-off-by: Or Gerlitz <ogerlitz@mellanox.com>
---
 drivers/net/ethernet/mellanox/mlx4/en_netdev.c |  107 ++++++++++++++++--------
 drivers/net/ethernet/mellanox/mlx4/fw.c        |    2 +-
 drivers/net/ethernet/mellanox/mlx4/main.c      |   17 ++++-
 drivers/net/ethernet/mellanox/mlx4/mcg.c       |   70 +++++++---------
 drivers/net/ethernet/mellanox/mlx4/port.c      |    9 +-
 include/linux/mlx4/device.h                    |   24 +++++
 6 files changed, 148 insertions(+), 81 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx4/en_netdev.c b/drivers/net/ethernet/mellanox/mlx4/en_netdev.c
index 35c64f2..f9a4a00 100644
--- a/drivers/net/ethernet/mellanox/mlx4/en_netdev.c
+++ b/drivers/net/ethernet/mellanox/mlx4/en_netdev.c
@@ -263,7 +263,7 @@ static void mlx4_en_do_set_multicast(struct work_struct *work)
 	struct mlx4_en_mc_list *mclist, *tmp;
 	u64 mcast_addr = 0;
 	u8 mc_list[16] = {0};
-	int err;
+	int err = 0;
 
 	mutex_lock(&mdev->state_lock);
 	if (!mdev->device_up) {
@@ -298,16 +298,35 @@ static void mlx4_en_do_set_multicast(struct work_struct *work)
 			priv->flags |= MLX4_EN_FLAG_PROMISC;
 
 			/* Enable promiscouos mode */
-			if (!(mdev->dev->caps.flags &
-						MLX4_DEV_CAP_FLAG_VEP_UC_STEER))
-				err = mlx4_SET_PORT_qpn_calc(mdev->dev, priv->port,
-							     priv->base_qpn, 1);
-			else
-				err = mlx4_unicast_promisc_add(mdev->dev, priv->base_qpn,
+			switch (mdev->dev->caps.steering_mode) {
+			case MLX4_STEERING_MODE_B0:
+				err = mlx4_unicast_promisc_add(mdev->dev,
+							       priv->base_qpn,
 							       priv->port);
-			if (err)
-				en_err(priv, "Failed enabling "
-					     "promiscuous mode\n");
+				if (err)
+					en_err(priv, "Failed enabling unicast promiscuous mode\n");
+
+				/* Add the default qp number as multicast
+				* promisc */
+				if (!(priv->flags & MLX4_EN_FLAG_MC_PROMISC)) {
+					err = mlx4_multicast_promisc_add(mdev->dev,
+									 priv->base_qpn,
+									 priv->port);
+					if (err)
+						en_err(priv, "Failed enabling multicast promiscuous mode\n");
+					priv->flags |= MLX4_EN_FLAG_MC_PROMISC;
+				}
+				break;
+
+			case MLX4_STEERING_MODE_A0:
+				err = mlx4_SET_PORT_qpn_calc(mdev->dev,
+							     priv->port,
+							     priv->base_qpn,
+							     1);
+				if (err)
+					en_err(priv, "Failed enabling promiscuous mode\n");
+				break;
+			}
 
 			/* Disable port multicast filter (unconditionally) */
 			err = mlx4_SET_MCAST_FLTR(mdev->dev, priv->port, 0,
@@ -316,15 +335,6 @@ static void mlx4_en_do_set_multicast(struct work_struct *work)
 				en_err(priv, "Failed disabling "
 					     "multicast filter\n");
 
-			/* Add the default qp number as multicast promisc */
-			if (!(priv->flags & MLX4_EN_FLAG_MC_PROMISC)) {
-				err = mlx4_multicast_promisc_add(mdev->dev, priv->base_qpn,
-								 priv->port);
-				if (err)
-					en_err(priv, "Failed entering multicast promisc mode\n");
-				priv->flags |= MLX4_EN_FLAG_MC_PROMISC;
-			}
-
 			/* Disable port VLAN filter */
 			err = mlx4_SET_VLAN_FLTR(mdev->dev, priv);
 			if (err)
@@ -343,22 +353,31 @@ static void mlx4_en_do_set_multicast(struct work_struct *work)
 		priv->flags &= ~MLX4_EN_FLAG_PROMISC;
 
 		/* Disable promiscouos mode */
-		if (!(mdev->dev->caps.flags & MLX4_DEV_CAP_FLAG_VEP_UC_STEER))
-			err = mlx4_SET_PORT_qpn_calc(mdev->dev, priv->port,
-						     priv->base_qpn, 0);
-		else
-			err = mlx4_unicast_promisc_remove(mdev->dev, priv->base_qpn,
+		switch (mdev->dev->caps.steering_mode) {
+		case MLX4_STEERING_MODE_B0:
+			err = mlx4_unicast_promisc_remove(mdev->dev,
+							  priv->base_qpn,
 							  priv->port);
-		if (err)
-			en_err(priv, "Failed disabling promiscuous mode\n");
+			if (err)
+				en_err(priv, "Failed disabling unicast promiscuous mode\n");
+			/* Disable Multicast promisc */
+			if (priv->flags & MLX4_EN_FLAG_MC_PROMISC) {
+				err = mlx4_multicast_promisc_remove(mdev->dev,
+								    priv->base_qpn,
+								    priv->port);
+				if (err)
+					en_err(priv, "Failed disabling multicast promiscuous mode\n");
+				priv->flags &= ~MLX4_EN_FLAG_MC_PROMISC;
+			}
+			break;
 
-		/* Disable Multicast promisc */
-		if (priv->flags & MLX4_EN_FLAG_MC_PROMISC) {
-			err = mlx4_multicast_promisc_remove(mdev->dev, priv->base_qpn,
-							    priv->port);
+		case MLX4_STEERING_MODE_A0:
+			err = mlx4_SET_PORT_qpn_calc(mdev->dev,
+						     priv->port,
+						     priv->base_qpn, 0);
 			if (err)
-				en_err(priv, "Failed disabling multicast promiscuous mode\n");
-			priv->flags &= ~MLX4_EN_FLAG_MC_PROMISC;
+				en_err(priv, "Failed disabling promiscuous mode\n");
+			break;
 		}
 
 		/* Enable port VLAN filter */
@@ -376,8 +395,16 @@ static void mlx4_en_do_set_multicast(struct work_struct *work)
 
 		/* Add the default qp number as multicast promisc */
 		if (!(priv->flags & MLX4_EN_FLAG_MC_PROMISC)) {
-			err = mlx4_multicast_promisc_add(mdev->dev, priv->base_qpn,
-							 priv->port);
+			switch (mdev->dev->caps.steering_mode) {
+			case MLX4_STEERING_MODE_B0:
+				err = mlx4_multicast_promisc_add(mdev->dev,
+								 priv->base_qpn,
+								 priv->port);
+				break;
+
+			case MLX4_STEERING_MODE_A0:
+				break;
+			}
 			if (err)
 				en_err(priv, "Failed entering multicast promisc mode\n");
 			priv->flags |= MLX4_EN_FLAG_MC_PROMISC;
@@ -385,8 +412,16 @@ static void mlx4_en_do_set_multicast(struct work_struct *work)
 	} else {
 		/* Disable Multicast promisc */
 		if (priv->flags & MLX4_EN_FLAG_MC_PROMISC) {
-			err = mlx4_multicast_promisc_remove(mdev->dev, priv->base_qpn,
-							    priv->port);
+			switch (mdev->dev->caps.steering_mode) {
+			case MLX4_STEERING_MODE_B0:
+				err = mlx4_multicast_promisc_remove(mdev->dev,
+								    priv->base_qpn,
+								    priv->port);
+				break;
+
+			case MLX4_STEERING_MODE_A0:
+				break;
+			}
 			if (err)
 				en_err(priv, "Failed disabling multicast promiscuous mode\n");
 			priv->flags &= ~MLX4_EN_FLAG_MC_PROMISC;
diff --git a/drivers/net/ethernet/mellanox/mlx4/fw.c b/drivers/net/ethernet/mellanox/mlx4/fw.c
index 9c83bb8..40e048b 100644
--- a/drivers/net/ethernet/mellanox/mlx4/fw.c
+++ b/drivers/net/ethernet/mellanox/mlx4/fw.c
@@ -1124,7 +1124,7 @@ int mlx4_INIT_HCA(struct mlx4_dev *dev, struct mlx4_init_hca_param *param)
 	MLX4_PUT(inbox, param->mc_base,		INIT_HCA_MC_BASE_OFFSET);
 	MLX4_PUT(inbox, param->log_mc_entry_sz, INIT_HCA_LOG_MC_ENTRY_SZ_OFFSET);
 	MLX4_PUT(inbox, param->log_mc_hash_sz,  INIT_HCA_LOG_MC_HASH_SZ_OFFSET);
-	if (dev->caps.flags & MLX4_DEV_CAP_FLAG_VEP_MC_STEER)
+	if (dev->caps.steering_mode == MLX4_STEERING_MODE_B0)
 		MLX4_PUT(inbox, (u8) (1 << 3),	INIT_HCA_UC_STEERING_OFFSET);
 	MLX4_PUT(inbox, param->log_mc_table_sz, INIT_HCA_LOG_MC_TABLE_SZ_OFFSET);
 
diff --git a/drivers/net/ethernet/mellanox/mlx4/main.c b/drivers/net/ethernet/mellanox/mlx4/main.c
index a0313de..7df0517 100644
--- a/drivers/net/ethernet/mellanox/mlx4/main.c
+++ b/drivers/net/ethernet/mellanox/mlx4/main.c
@@ -243,7 +243,6 @@ static int mlx4_dev_cap(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap)
 	dev->caps.reserved_srqs	     = dev_cap->reserved_srqs;
 	dev->caps.max_sq_desc_sz     = dev_cap->max_sq_desc_sz;
 	dev->caps.max_rq_desc_sz     = dev_cap->max_rq_desc_sz;
-	dev->caps.num_qp_per_mgm     = mlx4_get_qp_per_mgm(dev);
 	/*
 	 * Subtract 1 from the limit because we need to allocate a
 	 * spare CQE so the HCA HW can tell the difference between an
@@ -274,6 +273,22 @@ static int mlx4_dev_cap(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap)
 	dev->caps.max_gso_sz	     = dev_cap->max_gso_sz;
 	dev->caps.max_rss_tbl_sz     = dev_cap->max_rss_tbl_sz;
 
+	if (dev->caps.flags & MLX4_DEV_CAP_FLAG_VEP_UC_STEER &&
+	    dev->caps.flags & MLX4_DEV_CAP_FLAG_VEP_MC_STEER) {
+		dev->caps.steering_mode = MLX4_STEERING_MODE_B0;
+
+	} else {
+		dev->caps.steering_mode = MLX4_STEERING_MODE_A0;
+
+		if (dev->caps.flags & MLX4_DEV_CAP_FLAG_VEP_UC_STEER ||
+		    dev->caps.flags & MLX4_DEV_CAP_FLAG_VEP_MC_STEER)
+			mlx4_warn(dev, "Must have UC_STEER and MC_STEER flags "
+				       "set to use B0 steering. Falling back to A0 steering mode.\n");
+	}
+	mlx4_dbg(dev, "Steering mode is: %s\n",
+		 mlx4_steering_mode_str(dev->caps.steering_mode));
+	dev->caps.num_qp_per_mgm = mlx4_get_qp_per_mgm(dev);
+
 	/* Sense port always allowed on supported devices for ConnectX1 and 2 */
 	if (dev->pdev->device != 0x1003)
 		dev->caps.flags |= MLX4_DEV_CAP_FLAG_SENSE_SUPPORT;
diff --git a/drivers/net/ethernet/mellanox/mlx4/mcg.c b/drivers/net/ethernet/mellanox/mlx4/mcg.c
index f4a8f98..319c9d4 100644
--- a/drivers/net/ethernet/mellanox/mlx4/mcg.c
+++ b/drivers/net/ethernet/mellanox/mlx4/mcg.c
@@ -868,36 +868,50 @@ static int mlx4_QP_ATTACH(struct mlx4_dev *dev, struct mlx4_qp *qp,
 int mlx4_multicast_attach(struct mlx4_dev *dev, struct mlx4_qp *qp, u8 gid[16],
 			  int block_mcast_loopback, enum mlx4_protocol prot)
 {
-	if (prot == MLX4_PROT_ETH &&
-			!(dev->caps.flags & MLX4_DEV_CAP_FLAG_VEP_MC_STEER))
-		return 0;
 
-	if (prot == MLX4_PROT_ETH)
-		gid[7] |= (MLX4_MC_STEER << 1);
+	switch (dev->caps.steering_mode) {
+	case MLX4_STEERING_MODE_A0:
+		if (prot == MLX4_PROT_ETH)
+			return 0;
 
-	if (mlx4_is_mfunc(dev))
-		return mlx4_QP_ATTACH(dev, qp, gid, 1,
-					block_mcast_loopback, prot);
+	case MLX4_STEERING_MODE_B0:
+		if (prot == MLX4_PROT_ETH)
+			gid[7] |= (MLX4_MC_STEER << 1);
 
-	return mlx4_qp_attach_common(dev, qp, gid, block_mcast_loopback,
-					prot, MLX4_MC_STEER);
+		if (mlx4_is_mfunc(dev))
+			return mlx4_QP_ATTACH(dev, qp, gid, 1,
+					      block_mcast_loopback, prot);
+		return mlx4_qp_attach_common(dev, qp, gid,
+					     block_mcast_loopback, prot,
+					     MLX4_MC_STEER);
+
+	default:
+		return -EINVAL;
+	}
 }
 EXPORT_SYMBOL_GPL(mlx4_multicast_attach);
 
 int mlx4_multicast_detach(struct mlx4_dev *dev, struct mlx4_qp *qp, u8 gid[16],
 			  enum mlx4_protocol prot)
 {
-	if (prot == MLX4_PROT_ETH &&
-			!(dev->caps.flags & MLX4_DEV_CAP_FLAG_VEP_MC_STEER))
-		return 0;
+	switch (dev->caps.steering_mode) {
+	case MLX4_STEERING_MODE_A0:
+		if (prot == MLX4_PROT_ETH)
+			return 0;
 
-	if (prot == MLX4_PROT_ETH)
-		gid[7] |= (MLX4_MC_STEER << 1);
+	case MLX4_STEERING_MODE_B0:
+		if (prot == MLX4_PROT_ETH)
+			gid[7] |= (MLX4_MC_STEER << 1);
 
-	if (mlx4_is_mfunc(dev))
-		return mlx4_QP_ATTACH(dev, qp, gid, 0, 0, prot);
+		if (mlx4_is_mfunc(dev))
+			return mlx4_QP_ATTACH(dev, qp, gid, 0, 0, prot);
+
+		return mlx4_qp_detach_common(dev, qp, gid, prot,
+					     MLX4_MC_STEER);
 
-	return mlx4_qp_detach_common(dev, qp, gid, prot, MLX4_MC_STEER);
+	default:
+		return -EINVAL;
+	}
 }
 EXPORT_SYMBOL_GPL(mlx4_multicast_detach);
 
@@ -905,10 +919,6 @@ int mlx4_unicast_attach(struct mlx4_dev *dev,
 			struct mlx4_qp *qp, u8 gid[16],
 			int block_mcast_loopback, enum mlx4_protocol prot)
 {
-	if (prot == MLX4_PROT_ETH &&
-			!(dev->caps.flags & MLX4_DEV_CAP_FLAG_VEP_UC_STEER))
-		return 0;
-
 	if (prot == MLX4_PROT_ETH)
 		gid[7] |= (MLX4_UC_STEER << 1);
 
@@ -924,10 +934,6 @@ EXPORT_SYMBOL_GPL(mlx4_unicast_attach);
 int mlx4_unicast_detach(struct mlx4_dev *dev, struct mlx4_qp *qp,
 			       u8 gid[16], enum mlx4_protocol prot)
 {
-	if (prot == MLX4_PROT_ETH &&
-			!(dev->caps.flags & MLX4_DEV_CAP_FLAG_VEP_UC_STEER))
-		return 0;
-
 	if (prot == MLX4_PROT_ETH)
 		gid[7] |= (MLX4_UC_STEER << 1);
 
@@ -968,9 +974,6 @@ static int mlx4_PROMISC(struct mlx4_dev *dev, u32 qpn,
 
 int mlx4_multicast_promisc_add(struct mlx4_dev *dev, u32 qpn, u8 port)
 {
-	if (!(dev->caps.flags & MLX4_DEV_CAP_FLAG_VEP_MC_STEER))
-		return 0;
-
 	if (mlx4_is_mfunc(dev))
 		return mlx4_PROMISC(dev, qpn, MLX4_MC_STEER, 1, port);
 
@@ -980,9 +983,6 @@ EXPORT_SYMBOL_GPL(mlx4_multicast_promisc_add);
 
 int mlx4_multicast_promisc_remove(struct mlx4_dev *dev, u32 qpn, u8 port)
 {
-	if (!(dev->caps.flags & MLX4_DEV_CAP_FLAG_VEP_MC_STEER))
-		return 0;
-
 	if (mlx4_is_mfunc(dev))
 		return mlx4_PROMISC(dev, qpn, MLX4_MC_STEER, 0, port);
 
@@ -992,9 +992,6 @@ EXPORT_SYMBOL_GPL(mlx4_multicast_promisc_remove);
 
 int mlx4_unicast_promisc_add(struct mlx4_dev *dev, u32 qpn, u8 port)
 {
-	if (!(dev->caps.flags & MLX4_DEV_CAP_FLAG_VEP_UC_STEER))
-		return 0;
-
 	if (mlx4_is_mfunc(dev))
 		return mlx4_PROMISC(dev, qpn, MLX4_UC_STEER, 1, port);
 
@@ -1004,9 +1001,6 @@ EXPORT_SYMBOL_GPL(mlx4_unicast_promisc_add);
 
 int mlx4_unicast_promisc_remove(struct mlx4_dev *dev, u32 qpn, u8 port)
 {
-	if (!(dev->caps.flags & MLX4_DEV_CAP_FLAG_VEP_UC_STEER))
-		return 0;
-
 	if (mlx4_is_mfunc(dev))
 		return mlx4_PROMISC(dev, qpn, MLX4_UC_STEER, 0, port);
 
diff --git a/drivers/net/ethernet/mellanox/mlx4/port.c b/drivers/net/ethernet/mellanox/mlx4/port.c
index a8fb529..58de723 100644
--- a/drivers/net/ethernet/mellanox/mlx4/port.c
+++ b/drivers/net/ethernet/mellanox/mlx4/port.c
@@ -155,7 +155,7 @@ int mlx4_get_eth_qp(struct mlx4_dev *dev, u8 port, u64 mac, int *qpn)
 		return err;
 	}
 
-	if (!(dev->caps.flags & MLX4_DEV_CAP_FLAG_VEP_UC_STEER)) {
+	if (dev->caps.steering_mode == MLX4_STEERING_MODE_A0) {
 		*qpn = info->base_qpn + index;
 		return 0;
 	}
@@ -206,7 +206,7 @@ void mlx4_put_eth_qp(struct mlx4_dev *dev, u8 port, u64 mac, int qpn)
 		 (unsigned long long) mac);
 	mlx4_unregister_mac(dev, port, mac);
 
-	if (dev->caps.flags & MLX4_DEV_CAP_FLAG_VEP_UC_STEER) {
+	if (dev->caps.steering_mode != MLX4_STEERING_MODE_A0) {
 		entry = radix_tree_lookup(&info->mac_tree, qpn);
 		if (entry) {
 			mlx4_dbg(dev, "Releasing qp: port %d, mac 0x%llx,"
@@ -359,7 +359,7 @@ int mlx4_replace_mac(struct mlx4_dev *dev, u8 port, int qpn, u64 new_mac)
 	int index = qpn - info->base_qpn;
 	int err = 0;
 
-	if (dev->caps.flags & MLX4_DEV_CAP_FLAG_VEP_UC_STEER) {
+	if (dev->caps.steering_mode != MLX4_STEERING_MODE_A0) {
 		entry = radix_tree_lookup(&info->mac_tree, qpn);
 		if (!entry)
 			return -EINVAL;
@@ -803,8 +803,7 @@ int mlx4_SET_PORT_qpn_calc(struct mlx4_dev *dev, u8 port, u32 base_qpn,
 	u32 m_promisc = (dev->caps.flags & MLX4_DEV_CAP_FLAG_VEP_MC_STEER) ?
 		MCAST_DIRECT : MCAST_DEFAULT;
 
-	if (dev->caps.flags & MLX4_DEV_CAP_FLAG_VEP_MC_STEER  &&
-	    dev->caps.flags & MLX4_DEV_CAP_FLAG_VEP_UC_STEER)
+	if (dev->caps.steering_mode != MLX4_STEERING_MODE_A0)
 		return 0;
 
 	mailbox = mlx4_alloc_cmd_mailbox(dev);
diff --git a/include/linux/mlx4/device.h b/include/linux/mlx4/device.h
index 6a8f002..80891a4 100644
--- a/include/linux/mlx4/device.h
+++ b/include/linux/mlx4/device.h
@@ -70,6 +70,29 @@ enum {
 	MLX4_MFUNC_EQE_MASK     = (MLX4_MFUNC_MAX_EQES - 1)
 };
 
+/* Driver supports 2 diffrent device methods to manage traffic steering:
+	- B0 steering mode - Common low level API for ib and (if supported) eth.
+	- A0 steering mode - Limited low level API for eth. In case of IB,
+			     B0 mode is in use.
+ */
+enum {
+	MLX4_STEERING_MODE_A0,
+	MLX4_STEERING_MODE_B0
+};
+
+static inline const char *mlx4_steering_mode_str(int steering_mode)
+{
+	switch (steering_mode) {
+	case MLX4_STEERING_MODE_A0:
+		return "A0 steering";
+
+	case MLX4_STEERING_MODE_B0:
+		return "B0 steering";
+	default:
+		return "Unrecognize steering mode";
+	}
+}
+
 enum {
 	MLX4_DEV_CAP_FLAG_RC		= 1LL <<  0,
 	MLX4_DEV_CAP_FLAG_UC		= 1LL <<  1,
@@ -295,6 +318,7 @@ struct mlx4_caps {
 	int			num_amgms;
 	int			reserved_mcgs;
 	int			num_qp_per_mgm;
+	int			steering_mode;
 	int			num_pds;
 	int			reserved_pds;
 	int			max_xrcds;
-- 
1.7.1

^ permalink raw reply related

* [PATCH net-next 08/10] net/mlx4: Implement promiscuous mode with device managed flow-steering
From: Or Gerlitz @ 2012-07-01  9:43 UTC (permalink / raw)
  To: davem; +Cc: roland, yevgenyp, oren, netdev, Hadar Hen Zion, Or Gerlitz
In-Reply-To: <1341135823-29039-1-git-send-email-ogerlitz@mellanox.com>

From: Hadar Hen Zion <hadarh@mellanox.co.il>

The device managed flow steering API has three promiscuous modes:

1. Uplink - captures all the packets that arrive to the port.
2. Allmulti - captures all multicast packets arriving to the port.
3. Function port - for future use, this mode is not implemented yet.

Use these modes with the flow_attach and flow_detach firmware commands
according to the promiscuous state of the netdevice.

Signed-off-by: Hadar Hen Zion <hadarh@mellanox.co.il>
Signed-off-by: Or Gerlitz <ogerlitz@mellanox.com>
---
 drivers/net/ethernet/mellanox/mlx4/en_netdev.c |   41 ++++++++++++++++
 drivers/net/ethernet/mellanox/mlx4/mcg.c       |   60 ++++++++++++++++++++++++
 include/linux/mlx4/device.h                    |    8 +++
 3 files changed, 109 insertions(+), 0 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx4/en_netdev.c b/drivers/net/ethernet/mellanox/mlx4/en_netdev.c
index 62f0601..46b58d8 100644
--- a/drivers/net/ethernet/mellanox/mlx4/en_netdev.c
+++ b/drivers/net/ethernet/mellanox/mlx4/en_netdev.c
@@ -299,6 +299,16 @@ static void mlx4_en_do_set_multicast(struct work_struct *work)
 
 			/* Enable promiscouos mode */
 			switch (mdev->dev->caps.steering_mode) {
+			case MLX4_STEERING_MODE_DEVICE_MANAGED:
+				err = mlx4_flow_steer_promisc_add(mdev->dev,
+								  priv->port,
+								  priv->base_qpn,
+								  MLX4_FS_PROMISC_UPLINK);
+				if (err)
+					en_err(priv, "Failed enabling promiscuous mode\n");
+				priv->flags |= MLX4_EN_FLAG_MC_PROMISC;
+				break;
+
 			case MLX4_STEERING_MODE_B0:
 				err = mlx4_unicast_promisc_add(mdev->dev,
 							       priv->base_qpn,
@@ -354,6 +364,15 @@ static void mlx4_en_do_set_multicast(struct work_struct *work)
 
 		/* Disable promiscouos mode */
 		switch (mdev->dev->caps.steering_mode) {
+		case MLX4_STEERING_MODE_DEVICE_MANAGED:
+			err = mlx4_flow_steer_promisc_remove(mdev->dev,
+							     priv->port,
+							     MLX4_FS_PROMISC_UPLINK);
+			if (err)
+				en_err(priv, "Failed disabling promiscuous mode\n");
+			priv->flags &= ~MLX4_EN_FLAG_MC_PROMISC;
+			break;
+
 		case MLX4_STEERING_MODE_B0:
 			err = mlx4_unicast_promisc_remove(mdev->dev,
 							  priv->base_qpn,
@@ -396,6 +415,13 @@ static void mlx4_en_do_set_multicast(struct work_struct *work)
 		/* Add the default qp number as multicast promisc */
 		if (!(priv->flags & MLX4_EN_FLAG_MC_PROMISC)) {
 			switch (mdev->dev->caps.steering_mode) {
+			case MLX4_STEERING_MODE_DEVICE_MANAGED:
+				err = mlx4_flow_steer_promisc_add(mdev->dev,
+								  priv->port,
+								  priv->base_qpn,
+								  MLX4_FS_PROMISC_ALL_MULTI);
+				break;
+
 			case MLX4_STEERING_MODE_B0:
 				err = mlx4_multicast_promisc_add(mdev->dev,
 								 priv->base_qpn,
@@ -413,6 +439,12 @@ static void mlx4_en_do_set_multicast(struct work_struct *work)
 		/* Disable Multicast promisc */
 		if (priv->flags & MLX4_EN_FLAG_MC_PROMISC) {
 			switch (mdev->dev->caps.steering_mode) {
+			case MLX4_STEERING_MODE_DEVICE_MANAGED:
+				err = mlx4_flow_steer_promisc_remove(mdev->dev,
+								     priv->port,
+								     MLX4_FS_PROMISC_ALL_MULTI);
+				break;
+
 			case MLX4_STEERING_MODE_B0:
 				err = mlx4_multicast_promisc_remove(mdev->dev,
 								    priv->base_qpn,
@@ -836,6 +868,15 @@ int mlx4_en_start_port(struct net_device *dev)
 
 	/* Must redo promiscuous mode setup. */
 	priv->flags &= ~(MLX4_EN_FLAG_PROMISC | MLX4_EN_FLAG_MC_PROMISC);
+	if (mdev->dev->caps.steering_mode ==
+	    MLX4_STEERING_MODE_DEVICE_MANAGED) {
+		mlx4_flow_steer_promisc_remove(mdev->dev,
+						priv->port,
+						MLX4_FS_PROMISC_UPLINK);
+		mlx4_flow_steer_promisc_remove(mdev->dev,
+						priv->port,
+						MLX4_FS_PROMISC_ALL_MULTI);
+	}
 
 	/* Schedule multicast task to populate multicast list */
 	queue_work(mdev->workqueue, &priv->mcast_task);
diff --git a/drivers/net/ethernet/mellanox/mlx4/mcg.c b/drivers/net/ethernet/mellanox/mlx4/mcg.c
index c567655..c0c9751 100644
--- a/drivers/net/ethernet/mellanox/mlx4/mcg.c
+++ b/drivers/net/ethernet/mellanox/mlx4/mcg.c
@@ -1295,6 +1295,66 @@ int mlx4_multicast_detach(struct mlx4_dev *dev, struct mlx4_qp *qp, u8 gid[16],
 }
 EXPORT_SYMBOL_GPL(mlx4_multicast_detach);
 
+int mlx4_flow_steer_promisc_add(struct mlx4_dev *dev, u8 port,
+				u32 qpn, enum mlx4_net_trans_promisc_mode mode)
+{
+	struct mlx4_net_trans_rule rule;
+	u64 *regid_p;
+
+	switch (mode) {
+	case MLX4_FS_PROMISC_UPLINK:
+	case MLX4_FS_PROMISC_FUNCTION_PORT:
+		regid_p = &dev->regid_promisc_array[port];
+		break;
+	case MLX4_FS_PROMISC_ALL_MULTI:
+		regid_p = &dev->regid_allmulti_array[port];
+		break;
+	default:
+		return -1;
+	}
+
+	if (*regid_p != 0)
+		return -1;
+
+	rule.promisc_mode = mode;
+	rule.port = port;
+	rule.qpn = qpn;
+	INIT_LIST_HEAD(&rule.list);
+	mlx4_err(dev, "going promisc on %x\n", port);
+
+	return  mlx4_flow_attach(dev, &rule, regid_p);
+}
+EXPORT_SYMBOL_GPL(mlx4_flow_steer_promisc_add);
+
+int mlx4_flow_steer_promisc_remove(struct mlx4_dev *dev, u8 port,
+				   enum mlx4_net_trans_promisc_mode mode)
+{
+	int ret;
+	u64 *regid_p;
+
+	switch (mode) {
+	case MLX4_FS_PROMISC_UPLINK:
+	case MLX4_FS_PROMISC_FUNCTION_PORT:
+		regid_p = &dev->regid_promisc_array[port];
+		break;
+	case MLX4_FS_PROMISC_ALL_MULTI:
+		regid_p = &dev->regid_allmulti_array[port];
+		break;
+	default:
+		return -1;
+	}
+
+	if (*regid_p == 0)
+		return -1;
+
+	ret =  mlx4_flow_detach(dev, *regid_p);
+	if (ret == 0)
+		*regid_p = 0;
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(mlx4_flow_steer_promisc_remove);
+
 int mlx4_unicast_attach(struct mlx4_dev *dev,
 			struct mlx4_qp *qp, u8 gid[16],
 			int block_mcast_loopback, enum mlx4_protocol prot)
diff --git a/include/linux/mlx4/device.h b/include/linux/mlx4/device.h
index a22a851..692f9db 100644
--- a/include/linux/mlx4/device.h
+++ b/include/linux/mlx4/device.h
@@ -542,6 +542,8 @@ struct mlx4_dev {
 	u8			rev_id;
 	char			board_id[MLX4_BOARD_ID_LEN];
 	int			num_vfs;
+	u64 regid_promisc_array[MLX4_MAX_PORTS + 1];
+	u64 regid_allmulti_array[MLX4_MAX_PORTS + 1];
 };
 
 struct mlx4_init_port_param {
@@ -681,6 +683,7 @@ enum mlx4_net_trans_rule_id {
 enum mlx4_net_trans_promisc_mode {
 	MLX4_FS_PROMISC_NONE = 0,
 	MLX4_FS_PROMISC_UPLINK,
+	/* For future use. Not implemented yet*/
 	MLX4_FS_PROMISC_FUNCTION_PORT,
 	MLX4_FS_PROMISC_ALL_MULTI,
 };
@@ -744,6 +747,11 @@ struct mlx4_net_trans_rule {
 	u32	qpn;
 };
 
+int mlx4_flow_steer_promisc_add(struct mlx4_dev *dev, u8 port,
+				u32 qpn,
+				enum mlx4_net_trans_promisc_mode mode);
+int mlx4_flow_steer_promisc_remove(struct mlx4_dev *dev, u8 port,
+				   enum mlx4_net_trans_promisc_mode mode);
 int mlx4_multicast_promisc_add(struct mlx4_dev *dev, u32 qpn, u8 port);
 int mlx4_multicast_promisc_remove(struct mlx4_dev *dev, u32 qpn, u8 port);
 int mlx4_unicast_promisc_add(struct mlx4_dev *dev, u32 qpn, u8 port);
-- 
1.7.1

^ permalink raw reply related

* [PATCH net-next 02/10] net/mlx4_core: Change resource tracking ID to be 64 bit
From: Or Gerlitz @ 2012-07-01  9:43 UTC (permalink / raw)
  To: davem; +Cc: roland, yevgenyp, oren, netdev, Hadar Hen Zion, Or Gerlitz
In-Reply-To: <1341135823-29039-1-git-send-email-ogerlitz@mellanox.com>

From: Hadar Hen Zion <hadarh@mellanox.co.il>

Currently the IDs used by the resource tracker are of type u32, so far this was
ok since all the different resources we were tracking could be encoded in 32bit.

As a preparation step for tracking of resources whose IDs need > 32 bits such
as network flow steering rules, who are 64 bit in size, move to use 64 bit
based resource IDs.

Signed-off-by: Hadar Hen Zion <hadarh@mellanox.co.il>
Signed-off-by: Or Gerlitz <ogerlitz@mellanox.com>
---
 drivers/net/ethernet/mellanox/mlx4/mlx4.h          |    2 +-
 .../net/ethernet/mellanox/mlx4/resource_tracker.c  |   28 ++++++++++----------
 2 files changed, 15 insertions(+), 15 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx4/mlx4.h b/drivers/net/ethernet/mellanox/mlx4/mlx4.h
index 1a2f372..a425a98 100644
--- a/drivers/net/ethernet/mellanox/mlx4/mlx4.h
+++ b/drivers/net/ethernet/mellanox/mlx4/mlx4.h
@@ -1033,7 +1033,7 @@ int mlx4_SET_PORT(struct mlx4_dev *dev, u8 port);
 /* resource tracker functions*/
 int mlx4_get_slave_from_resource_id(struct mlx4_dev *dev,
 				    enum mlx4_resource resource_type,
-				    int resource_id, int *slave);
+				    u64 resource_id, int *slave);
 void mlx4_delete_all_resources_for_slave(struct mlx4_dev *dev, int slave_id);
 int mlx4_init_resource_tracker(struct mlx4_dev *dev);
 
diff --git a/drivers/net/ethernet/mellanox/mlx4/resource_tracker.c b/drivers/net/ethernet/mellanox/mlx4/resource_tracker.c
index 6f89d44..b8e8969 100644
--- a/drivers/net/ethernet/mellanox/mlx4/resource_tracker.c
+++ b/drivers/net/ethernet/mellanox/mlx4/resource_tracker.c
@@ -58,7 +58,7 @@ struct mac_res {
 struct res_common {
 	struct list_head	list;
 	struct rb_node		node;
-	u32		        res_id;
+	u64		        res_id;
 	int			owner;
 	int			state;
 	int			from_state;
@@ -315,7 +315,7 @@ static int mpt_mask(struct mlx4_dev *dev)
 	return dev->caps.num_mpts - 1;
 }
 
-static struct res_common *find_res(struct mlx4_dev *dev, int res_id,
+static struct res_common *find_res(struct mlx4_dev *dev, u64 res_id,
 				   enum mlx4_resource type)
 {
 	struct mlx4_priv *priv = mlx4_priv(dev);
@@ -324,7 +324,7 @@ static struct res_common *find_res(struct mlx4_dev *dev, int res_id,
 				  res_id);
 }
 
-static int get_res(struct mlx4_dev *dev, int slave, int res_id,
+static int get_res(struct mlx4_dev *dev, int slave, u64 res_id,
 		   enum mlx4_resource type,
 		   void *res)
 {
@@ -350,7 +350,7 @@ static int get_res(struct mlx4_dev *dev, int slave, int res_id,
 
 	r->from_state = r->state;
 	r->state = RES_ANY_BUSY;
-	mlx4_dbg(dev, "res %s id 0x%x to busy\n",
+	mlx4_dbg(dev, "res %s id 0x%llx to busy\n",
 		 ResourceType(type), r->res_id);
 
 	if (res)
@@ -363,7 +363,7 @@ exit:
 
 int mlx4_get_slave_from_resource_id(struct mlx4_dev *dev,
 				    enum mlx4_resource type,
-				    int res_id, int *slave)
+				    u64 res_id, int *slave)
 {
 
 	struct res_common *r;
@@ -384,7 +384,7 @@ int mlx4_get_slave_from_resource_id(struct mlx4_dev *dev,
 	return err;
 }
 
-static void put_res(struct mlx4_dev *dev, int slave, int res_id,
+static void put_res(struct mlx4_dev *dev, int slave, u64 res_id,
 		    enum mlx4_resource type)
 {
 	struct res_common *r;
@@ -516,7 +516,7 @@ static struct res_common *alloc_xrcdn_tr(int id)
 	return &ret->com;
 }
 
-static struct res_common *alloc_tr(int id, enum mlx4_resource type, int slave,
+static struct res_common *alloc_tr(u64 id, enum mlx4_resource type, int slave,
 				   int extra)
 {
 	struct res_common *ret;
@@ -558,7 +558,7 @@ static struct res_common *alloc_tr(int id, enum mlx4_resource type, int slave,
 	return ret;
 }
 
-static int add_res_range(struct mlx4_dev *dev, int slave, int base, int count,
+static int add_res_range(struct mlx4_dev *dev, int slave, u64 base, int count,
 			 enum mlx4_resource type, int extra)
 {
 	int i;
@@ -727,10 +727,10 @@ static int remove_ok(struct res_common *res, enum mlx4_resource type, int extra)
 	}
 }
 
-static int rem_res_range(struct mlx4_dev *dev, int slave, int base, int count,
+static int rem_res_range(struct mlx4_dev *dev, int slave, u64 base, int count,
 			 enum mlx4_resource type, int extra)
 {
-	int i;
+	u64 i;
 	int err;
 	struct mlx4_priv *priv = mlx4_priv(dev);
 	struct mlx4_resource_tracker *tracker = &priv->mfunc.master.res_tracker;
@@ -784,7 +784,7 @@ static int qp_res_start_move_to(struct mlx4_dev *dev, int slave, int qpn,
 	else {
 		switch (state) {
 		case RES_QP_BUSY:
-			mlx4_dbg(dev, "%s: failed RES_QP, 0x%x\n",
+			mlx4_dbg(dev, "%s: failed RES_QP, 0x%llx\n",
 				 __func__, r->com.res_id);
 			err = -EBUSY;
 			break;
@@ -793,7 +793,7 @@ static int qp_res_start_move_to(struct mlx4_dev *dev, int slave, int qpn,
 			if (r->com.state == RES_QP_MAPPED && !alloc)
 				break;
 
-			mlx4_dbg(dev, "failed RES_QP, 0x%x\n", r->com.res_id);
+			mlx4_dbg(dev, "failed RES_QP, 0x%llx\n", r->com.res_id);
 			err = -EINVAL;
 			break;
 
@@ -802,7 +802,7 @@ static int qp_res_start_move_to(struct mlx4_dev *dev, int slave, int qpn,
 			    r->com.state == RES_QP_HW)
 				break;
 			else {
-				mlx4_dbg(dev, "failed RES_QP, 0x%x\n",
+				mlx4_dbg(dev, "failed RES_QP, 0x%llx\n",
 					  r->com.res_id);
 				err = -EINVAL;
 			}
@@ -2796,7 +2796,7 @@ static int _move_all_busy(struct mlx4_dev *dev, int slave,
 				if (r->state == RES_ANY_BUSY) {
 					if (print)
 						mlx4_dbg(dev,
-							 "%s id 0x%x is busy\n",
+							 "%s id 0x%llx is busy\n",
 							  ResourceType(type),
 							  r->res_id);
 					++busy;
-- 
1.7.1

^ permalink raw reply related

* [PATCH net-next 01/10] net/mlx4_core: Change resource tracking mechanism to use red-black tree
From: Or Gerlitz @ 2012-07-01  9:43 UTC (permalink / raw)
  To: davem; +Cc: roland, yevgenyp, oren, netdev, Hadar Hen Zion, Or Gerlitz
In-Reply-To: <1341135823-29039-1-git-send-email-ogerlitz@mellanox.com>

From: Hadar Hen Zion <hadarh@mellanox.co.il>

Change the data structure used for managing the SRIOV resource tracking
mechanism from radix tree to red-black tree. This is preparation step
for supporting resource IDs which are 64bit long, such as network flow
steering rules. Such IDs can't be used as radix-tree keys on 32bit
architectures and hence the reason for the change.

Signed-off-by: Hadar Hen Zion <hadarh@mellanox.co.il>
Signed-off-by: Or Gerlitz <ogerlitz@mellanox.com>
---
 drivers/net/ethernet/mellanox/mlx4/mlx4.h          |    3 +-
 .../net/ethernet/mellanox/mlx4/resource_tracker.c  |  112 ++++++++++++++------
 2 files changed, 81 insertions(+), 34 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx4/mlx4.h b/drivers/net/ethernet/mellanox/mlx4/mlx4.h
index e5d2022..1a2f372 100644
--- a/drivers/net/ethernet/mellanox/mlx4/mlx4.h
+++ b/drivers/net/ethernet/mellanox/mlx4/mlx4.h
@@ -39,6 +39,7 @@
 
 #include <linux/mutex.h>
 #include <linux/radix-tree.h>
+#include <linux/rbtree.h>
 #include <linux/timer.h>
 #include <linux/semaphore.h>
 #include <linux/workqueue.h>
@@ -509,7 +510,7 @@ struct slave_list {
 struct mlx4_resource_tracker {
 	spinlock_t lock;
 	/* tree for each resources */
-	struct radix_tree_root res_tree[MLX4_NUM_OF_RESOURCE_TYPE];
+	struct rb_root res_tree[MLX4_NUM_OF_RESOURCE_TYPE];
 	/* num_of_slave's lists, one per slave */
 	struct slave_list *slave_list;
 };
diff --git a/drivers/net/ethernet/mellanox/mlx4/resource_tracker.c b/drivers/net/ethernet/mellanox/mlx4/resource_tracker.c
index 766b8c5..6f89d44 100644
--- a/drivers/net/ethernet/mellanox/mlx4/resource_tracker.c
+++ b/drivers/net/ethernet/mellanox/mlx4/resource_tracker.c
@@ -57,6 +57,7 @@ struct mac_res {
 
 struct res_common {
 	struct list_head	list;
+	struct rb_node		node;
 	u32		        res_id;
 	int			owner;
 	int			state;
@@ -189,6 +190,49 @@ struct res_xrcdn {
 	int			port;
 };
 
+static struct res_common *res_tracker_lookup(struct rb_root *root, u64 res_id)
+{
+	struct rb_node *node = root->rb_node;
+
+	while (node) {
+		struct res_common *res = container_of(node, struct res_common,
+						      node);
+
+		if (res_id < res->res_id)
+			node = node->rb_left;
+		else if (res_id > res->res_id)
+			node = node->rb_right;
+		else
+			return res;
+	}
+	return NULL;
+}
+
+static int res_tracker_insert(struct rb_root *root, struct res_common *res)
+{
+	struct rb_node **new = &(root->rb_node), *parent = NULL;
+
+	/* Figure out where to put new node */
+	while (*new) {
+		struct res_common *this = container_of(*new, struct res_common,
+						       node);
+
+		parent = *new;
+		if (res->res_id < this->res_id)
+			new = &((*new)->rb_left);
+		else if (res->res_id > this->res_id)
+			new = &((*new)->rb_right);
+		else
+			return -EEXIST;
+	}
+
+	/* Add new node and rebalance tree. */
+	rb_link_node(&res->node, parent, new);
+	rb_insert_color(&res->node, root);
+
+	return 0;
+}
+
 /* For Debug uses */
 static const char *ResourceType(enum mlx4_resource rt)
 {
@@ -228,8 +272,7 @@ int mlx4_init_resource_tracker(struct mlx4_dev *dev)
 	mlx4_dbg(dev, "Started init_resource_tracker: %ld slaves\n",
 		 dev->num_slaves);
 	for (i = 0 ; i < MLX4_NUM_OF_RESOURCE_TYPE; i++)
-		INIT_RADIX_TREE(&priv->mfunc.master.res_tracker.res_tree[i],
-				GFP_ATOMIC|__GFP_NOWARN);
+		priv->mfunc.master.res_tracker.res_tree[i] = RB_ROOT;
 
 	spin_lock_init(&priv->mfunc.master.res_tracker.lock);
 	return 0 ;
@@ -272,13 +315,13 @@ static int mpt_mask(struct mlx4_dev *dev)
 	return dev->caps.num_mpts - 1;
 }
 
-static void *find_res(struct mlx4_dev *dev, int res_id,
-		      enum mlx4_resource type)
+static struct res_common *find_res(struct mlx4_dev *dev, int res_id,
+				   enum mlx4_resource type)
 {
 	struct mlx4_priv *priv = mlx4_priv(dev);
 
-	return radix_tree_lookup(&priv->mfunc.master.res_tracker.res_tree[type],
-				 res_id);
+	return res_tracker_lookup(&priv->mfunc.master.res_tracker.res_tree[type],
+				  res_id);
 }
 
 static int get_res(struct mlx4_dev *dev, int slave, int res_id,
@@ -523,7 +566,7 @@ static int add_res_range(struct mlx4_dev *dev, int slave, int base, int count,
 	struct mlx4_priv *priv = mlx4_priv(dev);
 	struct res_common **res_arr;
 	struct mlx4_resource_tracker *tracker = &priv->mfunc.master.res_tracker;
-	struct radix_tree_root *root = &tracker->res_tree[type];
+	struct rb_root *root = &tracker->res_tree[type];
 
 	res_arr = kzalloc(count * sizeof *res_arr, GFP_KERNEL);
 	if (!res_arr)
@@ -546,7 +589,7 @@ static int add_res_range(struct mlx4_dev *dev, int slave, int base, int count,
 			err = -EEXIST;
 			goto undo;
 		}
-		err = radix_tree_insert(root, base + i, res_arr[i]);
+		err = res_tracker_insert(root, res_arr[i]);
 		if (err)
 			goto undo;
 		list_add_tail(&res_arr[i]->list,
@@ -559,7 +602,7 @@ static int add_res_range(struct mlx4_dev *dev, int slave, int base, int count,
 
 undo:
 	for (--i; i >= base; --i)
-		radix_tree_delete(&tracker->res_tree[type], i);
+		rb_erase(&res_arr[i]->node, root);
 
 	spin_unlock_irq(mlx4_tlock(dev));
 
@@ -695,7 +738,7 @@ static int rem_res_range(struct mlx4_dev *dev, int slave, int base, int count,
 
 	spin_lock_irq(mlx4_tlock(dev));
 	for (i = base; i < base + count; ++i) {
-		r = radix_tree_lookup(&tracker->res_tree[type], i);
+		r = res_tracker_lookup(&tracker->res_tree[type], i);
 		if (!r) {
 			err = -ENOENT;
 			goto out;
@@ -710,8 +753,8 @@ static int rem_res_range(struct mlx4_dev *dev, int slave, int base, int count,
 	}
 
 	for (i = base; i < base + count; ++i) {
-		r = radix_tree_lookup(&tracker->res_tree[type], i);
-		radix_tree_delete(&tracker->res_tree[type], i);
+		r = res_tracker_lookup(&tracker->res_tree[type], i);
+		rb_erase(&r->node, &tracker->res_tree[type]);
 		list_del(&r->list);
 		kfree(r);
 	}
@@ -733,7 +776,7 @@ static int qp_res_start_move_to(struct mlx4_dev *dev, int slave, int qpn,
 	int err = 0;
 
 	spin_lock_irq(mlx4_tlock(dev));
-	r = radix_tree_lookup(&tracker->res_tree[RES_QP], qpn);
+	r = (struct res_qp *)res_tracker_lookup(&tracker->res_tree[RES_QP], qpn);
 	if (!r)
 		err = -ENOENT;
 	else if (r->com.owner != slave)
@@ -797,7 +840,8 @@ static int mr_res_start_move_to(struct mlx4_dev *dev, int slave, int index,
 	int err = 0;
 
 	spin_lock_irq(mlx4_tlock(dev));
-	r = radix_tree_lookup(&tracker->res_tree[RES_MPT], index);
+	r = (struct res_mpt *)res_tracker_lookup(&tracker->res_tree[RES_MPT],
+					     index);
 	if (!r)
 		err = -ENOENT;
 	else if (r->com.owner != slave)
@@ -850,7 +894,7 @@ static int eq_res_start_move_to(struct mlx4_dev *dev, int slave, int index,
 	int err = 0;
 
 	spin_lock_irq(mlx4_tlock(dev));
-	r = radix_tree_lookup(&tracker->res_tree[RES_EQ], index);
+	r = (struct res_eq *)res_tracker_lookup(&tracker->res_tree[RES_EQ], index);
 	if (!r)
 		err = -ENOENT;
 	else if (r->com.owner != slave)
@@ -898,7 +942,7 @@ static int cq_res_start_move_to(struct mlx4_dev *dev, int slave, int cqn,
 	int err;
 
 	spin_lock_irq(mlx4_tlock(dev));
-	r = radix_tree_lookup(&tracker->res_tree[RES_CQ], cqn);
+	r = (struct res_cq *)res_tracker_lookup(&tracker->res_tree[RES_CQ], cqn);
 	if (!r)
 		err = -ENOENT;
 	else if (r->com.owner != slave)
@@ -952,7 +996,8 @@ static int srq_res_start_move_to(struct mlx4_dev *dev, int slave, int index,
 	int err = 0;
 
 	spin_lock_irq(mlx4_tlock(dev));
-	r = radix_tree_lookup(&tracker->res_tree[RES_SRQ], index);
+	r = (struct res_srq *)res_tracker_lookup(&tracker->res_tree[RES_SRQ],
+					     index);
 	if (!r)
 		err = -ENOENT;
 	else if (r->com.owner != slave)
@@ -1001,7 +1046,7 @@ static void res_abort_move(struct mlx4_dev *dev, int slave,
 	struct res_common *r;
 
 	spin_lock_irq(mlx4_tlock(dev));
-	r = radix_tree_lookup(&tracker->res_tree[type], id);
+	r = res_tracker_lookup(&tracker->res_tree[type], id);
 	if (r && (r->owner == slave))
 		r->state = r->from_state;
 	spin_unlock_irq(mlx4_tlock(dev));
@@ -1015,7 +1060,7 @@ static void res_end_move(struct mlx4_dev *dev, int slave,
 	struct res_common *r;
 
 	spin_lock_irq(mlx4_tlock(dev));
-	r = radix_tree_lookup(&tracker->res_tree[type], id);
+	r = res_tracker_lookup(&tracker->res_tree[type], id);
 	if (r && (r->owner == slave))
 		r->state = r->to_state;
 	spin_unlock_irq(mlx4_tlock(dev));
@@ -2817,8 +2862,8 @@ static void rem_slave_qps(struct mlx4_dev *dev, int slave)
 				switch (state) {
 				case RES_QP_RESERVED:
 					spin_lock_irq(mlx4_tlock(dev));
-					radix_tree_delete(&tracker->res_tree[RES_QP],
-							  qp->com.res_id);
+					rb_erase(&qp->com.node,
+						 &tracker->res_tree[RES_QP]);
 					list_del(&qp->com.list);
 					spin_unlock_irq(mlx4_tlock(dev));
 					kfree(qp);
@@ -2888,8 +2933,8 @@ static void rem_slave_srqs(struct mlx4_dev *dev, int slave)
 				case RES_SRQ_ALLOCATED:
 					__mlx4_srq_free_icm(dev, srqn);
 					spin_lock_irq(mlx4_tlock(dev));
-					radix_tree_delete(&tracker->res_tree[RES_SRQ],
-							  srqn);
+					rb_erase(&srq->com.node,
+						 &tracker->res_tree[RES_SRQ]);
 					list_del(&srq->com.list);
 					spin_unlock_irq(mlx4_tlock(dev));
 					kfree(srq);
@@ -2954,8 +2999,8 @@ static void rem_slave_cqs(struct mlx4_dev *dev, int slave)
 				case RES_CQ_ALLOCATED:
 					__mlx4_cq_free_icm(dev, cqn);
 					spin_lock_irq(mlx4_tlock(dev));
-					radix_tree_delete(&tracker->res_tree[RES_CQ],
-							  cqn);
+					rb_erase(&cq->com.node,
+						 &tracker->res_tree[RES_CQ]);
 					list_del(&cq->com.list);
 					spin_unlock_irq(mlx4_tlock(dev));
 					kfree(cq);
@@ -3017,8 +3062,8 @@ static void rem_slave_mrs(struct mlx4_dev *dev, int slave)
 				case RES_MPT_RESERVED:
 					__mlx4_mr_release(dev, mpt->key);
 					spin_lock_irq(mlx4_tlock(dev));
-					radix_tree_delete(&tracker->res_tree[RES_MPT],
-							  mptn);
+					rb_erase(&mpt->com.node,
+						 &tracker->res_tree[RES_MPT]);
 					list_del(&mpt->com.list);
 					spin_unlock_irq(mlx4_tlock(dev));
 					kfree(mpt);
@@ -3086,8 +3131,8 @@ static void rem_slave_mtts(struct mlx4_dev *dev, int slave)
 					__mlx4_free_mtt_range(dev, base,
 							      mtt->order);
 					spin_lock_irq(mlx4_tlock(dev));
-					radix_tree_delete(&tracker->res_tree[RES_MTT],
-							  base);
+					rb_erase(&mtt->com.node,
+						 &tracker->res_tree[RES_MTT]);
 					list_del(&mtt->com.list);
 					spin_unlock_irq(mlx4_tlock(dev));
 					kfree(mtt);
@@ -3133,8 +3178,8 @@ static void rem_slave_eqs(struct mlx4_dev *dev, int slave)
 				switch (state) {
 				case RES_EQ_RESERVED:
 					spin_lock_irq(mlx4_tlock(dev));
-					radix_tree_delete(&tracker->res_tree[RES_EQ],
-							  eqn);
+					rb_erase(&eq->com.node,
+						 &tracker->res_tree[RES_EQ]);
 					list_del(&eq->com.list);
 					spin_unlock_irq(mlx4_tlock(dev));
 					kfree(eq);
@@ -3191,7 +3236,8 @@ static void rem_slave_counters(struct mlx4_dev *dev, int slave)
 	list_for_each_entry_safe(counter, tmp, counter_list, com.list) {
 		if (counter->com.owner == slave) {
 			index = counter->com.res_id;
-			radix_tree_delete(&tracker->res_tree[RES_COUNTER], index);
+			rb_erase(&counter->com.node,
+				 &tracker->res_tree[RES_COUNTER]);
 			list_del(&counter->com.list);
 			kfree(counter);
 			__mlx4_counter_free(dev, index);
@@ -3220,7 +3266,7 @@ static void rem_slave_xrcdns(struct mlx4_dev *dev, int slave)
 	list_for_each_entry_safe(xrcd, tmp, xrcdn_list, com.list) {
 		if (xrcd->com.owner == slave) {
 			xrcdn = xrcd->com.res_id;
-			radix_tree_delete(&tracker->res_tree[RES_XRCD], xrcdn);
+			rb_erase(&xrcd->com.node, &tracker->res_tree[RES_XRCD]);
 			list_del(&xrcd->com.list);
 			kfree(xrcd);
 			__mlx4_xrcd_free(dev, xrcdn);
-- 
1.7.1

^ permalink raw reply related

* [PATCH net-next 00/10] net/mlx4: Add flow-steering support
From: Or Gerlitz @ 2012-07-01  9:43 UTC (permalink / raw)
  To: davem; +Cc: roland, yevgenyp, oren, netdev, Or Gerlitz, Hadar Hen Zion

Hi Dave, 

This patch series from Hadar adds code to manage L2/L3/L4 network 
flow steering rules, a feature which is supported by the ConnectX-3 device.

The series is built as follows:

The first two patches deal with SRIOV resource tracker, whose mechanism 
is changed to use red-black tree instead of radix tree. The reason for 
this change is that the coming steering patches use flow IDs which are 64 
bits in size, where radix tree keys can't be 64bit on 32bit architecture, 
while RB tree can do that.

Patch #3 is little re-design of the Ethernet driver multicast attachments 
flow to be more efficient and robust.

The fourth patch does a re-org of the checks that deal with the current 
"older" steering modes such that we can easily add soon the new steering 
mode and the code remains easy to manage.

Patch #5 adds the firmware commands for the new steering mode, which is 
called "device managed flow steeering".

Patch 6 is the main patch of this series. It adds support for device-managed flow 
steering all across the place. We had to have this patch also to touch the mlx4 
IB driver, since the steering mode is global to the HCA -- so when being enabled, 
multicast attachment calls done by the IB driver into the mlx4 core driver, 
are now routed to the flow steering firmware commands whose API is a bit different, 
something that the IB driver had to be aware to. Following that, the 7th patch 
adds resource tracking for device-managed flow steering rules.

The 8th patch adds promiscuous mode support under device-managed flow steering,
next, the 9th patch adds implementation for the ethtool APIs for attaching 
L2/L3/L4 based flow steering rules, and the last patch adds support for drop 
action through ethtool.

Or.

Hadar Hen Zion (9):
  net/mlx4_core: Change resource tracking mechanism to use red-black tree
  net/mlx4_core: Change resource tracking ID to be 64 bit
  net/mlx4: Set steering mode according to device capabilities
  net/mlx4_core: Add firmware commands to support device managed flow steering
  {NET,IB}/mlx4: Add device managed flow steering firmware API
  net/mlx4_core: Add resource tracking for device managed flow steering rules
  net/mlx4: Implement promiscuous mode with device managed flow-steering
  net/mlx4_en: Manage flow steering rules with ethtool
  net/mlx4_en: Add support for drop action through ethtool

Yevgeny Petrilin (1):
  net/mlx4_en: Re-design multicast attachments flow

 drivers/infiniband/hw/mlx4/main.c                  |   62 ++-
 drivers/infiniband/hw/mlx4/mlx4_ib.h               |    1 +
 drivers/infiniband/hw/mlx4/qp.c                    |    1 +
 drivers/net/ethernet/mellanox/mlx4/cmd.c           |   19 +
 drivers/net/ethernet/mellanox/mlx4/en_ethtool.c    |  373 ++++++++++++++
 drivers/net/ethernet/mellanox/mlx4/en_netdev.c     |  313 +++++++++---
 drivers/net/ethernet/mellanox/mlx4/en_rx.c         |   30 ++
 drivers/net/ethernet/mellanox/mlx4/fw.c            |   90 +++-
 drivers/net/ethernet/mellanox/mlx4/fw.h            |    3 +
 drivers/net/ethernet/mellanox/mlx4/main.c          |   60 ++-
 drivers/net/ethernet/mellanox/mlx4/mcg.c           |  524 ++++++++++++++++++--
 drivers/net/ethernet/mellanox/mlx4/mlx4.h          |   29 +-
 drivers/net/ethernet/mellanox/mlx4/mlx4_en.h       |   28 +-
 drivers/net/ethernet/mellanox/mlx4/port.c          |  111 +++--
 drivers/net/ethernet/mellanox/mlx4/profile.c       |   12 +-
 .../net/ethernet/mellanox/mlx4/resource_tracker.c  |  284 +++++++++--
 include/linux/mlx4/cmd.h                           |    4 +
 include/linux/mlx4/device.h                        |  136 +++++-
 18 files changed, 1848 insertions(+), 232 deletions(-)

-- 
1.7.1

Cc: Hadar Hen Zion <hadarh@mellanox.co.il>

^ permalink raw reply


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox