Netdev List

Netdev List
 help / color / mirror / Atom feed

* [PATCH 03/15] ipv4: Remove rt_key_{src,dst,tos} from struct rtable.
From: David Miller @ 2012-07-18 18:23 UTC (permalink / raw)
  To: netdev


They are always used in contexts where they can be reconstituted,
or where the finally resolved rt->rt_{src,dst} is semantically
equivalent.

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/route.h     |    5 -----
 net/ipv4/route.c        |   39 +++++++++------------------------------
 net/ipv4/xfrm4_policy.c |    3 ---
 3 files changed, 9 insertions(+), 38 deletions(-)

diff --git a/include/net/route.h b/include/net/route.h
index 5c86c47..935fa59 100644
--- a/include/net/route.h
+++ b/include/net/route.h
@@ -44,14 +44,9 @@ struct fib_info;
 struct rtable {
 	struct dst_entry	dst;
 
-	/* Lookup key. */
-	__be32			rt_key_dst;
-	__be32			rt_key_src;
-
 	int			rt_genid;
 	unsigned int		rt_flags;
 	__u16			rt_type;
-	__u8			rt_key_tos;
 
 	__be32			rt_dst;	/* Path destination	*/
 	__be32			rt_src;	/* Path source		*/
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 534bc4d..83e9663 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -1250,12 +1250,9 @@ static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
 #endif
 	rth->dst.output = ip_rt_bug;
 
-	rth->rt_key_dst	= daddr;
-	rth->rt_key_src	= saddr;
 	rth->rt_genid	= rt_genid(dev_net(dev));
 	rth->rt_flags	= RTCF_MULTICAST;
 	rth->rt_type	= RTN_MULTICAST;
-	rth->rt_key_tos	= tos;
 	rth->rt_dst	= daddr;
 	rth->rt_src	= saddr;
 	rth->rt_route_iif = dev->ifindex;
@@ -1374,12 +1371,9 @@ static int __mkroute_input(struct sk_buff *skb,
 		goto cleanup;
 	}
 
-	rth->rt_key_dst	= daddr;
-	rth->rt_key_src	= saddr;
 	rth->rt_genid = rt_genid(dev_net(rth->dst.dev));
 	rth->rt_flags = flags;
 	rth->rt_type = res->type;
-	rth->rt_key_tos	= tos;
 	rth->rt_dst	= daddr;
 	rth->rt_src	= saddr;
 	rth->rt_route_iif = in_dev->dev->ifindex;
@@ -1545,12 +1539,9 @@ local_input:
 	rth->dst.tclassid = itag;
 #endif
 
-	rth->rt_key_dst	= daddr;
-	rth->rt_key_src	= saddr;
 	rth->rt_genid = rt_genid(net);
 	rth->rt_flags 	= flags|RTCF_LOCAL;
 	rth->rt_type	= res.type;
-	rth->rt_key_tos	= tos;
 	rth->rt_dst	= daddr;
 	rth->rt_src	= saddr;
 	rth->rt_route_iif = dev->ifindex;
@@ -1650,9 +1641,7 @@ EXPORT_SYMBOL(ip_route_input);
 
 /* called with rcu_read_lock() */
 static struct rtable *__mkroute_output(const struct fib_result *res,
-				       const struct flowi4 *fl4,
-				       __be32 orig_daddr, __be32 orig_saddr,
-				       int orig_oif, __u8 orig_rtos,
+				       const struct flowi4 *fl4, int orig_oif,
 				       struct net_device *dev_out,
 				       unsigned int flags)
 {
@@ -1703,12 +1692,9 @@ static struct rtable *__mkroute_output(const struct fib_result *res,
 
 	rth->dst.output = ip_output;
 
-	rth->rt_key_dst	= orig_daddr;
-	rth->rt_key_src	= orig_saddr;
 	rth->rt_genid = rt_genid(dev_net(dev_out));
 	rth->rt_flags	= flags;
 	rth->rt_type	= type;
-	rth->rt_key_tos	= orig_rtos;
 	rth->rt_dst	= fl4->daddr;
 	rth->rt_src	= fl4->saddr;
 	rth->rt_route_iif = 0;
@@ -1759,16 +1745,12 @@ struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *fl4)
 	unsigned int flags = 0;
 	struct fib_result res;
 	struct rtable *rth;
-	__be32 orig_daddr;
-	__be32 orig_saddr;
 	int orig_oif;
 
 	res.tclassid	= 0;
 	res.fi		= NULL;
 	res.table	= NULL;
 
-	orig_daddr = fl4->daddr;
-	orig_saddr = fl4->saddr;
 	orig_oif = fl4->flowi4_oif;
 
 	fl4->flowi4_iif = net->loopback_dev->ifindex;
@@ -1930,8 +1912,7 @@ struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *fl4)
 
 
 make_route:
-	rth = __mkroute_output(&res, fl4, orig_daddr, orig_saddr, orig_oif,
-			       tos, dev_out, flags);
+	rth = __mkroute_output(&res, fl4, orig_oif, dev_out, flags);
 
 out:
 	rcu_read_unlock();
@@ -1996,9 +1977,6 @@ struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_or
 		if (new->dev)
 			dev_hold(new->dev);
 
-		rt->rt_key_dst = ort->rt_key_dst;
-		rt->rt_key_src = ort->rt_key_src;
-		rt->rt_key_tos = ort->rt_key_tos;
 		rt->rt_route_iif = ort->rt_route_iif;
 		rt->rt_iif = ort->rt_iif;
 		rt->rt_oif = ort->rt_oif;
@@ -2040,7 +2018,7 @@ struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4,
 }
 EXPORT_SYMBOL_GPL(ip_route_output_flow);
 
-static int rt_fill_info(struct net *net,
+static int rt_fill_info(struct net *net,  __be32 src, u8 tos,
 			struct sk_buff *skb, u32 pid, u32 seq, int event,
 			int nowait, unsigned int flags)
 {
@@ -2058,7 +2036,7 @@ static int rt_fill_info(struct net *net,
 	r->rtm_family	 = AF_INET;
 	r->rtm_dst_len	= 32;
 	r->rtm_src_len	= 0;
-	r->rtm_tos	= rt->rt_key_tos;
+	r->rtm_tos	= tos;
 	r->rtm_table	= RT_TABLE_MAIN;
 	if (nla_put_u32(skb, RTA_TABLE, RT_TABLE_MAIN))
 		goto nla_put_failure;
@@ -2071,9 +2049,9 @@ static int rt_fill_info(struct net *net,
 
 	if (nla_put_be32(skb, RTA_DST, rt->rt_dst))
 		goto nla_put_failure;
-	if (rt->rt_key_src) {
+	if (src) {
 		r->rtm_src_len = 32;
-		if (nla_put_be32(skb, RTA_SRC, rt->rt_key_src))
+		if (nla_put_be32(skb, RTA_SRC, src))
 			goto nla_put_failure;
 	}
 	if (rt->dst.dev &&
@@ -2085,7 +2063,7 @@ static int rt_fill_info(struct net *net,
 		goto nla_put_failure;
 #endif
 	if (!rt_is_input_route(rt) &&
-	    rt->rt_src != rt->rt_key_src) {
+	    rt->rt_src != src) {
 		if (nla_put_be32(skb, RTA_PREFSRC, rt->rt_src))
 			goto nla_put_failure;
 	}
@@ -2226,7 +2204,8 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, void
 	if (rtm->rtm_flags & RTM_F_NOTIFY)
 		rt->rt_flags |= RTCF_NOTIFY;
 
-	err = rt_fill_info(net, skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq,
+	err = rt_fill_info(net, src, rtm->rtm_tos, skb,
+			   NETLINK_CB(in_skb).pid, nlh->nlmsg_seq,
 			   RTM_NEWROUTE, 0, 0);
 	if (err <= 0)
 		goto errout_free;
diff --git a/net/ipv4/xfrm4_policy.c b/net/ipv4/xfrm4_policy.c
index fcf7678..2a8d5cf 100644
--- a/net/ipv4/xfrm4_policy.c
+++ b/net/ipv4/xfrm4_policy.c
@@ -79,9 +79,6 @@ static int xfrm4_fill_dst(struct xfrm_dst *xdst, struct net_device *dev,
 	struct rtable *rt = (struct rtable *)xdst->route;
 	const struct flowi4 *fl4 = &fl->u.ip4;
 
-	xdst->u.rt.rt_key_dst = fl4->daddr;
-	xdst->u.rt.rt_key_src = fl4->saddr;
-	xdst->u.rt.rt_key_tos = fl4->flowi4_tos;
 	xdst->u.rt.rt_route_iif = fl4->flowi4_iif;
 	xdst->u.rt.rt_iif = fl4->flowi4_iif;
 	xdst->u.rt.rt_oif = fl4->flowi4_oif;
-- 
1.7.10.4

^ permalink raw reply related

* [PATCH 02/15] ipv4: Kill ip_route_input_noref().
From: David Miller @ 2012-07-18 18:23 UTC (permalink / raw)
  To: netdev


The "noref" argument to ip_route_input_common() is now always ignored
because we do not cache routes, and in that case we must always grab
a reference to the resulting 'dst'.

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/route.h    |   16 ++--------------
 net/ipv4/arp.c         |    2 +-
 net/ipv4/ip_fragment.c |    4 ++--
 net/ipv4/ip_input.c    |    4 ++--
 net/ipv4/route.c       |    6 +++---
 net/ipv4/xfrm4_input.c |    4 ++--
 6 files changed, 12 insertions(+), 24 deletions(-)

diff --git a/include/net/route.h b/include/net/route.h
index 5dcfeb6..5c86c47 100644
--- a/include/net/route.h
+++ b/include/net/route.h
@@ -160,20 +160,8 @@ static inline struct rtable *ip_route_output_gre(struct net *net, struct flowi4
 	return ip_route_output_key(net, fl4);
 }
 
-extern int ip_route_input_common(struct sk_buff *skb, __be32 dst, __be32 src,
-				 u8 tos, struct net_device *devin, bool noref);
-
-static inline int ip_route_input(struct sk_buff *skb, __be32 dst, __be32 src,
-				 u8 tos, struct net_device *devin)
-{
-	return ip_route_input_common(skb, dst, src, tos, devin, false);
-}
-
-static inline int ip_route_input_noref(struct sk_buff *skb, __be32 dst, __be32 src,
-				       u8 tos, struct net_device *devin)
-{
-	return ip_route_input_common(skb, dst, src, tos, devin, true);
-}
+extern int ip_route_input(struct sk_buff *skb, __be32 dst, __be32 src,
+			  u8 tos, struct net_device *devin);
 
 extern void ipv4_update_pmtu(struct sk_buff *skb, struct net *net, u32 mtu,
 			     int oif, u32 mark, u8 protocol, int flow_flags);
diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c
index 2e560f0..c38293f 100644
--- a/net/ipv4/arp.c
+++ b/net/ipv4/arp.c
@@ -828,7 +828,7 @@ static int arp_process(struct sk_buff *skb)
 	}
 
 	if (arp->ar_op == htons(ARPOP_REQUEST) &&
-	    ip_route_input_noref(skb, tip, sip, 0, dev) == 0) {
+	    ip_route_input(skb, tip, sip, 0, dev) == 0) {
 
 		rt = skb_rtable(skb);
 		addr_type = rt->rt_type;
diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c
index 8d07c97..7ad88e5 100644
--- a/net/ipv4/ip_fragment.c
+++ b/net/ipv4/ip_fragment.c
@@ -258,8 +258,8 @@ static void ip_expire(unsigned long arg)
 		/* skb dst is stale, drop it, and perform route lookup again */
 		skb_dst_drop(head);
 		iph = ip_hdr(head);
-		err = ip_route_input_noref(head, iph->daddr, iph->saddr,
-					   iph->tos, head->dev);
+		err = ip_route_input(head, iph->daddr, iph->saddr,
+				     iph->tos, head->dev);
 		if (err)
 			goto out_rcu_unlock;
 
diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c
index b27d444..4ebc6fe 100644
--- a/net/ipv4/ip_input.c
+++ b/net/ipv4/ip_input.c
@@ -336,8 +336,8 @@ static int ip_rcv_finish(struct sk_buff *skb)
 	 *	how the packet travels inside Linux networking.
 	 */
 	if (!skb_dst(skb)) {
-		int err = ip_route_input_noref(skb, iph->daddr, iph->saddr,
-					       iph->tos, skb->dev);
+		int err = ip_route_input(skb, iph->daddr, iph->saddr,
+					 iph->tos, skb->dev);
 		if (unlikely(err)) {
 			if (err == -EXDEV)
 				NET_INC_STATS_BH(dev_net(skb->dev),
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 60ce756..534bc4d 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -1602,8 +1602,8 @@ martian_source_keep_err:
 	goto out;
 }
 
-int ip_route_input_common(struct sk_buff *skb, __be32 daddr, __be32 saddr,
-			   u8 tos, struct net_device *dev, bool noref)
+int ip_route_input(struct sk_buff *skb, __be32 daddr, __be32 saddr,
+		   u8 tos, struct net_device *dev)
 {
 	int res;
 
@@ -1646,7 +1646,7 @@ int ip_route_input_common(struct sk_buff *skb, __be32 daddr, __be32 saddr,
 	rcu_read_unlock();
 	return res;
 }
-EXPORT_SYMBOL(ip_route_input_common);
+EXPORT_SYMBOL(ip_route_input);
 
 /* called with rcu_read_lock() */
 static struct rtable *__mkroute_output(const struct fib_result *res,
diff --git a/net/ipv4/xfrm4_input.c b/net/ipv4/xfrm4_input.c
index 06814b6..58d23a5 100644
--- a/net/ipv4/xfrm4_input.c
+++ b/net/ipv4/xfrm4_input.c
@@ -27,8 +27,8 @@ static inline int xfrm4_rcv_encap_finish(struct sk_buff *skb)
 	if (skb_dst(skb) == NULL) {
 		const struct iphdr *iph = ip_hdr(skb);
 
-		if (ip_route_input_noref(skb, iph->daddr, iph->saddr,
-					 iph->tos, skb->dev))
+		if (ip_route_input(skb, iph->daddr, iph->saddr,
+				   iph->tos, skb->dev))
 			goto drop;
 	}
 	return dst_input(skb);
-- 
1.7.10.4

^ permalink raw reply related

* [PATCH 01/15] ipv4: Delete routing cache.
From: David Miller @ 2012-07-18 18:22 UTC (permalink / raw)
  To: netdev


Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/route.h     |    1 -
 net/ipv4/fib_frontend.c |    5 -
 net/ipv4/route.c        |  940 +----------------------------------------------
 3 files changed, 13 insertions(+), 933 deletions(-)

diff --git a/include/net/route.h b/include/net/route.h
index ace3cb4..5dcfeb6 100644
--- a/include/net/route.h
+++ b/include/net/route.h
@@ -109,7 +109,6 @@ extern struct ip_rt_acct __percpu *ip_rt_acct;
 struct in_device;
 extern int		ip_rt_init(void);
 extern void		rt_cache_flush(struct net *net, int how);
-extern void		rt_cache_flush_batch(struct net *net);
 extern struct rtable *__ip_route_output_key(struct net *, struct flowi4 *flp);
 extern struct rtable *ip_route_output_flow(struct net *, struct flowi4 *flp,
 					   struct sock *sk);
diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c
index 7a31194..c1fde53 100644
--- a/net/ipv4/fib_frontend.c
+++ b/net/ipv4/fib_frontend.c
@@ -1071,11 +1071,6 @@ static int fib_netdev_event(struct notifier_block *this, unsigned long event, vo
 		rt_cache_flush(dev_net(dev), 0);
 		break;
 	case NETDEV_UNREGISTER_BATCH:
-		/* The batch unregister is only called on the first
-		 * device in the list of devices being unregistered.
-		 * Therefore we should not pass dev_net(dev) in here.
-		 */
-		rt_cache_flush_batch(NULL);
 		break;
 	}
 	return NOTIFY_DONE;
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index f67e702..60ce756 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -133,10 +133,6 @@ static int ip_rt_gc_elasticity __read_mostly	= 8;
 static int ip_rt_mtu_expires __read_mostly	= 10 * 60 * HZ;
 static int ip_rt_min_pmtu __read_mostly		= 512 + 20 + 20;
 static int ip_rt_min_advmss __read_mostly	= 256;
-static int rt_chain_length_max __read_mostly	= 20;
-
-static struct delayed_work expires_work;
-static unsigned long expires_ljiffies;
 
 /*
  *	Interface to generic destination cache.
@@ -152,7 +148,6 @@ static void		 ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
 					   struct sk_buff *skb, u32 mtu);
 static void		 ip_do_redirect(struct dst_entry *dst, struct sock *sk,
 					struct sk_buff *skb);
-static int rt_garbage_collect(struct dst_ops *ops);
 
 static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
 			    int how)
@@ -172,7 +167,6 @@ static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
 static struct dst_ops ipv4_dst_ops = {
 	.family =		AF_INET,
 	.protocol =		cpu_to_be16(ETH_P_IP),
-	.gc =			rt_garbage_collect,
 	.check =		ipv4_dst_check,
 	.default_advmss =	ipv4_default_advmss,
 	.mtu =			ipv4_mtu,
@@ -209,184 +203,30 @@ const __u8 ip_tos2prio[16] = {
 };
 EXPORT_SYMBOL(ip_tos2prio);
 
-/*
- * Route cache.
- */
-
-/* The locking scheme is rather straight forward:
- *
- * 1) Read-Copy Update protects the buckets of the central route hash.
- * 2) Only writers remove entries, and they hold the lock
- *    as they look at rtable reference counts.
- * 3) Only readers acquire references to rtable entries,
- *    they do so with atomic increments and with the
- *    lock held.
- */
-
-struct rt_hash_bucket {
-	struct rtable __rcu	*chain;
-};
-
-#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) || \
-	defined(CONFIG_PROVE_LOCKING)
-/*
- * Instead of using one spinlock for each rt_hash_bucket, we use a table of spinlocks
- * The size of this table is a power of two and depends on the number of CPUS.
- * (on lockdep we have a quite big spinlock_t, so keep the size down there)
- */
-#ifdef CONFIG_LOCKDEP
-# define RT_HASH_LOCK_SZ	256
-#else
-# if NR_CPUS >= 32
-#  define RT_HASH_LOCK_SZ	4096
-# elif NR_CPUS >= 16
-#  define RT_HASH_LOCK_SZ	2048
-# elif NR_CPUS >= 8
-#  define RT_HASH_LOCK_SZ	1024
-# elif NR_CPUS >= 4
-#  define RT_HASH_LOCK_SZ	512
-# else
-#  define RT_HASH_LOCK_SZ	256
-# endif
-#endif
-
-static spinlock_t	*rt_hash_locks;
-# define rt_hash_lock_addr(slot) &rt_hash_locks[(slot) & (RT_HASH_LOCK_SZ - 1)]
-
-static __init void rt_hash_lock_init(void)
-{
-	int i;
-
-	rt_hash_locks = kmalloc(sizeof(spinlock_t) * RT_HASH_LOCK_SZ,
-			GFP_KERNEL);
-	if (!rt_hash_locks)
-		panic("IP: failed to allocate rt_hash_locks\n");
-
-	for (i = 0; i < RT_HASH_LOCK_SZ; i++)
-		spin_lock_init(&rt_hash_locks[i]);
-}
-#else
-# define rt_hash_lock_addr(slot) NULL
-
-static inline void rt_hash_lock_init(void)
-{
-}
-#endif
-
-static struct rt_hash_bucket 	*rt_hash_table __read_mostly;
-static unsigned int		rt_hash_mask __read_mostly;
-static unsigned int		rt_hash_log  __read_mostly;
-
 static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
 #define RT_CACHE_STAT_INC(field) __this_cpu_inc(rt_cache_stat.field)
 
-static inline unsigned int rt_hash(__be32 daddr, __be32 saddr, int idx,
-				   int genid)
-{
-	return jhash_3words((__force u32)daddr, (__force u32)saddr,
-			    idx, genid)
-		& rt_hash_mask;
-}
-
 static inline int rt_genid(struct net *net)
 {
 	return atomic_read(&net->ipv4.rt_genid);
 }
 
 #ifdef CONFIG_PROC_FS
-struct rt_cache_iter_state {
-	struct seq_net_private p;
-	int bucket;
-	int genid;
-};
-
-static struct rtable *rt_cache_get_first(struct seq_file *seq)
-{
-	struct rt_cache_iter_state *st = seq->private;
-	struct rtable *r = NULL;
-
-	for (st->bucket = rt_hash_mask; st->bucket >= 0; --st->bucket) {
-		if (!rcu_access_pointer(rt_hash_table[st->bucket].chain))
-			continue;
-		rcu_read_lock_bh();
-		r = rcu_dereference_bh(rt_hash_table[st->bucket].chain);
-		while (r) {
-			if (dev_net(r->dst.dev) == seq_file_net(seq) &&
-			    r->rt_genid == st->genid)
-				return r;
-			r = rcu_dereference_bh(r->dst.rt_next);
-		}
-		rcu_read_unlock_bh();
-	}
-	return r;
-}
-
-static struct rtable *__rt_cache_get_next(struct seq_file *seq,
-					  struct rtable *r)
-{
-	struct rt_cache_iter_state *st = seq->private;
-
-	r = rcu_dereference_bh(r->dst.rt_next);
-	while (!r) {
-		rcu_read_unlock_bh();
-		do {
-			if (--st->bucket < 0)
-				return NULL;
-		} while (!rcu_access_pointer(rt_hash_table[st->bucket].chain));
-		rcu_read_lock_bh();
-		r = rcu_dereference_bh(rt_hash_table[st->bucket].chain);
-	}
-	return r;
-}
-
-static struct rtable *rt_cache_get_next(struct seq_file *seq,
-					struct rtable *r)
-{
-	struct rt_cache_iter_state *st = seq->private;
-	while ((r = __rt_cache_get_next(seq, r)) != NULL) {
-		if (dev_net(r->dst.dev) != seq_file_net(seq))
-			continue;
-		if (r->rt_genid == st->genid)
-			break;
-	}
-	return r;
-}
-
-static struct rtable *rt_cache_get_idx(struct seq_file *seq, loff_t pos)
-{
-	struct rtable *r = rt_cache_get_first(seq);
-
-	if (r)
-		while (pos && (r = rt_cache_get_next(seq, r)))
-			--pos;
-	return pos ? NULL : r;
-}
-
 static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
 {
-	struct rt_cache_iter_state *st = seq->private;
 	if (*pos)
-		return rt_cache_get_idx(seq, *pos - 1);
-	st->genid = rt_genid(seq_file_net(seq));
+		return NULL;
 	return SEQ_START_TOKEN;
 }
 
 static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 {
-	struct rtable *r;
-
-	if (v == SEQ_START_TOKEN)
-		r = rt_cache_get_first(seq);
-	else
-		r = rt_cache_get_next(seq, v);
 	++*pos;
-	return r;
+	return NULL;
 }
 
 static void rt_cache_seq_stop(struct seq_file *seq, void *v)
 {
-	if (v && v != SEQ_START_TOKEN)
-		rcu_read_unlock_bh();
 }
 
 static int rt_cache_seq_show(struct seq_file *seq, void *v)
@@ -396,24 +236,6 @@ static int rt_cache_seq_show(struct seq_file *seq, void *v)
 			   "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
 			   "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
 			   "HHUptod\tSpecDst");
-	else {
-		struct rtable *r = v;
-		int len;
-
-		seq_printf(seq, "%s\t%08X\t%08X\t%8X\t%d\t%u\t%d\t"
-			   "%08X\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X%n",
-			   r->dst.dev ? r->dst.dev->name : "*",
-			   (__force u32)r->rt_dst,
-			   (__force u32)r->rt_gateway,
-			   r->rt_flags, atomic_read(&r->dst.__refcnt),
-			   r->dst.__use, 0, (__force u32)r->rt_src,
-			   dst_metric_advmss(&r->dst) + 40,
-			   dst_metric(&r->dst, RTAX_WINDOW), 0,
-			   r->rt_key_tos,
-			   -1, 0, 0, &len);
-
-		seq_printf(seq, "%*s\n", 127 - len, "");
-	}
 	return 0;
 }
 
@@ -426,8 +248,7 @@ static const struct seq_operations rt_cache_seq_ops = {
 
 static int rt_cache_seq_open(struct inode *inode, struct file *file)
 {
-	return seq_open_net(inode, file, &rt_cache_seq_ops,
-			sizeof(struct rt_cache_iter_state));
+	return seq_open(file, &rt_cache_seq_ops);
 }
 
 static const struct file_operations rt_cache_seq_fops = {
@@ -435,7 +256,7 @@ static const struct file_operations rt_cache_seq_fops = {
 	.open	 = rt_cache_seq_open,
 	.read	 = seq_read,
 	.llseek	 = seq_lseek,
-	.release = seq_release_net,
+	.release = seq_release,
 };
 
 
@@ -625,263 +446,12 @@ static inline int ip_rt_proc_init(void)
 }
 #endif /* CONFIG_PROC_FS */
 
-static inline void rt_free(struct rtable *rt)
-{
-	call_rcu_bh(&rt->dst.rcu_head, dst_rcu_free);
-}
-
-static inline void rt_drop(struct rtable *rt)
-{
-	ip_rt_put(rt);
-	call_rcu_bh(&rt->dst.rcu_head, dst_rcu_free);
-}
-
-static inline int rt_fast_clean(struct rtable *rth)
-{
-	/* Kill broadcast/multicast entries very aggresively, if they
-	   collide in hash table with more useful entries */
-	return (rth->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) &&
-		rt_is_input_route(rth) && rth->dst.rt_next;
-}
-
-static inline int rt_valuable(struct rtable *rth)
-{
-	return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) ||
-		rth->dst.expires;
-}
-
-static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2)
-{
-	unsigned long age;
-	int ret = 0;
-
-	if (atomic_read(&rth->dst.__refcnt))
-		goto out;
-
-	age = jiffies - rth->dst.lastuse;
-	if ((age <= tmo1 && !rt_fast_clean(rth)) ||
-	    (age <= tmo2 && rt_valuable(rth)))
-		goto out;
-	ret = 1;
-out:	return ret;
-}
-
-/* Bits of score are:
- * 31: very valuable
- * 30: not quite useless
- * 29..0: usage counter
- */
-static inline u32 rt_score(struct rtable *rt)
-{
-	u32 score = jiffies - rt->dst.lastuse;
-
-	score = ~score & ~(3<<30);
-
-	if (rt_valuable(rt))
-		score |= (1<<31);
-
-	if (rt_is_output_route(rt) ||
-	    !(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST|RTCF_LOCAL)))
-		score |= (1<<30);
-
-	return score;
-}
-
-static inline bool rt_caching(const struct net *net)
-{
-	return net->ipv4.current_rt_cache_rebuild_count <=
-		net->ipv4.sysctl_rt_cache_rebuild_count;
-}
-
-static inline bool compare_hash_inputs(const struct rtable *rt1,
-				       const struct rtable *rt2)
-{
-	return ((((__force u32)rt1->rt_key_dst ^ (__force u32)rt2->rt_key_dst) |
-		((__force u32)rt1->rt_key_src ^ (__force u32)rt2->rt_key_src) |
-		(rt1->rt_route_iif ^ rt2->rt_route_iif)) == 0);
-}
-
-static inline int compare_keys(struct rtable *rt1, struct rtable *rt2)
-{
-	return (((__force u32)rt1->rt_key_dst ^ (__force u32)rt2->rt_key_dst) |
-		((__force u32)rt1->rt_key_src ^ (__force u32)rt2->rt_key_src) |
-		(rt1->rt_mark ^ rt2->rt_mark) |
-		(rt1->rt_key_tos ^ rt2->rt_key_tos) |
-		(rt1->rt_route_iif ^ rt2->rt_route_iif) |
-		(rt1->rt_oif ^ rt2->rt_oif)) == 0;
-}
-
-static inline int compare_netns(struct rtable *rt1, struct rtable *rt2)
-{
-	return net_eq(dev_net(rt1->dst.dev), dev_net(rt2->dst.dev));
-}
-
 static inline int rt_is_expired(struct rtable *rth)
 {
 	return rth->rt_genid != rt_genid(dev_net(rth->dst.dev));
 }
 
 /*
- * Perform a full scan of hash table and free all entries.
- * Can be called by a softirq or a process.
- * In the later case, we want to be reschedule if necessary
- */
-static void rt_do_flush(struct net *net, int process_context)
-{
-	unsigned int i;
-	struct rtable *rth, *next;
-
-	for (i = 0; i <= rt_hash_mask; i++) {
-		struct rtable __rcu **pprev;
-		struct rtable *list;
-
-		if (process_context && need_resched())
-			cond_resched();
-		rth = rcu_access_pointer(rt_hash_table[i].chain);
-		if (!rth)
-			continue;
-
-		spin_lock_bh(rt_hash_lock_addr(i));
-
-		list = NULL;
-		pprev = &rt_hash_table[i].chain;
-		rth = rcu_dereference_protected(*pprev,
-			lockdep_is_held(rt_hash_lock_addr(i)));
-
-		while (rth) {
-			next = rcu_dereference_protected(rth->dst.rt_next,
-				lockdep_is_held(rt_hash_lock_addr(i)));
-
-			if (!net ||
-			    net_eq(dev_net(rth->dst.dev), net)) {
-				rcu_assign_pointer(*pprev, next);
-				rcu_assign_pointer(rth->dst.rt_next, list);
-				list = rth;
-			} else {
-				pprev = &rth->dst.rt_next;
-			}
-			rth = next;
-		}
-
-		spin_unlock_bh(rt_hash_lock_addr(i));
-
-		for (; list; list = next) {
-			next = rcu_dereference_protected(list->dst.rt_next, 1);
-			rt_free(list);
-		}
-	}
-}
-
-/*
- * While freeing expired entries, we compute average chain length
- * and standard deviation, using fixed-point arithmetic.
- * This to have an estimation of rt_chain_length_max
- *  rt_chain_length_max = max(elasticity, AVG + 4*SD)
- * We use 3 bits for frational part, and 29 (or 61) for magnitude.
- */
-
-#define FRACT_BITS 3
-#define ONE (1UL << FRACT_BITS)
-
-/*
- * Given a hash chain and an item in this hash chain,
- * find if a previous entry has the same hash_inputs
- * (but differs on tos, mark or oif)
- * Returns 0 if an alias is found.
- * Returns ONE if rth has no alias before itself.
- */
-static int has_noalias(const struct rtable *head, const struct rtable *rth)
-{
-	const struct rtable *aux = head;
-
-	while (aux != rth) {
-		if (compare_hash_inputs(aux, rth))
-			return 0;
-		aux = rcu_dereference_protected(aux->dst.rt_next, 1);
-	}
-	return ONE;
-}
-
-static void rt_check_expire(void)
-{
-	static unsigned int rover;
-	unsigned int i = rover, goal;
-	struct rtable *rth;
-	struct rtable __rcu **rthp;
-	unsigned long samples = 0;
-	unsigned long sum = 0, sum2 = 0;
-	unsigned long delta;
-	u64 mult;
-
-	delta = jiffies - expires_ljiffies;
-	expires_ljiffies = jiffies;
-	mult = ((u64)delta) << rt_hash_log;
-	if (ip_rt_gc_timeout > 1)
-		do_div(mult, ip_rt_gc_timeout);
-	goal = (unsigned int)mult;
-	if (goal > rt_hash_mask)
-		goal = rt_hash_mask + 1;
-	for (; goal > 0; goal--) {
-		unsigned long tmo = ip_rt_gc_timeout;
-		unsigned long length;
-
-		i = (i + 1) & rt_hash_mask;
-		rthp = &rt_hash_table[i].chain;
-
-		if (need_resched())
-			cond_resched();
-
-		samples++;
-
-		if (rcu_dereference_raw(*rthp) == NULL)
-			continue;
-		length = 0;
-		spin_lock_bh(rt_hash_lock_addr(i));
-		while ((rth = rcu_dereference_protected(*rthp,
-					lockdep_is_held(rt_hash_lock_addr(i)))) != NULL) {
-			prefetch(rth->dst.rt_next);
-			if (rt_is_expired(rth) ||
-			    rt_may_expire(rth, tmo, ip_rt_gc_timeout)) {
-				*rthp = rth->dst.rt_next;
-				rt_free(rth);
-				continue;
-			}
-
-			/* We only count entries on a chain with equal
-			 * hash inputs once so that entries for
-			 * different QOS levels, and other non-hash
-			 * input attributes don't unfairly skew the
-			 * length computation
-			 */
-			tmo >>= 1;
-			rthp = &rth->dst.rt_next;
-			length += has_noalias(rt_hash_table[i].chain, rth);
-		}
-		spin_unlock_bh(rt_hash_lock_addr(i));
-		sum += length;
-		sum2 += length*length;
-	}
-	if (samples) {
-		unsigned long avg = sum / samples;
-		unsigned long sd = int_sqrt(sum2 / samples - avg*avg);
-		rt_chain_length_max = max_t(unsigned long,
-					ip_rt_gc_elasticity,
-					(avg + 4*sd) >> FRACT_BITS);
-	}
-	rover = i;
-}
-
-/*
- * rt_worker_func() is run in process context.
- * we call rt_check_expire() to scan part of the hash table
- */
-static void rt_worker_func(struct work_struct *work)
-{
-	rt_check_expire();
-	schedule_delayed_work(&expires_work, ip_rt_gc_interval);
-}
-
-/*
  * Perturbation of rt_genid by a small quantity [1..256]
  * Using 8 bits of shuffling ensure we can call rt_cache_invalidate()
  * many times (2^24) without giving recent rt_genid.
@@ -902,167 +472,6 @@ static void rt_cache_invalidate(struct net *net)
 void rt_cache_flush(struct net *net, int delay)
 {
 	rt_cache_invalidate(net);
-	if (delay >= 0)
-		rt_do_flush(net, !in_softirq());
-}
-
-/* Flush previous cache invalidated entries from the cache */
-void rt_cache_flush_batch(struct net *net)
-{
-	rt_do_flush(net, !in_softirq());
-}
-
-static void rt_emergency_hash_rebuild(struct net *net)
-{
-	net_warn_ratelimited("Route hash chain too long!\n");
-	rt_cache_invalidate(net);
-}
-
-/*
-   Short description of GC goals.
-
-   We want to build algorithm, which will keep routing cache
-   at some equilibrium point, when number of aged off entries
-   is kept approximately equal to newly generated ones.
-
-   Current expiration strength is variable "expire".
-   We try to adjust it dynamically, so that if networking
-   is idle expires is large enough to keep enough of warm entries,
-   and when load increases it reduces to limit cache size.
- */
-
-static int rt_garbage_collect(struct dst_ops *ops)
-{
-	static unsigned long expire = RT_GC_TIMEOUT;
-	static unsigned long last_gc;
-	static int rover;
-	static int equilibrium;
-	struct rtable *rth;
-	struct rtable __rcu **rthp;
-	unsigned long now = jiffies;
-	int goal;
-	int entries = dst_entries_get_fast(&ipv4_dst_ops);
-
-	/*
-	 * Garbage collection is pretty expensive,
-	 * do not make it too frequently.
-	 */
-
-	RT_CACHE_STAT_INC(gc_total);
-
-	if (now - last_gc < ip_rt_gc_min_interval &&
-	    entries < ip_rt_max_size) {
-		RT_CACHE_STAT_INC(gc_ignored);
-		goto out;
-	}
-
-	entries = dst_entries_get_slow(&ipv4_dst_ops);
-	/* Calculate number of entries, which we want to expire now. */
-	goal = entries - (ip_rt_gc_elasticity << rt_hash_log);
-	if (goal <= 0) {
-		if (equilibrium < ipv4_dst_ops.gc_thresh)
-			equilibrium = ipv4_dst_ops.gc_thresh;
-		goal = entries - equilibrium;
-		if (goal > 0) {
-			equilibrium += min_t(unsigned int, goal >> 1, rt_hash_mask + 1);
-			goal = entries - equilibrium;
-		}
-	} else {
-		/* We are in dangerous area. Try to reduce cache really
-		 * aggressively.
-		 */
-		goal = max_t(unsigned int, goal >> 1, rt_hash_mask + 1);
-		equilibrium = entries - goal;
-	}
-
-	if (now - last_gc >= ip_rt_gc_min_interval)
-		last_gc = now;
-
-	if (goal <= 0) {
-		equilibrium += goal;
-		goto work_done;
-	}
-
-	do {
-		int i, k;
-
-		for (i = rt_hash_mask, k = rover; i >= 0; i--) {
-			unsigned long tmo = expire;
-
-			k = (k + 1) & rt_hash_mask;
-			rthp = &rt_hash_table[k].chain;
-			spin_lock_bh(rt_hash_lock_addr(k));
-			while ((rth = rcu_dereference_protected(*rthp,
-					lockdep_is_held(rt_hash_lock_addr(k)))) != NULL) {
-				if (!rt_is_expired(rth) &&
-					!rt_may_expire(rth, tmo, expire)) {
-					tmo >>= 1;
-					rthp = &rth->dst.rt_next;
-					continue;
-				}
-				*rthp = rth->dst.rt_next;
-				rt_free(rth);
-				goal--;
-			}
-			spin_unlock_bh(rt_hash_lock_addr(k));
-			if (goal <= 0)
-				break;
-		}
-		rover = k;
-
-		if (goal <= 0)
-			goto work_done;
-
-		/* Goal is not achieved. We stop process if:
-
-		   - if expire reduced to zero. Otherwise, expire is halfed.
-		   - if table is not full.
-		   - if we are called from interrupt.
-		   - jiffies check is just fallback/debug loop breaker.
-		     We will not spin here for long time in any case.
-		 */
-
-		RT_CACHE_STAT_INC(gc_goal_miss);
-
-		if (expire == 0)
-			break;
-
-		expire >>= 1;
-
-		if (dst_entries_get_fast(&ipv4_dst_ops) < ip_rt_max_size)
-			goto out;
-	} while (!in_softirq() && time_before_eq(jiffies, now));
-
-	if (dst_entries_get_fast(&ipv4_dst_ops) < ip_rt_max_size)
-		goto out;
-	if (dst_entries_get_slow(&ipv4_dst_ops) < ip_rt_max_size)
-		goto out;
-	net_warn_ratelimited("dst cache overflow\n");
-	RT_CACHE_STAT_INC(gc_dst_overflow);
-	return 1;
-
-work_done:
-	expire += ip_rt_gc_min_interval;
-	if (expire > ip_rt_gc_timeout ||
-	    dst_entries_get_fast(&ipv4_dst_ops) < ipv4_dst_ops.gc_thresh ||
-	    dst_entries_get_slow(&ipv4_dst_ops) < ipv4_dst_ops.gc_thresh)
-		expire = ip_rt_gc_timeout;
-out:	return 0;
-}
-
-/*
- * Returns number of entries in a hash chain that have different hash_inputs
- */
-static int slow_chain_length(const struct rtable *head)
-{
-	int length = 0;
-	const struct rtable *rth = head;
-
-	while (rth) {
-		length += has_noalias(head, rth);
-		rth = rcu_dereference_protected(rth->dst.rt_next, 1);
-	}
-	return length >> FRACT_BITS;
 }
 
 static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
@@ -1086,139 +495,6 @@ static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
 	return neigh_create(&arp_tbl, pkey, dev);
 }
 
-static struct rtable *rt_intern_hash(unsigned int hash, struct rtable *rt,
-				     struct sk_buff *skb, int ifindex)
-{
-	struct rtable	*rth, *cand;
-	struct rtable __rcu **rthp, **candp;
-	unsigned long	now;
-	u32 		min_score;
-	int		chain_length;
-
-restart:
-	chain_length = 0;
-	min_score = ~(u32)0;
-	cand = NULL;
-	candp = NULL;
-	now = jiffies;
-
-	if (!rt_caching(dev_net(rt->dst.dev)) || (rt->dst.flags & DST_NOCACHE)) {
-		/*
-		 * If we're not caching, just tell the caller we
-		 * were successful and don't touch the route.  The
-		 * caller hold the sole reference to the cache entry, and
-		 * it will be released when the caller is done with it.
-		 * If we drop it here, the callers have no way to resolve routes
-		 * when we're not caching.  Instead, just point *rp at rt, so
-		 * the caller gets a single use out of the route
-		 * Note that we do rt_free on this new route entry, so that
-		 * once its refcount hits zero, we are still able to reap it
-		 * (Thanks Alexey)
-		 * Note: To avoid expensive rcu stuff for this uncached dst,
-		 * we set DST_NOCACHE so that dst_release() can free dst without
-		 * waiting a grace period.
-		 */
-
-		rt->dst.flags |= DST_NOCACHE;
-		goto skip_hashing;
-	}
-
-	rthp = &rt_hash_table[hash].chain;
-
-	spin_lock_bh(rt_hash_lock_addr(hash));
-	while ((rth = rcu_dereference_protected(*rthp,
-			lockdep_is_held(rt_hash_lock_addr(hash)))) != NULL) {
-		if (rt_is_expired(rth)) {
-			*rthp = rth->dst.rt_next;
-			rt_free(rth);
-			continue;
-		}
-		if (compare_keys(rth, rt) && compare_netns(rth, rt)) {
-			/* Put it first */
-			*rthp = rth->dst.rt_next;
-			/*
-			 * Since lookup is lockfree, the deletion
-			 * must be visible to another weakly ordered CPU before
-			 * the insertion at the start of the hash chain.
-			 */
-			rcu_assign_pointer(rth->dst.rt_next,
-					   rt_hash_table[hash].chain);
-			/*
-			 * Since lookup is lockfree, the update writes
-			 * must be ordered for consistency on SMP.
-			 */
-			rcu_assign_pointer(rt_hash_table[hash].chain, rth);
-
-			dst_use(&rth->dst, now);
-			spin_unlock_bh(rt_hash_lock_addr(hash));
-
-			rt_drop(rt);
-			if (skb)
-				skb_dst_set(skb, &rth->dst);
-			return rth;
-		}
-
-		if (!atomic_read(&rth->dst.__refcnt)) {
-			u32 score = rt_score(rth);
-
-			if (score <= min_score) {
-				cand = rth;
-				candp = rthp;
-				min_score = score;
-			}
-		}
-
-		chain_length++;
-
-		rthp = &rth->dst.rt_next;
-	}
-
-	if (cand) {
-		/* ip_rt_gc_elasticity used to be average length of chain
-		 * length, when exceeded gc becomes really aggressive.
-		 *
-		 * The second limit is less certain. At the moment it allows
-		 * only 2 entries per bucket. We will see.
-		 */
-		if (chain_length > ip_rt_gc_elasticity) {
-			*candp = cand->dst.rt_next;
-			rt_free(cand);
-		}
-	} else {
-		if (chain_length > rt_chain_length_max &&
-		    slow_chain_length(rt_hash_table[hash].chain) > rt_chain_length_max) {
-			struct net *net = dev_net(rt->dst.dev);
-			int num = ++net->ipv4.current_rt_cache_rebuild_count;
-			if (!rt_caching(net)) {
-				pr_warn("%s: %d rebuilds is over limit, route caching disabled\n",
-					rt->dst.dev->name, num);
-			}
-			rt_emergency_hash_rebuild(net);
-			spin_unlock_bh(rt_hash_lock_addr(hash));
-
-			hash = rt_hash(rt->rt_key_dst, rt->rt_key_src,
-					ifindex, rt_genid(net));
-			goto restart;
-		}
-	}
-
-	rt->dst.rt_next = rt_hash_table[hash].chain;
-
-	/*
-	 * Since lookup is lockfree, we must make sure
-	 * previous writes to rt are committed to memory
-	 * before making rt visible to other CPUS.
-	 */
-	rcu_assign_pointer(rt_hash_table[hash].chain, rt);
-
-	spin_unlock_bh(rt_hash_lock_addr(hash));
-
-skip_hashing:
-	if (skb)
-		skb_dst_set(skb, &rt->dst);
-	return rt;
-}
-
 /*
  * Peer allocation may fail only in serious out-of-memory conditions.  However
  * we still can generate some output.
@@ -1255,26 +531,6 @@ void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
 }
 EXPORT_SYMBOL(__ip_select_ident);
 
-static void rt_del(unsigned int hash, struct rtable *rt)
-{
-	struct rtable __rcu **rthp;
-	struct rtable *aux;
-
-	rthp = &rt_hash_table[hash].chain;
-	spin_lock_bh(rt_hash_lock_addr(hash));
-	ip_rt_put(rt);
-	while ((aux = rcu_dereference_protected(*rthp,
-			lockdep_is_held(rt_hash_lock_addr(hash)))) != NULL) {
-		if (aux == rt || rt_is_expired(aux)) {
-			*rthp = aux->dst.rt_next;
-			rt_free(aux);
-			continue;
-		}
-		rthp = &aux->dst.rt_next;
-	}
-	spin_unlock_bh(rt_hash_lock_addr(hash));
-}
-
 static void __build_flow_key(struct flowi4 *fl4, const struct sock *sk,
 			     const struct iphdr *iph,
 			     int oif, u8 tos,
@@ -1506,10 +762,7 @@ static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
 			ret = NULL;
 		} else if ((rt->rt_flags & RTCF_REDIRECTED) ||
 			   rt->dst.expires) {
-			unsigned int hash = rt_hash(rt->rt_key_dst, rt->rt_key_src,
-						rt->rt_oif,
-						rt_genid(dev_net(dst->dev)));
-			rt_del(hash, rt);
+			ip_rt_put(rt);
 			ret = NULL;
 		}
 	}
@@ -1951,7 +1204,7 @@ static struct rtable *rt_dst_alloc(struct net_device *dev,
 				   bool nopolicy, bool noxfrm)
 {
 	return dst_alloc(&ipv4_dst_ops, dev, 1, -1,
-			 DST_HOST |
+			 DST_HOST | DST_NOCACHE |
 			 (nopolicy ? DST_NOPOLICY : 0) |
 			 (noxfrm ? DST_NOXFRM : 0));
 }
@@ -1960,7 +1213,6 @@ static struct rtable *rt_dst_alloc(struct net_device *dev,
 static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
 				u8 tos, struct net_device *dev, int our)
 {
-	unsigned int hash;
 	struct rtable *rth;
 	struct in_device *in_dev = __in_dev_get_rcu(dev);
 	u32 itag = 0;
@@ -2024,9 +1276,8 @@ static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
 #endif
 	RT_CACHE_STAT_INC(in_slow_mc);
 
-	hash = rt_hash(daddr, saddr, dev->ifindex, rt_genid(dev_net(dev)));
-	rth = rt_intern_hash(hash, rth, skb, dev->ifindex);
-	return IS_ERR(rth) ? PTR_ERR(rth) : 0;
+	skb_dst_set(skb, &rth->dst);
+	return 0;
 
 e_nobufs:
 	return -ENOBUFS;
@@ -2158,7 +1409,6 @@ static int ip_mkroute_input(struct sk_buff *skb,
 {
 	struct rtable *rth = NULL;
 	int err;
-	unsigned int hash;
 
 #ifdef CONFIG_IP_ROUTE_MULTIPATH
 	if (res->fi && res->fi->fib_nhs > 1)
@@ -2170,12 +1420,7 @@ static int ip_mkroute_input(struct sk_buff *skb,
 	if (err)
 		return err;
 
-	/* put it into the cache */
-	hash = rt_hash(daddr, saddr, fl4->flowi4_iif,
-		       rt_genid(dev_net(rth->dst.dev)));
-	rth = rt_intern_hash(hash, rth, skb, fl4->flowi4_iif);
-	if (IS_ERR(rth))
-		return PTR_ERR(rth);
+	skb_dst_set(skb, &rth->dst);
 	return 0;
 }
 
@@ -2199,7 +1444,6 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
 	unsigned int	flags = 0;
 	u32		itag = 0;
 	struct rtable	*rth;
-	unsigned int	hash;
 	int		err = -EINVAL;
 	struct net    *net = dev_net(dev);
 
@@ -2321,11 +1565,8 @@ local_input:
 		rth->dst.error= -err;
 		rth->rt_flags 	&= ~RTCF_LOCAL;
 	}
-	hash = rt_hash(daddr, saddr, fl4.flowi4_iif, rt_genid(net));
-	rth = rt_intern_hash(hash, rth, skb, fl4.flowi4_iif);
+	skb_dst_set(skb, &rth->dst);
 	err = 0;
-	if (IS_ERR(rth))
-		err = PTR_ERR(rth);
 	goto out;
 
 no_route:
@@ -2364,46 +1605,10 @@ martian_source_keep_err:
 int ip_route_input_common(struct sk_buff *skb, __be32 daddr, __be32 saddr,
 			   u8 tos, struct net_device *dev, bool noref)
 {
-	struct rtable	*rth;
-	unsigned int	hash;
-	int iif = dev->ifindex;
-	struct net *net;
 	int res;
 
-	net = dev_net(dev);
-
 	rcu_read_lock();
 
-	if (!rt_caching(net))
-		goto skip_cache;
-
-	tos &= IPTOS_RT_MASK;
-	hash = rt_hash(daddr, saddr, iif, rt_genid(net));
-
-	for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
-	     rth = rcu_dereference(rth->dst.rt_next)) {
-		if ((((__force u32)rth->rt_key_dst ^ (__force u32)daddr) |
-		     ((__force u32)rth->rt_key_src ^ (__force u32)saddr) |
-		     (rth->rt_route_iif ^ iif) |
-		     (rth->rt_key_tos ^ tos)) == 0 &&
-		    rth->rt_mark == skb->mark &&
-		    net_eq(dev_net(rth->dst.dev), net) &&
-		    !rt_is_expired(rth)) {
-			if (noref) {
-				dst_use_noref(&rth->dst, jiffies);
-				skb_dst_set_noref(skb, &rth->dst);
-			} else {
-				dst_use(&rth->dst, jiffies);
-				skb_dst_set(skb, &rth->dst);
-			}
-			RT_CACHE_STAT_INC(in_hit);
-			rcu_read_unlock();
-			return 0;
-		}
-		RT_CACHE_STAT_INC(in_hlist_search);
-	}
-
-skip_cache:
 	/* Multicast recognition logic is moved from route cache to here.
 	   The problem was that too many Ethernet cards have broken/missing
 	   hardware multicast filters :-( As result the host on multicasting
@@ -2545,10 +1750,9 @@ static struct rtable *__mkroute_output(const struct fib_result *res,
 
 /*
  * Major route resolver routine.
- * called with rcu_read_lock();
  */
 
-static struct rtable *ip_route_output_slow(struct net *net, struct flowi4 *fl4)
+struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *fl4)
 {
 	struct net_device *dev_out = NULL;
 	__u8 tos = RT_FL_TOS(fl4);
@@ -2728,57 +1932,11 @@ static struct rtable *ip_route_output_slow(struct net *net, struct flowi4 *fl4)
 make_route:
 	rth = __mkroute_output(&res, fl4, orig_daddr, orig_saddr, orig_oif,
 			       tos, dev_out, flags);
-	if (!IS_ERR(rth)) {
-		unsigned int hash;
-
-		hash = rt_hash(orig_daddr, orig_saddr, orig_oif,
-			       rt_genid(dev_net(dev_out)));
-		rth = rt_intern_hash(hash, rth, NULL, orig_oif);
-	}
 
 out:
 	rcu_read_unlock();
 	return rth;
 }
-
-struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *flp4)
-{
-	struct rtable *rth;
-	unsigned int hash;
-
-	if (!rt_caching(net))
-		goto slow_output;
-
-	hash = rt_hash(flp4->daddr, flp4->saddr, flp4->flowi4_oif, rt_genid(net));
-
-	rcu_read_lock_bh();
-	for (rth = rcu_dereference_bh(rt_hash_table[hash].chain); rth;
-		rth = rcu_dereference_bh(rth->dst.rt_next)) {
-		if (rth->rt_key_dst == flp4->daddr &&
-		    rth->rt_key_src == flp4->saddr &&
-		    rt_is_output_route(rth) &&
-		    rth->rt_oif == flp4->flowi4_oif &&
-		    rth->rt_mark == flp4->flowi4_mark &&
-		    !((rth->rt_key_tos ^ flp4->flowi4_tos) &
-			    (IPTOS_RT_MASK | RTO_ONLINK)) &&
-		    net_eq(dev_net(rth->dst.dev), net) &&
-		    !rt_is_expired(rth)) {
-			dst_use(&rth->dst, jiffies);
-			RT_CACHE_STAT_INC(out_hit);
-			rcu_read_unlock_bh();
-			if (!flp4->saddr)
-				flp4->saddr = rth->rt_src;
-			if (!flp4->daddr)
-				flp4->daddr = rth->rt_dst;
-			return rth;
-		}
-		RT_CACHE_STAT_INC(out_hlist_search);
-	}
-	rcu_read_unlock_bh();
-
-slow_output:
-	return ip_route_output_slow(net, flp4);
-}
 EXPORT_SYMBOL_GPL(__ip_route_output_key);
 
 static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie)
@@ -3084,43 +2242,6 @@ errout_free:
 
 int ip_rt_dump(struct sk_buff *skb,  struct netlink_callback *cb)
 {
-	struct rtable *rt;
-	int h, s_h;
-	int idx, s_idx;
-	struct net *net;
-
-	net = sock_net(skb->sk);
-
-	s_h = cb->args[0];
-	if (s_h < 0)
-		s_h = 0;
-	s_idx = idx = cb->args[1];
-	for (h = s_h; h <= rt_hash_mask; h++, s_idx = 0) {
-		if (!rt_hash_table[h].chain)
-			continue;
-		rcu_read_lock_bh();
-		for (rt = rcu_dereference_bh(rt_hash_table[h].chain), idx = 0; rt;
-		     rt = rcu_dereference_bh(rt->dst.rt_next), idx++) {
-			if (!net_eq(dev_net(rt->dst.dev), net) || idx < s_idx)
-				continue;
-			if (rt_is_expired(rt))
-				continue;
-			skb_dst_set_noref(skb, &rt->dst);
-			if (rt_fill_info(net, skb, NETLINK_CB(cb->skb).pid,
-					 cb->nlh->nlmsg_seq, RTM_NEWROUTE,
-					 1, NLM_F_MULTI) <= 0) {
-				skb_dst_drop(skb);
-				rcu_read_unlock_bh();
-				goto done;
-			}
-			skb_dst_drop(skb);
-		}
-		rcu_read_unlock_bh();
-	}
-
-done:
-	cb->args[0] = h;
-	cb->args[1] = idx;
 	return skb->len;
 }
 
@@ -3354,22 +2475,6 @@ static __net_initdata struct pernet_operations ipv4_inetpeer_ops = {
 struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
 #endif /* CONFIG_IP_ROUTE_CLASSID */
 
-static __initdata unsigned long rhash_entries;
-static int __init set_rhash_entries(char *str)
-{
-	ssize_t ret;
-
-	if (!str)
-		return 0;
-
-	ret = kstrtoul(str, 0, &rhash_entries);
-	if (ret)
-		return 0;
-
-	return 1;
-}
-__setup("rhash_entries=", set_rhash_entries);
-
 int __init ip_rt_init(void)
 {
 	int rc = 0;
@@ -3392,31 +2497,12 @@ int __init ip_rt_init(void)
 	if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0)
 		panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n");
 
-	rt_hash_table = (struct rt_hash_bucket *)
-		alloc_large_system_hash("IP route cache",
-					sizeof(struct rt_hash_bucket),
-					rhash_entries,
-					(totalram_pages >= 128 * 1024) ?
-					15 : 17,
-					0,
-					&rt_hash_log,
-					&rt_hash_mask,
-					0,
-					rhash_entries ? 0 : 512 * 1024);
-	memset(rt_hash_table, 0, (rt_hash_mask + 1) * sizeof(struct rt_hash_bucket));
-	rt_hash_lock_init();
-
-	ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1);
-	ip_rt_max_size = (rt_hash_mask + 1) * 16;
+	ipv4_dst_ops.gc_thresh = ~0;
+	ip_rt_max_size = INT_MAX;
 
 	devinet_init();
 	ip_fib_init();
 
-	INIT_DELAYED_WORK_DEFERRABLE(&expires_work, rt_worker_func);
-	expires_ljiffies = jiffies;
-	schedule_delayed_work(&expires_work,
-		net_random() % ip_rt_gc_interval + ip_rt_gc_interval);
-
 	if (ip_rt_proc_init())
 		pr_err("Unable to create route proc files\n");
 #ifdef CONFIG_XFRM
-- 
1.7.10.4

^ permalink raw reply related

* [PATCH 00/15] Kill the routing cache.
From: David Miller @ 2012-07-18 18:22 UTC (permalink / raw)
  To: netdev

And I'm really serious this time :-)

The general flow of this patch series is that first the routing
cache is removed.  We build a completely new rtable entry every
lookup request.

Next we make some simplifications due to the fact that removing the
routing cache causes several members of struct rtable to become
no longer necessary.

Then we need to make some amends such that we can legally cache
pre-constructed routes in the FIB nexthops.  Firstly, we need to
invalidate routes which are hit with nexthop exceptions.  Secondly we
have to change the semantics of rt->rt_gateway such that zero means
that the destination is on-link and non-zero otherwise.

Now that the preparations are ready, we start caching precomputed
routes in the FIB nexthops.  Output and input routes need different
kinds of care when determining if we can legally do such caching or
not.  The details are in the commit log messages for those changes.

The patch series then winds down with some more struct rtable
simplifications and other tidy ups that remove unnecessary overhead.

On a SPARC-T3 output route lookups are ~870 cycles.  Input route
lookups are ~970 cycles with rpfilter disabled, and about ~1400 with
rpfilter enabled.

These measurements were taken with the kbench_mod test module in the
net_test_tools GIT tree:

git://git.kernel.org/pub/scm/linux/kernel/git/davem/net_test_tools.git

Performance can easily be improved further.  For example
fib_table_lookup() performs a lot of excessive computations with all
the masking and shifting, some of it conditionalized to deal with edge
cases.

Also, Eric's no-ref optimization for input route lookups can be
re-instated for the FIB nexthop caching code path.  I would be really
pleased if someone would work on that.

In fact anyone suitable motivated can just fire up perf on the loading
of the test net_test_tools benchmark kernel module.  I spend much of
my time going:

bash# perf record insmod ./kbench_mod.ko dst=172.30.42.22 src=74.128.0.1 iif=2
bash# perf report

Signed-off-by: David S. Miller <davem@davemloft.net>

^ permalink raw reply

* [PATCH net-next 7/7] sfc: Bump version to 3.2
From: Ben Hutchings @ 2012-07-18 18:22 UTC (permalink / raw)
  To: David Miller; +Cc: netdev, linux-net-drivers, Andrew Jackson, Richard Cochran
In-Reply-To: <1342635392.2617.52.camel@bwh-desktop.uk.solarflarecom.com>

The key new feature for 3.2 is PTP support.

Signed-off-by: Ben Hutchings <bhutchings@solarflare.com>
---
 drivers/net/ethernet/sfc/net_driver.h |    2 +-
 1 files changed, 1 insertions(+), 1 deletions(-)

diff --git a/drivers/net/ethernet/sfc/net_driver.h b/drivers/net/ethernet/sfc/net_driver.h
index f84a5d5..27b44a3 100644
--- a/drivers/net/ethernet/sfc/net_driver.h
+++ b/drivers/net/ethernet/sfc/net_driver.h
@@ -37,7 +37,7 @@
  *
  **************************************************************************/
 
-#define EFX_DRIVER_VERSION	"3.1"
+#define EFX_DRIVER_VERSION	"3.2"
 
 #ifdef DEBUG
 #define EFX_BUG_ON_PARANOID(x) BUG_ON(x)
-- 
1.7.7.6


-- 
Ben Hutchings, Staff Engineer, Solarflare
Not speaking for my employer; that's the marketing department's job.
They asked us to note that Solarflare product names are trademarked.

^ permalink raw reply related

* [PATCH net-next 6/7] sfc: Expose FPGA bitfile partition through MTD
From: Ben Hutchings @ 2012-07-18 18:22 UTC (permalink / raw)
  To: David Miller; +Cc: netdev, linux-net-drivers, Andrew Jackson, Richard Cochran
In-Reply-To: <1342635392.2617.52.camel@bwh-desktop.uk.solarflarecom.com>

Signed-off-by: Ben Hutchings <bhutchings@solarflare.com>
---
 drivers/net/ethernet/sfc/mtd.c |    4 +++-
 1 files changed, 3 insertions(+), 1 deletions(-)

diff --git a/drivers/net/ethernet/sfc/mtd.c b/drivers/net/ethernet/sfc/mtd.c
index 8f4604d..08f825b 100644
--- a/drivers/net/ethernet/sfc/mtd.c
+++ b/drivers/net/ethernet/sfc/mtd.c
@@ -585,6 +585,7 @@ static const struct siena_nvram_type_info siena_nvram_types[] = {
 	[MC_CMD_NVRAM_TYPE_EXP_ROM_CFG_PORT1]	= { 1, "sfc_exp_rom_cfg" },
 	[MC_CMD_NVRAM_TYPE_PHY_PORT0]		= { 0, "sfc_phy_fw" },
 	[MC_CMD_NVRAM_TYPE_PHY_PORT1]		= { 1, "sfc_phy_fw" },
+	[MC_CMD_NVRAM_TYPE_FPGA]		= { 0, "sfc_fpga" },
 };
 
 static int siena_mtd_probe_partition(struct efx_nic *efx,
@@ -598,7 +599,8 @@ static int siena_mtd_probe_partition(struct efx_nic *efx,
 	bool protected;
 	int rc;
 
-	if (type >= ARRAY_SIZE(siena_nvram_types))
+	if (type >= ARRAY_SIZE(siena_nvram_types) ||
+	    siena_nvram_types[type].name == NULL)
 		return -ENODEV;
 
 	info = &siena_nvram_types[type];
-- 
1.7.7.6



-- 
Ben Hutchings, Staff Engineer, Solarflare
Not speaking for my employer; that's the marketing department's job.
They asked us to note that Solarflare product names are trademarked.

^ permalink raw reply related

* [PATCH net-next 5/7] sfc: Support variable-length response to MCDI GET_BOARD_CFG
From: Ben Hutchings @ 2012-07-18 18:21 UTC (permalink / raw)
  To: David Miller; +Cc: netdev, linux-net-drivers, Andrew Jackson, Richard Cochran
In-Reply-To: <1342635392.2617.52.camel@bwh-desktop.uk.solarflarecom.com>

Signed-off-by: Ben Hutchings <bhutchings@solarflare.com>
---
 drivers/net/ethernet/sfc/mcdi.c |   18 +++++++++++++-----
 drivers/net/ethernet/sfc/mtd.c  |    3 ++-
 2 files changed, 15 insertions(+), 6 deletions(-)

diff --git a/drivers/net/ethernet/sfc/mcdi.c b/drivers/net/ethernet/sfc/mcdi.c
index 294df4b..9f74ab6 100644
--- a/drivers/net/ethernet/sfc/mcdi.c
+++ b/drivers/net/ethernet/sfc/mcdi.c
@@ -660,8 +660,8 @@ fail:
 int efx_mcdi_get_board_cfg(struct efx_nic *efx, u8 *mac_address,
 			   u16 *fw_subtype_list, u32 *capabilities)
 {
-	uint8_t outbuf[MC_CMD_GET_BOARD_CFG_OUT_LENMIN];
-	size_t outlen;
+	uint8_t outbuf[MC_CMD_GET_BOARD_CFG_OUT_LENMAX];
+	size_t outlen, src_size, dst_size;
 	int port_num = efx_port_num(efx);
 	int offset;
 	int rc;
@@ -683,11 +683,19 @@ int efx_mcdi_get_board_cfg(struct efx_nic *efx, u8 *mac_address,
 		: MC_CMD_GET_BOARD_CFG_OUT_MAC_ADDR_BASE_PORT0_OFST;
 	if (mac_address)
 		memcpy(mac_address, outbuf + offset, ETH_ALEN);
-	if (fw_subtype_list)
+	if (fw_subtype_list) {
+		/* Truncate or zero-pad as necessary */
+		src_size = (outlen -
+			    MC_CMD_GET_BOARD_CFG_OUT_FW_SUBTYPE_LIST_OFST);
+		dst_size = (MC_CMD_GET_BOARD_CFG_OUT_FW_SUBTYPE_LIST_MAXNUM *
+			    sizeof(*fw_subtype_list));
 		memcpy(fw_subtype_list,
 		       outbuf + MC_CMD_GET_BOARD_CFG_OUT_FW_SUBTYPE_LIST_OFST,
-		       MC_CMD_GET_BOARD_CFG_OUT_FW_SUBTYPE_LIST_MINNUM *
-		       sizeof(fw_subtype_list[0]));
+		       min(src_size, dst_size));
+		if (dst_size < src_size)
+			memset(fw_subtype_list + src_size, 0,
+			       dst_size - src_size);
+	}
 	if (capabilities) {
 		if (port_num)
 			*capabilities = MCDI_DWORD(outbuf,
diff --git a/drivers/net/ethernet/sfc/mtd.c b/drivers/net/ethernet/sfc/mtd.c
index 7581483..8f4604d 100644
--- a/drivers/net/ethernet/sfc/mtd.c
+++ b/drivers/net/ethernet/sfc/mtd.c
@@ -627,7 +627,8 @@ static int siena_mtd_get_fw_subtypes(struct efx_nic *efx,
 				     struct efx_mtd *efx_mtd)
 {
 	struct efx_mtd_partition *part;
-	uint16_t fw_subtype_list[MC_CMD_GET_BOARD_CFG_OUT_FW_SUBTYPE_LIST_MINNUM];
+	uint16_t fw_subtype_list[
+		MC_CMD_GET_BOARD_CFG_OUT_FW_SUBTYPE_LIST_MAXNUM];
 	int rc;
 
 	rc = efx_mcdi_get_board_cfg(efx, NULL, fw_subtype_list, NULL);
-- 
1.7.7.6



-- 
Ben Hutchings, Staff Engineer, Solarflare
Not speaking for my employer; that's the marketing department's job.
They asked us to note that Solarflare product names are trademarked.

^ permalink raw reply related

* [PATCH net-next 4/7] sfc: Add support for IEEE-1588 PTP
From: Ben Hutchings @ 2012-07-18 18:21 UTC (permalink / raw)
  To: David Miller; +Cc: netdev, linux-net-drivers, Andrew Jackson, Richard Cochran
In-Reply-To: <1342635392.2617.52.camel@bwh-desktop.uk.solarflarecom.com>

Add PTP IEEE-1588 support and make accesible via the PHC subsystem.

This work is based on prior code by Andrew Jackson

Signed-off-by: Stuart Hodgson <smhodgson@solarflare.com>
Signed-off-by: Ben Hutchings <bhutchings@solarflare.com>
---
 drivers/net/ethernet/sfc/Kconfig      |    7 +
 drivers/net/ethernet/sfc/Makefile     |    1 +
 drivers/net/ethernet/sfc/efx.c        |    3 +
 drivers/net/ethernet/sfc/ethtool.c    |    1 +
 drivers/net/ethernet/sfc/mcdi_pcol.h  |    1 +
 drivers/net/ethernet/sfc/net_driver.h |   19 +-
 drivers/net/ethernet/sfc/nic.h        |   31 +
 drivers/net/ethernet/sfc/ptp.c        | 1519 +++++++++++++++++++++++++++++++++
 drivers/net/ethernet/sfc/rx.c         |    2 +-
 drivers/net/ethernet/sfc/siena.c      |    1 +
 drivers/net/ethernet/sfc/tx.c         |    6 +
 11 files changed, 1589 insertions(+), 2 deletions(-)
 create mode 100644 drivers/net/ethernet/sfc/ptp.c

diff --git a/drivers/net/ethernet/sfc/Kconfig b/drivers/net/ethernet/sfc/Kconfig
index fb3cbc2..78c5d435 100644
--- a/drivers/net/ethernet/sfc/Kconfig
+++ b/drivers/net/ethernet/sfc/Kconfig
@@ -34,3 +34,10 @@ config SFC_SRIOV
 	  This enables support for the SFC9000 I/O Virtualization
 	  features, allowing accelerated network performance in
 	  virtualized environments.
+config SFC_PTP
+	bool "Solarflare SFC9000-family PTP support"
+	depends on SFC && PTP_1588_CLOCK
+	default y
+	---help---
+	  This enables support for the Precision Time Protocol (PTP)
+	  on SFC9000-family NICs
diff --git a/drivers/net/ethernet/sfc/Makefile b/drivers/net/ethernet/sfc/Makefile
index ea1f8db..e11f2ec 100644
--- a/drivers/net/ethernet/sfc/Makefile
+++ b/drivers/net/ethernet/sfc/Makefile
@@ -5,5 +5,6 @@ sfc-y			+= efx.o nic.o falcon.o siena.o tx.o rx.o filter.o \
 			   mcdi.o mcdi_phy.o mcdi_mon.o
 sfc-$(CONFIG_SFC_MTD)	+= mtd.o
 sfc-$(CONFIG_SFC_SRIOV)	+= siena_sriov.o
+sfc-$(CONFIG_SFC_PTP)	+= ptp.o
 
 obj-$(CONFIG_SFC)	+= sfc.o
diff --git a/drivers/net/ethernet/sfc/efx.c b/drivers/net/ethernet/sfc/efx.c
index 1c53d4b..e6631f0e 100644
--- a/drivers/net/ethernet/sfc/efx.c
+++ b/drivers/net/ethernet/sfc/efx.c
@@ -1748,6 +1748,9 @@ static int efx_ioctl(struct net_device *net_dev, struct ifreq *ifr, int cmd)
 
 	EFX_ASSERT_RESET_SERIALISED(efx);
 
+	if (cmd == SIOCSHWTSTAMP)
+		return efx_ptp_ioctl(efx, ifr, cmd);
+
 	/* Convert phy_id from older PRTAD/DEVAD format */
 	if ((cmd == SIOCGMIIREG || cmd == SIOCSMIIREG) &&
 	    (data->phy_id & 0xfc00) == 0x0400)
diff --git a/drivers/net/ethernet/sfc/ethtool.c b/drivers/net/ethernet/sfc/ethtool.c
index 10536f9..50cdd39 100644
--- a/drivers/net/ethernet/sfc/ethtool.c
+++ b/drivers/net/ethernet/sfc/ethtool.c
@@ -1170,6 +1170,7 @@ const struct ethtool_ops efx_ethtool_ops = {
 	.get_rxfh_indir_size	= efx_ethtool_get_rxfh_indir_size,
 	.get_rxfh_indir		= efx_ethtool_get_rxfh_indir,
 	.set_rxfh_indir		= efx_ethtool_set_rxfh_indir,
+	.get_ts_info		= efx_ptp_get_ts_info,
 	.get_module_info	= efx_ethtool_get_module_info,
 	.get_module_eeprom	= efx_ethtool_get_module_eeprom,
 };
diff --git a/drivers/net/ethernet/sfc/mcdi_pcol.h b/drivers/net/ethernet/sfc/mcdi_pcol.h
index 0310b9f0..0017f98 100644
--- a/drivers/net/ethernet/sfc/mcdi_pcol.h
+++ b/drivers/net/ethernet/sfc/mcdi_pcol.h
@@ -290,6 +290,7 @@
 #define          MCDI_EVENT_CODE_TX_FLUSH  0xc /* enum */
 #define          MCDI_EVENT_CODE_PTP_RX  0xd /* enum */
 #define          MCDI_EVENT_CODE_PTP_FAULT  0xe /* enum */
+#define          MCDI_EVENT_CODE_PTP_PPS  0xf /* enum */
 #define       MCDI_EVENT_CMDDONE_DATA_OFST 0
 #define       MCDI_EVENT_CMDDONE_DATA_LBN 0
 #define       MCDI_EVENT_CMDDONE_DATA_WIDTH 32
diff --git a/drivers/net/ethernet/sfc/net_driver.h b/drivers/net/ethernet/sfc/net_driver.h
index 9913e32..f84a5d5 100644
--- a/drivers/net/ethernet/sfc/net_driver.h
+++ b/drivers/net/ethernet/sfc/net_driver.h
@@ -56,7 +56,8 @@
 #define EFX_MAX_CHANNELS 32U
 #define EFX_MAX_RX_QUEUES EFX_MAX_CHANNELS
 #define EFX_EXTRA_CHANNEL_IOV	0
-#define EFX_MAX_EXTRA_CHANNELS	1U
+#define EFX_EXTRA_CHANNEL_PTP	1
+#define EFX_MAX_EXTRA_CHANNELS	2U
 
 /* Checksum generation is a per-queue option in hardware, so each
  * queue visible to the networking core is backed by two hardware TX
@@ -68,6 +69,9 @@
 #define EFX_TXQ_TYPES		4
 #define EFX_MAX_TX_QUEUES	(EFX_TXQ_TYPES * EFX_MAX_CHANNELS)
 
+/* Forward declare Precision Time Protocol (PTP) support structure. */
+struct efx_ptp_data;
+
 struct efx_self_tests;
 
 /**
@@ -736,6 +740,7 @@ struct vfdi_status;
  *	%local_addr_list. Protected by %local_lock.
  * @local_lock: Mutex protecting %local_addr_list and %local_page_list.
  * @peer_work: Work item to broadcast peer addresses to VMs.
+ * @ptp_data: PTP state data
  * @monitor_work: Hardware monitor workitem
  * @biu_lock: BIU (bus interface unit) lock
  * @last_irq_cpu: Last CPU to handle a possible test interrupt.  This
@@ -860,6 +865,10 @@ struct efx_nic {
 	struct work_struct peer_work;
 #endif
 
+#ifdef CONFIG_SFC_PTP
+	struct efx_ptp_data *ptp_data;
+#endif
+
 	/* The following fields may be written more often */
 
 	struct delayed_work monitor_work ____cacheline_aligned_in_smp;
@@ -1122,5 +1131,13 @@ static inline void clear_bit_le(unsigned nr, unsigned char *addr)
 #define EFX_MAX_FRAME_LEN(mtu) \
 	((((mtu) + ETH_HLEN + VLAN_HLEN + 4/* FCS */ + 7) & ~7) + 16)
 
+static inline bool efx_xmit_with_hwtstamp(struct sk_buff *skb)
+{
+	return skb_shinfo(skb)->tx_flags & SKBTX_HW_TSTAMP;
+}
+static inline void efx_xmit_hwtstamp_pending(struct sk_buff *skb)
+{
+	skb_shinfo(skb)->tx_flags |= SKBTX_IN_PROGRESS;
+}
 
 #endif /* EFX_NET_DRIVER_H */
diff --git a/drivers/net/ethernet/sfc/nic.h b/drivers/net/ethernet/sfc/nic.h
index bab5cd9..cd53ab5 100644
--- a/drivers/net/ethernet/sfc/nic.h
+++ b/drivers/net/ethernet/sfc/nic.h
@@ -250,6 +250,37 @@ extern int efx_sriov_get_vf_config(struct net_device *dev, int vf,
 extern int efx_sriov_set_vf_spoofchk(struct net_device *net_dev, int vf,
 				     bool spoofchk);
 
+struct ethtool_ts_info;
+#ifdef CONFIG_SFC_PTP
+extern void efx_ptp_probe(struct efx_nic *efx);
+extern int efx_ptp_ioctl(struct efx_nic *efx, struct ifreq *ifr, int cmd);
+extern int efx_ptp_get_ts_info(struct net_device *net_dev,
+			       struct ethtool_ts_info *ts_info);
+extern bool efx_ptp_is_ptp_tx(struct efx_nic *efx, struct sk_buff *skb);
+extern int efx_ptp_tx(struct efx_nic *efx, struct sk_buff *skb);
+extern void efx_ptp_event(struct efx_nic *efx, efx_qword_t *ev);
+#else
+static inline void efx_ptp_probe(struct efx_nic *efx) {}
+static inline int efx_ptp_ioctl(struct efx_nic *efx, struct ifreq *ifr, int cmd)
+{
+	return -EOPNOTSUPP;
+}
+static inline int efx_ptp_get_ts_info(struct net_device *net_dev,
+				      struct ethtool_ts_info *ts_info)
+{
+	return -EOPNOTSUPP;
+}
+static inline bool efx_ptp_is_ptp_tx(struct efx_nic *efx, struct sk_buff *skb)
+{
+	return false;
+}
+static inline int efx_ptp_tx(struct efx_nic *efx, struct sk_buff *skb)
+{
+	return NETDEV_TX_OK;
+}
+static inline void efx_ptp_event(struct efx_nic *efx, efx_qword_t *ev) {}
+#endif
+
 extern const struct efx_nic_type falcon_a1_nic_type;
 extern const struct efx_nic_type falcon_b0_nic_type;
 extern const struct efx_nic_type siena_a0_nic_type;
diff --git a/drivers/net/ethernet/sfc/ptp.c b/drivers/net/ethernet/sfc/ptp.c
new file mode 100644
index 0000000..ba1e76a
--- /dev/null
+++ b/drivers/net/ethernet/sfc/ptp.c
@@ -0,0 +1,1519 @@
+/****************************************************************************
+ * Driver for Solarflare Solarstorm network controllers and boards
+ * Copyright 2011 Solarflare Communications Inc.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published
+ * by the Free Software Foundation, incorporated herein by reference.
+ */
+
+/* Theory of operation:
+ *
+ * PTP support is assisted by firmware running on the MC, which provides
+ * the hardware timestamping capabilities.  Both transmitted and received
+ * PTP event packets are queued onto internal queues for subsequent processing;
+ * this is because the MC operations are relatively long and would block
+ * block NAPI/interrupt operation.
+ *
+ * Receive event processing:
+ *	The event contains the packet's UUID and sequence number, together
+ *	with the hardware timestamp.  The PTP receive packet queue is searched
+ *	for this UUID/sequence number and, if found, put on a pending queue.
+ *	Packets not matching are delivered without timestamps (MCDI events will
+ *	always arrive after the actual packet).
+ *	It is important for the operation of the PTP protocol that the ordering
+ *	of packets between the event and general port is maintained.
+ *
+ * Work queue processing:
+ *	If work waiting, synchronise host/hardware time
+ *
+ *	Transmit: send packet through MC, which returns the transmission time
+ *	that is converted to an appropriate timestamp.
+ *
+ *	Receive: the packet's reception time is converted to an appropriate
+ *	timestamp.
+ */
+#include <linux/ip.h>
+#include <linux/udp.h>
+#include <linux/time.h>
+#include <linux/ktime.h>
+#include <linux/module.h>
+#include <linux/net_tstamp.h>
+#include <linux/ptp_clock_kernel.h>
+#include "net_driver.h"
+#include "efx.h"
+#include "mcdi.h"
+#include "mcdi_pcol.h"
+#include "io.h"
+#include "regs.h"
+#include "nic.h"
+
+/* Maximum number of events expected to make up a PTP event */
+#define	MAX_EVENT_FRAGS			3
+
+/* Maximum delay, ms, to begin synchronisation */
+#define	MAX_SYNCHRONISE_WAIT_MS		2
+
+/* How long, at most, to spend synchronising */
+#define	SYNCHRONISE_PERIOD_NS		250000
+
+/* How often to update the shared memory time */
+#define	SYNCHRONISATION_GRANULARITY_NS	200
+
+/* Minimum permitted length of a (corrected) synchronisation time */
+#define	MIN_SYNCHRONISATION_NS		120
+
+/* Maximum permitted length of a (corrected) synchronisation time */
+#define	MAX_SYNCHRONISATION_NS		1000
+
+/* How many (MC) receive events that can be queued */
+#define	MAX_RECEIVE_EVENTS		8
+
+/* Length of (modified) moving average. */
+#define	AVERAGE_LENGTH			16
+
+/* How long an unmatched event or packet can be held */
+#define PKT_EVENT_LIFETIME_MS		10
+
+/* Offsets into PTP packet for identification.  These offsets are from the
+ * start of the IP header, not the MAC header.  Note that neither PTP V1 nor
+ * PTP V2 permit the use of IPV4 options.
+ */
+#define PTP_DPORT_OFFSET	22
+
+#define PTP_V1_VERSION_LENGTH	2
+#define PTP_V1_VERSION_OFFSET	28
+
+#define PTP_V1_UUID_LENGTH	6
+#define PTP_V1_UUID_OFFSET	50
+
+#define PTP_V1_SEQUENCE_LENGTH	2
+#define PTP_V1_SEQUENCE_OFFSET	58
+
+/* The minimum length of a PTP V1 packet for offsets, etc. to be valid:
+ * includes IP header.
+ */
+#define	PTP_V1_MIN_LENGTH	64
+
+#define PTP_V2_VERSION_LENGTH	1
+#define PTP_V2_VERSION_OFFSET	29
+
+/* Although PTP V2 UUIDs are comprised a ClockIdentity (8) and PortNumber (2),
+ * the MC only captures the last six bytes of the clock identity. These values
+ * reflect those, not the ones used in the standard.  The standard permits
+ * mapping of V1 UUIDs to V2 UUIDs with these same values.
+ */
+#define PTP_V2_MC_UUID_LENGTH	6
+#define PTP_V2_MC_UUID_OFFSET	50
+
+#define PTP_V2_SEQUENCE_LENGTH	2
+#define PTP_V2_SEQUENCE_OFFSET	58
+
+/* The minimum length of a PTP V2 packet for offsets, etc. to be valid:
+ * includes IP header.
+ */
+#define	PTP_V2_MIN_LENGTH	63
+
+#define	PTP_MIN_LENGTH		63
+
+#define PTP_ADDRESS		0xe0000181	/* 224.0.1.129 */
+#define PTP_EVENT_PORT		319
+#define PTP_GENERAL_PORT	320
+
+/* Annoyingly the format of the version numbers are different between
+ * versions 1 and 2 so it isn't possible to simply look for 1 or 2.
+ */
+#define	PTP_VERSION_V1		1
+
+#define	PTP_VERSION_V2		2
+#define	PTP_VERSION_V2_MASK	0x0f
+
+enum ptp_packet_state {
+	PTP_PACKET_STATE_UNMATCHED = 0,
+	PTP_PACKET_STATE_MATCHED,
+	PTP_PACKET_STATE_TIMED_OUT,
+	PTP_PACKET_STATE_MATCH_UNWANTED
+};
+
+/* NIC synchronised with single word of time only comprising
+ * partial seconds and full nanoseconds: 10^9 ~ 2^30 so 2 bits for seconds.
+ */
+#define	MC_NANOSECOND_BITS	30
+#define	MC_NANOSECOND_MASK	((1 << MC_NANOSECOND_BITS) - 1)
+#define	MC_SECOND_MASK		((1 << (32 - MC_NANOSECOND_BITS)) - 1)
+
+/* Maximum parts-per-billion adjustment that is acceptable */
+#define MAX_PPB			1000000
+
+/* Number of bits required to hold the above */
+#define	MAX_PPB_BITS		20
+
+/* Number of extra bits allowed when calculating fractional ns.
+ * EXTRA_BITS + MC_CMD_PTP_IN_ADJUST_BITS + MAX_PPB_BITS should
+ * be less than 63.
+ */
+#define	PPB_EXTRA_BITS		2
+
+/* Precalculate scale word to avoid long long division at runtime */
+#define	PPB_SCALE_WORD	((1LL << (PPB_EXTRA_BITS + MC_CMD_PTP_IN_ADJUST_BITS +\
+			MAX_PPB_BITS)) / 1000000000LL)
+
+#define PTP_SYNC_ATTEMPTS	4
+
+/**
+ * struct efx_ptp_match - Matching structure, stored in sk_buff's cb area.
+ * @words: UUID and (partial) sequence number
+ * @expiry: Time after which the packet should be delivered irrespective of
+ *            event arrival.
+ * @state: The state of the packet - whether it is ready for processing or
+ *         whether that is of no interest.
+ */
+struct efx_ptp_match {
+	u32 words[DIV_ROUND_UP(PTP_V1_UUID_LENGTH, 4)];
+	unsigned long expiry;
+	enum ptp_packet_state state;
+};
+
+/**
+ * struct efx_ptp_event_rx - A PTP receive event (from MC)
+ * @seq0: First part of (PTP) UUID
+ * @seq1: Second part of (PTP) UUID and sequence number
+ * @hwtimestamp: Event timestamp
+ */
+struct efx_ptp_event_rx {
+	struct list_head link;
+	u32 seq0;
+	u32 seq1;
+	ktime_t hwtimestamp;
+	unsigned long expiry;
+};
+
+/**
+ * struct efx_ptp_timeset - Synchronisation between host and MC
+ * @host_start: Host time immediately before hardware timestamp taken
+ * @seconds: Hardware timestamp, seconds
+ * @nanoseconds: Hardware timestamp, nanoseconds
+ * @host_end: Host time immediately after hardware timestamp taken
+ * @waitns: Number of nanoseconds between hardware timestamp being read and
+ *          host end time being seen
+ * @window: Difference of host_end and host_start
+ * @valid: Whether this timeset is valid
+ */
+struct efx_ptp_timeset {
+	u32 host_start;
+	u32 seconds;
+	u32 nanoseconds;
+	u32 host_end;
+	u32 waitns;
+	u32 window;	/* Derived: end - start, allowing for wrap */
+};
+
+/**
+ * struct efx_ptp_data - Precision Time Protocol (PTP) state
+ * @channel: The PTP channel
+ * @rxq: Receive queue (awaiting timestamps)
+ * @txq: Transmit queue
+ * @evt_list: List of MC receive events awaiting packets
+ * @evt_free_list: List of free events
+ * @evt_lock: Lock for manipulating evt_list and evt_free_list
+ * @rx_evts: Instantiated events (on evt_list and evt_free_list)
+ * @workwq: Work queue for processing pending PTP operations
+ * @work: Work task
+ * @reset_required: A serious error has occurred and the PTP task needs to be
+ *                  reset (disable, enable).
+ * @rxfilter_event: Receive filter when operating
+ * @rxfilter_general: Receive filter when operating
+ * @config: Current timestamp configuration
+ * @enabled: PTP operation enabled
+ * @mode: Mode in which PTP operating (PTP version)
+ * @evt_frags: Partly assembled PTP events
+ * @evt_frag_idx: Current fragment number
+ * @evt_code: Last event code
+ * @start: Address at which MC indicates ready for synchronisation
+ * @host_base_time: (Synchronised with mc_base_time) host time
+ * @mc_base_time: (Synchronised with host_base_time) MC/hardware time
+ * @base_time_valid: Whether host_base_time and mc_base_time are synchronised
+ * @last_sync_ns: Last number of nanoseconds between readings when synchronising
+ * @base_sync_ns: Number of nanoseconds for last synchronisation.
+ * @base_sync_valid: Whether base_sync_time is valid.
+ * @current_adjfreq: Current ppb adjustment.
+ * @phc_clock: Pointer to registered phc device
+ * @phc_clock_info: Registration structure for phc device
+ * @pps_work: pps work task for handling pps events
+ * @pps_workwq: pps work queue
+ * @nic_ts_enabled: Flag indicating if NIC generated TS events are handled
+ * @txbuf: Buffer for use when transmitting (PTP) packets to MC (avoids
+ *         allocations in main data path).
+ * @debug_ptp_dir: PTP debugfs directory
+ * @missed_rx_sync: Number of packets received without syncrhonisation.
+ * @good_syncs: Number of successful synchronisations.
+ * @no_time_syncs: Number of synchronisations with no good times.
+ * @bad_sync_durations: Number of synchronisations with bad durations.
+ * @bad_syncs: Number of failed synchronisations.
+ * @last_sync_time: Number of nanoseconds for last synchronisation.
+ * @sync_timeouts: Number of synchronisation timeouts
+ * @fast_syncs: Number of synchronisations requiring short delay
+ * @min_sync_delta: Minimum time between event and synchronisation
+ * @max_sync_delta: Maximum time between event and synchronisation
+ * @average_sync_delta: Average time between event and synchronisation.
+ *                      Modified moving average.
+ * @last_sync_delta: Last time between event and synchronisation
+ * @mc_stats: Context value for MC statistics
+ * @timeset: Last set of synchronisation statistics.
+ */
+struct efx_ptp_data {
+	struct efx_channel *channel;
+	struct sk_buff_head rxq;
+	struct sk_buff_head txq;
+	struct list_head evt_list;
+	struct list_head evt_free_list;
+	spinlock_t evt_lock;
+	struct efx_ptp_event_rx rx_evts[MAX_RECEIVE_EVENTS];
+	struct workqueue_struct *workwq;
+	struct work_struct work;
+	bool reset_required;
+	u32 rxfilter_event;
+	u32 rxfilter_general;
+	bool rxfilter_installed;
+	struct hwtstamp_config config;
+	bool enabled;
+	unsigned int mode;
+	efx_qword_t evt_frags[MAX_EVENT_FRAGS];
+	int evt_frag_idx;
+	int evt_code;
+	struct efx_buffer start;
+	ktime_t host_base_time;
+	ktime_t mc_base_time;
+	bool base_time_valid;
+	unsigned last_sync_ns;
+	unsigned base_sync_ns;
+	bool base_sync_valid;
+	s64 current_adjfreq;
+	struct ptp_clock *phc_clock;
+	struct ptp_clock_info phc_clock_info;
+	struct work_struct pps_work;
+	struct workqueue_struct *pps_workwq;
+	bool nic_ts_enabled;
+	u8 txbuf[ALIGN(MC_CMD_PTP_IN_TRANSMIT_LEN(
+			       MC_CMD_PTP_IN_TRANSMIT_PACKET_MAXNUM), 4)];
+	struct efx_ptp_timeset
+	timeset[MC_CMD_PTP_OUT_SYNCHRONIZE_TIMESET_MAXNUM];
+};
+
+static int efx_phc_adjfreq(struct ptp_clock_info *ptp, s32 delta);
+static int efx_phc_adjtime(struct ptp_clock_info *ptp, s64 delta);
+static int efx_phc_gettime(struct ptp_clock_info *ptp, struct timespec *ts);
+static int efx_phc_settime(struct ptp_clock_info *ptp,
+			   const struct timespec *e_ts);
+static int efx_phc_enable(struct ptp_clock_info *ptp,
+			  struct ptp_clock_request *request, int on);
+
+/* Enable MCDI PTP support. */
+static int efx_ptp_enable(struct efx_nic *efx)
+{
+	u8 inbuf[MC_CMD_PTP_IN_ENABLE_LEN];
+
+	MCDI_SET_DWORD(inbuf, PTP_IN_OP, MC_CMD_PTP_OP_ENABLE);
+	MCDI_SET_DWORD(inbuf, PTP_IN_ENABLE_QUEUE,
+		       efx->ptp_data->channel->channel);
+	MCDI_SET_DWORD(inbuf, PTP_IN_ENABLE_MODE, efx->ptp_data->mode);
+
+	return efx_mcdi_rpc(efx, MC_CMD_PTP, inbuf, sizeof(inbuf),
+			    NULL, 0, NULL);
+}
+
+/* Disable MCDI PTP support.
+ *
+ * Note that this function should never rely on the presence of ptp_data -
+ * may be called before that exists.
+ */
+static int efx_ptp_disable(struct efx_nic *efx)
+{
+	u8 inbuf[MC_CMD_PTP_IN_DISABLE_LEN];
+
+	MCDI_SET_DWORD(inbuf, PTP_IN_OP, MC_CMD_PTP_OP_DISABLE);
+	return efx_mcdi_rpc(efx, MC_CMD_PTP, inbuf, sizeof(inbuf),
+			    NULL, 0, NULL);
+}
+
+static void efx_ptp_deliver_rx_queue(struct sk_buff_head *q)
+{
+	struct sk_buff *skb;
+
+	while ((skb = skb_dequeue(q))) {
+		local_bh_disable();
+		netif_receive_skb(skb);
+		local_bh_enable();
+	}
+}
+
+static void efx_ptp_handle_no_channel(struct efx_nic *efx)
+{
+	netif_err(efx, drv, efx->net_dev,
+		  "ERROR: PTP requires MSI-X and 1 additional interrupt"
+		  "vector. PTP disabled\n");
+}
+
+/* Repeatedly send the host time to the MC which will capture the hardware
+ * time.
+ */
+static void efx_ptp_send_times(struct efx_nic *efx, struct timespec *last_time)
+{
+	struct timespec now;
+	struct timespec limit;
+	struct efx_ptp_data *ptp = efx->ptp_data;
+	struct timespec start;
+	int *mc_running = ptp->start.addr;
+
+	getnstimeofday(&now);
+	start = now;
+	limit = now;
+	timespec_add_ns(&limit, SYNCHRONISE_PERIOD_NS);
+
+	/* Write host time for specified period or until MC is done */
+	while ((timespec_compare(&now, &limit) < 0) &&
+	       ACCESS_ONCE(*mc_running)) {
+		struct timespec update_time;
+		unsigned int host_time;
+
+		/* Don't update continuously to avoid saturating the PCIe bus */
+		update_time = now;
+		timespec_add_ns(&update_time, SYNCHRONISATION_GRANULARITY_NS);
+		do {
+			getnstimeofday(&now);
+		} while ((timespec_compare(&now, &update_time) < 0) &&
+			 ACCESS_ONCE(*mc_running));
+
+		/* Synchronise NIC with single word of time only */
+		host_time = (now.tv_sec << MC_NANOSECOND_BITS) | now.tv_nsec;
+		/* Update host time in NIC memory */
+		_efx_writed(efx, host_time,
+			    FR_CZ_MC_TREG_SMEM + MC_SMEM_P0_PTP_TIME_OFST);
+	}
+	*last_time = now;
+	start = timespec_sub(now, start);
+}
+
+/* Read a timeset from the MC's results and partial process. */
+static void efx_ptp_read_timeset(u8 *data, struct efx_ptp_timeset *timeset)
+{
+	unsigned start_ns, end_ns;
+
+	timeset->host_start = MCDI_DWORD(data, PTP_OUT_SYNCHRONIZE_HOSTSTART);
+	timeset->seconds = MCDI_DWORD(data, PTP_OUT_SYNCHRONIZE_SECONDS);
+	timeset->nanoseconds = MCDI_DWORD(data,
+					 PTP_OUT_SYNCHRONIZE_NANOSECONDS);
+	timeset->host_end = MCDI_DWORD(data, PTP_OUT_SYNCHRONIZE_HOSTEND),
+	timeset->waitns = MCDI_DWORD(data, PTP_OUT_SYNCHRONIZE_WAITNS);
+
+	/* Ignore seconds */
+	start_ns = timeset->host_start & MC_NANOSECOND_MASK;
+	end_ns = timeset->host_end & MC_NANOSECOND_MASK;
+	/* Allow for rollover */
+	if (end_ns < start_ns)
+		end_ns += NSEC_PER_SEC;
+	/* Determine duration of operation */
+	timeset->window = end_ns - start_ns;
+}
+
+/* Process times received from MC.
+ *
+ * Extract times from returned results, and establish the minimum value
+ * seen.  The minimum value represents the "best" possible time and events
+ * too much greater than this are rejected - the machine is, perhaps, too
+ * busy. A number of readings are taken so that, hopefully, at least one good
+ * synchronisation will be seen in the results.
+ */
+static int efx_ptp_process_times(struct efx_nic *efx, u8 *synch_buf,
+				 size_t response_length,
+				 struct timespec *last_time)
+{
+	unsigned number_readings = (response_length /
+			       MC_CMD_PTP_OUT_SYNCHRONIZE_TIMESET_LEN);
+	unsigned i;
+	unsigned min;
+	unsigned min_set = 0;
+	unsigned total;
+	unsigned ngood = 0;
+	unsigned last_good = 0;
+	struct efx_ptp_data *ptp = efx->ptp_data;
+	bool min_valid = false;
+	u32 last_sec;
+	u32 start_sec;
+
+	if (number_readings == 0)
+		return -EAGAIN;
+
+	/* Find minimum value in this set of results, discarding clearly
+	 * erroneous results.
+	 */
+	for (i = 0; i < number_readings; i++) {
+		efx_ptp_read_timeset(synch_buf, &ptp->timeset[i]);
+		synch_buf += MC_CMD_PTP_OUT_SYNCHRONIZE_TIMESET_LEN;
+		if (ptp->timeset[i].window > SYNCHRONISATION_GRANULARITY_NS) {
+			if (min_valid) {
+				if (ptp->timeset[i].window < min_set)
+					min_set = ptp->timeset[i].window;
+			} else {
+				min_valid = true;
+				min_set = ptp->timeset[i].window;
+			}
+		}
+	}
+
+	if (min_valid) {
+		if (ptp->base_sync_valid && (min_set > ptp->base_sync_ns))
+			min = ptp->base_sync_ns;
+		else
+			min = min_set;
+	} else {
+		min = SYNCHRONISATION_GRANULARITY_NS;
+	}
+
+	/* Discard excessively long synchronise durations.  The MC times
+	 * when it finishes reading the host time so the corrected window
+	 * time should be fairly constant for a given platform.
+	 */
+	total = 0;
+	for (i = 0; i < number_readings; i++)
+		if (ptp->timeset[i].window > ptp->timeset[i].waitns) {
+			unsigned win;
+
+			win = ptp->timeset[i].window - ptp->timeset[i].waitns;
+			if (win >= MIN_SYNCHRONISATION_NS &&
+			    win < MAX_SYNCHRONISATION_NS) {
+				total += ptp->timeset[i].window;
+				ngood++;
+				last_good = i;
+			}
+		}
+
+	if (ngood == 0) {
+		netif_warn(efx, drv, efx->net_dev,
+			   "PTP no suitable synchronisations %dns %dns\n",
+			   ptp->base_sync_ns, min_set);
+		return -EAGAIN;
+	}
+
+	/* Average minimum this synchronisation */
+	ptp->last_sync_ns = DIV_ROUND_UP(total, ngood);
+	if (!ptp->base_sync_valid || (ptp->last_sync_ns < ptp->base_sync_ns)) {
+		ptp->base_sync_valid = true;
+		ptp->base_sync_ns = ptp->last_sync_ns;
+	}
+
+	ptp->mc_base_time = ktime_set(ptp->timeset[last_good].seconds,
+				      ptp->timeset[last_good].nanoseconds);
+	last_time->tv_nsec =
+		ptp->timeset[last_good].host_start & MC_NANOSECOND_MASK;
+
+	/* It is possible that the seconds rolled over between taking
+	 * the start reading and the last value written by the host.  The
+	 * timescales are such that a gap of more than one second is never
+	 * expected.
+	 */
+	start_sec = ptp->timeset[last_good].host_start >> MC_NANOSECOND_BITS;
+	last_sec = last_time->tv_sec & MC_SECOND_MASK;
+	if (start_sec != last_sec) {
+		if (((start_sec + 1) & MC_SECOND_MASK) != last_sec) {
+			netif_warn(efx, hw, efx->net_dev,
+				   "PTP bad synchronisation seconds\n");
+			return -EAGAIN;
+		} else {
+			last_time->tv_sec--;
+		}
+	}
+	ptp->host_base_time = ktime_set(last_time->tv_sec,
+					last_time->tv_nsec);
+
+	/* At least one good synchronisation */
+	ptp->base_time_valid = true;
+
+	return 0;
+}
+
+/* Synchronize times between the host and the MC */
+static int efx_ptp_synchronize(struct efx_nic *efx, unsigned int num_readings)
+{
+	struct efx_ptp_data *ptp = efx->ptp_data;
+	u8 synch_buf[MC_CMD_PTP_OUT_SYNCHRONIZE_LENMAX];
+	size_t response_length;
+	int rc;
+	unsigned long timeout;
+	struct timespec last_time;
+	unsigned int loops = 0;
+	int *start = ptp->start.addr;
+
+	last_time.tv_sec = 0;
+	last_time.tv_nsec = 0;
+
+	MCDI_SET_DWORD(synch_buf, PTP_IN_OP, MC_CMD_PTP_OP_SYNCHRONIZE);
+	MCDI_SET_DWORD(synch_buf, PTP_IN_SYNCHRONIZE_NUMTIMESETS,
+		       num_readings);
+	MCDI_SET_DWORD(synch_buf, PTP_IN_SYNCHRONIZE_START_ADDR_LO,
+		       (u32)ptp->start.dma_addr);
+	MCDI_SET_DWORD(synch_buf, PTP_IN_SYNCHRONIZE_START_ADDR_HI,
+		       (u32)((u64)ptp->start.dma_addr >> 32));
+
+	/* Clear flag that signals MC ready */
+	ACCESS_ONCE(*start) = 0;
+	efx_mcdi_rpc_start(efx, MC_CMD_PTP, synch_buf,
+			   MC_CMD_PTP_IN_SYNCHRONIZE_LEN);
+
+	/* Wait for start from MCDI (or timeout) */
+	timeout = jiffies + msecs_to_jiffies(MAX_SYNCHRONISE_WAIT_MS);
+	while (!ACCESS_ONCE(*start) && (time_before(jiffies, timeout))) {
+		udelay(20);	/* Usually start MCDI execution quickly */
+		loops++;
+	}
+
+	if (ACCESS_ONCE(*start))
+		efx_ptp_send_times(efx, &last_time);
+
+	/* Collect results */
+	rc = efx_mcdi_rpc_finish(efx, MC_CMD_PTP,
+				 MC_CMD_PTP_IN_SYNCHRONIZE_LEN,
+				 synch_buf, sizeof(synch_buf),
+				 &response_length);
+	if (rc == 0)
+		rc = efx_ptp_process_times(efx, synch_buf, response_length,
+					   &last_time);
+
+	return rc;
+}
+
+/* Get the host time from a given hardware time */
+static bool efx_ptp_get_host_time(struct efx_nic *efx,
+				  struct skb_shared_hwtstamps *timestamps)
+{
+	if (efx->ptp_data->base_time_valid) {
+		ktime_t diff = ktime_sub(timestamps->hwtstamp,
+					 efx->ptp_data->mc_base_time);
+
+		timestamps->syststamp = ktime_add(efx->ptp_data->host_base_time,
+						  diff);
+	}
+
+	return efx->ptp_data->base_time_valid;
+}
+
+/* Transmit a PTP packet, via the MCDI interface, to the wire. */
+static int efx_ptp_xmit_skb(struct efx_nic *efx, struct sk_buff *skb)
+{
+	u8 *txbuf = efx->ptp_data->txbuf;
+	struct skb_shared_hwtstamps timestamps;
+	int rc = -EIO;
+	/* MCDI driver requires word aligned lengths */
+	size_t len = ALIGN(MC_CMD_PTP_IN_TRANSMIT_LEN(skb->len), 4);
+	u8 txtime[MC_CMD_PTP_OUT_TRANSMIT_LEN];
+
+	MCDI_SET_DWORD(txbuf, PTP_IN_OP, MC_CMD_PTP_OP_TRANSMIT);
+	MCDI_SET_DWORD(txbuf, PTP_IN_TRANSMIT_LENGTH, skb->len);
+	if (skb_shinfo(skb)->nr_frags != 0) {
+		rc = skb_linearize(skb);
+		if (rc != 0)
+			goto fail;
+	}
+
+	if (skb->ip_summed == CHECKSUM_PARTIAL) {
+		rc = skb_checksum_help(skb);
+		if (rc != 0)
+			goto fail;
+	}
+	skb_copy_from_linear_data(skb,
+				  &txbuf[MC_CMD_PTP_IN_TRANSMIT_PACKET_OFST],
+				  len);
+	rc = efx_mcdi_rpc(efx, MC_CMD_PTP, txbuf, len, txtime,
+			  sizeof(txtime), &len);
+	if (rc != 0)
+		goto fail;
+
+	memset(&timestamps, 0, sizeof(timestamps));
+	timestamps.hwtstamp = ktime_set(
+		MCDI_DWORD(txtime, PTP_OUT_TRANSMIT_SECONDS),
+		MCDI_DWORD(txtime, PTP_OUT_TRANSMIT_NANOSECONDS));
+	if (efx_ptp_get_host_time(efx, &timestamps))
+		skb_tstamp_tx(skb, &timestamps);
+	/* Success even if hardware timestamping failed */
+	rc = 0;
+
+fail:
+	dev_kfree_skb(skb);
+
+	return rc;
+}
+
+static void efx_ptp_drop_time_expired_events(struct efx_nic *efx)
+{
+	struct efx_ptp_data *ptp = efx->ptp_data;
+	struct list_head *cursor;
+	struct list_head *next;
+
+	/* Drop time-expired events */
+	spin_lock_bh(&ptp->evt_lock);
+	if (!list_empty(&ptp->evt_list)) {
+		list_for_each_safe(cursor, next, &ptp->evt_list) {
+			struct efx_ptp_event_rx *evt;
+
+			evt = list_entry(cursor, struct efx_ptp_event_rx,
+					 link);
+			if (time_after(jiffies, evt->expiry)) {
+				list_del(&evt->link);
+				list_add(&evt->link, &ptp->evt_free_list);
+				netif_warn(efx, hw, efx->net_dev,
+					   "PTP rx event dropped\n");
+			}
+		}
+	}
+	spin_unlock_bh(&ptp->evt_lock);
+}
+
+static enum ptp_packet_state efx_ptp_match_rx(struct efx_nic *efx,
+					      struct sk_buff *skb)
+{
+	struct efx_ptp_data *ptp = efx->ptp_data;
+	bool evts_waiting;
+	struct list_head *cursor;
+	struct list_head *next;
+	struct efx_ptp_match *match;
+	enum ptp_packet_state rc = PTP_PACKET_STATE_UNMATCHED;
+
+	spin_lock_bh(&ptp->evt_lock);
+	evts_waiting = !list_empty(&ptp->evt_list);
+	spin_unlock_bh(&ptp->evt_lock);
+
+	if (!evts_waiting)
+		return PTP_PACKET_STATE_UNMATCHED;
+
+	match = (struct efx_ptp_match *)skb->cb;
+	/* Look for a matching timestamp in the event queue */
+	spin_lock_bh(&ptp->evt_lock);
+	list_for_each_safe(cursor, next, &ptp->evt_list) {
+		struct efx_ptp_event_rx *evt;
+
+		evt = list_entry(cursor, struct efx_ptp_event_rx, link);
+		if ((evt->seq0 == match->words[0]) &&
+		    (evt->seq1 == match->words[1])) {
+			struct skb_shared_hwtstamps *timestamps;
+
+			/* Match - add in hardware timestamp */
+			timestamps = skb_hwtstamps(skb);
+			timestamps->hwtstamp = evt->hwtimestamp;
+
+			match->state = PTP_PACKET_STATE_MATCHED;
+			rc = PTP_PACKET_STATE_MATCHED;
+			list_del(&evt->link);
+			list_add(&evt->link, &ptp->evt_free_list);
+			break;
+		}
+	}
+	spin_unlock_bh(&ptp->evt_lock);
+
+	return rc;
+}
+
+/* Process any queued receive events and corresponding packets
+ *
+ * q is returned with all the packets that are ready for delivery.
+ * true is returned if at least one of those packets requires
+ * synchronisation.
+ */
+static bool efx_ptp_process_events(struct efx_nic *efx, struct sk_buff_head *q)
+{
+	struct efx_ptp_data *ptp = efx->ptp_data;
+	bool rc = false;
+	struct sk_buff *skb;
+
+	while ((skb = skb_dequeue(&ptp->rxq))) {
+		struct efx_ptp_match *match;
+
+		match = (struct efx_ptp_match *)skb->cb;
+		if (match->state == PTP_PACKET_STATE_MATCH_UNWANTED) {
+			__skb_queue_tail(q, skb);
+		} else if (efx_ptp_match_rx(efx, skb) ==
+			   PTP_PACKET_STATE_MATCHED) {
+			rc = true;
+			__skb_queue_tail(q, skb);
+		} else if (time_after(jiffies, match->expiry)) {
+			match->state = PTP_PACKET_STATE_TIMED_OUT;
+			netif_warn(efx, rx_err, efx->net_dev,
+				   "PTP packet - no timestamp seen\n");
+			__skb_queue_tail(q, skb);
+		} else {
+			/* Replace unprocessed entry and stop */
+			skb_queue_head(&ptp->rxq, skb);
+			break;
+		}
+	}
+
+	return rc;
+}
+
+/* Complete processing of a received packet */
+static void efx_ptp_process_rx(struct efx_nic *efx, struct sk_buff *skb)
+{
+	struct efx_ptp_match *match = (struct efx_ptp_match *)skb->cb;
+
+	/* Translate timestamps, as required */
+	if (match->state == PTP_PACKET_STATE_MATCHED) {
+		struct skb_shared_hwtstamps *timestamps;
+
+		timestamps = skb_hwtstamps(skb);
+		efx_ptp_get_host_time(efx, timestamps);
+	}
+
+	local_bh_disable();
+	netif_receive_skb(skb);
+	local_bh_enable();
+}
+
+static int efx_ptp_start(struct efx_nic *efx)
+{
+	struct efx_ptp_data *ptp = efx->ptp_data;
+	struct efx_filter_spec rxfilter;
+	int rc;
+
+	ptp->reset_required = false;
+
+	/* Must resynchronise when starting */
+	ptp->base_time_valid = false;
+	ptp->base_sync_valid = false;
+
+	/* Must filter on both event and general ports to ensure
+	 * that there is no packet re-ordering.
+	 */
+	efx_filter_init_rx(&rxfilter, EFX_FILTER_PRI_REQUIRED, 0,
+			   efx_rx_queue_index(
+				   efx_channel_get_rx_queue(ptp->channel)));
+	rc = efx_filter_set_ipv4_local(&rxfilter, IPPROTO_UDP,
+				       htonl(PTP_ADDRESS),
+				       htons(PTP_EVENT_PORT));
+	if (rc != 0)
+		return rc;
+
+	rc = efx_filter_insert_filter(efx, &rxfilter, true);
+	if (rc < 0)
+		return rc;
+	ptp->rxfilter_event = rc;
+
+	efx_filter_init_rx(&rxfilter, EFX_FILTER_PRI_REQUIRED, 0,
+			   efx_rx_queue_index(
+				   efx_channel_get_rx_queue(ptp->channel)));
+	rc = efx_filter_set_ipv4_local(&rxfilter, IPPROTO_UDP,
+				       htonl(PTP_ADDRESS),
+				       htons(PTP_GENERAL_PORT));
+	if (rc != 0)
+		goto fail;
+
+	rc = efx_filter_insert_filter(efx, &rxfilter, true);
+	if (rc < 0)
+		goto fail;
+	ptp->rxfilter_general = rc;
+
+	rc = efx_ptp_enable(efx);
+	if (rc != 0)
+		goto fail2;
+
+	ptp->evt_frag_idx = 0;
+	ptp->current_adjfreq = 0;
+	ptp->rxfilter_installed = true;
+
+	return 0;
+
+fail2:
+	efx_filter_remove_id_safe(efx, EFX_FILTER_PRI_REQUIRED,
+				  ptp->rxfilter_general);
+fail:
+	efx_filter_remove_id_safe(efx, EFX_FILTER_PRI_REQUIRED,
+				  ptp->rxfilter_event);
+
+	return rc;
+}
+
+static int efx_ptp_stop(struct efx_nic *efx)
+{
+	struct efx_ptp_data *ptp = efx->ptp_data;
+	int rc = efx_ptp_disable(efx);
+	struct list_head *cursor;
+	struct list_head *next;
+
+	if (ptp->rxfilter_installed) {
+		efx_filter_remove_id_safe(efx, EFX_FILTER_PRI_REQUIRED,
+					  ptp->rxfilter_general);
+		efx_filter_remove_id_safe(efx, EFX_FILTER_PRI_REQUIRED,
+					  ptp->rxfilter_event);
+		ptp->rxfilter_installed = false;
+	}
+
+	/* Make sure RX packets are really delivered */
+	efx_ptp_deliver_rx_queue(&efx->ptp_data->rxq);
+	skb_queue_purge(&efx->ptp_data->txq);
+
+	/* Drop any pending receive events */
+	spin_lock_bh(&efx->ptp_data->evt_lock);
+	list_for_each_safe(cursor, next, &efx->ptp_data->evt_list) {
+		list_del(cursor);
+		list_add(cursor, &efx->ptp_data->evt_free_list);
+	}
+	spin_unlock_bh(&efx->ptp_data->evt_lock);
+
+	return rc;
+}
+
+static void efx_ptp_pps_worker(struct work_struct *work)
+{
+	struct efx_ptp_data *ptp =
+		container_of(work, struct efx_ptp_data, pps_work);
+	struct efx_nic *efx = ptp->channel->efx;
+	struct timespec event_gen_time;
+	struct ptp_clock_event ptp_pps_evt;
+	ktime_t gen_time_host;
+
+	if (efx_ptp_synchronize(efx, PTP_SYNC_ATTEMPTS))
+		return;
+
+	gen_time_host = ktime_sub(ptp->mc_base_time,
+				  ptp->host_base_time);
+	event_gen_time = ktime_to_timespec(gen_time_host);
+
+	ptp_pps_evt.type = PTP_CLOCK_EXTTS;
+	ptp_pps_evt.timestamp = ktime_to_ns(gen_time_host);
+	ptp_clock_event(ptp->phc_clock, &ptp_pps_evt);
+}
+
+/* Process any pending transmissions and timestamp any received packets.
+ *
+ * Host and NIC time are synchronised once if there is any work to do:
+ * the process is relatively expensive so don't do it for each packet.
+ */
+static void efx_ptp_worker(struct work_struct *work)
+{
+	struct efx_ptp_data *ptp_data =
+		container_of(work, struct efx_ptp_data, work);
+	struct efx_nic *efx = ptp_data->channel->efx;
+	struct sk_buff *skb;
+	struct sk_buff_head tempq;
+
+	if (ptp_data->reset_required) {
+		efx_ptp_stop(efx);
+		efx_ptp_start(efx);
+		return;
+	}
+
+	efx_ptp_drop_time_expired_events(efx);
+
+	__skb_queue_head_init(&tempq);
+	if (efx_ptp_process_events(efx, &tempq) ||
+	    !skb_queue_empty(&ptp_data->txq)) {
+		/* Synchronise PC/MC times when there's work to do. This
+		 * isn't fatal but would be unusual (because of the retries
+		 * within efx_ptp_synchronize).  Failure may suggest a heavily
+		 * overloaded system.
+		 */
+		if (0 != efx_ptp_synchronize(efx, PTP_SYNC_ATTEMPTS))
+			netif_warn(efx, drv, efx->net_dev,
+				   "PTP couldn't get synchronisation\n");
+
+		while ((skb = skb_dequeue(&ptp_data->txq)))
+			efx_ptp_xmit_skb(efx, skb);
+	}
+
+	while ((skb = __skb_dequeue(&tempq)))
+		efx_ptp_process_rx(efx, skb);
+}
+
+/* Initialise PTP channel and state.
+ *
+ * Setting core_index to zero causes the queue to be initialised and doesn't
+ * overlap with 'rxq0' because ptp.c doesn't use skb_record_rx_queue.
+ */
+static int efx_ptp_probe_channel(struct efx_channel *channel)
+{
+	struct efx_nic *efx = channel->efx;
+	struct efx_ptp_data *ptp;
+	int rc = 0;
+	unsigned int pos;
+
+	channel->irq_moderation = 0;
+	channel->rx_queue.core_index = 0;
+
+	ptp = kzalloc(sizeof(struct efx_ptp_data), GFP_KERNEL);
+	efx->ptp_data = ptp;
+	if (!efx->ptp_data)
+		return -ENOMEM;
+
+	rc = efx_nic_alloc_buffer(efx, &ptp->start, sizeof(int));
+	if (rc != 0)
+		goto fail1;
+
+	ptp->channel = channel;
+	skb_queue_head_init(&ptp->rxq);
+	skb_queue_head_init(&ptp->txq);
+	ptp->workwq = create_singlethread_workqueue("sfc_ptp");
+	if (!ptp->workwq) {
+		rc = -ENOMEM;
+		goto fail2;
+	}
+
+	INIT_WORK(&ptp->work, efx_ptp_worker);
+	ptp->config.flags = 0;
+	ptp->config.tx_type = HWTSTAMP_TX_OFF;
+	ptp->config.rx_filter = HWTSTAMP_FILTER_NONE;
+	INIT_LIST_HEAD(&ptp->evt_list);
+	INIT_LIST_HEAD(&ptp->evt_free_list);
+	spin_lock_init(&ptp->evt_lock);
+	for (pos = 0; pos < MAX_RECEIVE_EVENTS; pos++)
+		list_add(&ptp->rx_evts[pos].link, &ptp->evt_free_list);
+
+	ptp->phc_clock_info.owner = THIS_MODULE;
+	snprintf(ptp->phc_clock_info.name,
+		 sizeof(ptp->phc_clock_info.name),
+		 "%pm", efx->net_dev->perm_addr);
+	ptp->phc_clock_info.max_adj = MAX_PPB;
+	ptp->phc_clock_info.n_alarm = 0;
+	ptp->phc_clock_info.n_ext_ts = 1;
+	ptp->phc_clock_info.n_per_out = 0;
+	ptp->phc_clock_info.pps = 0;
+	ptp->phc_clock_info.adjfreq = efx_phc_adjfreq;
+	ptp->phc_clock_info.adjtime = efx_phc_adjtime;
+	ptp->phc_clock_info.gettime = efx_phc_gettime;
+	ptp->phc_clock_info.settime = efx_phc_settime;
+	ptp->phc_clock_info.enable = efx_phc_enable;
+
+	ptp->phc_clock = ptp_clock_register(&ptp->phc_clock_info);
+	if (!ptp->phc_clock)
+		goto fail3;
+
+	INIT_WORK(&ptp->pps_work, efx_ptp_pps_worker);
+	ptp->pps_workwq = create_singlethread_workqueue("sfc_pps");
+	if (!ptp->pps_workwq) {
+		rc = -ENOMEM;
+		goto fail4;
+	}
+	ptp->nic_ts_enabled = false;
+
+	return 0;
+fail4:
+	ptp_clock_unregister(efx->ptp_data->phc_clock);
+
+fail3:
+	destroy_workqueue(efx->ptp_data->workwq);
+
+fail2:
+	efx_nic_free_buffer(efx, &ptp->start);
+
+fail1:
+	kfree(efx->ptp_data);
+	efx->ptp_data = 0;
+
+	return rc;
+}
+
+static void efx_ptp_remove_channel(struct efx_channel *channel)
+{
+	struct efx_nic *efx = channel->efx;
+
+	if (!efx->ptp_data)
+		return;
+
+	(void)efx_ptp_disable(channel->efx);
+
+	cancel_work_sync(&efx->ptp_data->work);
+	cancel_work_sync(&efx->ptp_data->pps_work);
+
+	skb_queue_purge(&efx->ptp_data->rxq);
+	skb_queue_purge(&efx->ptp_data->txq);
+
+	ptp_clock_unregister(efx->ptp_data->phc_clock);
+
+	destroy_workqueue(efx->ptp_data->workwq);
+	destroy_workqueue(efx->ptp_data->pps_workwq);
+
+	efx_nic_free_buffer(efx, &efx->ptp_data->start);
+	kfree(efx->ptp_data);
+}
+
+static void efx_ptp_get_channel_name(struct efx_channel *channel,
+				     char *buf, size_t len)
+{
+	snprintf(buf, len, "%s-ptp", channel->efx->name);
+}
+
+/* Determine whether this packet should be processed by the PTP module
+ * or transmitted conventionally.
+ */
+bool efx_ptp_is_ptp_tx(struct efx_nic *efx, struct sk_buff *skb)
+{
+	return efx->ptp_data &&
+		efx->ptp_data->enabled &&
+		skb->len >= PTP_MIN_LENGTH &&
+		skb->len <= MC_CMD_PTP_IN_TRANSMIT_PACKET_MAXNUM &&
+		likely(skb->protocol == htons(ETH_P_IP)) &&
+		ip_hdr(skb)->protocol == IPPROTO_UDP &&
+		udp_hdr(skb)->dest == htons(PTP_EVENT_PORT);
+}
+
+/* Receive a PTP packet.  Packets are queued until the arrival of
+ * the receive timestamp from the MC - this will probably occur after the
+ * packet arrival because of the processing in the MC.
+ */
+static void efx_ptp_rx(struct efx_channel *channel, struct sk_buff *skb)
+{
+	struct efx_nic *efx = channel->efx;
+	struct efx_ptp_data *ptp = efx->ptp_data;
+	struct efx_ptp_match *match = (struct efx_ptp_match *)skb->cb;
+	u8 *data;
+	unsigned int version;
+
+	match->expiry = jiffies + msecs_to_jiffies(PKT_EVENT_LIFETIME_MS);
+
+	/* Correct version? */
+	if (ptp->mode == MC_CMD_PTP_MODE_V1) {
+		if (skb->len < PTP_V1_MIN_LENGTH) {
+			netif_receive_skb(skb);
+			return;
+		}
+		version = ntohs(*(__be16 *)&skb->data[PTP_V1_VERSION_OFFSET]);
+		if (version != PTP_VERSION_V1) {
+			netif_receive_skb(skb);
+			return;
+		}
+	} else {
+		if (skb->len < PTP_V2_MIN_LENGTH) {
+			netif_receive_skb(skb);
+			return;
+		}
+		version = skb->data[PTP_V2_VERSION_OFFSET];
+
+		BUG_ON(ptp->mode != MC_CMD_PTP_MODE_V2);
+		BUILD_BUG_ON(PTP_V1_UUID_OFFSET != PTP_V2_MC_UUID_OFFSET);
+		BUILD_BUG_ON(PTP_V1_UUID_LENGTH != PTP_V2_MC_UUID_LENGTH);
+		BUILD_BUG_ON(PTP_V1_SEQUENCE_OFFSET != PTP_V2_SEQUENCE_OFFSET);
+		BUILD_BUG_ON(PTP_V1_SEQUENCE_LENGTH != PTP_V2_SEQUENCE_LENGTH);
+
+		if ((version & PTP_VERSION_V2_MASK) != PTP_VERSION_V2) {
+			netif_receive_skb(skb);
+			return;
+		}
+	}
+
+	/* Does this packet require timestamping? */
+	if (ntohs(*(__be16 *)&skb->data[PTP_DPORT_OFFSET]) == PTP_EVENT_PORT) {
+		struct skb_shared_hwtstamps *timestamps;
+
+		match->state = PTP_PACKET_STATE_UNMATCHED;
+
+		/* Clear all timestamps held: filled in later */
+		timestamps = skb_hwtstamps(skb);
+		memset(timestamps, 0, sizeof(*timestamps));
+
+		/* Extract UUID/Sequence information */
+		data = skb->data + PTP_V1_UUID_OFFSET;
+		match->words[0] = (data[0]         |
+				   (data[1] << 8)  |
+				   (data[2] << 16) |
+				   (data[3] << 24));
+		match->words[1] = (data[4]         |
+				   (data[5] << 8)  |
+				   (skb->data[PTP_V1_SEQUENCE_OFFSET +
+					      PTP_V1_SEQUENCE_LENGTH - 1] <<
+				    16));
+	} else {
+		match->state = PTP_PACKET_STATE_MATCH_UNWANTED;
+	}
+
+	skb_queue_tail(&ptp->rxq, skb);
+	queue_work(ptp->workwq, &ptp->work);
+}
+
+/* Transmit a PTP packet.  This has to be transmitted by the MC
+ * itself, through an MCDI call.  MCDI calls aren't permitted
+ * in the transmit path so defer the actual transmission to a suitable worker.
+ */
+int efx_ptp_tx(struct efx_nic *efx, struct sk_buff *skb)
+{
+	struct efx_ptp_data *ptp = efx->ptp_data;
+
+	skb_queue_tail(&ptp->txq, skb);
+
+	if ((udp_hdr(skb)->dest == htons(PTP_EVENT_PORT)) &&
+	    (skb->len <= MC_CMD_PTP_IN_TRANSMIT_PACKET_MAXNUM))
+		efx_xmit_hwtstamp_pending(skb);
+	queue_work(ptp->workwq, &ptp->work);
+
+	return NETDEV_TX_OK;
+}
+
+static int efx_ptp_change_mode(struct efx_nic *efx, bool enable_wanted,
+			       unsigned int new_mode)
+{
+	if ((enable_wanted != efx->ptp_data->enabled) ||
+	    (enable_wanted && (efx->ptp_data->mode != new_mode))) {
+		int rc;
+
+		if (enable_wanted) {
+			/* Change of mode requires disable */
+			if (efx->ptp_data->enabled &&
+			    (efx->ptp_data->mode != new_mode)) {
+				efx->ptp_data->enabled = false;
+				rc = efx_ptp_stop(efx);
+				if (rc != 0)
+					return rc;
+			}
+
+			/* Set new operating mode and establish
+			 * baseline synchronisation, which must
+			 * succeed.
+			 */
+			efx->ptp_data->mode = new_mode;
+			rc = efx_ptp_start(efx);
+			if (rc == 0) {
+				rc = efx_ptp_synchronize(efx,
+							 PTP_SYNC_ATTEMPTS * 2);
+				if (rc != 0)
+					efx_ptp_stop(efx);
+			}
+		} else {
+			rc = efx_ptp_stop(efx);
+		}
+
+		if (rc != 0)
+			return rc;
+
+		efx->ptp_data->enabled = enable_wanted;
+	}
+
+	return 0;
+}
+
+static int efx_ptp_ts_init(struct efx_nic *efx, struct hwtstamp_config *init)
+{
+	bool enable_wanted = false;
+	unsigned int new_mode;
+	int rc;
+
+	if (init->flags)
+		return -EINVAL;
+
+	if ((init->tx_type != HWTSTAMP_TX_OFF) &&
+	    (init->tx_type != HWTSTAMP_TX_ON))
+		return -ERANGE;
+
+	new_mode = efx->ptp_data->mode;
+	/* Determine whether any PTP HW operations are required */
+	switch (init->rx_filter) {
+	case HWTSTAMP_FILTER_NONE:
+		break;
+	case HWTSTAMP_FILTER_PTP_V1_L4_EVENT:
+	case HWTSTAMP_FILTER_PTP_V1_L4_SYNC:
+	case HWTSTAMP_FILTER_PTP_V1_L4_DELAY_REQ:
+		init->rx_filter = HWTSTAMP_FILTER_PTP_V1_L4_EVENT;
+		new_mode = MC_CMD_PTP_MODE_V1;
+		enable_wanted = true;
+		break;
+	case HWTSTAMP_FILTER_PTP_V2_L4_EVENT:
+	case HWTSTAMP_FILTER_PTP_V2_L4_SYNC:
+	case HWTSTAMP_FILTER_PTP_V2_L4_DELAY_REQ:
+	/* Although these three are accepted only IPV4 packets will be
+	 * timestamped
+	 */
+	case HWTSTAMP_FILTER_PTP_V2_EVENT:
+	case HWTSTAMP_FILTER_PTP_V2_SYNC:
+	case HWTSTAMP_FILTER_PTP_V2_DELAY_REQ:
+		init->rx_filter = HWTSTAMP_FILTER_PTP_V2_L4_EVENT;
+		new_mode = MC_CMD_PTP_MODE_V2;
+		enable_wanted = true;
+		break;
+	case HWTSTAMP_FILTER_PTP_V2_L2_EVENT:
+	case HWTSTAMP_FILTER_PTP_V2_L2_SYNC:
+	case HWTSTAMP_FILTER_PTP_V2_L2_DELAY_REQ:
+		/* Non-IP timestamping not supported */
+		return -ERANGE;
+		break;
+	default:
+		return -ERANGE;
+	}
+
+	if (init->tx_type != HWTSTAMP_TX_OFF)
+		enable_wanted = true;
+
+	rc = efx_ptp_change_mode(efx, enable_wanted, new_mode);
+	if (rc != 0)
+		return rc;
+
+	efx->ptp_data->config = *init;
+
+	return 0;
+}
+
+int
+efx_ptp_get_ts_info(struct net_device *net_dev, struct ethtool_ts_info *ts_info)
+{
+	struct efx_nic *efx = netdev_priv(net_dev);
+	struct efx_ptp_data *ptp = efx->ptp_data;
+
+	if (!ptp)
+		return -EOPNOTSUPP;
+
+	ts_info->so_timestamping = (SOF_TIMESTAMPING_TX_HARDWARE |
+				    SOF_TIMESTAMPING_RX_HARDWARE |
+				    SOF_TIMESTAMPING_SYS_HARDWARE |
+				    SOF_TIMESTAMPING_RAW_HARDWARE);
+	ts_info->phc_index = ptp_clock_index(ptp->phc_clock);
+	ts_info->tx_types = 1 << HWTSTAMP_TX_OFF | 1 << HWTSTAMP_TX_ON;
+	ts_info->rx_filters = (1 << HWTSTAMP_FILTER_NONE |
+			       1 << HWTSTAMP_FILTER_PTP_V1_L4_EVENT |
+			       1 << HWTSTAMP_FILTER_PTP_V1_L4_SYNC |
+			       1 << HWTSTAMP_FILTER_PTP_V1_L4_DELAY_REQ |
+			       1 << HWTSTAMP_FILTER_PTP_V2_L4_EVENT |
+			       1 << HWTSTAMP_FILTER_PTP_V2_L4_SYNC |
+			       1 << HWTSTAMP_FILTER_PTP_V2_L4_DELAY_REQ |
+			       1 << HWTSTAMP_FILTER_PTP_V2_EVENT |
+			       1 << HWTSTAMP_FILTER_PTP_V2_SYNC |
+			       1 << HWTSTAMP_FILTER_PTP_V2_DELAY_REQ);
+	return 0;
+}
+
+int efx_ptp_ioctl(struct efx_nic *efx, struct ifreq *ifr, int cmd)
+{
+	struct hwtstamp_config config;
+	int rc;
+
+	/* Not a PTP enabled port */
+	if (!efx->ptp_data)
+		return -EOPNOTSUPP;
+
+	if (copy_from_user(&config, ifr->ifr_data, sizeof(config)))
+		return -EFAULT;
+
+	rc = efx_ptp_ts_init(efx, &config);
+	if (rc != 0)
+		return rc;
+
+	return copy_to_user(ifr->ifr_data, &config, sizeof(config))
+		? -EFAULT : 0;
+}
+
+static void ptp_event_failure(struct efx_nic *efx, int expected_frag_len)
+{
+	struct efx_ptp_data *ptp = efx->ptp_data;
+
+	netif_err(efx, hw, efx->net_dev,
+		"PTP unexpected event length: got %d expected %d\n",
+		ptp->evt_frag_idx, expected_frag_len);
+	ptp->reset_required = true;
+	queue_work(ptp->workwq, &ptp->work);
+}
+
+/* Process a completed receive event.  Put it on the event queue and
+ * start worker thread.  This is required because event and their
+ * correspoding packets may come in either order.
+ */
+static void ptp_event_rx(struct efx_nic *efx, struct efx_ptp_data *ptp)
+{
+	struct efx_ptp_event_rx *evt = NULL;
+
+	if (ptp->evt_frag_idx != 3) {
+		ptp_event_failure(efx, 3);
+		return;
+	}
+
+	spin_lock_bh(&ptp->evt_lock);
+	if (!list_empty(&ptp->evt_free_list)) {
+		evt = list_first_entry(&ptp->evt_free_list,
+				       struct efx_ptp_event_rx, link);
+		list_del(&evt->link);
+
+		evt->seq0 = EFX_QWORD_FIELD(ptp->evt_frags[2], MCDI_EVENT_DATA);
+		evt->seq1 = (EFX_QWORD_FIELD(ptp->evt_frags[2],
+					     MCDI_EVENT_SRC)        |
+			     (EFX_QWORD_FIELD(ptp->evt_frags[1],
+					      MCDI_EVENT_SRC) << 8) |
+			     (EFX_QWORD_FIELD(ptp->evt_frags[0],
+					      MCDI_EVENT_SRC) << 16));
+		evt->hwtimestamp = ktime_set(
+			EFX_QWORD_FIELD(ptp->evt_frags[0], MCDI_EVENT_DATA),
+			EFX_QWORD_FIELD(ptp->evt_frags[1], MCDI_EVENT_DATA));
+		evt->expiry = jiffies + msecs_to_jiffies(PKT_EVENT_LIFETIME_MS);
+		list_add_tail(&evt->link, &ptp->evt_list);
+
+		queue_work(ptp->workwq, &ptp->work);
+	} else {
+		netif_err(efx, rx_err, efx->net_dev, "No free PTP event");
+	}
+	spin_unlock_bh(&ptp->evt_lock);
+}
+
+static void ptp_event_fault(struct efx_nic *efx, struct efx_ptp_data *ptp)
+{
+	int code = EFX_QWORD_FIELD(ptp->evt_frags[0], MCDI_EVENT_DATA);
+	if (ptp->evt_frag_idx != 1) {
+		ptp_event_failure(efx, 1);
+		return;
+	}
+
+	netif_err(efx, hw, efx->net_dev, "PTP error %d\n", code);
+}
+
+static void ptp_event_pps(struct efx_nic *efx, struct efx_ptp_data *ptp)
+{
+	if (ptp->nic_ts_enabled)
+		queue_work(ptp->pps_workwq, &ptp->pps_work);
+}
+
+void efx_ptp_event(struct efx_nic *efx, efx_qword_t *ev)
+{
+	struct efx_ptp_data *ptp = efx->ptp_data;
+	int code = EFX_QWORD_FIELD(*ev, MCDI_EVENT_CODE);
+
+	if (!ptp->enabled)
+		return;
+
+	if (ptp->evt_frag_idx == 0) {
+		ptp->evt_code = code;
+	} else if (ptp->evt_code != code) {
+		netif_err(efx, hw, efx->net_dev,
+			  "PTP out of sequence event %d\n", code);
+		ptp->evt_frag_idx = 0;
+	}
+
+	ptp->evt_frags[ptp->evt_frag_idx++] = *ev;
+	if (!MCDI_EVENT_FIELD(*ev, CONT)) {
+		/* Process resulting event */
+		switch (code) {
+		case MCDI_EVENT_CODE_PTP_RX:
+			ptp_event_rx(efx, ptp);
+			break;
+		case MCDI_EVENT_CODE_PTP_FAULT:
+			ptp_event_fault(efx, ptp);
+			break;
+		case MCDI_EVENT_CODE_PTP_PPS:
+			ptp_event_pps(efx, ptp);
+			break;
+		default:
+			netif_err(efx, hw, efx->net_dev,
+				  "PTP unknown event %d\n", code);
+			break;
+		}
+		ptp->evt_frag_idx = 0;
+	} else if (MAX_EVENT_FRAGS == ptp->evt_frag_idx) {
+		netif_err(efx, hw, efx->net_dev,
+			  "PTP too many event fragments\n");
+		ptp->evt_frag_idx = 0;
+	}
+}
+
+static int efx_phc_adjfreq(struct ptp_clock_info *ptp, s32 delta)
+{
+	struct efx_ptp_data *ptp_data = container_of(ptp,
+						     struct efx_ptp_data,
+						     phc_clock_info);
+	struct efx_nic *efx = ptp_data->channel->efx;
+	u8 inadj[MC_CMD_PTP_IN_ADJUST_LEN];
+	s64 adjustment_ns;
+	int rc;
+
+	if (delta > MAX_PPB)
+		delta = MAX_PPB;
+	else if (delta < -MAX_PPB)
+		delta = -MAX_PPB;
+
+	/* Convert ppb to fixed point ns. */
+	adjustment_ns = (((s64)delta * PPB_SCALE_WORD) >>
+			 (PPB_EXTRA_BITS + MAX_PPB_BITS));
+
+	MCDI_SET_DWORD(inadj, PTP_IN_OP, MC_CMD_PTP_OP_ADJUST);
+	MCDI_SET_DWORD(inadj, PTP_IN_ADJUST_FREQ_LO, (u32)adjustment_ns);
+	MCDI_SET_DWORD(inadj, PTP_IN_ADJUST_FREQ_HI,
+		       (u32)(adjustment_ns >> 32));
+	MCDI_SET_DWORD(inadj, PTP_IN_ADJUST_SECONDS, 0);
+	MCDI_SET_DWORD(inadj, PTP_IN_ADJUST_NANOSECONDS, 0);
+	rc = efx_mcdi_rpc(efx, MC_CMD_PTP, inadj, sizeof(inadj),
+			  NULL, 0, NULL);
+	if (rc != 0)
+		return rc;
+
+	ptp_data->current_adjfreq = delta;
+	return 0;
+}
+
+static int efx_phc_adjtime(struct ptp_clock_info *ptp, s64 delta)
+{
+	struct efx_ptp_data *ptp_data = container_of(ptp,
+						     struct efx_ptp_data,
+						     phc_clock_info);
+	struct efx_nic *efx = ptp_data->channel->efx;
+	struct timespec delta_ts = ns_to_timespec(delta);
+	u8 inbuf[MC_CMD_PTP_IN_ADJUST_LEN];
+
+	MCDI_SET_DWORD(inbuf, PTP_IN_OP, MC_CMD_PTP_OP_ADJUST);
+	MCDI_SET_DWORD(inbuf, PTP_IN_ADJUST_FREQ_LO, 0);
+	MCDI_SET_DWORD(inbuf, PTP_IN_ADJUST_FREQ_HI, 0);
+	MCDI_SET_DWORD(inbuf, PTP_IN_ADJUST_SECONDS, (u32)delta_ts.tv_sec);
+	MCDI_SET_DWORD(inbuf, PTP_IN_ADJUST_NANOSECONDS, (u32)delta_ts.tv_nsec);
+	return efx_mcdi_rpc(efx, MC_CMD_PTP, inbuf, sizeof(inbuf),
+			    NULL, 0, NULL);
+}
+
+static int efx_phc_gettime(struct ptp_clock_info *ptp, struct timespec *ts)
+{
+	struct efx_ptp_data *ptp_data = container_of(ptp,
+						     struct efx_ptp_data,
+						     phc_clock_info);
+	struct efx_nic *efx = ptp_data->channel->efx;
+	u8 inbuf[MC_CMD_PTP_IN_READ_NIC_TIME_LEN];
+	u8 outbuf[MC_CMD_PTP_OUT_READ_NIC_TIME_LEN];
+	int rc;
+
+	MCDI_SET_DWORD(inbuf, PTP_IN_OP, MC_CMD_PTP_OP_READ_NIC_TIME);
+
+	rc = efx_mcdi_rpc(efx, MC_CMD_PTP, inbuf, sizeof(inbuf),
+			  outbuf, sizeof(outbuf), NULL);
+	if (rc != 0)
+		return rc;
+
+	ts->tv_sec = MCDI_DWORD(outbuf, PTP_OUT_READ_NIC_TIME_SECONDS);
+	ts->tv_nsec = MCDI_DWORD(outbuf, PTP_OUT_READ_NIC_TIME_NANOSECONDS);
+	return 0;
+}
+
+static int efx_phc_settime(struct ptp_clock_info *ptp,
+			   const struct timespec *e_ts)
+{
+	/* We must provide this function, but we cannot actually set the time */
+	return -EOPNOTSUPP;
+}
+
+static int efx_phc_enable(struct ptp_clock_info *ptp,
+			  struct ptp_clock_request *request,
+			  int enable)
+{
+	struct efx_ptp_data *ptp_data = container_of(ptp,
+						     struct efx_ptp_data,
+						     phc_clock_info);
+	if (request->type != PTP_CLK_REQ_EXTTS)
+		return -EOPNOTSUPP;
+
+	ptp_data->nic_ts_enabled = !!enable;
+	return 0;
+}
+
+static const struct efx_channel_type efx_ptp_channel_type = {
+	.handle_no_channel	= efx_ptp_handle_no_channel,
+	.pre_probe		= efx_ptp_probe_channel,
+	.post_remove		= efx_ptp_remove_channel,
+	.get_name		= efx_ptp_get_channel_name,
+	/* no copy operation; there is no need to reallocate this channel */
+	.receive_skb		= efx_ptp_rx,
+	.keep_eventq		= false,
+};
+
+void efx_ptp_probe(struct efx_nic *efx)
+{
+	/* Check whether PTP is implemented on this NIC.  The DISABLE
+	 * operation will succeed if and only if it is implemented.
+	 */
+	if (efx_ptp_disable(efx) == 0)
+		efx->extra_channel_type[EFX_EXTRA_CHANNEL_PTP] =
+			&efx_ptp_channel_type;
+}
diff --git a/drivers/net/ethernet/sfc/rx.c b/drivers/net/ethernet/sfc/rx.c
index 92699a0..8923863 100644
--- a/drivers/net/ethernet/sfc/rx.c
+++ b/drivers/net/ethernet/sfc/rx.c
@@ -573,7 +573,7 @@ static void efx_rx_deliver(struct efx_channel *channel,
 	/* Record the rx_queue */
 	skb_record_rx_queue(skb, channel->rx_queue.core_index);
 
-	/* Does the channel want to handle the skb */
+	/* Pass the packet up */
 	if (channel->type->receive_skb)
 		channel->type->receive_skb(channel, skb);
 	else
diff --git a/drivers/net/ethernet/sfc/siena.c b/drivers/net/ethernet/sfc/siena.c
index 6bafd21..84b41bf 100644
--- a/drivers/net/ethernet/sfc/siena.c
+++ b/drivers/net/ethernet/sfc/siena.c
@@ -335,6 +335,7 @@ static int siena_probe_nic(struct efx_nic *efx)
 		goto fail5;
 
 	efx_sriov_probe(efx);
+	efx_ptp_probe(efx);
 
 	return 0;
 
diff --git a/drivers/net/ethernet/sfc/tx.c b/drivers/net/ethernet/sfc/tx.c
index 9b225a7..66badf8 100644
--- a/drivers/net/ethernet/sfc/tx.c
+++ b/drivers/net/ethernet/sfc/tx.c
@@ -347,6 +347,12 @@ netdev_tx_t efx_hard_start_xmit(struct sk_buff *skb,
 
 	EFX_WARN_ON_PARANOID(!netif_device_present(net_dev));
 
+	/* PTP "event" packet */
+	if (unlikely(efx_xmit_with_hwtstamp(skb)) &&
+	    unlikely(efx_ptp_is_ptp_tx(efx, skb))) {
+		return efx_ptp_tx(efx, skb);
+	}
+
 	index = skb_get_queue_mapping(skb);
 	type = skb->ip_summed == CHECKSUM_PARTIAL ? EFX_TXQ_TYPE_OFFLOAD : 0;
 	if (index >= efx->n_tx_channels) {
-- 
1.7.7.6



-- 
Ben Hutchings, Staff Engineer, Solarflare
Not speaking for my employer; that's the marketing department's job.
They asked us to note that Solarflare product names are trademarked.

^ permalink raw reply related

* [PATCH net-next 3/7] sfc: Allow efx_mcdi_rpc to be called in two parts
From: Ben Hutchings @ 2012-07-18 18:20 UTC (permalink / raw)
  To: David Miller; +Cc: netdev, linux-net-drivers, Andrew Jackson, Richard Cochran
In-Reply-To: <1342635392.2617.52.camel@bwh-desktop.uk.solarflarecom.com>

For NIC/System time synchonisation efx_mcdi_rpc needs to be split in
efx_mcdi_rpc_start and efx_mcdi_rpc_finish operations.

Signed-off-by: Stuart Hodgson <smhodgson@solarflare.com>
Signed-off-by: Ben Hutchings <bhutchings@solarflare.com>
---
 drivers/net/ethernet/sfc/mcdi.c |   26 +++++++++++++++++++++++---
 drivers/net/ethernet/sfc/mcdi.h |    6 ++++++
 2 files changed, 29 insertions(+), 3 deletions(-)

diff --git a/drivers/net/ethernet/sfc/mcdi.c b/drivers/net/ethernet/sfc/mcdi.c
index fc5e7bb..294df4b 100644
--- a/drivers/net/ethernet/sfc/mcdi.c
+++ b/drivers/net/ethernet/sfc/mcdi.c
@@ -320,14 +320,20 @@ static void efx_mcdi_ev_cpl(struct efx_nic *efx, unsigned int seqno,
 		efx_mcdi_complete(mcdi);
 }
 
-/* Issue the given command by writing the data into the shared memory PDU,
- * ring the doorbell and wait for completion. Copyout the result. */
 int efx_mcdi_rpc(struct efx_nic *efx, unsigned cmd,
 		 const u8 *inbuf, size_t inlen, u8 *outbuf, size_t outlen,
 		 size_t *outlen_actual)
 {
+	efx_mcdi_rpc_start(efx, cmd, inbuf, inlen);
+	return efx_mcdi_rpc_finish(efx, cmd, inlen,
+				   outbuf, outlen, outlen_actual);
+}
+
+void efx_mcdi_rpc_start(struct efx_nic *efx, unsigned cmd, const u8 *inbuf,
+			size_t inlen)
+{
 	struct efx_mcdi_iface *mcdi = efx_mcdi(efx);
-	int rc;
+
 	BUG_ON(efx_nic_rev(efx) < EFX_REV_SIENA_A0);
 
 	efx_mcdi_acquire(mcdi);
@@ -338,6 +344,15 @@ int efx_mcdi_rpc(struct efx_nic *efx, unsigned cmd,
 	spin_unlock_bh(&mcdi->iface_lock);
 
 	efx_mcdi_copyin(efx, cmd, inbuf, inlen);
+}
+
+int efx_mcdi_rpc_finish(struct efx_nic *efx, unsigned cmd, size_t inlen,
+			u8 *outbuf, size_t outlen, size_t *outlen_actual)
+{
+	struct efx_mcdi_iface *mcdi = efx_mcdi(efx);
+	int rc;
+
+	BUG_ON(efx_nic_rev(efx) < EFX_REV_SIENA_A0);
 
 	if (mcdi->mode == MCDI_MODE_POLL)
 		rc = efx_mcdi_poll(efx);
@@ -563,6 +578,11 @@ void efx_mcdi_process_event(struct efx_channel *channel,
 	case MCDI_EVENT_CODE_FLR:
 		efx_sriov_flr(efx, MCDI_EVENT_FIELD(*event, FLR_VF));
 		break;
+	case MCDI_EVENT_CODE_PTP_RX:
+	case MCDI_EVENT_CODE_PTP_FAULT:
+	case MCDI_EVENT_CODE_PTP_PPS:
+		efx_ptp_event(efx, event);
+		break;
 
 	default:
 		netif_err(efx, hw, efx->net_dev, "Unknown MCDI event 0x%x\n",
diff --git a/drivers/net/ethernet/sfc/mcdi.h b/drivers/net/ethernet/sfc/mcdi.h
index 0bdf3e3..dc25caa 100644
--- a/drivers/net/ethernet/sfc/mcdi.h
+++ b/drivers/net/ethernet/sfc/mcdi.h
@@ -71,6 +71,12 @@ extern int efx_mcdi_rpc(struct efx_nic *efx, unsigned cmd, const u8 *inbuf,
 			size_t inlen, u8 *outbuf, size_t outlen,
 			size_t *outlen_actual);
 
+extern void efx_mcdi_rpc_start(struct efx_nic *efx, unsigned cmd,
+			       const u8 *inbuf, size_t inlen);
+extern int efx_mcdi_rpc_finish(struct efx_nic *efx, unsigned cmd, size_t inlen,
+			       u8 *outbuf, size_t outlen,
+			       size_t *outlen_actual);
+
 extern int efx_mcdi_poll_reboot(struct efx_nic *efx);
 extern void efx_mcdi_mode_poll(struct efx_nic *efx);
 extern void efx_mcdi_mode_event(struct efx_nic *efx);
-- 
1.7.7.6



-- 
Ben Hutchings, Staff Engineer, Solarflare
Not speaking for my employer; that's the marketing department's job.
They asked us to note that Solarflare product names are trademarked.

^ permalink raw reply related

* [PATCH net-next 2/7] sfc: Add channel specific receive_skb handler and post_remove callback
From: Ben Hutchings @ 2012-07-18 18:20 UTC (permalink / raw)
  To: David Miller; +Cc: netdev, linux-net-drivers, Andrew Jackson, Richard Cochran
In-Reply-To: <1342635392.2617.52.camel@bwh-desktop.uk.solarflarecom.com>

Allows an extra channel to override the standard receive_skb handler
and also for extra non generic operations to be performed on remove.

Also set default rx strategy so only skbs can be delivered to the
PTP receive function.

Signed-off-by: Stuart Hodgson <smhodgson@solarflare.com>
Signed-off-by: Ben Hutchings <bhutchings@solarflare.com>
---
 drivers/net/ethernet/sfc/efx.c        |    6 ++++++
 drivers/net/ethernet/sfc/efx.h        |    1 +
 drivers/net/ethernet/sfc/net_driver.h |    3 +++
 drivers/net/ethernet/sfc/rx.c         |   15 ++++++++++++---
 4 files changed, 22 insertions(+), 3 deletions(-)

diff --git a/drivers/net/ethernet/sfc/efx.c b/drivers/net/ethernet/sfc/efx.c
index 3ae6774..1c53d4b 100644
--- a/drivers/net/ethernet/sfc/efx.c
+++ b/drivers/net/ethernet/sfc/efx.c
@@ -714,6 +714,7 @@ static void efx_remove_channel(struct efx_channel *channel)
 	efx_for_each_possible_channel_tx_queue(tx_queue, channel)
 		efx_remove_tx_queue(tx_queue);
 	efx_remove_eventq(channel);
+	channel->type->post_remove(channel);
 }
 
 static void efx_remove_channels(struct efx_nic *efx)
@@ -828,6 +829,7 @@ void efx_schedule_slow_fill(struct efx_rx_queue *rx_queue)
 
 static const struct efx_channel_type efx_default_channel_type = {
 	.pre_probe		= efx_channel_dummy_op_int,
+	.post_remove		= efx_channel_dummy_op_void,
 	.get_name		= efx_get_channel_name,
 	.copy			= efx_copy_channel,
 	.keep_eventq		= false,
@@ -838,6 +840,10 @@ int efx_channel_dummy_op_int(struct efx_channel *channel)
 	return 0;
 }
 
+void efx_channel_dummy_op_void(struct efx_channel *channel)
+{
+}
+
 /**************************************************************************
  *
  * Port handling
diff --git a/drivers/net/ethernet/sfc/efx.h b/drivers/net/ethernet/sfc/efx.h
index be8f915..121441f 100644
--- a/drivers/net/ethernet/sfc/efx.h
+++ b/drivers/net/ethernet/sfc/efx.h
@@ -96,6 +96,7 @@ static inline void efx_filter_rfs_expire(struct efx_channel *channel) {}
 
 /* Channels */
 extern int efx_channel_dummy_op_int(struct efx_channel *channel);
+extern void efx_channel_dummy_op_void(struct efx_channel *channel);
 extern void efx_process_channel_now(struct efx_channel *channel);
 extern int
 efx_realloc_channels(struct efx_nic *efx, u32 rxq_entries, u32 txq_entries);
diff --git a/drivers/net/ethernet/sfc/net_driver.h b/drivers/net/ethernet/sfc/net_driver.h
index 3894593..9913e32 100644
--- a/drivers/net/ethernet/sfc/net_driver.h
+++ b/drivers/net/ethernet/sfc/net_driver.h
@@ -393,14 +393,17 @@ struct efx_channel {
  * @get_name: Generate the channel's name (used for its IRQ handler)
  * @copy: Copy the channel state prior to reallocation.  May be %NULL if
  *	reallocation is not supported.
+ * @receive_skb: Handle an skb ready to be passed to netif_receive_skb()
  * @keep_eventq: Flag for whether event queue should be kept initialised
  *	while the device is stopped
  */
 struct efx_channel_type {
 	void (*handle_no_channel)(struct efx_nic *);
 	int (*pre_probe)(struct efx_channel *);
+	void (*post_remove)(struct efx_channel *);
 	void (*get_name)(struct efx_channel *, char *buf, size_t len);
 	struct efx_channel *(*copy)(const struct efx_channel *);
+	void (*receive_skb)(struct efx_channel *, struct sk_buff *);
 	bool keep_eventq;
 };
 
diff --git a/drivers/net/ethernet/sfc/rx.c b/drivers/net/ethernet/sfc/rx.c
index 3f598bf..92699a0 100644
--- a/drivers/net/ethernet/sfc/rx.c
+++ b/drivers/net/ethernet/sfc/rx.c
@@ -573,8 +573,11 @@ static void efx_rx_deliver(struct efx_channel *channel,
 	/* Record the rx_queue */
 	skb_record_rx_queue(skb, channel->rx_queue.core_index);
 
-	/* Pass the packet up */
-	netif_receive_skb(skb);
+	/* Does the channel want to handle the skb */
+	if (channel->type->receive_skb)
+		channel->type->receive_skb(channel, skb);
+	else
+		netif_receive_skb(skb);
 
 	/* Update allocation strategy method */
 	channel->rx_alloc_level += RX_ALLOC_FACTOR_SKB;
@@ -616,7 +619,8 @@ void __efx_rx_packet(struct efx_channel *channel, struct efx_rx_buffer *rx_buf)
 	if (unlikely(!(efx->net_dev->features & NETIF_F_RXCSUM)))
 		rx_buf->flags &= ~EFX_RX_PKT_CSUMMED;
 
-	if (likely(rx_buf->flags & (EFX_RX_BUF_PAGE | EFX_RX_PKT_CSUMMED)))
+	if (likely(rx_buf->flags & (EFX_RX_BUF_PAGE | EFX_RX_PKT_CSUMMED)) &&
+	    !channel->type->receive_skb)
 		efx_rx_packet_gro(channel, rx_buf, eh);
 	else
 		efx_rx_deliver(channel, rx_buf);
@@ -626,6 +630,11 @@ void efx_rx_strategy(struct efx_channel *channel)
 {
 	enum efx_rx_alloc_method method = rx_alloc_method;
 
+	if (channel->type->receive_skb) {
+		channel->rx_alloc_push_pages = false;
+		return;
+	}
+
 	/* Only makes sense to use page based allocation if GRO is enabled */
 	if (!(channel->efx->net_dev->features & NETIF_F_GRO)) {
 		method = RX_ALLOC_METHOD_SKB;
-- 
1.7.7.6

^ permalink raw reply related

* [PATCH net-next 1/7] sfc: Add explicit RX queue flag to channel
From: Ben Hutchings @ 2012-07-18 18:19 UTC (permalink / raw)
  To: David Miller; +Cc: netdev, linux-net-drivers, Andrew Jackson, Richard Cochran
In-Reply-To: <1342635392.2617.52.camel@bwh-desktop.uk.solarflarecom.com>

The PTP channel will have its own RX queue even though it's not
a regular traffic channel.

Original work by Ben Hutchings <bhutchings@solarflare.com>

Signed-off-by: Stuart Hodgson <smhodgson@solarflare.com>
Signed-off-by: Ben Hutchings <bhutchings@solarflare.com>
---
 drivers/net/ethernet/sfc/efx.c        |    8 +++++++-
 drivers/net/ethernet/sfc/net_driver.h |    5 ++++-
 drivers/net/ethernet/sfc/rx.c         |    7 +++++--
 3 files changed, 16 insertions(+), 4 deletions(-)

diff --git a/drivers/net/ethernet/sfc/efx.c b/drivers/net/ethernet/sfc/efx.c
index 70554a1..3ae6774 100644
--- a/drivers/net/ethernet/sfc/efx.c
+++ b/drivers/net/ethernet/sfc/efx.c
@@ -1422,10 +1422,16 @@ static void efx_set_channels(struct efx_nic *efx)
 	efx->tx_channel_offset =
 		separate_tx_channels ? efx->n_channels - efx->n_tx_channels : 0;
 
-	/* We need to adjust the TX queue numbers if we have separate
+	/* We need to mark which channels really have RX and TX
+	 * queues, and adjust the TX queue numbers if we have separate
 	 * RX-only and TX-only channels.
 	 */
 	efx_for_each_channel(channel, efx) {
+		if (channel->channel < efx->n_rx_channels)
+			channel->rx_queue.core_index = channel->channel;
+		else
+			channel->rx_queue.core_index = -1;
+
 		efx_for_each_channel_tx_queue(tx_queue, channel)
 			tx_queue->queue -= (efx->tx_channel_offset *
 					    EFX_TXQ_TYPES);
diff --git a/drivers/net/ethernet/sfc/net_driver.h b/drivers/net/ethernet/sfc/net_driver.h
index 55be2fd..3894593 100644
--- a/drivers/net/ethernet/sfc/net_driver.h
+++ b/drivers/net/ethernet/sfc/net_driver.h
@@ -242,6 +242,8 @@ struct efx_rx_page_state {
 /**
  * struct efx_rx_queue - An Efx RX queue
  * @efx: The associated Efx NIC
+ * @core_index:  Index of network core RX queue.  Will be >= 0 iff this
+ *	is associated with a real RX queue.
  * @buffer: The software buffer ring
  * @rxd: The hardware descriptor ring
  * @ptr_mask: The size of the ring minus 1.
@@ -263,6 +265,7 @@ struct efx_rx_page_state {
  */
 struct efx_rx_queue {
 	struct efx_nic *efx;
+	int core_index;
 	struct efx_rx_buffer *buffer;
 	struct efx_special_buffer rxd;
 	unsigned int ptr_mask;
@@ -1044,7 +1047,7 @@ static inline bool efx_tx_queue_used(struct efx_tx_queue *tx_queue)
 
 static inline bool efx_channel_has_rx_queue(struct efx_channel *channel)
 {
-	return channel->channel < channel->efx->n_rx_channels;
+	return channel->rx_queue.core_index >= 0;
 }
 
 static inline struct efx_rx_queue *
diff --git a/drivers/net/ethernet/sfc/rx.c b/drivers/net/ethernet/sfc/rx.c
index 6d1c6cf..3f598bf 100644
--- a/drivers/net/ethernet/sfc/rx.c
+++ b/drivers/net/ethernet/sfc/rx.c
@@ -478,7 +478,7 @@ static void efx_rx_packet_gro(struct efx_channel *channel,
 		skb->ip_summed = ((rx_buf->flags & EFX_RX_PKT_CSUMMED) ?
 				  CHECKSUM_UNNECESSARY : CHECKSUM_NONE);
 
-		skb_record_rx_queue(skb, channel->channel);
+		skb_record_rx_queue(skb, channel->rx_queue.core_index);
 
 		gro_result = napi_gro_frags(napi);
 	} else {
@@ -570,6 +570,9 @@ static void efx_rx_deliver(struct efx_channel *channel,
 	/* Set the SKB flags */
 	skb_checksum_none_assert(skb);
 
+	/* Record the rx_queue */
+	skb_record_rx_queue(skb, channel->rx_queue.core_index);
+
 	/* Pass the packet up */
 	netif_receive_skb(skb);
 
@@ -607,7 +610,7 @@ void __efx_rx_packet(struct efx_channel *channel, struct efx_rx_buffer *rx_buf)
 		 * at the ethernet header */
 		skb->protocol = eth_type_trans(skb, efx->net_dev);
 
-		skb_record_rx_queue(skb, channel->channel);
+		skb_record_rx_queue(skb, channel->rx_queue.core_index);
 	}
 
 	if (unlikely(!(efx->net_dev->features & NETIF_F_RXCSUM)))
-- 
1.7.7.6



-- 
Ben Hutchings, Staff Engineer, Solarflare
Not speaking for my employer; that's the marketing department's job.
They asked us to note that Solarflare product names are trademarked.

^ permalink raw reply related

* Re: [patch net-next] team: refine IFF_XMIT_DST_RELEASE capability
From: Eric Dumazet @ 2012-07-18 18:17 UTC (permalink / raw)
  To: Jiri Pirko; +Cc: netdev, davem, edumazet
In-Reply-To: <1342633178-1411-1-git-send-email-jiri@resnulli.us>

On Wed, 2012-07-18 at 19:39 +0200, Jiri Pirko wrote:
> Cloned patch of Eric Dumazet for bonding.
> 
> Some workloads greatly benefit of IFF_XMIT_DST_RELEASE capability
> on output net device, avoiding dirtying dst refcount.
> 
> team currently disables IFF_XMIT_DST_RELEASE unconditionally.
> 
> If all ports have the IFF_XMIT_DST_RELEASE bit set, then
> team dev can also have it in its priv_flags.
> 
> Signed-off-by: Jiri Pirko <jiri@resnulli.us>
> ---
>  drivers/net/team/team.c |    5 +++++
>  1 file changed, 5 insertions(+)

Acked-by: Eric Dumazet <edumazet@google.com>

Thanks

^ permalink raw reply

* [PATCH] sunrpc: clnt: Add missing braces
From: Joe Perches @ 2012-07-18 18:17 UTC (permalink / raw)
  To: Trond Myklebust, J. Bruce Fields, Chuck Lever
  Cc: David S. Miller, linux-nfs, netdev, linux-kernel

Add a missing set of braces that commit 4e0038b6b24
("SUNRPC: Move clnt->cl_server into struct rpc_xprt")
forgot.

Signed-off-by: Joe Perches <joe@perches.com>
---
 net/sunrpc/clnt.c |    3 ++-
 1 files changed, 2 insertions(+), 1 deletions(-)

diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c
index f56f045..aaf70aa 100644
--- a/net/sunrpc/clnt.c
+++ b/net/sunrpc/clnt.c
@@ -1844,12 +1844,13 @@ call_timeout(struct rpc_task *task)
 		return;
 	}
 	if (RPC_IS_SOFT(task)) {
-		if (clnt->cl_chatty)
+		if (clnt->cl_chatty) {
 			rcu_read_lock();
 			printk(KERN_NOTICE "%s: server %s not responding, timed out\n",
 				clnt->cl_protname,
 				rcu_dereference(clnt->cl_xprt)->servername);
 			rcu_read_unlock();
+		}
 		if (task->tk_flags & RPC_TASK_TIMEOUT)
 			rpc_exit(task, -ETIMEDOUT);
 		else

^ permalink raw reply related

* pull request: sfc-next 2012-07-18
From: Ben Hutchings @ 2012-07-18 18:16 UTC (permalink / raw)
  To: David Miller; +Cc: linux-net-drivers, netdev, Andrew Jackson, Richard Cochran

[-- Attachment #1: Type: text/plain, Size: 1996 bytes --]

The following changes since commit f150bd7f8cf742c4cdd0c929aa494ef72f7f5b13:

  driver: net: ethernet: cpsw: runtime PM support (2012-07-18 09:40:54 -0700)

are available in the git repository at:
  git://git.kernel.org/pub/scm/linux/kernel/git/bwh/sfc-next.git for-davem

(commit 2482aab313c3657557f1dad4c1b20c0934b84919)

This adds PTP hardware timestamping support for SFC9000-family
controllers with the appropriate peripheral (currently SFN5322F and
SFN6322F boards).

Ben.

Ben Hutchings (3):
      sfc: Support variable-length response to MCDI GET_BOARD_CFG
      sfc: Expose FPGA bitfile partition through MTD
      sfc: Bump version to 3.2

Stuart Hodgson (4):
      sfc: Add explicit RX queue flag to channel
      sfc: Add channel specific receive_skb handler and post_remove callback
      sfc: Allow efx_mcdi_rpc to be called in two parts
      sfc: Add support for IEEE-1588 PTP

 drivers/net/ethernet/sfc/Kconfig      |    7 +
 drivers/net/ethernet/sfc/Makefile     |    1 +
 drivers/net/ethernet/sfc/efx.c        |   17 +-
 drivers/net/ethernet/sfc/efx.h        |    1 +
 drivers/net/ethernet/sfc/ethtool.c    |    1 +
 drivers/net/ethernet/sfc/mcdi.c       |   44 +-
 drivers/net/ethernet/sfc/mcdi.h       |    6 +
 drivers/net/ethernet/sfc/mcdi_pcol.h  |    1 +
 drivers/net/ethernet/sfc/mtd.c        |    7 +-
 drivers/net/ethernet/sfc/net_driver.h |   29 +-
 drivers/net/ethernet/sfc/nic.h        |   31 +
 drivers/net/ethernet/sfc/ptp.c        | 1519 +++++++++++++++++++++++++++++++++
 drivers/net/ethernet/sfc/rx.c         |   20 +-
 drivers/net/ethernet/sfc/siena.c      |    1 +
 drivers/net/ethernet/sfc/tx.c         |    6 +
 15 files changed, 1673 insertions(+), 18 deletions(-)
 create mode 100644 drivers/net/ethernet/sfc/ptp.c

-- 
Ben Hutchings, Staff Engineer, Solarflare
Not speaking for my employer; that's the marketing department's job.
They asked us to note that Solarflare product names are trademarked.




[-- Attachment #2: This is a digitally signed message part --]
[-- Type: application/pgp-signature, Size: 490 bytes --]

^ permalink raw reply

* [PATCH net-next v4] ipv6: add ipv6_addr_hash() helper
From: Eric Dumazet @ 2012-07-18 18:11 UTC (permalink / raw)
  To: David Miller; +Cc: Joe Perches, netdev, Andrew McGregor, Dave Taht, Tom Herbert
In-Reply-To: <1342621670.2626.2818.camel@edumazet-glaptop>

From: Eric Dumazet <edumazet@google.com>

Introduce ipv6_addr_hash() helper doing a XOR on all bits
of an IPv6 address, with an optimized x86_64 version.

Use it in flow dissector, as suggested by Andrew McGregor,
to reduce hash collision probabilities in fq_codel (and other
users of flow dissector)

Use it in ip6_tunnel.c and use more bit shuffling, as suggested
by David Laight, as existing hash was ignoring most of them.

Use it in sunrpc and use more bit shuffling, using hash_32().

Use it in net/ipv6/addrconf.c, using hash_32() as well.

As a cleanup, use it in net/ipv4/tcp_metrics.c

Signed-off-by: Eric Dumazet <edumazet@google.com>
Reported-by: Andrew McGregor <andrewmcgr@gmail.com>
Cc: Dave Taht <dave.taht@gmail.com>
Cc: Tom Herbert <therbert@google.com>
Cc: David Laight <David.Laight@ACULAB.COM>
Cc: Joe Perches <joe@perches.com>
---
v4: net/ipv6/addrconf.c part, sorry again David

 include/net/addrconf.h    |    3 ++-
 include/net/ipv6.h        |   13 +++++++++++++
 net/core/flow_dissector.c |    5 +++--
 net/ipv4/tcp_metrics.c    |   15 +++------------
 net/ipv6/addrconf.c       |   21 ++++++++-------------
 net/ipv6/ip6_tunnel.c     |   20 ++++++++++++--------
 net/sunrpc/svcauth_unix.c |   22 ++++------------------
 7 files changed, 45 insertions(+), 54 deletions(-)

diff --git a/include/net/addrconf.h b/include/net/addrconf.h
index f2b801c..089a09d 100644
--- a/include/net/addrconf.h
+++ b/include/net/addrconf.h
@@ -46,7 +46,8 @@ struct prefix_info {
 #include <net/if_inet6.h>
 #include <net/ipv6.h>
 
-#define IN6_ADDR_HSIZE		16
+#define IN6_ADDR_HSIZE_SHIFT	4
+#define IN6_ADDR_HSIZE		(1 << IN6_ADDR_HSIZE_SHIFT)
 
 extern int			addrconf_init(void);
 extern void			addrconf_cleanup(void);
diff --git a/include/net/ipv6.h b/include/net/ipv6.h
index f695f39..01c34b3 100644
--- a/include/net/ipv6.h
+++ b/include/net/ipv6.h
@@ -419,6 +419,19 @@ static inline bool ipv6_addr_any(const struct in6_addr *a)
 #endif
 }
 
+static inline u32 ipv6_addr_hash(const struct in6_addr *a)
+{
+#if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) && BITS_PER_LONG == 64
+	const unsigned long *ul = (const unsigned long *)a;
+	unsigned long x = ul[0] ^ ul[1];
+
+	return (u32)(x ^ (x >> 32));
+#else
+	return (__force u32)(a->s6_addr32[0] ^ a->s6_addr32[1] ^
+			     a->s6_addr32[2] ^ a->s6_addr32[3]);
+#endif
+}
+
 static inline bool ipv6_addr_loopback(const struct in6_addr *a)
 {
 	return (a->s6_addr32[0] | a->s6_addr32[1] |
diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c
index a225089..466820b 100644
--- a/net/core/flow_dissector.c
+++ b/net/core/flow_dissector.c
@@ -4,6 +4,7 @@
 #include <linux/ipv6.h>
 #include <linux/if_vlan.h>
 #include <net/ip.h>
+#include <net/ipv6.h>
 #include <linux/if_tunnel.h>
 #include <linux/if_pppox.h>
 #include <linux/ppp_defs.h>
@@ -55,8 +56,8 @@ ipv6:
 			return false;
 
 		ip_proto = iph->nexthdr;
-		flow->src = iph->saddr.s6_addr32[3];
-		flow->dst = iph->daddr.s6_addr32[3];
+		flow->src = (__force __be32)ipv6_addr_hash(&iph->saddr);
+		flow->dst = (__force __be32)ipv6_addr_hash(&iph->daddr);
 		nhoff += sizeof(struct ipv6hdr);
 		break;
 	}
diff --git a/net/ipv4/tcp_metrics.c b/net/ipv4/tcp_metrics.c
index 5a38a2d..1a115b6 100644
--- a/net/ipv4/tcp_metrics.c
+++ b/net/ipv4/tcp_metrics.c
@@ -211,10 +211,7 @@ static struct tcp_metrics_block *__tcp_get_metrics_req(struct request_sock *req,
 		break;
 	case AF_INET6:
 		*(struct in6_addr *)addr.addr.a6 = inet6_rsk(req)->rmt_addr;
-		hash = ((__force unsigned int) addr.addr.a6[0] ^
-			(__force unsigned int) addr.addr.a6[1] ^
-			(__force unsigned int) addr.addr.a6[2] ^
-			(__force unsigned int) addr.addr.a6[3]);
+		hash = ipv6_addr_hash(&inet6_rsk(req)->rmt_addr);
 		break;
 	default:
 		return NULL;
@@ -251,10 +248,7 @@ static struct tcp_metrics_block *__tcp_get_metrics_tw(struct inet_timewait_sock
 	case AF_INET6:
 		tw6 = inet6_twsk((struct sock *)tw);
 		*(struct in6_addr *)addr.addr.a6 = tw6->tw_v6_daddr;
-		hash = ((__force unsigned int) addr.addr.a6[0] ^
-			(__force unsigned int) addr.addr.a6[1] ^
-			(__force unsigned int) addr.addr.a6[2] ^
-			(__force unsigned int) addr.addr.a6[3]);
+		hash = ipv6_addr_hash(&tw6->tw_v6_daddr);
 		break;
 	default:
 		return NULL;
@@ -291,10 +285,7 @@ static struct tcp_metrics_block *tcp_get_metrics(struct sock *sk,
 		break;
 	case AF_INET6:
 		*(struct in6_addr *)addr.addr.a6 = inet6_sk(sk)->daddr;
-		hash = ((__force unsigned int) addr.addr.a6[0] ^
-			(__force unsigned int) addr.addr.a6[1] ^
-			(__force unsigned int) addr.addr.a6[2] ^
-			(__force unsigned int) addr.addr.a6[3]);
+		hash = ipv6_addr_hash(&inet6_sk(sk)->daddr);
 		break;
 	default:
 		return NULL;
diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index 8f6411c..7918181 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -63,6 +63,7 @@
 #include <linux/delay.h>
 #include <linux/notifier.h>
 #include <linux/string.h>
+#include <linux/hash.h>
 
 #include <net/net_namespace.h>
 #include <net/sock.h>
@@ -579,15 +580,9 @@ ipv6_link_dev_addr(struct inet6_dev *idev, struct inet6_ifaddr *ifp)
 	list_add_tail(&ifp->if_list, p);
 }
 
-static u32 ipv6_addr_hash(const struct in6_addr *addr)
+static u32 inet6_addr_hash(const struct in6_addr *addr)
 {
-	/*
-	 * We perform the hash function over the last 64 bits of the address
-	 * This will include the IEEE address token on links that support it.
-	 */
-	return jhash_2words((__force u32)addr->s6_addr32[2],
-			    (__force u32)addr->s6_addr32[3], 0)
-		& (IN6_ADDR_HSIZE - 1);
+	return hash_32(ipv6_addr_hash(addr), IN6_ADDR_HSIZE_SHIFT);
 }
 
 /* On success it returns ifp with increased reference count */
@@ -662,7 +657,7 @@ ipv6_add_addr(struct inet6_dev *idev, const struct in6_addr *addr, int pfxlen,
 	in6_ifa_hold(ifa);
 
 	/* Add to big hash table */
-	hash = ipv6_addr_hash(addr);
+	hash = inet6_addr_hash(addr);
 
 	hlist_add_head_rcu(&ifa->addr_lst, &inet6_addr_lst[hash]);
 	spin_unlock(&addrconf_hash_lock);
@@ -1270,7 +1265,7 @@ int ipv6_chk_addr(struct net *net, const struct in6_addr *addr,
 {
 	struct inet6_ifaddr *ifp;
 	struct hlist_node *node;
-	unsigned int hash = ipv6_addr_hash(addr);
+	unsigned int hash = inet6_addr_hash(addr);
 
 	rcu_read_lock_bh();
 	hlist_for_each_entry_rcu(ifp, node, &inet6_addr_lst[hash], addr_lst) {
@@ -1293,7 +1288,7 @@ EXPORT_SYMBOL(ipv6_chk_addr);
 static bool ipv6_chk_same_addr(struct net *net, const struct in6_addr *addr,
 			       struct net_device *dev)
 {
-	unsigned int hash = ipv6_addr_hash(addr);
+	unsigned int hash = inet6_addr_hash(addr);
 	struct inet6_ifaddr *ifp;
 	struct hlist_node *node;
 
@@ -1336,7 +1331,7 @@ struct inet6_ifaddr *ipv6_get_ifaddr(struct net *net, const struct in6_addr *add
 				     struct net_device *dev, int strict)
 {
 	struct inet6_ifaddr *ifp, *result = NULL;
-	unsigned int hash = ipv6_addr_hash(addr);
+	unsigned int hash = inet6_addr_hash(addr);
 	struct hlist_node *node;
 
 	rcu_read_lock_bh();
@@ -3223,7 +3218,7 @@ int ipv6_chk_home_addr(struct net *net, const struct in6_addr *addr)
 	int ret = 0;
 	struct inet6_ifaddr *ifp = NULL;
 	struct hlist_node *n;
-	unsigned int hash = ipv6_addr_hash(addr);
+	unsigned int hash = inet6_addr_hash(addr);
 
 	rcu_read_lock_bh();
 	hlist_for_each_entry_rcu_bh(ifp, n, &inet6_addr_lst[hash], addr_lst) {
diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c
index db32846..9a1d5fe 100644
--- a/net/ipv6/ip6_tunnel.c
+++ b/net/ipv6/ip6_tunnel.c
@@ -40,6 +40,7 @@
 #include <linux/rtnetlink.h>
 #include <linux/netfilter_ipv6.h>
 #include <linux/slab.h>
+#include <linux/hash.h>
 
 #include <asm/uaccess.h>
 #include <linux/atomic.h>
@@ -70,11 +71,15 @@ MODULE_ALIAS_NETDEV("ip6tnl0");
 #define IPV6_TCLASS_MASK (IPV6_FLOWINFO_MASK & ~IPV6_FLOWLABEL_MASK)
 #define IPV6_TCLASS_SHIFT 20
 
-#define HASH_SIZE  32
+#define HASH_SIZE_SHIFT  5
+#define HASH_SIZE (1 << HASH_SIZE_SHIFT)
 
-#define HASH(addr) ((__force u32)((addr)->s6_addr32[0] ^ (addr)->s6_addr32[1] ^ \
-		     (addr)->s6_addr32[2] ^ (addr)->s6_addr32[3]) & \
-		    (HASH_SIZE - 1))
+static u32 HASH(const struct in6_addr *addr1, const struct in6_addr *addr2)
+{
+	u32 hash = ipv6_addr_hash(addr1) ^ ipv6_addr_hash(addr2);
+
+	return hash_32(hash, HASH_SIZE_SHIFT);
+}
 
 static int ip6_tnl_dev_init(struct net_device *dev);
 static void ip6_tnl_dev_setup(struct net_device *dev);
@@ -166,12 +171,11 @@ static inline void ip6_tnl_dst_store(struct ip6_tnl *t, struct dst_entry *dst)
 static struct ip6_tnl *
 ip6_tnl_lookup(struct net *net, const struct in6_addr *remote, const struct in6_addr *local)
 {
-	unsigned int h0 = HASH(remote);
-	unsigned int h1 = HASH(local);
+	unsigned int hash = HASH(remote, local);
 	struct ip6_tnl *t;
 	struct ip6_tnl_net *ip6n = net_generic(net, ip6_tnl_net_id);
 
-	for_each_ip6_tunnel_rcu(ip6n->tnls_r_l[h0 ^ h1]) {
+	for_each_ip6_tunnel_rcu(ip6n->tnls_r_l[hash]) {
 		if (ipv6_addr_equal(local, &t->parms.laddr) &&
 		    ipv6_addr_equal(remote, &t->parms.raddr) &&
 		    (t->dev->flags & IFF_UP))
@@ -205,7 +209,7 @@ ip6_tnl_bucket(struct ip6_tnl_net *ip6n, const struct ip6_tnl_parm *p)
 
 	if (!ipv6_addr_any(remote) || !ipv6_addr_any(local)) {
 		prio = 1;
-		h = HASH(remote) ^ HASH(local);
+		h = HASH(remote, local);
 	}
 	return &ip6n->tnls[prio][h];
 }
diff --git a/net/sunrpc/svcauth_unix.c b/net/sunrpc/svcauth_unix.c
index 2777fa8..4d01292 100644
--- a/net/sunrpc/svcauth_unix.c
+++ b/net/sunrpc/svcauth_unix.c
@@ -104,23 +104,9 @@ static void ip_map_put(struct kref *kref)
 	kfree(im);
 }
 
-#if IP_HASHBITS == 8
-/* hash_long on a 64 bit machine is currently REALLY BAD for
- * IP addresses in reverse-endian (i.e. on a little-endian machine).
- * So use a trivial but reliable hash instead
- */
-static inline int hash_ip(__be32 ip)
-{
-	int hash = (__force u32)ip ^ ((__force u32)ip>>16);
-	return (hash ^ (hash>>8)) & 0xff;
-}
-#endif
-static inline int hash_ip6(struct in6_addr ip)
+static inline int hash_ip6(const struct in6_addr *ip)
 {
-	return (hash_ip(ip.s6_addr32[0]) ^
-		hash_ip(ip.s6_addr32[1]) ^
-		hash_ip(ip.s6_addr32[2]) ^
-		hash_ip(ip.s6_addr32[3]));
+	return hash_32(ipv6_addr_hash(ip), IP_HASHBITS);
 }
 static int ip_map_match(struct cache_head *corig, struct cache_head *cnew)
 {
@@ -301,7 +287,7 @@ static struct ip_map *__ip_map_lookup(struct cache_detail *cd, char *class,
 	ip.m_addr = *addr;
 	ch = sunrpc_cache_lookup(cd, &ip.h,
 				 hash_str(class, IP_HASHBITS) ^
-				 hash_ip6(*addr));
+				 hash_ip6(addr));
 
 	if (ch)
 		return container_of(ch, struct ip_map, h);
@@ -331,7 +317,7 @@ static int __ip_map_update(struct cache_detail *cd, struct ip_map *ipm,
 	ip.h.expiry_time = expiry;
 	ch = sunrpc_cache_update(cd, &ip.h, &ipm->h,
 				 hash_str(ipm->m_class, IP_HASHBITS) ^
-				 hash_ip6(ipm->m_addr));
+				 hash_ip6(&ipm->m_addr));
 	if (!ch)
 		return -ENOMEM;
 	cache_put(ch, cd);

^ permalink raw reply related

* [PATCH v2] sctp: Implement quick failover draft from tsvwg
From: Neil Horman @ 2012-07-18 18:01 UTC (permalink / raw)
  To: netdev
  Cc: Neil Horman, Vlad Yasevich, Sridhar Samudrala, David S. Miller,
	linux-sctp
In-Reply-To: <1342203998-24037-1-git-send-email-nhorman@tuxdriver.com>

I've seen several attempts recently made to do quick failover of sctp transports
by reducing various retransmit timers and counters.  While its possible to
implement a faster failover on multihomed sctp associations, its not
particularly robust, in that it can lead to unneeded retransmits, as well as
false connection failures due to intermittent latency on a network.

Instead, lets implement the new ietf quick failover draft found here:
http://tools.ietf.org/html/draft-nishida-tsvwg-sctp-failover-05

This will let the sctp stack identify transports that have had a small number of
errors, and avoid using them quickly until their reliability can be
re-established.  I've tested this out on two virt guests connected via multiple
isolated virt networks and believe its in compliance with the above draft and
works well.

Signed-off-by: Neil Horman <nhorman@tuxdriver.com>
CC: Vlad Yasevich <vyasevich@gmail.com>
CC: Sridhar Samudrala <sri@us.ibm.com>
CC: "David S. Miller" <davem@davemloft.net>
CC: linux-sctp@vger.kernel.org

---
Change notes:

V2)
- Added socket option API from section 6.1 of the specification, as per
request from Vlad. Adding this socket option allows us to alter both the path
maximum retransmit value and the path partial failure threshold for each
transport and the association as a whole.

- Added a per transport pf_retrans value, and initialized it from the
association value.  This makes each transport independently configurable as per
the socket option above, and prevents changes in the sysctl from bleeding into
an already created association.
---
 Documentation/networking/ip-sysctl.txt |   14 +++++
 include/net/sctp/constants.h           |    1 +
 include/net/sctp/structs.h             |   11 +++-
 include/net/sctp/user.h                |   11 ++++
 net/sctp/associola.c                   |   36 ++++++++++--
 net/sctp/outqueue.c                    |    6 +-
 net/sctp/sm_sideeffect.c               |   33 ++++++++++-
 net/sctp/socket.c                      |   96 ++++++++++++++++++++++++++++++++
 net/sctp/sysctl.c                      |    9 +++
 net/sctp/transport.c                   |    4 +-
 10 files changed, 206 insertions(+), 15 deletions(-)

diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt
index 47b6c79..c636f9c 100644
--- a/Documentation/networking/ip-sysctl.txt
+++ b/Documentation/networking/ip-sysctl.txt
@@ -1408,6 +1408,20 @@ path_max_retrans - INTEGER
 
 	Default: 5
 
+pf_retrans - INTEGER
+	The number of retransmissions that will be attempted on a given path
+	before traffic is redirected to an alternate transport (should one
+	exist).  Note this is distinct from path_max_retrans, as a path that
+	passes the pf_retrans threshold can still be used.  Its only
+	deprioritized when a transmission path is selected by the stack.  This
+	setting is primarily used to enable fast failover mechanisms without
+	having to reduce path_max_retrans to a very low value.  See:
+	http://www.ietf.org/id/draft-nishida-tsvwg-sctp-failover-05.txt
+	for details.  Note also that a value of pf_retrans > path_max_retrans
+	disables this feature
+
+	Default: 0
+
 rto_initial - INTEGER
 	The initial round trip timeout value in milliseconds that will be used
 	in calculating round trip times.  This is the initial time interval
diff --git a/include/net/sctp/constants.h b/include/net/sctp/constants.h
index 942b864..d053d2e 100644
--- a/include/net/sctp/constants.h
+++ b/include/net/sctp/constants.h
@@ -334,6 +334,7 @@ typedef enum {
 typedef enum {
 	SCTP_TRANSPORT_UP,
 	SCTP_TRANSPORT_DOWN,
+	SCTP_TRANSPORT_PF,
 } sctp_transport_cmd_t;
 
 /* These are the address scopes defined mainly for IPv4 addresses
diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h
index e4652fe..f70726c 100644
--- a/include/net/sctp/structs.h
+++ b/include/net/sctp/structs.h
@@ -160,6 +160,7 @@ extern struct sctp_globals {
 	int max_retrans_association;
 	int max_retrans_path;
 	int max_retrans_init;
+	int pf_retrans;
 
 	/*
 	 * Policy for preforming sctp/socket accounting
@@ -258,6 +259,7 @@ extern struct sctp_globals {
 #define sctp_sndbuf_policy	 	(sctp_globals.sndbuf_policy)
 #define sctp_rcvbuf_policy	 	(sctp_globals.rcvbuf_policy)
 #define sctp_max_retrans_path		(sctp_globals.max_retrans_path)
+#define sctp_pf_retrans			(sctp_globals.pf_retrans)
 #define sctp_max_retrans_init		(sctp_globals.max_retrans_init)
 #define sctp_sack_timeout		(sctp_globals.sack_timeout)
 #define sctp_hb_interval		(sctp_globals.hb_interval)
@@ -987,10 +989,15 @@ struct sctp_transport {
 
 	/* This is the max_retrans value for the transport and will
 	 * be initialized from the assocs value.  This can be changed
-	 * using SCTP_SET_PEER_ADDR_PARAMS socket option.
+	 * using the SCTP_SET_PEER_ADDR_PARAMS socket option.
 	 */
 	__u16 pathmaxrxt;
 
+	/* This is the partially failed retrans value for the transport
+	 * and will be initialized from the assocs value.  This can be changed
+	 * using the SCTP_PEER_ADDR_THLDS socket option
+	 */
+	int pf_retrans;
 	/* PMTU	      : The current known path MTU.  */
 	__u32 pathmtu;
 
@@ -1660,6 +1667,8 @@ struct sctp_association {
 	 */
 	int max_retrans;
 
+	int pf_retrans;
+
 	/* Maximum number of times the endpoint will retransmit INIT  */
 	__u16 max_init_attempts;
 
diff --git a/include/net/sctp/user.h b/include/net/sctp/user.h
index 0842ef0..1b02d7a 100644
--- a/include/net/sctp/user.h
+++ b/include/net/sctp/user.h
@@ -93,6 +93,7 @@ typedef __s32 sctp_assoc_t;
 #define SCTP_GET_ASSOC_NUMBER	28	/* Read only */
 #define SCTP_GET_ASSOC_ID_LIST	29	/* Read only */
 #define SCTP_AUTO_ASCONF       30
+#define SCTP_PEER_ADDR_THLDS	31
 
 /* Internal Socket Options. Some of the sctp library functions are
  * implemented using these socket options.
@@ -649,6 +650,7 @@ struct sctp_paddrinfo {
  */
 enum sctp_spinfo_state {
 	SCTP_INACTIVE,
+	SCTP_PF,
 	SCTP_ACTIVE,
 	SCTP_UNCONFIRMED,
 	SCTP_UNKNOWN = 0xffff  /* Value used for transport state unknown */
@@ -741,4 +743,13 @@ typedef struct {
 	int sd;
 } sctp_peeloff_arg_t;
 
+/*
+ *  Peer Address Thresholds socket option
+ */
+struct sctp_paddrthlds {
+	sctp_assoc_t spt_assoc_id;
+	struct sockaddr_storage spt_address;
+	__u16 spt_pathmaxrxt;
+	__u16 spt_pathpfthld;
+};
 #endif /* __net_sctp_user_h__ */
diff --git a/net/sctp/associola.c b/net/sctp/associola.c
index 5bc9ab1..b357195 100644
--- a/net/sctp/associola.c
+++ b/net/sctp/associola.c
@@ -124,6 +124,8 @@ static struct sctp_association *sctp_association_init(struct sctp_association *a
 	 * socket values.
 	 */
 	asoc->max_retrans = sp->assocparams.sasoc_asocmaxrxt;
+	asoc->pf_retrans  = sctp_pf_retrans;
+
 	asoc->rto_initial = msecs_to_jiffies(sp->rtoinfo.srto_initial);
 	asoc->rto_max = msecs_to_jiffies(sp->rtoinfo.srto_max);
 	asoc->rto_min = msecs_to_jiffies(sp->rtoinfo.srto_min);
@@ -685,6 +687,9 @@ struct sctp_transport *sctp_assoc_add_peer(struct sctp_association *asoc,
 	/* Set the path max_retrans.  */
 	peer->pathmaxrxt = asoc->pathmaxrxt;
 
+	/* And the partial failure retrnas threshold */
+	peer->pf_retrans = asoc->pf_retrans;
+
 	/* Initialize the peer's SACK delay timeout based on the
 	 * association configured value.
 	 */
@@ -840,6 +845,7 @@ void sctp_assoc_control_transport(struct sctp_association *asoc,
 	struct sctp_ulpevent *event;
 	struct sockaddr_storage addr;
 	int spc_state = 0;
+	bool ulp_notify = true;
 
 	/* Record the transition on the transport.  */
 	switch (command) {
@@ -853,6 +859,14 @@ void sctp_assoc_control_transport(struct sctp_association *asoc,
 			spc_state = SCTP_ADDR_CONFIRMED;
 		else
 			spc_state = SCTP_ADDR_AVAILABLE;
+		/* Don't inform ULP about transition from PF to
+		 * active state and set cwnd to 1, see SCTP
+		 * Quick failover draft section 5.1, point 5
+		 */
+		if (transport->state == SCTP_PF) {
+			ulp_notify = false;
+			transport->cwnd = 1;
+		}
 		transport->state = SCTP_ACTIVE;
 		break;
 
@@ -871,6 +885,10 @@ void sctp_assoc_control_transport(struct sctp_association *asoc,
 		spc_state = SCTP_ADDR_UNREACHABLE;
 		break;
 
+	case SCTP_TRANSPORT_PF:
+		transport->state = SCTP_PF;
+		ulp_notify = false;
+		break;
 	default:
 		return;
 	}
@@ -878,12 +896,15 @@ void sctp_assoc_control_transport(struct sctp_association *asoc,
 	/* Generate and send a SCTP_PEER_ADDR_CHANGE notification to the
 	 * user.
 	 */
-	memset(&addr, 0, sizeof(struct sockaddr_storage));
-	memcpy(&addr, &transport->ipaddr, transport->af_specific->sockaddr_len);
-	event = sctp_ulpevent_make_peer_addr_change(asoc, &addr,
-				0, spc_state, error, GFP_ATOMIC);
-	if (event)
-		sctp_ulpq_tail_event(&asoc->ulpq, event);
+	if (ulp_notify) {
+		memset(&addr, 0, sizeof(struct sockaddr_storage));
+		memcpy(&addr, &transport->ipaddr,
+		       transport->af_specific->sockaddr_len);
+		event = sctp_ulpevent_make_peer_addr_change(asoc, &addr,
+					0, spc_state, error, GFP_ATOMIC);
+		if (event)
+			sctp_ulpq_tail_event(&asoc->ulpq, event);
+	}
 
 	/* Select new active and retran paths. */
 
@@ -899,7 +920,8 @@ void sctp_assoc_control_transport(struct sctp_association *asoc,
 			transports) {
 
 		if ((t->state == SCTP_INACTIVE) ||
-		    (t->state == SCTP_UNCONFIRMED))
+		    (t->state == SCTP_UNCONFIRMED) ||
+		    (t->state == SCTP_PF))
 			continue;
 		if (!first || t->last_time_heard > first->last_time_heard) {
 			second = first;
diff --git a/net/sctp/outqueue.c b/net/sctp/outqueue.c
index a0fa19f..e7aa177c 100644
--- a/net/sctp/outqueue.c
+++ b/net/sctp/outqueue.c
@@ -792,7 +792,8 @@ static int sctp_outq_flush(struct sctp_outq *q, int rtx_timeout)
 			if (!new_transport)
 				new_transport = asoc->peer.active_path;
 		} else if ((new_transport->state == SCTP_INACTIVE) ||
-			   (new_transport->state == SCTP_UNCONFIRMED)) {
+			   (new_transport->state == SCTP_UNCONFIRMED) ||
+			   (new_transport->state == SCTP_PF)) {
 			/* If the chunk is Heartbeat or Heartbeat Ack,
 			 * send it to chunk->transport, even if it's
 			 * inactive.
@@ -987,7 +988,8 @@ static int sctp_outq_flush(struct sctp_outq *q, int rtx_timeout)
 			new_transport = chunk->transport;
 			if (!new_transport ||
 			    ((new_transport->state == SCTP_INACTIVE) ||
-			     (new_transport->state == SCTP_UNCONFIRMED)))
+			     (new_transport->state == SCTP_UNCONFIRMED) ||
+			     (new_transport->state == SCTP_PF)))
 				new_transport = asoc->peer.active_path;
 			if (new_transport->state == SCTP_UNCONFIRMED)
 				continue;
diff --git a/net/sctp/sm_sideeffect.c b/net/sctp/sm_sideeffect.c
index c96d1a8..285e26a 100644
--- a/net/sctp/sm_sideeffect.c
+++ b/net/sctp/sm_sideeffect.c
@@ -76,6 +76,8 @@ static int sctp_side_effects(sctp_event_t event_type, sctp_subtype_t subtype,
 			     sctp_cmd_seq_t *commands,
 			     gfp_t gfp);
 
+static void sctp_cmd_hb_timer_update(sctp_cmd_seq_t *cmds,
+				     struct sctp_transport *t);
 /********************************************************************
  * Helper functions
  ********************************************************************/
@@ -470,7 +472,8 @@ sctp_timer_event_t *sctp_timer_events[SCTP_NUM_TIMEOUT_TYPES] = {
  * notification SHOULD be sent to the upper layer.
  *
  */
-static void sctp_do_8_2_transport_strike(struct sctp_association *asoc,
+static void sctp_do_8_2_transport_strike(sctp_cmd_seq_t *commands,
+					 struct sctp_association *asoc,
 					 struct sctp_transport *transport,
 					 int is_hb)
 {
@@ -495,6 +498,23 @@ static void sctp_do_8_2_transport_strike(struct sctp_association *asoc,
 			transport->error_count++;
 	}
 
+	/* If the transport error count is greater than the pf_retrans
+	 * threshold, and less than pathmaxrtx, then mark this transport
+	 * as Partially Failed, ee SCTP Quick Failover Draft, secon 5.1,
+	 * point 1
+	 */
+	if ((transport->state != SCTP_PF) &&
+	   (asoc->pf_retrans < transport->pathmaxrxt) &&
+	   (transport->error_count > asoc->pf_retrans)) {
+
+		sctp_assoc_control_transport(asoc, transport,
+					     SCTP_TRANSPORT_PF,
+					     0);
+
+		/* Update the hb timer to resend a heartbeat every rto */
+		sctp_cmd_hb_timer_update(commands, transport);
+	}
+
 	if (transport->state != SCTP_INACTIVE &&
 	    (transport->error_count > transport->pathmaxrxt)) {
 		SCTP_DEBUG_PRINTK_IPADDR("transport_strike:association %p",
@@ -699,6 +719,10 @@ static void sctp_cmd_transport_on(sctp_cmd_seq_t *cmds,
 					     SCTP_HEARTBEAT_SUCCESS);
 	}
 
+	if (t->state == SCTP_PF)
+		sctp_assoc_control_transport(asoc, t, SCTP_TRANSPORT_UP,
+					     SCTP_HEARTBEAT_SUCCESS);
+
 	/* The receiver of the HEARTBEAT ACK should also perform an
 	 * RTT measurement for that destination transport address
 	 * using the time value carried in the HEARTBEAT ACK chunk.
@@ -1565,8 +1589,8 @@ static int sctp_cmd_interpreter(sctp_event_t event_type,
 
 		case SCTP_CMD_STRIKE:
 			/* Mark one strike against a transport.  */
-			sctp_do_8_2_transport_strike(asoc, cmd->obj.transport,
-						    0);
+			sctp_do_8_2_transport_strike(commands, asoc,
+						    cmd->obj.transport, 0);
 			break;
 
 		case SCTP_CMD_TRANSPORT_IDLE:
@@ -1576,7 +1600,8 @@ static int sctp_cmd_interpreter(sctp_event_t event_type,
 
 		case SCTP_CMD_TRANSPORT_HB_SENT:
 			t = cmd->obj.transport;
-			sctp_do_8_2_transport_strike(asoc, t, 1);
+			sctp_do_8_2_transport_strike(commands, asoc,
+						     t, 1);
 			t->hb_sent = 1;
 			break;
 
diff --git a/net/sctp/socket.c b/net/sctp/socket.c
index b3b8a8d..dfffece 100644
--- a/net/sctp/socket.c
+++ b/net/sctp/socket.c
@@ -3470,6 +3470,52 @@ static int sctp_setsockopt_auto_asconf(struct sock *sk, char __user *optval,
 }
 
 
+/*
+ * SCTP_PEER_ADDR_THLDS
+ *
+ * This option allows us to alter the partially failed threshold for one or all
+ * transports in an association.  See Section 6.1 of:
+ * http://www.ietf.org/id/draft-nishida-tsvwg-sctp-failover-05.txt
+ */
+static int sctp_setsockopt_paddr_thresholds(struct sock *sk,
+					    char __user *optval,
+					    unsigned int optlen)
+{
+	struct sctp_paddrthlds val;
+	struct sctp_transport *trans;
+	struct sctp_association *asoc;
+
+	if (optlen < sizeof(struct sctp_paddrthlds))
+		return -EINVAL;
+	if (copy_from_user(&val, (struct sctp_paddrthlds __user *)optval,
+			   optlen))
+		return -EFAULT;
+
+	if (sctp_is_any(sk, (const union sctp_addr *)&val.spt_address)) {
+		asoc = sctp_id2assoc(sk, val.spt_assoc_id);
+		if (!asoc)
+			return -ENOENT;
+		list_for_each_entry(trans, &asoc->peer.transport_addr_list,
+				    transports) {
+			trans->pathmaxrxt = val.spt_pathmaxrxt;
+			trans->pf_retrans = val.spt_pathpfthld;
+		}
+
+		asoc->pf_retrans = val.spt_pathpfthld;
+		asoc->pathmaxrxt = val.spt_pathmaxrxt;
+	} else {
+		trans = sctp_addr_id2transport(sk, &val.spt_address,
+					       val.spt_assoc_id);
+		if (!trans)
+			return -ENOENT;
+
+		trans->pathmaxrxt = val.spt_pathmaxrxt;
+		trans->pf_retrans = val.spt_pathpfthld;
+	}
+
+	return 0;
+}
+
 /* API 6.2 setsockopt(), getsockopt()
  *
  * Applications use setsockopt() and getsockopt() to set or retrieve
@@ -3619,6 +3665,9 @@ SCTP_STATIC int sctp_setsockopt(struct sock *sk, int level, int optname,
 	case SCTP_AUTO_ASCONF:
 		retval = sctp_setsockopt_auto_asconf(sk, optval, optlen);
 		break;
+	case SCTP_PEER_ADDR_THLDS:
+		retval = sctp_setsockopt_paddr_thresholds(sk, optval, optlen);
+		break;
 	default:
 		retval = -ENOPROTOOPT;
 		break;
@@ -5490,6 +5539,50 @@ static int sctp_getsockopt_assoc_ids(struct sock *sk, int len,
 	return 0;
 }
 
+/*
+ * SCTP_PEER_ADDR_THLDS
+ *
+ * This option allows us to fetch the partially failed threshold for one or all
+ * transports in an association.  See Section 6.1 of:
+ * http://www.ietf.org/id/draft-nishida-tsvwg-sctp-failover-05.txt
+ */
+static int sctp_getsockopt_paddr_thresholds(struct sock *sk,
+					    char __user *optval,
+					    int optlen)
+{
+	struct sctp_paddrthlds val;
+	struct sctp_transport *trans;
+	struct sctp_association *asoc;
+
+	if (optlen < sizeof(struct sctp_paddrthlds))
+		return -EINVAL;
+	if (copy_from_user(&val, (struct sctp_paddrthlds __user *)optval, optlen))
+		return -EFAULT;
+
+	if (sctp_is_any(sk, (const union sctp_addr *)&val.spt_address)) {
+			val.spt_assoc_id);
+		asoc = sctp_id2assoc(sk, val.spt_assoc_id);
+		if (!asoc)
+			return -ENOENT;
+
+		val.spt_pathpfthld = asoc->pf_retrans;
+		val.spt_pathmaxrxt = asoc->pathmaxrxt;
+	} else {
+		trans = sctp_addr_id2transport(sk, &val.spt_address,
+					       val.spt_assoc_id);
+		if (!trans)
+			return -ENOENT;
+
+		val.spt_pathmaxrxt = trans->pathmaxrxt;
+		val.spt_pathpfthld = trans->pf_retrans;
+	}
+
+	if (copy_to_user(optval, &val, optlen))
+		return -EFAULT;
+
+	return 0;
+}
+
 SCTP_STATIC int sctp_getsockopt(struct sock *sk, int level, int optname,
 				char __user *optval, int __user *optlen)
 {
@@ -5628,6 +5721,9 @@ SCTP_STATIC int sctp_getsockopt(struct sock *sk, int level, int optname,
 	case SCTP_AUTO_ASCONF:
 		retval = sctp_getsockopt_auto_asconf(sk, len, optval, optlen);
 		break;
+	case SCTP_PEER_ADDR_THLDS:
+		retval = sctp_getsockopt_paddr_thresholds(sk, optval, len);
+		break;
 	default:
 		retval = -ENOPROTOOPT;
 		break;
diff --git a/net/sctp/sysctl.c b/net/sctp/sysctl.c
index e5fe639..2b2bfe9 100644
--- a/net/sctp/sysctl.c
+++ b/net/sctp/sysctl.c
@@ -141,6 +141,15 @@ static ctl_table sctp_table[] = {
 		.extra2		= &int_max
 	},
 	{
+		.procname	= "pf_retrans",
+		.data		= &sctp_pf_retrans,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_minmax,
+		.extra1		= &zero,
+		.extra2		= &int_max
+	},
+	{
 		.procname	= "max_init_retransmits",
 		.data		= &sctp_max_retrans_init,
 		.maxlen		= sizeof(int),
diff --git a/net/sctp/transport.c b/net/sctp/transport.c
index b026ba0..194d0f3 100644
--- a/net/sctp/transport.c
+++ b/net/sctp/transport.c
@@ -85,6 +85,7 @@ static struct sctp_transport *sctp_transport_init(struct sctp_transport *peer,
 
 	/* Initialize the default path max_retrans.  */
 	peer->pathmaxrxt  = sctp_max_retrans_path;
+	peer->pf_retrans  = sctp_pf_retrans;
 
 	INIT_LIST_HEAD(&peer->transmitted);
 	INIT_LIST_HEAD(&peer->send_ready);
@@ -585,7 +586,8 @@ unsigned long sctp_transport_timeout(struct sctp_transport *t)
 {
 	unsigned long timeout;
 	timeout = t->rto + sctp_jitter(t->rto);
-	if (t->state != SCTP_UNCONFIRMED)
+	if ((t->state != SCTP_UNCONFIRMED) &&
+	    (t->state != SCTP_PF))
 		timeout += t->hbinterval;
 	timeout += jiffies;
 	return timeout;
-- 
1.7.7.6

^ permalink raw reply related

* Re: That's pretty much it for 3.5.0
From: Eric Dumazet @ 2012-07-18 17:55 UTC (permalink / raw)
  To: Rustad, Mark D
  Cc: Neil Horman, Fastabend, John R,
	<h@hmsreliant.think-freely.org>, David Miller,
	<netdev@vger.kernel.org>,
	<linux-wireless@vger.kernel.org>,
	<netfilter-devel@vger.kernel.org>
In-Reply-To: <205259E8-A99F-4573-96C9-7A394235B338@intel.com>

On Wed, 2012-07-18 at 17:36 +0000, Rustad, Mark D wrote:
> On Jul 18, 2012, at 6:04 AM, Neil Horman wrote:
> 
> > John, can you post the backtrace you got for this?  I replied to the patch that
> > you posted for this fix.  the cgroup subsystem has an early_init flag thats
> > supposed to prevent the initialization of cgroups that don't need initialization
> > until later (like via module_init() calls).
> 
> Here is the backtrace that I get and below a patch that fixes it:
> 
> [    0.010958] Initializing cgroup subsys net_prio
> [    0.011040] BUG: unable to handle kernel NULL pointer dereference at 0000000000000828
> [    0.011998] IP: [<ffffffff814202c8>] update_netdev_tables+0x68/0xe0
> [    0.011998] PGD 0 
> [    0.011998] Oops: 0000 [#1] SMP 
> [    0.011998] CPU 0 
> [    0.011998] Modules linked in:
> [    0.011998] 
> [    0.011998] Pid: 0, comm: swapper/0 Not tainted 3.5.0-rc7-mdrlinux+ #10 Bochs Bochs
> [    0.011998] RIP: 0010:[<ffffffff814202c8>]  [<ffffffff814202c8>] update_netdev_tables+0x68/0xe0
> [    0.011998] RSP: 0000:ffffffff81a01e68  EFLAGS: 00010246
> [    0.011998] RAX: 0000000000000000 RBX: fffffffffffffed0 RCX: 0000000000000000
> [    0.011998] RDX: 0000000000000006 RSI: 2222222222222222 RDI: 2222222222222222
> [    0.011998] RBP: ffffffff81a01e88 R08: 2222222222222222 R09: 2222222222222222
> [    0.011998] R10: 0000000000000000 R11: 0000000000000001 R12: 0000000000000001
> [    0.011998] R13: 0000000000000000 R14: ffff88007ff608c0 R15: 00000000000143d0
> [    0.011998] FS:  0000000000000000(0000) GS:ffff88007fc00000(0000) knlGS:0000000000000000
> [    0.011998] CS:  0010 DS: 0000 ES: 0000 CR0: 000000008005003b
> [    0.011998] CR2: 0000000000000828 CR3: 0000000001a0b000 CR4: 00000000000006b0
> [    0.011998] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
> [    0.011998] DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400
> [    0.011998] Process swapper/0 (pid: 0, threadinfo ffffffff81a00000, task ffffffff81a13420)
> [    0.011998] Stack:
> [    0.011998]  ffffffff81a88020 0000000000000000 ffff88007d3a38f0 fffffffffffffff4
> [    0.011998]  ffffffff81a01ec8 ffffffff814203cd ffff88007ff608c0 ffffffff817d8e9a
> [    0.011998]  ffffffff81a88cd8 ffffffff81a88020 ffffffff81a88020 ffffffff81b010a0
> [    0.011998] Call Trace:
> [    0.011998]  [<ffffffff814203cd>] cgrp_create+0x8d/0xc0
> [    0.011998]  [<ffffffff81ade14a>] cgroup_init_subsys+0x80/0x126
> [    0.011998]  [<ffffffff81ade380>] cgroup_init+0x36/0x117
> [    0.011998]  [<ffffffff81acab71>] start_kernel+0x32e/0x34f
> [    0.011998]  [<ffffffff81aca6d5>] ? repair_env_string+0x5a/0x5a
> [    0.011998]  [<ffffffff81aca346>] x86_64_start_reservations+0x101/0x105
> [    0.011998]  [<ffffffff81aca120>] ? early_idt_handlers+0x120/0x120
> [    0.011998]  [<ffffffff81aca417>] x86_64_start_kernel+0xcd/0xdc
> [    0.011998] Code: 0f 1f 00 48 8b 83 30 01 00 00 48 8d 98 d0 fe ff ff 48 3d a8 e8 52 82 74 3a e8 25 db c3 ff 85 c0 74 09 80 3d bb 3d 68 00 00 74 40 <48> 8b 83 58 09 00 00 48 85 c0 74 cc 44 3b 60 10 76 c6 44 89 e6 
> [    0.011998] RIP  [<ffffffff814202c8>] update_netdev_tables+0x68/0xe0
> [    0.011998]  RSP <ffffffff81a01e68>
> [    0.011998] CR2: 0000000000000828
> [    0.012009] ---[ end trace a7919e7f17c0a725 ]---
> [    0.012601] Kernel panic - not syncing: Attempted to kill the idle task!
> 
> The following change simply statically initializes init_net.dev_base_head. I copied and pasted it into the email, so this rendering may not work, but I can send it if this approach looks reasonable. I have verified that it resolves the issue above.
> 
> diff --git a/net/core/dev.c b/net/core/dev.c
> index 0f28a9e..db1ba61 100644
> --- a/net/core/dev.c
> +++ b/net/core/dev.c
> @@ -6283,8 +6283,6 @@ static struct hlist_head *netdev_create_hash(void)
>  /* Initialize per network namespace state */
>  static int __net_init netdev_init(struct net *net)
>  {
> -       INIT_LIST_HEAD(&net->dev_base_head);
> -

	if (net != &init_net)
		INIT_LIST_HEAD(&net->dev_base_head);

>         net->dev_name_head = netdev_create_hash();
>         if (net->dev_name_head == NULL)
>                 goto err_name;
> diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c
> index dddbacb..42f1e1c 100644
> --- a/net/core/net_namespace.c
> +++ b/net/core/net_namespace.c
> @@ -27,7 +27,9 @@ static DEFINE_MUTEX(net_mutex);
>  LIST_HEAD(net_namespace_list);
>  EXPORT_SYMBOL_GPL(net_namespace_list);
>  
> -struct net init_net;
> +struct net init_net = {
> +       .dev_base_head = LIST_HEAD_INIT(init_net.dev_base_head),
> +};
>  EXPORT_SYMBOL(init_net);
>  
>  #define INITIAL_NET_GEN_PTRS   13 /* +1 for len +2 for rcu_head */
> 



^ permalink raw reply

* Re: r8169: link up, link down
From: Francois Romieu @ 2012-07-18 17:39 UTC (permalink / raw)
  To: J. Christopher Pereira; +Cc: netdev
In-Reply-To: <02b401cd64f9$115acc90$341065b0$@cl>

J. Christopher Pereira <kripper@imatronix.cl> :
[...]
> Is there any solution or workarround?

If it's an XID 98000000 - i.e. old new hardware - you may try to remove the
device then rescan the PCI bus through sysfs.

Building a modern kernel is strongly suggested if the hardware includes
a recent 816x chipset.

-- 
Ueimor

^ permalink raw reply

* [patch net-next] team: refine IFF_XMIT_DST_RELEASE capability
From: Jiri Pirko @ 2012-07-18 17:39 UTC (permalink / raw)
  To: netdev; +Cc: davem, edumazet

Cloned patch of Eric Dumazet for bonding.

Some workloads greatly benefit of IFF_XMIT_DST_RELEASE capability
on output net device, avoiding dirtying dst refcount.

team currently disables IFF_XMIT_DST_RELEASE unconditionally.

If all ports have the IFF_XMIT_DST_RELEASE bit set, then
team dev can also have it in its priv_flags.

Signed-off-by: Jiri Pirko <jiri@resnulli.us>
---
 drivers/net/team/team.c |    5 +++++
 1 file changed, 5 insertions(+)

diff --git a/drivers/net/team/team.c b/drivers/net/team/team.c
index 1a13470..813e131 100644
--- a/drivers/net/team/team.c
+++ b/drivers/net/team/team.c
@@ -733,12 +733,14 @@ static void __team_compute_features(struct team *team)
 	struct team_port *port;
 	u32 vlan_features = TEAM_VLAN_FEATURES;
 	unsigned short max_hard_header_len = ETH_HLEN;
+	unsigned int flags, dst_release_flag = IFF_XMIT_DST_RELEASE;
 
 	list_for_each_entry(port, &team->port_list, list) {
 		vlan_features = netdev_increment_features(vlan_features,
 					port->dev->vlan_features,
 					TEAM_VLAN_FEATURES);
 
+		dst_release_flag &= port->dev->priv_flags;
 		if (port->dev->hard_header_len > max_hard_header_len)
 			max_hard_header_len = port->dev->hard_header_len;
 	}
@@ -746,6 +748,9 @@ static void __team_compute_features(struct team *team)
 	team->dev->vlan_features = vlan_features;
 	team->dev->hard_header_len = max_hard_header_len;
 
+	flags = team->dev->priv_flags & ~IFF_XMIT_DST_RELEASE;
+	team->dev->priv_flags = flags | dst_release_flag;
+
 	netdev_change_features(team->dev);
 }
 
-- 
1.7.10.4

^ permalink raw reply related

* Re: [PATCH] SUNRPC: Prevent kernel stack corruption on long values of flush
From: J. Bruce Fields @ 2012-07-18 17:39 UTC (permalink / raw)
  To: Sasha Levin
  Cc: Trond.Myklebust, davem, davej, linux-nfs, netdev, linux-kernel
In-Reply-To: <1342476086-21638-1-git-send-email-levinsasha928@gmail.com>

On Tue, Jul 17, 2012 at 12:01:26AM +0200, Sasha Levin wrote:
> The buffer size in read_flush() is too small for the longest possible values
> for it. This can lead to a kernel stack corruption:

Thanks!

> 
> diff --git a/net/sunrpc/cache.c b/net/sunrpc/cache.c
> index 2afd2a8..f86d95e 100644
> --- a/net/sunrpc/cache.c
> +++ b/net/sunrpc/cache.c
> @@ -1409,11 +1409,11 @@ static ssize_t read_flush(struct file *file, char __user *buf,
>  			  size_t count, loff_t *ppos,
>  			  struct cache_detail *cd)
>  {
> -	char tbuf[20];
> +	char tbuf[22];

I wonder how common this sort of calculation is in the kernel?  It might
provide some peace of mind to be able to write this something like

	char tbuf[MAXLEN_BASE10_UL + 2]  /* + 2 for final "\n\0" */

--b.

>  	unsigned long p = *ppos;
>  	size_t len;
>  
> -	sprintf(tbuf, "%lu\n", convert_to_wallclock(cd->flush_time));
> +	snprintf(tbuf, sizeof(tbuf), "%lu\n", convert_to_wallclock(cd->flush_time));
>  	len = strlen(tbuf);
>  	if (p >= len)
>  		return 0;
> -- 
> 1.7.8.6
> 

^ permalink raw reply

* Re: That's pretty much it for 3.5.0
From: Rustad, Mark D @ 2012-07-18 17:36 UTC (permalink / raw)
  To: Neil Horman
  Cc: Fastabend, John R, <h@hmsreliant.think-freely.org>,
	David Miller, <netdev@vger.kernel.org>,
	<linux-wireless@vger.kernel.org>,
	<netfilter-devel@vger.kernel.org>
In-Reply-To: <20120718130430.GE25563@hmsreliant.think-freely.org>

On Jul 18, 2012, at 6:04 AM, Neil Horman wrote:

> John, can you post the backtrace you got for this?  I replied to the patch that
> you posted for this fix.  the cgroup subsystem has an early_init flag thats
> supposed to prevent the initialization of cgroups that don't need initialization
> until later (like via module_init() calls).

Here is the backtrace that I get and below a patch that fixes it:

[    0.010958] Initializing cgroup subsys net_prio
[    0.011040] BUG: unable to handle kernel NULL pointer dereference at 0000000000000828
[    0.011998] IP: [<ffffffff814202c8>] update_netdev_tables+0x68/0xe0
[    0.011998] PGD 0 
[    0.011998] Oops: 0000 [#1] SMP 
[    0.011998] CPU 0 
[    0.011998] Modules linked in:
[    0.011998] 
[    0.011998] Pid: 0, comm: swapper/0 Not tainted 3.5.0-rc7-mdrlinux+ #10 Bochs Bochs
[    0.011998] RIP: 0010:[<ffffffff814202c8>]  [<ffffffff814202c8>] update_netdev_tables+0x68/0xe0
[    0.011998] RSP: 0000:ffffffff81a01e68  EFLAGS: 00010246
[    0.011998] RAX: 0000000000000000 RBX: fffffffffffffed0 RCX: 0000000000000000
[    0.011998] RDX: 0000000000000006 RSI: 2222222222222222 RDI: 2222222222222222
[    0.011998] RBP: ffffffff81a01e88 R08: 2222222222222222 R09: 2222222222222222
[    0.011998] R10: 0000000000000000 R11: 0000000000000001 R12: 0000000000000001
[    0.011998] R13: 0000000000000000 R14: ffff88007ff608c0 R15: 00000000000143d0
[    0.011998] FS:  0000000000000000(0000) GS:ffff88007fc00000(0000) knlGS:0000000000000000
[    0.011998] CS:  0010 DS: 0000 ES: 0000 CR0: 000000008005003b
[    0.011998] CR2: 0000000000000828 CR3: 0000000001a0b000 CR4: 00000000000006b0
[    0.011998] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
[    0.011998] DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400
[    0.011998] Process swapper/0 (pid: 0, threadinfo ffffffff81a00000, task ffffffff81a13420)
[    0.011998] Stack:
[    0.011998]  ffffffff81a88020 0000000000000000 ffff88007d3a38f0 fffffffffffffff4
[    0.011998]  ffffffff81a01ec8 ffffffff814203cd ffff88007ff608c0 ffffffff817d8e9a
[    0.011998]  ffffffff81a88cd8 ffffffff81a88020 ffffffff81a88020 ffffffff81b010a0
[    0.011998] Call Trace:
[    0.011998]  [<ffffffff814203cd>] cgrp_create+0x8d/0xc0
[    0.011998]  [<ffffffff81ade14a>] cgroup_init_subsys+0x80/0x126
[    0.011998]  [<ffffffff81ade380>] cgroup_init+0x36/0x117
[    0.011998]  [<ffffffff81acab71>] start_kernel+0x32e/0x34f
[    0.011998]  [<ffffffff81aca6d5>] ? repair_env_string+0x5a/0x5a
[    0.011998]  [<ffffffff81aca346>] x86_64_start_reservations+0x101/0x105
[    0.011998]  [<ffffffff81aca120>] ? early_idt_handlers+0x120/0x120
[    0.011998]  [<ffffffff81aca417>] x86_64_start_kernel+0xcd/0xdc
[    0.011998] Code: 0f 1f 00 48 8b 83 30 01 00 00 48 8d 98 d0 fe ff ff 48 3d a8 e8 52 82 74 3a e8 25 db c3 ff 85 c0 74 09 80 3d bb 3d 68 00 00 74 40 <48> 8b 83 58 09 00 00 48 85 c0 74 cc 44 3b 60 10 76 c6 44 89 e6 
[    0.011998] RIP  [<ffffffff814202c8>] update_netdev_tables+0x68/0xe0
[    0.011998]  RSP <ffffffff81a01e68>
[    0.011998] CR2: 0000000000000828
[    0.012009] ---[ end trace a7919e7f17c0a725 ]---
[    0.012601] Kernel panic - not syncing: Attempted to kill the idle task!

The following change simply statically initializes init_net.dev_base_head. I copied and pasted it into the email, so this rendering may not work, but I can send it if this approach looks reasonable. I have verified that it resolves the issue above.

diff --git a/net/core/dev.c b/net/core/dev.c
index 0f28a9e..db1ba61 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -6283,8 +6283,6 @@ static struct hlist_head *netdev_create_hash(void)
 /* Initialize per network namespace state */
 static int __net_init netdev_init(struct net *net)
 {
-       INIT_LIST_HEAD(&net->dev_base_head);
-
        net->dev_name_head = netdev_create_hash();
        if (net->dev_name_head == NULL)
                goto err_name;
diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c
index dddbacb..42f1e1c 100644
--- a/net/core/net_namespace.c
+++ b/net/core/net_namespace.c
@@ -27,7 +27,9 @@ static DEFINE_MUTEX(net_mutex);
 LIST_HEAD(net_namespace_list);
 EXPORT_SYMBOL_GPL(net_namespace_list);
 
-struct net init_net;
+struct net init_net = {
+       .dev_base_head = LIST_HEAD_INIT(init_net.dev_base_head),
+};
 EXPORT_SYMBOL(init_net);
 
 #define INITIAL_NET_GEN_PTRS   13 /* +1 for len +2 for rcu_head */

-- 
Mark Rustad, LAN Access Division, Intel Corporation

^ permalink raw reply related

* Re: getsockopt/setsockopt with SO_RCVBUF and SO_SNDBUF "non-standard" behaviour
From: Rick Jones @ 2012-07-18 17:32 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: Eugen Dedu, linux-kernel@vger.kernel.org, netdev
In-Reply-To: <1342627875.2626.3070.camel@edumazet-glaptop>

On 07/18/2012 09:11 AM, Eric Dumazet wrote:
>
> That the way it's done on linux since day 0
>
> You can probably find a lot of pages on the web explaining the
> rationale.
>
> If your application handles UDP frames, what SO_RCVBUF should count ?
>
> If its the amount of payload bytes, you could have a pathological
> situation where an attacker sends 1-byte UDP frames fast enough and
> could consume a lot of kernel memory.
>
> Each frame consumes a fair amount of kernel memory (between 512 bytes
> and 8 Kbytes depending on the driver).
>
> So linux says : If user expect to receive  XXXX bytes, set a limit of
> _kernel_ memory used to store these bytes, and use an estimation of 100%
> of overhead. That is : allow 2*XXXX bytes to be allocated for socket
> receive buffers.

Expanding on/rewording that, in a setsockopt() call SO_RCVBUF specifies 
the data bytes and gets doubled to become the kernel/overhead byte 
limit.  Unless the doubling would be greater than net.core.rmem_max, in 
which case the limit becomes net.core.rmem_max.

But on getsockopt() SO_RCVBUF is always the kernel/overhead byte limit.

In one call it is fish.  In the other it is fowl.

Other stacks appear to keep their kernel/overhead limit quiet, keeping 
SO_RCVBUF an expression of a data limit in both setsockopt() and 
getsockopt().  With those stacks, there is I suppose the possible source 
of confusion when/if someone tests the queuing to a socket, sends "high 
overhead" packets and doesn't get to SO_RCVBUF worth of data though I 
don't recall encountering that in my "pre-linux" time.

The sometimes fish, sometimes fowl version (along with the auto tuning 
when one doesn't make setsockopt() calls) gave me fits in netperf for 
years until I finally relented and split the socket buffer size 
variables into three - what netperf's user requested via the command 
line, what it was right after the socket was created, and what it was at 
the end of the data phase of the test.

rick jones

^ permalink raw reply

* Re: [RFC PATCH] net: cgroup: null ptr dereference in netprio cgroup during init
From: Neil Horman @ 2012-07-18 17:14 UTC (permalink / raw)
  To: John Fastabend; +Cc: davem, gaofeng, mark.d.rustad, netdev, eric.dumazet
In-Reply-To: <5006D2E0.1070404@intel.com>

On Wed, Jul 18, 2012 at 08:14:40AM -0700, John Fastabend wrote:
> On 7/18/2012 7:21 AM, John Fastabend wrote:
> >On 7/18/2012 5:45 AM, Neil Horman wrote:
> >>On Tue, Jul 17, 2012 at 05:33:16PM -0700, John Fastabend wrote:
> >>>When the netprio cgroup is built in the kernel cgroup_init will call
> >>>cgrp_create which eventually calls update_netdev_tables. This is
> >>>being called before do_initcalls() so a null ptr dereference occurs
> >>>on init_net.
> >>>
> >>>This patch adds a check on init_net.count to verify the structure
> >>>has been initialized. The failure was introduced here,
> >>>
> >>>commit ef209f15980360f6945873df3cd710c5f62f2a3e
> >>>Author: Gao feng <gaofeng@cn.fujitsu.com>
> >>>Date:   Wed Jul 11 21:50:15 2012 +0000
> >>>
> >>>     net: cgroup: fix access the unallocated memory in netprio cgroup
> >>>
> >>>Tested with ping with netprio_cgroup as a module and built in.
> >>>
> >>>Marked RFC for now I think DaveM might have a reason why this needs
> >>>some improvement.
> >>>
> >>>Reported-by: Mark Rustad <mark.d.rustad@intel.com>
> >>>Cc: Neil Horman <nhorman@tuxdriver.com>
> >>>Cc: Eric Dumazet <edumazet@google.com>
> >>>Cc: Gao feng <gaofeng@cn.fujitsu.com>
> >>>Signed-off-by: John Fastabend <john.r.fastabend@intel.com>
> >>>---
> >>>
> >>>  net/core/netprio_cgroup.c |    3 +++
> >>>  1 files changed, 3 insertions(+), 0 deletions(-)
> >>>
> >>>diff --git a/net/core/netprio_cgroup.c b/net/core/netprio_cgroup.c
> >>>index b2e9caa..e9fd7fd 100644
> >>>--- a/net/core/netprio_cgroup.c
> >>>+++ b/net/core/netprio_cgroup.c
> >>>@@ -116,6 +116,9 @@ static int update_netdev_tables(void)
> >>>      u32 max_len;
> >>>      struct netprio_map *map;
> >>>
> >>>+    if (!atomic_read(&init_net.count))
> >>>+        return ret;
> >>>+
> >>>      rtnl_lock();
> >>>      max_len = atomic_read(&max_prioidx) + 1;
> >>>      for_each_netdev(&init_net, dev) {
> >>>
> >>>
> >>
> >>John, do you have a stack trace of this.  I'm having a hard time
> >>seeing how we
> >>get into this path prior to the network stack being initalized.
> >
> >Mark had a partial trace
> >
> >[    0.003455] Dentry cache hash table entries: 262144 (order: 9,
> >2097152 bytes)
> >[    0.005550] Inode-cache hash table entries: 131072 (order: 8, 1048576
> >bytes)
> >[    0.007165] Mount-cache hash table entries: 256
> >[    0.010289] Initializing cgroup subsys net_cls
> >[    0.010947] Initializing cgroup subsys net_prio
> >[    0.011039] BUG: unable to handle kernel NULL pointer dereference at
> >0000000000000828
> >[    0.011998] IP: [<ffffffff814202c8>] update_netdev_tables+0x68/0xe0
> >
> >
> >>
> >>It also brings up another point.  If this is happening, and we're
> >>creating the
> >>root cgroup from start_kernel, Then we're actually initalizing some
> >>cgroups
> >>twice, because a few cgroups register themselves via
> >>cgroup_load_subsys in
> >>module_init specified routines.  So if you're building netprio_cgroup or
> >>net_cls_cgroup as part of the monolithic kernel, you'll get
> >>cgroup_create called
> >>prior to your module_init() call.  Thats not good.
> >
> >Well your module_init() wouldn't be called in this case right? I think
> >netprio has a bug where we only register a netdevice notifier when
> >its built as a module.
> >
> >same issue with cls_cgroup and register_tcf_proto_ops?
> >
> 
> Neil, I was very unclear in the above. What I meant here was
> cgroup_load_subsys() checks ss->module so you should _not_
> get two create calls. And returns 0 so the register calls for
> netdev notifiers should get setup.
> 
> I missed the return 0 part and so I thought we might abort before
> this occurs but it looks ok to me on second glance.
> 

John, et al.

Just so we all have it, I've got the problem reproduced here, and it gives me
this backtrace:

 0.149924] Mount-cache hash table entries: 256
[    0.163754] Initializing cgroup subsys cpuacct
[    0.176991] Initializing cgroup subsys memory
[    0.190012] Initializing cgroup subsys devices
[    0.203249] Initializing cgroup subsys freezer
[    0.216484] Initializing cgroup subsys net_cls
[    0.229719] Initializing cgroup subsys blkio
[    0.242436] Initializing cgroup subsys perf_event
[    0.256451] Initializing cgroup subsys net_prio
[    0.269948] BUG: unable to handle kernel NULL pointer dereference at
0000000000000698
[    0.293303] IP: [<ffffffff81512e37>] cgrp_create+0x107/0x1c0
[    0.310175] PGD 0 
[    0.316157] Oops: 0000 [#1] SMP 
[    0.325775] CPU 0 
[    0.331227] Modules linked in:
[    0.340846] 
[    0.345264] Pid: 0, comm: swapper/0 Not tainted 3.5.0-rc7+ #1 AMD Dinar/Dinar
[    0.366555] RIP: 0010:[<ffffffff81512e37>]  [<ffffffff81512e37>]
cgrp_create+0x107/0x1c0
[    0.390681] RSP: 0000:ffffffff81c01ea8  EFLAGS: 00010213
[    0.406501] RAX: 0000000000000000 RBX: ffffffffffffff10 RCX: 0000000000000000
[    0.427764] RDX: 0000000000000000 RSI: 0000000000000246 RDI: ffffffff81c9d840
[    0.449026] RBP: ffffffff81c01ed8 R08: 00000000000164e0 R09: 0000000000000000
[    0.470289] R10: ffff8804278303c0 R11: 0000000000000000 R12: 0000000000000001
[    0.491553] R13: ffff8804278303c0 R14: ffff881036fd0700 R15: 0000000000000000
[    0.512819] FS:  0000000000000000(0000) GS:ffff880427c00000(0000)
knlGS:0000000000000000
[    0.536932] CS:  0010 DS: 0000 ES: 0000 CR0: 000000008005003b
[    0.554049] CR2: 0000000000000698 CR3: 0000000001c0b000 CR4: 00000000000406b0
[    0.575311] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
[    0.596574] DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400
[    0.617838] Process swapper/0 (pid: 0, threadinfo ffffffff81c00000, task
ffffffff81c13420)
[    0.642471] Stack:
[    0.648442]  ffffffff81c01eb8 ffffffff81c9f320 ffffffff81c9f320
ffffffff81c9f320
[    0.670522]  ffffffff81c9f320 ffffffff81d482c0 ffffffff81c01ef8
ffffffff81d10397
[    0.692604]  ffffffff81e99790 0000000000000048 ffffffff81c01f18
ffffffff81d1062e
[    0.714687] Call Trace:
[    0.721960]  [<ffffffff81d10397>] cgroup_init_subsys+0x51/0xdf
[    0.739337]  [<ffffffff81d1062e>] cgroup_init+0x36/0x119
[    0.755160]  [<ffffffff81cf5c02>] start_kernel+0x38f/0x3c4
[    0.771501]  [<ffffffff81cf5672>] ? repair_env_string+0x5e/0x5e
[    0.789138]  [<ffffffff81cf5356>] x86_64_start_reservations+0x131/0x135
[    0.808849]  [<ffffffff81cf545a>] x86_64_start_kernel+0x100/0x10f
[    0.827003] Code: 10 ff ff ff 75 25 e9 89 00 00 00 66 0f 1f 84 00 00 00 00 00
48 8b 93 f0 00 00 00 48 81 fa 38 39 f9 81 48 8d 9a 10 ff ff ff 74 69 <48> 8b 93
88 07 00 00 48 85 d2 74 dd 44 3b 62 10 76 d7 48 8d bb 
[    0.883860] RIP  [<ffffffff81512e37>] cgrp_create+0x107/0x1c0
[    0.900988]  RSP <ffffffff81c01ea8>
[    0.911366] CR2: 0000000000000698
[    0.921235] ---[ end trace a7919e7f17c0a725 ]---


So yes, it appears to me that we're calling cgrp_create from cgroup_init_subsys
prior to having the module_init routine called for netprio_cgroup.  It seems to
me that (given that we have a cgroup_early_init patch), we can move the
cgroup_init call until later in the boot process.  I'll spend the some time in
the next few weeks tinkering with that.
Best
Neil

^ permalink raw reply

* [PATCH 3/3] tilegx net: use eth_hw_addr_random(), not random_ether_addr()
From: Chris Metcalf @ 2012-07-18 16:53 UTC (permalink / raw)
  To: David S. Miller, netdev, linux-kernel
In-Reply-To: <201207181650.q6IGodZ7007565@lab-41.internal.tilera.com>

Signed-off-by: Chris Metcalf <cmetcalf@tilera.com>
---
 drivers/net/ethernet/tile/tilegx.c |    2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/tile/tilegx.c b/drivers/net/ethernet/tile/tilegx.c
index f78effc..4e2a162 100644
--- a/drivers/net/ethernet/tile/tilegx.c
+++ b/drivers/net/ethernet/tile/tilegx.c
@@ -1851,7 +1851,7 @@ static void tile_net_dev_init(const char *name, const uint8_t *mac)
 		memcpy(dev->dev_addr, mac, 6);
 		dev->addr_len = 6;
 	} else {
-		random_ether_addr(dev->dev_addr);
+		eth_hw_addr_random(dev);
 	}
 
 	/* Register the network device. */
-- 
1.7.10.3

^ permalink raw reply related

* [PATCH 2/3] tilegx net driver: handle payload data not in frags
From: Chris Metcalf @ 2012-07-18 16:52 UTC (permalink / raw)
  To: David S. Miller, netdev, linux-kernel
In-Reply-To: <201207181650.q6IGodZ7007565@lab-41.internal.tilera.com>

The original driver implementation assumed that for TSO, all the
payload data would be in the frags.  This isn't always true; change
the driver to support payload data at skb->data between
"skb_transport_offset(skb) + tcp_hdrlen(skb)" and "skb->hdr_len",
followed by the data in the frags.

Signed-off-by: Chris Metcalf <cmetcalf@tilera.com>
---
 drivers/net/ethernet/tile/tilegx.c |   36 ++++++++++++++++++------------------
 1 file changed, 18 insertions(+), 18 deletions(-)

diff --git a/drivers/net/ethernet/tile/tilegx.c b/drivers/net/ethernet/tile/tilegx.c
index c7bde28..f78effc 100644
--- a/drivers/net/ethernet/tile/tilegx.c
+++ b/drivers/net/ethernet/tile/tilegx.c
@@ -1333,11 +1333,12 @@ static s64 tile_net_equeue_try_reserve(struct net_device *dev,
 static int tso_count_edescs(struct sk_buff *skb)
 {
 	struct skb_shared_info *sh = skb_shinfo(skb);
-	unsigned int data_len = skb->data_len;
+	unsigned int sh_len = skb_transport_offset(skb) + tcp_hdrlen(skb);
+	unsigned int data_len = skb->data_len + skb->hdr_len - sh_len;
 	unsigned int p_len = sh->gso_size;
 	long f_id = -1;    /* id of the current fragment */
-	long f_size = -1;  /* size of the current fragment */
-	long f_used = -1;  /* bytes used from the current fragment */
+	long f_size = skb->hdr_len;  /* size of the current fragment */
+	long f_used = sh_len;  /* bytes used from the current fragment */
 	long n;            /* size of the current piece of payload */
 	int num_edescs = 0;
 	int segment;
@@ -1382,13 +1383,14 @@ static void tso_headers_prepare(struct sk_buff *skb, unsigned char *headers,
 	struct skb_shared_info *sh = skb_shinfo(skb);
 	struct iphdr *ih;
 	struct tcphdr *th;
-	unsigned int data_len = skb->data_len;
+	unsigned int sh_len = skb_transport_offset(skb) + tcp_hdrlen(skb);
+	unsigned int data_len = skb->data_len + skb->hdr_len - sh_len;
 	unsigned char *data = skb->data;
-	unsigned int ih_off, th_off, sh_len, p_len;
+	unsigned int ih_off, th_off, p_len;
 	unsigned int isum_seed, tsum_seed, id, seq;
 	long f_id = -1;    /* id of the current fragment */
-	long f_size = -1;  /* size of the current fragment */
-	long f_used = -1;  /* bytes used from the current fragment */
+	long f_size = skb->hdr_len;  /* size of the current fragment */
+	long f_used = sh_len;  /* bytes used from the current fragment */
 	long n;            /* size of the current piece of payload */
 	int segment;
 
@@ -1397,14 +1399,13 @@ static void tso_headers_prepare(struct sk_buff *skb, unsigned char *headers,
 	th = tcp_hdr(skb);
 	ih_off = skb_network_offset(skb);
 	th_off = skb_transport_offset(skb);
-	sh_len = th_off + tcp_hdrlen(skb);
 	p_len = sh->gso_size;
 
 	/* Set up seed values for IP and TCP csum and initialize id and seq. */
 	isum_seed = ((0xFFFF - ih->check) +
 		     (0xFFFF - ih->tot_len) +
 		     (0xFFFF - ih->id));
-	tsum_seed = th->check + (0xFFFF ^ htons(skb->len));
+	tsum_seed = th->check + (0xFFFF ^ htons(sh_len + data_len));
 	id = ntohs(ih->id);
 	seq = ntohl(th->seq);
 
@@ -1476,21 +1477,22 @@ static void tso_egress(struct net_device *dev, gxio_mpipe_equeue_t *equeue,
 {
 	struct tile_net_priv *priv = netdev_priv(dev);
 	struct skb_shared_info *sh = skb_shinfo(skb);
-	unsigned int data_len = skb->data_len;
+	unsigned int sh_len = skb_transport_offset(skb) + tcp_hdrlen(skb);
+	unsigned int data_len = skb->data_len + skb->hdr_len - sh_len;
 	unsigned int p_len = sh->gso_size;
 	gxio_mpipe_edesc_t edesc_head = { { 0 } };
 	gxio_mpipe_edesc_t edesc_body = { { 0 } };
 	long f_id = -1;    /* id of the current fragment */
-	long f_size = -1;  /* size of the current fragment */
-	long f_used = -1;  /* bytes used from the current fragment */
+	long f_size = skb->hdr_len;  /* size of the current fragment */
+	long f_used = sh_len;  /* bytes used from the current fragment */
+	void *f_data = skb->data;
 	long n;            /* size of the current piece of payload */
 	unsigned long tx_packets = 0, tx_bytes = 0;
-	unsigned int csum_start, sh_len;
+	unsigned int csum_start;
 	int segment;
 
 	/* Prepare to egress the headers: set up header edesc. */
 	csum_start = skb_checksum_start_offset(skb);
-	sh_len = skb_transport_offset(skb) + tcp_hdrlen(skb);
 	edesc_head.csum = 1;
 	edesc_head.csum_start = csum_start;
 	edesc_head.csum_dest = csum_start + skb->csum_offset;
@@ -1502,7 +1504,6 @@ static void tso_egress(struct net_device *dev, gxio_mpipe_equeue_t *equeue,
 
 	/* Egress all the edescs. */
 	for (segment = 0; segment < sh->gso_segs; segment++) {
-		void *va;
 		unsigned char *buf;
 		unsigned int p_used = 0;
 
@@ -1521,10 +1522,9 @@ static void tso_egress(struct net_device *dev, gxio_mpipe_equeue_t *equeue,
 				f_id++;
 				f_size = sh->frags[f_id].size;
 				f_used = 0;
+				f_data = tile_net_frag_buf(&sh->frags[f_id]);
 			}
 
-			va = tile_net_frag_buf(&sh->frags[f_id]) + f_used;
-
 			/* Use bytes from the current fragment. */
 			n = p_len - p_used;
 			if (n > f_size - f_used)
@@ -1533,7 +1533,7 @@ static void tso_egress(struct net_device *dev, gxio_mpipe_equeue_t *equeue,
 			p_used += n;
 
 			/* Egress a piece of the payload. */
-			edesc_body.va = va_to_tile_io_addr(va);
+			edesc_body.va = va_to_tile_io_addr(f_data) + f_used;
 			edesc_body.xfer_size = n;
 			edesc_body.bound = !(p_used < p_len);
 			gxio_mpipe_equeue_put_at(equeue, edesc_body, slot);
-- 
1.7.10.3

^ permalink raw reply related

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox