Netdev List
 help / color / mirror / Atom feed
* [PATCH 14/15] ipv4: Turn rt->rt_route_iif into rt->rt_is_input.
From: David Miller @ 2012-07-18 18:24 UTC (permalink / raw)
  To: netdev


That is this value's only use, as a boolean to indicate whether
a route is an input route or not.

So implement it that way, using a u16 gap present in the struct
already.

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/route.h     |    6 +++---
 net/ipv4/route.c        |   10 +++++-----
 net/ipv4/xfrm4_policy.c |    2 +-
 3 files changed, 9 insertions(+), 9 deletions(-)

diff --git a/include/net/route.h b/include/net/route.h
index ee3bf84..f3ef18a 100644
--- a/include/net/route.h
+++ b/include/net/route.h
@@ -47,8 +47,8 @@ struct rtable {
 	int			rt_genid;
 	unsigned int		rt_flags;
 	__u16			rt_type;
+	__u16			rt_is_input;
 
-	int			rt_route_iif;
 	int			rt_iif;
 
 	/* Info on neighbour */
@@ -61,12 +61,12 @@ struct rtable {
 
 static inline bool rt_is_input_route(const struct rtable *rt)
 {
-	return rt->rt_route_iif != 0;
+	return rt->rt_is_input != 0;
 }
 
 static inline bool rt_is_output_route(const struct rtable *rt)
 {
-	return rt->rt_route_iif == 0;
+	return rt->rt_is_input == 0;
 }
 
 struct ip_rt_acct {
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 4da374c..ee35047 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -1296,7 +1296,7 @@ static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
 	rth->rt_genid	= rt_genid(dev_net(dev));
 	rth->rt_flags	= RTCF_MULTICAST;
 	rth->rt_type	= RTN_MULTICAST;
-	rth->rt_route_iif = dev->ifindex;
+	rth->rt_is_input= 1;
 	rth->rt_iif	= dev->ifindex;
 	rth->rt_pmtu	= 0;
 	rth->rt_gateway	= 0;
@@ -1426,7 +1426,7 @@ static int __mkroute_input(struct sk_buff *skb,
 	rth->rt_genid = rt_genid(dev_net(rth->dst.dev));
 	rth->rt_flags = flags;
 	rth->rt_type = res->type;
-	rth->rt_route_iif = in_dev->dev->ifindex;
+	rth->rt_is_input = 1;
 	rth->rt_iif 	= in_dev->dev->ifindex;
 	rth->rt_pmtu	= 0;
 	rth->rt_gateway	= 0;
@@ -1604,7 +1604,7 @@ local_input:
 	rth->rt_genid = rt_genid(net);
 	rth->rt_flags 	= flags|RTCF_LOCAL;
 	rth->rt_type	= res.type;
-	rth->rt_route_iif = dev->ifindex;
+	rth->rt_is_input = 1;
 	rth->rt_iif	= dev->ifindex;
 	rth->rt_pmtu	= 0;
 	rth->rt_gateway	= 0;
@@ -1769,7 +1769,7 @@ static struct rtable *__mkroute_output(const struct fib_result *res,
 	rth->rt_genid = rt_genid(dev_net(dev_out));
 	rth->rt_flags	= flags;
 	rth->rt_type	= type;
-	rth->rt_route_iif = 0;
+	rth->rt_is_input = 0;
 	rth->rt_iif	= orig_oif ? : dev_out->ifindex;
 	rth->rt_pmtu	= 0;
 	rth->rt_gateway = 0;
@@ -2044,7 +2044,7 @@ struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_or
 		if (new->dev)
 			dev_hold(new->dev);
 
-		rt->rt_route_iif = ort->rt_route_iif;
+		rt->rt_is_input = ort->rt_is_input;
 		rt->rt_iif = ort->rt_iif;
 		rt->rt_pmtu = ort->rt_pmtu;
 
diff --git a/net/ipv4/xfrm4_policy.c b/net/ipv4/xfrm4_policy.c
index 3c99b4c..c628184 100644
--- a/net/ipv4/xfrm4_policy.c
+++ b/net/ipv4/xfrm4_policy.c
@@ -79,7 +79,6 @@ static int xfrm4_fill_dst(struct xfrm_dst *xdst, struct net_device *dev,
 	struct rtable *rt = (struct rtable *)xdst->route;
 	const struct flowi4 *fl4 = &fl->u.ip4;
 
-	xdst->u.rt.rt_route_iif = fl4->flowi4_iif;
 	xdst->u.rt.rt_iif = fl4->flowi4_iif;
 
 	xdst->u.dst.dev = dev;
@@ -87,6 +86,7 @@ static int xfrm4_fill_dst(struct xfrm_dst *xdst, struct net_device *dev,
 
 	/* Sheit... I remember I did this right. Apparently,
 	 * it was magically lost, so this code needs audit */
+	xdst->u.rt.rt_is_input = rt->rt_is_input;
 	xdst->u.rt.rt_flags = rt->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST |
 					      RTCF_LOCAL);
 	xdst->u.rt.rt_type = rt->rt_type;
-- 
1.7.10.4

^ permalink raw reply related

* [PATCH 13/15] ipv4: Kill rt->rt_oif
From: David Miller @ 2012-07-18 18:24 UTC (permalink / raw)
  To: netdev


Never actually used.

It was being set on output routes to the original OIF specified in the
flow key used for the lookup.

But the only user was in ipmr_rt_fib_lookup() which always runs on an
input route.

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/route.h     |    1 -
 net/ipv4/ipmr.c         |    1 -
 net/ipv4/route.c        |    5 -----
 net/ipv4/xfrm4_policy.c |    1 -
 4 files changed, 8 deletions(-)

diff --git a/include/net/route.h b/include/net/route.h
index 6d111bc..ee3bf84 100644
--- a/include/net/route.h
+++ b/include/net/route.h
@@ -50,7 +50,6 @@ struct rtable {
 
 	int			rt_route_iif;
 	int			rt_iif;
-	int			rt_oif;
 
 	/* Info on neighbour */
 	__be32			rt_gateway;
diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c
index eee3bf6..fa75f73 100644
--- a/net/ipv4/ipmr.c
+++ b/net/ipv4/ipmr.c
@@ -1795,7 +1795,6 @@ static struct mr_table *ipmr_rt_fib_lookup(struct net *net, struct sk_buff *skb)
 		.daddr = iph->daddr,
 		.saddr = iph->saddr,
 		.flowi4_tos = RT_TOS(iph->tos),
-		.flowi4_oif = rt->rt_oif,
 		.flowi4_iif = rt->rt_iif,
 		.flowi4_mark = skb->mark,
 	};
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 2190fc4..4da374c 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -1298,7 +1298,6 @@ static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
 	rth->rt_type	= RTN_MULTICAST;
 	rth->rt_route_iif = dev->ifindex;
 	rth->rt_iif	= dev->ifindex;
-	rth->rt_oif	= 0;
 	rth->rt_pmtu	= 0;
 	rth->rt_gateway	= 0;
 	rth->fi = NULL;
@@ -1429,7 +1428,6 @@ static int __mkroute_input(struct sk_buff *skb,
 	rth->rt_type = res->type;
 	rth->rt_route_iif = in_dev->dev->ifindex;
 	rth->rt_iif 	= in_dev->dev->ifindex;
-	rth->rt_oif 	= 0;
 	rth->rt_pmtu	= 0;
 	rth->rt_gateway	= 0;
 	rth->fi = NULL;
@@ -1608,7 +1606,6 @@ local_input:
 	rth->rt_type	= res.type;
 	rth->rt_route_iif = dev->ifindex;
 	rth->rt_iif	= dev->ifindex;
-	rth->rt_oif	= 0;
 	rth->rt_pmtu	= 0;
 	rth->rt_gateway	= 0;
 	rth->fi = NULL;
@@ -1774,7 +1771,6 @@ static struct rtable *__mkroute_output(const struct fib_result *res,
 	rth->rt_type	= type;
 	rth->rt_route_iif = 0;
 	rth->rt_iif	= orig_oif ? : dev_out->ifindex;
-	rth->rt_oif	= orig_oif;
 	rth->rt_pmtu	= 0;
 	rth->rt_gateway = 0;
 	rth->fi = NULL;
@@ -2050,7 +2046,6 @@ struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_or
 
 		rt->rt_route_iif = ort->rt_route_iif;
 		rt->rt_iif = ort->rt_iif;
-		rt->rt_oif = ort->rt_oif;
 		rt->rt_pmtu = ort->rt_pmtu;
 
 		rt->rt_genid = rt_genid(net);
diff --git a/net/ipv4/xfrm4_policy.c b/net/ipv4/xfrm4_policy.c
index 6074b69..3c99b4c 100644
--- a/net/ipv4/xfrm4_policy.c
+++ b/net/ipv4/xfrm4_policy.c
@@ -81,7 +81,6 @@ static int xfrm4_fill_dst(struct xfrm_dst *xdst, struct net_device *dev,
 
 	xdst->u.rt.rt_route_iif = fl4->flowi4_iif;
 	xdst->u.rt.rt_iif = fl4->flowi4_iif;
-	xdst->u.rt.rt_oif = fl4->flowi4_oif;
 
 	xdst->u.dst.dev = dev;
 	dev_hold(dev);
-- 
1.7.10.4

^ permalink raw reply related

* [PATCH 12/15] ipv4: Dirty less cache lines in route caching paths.
From: David Miller @ 2012-07-18 18:24 UTC (permalink / raw)
  To: netdev


Don't bother incrementing dst->__use and setting dst->lastuse,
they are completely pointless and just slow things down.

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/route.c |    6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index d4a3c6e..2190fc4 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -1409,7 +1409,7 @@ static int __mkroute_input(struct sk_buff *skb,
 		if (!(flags & RTCF_DIRECTSRC) && !itag) {
 			rth = FIB_RES_NH(*res).nh_rth_input;
 			if (rth) {
-				dst_use(&rth->dst, jiffies);
+				dst_hold(&rth->dst);
 				goto out;
 			}
 			do_cache = true;
@@ -1585,7 +1585,7 @@ local_input:
 		if (!(flags & RTCF_DIRECTSRC) && !itag) {
 			rth = FIB_RES_NH(res).nh_rth_input;
 			if (rth) {
-				dst_use(&rth->dst, jiffies);
+				dst_hold(&rth->dst);
 				goto set_and_out;
 			}
 			do_cache = true;
@@ -1755,7 +1755,7 @@ static struct rtable *__mkroute_output(const struct fib_result *res,
 		if (!fnhe) {
 			rth = FIB_RES_NH(*res).nh_rth_output;
 			if (rth) {
-				dst_use(&rth->dst, jiffies);
+				dst_hold(&rth->dst);
 				return rth;
 			}
 		}
-- 
1.7.10.4

^ permalink raw reply related

* [PATCH 11/15] ipv4: Kill FLOWI_FLAG_RT_NOCACHE and associated code.
From: David Miller @ 2012-07-18 18:24 UTC (permalink / raw)
  To: netdev


Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/flow.h                 |    1 -
 include/net/inet_connection_sock.h |    3 +--
 net/dccp/ipv4.c                    |    2 +-
 net/ipv4/inet_connection_sock.c    |    5 +----
 net/ipv4/route.c                   |    3 ---
 net/ipv4/tcp_ipv4.c                |    4 ++--
 6 files changed, 5 insertions(+), 13 deletions(-)

diff --git a/include/net/flow.h b/include/net/flow.h
index ce9cb76..e1dd508 100644
--- a/include/net/flow.h
+++ b/include/net/flow.h
@@ -21,7 +21,6 @@ struct flowi_common {
 	__u8	flowic_flags;
 #define FLOWI_FLAG_ANYSRC		0x01
 #define FLOWI_FLAG_CAN_SLEEP		0x02
-#define FLOWI_FLAG_RT_NOCACHE		0x04
 	__u32	flowic_secid;
 };
 
diff --git a/include/net/inet_connection_sock.h b/include/net/inet_connection_sock.h
index 2cf44b4..5ee66f5 100644
--- a/include/net/inet_connection_sock.h
+++ b/include/net/inet_connection_sock.h
@@ -250,8 +250,7 @@ extern int inet_csk_get_port(struct sock *sk, unsigned short snum);
 
 extern struct dst_entry* inet_csk_route_req(struct sock *sk,
 					    struct flowi4 *fl4,
-					    const struct request_sock *req,
-					    bool nocache);
+					    const struct request_sock *req);
 extern struct dst_entry* inet_csk_route_child_sock(struct sock *sk,
 						   struct sock *newsk,
 						   const struct request_sock *req);
diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c
index ab4f44c..25428d0 100644
--- a/net/dccp/ipv4.c
+++ b/net/dccp/ipv4.c
@@ -508,7 +508,7 @@ static int dccp_v4_send_response(struct sock *sk, struct request_sock *req,
 	struct dst_entry *dst;
 	struct flowi4 fl4;
 
-	dst = inet_csk_route_req(sk, &fl4, req, false);
+	dst = inet_csk_route_req(sk, &fl4, req);
 	if (dst == NULL)
 		goto out;
 
diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
index 0a290d7..db0cf17 100644
--- a/net/ipv4/inet_connection_sock.c
+++ b/net/ipv4/inet_connection_sock.c
@@ -368,8 +368,7 @@ EXPORT_SYMBOL(inet_csk_reset_keepalive_timer);
 
 struct dst_entry *inet_csk_route_req(struct sock *sk,
 				     struct flowi4 *fl4,
-				     const struct request_sock *req,
-				     bool nocache)
+				     const struct request_sock *req)
 {
 	struct rtable *rt;
 	const struct inet_request_sock *ireq = inet_rsk(req);
@@ -377,8 +376,6 @@ struct dst_entry *inet_csk_route_req(struct sock *sk,
 	struct net *net = sock_net(sk);
 	int flags = inet_sk_flowi_flags(sk);
 
-	if (nocache)
-		flags |= FLOWI_FLAG_RT_NOCACHE;
 	flowi4_init_output(fl4, sk->sk_bound_dev_if, sk->sk_mark,
 			   RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE,
 			   sk->sk_protocol,
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index b2f5c33..d4a3c6e 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -1802,9 +1802,6 @@ static struct rtable *__mkroute_output(const struct fib_result *res,
 
 	rt_set_nexthop(rth, res, fnhe, fi, type, 0);
 
-	if (fl4->flowi4_flags & FLOWI_FLAG_RT_NOCACHE)
-		rth->dst.flags |= DST_NOCACHE;
-
 	return rth;
 }
 
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index d9caf5c..de6969a 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -824,7 +824,7 @@ static int tcp_v4_send_synack(struct sock *sk, struct dst_entry *dst,
 	struct sk_buff * skb;
 
 	/* First, grab a route. */
-	if (!dst && (dst = inet_csk_route_req(sk, &fl4, req, nocache)) == NULL)
+	if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
 		return -1;
 
 	skb = tcp_make_synack(sk, dst, req, rvp);
@@ -1378,7 +1378,7 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
 		 */
 		if (tmp_opt.saw_tstamp &&
 		    tcp_death_row.sysctl_tw_recycle &&
-		    (dst = inet_csk_route_req(sk, &fl4, req, want_cookie)) != NULL &&
+		    (dst = inet_csk_route_req(sk, &fl4, req)) != NULL &&
 		    fl4.daddr == saddr) {
 			if (!tcp_peer_is_proven(req, dst, true)) {
 				NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_PAWSPASSIVEREJECTED);
-- 
1.7.10.4

^ permalink raw reply related

* [PATCH 10/15] ipv4: Cache input routes in fib_info nexthops.
From: David Miller @ 2012-07-18 18:24 UTC (permalink / raw)
  To: netdev


Caching input routes is slightly simpler than output routes, since we
don't need to be concerned with nexthop exceptions.  (locally
destined, and routed packets, never trigger PMTU events or redirects
that will be processed by us).

However, we have to elide caching for the DIRECTSRC and non-zero itag
cases.

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/ip_fib.h     |    1 +
 net/ipv4/fib_semantics.c |    2 ++
 net/ipv4/route.c         |   47 ++++++++++++++++++++++++++++++++++++----------
 3 files changed, 40 insertions(+), 10 deletions(-)

diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h
index 23c9f9e..b64a19c 100644
--- a/include/net/ip_fib.h
+++ b/include/net/ip_fib.h
@@ -82,6 +82,7 @@ struct fib_nh {
 	__be32			nh_saddr;
 	int			nh_saddr_genid;
 	struct rtable		*nh_rth_output;
+	struct rtable		*nh_rth_input;
 	struct fnhe_hash_bucket	*nh_exceptions;
 };
 
diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c
index 83d0f42..e55171f 100644
--- a/net/ipv4/fib_semantics.c
+++ b/net/ipv4/fib_semantics.c
@@ -173,6 +173,8 @@ static void free_fib_info_rcu(struct rcu_head *head)
 			free_nh_exceptions(nexthop_nh);
 		if (nexthop_nh->nh_rth_output)
 			dst_release(&nexthop_nh->nh_rth_output->dst);
+		if (nexthop_nh->nh_rth_input)
+			dst_release(&nexthop_nh->nh_rth_input->dst);
 	} endfor_nexthops(fi);
 
 	release_net(fi->fib_net);
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 2e66b9a..b2f5c33 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -1202,6 +1202,9 @@ static void rt_cache_route(struct fib_nh *nh, struct rtable *rt)
 	static DEFINE_SPINLOCK(fib_cache_lock);
 	struct rtable **p = &nh->nh_rth_output;
 
+	if (rt_is_input_route(rt))
+		p = &nh->nh_rth_input;
+
 	if (*p)
 		return;
 
@@ -1228,8 +1231,7 @@ static void rt_set_nexthop(struct rtable *rt, const struct fib_result *res,
 #ifdef CONFIG_IP_ROUTE_CLASSID
 		rt->dst.tclassid = nh->nh_tclassid;
 #endif
-		if (!(rt->dst.flags & DST_HOST) &&
-		    rt_is_output_route(rt))
+		if (!(rt->dst.flags & DST_HOST))
 			rt_cache_route(nh, rt);
 	}
 
@@ -1355,11 +1357,11 @@ static int __mkroute_input(struct sk_buff *skb,
 			   __be32 daddr, __be32 saddr, u32 tos,
 			   struct rtable **result)
 {
-	struct fib_nh_exception *fnhe;
 	struct rtable *rth;
 	int err;
 	struct in_device *out_dev;
 	unsigned int flags = 0;
+	bool do_cache;
 	u32 itag;
 
 	/* get a working reference to the output device */
@@ -1402,13 +1404,21 @@ static int __mkroute_input(struct sk_buff *skb,
 		}
 	}
 
-	fnhe = NULL;
-	if (res->fi)
-		fnhe = find_exception(&FIB_RES_NH(*res), daddr);
+	do_cache = false;
+	if (res->fi) {
+		if (!(flags & RTCF_DIRECTSRC) && !itag) {
+			rth = FIB_RES_NH(*res).nh_rth_input;
+			if (rth) {
+				dst_use(&rth->dst, jiffies);
+				goto out;
+			}
+			do_cache = true;
+		}
+	}
 
 	rth = rt_dst_alloc(out_dev->dev,
 			   IN_DEV_CONF_GET(in_dev, NOPOLICY),
-			   IN_DEV_CONF_GET(out_dev, NOXFRM), false);
+			   IN_DEV_CONF_GET(out_dev, NOXFRM), do_cache);
 	if (!rth) {
 		err = -ENOBUFS;
 		goto cleanup;
@@ -1427,8 +1437,8 @@ static int __mkroute_input(struct sk_buff *skb,
 	rth->dst.input = ip_forward;
 	rth->dst.output = ip_output;
 
-	rt_set_nexthop(rth, res, fnhe, res->fi, res->type, itag);
-
+	rt_set_nexthop(rth, res, NULL, res->fi, res->type, itag);
+out:
 	*result = rth;
 	err = 0;
  cleanup:
@@ -1480,6 +1490,7 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
 	struct rtable	*rth;
 	int		err = -EINVAL;
 	struct net    *net = dev_net(dev);
+	bool do_cache;
 
 	/* IP on this device is disabled. */
 
@@ -1493,6 +1504,7 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
 	if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr))
 		goto martian_source;
 
+	res.fi = NULL;
 	if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0))
 		goto brd_input;
 
@@ -1568,8 +1580,20 @@ brd_input:
 	RT_CACHE_STAT_INC(in_brd);
 
 local_input:
+	do_cache = false;
+	if (res.fi) {
+		if (!(flags & RTCF_DIRECTSRC) && !itag) {
+			rth = FIB_RES_NH(res).nh_rth_input;
+			if (rth) {
+				dst_use(&rth->dst, jiffies);
+				goto set_and_out;
+			}
+			do_cache = true;
+		}
+	}
+
 	rth = rt_dst_alloc(net->loopback_dev,
-			   IN_DEV_CONF_GET(in_dev, NOPOLICY), false, false);
+			   IN_DEV_CONF_GET(in_dev, NOPOLICY), false, do_cache);
 	if (!rth)
 		goto e_nobufs;
 
@@ -1593,6 +1617,9 @@ local_input:
 		rth->dst.error= -err;
 		rth->rt_flags 	&= ~RTCF_LOCAL;
 	}
+	if (do_cache)
+		rt_cache_route(&FIB_RES_NH(res), rth);
+set_and_out:
 	skb_dst_set(skb, &rth->dst);
 	err = 0;
 	goto out;
-- 
1.7.10.4

^ permalink raw reply related

* [PATCH 09/15] ipv4: Cache output routes in fib_info nexthops.
From: David Miller @ 2012-07-18 18:24 UTC (permalink / raw)
  To: netdev


If we have an output route that lacks nexthop exceptions, we can cache
it in the FIB info nexthop.

Such routes will have DST_HOST cleared because such routes refer to a
family of destinations, rather than just one.

The sequence of the handling of exceptions during route lookup is
adjusted to make the logic work properly.

Before we allocate the route, we lookup the exception.

Then we know if we will cache this route or not, and therefore whether
DST_HOST should be set on the allocated route.

Then we use DST_HOST to key off whether we should store the resulting
route, during rt_set_nexthop(), in the FIB nexthop cache.

To counter adding a new argument to rt_set_nexthop() I'm removing the
'fl4' arg to rt_set_nexthop() and rt_init_metrics(), which is no
longer used.

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/ip_fib.h     |    2 +
 net/ipv4/fib_semantics.c |    2 +
 net/ipv4/route.c         |  109 ++++++++++++++++++++++++++++++++--------------
 3 files changed, 80 insertions(+), 33 deletions(-)

diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h
index e9ee1ca..23c9f9e 100644
--- a/include/net/ip_fib.h
+++ b/include/net/ip_fib.h
@@ -46,6 +46,7 @@ struct fib_config {
  };
 
 struct fib_info;
+struct rtable;
 
 struct fib_nh_exception {
 	struct fib_nh_exception __rcu	*fnhe_next;
@@ -80,6 +81,7 @@ struct fib_nh {
 	__be32			nh_gw;
 	__be32			nh_saddr;
 	int			nh_saddr_genid;
+	struct rtable		*nh_rth_output;
 	struct fnhe_hash_bucket	*nh_exceptions;
 };
 
diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c
index 2b57d76..83d0f42 100644
--- a/net/ipv4/fib_semantics.c
+++ b/net/ipv4/fib_semantics.c
@@ -171,6 +171,8 @@ static void free_fib_info_rcu(struct rcu_head *head)
 			dev_put(nexthop_nh->nh_dev);
 		if (nexthop_nh->nh_exceptions)
 			free_nh_exceptions(nexthop_nh);
+		if (nexthop_nh->nh_rth_output)
+			dst_release(&nexthop_nh->nh_rth_output->dst);
 	} endfor_nexthops(fi);
 
 	release_net(fi->fib_net);
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 4d170a1..2e66b9a 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -1151,8 +1151,7 @@ static unsigned int ipv4_mtu(const struct dst_entry *dst)
 	return mtu;
 }
 
-static void rt_init_metrics(struct rtable *rt, const struct flowi4 *fl4,
-			    struct fib_info *fi)
+static void rt_init_metrics(struct rtable *rt, struct fib_info *fi)
 {
 	if (fi->fib_metrics != (u32 *) dst_default_metrics) {
 		rt->fi = fi;
@@ -1161,38 +1160,61 @@ static void rt_init_metrics(struct rtable *rt, const struct flowi4 *fl4,
 	dst_init_metrics(&rt->dst, fi->fib_metrics, true);
 }
 
-static void rt_bind_exception(struct rtable *rt, struct fib_nh *nh, __be32 daddr)
+static struct fib_nh_exception *find_exception(struct fib_nh *nh, __be32 daddr)
 {
 	struct fnhe_hash_bucket *hash = nh->nh_exceptions;
 	struct fib_nh_exception *fnhe;
 	u32 hval;
 
+	if (!hash)
+		return NULL;
+
 	hval = fnhe_hashfun(daddr);
 
 	for (fnhe = rcu_dereference(hash[hval].chain); fnhe;
 	     fnhe = rcu_dereference(fnhe->fnhe_next)) {
-		if (fnhe->fnhe_daddr == daddr) {
-			if (fnhe->fnhe_pmtu) {
-				unsigned long expires = fnhe->fnhe_expires;
-				unsigned long diff = jiffies - expires;
-
-				if (time_before(jiffies, expires)) {
-					rt->rt_pmtu = fnhe->fnhe_pmtu;
-					dst_set_expires(&rt->dst, diff);
-				}
-			}
-			if (fnhe->fnhe_gw) {
-				rt->rt_flags |= RTCF_REDIRECTED;
-				rt->rt_gateway = fnhe->fnhe_gw;
-			}
-			fnhe->fnhe_stamp = jiffies;
-			break;
+		if (fnhe->fnhe_daddr == daddr)
+			return fnhe;
+	}
+	return NULL;
+}
+
+static void rt_bind_exception(struct rtable *rt, struct fib_nh_exception *fnhe)
+{
+	if (fnhe->fnhe_pmtu) {
+		unsigned long expires = fnhe->fnhe_expires;
+		unsigned long diff = jiffies - expires;
+
+		if (time_before(jiffies, expires)) {
+			rt->rt_pmtu = fnhe->fnhe_pmtu;
+			dst_set_expires(&rt->dst, diff);
 		}
 	}
+	if (fnhe->fnhe_gw) {
+		rt->rt_flags |= RTCF_REDIRECTED;
+		rt->rt_gateway = fnhe->fnhe_gw;
+	}
+	fnhe->fnhe_stamp = jiffies;
 }
 
-static void rt_set_nexthop(struct rtable *rt, const struct flowi4 *fl4,
-			   const struct fib_result *res,
+static void rt_cache_route(struct fib_nh *nh, struct rtable *rt)
+{
+	static DEFINE_SPINLOCK(fib_cache_lock);
+	struct rtable **p = &nh->nh_rth_output;
+
+	if (*p)
+		return;
+
+	spin_lock_bh(&fib_cache_lock);
+	if (!*p) {
+		*p = rt;
+		dst_clone(&rt->dst);
+	}
+	spin_unlock_bh(&fib_cache_lock);
+}
+
+static void rt_set_nexthop(struct rtable *rt, const struct fib_result *res,
+			   struct fib_nh_exception *fnhe,
 			   struct fib_info *fi, u16 type, u32 itag)
 {
 	if (fi) {
@@ -1200,12 +1222,15 @@ static void rt_set_nexthop(struct rtable *rt, const struct flowi4 *fl4,
 
 		if (nh->nh_gw && nh->nh_scope == RT_SCOPE_LINK)
 			rt->rt_gateway = nh->nh_gw;
-		if (unlikely(nh->nh_exceptions))
-			rt_bind_exception(rt, nh, fl4->daddr);
-		rt_init_metrics(rt, fl4, fi);
+		if (unlikely(fnhe))
+			rt_bind_exception(rt, fnhe);
+		rt_init_metrics(rt, fi);
 #ifdef CONFIG_IP_ROUTE_CLASSID
-		rt->dst.tclassid = FIB_RES_NH(*res).nh_tclassid;
+		rt->dst.tclassid = nh->nh_tclassid;
 #endif
+		if (!(rt->dst.flags & DST_HOST) &&
+		    rt_is_output_route(rt))
+			rt_cache_route(nh, rt);
 	}
 
 #ifdef CONFIG_IP_ROUTE_CLASSID
@@ -1217,10 +1242,10 @@ static void rt_set_nexthop(struct rtable *rt, const struct flowi4 *fl4,
 }
 
 static struct rtable *rt_dst_alloc(struct net_device *dev,
-				   bool nopolicy, bool noxfrm)
+				   bool nopolicy, bool noxfrm, bool will_cache)
 {
 	return dst_alloc(&ipv4_dst_ops, dev, 1, -1,
-			 DST_HOST | DST_NOCACHE |
+			 (will_cache ? 0 : DST_HOST) | DST_NOCACHE |
 			 (nopolicy ? DST_NOPOLICY : 0) |
 			 (noxfrm ? DST_NOXFRM : 0));
 }
@@ -1257,7 +1282,7 @@ static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
 			goto e_err;
 	}
 	rth = rt_dst_alloc(dev_net(dev)->loopback_dev,
-			   IN_DEV_CONF_GET(in_dev, NOPOLICY), false);
+			   IN_DEV_CONF_GET(in_dev, NOPOLICY), false, false);
 	if (!rth)
 		goto e_nobufs;
 
@@ -1330,6 +1355,7 @@ static int __mkroute_input(struct sk_buff *skb,
 			   __be32 daddr, __be32 saddr, u32 tos,
 			   struct rtable **result)
 {
+	struct fib_nh_exception *fnhe;
 	struct rtable *rth;
 	int err;
 	struct in_device *out_dev;
@@ -1376,9 +1402,13 @@ static int __mkroute_input(struct sk_buff *skb,
 		}
 	}
 
+	fnhe = NULL;
+	if (res->fi)
+		fnhe = find_exception(&FIB_RES_NH(*res), daddr);
+
 	rth = rt_dst_alloc(out_dev->dev,
 			   IN_DEV_CONF_GET(in_dev, NOPOLICY),
-			   IN_DEV_CONF_GET(out_dev, NOXFRM));
+			   IN_DEV_CONF_GET(out_dev, NOXFRM), false);
 	if (!rth) {
 		err = -ENOBUFS;
 		goto cleanup;
@@ -1397,7 +1427,7 @@ static int __mkroute_input(struct sk_buff *skb,
 	rth->dst.input = ip_forward;
 	rth->dst.output = ip_output;
 
-	rt_set_nexthop(rth, NULL, res, res->fi, res->type, itag);
+	rt_set_nexthop(rth, res, fnhe, res->fi, res->type, itag);
 
 	*result = rth;
 	err = 0;
@@ -1539,7 +1569,7 @@ brd_input:
 
 local_input:
 	rth = rt_dst_alloc(net->loopback_dev,
-			   IN_DEV_CONF_GET(in_dev, NOPOLICY), false);
+			   IN_DEV_CONF_GET(in_dev, NOPOLICY), false, false);
 	if (!rth)
 		goto e_nobufs;
 
@@ -1653,6 +1683,7 @@ static struct rtable *__mkroute_output(const struct fib_result *res,
 				       unsigned int flags)
 {
 	struct fib_info *fi = res->fi;
+	struct fib_nh_exception *fnhe;
 	struct in_device *in_dev;
 	u16 type = res->type;
 	struct rtable *rth;
@@ -1691,9 +1722,21 @@ static struct rtable *__mkroute_output(const struct fib_result *res,
 			fi = NULL;
 	}
 
+	fnhe = NULL;
+	if (fi) {
+		fnhe = find_exception(&FIB_RES_NH(*res), fl4->daddr);
+		if (!fnhe) {
+			rth = FIB_RES_NH(*res).nh_rth_output;
+			if (rth) {
+				dst_use(&rth->dst, jiffies);
+				return rth;
+			}
+		}
+	}
 	rth = rt_dst_alloc(dev_out,
 			   IN_DEV_CONF_GET(in_dev, NOPOLICY),
-			   IN_DEV_CONF_GET(in_dev, NOXFRM));
+			   IN_DEV_CONF_GET(in_dev, NOXFRM),
+			   fi && !fnhe);
 	if (!rth)
 		return ERR_PTR(-ENOBUFS);
 
@@ -1730,7 +1773,7 @@ static struct rtable *__mkroute_output(const struct fib_result *res,
 #endif
 	}
 
-	rt_set_nexthop(rth, fl4, res, fi, type, 0);
+	rt_set_nexthop(rth, res, fnhe, fi, type, 0);
 
 	if (fl4->flowi4_flags & FLOWI_FLAG_RT_NOCACHE)
 		rth->dst.flags |= DST_NOCACHE;
-- 
1.7.10.4

^ permalink raw reply related

* [PATCH 08/15] ipv4: Kill routes during PMTU/redirect updates.
From: David Miller @ 2012-07-18 18:23 UTC (permalink / raw)
  To: netdev


Mark them obsolete so there will be a re-lookup to fetch the
FIB nexthop exception info.

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/route.c |   39 +++++++++++++++++++++++++++------------
 1 file changed, 27 insertions(+), 12 deletions(-)

diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 7ebf788..4d170a1 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -657,7 +657,8 @@ out:
 	return fnhe;
 }
 
-static void __ip_do_redirect(struct rtable *rt, struct sk_buff *skb, struct flowi4 *fl4)
+static void __ip_do_redirect(struct rtable *rt, struct sk_buff *skb, struct flowi4 *fl4,
+			     bool kill_route)
 {
 	__be32 new_gw = icmp_hdr(skb)->un.gateway;
 	__be32 old_gw = ip_hdr(skb)->saddr;
@@ -716,8 +717,8 @@ static void __ip_do_redirect(struct rtable *rt, struct sk_buff *skb, struct flow
 					fnhe->fnhe_gw = new_gw;
 				spin_unlock_bh(&fnhe_lock);
 			}
-			rt->rt_gateway = new_gw;
-			rt->rt_flags |= RTCF_REDIRECTED;
+			if (kill_route)
+				rt->dst.obsolete = -2;
 			call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n);
 		}
 		neigh_release(n);
@@ -748,7 +749,7 @@ static void ip_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buf
 	rt = (struct rtable *) dst;
 
 	ip_rt_build_flow_key(&fl4, sk, skb);
-	__ip_do_redirect(rt, skb, &fl4);
+	__ip_do_redirect(rt, skb, &fl4, true);
 }
 
 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
@@ -907,7 +908,7 @@ out:	kfree_skb(skb);
 	return 0;
 }
 
-static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu)
+static u32 __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu)
 {
 	struct fib_result res;
 
@@ -926,8 +927,7 @@ static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu)
 		}
 		spin_unlock_bh(&fnhe_lock);
 	}
-	rt->rt_pmtu = mtu;
-	dst_set_expires(&rt->dst, ip_rt_mtu_expires);
+	return mtu;
 }
 
 static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
@@ -937,7 +937,14 @@ static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
 	struct flowi4 fl4;
 
 	ip_rt_build_flow_key(&fl4, sk, skb);
-	__ip_rt_update_pmtu(rt, &fl4, mtu);
+	mtu = __ip_rt_update_pmtu(rt, &fl4, mtu);
+
+	if (!rt->rt_pmtu) {
+		dst->expires = -2;
+	} else {
+		rt->rt_pmtu = mtu;
+		dst_set_expires(&rt->dst, ip_rt_mtu_expires);
+	}
 }
 
 void ipv4_update_pmtu(struct sk_buff *skb, struct net *net, u32 mtu,
@@ -983,7 +990,7 @@ void ipv4_redirect(struct sk_buff *skb, struct net *net,
 			 RT_TOS(iph->tos), protocol, mark, flow_flags);
 	rt = __ip_route_output_key(net, &fl4);
 	if (!IS_ERR(rt)) {
-		__ip_do_redirect(rt, skb, &fl4);
+		__ip_do_redirect(rt, skb, &fl4, false);
 		ip_rt_put(rt);
 	}
 }
@@ -998,7 +1005,7 @@ void ipv4_sk_redirect(struct sk_buff *skb, struct sock *sk)
 	__build_flow_key(&fl4, sk, iph, 0, 0, 0, 0, 0);
 	rt = __ip_route_output_key(sock_net(sk), &fl4);
 	if (!IS_ERR(rt)) {
-		__ip_do_redirect(rt, skb, &fl4);
+		__ip_do_redirect(rt, skb, &fl4, false);
 		ip_rt_put(rt);
 	}
 }
@@ -1008,7 +1015,13 @@ static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
 {
 	struct rtable *rt = (struct rtable *) dst;
 
-	if (rt_is_expired(rt))
+	/* All IPV4 dsts are created with obsoluete set to -1, that
+	 * forces validation calls down into this function always.
+	 *
+	 * When a PMTU/redirect information update invalidates a
+	 * route, this is indicated by setting obsolete to -2.
+	 */
+	if (dst->obsolete == -2 || rt_is_expired(rt))
 		return NULL;
 	return dst;
 }
@@ -1168,8 +1181,10 @@ static void rt_bind_exception(struct rtable *rt, struct fib_nh *nh, __be32 daddr
 					dst_set_expires(&rt->dst, diff);
 				}
 			}
-			if (fnhe->fnhe_gw)
+			if (fnhe->fnhe_gw) {
+				rt->rt_flags |= RTCF_REDIRECTED;
 				rt->rt_gateway = fnhe->fnhe_gw;
+			}
 			fnhe->fnhe_stamp = jiffies;
 			break;
 		}
-- 
1.7.10.4

^ permalink raw reply related

* [PATCH 07/15] ipv4: Adjust semantics of rt->rt_gateway.
From: David Miller @ 2012-07-18 18:23 UTC (permalink / raw)
  To: netdev


In order to allow prefixed routes, we have to adjust how rt_gateway
is set an interpreted.

The new interpretation is:

1) rt_gateway == 0, destination is on-link, nexthop is iph->daddr

2) rt_gateway != 0, destination requires a nexthop gateway

Signed-off-by: David S. Miller <davem@davemloft.net>
Tested-by: Vijay Subramanian <subramanian.vijay@gmail.com>
---
 net/ipv4/arp.c                      |    3 ++-
 net/ipv4/inet_connection_sock.c     |    4 ++--
 net/ipv4/ip_gre.c                   |    2 ++
 net/ipv4/ip_output.c                |    2 +-
 net/ipv4/ipip.c                     |    2 ++
 net/ipv4/netfilter/ipt_MASQUERADE.c |    7 +++++--
 net/ipv4/route.c                    |   18 ++++++++++--------
 7 files changed, 24 insertions(+), 14 deletions(-)

diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c
index c38293f..672d6f3 100644
--- a/net/ipv4/arp.c
+++ b/net/ipv4/arp.c
@@ -476,7 +476,8 @@ int arp_find(unsigned char *haddr, struct sk_buff *skb)
 	}
 
 	paddr = skb_rtable(skb)->rt_gateway;
-
+	if (!paddr)
+		paddr = ip_hdr(skb)->daddr;
 	if (arp_set_predefined(inet_addr_type(dev_net(dev), paddr), haddr,
 			       paddr, dev))
 		return 0;
diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
index c7a4de0..0a290d7 100644
--- a/net/ipv4/inet_connection_sock.c
+++ b/net/ipv4/inet_connection_sock.c
@@ -389,7 +389,7 @@ struct dst_entry *inet_csk_route_req(struct sock *sk,
 	rt = ip_route_output_flow(net, fl4, sk);
 	if (IS_ERR(rt))
 		goto no_route;
-	if (opt && opt->opt.is_strictroute && fl4->daddr != rt->rt_gateway)
+	if (opt && opt->opt.is_strictroute && rt->rt_gateway)
 		goto route_err;
 	return &rt->dst;
 
@@ -422,7 +422,7 @@ struct dst_entry *inet_csk_route_child_sock(struct sock *sk,
 	rt = ip_route_output_flow(net, fl4, sk);
 	if (IS_ERR(rt))
 		goto no_route;
-	if (opt && opt->opt.is_strictroute && fl4->daddr != rt->rt_gateway)
+	if (opt && opt->opt.is_strictroute && rt->rt_gateway)
 		goto route_err;
 	return &rt->dst;
 
diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c
index 42c44b1..1ff6bf8 100644
--- a/net/ipv4/ip_gre.c
+++ b/net/ipv4/ip_gre.c
@@ -767,6 +767,8 @@ static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev
 		if (skb->protocol == htons(ETH_P_IP)) {
 			rt = skb_rtable(skb);
 			dst = rt->rt_gateway;
+			if (!dst)
+				dst = old_iph->daddr;
 		}
 #if IS_ENABLED(CONFIG_IPV6)
 		else if (skb->protocol == htons(ETH_P_IPV6)) {
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index cc52679..6b805e0 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -371,7 +371,7 @@ int ip_queue_xmit(struct sk_buff *skb, struct flowi *fl)
 	skb_dst_set_noref(skb, &rt->dst);
 
 packet_routed:
-	if (inet_opt && inet_opt->opt.is_strictroute && fl4->daddr != rt->rt_gateway)
+	if (inet_opt && inet_opt->opt.is_strictroute && rt->rt_gateway)
 		goto no_route;
 
 	/* OK, we know where to send it, allocate and build IP header. */
diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c
index 2c2c35b..59e0e95 100644
--- a/net/ipv4/ipip.c
+++ b/net/ipv4/ipip.c
@@ -488,6 +488,8 @@ static netdev_tx_t ipip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
 			goto tx_error;
 		}
 		dst = rt->rt_gateway;
+		if (!dst)
+			dst = old_iph->daddr;
 	}
 
 	rt = ip_route_output_ports(dev_net(dev), &fl4, NULL,
diff --git a/net/ipv4/netfilter/ipt_MASQUERADE.c b/net/ipv4/netfilter/ipt_MASQUERADE.c
index 2f210c7..b99746b 100644
--- a/net/ipv4/netfilter/ipt_MASQUERADE.c
+++ b/net/ipv4/netfilter/ipt_MASQUERADE.c
@@ -52,7 +52,7 @@ masquerade_tg(struct sk_buff *skb, const struct xt_action_param *par)
 	struct nf_nat_ipv4_range newrange;
 	const struct nf_nat_ipv4_multi_range_compat *mr;
 	const struct rtable *rt;
-	__be32 newsrc;
+	__be32 newsrc, nh;
 
 	NF_CT_ASSERT(par->hooknum == NF_INET_POST_ROUTING);
 
@@ -70,7 +70,10 @@ masquerade_tg(struct sk_buff *skb, const struct xt_action_param *par)
 
 	mr = par->targinfo;
 	rt = skb_rtable(skb);
-	newsrc = inet_select_addr(par->out, rt->rt_gateway, RT_SCOPE_UNIVERSE);
+	nh = rt->rt_gateway;
+	if (!nh)
+		nh = ip_hdr(skb)->daddr;
+	newsrc = inet_select_addr(par->out, nh, RT_SCOPE_UNIVERSE);
 	if (!newsrc) {
 		pr_info("%s ate my IP address\n", par->out->name);
 		return NF_DROP;
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 76eb78e..7ebf788 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -1079,8 +1079,10 @@ void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt)
 		if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res) == 0)
 			src = FIB_RES_PREFSRC(dev_net(rt->dst.dev), res);
 		else
-			src = inet_select_addr(rt->dst.dev, rt->rt_gateway,
-					RT_SCOPE_UNIVERSE);
+			src = inet_select_addr(rt->dst.dev, (rt->rt_gateway ?
+							     rt->rt_gateway :
+							     iph->daddr),
+					       RT_SCOPE_UNIVERSE);
 		rcu_read_unlock();
 	}
 	memcpy(addr, &src, 4);
@@ -1126,7 +1128,7 @@ static unsigned int ipv4_mtu(const struct dst_entry *dst)
 	mtu = dst->dev->mtu;
 
 	if (unlikely(dst_metric_locked(dst, RTAX_MTU))) {
-		if (rt->rt_gateway != 0 && mtu > 576)
+		if (rt->rt_gateway && mtu > 576)
 			mtu = 576;
 	}
 
@@ -1256,7 +1258,7 @@ static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
 	rth->rt_iif	= dev->ifindex;
 	rth->rt_oif	= 0;
 	rth->rt_pmtu	= 0;
-	rth->rt_gateway	= daddr;
+	rth->rt_gateway	= 0;
 	rth->fi = NULL;
 	if (our) {
 		rth->dst.input= ip_local_deliver;
@@ -1374,7 +1376,7 @@ static int __mkroute_input(struct sk_buff *skb,
 	rth->rt_iif 	= in_dev->dev->ifindex;
 	rth->rt_oif 	= 0;
 	rth->rt_pmtu	= 0;
-	rth->rt_gateway	= daddr;
+	rth->rt_gateway	= 0;
 	rth->fi = NULL;
 
 	rth->dst.input = ip_forward;
@@ -1539,7 +1541,7 @@ local_input:
 	rth->rt_iif	= dev->ifindex;
 	rth->rt_oif	= 0;
 	rth->rt_pmtu	= 0;
-	rth->rt_gateway	= daddr;
+	rth->rt_gateway	= 0;
 	rth->fi = NULL;
 	if (res.type == RTN_UNREACHABLE) {
 		rth->dst.input= ip_error;
@@ -1689,7 +1691,7 @@ static struct rtable *__mkroute_output(const struct fib_result *res,
 	rth->rt_iif	= orig_oif ? : dev_out->ifindex;
 	rth->rt_oif	= orig_oif;
 	rth->rt_pmtu	= 0;
-	rth->rt_gateway = fl4->daddr;
+	rth->rt_gateway = 0;
 	rth->fi = NULL;
 
 	RT_CACHE_STAT_INC(out_slow_tot);
@@ -2051,7 +2053,7 @@ static int rt_fill_info(struct net *net,  __be32 dst, __be32 src,
 		if (nla_put_be32(skb, RTA_PREFSRC, fl4->saddr))
 			goto nla_put_failure;
 	}
-	if (fl4->daddr != rt->rt_gateway &&
+	if (rt->rt_gateway &&
 	    nla_put_be32(skb, RTA_GATEWAY, rt->rt_gateway))
 		goto nla_put_failure;
 
-- 
1.7.10.4

^ permalink raw reply related

* [PATCH 06/15] ipv4: Remove 'rt_dst' from 'struct rtable'
From: David Miller @ 2012-07-18 18:23 UTC (permalink / raw)
  To: netdev


Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/route.h     |    1 -
 net/ipv4/route.c        |   45 +++++++++------------------------------------
 net/ipv4/xfrm4_policy.c |    1 -
 3 files changed, 9 insertions(+), 38 deletions(-)

diff --git a/include/net/route.h b/include/net/route.h
index 757fe40..6d111bc 100644
--- a/include/net/route.h
+++ b/include/net/route.h
@@ -48,7 +48,6 @@ struct rtable {
 	unsigned int		rt_flags;
 	__u16			rt_type;
 
-	__be32			rt_dst;	/* Path destination	*/
 	int			rt_route_iif;
 	int			rt_iif;
 	int			rt_oif;
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 5ef9f38..76eb78e 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -838,7 +838,7 @@ void ip_rt_send_redirect(struct sk_buff *skb)
 		    peer->rate_tokens == ip_rt_redirect_number)
 			net_warn_ratelimited("host %pI4/if%d ignores redirects for %pI4 to %pI4\n",
 					     &ip_hdr(skb)->saddr, rt->rt_iif,
-					     &rt->rt_dst, &rt->rt_gateway);
+					     &ip_hdr(skb)->daddr, &rt->rt_gateway);
 #endif
 	}
 out_put_peer:
@@ -1126,8 +1126,7 @@ static unsigned int ipv4_mtu(const struct dst_entry *dst)
 	mtu = dst->dev->mtu;
 
 	if (unlikely(dst_metric_locked(dst, RTAX_MTU))) {
-
-		if (rt->rt_gateway != rt->rt_dst && mtu > 576)
+		if (rt->rt_gateway != 0 && mtu > 576)
 			mtu = 576;
 	}
 
@@ -1253,7 +1252,6 @@ static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
 	rth->rt_genid	= rt_genid(dev_net(dev));
 	rth->rt_flags	= RTCF_MULTICAST;
 	rth->rt_type	= RTN_MULTICAST;
-	rth->rt_dst	= daddr;
 	rth->rt_route_iif = dev->ifindex;
 	rth->rt_iif	= dev->ifindex;
 	rth->rt_oif	= 0;
@@ -1372,7 +1370,6 @@ static int __mkroute_input(struct sk_buff *skb,
 	rth->rt_genid = rt_genid(dev_net(rth->dst.dev));
 	rth->rt_flags = flags;
 	rth->rt_type = res->type;
-	rth->rt_dst	= daddr;
 	rth->rt_route_iif = in_dev->dev->ifindex;
 	rth->rt_iif 	= in_dev->dev->ifindex;
 	rth->rt_oif 	= 0;
@@ -1538,7 +1535,6 @@ local_input:
 	rth->rt_genid = rt_genid(net);
 	rth->rt_flags 	= flags|RTCF_LOCAL;
 	rth->rt_type	= res.type;
-	rth->rt_dst	= daddr;
 	rth->rt_route_iif = dev->ifindex;
 	rth->rt_iif	= dev->ifindex;
 	rth->rt_oif	= 0;
@@ -1689,7 +1685,6 @@ static struct rtable *__mkroute_output(const struct fib_result *res,
 	rth->rt_genid = rt_genid(dev_net(dev_out));
 	rth->rt_flags	= flags;
 	rth->rt_type	= type;
-	rth->rt_dst	= fl4->daddr;
 	rth->rt_route_iif = 0;
 	rth->rt_iif	= orig_oif ? : dev_out->ifindex;
 	rth->rt_oif	= orig_oif;
@@ -1977,7 +1972,6 @@ struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_or
 		rt->rt_genid = rt_genid(net);
 		rt->rt_flags = ort->rt_flags;
 		rt->rt_type = ort->rt_type;
-		rt->rt_dst = ort->rt_dst;
 		rt->rt_gateway = ort->rt_gateway;
 		rt->fi = ort->fi;
 		if (rt->fi)
@@ -2008,9 +2002,9 @@ struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4,
 }
 EXPORT_SYMBOL_GPL(ip_route_output_flow);
 
-static int rt_fill_info(struct net *net,  __be32 src, struct flowi4 *fl4,
-			struct sk_buff *skb, u32 pid, u32 seq, int event,
-			int nowait, unsigned int flags)
+static int rt_fill_info(struct net *net,  __be32 dst, __be32 src,
+			struct flowi4 *fl4, struct sk_buff *skb, u32 pid,
+			u32 seq, int event, int nowait, unsigned int flags)
 {
 	struct rtable *rt = skb_rtable(skb);
 	struct rtmsg *r;
@@ -2037,7 +2031,7 @@ static int rt_fill_info(struct net *net,  __be32 src, struct flowi4 *fl4,
 	if (rt->rt_flags & RTCF_NOTIFY)
 		r->rtm_flags |= RTM_F_NOTIFY;
 
-	if (nla_put_be32(skb, RTA_DST, rt->rt_dst))
+	if (nla_put_be32(skb, RTA_DST, dst))
 		goto nla_put_failure;
 	if (src) {
 		r->rtm_src_len = 32;
@@ -2078,29 +2072,8 @@ static int rt_fill_info(struct net *net,  __be32 src, struct flowi4 *fl4,
 	}
 
 	if (rt_is_input_route(rt)) {
-#ifdef CONFIG_IP_MROUTE
-		__be32 dst = rt->rt_dst;
-
-		if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) &&
-		    IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
-			int err = ipmr_get_route(net, skb,
-						 fl4->saddr, fl4->daddr,
-						 r, nowait);
-			if (err <= 0) {
-				if (!nowait) {
-					if (err == 0)
-						return 0;
-					goto nla_put_failure;
-				} else {
-					if (err == -EMSGSIZE)
-						goto nla_put_failure;
-					error = err;
-				}
-			}
-		} else
-#endif
-			if (nla_put_u32(skb, RTA_IIF, rt->rt_iif))
-				goto nla_put_failure;
+		if (nla_put_u32(skb, RTA_IIF, rt->rt_iif))
+			goto nla_put_failure;
 	}
 
 	if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, error) < 0)
@@ -2195,7 +2168,7 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, void
 	if (rtm->rtm_flags & RTM_F_NOTIFY)
 		rt->rt_flags |= RTCF_NOTIFY;
 
-	err = rt_fill_info(net, src, &fl4, skb,
+	err = rt_fill_info(net, dst, src, &fl4, skb,
 			   NETLINK_CB(in_skb).pid, nlh->nlmsg_seq,
 			   RTM_NEWROUTE, 0, 0);
 	if (err <= 0)
diff --git a/net/ipv4/xfrm4_policy.c b/net/ipv4/xfrm4_policy.c
index f73ba82..6074b69 100644
--- a/net/ipv4/xfrm4_policy.c
+++ b/net/ipv4/xfrm4_policy.c
@@ -91,7 +91,6 @@ static int xfrm4_fill_dst(struct xfrm_dst *xdst, struct net_device *dev,
 	xdst->u.rt.rt_flags = rt->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST |
 					      RTCF_LOCAL);
 	xdst->u.rt.rt_type = rt->rt_type;
-	xdst->u.rt.rt_dst = rt->rt_dst;
 	xdst->u.rt.rt_gateway = rt->rt_gateway;
 	xdst->u.rt.rt_pmtu = rt->rt_pmtu;
 
-- 
1.7.10.4

^ permalink raw reply related

* [PATCH 05/15] ipv4: Remove 'rt_mark' from 'struct rtable'
From: David Miller @ 2012-07-18 18:23 UTC (permalink / raw)
  To: netdev


Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/route.h     |    1 -
 net/ipv4/ipmr.c         |    2 +-
 net/ipv4/route.c        |    9 ++-------
 net/ipv4/xfrm4_policy.c |    1 -
 4 files changed, 3 insertions(+), 10 deletions(-)

diff --git a/include/net/route.h b/include/net/route.h
index 85d1093..757fe40 100644
--- a/include/net/route.h
+++ b/include/net/route.h
@@ -52,7 +52,6 @@ struct rtable {
 	int			rt_route_iif;
 	int			rt_iif;
 	int			rt_oif;
-	__u32			rt_mark;
 
 	/* Info on neighbour */
 	__be32			rt_gateway;
diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c
index 5716c6b..eee3bf6 100644
--- a/net/ipv4/ipmr.c
+++ b/net/ipv4/ipmr.c
@@ -1797,7 +1797,7 @@ static struct mr_table *ipmr_rt_fib_lookup(struct net *net, struct sk_buff *skb)
 		.flowi4_tos = RT_TOS(iph->tos),
 		.flowi4_oif = rt->rt_oif,
 		.flowi4_iif = rt->rt_iif,
-		.flowi4_mark = rt->rt_mark,
+		.flowi4_mark = skb->mark,
 	};
 	struct mr_table *mrt;
 	int err;
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 2d4ae01..5ef9f38 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -1257,7 +1257,6 @@ static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
 	rth->rt_route_iif = dev->ifindex;
 	rth->rt_iif	= dev->ifindex;
 	rth->rt_oif	= 0;
-	rth->rt_mark    = skb->mark;
 	rth->rt_pmtu	= 0;
 	rth->rt_gateway	= daddr;
 	rth->fi = NULL;
@@ -1377,7 +1376,6 @@ static int __mkroute_input(struct sk_buff *skb,
 	rth->rt_route_iif = in_dev->dev->ifindex;
 	rth->rt_iif 	= in_dev->dev->ifindex;
 	rth->rt_oif 	= 0;
-	rth->rt_mark    = skb->mark;
 	rth->rt_pmtu	= 0;
 	rth->rt_gateway	= daddr;
 	rth->fi = NULL;
@@ -1544,7 +1542,6 @@ local_input:
 	rth->rt_route_iif = dev->ifindex;
 	rth->rt_iif	= dev->ifindex;
 	rth->rt_oif	= 0;
-	rth->rt_mark    = skb->mark;
 	rth->rt_pmtu	= 0;
 	rth->rt_gateway	= daddr;
 	rth->fi = NULL;
@@ -1696,7 +1693,6 @@ static struct rtable *__mkroute_output(const struct fib_result *res,
 	rth->rt_route_iif = 0;
 	rth->rt_iif	= orig_oif ? : dev_out->ifindex;
 	rth->rt_oif	= orig_oif;
-	rth->rt_mark    = fl4->flowi4_mark;
 	rth->rt_pmtu	= 0;
 	rth->rt_gateway = fl4->daddr;
 	rth->fi = NULL;
@@ -1976,7 +1972,6 @@ struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_or
 		rt->rt_route_iif = ort->rt_route_iif;
 		rt->rt_iif = ort->rt_iif;
 		rt->rt_oif = ort->rt_oif;
-		rt->rt_mark = ort->rt_mark;
 		rt->rt_pmtu = ort->rt_pmtu;
 
 		rt->rt_genid = rt_genid(net);
@@ -2069,8 +2064,8 @@ static int rt_fill_info(struct net *net,  __be32 src, struct flowi4 *fl4,
 	if (rtnetlink_put_metrics(skb, dst_metrics_ptr(&rt->dst)) < 0)
 		goto nla_put_failure;
 
-	if (rt->rt_mark &&
-	    nla_put_be32(skb, RTA_MARK, rt->rt_mark))
+	if (fl4->flowi4_mark &&
+	    nla_put_be32(skb, RTA_MARK, fl4->flowi4_mark))
 		goto nla_put_failure;
 
 	error = rt->dst.error;
diff --git a/net/ipv4/xfrm4_policy.c b/net/ipv4/xfrm4_policy.c
index 00d49e4..f73ba82 100644
--- a/net/ipv4/xfrm4_policy.c
+++ b/net/ipv4/xfrm4_policy.c
@@ -82,7 +82,6 @@ static int xfrm4_fill_dst(struct xfrm_dst *xdst, struct net_device *dev,
 	xdst->u.rt.rt_route_iif = fl4->flowi4_iif;
 	xdst->u.rt.rt_iif = fl4->flowi4_iif;
 	xdst->u.rt.rt_oif = fl4->flowi4_oif;
-	xdst->u.rt.rt_mark = fl4->flowi4_mark;
 
 	xdst->u.dst.dev = dev;
 	dev_hold(dev);
-- 
1.7.10.4

^ permalink raw reply related

* [PATCH 04/15] ipv4: Kill 'rt_src' from 'struct rtable'
From: David Miller @ 2012-07-18 18:23 UTC (permalink / raw)
  To: netdev


Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/route.h     |    1 -
 net/ipv4/route.c        |   34 +++++++++++++++-------------------
 net/ipv4/xfrm4_policy.c |    1 -
 3 files changed, 15 insertions(+), 21 deletions(-)

diff --git a/include/net/route.h b/include/net/route.h
index 935fa59..85d1093 100644
--- a/include/net/route.h
+++ b/include/net/route.h
@@ -49,7 +49,6 @@ struct rtable {
 	__u16			rt_type;
 
 	__be32			rt_dst;	/* Path destination	*/
-	__be32			rt_src;	/* Path source		*/
 	int			rt_route_iif;
 	int			rt_iif;
 	int			rt_oif;
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 83e9663..2d4ae01 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -1254,7 +1254,6 @@ static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
 	rth->rt_flags	= RTCF_MULTICAST;
 	rth->rt_type	= RTN_MULTICAST;
 	rth->rt_dst	= daddr;
-	rth->rt_src	= saddr;
 	rth->rt_route_iif = dev->ifindex;
 	rth->rt_iif	= dev->ifindex;
 	rth->rt_oif	= 0;
@@ -1375,7 +1374,6 @@ static int __mkroute_input(struct sk_buff *skb,
 	rth->rt_flags = flags;
 	rth->rt_type = res->type;
 	rth->rt_dst	= daddr;
-	rth->rt_src	= saddr;
 	rth->rt_route_iif = in_dev->dev->ifindex;
 	rth->rt_iif 	= in_dev->dev->ifindex;
 	rth->rt_oif 	= 0;
@@ -1543,7 +1541,6 @@ local_input:
 	rth->rt_flags 	= flags|RTCF_LOCAL;
 	rth->rt_type	= res.type;
 	rth->rt_dst	= daddr;
-	rth->rt_src	= saddr;
 	rth->rt_route_iif = dev->ifindex;
 	rth->rt_iif	= dev->ifindex;
 	rth->rt_oif	= 0;
@@ -1696,7 +1693,6 @@ static struct rtable *__mkroute_output(const struct fib_result *res,
 	rth->rt_flags	= flags;
 	rth->rt_type	= type;
 	rth->rt_dst	= fl4->daddr;
-	rth->rt_src	= fl4->saddr;
 	rth->rt_route_iif = 0;
 	rth->rt_iif	= orig_oif ? : dev_out->ifindex;
 	rth->rt_oif	= orig_oif;
@@ -1987,7 +1983,6 @@ struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_or
 		rt->rt_flags = ort->rt_flags;
 		rt->rt_type = ort->rt_type;
 		rt->rt_dst = ort->rt_dst;
-		rt->rt_src = ort->rt_src;
 		rt->rt_gateway = ort->rt_gateway;
 		rt->fi = ort->fi;
 		if (rt->fi)
@@ -2018,7 +2013,7 @@ struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4,
 }
 EXPORT_SYMBOL_GPL(ip_route_output_flow);
 
-static int rt_fill_info(struct net *net,  __be32 src, u8 tos,
+static int rt_fill_info(struct net *net,  __be32 src, struct flowi4 *fl4,
 			struct sk_buff *skb, u32 pid, u32 seq, int event,
 			int nowait, unsigned int flags)
 {
@@ -2036,7 +2031,7 @@ static int rt_fill_info(struct net *net,  __be32 src, u8 tos,
 	r->rtm_family	 = AF_INET;
 	r->rtm_dst_len	= 32;
 	r->rtm_src_len	= 0;
-	r->rtm_tos	= tos;
+	r->rtm_tos	= fl4->flowi4_tos;
 	r->rtm_table	= RT_TABLE_MAIN;
 	if (nla_put_u32(skb, RTA_TABLE, RT_TABLE_MAIN))
 		goto nla_put_failure;
@@ -2063,11 +2058,11 @@ static int rt_fill_info(struct net *net,  __be32 src, u8 tos,
 		goto nla_put_failure;
 #endif
 	if (!rt_is_input_route(rt) &&
-	    rt->rt_src != src) {
-		if (nla_put_be32(skb, RTA_PREFSRC, rt->rt_src))
+	    fl4->saddr != src) {
+		if (nla_put_be32(skb, RTA_PREFSRC, fl4->saddr))
 			goto nla_put_failure;
 	}
-	if (rt->rt_dst != rt->rt_gateway &&
+	if (fl4->daddr != rt->rt_gateway &&
 	    nla_put_be32(skb, RTA_GATEWAY, rt->rt_gateway))
 		goto nla_put_failure;
 
@@ -2094,7 +2089,7 @@ static int rt_fill_info(struct net *net,  __be32 src, u8 tos,
 		if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) &&
 		    IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
 			int err = ipmr_get_route(net, skb,
-						 rt->rt_src, rt->rt_dst,
+						 fl4->saddr, fl4->daddr,
 						 r, nowait);
 			if (err <= 0) {
 				if (!nowait) {
@@ -2129,6 +2124,7 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, void
 	struct rtmsg *rtm;
 	struct nlattr *tb[RTA_MAX+1];
 	struct rtable *rt = NULL;
+	struct flowi4 fl4;
 	__be32 dst = 0;
 	__be32 src = 0;
 	u32 iif;
@@ -2163,6 +2159,13 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, void
 	iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
 	mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0;
 
+	memset(&fl4, 0, sizeof(fl4));
+	fl4.daddr = dst;
+	fl4.saddr = src;
+	fl4.flowi4_tos = rtm->rtm_tos;
+	fl4.flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0;
+	fl4.flowi4_mark = mark;
+
 	if (iif) {
 		struct net_device *dev;
 
@@ -2183,13 +2186,6 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, void
 		if (err == 0 && rt->dst.error)
 			err = -rt->dst.error;
 	} else {
-		struct flowi4 fl4 = {
-			.daddr = dst,
-			.saddr = src,
-			.flowi4_tos = rtm->rtm_tos,
-			.flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0,
-			.flowi4_mark = mark,
-		};
 		rt = ip_route_output_key(net, &fl4);
 
 		err = 0;
@@ -2204,7 +2200,7 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, void
 	if (rtm->rtm_flags & RTM_F_NOTIFY)
 		rt->rt_flags |= RTCF_NOTIFY;
 
-	err = rt_fill_info(net, src, rtm->rtm_tos, skb,
+	err = rt_fill_info(net, src, &fl4, skb,
 			   NETLINK_CB(in_skb).pid, nlh->nlmsg_seq,
 			   RTM_NEWROUTE, 0, 0);
 	if (err <= 0)
diff --git a/net/ipv4/xfrm4_policy.c b/net/ipv4/xfrm4_policy.c
index 2a8d5cf..00d49e4 100644
--- a/net/ipv4/xfrm4_policy.c
+++ b/net/ipv4/xfrm4_policy.c
@@ -92,7 +92,6 @@ static int xfrm4_fill_dst(struct xfrm_dst *xdst, struct net_device *dev,
 	xdst->u.rt.rt_flags = rt->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST |
 					      RTCF_LOCAL);
 	xdst->u.rt.rt_type = rt->rt_type;
-	xdst->u.rt.rt_src = rt->rt_src;
 	xdst->u.rt.rt_dst = rt->rt_dst;
 	xdst->u.rt.rt_gateway = rt->rt_gateway;
 	xdst->u.rt.rt_pmtu = rt->rt_pmtu;
-- 
1.7.10.4

^ permalink raw reply related

* RE: r8169: link up, link down
From: J. Christopher Pereira @ 2012-07-18 18:23 UTC (permalink / raw)
  To: 'Francois Romieu'; +Cc: netdev
In-Reply-To: <20120718173909.GA12524@electric-eye.fr.zoreil.com>

Hi Francois:

Thanks for answering.

> If it's an XID 98000000 - i.e. old new hardware - you may try to remove
the device then rescan the PCI bus through sysfs.

dmesg says "eth0: RTL8110s at 0xffffc2000067ec00, 00:4f:4a:10:1e:cf, XID
04000000 IRQ 16".

> Building a modern kernel is strongly suggested if the hardware includes a
recent 816x chipset.

Is there any particular patch I could apply and just recompile the driver?
My hope was to first receive feedback and identify some probably related
known bug, in order to avoid searching for a solution by trial and error or
by updating the whole environment.

^ permalink raw reply

* [PATCH 03/15] ipv4: Remove rt_key_{src,dst,tos} from struct rtable.
From: David Miller @ 2012-07-18 18:23 UTC (permalink / raw)
  To: netdev


They are always used in contexts where they can be reconstituted,
or where the finally resolved rt->rt_{src,dst} is semantically
equivalent.

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/route.h     |    5 -----
 net/ipv4/route.c        |   39 +++++++++------------------------------
 net/ipv4/xfrm4_policy.c |    3 ---
 3 files changed, 9 insertions(+), 38 deletions(-)

diff --git a/include/net/route.h b/include/net/route.h
index 5c86c47..935fa59 100644
--- a/include/net/route.h
+++ b/include/net/route.h
@@ -44,14 +44,9 @@ struct fib_info;
 struct rtable {
 	struct dst_entry	dst;
 
-	/* Lookup key. */
-	__be32			rt_key_dst;
-	__be32			rt_key_src;
-
 	int			rt_genid;
 	unsigned int		rt_flags;
 	__u16			rt_type;
-	__u8			rt_key_tos;
 
 	__be32			rt_dst;	/* Path destination	*/
 	__be32			rt_src;	/* Path source		*/
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 534bc4d..83e9663 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -1250,12 +1250,9 @@ static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
 #endif
 	rth->dst.output = ip_rt_bug;
 
-	rth->rt_key_dst	= daddr;
-	rth->rt_key_src	= saddr;
 	rth->rt_genid	= rt_genid(dev_net(dev));
 	rth->rt_flags	= RTCF_MULTICAST;
 	rth->rt_type	= RTN_MULTICAST;
-	rth->rt_key_tos	= tos;
 	rth->rt_dst	= daddr;
 	rth->rt_src	= saddr;
 	rth->rt_route_iif = dev->ifindex;
@@ -1374,12 +1371,9 @@ static int __mkroute_input(struct sk_buff *skb,
 		goto cleanup;
 	}
 
-	rth->rt_key_dst	= daddr;
-	rth->rt_key_src	= saddr;
 	rth->rt_genid = rt_genid(dev_net(rth->dst.dev));
 	rth->rt_flags = flags;
 	rth->rt_type = res->type;
-	rth->rt_key_tos	= tos;
 	rth->rt_dst	= daddr;
 	rth->rt_src	= saddr;
 	rth->rt_route_iif = in_dev->dev->ifindex;
@@ -1545,12 +1539,9 @@ local_input:
 	rth->dst.tclassid = itag;
 #endif
 
-	rth->rt_key_dst	= daddr;
-	rth->rt_key_src	= saddr;
 	rth->rt_genid = rt_genid(net);
 	rth->rt_flags 	= flags|RTCF_LOCAL;
 	rth->rt_type	= res.type;
-	rth->rt_key_tos	= tos;
 	rth->rt_dst	= daddr;
 	rth->rt_src	= saddr;
 	rth->rt_route_iif = dev->ifindex;
@@ -1650,9 +1641,7 @@ EXPORT_SYMBOL(ip_route_input);
 
 /* called with rcu_read_lock() */
 static struct rtable *__mkroute_output(const struct fib_result *res,
-				       const struct flowi4 *fl4,
-				       __be32 orig_daddr, __be32 orig_saddr,
-				       int orig_oif, __u8 orig_rtos,
+				       const struct flowi4 *fl4, int orig_oif,
 				       struct net_device *dev_out,
 				       unsigned int flags)
 {
@@ -1703,12 +1692,9 @@ static struct rtable *__mkroute_output(const struct fib_result *res,
 
 	rth->dst.output = ip_output;
 
-	rth->rt_key_dst	= orig_daddr;
-	rth->rt_key_src	= orig_saddr;
 	rth->rt_genid = rt_genid(dev_net(dev_out));
 	rth->rt_flags	= flags;
 	rth->rt_type	= type;
-	rth->rt_key_tos	= orig_rtos;
 	rth->rt_dst	= fl4->daddr;
 	rth->rt_src	= fl4->saddr;
 	rth->rt_route_iif = 0;
@@ -1759,16 +1745,12 @@ struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *fl4)
 	unsigned int flags = 0;
 	struct fib_result res;
 	struct rtable *rth;
-	__be32 orig_daddr;
-	__be32 orig_saddr;
 	int orig_oif;
 
 	res.tclassid	= 0;
 	res.fi		= NULL;
 	res.table	= NULL;
 
-	orig_daddr = fl4->daddr;
-	orig_saddr = fl4->saddr;
 	orig_oif = fl4->flowi4_oif;
 
 	fl4->flowi4_iif = net->loopback_dev->ifindex;
@@ -1930,8 +1912,7 @@ struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *fl4)
 
 
 make_route:
-	rth = __mkroute_output(&res, fl4, orig_daddr, orig_saddr, orig_oif,
-			       tos, dev_out, flags);
+	rth = __mkroute_output(&res, fl4, orig_oif, dev_out, flags);
 
 out:
 	rcu_read_unlock();
@@ -1996,9 +1977,6 @@ struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_or
 		if (new->dev)
 			dev_hold(new->dev);
 
-		rt->rt_key_dst = ort->rt_key_dst;
-		rt->rt_key_src = ort->rt_key_src;
-		rt->rt_key_tos = ort->rt_key_tos;
 		rt->rt_route_iif = ort->rt_route_iif;
 		rt->rt_iif = ort->rt_iif;
 		rt->rt_oif = ort->rt_oif;
@@ -2040,7 +2018,7 @@ struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4,
 }
 EXPORT_SYMBOL_GPL(ip_route_output_flow);
 
-static int rt_fill_info(struct net *net,
+static int rt_fill_info(struct net *net,  __be32 src, u8 tos,
 			struct sk_buff *skb, u32 pid, u32 seq, int event,
 			int nowait, unsigned int flags)
 {
@@ -2058,7 +2036,7 @@ static int rt_fill_info(struct net *net,
 	r->rtm_family	 = AF_INET;
 	r->rtm_dst_len	= 32;
 	r->rtm_src_len	= 0;
-	r->rtm_tos	= rt->rt_key_tos;
+	r->rtm_tos	= tos;
 	r->rtm_table	= RT_TABLE_MAIN;
 	if (nla_put_u32(skb, RTA_TABLE, RT_TABLE_MAIN))
 		goto nla_put_failure;
@@ -2071,9 +2049,9 @@ static int rt_fill_info(struct net *net,
 
 	if (nla_put_be32(skb, RTA_DST, rt->rt_dst))
 		goto nla_put_failure;
-	if (rt->rt_key_src) {
+	if (src) {
 		r->rtm_src_len = 32;
-		if (nla_put_be32(skb, RTA_SRC, rt->rt_key_src))
+		if (nla_put_be32(skb, RTA_SRC, src))
 			goto nla_put_failure;
 	}
 	if (rt->dst.dev &&
@@ -2085,7 +2063,7 @@ static int rt_fill_info(struct net *net,
 		goto nla_put_failure;
 #endif
 	if (!rt_is_input_route(rt) &&
-	    rt->rt_src != rt->rt_key_src) {
+	    rt->rt_src != src) {
 		if (nla_put_be32(skb, RTA_PREFSRC, rt->rt_src))
 			goto nla_put_failure;
 	}
@@ -2226,7 +2204,8 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, void
 	if (rtm->rtm_flags & RTM_F_NOTIFY)
 		rt->rt_flags |= RTCF_NOTIFY;
 
-	err = rt_fill_info(net, skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq,
+	err = rt_fill_info(net, src, rtm->rtm_tos, skb,
+			   NETLINK_CB(in_skb).pid, nlh->nlmsg_seq,
 			   RTM_NEWROUTE, 0, 0);
 	if (err <= 0)
 		goto errout_free;
diff --git a/net/ipv4/xfrm4_policy.c b/net/ipv4/xfrm4_policy.c
index fcf7678..2a8d5cf 100644
--- a/net/ipv4/xfrm4_policy.c
+++ b/net/ipv4/xfrm4_policy.c
@@ -79,9 +79,6 @@ static int xfrm4_fill_dst(struct xfrm_dst *xdst, struct net_device *dev,
 	struct rtable *rt = (struct rtable *)xdst->route;
 	const struct flowi4 *fl4 = &fl->u.ip4;
 
-	xdst->u.rt.rt_key_dst = fl4->daddr;
-	xdst->u.rt.rt_key_src = fl4->saddr;
-	xdst->u.rt.rt_key_tos = fl4->flowi4_tos;
 	xdst->u.rt.rt_route_iif = fl4->flowi4_iif;
 	xdst->u.rt.rt_iif = fl4->flowi4_iif;
 	xdst->u.rt.rt_oif = fl4->flowi4_oif;
-- 
1.7.10.4

^ permalink raw reply related

* [PATCH 02/15] ipv4: Kill ip_route_input_noref().
From: David Miller @ 2012-07-18 18:23 UTC (permalink / raw)
  To: netdev


The "noref" argument to ip_route_input_common() is now always ignored
because we do not cache routes, and in that case we must always grab
a reference to the resulting 'dst'.

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/route.h    |   16 ++--------------
 net/ipv4/arp.c         |    2 +-
 net/ipv4/ip_fragment.c |    4 ++--
 net/ipv4/ip_input.c    |    4 ++--
 net/ipv4/route.c       |    6 +++---
 net/ipv4/xfrm4_input.c |    4 ++--
 6 files changed, 12 insertions(+), 24 deletions(-)

diff --git a/include/net/route.h b/include/net/route.h
index 5dcfeb6..5c86c47 100644
--- a/include/net/route.h
+++ b/include/net/route.h
@@ -160,20 +160,8 @@ static inline struct rtable *ip_route_output_gre(struct net *net, struct flowi4
 	return ip_route_output_key(net, fl4);
 }
 
-extern int ip_route_input_common(struct sk_buff *skb, __be32 dst, __be32 src,
-				 u8 tos, struct net_device *devin, bool noref);
-
-static inline int ip_route_input(struct sk_buff *skb, __be32 dst, __be32 src,
-				 u8 tos, struct net_device *devin)
-{
-	return ip_route_input_common(skb, dst, src, tos, devin, false);
-}
-
-static inline int ip_route_input_noref(struct sk_buff *skb, __be32 dst, __be32 src,
-				       u8 tos, struct net_device *devin)
-{
-	return ip_route_input_common(skb, dst, src, tos, devin, true);
-}
+extern int ip_route_input(struct sk_buff *skb, __be32 dst, __be32 src,
+			  u8 tos, struct net_device *devin);
 
 extern void ipv4_update_pmtu(struct sk_buff *skb, struct net *net, u32 mtu,
 			     int oif, u32 mark, u8 protocol, int flow_flags);
diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c
index 2e560f0..c38293f 100644
--- a/net/ipv4/arp.c
+++ b/net/ipv4/arp.c
@@ -828,7 +828,7 @@ static int arp_process(struct sk_buff *skb)
 	}
 
 	if (arp->ar_op == htons(ARPOP_REQUEST) &&
-	    ip_route_input_noref(skb, tip, sip, 0, dev) == 0) {
+	    ip_route_input(skb, tip, sip, 0, dev) == 0) {
 
 		rt = skb_rtable(skb);
 		addr_type = rt->rt_type;
diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c
index 8d07c97..7ad88e5 100644
--- a/net/ipv4/ip_fragment.c
+++ b/net/ipv4/ip_fragment.c
@@ -258,8 +258,8 @@ static void ip_expire(unsigned long arg)
 		/* skb dst is stale, drop it, and perform route lookup again */
 		skb_dst_drop(head);
 		iph = ip_hdr(head);
-		err = ip_route_input_noref(head, iph->daddr, iph->saddr,
-					   iph->tos, head->dev);
+		err = ip_route_input(head, iph->daddr, iph->saddr,
+				     iph->tos, head->dev);
 		if (err)
 			goto out_rcu_unlock;
 
diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c
index b27d444..4ebc6fe 100644
--- a/net/ipv4/ip_input.c
+++ b/net/ipv4/ip_input.c
@@ -336,8 +336,8 @@ static int ip_rcv_finish(struct sk_buff *skb)
 	 *	how the packet travels inside Linux networking.
 	 */
 	if (!skb_dst(skb)) {
-		int err = ip_route_input_noref(skb, iph->daddr, iph->saddr,
-					       iph->tos, skb->dev);
+		int err = ip_route_input(skb, iph->daddr, iph->saddr,
+					 iph->tos, skb->dev);
 		if (unlikely(err)) {
 			if (err == -EXDEV)
 				NET_INC_STATS_BH(dev_net(skb->dev),
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 60ce756..534bc4d 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -1602,8 +1602,8 @@ martian_source_keep_err:
 	goto out;
 }
 
-int ip_route_input_common(struct sk_buff *skb, __be32 daddr, __be32 saddr,
-			   u8 tos, struct net_device *dev, bool noref)
+int ip_route_input(struct sk_buff *skb, __be32 daddr, __be32 saddr,
+		   u8 tos, struct net_device *dev)
 {
 	int res;
 
@@ -1646,7 +1646,7 @@ int ip_route_input_common(struct sk_buff *skb, __be32 daddr, __be32 saddr,
 	rcu_read_unlock();
 	return res;
 }
-EXPORT_SYMBOL(ip_route_input_common);
+EXPORT_SYMBOL(ip_route_input);
 
 /* called with rcu_read_lock() */
 static struct rtable *__mkroute_output(const struct fib_result *res,
diff --git a/net/ipv4/xfrm4_input.c b/net/ipv4/xfrm4_input.c
index 06814b6..58d23a5 100644
--- a/net/ipv4/xfrm4_input.c
+++ b/net/ipv4/xfrm4_input.c
@@ -27,8 +27,8 @@ static inline int xfrm4_rcv_encap_finish(struct sk_buff *skb)
 	if (skb_dst(skb) == NULL) {
 		const struct iphdr *iph = ip_hdr(skb);
 
-		if (ip_route_input_noref(skb, iph->daddr, iph->saddr,
-					 iph->tos, skb->dev))
+		if (ip_route_input(skb, iph->daddr, iph->saddr,
+				   iph->tos, skb->dev))
 			goto drop;
 	}
 	return dst_input(skb);
-- 
1.7.10.4

^ permalink raw reply related

* [PATCH 01/15] ipv4: Delete routing cache.
From: David Miller @ 2012-07-18 18:22 UTC (permalink / raw)
  To: netdev


Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/route.h     |    1 -
 net/ipv4/fib_frontend.c |    5 -
 net/ipv4/route.c        |  940 +----------------------------------------------
 3 files changed, 13 insertions(+), 933 deletions(-)

diff --git a/include/net/route.h b/include/net/route.h
index ace3cb4..5dcfeb6 100644
--- a/include/net/route.h
+++ b/include/net/route.h
@@ -109,7 +109,6 @@ extern struct ip_rt_acct __percpu *ip_rt_acct;
 struct in_device;
 extern int		ip_rt_init(void);
 extern void		rt_cache_flush(struct net *net, int how);
-extern void		rt_cache_flush_batch(struct net *net);
 extern struct rtable *__ip_route_output_key(struct net *, struct flowi4 *flp);
 extern struct rtable *ip_route_output_flow(struct net *, struct flowi4 *flp,
 					   struct sock *sk);
diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c
index 7a31194..c1fde53 100644
--- a/net/ipv4/fib_frontend.c
+++ b/net/ipv4/fib_frontend.c
@@ -1071,11 +1071,6 @@ static int fib_netdev_event(struct notifier_block *this, unsigned long event, vo
 		rt_cache_flush(dev_net(dev), 0);
 		break;
 	case NETDEV_UNREGISTER_BATCH:
-		/* The batch unregister is only called on the first
-		 * device in the list of devices being unregistered.
-		 * Therefore we should not pass dev_net(dev) in here.
-		 */
-		rt_cache_flush_batch(NULL);
 		break;
 	}
 	return NOTIFY_DONE;
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index f67e702..60ce756 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -133,10 +133,6 @@ static int ip_rt_gc_elasticity __read_mostly	= 8;
 static int ip_rt_mtu_expires __read_mostly	= 10 * 60 * HZ;
 static int ip_rt_min_pmtu __read_mostly		= 512 + 20 + 20;
 static int ip_rt_min_advmss __read_mostly	= 256;
-static int rt_chain_length_max __read_mostly	= 20;
-
-static struct delayed_work expires_work;
-static unsigned long expires_ljiffies;
 
 /*
  *	Interface to generic destination cache.
@@ -152,7 +148,6 @@ static void		 ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
 					   struct sk_buff *skb, u32 mtu);
 static void		 ip_do_redirect(struct dst_entry *dst, struct sock *sk,
 					struct sk_buff *skb);
-static int rt_garbage_collect(struct dst_ops *ops);
 
 static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
 			    int how)
@@ -172,7 +167,6 @@ static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
 static struct dst_ops ipv4_dst_ops = {
 	.family =		AF_INET,
 	.protocol =		cpu_to_be16(ETH_P_IP),
-	.gc =			rt_garbage_collect,
 	.check =		ipv4_dst_check,
 	.default_advmss =	ipv4_default_advmss,
 	.mtu =			ipv4_mtu,
@@ -209,184 +203,30 @@ const __u8 ip_tos2prio[16] = {
 };
 EXPORT_SYMBOL(ip_tos2prio);
 
-/*
- * Route cache.
- */
-
-/* The locking scheme is rather straight forward:
- *
- * 1) Read-Copy Update protects the buckets of the central route hash.
- * 2) Only writers remove entries, and they hold the lock
- *    as they look at rtable reference counts.
- * 3) Only readers acquire references to rtable entries,
- *    they do so with atomic increments and with the
- *    lock held.
- */
-
-struct rt_hash_bucket {
-	struct rtable __rcu	*chain;
-};
-
-#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) || \
-	defined(CONFIG_PROVE_LOCKING)
-/*
- * Instead of using one spinlock for each rt_hash_bucket, we use a table of spinlocks
- * The size of this table is a power of two and depends on the number of CPUS.
- * (on lockdep we have a quite big spinlock_t, so keep the size down there)
- */
-#ifdef CONFIG_LOCKDEP
-# define RT_HASH_LOCK_SZ	256
-#else
-# if NR_CPUS >= 32
-#  define RT_HASH_LOCK_SZ	4096
-# elif NR_CPUS >= 16
-#  define RT_HASH_LOCK_SZ	2048
-# elif NR_CPUS >= 8
-#  define RT_HASH_LOCK_SZ	1024
-# elif NR_CPUS >= 4
-#  define RT_HASH_LOCK_SZ	512
-# else
-#  define RT_HASH_LOCK_SZ	256
-# endif
-#endif
-
-static spinlock_t	*rt_hash_locks;
-# define rt_hash_lock_addr(slot) &rt_hash_locks[(slot) & (RT_HASH_LOCK_SZ - 1)]
-
-static __init void rt_hash_lock_init(void)
-{
-	int i;
-
-	rt_hash_locks = kmalloc(sizeof(spinlock_t) * RT_HASH_LOCK_SZ,
-			GFP_KERNEL);
-	if (!rt_hash_locks)
-		panic("IP: failed to allocate rt_hash_locks\n");
-
-	for (i = 0; i < RT_HASH_LOCK_SZ; i++)
-		spin_lock_init(&rt_hash_locks[i]);
-}
-#else
-# define rt_hash_lock_addr(slot) NULL
-
-static inline void rt_hash_lock_init(void)
-{
-}
-#endif
-
-static struct rt_hash_bucket 	*rt_hash_table __read_mostly;
-static unsigned int		rt_hash_mask __read_mostly;
-static unsigned int		rt_hash_log  __read_mostly;
-
 static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
 #define RT_CACHE_STAT_INC(field) __this_cpu_inc(rt_cache_stat.field)
 
-static inline unsigned int rt_hash(__be32 daddr, __be32 saddr, int idx,
-				   int genid)
-{
-	return jhash_3words((__force u32)daddr, (__force u32)saddr,
-			    idx, genid)
-		& rt_hash_mask;
-}
-
 static inline int rt_genid(struct net *net)
 {
 	return atomic_read(&net->ipv4.rt_genid);
 }
 
 #ifdef CONFIG_PROC_FS
-struct rt_cache_iter_state {
-	struct seq_net_private p;
-	int bucket;
-	int genid;
-};
-
-static struct rtable *rt_cache_get_first(struct seq_file *seq)
-{
-	struct rt_cache_iter_state *st = seq->private;
-	struct rtable *r = NULL;
-
-	for (st->bucket = rt_hash_mask; st->bucket >= 0; --st->bucket) {
-		if (!rcu_access_pointer(rt_hash_table[st->bucket].chain))
-			continue;
-		rcu_read_lock_bh();
-		r = rcu_dereference_bh(rt_hash_table[st->bucket].chain);
-		while (r) {
-			if (dev_net(r->dst.dev) == seq_file_net(seq) &&
-			    r->rt_genid == st->genid)
-				return r;
-			r = rcu_dereference_bh(r->dst.rt_next);
-		}
-		rcu_read_unlock_bh();
-	}
-	return r;
-}
-
-static struct rtable *__rt_cache_get_next(struct seq_file *seq,
-					  struct rtable *r)
-{
-	struct rt_cache_iter_state *st = seq->private;
-
-	r = rcu_dereference_bh(r->dst.rt_next);
-	while (!r) {
-		rcu_read_unlock_bh();
-		do {
-			if (--st->bucket < 0)
-				return NULL;
-		} while (!rcu_access_pointer(rt_hash_table[st->bucket].chain));
-		rcu_read_lock_bh();
-		r = rcu_dereference_bh(rt_hash_table[st->bucket].chain);
-	}
-	return r;
-}
-
-static struct rtable *rt_cache_get_next(struct seq_file *seq,
-					struct rtable *r)
-{
-	struct rt_cache_iter_state *st = seq->private;
-	while ((r = __rt_cache_get_next(seq, r)) != NULL) {
-		if (dev_net(r->dst.dev) != seq_file_net(seq))
-			continue;
-		if (r->rt_genid == st->genid)
-			break;
-	}
-	return r;
-}
-
-static struct rtable *rt_cache_get_idx(struct seq_file *seq, loff_t pos)
-{
-	struct rtable *r = rt_cache_get_first(seq);
-
-	if (r)
-		while (pos && (r = rt_cache_get_next(seq, r)))
-			--pos;
-	return pos ? NULL : r;
-}
-
 static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
 {
-	struct rt_cache_iter_state *st = seq->private;
 	if (*pos)
-		return rt_cache_get_idx(seq, *pos - 1);
-	st->genid = rt_genid(seq_file_net(seq));
+		return NULL;
 	return SEQ_START_TOKEN;
 }
 
 static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 {
-	struct rtable *r;
-
-	if (v == SEQ_START_TOKEN)
-		r = rt_cache_get_first(seq);
-	else
-		r = rt_cache_get_next(seq, v);
 	++*pos;
-	return r;
+	return NULL;
 }
 
 static void rt_cache_seq_stop(struct seq_file *seq, void *v)
 {
-	if (v && v != SEQ_START_TOKEN)
-		rcu_read_unlock_bh();
 }
 
 static int rt_cache_seq_show(struct seq_file *seq, void *v)
@@ -396,24 +236,6 @@ static int rt_cache_seq_show(struct seq_file *seq, void *v)
 			   "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
 			   "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
 			   "HHUptod\tSpecDst");
-	else {
-		struct rtable *r = v;
-		int len;
-
-		seq_printf(seq, "%s\t%08X\t%08X\t%8X\t%d\t%u\t%d\t"
-			   "%08X\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X%n",
-			   r->dst.dev ? r->dst.dev->name : "*",
-			   (__force u32)r->rt_dst,
-			   (__force u32)r->rt_gateway,
-			   r->rt_flags, atomic_read(&r->dst.__refcnt),
-			   r->dst.__use, 0, (__force u32)r->rt_src,
-			   dst_metric_advmss(&r->dst) + 40,
-			   dst_metric(&r->dst, RTAX_WINDOW), 0,
-			   r->rt_key_tos,
-			   -1, 0, 0, &len);
-
-		seq_printf(seq, "%*s\n", 127 - len, "");
-	}
 	return 0;
 }
 
@@ -426,8 +248,7 @@ static const struct seq_operations rt_cache_seq_ops = {
 
 static int rt_cache_seq_open(struct inode *inode, struct file *file)
 {
-	return seq_open_net(inode, file, &rt_cache_seq_ops,
-			sizeof(struct rt_cache_iter_state));
+	return seq_open(file, &rt_cache_seq_ops);
 }
 
 static const struct file_operations rt_cache_seq_fops = {
@@ -435,7 +256,7 @@ static const struct file_operations rt_cache_seq_fops = {
 	.open	 = rt_cache_seq_open,
 	.read	 = seq_read,
 	.llseek	 = seq_lseek,
-	.release = seq_release_net,
+	.release = seq_release,
 };
 
 
@@ -625,263 +446,12 @@ static inline int ip_rt_proc_init(void)
 }
 #endif /* CONFIG_PROC_FS */
 
-static inline void rt_free(struct rtable *rt)
-{
-	call_rcu_bh(&rt->dst.rcu_head, dst_rcu_free);
-}
-
-static inline void rt_drop(struct rtable *rt)
-{
-	ip_rt_put(rt);
-	call_rcu_bh(&rt->dst.rcu_head, dst_rcu_free);
-}
-
-static inline int rt_fast_clean(struct rtable *rth)
-{
-	/* Kill broadcast/multicast entries very aggresively, if they
-	   collide in hash table with more useful entries */
-	return (rth->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) &&
-		rt_is_input_route(rth) && rth->dst.rt_next;
-}
-
-static inline int rt_valuable(struct rtable *rth)
-{
-	return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) ||
-		rth->dst.expires;
-}
-
-static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2)
-{
-	unsigned long age;
-	int ret = 0;
-
-	if (atomic_read(&rth->dst.__refcnt))
-		goto out;
-
-	age = jiffies - rth->dst.lastuse;
-	if ((age <= tmo1 && !rt_fast_clean(rth)) ||
-	    (age <= tmo2 && rt_valuable(rth)))
-		goto out;
-	ret = 1;
-out:	return ret;
-}
-
-/* Bits of score are:
- * 31: very valuable
- * 30: not quite useless
- * 29..0: usage counter
- */
-static inline u32 rt_score(struct rtable *rt)
-{
-	u32 score = jiffies - rt->dst.lastuse;
-
-	score = ~score & ~(3<<30);
-
-	if (rt_valuable(rt))
-		score |= (1<<31);
-
-	if (rt_is_output_route(rt) ||
-	    !(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST|RTCF_LOCAL)))
-		score |= (1<<30);
-
-	return score;
-}
-
-static inline bool rt_caching(const struct net *net)
-{
-	return net->ipv4.current_rt_cache_rebuild_count <=
-		net->ipv4.sysctl_rt_cache_rebuild_count;
-}
-
-static inline bool compare_hash_inputs(const struct rtable *rt1,
-				       const struct rtable *rt2)
-{
-	return ((((__force u32)rt1->rt_key_dst ^ (__force u32)rt2->rt_key_dst) |
-		((__force u32)rt1->rt_key_src ^ (__force u32)rt2->rt_key_src) |
-		(rt1->rt_route_iif ^ rt2->rt_route_iif)) == 0);
-}
-
-static inline int compare_keys(struct rtable *rt1, struct rtable *rt2)
-{
-	return (((__force u32)rt1->rt_key_dst ^ (__force u32)rt2->rt_key_dst) |
-		((__force u32)rt1->rt_key_src ^ (__force u32)rt2->rt_key_src) |
-		(rt1->rt_mark ^ rt2->rt_mark) |
-		(rt1->rt_key_tos ^ rt2->rt_key_tos) |
-		(rt1->rt_route_iif ^ rt2->rt_route_iif) |
-		(rt1->rt_oif ^ rt2->rt_oif)) == 0;
-}
-
-static inline int compare_netns(struct rtable *rt1, struct rtable *rt2)
-{
-	return net_eq(dev_net(rt1->dst.dev), dev_net(rt2->dst.dev));
-}
-
 static inline int rt_is_expired(struct rtable *rth)
 {
 	return rth->rt_genid != rt_genid(dev_net(rth->dst.dev));
 }
 
 /*
- * Perform a full scan of hash table and free all entries.
- * Can be called by a softirq or a process.
- * In the later case, we want to be reschedule if necessary
- */
-static void rt_do_flush(struct net *net, int process_context)
-{
-	unsigned int i;
-	struct rtable *rth, *next;
-
-	for (i = 0; i <= rt_hash_mask; i++) {
-		struct rtable __rcu **pprev;
-		struct rtable *list;
-
-		if (process_context && need_resched())
-			cond_resched();
-		rth = rcu_access_pointer(rt_hash_table[i].chain);
-		if (!rth)
-			continue;
-
-		spin_lock_bh(rt_hash_lock_addr(i));
-
-		list = NULL;
-		pprev = &rt_hash_table[i].chain;
-		rth = rcu_dereference_protected(*pprev,
-			lockdep_is_held(rt_hash_lock_addr(i)));
-
-		while (rth) {
-			next = rcu_dereference_protected(rth->dst.rt_next,
-				lockdep_is_held(rt_hash_lock_addr(i)));
-
-			if (!net ||
-			    net_eq(dev_net(rth->dst.dev), net)) {
-				rcu_assign_pointer(*pprev, next);
-				rcu_assign_pointer(rth->dst.rt_next, list);
-				list = rth;
-			} else {
-				pprev = &rth->dst.rt_next;
-			}
-			rth = next;
-		}
-
-		spin_unlock_bh(rt_hash_lock_addr(i));
-
-		for (; list; list = next) {
-			next = rcu_dereference_protected(list->dst.rt_next, 1);
-			rt_free(list);
-		}
-	}
-}
-
-/*
- * While freeing expired entries, we compute average chain length
- * and standard deviation, using fixed-point arithmetic.
- * This to have an estimation of rt_chain_length_max
- *  rt_chain_length_max = max(elasticity, AVG + 4*SD)
- * We use 3 bits for frational part, and 29 (or 61) for magnitude.
- */
-
-#define FRACT_BITS 3
-#define ONE (1UL << FRACT_BITS)
-
-/*
- * Given a hash chain and an item in this hash chain,
- * find if a previous entry has the same hash_inputs
- * (but differs on tos, mark or oif)
- * Returns 0 if an alias is found.
- * Returns ONE if rth has no alias before itself.
- */
-static int has_noalias(const struct rtable *head, const struct rtable *rth)
-{
-	const struct rtable *aux = head;
-
-	while (aux != rth) {
-		if (compare_hash_inputs(aux, rth))
-			return 0;
-		aux = rcu_dereference_protected(aux->dst.rt_next, 1);
-	}
-	return ONE;
-}
-
-static void rt_check_expire(void)
-{
-	static unsigned int rover;
-	unsigned int i = rover, goal;
-	struct rtable *rth;
-	struct rtable __rcu **rthp;
-	unsigned long samples = 0;
-	unsigned long sum = 0, sum2 = 0;
-	unsigned long delta;
-	u64 mult;
-
-	delta = jiffies - expires_ljiffies;
-	expires_ljiffies = jiffies;
-	mult = ((u64)delta) << rt_hash_log;
-	if (ip_rt_gc_timeout > 1)
-		do_div(mult, ip_rt_gc_timeout);
-	goal = (unsigned int)mult;
-	if (goal > rt_hash_mask)
-		goal = rt_hash_mask + 1;
-	for (; goal > 0; goal--) {
-		unsigned long tmo = ip_rt_gc_timeout;
-		unsigned long length;
-
-		i = (i + 1) & rt_hash_mask;
-		rthp = &rt_hash_table[i].chain;
-
-		if (need_resched())
-			cond_resched();
-
-		samples++;
-
-		if (rcu_dereference_raw(*rthp) == NULL)
-			continue;
-		length = 0;
-		spin_lock_bh(rt_hash_lock_addr(i));
-		while ((rth = rcu_dereference_protected(*rthp,
-					lockdep_is_held(rt_hash_lock_addr(i)))) != NULL) {
-			prefetch(rth->dst.rt_next);
-			if (rt_is_expired(rth) ||
-			    rt_may_expire(rth, tmo, ip_rt_gc_timeout)) {
-				*rthp = rth->dst.rt_next;
-				rt_free(rth);
-				continue;
-			}
-
-			/* We only count entries on a chain with equal
-			 * hash inputs once so that entries for
-			 * different QOS levels, and other non-hash
-			 * input attributes don't unfairly skew the
-			 * length computation
-			 */
-			tmo >>= 1;
-			rthp = &rth->dst.rt_next;
-			length += has_noalias(rt_hash_table[i].chain, rth);
-		}
-		spin_unlock_bh(rt_hash_lock_addr(i));
-		sum += length;
-		sum2 += length*length;
-	}
-	if (samples) {
-		unsigned long avg = sum / samples;
-		unsigned long sd = int_sqrt(sum2 / samples - avg*avg);
-		rt_chain_length_max = max_t(unsigned long,
-					ip_rt_gc_elasticity,
-					(avg + 4*sd) >> FRACT_BITS);
-	}
-	rover = i;
-}
-
-/*
- * rt_worker_func() is run in process context.
- * we call rt_check_expire() to scan part of the hash table
- */
-static void rt_worker_func(struct work_struct *work)
-{
-	rt_check_expire();
-	schedule_delayed_work(&expires_work, ip_rt_gc_interval);
-}
-
-/*
  * Perturbation of rt_genid by a small quantity [1..256]
  * Using 8 bits of shuffling ensure we can call rt_cache_invalidate()
  * many times (2^24) without giving recent rt_genid.
@@ -902,167 +472,6 @@ static void rt_cache_invalidate(struct net *net)
 void rt_cache_flush(struct net *net, int delay)
 {
 	rt_cache_invalidate(net);
-	if (delay >= 0)
-		rt_do_flush(net, !in_softirq());
-}
-
-/* Flush previous cache invalidated entries from the cache */
-void rt_cache_flush_batch(struct net *net)
-{
-	rt_do_flush(net, !in_softirq());
-}
-
-static void rt_emergency_hash_rebuild(struct net *net)
-{
-	net_warn_ratelimited("Route hash chain too long!\n");
-	rt_cache_invalidate(net);
-}
-
-/*
-   Short description of GC goals.
-
-   We want to build algorithm, which will keep routing cache
-   at some equilibrium point, when number of aged off entries
-   is kept approximately equal to newly generated ones.
-
-   Current expiration strength is variable "expire".
-   We try to adjust it dynamically, so that if networking
-   is idle expires is large enough to keep enough of warm entries,
-   and when load increases it reduces to limit cache size.
- */
-
-static int rt_garbage_collect(struct dst_ops *ops)
-{
-	static unsigned long expire = RT_GC_TIMEOUT;
-	static unsigned long last_gc;
-	static int rover;
-	static int equilibrium;
-	struct rtable *rth;
-	struct rtable __rcu **rthp;
-	unsigned long now = jiffies;
-	int goal;
-	int entries = dst_entries_get_fast(&ipv4_dst_ops);
-
-	/*
-	 * Garbage collection is pretty expensive,
-	 * do not make it too frequently.
-	 */
-
-	RT_CACHE_STAT_INC(gc_total);
-
-	if (now - last_gc < ip_rt_gc_min_interval &&
-	    entries < ip_rt_max_size) {
-		RT_CACHE_STAT_INC(gc_ignored);
-		goto out;
-	}
-
-	entries = dst_entries_get_slow(&ipv4_dst_ops);
-	/* Calculate number of entries, which we want to expire now. */
-	goal = entries - (ip_rt_gc_elasticity << rt_hash_log);
-	if (goal <= 0) {
-		if (equilibrium < ipv4_dst_ops.gc_thresh)
-			equilibrium = ipv4_dst_ops.gc_thresh;
-		goal = entries - equilibrium;
-		if (goal > 0) {
-			equilibrium += min_t(unsigned int, goal >> 1, rt_hash_mask + 1);
-			goal = entries - equilibrium;
-		}
-	} else {
-		/* We are in dangerous area. Try to reduce cache really
-		 * aggressively.
-		 */
-		goal = max_t(unsigned int, goal >> 1, rt_hash_mask + 1);
-		equilibrium = entries - goal;
-	}
-
-	if (now - last_gc >= ip_rt_gc_min_interval)
-		last_gc = now;
-
-	if (goal <= 0) {
-		equilibrium += goal;
-		goto work_done;
-	}
-
-	do {
-		int i, k;
-
-		for (i = rt_hash_mask, k = rover; i >= 0; i--) {
-			unsigned long tmo = expire;
-
-			k = (k + 1) & rt_hash_mask;
-			rthp = &rt_hash_table[k].chain;
-			spin_lock_bh(rt_hash_lock_addr(k));
-			while ((rth = rcu_dereference_protected(*rthp,
-					lockdep_is_held(rt_hash_lock_addr(k)))) != NULL) {
-				if (!rt_is_expired(rth) &&
-					!rt_may_expire(rth, tmo, expire)) {
-					tmo >>= 1;
-					rthp = &rth->dst.rt_next;
-					continue;
-				}
-				*rthp = rth->dst.rt_next;
-				rt_free(rth);
-				goal--;
-			}
-			spin_unlock_bh(rt_hash_lock_addr(k));
-			if (goal <= 0)
-				break;
-		}
-		rover = k;
-
-		if (goal <= 0)
-			goto work_done;
-
-		/* Goal is not achieved. We stop process if:
-
-		   - if expire reduced to zero. Otherwise, expire is halfed.
-		   - if table is not full.
-		   - if we are called from interrupt.
-		   - jiffies check is just fallback/debug loop breaker.
-		     We will not spin here for long time in any case.
-		 */
-
-		RT_CACHE_STAT_INC(gc_goal_miss);
-
-		if (expire == 0)
-			break;
-
-		expire >>= 1;
-
-		if (dst_entries_get_fast(&ipv4_dst_ops) < ip_rt_max_size)
-			goto out;
-	} while (!in_softirq() && time_before_eq(jiffies, now));
-
-	if (dst_entries_get_fast(&ipv4_dst_ops) < ip_rt_max_size)
-		goto out;
-	if (dst_entries_get_slow(&ipv4_dst_ops) < ip_rt_max_size)
-		goto out;
-	net_warn_ratelimited("dst cache overflow\n");
-	RT_CACHE_STAT_INC(gc_dst_overflow);
-	return 1;
-
-work_done:
-	expire += ip_rt_gc_min_interval;
-	if (expire > ip_rt_gc_timeout ||
-	    dst_entries_get_fast(&ipv4_dst_ops) < ipv4_dst_ops.gc_thresh ||
-	    dst_entries_get_slow(&ipv4_dst_ops) < ipv4_dst_ops.gc_thresh)
-		expire = ip_rt_gc_timeout;
-out:	return 0;
-}
-
-/*
- * Returns number of entries in a hash chain that have different hash_inputs
- */
-static int slow_chain_length(const struct rtable *head)
-{
-	int length = 0;
-	const struct rtable *rth = head;
-
-	while (rth) {
-		length += has_noalias(head, rth);
-		rth = rcu_dereference_protected(rth->dst.rt_next, 1);
-	}
-	return length >> FRACT_BITS;
 }
 
 static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
@@ -1086,139 +495,6 @@ static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
 	return neigh_create(&arp_tbl, pkey, dev);
 }
 
-static struct rtable *rt_intern_hash(unsigned int hash, struct rtable *rt,
-				     struct sk_buff *skb, int ifindex)
-{
-	struct rtable	*rth, *cand;
-	struct rtable __rcu **rthp, **candp;
-	unsigned long	now;
-	u32 		min_score;
-	int		chain_length;
-
-restart:
-	chain_length = 0;
-	min_score = ~(u32)0;
-	cand = NULL;
-	candp = NULL;
-	now = jiffies;
-
-	if (!rt_caching(dev_net(rt->dst.dev)) || (rt->dst.flags & DST_NOCACHE)) {
-		/*
-		 * If we're not caching, just tell the caller we
-		 * were successful and don't touch the route.  The
-		 * caller hold the sole reference to the cache entry, and
-		 * it will be released when the caller is done with it.
-		 * If we drop it here, the callers have no way to resolve routes
-		 * when we're not caching.  Instead, just point *rp at rt, so
-		 * the caller gets a single use out of the route
-		 * Note that we do rt_free on this new route entry, so that
-		 * once its refcount hits zero, we are still able to reap it
-		 * (Thanks Alexey)
-		 * Note: To avoid expensive rcu stuff for this uncached dst,
-		 * we set DST_NOCACHE so that dst_release() can free dst without
-		 * waiting a grace period.
-		 */
-
-		rt->dst.flags |= DST_NOCACHE;
-		goto skip_hashing;
-	}
-
-	rthp = &rt_hash_table[hash].chain;
-
-	spin_lock_bh(rt_hash_lock_addr(hash));
-	while ((rth = rcu_dereference_protected(*rthp,
-			lockdep_is_held(rt_hash_lock_addr(hash)))) != NULL) {
-		if (rt_is_expired(rth)) {
-			*rthp = rth->dst.rt_next;
-			rt_free(rth);
-			continue;
-		}
-		if (compare_keys(rth, rt) && compare_netns(rth, rt)) {
-			/* Put it first */
-			*rthp = rth->dst.rt_next;
-			/*
-			 * Since lookup is lockfree, the deletion
-			 * must be visible to another weakly ordered CPU before
-			 * the insertion at the start of the hash chain.
-			 */
-			rcu_assign_pointer(rth->dst.rt_next,
-					   rt_hash_table[hash].chain);
-			/*
-			 * Since lookup is lockfree, the update writes
-			 * must be ordered for consistency on SMP.
-			 */
-			rcu_assign_pointer(rt_hash_table[hash].chain, rth);
-
-			dst_use(&rth->dst, now);
-			spin_unlock_bh(rt_hash_lock_addr(hash));
-
-			rt_drop(rt);
-			if (skb)
-				skb_dst_set(skb, &rth->dst);
-			return rth;
-		}
-
-		if (!atomic_read(&rth->dst.__refcnt)) {
-			u32 score = rt_score(rth);
-
-			if (score <= min_score) {
-				cand = rth;
-				candp = rthp;
-				min_score = score;
-			}
-		}
-
-		chain_length++;
-
-		rthp = &rth->dst.rt_next;
-	}
-
-	if (cand) {
-		/* ip_rt_gc_elasticity used to be average length of chain
-		 * length, when exceeded gc becomes really aggressive.
-		 *
-		 * The second limit is less certain. At the moment it allows
-		 * only 2 entries per bucket. We will see.
-		 */
-		if (chain_length > ip_rt_gc_elasticity) {
-			*candp = cand->dst.rt_next;
-			rt_free(cand);
-		}
-	} else {
-		if (chain_length > rt_chain_length_max &&
-		    slow_chain_length(rt_hash_table[hash].chain) > rt_chain_length_max) {
-			struct net *net = dev_net(rt->dst.dev);
-			int num = ++net->ipv4.current_rt_cache_rebuild_count;
-			if (!rt_caching(net)) {
-				pr_warn("%s: %d rebuilds is over limit, route caching disabled\n",
-					rt->dst.dev->name, num);
-			}
-			rt_emergency_hash_rebuild(net);
-			spin_unlock_bh(rt_hash_lock_addr(hash));
-
-			hash = rt_hash(rt->rt_key_dst, rt->rt_key_src,
-					ifindex, rt_genid(net));
-			goto restart;
-		}
-	}
-
-	rt->dst.rt_next = rt_hash_table[hash].chain;
-
-	/*
-	 * Since lookup is lockfree, we must make sure
-	 * previous writes to rt are committed to memory
-	 * before making rt visible to other CPUS.
-	 */
-	rcu_assign_pointer(rt_hash_table[hash].chain, rt);
-
-	spin_unlock_bh(rt_hash_lock_addr(hash));
-
-skip_hashing:
-	if (skb)
-		skb_dst_set(skb, &rt->dst);
-	return rt;
-}
-
 /*
  * Peer allocation may fail only in serious out-of-memory conditions.  However
  * we still can generate some output.
@@ -1255,26 +531,6 @@ void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
 }
 EXPORT_SYMBOL(__ip_select_ident);
 
-static void rt_del(unsigned int hash, struct rtable *rt)
-{
-	struct rtable __rcu **rthp;
-	struct rtable *aux;
-
-	rthp = &rt_hash_table[hash].chain;
-	spin_lock_bh(rt_hash_lock_addr(hash));
-	ip_rt_put(rt);
-	while ((aux = rcu_dereference_protected(*rthp,
-			lockdep_is_held(rt_hash_lock_addr(hash)))) != NULL) {
-		if (aux == rt || rt_is_expired(aux)) {
-			*rthp = aux->dst.rt_next;
-			rt_free(aux);
-			continue;
-		}
-		rthp = &aux->dst.rt_next;
-	}
-	spin_unlock_bh(rt_hash_lock_addr(hash));
-}
-
 static void __build_flow_key(struct flowi4 *fl4, const struct sock *sk,
 			     const struct iphdr *iph,
 			     int oif, u8 tos,
@@ -1506,10 +762,7 @@ static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
 			ret = NULL;
 		} else if ((rt->rt_flags & RTCF_REDIRECTED) ||
 			   rt->dst.expires) {
-			unsigned int hash = rt_hash(rt->rt_key_dst, rt->rt_key_src,
-						rt->rt_oif,
-						rt_genid(dev_net(dst->dev)));
-			rt_del(hash, rt);
+			ip_rt_put(rt);
 			ret = NULL;
 		}
 	}
@@ -1951,7 +1204,7 @@ static struct rtable *rt_dst_alloc(struct net_device *dev,
 				   bool nopolicy, bool noxfrm)
 {
 	return dst_alloc(&ipv4_dst_ops, dev, 1, -1,
-			 DST_HOST |
+			 DST_HOST | DST_NOCACHE |
 			 (nopolicy ? DST_NOPOLICY : 0) |
 			 (noxfrm ? DST_NOXFRM : 0));
 }
@@ -1960,7 +1213,6 @@ static struct rtable *rt_dst_alloc(struct net_device *dev,
 static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
 				u8 tos, struct net_device *dev, int our)
 {
-	unsigned int hash;
 	struct rtable *rth;
 	struct in_device *in_dev = __in_dev_get_rcu(dev);
 	u32 itag = 0;
@@ -2024,9 +1276,8 @@ static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
 #endif
 	RT_CACHE_STAT_INC(in_slow_mc);
 
-	hash = rt_hash(daddr, saddr, dev->ifindex, rt_genid(dev_net(dev)));
-	rth = rt_intern_hash(hash, rth, skb, dev->ifindex);
-	return IS_ERR(rth) ? PTR_ERR(rth) : 0;
+	skb_dst_set(skb, &rth->dst);
+	return 0;
 
 e_nobufs:
 	return -ENOBUFS;
@@ -2158,7 +1409,6 @@ static int ip_mkroute_input(struct sk_buff *skb,
 {
 	struct rtable *rth = NULL;
 	int err;
-	unsigned int hash;
 
 #ifdef CONFIG_IP_ROUTE_MULTIPATH
 	if (res->fi && res->fi->fib_nhs > 1)
@@ -2170,12 +1420,7 @@ static int ip_mkroute_input(struct sk_buff *skb,
 	if (err)
 		return err;
 
-	/* put it into the cache */
-	hash = rt_hash(daddr, saddr, fl4->flowi4_iif,
-		       rt_genid(dev_net(rth->dst.dev)));
-	rth = rt_intern_hash(hash, rth, skb, fl4->flowi4_iif);
-	if (IS_ERR(rth))
-		return PTR_ERR(rth);
+	skb_dst_set(skb, &rth->dst);
 	return 0;
 }
 
@@ -2199,7 +1444,6 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
 	unsigned int	flags = 0;
 	u32		itag = 0;
 	struct rtable	*rth;
-	unsigned int	hash;
 	int		err = -EINVAL;
 	struct net    *net = dev_net(dev);
 
@@ -2321,11 +1565,8 @@ local_input:
 		rth->dst.error= -err;
 		rth->rt_flags 	&= ~RTCF_LOCAL;
 	}
-	hash = rt_hash(daddr, saddr, fl4.flowi4_iif, rt_genid(net));
-	rth = rt_intern_hash(hash, rth, skb, fl4.flowi4_iif);
+	skb_dst_set(skb, &rth->dst);
 	err = 0;
-	if (IS_ERR(rth))
-		err = PTR_ERR(rth);
 	goto out;
 
 no_route:
@@ -2364,46 +1605,10 @@ martian_source_keep_err:
 int ip_route_input_common(struct sk_buff *skb, __be32 daddr, __be32 saddr,
 			   u8 tos, struct net_device *dev, bool noref)
 {
-	struct rtable	*rth;
-	unsigned int	hash;
-	int iif = dev->ifindex;
-	struct net *net;
 	int res;
 
-	net = dev_net(dev);
-
 	rcu_read_lock();
 
-	if (!rt_caching(net))
-		goto skip_cache;
-
-	tos &= IPTOS_RT_MASK;
-	hash = rt_hash(daddr, saddr, iif, rt_genid(net));
-
-	for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
-	     rth = rcu_dereference(rth->dst.rt_next)) {
-		if ((((__force u32)rth->rt_key_dst ^ (__force u32)daddr) |
-		     ((__force u32)rth->rt_key_src ^ (__force u32)saddr) |
-		     (rth->rt_route_iif ^ iif) |
-		     (rth->rt_key_tos ^ tos)) == 0 &&
-		    rth->rt_mark == skb->mark &&
-		    net_eq(dev_net(rth->dst.dev), net) &&
-		    !rt_is_expired(rth)) {
-			if (noref) {
-				dst_use_noref(&rth->dst, jiffies);
-				skb_dst_set_noref(skb, &rth->dst);
-			} else {
-				dst_use(&rth->dst, jiffies);
-				skb_dst_set(skb, &rth->dst);
-			}
-			RT_CACHE_STAT_INC(in_hit);
-			rcu_read_unlock();
-			return 0;
-		}
-		RT_CACHE_STAT_INC(in_hlist_search);
-	}
-
-skip_cache:
 	/* Multicast recognition logic is moved from route cache to here.
 	   The problem was that too many Ethernet cards have broken/missing
 	   hardware multicast filters :-( As result the host on multicasting
@@ -2545,10 +1750,9 @@ static struct rtable *__mkroute_output(const struct fib_result *res,
 
 /*
  * Major route resolver routine.
- * called with rcu_read_lock();
  */
 
-static struct rtable *ip_route_output_slow(struct net *net, struct flowi4 *fl4)
+struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *fl4)
 {
 	struct net_device *dev_out = NULL;
 	__u8 tos = RT_FL_TOS(fl4);
@@ -2728,57 +1932,11 @@ static struct rtable *ip_route_output_slow(struct net *net, struct flowi4 *fl4)
 make_route:
 	rth = __mkroute_output(&res, fl4, orig_daddr, orig_saddr, orig_oif,
 			       tos, dev_out, flags);
-	if (!IS_ERR(rth)) {
-		unsigned int hash;
-
-		hash = rt_hash(orig_daddr, orig_saddr, orig_oif,
-			       rt_genid(dev_net(dev_out)));
-		rth = rt_intern_hash(hash, rth, NULL, orig_oif);
-	}
 
 out:
 	rcu_read_unlock();
 	return rth;
 }
-
-struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *flp4)
-{
-	struct rtable *rth;
-	unsigned int hash;
-
-	if (!rt_caching(net))
-		goto slow_output;
-
-	hash = rt_hash(flp4->daddr, flp4->saddr, flp4->flowi4_oif, rt_genid(net));
-
-	rcu_read_lock_bh();
-	for (rth = rcu_dereference_bh(rt_hash_table[hash].chain); rth;
-		rth = rcu_dereference_bh(rth->dst.rt_next)) {
-		if (rth->rt_key_dst == flp4->daddr &&
-		    rth->rt_key_src == flp4->saddr &&
-		    rt_is_output_route(rth) &&
-		    rth->rt_oif == flp4->flowi4_oif &&
-		    rth->rt_mark == flp4->flowi4_mark &&
-		    !((rth->rt_key_tos ^ flp4->flowi4_tos) &
-			    (IPTOS_RT_MASK | RTO_ONLINK)) &&
-		    net_eq(dev_net(rth->dst.dev), net) &&
-		    !rt_is_expired(rth)) {
-			dst_use(&rth->dst, jiffies);
-			RT_CACHE_STAT_INC(out_hit);
-			rcu_read_unlock_bh();
-			if (!flp4->saddr)
-				flp4->saddr = rth->rt_src;
-			if (!flp4->daddr)
-				flp4->daddr = rth->rt_dst;
-			return rth;
-		}
-		RT_CACHE_STAT_INC(out_hlist_search);
-	}
-	rcu_read_unlock_bh();
-
-slow_output:
-	return ip_route_output_slow(net, flp4);
-}
 EXPORT_SYMBOL_GPL(__ip_route_output_key);
 
 static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie)
@@ -3084,43 +2242,6 @@ errout_free:
 
 int ip_rt_dump(struct sk_buff *skb,  struct netlink_callback *cb)
 {
-	struct rtable *rt;
-	int h, s_h;
-	int idx, s_idx;
-	struct net *net;
-
-	net = sock_net(skb->sk);
-
-	s_h = cb->args[0];
-	if (s_h < 0)
-		s_h = 0;
-	s_idx = idx = cb->args[1];
-	for (h = s_h; h <= rt_hash_mask; h++, s_idx = 0) {
-		if (!rt_hash_table[h].chain)
-			continue;
-		rcu_read_lock_bh();
-		for (rt = rcu_dereference_bh(rt_hash_table[h].chain), idx = 0; rt;
-		     rt = rcu_dereference_bh(rt->dst.rt_next), idx++) {
-			if (!net_eq(dev_net(rt->dst.dev), net) || idx < s_idx)
-				continue;
-			if (rt_is_expired(rt))
-				continue;
-			skb_dst_set_noref(skb, &rt->dst);
-			if (rt_fill_info(net, skb, NETLINK_CB(cb->skb).pid,
-					 cb->nlh->nlmsg_seq, RTM_NEWROUTE,
-					 1, NLM_F_MULTI) <= 0) {
-				skb_dst_drop(skb);
-				rcu_read_unlock_bh();
-				goto done;
-			}
-			skb_dst_drop(skb);
-		}
-		rcu_read_unlock_bh();
-	}
-
-done:
-	cb->args[0] = h;
-	cb->args[1] = idx;
 	return skb->len;
 }
 
@@ -3354,22 +2475,6 @@ static __net_initdata struct pernet_operations ipv4_inetpeer_ops = {
 struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
 #endif /* CONFIG_IP_ROUTE_CLASSID */
 
-static __initdata unsigned long rhash_entries;
-static int __init set_rhash_entries(char *str)
-{
-	ssize_t ret;
-
-	if (!str)
-		return 0;
-
-	ret = kstrtoul(str, 0, &rhash_entries);
-	if (ret)
-		return 0;
-
-	return 1;
-}
-__setup("rhash_entries=", set_rhash_entries);
-
 int __init ip_rt_init(void)
 {
 	int rc = 0;
@@ -3392,31 +2497,12 @@ int __init ip_rt_init(void)
 	if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0)
 		panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n");
 
-	rt_hash_table = (struct rt_hash_bucket *)
-		alloc_large_system_hash("IP route cache",
-					sizeof(struct rt_hash_bucket),
-					rhash_entries,
-					(totalram_pages >= 128 * 1024) ?
-					15 : 17,
-					0,
-					&rt_hash_log,
-					&rt_hash_mask,
-					0,
-					rhash_entries ? 0 : 512 * 1024);
-	memset(rt_hash_table, 0, (rt_hash_mask + 1) * sizeof(struct rt_hash_bucket));
-	rt_hash_lock_init();
-
-	ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1);
-	ip_rt_max_size = (rt_hash_mask + 1) * 16;
+	ipv4_dst_ops.gc_thresh = ~0;
+	ip_rt_max_size = INT_MAX;
 
 	devinet_init();
 	ip_fib_init();
 
-	INIT_DELAYED_WORK_DEFERRABLE(&expires_work, rt_worker_func);
-	expires_ljiffies = jiffies;
-	schedule_delayed_work(&expires_work,
-		net_random() % ip_rt_gc_interval + ip_rt_gc_interval);
-
 	if (ip_rt_proc_init())
 		pr_err("Unable to create route proc files\n");
 #ifdef CONFIG_XFRM
-- 
1.7.10.4

^ permalink raw reply related

* [PATCH 00/15] Kill the routing cache.
From: David Miller @ 2012-07-18 18:22 UTC (permalink / raw)
  To: netdev


And I'm really serious this time :-)

The general flow of this patch series is that first the routing
cache is removed.  We build a completely new rtable entry every
lookup request.

Next we make some simplifications due to the fact that removing the
routing cache causes several members of struct rtable to become
no longer necessary.

Then we need to make some amends such that we can legally cache
pre-constructed routes in the FIB nexthops.  Firstly, we need to
invalidate routes which are hit with nexthop exceptions.  Secondly we
have to change the semantics of rt->rt_gateway such that zero means
that the destination is on-link and non-zero otherwise.

Now that the preparations are ready, we start caching precomputed
routes in the FIB nexthops.  Output and input routes need different
kinds of care when determining if we can legally do such caching or
not.  The details are in the commit log messages for those changes.

The patch series then winds down with some more struct rtable
simplifications and other tidy ups that remove unnecessary overhead.

On a SPARC-T3 output route lookups are ~870 cycles.  Input route
lookups are ~970 cycles with rpfilter disabled, and about ~1400 with
rpfilter enabled.

These measurements were taken with the kbench_mod test module in the
net_test_tools GIT tree:

git://git.kernel.org/pub/scm/linux/kernel/git/davem/net_test_tools.git

Performance can easily be improved further.  For example
fib_table_lookup() performs a lot of excessive computations with all
the masking and shifting, some of it conditionalized to deal with edge
cases.

Also, Eric's no-ref optimization for input route lookups can be
re-instated for the FIB nexthop caching code path.  I would be really
pleased if someone would work on that.

In fact anyone suitable motivated can just fire up perf on the loading
of the test net_test_tools benchmark kernel module.  I spend much of
my time going:

bash# perf record insmod ./kbench_mod.ko dst=172.30.42.22 src=74.128.0.1 iif=2
bash# perf report

Signed-off-by: David S. Miller <davem@davemloft.net>

^ permalink raw reply

* [PATCH net-next 7/7] sfc: Bump version to 3.2
From: Ben Hutchings @ 2012-07-18 18:22 UTC (permalink / raw)
  To: David Miller; +Cc: netdev, linux-net-drivers, Andrew Jackson, Richard Cochran
In-Reply-To: <1342635392.2617.52.camel@bwh-desktop.uk.solarflarecom.com>

The key new feature for 3.2 is PTP support.

Signed-off-by: Ben Hutchings <bhutchings@solarflare.com>
---
 drivers/net/ethernet/sfc/net_driver.h |    2 +-
 1 files changed, 1 insertions(+), 1 deletions(-)

diff --git a/drivers/net/ethernet/sfc/net_driver.h b/drivers/net/ethernet/sfc/net_driver.h
index f84a5d5..27b44a3 100644
--- a/drivers/net/ethernet/sfc/net_driver.h
+++ b/drivers/net/ethernet/sfc/net_driver.h
@@ -37,7 +37,7 @@
  *
  **************************************************************************/
 
-#define EFX_DRIVER_VERSION	"3.1"
+#define EFX_DRIVER_VERSION	"3.2"
 
 #ifdef DEBUG
 #define EFX_BUG_ON_PARANOID(x) BUG_ON(x)
-- 
1.7.7.6


-- 
Ben Hutchings, Staff Engineer, Solarflare
Not speaking for my employer; that's the marketing department's job.
They asked us to note that Solarflare product names are trademarked.

^ permalink raw reply related

* [PATCH net-next 6/7] sfc: Expose FPGA bitfile partition through MTD
From: Ben Hutchings @ 2012-07-18 18:22 UTC (permalink / raw)
  To: David Miller; +Cc: netdev, linux-net-drivers, Andrew Jackson, Richard Cochran
In-Reply-To: <1342635392.2617.52.camel@bwh-desktop.uk.solarflarecom.com>

Signed-off-by: Ben Hutchings <bhutchings@solarflare.com>
---
 drivers/net/ethernet/sfc/mtd.c |    4 +++-
 1 files changed, 3 insertions(+), 1 deletions(-)

diff --git a/drivers/net/ethernet/sfc/mtd.c b/drivers/net/ethernet/sfc/mtd.c
index 8f4604d..08f825b 100644
--- a/drivers/net/ethernet/sfc/mtd.c
+++ b/drivers/net/ethernet/sfc/mtd.c
@@ -585,6 +585,7 @@ static const struct siena_nvram_type_info siena_nvram_types[] = {
 	[MC_CMD_NVRAM_TYPE_EXP_ROM_CFG_PORT1]	= { 1, "sfc_exp_rom_cfg" },
 	[MC_CMD_NVRAM_TYPE_PHY_PORT0]		= { 0, "sfc_phy_fw" },
 	[MC_CMD_NVRAM_TYPE_PHY_PORT1]		= { 1, "sfc_phy_fw" },
+	[MC_CMD_NVRAM_TYPE_FPGA]		= { 0, "sfc_fpga" },
 };
 
 static int siena_mtd_probe_partition(struct efx_nic *efx,
@@ -598,7 +599,8 @@ static int siena_mtd_probe_partition(struct efx_nic *efx,
 	bool protected;
 	int rc;
 
-	if (type >= ARRAY_SIZE(siena_nvram_types))
+	if (type >= ARRAY_SIZE(siena_nvram_types) ||
+	    siena_nvram_types[type].name == NULL)
 		return -ENODEV;
 
 	info = &siena_nvram_types[type];
-- 
1.7.7.6



-- 
Ben Hutchings, Staff Engineer, Solarflare
Not speaking for my employer; that's the marketing department's job.
They asked us to note that Solarflare product names are trademarked.

^ permalink raw reply related

* [PATCH net-next 5/7] sfc: Support variable-length response to MCDI GET_BOARD_CFG
From: Ben Hutchings @ 2012-07-18 18:21 UTC (permalink / raw)
  To: David Miller; +Cc: netdev, linux-net-drivers, Andrew Jackson, Richard Cochran
In-Reply-To: <1342635392.2617.52.camel@bwh-desktop.uk.solarflarecom.com>

Signed-off-by: Ben Hutchings <bhutchings@solarflare.com>
---
 drivers/net/ethernet/sfc/mcdi.c |   18 +++++++++++++-----
 drivers/net/ethernet/sfc/mtd.c  |    3 ++-
 2 files changed, 15 insertions(+), 6 deletions(-)

diff --git a/drivers/net/ethernet/sfc/mcdi.c b/drivers/net/ethernet/sfc/mcdi.c
index 294df4b..9f74ab6 100644
--- a/drivers/net/ethernet/sfc/mcdi.c
+++ b/drivers/net/ethernet/sfc/mcdi.c
@@ -660,8 +660,8 @@ fail:
 int efx_mcdi_get_board_cfg(struct efx_nic *efx, u8 *mac_address,
 			   u16 *fw_subtype_list, u32 *capabilities)
 {
-	uint8_t outbuf[MC_CMD_GET_BOARD_CFG_OUT_LENMIN];
-	size_t outlen;
+	uint8_t outbuf[MC_CMD_GET_BOARD_CFG_OUT_LENMAX];
+	size_t outlen, src_size, dst_size;
 	int port_num = efx_port_num(efx);
 	int offset;
 	int rc;
@@ -683,11 +683,19 @@ int efx_mcdi_get_board_cfg(struct efx_nic *efx, u8 *mac_address,
 		: MC_CMD_GET_BOARD_CFG_OUT_MAC_ADDR_BASE_PORT0_OFST;
 	if (mac_address)
 		memcpy(mac_address, outbuf + offset, ETH_ALEN);
-	if (fw_subtype_list)
+	if (fw_subtype_list) {
+		/* Truncate or zero-pad as necessary */
+		src_size = (outlen -
+			    MC_CMD_GET_BOARD_CFG_OUT_FW_SUBTYPE_LIST_OFST);
+		dst_size = (MC_CMD_GET_BOARD_CFG_OUT_FW_SUBTYPE_LIST_MAXNUM *
+			    sizeof(*fw_subtype_list));
 		memcpy(fw_subtype_list,
 		       outbuf + MC_CMD_GET_BOARD_CFG_OUT_FW_SUBTYPE_LIST_OFST,
-		       MC_CMD_GET_BOARD_CFG_OUT_FW_SUBTYPE_LIST_MINNUM *
-		       sizeof(fw_subtype_list[0]));
+		       min(src_size, dst_size));
+		if (dst_size < src_size)
+			memset(fw_subtype_list + src_size, 0,
+			       dst_size - src_size);
+	}
 	if (capabilities) {
 		if (port_num)
 			*capabilities = MCDI_DWORD(outbuf,
diff --git a/drivers/net/ethernet/sfc/mtd.c b/drivers/net/ethernet/sfc/mtd.c
index 7581483..8f4604d 100644
--- a/drivers/net/ethernet/sfc/mtd.c
+++ b/drivers/net/ethernet/sfc/mtd.c
@@ -627,7 +627,8 @@ static int siena_mtd_get_fw_subtypes(struct efx_nic *efx,
 				     struct efx_mtd *efx_mtd)
 {
 	struct efx_mtd_partition *part;
-	uint16_t fw_subtype_list[MC_CMD_GET_BOARD_CFG_OUT_FW_SUBTYPE_LIST_MINNUM];
+	uint16_t fw_subtype_list[
+		MC_CMD_GET_BOARD_CFG_OUT_FW_SUBTYPE_LIST_MAXNUM];
 	int rc;
 
 	rc = efx_mcdi_get_board_cfg(efx, NULL, fw_subtype_list, NULL);
-- 
1.7.7.6



-- 
Ben Hutchings, Staff Engineer, Solarflare
Not speaking for my employer; that's the marketing department's job.
They asked us to note that Solarflare product names are trademarked.

^ permalink raw reply related

* [PATCH net-next 4/7] sfc: Add support for IEEE-1588 PTP
From: Ben Hutchings @ 2012-07-18 18:21 UTC (permalink / raw)
  To: David Miller; +Cc: netdev, linux-net-drivers, Andrew Jackson, Richard Cochran
In-Reply-To: <1342635392.2617.52.camel@bwh-desktop.uk.solarflarecom.com>

Add PTP IEEE-1588 support and make accesible via the PHC subsystem.

This work is based on prior code by Andrew Jackson

Signed-off-by: Stuart Hodgson <smhodgson@solarflare.com>
Signed-off-by: Ben Hutchings <bhutchings@solarflare.com>
---
 drivers/net/ethernet/sfc/Kconfig      |    7 +
 drivers/net/ethernet/sfc/Makefile     |    1 +
 drivers/net/ethernet/sfc/efx.c        |    3 +
 drivers/net/ethernet/sfc/ethtool.c    |    1 +
 drivers/net/ethernet/sfc/mcdi_pcol.h  |    1 +
 drivers/net/ethernet/sfc/net_driver.h |   19 +-
 drivers/net/ethernet/sfc/nic.h        |   31 +
 drivers/net/ethernet/sfc/ptp.c        | 1519 +++++++++++++++++++++++++++++++++
 drivers/net/ethernet/sfc/rx.c         |    2 +-
 drivers/net/ethernet/sfc/siena.c      |    1 +
 drivers/net/ethernet/sfc/tx.c         |    6 +
 11 files changed, 1589 insertions(+), 2 deletions(-)
 create mode 100644 drivers/net/ethernet/sfc/ptp.c

diff --git a/drivers/net/ethernet/sfc/Kconfig b/drivers/net/ethernet/sfc/Kconfig
index fb3cbc2..78c5d435 100644
--- a/drivers/net/ethernet/sfc/Kconfig
+++ b/drivers/net/ethernet/sfc/Kconfig
@@ -34,3 +34,10 @@ config SFC_SRIOV
 	  This enables support for the SFC9000 I/O Virtualization
 	  features, allowing accelerated network performance in
 	  virtualized environments.
+config SFC_PTP
+	bool "Solarflare SFC9000-family PTP support"
+	depends on SFC && PTP_1588_CLOCK
+	default y
+	---help---
+	  This enables support for the Precision Time Protocol (PTP)
+	  on SFC9000-family NICs
diff --git a/drivers/net/ethernet/sfc/Makefile b/drivers/net/ethernet/sfc/Makefile
index ea1f8db..e11f2ec 100644
--- a/drivers/net/ethernet/sfc/Makefile
+++ b/drivers/net/ethernet/sfc/Makefile
@@ -5,5 +5,6 @@ sfc-y			+= efx.o nic.o falcon.o siena.o tx.o rx.o filter.o \
 			   mcdi.o mcdi_phy.o mcdi_mon.o
 sfc-$(CONFIG_SFC_MTD)	+= mtd.o
 sfc-$(CONFIG_SFC_SRIOV)	+= siena_sriov.o
+sfc-$(CONFIG_SFC_PTP)	+= ptp.o
 
 obj-$(CONFIG_SFC)	+= sfc.o
diff --git a/drivers/net/ethernet/sfc/efx.c b/drivers/net/ethernet/sfc/efx.c
index 1c53d4b..e6631f0e 100644
--- a/drivers/net/ethernet/sfc/efx.c
+++ b/drivers/net/ethernet/sfc/efx.c
@@ -1748,6 +1748,9 @@ static int efx_ioctl(struct net_device *net_dev, struct ifreq *ifr, int cmd)
 
 	EFX_ASSERT_RESET_SERIALISED(efx);
 
+	if (cmd == SIOCSHWTSTAMP)
+		return efx_ptp_ioctl(efx, ifr, cmd);
+
 	/* Convert phy_id from older PRTAD/DEVAD format */
 	if ((cmd == SIOCGMIIREG || cmd == SIOCSMIIREG) &&
 	    (data->phy_id & 0xfc00) == 0x0400)
diff --git a/drivers/net/ethernet/sfc/ethtool.c b/drivers/net/ethernet/sfc/ethtool.c
index 10536f9..50cdd39 100644
--- a/drivers/net/ethernet/sfc/ethtool.c
+++ b/drivers/net/ethernet/sfc/ethtool.c
@@ -1170,6 +1170,7 @@ const struct ethtool_ops efx_ethtool_ops = {
 	.get_rxfh_indir_size	= efx_ethtool_get_rxfh_indir_size,
 	.get_rxfh_indir		= efx_ethtool_get_rxfh_indir,
 	.set_rxfh_indir		= efx_ethtool_set_rxfh_indir,
+	.get_ts_info		= efx_ptp_get_ts_info,
 	.get_module_info	= efx_ethtool_get_module_info,
 	.get_module_eeprom	= efx_ethtool_get_module_eeprom,
 };
diff --git a/drivers/net/ethernet/sfc/mcdi_pcol.h b/drivers/net/ethernet/sfc/mcdi_pcol.h
index 0310b9f0..0017f98 100644
--- a/drivers/net/ethernet/sfc/mcdi_pcol.h
+++ b/drivers/net/ethernet/sfc/mcdi_pcol.h
@@ -290,6 +290,7 @@
 #define          MCDI_EVENT_CODE_TX_FLUSH  0xc /* enum */
 #define          MCDI_EVENT_CODE_PTP_RX  0xd /* enum */
 #define          MCDI_EVENT_CODE_PTP_FAULT  0xe /* enum */
+#define          MCDI_EVENT_CODE_PTP_PPS  0xf /* enum */
 #define       MCDI_EVENT_CMDDONE_DATA_OFST 0
 #define       MCDI_EVENT_CMDDONE_DATA_LBN 0
 #define       MCDI_EVENT_CMDDONE_DATA_WIDTH 32
diff --git a/drivers/net/ethernet/sfc/net_driver.h b/drivers/net/ethernet/sfc/net_driver.h
index 9913e32..f84a5d5 100644
--- a/drivers/net/ethernet/sfc/net_driver.h
+++ b/drivers/net/ethernet/sfc/net_driver.h
@@ -56,7 +56,8 @@
 #define EFX_MAX_CHANNELS 32U
 #define EFX_MAX_RX_QUEUES EFX_MAX_CHANNELS
 #define EFX_EXTRA_CHANNEL_IOV	0
-#define EFX_MAX_EXTRA_CHANNELS	1U
+#define EFX_EXTRA_CHANNEL_PTP	1
+#define EFX_MAX_EXTRA_CHANNELS	2U
 
 /* Checksum generation is a per-queue option in hardware, so each
  * queue visible to the networking core is backed by two hardware TX
@@ -68,6 +69,9 @@
 #define EFX_TXQ_TYPES		4
 #define EFX_MAX_TX_QUEUES	(EFX_TXQ_TYPES * EFX_MAX_CHANNELS)
 
+/* Forward declare Precision Time Protocol (PTP) support structure. */
+struct efx_ptp_data;
+
 struct efx_self_tests;
 
 /**
@@ -736,6 +740,7 @@ struct vfdi_status;
  *	%local_addr_list. Protected by %local_lock.
  * @local_lock: Mutex protecting %local_addr_list and %local_page_list.
  * @peer_work: Work item to broadcast peer addresses to VMs.
+ * @ptp_data: PTP state data
  * @monitor_work: Hardware monitor workitem
  * @biu_lock: BIU (bus interface unit) lock
  * @last_irq_cpu: Last CPU to handle a possible test interrupt.  This
@@ -860,6 +865,10 @@ struct efx_nic {
 	struct work_struct peer_work;
 #endif
 
+#ifdef CONFIG_SFC_PTP
+	struct efx_ptp_data *ptp_data;
+#endif
+
 	/* The following fields may be written more often */
 
 	struct delayed_work monitor_work ____cacheline_aligned_in_smp;
@@ -1122,5 +1131,13 @@ static inline void clear_bit_le(unsigned nr, unsigned char *addr)
 #define EFX_MAX_FRAME_LEN(mtu) \
 	((((mtu) + ETH_HLEN + VLAN_HLEN + 4/* FCS */ + 7) & ~7) + 16)
 
+static inline bool efx_xmit_with_hwtstamp(struct sk_buff *skb)
+{
+	return skb_shinfo(skb)->tx_flags & SKBTX_HW_TSTAMP;
+}
+static inline void efx_xmit_hwtstamp_pending(struct sk_buff *skb)
+{
+	skb_shinfo(skb)->tx_flags |= SKBTX_IN_PROGRESS;
+}
 
 #endif /* EFX_NET_DRIVER_H */
diff --git a/drivers/net/ethernet/sfc/nic.h b/drivers/net/ethernet/sfc/nic.h
index bab5cd9..cd53ab5 100644
--- a/drivers/net/ethernet/sfc/nic.h
+++ b/drivers/net/ethernet/sfc/nic.h
@@ -250,6 +250,37 @@ extern int efx_sriov_get_vf_config(struct net_device *dev, int vf,
 extern int efx_sriov_set_vf_spoofchk(struct net_device *net_dev, int vf,
 				     bool spoofchk);
 
+struct ethtool_ts_info;
+#ifdef CONFIG_SFC_PTP
+extern void efx_ptp_probe(struct efx_nic *efx);
+extern int efx_ptp_ioctl(struct efx_nic *efx, struct ifreq *ifr, int cmd);
+extern int efx_ptp_get_ts_info(struct net_device *net_dev,
+			       struct ethtool_ts_info *ts_info);
+extern bool efx_ptp_is_ptp_tx(struct efx_nic *efx, struct sk_buff *skb);
+extern int efx_ptp_tx(struct efx_nic *efx, struct sk_buff *skb);
+extern void efx_ptp_event(struct efx_nic *efx, efx_qword_t *ev);
+#else
+static inline void efx_ptp_probe(struct efx_nic *efx) {}
+static inline int efx_ptp_ioctl(struct efx_nic *efx, struct ifreq *ifr, int cmd)
+{
+	return -EOPNOTSUPP;
+}
+static inline int efx_ptp_get_ts_info(struct net_device *net_dev,
+				      struct ethtool_ts_info *ts_info)
+{
+	return -EOPNOTSUPP;
+}
+static inline bool efx_ptp_is_ptp_tx(struct efx_nic *efx, struct sk_buff *skb)
+{
+	return false;
+}
+static inline int efx_ptp_tx(struct efx_nic *efx, struct sk_buff *skb)
+{
+	return NETDEV_TX_OK;
+}
+static inline void efx_ptp_event(struct efx_nic *efx, efx_qword_t *ev) {}
+#endif
+
 extern const struct efx_nic_type falcon_a1_nic_type;
 extern const struct efx_nic_type falcon_b0_nic_type;
 extern const struct efx_nic_type siena_a0_nic_type;
diff --git a/drivers/net/ethernet/sfc/ptp.c b/drivers/net/ethernet/sfc/ptp.c
new file mode 100644
index 0000000..ba1e76a
--- /dev/null
+++ b/drivers/net/ethernet/sfc/ptp.c
@@ -0,0 +1,1519 @@
+/****************************************************************************
+ * Driver for Solarflare Solarstorm network controllers and boards
+ * Copyright 2011 Solarflare Communications Inc.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published
+ * by the Free Software Foundation, incorporated herein by reference.
+ */
+
+/* Theory of operation:
+ *
+ * PTP support is assisted by firmware running on the MC, which provides
+ * the hardware timestamping capabilities.  Both transmitted and received
+ * PTP event packets are queued onto internal queues for subsequent processing;
+ * this is because the MC operations are relatively long and would block
+ * block NAPI/interrupt operation.
+ *
+ * Receive event processing:
+ *	The event contains the packet's UUID and sequence number, together
+ *	with the hardware timestamp.  The PTP receive packet queue is searched
+ *	for this UUID/sequence number and, if found, put on a pending queue.
+ *	Packets not matching are delivered without timestamps (MCDI events will
+ *	always arrive after the actual packet).
+ *	It is important for the operation of the PTP protocol that the ordering
+ *	of packets between the event and general port is maintained.
+ *
+ * Work queue processing:
+ *	If work waiting, synchronise host/hardware time
+ *
+ *	Transmit: send packet through MC, which returns the transmission time
+ *	that is converted to an appropriate timestamp.
+ *
+ *	Receive: the packet's reception time is converted to an appropriate
+ *	timestamp.
+ */
+#include <linux/ip.h>
+#include <linux/udp.h>
+#include <linux/time.h>
+#include <linux/ktime.h>
+#include <linux/module.h>
+#include <linux/net_tstamp.h>
+#include <linux/ptp_clock_kernel.h>
+#include "net_driver.h"
+#include "efx.h"
+#include "mcdi.h"
+#include "mcdi_pcol.h"
+#include "io.h"
+#include "regs.h"
+#include "nic.h"
+
+/* Maximum number of events expected to make up a PTP event */
+#define	MAX_EVENT_FRAGS			3
+
+/* Maximum delay, ms, to begin synchronisation */
+#define	MAX_SYNCHRONISE_WAIT_MS		2
+
+/* How long, at most, to spend synchronising */
+#define	SYNCHRONISE_PERIOD_NS		250000
+
+/* How often to update the shared memory time */
+#define	SYNCHRONISATION_GRANULARITY_NS	200
+
+/* Minimum permitted length of a (corrected) synchronisation time */
+#define	MIN_SYNCHRONISATION_NS		120
+
+/* Maximum permitted length of a (corrected) synchronisation time */
+#define	MAX_SYNCHRONISATION_NS		1000
+
+/* How many (MC) receive events that can be queued */
+#define	MAX_RECEIVE_EVENTS		8
+
+/* Length of (modified) moving average. */
+#define	AVERAGE_LENGTH			16
+
+/* How long an unmatched event or packet can be held */
+#define PKT_EVENT_LIFETIME_MS		10
+
+/* Offsets into PTP packet for identification.  These offsets are from the
+ * start of the IP header, not the MAC header.  Note that neither PTP V1 nor
+ * PTP V2 permit the use of IPV4 options.
+ */
+#define PTP_DPORT_OFFSET	22
+
+#define PTP_V1_VERSION_LENGTH	2
+#define PTP_V1_VERSION_OFFSET	28
+
+#define PTP_V1_UUID_LENGTH	6
+#define PTP_V1_UUID_OFFSET	50
+
+#define PTP_V1_SEQUENCE_LENGTH	2
+#define PTP_V1_SEQUENCE_OFFSET	58
+
+/* The minimum length of a PTP V1 packet for offsets, etc. to be valid:
+ * includes IP header.
+ */
+#define	PTP_V1_MIN_LENGTH	64
+
+#define PTP_V2_VERSION_LENGTH	1
+#define PTP_V2_VERSION_OFFSET	29
+
+/* Although PTP V2 UUIDs are comprised a ClockIdentity (8) and PortNumber (2),
+ * the MC only captures the last six bytes of the clock identity. These values
+ * reflect those, not the ones used in the standard.  The standard permits
+ * mapping of V1 UUIDs to V2 UUIDs with these same values.
+ */
+#define PTP_V2_MC_UUID_LENGTH	6
+#define PTP_V2_MC_UUID_OFFSET	50
+
+#define PTP_V2_SEQUENCE_LENGTH	2
+#define PTP_V2_SEQUENCE_OFFSET	58
+
+/* The minimum length of a PTP V2 packet for offsets, etc. to be valid:
+ * includes IP header.
+ */
+#define	PTP_V2_MIN_LENGTH	63
+
+#define	PTP_MIN_LENGTH		63
+
+#define PTP_ADDRESS		0xe0000181	/* 224.0.1.129 */
+#define PTP_EVENT_PORT		319
+#define PTP_GENERAL_PORT	320
+
+/* Annoyingly the format of the version numbers are different between
+ * versions 1 and 2 so it isn't possible to simply look for 1 or 2.
+ */
+#define	PTP_VERSION_V1		1
+
+#define	PTP_VERSION_V2		2
+#define	PTP_VERSION_V2_MASK	0x0f
+
+enum ptp_packet_state {
+	PTP_PACKET_STATE_UNMATCHED = 0,
+	PTP_PACKET_STATE_MATCHED,
+	PTP_PACKET_STATE_TIMED_OUT,
+	PTP_PACKET_STATE_MATCH_UNWANTED
+};
+
+/* NIC synchronised with single word of time only comprising
+ * partial seconds and full nanoseconds: 10^9 ~ 2^30 so 2 bits for seconds.
+ */
+#define	MC_NANOSECOND_BITS	30
+#define	MC_NANOSECOND_MASK	((1 << MC_NANOSECOND_BITS) - 1)
+#define	MC_SECOND_MASK		((1 << (32 - MC_NANOSECOND_BITS)) - 1)
+
+/* Maximum parts-per-billion adjustment that is acceptable */
+#define MAX_PPB			1000000
+
+/* Number of bits required to hold the above */
+#define	MAX_PPB_BITS		20
+
+/* Number of extra bits allowed when calculating fractional ns.
+ * EXTRA_BITS + MC_CMD_PTP_IN_ADJUST_BITS + MAX_PPB_BITS should
+ * be less than 63.
+ */
+#define	PPB_EXTRA_BITS		2
+
+/* Precalculate scale word to avoid long long division at runtime */
+#define	PPB_SCALE_WORD	((1LL << (PPB_EXTRA_BITS + MC_CMD_PTP_IN_ADJUST_BITS +\
+			MAX_PPB_BITS)) / 1000000000LL)
+
+#define PTP_SYNC_ATTEMPTS	4
+
+/**
+ * struct efx_ptp_match - Matching structure, stored in sk_buff's cb area.
+ * @words: UUID and (partial) sequence number
+ * @expiry: Time after which the packet should be delivered irrespective of
+ *            event arrival.
+ * @state: The state of the packet - whether it is ready for processing or
+ *         whether that is of no interest.
+ */
+struct efx_ptp_match {
+	u32 words[DIV_ROUND_UP(PTP_V1_UUID_LENGTH, 4)];
+	unsigned long expiry;
+	enum ptp_packet_state state;
+};
+
+/**
+ * struct efx_ptp_event_rx - A PTP receive event (from MC)
+ * @seq0: First part of (PTP) UUID
+ * @seq1: Second part of (PTP) UUID and sequence number
+ * @hwtimestamp: Event timestamp
+ */
+struct efx_ptp_event_rx {
+	struct list_head link;
+	u32 seq0;
+	u32 seq1;
+	ktime_t hwtimestamp;
+	unsigned long expiry;
+};
+
+/**
+ * struct efx_ptp_timeset - Synchronisation between host and MC
+ * @host_start: Host time immediately before hardware timestamp taken
+ * @seconds: Hardware timestamp, seconds
+ * @nanoseconds: Hardware timestamp, nanoseconds
+ * @host_end: Host time immediately after hardware timestamp taken
+ * @waitns: Number of nanoseconds between hardware timestamp being read and
+ *          host end time being seen
+ * @window: Difference of host_end and host_start
+ * @valid: Whether this timeset is valid
+ */
+struct efx_ptp_timeset {
+	u32 host_start;
+	u32 seconds;
+	u32 nanoseconds;
+	u32 host_end;
+	u32 waitns;
+	u32 window;	/* Derived: end - start, allowing for wrap */
+};
+
+/**
+ * struct efx_ptp_data - Precision Time Protocol (PTP) state
+ * @channel: The PTP channel
+ * @rxq: Receive queue (awaiting timestamps)
+ * @txq: Transmit queue
+ * @evt_list: List of MC receive events awaiting packets
+ * @evt_free_list: List of free events
+ * @evt_lock: Lock for manipulating evt_list and evt_free_list
+ * @rx_evts: Instantiated events (on evt_list and evt_free_list)
+ * @workwq: Work queue for processing pending PTP operations
+ * @work: Work task
+ * @reset_required: A serious error has occurred and the PTP task needs to be
+ *                  reset (disable, enable).
+ * @rxfilter_event: Receive filter when operating
+ * @rxfilter_general: Receive filter when operating
+ * @config: Current timestamp configuration
+ * @enabled: PTP operation enabled
+ * @mode: Mode in which PTP operating (PTP version)
+ * @evt_frags: Partly assembled PTP events
+ * @evt_frag_idx: Current fragment number
+ * @evt_code: Last event code
+ * @start: Address at which MC indicates ready for synchronisation
+ * @host_base_time: (Synchronised with mc_base_time) host time
+ * @mc_base_time: (Synchronised with host_base_time) MC/hardware time
+ * @base_time_valid: Whether host_base_time and mc_base_time are synchronised
+ * @last_sync_ns: Last number of nanoseconds between readings when synchronising
+ * @base_sync_ns: Number of nanoseconds for last synchronisation.
+ * @base_sync_valid: Whether base_sync_time is valid.
+ * @current_adjfreq: Current ppb adjustment.
+ * @phc_clock: Pointer to registered phc device
+ * @phc_clock_info: Registration structure for phc device
+ * @pps_work: pps work task for handling pps events
+ * @pps_workwq: pps work queue
+ * @nic_ts_enabled: Flag indicating if NIC generated TS events are handled
+ * @txbuf: Buffer for use when transmitting (PTP) packets to MC (avoids
+ *         allocations in main data path).
+ * @debug_ptp_dir: PTP debugfs directory
+ * @missed_rx_sync: Number of packets received without syncrhonisation.
+ * @good_syncs: Number of successful synchronisations.
+ * @no_time_syncs: Number of synchronisations with no good times.
+ * @bad_sync_durations: Number of synchronisations with bad durations.
+ * @bad_syncs: Number of failed synchronisations.
+ * @last_sync_time: Number of nanoseconds for last synchronisation.
+ * @sync_timeouts: Number of synchronisation timeouts
+ * @fast_syncs: Number of synchronisations requiring short delay
+ * @min_sync_delta: Minimum time between event and synchronisation
+ * @max_sync_delta: Maximum time between event and synchronisation
+ * @average_sync_delta: Average time between event and synchronisation.
+ *                      Modified moving average.
+ * @last_sync_delta: Last time between event and synchronisation
+ * @mc_stats: Context value for MC statistics
+ * @timeset: Last set of synchronisation statistics.
+ */
+struct efx_ptp_data {
+	struct efx_channel *channel;
+	struct sk_buff_head rxq;
+	struct sk_buff_head txq;
+	struct list_head evt_list;
+	struct list_head evt_free_list;
+	spinlock_t evt_lock;
+	struct efx_ptp_event_rx rx_evts[MAX_RECEIVE_EVENTS];
+	struct workqueue_struct *workwq;
+	struct work_struct work;
+	bool reset_required;
+	u32 rxfilter_event;
+	u32 rxfilter_general;
+	bool rxfilter_installed;
+	struct hwtstamp_config config;
+	bool enabled;
+	unsigned int mode;
+	efx_qword_t evt_frags[MAX_EVENT_FRAGS];
+	int evt_frag_idx;
+	int evt_code;
+	struct efx_buffer start;
+	ktime_t host_base_time;
+	ktime_t mc_base_time;
+	bool base_time_valid;
+	unsigned last_sync_ns;
+	unsigned base_sync_ns;
+	bool base_sync_valid;
+	s64 current_adjfreq;
+	struct ptp_clock *phc_clock;
+	struct ptp_clock_info phc_clock_info;
+	struct work_struct pps_work;
+	struct workqueue_struct *pps_workwq;
+	bool nic_ts_enabled;
+	u8 txbuf[ALIGN(MC_CMD_PTP_IN_TRANSMIT_LEN(
+			       MC_CMD_PTP_IN_TRANSMIT_PACKET_MAXNUM), 4)];
+	struct efx_ptp_timeset
+	timeset[MC_CMD_PTP_OUT_SYNCHRONIZE_TIMESET_MAXNUM];
+};
+
+static int efx_phc_adjfreq(struct ptp_clock_info *ptp, s32 delta);
+static int efx_phc_adjtime(struct ptp_clock_info *ptp, s64 delta);
+static int efx_phc_gettime(struct ptp_clock_info *ptp, struct timespec *ts);
+static int efx_phc_settime(struct ptp_clock_info *ptp,
+			   const struct timespec *e_ts);
+static int efx_phc_enable(struct ptp_clock_info *ptp,
+			  struct ptp_clock_request *request, int on);
+
+/* Enable MCDI PTP support. */
+static int efx_ptp_enable(struct efx_nic *efx)
+{
+	u8 inbuf[MC_CMD_PTP_IN_ENABLE_LEN];
+
+	MCDI_SET_DWORD(inbuf, PTP_IN_OP, MC_CMD_PTP_OP_ENABLE);
+	MCDI_SET_DWORD(inbuf, PTP_IN_ENABLE_QUEUE,
+		       efx->ptp_data->channel->channel);
+	MCDI_SET_DWORD(inbuf, PTP_IN_ENABLE_MODE, efx->ptp_data->mode);
+
+	return efx_mcdi_rpc(efx, MC_CMD_PTP, inbuf, sizeof(inbuf),
+			    NULL, 0, NULL);
+}
+
+/* Disable MCDI PTP support.
+ *
+ * Note that this function should never rely on the presence of ptp_data -
+ * may be called before that exists.
+ */
+static int efx_ptp_disable(struct efx_nic *efx)
+{
+	u8 inbuf[MC_CMD_PTP_IN_DISABLE_LEN];
+
+	MCDI_SET_DWORD(inbuf, PTP_IN_OP, MC_CMD_PTP_OP_DISABLE);
+	return efx_mcdi_rpc(efx, MC_CMD_PTP, inbuf, sizeof(inbuf),
+			    NULL, 0, NULL);
+}
+
+static void efx_ptp_deliver_rx_queue(struct sk_buff_head *q)
+{
+	struct sk_buff *skb;
+
+	while ((skb = skb_dequeue(q))) {
+		local_bh_disable();
+		netif_receive_skb(skb);
+		local_bh_enable();
+	}
+}
+
+static void efx_ptp_handle_no_channel(struct efx_nic *efx)
+{
+	netif_err(efx, drv, efx->net_dev,
+		  "ERROR: PTP requires MSI-X and 1 additional interrupt"
+		  "vector. PTP disabled\n");
+}
+
+/* Repeatedly send the host time to the MC which will capture the hardware
+ * time.
+ */
+static void efx_ptp_send_times(struct efx_nic *efx, struct timespec *last_time)
+{
+	struct timespec now;
+	struct timespec limit;
+	struct efx_ptp_data *ptp = efx->ptp_data;
+	struct timespec start;
+	int *mc_running = ptp->start.addr;
+
+	getnstimeofday(&now);
+	start = now;
+	limit = now;
+	timespec_add_ns(&limit, SYNCHRONISE_PERIOD_NS);
+
+	/* Write host time for specified period or until MC is done */
+	while ((timespec_compare(&now, &limit) < 0) &&
+	       ACCESS_ONCE(*mc_running)) {
+		struct timespec update_time;
+		unsigned int host_time;
+
+		/* Don't update continuously to avoid saturating the PCIe bus */
+		update_time = now;
+		timespec_add_ns(&update_time, SYNCHRONISATION_GRANULARITY_NS);
+		do {
+			getnstimeofday(&now);
+		} while ((timespec_compare(&now, &update_time) < 0) &&
+			 ACCESS_ONCE(*mc_running));
+
+		/* Synchronise NIC with single word of time only */
+		host_time = (now.tv_sec << MC_NANOSECOND_BITS) | now.tv_nsec;
+		/* Update host time in NIC memory */
+		_efx_writed(efx, host_time,
+			    FR_CZ_MC_TREG_SMEM + MC_SMEM_P0_PTP_TIME_OFST);
+	}
+	*last_time = now;
+	start = timespec_sub(now, start);
+}
+
+/* Read a timeset from the MC's results and partial process. */
+static void efx_ptp_read_timeset(u8 *data, struct efx_ptp_timeset *timeset)
+{
+	unsigned start_ns, end_ns;
+
+	timeset->host_start = MCDI_DWORD(data, PTP_OUT_SYNCHRONIZE_HOSTSTART);
+	timeset->seconds = MCDI_DWORD(data, PTP_OUT_SYNCHRONIZE_SECONDS);
+	timeset->nanoseconds = MCDI_DWORD(data,
+					 PTP_OUT_SYNCHRONIZE_NANOSECONDS);
+	timeset->host_end = MCDI_DWORD(data, PTP_OUT_SYNCHRONIZE_HOSTEND),
+	timeset->waitns = MCDI_DWORD(data, PTP_OUT_SYNCHRONIZE_WAITNS);
+
+	/* Ignore seconds */
+	start_ns = timeset->host_start & MC_NANOSECOND_MASK;
+	end_ns = timeset->host_end & MC_NANOSECOND_MASK;
+	/* Allow for rollover */
+	if (end_ns < start_ns)
+		end_ns += NSEC_PER_SEC;
+	/* Determine duration of operation */
+	timeset->window = end_ns - start_ns;
+}
+
+/* Process times received from MC.
+ *
+ * Extract times from returned results, and establish the minimum value
+ * seen.  The minimum value represents the "best" possible time and events
+ * too much greater than this are rejected - the machine is, perhaps, too
+ * busy. A number of readings are taken so that, hopefully, at least one good
+ * synchronisation will be seen in the results.
+ */
+static int efx_ptp_process_times(struct efx_nic *efx, u8 *synch_buf,
+				 size_t response_length,
+				 struct timespec *last_time)
+{
+	unsigned number_readings = (response_length /
+			       MC_CMD_PTP_OUT_SYNCHRONIZE_TIMESET_LEN);
+	unsigned i;
+	unsigned min;
+	unsigned min_set = 0;
+	unsigned total;
+	unsigned ngood = 0;
+	unsigned last_good = 0;
+	struct efx_ptp_data *ptp = efx->ptp_data;
+	bool min_valid = false;
+	u32 last_sec;
+	u32 start_sec;
+
+	if (number_readings == 0)
+		return -EAGAIN;
+
+	/* Find minimum value in this set of results, discarding clearly
+	 * erroneous results.
+	 */
+	for (i = 0; i < number_readings; i++) {
+		efx_ptp_read_timeset(synch_buf, &ptp->timeset[i]);
+		synch_buf += MC_CMD_PTP_OUT_SYNCHRONIZE_TIMESET_LEN;
+		if (ptp->timeset[i].window > SYNCHRONISATION_GRANULARITY_NS) {
+			if (min_valid) {
+				if (ptp->timeset[i].window < min_set)
+					min_set = ptp->timeset[i].window;
+			} else {
+				min_valid = true;
+				min_set = ptp->timeset[i].window;
+			}
+		}
+	}
+
+	if (min_valid) {
+		if (ptp->base_sync_valid && (min_set > ptp->base_sync_ns))
+			min = ptp->base_sync_ns;
+		else
+			min = min_set;
+	} else {
+		min = SYNCHRONISATION_GRANULARITY_NS;
+	}
+
+	/* Discard excessively long synchronise durations.  The MC times
+	 * when it finishes reading the host time so the corrected window
+	 * time should be fairly constant for a given platform.
+	 */
+	total = 0;
+	for (i = 0; i < number_readings; i++)
+		if (ptp->timeset[i].window > ptp->timeset[i].waitns) {
+			unsigned win;
+
+			win = ptp->timeset[i].window - ptp->timeset[i].waitns;
+			if (win >= MIN_SYNCHRONISATION_NS &&
+			    win < MAX_SYNCHRONISATION_NS) {
+				total += ptp->timeset[i].window;
+				ngood++;
+				last_good = i;
+			}
+		}
+
+	if (ngood == 0) {
+		netif_warn(efx, drv, efx->net_dev,
+			   "PTP no suitable synchronisations %dns %dns\n",
+			   ptp->base_sync_ns, min_set);
+		return -EAGAIN;
+	}
+
+	/* Average minimum this synchronisation */
+	ptp->last_sync_ns = DIV_ROUND_UP(total, ngood);
+	if (!ptp->base_sync_valid || (ptp->last_sync_ns < ptp->base_sync_ns)) {
+		ptp->base_sync_valid = true;
+		ptp->base_sync_ns = ptp->last_sync_ns;
+	}
+
+	ptp->mc_base_time = ktime_set(ptp->timeset[last_good].seconds,
+				      ptp->timeset[last_good].nanoseconds);
+	last_time->tv_nsec =
+		ptp->timeset[last_good].host_start & MC_NANOSECOND_MASK;
+
+	/* It is possible that the seconds rolled over between taking
+	 * the start reading and the last value written by the host.  The
+	 * timescales are such that a gap of more than one second is never
+	 * expected.
+	 */
+	start_sec = ptp->timeset[last_good].host_start >> MC_NANOSECOND_BITS;
+	last_sec = last_time->tv_sec & MC_SECOND_MASK;
+	if (start_sec != last_sec) {
+		if (((start_sec + 1) & MC_SECOND_MASK) != last_sec) {
+			netif_warn(efx, hw, efx->net_dev,
+				   "PTP bad synchronisation seconds\n");
+			return -EAGAIN;
+		} else {
+			last_time->tv_sec--;
+		}
+	}
+	ptp->host_base_time = ktime_set(last_time->tv_sec,
+					last_time->tv_nsec);
+
+	/* At least one good synchronisation */
+	ptp->base_time_valid = true;
+
+	return 0;
+}
+
+/* Synchronize times between the host and the MC */
+static int efx_ptp_synchronize(struct efx_nic *efx, unsigned int num_readings)
+{
+	struct efx_ptp_data *ptp = efx->ptp_data;
+	u8 synch_buf[MC_CMD_PTP_OUT_SYNCHRONIZE_LENMAX];
+	size_t response_length;
+	int rc;
+	unsigned long timeout;
+	struct timespec last_time;
+	unsigned int loops = 0;
+	int *start = ptp->start.addr;
+
+	last_time.tv_sec = 0;
+	last_time.tv_nsec = 0;
+
+	MCDI_SET_DWORD(synch_buf, PTP_IN_OP, MC_CMD_PTP_OP_SYNCHRONIZE);
+	MCDI_SET_DWORD(synch_buf, PTP_IN_SYNCHRONIZE_NUMTIMESETS,
+		       num_readings);
+	MCDI_SET_DWORD(synch_buf, PTP_IN_SYNCHRONIZE_START_ADDR_LO,
+		       (u32)ptp->start.dma_addr);
+	MCDI_SET_DWORD(synch_buf, PTP_IN_SYNCHRONIZE_START_ADDR_HI,
+		       (u32)((u64)ptp->start.dma_addr >> 32));
+
+	/* Clear flag that signals MC ready */
+	ACCESS_ONCE(*start) = 0;
+	efx_mcdi_rpc_start(efx, MC_CMD_PTP, synch_buf,
+			   MC_CMD_PTP_IN_SYNCHRONIZE_LEN);
+
+	/* Wait for start from MCDI (or timeout) */
+	timeout = jiffies + msecs_to_jiffies(MAX_SYNCHRONISE_WAIT_MS);
+	while (!ACCESS_ONCE(*start) && (time_before(jiffies, timeout))) {
+		udelay(20);	/* Usually start MCDI execution quickly */
+		loops++;
+	}
+
+	if (ACCESS_ONCE(*start))
+		efx_ptp_send_times(efx, &last_time);
+
+	/* Collect results */
+	rc = efx_mcdi_rpc_finish(efx, MC_CMD_PTP,
+				 MC_CMD_PTP_IN_SYNCHRONIZE_LEN,
+				 synch_buf, sizeof(synch_buf),
+				 &response_length);
+	if (rc == 0)
+		rc = efx_ptp_process_times(efx, synch_buf, response_length,
+					   &last_time);
+
+	return rc;
+}
+
+/* Get the host time from a given hardware time */
+static bool efx_ptp_get_host_time(struct efx_nic *efx,
+				  struct skb_shared_hwtstamps *timestamps)
+{
+	if (efx->ptp_data->base_time_valid) {
+		ktime_t diff = ktime_sub(timestamps->hwtstamp,
+					 efx->ptp_data->mc_base_time);
+
+		timestamps->syststamp = ktime_add(efx->ptp_data->host_base_time,
+						  diff);
+	}
+
+	return efx->ptp_data->base_time_valid;
+}
+
+/* Transmit a PTP packet, via the MCDI interface, to the wire. */
+static int efx_ptp_xmit_skb(struct efx_nic *efx, struct sk_buff *skb)
+{
+	u8 *txbuf = efx->ptp_data->txbuf;
+	struct skb_shared_hwtstamps timestamps;
+	int rc = -EIO;
+	/* MCDI driver requires word aligned lengths */
+	size_t len = ALIGN(MC_CMD_PTP_IN_TRANSMIT_LEN(skb->len), 4);
+	u8 txtime[MC_CMD_PTP_OUT_TRANSMIT_LEN];
+
+	MCDI_SET_DWORD(txbuf, PTP_IN_OP, MC_CMD_PTP_OP_TRANSMIT);
+	MCDI_SET_DWORD(txbuf, PTP_IN_TRANSMIT_LENGTH, skb->len);
+	if (skb_shinfo(skb)->nr_frags != 0) {
+		rc = skb_linearize(skb);
+		if (rc != 0)
+			goto fail;
+	}
+
+	if (skb->ip_summed == CHECKSUM_PARTIAL) {
+		rc = skb_checksum_help(skb);
+		if (rc != 0)
+			goto fail;
+	}
+	skb_copy_from_linear_data(skb,
+				  &txbuf[MC_CMD_PTP_IN_TRANSMIT_PACKET_OFST],
+				  len);
+	rc = efx_mcdi_rpc(efx, MC_CMD_PTP, txbuf, len, txtime,
+			  sizeof(txtime), &len);
+	if (rc != 0)
+		goto fail;
+
+	memset(&timestamps, 0, sizeof(timestamps));
+	timestamps.hwtstamp = ktime_set(
+		MCDI_DWORD(txtime, PTP_OUT_TRANSMIT_SECONDS),
+		MCDI_DWORD(txtime, PTP_OUT_TRANSMIT_NANOSECONDS));
+	if (efx_ptp_get_host_time(efx, &timestamps))
+		skb_tstamp_tx(skb, &timestamps);
+	/* Success even if hardware timestamping failed */
+	rc = 0;
+
+fail:
+	dev_kfree_skb(skb);
+
+	return rc;
+}
+
+static void efx_ptp_drop_time_expired_events(struct efx_nic *efx)
+{
+	struct efx_ptp_data *ptp = efx->ptp_data;
+	struct list_head *cursor;
+	struct list_head *next;
+
+	/* Drop time-expired events */
+	spin_lock_bh(&ptp->evt_lock);
+	if (!list_empty(&ptp->evt_list)) {
+		list_for_each_safe(cursor, next, &ptp->evt_list) {
+			struct efx_ptp_event_rx *evt;
+
+			evt = list_entry(cursor, struct efx_ptp_event_rx,
+					 link);
+			if (time_after(jiffies, evt->expiry)) {
+				list_del(&evt->link);
+				list_add(&evt->link, &ptp->evt_free_list);
+				netif_warn(efx, hw, efx->net_dev,
+					   "PTP rx event dropped\n");
+			}
+		}
+	}
+	spin_unlock_bh(&ptp->evt_lock);
+}
+
+static enum ptp_packet_state efx_ptp_match_rx(struct efx_nic *efx,
+					      struct sk_buff *skb)
+{
+	struct efx_ptp_data *ptp = efx->ptp_data;
+	bool evts_waiting;
+	struct list_head *cursor;
+	struct list_head *next;
+	struct efx_ptp_match *match;
+	enum ptp_packet_state rc = PTP_PACKET_STATE_UNMATCHED;
+
+	spin_lock_bh(&ptp->evt_lock);
+	evts_waiting = !list_empty(&ptp->evt_list);
+	spin_unlock_bh(&ptp->evt_lock);
+
+	if (!evts_waiting)
+		return PTP_PACKET_STATE_UNMATCHED;
+
+	match = (struct efx_ptp_match *)skb->cb;
+	/* Look for a matching timestamp in the event queue */
+	spin_lock_bh(&ptp->evt_lock);
+	list_for_each_safe(cursor, next, &ptp->evt_list) {
+		struct efx_ptp_event_rx *evt;
+
+		evt = list_entry(cursor, struct efx_ptp_event_rx, link);
+		if ((evt->seq0 == match->words[0]) &&
+		    (evt->seq1 == match->words[1])) {
+			struct skb_shared_hwtstamps *timestamps;
+
+			/* Match - add in hardware timestamp */
+			timestamps = skb_hwtstamps(skb);
+			timestamps->hwtstamp = evt->hwtimestamp;
+
+			match->state = PTP_PACKET_STATE_MATCHED;
+			rc = PTP_PACKET_STATE_MATCHED;
+			list_del(&evt->link);
+			list_add(&evt->link, &ptp->evt_free_list);
+			break;
+		}
+	}
+	spin_unlock_bh(&ptp->evt_lock);
+
+	return rc;
+}
+
+/* Process any queued receive events and corresponding packets
+ *
+ * q is returned with all the packets that are ready for delivery.
+ * true is returned if at least one of those packets requires
+ * synchronisation.
+ */
+static bool efx_ptp_process_events(struct efx_nic *efx, struct sk_buff_head *q)
+{
+	struct efx_ptp_data *ptp = efx->ptp_data;
+	bool rc = false;
+	struct sk_buff *skb;
+
+	while ((skb = skb_dequeue(&ptp->rxq))) {
+		struct efx_ptp_match *match;
+
+		match = (struct efx_ptp_match *)skb->cb;
+		if (match->state == PTP_PACKET_STATE_MATCH_UNWANTED) {
+			__skb_queue_tail(q, skb);
+		} else if (efx_ptp_match_rx(efx, skb) ==
+			   PTP_PACKET_STATE_MATCHED) {
+			rc = true;
+			__skb_queue_tail(q, skb);
+		} else if (time_after(jiffies, match->expiry)) {
+			match->state = PTP_PACKET_STATE_TIMED_OUT;
+			netif_warn(efx, rx_err, efx->net_dev,
+				   "PTP packet - no timestamp seen\n");
+			__skb_queue_tail(q, skb);
+		} else {
+			/* Replace unprocessed entry and stop */
+			skb_queue_head(&ptp->rxq, skb);
+			break;
+		}
+	}
+
+	return rc;
+}
+
+/* Complete processing of a received packet */
+static void efx_ptp_process_rx(struct efx_nic *efx, struct sk_buff *skb)
+{
+	struct efx_ptp_match *match = (struct efx_ptp_match *)skb->cb;
+
+	/* Translate timestamps, as required */
+	if (match->state == PTP_PACKET_STATE_MATCHED) {
+		struct skb_shared_hwtstamps *timestamps;
+
+		timestamps = skb_hwtstamps(skb);
+		efx_ptp_get_host_time(efx, timestamps);
+	}
+
+	local_bh_disable();
+	netif_receive_skb(skb);
+	local_bh_enable();
+}
+
+static int efx_ptp_start(struct efx_nic *efx)
+{
+	struct efx_ptp_data *ptp = efx->ptp_data;
+	struct efx_filter_spec rxfilter;
+	int rc;
+
+	ptp->reset_required = false;
+
+	/* Must resynchronise when starting */
+	ptp->base_time_valid = false;
+	ptp->base_sync_valid = false;
+
+	/* Must filter on both event and general ports to ensure
+	 * that there is no packet re-ordering.
+	 */
+	efx_filter_init_rx(&rxfilter, EFX_FILTER_PRI_REQUIRED, 0,
+			   efx_rx_queue_index(
+				   efx_channel_get_rx_queue(ptp->channel)));
+	rc = efx_filter_set_ipv4_local(&rxfilter, IPPROTO_UDP,
+				       htonl(PTP_ADDRESS),
+				       htons(PTP_EVENT_PORT));
+	if (rc != 0)
+		return rc;
+
+	rc = efx_filter_insert_filter(efx, &rxfilter, true);
+	if (rc < 0)
+		return rc;
+	ptp->rxfilter_event = rc;
+
+	efx_filter_init_rx(&rxfilter, EFX_FILTER_PRI_REQUIRED, 0,
+			   efx_rx_queue_index(
+				   efx_channel_get_rx_queue(ptp->channel)));
+	rc = efx_filter_set_ipv4_local(&rxfilter, IPPROTO_UDP,
+				       htonl(PTP_ADDRESS),
+				       htons(PTP_GENERAL_PORT));
+	if (rc != 0)
+		goto fail;
+
+	rc = efx_filter_insert_filter(efx, &rxfilter, true);
+	if (rc < 0)
+		goto fail;
+	ptp->rxfilter_general = rc;
+
+	rc = efx_ptp_enable(efx);
+	if (rc != 0)
+		goto fail2;
+
+	ptp->evt_frag_idx = 0;
+	ptp->current_adjfreq = 0;
+	ptp->rxfilter_installed = true;
+
+	return 0;
+
+fail2:
+	efx_filter_remove_id_safe(efx, EFX_FILTER_PRI_REQUIRED,
+				  ptp->rxfilter_general);
+fail:
+	efx_filter_remove_id_safe(efx, EFX_FILTER_PRI_REQUIRED,
+				  ptp->rxfilter_event);
+
+	return rc;
+}
+
+static int efx_ptp_stop(struct efx_nic *efx)
+{
+	struct efx_ptp_data *ptp = efx->ptp_data;
+	int rc = efx_ptp_disable(efx);
+	struct list_head *cursor;
+	struct list_head *next;
+
+	if (ptp->rxfilter_installed) {
+		efx_filter_remove_id_safe(efx, EFX_FILTER_PRI_REQUIRED,
+					  ptp->rxfilter_general);
+		efx_filter_remove_id_safe(efx, EFX_FILTER_PRI_REQUIRED,
+					  ptp->rxfilter_event);
+		ptp->rxfilter_installed = false;
+	}
+
+	/* Make sure RX packets are really delivered */
+	efx_ptp_deliver_rx_queue(&efx->ptp_data->rxq);
+	skb_queue_purge(&efx->ptp_data->txq);
+
+	/* Drop any pending receive events */
+	spin_lock_bh(&efx->ptp_data->evt_lock);
+	list_for_each_safe(cursor, next, &efx->ptp_data->evt_list) {
+		list_del(cursor);
+		list_add(cursor, &efx->ptp_data->evt_free_list);
+	}
+	spin_unlock_bh(&efx->ptp_data->evt_lock);
+
+	return rc;
+}
+
+static void efx_ptp_pps_worker(struct work_struct *work)
+{
+	struct efx_ptp_data *ptp =
+		container_of(work, struct efx_ptp_data, pps_work);
+	struct efx_nic *efx = ptp->channel->efx;
+	struct timespec event_gen_time;
+	struct ptp_clock_event ptp_pps_evt;
+	ktime_t gen_time_host;
+
+	if (efx_ptp_synchronize(efx, PTP_SYNC_ATTEMPTS))
+		return;
+
+	gen_time_host = ktime_sub(ptp->mc_base_time,
+				  ptp->host_base_time);
+	event_gen_time = ktime_to_timespec(gen_time_host);
+
+	ptp_pps_evt.type = PTP_CLOCK_EXTTS;
+	ptp_pps_evt.timestamp = ktime_to_ns(gen_time_host);
+	ptp_clock_event(ptp->phc_clock, &ptp_pps_evt);
+}
+
+/* Process any pending transmissions and timestamp any received packets.
+ *
+ * Host and NIC time are synchronised once if there is any work to do:
+ * the process is relatively expensive so don't do it for each packet.
+ */
+static void efx_ptp_worker(struct work_struct *work)
+{
+	struct efx_ptp_data *ptp_data =
+		container_of(work, struct efx_ptp_data, work);
+	struct efx_nic *efx = ptp_data->channel->efx;
+	struct sk_buff *skb;
+	struct sk_buff_head tempq;
+
+	if (ptp_data->reset_required) {
+		efx_ptp_stop(efx);
+		efx_ptp_start(efx);
+		return;
+	}
+
+	efx_ptp_drop_time_expired_events(efx);
+
+	__skb_queue_head_init(&tempq);
+	if (efx_ptp_process_events(efx, &tempq) ||
+	    !skb_queue_empty(&ptp_data->txq)) {
+		/* Synchronise PC/MC times when there's work to do. This
+		 * isn't fatal but would be unusual (because of the retries
+		 * within efx_ptp_synchronize).  Failure may suggest a heavily
+		 * overloaded system.
+		 */
+		if (0 != efx_ptp_synchronize(efx, PTP_SYNC_ATTEMPTS))
+			netif_warn(efx, drv, efx->net_dev,
+				   "PTP couldn't get synchronisation\n");
+
+		while ((skb = skb_dequeue(&ptp_data->txq)))
+			efx_ptp_xmit_skb(efx, skb);
+	}
+
+	while ((skb = __skb_dequeue(&tempq)))
+		efx_ptp_process_rx(efx, skb);
+}
+
+/* Initialise PTP channel and state.
+ *
+ * Setting core_index to zero causes the queue to be initialised and doesn't
+ * overlap with 'rxq0' because ptp.c doesn't use skb_record_rx_queue.
+ */
+static int efx_ptp_probe_channel(struct efx_channel *channel)
+{
+	struct efx_nic *efx = channel->efx;
+	struct efx_ptp_data *ptp;
+	int rc = 0;
+	unsigned int pos;
+
+	channel->irq_moderation = 0;
+	channel->rx_queue.core_index = 0;
+
+	ptp = kzalloc(sizeof(struct efx_ptp_data), GFP_KERNEL);
+	efx->ptp_data = ptp;
+	if (!efx->ptp_data)
+		return -ENOMEM;
+
+	rc = efx_nic_alloc_buffer(efx, &ptp->start, sizeof(int));
+	if (rc != 0)
+		goto fail1;
+
+	ptp->channel = channel;
+	skb_queue_head_init(&ptp->rxq);
+	skb_queue_head_init(&ptp->txq);
+	ptp->workwq = create_singlethread_workqueue("sfc_ptp");
+	if (!ptp->workwq) {
+		rc = -ENOMEM;
+		goto fail2;
+	}
+
+	INIT_WORK(&ptp->work, efx_ptp_worker);
+	ptp->config.flags = 0;
+	ptp->config.tx_type = HWTSTAMP_TX_OFF;
+	ptp->config.rx_filter = HWTSTAMP_FILTER_NONE;
+	INIT_LIST_HEAD(&ptp->evt_list);
+	INIT_LIST_HEAD(&ptp->evt_free_list);
+	spin_lock_init(&ptp->evt_lock);
+	for (pos = 0; pos < MAX_RECEIVE_EVENTS; pos++)
+		list_add(&ptp->rx_evts[pos].link, &ptp->evt_free_list);
+
+	ptp->phc_clock_info.owner = THIS_MODULE;
+	snprintf(ptp->phc_clock_info.name,
+		 sizeof(ptp->phc_clock_info.name),
+		 "%pm", efx->net_dev->perm_addr);
+	ptp->phc_clock_info.max_adj = MAX_PPB;
+	ptp->phc_clock_info.n_alarm = 0;
+	ptp->phc_clock_info.n_ext_ts = 1;
+	ptp->phc_clock_info.n_per_out = 0;
+	ptp->phc_clock_info.pps = 0;
+	ptp->phc_clock_info.adjfreq = efx_phc_adjfreq;
+	ptp->phc_clock_info.adjtime = efx_phc_adjtime;
+	ptp->phc_clock_info.gettime = efx_phc_gettime;
+	ptp->phc_clock_info.settime = efx_phc_settime;
+	ptp->phc_clock_info.enable = efx_phc_enable;
+
+	ptp->phc_clock = ptp_clock_register(&ptp->phc_clock_info);
+	if (!ptp->phc_clock)
+		goto fail3;
+
+	INIT_WORK(&ptp->pps_work, efx_ptp_pps_worker);
+	ptp->pps_workwq = create_singlethread_workqueue("sfc_pps");
+	if (!ptp->pps_workwq) {
+		rc = -ENOMEM;
+		goto fail4;
+	}
+	ptp->nic_ts_enabled = false;
+
+	return 0;
+fail4:
+	ptp_clock_unregister(efx->ptp_data->phc_clock);
+
+fail3:
+	destroy_workqueue(efx->ptp_data->workwq);
+
+fail2:
+	efx_nic_free_buffer(efx, &ptp->start);
+
+fail1:
+	kfree(efx->ptp_data);
+	efx->ptp_data = 0;
+
+	return rc;
+}
+
+static void efx_ptp_remove_channel(struct efx_channel *channel)
+{
+	struct efx_nic *efx = channel->efx;
+
+	if (!efx->ptp_data)
+		return;
+
+	(void)efx_ptp_disable(channel->efx);
+
+	cancel_work_sync(&efx->ptp_data->work);
+	cancel_work_sync(&efx->ptp_data->pps_work);
+
+	skb_queue_purge(&efx->ptp_data->rxq);
+	skb_queue_purge(&efx->ptp_data->txq);
+
+	ptp_clock_unregister(efx->ptp_data->phc_clock);
+
+	destroy_workqueue(efx->ptp_data->workwq);
+	destroy_workqueue(efx->ptp_data->pps_workwq);
+
+	efx_nic_free_buffer(efx, &efx->ptp_data->start);
+	kfree(efx->ptp_data);
+}
+
+static void efx_ptp_get_channel_name(struct efx_channel *channel,
+				     char *buf, size_t len)
+{
+	snprintf(buf, len, "%s-ptp", channel->efx->name);
+}
+
+/* Determine whether this packet should be processed by the PTP module
+ * or transmitted conventionally.
+ */
+bool efx_ptp_is_ptp_tx(struct efx_nic *efx, struct sk_buff *skb)
+{
+	return efx->ptp_data &&
+		efx->ptp_data->enabled &&
+		skb->len >= PTP_MIN_LENGTH &&
+		skb->len <= MC_CMD_PTP_IN_TRANSMIT_PACKET_MAXNUM &&
+		likely(skb->protocol == htons(ETH_P_IP)) &&
+		ip_hdr(skb)->protocol == IPPROTO_UDP &&
+		udp_hdr(skb)->dest == htons(PTP_EVENT_PORT);
+}
+
+/* Receive a PTP packet.  Packets are queued until the arrival of
+ * the receive timestamp from the MC - this will probably occur after the
+ * packet arrival because of the processing in the MC.
+ */
+static void efx_ptp_rx(struct efx_channel *channel, struct sk_buff *skb)
+{
+	struct efx_nic *efx = channel->efx;
+	struct efx_ptp_data *ptp = efx->ptp_data;
+	struct efx_ptp_match *match = (struct efx_ptp_match *)skb->cb;
+	u8 *data;
+	unsigned int version;
+
+	match->expiry = jiffies + msecs_to_jiffies(PKT_EVENT_LIFETIME_MS);
+
+	/* Correct version? */
+	if (ptp->mode == MC_CMD_PTP_MODE_V1) {
+		if (skb->len < PTP_V1_MIN_LENGTH) {
+			netif_receive_skb(skb);
+			return;
+		}
+		version = ntohs(*(__be16 *)&skb->data[PTP_V1_VERSION_OFFSET]);
+		if (version != PTP_VERSION_V1) {
+			netif_receive_skb(skb);
+			return;
+		}
+	} else {
+		if (skb->len < PTP_V2_MIN_LENGTH) {
+			netif_receive_skb(skb);
+			return;
+		}
+		version = skb->data[PTP_V2_VERSION_OFFSET];
+
+		BUG_ON(ptp->mode != MC_CMD_PTP_MODE_V2);
+		BUILD_BUG_ON(PTP_V1_UUID_OFFSET != PTP_V2_MC_UUID_OFFSET);
+		BUILD_BUG_ON(PTP_V1_UUID_LENGTH != PTP_V2_MC_UUID_LENGTH);
+		BUILD_BUG_ON(PTP_V1_SEQUENCE_OFFSET != PTP_V2_SEQUENCE_OFFSET);
+		BUILD_BUG_ON(PTP_V1_SEQUENCE_LENGTH != PTP_V2_SEQUENCE_LENGTH);
+
+		if ((version & PTP_VERSION_V2_MASK) != PTP_VERSION_V2) {
+			netif_receive_skb(skb);
+			return;
+		}
+	}
+
+	/* Does this packet require timestamping? */
+	if (ntohs(*(__be16 *)&skb->data[PTP_DPORT_OFFSET]) == PTP_EVENT_PORT) {
+		struct skb_shared_hwtstamps *timestamps;
+
+		match->state = PTP_PACKET_STATE_UNMATCHED;
+
+		/* Clear all timestamps held: filled in later */
+		timestamps = skb_hwtstamps(skb);
+		memset(timestamps, 0, sizeof(*timestamps));
+
+		/* Extract UUID/Sequence information */
+		data = skb->data + PTP_V1_UUID_OFFSET;
+		match->words[0] = (data[0]         |
+				   (data[1] << 8)  |
+				   (data[2] << 16) |
+				   (data[3] << 24));
+		match->words[1] = (data[4]         |
+				   (data[5] << 8)  |
+				   (skb->data[PTP_V1_SEQUENCE_OFFSET +
+					      PTP_V1_SEQUENCE_LENGTH - 1] <<
+				    16));
+	} else {
+		match->state = PTP_PACKET_STATE_MATCH_UNWANTED;
+	}
+
+	skb_queue_tail(&ptp->rxq, skb);
+	queue_work(ptp->workwq, &ptp->work);
+}
+
+/* Transmit a PTP packet.  This has to be transmitted by the MC
+ * itself, through an MCDI call.  MCDI calls aren't permitted
+ * in the transmit path so defer the actual transmission to a suitable worker.
+ */
+int efx_ptp_tx(struct efx_nic *efx, struct sk_buff *skb)
+{
+	struct efx_ptp_data *ptp = efx->ptp_data;
+
+	skb_queue_tail(&ptp->txq, skb);
+
+	if ((udp_hdr(skb)->dest == htons(PTP_EVENT_PORT)) &&
+	    (skb->len <= MC_CMD_PTP_IN_TRANSMIT_PACKET_MAXNUM))
+		efx_xmit_hwtstamp_pending(skb);
+	queue_work(ptp->workwq, &ptp->work);
+
+	return NETDEV_TX_OK;
+}
+
+static int efx_ptp_change_mode(struct efx_nic *efx, bool enable_wanted,
+			       unsigned int new_mode)
+{
+	if ((enable_wanted != efx->ptp_data->enabled) ||
+	    (enable_wanted && (efx->ptp_data->mode != new_mode))) {
+		int rc;
+
+		if (enable_wanted) {
+			/* Change of mode requires disable */
+			if (efx->ptp_data->enabled &&
+			    (efx->ptp_data->mode != new_mode)) {
+				efx->ptp_data->enabled = false;
+				rc = efx_ptp_stop(efx);
+				if (rc != 0)
+					return rc;
+			}
+
+			/* Set new operating mode and establish
+			 * baseline synchronisation, which must
+			 * succeed.
+			 */
+			efx->ptp_data->mode = new_mode;
+			rc = efx_ptp_start(efx);
+			if (rc == 0) {
+				rc = efx_ptp_synchronize(efx,
+							 PTP_SYNC_ATTEMPTS * 2);
+				if (rc != 0)
+					efx_ptp_stop(efx);
+			}
+		} else {
+			rc = efx_ptp_stop(efx);
+		}
+
+		if (rc != 0)
+			return rc;
+
+		efx->ptp_data->enabled = enable_wanted;
+	}
+
+	return 0;
+}
+
+static int efx_ptp_ts_init(struct efx_nic *efx, struct hwtstamp_config *init)
+{
+	bool enable_wanted = false;
+	unsigned int new_mode;
+	int rc;
+
+	if (init->flags)
+		return -EINVAL;
+
+	if ((init->tx_type != HWTSTAMP_TX_OFF) &&
+	    (init->tx_type != HWTSTAMP_TX_ON))
+		return -ERANGE;
+
+	new_mode = efx->ptp_data->mode;
+	/* Determine whether any PTP HW operations are required */
+	switch (init->rx_filter) {
+	case HWTSTAMP_FILTER_NONE:
+		break;
+	case HWTSTAMP_FILTER_PTP_V1_L4_EVENT:
+	case HWTSTAMP_FILTER_PTP_V1_L4_SYNC:
+	case HWTSTAMP_FILTER_PTP_V1_L4_DELAY_REQ:
+		init->rx_filter = HWTSTAMP_FILTER_PTP_V1_L4_EVENT;
+		new_mode = MC_CMD_PTP_MODE_V1;
+		enable_wanted = true;
+		break;
+	case HWTSTAMP_FILTER_PTP_V2_L4_EVENT:
+	case HWTSTAMP_FILTER_PTP_V2_L4_SYNC:
+	case HWTSTAMP_FILTER_PTP_V2_L4_DELAY_REQ:
+	/* Although these three are accepted only IPV4 packets will be
+	 * timestamped
+	 */
+	case HWTSTAMP_FILTER_PTP_V2_EVENT:
+	case HWTSTAMP_FILTER_PTP_V2_SYNC:
+	case HWTSTAMP_FILTER_PTP_V2_DELAY_REQ:
+		init->rx_filter = HWTSTAMP_FILTER_PTP_V2_L4_EVENT;
+		new_mode = MC_CMD_PTP_MODE_V2;
+		enable_wanted = true;
+		break;
+	case HWTSTAMP_FILTER_PTP_V2_L2_EVENT:
+	case HWTSTAMP_FILTER_PTP_V2_L2_SYNC:
+	case HWTSTAMP_FILTER_PTP_V2_L2_DELAY_REQ:
+		/* Non-IP timestamping not supported */
+		return -ERANGE;
+		break;
+	default:
+		return -ERANGE;
+	}
+
+	if (init->tx_type != HWTSTAMP_TX_OFF)
+		enable_wanted = true;
+
+	rc = efx_ptp_change_mode(efx, enable_wanted, new_mode);
+	if (rc != 0)
+		return rc;
+
+	efx->ptp_data->config = *init;
+
+	return 0;
+}
+
+int
+efx_ptp_get_ts_info(struct net_device *net_dev, struct ethtool_ts_info *ts_info)
+{
+	struct efx_nic *efx = netdev_priv(net_dev);
+	struct efx_ptp_data *ptp = efx->ptp_data;
+
+	if (!ptp)
+		return -EOPNOTSUPP;
+
+	ts_info->so_timestamping = (SOF_TIMESTAMPING_TX_HARDWARE |
+				    SOF_TIMESTAMPING_RX_HARDWARE |
+				    SOF_TIMESTAMPING_SYS_HARDWARE |
+				    SOF_TIMESTAMPING_RAW_HARDWARE);
+	ts_info->phc_index = ptp_clock_index(ptp->phc_clock);
+	ts_info->tx_types = 1 << HWTSTAMP_TX_OFF | 1 << HWTSTAMP_TX_ON;
+	ts_info->rx_filters = (1 << HWTSTAMP_FILTER_NONE |
+			       1 << HWTSTAMP_FILTER_PTP_V1_L4_EVENT |
+			       1 << HWTSTAMP_FILTER_PTP_V1_L4_SYNC |
+			       1 << HWTSTAMP_FILTER_PTP_V1_L4_DELAY_REQ |
+			       1 << HWTSTAMP_FILTER_PTP_V2_L4_EVENT |
+			       1 << HWTSTAMP_FILTER_PTP_V2_L4_SYNC |
+			       1 << HWTSTAMP_FILTER_PTP_V2_L4_DELAY_REQ |
+			       1 << HWTSTAMP_FILTER_PTP_V2_EVENT |
+			       1 << HWTSTAMP_FILTER_PTP_V2_SYNC |
+			       1 << HWTSTAMP_FILTER_PTP_V2_DELAY_REQ);
+	return 0;
+}
+
+int efx_ptp_ioctl(struct efx_nic *efx, struct ifreq *ifr, int cmd)
+{
+	struct hwtstamp_config config;
+	int rc;
+
+	/* Not a PTP enabled port */
+	if (!efx->ptp_data)
+		return -EOPNOTSUPP;
+
+	if (copy_from_user(&config, ifr->ifr_data, sizeof(config)))
+		return -EFAULT;
+
+	rc = efx_ptp_ts_init(efx, &config);
+	if (rc != 0)
+		return rc;
+
+	return copy_to_user(ifr->ifr_data, &config, sizeof(config))
+		? -EFAULT : 0;
+}
+
+static void ptp_event_failure(struct efx_nic *efx, int expected_frag_len)
+{
+	struct efx_ptp_data *ptp = efx->ptp_data;
+
+	netif_err(efx, hw, efx->net_dev,
+		"PTP unexpected event length: got %d expected %d\n",
+		ptp->evt_frag_idx, expected_frag_len);
+	ptp->reset_required = true;
+	queue_work(ptp->workwq, &ptp->work);
+}
+
+/* Process a completed receive event.  Put it on the event queue and
+ * start worker thread.  This is required because event and their
+ * correspoding packets may come in either order.
+ */
+static void ptp_event_rx(struct efx_nic *efx, struct efx_ptp_data *ptp)
+{
+	struct efx_ptp_event_rx *evt = NULL;
+
+	if (ptp->evt_frag_idx != 3) {
+		ptp_event_failure(efx, 3);
+		return;
+	}
+
+	spin_lock_bh(&ptp->evt_lock);
+	if (!list_empty(&ptp->evt_free_list)) {
+		evt = list_first_entry(&ptp->evt_free_list,
+				       struct efx_ptp_event_rx, link);
+		list_del(&evt->link);
+
+		evt->seq0 = EFX_QWORD_FIELD(ptp->evt_frags[2], MCDI_EVENT_DATA);
+		evt->seq1 = (EFX_QWORD_FIELD(ptp->evt_frags[2],
+					     MCDI_EVENT_SRC)        |
+			     (EFX_QWORD_FIELD(ptp->evt_frags[1],
+					      MCDI_EVENT_SRC) << 8) |
+			     (EFX_QWORD_FIELD(ptp->evt_frags[0],
+					      MCDI_EVENT_SRC) << 16));
+		evt->hwtimestamp = ktime_set(
+			EFX_QWORD_FIELD(ptp->evt_frags[0], MCDI_EVENT_DATA),
+			EFX_QWORD_FIELD(ptp->evt_frags[1], MCDI_EVENT_DATA));
+		evt->expiry = jiffies + msecs_to_jiffies(PKT_EVENT_LIFETIME_MS);
+		list_add_tail(&evt->link, &ptp->evt_list);
+
+		queue_work(ptp->workwq, &ptp->work);
+	} else {
+		netif_err(efx, rx_err, efx->net_dev, "No free PTP event");
+	}
+	spin_unlock_bh(&ptp->evt_lock);
+}
+
+static void ptp_event_fault(struct efx_nic *efx, struct efx_ptp_data *ptp)
+{
+	int code = EFX_QWORD_FIELD(ptp->evt_frags[0], MCDI_EVENT_DATA);
+	if (ptp->evt_frag_idx != 1) {
+		ptp_event_failure(efx, 1);
+		return;
+	}
+
+	netif_err(efx, hw, efx->net_dev, "PTP error %d\n", code);
+}
+
+static void ptp_event_pps(struct efx_nic *efx, struct efx_ptp_data *ptp)
+{
+	if (ptp->nic_ts_enabled)
+		queue_work(ptp->pps_workwq, &ptp->pps_work);
+}
+
+void efx_ptp_event(struct efx_nic *efx, efx_qword_t *ev)
+{
+	struct efx_ptp_data *ptp = efx->ptp_data;
+	int code = EFX_QWORD_FIELD(*ev, MCDI_EVENT_CODE);
+
+	if (!ptp->enabled)
+		return;
+
+	if (ptp->evt_frag_idx == 0) {
+		ptp->evt_code = code;
+	} else if (ptp->evt_code != code) {
+		netif_err(efx, hw, efx->net_dev,
+			  "PTP out of sequence event %d\n", code);
+		ptp->evt_frag_idx = 0;
+	}
+
+	ptp->evt_frags[ptp->evt_frag_idx++] = *ev;
+	if (!MCDI_EVENT_FIELD(*ev, CONT)) {
+		/* Process resulting event */
+		switch (code) {
+		case MCDI_EVENT_CODE_PTP_RX:
+			ptp_event_rx(efx, ptp);
+			break;
+		case MCDI_EVENT_CODE_PTP_FAULT:
+			ptp_event_fault(efx, ptp);
+			break;
+		case MCDI_EVENT_CODE_PTP_PPS:
+			ptp_event_pps(efx, ptp);
+			break;
+		default:
+			netif_err(efx, hw, efx->net_dev,
+				  "PTP unknown event %d\n", code);
+			break;
+		}
+		ptp->evt_frag_idx = 0;
+	} else if (MAX_EVENT_FRAGS == ptp->evt_frag_idx) {
+		netif_err(efx, hw, efx->net_dev,
+			  "PTP too many event fragments\n");
+		ptp->evt_frag_idx = 0;
+	}
+}
+
+static int efx_phc_adjfreq(struct ptp_clock_info *ptp, s32 delta)
+{
+	struct efx_ptp_data *ptp_data = container_of(ptp,
+						     struct efx_ptp_data,
+						     phc_clock_info);
+	struct efx_nic *efx = ptp_data->channel->efx;
+	u8 inadj[MC_CMD_PTP_IN_ADJUST_LEN];
+	s64 adjustment_ns;
+	int rc;
+
+	if (delta > MAX_PPB)
+		delta = MAX_PPB;
+	else if (delta < -MAX_PPB)
+		delta = -MAX_PPB;
+
+	/* Convert ppb to fixed point ns. */
+	adjustment_ns = (((s64)delta * PPB_SCALE_WORD) >>
+			 (PPB_EXTRA_BITS + MAX_PPB_BITS));
+
+	MCDI_SET_DWORD(inadj, PTP_IN_OP, MC_CMD_PTP_OP_ADJUST);
+	MCDI_SET_DWORD(inadj, PTP_IN_ADJUST_FREQ_LO, (u32)adjustment_ns);
+	MCDI_SET_DWORD(inadj, PTP_IN_ADJUST_FREQ_HI,
+		       (u32)(adjustment_ns >> 32));
+	MCDI_SET_DWORD(inadj, PTP_IN_ADJUST_SECONDS, 0);
+	MCDI_SET_DWORD(inadj, PTP_IN_ADJUST_NANOSECONDS, 0);
+	rc = efx_mcdi_rpc(efx, MC_CMD_PTP, inadj, sizeof(inadj),
+			  NULL, 0, NULL);
+	if (rc != 0)
+		return rc;
+
+	ptp_data->current_adjfreq = delta;
+	return 0;
+}
+
+static int efx_phc_adjtime(struct ptp_clock_info *ptp, s64 delta)
+{
+	struct efx_ptp_data *ptp_data = container_of(ptp,
+						     struct efx_ptp_data,
+						     phc_clock_info);
+	struct efx_nic *efx = ptp_data->channel->efx;
+	struct timespec delta_ts = ns_to_timespec(delta);
+	u8 inbuf[MC_CMD_PTP_IN_ADJUST_LEN];
+
+	MCDI_SET_DWORD(inbuf, PTP_IN_OP, MC_CMD_PTP_OP_ADJUST);
+	MCDI_SET_DWORD(inbuf, PTP_IN_ADJUST_FREQ_LO, 0);
+	MCDI_SET_DWORD(inbuf, PTP_IN_ADJUST_FREQ_HI, 0);
+	MCDI_SET_DWORD(inbuf, PTP_IN_ADJUST_SECONDS, (u32)delta_ts.tv_sec);
+	MCDI_SET_DWORD(inbuf, PTP_IN_ADJUST_NANOSECONDS, (u32)delta_ts.tv_nsec);
+	return efx_mcdi_rpc(efx, MC_CMD_PTP, inbuf, sizeof(inbuf),
+			    NULL, 0, NULL);
+}
+
+static int efx_phc_gettime(struct ptp_clock_info *ptp, struct timespec *ts)
+{
+	struct efx_ptp_data *ptp_data = container_of(ptp,
+						     struct efx_ptp_data,
+						     phc_clock_info);
+	struct efx_nic *efx = ptp_data->channel->efx;
+	u8 inbuf[MC_CMD_PTP_IN_READ_NIC_TIME_LEN];
+	u8 outbuf[MC_CMD_PTP_OUT_READ_NIC_TIME_LEN];
+	int rc;
+
+	MCDI_SET_DWORD(inbuf, PTP_IN_OP, MC_CMD_PTP_OP_READ_NIC_TIME);
+
+	rc = efx_mcdi_rpc(efx, MC_CMD_PTP, inbuf, sizeof(inbuf),
+			  outbuf, sizeof(outbuf), NULL);
+	if (rc != 0)
+		return rc;
+
+	ts->tv_sec = MCDI_DWORD(outbuf, PTP_OUT_READ_NIC_TIME_SECONDS);
+	ts->tv_nsec = MCDI_DWORD(outbuf, PTP_OUT_READ_NIC_TIME_NANOSECONDS);
+	return 0;
+}
+
+static int efx_phc_settime(struct ptp_clock_info *ptp,
+			   const struct timespec *e_ts)
+{
+	/* We must provide this function, but we cannot actually set the time */
+	return -EOPNOTSUPP;
+}
+
+static int efx_phc_enable(struct ptp_clock_info *ptp,
+			  struct ptp_clock_request *request,
+			  int enable)
+{
+	struct efx_ptp_data *ptp_data = container_of(ptp,
+						     struct efx_ptp_data,
+						     phc_clock_info);
+	if (request->type != PTP_CLK_REQ_EXTTS)
+		return -EOPNOTSUPP;
+
+	ptp_data->nic_ts_enabled = !!enable;
+	return 0;
+}
+
+static const struct efx_channel_type efx_ptp_channel_type = {
+	.handle_no_channel	= efx_ptp_handle_no_channel,
+	.pre_probe		= efx_ptp_probe_channel,
+	.post_remove		= efx_ptp_remove_channel,
+	.get_name		= efx_ptp_get_channel_name,
+	/* no copy operation; there is no need to reallocate this channel */
+	.receive_skb		= efx_ptp_rx,
+	.keep_eventq		= false,
+};
+
+void efx_ptp_probe(struct efx_nic *efx)
+{
+	/* Check whether PTP is implemented on this NIC.  The DISABLE
+	 * operation will succeed if and only if it is implemented.
+	 */
+	if (efx_ptp_disable(efx) == 0)
+		efx->extra_channel_type[EFX_EXTRA_CHANNEL_PTP] =
+			&efx_ptp_channel_type;
+}
diff --git a/drivers/net/ethernet/sfc/rx.c b/drivers/net/ethernet/sfc/rx.c
index 92699a0..8923863 100644
--- a/drivers/net/ethernet/sfc/rx.c
+++ b/drivers/net/ethernet/sfc/rx.c
@@ -573,7 +573,7 @@ static void efx_rx_deliver(struct efx_channel *channel,
 	/* Record the rx_queue */
 	skb_record_rx_queue(skb, channel->rx_queue.core_index);
 
-	/* Does the channel want to handle the skb */
+	/* Pass the packet up */
 	if (channel->type->receive_skb)
 		channel->type->receive_skb(channel, skb);
 	else
diff --git a/drivers/net/ethernet/sfc/siena.c b/drivers/net/ethernet/sfc/siena.c
index 6bafd21..84b41bf 100644
--- a/drivers/net/ethernet/sfc/siena.c
+++ b/drivers/net/ethernet/sfc/siena.c
@@ -335,6 +335,7 @@ static int siena_probe_nic(struct efx_nic *efx)
 		goto fail5;
 
 	efx_sriov_probe(efx);
+	efx_ptp_probe(efx);
 
 	return 0;
 
diff --git a/drivers/net/ethernet/sfc/tx.c b/drivers/net/ethernet/sfc/tx.c
index 9b225a7..66badf8 100644
--- a/drivers/net/ethernet/sfc/tx.c
+++ b/drivers/net/ethernet/sfc/tx.c
@@ -347,6 +347,12 @@ netdev_tx_t efx_hard_start_xmit(struct sk_buff *skb,
 
 	EFX_WARN_ON_PARANOID(!netif_device_present(net_dev));
 
+	/* PTP "event" packet */
+	if (unlikely(efx_xmit_with_hwtstamp(skb)) &&
+	    unlikely(efx_ptp_is_ptp_tx(efx, skb))) {
+		return efx_ptp_tx(efx, skb);
+	}
+
 	index = skb_get_queue_mapping(skb);
 	type = skb->ip_summed == CHECKSUM_PARTIAL ? EFX_TXQ_TYPE_OFFLOAD : 0;
 	if (index >= efx->n_tx_channels) {
-- 
1.7.7.6



-- 
Ben Hutchings, Staff Engineer, Solarflare
Not speaking for my employer; that's the marketing department's job.
They asked us to note that Solarflare product names are trademarked.

^ permalink raw reply related

* [PATCH net-next 3/7] sfc: Allow efx_mcdi_rpc to be called in two parts
From: Ben Hutchings @ 2012-07-18 18:20 UTC (permalink / raw)
  To: David Miller; +Cc: netdev, linux-net-drivers, Andrew Jackson, Richard Cochran
In-Reply-To: <1342635392.2617.52.camel@bwh-desktop.uk.solarflarecom.com>

For NIC/System time synchonisation efx_mcdi_rpc needs to be split in
efx_mcdi_rpc_start and efx_mcdi_rpc_finish operations.

Signed-off-by: Stuart Hodgson <smhodgson@solarflare.com>
Signed-off-by: Ben Hutchings <bhutchings@solarflare.com>
---
 drivers/net/ethernet/sfc/mcdi.c |   26 +++++++++++++++++++++++---
 drivers/net/ethernet/sfc/mcdi.h |    6 ++++++
 2 files changed, 29 insertions(+), 3 deletions(-)

diff --git a/drivers/net/ethernet/sfc/mcdi.c b/drivers/net/ethernet/sfc/mcdi.c
index fc5e7bb..294df4b 100644
--- a/drivers/net/ethernet/sfc/mcdi.c
+++ b/drivers/net/ethernet/sfc/mcdi.c
@@ -320,14 +320,20 @@ static void efx_mcdi_ev_cpl(struct efx_nic *efx, unsigned int seqno,
 		efx_mcdi_complete(mcdi);
 }
 
-/* Issue the given command by writing the data into the shared memory PDU,
- * ring the doorbell and wait for completion. Copyout the result. */
 int efx_mcdi_rpc(struct efx_nic *efx, unsigned cmd,
 		 const u8 *inbuf, size_t inlen, u8 *outbuf, size_t outlen,
 		 size_t *outlen_actual)
 {
+	efx_mcdi_rpc_start(efx, cmd, inbuf, inlen);
+	return efx_mcdi_rpc_finish(efx, cmd, inlen,
+				   outbuf, outlen, outlen_actual);
+}
+
+void efx_mcdi_rpc_start(struct efx_nic *efx, unsigned cmd, const u8 *inbuf,
+			size_t inlen)
+{
 	struct efx_mcdi_iface *mcdi = efx_mcdi(efx);
-	int rc;
+
 	BUG_ON(efx_nic_rev(efx) < EFX_REV_SIENA_A0);
 
 	efx_mcdi_acquire(mcdi);
@@ -338,6 +344,15 @@ int efx_mcdi_rpc(struct efx_nic *efx, unsigned cmd,
 	spin_unlock_bh(&mcdi->iface_lock);
 
 	efx_mcdi_copyin(efx, cmd, inbuf, inlen);
+}
+
+int efx_mcdi_rpc_finish(struct efx_nic *efx, unsigned cmd, size_t inlen,
+			u8 *outbuf, size_t outlen, size_t *outlen_actual)
+{
+	struct efx_mcdi_iface *mcdi = efx_mcdi(efx);
+	int rc;
+
+	BUG_ON(efx_nic_rev(efx) < EFX_REV_SIENA_A0);
 
 	if (mcdi->mode == MCDI_MODE_POLL)
 		rc = efx_mcdi_poll(efx);
@@ -563,6 +578,11 @@ void efx_mcdi_process_event(struct efx_channel *channel,
 	case MCDI_EVENT_CODE_FLR:
 		efx_sriov_flr(efx, MCDI_EVENT_FIELD(*event, FLR_VF));
 		break;
+	case MCDI_EVENT_CODE_PTP_RX:
+	case MCDI_EVENT_CODE_PTP_FAULT:
+	case MCDI_EVENT_CODE_PTP_PPS:
+		efx_ptp_event(efx, event);
+		break;
 
 	default:
 		netif_err(efx, hw, efx->net_dev, "Unknown MCDI event 0x%x\n",
diff --git a/drivers/net/ethernet/sfc/mcdi.h b/drivers/net/ethernet/sfc/mcdi.h
index 0bdf3e3..dc25caa 100644
--- a/drivers/net/ethernet/sfc/mcdi.h
+++ b/drivers/net/ethernet/sfc/mcdi.h
@@ -71,6 +71,12 @@ extern int efx_mcdi_rpc(struct efx_nic *efx, unsigned cmd, const u8 *inbuf,
 			size_t inlen, u8 *outbuf, size_t outlen,
 			size_t *outlen_actual);
 
+extern void efx_mcdi_rpc_start(struct efx_nic *efx, unsigned cmd,
+			       const u8 *inbuf, size_t inlen);
+extern int efx_mcdi_rpc_finish(struct efx_nic *efx, unsigned cmd, size_t inlen,
+			       u8 *outbuf, size_t outlen,
+			       size_t *outlen_actual);
+
 extern int efx_mcdi_poll_reboot(struct efx_nic *efx);
 extern void efx_mcdi_mode_poll(struct efx_nic *efx);
 extern void efx_mcdi_mode_event(struct efx_nic *efx);
-- 
1.7.7.6



-- 
Ben Hutchings, Staff Engineer, Solarflare
Not speaking for my employer; that's the marketing department's job.
They asked us to note that Solarflare product names are trademarked.

^ permalink raw reply related

* [PATCH net-next 2/7] sfc: Add channel specific receive_skb handler and post_remove callback
From: Ben Hutchings @ 2012-07-18 18:20 UTC (permalink / raw)
  To: David Miller; +Cc: netdev, linux-net-drivers, Andrew Jackson, Richard Cochran
In-Reply-To: <1342635392.2617.52.camel@bwh-desktop.uk.solarflarecom.com>

Allows an extra channel to override the standard receive_skb handler
and also for extra non generic operations to be performed on remove.

Also set default rx strategy so only skbs can be delivered to the
PTP receive function.

Signed-off-by: Stuart Hodgson <smhodgson@solarflare.com>
Signed-off-by: Ben Hutchings <bhutchings@solarflare.com>
---
 drivers/net/ethernet/sfc/efx.c        |    6 ++++++
 drivers/net/ethernet/sfc/efx.h        |    1 +
 drivers/net/ethernet/sfc/net_driver.h |    3 +++
 drivers/net/ethernet/sfc/rx.c         |   15 ++++++++++++---
 4 files changed, 22 insertions(+), 3 deletions(-)

diff --git a/drivers/net/ethernet/sfc/efx.c b/drivers/net/ethernet/sfc/efx.c
index 3ae6774..1c53d4b 100644
--- a/drivers/net/ethernet/sfc/efx.c
+++ b/drivers/net/ethernet/sfc/efx.c
@@ -714,6 +714,7 @@ static void efx_remove_channel(struct efx_channel *channel)
 	efx_for_each_possible_channel_tx_queue(tx_queue, channel)
 		efx_remove_tx_queue(tx_queue);
 	efx_remove_eventq(channel);
+	channel->type->post_remove(channel);
 }
 
 static void efx_remove_channels(struct efx_nic *efx)
@@ -828,6 +829,7 @@ void efx_schedule_slow_fill(struct efx_rx_queue *rx_queue)
 
 static const struct efx_channel_type efx_default_channel_type = {
 	.pre_probe		= efx_channel_dummy_op_int,
+	.post_remove		= efx_channel_dummy_op_void,
 	.get_name		= efx_get_channel_name,
 	.copy			= efx_copy_channel,
 	.keep_eventq		= false,
@@ -838,6 +840,10 @@ int efx_channel_dummy_op_int(struct efx_channel *channel)
 	return 0;
 }
 
+void efx_channel_dummy_op_void(struct efx_channel *channel)
+{
+}
+
 /**************************************************************************
  *
  * Port handling
diff --git a/drivers/net/ethernet/sfc/efx.h b/drivers/net/ethernet/sfc/efx.h
index be8f915..121441f 100644
--- a/drivers/net/ethernet/sfc/efx.h
+++ b/drivers/net/ethernet/sfc/efx.h
@@ -96,6 +96,7 @@ static inline void efx_filter_rfs_expire(struct efx_channel *channel) {}
 
 /* Channels */
 extern int efx_channel_dummy_op_int(struct efx_channel *channel);
+extern void efx_channel_dummy_op_void(struct efx_channel *channel);
 extern void efx_process_channel_now(struct efx_channel *channel);
 extern int
 efx_realloc_channels(struct efx_nic *efx, u32 rxq_entries, u32 txq_entries);
diff --git a/drivers/net/ethernet/sfc/net_driver.h b/drivers/net/ethernet/sfc/net_driver.h
index 3894593..9913e32 100644
--- a/drivers/net/ethernet/sfc/net_driver.h
+++ b/drivers/net/ethernet/sfc/net_driver.h
@@ -393,14 +393,17 @@ struct efx_channel {
  * @get_name: Generate the channel's name (used for its IRQ handler)
  * @copy: Copy the channel state prior to reallocation.  May be %NULL if
  *	reallocation is not supported.
+ * @receive_skb: Handle an skb ready to be passed to netif_receive_skb()
  * @keep_eventq: Flag for whether event queue should be kept initialised
  *	while the device is stopped
  */
 struct efx_channel_type {
 	void (*handle_no_channel)(struct efx_nic *);
 	int (*pre_probe)(struct efx_channel *);
+	void (*post_remove)(struct efx_channel *);
 	void (*get_name)(struct efx_channel *, char *buf, size_t len);
 	struct efx_channel *(*copy)(const struct efx_channel *);
+	void (*receive_skb)(struct efx_channel *, struct sk_buff *);
 	bool keep_eventq;
 };
 
diff --git a/drivers/net/ethernet/sfc/rx.c b/drivers/net/ethernet/sfc/rx.c
index 3f598bf..92699a0 100644
--- a/drivers/net/ethernet/sfc/rx.c
+++ b/drivers/net/ethernet/sfc/rx.c
@@ -573,8 +573,11 @@ static void efx_rx_deliver(struct efx_channel *channel,
 	/* Record the rx_queue */
 	skb_record_rx_queue(skb, channel->rx_queue.core_index);
 
-	/* Pass the packet up */
-	netif_receive_skb(skb);
+	/* Does the channel want to handle the skb */
+	if (channel->type->receive_skb)
+		channel->type->receive_skb(channel, skb);
+	else
+		netif_receive_skb(skb);
 
 	/* Update allocation strategy method */
 	channel->rx_alloc_level += RX_ALLOC_FACTOR_SKB;
@@ -616,7 +619,8 @@ void __efx_rx_packet(struct efx_channel *channel, struct efx_rx_buffer *rx_buf)
 	if (unlikely(!(efx->net_dev->features & NETIF_F_RXCSUM)))
 		rx_buf->flags &= ~EFX_RX_PKT_CSUMMED;
 
-	if (likely(rx_buf->flags & (EFX_RX_BUF_PAGE | EFX_RX_PKT_CSUMMED)))
+	if (likely(rx_buf->flags & (EFX_RX_BUF_PAGE | EFX_RX_PKT_CSUMMED)) &&
+	    !channel->type->receive_skb)
 		efx_rx_packet_gro(channel, rx_buf, eh);
 	else
 		efx_rx_deliver(channel, rx_buf);
@@ -626,6 +630,11 @@ void efx_rx_strategy(struct efx_channel *channel)
 {
 	enum efx_rx_alloc_method method = rx_alloc_method;
 
+	if (channel->type->receive_skb) {
+		channel->rx_alloc_push_pages = false;
+		return;
+	}
+
 	/* Only makes sense to use page based allocation if GRO is enabled */
 	if (!(channel->efx->net_dev->features & NETIF_F_GRO)) {
 		method = RX_ALLOC_METHOD_SKB;
-- 
1.7.7.6

^ permalink raw reply related

* [PATCH net-next 1/7] sfc: Add explicit RX queue flag to channel
From: Ben Hutchings @ 2012-07-18 18:19 UTC (permalink / raw)
  To: David Miller; +Cc: netdev, linux-net-drivers, Andrew Jackson, Richard Cochran
In-Reply-To: <1342635392.2617.52.camel@bwh-desktop.uk.solarflarecom.com>

The PTP channel will have its own RX queue even though it's not
a regular traffic channel.

Original work by Ben Hutchings <bhutchings@solarflare.com>

Signed-off-by: Stuart Hodgson <smhodgson@solarflare.com>
Signed-off-by: Ben Hutchings <bhutchings@solarflare.com>
---
 drivers/net/ethernet/sfc/efx.c        |    8 +++++++-
 drivers/net/ethernet/sfc/net_driver.h |    5 ++++-
 drivers/net/ethernet/sfc/rx.c         |    7 +++++--
 3 files changed, 16 insertions(+), 4 deletions(-)

diff --git a/drivers/net/ethernet/sfc/efx.c b/drivers/net/ethernet/sfc/efx.c
index 70554a1..3ae6774 100644
--- a/drivers/net/ethernet/sfc/efx.c
+++ b/drivers/net/ethernet/sfc/efx.c
@@ -1422,10 +1422,16 @@ static void efx_set_channels(struct efx_nic *efx)
 	efx->tx_channel_offset =
 		separate_tx_channels ? efx->n_channels - efx->n_tx_channels : 0;
 
-	/* We need to adjust the TX queue numbers if we have separate
+	/* We need to mark which channels really have RX and TX
+	 * queues, and adjust the TX queue numbers if we have separate
 	 * RX-only and TX-only channels.
 	 */
 	efx_for_each_channel(channel, efx) {
+		if (channel->channel < efx->n_rx_channels)
+			channel->rx_queue.core_index = channel->channel;
+		else
+			channel->rx_queue.core_index = -1;
+
 		efx_for_each_channel_tx_queue(tx_queue, channel)
 			tx_queue->queue -= (efx->tx_channel_offset *
 					    EFX_TXQ_TYPES);
diff --git a/drivers/net/ethernet/sfc/net_driver.h b/drivers/net/ethernet/sfc/net_driver.h
index 55be2fd..3894593 100644
--- a/drivers/net/ethernet/sfc/net_driver.h
+++ b/drivers/net/ethernet/sfc/net_driver.h
@@ -242,6 +242,8 @@ struct efx_rx_page_state {
 /**
  * struct efx_rx_queue - An Efx RX queue
  * @efx: The associated Efx NIC
+ * @core_index:  Index of network core RX queue.  Will be >= 0 iff this
+ *	is associated with a real RX queue.
  * @buffer: The software buffer ring
  * @rxd: The hardware descriptor ring
  * @ptr_mask: The size of the ring minus 1.
@@ -263,6 +265,7 @@ struct efx_rx_page_state {
  */
 struct efx_rx_queue {
 	struct efx_nic *efx;
+	int core_index;
 	struct efx_rx_buffer *buffer;
 	struct efx_special_buffer rxd;
 	unsigned int ptr_mask;
@@ -1044,7 +1047,7 @@ static inline bool efx_tx_queue_used(struct efx_tx_queue *tx_queue)
 
 static inline bool efx_channel_has_rx_queue(struct efx_channel *channel)
 {
-	return channel->channel < channel->efx->n_rx_channels;
+	return channel->rx_queue.core_index >= 0;
 }
 
 static inline struct efx_rx_queue *
diff --git a/drivers/net/ethernet/sfc/rx.c b/drivers/net/ethernet/sfc/rx.c
index 6d1c6cf..3f598bf 100644
--- a/drivers/net/ethernet/sfc/rx.c
+++ b/drivers/net/ethernet/sfc/rx.c
@@ -478,7 +478,7 @@ static void efx_rx_packet_gro(struct efx_channel *channel,
 		skb->ip_summed = ((rx_buf->flags & EFX_RX_PKT_CSUMMED) ?
 				  CHECKSUM_UNNECESSARY : CHECKSUM_NONE);
 
-		skb_record_rx_queue(skb, channel->channel);
+		skb_record_rx_queue(skb, channel->rx_queue.core_index);
 
 		gro_result = napi_gro_frags(napi);
 	} else {
@@ -570,6 +570,9 @@ static void efx_rx_deliver(struct efx_channel *channel,
 	/* Set the SKB flags */
 	skb_checksum_none_assert(skb);
 
+	/* Record the rx_queue */
+	skb_record_rx_queue(skb, channel->rx_queue.core_index);
+
 	/* Pass the packet up */
 	netif_receive_skb(skb);
 
@@ -607,7 +610,7 @@ void __efx_rx_packet(struct efx_channel *channel, struct efx_rx_buffer *rx_buf)
 		 * at the ethernet header */
 		skb->protocol = eth_type_trans(skb, efx->net_dev);
 
-		skb_record_rx_queue(skb, channel->channel);
+		skb_record_rx_queue(skb, channel->rx_queue.core_index);
 	}
 
 	if (unlikely(!(efx->net_dev->features & NETIF_F_RXCSUM)))
-- 
1.7.7.6



-- 
Ben Hutchings, Staff Engineer, Solarflare
Not speaking for my employer; that's the marketing department's job.
They asked us to note that Solarflare product names are trademarked.

^ permalink raw reply related

* Re: [patch net-next] team: refine IFF_XMIT_DST_RELEASE capability
From: Eric Dumazet @ 2012-07-18 18:17 UTC (permalink / raw)
  To: Jiri Pirko; +Cc: netdev, davem, edumazet
In-Reply-To: <1342633178-1411-1-git-send-email-jiri@resnulli.us>

On Wed, 2012-07-18 at 19:39 +0200, Jiri Pirko wrote:
> Cloned patch of Eric Dumazet for bonding.
> 
> Some workloads greatly benefit of IFF_XMIT_DST_RELEASE capability
> on output net device, avoiding dirtying dst refcount.
> 
> team currently disables IFF_XMIT_DST_RELEASE unconditionally.
> 
> If all ports have the IFF_XMIT_DST_RELEASE bit set, then
> team dev can also have it in its priv_flags.
> 
> Signed-off-by: Jiri Pirko <jiri@resnulli.us>
> ---
>  drivers/net/team/team.c |    5 +++++
>  1 file changed, 5 insertions(+)

Acked-by: Eric Dumazet <edumazet@google.com>

Thanks

^ permalink raw reply

* [PATCH] sunrpc: clnt: Add missing braces
From: Joe Perches @ 2012-07-18 18:17 UTC (permalink / raw)
  To: Trond Myklebust, J. Bruce Fields, Chuck Lever
  Cc: David S. Miller, linux-nfs, netdev, linux-kernel

Add a missing set of braces that commit 4e0038b6b24
("SUNRPC: Move clnt->cl_server into struct rpc_xprt")
forgot.

Signed-off-by: Joe Perches <joe@perches.com>
---
 net/sunrpc/clnt.c |    3 ++-
 1 files changed, 2 insertions(+), 1 deletions(-)

diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c
index f56f045..aaf70aa 100644
--- a/net/sunrpc/clnt.c
+++ b/net/sunrpc/clnt.c
@@ -1844,12 +1844,13 @@ call_timeout(struct rpc_task *task)
 		return;
 	}
 	if (RPC_IS_SOFT(task)) {
-		if (clnt->cl_chatty)
+		if (clnt->cl_chatty) {
 			rcu_read_lock();
 			printk(KERN_NOTICE "%s: server %s not responding, timed out\n",
 				clnt->cl_protname,
 				rcu_dereference(clnt->cl_xprt)->servername);
 			rcu_read_unlock();
+		}
 		if (task->tk_flags & RPC_TASK_TIMEOUT)
 			rpc_exit(task, -ETIMEDOUT);
 		else

^ permalink raw reply related


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox