Netdev List
 help / color / mirror / Atom feed
* [PATCH 5/7] ipvs: Complete IPv6 fragment handling for IPVS
From: Simon Horman @ 2012-09-28  2:55 UTC (permalink / raw)
  To: Pablo Neira Ayuso
  Cc: lvs-devel, netdev, netfilter-devel, Wensong Zhang,
	Julian Anastasov, Hans Schillstrom, Hans Schillstrom,
	Jesper Dangaard Brouer, Simon Horman
In-Reply-To: <1348800904-23902-1-git-send-email-horms@verge.net.au>

From: Jesper Dangaard Brouer <brouer@redhat.com>

IPVS now supports fragmented packets, with support from nf_conntrack_reasm.c

Based on patch from: Hans Schillstrom.

IPVS does like conntrack, i.e. it uses skb->nfct_reasm
(i.e. when all fragments are collected, nf_ct_frag6_output()
starts a "re-play" of all fragments into the interrupted
PREROUTING chain at prio -399 (NF_IP6_PRI_CONNTRACK_DEFRAG+1)
with nfct_reasm pointing to the assembled packet.)

Notice, module nf_defrag_ipv6 must be loaded for this to work.
Report unhandled fragments, and recommend user to load nf_defrag_ipv6.

To handle fw-mark for fragments, add a new IPVS hook into the prerouting
chain at prio -99 (NF_IP6_PRI_NAT_DST+1) to catch fragments, and copy
fw-mark info from the first packet with an upper layer header.

IPv6 fragment handling should be the last thing on the IPVS IPv6
missing support list.

Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com>
Signed-off-by: Hans Schillstrom <hans@schillstrom.com>
Acked-by: Julian Anastasov <ja@ssi.bg>
Signed-off-by: Simon Horman <horms@verge.net.au>
---
 include/net/ip_vs.h             |   39 ++++++++++++-
 net/netfilter/ipvs/Kconfig      |    6 +-
 net/netfilter/ipvs/ip_vs_conn.c |    2 +-
 net/netfilter/ipvs/ip_vs_core.c |  117 ++++++++++++++++++++++++++++++++-------
 net/netfilter/ipvs/ip_vs_xmit.c |   36 +++++++++---
 5 files changed, 164 insertions(+), 36 deletions(-)

diff --git a/include/net/ip_vs.h b/include/net/ip_vs.h
index 29265bf..98806b6 100644
--- a/include/net/ip_vs.h
+++ b/include/net/ip_vs.h
@@ -109,6 +109,7 @@ extern int ip_vs_conn_tab_size;
 struct ip_vs_iphdr {
 	__u32 len;	/* IPv4 simply where L4 starts
 			   IPv6 where L4 Transport Header starts */
+	__u32 thoff_reasm; /* Transport Header Offset in nfct_reasm skb */
 	__u16 fragoffs; /* IPv6 fragment offset, 0 if first frag (or not frag)*/
 	__s16 protocol;
 	__s32 flags;
@@ -116,6 +117,35 @@ struct ip_vs_iphdr {
 	union nf_inet_addr daddr;
 };
 
+/* Dependency to module: nf_defrag_ipv6 */
+#if defined(CONFIG_NF_DEFRAG_IPV6) || defined(CONFIG_NF_DEFRAG_IPV6_MODULE)
+static inline struct sk_buff *skb_nfct_reasm(const struct sk_buff *skb)
+{
+	return skb->nfct_reasm;
+}
+static inline void *frag_safe_skb_hp(const struct sk_buff *skb, int offset,
+				      int len, void *buffer,
+				      const struct ip_vs_iphdr *ipvsh)
+{
+	if (unlikely(ipvsh->fragoffs && skb_nfct_reasm(skb)))
+		return skb_header_pointer(skb_nfct_reasm(skb),
+					  ipvsh->thoff_reasm, len, buffer);
+
+	return skb_header_pointer(skb, offset, len, buffer);
+}
+#else
+static inline struct sk_buff *skb_nfct_reasm(const struct sk_buff *skb)
+{
+	return NULL;
+}
+static inline void *frag_safe_skb_hp(const struct sk_buff *skb, int offset,
+				      int len, void *buffer,
+				      const struct ip_vs_iphdr *ipvsh)
+{
+	return skb_header_pointer(skb, offset, len, buffer);
+}
+#endif
+
 static inline void
 ip_vs_fill_ip4hdr(const void *nh, struct ip_vs_iphdr *iphdr)
 {
@@ -141,12 +171,19 @@ ip_vs_fill_iph_skb(int af, const struct sk_buff *skb, struct ip_vs_iphdr *iphdr)
 			(struct ipv6hdr *)skb_network_header(skb);
 		iphdr->saddr.in6 = iph->saddr;
 		iphdr->daddr.in6 = iph->daddr;
-		/* ipv6_find_hdr() updates len, flags */
+		/* ipv6_find_hdr() updates len, flags, thoff_reasm */
+		iphdr->thoff_reasm = 0;
 		iphdr->len	 = 0;
 		iphdr->flags	 = 0;
 		iphdr->protocol  = ipv6_find_hdr(skb, &iphdr->len, -1,
 						 &iphdr->fragoffs,
 						 &iphdr->flags);
+		/* get proto from re-assembled packet and it's offset */
+		if (skb_nfct_reasm(skb))
+			iphdr->protocol = ipv6_find_hdr(skb_nfct_reasm(skb),
+							&iphdr->thoff_reasm,
+							-1, NULL, NULL);
+
 	} else
 #endif
 	{
diff --git a/net/netfilter/ipvs/Kconfig b/net/netfilter/ipvs/Kconfig
index a97ae53..0c3b167 100644
--- a/net/netfilter/ipvs/Kconfig
+++ b/net/netfilter/ipvs/Kconfig
@@ -30,11 +30,9 @@ config	IP_VS_IPV6
 	depends on IPV6 = y || IP_VS = IPV6
 	select IP6_NF_IPTABLES
 	---help---
-	  Add IPv6 support to IPVS. This is incomplete and might be dangerous.
+	  Add IPv6 support to IPVS.
 
-	  See http://www.mindbasket.com/ipvs for more information.
-
-	  Say N if unsure.
+	  Say Y if unsure.
 
 config	IP_VS_DEBUG
 	bool "IP virtual server debugging"
diff --git a/net/netfilter/ipvs/ip_vs_conn.c b/net/netfilter/ipvs/ip_vs_conn.c
index 1548df9..d6c1c26 100644
--- a/net/netfilter/ipvs/ip_vs_conn.c
+++ b/net/netfilter/ipvs/ip_vs_conn.c
@@ -314,7 +314,7 @@ ip_vs_conn_fill_param_proto(int af, const struct sk_buff *skb,
 	__be16 _ports[2], *pptr;
 	struct net *net = skb_net(skb);
 
-	pptr = skb_header_pointer(skb, proto_off, sizeof(_ports), _ports);
+	pptr = frag_safe_skb_hp(skb, proto_off, sizeof(_ports), _ports, iph);
 	if (pptr == NULL)
 		return 1;
 
diff --git a/net/netfilter/ipvs/ip_vs_core.c b/net/netfilter/ipvs/ip_vs_core.c
index 19c0842..19b89ff 100644
--- a/net/netfilter/ipvs/ip_vs_core.c
+++ b/net/netfilter/ipvs/ip_vs_core.c
@@ -402,8 +402,12 @@ ip_vs_schedule(struct ip_vs_service *svc, struct sk_buff *skb,
 	unsigned int flags;
 
 	*ignored = 1;
+
+	/*
+	 * IPv6 frags, only the first hit here.
+	 */
 	ip_vs_fill_iph_skb(svc->af, skb, &iph);
-	pptr = skb_header_pointer(skb, iph.len, sizeof(_ports), _ports);
+	pptr = frag_safe_skb_hp(skb, iph.len, sizeof(_ports), _ports, &iph);
 	if (pptr == NULL)
 		return NULL;
 
@@ -507,8 +511,7 @@ int ip_vs_leave(struct ip_vs_service *svc, struct sk_buff *skb,
 #endif
 
 	ip_vs_fill_iph_skb(svc->af, skb, &iph);
-
-	pptr = skb_header_pointer(skb, iph.len, sizeof(_ports), _ports);
+	pptr = frag_safe_skb_hp(skb, iph.len, sizeof(_ports), _ports, &iph);
 	if (pptr == NULL) {
 		ip_vs_service_put(svc);
 		return NF_DROP;
@@ -654,14 +657,6 @@ static inline int ip_vs_gather_frags(struct sk_buff *skb, u_int32_t user)
 	return err;
 }
 
-#ifdef CONFIG_IP_VS_IPV6
-static inline int ip_vs_gather_frags_v6(struct sk_buff *skb, u_int32_t user)
-{
-	/* TODO IPv6: Find out what to do here for IPv6 */
-	return 0;
-}
-#endif
-
 static int ip_vs_route_me_harder(int af, struct sk_buff *skb)
 {
 #ifdef CONFIG_IP_VS_IPV6
@@ -939,8 +934,7 @@ static int ip_vs_out_icmp_v6(struct sk_buff *skb, int *related,
 	ip_vs_fill_iph_skb(AF_INET6, skb, ipvsh);
 
 	*related = 1;
-
-	ic = skb_header_pointer(skb, ipvsh->len, sizeof(_icmph), &_icmph);
+	ic = frag_safe_skb_hp(skb, ipvsh->len, sizeof(_icmph), &_icmph, ipvsh);
 	if (ic == NULL)
 		return NF_DROP;
 
@@ -955,6 +949,11 @@ static int ip_vs_out_icmp_v6(struct sk_buff *skb, int *related,
 		*related = 0;
 		return NF_ACCEPT;
 	}
+	/* Fragment header that is before ICMP header tells us that:
+	 * it's not an error message since they can't be fragmented.
+	 */
+	if (ipvsh->flags & IP6T_FH_F_FRAG)
+		return NF_DROP;
 
 	IP_VS_DBG(8, "Outgoing ICMPv6 (%d,%d) %pI6c->%pI6c\n",
 		  ic->icmp6_type, ntohs(icmpv6_id(ic)),
@@ -1117,6 +1116,12 @@ ip_vs_out(unsigned int hooknum, struct sk_buff *skb, int af)
 	ip_vs_fill_iph_skb(af, skb, &iph);
 #ifdef CONFIG_IP_VS_IPV6
 	if (af == AF_INET6) {
+		if (!iph.fragoffs && skb_nfct_reasm(skb)) {
+			struct sk_buff *reasm = skb_nfct_reasm(skb);
+			/* Save fw mark for coming frags */
+			reasm->ipvs_property = 1;
+			reasm->mark = skb->mark;
+		}
 		if (unlikely(iph.protocol == IPPROTO_ICMPV6)) {
 			int related;
 			int verdict = ip_vs_out_icmp_v6(skb, &related,
@@ -1124,7 +1129,6 @@ ip_vs_out(unsigned int hooknum, struct sk_buff *skb, int af)
 
 			if (related)
 				return verdict;
-			ip_vs_fill_iph_skb(af, skb, &iph);
 		}
 	} else
 #endif
@@ -1134,7 +1138,6 @@ ip_vs_out(unsigned int hooknum, struct sk_buff *skb, int af)
 
 			if (related)
 				return verdict;
-			ip_vs_fill_ip4hdr(skb_network_header(skb), &iph);
 		}
 
 	pd = ip_vs_proto_data_get(net, iph.protocol);
@@ -1167,8 +1170,8 @@ ip_vs_out(unsigned int hooknum, struct sk_buff *skb, int af)
 	     pp->protocol == IPPROTO_SCTP)) {
 		__be16 _ports[2], *pptr;
 
-		pptr = skb_header_pointer(skb, iph.len,
-					  sizeof(_ports), _ports);
+		pptr = frag_safe_skb_hp(skb, iph.len,
+					 sizeof(_ports), _ports, &iph);
 		if (pptr == NULL)
 			return NF_ACCEPT;	/* Not for me */
 		if (ip_vs_lookup_real_service(net, af, iph.protocol,
@@ -1468,7 +1471,7 @@ ip_vs_in_icmp_v6(struct sk_buff *skb, int *related, unsigned int hooknum)
 
 	*related = 1;
 
-	ic = skb_header_pointer(skb, iph->len, sizeof(_icmph), &_icmph);
+	ic = frag_safe_skb_hp(skb, iph->len, sizeof(_icmph), &_icmph, iph);
 	if (ic == NULL)
 		return NF_DROP;
 
@@ -1483,6 +1486,11 @@ ip_vs_in_icmp_v6(struct sk_buff *skb, int *related, unsigned int hooknum)
 		*related = 0;
 		return NF_ACCEPT;
 	}
+	/* Fragment header that is before ICMP header tells us that:
+	 * it's not an error message since they can't be fragmented.
+	 */
+	if (iph->flags & IP6T_FH_F_FRAG)
+		return NF_DROP;
 
 	IP_VS_DBG(8, "Incoming ICMPv6 (%d,%d) %pI6c->%pI6c\n",
 		  ic->icmp6_type, ntohs(icmpv6_id(ic)),
@@ -1514,10 +1522,20 @@ ip_vs_in_icmp_v6(struct sk_buff *skb, int *related, unsigned int hooknum)
 	IP_VS_DBG_PKT(11, AF_INET6, pp, skb, offs_ciph,
 		      "Checking incoming ICMPv6 for");
 
-	/* The embedded headers contain source and dest in reverse order */
-	cp = pp->conn_in_get(AF_INET6, skb, &ciph, ciph.len, 1);
+	/* The embedded headers contain source and dest in reverse order
+	 * if not from localhost
+	 */
+	cp = pp->conn_in_get(AF_INET6, skb, &ciph, ciph.len,
+			     (hooknum == NF_INET_LOCAL_OUT) ? 0 : 1);
+
 	if (!cp)
 		return NF_ACCEPT;
+	/* VS/TUN, VS/DR and LOCALNODE just let it go */
+	if ((hooknum == NF_INET_LOCAL_OUT) &&
+	    (IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ)) {
+		__ip_vs_conn_put(cp);
+		return NF_ACCEPT;
+	}
 
 	/* do the statistics and put it back */
 	ip_vs_in_stats(cp, skb);
@@ -1590,6 +1608,12 @@ ip_vs_in(unsigned int hooknum, struct sk_buff *skb, int af)
 
 #ifdef CONFIG_IP_VS_IPV6
 	if (af == AF_INET6) {
+		if (!iph.fragoffs && skb_nfct_reasm(skb)) {
+			struct sk_buff *reasm = skb_nfct_reasm(skb);
+			/* Save fw mark for coming frags. */
+			reasm->ipvs_property = 1;
+			reasm->mark = skb->mark;
+		}
 		if (unlikely(iph.protocol == IPPROTO_ICMPV6)) {
 			int related;
 			int verdict = ip_vs_in_icmp_v6(skb, &related, hooknum);
@@ -1614,13 +1638,16 @@ ip_vs_in(unsigned int hooknum, struct sk_buff *skb, int af)
 	pp = pd->pp;
 	/*
 	 * Check if the packet belongs to an existing connection entry
-	 * Only sched first IPv6 fragment.
 	 */
 	cp = pp->conn_in_get(af, skb, &iph, iph.len, 0);
 
 	if (unlikely(!cp) && !iph.fragoffs) {
+		/* No (second) fragments need to enter here, as nf_defrag_ipv6
+		 * replayed fragment zero will already have created the cp
+		 */
 		int v;
 
+		/* Schedule and create new connection entry into &cp */
 		if (!pp->conn_schedule(af, skb, pd, &v, &cp))
 			return v;
 	}
@@ -1629,6 +1656,14 @@ ip_vs_in(unsigned int hooknum, struct sk_buff *skb, int af)
 		/* sorry, all this trouble for a no-hit :) */
 		IP_VS_DBG_PKT(12, af, pp, skb, 0,
 			      "ip_vs_in: packet continues traversal as normal");
+		if (iph.fragoffs && !skb_nfct_reasm(skb)) {
+			/* Fragment that couldn't be mapped to a conn entry
+			 * and don't have any pointer to a reasm skb
+			 * is missing module nf_defrag_ipv6
+			 */
+			IP_VS_DBG_RL("Unhandled frag, load nf_defrag_ipv6\n");
+			IP_VS_DBG_PKT(7, af, pp, skb, 0, "unhandled fragment");
+		}
 		return NF_ACCEPT;
 	}
 
@@ -1713,6 +1748,38 @@ ip_vs_local_request4(unsigned int hooknum, struct sk_buff *skb,
 #ifdef CONFIG_IP_VS_IPV6
 
 /*
+ * AF_INET6 fragment handling
+ * Copy info from first fragment, to the rest of them.
+ */
+static unsigned int
+ip_vs_preroute_frag6(unsigned int hooknum, struct sk_buff *skb,
+		     const struct net_device *in,
+		     const struct net_device *out,
+		     int (*okfn)(struct sk_buff *))
+{
+	struct sk_buff *reasm = skb_nfct_reasm(skb);
+	struct net *net;
+
+	/* Skip if not a "replay" from nf_ct_frag6_output or first fragment.
+	 * ipvs_property is set when checking first fragment
+	 * in ip_vs_in() and ip_vs_out().
+	 */
+	if (reasm)
+		IP_VS_DBG(2, "Fragment recv prop:%d\n", reasm->ipvs_property);
+	if (!reasm || !reasm->ipvs_property)
+		return NF_ACCEPT;
+
+	net = skb_net(skb);
+	if (!net_ipvs(net)->enable)
+		return NF_ACCEPT;
+
+	/* Copy stored fw mark, saved in ip_vs_{in,out} */
+	skb->mark = reasm->mark;
+
+	return NF_ACCEPT;
+}
+
+/*
  *	AF_INET6 handler in NF_INET_LOCAL_IN chain
  *	Schedule and forward packets from remote clients
  */
@@ -1851,6 +1918,14 @@ static struct nf_hook_ops ip_vs_ops[] __read_mostly = {
 		.priority	= 100,
 	},
 #ifdef CONFIG_IP_VS_IPV6
+	/* After mangle & nat fetch 2:nd fragment and following */
+	{
+		.hook		= ip_vs_preroute_frag6,
+		.owner		= THIS_MODULE,
+		.pf		= NFPROTO_IPV6,
+		.hooknum	= NF_INET_PRE_ROUTING,
+		.priority	= NF_IP6_PRI_NAT_DST + 1,
+	},
 	/* After packet filtering, change source only for VS/NAT */
 	{
 		.hook		= ip_vs_reply6,
diff --git a/net/netfilter/ipvs/ip_vs_xmit.c b/net/netfilter/ipvs/ip_vs_xmit.c
index 428de75..a8b75fc 100644
--- a/net/netfilter/ipvs/ip_vs_xmit.c
+++ b/net/netfilter/ipvs/ip_vs_xmit.c
@@ -496,13 +496,15 @@ ip_vs_bypass_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
 		     struct ip_vs_protocol *pp)
 {
 	struct rt6_info *rt;			/* Route to the other host */
-	struct ipv6hdr  *iph = ipv6_hdr(skb);
+	struct ip_vs_iphdr iph;
 	int    mtu;
 
 	EnterFunction(10);
+	ip_vs_fill_iph_skb(cp->af, skb, &iph);
 
-	if (!(rt = __ip_vs_get_out_rt_v6(skb, NULL, &iph->daddr, NULL, 0,
-					 IP_VS_RT_MODE_NON_LOCAL)))
+	rt = __ip_vs_get_out_rt_v6(skb, NULL, &iph.daddr.in6, NULL, 0,
+				   IP_VS_RT_MODE_NON_LOCAL);
+	if (!rt)
 		goto tx_error_icmp;
 
 	/* MTU checking */
@@ -513,7 +515,9 @@ ip_vs_bypass_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
 
 			skb->dev = net->loopback_dev;
 		}
-		icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
+		/* only send ICMP too big on first fragment */
+		if (!iph.fragoffs)
+			icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
 		dst_release(&rt->dst);
 		IP_VS_DBG_RL("%s(): frag needed\n", __func__);
 		goto tx_error;
@@ -685,7 +689,7 @@ ip_vs_nat_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
 	ip_vs_fill_iph_skb(cp->af, skb, &iph);
 
 	/* check if it is a connection of no-client-port */
-	if (unlikely(cp->flags & IP_VS_CONN_F_NO_CPORT)) {
+	if (unlikely(cp->flags & IP_VS_CONN_F_NO_CPORT && !iph.fragoffs)) {
 		__be16 _pt, *p;
 		p = skb_header_pointer(skb, iph.len, sizeof(_pt), &_pt);
 		if (p == NULL)
@@ -735,7 +739,9 @@ ip_vs_nat_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
 
 			skb->dev = net->loopback_dev;
 		}
-		icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
+		/* only send ICMP too big on first fragment */
+		if (!iph.fragoffs)
+			icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
 		IP_VS_DBG_RL_PKT(0, AF_INET6, pp, skb, 0,
 				 "ip_vs_nat_xmit_v6(): frag needed for");
 		goto tx_error_put;
@@ -940,8 +946,10 @@ ip_vs_tunnel_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
 	unsigned int max_headroom;	/* The extra header space needed */
 	int    mtu;
 	int ret;
+	struct ip_vs_iphdr ipvsh;
 
 	EnterFunction(10);
+	ip_vs_fill_iph_skb(cp->af, skb, &ipvsh);
 
 	if (!(rt = __ip_vs_get_out_rt_v6(skb, cp->dest, &cp->daddr.in6,
 					 &saddr, 1, (IP_VS_RT_MODE_LOCAL |
@@ -970,7 +978,9 @@ ip_vs_tunnel_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
 
 			skb->dev = net->loopback_dev;
 		}
-		icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
+		/* only send ICMP too big on first fragment */
+		if (!ipvsh.fragoffs)
+			icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
 		IP_VS_DBG_RL("%s(): frag needed\n", __func__);
 		goto tx_error_put;
 	}
@@ -1116,8 +1126,10 @@ ip_vs_dr_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
 {
 	struct rt6_info *rt;			/* Route to the other host */
 	int    mtu;
+	struct ip_vs_iphdr iph;
 
 	EnterFunction(10);
+	ip_vs_fill_iph_skb(cp->af, skb, &iph);
 
 	if (!(rt = __ip_vs_get_out_rt_v6(skb, cp->dest, &cp->daddr.in6, NULL,
 					 0, (IP_VS_RT_MODE_LOCAL |
@@ -1136,7 +1148,9 @@ ip_vs_dr_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
 
 			skb->dev = net->loopback_dev;
 		}
-		icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
+		/* only send ICMP too big on first fragment */
+		if (!iph.fragoffs)
+			icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
 		dst_release(&rt->dst);
 		IP_VS_DBG_RL("%s(): frag needed\n", __func__);
 		goto tx_error;
@@ -1308,8 +1322,10 @@ ip_vs_icmp_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
 	int rc;
 	int local;
 	int rt_mode;
+	struct ip_vs_iphdr iph;
 
 	EnterFunction(10);
+	ip_vs_fill_iph_skb(cp->af, skb, &iph);
 
 	/* The ICMP packet for VS/TUN, VS/DR and LOCALNODE will be
 	   forwarded directly here, because there is no need to
@@ -1372,7 +1388,9 @@ ip_vs_icmp_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
 
 			skb->dev = net->loopback_dev;
 		}
-		icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
+		/* only send ICMP too big on first fragment */
+		if (!iph.fragoffs)
+			icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
 		IP_VS_DBG_RL("%s(): frag needed\n", __func__);
 		goto tx_error_put;
 	}
-- 
1.7.10.4

^ permalink raw reply related

* Lab: v.1.8 + Linux 2.6.37.6+up #1 + ESXi 5.0 - VM - Slow Network Performance/Failures
From: Mike Harris @ 2012-09-28  2:56 UTC (permalink / raw)
  To: netdev
In-Reply-To: <CAJXRGaj8t5Qx_jewDfS-zYOSQL4iNx_vLM46t72=uYF1x7x=Pw@mail.gmail.com>

Hi,

I hope everyone is well!

Some network throughput/performance oddness with a linux based virtual firewall…

Lab scenario;

[Windows VM #1] --- VLAN X-----(*)Linux Firewall VM ----- VLAN Y
---[Windows VM #2]

A tcpdump is kicked off with the following options on the firewall,
then a 100MB file is copied from VM #1 to VM #2 (SMB).

tcpdump -i eth0.x -n -s0 -w file-transfer-1.pcap -c100

Notes:

+ Physical blade run ESXi 5.0.
+ Windows VMs run on the same vSwitch and physical blade.
+ VLAN X and Y support up to 1500 bytes MTU.
+ Virtual firewall is configured with the 4095 (any) VLAN (receives
and transmits tagged frames).
+ Virtual firewall runs Linux 2.6.37.6+up #1
+ Virtual and physical backbones do support up to 9k frames.

Observations;

1. The file copy fails…
2. The pcap reveals a 12,443 byte uber jumbo frame is present shortly
after the file transfer starts.

Repeated the same scenario using a test vyatta 6.4 VM and the file
copy completes normally… no jumbo frames or any other oddness.
Virtual and physical networking can not originate such a frame
normally.

Given this, I suspect there's a general framing failure of the network
driver on the virtual linux firewall, which led me to the dmesg
command and this mailing list :)

Has anyone else seen this behavior on a linux VM before?

Thoughts?

Helpful suggestions :)

Mike

^ permalink raw reply

* Re: [PATCH] rtlwifi: use %*ph[C] to dump small buffers
From: Joe Perches @ 2012-09-28  3:13 UTC (permalink / raw)
  To: Larry Finger
  Cc: Andy Shevchenko, Chaoming Li, David S. Miller,
	linux-wireless-u79uwXL29TY76Z2rM5mHXA,
	netdev-u79uwXL29TY76Z2rM5mHXA
In-Reply-To: <5064D318.6090705-tQ5ms3gMjBLk1uMJSBkQmQ@public.gmane.org>

On Thu, 2012-09-27 at 17:28 -0500, Larry Finger wrote:
> On 09/26/2012 11:45 AM, Joe Perches wrote:
> > rate_mask uses:
> >
> > 	u32 ratr_bitmap = (u32) mac->basic_rates;
> > ...
> > 	u8 rate_mask[5];
> > ...
> > 	[sets ratr_bitmap as u32]
> > ...
> > 	*(u32 *)&rate_mask = ((ratr_bitmap & 0x0fffffff) |
> > 				      ratr_index << 28);
> > ...
> > 	rtl92c_fill_h2c_cmd(hw, H2C_RA_MASK, 5, rate_mask);
> >
> > Looks like a possible endian misuse to me.
> 
> After a second look at the code, I don't think this is an endian issue; however, 
> I am proposing the following changes to remove any doubt:

I believe if it wasn't an endian problem, then the new
code makes it one.

> Index: wireless-testing-new/drivers/net/wireless/rtlwifi/rtl8192ce/hw.c
> ===================================================================
> --- wireless-testing-new.orig/drivers/net/wireless/rtlwifi/rtl8192ce/hw.c
> +++ wireless-testing-new/drivers/net/wireless/rtlwifi/rtl8192ce/hw.c
> @@ -1964,8 +1965,9 @@ static void rtl92ce_update_hal_rate_mask
> 
>          RT_TRACE(rtlpriv, COMP_RATR, DBG_DMESG,
>                   "ratr_bitmap :%x\n", ratr_bitmap);
> -       *(u32 *)&rate_mask = (ratr_bitmap & 0x0fffffff) |
> -                                    (ratr_index << 28);
> +       for (i = 0; i < 3; i++)
> +               rate_mask[i] = ratr_bitmap & (0xff << (i * 4));

rate_mask is u8, doesn't this need (calc) >> (i * 8)?

> +       rate_mask[3] = (ratr_bitmap & 0x0f000000) | (ratr_index << 28);

Perhaps you meant:

		((ratr_bitmap & 0x0f000000) >> 24) | (ratr_index << 4)

>          rate_mask[4] = macid | (shortgi ? 0x20 : 0x00) | 0x80;
>          RT_TRACE(rtlpriv, COMP_RATR, DBG_DMESG,
>                   "Rate_index:%x, ratr_val:%x, %*phC\n",
> 
> The downstream routine uses this info as an array of u8, thus it is OK. A proper 
> patch will be sent later. At least we avoid that ugly cast.



--
To unsubscribe from this list: send the line "unsubscribe linux-wireless" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply

* Re: [PATCH 7/7 net-next] tg3: Change default number of tx rings to 1.
From: Michael Chan @ 2012-09-28  4:44 UTC (permalink / raw)
  To: David Miller; +Cc: netdev
In-Reply-To: <20120927.192345.2124577537241070059.davem@davemloft.net>

On Thu, 2012-09-27 at 19:23 -0400, David Miller wrote: 
> From: "Michael Chan" <mchan@broadcom.com>
> Date: Wed, 26 Sep 2012 15:32:49 -0700
> 
> > Hardware tx scheduling can cause some starvation of a tx ring with small
> > packets if other tx rings have jumbo or TSO packets.  The default setting
> > of 1 TX ring gives the best overall performance in many common traffic
> > scenarios.  The user can change it using ethttol -L if desired.
> > 
> > Update version to 3.125.
> > 
> > Reviewed-by: Nithin Nayak Sujir <nsujir@broadcom.com>
> > Reviewed-by: Benjamin Li <benli@broadcom.com>
> > Signed-off-by: Michael Chan <mchan@broadcom.com>
> 
> This gets into an area I don't like.
> 
> Individual drivers making decisions about defaults that sound like
> system wide ones.
> 
> What makes tg3 so special that only it should have this default
> setting?

It was poor hardware design.  Other bnx2/bnx2x designs do not have this
design flaw.

> 
> I also can't see how this "one guy spamming small packets while
> another generates TSO frames" completely nullifies the SMP gains
> from using multiple TX rings and distributing traffic.

The issue was reported by a user in a RHEL bugzilla complaining about
less than 100Mbps in some common test environment.  We then root caused
it to the hardware design flaw in the multi TX ring hardware
implementation.

In the simplest case, assume you have 2 TCP streams running in opposite
directions.  The TX traffic (mostly TSO) will hash to one tx ring.  The
ACKs for the incoming data on a different TCP connection will hash to
another TX ring.  The hardware fetches one complete TSO packet from the
first ring (up to 64K data) before servicing the other TX ring.  And
when it gets to the other TX ring, it will fetch only one packet (one
64-byte ACK packet in this case) and then immediately switches back to
the 1st ring (filled with more TSO packets).  In reality, there may be
over 10 ACK packets waiting in the 2nd ring because a lot of incoming
data has been received and ACKed during this time.  Because the ACKs are
going out so slowly, the incoming throughput slows to a trickle.

Our other devices don't do this simple Round-robin tx scheduling.
Instead, there are independent DMA channels fetching and interleaving tx
data from multiple rings.

> 
> I'm not applying this patch set.
> 

I hope I have convinced you to reconsider.  Many years ago, we disabled
TSO by default on the old 5704 after we found out that the firmware
implementation was actually slower than no TSO.

At least consider patches 1 - 6 that will allow the user to disable
multiple TX rings if he runs into this scenario.  Including patch 7 will
be the best in my opinion.  Thanks.

^ permalink raw reply

* Re: [PATCH 7/7 net-next] tg3: Change default number of tx rings to 1.
From: David Miller @ 2012-09-28  4:49 UTC (permalink / raw)
  To: mchan; +Cc: netdev
In-Reply-To: <1348807471.7220.138.camel@LTIRV-MCHAN1.corp.ad.broadcom.com>

From: "Michael Chan" <mchan@broadcom.com>
Date: Thu, 27 Sep 2012 21:44:31 -0700

> In the simplest case, assume you have 2 TCP streams running in opposite
> directions.  The TX traffic (mostly TSO) will hash to one tx ring.  The
> ACKs for the incoming data on a different TCP connection will hash to
> another TX ring.  The hardware fetches one complete TSO packet from the
> first ring (up to 64K data) before servicing the other TX ring.  And
> when it gets to the other TX ring, it will fetch only one packet (one
> 64-byte ACK packet in this case) and then immediately switches back to
> the 1st ring (filled with more TSO packets).  In reality, there may be
> over 10 ACK packets waiting in the 2nd ring because a lot of incoming
> data has been received and ACKed during this time.  Because the ACKs are
> going out so slowly, the incoming throughput slows to a trickle.

Thanks for the explanation, this is the kind of text that belongs in
the commit message.  Otherwise the next person who reads the patch,
like me, will ask why this is being done.

> Our other devices don't do this simple Round-robin tx scheduling.
> Instead, there are independent DMA channels fetching and interleaving tx
> data from multiple rings.

Ok.

Please respin your patch set, adding the detailed explanation you gave
me here to the TX queue change, and I will apply it.

Thanks.

^ permalink raw reply

* [PATCH] netdev: octeon: fix return value check in octeon_mgmt_init_phy()
From: Wei Yongjun @ 2012-09-28  5:04 UTC (permalink / raw)
  To: grant.likely, rob.herring; +Cc: yongjun_wei, netdev, devicetree-discuss

From: Wei Yongjun <yongjun_wei@trendmicro.com.cn>

In case of error, the function of_phy_connect() returns NULL
pointer not ERR_PTR(). The IS_ERR() test in the return value
check should be replaced with NULL test.

dpatch engine is used to auto generate this patch.
(https://github.com/weiyj/dpatch)

Signed-off-by: Wei Yongjun <yongjun_wei@trendmicro.com.cn>
---
 drivers/net/ethernet/octeon/octeon_mgmt.c | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/drivers/net/ethernet/octeon/octeon_mgmt.c b/drivers/net/ethernet/octeon/octeon_mgmt.c
index c42bbb1..a688a2d 100644
--- a/drivers/net/ethernet/octeon/octeon_mgmt.c
+++ b/drivers/net/ethernet/octeon/octeon_mgmt.c
@@ -722,10 +722,8 @@ static int octeon_mgmt_init_phy(struct net_device *netdev)
 				   octeon_mgmt_adjust_link, 0,
 				   PHY_INTERFACE_MODE_MII);
 
-	if (IS_ERR(p->phydev)) {
-		p->phydev = NULL;
+	if (!p->phydev)
 		return -1;
-	}
 
 	phy_start_aneg(p->phydev);
 

^ permalink raw reply related

* Re: [PATCH] netdev: octeon: fix return value check in octeon_mgmt_init_phy()
From: David Miller @ 2012-09-28  5:18 UTC (permalink / raw)
  To: weiyj.lk; +Cc: grant.likely, rob.herring, yongjun_wei, netdev,
	devicetree-discuss
In-Reply-To: <CAPgLHd-Tk9eFL2Sj-7_TZA0ngWejuL69=qzENNG_CrJpaQ_S8A@mail.gmail.com>

From: Wei Yongjun <weiyj.lk@gmail.com>
Date: Fri, 28 Sep 2012 13:04:21 +0800

> From: Wei Yongjun <yongjun_wei@trendmicro.com.cn>
> 
> In case of error, the function of_phy_connect() returns NULL
> pointer not ERR_PTR(). The IS_ERR() test in the return value
> check should be replaced with NULL test.
> 
> dpatch engine is used to auto generate this patch.
> (https://github.com/weiyj/dpatch)
> 
> Signed-off-by: Wei Yongjun <yongjun_wei@trendmicro.com.cn>

Applied, thanks.

^ permalink raw reply

* [PATCH v2 net-next 2/3] net: add gro_cells infrastructure
From: Eric Dumazet @ 2012-09-28  5:29 UTC (permalink / raw)
  To: Ben Hutchings; +Cc: David Miller, netdev
In-Reply-To: <1348788457.10741.39.camel@deadeye.wl.decadent.org.uk>

From: Eric Dumazet <edumazet@google.com>

This adds a new include file (include/net/gro_cells.h), to bring GRO
(Generic Receive Offload) capability to tunnels, in a modular way.

Because the tunnel receive path is lockless, and GRO adds a serialization
using a napi_struct, I chose to add an array of up to
DEFAULT_MAX_NUM_RSS_QUEUES cells, so that multi queue devices won't be
slowed down because of the GRO layer.

skb_get_rx_queue() is used as selector.

In the future, we might add optional fanout capabilities, using rxhash
for example.

With help from Ben Hutchings, who reminded me of the
netif_get_num_default_rss_queues() function.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: Ben Hutchings <bhutchings@solarflare.com>
---
v2: change the 8 value by call to netif_get_num_default_rss_queues()
    as Ben pointed out. (Thanks Ben !)

 include/net/gro_cells.h |  103 ++++++++++++++++++++++++++++++++++++++
 net/core/dev.c          |    2 
 2 files changed, 105 insertions(+)

diff --git a/include/net/gro_cells.h b/include/net/gro_cells.h
new file mode 100644
index 0000000..ba93b1b
--- /dev/null
+++ b/include/net/gro_cells.h
@@ -0,0 +1,103 @@
+#ifndef _NET_GRO_CELLS_H
+#define _NET_GRO_CELLS_H
+
+#include <linux/skbuff.h>
+#include <linux/slab.h>
+#include <linux/netdevice.h>
+
+struct gro_cell {
+	struct sk_buff_head	napi_skbs;
+	struct napi_struct	napi;
+} ____cacheline_aligned_in_smp;
+
+struct gro_cells {
+	unsigned int		gro_cells_mask;
+	struct gro_cell		*cells;
+};
+
+static inline void gro_cells_receive(struct gro_cells *gcells, struct sk_buff *skb)
+{
+	unsigned long flags;
+	struct gro_cell *cell = gcells->cells;
+	struct net_device *dev = skb->dev;
+
+	if (!cell || skb_cloned(skb) || !(dev->features & NETIF_F_GRO)) {
+		netif_rx(skb);
+		return;
+	}
+
+	if (skb_rx_queue_recorded(skb))
+		cell += skb_get_rx_queue(skb) & gcells->gro_cells_mask;
+
+	if (skb_queue_len(&cell->napi_skbs) > netdev_max_backlog) {
+		atomic_long_inc(&dev->rx_dropped);
+		kfree_skb(skb);
+		return;
+	}
+
+	spin_lock_irqsave(&cell->napi_skbs.lock, flags);
+
+	__skb_queue_tail(&cell->napi_skbs, skb);
+	if (skb_queue_len(&cell->napi_skbs) == 1)
+		napi_schedule(&cell->napi);
+
+	spin_unlock_irqrestore(&cell->napi_skbs.lock, flags);
+}
+
+static inline int gro_cell_poll(struct napi_struct *napi, int budget)
+{
+	struct gro_cell *cell = container_of(napi, struct gro_cell, napi);
+	struct sk_buff *skb;
+	int work_done = 0;
+
+	while (work_done < budget) {
+		skb = skb_dequeue(&cell->napi_skbs);
+		if (!skb)
+			break;
+
+		napi_gro_receive(napi, skb);
+		work_done++;
+	}
+
+	if (work_done < budget)
+		napi_complete(napi);
+	return work_done;
+}
+
+static inline int gro_cells_init(struct gro_cells *gcells, struct net_device *dev)
+{
+	int i;
+
+	gcells->gro_cells_mask = roundup_pow_of_two(netif_get_num_default_rss_queues()) - 1;
+	gcells->cells = kcalloc(sizeof(struct gro_cell),
+				gcells->gro_cells_mask + 1,
+				GFP_KERNEL);
+	if (!gcells->cells)
+		return -ENOMEM;
+
+	for (i = 0; i <= gcells->gro_cells_mask; i++) {
+		struct gro_cell *cell = gcells->cells + i;
+
+		skb_queue_head_init(&cell->napi_skbs);
+		netif_napi_add(dev, &cell->napi, gro_cell_poll, 64);
+		napi_enable(&cell->napi);
+	}
+	return 0;
+}
+
+static inline void gro_cells_destroy(struct gro_cells *gcells)
+{
+	struct gro_cell *cell = gcells->cells;
+	int i;
+
+	if (!cell)
+		return;
+	for (i = 0; i <= gcells->gro_cells_mask; i++,cell++) {
+		netif_napi_del(&cell->napi);	
+		skb_queue_purge(&cell->napi_skbs);
+	}
+	kfree(gcells->cells);
+	gcells->cells = NULL;
+}
+
+#endif
diff --git a/net/core/dev.c b/net/core/dev.c
index 707b124..9f63660 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -2644,6 +2644,8 @@ EXPORT_SYMBOL(dev_queue_xmit);
   =======================================================================*/
 
 int netdev_max_backlog __read_mostly = 1000;
+EXPORT_SYMBOL(netdev_max_backlog);
+
 int netdev_tstamp_prequeue __read_mostly = 1;
 int netdev_budget __read_mostly = 300;
 int weight_p __read_mostly = 64;            /* old backlog weight */

^ permalink raw reply related

* RE: [net-next PATCH 4/5] be2net: get rid of AMAP_SET/GET macros in TX path
From: Perla, Sathya @ 2012-09-28  5:47 UTC (permalink / raw)
  To: David Miller; +Cc: netdev@vger.kernel.org
In-Reply-To: <20120927.222900.1633352325278735069.davem@davemloft.net>

>-----Original Message-----
>From: David Miller [mailto:davem@davemloft.net]
>
>Now you have endianness bugs.  The previous code worked with 8-bit
>struct members and as such was endian neutral.
>
>Now you're working with words, so you thus have to take endianness
>into consideration.

Dave,
endianness is handled even in this patch. The call to wrb_fill_hdr()
is followed by be_dws_cpu_to_le() to handle this.

The old code did not set the 8-bit members in the amap_eth_wrb_hdr struct.
The TX hdr wrb structure is actually 4 words (32-bit * 4) long. Each word
has some fields with different bit sizes. In the old code a *separate* pseudo
structure -- called the amap_eth_wrb_hdr -- was defined, wherein a field
of size x bits in the actual TX-hdr-wrb was defined as *x bytes* long. This was
done to be able to calculate the mask and shift values of each bit-field without having to define
them manually.

There has been feedback in the community earlier that this method of calculating
bit-field masks and shifts is unusual and to be consistent with other drivers' code we
manually define the mask and shift values for each field.

>
>The readability aspect is also extremely questionable, here's why.
>The old code accessed struct members with _NAMES_ which described what
>the values are and what they do.
>
>This new code puts values into opaque "word" array members.  That's
>about as crappy as it comes wrt. readability.  What in the world
>does word[0] do?  I can't tell from it's name.  Yet with the existing
>"struct amap_eth_hdr_wrb" there is none of that kind of confusion.

The word[2] of TX-hdr-wrb consists of various 1-bit fields.
I could call this word "flags". But, word[3] consists of unrelated fields like
wrb-num (5bits),  lso-mss-len(14 bits) etc . So, I just used dw[3].

I could rename the current definition of TX-hdr-wrb from
struct be_eth_hdr_wrb {
	u32 dw[4];
};

to

/* TX header WRB: four 32-bit words */
struct be_eth_hdr_wrb {
	u32 rsvd0;
	u32 rsvd1;
	/* compl, evt, crc & cksum bits */
	u32 flags;
	/* num-wrb, lso-mss, vlan-tci etc */
	u32 info;
};

Pls let me know if this addresses your concerns...

thanks,
-Sathya

^ permalink raw reply

* Re: [RFC PATCH net-next] tcp: introduce tcp_tw_interval to specifiy the time of TIME-WAIT
From: Cong Wang @ 2012-09-28  6:33 UTC (permalink / raw)
  To: Neil Horman
  Cc: netdev, David S. Miller, Alexey Kuznetsov, Patrick McHardy,
	Eric Dumazet
In-Reply-To: <20120927142334.GA3194@neilslaptop.think-freely.org>

On Thu, 2012-09-27 at 10:23 -0400, Neil Horman wrote:
> On Thu, Sep 27, 2012 at 04:41:01PM +0800, Cong Wang wrote:
> > Some customer requests this feature, as they stated:
> > 
> > 	"This parameter is necessary, especially for software that continually 
> >         creates many ephemeral processes which open sockets, to avoid socket 
> >         exhaustion. In many cases, the risk of the exhaustion can be reduced by 
> >         tuning reuse interval to allow sockets to be reusable earlier.
> > 
> >         In commercial Unix systems, this kind of parameters, such as 
> >         tcp_timewait in AIX and tcp_time_wait_interval in HP-UX, have 
> >         already been available. Their implementations allow users to tune 
> >         how long they keep TCP connection as TIME-WAIT state on the 
> >         millisecond time scale."
> > 
> > We indeed have "tcp_tw_reuse" and "tcp_tw_recycle", but these tunings
> > are not equivalent in that they cannot be tuned directly on the time
> > scale nor in a safe way, as some combinations of tunings could still
> > cause some problem in NAT. And, I think second scale is enough, we don't
> > have to make it in millisecond time scale.
> > 
> I think I have a little difficultly seeing how this does anything other than
> pay lip service to actually having sockets spend time in TIME_WAIT state.  That
> is to say, while I see users using this to just make the pain stop.  If we wait
> less time than it takes to be sure that a connection isn't being reused (either
> by waiting two segment lifetimes, or by checking timestamps), then you might as
> well not wait at all.  I see how its tempting to be able to say "Just don't wait
> as long", but it seems that theres no difference between waiting half as long as
> the RFC mandates, and waiting no time at all.  Neither is a good idea.

I don't think reducing TIME_WAIT is a good idea either, but there must
be some reason behind as several UNIX provides a microsecond-scale
tuning interface, or maybe in non-recycle mode, their RTO is much less
than 2*MSL?

> 
> Given the problem you're trying to solve here, I'll ask the standard question in
> response: How does using SO_REUSEADDR not solve the problem?  Alternatively, in
> a pinch, why not reduce the tcp_max_tw_buckets sufficiently to start forcing
> TIME_WAIT sockets back into CLOSED state?
> 
> The code looks fine, but the idea really doesn't seem like a good plan to me.
> I'm sure HPUX/Solaris/AIX/etc have done this in response to customer demand, but
> that doesn't make it the right solution.
> 

*I think* the customer doesn't want to modify their applications, so
that is why they don't use SO_REUSEADDR.

I didn't know tcp_max_tw_buckets can do the trick, nor the customer, so
this is a side effect of tcp_max_tw_buckets? Is it documented?

Thanks.

^ permalink raw reply

* Re: [net-next PATCH 4/5] be2net: get rid of AMAP_SET/GET macros in TX path
From: David Miller @ 2012-09-28  6:40 UTC (permalink / raw)
  To: Sathya.Perla; +Cc: netdev
In-Reply-To: <CF9D1877D81D214CB0CA0669EFAE020C6562EE@CMEXMB1.ad.emulex.com>

From: "Perla, Sathya" <Sathya.Perla@Emulex.Com>
Date: Fri, 28 Sep 2012 05:47:25 +0000

> endianness is handled even in this patch. The call to wrb_fill_hdr()
> is followed by be_dws_cpu_to_le() to handle this.

That swap_dws() thing is the most inefficient thing I've ever seen.

Instead of being able to benefit from compile time optimizations
such as byte swaps of constants, you do everything hidden from the
compiler so nothing gets optimized.

The aspect you are changing is the least of your problems in this
area.

^ permalink raw reply

* Re: [RFC PATCH net-next] tcp: introduce tcp_tw_interval to specifiy the time of TIME-WAIT
From: Cong Wang @ 2012-09-28  6:39 UTC (permalink / raw)
  To: David Miller; +Cc: netdev, kuznet, kaber, edumazet, nhorman
In-Reply-To: <20120927.130529.620560818048014548.davem@davemloft.net>

On Thu, 2012-09-27 at 13:05 -0400, David Miller wrote:
> 
> Without appropriate confirmation that an early time-wait reuse is
> valid, decreasing this interval can only be dangerous.

Yeah, would a proper documentation cure this? Something like we did for
other tuning:

"It should not be changed without advice/request of technical experts."

^ permalink raw reply

* Re: [PATCH 1/2] Fix build error caused by broken PCH_PTP module dependency.
From: Haicheng Li @ 2012-09-28  6:41 UTC (permalink / raw)
  To: David Miller; +Cc: netdev, tshimizu818, linux-kernel, haicheng.lee
In-Reply-To: <20120927.180942.969924012897168294.davem@davemloft.net>

On 09/28/2012 06:09 AM, David Miller wrote:
> Look at how other people submit patches, do any other patch submissions
> look like your's having all of this metadata in the message body:
I'm sorry for it.

> As for this specific patch:
>
>> -	depends on PTP_1588_CLOCK_PCH
>> +	depends on PTP_1588_CLOCK_PCH = PCH_GBE
>
> This is not the correct way to ensure that the module'ness of one
> config option meets the module'ness requirements of another.
> The correct way is to say something like "&&  (PCH_GBE || PCH_GBE=n)"

This case is a little bit tricky than usual, with PCH_PTP selected, the valid 
config would be either "PTP_1588_CLOCK_PCH=PCH_GBE=m" or 
"PTP_1588_CLOCK_PCH=PCH_GBE=y", and PTP_1588_CLOCK_PCH depends on PCH_GBE.

So are you ok with this:
+	depends on PTP_1588_CLOCK_PCH && (PCH_GBE=m || PTP_1588_CLOCK_PCH=y)

or simply like:
---
From: Haicheng Li <haicheng.lee@gmail.com>

Fix build error caused by broken PCH_PTP module dependency.
The .config is:
         CONFIG_PCH_GBE=y
         CONFIG_PCH_PTP=y
         CONFIG_PTP_1588_CLOCK=m

The build error:

drivers/built-in.o: In function `pch_tx_timestamp':
.../pch_gbe_main.c:215: undefined reference to `pch_ch_event_read'
.../pch_gbe_main.c:225: undefined reference to `pch_tx_snap_read'
.../pch_gbe_main.c:231: undefined reference to `pch_ch_event_write'

.../pch_gbe_main.c:170: undefined reference to `pch_ch_event_read'
.../pch_gbe_main.c:175: undefined reference to `pch_src_uuid_lo_read'
.../pch_gbe_main.c:176: undefined reference to `pch_src_uuid_hi_read'
.../pch_gbe_main.c:190: undefined reference to `pch_ch_event_write'
.../pch_gbe_main.c:184: undefined reference to `pch_rx_snap_read'

.../pch_gbe_main.c:267: undefined reference to `pch_ch_control_write'
.../pch_gbe_main.c:271: undefined reference to `pch_ch_control_write'
.../pch_gbe_main.c:275: undefined reference to `pch_ch_control_write'
.../pch_gbe_main.c:281: undefined reference to `pch_ch_control_write'
.../pch_gbe_main.c:283: undefined reference to `pch_set_station_address'
.../pch_gbe_main.c:290: undefined reference to `pch_ch_event_write'

Signed-off-by: Haicheng Li <haicheng.lee@gmail.com>
---
  drivers/net/ethernet/oki-semi/pch_gbe/Kconfig |    4 ++--
  1 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/oki-semi/pch_gbe/Kconfig 
b/drivers/net/ethernet/oki-semi/pch_gbe/Kconfig
index bce0164..df1e649 100644
--- a/drivers/net/ethernet/oki-semi/pch_gbe/Kconfig
+++ b/drivers/net/ethernet/oki-semi/pch_gbe/Kconfig
@@ -21,12 +21,12 @@ config PCH_GBE
  	  ML7223/ML7831 is companion chip for Intel Atom E6xx series.
  	  ML7223/ML7831 is completely compatible for Intel EG20T PCH.

-if PCH_GBE
+if PTP_1588_CLOCK_PCH

  config PCH_PTP
  	bool "PCH PTP clock support"
  	default n
-	depends on PTP_1588_CLOCK_PCH
+	depends on PTP_1588_CLOCK_PCH=y || PCH_GBE=m
  	---help---
  	  Say Y here if you want to use Precision Time Protocol (PTP) in the
  	  driver. PTP is a method to precisely synchronize distributed clocks
-- 
1.7.1



-haicheng

^ permalink raw reply related

* Re: [RFC PATCH net-next] tcp: introduce tcp_tw_interval to specifiy the time of TIME-WAIT
From: David Miller @ 2012-09-28  6:43 UTC (permalink / raw)
  To: amwang; +Cc: nhorman, netdev, kuznet, kaber, edumazet
In-Reply-To: <1348813987.7264.41.camel@cr0>

From: Cong Wang <amwang@redhat.com>
Date: Fri, 28 Sep 2012 14:33:07 +0800

> I don't think reducing TIME_WAIT is a good idea either, but there must
> be some reason behind as several UNIX provides a microsecond-scale
> tuning interface, or maybe in non-recycle mode, their RTO is much less
> than 2*MSL?

Yes, there is a reason.  It's there for retaining multi-million-dollar
customers.

There are no other reasons these other systems provide these
facilities, they are simply there in an attempt to retain a dwindling
customer base.

Any other belief is extremely naive.

^ permalink raw reply

* Re: [PATCH 2/2] Fix a typo in PTP_1588_CLOCK_PCH Kconfig help info.
From: Haicheng Li @ 2012-09-28  6:44 UTC (permalink / raw)
  To: David Miller; +Cc: netdev, tshimizu818, linux-kernel, haicheng.lee
In-Reply-To: <20120927.180640.447002776076460117.davem@davemloft.net>

On 09/28/2012 06:06 AM, David Miller wrote:
>> -	  will be called ptp_pch.
>> +	  will be called by pch_ptp.
>
> The original sentence is correct, it is stating the name of the module
> that will be built not the module that will call it.
You're right.

> Rather, the "pch_ptp" is what might need to be adjusted.
No need, the output is ptp_pch.ko.

So pls. ignore this patch. sorry for the bothering.

-haicheng

^ permalink raw reply

* Re: [RFC PATCH net-next] tcp: introduce tcp_tw_interval to specifiy the time of TIME-WAIT
From: David Miller @ 2012-09-28  6:44 UTC (permalink / raw)
  To: amwang; +Cc: netdev, kuznet, kaber, edumazet, nhorman
In-Reply-To: <1348814399.7264.44.camel@cr0>

From: Cong Wang <amwang@redhat.com>
Date: Fri, 28 Sep 2012 14:39:59 +0800

> On Thu, 2012-09-27 at 13:05 -0400, David Miller wrote:
>> 
>> Without appropriate confirmation that an early time-wait reuse is
>> valid, decreasing this interval can only be dangerous.
> 
> Yeah, would a proper documentation cure this? Something like we did for
> other tuning:
> 
> "It should not be changed without advice/request of technical experts."

No, we're not adding this facility.

^ permalink raw reply

* Re: [PATCH 1/2] Fix build error caused by broken PCH_PTP module dependency.
From: David Miller @ 2012-09-28  6:46 UTC (permalink / raw)
  To: haicheng.li; +Cc: netdev, tshimizu818, linux-kernel, haicheng.lee
In-Reply-To: <506546A7.7030500@linux.intel.com>

From: Haicheng Li <haicheng.li@linux.intel.com>
Date: Fri, 28 Sep 2012 14:41:43 +0800

> On 09/28/2012 06:09 AM, David Miller wrote:
>> Look at how other people submit patches, do any other patch
>> submissions
>> look like your's having all of this metadata in the message body:
> I'm sorry for it.
> 
>> As for this specific patch:
>>
>>> -	depends on PTP_1588_CLOCK_PCH
>>> +	depends on PTP_1588_CLOCK_PCH = PCH_GBE
>>
>> This is not the correct way to ensure that the module'ness of one
>> config option meets the module'ness requirements of another.
>> The correct way is to say something like "&&  (PCH_GBE || PCH_GBE=n)"
> 
> This case is a little bit tricky than usual, with PCH_PTP selected,
> the valid config would be either "PTP_1588_CLOCK_PCH=PCH_GBE=m" or
> "PTP_1588_CLOCK_PCH=PCH_GBE=y", and PTP_1588_CLOCK_PCH depends on
> PCH_GBE.

And a simple "&& PCH_GBE" should accomplish this, no?

^ permalink raw reply

* Re: Possible bug with r8169 driver
From: Nolwenn @ 2012-09-28  6:47 UTC (permalink / raw)
  To: Francois Romieu; +Cc: hayeswang, netdev
In-Reply-To: <20120927222147.GA29047@electric-eye.fr.zoreil.com>

Le vendredi 28 septembre 2012 00:21:47 Francois Romieu a écrit :

> Can you send an 'ip -s link' before any ipv6 traffic flows, then
> after ?
> 

Before ipv6 traffic

% ip -s link
1: lo: <LOOPBACK,UP,LOWER_UP> mtu 16436 qdisc noqueue state UNKNOWN 
mode DEFAULT 
    link/loopback 00:00:00:00:00:00 brd 00:00:00:00:00:00
    RX: bytes  packets  errors  dropped overrun mcast   
    0          0        0       0       0       0      
    TX: bytes  packets  errors  dropped carrier collsns 
    0          0        0       0       0       0      
2: eth0: <BROADCAST,MULTICAST,UP,LOWER_UP> mtu 1500 qdisc pfifo_fast state 
UP mode DEFAULT qlen 1000
    link/ether 30:85:a9:ee:43:fa brd ff:ff:ff:ff:ff:ff
    RX: bytes  packets  errors  dropped overrun mcast   
    2607155    2232     0       0       0       0      
    TX: bytes  packets  errors  dropped carrier collsns 
    142314     1725     0       0       0       0

During tcpdump interception

1: lo: <LOOPBACK,UP,LOWER_UP> mtu 16436 qdisc noqueue state UNKNOWN 
mode DEFAULT 
    link/loopback 00:00:00:00:00:00 brd 00:00:00:00:00:00
    RX: bytes  packets  errors  dropped overrun mcast   
    0          0        0       0       0       0      
    TX: bytes  packets  errors  dropped carrier collsns 
    0          0        0       0       0       0      
2: eth0: <BROADCAST,MULTICAST,UP,LOWER_UP> mtu 1500 qdisc pfifo_fast state 
UP mode DEFAULT qlen 1000
    link/ether 30:85:a9:ee:43:fa brd ff:ff:ff:ff:ff:ff
    RX: bytes  packets  errors  dropped overrun mcast   
    4191544    4587     0       0       0       0      
    TX: bytes  packets  errors  dropped carrier collsns 
    268787     3317     0       0       0       0 

After setting eth0 in promisc mode

1: lo: <LOOPBACK,UP,LOWER_UP> mtu 16436 qdisc noqueue state UNKNOWN 
mode DEFAULT 
    link/loopback 00:00:00:00:00:00 brd 00:00:00:00:00:00
    RX: bytes  packets  errors  dropped overrun mcast   
    0          0        0       0       0       0      
    TX: bytes  packets  errors  dropped carrier collsns 
    0          0        0       0       0       0      
2: eth0: <BROADCAST,MULTICAST,PROMISC,UP,LOWER_UP> mtu 1500 qdisc 
pfifo_fast state UP mode DEFAULT qlen 1000
    link/ether 30:85:a9:ee:43:fa brd ff:ff:ff:ff:ff:ff
    RX: bytes  packets  errors  dropped overrun mcast   
    10301337   12212    0       0       0       0      
    TX: bytes  packets  errors  dropped carrier collsns 
    803894     9309     0       0       0       0 

> > % dmesg | grep r8169
> > 
> > [    1.810423] r8169 Gigabit Ethernet driver 2.3LK-NAPI loaded
> > [    1.810548] r8169 0000:03:00.0: irq 44 for MSI/MSI-X
> > [    1.810671] r8169 0000:03:00.0: eth0: RTL8168f/8111f at
> > 0xffffc900057ae000, 30:85:a9:ee:43:fa, XID 08000800 IRQ 44 [    1.810672]
> > r8169 0000:03:00.0: eth0: jumbo features [frames: 9200 bytes, tx
> > checksumming: ko]
> Hayes, is there by any luck something different for the 8168f regarding
> the layout of the multicast filtering registers ?

I don't have sufficient skills in networks to understand what do you mean, 
sorry. Any command line, files comparing to get the information?

^ permalink raw reply

* Re: Possible networking regression in 3.6.0
From: David Miller @ 2012-09-28  6:53 UTC (permalink / raw)
  To: eric.dumazet; +Cc: chris2553, netdev, gpiez
In-Reply-To: <1348780624.5093.1767.camel@edumazet-glaptop>

From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Thu, 27 Sep 2012 23:17:04 +0200

> Yes it seems the problem. On the host I tried :
> 
> # ip ro get 8.8.8.8 from 192.168.200.1 iif tap1
> 8.8.8.8 from 192.168.200.1 via 172.30.42.1 dev eth0 
>     cache  iif *
> 
> So if the guest tries to send a frame to 8.8.8.8 we are going to forward
> the packet to eth0
> 
> But if the guest tries to send to 255.255.255.255, we try to deliver the
> packet to the host itself, instead of broadcasting to eth0
> 
> # ip ro get 255.255.255.255 from 192.168.200.1 iif tap1
> broadcast 255.255.255.255 from 192.168.200.1 dev lo 
>     cache <local,brd>  iif *
> 
> David, maybe you'll have an idea ?

Perhaps this was introduced by:

commit 7bd86cc282a458b66c41e3f6676de6656c99b8db
Author: Yan, Zheng <zheng.z.yan@intel.com>
Date:   Sun Aug 12 20:09:59 2012 +0000

    ipv4: Cache local output routes
    
    Commit caacf05e5ad1abf causes big drop of UDP loop back performance.
    The cause of the regression is that we do not cache the local output
    routes. Each time we send a datagram from unconnected UDP socket,
    the kernel allocates a dst_entry and adds it to the rt_uncached_list.
    It creates lock contention on the rt_uncached_lock.
    
    Reported-by: Alex Shi <alex.shi@intel.com>
    Signed-off-by: Yan, Zheng <zheng.z.yan@intel.com>
    Signed-off-by: David S. Miller <davem@davemloft.net>

diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index e4ba974..fd9ecb5 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -2028,7 +2028,6 @@ struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *fl4)
 		}
 		dev_out = net->loopback_dev;
 		fl4->flowi4_oif = dev_out->ifindex;
-		res.fi = NULL;
 		flags |= RTCF_LOCAL;
 		goto make_route;
 	}

^ permalink raw reply related

* Re: [PATCH 1/2] Fix build error caused by broken PCH_PTP module dependency.
From: Haicheng Li @ 2012-09-28  6:57 UTC (permalink / raw)
  To: David Miller; +Cc: netdev, tshimizu818, linux-kernel, haicheng.lee
In-Reply-To: <20120928.024621.1635385815700269914.davem@davemloft.net>

On 09/28/2012 02:46 PM, David Miller wrote:
> From: Haicheng Li<haicheng.li@linux.intel.com>
> Date: Fri, 28 Sep 2012 14:41:43 +0800
>
>> On 09/28/2012 06:09 AM, David Miller wrote:
>>> Look at how other people submit patches, do any other patch
>>> submissions
>>> look like your's having all of this metadata in the message body:
>> I'm sorry for it.
>>
>>> As for this specific patch:
>>>
>>>> -	depends on PTP_1588_CLOCK_PCH
>>>> +	depends on PTP_1588_CLOCK_PCH = PCH_GBE
>>>
>>> This is not the correct way to ensure that the module'ness of one
>>> config option meets the module'ness requirements of another.
>>> The correct way is to say something like "&&   (PCH_GBE || PCH_GBE=n)"
>>
>> This case is a little bit tricky than usual, with PCH_PTP selected,
>> the valid config would be either "PTP_1588_CLOCK_PCH=PCH_GBE=m" or
>> "PTP_1588_CLOCK_PCH=PCH_GBE=y", and PTP_1588_CLOCK_PCH depends on
>> PCH_GBE.
>
> And a simple "&&  PCH_GBE" should accomplish this, no?
No sir. It's actually the same as the original Kconfig (via the "if PCH_GBE" block); it
just failed with this config:

         CONFIG_PCH_GBE=y
         CONFIG_PCH_PTP=y
         CONFIG_PTP_1588_CLOCK=m

-haicheng

^ permalink raw reply

* RE: [net-next PATCH 4/5] be2net: get rid of AMAP_SET/GET macros in TX path
From: Perla, Sathya @ 2012-09-28  7:05 UTC (permalink / raw)
  To: David Miller; +Cc: netdev@vger.kernel.org
In-Reply-To: <20120928.024034.1426515423229309957.davem@davemloft.net>

>-----Original Message-----
>From: David Miller [mailto:davem@davemloft.net]
>
>> endianness is handled even in this patch. The call to wrb_fill_hdr()
>> is followed by be_dws_cpu_to_le() to handle this.
>
>That swap_dws() thing is the most inefficient thing I've ever seen.
>
>Instead of being able to benefit from compile time optimizations
>such as byte swaps of constants, you do everything hidden from the
>compiler so nothing gets optimized.

I'd like to clarify that swap_dws() was being used/needed even in the old code. 
The AMAP_SET_BITS() macros (that this patch removes) did nothing to take care of endian byte-swapping.
They are just calculating the mask/shift values of each bit-field and setting the value
in host-endian.

But, as you are convinced that this patch doesn't provide much value, I'll just drop it and 
re-send the patch set....

^ permalink raw reply

* Re: [net-next PATCH 4/5] be2net: get rid of AMAP_SET/GET macros in TX path
From: David Miller @ 2012-09-28  7:10 UTC (permalink / raw)
  To: Sathya.Perla; +Cc: netdev
In-Reply-To: <CF9D1877D81D214CB0CA0669EFAE020C65638C@CMEXMB1.ad.emulex.com>

From: "Perla, Sathya" <Sathya.Perla@Emulex.Com>
Date: Fri, 28 Sep 2012 07:05:27 +0000

>>-----Original Message-----
>>From: David Miller [mailto:davem@davemloft.net]
>>
>>> endianness is handled even in this patch. The call to wrb_fill_hdr()
>>> is followed by be_dws_cpu_to_le() to handle this.
>>
>>That swap_dws() thing is the most inefficient thing I've ever seen.
>>
>>Instead of being able to benefit from compile time optimizations
>>such as byte swaps of constants, you do everything hidden from the
>>compiler so nothing gets optimized.
> 
> I'd like to clarify that swap_dws() was being used/needed even in
> the old code.

I fully understand this.

^ permalink raw reply

* [GIT] Networking
From: David Miller @ 2012-09-28  7:31 UTC (permalink / raw)
  To: torvalds; +Cc: akpm, netdev, linux-kernel


The most important bit here is the netfilter fix:

1) Netfilter xt_limit module can use uninitialized rules, from
   Jan Engelhardt.

2) Wei Yongjun has found several more spots where error pointers
   were treated as NULL/non-NULL and vice versa.

3) bnx2 was converted to pci_io{,un}map() but one remaining plain
   iounmap() got missed.  From Neil Horman.

4) Due to a fence-post type error in initialization of inetpeer
   entries (which is where we store the ICMP rate limiting
   information), we can erroneously drop ICMPs if the inetpeer
   was created right around when jiffies wraps.

   Fix from Nicolas Dichtel.

5) smsc75xx resume fix from Steve Glendinning.

6) LAN87xx smsc chips need an explicit hardware init, from Marek
   Vasut.

7) qlcnic uses msleep() with locks held, fix from Narendra K.

Please pull, thanks a lot.

The following changes since commit 5e19997a742c7c8203be628a7a69babc3bcf01a4:

  Merge tag 'for-linus' of git://linux-c6x.org/git/projects/linux-c6x-upstreaming (2012-09-26 14:28:17 -0700)

are available in the git repository at:


  git://git.kernel.org/pub/scm/linux/kernel/git/davem/net.git master

for you to fetch changes up to df555b665367f9de6c04826acc482096f17c243d:

  netdev: octeon: fix return value check in octeon_mgmt_init_phy() (2012-09-28 01:18:32 -0400)

----------------------------------------------------------------
David S. Miller (1):
      Merge branch 'master' of git://1984.lsi.us.es/nf

Jan Engelhardt (1):
      netfilter: xt_limit: have r->cost != 0 case work

Marek Vasut (1):
      net: phy: smsc: Implement PHY config_init for LAN87xx

Narendra K (1):
      qlcnic: Fix scheduling while atomic bug

Neil Horman (1):
      bnx2: Clean up remaining iounmap

Nicolas Dichtel (1):
      inetpeer: fix token initialization

Steve Glendinning (1):
      smsc75xx: fix resume after device reset

Wei Yongjun (4):
      l2tp: fix return value check
      team: fix return value check
      netdev: pasemi: fix return value check in pasemi_mac_phy_init()
      netdev: octeon: fix return value check in octeon_mgmt_init_phy()

 drivers/net/ethernet/broadcom/bnx2.c            |  2 +-
 drivers/net/ethernet/octeon/octeon_mgmt.c       |  4 +---
 drivers/net/ethernet/pasemi/pasemi_mac.c        |  4 ++--
 drivers/net/ethernet/qlogic/qlcnic/qlcnic_ctx.c |  4 ++--
 drivers/net/phy/smsc.c                          | 28 +++++++++++++++++++++++++++-
 drivers/net/team/team.c                         | 12 ++++++------
 drivers/net/usb/smsc75xx.c                      |  1 +
 net/ipv4/inetpeer.c                             |  5 ++++-
 net/l2tp/l2tp_netlink.c                         | 12 ++++++------
 net/netfilter/xt_limit.c                        |  8 ++++----
 10 files changed, 54 insertions(+), 26 deletions(-)

^ permalink raw reply

* RE: [PATCH] qlcnic - Fix scheduling while atomic bug
From: Narendra_K @ 2012-09-28  7:38 UTC (permalink / raw)
  To: davem; +Cc: netdev, sony.chacko, jitendra.kalsaria, john.r.fastabend
In-Reply-To: <20120927.192602.1354851114513597606.davem@davemloft.net>

> -----Original Message-----
> From: netdev-owner@vger.kernel.org [mailto:netdev-
> owner@vger.kernel.org] On Behalf Of David Miller
> Sent: Friday, September 28, 2012 4:56 AM
> To: K, Narendra
> Cc: netdev@vger.kernel.org; sony.chacko@qlogic.com;
> jitendra.kalsaria@qlogic.com; john.r.fastabend@intel.com
> Subject: Re: [PATCH] qlcnic - Fix scheduling while atomic bug
> 
> From: <Narendra_K@Dell.com>
> Date: Tue, 25 Sep 2012 10:53:19 -0700
> 
> > From: Narendra K <narendra_k@dell.com>
> >
> > In the device close path, 'qlcnic_fw_destroy_ctx' and
> > 'qlcnic_poll_rsp' call msleep. But  'qlcnic_fw_destroy_ctx' and
> > 'qlcnic_poll_rsp' are called with 'adapter->tx_clean_lock' spin lock
> > held resulting in scheduling while atomic bug causing the following
> > trace.
> >
> > I observed that the commit 012dc19a45b2b9cc2ebd14aaa401cf782c2abba4
> > from John Fastabend addresses a similar issue in ixgbevf driver.
> > Adopting the same approach used in the commit, this patch uses mdelay
> > to address the issue.
>  ...
> > Signed-off-by: Narendra K <narendra_k@dell.com>
> 
> Applied, thanks.
> 
> In the future please format your Subject lines for patches as
> "subsytem: Description."  instead of this "subsystem - Description."
> layout.

Hi David, thank you for reviewing and applying the patch. I will follow the suggested layout for future submissions.

Hi Jitendra, thank you for reviewing the patch.
 
With regards,
Narendra K

^ permalink raw reply

* Re: [PATCH RFC net-next 1/1] ptp: add an ioctl to compare PHC time with system time
From: Miroslav Lichvar @ 2012-09-28  7:53 UTC (permalink / raw)
  To: Richard Cochran; +Cc: netdev, David Miller, Jacob Keller, John Stultz
In-Reply-To: <f0c20e2d1a303b0247b1e0e0def19f131de162ff.1348768886.git.richardcochran@gmail.com>

On Thu, Sep 27, 2012 at 08:12:16PM +0200, Richard Cochran wrote:
> This patch adds an ioctl for PTP Hardware Clock (PHC) devices that allows
> user space to measure the time offset between the PHC and the system
> clock. Rather than hard coding any kind of estimation algorithm into the
> kernel, this patch takes the more flexible approach of just delivering
> an array of raw clock readings. In that way, the user space clock servo
> may be adapted to new and different hardware clocks.

Would it make sense to extend the ioctl to allow also comparing the
PHC with another PHC or perhaps even a different system clock than
CLOCK_REALTIME?

I'm thinking if someone wanted to synchronize one PHC to another, it
should be better to work with phc1-phc2 offsets than combine phc1-sys
and sys-phc2 offsets.

-- 
Miroslav Lichvar

^ permalink raw reply


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox