Netdev List

Netdev List
 help / color / mirror / Atom feed

* [PATCHv2 bluetooth-next 07/10] ipv6: introduce neighbour discovery ops
From: Alexander Aring @ 2016-04-20  8:19 UTC (permalink / raw)
  To: linux-wpan-u79uwXL29TY76Z2rM5mHXA
  Cc: kernel-bIcnvbaLZ9MEGnE8C9+IrQ, marcel-kz+m5ild9QBg9hUCZPvPmw,
	jukka.rissanen-VuQAYsv1563Yd54FQh9/CA,
	hannes-tFNcAqjVMyqKXQKiL6tip0B+6BGkLq7r,
	stefan-JPH+aEBZ4P+UEJcrhfAQsw, mcr-SWp7JaYWvAQV+D8aMU/kSg,
	werner-SEdMjqphH88wryQfseakQg,
	linux-bluetooth-u79uwXL29TY76Z2rM5mHXA,
	netdev-u79uwXL29TY76Z2rM5mHXA, Alexander Aring, David S . Miller,
	Alexey Kuznetsov, James Morris, Hideaki YOSHIFUJI,
	Patrick McHardy
In-Reply-To: <1461140382-4784-1-git-send-email-aar-bIcnvbaLZ9MEGnE8C9+IrQ@public.gmane.org>

This patch introduces a neighbour discovery ops callback structure. As a
first step, the structure contains receive and transmit handling for NS/NA
and userspace option field functionality.

These callbacks allow 6LoWPAN to provide different handling, such as 802.15.4
short address handling or RFC 6775 (Neighbor Discovery Optimization for IPv6
over 6LoWPANs).

Cc: David S. Miller <davem-fT/PcQaiUtIeIZ0/mPfg9Q@public.gmane.org>
Cc: Alexey Kuznetsov <kuznet-v/Mj1YrvjDBInbfyfbPRSQ@public.gmane.org>
Cc: James Morris <jmorris-gx6/JNMH7DfYtjvyW6yDsg@public.gmane.org>
Cc: Hideaki YOSHIFUJI <yoshfuji-VfPWfsRibaP+Ru+s062T9g@public.gmane.org>
Cc: Patrick McHardy <kaber-dcUjhNyLwpNeoWH0uzbU5w@public.gmane.org>
Signed-off-by: Alexander Aring <aar-bIcnvbaLZ9MEGnE8C9+IrQ@public.gmane.org>
---
 include/linux/netdevice.h |  3 ++
 include/net/ndisc.h       | 96 +++++++++++++++++++++++++++++++++++++++++++----
 net/ipv6/addrconf.c       |  1 +
 net/ipv6/ndisc.c          | 71 ++++++++++++++++++++++++-----------
 net/ipv6/route.c          |  2 +-
 5 files changed, 144 insertions(+), 29 deletions(-)

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 0052c42..bc60033 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -1677,6 +1677,9 @@ struct net_device {
 #ifdef CONFIG_NET_L3_MASTER_DEV
 	const struct l3mdev_ops	*l3mdev_ops;
 #endif
+#if IS_ENABLED(CONFIG_IPV6)
+	const struct ndisc_ops *ndisc_ops;
+#endif
 
 	const struct header_ops *header_ops;
 
diff --git a/include/net/ndisc.h b/include/net/ndisc.h
index aac868e..14ed016 100644
--- a/include/net/ndisc.h
+++ b/include/net/ndisc.h
@@ -110,7 +110,8 @@ struct ndisc_options {
 
 #define NDISC_OPT_SPACE(len) (((len)+2+7)&~7)
 
-struct ndisc_options *ndisc_parse_options(u8 *opt, int opt_len,
+struct ndisc_options *ndisc_parse_options(const struct net_device *dev,
+					  u8 *opt, int opt_len,
 					  struct ndisc_options *ndopts);
 
 /*
@@ -173,6 +174,93 @@ static inline struct neighbour *__ipv6_neigh_lookup(struct net_device *dev, cons
 	return n;
 }
 
+static inline int __ip6_ndisc_is_useropt(struct nd_opt_hdr *opt)
+{
+	return opt->nd_opt_type == ND_OPT_RDNSS ||
+		opt->nd_opt_type == ND_OPT_DNSSL;
+}
+
+#if IS_ENABLED(CONFIG_IPV6)
+struct ndisc_ops {
+	int	(*is_useropt)(struct nd_opt_hdr *opt);
+	void	(*send_na)(struct net_device *dev,
+			   const struct in6_addr *daddr,
+			   const struct in6_addr *solicited_addr,
+			   bool router, bool solicited,
+			   bool override, bool inc_opt);
+	void	(*recv_na)(struct sk_buff *skb);
+	void	(*send_ns)(struct net_device *dev,
+			   const struct in6_addr *solicit,
+			   const struct in6_addr *daddr,
+			   const struct in6_addr *saddr);
+	void	(*recv_ns)(struct sk_buff *skb);
+};
+
+static inline int ndisc_is_useropt(const struct net_device *dev,
+				   struct nd_opt_hdr *opt)
+{
+	if (likely(dev->ndisc_ops->is_useropt))
+		return dev->ndisc_ops->is_useropt(opt);
+	else
+		return 0;
+}
+
+static inline void ndisc_send_na(struct net_device *dev,
+				 const struct in6_addr *daddr,
+				 const struct in6_addr *solicited_addr,
+				 bool router, bool solicited, bool override,
+				 bool inc_opt)
+{
+	if (likely(dev->ndisc_ops->send_na))
+		dev->ndisc_ops->send_na(dev, daddr, solicited_addr, router,
+					solicited, override, inc_opt);
+}
+
+static inline void ndisc_recv_na(struct sk_buff *skb)
+{
+	if (likely(skb->dev->ndisc_ops->recv_na))
+		skb->dev->ndisc_ops->recv_na(skb);
+}
+
+static inline void ndisc_send_ns(struct net_device *dev,
+				 const struct in6_addr *solicit,
+				 const struct in6_addr *daddr,
+				 const struct in6_addr *saddr)
+{
+	if (likely(dev->ndisc_ops->send_ns))
+		dev->ndisc_ops->send_ns(dev, solicit, daddr, saddr);
+}
+
+static inline void ndisc_recv_ns(struct sk_buff *skb)
+{
+	if (likely(skb->dev->ndisc_ops->recv_ns))
+		skb->dev->ndisc_ops->recv_ns(skb);
+}
+#else
+static inline int ndisc_is_useropt(const struct net_device *dev,
+				   struct nd_opt_hdr *opt)
+{
+	return 0;
+}
+
+static inline void ndisc_send_na(struct net_device *dev,
+				 const struct in6_addr *daddr,
+				 const struct in6_addr *solicited_addr,
+				 bool router, bool solicited, bool override,
+				 bool inc_opt) { }
+
+static inline void ndisc_recv_na(struct sk_buff *skb) { }
+
+static inline void ndisc_send_ns(struct net_device *dev,
+				 const struct in6_addr *solicit,
+				 const struct in6_addr *daddr,
+				 const struct in6_addr *saddr) { }
+
+static inline void ndisc_recv_ns(struct sk_buff *skb) { }
+#endif
+
+void ip6_register_ndisc_ops(struct net_device *dev);
+
 int ndisc_init(void);
 int ndisc_late_init(void);
 
@@ -181,14 +269,8 @@ void ndisc_cleanup(void);
 
 int ndisc_rcv(struct sk_buff *skb);
 
-void ndisc_send_ns(struct net_device *dev, const struct in6_addr *solicit,
-		   const struct in6_addr *daddr, const struct in6_addr *saddr);
-
 void ndisc_send_rs(struct net_device *dev,
 		   const struct in6_addr *saddr, const struct in6_addr *daddr);
-void ndisc_send_na(struct net_device *dev, const struct in6_addr *daddr,
-		   const struct in6_addr *solicited_addr,
-		   bool router, bool solicited, bool override, bool inc_opt);
 
 void ndisc_send_redirect(struct sk_buff *skb, const struct in6_addr *target);
 
diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index 54e18c2..a2ef04b 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -3266,6 +3266,7 @@ static int addrconf_notify(struct notifier_block *this, unsigned long event,
 			idev = ipv6_add_dev(dev);
 			if (IS_ERR(idev))
 				return notifier_from_errno(PTR_ERR(idev));
+			ip6_register_ndisc_ops(dev);
 		}
 		break;
 
diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c
index 176c7c4..297080a 100644
--- a/net/ipv6/ndisc.c
+++ b/net/ipv6/ndisc.c
@@ -185,24 +185,25 @@ static struct nd_opt_hdr *ndisc_next_option(struct nd_opt_hdr *cur,
 	return cur <= end && cur->nd_opt_type == type ? cur : NULL;
 }
 
-static inline int ndisc_is_useropt(struct nd_opt_hdr *opt)
+static inline int ip6_ndisc_is_useropt(struct nd_opt_hdr *opt)
 {
-	return opt->nd_opt_type == ND_OPT_RDNSS ||
-		opt->nd_opt_type == ND_OPT_DNSSL;
+	return __ip6_ndisc_is_useropt(opt);
 }
 
-static struct nd_opt_hdr *ndisc_next_useropt(struct nd_opt_hdr *cur,
+static struct nd_opt_hdr *ndisc_next_useropt(const struct net_device *dev,
+					     struct nd_opt_hdr *cur,
 					     struct nd_opt_hdr *end)
 {
 	if (!cur || !end || cur >= end)
 		return NULL;
 	do {
 		cur = ((void *)cur) + (cur->nd_opt_len << 3);
-	} while (cur < end && !ndisc_is_useropt(cur));
-	return cur <= end && ndisc_is_useropt(cur) ? cur : NULL;
+	} while (cur < end && !ndisc_is_useropt(dev, cur));
+	return cur <= end && ndisc_is_useropt(dev, cur) ? cur : NULL;
 }
 
-struct ndisc_options *ndisc_parse_options(u8 *opt, int opt_len,
+struct ndisc_options *ndisc_parse_options(const struct net_device *dev,
+					  u8 *opt, int opt_len,
 					  struct ndisc_options *ndopts)
 {
 	struct nd_opt_hdr *nd_opt = (struct nd_opt_hdr *)opt;
@@ -243,7 +244,7 @@ struct ndisc_options *ndisc_parse_options(u8 *opt, int opt_len,
 			break;
 #endif
 		default:
-			if (ndisc_is_useropt(nd_opt)) {
+			if (ndisc_is_useropt(dev, nd_opt)) {
 				ndopts->nd_useropts_end = nd_opt;
 				if (!ndopts->nd_useropts)
 					ndopts->nd_useropts = nd_opt;
@@ -479,9 +480,11 @@ static void ndisc_send_skb(struct sk_buff *skb,
 	rcu_read_unlock();
 }
 
-void ndisc_send_na(struct net_device *dev, const struct in6_addr *daddr,
-		   const struct in6_addr *solicited_addr,
-		   bool router, bool solicited, bool override, bool inc_opt)
+static void ip6_ndisc_send_na(struct net_device *dev,
+			      const struct in6_addr *daddr,
+			      const struct in6_addr *solicited_addr,
+			      bool router, bool solicited, bool override,
+			      bool inc_opt)
 {
 	struct sk_buff *skb;
 	struct in6_addr tmpaddr;
@@ -555,8 +558,10 @@ static void ndisc_send_unsol_na(struct net_device *dev)
 	in6_dev_put(idev);
 }
 
-void ndisc_send_ns(struct net_device *dev, const struct in6_addr *solicit,
-		   const struct in6_addr *daddr, const struct in6_addr *saddr)
+static void ip6_ndisc_send_ns(struct net_device *dev,
+			      const struct in6_addr *solicit,
+			      const struct in6_addr *daddr,
+			      const struct in6_addr *saddr)
 {
 	struct sk_buff *skb;
 	struct in6_addr addr_buf;
@@ -702,7 +707,7 @@ static int pndisc_is_router(const void *pkey,
 	return ret;
 }
 
-static void ndisc_recv_ns(struct sk_buff *skb)
+static void ip6_ndisc_recv_ns(struct sk_buff *skb)
 {
 	struct nd_msg *msg = (struct nd_msg *)skb_transport_header(skb);
 	const struct in6_addr *saddr = &ipv6_hdr(skb)->saddr;
@@ -738,7 +743,7 @@ static void ndisc_recv_ns(struct sk_buff *skb)
 		return;
 	}
 
-	if (!ndisc_parse_options(msg->opt, ndoptlen, &ndopts)) {
+	if (!ndisc_parse_options(dev, msg->opt, ndoptlen, &ndopts)) {
 		ND_PRINTK(2, warn, "NS: invalid ND options\n");
 		return;
 	}
@@ -874,7 +879,7 @@ out:
 		in6_dev_put(idev);
 }
 
-static void ndisc_recv_na(struct sk_buff *skb)
+static void ip6_ndisc_recv_na(struct sk_buff *skb)
 {
 	struct nd_msg *msg = (struct nd_msg *)skb_transport_header(skb);
 	struct in6_addr *saddr = &ipv6_hdr(skb)->saddr;
@@ -912,7 +917,7 @@ static void ndisc_recv_na(struct sk_buff *skb)
 	    idev->cnf.drop_unsolicited_na)
 		return;
 
-	if (!ndisc_parse_options(msg->opt, ndoptlen, &ndopts)) {
+	if (!ndisc_parse_options(dev, msg->opt, ndoptlen, &ndopts)) {
 		ND_PRINTK(2, warn, "NS: invalid ND option\n");
 		return;
 	}
@@ -1019,7 +1024,7 @@ static void ndisc_recv_rs(struct sk_buff *skb)
 		goto out;
 
 	/* Parse ND options */
-	if (!ndisc_parse_options(rs_msg->opt, ndoptlen, &ndopts)) {
+	if (!ndisc_parse_options(skb->dev, rs_msg->opt, ndoptlen, &ndopts)) {
 		ND_PRINTK(2, notice, "NS: invalid ND option, ignored\n");
 		goto out;
 	}
@@ -1137,7 +1142,7 @@ static void ndisc_router_discovery(struct sk_buff *skb)
 		return;
 	}
 
-	if (!ndisc_parse_options(opt, optlen, &ndopts)) {
+	if (!ndisc_parse_options(skb->dev, opt, optlen, &ndopts)) {
 		ND_PRINTK(2, warn, "RA: invalid ND options\n");
 		return;
 	}
@@ -1424,7 +1429,8 @@ skip_routeinfo:
 		struct nd_opt_hdr *p;
 		for (p = ndopts.nd_useropts;
 		     p;
-		     p = ndisc_next_useropt(p, ndopts.nd_useropts_end)) {
+		     p = ndisc_next_useropt(skb->dev, p,
+					    ndopts.nd_useropts_end)) {
 			ndisc_ra_useropt(skb, p);
 		}
 	}
@@ -1462,7 +1468,7 @@ static void ndisc_redirect_rcv(struct sk_buff *skb)
 		return;
 	}
 
-	if (!ndisc_parse_options(msg->opt, ndoptlen, &ndopts))
+	if (!ndisc_parse_options(skb->dev, msg->opt, ndoptlen, &ndopts))
 		return;
 
 	if (!ndopts.nd_opts_rh) {
@@ -1783,6 +1789,29 @@ int ndisc_ifinfo_sysctl_change(struct ctl_table *ctl, int write, void __user *bu
 
 #endif
 
+static const struct ndisc_ops ip6_ndisc_ops = {
+	.is_useropt = ip6_ndisc_is_useropt,
+	.send_na = ip6_ndisc_send_na,
+	.recv_na = ip6_ndisc_recv_na,
+	.send_ns = ip6_ndisc_send_ns,
+	.recv_ns = ip6_ndisc_recv_ns,
+};
+
+void ip6_register_ndisc_ops(struct net_device *dev)
+{
+	switch (dev->type) {
+	default:
+		if (dev->ndisc_ops) {
+			ND_PRINTK(2, warn,
+				  "%s: ndisc_ops already defined for interface type=%d\n",
+				  __func__, dev->type);
+		} else {
+			dev->ndisc_ops = &ip6_ndisc_ops;
+		}
+		break;
+	}
+}
+
 static int __net_init ndisc_net_init(struct net *net)
 {
 	struct ipv6_pinfo *np;
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index cc180b3..5fa276d 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -2149,7 +2149,7 @@ static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_bu
 	 *	first-hop router for the specified ICMP Destination Address.
 	 */
 
-	if (!ndisc_parse_options(msg->opt, optlen, &ndopts)) {
+	if (!ndisc_parse_options(skb->dev, msg->opt, optlen, &ndopts)) {
 		net_dbg_ratelimited("rt6_redirect: invalid ND options\n");
 		return;
 	}
-- 
2.8.0

^ permalink raw reply related

* [PATCHv2 bluetooth-next 05/10] ndisc: add addr_len parameter to ndisc_opt_addr_data
From: Alexander Aring @ 2016-04-20  8:19 UTC (permalink / raw)
  To: linux-wpan-u79uwXL29TY76Z2rM5mHXA
  Cc: kernel-bIcnvbaLZ9MEGnE8C9+IrQ, marcel-kz+m5ild9QBg9hUCZPvPmw,
	jukka.rissanen-VuQAYsv1563Yd54FQh9/CA,
	hannes-tFNcAqjVMyqKXQKiL6tip0B+6BGkLq7r,
	stefan-JPH+aEBZ4P+UEJcrhfAQsw, mcr-SWp7JaYWvAQV+D8aMU/kSg,
	werner-SEdMjqphH88wryQfseakQg,
	linux-bluetooth-u79uwXL29TY76Z2rM5mHXA,
	netdev-u79uwXL29TY76Z2rM5mHXA, Alexander Aring, David S . Miller,
	Alexey Kuznetsov, James Morris, Hideaki YOSHIFUJI,
	Patrick McHardy
In-Reply-To: <1461140382-4784-1-git-send-email-aar-bIcnvbaLZ9MEGnE8C9+IrQ@public.gmane.org>

This patch makes the address length an argument of the
ndisc_opt_addr_data function. This is necessary to handle addresses
which don't use dev->addr_len as their address length.

Cc: David S. Miller <davem-fT/PcQaiUtIeIZ0/mPfg9Q@public.gmane.org>
Cc: Alexey Kuznetsov <kuznet-v/Mj1YrvjDBInbfyfbPRSQ@public.gmane.org>
Cc: James Morris <jmorris-gx6/JNMH7DfYtjvyW6yDsg@public.gmane.org>
Cc: Hideaki YOSHIFUJI <yoshfuji-VfPWfsRibaP+Ru+s062T9g@public.gmane.org>
Cc: Patrick McHardy <kaber-dcUjhNyLwpNeoWH0uzbU5w@public.gmane.org>
Signed-off-by: Alexander Aring <aar-bIcnvbaLZ9MEGnE8C9+IrQ@public.gmane.org>
---
 include/net/ndisc.h |  5 +++--
 net/ipv6/ndisc.c    | 11 +++++++----
 net/ipv6/route.c    |  2 +-
 3 files changed, 11 insertions(+), 7 deletions(-)

diff --git a/include/net/ndisc.h b/include/net/ndisc.h
index ef43e88..aac868e 100644
--- a/include/net/ndisc.h
+++ b/include/net/ndisc.h
@@ -134,12 +134,13 @@ static inline int ndisc_opt_addr_space(struct net_device *dev,
 }
 
 static inline u8 *ndisc_opt_addr_data(struct nd_opt_hdr *p,
-				      struct net_device *dev)
+				      struct net_device *dev,
+				      unsigned char addr_len)
 {
 	u8 *lladdr = (u8 *)(p + 1);
 	int lladdrlen = p->nd_opt_len << 3;
 	int prepad = ndisc_addr_option_pad(dev->type);
-	if (lladdrlen != ndisc_opt_addr_space(dev, dev->addr_len))
+	if (lladdrlen != ndisc_opt_addr_space(dev, addr_len))
 		return NULL;
 	return lladdr + prepad;
 }
diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c
index 69e20e3..4e91d5e 100644
--- a/net/ipv6/ndisc.c
+++ b/net/ipv6/ndisc.c
@@ -744,7 +744,8 @@ static void ndisc_recv_ns(struct sk_buff *skb)
 	}
 
 	if (ndopts.nd_opts_src_lladdr) {
-		lladdr = ndisc_opt_addr_data(ndopts.nd_opts_src_lladdr, dev);
+		lladdr = ndisc_opt_addr_data(ndopts.nd_opts_src_lladdr, dev,
+					     dev->addr_len);
 		if (!lladdr) {
 			ND_PRINTK(2, warn,
 				  "NS: invalid link-layer address length\n");
@@ -916,7 +917,8 @@ static void ndisc_recv_na(struct sk_buff *skb)
 		return;
 	}
 	if (ndopts.nd_opts_tgt_lladdr) {
-		lladdr = ndisc_opt_addr_data(ndopts.nd_opts_tgt_lladdr, dev);
+		lladdr = ndisc_opt_addr_data(ndopts.nd_opts_tgt_lladdr, dev,
+					     dev->addr_len);
 		if (!lladdr) {
 			ND_PRINTK(2, warn,
 				  "NA: invalid link-layer address length\n");
@@ -1024,7 +1026,7 @@ static void ndisc_recv_rs(struct sk_buff *skb)
 
 	if (ndopts.nd_opts_src_lladdr) {
 		lladdr = ndisc_opt_addr_data(ndopts.nd_opts_src_lladdr,
-					     skb->dev);
+					     skb->dev, skb->dev->addr_len);
 		if (!lladdr)
 			goto out;
 	}
@@ -1322,7 +1324,8 @@ skip_linkparms:
 		u8 *lladdr = NULL;
 		if (ndopts.nd_opts_src_lladdr) {
 			lladdr = ndisc_opt_addr_data(ndopts.nd_opts_src_lladdr,
-						     skb->dev);
+						     skb->dev,
+						     skb->dev->addr_len);
 			if (!lladdr) {
 				ND_PRINTK(2, warn,
 					  "RA: invalid link-layer address length\n");
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index ed44663..cc180b3 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -2157,7 +2157,7 @@ static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_bu
 	lladdr = NULL;
 	if (ndopts.nd_opts_tgt_lladdr) {
 		lladdr = ndisc_opt_addr_data(ndopts.nd_opts_tgt_lladdr,
-					     skb->dev);
+					     skb->dev, skb->dev->addr_len);
 		if (!lladdr) {
 			net_dbg_ratelimited("rt6_redirect: invalid link-layer address length\n");
 			return;
-- 
2.8.0

^ permalink raw reply related

* [PATCHv2 bluetooth-next 04/10] ndisc: add addr_len parameter to ndisc_opt_addr_space
From: Alexander Aring @ 2016-04-20  8:19 UTC (permalink / raw)
  To: linux-wpan-u79uwXL29TY76Z2rM5mHXA
  Cc: kernel-bIcnvbaLZ9MEGnE8C9+IrQ, marcel-kz+m5ild9QBg9hUCZPvPmw,
	jukka.rissanen-VuQAYsv1563Yd54FQh9/CA,
	hannes-tFNcAqjVMyqKXQKiL6tip0B+6BGkLq7r,
	stefan-JPH+aEBZ4P+UEJcrhfAQsw, mcr-SWp7JaYWvAQV+D8aMU/kSg,
	werner-SEdMjqphH88wryQfseakQg,
	linux-bluetooth-u79uwXL29TY76Z2rM5mHXA,
	netdev-u79uwXL29TY76Z2rM5mHXA, Alexander Aring, David S . Miller,
	Alexey Kuznetsov, James Morris, Hideaki YOSHIFUJI,
	Patrick McHardy
In-Reply-To: <1461140382-4784-1-git-send-email-aar-bIcnvbaLZ9MEGnE8C9+IrQ@public.gmane.org>

This patch makes the address length an argument of the
ndisc_opt_addr_space function. This is necessary to handle addresses
which don't use dev->addr_len as their address length.

Cc: David S. Miller <davem-fT/PcQaiUtIeIZ0/mPfg9Q@public.gmane.org>
Cc: Alexey Kuznetsov <kuznet-v/Mj1YrvjDBInbfyfbPRSQ@public.gmane.org>
Cc: James Morris <jmorris-gx6/JNMH7DfYtjvyW6yDsg@public.gmane.org>
Cc: Hideaki YOSHIFUJI <yoshfuji-VfPWfsRibaP+Ru+s062T9g@public.gmane.org>
Cc: Patrick McHardy <kaber-dcUjhNyLwpNeoWH0uzbU5w@public.gmane.org>
Signed-off-by: Alexander Aring <aar-bIcnvbaLZ9MEGnE8C9+IrQ@public.gmane.org>
---
 include/net/ndisc.h |  8 ++++----
 net/ipv6/ndisc.c    | 10 +++++-----
 2 files changed, 9 insertions(+), 9 deletions(-)

diff --git a/include/net/ndisc.h b/include/net/ndisc.h
index 2d8edaa..ef43e88 100644
--- a/include/net/ndisc.h
+++ b/include/net/ndisc.h
@@ -127,10 +127,10 @@ static inline int ndisc_addr_option_pad(unsigned short type)
 	}
 }
 
-static inline int ndisc_opt_addr_space(struct net_device *dev)
+static inline int ndisc_opt_addr_space(struct net_device *dev,
+				       unsigned char addr_len)
 {
-	return NDISC_OPT_SPACE(dev->addr_len +
-			       ndisc_addr_option_pad(dev->type));
+	return NDISC_OPT_SPACE(addr_len + ndisc_addr_option_pad(dev->type));
 }
 
 static inline u8 *ndisc_opt_addr_data(struct nd_opt_hdr *p,
@@ -139,7 +139,7 @@ static inline u8 *ndisc_opt_addr_data(struct nd_opt_hdr *p,
 	u8 *lladdr = (u8 *)(p + 1);
 	int lladdrlen = p->nd_opt_len << 3;
 	int prepad = ndisc_addr_option_pad(dev->type);
-	if (lladdrlen != ndisc_opt_addr_space(dev))
+	if (lladdrlen != ndisc_opt_addr_space(dev, dev->addr_len))
 		return NULL;
 	return lladdr + prepad;
 }
diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c
index c245895..69e20e3 100644
--- a/net/ipv6/ndisc.c
+++ b/net/ipv6/ndisc.c
@@ -154,7 +154,7 @@ static void ndisc_fill_addr_option(struct sk_buff *skb, int type, void *data)
 {
 	int pad   = ndisc_addr_option_pad(skb->dev->type);
 	int data_len = skb->dev->addr_len;
-	int space = ndisc_opt_addr_space(skb->dev);
+	int space = ndisc_opt_addr_space(skb->dev, skb->dev->addr_len);
 	u8 *opt = skb_put(skb, space);
 
 	opt[0] = type;
@@ -509,7 +509,7 @@ void ndisc_send_na(struct net_device *dev, const struct in6_addr *daddr,
 	if (!dev->addr_len)
 		inc_opt = 0;
 	if (inc_opt)
-		optlen += ndisc_opt_addr_space(dev);
+		optlen += ndisc_opt_addr_space(dev, dev->addr_len);
 
 	skb = ndisc_alloc_skb(dev, sizeof(*msg) + optlen);
 	if (!skb)
@@ -574,7 +574,7 @@ void ndisc_send_ns(struct net_device *dev, const struct in6_addr *solicit,
 	if (ipv6_addr_any(saddr))
 		inc_opt = false;
 	if (inc_opt)
-		optlen += ndisc_opt_addr_space(dev);
+		optlen += ndisc_opt_addr_space(dev, dev->addr_len);
 
 	skb = ndisc_alloc_skb(dev, sizeof(*msg) + optlen);
 	if (!skb)
@@ -626,7 +626,7 @@ void ndisc_send_rs(struct net_device *dev, const struct in6_addr *saddr,
 	}
 #endif
 	if (send_sllao)
-		optlen += ndisc_opt_addr_space(dev);
+		optlen += ndisc_opt_addr_space(dev, dev->addr_len);
 
 	skb = ndisc_alloc_skb(dev, sizeof(*msg) + optlen);
 	if (!skb)
@@ -1563,7 +1563,7 @@ void ndisc_send_redirect(struct sk_buff *skb, const struct in6_addr *target)
 			memcpy(ha_buf, neigh->ha, dev->addr_len);
 			read_unlock_bh(&neigh->lock);
 			ha = ha_buf;
-			optlen += ndisc_opt_addr_space(dev);
+			optlen += ndisc_opt_addr_space(dev, dev->addr_len);
 		} else
 			read_unlock_bh(&neigh->lock);
 
-- 
2.8.0

^ permalink raw reply related

* [PATCHv2 bluetooth-next 03/10] 6lowpan: remove ipv6 module request
From: Alexander Aring @ 2016-04-20  8:19 UTC (permalink / raw)
  To: linux-wpan-u79uwXL29TY76Z2rM5mHXA
  Cc: kernel-bIcnvbaLZ9MEGnE8C9+IrQ, marcel-kz+m5ild9QBg9hUCZPvPmw,
	jukka.rissanen-VuQAYsv1563Yd54FQh9/CA,
	hannes-tFNcAqjVMyqKXQKiL6tip0B+6BGkLq7r,
	stefan-JPH+aEBZ4P+UEJcrhfAQsw, mcr-SWp7JaYWvAQV+D8aMU/kSg,
	werner-SEdMjqphH88wryQfseakQg,
	linux-bluetooth-u79uwXL29TY76Z2rM5mHXA,
	netdev-u79uwXL29TY76Z2rM5mHXA, Alexander Aring
In-Reply-To: <1461140382-4784-1-git-send-email-aar-bIcnvbaLZ9MEGnE8C9+IrQ@public.gmane.org>

Since we use exported functions from the ipv6 kernel module, we no longer
need to request the module to have ipv6 functionality.

Signed-off-by: Alexander Aring <aar-bIcnvbaLZ9MEGnE8C9+IrQ@public.gmane.org>
---
 net/6lowpan/core.c | 2 --
 1 file changed, 2 deletions(-)

diff --git a/net/6lowpan/core.c b/net/6lowpan/core.c
index fbae31e..824d1bc 100644
--- a/net/6lowpan/core.c
+++ b/net/6lowpan/core.c
@@ -158,8 +158,6 @@ static int __init lowpan_module_init(void)
 		return ret;
 	}
 
-	request_module_nowait("ipv6");
-
 	request_module_nowait("nhc_dest");
 	request_module_nowait("nhc_fragment");
 	request_module_nowait("nhc_hop");
-- 
2.8.0

^ permalink raw reply related

* [PATCHv2 bluetooth-next 02/10] 6lowpan: add 802.15.4 short addr slaac
From: Alexander Aring @ 2016-04-20  8:19 UTC (permalink / raw)
  To: linux-wpan-u79uwXL29TY76Z2rM5mHXA
  Cc: kernel-bIcnvbaLZ9MEGnE8C9+IrQ, marcel-kz+m5ild9QBg9hUCZPvPmw,
	jukka.rissanen-VuQAYsv1563Yd54FQh9/CA,
	hannes-tFNcAqjVMyqKXQKiL6tip0B+6BGkLq7r,
	stefan-JPH+aEBZ4P+UEJcrhfAQsw, mcr-SWp7JaYWvAQV+D8aMU/kSg,
	werner-SEdMjqphH88wryQfseakQg,
	linux-bluetooth-u79uwXL29TY76Z2rM5mHXA,
	netdev-u79uwXL29TY76Z2rM5mHXA, Alexander Aring, David S . Miller,
	Alexey Kuznetsov, James Morris, Hideaki YOSHIFUJI,
	Patrick McHardy
In-Reply-To: <1461140382-4784-1-git-send-email-aar-bIcnvbaLZ9MEGnE8C9+IrQ@public.gmane.org>

This patch adds address autoconfiguration for 802.15.4 6LoWPAN interfaces
when a valid 802.15.4 short address is available.

Cc: David S. Miller <davem-fT/PcQaiUtIeIZ0/mPfg9Q@public.gmane.org>
Cc: Alexey Kuznetsov <kuznet-v/Mj1YrvjDBInbfyfbPRSQ@public.gmane.org>
Cc: James Morris <jmorris-gx6/JNMH7DfYtjvyW6yDsg@public.gmane.org>
Cc: Hideaki YOSHIFUJI <yoshfuji-VfPWfsRibaP+Ru+s062T9g@public.gmane.org>
Cc: Patrick McHardy <kaber-dcUjhNyLwpNeoWH0uzbU5w@public.gmane.org>
Signed-off-by: Alexander Aring <aar-bIcnvbaLZ9MEGnE8C9+IrQ@public.gmane.org>
---
 include/net/addrconf.h |  3 +++
 net/6lowpan/core.c     | 46 ++++++++++++++++++++++++++++++++++++++++++++++
 net/ipv6/addrconf.c    |  5 +++--
 3 files changed, 52 insertions(+), 2 deletions(-)

diff --git a/include/net/addrconf.h b/include/net/addrconf.h
index 730d856..b1774eb 100644
--- a/include/net/addrconf.h
+++ b/include/net/addrconf.h
@@ -94,6 +94,9 @@ int ipv6_rcv_saddr_equal(const struct sock *sk, const struct sock *sk2,
 void addrconf_join_solict(struct net_device *dev, const struct in6_addr *addr);
 void addrconf_leave_solict(struct inet6_dev *idev, const struct in6_addr *addr);
 
+void addrconf_add_linklocal(struct inet6_dev *idev,
+			    const struct in6_addr *addr, u32 flags);
+
 static inline int addrconf_ifid_eui48(u8 *eui, struct net_device *dev)
 {
 	if (dev->addr_len != ETH_ALEN)
diff --git a/net/6lowpan/core.c b/net/6lowpan/core.c
index 7a240b3..fbae31e 100644
--- a/net/6lowpan/core.c
+++ b/net/6lowpan/core.c
@@ -14,6 +14,7 @@
 #include <linux/module.h>
 
 #include <net/6lowpan.h>
+#include <net/addrconf.h>
 
 #include "6lowpan_i.h"
 
@@ -72,16 +73,61 @@ void lowpan_unregister_netdev(struct net_device *dev)
 }
 EXPORT_SYMBOL(lowpan_unregister_netdev);
 
+static int addrconf_ifid_802154_6lowpan(u8 *eui, struct net_device *dev)
+{
+	struct wpan_dev *wpan_dev = lowpan_802154_dev(dev)->wdev->ieee802154_ptr;
+
+	/* Set short_addr autoconfiguration if short_addr is present only */
+	if (!ieee802154_is_valid_src_short_addr(wpan_dev->short_addr))
+		return -1;
+
+	/* For either address format, all zero addresses MUST NOT be used */
+	if (wpan_dev->pan_id == cpu_to_le16(0x0000) &&
+	    wpan_dev->short_addr == cpu_to_le16(0x0000))
+		return -1;
+
+	/* Alternatively, if no PAN ID is known, 16 zero bits may be used */
+	if (wpan_dev->pan_id == cpu_to_le16(IEEE802154_PAN_ID_BROADCAST))
+		memset(eui, 0, 2);
+	else
+		ieee802154_le16_to_be16(eui, &wpan_dev->pan_id);
+
+	/* The "Universal/Local" (U/L) bit shall be set to zero */
+	eui[0] &= ~2;
+	eui[2] = 0;
+	eui[3] = 0xFF;
+	eui[4] = 0xFE;
+	eui[5] = 0;
+	ieee802154_le16_to_be16(&eui[6], &wpan_dev->short_addr);
+	return 0;
+}
+
 static int lowpan_event(struct notifier_block *unused,
 			unsigned long event, void *ptr)
 {
 	struct net_device *dev = netdev_notifier_info_to_dev(ptr);
+	struct inet6_dev *idev;
+	struct in6_addr addr;
 	int i;
 
 	if (dev->type != ARPHRD_6LOWPAN)
 		return NOTIFY_DONE;
 
+	idev = __in6_dev_get(dev);
+	if (!idev)
+		return NOTIFY_DONE;
+
 	switch (event) {
+	case NETDEV_UP:
+	case NETDEV_CHANGE:
+		/* (802.15.4 6LoWPAN short address slaac handling */
+		if (lowpan_is_ll(dev, LOWPAN_LLTYPE_IEEE802154) &&
+		    addrconf_ifid_802154_6lowpan(addr.s6_addr + 8, dev) == 0) {
+			__ipv6_addr_set_half(&addr.s6_addr32[0],
+					     htonl(0xFE800000), 0);
+			addrconf_add_linklocal(idev, &addr, 0);
+		}
+		break;
 	case NETDEV_DOWN:
 		for (i = 0; i < LOWPAN_IPHC_CTX_TABLE_SIZE; i++)
 			clear_bit(LOWPAN_IPHC_CTX_FLAG_ACTIVE,
diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index 27aed1a..54e18c2 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -2948,8 +2948,8 @@ static void init_loopback(struct net_device *dev)
 	}
 }
 
-static void addrconf_add_linklocal(struct inet6_dev *idev,
-				   const struct in6_addr *addr, u32 flags)
+void addrconf_add_linklocal(struct inet6_dev *idev,
+			    const struct in6_addr *addr, u32 flags)
 {
 	struct inet6_ifaddr *ifp;
 	u32 addr_flags = flags | IFA_F_PERMANENT;
@@ -2968,6 +2968,7 @@ static void addrconf_add_linklocal(struct inet6_dev *idev,
 		in6_ifa_put(ifp);
 	}
 }
+EXPORT_SYMBOL(addrconf_add_linklocal);
 
 static bool ipv6_reserved_interfaceid(struct in6_addr address)
 {
-- 
2.8.0

^ permalink raw reply related

* [PATCHv2 bluetooth-next 01/10] 6lowpan: add private neighbour data
From: Alexander Aring @ 2016-04-20  8:19 UTC (permalink / raw)
  To: linux-wpan-u79uwXL29TY76Z2rM5mHXA
  Cc: kernel-bIcnvbaLZ9MEGnE8C9+IrQ, marcel-kz+m5ild9QBg9hUCZPvPmw,
	jukka.rissanen-VuQAYsv1563Yd54FQh9/CA,
	hannes-tFNcAqjVMyqKXQKiL6tip0B+6BGkLq7r,
	stefan-JPH+aEBZ4P+UEJcrhfAQsw, mcr-SWp7JaYWvAQV+D8aMU/kSg,
	werner-SEdMjqphH88wryQfseakQg,
	linux-bluetooth-u79uwXL29TY76Z2rM5mHXA,
	netdev-u79uwXL29TY76Z2rM5mHXA, Alexander Aring, David S . Miller
In-Reply-To: <1461140382-4784-1-git-send-email-aar-bIcnvbaLZ9MEGnE8C9+IrQ@public.gmane.org>

This patch introduces 6lowpan neighbour private data. As with the
interface private data, we handle private data for generic 6lowpan and
for link-layer specific 6lowpan.

The first use case is to save the short address for an 802.15.4
6lowpan neighbour.

Cc: David S. Miller <davem-fT/PcQaiUtIeIZ0/mPfg9Q@public.gmane.org>
Signed-off-by: Alexander Aring <aar-bIcnvbaLZ9MEGnE8C9+IrQ@public.gmane.org>
---
 include/linux/netdevice.h     |  3 +--
 include/net/6lowpan.h         | 24 ++++++++++++++++++++++++
 net/bluetooth/6lowpan.c       |  2 ++
 net/ieee802154/6lowpan/core.c | 12 ++++++++++++
 4 files changed, 39 insertions(+), 2 deletions(-)

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 166402a..0052c42 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -1487,8 +1487,7 @@ enum netdev_priv_flags {
  * 	@perm_addr:		Permanent hw address
  * 	@addr_assign_type:	Hw address assignment type
  * 	@addr_len:		Hardware address length
- * 	@neigh_priv_len;	Used in neigh_alloc(),
- * 				initialized only in atm/clip.c
+ *	@neigh_priv_len;	Used in neigh_alloc()
  * 	@dev_id:		Used to differentiate devices that share
  * 				the same link layer address
  * 	@dev_port:		Used to differentiate devices that share
diff --git a/include/net/6lowpan.h b/include/net/6lowpan.h
index da84cf9..61c6517 100644
--- a/include/net/6lowpan.h
+++ b/include/net/6lowpan.h
@@ -98,6 +98,9 @@ static inline bool lowpan_is_iphc(u8 dispatch)
 #define LOWPAN_PRIV_SIZE(llpriv_size)	\
 	(sizeof(struct lowpan_dev) + llpriv_size)
 
+#define LOWPAN_NEIGH_PRIV_SIZE(llneigh_priv_size)	\
+	(sizeof(struct lowpan_neigh) + llneigh_priv_size)
+
 enum lowpan_lltypes {
 	LOWPAN_LLTYPE_BTLE,
 	LOWPAN_LLTYPE_IEEE802154,
@@ -141,6 +144,27 @@ struct lowpan_dev {
 	u8 priv[0] __aligned(sizeof(void *));
 };
 
+struct lowpan_neigh {
+	/* 6LoWPAN neigh private data */
+	/* must be last */
+	u8 priv[0] __aligned(sizeof(void *));
+};
+
+struct lowpan_802154_neigh {
+	__le16 short_addr;
+};
+
+static inline struct lowpan_neigh *lowpan_neigh(void *neigh_priv)
+{
+	return neigh_priv;
+}
+
+static inline
+struct lowpan_802154_neigh *lowpan_802154_neigh(void *neigh_priv)
+{
+	return (struct lowpan_802154_neigh *)lowpan_neigh(neigh_priv)->priv;
+}
+
 static inline
 struct lowpan_dev *lowpan_dev(const struct net_device *dev)
 {
diff --git a/net/bluetooth/6lowpan.c b/net/bluetooth/6lowpan.c
index 38e82dd..b7c4efa 100644
--- a/net/bluetooth/6lowpan.c
+++ b/net/bluetooth/6lowpan.c
@@ -833,6 +833,8 @@ static int setup_netdev(struct l2cap_chan *chan, struct lowpan_btle_dev **dev)
 	list_add_rcu(&(*dev)->list, &bt_6lowpan_devices);
 	spin_unlock(&devices_lock);
 
+	netdev->neigh_priv_len = LOWPAN_NEIGH_PRIV_SIZE(0);
+
 	err = lowpan_register_netdev(netdev, LOWPAN_LLTYPE_BTLE);
 	if (err < 0) {
 		BT_INFO("register_netdev failed %d", err);
diff --git a/net/ieee802154/6lowpan/core.c b/net/ieee802154/6lowpan/core.c
index dd085db..3162632 100644
--- a/net/ieee802154/6lowpan/core.c
+++ b/net/ieee802154/6lowpan/core.c
@@ -92,11 +92,21 @@ static int lowpan_stop(struct net_device *dev)
 	return 0;
 }
 
+static int lowpan_neigh_construct(struct neighbour *n)
+{
+	struct lowpan_802154_neigh *neigh = lowpan_802154_neigh(neighbour_priv(n));
+
+	/* default no short_addr is available for a neighbour */
+	neigh->short_addr = cpu_to_le16(IEEE802154_ADDR_SHORT_UNSPEC);
+	return 0;
+}
+
 static const struct net_device_ops lowpan_netdev_ops = {
 	.ndo_init		= lowpan_dev_init,
 	.ndo_start_xmit		= lowpan_xmit,
 	.ndo_open		= lowpan_open,
 	.ndo_stop		= lowpan_stop,
+	.ndo_neigh_construct    = lowpan_neigh_construct,
 };
 
 static void lowpan_setup(struct net_device *ldev)
@@ -161,6 +171,8 @@ static int lowpan_newlink(struct net *src_net, struct net_device *ldev,
 				wdev->needed_headroom;
 	ldev->needed_tailroom = wdev->needed_tailroom;
 
+	ldev->neigh_priv_len = LOWPAN_NEIGH_PRIV_SIZE(sizeof(struct lowpan_802154_neigh));
+
 	ret = lowpan_register_netdevice(ldev, LOWPAN_LLTYPE_IEEE802154);
 	if (ret < 0) {
 		dev_put(wdev);
-- 
2.8.0

^ permalink raw reply related

* [PATCHv2 bluetooth-next 00/10] 6lowpan: introduce basic 6lowpan-nd
From: Alexander Aring @ 2016-04-20  8:19 UTC (permalink / raw)
  To: linux-wpan-u79uwXL29TY76Z2rM5mHXA
  Cc: kernel-bIcnvbaLZ9MEGnE8C9+IrQ, marcel-kz+m5ild9QBg9hUCZPvPmw,
	jukka.rissanen-VuQAYsv1563Yd54FQh9/CA,
	hannes-tFNcAqjVMyqKXQKiL6tip0B+6BGkLq7r,
	stefan-JPH+aEBZ4P+UEJcrhfAQsw, mcr-SWp7JaYWvAQV+D8aMU/kSg,
	werner-SEdMjqphH88wryQfseakQg,
	linux-bluetooth-u79uwXL29TY76Z2rM5mHXA,
	netdev-u79uwXL29TY76Z2rM5mHXA, Alexander Aring, David S . Miller,
	Alexey Kuznetsov, James Morris, Hideaki YOSHIFUJI,
	Patrick McHardy

Hi,

this patch series introduces a layer for IPv6 neighbour discovery. First,
it introduces the "ndisc_ops" to allow different handling for recv/send of
NA/NS messages. The use case for such ndisc operations is RFC 6775 [0],
which describes a different neighbour discovery handling for 6LoWPAN networks.

I didn't implement RFC 6775 in this patch series, but introducing a callback
structure to replace different functions in the ndisc implementation might be
the right direction.

Another use case would be RFC 7400 [1], which describes a new option field for
getting the capabilities of 6LoWPAN next header compression methods.

What I implemented is the necessary functionality to handle short addresses for
802.15.4 6LoWPAN networks. The L2-Layer "802.15.4" can have two different
link-layer addresses which can be used mixed at the same time inside 802.15.4
networks. How to deal with such behaviour in ndisc is defined in RFC 4944 [2].
The bad news is that I saw different implementations handle this differently.
What Linux will do is to add two source/target address information option
fields, each with a different length, if the short address is valid (it can
also not be given).
Example:

- WPAN interface address settings
 - extended addr (must always be there)
 - short addr (0xfffe or 0xffff -> invalid)

Will add an extended addr to source/target address information option field.
If short addr is in some valid range, then both addresses will be added to
the option fields. These different address types are indicated by the length
field (extended -> length=2, short -> length=1), according to [1].

The tested 6LoWPAN implementation (RIOT-OS) allows only one source/target
option field which is short XOR extended, otherwise it will be dropped.
There is some lack of information there [2] and I don't know how to deal with
it right, maybe we need to update the implementation there if it's really
wrong.

To save such information for each neighbour we use the already implemented
neighbour private data with some casting strategy for 6LoWPAN and 6LoWPAN
link-layer specific data e.g. 802.15.4 short address handling.

Additionally, I implemented adding 6CO to the is_useropt callback in case of a
6LoWPAN interface. The 6CO options, which are placed in RA messages, will
currently be parsed in userspace.

The ndisc_ops are not finished yet, of course we need handling for RS messages
to place the 802.15.4 short address there as well and then also processing
of RA messages for the 802.15.4 SLLAO option field.

- Alex

[0] https://tools.ietf.org/html/rfc6775
[1] https://tools.ietf.org/html/rfc7400#section-3.3
[2] https://tools.ietf.org/html/rfc4944#section-8

Cc: David S. Miller <davem-fT/PcQaiUtIeIZ0/mPfg9Q@public.gmane.org>
Cc: Alexey Kuznetsov <kuznet-v/Mj1YrvjDBInbfyfbPRSQ@public.gmane.org>
Cc: James Morris <jmorris-gx6/JNMH7DfYtjvyW6yDsg@public.gmane.org>
Cc: Hideaki YOSHIFUJI <yoshfuji-VfPWfsRibaP+Ru+s062T9g@public.gmane.org>
Cc: Patrick McHardy <kaber-dcUjhNyLwpNeoWH0uzbU5w@public.gmane.org>

changes since v1:
 - replace #ifdef CONFIG_IPV6 to #if IS_ENABLED(...)
 - replace #ifdef CONFIG_IEEE802154... to #if IS_ENABLED(...)
 - add more #if IS_ENABLED(CONFIG_IPV6) in ndisc.h

Alexander Aring (10):
  6lowpan: add private neighbour data
  6lowpan: add 802.15.4 short addr slaac
  6lowpan: remove ipv6 module request
  ndisc: add addr_len parameter to ndisc_opt_addr_space
  ndisc: add addr_len parameter to ndisc_opt_addr_data
  ndisc: add addr_len parameter to ndisc_fill_addr_option
  ipv6: introduce neighbour discovery ops
  ipv6: export ndisc functions
  6lowpan: introduce 6lowpan-nd
  6lowpan: add support for 802.15.4 short addr handling

 include/linux/netdevice.h     |   6 +-
 include/net/6lowpan.h         |  24 ++
 include/net/addrconf.h        |   3 +
 include/net/ndisc.h           | 124 ++++++++-
 net/6lowpan/6lowpan_i.h       |   2 +
 net/6lowpan/Makefile          |   2 +-
 net/6lowpan/core.c            |  50 +++-
 net/6lowpan/iphc.c            | 167 +++++++++--
 net/6lowpan/ndisc.c           | 633 ++++++++++++++++++++++++++++++++++++++++++
 net/bluetooth/6lowpan.c       |   2 +
 net/ieee802154/6lowpan/core.c |  12 +
 net/ieee802154/6lowpan/tx.c   | 107 ++++---
 net/ipv6/addrconf.c           |   7 +-
 net/ipv6/ndisc.c              | 132 +++++----
 net/ipv6/route.c              |   4 +-
 15 files changed, 1117 insertions(+), 158 deletions(-)
 create mode 100644 net/6lowpan/ndisc.c

-- 
2.8.0

^ permalink raw reply

* Re: [PATCH v2] carl9170: Clarify kconfig text
From: Kalle Valo @ 2016-04-20  7:59 UTC (permalink / raw)
  To: Christian Lamparter; +Cc: Lauri Kasanen, linux-wireless, netdev, linux-kernel
In-Reply-To: <11418613.XH7h8m2RjK@debian64>

Christian Lamparter <chunkeey@googlemail.com> writes:

> On Monday, April 18, 2016 07:42:05 PM Kalle Valo wrote:
>> Christian Lamparter <chunkeey@googlemail.com> writes:
>> 
>> > On Monday, April 18, 2016 06:45:09 PM Kalle Valo wrote:
>> >
>> >> Why even mention anything about a "special firmware" as the firmware is
>> >> already available from linux-firmware.git? 
>> >
>> > Yes and no. 1.9.6 is in linux-firmware.git. I've tried to add 1.9.9 too
>> > but that failed.
>> > <http://comments.gmane.org/gmane.linux.kernel.wireless.general/114639>
>> 
>> Rick's comment makes sense to me, better just to provide the latest
>> version. No need to unnecessary confuse the users. And if someone really
>> wants to use an older version that she can retrieve it from the git
>> history.
>
> Part of the fun here is that firmware is GPLv2. The linux-firmware.git has
> to point to or add the firmware source to their tree. They have added every
> single source file to it.... instead of "packaging" it in a tar.bz2/gz/xz
> like you normally do for release sources.
>
> If you want to read more about it:
> <http://www.spinics.net/lists/linux-wireless/msg101868.html>

Yeah, that's more work. I get that. But I'm still not understanding
what's the actual problem which prevents us from updating carl9170
firmware in linux-firmware.

-- 
Kalle Valo

^ permalink raw reply

* [PATCH v2 net 1/3] qede: Fix various memory allocation error flows for fastpath
From: Manish Chopra @ 2016-04-20  7:03 UTC (permalink / raw)
  To: davem; +Cc: netdev, Ariel.Elior, Yuval.Mintz
In-Reply-To: <1461135809-9776-1-git-send-email-manish.chopra@qlogic.com>

This patch handles memory allocation failures for fastpath
gracefully in the driver.

Signed-off-by: Manish Chopra <manish.chopra@qlogic.com>
Signed-off-by: Yuval Mintz <yuval.mintz@qlogic.com>
---
 drivers/net/ethernet/qlogic/qede/qede_main.c | 140 ++++++++++++++++-----------
 1 file changed, 85 insertions(+), 55 deletions(-)

diff --git a/drivers/net/ethernet/qlogic/qede/qede_main.c b/drivers/net/ethernet/qlogic/qede/qede_main.c
index 518af32..5cf1eb2 100644
--- a/drivers/net/ethernet/qlogic/qede/qede_main.c
+++ b/drivers/net/ethernet/qlogic/qede/qede_main.c
@@ -750,6 +750,12 @@ static bool qede_has_tx_work(struct qede_fastpath *fp)
 	return false;
 }
 
+static inline void qede_rx_bd_ring_consume(struct qede_rx_queue *rxq)
+{
+	qed_chain_consume(&rxq->rx_bd_ring);
+	rxq->sw_rx_cons++;
+}
+
 /* This function reuses the buffer(from an offset) from
  * consumer index to producer index in the bd ring
  */
@@ -773,6 +779,21 @@ static inline void qede_reuse_page(struct qede_dev *edev,
 	curr_cons->data = NULL;
 }
 
+/* In case of allocation failures reuse buffers
+ * from consumer index to produce buffers for firmware
+ */
+static void qede_recycle_rx_bd_ring(struct qede_rx_queue *rxq,
+				    struct qede_dev *edev, u8 count)
+{
+	struct sw_rx_data *curr_cons;
+
+	for (; count > 0; count--) {
+		curr_cons = &rxq->sw_rx_ring[rxq->sw_rx_cons & NUM_RX_BDS_MAX];
+		qede_reuse_page(edev, rxq, curr_cons);
+		qede_rx_bd_ring_consume(rxq);
+	}
+}
+
 static inline int qede_realloc_rx_buffer(struct qede_dev *edev,
 					 struct qede_rx_queue *rxq,
 					 struct sw_rx_data *curr_cons)
@@ -781,8 +802,14 @@ static inline int qede_realloc_rx_buffer(struct qede_dev *edev,
 	curr_cons->page_offset += rxq->rx_buf_seg_size;
 
 	if (curr_cons->page_offset == PAGE_SIZE) {
-		if (unlikely(qede_alloc_rx_buffer(edev, rxq)))
+		if (unlikely(qede_alloc_rx_buffer(edev, rxq))) {
+			/* Since we failed to allocate new buffer
+			 * current buffer can be used again.
+			 */
+			curr_cons->page_offset -= rxq->rx_buf_seg_size;
+
 			return -ENOMEM;
+		}
 
 		dma_unmap_page(&edev->pdev->dev, curr_cons->mapping,
 			       PAGE_SIZE, DMA_FROM_DEVICE);
@@ -901,7 +928,10 @@ static int qede_fill_frag_skb(struct qede_dev *edev,
 			   len_on_bd);
 
 	if (unlikely(qede_realloc_rx_buffer(edev, rxq, current_bd))) {
-		tpa_info->agg_state = QEDE_AGG_STATE_ERROR;
+		/* Incr page ref count to reuse on allocation failure
+		 * so that it doesn't get freed while freeing SKB.
+		 */
+		atomic_inc(&current_bd->data->_count);
 		goto out;
 	}
 
@@ -915,6 +945,8 @@ static int qede_fill_frag_skb(struct qede_dev *edev,
 	return 0;
 
 out:
+	tpa_info->agg_state = QEDE_AGG_STATE_ERROR;
+	qede_recycle_rx_bd_ring(rxq, edev, 1);
 	return -ENOMEM;
 }
 
@@ -966,8 +998,9 @@ static void qede_tpa_start(struct qede_dev *edev,
 	tpa_info->skb = netdev_alloc_skb(edev->ndev,
 					 le16_to_cpu(cqe->len_on_first_bd));
 	if (unlikely(!tpa_info->skb)) {
+		DP_NOTICE(edev, "Failed to allocate SKB for gro\n");
 		tpa_info->agg_state = QEDE_AGG_STATE_ERROR;
-		return;
+		goto cons_buf;
 	}
 
 	skb_put(tpa_info->skb, le16_to_cpu(cqe->len_on_first_bd));
@@ -990,6 +1023,7 @@ static void qede_tpa_start(struct qede_dev *edev,
 	/* This is needed in order to enable forwarding support */
 	qede_set_gro_params(edev, tpa_info->skb, cqe);
 
+cons_buf: /* We still need to handle bd_len_list to consume buffers */
 	if (likely(cqe->ext_bd_len_list[0]))
 		qede_fill_frag_skb(edev, rxq, cqe->tpa_agg_index,
 				   le16_to_cpu(cqe->ext_bd_len_list[0]));
@@ -1244,17 +1278,17 @@ static int qede_rx_int(struct qede_fastpath *fp, int budget)
 				  "CQE in CONS = %u has error, flags = %x, dropping incoming packet\n",
 				  sw_comp_cons, parse_flag);
 			rxq->rx_hw_errors++;
-			qede_reuse_page(edev, rxq, sw_rx_data);
-			goto next_rx;
+			qede_recycle_rx_bd_ring(rxq, edev, fp_cqe->bd_num);
+			goto next_cqe;
 		}
 
 		skb = netdev_alloc_skb(edev->ndev, QEDE_RX_HDR_SIZE);
 		if (unlikely(!skb)) {
 			DP_NOTICE(edev,
 				  "Build_skb failed, dropping incoming packet\n");
-			qede_reuse_page(edev, rxq, sw_rx_data);
+			qede_recycle_rx_bd_ring(rxq, edev, fp_cqe->bd_num);
 			rxq->rx_alloc_errors++;
-			goto next_rx;
+			goto next_cqe;
 		}
 
 		/* Copy data into SKB */
@@ -1288,11 +1322,22 @@ static int qede_rx_int(struct qede_fastpath *fp, int budget)
 			if (unlikely(qede_realloc_rx_buffer(edev, rxq,
 							    sw_rx_data))) {
 				DP_ERR(edev, "Failed to allocate rx buffer\n");
+				/* Incr page ref count to reuse on allocation
+				 * failure so that it doesn't get freed while
+				 * freeing SKB.
+				 */
+
+				atomic_inc(&sw_rx_data->data->_count);
 				rxq->rx_alloc_errors++;
+				qede_recycle_rx_bd_ring(rxq, edev,
+							fp_cqe->bd_num);
+				dev_kfree_skb_any(skb);
 				goto next_cqe;
 			}
 		}
 
+		qede_rx_bd_ring_consume(rxq);
+
 		if (fp_cqe->bd_num != 1) {
 			u16 pkt_len = le16_to_cpu(fp_cqe->pkt_len);
 			u8 num_frags;
@@ -1303,18 +1348,27 @@ static int qede_rx_int(struct qede_fastpath *fp, int budget)
 			     num_frags--) {
 				u16 cur_size = pkt_len > rxq->rx_buf_size ?
 						rxq->rx_buf_size : pkt_len;
+				if (unlikely(!cur_size)) {
+					DP_ERR(edev,
+					       "Still got %d BDs for mapping jumbo, but length became 0\n",
+					       num_frags);
+					qede_recycle_rx_bd_ring(rxq, edev,
+								num_frags);
+					dev_kfree_skb_any(skb);
+					goto next_cqe;
+				}
 
-				WARN_ONCE(!cur_size,
-					  "Still got %d BDs for mapping jumbo, but length became 0\n",
-					  num_frags);
-
-				if (unlikely(qede_alloc_rx_buffer(edev, rxq)))
+				if (unlikely(qede_alloc_rx_buffer(edev, rxq))) {
+					qede_recycle_rx_bd_ring(rxq, edev,
+								num_frags);
+					dev_kfree_skb_any(skb);
 					goto next_cqe;
+				}
 
-				rxq->sw_rx_cons++;
 				sw_rx_index = rxq->sw_rx_cons & NUM_RX_BDS_MAX;
 				sw_rx_data = &rxq->sw_rx_ring[sw_rx_index];
-				qed_chain_consume(&rxq->rx_bd_ring);
+				qede_rx_bd_ring_consume(rxq);
+
 				dma_unmap_page(&edev->pdev->dev,
 					       sw_rx_data->mapping,
 					       PAGE_SIZE, DMA_FROM_DEVICE);
@@ -1330,7 +1384,7 @@ static int qede_rx_int(struct qede_fastpath *fp, int budget)
 				pkt_len -= cur_size;
 			}
 
-			if (pkt_len)
+			if (unlikely(pkt_len))
 				DP_ERR(edev,
 				       "Mapped all BDs of jumbo, but still have %d bytes\n",
 				       pkt_len);
@@ -1349,10 +1403,6 @@ static int qede_rx_int(struct qede_fastpath *fp, int budget)
 		skb_record_rx_queue(skb, fp->rss_id);
 
 		qede_skb_receive(edev, fp, skb, le16_to_cpu(fp_cqe->vlan_tag));
-
-		qed_chain_consume(&rxq->rx_bd_ring);
-next_rx:
-		rxq->sw_rx_cons++;
 next_rx_only:
 		rx_pkt++;
 
@@ -2257,7 +2307,7 @@ static void qede_free_sge_mem(struct qede_dev *edev,
 		struct qede_agg_info *tpa_info = &rxq->tpa_info[i];
 		struct sw_rx_data *replace_buf = &tpa_info->replace_buf;
 
-		if (replace_buf) {
+		if (replace_buf->data) {
 			dma_unmap_page(&edev->pdev->dev,
 				       dma_unmap_addr(replace_buf, mapping),
 				       PAGE_SIZE, DMA_FROM_DEVICE);
@@ -2377,7 +2427,7 @@ err:
 static int qede_alloc_mem_rxq(struct qede_dev *edev,
 			      struct qede_rx_queue *rxq)
 {
-	int i, rc, size, num_allocated;
+	int i, rc, size;
 
 	rxq->num_rx_buffers = edev->q_num_rx_buffers;
 
@@ -2394,6 +2444,7 @@ static int qede_alloc_mem_rxq(struct qede_dev *edev,
 	rxq->sw_rx_ring = kzalloc(size, GFP_KERNEL);
 	if (!rxq->sw_rx_ring) {
 		DP_ERR(edev, "Rx buffers ring allocation failed\n");
+		rc = -ENOMEM;
 		goto err;
 	}
 
@@ -2421,26 +2472,16 @@ static int qede_alloc_mem_rxq(struct qede_dev *edev,
 	/* Allocate buffers for the Rx ring */
 	for (i = 0; i < rxq->num_rx_buffers; i++) {
 		rc = qede_alloc_rx_buffer(edev, rxq);
-		if (rc)
-			break;
-	}
-	num_allocated = i;
-	if (!num_allocated) {
-		DP_ERR(edev, "Rx buffers allocation failed\n");
-		goto err;
-	} else if (num_allocated < rxq->num_rx_buffers) {
-		DP_NOTICE(edev,
-			  "Allocated less buffers than desired (%d allocated)\n",
-			  num_allocated);
+		if (rc) {
+			DP_ERR(edev,
+			       "Rx buffers allocation failed at index %d\n", i);
+			goto err;
+		}
 	}
 
-	qede_alloc_sge_mem(edev, rxq);
-
-	return 0;
-
+	rc = qede_alloc_sge_mem(edev, rxq);
 err:
-	qede_free_mem_rxq(edev, rxq);
-	return -ENOMEM;
+	return rc;
 }
 
 static void qede_free_mem_txq(struct qede_dev *edev,
@@ -2523,10 +2564,8 @@ static int qede_alloc_mem_fp(struct qede_dev *edev,
 	}
 
 	return 0;
-
 err:
-	qede_free_mem_fp(edev, fp);
-	return -ENOMEM;
+	return rc;
 }
 
 static void qede_free_mem_load(struct qede_dev *edev)
@@ -2549,22 +2588,13 @@ static int qede_alloc_mem_load(struct qede_dev *edev)
 		struct qede_fastpath *fp = &edev->fp_array[rss_id];
 
 		rc = qede_alloc_mem_fp(edev, fp);
-		if (rc)
-			break;
-	}
-
-	if (rss_id != QEDE_RSS_CNT(edev)) {
-		/* Failed allocating memory for all the queues */
-		if (!rss_id) {
+		if (rc) {
 			DP_ERR(edev,
-			       "Failed to allocate memory for the leading queue\n");
-			rc = -ENOMEM;
-		} else {
-			DP_NOTICE(edev,
-				  "Failed to allocate memory for all of RSS queues\n Desired: %d queues, allocated: %d queues\n",
-				  QEDE_RSS_CNT(edev), rss_id);
+			       "Failed to allocate memory for fastpath - rss id = %d\n",
+			       rss_id);
+			qede_free_mem_load(edev);
+			return rc;
 		}
-		edev->num_rss = rss_id;
 	}
 
 	return 0;
-- 
2.7.2

^ permalink raw reply related

* [PATCH v2 net 0/3] qede: Bug fixes
From: Manish Chopra @ 2016-04-20  7:03 UTC (permalink / raw)
  To: davem; +Cc: netdev, Ariel.Elior, Yuval.Mintz

Hi David,

This series fixes -

* various memory allocation failure flows for fastpath
* issues with respect to driver GRO packets handling

V1->V2

* Send series against net instead of net-next.

Please consider applying this series to "net"

Thanks,
Manish

Manish Chopra (3):
  qede: Fix various memory allocation error flows for fastpath
  qede: Fix setting Skb network header
  qede: Fix single MTU sized packet from firmware GRO flow

 drivers/net/ethernet/qlogic/qede/qede_main.c | 157 +++++++++++++++++----------
 1 file changed, 100 insertions(+), 57 deletions(-)

-- 
2.7.2

^ permalink raw reply

* [PATCH v2 net 2/3] qede: Fix setting Skb network header
From: Manish Chopra @ 2016-04-20  7:03 UTC (permalink / raw)
  To: davem; +Cc: netdev, Ariel.Elior, Yuval.Mintz
In-Reply-To: <1461135809-9776-1-git-send-email-manish.chopra@qlogic.com>

Skb's network header needs to be set before extracting IPv4/IPv6
headers from it.

Signed-off-by: Manish Chopra <manish.chopra@qlogic.com>
Signed-off-by: Yuval Mintz <yuval.mintz@qlogic.com>
---
 drivers/net/ethernet/qlogic/qede/qede_main.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/qlogic/qede/qede_main.c b/drivers/net/ethernet/qlogic/qede/qede_main.c
index 5cf1eb2..bf0fb99 100644
--- a/drivers/net/ethernet/qlogic/qede/qede_main.c
+++ b/drivers/net/ethernet/qlogic/qede/qede_main.c
@@ -1041,7 +1041,6 @@ static void qede_gro_ip_csum(struct sk_buff *skb)
 	const struct iphdr *iph = ip_hdr(skb);
 	struct tcphdr *th;
 
-	skb_set_network_header(skb, 0);
 	skb_set_transport_header(skb, sizeof(struct iphdr));
 	th = tcp_hdr(skb);
 
@@ -1056,7 +1055,6 @@ static void qede_gro_ipv6_csum(struct sk_buff *skb)
 	struct ipv6hdr *iph = ipv6_hdr(skb);
 	struct tcphdr *th;
 
-	skb_set_network_header(skb, 0);
 	skb_set_transport_header(skb, sizeof(struct ipv6hdr));
 	th = tcp_hdr(skb);
 
@@ -1073,6 +1071,8 @@ static void qede_gro_receive(struct qede_dev *edev,
 {
 #ifdef CONFIG_INET
 	if (skb_shinfo(skb)->gso_size) {
+		skb_set_network_header(skb, 0);
+
 		switch (skb->protocol) {
 		case htons(ETH_P_IP):
 			qede_gro_ip_csum(skb);
-- 
2.7.2

^ permalink raw reply related

* [PATCH v2 net 3/3] qede: Fix single MTU sized packet from firmware GRO flow
From: Manish Chopra @ 2016-04-20  7:03 UTC (permalink / raw)
  To: davem; +Cc: netdev, Ariel.Elior, Yuval.Mintz
In-Reply-To: <1461135809-9776-1-git-send-email-manish.chopra@qlogic.com>

In firmware assisted GRO flow there could be a single MTU sized
segment arriving due to firmware aggregation timeout/last segment
in an aggregation flow, which is not expected to be an actual gro
packet. So if a skb has zero frags from the GRO flow then simply
push it in the stack as non gso skb.

Signed-off-by: Manish Chopra <manish.chopra@qlogic.com>
Signed-off-by: Yuval Mintz <yuval.mintz@qlogic.com>
---
 drivers/net/ethernet/qlogic/qede/qede_main.c | 13 +++++++++++++
 1 file changed, 13 insertions(+)

diff --git a/drivers/net/ethernet/qlogic/qede/qede_main.c b/drivers/net/ethernet/qlogic/qede/qede_main.c
index bf0fb99..7869465 100644
--- a/drivers/net/ethernet/qlogic/qede/qede_main.c
+++ b/drivers/net/ethernet/qlogic/qede/qede_main.c
@@ -1069,6 +1069,17 @@ static void qede_gro_receive(struct qede_dev *edev,
 			     struct sk_buff *skb,
 			     u16 vlan_tag)
 {
+	/* FW can send a single MTU sized packet from gro flow
+	 * due to aggregation timeout/last segment etc. which
+	 * is not expected to be a gro packet. If a skb has zero
+	 * frags then simply push it in the stack as non gso skb.
+	 */
+	if (unlikely(!skb->data_len)) {
+		skb_shinfo(skb)->gso_type = 0;
+		skb_shinfo(skb)->gso_size = 0;
+		goto send_skb;
+	}
+
 #ifdef CONFIG_INET
 	if (skb_shinfo(skb)->gso_size) {
 		skb_set_network_header(skb, 0);
@@ -1087,6 +1098,8 @@ static void qede_gro_receive(struct qede_dev *edev,
 		}
 	}
 #endif
+
+send_skb:
 	skb_record_rx_queue(skb, fp->rss_id);
 	qede_skb_receive(edev, fp, skb, vlan_tag);
 }
-- 
2.7.2

^ permalink raw reply related

* Re: [PATCH net-next v5] rtnetlink: add new RTM_GETSTATS message to dump link stats
From: Johannes Berg @ 2016-04-20  7:32 UTC (permalink / raw)
  To: David Ahern, David Miller
  Cc: eric.dumazet, roopa, netdev, jhs, tgraf, nicolas.dichtel,
	egrumbach
In-Reply-To: <5716E123.8040002@cumulusnetworks.com>

On Tue, 2016-04-19 at 19:53 -0600, David Ahern wrote:
> 
> The kernel can set a flag in the response that it acknowledges the
> new  attribute/flag. I did that for filtering neigh dumps --
> 21fdd092acc7.
> 

Hm, that works, but I think it requires writing extra code, which I was
kinda trying to avoid. With the patch that Emmanuel wrote, we can
restrict the changes to just nla_parse().

Anyway, I think we just have to document the behaviour very precisely,
and userspace can make its own decisions.

Essentially, apps will have a number of choices:

1) Use the new attribute flag only with commands known to have been
   added after the kernel support was added.

2) Use the new attribute flag with some required attribute for
   existing commands, so that older kernel will not find the required
   attribute and will reject the operation entirely.
   May or may not fall back to trying the operation again without the
   flag.

3) Simply use the new flag and do unexpected things on kernels not
   supporting the rejection mechanism - not much worse than today in
   many cases.

I guess we'll write a proper commit message and send the patch.

johannes

^ permalink raw reply

* Re: [PATCH net-next 0/2] act_bpf, cls_bpf: send eBPF bytecode through
From: Quentin Monnet @ 2016-04-20  7:25 UTC (permalink / raw)
  To: Alexei Starovoitov, Daniel Borkmann; +Cc: netdev
In-Reply-To: <20160415184445.GA58007@ast-mbp.thefacebook.com>

Hi Daniel, Alexei, and many thanks for your answers,

2016-04-15 (11:44 UTC-0700) ~ Alexei Starovoitov:
> On Fri, Apr 15, 2016 at 12:41:05PM +0200, Daniel Borkmann wrote:
>> Hi Quentin,
>>
>> On 04/15/2016 12:07 PM, Quentin Monnet wrote:
>>> When a new BPF traffic control filter or action is set up with tc, the
>>> bytecode is sent back to userspace through a netlink socket for cBPF, but
>>> not for eBPF (the file descriptor pointing to the object file containing
>>> the bytecode is sent instead).
>>>
>>> This patch makes cls_bpf and act_bpf modules send the bytecode for eBPF as
>>> well (in addition to the file descriptor).
>>>
[…]
>>
>> Thanks for working on this, but it's unfortunately not that easy. Let
>> me ask, what would be the intended use-case to dump the insns?
> 
> +1
> 
>> I'm asking because if you dump them as-is, then a reinject at a later
>> time of that bytecode back into the kernel will most likely be rejected
>> by the verifier.
>>
>> This is because on load time, verifier does rewrites/expansion on some
>> of the insns (f.e. map pointers, helper functions, ctx access etc, see
>> also appendix in [1]), so the code as seen in the kernel would need to
>> be sanitized first.
> 
> +1
> we had similar discussion about this in seccomp context and decided that
> the only sensible way is to keep original instructions, but it's wasteful
> to do unconditionally and snapshotting of maps is not possible,
> so there was no use for such dumping facility other than debugging.
> Is it what the patch after?
> We need to discuss it in the proper context.

I am experimenting with BPF, and so far I was just trying to dump the
bytecode sent from tc to the kernel. I had not realized that the
verifier would bring some changes to the instructions. And I agree that
a more comprehensive debugging solution could be obtained if I can find
some way to get a snapshot of the maps.

>> Also, how would you make sense/transform maps into a meaningful
>> representation (probably possible to find a scheme when they are pinned)?
>>
>> Another possibility is that such programs need to be pinned (can be done
>> easily by tc in the background) and then implement a CRIU facility into
>> the bpf(2) syscall to retrieve them. tc could make use of this w/o too
>> much effort, and at the same time it would help CRIU folks, too. It
>> also seems cleaner to have only one central api (bpf(2)) to dump them,
>> but needs a bit of thought.
> 
> +1
> any debugging or criu needs to be done in a centralized way via syscall
> and/or bpffs.

Maintaining a central API around bpf() makes sense to me. I have been
looking at the BPF filesystem to see what information I can obtain from
it, but I did not understand it well. I read the logs of Daniel's commit
b2197755b263 (“bpf: add support for persistent maps/progs”), but I am
unsure how I could use it in order to gather data about the maps and
programs (if this is possible at all). I tried to set up some BPF
filters working with maps, but I could not find any file under
/sys/fs/bpf/tc.

Would you have a pointer to some documentation about this filesystem? Or
is there only the kernel code?

^ permalink raw reply

* AW: [PATCH net-next] net/hsr: Added support for HSR v1
From: Heise, Peter @ 2016-04-20  7:10 UTC (permalink / raw)
  To: David Miller, stephen@networkplumber.org
  Cc: arvid.brodin@alten.se, hannes@stressinduktion.org,
	sd@queasysnail.net, henrik@austad.us, nikolay@cumulusnetworks.com,
	tgraf@suug.ch, linville@tuxdriver.com, gospo@cumulusnetworks.com,
	dsa@cumulusnetworks.com, eranbe@mellanox.com, ast@plumgrid.com,
	netdev@vger.kernel.org
In-Reply-To: <8104_1461082351_571658EF_8104_4642_1_20160419.121227.1437003570791513649.davem@davemloft.net>

Thanks for the advice Stephen, new patch submitted!

-----Ursprüngliche Nachricht-----
Von: David Miller [mailto:davem@davemloft.net] 
Gesendet: Dienstag, 19. April 2016 18:12
An: stephen@networkplumber.org
Cc: mail@pheise.de; arvid.brodin@alten.se; hannes@stressinduktion.org; sd@queasysnail.net; henrik@austad.us; nikolay@cumulusnetworks.com; tgraf@suug.ch; linville@tuxdriver.com; gospo@cumulusnetworks.com; dsa@cumulusnetworks.com; eranbe@mellanox.com; ast@plumgrid.com; netdev@vger.kernel.org; Heise, Peter
Betreff: Re: [PATCH net-next] net/hsr: Added support for HSR v1

From: Stephen Hemminger <stephen@networkplumber.org>
Date: Tue, 19 Apr 2016 08:21:00 -0700

> On Wed, 13 Apr 2016 13:52:22 +0200
> Peter Heise <mail@pheise.de> wrote:
> 
>> diff --git a/include/uapi/linux/if_link.h 
>> b/include/uapi/linux/if_link.h index 9427f17..bb3a90b 100644
>> --- a/include/uapi/linux/if_link.h
>> +++ b/include/uapi/linux/if_link.h
>> @@ -773,6 +773,7 @@ enum {
>>  	IFLA_HSR_SLAVE1,
>>  	IFLA_HSR_SLAVE2,
>>  	IFLA_HSR_MULTICAST_SPEC,	/* Last byte of supervision addr */
>> +	IFLA_HSR_VERSION,		/* HSR version */
>>  	IFLA_HSR_SUPERVISION_ADDR,	/* Supervision frame multicast addr */
>>  	IFLA_HSR_SEQ_NR,
> 
> You added a new value into the middle of an enumeration field.
> This breaks kernel ABI. Older applications (like iproute) would see 
> the wrong values.
> 
> Please submit a new change which moves HSR_VERSION to the end of the 
> enum

Good catch, Stephen.

This mail has originated outside your organization, either from an external partner or the Global Internet.
Keep this in mind if you answer this message.

The information in this e-mail is confidential. The contents may not be disclosed or used by anyone other than the addressee. Access to this e-mail by anyone else is unauthorised.
If you are not the intended recipient, please notify Airbus immediately and delete this e-mail.
Airbus cannot accept any responsibility for the accuracy or completeness of this e-mail as it has been sent over public networks. If you have any concerns over the content of this message or its Accuracy or Integrity, please contact Airbus immediately.
All outgoing e-mails from Airbus are checked using regularly updated virus scanning software but you should take whatever measures you deem to be appropriate to ensure that this message and any attachments are virus free.

^ permalink raw reply

* [PATCH net-next] net/hsr: Fixed version field in ENUM
From: Peter Heise @ 2016-04-20  7:08 UTC (permalink / raw)
  To: netdev, davem, stephen; +Cc: peter.heise

New field (IFLA_HSR_VERSION) was added in the middle of an existing
ENUM and would break kernel ABI, therefore moved to the end.
Reported by Stephen Hemminger.

Signed-off-by: Peter Heise <peter.heise@airbus.com>
---
 include/uapi/linux/if_link.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h
index 5ffdcb3..af8fd58 100644
--- a/include/uapi/linux/if_link.h
+++ b/include/uapi/linux/if_link.h
@@ -774,9 +774,9 @@ enum {
 	IFLA_HSR_SLAVE1,
 	IFLA_HSR_SLAVE2,
 	IFLA_HSR_MULTICAST_SPEC,	/* Last byte of supervision addr */
-	IFLA_HSR_VERSION,		/* HSR version */
 	IFLA_HSR_SUPERVISION_ADDR,	/* Supervision frame multicast addr */
 	IFLA_HSR_SEQ_NR,
+	IFLA_HSR_VERSION,		/* HSR version */
 	__IFLA_HSR_MAX,
 };
 
-- 
2.5.0

^ permalink raw reply related

* [RFC PATCH v3 net-next 2/3] tcp: Handle eor bit when coalescing skb
From: Martin KaFai Lau @ 2016-04-20  6:24 UTC (permalink / raw)
  To: netdev
  Cc: Eric Dumazet, Neal Cardwell, Soheil Hassas Yeganeh,
	Willem de Bruijn, Yuchung Cheng, Kernel Team
In-Reply-To: <1461133497-1515104-1-git-send-email-kafai@fb.com>

This patch:
1. Prevent next_skb from coalescing to the prev_skb if
   TCP_SKB_CB(prev_skb)->eor is set
2. Update the TCP_SKB_CB(prev_skb)->eor if coalescing is
   allowed

Signed-off-by: Martin KaFai Lau <kafai@fb.com>
Cc: Eric Dumazet <edumazet@google.com>
Cc: Neal Cardwell <ncardwell@google.com>
Cc: Soheil Hassas Yeganeh <soheil@google.com>
Cc: Willem de Bruijn <willemb@google.com>
Cc: Yuchung Cheng <ycheng@google.com>
---
 net/ipv4/tcp_input.c  | 4 ++++
 net/ipv4/tcp_output.c | 4 ++++
 2 files changed, 8 insertions(+)

diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 75e8336..68c55e5 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -1303,6 +1303,7 @@ static bool tcp_shifted_skb(struct sock *sk, struct sk_buff *skb,
 	}
 
 	TCP_SKB_CB(prev)->tcp_flags |= TCP_SKB_CB(skb)->tcp_flags;
+	TCP_SKB_CB(prev)->eor = TCP_SKB_CB(skb)->eor;
 	if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)
 		TCP_SKB_CB(prev)->end_seq++;
 
@@ -1368,6 +1369,9 @@ static struct sk_buff *tcp_shift_skb_data(struct sock *sk, struct sk_buff *skb,
 	if ((TCP_SKB_CB(prev)->sacked & TCPCB_TAGBITS) != TCPCB_SACKED_ACKED)
 		goto fallback;
 
+	if (TCP_SKB_CB(prev)->eor)
+		goto fallback;
+
 	in_sack = !after(start_seq, TCP_SKB_CB(skb)->seq) &&
 		  !before(end_seq, TCP_SKB_CB(skb)->end_seq);
 
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index a6e4a83..96bdf98 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -2494,6 +2494,7 @@ static void tcp_collapse_retrans(struct sock *sk, struct sk_buff *skb)
 	 * packet counting does not break.
 	 */
 	TCP_SKB_CB(skb)->sacked |= TCP_SKB_CB(next_skb)->sacked & TCPCB_EVER_RETRANS;
+	TCP_SKB_CB(skb)->eor = TCP_SKB_CB(next_skb)->eor;
 
 	/* changed transmit queue under us so clear hints */
 	tcp_clear_retrans_hints_partial(tp);
@@ -2545,6 +2546,9 @@ static void tcp_retrans_try_collapse(struct sock *sk, struct sk_buff *to,
 		if (!tcp_can_collapse(sk, skb))
 			break;
 
+		if (TCP_SKB_CB(to)->eor)
+			break;
+
 		space -= skb->len;
 
 		if (first) {
-- 
2.5.1

^ permalink raw reply related

* [RFC PATCH v3 net-next 0/3] tcp: Make use of MSG_EOR in tcp_sendmsg
From: Martin KaFai Lau @ 2016-04-20  6:24 UTC (permalink / raw)
  To: netdev
  Cc: Eric Dumazet, Neal Cardwell, Soheil Hassas Yeganeh,
	Willem de Bruijn, Yuchung Cheng, Kernel Team

v3:
~ Separate EOR marking from the SKBTX_ANY_TSTAMP logic.
~ Move the eor bit test back to the loop in tcp_sendmsg and
  tcp_sendpage because there could be >1 threads doing
  sendmsg.
~ Thanks to Eric Dumazet's suggestions on v2.
~ The TCP timestamp bug fixes are separated into other threads.

v2:
~ Rework based on the recent work
  "add TX timestamping via cmsg" by
  Soheil Hassas Yeganeh <soheil.kdev@gmail.com>
~ This version takes the MSG_EOR bit as a signal of
  end-of-response-message and leaves the selective
  timestamping job to the cmsg
~ Changes based on the v1 feedback (like avoid
  unlikely check in a loop and adding tcp_sendpage
  support)
~ The first 3 patches are bug fixes.  The fixes in this
  series depend on the newly introduced txstamp_ack in
  net-next.  I will make relevant patches against net after
  getting some feedback.
~ The test results are based on the recently posted net fix:
  "tcp: Fix SOF_TIMESTAMPING_TX_ACK when handling dup acks"
~ Due to the lack of cmsg support in packetdrill (or maybe
  I just could not find it), a BPF prog is used to kprobe
  to sock_queue_err_skb() and print out the value of
  serr->ee.ee_data.  The BPF prog (run-able from bcc) is
  attached at the end.

One potential use case is to use MSG_EOR with
SOF_TIMESTAMPING_TX_ACK to get a more accurate
TCP ack timestamping on application protocol with
multiple outgoing response messages (e.g. HTTP2).

One of our use cases is at the webserver.  The webserver tracks
the HTTP2 response latency by measuring when the webserver sends
the first byte to the socket till the TCP ACK of the last byte
is received.  In the cases where we don't have client side
measurement, measuring from the server side is the only option.
In the cases we have the client side measurement, the server side
data can also be used to justify/cross-check-with the client
side data.

^ permalink raw reply

* [RFC PATCH v3 net-next 1/3] tcp: Make use of MSG_EOR in tcp_sendmsg and tcp_sendpage
From: Martin KaFai Lau @ 2016-04-20  6:24 UTC (permalink / raw)
  To: netdev
  Cc: Eric Dumazet, Neal Cardwell, Soheil Hassas Yeganeh,
	Willem de Bruijn, Yuchung Cheng, Kernel Team
In-Reply-To: <1461133497-1515104-1-git-send-email-kafai@fb.com>

This patch adds an eor bit to the TCP_SKB_CB.  When MSG_EOR
is passed to tcp_sendmsg/tcp_sendpage, the eor bit will
be set at the skb containing the last byte of the userland's
msg.  The eor bit will prevent data from appending to that
skb in the future.

This patch handles the tcp_sendmsg and tcp_sendpage cases.

The followup patches will handle other skb coalescing
and fragment cases.

One potential use case is to use MSG_EOR with
SOF_TIMESTAMPING_TX_ACK to get a more accurate
TCP ack timestamping on application protocol with
multiple outgoing response messages (e.g. HTTP2).

Signed-off-by: Martin KaFai Lau <kafai@fb.com>
Cc: Eric Dumazet <edumazet@google.com>
Cc: Neal Cardwell <ncardwell@google.com>
Cc: Soheil Hassas Yeganeh <soheil@google.com>
Cc: Willem de Bruijn <willemb@google.com>
Cc: Yuchung Cheng <ycheng@google.com>
Suggested-by: Eric Dumazet <edumazet@google.com>
---
 include/net/tcp.h | 3 ++-
 net/ipv4/tcp.c    | 7 +++++--
 2 files changed, 7 insertions(+), 3 deletions(-)

diff --git a/include/net/tcp.h b/include/net/tcp.h
index c0ef054..ac31798 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -762,7 +762,8 @@ struct tcp_skb_cb {
 
 	__u8		ip_dsfield;	/* IPv4 tos or IPv6 dsfield	*/
 	__u8		txstamp_ack:1,	/* Record TX timestamp for ack? */
-			unused:7;
+			eor:1,		/* Is skb MSG_EOR marked */
+			unused:6;
 	__u32		ack_seq;	/* Sequence number ACK'd	*/
 	union {
 		struct inet_skb_parm	h4;
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 4d73858..7df0c1a88 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -908,7 +908,8 @@ static ssize_t do_tcp_sendpages(struct sock *sk, struct page *page, int offset,
 		int copy, i;
 		bool can_coalesce;
 
-		if (!tcp_send_head(sk) || (copy = size_goal - skb->len) <= 0) {
+		if (!tcp_send_head(sk) || (copy = size_goal - skb->len) <= 0 ||
+		    TCP_SKB_CB(skb)->eor) {
 new_segment:
 			if (!sk_stream_memory_free(sk))
 				goto wait_for_sndbuf;
@@ -960,6 +961,7 @@ new_segment:
 		size -= copy;
 		if (!size) {
 			tcp_tx_timestamp(sk, sk->sk_tsflags, skb);
+			TCP_SKB_CB(skb)->eor = !!(flags & MSG_EOR);
 			goto out;
 		}
 
@@ -1156,7 +1158,7 @@ int tcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t size)
 			copy = max - skb->len;
 		}
 
-		if (copy <= 0) {
+		if (copy <= 0 || TCP_SKB_CB(skb)->eor) {
 new_segment:
 			/* Allocate new segment. If the interface is SG,
 			 * allocate skb fitting to single page.
@@ -1250,6 +1252,7 @@ new_segment:
 		copied += copy;
 		if (!msg_data_left(msg)) {
 			tcp_tx_timestamp(sk, sockc.tsflags, skb);
+			TCP_SKB_CB(skb)->eor = !!(flags & MSG_EOR);
 			goto out;
 		}
 
-- 
2.5.1

^ permalink raw reply related

* [RFC PATCH v3 net-next 3/3] tcp: Handle eor bit when fragmenting a skb
From: Martin KaFai Lau @ 2016-04-20  6:24 UTC (permalink / raw)
  To: netdev
  Cc: Eric Dumazet, Neal Cardwell, Soheil Hassas Yeganeh,
	Willem de Bruijn, Yuchung Cheng, Kernel Team
In-Reply-To: <1461133497-1515104-1-git-send-email-kafai@fb.com>

When fragmenting a skb, the next_skb should carry
the eor from prev_skb.  The eor of prev_skb should
also be reset.

Signed-off-by: Martin KaFai Lau <kafai@fb.com>
Cc: Eric Dumazet <edumazet@google.com>
Cc: Neal Cardwell <ncardwell@google.com>
Cc: Soheil Hassas Yeganeh <soheil@google.com>
Cc: Willem de Bruijn <willemb@google.com>
Cc: Yuchung Cheng <ycheng@google.com>
---
 net/ipv4/tcp_output.c | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 96bdf98..95f419b 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -1128,6 +1128,12 @@ static void tcp_fragment_tstamp(struct sk_buff *skb, struct sk_buff *skb2)
 	}
 }
 
+static void tcp_skb_fragment_eor(struct sk_buff *skb, struct sk_buff *skb2)
+{
+	TCP_SKB_CB(skb2)->eor = TCP_SKB_CB(skb)->eor;
+	TCP_SKB_CB(skb)->eor = 0;
+}
+
 /* Function to create two new TCP segments.  Shrinks the given segment
  * to the specified size and appends a new segment with the rest of the
  * packet to the list.  This won't be called frequently, I hope.
@@ -1173,6 +1179,7 @@ int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len,
 	TCP_SKB_CB(skb)->tcp_flags = flags & ~(TCPHDR_FIN | TCPHDR_PSH);
 	TCP_SKB_CB(buff)->tcp_flags = flags;
 	TCP_SKB_CB(buff)->sacked = TCP_SKB_CB(skb)->sacked;
+	tcp_skb_fragment_eor(skb, buff);
 
 	if (!skb_shinfo(skb)->nr_frags && skb->ip_summed != CHECKSUM_PARTIAL) {
 		/* Copy and checksum data tail into the new buffer. */
@@ -1733,6 +1740,8 @@ static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len,
 	/* This packet was never sent out yet, so no SACK bits. */
 	TCP_SKB_CB(buff)->sacked = 0;
 
+	tcp_skb_fragment_eor(skb, buff);
+
 	buff->ip_summed = skb->ip_summed = CHECKSUM_PARTIAL;
 	skb_split(skb, buff, len);
 	tcp_fragment_tstamp(skb, buff);
-- 
2.5.1

^ permalink raw reply related

* Re: [Intel-wired-lan] [PATCH net-next V5 2/2] intel: ixgbevf: Support Windows hosts (Hyper-V)
From: Jeff Kirsher @ 2016-04-20  6:24 UTC (permalink / raw)
  To: K. Y. Srinivasan, davem, netdev, linux-kernel, devel, olaf, apw,
	jasowang, eli, jackm, yevgenyp, john.ronciak, intel-wired-lan,
	alexander.duyck
In-Reply-To: <1461118677-28142-2-git-send-email-kys@microsoft.com>

[-- Attachment #1: Type: text/plain, Size: 2169 bytes --]

On Tue, 2016-04-19 at 19:17 -0700, K. Y. Srinivasan wrote:
> On Hyper-V, the VF/PF communication is a via software mediated path
> as opposed to the hardware mailbox. Make the necessary
> adjustments to support Hyper-V.
> 
> Signed-off-by: K. Y. Srinivasan <kys@microsoft.com>
> ---
>         V2: Addressed most of the comments from
>             Alexander Duyck <alexander.duyck@gmail.com>
>             and Rustad, Mark D <mark.d.rustad@intel.com>.
> 
>         V3: Addressed additional comments from
>             Alexander Duyck <alexander.duyck@gmail.com>
> 
>         V4: Addressed kbuild errors reported by:
>             kbuild test robot <lkp@intel.com>
> 
>         V5: Addressed additional comments from
>             Alexander Duyck <alexander.duyck@gmail.com>

First I commend you on actually making a proper changelog for a patch.
 The only issue I have is that the changelog provides no actual
changes.  Saying you reacted to comments does not summarize what the
actual changes were.  I purposely did not review the earlier versions
because by the time I went to do a review, there was another version
already submitted.  So your changelog requires that users look at all
the previous emails to actually see what Alex and Mark requested in
changes.

I am telling you this for future patches, it will not impact these
patches, just really disappointing.  I will take the extra time to
track down all the previous emails and the changes requested to ensure
you made all the changes that were requested of you. :-(  This will
just delay when I apply the patches for testing.

> 
>  drivers/net/ethernet/intel/ixgbevf/ixgbevf.h      |   12 ++
>  drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c |   31 +++-
>  drivers/net/ethernet/intel/ixgbevf/mbx.c          |   12 ++
>  drivers/net/ethernet/intel/ixgbevf/vf.c           |  216
> +++++++++++++++++++++
>  drivers/net/ethernet/intel/ixgbevf/vf.h           |    2 +
>  5 files changed, 266 insertions(+), 7 deletions(-)

[-- Attachment #2: This is a digitally signed message part --]
[-- Type: application/pgp-signature, Size: 819 bytes --]

^ permalink raw reply

* [PATCH net-next 1/2] tcp: Carry txstamp_ack in tcp_fragment_tstamp
From: Martin KaFai Lau @ 2016-04-20  5:50 UTC (permalink / raw)
  To: netdev
  Cc: Eric Dumazet, Neal Cardwell, Soheil Hassas Yeganeh,
	Willem de Bruijn, Yuchung Cheng, Kernel Team
In-Reply-To: <1461131448-1460418-1-git-send-email-kafai@fb.com>

When a tcp skb is sliced into two smaller skbs (e.g. in
tcp_fragment() and tso_fragment()),  it does not carry
the txstamp_ack bit to the newly created skb if it is needed.
The end result is that a timestamping event (SCM_TSTAMP_ACK) will
be missing from the sk->sk_error_queue.

This patch carries this bit to the new skb2
in tcp_fragment_tstamp().

BPF Output Before:
~~~~~~
<No output due to missing SCM_TSTAMP_ACK timestamp>

BPF Output After:
~~~~~~
<...>-2050  [000] d.s.   100.928763: : ee_data:14599

Packetdrill Script:
~~~~~~
+0 `sysctl -q -w net.ipv4.tcp_min_tso_segs=10`
+0 `sysctl -q -w net.ipv4.tcp_no_metrics_save=1`
+0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3
+0 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0
+0 bind(3, ..., ...) = 0
+0 listen(3, 1) = 0

0.100 < S 0:0(0) win 32792 <mss 1460,sackOK,nop,nop,nop,wscale 7>
0.100 > S. 0:0(0) ack 1 <mss 1460,nop,nop,sackOK,nop,wscale 7>
0.200 < . 1:1(0) ack 1 win 257
0.200 accept(3, ..., ...) = 4
+0 setsockopt(4, SOL_TCP, TCP_NODELAY, [1], 4) = 0

+0 setsockopt(4, SOL_SOCKET, 37, [2688], 4) = 0
0.200 write(4, ..., 14600) = 14600
+0 setsockopt(4, SOL_SOCKET, 37, [2176], 4) = 0

0.200 > . 1:7301(7300) ack 1
0.200 > P. 7301:14601(7300) ack 1

0.300 < . 1:1(0) ack 14601 win 257

0.300 close(4) = 0
0.300 > F. 14601:14601(0) ack 1
0.400 < F. 1:1(0) ack 16062 win 257
0.400 > . 14602:14602(0) ack 2

Signed-off-by: Martin KaFai Lau <kafai@fb.com>
Cc: Eric Dumazet <edumazet@google.com>
Cc: Neal Cardwell <ncardwell@google.com>
Cc: Soheil Hassas Yeganeh <soheil@google.com>
Cc: Willem de Bruijn <willemb@google.com>
Cc: Yuchung Cheng <ycheng@google.com>
Acked-by: Soheil Hassas Yeganeh <soheil@google.com>
---
 net/ipv4/tcp_output.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 96182a2..f7c3bc0 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -1123,6 +1123,8 @@ static void tcp_fragment_tstamp(struct sk_buff *skb, struct sk_buff *skb2)
 		shinfo->tx_flags &= ~tsflags;
 		shinfo2->tx_flags |= tsflags;
 		swap(shinfo->tskey, shinfo2->tskey);
+		TCP_SKB_CB(skb2)->txstamp_ack = TCP_SKB_CB(skb)->txstamp_ack;
+		TCP_SKB_CB(skb)->txstamp_ack = 0;
 	}
 }
 
-- 
2.5.1

^ permalink raw reply related

* [PATCH net-next 2/2] tcp: Merge txstamp_ack in tcp_skb_collapse_tstamp
From: Martin KaFai Lau @ 2016-04-20  5:50 UTC (permalink / raw)
  To: netdev
  Cc: Eric Dumazet, Neal Cardwell, Soheil Hassas Yeganeh,
	Willem de Bruijn, Yuchung Cheng, Kernel Team
In-Reply-To: <1461131448-1460418-1-git-send-email-kafai@fb.com>

When collapsing skbs, txstamp_ack also needs to be merged.

Retrans Collapse Test:
~~~~~~
0.200 accept(3, ..., ...) = 4
+0 setsockopt(4, SOL_TCP, TCP_NODELAY, [1], 4) = 0

0.200 write(4, ..., 730) = 730
+0 setsockopt(4, SOL_SOCKET, 37, [2688], 4) = 0
0.200 write(4, ..., 730) = 730
+0 setsockopt(4, SOL_SOCKET, 37, [2176], 4) = 0
0.200 write(4, ..., 11680) = 11680

0.200 > P. 1:731(730) ack 1
0.200 > P. 731:1461(730) ack 1
0.200 > . 1461:8761(7300) ack 1
0.200 > P. 8761:13141(4380) ack 1

0.300 < . 1:1(0) ack 1 win 257 <sack 1461:2921,nop,nop>
0.300 < . 1:1(0) ack 1 win 257 <sack 1461:4381,nop,nop>
0.300 < . 1:1(0) ack 1 win 257 <sack 1461:5841,nop,nop>
0.300 > P. 1:1461(1460) ack 1
0.400 < . 1:1(0) ack 13141 win 257

BPF Output Before:
~~~~~
<No output due to missing SCM_TSTAMP_ACK timestamp>

BPF Output After:
~~~~~
<...>-2027  [007] d.s.    79.765921: : ee_data:1459

Sacks Collapse Test:
~~~~~
0.200 accept(3, ..., ...) = 4
+0 setsockopt(4, SOL_TCP, TCP_NODELAY, [1], 4) = 0

0.200 write(4, ..., 1460) = 1460
+0 setsockopt(4, SOL_SOCKET, 37, [2688], 4) = 0
0.200 write(4, ..., 13140) = 13140
+0 setsockopt(4, SOL_SOCKET, 37, [2176], 4) = 0

0.200 > P. 1:1461(1460) ack 1
0.200 > . 1461:8761(7300) ack 1
0.200 > P. 8761:14601(5840) ack 1

0.300 < . 1:1(0) ack 1 win 257 <sack 1461:14601,nop,nop>
0.300 > P. 1:1461(1460) ack 1
0.400 < . 1:1(0) ack 14601 win 257

BPF Output Before:
~~~~~
<No output due to missing SCM_TSTAMP_ACK timestamp>

BPF Output After:
~~~~~
<...>-2049  [007] d.s.    89.185538: : ee_data:14599

Signed-off-by: Martin KaFai Lau <kafai@fb.com>
Cc: Eric Dumazet <edumazet@google.com>
Cc: Neal Cardwell <ncardwell@google.com>
Cc: Soheil Hassas Yeganeh <soheil@google.com>
Cc: Willem de Bruijn <willemb@google.com>
Cc: Yuchung Cheng <ycheng@google.com>
---
 net/ipv4/tcp_output.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index f7c3bc0..a6e4a83 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -2454,6 +2454,8 @@ void tcp_skb_collapse_tstamp(struct sk_buff *skb,
 
 		shinfo->tx_flags |= tsflags;
 		shinfo->tskey = next_shinfo->tskey;
+		TCP_SKB_CB(skb)->txstamp_ack |=
+			TCP_SKB_CB(next_skb)->txstamp_ack;
 	}
 }
 
-- 
2.5.1

^ permalink raw reply related

* [PATCH net-next 0/2] tcp: Handle txstamp_ack when fragmenting/coalescing skbs
From: Martin KaFai Lau @ 2016-04-20  5:50 UTC (permalink / raw)
  To: netdev
  Cc: Eric Dumazet, Neal Cardwell, Soheil Hassas Yeganeh,
	Willem de Bruijn, Yuchung Cheng, Kernel Team

This patchset is to handle the txstamp-ack bit when
fragmenting/coalescing skbs.

The second patch depends on the recently posted series
for the net branch:
"tcp: Merge timestamp info when coalescing skbs"

A BPF prog is used to kprobe to sock_queue_err_skb()
and print out the value of serr->ee.ee_data.  The BPF
prog (run-able from bcc) is attached here:

BPF prog used for testing:
~~~~~
#!/usr/bin/env python

from __future__ import print_function
from bcc import BPF

bpf_text = """
#include <uapi/linux/ptrace.h>
#include <net/sock.h>
#include <bcc/proto.h>
#include <linux/errqueue.h>

#ifdef memset
#undef memset
#endif

int trace_err_skb(struct pt_regs *ctx)
{
	struct sk_buff *skb = (struct sk_buff *)ctx->si;
	struct sock *sk = (struct sock *)ctx->di;
	struct sock_exterr_skb *serr;
	u32 ee_data = 0;

	if (!sk || !skb)
		return 0;

	serr = SKB_EXT_ERR(skb);
	bpf_probe_read(&ee_data, sizeof(ee_data), &serr->ee.ee_data);
	bpf_trace_printk("ee_data:%u\\n", ee_data);

	return 0;
};
"""

b = BPF(text=bpf_text)
b.attach_kprobe(event="sock_queue_err_skb", fn_name="trace_err_skb")
print("Attached to kprobe")
b.trace_print()

^ permalink raw reply

* [PATCH net 1/2] tcp: Merge tx_flags and tskey in tcp_collapse_retrans
From: Martin KaFai Lau @ 2016-04-20  5:39 UTC (permalink / raw)
  To: netdev
  Cc: Eric Dumazet, Neal Cardwell, Soheil Hassas Yeganeh,
	Willem de Bruijn, Yuchung Cheng, Kernel Team
In-Reply-To: <1461130769-1442865-1-git-send-email-kafai@fb.com>

If two skbs are merged/collapsed during retransmission, the current
logic does not merge the tx_flags and tskey.  The end result is
the SCM_TSTAMP_ACK timestamp could be missing for a packet.

The patch:
1. Merge the tx_flags
2. Overwrite the prev_skb's tskey with the next_skb's tskey

BPF Output Before:
~~~~~~
<no-output-due-to-missing-tstamp-event>

BPF Output After:
~~~~~~
packetdrill-2092  [001] d.s.   453.998486: : ee_data:1459

Packetdrill Script:
~~~~~~
+0 `sysctl -q -w net.ipv4.tcp_min_tso_segs=10`
+0 `sysctl -q -w net.ipv4.tcp_no_metrics_save=1`
+0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3
+0 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0
+0 bind(3, ..., ...) = 0
+0 listen(3, 1) = 0

0.100 < S 0:0(0) win 32792 <mss 1460,sackOK,nop,nop,nop,wscale 7>
0.100 > S. 0:0(0) ack 1 <mss 1460,nop,nop,sackOK,nop,wscale 7>
0.200 < . 1:1(0) ack 1 win 257
0.200 accept(3, ..., ...) = 4
+0 setsockopt(4, SOL_TCP, TCP_NODELAY, [1], 4) = 0

0.200 write(4, ..., 730) = 730
+0 setsockopt(4, SOL_SOCKET, 37, [2688], 4) = 0
0.200 write(4, ..., 730) = 730
+0 setsockopt(4, SOL_SOCKET, 37, [2176], 4) = 0
0.200 write(4, ..., 11680) = 11680
+0 setsockopt(4, SOL_SOCKET, 37, [2688], 4) = 0

0.200 > P. 1:731(730) ack 1
0.200 > P. 731:1461(730) ack 1
0.200 > . 1461:8761(7300) ack 1
0.200 > P. 8761:13141(4380) ack 1

0.300 < . 1:1(0) ack 1 win 257 <sack 1461:2921,nop,nop>
0.300 < . 1:1(0) ack 1 win 257 <sack 1461:4381,nop,nop>
0.300 < . 1:1(0) ack 1 win 257 <sack 1461:5841,nop,nop>
0.300 > P. 1:1461(1460) ack 1
0.400 < . 1:1(0) ack 13141 win 257

0.400 close(4) = 0
0.400 > F. 13141:13141(0) ack 1
0.500 < F. 1:1(0) ack 13142 win 257
0.500 > . 13142:13142(0) ack 2

Signed-off-by: Martin KaFai Lau <kafai@fb.com>
Cc: Eric Dumazet <edumazet@google.com>
Cc: Neal Cardwell <ncardwell@google.com>
Cc: Soheil Hassas Yeganeh <soheil@google.com>
Cc: Willem de Bruijn <willemb@google.com>
Cc: Yuchung Cheng <ycheng@google.com>
---
 net/ipv4/tcp_output.c | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)

diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 7d2dc01..5bc3c30 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -2441,6 +2441,20 @@ u32 __tcp_select_window(struct sock *sk)
 	return window;
 }
 
+static void tcp_skb_collapse_tstamp(struct sk_buff *skb,
+				    const struct sk_buff *next_skb)
+{
+	const struct skb_shared_info *next_shinfo = skb_shinfo(next_skb);
+	u8 tsflags = next_shinfo->tx_flags & SKBTX_ANY_TSTAMP;
+
+	if (unlikely(tsflags)) {
+		struct skb_shared_info *shinfo = skb_shinfo(skb);
+
+		shinfo->tx_flags |= tsflags;
+		shinfo->tskey = next_shinfo->tskey;
+	}
+}
+
 /* Collapses two adjacent SKB's during retransmission. */
 static void tcp_collapse_retrans(struct sock *sk, struct sk_buff *skb)
 {
@@ -2484,6 +2498,8 @@ static void tcp_collapse_retrans(struct sock *sk, struct sk_buff *skb)
 
 	tcp_adjust_pcount(sk, next_skb, tcp_skb_pcount(next_skb));
 
+	tcp_skb_collapse_tstamp(skb, next_skb);
+
 	sk_wmem_free_skb(sk, next_skb);
 }
 
-- 
2.5.1

^ permalink raw reply related

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox