Netdev List
 help / color / mirror / Atom feed
* Re: [PATCH net-next] selftests/net: fix bugs in cfg_port initialization
From: Sowmini Varadhan @ 2017-12-25 22:17 UTC (permalink / raw)
  To: Willem de Bruijn; +Cc: Willem de Bruijn, Network Development, David Miller
In-Reply-To: <CAF=yD-KvPZxHHGtDaoMJhf=qEUkYzae3mbwYp4ZF6uSOHX9MWQ@mail.gmail.com>

On (12/25/17 15:49), Willem de Bruijn wrote:
> 
> It would be good to also address the other instance of this at the
> same time: protocol family (-4 or -6) has to be set before a call to
> setup_sockaddr. Unlike this case, it hits an error() if called in the
> "incorrect" order, but that is still a crutch.

Yeah, I thought of that one too, but "usually" (at least
that's what is instinctive to me) you bind to INADDR_ANY
or ::, so this only becomes an issue on the client side. 
(And there, most people put the -4 or -6 before the address,
but I agree it would be nice to fix this..)

> But even better will be to save cfg_port, and src + dst addr optargs
> as local variables, then call the init function only after parsing when
> all state is available. Where this patch adds calls to
> init_sockaddr_port, indeed.

sure, I can put that out on V2 later this week.

^ permalink raw reply

* [PATCH bpf-next 1/3] libbpf: add function to setup XDP
From: Eric Leblond @ 2017-12-25 22:13 UTC (permalink / raw)
  To: netdev, daniel; +Cc: linux-kernel, ast, Eric Leblond
In-Reply-To: <20171225221325.9680-1-eric@regit.org>

Most of the code is taken from set_link_xdp_fd() in bpf_load.c and
slightly modified to be library compliant.

Signed-off-by: Eric Leblond <eric@regit.org>
---
 tools/lib/bpf/bpf.c    | 126 ++++++++++++++++++++++++++++++++++++++++++++++++-
 tools/lib/bpf/libbpf.c |   2 +
 tools/lib/bpf/libbpf.h |   4 ++
 3 files changed, 130 insertions(+), 2 deletions(-)

diff --git a/tools/lib/bpf/bpf.c b/tools/lib/bpf/bpf.c
index 5128677e4117..1e3cfe6b9fce 100644
--- a/tools/lib/bpf/bpf.c
+++ b/tools/lib/bpf/bpf.c
@@ -25,6 +25,16 @@
 #include <asm/unistd.h>
 #include <linux/bpf.h>
 #include "bpf.h"
+#include "libbpf.h"
+#include <linux/rtnetlink.h>
+#include <sys/socket.h>
+#include <errno.h>
+
+#ifndef IFLA_XDP_MAX
+#define IFLA_XDP	43
+#define IFLA_XDP_FD	1
+#define IFLA_XDP_FLAGS	3
+#endif
 
 /*
  * When building perf, unistd.h is overridden. __NR_bpf is
@@ -46,8 +56,6 @@
 # endif
 #endif
 
-#define min(x, y) ((x) < (y) ? (x) : (y))
-
 static inline __u64 ptr_to_u64(const void *ptr)
 {
 	return (__u64) (unsigned long) ptr;
@@ -413,3 +421,117 @@ int bpf_obj_get_info_by_fd(int prog_fd, void *info, __u32 *info_len)
 
 	return err;
 }
+
+int bpf_set_link_xdp_fd(int ifindex, int fd, __u32 flags)
+{
+	struct sockaddr_nl sa;
+	int sock, seq = 0, len, ret = -1;
+	char buf[4096];
+	struct nlattr *nla, *nla_xdp;
+	struct {
+		struct nlmsghdr  nh;
+		struct ifinfomsg ifinfo;
+		char             attrbuf[64];
+	} req;
+	struct nlmsghdr *nh;
+	struct nlmsgerr *err;
+	socklen_t addrlen;
+
+	memset(&sa, 0, sizeof(sa));
+	sa.nl_family = AF_NETLINK;
+
+	sock = socket(AF_NETLINK, SOCK_RAW, NETLINK_ROUTE);
+	if (sock < 0) {
+		return -errno;
+	}
+
+	if (bind(sock, (struct sockaddr *)&sa, sizeof(sa)) < 0) {
+		ret = -errno;
+		goto cleanup;
+	}
+
+	addrlen = sizeof(sa);
+	if (getsockname(sock, (struct sockaddr *)&sa, &addrlen) < 0) {
+		ret = errno;
+		goto cleanup;
+	}
+
+	if (addrlen != sizeof(sa)) {
+		ret = errno;
+		goto cleanup;
+	}
+
+	memset(&req, 0, sizeof(req));
+	req.nh.nlmsg_len = NLMSG_LENGTH(sizeof(struct ifinfomsg));
+	req.nh.nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK;
+	req.nh.nlmsg_type = RTM_SETLINK;
+	req.nh.nlmsg_pid = 0;
+	req.nh.nlmsg_seq = ++seq;
+	req.ifinfo.ifi_family = AF_UNSPEC;
+	req.ifinfo.ifi_index = ifindex;
+
+	/* started nested attribute for XDP */
+	nla = (struct nlattr *)(((char *)&req)
+				+ NLMSG_ALIGN(req.nh.nlmsg_len));
+	nla->nla_type = NLA_F_NESTED | IFLA_XDP;
+	nla->nla_len = NLA_HDRLEN;
+
+	/* add XDP fd */
+	nla_xdp = (struct nlattr *)((char *)nla + nla->nla_len);
+	nla_xdp->nla_type = IFLA_XDP_FD;
+	nla_xdp->nla_len = NLA_HDRLEN + sizeof(int);
+	memcpy((char *)nla_xdp + NLA_HDRLEN, &fd, sizeof(fd));
+	nla->nla_len += nla_xdp->nla_len;
+
+	/* if user passed in any flags, add those too */
+	if (flags) {
+		nla_xdp = (struct nlattr *)((char *)nla + nla->nla_len);
+		nla_xdp->nla_type = IFLA_XDP_FLAGS;
+		nla_xdp->nla_len = NLA_HDRLEN + sizeof(flags);
+		memcpy((char *)nla_xdp + NLA_HDRLEN, &flags, sizeof(flags));
+		nla->nla_len += nla_xdp->nla_len;
+	}
+
+	req.nh.nlmsg_len += NLA_ALIGN(nla->nla_len);
+
+	if (send(sock, &req, req.nh.nlmsg_len, 0) < 0) {
+		ret = -errno;
+		goto cleanup;
+	}
+
+	len = recv(sock, buf, sizeof(buf), 0);
+	if (len < 0) {
+		ret = -errno;
+		goto cleanup;
+	}
+
+	for (nh = (struct nlmsghdr *)buf; NLMSG_OK(nh, len);
+	     nh = NLMSG_NEXT(nh, len)) {
+		if (nh->nlmsg_pid != sa.nl_pid) {
+			ret = -LIBBPF_ERRNO__WRNGPID;
+			goto cleanup;
+		}
+		if (nh->nlmsg_seq != seq) {
+			ret = -LIBBPF_ERRNO__INVSEQ;
+			goto cleanup;
+		}
+		switch (nh->nlmsg_type) {
+		case NLMSG_ERROR:
+			err = (struct nlmsgerr *)NLMSG_DATA(nh);
+			if (!err->error)
+				continue;
+			ret = err->error;
+			goto cleanup;
+		case NLMSG_DONE:
+			break;
+		default:
+			break;
+		}
+	}
+
+	ret = 0;
+
+cleanup:
+	close(sock);
+	return ret;
+}
diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c
index e9c4b7cabcf2..5fe8aaa2123e 100644
--- a/tools/lib/bpf/libbpf.c
+++ b/tools/lib/bpf/libbpf.c
@@ -106,6 +106,8 @@ static const char *libbpf_strerror_table[NR_ERRNO] = {
 	[ERRCODE_OFFSET(PROG2BIG)]	= "Program too big",
 	[ERRCODE_OFFSET(KVER)]		= "Incorrect kernel version",
 	[ERRCODE_OFFSET(PROGTYPE)]	= "Kernel doesn't support this program type",
+	[ERRCODE_OFFSET(WRNGPID)]	= "Wrong pid in netlink message",
+	[ERRCODE_OFFSET(INVSEQ)]	= "Invalid netlink sequence",
 };
 
 int libbpf_strerror(int err, char *buf, size_t size)
diff --git a/tools/lib/bpf/libbpf.h b/tools/lib/bpf/libbpf.h
index 6e20003109e0..e42f96900318 100644
--- a/tools/lib/bpf/libbpf.h
+++ b/tools/lib/bpf/libbpf.h
@@ -42,6 +42,8 @@ enum libbpf_errno {
 	LIBBPF_ERRNO__PROG2BIG,	/* Program too big */
 	LIBBPF_ERRNO__KVER,	/* Incorrect kernel version */
 	LIBBPF_ERRNO__PROGTYPE,	/* Kernel doesn't support this program type */
+	LIBBPF_ERRNO__WRNGPID,	/* Wrong pid in netlink message */
+	LIBBPF_ERRNO__INVSEQ,	/* Invalid netlink sequence */
 	__LIBBPF_ERRNO__END,
 };
 
@@ -246,4 +248,6 @@ long libbpf_get_error(const void *ptr);
 
 int bpf_prog_load(const char *file, enum bpf_prog_type type,
 		  struct bpf_object **pobj, int *prog_fd);
+
+int bpf_set_link_xdp_fd(int ifindex, int fd, __u32 flags);
 #endif
-- 
2.15.1

^ permalink raw reply related

* [PATCH bpf-next 3/3] libbpf: break loop earlier
From: Eric Leblond @ 2017-12-25 22:13 UTC (permalink / raw)
  To: netdev, daniel; +Cc: linux-kernel, ast, Eric Leblond
In-Reply-To: <20171225221325.9680-1-eric@regit.org>

Get out of the loop when we have a match.

Signed-off-by: Eric Leblond <eric@regit.org>
---
 tools/lib/bpf/libbpf.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c
index 5fe8aaa2123e..d263748aa341 100644
--- a/tools/lib/bpf/libbpf.c
+++ b/tools/lib/bpf/libbpf.c
@@ -412,6 +412,7 @@ bpf_object__init_prog_names(struct bpf_object *obj)
 					   prog->section_name);
 				return -LIBBPF_ERRNO__LIBELF;
 			}
+			break;
 		}
 
 		if (!name) {
-- 
2.15.1

^ permalink raw reply related

* [PATCH bpf-next 2/3] libbpf: add error reporting in XDP
From: Eric Leblond @ 2017-12-25 22:13 UTC (permalink / raw)
  To: netdev, daniel; +Cc: linux-kernel, ast, Eric Leblond
In-Reply-To: <20171225221325.9680-1-eric@regit.org>

Parse netlink ext attribute to get the error message returned by
the card.

Signed-off-by: Eric Leblond <eric@regit.org>
---
 tools/lib/bpf/Build    |   2 +-
 tools/lib/bpf/bpf.c    |   9 +++
 tools/lib/bpf/nlattr.c | 188 +++++++++++++++++++++++++++++++++++++++++++++++++
 tools/lib/bpf/nlattr.h | 164 ++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 362 insertions(+), 1 deletion(-)
 create mode 100644 tools/lib/bpf/nlattr.c
 create mode 100644 tools/lib/bpf/nlattr.h

diff --git a/tools/lib/bpf/Build b/tools/lib/bpf/Build
index d8749756352d..64c679d67109 100644
--- a/tools/lib/bpf/Build
+++ b/tools/lib/bpf/Build
@@ -1 +1 @@
-libbpf-y := libbpf.o bpf.o
+libbpf-y := libbpf.o bpf.o nlattr.o
diff --git a/tools/lib/bpf/bpf.c b/tools/lib/bpf/bpf.c
index 1e3cfe6b9fce..cfd30a0cbce4 100644
--- a/tools/lib/bpf/bpf.c
+++ b/tools/lib/bpf/bpf.c
@@ -26,6 +26,7 @@
 #include <linux/bpf.h>
 #include "bpf.h"
 #include "libbpf.h"
+#include "nlattr.h"
 #include <linux/rtnetlink.h>
 #include <sys/socket.h>
 #include <errno.h>
@@ -436,6 +437,7 @@ int bpf_set_link_xdp_fd(int ifindex, int fd, __u32 flags)
 	struct nlmsghdr *nh;
 	struct nlmsgerr *err;
 	socklen_t addrlen;
+	int one;
 
 	memset(&sa, 0, sizeof(sa));
 	sa.nl_family = AF_NETLINK;
@@ -445,6 +447,12 @@ int bpf_set_link_xdp_fd(int ifindex, int fd, __u32 flags)
 		return -errno;
 	}
 
+	if (setsockopt(sock, SOL_NETLINK, NETLINK_EXT_ACK,
+		       &one, sizeof(one)) < 0) {
+		/* debug/verbose message that it is not supported */
+		fprintf(stderr, "Netlink error reporting not supported\n"); 
+	}
+
 	if (bind(sock, (struct sockaddr *)&sa, sizeof(sa)) < 0) {
 		ret = -errno;
 		goto cleanup;
@@ -521,6 +529,7 @@ int bpf_set_link_xdp_fd(int ifindex, int fd, __u32 flags)
 			if (!err->error)
 				continue;
 			ret = err->error;
+			nla_dump_errormsg(nh);
 			goto cleanup;
 		case NLMSG_DONE:
 			break;
diff --git a/tools/lib/bpf/nlattr.c b/tools/lib/bpf/nlattr.c
new file mode 100644
index 000000000000..962de14f74e3
--- /dev/null
+++ b/tools/lib/bpf/nlattr.c
@@ -0,0 +1,188 @@
+
+/*
+ * NETLINK      Netlink attributes
+ *
+ *		Authors:	Thomas Graf <tgraf@suug.ch>
+ *				Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
+ */
+
+#include <errno.h>
+#include "nlattr.h"
+#include <linux/rtnetlink.h>
+#include <string.h>
+#include <stdio.h>
+
+static const __u8 nla_attr_minlen[NLA_TYPE_MAX+1] = {
+	[NLA_U8]	= sizeof(__u8),
+	[NLA_U16]	= sizeof(__u16),
+	[NLA_U32]	= sizeof(__u32),
+	[NLA_U64]	= sizeof(__u64),
+	[NLA_MSECS]	= sizeof(__u64),
+	[NLA_NESTED]	= NLA_HDRLEN,
+	[NLA_S8]	= sizeof(__s8),
+	[NLA_S16]	= sizeof(__s16),
+	[NLA_S32]	= sizeof(__s32),
+	[NLA_S64]	= sizeof(__s64),
+};
+
+static int validate_nla(const struct nlattr *nla, int maxtype,
+			const struct nla_policy *policy)
+{
+	const struct nla_policy *pt;
+	int minlen = 0, attrlen = nla_len(nla), type = nla_type(nla);
+
+	if (type <= 0 || type > maxtype)
+		return 0;
+
+	pt = &policy[type];
+
+	if (pt->type > NLA_TYPE_MAX)
+		return -EINVAL;
+
+	switch (pt->type) {
+	case NLA_FLAG:
+		if (attrlen > 0)
+			return -ERANGE;
+		break;
+
+	case NLA_NUL_STRING:
+		if (pt->len)
+			minlen = min(attrlen, pt->len + 1);
+		else
+			minlen = attrlen;
+
+		if (!minlen || memchr(nla_data(nla), '\0', minlen) == NULL)
+			return -EINVAL;
+		/* fall through */
+
+	case NLA_STRING:
+		if (attrlen < 1)
+			return -ERANGE;
+
+		if (pt->len) {
+			char *buf = nla_data(nla);
+
+			if (buf[attrlen - 1] == '\0')
+				attrlen--;
+
+			if (attrlen > pt->len)
+				return -ERANGE;
+		}
+		break;
+
+	case NLA_BINARY:
+		if (pt->len && attrlen > pt->len)
+			return -ERANGE;
+		break;
+
+	case NLA_NESTED_COMPAT:
+		if (attrlen < pt->len)
+			return -ERANGE;
+		if (attrlen < NLA_ALIGN(pt->len))
+			break;
+		if (attrlen < NLA_ALIGN(pt->len) + NLA_HDRLEN)
+			return -ERANGE;
+		nla = nla_data(nla) + NLA_ALIGN(pt->len);
+		if (attrlen < NLA_ALIGN(pt->len) + NLA_HDRLEN + nla_len(nla))
+			return -ERANGE;
+		break;
+	case NLA_NESTED:
+		/* a nested attributes is allowed to be empty; if its not,
+		 * it must have a size of at least NLA_HDRLEN.
+		 */
+		if (attrlen == 0)
+			break;
+	default:
+		if (pt->len)
+			minlen = pt->len;
+		else if (pt->type != NLA_UNSPEC)
+			minlen = nla_attr_minlen[pt->type];
+
+		if (attrlen < minlen)
+			return -ERANGE;
+	}
+
+	return 0;
+}
+
+/**
+ * nla_parse - Parse a stream of attributes into a tb buffer
+ * @tb: destination array with maxtype+1 elements
+ * @maxtype: maximum attribute type to be expected
+ * @head: head of attribute stream
+ * @len: length of attribute stream
+ * @policy: validation policy
+ *
+ * Parses a stream of attributes and stores a pointer to each attribute in
+ * the tb array accessible via the attribute type. Attributes with a type
+ * exceeding maxtype will be silently ignored for backwards compatibility
+ * reasons. policy may be set to NULL if no validation is required.
+ *
+ * Returns 0 on success or a negative error code.
+ */
+static int nla_parse(struct nlattr **tb, int maxtype, const struct nlattr *head,
+	      int len, const struct nla_policy *policy)
+{
+	const struct nlattr *nla;
+	int rem, err;
+
+	memset(tb, 0, sizeof(struct nlattr *) * (maxtype + 1));
+
+	nla_for_each_attr(nla, head, len, rem) {
+		__u16 type = nla_type(nla);
+
+		if (type > 0 && type <= maxtype) {
+			if (policy) {
+				err = validate_nla(nla, maxtype, policy);
+				if (err < 0)
+					goto errout;
+			}
+
+			tb[type] = (struct nlattr *)nla;
+		}
+	}
+
+	err = 0;
+errout:
+	return err;
+}
+
+/* dump netlink extended ack error message */
+int nla_dump_errormsg(struct nlmsghdr *nlh)
+{
+	const struct nla_policy extack_policy[NLMSGERR_ATTR_MAX + 1] = {
+		[NLMSGERR_ATTR_MSG]	= { .type = NLA_STRING },
+		[NLMSGERR_ATTR_OFFS]	= { .type = NLA_U32 },
+	};
+	struct nlattr *tb[NLMSGERR_ATTR_MAX + 1], *attr;
+	struct nlmsgerr *err;
+	char *errmsg = NULL;
+	int hlen, alen;
+
+	/* no TLVs, nothing to do here */
+	if (!(nlh->nlmsg_flags & NLM_F_ACK_TLVS))
+		return 0;
+
+	err = (struct nlmsgerr *)NLMSG_DATA(nlh);
+	hlen = sizeof(*err);
+
+	/* if NLM_F_CAPPED is set then the inner err msg was capped */
+	if (!(nlh->nlmsg_flags & NLM_F_CAPPED))
+		hlen += nlmsg_len(&err->msg);
+
+	attr = (struct nlattr *) ((void *) err + hlen);
+	alen = nlh->nlmsg_len - hlen;
+
+	if (nla_parse(tb, NLMSGERR_ATTR_MAX, attr, alen, extack_policy) != 0) {
+		fprintf(stderr,
+			"Failed to parse extended error attributes\n");
+		return 0;
+	}
+
+	if (tb[NLMSGERR_ATTR_MSG])
+		errmsg = (char *) nla_data(tb[NLMSGERR_ATTR_MSG]);
+
+	fprintf(stderr, "Kernel error message: %s\n", errmsg);
+
+	return 0;
+}
diff --git a/tools/lib/bpf/nlattr.h b/tools/lib/bpf/nlattr.h
new file mode 100644
index 000000000000..b95f3e64c14d
--- /dev/null
+++ b/tools/lib/bpf/nlattr.h
@@ -0,0 +1,164 @@
+#ifndef __NLATTR_H
+#define __NLATTR_H
+
+#include <linux/netlink.h>
+
+/**
+ * Standard attribute types to specify validation policy
+ */
+enum {
+	NLA_UNSPEC,
+	NLA_U8,
+	NLA_U16,
+	NLA_U32,
+	NLA_U64,
+	NLA_STRING,
+	NLA_FLAG,
+	NLA_MSECS,
+	NLA_NESTED,
+	NLA_NESTED_COMPAT,
+	NLA_NUL_STRING,
+	NLA_BINARY,
+	NLA_S8,
+	NLA_S16,
+	NLA_S32,
+	NLA_S64,
+	__NLA_TYPE_MAX,
+};
+
+#define NLA_TYPE_MAX (__NLA_TYPE_MAX - 1)
+
+/**
+ * nla_type - attribute type
+ * @nla: netlink attribute
+ */
+static inline int nla_type(const struct nlattr *nla)
+{
+	return nla->nla_type & NLA_TYPE_MASK;
+}
+
+/**
+ * nla_len - length of payload
+ * @nla: netlink attribute
+ */
+static inline int nla_len(const struct nlattr *nla)
+{
+	return nla->nla_len - NLA_HDRLEN;
+}
+
+/**
+ * struct nla_policy - attribute validation policy
+ * @type: Type of attribute or NLA_UNSPEC
+ * @len: Type specific length of payload
+ *
+ * Policies are defined as arrays of this struct, the array must be
+ * accessible by attribute type up to the highest identifier to be expected.
+ *
+ * Meaning of `len' field:
+ *    NLA_STRING           Maximum length of string
+ *    NLA_NUL_STRING       Maximum length of string (excluding NUL)
+ *    NLA_FLAG             Unused
+ *    NLA_BINARY           Maximum length of attribute payload
+ *    NLA_NESTED           Don't use `len' field -- length verification is
+ *                         done by checking len of nested header (or empty)
+ *    NLA_NESTED_COMPAT    Minimum length of structure payload
+ *    NLA_U8, NLA_U16,
+ *    NLA_U32, NLA_U64,
+ *    NLA_S8, NLA_S16,
+ *    NLA_S32, NLA_S64,
+ *    NLA_MSECS            Leaving the length field zero will verify the
+ *                         given type fits, using it verifies minimum length
+ *                         just like "All other"
+ *    All other            Minimum length of attribute payload
+ *
+ * Example:
+ * static const struct nla_policy my_policy[ATTR_MAX+1] = {
+ *      [ATTR_FOO] = { .type = NLA_U16 },
+ *      [ATTR_BAR] = { .type = NLA_STRING, .len = BARSIZ },
+ *      [ATTR_BAZ] = { .len = sizeof(struct mystruct) },
+ * };
+ */
+struct nla_policy {
+	__u16	type;
+	__u16	len;
+};
+
+/**
+ * nla_ok - check if the netlink attribute fits into the remaining bytes
+ * @nla: netlink attribute
+ * @remaining: number of bytes remaining in attribute stream
+ */
+static inline int nla_ok(const struct nlattr *nla, int remaining)
+{
+	return remaining >= (int) sizeof(*nla) &&
+	       nla->nla_len >= sizeof(*nla) &&
+	       nla->nla_len <= remaining;
+}
+
+/**
+ * nla_next - next netlink attribute in attribute stream
+ * @nla: netlink attribute
+ * @remaining: number of bytes remaining in attribute stream
+ *
+ * Returns the next netlink attribute in the attribute stream and
+ * decrements remaining by the size of the current attribute.
+ */
+static inline struct nlattr *nla_next(const struct nlattr *nla, int *remaining)
+{
+	unsigned int totlen = NLA_ALIGN(nla->nla_len);
+
+	*remaining -= totlen;
+	return (struct nlattr *) ((char *) nla + totlen);
+}
+
+/**
+ * nla_for_each_attr - iterate over a stream of attributes
+ * @pos: loop counter, set to current attribute
+ * @head: head of attribute stream
+ * @len: length of attribute stream
+ * @rem: initialized to len, holds bytes currently remaining in stream
+ */
+#define nla_for_each_attr(pos, head, len, rem) \
+	for (pos = head, rem = len; \
+	     nla_ok(pos, rem); \
+	     pos = nla_next(pos, &(rem)))
+
+/**
+ * nla_data - head of payload
+ * @nla: netlink attribute
+ */
+static inline void *nla_data(const struct nlattr *nla)
+{
+	return (char *) nla + NLA_HDRLEN;
+}
+
+/**
+ * nla_get_u32 - return payload of u32 attribute
+ * @nla: u32 netlink attribute
+ */
+static inline __u32 nla_get_u32(const struct nlattr *nla)
+{
+	return *(__u32 *) nla_data(nla);
+}
+
+/**
+ * nla_get_u16 - return payload of u16 attribute
+ * @nla: u16 netlink attribute
+ */
+static inline __u16 nla_get_u16(const struct nlattr *nla)
+{
+	return *(__u16 *) nla_data(nla);
+}
+
+/**
+ * nlmsg_len - length of message payload
+ * @nlh: netlink message header
+ */
+static inline int nlmsg_len(const struct nlmsghdr *nlh)
+{
+	return nlh->nlmsg_len - NLMSG_HDRLEN;
+}
+
+int nla_dump_errormsg(struct nlmsghdr *nlh);
+
+#endif /* __NLATTR_H */
-- 
2.15.1

^ permalink raw reply related

* [PATCH bpf-next 0/3] add XDP loading support to libbpf
From: Eric Leblond @ 2017-12-25 22:13 UTC (permalink / raw)
  To: netdev, daniel; +Cc: linux-kernel, ast


Hello,

This patchset adds a function to load XDP eBPF file in the libbpf
library. It gets the netlink extended ack from the driver in case
of failure and print the error to stderr.

Best regards,
--
Eric Leblond

^ permalink raw reply

* [PATCH v2 bpf-next 2/3] selftests/bpf: additional stack depth tests
From: Alexei Starovoitov @ 2017-12-25 21:15 UTC (permalink / raw)
  To: David S . Miller; +Cc: Daniel Borkmann, Jann Horn, netdev, kernel-team
In-Reply-To: <20171225211542.3155576-1-ast@kernel.org>

to test inner logic of stack depth tracking

Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 tools/testing/selftests/bpf/test_verifier.c | 121 ++++++++++++++++++++++++++++
 1 file changed, 121 insertions(+)

diff --git a/tools/testing/selftests/bpf/test_verifier.c b/tools/testing/selftests/bpf/test_verifier.c
index 41dcc7dbba42..b5a7a6c530dc 100644
--- a/tools/testing/selftests/bpf/test_verifier.c
+++ b/tools/testing/selftests/bpf/test_verifier.c
@@ -8764,6 +8764,127 @@ static struct bpf_test tests[] = {
 		.result = REJECT,
 	},
 	{
+		"calls: stack depth check using three frames. test1",
+		.insns = {
+			/* main */
+			BPF_RAW_INSN(BPF_JMP|BPF_CALL, 0, 1, 0, 4), /* call A */
+			BPF_RAW_INSN(BPF_JMP|BPF_CALL, 0, 1, 0, 5), /* call B */
+			BPF_ST_MEM(BPF_B, BPF_REG_10, -32, 0),
+			BPF_MOV64_IMM(BPF_REG_0, 0),
+			BPF_EXIT_INSN(),
+			/* A */
+			BPF_ST_MEM(BPF_B, BPF_REG_10, -256, 0),
+			BPF_EXIT_INSN(),
+			/* B */
+			BPF_RAW_INSN(BPF_JMP|BPF_CALL, 0, 1, 0, -3), /* call A */
+			BPF_ST_MEM(BPF_B, BPF_REG_10, -64, 0),
+			BPF_EXIT_INSN(),
+		},
+		.prog_type = BPF_PROG_TYPE_XDP,
+		/* stack_main=32, stack_A=256, stack_B=64
+		 * and max(main+A, main+A+B) < 512
+		 */
+		.result = ACCEPT,
+	},
+	{
+		"calls: stack depth check using three frames. test2",
+		.insns = {
+			/* main */
+			BPF_RAW_INSN(BPF_JMP|BPF_CALL, 0, 1, 0, 4), /* call A */
+			BPF_RAW_INSN(BPF_JMP|BPF_CALL, 0, 1, 0, 5), /* call B */
+			BPF_ST_MEM(BPF_B, BPF_REG_10, -32, 0),
+			BPF_MOV64_IMM(BPF_REG_0, 0),
+			BPF_EXIT_INSN(),
+			/* A */
+			BPF_ST_MEM(BPF_B, BPF_REG_10, -64, 0),
+			BPF_EXIT_INSN(),
+			/* B */
+			BPF_RAW_INSN(BPF_JMP|BPF_CALL, 0, 1, 0, -3), /* call A */
+			BPF_ST_MEM(BPF_B, BPF_REG_10, -256, 0),
+			BPF_EXIT_INSN(),
+		},
+		.prog_type = BPF_PROG_TYPE_XDP,
+		/* stack_main=32, stack_A=64, stack_B=256
+		 * and max(main+A, main+A+B) < 512
+		 */
+		.result = ACCEPT,
+	},
+	{
+		"calls: stack depth check using three frames. test3",
+		.insns = {
+			/* main */
+			BPF_MOV64_REG(BPF_REG_6, BPF_REG_1),
+			BPF_RAW_INSN(BPF_JMP|BPF_CALL, 0, 1, 0, 6), /* call A */
+			BPF_MOV64_REG(BPF_REG_1, BPF_REG_6),
+			BPF_RAW_INSN(BPF_JMP|BPF_CALL, 0, 1, 0, 8), /* call B */
+			BPF_JMP_IMM(BPF_JGE, BPF_REG_6, 0, 1),
+			BPF_ST_MEM(BPF_B, BPF_REG_10, -64, 0),
+			BPF_MOV64_IMM(BPF_REG_0, 0),
+			BPF_EXIT_INSN(),
+			/* A */
+			BPF_JMP_IMM(BPF_JLT, BPF_REG_1, 10, 1),
+			BPF_EXIT_INSN(),
+			BPF_ST_MEM(BPF_B, BPF_REG_10, -224, 0),
+			BPF_JMP_IMM(BPF_JA, 0, 0, -3),
+			/* B */
+			BPF_JMP_IMM(BPF_JGT, BPF_REG_1, 2, 1),
+			BPF_RAW_INSN(BPF_JMP|BPF_CALL, 0, 1, 0, -6), /* call A */
+			BPF_ST_MEM(BPF_B, BPF_REG_10, -256, 0),
+			BPF_EXIT_INSN(),
+		},
+		.prog_type = BPF_PROG_TYPE_XDP,
+		/* stack_main=64, stack_A=224, stack_B=256
+		 * and max(main+A, main+A+B) > 512
+		 */
+		.errstr = "combined stack",
+		.result = REJECT,
+	},
+	{
+		"calls: stack depth check using three frames. test4",
+		/* void main(void) {
+		 *   func1(0);
+		 *   func1(1);
+		 *   func2(1);
+		 * }
+		 * void func1(int alloc_or_recurse) {
+		 *   if (alloc_or_recurse) {
+		 *     frame_pointer[-300] = 1;
+		 *   } else {
+		 *     func2(alloc_or_recurse);
+		 *   }
+		 * }
+		 * void func2(int alloc_or_recurse) {
+		 *   if (alloc_or_recurse) {
+		 *     frame_pointer[-300] = 1;
+		 *   }
+		 * }
+		 */
+		.insns = {
+			/* main */
+			BPF_MOV64_IMM(BPF_REG_1, 0),
+			BPF_RAW_INSN(BPF_JMP|BPF_CALL, 0, 1, 0, 6), /* call A */
+			BPF_MOV64_IMM(BPF_REG_1, 1),
+			BPF_RAW_INSN(BPF_JMP|BPF_CALL, 0, 1, 0, 4), /* call A */
+			BPF_MOV64_IMM(BPF_REG_1, 1),
+			BPF_RAW_INSN(BPF_JMP|BPF_CALL, 0, 1, 0, 7), /* call B */
+			BPF_MOV64_IMM(BPF_REG_0, 0),
+			BPF_EXIT_INSN(),
+			/* A */
+			BPF_JMP_IMM(BPF_JEQ, BPF_REG_1, 0, 2),
+			BPF_ST_MEM(BPF_B, BPF_REG_10, -300, 0),
+			BPF_EXIT_INSN(),
+			BPF_RAW_INSN(BPF_JMP|BPF_CALL, 0, 1, 0, 1), /* call B */
+			BPF_EXIT_INSN(),
+			/* B */
+			BPF_JMP_IMM(BPF_JEQ, BPF_REG_1, 0, 1),
+			BPF_ST_MEM(BPF_B, BPF_REG_10, -300, 0),
+			BPF_EXIT_INSN(),
+		},
+		.prog_type = BPF_PROG_TYPE_XDP,
+		.result = REJECT,
+		.errstr = "combined stack",
+	},
+	{
 		"calls: spill into caller stack frame",
 		.insns = {
 			BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0),
-- 
2.9.5

^ permalink raw reply related

* [PATCH v2 bpf-next 3/3] bpf: fix max call depth check
From: Alexei Starovoitov @ 2017-12-25 21:15 UTC (permalink / raw)
  To: David S . Miller; +Cc: Daniel Borkmann, Jann Horn, netdev, kernel-team
In-Reply-To: <20171225211542.3155576-1-ast@kernel.org>

fix off by one error in max call depth check
and add a test

Fixes: f4d7e40a5b71 ("bpf: introduce function calls (verification)")
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 kernel/bpf/verifier.c                       |  4 ++--
 tools/testing/selftests/bpf/test_verifier.c | 35 +++++++++++++++++++++++++++++
 2 files changed, 37 insertions(+), 2 deletions(-)

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 738e919efdf0..52ad60b3b8be 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -2126,9 +2126,9 @@ static int check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
 	struct bpf_func_state *caller, *callee;
 	int i, subprog, target_insn;
 
-	if (state->curframe >= MAX_CALL_FRAMES) {
+	if (state->curframe + 1 >= MAX_CALL_FRAMES) {
 		verbose(env, "the call stack of %d frames is too deep\n",
-			state->curframe);
+			state->curframe + 2);
 		return -E2BIG;
 	}
 
diff --git a/tools/testing/selftests/bpf/test_verifier.c b/tools/testing/selftests/bpf/test_verifier.c
index b5a7a6c530dc..5d0a574ce270 100644
--- a/tools/testing/selftests/bpf/test_verifier.c
+++ b/tools/testing/selftests/bpf/test_verifier.c
@@ -8885,6 +8885,41 @@ static struct bpf_test tests[] = {
 		.errstr = "combined stack",
 	},
 	{
+		"calls: stack depth check using three frames. test5",
+		.insns = {
+			/* main */
+			BPF_RAW_INSN(BPF_JMP|BPF_CALL, 0, 1, 0, 1), /* call A */
+			BPF_EXIT_INSN(),
+			/* A */
+			BPF_RAW_INSN(BPF_JMP|BPF_CALL, 0, 1, 0, 1), /* call B */
+			BPF_EXIT_INSN(),
+			/* B */
+			BPF_RAW_INSN(BPF_JMP|BPF_CALL, 0, 1, 0, 1), /* call C */
+			BPF_EXIT_INSN(),
+			/* C */
+			BPF_RAW_INSN(BPF_JMP|BPF_CALL, 0, 1, 0, 1), /* call D */
+			BPF_EXIT_INSN(),
+			/* D */
+			BPF_RAW_INSN(BPF_JMP|BPF_CALL, 0, 1, 0, 1), /* call E */
+			BPF_EXIT_INSN(),
+			/* E */
+			BPF_RAW_INSN(BPF_JMP|BPF_CALL, 0, 1, 0, 1), /* call F */
+			BPF_EXIT_INSN(),
+			/* F */
+			BPF_RAW_INSN(BPF_JMP|BPF_CALL, 0, 1, 0, 1), /* call G */
+			BPF_EXIT_INSN(),
+			/* G */
+			BPF_RAW_INSN(BPF_JMP|BPF_CALL, 0, 1, 0, 1), /* call H */
+			BPF_EXIT_INSN(),
+			/* H */
+			BPF_MOV64_IMM(BPF_REG_0, 0),
+			BPF_EXIT_INSN(),
+		},
+		.prog_type = BPF_PROG_TYPE_XDP,
+		.errstr = "call stack",
+		.result = REJECT,
+	},
+	{
 		"calls: spill into caller stack frame",
 		.insns = {
 			BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0),
-- 
2.9.5

^ permalink raw reply related

* [PATCH v2 bpf-next 0/3] bpf: two stack check fixes
From: Alexei Starovoitov @ 2017-12-25 21:15 UTC (permalink / raw)
  To: David S . Miller; +Cc: Daniel Borkmann, Jann Horn, netdev, kernel-team

Jann reported an issue with stack depth tracking. Fix it and add tests.
Also fix off-by-one error in MAX_CALL_FRAMES check.
This set is on top of Jann's "selftest for late caller stack size increase" test.

Alexei Starovoitov (3):
  bpf: fix maximum stack depth tracking logic
  selftests/bpf: additional stack depth tests
  bpf: fix max call depth check

 include/linux/bpf_verifier.h                |   1 +
 kernel/bpf/verifier.c                       |  86 +++++++++++----
 tools/testing/selftests/bpf/test_verifier.c | 156 ++++++++++++++++++++++++++++
 3 files changed, 225 insertions(+), 18 deletions(-)

-- 
2.9.5

^ permalink raw reply

* [PATCH v2 bpf-next 1/3] bpf: fix maximum stack depth tracking logic
From: Alexei Starovoitov @ 2017-12-25 21:15 UTC (permalink / raw)
  To: David S . Miller; +Cc: Daniel Borkmann, Jann Horn, netdev, kernel-team
In-Reply-To: <20171225211542.3155576-1-ast@kernel.org>

Instead of computing max stack depth for current call chain
during the main verifier pass track stack depth of each
function independently and after do_check() is done do
another pass over all instructions analyzing depth
of all possible call stacks.

Fixes: f4d7e40a5b71 ("bpf: introduce function calls (verification)")
Reported-by: Jann Horn <jannh@google.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf_verifier.h |  1 +
 kernel/bpf/verifier.c        | 82 +++++++++++++++++++++++++++++++++++---------
 2 files changed, 67 insertions(+), 16 deletions(-)

diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h
index aaac589e490c..94a02ceb1246 100644
--- a/include/linux/bpf_verifier.h
+++ b/include/linux/bpf_verifier.h
@@ -194,6 +194,7 @@ struct bpf_verifier_env {
 	struct bpf_insn_aux_data *insn_aux_data; /* array of per-insn state */
 	struct bpf_verifer_log log;
 	u32 subprog_starts[BPF_MAX_SUBPROGS];
+	/* computes the stack depth of each bpf function */
 	u16 subprog_stack_depth[BPF_MAX_SUBPROGS + 1];
 	u32 subprog_cnt;
 };
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 82ae580440b8..738e919efdf0 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -1430,33 +1430,80 @@ static int update_stack_depth(struct bpf_verifier_env *env,
 			      const struct bpf_func_state *func,
 			      int off)
 {
-	u16 stack = env->subprog_stack_depth[func->subprogno], total = 0;
-	struct bpf_verifier_state *cur = env->cur_state;
-	int i;
+	u16 stack = env->subprog_stack_depth[func->subprogno];
 
 	if (stack >= -off)
 		return 0;
 
 	/* update known max for given subprogram */
 	env->subprog_stack_depth[func->subprogno] = -off;
+	return 0;
+}
 
-	/* compute the total for current call chain */
-	for (i = 0; i <= cur->curframe; i++) {
-		u32 depth = env->subprog_stack_depth[cur->frame[i]->subprogno];
-
-		/* round up to 32-bytes, since this is granularity
-		 * of interpreter stack sizes
-		 */
-		depth = round_up(depth, 32);
-		total += depth;
-	}
+/* starting from main bpf function walk all instructions of the function
+ * and recursively walk all callees that given function can call.
+ * Ignore jump and exit insns.
+ * Since recursion is prevented by check_cfg() this algorithm
+ * only needs a local stack of MAX_CALL_FRAMES to remember callsites
+ */
+static int check_max_stack_depth(struct bpf_verifier_env *env)
+{
+	int depth = 0, frame = 0, subprog = 0, i = 0, subprog_end;
+	struct bpf_insn *insn = env->prog->insnsi;
+	int insn_cnt = env->prog->len;
+	int ret_insn[MAX_CALL_FRAMES];
+	int ret_prog[MAX_CALL_FRAMES];
 
-	if (total > MAX_BPF_STACK) {
+process_func:
+	/* round up to 32-bytes, since this is granularity
+	 * of interpreter stack size
+	 */
+	depth += round_up(max_t(u32, env->subprog_stack_depth[subprog], 1), 32);
+	if (depth > MAX_BPF_STACK) {
 		verbose(env, "combined stack size of %d calls is %d. Too large\n",
-			cur->curframe, total);
+			frame + 1, depth);
 		return -EACCES;
 	}
-	return 0;
+continue_func:
+	if (env->subprog_cnt == subprog)
+		subprog_end = insn_cnt;
+	else
+		subprog_end = env->subprog_starts[subprog];
+	for (; i < subprog_end; i++) {
+		if (insn[i].code != (BPF_JMP | BPF_CALL))
+			continue;
+		if (insn[i].src_reg != BPF_PSEUDO_CALL)
+			continue;
+		/* remember insn and function to return to */
+		ret_insn[frame] = i + 1;
+		ret_prog[frame] = subprog;
+
+		/* find the callee */
+		i = i + insn[i].imm + 1;
+		subprog = find_subprog(env, i);
+		if (subprog < 0) {
+			WARN_ONCE(1, "verifier bug. No program starts at insn %d\n",
+				  i);
+			return -EFAULT;
+		}
+		subprog++;
+		frame++;
+		if (frame >= MAX_CALL_FRAMES) {
+			WARN_ONCE(1, "verifier bug. Call stack is too deep\n");
+			return -EFAULT;
+		}
+		goto process_func;
+	}
+	/* end of for() loop means the last insn of the 'subprog'
+	 * was reached. Doesn't matter whether it was JA or EXIT
+	 */
+	if (frame == 0)
+		return 0;
+	depth -= round_up(max_t(u32, env->subprog_stack_depth[subprog], 1), 32);
+	frame--;
+	i = ret_insn[frame];
+	subprog = ret_prog[frame];
+	goto continue_func;
 }
 
 static int get_callee_stack_depth(struct bpf_verifier_env *env,
@@ -5404,6 +5451,9 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr)
 		sanitize_dead_code(env);
 
 	if (ret == 0)
+		ret = check_max_stack_depth(env);
+
+	if (ret == 0)
 		/* program is valid, convert *(u32*)(ctx + off) accesses */
 		ret = convert_ctx_accesses(env);
 
-- 
2.9.5

^ permalink raw reply related

* Re: [PATCH net-next] selftests/net: fix bugs in cfg_port initialization
From: Willem de Bruijn @ 2017-12-25 20:49 UTC (permalink / raw)
  To: Sowmini Varadhan; +Cc: Willem de Bruijn, Network Development, David Miller
In-Reply-To: <1514136201-111109-1-git-send-email-sowmini.varadhan@oracle.com>

On Sun, Dec 24, 2017 at 12:23 PM, Sowmini Varadhan
<sowmini.varadhan@oracle.com> wrote:
> If -S is not used in the command line, we should
> be binding to *.<cfg-port>. Similarly, cfg_port should be
> used to connect to the remote host even if it is processed
> after -D. Thus we need to make sure that the cfg_port in
> cfg_src_addr and cfg_dst_addr are always initialized
> after all other command line options are parsed.

Thanks, Sowmini. Yes, input parsing as is is dependent on
order, which is surprising and fragile.

It would be good to also address the other instance of this at the
same time: protocol family (-4 or -6) has to be set before a call to
setup_sockaddr. Unlike this case, it hits an error() if called in the
"incorrect" order, but that is still a crutch.

The new init_sockaddr_port duplicates most of setup_sockaddr.
More concise is to add a branch to that to skip inet_pton if !str_addr.

But even better will be to save cfg_port, and src + dst addr optargs
as local variables, then call the init function only after parsing when
all state is available. Where this patch adds calls to
init_sockaddr_port, indeed.

^ permalink raw reply

* iproute2 net-next
From: Stephen Hemminger @ 2017-12-25 18:49 UTC (permalink / raw)
  To: netdev

[-- Attachment #1: Type: text/plain, Size: 534 bytes --]

David Ahern has agreed to take over managing the net-next branch of iproute2.
The new location is:
 https://git.kernel.org/pub/scm/linux/kernel/git/dsahern/iproute2-next.git/

In the past, I have accepted new features into iproute2 master branch, but
am changing the policy so that outside of the merge window (up until -rc1)
new features will get put into net-next to get some more review and testing
time. This means that things like the proposed batch streaming mode will
go through net-next.

Thanks
Stephen Hemminger

[-- Attachment #2: OpenPGP digital signature --]
[-- Type: application/pgp-signature, Size: 833 bytes --]

^ permalink raw reply

* Fw: [Bug 198253] New: stopping htb results in null pointer dereference
From: Stephen Hemminger @ 2017-12-25 17:52 UTC (permalink / raw)
  To: netdev



Begin forwarded message:

Date: Mon, 25 Dec 2017 08:25:44 +0000
From: bugzilla-daemon@bugzilla.kernel.org
To: stephen@networkplumber.org
Subject: [Bug 198253] New: stopping htb results in null pointer dereference


https://bugzilla.kernel.org/show_bug.cgi?id=198253

            Bug ID: 198253
           Summary: stopping htb results in null pointer dereference
           Product: Networking
           Version: 2.5
    Kernel Version: 4.15
          Hardware: All
                OS: Linux
              Tree: Mainline
            Status: NEW
          Severity: low
          Priority: P1
         Component: Other
          Assignee: stephen@networkplumber.org
          Reporter: lorinc.czegledi@gmail.com
        Regression: No

Created attachment 261323
  --> https://bugzilla.kernel.org/attachment.cgi?id=261323&action=edit  
BUG: unable to handle kernel NULL pointer dereference at 0000000000000018

the latest kernel linux415-4.15.r17121 as built by the manjaro team, using
fireqos traffic shapeing frontend with attached config produces BUG: unable to
handle kernel NULL pointer dereference at 0000000000000018

-- 
You are receiving this mail because:
You are the assignee for the bug.

^ permalink raw reply

* Re: [PATCH net-next] virtio_net: Add ethtool stats
From: Florian Fainelli @ 2017-12-25 16:47 UTC (permalink / raw)
  To: Toshiaki Makita, Stephen Hemminger
  Cc: David S . Miller, Michael S . Tsirkin, Jason Wang, netdev,
	virtualization
In-Reply-To: <be339a85-4368-257e-5f23-d0e35c77cd5b@lab.ntt.co.jp>

On December 25, 2017 3:45:36 AM PST, Toshiaki Makita <makita.toshiaki@lab.ntt.co.jp> wrote:
>On 2017/12/25 3:16, Stephen Hemminger wrote:
>> On Wed, 20 Dec 2017 13:40:37 +0900
>> Toshiaki Makita <makita.toshiaki@lab.ntt.co.jp> wrote:
>> 
>>> +
>>> +static const struct virtnet_gstats virtnet_gstrings_stats[] = {
>>> +	{ "rx_packets",		VIRTNET_NETDEV_STAT(rx_packets) },
>>> +	{ "tx_packets",		VIRTNET_NETDEV_STAT(tx_packets) },
>>> +	{ "rx_bytes",		VIRTNET_NETDEV_STAT(rx_bytes) },
>>> +	{ "tx_bytes",		VIRTNET_NETDEV_STAT(tx_bytes) },
>>> +	{ "rx_dropped",		VIRTNET_NETDEV_STAT(rx_dropped) },
>>> +	{ "rx_length_errors",	VIRTNET_NETDEV_STAT(rx_length_errors) },
>>> +	{ "rx_frame_errors",	VIRTNET_NETDEV_STAT(rx_frame_errors) },
>>> +	{ "tx_dropped",		VIRTNET_NETDEV_STAT(tx_dropped) },
>>> +	{ "tx_fifo_errors",	VIRTNET_NETDEV_STAT(tx_fifo_errors) },
>>> +};
>>> +
>> 
>> Please do not merge pre-existing global stats into ethtool.
>> It just duplicates existing functionality.
>
>Thanks for the feedback.
>I thought it's handy to be able to check stats like this in ethtool as
>well. Some drivers like ixgbe and mlx5 do similar thing.
>I can remove these duplicate stats (unless anyone has objection).

For existing drivers, ethtool stats are kind of user ABI, so once these stats get in, we should not consider removing them. For new drivers or existing drivers without ethtool stats, we can do things a bit better as Stephen suggested.

-- 
Florian

^ permalink raw reply

* Re: [net 02/14] Revert "mlx5: move affinity hints assignments to generic code"
From: Sagi Grimberg @ 2017-12-25 13:53 UTC (permalink / raw)
  To: Saeed Mahameed, David S. Miller; +Cc: netdev, Thomas Gleixner, Jes Sorensen
In-Reply-To: <20171219222456.29627-3-saeedm@mellanox.com>


> Before the offending commit, mlx5 core did the IRQ affinity itself,
> and it seems that the new generic code have some drawbacks and one
> of them is the lack for user ability to modify irq affinity after
> the initial affinity values got assigned.
> 
> The issue is still being discussed and a solution in the new generic code
> is required, until then we need to revert this patch.
> 
> This fixes the following issue:
> echo <new affinity> > /proc/irq/<x>/smp_affinity
> fails with  -EIO
> 
> This reverts commit a435393acafbf0ecff4deb3e3cb554b34f0d0664.
> Note: kept mlx5_get_vector_affinity in include/linux/mlx5/driver.h since
 > it is used in mlx5_ib driver.

This won't work for sure because the msi_desc affinity cpumask won't
ever be populated. You need to re-implement it in mlx5 if you don't want
to break rdma ULPs that rely on it.

^ permalink raw reply

* Re: [PATCH] flex_can: Fix checking can_dlc
From: Oliver Hartkopp @ 2017-12-25 11:58 UTC (permalink / raw)
  To: Luu An Phu, wg, mkl; +Cc: linux-can, netdev, stefan-gabriel.mirea
In-Reply-To: <486344fc-f4cf-6311-ab83-ecf9958d16e9@hartkopp.net>

Answering myself after reading my own comment once more:

In fact the code fix seems to be correct but the commit comment was 
completely wrong which lead to my answer ...

can_dlc can hold values from 0 .. 8.

The first 4 bytes are placed in data[0..3]. When we have more(!) than 4 
bytes in can_dlc, the bytes 5..8 are placed in data[4..7].

The good thing was, that the current check was more conservative than 
your suggestion so that we did not detect any data loss.

Please send a V2 patch with corrected commit message.

Thanks,
Oliver

ps.

>> From: "phu.luuan" <phu.luuan@nxp.com>
>> + * Copyright 2017 NXP

A one-liner contribution like this tiny fix usually does not lead to an 
attribution in the copyrights. Your contribution is already perfectly 
recorded by the Signed-off-by tag.

^ permalink raw reply

* [RFC PATCH v11 3/5] mwifiex: Disable wakeup irq handling for pcie
From: Jeffy Chen @ 2017-12-25 11:47 UTC (permalink / raw)
  To: linux-kernel, bhelgaas
  Cc: linux-pm, tony, shawn.lin, briannorris, rjw, dianders, Jeffy Chen,
	Xinming Hu, Kalle Valo, Ganapathi Bhat, Amitkumar Karwar,
	linux-wireless, Nishant Sarmukadam, netdev
In-Reply-To: <20171225114742.18920-1-jeffy.chen@rock-chips.com>

We are going to handle the wakeup irq in the pci core.

Signed-off-by: Jeffy Chen <jeffy.chen@rock-chips.com>
---

Changes in v11: None
Changes in v10: None
Changes in v9: None
Changes in v8: None
Changes in v7: None
Changes in v6: None
Changes in v5: None
Changes in v3: None
Changes in v2: None

 drivers/net/wireless/marvell/mwifiex/main.c | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/drivers/net/wireless/marvell/mwifiex/main.c b/drivers/net/wireless/marvell/mwifiex/main.c
index a96bd7e653bf..3cc3403b977a 100644
--- a/drivers/net/wireless/marvell/mwifiex/main.c
+++ b/drivers/net/wireless/marvell/mwifiex/main.c
@@ -1567,6 +1567,10 @@ static void mwifiex_probe_of(struct mwifiex_adapter *adapter)
 		goto err_exit;
 
 	adapter->dt_node = dev->of_node;
+
+	if (adapter->iface_type != MWIFIEX_PCIE)
+		goto err_exit;
+
 	adapter->irq_wakeup = irq_of_parse_and_map(adapter->dt_node, 0);
 	if (!adapter->irq_wakeup) {
 		dev_dbg(dev, "fail to parse irq_wakeup from device tree\n");
-- 
2.11.0

^ permalink raw reply related

* [RFC PATCH v11 0/5] PCI: rockchip: Move PCIe WAKE# handling into pci core
From: Jeffy Chen @ 2017-12-25 11:47 UTC (permalink / raw)
  To: linux-kernel, bhelgaas
  Cc: Mark Rutland, linux-wireless, Heiko Stuebner, tony, linux-pci,
	shawn.lin, Will Deacon, Amitkumar Karwar, Frank Rowand,
	briannorris, linux-rockchip, Matthias Kaehlcke, linux-arm-kernel,
	Catalin Marinas, Caesar Wang, devicetree, Xinming Hu, linux-pm,
	Jeffy Chen, Nishant Sarmukadam, Rob Herring, Kalle Valo,
	Ganapathi Bhat, netdev, rjw, dianders, Enric Balletbo i Serra


Currently we are handling wake irq in mrvl wifi driver. Move it into
pci core.

Tested on my chromebook bob(with cros 4.4 kernel and mrvl wifi).


Changes in v11:
Only add irq definitions for PCI devices and rewrite the commit message.
Address Brian's comments.
Only support 1-per-device PCIe WAKE# pin as suggested.
Move to pcie port as Brian suggested.

Changes in v10:
Use device_set_wakeup_capable() instead of device_set_wakeup_enable(),
since dedicated wakeirq will be lost in device_set_wakeup_enable(false).

Changes in v9:
Add section for PCI devices and rewrite the commit message.
Fix check error in .cleanup().
Move dedicated wakeirq setup to setup() callback and use
device_set_wakeup_enable() to enable/disable.
Rewrite the commit message.

Changes in v8:
Add optional "pci", and rewrite commit message.
Add pci-of.c and use platform_pm_ops to handle the PCIe WAKE# signal.
Rewrite the commit message.

Changes in v7:
Move PCIE_WAKE handling into pci core.

Changes in v6:
Fix device_init_wake error handling, and add some comments.

Changes in v5:
Move to pci.txt
Rebase.
Use "wakeup" instead of "wake"

Changes in v3:
Fix error handling.

Changes in v2:
Use dev_pm_set_dedicated_wake_irq.

Jeffy Chen (5):
  dt-bindings: PCI: Add definition of PCIe WAKE# irq and PCI irq
  of/irq: Adjust of_pci_irq parsing for multiple interrupts
  mwifiex: Disable wakeup irq handling for pcie
  PCI / PM: Add support for the PCIe WAKE# signal for OF
  arm64: dts: rockchip: Move PCIe WAKE# irq to pcie port for Gru

 Documentation/devicetree/bindings/pci/pci.txt | 10 ++++
 arch/arm64/boot/dts/rockchip/rk3399-gru.dtsi  | 11 +++--
 drivers/net/wireless/marvell/mwifiex/main.c   |  4 ++
 drivers/of/of_pci_irq.c                       | 71 +++++++++++++++++++++++++--
 drivers/pci/pci-driver.c                      | 10 ++++
 include/linux/of_pci.h                        |  9 ++++
 6 files changed, 107 insertions(+), 8 deletions(-)

-- 
2.11.0

^ permalink raw reply

* Re: [PATCH net-next] virtio_net: Add ethtool stats
From: Toshiaki Makita @ 2017-12-25 11:45 UTC (permalink / raw)
  To: Stephen Hemminger
  Cc: netdev, virtualization, David S . Miller, Michael S . Tsirkin
In-Reply-To: <20171224101640.4b14002f@xeon-e3>

On 2017/12/25 3:16, Stephen Hemminger wrote:
> On Wed, 20 Dec 2017 13:40:37 +0900
> Toshiaki Makita <makita.toshiaki@lab.ntt.co.jp> wrote:
> 
>> +
>> +static const struct virtnet_gstats virtnet_gstrings_stats[] = {
>> +	{ "rx_packets",		VIRTNET_NETDEV_STAT(rx_packets) },
>> +	{ "tx_packets",		VIRTNET_NETDEV_STAT(tx_packets) },
>> +	{ "rx_bytes",		VIRTNET_NETDEV_STAT(rx_bytes) },
>> +	{ "tx_bytes",		VIRTNET_NETDEV_STAT(tx_bytes) },
>> +	{ "rx_dropped",		VIRTNET_NETDEV_STAT(rx_dropped) },
>> +	{ "rx_length_errors",	VIRTNET_NETDEV_STAT(rx_length_errors) },
>> +	{ "rx_frame_errors",	VIRTNET_NETDEV_STAT(rx_frame_errors) },
>> +	{ "tx_dropped",		VIRTNET_NETDEV_STAT(tx_dropped) },
>> +	{ "tx_fifo_errors",	VIRTNET_NETDEV_STAT(tx_fifo_errors) },
>> +};
>> +
> 
> Please do not merge pre-existing global stats into ethtool.
> It just duplicates existing functionality.

Thanks for the feedback.
I thought it's handy to be able to check stats like this in ethtool as
well. Some drivers like ixgbe and mlx5 do similar thing.
I can remove these duplicate stats (unless anyone has objection).

-- 
Toshiaki Makita

^ permalink raw reply

* Re: [PATCH] flex_can: Fix checking can_dlc
From: Oliver Hartkopp @ 2017-12-25 11:38 UTC (permalink / raw)
  To: Luu An Phu, wg, mkl; +Cc: linux-can, netdev, stefan-gabriel.mirea
In-Reply-To: <1513672833-28402-1-git-send-email-phu.luuan@nxp.com>

This patch looks wrong to me.

On 12/19/2017 09:40 AM, Luu An Phu wrote:
> From: "phu.luuan" <phu.luuan@nxp.com>
> 
> flexcan_start_xmit should write data to register when can_dlc > 4.
> Because can_dlc is data length and it has value from 1 to 8.

No. can_dlc can contain values from 0 to 8. Even 0 is a valid DLC.

> But CAN data
> index has value from 0 to 7. So in case we have data in cf->data[4],
> the can_dlc has value is more than 4.
> 
> Signed-off-by: Luu An Phu <phu.luuan@nxp.com>
> ---
>   drivers/net/can/flexcan.c | 3 ++-
>   1 file changed, 2 insertions(+), 1 deletion(-)
> 
> diff --git a/drivers/net/can/flexcan.c b/drivers/net/can/flexcan.c
> index 0626dcf..0e3ff13 100644
> --- a/drivers/net/can/flexcan.c
> +++ b/drivers/net/can/flexcan.c
> @@ -5,6 +5,7 @@
>    * Copyright (c) 2009 Sascha Hauer, Pengutronix
>    * Copyright (c) 2010-2017 Pengutronix, Marc Kleine-Budde <kernel@pengutronix.de>
>    * Copyright (c) 2014 David Jander, Protonic Holland
> + * Copyright 2017 NXP
>    *
>    * Based on code originally by Andrey Volkov <avolkov@varma-el.com>
>    *
> @@ -526,7 +527,7 @@ static int flexcan_start_xmit(struct sk_buff *skb, struct net_device *dev)
>   		data = be32_to_cpup((__be32 *)&cf->data[0]);
>   		flexcan_write(data, &priv->tx_mb->data[0]);
>   	}
> -	if (cf->can_dlc > 3) {

This is correct as data[0 .. 3] are filled from the code above. And if 
can_dlc is greater than 3 (== 4 .. 8) the following 4 bytes are copied 
into the registers.

So everything is correct with the current code.

> +	if (cf->can_dlc > 4) {
>   		data = be32_to_cpup((__be32 *)&cf->data[4]);
>   		flexcan_write(data, &priv->tx_mb->data[1]);
>   	}
> 

Regards,
Oliver

^ permalink raw reply

* Re: [patch net-next 02/10] devlink: Add support for resource abstraction
From: Arkadi Sharshevsky @ 2017-12-25 11:12 UTC (permalink / raw)
  To: David Miller, jiri
  Cc: netdev, mlxsw, andrew, vivien.didelot, f.fainelli, michael.chan,
	ganeshgr, saeedm, matanb, leonro, idosch, jakub.kicinski, ast,
	daniel, simon.horman, pieter.jansenvanvuuren, john.hurley,
	alexander.h.duyck, linville, gospo, steven.lin1, yuvalm, ogerlitz,
	dsa, roopa
In-Reply-To: <20171220.144358.1892414104907740492.davem@davemloft.net>



On 12/20/2017 09:43 PM, David Miller wrote:
> From: Jiri Pirko <jiri@resnulli.us>
> Date: Wed, 20 Dec 2017 12:58:13 +0100
> 
>> From: Arkadi Sharshevsky <arkadis@mellanox.com>
>>
>> Add support for hardware resource abstraction over devlink. Each resource
>> is identified via id, furthermore it contains information regarding its
>> size and its related sub resources. Each resource can also provide its
>> current occupancy.
>>
>> In some cases the sizes of some resources can be changed, yet for those
>> changes to take place a hot driver reload may be needed. The reload
>> capability will be introduced in the next patch.
>>
>> Signed-off-by: Arkadi Sharshevsky <arkadis@mellanox.com>
>> Signed-off-by: Jiri Pirko <jiri@mellanox.com>
> 
> In what units are these sizes?  If it depends upon the resource, it would
> be great to have a way to introspect the units given a resource.
> 

This is problematic. Currently the units are actually double words
(single entry is 64 bit) because this resource is a actually a memory.
So my first thought was adding an enum in UAPI of resource_units

enum resource_units {
	DEVLINK_RESOURCE_UNITS_WORD,
	DEVLINK_RESOURCE_UNITS_DOUBLE_WORD,
	DEVLINK_RESOURCE_UNITS_ITEM, /* this is in order to define some
                                        driver specific stuff*/
        ...
};

But the 'item' is too vague, because for example, we will have the
RIF bank as resource. What unit will it have? rifs? items?

Any inputs on this?


>> +	struct devlink_resource_ops *resource_ops;
> 
> Const?
> 
>> +static inline int
>> +devlink_resource_register(struct devlink *devlink,
>> +			  const char *resource_name,
>> +			  bool top_hierarchy,
>> +			  u64 resource_size,
>> +			  u64 resource_id,
>> +			  u64 parent_resource_id,
>> +			  struct devlink_resource_ops *resource_ops)
> 
> Const for resource_ops?
> 
>> +int devlink_resource_register(struct devlink *devlink,
>> +			      const char *resource_name,
>> +			      bool top_hierarchy,
>> +			      u64 resource_size,
>> +			      u64 resource_id,
>> +			      u64 parent_resource_id,
>> +			      struct devlink_resource_ops *resource_ops)
> 
> Likewise.
> 

^ permalink raw reply

* Re: [patch net-next v4 00/10] net: sched: allow qdiscs to share filter block instances
From: Jiri Pirko @ 2017-12-25 10:23 UTC (permalink / raw)
  To: David Ahern
  Cc: netdev, davem, jhs, xiyou.wangcong, mlxsw, andrew, vivien.didelot,
	f.fainelli, michael.chan, ganeshgr, saeedm, matanb, leonro,
	idosch, jakub.kicinski, simon.horman, pieter.jansenvanvuuren,
	john.hurley, alexander.h.duyck, ogerlitz, john.fastabend, daniel
In-Reply-To: <780a80d0-9384-ae34-4cab-3070b004b64e@gmail.com>

Sun, Dec 24, 2017 at 05:25:41PM CET, dsahern@gmail.com wrote:
>On 12/24/17 1:19 AM, Jiri Pirko wrote:
>> Sun, Dec 24, 2017 at 02:54:47AM CET, dsahern@gmail.com wrote:
>>> On 12/23/17 9:54 AM, Jiri Pirko wrote:
>>>> So back to the example. First, we create 2 qdiscs. Both will share
>>>> block number 22. "22" is just an identification. If we don't pass any
>>>> block number, a new one will be generated by kernel:
>>>>
>>>> $ tc qdisc add dev ens7 ingress block 22
>>>>                                 ^^^^^^^^
>>>> $ tc qdisc add dev ens8 ingress block 22
>>>>                                 ^^^^^^^^
>>>>
>>>> Now if we list the qdiscs, we will see the block index in the output:
>>>>
>>>> $ tc qdisc
>>>> qdisc ingress ffff: dev ens7 parent ffff:fff1 block 22
>>>> qdisc ingress ffff: dev ens8 parent ffff:fff1 block 22
>>>>
>>>> To make is more visual, the situation looks like this:
>>>>
>>>>    ens7 ingress qdisc                 ens7 ingress qdisc
>>>>           |                                  |
>>>>           |                                  |
>>>>           +---------->  block 22  <----------+
>>>>
>>>> Unlimited number of qdiscs may share the same block.
>>>>
>>>> Now we can add filter to any of qdiscs sharing the same block:
>>>>
>>>> $ tc filter add dev ens7 ingress protocol ip pref 25 flower dst_ip 192.168.0.0/16 action drop
>>>
>>>
>>> Allowing config of a shared block through any qdisc that references it
>>> is akin to me allowing nexthop objects to be manipulated by any route
>>> that references it -- sure, it could be done but causes a lot surprises
>>> to the user.
>>>
>>> You are adding a new tc object -- a shared block. Why the resistance to
>>> creating a proper API for managing it?
>> 
>> Again, no resistance, I said many times it would be done as a follow-up.
>> But as an api already exists, it has to continue to work. Or do you
>> suggest it should stop working? That, I don't agree with.
>> 
>
>That is exactly what I am saying - principle of least surprise. The new
>object brings its own API and can only be modified using the new API.
>The scheme above can and will surprise users. You are thinking like a tc
>developer, someone intimately familiar with the code, and not like an
>ordinary user of this new feature.

Breaking exising tools is newer good. Note that not only about filter
add/del iface but also dump and notifications. I agree to extend the api
for the "block handle", sure, but the existing api should continue to
work.

^ permalink raw reply

* Re: [QUESTION] Doubt about NAPI_GRO_CB(skb)->is_atomic in tcpv4 gro process
From: Yunsheng Lin @ 2017-12-25  9:32 UTC (permalink / raw)
  To: Alexander Duyck
  Cc: netdev@vger.kernel.org, davem@davemloft.net, linuxarm@huawei.com,
	yuxiaowu, wzhen.wang, Xuehuahu
In-Reply-To: <CAKgT0Uczi6icYfQnuYYzA3_keXVTjJ2GEKfBabPbwFtcd=nddg@mail.gmail.com>

Hi, Alexander

On 2017/12/23 0:32, Alexander Duyck wrote:
> On Fri, Dec 22, 2017 at 12:49 AM, Yunsheng Lin <linyunsheng@huawei.com> wrote:
>> Hi, Alexander
>>
>> On 2017/12/22 0:29, Alexander Duyck wrote:
>>> On Thu, Dec 21, 2017 at 1:16 AM, Yunsheng Lin <linyunsheng@huawei.com> wrote:
>>>> Hi, Alexander
>>>>
>>>> On 2017/12/21 0:24, Alexander Duyck wrote:
>>>>> On Wed, Dec 20, 2017 at 1:09 AM, Yunsheng Lin <linyunsheng@huawei.com> wrote:
>>>>>> Hi, all
>>>>>>         I have some doubt about NAPI_GRO_CB(skb)->is_atomic when
>>>>>> analyzing the tcpv4 gro process:
>>>>>>
>>>>>> Firstly we set NAPI_GRO_CB(skb)->is_atomic to 1 in dev_gro_receive:
>>>>>> https://elixir.free-electrons.com/linux/v4.15-rc4/source/net/core/dev.c#L4838
>>>>>>
>>>>>> And then in inet_gro_receive, we check the NAPI_GRO_CB(skb)->is_atomic
>>>>>> before setting NAPI_GRO_CB(skb)->is_atomic according to IP_DF bit in the ip header:
>>>>>> https://elixir.free-electrons.com/linux/v4.15-rc4/source/net/ipv4/af_inet.c#L1319
>>>>>>
>>>>>> struct sk_buff **inet_gro_receive(struct sk_buff **head, struct sk_buff *skb)
>>>>>> {
>>>>>> .....................
>>>>>>         for (p = *head; p; p = p->next) {
>>>>>> ........................
>>>>>>
>>>>>>                 /* If the previous IP ID value was based on an atomic
>>>>>>                  * datagram we can overwrite the value and ignore it.
>>>>>>                  */
>>>>>>                 if (NAPI_GRO_CB(skb)->is_atomic)                      //we check it here
>>>>>>                         NAPI_GRO_CB(p)->flush_id = flush_id;
>>>>>>                 else
>>>>>>                         NAPI_GRO_CB(p)->flush_id |= flush_id;
>>>>>>         }
>>>>>>
>>>>>>         NAPI_GRO_CB(skb)->is_atomic = !!(iph->frag_off & htons(IP_DF));  //we set it here
>>>>>>         NAPI_GRO_CB(skb)->flush |= flush;
>>>>>>         skb_set_network_header(skb, off);
>>>>>> ................................
>>>>>> }
>>>>>>
>>>>>> My question is whether we should check the NAPI_GRO_CB(skb)->is_atomic or NAPI_GRO_CB(p)->is_atomic?
>>>>>> If we should check NAPI_GRO_CB(skb)->is_atomic, then maybe it is unnecessary because it is alway true.
>>>>>> If we should check NAPI_GRO_CB(p)->is_atomic, maybe there is a bug here.
>>>>>>
>>>>>> So what is the logic here? I am just start analyzing the gro, maybe I miss something obvious here.
>>>>>
>>>>> The logic there is to address the multiple IP header case where there
>>>>> are 2 or more IP headers due to things like VXLAN or GRE tunnels. So
>>>>> what will happen is that an outer IP header will end up being sent
>>>>> with DF not set and will clear the is_atomic value then we want to OR
>>>>> in the next header that is applied. It defaults to assignment on
>>>>> is_atomic because the first IP header will encounter flush_id with no
>>>>> previous configuration occupying it.
>>>>
>>>> I see your point now.
>>>>
>>>> But for the same flow of tunnels packet, the outer and inner ip header must
>>>> have the same fixed id or increment id?
>>>>
>>>> For example, if we have a flow of tunnels packet which has fixed id in outer
>>>> header and increment id in inner header(the inner header does have DF flag set):
>>
>> Sorry, a typo error here. I meant the inner header does *not* have DF flag set here.
>>
>>>>
>>>> 1. For the first packet, NAPI_GRO_CB(skb)->is_atomic will be set to zero when
>>>> inet_gro_receive is processing the inner ip header.
>>>>
>>>> 2. For the second packet, when inet_gro_receive is processing the outer ip header
>>>> which has a fixed id, NAPI_GRO_CB(p)->is_atomic is zero according to [1], so
>>>> NAPI_GRO_CB(p)->flush_id will be set to 0xFFFF, then the second packet will not
>>>> be merged to first packet in tcp_gro_receive.
>>>
>>> I'm not sure how valid your case here is. The is_atomic is only really
>>> meant to apply to the inner-most header.
>>
>> For the new skb, NAPI_GRO_CB(skb)->is_atomic is indeed applied to the
>> inner-most header.
>>
>> What about the NAPI_GRO_CB(p)->is_atomic, p is the same skb flow already
>> merged by gro.
>>
>> Let me try if I understand it correctly:
>> when there is only one skb merged in p, then NAPI_GRO_CB(p)->is_atomic
>> is set according to the first skb' inner-most ip DF flags.
>>
>> When the second skb comes, and inet_gro_receive is processing the
>> outer-most ip, for the below code, NAPI_GRO_CB(p)->is_atomic
>> is for first skb's inner-most ip DF flags, and "iph->frag_off & htons(IP_DF)"
>> is for second skb' outer-most ip DF flags.
>> Why don't we use the first skb's outer-most ip DF flags here? I think it is
>> more logical to use first skb's outer-most ip DF flags here. But the first
>> skb's outer-most ip DF flags is lost when we get here, right?
>>
>>                 if (!NAPI_GRO_CB(p)->is_atomic ||
>>                     !(iph->frag_off & htons(IP_DF))) {
>>                         flush_id ^= NAPI_GRO_CB(p)->count;
>>                         flush_id = flush_id ? 0xFFFF : 0;
>>                 }
> 
> We already did in a way. If you look earlier up we will set flush if
> the DF bit of this packet and the next don't match. So if the DF bit
> changes we are flushing anyway.

Hmm, sorry for missing that.

What puzzle me most is the differentiation between NAPI_GRO_CB(p)->is_atomic
and NAPI_GRO_CB(skb)->is_atomic. So I spent some time on reading the code
related to them. Here is what I found:

Suppose there are only two ip headers and focus on the DF flag and ID field
in ip header(assume other conditions are meet). Because when the same flow
of second skb is being processed, the NAPI_GRO_CB(p)->is_atomic set to 0 or 1
accoding to the first skb' inner-most ip DF flag, So I analyze it based on
the first skb' inner-most ip DF flag.

When the first skb' inner-most ip DF flag is not set, then
NAPI_GRO_CB(p)->is_atomic is always zero. When the skb's DF flags and ID field
meet the follow condition, it will be merged:

outer-DF    outer-ID  inner-DF  inner-ID
    1          any       0      increment
    0       increment    0      increment


When the first skb' inner-most ip DF flag is set, NAPI_GRO_CB(p)->is_atomic is 1
at first and will be changed accordingly when processing the second skb.
When the skb's DF flags and ID field meet the follow condition, it will be merged
and NAPI_GRO_CB(p)->is_atomic is changed accordingly:

outer-DF   outer-ID   inner-DF  inner-ID
  1          any         1       fixed
  1          any         1      increment  [NAPI_GRO_CB(p)->is_atomic changes to zero]
  0        increment     1      increment  [NAPI_GRO_CB(p)->is_atomic changes to zero]
  0        increment     1       fixed

Is my above analysis correct? I have tried my best to understand it. :)

According to the analysis above, If the DF flags in outer-most ip header is set,
then ID can be any value. But not the same as the ID in the inner-most ip header,
it can only be fixed or increment. According to rcf6864 originating sources MAY
set the IPv4 ID field of atomic datagrams to any value if DF is set.
I wonder what is the different here?

Also I wonder if gro support more than two ip header now.


Refer:
https://tools.ietf.org/html/rfc6864#section-4.1

> 
>>
>>
>>  In the case of TCP the
>>> inner-most header should almost always have the DF bit set which means
>>> the inner-most is almost always atomic.
>>>
>>>> I thought outer ip header could have a fixed id while inner ip header could
>>>> have a increment id. Do I miss something here?
>>>
>>> You have it backwards. The innermost will have DF bit set so it can be
>>> fixed, the outer-most will in many cases not since it is usually UDP
>>> and as such it will likely need to increment.
>>
>> Actually I got it backwards because that is what intel do when TSOing.
>>
>> Below is quoted from 710 datasheet in page 1052:
>> The Identification field (in the IP header in non tunneled packets or the inner IP header in
>> tunneled packets) is taken from the TSO header in the first segment and then it is increased by
>> one for each transmitted segment.
>> The Identification field (in the external IP header) is taken from the TSO header in the first
>> segment and then it is either increased by one for each transmitted segment or remains
>> constant depending on the EIP_NOINC setting in the transmit context descriptor.
>>
>> If I understand it correctly, intel is doing oppositely as pointed out by you.
>> Am I miss something here?
>>
>> Refer:
>> https://www.intel.com/content/dam/www/public/us/en/documents/datasheets/xl710-10-40-controller-datasheet.pdf
> 
> The only form of IP ID mangling supported by i40e is to increment
> everything always for outer and inner header. This code was meant more
> to handle the setup generated by igb or ixgbe when we are doing
> GSO_PARTIAL. We don't support fixed outer IDs in that driver if I
> recall correctly. The hardware might support it but the stack doesn't.
> Basically if DF is not set then we must increment and if DF is set we
> could either increment or be fixed and we would be within spec for how
> IP IDs are supposed to be handled.>
>>>
>>>>>
>>>>> The part I am not sure about is if we should be using assignment for
>>>>> is_atomic or using an "&=" to clear the bit and leave it cleared.
>>>>
>>>> I am not sure I understood you here. is_atomic is a bit field, why do you
>>>> want to use "&="?
>>>
>>> Actually that was my mind kind of wandering. It has been a while since
>>> I looked at this code and the use of &= wouldn't be appropriate since
>>> is_atomic should only apply to the innermost header.
>>>
>>> Basically the only acceptable combinations for is_atomic and flush_id
>>> are false with 0, or true with 1. We can't have a fixed outer header
>>> value if DF is not set.
>>>
>>> Hope that helps to clarify things.
>>
>> Yes, Your reqly has helped a lot. But I still have some doubt above,
>> please help me understand it.
>> I hope this does not take much of your time.
>> Thanks very much.
>>
>> Yunsheng Lin
>>
> 
> You are free to doubt, but I suggest you actually walk through the
> protocol and thoroughly read the code. Arguing hypothetical bugs is
> rather tedious when I can easily point out the flaws and why something
> isn't a bug.

I will try my best to do as you suggested.
Thanks very much for your time.

Best Regards
Yunsheng Lin

> 
> - Alex
> 
> .
> 

^ permalink raw reply

* [patch net-next 0/2] net: sched: Fix RED qdisc offload flag
From: Nogah Frankel @ 2017-12-25  8:51 UTC (permalink / raw)
  To: netdev; +Cc: davem, jhs, xiyou.wangcong, jiri, mlxsw, Nogah Frankel

Replacing the RED offload flag (TC_RED_OFFLOADED) with the generic one
(TCQ_F_OFFLOADED) deleted some of the logic behind it. This patchset fixes
this problem.

Nogah Frankel (2):
  net_sch: red: Fix the new offload indication
  net: sched: Move offload check till after dump call

 net/sched/sch_api.c |  5 ++---
 net/sched/sch_red.c | 26 ++++++++++++++------------
 2 files changed, 16 insertions(+), 15 deletions(-)

-- 
2.4.3

^ permalink raw reply

* [patch net-next 2/2] net: sched: Move offload check till after dump call
From: Nogah Frankel @ 2017-12-25  8:51 UTC (permalink / raw)
  To: netdev; +Cc: davem, jhs, xiyou.wangcong, jiri, mlxsw, Nogah Frankel
In-Reply-To: <1514191902-53752-1-git-send-email-nogahf@mellanox.com>

Move the check of the offload state to after the qdisc dump action was
called, so the qdisc could update it if it was changed.

Fixes: 7a4fa29106d9 ("net: sched: Add TCA_HW_OFFLOAD")
Signed-off-by: Nogah Frankel <nogahf@mellanox.com>
Reviewed-by: Yuval Mintz <yuvalm@mellanox.com>
---
 net/sched/sch_api.c | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c
index 3a3a1da..758f132 100644
--- a/net/sched/sch_api.c
+++ b/net/sched/sch_api.c
@@ -807,11 +807,10 @@ static int tc_fill_qdisc(struct sk_buff *skb, struct Qdisc *q, u32 clid,
 	tcm->tcm_info = refcount_read(&q->refcnt);
 	if (nla_put_string(skb, TCA_KIND, q->ops->id))
 		goto nla_put_failure;
-	if (nla_put_u8(skb, TCA_HW_OFFLOAD, !!(q->flags & TCQ_F_OFFLOADED)))
-		goto nla_put_failure;
 	if (q->ops->dump && q->ops->dump(q, skb) < 0)
 		goto nla_put_failure;
-
+	if (nla_put_u8(skb, TCA_HW_OFFLOAD, !!(q->flags & TCQ_F_OFFLOADED)))
+		goto nla_put_failure;
 	qlen = qdisc_qlen_sum(q);
 
 	stab = rtnl_dereference(q->stab);
-- 
2.4.3

^ permalink raw reply related

* [patch net-next 1/2] net_sch: red: Fix the new offload indication
From: Nogah Frankel @ 2017-12-25  8:51 UTC (permalink / raw)
  To: netdev; +Cc: davem, jhs, xiyou.wangcong, jiri, mlxsw, Nogah Frankel
In-Reply-To: <1514191902-53752-1-git-send-email-nogahf@mellanox.com>

Update the offload flag, TCQ_F_OFFLOADED, in each dump call (and ignore
the offloading function return value in relation to this flag).
This is done because a qdisc is being initialized, and therefore offloaded
before being grafted. Since the ability of the driver to offload the qdisc
depends on its location, a qdisc can be offloaded and un-offloaded by graft
calls, that doesn't effect the qdisc itself.

Fixes: 428a68af3a7c ("net: sched: Move to new offload indication in RED"
Signed-off-by: Nogah Frankel <nogahf@mellanox.com>
Reviewed-by: Yuval Mintz <yuvalm@mellanox.com>
---
 net/sched/sch_red.c | 26 ++++++++++++++------------
 1 file changed, 14 insertions(+), 12 deletions(-)

diff --git a/net/sched/sch_red.c b/net/sched/sch_red.c
index ec0bd36..a392eaa 100644
--- a/net/sched/sch_red.c
+++ b/net/sched/sch_red.c
@@ -157,7 +157,6 @@ static int red_offload(struct Qdisc *sch, bool enable)
 		.handle = sch->handle,
 		.parent = sch->parent,
 	};
-	int err;
 
 	if (!tc_can_offload(dev) || !dev->netdev_ops->ndo_setup_tc)
 		return -EOPNOTSUPP;
@@ -172,14 +171,7 @@ static int red_offload(struct Qdisc *sch, bool enable)
 		opt.command = TC_RED_DESTROY;
 	}
 
-	err = dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_QDISC_RED, &opt);
-
-	if (!err && enable)
-		sch->flags |= TCQ_F_OFFLOADED;
-	else
-		sch->flags &= ~TCQ_F_OFFLOADED;
-
-	return err;
+	return dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_QDISC_RED, &opt);
 }
 
 static void red_destroy(struct Qdisc *sch)
@@ -297,12 +289,22 @@ static int red_dump_offload_stats(struct Qdisc *sch, struct tc_red_qopt *opt)
 			.stats.qstats = &sch->qstats,
 		},
 	};
+	int err;
+
+	sch->flags &= ~TCQ_F_OFFLOADED;
 
-	if (!(sch->flags & TCQ_F_OFFLOADED))
+	if (!tc_can_offload(dev) || !dev->netdev_ops->ndo_setup_tc)
+		return 0;
+
+	err = dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_QDISC_RED,
+					    &hw_stats);
+	if (err == -EOPNOTSUPP)
 		return 0;
 
-	return dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_QDISC_RED,
-					     &hw_stats);
+	if (!err)
+		sch->flags |= TCQ_F_OFFLOADED;
+
+	return err;
 }
 
 static int red_dump(struct Qdisc *sch, struct sk_buff *skb)
-- 
2.4.3

^ permalink raw reply related


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox