Netdev List
 help / color / mirror / Atom feed
* [patch iproute2] tc: add -bs option for batch mode
From: Chris Mi @ 2017-12-19  6:33 UTC (permalink / raw)
  To: netdev; +Cc: gerlitz.or

Currently in tc batch mode, only one command is read from the batch
file and sent to kernel to process. With this patch, we can accumulate
several commands before sending to kernel. The batch size is specified
using option -bs or -batchsize.

To accumulate the commands in tc, we allocate an array of struct iovec.
If batchsize is bigger than 1 and we haven't accumulated enough
commands, rtnl_talk() will return without actually sending the message.
One exception is that there is no more command in the batch file.

But please note that kernel still processes the requests one by one.
To process the requests in parallel in kernel is another effort.
The time we're saving in this patch is the user mode and kernel mode
context switch. So this patch works on top of the current kernel.

Using the following script in kernel, we can generate 1,000,000 rules.
	tools/testing/selftests/tc-testing/tdc_batch.py

Without this patch, 'tc -b $file' exection time is:

real    0m14.916s
user    0m6.808s
sys     0m8.046s

With this patch, 'tc -b $file -bs 10' exection time is:

real    0m12.286s
user    0m5.903s
sys     0m6.312s

The insertion rate is improved more than 10%.

Signed-off-by: Chris Mi <chrism@mellanox.com>
---
 include/libnetlink.h | 27 ++++++++++++++++
 include/utils.h      |  8 +++++
 lib/libnetlink.c     | 30 +++++++++++++-----
 lib/utils.c          | 20 ++++++++++++
 man/man8/tc.8        |  5 +++
 tc/m_action.c        | 63 ++++++++++++++++++++++++++++---------
 tc/tc.c              | 27 ++++++++++++++--
 tc/tc_common.h       |  3 ++
 tc/tc_filter.c       | 89 ++++++++++++++++++++++++++++++++++++----------------
 9 files changed, 221 insertions(+), 51 deletions(-)

diff --git a/include/libnetlink.h b/include/libnetlink.h
index a4d83b9e..07e88c94 100644
--- a/include/libnetlink.h
+++ b/include/libnetlink.h
@@ -13,6 +13,8 @@
 #include <linux/netconf.h>
 #include <arpa/inet.h>
 
+#define MSG_IOV_MAX 256
+
 struct rtnl_handle {
 	int			fd;
 	struct sockaddr_nl	local;
@@ -93,6 +95,31 @@ int rtnl_dump_filter_nc(struct rtnl_handle *rth,
 			void *arg, __u16 nc_flags);
 #define rtnl_dump_filter(rth, filter, arg) \
 	rtnl_dump_filter_nc(rth, filter, arg, 0)
+
+extern int msg_iov_index;
+static inline int get_msg_iov_index(void)
+{
+	return msg_iov_index;
+}
+static inline void set_msg_iov_index(int index)
+{
+	msg_iov_index = index;
+}
+static inline void incr_msg_iov_index(void)
+{
+	++msg_iov_index;
+}
+
+extern int batch_size;
+static inline int get_batch_size(void)
+{
+	return batch_size;
+}
+static inline void set_batch_size(int size)
+{
+	batch_size = size;
+}
+
 int rtnl_talk(struct rtnl_handle *rtnl, struct nlmsghdr *n,
 	      struct nlmsghdr **answer)
 	__attribute__((warn_unused_result));
diff --git a/include/utils.h b/include/utils.h
index d3895d56..66cb4747 100644
--- a/include/utils.h
+++ b/include/utils.h
@@ -235,6 +235,14 @@ void print_nlmsg_timestamp(FILE *fp, const struct nlmsghdr *n);
 
 extern int cmdlineno;
 ssize_t getcmdline(char **line, size_t *len, FILE *in);
+
+extern int cmdlinetotal;
+static inline int getcmdlinetotal(void)
+{
+	return cmdlinetotal;
+}
+void setcmdlinetotal(const char *name);
+
 int makeargs(char *line, char *argv[], int maxargs);
 int inet_get_addr(const char *src, __u32 *dst, struct in6_addr *dst6);
 
diff --git a/lib/libnetlink.c b/lib/libnetlink.c
index 00e6ce0c..f9be1c6d 100644
--- a/lib/libnetlink.c
+++ b/lib/libnetlink.c
@@ -24,6 +24,7 @@
 #include <sys/uio.h>
 
 #include "libnetlink.h"
+#include "utils.h"
 
 #ifndef SOL_NETLINK
 #define SOL_NETLINK 270
@@ -581,6 +582,10 @@ static void rtnl_talk_error(struct nlmsghdr *h, struct nlmsgerr *err,
 		strerror(-err->error));
 }
 
+static struct iovec msg_iov[MSG_IOV_MAX];
+int msg_iov_index;
+int batch_size = 1;
+
 static int __rtnl_talk(struct rtnl_handle *rtnl, struct nlmsghdr *n,
 		       struct nlmsghdr **answer,
 		       bool show_rtnl_err, nl_ext_ack_fn_t errfn)
@@ -589,23 +594,34 @@ static int __rtnl_talk(struct rtnl_handle *rtnl, struct nlmsghdr *n,
 	unsigned int seq;
 	struct nlmsghdr *h;
 	struct sockaddr_nl nladdr = { .nl_family = AF_NETLINK };
-	struct iovec iov = {
-		.iov_base = n,
-		.iov_len = n->nlmsg_len
-	};
+	struct iovec *iov = &msg_iov[get_msg_iov_index()];
+	int index;
+	char *buf;
+
+	iov->iov_base = n;
+	iov->iov_len = n->nlmsg_len;
+
+	incr_msg_iov_index();
 	struct msghdr msg = {
 		.msg_name = &nladdr,
 		.msg_namelen = sizeof(nladdr),
-		.msg_iov = &iov,
-		.msg_iovlen = 1,
+		.msg_iov = msg_iov,
+		.msg_iovlen = get_msg_iov_index(),
 	};
-	char *buf;
 
 	n->nlmsg_seq = seq = ++rtnl->seq;
 
 	if (answer == NULL)
 		n->nlmsg_flags |= NLM_F_ACK;
 
+	index = get_msg_iov_index() % get_batch_size();
+	if (index != 0 && get_batch_size() > 1 &&
+	    cmdlineno != getcmdlinetotal() &&
+	    (n->nlmsg_type == RTM_NEWTFILTER ||
+	    n->nlmsg_type == RTM_NEWACTION))
+		return 3;
+	set_msg_iov_index(index);
+
 	status = sendmsg(rtnl->fd, &msg, 0);
 	if (status < 0) {
 		perror("Cannot talk to rtnetlink");
diff --git a/lib/utils.c b/lib/utils.c
index 7ced8c06..53ca389f 100644
--- a/lib/utils.c
+++ b/lib/utils.c
@@ -1202,6 +1202,26 @@ ssize_t getcmdline(char **linep, size_t *lenp, FILE *in)
 	return cc;
 }
 
+int cmdlinetotal;
+
+void setcmdlinetotal(const char *name)
+{
+	char *line = NULL;
+	size_t len = 0;
+
+	if (name && strcmp(name, "-") != 0) {
+		if (freopen(name, "r", stdin) == NULL) {
+			fprintf(stderr, "Cannot open file \"%s\" for reading: %s\n",
+				name, strerror(errno));
+			return;
+		}
+	}
+
+	cmdlinetotal = 0;
+	while (getcmdline(&line, &len, stdin) != -1)
+		cmdlinetotal++;
+}
+
 /* split command line into argument vector */
 int makeargs(char *line, char *argv[], int maxargs)
 {
diff --git a/man/man8/tc.8 b/man/man8/tc.8
index ff071b33..de137e16 100644
--- a/man/man8/tc.8
+++ b/man/man8/tc.8
@@ -601,6 +601,11 @@ must exist already.
 read commands from provided file or standard input and invoke them.
 First failure will cause termination of tc.
 
+.TP
+.BR "\-bs", " \-bs size", " \-batchsize", " \-batchsize size"
+How many commands are accumulated before sending to kernel.
+By default, it is 1. It only takes effect in batch mode.
+
 .TP
 .BR "\-force"
 don't terminate tc on errors in batch mode.
diff --git a/tc/m_action.c b/tc/m_action.c
index 13f942bf..574b78ca 100644
--- a/tc/m_action.c
+++ b/tc/m_action.c
@@ -23,6 +23,7 @@
 #include <arpa/inet.h>
 #include <string.h>
 #include <dlfcn.h>
+#include <errno.h>
 
 #include "utils.h"
 #include "tc_common.h"
@@ -546,33 +547,65 @@ bad_val:
 	return ret;
 }
 
+typedef struct {
+	struct nlmsghdr		n;
+	struct tcamsg		t;
+	char			buf[MAX_MSG];
+} tc_action_req;
+
+static tc_action_req *action_reqs;
+
+void free_action_reqs(void)
+{
+	if (action_reqs)
+		free(action_reqs);
+}
+
+static tc_action_req *get_action_req(void)
+{
+	tc_action_req *req;
+
+	if (action_reqs == NULL) {
+		action_reqs = malloc(get_batch_size() * sizeof (tc_action_req));
+		if (action_reqs == NULL)
+			return NULL;
+	}
+	req = &action_reqs[get_msg_iov_index()];
+	memset(req, 0, sizeof (*req));
+
+	return req;
+}
+
 static int tc_action_modify(int cmd, unsigned int flags,
 			    int *argc_p, char ***argv_p)
 {
 	int argc = *argc_p;
 	char **argv = *argv_p;
 	int ret = 0;
-	struct {
-		struct nlmsghdr         n;
-		struct tcamsg           t;
-		char                    buf[MAX_MSG];
-	} req = {
-		.n.nlmsg_len = NLMSG_LENGTH(sizeof(struct tcamsg)),
-		.n.nlmsg_flags = NLM_F_REQUEST | flags,
-		.n.nlmsg_type = cmd,
-		.t.tca_family = AF_UNSPEC,
-	};
-	struct rtattr *tail = NLMSG_TAIL(&req.n);
+	tc_action_req *req;
+
+	req = get_action_req();
+	if (req == NULL) {
+		fprintf(stderr, "get_action_req error: not enough buffer\n");
+		return -ENOMEM;
+	}
+
+	req->n.nlmsg_len = NLMSG_LENGTH(sizeof(struct tcamsg));
+	req->n.nlmsg_flags = NLM_F_REQUEST | flags;
+	req->n.nlmsg_type = cmd;
+	req->t.tca_family = AF_UNSPEC;
+	struct rtattr *tail = NLMSG_TAIL(&req->n);
 
 	argc -= 1;
 	argv += 1;
-	if (parse_action(&argc, &argv, TCA_ACT_TAB, &req.n)) {
+	if (parse_action(&argc, &argv, TCA_ACT_TAB, &req->n)) {
 		fprintf(stderr, "Illegal \"action\"\n");
 		return -1;
 	}
-	tail->rta_len = (void *) NLMSG_TAIL(&req.n) - (void *) tail;
+	tail->rta_len = (void *) NLMSG_TAIL(&req->n) - (void *) tail;
 
-	if (rtnl_talk(&rth, &req.n, NULL) < 0) {
+	ret = rtnl_talk(&rth, &req->n, NULL);
+	if (ret < 0) {
 		fprintf(stderr, "We have an error talking to the kernel\n");
 		ret = -1;
 	}
@@ -739,5 +772,5 @@ int do_action(int argc, char **argv)
 			return -1;
 	}
 
-	return 0;
+	return ret;
 }
diff --git a/tc/tc.c b/tc/tc.c
index ad9f07e9..9c8e828b 100644
--- a/tc/tc.c
+++ b/tc/tc.c
@@ -189,7 +189,7 @@ static void usage(void)
 	fprintf(stderr, "Usage: tc [ OPTIONS ] OBJECT { COMMAND | help }\n"
 			"       tc [-force] -batch filename\n"
 			"where  OBJECT := { qdisc | class | filter | action | monitor | exec }\n"
-	                "       OPTIONS := { -s[tatistics] | -d[etails] | -r[aw] | -p[retty] | -b[atch] [filename] | -n[etns] name |\n"
+	                "       OPTIONS := { -s[tatistics] | -d[etails] | -r[aw] | -p[retty] | -b[atch] [filename] | -bs | -batchsize [size] | -n[etns] name |\n"
 			"                    -nm | -nam[es] | { -cf | -conf } path } | -j[son]\n");
 }
 
@@ -223,6 +223,9 @@ static int batch(const char *name)
 	size_t len = 0;
 	int ret = 0;
 
+	if (get_batch_size() > 1)
+		setcmdlinetotal(name);
+
 	batch_mode = 1;
 	if (name && strcmp(name, "-") != 0) {
 		if (freopen(name, "r", stdin) == NULL) {
@@ -248,15 +251,22 @@ static int batch(const char *name)
 		if (largc == 0)
 			continue;	/* blank line */
 
-		if (do_cmd(largc, largv)) {
+		ret = do_cmd(largc, largv);
+		if (ret != 0 && ret != 3) {
 			fprintf(stderr, "Command failed %s:%d\n", name, cmdlineno);
 			ret = 1;
 			if (!force)
 				break;
 		}
 	}
+	if (ret == 3)
+		fprintf(stderr,
+			"Command is not sent to kernel due to internal error %s:%d\n",
+			name, cmdlineno);
 	if (line)
 		free(line);
+	free_filter_reqs();
+	free_action_reqs();
 
 	rtnl_close(&rth);
 	return ret;
@@ -267,6 +277,7 @@ int main(int argc, char **argv)
 {
 	int ret;
 	char *batch_file = NULL;
+	int size;
 
 	while (argc > 1) {
 		if (argv[1][0] != '-')
@@ -297,6 +308,16 @@ int main(int argc, char **argv)
 			if (argc <= 1)
 				usage();
 			batch_file = argv[1];
+		} else if (matches(argv[1], "-batchsize") == 0 ||
+				matches(argv[1], "-bs") == 0) {
+			argc--;	argv++;
+			if (argc <= 1)
+				usage();
+			size = atoi(argv[1]);
+			if (size > MSG_IOV_MAX)
+				set_batch_size(MSG_IOV_MAX);
+			else
+				set_batch_size(size);
 		} else if (matches(argv[1], "-netns") == 0) {
 			NEXT_ARG();
 			if (netns_switch(argv[1]))
@@ -342,6 +363,8 @@ int main(int argc, char **argv)
 	}
 
 	ret = do_cmd(argc-1, argv+1);
+	free_filter_reqs();
+	free_action_reqs();
 Exit:
 	rtnl_close(&rth);
 
diff --git a/tc/tc_common.h b/tc/tc_common.h
index 264fbdac..fde8db1b 100644
--- a/tc/tc_common.h
+++ b/tc/tc_common.h
@@ -24,5 +24,8 @@ struct tc_sizespec;
 extern int parse_size_table(int *p_argc, char ***p_argv, struct tc_sizespec *s);
 extern int check_size_table_opts(struct tc_sizespec *s);
 
+extern void free_filter_reqs(void);
+extern void free_action_reqs(void);
+
 extern int show_graph;
 extern bool use_names;
diff --git a/tc/tc_filter.c b/tc/tc_filter.c
index 545cc3a1..dc53c128 100644
--- a/tc/tc_filter.c
+++ b/tc/tc_filter.c
@@ -19,6 +19,7 @@
 #include <arpa/inet.h>
 #include <string.h>
 #include <linux/if_ether.h>
+#include <errno.h>
 
 #include "rt_names.h"
 #include "utils.h"
@@ -42,18 +43,51 @@ static void usage(void)
 		"OPTIONS := ... try tc filter add <desired FILTER_KIND> help\n");
 }
 
+typedef struct {
+	struct nlmsghdr		n;
+	struct tcmsg		t;
+	char			buf[MAX_MSG];
+} tc_filter_req;
+
+static tc_filter_req *filter_reqs;
+
+void free_filter_reqs(void)
+{
+	if (filter_reqs)
+		free(filter_reqs);
+}
+
+static tc_filter_req *get_filter_req(void)
+{
+	tc_filter_req *req;
+
+	if (filter_reqs == NULL) {
+		filter_reqs = malloc(get_batch_size() * sizeof (tc_filter_req));
+		if (filter_reqs == NULL)
+			return NULL;
+	}
+	req = &filter_reqs[get_msg_iov_index()];
+	memset(req, 0, sizeof (*req));
+
+	return req;
+}
+
 static int tc_filter_modify(int cmd, unsigned int flags, int argc, char **argv)
 {
-	struct {
-		struct nlmsghdr	n;
-		struct tcmsg		t;
-		char			buf[MAX_MSG];
-	} req = {
-		.n.nlmsg_len = NLMSG_LENGTH(sizeof(struct tcmsg)),
-		.n.nlmsg_flags = NLM_F_REQUEST | flags,
-		.n.nlmsg_type = cmd,
-		.t.tcm_family = AF_UNSPEC,
-	};
+	tc_filter_req *req;
+	int ret;
+
+	req = get_filter_req();
+	if (req == NULL) {
+		fprintf(stderr, "get_filter_req error: not enough buffer\n");
+		return -ENOMEM;
+	}
+
+	req->n.nlmsg_len = NLMSG_LENGTH(sizeof(struct tcmsg));
+	req->n.nlmsg_flags = NLM_F_REQUEST | flags;
+	req->n.nlmsg_type = cmd;
+	req->t.tcm_family = AF_UNSPEC;
+
 	struct filter_util *q = NULL;
 	__u32 prio = 0;
 	__u32 protocol = 0;
@@ -75,37 +109,37 @@ static int tc_filter_modify(int cmd, unsigned int flags, int argc, char **argv)
 				duparg("dev", *argv);
 			strncpy(d, *argv, sizeof(d)-1);
 		} else if (strcmp(*argv, "root") == 0) {
-			if (req.t.tcm_parent) {
+			if (req->t.tcm_parent) {
 				fprintf(stderr,
 					"Error: \"root\" is duplicate parent ID\n");
 				return -1;
 			}
-			req.t.tcm_parent = TC_H_ROOT;
+			req->t.tcm_parent = TC_H_ROOT;
 		} else if (strcmp(*argv, "ingress") == 0) {
-			if (req.t.tcm_parent) {
+			if (req->t.tcm_parent) {
 				fprintf(stderr,
 					"Error: \"ingress\" is duplicate parent ID\n");
 				return -1;
 			}
-			req.t.tcm_parent = TC_H_MAKE(TC_H_CLSACT,
+			req->t.tcm_parent = TC_H_MAKE(TC_H_CLSACT,
 						     TC_H_MIN_INGRESS);
 		} else if (strcmp(*argv, "egress") == 0) {
-			if (req.t.tcm_parent) {
+			if (req->t.tcm_parent) {
 				fprintf(stderr,
 					"Error: \"egress\" is duplicate parent ID\n");
 				return -1;
 			}
-			req.t.tcm_parent = TC_H_MAKE(TC_H_CLSACT,
+			req->t.tcm_parent = TC_H_MAKE(TC_H_CLSACT,
 						     TC_H_MIN_EGRESS);
 		} else if (strcmp(*argv, "parent") == 0) {
 			__u32 handle;
 
 			NEXT_ARG();
-			if (req.t.tcm_parent)
+			if (req->t.tcm_parent)
 				duparg("parent", *argv);
 			if (get_tc_classid(&handle, *argv))
 				invarg("Invalid parent ID", *argv);
-			req.t.tcm_parent = handle;
+			req->t.tcm_parent = handle;
 		} else if (strcmp(*argv, "handle") == 0) {
 			NEXT_ARG();
 			if (fhandle)
@@ -152,26 +186,26 @@ static int tc_filter_modify(int cmd, unsigned int flags, int argc, char **argv)
 		argc--; argv++;
 	}
 
-	req.t.tcm_info = TC_H_MAKE(prio<<16, protocol);
+	req->t.tcm_info = TC_H_MAKE(prio<<16, protocol);
 
 	if (chain_index_set)
-		addattr32(&req.n, sizeof(req), TCA_CHAIN, chain_index);
+		addattr32(&req->n, sizeof(*req), TCA_CHAIN, chain_index);
 
 	if (k[0])
-		addattr_l(&req.n, sizeof(req), TCA_KIND, k, strlen(k)+1);
+		addattr_l(&req->n, sizeof(*req), TCA_KIND, k, strlen(k)+1);
 
 	if (d[0])  {
 		ll_init_map(&rth);
 
-		req.t.tcm_ifindex = ll_name_to_index(d);
-		if (req.t.tcm_ifindex == 0) {
+		req->t.tcm_ifindex = ll_name_to_index(d);
+		if (req->t.tcm_ifindex == 0) {
 			fprintf(stderr, "Cannot find device \"%s\"\n", d);
 			return 1;
 		}
 	}
 
 	if (q) {
-		if (q->parse_fopt(q, fhandle, argc, argv, &req.n))
+		if (q->parse_fopt(q, fhandle, argc, argv, &req->n))
 			return 1;
 	} else {
 		if (fhandle) {
@@ -190,14 +224,15 @@ static int tc_filter_modify(int cmd, unsigned int flags, int argc, char **argv)
 	}
 
 	if (est.ewma_log)
-		addattr_l(&req.n, sizeof(req), TCA_RATE, &est, sizeof(est));
+		addattr_l(&req->n, sizeof(*req), TCA_RATE, &est, sizeof(est));
 
-	if (rtnl_talk(&rth, &req.n, NULL) < 0) {
+	ret = rtnl_talk(&rth, &req->n, NULL);
+	if (ret < 0) {
 		fprintf(stderr, "We have an error talking to the kernel\n");
 		return 2;
 	}
 
-	return 0;
+	return ret;
 }
 
 static __u32 filter_parent;
-- 
2.14.3

^ permalink raw reply related

* Re: [PATCH v2 2/3] vsprintf: print <no-symbol> if symbol not found
From: Tobin C. Harding @ 2017-12-19  6:33 UTC (permalink / raw)
  To: Joe Perches
  Cc: kernel-hardening, Steven Rostedt, Tycho Andersen, Linus Torvalds,
	Kees Cook, Andrew Morton, Daniel Borkmann, Masahiro Yamada,
	Alexei Starovoitov, linux-kernel, Network Development
In-Reply-To: <1513664307.1234.21.camel@perches.com>

On Mon, Dec 18, 2017 at 10:18:27PM -0800, Joe Perches wrote:
> On Tue, 2017-12-19 at 14:28 +1100, Tobin C. Harding wrote:
> > Depends on: commit 40eee173a35e ("kallsyms: don't leak address when
> > symbol not found")
> > 
> > Currently vsprintf for specifiers %p[SsB] relies on the behaviour of
> > kallsyms (sprint_symbol()) and prints the actual address if a symbol is
> > not found. Previous patch changes this behaviour so that sprint_symbol()
> > returns an error if symbol not found. With this patch in place we can
> > print a sanitized message '<symbol not found>' instead of leaking the
> > address.
> > 
> > Print '<symbol not found>' for printk specifier %p[sSB] if symbol look
> > up fails.
> > 
> > Signed-off-by: Tobin C. Harding <me@tobin.cc>
> > ---
> >  lib/vsprintf.c | 11 ++++++++---
> >  1 file changed, 8 insertions(+), 3 deletions(-)
> > 
> > diff --git a/lib/vsprintf.c b/lib/vsprintf.c
> > index 01c3957b2de6..820ed4fe6e6c 100644
> > --- a/lib/vsprintf.c
> > +++ b/lib/vsprintf.c
> > @@ -674,6 +674,8 @@ char *symbol_string(char *buf, char *end, void *ptr,
> >  	unsigned long value;
> >  #ifdef CONFIG_KALLSYMS
> >  	char sym[KSYM_SYMBOL_LEN];
> > +	const char *sym_not_found = "<symbol not found>";
> 
> This will be reinitialized on every use.
> 
> > +	int ret;
> >  #endif
> >  
> >  	if (fmt[1] == 'R')
> > @@ -682,11 +684,14 @@ char *symbol_string(char *buf, char *end, void *ptr,
> >  
> >  #ifdef CONFIG_KALLSYMS
> >  	if (*fmt == 'B')
> > -		sprint_backtrace(sym, value);
> > +		ret = sprint_backtrace(sym, value);
> >  	else if (*fmt != 'f' && *fmt != 's')
> > -		sprint_symbol(sym, value);
> > +		ret = sprint_symbol(sym, value);
> >  	else
> > -		sprint_symbol_no_offset(sym, value);
> > +		ret = sprint_symbol_no_offset(sym, value);
> > +
> > +	if (ret == -1)
> > +		strcpy(sym, sym_not_found);
> 
> 
> This could avoid the unnecessary strcpy if sym_not_found
> was not used at all and this was used instead
> 
> 	if (ret == -1)
> 		return string(buf, end, "<symbol not found>", spec);
> 
> 	return string(buf, end, sym, spec);
> 
> or maybe
> 
> 	return string(buf, end, ret == -1 ? "<symbol not found>" : sum, spec);

Oh, thanks. This is much cleaner. Will re-spin.

thanks,
Tobin.

^ permalink raw reply

* Re: [PATCH v10 1/5] add infrastructure for tagging functions as error injectable
From: Masami Hiramatsu @ 2017-12-19  6:29 UTC (permalink / raw)
  To: Josef Bacik
  Cc: rostedt, mingo, davem, netdev, linux-kernel, ast, kernel-team,
	daniel, linux-btrfs, darrick.wong, Josef Bacik
In-Reply-To: <1513365176-6744-2-git-send-email-josef@toxicpanda.com>

On Fri, 15 Dec 2017 14:12:52 -0500
Josef Bacik <josef@toxicpanda.com> wrote:

> From: Josef Bacik <jbacik@fb.com>
> 
> Using BPF we can override kprob'ed functions and return arbitrary
> values.  Obviously this can be a bit unsafe, so make this feature opt-in
> for functions.  Simply tag a function with KPROBE_ERROR_INJECT_SYMBOL in
> order to give BPF access to that function for error injection purposes.
> 
> Signed-off-by: Josef Bacik <jbacik@fb.com>
> Acked-by: Ingo Molnar <mingo@kernel.org>
> ---
>  include/asm-generic/vmlinux.lds.h |  10 +++
>  include/linux/bpf.h               |  11 +++
>  include/linux/kprobes.h           |   1 +
>  include/linux/module.h            |   5 ++
>  kernel/kprobes.c                  | 163 ++++++++++++++++++++++++++++++++++++++
>  kernel/module.c                   |   6 +-
>  6 files changed, 195 insertions(+), 1 deletion(-)
> 
> diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h
> index ee8b707d9fa9..a2e8582d094a 100644
> --- a/include/asm-generic/vmlinux.lds.h
> +++ b/include/asm-generic/vmlinux.lds.h
> @@ -136,6 +136,15 @@
>  #define KPROBE_BLACKLIST()
>  #endif
>  
> +#ifdef CONFIG_BPF_KPROBE_OVERRIDE
> +#define ERROR_INJECT_LIST()	. = ALIGN(8);						\
> +				VMLINUX_SYMBOL(__start_kprobe_error_inject_list) = .;	\
> +				KEEP(*(_kprobe_error_inject_list))			\
> +				VMLINUX_SYMBOL(__stop_kprobe_error_inject_list) = .;
> +#else
> +#define ERROR_INJECT_LIST()
> +#endif
> +
>  #ifdef CONFIG_EVENT_TRACING
>  #define FTRACE_EVENTS()	. = ALIGN(8);					\
>  			VMLINUX_SYMBOL(__start_ftrace_events) = .;	\
> @@ -564,6 +573,7 @@
>  	FTRACE_EVENTS()							\
>  	TRACE_SYSCALLS()						\
>  	KPROBE_BLACKLIST()						\
> +	ERROR_INJECT_LIST()						\
>  	MEM_DISCARD(init.rodata)					\
>  	CLK_OF_TABLES()							\
>  	RESERVEDMEM_OF_TABLES()						\
> diff --git a/include/linux/bpf.h b/include/linux/bpf.h
> index e55e4255a210..7f4d2a953173 100644
> --- a/include/linux/bpf.h
> +++ b/include/linux/bpf.h
> @@ -576,4 +576,15 @@ extern const struct bpf_func_proto bpf_sock_map_update_proto;
>  void bpf_user_rnd_init_once(void);
>  u64 bpf_user_rnd_u32(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5);
>  
> +#if defined(__KERNEL__) && !defined(__ASSEMBLY__)
> +#ifdef CONFIG_BPF_KPROBE_OVERRIDE

BTW, CONFIG_BPF_KPROBE_OVERRIDE is also confusable name.
Since this feature override a function to just return with
some return value (as far as I understand, or would you
also plan to modify execution path inside a function?),
I think it should be better CONFIG_BPF_FUNCTION_OVERRIDE or
CONFIG_BPF_EXECUTION_OVERRIDE.

Indeed, BPF is based on kprobes, but it seems you are limiting it
with ftrace (function-call trace) (I'm not sure the reason why),
so using "kprobes" for this feature seems strange for me.

The idea in this patch itself (marking injectable function on
a list) is OK to me. 

Thank you,

> +#define BPF_ALLOW_ERROR_INJECTION(fname)				\
> +static unsigned long __used						\
> +	__attribute__((__section__("_kprobe_error_inject_list")))	\
> +	_eil_addr_##fname = (unsigned long)fname;
> +#else
> +#define BPF_ALLOW_ERROR_INJECTION(fname)
> +#endif
> +#endif
> +
>  #endif /* _LINUX_BPF_H */
> diff --git a/include/linux/kprobes.h b/include/linux/kprobes.h
> index 9440a2fc8893..963fd364f3d6 100644
> --- a/include/linux/kprobes.h
> +++ b/include/linux/kprobes.h
> @@ -271,6 +271,7 @@ extern bool arch_kprobe_on_func_entry(unsigned long offset);
>  extern bool kprobe_on_func_entry(kprobe_opcode_t *addr, const char *sym, unsigned long offset);
>  
>  extern bool within_kprobe_blacklist(unsigned long addr);
> +extern bool within_kprobe_error_injection_list(unsigned long addr);
>  
>  struct kprobe_insn_cache {
>  	struct mutex mutex;
> diff --git a/include/linux/module.h b/include/linux/module.h
> index c69b49abe877..548fa09fa806 100644
> --- a/include/linux/module.h
> +++ b/include/linux/module.h
> @@ -475,6 +475,11 @@ struct module {
>  	ctor_fn_t *ctors;
>  	unsigned int num_ctors;
>  #endif
> +
> +#ifdef CONFIG_BPF_KPROBE_OVERRIDE
> +	unsigned int num_kprobe_ei_funcs;
> +	unsigned long *kprobe_ei_funcs;
> +#endif
>  } ____cacheline_aligned __randomize_layout;
>  #ifndef MODULE_ARCH_INIT
>  #define MODULE_ARCH_INIT {}
> diff --git a/kernel/kprobes.c b/kernel/kprobes.c
> index da2ccf142358..b4aab48ad258 100644
> --- a/kernel/kprobes.c
> +++ b/kernel/kprobes.c
> @@ -83,6 +83,16 @@ static raw_spinlock_t *kretprobe_table_lock_ptr(unsigned long hash)
>  	return &(kretprobe_table_locks[hash].lock);
>  }
>  
> +/* List of symbols that can be overriden for error injection. */
> +static LIST_HEAD(kprobe_error_injection_list);
> +static DEFINE_MUTEX(kprobe_ei_mutex);
> +struct kprobe_ei_entry {
> +	struct list_head list;
> +	unsigned long start_addr;
> +	unsigned long end_addr;
> +	void *priv;
> +};
> +
>  /* Blacklist -- list of struct kprobe_blacklist_entry */
>  static LIST_HEAD(kprobe_blacklist);
>  
> @@ -1394,6 +1404,17 @@ bool within_kprobe_blacklist(unsigned long addr)
>  	return false;
>  }
>  
> +bool within_kprobe_error_injection_list(unsigned long addr)
> +{
> +	struct kprobe_ei_entry *ent;
> +
> +	list_for_each_entry(ent, &kprobe_error_injection_list, list) {
> +		if (addr >= ent->start_addr && addr < ent->end_addr)
> +			return true;
> +	}
> +	return false;
> +}
> +
>  /*
>   * If we have a symbol_name argument, look it up and add the offset field
>   * to it. This way, we can specify a relative address to a symbol.
> @@ -2168,6 +2189,86 @@ static int __init populate_kprobe_blacklist(unsigned long *start,
>  	return 0;
>  }
>  
> +#ifdef CONFIG_BPF_KPROBE_OVERRIDE
> +/* Markers of the _kprobe_error_inject_list section */
> +extern unsigned long __start_kprobe_error_inject_list[];
> +extern unsigned long __stop_kprobe_error_inject_list[];
> +
> +/*
> + * Lookup and populate the kprobe_error_injection_list.
> + *
> + * For safety reasons we only allow certain functions to be overriden with
> + * bpf_error_injection, so we need to populate the list of the symbols that have
> + * been marked as safe for overriding.
> + */
> +static void populate_kprobe_error_injection_list(unsigned long *start,
> +						 unsigned long *end,
> +						 void *priv)
> +{
> +	unsigned long *iter;
> +	struct kprobe_ei_entry *ent;
> +	unsigned long entry, offset = 0, size = 0;
> +
> +	mutex_lock(&kprobe_ei_mutex);
> +	for (iter = start; iter < end; iter++) {
> +		entry = arch_deref_entry_point((void *)*iter);
> +
> +		if (!kernel_text_address(entry) ||
> +		    !kallsyms_lookup_size_offset(entry, &size, &offset)) {
> +			pr_err("Failed to find error inject entry at %p\n",
> +				(void *)entry);
> +			continue;
> +		}
> +
> +		ent = kmalloc(sizeof(*ent), GFP_KERNEL);
> +		if (!ent)
> +			break;
> +		ent->start_addr = entry;
> +		ent->end_addr = entry + size;
> +		ent->priv = priv;
> +		INIT_LIST_HEAD(&ent->list);
> +		list_add_tail(&ent->list, &kprobe_error_injection_list);
> +	}
> +	mutex_unlock(&kprobe_ei_mutex);
> +}
> +
> +static void __init populate_kernel_kprobe_ei_list(void)
> +{
> +	populate_kprobe_error_injection_list(__start_kprobe_error_inject_list,
> +					     __stop_kprobe_error_inject_list,
> +					     NULL);
> +}
> +
> +static void module_load_kprobe_ei_list(struct module *mod)
> +{
> +	if (!mod->num_kprobe_ei_funcs)
> +		return;
> +	populate_kprobe_error_injection_list(mod->kprobe_ei_funcs,
> +					     mod->kprobe_ei_funcs +
> +					     mod->num_kprobe_ei_funcs, mod);
> +}
> +
> +static void module_unload_kprobe_ei_list(struct module *mod)
> +{
> +	struct kprobe_ei_entry *ent, *n;
> +	if (!mod->num_kprobe_ei_funcs)
> +		return;
> +
> +	mutex_lock(&kprobe_ei_mutex);
> +	list_for_each_entry_safe(ent, n, &kprobe_error_injection_list, list) {
> +		if (ent->priv == mod) {
> +			list_del_init(&ent->list);
> +			kfree(ent);
> +		}
> +	}
> +	mutex_unlock(&kprobe_ei_mutex);
> +}
> +#else
> +static inline void __init populate_kernel_kprobe_ei_list(void) {}
> +static inline void module_load_kprobe_ei_list(struct module *m) {}
> +static inline void module_unload_kprobe_ei_list(struct module *m) {}
> +#endif
> +
>  /* Module notifier call back, checking kprobes on the module */
>  static int kprobes_module_callback(struct notifier_block *nb,
>  				   unsigned long val, void *data)
> @@ -2178,6 +2279,11 @@ static int kprobes_module_callback(struct notifier_block *nb,
>  	unsigned int i;
>  	int checkcore = (val == MODULE_STATE_GOING);
>  
> +	if (val == MODULE_STATE_COMING)
> +		module_load_kprobe_ei_list(mod);
> +	else if (val == MODULE_STATE_GOING)
> +		module_unload_kprobe_ei_list(mod);
> +
>  	if (val != MODULE_STATE_GOING && val != MODULE_STATE_LIVE)
>  		return NOTIFY_DONE;
>  
> @@ -2240,6 +2346,8 @@ static int __init init_kprobes(void)
>  		pr_err("Please take care of using kprobes.\n");
>  	}
>  
> +	populate_kernel_kprobe_ei_list();
> +
>  	if (kretprobe_blacklist_size) {
>  		/* lookup the function address from its name */
>  		for (i = 0; kretprobe_blacklist[i].name != NULL; i++) {
> @@ -2407,6 +2515,56 @@ static const struct file_operations debugfs_kprobe_blacklist_ops = {
>  	.release        = seq_release,
>  };
>  
> +/*
> + * kprobes/error_injection_list -- shows which functions can be overriden for
> + * error injection.
> + * */
> +static void *kprobe_ei_seq_start(struct seq_file *m, loff_t *pos)
> +{
> +	mutex_lock(&kprobe_ei_mutex);
> +	return seq_list_start(&kprobe_error_injection_list, *pos);
> +}
> +
> +static void kprobe_ei_seq_stop(struct seq_file *m, void *v)
> +{
> +	mutex_unlock(&kprobe_ei_mutex);
> +}
> +
> +static void *kprobe_ei_seq_next(struct seq_file *m, void *v, loff_t *pos)
> +{
> +	return seq_list_next(v, &kprobe_error_injection_list, pos);
> +}
> +
> +static int kprobe_ei_seq_show(struct seq_file *m, void *v)
> +{
> +	char buffer[KSYM_SYMBOL_LEN];
> +	struct kprobe_ei_entry *ent =
> +		list_entry(v, struct kprobe_ei_entry, list);
> +
> +	sprint_symbol(buffer, ent->start_addr);
> +	seq_printf(m, "%s\n", buffer);
> +	return 0;
> +}
> +
> +static const struct seq_operations kprobe_ei_seq_ops = {
> +	.start = kprobe_ei_seq_start,
> +	.next  = kprobe_ei_seq_next,
> +	.stop  = kprobe_ei_seq_stop,
> +	.show  = kprobe_ei_seq_show,
> +};
> +
> +static int kprobe_ei_open(struct inode *inode, struct file *filp)
> +{
> +	return seq_open(filp, &kprobe_ei_seq_ops);
> +}
> +
> +static const struct file_operations debugfs_kprobe_ei_ops = {
> +	.open           = kprobe_ei_open,
> +	.read           = seq_read,
> +	.llseek         = seq_lseek,
> +	.release        = seq_release,
> +};
> +
>  static void arm_all_kprobes(void)
>  {
>  	struct hlist_head *head;
> @@ -2548,6 +2706,11 @@ static int __init debugfs_kprobe_init(void)
>  	if (!file)
>  		goto error;
>  
> +	file = debugfs_create_file("error_injection_list", 0444, dir, NULL,
> +				  &debugfs_kprobe_ei_ops);
> +	if (!file)
> +		goto error;
> +
>  	return 0;
>  
>  error:
> diff --git a/kernel/module.c b/kernel/module.c
> index dea01ac9cb74..bd695bfdc5c4 100644
> --- a/kernel/module.c
> +++ b/kernel/module.c
> @@ -3118,7 +3118,11 @@ static int find_module_sections(struct module *mod, struct load_info *info)
>  					     sizeof(*mod->ftrace_callsites),
>  					     &mod->num_ftrace_callsites);
>  #endif
> -
> +#ifdef CONFIG_BPF_KPROBE_OVERRIDE
> +	mod->kprobe_ei_funcs = section_objs(info, "_kprobe_error_inject_list",
> +					    sizeof(*mod->kprobe_ei_funcs),
> +					    &mod->num_kprobe_ei_funcs);
> +#endif
>  	mod->extable = section_objs(info, "__ex_table",
>  				    sizeof(*mod->extable), &mod->num_exentries);
>  
> -- 
> 2.7.5
> 


-- 
Masami Hiramatsu <mhiramat@kernel.org>

^ permalink raw reply

* Re: pull-request: bpf-next 2017-12-18
From: Alexei Starovoitov @ 2017-12-19  6:28 UTC (permalink / raw)
  To: David Miller; +Cc: daniel, ast, netdev
In-Reply-To: <20171218.105153.395572657387757515.davem@davemloft.net>

On Mon, Dec 18, 2017 at 10:51:53AM -0500, David Miller wrote:
> From: Daniel Borkmann <daniel@iogearbox.net>
> Date: Mon, 18 Dec 2017 01:33:07 +0100
> 
> > The following pull-request contains BPF updates for your *net-next* tree.
> > 
> > The main changes are:
> > 
> > 1) Allow arbitrary function calls from one BPF function to another BPF function.
> >    As of today when writing BPF programs, __always_inline had to be used in
> >    the BPF C programs for all functions, unnecessarily causing LLVM to inflate
> >    code size. Handle this more naturally with support for BPF to BPF calls
> >    such that this __always_inline restriction can be overcome. As a result,
> >    it allows for better optimized code and finally enables to introduce core
> >    BPF libraries in the future that can be reused out of different projects.
> >    x86 and arm64 JIT support was added as well, from Alexei.
> 
> Exciting... but now there's a lot of JIT work to do.

I've looked at sparc64. It should be simpler than arm64.
First reaction was that it would need dumb version of
emit_loadimm64() (similar to arm's emit_addr_mov_i64), but not,
since it's not used in emit_call.
I can take a stab at it, but cannot test. The most time
consuming part is to setup the latest llvm on the system
to compile *_noinline.c tests.
Note to self, I really need to make test_verifier run the tests.

^ permalink raw reply

* [PATCH bpf 07/11] bpf: Add support for reading sk_state and more
From: Lawrence Brakmo @ 2017-12-19  6:21 UTC (permalink / raw)
  To: netdev; +Cc: Kernel Team, Blake Matheny, Alexei Starovoitov, Daniel Borkmann
In-Reply-To: <20171219062200.372711-1-brakmo@fb.com>

Add support for reading many more tcp_sock fields

  state,	same as sk->sk_state
  rtt_min	same as sk->rtt_min.s[0].v (current rtt_min)
  snd_ssthresh
  rcv_nxt
  snd_nxt
  snd_una
  mss_cache
  ecn_flags
  rate_delivered
  rate_interval_us
  packets_out
  retrans_out
  total_retrans
  segs_in
  data_segs_in
  segs_out
  data_segs_out
  bytes_received (__u64)
  bytes_acked    (__u64)

Signed-off-by: Lawrence Brakmo <brakmo@fb.com>
---
 include/uapi/linux/bpf.h | 19 ++++++++++
 net/core/filter.c        | 96 +++++++++++++++++++++++++++++++++++++++++++++++-
 2 files changed, 114 insertions(+), 1 deletion(-)

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 1c36795..19a0b1b 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -954,6 +954,25 @@ struct bpf_sock_ops {
 	__u32 snd_cwnd;
 	__u32 srtt_us;		/* Averaged RTT << 3 in usecs */
 	__u32 bpf_sock_ops_flags; /* flags defined in uapi/linux/tcp.h */
+	__u32 state;
+	__u32 rtt_min;
+	__u32 snd_ssthresh;
+	__u32 rcv_nxt;
+	__u32 snd_nxt;
+	__u32 snd_una;
+	__u32 mss_cache;
+	__u32 ecn_flags;
+	__u32 rate_delivered;
+	__u32 rate_interval_us;
+	__u32 packets_out;
+	__u32 retrans_out;
+	__u32 total_retrans;
+	__u32 segs_in;
+	__u32 data_segs_in;
+	__u32 segs_out;
+	__u32 data_segs_out;
+	__u64 bytes_received;
+	__u64 bytes_acked;
 };
 
 /* List of known BPF sock_ops operators.
diff --git a/net/core/filter.c b/net/core/filter.c
index 2692514..2628077 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -3828,7 +3828,7 @@ static bool __is_valid_sock_ops_access(int off, int size)
 	/* The verifier guarantees that size > 0. */
 	if (off % size != 0)
 		return false;
-	if (size != sizeof(__u32))
+	if (size != sizeof(__u32) && size != sizeof(__u64))
 		return false;
 
 	return true;
@@ -4448,6 +4448,32 @@ static u32 sock_ops_convert_ctx_access(enum bpf_access_type type,
 					       is_fullsock));
 		break;
 
+	case offsetof(struct bpf_sock_ops, state):
+		BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common, skc_state) != 1);
+
+		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(
+						struct bpf_sock_ops_kern, sk),
+				      si->dst_reg, si->src_reg,
+				      offsetof(struct bpf_sock_ops_kern, sk));
+		*insn++ = BPF_LDX_MEM(BPF_B, si->dst_reg, si->dst_reg,
+				      offsetof(struct sock_common, skc_state));
+		break;
+
+	case offsetof(struct bpf_sock_ops, rtt_min):
+		BUILD_BUG_ON(FIELD_SIZEOF(struct tcp_sock, rtt_min) !=
+			     sizeof(struct minmax));
+		BUILD_BUG_ON(sizeof(struct minmax) <
+			     sizeof(struct minmax_sample));
+
+		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(
+						struct bpf_sock_ops_kern, sk),
+				      si->dst_reg, si->src_reg,
+				      offsetof(struct bpf_sock_ops_kern, sk));
+		*insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg,
+				      offsetof(struct tcp_sock, rtt_min) +
+				      FIELD_SIZEOF(struct minmax_sample, t));
+		break;
+
 /* Helper macro for adding read access to tcp_sock or sock fields. */
 #define SOCK_OPS_GET_FIELD(FIELD_NAME, OBJ)				      \
 	do {								      \
@@ -4528,6 +4554,74 @@ static u32 sock_ops_convert_ctx_access(enum bpf_access_type type,
 		SOCK_OPS_GET_OR_SET_FIELD(bpf_sock_ops_flags, struct tcp_sock,
 					  type);
 		break;
+
+	case offsetof(struct bpf_sock_ops, snd_ssthresh):
+		SOCK_OPS_GET_FIELD(snd_ssthresh, struct tcp_sock);
+		break;
+
+	case offsetof(struct bpf_sock_ops, rcv_nxt):
+		SOCK_OPS_GET_FIELD(rcv_nxt, struct tcp_sock);
+		break;
+
+	case offsetof(struct bpf_sock_ops, snd_nxt):
+		SOCK_OPS_GET_FIELD(snd_nxt, struct tcp_sock);
+		break;
+
+	case offsetof(struct bpf_sock_ops, snd_una):
+		SOCK_OPS_GET_FIELD(snd_una, struct tcp_sock);
+		break;
+
+	case offsetof(struct bpf_sock_ops, mss_cache):
+		SOCK_OPS_GET_FIELD(mss_cache, struct tcp_sock);
+		break;
+
+	case offsetof(struct bpf_sock_ops, ecn_flags):
+		SOCK_OPS_GET_FIELD(ecn_flags, struct tcp_sock);
+		break;
+
+	case offsetof(struct bpf_sock_ops, rate_delivered):
+		SOCK_OPS_GET_FIELD(rate_delivered, struct tcp_sock);
+		break;
+
+	case offsetof(struct bpf_sock_ops, rate_interval_us):
+		SOCK_OPS_GET_FIELD(rate_interval_us, struct tcp_sock);
+		break;
+
+	case offsetof(struct bpf_sock_ops, packets_out):
+		SOCK_OPS_GET_FIELD(packets_out, struct tcp_sock);
+		break;
+
+	case offsetof(struct bpf_sock_ops, retrans_out):
+		SOCK_OPS_GET_FIELD(retrans_out, struct tcp_sock);
+		break;
+
+	case offsetof(struct bpf_sock_ops, total_retrans):
+		SOCK_OPS_GET_FIELD(total_retrans, struct tcp_sock);
+		break;
+
+	case offsetof(struct bpf_sock_ops, segs_in):
+		SOCK_OPS_GET_FIELD(segs_in, struct tcp_sock);
+		break;
+
+	case offsetof(struct bpf_sock_ops, data_segs_in):
+		SOCK_OPS_GET_FIELD(data_segs_in, struct tcp_sock);
+		break;
+
+	case offsetof(struct bpf_sock_ops, segs_out):
+		SOCK_OPS_GET_FIELD(data_segs_out, struct tcp_sock);
+		break;
+
+	case offsetof(struct bpf_sock_ops, data_segs_out):
+		SOCK_OPS_GET_FIELD(data_segs_out, struct tcp_sock);
+		break;
+
+	case offsetof(struct bpf_sock_ops, bytes_received):
+		SOCK_OPS_GET_FIELD(bytes_received, struct tcp_sock);
+		break;
+
+	case offsetof(struct bpf_sock_ops, bytes_acked):
+		SOCK_OPS_GET_FIELD(bytes_acked, struct tcp_sock);
+		break;
 	}
 	return insn - insn_buf;
 }
-- 
2.9.5

^ permalink raw reply related

* [PATCH bpf 0/11] bpf: more sock_ops callbacks
From: Lawrence Brakmo @ 2017-12-19  6:21 UTC (permalink / raw)
  To: netdev; +Cc: Kernel Team, Blake Matheny, Alexei Starovoitov, Daniel Borkmann

This patchset adds support for:

- direct R or R/W access to many tcp_sock fields
- passing up to 4 arguments to sock_ops BPF functions
- tcp_sock field bpf_sock_ops_flags for controlling callbacks
- optionally calling sock_ops BPF program when RTO fires
- optionally calling sock_ops BPF program when packet is retransmitted
- optionally calling sock_ops BPF program when TCP state changes
- access to tclass and sk_txhash
- new selftest

Signed-off-by: Lawrence Brakmo <brakmo@fb.com>

Consists of the following patches:
[PATCH bpf 01/11] bpf: Make SOCK_OPS_GET_TCP size independent
[PATCH bpf 02/11] bpf: Make SOCK_OPS_GET_TCP struct independent
[PATCH bpf 03/11] bpf: Add write access to tcp_sock and sock fields
[PATCH bpf 04/11] bpf: Support passing args to sock_ops bpf function
[PATCH bpf 05/11] bpf: Adds field bpf_sock_ops_flags to tcp_sock
[PATCH bpf 06/11] bpf: Add sock_ops RTO callback
[PATCH bpf 07/11] bpf: Add support for reading sk_state and more
[PATCH bpf 08/11] bpf: Add sock_ops R/W access to tclass & sk_txhash
[PATCH bpf 09/11] bpf: Add BPF_SOCK_OPS_RETRANS_CB
[PATCH bpf 10/11] bpf: Add BPF_SOCK_OPS_STATE_CB
[PATCH bpf 11/11] bpf: add selftest for tcpbpf

include/linux/filter.h                         |   4 +
include/linux/tcp.h                            |   8 ++
include/net/tcp.h                              |  66 +++++++++-
include/uapi/linux/bpf.h                       |  39 +++++-
include/uapi/linux/tcp.h                       |   5 +
net/core/filter.c                              | 212 ++++++++++++++++++++++++++++++--
net/ipv4/tcp.c                                 |   4 +-
net/ipv4/tcp_nv.c                              |   2 +-
net/ipv4/tcp_output.c                          |   5 +-
net/ipv4/tcp_timer.c                           |   9 ++
tools/include/uapi/linux/bpf.h                 |  45 ++++++-
tools/testing/selftests/bpf/Makefile           |   5 +-
tools/testing/selftests/bpf/tcp_client.py      |  57 +++++++++
tools/testing/selftests/bpf/tcp_server.py      |  83 +++++++++++++
tools/testing/selftests/bpf/test_tcpbpf_kern.c | 133 ++++++++++++++++++++
tools/testing/selftests/bpf/test_tcpbpf_user.c | 119 ++++++++++++++++++
16 files changed, 772 insertions(+), 24 deletions(-)

^ permalink raw reply

* [PATCH bpf 05/11] bpf: Adds field bpf_sock_ops_flags to tcp_sock
From: Lawrence Brakmo @ 2017-12-19  6:21 UTC (permalink / raw)
  To: netdev; +Cc: Kernel Team, Blake Matheny, Alexei Starovoitov, Daniel Borkmann
In-Reply-To: <20171219062200.372711-1-brakmo@fb.com>

Adds field bpf_sock_ops_flags to tcp_sock and bpf_sock_ops. Its primary
use is to determine if there should be calls to sock_ops bpf program at
various points in the TCP code. The field is initialized to zero,
disabling the calls. A sock_ops BPF program can set, per connection and
as necessary, when the connection is established.

It also adds support for reading and writting the field within a
sock_ops BPF program.

Examples of where to call the bpf program:

1) When RTO fires
2) When a packet is retransmitted
3) When the connection terminates
4) When a packet is sent
5) When a packet is received

Signed-off-by: Lawrence Brakmo <brakmo@fb.com>
---
 include/linux/tcp.h      | 8 ++++++++
 include/uapi/linux/bpf.h | 1 +
 net/core/filter.c        | 6 ++++++
 3 files changed, 15 insertions(+)

diff --git a/include/linux/tcp.h b/include/linux/tcp.h
index df5d97a..c46553f 100644
--- a/include/linux/tcp.h
+++ b/include/linux/tcp.h
@@ -372,6 +372,14 @@ struct tcp_sock {
 	 */
 	struct request_sock *fastopen_rsk;
 	u32	*saved_syn;
+
+/* Sock_ops bpf program related variables */
+#ifdef CONFIG_BPF
+	u32     bpf_sock_ops_flags;     /* values defined in uapi/linux/tcp.h */
+#define BPF_SOCK_OPS_TEST_FLAG(TP, ARG) (TP->bpf_sock_ops_flags & ARG)
+#else
+#define BPF_SOCK_OPS_TEST_FLAG(TP, ARG) 0
+#endif
 };
 
 enum tsq_enum {
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index addd849..dfbf43a 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -953,6 +953,7 @@ struct bpf_sock_ops {
 				 */
 	__u32 snd_cwnd;
 	__u32 srtt_us;		/* Averaged RTT << 3 in usecs */
+	__u32 bpf_sock_ops_flags; /* flags defined in uapi/linux/tcp.h */
 };
 
 /* List of known BPF sock_ops operators.
diff --git a/net/core/filter.c b/net/core/filter.c
index 97e65df..2692514 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -3842,6 +3842,7 @@ static bool sock_ops_is_valid_access(int off, int size,
 		switch (off) {
 		case offsetof(struct bpf_sock_ops, op) ...
 		     offsetof(struct bpf_sock_ops, replylong[3]):
+		case offsetof(struct bpf_sock_ops, bpf_sock_ops_flags):
 			break;
 		default:
 			return false;
@@ -4522,6 +4523,11 @@ static u32 sock_ops_convert_ctx_access(enum bpf_access_type type,
 	case offsetof(struct bpf_sock_ops, srtt_us):
 		SOCK_OPS_GET_FIELD(srtt_us, struct tcp_sock);
 		break;
+
+	case offsetof(struct bpf_sock_ops, bpf_sock_ops_flags):
+		SOCK_OPS_GET_OR_SET_FIELD(bpf_sock_ops_flags, struct tcp_sock,
+					  type);
+		break;
 	}
 	return insn - insn_buf;
 }
-- 
2.9.5

^ permalink raw reply related

* [PATCH bpf 08/11] bpf: Add sock_ops R/W access to tclass & sk_txhash
From: Lawrence Brakmo @ 2017-12-19  6:21 UTC (permalink / raw)
  To: netdev; +Cc: Kernel Team, Blake Matheny, Alexei Starovoitov, Daniel Borkmann
In-Reply-To: <20171219062200.372711-1-brakmo@fb.com>

Adds direct R/W access to sk_txhash and access to tclass for ipv6 flows
through getsockopt and setsockopt. Sample usage for tclass:

  bpf_getsockopt(skops, SOL_IPV6, IPV6_TCLASS, &v, sizeof(v))

where skops is a pointer to the ctx (struct bpf_sock_ops).

Signed-off-by: Lawrence Brakmo <brakmo@fb.com>
---
 include/uapi/linux/bpf.h |  1 +
 net/core/filter.c        | 47 ++++++++++++++++++++++++++++++++++++++++++++++-
 2 files changed, 47 insertions(+), 1 deletion(-)

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 19a0b1b..fe2b692 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -973,6 +973,7 @@ struct bpf_sock_ops {
 	__u32 data_segs_out;
 	__u64 bytes_received;
 	__u64 bytes_acked;
+	__u32 sk_txhash;
 };
 
 /* List of known BPF sock_ops operators.
diff --git a/net/core/filter.c b/net/core/filter.c
index 2628077..5cb2b70 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -3229,6 +3229,29 @@ BPF_CALL_5(bpf_setsockopt, struct bpf_sock_ops_kern *, bpf_sock,
 			ret = -EINVAL;
 		}
 #ifdef CONFIG_INET
+#if IS_ENABLED(CONFIG_IPV6)
+	} else if (level == SOL_IPV6) {
+		if (optlen != sizeof(int) || sk->sk_family != AF_INET6)
+			return -EINVAL;
+
+		val = *((int *)optval);
+		/* Only some options are supported */
+		switch (optname) {
+		case IPV6_TCLASS:
+			if (val < -1 || val > 0xff) {
+				ret = -EINVAL;
+			} else {
+				struct ipv6_pinfo *np = inet6_sk(sk);
+
+				if (val == -1)
+					val = 0;
+				np->tclass = val;
+			}
+			break;
+		default:
+			ret = -EINVAL;
+		}
+#endif
 	} else if (level == SOL_TCP &&
 		   sk->sk_prot->setsockopt == tcp_setsockopt) {
 		if (optname == TCP_CONGESTION) {
@@ -3238,7 +3261,8 @@ BPF_CALL_5(bpf_setsockopt, struct bpf_sock_ops_kern *, bpf_sock,
 			strncpy(name, optval, min_t(long, optlen,
 						    TCP_CA_NAME_MAX-1));
 			name[TCP_CA_NAME_MAX-1] = 0;
-			ret = tcp_set_congestion_control(sk, name, false, reinit);
+			ret = tcp_set_congestion_control(sk, name, false,
+							 reinit);
 		} else {
 			struct tcp_sock *tp = tcp_sk(sk);
 
@@ -3304,6 +3328,22 @@ BPF_CALL_5(bpf_getsockopt, struct bpf_sock_ops_kern *, bpf_sock,
 		} else {
 			goto err_clear;
 		}
+#if IS_ENABLED(CONFIG_IPV6)
+	} else if (level == SOL_IPV6) {
+		struct ipv6_pinfo *np = inet6_sk(sk);
+
+		if (optlen != sizeof(int) || sk->sk_family != AF_INET6)
+			goto err_clear;
+
+		/* Only some options are supported */
+		switch (optname) {
+		case IPV6_TCLASS:
+			*((int *)optval) = (int)np->tclass;
+			break;
+		default:
+			goto err_clear;
+		}
+#endif
 	} else {
 		goto err_clear;
 	}
@@ -3843,6 +3883,7 @@ static bool sock_ops_is_valid_access(int off, int size,
 		case offsetof(struct bpf_sock_ops, op) ...
 		     offsetof(struct bpf_sock_ops, replylong[3]):
 		case offsetof(struct bpf_sock_ops, bpf_sock_ops_flags):
+		case offsetof(struct bpf_sock_ops, sk_txhash):
 			break;
 		default:
 			return false;
@@ -4622,6 +4663,10 @@ static u32 sock_ops_convert_ctx_access(enum bpf_access_type type,
 	case offsetof(struct bpf_sock_ops, bytes_acked):
 		SOCK_OPS_GET_FIELD(bytes_acked, struct tcp_sock);
 		break;
+
+	case offsetof(struct bpf_sock_ops, sk_txhash):
+		SOCK_OPS_GET_OR_SET_FIELD(sk_txhash, struct sock, type);
+		break;
 	}
 	return insn - insn_buf;
 }
-- 
2.9.5

^ permalink raw reply related

* [PATCH bpf 11/11] bpf: add selftest for tcpbpf
From: Lawrence Brakmo @ 2017-12-19  6:22 UTC (permalink / raw)
  To: netdev; +Cc: Kernel Team, Blake Matheny, Alexei Starovoitov, Daniel Borkmann
In-Reply-To: <20171219062200.372711-1-brakmo@fb.com>

Added a selftest for tcpbpf (sock_ops) that checks that the appropriate
callbacks occured and that it can access tcp_sock fields and that their
values are correct.

Signed-off-by: Lawrence Brakmo <brakmo@fb.com>
---
 tools/include/uapi/linux/bpf.h                 |  45 ++++++++-
 tools/testing/selftests/bpf/Makefile           |   5 +-
 tools/testing/selftests/bpf/tcp_client.py      |  57 +++++++++++
 tools/testing/selftests/bpf/tcp_server.py      |  83 +++++++++++++++
 tools/testing/selftests/bpf/test_tcpbpf_kern.c | 133 +++++++++++++++++++++++++
 tools/testing/selftests/bpf/test_tcpbpf_user.c | 119 ++++++++++++++++++++++
 6 files changed, 438 insertions(+), 4 deletions(-)
 create mode 100755 tools/testing/selftests/bpf/tcp_client.py
 create mode 100755 tools/testing/selftests/bpf/tcp_server.py
 create mode 100644 tools/testing/selftests/bpf/test_tcpbpf_kern.c
 create mode 100644 tools/testing/selftests/bpf/test_tcpbpf_user.c

diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index cf446c2..b018d6f 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -936,8 +936,9 @@ struct bpf_map_info {
 struct bpf_sock_ops {
 	__u32 op;
 	union {
-		__u32 reply;
-		__u32 replylong[4];
+		__u32 args[4];		/* Optionally passed to bpf program */
+		__u32 reply;		/* Returned by bpf program	    */
+		__u32 replylong[4];	/* Optionally returned by bpf prog  */
 	};
 	__u32 family;
 	__u32 remote_ip4;	/* Stored in network byte order */
@@ -946,6 +947,33 @@ struct bpf_sock_ops {
 	__u32 local_ip6[4];	/* Stored in network byte order */
 	__u32 remote_port;	/* Stored in network byte order */
 	__u32 local_port;	/* stored in host byte order */
+	__u32 is_fullsock;	/* Some TCP fields are only valid if
+				 * there is a full socket. If not, the
+				 * fields read as zero.
+				 */
+	__u32 snd_cwnd;
+	__u32 srtt_us;		/* Averaged RTT << 3 in usecs */
+	__u32 bpf_sock_ops_flags; /* flags defined in uapi/linux/tcp.h */
+	__u32 state;
+	__u32 rtt_min;
+	__u32 snd_ssthresh;
+	__u32 rcv_nxt;
+	__u32 snd_nxt;
+	__u32 snd_una;
+	__u32 mss_cache;
+	__u32 ecn_flags;
+	__u32 rate_delivered;
+	__u32 rate_interval_us;
+	__u32 packets_out;
+	__u32 retrans_out;
+	__u32 total_retrans;
+	__u32 segs_in;
+	__u32 data_segs_in;
+	__u32 segs_out;
+	__u32 data_segs_out;
+	__u64 bytes_received;
+	__u64 bytes_acked;
+	__u32 sk_txhash;
 };
 
 /* List of known BPF sock_ops operators.
@@ -981,6 +1009,19 @@ enum {
 					 * a congestion threshold. RTTs above
 					 * this indicate congestion
 					 */
+	BPF_SOCK_OPS_RTO_CB,		/* Called when an RTO has triggered.
+					 * Arg1: value of icsk_retransmits
+					 * Arg2: value of icsk_rto
+					 * Arg3: whether RTO has expired
+					 */
+	BPF_SOCK_OPS_RETRANS_CB,	/* Called when skb is retransmitted.
+					 * Arg1: sequence number of 1st byte
+					 * Arg2: # segments
+					 */
+	BPF_SOCK_OPS_STATE_CB,		/* Called when TCP changes state.
+					 * Arg1: old_state
+					 * Arg2: new_state
+					 */
 };
 
 #define TCP_BPF_IW		1001	/* Set TCP initial congestion window */
diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile
index 255fb1f..f3632b2 100644
--- a/tools/testing/selftests/bpf/Makefile
+++ b/tools/testing/selftests/bpf/Makefile
@@ -13,11 +13,12 @@ CFLAGS += -Wall -O2 -I$(APIDIR) -I$(LIBDIR) -I$(GENDIR) $(GENFLAGS) -I../../../i
 LDLIBS += -lcap -lelf
 
 TEST_GEN_PROGS = test_verifier test_tag test_maps test_lru_map test_lpm_map test_progs \
-	test_align test_verifier_log test_dev_cgroup
+	test_align test_verifier_log test_dev_cgroup test_tcpbpf_user
 
 TEST_GEN_FILES = test_pkt_access.o test_xdp.o test_l4lb.o test_tcp_estats.o test_obj_id.o \
 	test_pkt_md_access.o test_xdp_redirect.o test_xdp_meta.o sockmap_parse_prog.o     \
-	sockmap_verdict_prog.o dev_cgroup.o sample_ret0.o test_tracepoint.o
+	sockmap_verdict_prog.o dev_cgroup.o sample_ret0.o test_tracepoint.o \
+	test_tcpbpf_kern.o
 
 TEST_PROGS := test_kmod.sh test_xdp_redirect.sh test_xdp_meta.sh \
 	test_offload.py
diff --git a/tools/testing/selftests/bpf/tcp_client.py b/tools/testing/selftests/bpf/tcp_client.py
new file mode 100755
index 0000000..de5b0e6
--- /dev/null
+++ b/tools/testing/selftests/bpf/tcp_client.py
@@ -0,0 +1,57 @@
+#!/usr/local/bin/python
+#
+# Copyright (c) 2017 Facebook
+#
+# This program is free software; you can redistribute it and/or
+# modify it under the terms of version 2 of the GNU General Public
+# License as published by the Free Software Foundation.
+#
+
+import sys, os, os.path, getopt
+import socket, time
+import subprocess
+import select
+
+def read(sock, n):
+    buf = ''
+    while len(buf) < n:
+        rem = n - len(buf)
+        try: s = sock.recv(rem)
+        except (socket.error), e: return ''
+        buf += s
+    return buf
+
+def send(sock, s):
+    total = len(s)
+    count = 0
+    while count < total:
+        try: n = sock.send(s)
+        except (socket.error), e: n = 0
+        if n == 0:
+            return count;
+        count += n
+    return count
+
+
+serverPort = int(sys.argv[1])
+HostName = socket.gethostname()
+
+time.sleep(1)
+
+# create active socket
+sock = socket.socket(socket.AF_INET6, socket.SOCK_STREAM)
+try:
+    sock.connect((HostName, serverPort))
+except socket.error as e:
+    sys.exit(1)
+
+buf = ''
+n = 0
+while n < 1000:
+    buf += '+'
+    n += 1
+
+n = send(sock, buf)
+n = read(sock, 500)
+sys.exit(0)
+
diff --git a/tools/testing/selftests/bpf/tcp_server.py b/tools/testing/selftests/bpf/tcp_server.py
new file mode 100755
index 0000000..b9391f3
--- /dev/null
+++ b/tools/testing/selftests/bpf/tcp_server.py
@@ -0,0 +1,83 @@
+#!/usr/local/bin/python
+#
+# Copyright (c) 2017 Facebook
+#
+# This program is free software; you can redistribute it and/or
+# modify it under the terms of version 2 of the GNU General Public
+# License as published by the Free Software Foundation.
+#
+
+import sys, os, os.path, getopt
+import socket, time
+import subprocess
+import select
+
+def read(sock, n):
+    buf = ''
+    while len(buf) < n:
+        rem = n - len(buf)
+        try: s = sock.recv(rem)
+        except (socket.error), e: return ''
+        buf += s
+    return buf
+
+def send(sock, s):
+    total = len(s)
+    count = 0
+    while count < total:
+        try: n = sock.send(s)
+        except (socket.error), e: n = 0
+        if n == 0:
+            return count;
+        count += n
+    return count
+
+
+SERVER_PORT = 12877
+MAX_PORTS = 2
+
+serverPort = SERVER_PORT
+serverSocket = None
+
+HostName = socket.gethostname()
+
+# create passive socket
+serverSocket = socket.socket(socket.AF_INET6, socket.SOCK_STREAM)
+host = socket.gethostname()
+
+while serverPort < SERVER_PORT + 5:
+	try: serverSocket.bind((host, serverPort))
+	except socket.error as msg:
+            serverPort += 1
+            continue
+	break
+
+cmdStr = ("./tcp_client.py %d &") % (serverPort)
+os.system(cmdStr)
+
+buf = ''
+n = 0
+while n < 500:
+    buf += '.'
+    n += 1
+
+serverSocket.listen(MAX_PORTS)
+readList = [serverSocket]
+
+while True:
+    readyRead, readyWrite, inError = \
+        select.select(readList, [], [], 10)
+
+    if len(readyRead) > 0:
+        waitCount = 0
+        for sock in readyRead:
+            if sock == serverSocket:
+                (clientSocket, address) = serverSocket.accept()
+                address = str(address[0])
+                readList.append(clientSocket)
+            else:
+                s = read(sock, 1000)
+                n = send(sock, buf)
+                sock.close()
+                time.sleep(1)
+                sys.exit(0)
diff --git a/tools/testing/selftests/bpf/test_tcpbpf_kern.c b/tools/testing/selftests/bpf/test_tcpbpf_kern.c
new file mode 100644
index 0000000..6b69105
--- /dev/null
+++ b/tools/testing/selftests/bpf/test_tcpbpf_kern.c
@@ -0,0 +1,133 @@
+/* Copyright (c) 2017 Facebook
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ */
+#include <stddef.h>
+#include <string.h>
+#include <linux/bpf.h>
+#include <linux/if_ether.h>
+#include <linux/if_packet.h>
+#include <linux/ip.h>
+#include <linux/in6.h>
+#include <linux/types.h>
+#include <linux/socket.h>
+#include <netinet/in.h>
+#include "bpf_helpers.h"
+#include "bpf_endian.h"
+
+int _version SEC("version") = 1;
+
+struct globals {
+	__u32 event_map;
+	__u32 total_retrans;
+	__u32 data_segs_in;
+	__u32 data_segs_out;
+	__u64 bytes_received;
+	__u64 bytes_acked;
+};
+
+struct bpf_map_def SEC("maps") global_map = {
+	.type = BPF_MAP_TYPE_ARRAY,
+	.key_size = sizeof(__u32),
+	.value_size = sizeof(struct globals),
+	.max_entries = 2,
+};
+
+
+static inline void update_event_map(int event)
+{
+	__u32 key = 0;
+	struct globals g, *gp;
+
+	gp = bpf_map_lookup_elem(&global_map, &key);
+	if (gp == NULL) {
+		struct globals g = {0, 0, 0, 0, 0, 0};
+
+		g.event_map |= (1 << event);
+		bpf_map_update_elem(&global_map, &key, &g,
+			    BPF_ANY);
+	} else {
+		g = *gp;
+		g.event_map |= (1 << event);
+		bpf_map_update_elem(&global_map, &key, &g,
+			    BPF_ANY);
+	}
+}
+
+SEC("sockops")
+int bpf_testcb(struct bpf_sock_ops *skops)
+{
+	int rv = -1;
+	int op;
+	int init_seq = 0;
+	int ret = 0;
+	int v = 0;
+
+	/* For testing purposes, only execute rest of BPF program
+	 * if remote port number is in the range 12877..12887
+	 * I.e. the active side of the connection
+	 */
+	if ((bpf_ntohl(skops->remote_port) < 12877 ||
+	     bpf_ntohl(skops->remote_port) >= 12887)) {
+		skops->reply = -1;
+		return 1;
+	}
+
+	op = (int) skops->op;
+
+	/* Check that both hosts are within same datacenter. For this example
+	 * it is the case when the first 5.5 bytes of their IPv6 addresses are
+	 * the same.
+	 */
+	if (1) {
+		update_event_map(op);
+
+		switch (op) {
+		case BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB:
+			skops->bpf_sock_ops_flags = 0xfff;
+			init_seq = skops->snd_nxt;
+			break;
+		case BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB:
+			init_seq = skops->snd_nxt;
+			skops->bpf_sock_ops_flags = 0xfff;
+			skops->sk_txhash = 0x12345f;
+			v = 0xff;
+			ret = bpf_setsockopt(skops, SOL_IPV6, IPV6_TCLASS, &v,
+					     sizeof(v));
+			break;
+		case BPF_SOCK_OPS_RTO_CB:
+			break;
+		case BPF_SOCK_OPS_RETRANS_CB:
+			break;
+		case BPF_SOCK_OPS_STATE_CB:
+			if (skops->args[1] == 7) {
+				__u32 key = 0;
+				struct globals g, *gp;
+
+				gp = bpf_map_lookup_elem(&global_map, &key);
+				if (gp == NULL) {
+				} else {
+					g = *gp;
+					g.total_retrans = skops->total_retrans;
+					g.data_segs_in = skops->data_segs_in;
+					g.data_segs_out = skops->data_segs_out;
+					g.bytes_received =
+						skops->bytes_received;
+					g.bytes_acked = skops->bytes_acked;
+					bpf_map_update_elem(&global_map, &key,
+							    &g, BPF_ANY);
+				}
+			}
+			break;
+		default:
+			rv = -1;
+		}
+	} else {
+		rv = -1;
+	}
+	skops->reply = rv;
+	return 1;
+}
+char _license[] SEC("license") = "GPL";
diff --git a/tools/testing/selftests/bpf/test_tcpbpf_user.c b/tools/testing/selftests/bpf/test_tcpbpf_user.c
new file mode 100644
index 0000000..8d941fd
--- /dev/null
+++ b/tools/testing/selftests/bpf/test_tcpbpf_user.c
@@ -0,0 +1,119 @@
+/* Copyright (c) 2017 Facebook
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ */
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <unistd.h>
+#include <errno.h>
+#include <signal.h>
+#include <string.h>
+#include <assert.h>
+#include <linux/perf_event.h>
+#include <linux/ptrace.h>
+#include <linux/bpf.h>
+#include <sys/ioctl.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <bpf/bpf.h>
+#include <bpf/libbpf.h>
+//#include "bpf_load.h"
+#include "bpf_util.h"
+#include <linux/perf_event.h>
+
+struct globals {
+	__u32 event_map;
+	__u32 total_retrans;
+	__u32 data_segs_in;
+	__u32 data_segs_out;
+	__u64 bytes_received;
+	__u64 bytes_acked;
+};
+
+static int bpf_find_map(const char *test, struct bpf_object *obj,
+			const char *name)
+{
+	struct bpf_map *map;
+
+	map = bpf_object__find_map_by_name(obj, name);
+	if (!map) {
+		printf("%s:FAIL:map '%s' not found\n", test, name);
+		return -1;
+	}
+	return bpf_map__fd(map);
+}
+
+#define SYSTEM(CMD)						\
+	do {							\
+		if (system(CMD)) {				\
+			printf("system(%s) FAILS!\n", CMD);	\
+		}						\
+	} while (0)
+
+int main(int argc, char **argv)
+{
+	struct globals g = {0, 0, 0, 0, 0, 0};
+	__u32 key = 0;
+	int rv;
+	int pid;
+	int error = EXIT_FAILURE;
+	int cg_fd, prog_fd, map_fd;
+	char cmd[100], *dir;
+	const char *file = "./test_tcpbpf_kern.o";
+	struct bpf_object *obj;
+	struct stat buffer;
+
+	dir = "/tmp/cgroupv2/foo";
+
+	if (stat(dir, &buffer) != 0) {
+		SYSTEM("mkdir -p /tmp/cgroupv2");
+		SYSTEM("mount -t cgroup2 none /tmp/cgroupv2");
+		SYSTEM("mkdir -p /tmp/cgroupv2/foo");
+	}
+	pid = (int) getpid();
+	sprintf(cmd, "echo %d >> /tmp/cgroupv2/foo/cgroup.procs", pid);
+	SYSTEM(cmd);
+
+	cg_fd = open(dir, O_DIRECTORY, O_RDONLY);
+	if (bpf_prog_load(file, BPF_PROG_TYPE_SOCK_OPS, &obj, &prog_fd)) {
+//	if (load_bpf_file(prog)) {
+		printf("FAILED: load_bpf_file failed for: %s\n", file);
+//		printf("%s", bpf_log_buf);
+		goto err;
+	}
+
+	rv = bpf_prog_attach(prog_fd, cg_fd, BPF_CGROUP_SOCK_OPS, 0);
+	if (rv) {
+		printf("FAILED: bpf_prog_attach: %d (%s)\n",
+		       error, strerror(errno));
+		goto err;
+	}
+
+	SYSTEM("./tcp_server.py");
+
+	map_fd = bpf_find_map(__func__, obj, "global_map");
+	if (map_fd < 0)
+		goto err;
+
+	rv = bpf_map_lookup_elem(map_fd, &key, &g);
+	if (rv != 0) {
+		printf("FAILED: bpf_map_lookup_elem returns %d\n", rv);
+		goto err;
+	}
+
+	if (g.bytes_received != 501 || g.bytes_acked != 1002 ||
+	    g.data_segs_in != 1 || g.data_segs_out != 1 ||
+		g.event_map != 0x45e) {
+		printf("FAILED: Wrong stats\n");
+		goto err;
+	}
+	printf("PASSED!\n");
+	error = 0;
+err:
+	bpf_prog_detach(cg_fd, BPF_CGROUP_SOCK_OPS);
+	return error;
+}
-- 
2.9.5

^ permalink raw reply related

* [PATCH bpf 10/11] bpf: Add BPF_SOCK_OPS_STATE_CB
From: Lawrence Brakmo @ 2017-12-19  6:21 UTC (permalink / raw)
  To: netdev; +Cc: Kernel Team, Blake Matheny, Alexei Starovoitov, Daniel Borkmann
In-Reply-To: <20171219062200.372711-1-brakmo@fb.com>

Adds support for calling sock_ops BPF program when there is a TCP state
change. Two arguments are used; one for the old state and another for
the new state.

New op: BPF_SOCK_OPS_STATE_CB.

Signed-off-by: Lawrence Brakmo <brakmo@fb.com>
---
 include/uapi/linux/bpf.h | 4 ++++
 include/uapi/linux/tcp.h | 1 +
 net/ipv4/tcp.c           | 2 ++
 3 files changed, 7 insertions(+)

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 7165619..b018d6f 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -1018,6 +1018,10 @@ enum {
 					 * Arg1: sequence number of 1st byte
 					 * Arg2: # segments
 					 */
+	BPF_SOCK_OPS_STATE_CB,		/* Called when TCP changes state.
+					 * Arg1: old_state
+					 * Arg2: new_state
+					 */
 };
 
 #define TCP_BPF_IW		1001	/* Set TCP initial congestion window */
diff --git a/include/uapi/linux/tcp.h b/include/uapi/linux/tcp.h
index dc36d3c..211322c 100644
--- a/include/uapi/linux/tcp.h
+++ b/include/uapi/linux/tcp.h
@@ -262,6 +262,7 @@ struct tcp_md5sig {
 /* Definitions for bpf_sock_ops_flags */
 #define BPF_SOCK_OPS_RTO_CB_FLAG	(1<<0)
 #define BPF_SOCK_OPS_RETRANS_CB_FLAG	(1<<1)
+#define BPF_SOCK_OPS_STATE_CB_FLAG	(1<<2)
 
 /* INET_DIAG_MD5SIG */
 struct tcp_diag_md5sig {
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 817df3f..e70dd2f 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -2041,6 +2041,8 @@ void tcp_set_state(struct sock *sk, int state)
 	int oldstate = sk->sk_state;
 
 	trace_tcp_set_state(sk, oldstate, state);
+	if (BPF_SOCK_OPS_TEST_FLAG(tcp_sk(sk), BPF_SOCK_OPS_STATE_CB_FLAG))
+		tcp_call_bpf_2arg(sk, BPF_SOCK_OPS_STATE_CB, oldstate, state);
 
 	switch (state) {
 	case TCP_ESTABLISHED:
-- 
2.9.5

^ permalink raw reply related

* [PATCH bpf 06/11] bpf: Add sock_ops RTO callback
From: Lawrence Brakmo @ 2017-12-19  6:21 UTC (permalink / raw)
  To: netdev; +Cc: Kernel Team, Blake Matheny, Alexei Starovoitov, Daniel Borkmann
In-Reply-To: <20171219062200.372711-1-brakmo@fb.com>

Adds an optional call to sock_ops BPF program based on whether the
BPF_SOCK_OPS_RTO_CB_FLAG is set in bpf_sock_ops_flags.
The BPF program is passed 2 arguments: icsk_retransmits and whether the
RTO has expired.

Signed-off-by: Lawrence Brakmo <brakmo@fb.com>
---
 include/uapi/linux/bpf.h | 5 +++++
 include/uapi/linux/tcp.h | 3 +++
 net/ipv4/tcp_timer.c     | 9 +++++++++
 3 files changed, 17 insertions(+)

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index dfbf43a..1c36795 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -989,6 +989,11 @@ enum {
 					 * a congestion threshold. RTTs above
 					 * this indicate congestion
 					 */
+	BPF_SOCK_OPS_RTO_CB,		/* Called when an RTO has triggered.
+					 * Arg1: value of icsk_retransmits
+					 * Arg2: value of icsk_rto
+					 * Arg3: whether RTO has expired
+					 */
 };
 
 #define TCP_BPF_IW		1001	/* Set TCP initial congestion window */
diff --git a/include/uapi/linux/tcp.h b/include/uapi/linux/tcp.h
index b4a4f64..089c19e 100644
--- a/include/uapi/linux/tcp.h
+++ b/include/uapi/linux/tcp.h
@@ -259,6 +259,9 @@ struct tcp_md5sig {
 	__u8	tcpm_key[TCP_MD5SIG_MAXKEYLEN];		/* key (binary) */
 };
 
+/* Definitions for bpf_sock_ops_flags */
+#define BPF_SOCK_OPS_RTO_CB_FLAG	(1<<0)
+
 /* INET_DIAG_MD5SIG */
 struct tcp_diag_md5sig {
 	__u8	tcpm_family;
diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
index 16df6dd..e6afd93 100644
--- a/net/ipv4/tcp_timer.c
+++ b/net/ipv4/tcp_timer.c
@@ -230,9 +230,18 @@ static int tcp_write_timeout(struct sock *sk)
 	}
 	if (expired) {
 		/* Has it gone just too far? */
+		if (BPF_SOCK_OPS_TEST_FLAG(tp, BPF_SOCK_OPS_RTO_CB_FLAG))
+			tcp_call_bpf_3arg(sk, BPF_SOCK_OPS_RTO_CB,
+					  icsk->icsk_retransmits,
+					  icsk->icsk_rto, 1);
 		tcp_write_err(sk);
 		return 1;
 	}
+
+	if (BPF_SOCK_OPS_TEST_FLAG(tp, BPF_SOCK_OPS_RTO_CB_FLAG))
+		tcp_call_bpf_3arg(sk, BPF_SOCK_OPS_RTO_CB,
+				  icsk->icsk_retransmits,
+				  icsk->icsk_rto, 0);
 	return 0;
 }
 
-- 
2.9.5

^ permalink raw reply related

* [PATCH bpf 09/11] bpf: Add BPF_SOCK_OPS_RETRANS_CB
From: Lawrence Brakmo @ 2017-12-19  6:21 UTC (permalink / raw)
  To: netdev; +Cc: Kernel Team, Blake Matheny, Alexei Starovoitov, Daniel Borkmann
In-Reply-To: <20171219062200.372711-1-brakmo@fb.com>

Adds support for calling sock_ops BPF program when there is a
retransmission. Two arguments are used; one for the sequence number and
other for the number of segments retransmitted. Does not include syn-ack
retransmissions.

New op: BPF_SOCK_OPS_RETRANS_CB.

Signed-off-by: Lawrence Brakmo <brakmo@fb.com>
---
 include/uapi/linux/bpf.h | 4 ++++
 include/uapi/linux/tcp.h | 1 +
 net/ipv4/tcp_output.c    | 3 +++
 3 files changed, 8 insertions(+)

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index fe2b692..7165619 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -1014,6 +1014,10 @@ enum {
 					 * Arg2: value of icsk_rto
 					 * Arg3: whether RTO has expired
 					 */
+	BPF_SOCK_OPS_RETRANS_CB,	/* Called when skb is retransmitted.
+					 * Arg1: sequence number of 1st byte
+					 * Arg2: # segments
+					 */
 };
 
 #define TCP_BPF_IW		1001	/* Set TCP initial congestion window */
diff --git a/include/uapi/linux/tcp.h b/include/uapi/linux/tcp.h
index 089c19e..dc36d3c 100644
--- a/include/uapi/linux/tcp.h
+++ b/include/uapi/linux/tcp.h
@@ -261,6 +261,7 @@ struct tcp_md5sig {
 
 /* Definitions for bpf_sock_ops_flags */
 #define BPF_SOCK_OPS_RTO_CB_FLAG	(1<<0)
+#define BPF_SOCK_OPS_RETRANS_CB_FLAG	(1<<1)
 
 /* INET_DIAG_MD5SIG */
 struct tcp_diag_md5sig {
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 50cb242..b8ad088 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -2910,6 +2910,9 @@ int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs)
 	if (likely(!err)) {
 		TCP_SKB_CB(skb)->sacked |= TCPCB_EVER_RETRANS;
 		trace_tcp_retransmit_skb(sk, skb);
+		if (BPF_SOCK_OPS_TEST_FLAG(tp, BPF_SOCK_OPS_RETRANS_CB_FLAG))
+			tcp_call_bpf_2arg(sk, BPF_SOCK_OPS_RETRANS_CB,
+					  TCP_SKB_CB(skb)->seq, segs);
 	} else if (err != -EBUSY) {
 		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPRETRANSFAIL);
 	}
-- 
2.9.5

^ permalink raw reply related

* [PATCH bpf 01/11] bpf: Make SOCK_OPS_GET_TCP size independent
From: Lawrence Brakmo @ 2017-12-19  6:21 UTC (permalink / raw)
  To: netdev; +Cc: Kernel Team, Blake Matheny, Alexei Starovoitov, Daniel Borkmann
In-Reply-To: <20171219062200.372711-1-brakmo@fb.com>

Make SOCK_OPS_GET_TCP helper macro size independent (before only worked
with 4-byte fields.

Signed-off-by: Lawrence Brakmo <brakmo@fb.com>
---
 net/core/filter.c | 13 ++++++++-----
 1 file changed, 8 insertions(+), 5 deletions(-)

diff --git a/net/core/filter.c b/net/core/filter.c
index 754abe1..d47d126 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -4448,9 +4448,10 @@ static u32 sock_ops_convert_ctx_access(enum bpf_access_type type,
 		break;
 
 /* Helper macro for adding read access to tcp_sock fields. */
-#define SOCK_OPS_GET_TCP32(FIELD_NAME)					      \
+#define SOCK_OPS_GET_TCP(FIELD_NAME)					      \
 	do {								      \
-		BUILD_BUG_ON(FIELD_SIZEOF(struct tcp_sock, FIELD_NAME) != 4); \
+		BUILD_BUG_ON(FIELD_SIZEOF(struct tcp_sock, FIELD_NAME) >      \
+			     FIELD_SIZEOF(struct bpf_sock_ops, FIELD_NAME));  \
 		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(			      \
 						struct bpf_sock_ops_kern,     \
 						is_fullsock),		      \
@@ -4462,16 +4463,18 @@ static u32 sock_ops_convert_ctx_access(enum bpf_access_type type,
 						struct bpf_sock_ops_kern, sk),\
 				      si->dst_reg, si->src_reg,		      \
 				      offsetof(struct bpf_sock_ops_kern, sk));\
-		*insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg,        \
+		*insn++ = BPF_LDX_MEM(FIELD_SIZEOF(struct tcp_sock,	      \
+						   FIELD_NAME), si->dst_reg,  \
+				      si->dst_reg,			      \
 				      offsetof(struct tcp_sock, FIELD_NAME)); \
 	} while (0)
 
 	case offsetof(struct bpf_sock_ops, snd_cwnd):
-		SOCK_OPS_GET_TCP32(snd_cwnd);
+		SOCK_OPS_GET_TCP(snd_cwnd);
 		break;
 
 	case offsetof(struct bpf_sock_ops, srtt_us):
-		SOCK_OPS_GET_TCP32(srtt_us);
+		SOCK_OPS_GET_TCP(srtt_us);
 		break;
 	}
 	return insn - insn_buf;
-- 
2.9.5

^ permalink raw reply related

* [PATCH bpf 02/11] bpf: Make SOCK_OPS_GET_TCP struct independent
From: Lawrence Brakmo @ 2017-12-19  6:21 UTC (permalink / raw)
  To: netdev; +Cc: Kernel Team, Blake Matheny, Alexei Starovoitov, Daniel Borkmann
In-Reply-To: <20171219062200.372711-1-brakmo@fb.com>

Changed SOCK_OPS_GET_TCP to SOCK_OPS_GET_FIELD and added a new
argument so now it can also work with struct sock fields.

Previous: SOCK_OPS_GET_TCP(FIELD_NAME)
New:      SOCK_OPS_GET_FIELD(FIELD_NAME, OBJ)

Where OBJ is either "struct tcp_sock" or "struct sock" (without
quotation). Assumes FIELD_NAME is a field in the struct
bpf_sock_ops and in the OBJ specified.

Signed-off-by: Lawrence Brakmo <brakmo@fb.com>
---
 net/core/filter.c | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/net/core/filter.c b/net/core/filter.c
index d47d126..f808269 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -4447,10 +4447,10 @@ static u32 sock_ops_convert_ctx_access(enum bpf_access_type type,
 					       is_fullsock));
 		break;
 
-/* Helper macro for adding read access to tcp_sock fields. */
-#define SOCK_OPS_GET_TCP(FIELD_NAME)					      \
+/* Helper macro for adding read access to tcp_sock or sock fields. */
+#define SOCK_OPS_GET_FIELD(FIELD_NAME, OBJ)				      \
 	do {								      \
-		BUILD_BUG_ON(FIELD_SIZEOF(struct tcp_sock, FIELD_NAME) >      \
+		BUILD_BUG_ON(FIELD_SIZEOF(OBJ, FIELD_NAME) >		      \
 			     FIELD_SIZEOF(struct bpf_sock_ops, FIELD_NAME));  \
 		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(			      \
 						struct bpf_sock_ops_kern,     \
@@ -4463,18 +4463,18 @@ static u32 sock_ops_convert_ctx_access(enum bpf_access_type type,
 						struct bpf_sock_ops_kern, sk),\
 				      si->dst_reg, si->src_reg,		      \
 				      offsetof(struct bpf_sock_ops_kern, sk));\
-		*insn++ = BPF_LDX_MEM(FIELD_SIZEOF(struct tcp_sock,	      \
-						   FIELD_NAME), si->dst_reg,  \
-				      si->dst_reg,			      \
-				      offsetof(struct tcp_sock, FIELD_NAME)); \
+		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(OBJ,		      \
+						       FIELD_NAME),	      \
+				      si->dst_reg, si->dst_reg,		      \
+				      offsetof(OBJ, FIELD_NAME));	      \
 	} while (0)
 
 	case offsetof(struct bpf_sock_ops, snd_cwnd):
-		SOCK_OPS_GET_TCP(snd_cwnd);
+		SOCK_OPS_GET_FIELD(snd_cwnd, struct tcp_sock);
 		break;
 
 	case offsetof(struct bpf_sock_ops, srtt_us):
-		SOCK_OPS_GET_TCP(srtt_us);
+		SOCK_OPS_GET_FIELD(srtt_us, struct tcp_sock);
 		break;
 	}
 	return insn - insn_buf;
-- 
2.9.5

^ permalink raw reply related

* [PATCH bpf 03/11] bpf: Add write access to tcp_sock and sock fields
From: Lawrence Brakmo @ 2017-12-19  6:21 UTC (permalink / raw)
  To: netdev; +Cc: Kernel Team, Blake Matheny, Alexei Starovoitov, Daniel Borkmann
In-Reply-To: <20171219062200.372711-1-brakmo@fb.com>

This patch adds a macro, SOCK_OPS_SET_FIELD, for writing to
struct tcp_sock or struct sock fields. This required adding a new
field "temp" to struct bpf_sock_ops_kern for temporary storage that
is used by sock_ops_convert_ctx_access. It is used to store and recover
the contents of a register, so the register can be used to store the
address of the sk. Since we cannot overwrite the dst_reg because it
contains the pointer to ctx, nor the src_reg since it contains the value
we want to store, we need an extra register to contain the address
of the sk.

Also adds the macro SOCK_OPS_GET_OR_SET_FIELD that calls one of the
GET or SET macros depending on the value of the TYPE field.

Signed-off-by: Lawrence Brakmo <brakmo@fb.com>
---
 include/linux/filter.h |  3 +++
 include/net/tcp.h      |  2 +-
 net/core/filter.c      | 46 ++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 50 insertions(+), 1 deletion(-)

diff --git a/include/linux/filter.h b/include/linux/filter.h
index 5feb441..8929162 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -987,6 +987,9 @@ struct bpf_sock_ops_kern {
 		u32 replylong[4];
 	};
 	u32	is_fullsock;
+	u64	temp;			/* Used by sock_ops_convert_ctx_access
+					 * as temporary storaage of a register
+					 */
 };
 
 #endif /* __LINUX_FILTER_H__ */
diff --git a/include/net/tcp.h b/include/net/tcp.h
index 6cc205c..e0213f1 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -2011,7 +2011,7 @@ static inline int tcp_call_bpf(struct sock *sk, int op)
 	struct bpf_sock_ops_kern sock_ops;
 	int ret;
 
-	memset(&sock_ops, 0, sizeof(sock_ops));
+	memset(&sock_ops, 0, offsetof(struct bpf_sock_ops_kern, is_fullsock));
 	if (sk_fullsock(sk)) {
 		sock_ops.is_fullsock = 1;
 		sock_owned_by_me(sk);
diff --git a/net/core/filter.c b/net/core/filter.c
index f808269..97e65df 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -4469,6 +4469,52 @@ static u32 sock_ops_convert_ctx_access(enum bpf_access_type type,
 				      offsetof(OBJ, FIELD_NAME));	      \
 	} while (0)
 
+/* Helper macro for adding write access to tcp_sock or sock fields.
+ * The macro is called with two registers, dst_reg which contains a pointer
+ * to ctx (context) and src_reg which contains the value that should be
+ * stored. However, we need an aditional register since we cannot overwrite
+ * dst_reg because it may be used later in the program.
+ * Instead we "borrow" one of the other register. We first save its value
+ * into a new (temp) field in bpf_sock_ops_kern, use it, and then restore
+ * it at the end of the macro.
+ */
+#define SOCK_OPS_SET_FIELD(FIELD_NAME, OBJ)				      \
+	do {								      \
+		int reg = BPF_REG_9;					      \
+		BUILD_BUG_ON(FIELD_SIZEOF(OBJ, FIELD_NAME) >		      \
+			     FIELD_SIZEOF(struct bpf_sock_ops, FIELD_NAME));  \
+		while (si->dst_reg == reg || si->src_reg == reg)	      \
+			reg--;						      \
+		*insn++ = BPF_STX_MEM(BPF_DW, si->dst_reg, reg,		      \
+				      offsetof(struct bpf_sock_ops_kern,      \
+					       temp));			      \
+		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(			      \
+						struct bpf_sock_ops_kern,     \
+						is_fullsock),		      \
+				      reg, si->dst_reg,			      \
+				      offsetof(struct bpf_sock_ops_kern,      \
+					       is_fullsock));		      \
+		*insn++ = BPF_JMP_IMM(BPF_JEQ, reg, 0, 2);		      \
+		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(			      \
+						struct bpf_sock_ops_kern, sk),\
+				      reg, si->dst_reg,			      \
+				      offsetof(struct bpf_sock_ops_kern, sk));\
+		*insn++ = BPF_STX_MEM(BPF_FIELD_SIZEOF(OBJ, FIELD_NAME),      \
+				      reg, si->src_reg,			      \
+				      offsetof(OBJ, FIELD_NAME));	      \
+		*insn++ = BPF_LDX_MEM(BPF_DW, reg, si->dst_reg,		      \
+				      offsetof(struct bpf_sock_ops_kern,      \
+					       temp));			      \
+	} while (0)
+
+#define SOCK_OPS_GET_OR_SET_FIELD(FIELD_NAME, OBJ, TYPE)		      \
+	do {								      \
+		if (TYPE == BPF_WRITE)					      \
+			SOCK_OPS_SET_FIELD(FIELD_NAME, OBJ);		      \
+		else							      \
+			SOCK_OPS_GET_FIELD(FIELD_NAME, OBJ);		      \
+	} while (0)
+
 	case offsetof(struct bpf_sock_ops, snd_cwnd):
 		SOCK_OPS_GET_FIELD(snd_cwnd, struct tcp_sock);
 		break;
-- 
2.9.5

^ permalink raw reply related

* [PATCH bpf 04/11] bpf: Support passing args to sock_ops bpf function
From: Lawrence Brakmo @ 2017-12-19  6:21 UTC (permalink / raw)
  To: netdev; +Cc: Kernel Team, Blake Matheny, Alexei Starovoitov, Daniel Borkmann
In-Reply-To: <20171219062200.372711-1-brakmo@fb.com>

Adds support for passing up to 4 arguments to sock_ops bpf functions. It
reusues the reply union, so the bpf_sock_ops structures are not
increased in size.

Signed-off-by: Lawrence Brakmo <brakmo@fb.com>
---
 include/linux/filter.h   |  1 +
 include/net/tcp.h        | 64 ++++++++++++++++++++++++++++++++++++++++++++----
 include/uapi/linux/bpf.h |  5 ++--
 net/ipv4/tcp.c           |  2 +-
 net/ipv4/tcp_nv.c        |  2 +-
 net/ipv4/tcp_output.c    |  2 +-
 6 files changed, 66 insertions(+), 10 deletions(-)

diff --git a/include/linux/filter.h b/include/linux/filter.h
index 8929162..2a09f27 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -983,6 +983,7 @@ struct bpf_sock_ops_kern {
 	struct	sock *sk;
 	u32	op;
 	union {
+		u32 args[4];
 		u32 reply;
 		u32 replylong[4];
 	};
diff --git a/include/net/tcp.h b/include/net/tcp.h
index e0213f1..c262be6 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -2006,7 +2006,7 @@ void tcp_cleanup_ulp(struct sock *sk);
  * program loaded).
  */
 #ifdef CONFIG_BPF
-static inline int tcp_call_bpf(struct sock *sk, int op)
+static inline int tcp_call_bpf(struct sock *sk, int op, u32 nargs, u32 *args)
 {
 	struct bpf_sock_ops_kern sock_ops;
 	int ret;
@@ -2019,6 +2019,8 @@ static inline int tcp_call_bpf(struct sock *sk, int op)
 
 	sock_ops.sk = sk;
 	sock_ops.op = op;
+	if (nargs > 0)
+		memcpy(sock_ops.args, args, nargs*sizeof(u32));
 
 	ret = BPF_CGROUP_RUN_PROG_SOCK_OPS(&sock_ops);
 	if (ret == 0)
@@ -2027,18 +2029,70 @@ static inline int tcp_call_bpf(struct sock *sk, int op)
 		ret = -1;
 	return ret;
 }
+
+static inline int tcp_call_bpf_1arg(struct sock *sk, int op, u32 arg)
+{
+	return tcp_call_bpf(sk, op, 1, &arg);
+}
+
+static inline int tcp_call_bpf_2arg(struct sock *sk, int op, u32 arg1, u32 arg2)
+{
+	u32 args[2] = {arg1, arg2};
+
+	return tcp_call_bpf(sk, op, 2, args);
+}
+
+static inline int tcp_call_bpf_3arg(struct sock *sk, int op, u32 arg1, u32 arg2,
+				    u32 arg3)
+{
+	u32 args[3] = {arg1, arg2, arg3};
+
+	return tcp_call_bpf(sk, op, 3, args);
+}
+
+static inline int tcp_call_bpf_4arg(struct sock *sk, int op, u32 arg1, u32 arg2,
+				    u32 arg3, u32 arg4)
+{
+	u32 args[4] = {arg1, arg2, arg3, arg4};
+
+	return tcp_call_bpf(sk, op, 4, args);
+}
+
 #else
-static inline int tcp_call_bpf(struct sock *sk, int op)
+static inline int tcp_call_bpf(struct sock *sk, int op, u32 nargs, u32 *args)
 {
 	return -EPERM;
 }
+
+static inline int tcp_call_bpf_1arg(struct sock *sk, int op, u32 arg)
+{
+	return -EPERM;
+}
+
+static inline int tcp_call_bpf_2arg(struct sock *sk, int op, u32 arg1, u32 arg2)
+{
+	return -EPERM;
+}
+
+static inline int tcp_call_bpf_3arg(struct sock *sk, int op, u32 arg1, u32 arg2,
+	u32 arg3)
+{
+	return -EPERM;
+}
+
+static inline int tcp_call_bpf_4arg(struct sock *sk, int op, u32 arg1, u32 arg2,
+				    u32 arg3, u32 arg4)
+{
+	return -EPERM;
+}
+
 #endif
 
 static inline u32 tcp_timeout_init(struct sock *sk)
 {
 	int timeout;
 
-	timeout = tcp_call_bpf(sk, BPF_SOCK_OPS_TIMEOUT_INIT);
+	timeout = tcp_call_bpf(sk, BPF_SOCK_OPS_TIMEOUT_INIT, 0, NULL);
 
 	if (timeout <= 0)
 		timeout = TCP_TIMEOUT_INIT;
@@ -2049,7 +2103,7 @@ static inline u32 tcp_rwnd_init_bpf(struct sock *sk)
 {
 	int rwnd;
 
-	rwnd = tcp_call_bpf(sk, BPF_SOCK_OPS_RWND_INIT);
+	rwnd = tcp_call_bpf(sk, BPF_SOCK_OPS_RWND_INIT, 0, NULL);
 
 	if (rwnd < 0)
 		rwnd = 0;
@@ -2058,7 +2112,7 @@ static inline u32 tcp_rwnd_init_bpf(struct sock *sk)
 
 static inline bool tcp_bpf_ca_needs_ecn(struct sock *sk)
 {
-	return (tcp_call_bpf(sk, BPF_SOCK_OPS_NEEDS_ECN) == 1);
+	return (tcp_call_bpf(sk, BPF_SOCK_OPS_NEEDS_ECN, 0, NULL) == 1);
 }
 
 #if IS_ENABLED(CONFIG_SMC)
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 595bda1..addd849 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -936,8 +936,9 @@ struct bpf_map_info {
 struct bpf_sock_ops {
 	__u32 op;
 	union {
-		__u32 reply;
-		__u32 replylong[4];
+		__u32 args[4];		/* Optionally passed to bpf program */
+		__u32 reply;		/* Returned by bpf program	    */
+		__u32 replylong[4];	/* Optionally returned by bpf prog  */
 	};
 	__u32 family;
 	__u32 remote_ip4;	/* Stored in network byte order */
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 1803116..817df3f 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -465,7 +465,7 @@ void tcp_init_transfer(struct sock *sk, int bpf_op)
 	tcp_mtup_init(sk);
 	icsk->icsk_af_ops->rebuild_header(sk);
 	tcp_init_metrics(sk);
-	tcp_call_bpf(sk, bpf_op);
+	tcp_call_bpf(sk, bpf_op, 0, NULL);
 	tcp_init_congestion_control(sk);
 	tcp_init_buffer_space(sk);
 }
diff --git a/net/ipv4/tcp_nv.c b/net/ipv4/tcp_nv.c
index 0b5a05b..ddbce73 100644
--- a/net/ipv4/tcp_nv.c
+++ b/net/ipv4/tcp_nv.c
@@ -146,7 +146,7 @@ static void tcpnv_init(struct sock *sk)
 	 * within a datacenter, where we have reasonable estimates of
 	 * RTTs
 	 */
-	base_rtt = tcp_call_bpf(sk, BPF_SOCK_OPS_BASE_RTT);
+	base_rtt = tcp_call_bpf(sk, BPF_SOCK_OPS_BASE_RTT, 0, NULL);
 	if (base_rtt > 0) {
 		ca->nv_base_rtt = base_rtt;
 		ca->nv_lower_bound_rtt = (base_rtt * 205) >> 8; /* 80% */
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index a4d214c..50cb242 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -3471,7 +3471,7 @@ int tcp_connect(struct sock *sk)
 	struct sk_buff *buff;
 	int err;
 
-	tcp_call_bpf(sk, BPF_SOCK_OPS_TCP_CONNECT_CB);
+	tcp_call_bpf(sk, BPF_SOCK_OPS_TCP_CONNECT_CB, 0, NULL);
 
 	if (inet_csk(sk)->icsk_af_ops->rebuild_header(sk))
 		return -EHOSTUNREACH; /* Routing failure or similar. */
-- 
2.9.5

^ permalink raw reply related

* Re: [PATCH v2 2/3] vsprintf: print <no-symbol> if symbol not found
From: Joe Perches @ 2017-12-19  6:18 UTC (permalink / raw)
  To: Tobin C. Harding, kernel-hardening
  Cc: Steven Rostedt, Tycho Andersen, Linus Torvalds, Kees Cook,
	Andrew Morton, Daniel Borkmann, Masahiro Yamada,
	Alexei Starovoitov, linux-kernel, Network Development
In-Reply-To: <1513654094-16832-3-git-send-email-me@tobin.cc>

On Tue, 2017-12-19 at 14:28 +1100, Tobin C. Harding wrote:
> Depends on: commit 40eee173a35e ("kallsyms: don't leak address when
> symbol not found")
> 
> Currently vsprintf for specifiers %p[SsB] relies on the behaviour of
> kallsyms (sprint_symbol()) and prints the actual address if a symbol is
> not found. Previous patch changes this behaviour so that sprint_symbol()
> returns an error if symbol not found. With this patch in place we can
> print a sanitized message '<symbol not found>' instead of leaking the
> address.
> 
> Print '<symbol not found>' for printk specifier %p[sSB] if symbol look
> up fails.
> 
> Signed-off-by: Tobin C. Harding <me@tobin.cc>
> ---
>  lib/vsprintf.c | 11 ++++++++---
>  1 file changed, 8 insertions(+), 3 deletions(-)
> 
> diff --git a/lib/vsprintf.c b/lib/vsprintf.c
> index 01c3957b2de6..820ed4fe6e6c 100644
> --- a/lib/vsprintf.c
> +++ b/lib/vsprintf.c
> @@ -674,6 +674,8 @@ char *symbol_string(char *buf, char *end, void *ptr,
>  	unsigned long value;
>  #ifdef CONFIG_KALLSYMS
>  	char sym[KSYM_SYMBOL_LEN];
> +	const char *sym_not_found = "<symbol not found>";

This will be reinitialized on every use.

> +	int ret;
>  #endif
>  
>  	if (fmt[1] == 'R')
> @@ -682,11 +684,14 @@ char *symbol_string(char *buf, char *end, void *ptr,
>  
>  #ifdef CONFIG_KALLSYMS
>  	if (*fmt == 'B')
> -		sprint_backtrace(sym, value);
> +		ret = sprint_backtrace(sym, value);
>  	else if (*fmt != 'f' && *fmt != 's')
> -		sprint_symbol(sym, value);
> +		ret = sprint_symbol(sym, value);
>  	else
> -		sprint_symbol_no_offset(sym, value);
> +		ret = sprint_symbol_no_offset(sym, value);
> +
> +	if (ret == -1)
> +		strcpy(sym, sym_not_found);


This could avoid the unnecessary strcpy if sym_not_found
was not used at all and this was used instead

	if (ret == -1)
		return string(buf, end, "<symbol not found>", spec);

	return string(buf, end, sym, spec);

or maybe

	return string(buf, end, ret == -1 ? "<symbol not found>" : sum, spec);

>  
>  	return string(buf, end, sym, spec);
>  #else

^ permalink raw reply

* net/wireless/certs/*.x509 binary files
From: Randy Dunlap @ 2017-12-19  6:01 UTC (permalink / raw)
  To: LKML; +Cc: netdev@vger.kernel.org, Johannes Berg

This is just an FYI/acknowledgment that net/wireless/certs/*.x509
binary file(s) practically kills use of Linux kernel tarballs.

Of course, someone can always enable EXPERT and CFG80211_CERTIFICATION_ONUS
and disable the REGDB kconfig symbols to get around this.
Oh, and then chmod +x tools/objtool/sync-check.sh (unrelated problem).

Then you are good to go. :)

Background:
I was getting build errors on net/wireless/shipped-certs.o and the
build log didn't help me at all. It just said something like,
Error: build failed on net/wireless/shipped-certs.o.
Even building with V=1 didn't help.
So I finally discovered the reason and worked around it.

-- 
~Randy

PS:  Yes, I know about git.

^ permalink raw reply

* Re: r8169 regression: UDP packets dropped intermittantly
From: Jonathan Woithe @ 2017-12-19  5:45 UTC (permalink / raw)
  To: Holger Hoffstätte; +Cc: netdev, linux-kernel
In-Reply-To: <20171218223224.GA13172@marvin.atrad.com.au>

Hi again

This is a follow up to my earlier message.

On Tue, Dec 19, 2017 at 09:02:25AM +1030, Jonathan Woithe wrote:
> On Mon, Dec 18, 2017 at 02:38:53PM +0100, Holger Hoffstätte wrote:
> > Since I've seen your postings several times now with no comment or resolution
> > I've decided to try your reproducer on my own systems. In short, I cannot
> > reproduce any packet loss, despite having 2 (cheap) 1Gb switches between the
> > two machines. Both are running 4.14.7.
> 
> Thanks for trying the test program on your system.  The result indicates
> that the problem might be specific to the behaviour of a particular network
> variant of the r8169 chip.

I was able to temporarily acquire a PCIe card which uses the r8169 driver.
This allowed me to run the reproducer on the same machine with two different
r8169-based cards.  The original NIC is this:

  05:01.0 Ethernet controller: Realtek Semiconductor Co., Ltd. RTL-8169 
  Gigabit Ethernet (rev 10) [10ec:8169]
          Subsystem: Netgear GA311 [1385:311a]

The PCIe card is this:

  02:00.0 Ethernet controller: Realtek Semiconductor Co., Ltd. RTL8111/8168B 
  PCI Express Gigabit Ethernet controller (rev 06) [10ec:8168]
          Subsystem: Realtek Semiconductor Co., Ltd. Device 0123 [10ec:0123]

The test was conducted with kernel 4.3.0 since both the 4.3.0 driver (which
triggers the fault) and the forward ported driver (which predates commit
da78dbff2e05630921c551dbbc70a4b7981a8fff) was available.  For the record,
the machine used as the slave in these tests (the one receiving the 6 byte
request and sending the 14 byte response) was using its onboard NIC:

  00:19.0 Ethernet controller: Intel Corporation 82579V Gigabit Network 
  Connection (rev 05) [8086:1503]
          Subsystem: Gigabyte Technology Co., Ltd 82579V Gigabit Network 
          Connection [1458:e000]

Test outcomes were as follows:

  PCIe card, unpatched 4.3.0 r8169 driver: no error (tested for 1 hour)
  PCIe card, forward ported r8169 driver:  no error (tested for 1 hour)

  GA311 card, unpatched 4.3.0 r8169 driver: test fail in under 4 minutes
  GA311 card, forward ported r8169 driver:  no error (tested for 1 hour)

For completeness, I then booted 4.14 and repeated the test with its r8168
driver.  The PCIe card ran for an hour without triggering the error, while
the GA311 triggered it quickly (in under 3 minutes).

This clearly indicates that not every card using the r8169 driver is
vulnerable to the problem.  It also explains why Holger was unable to
reproduce the result on his system: the PCIe cards do not appear to suffer
from the problem.  Most likely the PCI RTL-8169 chip is affected, but newer
PCIe variations do not.  However, obviously more testing will be required
with a wider variety of cards if this inference is to hold up.

The above result (and those from Holger) allow the problem description to be
refined a little: changes in commit da78dbff2e05630921c551dbbc70a4b7981a8fff
cause GA311 NICs (and possibly other PCI cards using an RTL-8169) to have
trouble with small UDP packets, while PCIe variants are seemingly
unaffected.

Does this help?

Regards
  jonathan

^ permalink raw reply

* [PATCH V4 14/26] pch_gbe: deprecate pci_get_bus_and_slot()
From: Sinan Kaya @ 2017-12-19  5:37 UTC (permalink / raw)
  To: linux-pci, timur
  Cc: linux-arm-msm, linux-arm-kernel, Sinan Kaya, David S. Miller,
	Kees Cook, Eric Dumazet, Tobias Klauser, Andy Shevchenko,
	open list:NETWORKING DRIVERS, open list
In-Reply-To: <1513661883-28662-1-git-send-email-okaya@codeaurora.org>

pci_get_bus_and_slot() is restrictive such that it assumes domain=0 as
where a PCI device is present. This restricts the device drivers to be
reused for other domain numbers.

Getting ready to remove pci_get_bus_and_slot() function in favor of
pci_get_domain_bus_and_slot().

Use the domain information from pdev while calling into
pci_get_domain_bus_and_slot() function.

Signed-off-by: Sinan Kaya <okaya@codeaurora.org>
---
 drivers/net/ethernet/oki-semi/pch_gbe/pch_gbe_main.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/oki-semi/pch_gbe/pch_gbe_main.c b/drivers/net/ethernet/oki-semi/pch_gbe/pch_gbe_main.c
index 40e52ff..7cd4946 100644
--- a/drivers/net/ethernet/oki-semi/pch_gbe/pch_gbe_main.c
+++ b/drivers/net/ethernet/oki-semi/pch_gbe/pch_gbe_main.c
@@ -2594,8 +2594,10 @@ static int pch_gbe_probe(struct pci_dev *pdev,
 	if (adapter->pdata && adapter->pdata->platform_init)
 		adapter->pdata->platform_init(pdev);
 
-	adapter->ptp_pdev = pci_get_bus_and_slot(adapter->pdev->bus->number,
-					       PCI_DEVFN(12, 4));
+	adapter->ptp_pdev =
+		pci_get_domain_bus_and_slot(pci_domain_nr(adapter->pdev->bus),
+					    adapter->pdev->bus->number,
+					    PCI_DEVFN(12, 4));
 
 	netdev->netdev_ops = &pch_gbe_netdev_ops;
 	netdev->watchdog_timeo = PCH_GBE_WATCHDOG_PERIOD;
-- 
1.9.1

^ permalink raw reply related

* [PATCH V4 13/26] bnx2x: deprecate pci_get_bus_and_slot()
From: Sinan Kaya @ 2017-12-19  5:37 UTC (permalink / raw)
  To: linux-pci, timur
  Cc: linux-arm-msm, linux-arm-kernel, Sinan Kaya, Ariel Elior,
	supporter:BROADCOM BNX2X 10 GIGABIT ETHERNET DRIVER,
	open list:BROADCOM BNX2X 10 GIGABIT ETHERNET DRIVER, open list
In-Reply-To: <1513661883-28662-1-git-send-email-okaya@codeaurora.org>

pci_get_bus_and_slot() is restrictive such that it assumes domain=0 as
where a PCI device is present. This restricts the device drivers to be
reused for other domain numbers.

Getting ready to remove pci_get_bus_and_slot() function in favor of
pci_get_domain_bus_and_slot().

Introduce bnx2x_vf_domain() function to extract the domain information
and save it to VF specific data structure.

Use the saved domain value while calling pci_get_domain_bus_and_slot().

Signed-off-by: Sinan Kaya <okaya@codeaurora.org>
---
 drivers/net/ethernet/broadcom/bnx2x/bnx2x_sriov.c | 10 +++++++++-
 drivers/net/ethernet/broadcom/bnx2x/bnx2x_sriov.h |  1 +
 2 files changed, 10 insertions(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_sriov.c b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_sriov.c
index 3591077..ffa7959 100644
--- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_sriov.c
+++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_sriov.c
@@ -812,7 +812,7 @@ static u8 bnx2x_vf_is_pcie_pending(struct bnx2x *bp, u8 abs_vfid)
 	if (!vf)
 		return false;
 
-	dev = pci_get_bus_and_slot(vf->bus, vf->devfn);
+	dev = pci_get_domain_bus_and_slot(vf->domain, vf->bus, vf->devfn);
 	if (dev)
 		return bnx2x_is_pcie_pending(dev);
 	return false;
@@ -1041,6 +1041,13 @@ void bnx2x_iov_init_dmae(struct bnx2x *bp)
 		REG_WR(bp, DMAE_REG_BACKWARD_COMP_EN, 0);
 }
 
+static int bnx2x_vf_domain(struct bnx2x *bp, int vfid)
+{
+	struct pci_dev *dev = bp->pdev;
+
+	return pci_domain_nr(dev->bus);
+}
+
 static int bnx2x_vf_bus(struct bnx2x *bp, int vfid)
 {
 	struct pci_dev *dev = bp->pdev;
@@ -1606,6 +1613,7 @@ int bnx2x_iov_nic_init(struct bnx2x *bp)
 		struct bnx2x_virtf *vf = BP_VF(bp, vfid);
 
 		/* fill in the BDF and bars */
+		vf->domain = bnx2x_vf_domain(bp, vfid);
 		vf->bus = bnx2x_vf_bus(bp, vfid);
 		vf->devfn = bnx2x_vf_devfn(bp, vfid);
 		bnx2x_vf_set_bars(bp, vf);
diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_sriov.h b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_sriov.h
index 53466f6..eb814c6 100644
--- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_sriov.h
+++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_sriov.h
@@ -182,6 +182,7 @@ struct bnx2x_virtf {
 	u32 error;	/* 0 means all's-well */
 
 	/* BDF */
+	unsigned int domain;
 	unsigned int bus;
 	unsigned int devfn;
 
-- 
1.9.1

^ permalink raw reply related

* Re: [PATCH v4 25/36] nds32: Miscellaneous header files
From: Greentime Hu @ 2017-12-19  5:34 UTC (permalink / raw)
  To: Arnd Bergmann
  Cc: Greentime, Linux Kernel Mailing List, linux-arch, Thomas Gleixner,
	Jason Cooper, Marc Zyngier, Rob Herring, Networking, Vincent Chen,
	DTML, Al Viro, David Howells, Will Deacon, Daniel Lezcano,
	linux-serial-u79uwXL29TY76Z2rM5mHXA, Geert Uytterhoeven,
	Linus Walleij, Mark Rutland, Greg KH, Guo Ren
In-Reply-To: <CAK8P3a1EOD3WB=c4JybBD5bazhVJ5reuegyA1Hyoj-nPHpsW6g-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>

Hi, Arnd:

2017-12-18 19:13 GMT+08:00 Arnd Bergmann <arnd-r2nGTMty4D4@public.gmane.org>:
> On Mon, Dec 18, 2017 at 7:46 AM, Greentime Hu <green.hu-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org> wrote:
>> From: Greentime Hu <greentime-MUIXKm3Oiri1Z/+hSey0Gg@public.gmane.org>
>>
>> This patch introduces some miscellaneous header files.
>
>> +static inline void __delay(unsigned long loops)
>> +{
>> +       __asm__ __volatile__(".align 2\n"
>> +                            "1:\n"
>> +                            "\taddi\t%0, %0, -1\n"
>> +                            "\tbgtz\t%0, 1b\n"
>> +                            :"=r"(loops)
>> +                            :"0"(loops));
>> +}
>> +
>> +static inline void __udelay(unsigned long usecs, unsigned long lpj)
>> +{
>> +       usecs *= (unsigned long)(((0x8000000000000000ULL / (500000 / HZ)) +
>> +                                 0x80000000ULL) >> 32);
>> +       usecs = (unsigned long)(((unsigned long long)usecs * lpj) >> 32);
>> +       __delay(usecs);
>> +}
>
> Do you have a reliable clocksource that you can read here instead of doing the
> loop? It's generally preferred to have an accurate delay if at all possible, the
> delay loop calibration is only for those architectures that don't have any
> way to observe how much time has passed accurately.
>

We currently only have atcpit100 as clocksource but it is an IP of  SoC.
These delay API will be unavailable if we changed to another SoC
unless all these timer driver provided the same APIs.
It may suffer our customers if they forget to port these APIs in their
timer drivers when they try to use nds32 in the first beginning.
Or maybe I can use a CONFIG_USE_ACCURATE_DELAY to keep these 2
implementions for these purposes?
--
To unsubscribe from this list: send the line "unsubscribe devicetree" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply

* Re: Linux ECN Handling
From: Steve Ibanez @ 2017-12-19  5:16 UTC (permalink / raw)
  To: Neal Cardwell
  Cc: Eric Dumazet, Yuchung Cheng, Daniel Borkmann, Netdev,
	Florian Westphal, Mohammad Alizadeh, Lawrence Brakmo
In-Reply-To: <CADVnQy=9xxQ2XH0rKhR+kHp6J9nOvM7e=6z-AJPAiTaap0e_bA@mail.gmail.com>

Hi Neal,

I started looking into this receiver ACKing issue today. Strangely,
when I tried adding printk statements at the top of the
tcp_v4_do_rcv(), tcp_rcv_established(), __tcp_ack_snd_check() and
tcp_send_delayed_ack() functions they were never executed on the
machine running the iperf3 server (i.e. the destination of the flows).
Maybe the iperf3 server is using its own TCP stack?

In any case, the ACKing problem is reproducible using just normal
iperf for which I do see my printk statements being executed. I can
now confirm that when the CWR marked packet (for which no ACK is sent)
arrives at the receiver, the __tcp_ack_snd_check() function is never
invoked; and hence neither is the tcp_send_delayed_ack() function.
Hopefully this helps narrow down where the issue might be? I started
adding some printk statements into the tcp_rcv_established() function,
but I'm not sure where the best places to look would be so I wanted to
ask your advice on this.

In case you're interested, I instrumented the __tcp_ack_snd_check()
function with the following printk statements:

@@ -5057,9 +5117,15 @@ static inline void tcp_data_snd_check(struct sock *sk)
 /*
  * Check if sending an ack is needed.
  */
-static void __tcp_ack_snd_check(struct sock *sk, int ofo_possible)
+static void __tcp_ack_snd_check(struct sock *sk, int ofo_possible,
const struct tcphdr *th)
 {
        struct tcp_sock *tp = tcp_sk(sk);
+        struct inet_sock *inet = inet_sk(sk);

            /* More than one full frame received... */
        if (((tp->rcv_nxt - tp->rcv_wup) > inet_csk(sk)->icsk_ack.rcv_mss &&
@@ -5071,21 +5137,31 @@ static void __tcp_ack_snd_check(struct sock
*sk, int ofo_possible)
            tcp_in_quickack_mode(sk) ||
            /* We have out of order data. */
            (ofo_possible && !RB_EMPTY_ROOT(&tp->out_of_order_queue))) {
+                // SI: Debugging TCP ECN handeling
+                if (sk->sk_family == AF_INET && th->cwr) {
+                        printk("tcp_debug: __tcp_ack_snd_check:
%pI4/%u CWR set and sending ACK now - rcv_nxt=%u\n",
+                                 &inet->inet_daddr,
ntohs(inet->inet_sport), tp->rcv_nxt);
+                 }
                /* Then ack it now */
                tcp_send_ack(sk);
        } else {
+                // SI: Debugging TCP ECN handeling
+                if (sk->sk_family == AF_INET && th->cwr) {
+                        printk("tcp_debug: __tcp_ack_snd_check:
%pI4/%u CWR set and sending delayed ACK - rcv_nxt=%u\n",
+                                 &inet->inet_daddr,
ntohs(inet->inet_sport), tp->rcv_nxt);
+                 }
                /* Else, send delayed ack. */
-               tcp_send_delayed_ack(sk);
+               tcp_send_delayed_ack(sk, th);
        }
 }

In the kernel log on the receiver, I see the following sequence of
events at a timeout:

[ 2730.145023] tcp_debug: __tcp_ack_snd_check: 10.0.0.5/916 CWR set
and sending ACK now - rcv_nxt=2317949387
[ 2730.145543] tcp_debug: __tcp_ack_snd_check: 10.0.0.5/916 CWR set
and sending ACK now - rcv_nxt=2318243331 <-- last log statement before
timeout
[ 2730.452540] tcp_debug: __tcp_ack_snd_check: 10.0.0.5/916 CWR set
and sending ACK now - rcv_nxt=2318593747
[ 2730.453137] tcp_debug: __tcp_ack_snd_check: 10.0.0.5/916 CWR set
and sending ACK now - rcv_nxt=2318813843

>From the tcpdump trace at the receiver's interface I see that the last
log statement before the timeout corresponds exactly to one CWR packet
before the unACKed CWR packet. For example, in this case, the CWR
packet to which the indicated log statement corresponds has seqNo =
2318196995 and length 46336 ==> 2318196995 + 46336 = 2318243331, which
is exactly the rcv_nxt value of the indicated log statement. And then
unACKed CWR packet arrives and it is completely missing from the log
file, there is no indication of sending a delayed ACK either. Hence my
conclusion that the __tcp_ack_snd_check() function is never invoked by
the receiver upon receiving the unACKed CWR packet.

Sorry if that was long and verbose, I just wanted to be clear on what
I had done. Please do let me know if you have any questions though.

Thanks,
-Steve


On Tue, Dec 5, 2017 at 12:04 PM, Neal Cardwell <ncardwell@google.com> wrote:
> On Tue, Dec 5, 2017 at 2:36 PM, Steve Ibanez <sibanez@stanford.edu> wrote:
>> Hi Neal,
>>
>> I've included a link to small trace of 13 packets which is different
>> from the screenshot I attached in my last email, but shows the same
>> sequence of events. It's a bit hard to read the tcptrace due to the
>> 300ms timeout, so I figured this was the best approach.
>>
>> slice.pcap: https://drive.google.com/open?id=1hYXbUClHGbQv1hWG1HZWDO2WYf30N6G8
>
> Thanks for the trace! Attached is a screen shot (first screen shot is
> for the arriving packets with CWR; second is after the RTO). The
> sender behavior looks reasonable. I don't see why the receiver is not
> ACKing. As you say, it does look like a receiver bug. You could try
> adding instrumentation to try to isolate why the receiver is not
> sending an ACK immediately. You might instrument __tcp_ack_snd_check()
> and tcp_send_delayed_ack() so that when the most recent incoming
> packet had cwr set they printk to log what they are deciding in this
> case. Perhaps the tcp_send_delayed_ack()  code is hitting the max_ato
> = HZ / 2 code path?
>
> neal

^ permalink raw reply

* Re: [Patch net-next] net_sched: properly check for empty skb array on error path
From: John Fastabend @ 2017-12-19  4:54 UTC (permalink / raw)
  To: Cong Wang; +Cc: Linux Kernel Network Developers
In-Reply-To: <CAM_iQpU8EuhBvfFtf0t0TCS-XKLffuF+E6PBM7JS8Y5dh5sR8w@mail.gmail.com>

On 12/18/2017 08:31 PM, Cong Wang wrote:
> On Mon, Dec 18, 2017 at 7:58 PM, John Fastabend
> <john.fastabend@gmail.com> wrote:
>> On 12/18/2017 06:20 PM, Cong Wang wrote:
>>> On Mon, Dec 18, 2017 at 5:25 PM, John Fastabend
>>> <john.fastabend@gmail.com> wrote:
>>>> On 12/18/2017 02:34 PM, Cong Wang wrote:
>>>>> First, the check of &q->ring.queue against NULL is wrong, it
>>>>> is always false. We should check the value rather than the address.
>>>>>
>>>>
>>>> Thanks.
>>>>
>>>>> Secondly, we need the same check in pfifo_fast_reset() too,
>>>>> as both ->reset() and ->destroy() are called in qdisc_destroy().
>>>>>
>>>>
>>>> not that it hurts to have the check here, but if init fails
>>>> in qdisc_create it seems only ->destroy() is called without
>>>> a ->reset().
>>>>
>>>> Is there another path for init() to fail that I'm missing.
>>>
>>> Pretty sure ->reset() is called in qdisc_destroy() and also before
>>> ->destroy():
>>>
>>
>> Except, the failed init path does not call qdisc_destroy.
>>
>> static struct Qdisc *qdisc_create(struct net_device *dev,
>> [...]
>>
>>         if (ops->init) {
>>                 err = ops->init(sch, tca[TCA_OPTIONS]);
>>                 if (err != 0)
>>                         goto err_out5;
>>         }
>> [...]
>>
>> err_out5:
>>         /* ops->init() failed, we call ->destroy() like qdisc_create_dflt() */
>>         if (ops->destroy)
>>                 ops->destroy(sch);
> 
> Didn't I say qdisc_destroy() rather than ->destroy()? :-)
> 

Yep, thanks for the fix.

Acked-by: John Fastabend <john.fastabend@gmail.com>

^ permalink raw reply

* RE: [PATCH net-next] netdevsim: correctly check return value of debugfs_create_dir
From: Prashant Bhole @ 2017-12-19  4:45 UTC (permalink / raw)
  To: 'Jakub Kicinski', 'David Miller'; +Cc: netdev
In-Reply-To: <20171211104029.63635671@cakuba.netronome.com>


> From: Jakub Kicinski [mailto:jakub.kicinski@netronome.com]
> 
> On Mon, 11 Dec 2017 13:46:48 +0900, Prashant Bhole wrote:
> > > From: David Miller [mailto:davem@davemloft.net]
> > >
> > > From: Prashant Bhole <bhole_prashant_q7@lab.ntt.co.jp>
> > > Date: Fri,  8 Dec 2017 09:52:50 +0900
> > >
> > > > Return value is now checked with IS_ERROR_OR_NULL because
> > > > debugfs_create_dir doesn't return error value. It either returns
> > > > NULL or a valid pointer.
> > > >
> > > > Signed-off-by: Prashant Bhole <bhole_prashant_q7@lab.ntt.co.jp>
> > > > ---
> > > >  drivers/net/netdevsim/netdev.c | 2 +-
> > > >  1 file changed, 1 insertion(+), 1 deletion(-)
> > > >
> > > > diff --git a/drivers/net/netdevsim/netdev.c
> > > > b/drivers/net/netdevsim/netdev.c index eb8c679fca9f..88d8ee2c89da
> > > > 100644
> > > > --- a/drivers/net/netdevsim/netdev.c
> > > > +++ b/drivers/net/netdevsim/netdev.c
> > > > @@ -469,7 +469,7 @@ static int __init nsim_module_init(void)
> > > >  	int err;
> > > >
> > > >  	nsim_ddir = debugfs_create_dir(DRV_NAME, NULL);
> > > > -	if (IS_ERR(nsim_ddir))
> > > > +	if (IS_ERR_OR_NULL(nsim_ddir))
> > > >  		return PTR_ERR(nsim_ddir);
> > >
> > > debugfs_create_dir() should really be fixed, either it uses error
> > > pointers consistently and therefore always provides a suitable error
> > > code to return
> > or it
> > > always uses NULL.
> > >
> > > This in-between behavior makes using it as an interface painful
> > > because no
> > clear
> > > meaning is given to NULL.
> > >
> > > So please do the work necessary to make debugfs_create_dir()'s
> > > return semantics clearer and more useful.
> > >
> > > Thank you.
> >
> > Dave,
> > Thanks for comments. I will try to fix error handling in netdevsim
first.
> >
> > Jakub,
> > Let's decide with an example. The typical directory structure for
> > netdevsim interface is as below:
> > /sys/kernel/debug/netdevsim/sim0/bpf_bound_progs/
> > Please let me know if you are ok with following:
> >
> > 1) If debugfs_create_dir() fails in module_init, let's keep it fatal
> > error with corrected condition:
> > +	if (IS_ERR_OR_NULL(nsim_ddir))
> > +		return -ENOMEM;
> >
> > 2) In case sim0 or bpf_bound_progs are  fail to create, we need to add
> > checks before creating any file in them.
> 
> Fine with me, although if you fix DebugFS first you could use the real
error from
> the start here.

Jakub, Dave,
Sorry for late reply.
I tried to evaluate whether fixing return value of debugfs_create_dir() (and
friends) will be useful or not because it has not been changed since very
long time. Now I am not much convinced about changing this api. 

Important and possible error codes could be -EEXIST and -ENOMEM. Suppose
-EEXIST is returned, IMO the directory shouldn't exists in the first place
because it is specific to particular module. Also, there is no point in
creating file in such directory, because directory owner (creator) might
remove it too. This means there are less chances that api change will be
useful. Please let me know your opinion on it.

If you are ok with above explanation, shall I submit v2 for this patch?

-Prashant  

^ permalink raw reply


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox