Netdev List
 help / color / mirror / Atom feed
* [PATCH iproute2 -net-next] bpf: add initial support for attaching xdp progs
From: Daniel Borkmann @ 2016-12-06  1:21 UTC (permalink / raw)
  To: stephen; +Cc: alexei.starovoitov, netdev, Daniel Borkmann
In-Reply-To: <96d911b76235bec7fc7551eb227eb750ff8c5de1.1480986807.git.daniel@iogearbox.net>

Now that we made the BPF loader generic as a library, reuse it
for loading XDP programs as well. This basically adds a minimal
start of a facility for iproute2 to load XDP programs. There
currently only exists the xdp1_user.c sample code in the kernel
tree that sets up netlink directly and an iovisor/bcc front-end.

Since we have all the necessary infrastructure in place already
from tc side, we can just reuse its loader back-end and thus
facilitate migration and usability among the two for people
familiar with tc/bpf already. Sharing maps, performing tail calls,
etc works the same way as with tc. Naturally, once kernel
configuration API evolves, we will extend new features for XDP
here as well, resp. extend dumping of related netlink attributes.

Minimal example:

  clang -target bpf -O2 -Wall -c prog.c -o prog.o
  ip [-force] link set dev em1 xdp obj prog.o       # attaching
  ip [-d] link                                      # dumping
  ip link set dev em1 xdp off                       # detaching

For the dump, intention is that in the first line for each ip
link entry, we'll see "xdp" to indicate that this device has an
XDP program attached. Once we dump some more useful information
via netlink (digest, etc), idea is that 'ip -d link' will then
display additional relevant program information below the "link/
ether [...]" output line for such devices, for example.

Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Alexei Starovoitov <ast@kernel.org>
---
 include/bpf_api.h     |  5 +++
 include/bpf_elf.h     |  1 +
 include/utils.h       |  7 +++-
 ip/Makefile           |  2 +-
 ip/ipaddress.c        |  3 ++
 ip/iplink.c           | 22 +++++++-----
 ip/iplink_xdp.c       | 75 ++++++++++++++++++++++++++++++++++++++++
 ip/xdp.h              |  9 +++++
 lib/bpf.c             |  6 ++++
 man/man8/ip-link.8.in | 95 ++++++++++++++++++++++++++++++++++++++++++++++++++-
 10 files changed, 213 insertions(+), 12 deletions(-)
 create mode 100644 ip/iplink_xdp.c
 create mode 100644 ip/xdp.h

diff --git a/include/bpf_api.h b/include/bpf_api.h
index 7642623..72578c9 100644
--- a/include/bpf_api.h
+++ b/include/bpf_api.h
@@ -72,6 +72,11 @@
 	__section(__stringify(ID) "/" __stringify(KEY))
 #endif
 
+#ifndef __section_xdp_entry
+# define __section_xdp_entry						\
+	__section(ELF_SECTION_PROG)
+#endif
+
 #ifndef __section_cls_entry
 # define __section_cls_entry						\
 	__section(ELF_SECTION_CLASSIFIER)
diff --git a/include/bpf_elf.h b/include/bpf_elf.h
index 36cc988..239a0f3 100644
--- a/include/bpf_elf.h
+++ b/include/bpf_elf.h
@@ -15,6 +15,7 @@
 /* ELF section names, etc */
 #define ELF_SECTION_LICENSE	"license"
 #define ELF_SECTION_MAPS	"maps"
+#define ELF_SECTION_PROG	"prog"
 #define ELF_SECTION_CLASSIFIER	"classifier"
 #define ELF_SECTION_ACTION	"action"
 
diff --git a/include/utils.h b/include/utils.h
index 1b4f939..26c970d 100644
--- a/include/utils.h
+++ b/include/utils.h
@@ -239,7 +239,12 @@ ssize_t getcmdline(char **line, size_t *len, FILE *in);
 int makeargs(char *line, char *argv[], int maxargs);
 int inet_get_addr(const char *src, __u32 *dst, struct in6_addr *dst6);
 
-struct iplink_req;
+struct iplink_req {
+	struct nlmsghdr		n;
+	struct ifinfomsg	i;
+	char			buf[1024];
+};
+
 int iplink_parse(int argc, char **argv, struct iplink_req *req,
 		char **name, char **type, char **link, char **dev,
 		int *group, int *index);
diff --git a/ip/Makefile b/ip/Makefile
index 86c8cdc..c8e6c61 100644
--- a/ip/Makefile
+++ b/ip/Makefile
@@ -2,7 +2,7 @@ IPOBJ=ip.o ipaddress.o ipaddrlabel.o iproute.o iprule.o ipnetns.o \
     rtm_map.o iptunnel.o ip6tunnel.o tunnel.o ipneigh.o ipntable.o iplink.o \
     ipmaddr.o ipmonitor.o ipmroute.o ipprefix.o iptuntap.o iptoken.o \
     ipxfrm.o xfrm_state.o xfrm_policy.o xfrm_monitor.o \
-    iplink_vlan.o link_veth.o link_gre.o iplink_can.o \
+    iplink_vlan.o link_veth.o link_gre.o iplink_can.o iplink_xdp.o \
     iplink_macvlan.o ipl2tp.o link_vti.o link_vti6.o \
     iplink_vxlan.o tcp_metrics.o iplink_ipoib.o ipnetconf.o link_ip6tnl.o \
     link_iptnl.o link_gre6.o iplink_bond.o iplink_bond_slave.o iplink_hsr.o \
diff --git a/ip/ipaddress.c b/ip/ipaddress.c
index 50897e6..de64877 100644
--- a/ip/ipaddress.c
+++ b/ip/ipaddress.c
@@ -35,6 +35,7 @@
 #include "utils.h"
 #include "ll_map.h"
 #include "ip_common.h"
+#include "xdp.h"
 #include "color.h"
 
 enum {
@@ -838,6 +839,8 @@ int print_linkinfo(const struct sockaddr_nl *who,
 
 	if (tb[IFLA_MTU])
 		fprintf(fp, "mtu %u ", *(int *)RTA_DATA(tb[IFLA_MTU]));
+	if (tb[IFLA_XDP])
+		xdp_dump(fp, tb[IFLA_XDP]);
 	if (tb[IFLA_QDISC])
 		fprintf(fp, "qdisc %s ", rta_getattr_str(tb[IFLA_QDISC]));
 	if (tb[IFLA_MASTER]) {
diff --git a/ip/iplink.c b/ip/iplink.c
index 1e603e7..2638408 100644
--- a/ip/iplink.c
+++ b/ip/iplink.c
@@ -32,6 +32,7 @@
 #include "rt_names.h"
 #include "utils.h"
 #include "ip_common.h"
+#include "xdp.h"
 #include "namespace.h"
 
 #define IPLINK_IOCTL_COMPAT	1
@@ -54,6 +55,7 @@ void iplink_usage(void)
 			"                   [ numtxqueues QUEUE_COUNT ]\n"
 			"                   [ numrxqueues QUEUE_COUNT ]\n"
 			"                   type TYPE [ ARGS ]\n"
+			"\n"
 			"       ip link delete { DEVICE | dev DEVICE | group DEVGROUP } type TYPE [ ARGS ]\n"
 			"\n"
 			"       ip link set { DEVICE | dev DEVICE | group DEVGROUP }\n"
@@ -79,24 +81,28 @@ void iplink_usage(void)
 		"			  [ alias NAME ]\n"
 		"	                  [ vf NUM [ mac LLADDR ]\n"
 		"				   [ vlan VLANID [ qos VLAN-QOS ] [ proto VLAN-PROTO ] ]\n"
-
 		"				   [ rate TXRATE ]\n"
 		"				   [ max_tx_rate TXRATE ]\n"
 		"				   [ min_tx_rate TXRATE ]\n"
-
 		"				   [ spoofchk { on | off} ]\n"
 		"				   [ query_rss { on | off} ]\n"
 		"				   [ state { auto | enable | disable} ] ]\n"
 		"				   [ trust { on | off} ] ]\n"
+		"			  [ xdp { off |\n"
+		"				  object FILE [ section NAME ] [ verbose ] |\n"
+		"				  pinned FILE } ]\n"
 		"			  [ master DEVICE ][ vrf NAME ]\n"
 		"			  [ nomaster ]\n"
 		"			  [ addrgenmode { eui64 | none | stable_secret | random } ]\n"
 		"	                  [ protodown { on | off } ]\n"
+		"\n"
 		"       ip link show [ DEVICE | group GROUP ] [up] [master DEV] [vrf NAME] [type TYPE]\n");
 
 	if (iplink_have_newlink()) {
 		fprintf(stderr,
-			"       ip link help [ TYPE ]\n\n"
+			"\n"
+			"       ip link help [ TYPE ]\n"
+			"\n"
 			"TYPE := { vlan | veth | vcan | dummy | ifb | macvlan | macvtap |\n"
 			"          bridge | bond | team | ipoib | ip6tnl | ipip | sit | vxlan |\n"
 			"          gre | gretap | ip6gre | ip6gretap | vti | nlmon | team_slave |\n"
@@ -221,12 +227,6 @@ static int iplink_have_newlink(void)
 }
 #endif /* ! IPLINK_IOCTL_COMPAT */
 
-struct iplink_req {
-	struct nlmsghdr		n;
-	struct ifinfomsg	i;
-	char			buf[1024];
-};
-
 static int nl_get_ll_addr_len(unsigned int dev_index)
 {
 	int len;
@@ -602,6 +602,10 @@ int iplink_parse(int argc, char **argv, struct iplink_req *req,
 			if (get_integer(&mtu, *argv, 0))
 				invarg("Invalid \"mtu\" value\n", *argv);
 			addattr_l(&req->n, sizeof(*req), IFLA_MTU, &mtu, 4);
+		} else if (strcmp(*argv, "xdp") == 0) {
+			NEXT_ARG();
+			if (xdp_parse(&argc, &argv, req))
+				exit(-1);
 		} else if (strcmp(*argv, "netns") == 0) {
 			NEXT_ARG();
 			if (netns != -1)
diff --git a/ip/iplink_xdp.c b/ip/iplink_xdp.c
new file mode 100644
index 0000000..a81ed97
--- /dev/null
+++ b/ip/iplink_xdp.c
@@ -0,0 +1,75 @@
+/*
+ * iplink_xdp.c XDP program loader
+ *
+ *              This program is free software; you can redistribute it and/or
+ *              modify it under the terms of the GNU General Public License
+ *              as published by the Free Software Foundation; either version
+ *              2 of the License, or (at your option) any later version.
+ *
+ * Authors:     Daniel Borkmann <daniel@iogearbox.net>
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+
+#include <linux/bpf.h>
+
+#include "xdp.h"
+#include "bpf_util.h"
+
+extern int force;
+
+static void xdp_ebpf_cb(void *raw, int fd, const char *annotation)
+{
+	__u32 flags = !force ? XDP_FLAGS_UPDATE_IF_NOEXIST : 0;
+	struct iplink_req *req = raw;
+	struct rtattr *xdp;
+
+	xdp = addattr_nest(&req->n, sizeof(*req), IFLA_XDP);
+	addattr32(&req->n, sizeof(*req), IFLA_XDP_FD, fd);
+	addattr32(&req->n, sizeof(*req), IFLA_XDP_FLAGS, flags);
+	addattr_nest_end(&req->n, xdp);
+}
+
+static const struct bpf_cfg_ops bpf_cb_ops = {
+	.ebpf_cb = xdp_ebpf_cb,
+};
+
+static int xdp_delete(struct iplink_req *req)
+{
+	xdp_ebpf_cb(req, -1, NULL);
+	return 0;
+}
+
+int xdp_parse(int *argc, char ***argv, struct iplink_req *req)
+{
+	struct bpf_cfg_in cfg = {
+		.argc = *argc,
+		.argv = *argv,
+	};
+
+	if (*argc == 1) {
+		if (strcmp(**argv, "none") == 0 ||
+		    strcmp(**argv, "off") == 0)
+			return xdp_delete(req);
+	}
+	if (bpf_parse_common(BPF_PROG_TYPE_XDP, &cfg, &bpf_cb_ops, req))
+		return -1;
+
+	*argc = cfg.argc;
+	*argv = cfg.argv;
+	return 0;
+}
+
+void xdp_dump(FILE *fp, struct rtattr *xdp)
+{
+	struct rtattr *tb[IFLA_XDP_MAX + 1];
+
+	parse_rtattr_nested(tb, IFLA_XDP_MAX, xdp);
+	if (!tb[IFLA_XDP_ATTACHED] ||
+	    !rta_getattr_u8(tb[IFLA_XDP_ATTACHED]))
+		return;
+
+	fprintf(fp, "xdp ");
+	/* More to come here in future for 'ip -d link' (digest, etc) ... */
+}
diff --git a/ip/xdp.h b/ip/xdp.h
new file mode 100644
index 0000000..bc69645
--- /dev/null
+++ b/ip/xdp.h
@@ -0,0 +1,9 @@
+#ifndef __XDP__
+#define __XDP__
+
+#include "utils.h"
+
+int xdp_parse(int *argc, char ***argv, struct iplink_req *req);
+void xdp_dump(FILE *fp, struct rtattr *tb);
+
+#endif /* __XDP__ */
diff --git a/lib/bpf.c b/lib/bpf.c
index f714993..0d7e783 100644
--- a/lib/bpf.c
+++ b/lib/bpf.c
@@ -55,6 +55,7 @@ struct bpf_prog_meta {
 static const enum bpf_prog_type __bpf_types[] = {
 	BPF_PROG_TYPE_SCHED_CLS,
 	BPF_PROG_TYPE_SCHED_ACT,
+	BPF_PROG_TYPE_XDP,
 };
 
 static const struct bpf_prog_meta __bpf_prog_meta[] = {
@@ -70,6 +71,11 @@ static const struct bpf_prog_meta __bpf_prog_meta[] = {
 		.section	= ELF_SECTION_ACTION,
 		.may_uds_export	= true,
 	},
+	[BPF_PROG_TYPE_XDP] = {
+		.type		= "xdp",
+		.subdir		= "xdp",
+		.section	= ELF_SECTION_PROG,
+	},
 };
 
 static const char *bpf_prog_to_subdir(enum bpf_prog_type type)
diff --git a/man/man8/ip-link.8.in b/man/man8/ip-link.8.in
index 18e9417..469bb43 100644
--- a/man/man8/ip-link.8.in
+++ b/man/man8/ip-link.8.in
@@ -126,6 +126,19 @@ ip-link \- network device configuration
 .RB "[ " port_guid " eui64 ] ]"
 .br
 .in -9
+.RB "[ " xdp  " { " off " | "
+.br
+.in +8
+.BR object
+.IR FILE
+.RB "[ " section
+.IR NAME " ]"
+.RB "[ " verbose " ] |"
+.br
+.BR pinned
+.IR FILE " } ]"
+.br
+.in -8
 .RB "[ " master
 .IR DEVICE " ]"
 .br
@@ -1319,6 +1332,60 @@ which may impact security and/or performance. (e.g. VF multicast promiscuous mod
 .in -8
 
 .TP
+.B xdp object "|" pinned "|" off
+set (or unset) a XDP ("express data path") BPF program to run on every
+packet at driver level.
+
+.B off
+(or
+.B none
+)
+- Detaches any currently attached XDP/BPF program from the given device.
+
+.BI object " FILE "
+- Attaches a XDP/BPF program to the given device. The
+.I FILE
+points to a BPF ELF file (f.e. generated by LLVM) that contains the BPF
+program code, map specifications, etc. If a XDP/BPF program is already
+attached to the given device, an error will be thrown. If no XDP/BPF
+program is currently attached, the device supports XDP and the program
+from the BPF ELF file passes the kernel verifier, then it will be attached
+to the device. If the option
+.I -force
+is passed to
+.B ip
+then any prior attached XDP/BPF program will be atomically overridden and
+no error will be thrown in this case. If no
+.B section
+option is passed, then the default section name ("prog") will be assumed,
+otherwise the provided section name will be used. If no
+.B verbose
+option is passed, then a verifier log will only be dumped on load error.
+See also
+.B EXAMPLES
+section for usage examples.
+
+.BI section " NAME "
+- Specifies a section name that contains the BPF program code. If no section
+name is specified, the default one ("prog") will be used. This option is
+to be passed with the
+.B object
+option.
+
+.BI verbose
+- Act in verbose mode. For example, even in case of success, this will
+print the verifier log in case a program was loaded from a BPF ELF file.
+
+.BI pinned " FILE "
+- Attaches a XDP/BPF program to the given device. The
+.I FILE
+points to an already pinned BPF program in the BPF file system. The option
+.B section
+doesn't apply here, but otherwise semantics are the same as with the option
+.B object
+described already.
+
+.TP
 .BI master " DEVICE"
 set master device of the device (enslave device).
 
@@ -1604,7 +1671,33 @@ encap-dport 5555 encap-csum encap-remcsum
 .RS 4
 Creates an IPIP that is encapsulated with Generic UDP Encapsulation,
 and the outer UDP checksum and remote checksum offload are enabled.
-
+.RE
+.PP
+ip link set dev eth0 xdp obj prog.o
+.RS 4
+Attaches a XDP/BPF program to device eth0, where the program is
+located in prog.o, section "prog" (default section). In case a
+XDP/BPF program is already attached, throw an error.
+.RE
+.PP
+ip -force link set dev eth0 xdp obj prog.o sec foo
+.RS 4
+Attaches a XDP/BPF program to device eth0, where the program is
+located in prog.o, section "foo". In case a XDP/BPF program is
+already attached, it will be overridden by the new one.
+.RE
+.PP
+ip -force link set dev eth0 xdp pinned /sys/fs/bpf/foo
+.RS 4
+Attaches a XDP/BPF program to device eth0, where the program was
+previously pinned as an object node into BPF file system under
+name foo.
+.RE
+.PP
+ip link set dev eth0 xdp off
+.RS 4
+If a XDP/BPF program is attached on device eth0, detach it and
+effectively turn off XDP for device eth0.
 .RE
 .PP
 ip link add link wpan0 lowpan0 type lowpan
-- 
1.9.3

^ permalink raw reply related

* [PATCH iproute2 -net-next] bpf: check for owner_prog_type and notify users when differ
From: Daniel Borkmann @ 2016-12-06  1:17 UTC (permalink / raw)
  To: stephen; +Cc: alexei.starovoitov, netdev, Daniel Borkmann

Kernel commit 21116b7068b9 ("bpf: add owner_prog_type and accounted mem
to array map's fdinfo") added support for telling the owner prog type in
case of prog arrays. Give a notification to the user when they differ,
and the program eventually fails to load.

Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Alexei Starovoitov <ast@kernel.org>
---
 lib/bpf.c | 18 ++++++++++++++----
 1 file changed, 14 insertions(+), 4 deletions(-)

diff --git a/lib/bpf.c b/lib/bpf.c
index 8a5b84b..f714993 100644
--- a/lib/bpf.c
+++ b/lib/bpf.c
@@ -273,11 +273,11 @@ static void bpf_map_pin_report(const struct bpf_elf_map *pin,
 }
 
 static int bpf_map_selfcheck_pinned(int fd, const struct bpf_elf_map *map,
-				    int length)
+				    int length, enum bpf_prog_type type)
 {
 	char file[PATH_MAX], buff[4096];
 	struct bpf_elf_map tmp = {}, zero = {};
-	unsigned int val;
+	unsigned int val, owner_type = 0;
 	FILE *fp;
 
 	snprintf(file, sizeof(file), "/proc/%d/fdinfo/%d", getpid(), fd);
@@ -299,10 +299,19 @@ static int bpf_map_selfcheck_pinned(int fd, const struct bpf_elf_map *map,
 			tmp.max_elem = val;
 		else if (sscanf(buff, "map_flags:\t%i", &val) == 1)
 			tmp.flags = val;
+		else if (sscanf(buff, "owner_prog_type:\t%i", &val) == 1)
+			owner_type = val;
 	}
 
 	fclose(fp);
 
+	/* The decision to reject this is on kernel side eventually, but
+	 * at least give the user a chance to know what's wrong.
+	 */
+	if (owner_type && owner_type != type)
+		fprintf(stderr, "Program array map owner types differ: %u (obj) != %u (pin)\n",
+			type, owner_type);
+
 	if (!memcmp(&tmp, map, length)) {
 		return 0;
 	} else {
@@ -818,7 +827,8 @@ int bpf_graft_map(const char *map_path, uint32_t *key, int argc, char **argv)
 	}
 
 	ret = bpf_map_selfcheck_pinned(map_fd, &test,
-				       offsetof(struct bpf_elf_map, max_elem));
+				       offsetof(struct bpf_elf_map, max_elem),
+				       type);
 	if (ret < 0) {
 		fprintf(stderr, "Map \'%s\' self-check failed!\n", map_path);
 		goto out_map;
@@ -1300,7 +1310,7 @@ static int bpf_map_attach(const char *name, const struct bpf_elf_map *map,
 	if (fd > 0) {
 		ret = bpf_map_selfcheck_pinned(fd, map,
 					       offsetof(struct bpf_elf_map,
-							id));
+							id), ctx->type);
 		if (ret < 0) {
 			close(fd);
 			fprintf(stderr, "Map \'%s\' self-check failed!\n",
-- 
1.9.3

^ permalink raw reply related

* Re: [net-next][PATCH 02/18] RDS: mark few internal functions static to make sparse build happy
From: Santosh Shilimkar @ 2016-12-06  1:17 UTC (permalink / raw)
  To: Sergei Shtylyov, netdev, davem; +Cc: linux-kernel
In-Reply-To: <604d4954-7302-018e-6f9b-71c0aea6509e@cogentembedded.com>

On 12/5/2016 1:45 AM, Sergei Shtylyov wrote:
> Hello!
>
> On 12/5/2016 9:57 AM, Santosh Shilimkar wrote:
>
[...]

>> -void rds_walk_conn_path_info(struct socket *sock, unsigned int len,
>> +static void rds_walk_conn_path_info(struct socket *sock, unsigned int
>> len,
>>                   struct rds_info_iterator *iter,
>>                   struct rds_info_lengths *lens,
>>                   int (*visitor)(struct rds_conn_path *, void *),
>
>    You now need to realign the continuation lines.
>
Right. Will fix that. Thanks !!

Regards,
Santosh

^ permalink raw reply

* Re: [PATCH net-next] net: remove abuse of VLAN DEI/CFI bit
From: Ben Pfaff @ 2016-12-06  0:59 UTC (permalink / raw)
  To: Michał Mirosław
  Cc: dev-yBygre7rU0TnMu66kgdUjQ, netdev-u79uwXL29TY76Z2rM5mHXA,
	bridge-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA
In-Reply-To: <20161205225247.e3dd6dcw3ryjjlp2-CoA6ZxLDdyEEUmgCuDUIdw@public.gmane.org>

On Mon, Dec 05, 2016 at 11:52:47PM +0100, Michał Mirosław wrote:
> On Mon, Dec 05, 2016 at 10:55:45AM -0800, Ben Pfaff wrote:
> > On Mon, Dec 05, 2016 at 06:24:36PM +0100, Michał Mirosław wrote:
> > > On Sat, Dec 03, 2016 at 03:27:30PM -0800, Ben Pfaff wrote:
> > > > On Sat, Dec 03, 2016 at 10:22:28AM +0100, Michał Mirosław wrote:
> > > > > This All-in-one patch removes abuse of VLAN CFI bit, so it can be passed
> > > > > intact through linux networking stack.
> > > > This appears to change the established Open vSwitch userspace API.  You
> > > > can see that simply from the way that it changes the documentation for
> > > > the userspace API.  If I'm right about that, then this change will break
> > > > all userspace programs that use the Open vSwitch kernel module,
> > > > including Open vSwitch itself.
> > > 
> > > If I understood the code correctly, it does change expected meaning for
> > > the (unlikely?) case of header truncated just before the VLAN TCI - it will
> > > be impossible to differentiate this case from the VLAN TCI == 0.
> > > 
> > > I guess this is a problem with OVS API, because it doesn't directly show
> > > the "missing" state of elements, but relies on an "invalid" value.
> > 
> > That particular corner case should not be a huge problem in any case.
> > 
> > The real problem is that this appears to break the common case use of
> > VLANs in Open vSwitch.  After this patch, parse_vlan() in
> > net/openvswitch/flow.c copies the tpid and tci from sk_buff (either the
> > accelerated version of them or the version in the skb data) into
> > sw_flow_key members.  OK, that's fine on it's own.  However, I don't see
> > any corresponding change to the code in flow_netlink.c to compensate for
> > the fact that, until now, the VLAN CFI bit (formerly VLAN_TAG_PRESENT)
> > was always required to be set to 1 in flow matches inside Netlink
> > messages sent from userspace, and the kernel always set it to 1 in
> > corresponding messages sent to userspace.
> > 
> > In other words, if I'm reading this change correctly:
> > 
> >     * With a kernel before this change, userspace always had to set
> >       VLAN_TAG_PRESENT to 1 to match on a VLAN, or the kernel would
> >       reject the flow match.
> > 
> >     * With a kernel after this change, userspace must not set
> >       VLAN_TAG_PRESENT to 1, otherwise the kernel will accept the flow
> >       match but nothing will ever match because packets do not actually
> >       have the CFI bit set.
> > 
> > Take a look at this code that the patch deletes from
> > validate_vlan_from_nlattrs(), for example, and see how it insisted that
> > VLAN_TAG_PRESENT was set:
> > 
> > 	if (!(tci & htons(VLAN_TAG_PRESENT))) {
> > 		if (tci) {
> > 			OVS_NLERR(log, "%s TCI does not have VLAN_TAG_PRESENT bit set.",
> > 				  (inner) ? "C-VLAN" : "VLAN");
> > 			return -EINVAL;
> > 		} else if (nla_len(a[OVS_KEY_ATTR_ENCAP])) {
> > 			/* Corner case for truncated VLAN header. */
> > 			OVS_NLERR(log, "Truncated %s header has non-zero encap attribute.",
> > 				  (inner) ? "C-VLAN" : "VLAN");
> > 			return -EINVAL;
> > 		}
> > 	}
> > 
> > Please let me know if I'm overlooking something.
> 
> Hmm. So the easiest change without disrupting current userspace, would be
> to flip the CFI bit on the way to/from OVS userspace. Does this seem
> correct?

That sounds correct.  (The bit should not be flipped in the mask.)

Thanks,

Ben.
_______________________________________________
dev mailing list
dev@openvswitch.org
https://mail.openvswitch.org/mailman/listinfo/ovs-dev

^ permalink raw reply

* Re: [patch net v3] net: fec: fix compile with CONFIG_M5272
From: David Miller @ 2016-12-06  0:56 UTC (permalink / raw)
  To: nikita.yoush
  Cc: fugang.duan, troy.kisky, andrew, eric, tremyfr, johannes, netdev,
	cphealy, fabio.estevam, linux-kernel
In-Reply-To: <1480959661-29834-1-git-send-email-nikita.yoush@cogentembedded.com>

From: Nikita Yushchenko <nikita.yoush@cogentembedded.com>
Date: Mon,  5 Dec 2016 20:41:01 +0300

> Commit 4dfb80d18d05 ("net: fec: cache statistics while device is down")
> introduced unconditional statistics-related actions.

I do not see this commit in any of my trees:

[davem@localhost net-next]$ git describe 4dfb80d18d05
fatal: Not a valid object name 4dfb80d18d05
[davem@localhost net-next]$ cd ../net
[davem@localhost net]$ git describe 4dfb80d18d05
fatal: Not a valid object name 4dfb80d18d05
[davem@localhost net]$

^ permalink raw reply

* Re: [PATCH net-next 4/7] liquidio CN23XX: VF scatter gather lists
From: David Miller @ 2016-12-06  0:48 UTC (permalink / raw)
  To: rvatsavayi
  Cc: netdev, raghu.vatsavayi, derek.chickles, satananda.burla,
	felix.manlunas
In-Reply-To: <1480929318-8511-5-git-send-email-rvatsavayi@caviumnetworks.com>

From: Raghu Vatsavayi <rvatsavayi@caviumnetworks.com>
Date: Mon, 5 Dec 2016 01:15:15 -0800

> +	kfree((void *)lio->glist);
> +	kfree((void *)lio->glist_lock);
> +}
 ...
> +	if (!lio->glist) {
> +		kfree((void *)lio->glist_lock);
> +		return 1;
> +	}

These void casts are unnecessary, please remove them.

^ permalink raw reply

* Re: [v3 PATCH] netlink: Do not schedule work from sk_destruct
From: David Miller @ 2016-12-06  0:44 UTC (permalink / raw)
  To: herbert
  Cc: xiyou.wangcong, andreyknvl, johannes.berg, fw, edumazet, me, tom,
	decot, netdev, linux-kernel
In-Reply-To: <20161205072820.GB10204@gondor.apana.org.au>

From: Herbert Xu <herbert@gondor.apana.org.au>
Date: Mon, 5 Dec 2016 15:28:21 +0800

> It is wrong to schedule a work from sk_destruct using the socket
> as the memory reserve because the socket will be freed immediately
> after the return from sk_destruct.
> 
> Instead we should do the deferral prior to sk_free.
> 
> This patch does just that.
> 
> Fixes: 707693c8a498 ("netlink: Call cb->done from a worker thread")
> Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>

Applied, thanks Herbert.

^ permalink raw reply

* Re: [RESEND][PATCH v4] cgroup: Use CAP_SYS_RESOURCE to allow a process to migrate other tasks between cgroups
From: Andy Lutomirski @ 2016-12-06  0:36 UTC (permalink / raw)
  To: John Stultz
  Cc: Alexei Starovoitov, Andy Lutomirski, Mickaël Salaün,
	Daniel Mack, David S. Miller, kafai-b10kYP2dOMg, Florian Westphal,
	Harald Hoyer, Network Development, Sargun Dhillon,
	Pablo Neira Ayuso, lkml, Tejun Heo, Li Zefan, Jonathan Corbet,
	open list:CONTROL GROUP (CGROUP), Android Kernel Team,
	Rom Lemarchand, Colin Cross, Dmitry Shmidt
In-Reply-To: <CALAqxLW-mq4Lnudtn5KMBPdzBTg5bhTXV9QmUC2vfabVru+fUA-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>

On Mon, Dec 5, 2016 at 4:28 PM, John Stultz <john.stultz-QSEj5FYQhm4dnm+yROfE0A@public.gmane.org> wrote:
> On Tue, Nov 22, 2016 at 4:57 PM, John Stultz <john.stultz-QSEj5FYQhm4dnm+yROfE0A@public.gmane.org> wrote:
>> On Tue, Nov 8, 2016 at 4:12 PM, Andy Lutomirski <luto-kltTT9wpgjJwATOyAt5JVQ@public.gmane.org> wrote:
>>> On Tue, Nov 8, 2016 at 4:03 PM, Alexei Starovoitov
>>> <alexei.starovoitov-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org> wrote:
>>>> On Tue, Nov 08, 2016 at 03:51:40PM -0800, Andy Lutomirski wrote:
>>>>>
>>>>> I hate to say it, but I think I may see a problem.  Current
>>>>> developments are afoot to make cgroups do more than resource control.
>>>>> For example, there's Landlock and there's Daniel's ingress/egress
>>>>> filter thing.  Current cgroup controllers can mostly just DoS their
>>>>> controlled processes.  These new controllers (or controller-like
>>>>> things) can exfiltrate data and change semantics.
>>>>>
>>>>> Does anyone have a security model in mind for these controllers and
>>>>> the cgroups that they're attached to?  I'm reasonably confident that
>>>>> CAP_SYS_RESOURCE is not the answer...
>>>>
>>>> and specifically the answer is... ?
>>>> Also would be great if you start with specifying the question first
>>>> and the problem you're trying to solve.
>>>>
>>>
>>> I don't have a good answer right now.  Here are some constraints, though:
>>>
>>> 1. An insufficiently privileged process should not be able to move a
>>> victim into a dangerous cgroup.
>>>
>>> 2. An insufficiently privileged process should not be able to move
>>> itself into a dangerous cgroup and then use execve to gain privilege
>>> such that the execve'd program can be compromised.
>>>
>>> 3. An insufficiently privileged process should not be able to make an
>>> existing cgroup dangerous in a way that could compromise a victim in
>>> that cgroup.
>>>
>>> 4. An insufficiently privileged process should not be able to make a
>>> cgroup dangerous in a way that bypasses protections that would
>>> otherwise protect execve() as used by itself or some other process in
>>> that cgroup.
>>>
>>> Keep in mind that "dangerous" may apply to a cgroup's descendents in
>>> addition to the cgroup being controlled.
>>
>> Sorry for taking awhile to get back to you here.  I'm a little
>> befuddled as to what next steps I should consider (and honestly, I'm
>> not totally sure I really grok your concern here, particularly what
>> you mean with "dangrous cgroups").
>>
>> So is going back to the CAP_CGROUP_MIGRATE approach (to properly
>> separate "sufficiently" from "insufficiently privileged") better?
>>
>> Or something closer to the original method Android used of each cgroup
>> having an allow_attach() check which could determine what is
>> sufficiently privledged for the respective level of danger the cgroup
>> might poise?
>>
>> Or just stepping back, what method would you imagine to be reasonable
>> to allow a specified task to migrate other tasks between cgroups
>> without it having to be root/suid?
>
> Any suggested feedback here?

I really don't know.  The cgroupfs interface is a bit unfortunate in
that it doesn't really express the constraints.  To safely migrate a
task, ISTM you ought to have some form of privilege over the task
*and* some form of privilege over the cgroup.  cgroupfs only handles
the latter.

CAP_CGROUP_MIGRATE ought to be okay.  Or maybe cgroupfs needs to gain
a concept of "dangerous" cgroups and further restrict them and
CAP_SYS_RESOURCE should be fine for non-dangerous cgroups?  I think I
favor the latter, but it might be nice to hear from Tejun first.

--Andy

^ permalink raw reply

* Re: [RESEND][PATCH v4] cgroup: Use CAP_SYS_RESOURCE to allow a process to migrate other tasks between cgroups
From: John Stultz @ 2016-12-06  0:28 UTC (permalink / raw)
  To: Andy Lutomirski
  Cc: Alexei Starovoitov, Andy Lutomirski, Mickaël Salaün,
	Daniel Mack, David S. Miller, kafai, Florian Westphal,
	Harald Hoyer, Network Development, Sargun Dhillon,
	Pablo Neira Ayuso, lkml, Tejun Heo, Li Zefan, Jonathan Corbet,
	open list:CONTROL GROUP (CGROUP), Android Kernel Team,
	Rom Lemarchand, Colin Cross, Dmitry Shmidt
In-Reply-To: <CALAqxLXk+A9=A0oXGEduphXOQdbMg-wuGxmLkEk9cPHyn44X5Q@mail.gmail.com>

On Tue, Nov 22, 2016 at 4:57 PM, John Stultz <john.stultz@linaro.org> wrote:
> On Tue, Nov 8, 2016 at 4:12 PM, Andy Lutomirski <luto@amacapital.net> wrote:
>> On Tue, Nov 8, 2016 at 4:03 PM, Alexei Starovoitov
>> <alexei.starovoitov@gmail.com> wrote:
>>> On Tue, Nov 08, 2016 at 03:51:40PM -0800, Andy Lutomirski wrote:
>>>>
>>>> I hate to say it, but I think I may see a problem.  Current
>>>> developments are afoot to make cgroups do more than resource control.
>>>> For example, there's Landlock and there's Daniel's ingress/egress
>>>> filter thing.  Current cgroup controllers can mostly just DoS their
>>>> controlled processes.  These new controllers (or controller-like
>>>> things) can exfiltrate data and change semantics.
>>>>
>>>> Does anyone have a security model in mind for these controllers and
>>>> the cgroups that they're attached to?  I'm reasonably confident that
>>>> CAP_SYS_RESOURCE is not the answer...
>>>
>>> and specifically the answer is... ?
>>> Also would be great if you start with specifying the question first
>>> and the problem you're trying to solve.
>>>
>>
>> I don't have a good answer right now.  Here are some constraints, though:
>>
>> 1. An insufficiently privileged process should not be able to move a
>> victim into a dangerous cgroup.
>>
>> 2. An insufficiently privileged process should not be able to move
>> itself into a dangerous cgroup and then use execve to gain privilege
>> such that the execve'd program can be compromised.
>>
>> 3. An insufficiently privileged process should not be able to make an
>> existing cgroup dangerous in a way that could compromise a victim in
>> that cgroup.
>>
>> 4. An insufficiently privileged process should not be able to make a
>> cgroup dangerous in a way that bypasses protections that would
>> otherwise protect execve() as used by itself or some other process in
>> that cgroup.
>>
>> Keep in mind that "dangerous" may apply to a cgroup's descendents in
>> addition to the cgroup being controlled.
>
> Sorry for taking awhile to get back to you here.  I'm a little
> befuddled as to what next steps I should consider (and honestly, I'm
> not totally sure I really grok your concern here, particularly what
> you mean with "dangrous cgroups").
>
> So is going back to the CAP_CGROUP_MIGRATE approach (to properly
> separate "sufficiently" from "insufficiently privileged") better?
>
> Or something closer to the original method Android used of each cgroup
> having an allow_attach() check which could determine what is
> sufficiently privledged for the respective level of danger the cgroup
> might poise?
>
> Or just stepping back, what method would you imagine to be reasonable
> to allow a specified task to migrate other tasks between cgroups
> without it having to be root/suid?

Any suggested feedback here?

thanks
-john

^ permalink raw reply

* Re: [PATCH nf-next] netfilter: xt_bpf: support ebpf
From: Florian Westphal @ 2016-12-06  0:20 UTC (permalink / raw)
  To: Willem de Bruijn
  Cc: Willem de Bruijn, Pablo Neira Ayuso, netfilter-devel,
	Network Development, Daniel Borkmann, Florian Westphal,
	Eric Dumazet
In-Reply-To: <CAF=yD-J-vcQQ_-48xG8qhMuYvLvucCr7SG99rsfJA5-okvhHzA@mail.gmail.com>

Willem de Bruijn <willemdebruijn.kernel@gmail.com> wrote:
> While we're discussing the patch, another question, about revisions: I
> tested both modified and original iptables binaries on both standard
> and modified kernels. It all works as expected, except for the case
> where both binaries are used on a single kernel. For instance:
> 
>   iptables -A OUTPUT -m bpf --bytecode "`./nfbpf_compile RAW 'udp port
> 8000'`" -j LOG
>   ./iptables.new -L
> 
> Here the new binary will interpret the object as xt_bpf_match_v1, but
> iptables has inserted xt_bpf_match. The same problem happens the other
> way around. A new binary can be made robust to detect old structs, but
> not the other way around. Specific to bpf, the existing xt_bpf code
> has an unfortunate bug that it always prints at least one line of
> code, even if ->bpf_program_num_elems == 0.
> 
> I notice that other extensions also do not necessarily only extend
> struct vN in vN+1. Is the above a known issue?

Yes, I guess noone ever bothered to fix this.

The kernel blob should contain the match/target revision number,
so userspace can in fact see that 'this is bpf v42', but iirc
the netfilter userspace just loads the highest userspace revision
supported by the kernel (which is then different for the 2 iptables
binaries).

But we *could* display message like 'kernel uses revision 2 but I can
only find 0 and 1' or fall back to the lower supported revision without
guess-the-struct-by-size games.

^ permalink raw reply

* Oops with CONFIG_VMAP_STCK and bond device + virtio-net
From: Laura Abbott @ 2016-12-05 23:53 UTC (permalink / raw)
  To: Michael S. Tsirkin, Jason Wang
  Cc: zbyszek, virtualization, netdev, Linux Kernel Mailing List

Hi,

Fedora got a bug report https://bugzilla.redhat.com/show_bug.cgi?id=1401612
In qemu with two virtio-net interfaces:

$ ip l
...
5: ens14: <BROADCAST,MULTICAST> mtu 1500 qdisc noop state DOWN mode DEFAULT group default qlen 1000
    link/ether 52:54:00:e9:64:41 brd ff:ff:ff:ff:ff:ff
6: ens15: <BROADCAST,MULTICAST> mtu 1500 qdisc noop state DOWN mode DEFAULT group default qlen 1000
    link/ether 52:54:00:e9:64:42 brd ff:ff:ff:ff:ff:ff

$ sudo ip link add bond1 type bond
$ sudo ip link set ens14 master bond1
Segmentation fault

 ------------[ cut here ]------------
 kernel BUG at ./include/linux/scatterlist.h:140!
 invalid opcode: 0000 [#1] SMP
 Modules linked in: bonding ip6t_rpfilter ip6t_REJECT nf_reject_ipv6 xt_conntrack ip
  ata_generic crc32c_intel qxl drm_kms_helper virtio_pci serio_raw ttm drm pata_acpi
 CPU: 5 PID: 1983 Comm: ip Not tainted 4.9.0-0.rc6.git2.1.fc26.x86_64 #1
 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 0.0.0 02/06/2015
 task: ffff9d50a3583240 task.stack: ffffb06e41040000
 RIP: 0010:[<ffffffffbc4896fc>]  [<ffffffffbc4896fc>] sg_init_one+0x8c/0xa0
 RSP: 0018:ffffb06e41043698  EFLAGS: 00010246
 RAX: 0000000000000000 RBX: ffffb06e41043774 RCX: 0000000000000028
 RDX: 0000131ec1043774 RSI: 0000000000000013 RDI: ffffb06ec1043774
 RBP: ffffb06e410436b0 R08: 00000000001ddbe0 R09: ffffb06e410436c8
 R10: 0000000000000001 R11: 0000000000000000 R12: 0000000000000006
 R13: ffffb06e410436c8 R14: ffff9d50b2dc1800 R15: ffff9d50b3db9600
 FS:  00007f15347e5700(0000) GS:ffff9d50bb000000(0000) knlGS:0000000000000000
 CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
 CR2: 00007ffc09bc4000 CR3: 0000000135797000 CR4: 00000000000406e0
 Stack:
  ffff9d50b229d000 0000000000000000 ffffb06e41043772 ffffb06e41043720
  ffffffffc0051123 ffff9d50a3583240 0000000087654321 0000000000000002
  0000000000000000 0000000000000000 0000000000000000 000000007b8f5301
 Call Trace:
  [<ffffffffc0051123>] virtnet_set_mac_address+0xb3/0x140 [virtio_net]
  [<ffffffffbc7ae305>] dev_set_mac_address+0x55/0xc0
  [<ffffffffc03f319e>] bond_enslave+0x34e/0x1180 [bonding]
  [<ffffffffbc7ca22f>] do_setlink+0x6cf/0xd10
  [<ffffffffbc20dd6a>] ? get_page_from_freelist+0x6ba/0xca0
  [<ffffffffbc037de9>] ? sched_clock+0x9/0x10
  [<ffffffffbc068475>] ? kvm_sched_clock_read+0x25/0x40
  [<ffffffffbc111ed6>] ? __lock_acquire+0x346/0x1290
  [<ffffffffbc4aa436>] ? nla_parse+0xa6/0x120
  [<ffffffffbc7ce9e8>] rtnl_newlink+0x5c8/0x870
  [<ffffffffbc3ecb32>] ? avc_has_perm_noaudit+0x32/0x210
  [<ffffffffbc0bbfca>] ? ns_capable_common+0x7a/0x90
  [<ffffffffbc0bbff3>] ? ns_capable+0x13/0x20
  [<ffffffffbc7ced76>] rtnetlink_rcv_msg+0xe6/0x210
  [<ffffffffbc7c951b>] ? rtnetlink_rcv+0x1b/0x40
  [<ffffffffbc7c951b>] ? rtnetlink_rcv+0x1b/0x40
  [<ffffffffbc7cec90>] ? rtnl_newlink+0x870/0x870
  [<ffffffffbc7f7394>] netlink_rcv_skb+0xa4/0xc0
  [<ffffffffbc7c952a>] rtnetlink_rcv+0x2a/0x40
  [<ffffffffbc7f6d07>] netlink_unicast+0x1f7/0x2f0
  [<ffffffffbc7f6c7f>] ? netlink_unicast+0x16f/0x2f0
  [<ffffffffbc7f7102>] netlink_sendmsg+0x302/0x3c0
  [<ffffffffbc790c28>] sock_sendmsg+0x38/0x50
  [<ffffffffbc791773>] ___sys_sendmsg+0x2e3/0x2f0
  [<ffffffffbc18830d>] ? __audit_syscall_entry+0xad/0xf0
  [<ffffffffbc068475>] ? kvm_sched_clock_read+0x25/0x40
  [<ffffffffbc037de9>] ? sched_clock+0x9/0x10
  [<ffffffffbc18830d>] ? __audit_syscall_entry+0xad/0xf0
  [<ffffffffbc18830d>] ? __audit_syscall_entry+0xad/0xf0
  [<ffffffffbc111775>] ? trace_hardirqs_on_caller+0xf5/0x1b0
  [<ffffffffbc7924b4>] __sys_sendmsg+0x54/0x90
  [<ffffffffbc792502>] SyS_sendmsg+0x12/0x20
  [<ffffffffbc003eec>] do_syscall_64+0x6c/0x1f0
  [<ffffffffbc917589>] entry_SYSCALL64_slow_path+0x25/0x25
 Code: ca 75 2c 49 8b 55 08 f6 c2 01 75 25 83 e2 03 81 e3 ff 0f 00 00 45 89 65 14 48
 RIP  [<ffffffffbc4896fc>] sg_init_one+0x8c/0xa0
  RSP <ffffb06e41043698>
 ---[ end trace 9076d2284efbf735 ]---

This looks like an issue with CONFIG_VMAP_STACK since bond_enslave uses
struct sockaddr from the stack and virtnet_set_mac_address calls
sg_init_one which triggers BUG_ON(!virt_addr_valid(buf));

I know there have been a lot of CONFIG_VMAP_STACK fixes around but I
didn't find this one reported yet.

Thanks,
Laura

^ permalink raw reply

* Re: wl1251 & mac address & calibration data
From: Tony Lindgren @ 2016-12-05 23:51 UTC (permalink / raw)
  To: Pali Rohár
  Cc: Aaro Koskinen, Sebastian Reichel, Pavel Machek, Michal Kazior,
	Kalle Valo, Ivaylo Dimitrov, linux-wireless, Network Development,
	linux-kernel
In-Reply-To: <201611261820.47995@pali>

* Pali Rohár <pali.rohar@gmail.com> [161126 09:21]:
> On Thursday 24 November 2016 19:46:01 Aaro Koskinen wrote:
> > Hi,
> > 
> > On Thu, Nov 24, 2016 at 04:20:45PM +0100, Pali Rohár wrote:
> > > Proprietary, signed and closed bootloader NOLO does not support DT.
> > > So for booting you need to append DTS file to kernel image.
> > > 
> > > U-Boot is optional and can be used as intermediate bootloader
> > > between NOLO and kernel. But still it has problems with reading
> > > from nand, so cannot read NVS data nor MAC address.
> > 
> > You could use kexec to pass the fixed DT.
> > 
> > A.
> 
> IIRC it was broken for N900/omap3, no idea if somebody fixed it.

FYI, at least in next-20161205 kexec works on omap3 for me.

Regards,

Tony

^ permalink raw reply

* Re: [PATCH nf-next] netfilter: xt_bpf: support ebpf
From: Willem de Bruijn @ 2016-12-05 23:51 UTC (permalink / raw)
  To: Willem de Bruijn
  Cc: Pablo Neira Ayuso, netfilter-devel, Network Development,
	Daniel Borkmann, Florian Westphal, Eric Dumazet
In-Reply-To: <CA+FuTSfHryyqBhRyfQCChG2VPwpj1KYBk2Z7DZ-dcYL-FbbSeA@mail.gmail.com>

On Mon, Dec 5, 2016 at 6:29 PM, Willem de Bruijn <willemb@google.com> wrote:
> On Mon, Dec 5, 2016 at 6:22 PM, Pablo Neira Ayuso <pablo@netfilter.org> wrote:
>> On Mon, Dec 05, 2016 at 06:06:05PM -0500, Willem de Bruijn wrote:
>> [...]
>>> Eric also suggests a private variable to avoid being subject to
>>> changes to PATH_MAX. Then we can indeed also choose an arbitrary lower
>>> length than current PATH_MAX.
>>
>> Good.
>>
>>> FWIW, there is a workaround for users with deeply nested paths: the
>>> path passed does not have to be absolute. It is literally what is
>>> passed on the command line to iptables right now, including relative
>>> addresses.
>>
>> If iptables userspace always expects to have the bpf file repository
>> in some given location (suggesting to have a directory that we specify
>> at ./configure time, similar to what we do with connlabel.conf), then
>> I think we can rely on relative paths. Would this be flexible enough
>> for your usecase?
>
> As long as it accepts relative paths, I think it will always work.
> Worst case, a user has to cd. No need for hardcoding the bpf mount
> point at compile time.
>
> I have the matching iptables patch for pinned objects, btw. Not for
> elf objects, which requires linking to libelf and parsing the object,
> which is more work (and perhaps best punted on by expanding libbpf in
> bcc to include this functionality. it already exists under samples/bpf
> and iproute2).

While we're discussing the patch, another question, about revisions: I
tested both modified and original iptables binaries on both standard
and modified kernels. It all works as expected, except for the case
where both binaries are used on a single kernel. For instance:

  iptables -A OUTPUT -m bpf --bytecode "`./nfbpf_compile RAW 'udp port
8000'`" -j LOG
  ./iptables.new -L

Here the new binary will interpret the object as xt_bpf_match_v1, but
iptables has inserted xt_bpf_match. The same problem happens the other
way around. A new binary can be made robust to detect old structs, but
not the other way around. Specific to bpf, the existing xt_bpf code
has an unfortunate bug that it always prints at least one line of
code, even if ->bpf_program_num_elems == 0.

I notice that other extensions also do not necessarily only extend
struct vN in vN+1. Is the above a known issue?

^ permalink raw reply

* Re: [PATCH nf-next] netfilter: xt_bpf: support ebpf
From: Willem de Bruijn @ 2016-12-05 23:29 UTC (permalink / raw)
  To: Pablo Neira Ayuso
  Cc: Willem de Bruijn, netfilter-devel, Network Development,
	Daniel Borkmann, Florian Westphal, Eric Dumazet
In-Reply-To: <20161205232214.GA15825@salvia>

On Mon, Dec 5, 2016 at 6:22 PM, Pablo Neira Ayuso <pablo@netfilter.org> wrote:
> On Mon, Dec 05, 2016 at 06:06:05PM -0500, Willem de Bruijn wrote:
> [...]
>> Eric also suggests a private variable to avoid being subject to
>> changes to PATH_MAX. Then we can indeed also choose an arbitrary lower
>> length than current PATH_MAX.
>
> Good.
>
>> FWIW, there is a workaround for users with deeply nested paths: the
>> path passed does not have to be absolute. It is literally what is
>> passed on the command line to iptables right now, including relative
>> addresses.
>
> If iptables userspace always expects to have the bpf file repository
> in some given location (suggesting to have a directory that we specify
> at ./configure time, similar to what we do with connlabel.conf), then
> I think we can rely on relative paths. Would this be flexible enough
> for your usecase?

As long as it accepts relative paths, I think it will always work.
Worst case, a user has to cd. No need for hardcoding the bpf mount
point at compile time.

I have the matching iptables patch for pinned objects, btw. Not for
elf objects, which requires linking to libelf and parsing the object,
which is more work (and perhaps best punted on by expanding libbpf in
bcc to include this functionality. it already exists under samples/bpf
and iproute2).

^ permalink raw reply

* [PATCH next] Revert "dctcp: update cwnd on congestion event"
From: Florian Westphal @ 2016-12-05 23:23 UTC (permalink / raw)
  To: netdev; +Cc: Florian Westphal, Neal Cardwell

Neal Cardwell says:
 If I am reading the code correctly, then I would have two concerns:
 1) Has that been tested? That seems like an extremely dramatic
    decrease in cwnd. For example, if the cwnd is 80, and there are 40
    ACKs, and half the ACKs are ECE marked, then my back-of-the-envelope
    calculations seem to suggest that after just 11 ACKs the cwnd would be
    down to a minimal value of 2 [..]
 2) That seems to contradict another passage in the draft [..] where it
    sazs:
       Just as specified in [RFC3168], DCTCP does not react to congestion
       indications more than once for every window of data.

Neal is right.  Fortunately we don't have to complicate this by testing
vs. current rtt estimate, we can just revert the patch.

Normal stack already handles this for us: receiving ACKs with ECE
set causes a call to tcp_enter_cwr(), from there on the ssthresh gets
adjusted and prr will take care of cwnd adjustment.

Fixes: 4780566784b396 ("dctcp: update cwnd on congestion event")
Cc: Neal Cardwell <ncardwell@google.com>
Signed-off-by: Florian Westphal <fw@strlen.de>
---
 net/ipv4/tcp_dctcp.c | 9 +--------
 1 file changed, 1 insertion(+), 8 deletions(-)

diff --git a/net/ipv4/tcp_dctcp.c b/net/ipv4/tcp_dctcp.c
index bde22ebb92a8..5f5e5936760e 100644
--- a/net/ipv4/tcp_dctcp.c
+++ b/net/ipv4/tcp_dctcp.c
@@ -188,8 +188,8 @@ static void dctcp_ce_state_1_to_0(struct sock *sk)
 
 static void dctcp_update_alpha(struct sock *sk, u32 flags)
 {
+	const struct tcp_sock *tp = tcp_sk(sk);
 	struct dctcp *ca = inet_csk_ca(sk);
-	struct tcp_sock *tp = tcp_sk(sk);
 	u32 acked_bytes = tp->snd_una - ca->prior_snd_una;
 
 	/* If ack did not advance snd_una, count dupack as MSS size.
@@ -229,13 +229,6 @@ static void dctcp_update_alpha(struct sock *sk, u32 flags)
 		WRITE_ONCE(ca->dctcp_alpha, alpha);
 		dctcp_reset(tp, ca);
 	}
-
-	if (flags & CA_ACK_ECE) {
-		unsigned int cwnd = dctcp_ssthresh(sk);
-
-		if (cwnd != tp->snd_cwnd)
-			tp->snd_cwnd = cwnd;
-	}
 }
 
 static void dctcp_state(struct sock *sk, u8 new_state)
-- 
2.7.3

^ permalink raw reply related

* Re: [PATCH nf-next] netfilter: xt_bpf: support ebpf
From: Pablo Neira Ayuso @ 2016-12-05 23:22 UTC (permalink / raw)
  To: Willem de Bruijn
  Cc: netfilter-devel, Network Development, Daniel Borkmann,
	Willem de Bruijn, Florian Westphal, Eric Dumazet
In-Reply-To: <CAF=yD-Jk4W+zU=tdixWe3LPGcFp-sANxXLLoQTRvrkN_3=a9uw@mail.gmail.com>

On Mon, Dec 05, 2016 at 06:06:05PM -0500, Willem de Bruijn wrote:
[...]
> Eric also suggests a private variable to avoid being subject to
> changes to PATH_MAX. Then we can indeed also choose an arbitrary lower
> length than current PATH_MAX.

Good.

> FWIW, there is a workaround for users with deeply nested paths: the
> path passed does not have to be absolute. It is literally what is
> passed on the command line to iptables right now, including relative
> addresses.

If iptables userspace always expects to have the bpf file repository
in some given location (suggesting to have a directory that we specify
at ./configure time, similar to what we do with connlabel.conf), then
I think we can rely on relative paths. Would this be flexible enough
for your usecase?

^ permalink raw reply

* Re: stmmac ethernet in kernel 4.9-rc6: coalescing related pauses.
From: Lino Sanfilippo @ 2016-12-05 23:11 UTC (permalink / raw)
  To: Pavel Machek
  Cc: Giuseppe CAVALLARO, alexandre.torgue, David Miller, netdev,
	linux-kernel
In-Reply-To: <98fb3577-5f31-bf17-3e02-96c150854108@gmx.de>


> 
> You mean stmmac_xmit()? Thats also softirq AFAICT, its the TX softirq....
> 
> Regards,
> Lino
> 
> 

Hmm. netdevices.txt says:

ndo_start_xmit:
	...

	Context: Process with BHs disabled or BH (timer),
	         will be called with interrupts disabled by netconsole.

	...

If this is correct it can indeed be process context, too. However BHs are already
disabled.

^ permalink raw reply

* Re: [PATCH nf-next] netfilter: xt_bpf: support ebpf
From: Willem de Bruijn @ 2016-12-05 23:06 UTC (permalink / raw)
  To: Pablo Neira Ayuso
  Cc: netfilter-devel, Network Development, Daniel Borkmann,
	Willem de Bruijn, Florian Westphal, Eric Dumazet
In-Reply-To: <20161205230055.GA15379@salvia>

On Mon, Dec 5, 2016 at 6:00 PM, Pablo Neira Ayuso <pablo@netfilter.org> wrote:
> On Mon, Dec 05, 2016 at 11:34:15PM +0100, Pablo Neira Ayuso wrote:
>> On Mon, Dec 05, 2016 at 10:30:01PM +0100, Florian Westphal wrote:
>> > Eric Dumazet <eric.dumazet@gmail.com> wrote:
>> > > On Mon, 2016-12-05 at 15:28 -0500, Willem de Bruijn wrote:
>> > > > From: Willem de Bruijn <willemb@google.com>
>> > > >
>> > > > Add support for attaching an eBPF object by file descriptor.
>> > > >
>> > > > The iptables binary can be called with a path to an elf object or a
>> > > > pinned bpf object. Also pass the mode and path to the kernel to be
>> > > > able to return it later for iptables dump and save.
>> > > >
>> > > > Signed-off-by: Willem de Bruijn <willemb@google.com>
>> > > > ---
>> > >
>> > > Assuming there is no simple way to get variable matchsize in iptables,
>> > > this looks good to me, thanks.
>> >
>> > It should be possible by setting kernel .matchsize to ~0 which
>> > suppresses strict size enforcement.
>> >
>> > Its currently only used by ebt_among, but this should work for any xtables
>> > module.
>>
>> This is likely going to trigger a large rewrite of the core userspace
>> iptables codebase, and likely going to pull part of the mess we have
>> in ebtables into iptables. So I'd prefer not to follow this path.
>
> So this variable path is there to annotate what userspace claims that
> is the file that contains the bpf blob that was loaded, actually this
> is irrelevant to the kernel, so this is just there to dump it back
> when iptables-save it is called. Just a side note, one could set
> anything there from userspace, point somewhere else actually...
>
> Well anyway, going back to the path problem to keep it simple: Why
> don't just trim this down to something smaller, are you really
> expecting to reach PATH_MAX in your usecase?

Not often. Module-specific limitations that differ from global
definitions are just a pain when they bite. This module also has an
arbitrary low limit on the length of the cBPF program passed, for
instance.

Eric also suggests a private variable to avoid being subject to
changes to PATH_MAX. Then we can indeed also choose an arbitrary lower
length than current PATH_MAX.

FWIW, there is a workaround for users with deeply nested paths: the
path passed does not have to be absolute. It is literally what is
passed on the command line to iptables right now, including relative
addresses.

^ permalink raw reply

* Re: [PATCH nf-next] netfilter: xt_bpf: support ebpf
From: Pablo Neira Ayuso @ 2016-12-05 23:05 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: Florian Westphal, Willem de Bruijn, netfilter-devel, netdev,
	daniel, Willem de Bruijn
In-Reply-To: <1480978749.18162.561.camel@edumazet-glaptop3.roam.corp.google.com>

On Mon, Dec 05, 2016 at 02:59:09PM -0800, Eric Dumazet wrote:
> On Mon, 2016-12-05 at 23:40 +0100, Florian Westphal wrote:
> 
> > Fair enough, I have no objections to the patch.
> 
> An additional question is about PATH_MAX :
> 
> Is it guaranteed to stay at 4096 forever ?
> 
> To be safe, maybe we should use a constant of our own.

Right, this reminds me we have to fix something else.

So constant of our own plus something smaller, if possible, would be
good to go. Thanks.

^ permalink raw reply

* Re: [PATCH nf-next] netfilter: xt_bpf: support ebpf
From: Willem de Bruijn @ 2016-12-05 23:01 UTC (permalink / raw)
  To: Daniel Borkmann
  Cc: netfilter-devel, pablo, Network Development, Willem de Bruijn
In-Reply-To: <5845F060.2050700@iogearbox.net>

On Mon, Dec 5, 2016 at 5:55 PM, Daniel Borkmann <daniel@iogearbox.net> wrote:
> Hi Willem,
>
> On 12/05/2016 09:28 PM, Willem de Bruijn wrote:
>>
>> From: Willem de Bruijn <willemb@google.com>
>>
>> Add support for attaching an eBPF object by file descriptor.
>>
>> The iptables binary can be called with a path to an elf object or a
>> pinned bpf object. Also pass the mode and path to the kernel to be
>> able to return it later for iptables dump and save.
>>
>> Signed-off-by: Willem de Bruijn <willemb@google.com>
>
>
> just out of pure curiosity, use case is for android guys wrt
> accounting, or anything specific that cls_bpf on tc ingress +
> egress cannot do already?

That is the immediate motivation, yes.

^ permalink raw reply

* Re: [PATCH nf-next] netfilter: xt_bpf: support ebpf
From: Pablo Neira Ayuso @ 2016-12-05 23:00 UTC (permalink / raw)
  To: Willem de Bruijn
  Cc: netfilter-devel, netdev, daniel, Willem de Bruijn,
	Florian Westphal, Eric Dumazet
In-Reply-To: <20161205223415.GA14689@salvia>

On Mon, Dec 05, 2016 at 11:34:15PM +0100, Pablo Neira Ayuso wrote:
> On Mon, Dec 05, 2016 at 10:30:01PM +0100, Florian Westphal wrote:
> > Eric Dumazet <eric.dumazet@gmail.com> wrote:
> > > On Mon, 2016-12-05 at 15:28 -0500, Willem de Bruijn wrote:
> > > > From: Willem de Bruijn <willemb@google.com>
> > > > 
> > > > Add support for attaching an eBPF object by file descriptor.
> > > > 
> > > > The iptables binary can be called with a path to an elf object or a
> > > > pinned bpf object. Also pass the mode and path to the kernel to be
> > > > able to return it later for iptables dump and save.
> > > > 
> > > > Signed-off-by: Willem de Bruijn <willemb@google.com>
> > > > ---
> > > 
> > > Assuming there is no simple way to get variable matchsize in iptables,
> > > this looks good to me, thanks.
> > 
> > It should be possible by setting kernel .matchsize to ~0 which
> > suppresses strict size enforcement.
> > 
> > Its currently only used by ebt_among, but this should work for any xtables
> > module.
> 
> This is likely going to trigger a large rewrite of the core userspace
> iptables codebase, and likely going to pull part of the mess we have
> in ebtables into iptables. So I'd prefer not to follow this path.

So this variable path is there to annotate what userspace claims that
is the file that contains the bpf blob that was loaded, actually this
is irrelevant to the kernel, so this is just there to dump it back
when iptables-save it is called. Just a side note, one could set
anything there from userspace, point somewhere else actually...

Well anyway, going back to the path problem to keep it simple: Why
don't just trim this down to something smaller, are you really
expecting to reach PATH_MAX in your usecase?

^ permalink raw reply

* Re: [PATCH nf-next] netfilter: xt_bpf: support ebpf
From: Eric Dumazet @ 2016-12-05 22:59 UTC (permalink / raw)
  To: Florian Westphal
  Cc: Pablo Neira Ayuso, Willem de Bruijn, netfilter-devel, netdev,
	daniel, Willem de Bruijn
In-Reply-To: <20161205224051.GB16819@breakpoint.cc>

On Mon, 2016-12-05 at 23:40 +0100, Florian Westphal wrote:

> Fair enough, I have no objections to the patch.

An additional question is about PATH_MAX :

Is it guaranteed to stay at 4096 forever ?

To be safe, maybe we should use a constant of our own.



^ permalink raw reply

* Re: [PATCH nf-next] netfilter: xt_bpf: support ebpf
From: Daniel Borkmann @ 2016-12-05 22:55 UTC (permalink / raw)
  To: Willem de Bruijn, netfilter-devel; +Cc: pablo, netdev, Willem de Bruijn
In-Reply-To: <1480969684-74414-1-git-send-email-willemdebruijn.kernel@gmail.com>

Hi Willem,

On 12/05/2016 09:28 PM, Willem de Bruijn wrote:
> From: Willem de Bruijn <willemb@google.com>
>
> Add support for attaching an eBPF object by file descriptor.
>
> The iptables binary can be called with a path to an elf object or a
> pinned bpf object. Also pass the mode and path to the kernel to be
> able to return it later for iptables dump and save.
>
> Signed-off-by: Willem de Bruijn <willemb@google.com>

just out of pure curiosity, use case is for android guys wrt
accounting, or anything specific that cls_bpf on tc ingress +
egress cannot do already?

Thanks,
Daniel

^ permalink raw reply

* Re: stmmac ethernet in kernel 4.9-rc6: coalescing related pauses.
From: Lino Sanfilippo @ 2016-12-05 22:54 UTC (permalink / raw)
  To: Pavel Machek
  Cc: Giuseppe CAVALLARO, alexandre.torgue, David Miller, netdev,
	linux-kernel
In-Reply-To: <20161205224057.GB19135@amd>

On 05.12.2016 23:40, Pavel Machek wrote:
> On Mon 2016-12-05 23:37:09, Lino Sanfilippo wrote:
>> Hi Pavel,
>> 
>> On 05.12.2016 23:02, Pavel Machek wrote:
>> > 
>> > we need spin_lock_bh at minimum, as we are locking user context
>> > against timer.
>> > 
>> > Best regards,
>> > 									Pavel
>> > 
>> 
>> I was referring to stmmac_tx_clean() which AFAICS is only called from softirq context,
>> (one time in the timer handler and one time in napi poll handler) so a spin_lock() should
>> be sufficient. I cant see how this is called from userspace. If it were, a spin_lock_bh() had
>> to be used, of course.
> 
> stmmac_tx_clean() shares lock with stmmac_tx() -- and that's process
> context as far as I can tell. So... spin_lock_bh() at
> minimum... right?
> 
> 									Pavel
> 

You mean stmmac_xmit()? Thats also softirq AFAICT, its the TX softirq....

Regards,
Lino

^ permalink raw reply

* Re: [PATCH net-next] net: remove abuse of VLAN DEI/CFI bit
From: Michał Mirosław @ 2016-12-05 22:52 UTC (permalink / raw)
  To: Ben Pfaff
  Cc: dev-yBygre7rU0TnMu66kgdUjQ, netdev-u79uwXL29TY76Z2rM5mHXA,
	bridge-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA
In-Reply-To: <20161205185545.GB3129-LZ6Gd1LRuIk@public.gmane.org>

On Mon, Dec 05, 2016 at 10:55:45AM -0800, Ben Pfaff wrote:
> On Mon, Dec 05, 2016 at 06:24:36PM +0100, Michał Mirosław wrote:
> > On Sat, Dec 03, 2016 at 03:27:30PM -0800, Ben Pfaff wrote:
> > > On Sat, Dec 03, 2016 at 10:22:28AM +0100, Michał Mirosław wrote:
> > > > This All-in-one patch removes abuse of VLAN CFI bit, so it can be passed
> > > > intact through linux networking stack.
> > > This appears to change the established Open vSwitch userspace API.  You
> > > can see that simply from the way that it changes the documentation for
> > > the userspace API.  If I'm right about that, then this change will break
> > > all userspace programs that use the Open vSwitch kernel module,
> > > including Open vSwitch itself.
> > 
> > If I understood the code correctly, it does change expected meaning for
> > the (unlikely?) case of header truncated just before the VLAN TCI - it will
> > be impossible to differentiate this case from the VLAN TCI == 0.
> > 
> > I guess this is a problem with OVS API, because it doesn't directly show
> > the "missing" state of elements, but relies on an "invalid" value.
> 
> That particular corner case should not be a huge problem in any case.
> 
> The real problem is that this appears to break the common case use of
> VLANs in Open vSwitch.  After this patch, parse_vlan() in
> net/openvswitch/flow.c copies the tpid and tci from sk_buff (either the
> accelerated version of them or the version in the skb data) into
> sw_flow_key members.  OK, that's fine on it's own.  However, I don't see
> any corresponding change to the code in flow_netlink.c to compensate for
> the fact that, until now, the VLAN CFI bit (formerly VLAN_TAG_PRESENT)
> was always required to be set to 1 in flow matches inside Netlink
> messages sent from userspace, and the kernel always set it to 1 in
> corresponding messages sent to userspace.
> 
> In other words, if I'm reading this change correctly:
> 
>     * With a kernel before this change, userspace always had to set
>       VLAN_TAG_PRESENT to 1 to match on a VLAN, or the kernel would
>       reject the flow match.
> 
>     * With a kernel after this change, userspace must not set
>       VLAN_TAG_PRESENT to 1, otherwise the kernel will accept the flow
>       match but nothing will ever match because packets do not actually
>       have the CFI bit set.
> 
> Take a look at this code that the patch deletes from
> validate_vlan_from_nlattrs(), for example, and see how it insisted that
> VLAN_TAG_PRESENT was set:
> 
> 	if (!(tci & htons(VLAN_TAG_PRESENT))) {
> 		if (tci) {
> 			OVS_NLERR(log, "%s TCI does not have VLAN_TAG_PRESENT bit set.",
> 				  (inner) ? "C-VLAN" : "VLAN");
> 			return -EINVAL;
> 		} else if (nla_len(a[OVS_KEY_ATTR_ENCAP])) {
> 			/* Corner case for truncated VLAN header. */
> 			OVS_NLERR(log, "Truncated %s header has non-zero encap attribute.",
> 				  (inner) ? "C-VLAN" : "VLAN");
> 			return -EINVAL;
> 		}
> 	}
> 
> Please let me know if I'm overlooking something.

Hmm. So the easiest change without disrupting current userspace, would be
to flip the CFI bit on the way to/from OVS userspace. Does this seem
correct?

Best Regards,
Michał Mirosław

^ permalink raw reply


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox