Netdev List
 help / color / mirror / Atom feed
* [PATCH net-next v6 4/6] bpf: Add support for reading socket family, type, protocol
From: David Ahern @ 2016-11-30 18:16 UTC (permalink / raw)
  To: netdev; +Cc: daniel, ast, daniel, maheshb, tgraf, David Ahern
In-Reply-To: <1480529810-25850-1-git-send-email-dsa@cumulusnetworks.com>

Add socket family, type and protocol to bpf_sock allowing bpf programs
read-only access.

Signed-off-by: David Ahern <dsa@cumulusnetworks.com>
---
v6
- new patch for version 6 of set

 include/net/sock.h       | 15 +++++++++++++++
 include/uapi/linux/bpf.h |  3 +++
 net/core/filter.c        | 42 ++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 60 insertions(+)

diff --git a/include/net/sock.h b/include/net/sock.h
index 442cbb118a07..69afda6bea15 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -389,6 +389,21 @@ struct sock {
 	 * Because of non atomicity rules, all
 	 * changes are protected by socket lock.
 	 */
+	unsigned int		__sk_flags_offset[0];
+#ifdef __BIG_ENDIAN_BITFIELD
+#define SK_FL_PROTO_SHIFT  16
+#define SK_FL_PROTO_MASK   0x00ff0000
+
+#define SK_FL_TYPE_SHIFT   0
+#define SK_FL_TYPE_MASK    0x0000ffff
+#else
+#define SK_FL_PROTO_SHIFT  8
+#define SK_FL_PROTO_MASK   0x0000ff00
+
+#define SK_FL_TYPE_SHIFT   16
+#define SK_FL_TYPE_MASK    0xffff0000
+#endif
+
 	kmemcheck_bitfield_begin(flags);
 	unsigned int		sk_padding : 2,
 				sk_no_check_tx : 1,
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 75964e00d947..b47ffd117fd6 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -541,6 +541,9 @@ struct bpf_tunnel_key {
 
 struct bpf_sock {
 	__u32 bound_dev_if;
+	__u32 family;
+	__u32 type;
+	__u32 protocol;
 };
 
 /* User return codes for XDP prog type.
diff --git a/net/core/filter.c b/net/core/filter.c
index 5ee722dc097d..ddc86efe1911 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -2960,6 +2960,33 @@ static u32 sk_filter_convert_ctx_access(enum bpf_access_type type, int dst_reg,
 	return insn - insn_buf;
 }
 
+#define SOCKF_AD_TYPE     1
+#define SOCKF_AD_PROTOCOL 2
+
+static u32 convert_sock_access(int sock_field, int dst_reg, int src_reg,
+			       struct bpf_insn *insn_buf)
+{
+	struct bpf_insn *insn = insn_buf;
+
+	switch (sock_field) {
+	case SOCKF_AD_TYPE:
+		*insn++ = BPF_LDX_MEM(BPF_W, dst_reg, src_reg,
+				      offsetof(struct sock, __sk_flags_offset));
+		*insn++ = BPF_ALU32_IMM(BPF_AND, dst_reg, SK_FL_TYPE_MASK);
+		*insn++ = BPF_ALU32_IMM(BPF_RSH, dst_reg, SK_FL_TYPE_SHIFT);
+		break;
+
+	case SOCKF_AD_PROTOCOL:
+		*insn++ = BPF_LDX_MEM(BPF_W, dst_reg, src_reg,
+				      offsetof(struct sock, __sk_flags_offset));
+		*insn++ = BPF_ALU32_IMM(BPF_AND, dst_reg, SK_FL_PROTO_MASK);
+		*insn++ = BPF_ALU32_IMM(BPF_RSH, dst_reg, SK_FL_PROTO_SHIFT);
+		break;
+	}
+
+	return insn - insn_buf;
+}
+
 static u32 sock_filter_convert_ctx_access(enum bpf_access_type type,
 					  int dst_reg, int src_reg,
 					  int ctx_off,
@@ -2979,6 +3006,21 @@ static u32 sock_filter_convert_ctx_access(enum bpf_access_type type,
 			*insn++ = BPF_LDX_MEM(BPF_W, dst_reg, src_reg,
 				      offsetof(struct sock, sk_bound_dev_if));
 		break;
+
+	case offsetof(struct bpf_sock, family):
+		BUILD_BUG_ON(FIELD_SIZEOF(struct sock, sk_family) != 2);
+
+		*insn++ = BPF_LDX_MEM(BPF_H, dst_reg, src_reg,
+				      offsetof(struct sock, sk_family));
+		break;
+
+	case offsetof(struct bpf_sock, type):
+		return convert_sock_access(SOCKF_AD_TYPE, dst_reg, src_reg,
+					   insn_buf);
+
+	case offsetof(struct bpf_sock, protocol):
+		return convert_sock_access(SOCKF_AD_PROTOCOL, dst_reg, src_reg,
+					   insn_buf);
 	}
 
 	return insn - insn_buf;
-- 
2.1.4

^ permalink raw reply related

* [PATCH net-next v6 5/6] samples/bpf: Update bpf loader for cgroup section names
From: David Ahern @ 2016-11-30 18:16 UTC (permalink / raw)
  To: netdev; +Cc: daniel, ast, daniel, maheshb, tgraf, David Ahern
In-Reply-To: <1480529810-25850-1-git-send-email-dsa@cumulusnetworks.com>

Add support for section names starting with cgroup/skb and cgroup/sock.

Signed-off-by: David Ahern <dsa@cumulusnetworks.com>
---
v6
- new patch for version 6

 samples/bpf/bpf_load.c | 14 +++++++++++---
 samples/bpf/bpf_load.h |  1 +
 2 files changed, 12 insertions(+), 3 deletions(-)

diff --git a/samples/bpf/bpf_load.c b/samples/bpf/bpf_load.c
index 62f54d6eb8bf..49b45ccbe153 100644
--- a/samples/bpf/bpf_load.c
+++ b/samples/bpf/bpf_load.c
@@ -52,6 +52,8 @@ static int load_and_attach(const char *event, struct bpf_insn *prog, int size)
 	bool is_tracepoint = strncmp(event, "tracepoint/", 11) == 0;
 	bool is_xdp = strncmp(event, "xdp", 3) == 0;
 	bool is_perf_event = strncmp(event, "perf_event", 10) == 0;
+	bool is_cgroup_skb = strncmp(event, "cgroup/skb", 10) == 0;
+	bool is_cgroup_sk = strncmp(event, "cgroup/sock", 11) == 0;
 	enum bpf_prog_type prog_type;
 	char buf[256];
 	int fd, efd, err, id;
@@ -72,6 +74,10 @@ static int load_and_attach(const char *event, struct bpf_insn *prog, int size)
 		prog_type = BPF_PROG_TYPE_XDP;
 	} else if (is_perf_event) {
 		prog_type = BPF_PROG_TYPE_PERF_EVENT;
+	} else if (is_cgroup_skb) {
+		prog_type = BPF_PROG_TYPE_CGROUP_SKB;
+	} else if (is_cgroup_sk) {
+		prog_type = BPF_PROG_TYPE_CGROUP_SOCK;
 	} else {
 		printf("Unknown event '%s'\n", event);
 		return -1;
@@ -85,7 +91,7 @@ static int load_and_attach(const char *event, struct bpf_insn *prog, int size)
 
 	prog_fd[prog_cnt++] = fd;
 
-	if (is_xdp || is_perf_event)
+	if (is_xdp || is_perf_event || is_cgroup_skb || is_cgroup_sk)
 		return 0;
 
 	if (is_socket) {
@@ -334,7 +340,8 @@ int load_bpf_file(char *path)
 			    memcmp(shname_prog, "tracepoint/", 11) == 0 ||
 			    memcmp(shname_prog, "xdp", 3) == 0 ||
 			    memcmp(shname_prog, "perf_event", 10) == 0 ||
-			    memcmp(shname_prog, "socket", 6) == 0)
+			    memcmp(shname_prog, "socket", 6) == 0 ||
+			    memcmp(shname_prog, "cgroup/", 7) == 0)
 				load_and_attach(shname_prog, insns, data_prog->d_size);
 		}
 	}
@@ -353,7 +360,8 @@ int load_bpf_file(char *path)
 		    memcmp(shname, "tracepoint/", 11) == 0 ||
 		    memcmp(shname, "xdp", 3) == 0 ||
 		    memcmp(shname, "perf_event", 10) == 0 ||
-		    memcmp(shname, "socket", 6) == 0)
+		    memcmp(shname, "socket", 6) == 0 ||
+		    memcmp(shname, "cgroup/", 7) == 0)
 			load_and_attach(shname, data->d_buf, data->d_size);
 	}
 
diff --git a/samples/bpf/bpf_load.h b/samples/bpf/bpf_load.h
index dfa57fe65c8e..4adeeef53ad6 100644
--- a/samples/bpf/bpf_load.h
+++ b/samples/bpf/bpf_load.h
@@ -7,6 +7,7 @@
 extern int map_fd[MAX_MAPS];
 extern int prog_fd[MAX_PROGS];
 extern int event_fd[MAX_PROGS];
+extern int prog_cnt;
 
 /* parses elf file compiled by llvm .c->.o
  * . parses 'maps' section and creates maps via BPF syscall
-- 
2.1.4

^ permalink raw reply related

* [PATCH net-next v6 6/6] samples/bpf: add userspace example for prohibiting sockets
From: David Ahern @ 2016-11-30 18:16 UTC (permalink / raw)
  To: netdev; +Cc: daniel, ast, daniel, maheshb, tgraf, David Ahern
In-Reply-To: <1480529810-25850-1-git-send-email-dsa@cumulusnetworks.com>

Add examples preventing a process in a cgroup from opening a socket
based on family, protocol and type.

Signed-off-by: David Ahern <dsa@cumulusnetworks.com>
---
v6
- new patch for version 6

 samples/bpf/Makefile            |  4 ++
 samples/bpf/sock_flags_kern.c   | 37 +++++++++++++++++++
 samples/bpf/test_cgrp2_sock2.c  | 66 +++++++++++++++++++++++++++++++++
 samples/bpf/test_cgrp2_sock2.sh | 81 +++++++++++++++++++++++++++++++++++++++++
 4 files changed, 188 insertions(+)
 create mode 100644 samples/bpf/sock_flags_kern.c
 create mode 100644 samples/bpf/test_cgrp2_sock2.c
 create mode 100755 samples/bpf/test_cgrp2_sock2.sh

diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile
index a335b218198e..8df12f9429dc 100644
--- a/samples/bpf/Makefile
+++ b/samples/bpf/Makefile
@@ -24,6 +24,7 @@ hostprogs-y += test_overhead
 hostprogs-y += test_cgrp2_array_pin
 hostprogs-y += test_cgrp2_attach
 hostprogs-y += test_cgrp2_sock
+hostprogs-y += test_cgrp2_sock2
 hostprogs-y += xdp1
 hostprogs-y += xdp2
 hostprogs-y += test_current_task_under_cgroup
@@ -53,6 +54,7 @@ test_overhead-objs := bpf_load.o libbpf.o test_overhead_user.o
 test_cgrp2_array_pin-objs := libbpf.o test_cgrp2_array_pin.o
 test_cgrp2_attach-objs := libbpf.o test_cgrp2_attach.o
 test_cgrp2_sock-objs := libbpf.o test_cgrp2_sock.o
+test_cgrp2_sock2-objs := bpf_load.o libbpf.o test_cgrp2_sock2.o
 xdp1-objs := bpf_load.o libbpf.o xdp1_user.o
 # reuse xdp1 source intentionally
 xdp2-objs := bpf_load.o libbpf.o xdp1_user.o
@@ -73,6 +75,7 @@ always += tracex3_kern.o
 always += tracex4_kern.o
 always += tracex5_kern.o
 always += tracex6_kern.o
+always += sock_flags_kern.o
 always += test_probe_write_user_kern.o
 always += trace_output_kern.o
 always += tcbpf1_kern.o
@@ -106,6 +109,7 @@ HOSTLOADLIBES_tracex3 += -lelf
 HOSTLOADLIBES_tracex4 += -lelf -lrt
 HOSTLOADLIBES_tracex5 += -lelf
 HOSTLOADLIBES_tracex6 += -lelf
+HOSTLOADLIBES_test_cgrp2_sock2 += -lelf
 HOSTLOADLIBES_test_probe_write_user += -lelf
 HOSTLOADLIBES_trace_output += -lelf -lrt
 HOSTLOADLIBES_lathist += -lelf
diff --git a/samples/bpf/sock_flags_kern.c b/samples/bpf/sock_flags_kern.c
new file mode 100644
index 000000000000..d6a7f0013a5d
--- /dev/null
+++ b/samples/bpf/sock_flags_kern.c
@@ -0,0 +1,37 @@
+#include <uapi/linux/bpf.h>
+#include <linux/socket.h>
+#include "bpf_helpers.h"
+
+SEC("cgroup/sock1")
+int bpf_prog1(struct bpf_sock *sk)
+{
+	char fmt[] = "socket: family %d type %d protocol %d\n";
+
+	bpf_trace_printk(fmt, sizeof(fmt), sk->family, sk->type, sk->protocol);
+
+	/* block PF_INET6, SOCK_RAW, IPPROTO_ICMPV6 sockets
+	 * ie., make ping6 fail
+	 */
+	if (sk->family == PF_INET6 && sk->type == 3 && sk->protocol == 58)
+		return 0;
+
+	return 1;
+}
+
+SEC("cgroup/sock2")
+int bpf_prog2(struct bpf_sock *sk)
+{
+	char fmt[] = "socket: family %d type %d protocol %d\n";
+
+	bpf_trace_printk(fmt, sizeof(fmt), sk->family, sk->type, sk->protocol);
+
+	/* block PF_INET, SOCK_RAW, IPPROTO_ICMP sockets
+	 * ie., make ping fail
+	 */
+	if (sk->family == PF_INET && sk->type == 3 && sk->protocol == 1)
+		return 0;
+
+	return 1;
+}
+
+char _license[] SEC("license") = "GPL";
diff --git a/samples/bpf/test_cgrp2_sock2.c b/samples/bpf/test_cgrp2_sock2.c
new file mode 100644
index 000000000000..455ef0d06e93
--- /dev/null
+++ b/samples/bpf/test_cgrp2_sock2.c
@@ -0,0 +1,66 @@
+/* eBPF example program:
+ *
+ * - Loads eBPF program
+ *
+ *   The eBPF program loads a filter from file and attaches the
+ *   program to a cgroup using BPF_PROG_ATTACH
+ */
+
+#define _GNU_SOURCE
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <stddef.h>
+#include <string.h>
+#include <unistd.h>
+#include <assert.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <net/if.h>
+#include <linux/bpf.h>
+
+#include "libbpf.h"
+#include "bpf_load.h"
+
+static int usage(const char *argv0)
+{
+	printf("Usage: %s cg-path filter-path [filter-id]\n", argv0);
+	return EXIT_FAILURE;
+}
+
+int main(int argc, char **argv)
+{
+	int cg_fd, ret, filter_id = 0;
+
+	if (argc < 3)
+		return usage(argv[0]);
+
+	cg_fd = open(argv[1], O_DIRECTORY | O_RDONLY);
+	if (cg_fd < 0) {
+		printf("Failed to open cgroup path: '%s'\n", strerror(errno));
+		return EXIT_FAILURE;
+	}
+
+	if (load_bpf_file(argv[2]))
+		return EXIT_FAILURE;
+
+	printf("Output from kernel verifier:\n%s\n-------\n", bpf_log_buf);
+	if (argc > 3)
+		filter_id = atoi(argv[3]);
+
+	/* the loader fills prog_fd[0 .. prog_cnt - 1] only */
+	if (filter_id < 0 || filter_id >= prog_cnt) {
+		printf("Invalid program id; program not found in file\n");
+		return EXIT_FAILURE;
+	}
+
+	ret = bpf_prog_attach(prog_fd[filter_id], cg_fd,
+			      BPF_CGROUP_INET_SOCK_CREATE);
+	if (ret < 0) {
+		printf("Failed to attach prog to cgroup: '%s'\n",
+		       strerror(errno));
+		return EXIT_FAILURE;
+	}
+
+	return EXIT_SUCCESS;
+}
diff --git a/samples/bpf/test_cgrp2_sock2.sh b/samples/bpf/test_cgrp2_sock2.sh
new file mode 100755
index 000000000000..891f12a0e26f
--- /dev/null
+++ b/samples/bpf/test_cgrp2_sock2.sh
@@ -0,0 +1,81 @@
+#!/bin/bash
+
+function config_device {
+	ip netns add at_ns0
+	ip link add veth0 type veth peer name veth0b
+	ip link set veth0b up
+	ip link set veth0 netns at_ns0
+	ip netns exec at_ns0 ip addr add 172.16.1.100/24 dev veth0
+	ip netns exec at_ns0 ip addr add 2401:db00::1/64 dev veth0 nodad
+	ip netns exec at_ns0 ip link set dev veth0 up
+	ip addr add 172.16.1.101/24 dev veth0b
+	ip addr add 2401:db00::2/64 dev veth0b nodad
+}
+
+function config_cgroup {
+	rm -rf /tmp/cgroupv2
+	mkdir -p /tmp/cgroupv2
+	mount -t cgroup2 none /tmp/cgroupv2
+	mkdir -p /tmp/cgroupv2/foo
+	echo $$ >> /tmp/cgroupv2/foo/cgroup.procs
+}
+
+
+function attach_bpf {
+	test_cgrp2_sock2 /tmp/cgroupv2/foo sock_flags_kern.o $1
+	[ $? -ne 0 ] && exit 1
+}
+
+function cleanup {
+	ip link del veth0b
+	ip netns delete at_ns0
+	umount /tmp/cgroupv2
+	rm -rf /tmp/cgroupv2
+}
+
+cleanup 2>/dev/null
+
+set -e
+config_device
+config_cgroup
+set +e
+
+#
+# Test 1 - fail ping6
+#
+attach_bpf 0
+ping -c1 -w1 172.16.1.100
+if [ $? -ne 0 ]; then
+	echo "ping failed when it should succeed"
+	cleanup
+	exit 1
+fi
+
+ping6 -c1 -w1 2401:db00::1
+if [ $? -eq 0 ]; then
+	echo "ping6 succeeded when it should not"
+	cleanup
+	exit 1
+fi
+
+#
+# Test 2 - fail ping
+#
+attach_bpf 1
+ping6 -c1 -w1 2401:db00::1
+if [ $? -ne 0 ]; then
+	echo "ping6 failed when it should succeed"
+	cleanup
+	exit 1
+fi
+
+ping -c1 -w1 172.16.1.100
+if [ $? -eq 0 ]; then
+	echo "ping succeeded when it should not"
+	cleanup
+	exit 1
+fi
+
+cleanup
+echo
+echo "*** PASS ***"
-- 
2.1.4

^ permalink raw reply related

* [PATCH net-next v6 3/6] samples: bpf: add userspace example for modifying sk_bound_dev_if
From: David Ahern @ 2016-11-30 18:16 UTC (permalink / raw)
  To: netdev; +Cc: daniel, ast, daniel, maheshb, tgraf, David Ahern
In-Reply-To: <1480529810-25850-1-git-send-email-dsa@cumulusnetworks.com>

Add a simple program to demonstrate the ability to attach a bpf program
to a cgroup that sets sk_bound_dev_if for AF_INET{6} sockets when they
are created.

Signed-off-by: David Ahern <dsa@cumulusnetworks.com>
---
v6
- added conversion from device name to index in test program

v5
- changed BPF_CGROUP_INET_SOCK to BPF_CGROUP_INET_SOCK_CREATE

v4
- added test_cgrp2_sock.sh for an automated test

v3
- revert to BPF_PROG_TYPE_CGROUP_SOCK prog type

v2
- removed bpf_sock_store_u32 references
- changed BPF_CGROUP_INET_SOCK_CREATE to BPF_CGROUP_INET_SOCK
- remove BPF_PROG_TYPE_CGROUP_SOCK prog type and add prog_subtype

 samples/bpf/Makefile           |  2 +
 samples/bpf/test_cgrp2_sock.c  | 83 ++++++++++++++++++++++++++++++++++++++++++
 samples/bpf/test_cgrp2_sock.sh | 47 ++++++++++++++++++++++++
 3 files changed, 132 insertions(+)
 create mode 100644 samples/bpf/test_cgrp2_sock.c
 create mode 100755 samples/bpf/test_cgrp2_sock.sh

diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile
index 3ceb5a9d86df..a335b218198e 100644
--- a/samples/bpf/Makefile
+++ b/samples/bpf/Makefile
@@ -23,6 +23,7 @@ hostprogs-y += map_perf_test
 hostprogs-y += test_overhead
 hostprogs-y += test_cgrp2_array_pin
 hostprogs-y += test_cgrp2_attach
+hostprogs-y += test_cgrp2_sock
 hostprogs-y += xdp1
 hostprogs-y += xdp2
 hostprogs-y += test_current_task_under_cgroup
@@ -51,6 +52,7 @@ map_perf_test-objs := bpf_load.o libbpf.o map_perf_test_user.o
 test_overhead-objs := bpf_load.o libbpf.o test_overhead_user.o
 test_cgrp2_array_pin-objs := libbpf.o test_cgrp2_array_pin.o
 test_cgrp2_attach-objs := libbpf.o test_cgrp2_attach.o
+test_cgrp2_sock-objs := libbpf.o test_cgrp2_sock.o
 xdp1-objs := bpf_load.o libbpf.o xdp1_user.o
 # reuse xdp1 source intentionally
 xdp2-objs := bpf_load.o libbpf.o xdp1_user.o
diff --git a/samples/bpf/test_cgrp2_sock.c b/samples/bpf/test_cgrp2_sock.c
new file mode 100644
index 000000000000..d467b3c1c55c
--- /dev/null
+++ b/samples/bpf/test_cgrp2_sock.c
@@ -0,0 +1,83 @@
+/* eBPF example program:
+ *
+ * - Loads eBPF program
+ *
+ *   The eBPF program sets the sk_bound_dev_if index in new AF_INET{6}
+ *   sockets opened by processes in the cgroup.
+ *
+ * - Attaches the new program to a cgroup using BPF_PROG_ATTACH
+ */
+
+#define _GNU_SOURCE
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <stddef.h>
+#include <string.h>
+#include <unistd.h>
+#include <assert.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <net/if.h>
+#include <linux/bpf.h>
+
+#include "libbpf.h"
+
+static int prog_load(int idx)
+{
+	struct bpf_insn prog[] = {
+		BPF_MOV64_REG(BPF_REG_6, BPF_REG_1),
+		BPF_MOV64_IMM(BPF_REG_3, idx),
+		BPF_MOV64_IMM(BPF_REG_2, offsetof(struct bpf_sock, bound_dev_if)),
+		BPF_STX_MEM(BPF_W, BPF_REG_1, BPF_REG_3, offsetof(struct bpf_sock, bound_dev_if)),
+		BPF_MOV64_IMM(BPF_REG_0, 1), /* r0 = verdict */
+		BPF_EXIT_INSN(),
+	};
+
+	return bpf_prog_load(BPF_PROG_TYPE_CGROUP_SOCK, prog, sizeof(prog),
+			     "GPL", 0);
+}
+
+static int usage(const char *argv0)
+{
+	printf("Usage: %s cg-path device-name\n", argv0);
+	return EXIT_FAILURE;
+}
+
+int main(int argc, char **argv)
+{
+	int cg_fd, prog_fd, ret;
+	unsigned int idx;
+
+	/* argv[1] (cgroup path) and argv[2] (device name) are both required */
+	if (argc < 3)
+		return usage(argv[0]);
+
+	idx = if_nametoindex(argv[2]);
+	if (!idx) {
+		printf("Invalid device name\n");
+		return EXIT_FAILURE;
+	}
+
+	cg_fd = open(argv[1], O_DIRECTORY | O_RDONLY);
+	if (cg_fd < 0) {
+		printf("Failed to open cgroup path: '%s'\n", strerror(errno));
+		return EXIT_FAILURE;
+	}
+
+	prog_fd = prog_load(idx);
+	printf("Output from kernel verifier:\n%s\n-------\n", bpf_log_buf);
+	if (prog_fd < 0) {
+		printf("Failed to load prog: '%s'\n", strerror(errno));
+		return EXIT_FAILURE;
+	}
+
+	ret = bpf_prog_attach(prog_fd, cg_fd, BPF_CGROUP_INET_SOCK_CREATE);
+	if (ret < 0) {
+		printf("Failed to attach prog to cgroup: '%s'\n",
+		       strerror(errno));
+		return EXIT_FAILURE;
+	}
+
+	return EXIT_SUCCESS;
+}
diff --git a/samples/bpf/test_cgrp2_sock.sh b/samples/bpf/test_cgrp2_sock.sh
new file mode 100755
index 000000000000..925fd467c7cc
--- /dev/null
+++ b/samples/bpf/test_cgrp2_sock.sh
@@ -0,0 +1,47 @@
+#!/bin/bash
+
+function config_device {
+	ip netns add at_ns0
+	ip link add veth0 type veth peer name veth0b
+	ip link set veth0b up
+	ip link set veth0 netns at_ns0
+	ip netns exec at_ns0 ip addr add 172.16.1.100/24 dev veth0
+	ip netns exec at_ns0 ip addr add 2401:db00::1/64 dev veth0 nodad
+	ip netns exec at_ns0 ip link set dev veth0 up
+	ip link add foo type vrf table 1234
+	ip link set foo up
+	ip addr add 172.16.1.101/24 dev veth0b
+	ip addr add 2401:db00::2/64 dev veth0b nodad
+	ip link set veth0b master foo
+}
+
+function attach_bpf {
+	rm -rf /tmp/cgroupv2
+	mkdir -p /tmp/cgroupv2
+	mount -t cgroup2 none /tmp/cgroupv2
+	mkdir -p /tmp/cgroupv2/foo
+	test_cgrp2_sock /tmp/cgroupv2/foo foo
+	echo $$ >> /tmp/cgroupv2/foo/cgroup.procs
+}
+
+function cleanup {
+	set +ex
+	ip netns delete at_ns0
+	ip link del veth0
+	ip link del foo
+	umount /tmp/cgroupv2
+	rm -rf /tmp/cgroupv2
+	set -ex
+}
+
+function do_test {
+	ping -c1 -w1 172.16.1.100
+	ping6 -c1 -w1 2401:db00::1
+}
+
+cleanup 2>/dev/null
+config_device
+attach_bpf
+do_test
+cleanup
+echo "*** PASS ***"
-- 
2.1.4

^ permalink raw reply related

* [PATCH net-next v6 2/6] bpf: Add new cgroup attach type to enable sock modifications
From: David Ahern @ 2016-11-30 18:16 UTC (permalink / raw)
  To: netdev; +Cc: daniel, ast, daniel, maheshb, tgraf, David Ahern
In-Reply-To: <1480529810-25850-1-git-send-email-dsa@cumulusnetworks.com>

Add new cgroup based program type, BPF_PROG_TYPE_CGROUP_SOCK. Similar to
BPF_PROG_TYPE_CGROUP_SKB, programs can be attached to a cgroup and run
any time a process in the cgroup opens an AF_INET or AF_INET6 socket.
Currently only sk_bound_dev_if is exported to userspace for modification
by a bpf program.

This allows a cgroup to be configured such that AF_INET{6} sockets opened
by processes are automatically bound to a specific device. In turn, this
enables the running of programs that do not support SO_BINDTODEVICE in a
specific VRF context / L3 domain.

Signed-off-by: David Ahern <dsa@cumulusnetworks.com>
---
v6
- added size check to sock_filter_is_valid_access; accesses must be u32

v5
- no change

v4
- dropped tweak to bpf_func signature
- dropped cg_sock_func_proto in favor of sk_filter_func_proto
- new __cgroup_bpf_run_filter_sk versus overloading __cgroup_bpf_run_filter
- reverted BPF_CGROUP_INET_SOCK to BPF_CGROUP_INET_SOCK_CREATE

v3
- reverted to new prog type BPF_PROG_TYPE_CGROUP_SOCK
- dropped the subtype

v2
- dropped the bpf_sock_store_u32 helper
- dropped the new prog type BPF_PROG_TYPE_CGROUP_SOCK
- moved valid access and context conversion to use subtype
- dropped CREATE from BPF_CGROUP_INET_SOCK and related function names
- moved running of filter from sk_alloc to inet{6}_create

 include/linux/bpf-cgroup.h | 14 +++++++++++
 include/uapi/linux/bpf.h   |  6 +++++
 kernel/bpf/cgroup.c        | 33 ++++++++++++++++++++++++
 kernel/bpf/syscall.c       |  5 +++-
 net/core/filter.c          | 62 ++++++++++++++++++++++++++++++++++++++++++++++
 net/ipv4/af_inet.c         | 12 ++++++++-
 net/ipv6/af_inet6.c        |  8 ++++++
 7 files changed, 138 insertions(+), 2 deletions(-)

diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h
index 7f0fc635b13e..7de376e37c5c 100644
--- a/include/linux/bpf-cgroup.h
+++ b/include/linux/bpf-cgroup.h
@@ -41,6 +41,9 @@ int __cgroup_bpf_run_filter_skb(struct sock *sk,
 				struct sk_buff *skb,
 				enum bpf_attach_type type);
 
+int __cgroup_bpf_run_filter_sk(struct sock *sk,
+			       enum bpf_attach_type type);
+
 /* Wrappers for __cgroup_bpf_run_filter_skb() guarded by cgroup_bpf_enabled. */
 #define BPF_CGROUP_RUN_PROG_INET_INGRESS(sk, skb)			      \
 ({									      \
@@ -64,6 +67,16 @@ int __cgroup_bpf_run_filter_skb(struct sock *sk,
 	__ret;								       \
 })
 
+#define BPF_CGROUP_RUN_PROG_INET_SOCK(sk)				       \
+({									       \
+	int __ret = 0;							       \
+	if (cgroup_bpf_enabled && sk) {					       \
+		__ret = __cgroup_bpf_run_filter_sk(sk,			       \
+						 BPF_CGROUP_INET_SOCK_CREATE); \
+	}								       \
+	__ret;								       \
+})
+
 #else
 
 struct cgroup_bpf {};
@@ -73,6 +86,7 @@ static inline void cgroup_bpf_inherit(struct cgroup *cgrp,
 
 #define BPF_CGROUP_RUN_PROG_INET_INGRESS(sk,skb) ({ 0; })
 #define BPF_CGROUP_RUN_PROG_INET_EGRESS(sk,skb) ({ 0; })
+#define BPF_CGROUP_RUN_PROG_INET_SOCK(sk) ({ 0; })
 
 #endif /* CONFIG_CGROUP_BPF */
 
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 1370a9d1456f..75964e00d947 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -101,11 +101,13 @@ enum bpf_prog_type {
 	BPF_PROG_TYPE_XDP,
 	BPF_PROG_TYPE_PERF_EVENT,
 	BPF_PROG_TYPE_CGROUP_SKB,
+	BPF_PROG_TYPE_CGROUP_SOCK,
 };
 
 enum bpf_attach_type {
 	BPF_CGROUP_INET_INGRESS,
 	BPF_CGROUP_INET_EGRESS,
+	BPF_CGROUP_INET_SOCK_CREATE,
 	__MAX_BPF_ATTACH_TYPE
 };
 
@@ -537,6 +539,10 @@ struct bpf_tunnel_key {
 	__u32 tunnel_label;
 };
 
+struct bpf_sock {
+	__u32 bound_dev_if;
+};
+
 /* User return codes for XDP prog type.
  * A valid XDP program must return one of these defined values. All other
  * return codes are reserved for future use. Unknown return codes will result
diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c
index 8fe55ffd109d..a515f7b007c6 100644
--- a/kernel/bpf/cgroup.c
+++ b/kernel/bpf/cgroup.c
@@ -165,3 +165,36 @@ int __cgroup_bpf_run_filter_skb(struct sock *sk,
 	return ret;
 }
 EXPORT_SYMBOL(__cgroup_bpf_run_filter_skb);
+
+/**
+ * __cgroup_bpf_run_filter_sk() - Run a program on a sock
+ * @sk: sock structure to manipulate
+ * @type: The type of program to be executed
+ *
+ * The socket passed in is expected to be of type INET or INET6.
+ *
+ * The program type passed in via @type must be suitable for sock
+ * filtering. No further check is performed to assert that.
+ *
+ * This function will return %-EPERM if an attached program was found
+ * and if it returned != 1 during execution. In all other cases, 0 is returned.
+ */
+int __cgroup_bpf_run_filter_sk(struct sock *sk,
+			       enum bpf_attach_type type)
+{
+	struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
+	struct bpf_prog *prog;
+	int ret = 0;
+
+	/* look up and run the effective program under the RCU read lock */
+	rcu_read_lock();
+
+	prog = rcu_dereference(cgrp->bpf.effective[type]);
+	if (prog)
+		ret = BPF_PROG_RUN(prog, sk) == 1 ? 0 : -EPERM;
+
+	rcu_read_unlock();
+
+	return ret;
+}
+EXPORT_SYMBOL(__cgroup_bpf_run_filter_sk);
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 5518a6839ab1..85af86c496cd 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -869,7 +869,9 @@ static int bpf_prog_attach(const union bpf_attr *attr)
 	case BPF_CGROUP_INET_EGRESS:
 		ptype = BPF_PROG_TYPE_CGROUP_SKB;
 		break;
-
+	case BPF_CGROUP_INET_SOCK_CREATE:
+		ptype = BPF_PROG_TYPE_CGROUP_SOCK;
+		break;
 	default:
 		return -EINVAL;
 	}
@@ -905,6 +907,7 @@ static int bpf_prog_detach(const union bpf_attr *attr)
 	switch (attr->attach_type) {
 	case BPF_CGROUP_INET_INGRESS:
 	case BPF_CGROUP_INET_EGRESS:
+	case BPF_CGROUP_INET_SOCK_CREATE:
 		cgrp = cgroup_get_from_fd(attr->target_fd);
 		if (IS_ERR(cgrp))
 			return PTR_ERR(cgrp);
diff --git a/net/core/filter.c b/net/core/filter.c
index 698a262b8ebb..5ee722dc097d 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -2676,6 +2676,32 @@ static bool sk_filter_is_valid_access(int off, int size,
 	return __is_valid_access(off, size, type);
 }
 
+static bool sock_filter_is_valid_access(int off, int size,
+					enum bpf_access_type type,
+					enum bpf_reg_type *reg_type)
+{
+	if (type == BPF_WRITE) {
+		switch (off) {
+		case offsetof(struct bpf_sock, bound_dev_if):
+			break;
+		default:
+			return false;
+		}
+	}
+
+	if (off < 0 || off + size > sizeof(struct bpf_sock))
+		return false;
+
+	/* The verifier guarantees that size > 0. */
+	if (off % size != 0)
+		return false;
+
+	if (size != sizeof(__u32))
+		return false;
+
+	return true;
+}
+
 static int tc_cls_act_prologue(struct bpf_insn *insn_buf, bool direct_write,
 			       const struct bpf_prog *prog)
 {
@@ -2934,6 +2960,30 @@ static u32 sk_filter_convert_ctx_access(enum bpf_access_type type, int dst_reg,
 	return insn - insn_buf;
 }
 
+static u32 sock_filter_convert_ctx_access(enum bpf_access_type type,
+					  int dst_reg, int src_reg,
+					  int ctx_off,
+					  struct bpf_insn *insn_buf,
+					  struct bpf_prog *prog)
+{
+	struct bpf_insn *insn = insn_buf;
+
+	switch (ctx_off) {
+	case offsetof(struct bpf_sock, bound_dev_if):
+		BUILD_BUG_ON(FIELD_SIZEOF(struct sock, sk_bound_dev_if) != 4);
+
+		if (type == BPF_WRITE)
+			*insn++ = BPF_STX_MEM(BPF_W, dst_reg, src_reg,
+					offsetof(struct sock, sk_bound_dev_if));
+		else
+			*insn++ = BPF_LDX_MEM(BPF_W, dst_reg, src_reg,
+				      offsetof(struct sock, sk_bound_dev_if));
+		break;
+	}
+
+	return insn - insn_buf;
+}
+
 static u32 tc_cls_act_convert_ctx_access(enum bpf_access_type type, int dst_reg,
 					 int src_reg, int ctx_off,
 					 struct bpf_insn *insn_buf,
@@ -3007,6 +3057,12 @@ static const struct bpf_verifier_ops cg_skb_ops = {
 	.convert_ctx_access	= sk_filter_convert_ctx_access,
 };
 
+static const struct bpf_verifier_ops cg_sock_ops = {
+	.get_func_proto		= sk_filter_func_proto,
+	.is_valid_access	= sock_filter_is_valid_access,
+	.convert_ctx_access	= sock_filter_convert_ctx_access,
+};
+
 static struct bpf_prog_type_list sk_filter_type __read_mostly = {
 	.ops	= &sk_filter_ops,
 	.type	= BPF_PROG_TYPE_SOCKET_FILTER,
@@ -3032,6 +3088,11 @@ static struct bpf_prog_type_list cg_skb_type __read_mostly = {
 	.type	= BPF_PROG_TYPE_CGROUP_SKB,
 };
 
+static struct bpf_prog_type_list cg_sock_type __read_mostly = {
+	.ops	= &cg_sock_ops,
+	.type	= BPF_PROG_TYPE_CGROUP_SOCK
+};
+
 static int __init register_sk_filter_ops(void)
 {
 	bpf_register_prog_type(&sk_filter_type);
@@ -3039,6 +3100,7 @@ static int __init register_sk_filter_ops(void)
 	bpf_register_prog_type(&sched_act_type);
 	bpf_register_prog_type(&xdp_type);
 	bpf_register_prog_type(&cg_skb_type);
+	bpf_register_prog_type(&cg_sock_type);
 
 	return 0;
 }
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index 5ddf5cda07f4..24d2550492ee 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -374,8 +374,18 @@ static int inet_create(struct net *net, struct socket *sock, int protocol,
 
 	if (sk->sk_prot->init) {
 		err = sk->sk_prot->init(sk);
-		if (err)
+		if (err) {
+			sk_common_release(sk);
+			goto out;
+		}
+	}
+
+	if (!kern) {
+		err = BPF_CGROUP_RUN_PROG_INET_SOCK(sk);
+		if (err) {
 			sk_common_release(sk);
+			goto out;
+		}
 	}
 out:
 	return err;
diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c
index d424f3a3737a..237e654ba717 100644
--- a/net/ipv6/af_inet6.c
+++ b/net/ipv6/af_inet6.c
@@ -258,6 +258,14 @@ static int inet6_create(struct net *net, struct socket *sock, int protocol,
 			goto out;
 		}
 	}
+
+	if (!kern) {
+		err = BPF_CGROUP_RUN_PROG_INET_SOCK(sk);
+		if (err) {
+			sk_common_release(sk);
+			goto out;
+		}
+	}
 out:
 	return err;
 out_rcu_unlock:
-- 
2.1.4

^ permalink raw reply related

* [PATCH net-next v6 1/6] bpf: Refactor cgroups code in prep for new type
From: David Ahern @ 2016-11-30 18:16 UTC (permalink / raw)
  To: netdev; +Cc: daniel, ast, daniel, maheshb, tgraf, David Ahern
In-Reply-To: <1480529810-25850-1-git-send-email-dsa@cumulusnetworks.com>

Code move and rename only; no functional change intended.

Signed-off-by: David Ahern <dsa@cumulusnetworks.com>
---
v6, v5
- no change

v4
- dropped refactor of __cgroup_bpf_run_filter and renamed it
  to __cgroup_bpf_run_filter_skb

v3
- dropped the rename

v2
- fix bpf_prog_run_clear_cb to bpf_prog_run_save_cb as caught by Daniel

- rename BPF_PROG_TYPE_CGROUP_SKB and its cg_skb functions to
  BPF_PROG_TYPE_CGROUP and cgroup

 include/linux/bpf-cgroup.h | 46 +++++++++++++++++++++++-----------------------
 kernel/bpf/cgroup.c        | 10 +++++-----
 kernel/bpf/syscall.c       | 28 +++++++++++++++-------------
 3 files changed, 43 insertions(+), 41 deletions(-)

diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h
index ec80d0c0953e..7f0fc635b13e 100644
--- a/include/linux/bpf-cgroup.h
+++ b/include/linux/bpf-cgroup.h
@@ -37,31 +37,31 @@ void cgroup_bpf_update(struct cgroup *cgrp,
 		       struct bpf_prog *prog,
 		       enum bpf_attach_type type);
 
-int __cgroup_bpf_run_filter(struct sock *sk,
-			    struct sk_buff *skb,
-			    enum bpf_attach_type type);
-
-/* Wrappers for __cgroup_bpf_run_filter() guarded by cgroup_bpf_enabled. */
-#define BPF_CGROUP_RUN_PROG_INET_INGRESS(sk,skb)			\
-({									\
-	int __ret = 0;							\
-	if (cgroup_bpf_enabled)						\
-		__ret = __cgroup_bpf_run_filter(sk, skb,		\
-						BPF_CGROUP_INET_INGRESS); \
-									\
-	__ret;								\
+int __cgroup_bpf_run_filter_skb(struct sock *sk,
+				struct sk_buff *skb,
+				enum bpf_attach_type type);
+
+/* Wrappers for __cgroup_bpf_run_filter_skb() guarded by cgroup_bpf_enabled. */
+#define BPF_CGROUP_RUN_PROG_INET_INGRESS(sk, skb)			      \
+({									      \
+	int __ret = 0;							      \
+	if (cgroup_bpf_enabled)						      \
+		__ret = __cgroup_bpf_run_filter_skb(sk, skb,		      \
+						    BPF_CGROUP_INET_INGRESS); \
+									      \
+	__ret;								      \
 })
 
-#define BPF_CGROUP_RUN_PROG_INET_EGRESS(sk,skb)				\
-({									\
-	int __ret = 0;							\
-	if (cgroup_bpf_enabled && sk && sk == skb->sk) {		\
-		typeof(sk) __sk = sk_to_full_sk(sk);			\
-		if (sk_fullsock(__sk))					\
-			__ret = __cgroup_bpf_run_filter(__sk, skb,	\
-						BPF_CGROUP_INET_EGRESS); \
-	}								\
-	__ret;								\
+#define BPF_CGROUP_RUN_PROG_INET_EGRESS(sk, skb)			       \
+({									       \
+	int __ret = 0;							       \
+	if (cgroup_bpf_enabled && sk && sk == skb->sk) {		       \
+		typeof(sk) __sk = sk_to_full_sk(sk);			       \
+		if (sk_fullsock(__sk))					       \
+			__ret = __cgroup_bpf_run_filter_skb(__sk, skb,	       \
+						      BPF_CGROUP_INET_EGRESS); \
+	}								       \
+	__ret;								       \
 })
 
 #else
diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c
index 8c784f8c67cd..8fe55ffd109d 100644
--- a/kernel/bpf/cgroup.c
+++ b/kernel/bpf/cgroup.c
@@ -118,7 +118,7 @@ void __cgroup_bpf_update(struct cgroup *cgrp,
 }
 
 /**
- * __cgroup_bpf_run_filter() - Run a program for packet filtering
+ * __cgroup_bpf_run_filter_skb() - Run a program for packet filtering
  * @sk: The socken sending or receiving traffic
  * @skb: The skb that is being sent or received
  * @type: The type of program to be exectuted
@@ -132,9 +132,9 @@ void __cgroup_bpf_update(struct cgroup *cgrp,
  * This function will return %-EPERM if any if an attached program was found
  * and if it returned != 1 during execution. In all other cases, 0 is returned.
  */
-int __cgroup_bpf_run_filter(struct sock *sk,
-			    struct sk_buff *skb,
-			    enum bpf_attach_type type)
+int __cgroup_bpf_run_filter_skb(struct sock *sk,
+				struct sk_buff *skb,
+				enum bpf_attach_type type)
 {
 	struct bpf_prog *prog;
 	struct cgroup *cgrp;
@@ -164,4 +164,4 @@ int __cgroup_bpf_run_filter(struct sock *sk,
 
 	return ret;
 }
-EXPORT_SYMBOL(__cgroup_bpf_run_filter);
+EXPORT_SYMBOL(__cgroup_bpf_run_filter_skb);
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 4caa18e6860a..5518a6839ab1 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -856,6 +856,7 @@ static int bpf_prog_attach(const union bpf_attr *attr)
 {
 	struct bpf_prog *prog;
 	struct cgroup *cgrp;
+	enum bpf_prog_type ptype;
 
 	if (!capable(CAP_NET_ADMIN))
 		return -EPERM;
@@ -866,25 +867,26 @@ static int bpf_prog_attach(const union bpf_attr *attr)
 	switch (attr->attach_type) {
 	case BPF_CGROUP_INET_INGRESS:
 	case BPF_CGROUP_INET_EGRESS:
-		prog = bpf_prog_get_type(attr->attach_bpf_fd,
-					 BPF_PROG_TYPE_CGROUP_SKB);
-		if (IS_ERR(prog))
-			return PTR_ERR(prog);
-
-		cgrp = cgroup_get_from_fd(attr->target_fd);
-		if (IS_ERR(cgrp)) {
-			bpf_prog_put(prog);
-			return PTR_ERR(cgrp);
-		}
-
-		cgroup_bpf_update(cgrp, prog, attr->attach_type);
-		cgroup_put(cgrp);
+		ptype = BPF_PROG_TYPE_CGROUP_SKB;
 		break;
 
 	default:
 		return -EINVAL;
 	}
 
+	prog = bpf_prog_get_type(attr->attach_bpf_fd, ptype);
+	if (IS_ERR(prog))
+		return PTR_ERR(prog);
+
+	cgrp = cgroup_get_from_fd(attr->target_fd);
+	if (IS_ERR(cgrp)) {
+		bpf_prog_put(prog);
+		return PTR_ERR(cgrp);
+	}
+
+	cgroup_bpf_update(cgrp, prog, attr->attach_type);
+	cgroup_put(cgrp);
+
 	return 0;
 }
 
-- 
2.1.4

^ permalink raw reply related

* [PATCH net-next v6 0/6] net: Add bpf support for sockets
From: David Ahern @ 2016-11-30 18:16 UTC (permalink / raw)
  To: netdev; +Cc: daniel, ast, daniel, maheshb, tgraf, David Ahern

The recently added VRF support in Linux leverages the bind-to-device
API for programs to specify an L3 domain for a socket. While
SO_BINDTODEVICE has been around for ages, not every ipv4/ipv6 capable
program has support for it. Even for those programs that do support it,
the API requires processes to be started as root (CAP_NET_RAW) which
is not desirable from a general security perspective.

This patch set leverages Daniel Mack's work to attach bpf programs to
a cgroup to provide a capability to set sk_bound_dev_if for all
AF_INET{6} sockets opened by a process in a cgroup when the sockets
are allocated.

For example:
 1. configure vrf (e.g., using ifupdown2)
        auto eth0
        iface eth0 inet dhcp
            vrf mgmt

        auto mgmt
        iface mgmt
            vrf-table auto

 2. configure cgroup
        mount -t cgroup2 none /tmp/cgroupv2
        mkdir /tmp/cgroupv2/mgmt
        test_cgrp2_sock /tmp/cgroupv2/mgmt 15

 3. set shell into cgroup (e.g., can be done at login using pam)
        echo $$ >> /tmp/cgroupv2/mgmt/cgroup.procs

At this point all commands run in the shell (e.g., apt) have sockets
automatically bound to the VRF (see output of ss -ap 'dev == <vrf>'),
including processes not running as root.

This capability enables running any program in a VRF context and is key
to deploying Management VRF, a fundamental configuration for networking
gear, with any Linux OS installation.

This patchset also exports the socket family, type and protocol as
read-only allowing bpf filters to deny a process in a cgroup the ability
to open specific types of AF_INET or AF_INET6 sockets.

v6
- add export of socket family, type and protocol

David Ahern (6):
  bpf: Refactor cgroups code in prep for new type
  bpf: Add new cgroup attach type to enable sock modifications
  samples: bpf: add userspace example for modifying sk_bound_dev_if
  bpf: Add support for reading socket family, type, protocol
  samples/bpf: Update bpf loader for cgroup section names
  samples/bpf: add userspace example for prohibiting sockets

 include/linux/bpf-cgroup.h      |  60 ++++++++++++++---------
 include/net/sock.h              |  15 ++++++
 include/uapi/linux/bpf.h        |   9 ++++
 kernel/bpf/cgroup.c             |  43 +++++++++++++++--
 kernel/bpf/syscall.c            |  33 +++++++------
 net/core/filter.c               | 104 ++++++++++++++++++++++++++++++++++++++++
 net/ipv4/af_inet.c              |  12 ++++-
 net/ipv6/af_inet6.c             |   8 ++++
 samples/bpf/Makefile            |   6 +++
 samples/bpf/bpf_load.c          |  14 ++++--
 samples/bpf/bpf_load.h          |   1 +
 samples/bpf/sock_flags_kern.c   |  37 ++++++++++++++
 samples/bpf/test_cgrp2_sock.c   |  83 ++++++++++++++++++++++++++++++++
 samples/bpf/test_cgrp2_sock.sh  |  47 ++++++++++++++++++
 samples/bpf/test_cgrp2_sock2.c  |  66 +++++++++++++++++++++++++
 samples/bpf/test_cgrp2_sock2.sh |  81 +++++++++++++++++++++++++++++++
 16 files changed, 573 insertions(+), 46 deletions(-)
 create mode 100644 samples/bpf/sock_flags_kern.c
 create mode 100644 samples/bpf/test_cgrp2_sock.c
 create mode 100755 samples/bpf/test_cgrp2_sock.sh
 create mode 100644 samples/bpf/test_cgrp2_sock2.c
 create mode 100755 samples/bpf/test_cgrp2_sock2.sh

-- 
2.1.4

^ permalink raw reply

* [PATCH net-next] cgroup, bpf: remove unnecessary #include
From: Alexei Starovoitov @ 2016-11-30 18:16 UTC (permalink / raw)
  To: David S . Miller; +Cc: Daniel Borkmann, Daniel Mack, Rami Rosen, netdev

This #include is unnecessary and brings a whole set of
other headers into cgroup-defs.h. Remove it.

Fixes: 3007098494be ("cgroup: add support for eBPF programs")
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Rami Rosen <roszenrami@gmail.com>
Acked-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Daniel Mack <daniel@zonque.org>
---
Dave,
this patch got lost somehow (marked accepted, but not in net-next).
Resending.

 include/linux/bpf-cgroup.h | 1 -
 1 file changed, 1 deletion(-)

diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h
index ec80d0c0953e..0cf1adfadd2d 100644
--- a/include/linux/bpf-cgroup.h
+++ b/include/linux/bpf-cgroup.h
@@ -1,7 +1,6 @@
 #ifndef _BPF_CGROUP_H
 #define _BPF_CGROUP_H
 
-#include <linux/bpf.h>
 #include <linux/jump_label.h>
 #include <uapi/linux/bpf.h>
 
-- 
2.8.0

^ permalink raw reply related

* Re: Netperf UDP issue with connected sockets
From: David Miller @ 2016-11-30 18:11 UTC (permalink / raw)
  To: rick.jones2; +Cc: brouer, eric.dumazet, netdev
In-Reply-To: <086ec7ce-ea6a-55bd-cefe-095c2cad9f09@hpe.com>

From: Rick Jones <rick.jones2@hpe.com>
Date: Wed, 30 Nov 2016 09:42:40 -0800

> And indeed, based on a quick check, send() is what is being called,
> though it becomes it seems a sendto() system call - with the
> destination information NJULL:
> 
> write(1, "send\n", 5)                   = 5
> sendto(4, "netperf\0netperf\0netperf\0netperf\0"..., 1024, 0, NULL, 0)
> = 1024
> write(1, "send\n", 5)                   = 5
> sendto(4, "netperf\0netperf\0netperf\0netperf\0"..., 1024, 0, NULL, 0)
> = 1024
> 
> So I'm not sure what might be going-on there.

It's because of glibc's implementation of send() which is:

ssize_t
__libc_send (int sockfd, const void *buffer, size_t len, int flags)
{
  return SYSCALL_CANCEL (sendto, sockfd, buffer, len, flags, NULL, 0);
}
strong_alias (__libc_send, __send)
weak_alias (__libc_send, send)

^ permalink raw reply

* Re: DSA vs. SWTICHDEV ?
From: Florian Fainelli @ 2016-11-30 18:10 UTC (permalink / raw)
  To: Joakim Tjernlund, andrew@lunn.ch; +Cc: netdev@vger.kernel.org
In-Reply-To: <1480527852.3563.146.camel@infinera.com>

On 11/30/2016 09:44 AM, Joakim Tjernlund wrote:
> On Wed, 2016-11-30 at 17:55 +0100, Andrew Lunn wrote:
>>> This is an embedded system with several boards in a subrack.
>>> Each board has eth I/F connected to a switch to communicate with each other.
>>> One of the board will also house the actual switch device and manage the switch.
>>> Then the normal app just communicates over the physical eth I/F like any other board
>>> in the system. There is a "manage switch app" which brings the switch up and partition
>>> phys VLANs etc. (each phys I/F would be a a separate domain so no loop)
>>
>> So you are planning on throwing away the "manage switch app", and just
>> use standard linux networking commands? That is what switchdev is all
>> about really, throwing away the vendor SDK for the switch, making a
>> switch just a bunch on interfaces on the host which you manage as
>> normal interfaces.
> 
> Something like that. I need to run routing protocols on the switch I/Fs and egress
> pkgs on selected switch I/Fs bypassing ARP, just like DSA does with its vendor
> tags.
> 
>>
>>> I guess I could skip the phys I/F and have the switch app create a virtual eth0 I/F over PCIe
>>
>> No need to create this interface. It will exist if you go the
>> switchdev route.
>>
>>>>> And switchdev can do all this over PCIe instead? Can you have a
>>>>> switch tree in switchdev too?
>>>>
>>>> Mellonex says so, but i don't think they have actually implemented it.
>>>
>>> Not impl. any of DSAs features? What can you do with a Mellonex switch then?
>>
>> They don't have a tree of switches, as far as i know. Just a single
>> switch. But DSA does support a tree of switches, that is what the D in
>> DSA means, distributed. And there are a couple of boards which have 2
>> to 4 switches in a tree.
> 
> We might have a tree as well so now I really wonder: Given we write a
> proper switchdev driver, can it support switchtrees without touching
> switchdev infra structure? If not I guess we will attach a physical
> eth I/F to the switch and use both DSA and switchdev to support both trees
> and HW offload. 

switchdev in itself really is the glue layer between the networking
stack and the Ethernet switch driver, defining how to push specific
objects down to that driver. Switchdev does not enforce a
specific network device driver model object, and just provides standard
hooks for your network devices to register with switchdev in order to
push/receive offloads. DSA on the other hand, utilizes switchdev to get
notifications about offloads from the networking stack, but also exposes
a clearly and well defined Ethernet switch device driver model, as
Andrew described, it creates per-port network devices, binds the ports
to their PHYs (built-in, or external), and also takes care of
encapsulating/decapsulating the switch specific tagging protocol.

We should probably put that in some crystal clear sentence somewhere in
Documentation/networking/ but switchdev and DSA are complementary and
not competitors, they just do not tackle the problems from the same angle.

> 
>>
>> I think this is partially down to market segments. Mellonex market is
>> top of rack switches. High port count, very high bandwidth. DSA is
>> more wireless access points, set top boxes, generally up to 7 ports of
>> 1Gbps and a few custom embedded products which need more ports, so
>> build a tree of switches.
> 
> We have on an existing board with a BCM ROBO switch with lots of ports(>24),
> managed over SPI. Looking at BCM DSA tag code it looks like it only supports
> some 8 ports or so. I still have to find out if this is a limitation in BCM tagging
> protocol or if just not impl. in DSA yet.

Oh cool, can you share the model by chance? I suspect the tagging format
of that switch is going to be different from what net/dsa/tag_brcm.c
implements, so feel free to add something like NET_DSA_TAG_BRCM8B (for
8 bytes).

Note that DSA currently hardcodes the maximum number of ports to 14
(DSA_MAX_PORTS), but this should obviously be something dynamically
determined based on probing the switch device.

Can you also evaluate if using drivers/net/dsa/b53/ would work for you?
My hope would be that they preserved the register compatibility here,
but since this has a large number of ports, it may have completely
offset most registers.

BTW, there is #linuxswitch on Freenode if you want to chat!
-- 
Florian

^ permalink raw reply

* Re: DSA vs. SWTICHDEV ?
From: Andrew Lunn @ 2016-11-30 18:09 UTC (permalink / raw)
  To: Joakim Tjernlund, Jiri Pirko, Florian Fainelli; +Cc: netdev@vger.kernel.org
In-Reply-To: <1480527852.3563.146.camel@infinera.com>

> Something like that. I need to run routing protocols on the switch I/Fs and egress
> pkgs on selected switch I/Fs bypassing ARP, just like DSA does with its vendor
> tags.

Does the switch have an equivalent tagging protocol? If you are
building a tree of switches you need something like this for frames
going from the host via intermediate switches and out a specific port
on a remote switch.

> We might have a tree as well so now I really wonder: Given we write a
> proper switchdev driver, can it support switchtrees without touching
> switchdev infra structure?

Jiri Pirko <jiri@resnulli.us> is probably the best person to ask about
this. DSA hides the knowledge that there are multiple switches. To
switchdev, a tree of switches looks like one switch. This is not
because of switchdev, it is just how the existing DSA code worked when
switchdev came along.

 If not I guess we will attach a physical
> eth I/F to the switch and use both DSA and switchdev to support both trees
> and HW offload. 

This only works if the switch has the necessary tagging protocol to
pass through multiple switches.

> We have on an existing board with a BCM ROBO switch with lots of ports(>24),
> managed over SPI. Looking at BCM DSA tag code it looks like it only supports
> some 8 ports or so. I still have to find out if this is a limitation in BCM tagging
> protocol or if just not impl. in DSA yet.

Hi Florian, care to comment?

As far as i understand, the tag used for SF2 and B53 does not support
a tree of switches. But the big ROBO switches might have a different
tagging protocol.

  Andrew

^ permalink raw reply

* Re: DSA vs. SWTICHDEV ?
From: Joakim Tjernlund @ 2016-11-30 17:44 UTC (permalink / raw)
  To: andrew@lunn.ch; +Cc: netdev@vger.kernel.org
In-Reply-To: <20161130165500.GH21645@lunn.ch>

On Wed, 2016-11-30 at 17:55 +0100, Andrew Lunn wrote:
> > This is an embedded system with several boards in a subrack.
> > Each board has eth I/F connected to a switch to communicate with each other.
> > One of the board will also house the actual switch device and manage the switch.
> > Then the normal app just communicates over the physical eth I/F like any other board
> > in the system. There is a "manage switch app" which brings the switch up and partition
> > phys VLANs etc. (each phys I/F would be a a separate domain so no loop)
> 
> So you are planning on throwing away the "manage switch app", and just
> use standard linux networking commands? That is what switchdev is all
> about really, throwing away the vendor SDK for the switch, making a
> switch just a bunch on interfaces on the host which you manage as
> normal interfaces.

Something like that. I need to run routing protocols on the switch I/Fs and egress
pkgs on selected switch I/Fs bypassing ARP, just like DSA does with its vendor
tags.

> 
> > I guess I could skip the phys I/F and have the switch app create a virtual eth0 I/F over PCIe
> 
> No need to create this interface. It will exist if you go the
> switchdev route.
> 
> > > > And switchdev can do all this over PCIe instead? Can you have a
> > > > switch tree in switchdev too?
> > > 
> > > Mellonex says so, but i don't think they have actually implemented it.
> > 
> > Not impl. any of DSAs features? What can you do with a Mellonex switch then?
> 
> They don't have a tree of switches, as far as i know. Just a single
> switch. But DSA does support a tree of switches, that is what the D in
> DSA means, distributed. And there are a couple of boards which have 2
> to 4 switches in a tree.

We might have a tree as well so now I really wonder: Given we write a
proper switchdev driver, can it support switchtrees without touching
switchdev infra structure? If not I guess we will attach a physical
eth I/F to the switch and use both DSA and switchdev to support both trees
and HW offload. 

> 
> I think this is partially down to market segments. Mellonex market is
> top of rack switches. High port count, very high bandwidth. DSA is
> more wireless access points, set top boxes, generally up to 7 ports of
> 1Gbps and a few custom embedded products which need more ports, so
> build a tree of switches.

We have on an existing board with a BCM ROBO switch with lots of ports(>24),
managed over SPI. Looking at BCM DSA tag code it looks like it only supports
some 8 ports or so. I still have to find out if this is a limitation in BCM tagging
protocol or if just not impl. in DSA yet.

^ permalink raw reply

* Re: [patch net v2] net: fec: cache statistics while device is down
From: David Miller @ 2016-11-30 17:44 UTC (permalink / raw)
  To: nikita.yoush
  Cc: fugang.duan, troy.kisky, andrew, eric, tremyfr, johannes, netdev,
	cphealy, fabio.estevam, linux-kernel
In-Reply-To: <1480401891-19333-1-git-send-email-nikita.yoush@cogentembedded.com>

From: Nikita Yushchenko <nikita.yoush@cogentembedded.com>
Date: Tue, 29 Nov 2016 09:44:51 +0300

> Execution 'ethtool -S' on fec device that is down causes OOPS on Vybrid
> board:
> 
> Unhandled fault: external abort on non-linefetch (0x1008) at 0xe0898200
> pgd = ddecc000
> [e0898200] *pgd=9e406811, *pte=400d1653, *ppte=400d1453
> Internal error: : 1008 [#1] SMP ARM
> ...
> 
> Reason of OOPS is that fec_enet_get_ethtool_stats() accesses fec
> registers while IPG clock is stopped by PM.
> 
> Fix that by caching statistics in fec_enet_private. Cache is initialized
> at device probe time, and updated at statistics request time if device
> is up, and also just before turning device off on down path.
> 
> Additional locking is not needed, since cached statistics is accessed
> either before device is registered, or under rtnl_lock().
> 
> Signed-off-by: Nikita Yushchenko <nikita.yoush@cogentembedded.com>

Applied, thanks.

^ permalink raw reply

* Re: [PATCH net-next] samples/bpf: fix include path
From: David Miller @ 2016-11-30 17:42 UTC (permalink / raw)
  To: ast; +Cc: daniel, netdev
In-Reply-To: <1480399642-2475887-1-git-send-email-ast@fb.com>

From: Alexei Starovoitov <ast@fb.com>
Date: Mon, 28 Nov 2016 22:07:22 -0800

> Fix the following build error:
> HOSTCC  samples/bpf/test_lru_dist.o
> ../samples/bpf/test_lru_dist.c:25:22: fatal error: bpf_util.h: No such file or directory
> 
> This is due to objtree != srctree.
> Use srctree, since that's where bpf_util.h is located.
> 
> Fixes: e00c7b216f34 ("bpf: fix multiple issues in selftest suite and samples")
> Signed-off-by: Alexei Starovoitov <ast@kernel.org>

Applied, thanks.

^ permalink raw reply

* Re: Netperf UDP issue with connected sockets
From: Rick Jones @ 2016-11-30 17:42 UTC (permalink / raw)
  To: Jesper Dangaard Brouer; +Cc: Eric Dumazet, netdev
In-Reply-To: <20161130114339.78c6d4fa@redhat.com>

On 11/30/2016 02:43 AM, Jesper Dangaard Brouer wrote:
> Notice the "fib_lookup" cost is still present, even when I use
> option "-- -n -N" to create a connected socket.  As Eric taught us,
> this is because we should use syscalls "send" or "write" on a connected
> socket.

In theory, once the data socket is connected, the send_data() call in 
src/nettest_omni.c is supposed to use send() rather than sendto().

And indeed, based on a quick check, send() is what is being called,
though it seems to become a sendto() system call - with the destination
information NULL:

write(1, "send\n", 5)                   = 5
sendto(4, "netperf\0netperf\0netperf\0netperf\0"..., 1024, 0, NULL, 0) = 
1024
write(1, "send\n", 5)                   = 5
sendto(4, "netperf\0netperf\0netperf\0netperf\0"..., 1024, 0, NULL, 0) = 
1024

So I'm not sure what might be going-on there.

You can get netperf to use write() instead of send() by adding a 
test-specific -I option.

happy benchmarking,

rick

>
> My udp_flood tool[1] cycle through the different syscalls:
>
> taskset -c 2 ~/git/network-testing/src/udp_flood 198.18.50.1 --count $((10**7)) --pmtu 2
>              	ns/pkt	pps		cycles/pkt
> send      	473.08	2113816.28	1891
> sendto    	558.58	1790265.84	2233
> sendmsg   	587.24	1702873.80	2348
> sendMmsg/32  	547.57	1826265.90	2189
> write     	518.36	1929175.52	2072
>
> Using "send" seems to be the fastest option.
>
> Some notes on test: I've forced TX completions to happen on another CPU0
> and pinned the udp_flood program (to CPU2) as I want to avoid the CPU
> scheduler to move udp_flood around as this cause fluctuations in the
> results (as it stress the memory allocations more).
>
> My udp_flood --pmtu option is documented in the --help usage text (see below signature)
>

^ permalink raw reply

* Re: [net-next] macvtap: replace printk with netdev_err
From: David Miller @ 2016-11-30 17:41 UTC (permalink / raw)
  To: zhangshengju; +Cc: netdev, jasowang
In-Reply-To: <1480389992-69609-1-git-send-email-zhangshengju@cmss.chinamobile.com>

From: Zhang Shengju <zhangshengju@cmss.chinamobile.com>
Date: Tue, 29 Nov 2016 11:26:32 +0800

> This patch replaces printk() with netdev_err() for macvtap device.
> 
> Signed-off-by: Zhang Shengju <zhangshengju@cmss.chinamobile.com>

Applied, thanks.

^ permalink raw reply

* Re: [PATCH net 1/2] Set skb->protocol properly before calling dst_output()
From: David Miller @ 2016-11-30 17:38 UTC (permalink / raw)
  To: elicooper; +Cc: netdev, eric.dumazet
In-Reply-To: <20161129023529.17645-1-elicooper@gmx.com>

From: Eli Cooper <elicooper@gmx.com>
Date: Tue, 29 Nov 2016 10:35:28 +0800

> When xfrm is applied to TSO/GSO packets, it follows this path:
> 
>     xfrm_output() -> xfrm_output_gso() -> skb_gso_segment()
> 
> where skb_gso_segment() relies on skb->protocol to function properly.
> 
> This patch sets skb->protocol properly before dst_output() is called,
> fixing a bug where GSO packets sent through a sit or ipip6 tunnel are
> dropped when xfrm is involved.
> 
> Cc: stable@vger.kernel.org
> Signed-off-by: Eli Cooper <elicooper@gmx.com>
> ---
>  net/ipv4/ip_output.c   | 4 +++-
>  net/ipv6/output_core.c | 4 +++-
>  2 files changed, 6 insertions(+), 2 deletions(-)
> 
> diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
> index 105908d..0180e44 100644
> --- a/net/ipv4/ip_output.c
> +++ b/net/ipv4/ip_output.c
> @@ -117,8 +117,10 @@ int ip_local_out(struct net *net, struct sock *sk, struct sk_buff *skb)
>  	int err;
>  
>  	err = __ip_local_out(net, sk, skb);
> -	if (likely(err == 1))
> +	if (likely(err == 1)) {
> +		skb->protocol = htons(ETH_P_IP);
>  		err = dst_output(net, sk, skb);
> +	}
>  

__ip_local_out() potentially does a dst_output() call too via the
netfilter hook, so you definitely need to place the skb->protocol
assignment before that netfilter hook.

^ permalink raw reply

* Re: Regression: [PATCH] mlx4: give precise rx/tx bytes/packets counters
From: Eric Dumazet @ 2016-11-30 17:35 UTC (permalink / raw)
  To: Saeed Mahameed; +Cc: Jesper Dangaard Brouer, David Miller, netdev, Tariq Toukan
In-Reply-To: <CALzJLG8pcuNo4uD3+dVms1gHT9=WfZAM1DDp_J10NgBZ7YZnEA@mail.gmail.com>

On Wed, 2016-11-30 at 18:46 +0200, Saeed Mahameed wrote:

> we had/still have the proper stats they are the ones that
> mlx4_en_fold_software_stats is trying to cache into  (they always
> exist),
> but the ones that you are trying to read from (the mlx4 rings) are gone !
> 
> This bug is totally new and as i warned, this is another symptom of
> the real root cause (can't sleep while reading stats).
> 
> Eric what do you suggest ? Keep pre-allocated MAX_RINGS stats  and
> always iterate over all of them to query stats ?
> what if you have one ring/none/1K ? how would you know how many to query ?

I am suggesting I will fix the bug I introduced.

Do not panic.

^ permalink raw reply

* Re: [PATCH 2/6] net: ethernet: ti: cpts: add support for ext rftclk selection
From: Grygorii Strashko @ 2016-11-30 17:35 UTC (permalink / raw)
  To: Richard Cochran
  Cc: David S. Miller, netdev-u79uwXL29TY76Z2rM5mHXA, Mugunthan V N,
	Sekhar Nori, linux-kernel-u79uwXL29TY76Z2rM5mHXA,
	linux-omap-u79uwXL29TY76Z2rM5mHXA, Rob Herring,
	devicetree-u79uwXL29TY76Z2rM5mHXA, Murali Karicheri, Wingman Kwok
In-Reply-To: <20161130095632.GC28680-bi+AKbBUZKY6gyzm1THtWbp2dZbC/Bob@public.gmane.org>



On 11/30/2016 03:56 AM, Richard Cochran wrote:
> On Mon, Nov 28, 2016 at 05:04:24PM -0600, Grygorii Strashko wrote:
>> Some CPTS instances, which can be found on KeyStone 2 1/10G Ethernet
>> Switch Subsystems, can control an external multiplexer that selects
>> one of up to 32 clocks for time sync reference (RFTCLK). This feature
>> can be configured through CPTS_RFTCLK_SEL register (offset: x08).
>>
>> Hence, introduce optional DT cpts_rftclk_sel poperty wich, if present,
>> will specify CPTS reference clock. The cpts_rftclk_sel should be
>> omitted in DT if HW doesn't support this feature. The external fixed
>> rate clocks can be defined in board files as "fixed-clock".
> 
> Can't you implement this using the clock tree, rather than an ad-hoc
> DT property?
> 

I've thought about this, but decided to move forward with this impl
 which is pretty simple. I will try.


-- 
regards,
-grygorii
--
To unsubscribe from this list: send the line "unsubscribe devicetree" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply

* Re: BUG() can be hit in tcp_collapse()
From: Eric Dumazet @ 2016-11-30 17:33 UTC (permalink / raw)
  To: Vladis Dronov; +Cc: netdev, stable, Marco Grassi
In-Reply-To: <1418136049.827916.1480525217226.JavaMail.zimbra@redhat.com>

On Wed, 2016-11-30 at 12:00 -0500, Vladis Dronov wrote:
> Hello, Eric, Marco, all,
> 
> This is JFYI and a follow-up message.
> 
> A further investigation was made to find out the Linux kernel commit which has
> introduced the flaw. It appeared that previous Linux kernel versions are vulnerable,
> down to v3.6-rc1. This fact was hidden by 'net.ipv4.tcp_fastopen' set to 0 by default,
> and now it is easier to notice since kernel v3.12 due to commit 0d41cca490 where the
> default was changed to 1. With 'net.ipv4.tcp_fastopen' set to 1, previous Linux
> kernels (including RHEL-7 ones) are also vulnerable.
> 
> The bug is here since tcp-fastopen feature was introduced in kernel v3.6-rc1, the first
> commit when the reproducer starts to panic the kernel with net.ipv4.tcp_fastopen=1 set
> is cf60af03ca, which is a part of commit sequence 2100c8d2d9..67da22d23f introducing
> net-tcp-fastopen feature:
> 
> $ git bisect bad cf60af03ca4e71134206809ea892e49b92a88896
> cf60af03ca4e71134206809ea892e49b92a88896 is the first bad commit
> commit cf60af03ca4e71134206809ea892e49b92a88896
> Author: Yuchung Cheng <ycheng@google.com>
> Date:   Thu Jul 19 06:43:09 2012 +0000
> 
> So, ideally, the upstream commit ac6e780070 which fixes the bug should have
> "Fixes: cf60af03ca" statement, unfortunately, this investigation was not completed at
> the time the patch was accepted upstream. And unfortunately I do not see other way
> to add this information except making notes in a comment in the related code, which
> seems weird.

Well, the crash can happen way before Yuchung patch.

It is a 0-day bug.

^ permalink raw reply

* Re: [PATCH 1/6] net: ethernet: ti: netcp: add support of cpts
From: Grygorii Strashko @ 2016-11-30 17:31 UTC (permalink / raw)
  To: Richard Cochran
  Cc: David S. Miller, netdev-u79uwXL29TY76Z2rM5mHXA, Mugunthan V N,
	Sekhar Nori, linux-kernel-u79uwXL29TY76Z2rM5mHXA,
	linux-omap-u79uwXL29TY76Z2rM5mHXA, Rob Herring,
	devicetree-u79uwXL29TY76Z2rM5mHXA, Murali Karicheri, Wingman Kwok
In-Reply-To: <20161130094441.GB28680-bi+AKbBUZKY6gyzm1THtWbp2dZbC/Bob@public.gmane.org>



On 11/30/2016 03:44 AM, Richard Cochran wrote:
> On Mon, Nov 28, 2016 at 05:04:23PM -0600, Grygorii Strashko wrote:
>> @@ -678,6 +744,9 @@ struct gbe_priv {
>>  	int				num_et_stats;
>>  	/*  Lock for updating the hwstats */
>>  	spinlock_t			hw_stats_lock;
>> +
>> +	int                             cpts_registered;
> 
> The usage of this counter is racy.
> 
>> +	struct cpts                     *cpts;
>>  };
> 
> This ++ and -- business ...
> 
>> +static void gbe_register_cpts(struct gbe_priv *gbe_dev)
>> +{
>> +	if (!gbe_dev->cpts)
>> +		return;
>> +
>> +	if (gbe_dev->cpts_registered > 0)
>> +		goto done;
>> +
>> +	if (cpts_register(gbe_dev->cpts)) {
>> +		dev_err(gbe_dev->dev, "error registering cpts device\n");
>> +		return;
>> +	}
>> +
>> +done:
>> +	++gbe_dev->cpts_registered;
>> +}
>> +
>> +static void gbe_unregister_cpts(struct gbe_priv *gbe_dev)
>> +{
>> +	if (!gbe_dev->cpts || (gbe_dev->cpts_registered <= 0))
>> +		return;
>> +
>> +	if (--gbe_dev->cpts_registered)
>> +		return;
>> +
>> +	cpts_unregister(gbe_dev->cpts);
>> +}
> 
> is invoked from your open() and close() methods, but those methods
> are not serialized among multiple ports.
> 

ok. Seems my assumption that ndo_open/ndo_close are serialized by rtnl_lock is incorrect. Right?
net_device_ops.ndo_open ->
 netcp_ndo_open
 gbe_open
 gbe_register_cpts

-- 
regards,
-grygorii
--
To unsubscribe from this list: send the line "unsubscribe devicetree" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply

* Re: [WIP] net+mlx4: auto doorbell
From: Eric Dumazet @ 2016-11-30 17:28 UTC (permalink / raw)
  To: Saeed Mahameed
  Cc: Jesper Dangaard Brouer, Rick Jones, Linux Netdev List,
	Saeed Mahameed, Tariq Toukan
In-Reply-To: <CALzJLG9iCWkk2WTGrkFhZ61Sx5R-JDCxANhLQZPO8E47QGwOmQ@mail.gmail.com>

On Wed, 2016-11-30 at 18:27 +0200, Saeed Mahameed wrote:

> 
> In this case, i think they should implement their own bulking (pktgen
> is not a good example)
> but XDP can predict if it has more packets to xmit  as long as all of
> them fall in the same NAPI cycle.


> Others should try and do the same.

We can not trust user space to signal us when it is the good time to
perform a doorbell.

trafgen is using af_packet with qdisc bypass for example.

^ permalink raw reply

* Re: [PATCH net-next V2 1/7] net/mlx5e: Implement Fragmented Work Queue (WQ)
From: Sebastian Ott @ 2016-11-30 17:26 UTC (permalink / raw)
  To: Saeed Mahameed
  Cc: David S. Miller, netdev, Tariq Toukan, Or Gerlitz, Roi Dayan
In-Reply-To: <1480521583-12755-2-git-send-email-saeedm@mellanox.com>

Hi,

On Wed, 30 Nov 2016, Saeed Mahameed wrote:
> From: Tariq Toukan <tariqt@mellanox.com>
> 
> Add new type of struct mlx5_frag_buf which is used to allocate fragmented
> buffers rather than contiguous, and make the Completion Queues (CQs) use
> it as they are big (default of 2MB per CQ in Striding RQ).
> 
> This fixes the failures of type:
> "mlx5e_open_locked: mlx5e_open_channels failed, -12"
> due to dma_zalloc_coherent insufficient contiguous coherent memory to
> satisfy the driver's request when the user tries to setup more or larger
> rings.

Thanks for that patch! I can confirm that this fixes the large allocation
issue.

Regards,
Sebastian

^ permalink raw reply

* Re: [Patch net-next] audit: remove useless synchronize_net()
From: Cong Wang @ 2016-11-30 17:20 UTC (permalink / raw)
  To: Richard Guy Briggs
  Cc: Linux Kernel Network Developers, linux-audit, Paul Moore
In-Reply-To: <20161130091643.GA32562@madcap2.tricolour.ca>

On Wed, Nov 30, 2016 at 1:16 AM, Richard Guy Briggs <rgb@redhat.com> wrote:
> On 2016-11-29 09:14, Cong Wang wrote:
>> netlink kernel socket is protected by refcount, not RCU.
>> Its rcv path is not protected by RCU either. So the synchronize_net()
>> is just pointless.
>
> If I understand correctly, xfrm_user_net_exit() usage of
> RCU_INIT_POINTER() and synchronize_net() is similarly pointless?  Also
> net/phonet/socket.c?  I probably modelled things based on the former...

Possibly, but the xfrm case is slightly different: it has two copies of the
pointer to the netlink socket, and it also uses exit_batch(). I need to double check.

Take a look at better examples, fib_front, genetlink, rtnetlink.

^ permalink raw reply

* Re: qed, qedi patchset submission
From: Arun Easi @ 2016-11-30 17:15 UTC (permalink / raw)
  To: Martin K. Petersen; +Cc: David Miller, linux-scsi, netdev
In-Reply-To: <yq1oa0wj4qy.fsf@sermon.lab.mkp.net>

Thanks for the response, Martin.

On Wed, 30 Nov 2016, 8:45am, Martin K. Petersen wrote:

> >>>>> "Arun" == Arun Easi <arun.easi@cavium.com> writes:
> 
> Arun,
> 
> Arun> So far, we have been posting qedi changes split into functional
> Arun> blocks, for review, but was not bisectable. With Martin ok to our
> Arun> request to squash all patches while committing to tree, we were
> Arun> wondering if we should post the qedi patches squashed, with all
> Arun> the Reviewed-by added, or continue to post as before?
> 
> I guess it depends how things can be split up in a bisectable fashion.

These were the patches that were sent:
  1 qed: Add support for hardware offloaded iSCSI.
  2 qed: Add iSCSI out of order packet handling.
  3 qedi: Add QLogic FastLinQ offload iSCSI driver framework.
  4 qedi: Add LL2 iSCSI interface for offload iSCSI.
  5 qedi: Add support for iSCSI session management.
  6 qedi: Add support for data path.

> 
> If the net/ pieces can be completely separated from the scsi/ pieces
> maybe it would be best to have two patches?
> 

Yes, those pieces can be completely separated; there are actually 3 parts,
2 that go to net/ and 1 to scsi/.

In the list above, 1 & 2 go to net/ and are bisectable. 3 through 6 are
scsi/ pieces, which are the ones requested for collapsing.

We will post these pieces for V3, then:

  1 [PATCH v3 net-next 1/3] qed: Add support for hardware offloaded iSCSI.
  2 [PATCH v3 net-next 2/3] qed: Add iSCSI out of order packet handling.
  3 [PATCH v3 3/3] qedi: Add QLogic FastLinQ offload iSCSI driver framework.

Regards,
-Arun

^ permalink raw reply


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox