Netdev List
 help / color / mirror / Atom feed
* [PATCH v7 net-next 3/6] samples: bpf: add userspace example for modifying sk_bound_dev_if
From: David Ahern @ 2016-12-01 16:48 UTC (permalink / raw)
  To: netdev; +Cc: daniel, ast, daniel, maheshb, tgraf, David Ahern
In-Reply-To: <1480610888-31082-1-git-send-email-dsa@cumulusnetworks.com>

Add a simple program to demonstrate the ability to attach a bpf program
to a cgroup that sets sk_bound_dev_if for AF_INET{6} sockets when they
are created.

Signed-off-by: David Ahern <dsa@cumulusnetworks.com>
---
v7
- no change

v6
- added conversion from device name to index in test program

v5
- changed BPF_CGROUP_INET_SOCK to BPF_CGROUP_INET_SOCK_CREATE

v4
- added test_cgrp2_sock.sh for an automated test

v3
- revert to BPF_PROG_TYPE_CGROUP_SOCK prog type

v2
- removed bpf_sock_store_u32 references
- changed BPF_CGROUP_INET_SOCK_CREATE to BPF_CGROUP_INET_SOCK
- remove BPF_PROG_TYPE_CGROUP_SOCK prog type and add prog_subtype

 samples/bpf/Makefile           |  2 +
 samples/bpf/test_cgrp2_sock.c  | 83 ++++++++++++++++++++++++++++++++++++++++++
 samples/bpf/test_cgrp2_sock.sh | 47 ++++++++++++++++++++++++
 3 files changed, 132 insertions(+)
 create mode 100644 samples/bpf/test_cgrp2_sock.c
 create mode 100755 samples/bpf/test_cgrp2_sock.sh

diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile
index 3ceb5a9d86df..a335b218198e 100644
--- a/samples/bpf/Makefile
+++ b/samples/bpf/Makefile
@@ -23,6 +23,7 @@ hostprogs-y += map_perf_test
 hostprogs-y += test_overhead
 hostprogs-y += test_cgrp2_array_pin
 hostprogs-y += test_cgrp2_attach
+hostprogs-y += test_cgrp2_sock
 hostprogs-y += xdp1
 hostprogs-y += xdp2
 hostprogs-y += test_current_task_under_cgroup
@@ -51,6 +52,7 @@ map_perf_test-objs := bpf_load.o libbpf.o map_perf_test_user.o
 test_overhead-objs := bpf_load.o libbpf.o test_overhead_user.o
 test_cgrp2_array_pin-objs := libbpf.o test_cgrp2_array_pin.o
 test_cgrp2_attach-objs := libbpf.o test_cgrp2_attach.o
+test_cgrp2_sock-objs := libbpf.o test_cgrp2_sock.o
 xdp1-objs := bpf_load.o libbpf.o xdp1_user.o
 # reuse xdp1 source intentionally
 xdp2-objs := bpf_load.o libbpf.o xdp1_user.o
diff --git a/samples/bpf/test_cgrp2_sock.c b/samples/bpf/test_cgrp2_sock.c
new file mode 100644
index 000000000000..d467b3c1c55c
--- /dev/null
+++ b/samples/bpf/test_cgrp2_sock.c
@@ -0,0 +1,83 @@
+/* eBPF example program:
+ *
+ * - Loads eBPF program
+ *
+ *   The eBPF program sets the sk_bound_dev_if index in new AF_INET{6}
+ *   sockets opened by processes in the cgroup.
+ *
+ * - Attaches the new program to a cgroup using BPF_PROG_ATTACH
+ */
+
+#define _GNU_SOURCE
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <stddef.h>
+#include <string.h>
+#include <unistd.h>
+#include <assert.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <net/if.h>
+#include <linux/bpf.h>
+
+#include "libbpf.h"
+
+static int prog_load(int idx)
+{
+	struct bpf_insn prog[] = {
+		BPF_MOV64_REG(BPF_REG_6, BPF_REG_1),
+		BPF_MOV64_IMM(BPF_REG_3, idx),
+		BPF_MOV64_IMM(BPF_REG_2, offsetof(struct bpf_sock, bound_dev_if)),
+		BPF_STX_MEM(BPF_W, BPF_REG_1, BPF_REG_3, offsetof(struct bpf_sock, bound_dev_if)),
+		BPF_MOV64_IMM(BPF_REG_0, 1), /* r0 = verdict */
+		BPF_EXIT_INSN(),
+	};
+
+	return bpf_prog_load(BPF_PROG_TYPE_CGROUP_SOCK, prog, sizeof(prog),
+			     "GPL", 0);
+}
+
+static int usage(const char *argv0)
+{
+	printf("Usage: %s cg-path device-index\n", argv0);
+	return EXIT_FAILURE;
+}
+
+int main(int argc, char **argv)
+{
+	int cg_fd, prog_fd, ret;
+	unsigned int idx;
+
+	if (argc < 2)
+		return usage(argv[0]);
+
+	idx = if_nametoindex(argv[2]);
+	if (!idx) {
+		printf("Invalid device name\n");
+		return EXIT_FAILURE;
+	}
+
+	cg_fd = open(argv[1], O_DIRECTORY | O_RDONLY);
+	if (cg_fd < 0) {
+		printf("Failed to open cgroup path: '%s'\n", strerror(errno));
+		return EXIT_FAILURE;
+	}
+
+	prog_fd = prog_load(idx);
+	printf("Output from kernel verifier:\n%s\n-------\n", bpf_log_buf);
+
+	if (prog_fd < 0) {
+		printf("Failed to load prog: '%s'\n", strerror(errno));
+		return EXIT_FAILURE;
+	}
+
+	ret = bpf_prog_attach(prog_fd, cg_fd, BPF_CGROUP_INET_SOCK_CREATE);
+	if (ret < 0) {
+		printf("Failed to attach prog to cgroup: '%s'\n",
+		       strerror(errno));
+		return EXIT_FAILURE;
+	}
+
+	return EXIT_SUCCESS;
+}
diff --git a/samples/bpf/test_cgrp2_sock.sh b/samples/bpf/test_cgrp2_sock.sh
new file mode 100755
index 000000000000..925fd467c7cc
--- /dev/null
+++ b/samples/bpf/test_cgrp2_sock.sh
@@ -0,0 +1,47 @@
+#!/bin/bash
+
+function config_device {
+	ip netns add at_ns0
+	ip link add veth0 type veth peer name veth0b
+	ip link set veth0b up
+	ip link set veth0 netns at_ns0
+	ip netns exec at_ns0 ip addr add 172.16.1.100/24 dev veth0
+	ip netns exec at_ns0 ip addr add 2401:db00::1/64 dev veth0 nodad
+	ip netns exec at_ns0 ip link set dev veth0 up
+	ip link add foo type vrf table 1234
+	ip link set foo up
+	ip addr add 172.16.1.101/24 dev veth0b
+	ip addr add 2401:db00::2/64 dev veth0b nodad
+	ip link set veth0b master foo
+}
+
+function attach_bpf {
+	rm -rf /tmp/cgroupv2
+	mkdir -p /tmp/cgroupv2
+	mount -t cgroup2 none /tmp/cgroupv2
+	mkdir -p /tmp/cgroupv2/foo
+	test_cgrp2_sock /tmp/cgroupv2/foo foo
+	echo $$ >> /tmp/cgroupv2/foo/cgroup.procs
+}
+
+function cleanup {
+	set +ex
+	ip netns delete at_ns0
+	ip link del veth0
+	ip link del foo
+	umount /tmp/cgroupv2
+	rm -rf /tmp/cgroupv2
+	set -ex
+}
+
+function do_test {
+	ping -c1 -w1 172.16.1.100
+	ping6 -c1 -w1 2401:db00::1
+}
+
+cleanup 2>/dev/null
+config_device
+attach_bpf
+do_test
+cleanup
+echo "*** PASS ***"
-- 
2.1.4

^ permalink raw reply related

* [PATCH v7 net-next 2/6] bpf: Add new cgroup attach type to enable sock modifications
From: David Ahern @ 2016-12-01 16:48 UTC (permalink / raw)
  To: netdev; +Cc: daniel, ast, daniel, maheshb, tgraf, David Ahern
In-Reply-To: <1480610888-31082-1-git-send-email-dsa@cumulusnetworks.com>

Add new cgroup based program type, BPF_PROG_TYPE_CGROUP_SOCK. Similar to
BPF_PROG_TYPE_CGROUP_SKB programs can be attached to a cgroup and run
any time a process in the cgroup opens an AF_INET or AF_INET6 socket.
Currently only sk_bound_dev_if is exported to userspace for modification
by a bpf program.

This allows a cgroup to be configured such that AF_INET{6} sockets opened
by processes are automatically bound to a specific device. In turn, this
enables the running of programs that do not support SO_BINDTODEVICE in a
specific VRF context / L3 domain.

Signed-off-by: David Ahern <dsa@cumulusnetworks.com>
---
v7
- no change

v6
- added size check to sock_filter_is_valid_access; accesses must be u32

v5
- no change

v4
- dropped tweak to bpf_func signature
- dropped cg_sock_func_proto in favor of sk_filter_func_proto
- new __cgroup_bpf_run_filter_sk versus overloading __cgroup_bpf_run_filter
- reverted BPF_CGROUP_INET_SOCK to BPF_CGROUP_INET_SOCK_CREATE

v3
- reverted to new prog type BPF_PROG_TYPE_CGROUP_SOCK
- dropped the subtype

v2
- dropped the bpf_sock_store_u32 helper
- dropped the new prog type BPF_PROG_TYPE_CGROUP_SOCK
- moved valid access and context conversion to use subtype
- dropped CREATE from BPF_CGROUP_INET_SOCK and related function names
- moved running of filter from sk_alloc to inet{6}_create

 include/linux/bpf-cgroup.h | 14 +++++++++++
 include/uapi/linux/bpf.h   |  6 +++++
 kernel/bpf/cgroup.c        | 33 ++++++++++++++++++++++++
 kernel/bpf/syscall.c       |  5 +++-
 net/core/filter.c          | 62 ++++++++++++++++++++++++++++++++++++++++++++++
 net/ipv4/af_inet.c         | 12 ++++++++-
 net/ipv6/af_inet6.c        |  8 ++++++
 7 files changed, 138 insertions(+), 2 deletions(-)

diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h
index af2ca8b432c0..7b6e5d168c95 100644
--- a/include/linux/bpf-cgroup.h
+++ b/include/linux/bpf-cgroup.h
@@ -40,6 +40,9 @@ int __cgroup_bpf_run_filter_skb(struct sock *sk,
 				struct sk_buff *skb,
 				enum bpf_attach_type type);
 
+int __cgroup_bpf_run_filter_sk(struct sock *sk,
+			       enum bpf_attach_type type);
+
 /* Wrappers for __cgroup_bpf_run_filter_skb() guarded by cgroup_bpf_enabled. */
 #define BPF_CGROUP_RUN_PROG_INET_INGRESS(sk, skb)			      \
 ({									      \
@@ -63,6 +66,16 @@ int __cgroup_bpf_run_filter_skb(struct sock *sk,
 	__ret;								       \
 })
 
+#define BPF_CGROUP_RUN_PROG_INET_SOCK(sk)				       \
+({									       \
+	int __ret = 0;							       \
+	if (cgroup_bpf_enabled && sk) {					       \
+		__ret = __cgroup_bpf_run_filter_sk(sk,			       \
+						 BPF_CGROUP_INET_SOCK_CREATE); \
+	}								       \
+	__ret;								       \
+})
+
 #else
 
 struct cgroup_bpf {};
@@ -72,6 +85,7 @@ static inline void cgroup_bpf_inherit(struct cgroup *cgrp,
 
 #define BPF_CGROUP_RUN_PROG_INET_INGRESS(sk,skb) ({ 0; })
 #define BPF_CGROUP_RUN_PROG_INET_EGRESS(sk,skb) ({ 0; })
+#define BPF_CGROUP_RUN_PROG_INET_SOCK(sk) ({ 0; })
 
 #endif /* CONFIG_CGROUP_BPF */
 
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 1370a9d1456f..75964e00d947 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -101,11 +101,13 @@ enum bpf_prog_type {
 	BPF_PROG_TYPE_XDP,
 	BPF_PROG_TYPE_PERF_EVENT,
 	BPF_PROG_TYPE_CGROUP_SKB,
+	BPF_PROG_TYPE_CGROUP_SOCK,
 };
 
 enum bpf_attach_type {
 	BPF_CGROUP_INET_INGRESS,
 	BPF_CGROUP_INET_EGRESS,
+	BPF_CGROUP_INET_SOCK_CREATE,
 	__MAX_BPF_ATTACH_TYPE
 };
 
@@ -537,6 +539,10 @@ struct bpf_tunnel_key {
 	__u32 tunnel_label;
 };
 
+struct bpf_sock {
+	__u32 bound_dev_if;
+};
+
 /* User return codes for XDP prog type.
  * A valid XDP program must return one of these defined values. All other
  * return codes are reserved for future use. Unknown return codes will result
diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c
index 8fe55ffd109d..a515f7b007c6 100644
--- a/kernel/bpf/cgroup.c
+++ b/kernel/bpf/cgroup.c
@@ -165,3 +165,36 @@ int __cgroup_bpf_run_filter_skb(struct sock *sk,
 	return ret;
 }
 EXPORT_SYMBOL(__cgroup_bpf_run_filter_skb);
+
+/**
+ * __cgroup_bpf_run_filter_sk() - Run a program on a sock
+ * @sk: sock structure to manipulate
+ * @type: The type of program to be exectuted
+ *
+ * socket is passed is expected to be of type INET or INET6.
+ *
+ * The program type passed in via @type must be suitable for sock
+ * filtering. No further check is performed to assert that.
+ *
+ * This function will return %-EPERM if any if an attached program was found
+ * and if it returned != 1 during execution. In all other cases, 0 is returned.
+ */
+int __cgroup_bpf_run_filter_sk(struct sock *sk,
+			       enum bpf_attach_type type)
+{
+	struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
+	struct bpf_prog *prog;
+	int ret = 0;
+
+
+	rcu_read_lock();
+
+	prog = rcu_dereference(cgrp->bpf.effective[type]);
+	if (prog)
+		ret = BPF_PROG_RUN(prog, sk) == 1 ? 0 : -EPERM;
+
+	rcu_read_unlock();
+
+	return ret;
+}
+EXPORT_SYMBOL(__cgroup_bpf_run_filter_sk);
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 5518a6839ab1..85af86c496cd 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -869,7 +869,9 @@ static int bpf_prog_attach(const union bpf_attr *attr)
 	case BPF_CGROUP_INET_EGRESS:
 		ptype = BPF_PROG_TYPE_CGROUP_SKB;
 		break;
-
+	case BPF_CGROUP_INET_SOCK_CREATE:
+		ptype = BPF_PROG_TYPE_CGROUP_SOCK;
+		break;
 	default:
 		return -EINVAL;
 	}
@@ -905,6 +907,7 @@ static int bpf_prog_detach(const union bpf_attr *attr)
 	switch (attr->attach_type) {
 	case BPF_CGROUP_INET_INGRESS:
 	case BPF_CGROUP_INET_EGRESS:
+	case BPF_CGROUP_INET_SOCK_CREATE:
 		cgrp = cgroup_get_from_fd(attr->target_fd);
 		if (IS_ERR(cgrp))
 			return PTR_ERR(cgrp);
diff --git a/net/core/filter.c b/net/core/filter.c
index 698a262b8ebb..5ee722dc097d 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -2676,6 +2676,32 @@ static bool sk_filter_is_valid_access(int off, int size,
 	return __is_valid_access(off, size, type);
 }
 
+static bool sock_filter_is_valid_access(int off, int size,
+					enum bpf_access_type type,
+					enum bpf_reg_type *reg_type)
+{
+	if (type == BPF_WRITE) {
+		switch (off) {
+		case offsetof(struct bpf_sock, bound_dev_if):
+			break;
+		default:
+			return false;
+		}
+	}
+
+	if (off < 0 || off + size > sizeof(struct bpf_sock))
+		return false;
+
+	/* The verifier guarantees that size > 0. */
+	if (off % size != 0)
+		return false;
+
+	if (size != sizeof(__u32))
+		return false;
+
+	return true;
+}
+
 static int tc_cls_act_prologue(struct bpf_insn *insn_buf, bool direct_write,
 			       const struct bpf_prog *prog)
 {
@@ -2934,6 +2960,30 @@ static u32 sk_filter_convert_ctx_access(enum bpf_access_type type, int dst_reg,
 	return insn - insn_buf;
 }
 
+static u32 sock_filter_convert_ctx_access(enum bpf_access_type type,
+					  int dst_reg, int src_reg,
+					  int ctx_off,
+					  struct bpf_insn *insn_buf,
+					  struct bpf_prog *prog)
+{
+	struct bpf_insn *insn = insn_buf;
+
+	switch (ctx_off) {
+	case offsetof(struct bpf_sock, bound_dev_if):
+		BUILD_BUG_ON(FIELD_SIZEOF(struct sock, sk_bound_dev_if) != 4);
+
+		if (type == BPF_WRITE)
+			*insn++ = BPF_STX_MEM(BPF_W, dst_reg, src_reg,
+					offsetof(struct sock, sk_bound_dev_if));
+		else
+			*insn++ = BPF_LDX_MEM(BPF_W, dst_reg, src_reg,
+				      offsetof(struct sock, sk_bound_dev_if));
+		break;
+	}
+
+	return insn - insn_buf;
+}
+
 static u32 tc_cls_act_convert_ctx_access(enum bpf_access_type type, int dst_reg,
 					 int src_reg, int ctx_off,
 					 struct bpf_insn *insn_buf,
@@ -3007,6 +3057,12 @@ static const struct bpf_verifier_ops cg_skb_ops = {
 	.convert_ctx_access	= sk_filter_convert_ctx_access,
 };
 
+static const struct bpf_verifier_ops cg_sock_ops = {
+	.get_func_proto		= sk_filter_func_proto,
+	.is_valid_access	= sock_filter_is_valid_access,
+	.convert_ctx_access	= sock_filter_convert_ctx_access,
+};
+
 static struct bpf_prog_type_list sk_filter_type __read_mostly = {
 	.ops	= &sk_filter_ops,
 	.type	= BPF_PROG_TYPE_SOCKET_FILTER,
@@ -3032,6 +3088,11 @@ static struct bpf_prog_type_list cg_skb_type __read_mostly = {
 	.type	= BPF_PROG_TYPE_CGROUP_SKB,
 };
 
+static struct bpf_prog_type_list cg_sock_type __read_mostly = {
+	.ops	= &cg_sock_ops,
+	.type	= BPF_PROG_TYPE_CGROUP_SOCK
+};
+
 static int __init register_sk_filter_ops(void)
 {
 	bpf_register_prog_type(&sk_filter_type);
@@ -3039,6 +3100,7 @@ static int __init register_sk_filter_ops(void)
 	bpf_register_prog_type(&sched_act_type);
 	bpf_register_prog_type(&xdp_type);
 	bpf_register_prog_type(&cg_skb_type);
+	bpf_register_prog_type(&cg_sock_type);
 
 	return 0;
 }
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index 5ddf5cda07f4..24d2550492ee 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -374,8 +374,18 @@ static int inet_create(struct net *net, struct socket *sock, int protocol,
 
 	if (sk->sk_prot->init) {
 		err = sk->sk_prot->init(sk);
-		if (err)
+		if (err) {
+			sk_common_release(sk);
+			goto out;
+		}
+	}
+
+	if (!kern) {
+		err = BPF_CGROUP_RUN_PROG_INET_SOCK(sk);
+		if (err) {
 			sk_common_release(sk);
+			goto out;
+		}
 	}
 out:
 	return err;
diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c
index d424f3a3737a..237e654ba717 100644
--- a/net/ipv6/af_inet6.c
+++ b/net/ipv6/af_inet6.c
@@ -258,6 +258,14 @@ static int inet6_create(struct net *net, struct socket *sock, int protocol,
 			goto out;
 		}
 	}
+
+	if (!kern) {
+		err = BPF_CGROUP_RUN_PROG_INET_SOCK(sk);
+		if (err) {
+			sk_common_release(sk);
+			goto out;
+		}
+	}
 out:
 	return err;
 out_rcu_unlock:
-- 
2.1.4

^ permalink raw reply related

* [PATCH v7 net-next 1/6] bpf: Refactor cgroups code in prep for new type
From: David Ahern @ 2016-12-01 16:48 UTC (permalink / raw)
  To: netdev; +Cc: daniel, ast, daniel, maheshb, tgraf, David Ahern
In-Reply-To: <1480610888-31082-1-git-send-email-dsa@cumulusnetworks.com>

Code move and rename only; no functional change intended.

Signed-off-by: David Ahern <dsa@cumulusnetworks.com>
---
v7, v6, v5
- no change

v4
- dropped refactor of __cgroup_bpf_run_filter and renamed it
  to __cgroup_bpf_run_filter_skb

v3
- dropped the rename

v2
- fix bpf_prog_run_clear_cb to bpf_prog_run_save_cb as caught by Daniel

- rename BPF_PROG_TYPE_CGROUP_SKB and its cg_skb functions to
  BPF_PROG_TYPE_CGROUP and cgroup

 include/linux/bpf-cgroup.h | 46 +++++++++++++++++++++++-----------------------
 kernel/bpf/cgroup.c        | 10 +++++-----
 kernel/bpf/syscall.c       | 28 +++++++++++++++-------------
 3 files changed, 43 insertions(+), 41 deletions(-)

diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h
index 0cf1adfadd2d..af2ca8b432c0 100644
--- a/include/linux/bpf-cgroup.h
+++ b/include/linux/bpf-cgroup.h
@@ -36,31 +36,31 @@ void cgroup_bpf_update(struct cgroup *cgrp,
 		       struct bpf_prog *prog,
 		       enum bpf_attach_type type);
 
-int __cgroup_bpf_run_filter(struct sock *sk,
-			    struct sk_buff *skb,
-			    enum bpf_attach_type type);
-
-/* Wrappers for __cgroup_bpf_run_filter() guarded by cgroup_bpf_enabled. */
-#define BPF_CGROUP_RUN_PROG_INET_INGRESS(sk,skb)			\
-({									\
-	int __ret = 0;							\
-	if (cgroup_bpf_enabled)						\
-		__ret = __cgroup_bpf_run_filter(sk, skb,		\
-						BPF_CGROUP_INET_INGRESS); \
-									\
-	__ret;								\
+int __cgroup_bpf_run_filter_skb(struct sock *sk,
+				struct sk_buff *skb,
+				enum bpf_attach_type type);
+
+/* Wrappers for __cgroup_bpf_run_filter_skb() guarded by cgroup_bpf_enabled. */
+#define BPF_CGROUP_RUN_PROG_INET_INGRESS(sk, skb)			      \
+({									      \
+	int __ret = 0;							      \
+	if (cgroup_bpf_enabled)						      \
+		__ret = __cgroup_bpf_run_filter_skb(sk, skb,		      \
+						    BPF_CGROUP_INET_INGRESS); \
+									      \
+	__ret;								      \
 })
 
-#define BPF_CGROUP_RUN_PROG_INET_EGRESS(sk,skb)				\
-({									\
-	int __ret = 0;							\
-	if (cgroup_bpf_enabled && sk && sk == skb->sk) {		\
-		typeof(sk) __sk = sk_to_full_sk(sk);			\
-		if (sk_fullsock(__sk))					\
-			__ret = __cgroup_bpf_run_filter(__sk, skb,	\
-						BPF_CGROUP_INET_EGRESS); \
-	}								\
-	__ret;								\
+#define BPF_CGROUP_RUN_PROG_INET_EGRESS(sk, skb)			       \
+({									       \
+	int __ret = 0;							       \
+	if (cgroup_bpf_enabled && sk && sk == skb->sk) {		       \
+		typeof(sk) __sk = sk_to_full_sk(sk);			       \
+		if (sk_fullsock(__sk))					       \
+			__ret = __cgroup_bpf_run_filter_skb(__sk, skb,	       \
+						      BPF_CGROUP_INET_EGRESS); \
+	}								       \
+	__ret;								       \
 })
 
 #else
diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c
index 8c784f8c67cd..8fe55ffd109d 100644
--- a/kernel/bpf/cgroup.c
+++ b/kernel/bpf/cgroup.c
@@ -118,7 +118,7 @@ void __cgroup_bpf_update(struct cgroup *cgrp,
 }
 
 /**
- * __cgroup_bpf_run_filter() - Run a program for packet filtering
+ * __cgroup_bpf_run_filter_skb() - Run a program for packet filtering
  * @sk: The socken sending or receiving traffic
  * @skb: The skb that is being sent or received
  * @type: The type of program to be exectuted
@@ -132,9 +132,9 @@ void __cgroup_bpf_update(struct cgroup *cgrp,
  * This function will return %-EPERM if any if an attached program was found
  * and if it returned != 1 during execution. In all other cases, 0 is returned.
  */
-int __cgroup_bpf_run_filter(struct sock *sk,
-			    struct sk_buff *skb,
-			    enum bpf_attach_type type)
+int __cgroup_bpf_run_filter_skb(struct sock *sk,
+				struct sk_buff *skb,
+				enum bpf_attach_type type)
 {
 	struct bpf_prog *prog;
 	struct cgroup *cgrp;
@@ -164,4 +164,4 @@ int __cgroup_bpf_run_filter(struct sock *sk,
 
 	return ret;
 }
-EXPORT_SYMBOL(__cgroup_bpf_run_filter);
+EXPORT_SYMBOL(__cgroup_bpf_run_filter_skb);
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 4caa18e6860a..5518a6839ab1 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -856,6 +856,7 @@ static int bpf_prog_attach(const union bpf_attr *attr)
 {
 	struct bpf_prog *prog;
 	struct cgroup *cgrp;
+	enum bpf_prog_type ptype;
 
 	if (!capable(CAP_NET_ADMIN))
 		return -EPERM;
@@ -866,25 +867,26 @@ static int bpf_prog_attach(const union bpf_attr *attr)
 	switch (attr->attach_type) {
 	case BPF_CGROUP_INET_INGRESS:
 	case BPF_CGROUP_INET_EGRESS:
-		prog = bpf_prog_get_type(attr->attach_bpf_fd,
-					 BPF_PROG_TYPE_CGROUP_SKB);
-		if (IS_ERR(prog))
-			return PTR_ERR(prog);
-
-		cgrp = cgroup_get_from_fd(attr->target_fd);
-		if (IS_ERR(cgrp)) {
-			bpf_prog_put(prog);
-			return PTR_ERR(cgrp);
-		}
-
-		cgroup_bpf_update(cgrp, prog, attr->attach_type);
-		cgroup_put(cgrp);
+		ptype = BPF_PROG_TYPE_CGROUP_SKB;
 		break;
 
 	default:
 		return -EINVAL;
 	}
 
+	prog = bpf_prog_get_type(attr->attach_bpf_fd, ptype);
+	if (IS_ERR(prog))
+		return PTR_ERR(prog);
+
+	cgrp = cgroup_get_from_fd(attr->target_fd);
+	if (IS_ERR(cgrp)) {
+		bpf_prog_put(prog);
+		return PTR_ERR(cgrp);
+	}
+
+	cgroup_bpf_update(cgrp, prog, attr->attach_type);
+	cgroup_put(cgrp);
+
 	return 0;
 }
 
-- 
2.1.4

^ permalink raw reply related

* [PATCH v7 net-next 0/6] net: Add bpf support for sockets
From: David Ahern @ 2016-12-01 16:48 UTC (permalink / raw)
  To: netdev; +Cc: daniel, ast, daniel, maheshb, tgraf, David Ahern

The recently added VRF support in Linux leverages the bind-to-device
API for programs to specify an L3 domain for a socket. While
SO_BINDTODEVICE has been around for ages, not every ipv4/ipv6 capable
program has support for it. Even for those programs that do support it,
the API requires processes to be started as root (CAP_NET_RAW) which
is not desirable from a general security perspective.

This patch set leverages Daniel Mack's work to attach bpf programs to
a cgroup to provide a capability to set sk_bound_dev_if for all
AF_INET{6} sockets opened by a process in a cgroup when the sockets
are allocated.

For example:
 1. configure vrf (e.g., using ifupdown2)
        auto eth0
        iface eth0 inet dhcp
            vrf mgmt

        auto mgmt
        iface mgmt
            vrf-table auto

 2. configure cgroup
        mount -t cgroup2 none /tmp/cgroupv2
        mkdir /tmp/cgroupv2/mgmt
        test_cgrp2_sock /tmp/cgroupv2/mgmt 15

 3. set shell into cgroup (e.g., can be done at login using pam)
        echo $$ >> /tmp/cgroupv2/mgmt/cgroup.procs

At this point all commands run in the shell (e.g, apt) have sockets
automatically bound to the VRF (see output of ss -ap 'dev == <vrf>'),
including processes not running as root.

This capability enables running any program in a VRF context and is key
to deploying Management VRF, a fundamental configuration for networking
gear, with any Linux OS installation.

This patchset also exports the socket family, type and protocol as
read-only allowing bpf filters to deny a process in a cgroup the ability
to open specific types of AF_INET or AF_INET6 sockets.

v7
- comments from Alexei

v6
- add export of socket family, type and protocol


David Ahern (6):
  bpf: Refactor cgroups code in prep for new type
  bpf: Add new cgroup attach type to enable sock modifications
  samples: bpf: add userspace example for modifying sk_bound_dev_if
  bpf: Add support for reading socket family, type, protocol
  samples/bpf: Update bpf loader for cgroup section names
  samples/bpf: add userspace example for prohibiting sockets

 include/linux/bpf-cgroup.h      | 60 +++++++++++++++++------------
 include/net/sock.h              | 15 ++++++++
 include/uapi/linux/bpf.h        |  9 +++++
 kernel/bpf/cgroup.c             | 43 ++++++++++++++++++---
 kernel/bpf/syscall.c            | 33 +++++++++-------
 net/core/filter.c               | 83 +++++++++++++++++++++++++++++++++++++++++
 net/ipv4/af_inet.c              | 12 +++++-
 net/ipv6/af_inet6.c             |  8 ++++
 samples/bpf/Makefile            |  6 +++
 samples/bpf/bpf_load.c          | 14 +++++--
 samples/bpf/bpf_load.h          |  1 +
 samples/bpf/sock_flags_kern.c   | 44 ++++++++++++++++++++++
 samples/bpf/test_cgrp2_sock.c   | 83 +++++++++++++++++++++++++++++++++++++++++
 samples/bpf/test_cgrp2_sock.sh  | 47 +++++++++++++++++++++++
 samples/bpf/test_cgrp2_sock2.c  | 66 ++++++++++++++++++++++++++++++++
 samples/bpf/test_cgrp2_sock2.sh | 81 ++++++++++++++++++++++++++++++++++++++++
 16 files changed, 559 insertions(+), 46 deletions(-)
 create mode 100644 samples/bpf/sock_flags_kern.c
 create mode 100644 samples/bpf/test_cgrp2_sock.c
 create mode 100755 samples/bpf/test_cgrp2_sock.sh
 create mode 100644 samples/bpf/test_cgrp2_sock2.c
 create mode 100755 samples/bpf/test_cgrp2_sock2.sh

-- 
2.1.4

^ permalink raw reply

* Re: pull request (net-next): ipsec-next 2016-12-01
From: David Miller @ 2016-12-01 16:45 UTC (permalink / raw)
  To: steffen.klassert; +Cc: herbert, netdev
In-Reply-To: <1480592885-3903-1-git-send-email-steffen.klassert@secunet.com>

From: Steffen Klassert <steffen.klassert@secunet.com>
Date: Thu, 1 Dec 2016 12:48:04 +0100

>   git://git.kernel.org/pub/scm/linux/kernel/git/klassert/ipsec-next.git master
> 
> for you to fetch changes up to 2258d927a691ddd2ab585adb17ea9f96e89d0638:
> 
>   xfrm: remove unused helper (2016-09-30 08:20:56 +0200)

Hmmm, when I try to pull I don't get anything:

[davem@dhcp-10-15-49-210 net-next]$ git pull --no-ff git://git.kernel.org/pub/scm/linux/kernel/git/klassert/ipsec-next.git master
>From git://git.kernel.org/pub/scm/linux/kernel/git/klassert/ipsec-next
 * branch            master     -> FETCH_HEAD
Already up-to-date.

^ permalink raw reply

* Re: [PATCH net] RDS: TCP: unregister_netdevice_notifier() in error path of rds_tcp_init_net
From: Santosh Shilimkar @ 2016-12-01 16:40 UTC (permalink / raw)
  To: Sowmini Varadhan, netdev; +Cc: davem
In-Reply-To: <1480596283-204869-1-git-send-email-sowmini.varadhan@oracle.com>

On 12/1/2016 4:44 AM, Sowmini Varadhan wrote:
> If some error is encountered in rds_tcp_init_net, make sure to
> unregister_netdevice_notifier(), else we could trigger a panic
> later on, when the modprobe from a netns fails.
>
> Signed-off-by: Sowmini Varadhan <sowmini.varadhan@oracle.com>
> ---
Acked-by: Santosh Shilimkar <santosh.shilimkar@oracle.com>

^ permalink raw reply

* Re: pull request (net): ipsec 2016-12-01
From: David Miller @ 2016-12-01 16:36 UTC (permalink / raw)
  To: steffen.klassert; +Cc: herbert, netdev
In-Reply-To: <1480592692-3653-1-git-send-email-steffen.klassert@secunet.com>

From: Steffen Klassert <steffen.klassert@secunet.com>
Date: Thu, 1 Dec 2016 12:44:49 +0100

> 1) Change the error value when someone tries to run 32bit
>    userspace on a 64bit host from -ENOTSUPP to the userspace
>    exported -EOPNOTSUPP. Fix from Yi Zhao.
> 
> 2) On inbound, ESN sequence numbers are already in network
>    byte order. So don't try to convert it again, this fixes
>    integrity verification for ESN. Fixes from Tobias Brunner.
> 
> Please pull or let me know if there are problems.

Pulled, thanks Steffen.

^ permalink raw reply

* Re: Regression: [PATCH] mlx4: give precise rx/tx bytes/packets counters
From: Saeed Mahameed @ 2016-12-01 16:33 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: Jesper Dangaard Brouer, David Miller, netdev, Tariq Toukan
In-Reply-To: <1480607729.18162.311.camel@edumazet-glaptop3.roam.corp.google.com>

On Thu, Dec 1, 2016 at 5:55 PM, Eric Dumazet <eric.dumazet@gmail.com> wrote:
> On Thu, 2016-12-01 at 17:38 +0200, Saeed Mahameed wrote:
>
>>
>> Hi Eric, Thanks for the patch, I already acked it.
>
> Thanks !
>
>>
>> I have one educational question (not related to this patch, but
>> related to stats reading in general).
>> I was wondering why do we need to disable bh every time we read stats
>> "spin_lock_bh" ? is it essential ?
>>
>> I checked and in mlx4 we don't hold stats_lock in softirq
>> (en_rx.c/en_tx.c), so I don't see any deadlock risk in here..
>
> Excellent question, and I chose to keep the spinlock.
>
> That would be doable, only if we do not overwrite dev->stats.
>
> Current code is :
>
> static struct rtnl_link_stats64 *
> mlx4_en_get_stats64(struct net_device *dev, struct rtnl_link_stats64 *stats)
> {
>         struct mlx4_en_priv *priv = netdev_priv(dev);
>
>         spin_lock_bh(&priv->stats_lock);
>         mlx4_en_fold_software_stats(dev);
>         netdev_stats_to_stats64(stats, &dev->stats);
>         spin_unlock_bh(&priv->stats_lock);
>
>         return stats;
> }
>
> If you remove the spin_lock_bh() :
>
>
> static struct rtnl_link_stats64 *
> mlx4_en_get_stats64(struct net_device *dev, struct rtnl_link_stats64 *stats)
> {
>         struct mlx4_en_priv *priv = netdev_priv(dev);
>
>         mlx4_en_fold_software_stats(dev); // possible races
>
>         netdev_stats_to_stats64(stats, &dev->stats);
>
>         return stats;
> }
>
> 1) one mlx4_en_fold_software_stats(dev) could be preempted
> on a CONFIG_PREEMPT kernel, or interrupted by long irqs.
>
> 2) Another cpu would also call mlx4_en_fold_software_stats(dev) while
>    first cpu is busy.
>
> 3) Then when resuming first cpu/thread, part of the dev->stats fieds
> would be updated with 'old counters',
> while another thread might have updated them with newer values.
>
> 4) A SNMP reader could then get counters that are not monotonically
> increasing,
> which would be confusing/buggy.
>
> So removing the spinlock is doable, but needs to add a new parameter
> to mlx4_en_fold_software_stats() and call netdev_stats_to_stats64()
> before mlx4_en_fold_software_stats(dev)
>
> static struct rtnl_link_stats64 *
> mlx4_en_get_stats64(struct net_device *dev, struct rtnl_link_stats64 *stats)
> {
>         struct mlx4_en_priv *priv = netdev_priv(dev);
>
>         netdev_stats_to_stats64(stats, &dev->stats);
>
>         // Passing a non NULL stats asks mlx4_en_fold_software_stats()
>         // to not update dev->stats, but stats directly.
>
>         mlx4_en_fold_software_stats(dev, stats)
>
>
>         return stats;
> }
>
>

Thanks for the detailed answer !!

BTW you went 5 steps ahead of my original question :)), so far you
already have a patch without locking at all (really impressive).

What i wanted to ask originally, was regarding the "_bh", i didn't
mean to completely remove the "spin_lock_bh",
I meant, what happens if we replace "spin_lock_bh"  with "spin_lock",
without disabling bh ?
I gues raw "sping_lock" handles points (2 to 4) from above, but it
won't handle long irqs.

^ permalink raw reply

* Re: [Patch net-next] audit: remove useless synchronize_net()
From: David Miller @ 2016-12-01 16:29 UTC (permalink / raw)
  To: xiyou.wangcong; +Cc: netdev, rgb
In-Reply-To: <1480439696-21818-1-git-send-email-xiyou.wangcong@gmail.com>

From: Cong Wang <xiyou.wangcong@gmail.com>
Date: Tue, 29 Nov 2016 09:14:56 -0800

> netlink kernel socket is protected by refcount, not RCU.
> Its rcv path is neither protected by RCU. So the synchronize_net()
> is just pointless.
> 
> Cc: Richard Guy Briggs <rgb@redhat.com>
> Signed-off-by: Cong Wang <xiyou.wangcong@gmail.com>

Applied, thanks.

^ permalink raw reply

* Re: [PATCH net-next v4 3/4] bpf: BPF for lightweight tunnel infrastructure
From: Thomas Graf @ 2016-12-01 16:28 UTC (permalink / raw)
  To: Daniel Borkmann; +Cc: davem, netdev, alexei.starovoitov, tom, roopa, hannes
In-Reply-To: <584012CC.4030004@iogearbox.net>

On 12/01/16 at 01:08pm, Daniel Borkmann wrote:
> For the verifier change in may_access_direct_pkt_data(), would be
> great if you could later on follow up with a selftest-suite case,
> one where BPF_PROG_TYPE_LWT_IN/OUT prog tries to write and fails,
> and one where BPF_PROG_TYPE_LWT_IN/OUT prog uses pkt data to pass
> to helpers, for example, so that we can keep testing it when future
> changes in that area are made. Thanks.

Good idea, will do.

^ permalink raw reply

* Re: [flamebait] xdp, well meaning but pointless
From: Thomas Graf @ 2016-12-01 16:28 UTC (permalink / raw)
  To: Hannes Frederic Sowa; +Cc: Florian Westphal, netdev
In-Reply-To: <7e2be2fc-7c04-b333-59c7-43d4fcfcb451@stressinduktion.org>

On 12/01/16 at 04:52pm, Hannes Frederic Sowa wrote:
> First of all, this is a rant targeted at XDP and not at eBPF as a whole.
> XDP manipulates packets at free will and thus all security guarantees
> are off as well as in any user space solution.
> 
> Secondly user space provides policy, acl, more controlled memory
> protection, restartability and better debugability. If I had multi
> tenant workloads I would definitely put more complex "business/acl"
> logic into user space, so I can make use of LSM and other features to
> especially prevent a network facing service to attack the tenants. If
> stuff gets put into the kernel you run user controlled code in the
> kernel exposing a much bigger attack vector.
> 
> What use case do you see in XDP specifically e.g. for container networking?

DDOS mitigation to protect distributed applications in large clusters.
Relying on CDN works to protect API gateways and frontends (as long as
they don't throw you out of their network) but offers no protection
beyond that, e.g. a noisy/hostile neighbour. Doing this at the server
level and allowing the mitigation capability to scale up with the number
of servers is natural and cheap.

> > I agree with you if the LB is a software based appliance in either a
> > dedicated VM or on dedicated baremetal.
> > 
> > The reality is turning out to be different in many cases though, LB
> > needs to be performed not only for north south but east west as well.
> > So even if I would handle LB for traffic entering my datacenter in user
> > space, I will need the same LB for packets from my applications and
> > I definitely don't want to move all of that into user space.
> 
> The open question to me is why is programmability needed here.
> 
> Look at the discussion about ECMP and consistent hashing. It is not very
> easy to actually write this code correctly. Why can't we just put C code
> into the kernel that implements this once and for all and let user space
> update the policies?

Whatever LB logic is put in place with native C code now is unlikely the
logic we need in two years. We can't really predict the future. If it
was the case, networking would have been done long ago and we would all
be working on self eating ice cream now.

> Load balancers have to deal correctly with ICMP packets, e.g. they even
> have to be duplicated to every ECMP route. This seems to be problematic
> to do in eBPF programs due to looping constructs so you end up with
> complicated user space anyway.

Feel free to implement such complex LBs in user space or natively. It is
not required for the majority of use cases. The most popular LBs for
application load balancing have no idea of ECMP and require ECMP aware
routers to be made redundant itself.

^ permalink raw reply

* Re: [PATCH v2 net-next 0/4] Adding PHY MDI(X) support
From: David Miller @ 2016-12-01 16:27 UTC (permalink / raw)
  To: Raju.Lakkaraju; +Cc: netdev, f.fainelli, Allan.Nielsen, andrew
In-Reply-To: <1480412809-6122-1-git-send-email-Raju.Lakkaraju@microsemi.com>

From: Raju Lakkaraju <Raju.Lakkaraju@microsemi.com>
Date: Tue, 29 Nov 2016 15:16:45 +0530

> I updated all review comments which were given by Andrew and Florian.
> 
> This series add support for PHY MDI(X), and implement it for MSCC phys.
> 
> Tested on Beaglebone Black with VSC 8531 PHY.

Series applied, thanks.

^ permalink raw reply

* Re: [flamebait] xdp, well meaning but pointless
From: David Miller @ 2016-12-01 16:19 UTC (permalink / raw)
  To: tgraf; +Cc: fw, netdev
In-Reply-To: <20161201145834.GA569@pox.localdomain>

From: Thomas Graf <tgraf@suug.ch>
Date: Thu, 1 Dec 2016 15:58:34 +0100

> The benefits of XDP for this use case are extremely obvious in combination
> with local applications which need to be protected. ntuple filters won't
> cut it. They are limited and subject to a certain rate at which they
> can be configured. Any serious mitigation will require stateful filtering
> with at least minimal L7 matching abilities and this is exactly where XDP
> will excel.

+1

Saying that ntuple filters can handle the early drop use case doesn't
take into consideration the nature of the tables (hundreds of
thousands of "evil" IP addresses), whether hardware can actually
handle that (it can't), and whether simple IP address matching is the
full extent of it (it isn't).

Most of the time when I hear anti-XDP rhetoric, it's usually comes
from a crowd who for some reason feels threatened by the technology
and what it might replace and make useless.

That to me says that we are _exactly_ going down the right path.

^ permalink raw reply

* Re: [PATCH 00/11] Netfilter fixes for net
From: David Miller @ 2016-12-01 16:13 UTC (permalink / raw)
  To: pablo; +Cc: netfilter-devel, netdev
In-Reply-To: <1480543045-3389-1-git-send-email-pablo@netfilter.org>

From: Pablo Neira Ayuso <pablo@netfilter.org>
Date: Wed, 30 Nov 2016 22:57:14 +0100

> This is a large batch of Netfilter fixes for net, they are:
 ...
> I know is late but I think these are important, specifically the NAT
> bits, as they are mostly addressing fallout from recent changes. I also
> read there are chances to have -rc8, if that is the case, that would
> also give us a bit more time to test this.
> 
> You can pull these changes from:
> 
>   git://git.kernel.org/pub/scm/linux/kernel/git/pablo/nf.git

Pulled, thanks Pablo.

^ permalink raw reply

* Re: [PATCH net v2] tipc: check minimum bearer MTU
From: Ben Hutchings @ 2016-12-01 16:11 UTC (permalink / raw)
  To: Michal Kubecek, Jon Maloy, Ying Xue
  Cc: David S. Miller, tipc-discussion, netdev, linux-kernel,
	Qian Zhang
In-Reply-To: <20161201110205.10749A0F33@unicorn.suse.cz>

[-- Attachment #1: Type: text/plain, Size: 716 bytes --]

On Thu, 2016-12-01 at 12:02 +0100, Michal Kubecek wrote:
[...] 
> +/* check if device MTU is sufficient for tipc headers */
> +static inline bool tipc_check_mtu(struct net_device *dev, unsigned int reserve)
> +{
> +	if (dev->mtu >= TIPC_MIN_BEARER_MTU + reserve)
> +		return false;
> +	netdev_warn(dev, "MTU too low for tipc bearer\n");
> +	return true;
> +}
[...]

The comment says "check if ... sufficient" but the return value
indicates the opposite.  Could you make these consistent?

Other than that, this looks OK to me.  I haven't tested any version as
I don't know how to use TIPC.

Ben.

-- 
Ben Hutchings
A free society is one where it is safe to be unpopular. - Adlai
Stevenson


[-- Attachment #2: This is a digitally signed message part --]
[-- Type: application/pgp-signature, Size: 833 bytes --]

^ permalink raw reply

* Re: Regression: [PATCH] mlx4: give precise rx/tx bytes/packets counters
From: Eric Dumazet @ 2016-12-01 16:08 UTC (permalink / raw)
  To: Saeed Mahameed; +Cc: Jesper Dangaard Brouer, David Miller, netdev, Tariq Toukan
In-Reply-To: <1480607729.18162.311.camel@edumazet-glaptop3.roam.corp.google.com>

On Thu, 2016-12-01 at 07:55 -0800, Eric Dumazet wrote:

> So removing the spinlock is doable, but needs to add a new parameter
> to mlx4_en_fold_software_stats() and call netdev_stats_to_stats64()
> before mlx4_en_fold_software_stats(dev)

Untested patch would be :

 drivers/net/ethernet/mellanox/mlx4/en_ethtool.c |    2 -
 drivers/net/ethernet/mellanox/mlx4/en_netdev.c  |   10 +----
 drivers/net/ethernet/mellanox/mlx4/en_port.c    |   24 +++++++++-----
 drivers/net/ethernet/mellanox/mlx4/mlx4_en.h    |    3 +
 4 files changed, 23 insertions(+), 16 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx4/en_ethtool.c b/drivers/net/ethernet/mellanox/mlx4/en_ethtool.c
index d9c9f86a30df953fa555934c5406057dcaf28960..676050e352703cebe7fcaa5202a06496f7a5a0df 100644
--- a/drivers/net/ethernet/mellanox/mlx4/en_ethtool.c
+++ b/drivers/net/ethernet/mellanox/mlx4/en_ethtool.c
@@ -367,7 +367,7 @@ static void mlx4_en_get_ethtool_stats(struct net_device *dev,
 
 	spin_lock_bh(&priv->stats_lock);
 
-	mlx4_en_fold_software_stats(dev);
+	mlx4_en_fold_software_stats(dev, NULL);
 
 	for (i = 0; i < NUM_MAIN_STATS; i++, bitmap_iterator_inc(&it))
 		if (bitmap_iterator_test(&it))
diff --git a/drivers/net/ethernet/mellanox/mlx4/en_netdev.c b/drivers/net/ethernet/mellanox/mlx4/en_netdev.c
index 091b904262bc7932d3edf99cf850affb23b9ce6e..6ee9e31e59c392cb88faedf9c541b3bc6d195228 100644
--- a/drivers/net/ethernet/mellanox/mlx4/en_netdev.c
+++ b/drivers/net/ethernet/mellanox/mlx4/en_netdev.c
@@ -1321,13 +1321,9 @@ static void mlx4_en_tx_timeout(struct net_device *dev)
 static struct rtnl_link_stats64 *
 mlx4_en_get_stats64(struct net_device *dev, struct rtnl_link_stats64 *stats)
 {
-	struct mlx4_en_priv *priv = netdev_priv(dev);
-
-	spin_lock_bh(&priv->stats_lock);
-	mlx4_en_fold_software_stats(dev);
 	netdev_stats_to_stats64(stats, &dev->stats);
-	spin_unlock_bh(&priv->stats_lock);
-
+	/* Must be called after netdev_stats_to_stats64() */
+	mlx4_en_fold_software_stats(dev, stats);
 	return stats;
 }
 
@@ -1810,7 +1806,7 @@ void mlx4_en_stop_port(struct net_device *dev, int detach)
 	netif_tx_disable(dev);
 
 	spin_lock_bh(&priv->stats_lock);
-	mlx4_en_fold_software_stats(dev);
+	mlx4_en_fold_software_stats(dev, NULL);
 	/* Set port as not active */
 	priv->port_up = false;
 	spin_unlock_bh(&priv->stats_lock);
diff --git a/drivers/net/ethernet/mellanox/mlx4/en_port.c b/drivers/net/ethernet/mellanox/mlx4/en_port.c
index 9166d90e732858610b1407fe85cbf6cbe27f5e0b..eea042a18e3cfba62745ece4ca673c2db967b9aa 100644
--- a/drivers/net/ethernet/mellanox/mlx4/en_port.c
+++ b/drivers/net/ethernet/mellanox/mlx4/en_port.c
@@ -147,7 +147,8 @@ static unsigned long en_stats_adder(__be64 *start, __be64 *next, int num)
 	return ret;
 }
 
-void mlx4_en_fold_software_stats(struct net_device *dev)
+void mlx4_en_fold_software_stats(struct net_device *dev,
+				 struct rtnl_link_stats64 *stats)
 {
 	struct mlx4_en_priv *priv = netdev_priv(dev);
 	struct mlx4_en_dev *mdev = priv->mdev;
@@ -165,9 +166,13 @@ void mlx4_en_fold_software_stats(struct net_device *dev)
 		packets += READ_ONCE(ring->packets);
 		bytes   += READ_ONCE(ring->bytes);
 	}
-	dev->stats.rx_packets = packets;
-	dev->stats.rx_bytes = bytes;
-
+	if (stats) {
+		stats->rx_packets = packets;
+		stats->rx_bytes = bytes;
+	} else {
+		dev->stats.rx_packets = packets;
+		dev->stats.rx_bytes = bytes;
+	}
 	packets = 0;
 	bytes = 0;
 	for (i = 0; i < priv->tx_ring_num[TX]; i++) {
@@ -176,8 +181,13 @@ void mlx4_en_fold_software_stats(struct net_device *dev)
 		packets += READ_ONCE(ring->packets);
 		bytes   += READ_ONCE(ring->bytes);
 	}
-	dev->stats.tx_packets = packets;
-	dev->stats.tx_bytes = bytes;
+	if (stats) {
+		stats->tx_packets = packets;
+		stats->tx_bytes = bytes;
+	} else {
+		dev->stats.tx_packets = packets;
+		dev->stats.tx_bytes = bytes;
+	}
 }
 
 int mlx4_en_DUMP_ETH_STATS(struct mlx4_en_dev *mdev, u8 port, u8 reset)
@@ -208,7 +218,7 @@ int mlx4_en_DUMP_ETH_STATS(struct mlx4_en_dev *mdev, u8 port, u8 reset)
 
 	spin_lock_bh(&priv->stats_lock);
 
-	mlx4_en_fold_software_stats(dev);
+	mlx4_en_fold_software_stats(dev, NULL);
 
 	priv->port_stats.rx_chksum_good = 0;
 	priv->port_stats.rx_chksum_none = 0;
diff --git a/drivers/net/ethernet/mellanox/mlx4/mlx4_en.h b/drivers/net/ethernet/mellanox/mlx4/mlx4_en.h
index 20a936428f4a44c8ca0a7161855da310f9166b50..92dbb41f425b282e9ab7c8d534f091da0ba661c3 100644
--- a/drivers/net/ethernet/mellanox/mlx4/mlx4_en.h
+++ b/drivers/net/ethernet/mellanox/mlx4/mlx4_en.h
@@ -755,7 +755,8 @@ void mlx4_en_rx_irq(struct mlx4_cq *mcq);
 int mlx4_SET_MCAST_FLTR(struct mlx4_dev *dev, u8 port, u64 mac, u64 clear, u8 mode);
 int mlx4_SET_VLAN_FLTR(struct mlx4_dev *dev, struct mlx4_en_priv *priv);
 
-void mlx4_en_fold_software_stats(struct net_device *dev);
+void mlx4_en_fold_software_stats(struct net_device *dev,
+				 struct rtnl_link_stats64 *stats);
 int mlx4_en_DUMP_ETH_STATS(struct mlx4_en_dev *mdev, u8 port, u8 reset);
 int mlx4_en_QUERY_PORT(struct mlx4_en_dev *mdev, u8 port);
 

^ permalink raw reply related

* Re: [PATCH v5 net-next 0/7] Support Armada 37xx SoC (ARMv8 64-bits) in mvneta driver
From: Gregory CLEMENT @ 2016-12-01 16:09 UTC (permalink / raw)
  To: Marcin Wojtas
  Cc: David S. Miller, linux-kernel, netdev, Jisheng Zhang,
	Arnd Bergmann, Jason Cooper, Andrew Lunn, Sebastian Hesselbarth,
	Thomas Petazzoni, linux-arm-kernel@lists.infradead.org,
	Nadav Haklai, Dmitri Epshtein, Yelena Krivosheev
In-Reply-To: <CAPv3WKdWyS0wCVsRKR86qpMx4r8NN9=RmjM9oHFy2mmvaR5PAA@mail.gmail.com>

Hi Marcin,
 
 On jeu., déc. 01 2016, Marcin Wojtas <mw@semihalf.com> wrote:

> Hi Gregory,
>
> Checked on a388-gp with and without HWBM, also both ports work on
> a3700 (second one after changing to sgmii).
>
> Tested-by: Marcin Wojtas <mw@semihalf.com>

Thanks, I am going to send a new version with tour tested-by and the dts
fix for the second port.

Gregory

>
> Best regards,
> Marcin
>
> 2016-11-30 22:42 GMT+01:00 Gregory CLEMENT <gregory.clement@free-electrons.com>:
>> Hi,
>>
>> The Armada 37xx is a new ARMv8 SoC from Marvell using same network
>> controller as the older Armada 370/38x/XP SoCs. This series adapts the
>> driver in order to be able to use it on this new SoC. The main changes
>> are:
>>
>> - 64-bits support: the first patches allow using the driver on a 64-bit
>>   architecture.
>>
>> - MBUS support: the mbus configuration is different on Armada 37xx
>>   from the older SoCs.
>>
>> - per cpu interrupt: Armada 37xx do not support per cpu interrupt for
>>   the NETA IP, the non-per-CPU behavior was added back.
>>
>> The first patch is an optimization in the rx path in swbm mode.
>> The second patch remove unnecessary allocation for HWBM.
>> The first item is solved by patches 4 and 5.
>> The 2 last items are solved by patch 6.
>> In patch 7 the dt support is added.
>>
>> Beside Armada 37xx, this series have been again tested on Armada XP
>> and Armada 38x (with Hardware Buffer Management and with Software
>> Buffer Management).
>>
>> This is the 5th version of the series:
>> - 1st version:
>> http://lists.infradead.org/pipermail/linux-arm-kernel/2016-November/469588.html
>>
>> - 2nd version:
>> http://lists.infradead.org/pipermail/linux-arm-kernel/2016-November/470476.html
>>
>> - 3rd version:
>> http://lists.infradead.org/pipermail/linux-arm-kernel/2016-November/470901.html
>>
>> - 4th version:
>> http://lists.infradead.org/pipermail/linux-arm-kernel/2016-November/471039.html
>>
>> Changelog:
>> v4 -> v5:
>>  - remove unnecessary cast in patch 3
>>
>> v3 -> v4:
>>  - Adding new patch: "net: mvneta: do not allocate buffer in rxq init
>>    with HWBM"
>>
>>  - Simplify the HWBM case in patch 3 as suggested by Marcin
>>
>> v2 -> v3:
>>  - Adding patch 1 "Optimize rx path for small frame"
>>
>>  - Fix the kbuild error by moving the "phys_addr += pp->rx_offset_correction;"
>>   line from patch 2 to patch 3 where rx_offset_correction is introduced.
>>
>>  - Move the memory allocation of the buf_virt_addr of the rxq to be
>>    called by the probe function in order to avoid a memory leak.
>>
>> Thanks,
>>
>> Gregory
>>
>> Gregory CLEMENT (5):
>>   net: mvneta: Optimize rx path for small frame
>>   net: mvneta: Do not allocate buffer in rxq init with HWBM
>>   net: mvneta: Use cacheable memory to store the rx buffer virtual address
>>   net: mvneta: Only disable mvneta_bm for 64-bits
>>   ARM64: dts: marvell: Add network support for Armada 3700
>>
>> Marcin Wojtas (2):
>>   net: mvneta: Convert to be 64 bits compatible
>>   net: mvneta: Add network support for Armada 3700 SoC
>>
>>  Documentation/devicetree/bindings/net/marvell-armada-370-neta.txt |   7 +-
>>  arch/arm64/boot/dts/marvell/armada-3720-db.dts                    |  23 +++++-
>>  arch/arm64/boot/dts/marvell/armada-37xx.dtsi                      |  23 +++++-
>>  drivers/net/ethernet/marvell/Kconfig                              |  10 +-
>>  drivers/net/ethernet/marvell/mvneta.c                             | 344 +++++++++++++++++++++++++++++++++++++++++++++++++++---------------------
>>  5 files changed, 305 insertions(+), 102 deletions(-)
>>
>> base-commit: 436accebb53021ef7c63535f60bda410aa87c136
>> --
>> git-series 0.8.10

-- 
Gregory Clement, Free Electrons
Kernel, drivers, real-time and embedded Linux
development, consulting, training and support.
http://free-electrons.com

^ permalink raw reply

* Re: [flamebait] xdp, well meaning but pointless
From: Florian Westphal @ 2016-12-01 16:06 UTC (permalink / raw)
  To: Thomas Graf; +Cc: Florian Westphal, netdev
In-Reply-To: <20161201145834.GA569@pox.localdomain>

Thomas Graf <tgraf@suug.ch> wrote:
> On 12/01/16 at 10:11am, Florian Westphal wrote:
> > Aside from this, XDP, like DPDK, is a kernel bypass.
> > You might say 'Its just stack bypass, not a kernel bypass!'.
> > But what does that mean exactly?  That packets can still be passed
> > onward to normal stack?
> > Bypass solutions like netmap can also inject packets back to
> > kernel stack again.
> 
> I have a fundamental issue with the approach of exporting packets into
> user space and reinjecting them: Once the packet leaves the kernel,
> any security guarantees are off. I have no control over what is
> running in user space and whether whatever listener up there has been
> compromised or not. To me, that's a no go, in particular for servers
> hosting multi tenant workloads. This is one of the main reasons why
> XDP, in particular in combination with BPF, is very interesting to me.

Funny, I see it exactly the other way around :)

To me packet coming from this "userspace injection" is no different than
a tun/tap, or any other packet coming from network.

I see no change or increase in attack surface.

^ permalink raw reply

* Re: [PATCH v5 net-next 0/7] Support Armada 37xx SoC (ARMv8 64-bits) in mvneta driver
From: Marcin Wojtas @ 2016-12-01 16:07 UTC (permalink / raw)
  To: Gregory CLEMENT
  Cc: David S. Miller, linux-kernel, netdev, Jisheng Zhang,
	Arnd Bergmann, Jason Cooper, Andrew Lunn, Sebastian Hesselbarth,
	Thomas Petazzoni, linux-arm-kernel@lists.infradead.org,
	Nadav Haklai, Dmitri Epshtein, Yelena Krivosheev
In-Reply-To: <cover.0270f6d2413a709521fe2c8c17fbebea6f2e78d1.1480542157.git-series.gregory.clement@free-electrons.com>

Hi Gregory,

Checked on a388-gp with and without HWBM, also both ports work on
a3700 (second one after changing to sgmii).

Tested-by: Marcin Wojtas <mw@semihalf.com>

Best regards,
Marcin

2016-11-30 22:42 GMT+01:00 Gregory CLEMENT <gregory.clement@free-electrons.com>:
> Hi,
>
> The Armada 37xx is a new ARMv8 SoC from Marvell using same network
> controller as the older Armada 370/38x/XP SoCs. This series adapts the
> driver in order to be able to use it on this new SoC. The main changes
> are:
>
> - 64-bits support: the first patches allow using the driver on a 64-bit
>   architecture.
>
> - MBUS support: the mbus configuration is different on Armada 37xx
>   from the older SoCs.
>
> - per cpu interrupt: Armada 37xx do not support per cpu interrupt for
>   the NETA IP, the non-per-CPU behavior was added back.
>
> The first patch is an optimization in the rx path in swbm mode.
> The second patch remove unnecessary allocation for HWBM.
> The first item is solved by patches 4 and 5.
> The 2 last items are solved by patch 6.
> In patch 7 the dt support is added.
>
> Beside Armada 37xx, this series have been again tested on Armada XP
> and Armada 38x (with Hardware Buffer Management and with Software
> Buffer Management).
>
> This is the 5th version of the series:
> - 1st version:
> http://lists.infradead.org/pipermail/linux-arm-kernel/2016-November/469588.html
>
> - 2nd version:
> http://lists.infradead.org/pipermail/linux-arm-kernel/2016-November/470476.html
>
> - 3rd version:
> http://lists.infradead.org/pipermail/linux-arm-kernel/2016-November/470901.html
>
> - 4th version:
> http://lists.infradead.org/pipermail/linux-arm-kernel/2016-November/471039.html
>
> Changelog:
> v4 -> v5:
>  - remove unnecessary cast in patch 3
>
> v3 -> v4:
>  - Adding new patch: "net: mvneta: do not allocate buffer in rxq init
>    with HWBM"
>
>  - Simplify the HWBM case in patch 3 as suggested by Marcin
>
> v2 -> v3:
>  - Adding patch 1 "Optimize rx path for small frame"
>
>  - Fix the kbuild error by moving the "phys_addr += pp->rx_offset_correction;"
>   line from patch 2 to patch 3 where rx_offset_correction is introduced.
>
>  - Move the memory allocation of the buf_virt_addr of the rxq to be
>    called by the probe function in order to avoid a memory leak.
>
> Thanks,
>
> Gregory
>
> Gregory CLEMENT (5):
>   net: mvneta: Optimize rx path for small frame
>   net: mvneta: Do not allocate buffer in rxq init with HWBM
>   net: mvneta: Use cacheable memory to store the rx buffer virtual address
>   net: mvneta: Only disable mvneta_bm for 64-bits
>   ARM64: dts: marvell: Add network support for Armada 3700
>
> Marcin Wojtas (2):
>   net: mvneta: Convert to be 64 bits compatible
>   net: mvneta: Add network support for Armada 3700 SoC
>
>  Documentation/devicetree/bindings/net/marvell-armada-370-neta.txt |   7 +-
>  arch/arm64/boot/dts/marvell/armada-3720-db.dts                    |  23 +++++-
>  arch/arm64/boot/dts/marvell/armada-37xx.dtsi                      |  23 +++++-
>  drivers/net/ethernet/marvell/Kconfig                              |  10 +-
>  drivers/net/ethernet/marvell/mvneta.c                             | 344 +++++++++++++++++++++++++++++++++++++++++++++++++++---------------------
>  5 files changed, 305 insertions(+), 102 deletions(-)
>
> base-commit: 436accebb53021ef7c63535f60bda410aa87c136
> --
> git-series 0.8.10

^ permalink raw reply

* Re: [PATCH v5 net-next 7/7] ARM64: dts: marvell: Add network support for Armada 3700
From: Marcin Wojtas @ 2016-12-01 16:05 UTC (permalink / raw)
  To: Gregory CLEMENT
  Cc: David S. Miller, linux-kernel, netdev, Jisheng Zhang,
	Arnd Bergmann, Jason Cooper, Andrew Lunn, Sebastian Hesselbarth,
	Thomas Petazzoni, linux-arm-kernel@lists.infradead.org,
	Nadav Haklai, Dmitri Epshtein, Yelena Krivosheev
In-Reply-To: <f205231e640a2324b8f007073cf54b166a263ed4.1480542157.git-series.gregory.clement@free-electrons.com>

Hi Gregory,

2016-11-30 22:42 GMT+01:00 Gregory CLEMENT <gregory.clement@free-electrons.com>:
> Add neta nodes for network support both in device tree for the SoC and
> the board.
>
> Signed-off-by: Gregory CLEMENT <gregory.clement@free-electrons.com>
> ---
>  arch/arm64/boot/dts/marvell/armada-3720-db.dts | 23 +++++++++++++++++++-
>  arch/arm64/boot/dts/marvell/armada-37xx.dtsi   | 23 +++++++++++++++++++-
>  2 files changed, 46 insertions(+), 0 deletions(-)
>
> diff --git a/arch/arm64/boot/dts/marvell/armada-3720-db.dts b/arch/arm64/boot/dts/marvell/armada-3720-db.dts
> index 1372e9a6aaa4..c8b82e4145de 100644
> --- a/arch/arm64/boot/dts/marvell/armada-3720-db.dts
> +++ b/arch/arm64/boot/dts/marvell/armada-3720-db.dts
> @@ -81,3 +81,26 @@
>  &pcie0 {
>         status = "okay";
>  };
> +
> +&mdio {
> +       status = "okay";
> +       phy0: ethernet-phy@0 {
> +               reg = <0>;
> +       };
> +
> +       phy1: ethernet-phy@1 {
> +               reg = <1>;
> +       };
> +};
> +
> +&eth0 {
> +       phy-mode = "rgmii-id";
> +       phy = <&phy0>;
> +       status = "okay";
> +};
> +
> +&eth1 {
> +       phy-mode = "rgmii-id";

Should be "sgmii".

Best regards,
Marcin

^ permalink raw reply

* Re: [WIP] net+mlx4: auto doorbell
From: Jesper Dangaard Brouer @ 2016-12-01 16:04 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: Saeed Mahameed, Rick Jones, Linux Netdev List, Saeed Mahameed,
	Tariq Toukan, brouer
In-Reply-To: <1480602274.18162.285.camel@edumazet-glaptop3.roam.corp.google.com>

On Thu, 01 Dec 2016 06:24:34 -0800
Eric Dumazet <eric.dumazet@gmail.com> wrote:

> On Thu, 2016-12-01 at 13:05 +0100, Jesper Dangaard Brouer wrote:
> > On Wed, 30 Nov 2016 18:27:45 +0200
> > Saeed Mahameed <saeedm@dev.mellanox.co.il> wrote:
> >   
> > > >> All in all, this is risky business :),  the right way to go is to
> > > >> force the upper layer to use xmit-more and delay doorbells/use bulking
> > > >> but from the same context (xmit routine).  For example see
> > > >> Achiad's suggestion (attached in Jesper's response), he used stop
> > > >> queue to force the stack to queue up packets (TX bulking)
> > > >> which would set xmit-more and will use the next completion to
> > > >> release the "stopped" ring TXQ rather than hit the doorbell on
> > > >> behalf of it.    
> > > >
> > > > Well, you depend on having a higher level queue like a qdisc.
> > > >
> > > > Some users do not use a qdisc.
> > > > If you stop the queue, they no longer can send anything -> drops.
> > > >  
> > 
> > You do have a point that stopping the device might not be the best way
> > to create a push-back (to allow stack queue packets).
> > 
> >  netif_tx_stop_queue() / __QUEUE_STATE_DRV_XOFF
> > 
> >   
> > > In this case, i think they should implement their own bulking (pktgen
> > > is not a good example) but XDP can predict if it has more packets to
> > > xmit  as long as all of them fall in the same NAPI cycle.
> > > Others should try and do the same.  
> > 
> > I actually agree with Saeed here.
> > 
> > Maybe we can come up with another __QUEUE_STATE_xxx that informs the
> > upper layer what the driver is doing.  Then users not using a qdisc can
> > use this indication (like the qdisc could).  (qdisc-bypass users already
> > check the QUEUE_STATE flags e.g. via netif_xmit_frozen_or_drv_stopped).  
> 
> Can you explain how this is going to help trafgen using AF_PACKET with
> Qdisc bypass ?
> 
> Say trafgen wants to send 10 or 1000 packets back to back (as fast as
> possible)
>
> With my proposal, only the first is triggering a doorbell from
> ndo_start_xmit(). Following ones are driven by TX completion logic, or
> BQL if we can push packets faster than TX interrupt can be
> delivered/handled.
> 
> If you stop the queue (with yet another atomic operations to stop/unstop
> btw), packet_direct_xmit() will have to drop trafgen packets on the
> floor.

I think you misunderstood my concept[1].  I don't want to stop the
queue. The new __QUEUE_STATE_FLUSH_NEEDED does not stop the queue, is
it just indicating that someone need to flush/ring-doorbell.  Maybe it
need another name, because it also indicate that the driver can see
that its TX queue is so busy that we don't need to call it immediately.
The qdisc layer can then choose to enqueue instead if doing direct xmit.

When qdisc layer or trafgen/af_packet see this indication it knows it
should/must flush the queue when it don't have more work left.  Perhaps
through net_tx_action(), by registering itself and e.g. if qdisc_run()
is called and queue is empty then check if queue needs a flush. I would
also allow driver to flush and clear this bit.

I just see it as an extension of your solution, as we still need the
driver to figure out then the doorbell/flush can be delayed.
p.s. don't be discouraged by this feedback, I'm just very excited and
happy that your are working on a solution in this area. As this is a
problem area that I've not been able to solve myself for the last
approx 2 years. Keep up the good work!

-- 
Best regards,
  Jesper Dangaard Brouer
  MSc.CS, Principal Kernel Engineer at Red Hat
  LinkedIn: http://www.linkedin.com/in/brouer

[1] http://lkml.kernel.org/r/20161130233015.3de95356@redhat.com

^ permalink raw reply

* Re: Regression: [PATCH] mlx4: give precise rx/tx bytes/packets counters
From: Eric Dumazet @ 2016-12-01 15:55 UTC (permalink / raw)
  To: Saeed Mahameed; +Cc: Jesper Dangaard Brouer, David Miller, netdev, Tariq Toukan
In-Reply-To: <CALzJLG8O28ZBS3sdqLpohPZtpLbLs228K4aO6hYd4URsA3yefA@mail.gmail.com>

On Thu, 2016-12-01 at 17:38 +0200, Saeed Mahameed wrote:

> 
> Hi Eric, Thanks for the patch, I already acked it.

Thanks !

> 
> I have one educational question (not related to this patch, but
> related to stats reading in general).
> I was wondering why do we need to disable bh every time we read stats
> "spin_lock_bh" ? is it essential ?
> 
> I checked and in mlx4 we don't hold stats_lock in softirq
> (en_rx.c/en_tx.c), so I don't see any deadlock risk in here..

Excellent question, and I chose to keep the spinlock.

That would be doable, only if we do not overwrite dev->stats.

Current code is :

static struct rtnl_link_stats64 *
mlx4_en_get_stats64(struct net_device *dev, struct rtnl_link_stats64 *stats)
{
        struct mlx4_en_priv *priv = netdev_priv(dev);

        spin_lock_bh(&priv->stats_lock);
        mlx4_en_fold_software_stats(dev);
        netdev_stats_to_stats64(stats, &dev->stats);
        spin_unlock_bh(&priv->stats_lock);

        return stats;
}

If you remove the spin_lock_bh() :

static struct rtnl_link_stats64 *
mlx4_en_get_stats64(struct net_device *dev, struct rtnl_link_stats64 *stats)
{
        struct mlx4_en_priv *priv = netdev_priv(dev);

        mlx4_en_fold_software_stats(dev); // possible races

        netdev_stats_to_stats64(stats, &dev->stats);

        return stats;
}

1) one mlx4_en_fold_software_stats(dev) could be preempted
on a CONFIG_PREEMPT kernel, or interrupted by long irqs.

2) Another cpu would also call mlx4_en_fold_software_stats(dev) while
   first cpu is busy.

3) Then when resuming first cpu/thread, part of the dev->stats fieds 
would be updated with 'old counters',
while another thread might have updated them with newer values.

4) A SNMP reader could then get counters that are not monotonically
increasing,
which would be confusing/buggy.

So removing the spinlock is doable, but needs to add a new parameter
to mlx4_en_fold_software_stats() and call netdev_stats_to_stats64()
before mlx4_en_fold_software_stats(dev)

static struct rtnl_link_stats64 *
mlx4_en_get_stats64(struct net_device *dev, struct rtnl_link_stats64 *stats)
{
        struct mlx4_en_priv *priv = netdev_priv(dev);

        netdev_stats_to_stats64(stats, &dev->stats);

	// Passing a non NULL stats asks mlx4_en_fold_software_stats()
	// to not update dev->stats, but stats directly.

        mlx4_en_fold_software_stats(dev, stats)


        return stats;
}

^ permalink raw reply

* Re: [flamebait] xdp, well meaning but pointless
From: Hannes Frederic Sowa @ 2016-12-01 15:52 UTC (permalink / raw)
  To: Thomas Graf, Florian Westphal; +Cc: netdev
In-Reply-To: <20161201145834.GA569@pox.localdomain>

Hi,

On 01.12.2016 15:58, Thomas Graf wrote:
> On 12/01/16 at 10:11am, Florian Westphal wrote:
>> Aside from this, XDP, like DPDK, is a kernel bypass.
>> You might say 'Its just stack bypass, not a kernel bypass!'.
>> But what does that mean exactly?  That packets can still be passed
>> onward to normal stack?
>> Bypass solutions like netmap can also inject packets back to
>> kernel stack again.
> 
> I have a fundamental issue with the approach of exporting packets into
> user space and reinjecting them: Once the packet leaves the kernel,
> any security guarantees are off. I have no control over what is
> running in user space and whether whatever listener up there has been
> compromised or not. To me, that's a no go, in particular for servers
> hosting multi tenant workloads. This is one of the main reasons why
> XDP, in particular in combination with BPF, is very interesting to me.

First of all, this is a rant targeted at XDP and not at eBPF as a whole.
XDP manipulates packets at free will and thus all security guarantees
are off as well as in any user space solution.

Secondly user space provides policy, acl, more controlled memory
protection, restartability and better debugability. If I had multi
tenant workloads I would definitely put more complex "business/acl"
logic into user space, so I can make use of LSM and other features to
especially prevent a network facing service to attack the tenants. If
stuff gets put into the kernel you run user controlled code in the
kernel exposing a much bigger attack vector.

What use case do you see in XDP specifically e.g. for container networking?

>> b). with regards to a programmable data path: IFF one wants to do this
>> in kernel (and thats a big if), it seems much more preferrable to provide
>> a config/data-based approach rather than a programmable one.  If you want
>> full freedom DPDK is architecturally just too powerful to compete with.
> 
> I must have missed the legal disclaimer that is usually put in front
> of the DPDK marketing show :-)
>
> I don't want full freedom. I want programmability with stack integration
> at sufficient speed and the ability to benefit from the hardware
> abstractions that the kernel provides.
> 
>> Proponents of XDP sometimes provide usage examples.
>> Lets look at some of these.
> 
> [ I won't comment on any of the other use cases because they are of no
>   interest to me ]
> 
>> * Load balancer
>> State holding algorithm need sorting and searching, so also no fit for
>> eBPF (could be exposed by function exports, but then can we do DoS by
>> finding worst case scenarios?).
>>
>> Also again needs way to forward frame out via another interface.
>>
>> For cases where packet gets sent out via same interface it would appear
>> to be easier to use port mirroring in a switch and use stochastic filtering
>> on end nodes to determine which host should take responsibility.
>>
>> XDP plus: central authority over how distribution will work in case
>> nodes are added/removed from pool.
>> But then again, it will be easier to hande this with netmap/dpdk where
>> more complicated scheduling algorithms can be used.
> 
> I agree with you if the LB is a software based appliance in either a
> dedicated VM or on dedicated baremetal.
> 
> The reality is turning out to be different in many cases though, LB
> needs to be performed not only for north south but east west as well.
> So even if I would handle LB for traffic entering my datacenter in user
> space, I will need the same LB for packets from my applications and
> I definitely don't want to move all of that into user space.

The open question to me is why is programmability needed here.

Look at the discussion about ECMP and consistent hashing. It is not very
easy to actually write this code correctly. Why can't we just put C code
into the kernel that implements this once and for all and let user space
update the policies?

Load balancers have to deal correctly with ICMP packets, e.g. they even
have to be duplicated to every ECMP route. This seems to be problematic
to do in eBPF programs due to looping constructs so you end up with
complicated user space anyway.

>> * early drop/filtering.
>> While its possible to do "u32" like filters with ebpf, all modern nics
>> support ntuple filtering in hardware, which is going to be faster because
>> such packet will never even be signalled to the operating system.
>> For more complicated cases (e.g. doing socket lookup to check if particular
>> packet does match bound socket (and expected sequence numbers etc) I don't
>> see easy ways to do that with XDP (and without sk_buff context).
>> Providing it via function exports is possible of course, but that will only
>> result in an "arms race" where we will see special-sauce functions
>> all over the place -- DoS will always attempt to go for something
>> that is difficult to filter against, cf. all the recent volume-based
>> floodings.
> 
> You probably put this last because this was the most difficult to
> shoot down ;-)
> 
> The benefits of XDP for this use case are extremely obvious in combination
> with local applications which need to be protected. ntuple filters won't
> cut it. They are limited and subject to a certain rate at which they
> can be configured. Any serious mitigation will require stateful filtering
> with at least minimal L7 matching abilities and this is exactly where XDP
> will excel.

In my experience and research of DoS attacks you certainly want to put a
bit more logic into a filter than to look up something from hash tables
and drop it then. You certainly also want to have some more logic than
32 * 4096 instructions to execute, e.g. parsing and matching of DNS/NTP
packets with certain conditions and side look-ups. If you seriously do
that stuff you end up with a highly optimized programs containing
stochastic filters and also complex database logic.

If I want to drop based on hash table lookups, as Florian wrote, I would
let the hardware do that and assemble the tables in user space.

Bye,
Hannes

^ permalink raw reply

* [PATCH 1/1] NET: usb: qmi_wwan: add support for Telit LE922A PID 0x1040
From: Daniele Palmas @ 2016-12-01 15:52 UTC (permalink / raw)
  To: Bjørn Mork; +Cc: netdev, Daniele Palmas
In-Reply-To: <1480607525-23044-1-git-send-email-dnlplm@gmail.com>

This patch adds support for PID 0x1040 of Telit LE922A.

The qmi adapter requires to have DTR set for proper working,
so QMI_WWAN_QUIRK_DTR has been enabled.

Signed-off-by: Daniele Palmas <dnlplm@gmail.com>
---
 drivers/net/usb/qmi_wwan.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/net/usb/qmi_wwan.c b/drivers/net/usb/qmi_wwan.c
index 3ff76c6..6fe1cdb 100644
--- a/drivers/net/usb/qmi_wwan.c
+++ b/drivers/net/usb/qmi_wwan.c
@@ -894,6 +894,7 @@ static const struct usb_device_id products[] = {
 	{QMI_FIXED_INTF(0x1bbb, 0x0203, 2)},	/* Alcatel L800MA */
 	{QMI_FIXED_INTF(0x2357, 0x0201, 4)},	/* TP-LINK HSUPA Modem MA180 */
 	{QMI_FIXED_INTF(0x2357, 0x9000, 4)},	/* TP-LINK MA260 */
+	{QMI_QUIRK_SET_DTR(0x1bc7, 0x1040, 2)},	/* Telit LE922A */
 	{QMI_FIXED_INTF(0x1bc7, 0x1200, 5)},	/* Telit LE920 */
 	{QMI_FIXED_INTF(0x1bc7, 0x1201, 2)},	/* Telit LE920 */
 	{QMI_FIXED_INTF(0x1c9e, 0x9b01, 3)},	/* XS Stick W100-2 from 4G Systems */
-- 
2.7.4

^ permalink raw reply related

* [PATCH 0/1] NET: usb: qmi_wwan: add support for Telit LE922A PID 0x1040
From: Daniele Palmas @ 2016-12-01 15:52 UTC (permalink / raw)
  To: Bjørn Mork; +Cc: netdev, Daniele Palmas

This patch adds support for PID 0x1040 of Telit LE922A.

The qmi adapter requires to have DTR set for proper working,
so QMI_WWAN_QUIRK_DTR has been enabled.

Following verbose lsusb output of the composition:

Bus 003 Device 006: ID 1bc7:1040 Telit Wireless Solutions 
Device Descriptor:
  bLength                18
  bDescriptorType         1
  bcdUSB               2.00
  bDeviceClass            0 (Defined at Interface level)
  bDeviceSubClass         0 
  bDeviceProtocol         0 
  bMaxPacketSize0        64
  idVendor           0x1bc7 Telit Wireless Solutions
  idProduct          0x1040 
  bcdDevice            3.10
  iManufacturer           1 Android
  iProduct                2 Android
  iSerial                 3 359fb2
  bNumConfigurations      1
  Configuration Descriptor:
    bLength                 9
    bDescriptorType         2
    wTotalLength          281
    bNumInterfaces          7
    bConfigurationValue     1
    iConfiguration          0 
    bmAttributes         0x80
      (Bus Powered)
    MaxPower              500mA
    Interface Descriptor:
      bLength                 9
      bDescriptorType         4
      bInterfaceNumber        0
      bAlternateSetting       0
      bNumEndpoints           2
      bInterfaceClass       255 Vendor Specific Class
      bInterfaceSubClass    255 Vendor Specific Subclass
      bInterfaceProtocol    255 Vendor Specific Protocol
      iInterface              0 
      Endpoint Descriptor:
        bLength                 7
        bDescriptorType         5
        bEndpointAddress     0x81  EP 1 IN
        bmAttributes            2
          Transfer Type            Bulk
          Synch Type               None
          Usage Type               Data
        wMaxPacketSize     0x0200  1x 512 bytes
        bInterval               0
      Endpoint Descriptor:
        bLength                 7
        bDescriptorType         5
        bEndpointAddress     0x01  EP 1 OUT
        bmAttributes            2
          Transfer Type            Bulk
          Synch Type               None
          Usage Type               Data
        wMaxPacketSize     0x0200  1x 512 bytes
        bInterval               0
    Interface Descriptor:
      bLength                 9
      bDescriptorType         4
      bInterfaceNumber        1
      bAlternateSetting       0
      bNumEndpoints           2
      bInterfaceClass       255 Vendor Specific Class
      bInterfaceSubClass     66 
      bInterfaceProtocol      1 
      iInterface              4 ADB Interface
      Endpoint Descriptor:
        bLength                 7
        bDescriptorType         5
        bEndpointAddress     0x02  EP 2 OUT
        bmAttributes            2
          Transfer Type            Bulk
          Synch Type               None
          Usage Type               Data
        wMaxPacketSize     0x0200  1x 512 bytes
        bInterval               0
      Endpoint Descriptor:
        bLength                 7
        bDescriptorType         5
        bEndpointAddress     0x82  EP 2 IN
        bmAttributes            2
          Transfer Type            Bulk
          Synch Type               None
          Usage Type               Data
        wMaxPacketSize     0x0200  1x 512 bytes
        bInterval               0
    Interface Descriptor:
      bLength                 9
      bDescriptorType         4
      bInterfaceNumber        2
      bAlternateSetting       0
      bNumEndpoints           3
      bInterfaceClass       255 Vendor Specific Class
      bInterfaceSubClass    255 Vendor Specific Subclass
      bInterfaceProtocol    255 Vendor Specific Protocol
      iInterface              0 
      Endpoint Descriptor:
        bLength                 7
        bDescriptorType         5
        bEndpointAddress     0x84  EP 4 IN
        bmAttributes            3
          Transfer Type            Interrupt
          Synch Type               None
          Usage Type               Data
        wMaxPacketSize     0x0008  1x 8 bytes
        bInterval               9
      Endpoint Descriptor:
        bLength                 7
        bDescriptorType         5
        bEndpointAddress     0x83  EP 3 IN
        bmAttributes            2
          Transfer Type            Bulk
          Synch Type               None
          Usage Type               Data
        wMaxPacketSize     0x0200  1x 512 bytes
        bInterval               0
      Endpoint Descriptor:
        bLength                 7
        bDescriptorType         5
        bEndpointAddress     0x03  EP 3 OUT
        bmAttributes            2
          Transfer Type            Bulk
          Synch Type               None
          Usage Type               Data
        wMaxPacketSize     0x0200  1x 512 bytes
        bInterval               0
    Interface Descriptor:
      bLength                 9
      bDescriptorType         4
      bInterfaceNumber        3
      bAlternateSetting       0
      bNumEndpoints           3
      bInterfaceClass       255 Vendor Specific Class
      bInterfaceSubClass      0 
      bInterfaceProtocol      0 
      iInterface              0 
      ** UNRECOGNIZED:  05 24 00 10 01
      ** UNRECOGNIZED:  05 24 01 00 00
      ** UNRECOGNIZED:  04 24 02 02
      ** UNRECOGNIZED:  05 24 06 00 00
      Endpoint Descriptor:
        bLength                 7
        bDescriptorType         5
        bEndpointAddress     0x86  EP 6 IN
        bmAttributes            3
          Transfer Type            Interrupt
          Synch Type               None
          Usage Type               Data
        wMaxPacketSize     0x000a  1x 10 bytes
        bInterval               9
      Endpoint Descriptor:
        bLength                 7
        bDescriptorType         5
        bEndpointAddress     0x85  EP 5 IN
        bmAttributes            2
          Transfer Type            Bulk
          Synch Type               None
          Usage Type               Data
        wMaxPacketSize     0x0200  1x 512 bytes
        bInterval               0
      Endpoint Descriptor:
        bLength                 7
        bDescriptorType         5
        bEndpointAddress     0x04  EP 4 OUT
        bmAttributes            2
          Transfer Type            Bulk
          Synch Type               None
          Usage Type               Data
        wMaxPacketSize     0x0200  1x 512 bytes
        bInterval               0
    Interface Descriptor:
      bLength                 9
      bDescriptorType         4
      bInterfaceNumber        4
      bAlternateSetting       0
      bNumEndpoints           3
      bInterfaceClass       255 Vendor Specific Class
      bInterfaceSubClass      0 
      bInterfaceProtocol      0 
      iInterface              0 
      ** UNRECOGNIZED:  05 24 00 10 01
      ** UNRECOGNIZED:  05 24 01 00 00
      ** UNRECOGNIZED:  04 24 02 02
      ** UNRECOGNIZED:  05 24 06 00 00
      Endpoint Descriptor:
        bLength                 7
        bDescriptorType         5
        bEndpointAddress     0x88  EP 8 IN
        bmAttributes            3
          Transfer Type            Interrupt
          Synch Type               None
          Usage Type               Data
        wMaxPacketSize     0x000a  1x 10 bytes
        bInterval               9
      Endpoint Descriptor:
        bLength                 7
        bDescriptorType         5
        bEndpointAddress     0x87  EP 7 IN
        bmAttributes            2
          Transfer Type            Bulk
          Synch Type               None
          Usage Type               Data
        wMaxPacketSize     0x0200  1x 512 bytes
        bInterval               0
      Endpoint Descriptor:
        bLength                 7
        bDescriptorType         5
        bEndpointAddress     0x05  EP 5 OUT
        bmAttributes            2
          Transfer Type            Bulk
          Synch Type               None
          Usage Type               Data
        wMaxPacketSize     0x0200  1x 512 bytes
        bInterval               0
    Interface Descriptor:
      bLength                 9
      bDescriptorType         4
      bInterfaceNumber        5
      bAlternateSetting       0
      bNumEndpoints           3
      bInterfaceClass       255 Vendor Specific Class
      bInterfaceSubClass      0 
      bInterfaceProtocol      0 
      iInterface              0 
      ** UNRECOGNIZED:  05 24 00 10 01
      ** UNRECOGNIZED:  05 24 01 00 00
      ** UNRECOGNIZED:  04 24 02 02
      ** UNRECOGNIZED:  05 24 06 00 00
      Endpoint Descriptor:
        bLength                 7
        bDescriptorType         5
        bEndpointAddress     0x8a  EP 10 IN
        bmAttributes            3
          Transfer Type            Interrupt
          Synch Type               None
          Usage Type               Data
        wMaxPacketSize     0x000a  1x 10 bytes
        bInterval               9
      Endpoint Descriptor:
        bLength                 7
        bDescriptorType         5
        bEndpointAddress     0x89  EP 9 IN
        bmAttributes            2
          Transfer Type            Bulk
          Synch Type               None
          Usage Type               Data
        wMaxPacketSize     0x0200  1x 512 bytes
        bInterval               0
      Endpoint Descriptor:
        bLength                 7
        bDescriptorType         5
        bEndpointAddress     0x06  EP 6 OUT
        bmAttributes            2
          Transfer Type            Bulk
          Synch Type               None
          Usage Type               Data
        wMaxPacketSize     0x0200  1x 512 bytes
        bInterval               0
    Interface Descriptor:
      bLength                 9
      bDescriptorType         4
      bInterfaceNumber        6
      bAlternateSetting       0
      bNumEndpoints           3
      bInterfaceClass       255 Vendor Specific Class
      bInterfaceSubClass      0 
      bInterfaceProtocol      0 
      iInterface              0 
      ** UNRECOGNIZED:  05 24 00 10 01
      ** UNRECOGNIZED:  05 24 01 00 00
      ** UNRECOGNIZED:  04 24 02 02
      ** UNRECOGNIZED:  05 24 06 00 00
      Endpoint Descriptor:
        bLength                 7
        bDescriptorType         5
        bEndpointAddress     0x8c  EP 12 IN
        bmAttributes            3
          Transfer Type            Interrupt
          Synch Type               None
          Usage Type               Data
        wMaxPacketSize     0x000a  1x 10 bytes
        bInterval               9
      Endpoint Descriptor:
        bLength                 7
        bDescriptorType         5
        bEndpointAddress     0x8b  EP 11 IN
        bmAttributes            2
          Transfer Type            Bulk
          Synch Type               None
          Usage Type               Data
        wMaxPacketSize     0x0200  1x 512 bytes
        bInterval               0
      Endpoint Descriptor:
        bLength                 7
        bDescriptorType         5
        bEndpointAddress     0x07  EP 7 OUT
        bmAttributes            2
          Transfer Type            Bulk
          Synch Type               None
          Usage Type               Data
        wMaxPacketSize     0x0200  1x 512 bytes
        bInterval               0
Device Qualifier (for other device speed):
  bLength                10
  bDescriptorType         6
  bcdUSB               2.00
  bDeviceClass            0 (Defined at Interface level)
  bDeviceSubClass         0 
  bDeviceProtocol         0 
  bMaxPacketSize0        64
  bNumConfigurations      1
Device Status:     0x0000
  (Bus Powered)


Daniele Palmas (1):
  NET: usb: qmi_wwan: add support for Telit LE922A PID 0x1040

 drivers/net/usb/qmi_wwan.c | 1 +
 1 file changed, 1 insertion(+)

-- 
2.7.4

^ permalink raw reply


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox