Netdev List

Netdev List
 help / color / mirror / Atom feed

* [RFC v2 PATCH 2/4] ebpf: Add sg_filter_run and sg helper
From: Tushar Dave @ 2018-06-19 18:00 UTC (permalink / raw)
  To: ast, daniel, davem, jakub.kicinski, quentin.monnet, jiong.wang,
	guro, sandipan, john.fastabend, kafai, rdna, brakmo, netdev, acme,
	sowmini.varadhan
In-Reply-To: <1529431217-5264-1-git-send-email-tushar.n.dave@oracle.com>

When sg_filter_run() is invoked it runs the attached eBPF
SOCKET_SG_FILTER program which deals with struct scatterlist.

In addition, this patch also adds bpf_sg_next helper function that
allows users to retrieve the next sg element from sg list.

Signed-off-by: Tushar Dave <tushar.n.dave@oracle.com>
Acked-by: Sowmini Varadhan <sowmini.varadhan@oracle.com>
---
 include/linux/filter.h                    |  2 +
 include/uapi/linux/bpf.h                  | 10 ++++-
 net/core/filter.c                         | 72 +++++++++++++++++++++++++++++++
 tools/include/uapi/linux/bpf.h            | 10 ++++-
 tools/testing/selftests/bpf/bpf_helpers.h |  3 ++
 5 files changed, 95 insertions(+), 2 deletions(-)

diff --git a/include/linux/filter.h b/include/linux/filter.h
index 71618b1..d176402 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -1072,4 +1072,6 @@ struct bpf_sock_ops_kern {
 					 */
 };
 
+int sg_filter_run(struct sock *sk, struct scatterlist *sg);
+
 #endif /* __LINUX_FILTER_H__ */
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index ef0a7b6..036432b 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -2076,6 +2076,13 @@ struct bpf_stack_build_id {
  * 	Return
  * 		A 64-bit integer containing the current cgroup id based
  * 		on the cgroup within which the current task is running.
+ *
+ * int bpf_sg_next(struct bpf_scatterlist *sg)
+ *	Description
+ *		This helper allows user to retrieve next sg element from
+ *		sg list.
+ *	Return
+ *		Returns 0 on success, or a negative error in case of failure.
  */
 #define __BPF_FUNC_MAPPER(FN)		\
 	FN(unspec),			\
@@ -2158,7 +2165,8 @@ struct bpf_stack_build_id {
 	FN(rc_repeat),			\
 	FN(rc_keydown),			\
 	FN(skb_cgroup_id),		\
-	FN(get_current_cgroup_id),
+	FN(get_current_cgroup_id),	\
+	FN(sg_next),
 
 /* integer value in 'imm' field of BPF_CALL instruction selects which helper
  * function eBPF program intends to call
diff --git a/net/core/filter.c b/net/core/filter.c
index 8f67942..702ff5b 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -121,6 +121,53 @@ int sk_filter_trim_cap(struct sock *sk, struct sk_buff *skb, unsigned int cap)
 }
 EXPORT_SYMBOL(sk_filter_trim_cap);
 
+int sg_filter_run(struct sock *sk, struct scatterlist *sg)
+{
+	struct sk_filter *filter;
+	int err;
+
+	rcu_read_lock();
+	filter = rcu_dereference(sk->sk_filter);
+	if (filter) {
+		struct bpf_scatterlist bpfsg;
+		int num_sg;
+
+		if (!sg) {
+			err = -EINVAL;
+			goto out;
+		}
+
+		num_sg = sg_nents(sg);
+		if (num_sg <= 0) {
+			err = -EINVAL;
+			goto out;
+		}
+
+		/* We store a reference  to the sg list so it can later used by
+		 * eBPF helpers to retrieve the next sg element.
+		 */
+		bpfsg.num_sg = num_sg;
+		bpfsg.cur_sg = 0;
+		bpfsg.sg = sg;
+
+		/* For the first sg element, we store the pkt access pointers
+		 * into start and end so eBPF program can have pkt access using
+		 * data and data_end. The pkt access for subsequent element of
+		 * sg list is possible when eBPF program invokes bpf_sg_next
+		 * which takes care of setting start and end to the correct sg
+		 * element.
+		 */
+		bpfsg.start = sg_virt(sg);
+		bpfsg.end = bpfsg.start + sg->length;
+		BPF_PROG_RUN(filter->prog, &bpfsg);
+	}
+out:
+	rcu_read_unlock();
+
+	return err;
+}
+EXPORT_SYMBOL(sg_filter_run);
+
 BPF_CALL_1(bpf_skb_get_pay_offset, struct sk_buff *, skb)
 {
 	return skb_get_poff(skb);
@@ -3753,6 +3800,29 @@ static unsigned long bpf_xdp_copy(void *dst_buff, const void *src_buff,
 	.arg1_type      = ARG_PTR_TO_CTX,
 };
 
+BPF_CALL_1(bpf_sg_next, struct bpf_scatterlist *, bpfsg)
+{
+	struct scatterlist *sg = bpfsg->sg;
+	int cur_sg = bpfsg->cur_sg;
+
+	cur_sg++;
+	if (cur_sg >= bpfsg->num_sg)
+		return -ENODATA;
+
+	bpfsg->cur_sg = cur_sg;
+	bpfsg->start = sg_virt(&sg[cur_sg]);
+	bpfsg->end = bpfsg->start + sg[cur_sg].length;
+
+	return 0;
+}
+
+static const struct bpf_func_proto bpf_sg_next_proto = {
+	.func		= bpf_sg_next,
+	.gpl_only	= false,
+	.ret_type	= RET_INTEGER,
+	.arg1_type	= ARG_PTR_TO_CTX,
+};
+
 BPF_CALL_5(bpf_setsockopt, struct bpf_sock_ops_kern *, bpf_sock,
 	   int, level, int, optname, char *, optval, int, optlen)
 {
@@ -4720,6 +4790,8 @@ bool bpf_helper_changes_pkt_data(void *func)
 socksg_filter_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 {
 	switch (func_id) {
+	case BPF_FUNC_sg_next:
+		return &bpf_sg_next_proto;
 	default:
 		return bpf_base_func_proto(func_id);
 	}
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index c87ae16..a298498 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -2076,6 +2076,13 @@ struct bpf_stack_build_id {
  * 	Return
  * 		A 64-bit integer containing the current cgroup id based
  * 		on the cgroup within which the current task is running.
+ *
+ * int bpf_sg_next(struct bpf_scatterlist *sg)
+ *	Description
+ *		This helper allows user to retrieve next sg element from
+ *		sg list.
+ *	Return
+ *		Returns 0 on success, or a negative error in case of failure.
  */
 #define __BPF_FUNC_MAPPER(FN)		\
 	FN(unspec),			\
@@ -2158,7 +2165,8 @@ struct bpf_stack_build_id {
 	FN(rc_repeat),			\
 	FN(rc_keydown),			\
 	FN(skb_cgroup_id),		\
-	FN(get_current_cgroup_id),
+	FN(get_current_cgroup_id),	\
+	FN(sg_next),
 
 /* integer value in 'imm' field of BPF_CALL instruction selects which helper
  * function eBPF program intends to call
diff --git a/tools/testing/selftests/bpf/bpf_helpers.h b/tools/testing/selftests/bpf/bpf_helpers.h
index f2f28b6..1997ba2 100644
--- a/tools/testing/selftests/bpf/bpf_helpers.h
+++ b/tools/testing/selftests/bpf/bpf_helpers.h
@@ -133,6 +133,9 @@ static int (*bpf_rc_keydown)(void *ctx, unsigned int protocol,
 	(void *) BPF_FUNC_rc_keydown;
 static unsigned long long (*bpf_get_current_cgroup_id)(void) =
 	(void *) BPF_FUNC_get_current_cgroup_id;
+static unsigned long long (*bpf_sg_next)(void *ctx) =
+	(void *) BPF_FUNC_sg_next;
+
 
 /* llvm builtin functions that eBPF C program may use to
  * emit BPF_LD_ABS and BPF_LD_IND instructions
-- 
1.8.3.1

^ permalink raw reply related

* [RFC v2 PATCH 1/4] eBPF: Add new eBPF prog type BPF_PROG_TYPE_SOCKET_SG_FILTER
From: Tushar Dave @ 2018-06-19 18:00 UTC (permalink / raw)
  To: ast, daniel, davem, jakub.kicinski, quentin.monnet, jiong.wang,
	guro, sandipan, john.fastabend, kafai, rdna, brakmo, netdev, acme,
	sowmini.varadhan
In-Reply-To: <1529431217-5264-1-git-send-email-tushar.n.dave@oracle.com>

Add new eBPF prog type BPF_PROG_TYPE_SOCKET_SG_FILTER which uses the
existing socket filter infrastructure for bpf program attach and load.
SOCKET_SG_FILTER eBPF program receives struct scatterlist as bpf context
contrast to SOCKET_FILTER which deals with struct skb. This is useful
for kernel entities that don't have skb to represent packet data but
want to run eBPF socket filter on packet data that is in form of struct
scatterlist e.g. IB/RDMA

Signed-off-by: Tushar Dave <tushar.n.dave@oracle.com>
Acked-by: Sowmini Varadhan <sowmini.varadhan@oracle.com>
---
 include/linux/bpf_types.h      |  1 +
 include/linux/filter.h         |  8 +++++
 include/uapi/linux/bpf.h       |  7 ++++
 kernel/bpf/syscall.c           |  1 +
 kernel/bpf/verifier.c          |  1 +
 net/core/filter.c              | 77 ++++++++++++++++++++++++++++++++++++++++--
 samples/bpf/bpf_load.c         | 11 ++++--
 tools/bpf/bpftool/prog.c       |  1 +
 tools/include/uapi/linux/bpf.h |  7 ++++
 tools/lib/bpf/libbpf.c         |  3 ++
 tools/lib/bpf/libbpf.h         |  2 ++
 11 files changed, 114 insertions(+), 5 deletions(-)

diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h
index c5700c2..f8b4b56 100644
--- a/include/linux/bpf_types.h
+++ b/include/linux/bpf_types.h
@@ -16,6 +16,7 @@
 BPF_PROG_TYPE(BPF_PROG_TYPE_SOCK_OPS, sock_ops)
 BPF_PROG_TYPE(BPF_PROG_TYPE_SK_SKB, sk_skb)
 BPF_PROG_TYPE(BPF_PROG_TYPE_SK_MSG, sk_msg)
+BPF_PROG_TYPE(BPF_PROG_TYPE_SOCKET_SG_FILTER, socksg_filter)
 #endif
 #ifdef CONFIG_BPF_EVENTS
 BPF_PROG_TYPE(BPF_PROG_TYPE_KPROBE, kprobe)
diff --git a/include/linux/filter.h b/include/linux/filter.h
index 45fc0f5..71618b1 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -517,6 +517,14 @@ struct bpf_skb_data_end {
 	void *data_end;
 };
 
+struct bpf_scatterlist {
+	struct scatterlist *sg;
+	void *start;
+	void *end;
+	int cur_sg;
+	int num_sg;
+};
+
 struct sk_msg_buff {
 	void *data;
 	void *data_end;
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 59b19b6..ef0a7b6 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -144,6 +144,7 @@ enum bpf_prog_type {
 	BPF_PROG_TYPE_CGROUP_SOCK_ADDR,
 	BPF_PROG_TYPE_LWT_SEG6LOCAL,
 	BPF_PROG_TYPE_LIRC_MODE2,
+	BPF_PROG_TYPE_SOCKET_SG_FILTER,
 };
 
 enum bpf_attach_type {
@@ -2358,6 +2359,12 @@ enum sk_action {
 	SK_PASS,
 };
 
+/* use accessible scatterlist */
+struct sg_filter_md {
+	void *data; /* sg_virt(sg) */
+	void *data_end; /* sg_virt(sg) + sg->length */
+};
+
 /* user accessible metadata for SK_MSG packet hook, new fields must
  * be added to the end of this structure
  */
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 0fa2062..74193a8 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -1300,6 +1300,7 @@ static int bpf_prog_load(union bpf_attr *attr)
 
 	if (type != BPF_PROG_TYPE_SOCKET_FILTER &&
 	    type != BPF_PROG_TYPE_CGROUP_SKB &&
+	    type != BPF_PROG_TYPE_SOCKET_SG_FILTER &&
 	    !capable(CAP_SYS_ADMIN))
 		return -EPERM;
 
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index d6403b5..a00d3eb 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -1320,6 +1320,7 @@ static bool may_access_direct_pkt_data(struct bpf_verifier_env *env,
 	case BPF_PROG_TYPE_LWT_XMIT:
 	case BPF_PROG_TYPE_SK_SKB:
 	case BPF_PROG_TYPE_SK_MSG:
+	case BPF_PROG_TYPE_SOCKET_SG_FILTER:
 		if (meta)
 			return meta->pkt_access;
 
diff --git a/net/core/filter.c b/net/core/filter.c
index 3d9ba7e..8f67942 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -1130,7 +1130,8 @@ static void bpf_release_orig_filter(struct bpf_prog *fp)
 
 static void __bpf_prog_release(struct bpf_prog *prog)
 {
-	if (prog->type == BPF_PROG_TYPE_SOCKET_FILTER) {
+	if (prog->type == BPF_PROG_TYPE_SOCKET_FILTER ||
+	    prog->type == BPF_PROG_TYPE_SOCKET_SG_FILTER) {
 		bpf_prog_put(prog);
 	} else {
 		bpf_release_orig_filter(prog);
@@ -1551,10 +1552,16 @@ int sk_reuseport_attach_filter(struct sock_fprog *fprog, struct sock *sk)
 
 static struct bpf_prog *__get_bpf(u32 ufd, struct sock *sk)
 {
+	struct bpf_prog *prog;
+
 	if (sock_flag(sk, SOCK_FILTER_LOCKED))
 		return ERR_PTR(-EPERM);
 
-	return bpf_prog_get_type(ufd, BPF_PROG_TYPE_SOCKET_FILTER);
+	prog = bpf_prog_get_type(ufd, BPF_PROG_TYPE_SOCKET_FILTER);
+	if (IS_ERR(prog))
+		prog = bpf_prog_get_type(ufd, BPF_PROG_TYPE_SOCKET_SG_FILTER);
+
+	return prog;
 }
 
 int sk_attach_bpf(u32 ufd, struct sock *sk)
@@ -4710,6 +4717,15 @@ bool bpf_helper_changes_pkt_data(void *func)
 }
 
 static const struct bpf_func_proto *
+socksg_filter_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
+{
+	switch (func_id) {
+	default:
+		return bpf_base_func_proto(func_id);
+	}
+}
+
+static const struct bpf_func_proto *
 tc_cls_act_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 {
 	switch (func_id) {
@@ -5037,6 +5053,30 @@ static bool sk_filter_is_valid_access(int off, int size,
 	return bpf_skb_is_valid_access(off, size, type, prog, info);
 }
 
+static bool socksg_filter_is_valid_access(int off, int size,
+					  enum bpf_access_type type,
+					  const struct bpf_prog *prog,
+					  struct bpf_insn_access_aux *info)
+{
+	switch (off) {
+	case offsetof(struct sg_filter_md, data):
+		info->reg_type = PTR_TO_PACKET;
+		break;
+	case offsetof(struct sg_filter_md, data_end):
+		info->reg_type = PTR_TO_PACKET_END;
+		break;
+	}
+
+	if (off < 0 || off >= sizeof(struct sg_filter_md))
+		return false;
+	if (off % size != 0)
+		return false;
+	if (size != sizeof(__u64))
+		return false;
+
+	return true;
+}
+
 static bool lwt_is_valid_access(int off, int size,
 				enum bpf_access_type type,
 				const struct bpf_prog *prog,
@@ -6516,6 +6556,30 @@ static u32 sk_skb_convert_ctx_access(enum bpf_access_type type,
 	return insn - insn_buf;
 }
 
+static u32 socksg_filter_convert_ctx_access(enum bpf_access_type type,
+					    const struct bpf_insn *si,
+					    struct bpf_insn *insn_buf,
+					    struct bpf_prog *prog,
+					    u32 *target_size)
+{
+	struct bpf_insn *insn = insn_buf;
+
+	switch (si->off) {
+	case offsetof(struct sg_filter_md, data):
+		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_scatterlist, start),
+				      si->dst_reg, si->src_reg,
+				      offsetof(struct bpf_scatterlist, start));
+		break;
+	case offsetof(struct sg_filter_md, data_end):
+		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_scatterlist, end),
+				      si->dst_reg, si->src_reg,
+				      offsetof(struct bpf_scatterlist, end));
+		break;
+	}
+
+	return insn - insn_buf;
+}
+
 static u32 sk_msg_convert_ctx_access(enum bpf_access_type type,
 				     const struct bpf_insn *si,
 				     struct bpf_insn *insn_buf,
@@ -6654,6 +6718,15 @@ static u32 sk_msg_convert_ctx_access(enum bpf_access_type type,
 	.test_run		= bpf_prog_test_run_skb,
 };
 
+const struct bpf_verifier_ops socksg_filter_verifier_ops = {
+	.get_func_proto         = socksg_filter_func_proto,
+	.is_valid_access        = socksg_filter_is_valid_access,
+	.convert_ctx_access     = socksg_filter_convert_ctx_access,
+};
+
+const struct bpf_prog_ops socksg_filter_prog_ops = {
+};
+
 const struct bpf_verifier_ops tc_cls_act_verifier_ops = {
 	.get_func_proto		= tc_cls_act_func_proto,
 	.is_valid_access	= tc_cls_act_is_valid_access,
diff --git a/samples/bpf/bpf_load.c b/samples/bpf/bpf_load.c
index 89161c9..15c355e 100644
--- a/samples/bpf/bpf_load.c
+++ b/samples/bpf/bpf_load.c
@@ -69,6 +69,8 @@ static int load_and_attach(const char *event, struct bpf_insn *prog, int size)
 	bool is_sockops = strncmp(event, "sockops", 7) == 0;
 	bool is_sk_skb = strncmp(event, "sk_skb", 6) == 0;
 	bool is_sk_msg = strncmp(event, "sk_msg", 6) == 0;
+	bool is_socksg = strncmp(event, "socksg", 6) == 0;
+
 	size_t insns_cnt = size / sizeof(struct bpf_insn);
 	enum bpf_prog_type prog_type;
 	char buf[256];
@@ -102,6 +104,8 @@ static int load_and_attach(const char *event, struct bpf_insn *prog, int size)
 		prog_type = BPF_PROG_TYPE_SK_SKB;
 	} else if (is_sk_msg) {
 		prog_type = BPF_PROG_TYPE_SK_MSG;
+	} else if (is_socksg) {
+		prog_type = BPF_PROG_TYPE_SOCKET_SG_FILTER;
 	} else {
 		printf("Unknown event '%s'\n", event);
 		return -1;
@@ -119,8 +123,8 @@ static int load_and_attach(const char *event, struct bpf_insn *prog, int size)
 	if (is_xdp || is_perf_event || is_cgroup_skb || is_cgroup_sk)
 		return 0;
 
-	if (is_socket || is_sockops || is_sk_skb || is_sk_msg) {
-		if (is_socket)
+	if (is_socket || is_sockops || is_sk_skb || is_sk_msg || is_socksg) {
+		if (is_socket || is_socksg)
 			event += 6;
 		else
 			event += 7;
@@ -624,7 +628,8 @@ static int do_load_bpf_file(const char *path, fixup_map_cb fixup_map)
 		    memcmp(shname, "cgroup/", 7) == 0 ||
 		    memcmp(shname, "sockops", 7) == 0 ||
 		    memcmp(shname, "sk_skb", 6) == 0 ||
-		    memcmp(shname, "sk_msg", 6) == 0) {
+		    memcmp(shname, "sk_msg", 6) == 0 ||
+		    memcmp(shname, "socksg", 6) == 0) {
 			ret = load_and_attach(shname, data->d_buf,
 					      data->d_size);
 			if (ret != 0)
diff --git a/tools/bpf/bpftool/prog.c b/tools/bpf/bpftool/prog.c
index a4f4352..06b2fef 100644
--- a/tools/bpf/bpftool/prog.c
+++ b/tools/bpf/bpftool/prog.c
@@ -72,6 +72,7 @@
 	[BPF_PROG_TYPE_RAW_TRACEPOINT]	= "raw_tracepoint",
 	[BPF_PROG_TYPE_CGROUP_SOCK_ADDR] = "cgroup_sock_addr",
 	[BPF_PROG_TYPE_LIRC_MODE2]	= "lirc_mode2",
+	[BPF_PROG_TYPE_SOCKET_SG_FILTER] = "socket_sg_filter",
 };
 
 static void print_boot_time(__u64 nsecs, char *buf, unsigned int size)
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index e0b0678..c87ae16 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -144,6 +144,7 @@ enum bpf_prog_type {
 	BPF_PROG_TYPE_CGROUP_SOCK_ADDR,
 	BPF_PROG_TYPE_LWT_SEG6LOCAL,
 	BPF_PROG_TYPE_LIRC_MODE2,
+	BPF_PROG_TYPE_SOCKET_SG_FILTER,
 };
 
 enum bpf_attach_type {
@@ -2358,6 +2359,12 @@ enum sk_action {
 	SK_PASS,
 };
 
+/* use accessible scatterlist */
+struct sg_filter_md {
+	void *data; /* sg_virt(sg) */
+	void *data_end; /* sg_virt(sg) + sg->length */
+};
+
 /* user accessible metadata for SK_MSG packet hook, new fields must
  * be added to the end of this structure
  */
diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c
index a1e96b5..7628278 100644
--- a/tools/lib/bpf/libbpf.c
+++ b/tools/lib/bpf/libbpf.c
@@ -1463,6 +1463,7 @@ static bool bpf_prog_type__needs_kver(enum bpf_prog_type type)
 	case BPF_PROG_TYPE_SK_MSG:
 	case BPF_PROG_TYPE_CGROUP_SOCK_ADDR:
 	case BPF_PROG_TYPE_LIRC_MODE2:
+	case BPF_PROG_TYPE_SOCKET_SG_FILTER:
 		return false;
 	case BPF_PROG_TYPE_UNSPEC:
 	case BPF_PROG_TYPE_KPROBE:
@@ -1998,6 +1999,7 @@ static bool bpf_program__is_type(struct bpf_program *prog,
 BPF_PROG_TYPE_FNS(raw_tracepoint, BPF_PROG_TYPE_RAW_TRACEPOINT);
 BPF_PROG_TYPE_FNS(xdp, BPF_PROG_TYPE_XDP);
 BPF_PROG_TYPE_FNS(perf_event, BPF_PROG_TYPE_PERF_EVENT);
+BPF_PROG_TYPE_FNS(socket_sg_filter, BPF_PROG_TYPE_SOCKET_SG_FILTER);
 
 void bpf_program__set_expected_attach_type(struct bpf_program *prog,
 					   enum bpf_attach_type type)
@@ -2048,6 +2050,7 @@ void bpf_program__set_expected_attach_type(struct bpf_program *prog,
 	BPF_SA_PROG_SEC("cgroup/sendmsg6", BPF_CGROUP_UDP6_SENDMSG),
 	BPF_S_PROG_SEC("cgroup/post_bind4", BPF_CGROUP_INET4_POST_BIND),
 	BPF_S_PROG_SEC("cgroup/post_bind6", BPF_CGROUP_INET6_POST_BIND),
+	BPF_PROG_SEC("socksg",          BPF_PROG_TYPE_SOCKET_SG_FILTER),
 };
 
 #undef BPF_PROG_SEC
diff --git a/tools/lib/bpf/libbpf.h b/tools/lib/bpf/libbpf.h
index 0997653..3be165b 100644
--- a/tools/lib/bpf/libbpf.h
+++ b/tools/lib/bpf/libbpf.h
@@ -195,6 +195,7 @@ int bpf_program__set_prep(struct bpf_program *prog, int nr_instance,
 void bpf_program__set_type(struct bpf_program *prog, enum bpf_prog_type type);
 void bpf_program__set_expected_attach_type(struct bpf_program *prog,
 					   enum bpf_attach_type type);
+int bpf_program__set_socket_sg_filter(struct bpf_program *prog);
 
 bool bpf_program__is_socket_filter(struct bpf_program *prog);
 bool bpf_program__is_tracepoint(struct bpf_program *prog);
@@ -204,6 +205,7 @@ void bpf_program__set_expected_attach_type(struct bpf_program *prog,
 bool bpf_program__is_sched_act(struct bpf_program *prog);
 bool bpf_program__is_xdp(struct bpf_program *prog);
 bool bpf_program__is_perf_event(struct bpf_program *prog);
+bool bpf_program__is_socket_sg_filter(struct bpf_program *prog);
 
 /*
  * No need for __attribute__((packed)), all members of 'bpf_map_def'
-- 
1.8.3.1

^ permalink raw reply related

* [RFC v2 PATCH 0/4] eBPF and struct scatterlist
From: Tushar Dave @ 2018-06-19 18:00 UTC (permalink / raw)
  To: ast, daniel, davem, jakub.kicinski, quentin.monnet, jiong.wang,
	guro, sandipan, john.fastabend, kafai, rdna, brakmo, netdev, acme,
	sowmini.varadhan

This follows up on https://patchwork.ozlabs.org/cover/927050/
where the review feedback was to use bpf_skb_load_bytes() to deal with
linear and non-linear skbs. While that feedback is valid and correct,
the motivation for this work is to allow eBPF based firewalling for
kernel modules that do not always get their packet as an sk_buff from
their downlink drivers. One such instance of this use-case is RDS, which
can be run both over IB (driver RDMA's a scatterlist to the RDS module)
or over TCP (TCP passes an sk_buff to the RDS module)

This RFC (call it v2) uses exiting socket filter infrastructure and
extend it with new eBPF program type that deals with struct scatterlist.
For RDS, the integrated approach treats the scatterlist as the common
denominator, and allows the application to write a filter for processing
a scatterlist.

Details:
Patch 1 adds new eBPF prog type BPF_PROG_TYPE_SOCKET_SG_FILTER which
uses the existing socket filter infrastructure for bpf program attach
and load. eBPF program of type BPF_PROG_TYPE_SOCKET_SG_FILTER receives
struct scatterlist as bpf context contrast to
BPF_PROG_TYPE_SOCKET_FILTER which deals with struct skb. This new eBPF
program type allow socket filter to run on packet data that is in form
form of struct scatterlist.

Patch 2 adds functionality to run BPF_PROG_TYPE_SOCKET_SG_FILTER socket
filter program. A bpf helpers bpf_sg_next() is also added so users can
retrieve sg elements from scatterlist.

Patch 3 adds socket filter eBPF sample program that uses patch 1 and
patch 2. The sample program opens an rds socket, attach ebpf program
(socksg i.e. BPF_PROG_TYPE_SOCKET_SG_FILTER) to rds socket and uses
bpf_sg_next helper to look into sg. For a test, current ebpf program
only prints first few bytes from each elements of sg list.

Finally, patch 4 allows rds_recv_incoming to invoke socket filter
program which deals with scatterlist.

Thanks.

-Tushar

Tushar Dave (4):
  eBPF: Add new eBPF prog type BPF_PROG_TYPE_SOCKET_SG_FILTER
  ebpf: Add sg_filter_run and sg helper
  ebpf: Add sample ebpf program for SOCKET_SG_FILTER
  rds: invoke socket sg filter attached to rds socket

 include/linux/bpf_types.h                 |   1 +
 include/linux/filter.h                    |  10 +
 include/uapi/linux/bpf.h                  |  17 +-
 kernel/bpf/syscall.c                      |   1 +
 kernel/bpf/verifier.c                     |   1 +
 net/core/filter.c                         | 149 ++++++++++++-
 net/rds/ib.c                              |   1 +
 net/rds/ib.h                              |   1 +
 net/rds/ib_recv.c                         |  12 ++
 net/rds/rds.h                             |   2 +
 net/rds/recv.c                            |  16 ++
 net/rds/tcp.c                             |   2 +
 net/rds/tcp.h                             |   2 +
 net/rds/tcp_recv.c                        |  38 ++++
 samples/bpf/Makefile                      |   3 +
 samples/bpf/bpf_load.c                    |  11 +-
 samples/bpf/rds_filter_kern.c             |  78 +++++++
 samples/bpf/rds_filter_user.c             | 339 ++++++++++++++++++++++++++++++
 tools/bpf/bpftool/prog.c                  |   1 +
 tools/include/uapi/linux/bpf.h            |  17 +-
 tools/lib/bpf/libbpf.c                    |   3 +
 tools/lib/bpf/libbpf.h                    |   2 +
 tools/testing/selftests/bpf/bpf_helpers.h |   3 +
 23 files changed, 703 insertions(+), 7 deletions(-)
 create mode 100644 samples/bpf/rds_filter_kern.c
 create mode 100644 samples/bpf/rds_filter_user.c

-- 
1.8.3.1

^ permalink raw reply

* Re: [PATCH 1/3] net: ethernet: fix suspend/resume in davinci_emac
From: Lukas Wunner @ 2018-06-19 18:00 UTC (permalink / raw)
  To: Bartosz Golaszewski
  Cc: Grygorii Strashko, David S . Miller, Florian Fainelli,
	Dan Carpenter, Ivan Khoronzhuk, Rob Herring, Kevin Hilman,
	David Lechner, Sekhar Nori, Andrew Lunn, linux-omap, netdev,
	linux-kernel, Bartosz Golaszewski, stable
In-Reply-To: <20180619160950.6283-2-brgl@bgdev.pl>

On Tue, Jun 19, 2018 at 06:09:48PM +0200, Bartosz Golaszewski wrote:
> From: Bartosz Golaszewski <bgolaszewski@baylibre.com>
> 
> This patch reverts commit 3243ff2a05ec ("net: ethernet: davinci_emac:
> Deduplicate bus_find_device() by name matching") and adds a comment
> which should stop anyone from reintroducing the same "fix" in the future.
> 
> We can't use bus_find_device_by_name() here because the device name is
> not guaranteed to be 'davinci_mdio'. On some systems it can be
> 'davinci_mdio.0' so we need to use strncmp() against the first part of
> the string to correctly match it.
> 
> Fixes: 3243ff2a05ec ("net: ethernet: davinci_emac: Deduplicate bus_find_device() by name matching")
> Cc: stable@vger.kernel.org
> Signed-off-by: Bartosz Golaszewski <bgolaszewski@baylibre.com>

This is still

Acked-by: Lukas Wunner <lukas@wunner.de>

given that my patch which is reverted here seems to have been incorrect.
Feel free to keep the ack if you respin in response to Florian Fainelli's
comments.

Thanks,

Lukas

> ---
>  drivers/net/ethernet/ti/davinci_emac.c | 15 +++++++++++++--
>  1 file changed, 13 insertions(+), 2 deletions(-)
> 
> diff --git a/drivers/net/ethernet/ti/davinci_emac.c b/drivers/net/ethernet/ti/davinci_emac.c
> index 06d7c9e4dcda..a1a6445b5a7e 100644
> --- a/drivers/net/ethernet/ti/davinci_emac.c
> +++ b/drivers/net/ethernet/ti/davinci_emac.c
> @@ -1385,6 +1385,11 @@ static int emac_devioctl(struct net_device *ndev, struct ifreq *ifrq, int cmd)
>  		return -EOPNOTSUPP;
>  }
>  
> +static int match_first_device(struct device *dev, void *data)
> +{
> +	return !strncmp(dev_name(dev), "davinci_mdio", 12);
> +}
> +
>  /**
>   * emac_dev_open - EMAC device open
>   * @ndev: The DaVinci EMAC network adapter
> @@ -1484,8 +1489,14 @@ static int emac_dev_open(struct net_device *ndev)
>  
>  	/* use the first phy on the bus if pdata did not give us a phy id */
>  	if (!phydev && !priv->phy_id) {
> -		phy = bus_find_device_by_name(&mdio_bus_type, NULL,
> -					      "davinci_mdio");
> +		/* NOTE: we can't use bus_find_device_by_name() here because
> +		 * the device name is not guaranteed to be 'davinci_mdio'. On
> +		 * some systems it can be 'davinci_mdio.0' so we need to use
> +		 * strncmp() against the first part of the string to correctly
> +		 * match it.
> +		 */
> +		phy = bus_find_device(&mdio_bus_type, NULL, NULL,
> +				      match_first_device);
>  		if (phy) {
>  			priv->phy_id = dev_name(phy);
>  			if (!priv->phy_id || !*priv->phy_id)
> -- 
> 2.17.1
> 

^ permalink raw reply

* Re: [PATCH] net: nixge: Add __packed attribute to DMA descriptor struct
From: Florian Fainelli @ 2018-06-19 17:41 UTC (permalink / raw)
  To: Moritz Fischer
  Cc: David S. Miller, Kees Cook, netdev, Linux Kernel Mailing List
In-Reply-To: <CAAtXAHfGwSLfhhPU9B=O8q5aGExrmgkPgkC5u8XYLtKL5i4JHQ@mail.gmail.com>

On 06/19/2018 10:31 AM, Moritz Fischer wrote:
> Hi Florian,
> 
> On Tue, Jun 19, 2018 at 10:13 AM, Florian Fainelli <f.fainelli@gmail.com> wrote:
>> On 06/19/2018 09:54 AM, Moritz Fischer wrote:
>>> Add __packed attribute to DMA descriptor structure  in order to
>>> make sure that the DMA engine's alignemnt requirements are met.
>>>
>>> Fixes commit 492caffa8a1a ("net: ethernet: nixge: Add support for
>>> National Instruments XGE netdev")
>>> Signed-off-by: Moritz Fischer <mdf@kernel.org>
>>> ---
>>>
>>> Hi David,
>>>
>>> this addresses an issue where padding occured breaking the alignment
>>> in the array the descriptors are allocated in coherent memory.
>>> This was discovered when we tried to bring up the driver via a PCIe
>>> bridge on x86.
>>
>> How could padding be inserted given than all of the structure members
>> are naturally aligned (all u32 type). Compiler bug?
> 
> I have no good answer to this, all I can tell you is that it wouldn't work
> otherwise. This was part of a bunch of changes that I made in order
> to make this work with 64bit DMA. I made sure to remove the padding/
> reserved fields accordingly such that the net difference would be zero.
> 
> I might've messed that up? The descriptors looked something like this:

Ah ah! This is not the layout in the upstream tree I am looking at, but
your layout below will definitive introduce padding, yes.

> 
> struct nixge_hw_dma_bd {
> #ifdef CONFIG_ARCH_DMA_ADDR_T_64BIT
>         u64 next;
>         u64 phys;
> #else
>         u32 next;
>         u32 reserved1;
>         u32 phys;
>         u32 reserved2;
> #endif
>         u32 reserved3;
>         u32 reserved4;
>         u32 cntrl;
>         u32 status;
>         u32 app0;
>         u32 app1;
>         u32 app2;
>         u32 app3;
>         u32 app4;
> #ifdef CONFIG_ARCH_DMA_ADDR_T_64BIT
>         u64 sw_id_offset;
> #else
>         u32 sw_id_offset;
>         u32 reserved5;
> #endif
>         u32 reserved6;
> } __packed;
> 
> 
> I'll have some follow up patches to add 64bit support together with a
> wrapper driver for the PCIe bridge once the architecture solidifies here.

Why not have the structure updated like this:

struct nixge_hw_dma_bd {
	u32 next_lo;	/* lower 32-bit address part */
	u32 next_hi;	/* upper 32-bit address part */
	u32 phys_lo;
	u32 phys_hi;
        u32 reserved3;
        u32 reserved4;
        u32 cntrl;
        u32 status;
        u32 app0;
        u32 app1;
        u32 app2;
        u32 app3;
        u32 app4;
        u32 sw_id_offset_low;
        u32 sw_id_offset_hi;
        u32 reserved6;
};

That assumes I got the upper/lower address part correct, if not, swapthe
members. Then in your code, if you want to be efficient, you can
populate the fields only if CONFIG_ARCH_DMA_ADDR_T_64BIT is defined. I
did that in bcmgenet.c for instance because the register accesses are
slow, so if we can save 200ns per packet transmit/receive, that's a win.

This should not change anything because the structure size is the same
in both cases, but how you are managing is different, and that would in
turn influence what the HW sees.

Does that make sense?

> 
> Thanks for the feedback,
> 
> Moritz
> 


-- 
Florian

^ permalink raw reply

* bpfilter compile failure on parisc
From: Meelis Roos @ 2018-06-19 17:38 UTC (permalink / raw)
  To: netdev, linux-parisc

Tried enabling bpfilter on parisc, got this:

  HOSTCC  net/bpfilter/main.o
net/bpfilter/main.c:3:21: fatal error: sys/uio.h: No such file or directory
 #include <sys/uio.h>
                     ^
compilation terminated.

-- 
Meelis Roos (mroos@linux.ee)

^ permalink raw reply

* Re: [PATCH] net: nixge: Add __packed attribute to DMA descriptor struct
From: Moritz Fischer @ 2018-06-19 17:31 UTC (permalink / raw)
  To: Florian Fainelli
  Cc: David S. Miller, Kees Cook, netdev, Linux Kernel Mailing List
In-Reply-To: <832bebb8-300d-e911-2946-5edfe82dc30a@gmail.com>

Hi Florian,

On Tue, Jun 19, 2018 at 10:13 AM, Florian Fainelli <f.fainelli@gmail.com> wrote:
> On 06/19/2018 09:54 AM, Moritz Fischer wrote:
>> Add __packed attribute to DMA descriptor structure  in order to
>> make sure that the DMA engine's alignemnt requirements are met.
>>
>> Fixes commit 492caffa8a1a ("net: ethernet: nixge: Add support for
>> National Instruments XGE netdev")
>> Signed-off-by: Moritz Fischer <mdf@kernel.org>
>> ---
>>
>> Hi David,
>>
>> this addresses an issue where padding occured breaking the alignment
>> in the array the descriptors are allocated in coherent memory.
>> This was discovered when we tried to bring up the driver via a PCIe
>> bridge on x86.
>
> How could padding be inserted given than all of the structure members
> are naturally aligned (all u32 type). Compiler bug?

I have no good answer to this, all I can tell you is that it wouldn't work
otherwise. This was part of a bunch of changes that I made in order
to make this work with 64bit DMA. I made sure to remove the padding/
reserved fields accordingly such that the net difference would be zero.

I might've messed that up? The descriptors looked something like this:

struct nixge_hw_dma_bd {
#ifdef CONFIG_ARCH_DMA_ADDR_T_64BIT
        u64 next;
        u64 phys;
#else
        u32 next;
        u32 reserved1;
        u32 phys;
        u32 reserved2;
#endif
        u32 reserved3;
        u32 reserved4;
        u32 cntrl;
        u32 status;
        u32 app0;
        u32 app1;
        u32 app2;
        u32 app3;
        u32 app4;
#ifdef CONFIG_ARCH_DMA_ADDR_T_64BIT
        u64 sw_id_offset;
#else
        u32 sw_id_offset;
        u32 reserved5;
#endif
        u32 reserved6;
} __packed;

I'll have some follow up patches to add 64bit support together with a
wrapper driver for the PCIe bridge once the architecture solidifies here.

Thanks for the feedback,

Moritz

^ permalink raw reply

* Re: [PATCH net 5/5] net sched actions: fix misleading text strings in pedit action
From: Roman Mashak @ 2018-06-19 17:15 UTC (permalink / raw)
  To: Stephen Hemminger; +Cc: davem, netdev, kernel, jhs, xiyou.wangcong, jiri
In-Reply-To: <20180619100408.42155193@xeon-e3>

Stephen Hemminger <stephen@networkplumber.org> writes:

> On Tue, 19 Jun 2018 12:56:08 -0400

[...]

>> @@ -326,12 +326,12 @@ static int tcf_pedit(struct sk_buff *skb, const struct tc_action *a,
>>  			}
>>  
>>  			if (offset % 4) {
>> -				pr_info("tc filter pedit offset must be on 32 bit boundaries\n");
>> +				pr_info("tc action pedit offset must be on 32 bit boundaries\n");
>>  				goto bad;
>>  			}
>>  
>>  			if (!offset_valid(skb, hoffset + offset)) {
>> -				pr_info("tc filter pedit offset %d out of bounds\n",
>> +				pr_info("tc action pedit offset %d out of bounds\n",
>>  					hoffset + offset);
>>  				goto bad;
>
> Time to convert these to netlink extack reporting?

Yes, this is planned in next patches.

^ permalink raw reply

* Re: [PATCH] net: nixge: Add __packed attribute to DMA descriptor struct
From: Florian Fainelli @ 2018-06-19 17:13 UTC (permalink / raw)
  To: Moritz Fischer, davem; +Cc: keescook, netdev, linux-kernel
In-Reply-To: <20180619165453.31894-1-mdf@kernel.org>

On 06/19/2018 09:54 AM, Moritz Fischer wrote:
> Add __packed attribute to DMA descriptor structure  in order to
> make sure that the DMA engine's alignemnt requirements are met.
> 
> Fixes commit 492caffa8a1a ("net: ethernet: nixge: Add support for
> National Instruments XGE netdev")
> Signed-off-by: Moritz Fischer <mdf@kernel.org>
> ---
> 
> Hi David,
> 
> this addresses an issue where padding occured breaking the alignment
> in the array the descriptors are allocated in coherent memory.
> This was discovered when we tried to bring up the driver via a PCIe
> bridge on x86.

How could padding be inserted given than all of the structure members
are naturally aligned (all u32 type). Compiler bug?

Also

> 
> Thanks,
> 
> Moritz
> 
> ---
>  drivers/net/ethernet/ni/nixge.c | 2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)
> 
> diff --git a/drivers/net/ethernet/ni/nixge.c b/drivers/net/ethernet/ni/nixge.c
> index 09f674ec0f9e..fea0e994324b 100644
> --- a/drivers/net/ethernet/ni/nixge.c
> +++ b/drivers/net/ethernet/ni/nixge.c
> @@ -122,7 +122,7 @@ struct nixge_hw_dma_bd {
>  	u32 sw_id_offset;
>  	u32 reserved5;
>  	u32 reserved6;
> -};
> +} __packed;
>  
>  struct nixge_tx_skb {
>  	struct sk_buff *skb;
> 


-- 
Florian

^ permalink raw reply

* Re: [PATCH net 5/5] net sched actions: fix misleading text strings in pedit action
From: Stephen Hemminger @ 2018-06-19 17:04 UTC (permalink / raw)
  To: Roman Mashak; +Cc: davem, netdev, kernel, jhs, xiyou.wangcong, jiri
In-Reply-To: <1529427368-17129-6-git-send-email-mrv@mojatatu.com>

On Tue, 19 Jun 2018 12:56:08 -0400
Roman Mashak <mrv@mojatatu.com> wrote:

> Change "tc filter pedit .." to "tc actions pedit .." in error
> messages to clearly refer to pedit action.
> 
> Signed-off-by: Roman Mashak <mrv@mojatatu.com>
> ---
>  net/sched/act_pedit.c | 10 +++++-----
>  1 file changed, 5 insertions(+), 5 deletions(-)
> 
> diff --git a/net/sched/act_pedit.c b/net/sched/act_pedit.c
> index 3b775f54cee5..caa6927a992c 100644
> --- a/net/sched/act_pedit.c
> +++ b/net/sched/act_pedit.c
> @@ -305,7 +305,7 @@ static int tcf_pedit(struct sk_buff *skb, const struct tc_action *a,
>  
>  			rc = pedit_skb_hdr_offset(skb, htype, &hoffset);
>  			if (rc) {
> -				pr_info("tc filter pedit bad header type specified (0x%x)\n",
> +				pr_info("tc action pedit bad header type specified (0x%x)\n",
>  					htype);
>  				goto bad;
>  			}
> @@ -314,7 +314,7 @@ static int tcf_pedit(struct sk_buff *skb, const struct tc_action *a,
>  				char *d, _d;
>  
>  				if (!offset_valid(skb, hoffset + tkey->at)) {
> -					pr_info("tc filter pedit 'at' offset %d out of bounds\n",
> +					pr_info("tc action pedit 'at' offset %d out of bounds\n",
>  						hoffset + tkey->at);
>  					goto bad;
>  				}
> @@ -326,12 +326,12 @@ static int tcf_pedit(struct sk_buff *skb, const struct tc_action *a,
>  			}
>  
>  			if (offset % 4) {
> -				pr_info("tc filter pedit offset must be on 32 bit boundaries\n");
> +				pr_info("tc action pedit offset must be on 32 bit boundaries\n");
>  				goto bad;
>  			}
>  
>  			if (!offset_valid(skb, hoffset + offset)) {
> -				pr_info("tc filter pedit offset %d out of bounds\n",
> +				pr_info("tc action pedit offset %d out of bounds\n",
>  					hoffset + offset);
>  				goto bad;

Time to convert these to netlink extack reporting?

^ permalink raw reply

* [PATCH net 5/5] net sched actions: fix misleading text strings in pedit action
From: Roman Mashak @ 2018-06-19 16:56 UTC (permalink / raw)
  To: davem; +Cc: netdev, kernel, jhs, xiyou.wangcong, jiri, Roman Mashak
In-Reply-To: <1529427368-17129-1-git-send-email-mrv@mojatatu.com>

Change "tc filter pedit .." to "tc actions pedit .." in error
messages to clearly refer to pedit action.

Signed-off-by: Roman Mashak <mrv@mojatatu.com>
---
 net/sched/act_pedit.c | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/net/sched/act_pedit.c b/net/sched/act_pedit.c
index 3b775f54cee5..caa6927a992c 100644
--- a/net/sched/act_pedit.c
+++ b/net/sched/act_pedit.c
@@ -305,7 +305,7 @@ static int tcf_pedit(struct sk_buff *skb, const struct tc_action *a,
 
 			rc = pedit_skb_hdr_offset(skb, htype, &hoffset);
 			if (rc) {
-				pr_info("tc filter pedit bad header type specified (0x%x)\n",
+				pr_info("tc action pedit bad header type specified (0x%x)\n",
 					htype);
 				goto bad;
 			}
@@ -314,7 +314,7 @@ static int tcf_pedit(struct sk_buff *skb, const struct tc_action *a,
 				char *d, _d;
 
 				if (!offset_valid(skb, hoffset + tkey->at)) {
-					pr_info("tc filter pedit 'at' offset %d out of bounds\n",
+					pr_info("tc action pedit 'at' offset %d out of bounds\n",
 						hoffset + tkey->at);
 					goto bad;
 				}
@@ -326,12 +326,12 @@ static int tcf_pedit(struct sk_buff *skb, const struct tc_action *a,
 			}
 
 			if (offset % 4) {
-				pr_info("tc filter pedit offset must be on 32 bit boundaries\n");
+				pr_info("tc action pedit offset must be on 32 bit boundaries\n");
 				goto bad;
 			}
 
 			if (!offset_valid(skb, hoffset + offset)) {
-				pr_info("tc filter pedit offset %d out of bounds\n",
+				pr_info("tc action pedit offset %d out of bounds\n",
 					hoffset + offset);
 				goto bad;
 			}
@@ -349,7 +349,7 @@ static int tcf_pedit(struct sk_buff *skb, const struct tc_action *a,
 				val = (*ptr + tkey->val) & ~tkey->mask;
 				break;
 			default:
-				pr_info("tc filter pedit bad command (%d)\n",
+				pr_info("tc action pedit bad command (%d)\n",
 					cmd);
 				goto bad;
 			}
-- 
2.7.4

^ permalink raw reply related

* [PATCH net 4/5] net sched actions: use sizeof operator for buffer length
From: Roman Mashak @ 2018-06-19 16:56 UTC (permalink / raw)
  To: davem; +Cc: netdev, kernel, jhs, xiyou.wangcong, jiri, Roman Mashak
In-Reply-To: <1529427368-17129-1-git-send-email-mrv@mojatatu.com>

Replace constant integer with sizeof() to clearly indicate
the destination buffer length in skb_header_pointer() calls.

Signed-off-by: Roman Mashak <mrv@mojatatu.com>
---
 net/sched/act_pedit.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/net/sched/act_pedit.c b/net/sched/act_pedit.c
index 9c2d8a31a5c5..3b775f54cee5 100644
--- a/net/sched/act_pedit.c
+++ b/net/sched/act_pedit.c
@@ -319,7 +319,7 @@ static int tcf_pedit(struct sk_buff *skb, const struct tc_action *a,
 					goto bad;
 				}
 				d = skb_header_pointer(skb, hoffset + tkey->at,
-						       1, &_d);
+						       sizeof(_d), &_d);
 				if (!d)
 					goto bad;
 				offset += (*d & tkey->offmask) >> tkey->shift;
@@ -337,7 +337,7 @@ static int tcf_pedit(struct sk_buff *skb, const struct tc_action *a,
 			}
 
 			ptr = skb_header_pointer(skb, hoffset + offset,
-						 4, &hdata);
+						 sizeof(hdata), &hdata);
 			if (!ptr)
 				goto bad;
 			/* just do it, baby */
-- 
2.7.4

^ permalink raw reply related

* [PATCH net 3/5] net sched actions: fix sparse warning
From: Roman Mashak @ 2018-06-19 16:56 UTC (permalink / raw)
  To: davem; +Cc: netdev, kernel, jhs, xiyou.wangcong, jiri, Roman Mashak
In-Reply-To: <1529427368-17129-1-git-send-email-mrv@mojatatu.com>

The variable _data in include/asm-generic/sections.h defines sections,
this causes sparse warning in pedit:

net/sched/act_pedit.c:293:35: warning: symbol '_data' shadows an earlier one
./include/asm-generic/sections.h:36:13: originally declared here

Therefore rename the variable.

Signed-off-by: Roman Mashak <mrv@mojatatu.com>
---
 net/sched/act_pedit.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/net/sched/act_pedit.c b/net/sched/act_pedit.c
index e4b29ee79ba8..9c2d8a31a5c5 100644
--- a/net/sched/act_pedit.c
+++ b/net/sched/act_pedit.c
@@ -290,7 +290,7 @@ static int tcf_pedit(struct sk_buff *skb, const struct tc_action *a,
 		enum pedit_cmd cmd = TCA_PEDIT_KEY_EX_CMD_SET;
 
 		for (i = p->tcfp_nkeys; i > 0; i--, tkey++) {
-			u32 *ptr, _data;
+			u32 *ptr, hdata;
 			int offset = tkey->off;
 			int hoffset;
 			u32 val;
@@ -337,7 +337,7 @@ static int tcf_pedit(struct sk_buff *skb, const struct tc_action *a,
 			}
 
 			ptr = skb_header_pointer(skb, hoffset + offset,
-						 4, &_data);
+						 4, &hdata);
 			if (!ptr)
 				goto bad;
 			/* just do it, baby */
@@ -355,7 +355,7 @@ static int tcf_pedit(struct sk_buff *skb, const struct tc_action *a,
 			}
 
 			*ptr = ((*ptr & tkey->mask) ^ val);
-			if (ptr == &_data)
+			if (ptr == &hdata)
 				skb_store_bits(skb, hoffset + offset, ptr, 4);
 		}
 
-- 
2.7.4

^ permalink raw reply related

* [PATCH net 2/5] net sched actions: fix coding style in pedit headers
From: Roman Mashak @ 2018-06-19 16:56 UTC (permalink / raw)
  To: davem; +Cc: netdev, kernel, jhs, xiyou.wangcong, jiri, Roman Mashak
In-Reply-To: <1529427368-17129-1-git-send-email-mrv@mojatatu.com>

Fix coding style issues in tc pedit headers detected by the
checkpatch script.

Signed-off-by: Roman Mashak <mrv@mojatatu.com>
---
 include/net/tc_act/tc_pedit.h        | 1 +
 include/uapi/linux/tc_act/tc_pedit.h | 9 +++++++--
 2 files changed, 8 insertions(+), 2 deletions(-)

diff --git a/include/net/tc_act/tc_pedit.h b/include/net/tc_act/tc_pedit.h
index 227a6f1d02f4..fac3ad4a86de 100644
--- a/include/net/tc_act/tc_pedit.h
+++ b/include/net/tc_act/tc_pedit.h
@@ -17,6 +17,7 @@ struct tcf_pedit {
 	struct tc_pedit_key	*tcfp_keys;
 	struct tcf_pedit_key_ex	*tcfp_keys_ex;
 };
+
 #define to_pedit(a) ((struct tcf_pedit *)a)
 
 static inline bool is_tcf_pedit(const struct tc_action *a)
diff --git a/include/uapi/linux/tc_act/tc_pedit.h b/include/uapi/linux/tc_act/tc_pedit.h
index 162d1094c41c..24ec792dacc1 100644
--- a/include/uapi/linux/tc_act/tc_pedit.h
+++ b/include/uapi/linux/tc_act/tc_pedit.h
@@ -17,13 +17,15 @@ enum {
 	TCA_PEDIT_KEY_EX,
 	__TCA_PEDIT_MAX
 };
+
 #define TCA_PEDIT_MAX (__TCA_PEDIT_MAX - 1)
-                                                                                
+
 enum {
 	TCA_PEDIT_KEY_EX_HTYPE = 1,
 	TCA_PEDIT_KEY_EX_CMD = 2,
 	__TCA_PEDIT_KEY_EX_MAX
 };
+
 #define TCA_PEDIT_KEY_EX_MAX (__TCA_PEDIT_KEY_EX_MAX - 1)
 
  /* TCA_PEDIT_KEY_EX_HDR_TYPE_NETWROK is a special case for legacy users. It
@@ -38,6 +40,7 @@ enum pedit_header_type {
 	TCA_PEDIT_KEY_EX_HDR_TYPE_UDP = 5,
 	__PEDIT_HDR_TYPE_MAX,
 };
+
 #define TCA_PEDIT_HDR_TYPE_MAX (__PEDIT_HDR_TYPE_MAX - 1)
 
 enum pedit_cmd {
@@ -45,6 +48,7 @@ enum pedit_cmd {
 	TCA_PEDIT_KEY_EX_CMD_ADD = 1,
 	__PEDIT_CMD_MAX,
 };
+
 #define TCA_PEDIT_CMD_MAX (__PEDIT_CMD_MAX - 1)
 
 struct tc_pedit_key {
@@ -55,13 +59,14 @@ struct tc_pedit_key {
 	__u32           offmask;
 	__u32           shift;
 };
-                                                                                
+
 struct tc_pedit_sel {
 	tc_gen;
 	unsigned char           nkeys;
 	unsigned char           flags;
 	struct tc_pedit_key     keys[0];
 };
+
 #define tc_pedit tc_pedit_sel
 
 #endif
-- 
2.7.4

^ permalink raw reply related

* [PATCH net 1/5] net sched actions: fix coding style in pedit action
From: Roman Mashak @ 2018-06-19 16:56 UTC (permalink / raw)
  To: davem; +Cc: netdev, kernel, jhs, xiyou.wangcong, jiri, Roman Mashak
In-Reply-To: <1529427368-17129-1-git-send-email-mrv@mojatatu.com>

Fix coding style issues in tc pedit action detected by the
checkpatch script.

Signed-off-by: Roman Mashak <mrv@mojatatu.com>
---
 net/sched/act_pedit.c | 29 ++++++++++++++++-------------
 1 file changed, 16 insertions(+), 13 deletions(-)

diff --git a/net/sched/act_pedit.c b/net/sched/act_pedit.c
index 8a925c72db5f..e4b29ee79ba8 100644
--- a/net/sched/act_pedit.c
+++ b/net/sched/act_pedit.c
@@ -136,15 +136,15 @@ static int tcf_pedit_init(struct net *net, struct nlattr *nla,
 {
 	struct tc_action_net *tn = net_generic(net, pedit_net_id);
 	struct nlattr *tb[TCA_PEDIT_MAX + 1];
-	struct nlattr *pattr;
-	struct tc_pedit *parm;
-	int ret = 0, err;
-	struct tcf_pedit *p;
 	struct tc_pedit_key *keys = NULL;
 	struct tcf_pedit_key_ex *keys_ex;
+	struct tc_pedit *parm;
+	struct nlattr *pattr;
+	struct tcf_pedit *p;
+	int ret = 0, err;
 	int ksize;
 
-	if (nla == NULL)
+	if (!nla)
 		return -EINVAL;
 
 	err = nla_parse_nested(tb, TCA_PEDIT_MAX, nla, pedit_policy, NULL);
@@ -175,7 +175,7 @@ static int tcf_pedit_init(struct net *net, struct nlattr *nla,
 			return ret;
 		p = to_pedit(*a);
 		keys = kmalloc(ksize, GFP_KERNEL);
-		if (keys == NULL) {
+		if (!keys) {
 			tcf_idr_release(*a, bind);
 			kfree(keys_ex);
 			return -ENOMEM;
@@ -220,6 +220,7 @@ static void tcf_pedit_cleanup(struct tc_action *a)
 {
 	struct tcf_pedit *p = to_pedit(a);
 	struct tc_pedit_key *keys = p->tcfp_keys;
+
 	kfree(keys);
 	kfree(p->tcfp_keys_ex);
 }
@@ -284,7 +285,8 @@ static int tcf_pedit(struct sk_buff *skb, const struct tc_action *a,
 	if (p->tcfp_nkeys > 0) {
 		struct tc_pedit_key *tkey = p->tcfp_keys;
 		struct tcf_pedit_key_ex *tkey_ex = p->tcfp_keys_ex;
-		enum pedit_header_type htype = TCA_PEDIT_KEY_EX_HDR_TYPE_NETWORK;
+		enum pedit_header_type htype =
+			TCA_PEDIT_KEY_EX_HDR_TYPE_NETWORK;
 		enum pedit_cmd cmd = TCA_PEDIT_KEY_EX_CMD_SET;
 
 		for (i = p->tcfp_nkeys; i > 0; i--, tkey++) {
@@ -316,16 +318,15 @@ static int tcf_pedit(struct sk_buff *skb, const struct tc_action *a,
 						hoffset + tkey->at);
 					goto bad;
 				}
-				d = skb_header_pointer(skb, hoffset + tkey->at, 1,
-						       &_d);
+				d = skb_header_pointer(skb, hoffset + tkey->at,
+						       1, &_d);
 				if (!d)
 					goto bad;
 				offset += (*d & tkey->offmask) >> tkey->shift;
 			}
 
 			if (offset % 4) {
-				pr_info("tc filter pedit"
-					" offset must be on 32 bit boundaries\n");
+				pr_info("tc filter pedit offset must be on 32 bit boundaries\n");
 				goto bad;
 			}
 
@@ -335,7 +336,8 @@ static int tcf_pedit(struct sk_buff *skb, const struct tc_action *a,
 				goto bad;
 			}
 
-			ptr = skb_header_pointer(skb, hoffset + offset, 4, &_data);
+			ptr = skb_header_pointer(skb, hoffset + offset,
+						 4, &_data);
 			if (!ptr)
 				goto bad;
 			/* just do it, baby */
@@ -358,8 +360,9 @@ static int tcf_pedit(struct sk_buff *skb, const struct tc_action *a,
 		}
 
 		goto done;
-	} else
+	} else {
 		WARN(1, "pedit BUG: index %d\n", p->tcf_index);
+	}
 
 bad:
 	p->tcf_qstats.overlimits++;
-- 
2.7.4

^ permalink raw reply related

* [PATCH net 0/5] net sched actions: code style cleanup and fixes
From: Roman Mashak @ 2018-06-19 16:56 UTC (permalink / raw)
  To: davem; +Cc: netdev, kernel, jhs, xiyou.wangcong, jiri, Roman Mashak

The patchset fixes a few code stylistic issues and typos, as well as one
detected by sparse semantic checker tool.

No functional changes introduced.

Patch 1 & 2 fix coding style bits caught by the checkpatch.pl script
Patch 3 fixes an issue with a shadowed variable
Patch 4 adds sizeof() operator instead of magic number for buffer length
Patch 5 fixes typos in diagnostics messages

Roman Mashak (5):
  net sched actions: fix coding style in pedit action
  net sched actions: fix coding style in pedit headers
  net sched actions: fix sparse warning
  net sched actions: use sizeof operator for buffer length
  net sched actions: fix misleading text strings in pedit action

 include/net/tc_act/tc_pedit.h        |  1 +
 include/uapi/linux/tc_act/tc_pedit.h |  9 ++++++--
 net/sched/act_pedit.c                | 41 +++++++++++++++++++-----------------
 3 files changed, 30 insertions(+), 21 deletions(-)

-- 
2.7.4

^ permalink raw reply

* Re: [PATCH 3/3] net: davinci_emac: match the mdio device against its compatible if possible
From: Florian Fainelli @ 2018-06-19 16:56 UTC (permalink / raw)
  To: Bartosz Golaszewski, Grygorii Strashko, David S . Miller,
	Dan Carpenter, Ivan Khoronzhuk, Rob Herring, Lukas Wunner,
	Kevin Hilman, David Lechner, Sekhar Nori, Andrew Lunn
  Cc: linux-omap, netdev, linux-kernel, Bartosz Golaszewski
In-Reply-To: <20180619160950.6283-4-brgl@bgdev.pl>

On 06/19/2018 09:09 AM, Bartosz Golaszewski wrote:
> From: Bartosz Golaszewski <bgolaszewski@baylibre.com>
> 
> Device tree based systems without of_dev_auxdata will have the mdio
> device named differently than "davinci_mdio(.0)". In this case use the
> device's compatible string for matching.
> 
> Signed-off-by: Bartosz Golaszewski <bgolaszewski@baylibre.com>
> ---
>  drivers/net/ethernet/ti/davinci_emac.c | 4 ++++
>  1 file changed, 4 insertions(+)
> 
> diff --git a/drivers/net/ethernet/ti/davinci_emac.c b/drivers/net/ethernet/ti/davinci_emac.c
> index a1a6445b5a7e..c28a35bb852f 100644
> --- a/drivers/net/ethernet/ti/davinci_emac.c
> +++ b/drivers/net/ethernet/ti/davinci_emac.c
> @@ -1387,6 +1387,10 @@ static int emac_devioctl(struct net_device *ndev, struct ifreq *ifrq, int cmd)
>  
>  static int match_first_device(struct device *dev, void *data)
>  {
> +	if (dev->of_node)
> +		return of_device_is_compatible(dev->of_node,
> +					       "ti,davinci_mdio");

Why would we be matching the PHY device with the MDIO controller
compatibe string? Why not check dev->parent.of_node instead which would
make more sense?
-- 
Florian

^ permalink raw reply

* Re: [PATCH 2/3] net: phy: set the of_node in the mdiodev's struct device
From: Florian Fainelli @ 2018-06-19 16:55 UTC (permalink / raw)
  To: Bartosz Golaszewski, Grygorii Strashko, David S . Miller,
	Dan Carpenter, Ivan Khoronzhuk, Rob Herring, Lukas Wunner,
	Kevin Hilman, David Lechner, Sekhar Nori, Andrew Lunn
  Cc: linux-omap, netdev, linux-kernel, Bartosz Golaszewski
In-Reply-To: <20180619160950.6283-3-brgl@bgdev.pl>

On 06/19/2018 09:09 AM, Bartosz Golaszewski wrote:
> From: Bartosz Golaszewski <bgolaszewski@baylibre.com>
> 
> Copy the of_node over from mii_bus's struct device. This is needed
> for device-tree systems to be able to check the mdio device's
> compatible string.
> 
> Signed-off-by: Bartosz Golaszewski <bgolaszewski@baylibre.com>
> ---
>  drivers/net/phy/phy_device.c | 1 +
>  1 file changed, 1 insertion(+)
> 
> diff --git a/drivers/net/phy/phy_device.c b/drivers/net/phy/phy_device.c
> index bd0f339f69fd..a92d5ee61813 100644
> --- a/drivers/net/phy/phy_device.c
> +++ b/drivers/net/phy/phy_device.c
> @@ -411,6 +411,7 @@ struct phy_device *phy_device_create(struct mii_bus *bus, int addr, int phy_id,
>  	mdiodev->dev.parent = &bus->dev;
>  	mdiodev->dev.bus = &mdio_bus_type;
>  	mdiodev->dev.type = &mdio_bus_phy_type;
> +	mdiodev->dev.of_node = bus->dev.of_node;

That does not quite make sense to me, the mdio device's parent already
points to &bus->dev, which would get you the correct of_node. You are
breaking the parent/child relationship here. From patch 3, see my
comments there, it does not look like you are matching on the right
device level.

>  	mdiodev->bus = bus;
>  	mdiodev->bus_match = phy_bus_match;
>  	mdiodev->addr = addr;
> 


-- 
Florian

^ permalink raw reply

* Re: [PATCH 1/3] net: ethernet: fix suspend/resume in davinci_emac
From: Florian Fainelli @ 2018-06-19 16:55 UTC (permalink / raw)
  To: Bartosz Golaszewski, Grygorii Strashko, David S . Miller,
	Dan Carpenter, Ivan Khoronzhuk, Rob Herring, Lukas Wunner,
	Kevin Hilman, David Lechner, Sekhar Nori, Andrew Lunn
  Cc: linux-omap, netdev, linux-kernel, Bartosz Golaszewski, stable
In-Reply-To: <20180619160950.6283-2-brgl@bgdev.pl>

On 06/19/2018 09:09 AM, Bartosz Golaszewski wrote:
> From: Bartosz Golaszewski <bgolaszewski@baylibre.com>
> 
> This patch reverts commit 3243ff2a05ec ("net: ethernet: davinci_emac:
> Deduplicate bus_find_device() by name matching") and adds a comment
> which should stop anyone from reintroducing the same "fix" in the future.
> 
> We can't use bus_find_device_by_name() here because the device name is
> not guaranteed to be 'davinci_mdio'. On some systems it can be
> 'davinci_mdio.0' so we need to use strncmp() against the first part of
> the string to correctly match it.
> 
> Fixes: 3243ff2a05ec ("net: ethernet: davinci_emac: Deduplicate bus_find_device() by name matching")
> Cc: stable@vger.kernel.org
> Signed-off-by: Bartosz Golaszewski <bgolaszewski@baylibre.com>
> ---
>  drivers/net/ethernet/ti/davinci_emac.c | 15 +++++++++++++--
>  1 file changed, 13 insertions(+), 2 deletions(-)
> 
> diff --git a/drivers/net/ethernet/ti/davinci_emac.c b/drivers/net/ethernet/ti/davinci_emac.c
> index 06d7c9e4dcda..a1a6445b5a7e 100644
> --- a/drivers/net/ethernet/ti/davinci_emac.c
> +++ b/drivers/net/ethernet/ti/davinci_emac.c
> @@ -1385,6 +1385,11 @@ static int emac_devioctl(struct net_device *ndev, struct ifreq *ifrq, int cmd)
>  		return -EOPNOTSUPP;
>  }
>  
> +static int match_first_device(struct device *dev, void *data)
> +{
> +	return !strncmp(dev_name(dev), "davinci_mdio", 12);

	const char *bus_name = "davinci_mdio";

	return !strncmp(dev_name(dev), bus_name, strlen(bus_name));

Or even better yet, if you want to make sure this really is a PHY device
that you are trying to match, you could try to use sscanf() with PHY_ID_FMT.
-- 
Florian

^ permalink raw reply

* [PATCH] net: nixge: Add __packed attribute to DMA descriptor struct
From: Moritz Fischer @ 2018-06-19 16:54 UTC (permalink / raw)
  To: davem; +Cc: keescook, netdev, linux-kernel, Moritz Fischer

Add __packed attribute to DMA descriptor structure  in order to
make sure that the DMA engine's alignemnt requirements are met.

Fixes commit 492caffa8a1a ("net: ethernet: nixge: Add support for
National Instruments XGE netdev")
Signed-off-by: Moritz Fischer <mdf@kernel.org>
---

Hi David,

this addresses an issue where padding occured breaking the alignment
in the array the descriptors are allocated in coherent memory.
This was discovered when we tried to bring up the driver via a PCIe
bridge on x86.

Thanks,

Moritz

---
 drivers/net/ethernet/ni/nixge.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/ni/nixge.c b/drivers/net/ethernet/ni/nixge.c
index 09f674ec0f9e..fea0e994324b 100644
--- a/drivers/net/ethernet/ni/nixge.c
+++ b/drivers/net/ethernet/ni/nixge.c
@@ -122,7 +122,7 @@ struct nixge_hw_dma_bd {
 	u32 sw_id_offset;
 	u32 reserved5;
 	u32 reserved6;
-};
+} __packed;
 
 struct nixge_tx_skb {
 	struct sk_buff *skb;
-- 
2.17.1

^ permalink raw reply related

* [PATCH net v2] ip: limit use of gso_size to udp
From: Willem de Bruijn @ 2018-06-19 16:47 UTC (permalink / raw)
  To: netdev; +Cc: davem, Willem de Bruijn

From: Willem de Bruijn <willemb@google.com>

The ipcm(6)_cookie field gso_size is set only in the udp path. The ip
layer copies this to cork only if sk_type is SOCK_DGRAM. This check
proved too permissive. Ping and l2tp sockets have the same type.

Limit to sockets of type SOCK_DGRAM and protocol IPPROTO_UDP to
exclude ping sockets.

v1 -> v2
- remove irrelevant whitespace changes

Fixes: bec1f6f69736 ("udp: generate gso with UDP_SEGMENT")
Reported-by: Maciej Żenczykowski <maze@google.com>
Signed-off-by: Willem de Bruijn <willemb@google.com>

---

For net-next, I'll take a look whether ipcm(6)_cookie fields like
these can be initialized uniformly, and then this branch removed
completely.
---
 net/ipv4/ip_output.c  | 3 ++-
 net/ipv6/ip6_output.c | 3 ++-
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index af5a830ff6ad..b3308e9d9762 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -1145,7 +1145,8 @@ static int ip_setup_cork(struct sock *sk, struct inet_cork *cork,
 	cork->fragsize = ip_sk_use_pmtu(sk) ?
 			 dst_mtu(&rt->dst) : rt->dst.dev->mtu;
 
-	cork->gso_size = sk->sk_type == SOCK_DGRAM ? ipc->gso_size : 0;
+	cork->gso_size = sk->sk_type == SOCK_DGRAM &&
+			 sk->sk_protocol == IPPROTO_UDP ? ipc->gso_size : 0;
 	cork->dst = &rt->dst;
 	cork->length = 0;
 	cork->ttl = ipc->ttl;
diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c
index 021e5aef6ba3..a14fb4fcdf18 100644
--- a/net/ipv6/ip6_output.c
+++ b/net/ipv6/ip6_output.c
@@ -1219,7 +1219,8 @@ static int ip6_setup_cork(struct sock *sk, struct inet_cork_full *cork,
 	if (mtu < IPV6_MIN_MTU)
 		return -EINVAL;
 	cork->base.fragsize = mtu;
-	cork->base.gso_size = sk->sk_type == SOCK_DGRAM ? ipc6->gso_size : 0;
+	cork->base.gso_size = sk->sk_type == SOCK_DGRAM &&
+			      sk->sk_protocol == IPPROTO_UDP ? ipc6->gso_size : 0;
 
 	if (dst_allfrag(xfrm_dst_path(&rt->dst)))
 		cork->base.flags |= IPCORK_ALLFRAG;
-- 
2.18.0.rc1.244.gcf134e6275-goog

^ permalink raw reply related

* Re: [PATCH rdma-next v2 00/20] Introduce mlx5 DEVX interface
From: Leon Romanovsky @ 2018-06-19 16:46 UTC (permalink / raw)
  To: Jason Gunthorpe
  Cc: Doug Ledford, RDMA mailing list, Joonas Lahtinen, Matan Barak,
	Yishai Hadas, Saeed Mahameed, linux-netdev
In-Reply-To: <20180619045930.GA7557@mtr-leonro.mtl.com>

[-- Attachment #1: Type: text/plain, Size: 3184 bytes --]

On Tue, Jun 19, 2018 at 07:59:30AM +0300, Leon Romanovsky wrote:
> On Mon, Jun 18, 2018 at 04:05:04PM -0600, Jason Gunthorpe wrote:
> > On Sun, Jun 17, 2018 at 12:59:46PM +0300, Leon Romanovsky wrote:
> >
> > > Leon Romanovsky (2):
> > >   drm/i915: Move u64-to-ptr helpers to general header
> > >   kernel.h: Reuse u64_to_ptr macro to cast __user pointers
> >
> > I dropped these since they are not needed by this series when using a
> > union.
>
> No problem, it was my idea to reuse existing macro, before it was
> hard-coded implementation, but union makes it cleaner.
>
> >
> > > Matan Barak (5):
> > >   IB/uverbs: Export uverbs idr and fd types
> > >   IB/uverbs: Add PTR_IN attributes that are allocated/copied
> > >     automatically
> >
> > Revised this one, as noted
>
> Thanks
>
> >
> > >   IB/uverbs: Add a macro to define a type with no kernel known size
> > >   IB/uverbs: Allow an empty namespace in ioctl() framework
> > >   IB/uverbs: Refactor uverbs_finalize_objects
> >
> > I put the above in a branch and can apply them if you ack my revisions..
> >
>
> Except the line "return (void *)attr;", which should be "return ERR_CAST(attr);"
> everything looks reasonable. I didn't test it, but I'm not worried, we will have
> enough time to fix if needed.
>
> > >   net/mlx5_core: Prevent warns in dmesg upon firmware commands
> > >   IB/core: Improve uverbs_cleanup_ucontext algorithm
> >
> > I dropped these two (they are linked), need comments addressed and
> > resent.
>
> They are linked only logically, the second patch will trigger warning
> which is suppressed by first patch. So actually mlx5-net branch will have
> only first patch "net/mlx5_core: Prevent warns in dmesg upon firmware commands"
> and you will apply "IB/core: Improve uverbs_cleanup_ucontext algorithm" in
> your rdma-next.
>
> >
> > > Yishai Hadas (13):
> > >   net/mlx5: Expose DEVX ifc structures
> > >   IB/mlx5: Introduce DEVX
> > >   IB/core: Introduce DECLARE_UVERBS_GLOBAL_METHODS
> > >   IB: Expose ib_ucontext from a given ib_uverbs_file
> > >   IB/mlx5: Add support for DEVX general command
> > >   IB/mlx5: Add obj create and destroy functionality
> > >   IB/mlx5: Add DEVX support for modify and query commands
> > >   IB/mlx5: Add support for DEVX query UAR
> > >   IB/mlx5: Add DEVX support for memory registration
> > >   IB/mlx5: Add DEVX query EQN support
> > >   IB/mlx5: Expose DEVX tree
> >
> > I put these in a branch also and can apply them, but I need the first
> > two patches in the mlx5 core branch first please, thanks.
> >
> > Since this requires so many core patches I think I prefer to merge the
> > mlx core branch then apply rather merge a branch.
>
> So to summarize, I'm applying those three patches to mlx5-next:
>  * net/mlx5_core: Prevent warns in dmesg upon firmware commands
>  * net/mlx5: Expose DEVX ifc structures
>  * IB/mlx5: Introduce DEVX

Updated mlx5-next with two patches and squashed ifc and commands bits
from third commit into second one.

>
> And resend:
>  * IB/core: Improve uverbs_cleanup_ucontext algorithm
>

Resent.

> Thanks
>
> >
> > Jason



[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 801 bytes --]

^ permalink raw reply

* [PATCH] selftests: net: add config fragments
From: Anders Roxell @ 2018-06-19 16:41 UTC (permalink / raw)
  To: davem, shuah, fw, shannon.nelson
  Cc: netdev, linux-kselftest, linux-kernel, Anders Roxell

Add fragments to pass bridge and vlan tests.

Fixes: 33b01b7b4f19 ("selftests: add rtnetlink test script")
Signed-off-by: Anders Roxell <anders.roxell@linaro.org>
---

Hi,

net/rtnetlink.sh still fails on tc hbt hierarchy, addrlabel and ipsec:
Error: Specified qdisc not found.
RTNETLINK answers: No such file or directory
Error: Parent Qdisc doesn't exists.
We have an error talking to the kernel, -1
Error: Parent Qdisc doesn't exists.
We have an error talking to the kernel, -1
Error: Parent Qdisc doesn't exists.
We have an error talking to the kernel, -1
Error: Parent Qdisc doesn't exists.
We have an error talking to the kernel, -1
Error: Parent Qdisc doesn't exists.
We have an error talking to the kernel, -1
Error: Parent Qdisc doesn't exists.
We have an error talking to the kernel, -1
Error: Invalid handle.
FAIL: tc htb hierarchy

FAIL: ipv6 addrlabel

FAIL: can't add fou port 7777, skipping test
RTNETLINK answers: Operation not supported
FAIL: can't add macsec interface, skipping test
RTNETLINK answers: Protocol not supported
RTNETLINK answers: No such process
RTNETLINK answers: No such process
./rtnetlink.sh: line 527:  5356 Terminated              ip x m >
$tmpfile
FAIL: ipsec

I'm using iproute2 tag: 4.17 and tried the qdisc command from the
function kci_test_tc in net/rtnetlink.sh:
$ tc qdisc add dev lo root handle 1: htb
Error: Specified qdisc not found.

For kci_test_addrlabel it fails on this row:
ip addrlabel list |grep -q "prefix dead::/64 dev lo label 1"

Any idea why these three fails?

Cheers,
Anders

 tools/testing/selftests/net/config | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/tools/testing/selftests/net/config b/tools/testing/selftests/net/config
index 7ba089b33e8b..cd3a2f1545b5 100644
--- a/tools/testing/selftests/net/config
+++ b/tools/testing/selftests/net/config
@@ -12,3 +12,5 @@ CONFIG_NET_IPVTI=y
 CONFIG_INET6_XFRM_MODE_TUNNEL=y
 CONFIG_IPV6_VTI=y
 CONFIG_DUMMY=y
+CONFIG_BRIDGE=y
+CONFIG_VLAN_8021Q=y
-- 
2.17.1

^ permalink raw reply related

* [PATCH] ucc_geth: Add BQL support
From: Joakim Tjernlund @ 2018-06-19 16:30 UTC (permalink / raw)
  To: Li Yang, netdev; +Cc: Joakim Tjernlund

Signed-off-by: Joakim Tjernlund <joakim.tjernlund@infinera.com>
---
 drivers/net/ethernet/freescale/ucc_geth.c | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/freescale/ucc_geth.c b/drivers/net/ethernet/freescale/ucc_geth.c
index f77ba9fa257b..6c99a9af6647 100644
--- a/drivers/net/ethernet/freescale/ucc_geth.c
+++ b/drivers/net/ethernet/freescale/ucc_geth.c
@@ -3096,6 +3096,7 @@ static int ucc_geth_start_xmit(struct sk_buff *skb, struct net_device *dev)
 
 	ugeth_vdbg("%s: IN", __func__);
 
+	netdev_sent_queue(dev, skb->len);
 	spin_lock_irqsave(&ugeth->lock, flags);
 
 	dev->stats.tx_bytes += skb->len;
@@ -3242,6 +3243,8 @@ static int ucc_geth_tx(struct net_device *dev, u8 txQ)
 	struct ucc_geth_private *ugeth = netdev_priv(dev);
 	u8 __iomem *bd;		/* BD pointer */
 	u32 bd_status;
+	int howmany = 0;
+	unsigned int bytes_sent = 0;
 
 	bd = ugeth->confBd[txQ];
 	bd_status = in_be32((u32 __iomem *)bd);
@@ -3257,7 +3260,8 @@ static int ucc_geth_tx(struct net_device *dev, u8 txQ)
 		skb = ugeth->tx_skbuff[txQ][ugeth->skb_dirtytx[txQ]];
 		if (!skb)
 			break;
-
+		howmany++;
+		bytes_sent += skb->len;
 		dev->stats.tx_packets++;
 
 		dev_consume_skb_any(skb);
@@ -3279,6 +3283,7 @@ static int ucc_geth_tx(struct net_device *dev, u8 txQ)
 		bd_status = in_be32((u32 __iomem *)bd);
 	}
 	ugeth->confBd[txQ] = bd;
+	netdev_completed_queue(dev, howmany, bytes_sent);
 	return 0;
 }
 
-- 
2.13.6

^ permalink raw reply related

* Re: [PATCH bpf-net] bpf: Change bpf_fib_lookup to return lookup status
From: Martin KaFai Lau @ 2018-06-19 16:36 UTC (permalink / raw)
  To: David Ahern; +Cc: dsahern, netdev, borkmann, ast, davem
In-Reply-To: <2d6278d1-45ab-ff53-7b97-d9593203ff3e@gmail.com>

On Tue, Jun 19, 2018 at 09:34:28AM -0600, David Ahern wrote:
> On 6/19/18 9:25 AM, Martin KaFai Lau wrote:
> > On Mon, Jun 18, 2018 at 03:35:25PM -0600, David Ahern wrote:
> >> On 6/18/18 2:55 PM, Martin KaFai Lau wrote:
> >>>> 	/* rc > 0 case */
> >>>> 	switch(rc) {
> >>>> 	case BPF_FIB_LKUP_RET_BLACKHOLE:
> >>>> 	case BPF_FIB_LKUP_RET_UNREACHABLE:
> >>>> 	case BPF_FIB_LKUP_RET_PROHIBIT:
> >>>> 		return XDP_DROP;
> >>>> 	}
> >>>>
> >>>> For the others it becomes a question of do we share why the stack needs
> >>>> to be involved? Maybe the program wants to collect stats to show traffic
> >>>> patterns that can be improved (BPF_FIB_LKUP_RET_FRAG_NEEDED) or support
> >>>> in the kernel needs to be improved (BPF_FIB_LKUP_RET_UNSUPP_LWT) or an
> >>>> interface is misconfigured (BPF_FIB_LKUP_RET_FWD_DISABLED).
> >>> Thanks for the explanation.
> >>>
> >>> Agree on the bpf able to collect stats will be useful.
> >>>
> >>> I am wondering, if a new BPF_FIB_LKUP_RET_XYZ is added later,
> >>> how may the old xdp_prog work/not-work?  As of now, the return value
> >>> is straight forward, FWD, PASS (to stack) or DROP (error).
> >>> With this change, the xdp_prog needs to match/switch() the
> >>> BPF_FIB_LKUP_RET_* to at least PASS and DROP.
> >>
> >> IMO, programs should only call XDP_DROP for known reasons - like the 3
> >> above. Anything else punt to the stack.
> >>
> >> If a new RET_XYZ comes along:
> >> 1. the new XYZ is a new ACL response where the packet is to be dropped.
> >> If the program does not understand XYZ and punts to the stack
> >> (recommendation), then a second lookup is done during normal packet
> >> processing and the stack drops it.
> >>
> >> 2. the new XYZ is a new path in the kernel that is unsupported with
> >> respect to XDP forwarding, nothing new for the program to do.
> >>
> >> Either way I would expect stats on BPF_FIB_LKUP_RET_* to give a hint to
> >> the program writer.
> >>
> >> Worst case of punting packets to the stack for any rc != 0 means the
> >> stack is doing 2 lookups - 1 in XDP based on its lookup parameters and 1
> >> in normal stack processing - to handle the packet.
> > Instead of having the xdp_prog to follow the meaning of what RET_SYZ is,
> > should the bpf_*_fib_lookup() return value be kept as is such that
> > the xdp_prog is clear what to do.  The reason can be returned in
> > the 'struct bpf_fib_lookup'.  The number of reasons can be extended.
> > If the xdp_prog does not understand a reason, it still will not
> > affect its decision because the return value is clear.
> > I think the situation here is similar to regular syscall which usually
> > uses -1 to clearly states error and errno to spells out the reason.
> > 
> 
> I did consider returning the status in struct bpf_fib_lookup. However,
> it is 64 bytes and can not be extended without a big performance
> penalty, so the only option there is to make an existing entry a union
> the most logical of which is the ifindex. It seemed odd to me to have
> the result by hidden in the struct as a union on ifindex and returning
> the egress index from the function:
> 
> @@ -2625,7 +2636,11 @@ struct bpf_fib_lookup {
> 
>         /* total length of packet from network header - used for MTU
> check */
>         __u16   tot_len;
> -       __u32   ifindex;  /* L3 device index for lookup */
> +
> +       union {
> +               __u32   ifindex;  /* input: L3 device index for lookup */
> +               __u32   result;   /* output: one of BPF_FIB_LKUP_RET_* */
> +       };
> 
> 
> It seemed more natural to have ifindex stay ifindex and only change
> value on return:
> 
> @@ -2625,7 +2639,11 @@ struct bpf_fib_lookup {
> 
>  	/* total length of packet from network header - used for MTU check */
>  	__u16	tot_len;
> -	__u32	ifindex;  /* L3 device index for lookup */
> +
> +	/* input: L3 device index for lookup
> +	 * output: nexthop device index from FIB lookup
> +	 */
> +	__u32	ifindex;
> 
>  	union {
>  		/* inputs to lookup */
> 
> 
> From a program's perspective:
> 
> rc < 0  -- program is passing incorrect data
> rc == 0 -- packet can be forwarded
> rc > 0  -- packet can not be forwarded.
> 
> BPF programs are not required to track the LKUP_RET values any more than
> a function returning multiple negative values - the caller just checks
> rc < 0 means failure. If the program cares it can look at specific
> values of rc to see the specific value.
> 
> The same applies with the LKUP_RET values - they are there to provide
> insight into why the packet is not forwarded directly if the program
> cares to know why.
hmm...ic. My concern is, the prog can interpret rc > 0 (in this patch) to be
drop vs pass (although we can advise them in bpf.h to always pass if it does
not understand a rc but it is not a strong contract),  it may catch people
a surprise if a xdp_prog suddenly drops everything when running in a
newer kernel where the upper stack can actually handle it.

while the current behavior (i.e. before this patch, rc == 0) is always pass
to the stack.

I think at least comments should be put in the enum such that
the xdp/tc_prog should expect the enum could be extended later, so
the suggested behavior should be a pass for unknown LKUP_RET and let
the stack to decide.

^ permalink raw reply

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox