Netdev List
 help / color / mirror / Atom feed
* [PATCH bpf-next v9 02/10] bpf: add bpf_get_stack helper
From: Yonghong Song @ 2018-04-29  5:28 UTC (permalink / raw)
  To: ast, daniel, netdev; +Cc: kernel-team
In-Reply-To: <20180429052816.2882032-1-yhs@fb.com>

Currently, stackmap and bpf_get_stackid helper are provided
for bpf program to get the stack trace. This approach has
a limitation though. If two stack traces have the same hash,
only one will get stored in the stackmap table,
so some stack traces are missing from user perspective.

This patch implements a new helper, bpf_get_stack, will
send stack traces directly to bpf program. The bpf program
is able to see all stack traces, and then can do in-kernel
processing or send stack traces to user space through
shared map or bpf_perf_event_output.

Acked-by: Alexei Starovoitov <ast@fb.com>
Signed-off-by: Yonghong Song <yhs@fb.com>
---
 include/linux/bpf.h      |  1 +
 include/linux/filter.h   |  3 ++-
 include/uapi/linux/bpf.h | 42 ++++++++++++++++++++++++++++--
 kernel/bpf/core.c        |  5 ++++
 kernel/bpf/stackmap.c    | 67 ++++++++++++++++++++++++++++++++++++++++++++++++
 kernel/bpf/verifier.c    | 19 ++++++++++++++
 kernel/trace/bpf_trace.c | 50 +++++++++++++++++++++++++++++++++++-
 7 files changed, 183 insertions(+), 4 deletions(-)

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 38ebbc6..c553f6f 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -692,6 +692,7 @@ extern const struct bpf_func_proto bpf_get_current_comm_proto;
 extern const struct bpf_func_proto bpf_skb_vlan_push_proto;
 extern const struct bpf_func_proto bpf_skb_vlan_pop_proto;
 extern const struct bpf_func_proto bpf_get_stackid_proto;
+extern const struct bpf_func_proto bpf_get_stack_proto;
 extern const struct bpf_func_proto bpf_sock_map_update_proto;
 
 /* Shared helpers among cBPF and eBPF. */
diff --git a/include/linux/filter.h b/include/linux/filter.h
index 4da8b23..64899c0 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -468,7 +468,8 @@ struct bpf_prog {
 				dst_needed:1,	/* Do we need dst entry? */
 				blinded:1,	/* Was blinded */
 				is_func:1,	/* program is a bpf function */
-				kprobe_override:1; /* Do we override a kprobe? */
+				kprobe_override:1, /* Do we override a kprobe? */
+				has_callchain_buf:1; /* callchain buffer allocated? */
 	enum bpf_prog_type	type;		/* Type of BPF program */
 	enum bpf_attach_type	expected_attach_type; /* For some prog types */
 	u32			len;		/* Number of filter blocks */
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index da77a93..1afb606 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -1767,6 +1767,40 @@ union bpf_attr {
  * 		**CONFIG_XFRM** configuration option.
  * 	Return
  * 		0 on success, or a negative error in case of failure.
+ *
+ * int bpf_get_stack(struct pt_regs *regs, void *buf, u32 size, u64 flags)
+ * 	Description
+ *		Return a user or a kernel stack in bpf program provided buffer.
+ *		To achieve this, the helper needs *ctx*, which is a pointer
+ *		to the context on which the tracing program is executed.
+ *		To store the stacktrace, the bpf program provides *buf* with
+ *		a nonnegative *size*.
+ *
+ *		The last argument, *flags*, holds the number of stack frames to
+ *		skip (from 0 to 255), masked with
+ *		**BPF_F_SKIP_FIELD_MASK**. The next bits can be used to set
+ *		the following flags:
+ *
+ *		**BPF_F_USER_STACK**
+ *			Collect a user space stack instead of a kernel stack.
+ *		**BPF_F_USER_BUILD_ID**
+ *			Collect buildid+offset instead of ips for user stack,
+ *			only valid if **BPF_F_USER_STACK** is also specified.
+ *
+ *		**bpf_get_stack**\ () can collect up to
+ *		**PERF_MAX_STACK_DEPTH** both kernel and user frames, subject
+ *		to sufficient large buffer size. Note that
+ *		this limit can be controlled with the **sysctl** program, and
+ *		that it should be manually increased in order to profile long
+ *		user stacks (such as stacks for Java programs). To do so, use:
+ *
+ *	::
+ *
+ *		# sysctl kernel.perf_event_max_stack=<new value>
+ *
+ * 	Return
+ * 		a non-negative value equal to or less than size on success, or
+ * 		a negative error in case of failure.
  */
 #define __BPF_FUNC_MAPPER(FN)		\
 	FN(unspec),			\
@@ -1835,7 +1869,8 @@ union bpf_attr {
 	FN(msg_pull_data),		\
 	FN(bind),			\
 	FN(xdp_adjust_tail),		\
-	FN(skb_get_xfrm_state),
+	FN(skb_get_xfrm_state),		\
+	FN(get_stack),
 
 /* integer value in 'imm' field of BPF_CALL instruction selects which helper
  * function eBPF program intends to call
@@ -1869,11 +1904,14 @@ enum bpf_func_id {
 /* BPF_FUNC_skb_set_tunnel_key and BPF_FUNC_skb_get_tunnel_key flags. */
 #define BPF_F_TUNINFO_IPV6		(1ULL << 0)
 
-/* BPF_FUNC_get_stackid flags. */
+/* flags for both BPF_FUNC_get_stackid and BPF_FUNC_get_stack. */
 #define BPF_F_SKIP_FIELD_MASK		0xffULL
 #define BPF_F_USER_STACK		(1ULL << 8)
+/* flags used by BPF_FUNC_get_stackid only. */
 #define BPF_F_FAST_STACK_CMP		(1ULL << 9)
 #define BPF_F_REUSE_STACKID		(1ULL << 10)
+/* flags used by BPF_FUNC_get_stack only. */
+#define BPF_F_USER_BUILD_ID		(1ULL << 11)
 
 /* BPF_FUNC_skb_set_tunnel_key flags. */
 #define BPF_F_ZERO_CSUM_TX		(1ULL << 1)
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index ba03ec3..9349a5d 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -31,6 +31,7 @@
 #include <linux/rbtree_latch.h>
 #include <linux/kallsyms.h>
 #include <linux/rcupdate.h>
+#include <linux/perf_event.h>
 
 #include <asm/unaligned.h>
 
@@ -1722,6 +1723,10 @@ static void bpf_prog_free_deferred(struct work_struct *work)
 	aux = container_of(work, struct bpf_prog_aux, work);
 	if (bpf_prog_is_dev_bound(aux))
 		bpf_prog_offload_destroy(aux->prog);
+#ifdef CONFIG_PERF_EVENTS
+	if (aux->prog->has_callchain_buf)
+		put_callchain_buffers();
+#endif
 	for (i = 0; i < aux->func_cnt; i++)
 		bpf_jit_free(aux->func[i]);
 	if (aux->func_cnt) {
diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c
index 04f6ec1..3ba102b 100644
--- a/kernel/bpf/stackmap.c
+++ b/kernel/bpf/stackmap.c
@@ -402,6 +402,73 @@ const struct bpf_func_proto bpf_get_stackid_proto = {
 	.arg3_type	= ARG_ANYTHING,
 };
 
+BPF_CALL_4(bpf_get_stack, struct pt_regs *, regs, void *, buf, u32, size,
+	   u64, flags)
+{
+	u32 init_nr, trace_nr, copy_len, elem_size, num_elem;
+	bool user_build_id = flags & BPF_F_USER_BUILD_ID;
+	u32 skip = flags & BPF_F_SKIP_FIELD_MASK;
+	bool user = flags & BPF_F_USER_STACK;
+	struct perf_callchain_entry *trace;
+	bool kernel = !user;
+	int err = -EINVAL;
+	u64 *ips;
+
+	if (unlikely(flags & ~(BPF_F_SKIP_FIELD_MASK | BPF_F_USER_STACK |
+			       BPF_F_USER_BUILD_ID)))
+		goto clear;
+	if (kernel && user_build_id)
+		goto clear;
+
+	elem_size = (user && user_build_id) ? sizeof(struct bpf_stack_build_id)
+					    : sizeof(u64);
+	if (unlikely(size % elem_size))
+		goto clear;
+
+	num_elem = size / elem_size;
+	if (sysctl_perf_event_max_stack < num_elem)
+		init_nr = 0;
+	else
+		init_nr = sysctl_perf_event_max_stack - num_elem;
+	trace = get_perf_callchain(regs, init_nr, kernel, user,
+				   sysctl_perf_event_max_stack, false, false);
+	if (unlikely(!trace))
+		goto err_fault;
+
+	trace_nr = trace->nr - init_nr;
+	if (trace_nr < skip)
+		goto err_fault;
+
+	trace_nr -= skip;
+	trace_nr = (trace_nr <= num_elem) ? trace_nr : num_elem;
+	copy_len = trace_nr * elem_size;
+	ips = trace->ip + skip + init_nr;
+	if (user && user_build_id)
+		stack_map_get_build_id_offset(buf, ips, trace_nr, user);
+	else
+		memcpy(buf, ips, copy_len);
+
+	if (size > copy_len)
+		memset(buf + copy_len, 0, size - copy_len);
+	return copy_len;
+
+err_fault:
+	err = -EFAULT;
+clear:
+	memset(buf, 0, size);
+	return err;
+}
+
+const struct bpf_func_proto bpf_get_stack_proto = {
+	.func		= bpf_get_stack,
+	.gpl_only	= true,
+	.ret_type	= RET_INTEGER,
+	.arg1_type	= ARG_PTR_TO_CTX,
+	.arg2_type	= ARG_PTR_TO_UNINIT_MEM,
+	.arg3_type	= ARG_CONST_SIZE_OR_ZERO,
+	.arg4_type	= ARG_ANYTHING,
+};
+
 /* Called from eBPF program */
 static void *stack_map_lookup_elem(struct bpf_map *map, void *key)
 {
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index eb1a596..253f6bd 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -22,6 +22,7 @@
 #include <linux/stringify.h>
 #include <linux/bsearch.h>
 #include <linux/sort.h>
+#include <linux/perf_event.h>
 
 #include "disasm.h"
 
@@ -2450,6 +2451,24 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn
 	if (err)
 		return err;
 
+	if (func_id == BPF_FUNC_get_stack && !env->prog->has_callchain_buf) {
+		const char *err_str;
+
+#ifdef CONFIG_PERF_EVENTS
+		err = get_callchain_buffers(sysctl_perf_event_max_stack);
+		err_str = "cannot get callchain buffer for func %s#%d\n";
+#else
+		err = -ENOTSUPP;
+		err_str = "func %s#%d not supported without CONFIG_PERF_EVENTS\n";
+#endif
+		if (err) {
+			verbose(env, err_str, func_id_name(func_id), func_id);
+			return err;
+		}
+
+		env->prog->has_callchain_buf = true;
+	}
+
 	if (changes_data)
 		clear_all_pkt_pointers(env);
 	return 0;
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index 56ba0f2..46d866e 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -20,6 +20,7 @@
 #include "trace.h"
 
 u64 bpf_get_stackid(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5);
+u64 bpf_get_stack(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5);
 
 /**
  * trace_call_bpf - invoke BPF program
@@ -577,6 +578,8 @@ kprobe_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 		return &bpf_perf_event_output_proto;
 	case BPF_FUNC_get_stackid:
 		return &bpf_get_stackid_proto;
+	case BPF_FUNC_get_stack:
+		return &bpf_get_stack_proto;
 	case BPF_FUNC_perf_event_read_value:
 		return &bpf_perf_event_read_value_proto;
 #ifdef CONFIG_BPF_KPROBE_OVERRIDE
@@ -664,6 +667,25 @@ static const struct bpf_func_proto bpf_get_stackid_proto_tp = {
 	.arg3_type	= ARG_ANYTHING,
 };
 
+BPF_CALL_4(bpf_get_stack_tp, void *, tp_buff, void *, buf, u32, size,
+	   u64, flags)
+{
+	struct pt_regs *regs = *(struct pt_regs **)tp_buff;
+
+	return bpf_get_stack((unsigned long) regs, (unsigned long) buf,
+			     (unsigned long) size, flags, 0);
+}
+
+static const struct bpf_func_proto bpf_get_stack_proto_tp = {
+	.func		= bpf_get_stack_tp,
+	.gpl_only	= true,
+	.ret_type	= RET_INTEGER,
+	.arg1_type	= ARG_PTR_TO_CTX,
+	.arg2_type	= ARG_PTR_TO_UNINIT_MEM,
+	.arg3_type	= ARG_CONST_SIZE_OR_ZERO,
+	.arg4_type	= ARG_ANYTHING,
+};
+
 static const struct bpf_func_proto *
 tp_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 {
@@ -672,6 +694,8 @@ tp_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 		return &bpf_perf_event_output_proto_tp;
 	case BPF_FUNC_get_stackid:
 		return &bpf_get_stackid_proto_tp;
+	case BPF_FUNC_get_stack:
+		return &bpf_get_stack_proto_tp;
 	default:
 		return tracing_func_proto(func_id, prog);
 	}
@@ -734,6 +758,8 @@ pe_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 		return &bpf_perf_event_output_proto_tp;
 	case BPF_FUNC_get_stackid:
 		return &bpf_get_stackid_proto_tp;
+	case BPF_FUNC_get_stack:
+		return &bpf_get_stack_proto_tp;
 	case BPF_FUNC_perf_prog_read_value:
 		return &bpf_perf_prog_read_value_proto;
 	default:
@@ -744,7 +770,7 @@ pe_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 /*
  * bpf_raw_tp_regs are separate from bpf_pt_regs used from skb/xdp
  * to avoid potential recursive reuse issue when/if tracepoints are added
- * inside bpf_*_event_output and/or bpf_get_stack_id
+ * inside bpf_*_event_output, bpf_get_stackid and/or bpf_get_stack
  */
 static DEFINE_PER_CPU(struct pt_regs, bpf_raw_tp_regs);
 BPF_CALL_5(bpf_perf_event_output_raw_tp, struct bpf_raw_tracepoint_args *, args,
@@ -787,6 +813,26 @@ static const struct bpf_func_proto bpf_get_stackid_proto_raw_tp = {
 	.arg3_type	= ARG_ANYTHING,
 };
 
+BPF_CALL_4(bpf_get_stack_raw_tp, struct bpf_raw_tracepoint_args *, args,
+	   void *, buf, u32, size, u64, flags)
+{
+	struct pt_regs *regs = this_cpu_ptr(&bpf_raw_tp_regs);
+
+	perf_fetch_caller_regs(regs);
+	return bpf_get_stack((unsigned long) regs, (unsigned long) buf,
+			     (unsigned long) size, flags, 0);
+}
+
+static const struct bpf_func_proto bpf_get_stack_proto_raw_tp = {
+	.func		= bpf_get_stack_raw_tp,
+	.gpl_only	= true,
+	.ret_type	= RET_INTEGER,
+	.arg1_type	= ARG_PTR_TO_CTX,
+	.arg2_type	= ARG_PTR_TO_MEM,
+	.arg3_type	= ARG_CONST_SIZE_OR_ZERO,
+	.arg4_type	= ARG_ANYTHING,
+};
+
 static const struct bpf_func_proto *
 raw_tp_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 {
@@ -795,6 +841,8 @@ raw_tp_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 		return &bpf_perf_event_output_proto_raw_tp;
 	case BPF_FUNC_get_stackid:
 		return &bpf_get_stackid_proto_raw_tp;
+	case BPF_FUNC_get_stack:
+		return &bpf_get_stack_proto_raw_tp;
 	default:
 		return tracing_func_proto(func_id, prog);
 	}
-- 
2.9.5

^ permalink raw reply related

* [PATCH bpf-next v9 03/10] bpf/verifier: refine retval R0 state for bpf_get_stack helper
From: Yonghong Song @ 2018-04-29  5:28 UTC (permalink / raw)
  To: ast, daniel, netdev; +Cc: kernel-team
In-Reply-To: <20180429052816.2882032-1-yhs@fb.com>

The special property of return values for helpers bpf_get_stack
and bpf_probe_read_str are captured in verifier.
Both helpers return a negative error code or
a length, which is equal to or smaller than the buffer
size argument. This additional information in the
verifier can avoid the condition such as "retval > bufsize"
in the bpf program. For example, for the code blow,
    usize = bpf_get_stack(ctx, raw_data, max_len, BPF_F_USER_STACK);
    if (usize < 0 || usize > max_len)
        return 0;
The verifier may have the following errors:
    52: (85) call bpf_get_stack#65
     R0=map_value(id=0,off=0,ks=4,vs=1600,imm=0) R1_w=ctx(id=0,off=0,imm=0)
     R2_w=map_value(id=0,off=0,ks=4,vs=1600,imm=0) R3_w=inv800 R4_w=inv256
     R6=ctx(id=0,off=0,imm=0) R7=map_value(id=0,off=0,ks=4,vs=1600,imm=0)
     R9_w=inv800 R10=fp0,call_-1
    53: (bf) r8 = r0
    54: (bf) r1 = r8
    55: (67) r1 <<= 32
    56: (bf) r2 = r1
    57: (77) r2 >>= 32
    58: (25) if r2 > 0x31f goto pc+33
     R0=inv(id=0) R1=inv(id=0,smax_value=9223372032559808512,
                         umax_value=18446744069414584320,
                         var_off=(0x0; 0xffffffff00000000))
     R2=inv(id=0,umax_value=799,var_off=(0x0; 0x3ff))
     R6=ctx(id=0,off=0,imm=0) R7=map_value(id=0,off=0,ks=4,vs=1600,imm=0)
     R8=inv(id=0) R9=inv800 R10=fp0,call_-1
    59: (1f) r9 -= r8
    60: (c7) r1 s>>= 32
    61: (bf) r2 = r7
    62: (0f) r2 += r1
    math between map_value pointer and register with unbounded
    min value is not allowed
The failure is due to llvm compiler optimization where register "r2",
which is a copy of "r1", is tested for condition while later on "r1"
is used for map_ptr operation. The verifier is not able to track such
inst sequence effectively.

Without the "usize > max_len" condition, there is no llvm optimization
and the below generated code passed verifier:
    52: (85) call bpf_get_stack#65
     R0=map_value(id=0,off=0,ks=4,vs=1600,imm=0) R1_w=ctx(id=0,off=0,imm=0)
     R2_w=map_value(id=0,off=0,ks=4,vs=1600,imm=0) R3_w=inv800 R4_w=inv256
     R6=ctx(id=0,off=0,imm=0) R7=map_value(id=0,off=0,ks=4,vs=1600,imm=0)
     R9_w=inv800 R10=fp0,call_-1
    53: (b7) r1 = 0
    54: (bf) r8 = r0
    55: (67) r8 <<= 32
    56: (c7) r8 s>>= 32
    57: (6d) if r1 s> r8 goto pc+24
     R0=inv(id=0,umax_value=800,var_off=(0x0; 0x3ff))
     R1=inv0 R6=ctx(id=0,off=0,imm=0)
     R7=map_value(id=0,off=0,ks=4,vs=1600,imm=0)
     R8=inv(id=0,umax_value=800,var_off=(0x0; 0x3ff)) R9=inv800
     R10=fp0,call_-1
    58: (bf) r2 = r7
    59: (0f) r2 += r8
    60: (1f) r9 -= r8
    61: (bf) r1 = r6

Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Yonghong Song <yhs@fb.com>
---
 kernel/bpf/verifier.c | 27 +++++++++++++++++++++++++++
 1 file changed, 27 insertions(+)

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 253f6bd..988400e 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -165,6 +165,8 @@ struct bpf_call_arg_meta {
 	bool pkt_access;
 	int regno;
 	int access_size;
+	s64 msize_smax_value;
+	u64 msize_umax_value;
 };
 
 static DEFINE_MUTEX(bpf_verifier_lock);
@@ -1985,6 +1987,12 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno,
 	} else if (arg_type_is_mem_size(arg_type)) {
 		bool zero_size_allowed = (arg_type == ARG_CONST_SIZE_OR_ZERO);
 
+		/* remember the mem_size which may be used later
+		 * to refine return values.
+		 */
+		meta->msize_smax_value = reg->smax_value;
+		meta->msize_umax_value = reg->umax_value;
+
 		/* The register is SCALAR_VALUE; the access check
 		 * happens using its boundaries.
 		 */
@@ -2324,6 +2332,23 @@ static int prepare_func_exit(struct bpf_verifier_env *env, int *insn_idx)
 	return 0;
 }
 
+static void do_refine_retval_range(struct bpf_reg_state *regs, int ret_type,
+				   int func_id,
+				   struct bpf_call_arg_meta *meta)
+{
+	struct bpf_reg_state *ret_reg = &regs[BPF_REG_0];
+
+	if (ret_type != RET_INTEGER ||
+	    (func_id != BPF_FUNC_get_stack &&
+	     func_id != BPF_FUNC_probe_read_str))
+		return;
+
+	ret_reg->smax_value = meta->msize_smax_value;
+	ret_reg->umax_value = meta->msize_umax_value;
+	__reg_deduce_bounds(ret_reg);
+	__reg_bound_offset(ret_reg);
+}
+
 static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn_idx)
 {
 	const struct bpf_func_proto *fn = NULL;
@@ -2447,6 +2472,8 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn
 		return -EINVAL;
 	}
 
+	do_refine_retval_range(regs, fn->ret_type, func_id, &meta);
+
 	err = check_map_func_compatibility(env, meta.map_ptr, func_id);
 	if (err)
 		return err;
-- 
2.9.5

^ permalink raw reply related

* [PATCH bpf-next v9 05/10] bpf/verifier: improve register value range tracking with ARSH
From: Yonghong Song @ 2018-04-29  5:28 UTC (permalink / raw)
  To: ast, daniel, netdev; +Cc: kernel-team
In-Reply-To: <20180429052816.2882032-1-yhs@fb.com>

When helpers like bpf_get_stack returns an int value
and later on used for arithmetic computation, the LSH and ARSH
operations are often required to get proper sign extension into
64-bit. For example, without this patch:
    54: R0=inv(id=0,umax_value=800)
    54: (bf) r8 = r0
    55: R0=inv(id=0,umax_value=800) R8_w=inv(id=0,umax_value=800)
    55: (67) r8 <<= 32
    56: R8_w=inv(id=0,umax_value=3435973836800,var_off=(0x0; 0x3ff00000000))
    56: (c7) r8 s>>= 32
    57: R8=inv(id=0)
With this patch:
    54: R0=inv(id=0,umax_value=800)
    54: (bf) r8 = r0
    55: R0=inv(id=0,umax_value=800) R8_w=inv(id=0,umax_value=800)
    55: (67) r8 <<= 32
    56: R8_w=inv(id=0,umax_value=3435973836800,var_off=(0x0; 0x3ff00000000))
    56: (c7) r8 s>>= 32
    57: R8=inv(id=0, umax_value=800,var_off=(0x0; 0x3ff))
With better range of "R8", later on when "R8" is added to other register,
e.g., a map pointer or scalar-value register, the better register
range can be derived and verifier failure may be avoided.

In our later example,
    ......
    usize = bpf_get_stack(ctx, raw_data, max_len, BPF_F_USER_STACK);
    if (usize < 0)
        return 0;
    ksize = bpf_get_stack(ctx, raw_data + usize, max_len - usize, 0);
    ......
Without improving ARSH value range tracking, the register representing
"max_len - usize" will have smin_value equal to S64_MIN and will be
rejected by verifier.

Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Yonghong Song <yhs@fb.com>
---
 include/linux/tnum.h  |  4 +++-
 kernel/bpf/tnum.c     | 10 ++++++++++
 kernel/bpf/verifier.c | 23 +++++++++++++++++++++++
 3 files changed, 36 insertions(+), 1 deletion(-)

diff --git a/include/linux/tnum.h b/include/linux/tnum.h
index 0d2d3da..c7dc2b5 100644
--- a/include/linux/tnum.h
+++ b/include/linux/tnum.h
@@ -23,8 +23,10 @@ struct tnum tnum_range(u64 min, u64 max);
 /* Arithmetic and logical ops */
 /* Shift a tnum left (by a fixed shift) */
 struct tnum tnum_lshift(struct tnum a, u8 shift);
-/* Shift a tnum right (by a fixed shift) */
+/* Shift (rsh) a tnum right (by a fixed shift) */
 struct tnum tnum_rshift(struct tnum a, u8 shift);
+/* Shift (arsh) a tnum right (by a fixed min_shift) */
+struct tnum tnum_arshift(struct tnum a, u8 min_shift);
 /* Add two tnums, return @a + @b */
 struct tnum tnum_add(struct tnum a, struct tnum b);
 /* Subtract two tnums, return @a - @b */
diff --git a/kernel/bpf/tnum.c b/kernel/bpf/tnum.c
index 1f4bf68..938d412 100644
--- a/kernel/bpf/tnum.c
+++ b/kernel/bpf/tnum.c
@@ -43,6 +43,16 @@ struct tnum tnum_rshift(struct tnum a, u8 shift)
 	return TNUM(a.value >> shift, a.mask >> shift);
 }
 
+struct tnum tnum_arshift(struct tnum a, u8 min_shift)
+{
+	/* if a.value is negative, arithmetic shifting by minimum shift
+	 * will have larger negative offset compared to more shifting.
+	 * If a.value is nonnegative, arithmetic shifting by minimum shift
+	 * will have larger positive offset compare to more shifting.
+	 */
+	return TNUM((s64)a.value >> min_shift, (s64)a.mask >> min_shift);
+}
+
 struct tnum tnum_add(struct tnum a, struct tnum b)
 {
 	u64 sm, sv, sigma, chi, mu;
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 6e3f859..712d865 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -2974,6 +2974,29 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env,
 		/* We may learn something more from the var_off */
 		__update_reg_bounds(dst_reg);
 		break;
+	case BPF_ARSH:
+		if (umax_val >= insn_bitness) {
+			/* Shifts greater than 31 or 63 are undefined.
+			 * This includes shifts by a negative number.
+			 */
+			mark_reg_unknown(env, regs, insn->dst_reg);
+			break;
+		}
+
+		/* Upon reaching here, src_known is true and
+		 * umax_val is equal to umin_val.
+		 */
+		dst_reg->smin_value >>= umin_val;
+		dst_reg->smax_value >>= umin_val;
+		dst_reg->var_off = tnum_arshift(dst_reg->var_off, umin_val);
+
+		/* blow away the dst_reg umin_value/umax_value and rely on
+		 * dst_reg var_off to refine the result.
+		 */
+		dst_reg->umin_value = 0;
+		dst_reg->umax_value = U64_MAX;
+		__update_reg_bounds(dst_reg);
+		break;
 	default:
 		mark_reg_unknown(env, regs, insn->dst_reg);
 		break;
-- 
2.9.5

^ permalink raw reply related

* [PATCH bpf-next v9 04/10] bpf: remove never-hit branches in verifier adjust_scalar_min_max_vals
From: Yonghong Song @ 2018-04-29  5:28 UTC (permalink / raw)
  To: ast, daniel, netdev; +Cc: kernel-team
In-Reply-To: <20180429052816.2882032-1-yhs@fb.com>

In verifier function adjust_scalar_min_max_vals,
when src_known is false and the opcode is BPF_LSH/BPF_RSH,
early return will happen in the function. So remove
the branch in handling BPF_LSH/BPF_RSH when src_known is false.

Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Yonghong Song <yhs@fb.com>
---
 kernel/bpf/verifier.c | 11 ++---------
 1 file changed, 2 insertions(+), 9 deletions(-)

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 988400e..6e3f859 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -2940,10 +2940,7 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env,
 			dst_reg->umin_value <<= umin_val;
 			dst_reg->umax_value <<= umax_val;
 		}
-		if (src_known)
-			dst_reg->var_off = tnum_lshift(dst_reg->var_off, umin_val);
-		else
-			dst_reg->var_off = tnum_lshift(tnum_unknown, umin_val);
+		dst_reg->var_off = tnum_lshift(dst_reg->var_off, umin_val);
 		/* We may learn something more from the var_off */
 		__update_reg_bounds(dst_reg);
 		break;
@@ -2971,11 +2968,7 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env,
 		 */
 		dst_reg->smin_value = S64_MIN;
 		dst_reg->smax_value = S64_MAX;
-		if (src_known)
-			dst_reg->var_off = tnum_rshift(dst_reg->var_off,
-						       umin_val);
-		else
-			dst_reg->var_off = tnum_rshift(tnum_unknown, umin_val);
+		dst_reg->var_off = tnum_rshift(dst_reg->var_off, umin_val);
 		dst_reg->umin_value >>= umax_val;
 		dst_reg->umax_value >>= umin_val;
 		/* We may learn something more from the var_off */
-- 
2.9.5

^ permalink raw reply related

* [PATCH bpf-next v9 08/10] tools/bpf: add a verifier test case for bpf_get_stack helper and ARSH
From: Yonghong Song @ 2018-04-29  5:28 UTC (permalink / raw)
  To: ast, daniel, netdev; +Cc: kernel-team
In-Reply-To: <20180429052816.2882032-1-yhs@fb.com>

The test_verifier already has a few ARSH test cases.
This patch adds a new test case which takes advantage of newly
improved verifier behavior for bpf_get_stack and ARSH.

Signed-off-by: Yonghong Song <yhs@fb.com>
---
 tools/testing/selftests/bpf/test_verifier.c | 45 +++++++++++++++++++++++++++++
 1 file changed, 45 insertions(+)

diff --git a/tools/testing/selftests/bpf/test_verifier.c b/tools/testing/selftests/bpf/test_verifier.c
index 165e9dd..1acafe26 100644
--- a/tools/testing/selftests/bpf/test_verifier.c
+++ b/tools/testing/selftests/bpf/test_verifier.c
@@ -11680,6 +11680,51 @@ static struct bpf_test tests[] = {
 		.errstr = "BPF_XADD stores into R2 packet",
 		.prog_type = BPF_PROG_TYPE_XDP,
 	},
+	{
+		"bpf_get_stack return R0 within range",
+		.insns = {
+			BPF_MOV64_REG(BPF_REG_6, BPF_REG_1),
+			BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0),
+			BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+			BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8),
+			BPF_LD_MAP_FD(BPF_REG_1, 0),
+			BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0,
+				     BPF_FUNC_map_lookup_elem),
+			BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 28),
+			BPF_MOV64_REG(BPF_REG_7, BPF_REG_0),
+			BPF_MOV64_IMM(BPF_REG_9, sizeof(struct test_val)),
+			BPF_MOV64_REG(BPF_REG_1, BPF_REG_6),
+			BPF_MOV64_REG(BPF_REG_2, BPF_REG_7),
+			BPF_MOV64_IMM(BPF_REG_3, sizeof(struct test_val)),
+			BPF_MOV64_IMM(BPF_REG_4, 256),
+			BPF_EMIT_CALL(BPF_FUNC_get_stack),
+			BPF_MOV64_IMM(BPF_REG_1, 0),
+			BPF_MOV64_REG(BPF_REG_8, BPF_REG_0),
+			BPF_ALU64_IMM(BPF_LSH, BPF_REG_8, 32),
+			BPF_ALU64_IMM(BPF_ARSH, BPF_REG_8, 32),
+			BPF_JMP_REG(BPF_JSLT, BPF_REG_1, BPF_REG_8, 16),
+			BPF_ALU64_REG(BPF_SUB, BPF_REG_9, BPF_REG_8),
+			BPF_MOV64_REG(BPF_REG_2, BPF_REG_7),
+			BPF_ALU64_REG(BPF_ADD, BPF_REG_2, BPF_REG_8),
+			BPF_MOV64_REG(BPF_REG_1, BPF_REG_9),
+			BPF_ALU64_IMM(BPF_LSH, BPF_REG_1, 32),
+			BPF_ALU64_IMM(BPF_ARSH, BPF_REG_1, 32),
+			BPF_MOV64_REG(BPF_REG_3, BPF_REG_2),
+			BPF_ALU64_REG(BPF_ADD, BPF_REG_3, BPF_REG_1),
+			BPF_MOV64_REG(BPF_REG_1, BPF_REG_7),
+			BPF_MOV64_IMM(BPF_REG_5, sizeof(struct test_val)),
+			BPF_ALU64_REG(BPF_ADD, BPF_REG_1, BPF_REG_5),
+			BPF_JMP_REG(BPF_JGE, BPF_REG_3, BPF_REG_1, 4),
+			BPF_MOV64_REG(BPF_REG_1, BPF_REG_6),
+			BPF_MOV64_REG(BPF_REG_3, BPF_REG_9),
+			BPF_MOV64_IMM(BPF_REG_4, 0),
+			BPF_EMIT_CALL(BPF_FUNC_get_stack),
+			BPF_EXIT_INSN(),
+		},
+		.fixup_map2 = { 4 },
+		.result = ACCEPT,
+		.prog_type = BPF_PROG_TYPE_TRACEPOINT,
+	},
 };
 
 static int probe_filter_length(const struct bpf_insn *fp)
-- 
2.9.5

^ permalink raw reply related

* [PATCH bpf-next v9 01/10] bpf: change prototype for stack_map_get_build_id_offset
From: Yonghong Song @ 2018-04-29  5:28 UTC (permalink / raw)
  To: ast, daniel, netdev; +Cc: kernel-team
In-Reply-To: <20180429052816.2882032-1-yhs@fb.com>

This patch didn't incur functionality change. The function prototype
got changed so that the same function can be reused later.

Signed-off-by: Yonghong Song <yhs@fb.com>
---
 kernel/bpf/stackmap.c | 13 +++++--------
 1 file changed, 5 insertions(+), 8 deletions(-)

diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c
index 57eeb12..04f6ec1 100644
--- a/kernel/bpf/stackmap.c
+++ b/kernel/bpf/stackmap.c
@@ -262,16 +262,11 @@ static int stack_map_get_build_id(struct vm_area_struct *vma,
 	return ret;
 }
 
-static void stack_map_get_build_id_offset(struct bpf_map *map,
-					  struct stack_map_bucket *bucket,
+static void stack_map_get_build_id_offset(struct bpf_stack_build_id *id_offs,
 					  u64 *ips, u32 trace_nr, bool user)
 {
 	int i;
 	struct vm_area_struct *vma;
-	struct bpf_stack_build_id *id_offs;
-
-	bucket->nr = trace_nr;
-	id_offs = (struct bpf_stack_build_id *)bucket->data;
 
 	/*
 	 * We cannot do up_read() in nmi context, so build_id lookup is
@@ -361,8 +356,10 @@ BPF_CALL_3(bpf_get_stackid, struct pt_regs *, regs, struct bpf_map *, map,
 			pcpu_freelist_pop(&smap->freelist);
 		if (unlikely(!new_bucket))
 			return -ENOMEM;
-		stack_map_get_build_id_offset(map, new_bucket, ips,
-					      trace_nr, user);
+		new_bucket->nr = trace_nr;
+		stack_map_get_build_id_offset(
+			(struct bpf_stack_build_id *)new_bucket->data,
+			ips, trace_nr, user);
 		trace_len = trace_nr * sizeof(struct bpf_stack_build_id);
 		if (hash_matches && bucket->nr == trace_nr &&
 		    memcmp(bucket->data, new_bucket->data, trace_len) == 0) {
-- 
2.9.5

^ permalink raw reply related

* [PATCH bpf-next v9 10/10] tools/bpf: add a test for bpf_get_stack with tracepoint prog
From: Yonghong Song @ 2018-04-29  5:28 UTC (permalink / raw)
  To: ast, daniel, netdev; +Cc: kernel-team
In-Reply-To: <20180429052816.2882032-1-yhs@fb.com>

The test_stacktrace_map and test_stacktrace_build_id are
enhanced to call bpf_get_stack in the helper to get the
stack trace as well.  The stack traces from bpf_get_stack
and bpf_get_stackid are compared to ensure that for the
same stack as represented as the same hash, their ip addresses
or build id's must be the same.

Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Yonghong Song <yhs@fb.com>
---
 tools/testing/selftests/bpf/test_progs.c           | 70 ++++++++++++++++++++--
 .../selftests/bpf/test_stacktrace_build_id.c       | 20 ++++++-
 tools/testing/selftests/bpf/test_stacktrace_map.c  | 19 +++++-
 3 files changed, 98 insertions(+), 11 deletions(-)

diff --git a/tools/testing/selftests/bpf/test_progs.c b/tools/testing/selftests/bpf/test_progs.c
index 0ddbf34..aa336f0 100644
--- a/tools/testing/selftests/bpf/test_progs.c
+++ b/tools/testing/selftests/bpf/test_progs.c
@@ -906,11 +906,47 @@ static int compare_map_keys(int map1_fd, int map2_fd)
 	return 0;
 }
 
+static int compare_stack_ips(int smap_fd, int amap_fd, int stack_trace_len)
+{
+	__u32 key, next_key, *cur_key_p, *next_key_p;
+	char *val_buf1, *val_buf2;
+	int i, err = 0;
+
+	val_buf1 = malloc(stack_trace_len);
+	val_buf2 = malloc(stack_trace_len);
+	cur_key_p = NULL;
+	next_key_p = &key;
+	while (bpf_map_get_next_key(smap_fd, cur_key_p, next_key_p) == 0) {
+		err = bpf_map_lookup_elem(smap_fd, next_key_p, val_buf1);
+		if (err)
+			goto out;
+		err = bpf_map_lookup_elem(amap_fd, next_key_p, val_buf2);
+		if (err)
+			goto out;
+		for (i = 0; i < stack_trace_len; i++) {
+			if (val_buf1[i] != val_buf2[i]) {
+				err = -1;
+				goto out;
+			}
+		}
+		key = *next_key_p;
+		cur_key_p = &key;
+		next_key_p = &next_key;
+	}
+	if (errno != ENOENT)
+		err = -1;
+
+out:
+	free(val_buf1);
+	free(val_buf2);
+	return err;
+}
+
 static void test_stacktrace_map()
 {
-	int control_map_fd, stackid_hmap_fd, stackmap_fd;
+	int control_map_fd, stackid_hmap_fd, stackmap_fd, stack_amap_fd;
 	const char *file = "./test_stacktrace_map.o";
-	int bytes, efd, err, pmu_fd, prog_fd;
+	int bytes, efd, err, pmu_fd, prog_fd, stack_trace_len;
 	struct perf_event_attr attr = {};
 	__u32 key, val, duration = 0;
 	struct bpf_object *obj;
@@ -966,6 +1002,10 @@ static void test_stacktrace_map()
 	if (stackmap_fd < 0)
 		goto disable_pmu;
 
+	stack_amap_fd = bpf_find_map(__func__, obj, "stack_amap");
+	if (stack_amap_fd < 0)
+		goto disable_pmu;
+
 	/* give some time for bpf program run */
 	sleep(1);
 
@@ -987,6 +1027,12 @@ static void test_stacktrace_map()
 		  "err %d errno %d\n", err, errno))
 		goto disable_pmu_noerr;
 
+	stack_trace_len = PERF_MAX_STACK_DEPTH * sizeof(__u64);
+	err = compare_stack_ips(stackmap_fd, stack_amap_fd, stack_trace_len);
+	if (CHECK(err, "compare_stack_ips stackmap vs. stack_amap",
+		  "err %d errno %d\n", err, errno))
+		goto disable_pmu_noerr;
+
 	goto disable_pmu_noerr;
 disable_pmu:
 	error_cnt++;
@@ -1080,9 +1126,9 @@ static int extract_build_id(char *build_id, size_t size)
 
 static void test_stacktrace_build_id(void)
 {
-	int control_map_fd, stackid_hmap_fd, stackmap_fd;
+	int control_map_fd, stackid_hmap_fd, stackmap_fd, stack_amap_fd;
 	const char *file = "./test_stacktrace_build_id.o";
-	int bytes, efd, err, pmu_fd, prog_fd;
+	int bytes, efd, err, pmu_fd, prog_fd, stack_trace_len;
 	struct perf_event_attr attr = {};
 	__u32 key, previous_key, val, duration = 0;
 	struct bpf_object *obj;
@@ -1147,6 +1193,11 @@ static void test_stacktrace_build_id(void)
 		  err, errno))
 		goto disable_pmu;
 
+	stack_amap_fd = bpf_find_map(__func__, obj, "stack_amap");
+	if (CHECK(stack_amap_fd < 0, "bpf_find_map stack_amap",
+		  "err %d errno %d\n", err, errno))
+		goto disable_pmu;
+
 	assert(system("dd if=/dev/urandom of=/dev/zero count=4 2> /dev/null")
 	       == 0);
 	assert(system("./urandom_read if=/dev/urandom of=/dev/zero count=4 2> /dev/null") == 0);
@@ -1198,8 +1249,15 @@ static void test_stacktrace_build_id(void)
 		previous_key = key;
 	} while (bpf_map_get_next_key(stackmap_fd, &previous_key, &key) == 0);
 
-	CHECK(build_id_matches < 1, "build id match",
-	      "Didn't find expected build ID from the map");
+	if (CHECK(build_id_matches < 1, "build id match",
+		  "Didn't find expected build ID from the map"))
+		goto disable_pmu;
+
+	stack_trace_len = PERF_MAX_STACK_DEPTH
+		* sizeof(struct bpf_stack_build_id);
+	err = compare_stack_ips(stackmap_fd, stack_amap_fd, stack_trace_len);
+	CHECK(err, "compare_stack_ips stackmap vs. stack_amap",
+	      "err %d errno %d\n", err, errno);
 
 disable_pmu:
 	ioctl(pmu_fd, PERF_EVENT_IOC_DISABLE);
diff --git a/tools/testing/selftests/bpf/test_stacktrace_build_id.c b/tools/testing/selftests/bpf/test_stacktrace_build_id.c
index b755bd7..d86c281 100644
--- a/tools/testing/selftests/bpf/test_stacktrace_build_id.c
+++ b/tools/testing/selftests/bpf/test_stacktrace_build_id.c
@@ -19,7 +19,7 @@ struct bpf_map_def SEC("maps") stackid_hmap = {
 	.type = BPF_MAP_TYPE_HASH,
 	.key_size = sizeof(__u32),
 	.value_size = sizeof(__u32),
-	.max_entries = 10000,
+	.max_entries = 16384,
 };
 
 struct bpf_map_def SEC("maps") stackmap = {
@@ -31,6 +31,14 @@ struct bpf_map_def SEC("maps") stackmap = {
 	.map_flags = BPF_F_STACK_BUILD_ID,
 };
 
+struct bpf_map_def SEC("maps") stack_amap = {
+	.type = BPF_MAP_TYPE_ARRAY,
+	.key_size = sizeof(__u32),
+	.value_size = sizeof(struct bpf_stack_build_id)
+		* PERF_MAX_STACK_DEPTH,
+	.max_entries = 128,
+};
+
 /* taken from /sys/kernel/debug/tracing/events/random/urandom_read/format */
 struct random_urandom_args {
 	unsigned long long pad;
@@ -42,7 +50,10 @@ struct random_urandom_args {
 SEC("tracepoint/random/urandom_read")
 int oncpu(struct random_urandom_args *args)
 {
+	__u32 max_len = sizeof(struct bpf_stack_build_id)
+			* PERF_MAX_STACK_DEPTH;
 	__u32 key = 0, val = 0, *value_p;
+	void *stack_p;
 
 	value_p = bpf_map_lookup_elem(&control_map, &key);
 	if (value_p && *value_p)
@@ -50,8 +61,13 @@ int oncpu(struct random_urandom_args *args)
 
 	/* The size of stackmap and stackid_hmap should be the same */
 	key = bpf_get_stackid(args, &stackmap, BPF_F_USER_STACK);
-	if ((int)key >= 0)
+	if ((int)key >= 0) {
 		bpf_map_update_elem(&stackid_hmap, &key, &val, 0);
+		stack_p = bpf_map_lookup_elem(&stack_amap, &key);
+		if (stack_p)
+			bpf_get_stack(args, stack_p, max_len,
+				      BPF_F_USER_STACK | BPF_F_USER_BUILD_ID);
+	}
 
 	return 0;
 }
diff --git a/tools/testing/selftests/bpf/test_stacktrace_map.c b/tools/testing/selftests/bpf/test_stacktrace_map.c
index 76d85c5d..af111af 100644
--- a/tools/testing/selftests/bpf/test_stacktrace_map.c
+++ b/tools/testing/selftests/bpf/test_stacktrace_map.c
@@ -19,14 +19,21 @@ struct bpf_map_def SEC("maps") stackid_hmap = {
 	.type = BPF_MAP_TYPE_HASH,
 	.key_size = sizeof(__u32),
 	.value_size = sizeof(__u32),
-	.max_entries = 10000,
+	.max_entries = 16384,
 };
 
 struct bpf_map_def SEC("maps") stackmap = {
 	.type = BPF_MAP_TYPE_STACK_TRACE,
 	.key_size = sizeof(__u32),
 	.value_size = sizeof(__u64) * PERF_MAX_STACK_DEPTH,
-	.max_entries = 10000,
+	.max_entries = 16384,
+};
+
+struct bpf_map_def SEC("maps") stack_amap = {
+	.type = BPF_MAP_TYPE_ARRAY,
+	.key_size = sizeof(__u32),
+	.value_size = sizeof(__u64) * PERF_MAX_STACK_DEPTH,
+	.max_entries = 16384,
 };
 
 /* taken from /sys/kernel/debug/tracing/events/sched/sched_switch/format */
@@ -44,7 +51,9 @@ struct sched_switch_args {
 SEC("tracepoint/sched/sched_switch")
 int oncpu(struct sched_switch_args *ctx)
 {
+	__u32 max_len = PERF_MAX_STACK_DEPTH * sizeof(__u64);
 	__u32 key = 0, val = 0, *value_p;
+	void *stack_p;
 
 	value_p = bpf_map_lookup_elem(&control_map, &key);
 	if (value_p && *value_p)
@@ -52,8 +61,12 @@ int oncpu(struct sched_switch_args *ctx)
 
 	/* The size of stackmap and stackid_hmap should be the same */
 	key = bpf_get_stackid(ctx, &stackmap, 0);
-	if ((int)key >= 0)
+	if ((int)key >= 0) {
 		bpf_map_update_elem(&stackid_hmap, &key, &val, 0);
+		stack_p = bpf_map_lookup_elem(&stack_amap, &key);
+		if (stack_p)
+			bpf_get_stack(ctx, stack_p, max_len, 0);
+	}
 
 	return 0;
 }
-- 
2.9.5

^ permalink raw reply related

* [PATCH bpf-next v9 06/10] tools/bpf: add bpf_get_stack helper to tools headers
From: Yonghong Song @ 2018-04-29  5:28 UTC (permalink / raw)
  To: ast, daniel, netdev; +Cc: kernel-team
In-Reply-To: <20180429052816.2882032-1-yhs@fb.com>

The tools header file bpf.h is synced with kernel uapi bpf.h.
The new helper is also added to bpf_helpers.h.

Signed-off-by: Yonghong Song <yhs@fb.com>
---
 tools/include/uapi/linux/bpf.h            | 42 +++++++++++++++++++++++++++++--
 tools/testing/selftests/bpf/bpf_helpers.h |  2 ++
 2 files changed, 42 insertions(+), 2 deletions(-)

diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index da77a93..1afb606 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -1767,6 +1767,40 @@ union bpf_attr {
  * 		**CONFIG_XFRM** configuration option.
  * 	Return
  * 		0 on success, or a negative error in case of failure.
+ *
+ * int bpf_get_stack(struct pt_regs *regs, void *buf, u32 size, u64 flags)
+ * 	Description
+ *		Return a user or a kernel stack in bpf program provided buffer.
+ *		To achieve this, the helper needs *ctx*, which is a pointer
+ *		to the context on which the tracing program is executed.
+ *		To store the stacktrace, the bpf program provides *buf* with
+ *		a nonnegative *size*.
+ *
+ *		The last argument, *flags*, holds the number of stack frames to
+ *		skip (from 0 to 255), masked with
+ *		**BPF_F_SKIP_FIELD_MASK**. The next bits can be used to set
+ *		the following flags:
+ *
+ *		**BPF_F_USER_STACK**
+ *			Collect a user space stack instead of a kernel stack.
+ *		**BPF_F_USER_BUILD_ID**
+ *			Collect buildid+offset instead of ips for user stack,
+ *			only valid if **BPF_F_USER_STACK** is also specified.
+ *
+ *		**bpf_get_stack**\ () can collect up to
+ *		**PERF_MAX_STACK_DEPTH** both kernel and user frames, subject
+ *		to sufficient large buffer size. Note that
+ *		this limit can be controlled with the **sysctl** program, and
+ *		that it should be manually increased in order to profile long
+ *		user stacks (such as stacks for Java programs). To do so, use:
+ *
+ *	::
+ *
+ *		# sysctl kernel.perf_event_max_stack=<new value>
+ *
+ * 	Return
+ * 		a non-negative value equal to or less than size on success, or
+ * 		a negative error in case of failure.
  */
 #define __BPF_FUNC_MAPPER(FN)		\
 	FN(unspec),			\
@@ -1835,7 +1869,8 @@ union bpf_attr {
 	FN(msg_pull_data),		\
 	FN(bind),			\
 	FN(xdp_adjust_tail),		\
-	FN(skb_get_xfrm_state),
+	FN(skb_get_xfrm_state),		\
+	FN(get_stack),
 
 /* integer value in 'imm' field of BPF_CALL instruction selects which helper
  * function eBPF program intends to call
@@ -1869,11 +1904,14 @@ enum bpf_func_id {
 /* BPF_FUNC_skb_set_tunnel_key and BPF_FUNC_skb_get_tunnel_key flags. */
 #define BPF_F_TUNINFO_IPV6		(1ULL << 0)
 
-/* BPF_FUNC_get_stackid flags. */
+/* flags for both BPF_FUNC_get_stackid and BPF_FUNC_get_stack. */
 #define BPF_F_SKIP_FIELD_MASK		0xffULL
 #define BPF_F_USER_STACK		(1ULL << 8)
+/* flags used by BPF_FUNC_get_stackid only. */
 #define BPF_F_FAST_STACK_CMP		(1ULL << 9)
 #define BPF_F_REUSE_STACKID		(1ULL << 10)
+/* flags used by BPF_FUNC_get_stack only. */
+#define BPF_F_USER_BUILD_ID		(1ULL << 11)
 
 /* BPF_FUNC_skb_set_tunnel_key flags. */
 #define BPF_F_ZERO_CSUM_TX		(1ULL << 1)
diff --git a/tools/testing/selftests/bpf/bpf_helpers.h b/tools/testing/selftests/bpf/bpf_helpers.h
index 69d7b91..265f8e0 100644
--- a/tools/testing/selftests/bpf/bpf_helpers.h
+++ b/tools/testing/selftests/bpf/bpf_helpers.h
@@ -101,6 +101,8 @@ static int (*bpf_xdp_adjust_tail)(void *ctx, int offset) =
 static int (*bpf_skb_get_xfrm_state)(void *ctx, int index, void *state,
 				     int size, int flags) =
 	(void *) BPF_FUNC_skb_get_xfrm_state;
+static int (*bpf_get_stack)(void *ctx, void *buf, int size, int flags) =
+	(void *) BPF_FUNC_get_stack;
 
 /* llvm builtin functions that eBPF C program may use to
  * emit BPF_LD_ABS and BPF_LD_IND instructions
-- 
2.9.5

^ permalink raw reply related

* [PATCH bpf-next v9 00/10] bpf: add bpf_get_stack helper
From: Yonghong Song @ 2018-04-29  5:28 UTC (permalink / raw)
  To: ast, daniel, netdev; +Cc: kernel-team

Currently, stackmap and bpf_get_stackid helper are provided
for bpf program to get the stack trace. This approach has
a limitation though. If two stack traces have the same hash,
only one will get stored in the stackmap table regardless of
whether BPF_F_REUSE_STACKID is specified or not,
so some stack traces may be missing from user perspective.

This patch implements a new helper, bpf_get_stack, will
send stack traces directly to bpf program. The bpf program
is able to see all stack traces, and then can do in-kernel
processing or send stack traces to user space through
shared map or bpf_perf_event_output.

Patches #1 and #2 implemented the core kernel support.
Patch #3 removes two never-hit branches in verifier.
Patches #4 and #5 are two verifier improves to make
bpf programming easier. Patch #6 synced the new helper
to tools headers. Patch #7 moved perf_event polling code
and ksym lookup code from samples/bpf to
tools/testing/selftests/bpf. Patch #8 added a verifier
test in tools/bpf for new verifier change.
Patches #9 and #10 added tests for raw tracepoint prog
and tracepoint prog respectively.

Changelogs:
  v8 -> v9:
    . make function perf_event_mmap (in trace_helpers.c) extern
      to decouple perf_event_mmap and perf_event_poller.
    . add jit enabled handling for kernel stack verification
      in Patch #9. Since we did not have a good way to
      verify jit enabled kernel stack, just return true if
      the kernel stack is not empty.
    . In path #9, using raw_syscalls/sys_enter instead of
      sched/sched_switch, removed calling cmd
      "task 1 dd if=/dev/zero of=/dev/null" which is left
      with dangling process after the program exited.

  v7 -> v8:
    . rebase on top of latest bpf-next
    . simplify BPF_ARSH dst_reg->smin_val/smax_value tracking
    . rewrite the description of bpf_get_stack() in uapi bpf.h
      based on new format.
  v6 -> v7:
    . do perf callchain buffer allocation inside the
      verifier. so if the prog->has_callchain_buf is set,
      it is guaranteed that the buffer has been allocated.
    . change condition "trace_nr <= skip" to "trace_nr < skip"
      so that for zero size buffer, return 0 instead of -EFAULT
  v5 -> v6:
    . after refining return register smax_value and umax_value
      for helpers bpf_get_stack and bpf_probe_read_str,
      bounds and var_off of the return register are further refined.
    . added missing commit message for tools header sync commit.
    . removed one unnecessary empty line.
  v4 -> v5:
    . relied on dst_reg->var_off to refine umin_val/umax_val
      in verifier handling BPF_ARSH value range tracking,
      suggested by Edward.
  v3 -> v4:
    . fixed a bug when meta ptr is set to NULL in check_func_arg.
    . introduced tnum_arshift and added detailed comments for
      the underlying implementation
    . avoided using VLA in tools/bpf test_progs.
  v2 -> v3:
    . used meta to track helper memory size argument
    . implemented range checking for ARSH in verifier
    . moved perf event polling and ksym related functions
      from samples/bpf to tools/bpf
    . added test to compare build id's between bpf_get_stackid
      and bpf_get_stack
  v1 -> v2:
    . fixed compilation error when CONFIG_PERF_EVENTS is not enabled

Yonghong Song (10):
  bpf: change prototype for stack_map_get_build_id_offset
  bpf: add bpf_get_stack helper
  bpf/verifier: refine retval R0 state for bpf_get_stack helper
  bpf: remove never-hit branches in verifier adjust_scalar_min_max_vals
  bpf/verifier: improve register value range tracking with ARSH
  tools/bpf: add bpf_get_stack helper to tools headers
  samples/bpf: move common-purpose trace functions to selftests
  tools/bpf: add a verifier test case for bpf_get_stack helper and ARSH
  tools/bpf: add a test for bpf_get_stack with raw tracepoint prog
  tools/bpf: add a test for bpf_get_stack with tracepoint prog

 include/linux/bpf.h                                |   1 +
 include/linux/filter.h                             |   3 +-
 include/linux/tnum.h                               |   4 +-
 include/uapi/linux/bpf.h                           |  42 +++-
 kernel/bpf/core.c                                  |   5 +
 kernel/bpf/stackmap.c                              |  80 ++++++-
 kernel/bpf/tnum.c                                  |  10 +
 kernel/bpf/verifier.c                              |  80 ++++++-
 kernel/trace/bpf_trace.c                           |  50 ++++-
 samples/bpf/Makefile                               |  11 +-
 samples/bpf/bpf_load.c                             |  63 ------
 samples/bpf/bpf_load.h                             |   7 -
 samples/bpf/offwaketime_user.c                     |   1 +
 samples/bpf/sampleip_user.c                        |   1 +
 samples/bpf/spintest_user.c                        |   1 +
 samples/bpf/trace_event_user.c                     |   1 +
 samples/bpf/trace_output_user.c                    | 110 +---------
 tools/include/uapi/linux/bpf.h                     |  42 +++-
 tools/testing/selftests/bpf/Makefile               |   4 +-
 tools/testing/selftests/bpf/bpf_helpers.h          |   2 +
 tools/testing/selftests/bpf/test_get_stack_rawtp.c | 102 +++++++++
 tools/testing/selftests/bpf/test_progs.c           | 242 +++++++++++++++++++--
 .../selftests/bpf/test_stacktrace_build_id.c       |  20 +-
 tools/testing/selftests/bpf/test_stacktrace_map.c  |  19 +-
 tools/testing/selftests/bpf/test_verifier.c        |  45 ++++
 tools/testing/selftests/bpf/trace_helpers.c        | 180 +++++++++++++++
 tools/testing/selftests/bpf/trace_helpers.h        |  23 ++
 27 files changed, 927 insertions(+), 222 deletions(-)
 create mode 100644 tools/testing/selftests/bpf/test_get_stack_rawtp.c
 create mode 100644 tools/testing/selftests/bpf/trace_helpers.c
 create mode 100644 tools/testing/selftests/bpf/trace_helpers.h

-- 
2.9.5

^ permalink raw reply

* [PATCH bpf-next v9 07/10] samples/bpf: move common-purpose trace functions to selftests
From: Yonghong Song @ 2018-04-29  5:28 UTC (permalink / raw)
  To: ast, daniel, netdev; +Cc: kernel-team
In-Reply-To: <20180429052816.2882032-1-yhs@fb.com>

There is no functionality change in this patch. The common-purpose
trace functions, including perf_event polling and ksym lookup,
are moved from trace_output_user.c and bpf_load.c to
selftests/bpf/trace_helpers.c so that these function can
be reused later in selftests.

Acked-by: Alexei Starovoitov <ast@fb.com>
Signed-off-by: Yonghong Song <yhs@fb.com>
---
 samples/bpf/Makefile                        |  11 +-
 samples/bpf/bpf_load.c                      |  63 ----------
 samples/bpf/bpf_load.h                      |   7 --
 samples/bpf/offwaketime_user.c              |   1 +
 samples/bpf/sampleip_user.c                 |   1 +
 samples/bpf/spintest_user.c                 |   1 +
 samples/bpf/trace_event_user.c              |   1 +
 samples/bpf/trace_output_user.c             | 110 ++---------------
 tools/testing/selftests/bpf/trace_helpers.c | 180 ++++++++++++++++++++++++++++
 tools/testing/selftests/bpf/trace_helpers.h |  23 ++++
 10 files changed, 223 insertions(+), 175 deletions(-)
 create mode 100644 tools/testing/selftests/bpf/trace_helpers.c
 create mode 100644 tools/testing/selftests/bpf/trace_helpers.h

diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile
index b853581..5e31770 100644
--- a/samples/bpf/Makefile
+++ b/samples/bpf/Makefile
@@ -49,6 +49,7 @@ hostprogs-y += xdp_adjust_tail
 # Libbpf dependencies
 LIBBPF := ../../tools/lib/bpf/bpf.o ../../tools/lib/bpf/nlattr.o
 CGROUP_HELPERS := ../../tools/testing/selftests/bpf/cgroup_helpers.o
+TRACE_HELPERS := ../../tools/testing/selftests/bpf/trace_helpers.o
 
 test_lru_dist-objs := test_lru_dist.o $(LIBBPF)
 sock_example-objs := sock_example.o $(LIBBPF)
@@ -65,10 +66,10 @@ tracex6-objs := bpf_load.o $(LIBBPF) tracex6_user.o
 tracex7-objs := bpf_load.o $(LIBBPF) tracex7_user.o
 load_sock_ops-objs := bpf_load.o $(LIBBPF) load_sock_ops.o
 test_probe_write_user-objs := bpf_load.o $(LIBBPF) test_probe_write_user_user.o
-trace_output-objs := bpf_load.o $(LIBBPF) trace_output_user.o
+trace_output-objs := bpf_load.o $(LIBBPF) trace_output_user.o $(TRACE_HELPERS)
 lathist-objs := bpf_load.o $(LIBBPF) lathist_user.o
-offwaketime-objs := bpf_load.o $(LIBBPF) offwaketime_user.o
-spintest-objs := bpf_load.o $(LIBBPF) spintest_user.o
+offwaketime-objs := bpf_load.o $(LIBBPF) offwaketime_user.o $(TRACE_HELPERS)
+spintest-objs := bpf_load.o $(LIBBPF) spintest_user.o $(TRACE_HELPERS)
 map_perf_test-objs := bpf_load.o $(LIBBPF) map_perf_test_user.o
 test_overhead-objs := bpf_load.o $(LIBBPF) test_overhead_user.o
 test_cgrp2_array_pin-objs := $(LIBBPF) test_cgrp2_array_pin.o
@@ -82,8 +83,8 @@ xdp2-objs := bpf_load.o $(LIBBPF) xdp1_user.o
 xdp_router_ipv4-objs := bpf_load.o $(LIBBPF) xdp_router_ipv4_user.o
 test_current_task_under_cgroup-objs := bpf_load.o $(LIBBPF) $(CGROUP_HELPERS) \
 				       test_current_task_under_cgroup_user.o
-trace_event-objs := bpf_load.o $(LIBBPF) trace_event_user.o
-sampleip-objs := bpf_load.o $(LIBBPF) sampleip_user.o
+trace_event-objs := bpf_load.o $(LIBBPF) trace_event_user.o $(TRACE_HELPERS)
+sampleip-objs := bpf_load.o $(LIBBPF) sampleip_user.o $(TRACE_HELPERS)
 tc_l2_redirect-objs := bpf_load.o $(LIBBPF) tc_l2_redirect_user.o
 lwt_len_hist-objs := bpf_load.o $(LIBBPF) lwt_len_hist_user.o
 xdp_tx_iptunnel-objs := bpf_load.o $(LIBBPF) xdp_tx_iptunnel_user.o
diff --git a/samples/bpf/bpf_load.c b/samples/bpf/bpf_load.c
index feca497..a27ef3c 100644
--- a/samples/bpf/bpf_load.c
+++ b/samples/bpf/bpf_load.c
@@ -648,66 +648,3 @@ void read_trace_pipe(void)
 		}
 	}
 }
-
-#define MAX_SYMS 300000
-static struct ksym syms[MAX_SYMS];
-static int sym_cnt;
-
-static int ksym_cmp(const void *p1, const void *p2)
-{
-	return ((struct ksym *)p1)->addr - ((struct ksym *)p2)->addr;
-}
-
-int load_kallsyms(void)
-{
-	FILE *f = fopen("/proc/kallsyms", "r");
-	char func[256], buf[256];
-	char symbol;
-	void *addr;
-	int i = 0;
-
-	if (!f)
-		return -ENOENT;
-
-	while (!feof(f)) {
-		if (!fgets(buf, sizeof(buf), f))
-			break;
-		if (sscanf(buf, "%p %c %s", &addr, &symbol, func) != 3)
-			break;
-		if (!addr)
-			continue;
-		syms[i].addr = (long) addr;
-		syms[i].name = strdup(func);
-		i++;
-	}
-	sym_cnt = i;
-	qsort(syms, sym_cnt, sizeof(struct ksym), ksym_cmp);
-	return 0;
-}
-
-struct ksym *ksym_search(long key)
-{
-	int start = 0, end = sym_cnt;
-	int result;
-
-	while (start < end) {
-		size_t mid = start + (end - start) / 2;
-
-		result = key - syms[mid].addr;
-		if (result < 0)
-			end = mid;
-		else if (result > 0)
-			start = mid + 1;
-		else
-			return &syms[mid];
-	}
-
-	if (start >= 1 && syms[start - 1].addr < key &&
-	    key < syms[start].addr)
-		/* valid ksym */
-		return &syms[start - 1];
-
-	/* out of range. return _stext */
-	return &syms[0];
-}
-
diff --git a/samples/bpf/bpf_load.h b/samples/bpf/bpf_load.h
index 453c200..2c3d0b4 100644
--- a/samples/bpf/bpf_load.h
+++ b/samples/bpf/bpf_load.h
@@ -54,12 +54,5 @@ int load_bpf_file(char *path);
 int load_bpf_file_fixup_map(const char *path, fixup_map_cb fixup_map);
 
 void read_trace_pipe(void);
-struct ksym {
-	long addr;
-	char *name;
-};
-
-int load_kallsyms(void);
-struct ksym *ksym_search(long key);
 int bpf_set_link_xdp_fd(int ifindex, int fd, __u32 flags);
 #endif
diff --git a/samples/bpf/offwaketime_user.c b/samples/bpf/offwaketime_user.c
index 512f87a..f06063a 100644
--- a/samples/bpf/offwaketime_user.c
+++ b/samples/bpf/offwaketime_user.c
@@ -17,6 +17,7 @@
 #include <sys/resource.h>
 #include "libbpf.h"
 #include "bpf_load.h"
+#include "trace_helpers.h"
 
 #define PRINT_RAW_ADDR 0
 
diff --git a/samples/bpf/sampleip_user.c b/samples/bpf/sampleip_user.c
index 4ed690b..60c2b73 100644
--- a/samples/bpf/sampleip_user.c
+++ b/samples/bpf/sampleip_user.c
@@ -22,6 +22,7 @@
 #include "libbpf.h"
 #include "bpf_load.h"
 #include "perf-sys.h"
+#include "trace_helpers.h"
 
 #define DEFAULT_FREQ	99
 #define DEFAULT_SECS	5
diff --git a/samples/bpf/spintest_user.c b/samples/bpf/spintest_user.c
index 3d73621..8d3e9cf 100644
--- a/samples/bpf/spintest_user.c
+++ b/samples/bpf/spintest_user.c
@@ -7,6 +7,7 @@
 #include <sys/resource.h>
 #include "libbpf.h"
 #include "bpf_load.h"
+#include "trace_helpers.h"
 
 int main(int ac, char **argv)
 {
diff --git a/samples/bpf/trace_event_user.c b/samples/bpf/trace_event_user.c
index 56f7a25..1fa1bec 100644
--- a/samples/bpf/trace_event_user.c
+++ b/samples/bpf/trace_event_user.c
@@ -21,6 +21,7 @@
 #include "libbpf.h"
 #include "bpf_load.h"
 #include "perf-sys.h"
+#include "trace_helpers.h"
 
 #define SAMPLE_FREQ 50
 
diff --git a/samples/bpf/trace_output_user.c b/samples/bpf/trace_output_user.c
index ccca1e3..5e78c2e 100644
--- a/samples/bpf/trace_output_user.c
+++ b/samples/bpf/trace_output_user.c
@@ -21,100 +21,10 @@
 #include "libbpf.h"
 #include "bpf_load.h"
 #include "perf-sys.h"
+#include "trace_helpers.h"
 
 static int pmu_fd;
 
-int page_size;
-int page_cnt = 8;
-volatile struct perf_event_mmap_page *header;
-
-typedef void (*print_fn)(void *data, int size);
-
-static int perf_event_mmap(int fd)
-{
-	void *base;
-	int mmap_size;
-
-	page_size = getpagesize();
-	mmap_size = page_size * (page_cnt + 1);
-
-	base = mmap(NULL, mmap_size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
-	if (base == MAP_FAILED) {
-		printf("mmap err\n");
-		return -1;
-	}
-
-	header = base;
-	return 0;
-}
-
-static int perf_event_poll(int fd)
-{
-	struct pollfd pfd = { .fd = fd, .events = POLLIN };
-
-	return poll(&pfd, 1, 1000);
-}
-
-struct perf_event_sample {
-	struct perf_event_header header;
-	__u32 size;
-	char data[];
-};
-
-static void perf_event_read(print_fn fn)
-{
-	__u64 data_tail = header->data_tail;
-	__u64 data_head = header->data_head;
-	__u64 buffer_size = page_cnt * page_size;
-	void *base, *begin, *end;
-	char buf[256];
-
-	asm volatile("" ::: "memory"); /* in real code it should be smp_rmb() */
-	if (data_head == data_tail)
-		return;
-
-	base = ((char *)header) + page_size;
-
-	begin = base + data_tail % buffer_size;
-	end = base + data_head % buffer_size;
-
-	while (begin != end) {
-		struct perf_event_sample *e;
-
-		e = begin;
-		if (begin + e->header.size > base + buffer_size) {
-			long len = base + buffer_size - begin;
-
-			assert(len < e->header.size);
-			memcpy(buf, begin, len);
-			memcpy(buf + len, base, e->header.size - len);
-			e = (void *) buf;
-			begin = base + e->header.size - len;
-		} else if (begin + e->header.size == base + buffer_size) {
-			begin = base;
-		} else {
-			begin += e->header.size;
-		}
-
-		if (e->header.type == PERF_RECORD_SAMPLE) {
-			fn(e->data, e->size);
-		} else if (e->header.type == PERF_RECORD_LOST) {
-			struct {
-				struct perf_event_header header;
-				__u64 id;
-				__u64 lost;
-			} *lost = (void *) e;
-			printf("lost %lld events\n", lost->lost);
-		} else {
-			printf("unknown event type=%d size=%d\n",
-			       e->header.type, e->header.size);
-		}
-	}
-
-	__sync_synchronize(); /* smp_mb() */
-	header->data_tail = data_head;
-}
-
 static __u64 time_get_ns(void)
 {
 	struct timespec ts;
@@ -127,7 +37,7 @@ static __u64 start_time;
 
 #define MAX_CNT 100000ll
 
-static void print_bpf_output(void *data, int size)
+static int print_bpf_output(void *data, int size)
 {
 	static __u64 cnt;
 	struct {
@@ -138,7 +48,7 @@ static void print_bpf_output(void *data, int size)
 	if (e->cookie != 0x12345678) {
 		printf("BUG pid %llx cookie %llx sized %d\n",
 		       e->pid, e->cookie, size);
-		kill(0, SIGINT);
+		return PERF_EVENT_ERROR;
 	}
 
 	cnt++;
@@ -146,8 +56,10 @@ static void print_bpf_output(void *data, int size)
 	if (cnt == MAX_CNT) {
 		printf("recv %lld events per sec\n",
 		       MAX_CNT * 1000000000ll / (time_get_ns() - start_time));
-		kill(0, SIGINT);
+		return PERF_EVENT_DONE;
 	}
+
+	return PERF_EVENT_CONT;
 }
 
 static void test_bpf_perf_event(void)
@@ -170,6 +82,7 @@ int main(int argc, char **argv)
 {
 	char filename[256];
 	FILE *f;
+	int ret;
 
 	snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]);
 
@@ -187,10 +100,7 @@ int main(int argc, char **argv)
 	(void) f;
 
 	start_time = time_get_ns();
-	for (;;) {
-		perf_event_poll(pmu_fd);
-		perf_event_read(print_bpf_output);
-	}
-
-	return 0;
+	ret = perf_event_poller(pmu_fd, print_bpf_output);
+	kill(0, SIGINT);
+	return ret;
 }
diff --git a/tools/testing/selftests/bpf/trace_helpers.c b/tools/testing/selftests/bpf/trace_helpers.c
new file mode 100644
index 0000000..ad025bd
--- /dev/null
+++ b/tools/testing/selftests/bpf/trace_helpers.c
@@ -0,0 +1,180 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <assert.h>
+#include <errno.h>
+#include <poll.h>
+#include <unistd.h>
+#include <linux/perf_event.h>
+#include <sys/mman.h>
+#include "trace_helpers.h"
+
+#define MAX_SYMS 300000
+static struct ksym syms[MAX_SYMS];
+static int sym_cnt;
+
+static int ksym_cmp(const void *p1, const void *p2)
+{
+	return ((struct ksym *)p1)->addr - ((struct ksym *)p2)->addr;
+}
+
+int load_kallsyms(void)
+{
+	FILE *f = fopen("/proc/kallsyms", "r");
+	char func[256], buf[256];
+	char symbol;
+	void *addr;
+	int i = 0;
+
+	if (!f)
+		return -ENOENT;
+
+	while (!feof(f)) {
+		if (!fgets(buf, sizeof(buf), f))
+			break;
+		if (sscanf(buf, "%p %c %s", &addr, &symbol, func) != 3)
+			break;
+		if (!addr)
+			continue;
+		syms[i].addr = (long) addr;
+		syms[i].name = strdup(func);
+		i++;
+	}
+	sym_cnt = i;
+	qsort(syms, sym_cnt, sizeof(struct ksym), ksym_cmp);
+	return 0;
+}
+
+struct ksym *ksym_search(long key)
+{
+	int start = 0, end = sym_cnt;
+	int result;
+
+	while (start < end) {
+		size_t mid = start + (end - start) / 2;
+
+		result = key - syms[mid].addr;
+		if (result < 0)
+			end = mid;
+		else if (result > 0)
+			start = mid + 1;
+		else
+			return &syms[mid];
+	}
+
+	if (start >= 1 && syms[start - 1].addr < key &&
+	    key < syms[start].addr)
+		/* valid ksym */
+		return &syms[start - 1];
+
+	/* out of range. return _stext */
+	return &syms[0];
+}
+
+static int page_size;
+static int page_cnt = 8;
+static volatile struct perf_event_mmap_page *header;
+
+int perf_event_mmap(int fd)
+{
+	void *base;
+	int mmap_size;
+
+	page_size = getpagesize();
+	mmap_size = page_size * (page_cnt + 1);
+
+	base = mmap(NULL, mmap_size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
+	if (base == MAP_FAILED) {
+		printf("mmap err\n");
+		return -1;
+	}
+
+	header = base;
+	return 0;
+}
+
+static int perf_event_poll(int fd)
+{
+	struct pollfd pfd = { .fd = fd, .events = POLLIN };
+
+	return poll(&pfd, 1, 1000);
+}
+
+struct perf_event_sample {
+	struct perf_event_header header;
+	__u32 size;
+	char data[];
+};
+
+static int perf_event_read(perf_event_print_fn fn)
+{
+	__u64 data_tail = header->data_tail;
+	__u64 data_head = header->data_head;
+	__u64 buffer_size = page_cnt * page_size;
+	void *base, *begin, *end;
+	char buf[256];
+	int ret;
+
+	asm volatile("" ::: "memory"); /* in real code it should be smp_rmb() */
+	if (data_head == data_tail)
+		return PERF_EVENT_CONT;
+
+	base = ((char *)header) + page_size;
+
+	begin = base + data_tail % buffer_size;
+	end = base + data_head % buffer_size;
+
+	while (begin != end) {
+		struct perf_event_sample *e;
+
+		e = begin;
+		if (begin + e->header.size > base + buffer_size) {
+			long len = base + buffer_size - begin;
+
+			assert(len < e->header.size);
+			memcpy(buf, begin, len);
+			memcpy(buf + len, base, e->header.size - len);
+			e = (void *) buf;
+			begin = base + e->header.size - len;
+		} else if (begin + e->header.size == base + buffer_size) {
+			begin = base;
+		} else {
+			begin += e->header.size;
+		}
+
+		if (e->header.type == PERF_RECORD_SAMPLE) {
+			ret = fn(e->data, e->size);
+			if (ret != PERF_EVENT_CONT)
+				return ret;
+		} else if (e->header.type == PERF_RECORD_LOST) {
+			struct {
+				struct perf_event_header header;
+				__u64 id;
+				__u64 lost;
+			} *lost = (void *) e;
+			printf("lost %lld events\n", lost->lost);
+		} else {
+			printf("unknown event type=%d size=%d\n",
+			       e->header.type, e->header.size);
+		}
+	}
+
+	__sync_synchronize(); /* smp_mb() */
+	header->data_tail = data_head;
+	return PERF_EVENT_CONT;
+}
+
+int perf_event_poller(int fd, perf_event_print_fn output_fn)
+{
+	int ret;
+
+	for (;;) {
+		perf_event_poll(fd);
+		ret = perf_event_read(output_fn);
+		if (ret != PERF_EVENT_CONT)
+			return ret;
+	}
+
+	return PERF_EVENT_DONE;
+}
diff --git a/tools/testing/selftests/bpf/trace_helpers.h b/tools/testing/selftests/bpf/trace_helpers.h
new file mode 100644
index 0000000..fe3eefd
--- /dev/null
+++ b/tools/testing/selftests/bpf/trace_helpers.h
@@ -0,0 +1,23 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __TRACE_HELPER_H
+#define __TRACE_HELPER_H
+
+struct ksym {
+	long addr;
+	char *name;
+};
+
+int load_kallsyms(void);
+struct ksym *ksym_search(long key);
+
+typedef int (*perf_event_print_fn)(void *data, int size);
+
+/* return code for perf_event_print_fn */
+#define PERF_EVENT_DONE		0
+#define PERF_EVENT_ERROR	-1
+#define PERF_EVENT_CONT		-2
+
+int perf_event_mmap(int fd);
+/* return PERF_EVENT_DONE or PERF_EVENT_ERROR */
+int perf_event_poller(int fd, perf_event_print_fn output_fn);
+#endif
-- 
2.9.5

^ permalink raw reply related

* [PATCH bpf-next v9 09/10] tools/bpf: add a test for bpf_get_stack with raw tracepoint prog
From: Yonghong Song @ 2018-04-29  5:28 UTC (permalink / raw)
  To: ast, daniel, netdev; +Cc: kernel-team
In-Reply-To: <20180429052816.2882032-1-yhs@fb.com>

The test attached a raw_tracepoint program to raw_syscalls/sys_enter.
It tested to get stack for user space, kernel space and user
space with build_id request. It also tested to get user
and kernel stack into the same buffer with back-to-back
bpf_get_stack helper calls.

If jit is not enabled, the user space application will check
to ensure that the kernel function for raw_tracepoint
___bpf_prog_run is part of the stack.

If jit is enabled, we did not have a reliable way to
verify the kernel stack, so just assume the kernel stack
is good when the kernel stack size is greater than 0.

Signed-off-by: Yonghong Song <yhs@fb.com>
---
 tools/testing/selftests/bpf/Makefile               |   4 +-
 tools/testing/selftests/bpf/test_get_stack_rawtp.c | 102 ++++++++++++
 tools/testing/selftests/bpf/test_progs.c           | 172 +++++++++++++++++++--
 3 files changed, 266 insertions(+), 12 deletions(-)
 create mode 100644 tools/testing/selftests/bpf/test_get_stack_rawtp.c

diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile
index b64a7a3..9d76218 100644
--- a/tools/testing/selftests/bpf/Makefile
+++ b/tools/testing/selftests/bpf/Makefile
@@ -32,7 +32,8 @@ TEST_GEN_FILES = test_pkt_access.o test_xdp.o test_l4lb.o test_tcp_estats.o test
 	test_l4lb_noinline.o test_xdp_noinline.o test_stacktrace_map.o \
 	sample_map_ret0.o test_tcpbpf_kern.o test_stacktrace_build_id.o \
 	sockmap_tcp_msg_prog.o connect4_prog.o connect6_prog.o test_adjust_tail.o \
-	test_btf_haskv.o test_btf_nokv.o test_sockmap_kern.o test_tunnel_kern.o
+	test_btf_haskv.o test_btf_nokv.o test_sockmap_kern.o test_tunnel_kern.o \
+	test_get_stack_rawtp.o
 
 # Order correspond to 'make run_tests' order
 TEST_PROGS := test_kmod.sh \
@@ -58,6 +59,7 @@ $(OUTPUT)/test_dev_cgroup: cgroup_helpers.c
 $(OUTPUT)/test_sock: cgroup_helpers.c
 $(OUTPUT)/test_sock_addr: cgroup_helpers.c
 $(OUTPUT)/test_sockmap: cgroup_helpers.c
+$(OUTPUT)/test_progs: trace_helpers.c
 
 .PHONY: force
 
diff --git a/tools/testing/selftests/bpf/test_get_stack_rawtp.c b/tools/testing/selftests/bpf/test_get_stack_rawtp.c
new file mode 100644
index 0000000..f6d9f23
--- /dev/null
+++ b/tools/testing/selftests/bpf/test_get_stack_rawtp.c
@@ -0,0 +1,102 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <linux/bpf.h>
+#include "bpf_helpers.h"
+
+/* Permit pretty deep stack traces */
+#define MAX_STACK_RAWTP 100
+struct stack_trace_t {
+	int pid;
+	int kern_stack_size;
+	int user_stack_size;
+	int user_stack_buildid_size;
+	__u64 kern_stack[MAX_STACK_RAWTP];
+	__u64 user_stack[MAX_STACK_RAWTP];
+	struct bpf_stack_build_id user_stack_buildid[MAX_STACK_RAWTP];
+};
+
+struct bpf_map_def SEC("maps") perfmap = {
+	.type = BPF_MAP_TYPE_PERF_EVENT_ARRAY,
+	.key_size = sizeof(int),
+	.value_size = sizeof(__u32),
+	.max_entries = 2,
+};
+
+struct bpf_map_def SEC("maps") stackdata_map = {
+	.type = BPF_MAP_TYPE_PERCPU_ARRAY,
+	.key_size = sizeof(__u32),
+	.value_size = sizeof(struct stack_trace_t),
+	.max_entries = 1,
+};
+
+/* Allocate per-cpu space twice the needed. For the code below
+ *   usize = bpf_get_stack(ctx, raw_data, max_len, BPF_F_USER_STACK);
+ *   if (usize < 0)
+ *     return 0;
+ *   ksize = bpf_get_stack(ctx, raw_data + usize, max_len - usize, 0);
+ *
+ * If we have value_size = MAX_STACK_RAWTP * sizeof(__u64),
+ * verifier will complain that access "raw_data + usize"
+ * with size "max_len - usize" may be out of bound.
+ * The maximum "raw_data + usize" is "raw_data + max_len"
+ * and the maximum "max_len - usize" is "max_len", verifier
+ * concludes that the maximum buffer access range is
+ * "raw_data[0...max_len * 2 - 1]" and hence reject the program.
+ *
+ * Doubling the to-be-used max buffer size can fix this verifier
+ * issue and avoid complicated C programming massaging.
+ * This is an acceptable workaround since there is one entry here.
+ */
+struct bpf_map_def SEC("maps") rawdata_map = {
+	.type = BPF_MAP_TYPE_PERCPU_ARRAY,
+	.key_size = sizeof(__u32),
+	.value_size = MAX_STACK_RAWTP * sizeof(__u64) * 2,
+	.max_entries = 1,
+};
+
+SEC("tracepoint/raw_syscalls/sys_enter")
+int bpf_prog1(void *ctx)
+{
+	int max_len, max_buildid_len, usize, ksize, total_size;
+	struct stack_trace_t *data;
+	void *raw_data;
+	__u32 key = 0;
+
+	data = bpf_map_lookup_elem(&stackdata_map, &key);
+	if (!data)
+		return 0;
+
+	max_len = MAX_STACK_RAWTP * sizeof(__u64);
+	max_buildid_len = MAX_STACK_RAWTP * sizeof(struct bpf_stack_build_id);
+	data->pid = bpf_get_current_pid_tgid();
+	data->kern_stack_size = bpf_get_stack(ctx, data->kern_stack,
+					      max_len, 0);
+	data->user_stack_size = bpf_get_stack(ctx, data->user_stack, max_len,
+					    BPF_F_USER_STACK);
+	data->user_stack_buildid_size = bpf_get_stack(
+		ctx, data->user_stack_buildid, max_buildid_len,
+		BPF_F_USER_STACK | BPF_F_USER_BUILD_ID);
+	bpf_perf_event_output(ctx, &perfmap, 0, data, sizeof(*data));
+
+	/* write both kernel and user stacks to the same buffer */
+	raw_data = bpf_map_lookup_elem(&rawdata_map, &key);
+	if (!raw_data)
+		return 0;
+
+	usize = bpf_get_stack(ctx, raw_data, max_len, BPF_F_USER_STACK);
+	if (usize < 0)
+		return 0;
+
+	ksize = bpf_get_stack(ctx, raw_data + usize, max_len - usize, 0);
+	if (ksize < 0)
+		return 0;
+
+	total_size = usize + ksize;
+	if (total_size > 0 && total_size <= max_len)
+		bpf_perf_event_output(ctx, &perfmap, 0, raw_data, total_size);
+
+	return 0;
+}
+
+char _license[] SEC("license") = "GPL";
+__u32 _version SEC("version") = 1; /* ignored by tracepoints, required by libbpf.a */
diff --git a/tools/testing/selftests/bpf/test_progs.c b/tools/testing/selftests/bpf/test_progs.c
index eedda98..0ddbf34 100644
--- a/tools/testing/selftests/bpf/test_progs.c
+++ b/tools/testing/selftests/bpf/test_progs.c
@@ -38,8 +38,10 @@ typedef __u16 __sum16;
 #include "bpf_util.h"
 #include "bpf_endian.h"
 #include "bpf_rlimit.h"
+#include "trace_helpers.h"
 
 static int error_cnt, pass_cnt;
+static bool jit_enabled;
 
 #define MAGIC_BYTES 123
 
@@ -391,13 +393,30 @@ static inline __u64 ptr_to_u64(const void *ptr)
 	return (__u64) (unsigned long) ptr;
 }
 
+static bool is_jit_enabled(void)
+{
+	const char *jit_sysctl = "/proc/sys/net/core/bpf_jit_enable";
+	bool enabled = false;
+	int sysctl_fd;
+
+	sysctl_fd = open(jit_sysctl, 0, O_RDONLY);
+	if (sysctl_fd != -1) {
+		char tmpc;
+
+		if (read(sysctl_fd, &tmpc, sizeof(tmpc)) == 1)
+			enabled = (tmpc != '0');
+		close(sysctl_fd);
+	}
+
+	return enabled;
+}
+
 static void test_bpf_obj_id(void)
 {
 	const __u64 array_magic_value = 0xfaceb00c;
 	const __u32 array_key = 0;
 	const int nr_iters = 2;
 	const char *file = "./test_obj_id.o";
-	const char *jit_sysctl = "/proc/sys/net/core/bpf_jit_enable";
 	const char *expected_prog_name = "test_obj_id";
 	const char *expected_map_name = "test_map_id";
 	const __u64 nsec_per_sec = 1000000000;
@@ -414,20 +433,11 @@ static void test_bpf_obj_id(void)
 	char jited_insns[128], xlated_insns[128], zeros[128];
 	__u32 i, next_id, info_len, nr_id_found, duration = 0;
 	struct timespec real_time_ts, boot_time_ts;
-	int sysctl_fd, jit_enabled = 0, err = 0;
+	int err = 0;
 	__u64 array_value;
 	uid_t my_uid = getuid();
 	time_t now, load_time;
 
-	sysctl_fd = open(jit_sysctl, 0, O_RDONLY);
-	if (sysctl_fd != -1) {
-		char tmpc;
-
-		if (read(sysctl_fd, &tmpc, sizeof(tmpc)) == 1)
-			jit_enabled = (tmpc != '0');
-		close(sysctl_fd);
-	}
-
 	err = bpf_prog_get_fd_by_id(0);
 	CHECK(err >= 0 || errno != ENOENT,
 	      "get-fd-by-notexist-prog-id", "err %d errno %d\n", err, errno);
@@ -1204,8 +1214,147 @@ static void test_stacktrace_build_id(void)
 	return;
 }
 
+#define MAX_CNT_RAWTP	10ull
+#define MAX_STACK_RAWTP	100
+struct get_stack_trace_t {
+	int pid;
+	int kern_stack_size;
+	int user_stack_size;
+	int user_stack_buildid_size;
+	__u64 kern_stack[MAX_STACK_RAWTP];
+	__u64 user_stack[MAX_STACK_RAWTP];
+	struct bpf_stack_build_id user_stack_buildid[MAX_STACK_RAWTP];
+};
+
+static int get_stack_print_output(void *data, int size)
+{
+	bool good_kern_stack = false, good_user_stack = false;
+	const char *nonjit_func = "___bpf_prog_run";
+	struct get_stack_trace_t *e = data;
+	int i, num_stack;
+	static __u64 cnt;
+	struct ksym *ks;
+
+	cnt++;
+
+	if (size < sizeof(struct get_stack_trace_t)) {
+		__u64 *raw_data = data;
+		bool found = false;
+
+		num_stack = size / sizeof(__u64);
+		/* If jit is enabled, we do not have a good way to
+		 * verify the sanity of the kernel stack. So we
+		 * just assume it is good if the stack is not empty.
+		 * This could be improved in the future.
+		 */
+		if (jit_enabled) {
+			found = num_stack > 0;
+		} else {
+			for (i = 0; i < num_stack; i++) {
+				ks = ksym_search(raw_data[i]);
+				if (strcmp(ks->name, nonjit_func) == 0) {
+					found = true;
+					break;
+				}
+			}
+		}
+		if (found) {
+			good_kern_stack = true;
+			good_user_stack = true;
+		}
+	} else {
+		num_stack = e->kern_stack_size / sizeof(__u64);
+		if (jit_enabled) {
+			good_kern_stack = num_stack > 0;
+		} else {
+			for (i = 0; i < num_stack; i++) {
+				ks = ksym_search(e->kern_stack[i]);
+				if (strcmp(ks->name, nonjit_func) == 0) {
+					good_kern_stack = true;
+					break;
+				}
+			}
+		}
+		if (e->user_stack_size > 0 && e->user_stack_buildid_size > 0)
+			good_user_stack = true;
+	}
+	if (!good_kern_stack || !good_user_stack)
+		return PERF_EVENT_ERROR;
+
+	if (cnt == MAX_CNT_RAWTP)
+		return PERF_EVENT_DONE;
+
+	return PERF_EVENT_CONT;
+}
+
+static void test_get_stack_raw_tp(void)
+{
+	const char *file = "./test_get_stack_rawtp.o";
+	int i, efd, err, prog_fd, pmu_fd, perfmap_fd;
+	struct perf_event_attr attr = {};
+	struct timespec tv = {0, 10};
+	__u32 key = 0, duration = 0;
+	struct bpf_object *obj;
+
+	err = bpf_prog_load(file, BPF_PROG_TYPE_RAW_TRACEPOINT, &obj, &prog_fd);
+	if (CHECK(err, "prog_load raw tp", "err %d errno %d\n", err, errno))
+		return;
+
+	efd = bpf_raw_tracepoint_open("sys_enter", prog_fd);
+	if (CHECK(efd < 0, "raw_tp_open", "err %d errno %d\n", efd, errno))
+		goto close_prog;
+
+	perfmap_fd = bpf_find_map(__func__, obj, "perfmap");
+	if (CHECK(perfmap_fd < 0, "bpf_find_map", "err %d errno %d\n",
+		  perfmap_fd, errno))
+		goto close_prog;
+
+	err = load_kallsyms();
+	if (CHECK(err < 0, "load_kallsyms", "err %d errno %d\n", err, errno))
+		goto close_prog;
+
+	attr.sample_type = PERF_SAMPLE_RAW;
+	attr.type = PERF_TYPE_SOFTWARE;
+	attr.config = PERF_COUNT_SW_BPF_OUTPUT;
+	pmu_fd = syscall(__NR_perf_event_open, &attr, getpid()/*pid*/, -1/*cpu*/,
+			 -1/*group_fd*/, 0);
+	if (CHECK(pmu_fd < 0, "perf_event_open", "err %d errno %d\n", pmu_fd,
+		  errno))
+		goto close_prog;
+
+	err = bpf_map_update_elem(perfmap_fd, &key, &pmu_fd, BPF_ANY);
+	if (CHECK(err < 0, "bpf_map_update_elem", "err %d errno %d\n", err,
+		  errno))
+		goto close_prog;
+
+	err = ioctl(pmu_fd, PERF_EVENT_IOC_ENABLE, 0);
+	if (CHECK(err < 0, "ioctl PERF_EVENT_IOC_ENABLE", "err %d errno %d\n",
+		  err, errno))
+		goto close_prog;
+
+	err = perf_event_mmap(pmu_fd);
+	if (CHECK(err < 0, "perf_event_mmap", "err %d errno %d\n", err, errno))
+		goto close_prog;
+
+	/* trigger some syscall action */
+	for (i = 0; i < MAX_CNT_RAWTP; i++)
+		nanosleep(&tv, NULL);
+
+	err = perf_event_poller(pmu_fd, get_stack_print_output);
+	if (CHECK(err < 0, "perf_event_poller", "err %d errno %d\n", err, errno))
+		goto close_prog;
+
+	goto close_prog_noerr;
+close_prog:
+	error_cnt++;
+close_prog_noerr:
+	bpf_object__close(obj);
+}
+
 int main(void)
 {
+	jit_enabled = is_jit_enabled();
+
 	test_pkt_access();
 	test_xdp();
 	test_xdp_adjust_tail();
@@ -1219,6 +1368,7 @@ int main(void)
 	test_stacktrace_map();
 	test_stacktrace_build_id();
 	test_stacktrace_map_raw_tp();
+	test_get_stack_raw_tp();
 
 	printf("Summary: %d PASSED, %d FAILED\n", pass_cnt, error_cnt);
 	return error_cnt ? EXIT_FAILURE : EXIT_SUCCESS;
-- 
2.9.5

^ permalink raw reply related

* [PATCH bpf-next v2] bpf: Allow bpf_current_task_under_cgroup in interrupt
From: Teng Qin @ 2018-04-29  6:39 UTC (permalink / raw)
  To: netdev; +Cc: ast, daniel, yhs, kernel-team, Teng Qin
In-Reply-To: <20180428073235.3917719-1-qinteng@fb.com>

Currently, the bpf_current_task_under_cgroup helper has a check where if
the BPF program is running in_interrupt(), it will return -EINVAL. This
prevents the helper to be used in many useful scenarios, particularly
BPF programs attached to Perf Events.

This commit removes the check. Tested a few NMI (Perf Event) and some
softirq context, the helper returns the correct result.

Signed-off-by: Teng Qin <qinteng@fb.com>
---
 kernel/trace/bpf_trace.c | 2 --
 1 file changed, 2 deletions(-)

diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index 56ba0f2..f94890c 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -474,8 +474,6 @@ BPF_CALL_2(bpf_current_task_under_cgroup, struct bpf_map *, map, u32, idx)
 	struct bpf_array *array = container_of(map, struct bpf_array, map);
 	struct cgroup *cgrp;
 
-	if (unlikely(in_interrupt()))
-		return -EINVAL;
 	if (unlikely(idx >= array->map.max_entries))
 		return -E2BIG;
 
-- 
2.9.5

^ permalink raw reply related

* Re: [PATCH 1/1] tg3: fix meaningless hw_stats reading after tg3_halt memset 0 hw_stats
From: Zumeng Chen @ 2018-04-29  6:43 UTC (permalink / raw)
  To: Michael Chan
  Cc: Netdev, open list, Siva Reddy Kallam, michael.chan@broadcom.com,
	prashant.sreedharan@broadcom.com, David Miller
In-Reply-To: <CACKFLi=UxiSUAx8CoMKpx16d7mVSAbbmhR-m=1W_kzNrePu7XQ@mail.gmail.com>

On 2018年04月29日 02:36, Michael Chan wrote:
> On Fri, Apr 27, 2018 at 8:15 PM, Zumeng Chen <zumeng.chen@gmail.com> wrote:
>
>> diff --git a/drivers/net/ethernet/broadcom/tg3.h b/drivers/net/ethernet/broadcom/tg3.h
>> index 3b5e98e..6727d93 100644
>> --- a/drivers/net/ethernet/broadcom/tg3.h
>> +++ b/drivers/net/ethernet/broadcom/tg3.h
>> @@ -3352,6 +3352,7 @@ struct tg3 {
>>          struct pci_dev                  *pdev_peer;
>>
>>          struct tg3_hw_stats             *hw_stats;
>> +       bool                            hw_stats_flag;
> You can just add another bit to enum TG3_FLAGS for this purpose.

Right, it's a good idea, I didn't notice it, I'll send V2 with that later.

>
> While this scheme will probably work, I think a better and more
> elegant way to fix this is to use RCU.

IMHO, RCU is not necessary for this simple two consumers, and no 
frequent ops
on tg3_halt, plus no new locker involved either.

Cheers,
Zumeng
>
>>          dma_addr_t                      stats_mapping;
>>          struct work_struct              reset_task;
>>
>> --
>> 2.9.3
>>

^ permalink raw reply

* [PATCH] change the comment of vti6_ioctl
From: Sun Lianwen @ 2018-04-29  7:05 UTC (permalink / raw)
  To: netdev

The comment of vti6_ioctl() is wrong. which use vti6_tnl_ioctl
instead of vti6_ioctl.

Signed-off-by: Sun Lianwen <sunlw.fnst@cn.fujitsu.com>
---
 net/ipv6/ip6_vti.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/ipv6/ip6_vti.c b/net/ipv6/ip6_vti.c
index c214ffec02f0..deadc4c3703b 100644
--- a/net/ipv6/ip6_vti.c
+++ b/net/ipv6/ip6_vti.c
@@ -743,7 +743,7 @@ vti6_parm_to_user(struct ip6_tnl_parm2 *u, const struct __ip6_tnl_parm *p)
 }
 
 /**
- * vti6_tnl_ioctl - configure vti6 tunnels from userspace
+ * vti6_ioctl - configure vti6 tunnels from userspace
  *   @dev: virtual device associated with tunnel
  *   @ifr: parameters passed from userspace
  *   @cmd: command to be performed
-- 
2.17.0

^ permalink raw reply related

* [PATCH net-next v3 0/6] mlxsw: SPAN: Support routes pointing at bridges
From: Ido Schimmel @ 2018-04-29  7:56 UTC (permalink / raw)
  To: netdev, bridge; +Cc: davem, jiri, petrm, stephen, nikolay, mlxsw, Ido Schimmel

Petr says:

When mirroring to a gretap or ip6gretap netdevice, the route that
directs the encapsulated packets can reference a bridge. In that case,
in the software model, the packet is switched.

Thus when offloading mirroring like that, take into consideration FDB,
STP, PVID configured at the bridge, and whether that VLAN ID should be
tagged on egress.

Patch #1 introduces functions to get bridge PVID, VLAN flags and to look
up an FDB entry.

Patches #2 and #3 refactor some existing code and introduce a new
accessor function.

With patches #4 and #5 mlxsw calls mlxsw_sp_span_respin() on switchdev
events as well. There is no impact yet, because bridge as an underlay
device is still not allowed.

That is implemented in patch #6, which uses the new interfaces to figure
out on which one port the mirroring should be configured, and whether
the mirrored packets should be VLAN-tagged and how.

Changes from v2 to v3:

- Rename the suite of bridge accessor function to br_vlan_get_pvid(),
  br_vlan_get_info() and br_fdb_find_port(). The _get bit is to avoid
  clashing with an existing static function.

Changes from v1 to v2:

- Change the suite of bridge accessor functions to br_vlan_pvid_rtnl(),
  br_vlan_info_rtnl(), br_fdb_find_port_rtnl().

Petr Machata (6):
  net: bridge: Publish bridge accessor functions
  mlxsw: spectrum: Extract mlxsw_sp_stp_spms_state()
  mlxsw: spectrum_switchdev: Publish two functions
  mlxsw: spectrum: Register SPAN before switchdev
  mlxsw: Respin SPAN on switchdev events
  mlxsw: spectrum_span: Allow bridge for gretap mirror

 drivers/net/ethernet/mellanox/mlxsw/spectrum.c     | 50 ++++++------
 drivers/net/ethernet/mellanox/mlxsw/spectrum.h     |  1 +
 .../net/ethernet/mellanox/mlxsw/spectrum_span.c    | 95 ++++++++++++++++++++--
 .../net/ethernet/mellanox/mlxsw/spectrum_span.h    |  1 +
 .../ethernet/mellanox/mlxsw/spectrum_switchdev.c   | 72 ++++++++++++++--
 .../ethernet/mellanox/mlxsw/spectrum_switchdev.h   | 43 ++++++++++
 include/linux/if_bridge.h                          | 28 +++++++
 net/bridge/br_fdb.c                                | 22 +++++
 net/bridge/br_private.h                            | 11 +++
 net/bridge/br_vlan.c                               | 39 +++++++++
 10 files changed, 326 insertions(+), 36 deletions(-)
 create mode 100644 drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.h

-- 
2.14.3

^ permalink raw reply

* [PATCH net-next v3 1/6] net: bridge: Publish bridge accessor functions
From: Ido Schimmel @ 2018-04-29  7:56 UTC (permalink / raw)
  To: netdev, bridge; +Cc: davem, jiri, petrm, stephen, nikolay, mlxsw, Ido Schimmel
In-Reply-To: <20180429075613.10832-1-idosch@mellanox.com>

From: Petr Machata <petrm@mellanox.com>

Add a couple new functions to allow querying FDB and vlan settings of a
bridge.

Signed-off-by: Petr Machata <petrm@mellanox.com>
Signed-off-by: Ido Schimmel <idosch@mellanox.com>
---
 include/linux/if_bridge.h | 28 ++++++++++++++++++++++++++++
 net/bridge/br_fdb.c       | 22 ++++++++++++++++++++++
 net/bridge/br_private.h   | 11 +++++++++++
 net/bridge/br_vlan.c      | 39 +++++++++++++++++++++++++++++++++++++++
 4 files changed, 100 insertions(+)

diff --git a/include/linux/if_bridge.h b/include/linux/if_bridge.h
index 02639ebea2f0..585d27182425 100644
--- a/include/linux/if_bridge.h
+++ b/include/linux/if_bridge.h
@@ -93,11 +93,39 @@ static inline bool br_multicast_router(const struct net_device *dev)
 
 #if IS_ENABLED(CONFIG_BRIDGE) && IS_ENABLED(CONFIG_BRIDGE_VLAN_FILTERING)
 bool br_vlan_enabled(const struct net_device *dev);
+int br_vlan_get_pvid(const struct net_device *dev, u16 *p_pvid);
+int br_vlan_get_info(const struct net_device *dev, u16 vid,
+		     struct bridge_vlan_info *p_vinfo);
 #else
 static inline bool br_vlan_enabled(const struct net_device *dev)
 {
 	return false;
 }
+
+static inline int br_vlan_get_pvid(const struct net_device *dev, u16 *p_pvid)
+{
+	return -1;
+}
+
+static inline int br_vlan_get_info(const struct net_device *dev, u16 vid,
+				   struct bridge_vlan_info *p_vinfo)
+{
+	return -1;
+}
+#endif
+
+#if IS_ENABLED(CONFIG_BRIDGE)
+struct net_device *br_fdb_find_port(const struct net_device *br_dev,
+				    const unsigned char *addr,
+				    __u16 vid);
+#else
+static inline struct net_device *
+br_fdb_find_port(const struct net_device *br_dev,
+		 const unsigned char *addr,
+		 __u16 vid)
+{
+	return NULL;
+}
 #endif
 
 #endif
diff --git a/net/bridge/br_fdb.c b/net/bridge/br_fdb.c
index d9e69e4514be..a1c409cacec4 100644
--- a/net/bridge/br_fdb.c
+++ b/net/bridge/br_fdb.c
@@ -121,6 +121,28 @@ static struct net_bridge_fdb_entry *br_fdb_find(struct net_bridge *br,
 	return fdb;
 }
 
+struct net_device *br_fdb_find_port(const struct net_device *br_dev,
+				    const unsigned char *addr,
+				    __u16 vid)
+{
+	struct net_bridge_fdb_entry *f;
+	struct net_device *dev = NULL;
+	struct net_bridge *br;
+
+	ASSERT_RTNL();
+
+	if (!netif_is_bridge_master(br_dev))
+		return NULL;
+
+	br = netdev_priv(br_dev);
+	f = br_fdb_find(br, addr, vid);
+	if (f && f->dst)
+		dev = f->dst->dev;
+
+	return dev;
+}
+EXPORT_SYMBOL_GPL(br_fdb_find_port);
+
 struct net_bridge_fdb_entry *br_fdb_find_rcu(struct net_bridge *br,
 					     const unsigned char *addr,
 					     __u16 vid)
diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h
index a7cb3ece5031..1a5093115534 100644
--- a/net/bridge/br_private.h
+++ b/net/bridge/br_private.h
@@ -594,11 +594,22 @@ static inline bool br_rx_handler_check_rcu(const struct net_device *dev)
 	return rcu_dereference(dev->rx_handler) == br_handle_frame;
 }
 
+static inline bool br_rx_handler_check_rtnl(const struct net_device *dev)
+{
+	return rcu_dereference_rtnl(dev->rx_handler) == br_handle_frame;
+}
+
 static inline struct net_bridge_port *br_port_get_check_rcu(const struct net_device *dev)
 {
 	return br_rx_handler_check_rcu(dev) ? br_port_get_rcu(dev) : NULL;
 }
 
+static inline struct net_bridge_port *
+br_port_get_check_rtnl(const struct net_device *dev)
+{
+	return br_rx_handler_check_rtnl(dev) ? br_port_get_rtnl_rcu(dev) : NULL;
+}
+
 /* br_ioctl.c */
 int br_dev_ioctl(struct net_device *dev, struct ifreq *rq, int cmd);
 int br_ioctl_deviceless_stub(struct net *net, unsigned int cmd,
diff --git a/net/bridge/br_vlan.c b/net/bridge/br_vlan.c
index 9896f4975353..df37a5137c25 100644
--- a/net/bridge/br_vlan.c
+++ b/net/bridge/br_vlan.c
@@ -1149,3 +1149,42 @@ void br_vlan_get_stats(const struct net_bridge_vlan *v,
 		stats->tx_packets += txpackets;
 	}
 }
+
+int br_vlan_get_pvid(const struct net_device *dev, u16 *p_pvid)
+{
+	struct net_bridge_vlan_group *vg;
+
+	ASSERT_RTNL();
+	if (netif_is_bridge_master(dev))
+		vg = br_vlan_group(netdev_priv(dev));
+	else
+		return -EINVAL;
+
+	*p_pvid = br_get_pvid(vg);
+	return 0;
+}
+EXPORT_SYMBOL_GPL(br_vlan_get_pvid);
+
+int br_vlan_get_info(const struct net_device *dev, u16 vid,
+		     struct bridge_vlan_info *p_vinfo)
+{
+	struct net_bridge_vlan_group *vg;
+	struct net_bridge_vlan *v;
+	struct net_bridge_port *p;
+
+	ASSERT_RTNL();
+	p = br_port_get_check_rtnl(dev);
+	if (p)
+		vg = nbp_vlan_group(p);
+	else
+		return -EINVAL;
+
+	v = br_vlan_find(vg, vid);
+	if (!v)
+		return -ENOENT;
+
+	p_vinfo->vid = vid;
+	p_vinfo->flags = v->flags;
+	return 0;
+}
+EXPORT_SYMBOL_GPL(br_vlan_get_info);
-- 
2.14.3

^ permalink raw reply related

* [PATCH net-next v3 2/6] mlxsw: spectrum: Extract mlxsw_sp_stp_spms_state()
From: Ido Schimmel @ 2018-04-29  7:56 UTC (permalink / raw)
  To: netdev, bridge; +Cc: davem, jiri, petrm, stephen, nikolay, mlxsw, Ido Schimmel
In-Reply-To: <20180429075613.10832-1-idosch@mellanox.com>

From: Petr Machata <petrm@mellanox.com>

Instead of duplicating the decision regarding port forwarding state made
by mlxsw_sp_port_vid_stp_set(), extract the decision-making into a new
function and reuse.

Signed-off-by: Petr Machata <petrm@mellanox.com>
Signed-off-by: Ido Schimmel <idosch@mellanox.com>
---
 drivers/net/ethernet/mellanox/mlxsw/spectrum.c | 26 +++++++++++++-------------
 drivers/net/ethernet/mellanox/mlxsw/spectrum.h |  1 +
 2 files changed, 14 insertions(+), 13 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum.c
index ca38a30fbe91..7317fb8079d1 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/spectrum.c
+++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum.c
@@ -441,29 +441,29 @@ static void mlxsw_sp_txhdr_construct(struct sk_buff *skb,
 	mlxsw_tx_hdr_type_set(txhdr, MLXSW_TXHDR_TYPE_CONTROL);
 }
 
-int mlxsw_sp_port_vid_stp_set(struct mlxsw_sp_port *mlxsw_sp_port, u16 vid,
-			      u8 state)
+enum mlxsw_reg_spms_state mlxsw_sp_stp_spms_state(u8 state)
 {
-	struct mlxsw_sp *mlxsw_sp = mlxsw_sp_port->mlxsw_sp;
-	enum mlxsw_reg_spms_state spms_state;
-	char *spms_pl;
-	int err;
-
 	switch (state) {
 	case BR_STATE_FORWARDING:
-		spms_state = MLXSW_REG_SPMS_STATE_FORWARDING;
-		break;
+		return MLXSW_REG_SPMS_STATE_FORWARDING;
 	case BR_STATE_LEARNING:
-		spms_state = MLXSW_REG_SPMS_STATE_LEARNING;
-		break;
+		return MLXSW_REG_SPMS_STATE_LEARNING;
 	case BR_STATE_LISTENING: /* fall-through */
 	case BR_STATE_DISABLED: /* fall-through */
 	case BR_STATE_BLOCKING:
-		spms_state = MLXSW_REG_SPMS_STATE_DISCARDING;
-		break;
+		return MLXSW_REG_SPMS_STATE_DISCARDING;
 	default:
 		BUG();
 	}
+}
+
+int mlxsw_sp_port_vid_stp_set(struct mlxsw_sp_port *mlxsw_sp_port, u16 vid,
+			      u8 state)
+{
+	enum mlxsw_reg_spms_state spms_state = mlxsw_sp_stp_spms_state(state);
+	struct mlxsw_sp *mlxsw_sp = mlxsw_sp_port->mlxsw_sp;
+	char *spms_pl;
+	int err;
 
 	spms_pl = kmalloc(MLXSW_REG_SPMS_LEN, GFP_KERNEL);
 	if (!spms_pl)
diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum.h b/drivers/net/ethernet/mellanox/mlxsw/spectrum.h
index 804d4d2c8031..4a519d8edec8 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/spectrum.h
+++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum.h
@@ -364,6 +364,7 @@ int __mlxsw_sp_port_headroom_set(struct mlxsw_sp_port *mlxsw_sp_port, int mtu,
 int mlxsw_sp_port_ets_maxrate_set(struct mlxsw_sp_port *mlxsw_sp_port,
 				  enum mlxsw_reg_qeec_hr hr, u8 index,
 				  u8 next_index, u32 maxrate);
+enum mlxsw_reg_spms_state mlxsw_sp_stp_spms_state(u8 stp_state);
 int mlxsw_sp_port_vid_stp_set(struct mlxsw_sp_port *mlxsw_sp_port, u16 vid,
 			      u8 state);
 int mlxsw_sp_port_vp_mode_set(struct mlxsw_sp_port *mlxsw_sp_port, bool enable);
-- 
2.14.3

^ permalink raw reply related

* [PATCH net-next v3 3/6] mlxsw: spectrum_switchdev: Publish two functions
From: Ido Schimmel @ 2018-04-29  7:56 UTC (permalink / raw)
  To: netdev, bridge; +Cc: davem, jiri, petrm, stephen, nikolay, mlxsw, Ido Schimmel
In-Reply-To: <20180429075613.10832-1-idosch@mellanox.com>

From: Petr Machata <petrm@mellanox.com>

Publish the existing function mlxsw_sp_bridge_port_find(), and add
another service accessor mlxsw_sp_bridge_port_stp_state(). Publish both
in a new file spectrum_switchdev.h.

Signed-off-by: Petr Machata <petrm@mellanox.com>
Signed-off-by: Ido Schimmel <idosch@mellanox.com>
---
 .../ethernet/mellanox/mlxsw/spectrum_switchdev.c   |  9 ++++-
 .../ethernet/mellanox/mlxsw/spectrum_switchdev.h   | 43 ++++++++++++++++++++++
 2 files changed, 51 insertions(+), 1 deletion(-)
 create mode 100644 drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.h

diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.c
index c11c9a635866..db4aea0f8996 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.c
+++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.c
@@ -50,6 +50,7 @@
 #include <net/switchdev.h>
 
 #include "spectrum_router.h"
+#include "spectrum_switchdev.h"
 #include "spectrum.h"
 #include "core.h"
 #include "reg.h"
@@ -239,7 +240,7 @@ __mlxsw_sp_bridge_port_find(const struct mlxsw_sp_bridge_device *bridge_device,
 	return NULL;
 }
 
-static struct mlxsw_sp_bridge_port *
+struct mlxsw_sp_bridge_port *
 mlxsw_sp_bridge_port_find(struct mlxsw_sp_bridge *bridge,
 			  struct net_device *brport_dev)
 {
@@ -2297,6 +2298,12 @@ static struct notifier_block mlxsw_sp_switchdev_notifier = {
 	.notifier_call = mlxsw_sp_switchdev_event,
 };
 
+u8
+mlxsw_sp_bridge_port_stp_state(struct mlxsw_sp_bridge_port *bridge_port)
+{
+	return bridge_port->stp_state;
+}
+
 static int mlxsw_sp_fdb_init(struct mlxsw_sp *mlxsw_sp)
 {
 	struct mlxsw_sp_bridge *bridge = mlxsw_sp->bridge;
diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.h b/drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.h
new file mode 100644
index 000000000000..bc44d5effc28
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.h
@@ -0,0 +1,43 @@
+/* SPDX-License-Identifier: BSD-3-Clause OR GPL-2.0
+ * drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.h
+ * Copyright (c) 2018 Mellanox Technologies. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the names of the copyright holders nor the names of its
+ *    contributors may be used to endorse or promote products derived from
+ *    this software without specific prior written permission.
+ *
+ * Alternatively, this software may be distributed under the terms of the
+ * GNU General Public License ("GPL") version 2 as published by the Free
+ * Software Foundation.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <linux/netdevice.h>
+
+struct mlxsw_sp_bridge;
+struct mlxsw_sp_bridge_port;
+
+struct mlxsw_sp_bridge_port *
+mlxsw_sp_bridge_port_find(struct mlxsw_sp_bridge *bridge,
+			  struct net_device *brport_dev);
+
+u8 mlxsw_sp_bridge_port_stp_state(struct mlxsw_sp_bridge_port *bridge_port);
-- 
2.14.3

^ permalink raw reply related

* [PATCH net-next v3 4/6] mlxsw: spectrum: Register SPAN before switchdev
From: Ido Schimmel @ 2018-04-29  7:56 UTC (permalink / raw)
  To: netdev, bridge; +Cc: davem, jiri, petrm, stephen, nikolay, mlxsw, Ido Schimmel
In-Reply-To: <20180429075613.10832-1-idosch@mellanox.com>

From: Petr Machata <petrm@mellanox.com>

Since switchdev events can trigger SPAN respin, it is necessary that the
data structures are available. Register SPAN first, with a commentary on
what the dependencies are.

Signed-off-by: Petr Machata <petrm@mellanox.com>
Signed-off-by: Ido Schimmel <idosch@mellanox.com>
---
 drivers/net/ethernet/mellanox/mlxsw/spectrum.c | 24 ++++++++++++------------
 1 file changed, 12 insertions(+), 12 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum.c
index 7317fb8079d1..94132f6cec61 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/spectrum.c
+++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum.c
@@ -3666,6 +3666,15 @@ static int mlxsw_sp_init(struct mlxsw_core *mlxsw_core,
 		goto err_lag_init;
 	}
 
+	/* Initialize SPAN before router and switchdev, so that those components
+	 * can call mlxsw_sp_span_respin().
+	 */
+	err = mlxsw_sp_span_init(mlxsw_sp);
+	if (err) {
+		dev_err(mlxsw_sp->bus_info->dev, "Failed to init span system\n");
+		goto err_span_init;
+	}
+
 	err = mlxsw_sp_switchdev_init(mlxsw_sp);
 	if (err) {
 		dev_err(mlxsw_sp->bus_info->dev, "Failed to initialize switchdev\n");
@@ -3684,15 +3693,6 @@ static int mlxsw_sp_init(struct mlxsw_core *mlxsw_core,
 		goto err_afa_init;
 	}
 
-	err = mlxsw_sp_span_init(mlxsw_sp);
-	if (err) {
-		dev_err(mlxsw_sp->bus_info->dev, "Failed to init span system\n");
-		goto err_span_init;
-	}
-
-	/* Initialize router after SPAN is initialized, so that the FIB and
-	 * neighbor event handlers can issue SPAN respin.
-	 */
 	err = mlxsw_sp_router_init(mlxsw_sp);
 	if (err) {
 		dev_err(mlxsw_sp->bus_info->dev, "Failed to initialize router\n");
@@ -3739,14 +3739,14 @@ static int mlxsw_sp_init(struct mlxsw_core *mlxsw_core,
 err_netdev_notifier:
 	mlxsw_sp_router_fini(mlxsw_sp);
 err_router_init:
-	mlxsw_sp_span_fini(mlxsw_sp);
-err_span_init:
 	mlxsw_sp_afa_fini(mlxsw_sp);
 err_afa_init:
 	mlxsw_sp_counter_pool_fini(mlxsw_sp);
 err_counter_pool_init:
 	mlxsw_sp_switchdev_fini(mlxsw_sp);
 err_switchdev_init:
+	mlxsw_sp_span_fini(mlxsw_sp);
+err_span_init:
 	mlxsw_sp_lag_fini(mlxsw_sp);
 err_lag_init:
 	mlxsw_sp_buffers_fini(mlxsw_sp);
@@ -3768,10 +3768,10 @@ static void mlxsw_sp_fini(struct mlxsw_core *mlxsw_core)
 	mlxsw_sp_acl_fini(mlxsw_sp);
 	unregister_netdevice_notifier(&mlxsw_sp->netdevice_nb);
 	mlxsw_sp_router_fini(mlxsw_sp);
-	mlxsw_sp_span_fini(mlxsw_sp);
 	mlxsw_sp_afa_fini(mlxsw_sp);
 	mlxsw_sp_counter_pool_fini(mlxsw_sp);
 	mlxsw_sp_switchdev_fini(mlxsw_sp);
+	mlxsw_sp_span_fini(mlxsw_sp);
 	mlxsw_sp_lag_fini(mlxsw_sp);
 	mlxsw_sp_buffers_fini(mlxsw_sp);
 	mlxsw_sp_traps_fini(mlxsw_sp);
-- 
2.14.3

^ permalink raw reply related

* [PATCH net-next v3 5/6] mlxsw: Respin SPAN on switchdev events
From: Ido Schimmel @ 2018-04-29  7:56 UTC (permalink / raw)
  To: netdev, bridge; +Cc: davem, jiri, petrm, stephen, nikolay, mlxsw, Ido Schimmel
In-Reply-To: <20180429075613.10832-1-idosch@mellanox.com>

From: Petr Machata <petrm@mellanox.com>

Changes to switchdev artifact can make a SPAN entry offloadable or
unoffloadable. To that end:

- Listen to SWITCHDEV_FDB_*_TO_BRIDGE notifications in addition to
  the *_TO_DEVICE ones, to catch whatever activity is sent to the
  bridge (likely by mlxsw itself).

  On each FDB notification, respin SPAN to reconcile it with the FDB
  changes.

- Also respin on switchdev port attribute changes (which currently
  covers changes to STP state of ports) and port object additions and
  removals.

Signed-off-by: Petr Machata <petrm@mellanox.com>
Signed-off-by: Ido Schimmel <idosch@mellanox.com>
---
 .../ethernet/mellanox/mlxsw/spectrum_switchdev.c   | 63 ++++++++++++++++++++--
 1 file changed, 59 insertions(+), 4 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.c
index db4aea0f8996..1af99fe5fd32 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.c
+++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.c
@@ -49,6 +49,7 @@
 #include <linux/netlink.h>
 #include <net/switchdev.h>
 
+#include "spectrum_span.h"
 #include "spectrum_router.h"
 #include "spectrum_switchdev.h"
 #include "spectrum.h"
@@ -923,6 +924,9 @@ static int mlxsw_sp_port_attr_set(struct net_device *dev,
 		break;
 	}
 
+	if (switchdev_trans_ph_commit(trans))
+		mlxsw_sp_span_respin(mlxsw_sp_port->mlxsw_sp);
+
 	return err;
 }
 
@@ -1647,18 +1651,57 @@ mlxsw_sp_port_mrouter_update_mdb(struct mlxsw_sp_port *mlxsw_sp_port,
 	}
 }
 
+struct mlxsw_sp_span_respin_work {
+	struct work_struct work;
+	struct mlxsw_sp *mlxsw_sp;
+};
+
+static void mlxsw_sp_span_respin_work(struct work_struct *work)
+{
+	struct mlxsw_sp_span_respin_work *respin_work =
+		container_of(work, struct mlxsw_sp_span_respin_work, work);
+
+	rtnl_lock();
+	mlxsw_sp_span_respin(respin_work->mlxsw_sp);
+	rtnl_unlock();
+	kfree(respin_work);
+}
+
+static void mlxsw_sp_span_respin_schedule(struct mlxsw_sp *mlxsw_sp)
+{
+	struct mlxsw_sp_span_respin_work *respin_work;
+
+	respin_work = kzalloc(sizeof(*respin_work), GFP_ATOMIC);
+	if (!respin_work)
+		return;
+
+	INIT_WORK(&respin_work->work, mlxsw_sp_span_respin_work);
+	respin_work->mlxsw_sp = mlxsw_sp;
+
+	mlxsw_core_schedule_work(&respin_work->work);
+}
+
 static int mlxsw_sp_port_obj_add(struct net_device *dev,
 				 const struct switchdev_obj *obj,
 				 struct switchdev_trans *trans)
 {
 	struct mlxsw_sp_port *mlxsw_sp_port = netdev_priv(dev);
+	const struct switchdev_obj_port_vlan *vlan;
 	int err = 0;
 
 	switch (obj->id) {
 	case SWITCHDEV_OBJ_ID_PORT_VLAN:
-		err = mlxsw_sp_port_vlans_add(mlxsw_sp_port,
-					      SWITCHDEV_OBJ_PORT_VLAN(obj),
-					      trans);
+		vlan = SWITCHDEV_OBJ_PORT_VLAN(obj);
+		err = mlxsw_sp_port_vlans_add(mlxsw_sp_port, vlan, trans);
+
+		if (switchdev_trans_ph_commit(trans)) {
+			/* The event is emitted before the changes are actually
+			 * applied to the bridge. Therefore schedule the respin
+			 * call for later, so that the respin logic sees the
+			 * updated bridge state.
+			 */
+			mlxsw_sp_span_respin_schedule(mlxsw_sp_port->mlxsw_sp);
+		}
 		break;
 	case SWITCHDEV_OBJ_ID_PORT_MDB:
 		err = mlxsw_sp_port_mdb_add(mlxsw_sp_port,
@@ -1809,6 +1852,8 @@ static int mlxsw_sp_port_obj_del(struct net_device *dev,
 		break;
 	}
 
+	mlxsw_sp_span_respin(mlxsw_sp_port->mlxsw_sp);
+
 	return err;
 }
 
@@ -2236,8 +2281,16 @@ static void mlxsw_sp_switchdev_event_work(struct work_struct *work)
 		fdb_info = &switchdev_work->fdb_info;
 		mlxsw_sp_port_fdb_set(mlxsw_sp_port, fdb_info, false);
 		break;
+	case SWITCHDEV_FDB_ADD_TO_BRIDGE: /* fall through */
+	case SWITCHDEV_FDB_DEL_TO_BRIDGE:
+		/* These events are only used to potentially update an existing
+		 * SPAN mirror.
+		 */
+		break;
 	}
 
+	mlxsw_sp_span_respin(mlxsw_sp_port->mlxsw_sp);
+
 out:
 	rtnl_unlock();
 	kfree(switchdev_work->fdb_info.addr);
@@ -2266,7 +2319,9 @@ static int mlxsw_sp_switchdev_event(struct notifier_block *unused,
 
 	switch (event) {
 	case SWITCHDEV_FDB_ADD_TO_DEVICE: /* fall through */
-	case SWITCHDEV_FDB_DEL_TO_DEVICE:
+	case SWITCHDEV_FDB_DEL_TO_DEVICE: /* fall through */
+	case SWITCHDEV_FDB_ADD_TO_BRIDGE: /* fall through */
+	case SWITCHDEV_FDB_DEL_TO_BRIDGE:
 		memcpy(&switchdev_work->fdb_info, ptr,
 		       sizeof(switchdev_work->fdb_info));
 		switchdev_work->fdb_info.addr = kzalloc(ETH_ALEN, GFP_ATOMIC);
-- 
2.14.3

^ permalink raw reply related

* [PATCH net-next v3 6/6] mlxsw: spectrum_span: Allow bridge for gretap mirror
From: Ido Schimmel @ 2018-04-29  7:56 UTC (permalink / raw)
  To: netdev, bridge; +Cc: davem, jiri, petrm, stephen, nikolay, mlxsw, Ido Schimmel
In-Reply-To: <20180429075613.10832-1-idosch@mellanox.com>

From: Petr Machata <petrm@mellanox.com>

When handling mirroring to a gretap or ip6gretap netdevice in mlxsw, the
underlay address (i.e. the remote address of the tunnel) may be routed
to a bridge.

In that case, look up the resolved neighbor Ethernet address in that
bridge's FDB. Then configure the offload to direct the mirrored traffic
to that port, possibly with tagging.

Signed-off-by: Petr Machata <petrm@mellanox.com>
Signed-off-by: Ido Schimmel <idosch@mellanox.com>
---
 .../net/ethernet/mellanox/mlxsw/spectrum_span.c    | 95 ++++++++++++++++++++--
 .../net/ethernet/mellanox/mlxsw/spectrum_span.h    |  1 +
 2 files changed, 90 insertions(+), 6 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_span.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_span.c
index 65a77708ff61..cd9071ee19ad 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_span.c
+++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_span.c
@@ -32,6 +32,7 @@
  * POSSIBILITY OF SUCH DAMAGE.
  */
 
+#include <linux/if_bridge.h>
 #include <linux/list.h>
 #include <net/arp.h>
 #include <net/gre.h>
@@ -39,8 +40,9 @@
 #include <net/ip6_tunnel.h>
 
 #include "spectrum.h"
-#include "spectrum_span.h"
 #include "spectrum_ipip.h"
+#include "spectrum_span.h"
+#include "spectrum_switchdev.h"
 
 int mlxsw_sp_span_init(struct mlxsw_sp *mlxsw_sp)
 {
@@ -167,6 +169,72 @@ mlxsw_sp_span_entry_unoffloadable(struct mlxsw_sp_span_parms *sparmsp)
 	return 0;
 }
 
+static struct net_device *
+mlxsw_sp_span_entry_bridge_8021q(const struct net_device *br_dev,
+				 unsigned char *dmac,
+				 u16 *p_vid)
+{
+	struct bridge_vlan_info vinfo;
+	struct net_device *edev;
+	u16 pvid;
+
+	if (WARN_ON(br_vlan_get_pvid(br_dev, &pvid)))
+		return NULL;
+	if (!pvid)
+		return NULL;
+
+	edev = br_fdb_find_port(br_dev, dmac, pvid);
+	if (!edev)
+		return NULL;
+
+	if (br_vlan_get_info(edev, pvid, &vinfo))
+		return NULL;
+	if (!(vinfo.flags & BRIDGE_VLAN_INFO_UNTAGGED))
+		*p_vid = pvid;
+	return edev;
+}
+
+static struct net_device *
+mlxsw_sp_span_entry_bridge_8021d(const struct net_device *br_dev,
+				 unsigned char *dmac)
+{
+	return br_fdb_find_port(br_dev, dmac, 0);
+}
+
+static struct net_device *
+mlxsw_sp_span_entry_bridge(const struct net_device *br_dev,
+			   unsigned char dmac[ETH_ALEN],
+			   u16 *p_vid)
+{
+	struct mlxsw_sp_bridge_port *bridge_port;
+	enum mlxsw_reg_spms_state spms_state;
+	struct mlxsw_sp_port *port;
+	struct net_device *dev;
+	u8 stp_state;
+
+	if (br_vlan_enabled(br_dev))
+		dev = mlxsw_sp_span_entry_bridge_8021q(br_dev, dmac, p_vid);
+	else
+		dev = mlxsw_sp_span_entry_bridge_8021d(br_dev, dmac);
+	if (!dev)
+		return NULL;
+
+	port = mlxsw_sp_port_dev_lower_find(dev);
+	if (!port)
+		return NULL;
+
+	bridge_port = mlxsw_sp_bridge_port_find(port->mlxsw_sp->bridge, dev);
+	if (!bridge_port)
+		return NULL;
+
+	stp_state = mlxsw_sp_bridge_port_stp_state(bridge_port);
+	spms_state = mlxsw_sp_stp_spms_state(stp_state);
+	if (spms_state != MLXSW_REG_SPMS_STATE_FORWARDING)
+		return NULL;
+
+	return dev;
+}
+
 static __maybe_unused int
 mlxsw_sp_span_entry_tunnel_parms_common(struct net_device *l3edev,
 					union mlxsw_sp_l3addr saddr,
@@ -177,13 +245,22 @@ mlxsw_sp_span_entry_tunnel_parms_common(struct net_device *l3edev,
 					struct mlxsw_sp_span_parms *sparmsp)
 {
 	unsigned char dmac[ETH_ALEN];
+	u16 vid = 0;
 
 	if (mlxsw_sp_l3addr_is_zero(gw))
 		gw = daddr;
 
-	if (!l3edev || !mlxsw_sp_port_dev_check(l3edev) ||
-	    mlxsw_sp_span_dmac(tbl, &gw, l3edev, dmac))
-		return mlxsw_sp_span_entry_unoffloadable(sparmsp);
+	if (!l3edev || mlxsw_sp_span_dmac(tbl, &gw, l3edev, dmac))
+		goto unoffloadable;
+
+	if (netif_is_bridge_master(l3edev)) {
+		l3edev = mlxsw_sp_span_entry_bridge(l3edev, dmac, &vid);
+		if (!l3edev)
+			goto unoffloadable;
+	}
+
+	if (!mlxsw_sp_port_dev_check(l3edev))
+		goto unoffloadable;
 
 	sparmsp->dest_port = netdev_priv(l3edev);
 	sparmsp->ttl = ttl;
@@ -191,7 +268,11 @@ mlxsw_sp_span_entry_tunnel_parms_common(struct net_device *l3edev,
 	memcpy(sparmsp->smac, l3edev->dev_addr, ETH_ALEN);
 	sparmsp->saddr = saddr;
 	sparmsp->daddr = daddr;
+	sparmsp->vid = vid;
 	return 0;
+
+unoffloadable:
+	return mlxsw_sp_span_entry_unoffloadable(sparmsp);
 }
 
 #if IS_ENABLED(CONFIG_NET_IPGRE)
@@ -268,9 +349,10 @@ mlxsw_sp_span_entry_gretap4_configure(struct mlxsw_sp_span_entry *span_entry,
 	/* Create a new port analayzer entry for local_port. */
 	mlxsw_reg_mpat_pack(mpat_pl, pa_id, local_port, true,
 			    MLXSW_REG_MPAT_SPAN_TYPE_REMOTE_ETH_L3);
+	mlxsw_reg_mpat_eth_rspan_pack(mpat_pl, sparms.vid);
 	mlxsw_reg_mpat_eth_rspan_l2_pack(mpat_pl,
 				    MLXSW_REG_MPAT_ETH_RSPAN_VERSION_NO_HEADER,
-				    sparms.dmac, false);
+				    sparms.dmac, !!sparms.vid);
 	mlxsw_reg_mpat_eth_rspan_l3_ipv4_pack(mpat_pl,
 					      sparms.ttl, sparms.smac,
 					      be32_to_cpu(sparms.saddr.addr4),
@@ -368,9 +450,10 @@ mlxsw_sp_span_entry_gretap6_configure(struct mlxsw_sp_span_entry *span_entry,
 	/* Create a new port analayzer entry for local_port. */
 	mlxsw_reg_mpat_pack(mpat_pl, pa_id, local_port, true,
 			    MLXSW_REG_MPAT_SPAN_TYPE_REMOTE_ETH_L3);
+	mlxsw_reg_mpat_eth_rspan_pack(mpat_pl, sparms.vid);
 	mlxsw_reg_mpat_eth_rspan_l2_pack(mpat_pl,
 				    MLXSW_REG_MPAT_ETH_RSPAN_VERSION_NO_HEADER,
-				    sparms.dmac, false);
+				    sparms.dmac, !!sparms.vid);
 	mlxsw_reg_mpat_eth_rspan_l3_ipv6_pack(mpat_pl, sparms.ttl, sparms.smac,
 					      sparms.saddr.addr6,
 					      sparms.daddr.addr6);
diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_span.h b/drivers/net/ethernet/mellanox/mlxsw/spectrum_span.h
index 4b87ec20e658..14a6de904db1 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_span.h
+++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_span.h
@@ -63,6 +63,7 @@ struct mlxsw_sp_span_parms {
 	unsigned char smac[ETH_ALEN];
 	union mlxsw_sp_l3addr daddr;
 	union mlxsw_sp_l3addr saddr;
+	u16 vid;
 };
 
 struct mlxsw_sp_span_entry_ops;
-- 
2.14.3

^ permalink raw reply related

* [PATCH V9 net-next 00/14] TLS offload, netdev & MLX5 support
From: Boris Pismenny @ 2018-04-29  8:06 UTC (permalink / raw)
  To: davem; +Cc: netdev, saeedm, borisp, davejwatson, ktkhai

Hi Dave,

The following series provides TLS TX inline crypto offload.

v1->v2:
   - Added IS_ENABLED(CONFIG_TLS_DEVICE) and a STATIC_KEY for icsk_clean_acked
   - File license fix
   - Fix spelling, comment by DaveW
   - Move memory allocations out of tls_set_device_offload and other misc fixes,
	comments by Kiril.

v2->v3:
   - Reversed xmas tree where needed and style fixes
   - Removed the need for skb_page_frag_refill, per Eric's comment
   - IPv6 dependency fixes

v3->v4:
   - Remove "inline" from functions in C files
   - Make clean_acked_data_enabled a static variable and add enable/disable functions to control it.
   - Remove unnecessary variable initialization mentioned by ShannonN
   - Rebase over TLS RX
   - Refactor the tls_software_fallback to reduce the number of variables mentioned by KirilT

v4->v5:
   - Add missing CONFIG_TLS_DEVICE

v5->v6:
   - Move changes to the software implementation into a seperate patch
   - Fix some checkpatch warnings 
   - GPL export the enable/disable clean_acked_data functions

v6->v7:
   - Use the dst_entry to obtain the netdev in dev_get_by_index 
   - Remove the IPv6 patch since it is redundent now

v7->v8:
   - Fix a merge conflict in mlx5 header

v8->v9:
   - Fix false -Wmaybe-uninitialized warning
   - Fix empty space in the end of new files

This series adds a generic infrastructure to offload TLS crypto to a
network devices. It enables the kernel TLS socket to skip encryption and
authentication operations on the transmit side of the data path. Leaving
those computationally expensive operations to the NIC.

The NIC offload infrastructure builds TLS records and pushes them to the
TCP layer just like the SW KTLS implementation and using the same API.
TCP segmentation is mostly unaffected. Currently the only exception is
that we prevent mixed SKBs where only part of the payload requires
offload. In the future we are likely to add a similar restriction
following a change cipher spec record.

The notable differences between SW KTLS and NIC offloaded TLS
implementations are as follows:
1. The offloaded implementation builds "plaintext TLS record", those
records contain plaintext instead of ciphertext and place holder bytes
instead of authentication tags.
2. The offloaded implementation maintains a mapping from TCP sequence
number to TLS records. Thus given a TCP SKB sent from a NIC offloaded
TLS socket, we can use the tls NIC offload infrastructure to obtain
enough context to encrypt the payload of the SKB.
A TLS record is released when the last byte of the record is ack'ed,
this is done through the new icsk_clean_acked callback.

The infrastructure should be extendable to support various NIC offload
implementations.  However it is currently written with the
implementation below in mind:
The NIC assumes that packets from each offloaded stream are sent as
plaintext and in-order. It keeps track of the TLS records in the TCP
stream. When a packet marked for offload is transmitted, the NIC
encrypts the payload in-place and puts authentication tags in the
relevant place holders.

The responsibility for handling out-of-order packets (i.e. TCP
retransmission, qdisc drops) falls on the netdev driver.

The netdev driver keeps track of the expected TCP SN from the NIC's
perspective.  If the next packet to transmit matches the expected TCP
SN, the driver advances the expected TCP SN, and transmits the packet
with TLS offload indication.

If the next packet to transmit does not match the expected TCP SN. The
driver calls the TLS layer to obtain the TLS record that includes the
TCP of the packet for transmission. Using this TLS record, the driver
posts a work entry on the transmit queue to reconstruct the NIC TLS
state required for the offload of the out-of-order packet. It updates
the expected TCP SN accordingly and transmit the now in-order packet.
The same queue is used for packet transmission and TLS context
reconstruction to avoid the need for flushing the transmit queue before
issuing the context reconstruction request.

Expected TCP SN is accessed without a lock, under the assumption that
TCP doesn't transmit SKBs from different TX queue concurrently.

If packets are rerouted to a different netdevice, then a software
fallback routine handles encryption.

Paper: https://www.netdevconf.org/1.2/papers/netdevconf-TLS.pdf

Boris Pismenny (3):
  net/tls: Split conf to rx + tx
  MAINTAINERS: Update mlx5 innova driver maintainers
  MAINTAINERS: Update TLS maintainers

Ilya Lesokhin (11):
  tcp: Add clean acked data hook
  net: Rename and export copy_skb_header
  net: Add Software fallback infrastructure for socket dependent
    offloads
  net: Add TLS offload netdev ops
  net: Add TLS TX offload features
  net/tls: Add generic NIC offload infrastructure
  net/mlx5e: Move defines out of ipsec code
  net/mlx5: Accel, Add TLS tx offload interface
  net/mlx5e: TLS, Add Innova TLS TX support
  net/mlx5e: TLS, Add Innova TLS TX offload data path
  net/mlx5e: TLS, Add error statistics

 MAINTAINERS                                        |  19 +-
 drivers/net/ethernet/mellanox/mlx5/core/Kconfig    |  11 +
 drivers/net/ethernet/mellanox/mlx5/core/Makefile   |   6 +-
 .../net/ethernet/mellanox/mlx5/core/accel/tls.c    |  71 ++
 .../net/ethernet/mellanox/mlx5/core/accel/tls.h    |  86 +++
 drivers/net/ethernet/mellanox/mlx5/core/en.h       |  21 +
 .../mellanox/mlx5/core/en_accel/en_accel.h         |  72 ++
 .../ethernet/mellanox/mlx5/core/en_accel/ipsec.h   |   3 -
 .../net/ethernet/mellanox/mlx5/core/en_accel/tls.c | 197 ++++++
 .../net/ethernet/mellanox/mlx5/core/en_accel/tls.h |  87 +++
 .../mellanox/mlx5/core/en_accel/tls_rxtx.c         | 278 ++++++++
 .../mellanox/mlx5/core/en_accel/tls_rxtx.h         |  50 ++
 .../mellanox/mlx5/core/en_accel/tls_stats.c        |  89 +++
 drivers/net/ethernet/mellanox/mlx5/core/en_main.c  |   9 +
 drivers/net/ethernet/mellanox/mlx5/core/en_stats.c |  32 +
 drivers/net/ethernet/mellanox/mlx5/core/en_stats.h |   9 +
 drivers/net/ethernet/mellanox/mlx5/core/en_tx.c    |  37 +-
 .../net/ethernet/mellanox/mlx5/core/fpga/core.h    |   1 +
 .../net/ethernet/mellanox/mlx5/core/fpga/ipsec.c   |   5 +-
 drivers/net/ethernet/mellanox/mlx5/core/fpga/sdk.h |   2 +
 drivers/net/ethernet/mellanox/mlx5/core/fpga/tls.c | 562 +++++++++++++++
 drivers/net/ethernet/mellanox/mlx5/core/fpga/tls.h |  68 ++
 drivers/net/ethernet/mellanox/mlx5/core/main.c     |  11 +
 include/linux/mlx5/mlx5_ifc.h                      |  16 -
 include/linux/mlx5/mlx5_ifc_fpga.h                 |  77 +++
 include/linux/netdev_features.h                    |   2 +
 include/linux/netdevice.h                          |  24 +
 include/linux/skbuff.h                             |   1 +
 include/net/inet_connection_sock.h                 |   2 +
 include/net/sock.h                                 |  21 +
 include/net/tcp.h                                  |   8 +
 include/net/tls.h                                  | 120 +++-
 net/Kconfig                                        |   4 +
 net/core/dev.c                                     |   4 +
 net/core/ethtool.c                                 |   1 +
 net/core/skbuff.c                                  |   9 +-
 net/ipv4/tcp_input.c                               |  25 +
 net/tls/Kconfig                                    |  10 +
 net/tls/Makefile                                   |   2 +
 net/tls/tls_device.c                               | 764 +++++++++++++++++++++
 net/tls/tls_device_fallback.c                      | 450 ++++++++++++
 net/tls/tls_main.c                                 | 143 ++--
 net/tls/tls_sw.c                                   | 138 ++--
 43 files changed, 3356 insertions(+), 191 deletions(-)
 create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/accel/tls.c
 create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/accel/tls.h
 create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/en_accel/en_accel.h
 create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/en_accel/tls.c
 create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/en_accel/tls.h
 create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/en_accel/tls_rxtx.c
 create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/en_accel/tls_rxtx.h
 create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/en_accel/tls_stats.c
 create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/fpga/tls.c
 create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/fpga/tls.h
 create mode 100644 net/tls/tls_device.c
 create mode 100644 net/tls/tls_device_fallback.c

-- 
1.8.3.1

^ permalink raw reply

* [PATCH V9 net-next 04/14] net: Add TLS offload netdev ops
From: Boris Pismenny @ 2018-04-29  8:06 UTC (permalink / raw)
  To: davem
  Cc: netdev, saeedm, borisp, davejwatson, ktkhai, Ilya Lesokhin,
	Aviad Yehezkel
In-Reply-To: <1524989209-5866-1-git-send-email-borisp@mellanox.com>

From: Ilya Lesokhin <ilyal@mellanox.com>

Add new netdev ops to add and delete tls context

Signed-off-by: Ilya Lesokhin <ilyal@mellanox.com>
Signed-off-by: Boris Pismenny <borisp@mellanox.com>
Signed-off-by: Aviad Yehezkel <aviadye@mellanox.com>
---
 include/linux/netdevice.h | 24 ++++++++++++++++++++++++
 1 file changed, 24 insertions(+)

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 366c328..b7f8f9d 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -865,6 +865,26 @@ struct xfrmdev_ops {
 };
 #endif
 
+#if IS_ENABLED(CONFIG_TLS_DEVICE)
+enum tls_offload_ctx_dir {
+	TLS_OFFLOAD_CTX_DIR_RX,
+	TLS_OFFLOAD_CTX_DIR_TX,
+};
+
+struct tls_crypto_info;
+struct tls_context;
+
+struct tlsdev_ops {
+	int (*tls_dev_add)(struct net_device *netdev, struct sock *sk,
+			   enum tls_offload_ctx_dir direction,
+			   struct tls_crypto_info *crypto_info,
+			   u32 start_offload_tcp_sn);
+	void (*tls_dev_del)(struct net_device *netdev,
+			    struct tls_context *ctx,
+			    enum tls_offload_ctx_dir direction);
+};
+#endif
+
 struct dev_ifalias {
 	struct rcu_head rcuhead;
 	char ifalias[];
@@ -1750,6 +1770,10 @@ struct net_device {
 	const struct xfrmdev_ops *xfrmdev_ops;
 #endif
 
+#if IS_ENABLED(CONFIG_TLS_DEVICE)
+	const struct tlsdev_ops *tlsdev_ops;
+#endif
+
 	const struct header_ops *header_ops;
 
 	unsigned int		flags;
-- 
1.8.3.1

^ permalink raw reply related

* [PATCH V9 net-next 02/14] net: Rename and export copy_skb_header
From: Boris Pismenny @ 2018-04-29  8:06 UTC (permalink / raw)
  To: davem; +Cc: netdev, saeedm, borisp, davejwatson, ktkhai, Ilya Lesokhin
In-Reply-To: <1524989209-5866-1-git-send-email-borisp@mellanox.com>

From: Ilya Lesokhin <ilyal@mellanox.com>

copy_skb_header is renamed to skb_copy_header and
exported. Exposing this function give more flexibility
in copying SKBs.
skb_copy and skb_copy_expand do not give enough control
over which parts are copied.

Signed-off-by: Ilya Lesokhin <ilyal@mellanox.com>
Signed-off-by: Boris Pismenny <borisp@mellanox.com>
---
 include/linux/skbuff.h | 1 +
 net/core/skbuff.c      | 9 +++++----
 2 files changed, 6 insertions(+), 4 deletions(-)

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index a4a5c0c..908d66e 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -1034,6 +1034,7 @@ static inline struct sk_buff *alloc_skb_fclone(unsigned int size,
 struct sk_buff *skb_morph(struct sk_buff *dst, struct sk_buff *src);
 int skb_copy_ubufs(struct sk_buff *skb, gfp_t gfp_mask);
 struct sk_buff *skb_clone(struct sk_buff *skb, gfp_t priority);
+void skb_copy_header(struct sk_buff *new, const struct sk_buff *old);
 struct sk_buff *skb_copy(const struct sk_buff *skb, gfp_t priority);
 struct sk_buff *__pskb_copy_fclone(struct sk_buff *skb, int headroom,
 				   gfp_t gfp_mask, bool fclone);
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index c647cfe..c642304 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -1305,7 +1305,7 @@ static void skb_headers_offset_update(struct sk_buff *skb, int off)
 	skb->inner_mac_header += off;
 }
 
-static void copy_skb_header(struct sk_buff *new, const struct sk_buff *old)
+void skb_copy_header(struct sk_buff *new, const struct sk_buff *old)
 {
 	__copy_skb_header(new, old);
 
@@ -1313,6 +1313,7 @@ static void copy_skb_header(struct sk_buff *new, const struct sk_buff *old)
 	skb_shinfo(new)->gso_segs = skb_shinfo(old)->gso_segs;
 	skb_shinfo(new)->gso_type = skb_shinfo(old)->gso_type;
 }
+EXPORT_SYMBOL(skb_copy_header);
 
 static inline int skb_alloc_rx_flag(const struct sk_buff *skb)
 {
@@ -1355,7 +1356,7 @@ struct sk_buff *skb_copy(const struct sk_buff *skb, gfp_t gfp_mask)
 
 	BUG_ON(skb_copy_bits(skb, -headerlen, n->head, headerlen + skb->len));
 
-	copy_skb_header(n, skb);
+	skb_copy_header(n, skb);
 	return n;
 }
 EXPORT_SYMBOL(skb_copy);
@@ -1419,7 +1420,7 @@ struct sk_buff *__pskb_copy_fclone(struct sk_buff *skb, int headroom,
 		skb_clone_fraglist(n);
 	}
 
-	copy_skb_header(n, skb);
+	skb_copy_header(n, skb);
 out:
 	return n;
 }
@@ -1599,7 +1600,7 @@ struct sk_buff *skb_copy_expand(const struct sk_buff *skb,
 	BUG_ON(skb_copy_bits(skb, -head_copy_len, n->head + head_copy_off,
 			     skb->len + head_copy_len));
 
-	copy_skb_header(n, skb);
+	skb_copy_header(n, skb);
 
 	skb_headers_offset_update(n, newheadroom - oldheadroom);
 
-- 
1.8.3.1

^ permalink raw reply related

* [PATCH V9 net-next 05/14] net: Add TLS TX offload features
From: Boris Pismenny @ 2018-04-29  8:06 UTC (permalink / raw)
  To: davem
  Cc: netdev, saeedm, borisp, davejwatson, ktkhai, Ilya Lesokhin,
	Aviad Yehezkel
In-Reply-To: <1524989209-5866-1-git-send-email-borisp@mellanox.com>

From: Ilya Lesokhin <ilyal@mellanox.com>

This patch adds a netdev feature to configure TLS TX offloads.

Signed-off-by: Ilya Lesokhin <ilyal@mellanox.com>
Signed-off-by: Boris Pismenny <borisp@mellanox.com>
Signed-off-by: Aviad Yehezkel <aviadye@mellanox.com>
---
 include/linux/netdev_features.h | 2 ++
 net/core/ethtool.c              | 1 +
 2 files changed, 3 insertions(+)

diff --git a/include/linux/netdev_features.h b/include/linux/netdev_features.h
index fe2f3b3..c87c3a3 100644
--- a/include/linux/netdev_features.h
+++ b/include/linux/netdev_features.h
@@ -78,6 +78,7 @@ enum {
 	NETIF_F_HW_ESP_BIT,		/* Hardware ESP transformation offload */
 	NETIF_F_HW_ESP_TX_CSUM_BIT,	/* ESP with TX checksum offload */
 	NETIF_F_RX_UDP_TUNNEL_PORT_BIT, /* Offload of RX port for UDP tunnels */
+	NETIF_F_HW_TLS_TX_BIT,		/* Hardware TLS TX offload */
 
 	NETIF_F_GRO_HW_BIT,		/* Hardware Generic receive offload */
 	NETIF_F_HW_TLS_RECORD_BIT,	/* Offload TLS record */
@@ -149,6 +150,7 @@ enum {
 #define	NETIF_F_RX_UDP_TUNNEL_PORT  __NETIF_F(RX_UDP_TUNNEL_PORT)
 #define NETIF_F_HW_TLS_RECORD	__NETIF_F(HW_TLS_RECORD)
 #define NETIF_F_GSO_UDP_L4	__NETIF_F(GSO_UDP_L4)
+#define NETIF_F_HW_TLS_TX	__NETIF_F(HW_TLS_TX)
 
 #define for_each_netdev_feature(mask_addr, bit)	\
 	for_each_set_bit(bit, (unsigned long *)mask_addr, NETDEV_FEATURE_COUNT)
diff --git a/net/core/ethtool.c b/net/core/ethtool.c
index b849fda..3c6ce8a 100644
--- a/net/core/ethtool.c
+++ b/net/core/ethtool.c
@@ -110,6 +110,7 @@ int ethtool_op_get_ts_info(struct net_device *dev, struct ethtool_ts_info *info)
 	[NETIF_F_HW_ESP_TX_CSUM_BIT] =	 "esp-tx-csum-hw-offload",
 	[NETIF_F_RX_UDP_TUNNEL_PORT_BIT] =	 "rx-udp_tunnel-port-offload",
 	[NETIF_F_HW_TLS_RECORD_BIT] =	"tls-hw-record",
+	[NETIF_F_HW_TLS_TX_BIT] =	 "tls-hw-tx-offload",
 };
 
 static const char
-- 
1.8.3.1

^ permalink raw reply related


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox