Netdev List
 help / color / mirror / Atom feed
* [RFC PATCH v3 bpf-next 3/5] bpf/verifier: update selftests
From: Edward Cree @ 2018-04-06 17:14 UTC (permalink / raw)
  To: Alexei Starovoitov, Daniel Borkmann; +Cc: netdev
In-Reply-To: <b445b3c5-04f8-7425-2a3c-9bc54a04add2@solarflare.com>

Error messages for some bad programs have changed.
Also added a test ("calls: interleaved functions") to ensure that subprogs
 are required to be contiguous.
It wasn't entirely clear to me what "calls: wrong recursive calls" was
 meant to test for, since all of the JMP|CALL insns are unreachable.  I've
 changed it so that they are now reachable, which causes static back-edges
 to be detected (since that, like insn reachability, is now tested before
 subprog boundaries are determined).

Signed-off-by: Edward Cree <ecree@solarflare.com>
---
 tools/testing/selftests/bpf/test_verifier.c | 51 ++++++++++++++++++-----------
 1 file changed, 32 insertions(+), 19 deletions(-)

diff --git a/tools/testing/selftests/bpf/test_verifier.c b/tools/testing/selftests/bpf/test_verifier.c
index 3e7718b1a9ae..d53522d20072 100644
--- a/tools/testing/selftests/bpf/test_verifier.c
+++ b/tools/testing/selftests/bpf/test_verifier.c
@@ -646,7 +646,7 @@ static struct bpf_test tests[] = {
 		.insns = {
 			BPF_ALU64_REG(BPF_MOV, BPF_REG_0, BPF_REG_2),
 		},
-		.errstr = "not an exit",
+		.errstr = "no exit/jump at end of program",
 		.result = REJECT,
 	},
 	{
@@ -9442,13 +9442,13 @@ static struct bpf_test tests[] = {
 			BPF_EXIT_INSN(),
 		},
 		.prog_type = BPF_PROG_TYPE_TRACEPOINT,
-		.errstr = "last insn is not an exit or jmp",
+		.errstr = "no exit/jump at end of subprog 0 (insn 0)",
 		.result = REJECT,
 	},
 	{
 		"calls: wrong recursive calls",
 		.insns = {
-			BPF_JMP_IMM(BPF_JA, 0, 0, 4),
+			BPF_JMP_IMM(BPF_JA, 0, 0, 3),
 			BPF_JMP_IMM(BPF_JA, 0, 0, 4),
 			BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, -2),
 			BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, -2),
@@ -9457,7 +9457,7 @@ static struct bpf_test tests[] = {
 			BPF_EXIT_INSN(),
 		},
 		.prog_type = BPF_PROG_TYPE_TRACEPOINT,
-		.errstr = "jump out of range",
+		.errstr = "jump from insn 0 to 4 crosses",
 		.result = REJECT,
 	},
 	{
@@ -9508,7 +9508,7 @@ static struct bpf_test tests[] = {
 			BPF_EXIT_INSN(),
 		},
 		.prog_type = BPF_PROG_TYPE_TRACEPOINT,
-		.errstr = "jump out of range",
+		.errstr = "jump from insn 1 to 5 crosses",
 		.result = REJECT,
 	},
 	{
@@ -9787,7 +9787,7 @@ static struct bpf_test tests[] = {
 			BPF_EXIT_INSN(),
 		},
 		.prog_type = BPF_PROG_TYPE_SCHED_CLS,
-		.errstr = "jump out of range from insn 1 to 4",
+		.errstr = "jump from insn 1 to 4 crosses",
 		.result = REJECT,
 	},
 	{
@@ -9803,13 +9803,12 @@ static struct bpf_test tests[] = {
 			BPF_ALU64_REG(BPF_ADD, BPF_REG_7, BPF_REG_0),
 			BPF_MOV64_REG(BPF_REG_0, BPF_REG_7),
 			BPF_EXIT_INSN(),
-			BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_1,
-				    offsetof(struct __sk_buff, len)),
+			BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1, 8),
 			BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, -3),
 			BPF_EXIT_INSN(),
 		},
 		.prog_type = BPF_PROG_TYPE_TRACEPOINT,
-		.errstr = "jump out of range from insn 11 to 9",
+		.errstr = "jump from insn 11 to 9 crosses",
 		.result = REJECT,
 	},
 	{
@@ -9861,7 +9860,7 @@ static struct bpf_test tests[] = {
 			BPF_EXIT_INSN(),
 		},
 		.prog_type = BPF_PROG_TYPE_TRACEPOINT,
-		.errstr = "invalid destination",
+		.errstr = "call to invalid destination",
 		.result = REJECT,
 	},
 	{
@@ -9873,7 +9872,7 @@ static struct bpf_test tests[] = {
 			BPF_EXIT_INSN(),
 		},
 		.prog_type = BPF_PROG_TYPE_TRACEPOINT,
-		.errstr = "invalid destination",
+		.errstr = "call to invalid destination",
 		.result = REJECT,
 	},
 	{
@@ -9886,7 +9885,7 @@ static struct bpf_test tests[] = {
 			BPF_EXIT_INSN(),
 		},
 		.prog_type = BPF_PROG_TYPE_TRACEPOINT,
-		.errstr = "jump out of range",
+		.errstr = "jump from insn 3 to 1 crosses",
 		.result = REJECT,
 	},
 	{
@@ -9899,7 +9898,7 @@ static struct bpf_test tests[] = {
 			BPF_EXIT_INSN(),
 		},
 		.prog_type = BPF_PROG_TYPE_TRACEPOINT,
-		.errstr = "jump out of range",
+		.errstr = "jump from insn 0 to 4 crosses",
 		.result = REJECT,
 	},
 	{
@@ -9913,7 +9912,7 @@ static struct bpf_test tests[] = {
 			BPF_JMP_IMM(BPF_JEQ, BPF_REG_1, 0, -2),
 		},
 		.prog_type = BPF_PROG_TYPE_TRACEPOINT,
-		.errstr = "not an exit",
+		.errstr = "no exit/jump at end of program",
 		.result = REJECT,
 	},
 	{
@@ -9927,7 +9926,7 @@ static struct bpf_test tests[] = {
 			BPF_EXIT_INSN(),
 		},
 		.prog_type = BPF_PROG_TYPE_TRACEPOINT,
-		.errstr = "last insn",
+		.errstr = "no exit/jump at end of subprog",
 		.result = REJECT,
 	},
 	{
@@ -9942,7 +9941,7 @@ static struct bpf_test tests[] = {
 			BPF_EXIT_INSN(),
 		},
 		.prog_type = BPF_PROG_TYPE_TRACEPOINT,
-		.errstr = "last insn",
+		.errstr = "no exit/jump at end of subprog",
 		.result = REJECT,
 	},
 	{
@@ -9982,12 +9981,11 @@ static struct bpf_test tests[] = {
 			BPF_ALU64_REG(BPF_ADD, BPF_REG_7, BPF_REG_0),
 			BPF_MOV64_REG(BPF_REG_0, BPF_REG_7),
 			BPF_MOV64_REG(BPF_REG_0, BPF_REG_0),
-			BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_1,
-				    offsetof(struct __sk_buff, len)),
+			BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1, 8),
 			BPF_EXIT_INSN(),
 		},
 		.prog_type = BPF_PROG_TYPE_TRACEPOINT,
-		.errstr = "not an exit",
+		.errstr = "no exit/jump at end of subprog",
 		.result = REJECT,
 	},
 	{
@@ -11423,6 +11421,21 @@ static struct bpf_test tests[] = {
 		.errstr = "BPF_XADD stores into R2 packet",
 		.prog_type = BPF_PROG_TYPE_XDP,
 	},
+	{
+		"calls: interleaved functions",
+		.insns = {
+			BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, 2),
+			BPF_MOV64_IMM(BPF_REG_0, 1),
+			BPF_JMP_IMM(BPF_JA, 0, 0, 2),
+			BPF_MOV64_IMM(BPF_REG_0, 2),
+			BPF_JMP_IMM(BPF_JA, 0, 0, 1),
+			BPF_EXIT_INSN(),
+			BPF_EXIT_INSN(),
+		},
+		.prog_type = BPF_PROG_TYPE_TRACEPOINT,
+		.errstr = "jump from insn 2 to 5 crosses",
+		.result = REJECT,
+	},
 };
 
 static int probe_filter_length(const struct bpf_insn *fp)

^ permalink raw reply related

* [RFC PATCH v3 bpf-next 2/5] bpf/verifier: rewrite subprog boundary detection
From: Edward Cree @ 2018-04-06 17:13 UTC (permalink / raw)
  To: Alexei Starovoitov, Daniel Borkmann; +Cc: netdev
In-Reply-To: <b445b3c5-04f8-7425-2a3c-9bc54a04add2@solarflare.com>

By storing a subprogno in each insn's aux data, we avoid the need to keep
 the list of subprog starts sorted or bsearch() it in find_subprog().
Also, get rid of the weird one-based indexing of subprog numbers.

Signed-off-by: Edward Cree <ecree@solarflare.com>
---
 include/linux/bpf_verifier.h |   3 +-
 kernel/bpf/verifier.c        | 284 ++++++++++++++++++++++++++-----------------
 2 files changed, 177 insertions(+), 110 deletions(-)

diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h
index 8f70dc181e23..17990dd56e65 100644
--- a/include/linux/bpf_verifier.h
+++ b/include/linux/bpf_verifier.h
@@ -146,6 +146,7 @@ struct bpf_insn_aux_data {
 		s32 call_imm;			/* saved imm field of call insn */
 	};
 	int ctx_field_size; /* the ctx field size for load insn, maybe 0 */
+	u16 subprogno; /* subprog in which this insn resides */
 	bool seen; /* this insn was processed by the verifier */
 };
 
@@ -196,7 +197,7 @@ struct bpf_verifier_env {
 	bool seen_direct_write;
 	struct bpf_insn_aux_data *insn_aux_data; /* array of per-insn state */
 	struct bpf_verifier_log log;
-	struct bpf_subprog_info subprog_info[BPF_MAX_SUBPROGS + 1];
+	struct bpf_subprog_info subprog_info[BPF_MAX_SUBPROGS];
 	u32 subprog_cnt;
 };
 
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index df4d742360d9..587eb445bfa2 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -736,109 +736,171 @@ enum reg_arg_type {
 	DST_OP_NO_MARK	/* same as above, check only, don't mark */
 };
 
-static int cmp_subprogs(const void *a, const void *b)
+static int find_subprog(struct bpf_verifier_env *env, int insn_idx)
 {
-	return ((struct bpf_subprog_info *)a)->start -
-	       ((struct bpf_subprog_info *)b)->start;
-}
-
-static int find_subprog(struct bpf_verifier_env *env, int off)
-{
-	struct bpf_subprog_info *p;
-
-	p = bsearch(&off, env->subprog_info, env->subprog_cnt,
-		    sizeof(env->subprog_info[0]), cmp_subprogs);
-	if (!p)
-		return -ENOENT;
-	return p - env->subprog_info;
+	int insn_cnt = env->prog->len;
+	u32 subprogno;
 
+	if (insn_idx >= insn_cnt || insn_idx < 0) {
+		verbose(env, "find_subprog of invalid insn_idx %d\n", insn_idx);
+		return -EINVAL;
+	}
+	subprogno = env->insn_aux_data[insn_idx].subprogno;
+	/* validate that we are at start of subprog */
+	if (env->subprog_info[subprogno].start != insn_idx) {
+		verbose(env, "insn_idx %d is in subprog %u but that starts at %d\n",
+			insn_idx, subprogno, env->subprog_info[subprogno].start);
+		return -EINVAL;
+	}
+	return subprogno;
 }
 
 static int add_subprog(struct bpf_verifier_env *env, int off)
 {
 	int insn_cnt = env->prog->len;
+	struct bpf_subprog_info *info;
 	int ret;
 
 	if (off >= insn_cnt || off < 0) {
 		verbose(env, "call to invalid destination\n");
 		return -EINVAL;
 	}
-	ret = find_subprog(env, off);
-	if (ret >= 0)
-		return 0;
-	if (env->subprog_cnt >= BPF_MAX_SUBPROGS) {
-		verbose(env, "too many subprograms\n");
-		return -E2BIG;
+	ret = env->insn_aux_data[off].subprogno;
+	/* At the time add_subprog is called, only (some) entry points are
+	 * marked with an aux->subprogno; 'body' insns aren't.  Thus, a
+	 * subprogno of 0 means either "not yet marked as an entry point" or
+	 * "entry point of main(), i.e. insn 0".
+	 * However, a call to insn 0 is never legal (it would necessarily imply
+	 * recursion, which check_cfg will catch), therefore we can assume that
+	 * we will only be called with off == 0 once, and in that case we
+	 * should go ahead and add the subprog entry.
+	 */
+	if (!ret) {
+		if (env->subprog_cnt >= BPF_MAX_SUBPROGS) {
+			verbose(env, "too many subprograms\n");
+			return -E2BIG;
+		}
+		ret = env->subprog_cnt++;
+		info = &env->subprog_info[ret];
+		info->start = off;
+		info->stack_depth = 0;
+		env->insn_aux_data[off].subprogno = ret;
 	}
-	env->subprog_info[env->subprog_cnt++].start = off;
-	sort(env->subprog_info, env->subprog_cnt,
-	     sizeof(env->subprog_info[0]), cmp_subprogs, NULL);
-	return 0;
+	return ret;
 }
 
 static int check_subprogs(struct bpf_verifier_env *env)
 {
-	int i, ret, subprog_start, subprog_end, off, cur_subprog = 0;
-	struct bpf_insn *insn = env->prog->insnsi;
-	int insn_cnt = env->prog->len;
+	struct bpf_insn_aux_data *aux = env->insn_aux_data;
+	struct bpf_insn *insns = env->prog->insnsi;
+	int insn_cnt = env->prog->len, i, err;
+	int cur_subprog;
 
-	/* determine subprog starts. The end is one before the next starts */
-	for (i = 0; i < insn_cnt; i++) {
-		if (insn[i].code != (BPF_JMP | BPF_CALL))
-			continue;
-		if (insn[i].src_reg != BPF_PSEUDO_CALL)
-			continue;
-		if (!env->allow_ptr_leaks) {
-			verbose(env, "function calls to other bpf functions are allowed for root only\n");
-			return -EPERM;
-		}
-		if (bpf_prog_is_dev_bound(env->prog->aux)) {
-			verbose(env, "function calls in offloaded programs are not supported yet\n");
-			return -EINVAL;
+	/* Find subprog starts */
+	err = add_subprog(env, 0); /* subprog 0, main(), starts at insn 0 */
+	if (err < 0)
+		return err;
+	for (i = 0; i < insn_cnt; i++)
+		if (insns[i].code == (BPF_JMP | BPF_CALL) &&
+		    insns[i].src_reg == BPF_PSEUDO_CALL) {
+			if (!env->allow_ptr_leaks) {
+				verbose(env, "function calls to other bpf functions are allowed for root only\n");
+				return -EPERM;
+			}
+			if (bpf_prog_is_dev_bound(env->prog->aux)) {
+				verbose(env, "function calls in offloaded programs are not supported yet\n");
+				return -EINVAL;
+			}
+			/* add_subprog marks the callee entry point with the
+			 * new subprogno.
+			 */
+			err = add_subprog(env, i + insns[i].imm + 1);
+			if (err < 0)
+				return err;
 		}
-		ret = add_subprog(env, i + insn[i].imm + 1);
-		if (ret < 0)
-			return ret;
-	}
 
 	if (env->log.level > 1)
 		for (i = 0; i < env->subprog_cnt; i++)
 			verbose(env, "func#%d @%d\n", i, env->subprog_info[i].start);
 
-	/* now check that all jumps are within the same subprog */
-	subprog_start = 0;
-	if (env->subprog_cnt == cur_subprog)
-		subprog_end = insn_cnt;
-	else
-		subprog_end = env->subprog_info[cur_subprog++].start;
+	/* Fill in subprogno on each insn of function body, on the assumption
+	 * that each subprog is a contiguous block of insns.  This will be
+	 * validated by the next step
+	 */
+	cur_subprog = 0;
+	for (i = 0; i < insn_cnt; i++)
+		if (aux[i].subprogno)
+			cur_subprog = aux[i].subprogno;
+		else
+			aux[i].subprogno = cur_subprog;
+
+	/* Check that control never flows from one subprog to another except by
+	 * a pseudo-call insn.
+	 */
 	for (i = 0; i < insn_cnt; i++) {
-		u8 code = insn[i].code;
-
-		if (BPF_CLASS(code) != BPF_JMP)
-			goto next;
-		if (BPF_OP(code) == BPF_EXIT || BPF_OP(code) == BPF_CALL)
-			goto next;
-		off = i + insn[i].off + 1;
-		if (off < subprog_start || off >= subprog_end) {
-			verbose(env, "jump out of range from insn %d to %d\n", i, off);
-			return -EINVAL;
+		bool fallthrough = false, jump = false;
+		u8 opcode = BPF_OP(insns[i].code);
+		int target;
+
+		/* Determine where control flow from this insn goes */
+		if (BPF_CLASS(insns[i].code) != BPF_JMP) {
+			fallthrough = true;
+		} else {
+			switch (opcode) {
+			case BPF_EXIT:
+				/* This insn doesn't go anywhere.  If we're in
+				 * a subprog, then the return target is handled
+				 * as a fallthrough on the call insn, so no need
+				 * to worry about it here.
+				 */
+				continue;
+			case BPF_CALL:
+				/* If pseudo-call, already handled marking the
+				 * callee.  Both kinds of call will eventually
+				 * return and pass to the following insn.
+				 */
+				fallthrough = true;
+				break;
+			case BPF_JA:
+				/* unconditional jump; mark target */
+				jump = true;
+				target = i + insns[i].off + 1;
+				break;
+			default:
+				/* conditional jump; branch both ways */
+				fallthrough = true;
+				jump = true;
+				target = i + insns[i].off + 1;
+				break;
+			}
 		}
-next:
-		if (i == subprog_end - 1) {
-			/* to avoid fall-through from one subprog into another
-			 * the last insn of the subprog should be either exit
-			 * or unconditional jump back
-			 */
-			if (code != (BPF_JMP | BPF_EXIT) &&
-			    code != (BPF_JMP | BPF_JA)) {
-				verbose(env, "last insn is not an exit or jmp\n");
+
+		/* Check that that control flow doesn't leave the subprog */
+		cur_subprog = aux[i].subprogno;
+		if (fallthrough) {
+			if (i + 1 >= insn_cnt) {
+				verbose(env, "no exit/jump at end of program (insn %d)\n",
+					i);
+				return -EINVAL;
+			}
+			if (aux[i + 1].subprogno != cur_subprog) {
+				verbose(env, "no exit/jump at end of subprog %d (insn %d)\n",
+					cur_subprog, i);
+				return -EINVAL;
+			}
+		}
+		if (jump) {
+			if (target < 0 || target >= insn_cnt) {
+				verbose(env, "jump out of range from insn %d to %d\n",
+					i, target);
+				return -EINVAL;
+			}
+			if (aux[target].subprogno != cur_subprog) {
+				verbose(env, "jump from insn %d to %d crosses from subprog %d to %d\n",
+					i, target, cur_subprog,
+					aux[target].subprogno);
 				return -EINVAL;
 			}
-			subprog_start = subprog_end;
-			if (env->subprog_cnt == cur_subprog)
-				subprog_end = insn_cnt;
-			else
-				subprog_end = env->subprog_info[cur_subprog++].start;
 		}
 	}
 	return 0;
@@ -1489,7 +1551,8 @@ static int update_stack_depth(struct bpf_verifier_env *env,
  */
 static int check_max_stack_depth(struct bpf_verifier_env *env)
 {
-	int depth = 0, frame = 0, subprog = 0, i = 0, subprog_end;
+	int depth = 0, frame = 0, subprog = 0, i = 0;
+	struct bpf_insn_aux_data *aux = env->insn_aux_data;
 	struct bpf_insn *insn = env->prog->insnsi;
 	int insn_cnt = env->prog->len;
 	int ret_insn[MAX_CALL_FRAMES];
@@ -1507,11 +1570,7 @@ static int check_max_stack_depth(struct bpf_verifier_env *env)
 		return -EACCES;
 	}
 continue_func:
-	if (env->subprog_cnt == subprog)
-		subprog_end = insn_cnt;
-	else
-		subprog_end = env->subprog_info[subprog].start;
-	for (; i < subprog_end; i++) {
+	for (; i < insn_cnt && aux[i].subprogno == subprog; i++) {
 		if (insn[i].code != (BPF_JMP | BPF_CALL))
 			continue;
 		if (insn[i].src_reg != BPF_PSEUDO_CALL)
@@ -1528,7 +1587,6 @@ static int check_max_stack_depth(struct bpf_verifier_env *env)
 				  i);
 			return -EFAULT;
 		}
-		subprog++;
 		frame++;
 		if (frame >= MAX_CALL_FRAMES) {
 			WARN_ONCE(1, "verifier bug. Call stack is too deep\n");
@@ -1561,7 +1619,6 @@ static int get_callee_stack_depth(struct bpf_verifier_env *env,
 			  start);
 		return -EFAULT;
 	}
-	subprog++;
 	return env->subprog_info[subprog].stack_depth;
 }
 #endif
@@ -2100,7 +2157,7 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env,
 	case BPF_FUNC_tail_call:
 		if (map->map_type != BPF_MAP_TYPE_PROG_ARRAY)
 			goto error;
-		if (env->subprog_cnt) {
+		if (env->subprog_cnt > 1) {
 			verbose(env, "tail_calls are not allowed in programs with bpf-to-bpf calls\n");
 			return -EINVAL;
 		}
@@ -2245,6 +2302,9 @@ static int check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
 	}
 
 	target_insn = *insn_idx + insn->imm;
+	/* We will increment insn_idx (PC) in do_check() after handling this
+	 * call insn, so the actual start of the function is target + 1.
+	 */
 	subprog = find_subprog(env, target_insn + 1);
 	if (subprog < 0) {
 		verbose(env, "verifier bug. No program starts at insn %d\n",
@@ -2272,7 +2332,7 @@ static int check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
 			/* remember the callsite, it will be used by bpf_exit */
 			*insn_idx /* callsite */,
 			state->curframe + 1 /* frameno within this callchain */,
-			subprog + 1 /* subprog number within this prog */);
+			subprog /* subprog number within this prog */);
 
 	/* copy r1 - r5 args that callee can access */
 	for (i = BPF_REG_1; i <= BPF_REG_5; i++)
@@ -3831,7 +3891,7 @@ static int check_ld_abs(struct bpf_verifier_env *env, struct bpf_insn *insn)
 		return -EINVAL;
 	}
 
-	if (env->subprog_cnt) {
+	if (env->subprog_cnt > 1) {
 		/* when program has LD_ABS insn JITs and interpreter assume
 		 * that r1 == ctx == skb which is not the case for callees
 		 * that can have arbitrary arguments. It's problematic
@@ -4020,10 +4080,6 @@ static int check_cfg(struct bpf_verifier_env *env)
 	int ret = 0;
 	int i, t;
 
-	ret = check_subprogs(env);
-	if (ret < 0)
-		return ret;
-
 	insn_state = kcalloc(insn_cnt, sizeof(int), GFP_KERNEL);
 	if (!insn_state)
 		return -ENOMEM;
@@ -4862,11 +4918,11 @@ static int do_check(struct bpf_verifier_env *env)
 
 	verbose(env, "processed %d insns (limit %d), stack depth ",
 		insn_processed, BPF_COMPLEXITY_LIMIT_INSNS);
-	for (i = 0; i < env->subprog_cnt + 1; i++) {
+	for (i = 0; i < env->subprog_cnt; i++) {
 		u32 depth = env->subprog_info[i].stack_depth;
 
 		verbose(env, "%d", depth);
-		if (i + 1 < env->subprog_cnt + 1)
+		if (i + 1 < env->subprog_cnt)
 			verbose(env, "+");
 	}
 	verbose(env, "\n");
@@ -5238,12 +5294,12 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env)
 static int jit_subprogs(struct bpf_verifier_env *env)
 {
 	struct bpf_prog *prog = env->prog, **func, *tmp;
-	int i, j, subprog_start, subprog_end = 0, len, subprog;
 	struct bpf_insn *insn;
 	void *old_bpf_func;
+	int i, j, subprog;
 	int err = -ENOMEM;
 
-	if (env->subprog_cnt == 0)
+	if (env->subprog_cnt <= 1)
 		return 0;
 
 	for (i = 0, insn = prog->insnsi; i < prog->len; i++, insn++) {
@@ -5259,7 +5315,7 @@ static int jit_subprogs(struct bpf_verifier_env *env)
 		/* temporarily remember subprog id inside insn instead of
 		 * aux_data, since next loop will split up all insns into funcs
 		 */
-		insn->off = subprog + 1;
+		insn->off = subprog;
 		/* remember original imm in case JIT fails and fallback
 		 * to interpreter will be needed
 		 */
@@ -5268,23 +5324,24 @@ static int jit_subprogs(struct bpf_verifier_env *env)
 		insn->imm = 1;
 	}
 
-	func = kzalloc(sizeof(prog) * (env->subprog_cnt + 1), GFP_KERNEL);
+	func = kzalloc(sizeof(prog) * env->subprog_cnt, GFP_KERNEL);
 	if (!func)
 		return -ENOMEM;
 
-	for (i = 0; i <= env->subprog_cnt; i++) {
-		subprog_start = subprog_end;
-		if (env->subprog_cnt == i)
-			subprog_end = prog->len;
-		else
-			subprog_end = env->subprog_info[i].start;
+	for (i = 0; i < env->subprog_cnt; i++) {
+		struct bpf_subprog_info *info = &env->subprog_info[i];
+		int len, end = prog->len;
+
+		if (i + 1 < env->subprog_cnt)
+			end = info[1].start;
+		len = end - info->start;
 
-		len = subprog_end - subprog_start;
 		func[i] = bpf_prog_alloc(bpf_prog_size(len), GFP_USER);
 		if (!func[i])
 			goto out_free;
-		memcpy(func[i]->insnsi, &prog->insnsi[subprog_start],
+		memcpy(func[i]->insnsi, prog->insnsi + info->start,
 		       len * sizeof(struct bpf_insn));
+
 		func[i]->type = prog->type;
 		func[i]->len = len;
 		if (bpf_prog_calc_tag(func[i]))
@@ -5307,7 +5364,7 @@ static int jit_subprogs(struct bpf_verifier_env *env)
 	 * now populate all bpf_calls with correct addresses and
 	 * run last pass of JIT
 	 */
-	for (i = 0; i <= env->subprog_cnt; i++) {
+	for (i = 0; i < env->subprog_cnt; i++) {
 		insn = func[i]->insnsi;
 		for (j = 0; j < func[i]->len; j++, insn++) {
 			if (insn->code != (BPF_JMP | BPF_CALL) ||
@@ -5315,12 +5372,16 @@ static int jit_subprogs(struct bpf_verifier_env *env)
 				continue;
 			subprog = insn->off;
 			insn->off = 0;
+			if (subprog < 0 || subprog >= env->subprog_cnt) {
+				err = -EFAULT;
+				goto out_free;
+			}
 			insn->imm = (u64 (*)(u64, u64, u64, u64, u64))
 				func[subprog]->bpf_func -
 				__bpf_call_base;
 		}
 	}
-	for (i = 0; i <= env->subprog_cnt; i++) {
+	for (i = 0; i < env->subprog_cnt; i++) {
 		old_bpf_func = func[i]->bpf_func;
 		tmp = bpf_int_jit_compile(func[i]);
 		if (tmp != func[i] || func[i]->bpf_func != old_bpf_func) {
@@ -5334,7 +5395,7 @@ static int jit_subprogs(struct bpf_verifier_env *env)
 	/* finally lock prog and jit images for all functions and
 	 * populate kallsysm
 	 */
-	for (i = 0; i <= env->subprog_cnt; i++) {
+	for (i = 0; i < env->subprog_cnt; i++) {
 		bpf_prog_lock_ro(func[i]);
 		bpf_prog_kallsyms_add(func[i]);
 	}
@@ -5351,7 +5412,7 @@ static int jit_subprogs(struct bpf_verifier_env *env)
 			continue;
 		insn->off = env->insn_aux_data[i].call_imm;
 		subprog = find_subprog(env, i + insn->off + 1);
-		addr  = (unsigned long)func[subprog + 1]->bpf_func;
+		addr  = (unsigned long)func[subprog]->bpf_func;
 		addr &= PAGE_MASK;
 		insn->imm = (u64 (*)(u64, u64, u64, u64, u64))
 			    addr - __bpf_call_base;
@@ -5360,10 +5421,10 @@ static int jit_subprogs(struct bpf_verifier_env *env)
 	prog->jited = 1;
 	prog->bpf_func = func[0]->bpf_func;
 	prog->aux->func = func;
-	prog->aux->func_cnt = env->subprog_cnt + 1;
+	prog->aux->func_cnt = env->subprog_cnt;
 	return 0;
 out_free:
-	for (i = 0; i <= env->subprog_cnt; i++)
+	for (i = 0; i < env->subprog_cnt; i++)
 		if (func[i])
 			bpf_jit_free(func[i]);
 	kfree(func);
@@ -5393,6 +5454,7 @@ static int fixup_call_args(struct bpf_verifier_env *env)
 		err = jit_subprogs(env);
 		if (err == 0)
 			return 0;
+		verbose(env, "failed to jit_subprogs %d\n", err);
 	}
 #ifndef CONFIG_BPF_JIT_ALWAYS_ON
 	for (i = 0; i < prog->len; i++, insn++) {
@@ -5682,6 +5744,10 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr)
 
 	env->allow_ptr_leaks = capable(CAP_SYS_ADMIN);
 
+	ret = check_subprogs(env);
+	if (ret < 0)
+		goto skip_full_check;
+
 	ret = check_cfg(env);
 	if (ret < 0)
 		goto skip_full_check;

^ permalink raw reply related

* [RFC PATCH v3 bpf-next 5/5] bpf/verifier: per-register parent pointers
From: Edward Cree @ 2018-04-06 17:15 UTC (permalink / raw)
  To: Alexei Starovoitov, Daniel Borkmann; +Cc: netdev
In-Reply-To: <b445b3c5-04f8-7425-2a3c-9bc54a04add2@solarflare.com>

By giving each register its own liveness chain, we elide the skip_callee()
 logic.  Instead, each register's parent is the state it inherits from;
 both check_func_call() and prepare_func_exit() automatically connect
 reg states to the correct chain since when they copy the reg state across
 (r1-r5 into the callee as args, and r0 out as the return value) they also
 copy the parent pointer.

Signed-off-by: Edward Cree <ecree@solarflare.com>
---
 include/linux/bpf_verifier.h |   8 +-
 kernel/bpf/verifier.c        | 180 ++++++++++---------------------------------
 2 files changed, 45 insertions(+), 143 deletions(-)

diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h
index 32f27cbe721b..81b627dbda32 100644
--- a/include/linux/bpf_verifier.h
+++ b/include/linux/bpf_verifier.h
@@ -41,6 +41,7 @@ enum bpf_reg_liveness {
 };
 
 struct bpf_reg_state {
+	/* Ordering of fields matters.  See states_equal() */
 	enum bpf_reg_type type;
 	union {
 		/* valid when type == PTR_TO_PACKET */
@@ -59,7 +60,6 @@ struct bpf_reg_state {
 	 * came from, when one is tested for != NULL.
 	 */
 	u32 id;
-	/* Ordering of fields matters.  See states_equal() */
 	/* For scalar types (SCALAR_VALUE), this represents our knowledge of
 	 * the actual value.
 	 * For pointer types, this represents the variable part of the offset
@@ -76,15 +76,15 @@ struct bpf_reg_state {
 	s64 smax_value; /* maximum possible (s64)value */
 	u64 umin_value; /* minimum possible (u64)value */
 	u64 umax_value; /* maximum possible (u64)value */
+	/* parentage chain for liveness checking */
+	struct bpf_reg_state *parent;
 	/* Inside the callee two registers can be both PTR_TO_STACK like
 	 * R1=fp-8 and R2=fp-8, but one of them points to this function stack
 	 * while another to the caller's stack. To differentiate them 'frameno'
 	 * is used which is an index in bpf_verifier_state->frame[] array
 	 * pointing to bpf_func_state.
-	 * This field must be second to last, for states_equal() reasons.
 	 */
 	u32 frameno;
-	/* This field must be last, for states_equal() reasons. */
 	enum bpf_reg_liveness live;
 };
 
@@ -107,7 +107,6 @@ struct bpf_stack_state {
  */
 struct bpf_func_state {
 	struct bpf_reg_state regs[MAX_BPF_REG];
-	struct bpf_verifier_state *parent;
 	/* index of call instruction that called into this func */
 	int callsite;
 	/* stack frame number of this function state from pov of
@@ -129,7 +128,6 @@ struct bpf_func_state {
 struct bpf_verifier_state {
 	/* call stack tracking */
 	struct bpf_func_state *frame[MAX_CALL_FRAMES];
-	struct bpf_verifier_state *parent;
 	u32 curframe;
 };
 
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 45f530e4a65e..4b37cd6fa181 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -355,9 +355,9 @@ static int copy_stack_state(struct bpf_func_state *dst,
 /* do_check() starts with zero-sized stack in struct bpf_verifier_state to
  * make it consume minimal amount of memory. check_stack_write() access from
  * the program calls into realloc_func_state() to grow the stack size.
- * Note there is a non-zero 'parent' pointer inside bpf_verifier_state
- * which this function copies over. It points to previous bpf_verifier_state
- * which is never reallocated
+ * Note there is a non-zero parent pointer inside each reg of bpf_verifier_state
+ * which this function copies over. It points to corresponding reg in previous
+ * bpf_verifier_state which is never reallocated
  */
 static int realloc_func_state(struct bpf_func_state *state, int size,
 			      bool copy_old)
@@ -441,7 +441,6 @@ static int copy_verifier_state(struct bpf_verifier_state *dst_state,
 		dst_state->frame[i] = NULL;
 	}
 	dst_state->curframe = src->curframe;
-	dst_state->parent = src->parent;
 	for (i = 0; i <= src->curframe; i++) {
 		dst = dst_state->frame[i];
 		if (!dst) {
@@ -707,6 +706,7 @@ static void init_reg_state(struct bpf_verifier_env *env,
 	for (i = 0; i < MAX_BPF_REG; i++) {
 		mark_reg_not_init(env, regs, i);
 		regs[i].live = REG_LIVE_NONE;
+		regs[i].parent = NULL;
 	}
 
 	/* frame pointer */
@@ -916,74 +916,21 @@ static int check_subprogs(struct bpf_verifier_env *env)
 	return 0;
 }
 
-static
-struct bpf_verifier_state *skip_callee(struct bpf_verifier_env *env,
-				       const struct bpf_verifier_state *state,
-				       struct bpf_verifier_state *parent,
-				       u32 regno)
-{
-	struct bpf_verifier_state *tmp = NULL;
-
-	/* 'parent' could be a state of caller and
-	 * 'state' could be a state of callee. In such case
-	 * parent->curframe < state->curframe
-	 * and it's ok for r1 - r5 registers
-	 *
-	 * 'parent' could be a callee's state after it bpf_exit-ed.
-	 * In such case parent->curframe > state->curframe
-	 * and it's ok for r0 only
-	 */
-	if (parent->curframe == state->curframe ||
-	    (parent->curframe < state->curframe &&
-	     regno >= BPF_REG_1 && regno <= BPF_REG_5) ||
-	    (parent->curframe > state->curframe &&
-	       regno == BPF_REG_0))
-		return parent;
-
-	if (parent->curframe > state->curframe &&
-	    regno >= BPF_REG_6) {
-		/* for callee saved regs we have to skip the whole chain
-		 * of states that belong to callee and mark as LIVE_READ
-		 * the registers before the call
-		 */
-		tmp = parent;
-		while (tmp && tmp->curframe != state->curframe) {
-			tmp = tmp->parent;
-		}
-		if (!tmp)
-			goto bug;
-		parent = tmp;
-	} else {
-		goto bug;
-	}
-	return parent;
-bug:
-	verbose(env, "verifier bug regno %d tmp %p\n", regno, tmp);
-	verbose(env, "regno %d parent frame %d current frame %d\n",
-		regno, parent->curframe, state->curframe);
-	return NULL;
-}
-
+/* Parentage chain of this register (or stack slot) should take care of all
+ * issues like callee-saved registers, stack slot allocation time, etc.
+ */
 static int mark_reg_read(struct bpf_verifier_env *env,
-			 const struct bpf_verifier_state *state,
-			 struct bpf_verifier_state *parent,
-			 u32 regno)
+			 const struct bpf_reg_state *state,
+			 struct bpf_reg_state *parent)
 {
 	bool writes = parent == state->parent; /* Observe write marks */
 
-	if (regno == BPF_REG_FP)
-		/* We don't need to worry about FP liveness because it's read-only */
-		return 0;
-
 	while (parent) {
 		/* if read wasn't screened by an earlier write ... */
-		if (writes && state->frame[state->curframe]->regs[regno].live & REG_LIVE_WRITTEN)
+		if (writes && state->live & REG_LIVE_WRITTEN)
 			break;
-		parent = skip_callee(env, state, parent, regno);
-		if (!parent)
-			return -EFAULT;
 		/* ... then we depend on parent's value */
-		parent->frame[parent->curframe]->regs[regno].live |= REG_LIVE_READ;
+		parent->live |= REG_LIVE_READ;
 		state = parent;
 		parent = state->parent;
 		writes = true;
@@ -1009,7 +956,10 @@ static int check_reg_arg(struct bpf_verifier_env *env, u32 regno,
 			verbose(env, "R%d !read_ok\n", regno);
 			return -EACCES;
 		}
-		return mark_reg_read(env, vstate, vstate->parent, regno);
+		/* We don't need to worry about FP liveness because it's read-only */
+		if (regno != BPF_REG_FP)
+			return mark_reg_read(env, &regs[regno],
+					     regs[regno].parent);
 	} else {
 		/* check whether register used as dest operand can be written to */
 		if (regno == BPF_REG_FP) {
@@ -1121,61 +1071,6 @@ static int check_stack_write(struct bpf_verifier_env *env,
 	return 0;
 }
 
-/* registers of every function are unique and mark_reg_read() propagates
- * the liveness in the following cases:
- * - from callee into caller for R1 - R5 that were used as arguments
- * - from caller into callee for R0 that used as result of the call
- * - from caller to the same caller skipping states of the callee for R6 - R9,
- *   since R6 - R9 are callee saved by implicit function prologue and
- *   caller's R6 != callee's R6, so when we propagate liveness up to
- *   parent states we need to skip callee states for R6 - R9.
- *
- * stack slot marking is different, since stacks of caller and callee are
- * accessible in both (since caller can pass a pointer to caller's stack to
- * callee which can pass it to another function), hence mark_stack_slot_read()
- * has to propagate the stack liveness to all parent states at given frame number.
- * Consider code:
- * f1() {
- *   ptr = fp - 8;
- *   *ptr = ctx;
- *   call f2 {
- *      .. = *ptr;
- *   }
- *   .. = *ptr;
- * }
- * First *ptr is reading from f1's stack and mark_stack_slot_read() has
- * to mark liveness at the f1's frame and not f2's frame.
- * Second *ptr is also reading from f1's stack and mark_stack_slot_read() has
- * to propagate liveness to f2 states at f1's frame level and further into
- * f1 states at f1's frame level until write into that stack slot
- */
-static void mark_stack_slot_read(struct bpf_verifier_env *env,
-				 const struct bpf_verifier_state *state,
-				 struct bpf_verifier_state *parent,
-				 int slot, int frameno)
-{
-	bool writes = parent == state->parent; /* Observe write marks */
-
-	while (parent) {
-		if (parent->frame[frameno]->allocated_stack <= slot * BPF_REG_SIZE)
-			/* since LIVE_WRITTEN mark is only done for full 8-byte
-			 * write the read marks are conservative and parent
-			 * state may not even have the stack allocated. In such case
-			 * end the propagation, since the loop reached beginning
-			 * of the function
-			 */
-			break;
-		/* if read wasn't screened by an earlier write ... */
-		if (writes && state->frame[frameno]->stack[slot].spilled_ptr.live & REG_LIVE_WRITTEN)
-			break;
-		/* ... then we depend on parent's value */
-		parent->frame[frameno]->stack[slot].spilled_ptr.live |= REG_LIVE_READ;
-		state = parent;
-		parent = state->parent;
-		writes = true;
-	}
-}
-
 static int check_stack_read(struct bpf_verifier_env *env,
 			    struct bpf_func_state *reg_state /* func where register points to */,
 			    int off, int size, int value_regno)
@@ -1213,8 +1108,8 @@ static int check_stack_read(struct bpf_verifier_env *env,
 			 */
 			state->regs[value_regno].live |= REG_LIVE_WRITTEN;
 		}
-		mark_stack_slot_read(env, vstate, vstate->parent, spi,
-				     reg_state->frameno);
+		mark_reg_read(env, &reg_state->stack[spi].spilled_ptr,
+			      reg_state->stack[spi].spilled_ptr.parent);
 		return 0;
 	} else {
 		int zeros = 0;
@@ -1230,8 +1125,8 @@ static int check_stack_read(struct bpf_verifier_env *env,
 				off, i, size);
 			return -EACCES;
 		}
-		mark_stack_slot_read(env, vstate, vstate->parent, spi,
-				     reg_state->frameno);
+		mark_reg_read(env, &reg_state->stack[spi].spilled_ptr,
+			      reg_state->stack[spi].spilled_ptr.parent);
 		if (value_regno >= 0) {
 			if (zeros == size) {
 				/* any size read into register is zero extended,
@@ -1930,8 +1825,8 @@ static int check_stack_boundary(struct bpf_verifier_env *env, int regno,
 		/* reading any byte out of 8-byte 'spill_slot' will cause
 		 * the whole slot to be marked as 'read'
 		 */
-		mark_stack_slot_read(env, env->cur_state, env->cur_state->parent,
-				     spi, state->frameno);
+		mark_reg_read(env, &state->stack[spi].spilled_ptr,
+			      state->stack[spi].spilled_ptr.parent);
 	}
 	return update_stack_depth(env, state, off);
 }
@@ -2362,11 +2257,13 @@ static int check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
 			state->curframe + 1 /* frameno within this callchain */,
 			subprog /* subprog number within this prog */);
 
-	/* copy r1 - r5 args that callee can access */
+	/* copy r1 - r5 args that callee can access.  The copy includes parent
+	 * pointers, which connects us up to the liveness chain
+	 */
 	for (i = BPF_REG_1; i <= BPF_REG_5; i++)
 		callee->regs[i] = caller->regs[i];
 
-	/* after the call regsiters r0 - r5 were scratched */
+	/* after the call registers r0 - r5 were scratched */
 	for (i = 0; i < CALLER_SAVED_REGS; i++) {
 		mark_reg_not_init(env, caller->regs, caller_saved[i]);
 		check_reg_arg(env, caller_saved[i], DST_OP_NO_MARK);
@@ -4272,7 +4169,7 @@ static bool regsafe(struct bpf_reg_state *rold, struct bpf_reg_state *rcur,
 		/* explored state didn't use this */
 		return true;
 
-	equal = memcmp(rold, rcur, offsetof(struct bpf_reg_state, frameno)) == 0;
+	equal = memcmp(rold, rcur, offsetof(struct bpf_reg_state, parent)) == 0;
 
 	if (rold->type == PTR_TO_STACK)
 		/* two stack pointers are equal only if they're pointing to
@@ -4505,7 +4402,7 @@ static bool states_equal(struct bpf_verifier_env *env,
  * equivalent state (jump target or such) we didn't arrive by the straight-line
  * code, so read marks in the state must propagate to the parent regardless
  * of the state's write marks. That's what 'parent == state->parent' comparison
- * in mark_reg_read() and mark_stack_slot_read() is for.
+ * in mark_reg_read() is for.
  */
 static int propagate_liveness(struct bpf_verifier_env *env,
 			      const struct bpf_verifier_state *vstate,
@@ -4526,7 +4423,8 @@ static int propagate_liveness(struct bpf_verifier_env *env,
 		if (vparent->frame[vparent->curframe]->regs[i].live & REG_LIVE_READ)
 			continue;
 		if (vstate->frame[vstate->curframe]->regs[i].live & REG_LIVE_READ) {
-			err = mark_reg_read(env, vstate, vparent, i);
+			err = mark_reg_read(env, &vstate->frame[vstate->curframe]->regs[i],
+					    &vparent->frame[vstate->curframe]->regs[i]);
 			if (err)
 				return err;
 		}
@@ -4541,7 +4439,8 @@ static int propagate_liveness(struct bpf_verifier_env *env,
 			if (parent->stack[i].spilled_ptr.live & REG_LIVE_READ)
 				continue;
 			if (state->stack[i].spilled_ptr.live & REG_LIVE_READ)
-				mark_stack_slot_read(env, vstate, vparent, i, frame);
+				mark_reg_read(env, &state->stack[i].spilled_ptr,
+					      &parent->stack[i].spilled_ptr);
 		}
 	}
 	return err;
@@ -4551,7 +4450,7 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx)
 {
 	struct bpf_verifier_state_list *new_sl;
 	struct bpf_verifier_state_list *sl;
-	struct bpf_verifier_state *cur = env->cur_state;
+	struct bpf_verifier_state *cur = env->cur_state, *new;
 	int i, j, err;
 
 	sl = env->explored_states[insn_idx];
@@ -4593,16 +4492,18 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx)
 		return -ENOMEM;
 
 	/* add new state to the head of linked list */
-	err = copy_verifier_state(&new_sl->state, cur);
+	new = &new_sl->state;
+	err = copy_verifier_state(new, cur);
 	if (err) {
-		free_verifier_state(&new_sl->state, false);
+		free_verifier_state(new, false);
 		kfree(new_sl);
 		return err;
 	}
 	new_sl->next = env->explored_states[insn_idx];
 	env->explored_states[insn_idx] = new_sl;
 	/* connect new state to parentage chain */
-	cur->parent = &new_sl->state;
+	for (i = 0; i < BPF_REG_FP; i++)
+		cur_regs(env)[i].parent = &new->frame[new->curframe]->regs[i];
 	/* clear write marks in current state: the writes we did are not writes
 	 * our child did, so they don't screen off its reads from us.
 	 * (There are no read marks in current state, because reads always mark
@@ -4615,9 +4516,13 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx)
 	/* all stack frames are accessible from callee, clear them all */
 	for (j = 0; j <= cur->curframe; j++) {
 		struct bpf_func_state *frame = cur->frame[j];
+		struct bpf_func_state *newframe = new->frame[j];
 
-		for (i = 0; i < frame->allocated_stack / BPF_REG_SIZE; i++)
+		for (i = 0; i < frame->allocated_stack / BPF_REG_SIZE; i++) {
 			frame->stack[i].spilled_ptr.live = REG_LIVE_NONE;
+			frame->stack[i].spilled_ptr.parent =
+						&newframe->stack[i].spilled_ptr;
+		}
 	}
 	return 0;
 }
@@ -4636,7 +4541,6 @@ static int do_check(struct bpf_verifier_env *env)
 	if (!state)
 		return -ENOMEM;
 	state->curframe = 0;
-	state->parent = NULL;
 	state->frame[0] = kzalloc(sizeof(struct bpf_func_state), GFP_KERNEL);
 	if (!state->frame[0]) {
 		kfree(state);

^ permalink raw reply related

* [RFC PATCH v3 bpf-next 4/5] bpf/verifier: use call graph to efficiently check max stack depth
From: Edward Cree @ 2018-04-06 17:14 UTC (permalink / raw)
  To: Alexei Starovoitov, Daniel Borkmann; +Cc: netdev
In-Reply-To: <b445b3c5-04f8-7425-2a3c-9bc54a04add2@solarflare.com>

Instead of recursively walking every possible call chain,
 check_max_stack_depth() now uses an explicit call graph (recorded in
 subprog_info.callees) which it topologically sorts, allowing it to find
 for each subprog the maximum stack depth of all call chains, and then
 use that depth in calculating the stack depth it implies for the
 subprog's callees.  This is O(number of subprogs).
The call graph is populated as part of the check_subprogs() pass.

Signed-off-by: Edward Cree <ecree@solarflare.com>
---
 include/linux/bpf_verifier.h |   3 +
 kernel/bpf/verifier.c        | 168 +++++++++++++++++++++++++------------------
 2 files changed, 101 insertions(+), 70 deletions(-)

diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h
index 17990dd56e65..32f27cbe721b 100644
--- a/include/linux/bpf_verifier.h
+++ b/include/linux/bpf_verifier.h
@@ -175,8 +175,11 @@ static inline bool bpf_verifier_log_needed(const struct bpf_verifier_log *log)
 #define BPF_MAX_SUBPROGS 256
 
 struct bpf_subprog_info {
+	/* which other subprogs does this one directly call? */
+	DECLARE_BITMAP(callees, BPF_MAX_SUBPROGS);
 	u32 start; /* insn idx of function entry point */
 	u16 stack_depth; /* max. stack depth used by this function */
+	u16 total_stack_depth; /* max. stack depth used by entire call chain */
 };
 
 /* single container for all structs
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 587eb445bfa2..45f530e4a65e 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -784,6 +784,7 @@ static int add_subprog(struct bpf_verifier_env *env, int off)
 		info = &env->subprog_info[ret];
 		info->start = off;
 		info->stack_depth = 0;
+		memset(info->callees, 0, sizeof(info->callees));
 		env->insn_aux_data[off].subprogno = ret;
 	}
 	return ret;
@@ -793,13 +794,13 @@ static int check_subprogs(struct bpf_verifier_env *env)
 {
 	struct bpf_insn_aux_data *aux = env->insn_aux_data;
 	struct bpf_insn *insns = env->prog->insnsi;
-	int insn_cnt = env->prog->len, i, err;
+	int insn_cnt = env->prog->len, i, subprog;
 	int cur_subprog;
 
 	/* Find subprog starts */
-	err = add_subprog(env, 0); /* subprog 0, main(), starts at insn 0 */
-	if (err < 0)
-		return err;
+	subprog = add_subprog(env, 0); /* subprog 0, main(), starts at insn 0 */
+	if (subprog < 0)
+		return subprog;
 	for (i = 0; i < insn_cnt; i++)
 		if (insns[i].code == (BPF_JMP | BPF_CALL) &&
 		    insns[i].src_reg == BPF_PSEUDO_CALL) {
@@ -814,9 +815,9 @@ static int check_subprogs(struct bpf_verifier_env *env)
 			/* add_subprog marks the callee entry point with the
 			 * new subprogno.
 			 */
-			err = add_subprog(env, i + insns[i].imm + 1);
-			if (err < 0)
-				return err;
+			subprog = add_subprog(env, i + insns[i].imm + 1);
+			if (subprog < 0)
+				return subprog;
 		}
 
 	if (env->log.level > 1)
@@ -842,6 +843,7 @@ static int check_subprogs(struct bpf_verifier_env *env)
 		u8 opcode = BPF_OP(insns[i].code);
 		int target;
 
+		cur_subprog = aux[i].subprogno;
 		/* Determine where control flow from this insn goes */
 		if (BPF_CLASS(insns[i].code) != BPF_JMP) {
 			fallthrough = true;
@@ -855,9 +857,18 @@ static int check_subprogs(struct bpf_verifier_env *env)
 				 */
 				continue;
 			case BPF_CALL:
-				/* If pseudo-call, already handled marking the
-				 * callee.  Both kinds of call will eventually
-				 * return and pass to the following insn.
+				if (insns[i].src_reg == BPF_PSEUDO_CALL) {
+					/* record edge in call graph */
+					subprog = find_subprog(env, i + insns[i].imm + 1);
+					if (subprog < 0)
+						return subprog;
+					__set_bit(subprog, env->subprog_info[cur_subprog].callees);
+					/* already handled marking the callee
+					 * back in add_subprog, so jump is false
+					 */
+				}
+				/* Call will eventually return and pass control
+				 * to the following insn.
 				 */
 				fallthrough = true;
 				break;
@@ -876,7 +887,6 @@ static int check_subprogs(struct bpf_verifier_env *env)
 		}
 
 		/* Check that that control flow doesn't leave the subprog */
-		cur_subprog = aux[i].subprogno;
 		if (fallthrough) {
 			if (i + 1 >= insn_cnt) {
 				verbose(env, "no exit/jump at end of program (insn %d)\n",
@@ -1533,78 +1543,96 @@ static int update_stack_depth(struct bpf_verifier_env *env,
 			      const struct bpf_func_state *func,
 			      int off)
 {
-	u16 stack = env->subprog_info[func->subprogno].stack_depth;
+	struct bpf_subprog_info *info;
 
-	if (stack >= -off)
-		return 0;
+	if (!func)
+		return -EFAULT;
+	if (func->subprogno >= BPF_MAX_SUBPROGS)
+		return -E2BIG;
+	info = &env->subprog_info[func->subprogno];
 
 	/* update known max for given subprogram */
-	env->subprog_info[func->subprogno].stack_depth = -off;
+	info->stack_depth = max_t(u16, info->stack_depth, -off);
 	return 0;
 }
 
-/* starting from main bpf function walk all instructions of the function
- * and recursively walk all callees that given function can call.
- * Ignore jump and exit insns.
- * Since recursion is prevented by check_cfg() this algorithm
- * only needs a local stack of MAX_CALL_FRAMES to remember callsites
+/* Topologically sort the call graph, and thereby determine the maximum stack
+ * depth of each subprog's worst-case call chain.  Store in total_stack_depth.
+ * The tsort is performed using Kahn's algorithm.  Since that algorithm selects
+ * nodes in the order of the sorted output, we can do our processing in the loop
+ * that does the tsort, rather than storing the sorted list and then having a
+ * second loop to iterate over it and compute the total_stack_depth values.
  */
 static int check_max_stack_depth(struct bpf_verifier_env *env)
 {
-	int depth = 0, frame = 0, subprog = 0, i = 0;
-	struct bpf_insn_aux_data *aux = env->insn_aux_data;
-	struct bpf_insn *insn = env->prog->insnsi;
-	int insn_cnt = env->prog->len;
-	int ret_insn[MAX_CALL_FRAMES];
-	int ret_prog[MAX_CALL_FRAMES];
+	DECLARE_BITMAP(frontier, BPF_MAX_SUBPROGS) = {0};
+	DECLARE_BITMAP(visited, BPF_MAX_SUBPROGS) = {0};
+	int subprog, i, j;
+
+	/* subprog 0 has no incoming edges, and should be the only such */
+	__set_bit(0, frontier);
+	env->subprog_info[0].total_stack_depth = env->subprog_info[0].stack_depth;
+
+	while (true) {
+		/* select a frontier node */
+		subprog = find_first_bit(frontier, BPF_MAX_SUBPROGS);
+		if (subprog >= BPF_MAX_SUBPROGS) /* frontier is empty, done */
+			break;
+		/* remove it from the frontier */
+		__clear_bit(subprog, frontier);
+		/* validate its total stack depth.  Since all callers of this
+		 * function have already been processed, the value now in
+		 * info->total_stack_depth reflects the deepest call chain of
+		 * this function.
+		 */
+		if (env->subprog_info[subprog].total_stack_depth > MAX_BPF_STACK) {
+			verbose(env, "combined stack size of calls to %d (at insn %d) is %d.  Too large\n",
+				subprog, env->subprog_info[subprog].start,
+				env->subprog_info[subprog].total_stack_depth);
+			return -EACCES;
+		}
+		if (env->log.level > 1) {
+			verbose(env, "combined stack size of calls to %d (at insn %d) is %d\n",
+				subprog, env->subprog_info[subprog].start,
+				env->subprog_info[subprog].total_stack_depth);
+		}
+		__set_bit(subprog, visited);
+		/* for each callee */
+		for_each_set_bit(i, env->subprog_info[subprog].callees,
+				 BPF_MAX_SUBPROGS) {
+			/* round up to 32-bytes, since this is granularity of
+			 * interpreter stack size
+			 */
+			u16 stack_depth = round_up(max_t(u16, env->subprog_info[i].stack_depth, 1), 32);
 
-process_func:
-	/* round up to 32-bytes, since this is granularity
-	 * of interpreter stack size
-	 */
-	depth += round_up(max_t(u32, env->subprog_info[subprog].stack_depth, 1),
-			  32);
-	if (depth > MAX_BPF_STACK) {
-		verbose(env, "combined stack size of %d calls is %d. Too large\n",
-			frame + 1, depth);
-		return -EACCES;
+			/* Update callee total stack depth.  Since our own
+			 * max total_stack_depth is known, the callee tsd is
+			 * at least that plus the callee's stack frame size.
+			 */
+			env->subprog_info[i].total_stack_depth = max_t(u16,
+				env->subprog_info[i].total_stack_depth,
+				env->subprog_info[subprog].total_stack_depth +
+				stack_depth);
+			/* does it have unvisited callers? */
+			for_each_clear_bit(j, visited, env->subprog_cnt) {
+				if (test_bit(i, env->subprog_info[j].callees))
+					break;
+			}
+			/* if not, add it to the frontier */
+			if (j >= env->subprog_cnt)
+				__set_bit(i, frontier);
+		}
 	}
-continue_func:
-	for (; i < insn_cnt && aux[i].subprogno == subprog; i++) {
-		if (insn[i].code != (BPF_JMP | BPF_CALL))
-			continue;
-		if (insn[i].src_reg != BPF_PSEUDO_CALL)
-			continue;
-		/* remember insn and function to return to */
-		ret_insn[frame] = i + 1;
-		ret_prog[frame] = subprog;
 
-		/* find the callee */
-		i = i + insn[i].imm + 1;
-		subprog = find_subprog(env, i);
-		if (subprog < 0) {
-			WARN_ONCE(1, "verifier bug. No program starts at insn %d\n",
-				  i);
-			return -EFAULT;
-		}
-		frame++;
-		if (frame >= MAX_CALL_FRAMES) {
-			WARN_ONCE(1, "verifier bug. Call stack is too deep\n");
-			return -EFAULT;
-		}
-		goto process_func;
+	/* are any nodes left unvisited? */
+	subprog = find_first_zero_bit(visited, env->subprog_cnt);
+	if (subprog < env->subprog_cnt) {
+		/* then call graph is not acyclic, which shouldn't happen */
+		verbose(env, "verifier bug.  Call graph has a cycle including subprog %d (at insn %d)\n",
+			subprog, env->subprog_info[subprog].start);
+		return -EFAULT;
 	}
-	/* end of for() loop means the last insn of the 'subprog'
-	 * was reached. Doesn't matter whether it was JA or EXIT
-	 */
-	if (frame == 0)
-		return 0;
-	depth -= round_up(max_t(u32, env->subprog_info[subprog].stack_depth, 1),
-			  32);
-	frame--;
-	i = ret_insn[frame];
-	subprog = ret_prog[frame];
-	goto continue_func;
+	return 0;
 }
 
 #ifndef CONFIG_BPF_JIT_ALWAYS_ON

^ permalink raw reply related

* Re: [PATCH v2] dp83640: Ensure against premature access to PHY registers after reset
From: Andrew Lunn @ 2018-04-06 17:23 UTC (permalink / raw)
  To: Esben Haabendal
  Cc: netdev, richardcochran, f.fainelli, linux-kernel, Esben Haabendal
In-Reply-To: <20180406170844.4248-1-esben.haabendal@gmail.com>

On Fri, Apr 06, 2018 at 07:08:44PM +0200, Esben Haabendal wrote:
> From: Esben Haabendal <eha@deif.com>
 
Hi Esben

It it good to have some form of commit message. Something like:

The datasheet specifies a 3uS pause after performing a software
reset. The default implementation of genphy_soft_reset() does not
provide this, so implement soft_reset with the needed pause.


With that, or something similar added:

Reviewed-by: Andrew Lunn <andrew@lunn.ch>

    Andrew

^ permalink raw reply

* Fw: [Bug 199307] New: kernel panic,call trace,sched_clock,tcp_write_timer_handler
From: Stephen Hemminger @ 2018-04-06 17:23 UTC (permalink / raw)
  To: netdev

Forwarding to netdev mailing list, it might be a real problem

Ugh. Glaring backtrace screenshots. 

Begin forwarded message:

Date: Fri, 06 Apr 2018 16:43:46 +0000
From: bugzilla-daemon@bugzilla.kernel.org
To: stephen@networkplumber.org
Subject: [Bug 199307] New: kernel panic,call trace,sched_clock,tcp_write_timer_handler


https://bugzilla.kernel.org/show_bug.cgi?id=199307

            Bug ID: 199307
           Summary: kernel panic,call
                    trace,sched_clock,tcp_write_timer_handler
           Product: Networking
           Version: 2.5
    Kernel Version: 4.16
          Hardware: All
                OS: Linux
              Tree: Mainline
            Status: NEW
          Severity: normal
          Priority: P1
         Component: Other
          Assignee: stephen@networkplumber.org
          Reporter: youling257@gmail.com
        Regression: No

Created attachment 275131
  --> https://bugzilla.kernel.org/attachment.cgi?id=275131&action=edit  
picture

see the picture

-- 
You are receiving this mail because:
You are the assignee for the bug.

^ permalink raw reply

* Re: [PATCH v2] net: phy: marvell: Enable interrupt function on LED2 pin
From: Andrew Lunn @ 2018-04-06 17:29 UTC (permalink / raw)
  To: Esben Haabendal
  Cc: netdev, Esben Haabendal, Rasmus Villemoes, Florian Fainelli,
	open list
In-Reply-To: <20180405204029.665-1-esben.haabendal@gmail.com>

On Thu, Apr 05, 2018 at 10:40:29PM +0200, Esben Haabendal wrote:
> From: Esben Haabendal <eha@deif.com>
> 
> The LED2[2]/INTn pin on Marvell 88E1318S as well as 88E1510/12/14/18 needs
> to be configured to be usable as interrupt not only when WOL is enabled,
> but whenever we rely on interrupts from the PHY.
> 
> Signed-off-by: Esben Haabendal <eha@deif.com>
> Cc: Rasmus Villemoes <rasmus.villemoes@prevas.dk>

Reviewed-by: Andrew Lunn <andrew@lunn.ch>

    Andrew

^ permalink raw reply

* Re: [PATCH v2] net: phy: marvell: Enable interrupt function on LED2 pin
From: David Miller @ 2018-04-06 17:37 UTC (permalink / raw)
  To: andrew
  Cc: esben.haabendal, netdev, eha, rasmus.villemoes, f.fainelli,
	linux-kernel
In-Reply-To: <20180406172928.GS17495@lunn.ch>

From: Andrew Lunn <andrew@lunn.ch>
Date: Fri, 6 Apr 2018 19:29:28 +0200

> On Thu, Apr 05, 2018 at 10:40:29PM +0200, Esben Haabendal wrote:
>> From: Esben Haabendal <eha@deif.com>
>> 
>> The LED2[2]/INTn pin on Marvell 88E1318S as well as 88E1510/12/14/18 needs
>> to be configured to be usable as interrupt not only when WOL is enabled,
>> but whenever we rely on interrupts from the PHY.
>> 
>> Signed-off-by: Esben Haabendal <eha@deif.com>
>> Cc: Rasmus Villemoes <rasmus.villemoes@prevas.dk>
> 
> Reviewed-by: Andrew Lunn <andrew@lunn.ch>

Applied, thanks everyone.

^ permalink raw reply

* Re: [PATCH iproute2-next v1] rdma: Print net device name and index for RDMA device
From: David Ahern @ 2018-04-06 18:54 UTC (permalink / raw)
  To: Leon Romanovsky
  Cc: Leon Romanovsky, netdev, RDMA mailing list, Stephen Hemminger,
	Steve Wise
In-Reply-To: <20180403042914.23360-1-leon@kernel.org>

On 4/2/18 10:29 PM, Leon Romanovsky wrote:
> From: Leon Romanovsky <leonro@mellanox.com>
> 
> The RDMA devices are operated in RoCE and iWARP modes have net device
> underneath. Present their names in regular output and their net index
> in detailed mode.
> 
> [root@nps ~]# rdma link show mlx5_3/1
> 4/1: mlx5_3/1: state ACTIVE physical_state LINK_UP netdev ens7
> [root@nps ~]# rdma link show mlx5_3/1 -d
> 4/1: mlx5_3/1: state ACTIVE physical_state LINK_UP netdev ens7 netdev_index 7
>     caps: <CM, IP_BASED_GIDS>
> 
> Signed-off-by: Leon Romanovsky <leonro@mellanox.com>
> Reviewed-by: Steve Wise <swise@opengridcomputing.com>
> ---
>  Changes v0->v1:
>   * Resend after commit 29122c1aae35 ("rdma: update rdma_netlink.h")
>     which updated relevant netlink attributes.
>   * Added Steve's ROB
> ---
>  rdma/include/uapi/rdma/rdma_netlink.h |  4 ++++
>  rdma/link.c                           | 21 +++++++++++++++++++++
>  rdma/utils.c                          |  2 ++
>  3 files changed, 27 insertions(+)
> 

applied to iproute2-next

^ permalink raw reply

* kernel BUG at drivers/vhost/vhost.c:LINE! (2)
From: syzbot @ 2018-04-06 19:02 UTC (permalink / raw)
  To: jasowang, kvm, linux-kernel, mst, netdev, syzkaller-bugs,
	virtualization

Hello,

syzbot hit the following crash on upstream commit
38c23685b273cfb4ccf31a199feccce3bdcb5d83 (Fri Apr 6 04:29:35 2018 +0000)
Merge tag 'armsoc-drivers' of  
git://git.kernel.org/pub/scm/linux/kernel/git/arm/arm-soc
syzbot dashboard link:  
https://syzkaller.appspot.com/bug?extid=65a84dde0214b0387ccd

So far this crash happened 4 times on upstream.
C reproducer: https://syzkaller.appspot.com/x/repro.c?id=6586748079439872
syzkaller reproducer:  
https://syzkaller.appspot.com/x/repro.syz?id=5974272052822016
Raw console output:  
https://syzkaller.appspot.com/x/log.txt?id=6224632407392256
Kernel config:  
https://syzkaller.appspot.com/x/.config?id=-5813481738265533882
compiler: gcc (GCC) 8.0.1 20180301 (experimental)

IMPORTANT: if you fix the bug, please add the following tag to the commit:
Reported-by: syzbot+65a84dde0214b0387ccd@syzkaller.appspotmail.com
It will help syzbot understand when the bug is fixed. See footer for  
details.
If you forward the report, please keep this part and the footer.

------------[ cut here ]------------
kernel BUG at drivers/vhost/vhost.c:1652!
invalid opcode: 0000 [#1] SMP KASAN
------------[ cut here ]------------
Dumping ftrace buffer:
kernel BUG at drivers/vhost/vhost.c:1652!
    (ftrace buffer empty)
Modules linked in:
CPU: 1 PID: 4461 Comm: syzkaller684218 Not tainted 4.16.0+ #3
Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS  
Google 01/01/2011
RIP: 0010:set_bit_to_user drivers/vhost/vhost.c:1652 [inline]
RIP: 0010:log_write+0x42a/0x4d0 drivers/vhost/vhost.c:1676
RSP: 0018:ffff8801b256f920 EFLAGS: 00010293
RAX: ffff8801adc9e2c0 RBX: dffffc0000000000 RCX: ffffffff85924a0f
RDX: 0000000000000000 RSI: ffffffff85924cea RDI: 0000000000000005
RBP: ffff8801b256fa58 R08: ffff8801adc9e2c0 R09: ffffed003962412d
R10: ffff8801b256fad8 R11: ffff8801cb12096f R12: 0001ffffffffffff
R13: ffffed00364adf36 R14: 0000000000000000 R15: ffff8801b256fa30
FS:  00007fdf24b19700(0000) GS:ffff8801db100000(0000) knlGS:0000000000000000
CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: 0000000020bf6000 CR3: 00000001ae6a7000 CR4: 00000000001406e0
DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
Call Trace:
  vhost_update_used_flags+0x3af/0x4a0 drivers/vhost/vhost.c:1723
  vhost_vq_init_access+0x117/0x590 drivers/vhost/vhost.c:1763
  vhost_vsock_start drivers/vhost/vsock.c:446 [inline]
  vhost_vsock_dev_ioctl+0x751/0x920 drivers/vhost/vsock.c:678
  vfs_ioctl fs/ioctl.c:46 [inline]
  file_ioctl fs/ioctl.c:500 [inline]
  do_vfs_ioctl+0x1cf/0x1650 fs/ioctl.c:684
  ksys_ioctl+0xa9/0xd0 fs/ioctl.c:701
  SYSC_ioctl fs/ioctl.c:708 [inline]
  SyS_ioctl+0x24/0x30 fs/ioctl.c:706
  do_syscall_64+0x29e/0x9d0 arch/x86/entry/common.c:287
  entry_SYSCALL_64_after_hwframe+0x42/0xb7
RIP: 0033:0x4456c9
RSP: 002b:00007fdf24b18da8 EFLAGS: 00000297 ORIG_RAX: 0000000000000010
RAX: ffffffffffffffda RBX: 00000000006dac24 RCX: 00000000004456c9
RDX: 0000000020f82ffc RSI: 000000004004af61 RDI: 000000000000001b
RBP: 00000000006dac20 R08: 0000000000000000 R09: 0000000000000000
R10: 0000000000000000 R11: 0000000000000297 R12: 6b636f73762d7473
R13: 6f68762f7665642f R14: fffffffffffffffc R15: 0000000000000007
Code: e8 7c 5e e4 fb 4c 89 ef e8 e4 16 06 fc 48 8d 85 58 ff ff ff 48 c1 e8  
03 c6 04 18 f8 e9 46 ff ff ff 45 31 f6 eb 91 e8 56 5e e4 fb <0f> 0b e8 4f  
5e e4 fb 48 c7 c6 a0 a3 24 88 4c 89 ef e8 60 b6 10
RIP: set_bit_to_user drivers/vhost/vhost.c:1652 [inline] RSP:  
ffff8801b256f920
RIP: log_write+0x42a/0x4d0 drivers/vhost/vhost.c:1676 RSP: ffff8801b256f920
invalid opcode: 0000 [#2] SMP KASAN
---[ end trace 0d0ff45aa44d8a23 ]---
Dumping ftrace buffer:
    (ftrace buffer empty)
Modules linked in:


---
This bug is generated by a dumb bot. It may contain errors.
See https://goo.gl/tpsmEJ for details.
Direct all questions to syzkaller@googlegroups.com.

syzbot will keep track of this bug report.
If you forgot to add the Reported-by tag, once the fix for this bug is  
merged
into any tree, please reply to this email with:
#syz fix: exact-commit-title
If you want to test a patch for this bug, please reply with:
#syz test: git://repo/address.git branch
and provide the patch inline or as an attachment.
To mark this as a duplicate of another syzbot report, please reply with:
#syz dup: exact-subject-of-another-report
If it's a one-off invalid bug report, please reply with:
#syz invalid
Note: if the crash happens again, it will cause creation of a new bug  
report.
Note: all commands must start from beginning of the line in the email body.

^ permalink raw reply

* [PATCH v4] net: thunderx: rework mac addresses list to u64 array
From: Vadim Lomovtsev @ 2018-04-06 19:53 UTC (permalink / raw)
  To: sgoutham, sunil.kovvuri, rric, linux-arm-kernel, netdev,
	linux-kernel, davem
  Cc: dnelson, robin.murphy, hch, gustavo, Vadim Lomovtsev
In-Reply-To: <20180406140443.15181-1-Vadim.Lomovtsev@caviumnetworks.com>

From: Vadim Lomovtsev <Vadim.Lomovtsev@cavium.com>

It is too expensive to pass u64 values via linked list, instead
allocate array for them by overall number of mac addresses from netdev.

This eventually removes multiple kmalloc() calls, aviod memory
fragmentation and allow to put single null check on kmalloc
return value in order to prevent a potential null pointer dereference.

Addresses-Coverity-ID: 1467429 ("Dereference null return value")
Fixes: 37c3347eb247 ("net: thunderx: add ndo_set_rx_mode callback implementation for VF")
Reported-by: Dan Carpenter <dan.carpenter@oracle.com>
Signed-off-by: Vadim Lomovtsev <Vadim.Lomovtsev@cavium.com>
---
Changes from v1 to v2:
 - C99 syntax: update xcast_addr_list struct field mc[0] -> mc[];
Changes from v2 to v3:
 - update commit description with 'Reported-by: Dan Carpenter';
 - update size calculations for mc list to offsetof() call
   instead of explicit arithmetic;
Changes from v3 to v4:
 - change loop control variable type from u8 to int, accordingly
   to mc_count size;
---
 drivers/net/ethernet/cavium/thunder/nic.h        |  7 +-----
 drivers/net/ethernet/cavium/thunder/nicvf_main.c | 28 +++++++++---------------
 2 files changed, 11 insertions(+), 24 deletions(-)

diff --git a/drivers/net/ethernet/cavium/thunder/nic.h b/drivers/net/ethernet/cavium/thunder/nic.h
index 5fc46c5..448d1fa 100644
--- a/drivers/net/ethernet/cavium/thunder/nic.h
+++ b/drivers/net/ethernet/cavium/thunder/nic.h
@@ -265,14 +265,9 @@ struct nicvf_drv_stats {
 
 struct cavium_ptp;
 
-struct xcast_addr {
-	struct list_head list;
-	u64              addr;
-};
-
 struct xcast_addr_list {
-	struct list_head list;
 	int              count;
+	u64              mc[];
 };
 
 struct nicvf_work {
diff --git a/drivers/net/ethernet/cavium/thunder/nicvf_main.c b/drivers/net/ethernet/cavium/thunder/nicvf_main.c
index 1e9a31f..6bd5658 100644
--- a/drivers/net/ethernet/cavium/thunder/nicvf_main.c
+++ b/drivers/net/ethernet/cavium/thunder/nicvf_main.c
@@ -1929,7 +1929,7 @@ static void nicvf_set_rx_mode_task(struct work_struct *work_arg)
 						  work.work);
 	struct nicvf *nic = container_of(vf_work, struct nicvf, rx_mode_work);
 	union nic_mbx mbx = {};
-	struct xcast_addr *xaddr, *next;
+	int idx = 0;
 
 	if (!vf_work)
 		return;
@@ -1956,16 +1956,10 @@ static void nicvf_set_rx_mode_task(struct work_struct *work_arg)
 	/* check if we have any specific MACs to be added to PF DMAC filter */
 	if (vf_work->mc) {
 		/* now go through kernel list of MACs and add them one by one */
-		list_for_each_entry_safe(xaddr, next,
-					 &vf_work->mc->list, list) {
+		for (idx = 0; idx < vf_work->mc->count; idx++) {
 			mbx.xcast.msg = NIC_MBOX_MSG_ADD_MCAST;
-			mbx.xcast.data.mac = xaddr->addr;
+			mbx.xcast.data.mac = vf_work->mc->mc[idx];
 			nicvf_send_msg_to_pf(nic, &mbx);
-
-			/* after receiving ACK from PF release memory */
-			list_del(&xaddr->list);
-			kfree(xaddr);
-			vf_work->mc->count--;
 		}
 		kfree(vf_work->mc);
 	}
@@ -1996,17 +1990,15 @@ static void nicvf_set_rx_mode(struct net_device *netdev)
 			mode |= BGX_XCAST_MCAST_FILTER;
 			/* here we need to copy mc addrs */
 			if (netdev_mc_count(netdev)) {
-				struct xcast_addr *xaddr;
-
-				mc_list = kmalloc(sizeof(*mc_list), GFP_ATOMIC);
-				INIT_LIST_HEAD(&mc_list->list);
+				mc_list = kmalloc(offsetof(typeof(*mc_list),
+							   mc[netdev_mc_count(netdev)]),
+						  GFP_ATOMIC);
+				if (unlikely(!mc_list))
+					return;
+				mc_list->count = 0;
 				netdev_hw_addr_list_for_each(ha, &netdev->mc) {
-					xaddr = kmalloc(sizeof(*xaddr),
-							GFP_ATOMIC);
-					xaddr->addr =
+					mc_list->mc[mc_list->count] =
 						ether_addr_to_u64(ha->addr);
-					list_add_tail(&xaddr->list,
-						      &mc_list->list);
 					mc_list->count++;
 				}
 			}
-- 
1.8.3.1

^ permalink raw reply related

* Re: [PATCH net 1/2] net: bcmgenet: Fix sparse warnings in bcmgenet_put_tx_csum()
From: Sasha Levin @ 2018-04-06 20:05 UTC (permalink / raw)
  To: Sasha Levin, Florian Fainelli, netdev@vger.kernel.org
  Cc: Florian Fainelli, stable@vger.kernel.org
In-Reply-To: <20180402225856.4351-2-f.fainelli@gmail.com>

Hi,

[This is an automated email]

This commit has been processed because it contains a "Fixes:" tag,
fixing commit: 1c1008c793fa net: bcmgenet: add main driver file.

The bot has also determined it's probably a bug fixing patch. (score: 49.2621)

The bot has tested the following trees: v4.16, v4.15.15, v4.14.32, v4.9.92, v4.4.126.

v4.16: Build OK!
v4.15.15: Build OK!
v4.14.32: Build OK!
v4.9.92: Build OK!
v4.4.126: Build OK!

^ permalink raw reply

* Re: [PATCH net 2/2] net: systemport: Fix sparse warnings in bcm_sysport_insert_tsb()
From: Sasha Levin @ 2018-04-06 20:05 UTC (permalink / raw)
  To: Sasha Levin, Florian Fainelli, netdev@vger.kernel.org
  Cc: Florian Fainelli, stable@vger.kernel.org
In-Reply-To: <20180402225856.4351-3-f.fainelli@gmail.com>

Hi,

[This is an automated email]

This commit has been processed because it contains a "Fixes:" tag,
fixing commit: 80105befdb4b net: systemport: add Broadcom SYSTEMPORT Ethernet MAC driver.

The bot has also determined it's probably a bug fixing patch. (score: 50.4075)

The bot has tested the following trees: v4.16, v4.15.15, v4.14.32, v4.9.92, v4.4.126.

v4.16: Build OK!
v4.15.15: Build OK!
v4.14.32: Build OK!
v4.9.92: Build OK!
v4.4.126: Build OK!

^ permalink raw reply

* Re: [PATCH net] net: dsa: b53: Fix sparse warnings in b53_mmap.c
From: Sasha Levin @ 2018-04-06 20:05 UTC (permalink / raw)
  To: Sasha Levin, Florian Fainelli, netdev@vger.kernel.org
  Cc: jonas.gorski@gmail.com, Florian Fainelli, stable@vger.kernel.org
In-Reply-To: <20180402231701.17348-1-f.fainelli@gmail.com>

Hi,

[This is an automated email]

This commit has been processed because it contains a "Fixes:" tag,
fixing commit: 967dd82ffc52 net: dsa: b53: Add support for Broadcom RoboSwitch.

The bot has also determined it's probably a bug fixing patch. (score: 8.8847)

The bot has tested the following trees: v4.16, v4.15.15, v4.14.32, v4.9.92.

v4.16: Build OK!
v4.15.15: Build OK!
v4.14.32: Build OK!
v4.9.92: Build OK!

^ permalink raw reply

* Re: [PATCH net v6 4/4] ipv6: udp: set dst cache for a connected sk if current not valid
From: Sasha Levin @ 2018-04-06 20:05 UTC (permalink / raw)
  To: Sasha Levin, Alexey Kodanev, netdev@vger.kernel.org
  Cc: Eric Dumazet, stable@vger.kernel.org
In-Reply-To: <1522756810-24985-5-git-send-email-alexey.kodanev@oracle.com>

Hi,

[This is an automated email]

This commit has been processed because it contains a "Fixes:" tag,
fixing commit: 33c162a980fe ipv6: datagram: Update dst cache of a connected datagram sk during pmtu update.

The bot has tested the following trees: v4.16, v4.15.15, v4.14.32, v4.9.92.

v4.16: Failed to apply! Possible dependencies:
    96818159c3c0 ("ipv6: allow to cache dst for a connected sk in ip6_sk_dst_lookup_flow()")

v4.15.15: Failed to apply! Possible dependencies:
    96818159c3c0 ("ipv6: allow to cache dst for a connected sk in ip6_sk_dst_lookup_flow()")

v4.14.32: Failed to apply! Possible dependencies:
    96818159c3c0 ("ipv6: allow to cache dst for a connected sk in ip6_sk_dst_lookup_flow()")

v4.9.92: Failed to apply! Possible dependencies:
    96818159c3c0 ("ipv6: allow to cache dst for a connected sk in ip6_sk_dst_lookup_flow()")

^ permalink raw reply

* Re: [PATCH net-next 6/6] netdevsim: Add simple FIB resource controller via devlink
From: David Ahern @ 2018-04-06 21:22 UTC (permalink / raw)
  To: Jiri Pirko
  Cc: netdev, davem, roopa, shm, jiri, idosch, jakub.kicinski,
	andy.roulin
In-Reply-To: <20180406055255.GB19345@nanopsycho>

On 4/5/18 11:52 PM, Jiri Pirko wrote:
> Thu, Apr 05, 2018 at 11:06:41PM CEST, dsa@cumulusnetworks.com wrote:
>> On 4/5/18 2:10 PM, David Ahern wrote:
>>>
>>> The ASIC here is the kernel tables in a namespace. It does not make
>>> sense to have 2 devlink instances for a single namespace.
>>
>> I put this example controller in netdevsim per a suggestion from Ido.
>> The netdevsim seemed like a good idea given that modules intention --
>> testing network facilities. Perhaps I should have done this as a
>> completely standalone module ...
>>
>> The intention is to treat the kernel's tables *per namespace* as a
>> standalone entity that can be managed very similar to ASIC resources.
> 
> So you say you want to treat a namespace as an ASIC? That sounds very
> odd to me :/

Why? The kernel has forwarding tables, acl's, etc just like the ASIC,
and each namespace is a separate set of tables.

If you think about it, userspace "programs" the kernel just like mlxsw
and userspace SDKs "program" an asic.


>> Given that I can add a resource controller module
>> (drivers/net/kern_res_mgr.c?) that creates a 'struct device' per network
>> namespace with a devlink instance. In this case the device would very
>> much be tied to the namespace 1:1.
> 
> That sounds more reasonable and accurate, yet still odd. You would not
> have any netdevices there? Any ports?
> 

Sure, what ever ports are assigned to or created in the namespace.

Nothing about the devlink API says it has to be a real h/w device.
Nothing about the devlink API says it can only be used for real h/w that
has ports represented by netdevices that the devlink instance some how
has "control" over.

As the netdevsim demo shows, I can build an L3 resource controller for
the kernel tables using just the devlink API and the in-kernel notifiers.

^ permalink raw reply

* Re: [RFC PATCH 2/3] netdev: kernel-only IFF_HIDDEN netdevice
From: Siwei Liu @ 2018-04-06 21:29 UTC (permalink / raw)
  To: Stephen Hemminger
  Cc: Michael S. Tsirkin, Jiri Pirko, Alexander Duyck, David Miller,
	Brandeburg, Jesse, Jakub Kicinski, Jason Wang, Samudrala, Sridhar,
	Netdev, virtualization, virtio-dev, si-wei liu
In-Reply-To: <20180403160834.51594373@xeon-e3>

(put discussions back on list which got accidentally removed)

On Tue, Apr 3, 2018 at 4:08 PM, Stephen Hemminger
<stephen@networkplumber.org> wrote:
> On Tue, 3 Apr 2018 12:04:38 -0700
> Siwei Liu <loseweigh@gmail.com> wrote:
>
>> On Tue, Apr 3, 2018 at 10:35 AM, Stephen Hemminger
>> <stephen@networkplumber.org> wrote:
>> > On Sun,  1 Apr 2018 05:13:09 -0400
>> > Si-Wei Liu <si-wei.liu@oracle.com> wrote:
>> >
>> >> Hidden netdevice is not visible to userspace such that
>> >> typical network utilites e.g. ip, ifconfig and et al,
>> >> cannot sense its existence or configure it. Internally
>> >> hidden netdev may associate with an upper level netdev
>> >> that userspace has access to. Although userspace cannot
>> >> manipulate the lower netdev directly, user may control
>> >> or configure the underlying hidden device through the
>> >> upper-level netdev. For identification purpose, the
>> >> kobject for hidden netdev still presents in the sysfs
>> >> hierarchy, however, no uevent message will be generated
>> >> when the sysfs entry is created, modified or destroyed.
>> >>
>> >> For that end, a separate namescope needs to be carved
>> >> out for IFF_HIDDEN netdevs. As of now netdev name that
>> >> starts with colon i.e. ':' is invalid in userspace,
>> >> since socket ioctls such as SIOCGIFCONF use ':' as the
>> >> separator for ifname. The absence of namescope started
>> >> with ':' can rightly be used as the namescope for
>> >> the kernel-only IFF_HIDDEN netdevs.
>> >>
>> >> Signed-off-by: Si-Wei Liu <si-wei.liu@oracle.com>
>> >> ---
>> >
>> > I understand the use case. I proposed using . as a prefix before
>> > but that ran into resistance. Using colon seems worse.
>>
>> Using dot (.) can't be good because it would cause namespace collision
>> and thus breaking apps when you hide the device. Imagine a user really
>> wants to add a link with the same name as the one hidden and it starts
>> with a dot. It would fail, and users don't know its just because the
>> name starts with dot. IMHO users should be agnostic of (the namespace
>> of) hidden device at all if what they pick is a valid name.
>>
>> ":" is an invalid prefix to userspace, there's no such problem if
>> being used to construct the namescope for hidden devices.
>>
>> However, technically, just as what I alluded to in the reply earlier,
>> it might really be consistent to put this under a separeate namespace
>> instead than fiddling with name prefix. But I am just not sure if that
>> is a big hammer and would like to earn enough feedback and attention
>> before going that way too quickly.
>>
>>
>> >
>> > Rather than playing with names and all the issues that can cause,
>> > why not make it an attribute flag of the device in netlink.
>>
>> Atrribute flag doesn't help. It's a matter of namespace.
>>
>> Regards,
>> -Siwei
>
> In Vyatta, we used names like ".isatap" for devices that would clutter up
> the user experience. They are naturally not visible by simple scans of
> /sys/class/net, and there was a patch to ignore them in iproute2.
> It was a hack which worked but not really worth upstreaming.
>
> The question is if this a security feature then it needs to be more

I don't expect the namespace to be a security aspect of feature, but
rather a way to make old userspace unmodified  to work with a new
feature. And, we're going to add API to expose the netdev info for the
invisible IFF_AUTO_MANAGED links anyway. We don't need to make it
secure and all hidden under the dark to be honest.

> robust than just name prefix. Plus it took years to handle network
> namespaces everywhere; this kind of flag would start same problems.
>
> Network namespaces work but have the problem namespaces only weakly
> support hierarchy and nesting. I prefer the namespace approach
> because it fits better and has less impact.

Great, thanks!

-Siwei

^ permalink raw reply

* [RFC PATCH bpf-next 4/6] samples/bpf: move common-purpose perf_event functions to bpf_load.c
From: Yonghong Song @ 2018-04-06 21:48 UTC (permalink / raw)
  To: ast, daniel, netdev; +Cc: kernel-team
In-Reply-To: <20180406214846.916265-1-yhs@fb.com>

There is no functionality change in this patch. The common-purpose
perf_event functions are moved from trace_output_user.c to bpf_load.c
so that these function can be reused later.

Signed-off-by: Yonghong Song <yhs@fb.com>
---
 samples/bpf/bpf_load.c          | 104 ++++++++++++++++++++++++++++++++++++
 samples/bpf/bpf_load.h          |   5 ++
 samples/bpf/trace_output_user.c | 113 ++++------------------------------------
 3 files changed, 118 insertions(+), 104 deletions(-)

diff --git a/samples/bpf/bpf_load.c b/samples/bpf/bpf_load.c
index bebe418..62aa5cc 100644
--- a/samples/bpf/bpf_load.c
+++ b/samples/bpf/bpf_load.c
@@ -713,3 +713,107 @@ struct ksym *ksym_search(long key)
 	return &syms[0];
 }
 
+static int page_size;
+static int page_cnt = 8;
+static volatile struct perf_event_mmap_page *header;
+
+static int perf_event_mmap(int fd)
+{
+	void *base;
+	int mmap_size;
+
+	page_size = getpagesize();
+	mmap_size = page_size * (page_cnt + 1);
+
+	base = mmap(NULL, mmap_size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
+	if (base == MAP_FAILED) {
+		printf("mmap err\n");
+		return -1;
+	}
+
+	header = base;
+	return 0;
+}
+
+static int perf_event_poll(int fd)
+{
+	struct pollfd pfd = { .fd = fd, .events = POLLIN };
+
+	return poll(&pfd, 1, 1000);
+}
+
+struct perf_event_sample {
+	struct perf_event_header header;
+	__u32 size;
+	char data[];
+};
+
+static void perf_event_read(perf_event_print_fn fn)
+{
+	__u64 data_tail = header->data_tail;
+	__u64 data_head = header->data_head;
+	__u64 buffer_size = page_cnt * page_size;
+	void *base, *begin, *end;
+	char buf[256];
+
+	asm volatile("" ::: "memory"); /* in real code it should be smp_rmb() */
+	if (data_head == data_tail)
+		return;
+
+	base = ((char *)header) + page_size;
+
+	begin = base + data_tail % buffer_size;
+	end = base + data_head % buffer_size;
+
+	while (begin != end) {
+		struct perf_event_sample *e;
+
+		e = begin;
+		if (begin + e->header.size > base + buffer_size) {
+			long len = base + buffer_size - begin;
+
+			assert(len < e->header.size);
+			memcpy(buf, begin, len);
+			memcpy(buf + len, base, e->header.size - len);
+			e = (void *) buf;
+			begin = base + e->header.size - len;
+		} else if (begin + e->header.size == base + buffer_size) {
+			begin = base;
+		} else {
+			begin += e->header.size;
+		}
+
+		if (e->header.type == PERF_RECORD_SAMPLE) {
+			fn(e->data, e->size);
+		} else if (e->header.type == PERF_RECORD_LOST) {
+			struct {
+				struct perf_event_header header;
+				__u64 id;
+				__u64 lost;
+			} *lost = (void *) e;
+			printf("lost %lld events\n", lost->lost);
+		} else {
+			printf("unknown event type=%d size=%d\n",
+			       e->header.type, e->header.size);
+		}
+	}
+
+	__sync_synchronize(); /* smp_mb() */
+	header->data_tail = data_head;
+}
+
+int perf_event_poller(int fd, perf_event_exec_fn exec_fn,
+		      perf_event_print_fn output_fn)
+{
+	if (perf_event_mmap(fd) < 0)
+		return 1;
+
+	exec_fn();
+
+	for (;;) {
+		perf_event_poll(fd);
+		perf_event_read(output_fn);
+	}
+
+	return 0;
+}
diff --git a/samples/bpf/bpf_load.h b/samples/bpf/bpf_load.h
index 453c200..d618750 100644
--- a/samples/bpf/bpf_load.h
+++ b/samples/bpf/bpf_load.h
@@ -62,4 +62,9 @@ struct ksym {
 int load_kallsyms(void);
 struct ksym *ksym_search(long key);
 int bpf_set_link_xdp_fd(int ifindex, int fd, __u32 flags);
+
+typedef void (*perf_event_exec_fn)(void);
+typedef void (*perf_event_print_fn)(void *data, int size);
+int perf_event_poller(int fd, perf_event_exec_fn exec_fn,
+		      perf_event_print_fn output_fn);
 #endif
diff --git a/samples/bpf/trace_output_user.c b/samples/bpf/trace_output_user.c
index ccca1e3..3d3991f 100644
--- a/samples/bpf/trace_output_user.c
+++ b/samples/bpf/trace_output_user.c
@@ -24,97 +24,6 @@
 
 static int pmu_fd;
 
-int page_size;
-int page_cnt = 8;
-volatile struct perf_event_mmap_page *header;
-
-typedef void (*print_fn)(void *data, int size);
-
-static int perf_event_mmap(int fd)
-{
-	void *base;
-	int mmap_size;
-
-	page_size = getpagesize();
-	mmap_size = page_size * (page_cnt + 1);
-
-	base = mmap(NULL, mmap_size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
-	if (base == MAP_FAILED) {
-		printf("mmap err\n");
-		return -1;
-	}
-
-	header = base;
-	return 0;
-}
-
-static int perf_event_poll(int fd)
-{
-	struct pollfd pfd = { .fd = fd, .events = POLLIN };
-
-	return poll(&pfd, 1, 1000);
-}
-
-struct perf_event_sample {
-	struct perf_event_header header;
-	__u32 size;
-	char data[];
-};
-
-static void perf_event_read(print_fn fn)
-{
-	__u64 data_tail = header->data_tail;
-	__u64 data_head = header->data_head;
-	__u64 buffer_size = page_cnt * page_size;
-	void *base, *begin, *end;
-	char buf[256];
-
-	asm volatile("" ::: "memory"); /* in real code it should be smp_rmb() */
-	if (data_head == data_tail)
-		return;
-
-	base = ((char *)header) + page_size;
-
-	begin = base + data_tail % buffer_size;
-	end = base + data_head % buffer_size;
-
-	while (begin != end) {
-		struct perf_event_sample *e;
-
-		e = begin;
-		if (begin + e->header.size > base + buffer_size) {
-			long len = base + buffer_size - begin;
-
-			assert(len < e->header.size);
-			memcpy(buf, begin, len);
-			memcpy(buf + len, base, e->header.size - len);
-			e = (void *) buf;
-			begin = base + e->header.size - len;
-		} else if (begin + e->header.size == base + buffer_size) {
-			begin = base;
-		} else {
-			begin += e->header.size;
-		}
-
-		if (e->header.type == PERF_RECORD_SAMPLE) {
-			fn(e->data, e->size);
-		} else if (e->header.type == PERF_RECORD_LOST) {
-			struct {
-				struct perf_event_header header;
-				__u64 id;
-				__u64 lost;
-			} *lost = (void *) e;
-			printf("lost %lld events\n", lost->lost);
-		} else {
-			printf("unknown event type=%d size=%d\n",
-			       e->header.type, e->header.size);
-		}
-	}
-
-	__sync_synchronize(); /* smp_mb() */
-	header->data_tail = data_head;
-}
-
 static __u64 time_get_ns(void)
 {
 	struct timespec ts;
@@ -166,10 +75,17 @@ static void test_bpf_perf_event(void)
 	ioctl(pmu_fd, PERF_EVENT_IOC_ENABLE, 0);
 }
 
+static void exec_action(void)
+{
+	FILE *f;
+
+	f = popen("taskset 1 dd if=/dev/zero of=/dev/null", "r");
+	(void) f;
+}
+
 int main(int argc, char **argv)
 {
 	char filename[256];
-	FILE *f;
 
 	snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]);
 
@@ -180,17 +96,6 @@ int main(int argc, char **argv)
 
 	test_bpf_perf_event();
 
-	if (perf_event_mmap(pmu_fd) < 0)
-		return 1;
-
-	f = popen("taskset 1 dd if=/dev/zero of=/dev/null", "r");
-	(void) f;
-
 	start_time = time_get_ns();
-	for (;;) {
-		perf_event_poll(pmu_fd);
-		perf_event_read(print_bpf_output);
-	}
-
-	return 0;
+	return perf_event_poller(pmu_fd, exec_action, print_bpf_output);
 }
-- 
2.9.5

^ permalink raw reply related

* [RFC PATCH bpf-next 0/6] bpf: add bpf_get_stack_helper
From: Yonghong Song @ 2018-04-06 21:48 UTC (permalink / raw)
  To: ast, daniel, netdev; +Cc: kernel-team

Currently, stackmap and bpf_get_stackid helper are provided
for bpf program to get the stack trace. This approach has
a limitation though. If two stack traces have the same hash,
only one will get stored in the stackmap table,
so some stack traces are missing from user perspective.

This patch implements a new helper, bpf_get_stack, will
send stack traces directly to bpf program. The bpf program
is able to see all stack traces, and then can do in-kernel
processing or send stack traces to user space through
shared map or bpf_perf_event_output.

Patches #1 and #2 implemented the core kernel support.
Patch #3 synced the new helper to tools headers.
Patches #4 and #5 added a test in samples/bpf by attaching
to a kprobe, and Patch #6 added a test in tools/bpf by
attaching to a tracepoint.

Yonghong Song (6):
  bpf: change prototype for stack_map_get_build_id_offset
  bpf: add bpf_get_stack helper
  tools/bpf: add bpf_get_stack helper to tools headers
  samples/bpf: move common-purpose perf_event functions to bpf_load.c
  samples/bpf: add a test for bpf_get_stack helper
  tools/bpf: add a test case for bpf_get_stack helper

 include/linux/bpf.h                               |   1 +
 include/linux/filter.h                            |   3 +-
 include/uapi/linux/bpf.h                          |  17 ++-
 kernel/bpf/stackmap.c                             |  69 ++++++++--
 kernel/bpf/syscall.c                              |  12 +-
 kernel/bpf/verifier.c                             |   3 +
 kernel/trace/bpf_trace.c                          |  50 +++++++-
 samples/bpf/Makefile                              |   4 +
 samples/bpf/bpf_load.c                            | 104 +++++++++++++++
 samples/bpf/bpf_load.h                            |   5 +
 samples/bpf/trace_get_stack_kern.c                |  80 ++++++++++++
 samples/bpf/trace_get_stack_user.c                | 150 ++++++++++++++++++++++
 samples/bpf/trace_output_user.c                   | 113 ++--------------
 tools/include/uapi/linux/bpf.h                    |  17 ++-
 tools/testing/selftests/bpf/bpf_helpers.h         |   2 +
 tools/testing/selftests/bpf/test_progs.c          |  41 +++++-
 tools/testing/selftests/bpf/test_stacktrace_map.c |  20 ++-
 17 files changed, 568 insertions(+), 123 deletions(-)
 create mode 100644 samples/bpf/trace_get_stack_kern.c
 create mode 100644 samples/bpf/trace_get_stack_user.c

-- 
2.9.5

^ permalink raw reply

* [RFC PATCH bpf-next 2/6] bpf: add bpf_get_stack helper
From: Yonghong Song @ 2018-04-06 21:48 UTC (permalink / raw)
  To: ast, daniel, netdev; +Cc: kernel-team
In-Reply-To: <20180406214846.916265-1-yhs@fb.com>

Currently, stackmap and bpf_get_stackid helper are provided
for bpf program to get the stack trace. This approach has
a limitation though. If two stack traces have the same hash,
only one will get stored in the stackmap table,
so some stack traces are missing from user perspective.

This patch implements a new helper, bpf_get_stack, will
send stack traces directly to bpf program. The bpf program
is able to see all stack traces, and then can do in-kernel
processing or send stack traces to user space through
shared map or bpf_perf_event_output.

Signed-off-by: Yonghong Song <yhs@fb.com>
---
 include/linux/bpf.h      |  1 +
 include/linux/filter.h   |  3 ++-
 include/uapi/linux/bpf.h | 17 +++++++++++++--
 kernel/bpf/stackmap.c    | 56 ++++++++++++++++++++++++++++++++++++++++++++++++
 kernel/bpf/syscall.c     | 12 ++++++++++-
 kernel/bpf/verifier.c    |  3 +++
 kernel/trace/bpf_trace.c | 50 +++++++++++++++++++++++++++++++++++++++++-
 7 files changed, 137 insertions(+), 5 deletions(-)

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 95a7abd..72ccb9a 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -676,6 +676,7 @@ extern const struct bpf_func_proto bpf_get_current_comm_proto;
 extern const struct bpf_func_proto bpf_skb_vlan_push_proto;
 extern const struct bpf_func_proto bpf_skb_vlan_pop_proto;
 extern const struct bpf_func_proto bpf_get_stackid_proto;
+extern const struct bpf_func_proto bpf_get_stack_proto;
 extern const struct bpf_func_proto bpf_sock_map_update_proto;
 
 /* Shared helpers among cBPF and eBPF. */
diff --git a/include/linux/filter.h b/include/linux/filter.h
index fc4e8f9..9b64f63 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -467,7 +467,8 @@ struct bpf_prog {
 				dst_needed:1,	/* Do we need dst entry? */
 				blinded:1,	/* Was blinded */
 				is_func:1,	/* program is a bpf function */
-				kprobe_override:1; /* Do we override a kprobe? */
+				kprobe_override:1, /* Do we override a kprobe? */
+				need_callchain_buf:1; /* Needs callchain buffer? */
 	enum bpf_prog_type	type;		/* Type of BPF program */
 	enum bpf_attach_type	expected_attach_type; /* For some prog types */
 	u32			len;		/* Number of filter blocks */
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index c5ec897..a4ff5b7 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -517,6 +517,17 @@ union bpf_attr {
  *             other bits - reserved
  *     Return: >= 0 stackid on success or negative error
  *
+ * int bpf_get_stack(ctx, buf, size, flags)
+ *     walk user or kernel stack and store the ips in buf
+ *     @ctx: struct pt_regs*
+ *     @buf: user buffer to fill stack
+ *     @size: the buf size
+ *     @flags: bits 0-7 - numer of stack frames to skip
+ *             bit 8 - collect user stack instead of kernel
+ *             bit 11 - get build-id as well if user stack
+ *             other bits - reserved
+ *     Return: >= 0 size copied on success or negative error
+ *
  * s64 bpf_csum_diff(from, from_size, to, to_size, seed)
  *     calculate csum diff
  *     @from: raw from buffer
@@ -821,7 +832,8 @@ union bpf_attr {
 	FN(msg_apply_bytes),		\
 	FN(msg_cork_bytes),		\
 	FN(msg_pull_data),		\
-	FN(bind),
+	FN(bind),			\
+	FN(get_stack),
 
 /* integer value in 'imm' field of BPF_CALL instruction selects which helper
  * function eBPF program intends to call
@@ -855,11 +867,12 @@ enum bpf_func_id {
 /* BPF_FUNC_skb_set_tunnel_key and BPF_FUNC_skb_get_tunnel_key flags. */
 #define BPF_F_TUNINFO_IPV6		(1ULL << 0)
 
-/* BPF_FUNC_get_stackid flags. */
+/* BPF_FUNC_get_stackid and BPF_FUNC_get_stack flags. */
 #define BPF_F_SKIP_FIELD_MASK		0xffULL
 #define BPF_F_USER_STACK		(1ULL << 8)
 #define BPF_F_FAST_STACK_CMP		(1ULL << 9)
 #define BPF_F_REUSE_STACKID		(1ULL << 10)
+#define BPF_F_USER_BUILD_ID		(1ULL << 11)
 
 /* BPF_FUNC_skb_set_tunnel_key flags. */
 #define BPF_F_ZERO_CSUM_TX		(1ULL << 1)
diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c
index 04f6ec1..371c72e 100644
--- a/kernel/bpf/stackmap.c
+++ b/kernel/bpf/stackmap.c
@@ -402,6 +402,62 @@ const struct bpf_func_proto bpf_get_stackid_proto = {
 	.arg3_type	= ARG_ANYTHING,
 };
 
+BPF_CALL_4(bpf_get_stack, struct pt_regs *, regs, void *, buf, u32, size,
+	   u64, flags)
+{
+	u32 init_nr, trace_nr, copy_len, elem_size, num_elem;
+	bool user_build_id = flags & BPF_F_USER_BUILD_ID;
+	u32 skip = flags & BPF_F_SKIP_FIELD_MASK;
+	bool user = flags & BPF_F_USER_STACK;
+	struct perf_callchain_entry *trace;
+	bool kernel = !user;
+	u64 *ips;
+
+	if (unlikely(flags & ~(BPF_F_SKIP_FIELD_MASK | BPF_F_USER_STACK |
+			       BPF_F_USER_BUILD_ID)))
+		return -EINVAL;
+
+	elem_size = (user && user_build_id) ? sizeof(struct bpf_stack_build_id)
+					    : sizeof(u64);
+	if (unlikely(size % elem_size))
+		return -EINVAL;
+
+	num_elem = size / elem_size;
+	if (sysctl_perf_event_max_stack < num_elem)
+		init_nr = 0;
+	else
+		init_nr = sysctl_perf_event_max_stack - num_elem;
+	trace = get_perf_callchain(regs, init_nr, kernel, user,
+				   sysctl_perf_event_max_stack, false, false);
+	if (unlikely(!trace))
+		return -EFAULT;
+
+	trace_nr = trace->nr - init_nr;
+	if (trace_nr <= skip)
+		return -EFAULT;
+
+	trace_nr -= skip;
+	trace_nr = (trace_nr <= num_elem) ? trace_nr : num_elem;
+	copy_len = trace_nr * elem_size;
+	ips = trace->ip + skip + init_nr;
+	if (user && user_build_id)
+		stack_map_get_build_id_offset(buf, ips, trace_nr, user);
+	else
+		memcpy(buf, ips, copy_len);
+
+	return copy_len;
+}
+
+const struct bpf_func_proto bpf_get_stack_proto = {
+	.func		= bpf_get_stack,
+	.gpl_only	= true,
+	.ret_type	= RET_INTEGER,
+	.arg1_type	= ARG_PTR_TO_CTX,
+	.arg2_type	= ARG_PTR_TO_UNINIT_MEM,
+	.arg3_type	= ARG_CONST_SIZE_OR_ZERO,
+	.arg4_type	= ARG_ANYTHING,
+};
+
 /* Called from eBPF program */
 static void *stack_map_lookup_elem(struct bpf_map *map, void *key)
 {
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 0244973..2aa3a65 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -984,10 +984,13 @@ void bpf_prog_free_id(struct bpf_prog *prog, bool do_idr_lock)
 static void __bpf_prog_put_rcu(struct rcu_head *rcu)
 {
 	struct bpf_prog_aux *aux = container_of(rcu, struct bpf_prog_aux, rcu);
+	bool need_callchain_buf = aux->prog->need_callchain_buf;
 
 	free_used_maps(aux);
 	bpf_prog_uncharge_memlock(aux->prog);
 	security_bpf_prog_free(aux);
+	if (need_callchain_buf)
+		put_callchain_buffers();
 	bpf_prog_free(aux->prog);
 }
 
@@ -1004,7 +1007,8 @@ static void __bpf_prog_put(struct bpf_prog *prog, bool do_idr_lock)
 			bpf_prog_kallsyms_del(prog->aux->func[i]);
 		bpf_prog_kallsyms_del(prog);
 
-		call_rcu(&prog->aux->rcu, __bpf_prog_put_rcu);
+		synchronize_rcu();
+		__bpf_prog_put_rcu(&prog->aux->rcu);
 	}
 }
 
@@ -1341,6 +1345,12 @@ static int bpf_prog_load(union bpf_attr *attr)
 	if (err)
 		goto free_used_maps;
 
+	if (prog->need_callchain_buf) {
+		err = get_callchain_buffers(sysctl_perf_event_max_stack);
+		if (err)
+			goto free_used_maps;
+	}
+
 	err = bpf_prog_new_fd(prog);
 	if (err < 0) {
 		/* failed to allocate fd.
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 5dd1dcb..aba9425 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -2460,6 +2460,9 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn
 	if (err)
 		return err;
 
+	if (func_id == BPF_FUNC_get_stack)
+		env->prog->need_callchain_buf = true;
+
 	if (changes_data)
 		clear_all_pkt_pointers(env);
 	return 0;
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index d88e96d..fe8476f 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -20,6 +20,7 @@
 #include "trace.h"
 
 u64 bpf_get_stackid(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5);
+u64 bpf_get_stack(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5);
 
 /**
  * trace_call_bpf - invoke BPF program
@@ -577,6 +578,8 @@ kprobe_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 		return &bpf_perf_event_output_proto;
 	case BPF_FUNC_get_stackid:
 		return &bpf_get_stackid_proto;
+	case BPF_FUNC_get_stack:
+		return &bpf_get_stack_proto;
 	case BPF_FUNC_perf_event_read_value:
 		return &bpf_perf_event_read_value_proto;
 #ifdef CONFIG_BPF_KPROBE_OVERRIDE
@@ -664,6 +667,25 @@ static const struct bpf_func_proto bpf_get_stackid_proto_tp = {
 	.arg3_type	= ARG_ANYTHING,
 };
 
+BPF_CALL_4(bpf_get_stack_tp, void *, tp_buff, void *, buf, u32, size,
+	   u64, flags)
+{
+	struct pt_regs *regs = *(struct pt_regs **)tp_buff;
+
+	return bpf_get_stack((unsigned long) regs, (unsigned long) buf,
+			     (unsigned long) size, flags, 0);
+}
+
+static const struct bpf_func_proto bpf_get_stack_proto_tp = {
+	.func		= bpf_get_stack_tp,
+	.gpl_only	= true,
+	.ret_type	= RET_INTEGER,
+	.arg1_type	= ARG_PTR_TO_CTX,
+	.arg2_type	= ARG_PTR_TO_UNINIT_MEM,
+	.arg3_type	= ARG_CONST_SIZE_OR_ZERO,
+	.arg4_type	= ARG_ANYTHING,
+};
+
 static const struct bpf_func_proto *
 tp_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 {
@@ -672,6 +694,8 @@ tp_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 		return &bpf_perf_event_output_proto_tp;
 	case BPF_FUNC_get_stackid:
 		return &bpf_get_stackid_proto_tp;
+	case BPF_FUNC_get_stack:
+		return &bpf_get_stack_proto_tp;
 	default:
 		return tracing_func_proto(func_id, prog);
 	}
@@ -734,6 +758,8 @@ pe_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 		return &bpf_perf_event_output_proto_tp;
 	case BPF_FUNC_get_stackid:
 		return &bpf_get_stackid_proto_tp;
+	case BPF_FUNC_get_stack:
+		return &bpf_get_stack_proto_tp;
 	case BPF_FUNC_perf_prog_read_value:
 		return &bpf_perf_prog_read_value_proto;
 	default:
@@ -744,7 +770,7 @@ pe_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 /*
  * bpf_raw_tp_regs are separate from bpf_pt_regs used from skb/xdp
  * to avoid potential recursive reuse issue when/if tracepoints are added
- * inside bpf_*_event_output and/or bpf_get_stack_id
+ * inside bpf_*_event_output, bpf_get_stackid and/or bpf_get_stack
  */
 static DEFINE_PER_CPU(struct pt_regs, bpf_raw_tp_regs);
 BPF_CALL_5(bpf_perf_event_output_raw_tp, struct bpf_raw_tracepoint_args *, args,
@@ -787,6 +813,26 @@ static const struct bpf_func_proto bpf_get_stackid_proto_raw_tp = {
 	.arg3_type	= ARG_ANYTHING,
 };
 
+BPF_CALL_4(bpf_get_stack_raw_tp, struct bpf_raw_tracepoint_args *, args,
+	   void *, buf, u32, size, u64, flags)
+{
+	struct pt_regs *regs = this_cpu_ptr(&bpf_raw_tp_regs);
+
+	perf_fetch_caller_regs(regs);
+	return bpf_get_stack((unsigned long) regs, (unsigned long) buf,
+			     (unsigned long) size, flags, 0);
+}
+
+static const struct bpf_func_proto bpf_get_stack_proto_raw_tp = {
+	.func		= bpf_get_stack_raw_tp,
+	.gpl_only	= true,
+	.ret_type	= RET_INTEGER,
+	.arg1_type	= ARG_PTR_TO_CTX,
+	.arg2_type	= ARG_PTR_TO_MEM,
+	.arg3_type	= ARG_CONST_SIZE_OR_ZERO,
+	.arg4_type	= ARG_ANYTHING,
+};
+
 static const struct bpf_func_proto *
 raw_tp_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 {
@@ -795,6 +841,8 @@ raw_tp_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 		return &bpf_perf_event_output_proto_raw_tp;
 	case BPF_FUNC_get_stackid:
 		return &bpf_get_stackid_proto_raw_tp;
+	case BPF_FUNC_get_stack:
+		return &bpf_get_stack_proto_raw_tp;
 	default:
 		return tracing_func_proto(func_id, prog);
 	}
-- 
2.9.5

^ permalink raw reply related

* [RFC PATCH bpf-next 5/6] samples/bpf: add a test for bpf_get_stack helper
From: Yonghong Song @ 2018-04-06 21:48 UTC (permalink / raw)
  To: ast, daniel, netdev; +Cc: kernel-team
In-Reply-To: <20180406214846.916265-1-yhs@fb.com>

The test attached a kprobe program to kernel function sys_write.
It tested to get stack for user space, kernel space and user
space with build_id request. It also tested to get user
and kernel stack into the same buffer with back-to-back
bpf_get_stack helper calls.

Whenever the kernel stack is available, the user space
application will check to ensure that sys_write/SyS_write
is part of the stack.

Signed-off-by: Yonghong Song <yhs@fb.com>
---
 samples/bpf/Makefile               |   4 +
 samples/bpf/trace_get_stack_kern.c |  80 ++++++++++++++++++++
 samples/bpf/trace_get_stack_user.c | 150 +++++++++++++++++++++++++++++++++++++
 3 files changed, 234 insertions(+)
 create mode 100644 samples/bpf/trace_get_stack_kern.c
 create mode 100644 samples/bpf/trace_get_stack_user.c

diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile
index 4d6a6ed..94e7b10 100644
--- a/samples/bpf/Makefile
+++ b/samples/bpf/Makefile
@@ -44,6 +44,7 @@ hostprogs-y += xdp_monitor
 hostprogs-y += xdp_rxq_info
 hostprogs-y += syscall_tp
 hostprogs-y += cpustat
+hostprogs-y += trace_get_stack
 
 # Libbpf dependencies
 LIBBPF := ../../tools/lib/bpf/bpf.o ../../tools/lib/bpf/nlattr.o
@@ -95,6 +96,7 @@ xdp_monitor-objs := bpf_load.o $(LIBBPF) xdp_monitor_user.o
 xdp_rxq_info-objs := bpf_load.o $(LIBBPF) xdp_rxq_info_user.o
 syscall_tp-objs := bpf_load.o $(LIBBPF) syscall_tp_user.o
 cpustat-objs := bpf_load.o $(LIBBPF) cpustat_user.o
+trace_get_stack-objs := bpf_load.o $(LIBBPF) trace_get_stack_user.o
 
 # Tell kbuild to always build the programs
 always := $(hostprogs-y)
@@ -148,6 +150,7 @@ always += xdp_rxq_info_kern.o
 always += xdp2skb_meta_kern.o
 always += syscall_tp_kern.o
 always += cpustat_kern.o
+always += trace_get_stack_kern.o
 
 HOSTCFLAGS += -I$(objtree)/usr/include
 HOSTCFLAGS += -I$(srctree)/tools/lib/
@@ -193,6 +196,7 @@ HOSTLOADLIBES_xdp_monitor += -lelf
 HOSTLOADLIBES_xdp_rxq_info += -lelf
 HOSTLOADLIBES_syscall_tp += -lelf
 HOSTLOADLIBES_cpustat += -lelf
+HOSTLOADLIBES_trace_get_stack += -lelf
 
 # Allows pointing LLC/CLANG to a LLVM backend with bpf support, redefine on cmdline:
 #  make samples/bpf/ LLC=~/git/llvm/build/bin/llc CLANG=~/git/llvm/build/bin/clang
diff --git a/samples/bpf/trace_get_stack_kern.c b/samples/bpf/trace_get_stack_kern.c
new file mode 100644
index 0000000..c7cc7b1
--- /dev/null
+++ b/samples/bpf/trace_get_stack_kern.c
@@ -0,0 +1,80 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <linux/ptrace.h>
+#include <linux/version.h>
+#include <uapi/linux/bpf.h>
+#include "bpf_helpers.h"
+
+/* Permit pretty deep stack traces */
+#define MAX_STACK 100
+struct stack_trace_t {
+	int pid;
+	int kern_stack_size;
+	int user_stack_size;
+	int user_stack_buildid_size;
+	u64 kern_stack[MAX_STACK];
+	u64 user_stack[MAX_STACK];
+	struct bpf_stack_build_id user_stack_buildid[MAX_STACK];
+};
+
+struct bpf_map_def SEC("maps") perfmap = {
+	.type = BPF_MAP_TYPE_PERF_EVENT_ARRAY,
+	.key_size = sizeof(int),
+	.value_size = sizeof(u32),
+	.max_entries = 2,
+};
+
+struct bpf_map_def SEC("maps") stackdata_map = {
+	.type = BPF_MAP_TYPE_PERCPU_ARRAY,
+	.key_size = sizeof(u32),
+	.value_size = sizeof(struct stack_trace_t),
+	.max_entries = 1,
+};
+
+SEC("kprobe/sys_write")
+int bpf_prog1(struct pt_regs *ctx)
+{
+	int max_len, max_buildid_len, usize, ksize, total_size;
+	struct stack_trace_t *data;
+	void *raw_data;
+	u32 key = 0;
+
+	data = bpf_map_lookup_elem(&stackdata_map, &key);
+	if (!data)
+		return 0;
+
+	max_len = MAX_STACK * sizeof(u64);
+	max_buildid_len = MAX_STACK * sizeof(struct bpf_stack_build_id);
+	data->pid = bpf_get_current_pid_tgid();
+	data->kern_stack_size = bpf_get_stack(ctx, data->kern_stack,
+					      max_len, 0);
+	data->user_stack_size = bpf_get_stack(ctx, data->user_stack, max_len,
+					    BPF_F_USER_STACK);
+	data->user_stack_buildid_size = bpf_get_stack(
+		ctx, data->user_stack_buildid, max_buildid_len,
+		BPF_F_USER_STACK | BPF_F_USER_BUILD_ID);
+	bpf_perf_event_output(ctx, &perfmap, 0, data, sizeof(*data));
+
+	/* write both kernel and user stacks to the same buffer */
+	raw_data = (void *)data;
+	usize = bpf_get_stack(ctx, raw_data, max_len, BPF_F_USER_STACK);
+	if (usize < 0)
+		return 0;
+
+	ksize = 0;
+	if (usize < max_len) {
+		ksize = bpf_get_stack(ctx, raw_data + usize, max_len - usize,
+				      0);
+		if (ksize < 0)
+			return 0;
+	}
+	total_size = (usize < max_len ? usize : 0) +
+		     (ksize < max_len ? ksize : 0);
+	if (total_size > 0 && total_size < max_len)
+		bpf_perf_event_output(ctx, &perfmap, 0, raw_data, total_size);
+
+	return 0;
+}
+
+char _license[] SEC("license") = "GPL";
+u32 _version SEC("version") = LINUX_VERSION_CODE;
diff --git a/samples/bpf/trace_get_stack_user.c b/samples/bpf/trace_get_stack_user.c
new file mode 100644
index 0000000..f64f5a5
--- /dev/null
+++ b/samples/bpf/trace_get_stack_user.c
@@ -0,0 +1,150 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <stdio.h>
+#include <unistd.h>
+#include <stdlib.h>
+#include <stdbool.h>
+#include <string.h>
+#include <fcntl.h>
+#include <poll.h>
+#include <linux/perf_event.h>
+#include <linux/bpf.h>
+#include <errno.h>
+#include <assert.h>
+#include <sys/syscall.h>
+#include <sys/ioctl.h>
+#include <sys/mman.h>
+#include <time.h>
+#include <signal.h>
+#include "libbpf.h"
+#include "bpf_load.h"
+#include "perf-sys.h"
+
+static int pmu_fd;
+
+#define MAX_CNT 10ull
+#define MAX_STACK 100
+struct stack_trace_t {
+	int pid;
+	int kern_stack_size;
+	int user_stack_size;
+	int user_stack_buildid_size;
+	__u64 kern_stack[MAX_STACK];
+	__u64 user_stack[MAX_STACK];
+	struct bpf_stack_build_id user_stack_buildid[MAX_STACK];
+};
+
+static void print_bpf_output(void *data, int size)
+{
+	struct stack_trace_t *e = data;
+	int i, num_stack;
+	static __u64 cnt;
+	bool found = false;
+
+	cnt++;
+
+	if (size < sizeof(struct stack_trace_t)) {
+		__u64 *raw_data = data;
+
+		num_stack = size / sizeof(__u64);
+		printf("sample size = %d, raw stack\n\t", size);
+		for (i = 0; i < num_stack; i++) {
+			struct ksym *ks = ksym_search(raw_data[i]);
+
+			printf("0x%llx ", raw_data[i]);
+			if (ks && (strcmp(ks->name, "sys_write") == 0 ||
+				   strcmp(ks->name, "SyS_write") == 0))
+				found = true;
+		}
+		printf("\n");
+	} else {
+		printf("sample size = %d, pid %d\n", size, e->pid);
+		if (e->kern_stack_size > 0) {
+			num_stack = e->kern_stack_size / sizeof(__u64);
+			printf("\tkernel_stack(%d): ", num_stack);
+			for (i = 0; i < num_stack; i++) {
+				struct ksym *ks = ksym_search(e->kern_stack[i]);
+
+				printf("0x%llx ", e->kern_stack[i]);
+				if (ks && (strcmp(ks->name, "sys_write") == 0 ||
+					   strcmp(ks->name, "SyS_write") == 0))
+					found = true;
+			}
+			printf("\n");
+		}
+		if (e->user_stack_size > 0) {
+			num_stack = e->user_stack_size / sizeof(__u64);
+			printf("\tuser_stack(%d): ", num_stack);
+			for (i = 0; i < num_stack; i++)
+				printf("0x%llx ", e->user_stack[i]);
+			printf("\n");
+		}
+		if (e->user_stack_buildid_size > 0) {
+			num_stack = e->user_stack_buildid_size /
+				    sizeof(struct bpf_stack_build_id);
+			printf("\tuser_stack_buildid(%d): ", num_stack);
+			for (i = 0; i < num_stack; i++) {
+				int j;
+
+				printf("(%d, 0x", e->user_stack_buildid[i].status);
+				for (j = 0; j < BPF_BUILD_ID_SIZE; j++)
+					printf("%02x", e->user_stack_buildid[i].build_id[i]);
+				printf(", %llx) ", e->user_stack_buildid[i].offset);
+			}
+			printf("\n");
+		}
+	}
+	if (!found) {
+		printf("received %lld events, kern symbol not found, exiting ...\n", cnt);
+		kill(0, SIGINT);
+	}
+
+	if (cnt == MAX_CNT) {
+		printf("received max %lld events, exiting ...\n", cnt);
+		kill(0, SIGINT);
+	}
+}
+
+static void test_bpf_perf_event(void)
+{
+	struct perf_event_attr attr = {
+		.sample_type = PERF_SAMPLE_RAW,
+		.type = PERF_TYPE_SOFTWARE,
+		.config = PERF_COUNT_SW_BPF_OUTPUT,
+	};
+	int key = 0;
+
+	pmu_fd = sys_perf_event_open(&attr, -1/*pid*/, 0/*cpu*/, -1/*group_fd*/, 0);
+
+	assert(pmu_fd >= 0);
+	assert(bpf_map_update_elem(map_fd[0], &key, &pmu_fd, BPF_ANY) == 0);
+	ioctl(pmu_fd, PERF_EVENT_IOC_ENABLE, 0);
+}
+
+static void action(void)
+{
+	FILE *f;
+
+	f = popen("taskset 1 dd if=/dev/zero of=/dev/null", "r");
+	(void) f;
+}
+
+int main(int argc, char **argv)
+{
+	char filename[256];
+
+	snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]);
+
+	if (load_kallsyms()) {
+		printf("failed to process /proc/kallsyms\n");
+		return 2;
+	}
+
+	if (load_bpf_file(filename)) {
+		printf("%s", bpf_log_buf);
+		return 1;
+	}
+
+	test_bpf_perf_event();
+	return perf_event_poller(pmu_fd, action, print_bpf_output);
+}
-- 
2.9.5

^ permalink raw reply related

* [RFC PATCH bpf-next 3/6] tools/bpf: add bpf_get_stack helper to tools headers
From: Yonghong Song @ 2018-04-06 21:48 UTC (permalink / raw)
  To: ast, daniel, netdev; +Cc: kernel-team
In-Reply-To: <20180406214846.916265-1-yhs@fb.com>

Signed-off-by: Yonghong Song <yhs@fb.com>
---
 tools/include/uapi/linux/bpf.h            | 17 +++++++++++++++--
 tools/testing/selftests/bpf/bpf_helpers.h |  2 ++
 2 files changed, 17 insertions(+), 2 deletions(-)

diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index 9d07465..3930463 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -517,6 +517,17 @@ union bpf_attr {
  *             other bits - reserved
  *     Return: >= 0 stackid on success or negative error
  *
+ * int bpf_get_stack(ctx, buf, size, flags)
+ *     walk user or kernel stack and store the ips in buf
+ *     @ctx: struct pt_regs*
+ *     @buf: user buffer to fill stack
+ *     @size: the buf size
+ *     @flags: bits 0-7 - numer of stack frames to skip
+ *             bit 8 - collect user stack instead of kernel
+ *             bit 11 - get build-id as well if user stack
+ *             other bits - reserved
+ *     Return: >= 0 size copied on success or negative error
+ *
  * s64 bpf_csum_diff(from, from_size, to, to_size, seed)
  *     calculate csum diff
  *     @from: raw from buffer
@@ -821,7 +832,8 @@ union bpf_attr {
 	FN(msg_apply_bytes),		\
 	FN(msg_cork_bytes),		\
 	FN(msg_pull_data),		\
-	FN(bind),
+	FN(bind),			\
+	FN(get_stack),
 
 /* integer value in 'imm' field of BPF_CALL instruction selects which helper
  * function eBPF program intends to call
@@ -855,11 +867,12 @@ enum bpf_func_id {
 /* BPF_FUNC_skb_set_tunnel_key and BPF_FUNC_skb_get_tunnel_key flags. */
 #define BPF_F_TUNINFO_IPV6		(1ULL << 0)
 
-/* BPF_FUNC_get_stackid flags. */
+/* BPF_FUNC_get_stackid and BPF_FUNC_get_stack flags. */
 #define BPF_F_SKIP_FIELD_MASK		0xffULL
 #define BPF_F_USER_STACK		(1ULL << 8)
 #define BPF_F_FAST_STACK_CMP		(1ULL << 9)
 #define BPF_F_REUSE_STACKID		(1ULL << 10)
+#define BPF_F_USER_BUILD_ID		(1ULL << 11)
 
 /* BPF_FUNC_skb_set_tunnel_key flags. */
 #define BPF_F_ZERO_CSUM_TX		(1ULL << 1)
diff --git a/tools/testing/selftests/bpf/bpf_helpers.h b/tools/testing/selftests/bpf/bpf_helpers.h
index d8223d9..acaed02 100644
--- a/tools/testing/selftests/bpf/bpf_helpers.h
+++ b/tools/testing/selftests/bpf/bpf_helpers.h
@@ -96,6 +96,8 @@ static int (*bpf_msg_pull_data)(void *ctx, int start, int end, int flags) =
 	(void *) BPF_FUNC_msg_pull_data;
 static int (*bpf_bind)(void *ctx, void *addr, int addr_len) =
 	(void *) BPF_FUNC_bind;
+static int (*bpf_get_stack)(void *ctx, void *buf, int size, int flags) =
+	(void *) BPF_FUNC_get_stack;
 
 /* llvm builtin functions that eBPF C program may use to
  * emit BPF_LD_ABS and BPF_LD_IND instructions
-- 
2.9.5

^ permalink raw reply related

* [RFC PATCH bpf-next 6/6] tools/bpf: add a test case for bpf_get_stack helper
From: Yonghong Song @ 2018-04-06 21:48 UTC (permalink / raw)
  To: ast, daniel, netdev; +Cc: kernel-team
In-Reply-To: <20180406214846.916265-1-yhs@fb.com>

The test_stacktrace_map is enhanced to call bpf_get_stack
in the helper to get the stack trace as well.
The stack traces from bpf_get_stack and bpf_get_stackid
are compared to ensure that for the same stack as
represented as the same hash, their ip addresses
must be the same.

Signed-off-by: Yonghong Song <yhs@fb.com>
---
 tools/testing/selftests/bpf/test_progs.c          | 41 ++++++++++++++++++++++-
 tools/testing/selftests/bpf/test_stacktrace_map.c | 20 +++++++++--
 2 files changed, 57 insertions(+), 4 deletions(-)

diff --git a/tools/testing/selftests/bpf/test_progs.c b/tools/testing/selftests/bpf/test_progs.c
index faadbe2..8aa2844 100644
--- a/tools/testing/selftests/bpf/test_progs.c
+++ b/tools/testing/selftests/bpf/test_progs.c
@@ -865,9 +865,39 @@ static int compare_map_keys(int map1_fd, int map2_fd)
 	return 0;
 }
 
+static int compare_stack_ips(int smap_fd, int amap_fd)
+{
+	int max_len = PERF_MAX_STACK_DEPTH * sizeof(__u64);
+	__u32 key, next_key, *cur_key_p, *next_key_p;
+	char val_buf1[max_len], val_buf2[max_len];
+	int i, err;
+
+	cur_key_p = NULL;
+	next_key_p = &key;
+	while (bpf_map_get_next_key(smap_fd, cur_key_p, next_key_p) == 0) {
+		err = bpf_map_lookup_elem(smap_fd, next_key_p, val_buf1);
+		if (err)
+			return err;
+		err = bpf_map_lookup_elem(amap_fd, next_key_p, val_buf2);
+		if (err)
+			return err;
+		for (i = 0; i < max_len; i++) {
+			if (val_buf1[i] != val_buf2[i])
+				return -1;
+		}
+		key = *next_key_p;
+		cur_key_p = &key;
+		next_key_p = &next_key;
+	}
+	if (errno != ENOENT)
+		return -1;
+
+	return 0;
+}
+
 static void test_stacktrace_map()
 {
-	int control_map_fd, stackid_hmap_fd, stackmap_fd;
+	int control_map_fd, stackid_hmap_fd, stackmap_fd, stack_amap_fd;
 	const char *file = "./test_stacktrace_map.o";
 	int bytes, efd, err, pmu_fd, prog_fd;
 	struct perf_event_attr attr = {};
@@ -925,6 +955,10 @@ static void test_stacktrace_map()
 	if (stackmap_fd < 0)
 		goto disable_pmu;
 
+	stack_amap_fd = bpf_find_map(__func__, obj, "stack_amap");
+	if (stack_amap_fd < 0)
+		goto disable_pmu;
+
 	/* give some time for bpf program run */
 	sleep(1);
 
@@ -946,6 +980,11 @@ static void test_stacktrace_map()
 		  "err %d errno %d\n", err, errno))
 		goto disable_pmu_noerr;
 
+	err = compare_stack_ips(stackmap_fd, stack_amap_fd);
+	if (CHECK(err, "compare_stack_ips stackmap vs. stack_amap",
+		  "err %d errno %d\n", err, errno))
+		goto disable_pmu_noerr;
+
 	goto disable_pmu_noerr;
 disable_pmu:
 	error_cnt++;
diff --git a/tools/testing/selftests/bpf/test_stacktrace_map.c b/tools/testing/selftests/bpf/test_stacktrace_map.c
index 76d85c5d..f83c7b6 100644
--- a/tools/testing/selftests/bpf/test_stacktrace_map.c
+++ b/tools/testing/selftests/bpf/test_stacktrace_map.c
@@ -19,14 +19,21 @@ struct bpf_map_def SEC("maps") stackid_hmap = {
 	.type = BPF_MAP_TYPE_HASH,
 	.key_size = sizeof(__u32),
 	.value_size = sizeof(__u32),
-	.max_entries = 10000,
+	.max_entries = 16384,
 };
 
 struct bpf_map_def SEC("maps") stackmap = {
 	.type = BPF_MAP_TYPE_STACK_TRACE,
 	.key_size = sizeof(__u32),
 	.value_size = sizeof(__u64) * PERF_MAX_STACK_DEPTH,
-	.max_entries = 10000,
+	.max_entries = 16384,
+};
+
+struct bpf_map_def SEC("maps") stack_amap = {
+	.type = BPF_MAP_TYPE_ARRAY,
+	.key_size = sizeof(__u32),
+	.value_size = sizeof(__u64) * PERF_MAX_STACK_DEPTH,
+	.max_entries = 16384,
 };
 
 /* taken from /sys/kernel/debug/tracing/events/sched/sched_switch/format */
@@ -44,7 +51,10 @@ struct sched_switch_args {
 SEC("tracepoint/sched/sched_switch")
 int oncpu(struct sched_switch_args *ctx)
 {
+	__u32 max_len = PERF_MAX_STACK_DEPTH * sizeof(__u64);
 	__u32 key = 0, val = 0, *value_p;
+	void *stack_p;
+
 
 	value_p = bpf_map_lookup_elem(&control_map, &key);
 	if (value_p && *value_p)
@@ -52,8 +62,12 @@ int oncpu(struct sched_switch_args *ctx)
 
 	/* The size of stackmap and stackid_hmap should be the same */
 	key = bpf_get_stackid(ctx, &stackmap, 0);
-	if ((int)key >= 0)
+	if ((int)key >= 0) {
 		bpf_map_update_elem(&stackid_hmap, &key, &val, 0);
+		stack_p = bpf_map_lookup_elem(&stack_amap, &key);
+		if (stack_p)
+			bpf_get_stack(ctx, stack_p, max_len, 0);
+	}
 
 	return 0;
 }
-- 
2.9.5

^ permalink raw reply related

* [RFC PATCH bpf-next 1/6] bpf: change prototype for stack_map_get_build_id_offset
From: Yonghong Song @ 2018-04-06 21:48 UTC (permalink / raw)
  To: ast, daniel, netdev; +Cc: kernel-team
In-Reply-To: <20180406214846.916265-1-yhs@fb.com>

This patch didn't incur functionality change. The function prototype
got changed so that the same function can be reused later.

Signed-off-by: Yonghong Song <yhs@fb.com>
---
 kernel/bpf/stackmap.c | 13 +++++--------
 1 file changed, 5 insertions(+), 8 deletions(-)

diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c
index 57eeb12..04f6ec1 100644
--- a/kernel/bpf/stackmap.c
+++ b/kernel/bpf/stackmap.c
@@ -262,16 +262,11 @@ static int stack_map_get_build_id(struct vm_area_struct *vma,
 	return ret;
 }
 
-static void stack_map_get_build_id_offset(struct bpf_map *map,
-					  struct stack_map_bucket *bucket,
+static void stack_map_get_build_id_offset(struct bpf_stack_build_id *id_offs,
 					  u64 *ips, u32 trace_nr, bool user)
 {
 	int i;
 	struct vm_area_struct *vma;
-	struct bpf_stack_build_id *id_offs;
-
-	bucket->nr = trace_nr;
-	id_offs = (struct bpf_stack_build_id *)bucket->data;
 
 	/*
 	 * We cannot do up_read() in nmi context, so build_id lookup is
@@ -361,8 +356,10 @@ BPF_CALL_3(bpf_get_stackid, struct pt_regs *, regs, struct bpf_map *, map,
 			pcpu_freelist_pop(&smap->freelist);
 		if (unlikely(!new_bucket))
 			return -ENOMEM;
-		stack_map_get_build_id_offset(map, new_bucket, ips,
-					      trace_nr, user);
+		new_bucket->nr = trace_nr;
+		stack_map_get_build_id_offset(
+			(struct bpf_stack_build_id *)new_bucket->data,
+			ips, trace_nr, user);
 		trace_len = trace_nr * sizeof(struct bpf_stack_build_id);
 		if (hash_matches && bucket->nr == trace_nr &&
 		    memcmp(bucket->data, new_bucket->data, trace_len) == 0) {
-- 
2.9.5

^ permalink raw reply related

* Fwd: Problem with the kernel 4.15 - cutting the band (tc)
From: Linus Torvalds @ 2018-04-06 21:56 UTC (permalink / raw)
  To: David Miller; +Cc: netdev
In-Reply-To: <42a29f8a-beb3-84da-9129-fbda7ef81be4@hostcenter.eu>

Forwarding a report about what looks like a regression between 4.14 and 4.15.

New ENOSPC issue? I don't even knew where to start guessing where to look.

Help me, Davem-Wan Kenobi, you are my only hope.

(But adding netdev just in case somebody else goes "That's obviously Xyz")

              Linus

---------- Forwarded message ----------
From: Marcin Kabiesz <admin@hostcenter.eu>
Date: Thu, Apr 5, 2018 at 10:38 AM
Subject: Problem with the kernel 4.15 - cutting the band (tc)


Hello,
I have a problem with bandwidth cutting on kernel 4.15. On the version
up to 4.15, i.e. 4.14, this problem does not occur.

uname -a: Linux router 4.14.15 #1 SMP x86_64 Intel Xeon E3-1230 v6
command to reproduce:

tc qdisc add dev ifb0 root handle 1: htb r2q 2
tc class add dev ifb0 parent 1: classid 1:1 htb rate 10gbit ceil
10gbit quantum 16000
tc filter add dev ifb0 parent 1: prio 5 handle 1: protocol all u32 divisor 256
tc filter add dev ifb0 protocol all parent 1: prio 5 u32 ht 800::
match ip dst 0.0.0.0/0 hashkey mask 0x000000ff at 16 link 1:
tc filter add dev ifb0 parent 1:0 handle ::1 protocol all prio 5 u32
ht 1:2c: match ip dst 192.168.3.44/32 flowid 1:2
tc filter del dev ifb0 parent 1:0 handle 1:2c:1 prio 5 u32
tc filter add dev ifb0 parent 1:0 handle ::1 protocol all prio 5 u32
ht 1:2c: match ip dst 192.168.3.44/32 flowid 1:2
tc filter del dev ifb0 parent 1:0 handle 1:2c:1 prio 5 u32

This ok, no error/warnings and dmesg log.

uname -a: Linux router 4.15.8 #1 SMP x86_64 Intel Xeon E3-1230 v6 (or
4.15.14 this same effect)
command to reproduce:

tc qdisc add dev ifb0 root handle 1: htb r2q 2
tc class add dev ifb0 parent 1: classid 1:1 htb rate 10gbit ceil
10gbit quantum 16000
tc filter add dev ifb0 parent 1: prio 5 handle 1: protocol all u32 divisor 256
tc filter add dev ifb0 protocol all parent 1: prio 5 u32 ht 800::
match ip dst 0.0.0.0/0 hashkey mask 0x000000ff at 16 link 1:
tc filter add dev ifb0 parent 1:0 handle ::1 protocol all prio 5 u32
ht 1:2c: match ip dst 192.168.3.44/32 flowid 1:2
tc filter del dev ifb0 parent 1:0 handle 1:2c:1 prio 5 u32
tc filter add dev ifb0 parent 1:0 handle ::1 protocol all prio 5 u32
ht 1:2c: match ip dst 192.168.3.44/32 flowid 1:2
RTNETLINK answers: No space left on device
We have an error talking to the kernel

This not ok, on error/warnings and no dmesg log.

Best Regards
Please forgive my English
Marcin Kabiesz

^ permalink raw reply


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox