Netdev List

Netdev List
 help / color / mirror / Atom feed

* [RFC bpf-next 05/10] bpf: cfg: detect unreachable basic blocks
From: Jiong Wang @ 2018-05-07 10:22 UTC (permalink / raw)
  To: alexei.starovoitov, daniel
  Cc: john.fastabend, netdev, oss-drivers, Jiong Wang
In-Reply-To: <1525688567-19618-1-git-send-email-jiong.wang@netronome.com>

Do unreachable basic blocks detection as a side-product of the dfs walk
when building domination information.

Signed-off-by: Jiong Wang <jiong.wang@netronome.com>
---
 kernel/bpf/cfg.c      | 31 ++++++++++++++++++++++++++-----
 kernel/bpf/cfg.h      |  3 ++-
 kernel/bpf/verifier.c |  3 ++-
 3 files changed, 30 insertions(+), 7 deletions(-)

diff --git a/kernel/bpf/cfg.c b/kernel/bpf/cfg.c
index 90692e4..7ce1472 100644
--- a/kernel/bpf/cfg.c
+++ b/kernel/bpf/cfg.c
@@ -407,15 +407,17 @@ static void calc_idoms(struct bpf_subprog_info *subprog, struct dom_info *di,
 			di->idom[idx] = di->idom[di->idom[idx]];
 }
 
-static int calc_dfs_tree(struct bpf_subprog_info *subprog, struct dom_info *di,
-			 bool reverse)
+static int
+calc_dfs_tree(struct bpf_verifier_env *env, struct bpf_subprog_info *subprog,
+	      struct dom_info *di, bool reverse)
 {
-	u16 bb_num = subprog->bb_num, sp = 0, idx, parent_idx;
+	u16 bb_num = subprog->bb_num, sp = 0, idx, parent_idx, i;
 	struct list_head *bb_list = &subprog->bbs;
 	u16 entry_bb_fake_idx = bb_num - 2;
 	struct bb_node *entry_bb, *exit_bb;
 	struct edge_iter ei, *stack;
 	struct edge_node *e;
+	bool *visited;
 
 	di->dfs_order[entry_bb_fake_idx] = di->dfsnum;
 
@@ -423,6 +425,10 @@ static int calc_dfs_tree(struct bpf_subprog_info *subprog, struct dom_info *di,
 	if (!stack)
 		return -ENOMEM;
 
+	visited = kcalloc(bb_num - 2, sizeof(bool), GFP_KERNEL);
+	if (!visited)
+		return -ENOMEM;
+
 	if (reverse) {
 		entry_bb = exit_bb(bb_list);
 		exit_bb = entry_bb(bb_list);
@@ -445,6 +451,8 @@ static int calc_dfs_tree(struct bpf_subprog_info *subprog, struct dom_info *di,
 
 			if (reverse) {
 				bb_dst = e->src;
+				if (bb_dst != exit_bb)
+					visited[bb_dst->idx] = true;
 				if (bb_dst == exit_bb ||
 				    di->dfs_order[bb_dst->idx]) {
 					ei_next(&ei);
@@ -453,6 +461,8 @@ static int calc_dfs_tree(struct bpf_subprog_info *subprog, struct dom_info *di,
 				bb_src = e->dst;
 			} else {
 				bb_dst = e->dst;
+				if (bb_dst != exit_bb)
+					visited[bb_dst->idx] = true;
 				if (bb_dst == exit_bb ||
 				    di->dfs_order[bb_dst->idx]) {
 					ei_next(&ei);
@@ -490,6 +500,16 @@ static int calc_dfs_tree(struct bpf_subprog_info *subprog, struct dom_info *di,
 
 	kfree(stack);
 
+	for (i = 0; i < bb_num - 2; i++) {
+		if (!visited[i]) {
+			bpf_verifier_log_write(env, "cfg - unreachable insn\n");
+			kfree(visited);
+			return -EINVAL;
+		}
+	}
+
+	kfree(visited);
+
 	return 0;
 }
 
@@ -541,7 +561,8 @@ static int idoms_to_doms(struct bpf_subprog_info *subprog, struct dom_info *di)
  * The implementation also referenced GNU GCC 3.0.
  */
 
-int subprog_build_dom_info(struct bpf_subprog_info *subprog)
+int subprog_build_dom_info(struct bpf_verifier_env *env,
+			   struct bpf_subprog_info *subprog)
 {
 	struct dom_info di;
 	int ret;
@@ -550,7 +571,7 @@ int subprog_build_dom_info(struct bpf_subprog_info *subprog)
 	if (ret < 0)
 		goto free_dominfo;
 
-	ret = calc_dfs_tree(subprog, &di, false);
+	ret = calc_dfs_tree(env, subprog, &di, false);
 	if (ret < 0)
 		goto free_dominfo;
 
diff --git a/kernel/bpf/cfg.h b/kernel/bpf/cfg.h
index c02c4cf..02729a9 100644
--- a/kernel/bpf/cfg.h
+++ b/kernel/bpf/cfg.h
@@ -10,7 +10,8 @@
 
 int subprog_add_bb_edges(struct bpf_insn *insns, struct list_head *bb_list);
 int subprog_append_bb(struct list_head *bb_list, int head);
-int subprog_build_dom_info(struct bpf_subprog_info *subprog);
+int subprog_build_dom_info(struct bpf_verifier_env *env,
+			   struct bpf_subprog_info *subprog);
 int subprog_fini_bb(struct list_head *bb_list, int subprog_end);
 bool subprog_has_loop(struct bpf_subprog_info *subprog);
 int subprog_init_bb(struct list_head *bb_list, int subprog_start);
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index a93aa43..46d5eae 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -879,7 +879,8 @@ static int check_subprogs(struct bpf_verifier_env *env)
 			if (ret < 0)
 				goto free_nodes;
 			subprog[cur_subprog].bb_num = ret;
-			ret = subprog_build_dom_info(&subprog[cur_subprog]);
+			ret = subprog_build_dom_info(env,
+						     &subprog[cur_subprog]);
 			if (ret < 0)
 				goto free_nodes;
 			if (subprog_has_loop(&subprog[cur_subprog])) {
-- 
2.7.4

^ permalink raw reply related

* [RFC bpf-next 04/10] bpf: cfg: detect loop use domination information
From: Jiong Wang @ 2018-05-07 10:22 UTC (permalink / raw)
  To: alexei.starovoitov, daniel
  Cc: john.fastabend, netdev, oss-drivers, Jiong Wang
In-Reply-To: <1525688567-19618-1-git-send-email-jiong.wang@netronome.com>

If one bb is dominating its predecessor, then there is loop.

Signed-off-by: Jiong Wang <jiong.wang@netronome.com>
---
 kernel/bpf/cfg.c      | 22 ++++++++++++++++++++++
 kernel/bpf/cfg.h      |  1 +
 kernel/bpf/verifier.c |  8 ++++++++
 3 files changed, 31 insertions(+)

diff --git a/kernel/bpf/cfg.c b/kernel/bpf/cfg.c
index b50937a..90692e4 100644
--- a/kernel/bpf/cfg.c
+++ b/kernel/bpf/cfg.c
@@ -568,6 +568,28 @@ int subprog_build_dom_info(struct bpf_subprog_info *subprog)
 	return ret;
 }
 
+bool subprog_has_loop(struct bpf_subprog_info *subprog)
+{
+	int lane_len = BITS_TO_LONGS(subprog->bb_num - 2);
+	struct list_head *bb_list = &subprog->bbs;
+	struct bb_node *bb, *entry_bb;
+	struct edge_node *e;
+
+	entry_bb = entry_bb(bb_list);
+	bb = bb_next(entry_bb);
+	list_for_each_entry_from(bb, &exit_bb(bb_list)->l, l)
+		list_for_each_entry(e, &bb->e_prevs, l) {
+			struct bb_node *latch = e->src;
+
+			if (latch != entry_bb &&
+			    test_bit(bb->idx,
+				     subprog->dtree + latch->idx * lane_len))
+				return true;
+		}
+
+	return false;
+}
+
 static void subprog_free_edge(struct bb_node *bb)
 {
 	struct list_head *succs = &bb->e_succs;
diff --git a/kernel/bpf/cfg.h b/kernel/bpf/cfg.h
index cbb44f2..c02c4cf 100644
--- a/kernel/bpf/cfg.h
+++ b/kernel/bpf/cfg.h
@@ -12,6 +12,7 @@ int subprog_add_bb_edges(struct bpf_insn *insns, struct list_head *bb_list);
 int subprog_append_bb(struct list_head *bb_list, int head);
 int subprog_build_dom_info(struct bpf_subprog_info *subprog);
 int subprog_fini_bb(struct list_head *bb_list, int subprog_end);
+bool subprog_has_loop(struct bpf_subprog_info *subprog);
 int subprog_init_bb(struct list_head *bb_list, int subprog_start);
 void subprog_free(struct bpf_subprog_info *subprog, int end_idx);
 
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 6543470..a93aa43 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -879,6 +879,14 @@ static int check_subprogs(struct bpf_verifier_env *env)
 			if (ret < 0)
 				goto free_nodes;
 			subprog[cur_subprog].bb_num = ret;
+			ret = subprog_build_dom_info(&subprog[cur_subprog]);
+			if (ret < 0)
+				goto free_nodes;
+			if (subprog_has_loop(&subprog[cur_subprog])) {
+				verbose(env, "cfg - loop detected");
+				ret = -EINVAL;
+				goto free_nodes;
+			}
 			subprog_start = subprog_end;
 			cur_subprog++;
 			if (cur_subprog < env->subprog_cnt) {
-- 
2.7.4

^ permalink raw reply related

* [RFC bpf-next 03/10] bpf: cfg: build domination tree using Tarjan algorithm
From: Jiong Wang @ 2018-05-07 10:22 UTC (permalink / raw)
  To: alexei.starovoitov, daniel
  Cc: john.fastabend, netdev, oss-drivers, Jiong Wang
In-Reply-To: <1525688567-19618-1-git-send-email-jiong.wang@netronome.com>

  - When building domination information, we need to some array data
    structure, and need to index into them using basic block index.
    Calculate the basic block index during adding edges.

  - The Step-1 in Tarjan algorithm is to assign dfs order number for
    each bb in the cfg. We need to iterate on all bbs in reverse dfs order.

  - Step-2 and Step-3 in Tarjan algorithm to calculate semi-dominators and
    immediate-dominators implicitly and explicitly.

  - Step-4, we add a new bitmap field inside subprog_info to keep the
    dominator tree there.

  - Post-domination information is built but not tested.

Signed-off-by: Jiong Wang <jiong.wang@netronome.com>
---
 include/linux/bpf_verifier.h |   3 +
 kernel/bpf/cfg.c             | 386 ++++++++++++++++++++++++++++++++++++++++++-
 kernel/bpf/cfg.h             |   3 +-
 kernel/bpf/verifier.c        |   5 +-
 4 files changed, 393 insertions(+), 4 deletions(-)

diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h
index 404a5d8..9c0a808 100644
--- a/include/linux/bpf_verifier.h
+++ b/include/linux/bpf_verifier.h
@@ -177,6 +177,9 @@ struct bpf_subprog_info {
 	u32 start; /* insn idx of function entry point */
 	u16 stack_depth; /* max. stack depth used by this function */
 	struct list_head bbs; /* basic blocks list. */
+	u32 bb_num; /* total basic block num. */
+	unsigned long *dtree;
+	bool dtree_avail;
 };
 
 /* single container for all structs
diff --git a/kernel/bpf/cfg.c b/kernel/bpf/cfg.c
index 208aac7..b50937a 100644
--- a/kernel/bpf/cfg.c
+++ b/kernel/bpf/cfg.c
@@ -17,11 +17,33 @@ struct edge_node {
 	struct bb_node *dst;
 };
 
+struct edge_iter {
+	struct list_head *list_head;
+	struct edge_node *edge;
+};
+
+#define first_edge(e_list)	list_first_entry(e_list, struct edge_node, l)
+#define last_edge(e_list)	list_last_entry(e_list, struct edge_node, l)
+#define next_edge(e)		list_next_entry(e, l)
+
+static bool ei_end_p(struct edge_iter *ei)
+{
+	return &ei->edge->l == ei->list_head;
+}
+
+static void ei_next(struct edge_iter *ei)
+{
+	struct edge_node *e = ei->edge;
+
+	ei->edge = next_edge(e);
+}
+
 struct bb_node {
 	struct list_head l;
 	struct list_head e_prevs;
 	struct list_head e_succs;
 	u16 head;
+	u16 idx;
 };
 
 #define bb_prev(bb)		list_prev_entry(bb, l)
@@ -31,6 +53,27 @@ struct bb_node {
 #define entry_bb(bb_list)	bb_first(bb_list)
 #define exit_bb(bb_list)	bb_last(bb_list)
 
+struct dom_info {
+	u16 *dfs_parent;
+	u16 *dfs_order;
+	struct bb_node **dfs_to_bb;
+	/* immediate-dominator */
+	u16 *idom;
+	/* semi-dominator */
+	u16 *sdom;
+	u16 *bucket;
+	u16 *next_bucket;
+	/* best node during path compression. */
+	u16 *best;
+	/* ancestor along tree edge. */
+	u16 *ancestor;
+	/* size and child are used for tree balancing. */
+	u16 *size;
+	u16 *child;
+
+	u16 dfsnum;
+};
+
 int subprog_append_bb(struct list_head *bb_list, int head)
 {
 	struct bb_node *new_bb, *bb;
@@ -107,6 +150,7 @@ int subprog_add_bb_edges(struct bpf_insn *insns, struct list_head *bb_list)
 {
 	struct bb_node *bb, *exit_bb;
 	struct edge_node *edge;
+	int bb_num;
 
 	bb = entry_bb(bb_list);
 	edge = kcalloc(2, sizeof(*edge), GFP_KERNEL);
@@ -118,9 +162,12 @@ int subprog_add_bb_edges(struct bpf_insn *insns, struct list_head *bb_list)
 	edge[1].src = edge->src;
 	edge[1].dst = edge->dst;
 	list_add_tail(&edge[1].l, &edge[1].dst->e_prevs);
+	bb->idx = -1;
 
 	exit_bb = exit_bb(bb_list);
+	exit_bb->idx = -2;
 	bb = bb_next(bb);
+	bb_num = 0;
 	list_for_each_entry_from(bb, &exit_bb->l, l) {
 		bool has_fallthrough, only_has_fallthrough;
 		bool has_branch, only_has_branch;
@@ -129,6 +176,8 @@ int subprog_add_bb_edges(struct bpf_insn *insns, struct list_head *bb_list)
 		struct bpf_insn insn;
 		u8 code;
 
+		bb->idx = bb_num++;
+
 		edge = kcalloc(2, sizeof(*edge), GFP_KERNEL);
 		if (!edge)
 			return -ENOMEM;
@@ -184,9 +233,341 @@ int subprog_add_bb_edges(struct bpf_insn *insns, struct list_head *bb_list)
 		}
 	}
 
+	return bb_num + 2;
+}
+
+static int init_dom_info(struct bpf_subprog_info *subprog, struct dom_info *di)
+{
+	u16 *p, bb_num, i;
+
+	di->dfs_parent = NULL;
+	di->dfs_to_bb = NULL;
+
+	bb_num = subprog->bb_num;
+	p = kcalloc(10 * bb_num, sizeof(*p), GFP_KERNEL);
+	if (!p)
+		return -ENOMEM;
+
+	di->dfs_parent = p;
+	di->dfs_order = di->dfs_parent + bb_num;
+	di->idom = di->dfs_order + bb_num;
+	di->sdom = di->idom + bb_num;
+	di->bucket = di->sdom + bb_num;
+	di->next_bucket = di->bucket + bb_num;
+	di->best = di->next_bucket + bb_num;
+	di->ancestor = di->best + bb_num;
+	di->size = di->ancestor + bb_num;
+	di->child = di->size + bb_num;
+	di->dfs_to_bb = kcalloc(bb_num, sizeof(struct bb_node *), GFP_KERNEL);
+	di->dfsnum = 1;
+
+	for (i = 0; i < bb_num; i++) {
+		di->size[i] = 1;
+		di->best[i] = i;
+		di->sdom[i] = i;
+	}
+
 	return 0;
 }
 
+static void compress_path(struct dom_info *di, unsigned int v)
+{
+	unsigned int parent = di->ancestor[v];
+
+	if (di->ancestor[parent]) {
+		compress_path(di, parent);
+
+		if (di->sdom[di->best[parent]] < di->sdom[di->best[v]])
+			di->best[v] = di->best[parent];
+
+		di->ancestor[v] = di->ancestor[parent];
+	}
+}
+
+static unsigned int eval(struct dom_info *di, unsigned int v)
+{
+	unsigned int ancestor = di->ancestor[v];
+
+	/* v is root. */
+	if (!ancestor)
+		return di->best[v];
+
+	/* compress path */
+	compress_path(di, v);
+	ancestor = di->ancestor[v];
+
+	if (di->sdom[di->best[ancestor]] >= di->sdom[di->best[v]])
+		return di->best[v];
+	else
+		return di->best[ancestor];
+}
+
+/* Re-balancing the tree before linking. */
+static void link(struct dom_info *di, unsigned int v, unsigned int w)
+{
+	unsigned int s = w;
+
+	while (di->sdom[di->best[w]] < di->sdom[di->best[di->child[s]]]) {
+		if (di->size[s] + di->size[di->child[di->child[s]]] >=
+			2 * di->size[di->child[s]]) {
+			di->ancestor[di->child[s]] = s;
+			di->child[s] = di->child[di->child[s]];
+		} else {
+			di->size[di->child[s]] = di->size[s];
+			di->ancestor[s] = di->child[s];
+			s = di->child[s];
+		}
+	}
+
+	di->best[s] = di->best[w];
+	di->size[v] += di->size[w];
+	if (di->size[v] < 2 * di->size[w]) {
+		unsigned int t = s;
+
+		s = di->child[v];
+		di->child[v] = t;
+	}
+
+	while (s) {
+		di->ancestor[s] = v;
+		s = di->child[s];
+	}
+}
+
+static void calc_idoms(struct bpf_subprog_info *subprog, struct dom_info *di,
+		       bool reverse)
+{
+	u16 entry_bb_fake_idx = subprog->bb_num - 2, idx, w, k, par;
+	struct list_head *bb_list = &subprog->bbs;
+	struct bb_node *entry_bb;
+	struct edge_iter ei;
+
+	if (reverse)
+		entry_bb = exit_bb(bb_list);
+	else
+		entry_bb = entry_bb(bb_list);
+	idx = di->dfsnum - 1;
+
+	while (idx > 1) {
+		struct bb_node *bb = di->dfs_to_bb[idx];
+		struct edge_node *e;
+
+		par = di->dfs_parent[idx];
+		k = idx;
+
+		if (reverse) {
+			ei.edge = first_edge(&bb->e_succs);
+			ei.list_head = &bb->e_succs;
+		} else {
+			ei.edge = first_edge(&bb->e_prevs);
+			ei.list_head = &bb->e_prevs;
+		}
+
+		while (!ei_end_p(&ei)) {
+			struct bb_node *b;
+			u16 k1;
+
+			e = ei.edge;
+			if (reverse)
+				b = e->dst;
+			else
+				b = e->src;
+			ei_next(&ei);
+
+			if (b == entry_bb)
+				k1 = di->dfs_order[entry_bb_fake_idx];
+			else
+				k1 = di->dfs_order[b->idx];
+
+			if (k1 > idx)
+				k1 = di->sdom[eval(di, k1)];
+			if (k1 < k)
+				k = k1;
+		}
+
+		di->sdom[idx] = k;
+		link(di, par, idx);
+		di->next_bucket[idx] = di->bucket[k];
+		di->bucket[k] = idx;
+
+		for (w = di->bucket[par]; w; w = di->next_bucket[w]) {
+			k = eval(di, w);
+			if (di->sdom[k] < di->sdom[w])
+				di->idom[w] = k;
+			else
+				di->idom[w] = par;
+		}
+		di->bucket[par] = 0;
+		idx--;
+	}
+
+	di->idom[1] = 0;
+	for (idx = 2; idx <= di->dfsnum - 1; idx++)
+		if (di->idom[idx] != di->sdom[idx])
+			di->idom[idx] = di->idom[di->idom[idx]];
+}
+
+static int calc_dfs_tree(struct bpf_subprog_info *subprog, struct dom_info *di,
+			 bool reverse)
+{
+	u16 bb_num = subprog->bb_num, sp = 0, idx, parent_idx;
+	struct list_head *bb_list = &subprog->bbs;
+	u16 entry_bb_fake_idx = bb_num - 2;
+	struct bb_node *entry_bb, *exit_bb;
+	struct edge_iter ei, *stack;
+	struct edge_node *e;
+
+	di->dfs_order[entry_bb_fake_idx] = di->dfsnum;
+
+	stack = kmalloc_array(bb_num - 1, sizeof(struct edge_iter), GFP_KERNEL);
+	if (!stack)
+		return -ENOMEM;
+
+	if (reverse) {
+		entry_bb = exit_bb(bb_list);
+		exit_bb = entry_bb(bb_list);
+		di->dfs_to_bb[di->dfsnum++] = exit_bb;
+		ei.edge = first_edge(&entry_bb->e_prevs);
+		ei.list_head = &entry_bb->e_prevs;
+	} else {
+		entry_bb = entry_bb(bb_list);
+		exit_bb = exit_bb(bb_list);
+		di->dfs_to_bb[di->dfsnum++] = entry_bb;
+		ei.edge = first_edge(&entry_bb->e_succs);
+		ei.list_head = &entry_bb->e_succs;
+	}
+
+	while (1) {
+		struct bb_node *bb_dst, *bb_src;
+
+		while (!ei_end_p(&ei)) {
+			e = ei.edge;
+
+			if (reverse) {
+				bb_dst = e->src;
+				if (bb_dst == exit_bb ||
+				    di->dfs_order[bb_dst->idx]) {
+					ei_next(&ei);
+					continue;
+				}
+				bb_src = e->dst;
+			} else {
+				bb_dst = e->dst;
+				if (bb_dst == exit_bb ||
+				    di->dfs_order[bb_dst->idx]) {
+					ei_next(&ei);
+					continue;
+				}
+				bb_src = e->src;
+			}
+
+			if (bb_src != entry_bb)
+				parent_idx = di->dfs_order[bb_src->idx];
+			else
+				parent_idx = di->dfs_order[entry_bb_fake_idx];
+
+			idx = di->dfsnum++;
+			di->dfs_order[bb_dst->idx] = idx;
+			di->dfs_to_bb[idx] = bb_dst;
+			di->dfs_parent[idx] = parent_idx;
+
+			stack[sp++] = ei;
+			if (reverse) {
+				ei.edge = first_edge(&bb_dst->e_prevs);
+				ei.list_head = &bb_dst->e_prevs;
+			} else {
+				ei.edge = first_edge(&bb_dst->e_succs);
+				ei.list_head = &bb_dst->e_succs;
+			}
+		}
+
+		if (!sp)
+			break;
+
+		ei = stack[--sp];
+		ei_next(&ei);
+	}
+
+	kfree(stack);
+
+	return 0;
+}
+
+static int idoms_to_doms(struct bpf_subprog_info *subprog, struct dom_info *di)
+{
+	int bb_num, i, end_index, bb, bb_idom, lane_len;
+	unsigned long *bitmap;
+
+	bb_num = subprog->bb_num - 2;
+	lane_len = BITS_TO_LONGS(bb_num);
+	bitmap = kcalloc(bb_num, lane_len * sizeof(long), GFP_KERNEL);
+	subprog->dtree = bitmap;
+	if (!subprog->dtree)
+		return -ENOMEM;
+	subprog->dtree_avail = true;
+	end_index = di->dfs_order[bb_num];
+
+	for (i = 1; i <= di->dfsnum - 1; i++) {
+		if (i == end_index)
+			continue;
+
+		bb = di->dfs_to_bb[i]->idx;
+		if (di->idom[i] && di->idom[i] != end_index) {
+			bb_idom = di->dfs_to_bb[di->idom[i]]->idx;
+			bitmap_copy(bitmap + bb * lane_len,
+				    bitmap + bb_idom * lane_len, bb_num);
+		}
+
+		bitmap_set(bitmap + bb * lane_len, bb, 1);
+	}
+
+	return 0;
+}
+
+/* Build domination information using Lengauer and Tarjan algorithm.
+ *
+ *   1. dfs on cfg to assign dfs-num for each node(bb).
+ *   2. calculate semi-dominator and calculate immediate dominator when
+ *      possible.
+ *   3. calculate immediate dominator not finished during step 2.
+ *   4. build domination bitmap using immediate dominator information.
+ *
+ * See:
+ *   A fast algorithm for finding dominators in a flowgraph.
+ *     - 1979, by T Lengauer and R Tarjan
+ *
+ *   Especially, Appendix B: The Complete Dominators Algorithms.
+ *
+ * The implementation also referenced GNU GCC 3.0.
+ */
+
+int subprog_build_dom_info(struct bpf_subprog_info *subprog)
+{
+	struct dom_info di;
+	int ret;
+
+	ret = init_dom_info(subprog, &di);
+	if (ret < 0)
+		goto free_dominfo;
+
+	ret = calc_dfs_tree(subprog, &di, false);
+	if (ret < 0)
+		goto free_dominfo;
+
+	calc_idoms(subprog, &di, false);
+	ret = idoms_to_doms(subprog, &di);
+	if (ret < 0)
+		goto free_dominfo;
+
+	ret = 0;
+
+free_dominfo:
+	if (di.dfs_parent)
+		kfree(di.dfs_parent);
+
+	return ret;
+}
+
 static void subprog_free_edge(struct bb_node *bb)
 {
 	struct list_head *succs = &bb->e_succs;
@@ -199,7 +580,7 @@ static void subprog_free_edge(struct bb_node *bb)
 	}
 }
 
-void subprog_free_bb(struct bpf_subprog_info *subprog, int end_idx)
+void subprog_free(struct bpf_subprog_info *subprog, int end_idx)
 {
 	int i = 0;
 
@@ -214,5 +595,8 @@ void subprog_free_bb(struct bpf_subprog_info *subprog, int end_idx)
 			list_del(&bb->l);
 			kfree(bb);
 		}
+
+		if (subprog[i].dtree_avail)
+			kfree(subprog[i].dtree);
 	}
 }
diff --git a/kernel/bpf/cfg.h b/kernel/bpf/cfg.h
index 7a9d5dd..cbb44f2 100644
--- a/kernel/bpf/cfg.h
+++ b/kernel/bpf/cfg.h
@@ -10,8 +10,9 @@
 
 int subprog_add_bb_edges(struct bpf_insn *insns, struct list_head *bb_list);
 int subprog_append_bb(struct list_head *bb_list, int head);
+int subprog_build_dom_info(struct bpf_subprog_info *subprog);
 int subprog_fini_bb(struct list_head *bb_list, int subprog_end);
 int subprog_init_bb(struct list_head *bb_list, int subprog_start);
-void subprog_free_bb(struct bpf_subprog_info *subprog, int end_idx);
+void subprog_free(struct bpf_subprog_info *subprog, int end_idx);
 
 #endif
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 585e0a0..6543470 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -878,6 +878,7 @@ static int check_subprogs(struct bpf_verifier_env *env)
 			ret = subprog_add_bb_edges(insn, cur_bb_list);
 			if (ret < 0)
 				goto free_nodes;
+			subprog[cur_subprog].bb_num = ret;
 			subprog_start = subprog_end;
 			cur_subprog++;
 			if (cur_subprog < env->subprog_cnt) {
@@ -894,8 +895,8 @@ static int check_subprogs(struct bpf_verifier_env *env)
 	ret = 0;
 
 free_nodes:
-	subprog_free_bb(subprog, cur_subprog == env->subprog_cnt ?
-					cur_subprog - 1 : cur_subprog);
+	subprog_free(subprog, cur_subprog == env->subprog_cnt ?
+				cur_subprog - 1 : cur_subprog);
 	return ret;
 }
 
-- 
2.7.4

^ permalink raw reply related

* [RFC bpf-next 02/10] bpf: cfg: add edges between basic blocks to form CFG
From: Jiong Wang @ 2018-05-07 10:22 UTC (permalink / raw)
  To: alexei.starovoitov, daniel
  Cc: john.fastabend, netdev, oss-drivers, Jiong Wang
In-Reply-To: <1525688567-19618-1-git-send-email-jiong.wang@netronome.com>

This patch add edges between basic blocks. Both edges for predecessors and
successors are added.

Signed-off-by: Jiong Wang <jiong.wang@netronome.com>
---
 kernel/bpf/cfg.c      | 129 +++++++++++++++++++++++++++++++++++++++++++++++++-
 kernel/bpf/cfg.h      |   1 +
 kernel/bpf/verifier.c |   3 ++
 3 files changed, 131 insertions(+), 2 deletions(-)

diff --git a/kernel/bpf/cfg.c b/kernel/bpf/cfg.c
index b1af714..208aac7 100644
--- a/kernel/bpf/cfg.c
+++ b/kernel/bpf/cfg.c
@@ -11,8 +11,16 @@
 
 #include "cfg.h"
 
+struct edge_node {
+	struct list_head l;
+	struct bb_node *src;
+	struct bb_node *dst;
+};
+
 struct bb_node {
 	struct list_head l;
+	struct list_head e_prevs;
+	struct list_head e_succs;
 	u16 head;
 };
 
@@ -39,6 +47,8 @@ int subprog_append_bb(struct list_head *bb_list, int head)
 	if (!new_bb)
 		return -ENOMEM;
 
+	INIT_LIST_HEAD(&new_bb->e_prevs);
+	INIT_LIST_HEAD(&new_bb->e_succs);
 	new_bb->head = head;
 	list_add(&new_bb->l, &bb->l);
 
@@ -53,6 +63,8 @@ int subprog_fini_bb(struct list_head *bb_list, int subprog_end)
 		return -ENOMEM;
 	/* entry bb. */
 	bb->head = -1;
+	INIT_LIST_HEAD(&bb->e_prevs);
+	INIT_LIST_HEAD(&bb->e_succs);
 	list_add(&bb->l, bb_list);
 
 	bb = kzalloc(sizeof(*bb), GFP_KERNEL);
@@ -60,6 +72,8 @@ int subprog_fini_bb(struct list_head *bb_list, int subprog_end)
 		return -ENOMEM;
 	/* exit bb. */
 	bb->head = subprog_end;
+	INIT_LIST_HEAD(&bb->e_prevs);
+	INIT_LIST_HEAD(&bb->e_succs);
 	list_add_tail(&bb->l, bb_list);
 
 	return 0;
@@ -77,15 +91,126 @@ int subprog_init_bb(struct list_head *bb_list, int subprog_start)
 	return 0;
 }
 
+static struct bb_node *search_bb_with_head(struct list_head *bb_list, int head)
+{
+	struct bb_node *bb;
+
+	list_for_each_entry(bb, bb_list, l) {
+		if (bb->head == head)
+			return bb;
+	}
+
+	return NULL;
+}
+
+int subprog_add_bb_edges(struct bpf_insn *insns, struct list_head *bb_list)
+{
+	struct bb_node *bb, *exit_bb;
+	struct edge_node *edge;
+
+	bb = entry_bb(bb_list);
+	edge = kcalloc(2, sizeof(*edge), GFP_KERNEL);
+	if (!edge)
+		return -ENOMEM;
+	edge->src = bb;
+	edge->dst = bb_next(bb);
+	list_add_tail(&edge->l, &bb->e_succs);
+	edge[1].src = edge->src;
+	edge[1].dst = edge->dst;
+	list_add_tail(&edge[1].l, &edge[1].dst->e_prevs);
+
+	exit_bb = exit_bb(bb_list);
+	bb = bb_next(bb);
+	list_for_each_entry_from(bb, &exit_bb->l, l) {
+		bool has_fallthrough, only_has_fallthrough;
+		bool has_branch, only_has_branch;
+		struct bb_node *next_bb = bb_next(bb);
+		int tail = next_bb->head - 1;
+		struct bpf_insn insn;
+		u8 code;
+
+		edge = kcalloc(2, sizeof(*edge), GFP_KERNEL);
+		if (!edge)
+			return -ENOMEM;
+		edge->src = bb;
+		edge[1].src = bb;
+
+		insn = insns[tail];
+		code = insn.code;
+		only_has_fallthrough = BPF_CLASS(code) != BPF_JMP ||
+				       BPF_OP(code) == BPF_EXIT;
+		has_fallthrough = only_has_fallthrough ||
+				  (BPF_CLASS(code) == BPF_JMP &&
+				   BPF_OP(code) != BPF_CALL &&
+				   BPF_OP(code) != BPF_JA);
+		only_has_branch = BPF_CLASS(code) == BPF_JMP &&
+				  BPF_OP(code) == BPF_JA;
+		has_branch = only_has_branch ||
+			     (BPF_CLASS(code) == BPF_JMP &&
+			      BPF_OP(code) != BPF_EXIT &&
+			      BPF_OP(code) != BPF_CALL);
+
+		if (has_fallthrough) {
+			if (BPF_CLASS(code) == BPF_JMP &&
+			    BPF_OP(code) == BPF_EXIT)
+				next_bb = exit_bb;
+			edge->dst = next_bb;
+			edge[1].dst = next_bb;
+			list_add_tail(&edge->l, &bb->e_succs);
+			list_add_tail(&edge[1].l, &edge[1].dst->e_prevs);
+			edge = NULL;
+		}
+
+		if (has_branch) {
+			struct bb_node *tgt;
+
+			if (!edge) {
+				edge = kcalloc(2, sizeof(*edge), GFP_KERNEL);
+				if (!edge)
+					return -ENOMEM;
+				edge->src = bb;
+				edge[1].src = bb;
+			}
+
+			tgt = search_bb_with_head(bb_list,
+						  tail + insn.off + 1);
+			if (!tgt)
+				return -EINVAL;
+
+			edge->dst = tgt;
+			edge[1].dst = tgt;
+			list_add_tail(&edge->l, &bb->e_succs);
+			list_add_tail(&edge[1].l, &tgt->e_prevs);
+		}
+	}
+
+	return 0;
+}
+
+static void subprog_free_edge(struct bb_node *bb)
+{
+	struct list_head *succs = &bb->e_succs;
+	struct edge_node *e, *tmp;
+
+	/* prevs and succs are allocated as pair, succs is the start addr. */
+	list_for_each_entry_safe(e, tmp, succs, l) {
+		list_del(&e->l);
+		kfree(e);
+	}
+}
+
 void subprog_free_bb(struct bpf_subprog_info *subprog, int end_idx)
 {
 	int i = 0;
 
 	for (; i <= end_idx; i++) {
 		struct list_head *bbs = &subprog[i].bbs;
-		struct bb_node *bb, *tmp;
+		struct bb_node *bb, *tmp, *exit;
 
-		list_for_each_entry_safe(bb, tmp, bbs, l) {
+		bb = entry_bb(bbs);
+		exit = exit_bb(bbs);
+		list_for_each_entry_safe_from(bb, tmp, &exit->l, l) {
+			subprog_free_edge(bb);
 			list_del(&bb->l);
 			kfree(bb);
 		}
diff --git a/kernel/bpf/cfg.h b/kernel/bpf/cfg.h
index 4092145..7a9d5dd 100644
--- a/kernel/bpf/cfg.h
+++ b/kernel/bpf/cfg.h
@@ -8,6 +8,7 @@
 #ifndef __BPF_CFG_H__
 #define __BPF_CFG_H__
 
+int subprog_add_bb_edges(struct bpf_insn *insns, struct list_head *bb_list);
 int subprog_append_bb(struct list_head *bb_list, int head);
 int subprog_fini_bb(struct list_head *bb_list, int subprog_end);
 int subprog_init_bb(struct list_head *bb_list, int subprog_start);
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 9c3eeef..585e0a0 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -875,6 +875,9 @@ static int check_subprogs(struct bpf_verifier_env *env)
 			ret = subprog_fini_bb(cur_bb_list, subprog_end);
 			if (ret < 0)
 				goto free_nodes;
+			ret = subprog_add_bb_edges(insn, cur_bb_list);
+			if (ret < 0)
+				goto free_nodes;
 			subprog_start = subprog_end;
 			cur_subprog++;
 			if (cur_subprog < env->subprog_cnt) {
-- 
2.7.4

^ permalink raw reply related

* [RFC bpf-next 01/10] bpf: cfg: partition basic blocks for each subprog
From: Jiong Wang @ 2018-05-07 10:22 UTC (permalink / raw)
  To: alexei.starovoitov, daniel
  Cc: john.fastabend, netdev, oss-drivers, Jiong Wang
In-Reply-To: <1525688567-19618-1-git-send-email-jiong.wang@netronome.com>

"check_subprogs" has partitioned subprogs and is doing insn scan for each
subprog already, this patch extends it to also partition basic blocks (BB)
for each subprog.

  - The first insn for each subprog start a BB.
  - Branch (both cond and absolute) target start a BB.
  - Insn immediately follows branch insn start a BB.
    Insn immediately follows exit and within subprog start a BB.

BBs for each subprog are organized as a list in ascending head.

Two special BBs, entry and exit are added as well.

Signed-off-by: Jiong Wang <jiong.wang@netronome.com>
---
 include/linux/bpf_verifier.h |  1 +
 kernel/bpf/Makefile          |  2 +-
 kernel/bpf/cfg.c             | 93 ++++++++++++++++++++++++++++++++++++++++++++
 kernel/bpf/cfg.h             | 16 ++++++++
 kernel/bpf/verifier.c        | 57 ++++++++++++++++++++++++---
 5 files changed, 162 insertions(+), 7 deletions(-)
 create mode 100644 kernel/bpf/cfg.c
 create mode 100644 kernel/bpf/cfg.h

diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h
index 8f70dc1..404a5d8 100644
--- a/include/linux/bpf_verifier.h
+++ b/include/linux/bpf_verifier.h
@@ -176,6 +176,7 @@ static inline bool bpf_verifier_log_needed(const struct bpf_verifier_log *log)
 struct bpf_subprog_info {
 	u32 start; /* insn idx of function entry point */
 	u16 stack_depth; /* max. stack depth used by this function */
+	struct list_head bbs; /* basic blocks list. */
 };
 
 /* single container for all structs
diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile
index 35c485f..b4542d8 100644
--- a/kernel/bpf/Makefile
+++ b/kernel/bpf/Makefile
@@ -1,7 +1,7 @@
 # SPDX-License-Identifier: GPL-2.0
 obj-y := core.o
 
-obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o tnum.o
+obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o tnum.o cfg.o
 obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o bpf_lru_list.o lpm_trie.o map_in_map.o
 obj-$(CONFIG_BPF_SYSCALL) += disasm.o
 obj-$(CONFIG_BPF_SYSCALL) += btf.o
diff --git a/kernel/bpf/cfg.c b/kernel/bpf/cfg.c
new file mode 100644
index 0000000..b1af714
--- /dev/null
+++ b/kernel/bpf/cfg.c
@@ -0,0 +1,93 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* Copyright (C) 2018 Netronome Systems, Inc. */
+/* This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ */
+
+#include <linux/bpf_verifier.h>
+#include <linux/list.h>
+#include <linux/slab.h>
+
+#include "cfg.h"
+
+struct bb_node {
+	struct list_head l;
+	u16 head;
+};
+
+#define bb_prev(bb)		list_prev_entry(bb, l)
+#define bb_next(bb)		list_next_entry(bb, l)
+#define bb_first(bb_list)	list_first_entry(bb_list, struct bb_node, l)
+#define bb_last(bb_list)	list_last_entry(bb_list, struct bb_node, l)
+#define entry_bb(bb_list)	bb_first(bb_list)
+#define exit_bb(bb_list)	bb_last(bb_list)
+
+int subprog_append_bb(struct list_head *bb_list, int head)
+{
+	struct bb_node *new_bb, *bb;
+
+	list_for_each_entry(bb, bb_list, l) {
+		if (bb->head == head)
+			return 0;
+		else if (bb->head > head)
+			break;
+	}
+
+	bb = bb_prev(bb);
+	new_bb = kzalloc(sizeof(*new_bb), GFP_KERNEL);
+	if (!new_bb)
+		return -ENOMEM;
+
+	new_bb->head = head;
+	list_add(&new_bb->l, &bb->l);
+
+	return 0;
+}
+
+int subprog_fini_bb(struct list_head *bb_list, int subprog_end)
+{
+	struct bb_node *bb = kzalloc(sizeof(*bb), GFP_KERNEL);
+
+	if (!bb)
+		return -ENOMEM;
+	/* entry bb. */
+	bb->head = -1;
+	list_add(&bb->l, bb_list);
+
+	bb = kzalloc(sizeof(*bb), GFP_KERNEL);
+	if (!bb)
+		return -ENOMEM;
+	/* exit bb. */
+	bb->head = subprog_end;
+	list_add_tail(&bb->l, bb_list);
+
+	return 0;
+}
+
+int subprog_init_bb(struct list_head *bb_list, int subprog_start)
+{
+	int ret;
+
+	INIT_LIST_HEAD(bb_list);
+	ret = subprog_append_bb(bb_list, subprog_start);
+	if (ret < 0)
+		return ret;
+
+	return 0;
+}
+
+void subprog_free_bb(struct bpf_subprog_info *subprog, int end_idx)
+{
+	int i = 0;
+
+	for (; i <= end_idx; i++) {
+		struct list_head *bbs = &subprog[i].bbs;
+		struct bb_node *bb, *tmp;
+
+		list_for_each_entry_safe(bb, tmp, bbs, l) {
+			list_del(&bb->l);
+			kfree(bb);
+		}
+	}
+}
diff --git a/kernel/bpf/cfg.h b/kernel/bpf/cfg.h
new file mode 100644
index 0000000..4092145
--- /dev/null
+++ b/kernel/bpf/cfg.h
@@ -0,0 +1,16 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* Copyright (C) 2018 Netronome Systems, Inc. */
+/* This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ */
+
+#ifndef __BPF_CFG_H__
+#define __BPF_CFG_H__
+
+int subprog_append_bb(struct list_head *bb_list, int head);
+int subprog_fini_bb(struct list_head *bb_list, int subprog_end);
+int subprog_init_bb(struct list_head *bb_list, int subprog_start);
+void subprog_free_bb(struct bpf_subprog_info *subprog, int end_idx);
+
+#endif
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 65a6e2e..9c3eeef 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -23,6 +23,7 @@
 #include <linux/bsearch.h>
 #include <linux/sort.h>
 
+#include "cfg.h"
 #include "disasm.h"
 
 static const struct bpf_verifier_ops * const bpf_verifier_ops[] = {
@@ -782,6 +783,7 @@ static int check_subprogs(struct bpf_verifier_env *env)
 	struct bpf_subprog_info *subprog = env->subprog_info;
 	struct bpf_insn *insn = env->prog->insnsi;
 	int insn_cnt = env->prog->len;
+	struct list_head *cur_bb_list;
 
 	/* Add entry function. */
 	ret = add_subprog(env, 0);
@@ -816,20 +818,46 @@ static int check_subprogs(struct bpf_verifier_env *env)
 		for (i = 0; i < env->subprog_cnt; i++)
 			verbose(env, "func#%d @%d\n", i, subprog[i].start);
 
-	/* now check that all jumps are within the same subprog */
 	subprog_start = subprog[cur_subprog].start;
 	subprog_end = subprog[cur_subprog + 1].start;
+	cur_bb_list = &subprog[cur_subprog].bbs;
+	ret = subprog_init_bb(cur_bb_list, subprog_start);
+	if (ret < 0)
+		goto free_nodes;
+	/* now check that all jumps are within the same subprog */
 	for (i = 0; i < insn_cnt; i++) {
 		u8 code = insn[i].code;
 
 		if (BPF_CLASS(code) != BPF_JMP)
 			goto next;
-		if (BPF_OP(code) == BPF_EXIT || BPF_OP(code) == BPF_CALL)
+
+		if (BPF_OP(code) == BPF_EXIT) {
+			if (i + 1 < subprog_end) {
+				ret = subprog_append_bb(cur_bb_list, i + 1);
+				if (ret < 0)
+					goto free_nodes;
+			}
+			goto next;
+		}
+
+		if (BPF_OP(code) == BPF_CALL)
 			goto next;
+
 		off = i + insn[i].off + 1;
 		if (off < subprog_start || off >= subprog_end) {
 			verbose(env, "jump out of range from insn %d to %d\n", i, off);
-			return -EINVAL;
+			ret = -EINVAL;
+			goto free_nodes;
+		}
+
+		ret = subprog_append_bb(cur_bb_list, off);
+		if (ret < 0)
+			goto free_nodes;
+
+		if (i + 1 < subprog_end) {
+			ret = subprog_append_bb(cur_bb_list, i + 1);
+			if (ret < 0)
+				goto free_nodes;
 		}
 next:
 		if (i == subprog_end - 1) {
@@ -840,15 +868,32 @@ static int check_subprogs(struct bpf_verifier_env *env)
 			if (code != (BPF_JMP | BPF_EXIT) &&
 			    code != (BPF_JMP | BPF_JA)) {
 				verbose(env, "last insn is not an exit or jmp\n");
-				return -EINVAL;
+				ret = -EINVAL;
+				goto free_nodes;
 			}
+
+			ret = subprog_fini_bb(cur_bb_list, subprog_end);
+			if (ret < 0)
+				goto free_nodes;
 			subprog_start = subprog_end;
 			cur_subprog++;
-			if (cur_subprog < env->subprog_cnt)
+			if (cur_subprog < env->subprog_cnt) {
 				subprog_end = subprog[cur_subprog + 1].start;
+				cur_bb_list = &subprog[cur_subprog].bbs;
+				ret = subprog_init_bb(cur_bb_list,
+						      subprog_start);
+				if (ret < 0)
+					goto free_nodes;
+			}
 		}
 	}
-	return 0;
+
+	ret = 0;
+
+free_nodes:
+	subprog_free_bb(subprog, cur_subprog == env->subprog_cnt ?
+					cur_subprog - 1 : cur_subprog);
+	return ret;
 }
 
 static
-- 
2.7.4

^ permalink raw reply related

* [RFC bpf-next 00/10] initial control flow support for eBPF verifier
From: Jiong Wang @ 2018-05-07 10:22 UTC (permalink / raw)
  To: alexei.starovoitov, daniel
  Cc: john.fastabend, netdev, oss-drivers, Jiong Wang

This patch introduce initial control flow infrastructure to eBPF verifier.
Various control flow information are built in an incremental way, so they
could be easily maintained at various level.

After this patch set, information collection on eBPF insns are centred at
check_subprogs, check_cfg and related code then are replaced by new
infrastructure.

Checks/Analysis offered by check_cfg are still offered by new
infrastructure. For example loop detection, recursive function call,
unreachable insn, prune heuristic marking.

This infrastructure comes with some memory usage and execution time cost.
Memory pool based allocation is used to avoid frequently calling of
kmalloc/free. Singly linked list and compact pointer are used to reduce
various cfg node size.

data structure size
===
  basic blocks     edge        call-graph edge
  16-bytes         24-bytes    6-bytes

memory usage (test_l4lb_noinline, test_xdp_noinline)
====
(counted all subprogs):

test_l4lb_noinline:

  cfg info:  597 insns, 12 subprogs
             107 basic-blocks, 278 edges, 12 call edges.

  mem usage: 9216(9K)-bytes for memory pool (basic-blocks/edges/call-edges)
             644-bytes for dominator trees.

test_xdp_noinline:

  cfg info:  1029 insns, 21 subprogs
             191 basic-basics, 502 edges, 23 call edges.

  mem usage: 15360(15K)-bytes for memory pool.
             1192-bytes for dominator trees.

The existing check_cfg is using two auxiliary arrays (insn_state and
insn_stack), insn_cnt * sizeof(int) bytes for each, so would use ~5K and
~8K accordingly.  (looks to me insn_state could use 1-byte element,
insn_stack could use 2-byte, so probably could reduced to ~2K and ~3K).

The existing check_cfg is using 8-bytes for each insn during cfg check.
The new infrastructure basically use 2x mems, 16-bytes for each insn.

execution time
===
test_l4lb_noinline:
  existing check_subprog/check_cfg: ~55000 ns
  new infrastructure: ~135000 ns

test_xdp_noinline:
  existing check_subprog/check_cfg: ~52000 ns
  new infrastructure: ~120000 ns

The control flow infrastructure could possibly lay the ground for further
static analysis inside eBPF verifier, for example bounded loop detection,
path-sensitive data-flow analysis etc.

Jiong Wang (10):
  bpf: cfg: partition basic blocks for each subprog
  bpf: cfg: add edges between basic blocks to form CFG
  bpf: cfg: build domination tree using Tarjan algorithm
  bpf: cfg: detect loop use domination information
  bpf: cfg: detect unreachable basic blocks
  bpf: cfg: move find_subprog/add_subprog to cfg.c
  bpf: cfg: build call graph and detect unreachable/recursive call
  bpf: cfg: remove push_insn and check_cfg
  bpf: cfg: reduce k*alloc/free call by using memory pool for allocating
    nodes
  bpf: cfg: reduce memory usage by using singly list + compat pointer

 include/linux/bpf_verifier.h |   8 +
 kernel/bpf/Makefile          |   2 +-
 kernel/bpf/cfg.c             | 956 +++++++++++++++++++++++++++++++++++++++++++
 kernel/bpf/cfg.h             |  42 ++
 kernel/bpf/verifier.c        | 386 ++++++-----------
 5 files changed, 1131 insertions(+), 263 deletions(-)
 create mode 100644 kernel/bpf/cfg.c
 create mode 100644 kernel/bpf/cfg.h

-- 
2.7.4

^ permalink raw reply

* Re: WARNING in add_uevent_var
From: Tetsuo Handa @ 2018-05-07 10:13 UTC (permalink / raw)
  To: Johannes Berg, syzbot, syzkaller-bugs
  Cc: davem, linux-kernel, linux-wireless, netdev
In-Reply-To: <1522758845.2961.3.camel@sipsolutions.net>

On 2018/04/03 21:34, Johannes Berg wrote:
> On Sun, 2018-04-01 at 23:01 -0700, syzbot wrote:
> 
>> So far this crash happened 5 times on net-next, upstream.
>> C reproducer: https://syzkaller.appspot.com/x/repro.c?id=6614377067184128
>>
> 
> Huh, fun. Looks like you're basically creating a new HWSIM radio with an
> insanely long name (4k!) and nothing stops you, until we try to generate
> an rfkill instance which sends a uevent and only has a 2k buffer for the
> environment variables, where we put the name ...
> 
> But yeah, we should probably limit the phy name to something sane, I'll
> pick 128 and send a patch.
> 

I think it should be NAME_MAX.

----------------------------------------
#include <stdint.h>
#include <string.h>
#include <unistd.h>
#include <sys/socket.h>
#include <linux/netlink.h>

int main(int argc, char *argv[])
{
	const int fd = socket(PF_NETLINK, SOCK_RAW, NETLINK_GENERIC);
	static struct {
		struct nlmsghdr hdr;
		char data[4120 - sizeof(struct nlmsghdr)];
	} buf = { };
	struct sockaddr_nl addr = { .nl_family = AF_NETLINK };
	struct iovec iov = { &buf, sizeof(buf) };
	struct msghdr msg = {
		.msg_name = &addr,
		.msg_namelen = sizeof(addr),
		.msg_iov = &iov,
		.msg_iovlen = 1,
	};
	buf.hdr.nlmsg_len = 0x1018;
	buf.hdr.nlmsg_type = 0x22;
	buf.hdr.nlmsg_flags = 0x109;
	*(uint8_t*) buf.data = 4;
	*(uint8_t*) (buf.data + 1) = 0;
	*(uint16_t*) (buf.data + 2) = 0;
	*(uint16_t*) (buf.data + 4) = 0x1004;
	*(uint16_t*) (buf.data + 6) = 0x11;
	memset(buf.data + 8, 'A', 4096); /* strlen() > NAME_MAX */
	sendmsg(fd, &msg, 0);
	return 0;
}
----------------------------------------



>From 3eba6541da0c7338e3d71ea83cbc69962923d65e Mon Sep 17 00:00:00 2001
From: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Date: Mon, 7 May 2018 15:58:37 +0900
Subject: [PATCH] net: rfkill: Add filename varidity test at rfkill_alloc().

syzbot is hitting WARN() at kobject_uevent_env() [1].
This is because the test case is requesting too long name.
Since the name is used as part of pathname under /sys/devices/virtual/ ,
this name parameter must obey the limitations of a filename.
Fix this by applying name validity test to rfkill_alloc().

[1] https://syzkaller.appspot.com/bug?id=c1e1ab1d042b637ee9e25c64b107d51630b9d9ec

Signed-off-by: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Reported-by: syzbot <syzbot+230d9e642a85d3fec29c@syzkaller.appspotmail.com>
Cc: Johannes Berg <johannes@sipsolutions.net>
---
 net/rfkill/core.c | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/net/rfkill/core.c b/net/rfkill/core.c
index 59d0eb9..d8beebc 100644
--- a/net/rfkill/core.c
+++ b/net/rfkill/core.c
@@ -933,6 +933,10 @@ struct rfkill * __must_check rfkill_alloc(const char *name,
 	if (WARN_ON(type == RFKILL_TYPE_ALL || type >= NUM_RFKILL_TYPES))
 		return NULL;
 
+	if (strlen(name) > NAME_MAX || strchr(name, '/') ||
+	    !strcmp(name, ".") || !strcmp(name, ".."))
+		return NULL;
+
 	rfkill = kzalloc(sizeof(*rfkill) + strlen(name) + 1, GFP_KERNEL);
 	if (!rfkill)
 		return NULL;
-- 
1.8.3.1

^ permalink raw reply related

* Re: WARNING in kernfs_add_one
From: Tetsuo Handa @ 2018-05-07 10:10 UTC (permalink / raw)
  To: Greg KH, Eric Dumazet, syzbot
  Cc: linux-wireless, netdev, linux-kernel, syzkaller-bugs, tj,
	Johannes Berg
In-Reply-To: <20180505220721.GA10829@kroah.com>

On 2018/05/06 7:07, Greg KH wrote:
>> More likely wireless territory
> 
> Ugh, that's what I get for writing emails before coffee in the
> morning...
> 
> Yes, you are right, this looks like a wireless issue.
> 
> Now cc: linux-wireless.
> 
Nope, if you look at previous fault injection messages...



>From 7ddcaa3d4327d4f29d11053bd2011bf77ecf72af Mon Sep 17 00:00:00 2001
From: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Date: Mon, 7 May 2018 14:19:50 +0900
Subject: [PATCH] driver core: Don't ignore class_dir_create_and_add() failure.

syzbot is hitting WARN() at kernfs_add_one() [1].
This is because kernfs_create_link() is confused by previous device_add()
call which continued without setting dev->kobj.parent field when
get_device_parent() failed by memory allocation fault injection.
Fix this by propagating the error from class_dir_create_and_add() to
the calllers of get_device_parent().

[1] https://syzkaller.appspot.com/bug?id=fae0fb607989ea744526d1c082a5b8de6529116f

Signed-off-by: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Reported-by: syzbot <syzbot+df47f81c226b31d89fb1@syzkaller.appspotmail.com>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/base/core.c | 14 ++++++++++++--
 1 file changed, 12 insertions(+), 2 deletions(-)

diff --git a/drivers/base/core.c b/drivers/base/core.c
index b610816..d680fd0 100644
--- a/drivers/base/core.c
+++ b/drivers/base/core.c
@@ -1467,7 +1467,7 @@ class_dir_create_and_add(struct class *class, struct kobject *parent_kobj)
 
 	dir = kzalloc(sizeof(*dir), GFP_KERNEL);
 	if (!dir)
-		return NULL;
+		return ERR_PTR(-ENOMEM);
 
 	dir->class = class;
 	kobject_init(&dir->kobj, &class_dir_ktype);
@@ -1477,7 +1477,7 @@ class_dir_create_and_add(struct class *class, struct kobject *parent_kobj)
 	retval = kobject_add(&dir->kobj, parent_kobj, "%s", class->name);
 	if (retval < 0) {
 		kobject_put(&dir->kobj);
-		return NULL;
+		return ERR_PTR(retval);
 	}
 	return &dir->kobj;
 }
@@ -1784,6 +1784,10 @@ int device_add(struct device *dev)
 
 	parent = get_device(dev->parent);
 	kobj = get_device_parent(dev, parent);
+	if (IS_ERR(kobj)) {
+		error = PTR_ERR(kobj);
+		goto parent_error;
+	}
 	if (kobj)
 		dev->kobj.parent = kobj;
 
@@ -1882,6 +1886,7 @@ int device_add(struct device *dev)
 	kobject_del(&dev->kobj);
  Error:
 	cleanup_glue_dir(dev, glue_dir);
+parent_error:
 	put_device(parent);
 name_error:
 	kfree(dev->p);
@@ -2701,6 +2706,11 @@ int device_move(struct device *dev, struct device *new_parent,
 	device_pm_lock();
 	new_parent = get_device(new_parent);
 	new_parent_kobj = get_device_parent(dev, new_parent);
+	if (IS_ERR(new_parent_kobj)) {
+		error = PTR_ERR(new_parent_kobj);
+		put_device(new_parent);
+		goto out;
+	}
 
 	pr_debug("device: '%s': %s: moving to '%s'\n", dev_name(dev),
 		 __func__, new_parent ? dev_name(new_parent) : "<NULL>");
-- 
1.8.3.1

^ permalink raw reply related

* [PATCH net-next] flow_dissector: do not rely on implicit casts
From: Paolo Abeni @ 2018-05-07 10:06 UTC (permalink / raw)
  To: netdev; +Cc: David S. Miller

This change fixes a couple of type mismatch reported by the sparse
tool, explicitly using the requested type for the offending arguments.

Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 include/net/tipc.h        | 4 ++--
 net/core/flow_dissector.c | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/include/net/tipc.h b/include/net/tipc.h
index 07670ec022a7..f0e7e6bc1bef 100644
--- a/include/net/tipc.h
+++ b/include/net/tipc.h
@@ -44,11 +44,11 @@ struct tipc_basic_hdr {
 	__be32 w[4];
 };
 
-static inline u32 tipc_hdr_rps_key(struct tipc_basic_hdr *hdr)
+static inline __be32 tipc_hdr_rps_key(struct tipc_basic_hdr *hdr)
 {
 	u32 w0 = ntohl(hdr->w[0]);
 	bool keepalive_msg = (w0 & KEEPALIVE_MSG_MASK) == KEEPALIVE_MSG_MASK;
-	int key;
+	__be32 key;
 
 	/* Return source node identity as key */
 	if (likely(!keepalive_msg))
diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c
index 030d4ca177fb..4fc1e84d77ec 100644
--- a/net/core/flow_dissector.c
+++ b/net/core/flow_dissector.c
@@ -1316,7 +1316,7 @@ u32 skb_get_poff(const struct sk_buff *skb)
 {
 	struct flow_keys_basic keys;
 
-	if (!skb_flow_dissect_flow_keys_basic(skb, &keys, 0, 0, 0, 0, 0))
+	if (!skb_flow_dissect_flow_keys_basic(skb, &keys, NULL, 0, 0, 0, 0))
 		return 0;
 
 	return __skb_get_poff(skb, skb->data, &keys, skb_headlen(skb));
-- 
2.14.3

^ permalink raw reply related

* Re: [PATCH v2 net-next] net: stmmac: Add support for U32 TC filter using Flexible RX Parser
From: Jose Abreu @ 2018-05-07 10:02 UTC (permalink / raw)
  To: Jakub Kicinski, Jose Abreu
  Cc: netdev, David S. Miller, Joao Pinto, Vitor Soares,
	Giuseppe Cavallaro, Alexandre Torgue
In-Reply-To: <20180504183338.586f2903@cakuba.netronome.com>

Hi Jakub, David,

On 05-05-2018 02:33, Jakub Kicinski wrote:
> On Fri,  4 May 2018 10:01:38 +0100, Jose Abreu wrote:
>> This adds support for U32 filter by using an HW only feature called
>> Flexible RX Parser. This allow us to match any given packet field with a
>> pattern and accept/reject or even route the packet to a specific DMA
>> channel.
>>
>> Right now we only support acception or rejection of frame and we only
>> support simple rules. Though, the Parser has the flexibility of jumping to
>> specific rules as an if condition so complex rules can be established.
>>
>> This is only supported in GMAC5.10+.
>>
>> The following commands can be used to test this code:
>>
>> 	1) Setup an ingress qdisk:
>> 	# tc qdisc add dev eth0 handle ffff: ingress
>>
>> 	2) Setup a filter (e.g. filter by IP):
>> 	# tc filter add dev eth0 parent ffff: protocol ip u32 match ip \
>> 		src 192.168.0.3 skip_sw action drop
>>
>> In every tests performed we always used the "skip_sw" flag to make sure
>> only the RX Parser was involved.
>>
>> Signed-off-by: Jose Abreu <joabreu@synopsys.com>
>> Cc: David S. Miller <davem@davemloft.net>
>> Cc: Joao Pinto <jpinto@synopsys.com>
>> Cc: Vitor Soares <soares@synopsys.com>
>> Cc: Giuseppe Cavallaro <peppe.cavallaro@st.com>
>> Cc: Alexandre Torgue <alexandre.torgue@st.com>
>> Cc: Jakub Kicinski <kubakici@wp.pl>
>> ---
>> Changes from v1:
>> 	- Follow Linux network coding style (David)
>> 	- Use tc_cls_can_offload_and_chain0() (Jakub)
> Thanks!
>
>> @@ -4223,6 +4277,11 @@ int stmmac_dvr_probe(struct device *device,
>>  	ndev->hw_features = NETIF_F_SG | NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM |
>>  			    NETIF_F_RXCSUM;
>>  
>> +	ret = stmmac_tc_init(priv, priv);
>> +	if (!ret) {
>> +		ndev->hw_features |= NETIF_F_HW_TC;
>> +	}
>> +
>>  	if ((priv->plat->tso_en) && (priv->dma_cap.tsoen)) {
>>  		ndev->hw_features |= NETIF_F_TSO | NETIF_F_TSO6;
>>  		priv->tso = true;
> One more comment, but perhaps not a showstopper, it's considered good
> practice to disallow clearing/disabling this flag while filters are
> installed.  Driver should return -EBUSY from .ndo_set_features if TC
> rules are offloaded and user wants to disable HW_TC feature flag.

I can do that but I saw in Patchwork that patch was already
marked as accepted, David sent me no confirmation though.

David,
Shall I respin or send a follow up patch?

Thanks and Best Regards,
Jose Miguel Abreu

^ permalink raw reply

* Re: WARNING in kernfs_add_one
From: Johannes Berg @ 2018-05-07  9:53 UTC (permalink / raw)
  To: Dmitry Vyukov
  Cc: Greg KH, linux-wireless, Eric Dumazet, netdev, syzbot, LKML,
	syzkaller-bugs, Tejun Heo
In-Reply-To: <CACT4Y+bWW8w7RJ1roUQ8dDk5Fv0syDsKtMH1wRnkkN4nWOootQ@mail.gmail.com>

On Mon, 2018-05-07 at 11:33 +0200, Dmitry Vyukov wrote:
> On Mon, May 7, 2018 at 10:43 AM, Johannes Berg
> <johannes@sipsolutions.net> wrote:
> > On Sat, 2018-05-05 at 15:07 -0700, Greg KH wrote:
> > 
> > > > > > syzbot found the following crash on:
> > 
> > Maybe it should learn to differentiate warnings, if it's going to set
> > panic_on_warn :-)
> 
> How?
> Note that this is not specific to syzbot. If you see WARNINGs in a
> subsystem that you have no idea about (or you just a normal user),
> what do you do? Right, you report it to maintainers.

Yeah, no problem with that. Just some people seem to get so much more
upset about crashes ... but then again I get bug reports about WARN_ON
all the time anyway that say "my kernel crashed" so I guess it doesn't
really matter :-)

> > I get why, but still, at least differentiating in the emails wouldn't be
> > bad.
> 
> Well, the subject says "WARNING".
> But note there are _very_ bad WARNINGs too. Generally, a WARNING means
> a kernel bug just that kernel can tolerate without bringing the system
> down (as opposed to BUG).

Yeah, fair point. I sort of missed the subject I guess.

johannes

^ permalink raw reply

* [PATCH net-next] net:sched: add gkprio scheduler
From: Nishanth Devarajan @ 2018-05-07  9:36 UTC (permalink / raw)
  To: xiyou.wangcong, jiri, jhs, davem; +Cc: netdev, doucette, michel

net/sched: add gkprio scheduler

Gkprio (Gatekeeper Priority Queue) is a queueing discipline that prioritizes
IPv4 and IPv6 packets accordingly to their DSCP field. Although Gkprio can be
employed in any QoS scenario in which a higher DSCP field means a higher
priority packet, Gkprio was concieved as a solution for denial-of-service
defenses that need to route packets with different priorities.

Signed-off-by: Nishanth Devarajan <ndev2021@gmail.com>
Reviewed-by: Cody Doucette <doucette@bu.edu>
Reviewed-by: Michel Machado <michel@digirati.com.br>
Reviewed-by: Sachin Paryani <sachin.paryani@gmail.com>
---
 include/uapi/linux/pkt_sched.h |  11 ++
 net/sched/Kconfig              |  13 ++
 net/sched/Makefile             |   1 +
 net/sched/sch_gkprio.c         | 316 +++++++++++++++++++++++++++++++++++++++++
 4 files changed, 341 insertions(+)
 create mode 100644 net/sched/sch_gkprio.c

diff --git a/include/uapi/linux/pkt_sched.h b/include/uapi/linux/pkt_sched.h
index 37b5096..de8b5ca 100644
--- a/include/uapi/linux/pkt_sched.h
+++ b/include/uapi/linux/pkt_sched.h
@@ -124,6 +124,17 @@ struct tc_fifo_qopt {
 	__u32	limit;	/* Queue length: bytes for bfifo, packets for pfifo */
 };
 
+/* GKPRIO section */
+
+struct tc_gkprio_qopt {
+	__u32	limit; 	    	/* Queue length in packets. */
+	__u16	noip_dfltp; 	/* Default priority for non-IP packets. */
+
+	/* Stats. */
+	__u16 highest_prio; 	/* Highest priority currently in queue.  */
+	__u16 lowest_prio;  	/* Lowest priority currently in queue. */
+};
+
 /* PRIO section */
 
 #define TCQ_PRIO_BANDS	16
diff --git a/net/sched/Kconfig b/net/sched/Kconfig
index a01169f..9c47857 100644
--- a/net/sched/Kconfig
+++ b/net/sched/Kconfig
@@ -240,6 +240,19 @@ config NET_SCH_MQPRIO
 
 	  If unsure, say N.
 
+config NET_SCH_GKPRIO
+	tristate "Gatekeeper priority queue scheduler (GKPRIO)"
+	help
+	  Say Y here if you want to use the Gatekeeper priority queue
+	  scheduler. This schedules packets according to priorities based on
+	  the DSCP (IPv4) and DS (IPv6) fields, which is useful for request
+	  packets in DoS mitigation systems such as Gatekeeper.
+
+	  To compile this driver as a module, choose M here: the module will
+	  be called sch_gkprio.
+
+	  If unsure, say N.
+
 config NET_SCH_CHOKE
 	tristate "CHOose and Keep responsive flow scheduler (CHOKE)"
 	help
diff --git a/net/sched/Makefile b/net/sched/Makefile
index 8811d38..93a1fdb 100644
--- a/net/sched/Makefile
+++ b/net/sched/Makefile
@@ -46,6 +46,7 @@ obj-$(CONFIG_NET_SCH_NETEM)	+= sch_netem.o
 obj-$(CONFIG_NET_SCH_DRR)	+= sch_drr.o
 obj-$(CONFIG_NET_SCH_PLUG)	+= sch_plug.o
 obj-$(CONFIG_NET_SCH_MQPRIO)	+= sch_mqprio.o
+obj-$(CONFIG_NET_SCH_GKPRIO)	+= sch_gkprio.o
 obj-$(CONFIG_NET_SCH_CHOKE)	+= sch_choke.o
 obj-$(CONFIG_NET_SCH_QFQ)	+= sch_qfq.o
 obj-$(CONFIG_NET_SCH_CODEL)	+= sch_codel.o
diff --git a/net/sched/sch_gkprio.c b/net/sched/sch_gkprio.c
new file mode 100644
index 0000000..ad1227c
--- /dev/null
+++ b/net/sched/sch_gkprio.c
@@ -0,0 +1,316 @@
+/*
+ * net/sched/sch_gkprio.c  Gatekeeper Priority Queue.
+ *
+ *		This program is free software; you can redistribute it and/or
+ *		modify it under the terms of the GNU General Public License
+ *		as published by the Free Software Foundation; either version
+ *		2 of the License, or (at your option) any later version.
+ *
+ * Authors:	Nishanth Devarajan, <ndev_2021@gmail.com>
+ *	        original idea by Michel Machado, Cody Doucette, and Qiaobin Fu
+ */
+
+#include <linux/string.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/errno.h>
+#include <linux/skbuff.h>
+#include <net/pkt_sched.h>
+#include <net/sch_generic.h>
+#include <net/inet_ecn.h>
+
+/* Packets are assigned priorities [0, 63] due to the IP DSCP field limits. */
+#define GKPRIO_MAX_PRIORITY 64
+
+/*	  Gatekeeper Priority Queue
+ *	=================================
+ *
+ * This qdisc schedules a packet according to the value (0-63) of its DSCP
+ * (IPv4) or DS (IPv6) field, where a higher value places the packet closer
+ * to the exit of the queue. Non-IP packets are assigned a default priority
+ * specified to GKPRIO; if none is specified, default priority is set
+ * to 0. When the queue is full, the lowest priority packet in the queue is
+ * dropped to make room for the packet to be added if it has higher priority.
+ * If the packet to be added has lower priority than all packets in the queue,
+ * it is dropped.
+ *
+ * Without the Gatekeeper priority queue, queue length limits must be imposed
+ * for individual queues, and there is no easy way to enforce a global queue
+ * length limit across all priorities. With the Gatekeeper queue, a global
+ * queue length limit can be enforced while not restricting the queue lengths
+ * of individual priorities.
+ *
+ * This is especially useful for a denial-of-service defense system; like
+ * Gatekeeper, which prioritizes packets in flows that demonstrate expected
+ * behavior of legitimate users. The queue is flexible to allow any number
+ * of packets of any priority up to the global limit of the scheduler
+ * without risking resource overconsumption by a flood of low priority packets.
+ *
+ * The Gatekeper standalone codebase is found here:
+ *
+ *		https://github.com/AltraMayor/gatekeeper
+ */
+
+struct gkprio_sched_data {
+	/* Parameters. */
+	u32 max_limit;
+	u16 noip_dfltp;
+
+	/* Queue state. */
+	struct sk_buff_head qdiscs[GKPRIO_MAX_PRIORITY];
+	u16 highest_prio;
+	u16 lowest_prio;
+};
+
+static u16 calc_new_high_prio(const struct gkprio_sched_data *q)
+{
+	int prio;
+
+	for (prio = q->highest_prio - 1; prio >= q->lowest_prio; prio--) {
+		if (!skb_queue_empty(&q->qdiscs[prio]))
+			return prio;
+	}
+
+	/* GK queue is empty, return 0 (default highest priority setting). */
+	return 0;
+}
+
+static u16 calc_new_low_prio(const struct gkprio_sched_data *q)
+{
+	int prio;
+
+	for (prio = q->lowest_prio + 1; prio <= q->highest_prio; prio++) {
+		if (!skb_queue_empty(&q->qdiscs[prio]))
+			return prio;
+	}
+
+	/* GK queue is empty, return GKPRIO_MAX_PRIORITY - 1
+	 * (default lowest priority setting).
+	 */
+	return GKPRIO_MAX_PRIORITY - 1;
+}
+
+static int gkprio_enqueue(struct sk_buff *skb, struct Qdisc *sch,
+			  struct sk_buff **to_free)
+{
+	struct gkprio_sched_data *q = qdisc_priv(sch);
+	struct sk_buff_head *qdisc;
+	struct sk_buff_head *lp_qdisc;
+	struct sk_buff *to_drop;
+	int wlen;
+	u16 prio, lp;
+
+	/* Obtain the priority of @skb. */
+	wlen = skb_network_offset(skb);
+	switch (tc_skb_protocol(skb)) {
+	case htons(ETH_P_IP):
+		wlen += sizeof(struct iphdr);
+		if (!pskb_may_pull(skb, wlen))
+			goto drop;
+		prio = ipv4_get_dsfield(ip_hdr(skb)) >> 2;
+		break;
+
+	case htons(ETH_P_IPV6):
+		wlen += sizeof(struct ipv6hdr);
+		if (!pskb_may_pull(skb, wlen))
+			goto drop;
+		prio = ipv6_get_dsfield(ipv6_hdr(skb)) >> 2;
+		break;
+
+	default:
+		prio = q->noip_dfltp;
+		break;
+	}
+
+	qdisc = &q->qdiscs[prio];
+
+	if (sch->q.qlen < q->max_limit) {
+		__skb_queue_tail(qdisc, skb);
+		qdisc_qstats_backlog_inc(sch, skb);
+
+		/* Check to update highest and lowest priorities. */
+		if (prio > q->highest_prio)
+			q->highest_prio = prio;
+
+		if (prio < q->lowest_prio)
+			q->lowest_prio = prio;
+
+		sch->q.qlen++;
+		return NET_XMIT_SUCCESS;
+	}
+
+	/* If this packet has the lowest priority, drop it. */
+	lp = q->lowest_prio;
+	if (prio <= lp)
+		return qdisc_drop(skb, sch, to_free);
+
+	/* Drop the packet at the tail of the lowest priority qdisc. */
+	lp_qdisc = &q->qdiscs[lp];
+	to_drop = __skb_dequeue_tail(lp_qdisc);
+	BUG_ON(!to_drop);
+	qdisc_qstats_backlog_dec(sch, to_drop);
+	qdisc_drop(to_drop, sch, to_free);
+
+	__skb_queue_tail(qdisc, skb);
+	qdisc_qstats_backlog_inc(sch, skb);
+
+	/* Check to update highest and lowest priorities. */
+	if (skb_queue_empty(lp_qdisc)) {
+		if (q->lowest_prio == q->highest_prio) {
+			BUG_ON(sch->q.qlen);
+			q->lowest_prio = prio;
+			q->highest_prio = prio;
+		} else {
+			q->lowest_prio = calc_new_low_prio(q);
+		}
+	}
+
+	if (prio > q->highest_prio)
+		q->highest_prio = prio;
+
+	return NET_XMIT_SUCCESS;
+drop:
+	qdisc_drop(skb, sch, to_free);
+	return NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
+}
+
+static struct sk_buff *gkprio_dequeue(struct Qdisc *sch)
+{
+	struct gkprio_sched_data *q = qdisc_priv(sch);
+	struct sk_buff_head *hpq = &q->qdiscs[q->highest_prio];
+	struct sk_buff *skb = __skb_dequeue(hpq);
+
+	if (unlikely(!skb))
+		return NULL;
+
+	sch->q.qlen--;
+	qdisc_qstats_backlog_dec(sch, skb);
+	qdisc_bstats_update(sch, skb);
+
+	/* Update highest priority field. */
+	if (skb_queue_empty(hpq)) {
+		if (q->lowest_prio == q->highest_prio) {
+			BUG_ON(sch->q.qlen);
+			q->highest_prio = 0;
+			q->lowest_prio = GKPRIO_MAX_PRIORITY - 1;
+		} else {
+			q->highest_prio = calc_new_high_prio(q);
+		}
+	}
+	return skb;
+}
+
+static int gkprio_change(struct Qdisc *sch, struct nlattr *opt,
+			struct netlink_ext_ack *extack)
+{
+	struct gkprio_sched_data *q = qdisc_priv(sch);
+	struct tc_gkprio_qopt *ctl = nla_data(opt);
+	unsigned int min_limit = 1;
+
+	if (ctl->limit == (typeof(ctl->limit))-1)
+		q->max_limit = max(qdisc_dev(sch)->tx_queue_len, min_limit);
+	else if (ctl->limit < 1 || ctl->limit > qdisc_dev(sch)->tx_queue_len)
+		return -EINVAL;
+	else
+		q->max_limit = ctl->limit;
+
+	if (ctl->noip_dfltp == (typeof(ctl->noip_dfltp))-1)
+		q->noip_dfltp = 0;
+	else if (ctl->noip_dfltp >= GKPRIO_MAX_PRIORITY)
+		return -EINVAL;
+	else
+		q->noip_dfltp = ctl->noip_dfltp;
+
+	return 0;
+}
+
+static int gkprio_init(struct Qdisc *sch, struct nlattr *opt,
+			struct netlink_ext_ack *extack)
+{
+	struct gkprio_sched_data *q = qdisc_priv(sch);
+	int prio;
+	unsigned int min_limit = 1;
+
+	/* Initialise all queues, one for each possible priority. */
+	for (prio = 0; prio < GKPRIO_MAX_PRIORITY; prio++)
+		__skb_queue_head_init(&q->qdiscs[prio]);
+
+	q->highest_prio = 0;
+	q->lowest_prio = GKPRIO_MAX_PRIORITY - 1;
+	if (!opt) {
+		q->max_limit = max(qdisc_dev(sch)->tx_queue_len, min_limit);
+		q->noip_dfltp = 0;
+		return 0;
+	}
+	return gkprio_change(sch, opt, extack);
+}
+
+static int gkprio_dump(struct Qdisc *sch, struct sk_buff *skb)
+{
+	struct gkprio_sched_data *q = qdisc_priv(sch);
+	struct tc_gkprio_qopt opt;
+
+	opt.limit = q->max_limit;
+	opt.noip_dfltp = q->noip_dfltp;
+	opt.highest_prio = q->highest_prio;
+	opt.lowest_prio = q->lowest_prio;
+
+	if (nla_put(skb, TCA_OPTIONS, sizeof(opt), &opt))
+		return -1;
+
+	return skb->len;
+}
+
+static void gkprio_reset(struct Qdisc *sch)
+{
+	struct gkprio_sched_data *q = qdisc_priv(sch);
+	int prio;
+
+	sch->qstats.backlog = 0;
+	sch->q.qlen = 0;
+
+	for (prio = 0; prio < GKPRIO_MAX_PRIORITY; prio++)
+		__skb_queue_purge(&q->qdiscs[prio]);
+	q->highest_prio = 0;
+	q->lowest_prio = GKPRIO_MAX_PRIORITY - 1;
+}
+
+static void gkprio_destroy(struct Qdisc *sch)
+{
+	struct gkprio_sched_data *q = qdisc_priv(sch);
+	int prio;
+
+	for (prio = 0; prio < GKPRIO_MAX_PRIORITY; prio++)
+		__skb_queue_purge(&q->qdiscs[prio]);
+}
+
+struct Qdisc_ops gkprio_qdisc_ops __read_mostly = {
+	.id		=	"gkprio",
+	.priv_size	=	sizeof(struct gkprio_sched_data),
+	.enqueue	=	gkprio_enqueue,
+	.dequeue	=	gkprio_dequeue,
+	.peek		=	qdisc_peek_dequeued,
+	.init		=	gkprio_init,
+	.reset		=	gkprio_reset,
+	.change		=	gkprio_change,
+	.dump		=	gkprio_dump,
+	.destroy	=	gkprio_destroy,
+	.owner		=	THIS_MODULE,
+};
+
+static int __init gkprio_module_init(void)
+{
+	return register_qdisc(&gkprio_qdisc_ops);
+}
+
+static void __exit gkprio_module_exit(void)
+{
+	unregister_qdisc(&gkprio_qdisc_ops);
+}
+
+module_init(gkprio_module_init)
+module_exit(gkprio_module_exit)
+
+MODULE_LICENSE("GPL");
-- 
1.9.1

^ permalink raw reply related

* [PATCH 01/18] docs: can.rst: fix a footnote reference
From: Mauro Carvalho Chehab @ 2018-05-07  9:35 UTC (permalink / raw)
  To: Linux Doc Mailing List
  Cc: Mauro Carvalho Chehab, Mauro Carvalho Chehab, linux-kernel,
	Jonathan Corbet, Oliver Hartkopp, Marc Kleine-Budde,
	David S. Miller, linux-can, netdev
In-Reply-To: <cover.1525684985.git.mchehab+samsung@kernel.org>

As stated at:
	http://www.sphinx-doc.org/en/master/usage/restructuredtext/basics.html#footnotes

A footnote should contain either a number, a reference or
an auto number, e. g.:
	[1], [#f1] or [#].

While using [*] accidentaly works for html, it fails for other
document outputs. In particular, it causes an error with LaTeX
output, causing all books after networking to not be built.

So, replace it by a valid syntax.

Signed-off-by: Mauro Carvalho Chehab <mchehab+samsung@kernel.org>
---
 Documentation/networking/can.rst | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/Documentation/networking/can.rst b/Documentation/networking/can.rst
index d23c51abf8c6..2fd0b51a8c52 100644
--- a/Documentation/networking/can.rst
+++ b/Documentation/networking/can.rst
@@ -164,7 +164,7 @@ The Linux network devices (by default) just can handle the
 transmission and reception of media dependent frames. Due to the
 arbitration on the CAN bus the transmission of a low prio CAN-ID
 may be delayed by the reception of a high prio CAN frame. To
-reflect the correct [*]_ traffic on the node the loopback of the sent
+reflect the correct [#f1]_ traffic on the node the loopback of the sent
 data has to be performed right after a successful transmission. If
 the CAN network interface is not capable of performing the loopback for
 some reason the SocketCAN core can do this task as a fallback solution.
@@ -175,7 +175,7 @@ networking behaviour for CAN applications. Due to some requests from
 the RT-SocketCAN group the loopback optionally may be disabled for each
 separate socket. See sockopts from the CAN RAW sockets in :ref:`socketcan-raw-sockets`.
 
-.. [*] you really like to have this when you're running analyser
+.. [#f1] you really like to have this when you're running analyser
        tools like 'candump' or 'cansniffer' on the (same) node.
 
 
-- 
2.17.0

^ permalink raw reply related

* [PATCH 09/18] net: mac80211.h: fix a bad comment line
From: Mauro Carvalho Chehab @ 2018-05-07  9:35 UTC (permalink / raw)
  To: Linux Doc Mailing List
  Cc: Mauro Carvalho Chehab, Mauro Carvalho Chehab, linux-kernel,
	Jonathan Corbet, Johannes Berg, David S. Miller, linux-wireless,
	netdev
In-Reply-To: <cover.1525684985.git.mchehab+samsung@kernel.org>

Sphinx produces a lot of errors like this:
	./include/net/mac80211.h:2083: warning: bad line:  >

Signed-off-by: Mauro Carvalho Chehab <mchehab+samsung@kernel.org>
---
 include/net/mac80211.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/net/mac80211.h b/include/net/mac80211.h
index d2279b2d61aa..b2f3a0c018e7 100644
--- a/include/net/mac80211.h
+++ b/include/net/mac80211.h
@@ -2080,7 +2080,7 @@ struct ieee80211_txq {
  *	virtual interface might not be given air time for the transmission of
  *	the frame, as it is not synced with the AP/P2P GO yet, and thus the
  *	deauthentication frame might not be transmitted.
- >
+ *
  * @IEEE80211_HW_DOESNT_SUPPORT_QOS_NDP: The driver (or firmware) doesn't
  *	support QoS NDP for AP probing - that's most likely a driver bug.
  *
-- 
2.17.0

^ permalink raw reply related

* Re: WARNING in kernfs_add_one
From: Dmitry Vyukov @ 2018-05-07  9:33 UTC (permalink / raw)
  To: Johannes Berg
  Cc: Greg KH, linux-wireless, Eric Dumazet, netdev, syzbot, LKML,
	syzkaller-bugs, Tejun Heo
In-Reply-To: <1525682589.6049.4.camel@sipsolutions.net>

On Mon, May 7, 2018 at 10:43 AM, Johannes Berg
<johannes@sipsolutions.net> wrote:
> On Sat, 2018-05-05 at 15:07 -0700, Greg KH wrote:
>
>> > > > syzbot found the following crash on:
>
> Maybe it should learn to differentiate warnings, if it's going to set
> panic_on_warn :-)

How?
Note that this is not specific to syzbot. If you see WARNINGs in a
subsystem that you have no idea about (or you just a normal user),
what do you do? Right, you report it to maintainers.


> I get why, but still, at least differentiating in the emails wouldn't be
> bad.

Well, the subject says "WARNING".
But note there are _very_ bad WARNINGs too. Generally, a WARNING means
a kernel bug just that kernel can tolerate without bringing the system
down (as opposed to BUG).


>> > > > kernfs: ns required in 'ieee80211' for 'phy3'
>
> Huh. What does that even mean?
>
>> > > > RIP: 0010:kernfs_add_one+0x406/0x4d0 fs/kernfs/dir.c:758
>> > > > RSP: 0018:ffff8801ca9eece0 EFLAGS: 00010286
>> > > > RAX: 000000000000002d RBX: ffffffff87d5cee0 RCX: ffffffff8160ba7d
>> > > > RDX: 0000000000000000 RSI: ffffffff81610731 RDI: ffff8801ca9ee840
>> > > > RBP: ffff8801ca9eed20 R08: ffff8801d9538500 R09: 0000000000000006
>> > > > R10: ffff8801d9538500 R11: 0000000000000000 R12: ffff8801ad1cb6c0
>> > > > R13: ffffffff885da640 R14: 0000000000000020 R15: 0000000000000000
>> > > >  kernfs_create_link+0x112/0x180 fs/kernfs/symlink.c:41
>> > > >  sysfs_do_create_link_sd.isra.2+0x90/0x130 fs/sysfs/symlink.c:43
>> > > >  sysfs_do_create_link fs/sysfs/symlink.c:79 [inline]
>> > > >  sysfs_create_link+0x65/0xc0 fs/sysfs/symlink.c:91
>> > > >  device_add_class_symlinks drivers/base/core.c:1612 [inline]
>> > > >  device_add+0x7a0/0x16d0 drivers/base/core.c:1810
>> > > >  wiphy_register+0x178a/0x2430 net/wireless/core.c:806
>> > > >  ieee80211_register_hw+0x13cd/0x35d0 net/mac80211/main.c:1047
>> > > >  mac80211_hwsim_new_radio+0x1d9b/0x3410
>> > > > drivers/net/wireless/mac80211_hwsim.c:2772
>> > > >  hwsim_new_radio_nl+0x7a7/0xa60 drivers/net/wireless/mac80211_hwsim.c:3246
>> > > >  genl_family_rcv_msg+0x889/0x1120 net/netlink/genetlink.c:599
>
> Basically we're creating a new virtual radio, which in turn creates a
> new device, which we have to register.
>
> Something is going on with the context here that makes sysfs unhappy,
> but TBH I have no idea what.
>
> johannes
>
> --
> You received this message because you are subscribed to the Google Groups "syzkaller-bugs" group.
> To unsubscribe from this group and stop receiving emails from it, send an email to syzkaller-bugs+unsubscribe@googlegroups.com.
> To view this discussion on the web visit https://groups.google.com/d/msgid/syzkaller-bugs/1525682589.6049.4.camel%40sipsolutions.net.
> For more options, visit https://groups.google.com/d/optout.

^ permalink raw reply

* Re: [PATCH 6/8] rhashtable: further improve stability of rhashtable_walk
From: Herbert Xu @ 2018-05-07  9:30 UTC (permalink / raw)
  To: NeilBrown; +Cc: Thomas Graf, netdev, linux-kernel
In-Reply-To: <87vac1db5t.fsf@notabene.neil.brown.name>

On Sun, May 06, 2018 at 07:50:54AM +1000, NeilBrown wrote:
>
> Do we?  How could we fix it for both rhashtable and rhltable?

Well I suggested earlier to insert the walker object into the
hash table, which would be applicable regardless of whether it
is a normal rhashtable of a rhltable.

Cheers,
-- 
Email: Herbert Xu <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

^ permalink raw reply

* Re: [PATCH 8/8] rhashtable: don't hold lock on first table throughout insertion.
From: Herbert Xu @ 2018-05-07  9:29 UTC (permalink / raw)
  To: NeilBrown; +Cc: Thomas Graf, netdev, linux-kernel
In-Reply-To: <878t8wcthy.fsf@notabene.neil.brown.name>

On Mon, May 07, 2018 at 08:24:41AM +1000, NeilBrown wrote:
>
> This is true, but I don't see how it is relevant.
> At some point, each thread will find that the table they have just
> locked for their search key, has a NULL 'future_tbl' pointer.
> At the point, the thread can know that the key is not in any table,
> and that no other thread can add the key until the lock is released.

The updating of future_tbl is not synchronised with insert threads.
Therefore it is entirely possible that two inserters end up on
different tables as their "latest" table.  This must not be allowed
to occur.

Cheers,
-- 
Email: Herbert Xu <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

^ permalink raw reply

* Re: [PATCH 2/8] rhashtable: remove nulls_base and related code.
From: Herbert Xu @ 2018-05-07  9:27 UTC (permalink / raw)
  To: NeilBrown; +Cc: Thomas Graf, netdev, linux-kernel
In-Reply-To: <877eoheqc2.fsf@notabene.neil.brown.name>

On Sun, May 06, 2018 at 07:37:49AM +1000, NeilBrown wrote:
> 
> I can see no evidence that this is required for anything, as it isn't
> use and I'm fairly sure that in it's current form - it cannot be used.

Search for nulls in net/ipv4.  This is definitely used throughout
the network stack.  As the aim is to convert as many existing hash
tables to rhashtable as possible, we want to keep this.

Cheers,
-- 
Email: Herbert Xu <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

^ permalink raw reply

* Re: [PATCH bpf-next v3 00/15] Introducing AF_XDP support
From: Magnus Karlsson @ 2018-05-07  9:13 UTC (permalink / raw)
  To: Alexei Starovoitov
  Cc: Daniel Borkmann, Björn Töpel, Karlsson, Magnus,
	Alexander Duyck, Alexander Duyck, John Fastabend,
	Alexei Starovoitov, Jesper Dangaard Brouer, Willem de Bruijn,
	Michael S. Tsirkin, Network Development, Björn Töpel,
	michael.lundkvist, Brandeburg, Jesse, Singhai, Anjali,
	Zhang, Qi Z
In-Reply-To: <20180505003445.zxaghbly54s2cre3@ast-mbp>

On Sat, May 5, 2018 at 2:34 AM, Alexei Starovoitov
<alexei.starovoitov@gmail.com> wrote:
> On Fri, May 04, 2018 at 01:22:17PM +0200, Magnus Karlsson wrote:
>> On Fri, May 4, 2018 at 1:38 AM, Alexei Starovoitov
>> <alexei.starovoitov@gmail.com> wrote:
>> > On Fri, May 04, 2018 at 12:49:09AM +0200, Daniel Borkmann wrote:
>> >> On 05/02/2018 01:01 PM, Björn Töpel wrote:
>> >> > From: Björn Töpel <bjorn.topel@intel.com>
>> >> >
>> >> > This patch set introduces a new address family called AF_XDP that is
>> >> > optimized for high performance packet processing and, in upcoming
>> >> > patch sets, zero-copy semantics. In this patch set, we have removed
>> >> > all zero-copy related code in order to make it smaller, simpler and
>> >> > hopefully more review friendly. This patch set only supports copy-mode
>> >> > for the generic XDP path (XDP_SKB) for both RX and TX and copy-mode
>> >> > for RX using the XDP_DRV path. Zero-copy support requires XDP and
>> >> > driver changes that Jesper Dangaard Brouer is working on. Some of his
>> >> > work has already been accepted. We will publish our zero-copy support
>> >> > for RX and TX on top of his patch sets at a later point in time.
>> >>
>> >> +1, would be great to see it land this cycle. Saw few minor nits here
>> >> and there but nothing to hold it up, for the series:
>> >>
>> >> Acked-by: Daniel Borkmann <daniel@iogearbox.net>
>> >>
>> >> Thanks everyone!
>> >
>> > Great stuff!
>> >
>> > Applied to bpf-next, with one condition.
>> > Upcoming zero-copy patches for both RX and TX need to be posted
>> > and reviewed within this release window.
>> > If netdev community as a whole won't be able to agree on the zero-copy
>> > bits we'd need to revert this feature before the next merge window.
>>
>> Thanks everyone for reviewing this. Highly appreciated.
>>
>> Just so we understand the purpose correctly:
>>
>> 1: Do you want to see the ZC patches in order to verify that the user
>> space API holds? If so, we can produce an additional RFC  patch set
>> using a big chunk of code that we had in RFC V1. We are not proud of
>> this code since it is clunky, but it hopefully proves the point with
>> the uapi being the same.
>>
>> 2: And/Or are you worried about us all (the netdev community) not
>> agreeing on a way to implement ZC internally in the drivers and the
>> XDP infrastructure? This is not going to be possible to finish during
>> this cycle since we do not like the implementation we had in RFC V1.
>> Too intrusive and now we also have nicer abstractions from Jesper that
>> we can use and extend to provide a (hopefully) much cleaner and less
>> intrusive solution.
>
> short answer: both.
>
> Cleanliness and performance of the ZC code is not as important as
> getting API right. The main concern that during ZC review process
> we will find out that existing API has issues, so we have to
> do this exercise before the merge window.
> And RFC won't fly. Send the patches for real. They have to go
> through the proper code review. The hackers of netdev community
> can accept a partial, or a bit unclean, or slightly inefficient
> implementation, since it can be and will be improved later,
> but API we cannot change once it goes into official release.
>
> Here is the example of API concern:
> this patch set added shared umem concept. It sounds good in theory,
> but will it perform well with ZC ? Earlier RFCs didn't have that
> feature. If it won't perform well than it shouldn't be in the tree.
> The key reason to let AF_XDP into the tree is its performance promise.
> If it doesn't perform we should rip it out and redesign.

That is a fair point. We will try to produce patch sets for zero-copy
RX and TX using the latest interfaces within this merge window. Just
note that we will focus on this for the next week(s) instead of the
review items that you and Daniel Borkmann submitted. If we get those
patch sets out in time and we agree that they are a possible way
forward, then we produce patches with your fixes. It was mainly small
items, so should be quick.

/Magnus

^ permalink raw reply

* [PATCH 4/5] change the comment of vti6_ioctl
From: Steffen Klassert @ 2018-05-07  9:01 UTC (permalink / raw)
  To: David Miller; +Cc: Herbert Xu, Steffen Klassert, netdev
In-Reply-To: <20180507090116.31222-1-steffen.klassert@secunet.com>

From: Sun Lianwen <sunlw.fnst@cn.fujitsu.com>

The comment of vti6_ioctl() is wrong. which use vti6_tnl_ioctl
instead of vti6_ioctl.

Signed-off-by: Sun Lianwen <sunlw.fnst@cn.fujitsu.com>
Signed-off-by: Steffen Klassert <steffen.klassert@secunet.com>
---
 net/ipv6/ip6_vti.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/ipv6/ip6_vti.c b/net/ipv6/ip6_vti.c
index c214ffec02f0..deadc4c3703b 100644
--- a/net/ipv6/ip6_vti.c
+++ b/net/ipv6/ip6_vti.c
@@ -743,7 +743,7 @@ vti6_parm_to_user(struct ip6_tnl_parm2 *u, const struct __ip6_tnl_parm *p)
 }
 
 /**
- * vti6_tnl_ioctl - configure vti6 tunnels from userspace
+ * vti6_ioctl - configure vti6 tunnels from userspace
  *   @dev: virtual device associated with tunnel
  *   @ifr: parameters passed from userspace
  *   @cmd: command to be performed
-- 
2.14.1

^ permalink raw reply related

* [PATCH 5/5] xfrm: use a dedicated slab cache for struct xfrm_state
From: Steffen Klassert @ 2018-05-07  9:01 UTC (permalink / raw)
  To: David Miller; +Cc: Herbert Xu, Steffen Klassert, netdev
In-Reply-To: <20180507090116.31222-1-steffen.klassert@secunet.com>

From: Mathias Krause <minipli@googlemail.com>

struct xfrm_state is rather large (768 bytes here) and therefore wastes
quite a lot of memory as it falls into the kmalloc-1024 slab cache,
leaving 256 bytes of unused memory per XFRM state object -- a net waste
of 25%.

Using a dedicated slab cache for struct xfrm_state reduces the level of
internal fragmentation to a minimum.

On my configuration SLUB chooses to create a slab cache covering 4
pages holding 21 objects, resulting in an average memory waste of ~13
bytes per object -- a net waste of only 1.6%.

In my tests this led to memory savings of roughly 2.3MB for 10k XFRM
states.

Signed-off-by: Mathias Krause <minipli@googlemail.com>
Signed-off-by: Steffen Klassert <steffen.klassert@secunet.com>
---
 net/xfrm/xfrm_state.c | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c
index f9d2f2233f09..f595797a20ce 100644
--- a/net/xfrm/xfrm_state.c
+++ b/net/xfrm/xfrm_state.c
@@ -42,6 +42,7 @@ static void xfrm_state_gc_task(struct work_struct *work);
 
 static unsigned int xfrm_state_hashmax __read_mostly = 1 * 1024 * 1024;
 static __read_mostly seqcount_t xfrm_state_hash_generation = SEQCNT_ZERO(xfrm_state_hash_generation);
+static struct kmem_cache *xfrm_state_cache __ro_after_init;
 
 static DECLARE_WORK(xfrm_state_gc_work, xfrm_state_gc_task);
 static HLIST_HEAD(xfrm_state_gc_list);
@@ -451,7 +452,7 @@ static void xfrm_state_gc_destroy(struct xfrm_state *x)
 	}
 	xfrm_dev_state_free(x);
 	security_xfrm_state_free(x);
-	kfree(x);
+	kmem_cache_free(xfrm_state_cache, x);
 }
 
 static void xfrm_state_gc_task(struct work_struct *work)
@@ -563,7 +564,7 @@ struct xfrm_state *xfrm_state_alloc(struct net *net)
 {
 	struct xfrm_state *x;
 
-	x = kzalloc(sizeof(struct xfrm_state), GFP_ATOMIC);
+	x = kmem_cache_alloc(xfrm_state_cache, GFP_ATOMIC | __GFP_ZERO);
 
 	if (x) {
 		write_pnet(&x->xs_net, net);
@@ -2307,6 +2308,10 @@ int __net_init xfrm_state_init(struct net *net)
 {
 	unsigned int sz;
 
+	if (net_eq(net, &init_net))
+		xfrm_state_cache = KMEM_CACHE(xfrm_state,
+					      SLAB_HWCACHE_ALIGN | SLAB_PANIC);
+
 	INIT_LIST_HEAD(&net->xfrm.state_all);
 
 	sz = sizeof(struct hlist_head) * 8;
-- 
2.14.1

^ permalink raw reply related

* [PATCH 2/5] udp: enable UDP checksum offload for ESP
From: Steffen Klassert @ 2018-05-07  9:01 UTC (permalink / raw)
  To: David Miller; +Cc: Herbert Xu, Steffen Klassert, netdev
In-Reply-To: <20180507090116.31222-1-steffen.klassert@secunet.com>

From: Jacek Kalwas <jacek.kalwas@intel.com>

In case NIC has support for ESP TX CSUM offload skb->ip_summed is not
set to CHECKSUM_PARTIAL which results in checksum calculated by SW.

Fix enables ESP TX CSUM for UDP by extending condition with check for
NETIF_F_HW_ESP_TX_CSUM.

Signed-off-by: Jacek Kalwas <jacek.kalwas@intel.com>
Signed-off-by: Steffen Klassert <steffen.klassert@secunet.com>
---
 net/ipv4/ip_output.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index 4c11b810a447..a2dfb5a9ba76 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -907,7 +907,7 @@ static int __ip_append_data(struct sock *sk,
 	    length + fragheaderlen <= mtu &&
 	    rt->dst.dev->features & (NETIF_F_HW_CSUM | NETIF_F_IP_CSUM) &&
 	    !(flags & MSG_MORE) &&
-	    !exthdrlen)
+	    (!exthdrlen || (rt->dst.dev->features & NETIF_F_HW_ESP_TX_CSUM)))
 		csummode = CHECKSUM_PARTIAL;
 
 	cork->length += length;
-- 
2.14.1

^ permalink raw reply related

* [PATCH 1/5] selftests: add xfrm state-policy-monitor to rtnetlink.sh
From: Steffen Klassert @ 2018-05-07  9:01 UTC (permalink / raw)
  To: David Miller; +Cc: Herbert Xu, Steffen Klassert, netdev
In-Reply-To: <20180507090116.31222-1-steffen.klassert@secunet.com>

From: Shannon Nelson <shannon.nelson@oracle.com>

Add a simple set of tests for the IPsec xfrm commands.

Signed-off-by: Shannon Nelson <shannon.nelson@oracle.com>
Signed-off-by: Steffen Klassert <steffen.klassert@secunet.com>
---
 tools/testing/selftests/net/rtnetlink.sh | 103 +++++++++++++++++++++++++++++++
 1 file changed, 103 insertions(+)

diff --git a/tools/testing/selftests/net/rtnetlink.sh b/tools/testing/selftests/net/rtnetlink.sh
index e6f485235435..760faef2e12e 100755
--- a/tools/testing/selftests/net/rtnetlink.sh
+++ b/tools/testing/selftests/net/rtnetlink.sh
@@ -502,6 +502,108 @@ kci_test_macsec()
 	echo "PASS: macsec"
 }
 
+#-------------------------------------------------------------------
+# Example commands
+#   ip x s add proto esp src 14.0.0.52 dst 14.0.0.70 \
+#            spi 0x07 mode transport reqid 0x07 replay-window 32 \
+#            aead 'rfc4106(gcm(aes))' 1234567890123456dcba 128 \
+#            sel src 14.0.0.52/24 dst 14.0.0.70/24
+#   ip x p add dir out src 14.0.0.52/24 dst 14.0.0.70/24 \
+#            tmpl proto esp src 14.0.0.52 dst 14.0.0.70 \
+#            spi 0x07 mode transport reqid 0x07
+#
+# Subcommands not tested
+#    ip x s update
+#    ip x s allocspi
+#    ip x s deleteall
+#    ip x p update
+#    ip x p deleteall
+#    ip x p set
+#-------------------------------------------------------------------
+kci_test_ipsec()
+{
+	srcip="14.0.0.52"
+	dstip="14.0.0.70"
+	algo="aead rfc4106(gcm(aes)) 0x3132333435363738393031323334353664636261 128"
+
+	# flush to be sure there's nothing configured
+	ip x s flush ; ip x p flush
+	check_err $?
+
+	# start the monitor in the background
+	tmpfile=`mktemp ipsectestXXX`
+	ip x m > $tmpfile &
+	mpid=$!
+	sleep 0.2
+
+	ipsecid="proto esp src $srcip dst $dstip spi 0x07"
+	ip x s add $ipsecid \
+            mode transport reqid 0x07 replay-window 32 \
+            $algo sel src $srcip/24 dst $dstip/24
+	check_err $?
+
+	lines=`ip x s list | grep $srcip | grep $dstip | wc -l`
+	test $lines -eq 2
+	check_err $?
+
+	ip x s count | grep -q "SAD count 1"
+	check_err $?
+
+	lines=`ip x s get $ipsecid | grep $srcip | grep $dstip | wc -l`
+	test $lines -eq 2
+	check_err $?
+
+	ip x s delete $ipsecid
+	check_err $?
+
+	lines=`ip x s list | wc -l`
+	test $lines -eq 0
+	check_err $?
+
+	ipsecsel="dir out src $srcip/24 dst $dstip/24"
+	ip x p add $ipsecsel \
+		    tmpl proto esp src $srcip dst $dstip \
+		    spi 0x07 mode transport reqid 0x07
+	check_err $?
+
+	lines=`ip x p list | grep $srcip | grep $dstip | wc -l`
+	test $lines -eq 2
+	check_err $?
+
+	ip x p count | grep -q "SPD IN  0 OUT 1 FWD 0"
+	check_err $?
+
+	lines=`ip x p get $ipsecsel | grep $srcip | grep $dstip | wc -l`
+	test $lines -eq 2
+	check_err $?
+
+	ip x p delete $ipsecsel
+	check_err $?
+
+	lines=`ip x p list | wc -l`
+	test $lines -eq 0
+	check_err $?
+
+	# check the monitor results
+	kill $mpid
+	lines=`wc -l $tmpfile | cut "-d " -f1`
+	test $lines -eq 20
+	check_err $?
+	rm -rf $tmpfile
+
+	# clean up any leftovers
+	ip x s flush
+	check_err $?
+	ip x p flush
+	check_err $?
+
+	if [ $ret -ne 0 ]; then
+		echo "FAIL: ipsec"
+		return 1
+	fi
+	echo "PASS: ipsec"
+}
+
 kci_test_gretap()
 {
 	testns="testns"
@@ -755,6 +857,7 @@ kci_test_rtnl()
 	kci_test_vrf
 	kci_test_encap
 	kci_test_macsec
+	kci_test_ipsec
 
 	kci_del_dummy
 }
-- 
2.14.1

^ permalink raw reply related

* [PATCH 3/5] xfrm: remove VLA usage in __xfrm6_sort()
From: Steffen Klassert @ 2018-05-07  9:01 UTC (permalink / raw)
  To: David Miller; +Cc: Herbert Xu, Steffen Klassert, netdev
In-Reply-To: <20180507090116.31222-1-steffen.klassert@secunet.com>

From: Kees Cook <keescook@chromium.org>

In the quest to remove all stack VLA usage removed from the kernel[1],
just use XFRM_MAX_DEPTH as already done for the "class" array. In one
case, it'll do this loop up to 5, the other caller up to 6.

[1] https://lkml.org/lkml/2018/3/7/621

Co-developed-by: Andreas Christoforou <andreaschristofo@gmail.com>
Signed-off-by: Kees Cook <keescook@chromium.org>
Acked-by: Stefano Brivio <sbrivio@redhat.com>
Signed-off-by: Steffen Klassert <steffen.klassert@secunet.com>
---
 net/ipv6/xfrm6_state.c | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/net/ipv6/xfrm6_state.c b/net/ipv6/xfrm6_state.c
index 16f434791763..5bdca3d5d6b7 100644
--- a/net/ipv6/xfrm6_state.c
+++ b/net/ipv6/xfrm6_state.c
@@ -60,11 +60,9 @@ xfrm6_init_temprop(struct xfrm_state *x, const struct xfrm_tmpl *tmpl,
 static int
 __xfrm6_sort(void **dst, void **src, int n, int (*cmp)(void *p), int maxclass)
 {
-	int i;
+	int count[XFRM_MAX_DEPTH] = { };
 	int class[XFRM_MAX_DEPTH];
-	int count[maxclass];
-
-	memset(count, 0, sizeof(count));
+	int i;
 
 	for (i = 0; i < n; i++) {
 		int c;
-- 
2.14.1

^ permalink raw reply related

* pull request (net-next): ipsec-next 2018-05-07
From: Steffen Klassert @ 2018-05-07  9:01 UTC (permalink / raw)
  To: David Miller; +Cc: Herbert Xu, Steffen Klassert, netdev

1) Add selftests for the xfrm commands.
   From Shannon Nelson.

2) Enable hardware checksum offload for ESP encapsulated
   UDP packets if the hardware supports this.
   From Jacek Kalwas.

3) Remove VLA usage in __xfrm6_sort. From Kees Cook.

4) Fix a typo in the comment of vti6_ioctl.
   From Sun Lianwen.

5) Use a dedicated slab cache for struct xfrm_state,
   this reduces the memory usage of this struct
   by 25 percent. From Mathias Krause.

Please note that this pull request has a merge conflict
between commit:

  bec1f6f69736 ("udp: generate gso with UDP_SEGMENT")

from the net-next tree and commit:

  cd027a5433d6 ("udp: enable UDP checksum offload for ESP")

from the ipsec-next tree.

The conflict can be solved as done in linux-next.

Please pull or let me know if there are problems.

Thanks!

The following changes since commit ef53e9e14714de2ce26eaae0244c07c426064d69:

  net: Remove unused tcp_set_state tracepoint (2018-04-16 19:02:15 -0400)

are available in the git repository at:

  git://git.kernel.org/pub/scm/linux/kernel/git/klassert/ipsec-next.git master

for you to fetch changes up to 565f0fa902b64020d5d147ff1708567e9e0b6e49:

  xfrm: use a dedicated slab cache for struct xfrm_state (2018-05-04 10:14:00 +0200)

----------------------------------------------------------------
Jacek Kalwas (1):
      udp: enable UDP checksum offload for ESP

Kees Cook (1):
      xfrm: remove VLA usage in __xfrm6_sort()

Mathias Krause (1):
      xfrm: use a dedicated slab cache for struct xfrm_state

Shannon Nelson (1):
      selftests: add xfrm state-policy-monitor to rtnetlink.sh

Sun Lianwen (1):
      change the comment of vti6_ioctl

 net/ipv4/ip_output.c                     |   2 +-
 net/ipv6/ip6_vti.c                       |   2 +-
 net/ipv6/xfrm6_state.c                   |   6 +-
 net/xfrm/xfrm_state.c                    |   9 ++-
 tools/testing/selftests/net/rtnetlink.sh | 103 +++++++++++++++++++++++++++++++
 5 files changed, 114 insertions(+), 8 deletions(-)

^ permalink raw reply

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox