Netdev List
 help / color / mirror / Atom feed
* [RFC PATCH 11/16] bpf: cfg: detect irreducible loop using Eric Stoltz algorithm
From: John Fastabend @ 2018-06-01  9:33 UTC (permalink / raw)
  To: alexei.starovoitov, daniel, davem; +Cc: netdev
In-Reply-To: <20180601092646.15353.28269.stgit@john-Precision-Tower-5810>

From: Jiong Wang <jiong.wang@netronome.com>

We don't want to do any further loop bounds analysis for irreducible loop,
so just reject programs containing it.

The current DOM based loop detection can't detect irreducible loop. We'd
use the algorithm given by Eric Stoltz to detect it. The algorithm requires
DOM info.

Algorithm pseudo code:

	test_dom(a,b) returns TRUE if a dominates b
	push( v ) pushes v onto a reverse topologically-sorted stack

	top_sort( entry node )

	top_sort( node v ) {
		mark_visited( v );
		Visit all successors s of v {
			if (mark_visited(s) &&
			    !pushed(s) &&
			    !test_dom(s, v)) {
				Irreducible_Graph = TRUE;
				Exit -- no need to continue now!
			}
			if(!mark_visited(s))
				top_sort( s );
		}
		push( v );
	}

Tested by:

int xdp_prog1(struct xdp_md *ctx) {
	char *data      = (char *)(unsigned long)(ctx->data);
	char *data_end  = (char *)(unsigned long)(ctx->data_end);
	if (data + 60 > (unsigned char *)(unsigned long)ctx->data_end)
		return XDP_ABORTED;

	char *p = data + 14;
	char *mark = *(char **)p;
	unsigned long mark2 = *(unsigned long *)(p + 24);

	mark += 1;

	if (p + 1 > data) {
B:
		mark += 2;

		if (mark < mark2 * 3)
			goto C;
	} else if (mark < data) {
C:
		mark += 1;
		if (mark < 1000)
			goto  B;
	}

	return XDP_PASS;
}

Signed-off-by: Jiong Wang <jiong.wang@netronome.com>
Signed-off-by: John Fastabend <john.fastabend@gmail.com>
---
 kernel/bpf/cfg.c      |  117 ++++++++++++++++++++++++++++++++++++++++++++++---
 kernel/bpf/cfg.h      |    5 ++
 kernel/bpf/verifier.c |    9 ++++
 3 files changed, 123 insertions(+), 8 deletions(-)

diff --git a/kernel/bpf/cfg.c b/kernel/bpf/cfg.c
index 2a9b513..c67541c 100644
--- a/kernel/bpf/cfg.c
+++ b/kernel/bpf/cfg.c
@@ -767,6 +767,109 @@ bool subprog_has_loop(struct cfg_node_allocator *allocator,
 	return false;
 }
 
+/* We don't want to do any further loop bounds analysis for irreducible loop,
+ * so just reject programs containing it.
+ *
+ * The current DOM based loop detection can't detect irreducible loop. We'd
+ * use the algorithm given by Eric Stoltz to detect it. The algorithm requires
+ * DOM info.
+ *
+ * Algorithm pseudo code:
+ *
+ *   test_dom(a,b) returns TRUE if a dominates b
+ *   push( v ) pushes v onto a reverse topologically-sorted stack
+ *
+ *   top_sort( entry node )
+ *
+ *   top_sort( node v ) {
+ *	mark_visited( v );
+ *	Visit all successors s of v {
+ *		if (mark_visited(s) && !pushed(s) && !test_dom(s, v)) {
+ *			Irreducible_Graph = TRUE;
+ *			Exit -- no need to continue now!
+ *		}
+ *		if(!mark_visited(s))
+ *			top_sort( s );
+ *	}
+ *	push( v );
+ *   }
+ */
+int subprog_has_irreduciable_loop(struct cfg_node_allocator *allocator,
+				  struct bpf_subprog_info *subprog)
+{
+	u16 bb_num = subprog->bb_num - 2, sp = 0, *status;
+	void **bb_list = (void **)&subprog->bbs;
+	struct edge_node **stack, *e, *prev_e;
+	int lane_len = BITS_TO_LONGS(bb_num);
+	struct bb_node *entry_bb, *exit_bb;
+	int found = 0;
+
+	stack = kmalloc_array(bb_num, sizeof(struct edge_node *), GFP_KERNEL);
+	if (!stack)
+		return -ENOMEM;
+
+	status = kcalloc(bb_num, sizeof(u16), GFP_KERNEL);
+	if (!status)
+		return -ENOMEM;
+
+	entry_bb = entry_bb(bb_list);
+	exit_bb = exit_bb(bb_list);
+	e = cfg_node_delink(allocator, &entry_bb->e_succs);
+	prev_e = e;
+
+	while (1) {
+		struct bb_node *bb_dst;
+
+		while (e) {
+			bb_dst = e->dst;
+
+			if (bb_dst == exit_bb ||
+			    status[bb_dst->idx] == DFS_NODE_EXPLORED) {
+				prev_e = e;
+				e = cfg_node_delink(allocator, &e->link);
+				continue;
+			}
+
+			if (status[bb_dst->idx] == DFS_NODE_EXPLORING) {
+				u16 src_idx = e->src->idx;
+				unsigned long *bb_map;
+
+				bb_map = subprog->dtree + src_idx * lane_len;
+				if (!test_bit(bb_dst->idx, bb_map)) {
+					found = 1;
+					goto free_and_ret;
+				} else {
+					prev_e = e;
+					e = cfg_node_delink(allocator,
+							    &e->link);
+					continue;
+				}
+			}
+
+			status[bb_dst->idx] = DFS_NODE_EXPLORING;
+			stack[sp++] = e;
+			/* e should never be NULL as it couldn't be exit_bb. */
+			e = cfg_node_delink(allocator, &bb_dst->e_succs);
+		}
+
+		if (prev_e->src != entry_bb)
+			status[prev_e->src->idx] = DFS_NODE_EXPLORED;
+
+		if (!sp)
+			break;
+
+		e = stack[--sp];
+		prev_e = e;
+		e = cfg_node_delink(allocator, &e->link);
+	}
+
+free_and_ret:
+	kfree(stack);
+	kfree(status);
+
+	return found;
+}
+
 static int cmp_subprogs(const void *a, const void *b)
 {
 	return ((struct bpf_subprog_info *)a)->start -
@@ -824,8 +927,6 @@ static void ci_next(struct cfg_node_allocator *allocator,
 	ci->callee = cfg_node_delink(allocator, &c->link);
 }
 
-#define EXPLORING	1
-#define EXPLORED	2
 int cgraph_check_recursive_unreachable(struct bpf_verifier_env *env,
 				       struct cfg_node_allocator *allocator,
 				       struct bpf_subprog_info *subprog)
@@ -844,19 +945,19 @@ int cgraph_check_recursive_unreachable(struct bpf_verifier_env *env,
 	}
 	ci.head = subprog->callees;
 	ci.callee = subprog->callees;
-	status[0] = EXPLORING;
+	status[0] = DFS_NODE_EXPLORING;
 
 	while (1) {
 		while (!ci_end_p(&ci)) {
 			callee = ci.callee;
 			idx = callee->callee_idx;
-			if (status[idx] == EXPLORING) {
+			if (status[idx] == DFS_NODE_EXPLORING) {
 				bpf_verifier_log_write(env, "cgraph - recursive call\n");
 				ret = -EINVAL;
 				goto err_free;
 			}
 
-			status[idx] = EXPLORING;
+			status[idx] = DFS_NODE_EXPLORING;
 
 			if (sp == 64) {
 				bpf_verifier_log_write(env, "cgraph - call frame too deep\n");
@@ -870,10 +971,10 @@ int cgraph_check_recursive_unreachable(struct bpf_verifier_env *env,
 		}
 
 		if (ci.head)
-			status[ci.head->caller_idx] = EXPLORED;
+			status[ci.head->caller_idx] = DFS_NODE_EXPLORED;
 		else
 			/* leaf func. */
-			status[idx] = EXPLORED;
+			status[idx] = DFS_NODE_EXPLORED;
 
 		if (!sp)
 			break;
@@ -883,7 +984,7 @@ int cgraph_check_recursive_unreachable(struct bpf_verifier_env *env,
 	}
 
 	for (idx = 0; idx < env->subprog_cnt; idx++)
-		if (status[idx] != EXPLORED) {
+		if (status[idx] != DFS_NODE_EXPLORED) {
 			bpf_verifier_log_write(env, "cgraph - unreachable subprog\n");
 			ret = -EINVAL;
 			goto err_free;
diff --git a/kernel/bpf/cfg.h b/kernel/bpf/cfg.h
index cb833bd..8363406 100644
--- a/kernel/bpf/cfg.h
+++ b/kernel/bpf/cfg.h
@@ -35,8 +35,13 @@ int subprog_build_dom_info(struct bpf_verifier_env *env,
 			   struct bpf_subprog_info *subprog);
 bool subprog_has_loop(struct cfg_node_allocator *allocator,
 		      struct bpf_subprog_info *subprog);
+int subprog_has_irreduciable_loop(struct cfg_node_allocator *allocator,
+				  struct bpf_subprog_info *subprog);
 int subprog_init_bb(struct cfg_node_allocator *allocator, void **bb_list,
 		    int subprog_start, int subprog_end);
 void subprog_free(struct bpf_subprog_info *subprog, int end_idx);
 
+#define DFS_NODE_EXPLORING	1
+#define DFS_NODE_EXPLORED	2
+
 #endif
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 90619da..49895f3 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -913,6 +913,15 @@ static int check_subprogs(struct bpf_verifier_env *env)
 						     &subprog[cur_subprog]);
 			if (ret < 0)
 				goto free_nodes;
+			ret = subprog_has_irreduciable_loop(&allocator,
+							&subprog[cur_subprog]);
+			if (ret < 0)
+				goto free_nodes;
+			if (ret > 0) {
+				verbose(env, "cfg - irreduciable loop detected");
+				ret = -EINVAL;
+				goto free_nodes;
+			}
 			if (subprog_has_loop(&allocator,
 					     &subprog[cur_subprog])) {
 				verbose(env, "cfg - loop detected");

^ permalink raw reply related

* [RFC PATCH 12/16] bpf: cfg: pretty print CFG and DOM
From: John Fastabend @ 2018-06-01  9:33 UTC (permalink / raw)
  To: alexei.starovoitov, daniel, davem; +Cc: netdev
In-Reply-To: <20180601092646.15353.28269.stgit@john-Precision-Tower-5810>

Add functions to pretty print the CFG and DOM table. We can also
add contrib scripts to print this dot format so tools can visualize
them. I at least found this helpful. Will add scripts in tools/bpf
follow up patch.

For development we are always printing these but should put these
noisy routines behind a verbose bit and/or only print when an error
has occured in bounded-loop logic.

Signed-off-by: John Fastabend <john.fastabend@gmail.com>
---
 kernel/bpf/cfg.c      |   54 +++++++++++++++++++++++++++++++++++++++++++++++++
 kernel/bpf/cfg.h      |    5 +++++
 kernel/bpf/verifier.c |    2 ++
 3 files changed, 61 insertions(+)

diff --git a/kernel/bpf/cfg.c b/kernel/bpf/cfg.c
index c67541c..0d50646 100644
--- a/kernel/bpf/cfg.c
+++ b/kernel/bpf/cfg.c
@@ -62,6 +62,60 @@ static struct bb_node *bb_next(struct cfg_node_allocator *allocator,
 	return (struct bb_node *)cfg_node_delink(allocator, &bb->link);
 }
 
+void cfg_pretty_print(struct bpf_verifier_env *env,
+		      struct cfg_node_allocator *allocator,
+		      struct bpf_subprog_info *subprog)
+{
+	void **bb_list = (void **)&subprog->bbs;
+	struct bb_node *bb, *exit_bb;
+
+	bb = entry_bb(bb_list);
+	exit_bb = exit_bb(bb_list);
+
+	bpf_verifier_log_write(env, "CFG: ");
+	while (bb && bb != exit_bb) {
+		struct bb_node *next_bb = bb_next(allocator, bb);
+		struct edge_node *e;
+
+		e = cfg_node_delink(allocator, &bb->e_succs);
+		while (e) {
+			struct bb_node *dst = e->dst;
+			int tail = next_bb->head - 1;
+			struct bb_node *dst_next;
+			int dst_tail;
+
+			dst_next = bb_next(allocator, dst);
+			dst_tail = dst_next ? dst_next->head - 1 : 65534;
+
+			bpf_verifier_log_write(env, " %i[%i,%i] -> %i[%i,%i] ",
+					       bb->idx, bb->head, tail, dst->idx, dst->head, dst_tail);
+			e = cfg_node_delink(allocator, &e->link);
+		}
+		bb = bb_next(allocator, bb);
+	}
+	bpf_verifier_log_write(env, "\n");
+}
+
+void dom_pretty_print(struct bpf_verifier_env *env,
+		      struct bpf_subprog_info *subprog)
+{
+	int lane_len, bb_num = subprog->bb_num - 2;
+	int i, j;
+
+	lane_len = BITS_TO_LONGS(bb_num);
+
+	bpf_verifier_log_write(env, "DOM:\n");
+	for (i = 0; i < bb_num; i++) {
+		for (j = 0; j < bb_num; j++) {
+			bpf_verifier_log_write(env, " %i ",
+			    test_bit(j,
+				     subprog->dtree + i * lane_len) ? 1 : 0);
+		}
+		bpf_verifier_log_write(env, "\n");
+	}
+	bpf_verifier_log_write(env, "\n");
+}
+
 struct dom_info {
 	u16 *dfs_parent;
 	u16 *dfs_order;
diff --git a/kernel/bpf/cfg.h b/kernel/bpf/cfg.h
index 8363406..44dcabb 100644
--- a/kernel/bpf/cfg.h
+++ b/kernel/bpf/cfg.h
@@ -37,6 +37,11 @@ bool subprog_has_loop(struct cfg_node_allocator *allocator,
 		      struct bpf_subprog_info *subprog);
 int subprog_has_irreduciable_loop(struct cfg_node_allocator *allocator,
 				  struct bpf_subprog_info *subprog);
+void cfg_pretty_print(struct bpf_verifier_env *env,
+		      struct cfg_node_allocator *allocator,
+		      struct bpf_subprog_info *subprog);
+void dom_pretty_print(struct bpf_verifier_env *env,
+		      struct bpf_subprog_info *subprog);
 int subprog_init_bb(struct cfg_node_allocator *allocator, void **bb_list,
 		    int subprog_start, int subprog_end);
 void subprog_free(struct bpf_subprog_info *subprog, int end_idx);
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 49895f3..610559a 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -913,6 +913,8 @@ static int check_subprogs(struct bpf_verifier_env *env)
 						     &subprog[cur_subprog]);
 			if (ret < 0)
 				goto free_nodes;
+			cfg_pretty_print(env, &allocator, &subprog[cur_subprog]);
+			dom_pretty_print(env, &subprog[cur_subprog]);
 			ret = subprog_has_irreduciable_loop(&allocator,
 							&subprog[cur_subprog]);
 			if (ret < 0)

^ permalink raw reply related

* [RFC PATCH 13/16] bpf: verifier, can ptr range be calculated with scalar ALU op
From: John Fastabend @ 2018-06-01  9:33 UTC (permalink / raw)
  To: alexei.starovoitov, daniel, davem; +Cc: netdev
In-Reply-To: <20180601092646.15353.28269.stgit@john-Precision-Tower-5810>

At the moment any BPF_ADD or BPF_NEG with a pointer type will create
a new pointer and destroy the register range. Then any memory access
after this will fail because it looks like no bounds checking has
been done, even if it was previously done on the other pointer. So
patterns like this fail,

   ptrA = pkt_data;

   if (ptrA + expected_payload > data_end)
          return 0;

   ptrA  += 1
   accum += *ptrA
   ptrA  += 1
   accum += *ptrA

I did a quick look over the code and it seems like if the ALU op
has a uma_val and it is less than the previous range, we can simply
reduce the range by that amount and pull it into the new ptr.

This patch does the above.
---
 kernel/bpf/verifier.c |    6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 610559a..c41f587 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -2830,8 +2830,10 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env,
 		dst_reg->off = ptr_reg->off;
 		if (reg_is_pkt_pointer(ptr_reg)) {
 			dst_reg->id = ++env->id_gen;
-			/* something was added to pkt_ptr, set range to zero */
-			dst_reg->range = 0;
+			if (umax_val > dst_reg->range)
+				dst_reg->range = 0;
+			else
+				dst_reg->range -= umax_val;
 		}
 		break;
 	case BPF_SUB:

^ permalink raw reply related

* [RFC PATCH 14/16] bpf: verifier, add initial support to allow bounded loops
From: John Fastabend @ 2018-06-01  9:33 UTC (permalink / raw)
  To: alexei.starovoitov, daniel, davem; +Cc: netdev
In-Reply-To: <20180601092646.15353.28269.stgit@john-Precision-Tower-5810>

Allow bounded loops if we can become convinced they will in fact
terminate.

At a high level this is done in steps the first two steps are done
outside of the do_check routine which "runs" the instructions.

 1. Use dom tree to find loops.
 2. Find loop induction variable in the loop. (I use loop
    induction variable here to refer to the variable used
    in the loop guard. For example in `do { .... i++; ...} while(i)`
    we refer to 'i' as the loop induction variable any other
    variables as just induction variables. I need to check
    what the standard terminology is for this)

The final steps are done 'inline' with do_check when the first
instruction of a loop is reached.

 3. Verify the bounds of the loop induction variable do in
    fact ensure the loop will terminate at some point. e.g.
    if its an increasing induction variable it has a lower bound.
 4. Deduce the min/max values of the induction variable so
    that when the loop is run all possible worst case values are
    tested on the first iteration through the loop. And the
    state pruning logic has some chance of pruning future trips
    through the loop.
 5. When exiting the loop clean up loop markers (tbd)

So that is the high level sketch now the details:

1. Finding the loops using standard dom queries is shown in
   previous patches, so it will not be discussed further.

2. Finding the loop induction variable. This is the variable
   easily identified in C constructs by any of the following,

      for (i = 0; i < j; i++) {...}
      while (i) { ... i++; ...}
      do {... i++; ... } while(i)

   and so on. The fist problem with finding the loop induction
   variable in BPF is locating it. LLVM clang/bpf has a set of
   patterns it tends to generate for various C loop constructs.
   This patch matches the following simple one (in pseudocode),

       hdr:
            <do stuff>
            if (i != x) goto hdr

   where the comparison at the end can match any of the JMP
   ops and can also of the form '(i op imm)'. The key is the
   jump is the last insn in the loop. For example another
   common pattern generated by LLVM, in my codes at least is
   the following (again in pseudocode),

       hdr:
           <do stuff>
           if (i != x) goto out
           <do more stuff>
           goto hdr
       out:
           <outside loop>

  This patch is still fairly naive on this front and does not
  handle the second case yet. It seems though by scanning the
  loop for conditionals that exit the loop and testing to see
  if either src or dst reg are induction variables should be
  enough to resolve this.

  For each potential loop induction variable we walk all paths
  back to the header of the loop and test (a) is it always
  increasing or decreasing (but not both!) and (b) that all
  paths agree.

  Currently, we only search for the loop induction variable.
  Future patches will expand this for additional induction
  variables. This will help the state pruning logic later.
  Also we only allow for induction variables generated
  from constants at the moment. This will be solved by
  tracking multiple induction variables.

  Even with all these limitations I've been able to use this
  to get some basic loops running. See subsequent patch.

  Once we have found a loop induction variable we can test
  that it is compatible with the comparison operation. For
  example 'while (i > 0) { ... i++ } is clearly not a valid
  loop.

3. Now that we believe we have a valid loop induction variable we
   need to confirm we have good bounds when the do_check hits the
   first insn in the loop. This ensures that the loops has a
   known starting point. If the loop induction variable is
   increasing we ensure it has min value and vice versa.

4. If we stopped here the loop would terminate but would
   not likely prune any states so each iteration would
   contribute to the verifier complexity and many loops
   would hit the complexity limit.

   To help the state pruning logic we set the induction variables
   to their min/max values, which we can calculate because we
   know the final exit condition and how the induction variable
   increases/decreases for each trip in the loop.

   There is one noteworth problem though. When the ALU operation
   is run over these new ranges it would normally increment both
   the min and the max. However this is not correct anymore
   because one of the bounds is a result of the conditional, that
   exits the loop and so can not be passed, and the other is a
   result of the initial condition entering the loop. So to
   resolve this we only increment the initial condition and fix
   the upper/lower bound that is attributed to the conditional.
   This works, except tnum needs to be improved to handle this.
   For right now I hacked this by forcing tnum values using
   tnum_range. This might be OK (it seems to work on scalars
   at least) but seems a bit like an ugly hack. More thinking
   needed.

   With the above I've had luck getting the pruning logic
   to work for many actual C code loops.

5. Finally, when we exit the node we have to clear the flag on
   the register indicating its an induction variable. This is
   what kicks the logic to try and only update min/max and fix
   the other. (tbd, at the moment if these variables are used
   after the loop we run spilling these vars in and out of the
   stack while tracking bounds which may make the state pruning
   logic less effective. In practice it is cleared fairly quickly
   usually by a mark_set_reg_unknown or the likes.)

All right that is it.

The big todos are the following:

- nested loops is broken at the moment, could just reject
  nested loops in first iteration of this work.
- only doing simple pattern matching for one loop type generated
  by LLVM need to generalize to find most types (or at least the
  common ones).
- I need to check overflow cases in a few places
- Bounds checks need to account for induction var step to ensure
  we account for possibility of walking past guard in last iteration
- Follow on patches will track multiple induction variables
- In general the code needs some love, error paths are suspect,
  originally I was using lists for things that can now be dropped
  and use more standard stacks, bb_node_stack(!?) was part of a
  bigger object but then got simplified now there is nothing left,
  etc.
- There are some problematic code snippets as well that we can
  illustrate in example code in next patches. For example LLVM
  likes to test unsigned values for '!= 0' to avoid decrementing
  a unsigned of 0 causing overflow state. However we don't learn
  this bound from the ALU op and we mark the reg as possibly
  for overflow which ruins our bounds preventing loop verification
  in some cases..

Interesting note, we nearly have scalar evolution infrastructure
in place which would be interesting but not sure if its needet
yet. We will probably get there with the follow on patch for
multiple induction variables.

Signed-off-by: John Fastabend <john.fastabend@gmail.com>
---
 include/linux/bpf_verifier.h    |   20 +
 kernel/bpf/cfg.c                |  971 +++++++++++++++++++++++++++++++++++++++
 kernel/bpf/cfg.h                |    8 
 kernel/bpf/verifier.c           |   98 +++-
 samples/bpf/xdp2skb_meta_kern.c |   57 ++
 5 files changed, 1126 insertions(+), 28 deletions(-)

diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h
index 07844ff..b8c9853 100644
--- a/include/linux/bpf_verifier.h
+++ b/include/linux/bpf_verifier.h
@@ -40,6 +40,14 @@ enum bpf_reg_liveness {
 	REG_LIVE_WRITTEN, /* reg was written first, screening off later reads */
 };
 
+enum bpf_indvar_state {
+	BPF_LOOP_UNKNOWN,
+	BPF_LOOP_INVALID,
+	BPF_LOOP_IMM,
+	BPF_LOOP_INC,
+	BPF_LOOP_DEC,
+};
+
 struct bpf_reg_state {
 	enum bpf_reg_type type;
 	union {
@@ -81,9 +89,17 @@ struct bpf_reg_state {
 	 * while another to the caller's stack. To differentiate them 'frameno'
 	 * is used which is an index in bpf_verifier_state->frame[] array
 	 * pointing to bpf_func_state.
-	 * This field must be second to last, for states_equal() reasons.
+	 * This field marks the offset for comparisons in state pruning, review
+	 * states_equal() for specifics.
 	 */
 	u32 frameno;
+
+	/* Used to determine if this reg state is tracking an induction variable
+	 * through a loop. Induction variables have worst case values for all
+	 * iterations of the loop.
+	 */
+	enum bpf_indvar_state indvar;
+
 	/* This field must be last, for states_equal() reasons. */
 	enum bpf_reg_liveness live;
 };
@@ -139,6 +155,7 @@ struct bpf_verifier_state_list {
 	struct bpf_verifier_state_list *next;
 };
 
+struct bpf_loop_info;
 struct bpf_insn_aux_data {
 	union {
 		enum bpf_reg_type ptr_type;	/* pointer type for load/store insns */
@@ -148,6 +165,7 @@ struct bpf_insn_aux_data {
 	int ctx_field_size; /* the ctx field size for load insn, maybe 0 */
 	int sanitize_stack_off; /* stack slot to be cleared */
 	bool seen; /* this insn was processed by the verifier */
+	struct bpf_loop_info *loop; /* when set this insn marks loop header */
 };
 
 #define MAX_USED_MAPS 64 /* max number of maps accessed by one eBPF program */
diff --git a/kernel/bpf/cfg.c b/kernel/bpf/cfg.c
index 0d50646..4622365 100644
--- a/kernel/bpf/cfg.c
+++ b/kernel/bpf/cfg.c
@@ -12,6 +12,7 @@
 #include <linux/sort.h>
 
 #include "cfg.h"
+#include "disasm.h"
 
 /* The size of cfg nodes matters, therefore we try to avoid using doubly linked
  * list and we try to use base + offset to address node, by this we only need
@@ -53,6 +54,64 @@ struct bb_node {
 	u16 idx;
 };
 
+struct bb_node_stack {
+	struct bb_node *bb;
+	struct list_head list;
+};
+
+static const char * const bpf_loop_state_str[] = {
+	[BPF_LOOP_UNKNOWN]	= "unknown",
+	[BPF_LOOP_INVALID]	= "invalid",
+	[BPF_LOOP_IMM]		= "imm",
+	[BPF_LOOP_INC]		= "increasing",
+	[BPF_LOOP_DEC]		= "decreasing",
+};
+
+struct bb_state_reg {
+	int reg;	/* register number */
+	int off;	/* offset in stack or unused */
+	int size;	/* size in stack or unused */
+	s64 smin_value; /* minimum possible (s64)value */
+	s64 smax_value; /* maximum possible (s64)value */
+	u64 umin_value; /* minimum possible (u64)value */
+	u64 umax_value; /* maximum possible (u64)value */
+	enum bpf_indvar_state state;
+};
+
+struct bb_state {
+	struct bb_node *bb;
+	struct list_head list;
+	int insn;
+	int insn_cnt;
+	struct bb_state_reg src;
+	struct bb_state_reg dst;
+};
+
+struct bpf_loop_info {
+	struct list_head bb;
+	int bb_num;
+	int insn_cnt;
+	int insn_entry;
+	int insn_exit;
+	struct bb_state_reg src;
+	struct bb_state_reg dst;
+	u16 idx;
+	struct bpf_loop_info *next;
+};
+
+static u16 loop_idx;
+
+static void bpf_clear_state(struct bb_state *s)
+{
+	s->insn = s->insn_cnt = 0;
+
+	s->src.state = BPF_LOOP_UNKNOWN;
+	s->src.reg = s->src.off = s->src.size = 0;
+
+	s->dst.state = BPF_LOOP_UNKNOWN;
+	s->dst.reg = s->src.off = s->src.size = 0;
+}
+
 #define entry_bb(bb_list)		(struct bb_node *)(*bb_list)
 #define exit_bb(bb_list)		(struct bb_node *)(*(bb_list + 1))
 
@@ -791,13 +850,912 @@ int subprog_build_dom_info(struct bpf_verifier_env *env,
 	return ret;
 }
 
-bool subprog_has_loop(struct cfg_node_allocator *allocator,
-		      struct bpf_subprog_info *subprog)
+static bool bb_search(struct bb_node *bb, struct bpf_loop_info *loop)
+{
+	struct bb_node_stack *i;
+
+	list_for_each_entry(i, &loop->bb, list) {
+		if (i->bb->idx == bb->idx)
+			return true;
+	}
+	return false;
+}
+
+void bpf_state_set_invalid(struct bb_state *state)
+{
+	state->src.state = BPF_LOOP_INVALID;
+	state->dst.state = BPF_LOOP_INVALID;
+}
+
+void bpf_state_set_stx(struct bb_state *state, const struct bpf_insn *insn)
+{
+	int size = bpf_size_to_bytes(BPF_SIZE(insn->code));
+
+	/* BPF_MEM | <size> | BPF_STX: *(size *) (dst_reg + off) = src_reg */
+	if (state->dst.reg == insn->dst_reg) {
+		state->dst.reg = insn->src_reg;
+		state->dst.off = insn->off;
+		state->dst.size = size;
+	} else if (state->src.reg == insn->dst_reg) {
+		state->src.reg = insn->src_reg;
+		state->src.off = insn->off;
+		state->src.size = size;
+	}
+}
+
+static void bpf_state_set_ldx(struct bpf_verifier_env *env,
+			      struct bb_state *state, const struct bpf_insn *insn)
+{
+	int off = insn->off;
+	int size = bpf_size_to_bytes(BPF_SIZE(insn->code));
+
+	/* BPF_MEM | <size> | BPF_LDX:  dst_reg = *(size *) (src_reg + off) */
+	if (state->dst.reg == insn->src_reg && state->dst.off == off) {
+		if (state->dst.size != size) {
+			bpf_verifier_log_write(env,
+					       "Loop tracing (dst) through BPF_LDX with mismatch sizes unsupported (%i != %i)\n",
+					       state->dst.size, size);
+			bpf_state_set_invalid(state);
+			return;
+		}
+		state->dst.reg = insn->dst_reg;
+	} else if (state->src.reg == insn->dst_reg && state->dst.off == off) {
+		if (state->src.size != size) {
+			bpf_verifier_log_write(env,
+					       "Loop tracing (src) through BPF_LDX with mismatch sizes unsupported (%i != %i)\n",
+					       state->src.size, size);
+			bpf_state_set_invalid(state);
+			return;
+		}
+		state->src.reg = insn->dst_reg;
+	}
+}
+
+void bpf_state_set_xadd(struct bb_state *state, const struct bpf_insn *insn)
+{
+	/* Bail out on XADD programs for the moment */
+	bpf_state_set_invalid(state);
+}
+
+static void _bpf_state_set_add(struct bpf_verifier_env *env,
+			       struct bb_state_reg *reg,
+			       const struct bpf_insn *insn, bool add)
+{
+	int sign;
+
+	/* Currently, only basic induction variables are supported. So we
+	 * require "reg += const" this limitation is artificial and we can support
+	 * more complex linear statements 'x += y' and 'x = x + a y' with additional
+	 * verifier effort. However, lets see if this is actually needed before
+	 * we add recursive path search for n-order induction variables.
+	 */
+	if (BPF_SRC(insn->code) == BPF_K) {
+		/* BPF_ADD/BPF_NEG by zero is a NOP just return */
+		if (insn->imm == 0)
+			return;
+
+		if (add)
+			sign = insn->imm < 0 ? BPF_LOOP_DEC : BPF_LOOP_INC;
+		else
+			sign = insn->imm < 0 ? BPF_LOOP_INC : BPF_LOOP_DEC;
+	} else {
+		reg->state = BPF_LOOP_INVALID;
+		return;
+	}
+
+	if (reg->state == BPF_LOOP_UNKNOWN)
+		reg->state = sign;
+	else if (reg->state == BPF_LOOP_INC && sign == BPF_LOOP_INC)
+		reg->state = BPF_LOOP_INC;
+	else if (reg->state == BPF_LOOP_DEC && sign == BPF_LOOP_DEC)
+		reg->state = BPF_LOOP_DEC;
+	else
+		reg->state = BPF_LOOP_INVALID;
+}
+
+static void bpf_state_set_add(struct bpf_verifier_env *env,
+			      struct bb_state *state,
+			      const struct bpf_insn *insn)
+{
+	if (state->dst.reg == insn->dst_reg) {
+		_bpf_state_set_add(env, &state->dst, insn, true);
+	} else if (state->src.reg == insn->dst_reg) {
+		_bpf_state_set_add(env, &state->src, insn, true);
+	} else {
+		bpf_state_set_invalid(state);
+		WARN_ON_ONCE(1);
+	}
+}
+
+static void bpf_state_set_sub(struct bpf_verifier_env *env,
+			      struct bb_state *state,
+			      const struct bpf_insn *insn)
+{
+	if (state->dst.reg == insn->dst_reg) {
+		_bpf_state_set_add(env, &state->dst, insn, false);
+	} else if (state->src.reg == insn->dst_reg) {
+		_bpf_state_set_add(env, &state->src, insn, false);
+	} else {
+		bpf_state_set_invalid(state);
+		WARN_ON_ONCE(1);
+	}
+}
+
+static void _bpf_state_set_move(struct bb_state_reg *reg, const struct bpf_insn *insn)
+{
+	if (BPF_SRC(insn->code) == BPF_K) {
+		u64 uimm = insn->imm;
+		s64 simm = (s64)insn->imm;
+
+		reg->state = BPF_LOOP_IMM;
+		reg->reg = -1;
+
+		reg->smin_value = simm;
+		reg->smax_value = simm;
+		reg->umin_value = uimm;
+		reg->umax_value = uimm;
+	} else {
+		reg->reg  = insn->src_reg;
+	}
+}
+void bpf_state_set_mov(struct bb_state *state, const struct bpf_insn *insn)
+{
+
+	if (state->dst.reg == insn->dst_reg) {
+		_bpf_state_set_move(&state->dst, insn);
+	} else if (state->src.reg == insn->dst_reg) {
+		_bpf_state_set_move(&state->src, insn);
+	} else {
+		bpf_state_set_invalid(state);
+		WARN_ON_ONCE(1);
+	}
+}
+
+void bpf_loop_state(struct bpf_verifier_env *env,
+		    int i, const struct bpf_insn *insn, struct bb_state *state)
+{
+	u8 class = BPF_CLASS(insn->code);
+
+	if (class == BPF_ALU || class == BPF_ALU64) {
+		u8 opcode;
+
+		if (state->src.reg != insn->dst_reg &&
+		    state->dst.reg != insn->dst_reg)
+			return;
+
+		opcode = BPF_OP(insn->code);
+		switch (opcode) {
+		case BPF_ADD:
+			bpf_state_set_add(env, state, insn);
+			break;
+		case BPF_SUB:
+			bpf_state_set_sub(env, state, insn);
+			break;
+		case BPF_MOV:
+			bpf_state_set_mov(state, insn);
+			break;
+		case BPF_DIV:
+		case BPF_OR:
+		case BPF_AND:
+		case BPF_LSH:
+		case BPF_RSH:
+		case BPF_MOD:
+		case BPF_XOR:
+		case BPF_ARSH:
+		case BPF_END:
+		default:
+			bpf_verifier_log_write(env,
+					      "%i: BPF_ALU%s: unsupported opcode (%u) invalidate state\n",
+					       i, class == BPF_ALU ? "" : "64",
+					       opcode);
+			bpf_state_set_invalid(state);
+			break;
+		}
+	} else if (class == BPF_STX) {
+		u8 mode = BPF_MODE(insn->code);
+
+		switch (mode) {
+		case BPF_MEM:
+			/* BPF_MEM | <size> | BPF_STX */
+			bpf_state_set_stx(state, insn);
+			break;
+		case BPF_XADD:
+			/* BPF_XADD | BPF_W | BPF_STX */
+			/* BPF_XADD | BPF_DW | BPF_STX */
+			bpf_state_set_xadd(state, insn);
+			break;
+		default:
+			bpf_verifier_log_write(env,
+					       "%i: BPF_STX: unsupported mode (%u) invalidate state\n",
+					       i, mode);
+			bpf_state_set_invalid(state);
+		}
+	} else if (class == BPF_ST) {
+		/* Unsupported at the moment */
+		bpf_verifier_log_write(env, "%i: BPF_ST: unsupported class invalidate state\n", i);
+		bpf_state_set_invalid(state);
+	} else if (class == BPF_LDX) {
+		u8 mode = BPF_MODE(insn->code);
+
+		if (mode != BPF_MEM) {
+			bpf_verifier_log_write(env,
+					      "%i: BPF_LDX: unsupported mode (%u) invalidate state\n",
+					      i, mode);
+			bpf_state_set_invalid(state);
+		} else {
+			/* BPF_MEM | <size> | BPF_LDX */
+			bpf_state_set_ldx(env, state, insn);
+		}
+	} else if (class == BPF_LD) {
+		/* Unsupported at the moment */
+		bpf_verifier_log_write(env, "%i: BPF_LD: unsupported class invalidate state\n", i);
+		bpf_state_set_invalid(state);
+	} else if (class == BPF_JMP) {
+		; // Jumps are verified by CFG
+	} else {
+		/* If we do not understand instruction invalidate state */
+		bpf_verifier_log_write(env, "%i: %u: unknown class invalidate state\n", i, class);
+		bpf_state_set_invalid(state);
+	}
+}
+
+/* bit noisy at the moment duplicates with print_regs */
+static void bpf_print_loop_info(struct bpf_verifier_env *env,
+				struct bpf_loop_info *loop)
+{
+	struct bpf_reg_state *regs, *src_reg = NULL, *dst_reg = NULL;
+
+	regs = cur_regs(env);
+	if (loop->src.reg >= 0)
+		src_reg = &regs[loop->src.reg];
+	if (loop->dst.reg >= 0)
+		dst_reg = &regs[loop->dst.reg];
+
+	bpf_verifier_log_write(env,
+			       "Loop %i: (%i:ty(%i))src.state(%s) (%i:ty(%i))dst.state(%s): R%d(%llu:%llu,%lld:%lld) R%d(%llu:%llu,%lld:%lld)\n",
+			       loop->idx,
+			       loop->src.reg, src_reg ? src_reg->type : -1,
+			       bpf_loop_state_str[loop->src.state],
+			       loop->dst.reg, dst_reg ? dst_reg->type : -1,
+			       bpf_loop_state_str[loop->dst.state],
+			       loop->src.reg,
+			       src_reg ? src_reg->umin_value : loop->src.umin_value,
+			       src_reg ? src_reg->umax_value : loop->src.umax_value,
+			       src_reg ? src_reg->smin_value : loop->src.smin_value,
+			       src_reg ? src_reg->smax_value : loop->src.smax_value,
+			       loop->dst.reg,
+			       dst_reg ? dst_reg->umin_value : loop->dst.umin_value,
+			       dst_reg ? dst_reg->umax_value : loop->dst.umax_value,
+			       dst_reg ? dst_reg->smin_value : loop->dst.smin_value,
+			       dst_reg ? dst_reg->smax_value : loop->dst.smax_value);
+}
+
+static bool bpf_op_sign(u8 op)
+{
+	switch (op) {
+	case BPF_JNE:
+	case BPF_JGT:
+	case BPF_JGE:
+		return false;
+	case BPF_JSGT:
+	case BPF_JSGE:
+		return true;
+	default:
+		return false;
+	}
+}
+
+/* Verify conditions necessary to ensure increasing/decreasing loop induction
+ * variables will in fact terminate.
+ *
+ * 1. Increasing/decreasing variables _must_ be paired with a bounded variable
+ *    in this case BPF_LOOP_IMM type.
+ * 2. Increasing/decreasing variables _must_ have a "known" worst case starting
+ *    bound. For example if an increasing variable has no min value we can not
+ *    say it will actually terminate. So test increasing variables have mins
+ *    and decreasing variables have maxs.
+ * 3. The known min/max bound must match the comparison sign
+ *
+ * After this we know that a loop will increase or decrease and eventually
+ * terminate.
+ */
+static int bpf_cfg_valid_bounds(struct bpf_verifier_env *env, u8 op,
+				struct bpf_reg_state *src_reg,
+				struct bpf_reg_state *dst_reg,
+				struct bpf_loop_info *loop)
+{
+	bool sign = bpf_op_sign(op);
+
+	switch (loop->src.state) {
+	/*
+	 * dev note: improve verbose messaging, and maybe refactor the
+	 * switch stmt
+	 */
+	case BPF_LOOP_IMM:
+		if (!dst_reg) {
+			bpf_verifier_log_write(env, "internal cfg error: missing dst_reg LOOP_IMM!\n");
+			return -1;
+		}
+
+		if (loop->dst.state == BPF_LOOP_INC) {
+			if (sign && dst_reg->smin_value == S64_MIN) {
+				bpf_verifier_log_write(env,
+						       "increasing loop induction variable (towarads imm) unbounded min value\n");
+				return -1;
+			}
+		} else if (loop->dst.state == BPF_LOOP_DEC) {
+			if ((sign && dst_reg->smax_value == S64_MAX) ||
+			    (!sign && dst_reg->umax_value == U64_MAX)) {
+				bpf_verifier_log_write(env,
+						       "decreasing loop induction variable (towards imm) unbounded max value\n");
+				return -1;
+			}
+		} else {
+			return -1;
+		}
+		break;
+	case BPF_LOOP_INC:
+		if (loop->dst.state != BPF_LOOP_IMM) {
+			bpf_verifier_log_write(env,
+					       "increasing loop induction variable not towards imm\n");
+			return -1;
+		}
+
+		if (!src_reg) {
+			bpf_verifier_log_write(env, "internal cfg error: missing src_reg LOOP_INC!\n");
+			return -1;
+		}
+
+		if (sign && src_reg->smin_value == S64_MIN) {
+			bpf_verifier_log_write(env,
+					       "increasing loop induction variable unbounded min value\n");
+			return -1;
+		}
+		break;
+	case BPF_LOOP_DEC:
+		if (loop->dst.state != BPF_LOOP_IMM) {
+			bpf_verifier_log_write(env,
+					       "decreasing loop induction variable not towards imm\n");
+			return -1;
+		}
+
+		if (!src_reg) {
+			bpf_verifier_log_write(env, "internal cfg error: missing src_reg LOOP_DEC!\n");
+			return -1;
+		}
+
+		if ((sign && src_reg->smax_value == S64_MAX) ||
+		    (!sign && src_reg->umax_value == U64_MAX)) {
+			bpf_verifier_log_write(env,
+					       "decreasing loop induction variable unbounded max value\n");
+			return -1;
+		}
+		break;
+	default:
+		bpf_verifier_log_write(env, "loop state unknown/invalid\n");
+		return -1;
+	}
+	return 0;
+}
+
+/* Before calling bpf_cfg_deduce_bounds we ensured the loop does in fact
+ * terminate. (Because increasing/decreasing towards a constant) But, if
+ * left as is each iteration of the loop will be a new state. This is a
+ * result of the loop induction variable, by definition, being incremented
+ * or decremented by a constant each iteration of the loop.
+ *
+ * To resolve this we know the worst case iteration count and the step
+ * of each iteration so we know the expected range of the indvar. Here
+ * we calculate and set the min/max to the worst case range.
+ */
+static int bpf_cfg_deduce_bounds(struct bpf_verifier_env *env, u8 op,
+				 struct bpf_reg_state *src_reg,
+				 struct bpf_reg_state *dst_reg,
+				 struct bpf_loop_info *loop)
+{
+	int err = 0;
+
+	/* Need to consider overflow cases? */
+	/* Need to consider step > 1 */
+	if (loop->src.state == BPF_LOOP_INC) {
+		switch (op) {
+		case BPF_JNE:
+		case BPF_JGT:
+		case BPF_JGE:
+		case BPF_JSGT:
+		case BPF_JSGE:
+			src_reg->umax_value = loop->dst.umax_value;
+			src_reg->smax_value = loop->dst.smax_value;
+			src_reg->smax_value = loop->dst.smax_value;
+			src_reg->umax_value = loop->dst.umax_value;
+			break;
+		default:
+			bpf_verifier_log_write(env, "src.state INC, invalid opcode %u", op);
+			err = -1;
+			break;
+		}
+		src_reg->var_off = tnum_range(src_reg->umin_value,
+					      src_reg->umax_value);
+	} else if (loop->src.state == BPF_LOOP_DEC) {
+		switch (op) {
+		case BPF_JNE:
+		case BPF_JGT:
+		case BPF_JGE:
+		case BPF_JSGT:
+		case BPF_JSGE:
+			src_reg->umin_value = loop->dst.umin_value;
+			src_reg->smin_value = loop->dst.smin_value;
+			break;
+		default:
+			bpf_verifier_log_write(env, "src.state INC, invalid opcode %u", op);
+			err = -1;
+			break;
+		}
+		src_reg->var_off = tnum_range(src_reg->umin_value,
+					      src_reg->umax_value);
+	} else if (loop->dst.state == BPF_LOOP_INC) {
+		switch (op) {
+		case BPF_JNE:
+		case BPF_JGT:
+		case BPF_JGE:
+		case BPF_JSGT:
+		case BPF_JSGE:
+			dst_reg->umax_value = loop->src.umax_value;
+			dst_reg->smax_value = loop->src.smax_value;
+			break;
+		default:
+			bpf_verifier_log_write(env, "dst.state INC, invalid opcode %u", op);
+			err = -1;
+			break;
+		}
+		dst_reg->var_off = tnum_range(dst_reg->umin_value,
+					      dst_reg->umax_value);
+	} else if (loop->dst.state == BPF_LOOP_DEC) {
+		switch (op) {
+		case BPF_JNE:
+		case BPF_JLT:
+		case BPF_JLE:
+		case BPF_JSLT:
+		case BPF_JSLE:
+			dst_reg->umin_value = loop->src.umin_value;
+			dst_reg->smin_value = loop->src.smin_value;
+			break;
+		default:
+			bpf_verifier_log_write(env, "dst.state DEC, invalid opcode %u", op);
+			err = -1;
+			break;
+		}
+		dst_reg->var_off = tnum_range(dst_reg->umin_value,
+					      dst_reg->umax_value);
+	} else {
+		bpf_verifier_log_write(env, "internal cfg error: unknown src|dst state\n");
+		err = -1;
+	}
+
+	return err;
+}
+
+static int bb_stack_push(struct bb_node *node, struct bpf_loop_info *loop)
+{
+	struct bb_node_stack *n;
+
+	n = kzalloc(sizeof(struct bb_node_stack), GFP_KERNEL);
+	if (!n)
+		return -ENOMEM;
+
+	n->bb = node;
+	list_add(&n->list, &loop->bb);
+	return 0;
+}
+
+static struct bb_node *bb_stack_pop(struct bpf_loop_info *loop)
+{
+	struct bb_node *bb = NULL;
+	struct bb_node_stack *n;
+
+	n = list_first_entry_or_null(&loop->bb, struct bb_node_stack, list);
+	if (n) {
+		list_del(&n->list);
+		bb = n->bb;
+	}
+	kfree(n);
+	return bb;
+}
+
+static void bpf_free_loop(struct bpf_loop_info *loop)
+{
+	struct bb_node *bb = bb_stack_pop(loop);
+
+	while (bb)
+		bb = bb_stack_pop(loop);
+	kfree(loop);
+}
+
+/* CFG verified the loop is normal and that either src or dst is a
+ * basic loop induction variable but we still need to ensure min/max
+ * values will guarentee termination and that trip count is reasonable.
+ *
+ * There are a few cases that need to be handled. First the easy case, if the
+ * CFG loop scan found that one of the registers is an IMM value then we need
+ * only verify that the other register has a min or max value depending on
+ * increasing/decreasing induction variable. The more complicated case is when
+ * the loop scan has yet to resolve one of the registers bounds. (Note having
+ * two unknowns is not valid because we would have no guarantees the loop
+ * induction variable is increasing or decreasing) At this point we need to
+ * lookup the register and verify it does have a known min/max value without
+ * these bounds we can not make any statement about termination.
+ *
+ * Then we make a worse case instruction count analysis and when the loop
+ * latch is completed this will be added to the total instruction count.
+ *
+ * Finally, with worst case insn count completed and valid bounds go ahead
+ * and mark the register with the induction type indicator.
+*/
+int bpf_check_loop_header(struct bpf_verifier_env *env, int insn_idx)
+{
+	struct bpf_reg_state *regs, *src_reg = NULL, *dst_reg = NULL;
+	struct bpf_insn *insn = env->prog->insnsi;
+	struct bpf_loop_info *loop;
+	int err = 0;
+	u8 op;
+
+	loop = env->insn_aux_data[insn_idx].loop;
+	if (!loop)
+		return 0;
+
+	regs = cur_regs(env);
+	if (loop->src.reg >= 0)
+		src_reg = &regs[loop->src.reg];
+	if (loop->dst.reg >= 0)
+		dst_reg = &regs[loop->dst.reg];
+
+	op = BPF_OP(insn[loop->insn_exit].code);
+
+	/* If one of the states is unknown we have some options. We could
+	 * try to do a path walk backwards and see if the value is valid.
+	 * Or we could look at the bounds here and see if we can infer
+	 * anything from that.
+	 *
+	 * What I've found from experimentation is usually due to stack
+	 * spilling and operations that leave the variable in unknown
+	 * state we don't learn much here.
+	 *
+	 * At one point I tried some code to walk back in the tree but
+	 * I didn't like that very much either. Right now my working
+	 * assumption is we can make the bounds tracking good enough
+	 * to avoid walking back through the tree.
+	 */
+	if (loop->dst.state == BPF_LOOP_UNKNOWN ||
+	    loop->src.state == BPF_LOOP_UNKNOWN) {
+		bpf_verifier_log_write(env, "bpf loop unknown state!\n");
+		return -1;
+	}
+
+	err = bpf_cfg_valid_bounds(env, op, src_reg, dst_reg, loop);
+	if (err) {
+		bpf_verifier_log_write(env, "bpf cfg loop has invalid bounds!\n");
+		return -1;
+	}
+
+	/* At this point the bounds are valid */
+
+	/* We are going to push worse case bounds on to the loop induction
+	 * variable this way when we run symbolic execution we check validity
+	 * of min and max values. This allows the state pruning logic to have
+	 * a chance at pruning the state.
+	 *
+	 * At the moment we only track the loop induction variables any other
+	 * induction variables in the loop (linear or otherwise) will force
+	 * the loop to iterate through every case presumably exploding the
+	 * state complexity. E.g. simple example is the following,
+	 *
+	 *   for (i = 0, j = 0; i < const; i++, j++) { ... }
+	 *
+	 * Improving the induction variable logic can catch the above case
+	 * and many more.
+	 */
+	err = bpf_cfg_deduce_bounds(env, op, src_reg, dst_reg, loop);
+	if (err) {
+		bpf_verifier_log_write(env, "bpf cfg loop could not deduce bounds!\n");
+		return -1;
+	}
+
+	bpf_print_loop_info(env, loop);
+
+	/* Instruction count is being used as an approximation for runtime.
+	 * Loops have the potential to iterative many times over a single
+	 * set of instructions. To account for this charge instruction count
+	 * limits the worst case path times the worst case number of iterations.
+	 */
+	// tbd
+
+	/* Mark insn for indvar tracking informs the execution engine when
+	 * it can avoid updating bounds on an insn. This is required to
+	 * allow the pruning logic to eventually prune the loop state.
+	 */
+	if (loop->dst.state == BPF_LOOP_DEC ||
+	    loop->dst.state == BPF_LOOP_INC)
+		dst_reg->indvar = loop->dst.state;
+	else if (loop->src.state == BPF_LOOP_DEC ||
+		 loop->src.state == BPF_LOOP_INC)
+		src_reg->indvar = loop->src.state;
+
+	/* Loop _will_ terminate remove reference and let the state pruning
+	 * do its job.
+	 */
+	env->insn_aux_data[insn_idx].loop = NULL;
+	bpf_free_loop(loop);
+	return 1;
+}
+
+static int bpf_is_valid_loop_state(struct bpf_loop_info *loop)
+{
+	if (loop->src.state == BPF_LOOP_INVALID ||
+	    loop->dst.state == BPF_LOOP_INVALID)
+		return -1;
+
+	switch (loop->src.state) {
+	case BPF_LOOP_UNKNOWN:
+	case BPF_LOOP_IMM:
+		if (loop->dst.state == BPF_LOOP_INC ||
+		    loop->dst.state == BPF_LOOP_DEC)
+			return 0;
+		break;
+	case BPF_LOOP_DEC:
+	case BPF_LOOP_INC:
+		if (loop->dst.state == BPF_LOOP_UNKNOWN ||
+		    loop->dst.state == BPF_LOOP_IMM)
+			return 0;
+		break;
+	}
+
+	return -1;
+}
+
+static void bb_state_stack_push(struct bb_state *state, struct list_head *stack)
+{
+	list_add(&state->list, stack);
+}
+
+static struct bb_state *bb_state_stack_pop(struct list_head *stack)
+{
+	struct bb_state *s;
+
+	s = list_first_entry_or_null(stack, struct bb_state, list);
+	if (s)
+		list_del(&s->list);
+	return s;
+}
+
+static int build_loop_info(struct bpf_verifier_env *env,
+			   struct cfg_node_allocator *allocator,
+			   struct bpf_subprog_info *subprog,
+			   struct bb_node *head,
+			   struct bb_node *tail)
+{
+	struct bpf_insn *insns = env->prog->insnsi;
+	bool has_branch, only_has_branch;
+	struct list_head bb_state_stack;
+	struct bb_state *state = NULL;
+	struct bpf_loop_info *loop;
+	struct bb_node *next_bb;
+	struct bpf_insn *insn;
+	int err = 0;
+	u8 code;
+
+	loop = kzalloc(sizeof(struct bpf_loop_info), GFP_KERNEL);
+	if (!loop)
+		return -ENOMEM;
+
+	loop->src.state = BPF_LOOP_UNKNOWN;
+	loop->dst.state = BPF_LOOP_UNKNOWN;
+	loop->idx = loop_idx++;
+	INIT_LIST_HEAD(&loop->bb);
+	INIT_LIST_HEAD(&bb_state_stack);
+
+	state = kzalloc(sizeof(struct bb_state), GFP_KERNEL);
+	if (!state) {
+		kfree(loop);
+		return -ENOMEM;
+	}
+
+	/* Initialize stack for path walk. To track the loop induction
+	 * variable we will walk all paths back from the last instruction
+	 * to the first instruction in the loop.
+	 */
+	next_bb = bb_next(allocator, head);
+	state->bb = head;
+	state->insn = next_bb->head - 1;
+	insn = &insns[state->insn];
+	code = insn->code;
+	if (BPF_SRC(insn->code) == BPF_K) {
+		_bpf_state_set_move(&state->src, insn);
+	} else {
+		state->src.state = BPF_LOOP_UNKNOWN;
+		state->src.reg = insn->src_reg;
+	}
+	state->dst.state = BPF_LOOP_UNKNOWN;
+	state->dst.reg = insn->dst_reg;
+	state->insn_cnt = 0;
+
+	bb_state_stack_push(state, &bb_state_stack);
+	err = bb_stack_push(tail, loop);
+	if (err)
+		goto out;
+
+	loop->insn_entry = tail->head;
+	loop->insn_exit = next_bb->head - 1;
+
+	/* This is a pattern match on the loop type we expect loops
+	 * of the form,
+	 *
+	 *   header
+	 *   ...
+	 *   if (r1 > r2) goto header
+	 *   ...
+	 *
+	 * Where the jump is not a BPF_JA instruction. However sometimes
+	 * we get loops like the following,
+	 *
+	 *  header
+	 *  ...
+	 *  if (r1 > r2) goto out_of_loop
+	 *  ...
+	 *  goto header
+	 *
+	 *  Here the bounding condition is inside the loop and not in the
+	 *  last BB. Presumably we can handle these as well with additional
+	 *  searching to find the latch element. However it makes the scanning
+	 *  a bit more painful. For simplicity test if tail is valid latch and
+	 *  throw out other constructs.
+	 *
+	 *  TBD handle nested loops.
+	 */
+	only_has_branch = BPF_CLASS(code) == BPF_JMP &&
+			  BPF_OP(code) == BPF_JA;
+	if (only_has_branch) {
+		bpf_verifier_log_write(env,
+				       "non-terminating loop detected e(%i->%i)\n",
+				       head->idx, tail->idx);
+		return -EINVAL;
+	}
+
+	has_branch = only_has_branch ||
+		     (BPF_CLASS(code) == BPF_JMP &&
+		      BPF_OP(code) != BPF_EXIT &&
+		      BPF_OP(code) != BPF_CALL);
+	if (!has_branch) {
+		bpf_verifier_log_write(env,
+				       "loop without branches (class %i op %i), must be a verifier bug? e(%i->%i)\n",
+				       BPF_CLASS(code), BPF_OP(code), head->idx, tail->idx);
+		return -EINVAL;
+	}
+
+
+	/* With a valid branch then either src or dst register must be monotonic for
+	 * the loop to terminate. To detect this do a path walk through the loop and
+	 * ensure that monotonic property holds in each path.
+	 */
+	state = bb_state_stack_pop(&bb_state_stack);
+	while (state) {
+		int bb_tail, bb_head;
+		struct edge_node *e;
+		struct bb_node *bb;
+		bool found;
+
+		bb = state->bb;
+		found = bb_search(bb, loop);
+		if (!found)
+			bb_stack_push(bb, loop);
+		next_bb = bb_next(allocator, bb);
+		bb_tail = next_bb->head - 1;
+		bb_head = bb->head;
+
+		while (bb_tail >= bb_head) {
+			bpf_loop_state(env, bb_tail, &insns[bb_tail], state);
+			bb_tail--;
+			state->insn_cnt++;
+		}
+
+		if (state->src.state == BPF_LOOP_INVALID ||
+		    state->dst.state == BPF_LOOP_INVALID) {
+			bpf_verifier_log_write(env,
+					       "Detected BPF_LOOP_INVALID state\n");
+			goto out;
+		}
+
+		/* If this is the last node in the loop ensure the loop states
+		 * have not changed with paths. For example, it would be invalid
+		 * to have two paths one where the induction variable increases
+		 * and another where it decreases. If the state is invalid abort
+		 * now because if any single path fails the loop is invalid.
+		 *
+		 * Finally, assuming state is valid continue processing stack
+		 * giving the next path trace.
+		 */
+		if (bb == tail) {
+			if (state->src.state != loop->src.state &&
+			    loop->src.state != BPF_LOOP_UNKNOWN) {
+				bpf_verifier_log_write(env,
+						       "Paths (src) do not align %i != %i\n",
+						       state->src.state,
+						       loop->src.state);
+
+				goto out;
+			}
+			if (state->dst.state != loop->dst.state &&
+			    loop->dst.state != BPF_LOOP_UNKNOWN) {
+				bpf_verifier_log_write(env,
+						       "Paths (dst) do not align %i != %i\n",
+						       state->src.state,
+						       loop->src.state);
+				goto out;
+			}
+
+			if (loop->insn_cnt < state->insn_cnt)
+				loop->insn_cnt = state->insn_cnt;
+
+			loop->dst = state->dst;
+			loop->src = state->src;
+
+			bpf_clear_state(state);
+			kfree(state);
+			state = bb_state_stack_pop(&bb_state_stack);
+			continue;
+		}
+
+		e = cfg_node_delink(allocator, &bb->e_prevs);
+		while (e) {
+			struct bb_node *src = e->src;
+			struct bb_state *old = state;
+			struct bb_state *new;
+
+			new = kzalloc(sizeof(struct bb_state), GFP_KERNEL);
+			if (!state)
+				goto out;
+
+			next_bb = bb_next(allocator, src);
+
+			*new = *old;
+			new->bb = src;
+			new->insn = next_bb->head - 1;
+			bb_state_stack_push(new, &bb_state_stack);
+
+			e = cfg_node_delink(allocator, &e->link);
+		}
+		kfree(state);
+		state = bb_state_stack_pop(&bb_state_stack);
+	}
+	if (bpf_is_valid_loop_state(loop))
+		goto out;
+
+	bpf_verifier_log_write(env,
+			      "Loop detected e(%i->%i) insn(%i) src_state(R%i:%s) dst_state(R%i:%s)\n",
+			       head->idx, tail->idx, loop->insn_cnt,
+			       loop->src.reg,
+			       bpf_loop_state_str[loop->src.state],
+			       loop->dst.reg,
+			       bpf_loop_state_str[loop->dst.state]);
+	env->insn_aux_data[loop->insn_entry].loop = loop;
+	return 0;
+out:
+	while (state) {
+		kfree(state);
+		state = bb_state_stack_pop(&bb_state_stack);
+	}
+	bpf_free_loop(loop);
+	return -1;
+}
+
+int subprog_has_loop(struct bpf_verifier_env *env,
+		     struct cfg_node_allocator *allocator,
+		     struct bpf_subprog_info *subprog)
 {
 	int lane_len = BITS_TO_LONGS(subprog->bb_num - 2);
 	struct bb_node *bb, *entry_bb, *exit_bb;
 	void **bb_list = (void **)&subprog->bbs;
 	struct edge_node *e;
+	int err = 0;
 
 	entry_bb = entry_bb(bb_list);
 	exit_bb = exit_bb(bb_list);
@@ -809,8 +1767,11 @@ bool subprog_has_loop(struct cfg_node_allocator *allocator,
 
 			if (latch != entry_bb &&
 			    test_bit(bb->idx,
-				     subprog->dtree + latch->idx * lane_len))
-				return true;
+				     subprog->dtree + latch->idx * lane_len)) {
+				err = build_loop_info(env, allocator, subprog, latch, bb);
+				if (err)
+					return err;
+			}
 
 			e = cfg_node_delink(allocator, &e->link);
 		}
@@ -818,7 +1779,7 @@ bool subprog_has_loop(struct cfg_node_allocator *allocator,
 		bb = bb_next(allocator, bb);
 	}
 
-	return false;
+	return 0;
 }
 
 /* We don't want to do any further loop bounds analysis for irreducible loop,
diff --git a/kernel/bpf/cfg.h b/kernel/bpf/cfg.h
index 44dcabb..8fa64b5 100644
--- a/kernel/bpf/cfg.h
+++ b/kernel/bpf/cfg.h
@@ -15,6 +15,8 @@ struct cfg_node_allocator {
 	u8 pool_cnt;
 };
 
+struct bpf_loop_info;
+
 int add_subprog(struct bpf_verifier_env *env, int off);
 void cfg_node_allocator_free(struct cfg_node_allocator *allocator);
 int cfg_node_allocator_init(struct cfg_node_allocator *allocator,
@@ -33,8 +35,9 @@ int subprog_append_callee(struct bpf_verifier_env *env,
 int subprog_build_dom_info(struct bpf_verifier_env *env,
 			   struct cfg_node_allocator *allocator,
 			   struct bpf_subprog_info *subprog);
-bool subprog_has_loop(struct cfg_node_allocator *allocator,
-		      struct bpf_subprog_info *subprog);
+int subprog_has_loop(struct bpf_verifier_env *env,
+		     struct cfg_node_allocator *allocator,
+		     struct bpf_subprog_info *subprog);
 int subprog_has_irreduciable_loop(struct cfg_node_allocator *allocator,
 				  struct bpf_subprog_info *subprog);
 void cfg_pretty_print(struct bpf_verifier_env *env,
@@ -45,6 +48,7 @@ void dom_pretty_print(struct bpf_verifier_env *env,
 int subprog_init_bb(struct cfg_node_allocator *allocator, void **bb_list,
 		    int subprog_start, int subprog_end);
 void subprog_free(struct bpf_subprog_info *subprog, int end_idx);
+int bpf_check_loop_header(struct bpf_verifier_env *env, int insn_idx);
 
 #define DFS_NODE_EXPLORING	1
 #define DFS_NODE_EXPLORED	2
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index c41f587..c9ed3d7 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -687,6 +687,7 @@ static void __mark_reg_unknown(struct bpf_reg_state *reg)
 	reg->off = 0;
 	reg->var_off = tnum_unknown;
 	reg->frameno = 0;
+	reg->indvar = BPF_LOOP_UNKNOWN;
 	__mark_reg_unbounded(reg);
 }
 
@@ -913,6 +914,7 @@ static int check_subprogs(struct bpf_verifier_env *env)
 						     &subprog[cur_subprog]);
 			if (ret < 0)
 				goto free_nodes;
+
 			cfg_pretty_print(env, &allocator, &subprog[cur_subprog]);
 			dom_pretty_print(env, &subprog[cur_subprog]);
 			ret = subprog_has_irreduciable_loop(&allocator,
@@ -924,7 +926,8 @@ static int check_subprogs(struct bpf_verifier_env *env)
 				ret = -EINVAL;
 				goto free_nodes;
 			}
-			if (subprog_has_loop(&allocator,
+			if (subprog_has_loop(env,
+					     &allocator,
 					     &subprog[cur_subprog])) {
 				verbose(env, "cfg - loop detected");
 				ret = -EINVAL;
@@ -1117,7 +1120,8 @@ static int check_stack_write(struct bpf_verifier_env *env,
 
 	cur = env->cur_state->frame[env->cur_state->curframe];
 	if (value_regno >= 0 &&
-	    is_spillable_regtype((type = cur->regs[value_regno].type))) {
+	    (is_spillable_regtype((type = cur->regs[value_regno].type)) ||
+	    cur->regs[value_regno].indvar)) {
 
 		/* register containing pointer is being spilled into stack */
 		if (size != BPF_REG_SIZE) {
@@ -2784,6 +2788,13 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env,
 	    !check_reg_sane_offset(env, ptr_reg, ptr_reg->type))
 		return -EINVAL;
 
+	/* For now if indvar is being used with a pointer and scalar drop the
+	 * indvar. This will force the loop to run until termination without
+	 * the aid of pruning. With extra complexity this can be added in the
+	 * future if needed.
+	 */
+	dst_reg->indvar = BPF_LOOP_UNKNOWN;
+
 	switch (opcode) {
 	case BPF_ADD:
 		/* We can take a fixed offset as long as it doesn't overflow
@@ -2809,6 +2820,8 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env,
 		 * this creates a new 'base' pointer, off_reg (variable) gets
 		 * added into the variable offset, and we copy the fixed offset
 		 * from ptr_reg.
+		 * For PTR_TO_PACJET ptrs we propagate range through when
+		 * possible by taking worst case upper limit.
 		 */
 		if (signed_add_overflows(smin_ptr, smin_val) ||
 		    signed_add_overflows(smax_ptr, smax_val)) {
@@ -2956,6 +2969,13 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env,
 		return 0;
 	}
 
+	/* Verifier check to ensure indvar state is good */
+	if (dst_reg->indvar && opcode != BPF_ADD && opcode != BPF_SUB) {
+		WARN_ON_ONCE(1);
+		__mark_reg_unknown(dst_reg);
+		return 0;
+	}
+
 	switch (opcode) {
 	case BPF_ADD:
 		if (signed_add_overflows(dst_reg->smin_value, smin_val) ||
@@ -2963,18 +2983,37 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env,
 			dst_reg->smin_value = S64_MIN;
 			dst_reg->smax_value = S64_MAX;
 		} else {
-			dst_reg->smin_value += smin_val;
-			dst_reg->smax_value += smax_val;
+			if (dst_reg->indvar == BPF_LOOP_INC) {
+				dst_reg->smin_value += smin_val;
+			} else if (dst_reg->indvar == BPF_LOOP_DEC) {
+				dst_reg->smax_value += smax_val;
+			} else {
+				dst_reg->smin_value += smin_val;
+				dst_reg->smax_value += smax_val;
+			}
 		}
 		if (dst_reg->umin_value + umin_val < umin_val ||
 		    dst_reg->umax_value + umax_val < umax_val) {
 			dst_reg->umin_value = 0;
 			dst_reg->umax_value = U64_MAX;
 		} else {
-			dst_reg->umin_value += umin_val;
-			dst_reg->umax_value += umax_val;
+			if (dst_reg->indvar == BPF_LOOP_INC) {
+				dst_reg->umin_value += umin_val;
+			} else if (dst_reg->indvar == BPF_LOOP_DEC) {
+				dst_reg->umax_value += smax_val;
+			} else {
+				dst_reg->umin_value += umin_val;
+				dst_reg->umax_value += umax_val;
+			}
 		}
-		dst_reg->var_off = tnum_add(dst_reg->var_off, src_reg.var_off);
+
+		/* This is a hack for now need to sort out tnum when manipulating
+		 * min/max bounds directly.
+		 */
+		if (dst_reg->indvar)
+			dst_reg->var_off = tnum_range(dst_reg->umin_value, dst_reg->umax_value);
+		else
+			dst_reg->var_off = tnum_add(dst_reg->var_off, src_reg.var_off);
 		break;
 	case BPF_SUB:
 		if (signed_sub_overflows(dst_reg->smin_value, smax_val) ||
@@ -2983,8 +3022,14 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env,
 			dst_reg->smin_value = S64_MIN;
 			dst_reg->smax_value = S64_MAX;
 		} else {
-			dst_reg->smin_value -= smax_val;
-			dst_reg->smax_value -= smin_val;
+			if (dst_reg->indvar == BPF_LOOP_INC) {
+				dst_reg->smin_value -= smax_val;
+			} else if (dst_reg->indvar == BPF_LOOP_DEC) {
+				dst_reg->smax_value -= smin_val;
+			} else {
+				dst_reg->smin_value -= smax_val;
+				dst_reg->smax_value -= smin_val;
+			}
 		}
 		if (dst_reg->umin_value < umax_val) {
 			/* Overflow possible, we know nothing */
@@ -2992,10 +3037,23 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env,
 			dst_reg->umax_value = U64_MAX;
 		} else {
 			/* Cannot overflow (as long as bounds are consistent) */
-			dst_reg->umin_value -= umax_val;
-			dst_reg->umax_value -= umin_val;
+			if (dst_reg->indvar == BPF_LOOP_INC) {
+				dst_reg->umin_value -= umax_val;
+			} else if (dst_reg->indvar == BPF_LOOP_DEC) {
+				dst_reg->umax_value -= umin_val;
+			} else {
+				dst_reg->umin_value -= umax_val;
+				dst_reg->umax_value -= umin_val;
+			}
 		}
-		dst_reg->var_off = tnum_sub(dst_reg->var_off, src_reg.var_off);
+
+		/* This is a hack for now need to sort out tnum when manipulating
+		 * min/max bounds directly.
+		 */
+		if (dst_reg->indvar)
+			dst_reg->var_off = tnum_range(dst_reg->umin_value, dst_reg->umax_value);
+		else
+			dst_reg->var_off = tnum_sub(dst_reg->var_off, src_reg.var_off);
 		break;
 	case BPF_MUL:
 		dst_reg->var_off = tnum_mul(dst_reg->var_off, src_reg.var_off);
@@ -3167,6 +3225,10 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env,
 	}
 
 	if (BPF_CLASS(insn->code) != BPF_ALU64) {
+		/* Not tracking 32-bit ALU ops for now */
+		dst_reg->indvar = BPF_LOOP_UNKNOWN;
+		src_reg.indvar = BPF_LOOP_UNKNOWN;
+
 		/* 32-bit ALU ops are (32,32)->32 */
 		coerce_reg_to_size(dst_reg, 4);
 		coerce_reg_to_size(&src_reg, 4);
@@ -3199,7 +3261,9 @@ static int adjust_reg_min_max_vals(struct bpf_verifier_env *env,
 			if (dst_reg->type != SCALAR_VALUE) {
 				/* Combining two pointers by any ALU op yields
 				 * an arbitrary scalar. Disallow all math except
-				 * pointer subtraction
+				 * pointer subtraction. Further we do not track
+				 * indvar through pointer ALU, so likely to hit
+				 * complexity limits with pointer loop indvar.
 				 */
 				if (opcode == BPF_SUB){
 					mark_reg_unknown(env, regs, insn->dst_reg);
@@ -4864,6 +4928,14 @@ static int do_check(struct bpf_verifier_env *env)
 			return -EINVAL;
 		}
 
+		err = bpf_check_loop_header(env, insn_idx);
+		if (err < 0) {
+			print_verifier_state(env, state->frame[state->curframe]);
+			return err;
+		} else if (err > 0) {
+			print_verifier_state(env, state->frame[state->curframe]);
+		}
+
 		insn_idx++;
 	}
 
diff --git a/samples/bpf/xdp2skb_meta_kern.c b/samples/bpf/xdp2skb_meta_kern.c
index 0c12048..b8a1d98 100644
--- a/samples/bpf/xdp2skb_meta_kern.c
+++ b/samples/bpf/xdp2skb_meta_kern.c
@@ -11,6 +11,7 @@
  */
 #include <uapi/linux/bpf.h>
 #include <uapi/linux/pkt_cls.h>
+#include <linux/version.h>
 
 #include "bpf_helpers.h"
 
@@ -34,6 +35,13 @@ int _xdp_mark(struct xdp_md *ctx)
 	struct meta_info *meta;
 	void *data, *data_end;
 	int ret;
+	volatile __u64 v[4];
+
+	v[0] = 0;
+	v[1] = 1;
+	v[2] = 2;
+	v[3] = 3;
+
 
 	/* Reserve space in-front of data pointer for our meta info.
 	 * (Notice drivers not supporting data_meta will fail here!)
@@ -59,27 +67,62 @@ int _xdp_mark(struct xdp_md *ctx)
 	return XDP_PASS;
 }
 
-SEC("tc_mark")
+SEC("classifier_tc_mark")
 int _tc_mark(struct __sk_buff *ctx)
 {
 	void *data      = (void *)(unsigned long)ctx->data;
 	void *data_end  = (void *)(unsigned long)ctx->data_end;
+	char a = 'a';
+#if 0
 	void *data_meta = (void *)(unsigned long)ctx->data_meta;
 	struct meta_info *meta = data_meta;
+	volatile __u64 mark = ctx->mark;
+#endif
 
 	/* Check XDP gave us some data_meta */
-	if (meta + 1 > data) {
-		ctx->mark = 41;
-		 /* Skip "accept" if no data_meta is avail */
-		return TC_ACT_OK;
+	 u8 j = 0;
+	 s32 i = 0;
+	 u8 k = 0;
+	 u8 *p;
+	
+	 if (data + 2 > data_end)
+		 return TC_ACT_OK;
+
+	 p = data;
+	 j = p[0]; 
+
+	 if (data + 111 > data_end)
+		 return TC_ACT_OK;
+
+	 if (j > 100)
+		 return TC_ACT_OK;
+#if 0
+	 if (j < 10)
+		 return TC_ACT_OK;
+#endif
+
+#pragma nounroll
+	while (i < j) {
+		//j1 = j2 = j3 = j4 = j5 = j6 = j7 = j8 = j9 = j10 = 1;
+		//k += (__u8)p[i];
+		k += p[i];
+		i++;
+#if 0
+		else 
+			p[i] = 'b';
+#endif
+
+		//j++;
 	}
-
 	/* Hint: See func tc_cls_act_is_valid_access() for BPF_WRITE access */
-	ctx->mark = meta->mark; /* Transfer XDP-mark to SKB-mark */
+	//ctx->mark = meta->mark; /* Transfer XDP-mark to SKB-mark */
+	ctx->mark = k;
 
 	return TC_ACT_OK;
 }
 
+u32 _version SEC("version") = LINUX_VERSION_CODE;
+
 /* Manually attaching these programs:
 export DEV=ixgbe2
 export FILE=xdp2skb_meta_kern.o

^ permalink raw reply related

* [RFC PATCH 15/16] bpf: verifier, simple loop examples
From: John Fastabend @ 2018-06-01  9:33 UTC (permalink / raw)
  To: alexei.starovoitov, daniel, davem; +Cc: netdev
In-Reply-To: <20180601092646.15353.28269.stgit@john-Precision-Tower-5810>

Add some simple loop examples. For now I just load these
using bpftool but eventually they should have a loader
simliar to test_verifier except for C snippets.

We want to use C files here instead of BPF because most
users will be using C clang/llvm and want to test how
well this works with the code generated by the actual
tools instead of hand-crafted BPF.
---
 tools/testing/selftests/bpf/Makefile           |    5 ++
 tools/testing/selftests/bpf/test_bad_loop1.c   |   47 ++++++++++++++++++++++
 tools/testing/selftests/bpf/test_bad_loop2.c   |   45 +++++++++++++++++++++
 tools/testing/selftests/bpf/test_bad_loop3.c   |   47 ++++++++++++++++++++++
 tools/testing/selftests/bpf/test_basic_loop1.c |   44 +++++++++++++++++++++
 tools/testing/selftests/bpf/test_basic_loop2.c |   48 +++++++++++++++++++++++
 tools/testing/selftests/bpf/test_basic_loop3.c |   51 ++++++++++++++++++++++++
 7 files changed, 286 insertions(+), 1 deletion(-)
 create mode 100644 tools/testing/selftests/bpf/test_bad_loop1.c
 create mode 100644 tools/testing/selftests/bpf/test_bad_loop2.c
 create mode 100644 tools/testing/selftests/bpf/test_bad_loop3.c
 create mode 100644 tools/testing/selftests/bpf/test_basic_loop1.c
 create mode 100644 tools/testing/selftests/bpf/test_basic_loop2.c
 create mode 100644 tools/testing/selftests/bpf/test_basic_loop3.c

diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile
index a1b66da..0cf7bd1 100644
--- a/tools/testing/selftests/bpf/Makefile
+++ b/tools/testing/selftests/bpf/Makefile
@@ -34,7 +34,10 @@ TEST_GEN_FILES = test_pkt_access.o test_xdp.o test_l4lb.o test_tcp_estats.o test
 	sockmap_tcp_msg_prog.o connect4_prog.o connect6_prog.o test_adjust_tail.o \
 	test_btf_haskv.o test_btf_nokv.o test_sockmap_kern.o test_tunnel_kern.o \
 	test_get_stack_rawtp.o test_sockmap_kern.o test_sockhash_kern.o \
-	test_lwt_seg6local.o sendmsg4_prog.o sendmsg6_prog.o
+	test_lwt_seg6local.o sendmsg4_prog.o sendmsg6_prog.o \
+	test_basic_loop1.o test_basic_loop2.o test_basic_loop3.o \
+	test_bad_loop1.o test_bad_loop2.o test_bad_loop3.o
+
 
 # Order correspond to 'make run_tests' order
 TEST_PROGS := test_kmod.sh \
diff --git a/tools/testing/selftests/bpf/test_bad_loop1.c b/tools/testing/selftests/bpf/test_bad_loop1.c
new file mode 100644
index 0000000..88e8cfb
--- /dev/null
+++ b/tools/testing/selftests/bpf/test_bad_loop1.c
@@ -0,0 +1,47 @@
+/* SPDX-License-Identifier: GPL-2.0
+ * Copyright (c) 2018 Covalent IO
+ */
+#include <stddef.h>
+#include <stdbool.h>
+#include <string.h>
+#include <linux/pkt_cls.h>
+#include <linux/bpf.h>
+#include <linux/in.h>
+#include <linux/if_ether.h>
+#include <linux/ip.h>
+#include <linux/ipv6.h>
+#include <linux/icmp.h>
+#include <linux/icmpv6.h>
+#include <linux/tcp.h>
+#include <linux/udp.h>
+#include "bpf_helpers.h"
+#include "test_iptunnel_common.h"
+#include "bpf_endian.h"
+
+
+/* Test packet access out of bounds */
+
+int _version SEC("version") = 1;
+SEC("classifier_tc_loop1")
+int _tc_loop(struct __sk_buff *ctx)
+{
+	void *data      = (void *)(unsigned long)ctx->data;
+	void *data_end  = (void *)(unsigned long)ctx->data_end;
+	 __u8 i = 0, j = 0, k = 0, *p;
+
+	 p = data;
+	 if (data + 50 > data_end)
+		 return TC_ACT_OK;
+
+#pragma nounroll
+	while (i < 100) {
+		k += p[i];
+		p[i] = k;
+		i++;
+	}
+	ctx->mark = k;
+
+	return TC_ACT_OK;
+}
+
+char _license[] SEC("license") = "GPL";
diff --git a/tools/testing/selftests/bpf/test_bad_loop2.c b/tools/testing/selftests/bpf/test_bad_loop2.c
new file mode 100644
index 0000000..fff005e
--- /dev/null
+++ b/tools/testing/selftests/bpf/test_bad_loop2.c
@@ -0,0 +1,45 @@
+/* SPDX-License-Identifier: GPL-2.0
+ * Copyright (c) 2018 Covalent IO
+ */
+#include <stddef.h>
+#include <stdbool.h>
+#include <string.h>
+#include <linux/pkt_cls.h>
+#include <linux/bpf.h>
+#include <linux/in.h>
+#include <linux/if_ether.h>
+#include <linux/ip.h>
+#include <linux/ipv6.h>
+#include <linux/icmp.h>
+#include <linux/icmpv6.h>
+#include <linux/tcp.h>
+#include <linux/udp.h>
+#include "bpf_helpers.h"
+#include "test_iptunnel_common.h"
+#include "bpf_endian.h"
+
+/* Test non-terminating loops */
+int _version SEC("version") = 1;
+SEC("classifier_tc_loop1")
+int _tc_loop(struct __sk_buff *ctx)
+{
+	void *data      = (void *)(unsigned long)ctx->data;
+	void *data_end  = (void *)(unsigned long)ctx->data_end;
+	int i = 0, j = 0, k = 0, *p;
+
+	 p = data;
+	 if (data + 101 > data_end)
+		 return TC_ACT_OK;
+
+#pragma nounroll
+	while (i < 100) {
+		k += p[i];
+		p[i] = k;
+		i--;
+	}
+	ctx->mark = k;
+
+	return TC_ACT_OK;
+}
+
+char _license[] SEC("license") = "GPL";
diff --git a/tools/testing/selftests/bpf/test_bad_loop3.c b/tools/testing/selftests/bpf/test_bad_loop3.c
new file mode 100644
index 0000000..4015366
--- /dev/null
+++ b/tools/testing/selftests/bpf/test_bad_loop3.c
@@ -0,0 +1,47 @@
+/* SPDX-License-Identifier: GPL-2.0
+ * Copyright (c) 2018 Covalent IO
+ */
+#include <stddef.h>
+#include <stdbool.h>
+#include <string.h>
+#include <linux/pkt_cls.h>
+#include <linux/bpf.h>
+#include <linux/in.h>
+#include <linux/if_ether.h>
+#include <linux/ip.h>
+#include <linux/ipv6.h>
+#include <linux/icmp.h>
+#include <linux/icmpv6.h>
+#include <linux/tcp.h>
+#include <linux/udp.h>
+#include "bpf_helpers.h"
+#include "test_iptunnel_common.h"
+#include "bpf_endian.h"
+
+int _version SEC("version") = 1;
+SEC("classifier_tc_loop1")
+int _tc_loop(struct __sk_buff *ctx)
+{
+	void *data      = (void *)(unsigned long)ctx->data;
+	void *data_end  = (void *)(unsigned long)ctx->data_end;
+	 __u8 i = 0, j = 0, k = 0, *p;
+
+	 p = data;
+	 if (data + 101 > data_end)
+		 return TC_ACT_OK;
+
+head:
+#pragma nounroll
+	while (i < 100) {
+		k += p[i];
+		p[i] = k;
+		i++;
+		if (k < 100)
+			goto head;
+	}
+	ctx->mark = k;
+
+	return TC_ACT_OK;
+}
+
+char _license[] SEC("license") = "GPL";
diff --git a/tools/testing/selftests/bpf/test_basic_loop1.c b/tools/testing/selftests/bpf/test_basic_loop1.c
new file mode 100644
index 0000000..63dc4a4
--- /dev/null
+++ b/tools/testing/selftests/bpf/test_basic_loop1.c
@@ -0,0 +1,44 @@
+/* SPDX-License-Identifier: GPL-2.0
+ * Copyright (c) 2018 Covalent IO
+ */
+#include <stddef.h>
+#include <stdbool.h>
+#include <string.h>
+#include <linux/pkt_cls.h>
+#include <linux/bpf.h>
+#include <linux/in.h>
+#include <linux/if_ether.h>
+#include <linux/ip.h>
+#include <linux/ipv6.h>
+#include <linux/icmp.h>
+#include <linux/icmpv6.h>
+#include <linux/tcp.h>
+#include <linux/udp.h>
+#include "bpf_helpers.h"
+#include "test_iptunnel_common.h"
+#include "bpf_endian.h"
+
+int _version SEC("version") = 1;
+SEC("classifier_tc_loop1")
+int _tc_loop(struct __sk_buff *ctx)
+{
+	void *data      = (void *)(unsigned long)ctx->data;
+	void *data_end  = (void *)(unsigned long)ctx->data_end;
+	 __u8 i = 0, j = 0, k = 0, *p;
+
+	 p = data;
+	 if (data + 101 > data_end)
+		 return TC_ACT_OK;
+
+#pragma nounroll
+	while (i < 100) {
+		k += p[i];
+		p[i] = k;
+		i++;
+	}
+	ctx->mark = k;
+
+	return TC_ACT_OK;
+}
+
+char _license[] SEC("license") = "GPL";
diff --git a/tools/testing/selftests/bpf/test_basic_loop2.c b/tools/testing/selftests/bpf/test_basic_loop2.c
new file mode 100644
index 0000000..fb774dc
--- /dev/null
+++ b/tools/testing/selftests/bpf/test_basic_loop2.c
@@ -0,0 +1,48 @@
+/* SPDX-License-Identifier: GPL-2.0
+ * Copyright (c) 2018 Covalent IO
+ */
+#include <stddef.h>
+#include <stdbool.h>
+#include <string.h>
+#include <linux/pkt_cls.h>
+#include <linux/bpf.h>
+#include <linux/in.h>
+#include <linux/if_ether.h>
+#include <linux/ip.h>
+#include <linux/ipv6.h>
+#include <linux/icmp.h>
+#include <linux/icmpv6.h>
+#include <linux/tcp.h>
+#include <linux/udp.h>
+#include "bpf_helpers.h"
+#include "test_iptunnel_common.h"
+#include "bpf_endian.h"
+
+int _version SEC("version") = 1;
+SEC("classifier_tc_loop1")
+int _tc_loop(struct __sk_buff *ctx)
+{
+	void *data      = (void *)(unsigned long)ctx->data;
+	void *data_end  = (void *)(unsigned long)ctx->data_end;
+	 __u8 i = 0, j = 0, k = 0, *p;
+
+	 p = data;
+	 if (data + 101 > data_end)
+		 return TC_ACT_OK;
+
+#pragma nounroll
+	while (i < 100) {
+		k += p[i];
+		if (k < 100) {
+			p[i] = 'a';
+		} else {
+			p[i] = 'b';
+		}
+		i++;
+	}
+	ctx->mark = k;
+
+	return TC_ACT_OK;
+}
+
+char _license[] SEC("license") = "GPL";
diff --git a/tools/testing/selftests/bpf/test_basic_loop3.c b/tools/testing/selftests/bpf/test_basic_loop3.c
new file mode 100644
index 0000000..dd7dd1b
--- /dev/null
+++ b/tools/testing/selftests/bpf/test_basic_loop3.c
@@ -0,0 +1,51 @@
+/* SPDX-License-Identifier: GPL-2.0
+ * Copyright (c) 2018 Covalent IO
+ */
+#include <stddef.h>
+#include <stdbool.h>
+#include <string.h>
+#include <linux/pkt_cls.h>
+#include <linux/bpf.h>
+#include <linux/in.h>
+#include <linux/if_ether.h>
+#include <linux/ip.h>
+#include <linux/ipv6.h>
+#include <linux/icmp.h>
+#include <linux/icmpv6.h>
+#include <linux/tcp.h>
+#include <linux/udp.h>
+#include "bpf_helpers.h"
+#include "test_iptunnel_common.h"
+#include "bpf_endian.h"
+
+int _version SEC("version") = 1;
+SEC("classifier_tc_loop1")
+int _tc_loop(struct __sk_buff *ctx)
+{
+	void *data      = (void *)(unsigned long)ctx->data;
+	void *data_end  = (void *)(unsigned long)ctx->data_end;
+	 __u8 i = 0, j = 0, k = 0, *p;
+
+	 if (data + 2 > data_end)
+		 return TC_ACT_OK;
+
+	 p = data;
+	 j = p[0];
+
+	 if (data + 101 > data_end)
+		 return TC_ACT_OK;
+
+	 if (j < 100)
+		 return TC_ACT_OK;
+
+#pragma nounroll
+	while (i < j) {
+		k += p[i];
+		i++;
+	}
+	ctx->mark = k;
+
+	return TC_ACT_OK;
+}
+
+char _license[] SEC("license") = "GPL";

^ permalink raw reply related

* [RFC PATCH 16/16] bpf: tools: dbg patch to turn on debugging and add primitive examples
From: John Fastabend @ 2018-06-01  9:33 UTC (permalink / raw)
  To: alexei.starovoitov, daniel, davem; +Cc: netdev
In-Reply-To: <20180601092646.15353.28269.stgit@john-Precision-Tower-5810>

While developing this I found it useful to always have the log
output when dealing with bpftool. This is a quick hack to enable
it but we should add a '-v' option shortly.

Then add a set of good and bad loop examples to test with. These
are all very basic at the moment but will get better soon.

Signed-off-by: John Fastabend <john.fastabend@gmail.com>
---
 tools/lib/bpf/bpf.c    |    2 +-
 tools/lib/bpf/libbpf.c |    7 +++++++
 2 files changed, 8 insertions(+), 1 deletion(-)

diff --git a/tools/lib/bpf/bpf.c b/tools/lib/bpf/bpf.c
index 9ddc89d..b1d2199 100644
--- a/tools/lib/bpf/bpf.c
+++ b/tools/lib/bpf/bpf.c
@@ -200,7 +200,7 @@ int bpf_load_program_xattr(const struct bpf_load_program_attr *load_attr,
 	attr.license = ptr_to_u64(load_attr->license);
 	attr.log_buf = ptr_to_u64(NULL);
 	attr.log_size = 0;
-	attr.log_level = 0;
+	attr.log_level = 1;
 	attr.kern_version = load_attr->kern_version;
 	attr.prog_ifindex = load_attr->prog_ifindex;
 	memcpy(attr.prog_name, load_attr->name,
diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c
index b1a60ac..34ff173 100644
--- a/tools/lib/bpf/libbpf.c
+++ b/tools/lib/bpf/libbpf.c
@@ -1304,6 +1304,13 @@ static int bpf_object__collect_reloc(struct bpf_object *obj)
 	if (ret >= 0) {
 		*pfd = ret;
 		ret = 0;
+
+		pr_warning("load bpf program succesful\n");
+		if (log_buf && log_buf[0] != '\0') {
+			pr_warning("-- BEGIN DUMP LOG ---\n");
+			pr_warning("\n%s\n", log_buf);
+			pr_warning("-- END LOG --\n");
+		}
 		goto out;
 	}
 

^ permalink raw reply related

* Re: [PATCH] net: core: improve the tx_hash calculating
From: Eric Dumazet @ 2018-06-01  9:54 UTC (permalink / raw)
  To: Tonghao Zhang, netdev
In-Reply-To: <1527761641-22034-1-git-send-email-xiangxia.m.yue@gmail.com>



On 05/31/2018 06:14 AM, Tonghao Zhang wrote:
> Use the % instead of while, and it may simple code and improve
> the calculating. The real_num_tx_queues has been checked when
> allocating and setting it.
> 
> Signed-off-by: Tonghao Zhang <xiangxia.m.yue@gmail.com>
> ---
>  net/core/dev.c | 8 +++-----
>  1 file changed, 3 insertions(+), 5 deletions(-)
> 
> diff --git a/net/core/dev.c b/net/core/dev.c
> index 1844d9b..edc5b75 100644
> --- a/net/core/dev.c
> +++ b/net/core/dev.c
> @@ -2617,15 +2617,13 @@ void netif_device_attach(struct net_device *dev)
>   */
>  static u16 skb_tx_hash(const struct net_device *dev, struct sk_buff *skb)
>  {
> -	u32 hash;
>  	u16 qoffset = 0;
>  	u16 qcount = dev->real_num_tx_queues;
>  
>  	if (skb_rx_queue_recorded(skb)) {
> -		hash = skb_get_rx_queue(skb);
> -		while (unlikely(hash >= qcount))
> -			hash -= qcount;
> -		return hash;
> +		/* When setting the real_num_tx_queues, we make sure
> +		 * real_num_tx_queues != 0. */
> +		return skb_get_rx_queue(skb) % qcount;
>  	}
>  
>  	if (dev->num_tc) {
> 


This patch is adding a divide. We do not need it most of the time.

^ permalink raw reply

* Re: [PATCH v2 net] mlx4_core: restore optimal ICM memory allocation
From: Eric Dumazet @ 2018-06-01  9:57 UTC (permalink / raw)
  To: Qing Huang, Eric Dumazet, David S . Miller
  Cc: netdev, Eric Dumazet, John Sperbeck, Tarick Bedeir,
	Daniel Jurgens, Zhu Yanjun
In-Reply-To: <d470065c-f45d-7761-b347-729fb49c89c9@oracle.com>



On 05/31/2018 09:51 PM, Qing Huang wrote:
> 

> It would be great if you could share the test case that triggered the KASAN report in your
> environment. Our QA has been running intensive tests using 8KB or 4KB chunk size configuration
> for some time, no one has reported memory corruption issues so far, given that we are not
> running the latest upstream kernel.
> 
> IMO, it's worthwhile to find out the root cause and fix the problem.
> 

Do not worry, we are working on this minor problem.

We do not use KASAN on production hosts, quite obviously.

The memory waste is the major problem really.

^ permalink raw reply

* [PATCH net] ipv6: omit traffic class when calculating flow hash
From: Michal Kubecek @ 2018-06-01 10:34 UTC (permalink / raw)
  To: David S. Miller
  Cc: netdev, linux-kernel, Nicolas Dichtel, Tom Herbert, David Ahern

Some of the code paths calculating flow hash for IPv6 use flowlabel member
of struct flowi6 which, despite its name, encodes both flow label and
traffic class. If traffic class changes within a TCP connection (as e.g.
ssh does), ECMP route can switch between path. It's also incosistent with
other code paths where ip6_flowlabel() (returning only flow label) is used
to feed the key.

Use only flow label everywhere, including one place where hash key is set
using ip6_flowinfo().

Fixes: 51ebd3181572 ("ipv6: add support of equal cost multipath (ECMP)")
Fixes: f70ea018da06 ("net: Add functions to get skb->hash based on flow structures")
Signed-off-by: Michal Kubecek <mkubecek@suse.cz>
---
 net/core/flow_dissector.c | 3 ++-
 net/ipv6/route.c          | 5 +++--
 2 files changed, 5 insertions(+), 3 deletions(-)

diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c
index d29f09bc5ff9..441d3db76e8e 100644
--- a/net/core/flow_dissector.c
+++ b/net/core/flow_dissector.c
@@ -1334,7 +1334,8 @@ __u32 __get_hash_from_flowi6(const struct flowi6 *fl6, struct flow_keys *keys)
 	keys->ports.src = fl6->fl6_sport;
 	keys->ports.dst = fl6->fl6_dport;
 	keys->keyid.keyid = fl6->fl6_gre_key;
-	keys->tags.flow_label = (__force u32)fl6->flowlabel;
+	keys->tags.flow_label = (__force u32)(fl6->flowlabel &
+					      IPV6_FLOWLABEL_MASK);
 	keys->basic.ip_proto = fl6->flowi6_proto;
 
 	return flow_hash_from_keys(keys);
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index f4d61736c41a..fcbacf1677f8 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -1868,7 +1868,7 @@ static void ip6_multipath_l3_keys(const struct sk_buff *skb,
 	} else {
 		keys->addrs.v6addrs.src = key_iph->saddr;
 		keys->addrs.v6addrs.dst = key_iph->daddr;
-		keys->tags.flow_label = ip6_flowinfo(key_iph);
+		keys->tags.flow_label = ip6_flowlabel(key_iph);
 		keys->basic.ip_proto = key_iph->nexthdr;
 	}
 }
@@ -1889,7 +1889,8 @@ u32 rt6_multipath_hash(const struct net *net, const struct flowi6 *fl6,
 		} else {
 			hash_keys.addrs.v6addrs.src = fl6->saddr;
 			hash_keys.addrs.v6addrs.dst = fl6->daddr;
-			hash_keys.tags.flow_label = (__force u32)fl6->flowlabel;
+			hash_keys.tags.flow_label = (__force u32)(fl6->flowlabel &
+								  IPV6_FLOWLABEL_MASK);
 			hash_keys.basic.ip_proto = fl6->flowi6_proto;
 		}
 		break;
-- 
2.17.1

^ permalink raw reply related

* Re: [PATCH 02/18] rhashtable: split rhashtable.h
From: Herbert Xu @ 2018-06-01 10:48 UTC (permalink / raw)
  To: NeilBrown; +Cc: Thomas Graf, netdev, linux-kernel
In-Reply-To: <152782824932.30340.8068908844678938422.stgit@noble>

On Fri, Jun 01, 2018 at 02:44:09PM +1000, NeilBrown wrote:
> Due to the use of rhashtables in net namespaces,
> rhashtable.h is included in lots of the kernel,
> so a small changes can required a large recompilation.
> This makes development painful.
> 
> This patch splits out rhashtable-types.h which just includes
> the major type declarations, and does not include (non-trivial)
> inline code.  rhashtable.h is no longer included by anything
> in the include/ directory.
> Common include files only include rhashtable-types.h so a large
> recompilation is only triggered when that changes.
> 
> Signed-off-by: NeilBrown <neilb@suse.com>

Acked-by: Herbert Xu <herbert@gondor.apana.org.au>
-- 
Email: Herbert Xu <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

^ permalink raw reply

* [net-next][PATCH] tcp: probe timer MUST not less than 5 minuter for tcp PMTU
From: Li RongQing @ 2018-06-01 11:03 UTC (permalink / raw)
  To: netdev

RFC4821 say: The value for this timer MUST NOT be less than
5 minutes and is recommended to be 10 minutes, per RFC 1981.

Signed-off-by: Li RongQing <lirongqing@baidu.com>
---
 net/ipv4/sysctl_net_ipv4.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index d2eed3ddcb0a..ed8952bb6874 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -47,6 +47,7 @@ static int tcp_syn_retries_max = MAX_TCP_SYNCNT;
 static int ip_ping_group_range_min[] = { 0, 0 };
 static int ip_ping_group_range_max[] = { GID_T_MAX, GID_T_MAX };
 static int comp_sack_nr_max = 255;
+static int tcp_probe_interval_min = 300;
 
 /* obsolete */
 static int sysctl_tcp_low_latency __read_mostly;
@@ -711,7 +712,8 @@ static struct ctl_table ipv4_net_table[] = {
 		.data		= &init_net.ipv4.sysctl_tcp_probe_interval,
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
-		.proc_handler	= proc_dointvec,
+		.proc_handler	= proc_dointvec_minmax,
+		.extra1		= &tcp_probe_interval_min,
 	},
 	{
 		.procname	= "igmp_link_local_mcast_reports",
-- 
2.16.2

^ permalink raw reply related

* Re: WARNING in kcm_exit_net (3)
From: Kirill Tkhai @ 2018-06-01 11:10 UTC (permalink / raw)
  To: syzbot, davem, ebiggers, edumazet, linux-kernel, netdev,
	syzkaller-bugs, tom, viro
In-Reply-To: <00000000000000dca5056d7c1442@google.com>

On 31.05.2018 11:16, syzbot wrote:
> Hello,
> 
> syzbot found the following crash on:
> 
> HEAD commit:    d60d61f36b8f Merge branch 'for-linus' of git://git.kernel...
> git tree:       upstream
> console output: https://syzkaller.appspot.com/x/log.txt?x=101bb52f800000
> kernel config:  https://syzkaller.appspot.com/x/.config?x=968b0b23c7854c0b
> dashboard link: https://syzkaller.appspot.com/bug?extid=5f1a04e374a635efc426
> compiler:       gcc (GCC) 8.0.1 20180413 (experimental)
> syzkaller repro:https://syzkaller.appspot.com/x/repro.syz?x=13b9ed2f800000
> 
> IMPORTANT: if you fix the bug, please add the following tag to the commit:
> Reported-by: syzbot+5f1a04e374a635efc426@syzkaller.appspotmail.com
> 
> IPVS: ftp: loaded support on port[0] = 21
> IPVS: ftp: loaded support on port[0] = 21
> IPVS: ftp: loaded support on port[0] = 21
> IPVS: ftp: loaded support on port[0] = 21
> IPVS: ftp: loaded support on port[0] = 21
> WARNING: CPU: 0 PID: 6 at net/kcm/kcmsock.c:2023 kcm_exit_net+0x392/0x3e0 net/kcm/kcmsock.c:2023
> Kernel panic - not syncing: panic_on_warn set ...
> 
> CPU: 0 PID: 6 Comm: kworker/u4:0 Not tainted 4.17.0-rc7+ #75
> Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011
> Workqueue: netns cleanup_net
> Call Trace:
>  __dump_stack lib/dump_stack.c:77 [inline]
>  dump_stack+0x1b9/0x294 lib/dump_stack.c:113
>  panic+0x22f/0x4de kernel/panic.c:184
>  __warn.cold.8+0x163/0x1b3 kernel/panic.c:536
>  report_bug+0x252/0x2d0 lib/bug.c:186
>  fixup_bug arch/x86/kernel/traps.c:178 [inline]
>  do_error_trap+0x1de/0x490 arch/x86/kernel/traps.c:296
>  do_invalid_op+0x1b/0x20 arch/x86/kernel/traps.c:315
>  invalid_op+0x14/0x20 arch/x86/entry/entry_64.S:992
> RIP: 0010:kcm_exit_net+0x392/0x3e0 net/kcm/kcmsock.c:2023
> RSP: 0018:ffff8801d9a97430 EFLAGS: 00010293
> RAX: ffff8801d9a88180 RBX: 1ffff1003b352e86 RCX: 1ffff1003b351135
> RDX: 0000000000000000 RSI: ffffffff86d56942 RDI: 0000000000000286
> RBP: ffff8801d9a974f8 R08: 1ffff1003b352e67 R09: ffffed003b5c46d2
> R10: 0000000000000003 R11: 0000000000000003 R12: 1ffff1003b352e8a
> R13: ffff8801d9a974d0 R14: ffff8801d96763d0 R15: ffff8801c51d0e00
>  ops_exit_list.isra.7+0xb0/0x160 net/core/net_namespace.c:152
>  cleanup_net+0x51d/0xb20 net/core/net_namespace.c:523
>  process_one_work+0xc1e/0x1b50 kernel/workqueue.c:2145
>  worker_thread+0x1cc/0x1440 kernel/workqueue.c:2279
>  kthread+0x345/0x410 kernel/kthread.c:240
>  ret_from_fork+0x3a/0x50 arch/x86/entry/entry_64.S:412
> Dumping ftrace buffer:
>    (ftrace buffer empty)
> Kernel Offset: disabled
> Rebooting in 86400 seconds..

How about this?

[PATCH]kcm: Fix use-after-free caused by clonned sockets

kcm_clone() creates kernel socket, which does not take net counter.
Thus, the net may die before the socket is completely destructed,
i.e. kcm_exit_net() is executed before kcm_done().

Reported-by: syzbot+5f1a04e374a635efc426@syzkaller.appspotmail.com
Signed-off-by: Kirill Tkhai <ktkhai@virtuozzo.com>
---
diff --git a/net/kcm/kcmsock.c b/net/kcm/kcmsock.c
index d67734c99027..84b7d5c6fec8 100644
--- a/net/kcm/kcmsock.c
+++ b/net/kcm/kcmsock.c
@@ -1671,7 +1671,7 @@ static struct file *kcm_clone(struct socket *osock)
 	__module_get(newsock->ops->owner);
 
 	newsk = sk_alloc(sock_net(osock->sk), PF_KCM, GFP_KERNEL,
-			 &kcm_proto, true);
+			 &kcm_proto, false);
 	if (!newsk) {
 		sock_release(newsock);
 		return ERR_PTR(-ENOMEM);

^ permalink raw reply related

* Re: [PATCH bpf 1/2] bpf: fix alignment of netns_dev/netns_ino fields in bpf_{map,prog}_info
From: Dmitry V. Levin @ 2018-06-01 11:12 UTC (permalink / raw)
  To: Alexei Starovoitov
  Cc: Linus Torvalds, Eugene Syromiatnikov, netdev, linux-kernel,
	Martin KaFai Lau, Daniel Borkmann, Alexei Starovoitov,
	David S. Miller, Jiri Olsa, Ingo Molnar, Lawrence Brakmo,
	Andrey Ignatov, Jakub Kicinski, John Fastabend
In-Reply-To: <20180601084114.xgvtg7nmihdavnnd@ast-mbp>

[-- Attachment #1: Type: text/plain, Size: 5859 bytes --]

On Fri, Jun 01, 2018 at 04:41:16AM -0400, Alexei Starovoitov wrote:
> On Fri, Jun 01, 2018 at 06:12:10AM +0300, Dmitry V. Levin wrote:
> > Hi,
> > 
> > Looks like the ABI bug in bpf_map_info and bpf_prog info introduced
> > in 4.16 is going to slip into 4.17, causing extra pain to 32-bit
> > userspace.  I'm adding Linus to this thread in hope it might help
> > to get a fix applied before 4.17 is released.
> 
> The issue identified in patch 1 is valid.
> These two fields won't be properly accessible by 32-bit user space
> on 64-bit kernel.

Yes, and currently there is no way to build a 32-bit userspace that would
work properly both with 32-bit and 64-bit kernels.

> But the fix in patch 1 is wrong.

Please elaborate.

> The patch 2 is completely unnecessary.

The patch 2 doesn't have to be backported to 4.16, but if it was
implemented in 4.16, there would be no bug to discuss.  That is,
the patch 2 is a good policy that would help to avoid this class of bugs
in the future.

Alexei, looks like your Acked-by on the buggy commit
52775b33bb5072fbc07b02c0cf4fe8da1f7ee7cd affects your judgement.
If this is the case, please refer patch 2 to other people who are less biased.

> Everyone can also see that the patches are still marked as 'new' in patchworks
> meaning that we didn't process them.
> At the moment many networking folks are traveling to netconf
> and we only had a chance to discuss this set last night.
> We'll try to send a fix in coming days.
> But regardless whether 4.17 is realesed this sunday or not
> we're not going to rush wrong patch in without proper code review
> and discussion.
> That future patch either will land in 4.17 (if it's dealyed into next sunday)
> or it will be sent to stable.

Note that the fix was submitted to netdev on 2018-05-27.
One week is surely enough to make this bug fixed, isn't it?

> To speed up the situation next time please report the issue that you find
> to public netdev mailing list instead of using proprietary distro emails.

Please explain to me and to the public what do you mean by making this
statement.

I do believe strace developers are free to discuss among themselves
using whatever means of communication they find appropriate.

I do believe the issue was properly reported to netdev, see
https://lkml.kernel.org/r/20180527112835.GA9118@asgard.redhat.com
https://lkml.kernel.org/r/20180527112842.GA18204@asgard.redhat.com

I'd like to use this opportunity to thank Eugene for submitting the fix
the same day the issue was identified as a kernel bug.

Alexei, please do not exclude my email address from this discussion.
Thanks.

> > On Wed, May 30, 2018 at 09:18:58PM +0300, Dmitry V. Levin wrote:
> > > On Sun, May 27, 2018 at 01:28:42PM +0200, Eugene Syromiatnikov wrote:
> > > > Recent introduction of netns_dev/netns_ino to bpf_map_info/bpf_prog info
> > > > has broken compat, as offsets of these fields are different in 32-bit
> > > > and 64-bit ABIs.  One fix (other than implementing compat support in
> > > > syscall in order to handle this discrepancy) is to use __aligned_u64
> > > > instead of __u64 for these fields.
> > > > 
> > > > Reported-by: Dmitry V. Levin <ldv@altlinux.org>
> > > > Fixes: 52775b33bb507 ("bpf: offload: report device information about
> > > > offloaded maps")
> > > > Fixes: 675fc275a3a2d ("bpf: offload: report device information for
> > > > offloaded programs")
> > > > 
> > > > Signed-off-by: Eugene Syromiatnikov <esyr@redhat.com>
> > > 
> > > Reviewed-by: "Dmitry V. Levin" <ldv@altlinux.org>
> > > Cc: <stable@vger.kernel.org> # v4.16+
> > > 
> > > Thanks,
> > > 
> > > > ---
> > > >  include/uapi/linux/bpf.h       | 8 ++++----
> > > >  tools/include/uapi/linux/bpf.h | 8 ++++----
> > > >  2 files changed, 8 insertions(+), 8 deletions(-)
> > > > 
> > > > diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
> > > > index c5ec897..903010a 100644
> > > > --- a/include/uapi/linux/bpf.h
> > > > +++ b/include/uapi/linux/bpf.h
> > > > @@ -1017,8 +1017,8 @@ struct bpf_prog_info {
> > > >  	__aligned_u64 map_ids;
> > > >  	char name[BPF_OBJ_NAME_LEN];
> > > >  	__u32 ifindex;
> > > > -	__u64 netns_dev;
> > > > -	__u64 netns_ino;
> > > > +	__aligned_u64 netns_dev;
> > > > +	__aligned_u64 netns_ino;
> > > >  } __attribute__((aligned(8)));
> > > >  
> > > >  struct bpf_map_info {
> > > > @@ -1030,8 +1030,8 @@ struct bpf_map_info {
> > > >  	__u32 map_flags;
> > > >  	char  name[BPF_OBJ_NAME_LEN];
> > > >  	__u32 ifindex;
> > > > -	__u64 netns_dev;
> > > > -	__u64 netns_ino;
> > > > +	__aligned_u64 netns_dev;
> > > > +	__aligned_u64 netns_ino;
> > > >  } __attribute__((aligned(8)));
> > > >  
> > > >  /* User bpf_sock_addr struct to access socket fields and sockaddr struct passed
> > > > diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
> > > > index c5ec897..903010a 100644
> > > > --- a/tools/include/uapi/linux/bpf.h
> > > > +++ b/tools/include/uapi/linux/bpf.h
> > > > @@ -1017,8 +1017,8 @@ struct bpf_prog_info {
> > > >  	__aligned_u64 map_ids;
> > > >  	char name[BPF_OBJ_NAME_LEN];
> > > >  	__u32 ifindex;
> > > > -	__u64 netns_dev;
> > > > -	__u64 netns_ino;
> > > > +	__aligned_u64 netns_dev;
> > > > +	__aligned_u64 netns_ino;
> > > >  } __attribute__((aligned(8)));
> > > >  
> > > >  struct bpf_map_info {
> > > > @@ -1030,8 +1030,8 @@ struct bpf_map_info {
> > > >  	__u32 map_flags;
> > > >  	char  name[BPF_OBJ_NAME_LEN];
> > > >  	__u32 ifindex;
> > > > -	__u64 netns_dev;
> > > > -	__u64 netns_ino;
> > > > +	__aligned_u64 netns_dev;
> > > > +	__aligned_u64 netns_ino;
> > > >  } __attribute__((aligned(8)));
> > > >  
> > > >  /* User bpf_sock_addr struct to access socket fields and sockaddr struct passed


-- 
ldv

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 801 bytes --]

^ permalink raw reply

* [PATCH] ath10k: fix incorrect size of dma_free_coherent in ath10k_ce_alloc_src_ring_64
From: YueHaibing @ 2018-06-01 11:16 UTC (permalink / raw)
  To: davem, kvalo; +Cc: netdev, linux-kernel, ath10k, linux-wireless, YueHaibing

sizeof(struct ce_desc) should be a copy-paste mistake
just use sizeof(struct ce_desc_64) to avoid mem leak

Fixes: b7ba83f7c414 ("ath10k: add support for shadow register for WNC3990")
Signed-off-by: YueHaibing <yuehaibing@huawei.com>
---
 drivers/net/wireless/ath/ath10k/ce.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/wireless/ath/ath10k/ce.c b/drivers/net/wireless/ath/ath10k/ce.c
index 3b96a43..35dec2a 100644
--- a/drivers/net/wireless/ath/ath10k/ce.c
+++ b/drivers/net/wireless/ath/ath10k/ce.c
@@ -1512,7 +1512,7 @@ ath10k_ce_alloc_src_ring_64(struct ath10k *ar, unsigned int ce_id,
 		ret = ath10k_ce_alloc_shadow_base(ar, src_ring, nentries);
 		if (ret) {
 			dma_free_coherent(ar->dev,
-					  (nentries * sizeof(struct ce_desc) +
+					  (nentries * sizeof(struct ce_desc64) +
 					   CE_DESC_RING_ALIGN),
 					  src_ring->base_addr_owner_space_unaligned,
 					  base_addr);
-- 
2.7.0

^ permalink raw reply related

* Re: [PATCH] ath10k: fix incorrect size of dma_free_coherent in ath10k_ce_alloc_src_ring_64
From: YueHaibing @ 2018-06-01 11:23 UTC (permalink / raw)
  To: davem, kvalo; +Cc: netdev, linux-kernel, ath10k, linux-wireless
In-Reply-To: <20180601111602.24108-1-yuehaibing@huawei.com>


On 2018/6/1 19:16, YueHaibing wrote:
> sizeof(struct ce_desc) should be a copy-paste mistake
> just use sizeof(struct ce_desc_64) to avoid mem leak
> 
> Fixes: b7ba83f7c414 ("ath10k: add support for shadow register for WNC3990")
> Signed-off-by: YueHaibing <yuehaibing@huawei.com>
> ---
>  drivers/net/wireless/ath/ath10k/ce.c | 2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)
> 
> diff --git a/drivers/net/wireless/ath/ath10k/ce.c b/drivers/net/wireless/ath/ath10k/ce.c
> index 3b96a43..35dec2a 100644
> --- a/drivers/net/wireless/ath/ath10k/ce.c
> +++ b/drivers/net/wireless/ath/ath10k/ce.c
> @@ -1512,7 +1512,7 @@ ath10k_ce_alloc_src_ring_64(struct ath10k *ar, unsigned int ce_id,
>  		ret = ath10k_ce_alloc_shadow_base(ar, src_ring, nentries);
>  		if (ret) {
>  			dma_free_coherent(ar->dev,
> -					  (nentries * sizeof(struct ce_desc) +
> +					  (nentries * sizeof(struct ce_desc64) +

sorry ,I post the wrong patch,will send V2
>  					   CE_DESC_RING_ALIGN),
>  					  src_ring->base_addr_owner_space_unaligned,
>  					  base_addr);
> 

^ permalink raw reply

* [PATCH v2] ath10k: fix incorrect size of dma_free_coherent in ath10k_ce_alloc_src_ring_64
From: YueHaibing @ 2018-06-01 11:25 UTC (permalink / raw)
  To: davem, kvalo; +Cc: netdev, linux-kernel, ath10k, linux-wireless, YueHaibing

sizeof(struct ce_desc) should be a copy-paste mistake
just use sizeof(struct ce_desc_64) to avoid mem leak

Fixes: b7ba83f7c414 ("ath10k: add support for shadow register for WNC3990")
Signed-off-by: YueHaibing <yuehaibing@huawei.com>
---
 drivers/net/wireless/ath/ath10k/ce.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/wireless/ath/ath10k/ce.c b/drivers/net/wireless/ath/ath10k/ce.c
index 3b96a43..35dec2a 100644
--- a/drivers/net/wireless/ath/ath10k/ce.c
+++ b/drivers/net/wireless/ath/ath10k/ce.c
@@ -1512,7 +1512,7 @@ ath10k_ce_alloc_src_ring_64(struct ath10k *ar, unsigned int ce_id,
 		ret = ath10k_ce_alloc_shadow_base(ar, src_ring, nentries);
 		if (ret) {
 			dma_free_coherent(ar->dev,
-					  (nentries * sizeof(struct ce_desc) +
+					  (nentries * sizeof(struct ce_desc_64) +
 					   CE_DESC_RING_ALIGN),
 					  src_ring->base_addr_owner_space_unaligned,
 					  base_addr);
-- 
2.7.0

^ permalink raw reply related

* [PATCH net] kcm: Fix use-after-free caused by clonned sockets
From: Kirill Tkhai @ 2018-06-01 11:30 UTC (permalink / raw)
  To: David S. Miller, ebiggers, linux-kernel, netdev, syzkaller-bugs,
	tom, viro, Eric Dumazet
In-Reply-To: <9f485659-6d6f-9047-b9ad-b0e4084be88d@virtuozzo.com>

(resend for properly queueing in patchwork)

kcm_clone() creates kernel socket, which does not take net counter.
Thus, the net may die before the socket is completely destructed,
i.e. kcm_exit_net() is executed before kcm_done().

Reported-by: syzbot+5f1a04e374a635efc426@syzkaller.appspotmail.com
Signed-off-by: Kirill Tkhai <ktkhai@virtuozzo.com>
---
diff --git a/net/kcm/kcmsock.c b/net/kcm/kcmsock.c
index d67734c99027..84b7d5c6fec8 100644
--- a/net/kcm/kcmsock.c
+++ b/net/kcm/kcmsock.c
@@ -1671,7 +1671,7 @@ static struct file *kcm_clone(struct socket *osock)
 	__module_get(newsock->ops->owner);
 
 	newsk = sk_alloc(sock_net(osock->sk), PF_KCM, GFP_KERNEL,
-			 &kcm_proto, true);
+			 &kcm_proto, false);
 	if (!newsk) {
 		sock_release(newsock);
 		return ERR_PTR(-ENOMEM);

^ permalink raw reply related

* Re: [PATCH net-next v4 00/11] Modify action API for implementing lockless actions
From: Jamal Hadi Salim @ 2018-06-01 12:24 UTC (permalink / raw)
  To: Vlad Buslov
  Cc: netdev, davem, xiyou.wangcong, jiri, pablo, kadlec, fw, ast,
	daniel, edumazet, keescook, marcelo.leitner, kliteyn
In-Reply-To: <vbfmuwgf19e.fsf@reg-r-vrt-018-180.mtr.labs.mlnx>

On 31/05/18 08:38 AM, Vlad Buslov wrote:

> Hi Jamal,
> 
> On current net-next I still have action with single reference after last
> step:
> ~$ sudo $TC -s actions ls action skbedit
> total acts 1
>                                                                 
>          action order 0:  skbedit mark 1 pipe
>           index 1 ref 2 bind 1 installed 47 sec used 47 sec
>          Action statistics:
>          Sent 0 bytes 0 pkt (dropped 0, overlimits 0 requeues 0)
>          backlog 0b 0p requeues 0
> ~$ sudo $TC filter del dev lo parent ffff: protocol ip prio 1 \
>> u32 match ip dst 127.0.0.1/32 flowid 1:1
> ~$ sudo $TC -s actions ls action skbedit
> total acts 1
>                                                                 
>          action order 0:  skbedit mark 1 pipe
>           index 1 ref 1 bind 0 installed 80 sec used 80 sec
>          Action statistics:
>          Sent 0 bytes 0 pkt (dropped 0, overlimits 0 requeues 0)
>          backlog 0b 0p requeues 0
> 
> Which branch are you testing on?

You are correct - this is how it works now (I was thinking of a
very old version before Cong made some changes a while back).
Just vet this continues to work as above.

cheers,
jamal

^ permalink raw reply

* Re: [PATCH net] ipv6: omit traffic class when calculating flow hash
From: Nicolas Dichtel @ 2018-06-01 12:25 UTC (permalink / raw)
  To: Michal Kubecek, David S. Miller
  Cc: netdev, linux-kernel, Tom Herbert, David Ahern
In-Reply-To: <20180601112948.93BE7A0C48@unicorn.suse.cz>

Le 01/06/2018 à 12:34, Michal Kubecek a écrit :
> Some of the code paths calculating flow hash for IPv6 use flowlabel member
> of struct flowi6 which, despite its name, encodes both flow label and
> traffic class. If traffic class changes within a TCP connection (as e.g.
> ssh does), ECMP route can switch between path. It's also incosistent with
nit: s/incosistent/inconsistent

> other code paths where ip6_flowlabel() (returning only flow label) is used
> to feed the key.
> 
> Use only flow label everywhere, including one place where hash key is set
> using ip6_flowinfo().
> 
> Fixes: 51ebd3181572 ("ipv6: add support of equal cost multipath (ECMP)")
> Fixes: f70ea018da06 ("net: Add functions to get skb->hash based on flow structures")
> Signed-off-by: Michal Kubecek <mkubecek@suse.cz>
Acked-by: Nicolas Dichtel <nicolas.dichtel@6wind.com>

^ permalink raw reply

* Re: [PATCH net-next] rtnetlink: Fix null-ptr-deref in rtnl_newlink
From: Ido Schimmel @ 2018-06-01 13:03 UTC (permalink / raw)
  To: Prashant Bhole
  Cc: David S . Miller, Eric Dumazet, Daniel Borkmann,
	Alexei Starovoitov, Kirill Tkhai, Florian Westphal, netdev
In-Reply-To: <20180601081658.6968-1-bhole_prashant_q7@lab.ntt.co.jp>

On Fri, Jun 01, 2018 at 05:16:58PM +0900, Prashant Bhole wrote:
> In rtnl_newlink(), NULL check is performed on m_ops however member of
> ops is accessed. Fixed by accessing member of m_ops instead of ops.
> 
> [  345.432629] BUG: KASAN: null-ptr-deref in rtnl_newlink+0x400/0x1110
> [  345.432629] Read of size 4 at addr 0000000000000088 by task ip/986
> [  345.432629]
> [  345.432629] CPU: 1 PID: 986 Comm: ip Not tainted 4.17.0-rc6+ #9
> [  345.432629] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.10.2-1ubuntu1 04/01/2014
> [  345.432629] Call Trace:
> [  345.432629]  dump_stack+0xc6/0x150
> [  345.432629]  ? dump_stack_print_info.cold.0+0x1b/0x1b
> [  345.432629]  ? kasan_report+0xb4/0x410
> [  345.432629]  kasan_report.cold.4+0x8f/0x91
> [  345.432629]  ? rtnl_newlink+0x400/0x1110
> [  345.432629]  rtnl_newlink+0x400/0x1110
> [...]
> 
> Fixes: ccf8dbcd062a ("rtnetlink: Remove VLA usage")
> Signed-off-by: Prashant Bhole <bhole_prashant_q7@lab.ntt.co.jp>

My machine crashed while running regression tests. Thanks for fixing!

Tested-by: Ido Schimmel <idosch@mellanox.com>

^ permalink raw reply

* [PATCH] qtnfmac: fix NULL pointer dereference
From: Gustavo A. R. Silva @ 2018-06-01 13:24 UTC (permalink / raw)
  To: Igor Mitsyanko, Avinash Patil, Sergey Matyukevich, Kalle Valo,
	David S. Miller
  Cc: linux-wireless, netdev, linux-kernel, Gustavo A. R. Silva,
	kernel-janitors

In case *vif* is NULL at 655: if (!vif), the execution path jumps to
label out, where *vif* is dereferenced at 679:

if (vif->sta_state == QTNF_STA_CONNECTING)

Fix this by immediately returning when *vif* is NULL instead of
jumping to label out.

Addresses-Coverity-ID: 1469567 ("Dereference after null check")
Fixes: 480daa9cb62c ("qtnfmac: fix invalid STA state on EAPOL failure")
Signed-off-by: Gustavo A. R. Silva <gustavo@embeddedor.com>
---
 drivers/net/wireless/quantenna/qtnfmac/cfg80211.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/drivers/net/wireless/quantenna/qtnfmac/cfg80211.c b/drivers/net/wireless/quantenna/qtnfmac/cfg80211.c
index 220e2b7..ae0ca80 100644
--- a/drivers/net/wireless/quantenna/qtnfmac/cfg80211.c
+++ b/drivers/net/wireless/quantenna/qtnfmac/cfg80211.c
@@ -654,8 +654,7 @@ qtnf_disconnect(struct wiphy *wiphy, struct net_device *dev,
 	vif = qtnf_mac_get_base_vif(mac);
 	if (!vif) {
 		pr_err("MAC%u: primary VIF is not configured\n", mac->macid);
-		ret = -EFAULT;
-		goto out;
+		return -EFAULT;
 	}
 
 	if (vif->wdev.iftype != NL80211_IFTYPE_STATION) {
-- 
2.7.4

^ permalink raw reply related

* Re: [PATCH bpf v3 3/5] selftests/bpf: test_sockmap, fix test timeout
From: John Fastabend @ 2018-06-01 14:03 UTC (permalink / raw)
  To: Prashant Bhole, Alexei Starovoitov
  Cc: Alexei Starovoitov, Daniel Borkmann, David S . Miller, Shuah Khan,
	netdev, linux-kselftest
In-Reply-To: <634dc36e-f299-631d-f501-d12453fa0b98@lab.ntt.co.jp>

On 05/30/2018 09:13 PM, Prashant Bhole wrote:
> 
> 
> On 5/31/2018 4:59 AM, John Fastabend wrote:
>> On 05/30/2018 12:29 PM, Alexei Starovoitov wrote:
>>> On Wed, May 30, 2018 at 02:56:09PM +0900, Prashant Bhole wrote:
>>>> In order to reduce runtime of tests, recently timout for select() call
>>>> was reduced from 1sec to 10usec. This was causing many tests failures.
>>>> It was caught with failure handling commits in this series.
>>>>
>>>> Restoring the timeout from 10usec to 1sec
>>>>
>>>> Fixes: a18fda1a62c3 ("bpf: reduce runtime of test_sockmap tests")
>>>> Signed-off-by: Prashant Bhole <bhole_prashant_q7@lab.ntt.co.jp>
>>>> ---
>>>>   tools/testing/selftests/bpf/test_sockmap.c | 4 ++--
>>>>   1 file changed, 2 insertions(+), 2 deletions(-)
>>>>
>>>> diff --git a/tools/testing/selftests/bpf/test_sockmap.c
>>>> b/tools/testing/selftests/bpf/test_sockmap.c
>>>> index 64f9e25c451f..9d01f5c2abe2 100644
>>>> --- a/tools/testing/selftests/bpf/test_sockmap.c
>>>> +++ b/tools/testing/selftests/bpf/test_sockmap.c
>>>> @@ -345,8 +345,8 @@ static int msg_loop(int fd, int iov_count, int
>>>> iov_length, int cnt,
>>>>           if (err < 0)
>>>>               perror("recv start time: ");
>>>>           while (s->bytes_recvd < total_bytes) {
>>>> -            timeout.tv_sec = 0;
>>>> -            timeout.tv_usec = 10;
>>>> +            timeout.tv_sec = 1;
>>>> +            timeout.tv_usec = 0;
>>>
>>> I've applied the set, but had to revert it, since it takes too long.
>>>
>>> real    1m40.124s
>>> user    0m0.375s
>>> sys    0m14.521s
>>>
>>
>> Dang, I thought it would be a bit longer but not minutes.
>>
>>> Myself and Daniel run the test semi-manually when we apply patches.>
>>> Adding 2 extra minutes of wait time is unnecessary.
>>
>> Yep.
>>
>>> Especially since most of it is idle time.
>>> Please find a way to fix tests differently.
>>> btw I don't see any failures today. Not sure what is being fixed
>>> by incresing a timeout.
>>>
>>
>> Calling these fixes is a bit much, they are primarily improvements.
>>
>> The background is, when I originally wrote the tests my goal was to
>> exercise the kernel code paths. Because of this I didn't really care if
>> the tests actually sent/recv all bytes in the test. (I have long
>> running tests using netperf/wrk/apached/etc. for that) But, the manual
>> tests do have an option to verify the data if specified. The 'verify'
>> option is a bit fragile in that with the right tests (e.g. drop)
>> or the certain options (e.g. cork) it can fail which is expected.
>>
>> What Prashant added was support to actually verify the data correctly.
>> And also fix a few cgroup handling and some pretty printing as well.
>> He noticed the low timeout causing issue in these cases though so
>> increased it.
>>
>> @Prashant, how about increasing this less dramatically because now
>> all cork tests are going to stall for 1s unless perfectly aligned.
>> How about 100us? Or even better we can conditionally set it based
>> on if tx_cork is set. If tx_cork is set use 1us otherwise use 200us
>> or something. (1s is really to high in any cases for lo)
>>
>> Also capturing some of the above in the cover letter would help
>> folks understand the context a bit better.
>>
> 
> I did trial and error for timeout values. Currently 1000us for corked
> tests and 1 sec for other tests works fine. I observed broken-pipe error
> at tx side when timeout was < 1000us.
> 
> Also tests with apply=1 and higher number of iterations were taking
> time, so reducing iterations reduces the test run time drastically.
> 

Yep, sending 1B at a time is slow.

> real    0m12.968s
> user    0m0.219s
> sys     0m14.337s
> 
> Also I will try to explain background in the cover letter of next series.
> 
Seems more reasonable to me now. Thanks.

> -Prashant
> 
> 

^ permalink raw reply

* Re: [PATCH bpf-next v4 1/5] selftests/bpf: test_sockmap, check test failure
From: John Fastabend @ 2018-06-01 14:04 UTC (permalink / raw)
  To: Prashant Bhole, Alexei Starovoitov, Daniel Borkmann
  Cc: David S . Miller, Shuah Khan, netdev, linux-kselftest
In-Reply-To: <20180531044240.796-2-bhole_prashant_q7@lab.ntt.co.jp>

On 05/30/2018 09:42 PM, Prashant Bhole wrote:
> Test failures are not identified because exit code of RX/TX threads
> is not checked. Also threads are not returning correct exit code.
> 
> - Return exit code from threads depending on test execution status
> - In main thread, check the exit code of RX/TX threads
> - Skip error checking for corked tests as they are expected to timeout
> 
> Fixes: 16962b2404ac ("bpf: sockmap, add selftests")
> Signed-off-by: Prashant Bhole <bhole_prashant_q7@lab.ntt.co.jp>
> ---


Acked-by: John Fastabend <john.fastabend@gmail.com>

^ permalink raw reply

* [PATCH net] ipv4: igmp: hold wakelock to prevent delayed reports
From: Tejaswi Tanikella @ 2018-06-01 14:05 UTC (permalink / raw)
  To: netdev

On receiving a IGMPv2/v3 query, based on max_delay set in the header a
timer is started to send out a response after a random time within
max_delay. If the system then moves into suspend state, Report is
delayed until system wakes up.

In one reported scenario, on arm64 devices, max_delay was set to 10s,
Reports were consistantly delayed if the timer is scheduled after 5 plus
seconds.

Hold wakelock while starting the timer to prevent moving into suspend
state.

Signed-off-by: Tejaswi Tanikella <tejaswit@codeaurora.org>
---
 include/linux/igmp.h |  1 +
 net/ipv4/igmp.c      | 20 ++++++++++++++++++--
 2 files changed, 19 insertions(+), 2 deletions(-)

diff --git a/include/linux/igmp.h b/include/linux/igmp.h
index f823185..9be1c58 100644
--- a/include/linux/igmp.h
+++ b/include/linux/igmp.h
@@ -84,6 +84,7 @@ struct ip_mc_list {
 	};
 	struct ip_mc_list __rcu *next_hash;
 	struct timer_list	timer;
+	struct wakeup_source	*wakeup_src;
 	int			users;
 	refcount_t		refcnt;
 	spinlock_t		lock;
diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c
index b26a81a..c695e2c 100644
--- a/net/ipv4/igmp.c
+++ b/net/ipv4/igmp.c
@@ -107,6 +107,9 @@
 #include <linux/proc_fs.h>
 #include <linux/seq_file.h>
 #endif
+#ifdef CONFIG_IP_MULTICAST
+#include <linux/pm_wakeup.h>
+#endif
 
 #ifdef CONFIG_IP_MULTICAST
 /* Parameter names and values are taken from igmp-v2-06 draft */
@@ -174,7 +177,13 @@ static int ip_mc_add_src(struct in_device *in_dev, __be32 *pmca, int sfmode,
 
 static void ip_ma_put(struct ip_mc_list *im)
 {
+#ifdef CONFIG_IP_MULTICAST
+	__pm_relax(im->wakeup_src);
+#endif
 	if (refcount_dec_and_test(&im->refcnt)) {
+#ifdef CONFIG_IP_MULTICAST
+		wakeup_source_unregister(im->wakeup_src);
+#endif
 		in_dev_put(im->interface);
 		kfree_rcu(im, rcu);
 	}
@@ -199,8 +208,10 @@ static void ip_ma_put(struct ip_mc_list *im)
 static void igmp_stop_timer(struct ip_mc_list *im)
 {
 	spin_lock_bh(&im->lock);
-	if (del_timer(&im->timer))
+	if (del_timer(&im->timer)) {
+		__pm_relax(im->wakeup_src);
 		refcount_dec(&im->refcnt);
+	}
 	im->tm_running = 0;
 	im->reporter = 0;
 	im->unsolicit_count = 0;
@@ -213,8 +224,10 @@ static void igmp_start_timer(struct ip_mc_list *im, int max_delay)
 	int tv = prandom_u32() % max_delay;
 
 	im->tm_running = 1;
-	if (!mod_timer(&im->timer, jiffies+tv+2))
+	if (!mod_timer(&im->timer, jiffies+tv+2)) {
 		refcount_inc(&im->refcnt);
+		__pm_stay_awake(im->wakeup_src);
+	}
 }
 
 static void igmp_gq_start_timer(struct in_device *in_dev)
@@ -250,6 +263,7 @@ static void igmp_mod_timer(struct ip_mc_list *im, int max_delay)
 			spin_unlock_bh(&im->lock);
 			return;
 		}
+		__pm_relax(im->wakeup_src);
 		refcount_dec(&im->refcnt);
 	}
 	igmp_start_timer(im, max_delay);
@@ -1415,6 +1429,8 @@ void ip_mc_inc_group(struct in_device *in_dev, __be32 addr)
 #ifdef CONFIG_IP_MULTICAST
 	timer_setup(&im->timer, igmp_timer_expire, 0);
 	im->unsolicit_count = net->ipv4.sysctl_igmp_qrv;
+	im->wakeup_src = wakeup_source_create("igmp_wakeup_source");
+	wakeup_source_add(im->wakeup_src);
 #endif
 
 	im->next_rcu = in_dev->mc_list;
-- 
1.9.1

^ permalink raw reply related

* Re: [PATCH] qtnfmac: fix NULL pointer dereference
From: Sergey Matyukevich @ 2018-06-01 14:08 UTC (permalink / raw)
  To: Gustavo A. R. Silva
  Cc: Igor Mitsyanko, Avinash Patil, Sergey Matyukevich, Kalle Valo,
	David S. Miller, linux-wireless, netdev, linux-kernel,
	kernel-janitors
In-Reply-To: <20180601132408.GA2572@embeddedor.com>

Hello Gustavo,

> diff --git a/drivers/net/wireless/quantenna/qtnfmac/cfg80211.c b/drivers/net/wireless/quantenna/qtnfmac/cfg80211.c
> index 220e2b7..ae0ca80 100644
> --- a/drivers/net/wireless/quantenna/qtnfmac/cfg80211.c
> +++ b/drivers/net/wireless/quantenna/qtnfmac/cfg80211.c
> @@ -654,8 +654,7 @@ qtnf_disconnect(struct wiphy *wiphy, struct net_device *dev,
>         vif = qtnf_mac_get_base_vif(mac);
>         if (!vif) {
>                 pr_err("MAC%u: primary VIF is not configured\n", mac->macid);
> -               ret = -EFAULT;
> -               goto out;
> +               return -EFAULT;
>         }
> 
>         if (vif->wdev.iftype != NL80211_IFTYPE_STATION) {

That was my fault. Thanks for the fix!

Reviewed-by: Sergey Matyukevich <sergey.matyukevich.os@quanenna.com>

Regards,
Sergey

^ permalink raw reply


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox