Netdev List
 help / color / mirror / Atom feed
* [PATCH 0/2] codel: make it reuseable beyond qdiscs
From: Michal Kazior @ 2016-04-22 12:15 UTC (permalink / raw)
  To: netdev; +Cc: davem, johannes, Michal Kazior

Hi,

There's an ongoing effort in fixing wireless
bufferbloat. As part of that fq_codel is being
ported into mac80211. To prevent code duplication
codel.h needs to be slightly modified before it
can be used in mac80211 (or other drivers FWIW).

For more background please see:

  https://www.spinics.net/lists/linux-wireless/msg149976.html


Michal Kazior (2):
  codel: generalize the implementation
  codel: split into multiple files

 include/net/codel.h       | 217 +--------------------------------------
 include/net/codel_impl.h  | 255 ++++++++++++++++++++++++++++++++++++++++++++++
 include/net/codel_qdisc.h |  73 +++++++++++++
 net/sched/sch_codel.c     |  22 +++-
 net/sched/sch_fq_codel.c  |  21 +++-
 5 files changed, 368 insertions(+), 220 deletions(-)
 create mode 100644 include/net/codel_impl.h
 create mode 100644 include/net/codel_qdisc.h

-- 
2.1.4

^ permalink raw reply

* [PATCH 1/2] codel: generalize the implementation
From: Michal Kazior @ 2016-04-22 12:15 UTC (permalink / raw)
  To: netdev; +Cc: davem, johannes, Michal Kazior
In-Reply-To: <1461327359-21646-1-git-send-email-michal.kazior@tieto.com>

This strips out qdisc specific bits from the code
and makes it slightly more reusable. Codel will be
used by wireless/mac80211 in the future.

Signed-off-by: Michal Kazior <michal.kazior@tieto.com>
---
 include/net/codel.h      | 64 +++++++++++++++++++++++++++++-------------------
 net/sched/sch_codel.c    | 20 ++++++++++++---
 net/sched/sch_fq_codel.c | 19 +++++++++++---
 3 files changed, 71 insertions(+), 32 deletions(-)

diff --git a/include/net/codel.h b/include/net/codel.h
index d168aca115cc..06ac687b4909 100644
--- a/include/net/codel.h
+++ b/include/net/codel.h
@@ -176,12 +176,10 @@ struct codel_stats {
 
 #define CODEL_DISABLED_THRESHOLD INT_MAX
 
-static void codel_params_init(struct codel_params *params,
-			      const struct Qdisc *sch)
+static void codel_params_init(struct codel_params *params)
 {
 	params->interval = MS2TIME(100);
 	params->target = MS2TIME(5);
-	params->mtu = psched_mtu(qdisc_dev(sch));
 	params->ce_threshold = CODEL_DISABLED_THRESHOLD;
 	params->ecn = false;
 }
@@ -226,28 +224,38 @@ static codel_time_t codel_control_law(codel_time_t t,
 	return t + reciprocal_scale(interval, rec_inv_sqrt << REC_INV_SQRT_SHIFT);
 }
 
+typedef u32 (*codel_skb_len_t)(const struct sk_buff *skb);
+typedef codel_time_t (*codel_skb_time_t)(const struct sk_buff *skb);
+typedef void (*codel_skb_drop_t)(struct sk_buff *skb, void *ctx);
+typedef struct sk_buff * (*codel_skb_dequeue_t)(struct codel_vars *vars,
+						void *ctx);
+
 static bool codel_should_drop(const struct sk_buff *skb,
-			      struct Qdisc *sch,
+			      void *ctx,
 			      struct codel_vars *vars,
 			      struct codel_params *params,
 			      struct codel_stats *stats,
+			      codel_skb_len_t skb_len_func,
+			      codel_skb_time_t skb_time_func,
+			      u32 *backlog,
 			      codel_time_t now)
 {
 	bool ok_to_drop;
+	u32 skb_len;
 
 	if (!skb) {
 		vars->first_above_time = 0;
 		return false;
 	}
 
-	vars->ldelay = now - codel_get_enqueue_time(skb);
-	sch->qstats.backlog -= qdisc_pkt_len(skb);
+	skb_len = skb_len_func(skb);
+	vars->ldelay = now - skb_time_func(skb);
 
-	if (unlikely(qdisc_pkt_len(skb) > stats->maxpacket))
-		stats->maxpacket = qdisc_pkt_len(skb);
+	if (unlikely(skb_len > stats->maxpacket))
+		stats->maxpacket = skb_len;
 
 	if (codel_time_before(vars->ldelay, params->target) ||
-	    sch->qstats.backlog <= params->mtu) {
+	    *backlog <= params->mtu) {
 		/* went below - stay below for at least interval */
 		vars->first_above_time = 0;
 		return false;
@@ -264,16 +272,17 @@ static bool codel_should_drop(const struct sk_buff *skb,
 	return ok_to_drop;
 }
 
-typedef struct sk_buff * (*codel_skb_dequeue_t)(struct codel_vars *vars,
-						struct Qdisc *sch);
-
-static struct sk_buff *codel_dequeue(struct Qdisc *sch,
+static struct sk_buff *codel_dequeue(void *ctx,
+				     u32 *backlog,
 				     struct codel_params *params,
 				     struct codel_vars *vars,
 				     struct codel_stats *stats,
+				     codel_skb_len_t skb_len_func,
+				     codel_skb_time_t skb_time_func,
+				     codel_skb_drop_t drop_func,
 				     codel_skb_dequeue_t dequeue_func)
 {
-	struct sk_buff *skb = dequeue_func(vars, sch);
+	struct sk_buff *skb = dequeue_func(vars, ctx);
 	codel_time_t now;
 	bool drop;
 
@@ -282,7 +291,8 @@ static struct sk_buff *codel_dequeue(struct Qdisc *sch,
 		return skb;
 	}
 	now = codel_get_time();
-	drop = codel_should_drop(skb, sch, vars, params, stats, now);
+	drop = codel_should_drop(skb, ctx, vars, params, stats,
+				 skb_len_func, skb_time_func, backlog, now);
 	if (vars->dropping) {
 		if (!drop) {
 			/* sojourn time below target - leave dropping state */
@@ -310,12 +320,15 @@ static struct sk_buff *codel_dequeue(struct Qdisc *sch,
 								  vars->rec_inv_sqrt);
 					goto end;
 				}
-				stats->drop_len += qdisc_pkt_len(skb);
-				qdisc_drop(skb, sch);
+				stats->drop_len += skb_len_func(skb);
+				drop_func(skb, ctx);
 				stats->drop_count++;
-				skb = dequeue_func(vars, sch);
-				if (!codel_should_drop(skb, sch,
-						       vars, params, stats, now)) {
+				skb = dequeue_func(vars, ctx);
+				if (!codel_should_drop(skb, ctx,
+						       vars, params, stats,
+						       skb_len_func,
+						       skb_time_func,
+						       backlog, now)) {
 					/* leave dropping state */
 					vars->dropping = false;
 				} else {
@@ -333,13 +346,14 @@ static struct sk_buff *codel_dequeue(struct Qdisc *sch,
 		if (params->ecn && INET_ECN_set_ce(skb)) {
 			stats->ecn_mark++;
 		} else {
-			stats->drop_len += qdisc_pkt_len(skb);
-			qdisc_drop(skb, sch);
+			stats->drop_len += skb_len_func(skb);
+			drop_func(skb, ctx);
 			stats->drop_count++;
 
-			skb = dequeue_func(vars, sch);
-			drop = codel_should_drop(skb, sch, vars, params,
-						 stats, now);
+			skb = dequeue_func(vars, ctx);
+			drop = codel_should_drop(skb, ctx, vars, params,
+						 stats, skb_len_func,
+						 skb_time_func, backlog, now);
 		}
 		vars->dropping = true;
 		/* if min went above target close to when we last went below it
diff --git a/net/sched/sch_codel.c b/net/sched/sch_codel.c
index 9b7e2980ee5c..512a94abe351 100644
--- a/net/sched/sch_codel.c
+++ b/net/sched/sch_codel.c
@@ -64,20 +64,33 @@ struct codel_sched_data {
  * to dequeue a packet from queue. Note: backlog is handled in
  * codel, we dont need to reduce it here.
  */
-static struct sk_buff *dequeue(struct codel_vars *vars, struct Qdisc *sch)
+static struct sk_buff *dequeue_func(struct codel_vars *vars, void *ctx)
 {
+	struct Qdisc *sch = ctx;
 	struct sk_buff *skb = __skb_dequeue(&sch->q);
 
+	if (skb)
+		sch->qstats.backlog -= qdisc_pkt_len(skb);
+
 	prefetch(&skb->end); /* we'll need skb_shinfo() */
 	return skb;
 }
 
+static void drop_func(struct sk_buff *skb, void *ctx)
+{
+	struct Qdisc *sch = ctx;
+
+	qdisc_drop(skb, sch);
+}
+
 static struct sk_buff *codel_qdisc_dequeue(struct Qdisc *sch)
 {
 	struct codel_sched_data *q = qdisc_priv(sch);
 	struct sk_buff *skb;
 
-	skb = codel_dequeue(sch, &q->params, &q->vars, &q->stats, dequeue);
+	skb = codel_dequeue(sch, &sch->qstats.backlog, &q->params, &q->vars,
+			    &q->stats, qdisc_pkt_len, codel_get_enqueue_time,
+			    drop_func, dequeue_func);
 
 	/* We cant call qdisc_tree_reduce_backlog() if our qlen is 0,
 	 * or HTB crashes. Defer it for next round.
@@ -173,9 +186,10 @@ static int codel_init(struct Qdisc *sch, struct nlattr *opt)
 
 	sch->limit = DEFAULT_CODEL_LIMIT;
 
-	codel_params_init(&q->params, sch);
+	codel_params_init(&q->params);
 	codel_vars_init(&q->vars);
 	codel_stats_init(&q->stats);
+	q->params.mtu = psched_mtu(qdisc_dev(sch));
 
 	if (opt) {
 		int err = codel_change(sch, opt);
diff --git a/net/sched/sch_fq_codel.c b/net/sched/sch_fq_codel.c
index d3fc8f9dd3d4..dcf7266e6901 100644
--- a/net/sched/sch_fq_codel.c
+++ b/net/sched/sch_fq_codel.c
@@ -220,8 +220,9 @@ static int fq_codel_enqueue(struct sk_buff *skb, struct Qdisc *sch)
  * to dequeue a packet from queue. Note: backlog is handled in
  * codel, we dont need to reduce it here.
  */
-static struct sk_buff *dequeue(struct codel_vars *vars, struct Qdisc *sch)
+static struct sk_buff *dequeue_func(struct codel_vars *vars, void *ctx)
 {
+	struct Qdisc *sch = ctx;
 	struct fq_codel_sched_data *q = qdisc_priv(sch);
 	struct fq_codel_flow *flow;
 	struct sk_buff *skb = NULL;
@@ -231,10 +232,18 @@ static struct sk_buff *dequeue(struct codel_vars *vars, struct Qdisc *sch)
 		skb = dequeue_head(flow);
 		q->backlogs[flow - q->flows] -= qdisc_pkt_len(skb);
 		sch->q.qlen--;
+		sch->qstats.backlog -= qdisc_pkt_len(skb);
 	}
 	return skb;
 }
 
+static void drop_func(struct sk_buff *skb, void *ctx)
+{
+	struct Qdisc *sch = ctx;
+
+	qdisc_drop(skb, sch);
+}
+
 static struct sk_buff *fq_codel_dequeue(struct Qdisc *sch)
 {
 	struct fq_codel_sched_data *q = qdisc_priv(sch);
@@ -263,8 +272,9 @@ begin:
 	prev_ecn_mark = q->cstats.ecn_mark;
 	prev_backlog = sch->qstats.backlog;
 
-	skb = codel_dequeue(sch, &q->cparams, &flow->cvars, &q->cstats,
-			    dequeue);
+	skb = codel_dequeue(sch, &sch->qstats.backlog, &q->cparams,
+			    &flow->cvars, &q->cstats, qdisc_pkt_len,
+			    codel_get_enqueue_time, drop_func, dequeue_func);
 
 	flow->dropped += q->cstats.drop_count - prev_drop_count;
 	flow->dropped += q->cstats.ecn_mark - prev_ecn_mark;
@@ -423,9 +433,10 @@ static int fq_codel_init(struct Qdisc *sch, struct nlattr *opt)
 	q->perturbation = prandom_u32();
 	INIT_LIST_HEAD(&q->new_flows);
 	INIT_LIST_HEAD(&q->old_flows);
-	codel_params_init(&q->cparams, sch);
+	codel_params_init(&q->cparams);
 	codel_stats_init(&q->cstats);
 	q->cparams.ecn = true;
+	q->cparams.mtu = psched_mtu(qdisc_dev(sch));
 
 	if (opt) {
 		int err = fq_codel_change(sch, opt);
-- 
2.1.4

^ permalink raw reply related

* [PATCH 2/2] codel: split into multiple files
From: Michal Kazior @ 2016-04-22 12:15 UTC (permalink / raw)
  To: netdev; +Cc: davem, johannes, Michal Kazior
In-Reply-To: <1461327359-21646-1-git-send-email-michal.kazior@tieto.com>

It was impossible to include codel.h for the
purpose of having access to codel_params or
codel_vars structure definitions and using them
for embedding in other more complex structures.

This splits allows codel.h itself to be treated
like any other header file while codel_qdisc.h and
codel_impl.h contain function definitions with
logic that was previously in codel.h.

This copies over copyrights and doesn't involve
code changes other than adding a few additional
include directives to net/sched/sch*codel.c.

Signed-off-by: Michal Kazior <michal.kazior@tieto.com>
---
 include/net/codel.h       | 223 ----------------------------------------
 include/net/codel_impl.h  | 255 ++++++++++++++++++++++++++++++++++++++++++++++
 include/net/codel_qdisc.h |  73 +++++++++++++
 net/sched/sch_codel.c     |   2 +
 net/sched/sch_fq_codel.c  |   2 +
 5 files changed, 332 insertions(+), 223 deletions(-)
 create mode 100644 include/net/codel_impl.h
 create mode 100644 include/net/codel_qdisc.h

diff --git a/include/net/codel.h b/include/net/codel.h
index 06ac687b4909..a6e428f80135 100644
--- a/include/net/codel.h
+++ b/include/net/codel.h
@@ -87,27 +87,6 @@ static inline codel_time_t codel_get_time(void)
 	 ((s32)((a) - (b)) >= 0))
 #define codel_time_before_eq(a, b)	codel_time_after_eq(b, a)
 
-/* Qdiscs using codel plugin must use codel_skb_cb in their own cb[] */
-struct codel_skb_cb {
-	codel_time_t enqueue_time;
-};
-
-static struct codel_skb_cb *get_codel_cb(const struct sk_buff *skb)
-{
-	qdisc_cb_private_validate(skb, sizeof(struct codel_skb_cb));
-	return (struct codel_skb_cb *)qdisc_skb_cb(skb)->data;
-}
-
-static codel_time_t codel_get_enqueue_time(const struct sk_buff *skb)
-{
-	return get_codel_cb(skb)->enqueue_time;
-}
-
-static void codel_set_enqueue_time(struct sk_buff *skb)
-{
-	get_codel_cb(skb)->enqueue_time = codel_get_time();
-}
-
 static inline u32 codel_time_to_us(codel_time_t val)
 {
 	u64 valns = ((u64)val << CODEL_SHIFT);
@@ -176,212 +155,10 @@ struct codel_stats {
 
 #define CODEL_DISABLED_THRESHOLD INT_MAX
 
-static void codel_params_init(struct codel_params *params)
-{
-	params->interval = MS2TIME(100);
-	params->target = MS2TIME(5);
-	params->ce_threshold = CODEL_DISABLED_THRESHOLD;
-	params->ecn = false;
-}
-
-static void codel_vars_init(struct codel_vars *vars)
-{
-	memset(vars, 0, sizeof(*vars));
-}
-
-static void codel_stats_init(struct codel_stats *stats)
-{
-	stats->maxpacket = 0;
-}
-
-/*
- * http://en.wikipedia.org/wiki/Methods_of_computing_square_roots#Iterative_methods_for_reciprocal_square_roots
- * new_invsqrt = (invsqrt / 2) * (3 - count * invsqrt^2)
- *
- * Here, invsqrt is a fixed point number (< 1.0), 32bit mantissa, aka Q0.32
- */
-static void codel_Newton_step(struct codel_vars *vars)
-{
-	u32 invsqrt = ((u32)vars->rec_inv_sqrt) << REC_INV_SQRT_SHIFT;
-	u32 invsqrt2 = ((u64)invsqrt * invsqrt) >> 32;
-	u64 val = (3LL << 32) - ((u64)vars->count * invsqrt2);
-
-	val >>= 2; /* avoid overflow in following multiply */
-	val = (val * invsqrt) >> (32 - 2 + 1);
-
-	vars->rec_inv_sqrt = val >> REC_INV_SQRT_SHIFT;
-}
-
-/*
- * CoDel control_law is t + interval/sqrt(count)
- * We maintain in rec_inv_sqrt the reciprocal value of sqrt(count) to avoid
- * both sqrt() and divide operation.
- */
-static codel_time_t codel_control_law(codel_time_t t,
-				      codel_time_t interval,
-				      u32 rec_inv_sqrt)
-{
-	return t + reciprocal_scale(interval, rec_inv_sqrt << REC_INV_SQRT_SHIFT);
-}
-
 typedef u32 (*codel_skb_len_t)(const struct sk_buff *skb);
 typedef codel_time_t (*codel_skb_time_t)(const struct sk_buff *skb);
 typedef void (*codel_skb_drop_t)(struct sk_buff *skb, void *ctx);
 typedef struct sk_buff * (*codel_skb_dequeue_t)(struct codel_vars *vars,
 						void *ctx);
 
-static bool codel_should_drop(const struct sk_buff *skb,
-			      void *ctx,
-			      struct codel_vars *vars,
-			      struct codel_params *params,
-			      struct codel_stats *stats,
-			      codel_skb_len_t skb_len_func,
-			      codel_skb_time_t skb_time_func,
-			      u32 *backlog,
-			      codel_time_t now)
-{
-	bool ok_to_drop;
-	u32 skb_len;
-
-	if (!skb) {
-		vars->first_above_time = 0;
-		return false;
-	}
-
-	skb_len = skb_len_func(skb);
-	vars->ldelay = now - skb_time_func(skb);
-
-	if (unlikely(skb_len > stats->maxpacket))
-		stats->maxpacket = skb_len;
-
-	if (codel_time_before(vars->ldelay, params->target) ||
-	    *backlog <= params->mtu) {
-		/* went below - stay below for at least interval */
-		vars->first_above_time = 0;
-		return false;
-	}
-	ok_to_drop = false;
-	if (vars->first_above_time == 0) {
-		/* just went above from below. If we stay above
-		 * for at least interval we'll say it's ok to drop
-		 */
-		vars->first_above_time = now + params->interval;
-	} else if (codel_time_after(now, vars->first_above_time)) {
-		ok_to_drop = true;
-	}
-	return ok_to_drop;
-}
-
-static struct sk_buff *codel_dequeue(void *ctx,
-				     u32 *backlog,
-				     struct codel_params *params,
-				     struct codel_vars *vars,
-				     struct codel_stats *stats,
-				     codel_skb_len_t skb_len_func,
-				     codel_skb_time_t skb_time_func,
-				     codel_skb_drop_t drop_func,
-				     codel_skb_dequeue_t dequeue_func)
-{
-	struct sk_buff *skb = dequeue_func(vars, ctx);
-	codel_time_t now;
-	bool drop;
-
-	if (!skb) {
-		vars->dropping = false;
-		return skb;
-	}
-	now = codel_get_time();
-	drop = codel_should_drop(skb, ctx, vars, params, stats,
-				 skb_len_func, skb_time_func, backlog, now);
-	if (vars->dropping) {
-		if (!drop) {
-			/* sojourn time below target - leave dropping state */
-			vars->dropping = false;
-		} else if (codel_time_after_eq(now, vars->drop_next)) {
-			/* It's time for the next drop. Drop the current
-			 * packet and dequeue the next. The dequeue might
-			 * take us out of dropping state.
-			 * If not, schedule the next drop.
-			 * A large backlog might result in drop rates so high
-			 * that the next drop should happen now,
-			 * hence the while loop.
-			 */
-			while (vars->dropping &&
-			       codel_time_after_eq(now, vars->drop_next)) {
-				vars->count++; /* dont care of possible wrap
-						* since there is no more divide
-						*/
-				codel_Newton_step(vars);
-				if (params->ecn && INET_ECN_set_ce(skb)) {
-					stats->ecn_mark++;
-					vars->drop_next =
-						codel_control_law(vars->drop_next,
-								  params->interval,
-								  vars->rec_inv_sqrt);
-					goto end;
-				}
-				stats->drop_len += skb_len_func(skb);
-				drop_func(skb, ctx);
-				stats->drop_count++;
-				skb = dequeue_func(vars, ctx);
-				if (!codel_should_drop(skb, ctx,
-						       vars, params, stats,
-						       skb_len_func,
-						       skb_time_func,
-						       backlog, now)) {
-					/* leave dropping state */
-					vars->dropping = false;
-				} else {
-					/* and schedule the next drop */
-					vars->drop_next =
-						codel_control_law(vars->drop_next,
-								  params->interval,
-								  vars->rec_inv_sqrt);
-				}
-			}
-		}
-	} else if (drop) {
-		u32 delta;
-
-		if (params->ecn && INET_ECN_set_ce(skb)) {
-			stats->ecn_mark++;
-		} else {
-			stats->drop_len += skb_len_func(skb);
-			drop_func(skb, ctx);
-			stats->drop_count++;
-
-			skb = dequeue_func(vars, ctx);
-			drop = codel_should_drop(skb, ctx, vars, params,
-						 stats, skb_len_func,
-						 skb_time_func, backlog, now);
-		}
-		vars->dropping = true;
-		/* if min went above target close to when we last went below it
-		 * assume that the drop rate that controlled the queue on the
-		 * last cycle is a good starting point to control it now.
-		 */
-		delta = vars->count - vars->lastcount;
-		if (delta > 1 &&
-		    codel_time_before(now - vars->drop_next,
-				      16 * params->interval)) {
-			vars->count = delta;
-			/* we dont care if rec_inv_sqrt approximation
-			 * is not very precise :
-			 * Next Newton steps will correct it quadratically.
-			 */
-			codel_Newton_step(vars);
-		} else {
-			vars->count = 1;
-			vars->rec_inv_sqrt = ~0U >> REC_INV_SQRT_SHIFT;
-		}
-		vars->lastcount = vars->count;
-		vars->drop_next = codel_control_law(now, params->interval,
-						    vars->rec_inv_sqrt);
-	}
-end:
-	if (skb && codel_time_after(vars->ldelay, params->ce_threshold) &&
-	    INET_ECN_set_ce(skb))
-		stats->ce_mark++;
-	return skb;
-}
 #endif
diff --git a/include/net/codel_impl.h b/include/net/codel_impl.h
new file mode 100644
index 000000000000..d289b91dcd65
--- /dev/null
+++ b/include/net/codel_impl.h
@@ -0,0 +1,255 @@
+#ifndef __NET_SCHED_CODEL_IMPL_H
+#define __NET_SCHED_CODEL_IMPL_H
+
+/*
+ * Codel - The Controlled-Delay Active Queue Management algorithm
+ *
+ *  Copyright (C) 2011-2012 Kathleen Nichols <nichols@pollere.com>
+ *  Copyright (C) 2011-2012 Van Jacobson <van@pollere.net>
+ *  Copyright (C) 2012 Michael D. Taht <dave.taht@bufferbloat.net>
+ *  Copyright (C) 2012,2015 Eric Dumazet <edumazet@google.com>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions, and the following disclaimer,
+ *    without modification.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. The names of the authors may not be used to endorse or promote products
+ *    derived from this software without specific prior written permission.
+ *
+ * Alternatively, provided that this notice is retained in full, this
+ * software may be distributed under the terms of the GNU General
+ * Public License ("GPL") version 2, in which case the provisions of the
+ * GPL apply INSTEAD OF those given above.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
+ * DAMAGE.
+ *
+ */
+
+/* Controlling Queue Delay (CoDel) algorithm
+ * =========================================
+ * Source : Kathleen Nichols and Van Jacobson
+ * http://queue.acm.org/detail.cfm?id=2209336
+ *
+ * Implemented on linux by Dave Taht and Eric Dumazet
+ */
+
+static void codel_params_init(struct codel_params *params)
+{
+	params->interval = MS2TIME(100);
+	params->target = MS2TIME(5);
+	params->ce_threshold = CODEL_DISABLED_THRESHOLD;
+	params->ecn = false;
+}
+
+static void codel_vars_init(struct codel_vars *vars)
+{
+	memset(vars, 0, sizeof(*vars));
+}
+
+static void codel_stats_init(struct codel_stats *stats)
+{
+	stats->maxpacket = 0;
+}
+
+/*
+ * http://en.wikipedia.org/wiki/Methods_of_computing_square_roots#Iterative_methods_for_reciprocal_square_roots
+ * new_invsqrt = (invsqrt / 2) * (3 - count * invsqrt^2)
+ *
+ * Here, invsqrt is a fixed point number (< 1.0), 32bit mantissa, aka Q0.32
+ */
+static void codel_Newton_step(struct codel_vars *vars)
+{
+	u32 invsqrt = ((u32)vars->rec_inv_sqrt) << REC_INV_SQRT_SHIFT;
+	u32 invsqrt2 = ((u64)invsqrt * invsqrt) >> 32;
+	u64 val = (3LL << 32) - ((u64)vars->count * invsqrt2);
+
+	val >>= 2; /* avoid overflow in following multiply */
+	val = (val * invsqrt) >> (32 - 2 + 1);
+
+	vars->rec_inv_sqrt = val >> REC_INV_SQRT_SHIFT;
+}
+
+/*
+ * CoDel control_law is t + interval/sqrt(count)
+ * We maintain in rec_inv_sqrt the reciprocal value of sqrt(count) to avoid
+ * both sqrt() and divide operation.
+ */
+static codel_time_t codel_control_law(codel_time_t t,
+				      codel_time_t interval,
+				      u32 rec_inv_sqrt)
+{
+	return t + reciprocal_scale(interval, rec_inv_sqrt << REC_INV_SQRT_SHIFT);
+}
+
+static bool codel_should_drop(const struct sk_buff *skb,
+			      void *ctx,
+			      struct codel_vars *vars,
+			      struct codel_params *params,
+			      struct codel_stats *stats,
+			      codel_skb_len_t skb_len_func,
+			      codel_skb_time_t skb_time_func,
+			      u32 *backlog,
+			      codel_time_t now)
+{
+	bool ok_to_drop;
+	u32 skb_len;
+
+	if (!skb) {
+		vars->first_above_time = 0;
+		return false;
+	}
+
+	skb_len = skb_len_func(skb);
+	vars->ldelay = now - skb_time_func(skb);
+
+	if (unlikely(skb_len > stats->maxpacket))
+		stats->maxpacket = skb_len;
+
+	if (codel_time_before(vars->ldelay, params->target) ||
+	    *backlog <= params->mtu) {
+		/* went below - stay below for at least interval */
+		vars->first_above_time = 0;
+		return false;
+	}
+	ok_to_drop = false;
+	if (vars->first_above_time == 0) {
+		/* just went above from below. If we stay above
+		 * for at least interval we'll say it's ok to drop
+		 */
+		vars->first_above_time = now + params->interval;
+	} else if (codel_time_after(now, vars->first_above_time)) {
+		ok_to_drop = true;
+	}
+	return ok_to_drop;
+}
+
+static struct sk_buff *codel_dequeue(void *ctx,
+				     u32 *backlog,
+				     struct codel_params *params,
+				     struct codel_vars *vars,
+				     struct codel_stats *stats,
+				     codel_skb_len_t skb_len_func,
+				     codel_skb_time_t skb_time_func,
+				     codel_skb_drop_t drop_func,
+				     codel_skb_dequeue_t dequeue_func)
+{
+	struct sk_buff *skb = dequeue_func(vars, ctx);
+	codel_time_t now;
+	bool drop;
+
+	if (!skb) {
+		vars->dropping = false;
+		return skb;
+	}
+	now = codel_get_time();
+	drop = codel_should_drop(skb, ctx, vars, params, stats,
+				 skb_len_func, skb_time_func, backlog, now);
+	if (vars->dropping) {
+		if (!drop) {
+			/* sojourn time below target - leave dropping state */
+			vars->dropping = false;
+		} else if (codel_time_after_eq(now, vars->drop_next)) {
+			/* It's time for the next drop. Drop the current
+			 * packet and dequeue the next. The dequeue might
+			 * take us out of dropping state.
+			 * If not, schedule the next drop.
+			 * A large backlog might result in drop rates so high
+			 * that the next drop should happen now,
+			 * hence the while loop.
+			 */
+			while (vars->dropping &&
+			       codel_time_after_eq(now, vars->drop_next)) {
+				vars->count++; /* dont care of possible wrap
+						* since there is no more divide
+						*/
+				codel_Newton_step(vars);
+				if (params->ecn && INET_ECN_set_ce(skb)) {
+					stats->ecn_mark++;
+					vars->drop_next =
+						codel_control_law(vars->drop_next,
+								  params->interval,
+								  vars->rec_inv_sqrt);
+					goto end;
+				}
+				stats->drop_len += skb_len_func(skb);
+				drop_func(skb, ctx);
+				stats->drop_count++;
+				skb = dequeue_func(vars, ctx);
+				if (!codel_should_drop(skb, ctx,
+						       vars, params, stats,
+						       skb_len_func,
+						       skb_time_func,
+						       backlog, now)) {
+					/* leave dropping state */
+					vars->dropping = false;
+				} else {
+					/* and schedule the next drop */
+					vars->drop_next =
+						codel_control_law(vars->drop_next,
+								  params->interval,
+								  vars->rec_inv_sqrt);
+				}
+			}
+		}
+	} else if (drop) {
+		u32 delta;
+
+		if (params->ecn && INET_ECN_set_ce(skb)) {
+			stats->ecn_mark++;
+		} else {
+			stats->drop_len += skb_len_func(skb);
+			drop_func(skb, ctx);
+			stats->drop_count++;
+
+			skb = dequeue_func(vars, ctx);
+			drop = codel_should_drop(skb, ctx, vars, params,
+						 stats, skb_len_func,
+						 skb_time_func, backlog, now);
+		}
+		vars->dropping = true;
+		/* if min went above target close to when we last went below it
+		 * assume that the drop rate that controlled the queue on the
+		 * last cycle is a good starting point to control it now.
+		 */
+		delta = vars->count - vars->lastcount;
+		if (delta > 1 &&
+		    codel_time_before(now - vars->drop_next,
+				      16 * params->interval)) {
+			vars->count = delta;
+			/* we dont care if rec_inv_sqrt approximation
+			 * is not very precise :
+			 * Next Newton steps will correct it quadratically.
+			 */
+			codel_Newton_step(vars);
+		} else {
+			vars->count = 1;
+			vars->rec_inv_sqrt = ~0U >> REC_INV_SQRT_SHIFT;
+		}
+		vars->lastcount = vars->count;
+		vars->drop_next = codel_control_law(now, params->interval,
+						    vars->rec_inv_sqrt);
+	}
+end:
+	if (skb && codel_time_after(vars->ldelay, params->ce_threshold) &&
+	    INET_ECN_set_ce(skb))
+		stats->ce_mark++;
+	return skb;
+}
+
+#endif
diff --git a/include/net/codel_qdisc.h b/include/net/codel_qdisc.h
new file mode 100644
index 000000000000..8144d9cd2908
--- /dev/null
+++ b/include/net/codel_qdisc.h
@@ -0,0 +1,73 @@
+#ifndef __NET_SCHED_CODEL_QDISC_H
+#define __NET_SCHED_CODEL_QDISC_H
+
+/*
+ * Codel - The Controlled-Delay Active Queue Management algorithm
+ *
+ *  Copyright (C) 2011-2012 Kathleen Nichols <nichols@pollere.com>
+ *  Copyright (C) 2011-2012 Van Jacobson <van@pollere.net>
+ *  Copyright (C) 2012 Michael D. Taht <dave.taht@bufferbloat.net>
+ *  Copyright (C) 2012,2015 Eric Dumazet <edumazet@google.com>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions, and the following disclaimer,
+ *    without modification.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. The names of the authors may not be used to endorse or promote products
+ *    derived from this software without specific prior written permission.
+ *
+ * Alternatively, provided that this notice is retained in full, this
+ * software may be distributed under the terms of the GNU General
+ * Public License ("GPL") version 2, in which case the provisions of the
+ * GPL apply INSTEAD OF those given above.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
+ * DAMAGE.
+ *
+ */
+
+/* Controlling Queue Delay (CoDel) algorithm
+ * =========================================
+ * Source : Kathleen Nichols and Van Jacobson
+ * http://queue.acm.org/detail.cfm?id=2209336
+ *
+ * Implemented on linux by Dave Taht and Eric Dumazet
+ */
+
+/* Qdiscs using codel plugin must use codel_skb_cb in their own cb[] */
+struct codel_skb_cb {
+	codel_time_t enqueue_time;
+};
+
+static struct codel_skb_cb *get_codel_cb(const struct sk_buff *skb)
+{
+	qdisc_cb_private_validate(skb, sizeof(struct codel_skb_cb));
+	return (struct codel_skb_cb *)qdisc_skb_cb(skb)->data;
+}
+
+static codel_time_t codel_get_enqueue_time(const struct sk_buff *skb)
+{
+	return get_codel_cb(skb)->enqueue_time;
+}
+
+static void codel_set_enqueue_time(struct sk_buff *skb)
+{
+	get_codel_cb(skb)->enqueue_time = codel_get_time();
+}
+
+#endif
diff --git a/net/sched/sch_codel.c b/net/sched/sch_codel.c
index 512a94abe351..dddf3bb65a32 100644
--- a/net/sched/sch_codel.c
+++ b/net/sched/sch_codel.c
@@ -49,6 +49,8 @@
 #include <linux/prefetch.h>
 #include <net/pkt_sched.h>
 #include <net/codel.h>
+#include <net/codel_impl.h>
+#include <net/codel_qdisc.h>
 
 
 #define DEFAULT_CODEL_LIMIT 1000
diff --git a/net/sched/sch_fq_codel.c b/net/sched/sch_fq_codel.c
index dcf7266e6901..a5e420b3d4ab 100644
--- a/net/sched/sch_fq_codel.c
+++ b/net/sched/sch_fq_codel.c
@@ -24,6 +24,8 @@
 #include <net/netlink.h>
 #include <net/pkt_sched.h>
 #include <net/codel.h>
+#include <net/codel_impl.h>
+#include <net/codel_qdisc.h>
 
 /*	Fair Queue CoDel.
  *
-- 
2.1.4

^ permalink raw reply related

* Re: [PATCH net-next] macsec: Convert to using IFF_NO_QUEUE
From: Sabrina Dubroca @ 2016-04-22 12:17 UTC (permalink / raw)
  To: Phil Sutter; +Cc: David Miller, netdev
In-Reply-To: <1461326562-6595-1-git-send-email-phil@nwl.cc>

2016-04-22, 14:02:42 +0200, Phil Sutter wrote:
> Signed-off-by: Phil Sutter <phil@nwl.cc>
[...]

Acked-by: Sabrina Dubroca <sd@queasysnail.net>

-- 
Sabrina

^ permalink raw reply

* [PATCH] fq: add fair queuing framework
From: Michal Kazior @ 2016-04-22 12:20 UTC (permalink / raw)
  To: netdev; +Cc: davem, johannes, Michal Kazior

This works on the same implementation principle as
codel*.h, i.e. there's a generic header with
structures and macros and a implementation header
carrying function definitions to include in given,
e.g. driver or module.

The fairness logic comes from
net/sched/sch_fq_codel.c but is generalized so it
is more flexible and easier to re-use.

Signed-off-by: Michal Kazior <michal.kazior@tieto.com>
---
This will be used by mac80211.

For more background please see:

  https://www.spinics.net/lists/linux-wireless/msg149976.html


 include/net/fq.h      |  95 ++++++++++++++++++
 include/net/fq_impl.h | 269 ++++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 364 insertions(+)
 create mode 100644 include/net/fq.h
 create mode 100644 include/net/fq_impl.h

diff --git a/include/net/fq.h b/include/net/fq.h
new file mode 100644
index 000000000000..268b49049c37
--- /dev/null
+++ b/include/net/fq.h
@@ -0,0 +1,95 @@
+/*
+ * Copyright (c) 2016 Qualcomm Atheros, Inc
+ *
+ * GPL v2
+ *
+ * Based on net/sched/sch_fq_codel.c
+ */
+#ifndef __NET_SCHED_FQ_H
+#define __NET_SCHED_FQ_H
+
+struct fq_tin;
+
+/**
+ * struct fq_flow - per traffic flow queue
+ *
+ * @tin: owner of this flow. Used to manage collisions, i.e. when a packet
+ *	hashes to an index which points to a flow that is already owned by a
+ *	different tin the packet is destined to. In such case the implementer
+ *	must provide a fallback flow
+ * @flowchain: can be linked to fq_tin's new_flows or old_flows. Used for DRR++
+ *	(deficit round robin) based round robin queuing similar to the one
+ *	found in net/sched/sch_fq_codel.c
+ * @backlogchain: can be linked to other fq_flow and fq. Used to keep track of
+ *	fat flows and efficient head-dropping if packet limit is reached
+ * @queue: sk_buff queue to hold packets
+ * @backlog: number of bytes pending in the queue. The number of packets can be
+ *	found in @queue.qlen
+ * @deficit: used for DRR++
+ */
+struct fq_flow {
+	struct fq_tin *tin;
+	struct list_head flowchain;
+	struct list_head backlogchain;
+	struct sk_buff_head queue;
+	u32 backlog;
+	int deficit;
+};
+
+/**
+ * struct fq_tin - a logical container of fq_flows
+ *
+ * Used to group fq_flows into a logical aggregate. DRR++ scheme is used to
+ * pull interleaved packets out of the associated flows.
+ *
+ * @new_flows: linked list of fq_flow
+ * @old_flows: linked list of fq_flow
+ */
+struct fq_tin {
+	struct list_head new_flows;
+	struct list_head old_flows;
+	u32 backlog_bytes;
+	u32 backlog_packets;
+	u32 overlimit;
+	u32 collisions;
+	u32 flows;
+	u32 tx_bytes;
+	u32 tx_packets;
+};
+
+/**
+ * struct fq - main container for fair queuing purposes
+ *
+ * @backlogs: linked to fq_flows. Used to maintain fat flows for efficient
+ *	head-dropping when @backlog reaches @limit
+ * @limit: max number of packets that can be queued across all flows
+ * @backlog: number of packets queued across all flows
+ */
+struct fq {
+	struct fq_flow *flows;
+	struct list_head backlogs;
+	spinlock_t lock;
+	u32 flows_cnt;
+	u32 perturbation;
+	u32 limit;
+	u32 quantum;
+	u32 backlog;
+	u32 overlimit;
+	u32 collisions;
+};
+
+typedef struct sk_buff *fq_tin_dequeue_t(struct fq *,
+					 struct fq_tin *,
+					 struct fq_flow *flow);
+
+typedef void fq_skb_free_t(struct fq *,
+			   struct fq_tin *,
+			   struct fq_flow *,
+			   struct sk_buff *);
+
+typedef struct fq_flow *fq_flow_get_default_t(struct fq *,
+					      struct fq_tin *,
+					      int idx,
+					      struct sk_buff *);
+
+#endif
diff --git a/include/net/fq_impl.h b/include/net/fq_impl.h
new file mode 100644
index 000000000000..02eab7c51adb
--- /dev/null
+++ b/include/net/fq_impl.h
@@ -0,0 +1,269 @@
+/*
+ * Copyright (c) 2016 Qualcomm Atheros, Inc
+ *
+ * GPL v2
+ *
+ * Based on net/sched/sch_fq_codel.c
+ */
+#ifndef __NET_SCHED_FQ_IMPL_H
+#define __NET_SCHED_FQ_IMPL_H
+
+#include <net/fq.h>
+
+/* functions that are embedded into includer */
+
+static struct sk_buff *fq_flow_dequeue(struct fq *fq,
+				       struct fq_flow *flow)
+{
+	struct fq_tin *tin = flow->tin;
+	struct fq_flow *i;
+	struct sk_buff *skb;
+
+	lockdep_assert_held(&fq->lock);
+
+	skb = __skb_dequeue(&flow->queue);
+	if (!skb)
+		return NULL;
+
+	tin->backlog_bytes -= skb->len;
+	tin->backlog_packets--;
+	flow->backlog -= skb->len;
+	fq->backlog--;
+
+	if (flow->backlog == 0) {
+		list_del_init(&flow->backlogchain);
+	} else {
+		i = flow;
+
+		list_for_each_entry_continue(i, &fq->backlogs, backlogchain)
+			if (i->backlog < flow->backlog)
+				break;
+
+		list_move_tail(&flow->backlogchain,
+			       &i->backlogchain);
+	}
+
+	return skb;
+}
+
+static struct sk_buff *fq_tin_dequeue(struct fq *fq,
+				      struct fq_tin *tin,
+				      fq_tin_dequeue_t dequeue_func)
+{
+	struct fq_flow *flow;
+	struct list_head *head;
+	struct sk_buff *skb;
+
+	lockdep_assert_held(&fq->lock);
+
+begin:
+	head = &tin->new_flows;
+	if (list_empty(head)) {
+		head = &tin->old_flows;
+		if (list_empty(head))
+			return NULL;
+	}
+
+	flow = list_first_entry(head, struct fq_flow, flowchain);
+
+	if (flow->deficit <= 0) {
+		flow->deficit += fq->quantum;
+		list_move_tail(&flow->flowchain,
+			       &tin->old_flows);
+		goto begin;
+	}
+
+	skb = dequeue_func(fq, tin, flow);
+	if (!skb) {
+		/* force a pass through old_flows to prevent starvation */
+		if ((head == &tin->new_flows) &&
+		    !list_empty(&tin->old_flows)) {
+			list_move_tail(&flow->flowchain, &tin->old_flows);
+		} else {
+			list_del_init(&flow->flowchain);
+			flow->tin = NULL;
+		}
+		goto begin;
+	}
+
+	flow->deficit -= skb->len;
+	tin->tx_bytes += skb->len;
+	tin->tx_packets++;
+
+	return skb;
+}
+
+static struct fq_flow *fq_flow_classify(struct fq *fq,
+					struct fq_tin *tin,
+					struct sk_buff *skb,
+					fq_flow_get_default_t get_default_func)
+{
+	struct fq_flow *flow;
+	u32 hash;
+	u32 idx;
+
+	lockdep_assert_held(&fq->lock);
+
+	hash = skb_get_hash_perturb(skb, fq->perturbation);
+	idx = reciprocal_scale(hash, fq->flows_cnt);
+	flow = &fq->flows[idx];
+
+	if (flow->tin && flow->tin != tin) {
+		flow = get_default_func(fq, tin, idx, skb);
+		tin->collisions++;
+		fq->collisions++;
+	}
+
+	if (!flow->tin)
+		tin->flows++;
+
+	return flow;
+}
+
+static void fq_tin_enqueue(struct fq *fq,
+			   struct fq_tin *tin,
+			   struct sk_buff *skb,
+			   fq_skb_free_t free_func,
+			   fq_flow_get_default_t get_default_func)
+{
+	struct fq_flow *flow;
+	struct fq_flow *i;
+
+	lockdep_assert_held(&fq->lock);
+
+	flow = fq_flow_classify(fq, tin, skb, get_default_func);
+
+	flow->tin = tin;
+	flow->backlog += skb->len;
+	tin->backlog_bytes += skb->len;
+	tin->backlog_packets++;
+	fq->backlog++;
+
+	if (list_empty(&flow->backlogchain))
+		list_add_tail(&flow->backlogchain, &fq->backlogs);
+
+	i = flow;
+	list_for_each_entry_continue_reverse(i, &fq->backlogs,
+					     backlogchain)
+		if (i->backlog > flow->backlog)
+			break;
+
+	list_move(&flow->backlogchain, &i->backlogchain);
+
+	if (list_empty(&flow->flowchain)) {
+		flow->deficit = fq->quantum;
+		list_add_tail(&flow->flowchain,
+			      &tin->new_flows);
+	}
+
+	__skb_queue_tail(&flow->queue, skb);
+
+	if (fq->backlog > fq->limit) {
+		flow = list_first_entry_or_null(&fq->backlogs,
+						struct fq_flow,
+						backlogchain);
+		if (!flow)
+			return;
+
+		skb = fq_flow_dequeue(fq, flow);
+		if (!skb)
+			return;
+
+		free_func(fq, flow->tin, flow, skb);
+
+		flow->tin->overlimit++;
+		fq->overlimit++;
+	}
+}
+
+static void fq_flow_reset(struct fq *fq,
+			  struct fq_flow *flow,
+			  fq_skb_free_t free_func)
+{
+	struct sk_buff *skb;
+
+	while ((skb = fq_flow_dequeue(fq, flow)))
+		free_func(fq, flow->tin, flow, skb);
+
+	if (!list_empty(&flow->flowchain))
+		list_del_init(&flow->flowchain);
+
+	if (!list_empty(&flow->backlogchain))
+		list_del_init(&flow->backlogchain);
+
+	flow->tin = NULL;
+
+	WARN_ON_ONCE(flow->backlog);
+}
+
+static void fq_tin_reset(struct fq *fq,
+			 struct fq_tin *tin,
+			 fq_skb_free_t free_func)
+{
+	struct list_head *head;
+	struct fq_flow *flow;
+
+	for (;;) {
+		head = &tin->new_flows;
+		if (list_empty(head)) {
+			head = &tin->old_flows;
+			if (list_empty(head))
+				break;
+		}
+
+		flow = list_first_entry(head, struct fq_flow, flowchain);
+		fq_flow_reset(fq, flow, free_func);
+	}
+
+	WARN_ON_ONCE(tin->backlog_bytes);
+	WARN_ON_ONCE(tin->backlog_packets);
+}
+
+static void fq_flow_init(struct fq_flow *flow)
+{
+	INIT_LIST_HEAD(&flow->flowchain);
+	INIT_LIST_HEAD(&flow->backlogchain);
+	__skb_queue_head_init(&flow->queue);
+}
+
+static void fq_tin_init(struct fq_tin *tin)
+{
+	INIT_LIST_HEAD(&tin->new_flows);
+	INIT_LIST_HEAD(&tin->old_flows);
+}
+
+static int fq_init(struct fq *fq, int flows_cnt)
+{
+	int i;
+
+	memset(fq, 0, sizeof(fq[0]));
+	INIT_LIST_HEAD(&fq->backlogs);
+	spin_lock_init(&fq->lock);
+	fq->flows_cnt = max_t(u32, flows_cnt, 1);
+	fq->perturbation = prandom_u32();
+	fq->quantum = 300;
+	fq->limit = 8192;
+
+	fq->flows = kcalloc(fq->flows_cnt, sizeof(fq->flows[0]), GFP_KERNEL);
+	if (!fq->flows)
+		return -ENOMEM;
+
+	for (i = 0; i < fq->flows_cnt; i++)
+		fq_flow_init(&fq->flows[i]);
+
+	return 0;
+}
+
+static void fq_reset(struct fq *fq,
+		     fq_skb_free_t free_func)
+{
+	int i;
+
+	for (i = 0; i < fq->flows_cnt; i++)
+		fq_flow_reset(fq, &fq->flows[i], free_func);
+
+	kfree(fq->flows);
+	fq->flows = NULL;
+}
+
+#endif
-- 
2.1.4

^ permalink raw reply related

* Re: [RFC PATCH] gro: Partly revert "net: gro: allow to build full sized skb"
From: Eric Dumazet @ 2016-04-22 12:39 UTC (permalink / raw)
  To: Steffen Klassert; +Cc: Sowmini Varadhan, netdev
In-Reply-To: <20160422091328.GK3347@gauss.secunet.com>

On Fri, 2016-04-22 at 11:13 +0200, Steffen Klassert wrote:
> On Thu, Apr 21, 2016 at 05:59:06AM -0700, Eric Dumazet wrote:

> > Here at Google, we increased MAX_SKB_FRAGS, but this is a rather
> > intrusive change to be upstreamed :(
> 
> I've played here with MAX_SKB_FRAGS too, but it seems to
> be device specific how many page fragments it can handle.
> I wonder if we could increase MAX_SKB_FRAGS at the GRO
> layer and let GSO split this buffer in something that
> the transmitting device can handle?

Yes, the same principle would apply.

Split a GRO packet into X TSO packets with whatever number of frags.

bnx2x has to linearize some skbs having more than 13 frags.

^ permalink raw reply

* Re: [PATCH net v2 1/3] drivers: net: cpsw: fix parsing of phy-handle DT property in dual_emac config
From: Grygorii Strashko @ 2016-04-22 13:03 UTC (permalink / raw)
  To: David Rivshin (Allworx), netdev, linux-omap
  Cc: linux-arm-kernel, devicetree, linux-kernel, David Miller,
	Mugunthan V N, Andrew Goodbody, Markus Brunner, Nicolas Chauvet
In-Reply-To: <1461262794-9562-1-git-send-email-drivshin.allworx@gmail.com>

On 04/21/2016 09:19 PM, David Rivshin (Allworx) wrote:
> From: David Rivshin <drivshin@allworx.com>
>
> Commit 9e42f715264ff158478fa30eaed847f6e131366b ("drivers: net: cpsw: add
> phy-handle parsing") saved the "phy-handle" phandle into a new cpsw_priv
> field. However, phy connections are per-slave, so the phy_node field should
> be in cpsw_slave_data rather than cpsw_priv.
>
> This would go unnoticed in a single emac configuration. But in dual_emac
> mode, the last "phy-handle" property parsed for either slave would be used
> by both of them, causing them both to refer to the same phy_device.
>
> Fixes: 9e42f715264f ("drivers: net: cpsw: add phy-handle parsing")
> Signed-off-by: David Rivshin <drivshin@allworx.com>
> Tested-by: Nicolas Chauvet <kwizart@gmail.com>
> ---
> I would suggest this for -stable. It should apply cleanly as far back
> as 4.4.
>
> Changes since v1 [1]:
> - Rebased (no conflicts)
> - Added Tested-by from Nicolas Chauvet
>
> [1] https://patchwork.ozlabs.org/patch/560326/

Reviewed-by: Grygorii Strashko <grygorii.strashko@ti.com>

>
>   drivers/net/ethernet/ti/cpsw.c | 13 ++++++-------
>   drivers/net/ethernet/ti/cpsw.h |  1 +
>   2 files changed, 7 insertions(+), 7 deletions(-)
>
> diff --git a/drivers/net/ethernet/ti/cpsw.c b/drivers/net/ethernet/ti/cpsw.c
> index 42fdfd4..d69cb3f 100644
> --- a/drivers/net/ethernet/ti/cpsw.c
> +++ b/drivers/net/ethernet/ti/cpsw.c
> @@ -363,15 +363,14 @@ static inline void slave_write(struct cpsw_slave *slave, u32 val, u32 offset)
>   	__raw_writel(val, slave->regs + offset);
>   }
>
>   struct cpsw_priv {
>   	spinlock_t			lock;
>   	struct platform_device		*pdev;
>   	struct net_device		*ndev;
> -	struct device_node		*phy_node;
>   	struct napi_struct		napi_rx;
>   	struct napi_struct		napi_tx;
>   	struct device			*dev;
>   	struct cpsw_platform_data	data;
>   	struct cpsw_ss_regs __iomem	*regs;
>   	struct cpsw_wr_regs __iomem	*wr_regs;
>   	u8 __iomem			*hw_stats;
> @@ -1144,16 +1143,16 @@ static void cpsw_slave_open(struct cpsw_slave *slave, struct cpsw_priv *priv)
>
>   	if (priv->data.dual_emac)
>   		cpsw_add_dual_emac_def_ale_entries(priv, slave, slave_port);
>   	else
>   		cpsw_ale_add_mcast(priv->ale, priv->ndev->broadcast,
>   				   1 << slave_port, 0, 0, ALE_MCAST_FWD_2);
>
> -	if (priv->phy_node)
> -		slave->phy = of_phy_connect(priv->ndev, priv->phy_node,
> +	if (slave->data->phy_node)
> +		slave->phy = of_phy_connect(priv->ndev, slave->data->phy_node,
>   				 &cpsw_adjust_link, 0, slave->data->phy_if);
>   	else
>   		slave->phy = phy_connect(priv->ndev, slave->data->phy_id,
>   				 &cpsw_adjust_link, slave->data->phy_if);
>   	if (IS_ERR(slave->phy)) {
>   		dev_err(priv->dev, "phy %s not found on slave %d\n",
>   			slave->data->phy_id, slave->slave_num);
> @@ -1936,20 +1935,19 @@ static void cpsw_slave_init(struct cpsw_slave *slave, struct cpsw_priv *priv,
>
>   	slave->data	= data;
>   	slave->regs	= regs + slave_reg_ofs;
>   	slave->sliver	= regs + sliver_reg_ofs;
>   	slave->port_vlan = data->dual_emac_res_vlan;
>   }
>
> -static int cpsw_probe_dt(struct cpsw_priv *priv,
> +static int cpsw_probe_dt(struct cpsw_platform_data *data,
>   			 struct platform_device *pdev)
>   {
>   	struct device_node *node = pdev->dev.of_node;
>   	struct device_node *slave_node;
> -	struct cpsw_platform_data *data = &priv->data;
>   	int i = 0, ret;
>   	u32 prop;
>
>   	if (!node)
>   		return -EINVAL;
>
>   	if (of_property_read_u32(node, "slaves", &prop)) {
> @@ -2029,15 +2027,16 @@ static int cpsw_probe_dt(struct cpsw_priv *priv,
>   		int lenp;
>   		const __be32 *parp;
>
>   		/* This is no slave child node, continue */
>   		if (strcmp(slave_node->name, "slave"))
>   			continue;
>
> -		priv->phy_node = of_parse_phandle(slave_node, "phy-handle", 0);
> +		slave_data->phy_node = of_parse_phandle(slave_node,
> +							"phy-handle", 0);
>   		parp = of_get_property(slave_node, "phy_id", &lenp);
>   		if (of_phy_is_fixed_link(slave_node)) {
>   			struct device_node *phy_node;
>   			struct phy_device *phy_dev;
>
>   			/* In the case of a fixed PHY, the DT node associated
>   			 * to the PHY is the Ethernet MAC DT node.
> @@ -2271,15 +2270,15 @@ static int cpsw_probe(struct platform_device *pdev)
>   	 * This may be required here for child devices.
>   	 */
>   	pm_runtime_enable(&pdev->dev);
>
>   	/* Select default pin state */
>   	pinctrl_pm_select_default_state(&pdev->dev);
>
> -	if (cpsw_probe_dt(priv, pdev)) {
> +	if (cpsw_probe_dt(&priv->data, pdev)) {
>   		dev_err(&pdev->dev, "cpsw: platform data missing\n");
>   		ret = -ENODEV;
>   		goto clean_runtime_disable_ret;
>   	}
>   	data = &priv->data;
>
>   	if (is_valid_ether_addr(data->slave_data[0].mac_addr)) {
> diff --git a/drivers/net/ethernet/ti/cpsw.h b/drivers/net/ethernet/ti/cpsw.h
> index 442a703..e50afd1 100644
> --- a/drivers/net/ethernet/ti/cpsw.h
> +++ b/drivers/net/ethernet/ti/cpsw.h
> @@ -14,14 +14,15 @@
>   #ifndef __CPSW_H__
>   #define __CPSW_H__
>
>   #include <linux/if_ether.h>
>   #include <linux/phy.h>
>
>   struct cpsw_slave_data {
> +	struct device_node *phy_node;
>   	char		phy_id[MII_BUS_ID_SIZE];
>   	int		phy_if;
>   	u8		mac_addr[ETH_ALEN];
>   	u16		dual_emac_res_vlan;	/* Reserved VLAN for DualEMAC */
>   };
>
>   struct cpsw_platform_data {
>


-- 
regards,
-grygorii

^ permalink raw reply

* Re: [PATCH net v2 2/3] drivers: net: cpsw: fix error messages when using phy-handle DT property
From: Grygorii Strashko @ 2016-04-22 13:03 UTC (permalink / raw)
  To: David Rivshin (Allworx), netdev, linux-omap, Rob Herring
  Cc: Markus Brunner, devicetree, Mugunthan V N, Nicolas Chauvet,
	linux-kernel, Andrew Goodbody, David Miller, linux-arm-kernel
In-Reply-To: <1461263172-10428-1-git-send-email-drivshin.allworx@gmail.com>

On 04/21/2016 09:26 PM, David Rivshin (Allworx) wrote:
> From: David Rivshin <drivshin@allworx.com>
> 
> The phy-handle, phy_id, and fixed-link properties are mutually exclusive,
> and only one need be specified. However if phy-handle was specified, an
> error message would complain about the lack of phy_id or fixed-link.
> 
> Also, if phy-handle was specified and the subsequent of_phy_connect()
> failed, the error message still referenced slaved->data->phy_id, which
> would be empty. Instead, use the name of the device_node as a useful
> identifier.
> 
> Fixes: 9e42f715264f ("drivers: net: cpsw: add phy-handle parsing")
> Signed-off-by: David Rivshin <drivshin@allworx.com>
> Acked-by: Rob Herring <robh@kernel.org>
> Tested-by: Nicolas Chauvet <kwizart@gmail.com>
> ---
> If would like this for -stable it should apply cleanly as far back
> as 4.5. It failes on 4.4 due to some context differences, but can be
> applied with 'git am -C2'. Or, I can produce a separate patch against
> linux-4.4.y if preferred.
> 
> Changes since v1 [1]:
> - Rebased (no conflicts)
> - Added Tested-by from Nicolas Chauvet
> - Added Acked-by from Rob Herring for the binding change
> 
> [1] https://patchwork.ozlabs.org/patch/560324/
> 
>   Documentation/devicetree/bindings/net/cpsw.txt |  4 ++--
>   drivers/net/ethernet/ti/cpsw.c                 | 17 +++++++++++++----
>   2 files changed, 15 insertions(+), 6 deletions(-)
> 
> diff --git a/Documentation/devicetree/bindings/net/cpsw.txt b/Documentation/devicetree/bindings/net/cpsw.txt
> index 28a4781..3033c0f 100644
> --- a/Documentation/devicetree/bindings/net/cpsw.txt
> +++ b/Documentation/devicetree/bindings/net/cpsw.txt
> @@ -46,16 +46,16 @@ Optional properties:
>   - dual_emac_res_vlan	: Specifies VID to be used to segregate the ports
>   - mac-address		: See ethernet.txt file in the same directory
>   - phy_id		: Specifies slave phy id

May be the "phy_id" can be marked as deprecated? (while here)
The recommended property now is "phy-handle".

>   - phy-handle		: See ethernet.txt file in the same directory
>   
>   Slave sub-nodes:
>   - fixed-link		: See fixed-link.txt file in the same directory
> -			  Either the property phy_id, or the sub-node
> -			  fixed-link can be specified
> +
> +Note: Exactly one of phy_id, phy-handle, or fixed-link must be specified.
>   
>   Note: "ti,hwmods" field is used to fetch the base address and irq
>   resources from TI, omap hwmod data base during device registration.
>   Future plan is to migrate hwmod data base contents into device tree
>   blob so that, all the required data will be used from device tree dts
>   file.
>   
> diff --git a/drivers/net/ethernet/ti/cpsw.c b/drivers/net/ethernet/ti/cpsw.c
> index d69cb3f..3c81413 100644
> --- a/drivers/net/ethernet/ti/cpsw.c
> +++ b/drivers/net/ethernet/ti/cpsw.c
> @@ -1150,16 +1150,19 @@ static void cpsw_slave_open(struct cpsw_slave *slave, struct cpsw_priv *priv)
>   	if (slave->data->phy_node)
>   		slave->phy = of_phy_connect(priv->ndev, slave->data->phy_node,
>   				 &cpsw_adjust_link, 0, slave->data->phy_if);
>   	else
>   		slave->phy = phy_connect(priv->ndev, slave->data->phy_id,
>   				 &cpsw_adjust_link, slave->data->phy_if);
>   	if (IS_ERR(slave->phy)) {
> -		dev_err(priv->dev, "phy %s not found on slave %d\n",
> -			slave->data->phy_id, slave->slave_num);
> +		dev_err(priv->dev, "phy \"%s\" not found on slave %d\n",
> +			slave->data->phy_node ?
> +				slave->data->phy_node->full_name :
> +				slave->data->phy_id,
> +			slave->slave_num);

Unfortunately,  there are some inconsistency between legacy and FDT API :(
of_phy_connect() will return valid phy_device or NULL, but phy_connect()
can return valid phy_device or ERR_PTR().



>   		slave->phy = NULL;
>   	} else {
>   		phy_attached_info(slave->phy);
>   
>   		phy_start(slave->phy);
>   
>   		/* Configure GMII_SEL register */
> @@ -2030,15 +2033,19 @@ static int cpsw_probe_dt(struct cpsw_platform_data *data,
>   		/* This is no slave child node, continue */
>   		if (strcmp(slave_node->name, "slave"))
>   			continue;
>   
>   		slave_data->phy_node = of_parse_phandle(slave_node,
>   							"phy-handle", 0);
>   		parp = of_get_property(slave_node, "phy_id", &lenp);
> -		if (of_phy_is_fixed_link(slave_node)) {
> +		if (slave_data->phy_node) {
> +			dev_dbg(&pdev->dev,
> +				"slave[%d] using phy-handle=\"%s\"\n",
> +				i, slave_data->phy_node->full_name);
> +		} else if (of_phy_is_fixed_link(slave_node)) {
>   			struct device_node *phy_node;
>   			struct phy_device *phy_dev;
>   
>   			/* In the case of a fixed PHY, the DT node associated
>   			 * to the PHY is the Ethernet MAC DT node.
>   			 */
>   			ret = of_phy_register_fixed_link(slave_node);
> @@ -2067,15 +2074,17 @@ static int cpsw_probe_dt(struct cpsw_platform_data *data,
>   			if (!mdio) {
>   				dev_err(&pdev->dev, "Missing mdio platform device\n");
>   				return -EINVAL;
>   			}
>   			snprintf(slave_data->phy_id, sizeof(slave_data->phy_id),
>   				 PHY_ID_FMT, mdio->name, phyid);
>   		} else {
> -			dev_err(&pdev->dev, "No slave[%d] phy_id or fixed-link property\n", i);
> +			dev_err(&pdev->dev,
> +				"No slave[%d] phy_id, phy-handle, or fixed-link property\n",
> +				i);
>   			goto no_phy_slave;
>   		}
>   		slave_data->phy_if = of_get_phy_mode(slave_node);
>   		if (slave_data->phy_if < 0) {
>   			dev_err(&pdev->dev, "Missing or malformed slave[%d] phy-mode property\n",
>   				i);
>   			return slave_data->phy_if;
> 


-- 
regards,
-grygorii

^ permalink raw reply

* Re: [v8, 1/7] Documentation: DT: update Freescale DCFG compatible
From: Mark Rutland @ 2016-04-22 13:11 UTC (permalink / raw)
  To: Yangbo Lu
  Cc: linux-mmc, linuxppc-dev, devicetree, linux-arm-kernel,
	linux-kernel, linux-clk, linux-i2c, iommu, netdev, ulf.hansson,
	scott.wood, Rob Herring, Russell King, Jochen Friedrich,
	Joerg Roedel, Claudiu Manoil, Bhupesh Sharma, Zhao Qiang,
	Kumar Gala, Santosh Shilimkar, leoyang.li, xiaobo.xie
In-Reply-To: <1461306464-19521-2-git-send-email-yangbo.lu@nxp.com>

On Fri, Apr 22, 2016 at 02:27:38PM +0800, Yangbo Lu wrote:
> Update Freescale DCFG compatible with 'fsl,<chip>-dcfg' instead
> of 'fsl,ls1021a-dcfg' to include more chips.
> 
> Signed-off-by: Yangbo Lu <yangbo.lu@nxp.com>
> ---
> Changes for v8:
> 	- Added this patch
> ---
>  Documentation/devicetree/bindings/arm/fsl.txt | 2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)
> 
> diff --git a/Documentation/devicetree/bindings/arm/fsl.txt b/Documentation/devicetree/bindings/arm/fsl.txt
> index 752a685..1d5f512 100644
> --- a/Documentation/devicetree/bindings/arm/fsl.txt
> +++ b/Documentation/devicetree/bindings/arm/fsl.txt
> @@ -119,7 +119,7 @@ Freescale DCFG
>  configuration and status for the device. Such as setting the secondary
>  core start address and release the secondary core from holdoff and startup.
>    Required properties:
> -  - compatible: should be "fsl,ls1021a-dcfg"
> +  - compatible: should be "fsl,<chip>-dcfg"

Please list specific values expected for <chip>, while jusy saying
<chip> may be more generic, it makes it practically impossible to search
for the correct binding given a compatible string, and it's vague as to
exaclty what <chip> should be.

Thanks,
Mark.



>    - reg : should contain base address and length of DCFG memory-mapped registers
>  
>  Example:
> -- 
> 2.1.0.27.g96db324
> 
> --
> To unsubscribe from this list: send the line "unsubscribe devicetree" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> 

^ permalink raw reply

* Re: [PATCH net v2 3/3] drivers: net: cpsw: use of_phy_connect() in fixed-link case
From: Grygorii Strashko @ 2016-04-22 13:34 UTC (permalink / raw)
  To: David Rivshin (Allworx), netdev-u79uwXL29TY76Z2rM5mHXA,
	linux-omap-u79uwXL29TY76Z2rM5mHXA
  Cc: linux-arm-kernel-IAPFreCvJWM7uuMidbF8XUB+6BGkLq7r,
	devicetree-u79uwXL29TY76Z2rM5mHXA,
	linux-kernel-u79uwXL29TY76Z2rM5mHXA, David Miller, Mugunthan V N,
	Andrew Goodbody, Markus Brunner, Nicolas Chauvet
In-Reply-To: <1461264066-12358-1-git-send-email-drivshin.allworx-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org>

On 04/21/2016 09:41 PM, David Rivshin (Allworx) wrote:
> From: David Rivshin <drivshin-5fOYsn7Fw8lBDgjK7y7TUQ@public.gmane.org>
>
> If a fixed-link DT subnode is used, the phy_device was looked up so
> that a PHY ID string could be constructed and passed to phy_connect().
> This is not necessary, as the device_node can be passed directly to
> of_phy_connect() instead. This reuses the same codepath as if the
> phy-handle DT property was used.
>
> Signed-off-by: David Rivshin <drivshin-5fOYsn7Fw8lBDgjK7y7TUQ@public.gmane.org>
> Tested-by: Nicolas Chauvet <kwizart-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org>
> ---
>
> Changes since v1 [1]:
> - Rebased (trivial conflict, e5a03bfd modified the deleted snprintf)
> - Added Tested-by from Nicolas Chauvet
>
> [1] https://patchwork.ozlabs.org/patch/560327/
>

Reviewed-by: Grygorii Strashko <grygorii.strashko-l0cyMroinI0@public.gmane.org>

>
>   drivers/net/ethernet/ti/cpsw.c | 11 +----------
>   1 file changed, 1 insertion(+), 10 deletions(-)
>
> diff --git a/drivers/net/ethernet/ti/cpsw.c b/drivers/net/ethernet/ti/cpsw.c
> index 3c81413..245f919 100644
> --- a/drivers/net/ethernet/ti/cpsw.c
> +++ b/drivers/net/ethernet/ti/cpsw.c
> @@ -2038,30 +2038,21 @@ static int cpsw_probe_dt(struct cpsw_platform_data *data,
>   							"phy-handle", 0);
>   		parp = of_get_property(slave_node, "phy_id", &lenp);
>   		if (slave_data->phy_node) {
>   			dev_dbg(&pdev->dev,
>   				"slave[%d] using phy-handle=\"%s\"\n",
>   				i, slave_data->phy_node->full_name);
>   		} else if (of_phy_is_fixed_link(slave_node)) {
> -			struct device_node *phy_node;
> -			struct phy_device *phy_dev;
> -
>   			/* In the case of a fixed PHY, the DT node associated
>   			 * to the PHY is the Ethernet MAC DT node.
>   			 */
>   			ret = of_phy_register_fixed_link(slave_node);
>   			if (ret)
>   				return ret;
> -			phy_node = of_node_get(slave_node);
> -			phy_dev = of_phy_find_device(phy_node);
> -			if (!phy_dev)
> -				return -ENODEV;
> -			snprintf(slave_data->phy_id, sizeof(slave_data->phy_id),
> -				 PHY_ID_FMT, phy_dev->mdio.bus->id,
> -				 phy_dev->mdio.addr);
> +			slave_data->phy_node = of_node_get(slave_node);
>   		} else if (parp) {
>   			u32 phyid;
>   			struct device_node *mdio_node;
>   			struct platform_device *mdio;
>
>   			if (lenp != (sizeof(__be32) * 2)) {
>   				dev_err(&pdev->dev, "Invalid slave[%d] phy_id property\n", i);
>


-- 
regards,
-grygorii
--
To unsubscribe from this list: send the line "unsubscribe devicetree" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply

* [PATCH 06/23] netfilter: x_tables: add compat version of xt_check_entry_offsets
From: Pablo Neira Ayuso @ 2016-04-22 13:39 UTC (permalink / raw)
  To: netfilter-devel; +Cc: davem, netdev
In-Reply-To: <1461332394-3994-1-git-send-email-pablo@netfilter.org>

From: Florian Westphal <fw@strlen.de>

32bit rulesets have different layout and alignment requirements, so once
more integrity checks get added to xt_check_entry_offsets it will reject
well-formed 32bit rulesets.

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/linux/netfilter/x_tables.h |  3 +++
 net/ipv4/netfilter/arp_tables.c    |  3 ++-
 net/ipv4/netfilter/ip_tables.c     |  3 ++-
 net/ipv6/netfilter/ip6_tables.c    |  3 ++-
 net/netfilter/x_tables.c           | 22 ++++++++++++++++++++++
 5 files changed, 31 insertions(+), 3 deletions(-)

diff --git a/include/linux/netfilter/x_tables.h b/include/linux/netfilter/x_tables.h
index 0de0862..08de48b 100644
--- a/include/linux/netfilter/x_tables.h
+++ b/include/linux/netfilter/x_tables.h
@@ -494,6 +494,9 @@ void xt_compat_target_from_user(struct xt_entry_target *t, void **dstptr,
 				unsigned int *size);
 int xt_compat_target_to_user(const struct xt_entry_target *t,
 			     void __user **dstptr, unsigned int *size);
+int xt_compat_check_entry_offsets(const void *base,
+				  unsigned int target_offset,
+				  unsigned int next_offset);
 
 #endif /* CONFIG_COMPAT */
 #endif /* _X_TABLES_H */
diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c
index 24ad92a..ab8952a 100644
--- a/net/ipv4/netfilter/arp_tables.c
+++ b/net/ipv4/netfilter/arp_tables.c
@@ -1254,7 +1254,8 @@ check_compat_entry_size_and_hooks(struct compat_arpt_entry *e,
 	if (!arp_checkentry(&e->arp))
 		return -EINVAL;
 
-	ret = xt_check_entry_offsets(e, e->target_offset, e->next_offset);
+	ret = xt_compat_check_entry_offsets(e, e->target_offset,
+					    e->next_offset);
 	if (ret)
 		return ret;
 
diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c
index cdf1850..7d24c87 100644
--- a/net/ipv4/netfilter/ip_tables.c
+++ b/net/ipv4/netfilter/ip_tables.c
@@ -1513,7 +1513,8 @@ check_compat_entry_size_and_hooks(struct compat_ipt_entry *e,
 	if (!ip_checkentry(&e->ip))
 		return -EINVAL;
 
-	ret = xt_check_entry_offsets(e, e->target_offset, e->next_offset);
+	ret = xt_compat_check_entry_offsets(e,
+					    e->target_offset, e->next_offset);
 	if (ret)
 		return ret;
 
diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c
index e378311..73eee7b 100644
--- a/net/ipv6/netfilter/ip6_tables.c
+++ b/net/ipv6/netfilter/ip6_tables.c
@@ -1525,7 +1525,8 @@ check_compat_entry_size_and_hooks(struct compat_ip6t_entry *e,
 	if (!ip6_checkentry(&e->ipv6))
 		return -EINVAL;
 
-	ret = xt_check_entry_offsets(e, e->target_offset, e->next_offset);
+	ret = xt_compat_check_entry_offsets(e,
+					    e->target_offset, e->next_offset);
 	if (ret)
 		return ret;
 
diff --git a/net/netfilter/x_tables.c b/net/netfilter/x_tables.c
index ec1b718..fa206ce 100644
--- a/net/netfilter/x_tables.c
+++ b/net/netfilter/x_tables.c
@@ -539,6 +539,27 @@ int xt_compat_match_to_user(const struct xt_entry_match *m,
 	return 0;
 }
 EXPORT_SYMBOL_GPL(xt_compat_match_to_user);
+
+int xt_compat_check_entry_offsets(const void *base,
+				  unsigned int target_offset,
+				  unsigned int next_offset)
+{
+	const struct compat_xt_entry_target *t;
+	const char *e = base;
+
+	if (target_offset + sizeof(*t) > next_offset)
+		return -EINVAL;
+
+	t = (void *)(e + target_offset);
+	if (t->u.target_size < sizeof(*t))
+		return -EINVAL;
+
+	if (target_offset + t->u.target_size > next_offset)
+		return -EINVAL;
+
+	return 0;
+}
+EXPORT_SYMBOL(xt_compat_check_entry_offsets);
 #endif /* CONFIG_COMPAT */
 
 /**
@@ -549,6 +570,7 @@ EXPORT_SYMBOL_GPL(xt_compat_match_to_user);
  * @next_offset: the arp/ip/ip6_t->next_offset
  *
  * validates that target_offset and next_offset are sane.
+ * Also see xt_compat_check_entry_offsets for CONFIG_COMPAT version.
  *
  * The arp/ip/ip6t_entry structure @base must have passed following tests:
  * - it must point to a valid memory location
-- 
2.1.4


^ permalink raw reply related

* [PATCH 07/23] netfilter: x_tables: check standard target size too
From: Pablo Neira Ayuso @ 2016-04-22 13:39 UTC (permalink / raw)
  To: netfilter-devel; +Cc: davem, netdev
In-Reply-To: <1461332394-3994-1-git-send-email-pablo@netfilter.org>

From: Florian Westphal <fw@strlen.de>

We have targets and standard targets -- the latter carries a verdict.

The ip/ip6tables validation functions will access t->verdict for the
standard targets to fetch the jump offset or verdict for chainloop
detection, but this happens before the targets get checked/validated.

Thus we also need to check for verdict presence here, else t->verdict
can point right after a blob.

Spotted with UBSAN while testing malformed blobs.

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/x_tables.c | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

diff --git a/net/netfilter/x_tables.c b/net/netfilter/x_tables.c
index fa206ce..1cb7a27 100644
--- a/net/netfilter/x_tables.c
+++ b/net/netfilter/x_tables.c
@@ -540,6 +540,13 @@ int xt_compat_match_to_user(const struct xt_entry_match *m,
 }
 EXPORT_SYMBOL_GPL(xt_compat_match_to_user);
 
+/* non-compat version may have padding after verdict */
+struct compat_xt_standard_target {
+	struct compat_xt_entry_target t;
+	compat_uint_t verdict;
+};
+
+/* see xt_check_entry_offsets */
 int xt_compat_check_entry_offsets(const void *base,
 				  unsigned int target_offset,
 				  unsigned int next_offset)
@@ -557,6 +564,10 @@ int xt_compat_check_entry_offsets(const void *base,
 	if (target_offset + t->u.target_size > next_offset)
 		return -EINVAL;
 
+	if (strcmp(t->u.user.name, XT_STANDARD_TARGET) == 0 &&
+	    target_offset + sizeof(struct compat_xt_standard_target) != next_offset)
+		return -EINVAL;
+
 	return 0;
 }
 EXPORT_SYMBOL(xt_compat_check_entry_offsets);
@@ -596,6 +607,10 @@ int xt_check_entry_offsets(const void *base,
 	if (target_offset + t->u.target_size > next_offset)
 		return -EINVAL;
 
+	if (strcmp(t->u.user.name, XT_STANDARD_TARGET) == 0 &&
+	    target_offset + sizeof(struct xt_standard_target) != next_offset)
+		return -EINVAL;
+
 	return 0;
 }
 EXPORT_SYMBOL(xt_check_entry_offsets);
-- 
2.1.4


^ permalink raw reply related

* [PATCH 09/23] netfilter: x_tables: validate all offsets and sizes in a rule
From: Pablo Neira Ayuso @ 2016-04-22 13:39 UTC (permalink / raw)
  To: netfilter-devel; +Cc: davem, netdev
In-Reply-To: <1461332394-3994-1-git-send-email-pablo@netfilter.org>

From: Florian Westphal <fw@strlen.de>

Validate that all matches (if any) add up to the beginning of
the target and that each match covers at least the base structure size.

The compat path should be able to safely re-use the function
as the structures only differ in alignment; added a
BUILD_BUG_ON just in case we have an arch that adds padding as well.

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/x_tables.c | 81 +++++++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 76 insertions(+), 5 deletions(-)

diff --git a/net/netfilter/x_tables.c b/net/netfilter/x_tables.c
index e2a6f2a..f9aa971 100644
--- a/net/netfilter/x_tables.c
+++ b/net/netfilter/x_tables.c
@@ -416,6 +416,47 @@ int xt_check_match(struct xt_mtchk_param *par,
 }
 EXPORT_SYMBOL_GPL(xt_check_match);
 
+/** xt_check_entry_match - check that matches end before start of target
+ *
+ * @match: beginning of xt_entry_match
+ * @target: beginning of this rules target (alleged end of matches)
+ * @alignment: alignment requirement of match structures
+ *
+ * Validates that all matches add up to the beginning of the target,
+ * and that each match covers at least the base structure size.
+ *
+ * Return: 0 on success, negative errno on failure.
+ */
+static int xt_check_entry_match(const char *match, const char *target,
+				const size_t alignment)
+{
+	const struct xt_entry_match *pos;
+	int length = target - match;
+
+	if (length == 0) /* no matches */
+		return 0;
+
+	pos = (struct xt_entry_match *)match;
+	do {
+		if ((unsigned long)pos % alignment)
+			return -EINVAL;
+
+		if (length < (int)sizeof(struct xt_entry_match))
+			return -EINVAL;
+
+		if (pos->u.match_size < sizeof(struct xt_entry_match))
+			return -EINVAL;
+
+		if (pos->u.match_size > length)
+			return -EINVAL;
+
+		length -= pos->u.match_size;
+		pos = ((void *)((char *)(pos) + (pos)->u.match_size));
+	} while (length > 0);
+
+	return 0;
+}
+
 #ifdef CONFIG_COMPAT
 int xt_compat_add_offset(u_int8_t af, unsigned int offset, int delta)
 {
@@ -571,7 +612,14 @@ int xt_compat_check_entry_offsets(const void *base, const char *elems,
 	    target_offset + sizeof(struct compat_xt_standard_target) != next_offset)
 		return -EINVAL;
 
-	return 0;
+	/* compat_xt_entry match has less strict aligment requirements,
+	 * otherwise they are identical.  In case of padding differences
+	 * we need to add compat version of xt_check_entry_match.
+	 */
+	BUILD_BUG_ON(sizeof(struct compat_xt_entry_match) != sizeof(struct xt_entry_match));
+
+	return xt_check_entry_match(elems, base + target_offset,
+				    __alignof__(struct compat_xt_entry_match));
 }
 EXPORT_SYMBOL(xt_compat_check_entry_offsets);
 #endif /* CONFIG_COMPAT */
@@ -584,17 +632,39 @@ EXPORT_SYMBOL(xt_compat_check_entry_offsets);
  * @target_offset: the arp/ip/ip6_t->target_offset
  * @next_offset: the arp/ip/ip6_t->next_offset
  *
- * validates that target_offset and next_offset are sane.
- * Also see xt_compat_check_entry_offsets for CONFIG_COMPAT version.
+ * validates that target_offset and next_offset are sane and that all
+ * match sizes (if any) align with the target offset.
  *
  * This function does not validate the targets or matches themselves, it
- * only tests that all the offsets and sizes are correct.
+ * only tests that all the offsets and sizes are correct, that all
+ * match structures are aligned, and that the last structure ends where
+ * the target structure begins.
+ *
+ * Also see xt_compat_check_entry_offsets for CONFIG_COMPAT version.
  *
  * The arp/ip/ip6t_entry structure @base must have passed following tests:
  * - it must point to a valid memory location
  * - base to base + next_offset must be accessible, i.e. not exceed allocated
  *   length.
  *
+ * A well-formed entry looks like this:
+ *
+ * ip(6)t_entry   match [mtdata]  match [mtdata] target [tgdata] ip(6)t_entry
+ * e->elems[]-----'                              |               |
+ *                matchsize                      |               |
+ *                                matchsize      |               |
+ *                                               |               |
+ * target_offset---------------------------------'               |
+ * next_offset---------------------------------------------------'
+ *
+ * elems[]: flexible array member at end of ip(6)/arpt_entry struct.
+ *          This is where matches (if any) and the target reside.
+ * target_offset: beginning of target.
+ * next_offset: start of the next rule; also: size of this rule.
+ * Since targets have a minimum size, target_offset + minlen <= next_offset.
+ *
+ * Every match stores its size, sum of sizes must not exceed target_offset.
+ *
  * Return: 0 on success, negative errno on failure.
  */
 int xt_check_entry_offsets(const void *base,
@@ -624,7 +694,8 @@ int xt_check_entry_offsets(const void *base,
 	    target_offset + sizeof(struct xt_standard_target) != next_offset)
 		return -EINVAL;
 
-	return 0;
+	return xt_check_entry_match(elems, base + target_offset,
+				    __alignof__(struct xt_entry_match));
 }
 EXPORT_SYMBOL(xt_check_entry_offsets);
 
-- 
2.1.4


^ permalink raw reply related

* [PATCH 11/23] netfilter: ip6_tables: simplify translate_compat_table args
From: Pablo Neira Ayuso @ 2016-04-22 13:39 UTC (permalink / raw)
  To: netfilter-devel; +Cc: davem, netdev
In-Reply-To: <1461332394-3994-1-git-send-email-pablo@netfilter.org>

From: Florian Westphal <fw@strlen.de>

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/ipv6/netfilter/ip6_tables.c | 59 +++++++++++++++++------------------------
 1 file changed, 24 insertions(+), 35 deletions(-)

diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c
index 6957627..8d082c55 100644
--- a/net/ipv6/netfilter/ip6_tables.c
+++ b/net/ipv6/netfilter/ip6_tables.c
@@ -1461,7 +1461,6 @@ compat_copy_entry_to_user(struct ip6t_entry *e, void __user **dstptr,
 
 static int
 compat_find_calc_match(struct xt_entry_match *m,
-		       const char *name,
 		       const struct ip6t_ip6 *ipv6,
 		       int *size)
 {
@@ -1498,8 +1497,7 @@ check_compat_entry_size_and_hooks(struct compat_ip6t_entry *e,
 				  const unsigned char *base,
 				  const unsigned char *limit,
 				  const unsigned int *hook_entries,
-				  const unsigned int *underflows,
-				  const char *name)
+				  const unsigned int *underflows)
 {
 	struct xt_entry_match *ematch;
 	struct xt_entry_target *t;
@@ -1535,7 +1533,7 @@ check_compat_entry_size_and_hooks(struct compat_ip6t_entry *e,
 	entry_offset = (void *)e - (void *)base;
 	j = 0;
 	xt_ematch_foreach(ematch, e) {
-		ret = compat_find_calc_match(ematch, name, &e->ipv6, &off);
+		ret = compat_find_calc_match(ematch, &e->ipv6, &off);
 		if (ret != 0)
 			goto release_matches;
 		++j;
@@ -1584,7 +1582,7 @@ release_matches:
 
 static int
 compat_copy_entry_from_user(struct compat_ip6t_entry *e, void **dstptr,
-			    unsigned int *size, const char *name,
+			    unsigned int *size,
 			    struct xt_table_info *newinfo, unsigned char *base)
 {
 	struct xt_entry_target *t;
@@ -1664,14 +1662,9 @@ static int compat_check_entry(struct ip6t_entry *e, struct net *net,
 
 static int
 translate_compat_table(struct net *net,
-		       const char *name,
-		       unsigned int valid_hooks,
 		       struct xt_table_info **pinfo,
 		       void **pentry0,
-		       unsigned int total_size,
-		       unsigned int number,
-		       unsigned int *hook_entries,
-		       unsigned int *underflows)
+		       const struct compat_ip6t_replace *compatr)
 {
 	unsigned int i, j;
 	struct xt_table_info *newinfo, *info;
@@ -1683,8 +1676,8 @@ translate_compat_table(struct net *net,
 
 	info = *pinfo;
 	entry0 = *pentry0;
-	size = total_size;
-	info->number = number;
+	size = compatr->size;
+	info->number = compatr->num_entries;
 
 	/* Init all hooks to impossible value. */
 	for (i = 0; i < NF_INET_NUMHOOKS; i++) {
@@ -1695,40 +1688,39 @@ translate_compat_table(struct net *net,
 	duprintf("translate_compat_table: size %u\n", info->size);
 	j = 0;
 	xt_compat_lock(AF_INET6);
-	xt_compat_init_offsets(AF_INET6, number);
+	xt_compat_init_offsets(AF_INET6, compatr->num_entries);
 	/* Walk through entries, checking offsets. */
-	xt_entry_foreach(iter0, entry0, total_size) {
+	xt_entry_foreach(iter0, entry0, compatr->size) {
 		ret = check_compat_entry_size_and_hooks(iter0, info, &size,
 							entry0,
-							entry0 + total_size,
-							hook_entries,
-							underflows,
-							name);
+							entry0 + compatr->size,
+							compatr->hook_entry,
+							compatr->underflow);
 		if (ret != 0)
 			goto out_unlock;
 		++j;
 	}
 
 	ret = -EINVAL;
-	if (j != number) {
+	if (j != compatr->num_entries) {
 		duprintf("translate_compat_table: %u not %u entries\n",
-			 j, number);
+			 j, compatr->num_entries);
 		goto out_unlock;
 	}
 
 	/* Check hooks all assigned */
 	for (i = 0; i < NF_INET_NUMHOOKS; i++) {
 		/* Only hooks which are valid */
-		if (!(valid_hooks & (1 << i)))
+		if (!(compatr->valid_hooks & (1 << i)))
 			continue;
 		if (info->hook_entry[i] == 0xFFFFFFFF) {
 			duprintf("Invalid hook entry %u %u\n",
-				 i, hook_entries[i]);
+				 i, info->hook_entry[i]);
 			goto out_unlock;
 		}
 		if (info->underflow[i] == 0xFFFFFFFF) {
 			duprintf("Invalid underflow %u %u\n",
-				 i, underflows[i]);
+				 i, info->underflow[i]);
 			goto out_unlock;
 		}
 	}
@@ -1738,17 +1730,17 @@ translate_compat_table(struct net *net,
 	if (!newinfo)
 		goto out_unlock;
 
-	newinfo->number = number;
+	newinfo->number = compatr->num_entries;
 	for (i = 0; i < NF_INET_NUMHOOKS; i++) {
 		newinfo->hook_entry[i] = info->hook_entry[i];
 		newinfo->underflow[i] = info->underflow[i];
 	}
 	entry1 = newinfo->entries;
 	pos = entry1;
-	size = total_size;
-	xt_entry_foreach(iter0, entry0, total_size) {
+	size = compatr->size;
+	xt_entry_foreach(iter0, entry0, compatr->size) {
 		ret = compat_copy_entry_from_user(iter0, &pos, &size,
-						  name, newinfo, entry1);
+						  newinfo, entry1);
 		if (ret != 0)
 			break;
 	}
@@ -1758,12 +1750,12 @@ translate_compat_table(struct net *net,
 		goto free_newinfo;
 
 	ret = -ELOOP;
-	if (!mark_source_chains(newinfo, valid_hooks, entry1))
+	if (!mark_source_chains(newinfo, compatr->valid_hooks, entry1))
 		goto free_newinfo;
 
 	i = 0;
 	xt_entry_foreach(iter1, entry1, newinfo->size) {
-		ret = compat_check_entry(iter1, net, name);
+		ret = compat_check_entry(iter1, net, compatr->name);
 		if (ret != 0)
 			break;
 		++i;
@@ -1803,7 +1795,7 @@ translate_compat_table(struct net *net,
 free_newinfo:
 	xt_free_table_info(newinfo);
 out:
-	xt_entry_foreach(iter0, entry0, total_size) {
+	xt_entry_foreach(iter0, entry0, compatr->size) {
 		if (j-- == 0)
 			break;
 		compat_release_entry(iter0);
@@ -1848,10 +1840,7 @@ compat_do_replace(struct net *net, void __user *user, unsigned int len)
 		goto free_newinfo;
 	}
 
-	ret = translate_compat_table(net, tmp.name, tmp.valid_hooks,
-				     &newinfo, &loc_cpu_entry, tmp.size,
-				     tmp.num_entries, tmp.hook_entry,
-				     tmp.underflow);
+	ret = translate_compat_table(net, &newinfo, &loc_cpu_entry, &tmp);
 	if (ret != 0)
 		goto free_newinfo;
 
-- 
2.1.4


^ permalink raw reply related

* [PATCH 14/23] netfilter: x_tables: do compat validation via translate_table
From: Pablo Neira Ayuso @ 2016-04-22 13:39 UTC (permalink / raw)
  To: netfilter-devel; +Cc: davem, netdev
In-Reply-To: <1461332394-3994-1-git-send-email-pablo@netfilter.org>

From: Florian Westphal <fw@strlen.de>

This looks like refactoring, but its also a bug fix.

Problem is that the compat path (32bit iptables, 64bit kernel) lacks a few
sanity tests that are done in the normal path.

For example, we do not check for underflows and the base chain policies.

While its possible to also add such checks to the compat path, its more
copy&pastry, for instance we cannot reuse check_underflow() helper as
e->target_offset differs in the compat case.

Other problem is that it makes auditing for validation errors harder; two
places need to be checked and kept in sync.

At a high level 32 bit compat works like this:
1- initial pass over blob:
   validate match/entry offsets, bounds checking
   lookup all matches and targets
   do bookkeeping wrt. size delta of 32/64bit structures
   assign match/target.u.kernel pointer (points at kernel
   implementation, needed to access ->compatsize etc.)

2- allocate memory according to the total bookkeeping size to
   contain the translated ruleset

3- second pass over original blob:
   for each entry, copy the 32bit representation to the newly allocated
   memory.  This also does any special match translations (e.g.
   adjust 32bit to 64bit longs, etc).

4- check if ruleset is free of loops (chase all jumps)

5-first pass over translated blob:
   call the checkentry function of all matches and targets.

The alternative implemented by this patch is to drop steps 3&4 from the
compat process, the translation is changed into an intermediate step
rather than a full 1:1 translate_table replacement.

In the 2nd pass (step #3), change the 64bit ruleset back to a kernel
representation, i.e. put() the kernel pointer and restore ->u.user.name .

This gets us a 64bit ruleset that is in the format generated by a 64bit
iptables userspace -- we can then use translate_table() to get the
'native' sanity checks.

This has two drawbacks:

1. we re-validate all the match and target entry structure sizes even
though compat translation is supposed to never generate bogus offsets.
2. we put and then re-lookup each match and target.

THe upside is that we get all sanity tests and ruleset validations
provided by the normal path and can remove some duplicated compat code.

iptables-restore time of autogenerated ruleset with 300k chains of form
-A CHAIN0001 -m limit --limit 1/s -j CHAIN0002
-A CHAIN0002 -m limit --limit 1/s -j CHAIN0003

shows no noticeable differences in restore times:
old:   0m30.796s
new:   0m31.521s
64bit: 0m25.674s

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/ipv4/netfilter/arp_tables.c | 114 ++++++-----------------------
 net/ipv4/netfilter/ip_tables.c  | 155 ++++++++--------------------------------
 net/ipv6/netfilter/ip6_tables.c | 148 ++++++--------------------------------
 net/netfilter/x_tables.c        |   8 +++
 4 files changed, 83 insertions(+), 342 deletions(-)

diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c
index be514c6..705179b 100644
--- a/net/ipv4/netfilter/arp_tables.c
+++ b/net/ipv4/netfilter/arp_tables.c
@@ -1234,19 +1234,17 @@ static inline void compat_release_entry(struct compat_arpt_entry *e)
 	module_put(t->u.kernel.target->me);
 }
 
-static inline int
+static int
 check_compat_entry_size_and_hooks(struct compat_arpt_entry *e,
 				  struct xt_table_info *newinfo,
 				  unsigned int *size,
 				  const unsigned char *base,
-				  const unsigned char *limit,
-				  const unsigned int *hook_entries,
-				  const unsigned int *underflows)
+				  const unsigned char *limit)
 {
 	struct xt_entry_target *t;
 	struct xt_target *target;
 	unsigned int entry_offset;
-	int ret, off, h;
+	int ret, off;
 
 	duprintf("check_compat_entry_size_and_hooks %p\n", e);
 	if ((unsigned long)e % __alignof__(struct compat_arpt_entry) != 0 ||
@@ -1291,17 +1289,6 @@ check_compat_entry_size_and_hooks(struct compat_arpt_entry *e,
 	if (ret)
 		goto release_target;
 
-	/* Check hooks & underflows */
-	for (h = 0; h < NF_ARP_NUMHOOKS; h++) {
-		if ((unsigned char *)e - base == hook_entries[h])
-			newinfo->hook_entry[h] = hook_entries[h];
-		if ((unsigned char *)e - base == underflows[h])
-			newinfo->underflow[h] = underflows[h];
-	}
-
-	/* Clear counters and comefrom */
-	memset(&e->counters, 0, sizeof(e->counters));
-	e->comefrom = 0;
 	return 0;
 
 release_target:
@@ -1351,7 +1338,7 @@ static int translate_compat_table(struct xt_table_info **pinfo,
 	struct xt_table_info *newinfo, *info;
 	void *pos, *entry0, *entry1;
 	struct compat_arpt_entry *iter0;
-	struct arpt_entry *iter1;
+	struct arpt_replace repl;
 	unsigned int size;
 	int ret = 0;
 
@@ -1360,12 +1347,6 @@ static int translate_compat_table(struct xt_table_info **pinfo,
 	size = compatr->size;
 	info->number = compatr->num_entries;
 
-	/* Init all hooks to impossible value. */
-	for (i = 0; i < NF_ARP_NUMHOOKS; i++) {
-		info->hook_entry[i] = 0xFFFFFFFF;
-		info->underflow[i] = 0xFFFFFFFF;
-	}
-
 	duprintf("translate_compat_table: size %u\n", info->size);
 	j = 0;
 	xt_compat_lock(NFPROTO_ARP);
@@ -1374,9 +1355,7 @@ static int translate_compat_table(struct xt_table_info **pinfo,
 	xt_entry_foreach(iter0, entry0, compatr->size) {
 		ret = check_compat_entry_size_and_hooks(iter0, info, &size,
 							entry0,
-							entry0 + compatr->size,
-							compatr->hook_entry,
-							compatr->underflow);
+							entry0 + compatr->size);
 		if (ret != 0)
 			goto out_unlock;
 		++j;
@@ -1389,23 +1368,6 @@ static int translate_compat_table(struct xt_table_info **pinfo,
 		goto out_unlock;
 	}
 
-	/* Check hooks all assigned */
-	for (i = 0; i < NF_ARP_NUMHOOKS; i++) {
-		/* Only hooks which are valid */
-		if (!(compatr->valid_hooks & (1 << i)))
-			continue;
-		if (info->hook_entry[i] == 0xFFFFFFFF) {
-			duprintf("Invalid hook entry %u %u\n",
-				 i, info->hook_entry[i]);
-			goto out_unlock;
-		}
-		if (info->underflow[i] == 0xFFFFFFFF) {
-			duprintf("Invalid underflow %u %u\n",
-				 i, info->underflow[i]);
-			goto out_unlock;
-		}
-	}
-
 	ret = -ENOMEM;
 	newinfo = xt_alloc_table_info(size);
 	if (!newinfo)
@@ -1422,55 +1384,26 @@ static int translate_compat_table(struct xt_table_info **pinfo,
 	xt_entry_foreach(iter0, entry0, compatr->size)
 		compat_copy_entry_from_user(iter0, &pos, &size,
 					    newinfo, entry1);
+
+	/* all module references in entry0 are now gone */
+
 	xt_compat_flush_offsets(NFPROTO_ARP);
 	xt_compat_unlock(NFPROTO_ARP);
 
-	ret = -ELOOP;
-	if (!mark_source_chains(newinfo, compatr->valid_hooks, entry1))
-		goto free_newinfo;
-
-	i = 0;
-	xt_entry_foreach(iter1, entry1, newinfo->size) {
-		iter1->counters.pcnt = xt_percpu_counter_alloc();
-		if (IS_ERR_VALUE(iter1->counters.pcnt)) {
-			ret = -ENOMEM;
-			break;
-		}
+	memcpy(&repl, compatr, sizeof(*compatr));
 
-		ret = check_target(iter1, compatr->name);
-		if (ret != 0) {
-			xt_percpu_counter_free(iter1->counters.pcnt);
-			break;
-		}
-		++i;
-		if (strcmp(arpt_get_target(iter1)->u.user.name,
-		    XT_ERROR_TARGET) == 0)
-			++newinfo->stacksize;
-	}
-	if (ret) {
-		/*
-		 * The first i matches need cleanup_entry (calls ->destroy)
-		 * because they had called ->check already. The other j-i
-		 * entries need only release.
-		 */
-		int skip = i;
-		j -= i;
-		xt_entry_foreach(iter0, entry0, newinfo->size) {
-			if (skip-- > 0)
-				continue;
-			if (j-- == 0)
-				break;
-			compat_release_entry(iter0);
-		}
-		xt_entry_foreach(iter1, entry1, newinfo->size) {
-			if (i-- == 0)
-				break;
-			cleanup_entry(iter1);
-		}
-		xt_free_table_info(newinfo);
-		return ret;
+	for (i = 0; i < NF_ARP_NUMHOOKS; i++) {
+		repl.hook_entry[i] = newinfo->hook_entry[i];
+		repl.underflow[i] = newinfo->underflow[i];
 	}
 
+	repl.num_counters = 0;
+	repl.counters = NULL;
+	repl.size = newinfo->size;
+	ret = translate_table(newinfo, entry1, &repl);
+	if (ret)
+		goto free_newinfo;
+
 	*pinfo = newinfo;
 	*pentry0 = entry1;
 	xt_free_table_info(info);
@@ -1478,17 +1411,16 @@ static int translate_compat_table(struct xt_table_info **pinfo,
 
 free_newinfo:
 	xt_free_table_info(newinfo);
-out:
+	return ret;
+out_unlock:
+	xt_compat_flush_offsets(NFPROTO_ARP);
+	xt_compat_unlock(NFPROTO_ARP);
 	xt_entry_foreach(iter0, entry0, compatr->size) {
 		if (j-- == 0)
 			break;
 		compat_release_entry(iter0);
 	}
 	return ret;
-out_unlock:
-	xt_compat_flush_offsets(NFPROTO_ARP);
-	xt_compat_unlock(NFPROTO_ARP);
-	goto out;
 }
 
 static int compat_do_replace(struct net *net, void __user *user,
diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c
index 5c20eef..c26ccd8 100644
--- a/net/ipv4/netfilter/ip_tables.c
+++ b/net/ipv4/netfilter/ip_tables.c
@@ -1483,16 +1483,14 @@ check_compat_entry_size_and_hooks(struct compat_ipt_entry *e,
 				  struct xt_table_info *newinfo,
 				  unsigned int *size,
 				  const unsigned char *base,
-				  const unsigned char *limit,
-				  const unsigned int *hook_entries,
-				  const unsigned int *underflows)
+				  const unsigned char *limit)
 {
 	struct xt_entry_match *ematch;
 	struct xt_entry_target *t;
 	struct xt_target *target;
 	unsigned int entry_offset;
 	unsigned int j;
-	int ret, off, h;
+	int ret, off;
 
 	duprintf("check_compat_entry_size_and_hooks %p\n", e);
 	if ((unsigned long)e % __alignof__(struct compat_ipt_entry) != 0 ||
@@ -1544,17 +1542,6 @@ check_compat_entry_size_and_hooks(struct compat_ipt_entry *e,
 	if (ret)
 		goto out;
 
-	/* Check hooks & underflows */
-	for (h = 0; h < NF_INET_NUMHOOKS; h++) {
-		if ((unsigned char *)e - base == hook_entries[h])
-			newinfo->hook_entry[h] = hook_entries[h];
-		if ((unsigned char *)e - base == underflows[h])
-			newinfo->underflow[h] = underflows[h];
-	}
-
-	/* Clear counters and comefrom */
-	memset(&e->counters, 0, sizeof(e->counters));
-	e->comefrom = 0;
 	return 0;
 
 out:
@@ -1597,6 +1584,7 @@ compat_copy_entry_from_user(struct compat_ipt_entry *e, void **dstptr,
 	xt_compat_target_from_user(t, dstptr, size);
 
 	de->next_offset = e->next_offset - (origsize - *size);
+
 	for (h = 0; h < NF_INET_NUMHOOKS; h++) {
 		if ((unsigned char *)de - base < newinfo->hook_entry[h])
 			newinfo->hook_entry[h] -= origsize - *size;
@@ -1606,48 +1594,6 @@ compat_copy_entry_from_user(struct compat_ipt_entry *e, void **dstptr,
 }
 
 static int
-compat_check_entry(struct ipt_entry *e, struct net *net, const char *name)
-{
-	struct xt_entry_match *ematch;
-	struct xt_mtchk_param mtpar;
-	unsigned int j;
-	int ret = 0;
-
-	e->counters.pcnt = xt_percpu_counter_alloc();
-	if (IS_ERR_VALUE(e->counters.pcnt))
-		return -ENOMEM;
-
-	j = 0;
-	mtpar.net	= net;
-	mtpar.table     = name;
-	mtpar.entryinfo = &e->ip;
-	mtpar.hook_mask = e->comefrom;
-	mtpar.family    = NFPROTO_IPV4;
-	xt_ematch_foreach(ematch, e) {
-		ret = check_match(ematch, &mtpar);
-		if (ret != 0)
-			goto cleanup_matches;
-		++j;
-	}
-
-	ret = check_target(e, net, name);
-	if (ret)
-		goto cleanup_matches;
-	return 0;
-
- cleanup_matches:
-	xt_ematch_foreach(ematch, e) {
-		if (j-- == 0)
-			break;
-		cleanup_match(ematch, net);
-	}
-
-	xt_percpu_counter_free(e->counters.pcnt);
-
-	return ret;
-}
-
-static int
 translate_compat_table(struct net *net,
 		       struct xt_table_info **pinfo,
 		       void **pentry0,
@@ -1657,7 +1603,7 @@ translate_compat_table(struct net *net,
 	struct xt_table_info *newinfo, *info;
 	void *pos, *entry0, *entry1;
 	struct compat_ipt_entry *iter0;
-	struct ipt_entry *iter1;
+	struct ipt_replace repl;
 	unsigned int size;
 	int ret;
 
@@ -1666,12 +1612,6 @@ translate_compat_table(struct net *net,
 	size = compatr->size;
 	info->number = compatr->num_entries;
 
-	/* Init all hooks to impossible value. */
-	for (i = 0; i < NF_INET_NUMHOOKS; i++) {
-		info->hook_entry[i] = 0xFFFFFFFF;
-		info->underflow[i] = 0xFFFFFFFF;
-	}
-
 	duprintf("translate_compat_table: size %u\n", info->size);
 	j = 0;
 	xt_compat_lock(AF_INET);
@@ -1680,9 +1620,7 @@ translate_compat_table(struct net *net,
 	xt_entry_foreach(iter0, entry0, compatr->size) {
 		ret = check_compat_entry_size_and_hooks(iter0, info, &size,
 							entry0,
-							entry0 + compatr->size,
-							compatr->hook_entry,
-							compatr->underflow);
+							entry0 + compatr->size);
 		if (ret != 0)
 			goto out_unlock;
 		++j;
@@ -1695,23 +1633,6 @@ translate_compat_table(struct net *net,
 		goto out_unlock;
 	}
 
-	/* Check hooks all assigned */
-	for (i = 0; i < NF_INET_NUMHOOKS; i++) {
-		/* Only hooks which are valid */
-		if (!(compatr->valid_hooks & (1 << i)))
-			continue;
-		if (info->hook_entry[i] == 0xFFFFFFFF) {
-			duprintf("Invalid hook entry %u %u\n",
-				 i, info->hook_entry[i]);
-			goto out_unlock;
-		}
-		if (info->underflow[i] == 0xFFFFFFFF) {
-			duprintf("Invalid underflow %u %u\n",
-				 i, info->underflow[i]);
-			goto out_unlock;
-		}
-	}
-
 	ret = -ENOMEM;
 	newinfo = xt_alloc_table_info(size);
 	if (!newinfo)
@@ -1719,8 +1640,8 @@ translate_compat_table(struct net *net,
 
 	newinfo->number = compatr->num_entries;
 	for (i = 0; i < NF_INET_NUMHOOKS; i++) {
-		newinfo->hook_entry[i] = info->hook_entry[i];
-		newinfo->underflow[i] = info->underflow[i];
+		newinfo->hook_entry[i] = compatr->hook_entry[i];
+		newinfo->underflow[i] = compatr->underflow[i];
 	}
 	entry1 = newinfo->entries;
 	pos = entry1;
@@ -1729,47 +1650,30 @@ translate_compat_table(struct net *net,
 		compat_copy_entry_from_user(iter0, &pos, &size,
 					    newinfo, entry1);
 
+	/* all module references in entry0 are now gone.
+	 * entry1/newinfo contains a 64bit ruleset that looks exactly as
+	 * generated by 64bit userspace.
+	 *
+	 * Call standard translate_table() to validate all hook_entrys,
+	 * underflows, check for loops, etc.
+	 */
 	xt_compat_flush_offsets(AF_INET);
 	xt_compat_unlock(AF_INET);
 
-	ret = -ELOOP;
-	if (!mark_source_chains(newinfo, compatr->valid_hooks, entry1))
-		goto free_newinfo;
+	memcpy(&repl, compatr, sizeof(*compatr));
 
-	i = 0;
-	xt_entry_foreach(iter1, entry1, newinfo->size) {
-		ret = compat_check_entry(iter1, net, compatr->name);
-		if (ret != 0)
-			break;
-		++i;
-		if (strcmp(ipt_get_target(iter1)->u.user.name,
-		    XT_ERROR_TARGET) == 0)
-			++newinfo->stacksize;
-	}
-	if (ret) {
-		/*
-		 * The first i matches need cleanup_entry (calls ->destroy)
-		 * because they had called ->check already. The other j-i
-		 * entries need only release.
-		 */
-		int skip = i;
-		j -= i;
-		xt_entry_foreach(iter0, entry0, newinfo->size) {
-			if (skip-- > 0)
-				continue;
-			if (j-- == 0)
-				break;
-			compat_release_entry(iter0);
-		}
-		xt_entry_foreach(iter1, entry1, newinfo->size) {
-			if (i-- == 0)
-				break;
-			cleanup_entry(iter1, net);
-		}
-		xt_free_table_info(newinfo);
-		return ret;
+	for (i = 0; i < NF_INET_NUMHOOKS; i++) {
+		repl.hook_entry[i] = newinfo->hook_entry[i];
+		repl.underflow[i] = newinfo->underflow[i];
 	}
 
+	repl.num_counters = 0;
+	repl.counters = NULL;
+	repl.size = newinfo->size;
+	ret = translate_table(net, newinfo, entry1, &repl);
+	if (ret)
+		goto free_newinfo;
+
 	*pinfo = newinfo;
 	*pentry0 = entry1;
 	xt_free_table_info(info);
@@ -1777,17 +1681,16 @@ translate_compat_table(struct net *net,
 
 free_newinfo:
 	xt_free_table_info(newinfo);
-out:
+	return ret;
+out_unlock:
+	xt_compat_flush_offsets(AF_INET);
+	xt_compat_unlock(AF_INET);
 	xt_entry_foreach(iter0, entry0, compatr->size) {
 		if (j-- == 0)
 			break;
 		compat_release_entry(iter0);
 	}
 	return ret;
-out_unlock:
-	xt_compat_flush_offsets(AF_INET);
-	xt_compat_unlock(AF_INET);
-	goto out;
 }
 
 static int
diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c
index 620d54c..f5a4eb2 100644
--- a/net/ipv6/netfilter/ip6_tables.c
+++ b/net/ipv6/netfilter/ip6_tables.c
@@ -1495,16 +1495,14 @@ check_compat_entry_size_and_hooks(struct compat_ip6t_entry *e,
 				  struct xt_table_info *newinfo,
 				  unsigned int *size,
 				  const unsigned char *base,
-				  const unsigned char *limit,
-				  const unsigned int *hook_entries,
-				  const unsigned int *underflows)
+				  const unsigned char *limit)
 {
 	struct xt_entry_match *ematch;
 	struct xt_entry_target *t;
 	struct xt_target *target;
 	unsigned int entry_offset;
 	unsigned int j;
-	int ret, off, h;
+	int ret, off;
 
 	duprintf("check_compat_entry_size_and_hooks %p\n", e);
 	if ((unsigned long)e % __alignof__(struct compat_ip6t_entry) != 0 ||
@@ -1556,17 +1554,6 @@ check_compat_entry_size_and_hooks(struct compat_ip6t_entry *e,
 	if (ret)
 		goto out;
 
-	/* Check hooks & underflows */
-	for (h = 0; h < NF_INET_NUMHOOKS; h++) {
-		if ((unsigned char *)e - base == hook_entries[h])
-			newinfo->hook_entry[h] = hook_entries[h];
-		if ((unsigned char *)e - base == underflows[h])
-			newinfo->underflow[h] = underflows[h];
-	}
-
-	/* Clear counters and comefrom */
-	memset(&e->counters, 0, sizeof(e->counters));
-	e->comefrom = 0;
 	return 0;
 
 out:
@@ -1615,47 +1602,6 @@ compat_copy_entry_from_user(struct compat_ip6t_entry *e, void **dstptr,
 	}
 }
 
-static int compat_check_entry(struct ip6t_entry *e, struct net *net,
-			      const char *name)
-{
-	unsigned int j;
-	int ret = 0;
-	struct xt_mtchk_param mtpar;
-	struct xt_entry_match *ematch;
-
-	e->counters.pcnt = xt_percpu_counter_alloc();
-	if (IS_ERR_VALUE(e->counters.pcnt))
-		return -ENOMEM;
-	j = 0;
-	mtpar.net	= net;
-	mtpar.table     = name;
-	mtpar.entryinfo = &e->ipv6;
-	mtpar.hook_mask = e->comefrom;
-	mtpar.family    = NFPROTO_IPV6;
-	xt_ematch_foreach(ematch, e) {
-		ret = check_match(ematch, &mtpar);
-		if (ret != 0)
-			goto cleanup_matches;
-		++j;
-	}
-
-	ret = check_target(e, net, name);
-	if (ret)
-		goto cleanup_matches;
-	return 0;
-
- cleanup_matches:
-	xt_ematch_foreach(ematch, e) {
-		if (j-- == 0)
-			break;
-		cleanup_match(ematch, net);
-	}
-
-	xt_percpu_counter_free(e->counters.pcnt);
-
-	return ret;
-}
-
 static int
 translate_compat_table(struct net *net,
 		       struct xt_table_info **pinfo,
@@ -1666,7 +1612,7 @@ translate_compat_table(struct net *net,
 	struct xt_table_info *newinfo, *info;
 	void *pos, *entry0, *entry1;
 	struct compat_ip6t_entry *iter0;
-	struct ip6t_entry *iter1;
+	struct ip6t_replace repl;
 	unsigned int size;
 	int ret = 0;
 
@@ -1675,12 +1621,6 @@ translate_compat_table(struct net *net,
 	size = compatr->size;
 	info->number = compatr->num_entries;
 
-	/* Init all hooks to impossible value. */
-	for (i = 0; i < NF_INET_NUMHOOKS; i++) {
-		info->hook_entry[i] = 0xFFFFFFFF;
-		info->underflow[i] = 0xFFFFFFFF;
-	}
-
 	duprintf("translate_compat_table: size %u\n", info->size);
 	j = 0;
 	xt_compat_lock(AF_INET6);
@@ -1689,9 +1629,7 @@ translate_compat_table(struct net *net,
 	xt_entry_foreach(iter0, entry0, compatr->size) {
 		ret = check_compat_entry_size_and_hooks(iter0, info, &size,
 							entry0,
-							entry0 + compatr->size,
-							compatr->hook_entry,
-							compatr->underflow);
+							entry0 + compatr->size);
 		if (ret != 0)
 			goto out_unlock;
 		++j;
@@ -1704,23 +1642,6 @@ translate_compat_table(struct net *net,
 		goto out_unlock;
 	}
 
-	/* Check hooks all assigned */
-	for (i = 0; i < NF_INET_NUMHOOKS; i++) {
-		/* Only hooks which are valid */
-		if (!(compatr->valid_hooks & (1 << i)))
-			continue;
-		if (info->hook_entry[i] == 0xFFFFFFFF) {
-			duprintf("Invalid hook entry %u %u\n",
-				 i, info->hook_entry[i]);
-			goto out_unlock;
-		}
-		if (info->underflow[i] == 0xFFFFFFFF) {
-			duprintf("Invalid underflow %u %u\n",
-				 i, info->underflow[i]);
-			goto out_unlock;
-		}
-	}
-
 	ret = -ENOMEM;
 	newinfo = xt_alloc_table_info(size);
 	if (!newinfo)
@@ -1728,56 +1649,34 @@ translate_compat_table(struct net *net,
 
 	newinfo->number = compatr->num_entries;
 	for (i = 0; i < NF_INET_NUMHOOKS; i++) {
-		newinfo->hook_entry[i] = info->hook_entry[i];
-		newinfo->underflow[i] = info->underflow[i];
+		newinfo->hook_entry[i] = compatr->hook_entry[i];
+		newinfo->underflow[i] = compatr->underflow[i];
 	}
 	entry1 = newinfo->entries;
 	pos = entry1;
+	size = compatr->size;
 	xt_entry_foreach(iter0, entry0, compatr->size)
 		compat_copy_entry_from_user(iter0, &pos, &size,
 					    newinfo, entry1);
 
+	/* all module references in entry0 are now gone. */
 	xt_compat_flush_offsets(AF_INET6);
 	xt_compat_unlock(AF_INET6);
 
-	ret = -ELOOP;
-	if (!mark_source_chains(newinfo, compatr->valid_hooks, entry1))
-		goto free_newinfo;
+	memcpy(&repl, compatr, sizeof(*compatr));
 
-	i = 0;
-	xt_entry_foreach(iter1, entry1, newinfo->size) {
-		ret = compat_check_entry(iter1, net, compatr->name);
-		if (ret != 0)
-			break;
-		++i;
-		if (strcmp(ip6t_get_target(iter1)->u.user.name,
-		    XT_ERROR_TARGET) == 0)
-			++newinfo->stacksize;
-	}
-	if (ret) {
-		/*
-		 * The first i matches need cleanup_entry (calls ->destroy)
-		 * because they had called ->check already. The other j-i
-		 * entries need only release.
-		 */
-		int skip = i;
-		j -= i;
-		xt_entry_foreach(iter0, entry0, newinfo->size) {
-			if (skip-- > 0)
-				continue;
-			if (j-- == 0)
-				break;
-			compat_release_entry(iter0);
-		}
-		xt_entry_foreach(iter1, entry1, newinfo->size) {
-			if (i-- == 0)
-				break;
-			cleanup_entry(iter1, net);
-		}
-		xt_free_table_info(newinfo);
-		return ret;
+	for (i = 0; i < NF_INET_NUMHOOKS; i++) {
+		repl.hook_entry[i] = newinfo->hook_entry[i];
+		repl.underflow[i] = newinfo->underflow[i];
 	}
 
+	repl.num_counters = 0;
+	repl.counters = NULL;
+	repl.size = newinfo->size;
+	ret = translate_table(net, newinfo, entry1, &repl);
+	if (ret)
+		goto free_newinfo;
+
 	*pinfo = newinfo;
 	*pentry0 = entry1;
 	xt_free_table_info(info);
@@ -1785,17 +1684,16 @@ translate_compat_table(struct net *net,
 
 free_newinfo:
 	xt_free_table_info(newinfo);
-out:
+	return ret;
+out_unlock:
+	xt_compat_flush_offsets(AF_INET6);
+	xt_compat_unlock(AF_INET6);
 	xt_entry_foreach(iter0, entry0, compatr->size) {
 		if (j-- == 0)
 			break;
 		compat_release_entry(iter0);
 	}
 	return ret;
-out_unlock:
-	xt_compat_flush_offsets(AF_INET6);
-	xt_compat_unlock(AF_INET6);
-	goto out;
 }
 
 static int
diff --git a/net/netfilter/x_tables.c b/net/netfilter/x_tables.c
index 7e7173b..9ec23ffa 100644
--- a/net/netfilter/x_tables.c
+++ b/net/netfilter/x_tables.c
@@ -533,6 +533,7 @@ void xt_compat_match_from_user(struct xt_entry_match *m, void **dstptr,
 	struct compat_xt_entry_match *cm = (struct compat_xt_entry_match *)m;
 	int pad, off = xt_compat_match_offset(match);
 	u_int16_t msize = cm->u.user.match_size;
+	char name[sizeof(m->u.user.name)];
 
 	m = *dstptr;
 	memcpy(m, cm, sizeof(*cm));
@@ -546,6 +547,9 @@ void xt_compat_match_from_user(struct xt_entry_match *m, void **dstptr,
 
 	msize += off;
 	m->u.user.match_size = msize;
+	strlcpy(name, match->name, sizeof(name));
+	module_put(match->me);
+	strncpy(m->u.user.name, name, sizeof(m->u.user.name));
 
 	*size += off;
 	*dstptr += msize;
@@ -763,6 +767,7 @@ void xt_compat_target_from_user(struct xt_entry_target *t, void **dstptr,
 	struct compat_xt_entry_target *ct = (struct compat_xt_entry_target *)t;
 	int pad, off = xt_compat_target_offset(target);
 	u_int16_t tsize = ct->u.user.target_size;
+	char name[sizeof(t->u.user.name)];
 
 	t = *dstptr;
 	memcpy(t, ct, sizeof(*ct));
@@ -776,6 +781,9 @@ void xt_compat_target_from_user(struct xt_entry_target *t, void **dstptr,
 
 	tsize += off;
 	t->u.user.target_size = tsize;
+	strlcpy(name, target->name, sizeof(name));
+	module_put(target->me);
+	strncpy(t->u.user.name, name, sizeof(t->u.user.name));
 
 	*size += off;
 	*dstptr += tsize;
-- 
2.1.4


^ permalink raw reply related

* [PATCH 19/23] netfilter: connlabels: move helpers to xt_connlabel
From: Pablo Neira Ayuso @ 2016-04-22 13:39 UTC (permalink / raw)
  To: netfilter-devel; +Cc: davem, netdev
In-Reply-To: <1461332394-3994-1-git-send-email-pablo@netfilter.org>

From: Florian Westphal <fw@strlen.de>

Currently labels can only be set either by iptables connlabel
match or via ctnetlink.

Before adding nftables set support, clean up the clabel core and move
helpers that nft will not need after all to the xtables module.

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/netfilter/nf_conntrack_labels.h |  1 -
 net/netfilter/nf_conntrack_labels.c         | 19 +------------------
 net/netfilter/xt_connlabel.c                | 12 +++++++++++-
 3 files changed, 12 insertions(+), 20 deletions(-)

diff --git a/include/net/netfilter/nf_conntrack_labels.h b/include/net/netfilter/nf_conntrack_labels.h
index 7e2b1d0..5167818 100644
--- a/include/net/netfilter/nf_conntrack_labels.h
+++ b/include/net/netfilter/nf_conntrack_labels.h
@@ -45,7 +45,6 @@ static inline struct nf_conn_labels *nf_ct_labels_ext_add(struct nf_conn *ct)
 #endif
 }
 
-bool nf_connlabel_match(const struct nf_conn *ct, u16 bit);
 int nf_connlabel_set(struct nf_conn *ct, u16 bit);
 
 int nf_connlabels_replace(struct nf_conn *ct,
diff --git a/net/netfilter/nf_conntrack_labels.c b/net/netfilter/nf_conntrack_labels.c
index 3ce5c31..3a30900 100644
--- a/net/netfilter/nf_conntrack_labels.c
+++ b/net/netfilter/nf_conntrack_labels.c
@@ -16,28 +16,11 @@
 
 static spinlock_t nf_connlabels_lock;
 
-static unsigned int label_bits(const struct nf_conn_labels *l)
-{
-	unsigned int longs = l->words;
-	return longs * BITS_PER_LONG;
-}
-
-bool nf_connlabel_match(const struct nf_conn *ct, u16 bit)
-{
-	struct nf_conn_labels *labels = nf_ct_labels_find(ct);
-
-	if (!labels)
-		return false;
-
-	return bit < label_bits(labels) && test_bit(bit, labels->bits);
-}
-EXPORT_SYMBOL_GPL(nf_connlabel_match);
-
 int nf_connlabel_set(struct nf_conn *ct, u16 bit)
 {
 	struct nf_conn_labels *labels = nf_ct_labels_find(ct);
 
-	if (!labels || bit >= label_bits(labels))
+	if (!labels || BIT_WORD(bit) >= labels->words)
 		return -ENOSPC;
 
 	if (test_bit(bit, labels->bits))
diff --git a/net/netfilter/xt_connlabel.c b/net/netfilter/xt_connlabel.c
index bb9cbeb..d9b3e53 100644
--- a/net/netfilter/xt_connlabel.c
+++ b/net/netfilter/xt_connlabel.c
@@ -18,6 +18,16 @@ MODULE_DESCRIPTION("Xtables: add/match connection trackling labels");
 MODULE_ALIAS("ipt_connlabel");
 MODULE_ALIAS("ip6t_connlabel");
 
+static bool connlabel_match(const struct nf_conn *ct, u16 bit)
+{
+	struct nf_conn_labels *labels = nf_ct_labels_find(ct);
+
+	if (!labels)
+		return false;
+
+	return BIT_WORD(bit) < labels->words && test_bit(bit, labels->bits);
+}
+
 static bool
 connlabel_mt(const struct sk_buff *skb, struct xt_action_param *par)
 {
@@ -33,7 +43,7 @@ connlabel_mt(const struct sk_buff *skb, struct xt_action_param *par)
 	if (info->options & XT_CONNLABEL_OP_SET)
 		return (nf_connlabel_set(ct, info->bit) == 0) ^ invert;
 
-	return nf_connlabel_match(ct, info->bit) ^ invert;
+	return connlabel_match(ct, info->bit) ^ invert;
 }
 
 static int connlabel_mt_check(const struct xt_mtchk_param *par)
-- 
2.1.4


^ permalink raw reply related

* [PATCH 23/23] netfilter: conntrack: don't acquire lock during seq_printf
From: Pablo Neira Ayuso @ 2016-04-22 13:39 UTC (permalink / raw)
  To: netfilter-devel; +Cc: davem, netdev
In-Reply-To: <1461332394-3994-1-git-send-email-pablo@netfilter.org>

From: Florian Westphal <fw@strlen.de>

read access doesn't need any lock here.

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/nf_conntrack_proto_sctp.c | 8 +-------
 net/netfilter/nf_conntrack_proto_tcp.c  | 8 +-------
 2 files changed, 2 insertions(+), 14 deletions(-)

diff --git a/net/netfilter/nf_conntrack_proto_sctp.c b/net/netfilter/nf_conntrack_proto_sctp.c
index 9578a7c..1d7ab96 100644
--- a/net/netfilter/nf_conntrack_proto_sctp.c
+++ b/net/netfilter/nf_conntrack_proto_sctp.c
@@ -191,13 +191,7 @@ static void sctp_print_tuple(struct seq_file *s,
 /* Print out the private part of the conntrack. */
 static void sctp_print_conntrack(struct seq_file *s, struct nf_conn *ct)
 {
-	enum sctp_conntrack state;
-
-	spin_lock_bh(&ct->lock);
-	state = ct->proto.sctp.state;
-	spin_unlock_bh(&ct->lock);
-
-	seq_printf(s, "%s ", sctp_conntrack_names[state]);
+	seq_printf(s, "%s ", sctp_conntrack_names[ct->proto.sctp.state]);
 }
 
 #define for_each_sctp_chunk(skb, sch, _sch, offset, dataoff, count)	\
diff --git a/net/netfilter/nf_conntrack_proto_tcp.c b/net/netfilter/nf_conntrack_proto_tcp.c
index 278f3b9..e0cb0ce 100644
--- a/net/netfilter/nf_conntrack_proto_tcp.c
+++ b/net/netfilter/nf_conntrack_proto_tcp.c
@@ -313,13 +313,7 @@ static void tcp_print_tuple(struct seq_file *s,
 /* Print out the private part of the conntrack. */
 static void tcp_print_conntrack(struct seq_file *s, struct nf_conn *ct)
 {
-	enum tcp_conntrack state;
-
-	spin_lock_bh(&ct->lock);
-	state = ct->proto.tcp.state;
-	spin_unlock_bh(&ct->lock);
-
-	seq_printf(s, "%s ", tcp_conntrack_names[state]);
+	seq_printf(s, "%s ", tcp_conntrack_names[ct->proto.tcp.state]);
 }
 
 static unsigned int get_conntrack_index(const struct tcphdr *tcph)
-- 
2.1.4


^ permalink raw reply related

* [PATCH 00/23] Netfilter updates for net-next
From: Pablo Neira Ayuso @ 2016-04-22 13:39 UTC (permalink / raw)
  To: netfilter-devel; +Cc: davem, netdev

Hi David,

The following patchset contains Netfilter updates for your net-next
tree, mostly from Florian Westphal to sort out the lack of sufficient
validation in x_tables and connlabel preparation patches to add
nf_tables support. They are:

1) Ensure we don't go over the ruleset blob boundaries in
   mark_source_chains().

2) Validate that target jumps land on an existing xt_entry. This extra
   sanitization comes with a performance penalty when loading the ruleset.

3) Introduce xt_check_entry_offsets() and use it from {arp,ip,ip6}tables.

4) Get rid of the smallish check_entry() functions in {arp,ip,ip6}tables.

5) Make sure the minimal possible target size in x_tables.

6) Similar to #3, add xt_compat_check_entry_offsets() for compat code.

7) Check that standard target size is valid.

8) More sanitization to ensure that the target_offset field is correct.

9) Add xt_check_entry_match() to validate that matches are well-formed.

10-12) Three patch to reduce the number of parameters in
    translate_compat_table() for {arp,ip,ip6}tables by using a container
    structure.

13) No need to return value from xt_compat_match_from_user(), so make
    it void.

14) Consolidate translate_table() so it can be used by compat code too.

15) Remove obsolete check for compat code, so we keep consistent with
    what was already removed in the native layout code (back in 2007).

16) Get rid of target jump validation from mark_source_chains(),
    obsoleted by #2.

17) Introduce xt_copy_counters_from_user() to consolidate counter
    copying, and use it from {arp,ip,ip6}tables.

18,22) Get rid of unnecessary explicit inlining in ctnetlink for dump
    functions.

19) Move nf_connlabel_match() to xt_connlabel.

20) Skip event notification if connlabel did not change.

21) Update of nf_connlabels_get() to make the upcoming nft connlabel
    support easier.

23) Remove spinlock to read protocol state field in conntrack.

You can pull these changes from:

  git://git.kernel.org/pub/scm/linux/kernel/git/pablo/nf-next.git

Thanks!

----------------------------------------------------------------

The following changes since commit 7d45a04cbc2683f9552572850f1c711d9b96dd26:

  tipc: remove remnants of old broadcast code (2016-04-13 17:49:11 -0400)

are available in the git repository at:

  git://git.kernel.org/pub/scm/linux/kernel/git/pablo/nf-next.git HEAD

for you to fetch changes up to a163f2cb393d9d71cad57bfe6a8c7f452a478fb4:

  netfilter: conntrack: don't acquire lock during seq_printf (2016-04-19 20:26:25 +0200)

----------------------------------------------------------------
Florian Westphal (21):
      netfilter: x_tables: don't move to non-existent next rule
      netfilter: x_tables: validate targets of jumps
      netfilter: x_tables: add and use xt_check_entry_offsets
      netfilter: x_tables: kill check_entry helper
      netfilter: x_tables: assert minimum target size
      netfilter: x_tables: add compat version of xt_check_entry_offsets
      netfilter: x_tables: check standard target size too
      netfilter: x_tables: check for bogus target offset
      netfilter: x_tables: validate all offsets and sizes in a rule
      netfilter: ip_tables: simplify translate_compat_table args
      netfilter: ip6_tables: simplify translate_compat_table args
      netfilter: arp_tables: simplify translate_compat_table args
      netfilter: x_tables: xt_compat_match_from_user doesn't need a retval
      netfilter: x_tables: do compat validation via translate_table
      netfilter: x_tables: remove obsolete overflow check for compat case too
      netfilter: x_tables: remove obsolete check
      netfilter: x_tables: introduce and use xt_copy_counters_from_user
      netfilter: connlabels: move helpers to xt_connlabel
      netfilter: labels: don't emit ct event if labels were not changed
      netfilter: connlabels: change nf_connlabels_get bit arg to 'highest used'
      netfilter: conntrack: don't acquire lock during seq_printf

Pablo Neira Ayuso (2):
      netfilter: ctnetlink: remove unnecessary inlining
      netfilter: ctnetlink: restore inlining for netlink message size calculation

 include/linux/netfilter/x_tables.h          |  12 +-
 include/net/netfilter/nf_conntrack_labels.h |   5 +-
 net/ipv4/netfilter/arp_tables.c             | 303 ++++++++------------------
 net/ipv4/netfilter/ip_tables.c              | 327 ++++++++--------------------
 net/ipv6/netfilter/ip6_tables.c             | 320 +++++++--------------------
 net/netfilter/nf_conntrack_labels.c         |  44 ++--
 net/netfilter/nf_conntrack_netlink.c        | 119 +++++-----
 net/netfilter/nf_conntrack_proto_sctp.c     |   8 +-
 net/netfilter/nf_conntrack_proto_tcp.c      |   8 +-
 net/netfilter/nft_ct.c                      |   2 +
 net/netfilter/x_tables.c                    | 245 ++++++++++++++++++++-
 net/netfilter/xt_connlabel.c                |  14 +-
 net/openvswitch/conntrack.c                 |   2 +-
 13 files changed, 591 insertions(+), 818 deletions(-)

^ permalink raw reply

* [PATCH 02/23] netfilter: x_tables: validate targets of jumps
From: Pablo Neira Ayuso @ 2016-04-22 13:39 UTC (permalink / raw)
  To: netfilter-devel; +Cc: davem, netdev
In-Reply-To: <1461332394-3994-1-git-send-email-pablo@netfilter.org>

From: Florian Westphal <fw@strlen.de>

When we see a jump also check that the offset gets us to beginning of
a rule (an ipt_entry).

The extra overhead is negible, even with absurd cases.

300k custom rules, 300k jumps to 'next' user chain:
[ plus one jump from INPUT to first userchain ]:

Before:
real    0m24.874s
user    0m7.532s
sys     0m16.076s

After:
real    0m27.464s
user    0m7.436s
sys     0m18.840s

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/ipv4/netfilter/arp_tables.c | 16 ++++++++++++++++
 net/ipv4/netfilter/ip_tables.c  | 16 ++++++++++++++++
 net/ipv6/netfilter/ip6_tables.c | 16 ++++++++++++++++
 3 files changed, 48 insertions(+)

diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c
index 82a434b..ec37f7c 100644
--- a/net/ipv4/netfilter/arp_tables.c
+++ b/net/ipv4/netfilter/arp_tables.c
@@ -367,6 +367,18 @@ static inline bool unconditional(const struct arpt_entry *e)
 	       memcmp(&e->arp, &uncond, sizeof(uncond)) == 0;
 }
 
+static bool find_jump_target(const struct xt_table_info *t,
+			     const struct arpt_entry *target)
+{
+	struct arpt_entry *iter;
+
+	xt_entry_foreach(iter, t->entries, t->size) {
+		 if (iter == target)
+			return true;
+	}
+	return false;
+}
+
 /* Figures out from what hook each rule can be called: returns 0 if
  * there are loops.  Puts hook bitmask in comefrom.
  */
@@ -460,6 +472,10 @@ static int mark_source_chains(const struct xt_table_info *newinfo,
 					/* This a jump; chase it. */
 					duprintf("Jump rule %u -> %u\n",
 						 pos, newpos);
+					e = (struct arpt_entry *)
+						(entry0 + newpos);
+					if (!find_jump_target(newinfo, e))
+						return 0;
 				} else {
 					/* ... this is a fallthru */
 					newpos = pos + e->next_offset;
diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c
index e301a3d..503038e 100644
--- a/net/ipv4/netfilter/ip_tables.c
+++ b/net/ipv4/netfilter/ip_tables.c
@@ -443,6 +443,18 @@ ipt_do_table(struct sk_buff *skb,
 #endif
 }
 
+static bool find_jump_target(const struct xt_table_info *t,
+			     const struct ipt_entry *target)
+{
+	struct ipt_entry *iter;
+
+	xt_entry_foreach(iter, t->entries, t->size) {
+		 if (iter == target)
+			return true;
+	}
+	return false;
+}
+
 /* Figures out from what hook each rule can be called: returns 0 if
    there are loops.  Puts hook bitmask in comefrom. */
 static int
@@ -540,6 +552,10 @@ mark_source_chains(const struct xt_table_info *newinfo,
 					/* This a jump; chase it. */
 					duprintf("Jump rule %u -> %u\n",
 						 pos, newpos);
+					e = (struct ipt_entry *)
+						(entry0 + newpos);
+					if (!find_jump_target(newinfo, e))
+						return 0;
 				} else {
 					/* ... this is a fallthru */
 					newpos = pos + e->next_offset;
diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c
index 7b3335b..126f2a0 100644
--- a/net/ipv6/netfilter/ip6_tables.c
+++ b/net/ipv6/netfilter/ip6_tables.c
@@ -455,6 +455,18 @@ ip6t_do_table(struct sk_buff *skb,
 #endif
 }
 
+static bool find_jump_target(const struct xt_table_info *t,
+			     const struct ip6t_entry *target)
+{
+	struct ip6t_entry *iter;
+
+	xt_entry_foreach(iter, t->entries, t->size) {
+		 if (iter == target)
+			return true;
+	}
+	return false;
+}
+
 /* Figures out from what hook each rule can be called: returns 0 if
    there are loops.  Puts hook bitmask in comefrom. */
 static int
@@ -552,6 +564,10 @@ mark_source_chains(const struct xt_table_info *newinfo,
 					/* This a jump; chase it. */
 					duprintf("Jump rule %u -> %u\n",
 						 pos, newpos);
+					e = (struct ip6t_entry *)
+						(entry0 + newpos);
+					if (!find_jump_target(newinfo, e))
+						return 0;
 				} else {
 					/* ... this is a fallthru */
 					newpos = pos + e->next_offset;
-- 
2.1.4

^ permalink raw reply related

* [PATCH 01/23] netfilter: x_tables: don't move to non-existent next rule
From: Pablo Neira Ayuso @ 2016-04-22 13:39 UTC (permalink / raw)
  To: netfilter-devel; +Cc: davem, netdev
In-Reply-To: <1461332394-3994-1-git-send-email-pablo@netfilter.org>

From: Florian Westphal <fw@strlen.de>

Ben Hawkes says:

 In the mark_source_chains function (net/ipv4/netfilter/ip_tables.c) it
 is possible for a user-supplied ipt_entry structure to have a large
 next_offset field. This field is not bounds checked prior to writing a
 counter value at the supplied offset.

Base chains enforce absolute verdict.

User defined chains are supposed to end with an unconditional return,
xtables userspace adds them automatically.

But if such return is missing we will move to non-existent next rule.

Reported-by: Ben Hawkes <hawkes@google.com>
Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/ipv4/netfilter/arp_tables.c | 8 +++++---
 net/ipv4/netfilter/ip_tables.c  | 4 ++++
 net/ipv6/netfilter/ip6_tables.c | 4 ++++
 3 files changed, 13 insertions(+), 3 deletions(-)

diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c
index 4133b0f..82a434b 100644
--- a/net/ipv4/netfilter/arp_tables.c
+++ b/net/ipv4/netfilter/arp_tables.c
@@ -439,6 +439,8 @@ static int mark_source_chains(const struct xt_table_info *newinfo,
 				size = e->next_offset;
 				e = (struct arpt_entry *)
 					(entry0 + pos + size);
+				if (pos + size >= newinfo->size)
+					return 0;
 				e->counters.pcnt = pos;
 				pos += size;
 			} else {
@@ -461,6 +463,8 @@ static int mark_source_chains(const struct xt_table_info *newinfo,
 				} else {
 					/* ... this is a fallthru */
 					newpos = pos + e->next_offset;
+					if (newpos >= newinfo->size)
+						return 0;
 				}
 				e = (struct arpt_entry *)
 					(entry0 + newpos);
@@ -691,10 +695,8 @@ static int translate_table(struct xt_table_info *newinfo, void *entry0,
 		}
 	}
 
-	if (!mark_source_chains(newinfo, repl->valid_hooks, entry0)) {
-		duprintf("Looping hook\n");
+	if (!mark_source_chains(newinfo, repl->valid_hooks, entry0))
 		return -ELOOP;
-	}
 
 	/* Finally, each sanity check must pass */
 	i = 0;
diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c
index 631c100..e301a3d 100644
--- a/net/ipv4/netfilter/ip_tables.c
+++ b/net/ipv4/netfilter/ip_tables.c
@@ -520,6 +520,8 @@ mark_source_chains(const struct xt_table_info *newinfo,
 				size = e->next_offset;
 				e = (struct ipt_entry *)
 					(entry0 + pos + size);
+				if (pos + size >= newinfo->size)
+					return 0;
 				e->counters.pcnt = pos;
 				pos += size;
 			} else {
@@ -541,6 +543,8 @@ mark_source_chains(const struct xt_table_info *newinfo,
 				} else {
 					/* ... this is a fallthru */
 					newpos = pos + e->next_offset;
+					if (newpos >= newinfo->size)
+						return 0;
 				}
 				e = (struct ipt_entry *)
 					(entry0 + newpos);
diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c
index 86b67b7..7b3335b 100644
--- a/net/ipv6/netfilter/ip6_tables.c
+++ b/net/ipv6/netfilter/ip6_tables.c
@@ -532,6 +532,8 @@ mark_source_chains(const struct xt_table_info *newinfo,
 				size = e->next_offset;
 				e = (struct ip6t_entry *)
 					(entry0 + pos + size);
+				if (pos + size >= newinfo->size)
+					return 0;
 				e->counters.pcnt = pos;
 				pos += size;
 			} else {
@@ -553,6 +555,8 @@ mark_source_chains(const struct xt_table_info *newinfo,
 				} else {
 					/* ... this is a fallthru */
 					newpos = pos + e->next_offset;
+					if (newpos >= newinfo->size)
+						return 0;
 				}
 				e = (struct ip6t_entry *)
 					(entry0 + newpos);
-- 
2.1.4

^ permalink raw reply related

* [PATCH 03/23] netfilter: x_tables: add and use xt_check_entry_offsets
From: Pablo Neira Ayuso @ 2016-04-22 13:39 UTC (permalink / raw)
  To: netfilter-devel; +Cc: davem, netdev
In-Reply-To: <1461332394-3994-1-git-send-email-pablo@netfilter.org>

From: Florian Westphal <fw@strlen.de>

Currently arp/ip and ip6tables each implement a short helper to check that
the target offset is large enough to hold one xt_entry_target struct and
that t->u.target_size fits within the current rule.

Unfortunately these checks are not sufficient.

To avoid adding new tests to all of ip/ip6/arptables move the current
checks into a helper, then extend this helper in followup patches.

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/linux/netfilter/x_tables.h |  4 ++++
 net/ipv4/netfilter/arp_tables.c    | 11 +----------
 net/ipv4/netfilter/ip_tables.c     | 12 +-----------
 net/ipv6/netfilter/ip6_tables.c    | 12 +-----------
 net/netfilter/x_tables.c           | 34 ++++++++++++++++++++++++++++++++++
 5 files changed, 41 insertions(+), 32 deletions(-)

diff --git a/include/linux/netfilter/x_tables.h b/include/linux/netfilter/x_tables.h
index 80a305b..0de0862 100644
--- a/include/linux/netfilter/x_tables.h
+++ b/include/linux/netfilter/x_tables.h
@@ -242,6 +242,10 @@ void xt_unregister_match(struct xt_match *target);
 int xt_register_matches(struct xt_match *match, unsigned int n);
 void xt_unregister_matches(struct xt_match *match, unsigned int n);
 
+int xt_check_entry_offsets(const void *base,
+			   unsigned int target_offset,
+			   unsigned int next_offset);
+
 int xt_check_match(struct xt_mtchk_param *, unsigned int size, u_int8_t proto,
 		   bool inv_proto);
 int xt_check_target(struct xt_tgchk_param *, unsigned int size, u_int8_t proto,
diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c
index ec37f7c..74668c1 100644
--- a/net/ipv4/netfilter/arp_tables.c
+++ b/net/ipv4/netfilter/arp_tables.c
@@ -496,19 +496,10 @@ next:
 
 static inline int check_entry(const struct arpt_entry *e)
 {
-	const struct xt_entry_target *t;
-
 	if (!arp_checkentry(&e->arp))
 		return -EINVAL;
 
-	if (e->target_offset + sizeof(struct xt_entry_target) > e->next_offset)
-		return -EINVAL;
-
-	t = arpt_get_target_c(e);
-	if (e->target_offset + t->u.target_size > e->next_offset)
-		return -EINVAL;
-
-	return 0;
+	return xt_check_entry_offsets(e, e->target_offset, e->next_offset);
 }
 
 static inline int check_target(struct arpt_entry *e, const char *name)
diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c
index 503038e..71c204d 100644
--- a/net/ipv4/netfilter/ip_tables.c
+++ b/net/ipv4/netfilter/ip_tables.c
@@ -590,20 +590,10 @@ static void cleanup_match(struct xt_entry_match *m, struct net *net)
 static int
 check_entry(const struct ipt_entry *e)
 {
-	const struct xt_entry_target *t;
-
 	if (!ip_checkentry(&e->ip))
 		return -EINVAL;
 
-	if (e->target_offset + sizeof(struct xt_entry_target) >
-	    e->next_offset)
-		return -EINVAL;
-
-	t = ipt_get_target_c(e);
-	if (e->target_offset + t->u.target_size > e->next_offset)
-		return -EINVAL;
-
-	return 0;
+	return xt_check_entry_offsets(e, e->target_offset, e->next_offset);
 }
 
 static int
diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c
index 126f2a0..24ae7f4 100644
--- a/net/ipv6/netfilter/ip6_tables.c
+++ b/net/ipv6/netfilter/ip6_tables.c
@@ -602,20 +602,10 @@ static void cleanup_match(struct xt_entry_match *m, struct net *net)
 static int
 check_entry(const struct ip6t_entry *e)
 {
-	const struct xt_entry_target *t;
-
 	if (!ip6_checkentry(&e->ipv6))
 		return -EINVAL;
 
-	if (e->target_offset + sizeof(struct xt_entry_target) >
-	    e->next_offset)
-		return -EINVAL;
-
-	t = ip6t_get_target_c(e);
-	if (e->target_offset + t->u.target_size > e->next_offset)
-		return -EINVAL;
-
-	return 0;
+	return xt_check_entry_offsets(e, e->target_offset, e->next_offset);
 }
 
 static int check_match(struct xt_entry_match *m, struct xt_mtchk_param *par)
diff --git a/net/netfilter/x_tables.c b/net/netfilter/x_tables.c
index 582c9cf..1f44bfa 100644
--- a/net/netfilter/x_tables.c
+++ b/net/netfilter/x_tables.c
@@ -541,6 +541,40 @@ int xt_compat_match_to_user(const struct xt_entry_match *m,
 EXPORT_SYMBOL_GPL(xt_compat_match_to_user);
 #endif /* CONFIG_COMPAT */
 
+/**
+ * xt_check_entry_offsets - validate arp/ip/ip6t_entry
+ *
+ * @base: pointer to arp/ip/ip6t_entry
+ * @target_offset: the arp/ip/ip6_t->target_offset
+ * @next_offset: the arp/ip/ip6_t->next_offset
+ *
+ * validates that target_offset and next_offset are sane.
+ *
+ * The arp/ip/ip6t_entry structure @base must have passed following tests:
+ * - it must point to a valid memory location
+ * - base to base + next_offset must be accessible, i.e. not exceed allocated
+ *   length.
+ *
+ * Return: 0 on success, negative errno on failure.
+ */
+int xt_check_entry_offsets(const void *base,
+			   unsigned int target_offset,
+			   unsigned int next_offset)
+{
+	const struct xt_entry_target *t;
+	const char *e = base;
+
+	if (target_offset + sizeof(*t) > next_offset)
+		return -EINVAL;
+
+	t = (void *)(e + target_offset);
+	if (target_offset + t->u.target_size > next_offset)
+		return -EINVAL;
+
+	return 0;
+}
+EXPORT_SYMBOL(xt_check_entry_offsets);
+
 int xt_check_target(struct xt_tgchk_param *par,
 		    unsigned int size, u_int8_t proto, bool inv_proto)
 {
-- 
2.1.4

^ permalink raw reply related

* [PATCH 04/23] netfilter: x_tables: kill check_entry helper
From: Pablo Neira Ayuso @ 2016-04-22 13:39 UTC (permalink / raw)
  To: netfilter-devel; +Cc: davem, netdev
In-Reply-To: <1461332394-3994-1-git-send-email-pablo@netfilter.org>

From: Florian Westphal <fw@strlen.de>

Once we add more sanity testing to xt_check_entry_offsets it
becomes relvant if we're expecting a 32bit 'config_compat' blob
or a normal one.

Since we already have a lot of similar-named functions (check_entry,
compat_check_entry, find_and_check_entry, etc.) and the current
incarnation is short just fold its contents into the callers.

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/ipv4/netfilter/arp_tables.c | 19 ++++++++-----------
 net/ipv4/netfilter/ip_tables.c  | 20 ++++++++------------
 net/ipv6/netfilter/ip6_tables.c | 20 ++++++++------------
 3 files changed, 24 insertions(+), 35 deletions(-)

diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c
index 74668c1..24ad92a 100644
--- a/net/ipv4/netfilter/arp_tables.c
+++ b/net/ipv4/netfilter/arp_tables.c
@@ -494,14 +494,6 @@ next:
 	return 1;
 }
 
-static inline int check_entry(const struct arpt_entry *e)
-{
-	if (!arp_checkentry(&e->arp))
-		return -EINVAL;
-
-	return xt_check_entry_offsets(e, e->target_offset, e->next_offset);
-}
-
 static inline int check_target(struct arpt_entry *e, const char *name)
 {
 	struct xt_entry_target *t = arpt_get_target(e);
@@ -597,7 +589,10 @@ static inline int check_entry_size_and_hooks(struct arpt_entry *e,
 		return -EINVAL;
 	}
 
-	err = check_entry(e);
+	if (!arp_checkentry(&e->arp))
+		return -EINVAL;
+
+	err = xt_check_entry_offsets(e, e->target_offset, e->next_offset);
 	if (err)
 		return err;
 
@@ -1256,8 +1251,10 @@ check_compat_entry_size_and_hooks(struct compat_arpt_entry *e,
 		return -EINVAL;
 	}
 
-	/* For purposes of check_entry casting the compat entry is fine */
-	ret = check_entry((struct arpt_entry *)e);
+	if (!arp_checkentry(&e->arp))
+		return -EINVAL;
+
+	ret = xt_check_entry_offsets(e, e->target_offset, e->next_offset);
 	if (ret)
 		return ret;
 
diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c
index 71c204d..cdf1850 100644
--- a/net/ipv4/netfilter/ip_tables.c
+++ b/net/ipv4/netfilter/ip_tables.c
@@ -588,15 +588,6 @@ static void cleanup_match(struct xt_entry_match *m, struct net *net)
 }
 
 static int
-check_entry(const struct ipt_entry *e)
-{
-	if (!ip_checkentry(&e->ip))
-		return -EINVAL;
-
-	return xt_check_entry_offsets(e, e->target_offset, e->next_offset);
-}
-
-static int
 check_match(struct xt_entry_match *m, struct xt_mtchk_param *par)
 {
 	const struct ipt_ip *ip = par->entryinfo;
@@ -760,7 +751,10 @@ check_entry_size_and_hooks(struct ipt_entry *e,
 		return -EINVAL;
 	}
 
-	err = check_entry(e);
+	if (!ip_checkentry(&e->ip))
+		return -EINVAL;
+
+	err = xt_check_entry_offsets(e, e->target_offset, e->next_offset);
 	if (err)
 		return err;
 
@@ -1516,8 +1510,10 @@ check_compat_entry_size_and_hooks(struct compat_ipt_entry *e,
 		return -EINVAL;
 	}
 
-	/* For purposes of check_entry casting the compat entry is fine */
-	ret = check_entry((struct ipt_entry *)e);
+	if (!ip_checkentry(&e->ip))
+		return -EINVAL;
+
+	ret = xt_check_entry_offsets(e, e->target_offset, e->next_offset);
 	if (ret)
 		return ret;
 
diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c
index 24ae7f4..e378311 100644
--- a/net/ipv6/netfilter/ip6_tables.c
+++ b/net/ipv6/netfilter/ip6_tables.c
@@ -599,15 +599,6 @@ static void cleanup_match(struct xt_entry_match *m, struct net *net)
 	module_put(par.match->me);
 }
 
-static int
-check_entry(const struct ip6t_entry *e)
-{
-	if (!ip6_checkentry(&e->ipv6))
-		return -EINVAL;
-
-	return xt_check_entry_offsets(e, e->target_offset, e->next_offset);
-}
-
 static int check_match(struct xt_entry_match *m, struct xt_mtchk_param *par)
 {
 	const struct ip6t_ip6 *ipv6 = par->entryinfo;
@@ -772,7 +763,10 @@ check_entry_size_and_hooks(struct ip6t_entry *e,
 		return -EINVAL;
 	}
 
-	err = check_entry(e);
+	if (!ip6_checkentry(&e->ipv6))
+		return -EINVAL;
+
+	err = xt_check_entry_offsets(e, e->target_offset, e->next_offset);
 	if (err)
 		return err;
 
@@ -1528,8 +1522,10 @@ check_compat_entry_size_and_hooks(struct compat_ip6t_entry *e,
 		return -EINVAL;
 	}
 
-	/* For purposes of check_entry casting the compat entry is fine */
-	ret = check_entry((struct ip6t_entry *)e);
+	if (!ip6_checkentry(&e->ipv6))
+		return -EINVAL;
+
+	ret = xt_check_entry_offsets(e, e->target_offset, e->next_offset);
 	if (ret)
 		return ret;
 
-- 
2.1.4

^ permalink raw reply related

* [PATCH 05/23] netfilter: x_tables: assert minimum target size
From: Pablo Neira Ayuso @ 2016-04-22 13:39 UTC (permalink / raw)
  To: netfilter-devel; +Cc: davem, netdev
In-Reply-To: <1461332394-3994-1-git-send-email-pablo@netfilter.org>

From: Florian Westphal <fw@strlen.de>

The target size includes the size of the xt_entry_target struct.

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/x_tables.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/net/netfilter/x_tables.c b/net/netfilter/x_tables.c
index 1f44bfa..ec1b718 100644
--- a/net/netfilter/x_tables.c
+++ b/net/netfilter/x_tables.c
@@ -568,6 +568,9 @@ int xt_check_entry_offsets(const void *base,
 		return -EINVAL;
 
 	t = (void *)(e + target_offset);
+	if (t->u.target_size < sizeof(*t))
+		return -EINVAL;
+
 	if (target_offset + t->u.target_size > next_offset)
 		return -EINVAL;
 
-- 
2.1.4

^ permalink raw reply related

* [PATCH 08/23] netfilter: x_tables: check for bogus target offset
From: Pablo Neira Ayuso @ 2016-04-22 13:39 UTC (permalink / raw)
  To: netfilter-devel; +Cc: davem, netdev
In-Reply-To: <1461332394-3994-1-git-send-email-pablo@netfilter.org>

From: Florian Westphal <fw@strlen.de>

We're currently asserting that targetoff + targetsize <= nextoff.

Extend it to also check that targetoff is >= sizeof(xt_entry).
Since this is generic code, add an argument pointing to the start of the
match/target, we can then derive the base structure size from the delta.

We also need the e->elems pointer in a followup change to validate matches.

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/linux/netfilter/x_tables.h |  4 ++--
 net/ipv4/netfilter/arp_tables.c    |  5 +++--
 net/ipv4/netfilter/ip_tables.c     |  5 +++--
 net/ipv6/netfilter/ip6_tables.c    |  5 +++--
 net/netfilter/x_tables.c           | 17 +++++++++++++++--
 5 files changed, 26 insertions(+), 10 deletions(-)

diff --git a/include/linux/netfilter/x_tables.h b/include/linux/netfilter/x_tables.h
index 08de48b..30cfb1e 100644
--- a/include/linux/netfilter/x_tables.h
+++ b/include/linux/netfilter/x_tables.h
@@ -242,7 +242,7 @@ void xt_unregister_match(struct xt_match *target);
 int xt_register_matches(struct xt_match *match, unsigned int n);
 void xt_unregister_matches(struct xt_match *match, unsigned int n);
 
-int xt_check_entry_offsets(const void *base,
+int xt_check_entry_offsets(const void *base, const char *elems,
 			   unsigned int target_offset,
 			   unsigned int next_offset);
 
@@ -494,7 +494,7 @@ void xt_compat_target_from_user(struct xt_entry_target *t, void **dstptr,
 				unsigned int *size);
 int xt_compat_target_to_user(const struct xt_entry_target *t,
 			     void __user **dstptr, unsigned int *size);
-int xt_compat_check_entry_offsets(const void *base,
+int xt_compat_check_entry_offsets(const void *base, const char *elems,
 				  unsigned int target_offset,
 				  unsigned int next_offset);
 
diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c
index ab8952a..95ed4e4 100644
--- a/net/ipv4/netfilter/arp_tables.c
+++ b/net/ipv4/netfilter/arp_tables.c
@@ -592,7 +592,8 @@ static inline int check_entry_size_and_hooks(struct arpt_entry *e,
 	if (!arp_checkentry(&e->arp))
 		return -EINVAL;
 
-	err = xt_check_entry_offsets(e, e->target_offset, e->next_offset);
+	err = xt_check_entry_offsets(e, e->elems, e->target_offset,
+				     e->next_offset);
 	if (err)
 		return err;
 
@@ -1254,7 +1255,7 @@ check_compat_entry_size_and_hooks(struct compat_arpt_entry *e,
 	if (!arp_checkentry(&e->arp))
 		return -EINVAL;
 
-	ret = xt_compat_check_entry_offsets(e, e->target_offset,
+	ret = xt_compat_check_entry_offsets(e, e->elems, e->target_offset,
 					    e->next_offset);
 	if (ret)
 		return ret;
diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c
index 7d24c87..baab033d 100644
--- a/net/ipv4/netfilter/ip_tables.c
+++ b/net/ipv4/netfilter/ip_tables.c
@@ -754,7 +754,8 @@ check_entry_size_and_hooks(struct ipt_entry *e,
 	if (!ip_checkentry(&e->ip))
 		return -EINVAL;
 
-	err = xt_check_entry_offsets(e, e->target_offset, e->next_offset);
+	err = xt_check_entry_offsets(e, e->elems, e->target_offset,
+				     e->next_offset);
 	if (err)
 		return err;
 
@@ -1513,7 +1514,7 @@ check_compat_entry_size_and_hooks(struct compat_ipt_entry *e,
 	if (!ip_checkentry(&e->ip))
 		return -EINVAL;
 
-	ret = xt_compat_check_entry_offsets(e,
+	ret = xt_compat_check_entry_offsets(e, e->elems,
 					    e->target_offset, e->next_offset);
 	if (ret)
 		return ret;
diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c
index 73eee7b..6957627 100644
--- a/net/ipv6/netfilter/ip6_tables.c
+++ b/net/ipv6/netfilter/ip6_tables.c
@@ -766,7 +766,8 @@ check_entry_size_and_hooks(struct ip6t_entry *e,
 	if (!ip6_checkentry(&e->ipv6))
 		return -EINVAL;
 
-	err = xt_check_entry_offsets(e, e->target_offset, e->next_offset);
+	err = xt_check_entry_offsets(e, e->elems, e->target_offset,
+				     e->next_offset);
 	if (err)
 		return err;
 
@@ -1525,7 +1526,7 @@ check_compat_entry_size_and_hooks(struct compat_ip6t_entry *e,
 	if (!ip6_checkentry(&e->ipv6))
 		return -EINVAL;
 
-	ret = xt_compat_check_entry_offsets(e,
+	ret = xt_compat_check_entry_offsets(e, e->elems,
 					    e->target_offset, e->next_offset);
 	if (ret)
 		return ret;
diff --git a/net/netfilter/x_tables.c b/net/netfilter/x_tables.c
index 1cb7a27..e2a6f2a 100644
--- a/net/netfilter/x_tables.c
+++ b/net/netfilter/x_tables.c
@@ -546,14 +546,17 @@ struct compat_xt_standard_target {
 	compat_uint_t verdict;
 };
 
-/* see xt_check_entry_offsets */
-int xt_compat_check_entry_offsets(const void *base,
+int xt_compat_check_entry_offsets(const void *base, const char *elems,
 				  unsigned int target_offset,
 				  unsigned int next_offset)
 {
+	long size_of_base_struct = elems - (const char *)base;
 	const struct compat_xt_entry_target *t;
 	const char *e = base;
 
+	if (target_offset < size_of_base_struct)
+		return -EINVAL;
+
 	if (target_offset + sizeof(*t) > next_offset)
 		return -EINVAL;
 
@@ -577,12 +580,16 @@ EXPORT_SYMBOL(xt_compat_check_entry_offsets);
  * xt_check_entry_offsets - validate arp/ip/ip6t_entry
  *
  * @base: pointer to arp/ip/ip6t_entry
+ * @elems: pointer to first xt_entry_match, i.e. ip(6)t_entry->elems
  * @target_offset: the arp/ip/ip6_t->target_offset
  * @next_offset: the arp/ip/ip6_t->next_offset
  *
  * validates that target_offset and next_offset are sane.
  * Also see xt_compat_check_entry_offsets for CONFIG_COMPAT version.
  *
+ * This function does not validate the targets or matches themselves, it
+ * only tests that all the offsets and sizes are correct.
+ *
  * The arp/ip/ip6t_entry structure @base must have passed following tests:
  * - it must point to a valid memory location
  * - base to base + next_offset must be accessible, i.e. not exceed allocated
@@ -591,12 +598,18 @@ EXPORT_SYMBOL(xt_compat_check_entry_offsets);
  * Return: 0 on success, negative errno on failure.
  */
 int xt_check_entry_offsets(const void *base,
+			   const char *elems,
 			   unsigned int target_offset,
 			   unsigned int next_offset)
 {
+	long size_of_base_struct = elems - (const char *)base;
 	const struct xt_entry_target *t;
 	const char *e = base;
 
+	/* target start is within the ip/ip6/arpt_entry struct */
+	if (target_offset < size_of_base_struct)
+		return -EINVAL;
+
 	if (target_offset + sizeof(*t) > next_offset)
 		return -EINVAL;
 
-- 
2.1.4

^ permalink raw reply related

* [PATCH 10/23] netfilter: ip_tables: simplify translate_compat_table args
From: Pablo Neira Ayuso @ 2016-04-22 13:39 UTC (permalink / raw)
  To: netfilter-devel; +Cc: davem, netdev
In-Reply-To: <1461332394-3994-1-git-send-email-pablo@netfilter.org>

From: Florian Westphal <fw@strlen.de>

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/ipv4/netfilter/ip_tables.c | 59 +++++++++++++++++-------------------------
 1 file changed, 24 insertions(+), 35 deletions(-)

diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c
index baab033d..d7041860 100644
--- a/net/ipv4/netfilter/ip_tables.c
+++ b/net/ipv4/netfilter/ip_tables.c
@@ -1449,7 +1449,6 @@ compat_copy_entry_to_user(struct ipt_entry *e, void __user **dstptr,
 
 static int
 compat_find_calc_match(struct xt_entry_match *m,
-		       const char *name,
 		       const struct ipt_ip *ip,
 		       int *size)
 {
@@ -1486,8 +1485,7 @@ check_compat_entry_size_and_hooks(struct compat_ipt_entry *e,
 				  const unsigned char *base,
 				  const unsigned char *limit,
 				  const unsigned int *hook_entries,
-				  const unsigned int *underflows,
-				  const char *name)
+				  const unsigned int *underflows)
 {
 	struct xt_entry_match *ematch;
 	struct xt_entry_target *t;
@@ -1523,7 +1521,7 @@ check_compat_entry_size_and_hooks(struct compat_ipt_entry *e,
 	entry_offset = (void *)e - (void *)base;
 	j = 0;
 	xt_ematch_foreach(ematch, e) {
-		ret = compat_find_calc_match(ematch, name, &e->ip, &off);
+		ret = compat_find_calc_match(ematch, &e->ip, &off);
 		if (ret != 0)
 			goto release_matches;
 		++j;
@@ -1572,7 +1570,7 @@ release_matches:
 
 static int
 compat_copy_entry_from_user(struct compat_ipt_entry *e, void **dstptr,
-			    unsigned int *size, const char *name,
+			    unsigned int *size,
 			    struct xt_table_info *newinfo, unsigned char *base)
 {
 	struct xt_entry_target *t;
@@ -1655,14 +1653,9 @@ compat_check_entry(struct ipt_entry *e, struct net *net, const char *name)
 
 static int
 translate_compat_table(struct net *net,
-		       const char *name,
-		       unsigned int valid_hooks,
 		       struct xt_table_info **pinfo,
 		       void **pentry0,
-		       unsigned int total_size,
-		       unsigned int number,
-		       unsigned int *hook_entries,
-		       unsigned int *underflows)
+		       const struct compat_ipt_replace *compatr)
 {
 	unsigned int i, j;
 	struct xt_table_info *newinfo, *info;
@@ -1674,8 +1667,8 @@ translate_compat_table(struct net *net,
 
 	info = *pinfo;
 	entry0 = *pentry0;
-	size = total_size;
-	info->number = number;
+	size = compatr->size;
+	info->number = compatr->num_entries;
 
 	/* Init all hooks to impossible value. */
 	for (i = 0; i < NF_INET_NUMHOOKS; i++) {
@@ -1686,40 +1679,39 @@ translate_compat_table(struct net *net,
 	duprintf("translate_compat_table: size %u\n", info->size);
 	j = 0;
 	xt_compat_lock(AF_INET);
-	xt_compat_init_offsets(AF_INET, number);
+	xt_compat_init_offsets(AF_INET, compatr->num_entries);
 	/* Walk through entries, checking offsets. */
-	xt_entry_foreach(iter0, entry0, total_size) {
+	xt_entry_foreach(iter0, entry0, compatr->size) {
 		ret = check_compat_entry_size_and_hooks(iter0, info, &size,
 							entry0,
-							entry0 + total_size,
-							hook_entries,
-							underflows,
-							name);
+							entry0 + compatr->size,
+							compatr->hook_entry,
+							compatr->underflow);
 		if (ret != 0)
 			goto out_unlock;
 		++j;
 	}
 
 	ret = -EINVAL;
-	if (j != number) {
+	if (j != compatr->num_entries) {
 		duprintf("translate_compat_table: %u not %u entries\n",
-			 j, number);
+			 j, compatr->num_entries);
 		goto out_unlock;
 	}
 
 	/* Check hooks all assigned */
 	for (i = 0; i < NF_INET_NUMHOOKS; i++) {
 		/* Only hooks which are valid */
-		if (!(valid_hooks & (1 << i)))
+		if (!(compatr->valid_hooks & (1 << i)))
 			continue;
 		if (info->hook_entry[i] == 0xFFFFFFFF) {
 			duprintf("Invalid hook entry %u %u\n",
-				 i, hook_entries[i]);
+				 i, info->hook_entry[i]);
 			goto out_unlock;
 		}
 		if (info->underflow[i] == 0xFFFFFFFF) {
 			duprintf("Invalid underflow %u %u\n",
-				 i, underflows[i]);
+				 i, info->underflow[i]);
 			goto out_unlock;
 		}
 	}
@@ -1729,17 +1721,17 @@ translate_compat_table(struct net *net,
 	if (!newinfo)
 		goto out_unlock;
 
-	newinfo->number = number;
+	newinfo->number = compatr->num_entries;
 	for (i = 0; i < NF_INET_NUMHOOKS; i++) {
 		newinfo->hook_entry[i] = info->hook_entry[i];
 		newinfo->underflow[i] = info->underflow[i];
 	}
 	entry1 = newinfo->entries;
 	pos = entry1;
-	size = total_size;
-	xt_entry_foreach(iter0, entry0, total_size) {
+	size = compatr->size;
+	xt_entry_foreach(iter0, entry0, compatr->size) {
 		ret = compat_copy_entry_from_user(iter0, &pos, &size,
-						  name, newinfo, entry1);
+						  newinfo, entry1);
 		if (ret != 0)
 			break;
 	}
@@ -1749,12 +1741,12 @@ translate_compat_table(struct net *net,
 		goto free_newinfo;
 
 	ret = -ELOOP;
-	if (!mark_source_chains(newinfo, valid_hooks, entry1))
+	if (!mark_source_chains(newinfo, compatr->valid_hooks, entry1))
 		goto free_newinfo;
 
 	i = 0;
 	xt_entry_foreach(iter1, entry1, newinfo->size) {
-		ret = compat_check_entry(iter1, net, name);
+		ret = compat_check_entry(iter1, net, compatr->name);
 		if (ret != 0)
 			break;
 		++i;
@@ -1794,7 +1786,7 @@ translate_compat_table(struct net *net,
 free_newinfo:
 	xt_free_table_info(newinfo);
 out:
-	xt_entry_foreach(iter0, entry0, total_size) {
+	xt_entry_foreach(iter0, entry0, compatr->size) {
 		if (j-- == 0)
 			break;
 		compat_release_entry(iter0);
@@ -1839,10 +1831,7 @@ compat_do_replace(struct net *net, void __user *user, unsigned int len)
 		goto free_newinfo;
 	}
 
-	ret = translate_compat_table(net, tmp.name, tmp.valid_hooks,
-				     &newinfo, &loc_cpu_entry, tmp.size,
-				     tmp.num_entries, tmp.hook_entry,
-				     tmp.underflow);
+	ret = translate_compat_table(net, &newinfo, &loc_cpu_entry, &tmp);
 	if (ret != 0)
 		goto free_newinfo;
 
-- 
2.1.4

^ permalink raw reply related


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox