netdev.vger.kernel.org archive mirror
* [PATCH net v2 0/2] net: sched: fix some issues
@ 2013-11-18  8:39 Yang Yingliang
  2013-11-18  8:39 ` [PATCH net v2 1/2] net: sched: tbf: fix calculation of max_size Yang Yingliang
  2013-11-18  8:39 ` [PATCH net v2 2/2] net: sched: htb: fix calculation of quantum Yang Yingliang
  0 siblings, 2 replies; 5+ messages in thread
From: Yang Yingliang @ 2013-11-18  8:39 UTC (permalink / raw)
  To: davem, netdev; +Cc: eric.dumazet, brouer, jpirko

Patch 1 fixes a regression introduced by commit b757c9336d63f94c6b57532
("tbf: improved accuracy at high rates").
Patch 2 fixes the quantum calculation, which was broken by the
introduction of 64-bit rates.

v2:
  patch 1/2: reworded the description of the regression
  patch 2/2: added Eric's ack

Yang Yingliang (2):
  net: sched: tbf: fix calculation of max_size
  net: sched: htb: fix calculation of quantum

 include/net/sch_generic.h | 46 ++++++++++++++++++++++++++++++++
 net/sched/sch_htb.c       | 18 +++++++------
 net/sched/sch_tbf.c       | 67 ++++++++++++++++++++++++++---------------------
 3 files changed, 93 insertions(+), 38 deletions(-)

-- 
1.8.0


* [PATCH net v2 1/2] net: sched: tbf: fix calculation of max_size
  2013-11-18  8:39 [PATCH net v2 0/2] net: sched: fix some issues Yang Yingliang
@ 2013-11-18  8:39 ` Yang Yingliang
  2013-11-18 12:32   ` Jesper Dangaard Brouer
  2013-11-18  8:39 ` [PATCH net v2 2/2] net: sched: htb: fix calculation of quantum Yang Yingliang
  1 sibling, 1 reply; 5+ messages in thread
From: Yang Yingliang @ 2013-11-18  8:39 UTC (permalink / raw)
  To: davem, netdev; +Cc: eric.dumazet, brouer, jpirko

Commit b757c9336d63f94c6b57532 ("tbf: improved accuracy at high rates")
introduced a regression.

With the following command:
tc qdisc add dev eth1 root handle 1: tbf latency 50ms burst 10KB rate 30gbit mtu 64k

Without this patch, max_size is computed as 10751 bytes.  In fact, the
real max_size must be smaller than 7440 bytes: a packet larger than
that can never accumulate enough tokens, even if it is given all the
tokens in the buffer, so it stalls the qdisc and causes network
congestion.

With this patch, max_size is 7440 bytes, and packets larger than that
are dropped or reshaped in tbf_enqueue().
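
A minimal userspace sketch (illustrative names, not kernel code) of the
invariant enforced here, using plain integer math; the kernel's
7440-byte figure differs because q->buffer additionally goes through
PSCHED tick and mult/shift rounding, which this sketch ignores:

  #include <stdint.h>
  #include <stdio.h>

  /* transmit time of len bytes at rate (bytes/sec), in ns */
  static uint64_t l2t_ns(uint64_t rate, uint64_t len)
  {
          return len * 1000000000ULL / rate;
  }

  int main(void)
  {
          uint64_t rate = 3750000000ULL;  /* 30gbit in bytes/sec */
          uint64_t buffer_ns = l2t_ns(rate, 10 * 1024);  /* burst 10KB */
          uint64_t len = 0;

          /* largest len whose transmit time still fits in the bucket */
          while (l2t_ns(rate, len + 1) <= buffer_ns)
                  len++;
          printf("ideal max_size = %llu bytes\n", (unsigned long long)len);
          return 0;
  }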

Signed-off-by: Yang Yingliang <yangyingliang@huawei.com>
---
 include/net/sch_generic.h | 46 ++++++++++++++++++++++++++++++++
 net/sched/sch_tbf.c       | 67 ++++++++++++++++++++++++++---------------------
 2 files changed, 83 insertions(+), 30 deletions(-)

diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h
index d0a6321..8da64f3 100644
--- a/include/net/sch_generic.h
+++ b/include/net/sch_generic.h
@@ -701,6 +701,52 @@ static inline u64 psched_l2t_ns(const struct psched_ratecfg *r,
 	return ((u64)len * r->mult) >> r->shift;
 }
 
+/* Time to Length, convert time in ns to length in bytes
+ * to determine how many bytes can be sent in the given time.
+ */
+static inline u64 psched_ns_t2l(const struct psched_ratecfg *r,
+				u64 time_in_ns)
+{
+	u64 len = time_in_ns;
+	u8 shift = r->shift;
+	bool is_div = false;
+
+	/* The formula is:
+	 * len = (time_in_ns << shift) / mult
+	 * Shifting time_in_ns left can overflow.
+	 * On the first overflow, do the division first,
+	 * then keep shifting.  If it overflows again,
+	 * set len to ~0ULL.
+	 */
+	while (shift) {
+		if (len & (1ULL << 63)) {
+			if (!is_div) {
+				len = div64_u64(len, r->mult);
+				is_div = true;
+			} else {
+				/* overflow happens */
+				len = ~0ULL;
+				is_div = true;
+				break;
+			}
+		}
+		len <<= 1;
+		shift--;
+	}
+	if (!is_div)
+		len = div64_u64(len, r->mult);
+
+	if (unlikely(r->linklayer == TC_LINKLAYER_ATM))
+		len = (len / 53) * 48;
+
+	if (len > r->overhead)
+		len -= r->overhead;
+	else
+		len = 0;
+
+	return len;
+}
+
 void psched_ratecfg_precompute(struct psched_ratecfg *r,
 			       const struct tc_ratespec *conf,
 			       u64 rate64);
diff --git a/net/sched/sch_tbf.c b/net/sched/sch_tbf.c
index 68f9859..eb9ce7b 100644
--- a/net/sched/sch_tbf.c
+++ b/net/sched/sch_tbf.c
@@ -279,7 +279,7 @@ static int tbf_change(struct Qdisc *sch, struct nlattr *opt)
 	struct qdisc_rate_table *rtab = NULL;
 	struct qdisc_rate_table *ptab = NULL;
 	struct Qdisc *child = NULL;
-	int max_size, n;
+	u32 max_size = 0;
 	u64 rate64 = 0, prate64 = 0;
 
 	err = nla_parse_nested(tb, TCA_TBF_MAX, opt, tbf_policy);
@@ -291,33 +291,20 @@ static int tbf_change(struct Qdisc *sch, struct nlattr *opt)
 		goto done;
 
 	qopt = nla_data(tb[TCA_TBF_PARMS]);
-	rtab = qdisc_get_rtab(&qopt->rate, tb[TCA_TBF_RTAB]);
-	if (rtab == NULL)
-		goto done;
-
-	if (qopt->peakrate.rate) {
-		if (qopt->peakrate.rate > qopt->rate.rate)
-			ptab = qdisc_get_rtab(&qopt->peakrate, tb[TCA_TBF_PTAB]);
-		if (ptab == NULL)
-			goto done;
+	if (qopt->rate.linklayer == TC_LINKLAYER_UNAWARE) {
+		rtab = qdisc_get_rtab(&qopt->rate, tb[TCA_TBF_RTAB]);
+		if (rtab) {
+			qdisc_put_rtab(rtab);
+			rtab = NULL;
+		}
 	}
-
-	for (n = 0; n < 256; n++)
-		if (rtab->data[n] > qopt->buffer)
-			break;
-	max_size = (n << qopt->rate.cell_log) - 1;
-	if (ptab) {
-		int size;
-
-		for (n = 0; n < 256; n++)
-			if (ptab->data[n] > qopt->mtu)
-				break;
-		size = (n << qopt->peakrate.cell_log) - 1;
-		if (size < max_size)
-			max_size = size;
+	if (qopt->peakrate.linklayer == TC_LINKLAYER_UNAWARE) {
+		ptab = qdisc_get_rtab(&qopt->peakrate, tb[TCA_TBF_PTAB]);
+		if (ptab) {
+			qdisc_put_rtab(ptab);
+			ptab = NULL;
+		}
 	}
-	if (max_size < 0)
-		goto done;
 
 	if (q->qdisc != &noop_qdisc) {
 		err = fifo_set_limit(q->qdisc, qopt->limit);
@@ -339,25 +326,45 @@ static int tbf_change(struct Qdisc *sch, struct nlattr *opt)
 	}
 	q->limit = qopt->limit;
 	q->mtu = PSCHED_TICKS2NS(qopt->mtu);
-	q->max_size = max_size;
 	q->buffer = PSCHED_TICKS2NS(qopt->buffer);
 	q->tokens = q->buffer;
 	q->ptokens = q->mtu;
 
 	if (tb[TCA_TBF_RATE64])
 		rate64 = nla_get_u64(tb[TCA_TBF_RATE64]);
-	psched_ratecfg_precompute(&q->rate, &rtab->rate, rate64);
-	if (ptab) {
+	psched_ratecfg_precompute(&q->rate, &qopt->rate, rate64);
+	if (!q->rate.rate_bytes_ps)
+		goto unlock_done;
+	max_size = min_t(u64, psched_ns_t2l(&q->rate, q->buffer), ~0);
+	max_size = min_t(u32, max_size, (256 << qopt->rate.cell_log) - 1);
+
+	if (qopt->peakrate.rate) {
+		u64 size = 0;
 		if (tb[TCA_TBF_PRATE64])
 			prate64 = nla_get_u64(tb[TCA_TBF_PRATE64]);
-		psched_ratecfg_precompute(&q->peak, &ptab->rate, prate64);
+		psched_ratecfg_precompute(&q->peak, &qopt->peakrate, prate64);
+		size = psched_ns_t2l(&q->peak, q->mtu);
+		max_size = min_t(u64, max_size, size);
+		max_size = min_t(u32,
+				 max_size,
+				 (256 << qopt->peakrate.cell_log) - 1);
 		q->peak_present = true;
 	} else {
 		q->peak_present = false;
 	}
 
+	if (!max_size)
+		goto unlock_done;
+	q->max_size = max_size;
+
 	sch_tree_unlock(sch);
 	err = 0;
+
+	if (0) {
+unlock_done:
+		sch_tree_unlock(sch);
+		err = -EINVAL;
+	}
 done:
 	if (rtab)
 		qdisc_put_rtab(rtab);
-- 
1.8.0


* [PATCH net v2 2/2] net: sched: htb: fix calculation of quantum
  2013-11-18  8:39 [PATCH net v2 0/2] net: sched: fix some issues Yang Yingliang
  2013-11-18  8:39 ` [PATCH net v2 1/2] net: sched: tbf: fix calculation of max_size Yang Yingliang
@ 2013-11-18  8:39 ` Yang Yingliang
  1 sibling, 0 replies; 5+ messages in thread
From: Yang Yingliang @ 2013-11-18  8:39 UTC (permalink / raw)
  To: davem, netdev; +Cc: eric.dumazet, brouer, jpirko

Now that rates can be 64 bits wide, the 32-bit rate may not be the
true rate.  So use rate_bytes_ps, which is taken from
max(rate32, rate64), to calculate the quantum.
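
As a minimal userspace sketch of the new computation (illustrative
names; in the kernel it is div64_u64() plus min_t(), as in the hunk
below):

  #include <stdint.h>
  #include <limits.h>
  #include <stdio.h>

  static int compute_quantum(uint64_t rate_bytes_ps, uint32_t rate2quantum)
  {
          uint64_t quantum = rate_bytes_ps / rate2quantum;

          /* cl->quantum is an int, so clamp like the patch does */
          return quantum > INT_MAX ? INT_MAX : (int)quantum;
  }

  int main(void)
  {
          /* 40gbit is 5e9 bytes/sec: too big for the legacy u32 rate
           * field, but handled fine by the 64-bit path */
          printf("quantum = %d\n", compute_quantum(5000000000ULL, 10));
          return 0;
  }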

Signed-off-by: Yang Yingliang <yangyingliang@huawei.com>
Acked-by: Eric Dumazet <edumazet@google.com>
---
 net/sched/sch_htb.c | 18 ++++++++++--------
 1 file changed, 10 insertions(+), 8 deletions(-)

diff --git a/net/sched/sch_htb.c b/net/sched/sch_htb.c
index 0e1e38b..57c6678 100644
--- a/net/sched/sch_htb.c
+++ b/net/sched/sch_htb.c
@@ -1477,11 +1477,20 @@ static int htb_change_class(struct Qdisc *sch, u32 classid,
 		sch_tree_lock(sch);
 	}
 
+	rate64 = tb[TCA_HTB_RATE64] ? nla_get_u64(tb[TCA_HTB_RATE64]) : 0;
+
+	ceil64 = tb[TCA_HTB_CEIL64] ? nla_get_u64(tb[TCA_HTB_CEIL64]) : 0;
+
+	psched_ratecfg_precompute(&cl->rate, &hopt->rate, rate64);
+	psched_ratecfg_precompute(&cl->ceil, &hopt->ceil, ceil64);
+
 	/* it used to be a nasty bug here, we have to check that node
 	 * is really leaf before changing cl->un.leaf !
 	 */
 	if (!cl->level) {
-		cl->quantum = hopt->rate.rate / q->rate2quantum;
+		u64 quantum = div64_u64(cl->rate.rate_bytes_ps,
+					q->rate2quantum);
+		cl->quantum = min_t(u64, quantum, INT_MAX);
 		if (!hopt->quantum && cl->quantum < 1000) {
 			pr_warning(
 			       "HTB: quantum of class %X is small. Consider r2q change.\n",
@@ -1500,13 +1509,6 @@ static int htb_change_class(struct Qdisc *sch, u32 classid,
 			cl->prio = TC_HTB_NUMPRIO - 1;
 	}
 
-	rate64 = tb[TCA_HTB_RATE64] ? nla_get_u64(tb[TCA_HTB_RATE64]) : 0;
-
-	ceil64 = tb[TCA_HTB_CEIL64] ? nla_get_u64(tb[TCA_HTB_CEIL64]) : 0;
-
-	psched_ratecfg_precompute(&cl->rate, &hopt->rate, rate64);
-	psched_ratecfg_precompute(&cl->ceil, &hopt->ceil, ceil64);
-
 	cl->buffer = PSCHED_TICKS2NS(hopt->buffer);
 	cl->cbuffer = PSCHED_TICKS2NS(hopt->cbuffer);
 
-- 
1.8.0


* Re: [PATCH net v2 1/2] net: sched: tbf: fix calculation of max_size
  2013-11-18  8:39 ` [PATCH net v2 1/2] net: sched: tbf: fix calculation of max_size Yang Yingliang
@ 2013-11-18 12:32   ` Jesper Dangaard Brouer
  2013-11-19  6:04     ` Yang Yingliang
  0 siblings, 1 reply; 5+ messages in thread
From: Jesper Dangaard Brouer @ 2013-11-18 12:32 UTC (permalink / raw)
  To: Yang Yingliang; +Cc: davem, netdev, eric.dumazet, brouer, jpirko

On Mon, 18 Nov 2013 16:39:23 +0800
Yang Yingliang <yangyingliang@huawei.com> wrote:

> Commit b757c9336d63f94c6b57532 ("tbf: improved accuracy at high rates")
> introduced a regression.
> 
> With the following command:
> tc qdisc add dev eth1 root handle 1: tbf latency 50ms burst 10KB rate 30gbit mtu 64k
> 
> Without this patch, max_size is computed as 10751 bytes.  In fact, the
> real max_size must be smaller than 7440 bytes: a packet larger than
> that can never accumulate enough tokens, even if it is given all the
> tokens in the buffer, so it stalls the qdisc and causes network
> congestion.
> 
> With this patch, max_size is 7440 bytes, and packets larger than that
> are dropped or reshaped in tbf_enqueue().


I acknowledge that TBF seems to have some dependencies on the
userspace-constructed rate table (which we do NOT use anymore in the
kernel).  And that these should be fixed.

But I'm not sure that your patch is the best solution... and the patch
also contains some issues; see the inlined comments.

The main annoying problem is *again* how the rate table system got
removed in the kernel, but nobody fixed userspace.


So, the main problem is that qopt->buffer (sent from userspace/tc) is
in a "time" format (the user inputs "burst" in bytes).  That used to
make sense, because the rate table used the same time format.

Now you are reversing this calculation of "q->buffer" (token burst)
back into bytes, so we can choose "max_size" (to avoid a problem in
tbf_dequeue()).

I don't like this converting back and forth; I'm worried about
rounding errors.

The easiest "hack" solution would be:

 for (n = 0; n < 65535; n++)
     if (psched_l2t_ns(&qopt->rate, n) > q->buffer)
         break;
 max_size = n;

Unfortunately we have to keep backward compat with iproute2/tc, but
IMHO it would be a lot easier if we could fix userspace and remove
all the length-to-time calculations, as they should now be the
responsibility of the kernel.  Well, wishful thinking...
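
For reference, that loop expands into a self-contained sketch like the
one below, with psched_l2t_ns() stood in by a plain division (in the
kernel it is the mult/shift form from include/net/sch_generic.h);
buffer_ns is only approximated from the example's 10KB burst at 30gbit:

  #include <stdint.h>
  #include <stdio.h>

  /* stand-in for psched_l2t_ns(): ns needed to send len bytes */
  static uint64_t l2t_ns(uint64_t rate_bytes_ps, unsigned int len)
  {
          return (uint64_t)len * 1000000000ULL / rate_bytes_ps;
  }

  int main(void)
  {
          uint64_t rate = 3750000000ULL;  /* 30gbit in bytes/sec */
          uint64_t buffer_ns = 2730;      /* ~10KB burst at that rate */
          unsigned int n;

          for (n = 0; n < 65535; n++)
                  if (l2t_ns(rate, n) > buffer_ns)
                          break;
          /* n is now the smallest length that no longer fits */
          printf("max_size = %u bytes\n", n);
          return 0;
  }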



> Signed-off-by: Yang Yingliang <yangyingliang@huawei.com>
> ---
>  include/net/sch_generic.h | 46 ++++++++++++++++++++++++++++++++
>  net/sched/sch_tbf.c       | 67 ++++++++++++++++++++++++++---------------------
>  2 files changed, 83 insertions(+), 30 deletions(-)
> 
> diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h
> index d0a6321..8da64f3 100644
> --- a/include/net/sch_generic.h
> +++ b/include/net/sch_generic.h
> @@ -701,6 +701,52 @@ static inline u64 psched_l2t_ns(const struct psched_ratecfg *r,
>  	return ((u64)len * r->mult) >> r->shift;
>  }
>  
> +/* Time to Length, convert time in ns to length in bytes
> + * to determine how many bytes can be sent in the given time.
> + */
> +static inline u64 psched_ns_t2l(const struct psched_ratecfg *r,
> +				u64 time_in_ns)
> +{
> +	u64 len = time_in_ns;
> +	u8 shift = r->shift;
> +	bool is_div = false;
> +
> +	/* The formula is:
> +	 * len = (time_in_ns << shift) / mult
> +	 * Shifting time_in_ns left can overflow.
> +	 * On the first overflow, do the division first,
> +	 * then keep shifting.  If it overflows again,
> +	 * set len to ~0ULL.
> +	 */
> +	while (shift) {
> +		if (len & (1ULL << 63)) {
> +			if (!is_div) {
> +				len = div64_u64(len, r->mult);
> +				is_div = true;
> +			} else {
> +				/* overflow happens */
> +				len = ~0ULL;
> +				is_div = true;
> +				break;
> +			}
> +		}
> +		len <<= 1;
> +		shift--;
> +	}
> +	if (!is_div)
> +		len = div64_u64(len, r->mult);
> +
> +	if (unlikely(r->linklayer == TC_LINKLAYER_ATM))
> +		len = (len / 53) * 48;
> +
> +	if (len > r->overhead)
> +		len -= r->overhead;
> +	else
> +		len = 0;
> +
> +	return len;
> +}
> +

Are we 100% sure that the conversion between psched_l2t_ns() and
psched_ns_t2l() is accurate for all possible rates?

E.g. why does r->shift have to be recalculated (it is originally set
up in psched_ratecfg_precompute())?
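
One way to probe that question from userspace: re-derive mult/shift
with a simplified analogue of psched_ratecfg_precompute() (assumption:
the real kernel logic differs in details) and round-trip a few lengths
through the two conversions.  Illustrative sketch, not kernel source:

  #include <stdint.h>
  #include <stdio.h>

  struct ratecfg { uint64_t rate_bytes_ps; uint32_t mult; uint8_t shift; };

  /* pick mult/shift so (len * mult) >> shift ~= len * 1e9 / rate */
  static void precompute(struct ratecfg *r, uint64_t rate)
  {
          uint64_t factor = 1000000000ULL;  /* NSEC_PER_SEC */

          r->rate_bytes_ps = rate;
          r->shift = 0;
          for (;;) {
                  r->mult = factor / rate;
                  if ((r->mult & (1U << 31)) || (factor & (1ULL << 63)))
                          break;
                  factor <<= 1;
                  r->shift++;
          }
  }

  static uint64_t l2t_ns(const struct ratecfg *r, uint64_t len)
  {
          return (len * r->mult) >> r->shift;
  }

  /* the proposed inverse, minus the overflow/linklayer handling */
  static uint64_t ns_t2l(const struct ratecfg *r, uint64_t ns)
  {
          return (ns << r->shift) / r->mult;
  }

  int main(void)
  {
          struct ratecfg r;
          uint64_t len;

          precompute(&r, 3750000000ULL);  /* 30gbit */
          for (len = 64; len <= 65536; len <<= 1)
                  printf("len=%llu -> %llu after round-trip\n",
                         (unsigned long long)len,
                         (unsigned long long)ns_t2l(&r, l2t_ns(&r, len)));
          return 0;
  }

(At such high rates the round-trip loses a few bytes to integer
truncation, which quantifies the rounding-error concern.)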


>  void psched_ratecfg_precompute(struct psched_ratecfg *r,
>  			       const struct tc_ratespec *conf,
>  			       u64 rate64);
> diff --git a/net/sched/sch_tbf.c b/net/sched/sch_tbf.c
> index 68f9859..eb9ce7b 100644
> --- a/net/sched/sch_tbf.c
> +++ b/net/sched/sch_tbf.c
> @@ -279,7 +279,7 @@ static int tbf_change(struct Qdisc *sch, struct nlattr *opt)
>  	struct qdisc_rate_table *rtab = NULL;
>  	struct qdisc_rate_table *ptab = NULL;
>  	struct Qdisc *child = NULL;
> -	int max_size, n;
> +	u32 max_size = 0;
>  	u64 rate64 = 0, prate64 = 0;
>  
>  	err = nla_parse_nested(tb, TCA_TBF_MAX, opt, tbf_policy);
> @@ -291,33 +291,20 @@ static int tbf_change(struct Qdisc *sch, struct nlattr *opt)
>  		goto done;
>  
>  	qopt = nla_data(tb[TCA_TBF_PARMS]);
> -	rtab = qdisc_get_rtab(&qopt->rate, tb[TCA_TBF_RTAB]);
> -	if (rtab == NULL)
> -		goto done;
> -
> -	if (qopt->peakrate.rate) {
> -		if (qopt->peakrate.rate > qopt->rate.rate)
> -			ptab = qdisc_get_rtab(&qopt->peakrate, tb[TCA_TBF_PTAB]);
> -		if (ptab == NULL)
> -			goto done;
> +	if (qopt->rate.linklayer == TC_LINKLAYER_UNAWARE) {
> +		rtab = qdisc_get_rtab(&qopt->rate, tb[TCA_TBF_RTAB]);
> +		if (rtab) {
> +			qdisc_put_rtab(rtab);
> +			rtab = NULL;
> +		}

This is the correct code construct for backward-compatible reading of
the rate table.

But notice how you free this at once, which means you should also
clean up the exit/done: section.



>  	}
> -
> -	for (n = 0; n < 256; n++)
> -		if (rtab->data[n] > qopt->buffer)
> -			break;
> -	max_size = (n << qopt->rate.cell_log) - 1;

This is where we could do the quick-and-dirty solution:

 for (n = 0; n < 65535; n++)
     if (psched_l2t_ns(&qopt->rate, n) > q->buffer)
         break;
 max_size = n;



> -	if (ptab) {
> -		int size;
> -
> -		for (n = 0; n < 256; n++)
> -			if (ptab->data[n] > qopt->mtu)
> -				break;
> -		size = (n << qopt->peakrate.cell_log) - 1;
> -		if (size < max_size)
> -			max_size = size;
> +	if (qopt->peakrate.linklayer == TC_LINKLAYER_UNAWARE) {
> +		ptab = qdisc_get_rtab(&qopt->peakrate, tb[TCA_TBF_PTAB]);
> +		if (ptab) {
> +			qdisc_put_rtab(ptab);
> +			ptab = NULL;
> +		}
>  	}
> -	if (max_size < 0)
> -		goto done;
>  
>  	if (q->qdisc != &noop_qdisc) {
>  		err = fifo_set_limit(q->qdisc, qopt->limit);
> @@ -339,25 +326,45 @@ static int tbf_change(struct Qdisc *sch, struct nlattr *opt)
>  	}
>  	q->limit = qopt->limit;
>  	q->mtu = PSCHED_TICKS2NS(qopt->mtu);
> -	q->max_size = max_size;
>  	q->buffer = PSCHED_TICKS2NS(qopt->buffer);
>  	q->tokens = q->buffer;
>  	q->ptokens = q->mtu;
>  
>  	if (tb[TCA_TBF_RATE64])
>  		rate64 = nla_get_u64(tb[TCA_TBF_RATE64]);
> -	psched_ratecfg_precompute(&q->rate, &rtab->rate, rate64);
> -	if (ptab) {
> +	psched_ratecfg_precompute(&q->rate, &qopt->rate, rate64);
> +	if (!q->rate.rate_bytes_ps)
> +		goto unlock_done;
> +	max_size = min_t(u64, psched_ns_t2l(&q->rate, q->buffer), ~0);
> +	max_size = min_t(u32, max_size, (256 << qopt->rate.cell_log) - 1);

The rate table system and rate.cell_log are not really used anymore,
so it's a bit strange to use them.  Perhaps it's even a bug to base a
calculation on this.

> +
> +	if (qopt->peakrate.rate) {
> +		u64 size = 0;
>  		if (tb[TCA_TBF_PRATE64])
>  			prate64 = nla_get_u64(tb[TCA_TBF_PRATE64]);
> -		psched_ratecfg_precompute(&q->peak, &ptab->rate, prate64);
> +		psched_ratecfg_precompute(&q->peak, &qopt->peakrate, prate64);
> +		size = psched_ns_t2l(&q->peak, q->mtu);
> +		max_size = min_t(u64, max_size, size);
> +		max_size = min_t(u32,
> +				 max_size,
> +				 (256 << qopt->peakrate.cell_log) - 1);
>  		q->peak_present = true;
>  	} else {
>  		q->peak_present = false;
>  	}
>  
> +	if (!max_size)
> +		goto unlock_done;
> +	q->max_size = max_size;
> +
>  	sch_tree_unlock(sch);
>  	err = 0;
> +
> +	if (0) {

I really dislike this construct.  I'm afraid of what a dumb compiler
would optimize this to.

> +unlock_done:
> +		sch_tree_unlock(sch);
> +		err = -EINVAL;
> +	}
>  done:
>  	if (rtab)
>  		qdisc_put_rtab(rtab);

This could be cleaned up, as we have already released the rtabs.


-- 
Best regards,
  Jesper Dangaard Brouer
  MSc.CS, Sr. Network Kernel Developer at Red Hat
  Author of http://www.iptv-analyzer.org
  LinkedIn: http://www.linkedin.com/in/brouer


* Re: [PATCH net v2 1/2] net: sched: tbf: fix calculation of max_size
  2013-11-18 12:32   ` Jesper Dangaard Brouer
@ 2013-11-19  6:04     ` Yang Yingliang
  0 siblings, 0 replies; 5+ messages in thread
From: Yang Yingliang @ 2013-11-19  6:04 UTC (permalink / raw)
  To: Jesper Dangaard Brouer; +Cc: davem, netdev, eric.dumazet, brouer, jpirko

On 2013/11/18 20:32, Jesper Dangaard Brouer wrote:
> On Mon, 18 Nov 2013 16:39:23 +0800
> Yang Yingliang <yangyingliang@huawei.com> wrote:
> 
>> Commit b757c9336d63f94c6b57532 ("tbf: improved accuracy at high rates")
>> introduced a regression.
>>
>> With the following command:
>> tc qdisc add dev eth1 root handle 1: tbf latency 50ms burst 10KB rate 30gbit mtu 64k
>>
>> Without this patch, max_size is computed as 10751 bytes.  In fact, the
>> real max_size must be smaller than 7440 bytes: a packet larger than
>> that can never accumulate enough tokens, even if it is given all the
>> tokens in the buffer, so it stalls the qdisc and causes network
>> congestion.
>>
>> With this patch, max_size is 7440 bytes, and packets larger than that
>> are dropped or reshaped in tbf_enqueue().
> 
> 
> I acknowledge that TBF seems to have some dependencies on the
> userspace-constructed rate table (which we do NOT use anymore in the
> kernel).  And that these should be fixed.
> 
> But I'm not sure that your patch is the best solution... and the patch
> also contains some issues; see the inlined comments.
> 
> The main annoying problem is *again* how the rate table system got
> removed in the kernel, but nobody fixed userspace.
> 
> 
> So, the main problem is that qopt->buffer (sent from userspace/tc) is
> in a "time" format (the user inputs "burst" in bytes).  That used to
> make sense, because the rate table used the same time format.
> 
> Now you are reversing this calculation of "q->buffer" (token burst)
> back into bytes, so we can choose "max_size" (to avoid a problem in
> tbf_dequeue()).
> 
> I don't like this converting back and forth; I'm worried about
> rounding errors.
> 
> The easiest "hack" solution would be:
> 
>  for (n = 0; n < 65535; n++)
>      if (psched_l2t_ns(&qopt->rate, n) > q->buffer)
>          break;
>  max_size = n;
> 
> Unfortunately we have to keep backward compat with iproute2/tc, but
> IMHO it would be a lot easier if we could fix userspace and remove
> all the length-to-time calculations, as they should now be the
> responsibility of the kernel.  Well, wishful thinking...
> 
> 
> 
>> Signed-off-by: Yang Yingliang <yangyingliang@huawei.com>
>> ---
>>  include/net/sch_generic.h | 46 ++++++++++++++++++++++++++++++++
>>  net/sched/sch_tbf.c       | 67 ++++++++++++++++++++++++++---------------------
>>  2 files changed, 83 insertions(+), 30 deletions(-)
>>
>> diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h
>> index d0a6321..8da64f3 100644
>> --- a/include/net/sch_generic.h
>> +++ b/include/net/sch_generic.h
>> @@ -701,6 +701,52 @@ static inline u64 psched_l2t_ns(const struct psched_ratecfg *r,
>>  	return ((u64)len * r->mult) >> r->shift;
>>  }
>>  
>> +/* Time to Length, convert time in ns to length in bytes
>> + * to determine how many bytes can be sent in the given time.
>> + */
>> +static inline u64 psched_ns_t2l(const struct psched_ratecfg *r,
>> +				u64 time_in_ns)
>> +{
>> +	u64 len = time_in_ns;
>> +	u8 shift = r->shift;
>> +	bool is_div = false;
>> +
>> +	/* The formula is:
>> +	 * len = (time_in_ns << shift) / mult
>> +	 * Shifting time_in_ns left can overflow.
>> +	 * On the first overflow, do the division first,
>> +	 * then keep shifting.  If it overflows again,
>> +	 * set len to ~0ULL.
>> +	 */
>> +	while (shift) {
>> +		if (len & (1ULL << 63)) {
>> +			if (!is_div) {
>> +				len = div64_u64(len, r->mult);
>> +				is_div = true;
>> +			} else {
>> +				/* overflow happens */
>> +				len = ~0ULL;
>> +				is_div = true;
>> +				break;
>> +			}
>> +		}
>> +		len <<= 1;
>> +		shift--;
>> +	}
>> +	if (!is_div)
>> +		len = div64_u64(len, r->mult);
>> +
>> +	if (unlikely(r->linklayer == TC_LINKLAYER_ATM))
>> +		len = (len / 53) * 48;
>> +
>> +	if (len > r->overhead)
>> +		len -= r->overhead;
>> +	else
>> +		len = 0;
>> +
>> +	return len;
>> +}
>> +
> 
> Are we 100% sure that the conversion between psched_l2t_ns() and
> psched_ns_t2l() is accurate for all possible rates?
> 
> E.g. why does r->shift have to be recalculated (it is originally set
> up in psched_ratecfg_precompute())?

It doesn't recalculate r->shift; it just shifts bit by bit, in case
len overflows.  If len overflows, it is set to ~0ULL.
I would like to use the way you suggest to calculate max_size,
so forget about this function. :)

> 
> 
>>  void psched_ratecfg_precompute(struct psched_ratecfg *r,
>>  			       const struct tc_ratespec *conf,
>>  			       u64 rate64);
>> diff --git a/net/sched/sch_tbf.c b/net/sched/sch_tbf.c
>> index 68f9859..eb9ce7b 100644
>> --- a/net/sched/sch_tbf.c
>> +++ b/net/sched/sch_tbf.c
>> @@ -279,7 +279,7 @@ static int tbf_change(struct Qdisc *sch, struct nlattr *opt)
>>  	struct qdisc_rate_table *rtab = NULL;
>>  	struct qdisc_rate_table *ptab = NULL;
>>  	struct Qdisc *child = NULL;
>> -	int max_size, n;
>> +	u32 max_size = 0;
>>  	u64 rate64 = 0, prate64 = 0;
>>  
>>  	err = nla_parse_nested(tb, TCA_TBF_MAX, opt, tbf_policy);
>> @@ -291,33 +291,20 @@ static int tbf_change(struct Qdisc *sch, struct nlattr *opt)
>>  		goto done;
>>  
>>  	qopt = nla_data(tb[TCA_TBF_PARMS]);
>> -	rtab = qdisc_get_rtab(&qopt->rate, tb[TCA_TBF_RTAB]);
>> -	if (rtab == NULL)
>> -		goto done;
>> -
>> -	if (qopt->peakrate.rate) {
>> -		if (qopt->peakrate.rate > qopt->rate.rate)
>> -			ptab = qdisc_get_rtab(&qopt->peakrate, tb[TCA_TBF_PTAB]);
>> -		if (ptab == NULL)
>> -			goto done;
>> +	if (qopt->rate.linklayer == TC_LINKLAYER_UNAWARE) {
>> +		rtab = qdisc_get_rtab(&qopt->rate, tb[TCA_TBF_RTAB]);
>> +		if (rtab) {
>> +			qdisc_put_rtab(rtab);
>> +			rtab = NULL;
>> +		}
> 
> This is the correct code construct for backward-compatible reading of
> the rate table.
> 
> But notice how you free this at once, which means you should also
> clean up the exit/done: section.

Agreed, I will change it in v3.
Thanks!

> 
> 
> 
>>  	}
>> -
>> -	for (n = 0; n < 256; n++)
>> -		if (rtab->data[n] > qopt->buffer)
>> -			break;
>> -	max_size = (n << qopt->rate.cell_log) - 1;
> 
> This is where we could do the quick-and-dirty solution:
> 
>  for (n = 0; n < 65535; n++)
>      if (psched_l2t_ns(&qopt->rate, n) > q->buffer)
>          break;
>  max_size = n;
> 
> 
> 
>> -	if (ptab) {
>> -		int size;
>> -
>> -		for (n = 0; n < 256; n++)
>> -			if (ptab->data[n] > qopt->mtu)
>> -				break;
>> -		size = (n << qopt->peakrate.cell_log) - 1;
>> -		if (size < max_size)
>> -			max_size = size;
>> +	if (qopt->peakrate.linklayer == TC_LINKLAYER_UNAWARE) {
>> +		ptab = qdisc_get_rtab(&qopt->peakrate, tb[TCA_TBF_PTAB]);
>> +		if (ptab) {
>> +			qdisc_put_rtab(ptab);
>> +			ptab = NULL;
>> +		}
>>  	}
>> -	if (max_size < 0)
>> -		goto done;
>>  
>>  	if (q->qdisc != &noop_qdisc) {
>>  		err = fifo_set_limit(q->qdisc, qopt->limit);
>> @@ -339,25 +326,45 @@ static int tbf_change(struct Qdisc *sch, struct nlattr *opt)
>>  	}
>>  	q->limit = qopt->limit;
>>  	q->mtu = PSCHED_TICKS2NS(qopt->mtu);
>> -	q->max_size = max_size;
>>  	q->buffer = PSCHED_TICKS2NS(qopt->buffer);
>>  	q->tokens = q->buffer;
>>  	q->ptokens = q->mtu;
>>  
>>  	if (tb[TCA_TBF_RATE64])
>>  		rate64 = nla_get_u64(tb[TCA_TBF_RATE64]);
>> -	psched_ratecfg_precompute(&q->rate, &rtab->rate, rate64);
>> -	if (ptab) {
>> +	psched_ratecfg_precompute(&q->rate, &qopt->rate, rate64);
>> +	if (!q->rate.rate_bytes_ps)
>> +		goto unlock_done;
>> +	max_size = min_t(u64, psched_ns_t2l(&q->rate, q->buffer), ~0);
>> +	max_size = min_t(u32, max_size, (256 << qopt->rate.cell_log) - 1);
> 
> The rate table system and rate.cell_log are not really used anymore,
> so it's a bit strange to use them.  Perhaps it's even a bug to base a
> calculation on this.

cell_log is calculated from mtu (user input in bytes) in userspace.
In the old calculation, max_size had to be smaller than
(256 << cell_log), which means it's smaller than the mtu.
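
For context, this is roughly how tc derives cell_log (modelled on the
rate-table setup in iproute2's tc_core.c; treat the exact code as an
assumption):

  /* smallest cell_log such that mtu fits in 256 rate-table slots */
  static int compute_cell_log(unsigned int mtu)
  {
          int cell_log = 0;

          if (mtu == 0)
                  mtu = 2047;  /* iproute2's historical default */
          while ((mtu >> cell_log) > 255)
                  cell_log++;
          return cell_log;
  }

With mtu 64k (65536) this gives cell_log 9, so the cap applied in the
max_size clamping above is (256 << 9) - 1 = 131071, on the same order
as the mtu.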

> 
>> +
>> +	if (qopt->peakrate.rate) {
>> +		u64 size = 0;
>>  		if (tb[TCA_TBF_PRATE64])
>>  			prate64 = nla_get_u64(tb[TCA_TBF_PRATE64]);
>> -		psched_ratecfg_precompute(&q->peak, &ptab->rate, prate64);
>> +		psched_ratecfg_precompute(&q->peak, &qopt->peakrate, prate64);
>> +		size = psched_ns_t2l(&q->peak, q->mtu);
>> +		max_size = min_t(u64, max_size, size);
>> +		max_size = min_t(u32,
>> +				 max_size,
>> +				 (256 << qopt->peakrate.cell_log) - 1);
>>  		q->peak_present = true;
>>  	} else {
>>  		q->peak_present = false;
>>  	}
>>  
>> +	if (!max_size)
>> +		goto unlock_done;
>> +	q->max_size = max_size;
>> +
>>  	sch_tree_unlock(sch);
>>  	err = 0;
>> +
>> +	if (0) {
> 
> I really dislike this construct.  I'm afraid of what a dumb compiler
> would optimize this to.

OK, I'll change it in v3.
Thanks!

> 
>> +unlock_done:
>> +		sch_tree_unlock(sch);
>> +		err = -EINVAL;
>> +	}
>>  done:
>>  	if (rtab)
>>  		qdisc_put_rtab(rtab);
> 
> This could be cleaned up, as we have already released the rtabs.
> 
> 

