* [PATCH] htb: improved accuracy at high rates
@ 2012-10-19 22:26 Vimalkumar
  2012-10-19 23:52 ` Rick Jones
                   ` (2 more replies)
  0 siblings, 3 replies; 8+ messages in thread
From: Vimalkumar @ 2012-10-19 22:26 UTC (permalink / raw)
  To: davem, eric.dumazet, Jamal Hadi Salim, netdev; +Cc: Vimalkumar

Current HTB (and TBF) uses a rate table computed by
the "tc" userspace program, which has the following
issue:

The rate table has 256 entries to map packet lengths
to tokens (time units).  With TSO-sized packets, the
256-entry granularity leads to loss/gain of rate,
making the token bucket inaccurate.
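
For illustration, a simplified model of that lookup (not the exact
kernel qdisc_l2t(), whose overflow handling differs):

static unsigned int l2t_rtab(const unsigned int data[256], int cell_log,
			     unsigned int pktlen)
{
	/* To cover 64KB TSO packets with only 256 slots, tc must use
	 * 256-byte cells (cell_log = 8), so e.g. 64-byte and 255-byte
	 * packets are charged the same transmission time. */
	unsigned int slot = pktlen >> cell_log;	/* quantize to a cell */

	if (slot > 255)
		slot = 255;			/* out of range: clamp */
	return data[slot];			/* time for that cell */
}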

Thus, instead of relying on the rate table, this patch
explicitly computes the time and accounts for packet
transmission times with nanosecond granularity.
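
For instance, at 1Gbit the cost of a packet is simply len * 8 ns:
512ns for a 64-byte packet, 12us for a 1500-byte one, both exact,
instead of each length being rounded to a 256-byte table cell (the
resolution needed for 256 slots to cover a 64KB mtu).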

This greatly improves accuracy of HTB with a wide
range of packet sizes.

Example:

tc qdisc add dev $dev root handle 1: \
	htb default 1

tc class add dev $dev classid 1:1 parent 1: \
	rate 1Gbit mtu 64k

Ideally it should shape to the configured 1Gbit for all
intermediate packet sizes as well, but with the rate table
it does not, as the test below shows.

Test:
for i in {1..20}; do
	(netperf -H $host -t UDP_STREAM -l 30 -- -m $size &);
done

With size=400 bytes: achieved rate ~600Mb/s
With size=1000 bytes: achieved rate ~835Mb/s
With size=8000 bytes: achieved rate ~1012Mb/s

With new HTB, in all cases, we achieve ~1000Mb/s.
Many thanks to Eric Dumazet for review and feedback.

Signed-off-by: Vimalkumar <j.vimal@gmail.com>
---
 sch_htb.c |  112 +++++++++++++++++++++++++++++++++++++++++--------------------
 1 files changed, 75 insertions(+), 37 deletions(-)

diff --git a/sch_htb.c b/sch_htb.c
index 29b942c..a808ba1 100644
--- a/sch_htb.c
+++ b/sch_htb.c
@@ -55,6 +55,7 @@
 
 static int htb_hysteresis __read_mostly = 0; /* whether to use mode hysteresis for speedup */
 #define HTB_VER 0x30011		/* major must be matched with number suplied by TC as version */
+#define HTB_MIN_PKT_BYTES (64)
 
 #if HTB_VER >> 16 != TC_HTB_PROTOVER
 #error "Mismatched sch_htb.c and pkt_sch.h"
@@ -71,6 +72,12 @@ enum htb_cmode {
 	HTB_CAN_SEND		/* class can send */
 };
 
+struct htb_rate_cfg {
+	u64 rate_bps;
+	u32 mult;
+	u32 shift;
+};
+
 /* interior & leaf nodes; props specific to leaves are marked L: */
 struct htb_class {
 	struct Qdisc_class_common common;
@@ -118,11 +125,11 @@ struct htb_class {
 	int filter_cnt;
 
 	/* token bucket parameters */
-	struct qdisc_rate_table *rate;	/* rate table of the class itself */
-	struct qdisc_rate_table *ceil;	/* ceiling rate (limits borrows too) */
-	long buffer, cbuffer;	/* token bucket depth/rate */
+	struct htb_rate_cfg rate;
+	struct htb_rate_cfg ceil;
+	s64 buffer, cbuffer;	/* token bucket depth/rate */
 	psched_tdiff_t mbuffer;	/* max wait time */
-	long tokens, ctokens;	/* current number of tokens */
+	s64 tokens, ctokens;	/* current number of tokens */
 	psched_time_t t_c;	/* checkpoint time */
 };
 
@@ -162,6 +169,30 @@ struct htb_sched {
 	struct work_struct work;
 };
 
+static u64 l2t_ns(struct htb_rate_cfg *r, unsigned int len)
+{
+	return ((u64)len * r->mult) >> r->shift;
+}
+
+static void htb_precompute_ratedata(struct htb_rate_cfg *r)
+{
+	u64 factor;
+	r->shift = 0;
+	r->mult = 1;
+	/*
+	 * Calibrate mult, shift so that token counting is accurate
+	 * for smallest packet size (64 bytes).  Token (time in ns) is
+	 * computed as (bytes * 8) * NSEC_PER_SEC / rate_bps.  It will
+	 * work as long as the smallest packet transfer time can be
+	 * accurately represented in nanosec.
+	 */
+	if (r->rate_bps > 0) {
+		r->shift = 15;
+		factor = 8LLU * NSEC_PER_SEC * (1 << r->shift);
+		r->mult = div64_u64(factor, r->rate_bps);
+	}
+}
+
 /* find class in global hash table using given handle */
 static inline struct htb_class *htb_find(u32 handle, struct Qdisc *sch)
 {
@@ -273,7 +304,7 @@ static void htb_add_to_id_tree(struct rb_root *root,
  * already in the queue.
  */
 static void htb_add_to_wait_tree(struct htb_sched *q,
-				 struct htb_class *cl, long delay)
+				 struct htb_class *cl, s64 delay)
 {
 	struct rb_node **p = &q->wait_pq[cl->level].rb_node, *parent = NULL;
 
@@ -441,14 +472,14 @@ static void htb_deactivate_prios(struct htb_sched *q, struct htb_class *cl)
 		htb_remove_class_from_row(q, cl, mask);
 }
 
-static inline long htb_lowater(const struct htb_class *cl)
+static inline s64 htb_lowater(const struct htb_class *cl)
 {
 	if (htb_hysteresis)
 		return cl->cmode != HTB_CANT_SEND ? -cl->cbuffer : 0;
 	else
 		return 0;
 }
-static inline long htb_hiwater(const struct htb_class *cl)
+static inline s64 htb_hiwater(const struct htb_class *cl)
 {
 	if (htb_hysteresis)
 		return cl->cmode == HTB_CAN_SEND ? -cl->buffer : 0;
@@ -469,9 +500,9 @@ static inline long htb_hiwater(const struct htb_class *cl)
  * mode transitions per time unit. The speed gain is about 1/6.
  */
 static inline enum htb_cmode
-htb_class_mode(struct htb_class *cl, long *diff)
+htb_class_mode(struct htb_class *cl, s64 *diff)
 {
-	long toks;
+	s64 toks;
 
 	if ((toks = (cl->ctokens + *diff)) < htb_lowater(cl)) {
 		*diff = -toks;
@@ -495,7 +526,7 @@ htb_class_mode(struct htb_class *cl, long *diff)
  * to mode other than HTB_CAN_SEND (see htb_add_to_wait_tree).
  */
 static void
-htb_change_class_mode(struct htb_sched *q, struct htb_class *cl, long *diff)
+htb_change_class_mode(struct htb_sched *q, struct htb_class *cl, s64 *diff)
 {
 	enum htb_cmode new_mode = htb_class_mode(cl, diff);
 
@@ -584,26 +615,26 @@ static int htb_enqueue(struct sk_buff *skb, struct Qdisc *sch)
 	return NET_XMIT_SUCCESS;
 }
 
-static inline void htb_accnt_tokens(struct htb_class *cl, int bytes, long diff)
+static inline void htb_accnt_tokens(struct htb_class *cl, int bytes, s64 diff)
 {
-	long toks = diff + cl->tokens;
+	s64 toks = diff + cl->tokens;
 
 	if (toks > cl->buffer)
 		toks = cl->buffer;
-	toks -= (long) qdisc_l2t(cl->rate, bytes);
+	toks -= (s64) l2t_ns(&cl->rate, bytes);
 	if (toks <= -cl->mbuffer)
 		toks = 1 - cl->mbuffer;
 
 	cl->tokens = toks;
 }
 
-static inline void htb_accnt_ctokens(struct htb_class *cl, int bytes, long diff)
+static inline void htb_accnt_ctokens(struct htb_class *cl, int bytes, s64 diff)
 {
-	long toks = diff + cl->ctokens;
+	s64 toks = diff + cl->ctokens;
 
 	if (toks > cl->cbuffer)
 		toks = cl->cbuffer;
-	toks -= (long) qdisc_l2t(cl->ceil, bytes);
+	toks -= (s64) l2t_ns(&cl->ceil, bytes);
 	if (toks <= -cl->mbuffer)
 		toks = 1 - cl->mbuffer;
 
@@ -626,10 +657,10 @@ static void htb_charge_class(struct htb_sched *q, struct htb_class *cl,
 {
 	int bytes = qdisc_pkt_len(skb);
 	enum htb_cmode old_mode;
-	long diff;
+	s64 diff;
 
 	while (cl) {
-		diff = psched_tdiff_bounded(q->now, cl->t_c, cl->mbuffer);
+		diff = min_t(s64, q->now - cl->t_c, cl->mbuffer);
 		if (cl->level >= level) {
 			if (cl->level == level)
 				cl->xstats.lends++;
@@ -676,7 +707,7 @@ static psched_time_t htb_do_events(struct htb_sched *q, int level,
 	unsigned long stop_at = start + 2;
 	while (time_before(jiffies, stop_at)) {
 		struct htb_class *cl;
-		long diff;
+		s64 diff;
 		struct rb_node *p = rb_first(&q->wait_pq[level]);
 
 		if (!p)
@@ -687,7 +718,7 @@ static psched_time_t htb_do_events(struct htb_sched *q, int level,
 			return cl->pq_key;
 
 		htb_safe_rb_erase(p, q->wait_pq + level);
-		diff = psched_tdiff_bounded(q->now, cl->t_c, cl->mbuffer);
+		diff = min_t(s64, q->now - cl->t_c, cl->mbuffer);
 		htb_change_class_mode(q, cl, &diff);
 		if (cl->cmode != HTB_CAN_SEND)
 			htb_add_to_wait_tree(q, cl, diff);
@@ -873,10 +904,10 @@ ok:
 
 	if (!sch->q.qlen)
 		goto fin;
-	q->now = psched_get_time();
+	q->now = ktime_to_ns(ktime_get());
 	start_at = jiffies;
 
-	next_event = q->now + 5 * PSCHED_TICKS_PER_SEC;
+	next_event = q->now + 5 * NSEC_PER_SEC;
 
 	for (level = 0; level < TC_HTB_MAXDEPTH; level++) {
 		/* common case optimization - skip event handler quickly */
@@ -886,7 +917,7 @@ ok:
 		if (q->now >= q->near_ev_cache[level]) {
 			event = htb_do_events(q, level, start_at);
 			if (!event)
-				event = q->now + PSCHED_TICKS_PER_SEC;
+				event = q->now + NSEC_PER_SEC;
 			q->near_ev_cache[level] = event;
 		} else
 			event = q->near_ev_cache[level];
@@ -905,10 +936,18 @@ ok:
 		}
 	}
 	sch->qstats.overlimits++;
-	if (likely(next_event > q->now))
-		qdisc_watchdog_schedule(&q->watchdog, next_event);
-	else
+	if (likely(next_event > q->now)) {
+		if (!test_bit(__QDISC_STATE_DEACTIVATED,
+			      &qdisc_root_sleeping(q->watchdog.qdisc)->state)) {
+			ktime_t time = ktime_set(0, 0);
+			qdisc_throttled(q->watchdog.qdisc);
+			time = ktime_add_ns(time, next_event);
+			hrtimer_start(&q->watchdog.timer, time,
+				      HRTIMER_MODE_ABS);
+		}
+	} else {
 		schedule_work(&q->work);
+	}
 fin:
 	return skb;
 }
@@ -1083,9 +1122,7 @@ static int htb_dump_class(struct Qdisc *sch, unsigned long arg,
 
 	memset(&opt, 0, sizeof(opt));
 
-	opt.rate = cl->rate->rate;
 	opt.buffer = cl->buffer;
-	opt.ceil = cl->ceil->rate;
 	opt.cbuffer = cl->cbuffer;
 	opt.quantum = cl->quantum;
 	opt.prio = cl->prio;
@@ -1203,9 +1240,6 @@ static void htb_destroy_class(struct Qdisc *sch, struct htb_class *cl)
 		qdisc_destroy(cl->un.leaf.q);
 	}
 	gen_kill_estimator(&cl->bstats, &cl->rate_est);
-	qdisc_put_rtab(cl->rate);
-	qdisc_put_rtab(cl->ceil);
-
 	tcf_destroy_chain(&cl->filter_list);
 	kfree(cl);
 }
@@ -1460,12 +1494,16 @@ static int htb_change_class(struct Qdisc *sch, u32 classid,
 
 	cl->buffer = hopt->buffer;
 	cl->cbuffer = hopt->cbuffer;
-	if (cl->rate)
-		qdisc_put_rtab(cl->rate);
-	cl->rate = rtab;
-	if (cl->ceil)
-		qdisc_put_rtab(cl->ceil);
-	cl->ceil = ctab;
+
+	cl->rate.rate_bps = (u64)rtab->rate.rate << 3;
+	cl->ceil.rate_bps = (u64)ctab->rate.rate << 3;
+
+	htb_precompute_ratedata(&cl->rate);
+	htb_precompute_ratedata(&cl->ceil);
+
+	cl->buffer = hopt->buffer << PSCHED_SHIFT;
+	cl->cbuffer = hopt->cbuffer << PSCHED_SHIFT;
+
 	sch_tree_unlock(sch);
 
 	qdisc_class_hash_grow(sch, &q->clhash);
-- 
1.7.5.4
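
For reference, a standalone userspace sketch of the fixed-point
conversion above, mirroring htb_precompute_ratedata() and l2t_ns()
(the kernel uses div64_u64(); plain division here), with the
constants worked out for 1Gbit:

#include <stdio.h>
#include <stdint.h>

#define NSEC_PER_SEC 1000000000ULL

struct rate_cfg {
	uint64_t rate_bps;
	uint32_t mult, shift;
};

/* Choose mult/shift so that len * 8 * NSEC_PER_SEC / rate_bps
 * ~= (len * mult) >> shift in pure integer math. */
static void precompute(struct rate_cfg *r)
{
	r->shift = 15;
	r->mult = (8ULL * NSEC_PER_SEC * (1ULL << r->shift)) / r->rate_bps;
}

/* Packet length -> transmission time in nanoseconds. */
static uint64_t l2t_ns(const struct rate_cfg *r, unsigned int len)
{
	return ((uint64_t)len * r->mult) >> r->shift;
}

int main(void)
{
	struct rate_cfg r = { .rate_bps = 1000000000ULL };	/* 1Gbit */

	precompute(&r);	/* mult = 262144, shift = 15 */
	/* Prints 512, 12000 and 524288 ns; all exact, where the
	 * 256-entry table would have quantized the lengths. */
	printf("64B=%llu 1500B=%llu 64KB=%llu (ns)\n",
	       (unsigned long long)l2t_ns(&r, 64),
	       (unsigned long long)l2t_ns(&r, 1500),
	       (unsigned long long)l2t_ns(&r, 65536));
	return 0;
}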


* Re: [PATCH] htb: improved accuracy at high rates
  2012-10-19 22:26 [PATCH] htb: improved accuracy at high rates Vimalkumar
@ 2012-10-19 23:52 ` Rick Jones
  2012-10-20  0:51   ` Vimal
  2012-10-20  7:26   ` Eric Dumazet
  2012-10-20  7:47 ` Eric Dumazet
  2012-10-20 10:29 ` Eric Dumazet
  2 siblings, 2 replies; 8+ messages in thread
From: Rick Jones @ 2012-10-19 23:52 UTC (permalink / raw)
  To: Vimalkumar; +Cc: davem, eric.dumazet, Jamal Hadi Salim, netdev

On 10/19/2012 03:26 PM, Vimalkumar wrote:
> Current HTB (and TBF) uses a rate table computed by
> the "tc" userspace program, which has the following
> issue:
>
> The rate table has 256 entries to map packet lengths
> to tokens (time units).  With TSO-sized packets, the
> 256-entry granularity leads to loss/gain of rate,
> making the token bucket inaccurate.
>
> Thus, instead of relying on the rate table, this patch
> explicitly computes the time and accounts for packet
> transmission times with nanosecond granularity.
>
> This greatly improves accuracy of HTB with a wide
> range of packet sizes.
>
> Example:
>
> tc qdisc add dev $dev root handle 1: \
> 	htb default 1
>
> tc class add dev $dev classid 1:1 parent 1: \
> 	rate 1Gbit mtu 64k
>
> Ideally it should shape to the configured 1Gbit for all
> intermediate packet sizes as well, but with the rate table
> it does not, as the test below shows.
>
> Test:
> for i in {1..20}; do
> 	(netperf -H $host -t UDP_STREAM -l 30 -- -m $size &);
> done
>
> With size=400 bytes: achieved rate ~600Mb/s
> With size=1000 bytes: achieved rate ~835Mb/s
> With size=8000 bytes: achieved rate ~1012Mb/s
>
> With new HTB, in all cases, we achieve ~1000Mb/s.

First some netperf/operational kinds of questions:

Did it really take 20 concurrent netperf UDP_STREAM tests to get to 
those rates?  And why UDP_STREAM rather than TCP_STREAM?

I couldn't recall if GSO did anything for UDP, so did some quick and 
dirty tests flipping GSO on and off on a 3.2.0 kernel, and the service 
demands didn't seem to change.  So, with 8000 bytes of user payload did 
HTB actually see 8000ish byte packets, or did it actually see a series 
of <= MTU sized IP datagram fragments?  Or did the NIC being used have 
UFO enabled?

Which reported throughput was used from the UDP_STREAM tests - send side 
or receive side?

Is there much/any change in service demand on a netperf test?  That is, 
what is the service demand of a mumble_STREAM test running through the 
old HTB versus the new HTB?  And/or the performance of a TCP_RR test 
(both transactions per second and service demand per transaction) before 
vs after.

happy benchmarking,

rick jones


* Re: [PATCH] htb: improved accuracy at high rates
  2012-10-19 23:52 ` Rick Jones
@ 2012-10-20  0:51   ` Vimal
  2012-10-22 17:35     ` Rick Jones
  2012-10-20  7:26   ` Eric Dumazet
  1 sibling, 1 reply; 8+ messages in thread
From: Vimal @ 2012-10-20  0:51 UTC (permalink / raw)
  To: Rick Jones; +Cc: davem, eric.dumazet, Jamal Hadi Salim, netdev

On 19 October 2012 16:52, Rick Jones <rick.jones2@hp.com> wrote:
>
> First some netperf/operational kinds of questions:
>
> Did it really take 20 concurrent netperf UDP_STREAM tests to get to those
> rates?  And why UDP_STREAM rather than TCP_STREAM?

Nope, even 1 netperf was sufficient.  Earlier I couldn't get TCP_STREAM
to send small packets, but on checking my script I found I had forgotten
to enable TCP_NODELAY and to set the send buffer size (-s $size).

With one TCP sender I am unable to reach the 1Gb/s limit (only
~100Mb/s) even with a lot of CPU to spare, which indicates that the
test is limited by e2e latency.  With 10 connections, I could get only
800Mb/s, and with 20 connections it went to 1160Mb/s, which exceeds
the configured 1Gb/s limit.
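
(Back-of-the-envelope, assuming roughly $size bytes in flight per
connection and the ~42us round trip seen in the TCP_RR numbers below:
throughput <= window/RTT, e.g. 500 * 8 bits / 42us ~= 95Mb/s per flow,
which matches the ~100Mb/s observed for a single sender.)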

> I couldn't recall if GSO did anything for UDP, so did some quick and dirty
> tests flipping GSO on and off on a 3.2.0 kernel, and the service demands
> didn't seem to change.  So, with 8000 bytes of user payload did HTB actually
> see 8000ish byte packets, or did it actually see a series of <= MTU sized IP
> datagram fragments?  Or did the NIC being used have UFO enabled?
>

UFO was enabled.  I have now verified that the throughput was about
the same even with TCP, with TSO, and with send/recv buffer sizes
forced to 8kB.

> Which reported throughput was used from the UDP_STREAM tests - send side or
> receive side?

Send side.

>
> Is there much/any change in service demand on a netperf test?  That is, what
> is the service demand of a mumble_STREAM test running through the old HTB
> versus the new HTB?  And/or the performance of a TCP_RR test (both
> transactions per second and service demand per transaction) before vs after.
>

At 1Gb/s with just one TCP_STREAM:
With old HTB:
Sdem local: 0.548us/KB, Sdem remote: 1.426us/KB.

With new HTB:
Sdem local: 0.598us/KB, Sdem remote: 1.089us/KB.

TCP_RR: 1-byte req/response consumed very little bandwidth (~12Mb/s)
old HTB at 1Gb/s
Sdem local: 14.738us/trans, Sdem remote: 11.485us/trans, latency: 41.622us/trans.

new HTB at 1Gb/s
Sdem local: 14.505us/trans, Sdem remote: 11.440us/trans, latency: 41.709us/trans.

With multiple tests, these values are fairly stable. :)

thanks,
> happy benchmarking,
>
> rick jones

-- 
Vimal


* Re: [PATCH] htb: improved accuracy at high rates
  2012-10-19 23:52 ` Rick Jones
  2012-10-20  0:51   ` Vimal
@ 2012-10-20  7:26   ` Eric Dumazet
  2012-10-20 16:41     ` Vimal
  1 sibling, 1 reply; 8+ messages in thread
From: Eric Dumazet @ 2012-10-20  7:26 UTC (permalink / raw)
  To: Rick Jones; +Cc: Vimalkumar, davem, Jamal Hadi Salim, netdev

On Fri, 2012-10-19 at 16:52 -0700, Rick Jones wrote:

> Did it really take 20 concurrent netperf UDP_STREAM tests to get to 
> those rates?  And why UDP_STREAM rather than TCP_STREAM?
> 
> I couldn't recall if GSO did anything for UDP, so did some quick and 
> dirty tests flipping GSO on and off on a 3.2.0 kernel, and the service 
> demands didn't seem to change.  So, with 8000 bytes of user payload did 
> HTB actually see 8000ish byte packets, or did it actually see a series 
> of <= MTU sized IP datagram fragments?  Or did the NIC being used have 
> UFO enabled?

UFO is quite a virtual feature for the moment ;)

To my knowledge only one driver uses it (neterion/s2io.c)

So most probably HTB sees the fragments built by the stack before
queueing packets to the qdisc layer.

tcpdump can be used to check this.
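
For example (interface name assumed):

	tcpdump -n -i $dev -c 10 udp

The lengths reported per packet show whether the qdisc saw ~8000-byte
datagrams or MTU-sized fragments.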


* Re: [PATCH] htb: improved accuracy at high rates
  2012-10-19 22:26 [PATCH] htb: improved accuracy at high rates Vimalkumar
  2012-10-19 23:52 ` Rick Jones
@ 2012-10-20  7:47 ` Eric Dumazet
  2012-10-20 10:29 ` Eric Dumazet
  2 siblings, 0 replies; 8+ messages in thread
From: Eric Dumazet @ 2012-10-20  7:47 UTC (permalink / raw)
  To: Vimalkumar; +Cc: davem, Jamal Hadi Salim, netdev

On Fri, 2012-10-19 at 15:26 -0700, Vimalkumar wrote:
> Current HTB (and TBF) uses a rate table computed by
> the "tc" userspace program, which has the following
> issue:
> 
> The rate table has 256 entries to map packet lengths
> to tokens (time units).  With TSO-sized packets, the
> 256-entry granularity leads to loss/gain of rate,
> making the token bucket inaccurate.
> 
> Thus, instead of relying on the rate table, this patch
> explicitly computes the time and accounts for packet
> transmission times with nanosecond granularity.
> 
> This greatly improves accuracy of HTB with a wide
> range of packet sizes.
> 
> Example:
> 
> tc qdisc add dev $dev root handle 1: \
> 	htb default 1
> 
> tc class add dev $dev classid 1:1 parent 1: \
> 	rate 1Gbit mtu 64k
> 
> Ideally it should shape to the configured 1Gbit for all
> intermediate packet sizes as well, but with the rate table
> it does not, as the test below shows.
> 
> Test:
> for i in {1..20}; do
> 	(netperf -H $host -t UDP_STREAM -l 30 -- -m $size &);
> done
> 
> With size=400 bytes: achieved rate ~600Mb/s
> With size=1000 bytes: achieved rate ~835Mb/s
> With size=8000 bytes: achieved rate ~1012Mb/s
> 
> With new HTB, in all cases, we achieve ~1000Mb/s.
> Many thanks to Eric Dumazet for review and feedback.
> 
> Signed-off-by: Vimalkumar <j.vimal@gmail.com>
> ---

Thanks Vimal for this promising work.

I am going to test it, but my preliminary remarks are :


1)  This is net-next work, so it's better to add the net-next tag in
the title, as in:

"[PATCH net-next] htb: improved accuracy at high rates"

The next submission should also include a version tag, as in:

"[PATCH net-next V2] htb: improved accuracy at high rates"


2) I don't know how you generated your diff, but it's not applicable
   without knowing where the file is located.

So instead of

diff --git a/sch_htb.c b/sch_htb.c
index 29b942c..a808ba1 100644
--- a/sch_htb.c
+++ b/sch_htb.c

we need the proper :

diff --git a/net/sched/sch_htb.c b/net/sched/sch_htb.c
index 9d75b77..13e02b3 100644
--- a/net/sched/sch_htb.c
+++ b/net/sched/sch_htb.c

3) Always use an up2date net-next tree before submitting an official
patch.

Here is what I get when applying your patch (once I fixed the path):

patching file net/sched/sch_htb.c
Hunk #9 succeeded at 612 (offset -3 lines).
Hunk #10 succeeded at 654 (offset -3 lines).
Hunk #11 succeeded at 704 (offset -3 lines).
Hunk #12 succeeded at 715 (offset -3 lines).
Hunk #13 succeeded at 902 (offset -2 lines).
Hunk #14 succeeded at 915 (offset -2 lines).
Hunk #15 succeeded at 934 (offset -2 lines).
Hunk #16 succeeded at 1121 (offset -1 lines).

It means you worked on an old version, so it's always suspicious.


* Re: [PATCH] htb: improved accuracy at high rates
  2012-10-19 22:26 [PATCH] htb: improved accuracy at high rates Vimalkumar
  2012-10-19 23:52 ` Rick Jones
  2012-10-20  7:47 ` Eric Dumazet
@ 2012-10-20 10:29 ` Eric Dumazet
  2 siblings, 0 replies; 8+ messages in thread
From: Eric Dumazet @ 2012-10-20 10:29 UTC (permalink / raw)
  To: Vimalkumar; +Cc: davem, Jamal Hadi Salim, netdev

On Fri, 2012-10-19 at 15:26 -0700, Vimalkumar wrote:

> +	if (likely(next_event > q->now)) {
> +		if (!test_bit(__QDISC_STATE_DEACTIVATED,
> +			      &qdisc_root_sleeping(q->watchdog.qdisc)->state)) {
> +			ktime_t time = ktime_set(0, 0);
> +			qdisc_throttled(q->watchdog.qdisc);
> +			time = ktime_add_ns(time, next_event);
> +			hrtimer_start(&q->watchdog.timer, time,
> +				      HRTIMER_MODE_ABS);
> +		}
> +	} else 


This could use the ns_to_ktime() helper:

> +	if (likely(next_event > q->now)) {
> +		if (!test_bit(__QDISC_STATE_DEACTIVATED,
> +			      &qdisc_root_sleeping(q->watchdog.qdisc)->state)) {
> +			qdisc_throttled(q->watchdog.qdisc);
> +			hrtimer_start(&q->watchdog.timer, ns_to_ktime(next_event),
> +				      HRTIMER_MODE_ABS);
> +		}
> +	} else 

I'll post a patch to cleanup net/sched/sch_api.c & net/sched/sch_cbq.c


* Re: [PATCH] htb: improved accuracy at high rates
  2012-10-20  7:26   ` Eric Dumazet
@ 2012-10-20 16:41     ` Vimal
  0 siblings, 0 replies; 8+ messages in thread
From: Vimal @ 2012-10-20 16:41 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: Rick Jones, davem, Jamal Hadi Salim, netdev

On 20 October 2012 00:26, Eric Dumazet <eric.dumazet@gmail.com> wrote:
>
> UFO is quite a virtual feature for the moment ;)
>
> To my knowledge only one driver uses it (neterion/s2io.c)
>
> So most probably HTB sees the fragments built by the stack before
> queueing packets to the qdisc layer.
>
> tcpdump can be used to check this.


Ah, you're right.  When I checked the tcpdump output earlier, I looked at
the IP length, which was 8000 bytes, so I presumed UFO was on.  When I
looked more carefully, the frames on the wire were MTU sized.

Thanks for the suggestions!  I just sent the next version of the patch
against net-next, and provided a simpler example that illustrates the
inaccuracy.

-- 
Vimal


* Re: [PATCH] htb: improved accuracy at high rates
  2012-10-20  0:51   ` Vimal
@ 2012-10-22 17:35     ` Rick Jones
  0 siblings, 0 replies; 8+ messages in thread
From: Rick Jones @ 2012-10-22 17:35 UTC (permalink / raw)
  To: Vimal; +Cc: davem, eric.dumazet, Jamal Hadi Salim, netdev

On 10/19/2012 05:51 PM, Vimal wrote:
> On 19 October 2012 16:52, Rick Jones <rick.jones2@hp.com> wrote:
>>
>> First some netperf/operational kinds of questions:
>>
>> Did it really take 20 concurrent netperf UDP_STREAM tests to get to those
>> rates?  And why UDP_STREAM rather than TCP_STREAM?
>
> Nope, even 1 netperf was sufficient.  Earlier I couldn't get TCP_STREAM
> to send small packets, but on checking my script I found I had forgotten
> to enable TCP_NODELAY and to set the send buffer size (-s $size).
>
> With one TCP sender I am unable to reach the 1Gb/s limit (only
> ~100Mb/s) even with a lot of CPU to spare, which indicates that the
> test is limited by e2e latency.  With 10 connections, I could get only
> 800Mb/s, and with 20 connections it went to 1160Mb/s, which exceeds
> the configured 1Gb/s limit.

Were you explicitly constraining the TCP socket buffer/window via 
test-specific -s and -S options?  Or was this a system with little 
enough memory that the upper limit for TCP socket/window autotuning 
wasn't the somewhat common default, which would have been sufficient 
to cover a rather large RTT?  The results of the TCP_RR tests below 
suggest there was actually very little e2e latency... which suggests 
something else was holding back the TCP performance.  Perhaps lost 
packets?

Or does this suggest the need for an htb_codel?-)  If your system can 
have rrdtool installed on it, and you are indeed using a contemporary 
version of netperf, you can run the bloat.sh script from doc/examples 
and get an idea of how much bufferbloat there is in the setup.

If you are using a "current" version of netperf, you can use the omni 
output selectors to have netperf emit the number of TCP retransmissions 
on the data connection during the test.  Otherwise, if the netperf tests 
are the only things running at the time, you can take a snapshot of 
netstat -s output before and after the test and run it through something 
like beforeafter, or the other script I keep forgetting the name of :(
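
Something along the lines of (beforeafter's two-file invocation
assumed):

netstat -s > /tmp/before
netperf -H $host -t TCP_STREAM -l 30
netstat -s > /tmp/after
beforeafter /tmp/before /tmp/after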

>> Which reported throughput was used from the UDP_STREAM tests - send side or
>> receive side?
>
> Send side.

Given the nature of UDP and that netperf makes no attempt to compensate 
for that, it would be rather better to use the receive-side throughput.  The 
receive side throughput is known to have made it through everything. 
The send side throughput is only that which didn't report an error in 
the sendto() call.  And lack of error on the sendto() call does not 
guarantee a successful transmission on the wire.  Even under Linux with 
intra-stack flow-control.

>> Is there much/any change in service demand on a netperf test?  That is, what
>> is the service demand of a mumble_STREAM test running through the old HTB
>> versus the new HTB?  And/or the performance of a TCP_RR test (both
>> transactions per second and service demand per transaction) before vs after.
>>
>
> At 1Gb/s with just one TCP_STREAM:
> With old HTB:
> Sdem local: 0.548us/KB, Sdem remote: 1.426us/KB.
>
> With new HTB:
> Sdem local: 0.598us/KB, Sdem remote: 1.089us/KB.

Presumably the receive side service demand should have remained 
unaffected by the HTB changes.  That it changed by 40% suggests there 
wasn't actually all that much stability - at least not on the receive 
side.  Was there a large change in throughput for the single-stream?

That there was a 9% increase in sending service demand is a bit 
troubling, and at least slightly at odds with the little to no change in 
the sending service demand for the TCP_RR tests below.  I suppose that 
the "timing" nature of having things like small sends and TCP_NODELAY 
set can introduce too many variables.

One other way to skin the cat of "what does it do to sending service 
demand" would be to stick with TCP_RR and walk up the request size. 
Without a burst mode enabled, and just the one transaction in flight at 
one time, TCP_NODELAY on/off should be a don't care.  So, something 
along the lines of:

HDR="-P 1"
for r in 1 4 16 64 256 1024 4096 16384 65535
do
netperf $HDR -H <remote> -t TCP_RR -c -C -l 30 -- -r ${r},1
HDR="-P 0"
done

> TCP_RR: 1-byte req/response consumed very little bandwidth (~12Mb/s)

That is why, by default, it reports a transaction per second rate and 
not a bandwidth :)

> old HTB at 1Gb/s
> Sdem local: 14.738us/trans, Sdem remote: 11.485us/trans, latency: 41.622us/trans.
>
> new HTB at 1Gb/s
> Sdem local: 14.505us/trans, Sdem remote: 11.440us/trans, latency: 41.709us/trans.
>
> With multiple tests, these values are fairly stable. :)

Those do look like they were within the noise level.

Out of mostly idle curiosity, just what sort of system was being used 
for the testing?  CPU, bitness, memory etc.

happy benchmarking,

rick jones

