* [PATCH net-next] net_sched: red: split red_parms into parms and vars
@ 2012-01-05 12:25 Eric Dumazet
2012-01-05 13:03 ` Dave Taht
2012-01-05 19:08 ` [PATCH net-next] net_sched: red: split red_parms into parms and vars David Miller
0 siblings, 2 replies; 18+ messages in thread
From: Eric Dumazet @ 2012-01-05 12:25 UTC (permalink / raw)
To: David Miller; +Cc: netdev, Dave Taht, Stephen Hemminger
This patch splits the red_parms structure into two components.
One holding the RED 'constant' parameters, and one containing the
variables.
This permits a size reduction of GRED qdisc, and is a preliminary step
to add an optional RED unit to SFQ.
SFQRED will have a single red_parms structure shared by all flows, and a
private red_vars per flow.
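To illustrate the intended split (a sketch only, not part of this patch;
the layout mirrors what the SFQ patch later in this thread does with its
per-slot red_vars, and my_flow / my_flow_red_check are illustrative names):

	#include <net/red.h>

	/* One red_parms shared by all flows, one red_vars embedded per flow. */
	struct my_flow {
		struct red_vars vars;	/* per-flow state: qavg, qcount, qR, qidlestart */
		unsigned int backlog;	/* whatever quantity RED is averaging */
	};

	static int my_flow_red_check(const struct red_parms *shared_parms,
				     struct my_flow *f)
	{
		f->vars.qavg = red_calc_qavg(shared_parms, &f->vars, f->backlog);
		/* returns RED_DONT_MARK, RED_PROB_MARK or RED_HARD_MARK */
		return red_action(shared_parms, &f->vars, f->vars.qavg);
	}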
Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
CC: Dave Taht <dave.taht@gmail.com>
CC: Stephen Hemminger <shemminger@vyatta.com>
---
include/net/red.h | 98 ++++++++++++++++++++++------------------
net/sched/sch_choke.c | 40 ++++++++--------
net/sched/sch_gred.c | 45 +++++++++---------
net/sched/sch_red.c | 29 ++++++-----
4 files changed, 117 insertions(+), 95 deletions(-)
diff --git a/include/net/red.h b/include/net/red.h
index ef715a1..baab385 100644
--- a/include/net/red.h
+++ b/include/net/red.h
@@ -137,7 +137,9 @@ struct red_parms {
u8 Wlog; /* log(W) */
u8 Plog; /* random number bits */
u8 Stab[RED_STAB_SIZE];
+};
+struct red_vars {
/* Variables */
int qcount; /* Number of packets since last random
number generation */
@@ -152,6 +154,16 @@ static inline u32 red_maxp(u8 Plog)
return Plog < 32 ? (~0U >> Plog) : ~0U;
}
+static inline void red_set_vars(struct red_vars *v)
+{
+ /* Reset average queue length, the value is strictly bound
+ * to the parameters below, reseting hurts a bit but leaving
+ * it might result in an unreasonable qavg for a while. --TGR
+ */
+ v->qavg = 0;
+
+ v->qcount = -1;
+}
static inline void red_set_parms(struct red_parms *p,
u32 qth_min, u32 qth_max, u8 Wlog, u8 Plog,
@@ -160,13 +172,6 @@ static inline void red_set_parms(struct red_parms *p,
int delta = qth_max - qth_min;
u32 max_p_delta;
- /* Reset average queue length, the value is strictly bound
- * to the parameters below, reseting hurts a bit but leaving
- * it might result in an unreasonable qavg for a while. --TGR
- */
- p->qavg = 0;
-
- p->qcount = -1;
p->qth_min = qth_min << Wlog;
p->qth_max = qth_max << Wlog;
p->Wlog = Wlog;
@@ -197,31 +202,32 @@ static inline void red_set_parms(struct red_parms *p,
memcpy(p->Stab, stab, sizeof(p->Stab));
}
-static inline int red_is_idling(const struct red_parms *p)
+static inline int red_is_idling(const struct red_vars *v)
{
- return p->qidlestart.tv64 != 0;
+ return v->qidlestart.tv64 != 0;
}
-static inline void red_start_of_idle_period(struct red_parms *p)
+static inline void red_start_of_idle_period(struct red_vars *v)
{
- p->qidlestart = ktime_get();
+ v->qidlestart = ktime_get();
}
-static inline void red_end_of_idle_period(struct red_parms *p)
+static inline void red_end_of_idle_period(struct red_vars *v)
{
- p->qidlestart.tv64 = 0;
+ v->qidlestart.tv64 = 0;
}
-static inline void red_restart(struct red_parms *p)
+static inline void red_restart(struct red_vars *v)
{
- red_end_of_idle_period(p);
- p->qavg = 0;
- p->qcount = -1;
+ red_end_of_idle_period(v);
+ v->qavg = 0;
+ v->qcount = -1;
}
-static inline unsigned long red_calc_qavg_from_idle_time(const struct red_parms *p)
+static inline unsigned long red_calc_qavg_from_idle_time(const struct red_parms *p,
+ const struct red_vars *v)
{
- s64 delta = ktime_us_delta(ktime_get(), p->qidlestart);
+ s64 delta = ktime_us_delta(ktime_get(), v->qidlestart);
long us_idle = min_t(s64, delta, p->Scell_max);
int shift;
@@ -248,7 +254,7 @@ static inline unsigned long red_calc_qavg_from_idle_time(const struct red_parms
shift = p->Stab[(us_idle >> p->Scell_log) & RED_STAB_MASK];
if (shift)
- return p->qavg >> shift;
+ return v->qavg >> shift;
else {
/* Approximate initial part of exponent with linear function:
*
@@ -257,16 +263,17 @@ static inline unsigned long red_calc_qavg_from_idle_time(const struct red_parms
* Seems, it is the best solution to
* problem of too coarse exponent tabulation.
*/
- us_idle = (p->qavg * (u64)us_idle) >> p->Scell_log;
+ us_idle = (v->qavg * (u64)us_idle) >> p->Scell_log;
- if (us_idle < (p->qavg >> 1))
- return p->qavg - us_idle;
+ if (us_idle < (v->qavg >> 1))
+ return v->qavg - us_idle;
else
- return p->qavg >> 1;
+ return v->qavg >> 1;
}
}
static inline unsigned long red_calc_qavg_no_idle_time(const struct red_parms *p,
+ const struct red_vars *v,
unsigned int backlog)
{
/*
@@ -278,16 +285,17 @@ static inline unsigned long red_calc_qavg_no_idle_time(const struct red_parms *p
*
* --ANK (980924)
*/
- return p->qavg + (backlog - (p->qavg >> p->Wlog));
+ return v->qavg + (backlog - (v->qavg >> p->Wlog));
}
static inline unsigned long red_calc_qavg(const struct red_parms *p,
+ const struct red_vars *v,
unsigned int backlog)
{
- if (!red_is_idling(p))
- return red_calc_qavg_no_idle_time(p, backlog);
+ if (!red_is_idling(v))
+ return red_calc_qavg_no_idle_time(p, v, backlog);
else
- return red_calc_qavg_from_idle_time(p);
+ return red_calc_qavg_from_idle_time(p, v);
}
@@ -296,7 +304,9 @@ static inline u32 red_random(const struct red_parms *p)
return reciprocal_divide(net_random(), p->max_P_reciprocal);
}
-static inline int red_mark_probability(const struct red_parms *p, unsigned long qavg)
+static inline int red_mark_probability(const struct red_parms *p,
+ const struct red_vars *v,
+ unsigned long qavg)
{
/* The formula used below causes questions.
@@ -314,7 +324,7 @@ static inline int red_mark_probability(const struct red_parms *p, unsigned long
Any questions? --ANK (980924)
*/
- return !(((qavg - p->qth_min) >> p->Wlog) * p->qcount < p->qR);
+ return !(((qavg - p->qth_min) >> p->Wlog) * v->qcount < v->qR);
}
enum {
@@ -323,7 +333,7 @@ enum {
RED_ABOVE_MAX_TRESH,
};
-static inline int red_cmp_thresh(struct red_parms *p, unsigned long qavg)
+static inline int red_cmp_thresh(const struct red_parms *p, unsigned long qavg)
{
if (qavg < p->qth_min)
return RED_BELOW_MIN_THRESH;
@@ -339,27 +349,29 @@ enum {
RED_HARD_MARK,
};
-static inline int red_action(struct red_parms *p, unsigned long qavg)
+static inline int red_action(const struct red_parms *p,
+ struct red_vars *v,
+ unsigned long qavg)
{
switch (red_cmp_thresh(p, qavg)) {
case RED_BELOW_MIN_THRESH:
- p->qcount = -1;
+ v->qcount = -1;
return RED_DONT_MARK;
case RED_BETWEEN_TRESH:
- if (++p->qcount) {
- if (red_mark_probability(p, qavg)) {
- p->qcount = 0;
- p->qR = red_random(p);
+ if (++v->qcount) {
+ if (red_mark_probability(p, v, qavg)) {
+ v->qcount = 0;
+ v->qR = red_random(p);
return RED_PROB_MARK;
}
} else
- p->qR = red_random(p);
+ v->qR = red_random(p);
return RED_DONT_MARK;
case RED_ABOVE_MAX_TRESH:
- p->qcount = -1;
+ v->qcount = -1;
return RED_HARD_MARK;
}
@@ -367,14 +379,14 @@ static inline int red_action(struct red_parms *p, unsigned long qavg)
return RED_DONT_MARK;
}
-static inline void red_adaptative_algo(struct red_parms *p)
+static inline void red_adaptative_algo(struct red_parms *p, struct red_vars *v)
{
unsigned long qavg;
u32 max_p_delta;
- qavg = p->qavg;
- if (red_is_idling(p))
- qavg = red_calc_qavg_from_idle_time(p);
+ qavg = v->qavg;
+ if (red_is_idling(v))
+ qavg = red_calc_qavg_from_idle_time(p, v);
/* p->qavg is fixed point number with point at Wlog */
qavg >>= p->Wlog;
diff --git a/net/sched/sch_choke.c b/net/sched/sch_choke.c
index bef00ac..e465064 100644
--- a/net/sched/sch_choke.c
+++ b/net/sched/sch_choke.c
@@ -57,6 +57,7 @@ struct choke_sched_data {
struct red_parms parms;
/* Variables */
+ struct red_vars vars;
struct tcf_proto *filter_list;
struct {
u32 prob_drop; /* Early probability drops */
@@ -265,7 +266,7 @@ static bool choke_match_random(const struct choke_sched_data *q,
static int choke_enqueue(struct sk_buff *skb, struct Qdisc *sch)
{
struct choke_sched_data *q = qdisc_priv(sch);
- struct red_parms *p = &q->parms;
+ const struct red_parms *p = &q->parms;
int ret = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
if (q->filter_list) {
@@ -276,13 +277,13 @@ static int choke_enqueue(struct sk_buff *skb, struct Qdisc *sch)
choke_skb_cb(skb)->keys_valid = 0;
/* Compute average queue usage (see RED) */
- p->qavg = red_calc_qavg(p, sch->q.qlen);
- if (red_is_idling(p))
- red_end_of_idle_period(p);
+ q->vars.qavg = red_calc_qavg(p, &q->vars, sch->q.qlen);
+ if (red_is_idling(&q->vars))
+ red_end_of_idle_period(&q->vars);
/* Is queue small? */
- if (p->qavg <= p->qth_min)
- p->qcount = -1;
+ if (q->vars.qavg <= p->qth_min)
+ q->vars.qcount = -1;
else {
unsigned int idx;
@@ -294,8 +295,8 @@ static int choke_enqueue(struct sk_buff *skb, struct Qdisc *sch)
}
/* Queue is large, always mark/drop */
- if (p->qavg > p->qth_max) {
- p->qcount = -1;
+ if (q->vars.qavg > p->qth_max) {
+ q->vars.qcount = -1;
sch->qstats.overlimits++;
if (use_harddrop(q) || !use_ecn(q) ||
@@ -305,10 +306,10 @@ static int choke_enqueue(struct sk_buff *skb, struct Qdisc *sch)
}
q->stats.forced_mark++;
- } else if (++p->qcount) {
- if (red_mark_probability(p, p->qavg)) {
- p->qcount = 0;
- p->qR = red_random(p);
+ } else if (++q->vars.qcount) {
+ if (red_mark_probability(p, &q->vars, q->vars.qavg)) {
+ q->vars.qcount = 0;
+ q->vars.qR = red_random(p);
sch->qstats.overlimits++;
if (!use_ecn(q) || !INET_ECN_set_ce(skb)) {
@@ -319,7 +320,7 @@ static int choke_enqueue(struct sk_buff *skb, struct Qdisc *sch)
q->stats.prob_mark++;
}
} else
- p->qR = red_random(p);
+ q->vars.qR = red_random(p);
}
/* Admit new packet */
@@ -353,8 +354,8 @@ static struct sk_buff *choke_dequeue(struct Qdisc *sch)
struct sk_buff *skb;
if (q->head == q->tail) {
- if (!red_is_idling(&q->parms))
- red_start_of_idle_period(&q->parms);
+ if (!red_is_idling(&q->vars))
+ red_start_of_idle_period(&q->vars);
return NULL;
}
@@ -377,8 +378,8 @@ static unsigned int choke_drop(struct Qdisc *sch)
if (len > 0)
q->stats.other++;
else {
- if (!red_is_idling(&q->parms))
- red_start_of_idle_period(&q->parms);
+ if (!red_is_idling(&q->vars))
+ red_start_of_idle_period(&q->vars);
}
return len;
@@ -388,7 +389,7 @@ static void choke_reset(struct Qdisc *sch)
{
struct choke_sched_data *q = qdisc_priv(sch);
- red_restart(&q->parms);
+ red_restart(&q->vars);
}
static const struct nla_policy choke_policy[TCA_CHOKE_MAX + 1] = {
@@ -482,9 +483,10 @@ static int choke_change(struct Qdisc *sch, struct nlattr *opt)
ctl->Plog, ctl->Scell_log,
nla_data(tb[TCA_CHOKE_STAB]),
max_P);
+ red_set_vars(&q->vars);
if (q->head == q->tail)
- red_end_of_idle_period(&q->parms);
+ red_end_of_idle_period(&q->vars);
sch_tree_unlock(sch);
choke_free(old);
diff --git a/net/sched/sch_gred.c b/net/sched/sch_gred.c
index 53204de..0b15236 100644
--- a/net/sched/sch_gred.c
+++ b/net/sched/sch_gred.c
@@ -41,6 +41,7 @@ struct gred_sched_data {
u8 prio; /* the prio of this vq */
struct red_parms parms;
+ struct red_vars vars;
struct red_stats stats;
};
@@ -55,7 +56,7 @@ struct gred_sched {
u32 red_flags;
u32 DPs;
u32 def;
- struct red_parms wred_set;
+ struct red_vars wred_set;
};
static inline int gred_wred_mode(struct gred_sched *table)
@@ -125,17 +126,17 @@ static inline u16 tc_index_to_dp(struct sk_buff *skb)
return skb->tc_index & GRED_VQ_MASK;
}
-static inline void gred_load_wred_set(struct gred_sched *table,
+static inline void gred_load_wred_set(const struct gred_sched *table,
struct gred_sched_data *q)
{
- q->parms.qavg = table->wred_set.qavg;
- q->parms.qidlestart = table->wred_set.qidlestart;
+ q->vars.qavg = table->wred_set.qavg;
+ q->vars.qidlestart = table->wred_set.qidlestart;
}
static inline void gred_store_wred_set(struct gred_sched *table,
struct gred_sched_data *q)
{
- table->wred_set.qavg = q->parms.qavg;
+ table->wred_set.qavg = q->vars.qavg;
}
static inline int gred_use_ecn(struct gred_sched *t)
@@ -170,7 +171,7 @@ static int gred_enqueue(struct sk_buff *skb, struct Qdisc *sch)
goto drop;
}
- /* fix tc_index? --could be controvesial but needed for
+ /* fix tc_index? --could be controversial but needed for
requeueing */
skb->tc_index = (skb->tc_index & ~GRED_VQ_MASK) | dp;
}
@@ -181,8 +182,8 @@ static int gred_enqueue(struct sk_buff *skb, struct Qdisc *sch)
for (i = 0; i < t->DPs; i++) {
if (t->tab[i] && t->tab[i]->prio < q->prio &&
- !red_is_idling(&t->tab[i]->parms))
- qavg += t->tab[i]->parms.qavg;
+ !red_is_idling(&t->tab[i]->vars))
+ qavg += t->tab[i]->vars.qavg;
}
}
@@ -193,15 +194,17 @@ static int gred_enqueue(struct sk_buff *skb, struct Qdisc *sch)
if (gred_wred_mode(t))
gred_load_wred_set(t, q);
- q->parms.qavg = red_calc_qavg(&q->parms, gred_backlog(t, q, sch));
+ q->vars.qavg = red_calc_qavg(&q->parms,
+ &q->vars,
+ gred_backlog(t, q, sch));
- if (red_is_idling(&q->parms))
- red_end_of_idle_period(&q->parms);
+ if (red_is_idling(&q->vars))
+ red_end_of_idle_period(&q->vars);
if (gred_wred_mode(t))
gred_store_wred_set(t, q);
- switch (red_action(&q->parms, q->parms.qavg + qavg)) {
+ switch (red_action(&q->parms, &q->vars, q->vars.qavg + qavg)) {
case RED_DONT_MARK:
break;
@@ -260,7 +263,7 @@ static struct sk_buff *gred_dequeue(struct Qdisc *sch)
q->backlog -= qdisc_pkt_len(skb);
if (!q->backlog && !gred_wred_mode(t))
- red_start_of_idle_period(&q->parms);
+ red_start_of_idle_period(&q->vars);
}
return skb;
@@ -293,7 +296,7 @@ static unsigned int gred_drop(struct Qdisc *sch)
q->stats.other++;
if (!q->backlog && !gred_wred_mode(t))
- red_start_of_idle_period(&q->parms);
+ red_start_of_idle_period(&q->vars);
}
qdisc_drop(skb, sch);
@@ -320,7 +323,7 @@ static void gred_reset(struct Qdisc *sch)
if (!q)
continue;
- red_restart(&q->parms);
+ red_restart(&q->vars);
q->backlog = 0;
}
}
@@ -398,12 +401,12 @@ static inline int gred_change_vq(struct Qdisc *sch, int dp,
q->limit = ctl->limit;
if (q->backlog == 0)
- red_end_of_idle_period(&q->parms);
+ red_end_of_idle_period(&q->vars);
red_set_parms(&q->parms,
ctl->qth_min, ctl->qth_max, ctl->Wlog, ctl->Plog,
ctl->Scell_log, stab, max_P);
-
+ red_set_vars(&q->vars);
return 0;
}
@@ -563,12 +566,12 @@ static int gred_dump(struct Qdisc *sch, struct sk_buff *skb)
opt.bytesin = q->bytesin;
if (gred_wred_mode(table)) {
- q->parms.qidlestart =
- table->tab[table->def]->parms.qidlestart;
- q->parms.qavg = table->tab[table->def]->parms.qavg;
+ q->vars.qidlestart =
+ table->tab[table->def]->vars.qidlestart;
+ q->vars.qavg = table->tab[table->def]->vars.qavg;
}
- opt.qave = red_calc_qavg(&q->parms, q->parms.qavg);
+ opt.qave = red_calc_qavg(&q->parms, &q->vars, q->vars.qavg);
append_opt:
if (nla_append(skb, sizeof(opt), &opt) < 0)
diff --git a/net/sched/sch_red.c b/net/sched/sch_red.c
index ce2256a..a5cc301 100644
--- a/net/sched/sch_red.c
+++ b/net/sched/sch_red.c
@@ -41,6 +41,7 @@ struct red_sched_data {
unsigned char flags;
struct timer_list adapt_timer;
struct red_parms parms;
+ struct red_vars vars;
struct red_stats stats;
struct Qdisc *qdisc;
};
@@ -61,12 +62,14 @@ static int red_enqueue(struct sk_buff *skb, struct Qdisc *sch)
struct Qdisc *child = q->qdisc;
int ret;
- q->parms.qavg = red_calc_qavg(&q->parms, child->qstats.backlog);
+ q->vars.qavg = red_calc_qavg(&q->parms,
+ &q->vars,
+ child->qstats.backlog);
- if (red_is_idling(&q->parms))
- red_end_of_idle_period(&q->parms);
+ if (red_is_idling(&q->vars))
+ red_end_of_idle_period(&q->vars);
- switch (red_action(&q->parms, q->parms.qavg)) {
+ switch (red_action(&q->parms, &q->vars, q->vars.qavg)) {
case RED_DONT_MARK:
break;
@@ -117,8 +120,8 @@ static struct sk_buff *red_dequeue(struct Qdisc *sch)
qdisc_bstats_update(sch, skb);
sch->q.qlen--;
} else {
- if (!red_is_idling(&q->parms))
- red_start_of_idle_period(&q->parms);
+ if (!red_is_idling(&q->vars))
+ red_start_of_idle_period(&q->vars);
}
return skb;
}
@@ -144,8 +147,8 @@ static unsigned int red_drop(struct Qdisc *sch)
return len;
}
- if (!red_is_idling(&q->parms))
- red_start_of_idle_period(&q->parms);
+ if (!red_is_idling(&q->vars))
+ red_start_of_idle_period(&q->vars);
return 0;
}
@@ -156,7 +159,7 @@ static void red_reset(struct Qdisc *sch)
qdisc_reset(q->qdisc);
sch->q.qlen = 0;
- red_restart(&q->parms);
+ red_restart(&q->vars);
}
static void red_destroy(struct Qdisc *sch)
@@ -212,17 +215,19 @@ static int red_change(struct Qdisc *sch, struct nlattr *opt)
q->qdisc = child;
}
- red_set_parms(&q->parms, ctl->qth_min, ctl->qth_max, ctl->Wlog,
+ red_set_parms(&q->parms,
+ ctl->qth_min, ctl->qth_max, ctl->Wlog,
ctl->Plog, ctl->Scell_log,
nla_data(tb[TCA_RED_STAB]),
max_P);
+ red_set_vars(&q->vars);
del_timer(&q->adapt_timer);
if (ctl->flags & TC_RED_ADAPTATIVE)
mod_timer(&q->adapt_timer, jiffies + HZ/2);
if (!q->qdisc->q.qlen)
- red_start_of_idle_period(&q->parms);
+ red_start_of_idle_period(&q->vars);
sch_tree_unlock(sch);
return 0;
@@ -235,7 +240,7 @@ static inline void red_adaptative_timer(unsigned long arg)
spinlock_t *root_lock = qdisc_lock(qdisc_root_sleeping(sch));
spin_lock(root_lock);
- red_adaptative_algo(&q->parms);
+ red_adaptative_algo(&q->parms, &q->vars);
mod_timer(&q->adapt_timer, jiffies + HZ/2);
spin_unlock(root_lock);
}
^ permalink raw reply related [flat|nested] 18+ messages in thread
* Re: [PATCH net-next] net_sched: red: split red_parms into parms and vars
2012-01-05 12:25 [PATCH net-next] net_sched: red: split red_parms into parms and vars Eric Dumazet
@ 2012-01-05 13:03 ` Dave Taht
2012-01-05 13:39 ` Eric Dumazet
2012-01-05 19:08 ` [PATCH net-next] net_sched: red: split red_parms into parms and vars David Miller
1 sibling, 1 reply; 18+ messages in thread
From: Dave Taht @ 2012-01-05 13:03 UTC (permalink / raw)
To: Eric Dumazet; +Cc: David Miller, netdev, Stephen Hemminger
On Thu, Jan 5, 2012 at 1:25 PM, Eric Dumazet <eric.dumazet@gmail.com> wrote:
> This patch splits the red_parms structure into two components.
>
> One holding the RED 'constant' parameters, and one containing the
> variables.
>
> This permits a size reduction of GRED qdisc, and is a preliminary step
> to add an optional RED unit to SFQ.
ah. I see where you are going with this. You are trying also to get your
'time in red' adaptation idea done, aren't you, or at least lay the groundwork
for that?
Are you trying to make the 3.3 merge window for all this?
I don't want to discourage you in any way, but byte oriented RED
with or without TSO doesn't work in your typical asymmetric
environment. If you set an appropriate byte limit for
stuff going one way, you end up with acks going out of
control - and vice versa.
and/or packet oriented RED has some hope, but in either
case you have to do something intelligent with a giant TSO
stream if you are going to use it on a server or host.
And the basic red still has two flaws that may or may
not go away with your adaptation, or combination with
SFQ - (I do retain hope, though)
I think you already grok this and you've already got the solution
outlined :)
so I'm going to go away now and finish writing up
what the problems are in sch_md, which are legion...
Keep having fun. I'll plan on a new build by the weekend.
--
Dave Täht
SKYPE: davetaht
US Tel: 1-239-829-5608
FR Tel: 0638645374
http://www.bufferbloat.net
^ permalink raw reply [flat|nested] 18+ messages in thread
* Re: [PATCH net-next] net_sched: red: split red_parms into parms and vars
2012-01-05 13:03 ` Dave Taht
@ 2012-01-05 13:39 ` Eric Dumazet
2012-01-06 5:47 ` Eric Dumazet
2012-01-06 16:31 ` [PATCH] net_sched: sfq: add optional RED on top of SFQ Eric Dumazet
0 siblings, 2 replies; 18+ messages in thread
From: Eric Dumazet @ 2012-01-05 13:39 UTC (permalink / raw)
To: Dave Taht; +Cc: David Miller, netdev, Stephen Hemminger
On Thursday 05 January 2012 at 14:03 +0100, Dave Taht wrote:
> On Thu, Jan 5, 2012 at 1:25 PM, Eric Dumazet <eric.dumazet@gmail.com> wrote:
> > This patch splits the red_parms structure into two components.
> >
> > One holding the RED 'constant' parameters, and one containing the
> > variables.
> >
> > This permits a size reduction of GRED qdisc, and is a preliminary step
> > to add an optional RED unit to SFQ.
>
> ah. I see where you are going with this. You are trying also to get your
> 'time in red' adaptation idea done, aren't you, or at least lay the groundwork
> for that?
>
The RED algo is agnostic: the quantity you want to average can be:
1) queue length in bytes
2) queue length in packets
3) time of residence in queue
...
Adding 3) is very easy.
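Concretely, the averaging step does not care what unit the backlogged
quantity is in. A minimal sketch, mirroring red_calc_qavg_no_idle_time()
from the patch above (only the helper name is made up here):

	/* qavg is a fixed point value with the point at Wlog bits.
	 * With W = 2^-Wlog and avg = qavg >> Wlog, this is the usual EWMA:
	 *   avg_new = (1 - W) * avg_old + W * backlog
	 * where 'backlog' may be bytes, packets or a residence time.
	 */
	static inline unsigned long red_ewma_step(const struct red_parms *p,
						  const struct red_vars *v,
						  unsigned int backlog)
	{
		return v->qavg + (backlog - (v->qavg >> p->Wlog));
	}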
> Are you trying to make the 3.3 merge window for all this?
>
It's up to David ;)
> I don't want to discourage you in any way, but byte oriented RED
> with or without TSO doesn't work in your typical asymmetric
> environment. If you set an appropriate byte limit for
> stuff going one way, you end up with acks going out of
> control - and vice versa.
TSO has little impact if you use "average queue length in bytes".
My main concern about adding RED is to be able to perform ECN marking
instead of mere packet drops.
In the case of your router, TSO is a non-issue, since a router should
handle MTU-sized skbs (disable GRO on the ingress side).
>
> and/or packet oriented RED has some hope, but in either
> case you have to do something intelligent with a giant TSO
> stream if you are going to use it on a server or host.
>
Again, a flow is a flow, and TSO is just a big packet.
Once you separate flows (using SFQ or QFQ), adding a RED unit permits
early drops only on flows that want to be unfair.
> And the basic red still has two flaws that may or may
> not go away with your adaptation, or combination with
> SFQ - (I do retain hope, though)
I only want to 'activate' RED on selected backlogged flows.
tc qdisc add dev eth0 root sfq[red] \
limit 3000 headdrop flows 512 divisor 16384 \
redflowlimit 200000b ecn
Problems of RED are:
1) the 'average' on highly variable traffic: it means nothing
interesting at all.
2) the idle time management requires specifying the link bandwidth as a
fixed value. This is a problem.
In the case of SFQRED, flow bandwidth is unknown, or very dynamic.
This is why I plan to reset qavg to 0 each time a flow becomes 'active',
and not care about idle time at all.
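In terms of the helpers added by the patch at the top of this thread,
that reset is just red_set_vars() on the per-flow state whenever the flow
(re)starts; a sketch of the idea (flow_red_restart is an illustrative
name, and the SFQ patch below does exactly this when a slot is reused):

	/* When a flow becomes 'active' again, forget its RED history
	 * instead of applying the idle time decay classic RED would use.
	 */
	static void flow_red_restart(struct red_vars *v)
	{
		red_set_vars(v);	/* qavg = 0, qcount = -1 */
	}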
^ permalink raw reply [flat|nested] 18+ messages in thread
* Re: [PATCH net-next] net_sched: red: split red_parms into parms and vars
2012-01-05 12:25 [PATCH net-next] net_sched: red: split red_parms into parms and vars Eric Dumazet
2012-01-05 13:03 ` Dave Taht
@ 2012-01-05 19:08 ` David Miller
1 sibling, 0 replies; 18+ messages in thread
From: David Miller @ 2012-01-05 19:08 UTC (permalink / raw)
To: eric.dumazet; +Cc: netdev, dave.taht, shemminger
From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Thu, 05 Jan 2012 13:25:16 +0100
> This patch splits the red_parms structure into two components.
>
> One holding the RED 'constant' parameters, and one containing the
> variables.
>
> This permits a size reduction of GRED qdisc, and is a preliminary step
> to add an optional RED unit to SFQ.
>
> SFQRED will have a single red_parms structure shared by all flows, and a
> private red_vars per flow.
>
> Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Applied.
^ permalink raw reply [flat|nested] 18+ messages in thread
* Re: [PATCH net-next] net_sched: red: split red_parms into parms and vars
2012-01-05 13:39 ` Eric Dumazet
@ 2012-01-06 5:47 ` Eric Dumazet
2012-01-06 8:31 ` Dave Taht
2012-01-06 16:31 ` [PATCH] net_sched: sfq: add optional RED on top of SFQ Eric Dumazet
1 sibling, 1 reply; 18+ messages in thread
From: Eric Dumazet @ 2012-01-06 5:47 UTC (permalink / raw)
To: Dave Taht; +Cc: David Miller, netdev, Stephen Hemminger
On Thursday 05 January 2012 at 14:39 +0100, Eric Dumazet wrote:
> Problems of RED are:
>
> 1) the 'average' on highly variable traffic: it means nothing
> interesting at all.
>
> 2) the idle time management requires specifying the link bandwidth as a
> fixed value. This is a problem.
A third problem is that ECN marking is performed on the packet we add to
the tail of the queue. It's very late...
I am currently testing my SFQRED and trying to set ECN on the first
packet in the queue so that TCP can react faster.
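For reference, this is how the SFQ patch later in this thread implements
that idea (an annotated excerpt of the RED_PROB_MARK branch of
sfq_enqueue(); slot->skblist_next is the head of the per-flow queue):

	case RED_PROB_MARK:
		sch->qstats.overlimits++;
		if (sfq_prob_mark(q)) {
			/* Prefer marking the packet at the head of the flow
			 * queue: it reaches the receiver, and thus the
			 * sender's feedback loop, a whole queue earlier than
			 * the packet being enqueued now.
			 */
			if (sfq_headdrop(q) &&
			    INET_ECN_set_ce(slot->skblist_next)) {
				q->stats.prob_mark_head++;
				break;
			}
			/* Fallback: mark the arriving packet, as classic RED would */
			if (INET_ECN_set_ce(skb)) {
				q->stats.prob_mark++;
				break;
			}
		}
		/* Not ECN capable: early drop instead */
		q->stats.prob_drop++;
		goto congestion_drop;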
^ permalink raw reply [flat|nested] 18+ messages in thread
* Re: [PATCH net-next] net_sched: red: split red_parms into parms and vars
2012-01-06 5:47 ` Eric Dumazet
@ 2012-01-06 8:31 ` Dave Taht
0 siblings, 0 replies; 18+ messages in thread
From: Dave Taht @ 2012-01-06 8:31 UTC (permalink / raw)
To: Eric Dumazet; +Cc: David Miller, netdev, Stephen Hemminger
On Fri, Jan 6, 2012 at 6:47 AM, Eric Dumazet <eric.dumazet@gmail.com> wrote:
> On Thursday 05 January 2012 at 14:39 +0100, Eric Dumazet wrote:
>
>> Problems of RED are:
>>
>> 1) the 'average' on highly variable traffic: it means nothing
>> interesting at all.
>>
>> 2) the idle time management requires specifying the link bandwidth as a
>> fixed value. This is a problem.
>
> A third problem is that ECN marking is performed on the packet we add to
> the tail of the queue. It's very late...
>
> I am currently testing my SFQRED and trying to set ECN on the first
> packet in the queue so that TCP can react faster.
If you are looking in particular for stuff worth marking vs. dropping...
you could even look deeper in the queue (group) than the first packet.
I am reminded somewhat of how David Mills handled the first
NSFnet congestion collapse on the fuzzball.
http://osdir.com/ml/culture.internet.history/2004-12/msg00003.html
"When a new packet arrived and no buffer space was available, the
output queues were scanned looking for the biggest elephant (total
byte count on all queues from the same IP address) and killed its
biggest packet. Gunshots continued until either the arriving packet
got shot or there was enough room to save it. It all worked
gangbusters and the poor ftpers never found out. "
>
>
>
--
Dave Täht
SKYPE: davetaht
US Tel: 1-239-829-5608
FR Tel: 0638645374
http://www.bufferbloat.net
^ permalink raw reply [flat|nested] 18+ messages in thread
* [PATCH] net_sched: sfq: add optional RED on top of SFQ
2012-01-05 13:39 ` Eric Dumazet
2012-01-06 5:47 ` Eric Dumazet
@ 2012-01-06 16:31 ` Eric Dumazet
2012-01-06 16:56 ` Dave Taht
` (3 more replies)
1 sibling, 4 replies; 18+ messages in thread
From: Eric Dumazet @ 2012-01-06 16:31 UTC (permalink / raw)
To: David Miller; +Cc: netdev, Stephen Hemminger, Dave Taht
Adds an optional Random Early Detection on each SFQ flow queue.
Traditional SFQ limits the count of packets, while RED also permits
controlling the number of bytes per flow, and adds ECN capability as well.
1) We don't handle idle time management in this RED implementation,
since each 'new flow' begins with a null qavg. We really want to address
backlogged flows.
2) If headdrop is selected, we try to ECN-mark the first packet instead of
the currently enqueued packet. This gives faster feedback for TCP flows
compared to traditional RED [ marking the last packet in queue ]
Example of use :
tc qdisc add dev $DEV parent 1:1 handle 10: est 1sec 4sec sfq \
limit 3000 headdrop flows 512 divisor 16384 \
redflowlimit 100000 min 8000 max 60000 probability 0.20 ecn
qdisc sfq 10: parent 1:1 limit 3000p quantum 1514b depth 127 headdrop
flows 512/16384 divisor 16384
ewma 6 min 8000b max 60000b probability 0.2 ecn
prob_mark 0 prob_mark_head 4876 prob_drop 6131
forced_mark 0 forced_mark_head 0 forced_drop 0
Sent 1175211782 bytes 777537 pkt (dropped 6131, overlimits 11007
requeues 0)
rate 99483Kbit 8219pps backlog 689392b 456p requeues 0
In this test, with 64 netperf TCP_STREAM sessions, 50% using ECN enabled
flows, we can see number of packets CE marked is smaller than number of
drops (for non ECN flows)
If same test is run, without RED, we can check backlog is much bigger.
qdisc sfq 10: parent 1:1 limit 3000p quantum 1514b depth 127 headdrop
flows 512/16384 divisor 16384
Sent 1148683617 bytes 795006 pkt (dropped 0, overlimits 0 requeues 0)
rate 98429Kbit 8521pps backlog 1221290b 841p requeues 0
Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
CC: Stephen Hemminger <shemminger@vyatta.com>
CC: Dave Taht <dave.taht@gmail.com>
---
include/linux/pkt_sched.h | 20 ++++
include/net/red.h | 3
net/sched/sch_sfq.c | 146 ++++++++++++++++++++++++++++++++----
3 files changed, 152 insertions(+), 17 deletions(-)
diff --git a/include/linux/pkt_sched.h b/include/linux/pkt_sched.h
index 8f1b928..0d5b793 100644
--- a/include/linux/pkt_sched.h
+++ b/include/linux/pkt_sched.h
@@ -162,10 +162,30 @@ struct tc_sfq_qopt {
unsigned flows; /* Maximal number of flows */
};
+struct tc_sfqred_stats {
+ __u32 prob_drop; /* Early drops, below max threshold */
+ __u32 forced_drop; /* Early drops, after max threshold */
+ __u32 prob_mark; /* Marked packets, below max threshold */
+ __u32 forced_mark; /* Marked packets, after max threshold */
+ __u32 prob_mark_head; /* Marked packets, below max threshold */
+ __u32 forced_mark_head;/* Marked packets, after max threshold */
+};
+
struct tc_sfq_qopt_v1 {
struct tc_sfq_qopt v0;
unsigned int depth; /* max number of packets per flow */
unsigned int headdrop;
+/* SFQRED parameters */
+ __u32 limit; /* HARD maximal flow queue length (bytes) */
+ __u32 qth_min; /* Min average length threshold (bytes) */
+ __u32 qth_max; /* Max average length threshold (bytes) */
+ unsigned char Wlog; /* log(W) */
+ unsigned char Plog; /* log(P_max/(qth_max-qth_min)) */
+ unsigned char Scell_log; /* cell size for idle damping */
+ unsigned char flags;
+ __u32 max_P; /* probability, high resolution */
+/* SFQRED stats */
+ struct tc_sfqred_stats stats;
};
diff --git a/include/net/red.h b/include/net/red.h
index baab385..28068ec 100644
--- a/include/net/red.h
+++ b/include/net/red.h
@@ -199,7 +199,8 @@ static inline void red_set_parms(struct red_parms *p,
p->Scell_log = Scell_log;
p->Scell_max = (255 << Scell_log);
- memcpy(p->Stab, stab, sizeof(p->Stab));
+ if (stab)
+ memcpy(p->Stab, stab, sizeof(p->Stab));
}
static inline int red_is_idling(const struct red_vars *v)
diff --git a/net/sched/sch_sfq.c b/net/sched/sch_sfq.c
index 0a79640..67494ae 100644
--- a/net/sched/sch_sfq.c
+++ b/net/sched/sch_sfq.c
@@ -24,6 +24,7 @@
#include <net/netlink.h>
#include <net/pkt_sched.h>
#include <net/flow_keys.h>
+#include <net/red.h>
/* Stochastic Fairness Queuing algorithm.
@@ -108,24 +109,30 @@ struct sfq_slot {
struct sfq_head dep; /* anchor in dep[] chains */
unsigned short hash; /* hash value (index in ht[]) */
short allot; /* credit for this slot */
+
+ unsigned int backlog;
+ struct red_vars vars;
};
struct sfq_sched_data {
/* frequently used fields */
int limit; /* limit of total number of packets in this qdisc */
unsigned int divisor; /* number of slots in hash table */
- unsigned int maxflows; /* number of flows in flows array */
- int headdrop;
- int maxdepth; /* limit of packets per flow */
+ u8 headdrop;
+ u8 maxdepth; /* limit of packets per flow */
u32 perturbation;
- struct tcf_proto *filter_list;
- sfq_index cur_depth; /* depth of longest slot */
+ u8 cur_depth; /* depth of longest slot */
+ u8 flags;
unsigned short scaled_quantum; /* SFQ_ALLOT_SIZE(quantum) */
- struct sfq_slot *tail; /* current slot in round */
+ struct tcf_proto *filter_list;
sfq_index *ht; /* Hash table ('divisor' slots) */
struct sfq_slot *slots; /* Flows table ('maxflows' entries) */
+ struct red_parms *red_parms;
+ struct tc_sfqred_stats stats;
+ struct sfq_slot *tail; /* current slot in round */
+
struct sfq_head dep[SFQ_MAX_DEPTH + 1];
/* Linked lists of slots, indexed by depth
* dep[0] : list of unused flows
@@ -133,6 +140,7 @@ struct sfq_sched_data {
* dep[X] : list of flows with X packets
*/
+ unsigned int maxflows; /* number of flows in flows array */
int perturb_period;
unsigned int quantum; /* Allotment per round: MUST BE >= MTU */
struct timer_list perturb_timer;
@@ -321,6 +329,7 @@ static unsigned int sfq_drop(struct Qdisc *sch)
drop:
skb = q->headdrop ? slot_dequeue_head(slot) : slot_dequeue_tail(slot);
len = qdisc_pkt_len(skb);
+ slot->backlog -= len;
sfq_dec(q, x);
kfree_skb(skb);
sch->q.qlen--;
@@ -341,6 +350,23 @@ drop:
return 0;
}
+/* Is ECN parameter configured */
+static int sfq_prob_mark(const struct sfq_sched_data *q)
+{
+ return q->flags & TC_RED_ECN;
+}
+
+/* Should packets over max threshold just be marked */
+static int sfq_hard_mark(const struct sfq_sched_data *q)
+{
+ return (q->flags & (TC_RED_ECN | TC_RED_HARDDROP)) == TC_RED_ECN;
+}
+
+static int sfq_headdrop(const struct sfq_sched_data *q)
+{
+ return q->headdrop;
+}
+
static int
sfq_enqueue(struct sk_buff *skb, struct Qdisc *sch)
{
@@ -349,6 +375,8 @@ sfq_enqueue(struct sk_buff *skb, struct Qdisc *sch)
sfq_index x, qlen;
struct sfq_slot *slot;
int uninitialized_var(ret);
+ struct sk_buff *head;
+ int delta;
hash = sfq_classify(skb, sch, &ret);
if (hash == 0) {
@@ -368,24 +396,75 @@ sfq_enqueue(struct sk_buff *skb, struct Qdisc *sch)
q->ht[hash] = x;
slot = &q->slots[x];
slot->hash = hash;
+ slot->backlog = 0; /* should already be 0 anyway... */
+ red_set_vars(&slot->vars);
+ goto enqueue;
}
+ if (q->red_parms) {
+ slot->vars.qavg = red_calc_qavg_no_idle_time(q->red_parms,
+ &slot->vars,
+ slot->backlog);
+ switch (red_action(q->red_parms,
+ &slot->vars,
+ slot->vars.qavg)) {
+ case RED_DONT_MARK:
+ break;
- if (slot->qlen >= q->maxdepth) {
- struct sk_buff *head;
+ case RED_PROB_MARK:
+ sch->qstats.overlimits++;
+ if (sfq_prob_mark(q)) {
+ /* We know we have at least one packet in queue */
+ if (sfq_headdrop(q) &&
+ INET_ECN_set_ce(slot->skblist_next)) {
+ q->stats.prob_mark_head++;
+ break;
+ }
+ if (INET_ECN_set_ce(skb)) {
+ q->stats.prob_mark++;
+ break;
+ }
+ }
+ q->stats.prob_drop++;
+ goto congestion_drop;
+
+ case RED_HARD_MARK:
+ sch->qstats.overlimits++;
+ if (sfq_hard_mark(q)) {
+ /* We know we have at least one packet in queue */
+ if (sfq_headdrop(q) &&
+ INET_ECN_set_ce(slot->skblist_next)) {
+ q->stats.forced_mark_head++;
+ break;
+ }
+ if (INET_ECN_set_ce(skb)) {
+ q->stats.forced_mark++;
+ break;
+ }
+ }
+ q->stats.forced_drop++;
+ goto congestion_drop;
+ }
+ }
- if (!q->headdrop)
+ if (slot->qlen >= q->maxdepth) {
+congestion_drop:
+ if (!sfq_headdrop(q))
return qdisc_drop(skb, sch);
+ /* We know we have at least one packet in queue */
head = slot_dequeue_head(slot);
- sch->qstats.backlog -= qdisc_pkt_len(head);
+ delta = qdisc_pkt_len(head) - qdisc_pkt_len(skb);
+ sch->qstats.backlog -= delta;
+ slot->backlog -= delta;
qdisc_drop(head, sch);
- sch->qstats.backlog += qdisc_pkt_len(skb);
slot_queue_add(slot, skb);
return NET_XMIT_CN;
}
+enqueue:
sch->qstats.backlog += qdisc_pkt_len(skb);
+ slot->backlog += qdisc_pkt_len(skb);
slot_queue_add(slot, skb);
sfq_inc(q, x);
if (slot->qlen == 1) { /* The flow is new */
@@ -396,6 +475,7 @@ sfq_enqueue(struct sk_buff *skb, struct Qdisc *sch)
slot->next = q->tail->next;
q->tail->next = x;
}
+ /* We could use a bigger initial quantum for new flows */
slot->allot = q->scaled_quantum;
}
if (++sch->q.qlen <= q->limit)
@@ -439,7 +519,7 @@ next_slot:
qdisc_bstats_update(sch, skb);
sch->q.qlen--;
sch->qstats.backlog -= qdisc_pkt_len(skb);
-
+ slot->backlog -= qdisc_pkt_len(skb);
/* Is the slot empty? */
if (slot->qlen == 0) {
q->ht[slot->hash] = SFQ_EMPTY_SLOT;
@@ -490,6 +570,8 @@ static void sfq_rehash(struct Qdisc *sch)
sfq_dec(q, i);
__skb_queue_tail(&list, skb);
}
+ slot->backlog = 0;
+ red_set_vars(&slot->vars);
q->ht[slot->hash] = SFQ_EMPTY_SLOT;
}
q->tail = NULL;
@@ -514,6 +596,11 @@ drop: sch->qstats.backlog -= qdisc_pkt_len(skb);
if (slot->qlen >= q->maxdepth)
goto drop;
slot_queue_add(slot, skb);
+ if (q->red_parms)
+ slot->vars.qavg = red_calc_qavg(q->red_parms,
+ &slot->vars,
+ slot->backlog);
+ slot->backlog += qdisc_pkt_len(skb);
sfq_inc(q, x);
if (slot->qlen == 1) { /* The flow is new */
if (q->tail == NULL) { /* It is the first flow */
@@ -552,6 +639,7 @@ static int sfq_change(struct Qdisc *sch, struct nlattr *opt)
struct tc_sfq_qopt *ctl = nla_data(opt);
struct tc_sfq_qopt_v1 *ctl_v1 = NULL;
unsigned int qlen;
+ struct red_parms *p = NULL;
if (opt->nla_len < nla_attr_size(sizeof(*ctl)))
return -EINVAL;
@@ -560,7 +648,11 @@ static int sfq_change(struct Qdisc *sch, struct nlattr *opt)
if (ctl->divisor &&
(!is_power_of_2(ctl->divisor) || ctl->divisor > 65536))
return -EINVAL;
-
+ if (ctl_v1 && ctl_v1->qth_min) {
+ p = kmalloc(sizeof(*p), GFP_KERNEL);
+ if (!p)
+ return -ENOMEM;
+ }
sch_tree_lock(sch);
if (ctl->quantum) {
q->quantum = ctl->quantum;
@@ -576,6 +668,16 @@ static int sfq_change(struct Qdisc *sch, struct nlattr *opt)
if (ctl_v1) {
if (ctl_v1->depth)
q->maxdepth = min_t(u32, ctl_v1->depth, SFQ_MAX_DEPTH);
+ if (p) {
+ swap(q->red_parms, p);
+ red_set_parms(q->red_parms,
+ ctl_v1->qth_min, ctl_v1->qth_max,
+ ctl_v1->Wlog,
+ ctl_v1->Plog, ctl_v1->Scell_log,
+ NULL,
+ ctl_v1->max_P);
+ }
+ q->flags = ctl_v1->flags;
q->headdrop = ctl_v1->headdrop;
}
if (ctl->limit) {
@@ -594,6 +696,7 @@ static int sfq_change(struct Qdisc *sch, struct nlattr *opt)
q->perturbation = net_random();
}
sch_tree_unlock(sch);
+ kfree(p);
return 0;
}
@@ -625,6 +728,7 @@ static void sfq_destroy(struct Qdisc *sch)
del_timer_sync(&q->perturb_timer);
sfq_free(q->ht);
sfq_free(q->slots);
+ kfree(q->red_parms);
}
static int sfq_init(struct Qdisc *sch, struct nlattr *opt)
@@ -683,6 +787,7 @@ static int sfq_dump(struct Qdisc *sch, struct sk_buff *skb)
struct sfq_sched_data *q = qdisc_priv(sch);
unsigned char *b = skb_tail_pointer(skb);
struct tc_sfq_qopt_v1 opt;
+ struct red_parms *p = q->red_parms;
memset(&opt, 0, sizeof(opt));
opt.v0.quantum = q->quantum;
@@ -693,6 +798,17 @@ static int sfq_dump(struct Qdisc *sch, struct sk_buff *skb)
opt.depth = q->maxdepth;
opt.headdrop = q->headdrop;
+ if (p) {
+ opt.qth_min = p->qth_min >> p->Wlog;
+ opt.qth_max = p->qth_max >> p->Wlog;
+ opt.Wlog = p->Wlog;
+ opt.Plog = p->Plog;
+ opt.Scell_log = p->Scell_log;
+ opt.max_P = p->max_P;
+ }
+ memcpy(&opt.stats, &q->stats, sizeof(opt.stats));
+ opt.flags = q->flags;
+
NLA_PUT(skb, TCA_OPTIONS, sizeof(opt), &opt);
return skb->len;
@@ -747,15 +863,13 @@ static int sfq_dump_class_stats(struct Qdisc *sch, unsigned long cl,
sfq_index idx = q->ht[cl - 1];
struct gnet_stats_queue qs = { 0 };
struct tc_sfq_xstats xstats = { 0 };
- struct sk_buff *skb;
if (idx != SFQ_EMPTY_SLOT) {
const struct sfq_slot *slot = &q->slots[idx];
xstats.allot = slot->allot << SFQ_ALLOT_SHIFT;
qs.qlen = slot->qlen;
- slot_queue_walk(slot, skb)
- qs.backlog += qdisc_pkt_len(skb);
+ qs.backlog = slot->backlog;
}
if (gnet_stats_copy_queue(d, &qs) < 0)
return -1;
^ permalink raw reply related [flat|nested] 18+ messages in thread
* Re: [PATCH] net_sched: sfq: add optional RED on top of SFQ
2012-01-06 16:31 ` [PATCH] net_sched: sfq: add optional RED on top of SFQ Eric Dumazet
@ 2012-01-06 16:56 ` Dave Taht
2012-01-06 17:07 ` Eric Dumazet
2012-01-06 17:09 ` Stephen Hemminger
` (2 subsequent siblings)
3 siblings, 1 reply; 18+ messages in thread
From: Dave Taht @ 2012-01-06 16:56 UTC (permalink / raw)
To: Eric Dumazet
Cc: David Miller, netdev, Stephen Hemminger, Kathleen Nichols,
Jim Gettys
On Fri, Jan 6, 2012 at 5:31 PM, Eric Dumazet <eric.dumazet@gmail.com> wrote:
> Adds an optional Random Early Detection on each SFQ flow queue.
netperf -t TCP_RR is useful
-t TCP_MAERTS will be interesting.
simultaneous ping?
> Traditional SFQ limits the count of packets, while RED also permits
> controlling the number of bytes per flow, and adds ECN capability as well.
>
> 1) We don't handle idle time management in this RED implementation,
> since each 'new flow' begins with a null qavg. We really want to address
> backlogged flows.
>
> 2) If headdrop is selected, we try to ECN-mark the first packet instead of
> the currently enqueued packet. This gives faster feedback for TCP flows
> compared to traditional RED [ marking the last packet in queue ]
>
> Example of use :
>
> tc qdisc add dev $DEV parent 1:1 handle 10: est 1sec 4sec sfq \
> limit 3000 headdrop flows 512 divisor 16384 \
> redflowlimit 100000 min 8000 max 60000 probability 0.20 ecn
>
> qdisc sfq 10: parent 1:1 limit 3000p quantum 1514b depth 127 headdrop
> flows 512/16384 divisor 16384
> ewma 6 min 8000b max 60000b probability 0.2 ecn
> prob_mark 0 prob_mark_head 4876 prob_drop 6131
> forced_mark 0 forced_mark_head 0 forced_drop 0
> Sent 1175211782 bytes 777537 pkt (dropped 6131, overlimits 11007
> requeues 0)
> rate 99483Kbit 8219pps backlog 689392b 456p requeues 0
>
> In this test, with 64 netperf TCP_STREAM sessions, 50% using ECN enabled
> flows, we can see number of packets CE marked is smaller than number of
> drops (for non ECN flows)
>
> If same test is run, without RED, we can check backlog is much bigger.
>
> qdisc sfq 10: parent 1:1 limit 3000p quantum 1514b depth 127 headdrop
> flows 512/16384 divisor 16384
> Sent 1148683617 bytes 795006 pkt (dropped 0, overlimits 0 requeues 0)
> rate 98429Kbit 8521pps backlog 1221290b 841p requeues 0
>
>
> Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
> CC: Stephen Hemminger <shemminger@vyatta.com>
> CC: Dave Taht <dave.taht@gmail.com>
> [full patch quoted verbatim - snipped]
--
Dave Täht
SKYPE: davetaht
US Tel: 1-239-829-5608
FR Tel: 0638645374
http://www.bufferbloat.net
^ permalink raw reply [flat|nested] 18+ messages in thread
* Re: [PATCH] net_sched: sfq: add optional RED on top of SFQ
2012-01-06 16:56 ` Dave Taht
@ 2012-01-06 17:07 ` Eric Dumazet
2012-01-06 17:36 ` Dave Taht
2012-01-06 18:30 ` Rick Jones
0 siblings, 2 replies; 18+ messages in thread
From: Eric Dumazet @ 2012-01-06 17:07 UTC (permalink / raw)
To: Dave Taht
Cc: David Miller, netdev, Stephen Hemminger, Kathleen Nichols,
Jim Gettys
On Friday 06 January 2012 at 17:56 +0100, Dave Taht wrote:
> On Fri, Jan 6, 2012 at 5:31 PM, Eric Dumazet <eric.dumazet@gmail.com> wrote:
> > Adds an optional Random Early Detection on each SFQ flow queue.
>
> netperf -t TCP_RR is useful
> -t TCP_MAERTS will be interesting.
> simultaneous ping?
>
I don't know what you expect from pings, since they already come in as
new flows (unless a ping flood is in effect), so RED doesn't fire for
these packets.
Same for TCP_RR: since at most one packet is in flight per flow, RED
cannot fire.
So there is no difference for them.
^ permalink raw reply [flat|nested] 18+ messages in thread
* Re: [PATCH] net_sched: sfq: add optional RED on top of SFQ
2012-01-06 16:31 ` [PATCH] net_sched: sfq: add optional RED on top of SFQ Eric Dumazet
2012-01-06 16:56 ` Dave Taht
@ 2012-01-06 17:09 ` Stephen Hemminger
2012-01-06 17:25 ` Eric Dumazet
2012-01-10 9:40 ` Dave Taht
2012-01-13 4:06 ` David Miller
3 siblings, 1 reply; 18+ messages in thread
From: Stephen Hemminger @ 2012-01-06 17:09 UTC (permalink / raw)
To: Eric Dumazet; +Cc: David Miller, netdev, Dave Taht
On Fri, 06 Jan 2012 17:31:44 +0100
Eric Dumazet <eric.dumazet@gmail.com> wrote:
> Adds an optional Random Early Detection on each SFQ flow queue.
>
> Traditional SFQ limits the count of packets, while RED also permits
> controlling the number of bytes per flow, and adds ECN capability as well.
>
> 1) We don't handle idle time management in this RED implementation,
> since each 'new flow' begins with a null qavg. We really want to address
> backlogged flows.
>
> 2) If headdrop is selected, we try to ECN-mark the first packet instead of
> the currently enqueued packet. This gives faster feedback for TCP flows
> compared to traditional RED [ marking the last packet in queue ]
>
> Example of use :
>
> tc qdisc add dev $DEV parent 1:1 handle 10: est 1sec 4sec sfq \
> limit 3000 headdrop flows 512 divisor 16384 \
> redflowlimit 100000 min 8000 max 60000 probability 0.20 ecn
>
> qdisc sfq 10: parent 1:1 limit 3000p quantum 1514b depth 127 headdrop
> flows 512/16384 divisor 16384
> ewma 6 min 8000b max 60000b probability 0.2 ecn
> prob_mark 0 prob_mark_head 4876 prob_drop 6131
> forced_mark 0 forced_mark_head 0 forced_drop 0
> Sent 1175211782 bytes 777537 pkt (dropped 6131, overlimits 11007
> requeues 0)
> rate 99483Kbit 8219pps backlog 689392b 456p requeues 0
>
> In this test, with 64 netperf TCP_STREAM sessions, 50% using ECN enabled
> flows, we can see number of packets CE marked is smaller than number of
> drops (for non ECN flows)
>
> If same test is run, without RED, we can check backlog is much bigger.
>
> qdisc sfq 10: parent 1:1 limit 3000p quantum 1514b depth 127 headdrop
> flows 512/16384 divisor 16384
> Sent 1148683617 bytes 795006 pkt (dropped 0, overlimits 0 requeues 0)
> rate 98429Kbit 8521pps backlog 1221290b 841p requeues 0
>
>
> Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
> CC: Stephen Hemminger <shemminger@vyatta.com>
> CC: Dave Taht <dave.taht@gmail.com>
Since SFQ is classful, I don't see what this buys over just putting
a red qdisc under each SFQ class?
^ permalink raw reply [flat|nested] 18+ messages in thread
* Re: [PATCH] net_sched: sfq: add optional RED on top of SFQ
2012-01-06 17:09 ` Stephen Hemminger
@ 2012-01-06 17:25 ` Eric Dumazet
0 siblings, 0 replies; 18+ messages in thread
From: Eric Dumazet @ 2012-01-06 17:25 UTC (permalink / raw)
To: Stephen Hemminger; +Cc: David Miller, netdev, Dave Taht
On Friday 06 January 2012 at 09:09 -0800, Stephen Hemminger wrote:
> Since SFQ is classful, I don't see what this buys over just putting
> a red qdisc under each SFQ class?
>
You meant RED is classful, not SFQ?
The idea is to use RED on each flow, not on the global SFQ, since, as you
pointed out, you can already do that right now (and it has no interest).
You can have this right now using QFQ and, for each QFQ class, attaching
a RED qdisc, but the memory cost and setup time are crazy.
Please note that RED can't perform head mark or drop (unless we add new
qdisc ops for this).
We can do it from SFQ because we do our own queue management.
^ permalink raw reply [flat|nested] 18+ messages in thread
* Re: [PATCH] net_sched: sfq: add optional RED on top of SFQ
2012-01-06 17:07 ` Eric Dumazet
@ 2012-01-06 17:36 ` Dave Taht
2012-01-06 18:30 ` Rick Jones
1 sibling, 0 replies; 18+ messages in thread
From: Dave Taht @ 2012-01-06 17:36 UTC (permalink / raw)
To: Eric Dumazet
Cc: David Miller, netdev, Stephen Hemminger, Kathleen Nichols,
Jim Gettys
On Fri, Jan 6, 2012 at 6:07 PM, Eric Dumazet <eric.dumazet@gmail.com> wrote:
> On Friday, 06 January 2012 at 17:56 +0100, Dave Taht wrote:
>> On Fri, Jan 6, 2012 at 5:31 PM, Eric Dumazet <eric.dumazet@gmail.com> wrote:
>> > Adds an optional Random Early Detection on each SFQ flow queue.
>>
>> netperf -t TCP_RR is useful
>> -t TCP_MAERTS will be interesting.
>> simultaneous ping?
>>
>
> I don't know what you expect from pings, since they already come in as
> new flows (unless a ping flood is in effect), so RED doesn't fire for
> these packets.
>
> The same goes for TCP_RR: since at most one packet is in flight per flow,
> RED cannot fire.
>
> So there is no difference for them.
Your environment is rather different from mine, and I like having
baseline numbers around... but I can live without them, as I was hovering
over the build button in my BQL tree, and I'm building now.
Up until the world changed a few days ago, those two tests did matter when
comparing results against QFQ and various combinations of its sub-qdiscs.
I'm at the point now where I have to do CDF plots against large data sets
in order to see *anything*, and I'm looking forward to seeing how well this
does against wireless, which is really noisy...
TCP_MAERTS tests activity in the other direction and (again, your
environment is different) does interesting things against byte-oriented
RED, or rather 'did' when I last checked several eons (months) ago, which
was prior to the avqueue fixes.
Another excellent test is the 'apache benchmark', ab, while under a
saturating load from something else.
At some point I'd like to try iobench against NFS and Samba, and even
iSCSI against multiple initiators...
--
Dave Täht
SKYPE: davetaht
US Tel: 1-239-829-5608
FR Tel: 0638645374
http://www.bufferbloat.net
* Re: [PATCH] net_sched: sfq: add optional RED on top of SFQ
2012-01-06 17:07 ` Eric Dumazet
2012-01-06 17:36 ` Dave Taht
@ 2012-01-06 18:30 ` Rick Jones
2012-01-06 19:33 ` Eric Dumazet
1 sibling, 1 reply; 18+ messages in thread
From: Rick Jones @ 2012-01-06 18:30 UTC (permalink / raw)
To: Eric Dumazet
Cc: Dave Taht, David Miller, netdev, Stephen Hemminger,
Kathleen Nichols, Jim Gettys
On 01/06/2012 09:07 AM, Eric Dumazet wrote:
> On Friday, 06 January 2012 at 17:56 +0100, Dave Taht wrote:
>> On Fri, Jan 6, 2012 at 5:31 PM, Eric Dumazet<eric.dumazet@gmail.com> wrote:
>>> Adds an optional Random Early Detection on each SFQ flow queue.
>>
>> netperf -t TCP_RR is useful
>> -t TCP_MAERTS will be interesting.
>> simultaneous ping?
>>
>
> I don't know what you expect from pings, since they already come in as
> new flows (unless a ping flood is in effect), so RED doesn't fire for
> these packets.
>
> The same goes for TCP_RR: since at most one packet is in flight per flow,
> RED cannot fire.
netperf nitpick :) While I doubt that Dave Taht is running it that way,
one can have multiple requests in flight on a single _RR test via the
test-specific -b <additionaltrans> option. That option is enabled by
default (--enable-burst on the configure) in 2.5.0 and later.
netperf -t TCP_RR ... -- -b 1
will cause netperf to have two transactions in flight at one time, -b 3
will have four, and so on (actually it "slow-starts" up to that level).
This can also be (ab)used to implement a single-connection,
bi-directional throughput test such as I have in my "runemomni" scripts
under doc/examples. For example:
netperf -t TCP_RR ... -- -s 1M -S 1M -r 64K -b 12
As there is no select() or poll() call in the path, it is best to ensure
that the SO_SNDBUF size on either end is large enough to hold
[request|response]_size*simultaneous_trans at one time.
Depending on the size of the requests/responses one may want to add the
test-specific -D option to set TCP_NODELAY.
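Putting those pieces together, a single-connection bi-directional test with
TCP_NODELAY could look something like this (host name and sizes are only
illustrative):

  netperf -H remotehost -t TCP_RR -l 30 -- -s 1M -S 1M -r 64K -b 12 -D

With -b 12 there are up to 13 transactions outstanding, i.e. roughly
13 * 64KB = 832KB per direction, which fits within the 1M socket buffers
as suggested above.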
happy benchmarking,
rick jones
* Re: [PATCH] net_sched: sfq: add optional RED on top of SFQ
2012-01-06 18:30 ` Rick Jones
@ 2012-01-06 19:33 ` Eric Dumazet
2012-01-06 19:43 ` Rick Jones
0 siblings, 1 reply; 18+ messages in thread
From: Eric Dumazet @ 2012-01-06 19:33 UTC (permalink / raw)
To: Rick Jones
Cc: Dave Taht, David Miller, netdev, Stephen Hemminger,
Kathleen Nichols, Jim Gettys
On Friday, 06 January 2012 at 10:30 -0800, Rick Jones wrote:
> netperf nitpick :) While I doubt that Dave Taht is running it that way,
> one can have multiple requests in flight on a single _RR test via the
> test-specific -b <additionaltrans> option. That option is enabled by
> default (--enable-burst on the configure) in 2.5.0 and later.
Ah Rick, I don't think we can tune IP_TOS with netperf -t UDP_{STREAM|RR}?
I ask because it could be a good thing to set ECT(0) on datagrams to
check our ECN capabilities, and to get in the final report from the receiver
a count/percentage of CE-marked frames.
* Re: [PATCH] net_sched: sfq: add optional RED on top of SFQ
2012-01-06 19:33 ` Eric Dumazet
@ 2012-01-06 19:43 ` Rick Jones
2012-01-06 20:26 ` Dave Taht
0 siblings, 1 reply; 18+ messages in thread
From: Rick Jones @ 2012-01-06 19:43 UTC (permalink / raw)
To: Eric Dumazet
Cc: Dave Taht, David Miller, netdev, Stephen Hemminger,
Kathleen Nichols, Jim Gettys
On 01/06/2012 11:33 AM, Eric Dumazet wrote:
> On Friday, 06 January 2012 at 10:30 -0800, Rick Jones wrote:
>
>> netperf nitpick :) While I doubt that Dave Taht is running it that way,
>> one can have multiple requests in flight on a single _RR test via the
>> test-specific -b<additionaltrans> option. That option is enabled by
>> default (--enable-burst on the configure) in 2.5.0 and later.
>
> Ah Rick, I don't think we can tune IP_TOS with netperf -t UDP_{STREAM|RR}?
>
> I ask because it could be a good thing to set ECT(0) on datagrams to
> check our ECN capabilities, and to get in the final report from the receiver
> a count/percentage of CE-marked frames.
Funny you should mention that :) In the top-of-trunk (perhaps it is in
2.5.0 too, I do not recall) there is the global -Y option:
$ src/netperf -Y
src/netperf: option requires an argument -- 'Y'
Usage: netperf [global options] -- [test options]
Global options:
...
-y local,remote Set the socket priority
-Y local,remote Set the IP_TOS. Use hexadecimal.
So long as you either use the omni code directly, or indirectly by not
undoing WANT_MIGRATION, those should work - for some definition of "work",
anyway... I would not be surprised to learn there are bugs in the support.
However, there is nothing presently in the netperf code to cause any
*individual* send to be so marked independently of the others.
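So, if the goal is simply to send every datagram of a UDP test as ECT(0),
something like this should do it, assuming -Y takes the full TOS byte
including the ECN bits (ECT(0) with DSCP 0 is 0x02; untested):

  netperf -H remotehost -t UDP_STREAM -Y 0x02,0x02 -- -m 1400

Counting the CE-marked frames at the receiver would, as far as I know, still
have to be done outside netperf (tcpdump, or the qdisc statistics).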
happy benchmarking,
rick jones
* Re: [PATCH] net_sched: sfq: add optional RED on top of SFQ
2012-01-06 19:43 ` Rick Jones
@ 2012-01-06 20:26 ` Dave Taht
0 siblings, 0 replies; 18+ messages in thread
From: Dave Taht @ 2012-01-06 20:26 UTC (permalink / raw)
To: Rick Jones
Cc: Eric Dumazet, David Miller, netdev, Stephen Hemminger,
Kathleen Nichols, Jim Gettys
On Fri, Jan 6, 2012 at 8:43 PM, Rick Jones <rick.jones2@hp.com> wrote:
> On 01/06/2012 11:33 AM, Eric Dumazet wrote:
>>
>> On Friday, 06 January 2012 at 10:30 -0800, Rick Jones wrote:
>>
>>> netperf nitpick :) While I doubt that Dave Taht is running it that way,
>>> one can have multiple requests in flight on a single _RR test via the
>>> test-specific -b<additionaltrans> option. That option is enabled by
>>> default (--enable-burst on the configure) in 2.5.0 and later.
>>
>>
>> Ah Rick, I don't think we can tune IP_TOS with netperf -t UDP_{STREAM|RR}?
>>
>> I ask because it could be a good thing to set ECT(0) on datagrams to
>> check our ECN capabilities, and to get in the final report from the receiver
>> a count/percentage of CE-marked frames.
For apps other than netperf, being able to see this stuff in iptables
(ip6tables) might be helpful. Did this kernel patch series make it in?
http://comments.gmane.org/gmane.comp.security.firewalls.netfilter.devel/41139
>
> Funny you should mention that :) In the top-of-trunk (perhaps it is in
> 2.5.0 too, I do not recall) there is the global -Y option:
>
> $ src/netperf -Y
> src/netperf: option requires an argument -- 'Y'
>
> Usage: netperf [global options] -- [test options]
>
> Global options:
> ...
> -y local,remote Set the socket priority
> -Y local,remote Set the IP_TOS. Use hexadecimal.
>
> So long as you either use the omni code directly, or indirectly by not
> undoing WANT_MIGRATION, those should work - for some definition of "work",
> anyway... I would not be surprised to learn there are bugs in the support.
Also in the top of the netperf trunk is a mode for exercising different TCP
congestion control algorithms. I was mostly fiddling with westwood, tcp-lp
and tcp-ledbat v1 ( https://github.com/silviov/TCP-LEDBAT ).
" The output selectors are LOCAL_CONG_CONTROL and
REMOTE_CONG_CONTROL and setting is via the test-specific -K option."
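In other words, something like the following, if I have the omni syntax
right (untested, and the chosen algorithm has to be available on the machine
it is being set on):

  netperf -H remotehost -t TCP_STREAM -- -K westwood \
      -o THROUGHPUT,LOCAL_CONG_CONTROL,REMOTE_CONG_CONTROL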
I also have an rsync patch for priority, diffserv and congestion control
floating about...
I'm a little vague as to whether your related kernel patch for inheriting
the congestion algorithm made it in?
I note that netperf trunk breaks backward compatibility with
netperf 2.5.
> However, there is nothing presently in the netperf code to cause any
> *individual* send to be so marked independently of the others.
Heh.
>
> happy benchmarking,
>
> rick jones
--
Dave Täht
SKYPE: davetaht
US Tel: 1-239-829-5608
FR Tel: 0638645374
http://www.bufferbloat.net
* Re: [PATCH] net_sched: sfq: add optional RED on top of SFQ
2012-01-06 16:31 ` [PATCH] net_sched: sfq: add optional RED on top of SFQ Eric Dumazet
2012-01-06 16:56 ` Dave Taht
2012-01-06 17:09 ` Stephen Hemminger
@ 2012-01-10 9:40 ` Dave Taht
2012-01-13 4:06 ` David Miller
3 siblings, 0 replies; 18+ messages in thread
From: Dave Taht @ 2012-01-10 9:40 UTC (permalink / raw)
To: Eric Dumazet; +Cc: David Miller, netdev, Stephen Hemminger
On Fri, Jan 6, 2012 at 5:31 PM, Eric Dumazet <eric.dumazet@gmail.com> wrote:
> Adds an optional Random Early Detection on each SFQ flow queue.
>
> Traditional SFQ limits the count of packets, while RED also permits
> controlling the number of bytes per flow, and adds ECN capability as well.
>
> 1) We don't handle idle time management in this RED implementation,
> since each 'new flow' begins with a null qavg. We really want to address
> backlogged flows.
>
> 2) If headdrop is selected, we try to ECN-mark the first packet instead of
> the currently enqueued packet. This gives faster feedback for TCP flows
> than traditional RED [ which marks the last packet in the queue ].
>
> Example of use :
>
> tc qdisc add dev $DEV parent 1:1 handle 10: est 1sec 4sec sfq \
> limit 3000 headdrop flows 512 divisor 16384 \
> redflowlimit 100000 min 8000 max 60000 probability 0.20 ecn
>
> qdisc sfq 10: parent 1:1 limit 3000p quantum 1514b depth 127 headdrop
> flows 512/16384 divisor 16384
> ewma 6 min 8000b max 60000b probability 0.2 ecn
> prob_mark 0 prob_mark_head 4876 prob_drop 6131
> forced_mark 0 forced_mark_head 0 forced_drop 0
> Sent 1175211782 bytes 777537 pkt (dropped 6131, overlimits 11007
> requeues 0)
> rate 99483Kbit 8219pps backlog 689392b 456p requeues 0
>
> In this test, with 64 netperf TCP_STREAM sessions, 50% of them using
> ECN-enabled flows, we can see that the number of CE-marked packets is
> smaller than the number of drops (for the non-ECN flows).
>
> If the same test is run without RED, we can see that the backlog is much
> bigger.
>
I can confirm that it doesn't crash, and it doesn't appear to do harm. It does
appear to hold queue depths to saner levels and to balance competing streams
really well (tested only with identical RTTs, however), and latecomers
ramp up nicely to compete...
In the packet captures I have, I see TCP fast retransmits, no significant
bursty losses, etc.
In other words, all pretty good behavior.
Configuring RED correctly is still a PITA, but less so now; not that I'm
getting it right below. This was a test at 100Mbit, with BQL=4500, GSO/TSO
off, with 50 iperf streams across 2 routers to another box (all of which
were running the newer SFQ with the head-of-line (HoL) fix in the default
mode).
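The qdisc would have been set up with something along these lines (the
interface name and the redflowlimit value are guesses; the stats output
below does not show the latter):

  tc qdisc add dev eth0 root handle a: sfq limit 300 headdrop divisor 16384 \
      redflowlimit 100000 min 8000 max 60000 probability 0.2 ecn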
qdisc sfq a: root refcnt 2 limit 300p quantum 1514b depth 127 headdrop
divisor 16384
ewma 6 min 8000b max 60000b probability 0.2 ecn
prob_mark 5 prob_mark_head 12863 prob_drop 0
forced_mark 0 forced_mark_head 0 forced_drop 0
Sent 10890212752 bytes 8191225 pkt (dropped 76030, overlimits 12868
requeues 2920968)
rate 41329Kbit 3448pps backlog 442088b 293p requeues 2920968
Ping RTT went from ~0.3 ms unloaded to between 1.6 and 2 ms, and
netperf -t TCP_RR went from ~2000 to ~500.
These two changes are due mostly to the number of packets being buffered
in the driver, and are far better than what pfifo_fast does in all cases...
Months of testing are needed to thoroughly evaluate the effects of this
against things such as bittorrent, VoIP, etc., in a long-RTT environment,
and against more workloads than just the above. It would be good to test
against more reference machines at both higher and lower speeds, with HTB
on, etc., etc.
But as I said, it doesn't crash, it is not on by default, more people
should definitely try it, and in general... it appears to be a big win in
addition to the already huge wins in all the queueing disciplines in 3.3.
With those caveats...
Tested-by: Dave Taht <dave.taht@gmail.com>
--
Dave Täht
SKYPE: davetaht
US Tel: 1-239-829-5608
FR Tel: 0638645374
http://www.bufferbloat.net
* Re: [PATCH] net_sched: sfq: add optional RED on top of SFQ
2012-01-06 16:31 ` [PATCH] net_sched: sfq: add optional RED on top of SFQ Eric Dumazet
` (2 preceding siblings ...)
2012-01-10 9:40 ` Dave Taht
@ 2012-01-13 4:06 ` David Miller
3 siblings, 0 replies; 18+ messages in thread
From: David Miller @ 2012-01-13 4:06 UTC (permalink / raw)
To: eric.dumazet; +Cc: netdev, shemminger, dave.taht
From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Fri, 06 Jan 2012 17:31:44 +0100
> Adds an optional Random Early Detection on each SFQ flow queue.
...
> Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
> CC: Stephen Hemminger <shemminger@vyatta.com>
> CC: Dave Taht <dave.taht@gmail.com>
I've decided to apply this, thanks everyone.
end of thread
Thread overview: 18+ messages
2012-01-05 12:25 [PATCH net-next] net_sched: red: split red_parms into parms and vars Eric Dumazet
2012-01-05 13:03 ` Dave Taht
2012-01-05 13:39 ` Eric Dumazet
2012-01-06 5:47 ` Eric Dumazet
2012-01-06 8:31 ` Dave Taht
2012-01-06 16:31 ` [PATCH] net_sched: sfq: add optional RED on top of SFQ Eric Dumazet
2012-01-06 16:56 ` Dave Taht
2012-01-06 17:07 ` Eric Dumazet
2012-01-06 17:36 ` Dave Taht
2012-01-06 18:30 ` Rick Jones
2012-01-06 19:33 ` Eric Dumazet
2012-01-06 19:43 ` Rick Jones
2012-01-06 20:26 ` Dave Taht
2012-01-06 17:09 ` Stephen Hemminger
2012-01-06 17:25 ` Eric Dumazet
2012-01-10 9:40 ` Dave Taht
2012-01-13 4:06 ` David Miller
2012-01-05 19:08 ` [PATCH net-next] net_sched: red: split red_parms into parms and vars David Miller