* [LARTC] [PATCH] HTB O(1) class lookup
@ 2007-02-01 5:18 Simon Lodal
2007-02-01 6:08 ` [LARTC] " Patrick McHardy
2007-02-01 22:44 ` [LARTC] " Konrad Cempura
0 siblings, 2 replies; 9+ messages in thread
From: Simon Lodal @ 2007-02-01 5:18 UTC (permalink / raw)
To: netdev; +Cc: lartc
[-- Attachment #1: Type: text/plain, Size: 761 bytes --]
This patch changes HTB's class storage from hash+lists to a two-level linear
array, so it can do constant time (O(1)) class lookup by classid. It improves
scalability for large number of classes.
Without the patch, ~14k htb classes can starve a Xeon-3.2 at only 15kpps,
using most of it's cycles traversing lists in htb_find(). The patch
eliminates this problem, and has a measurable impact even with a few hundred
classes.
Previously, scalability could be improved by increasing HTB_HSIZE, modify the
hash function, and recompile, but this patch works for everyone without
recompile and scales better too.
The patch is for 2.6.20-rc6, I have older ones for 2.6.18 and 2.6.19 if anyone
is interested.
Signed-off-by: Simon Lodal <simonl@parknet.dk>
[-- Attachment #2: htb_O(1)_class_lookup.diff --]
[-- Type: text/x-diff, Size: 11212 bytes --]
--- linux-2.6.20-rc6.base/net/sched/sch_htb.c 2007-01-25 03:19:28.000000000 +0100
+++ linux-2.6.20-rc6/net/sched/sch_htb.c 2007-02-01 05:44:36.000000000 +0100
@@ -68,16 +68,37 @@
one less than their parent.
*/
-#define HTB_HSIZE 16 /* classid hash size */
-#define HTB_EWMAC 2 /* rate average over HTB_EWMAC*HTB_HSIZE sec */
-#define HTB_RATECM 1 /* whether to use rate computer */
+#define HTB_MAX_CLS (TC_H_MIN(-1)+1)
+#define HTB_CLS_ARRAY_SIZE PAGE_SIZE
+#define HTB_CLS_PER_ARRAY (HTB_CLS_ARRAY_SIZE / sizeof(void*))
+#define HTB_CLS_ARRAYS (HTB_MAX_CLS / HTB_CLS_PER_ARRAY)
+#define HTB_CLS_ARRAY(CID) (CID / HTB_CLS_PER_ARRAY)
+#define HTB_CLS_INDEX(CID) (CID & (HTB_CLS_PER_ARRAY-1))
+
+
#define HTB_HYSTERESIS 1 /* whether to use mode hysteresis for speedup */
-#define HTB_VER 0x30011 /* major must be matched with number suplied by TC as version */
+#define HTB_VER 0x30012 /* major must be matched with number suplied by TC as version */
#if HTB_VER >> 16 != TC_HTB_PROTOVER
#error "Mismatched sch_htb.c and pkt_sch.h"
#endif
+/* Whether to use rate computer. This is only used for statistics output to
+ userspace (tc -s class show dev ...); if you do not care about that and
+ want the last bit of performance, disable this. */
+#define HTB_RATECM
+#ifdef HTB_RATECM
+/* Time between rate computation updates, in seconds, for each class. */
+#define HTB_RATECM_UPDATE_INTERVAL 16
+/* How many HTB_RATECM_UPDATE_INTERVAL intervals to average over when
+ computing the rate. If set to 1, the computed rate will be exactly the
+ observed rate of the last interval. If set to higher values, the computed
+ rate will converge slower, but also vary less with small, temporary changes
+ in traffic.
+*/
+#define HTB_RATECM_AVERAGE_INTERVALS 2
+#endif
+
/* used internaly to keep status of single class */
enum htb_cmode {
HTB_CANT_SEND, /* class can't send and can't borrow */
@@ -104,7 +125,7 @@
/* topology */
int level; /* our level (see above) */
struct htb_class *parent; /* parent class */
- struct hlist_node hlist; /* classid hash list item */
+ struct list_head clist; /* classid list item */
struct list_head sibling; /* sibling list item */
struct list_head children; /* children list */
@@ -165,9 +186,13 @@
return rate->data[slot];
}
+typedef struct htb_class* htb_cls_array[HTB_CLS_PER_ARRAY];
+
struct htb_sched {
- struct list_head root; /* root classes list */
- struct hlist_head hash[HTB_HSIZE]; /* hashed by classid */
+ struct list_head root; /* root classes list */
+ struct list_head clist; /* all classes list */
+ /* all classes arrays for fast lookup */
+ htb_cls_array* classes[HTB_CLS_ARRAYS];
struct list_head drops[TC_HTB_NUMPRIO];/* active leaves (for drops) */
/* self list - roots of self generating tree */
@@ -198,8 +223,10 @@
psched_time_t now; /* cached dequeue time */
struct timer_list timer; /* send delay timer */
#ifdef HTB_RATECM
- struct timer_list rttim; /* rate computer timer */
- int recmp_bucket; /* which hash bucket to recompute next */
+ struct timer_list rttim;/* rate computer timer */
+ int clscnt; /* total number of classes */
+ struct list_head *rtcur;/* current class to update rate timer for */
+ int rtiter; /* current iteration (1..UPDATE_INTERVAL) */
#endif
/* non shaped skbs; let them go directly thru */
@@ -209,32 +236,22 @@
long direct_pkts;
};
-/* compute hash of size HTB_HSIZE for given handle */
-static inline int htb_hash(u32 h)
-{
-#if HTB_HSIZE != 16
-#error "Declare new hash for your HTB_HSIZE"
-#endif
- h ^= h >> 8; /* stolen from cbq_hash */
- h ^= h >> 4;
- return h & 0xf;
-}
-
-/* find class in global hash table using given handle */
+/* find class in class arrays using given handle */
static inline struct htb_class *htb_find(u32 handle, struct Qdisc *sch)
{
struct htb_sched *q = qdisc_priv(sch);
- struct hlist_node *p;
- struct htb_class *cl;
+ htb_cls_array* a;
+ int minorid;
if (TC_H_MAJ(handle) != sch->handle)
return NULL;
- hlist_for_each_entry(cl, p, q->hash + htb_hash(handle), hlist) {
- if (cl->classid == handle)
- return cl;
- }
- return NULL;
+ minorid = TC_H_MIN(handle);
+ a = q->classes[HTB_CLS_ARRAY(minorid)];
+ if (a == NULL)
+ return NULL;
+
+ return (*a)[HTB_CLS_INDEX(minorid)];
}
/**
@@ -689,13 +706,14 @@
}
#ifdef HTB_RATECM
-#define RT_GEN(D,R) R+=D-(R/HTB_EWMAC);D=0
+#define RT_GEN(R,S) R+=S-(R/HTB_RATECM_AVERAGE_INTERVALS);S=0
+/* recompute rate for approximately clscnt/UPDATE_INTERVAL classes */
static void htb_rate_timer(unsigned long arg)
{
struct Qdisc *sch = (struct Qdisc *)arg;
struct htb_sched *q = qdisc_priv(sch);
- struct hlist_node *p;
struct htb_class *cl;
+ int cnt,done;
/* lock queue so that we can muck with it */
@@ -704,14 +722,35 @@
q->rttim.expires = jiffies + HZ;
add_timer(&q->rttim);
- /* scan and recompute one bucket at time */
- if (++q->recmp_bucket >= HTB_HSIZE)
- q->recmp_bucket = 0;
-
- hlist_for_each_entry(cl,p, q->hash + q->recmp_bucket, hlist) {
- RT_GEN(cl->sum_bytes, cl->rate_bytes);
- RT_GEN(cl->sum_packets, cl->rate_packets);
+ if (q->clscnt == 0)
+ goto done;
+ if (q->rtiter == HTB_RATECM_UPDATE_INTERVAL)
+ q->rtiter = 0;
+ if (q->rtiter == 0)
+ q->rtcur = q->clist.next;
+ q->rtiter++;
+ cnt = ((q->clscnt * q->rtiter) / HTB_RATECM_UPDATE_INTERVAL) -
+ ((q->clscnt * (q->rtiter-1)) / HTB_RATECM_UPDATE_INTERVAL);
+ done = 0;
+ for (;;) {
+ /* end of class list */
+ if (q->rtcur == NULL)
+ break;
+ /* last iteration must continue until end */
+ if ((done == cnt) &&
+ (q->rtiter < HTB_RATECM_UPDATE_INTERVAL)) {
+ break;
+ }
+ cl = list_entry(q->rtcur, struct htb_class, clist);
+ RT_GEN(cl->rate_bytes, cl->sum_bytes);
+ RT_GEN(cl->rate_packets, cl->sum_packets);
+ done++;
+ q->rtcur = q->rtcur->next;
+ if (q->rtcur == &q->clist)
+ q->rtcur = NULL;
}
+
+done:
spin_unlock_bh(&sch->dev->queue_lock);
}
#endif
@@ -1058,23 +1097,19 @@
{
struct htb_sched *q = qdisc_priv(sch);
int i;
+ struct list_head *p;
- for (i = 0; i < HTB_HSIZE; i++) {
- struct hlist_node *p;
- struct htb_class *cl;
-
- hlist_for_each_entry(cl, p, q->hash + i, hlist) {
- if (cl->level)
- memset(&cl->un.inner, 0, sizeof(cl->un.inner));
- else {
- if (cl->un.leaf.q)
- qdisc_reset(cl->un.leaf.q);
- INIT_LIST_HEAD(&cl->un.leaf.drop_list);
- }
- cl->prio_activity = 0;
- cl->cmode = HTB_CAN_SEND;
-
+ list_for_each(p, &q->clist) {
+ struct htb_class *cl = list_entry(p, struct htb_class, clist);
+ if (cl->level)
+ memset(&cl->un.inner, 0, sizeof(cl->un.inner));
+ else {
+ if (cl->un.leaf.q)
+ qdisc_reset(cl->un.leaf.q);
+ INIT_LIST_HEAD(&cl->un.leaf.drop_list);
}
+ cl->prio_activity = 0;
+ cl->cmode = HTB_CAN_SEND;
}
sch->flags &= ~TCQ_F_THROTTLED;
del_timer(&q->timer);
@@ -1109,8 +1144,7 @@
}
INIT_LIST_HEAD(&q->root);
- for (i = 0; i < HTB_HSIZE; i++)
- INIT_HLIST_HEAD(q->hash + i);
+ INIT_LIST_HEAD(&q->clist);
for (i = 0; i < TC_HTB_NUMPRIO; i++)
INIT_LIST_HEAD(q->drops + i);
@@ -1204,8 +1238,10 @@
struct htb_class *cl = (struct htb_class *)arg;
#ifdef HTB_RATECM
- cl->rate_est.bps = cl->rate_bytes / (HTB_EWMAC * HTB_HSIZE);
- cl->rate_est.pps = cl->rate_packets / (HTB_EWMAC * HTB_HSIZE);
+ cl->rate_est.bps = cl->rate_bytes /
+ (HTB_RATECM_UPDATE_INTERVAL * HTB_RATECM_AVERAGE_INTERVALS);
+ cl->rate_est.pps = cl->rate_packets /
+ (HTB_RATECM_UPDATE_INTERVAL * HTB_RATECM_AVERAGE_INTERVALS);
#endif
if (!cl->level && cl->un.leaf.q)
@@ -1310,6 +1346,7 @@
static void htb_destroy_class(struct Qdisc *sch, struct htb_class *cl)
{
struct htb_sched *q = qdisc_priv(sch);
+ int minorid = TC_H_MIN(cl->classid);
if (!cl->level) {
BUG_TRAP(cl->un.leaf.q);
@@ -1325,7 +1362,7 @@
struct htb_class, sibling));
/* note: this delete may happen twice (see htb_delete) */
- hlist_del_init(&cl->hlist);
+ list_del(&cl->clist);
list_del(&cl->sibling);
if (cl->prio_activity)
@@ -1334,6 +1371,7 @@
if (cl->cmode != HTB_CAN_SEND)
htb_safe_rb_erase(&cl->pq_node, q->wait_pq + cl->level);
+ (*q->classes[HTB_CLS_ARRAY(minorid)])[HTB_CLS_INDEX(minorid)] = NULL;
kfree(cl);
}
@@ -1341,6 +1379,7 @@
static void htb_destroy(struct Qdisc *sch)
{
struct htb_sched *q = qdisc_priv(sch);
+ int i;
del_timer_sync(&q->timer);
#ifdef HTB_RATECM
@@ -1355,6 +1394,8 @@
while (!list_empty(&q->root))
htb_destroy_class(sch, list_entry(q->root.next,
struct htb_class, sibling));
+ for (i=0; i<HTB_CLS_ARRAYS; i++)
+ kfree(q->classes[i]);
__skb_queue_purge(&q->direct_queue);
}
@@ -1381,8 +1422,15 @@
sch_tree_lock(sch);
- /* delete from hash and active; remainder in destroy_class */
- hlist_del_init(&cl->hlist);
+ /* delete from class list and active; remainder in destroy_class */
+ if (q->rtcur == &cl->clist) {
+ q->rtcur = q->rtcur->next;
+ if (q->rtcur == &q->clist)
+ q->rtcur = NULL;
+ }
+ if (--q->clscnt == 0)
+ q->rtiter = 0;
+ list_del_init(&cl->clist);
if (!cl->level) {
qlen = cl->un.leaf.q->q.qlen;
@@ -1422,6 +1470,7 @@
struct qdisc_rate_table *rtab = NULL, *ctab = NULL;
struct rtattr *tb[TCA_HTB_RTAB];
struct tc_htb_opt *hopt;
+ int minorid = TC_H_MIN(classid);
/* extract all subattrs from opt attr */
if (!opt || rtattr_parse_nested(tb, TCA_HTB_RTAB, opt) ||
@@ -1453,12 +1502,18 @@
goto failure;
}
err = -ENOBUFS;
+ if (q->classes[HTB_CLS_ARRAY(minorid)] == NULL) {
+ if ((q->classes[HTB_CLS_ARRAY(minorid)] =
+ kzalloc(sizeof(htb_cls_array), GFP_KERNEL))
+ == NULL)
+ goto failure;
+ }
if ((cl = kzalloc(sizeof(*cl), GFP_KERNEL)) == NULL)
goto failure;
cl->refcnt = 1;
+ INIT_LIST_HEAD(&cl->clist);
INIT_LIST_HEAD(&cl->sibling);
- INIT_HLIST_NODE(&cl->hlist);
INIT_LIST_HEAD(&cl->children);
INIT_LIST_HEAD(&cl->un.leaf.drop_list);
RB_CLEAR_NODE(&cl->pq_node);
@@ -1503,10 +1558,12 @@
PSCHED_GET_TIME(cl->t_c);
cl->cmode = HTB_CAN_SEND;
- /* attach to the hash list and parent's family */
- hlist_add_head(&cl->hlist, q->hash + htb_hash(classid));
+ /* attach to the classes list+array and parent's family */
+ list_add_tail(&cl->clist, &q->clist);
list_add_tail(&cl->sibling,
parent ? &parent->children : &q->root);
+ (*q->classes[HTB_CLS_ARRAY(minorid)])[HTB_CLS_INDEX(minorid)] = cl;
+ q->clscnt++;
} else
sch_tree_lock(sch);
@@ -1602,26 +1659,22 @@
static void htb_walk(struct Qdisc *sch, struct qdisc_walker *arg)
{
struct htb_sched *q = qdisc_priv(sch);
- int i;
+ struct list_head *p;
if (arg->stop)
return;
- for (i = 0; i < HTB_HSIZE; i++) {
- struct hlist_node *p;
- struct htb_class *cl;
-
- hlist_for_each_entry(cl, p, q->hash + i, hlist) {
- if (arg->count < arg->skip) {
- arg->count++;
- continue;
- }
- if (arg->fn(sch, (unsigned long)cl, arg) < 0) {
- arg->stop = 1;
- return;
- }
+ list_for_each(p, &q->clist) {
+ struct htb_class *cl = list_entry(p, struct htb_class, clist);
+ if (arg->count < arg->skip) {
arg->count++;
+ continue;
+ }
+ if (arg->fn(sch, (unsigned long)cl, arg) < 0) {
+ arg->stop = 1;
+ return;
}
+ arg->count++;
}
}
[-- Attachment #3: Type: text/plain, Size: 143 bytes --]
_______________________________________________
LARTC mailing list
LARTC@mailman.ds9a.nl
http://mailman.ds9a.nl/cgi-bin/mailman/listinfo/lartc
^ permalink raw reply [flat|nested] 9+ messages in thread* [LARTC] Re: [PATCH] HTB O(1) class lookup
2007-02-01 5:18 [LARTC] [PATCH] HTB O(1) class lookup Simon Lodal
@ 2007-02-01 6:08 ` Patrick McHardy
2007-02-01 7:08 ` Simon Lodal
2007-02-01 22:44 ` [LARTC] " Konrad Cempura
1 sibling, 1 reply; 9+ messages in thread
From: Patrick McHardy @ 2007-02-01 6:08 UTC (permalink / raw)
To: Simon Lodal; +Cc: netdev, lartc
Simon Lodal wrote:
> This patch changes HTB's class storage from hash+lists to a two-level linear
> array, so it can do constant time (O(1)) class lookup by classid. It improves
> scalability for large number of classes.
>
> Without the patch, ~14k htb classes can starve a Xeon-3.2 at only 15kpps,
> using most of it's cycles traversing lists in htb_find(). The patch
> eliminates this problem, and has a measurable impact even with a few hundred
> classes.
>
> Previously, scalability could be improved by increasing HTB_HSIZE, modify the
> hash function, and recompile, but this patch works for everyone without
> recompile and scales better too.
I agree that the current fixed sized hashes (additionally quite
small by default) are a big problem with many classes, for all of
HTB/HFSC/CBQ. But I think your approach is a bit wasteful, with
unfortunately chosen classids 128 classes are enough to reach the
maximum memory usage of ~512kb (with 4k pages and 8 byte pointers).
I have a patch for HFSC which introduces dynamic resizing of the
class hash. I have planned to generalize it (similar to tcf_hashinfo)
and convert HTB and CBQ as well, which as a nice side effect will
allow to get rid of some duplicated code, like hash walking.
If you give me a few days I'll try to finish and post it.
_______________________________________________
LARTC mailing list
LARTC@mailman.ds9a.nl
http://mailman.ds9a.nl/cgi-bin/mailman/listinfo/lartc
^ permalink raw reply [flat|nested] 9+ messages in thread
* [LARTC] Re: [PATCH] HTB O(1) class lookup
2007-02-01 6:08 ` [LARTC] " Patrick McHardy
@ 2007-02-01 7:08 ` Simon Lodal
[not found] ` <p73r6tanmhk.fsf@bingen.suse.de>
0 siblings, 1 reply; 9+ messages in thread
From: Simon Lodal @ 2007-02-01 7:08 UTC (permalink / raw)
To: Patrick McHardy; +Cc: netdev, lartc
On Thursday 01 February 2007 07:08, Patrick McHardy wrote:
> Simon Lodal wrote:
> > This patch changes HTB's class storage from hash+lists to a two-level
> > linear array, so it can do constant time (O(1)) class lookup by classid.
> > It improves scalability for large number of classes.
> >
> > Without the patch, ~14k htb classes can starve a Xeon-3.2 at only 15kpps,
> > using most of it's cycles traversing lists in htb_find(). The patch
> > eliminates this problem, and has a measurable impact even with a few
> > hundred classes.
> >
> > Previously, scalability could be improved by increasing HTB_HSIZE, modify
> > the hash function, and recompile, but this patch works for everyone
> > without recompile and scales better too.
>
> I agree that the current fixed sized hashes (additionally quite
> small by default) are a big problem with many classes, for all of
> HTB/HFSC/CBQ. But I think your approach is a bit wasteful, with
> unfortunately chosen classids 128 classes are enough to reach the
> maximum memory usage of ~512kb (with 4k pages and 8 byte pointers).
I think it is a non-issue since it does not happen in practice.
Generally there are two ways to assign classids:
1) Manually, used when you have few classes. People usually use 100, 101, 102,
200, 201 etc (probably unaware that they are hex). With 4k pages and 32bit
pointers, everything below classid 400 is within the first page, which covers
most "few classes" examples you can find lying around.
2) Script generated, in practice this is required if you have many classes.
The classid's will then usually be forthrunning, at least in large chunks,
which means minimal memory waste, and an optimal case for plain linear
lookup; hashing them can only be wasteful.
> I have a patch for HFSC which introduces dynamic resizing of the
> class hash. I have planned to generalize it (similar to tcf_hashinfo)
> and convert HTB and CBQ as well, which as a nice side effect will
> allow to get rid of some duplicated code, like hash walking.
I have not been looking into HFSC and CBQ, was not aware that they have
similar issues.
> If you give me a few days I'll try to finish and post it.
Memory is generally not an issue, but CPU is, and you can not beat the CPU
efficiency of plain array lookup (always faster, and constant time).
If anything, I would find it more relevant to use array lookup with dynamic
adjustment of the array size (HTB_CLS_ARRAY_SIZE in my patch); start out
small to waste less memory, increase up to PAGE_SIZE as needed.
But then, it is probably too much effort for the gain (a few kb's in machines
that should have plenty of RAM anyway), and requires more code => more
complexity, bugs, maintenance.
Regards
Simon
_______________________________________________
LARTC mailing list
LARTC@mailman.ds9a.nl
http://mailman.ds9a.nl/cgi-bin/mailman/listinfo/lartc
^ permalink raw reply [flat|nested] 9+ messages in thread
* Re: [LARTC] [PATCH] HTB O(1) class lookup
2007-02-01 5:18 [LARTC] [PATCH] HTB O(1) class lookup Simon Lodal
2007-02-01 6:08 ` [LARTC] " Patrick McHardy
@ 2007-02-01 22:44 ` Konrad Cempura
1 sibling, 0 replies; 9+ messages in thread
From: Konrad Cempura @ 2007-02-01 22:44 UTC (permalink / raw)
To: lartc
Simon Lodal napisa³(a):
> The patch is for 2.6.20-rc6, I have older ones for 2.6.18 and 2.6.19 if anyone
> is interested.
It's working also on 2.6.20-rc7.
I'm testing it and I'm impressed. Good work :)
_______________________________________________
LARTC mailing list
LARTC@mailman.ds9a.nl
http://mailman.ds9a.nl/cgi-bin/mailman/listinfo/lartc
^ permalink raw reply [flat|nested] 9+ messages in thread
end of thread, other threads:[~2007-02-08 7:36 UTC | newest]
Thread overview: 9+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2007-02-01 5:18 [LARTC] [PATCH] HTB O(1) class lookup Simon Lodal
2007-02-01 6:08 ` [LARTC] " Patrick McHardy
2007-02-01 7:08 ` Simon Lodal
[not found] ` <p73r6tanmhk.fsf@bingen.suse.de>
2007-02-05 10:16 ` Jarek Poplawski
2007-02-05 17:14 ` Simon Lodal
2007-02-06 8:08 ` Jarek Poplawski
2007-02-08 7:36 ` Jarek Poplawski
2007-02-05 18:21 ` Simon Lodal
2007-02-01 22:44 ` [LARTC] " Konrad Cempura
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox