* [NET]: gen_estimator deadlock fix
From: Ranko Zivojnovic @ 2007-07-11 13:41 UTC
To: netdev; +Cc: Andrew Morton, Patrick McHardy, Jarek Poplawski
Fixes ABBA deadlock noted by Patrick McHardy <kaber@trash.net>:
> There is at least one ABBA deadlock, est_timer() does:
> read_lock(&est_lock)
> spin_lock(e->stats_lock) (which is dev->queue_lock)
>
> and qdisc_destroy calls htb_destroy under dev->queue_lock, which
> calls htb_destroy_class, then gen_kill_estimator and this
> write_locks est_lock.
est_lock is removed entirely and an RCU-protected list is used instead.
Both gen_kill_estimator and gen_new_estimator are already called under
rtnl_mutex.
Signed-off-by: Ranko Zivojnovic <ranko@spidernet.net>
---
net/core/gen_estimator.c | 73 ++++++++++++++++++++++++----------------------
1 files changed, 38 insertions(+), 35 deletions(-)
diff --git a/net/core/gen_estimator.c b/net/core/gen_estimator.c
index cc84d8d..444b2bd 100644
--- a/net/core/gen_estimator.c
+++ b/net/core/gen_estimator.c
@@ -79,7 +79,7 @@
struct gen_estimator
{
- struct gen_estimator *next;
+ struct list_head list;
struct gnet_stats_basic *bstats;
struct gnet_stats_rate_est *rate_est;
spinlock_t *stats_lock;
@@ -89,26 +89,24 @@ struct gen_estimator
u32 last_packets;
u32 avpps;
u32 avbps;
+ struct rcu_head e_rcu;
};
struct gen_estimator_head
{
struct timer_list timer;
- struct gen_estimator *list;
+ struct list_head list;
};
static struct gen_estimator_head elist[EST_MAX_INTERVAL+1];
-/* Estimator array lock */
-static DEFINE_RWLOCK(est_lock);
-
static void est_timer(unsigned long arg)
{
int idx = (int)arg;
struct gen_estimator *e;
- read_lock(&est_lock);
- for (e = elist[idx].list; e; e = e->next) {
+ rcu_read_lock();
+ list_for_each_entry_rcu(e, &elist[idx].list, list) {
u64 nbytes;
u32 npackets;
u32 rate;
@@ -128,9 +126,9 @@ static void est_timer(unsigned long arg)
spin_unlock(e->stats_lock);
}
- if (elist[idx].list != NULL)
+ if (!list_empty(&elist[idx].list))
mod_timer(&elist[idx].timer, jiffies + ((HZ<<idx)/4));
- read_unlock(&est_lock);
+ rcu_read_unlock();
}
/**
@@ -147,12 +145,15 @@ static void est_timer(unsigned long arg)
* &rate_est with the statistics lock grabed during this period.
*
* Returns 0 on success or a negative error code.
+ *
+ * NOTE: Called under rtnl_mutex
*/
int gen_new_estimator(struct gnet_stats_basic *bstats,
struct gnet_stats_rate_est *rate_est, spinlock_t *stats_lock, struct rtattr *opt)
{
struct gen_estimator *est;
struct gnet_estimator *parm = RTA_DATA(opt);
+ int idx;
if (RTA_PAYLOAD(opt) < sizeof(*parm))
return -EINVAL;
@@ -164,7 +165,7 @@ int gen_new_estimator(struct gnet_stats_basic *bstats,
if (est == NULL)
return -ENOBUFS;
- est->interval = parm->interval + 2;
+ est->interval = idx = parm->interval + 2;
est->bstats = bstats;
est->rate_est = rate_est;
est->stats_lock = stats_lock;
@@ -174,20 +175,25 @@ int gen_new_estimator(struct gnet_stats_basic *bstats,
est->last_packets = bstats->packets;
est->avpps = rate_est->pps<<10;
- est->next = elist[est->interval].list;
- if (est->next == NULL) {
- init_timer(&elist[est->interval].timer);
- elist[est->interval].timer.data = est->interval;
- elist[est->interval].timer.expires = jiffies + ((HZ<<est->interval)/4);
- elist[est->interval].timer.function = est_timer;
- add_timer(&elist[est->interval].timer);
+ if (!elist[idx].timer.function) {
+ INIT_LIST_HEAD(&elist[idx].list);
+ setup_timer(&elist[idx].timer, est_timer, est->interval);
}
- write_lock_bh(&est_lock);
- elist[est->interval].list = est;
- write_unlock_bh(&est_lock);
+
+ if (list_empty(&elist[idx].list))
+ mod_timer(&elist[idx].timer, jiffies + ((HZ<<idx)/4));
+
+ list_add_rcu(&est->list, &elist[idx].list);
return 0;
}
+static void __gen_kill_estimator(struct rcu_head *head)
+{
+ struct gen_estimator *e = container_of(head,
+ struct gen_estimator, e_rcu);
+ kfree(e);
+}
+
/**
* gen_kill_estimator - remove a rate estimator
* @bstats: basic statistics
@@ -195,31 +201,28 @@ int gen_new_estimator(struct gnet_stats_basic *bstats,
*
* Removes the rate estimator specified by &bstats and &rate_est
* and deletes the timer.
+ *
+ * NOTE: Called under rtnl_mutex
*/
void gen_kill_estimator(struct gnet_stats_basic *bstats,
struct gnet_stats_rate_est *rate_est)
{
int idx;
- struct gen_estimator *est, **pest;
+ struct gen_estimator *e, *n;
for (idx=0; idx <= EST_MAX_INTERVAL; idx++) {
- int killed = 0;
- pest = &elist[idx].list;
- while ((est=*pest) != NULL) {
- if (est->rate_est != rate_est || est->bstats != bstats) {
- pest = &est->next;
- continue;
- }
- write_lock_bh(&est_lock);
- *pest = est->next;
- write_unlock_bh(&est_lock);
+ /* Skip non initialized indexes */
+ if (!elist[idx].timer.function)
+ continue;
+
+ list_for_each_entry_safe(e, n, &elist[idx].list, list) {
+ if (e->rate_est != rate_est || e->bstats != bstats)
+ continue;
- kfree(est);
- killed++;
+ list_del_rcu(&e->list);
+ call_rcu(&e->e_rcu, __gen_kill_estimator);
}
- if (killed && elist[idx].list == NULL)
- del_timer(&elist[idx].timer);
}
}
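The core of the change is the standard RCU list idiom: readers traverse under rcu_read_lock(), and removal defers the kfree() past a grace period. A minimal self-contained sketch of that idiom (illustrative names, not the patch's exact code):

#include <linux/kernel.h>
#include <linux/list.h>
#include <linux/rcupdate.h>
#include <linux/slab.h>

struct est {				/* stand-in for struct gen_estimator */
	struct list_head list;
	struct rcu_head rcu;
};

/* Reader side (est_timer): may run concurrently with removal,
 * so no est_lock is needed any more. */
static void est_walk(struct list_head *head)
{
	struct est *e;

	rcu_read_lock();
	list_for_each_entry_rcu(e, head, list) {
		/* e itself stays valid until rcu_read_unlock() */
	}
	rcu_read_unlock();
}

static void est_free_rcu(struct rcu_head *head)
{
	kfree(container_of(head, struct est, rcu));
}

/* Updater side (serialized by rtnl_mutex): unlink first, then free
 * only after all current readers have finished. */
static void est_remove(struct est *e)
{
	list_del_rcu(&e->list);
	call_rcu(&e->rcu, est_free_rcu);
}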
* Re: [NET]: gen_estimator deadlock fix
From: Jarek Poplawski @ 2007-07-12 7:37 UTC
To: Ranko Zivojnovic; +Cc: netdev, Andrew Morton, Patrick McHardy
On Wed, Jul 11, 2007 at 04:41:37PM +0300, Ranko Zivojnovic wrote:
> Fixes ABBA deadlock noted by Patrick McHardy <kaber@trash.net>:
> > There is at least one ABBA deadlock, est_timer() does:
> > read_lock(&est_lock)
> > spin_lock(e->stats_lock) (which is dev->queue_lock)
> >
> > and qdisc_destroy calls htb_destroy under dev->queue_lock, which
> > calls htb_destroy_class, then gen_kill_estimator and this
> > write_locks est_lock.
>
> est_lock is removed entirely and an RCU-protected list is used instead.
> Both gen_kill_estimator and gen_new_estimator are already called under
> rtnl_mutex.
>
> Signed-off-by: Ranko Zivojnovic <ranko@spidernet.net>
Maybe it's only my issue, but it seems there are no tabs: all spaces...
...plus some old doubts:
> ---
> net/core/gen_estimator.c | 73 ++++++++++++++++++++++++----------------------
> 1 files changed, 38 insertions(+), 35 deletions(-)
>
> diff --git a/net/core/gen_estimator.c b/net/core/gen_estimator.c
> index cc84d8d..444b2bd 100644
> --- a/net/core/gen_estimator.c
> +++ b/net/core/gen_estimator.c
...
> @@ -174,20 +175,25 @@ int gen_new_estimator(struct gnet_stats_basic *bstats,
> est->last_packets = bstats->packets;
> est->avpps = rate_est->pps<<10;
>
> - est->next = elist[est->interval].list;
> - if (est->next == NULL) {
> - init_timer(&elist[est->interval].timer);
> - elist[est->interval].timer.data = est->interval;
> - elist[est->interval].timer.expires = jiffies + ((HZ<<est->interval)/4);
> - elist[est->interval].timer.function = est_timer;
> - add_timer(&elist[est->interval].timer);
> + if (!elist[idx].timer.function) {
> + INIT_LIST_HEAD(&elist[idx].list);
> + setup_timer(&elist[idx].timer, est_timer, est->interval);
- setup_timer(&elist[idx].timer, est_timer, est->interval);
+ setup_timer(&elist[idx].timer, est_timer, idx);
...
> /**
> * gen_kill_estimator - remove a rate estimator
> * @bstats: basic statistics
> @@ -195,31 +201,28 @@ int gen_new_estimator(struct gnet_stats_basic *bstats,
> *
> * Removes the rate estimator specified by &bstats and &rate_est
> * and deletes the timer.
> + *
> + * NOTE: Called under rtnl_mutex
> */
> void gen_kill_estimator(struct gnet_stats_basic *bstats,
> struct gnet_stats_rate_est *rate_est)
> {
...
> + list_for_each_entry_safe(e, n, &elist[idx].list, list) {
> + if (e->rate_est != rate_est || e->bstats != bstats)
> + continue;
>
> - kfree(est);
> - killed++;
> + list_del_rcu(&e->list);
> + call_rcu(&e->e_rcu, __gen_kill_estimator);
> }
> - if (killed && elist[idx].list == NULL)
> - del_timer(&elist[idx].timer);
> }
> }
I made a bit of a mess last time, so maybe it was forgotten, but I
still think this kind of race is possible:
- gen_kill_estimator is called during qdisc_destroy under
dev->queue_lock,
- est_timer is running and waiting on this lock just on the
list entry of the destroyed class,
- gen_kill_estimator kills the entry and returns,
- in xxx_destroy_class kfree(cl) is done etc.,
- est_timer gets the lock and does nbytes = e->bstats->bytes or
e->rate_est->bps = ... with freed memory.
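In code terms the window looks roughly like this (a sketch abridged from est_timer(); est_update is an illustrative name; the point is that the list locking protects 'e' itself, while bstats/rate_est point into the class structure):

#include <linux/types.h>
#include <linux/spinlock.h>
#include <linux/gen_stats.h>

struct gen_estimator {				/* abridged */
	struct gnet_stats_basic *bstats;	/* points into the class */
	struct gnet_stats_rate_est *rate_est;	/* points into the class */
	spinlock_t *stats_lock;			/* == dev->queue_lock */
};

static void est_update(struct gen_estimator *e)
{
	u64 nbytes;

	spin_lock(e->stats_lock);	/* may spin while the class is freed */
	nbytes = e->bstats->bytes;	/* read of possibly freed memory */
	e->rate_est->bps = nbytes >> 5;	/* write to possibly freed memory */
	spin_unlock(e->stats_lock);
}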
Regards,
Jarek P.
* Re: [NET]: gen_estimator deadlock fix
From: Ranko Zivojnovic @ 2007-07-12 9:18 UTC
To: Jarek Poplawski; +Cc: netdev, Andrew Morton, Patrick McHardy
On Thu, 2007-07-12 at 09:37 +0200, Jarek Poplawski wrote:
> On Wed, Jul 11, 2007 at 04:41:37PM +0300, Ranko Zivojnovic wrote:
> > Signed-off-by: Ranko Zivojnovic <ranko@spidernet.net>
>
> Maybe it's only my issue, but it seems there are no tabs: all spaces...
Nope - you are right - just noticed my mailer converts tabs into spaces
- sorry about this...
>
> ...plus some old doubts:
>
> - setup_timer(&elist[idx].timer, est_timer, est->interval);
> + setup_timer(&elist[idx].timer, est_timer, idx);
>
I left this as est->interval because setup_timer expects an unsigned long.
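(For reference, the timer API in question - setup_timer()'s data argument is an unsigned long; roughly, from <linux/timer.h> of that era:)

static inline void setup_timer(struct timer_list *timer,
			       void (*function)(unsigned long),
			       unsigned long data)
{
	timer->function = function;
	timer->data = data;
	init_timer(timer);
}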
> ...
> > /**
> > * gen_kill_estimator - remove a rate estimator
> > * @bstats: basic statistics
> > @@ -195,31 +201,28 @@ int gen_new_estimator(struct gnet_stats_basic *bstats,
> > *
> > * Removes the rate estimator specified by &bstats and &rate_est
> > * and deletes the timer.
> > + *
> > + * NOTE: Called under rtnl_mutex
> > */
> > void gen_kill_estimator(struct gnet_stats_basic *bstats,
> > struct gnet_stats_rate_est *rate_est)
> > {
> ...
> > + list_for_each_entry_safe(e, n, &elist[idx].list, list) {
> > + if (e->rate_est != rate_est || e->bstats != bstats)
> > + continue;
> >
> > - kfree(est);
> > - killed++;
> > + list_del_rcu(&e->list);
> > + call_rcu(&e->e_rcu, __gen_kill_estimator);
> > }
> > - if (killed && elist[idx].list == NULL)
> > - del_timer(&elist[idx].timer);
> > }
> > }
>
> I made a bit of a mess last time, so maybe it was forgotten, but I
> still think this kind of race is possible:
>
> - gen_kill_estimator is called during qdisc_destroy under
> dev->queue_lock,
> - est_timer is running and waiting on this lock just on the
> list entry of the destroyed class,
> - gen_kill_estimator kills the entry and returns,
> - in xxx_destroy_class kfree(cl) is done etc.,
> - est_timer gets the lock and does nbytes = e->bstats->bytes or
> e->rate_est->bps = ... with freed memory.
Unfortunately I cannot say for sure that _currently_ anything really bad
can happen here... someone with better knowledge will have to confirm
this - Patrick?
Here's why - the only updates that are being done in est_timer are
to the structure pointed to by 'e'. Yes - it is possible the values it will
refer to could be random (freed memory), _however_ the 'e' structure is
still valid as it is _not_ kfree()'d directly from gen_kill_estimator()
- i.e. it will not be kfree()'d until rcu_read_unlock() is called -
which is outside the loop.
I don't mind fixing all the classful qdiscs to call_rcu() to release
their class structures for consistency purposes ... in fact ... that is
exactly what I will do in order to avoid any potential future mishaps.
One may actually decide in the future to add a callback to a user
defined function to update some qdisc/class specific rates and given
this inconsistency in handling qdiscs vs classes - it could have a nasty
backfire.
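As an illustration of what that would mean per qdisc (hypothetical names - my_class, my_destroy_class - and each classful qdisc carries more state than shown):

#include <linux/kernel.h>
#include <linux/rcupdate.h>
#include <linux/slab.h>
#include <linux/gen_stats.h>
#include <net/gen_stats.h>

/* Sketch: defer freeing the class past the RCU grace period instead
 * of kfree()ing it directly, so a timer still inside its RCU read
 * section can never dereference freed bstats/rate_est. */
struct my_class {
	struct gnet_stats_basic bstats;
	struct gnet_stats_rate_est rate_est;
	struct rcu_head rcu;
	/* ... remainder of the class ... */
};

static void my_class_free_rcu(struct rcu_head *head)
{
	kfree(container_of(head, struct my_class, rcu));
}

static void my_destroy_class(struct my_class *cl)
{
	gen_kill_estimator(&cl->bstats, &cl->rate_est);
	call_rcu(&cl->rcu, my_class_free_rcu);	/* instead of kfree(cl) */
}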
Will have a new patch soon.
Regards,
Ranko
* Re: [NET]: gen_estimator deadlock fix
From: Ranko Zivojnovic @ 2007-07-12 9:40 UTC
To: Jarek Poplawski; +Cc: netdev, Andrew Morton, Patrick McHardy
On Thu, 2007-07-12 at 12:18 +0300, Ranko Zivojnovic wrote:
> >
> > I made a bit of a mess last time, so maybe it was forgotten, but I
> > still think this kind of race is possible:
> >
> > - gen_kill_estimator is called during qdisc_destroy under
> > dev->queue_lock,
> > - est_timer is running and waiting on this lock just on the
> > list entry of the destroyed class,
> > - gen_kill_estimator kills the entry and returns,
> > - in xxx_destroy_class kfree(cl) is done etc.,
> > - est_timer gets the lock and does nbytes = e->bstats->bytes or
> > e->rate_est->bps = ... with freed memory.
>
> Unfortunately I cannot say for sure that _currently_ anything really bad
> can happen here... someone with better knowledge will have to confirm
> this - Patrick?
>
Oops - just saw it - and you even mention it up there:
e->rate_est->bps = ...
^^^^
>
> Will have a new patch soon.
>
R.
* Re: [NET]: gen_estimator deadlock fix
From: Jarek Poplawski @ 2007-07-12 10:46 UTC
To: Ranko Zivojnovic; +Cc: netdev, Andrew Morton, Patrick McHardy
On Thu, Jul 12, 2007 at 12:18:23PM +0300, Ranko Zivojnovic wrote:
> On Thu, 2007-07-12 at 09:37 +0200, Jarek Poplawski wrote:
> > On Wed, Jul 11, 2007 at 04:41:37PM +0300, Ranko Zivojnovic wrote:
> > > Signed-off-by: Ranko Zivojnovic <ranko@spidernet.net>
> >
> > Maybe it's only my issue, but it seems there are no tabs: all spaces...
>
> Nope - you are right - just noticed my mailer converts tabs into spaces
> - sorry about this...
>
> >
> > ...plus some old doubts:
> >
>
> > - setup_timer(&elist[idx].timer, est_timer, est->interval);
> > + setup_timer(&elist[idx].timer, est_timer, idx);
> >
>
> I left this because setup_timer expects unsigned long.
Are there any warnings? It'll look strange without such a comment.
I didn't check this, but maybe idx should be unsigned too?
>
> > ...
> > > /**
> > > * gen_kill_estimator - remove a rate estimator
> > > * @bstats: basic statistics
> > > @@ -195,31 +201,28 @@ int gen_new_estimator(struct gnet_stats_basic *bstats,
> > > *
> > > * Removes the rate estimator specified by &bstats and &rate_est
> > > * and deletes the timer.
> > > + *
> > > + * NOTE: Called under rtnl_mutex
> > > */
> > > void gen_kill_estimator(struct gnet_stats_basic *bstats,
> > > struct gnet_stats_rate_est *rate_est)
> > > {
> > ...
> > > + list_for_each_entry_safe(e, n, &elist[idx].list, list) {
> > > + if (e->rate_est != rate_est || e->bstats != bstats)
> > > + continue;
> > >
> > > - kfree(est);
> > > - killed++;
> > > + list_del_rcu(&e->list);
> > > + call_rcu(&e->e_rcu, __gen_kill_estimator);
> > > }
> > > - if (killed && elist[idx].list == NULL)
> > > - del_timer(&elist[idx].timer);
> > > }
> > > }
> >
> > I made a bit of a mess last time, so maybe it was forgotten, but I
> > still think this kind of race is possible:
> >
> > - gen_kill_estimator is called during qdisc_destroy under
> > dev->queue_lock,
> > - est_timer is running and waiting on this lock just on the
> > list entry of the destroyed class,
> > - gen_kill_estimator kills the entry and returns,
> > - in xxx_destroy_class kfree(cl) is done etc.,
> > - est_timer gets the lock and does nbytes = e->bstats->bytes or
> > e->rate_est->bps = ... with freed memory.
...
> I don't mind fixing all the classful qdiscs to call_rcu() to release
> their class structures for consistency purposes ... in fact ... that is
> exactly what I will do in order to avoid any potential future mishaps.
> One may actually decide in the future to add a callback to a user
> defined function to update some qdisc/class specific rates and given
> this inconsistency in handling qdiscs vs classes - it could have a nasty
> backfire.
I don't know if such broad changes are acceptable, or if it's even
required that these structs belong to a class struct.
IMHO, unless I'm missing something, they could be included in the
gen_estimator struct after some API change. BTW, maybe it would be
reasonable to return a pointer to such a gen_estimator from
gen_new_estimator; then lookups could be avoided in
gen_kill_estimator.
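Roughly, the suggested API could look like this (a sketch of the idea, not an actual interface):

/* Hypothetical: stats embedded in the estimator, and the handle
 * returned to the caller, so gen_kill_estimator() needs no lookup. */
struct gen_estimator {
	struct list_head list;
	struct gnet_stats_basic bstats;		/* embedded */
	struct gnet_stats_rate_est rate_est;	/* embedded */
	/* ... */
};

struct gen_estimator *gen_new_estimator(struct rtattr *opt);
void gen_kill_estimator(struct gen_estimator *e);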
Alas, there is probably more (of course very improbable): there
is no dev_hold for gen_estimator now, so I hope it'll always manage
to unlock in time - before dev is freed.
Jarek P.
* Re: [NET]: gen_estimator deadlock fix
From: Ranko Zivojnovic @ 2007-07-12 11:47 UTC
To: Jarek Poplawski; +Cc: netdev, Andrew Morton, Patrick McHardy
On Thu, 2007-07-12 at 12:46 +0200, Jarek Poplawski wrote:
> On Thu, Jul 12, 2007 at 12:18:23PM +0300, Ranko Zivojnovic wrote:
> > On Thu, 2007-07-12 at 09:37 +0200, Jarek Poplawski wrote:
> > > - setup_timer(&elist[idx].timer, est_timer, est->interval);
> > > + setup_timer(&elist[idx].timer, est_timer, idx);
> >
> > I left this because setup_timer expects unsigned long.
>
> Are there any warnings? It'll look strange without such comment.
> I didn't check this, but maybe idx should be unsigned too?
>
I'll make idx unsigned.
> >
> > > - gen_kill_estimator is called during qdisc_destroy under
> > > dev->queue_lock,
> > > - est_timer is running and waiting on this lock just on the
> > > list entry of the destroyed class,
> > > - gen_kill_estimator kills the entry and returns,
> > > - in xxx_destroy_class kfree(cl) is done etc.,
> > > - est_timer gets the lock and does nbytes = e->bstats->bytes or
> > > e->rate_est->bps = ... with freed memory.
> ...
> > I don't mind fixing all the classful qdiscs to call_rcu() to release
> > their class structures for consistency purposes ... in fact ... that is
> > exactly what I will do in order to avoid any potential future mishaps.
> > One may actually decide in the future to add a callback to a user
> > defined function to update some qdisc/class specific rates and given
> > this inconsistency in handling qdiscs vs classes - it could have a nasty
> > backfire.
>
> I don't know if such broad changes are acceptable, or if it's even
> required that these structs belong to a class struct.
>
> IMHO, unless I'm missing something, they could be included in the
> gen_estimator struct after some API change. BTW, maybe it would be
> reasonable to return a pointer to such a gen_estimator from
> gen_new_estimator; then lookups could be avoided in
> gen_kill_estimator.
>
> Alas, there is probably more (of course very improbable): there
> is no dev_hold for gen_estimator now, so I hope it'll always manage
> to unlock in time - before dev is freed.
>
I agree - it does look like the most sensible thing to do - have
gnet_stats_basic and gnet_stats_rate_est allocated within the
gen_estimator struct rather than pointers looking here and there - and
provide api to maintain those stats - it simplifies the picture.
Also - the stats_lock in this case could be local to gen_estimator
struct, thus making the implementation completely "dev agnostic" - and
will not break on dev removal.
All this however will require a rather intrusive "surgical procedure" to
fix all the references.
I don't mind implementing and testing it - given there's consensus from
maintainers on the "correctness".
Waiting for feedback.
R.
* Re: [NET]: gen_estimator deadlock fix
From: Patrick McHardy @ 2007-07-12 12:07 UTC
To: Ranko Zivojnovic; +Cc: Jarek Poplawski, netdev
[Removed Andrew from CC]
Ranko Zivojnovic wrote:
> I agree - it does look like the most sensible thing to do - have
> gnet_stats_basic and gnet_stats_rate_est allocated within the
> gen_estimator struct rather than pointers looking here and there - and
> provide api to maintain those stats - it simplifies the picture.
The API is not very pretty, some improvement there would be welcome.
> Also - the stats_lock in this case could be local to gen_estimator
> struct, thus making the implementation completely "dev agnostic" - and
> will not break on dev removal.
The queue lock is used since that's what protects the qdisc counters.
* Re: [NET]: gen_estimator deadlock fix
From: Ranko Zivojnovic @ 2007-07-12 17:48 UTC
To: Patrick McHardy; +Cc: Jarek Poplawski, netdev
On Thu, 2007-07-12 at 14:07 +0200, Patrick McHardy wrote:
> [Removed Andrew from CC]
>
> Ranko Zivojnovic wrote:
> > I agree - it does look like the most sensible thing to do - have
> > gnet_stats_basic and gnet_stats_rate_est allocated within the
> > gen_estimator struct rather than pointers looking here and there - and
> > provide api to maintain those stats - it simplifies the picture.
>
>
> The API is not very pretty, some improvement there would be welcome.
>
> > Also - the stats_lock in this case could be local to gen_estimator
> > struct, thus making the implementation completely "dev agnostic" - and
> > will not break on dev removal.
>
>
> The queue lock is used since that's what protects the qdisc counters.
Ok - here's the patch for review - it compiles clean ... and that's as
much as it has been tested - I'll try to give it a run later today or
definitely tomorrow.
'bstats' and 'rate_est' are now embedded in gen_estimator, as is
the spin_lock that protects them.
Introduced two new inline functions: gen_bstat_add() and gen_bstat_skb()
for maintaining the bstats within the gen_estimator structure.
Note that gen_bstat_skb also includes the TSO check and effectively
propagates TSO support to all schedulers.
R.
include/net/act_api.h | 6 +-
include/net/gen_stats.h | 42 ++++++++++--
include/net/sch_generic.h | 7 +-
net/core/gen_estimator.c | 159 +++++++++++++++++++--------------------------
net/sched/act_api.c | 23 ++++--
net/sched/act_police.c | 34 +++++-----
net/sched/sch_api.c | 16 ++--
net/sched/sch_atm.c | 3 +-
net/sched/sch_cbq.c | 39 ++++++------
net/sched/sch_dsmark.c | 3 +-
net/sched/sch_generic.c | 2 +-
net/sched/sch_hfsc.c | 34 +++++-----
net/sched/sch_htb.c | 36 +++++------
net/sched/sch_ingress.c | 9 +--
net/sched/sch_netem.c | 6 +-
net/sched/sch_prio.c | 5 +-
net/sched/sch_red.c | 3 +-
net/sched/sch_sfq.c | 3 +-
net/sched/sch_tbf.c | 3 +-
net/sched/sch_teql.c | 3 +-
20 files changed, 214 insertions(+), 222 deletions(-)
diff --git a/include/net/act_api.h b/include/net/act_api.h
index 2f0273f..f5e0f05 100644
--- a/include/net/act_api.h
+++ b/include/net/act_api.h
@@ -16,9 +16,9 @@ struct tcf_common {
u32 tcfc_capab;
int tcfc_action;
struct tcf_t tcfc_tm;
+ struct gen_estimator *tcfc_ge;
struct gnet_stats_basic tcfc_bstats;
struct gnet_stats_queue tcfc_qstats;
- struct gnet_stats_rate_est tcfc_rate_est;
spinlock_t tcfc_lock;
};
#define tcf_next common.tcfc_next
@@ -29,8 +29,10 @@ struct tcf_common {
#define tcf_action common.tcfc_action
#define tcf_tm common.tcfc_tm
#define tcf_bstats common.tcfc_bstats
+#define tcf_ge common.tcfc_ge
+#define tcf_ge_bstats common.tcfc_ge->bstats
+#define tcf_ge_rate_est common.tcfc_ge->rate_est
#define tcf_qstats common.tcfc_qstats
-#define tcf_rate_est common.tcfc_rate_est
#define tcf_lock common.tcfc_lock
struct tcf_police {
diff --git a/include/net/gen_stats.h b/include/net/gen_stats.h
index 0b95cf0..1103ec8 100644
--- a/include/net/gen_stats.h
+++ b/include/net/gen_stats.h
@@ -20,6 +20,21 @@ struct gnet_dump
struct tc_stats tc_stats;
};
+struct gen_estimator
+{
+ struct list_head list;
+ struct gnet_stats_basic bstats;
+ struct gnet_stats_rate_est rate_est;
+ spinlock_t stats_lock; /* Protects the stats updates */
+ unsigned interval;
+ int ewma_log;
+ u64 last_bytes;
+ u32 last_packets;
+ u32 avpps;
+ u32 avbps;
+ struct rcu_head e_rcu;
+};
+
extern int gnet_stats_start_copy(struct sk_buff *skb, int type,
spinlock_t *lock, struct gnet_dump *d);
@@ -37,13 +52,24 @@ extern int gnet_stats_copy_app(struct gnet_dump *d, void *st, int len);
extern int gnet_stats_finish_copy(struct gnet_dump *d);
-extern int gen_new_estimator(struct gnet_stats_basic *bstats,
- struct gnet_stats_rate_est *rate_est,
- spinlock_t *stats_lock, struct rtattr *opt);
-extern void gen_kill_estimator(struct gnet_stats_basic *bstats,
- struct gnet_stats_rate_est *rate_est);
-extern int gen_replace_estimator(struct gnet_stats_basic *bstats,
- struct gnet_stats_rate_est *rate_est,
- spinlock_t *stats_lock, struct rtattr *opt);
+extern int gen_new_estimator(struct rtattr *opt, struct gen_estimator **ep);
+extern void gen_kill_estimator(struct gen_estimator *ep);
+extern int gen_replace_estimator(struct rtattr *opt, struct gen_estimator **ep);
+
+static inline void gen_bstat_add(struct gen_estimator *e,
+ int bytes,
+ int packets)
+{
+ spin_lock(&e->stats_lock);
+ e->bstats.bytes += bytes;
+ e->bstats.packets += packets;
+ spin_unlock(&e->stats_lock);
+}
+
+static inline void gen_bstat_skb(struct gen_estimator *e,
+ struct sk_buff *skb)
+{
+ gen_bstat_add(e, skb->len, skb_is_gso(skb)?skb_shinfo(skb)->gso_segs:1);
+}
#endif
diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h
index 1b8e351..ff13671 100644
--- a/include/net/sch_generic.h
+++ b/include/net/sch_generic.h
@@ -40,9 +40,9 @@ struct Qdisc
struct net_device *dev;
struct list_head list;
- struct gnet_stats_basic bstats;
+ struct gen_estimator *ge;
+
struct gnet_stats_queue qstats;
- struct gnet_stats_rate_est rate_est;
spinlock_t *stats_lock;
struct rcu_head q_rcu;
int (*reshape_fail)(struct sk_buff *skb,
@@ -185,8 +185,7 @@ static inline int __qdisc_enqueue_tail(struct sk_buff *skb, struct Qdisc *sch,
{
__skb_queue_tail(list, skb);
sch->qstats.backlog += skb->len;
- sch->bstats.bytes += skb->len;
- sch->bstats.packets++;
+ gen_bstat_skb(sch->ge, skb);
return NET_XMIT_SUCCESS;
}
diff --git a/net/core/gen_estimator.c b/net/core/gen_estimator.c
index cc84d8d..502deb3 100644
--- a/net/core/gen_estimator.c
+++ b/net/core/gen_estimator.c
@@ -77,82 +77,65 @@
#define EST_MAX_INTERVAL 5
-struct gen_estimator
-{
- struct gen_estimator *next;
- struct gnet_stats_basic *bstats;
- struct gnet_stats_rate_est *rate_est;
- spinlock_t *stats_lock;
- unsigned interval;
- int ewma_log;
- u64 last_bytes;
- u32 last_packets;
- u32 avpps;
- u32 avbps;
-};
-
struct gen_estimator_head
{
struct timer_list timer;
- struct gen_estimator *list;
+ struct list_head list;
};
static struct gen_estimator_head elist[EST_MAX_INTERVAL+1];
-/* Estimator array lock */
-static DEFINE_RWLOCK(est_lock);
-
static void est_timer(unsigned long arg)
{
int idx = (int)arg;
struct gen_estimator *e;
- read_lock(&est_lock);
- for (e = elist[idx].list; e; e = e->next) {
+ rcu_read_lock();
+ list_for_each_entry_rcu(e, &elist[idx].list, list) {
u64 nbytes;
u32 npackets;
u32 rate;
- spin_lock(e->stats_lock);
- nbytes = e->bstats->bytes;
- npackets = e->bstats->packets;
+ spin_lock(&e->stats_lock);
+ nbytes = e->bstats.bytes;
+ npackets = e->bstats.packets;
rate = (nbytes - e->last_bytes)<<(7 - idx);
e->last_bytes = nbytes;
e->avbps += ((long)rate - (long)e->avbps) >> e->ewma_log;
- e->rate_est->bps = (e->avbps+0xF)>>5;
+ e->rate_est.bps = (e->avbps+0xF)>>5;
rate = (npackets - e->last_packets)<<(12 - idx);
e->last_packets = npackets;
e->avpps += ((long)rate - (long)e->avpps) >> e->ewma_log;
- e->rate_est->pps = (e->avpps+0x1FF)>>10;
- spin_unlock(e->stats_lock);
+ e->rate_est.pps = (e->avpps+0x1FF)>>10;
+ spin_unlock(&e->stats_lock);
}
- if (elist[idx].list != NULL)
+ if (!list_empty(&elist[idx].list))
mod_timer(&elist[idx].timer, jiffies + ((HZ<<idx)/4));
- read_unlock(&est_lock);
+ rcu_read_unlock();
}
/**
* gen_new_estimator - create a new rate estimator
- * @bstats: basic statistics
- * @rate_est: rate estimator statistics
- * @stats_lock: statistics lock
* @opt: rate estimator configuration TLV
+ * @ep: new gen_estimator struct is returned through this pointer on success
*
- * Creates a new rate estimator with &bstats as source and &rate_est
- * as destination. A new timer with the interval specified in the
+ * Creates a new rate estimator with bstats as source and rate_est
+ * as destination. A timer with the interval specified in the
* configuration TLV is created. Upon each interval, the latest statistics
- * will be read from &bstats and the estimated rate will be stored in
- * &rate_est with the statistics lock grabed during this period.
+ * will be read from bstats and the estimated rate will be stored in
+ * rate_est with the statistics lock grabbed during this period.
*
* Returns 0 on success or a negative error code.
+ *
+ * NOTE: Called under rtnl_mutex
*/
-int gen_new_estimator(struct gnet_stats_basic *bstats,
- struct gnet_stats_rate_est *rate_est, spinlock_t *stats_lock, struct rtattr *opt)
+int gen_new_estimator(struct rtattr *opt, struct gen_estimator **ep)
{
struct gen_estimator *est;
struct gnet_estimator *parm = RTA_DATA(opt);
+ int idx;
if (RTA_PAYLOAD(opt) < sizeof(*parm))
return -EINVAL;
@@ -164,84 +147,74 @@ int gen_new_estimator(struct gnet_stats_basic *bstats,
if (est == NULL)
return -ENOBUFS;
- est->interval = parm->interval + 2;
- est->bstats = bstats;
- est->rate_est = rate_est;
- est->stats_lock = stats_lock;
+ spin_lock_init(&est->stats_lock);
+ est->interval = idx = parm->interval + 2;
est->ewma_log = parm->ewma_log;
- est->last_bytes = bstats->bytes;
- est->avbps = rate_est->bps<<5;
- est->last_packets = bstats->packets;
- est->avpps = rate_est->pps<<10;
-
- est->next = elist[est->interval].list;
- if (est->next == NULL) {
- init_timer(&elist[est->interval].timer);
- elist[est->interval].timer.data = est->interval;
- elist[est->interval].timer.expires = jiffies + ((HZ<<est->interval)/4);
- elist[est->interval].timer.function = est_timer;
- add_timer(&elist[est->interval].timer);
+
+ if (*ep != NULL) {
+ est->last_bytes = (*ep)->bstats.bytes;
+ est->avbps = (*ep)->rate_est.bps<<5;
+ est->last_packets = (*ep)->bstats.packets;
+ est->avpps = (*ep)->rate_est.pps<<10;
+ }
+
+ if (!elist[idx].timer.function) {
+ INIT_LIST_HEAD(&elist[idx].list);
+ setup_timer(&elist[idx].timer, est_timer, est->interval);
}
- write_lock_bh(&est_lock);
- elist[est->interval].list = est;
- write_unlock_bh(&est_lock);
+
+ if (list_empty(&elist[idx].list))
+ mod_timer(&elist[idx].timer, jiffies + ((HZ<<idx)/4));
+
+ list_add_rcu(&est->list, &elist[idx].list);
+
+ *ep = est;
+
return 0;
}
+static void __gen_kill_estimator(struct rcu_head *head)
+{
+ struct gen_estimator *e = container_of(head,
+ struct gen_estimator, e_rcu);
+ kfree(e);
+}
+
/**
* gen_kill_estimator - remove a rate estimator
- * @bstats: basic statistics
- * @rate_est: rate estimator statistics
+ * @ep: rate estimator to kill
+ *
+ * Removes the rate estimator specified
*
- * Removes the rate estimator specified by &bstats and &rate_est
- * and deletes the timer.
+ * NOTE: Called under rtnl_mutex
*/
-void gen_kill_estimator(struct gnet_stats_basic *bstats,
- struct gnet_stats_rate_est *rate_est)
+void gen_kill_estimator(struct gen_estimator *e)
{
- int idx;
- struct gen_estimator *est, **pest;
-
- for (idx=0; idx <= EST_MAX_INTERVAL; idx++) {
- int killed = 0;
- pest = &elist[idx].list;
- while ((est=*pest) != NULL) {
- if (est->rate_est != rate_est || est->bstats != bstats) {
- pest = &est->next;
- continue;
- }
-
- write_lock_bh(&est_lock);
- *pest = est->next;
- write_unlock_bh(&est_lock);
-
- kfree(est);
- killed++;
- }
- if (killed && elist[idx].list == NULL)
- del_timer(&elist[idx].timer);
- }
+ list_del_rcu(&e->list);
+ call_rcu(&e->e_rcu, __gen_kill_estimator);
}
/**
* gen_replace_estimator - replace rate estimator configruation
- * @bstats: basic statistics
- * @rate_est: rate estimator statistics
- * @stats_lock: statistics lock
* @opt: rate estimator configuration TLV
+ * @ep: new gen_estimator struct is returned through this pointer on success
*
* Replaces the configuration of a rate estimator by calling
* gen_kill_estimator() and gen_new_estimator().
*
* Returns 0 on success or a negative error code.
*/
-int
-gen_replace_estimator(struct gnet_stats_basic *bstats,
- struct gnet_stats_rate_est *rate_est, spinlock_t *stats_lock,
- struct rtattr *opt)
+int gen_replace_estimator(struct rtattr *opt, struct gen_estimator **ep)
{
- gen_kill_estimator(bstats, rate_est);
- return gen_new_estimator(bstats, rate_est, stats_lock, opt);
+ struct gen_estimator *old = *ep;
+ int err;
+
+ err = gen_new_estimator(opt, ep);
+
+ if (!err)
+ gen_kill_estimator(old);
+
+ return err;
}
diff --git a/net/sched/act_api.c b/net/sched/act_api.c
index feef366..bf94944 100644
--- a/net/sched/act_api.c
+++ b/net/sched/act_api.c
@@ -32,8 +32,7 @@ void tcf_hash_destroy(struct tcf_common *p, struct tcf_hashinfo *hinfo)
write_lock_bh(hinfo->lock);
*p1p = p->tcfc_next;
write_unlock_bh(hinfo->lock);
- gen_kill_estimator(&p->tcfc_bstats,
- &p->tcfc_rate_est);
+ gen_kill_estimator(p->tcfc_ge);
kfree(p);
return;
}
@@ -209,7 +208,13 @@ struct tcf_common *tcf_hash_check(u32 index, struct tc_action *a, int bind,
}
EXPORT_SYMBOL(tcf_hash_check);
-struct tcf_common *tcf_hash_create(u32 index, struct rtattr *est, struct tc_action *a, int size, int bind, u32 *idx_gen, struct tcf_hashinfo *hinfo)
+struct tcf_common *tcf_hash_create(u32 index,
+ struct rtattr *est,
+ struct tc_action *a,
+ int size,
+ int bind,
+ u32 *idx_gen,
+ struct tcf_hashinfo *hinfo)
{
struct tcf_common *p = kzalloc(size, GFP_KERNEL);
@@ -223,9 +228,11 @@ struct tcf_common *tcf_hash_create(u32 index, struct rtattr *est, struct tc_acti
p->tcfc_index = index ? index : tcf_hash_new_index(idx_gen, hinfo);
p->tcfc_tm.install = jiffies;
p->tcfc_tm.lastuse = jiffies;
- if (est)
- gen_new_estimator(&p->tcfc_bstats, &p->tcfc_rate_est,
- &p->tcfc_lock, est);
+ if (est) {
+ gen_new_estimator(est, &p->tcfc_ge);
+ /* XXX TO-DO: verify successful return from
+ gen_new_estimator()! */
+ }
a->priv = (void *) p;
return p;
}
@@ -598,8 +605,8 @@ int tcf_action_copy_stats(struct sk_buff *skb, struct tc_action *a,
if (a->ops->get_stats(skb, a) < 0)
goto errout;
- if (gnet_stats_copy_basic(&d, &h->tcf_bstats) < 0 ||
- gnet_stats_copy_rate_est(&d, &h->tcf_rate_est) < 0 ||
+ if (gnet_stats_copy_basic(&d, &h->tcf_ge_bstats) < 0 ||
+ gnet_stats_copy_rate_est(&d, &h->tcf_ge_rate_est) < 0 ||
gnet_stats_copy_queue(&d, &h->tcf_qstats) < 0)
goto errout;
diff --git a/net/sched/act_police.c b/net/sched/act_police.c
index d204038..2a3e1d5 100644
--- a/net/sched/act_police.c
+++ b/net/sched/act_police.c
@@ -108,8 +108,7 @@ void tcf_police_destroy(struct tcf_police *p)
write_lock_bh(&police_lock);
*p1p = p->tcf_next;
write_unlock_bh(&police_lock);
- gen_kill_estimator(&p->tcf_bstats,
- &p->tcf_rate_est);
+ gen_kill_estimator(p->tcf_ge);
if (p->tcfp_R_tab)
qdisc_put_rtab(p->tcfp_R_tab);
if (p->tcfp_P_tab)
@@ -217,10 +216,11 @@ override:
if (tb[TCA_POLICE_AVRATE-1])
police->tcfp_ewma_rate =
*(u32*)RTA_DATA(tb[TCA_POLICE_AVRATE-1]);
- if (est)
- gen_replace_estimator(&police->tcf_bstats,
- &police->tcf_rate_est,
- &police->tcf_lock, est);
+ if (est) {
+ gen_replace_estimator(est, &police->tcf_ge);
+ /* XXX TO-DO: verify successful return from
+ gen_replace_estimator()! */
+ }
spin_unlock_bh(&police->tcf_lock);
if (ret != ACT_P_CREATED)
@@ -263,11 +263,10 @@ static int tcf_act_police(struct sk_buff *skb, struct tc_action *a,
spin_lock(&police->tcf_lock);
- police->tcf_bstats.bytes += skb->len;
- police->tcf_bstats.packets++;
+ gen_bstat_skb(police->tcf_ge, skb);
if (police->tcfp_ewma_rate &&
- police->tcf_rate_est.bps >= police->tcfp_ewma_rate) {
+ police->tcf_ge_rate_est.bps >= police->tcfp_ewma_rate) {
police->tcf_qstats.overlimits++;
spin_unlock(&police->tcf_lock);
return police->tcf_action;
@@ -476,9 +475,11 @@ struct tcf_police *tcf_police_locate(struct rtattr *rta, struct rtattr *est)
police->tcf_index = parm->index ? parm->index :
tcf_police_new_index();
police->tcf_action = parm->action;
- if (est)
- gen_new_estimator(&police->tcf_bstats, &police->tcf_rate_est,
- &police->tcf_lock, est);
+ if (est) {
+ gen_new_estimator(est, &police->tcf_ge);
+ /* XXX TO-DO: verify successful return from
+ gen_new_estimator()! */
+ }
h = tcf_hash(police->tcf_index, POL_TAB_MASK);
write_lock_bh(&police_lock);
police->tcf_next = tcf_police_ht[h];
@@ -501,11 +502,10 @@ int tcf_police(struct sk_buff *skb, struct tcf_police *police)
spin_lock(&police->tcf_lock);
- police->tcf_bstats.bytes += skb->len;
- police->tcf_bstats.packets++;
+ gen_bstat_skb(police->tcf_ge, skb);
if (police->tcfp_ewma_rate &&
- police->tcf_rate_est.bps >= police->tcfp_ewma_rate) {
+ police->tcf_ge_rate_est.bps >= police->tcfp_ewma_rate) {
police->tcf_qstats.overlimits++;
spin_unlock(&police->tcf_lock);
return police->tcf_action;
@@ -583,8 +583,8 @@ int tcf_police_dump_stats(struct sk_buff *skb, struct tcf_police *police)
&d) < 0)
goto errout;
- if (gnet_stats_copy_basic(&d, &police->tcf_bstats) < 0 ||
- gnet_stats_copy_rate_est(&d, &police->tcf_rate_est) < 0 ||
+ if (gnet_stats_copy_basic(&d, &police->tcf_ge_bstats) < 0 ||
+ gnet_stats_copy_rate_est(&d, &police->tcf_ge_rate_est) < 0 ||
gnet_stats_copy_queue(&d, &police->tcf_qstats) < 0)
goto errout;
diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c
index d92ea26..dbcc846 100644
--- a/net/sched/sch_api.c
+++ b/net/sched/sch_api.c
@@ -504,9 +504,7 @@ qdisc_create(struct net_device *dev, u32 handle, struct rtattr **tca, int *errp)
if (!ops->init || (err = ops->init(sch, tca[TCA_OPTIONS-1])) == 0) {
if (tca[TCA_RATE-1]) {
- err = gen_new_estimator(&sch->bstats, &sch->rate_est,
- sch->stats_lock,
- tca[TCA_RATE-1]);
+ err = gen_new_estimator(tca[TCA_RATE-1], &sch->ge);
if (err) {
/*
* Any broken qdiscs that would require
@@ -545,9 +543,11 @@ static int qdisc_change(struct Qdisc *sch, struct rtattr **tca)
if (err)
return err;
}
- if (tca[TCA_RATE-1])
- gen_replace_estimator(&sch->bstats, &sch->rate_est,
- sch->stats_lock, tca[TCA_RATE-1]);
+ if (tca[TCA_RATE-1]) {
+ gen_replace_estimator(tca[TCA_RATE-1], &sch->ge);
+ /* XXX TO-DO: verify successful return from
+ gen_replace_estimator()! */
+ }
return 0;
}
@@ -822,8 +822,8 @@ static int tc_fill_qdisc(struct sk_buff *skb, struct Qdisc *q, u32 clid,
if (q->ops->dump_stats && q->ops->dump_stats(q, &d) < 0)
goto rtattr_failure;
- if (gnet_stats_copy_basic(&d, &q->bstats) < 0 ||
- gnet_stats_copy_rate_est(&d, &q->rate_est) < 0 ||
+ if (gnet_stats_copy_basic(&d, &q->ge->bstats) < 0 ||
+ gnet_stats_copy_rate_est(&d, &q->ge->rate_est) < 0 ||
gnet_stats_copy_queue(&d, &q->qstats) < 0)
goto rtattr_failure;
diff --git a/net/sched/sch_atm.c b/net/sched/sch_atm.c
index 54b92d2..35065f9 100644
--- a/net/sched/sch_atm.c
+++ b/net/sched/sch_atm.c
@@ -437,8 +437,7 @@ static int atm_tc_enqueue(struct sk_buff *skb,struct Qdisc *sch)
if (flow) flow->qstats.drops++;
return ret;
}
- sch->bstats.bytes += skb->len;
- sch->bstats.packets++;
+ gen_bstat_skb(sch->ge, skb);
flow->bstats.bytes += skb->len;
flow->bstats.packets++;
/*
diff --git a/net/sched/sch_cbq.c b/net/sched/sch_cbq.c
index b184c35..5c00954 100644
--- a/net/sched/sch_cbq.c
+++ b/net/sched/sch_cbq.c
@@ -129,9 +129,10 @@ struct cbq_class
long avgidle;
long deficit; /* Saved deficit for WRR */
psched_time_t penalized;
- struct gnet_stats_basic bstats;
+
+ struct gen_estimator *ge; /* byte and packet stats
+ collected here */
struct gnet_stats_queue qstats;
- struct gnet_stats_rate_est rate_est;
struct tc_cbq_xstats xstats;
struct tcf_proto *filter_list;
@@ -385,7 +386,6 @@ static int
cbq_enqueue(struct sk_buff *skb, struct Qdisc *sch)
{
struct cbq_sched_data *q = qdisc_priv(sch);
- int len = skb->len;
int ret;
struct cbq_class *cl = cbq_classify(skb, sch, &ret);
@@ -404,8 +404,7 @@ cbq_enqueue(struct sk_buff *skb, struct Qdisc *sch)
#endif
if ((ret = cl->q->enqueue(skb, cl->q)) == NET_XMIT_SUCCESS) {
sch->q.qlen++;
- sch->bstats.packets++;
- sch->bstats.bytes+=len;
+ gen_bstat_skb(sch->ge, skb);
cbq_mark_toplevel(q, cl);
if (!cl->next_alive)
cbq_activate_class(cl);
@@ -674,7 +673,6 @@ static enum hrtimer_restart cbq_undelay(struct hrtimer *timer)
static int cbq_reshape_fail(struct sk_buff *skb, struct Qdisc *child)
{
- int len = skb->len;
struct Qdisc *sch = child->__parent;
struct cbq_sched_data *q = qdisc_priv(sch);
struct cbq_class *cl = q->rx_class;
@@ -690,8 +688,7 @@ static int cbq_reshape_fail(struct sk_buff *skb, struct Qdisc *child)
if (cl->q->enqueue(skb, cl->q) == 0) {
sch->q.qlen++;
- sch->bstats.packets++;
- sch->bstats.bytes+=len;
+ gen_bstat_skb(sch->ge, skb);
if (!cl->next_alive)
cbq_activate_class(cl);
return 0;
@@ -749,8 +746,7 @@ cbq_update(struct cbq_sched_data *q)
long avgidle = cl->avgidle;
long idle;
- cl->bstats.packets++;
- cl->bstats.bytes += len;
+ gen_bstat_add(cl->ge, len, 1);
/*
(now - last) is total time between packet right edges.
@@ -1634,8 +1630,8 @@ cbq_dump_class_stats(struct Qdisc *sch, unsigned long arg,
if (cl->undertime != PSCHED_PASTPERFECT)
cl->xstats.undertime = cl->undertime - q->now;
- if (gnet_stats_copy_basic(d, &cl->bstats) < 0 ||
- gnet_stats_copy_rate_est(d, &cl->rate_est) < 0 ||
+ if (gnet_stats_copy_basic(d, &cl->ge->bstats) < 0 ||
+ gnet_stats_copy_rate_est(d, &cl->ge->rate_est) < 0 ||
gnet_stats_copy_queue(d, &cl->qstats) < 0)
return -1;
@@ -1706,7 +1702,7 @@ static void cbq_destroy_class(struct Qdisc *sch, struct cbq_class *cl)
tcf_destroy_chain(cl->filter_list);
qdisc_destroy(cl->q);
qdisc_put_rtab(cl->R_tab);
- gen_kill_estimator(&cl->bstats, &cl->rate_est);
+ gen_kill_estimator(cl->ge);
if (cl != &q->link)
kfree(cl);
}
@@ -1851,10 +1847,11 @@ cbq_change_class(struct Qdisc *sch, u32 classid, u32 parentid, struct rtattr **t
sch_tree_unlock(sch);
- if (tca[TCA_RATE-1])
- gen_replace_estimator(&cl->bstats, &cl->rate_est,
- &sch->dev->queue_lock,
- tca[TCA_RATE-1]);
+ if (tca[TCA_RATE-1]) {
+ gen_replace_estimator(tca[TCA_RATE-1], &cl->ge);
+ /* XXX TO-DO: verify successful return from
+ gen_replace_estimator()! */
+ }
return 0;
}
@@ -1939,9 +1936,11 @@ cbq_change_class(struct Qdisc *sch, u32 classid, u32 parentid, struct rtattr **t
cbq_set_fopt(cl, RTA_DATA(tb[TCA_CBQ_FOPT-1]));
sch_tree_unlock(sch);
- if (tca[TCA_RATE-1])
- gen_new_estimator(&cl->bstats, &cl->rate_est,
- &sch->dev->queue_lock, tca[TCA_RATE-1]);
+ if (tca[TCA_RATE-1]) {
+ gen_new_estimator(tca[TCA_RATE-1], &cl->ge);
+ /* XXX TO-DO: verify successful return from
+ gen_new_estimator()! */
+ }
*arg = (unsigned long)cl;
return 0;
diff --git a/net/sched/sch_dsmark.c b/net/sched/sch_dsmark.c
index 4d2c233..8c44328 100644
--- a/net/sched/sch_dsmark.c
+++ b/net/sched/sch_dsmark.c
@@ -265,8 +265,7 @@ static int dsmark_enqueue(struct sk_buff *skb,struct Qdisc *sch)
return err;
}
- sch->bstats.bytes += skb->len;
- sch->bstats.packets++;
+ gen_bstat_skb(sch->ge, skb);
sch->q.qlen++;
return NET_XMIT_SUCCESS;
diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c
index c81649c..acc6d90 100644
--- a/net/sched/sch_generic.c
+++ b/net/sched/sch_generic.c
@@ -506,7 +506,7 @@ void qdisc_destroy(struct Qdisc *qdisc)
return;
list_del(&qdisc->list);
- gen_kill_estimator(&qdisc->bstats, &qdisc->rate_est);
+ gen_kill_estimator(qdisc->ge);
if (ops->reset)
ops->reset(qdisc);
if (ops->destroy)
diff --git a/net/sched/sch_hfsc.c b/net/sched/sch_hfsc.c
index 874452c..2b95b04 100644
--- a/net/sched/sch_hfsc.c
+++ b/net/sched/sch_hfsc.c
@@ -116,9 +116,10 @@ struct hfsc_class
u32 classid; /* class id */
unsigned int refcnt; /* usage count */
- struct gnet_stats_basic bstats;
+ struct gen_estimator *ge; /* byte and packet stats collected here
+ */
struct gnet_stats_queue qstats;
- struct gnet_stats_rate_est rate_est;
+
unsigned int level; /* class level in hierarchy */
struct tcf_proto *filter_list; /* filter list */
unsigned int filter_cnt; /* filter count */
@@ -1050,10 +1051,11 @@ hfsc_change_class(struct Qdisc *sch, u32 classid, u32 parentid,
}
sch_tree_unlock(sch);
- if (tca[TCA_RATE-1])
- gen_replace_estimator(&cl->bstats, &cl->rate_est,
- &sch->dev->queue_lock,
- tca[TCA_RATE-1]);
+ if (tca[TCA_RATE-1]) {
+ gen_replace_estimator(tca[TCA_RATE-1], &cl->ge);
+ /* XXX TO-DO: verify successful return from
+ gen_replace_estimator()! */
+ }
return 0;
}
@@ -1106,9 +1108,11 @@ hfsc_change_class(struct Qdisc *sch, u32 classid, u32 parentid,
cl->cl_pcvtoff = parent->cl_cvtoff;
sch_tree_unlock(sch);
- if (tca[TCA_RATE-1])
- gen_new_estimator(&cl->bstats, &cl->rate_est,
- &sch->dev->queue_lock, tca[TCA_RATE-1]);
+ if (tca[TCA_RATE-1]) {
+ gen_new_estimator(tca[TCA_RATE-1], &cl->ge);
+ /* XXX TO-DO: verify successful return from
+ gen_new_estimator()! */
+ }
*arg = (unsigned long)cl;
return 0;
}
@@ -1120,7 +1124,7 @@ hfsc_destroy_class(struct Qdisc *sch, struct hfsc_class *cl)
tcf_destroy_chain(cl->filter_list);
qdisc_destroy(cl->qdisc);
- gen_kill_estimator(&cl->bstats, &cl->rate_est);
+ gen_kill_estimator(cl->ge);
if (cl != &q->root)
kfree(cl);
}
@@ -1373,8 +1377,8 @@ hfsc_dump_class_stats(struct Qdisc *sch, unsigned long arg,
xstats.work = cl->cl_total;
xstats.rtwork = cl->cl_cumul;
- if (gnet_stats_copy_basic(d, &cl->bstats) < 0 ||
- gnet_stats_copy_rate_est(d, &cl->rate_est) < 0 ||
+ if (gnet_stats_copy_basic(d, &cl->ge->bstats) < 0 ||
+ gnet_stats_copy_rate_est(d, &cl->ge->rate_est) < 0 ||
gnet_stats_copy_queue(d, &cl->qstats) < 0)
return -1;
@@ -1587,10 +1591,8 @@ hfsc_enqueue(struct sk_buff *skb, struct Qdisc *sch)
if (cl->qdisc->q.qlen == 1)
set_active(cl, len);
- cl->bstats.packets++;
- cl->bstats.bytes += len;
- sch->bstats.packets++;
- sch->bstats.bytes += len;
+ gen_bstat_skb(cl->ge, skb);
+ gen_bstat_skb(sch->ge, skb);
sch->q.qlen++;
return NET_XMIT_SUCCESS;
diff --git a/net/sched/sch_htb.c b/net/sched/sch_htb.c
index b417a95..25bddd8 100644
--- a/net/sched/sch_htb.c
+++ b/net/sched/sch_htb.c
@@ -71,9 +71,8 @@ enum htb_cmode {
struct htb_class {
/* general class parameters */
u32 classid;
- struct gnet_stats_basic bstats;
+ struct gen_estimator *ge; /* byte and packet stats collected here */
struct gnet_stats_queue qstats;
- struct gnet_stats_rate_est rate_est;
struct tc_htb_xstats xstats; /* our special stats */
int refcnt; /* usage count of this class */
@@ -603,15 +602,12 @@ static int htb_enqueue(struct sk_buff *skb, struct Qdisc *sch)
cl->qstats.drops++;
return NET_XMIT_DROP;
} else {
- cl->bstats.packets +=
- skb_is_gso(skb)?skb_shinfo(skb)->gso_segs:1;
- cl->bstats.bytes += skb->len;
+ gen_bstat_skb(cl->ge, skb);
htb_activate(q, cl);
}
sch->q.qlen++;
- sch->bstats.packets += skb_is_gso(skb)?skb_shinfo(skb)->gso_segs:1;
- sch->bstats.bytes += skb->len;
+ gen_bstat_skb(sch->ge, skb);
return NET_XMIT_SUCCESS;
}
@@ -696,9 +692,7 @@ static void htb_charge_class(struct htb_sched *q, struct htb_class *cl,
/* update byte stats except for leaves which are already updated */
if (cl->level) {
- cl->bstats.bytes += bytes;
- cl->bstats.packets += skb_is_gso(skb)?
- skb_shinfo(skb)->gso_segs:1;
+ gen_bstat_skb(cl->ge, skb);
}
cl = cl->parent;
}
@@ -1112,8 +1106,8 @@ htb_dump_class_stats(struct Qdisc *sch, unsigned long arg, struct gnet_dump *d)
cl->xstats.tokens = cl->tokens;
cl->xstats.ctokens = cl->ctokens;
- if (gnet_stats_copy_basic(d, &cl->bstats) < 0 ||
- gnet_stats_copy_rate_est(d, &cl->rate_est) < 0 ||
+ if (gnet_stats_copy_basic(d, &cl->ge->bstats) < 0 ||
+ gnet_stats_copy_rate_est(d, &cl->ge->rate_est) < 0 ||
gnet_stats_copy_queue(d, &cl->qstats) < 0)
return -1;
@@ -1204,7 +1198,7 @@ static void htb_destroy_class(struct Qdisc *sch, struct htb_class *cl)
BUG_TRAP(cl->un.leaf.q);
qdisc_destroy(cl->un.leaf.q);
}
- gen_kill_estimator(&cl->bstats, &cl->rate_est);
+ gen_kill_estimator(cl->ge);
qdisc_put_rtab(cl->rate);
qdisc_put_rtab(cl->ceil);
@@ -1357,9 +1351,10 @@ static int htb_change_class(struct Qdisc *sch, u32 classid,
if ((cl = kzalloc(sizeof(*cl), GFP_KERNEL)) == NULL)
goto failure;
- gen_new_estimator(&cl->bstats, &cl->rate_est,
- &sch->dev->queue_lock,
- tca[TCA_RATE-1] ? : &est.rta);
+ gen_new_estimator(tca[TCA_RATE-1] ? : &est.rta, &cl->ge);
+ /* XXX TO-DO: verify successful return from
+ gen_new_estimator()! */
+
cl->refcnt = 1;
INIT_LIST_HEAD(&cl->sibling);
INIT_HLIST_NODE(&cl->hlist);
@@ -1412,10 +1407,11 @@ static int htb_change_class(struct Qdisc *sch, u32 classid,
list_add_tail(&cl->sibling,
parent ? &parent->children : &q->root);
} else {
- if (tca[TCA_RATE-1])
- gen_replace_estimator(&cl->bstats, &cl->rate_est,
- &sch->dev->queue_lock,
- tca[TCA_RATE-1]);
+ if (tca[TCA_RATE-1]) {
+ gen_replace_estimator(tca[TCA_RATE-1], &cl->ge);
+ /* XXX TO-DO: verify successful return from
+ gen_replace_estimator()! */
+ }
sch_tree_lock(sch);
}
diff --git a/net/sched/sch_ingress.c b/net/sched/sch_ingress.c
index cd0aab6..7be51ad 100644
--- a/net/sched/sch_ingress.c
+++ b/net/sched/sch_ingress.c
@@ -145,8 +145,7 @@ static int ingress_enqueue(struct sk_buff *skb,struct Qdisc *sch)
* firewall FW_* code.
*/
#ifdef CONFIG_NET_CLS_ACT
- sch->bstats.packets++;
- sch->bstats.bytes += skb->len;
+ gen_bstat_skb(sch->ge, skb);
switch (result) {
case TC_ACT_SHOT:
result = TC_ACT_SHOT;
@@ -176,8 +175,7 @@ static int ingress_enqueue(struct sk_buff *skb,struct Qdisc *sch)
case TC_POLICE_OK:
case TC_POLICE_UNSPEC:
default:
- sch->bstats.packets++;
- sch->bstats.bytes += skb->len;
+ gen_bstat_skb(sch->ge, skb);
result = NF_ACCEPT;
break;
}
@@ -185,8 +183,7 @@ static int ingress_enqueue(struct sk_buff *skb,struct Qdisc *sch)
#else
D2PRINTK("Overriding result to ACCEPT\n");
result = NF_ACCEPT;
- sch->bstats.packets++;
- sch->bstats.bytes += skb->len;
+ gen_bstat_skb(sch->ge, skb);
#endif
#endif
diff --git a/net/sched/sch_netem.c b/net/sched/sch_netem.c
index 9e5e87e..77c3575 100644
--- a/net/sched/sch_netem.c
+++ b/net/sched/sch_netem.c
@@ -231,8 +231,7 @@ static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch)
if (likely(ret == NET_XMIT_SUCCESS)) {
sch->q.qlen++;
- sch->bstats.bytes += skb->len;
- sch->bstats.packets++;
+ gen_bstat_skb(sch->ge, skb);
} else
sch->qstats.drops++;
@@ -506,8 +505,7 @@ static int tfifo_enqueue(struct sk_buff *nskb, struct Qdisc *sch)
__skb_queue_after(list, skb, nskb);
sch->qstats.backlog += nskb->len;
- sch->bstats.bytes += nskb->len;
- sch->bstats.packets++;
+ gen_bstat_skb(sch->ge, nskb);
return NET_XMIT_SUCCESS;
}
diff --git a/net/sched/sch_prio.c b/net/sched/sch_prio.c
index 2d8c084..5fb1954 100644
--- a/net/sched/sch_prio.c
+++ b/net/sched/sch_prio.c
@@ -88,8 +88,7 @@ prio_enqueue(struct sk_buff *skb, struct Qdisc *sch)
#endif
if ((ret = qdisc->enqueue(skb, qdisc)) == NET_XMIT_SUCCESS) {
- sch->bstats.bytes += skb->len;
- sch->bstats.packets++;
+ gen_bstat_skb(sch->ge, skb);
sch->q.qlen++;
return NET_XMIT_SUCCESS;
}
@@ -430,7 +429,7 @@ static int prio_dump_class_stats(struct Qdisc *sch, unsigned long cl,
struct Qdisc *cl_q;
cl_q = q->queues[cl - 1];
- if (gnet_stats_copy_basic(d, &cl_q->bstats) < 0 ||
+ if (gnet_stats_copy_basic(d, &cl_q->ge->bstats) < 0 ||
gnet_stats_copy_queue(d, &cl_q->qstats) < 0)
return -1;
diff --git a/net/sched/sch_red.c b/net/sched/sch_red.c
index 9b95fef..7161a37 100644
--- a/net/sched/sch_red.c
+++ b/net/sched/sch_red.c
@@ -94,8 +94,7 @@ static int red_enqueue(struct sk_buff *skb, struct Qdisc* sch)
ret = child->enqueue(skb, child);
if (likely(ret == NET_XMIT_SUCCESS)) {
- sch->bstats.bytes += skb->len;
- sch->bstats.packets++;
+ gen_bstat_skb(sch->ge, skb);
sch->q.qlen++;
} else {
q->stats.pdrop++;
diff --git a/net/sched/sch_sfq.c b/net/sched/sch_sfq.c
index 9579573..4880299 100644
--- a/net/sched/sch_sfq.c
+++ b/net/sched/sch_sfq.c
@@ -271,8 +271,7 @@ sfq_enqueue(struct sk_buff *skb, struct Qdisc* sch)
}
}
if (++sch->q.qlen < q->limit-1) {
- sch->bstats.bytes += skb->len;
- sch->bstats.packets++;
+ gen_bstat_skb(sch->ge, skb);
return 0;
}
diff --git a/net/sched/sch_tbf.c b/net/sched/sch_tbf.c
index 22e431d..817eed2 100644
--- a/net/sched/sch_tbf.c
+++ b/net/sched/sch_tbf.c
@@ -139,8 +139,7 @@ static int tbf_enqueue(struct sk_buff *skb, struct Qdisc* sch)
}
sch->q.qlen++;
- sch->bstats.bytes += skb->len;
- sch->bstats.packets++;
+ gen_bstat_skb(sch->ge, skb);
return 0;
}
diff --git a/net/sched/sch_teql.c b/net/sched/sch_teql.c
index 0968184..af6deaa 100644
--- a/net/sched/sch_teql.c
+++ b/net/sched/sch_teql.c
@@ -83,8 +83,7 @@ teql_enqueue(struct sk_buff *skb, struct Qdisc* sch)
if (q->q.qlen < dev->tx_queue_len) {
__skb_queue_tail(&q->q, skb);
- sch->bstats.bytes += skb->len;
- sch->bstats.packets++;
+ gen_bstat_skb(sch->ge, skb);
return 0;
}
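For a caller, the reworked API boils down to something like this (a usage sketch with hypothetical qdisc callbacks, following the patch above; sch->ge starts out NULL since qdiscs are zero-allocated):

static int my_init(struct Qdisc *sch, struct rtattr *opt)
{
	return gen_new_estimator(opt, &sch->ge);
}

static int my_enqueue(struct sk_buff *skb, struct Qdisc *sch)
{
	gen_bstat_skb(sch->ge, skb);	/* counts gso_segs for GSO skbs */
	sch->q.qlen++;
	return NET_XMIT_SUCCESS;
}

static void my_destroy(struct Qdisc *sch)
{
	gen_kill_estimator(sch->ge);	/* struct freed via call_rcu() */
}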
* Re: [NET]: gen_estimator deadlock fix
From: Jarek Poplawski @ 2007-07-13 12:17 UTC
To: Ranko Zivojnovic; +Cc: Patrick McHardy, netdev
On Thu, Jul 12, 2007 at 08:48:45PM +0300, Ranko Zivojnovic wrote:
...
> Ok - here's the patch for review - it compiles clean ... and that's as
> much as it has been tested - I'll try to give it a run later today or
> definitely tomorrow.
...
Ranko, you have some powers!
Alas, I definitely need more time. I hope I'll manage to check this
by Monday. Of course, if testing is OK and somebody checks
and acks this earlier, I don't see any reason to wait for my opinion.
Thanks & good weekend,
Jarek P.
* Re: [NET]: gen_estimator deadlock fix
From: Ranko Zivojnovic @ 2007-07-13 12:26 UTC
To: Jarek Poplawski; +Cc: Patrick McHardy, netdev
On Fri, 2007-07-13 at 14:17 +0200, Jarek Poplawski wrote:
> On Thu, Jul 12, 2007 at 08:48:45PM +0300, Ranko Zivojnovic wrote:
> ...
> > Ok - here's the patch for review - it compiles clean ... and that's as
> > much as it has been tested - I'll try to give it a run later today or
> > definitely tomorrow.
> ...
>
> Ranko, you have some powers!
>
> Alas, I definitely need more time. I hope I'll manage to check this
> by Monday. Of course, if testing is OK and somebody checks
> and acks this earlier, I don't see any reason to wait for my opinion.
Don't bother - just tested and it is a no-go (it would have been a
wonder if it worked the first time :)) - have to resolve the issues
with qdiscs that do not calculate/use rates...
I'm not sure if it is desirable to have rates available on all qdiscs -
even on pfifo_fast... that way, when you issue...
# tc -s qdisc show dev eth0
qdisc pfifo_fast 0: root bands 3 priomap 1 2 2 2 1 2 0 0 1 1 1 1 1 1 1 1
Sent 115971059 bytes 196761 pkt (dropped 0, overlimits 0 requeues 0)
rate 0bit 0pps backlog 0b 0p requeues 0
...you will not get the 'rate 0bit'.
R.
* Re: [NET]: gen_estimator deadlock fix
From: Jarek Poplawski @ 2007-07-13 13:42 UTC
To: Ranko Zivojnovic; +Cc: Patrick McHardy, netdev
On Fri, Jul 13, 2007 at 03:26:42PM +0300, Ranko Zivojnovic wrote:
> On Fri, 2007-07-13 at 14:17 +0200, Jarek Poplawski wrote:
> > On Thu, Jul 12, 2007 at 08:48:45PM +0300, Ranko Zivojnovic wrote:
> > ...
> > > Ok - here's the patch for review - it compiles clean ... and that's as
> > > much as it has been tested - I'll try to give it a run later today or
> > > definitely tomorrow.
> > ...
> >
> > Ranko, you have some powers!
> >
> > Alas, I definitely need more time. I hope I'll manage to check this
> > by Monday. Of course, if testing is OK and somebody checks
> > and acks this earlier, I don't see any reason to wait for my opinion.
>
> Don't bother - just tested and it is a no-go (it would have been a
> wonder if it worked the first time :)) - have to resolve the issues
> with qdiscs that do not calculate/use rates...
>
> I'm not sure if it is desirable to have rates available on all qdiscs -
> even on pfifo_fast... that way, when you issue...
>
> # tc -s qdisc show dev eth0
> qdisc pfifo_fast 0: root bands 3 priomap 1 2 2 2 1 2 0 0 1 1 1 1 1 1 1 1
> Sent 115971059 bytes 196761 pkt (dropped 0, overlimits 0 requeues 0)
> rate 0bit 0pps backlog 0b 0p requeues 0
>
> ...you will no longer get the 'rate 0bit' shown above.
>
I've been a bit tight on time today, and only now I see that maybe
you have done too much. Of course, you can do it your way, but I
think it should be easier not to change too much at once: if possible,
only embed structs bstats and rate_est into struct gen_estimator,
and adjust the few sch_* and act_* modules which use them, with all
accesses and allocations changed appropriately. So only 6 or 7 files
should be affected. It seems no changes to gen_stats are necessary
now (next patch?).
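For illustration, the embedding idea would make the estimator own the
stats instead of pointing at them - roughly like this (a sketch only,
not a posted patch; the fields are the ones from the structure above):

struct gen_estimator
{
	struct list_head		list;
	struct gnet_stats_basic		bstats;		/* embedded ... */
	struct gnet_stats_rate_est	rate_est;	/* ... not pointers */
	spinlock_t			*stats_lock;
	int				ewma_log;
	u64				last_bytes;
	u32				last_packets;
	u32				avpps;
	u32				avbps;
	struct rcu_head			e_rcu;
};

with the sch_* and act_* users reading their rates through the
estimator rather than owning the structures themselves.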
I really can't check more now (or answer until Monday).
Bye,
Jarek P.
PS: I'm in a little hurry now so I hope I'm not very wrong above...
* Re: [NET]: gen_estimator deadlock fix
2007-07-13 13:42 ` Jarek Poplawski
@ 2007-07-16 7:00 ` Jarek Poplawski
2007-07-16 13:03 ` Patrick McHardy
0 siblings, 1 reply; 18+ messages in thread
From: Jarek Poplawski @ 2007-07-16 7:00 UTC (permalink / raw)
To: Ranko Zivojnovic; +Cc: Patrick McHardy, netdev
On Fri, Jul 13, 2007 at 03:42:31PM +0200, Jarek Poplawski wrote:
...
> On Fri, Jul 13, 2007 at 03:26:42PM +0300, Ranko Zivojnovic wrote:
> I've been a bit tight on time today, and only now I see that maybe
> you have done too much. Of course, you can do it your way, but I
> think it should be easier not to change too much at once: if possible,
> only embed structs bstats and rate_est into struct gen_estimator,
> and adjust the few sch_* and act_* modules which use them, with all
> accesses and allocations changed appropriately. So only 6 or 7 files
> should be affected. It seems no changes to gen_stats are necessary
> now (next patch?).
Alas, I was wrong. This idea really requires changing all these files
plus even more! If only I could've foreseen this...
There is probably a quite easy way to get rid of this one race alone,
e.g. by replacing the *bstats field with NULL in gen_kill_estimator
and checking for this in est_timer just after taking the lock.
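In code, the suggestion amounts to something like the following guard
(a rough sketch of the idea only - the actual patch posted later in
this thread is the authoritative version):

	/* gen_kill_estimator(): detach the stats under est_lock
	 * instead of unlinking the whole entry under it */
	write_lock_bh(&est_lock);
	e->bstats = NULL;
	write_unlock_bh(&est_lock);

	/* est_timer(): skip entries that are being torn down */
	spin_lock(e->stats_lock);
	read_lock(&est_lock);
	if (e->bstats == NULL)
		goto skip;
	/* ... update e->rate_est from e->bstats as before ... */
skip:
	read_unlock(&est_lock);
	spin_unlock(e->stats_lock);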
The gain from an API change would be mainly faster gen_kill_estimator
and gen_replace_estimator. But, on the other hand, you have it
almost done. Ranko, I think these changes need the opinion of more
maintainers as soon as possible - after all, they could have some
objections; IMHO at least: Jamal Hadi Salim, Thomas Graf (authors
plus act_), Stephen Hemminger (sch_netem) and Jiri Benc
(net/mac80211).
I also have some suggestions, but maybe it would be better to make
them against the current version (the main one is about this
*stats_lock - you probably misunderstood Patrick - but I hope you
found it yourself).
Regards,
Jarek P.
* Re: [NET]: gen_estimator deadlock fix
2007-07-16 7:00 ` Jarek Poplawski
@ 2007-07-16 13:03 ` Patrick McHardy
2007-07-16 17:45 ` Ranko Zivojnovic
0 siblings, 1 reply; 18+ messages in thread
From: Patrick McHardy @ 2007-07-16 13:03 UTC (permalink / raw)
To: Jarek Poplawski; +Cc: Ranko Zivojnovic, netdev
Jarek Poplawski wrote:
> There is probably a quite easy way to get rid of this one race alone,
> e.g. by replacing the *bstats field with NULL in gen_kill_estimator
> and checking for this in est_timer just after taking the lock.
>
> The gain from an API change would be mainly faster gen_kill_estimator
> and gen_replace_estimator. But, on the other hand, you have it
> almost done. Ranko, I think these changes need the opinion of more
> maintainers as soon as possible - after all, they could have some
> objections; IMHO at least: Jamal Hadi Salim, Thomas Graf (authors
> plus act_), Stephen Hemminger (sch_netem) and Jiri Benc
> (net/mac80211).
Frankly, all we need is a final patch.
* Re: [NET]: gen_estimator deadlock fix
2007-07-16 13:03 ` Patrick McHardy
@ 2007-07-16 17:45 ` Ranko Zivojnovic
2007-07-17 1:28 ` David Miller
2007-07-17 12:04 ` Jarek Poplawski
0 siblings, 2 replies; 18+ messages in thread
From: Ranko Zivojnovic @ 2007-07-16 17:45 UTC (permalink / raw)
To: Patrick McHardy; +Cc: Jarek Poplawski, netdev
[-- Attachment #1: Type: text/plain, Size: 1901 bytes --]
Patrick McHardy wrote:
> Jarek Poplawski wrote:
> > There is probably a quite easy way to get rid of this one race alone,
> > e.g. by replacing the *bstats field with NULL in gen_kill_estimator
> > and checking for this in est_timer just after taking the lock.
> >
You are absolutely right - I definitely overcomplicated the solution to
the actual problem at hand - the deadlock... Attached is the patch that
does exactly that and should put a final resolution to the current
gen_estimator issues... I've also given it a spin and so far it looks
OK. I will re-post it tomorrow evening for inclusion in the tree if no
objections come in the meantime.
> Jarek Poplawski wrote:
> > The gain from an API change would be mainly faster gen_kill_estimator
> > and gen_replace_estimator. But, on the other hand, you have it
> > almost done. Ranko, I think these changes need the opinion of more
> > maintainers as soon as possible - after all, they could have some
> > objections; IMHO at least: Jamal Hadi Salim, Thomas Graf (authors
> > plus act_), Stephen Hemminger (sch_netem) and Jiri Benc
> > (net/mac80211).
From my point of view the API change would help in untying the knot
entirely... It would become an independent piece of code with a sole
focus on rate estimation. I am thinking of making it even more
independent than the previous patch I posted... A small, pure library
that can be configured to track rates with configurable algorithms
etc... It may prove to be useful for other things as well.
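As an illustration of that library idea, here is a minimal userspace
sketch of the same fixed-point EWMA scheme gen_estimator uses (all
names and the API here are hypothetical, not a posted patch):

#include <stdint.h>

/* Averages are kept scaled by 2^5; smoothing is set by ewma_log and
 * the sampling period is 2^idx/4 seconds, as in elist[] (idx <= 5). */
struct rate_tracker {
	uint64_t last_bytes;
	uint32_t avbps;		/* average rate, (bytes/sec) << 5 */
	int	 ewma_log;	/* larger value = smoother average */
	int	 idx;		/* period index, as in elist[] */
};

/* Feed the running byte counter once per period; returns bytes/sec. */
static uint32_t rate_tracker_update(struct rate_tracker *rt,
				    uint64_t nbytes)
{
	uint32_t rate = (nbytes - rt->last_bytes) << (7 - rt->idx);

	rt->last_bytes = nbytes;
	rt->avbps += ((int32_t)rate - (int32_t)rt->avbps) >> rt->ewma_log;
	return (rt->avbps + 0xF) >> 5;
}

A configurable-algorithm version would simply swap the update function
behind a function pointer.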
But now that the gen_estimator issues are hopefully resolved through
this simpler patch, the API change can proceed at its own pace.
Patrick McHardy wrote:
> Frankly, all we need is a final patch.
The patch that resolves the ABBA deadlock is attached. WRT the API
patch - although I have not faced any problems so far, I still need
to do more comprehensive testing...
Best regards, and thanks for your patience,
R.
[-- Attachment #2: x --]
[-- Type: text/plain, Size: 5699 bytes --]
[NET] gen_estimator deadlock fix
-Fixes ABBA deadlock noted by Patrick McHardy <kaber@trash.net>:
> There is at least one ABBA deadlock, est_timer() does:
> read_lock(&est_lock)
> spin_lock(e->stats_lock) (which is dev->queue_lock)
>
> and qdisc_destroy calls htb_destroy under dev->queue_lock, which
> calls htb_destroy_class, then gen_kill_estimator and this
> write_locks est_lock.
To fix the ABBA deadlock the rate estimators are now kept on an rcu list.
-The est_lock use changes from protecting the list to protecting
the update of the 'bstats' pointer, in order to avoid a NULL dereference.
-The 'interval' member of the gen_estimator structure is removed as it
is no longer needed.
Signed-off-by: Ranko Zivojnovic <ranko@spidernet.net>
---
diff --git a/net/core/gen_estimator.c b/net/core/gen_estimator.c
index cc84d8d..590a767 100644
--- a/net/core/gen_estimator.c
+++ b/net/core/gen_estimator.c
@@ -79,27 +79,27 @@
struct gen_estimator
{
- struct gen_estimator *next;
+ struct list_head list;
struct gnet_stats_basic *bstats;
struct gnet_stats_rate_est *rate_est;
spinlock_t *stats_lock;
- unsigned interval;
int ewma_log;
u64 last_bytes;
u32 last_packets;
u32 avpps;
u32 avbps;
+ struct rcu_head e_rcu;
};
struct gen_estimator_head
{
struct timer_list timer;
- struct gen_estimator *list;
+ struct list_head list;
};
static struct gen_estimator_head elist[EST_MAX_INTERVAL+1];
-/* Estimator array lock */
+/* Protects against NULL dereference */
static DEFINE_RWLOCK(est_lock);
static void est_timer(unsigned long arg)
@@ -107,13 +107,17 @@ static void est_timer(unsigned long arg)
int idx = (int)arg;
struct gen_estimator *e;
- read_lock(&est_lock);
- for (e = elist[idx].list; e; e = e->next) {
+ rcu_read_lock();
+ list_for_each_entry_rcu(e, &elist[idx].list, list) {
u64 nbytes;
u32 npackets;
u32 rate;
spin_lock(e->stats_lock);
+ read_lock(&est_lock);
+ if (e->bstats == NULL)
+ goto skip;
+
nbytes = e->bstats->bytes;
npackets = e->bstats->packets;
rate = (nbytes - e->last_bytes)<<(7 - idx);
@@ -125,12 +129,14 @@ static void est_timer(unsigned long arg)
e->last_packets = npackets;
e->avpps += ((long)rate - (long)e->avpps) >> e->ewma_log;
e->rate_est->pps = (e->avpps+0x1FF)>>10;
+skip:
+ read_unlock(&est_lock);
spin_unlock(e->stats_lock);
}
- if (elist[idx].list != NULL)
+ if (!list_empty(&elist[idx].list))
mod_timer(&elist[idx].timer, jiffies + ((HZ<<idx)/4));
- read_unlock(&est_lock);
+ rcu_read_unlock();
}
/**
@@ -147,12 +153,17 @@ static void est_timer(unsigned long arg)
* &rate_est with the statistics lock grabed during this period.
*
* Returns 0 on success or a negative error code.
+ *
+ * NOTE: Called under rtnl_mutex
*/
int gen_new_estimator(struct gnet_stats_basic *bstats,
- struct gnet_stats_rate_est *rate_est, spinlock_t *stats_lock, struct rtattr *opt)
+ struct gnet_stats_rate_est *rate_est,
+ spinlock_t *stats_lock,
+ struct rtattr *opt)
{
struct gen_estimator *est;
struct gnet_estimator *parm = RTA_DATA(opt);
+ int idx;
if (RTA_PAYLOAD(opt) < sizeof(*parm))
return -EINVAL;
@@ -164,7 +175,7 @@ int gen_new_estimator(struct gnet_stats_basic *bstats,
if (est == NULL)
return -ENOBUFS;
- est->interval = parm->interval + 2;
+ idx = parm->interval + 2;
est->bstats = bstats;
est->rate_est = rate_est;
est->stats_lock = stats_lock;
@@ -174,20 +185,25 @@ int gen_new_estimator(struct gnet_stats_basic *bstats,
est->last_packets = bstats->packets;
est->avpps = rate_est->pps<<10;
- est->next = elist[est->interval].list;
- if (est->next == NULL) {
- init_timer(&elist[est->interval].timer);
- elist[est->interval].timer.data = est->interval;
- elist[est->interval].timer.expires = jiffies + ((HZ<<est->interval)/4);
- elist[est->interval].timer.function = est_timer;
- add_timer(&elist[est->interval].timer);
+ if (!elist[idx].timer.function) {
+ INIT_LIST_HEAD(&elist[idx].list);
+ setup_timer(&elist[idx].timer, est_timer, idx);
}
- write_lock_bh(&est_lock);
- elist[est->interval].list = est;
- write_unlock_bh(&est_lock);
+
+ if (list_empty(&elist[idx].list))
+ mod_timer(&elist[idx].timer, jiffies + ((HZ<<idx)/4));
+
+ list_add_rcu(&est->list, &elist[idx].list);
return 0;
}
+static void __gen_kill_estimator(struct rcu_head *head)
+{
+ struct gen_estimator *e = container_of(head,
+ struct gen_estimator, e_rcu);
+ kfree(e);
+}
+
/**
* gen_kill_estimator - remove a rate estimator
* @bstats: basic statistics
@@ -195,31 +211,32 @@ int gen_new_estimator(struct gnet_stats_basic *bstats,
*
* Removes the rate estimator specified by &bstats and &rate_est
* and deletes the timer.
+ *
+ * NOTE: Called under rtnl_mutex
*/
void gen_kill_estimator(struct gnet_stats_basic *bstats,
struct gnet_stats_rate_est *rate_est)
{
int idx;
- struct gen_estimator *est, **pest;
+ struct gen_estimator *e, *n;
for (idx=0; idx <= EST_MAX_INTERVAL; idx++) {
- int killed = 0;
- pest = &elist[idx].list;
- while ((est=*pest) != NULL) {
- if (est->rate_est != rate_est || est->bstats != bstats) {
- pest = &est->next;
+
+ /* Skip non initialized indexes */
+ if (!elist[idx].timer.function)
+ continue;
+
+ list_for_each_entry_safe(e, n, &elist[idx].list, list) {
+ if (e->rate_est != rate_est || e->bstats != bstats)
continue;
- }
write_lock_bh(&est_lock);
- *pest = est->next;
+ e->bstats = NULL;
write_unlock_bh(&est_lock);
- kfree(est);
- killed++;
+ list_del_rcu(&e->list);
+ call_rcu(&e->e_rcu, __gen_kill_estimator);
}
- if (killed && elist[idx].list == NULL)
- del_timer(&elist[idx].timer);
}
}
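For context, this is how a hypothetical qdisc would drive the fixed
API (the class structure and function names below are made up; only
the gen_new_estimator/gen_kill_estimator calls and the use of
dev->queue_lock as stats_lock come from the patch and thread above):

struct my_class {
	struct gnet_stats_basic		bstats;
	struct gnet_stats_rate_est	rate_est;
};

static int my_class_init(struct my_class *cl, struct Qdisc *sch,
			 struct rtattr *est_opt)
{
	/* stats_lock is dev->queue_lock, as for the qdisc classes
	 * discussed in this thread; called under rtnl_mutex */
	if (est_opt)
		return gen_new_estimator(&cl->bstats, &cl->rate_est,
					 &sch->dev->queue_lock, est_opt);
	return 0;
}

static void my_class_destroy(struct my_class *cl)
{
	/* Also under rtnl_mutex; est_timer() skips the entry once
	 * bstats is NULLed, and the RCU callback frees it later. */
	gen_kill_estimator(&cl->bstats, &cl->rate_est);
}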
* Re: [NET]: gen_estimator deadlock fix
2007-07-16 17:45 ` Ranko Zivojnovic
@ 2007-07-17 1:28 ` David Miller
2007-07-17 12:04 ` Jarek Poplawski
1 sibling, 0 replies; 18+ messages in thread
From: David Miller @ 2007-07-17 1:28 UTC (permalink / raw)
To: ranko; +Cc: kaber, jarkao2, netdev
From: Ranko Zivojnovic <ranko@spidernet.net>
Date: Mon, 16 Jul 2007 20:45:05 +0300
> [NET] gen_estimator deadlock fix
>
> -Fixes ABBA deadlock noted by Patrick McHardy <kaber@trash.net>:
>
> > There is at least one ABBA deadlock, est_timer() does:
> > read_lock(&est_lock)
> > spin_lock(e->stats_lock) (which is dev->queue_lock)
> >
> > and qdisc_destroy calls htb_destroy under dev->queue_lock, which
> > calls htb_destroy_class, then gen_kill_estimator and this
> > write_locks est_lock.
>
> To fix the ABBA deadlock the rate estimators are now kept on an rcu list.
>
> -The est_lock use changes from protecting the list to protecting
> the update of the 'bstats' pointer, in order to avoid a NULL dereference.
>
> -The 'interval' member of the gen_estimator structure is removed as it
> is no longer needed.
>
> Signed-off-by: Ranko Zivojnovic <ranko@spidernet.net>
Applied, thanks.
* Re: [NET]: gen_estimator deadlock fix
2007-07-17 12:04 ` Jarek Poplawski
@ 2007-07-17 12:01 ` Patrick McHardy
2007-07-17 12:28 ` Jarek Poplawski
0 siblings, 1 reply; 18+ messages in thread
From: Patrick McHardy @ 2007-07-17 12:01 UTC (permalink / raw)
To: Jarek Poplawski; +Cc: Ranko Zivojnovic, netdev
Jarek Poplawski wrote:
> This patch looks fine, but while checking for this lock I've found
> another strange thing: for actions, tcfc_stats_lock is used here, which
> is equivalent to tcfc_lock; so, in gen_kill_estimator we take this lock
> sometimes after dev->queue_lock; this order is also possible during
> tc_classify if actions are used; on the other hand, act_mirred calls
> dev_queue_xmit under this lock, so dev->queue_lock is taken in the
> opposite order. I hope it's with different devs, and there is no real
> deadlock possible, but this all is a bit queer...
It *should* be a different device, but AFAIK nothing enforces this.
There are quite a few possible deadlocks with TC actions; mid-term,
things like the mirred action need to be redesigned to inject packets
from a different context.
> I don't know actions well enough, but if they are always run only
> from tc_classify, under dev->queue_lock, maybe it would be simpler
> to use this lock for actions' stats with gen_estimator too.
The same action can be shared between devices, so they need separate
locks.
* Re: [NET]: gen_estimator deadlock fix
2007-07-16 17:45 ` Ranko Zivojnovic
2007-07-17 1:28 ` David Miller
@ 2007-07-17 12:04 ` Jarek Poplawski
2007-07-17 12:01 ` Patrick McHardy
1 sibling, 1 reply; 18+ messages in thread
From: Jarek Poplawski @ 2007-07-17 12:04 UTC (permalink / raw)
To: Patrick McHardy; +Cc: Ranko Zivojnovic, netdev
On Mon, Jul 16, 2007 at 08:45:05PM +0300, Ranko Zivojnovic wrote:
...
> [NET] gen_estimator deadlock fix
>
> -Fixes ABBA deadlock noted by Patrick McHardy <kaber@trash.net>:
>
> > There is at least one ABBA deadlock, est_timer() does:
> > read_lock(&est_lock)
> > spin_lock(e->stats_lock) (which is dev->queue_lock)
> >
> > and qdisc_destroy calls htb_destroy under dev->queue_lock, which
> > calls htb_destroy_class, then gen_kill_estimator and this
> > write_locks est_lock.
>
> To fix the ABBA deadlock the rate estimators are now kept on an rcu list.
>
> -The est_lock use changes from protecting the list to protecting
> the update of the 'bstats' pointer, in order to avoid a NULL dereference.
This patch looks fine, but while checking for this lock I've found
another strange thing: for actions, tcfc_stats_lock is used here, which
is equivalent to tcfc_lock; so, in gen_kill_estimator we take this lock
sometimes after dev->queue_lock; this order is also possible during
tc_classify if actions are used; on the other hand, act_mirred calls
dev_queue_xmit under this lock, so dev->queue_lock is taken in the
opposite order. I hope it's with different devs, and there is no real
deadlock possible, but this all is a bit queer...
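The inversion described above can be shown in isolation; a standalone
userspace sketch with pthreads (nothing kernel-specific here - the
lock names only mimic the kernel ones):

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t queue_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_mutex_t tcfc_lock = PTHREAD_MUTEX_INITIALIZER;

/* gen_kill_estimator path: dev->queue_lock, then tcfc_lock */
static void *kill_path(void *arg)
{
	pthread_mutex_lock(&queue_lock);
	pthread_mutex_lock(&tcfc_lock);
	pthread_mutex_unlock(&tcfc_lock);
	pthread_mutex_unlock(&queue_lock);
	return NULL;
}

/* act_mirred path: tcfc_lock, then dev->queue_lock via dev_queue_xmit */
static void *mirred_path(void *arg)
{
	pthread_mutex_lock(&tcfc_lock);
	pthread_mutex_lock(&queue_lock);	/* inverted order: ABBA */
	pthread_mutex_unlock(&queue_lock);
	pthread_mutex_unlock(&tcfc_lock);
	return NULL;
}

int main(void)
{
	pthread_t a, b;

	pthread_create(&a, NULL, kill_path, NULL);
	pthread_create(&b, NULL, mirred_path, NULL);
	pthread_join(a, NULL);
	pthread_join(b, NULL);
	puts("no deadlock this run; loop both paths to hit it");
	return 0;
}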
I don't know actions well enough, but if they are always run only
from tc_classify, under dev->queue_lock, maybe it would be simpler
to use this lock for actions' stats with gen_estimator too. And if
gen_kill_estimator is sometimes run with dev->queue_lock held, maybe
doing so always would make this locking really understandable and
less prone to such inversions (plus est_lock could be forgotten)?
Regards,
Jarek P.
* Re: [NET]: gen_estimator deadlock fix
2007-07-17 12:01 ` Patrick McHardy
@ 2007-07-17 12:28 ` Jarek Poplawski
0 siblings, 0 replies; 18+ messages in thread
From: Jarek Poplawski @ 2007-07-17 12:28 UTC (permalink / raw)
To: Patrick McHardy; +Cc: Ranko Zivojnovic, netdev
On Tue, Jul 17, 2007 at 02:01:48PM +0200, Patrick McHardy wrote:
...
Thanks,
Jarek P.
end of thread
Thread overview: 18+ messages
2007-07-11 13:41 [NET]: gen_estimator deadlock fix Ranko Zivojnovic
2007-07-12 7:37 ` Jarek Poplawski
2007-07-12 9:18 ` Ranko Zivojnovic
2007-07-12 9:40 ` Ranko Zivojnovic
2007-07-12 10:46 ` Jarek Poplawski
2007-07-12 11:47 ` Ranko Zivojnovic
2007-07-12 12:07 ` Patrick McHardy
2007-07-12 17:48 ` Ranko Zivojnovic
2007-07-13 12:17 ` Jarek Poplawski
2007-07-13 12:26 ` Ranko Zivojnovic
2007-07-13 13:42 ` Jarek Poplawski
2007-07-16 7:00 ` Jarek Poplawski
2007-07-16 13:03 ` Patrick McHardy
2007-07-16 17:45 ` Ranko Zivojnovic
2007-07-17 1:28 ` David Miller
2007-07-17 12:04 ` Jarek Poplawski
2007-07-17 12:01 ` Patrick McHardy
2007-07-17 12:28 ` Jarek Poplawski