* [NET]: gen_estimator deadlock fix
@ 2007-07-11 13:41 Ranko Zivojnovic
  2007-07-12  7:37 ` Jarek Poplawski
  0 siblings, 1 reply; 18+ messages in thread
From: Ranko Zivojnovic @ 2007-07-11 13:41 UTC (permalink / raw)
  To: netdev; +Cc: Andrew Morton, Patrick McHardy, Jarek Poplawski

Fixes ABBA deadlock noted by Patrick McHardy <kaber@trash.net>:
> There is at least one ABBA deadlock, est_timer() does:
> read_lock(&est_lock)
> spin_lock(e->stats_lock) (which is dev->queue_lock)
> 
> and qdisc_destroy calls htb_destroy under dev->queue_lock, which
> calls htb_destroy_class, then gen_kill_estimator and this
> write_locks est_lock.
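
A side-by-side sketch of the two lock orders, based on the description
above (not verbatim kernel code):

/*
 * CPU0: est_timer()                 CPU1: holding dev->queue_lock
 * ---------------------------       ------------------------------------
 * read_lock(&est_lock);             qdisc_destroy()
 *                                     -> htb_destroy()
 * spin_lock(e->stats_lock);            -> htb_destroy_class()
 *   (== &dev->queue_lock,               -> gen_kill_estimator()
 *    held by CPU1 -> blocks)               -> write_lock_bh(&est_lock);
 *                                             (held by CPU0 -> blocks)
 *
 * Each CPU holds the lock the other one needs: the ABBA deadlock above.
 */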

est_lock is removed entirely and an RCU list is used instead.
Both gen_kill_estimator and gen_new_estimator are already called under
rtnl_mutex.

Signed-off-by: Ranko Zivojnovic <ranko@spidernet.net>
---
 net/core/gen_estimator.c |   73 ++++++++++++++++++++++++----------------------
 1 files changed, 38 insertions(+), 35 deletions(-)

diff --git a/net/core/gen_estimator.c b/net/core/gen_estimator.c
index cc84d8d..444b2bd 100644
--- a/net/core/gen_estimator.c
+++ b/net/core/gen_estimator.c
@@ -79,7 +79,7 @@

 struct gen_estimator
 {
-       struct gen_estimator    *next;
+       struct list_head        list;
        struct gnet_stats_basic *bstats;
        struct gnet_stats_rate_est      *rate_est;
        spinlock_t              *stats_lock;
@@ -89,26 +89,24 @@ struct gen_estimator
        u32                     last_packets;
        u32                     avpps;
        u32                     avbps;
+       struct rcu_head         e_rcu;
 };

 struct gen_estimator_head
 {
        struct timer_list       timer;
-       struct gen_estimator    *list;
+       struct list_head        list;
 };

 static struct gen_estimator_head elist[EST_MAX_INTERVAL+1];

-/* Estimator array lock */
-static DEFINE_RWLOCK(est_lock);
-
 static void est_timer(unsigned long arg)
 {
        int idx = (int)arg;
        struct gen_estimator *e;

-       read_lock(&est_lock);
-       for (e = elist[idx].list; e; e = e->next) {
+       rcu_read_lock();
+       list_for_each_entry_rcu(e, &elist[idx].list, list) {
                u64 nbytes;
                u32 npackets;
                u32 rate;
@@ -128,9 +126,9 @@ static void est_timer(unsigned long arg)
                spin_unlock(e->stats_lock);
        }

-       if (elist[idx].list != NULL)
+       if (!list_empty(&elist[idx].list))
                mod_timer(&elist[idx].timer, jiffies + ((HZ<<idx)/4));
-       read_unlock(&est_lock);
+       rcu_read_unlock();
 }

 /**
@@ -147,12 +145,15 @@ static void est_timer(unsigned long arg)
  * &rate_est with the statistics lock grabed during this period.
  *
  * Returns 0 on success or a negative error code.
+ *
+ * NOTE: Called under rtnl_mutex
  */
 int gen_new_estimator(struct gnet_stats_basic *bstats,
        struct gnet_stats_rate_est *rate_est, spinlock_t *stats_lock, struct rtattr *opt)
 {
        struct gen_estimator *est;
        struct gnet_estimator *parm = RTA_DATA(opt);
+       int idx;

        if (RTA_PAYLOAD(opt) < sizeof(*parm))
                return -EINVAL;
@@ -164,7 +165,7 @@ int gen_new_estimator(struct gnet_stats_basic *bstats,
        if (est == NULL)
                return -ENOBUFS;

-       est->interval = parm->interval + 2;
+       est->interval = idx = parm->interval + 2;
        est->bstats = bstats;
        est->rate_est = rate_est;
        est->stats_lock = stats_lock;
@@ -174,20 +175,25 @@ int gen_new_estimator(struct gnet_stats_basic *bstats,
        est->last_packets = bstats->packets;
        est->avpps = rate_est->pps<<10;

-       est->next = elist[est->interval].list;
-       if (est->next == NULL) {
-               init_timer(&elist[est->interval].timer);
-               elist[est->interval].timer.data = est->interval;
-               elist[est->interval].timer.expires = jiffies + ((HZ<<est->interval)/4);
-               elist[est->interval].timer.function = est_timer;
-               add_timer(&elist[est->interval].timer);
+       if (!elist[idx].timer.function) {
+               INIT_LIST_HEAD(&elist[idx].list);
+               setup_timer(&elist[idx].timer, est_timer, est->interval);
        }
-       write_lock_bh(&est_lock);
-       elist[est->interval].list = est;
-       write_unlock_bh(&est_lock);
+
+       if (list_empty(&elist[idx].list))
+               mod_timer(&elist[idx].timer, jiffies + ((HZ<<idx)/4));
+
+       list_add_rcu(&est->list, &elist[idx].list);
        return 0;
 }

+static void __gen_kill_estimator(struct rcu_head *head)
+{
+       struct gen_estimator *e = container_of(head,
+                                       struct gen_estimator, e_rcu);
+       kfree(e);
+}
+
 /**
  * gen_kill_estimator - remove a rate estimator
  * @bstats: basic statistics
@@ -195,31 +201,28 @@ int gen_new_estimator(struct gnet_stats_basic *bstats,
  *
  * Removes the rate estimator specified by &bstats and &rate_est
  * and deletes the timer.
+ *
+ * NOTE: Called under rtnl_mutex
  */
 void gen_kill_estimator(struct gnet_stats_basic *bstats,
        struct gnet_stats_rate_est *rate_est)
 {
        int idx;
-       struct gen_estimator *est, **pest;
+       struct gen_estimator *e, *n;

        for (idx=0; idx <= EST_MAX_INTERVAL; idx++) {
-               int killed = 0;
-               pest = &elist[idx].list;
-               while ((est=*pest) != NULL) {
-                       if (est->rate_est != rate_est || est->bstats != bstats) {
-                               pest = &est->next;
-                               continue;
-                       }

-                       write_lock_bh(&est_lock);
-                       *pest = est->next;
-                       write_unlock_bh(&est_lock);
+               /* Skip non initialized indexes */
+               if (!elist[idx].timer.function)
+                       continue;
+
+               list_for_each_entry_safe(e, n, &elist[idx].list, list) {
+                       if (e->rate_est != rate_est || e->bstats != bstats)
+                               continue;

-                       kfree(est);
-                       killed++;
+                       list_del_rcu(&e->list);
+                       call_rcu(&e->e_rcu, __gen_kill_estimator);
                }
-               if (killed && elist[idx].list == NULL)
-                       del_timer(&elist[idx].timer);
        }
 }





* Re: [NET]: gen_estimator deadlock fix
  2007-07-11 13:41 [NET]: gen_estimator deadlock fix Ranko Zivojnovic
@ 2007-07-12  7:37 ` Jarek Poplawski
  2007-07-12  9:18   ` Ranko Zivojnovic
  0 siblings, 1 reply; 18+ messages in thread
From: Jarek Poplawski @ 2007-07-12  7:37 UTC (permalink / raw)
  To: Ranko Zivojnovic; +Cc: netdev, Andrew Morton, Patrick McHardy

On Wed, Jul 11, 2007 at 04:41:37PM +0300, Ranko Zivojnovic wrote:
> Fixes ABBA deadlock noted by Patrick McHardy <kaber@trash.net>:
> > There is at least one ABBA deadlock, est_timer() does:
> > read_lock(&est_lock)
> > spin_lock(e->stats_lock) (which is dev->queue_lock)
> > 
> > and qdisc_destroy calls htb_destroy under dev->queue_lock, which
> > calls htb_destroy_class, then gen_kill_estimator and this
> > write_locks est_lock.
> 
> est_lock is removed entirely and an RCU list is used instead.
> Both gen_kill_estimator and gen_new_estimator are already called under
> rtnl_mutex.
> 
> Signed-off-by: Ranko Zivojnovic <ranko@spidernet.net>

Maybe it's only my issue, but it seems there are no tabs: all spaces...

...plus some old doubts:

> ---
>  net/core/gen_estimator.c |   73 ++++++++++++++++++++++++----------------------
>  1 files changed, 38 insertions(+), 35 deletions(-)
> 
> diff --git a/net/core/gen_estimator.c b/net/core/gen_estimator.c
> index cc84d8d..444b2bd 100644
> --- a/net/core/gen_estimator.c
> +++ b/net/core/gen_estimator.c
...

> @@ -174,20 +175,25 @@ int gen_new_estimator(struct gnet_stats_basic *bstats,
>         est->last_packets = bstats->packets;
>         est->avpps = rate_est->pps<<10;
> 
> -       est->next = elist[est->interval].list;
> -       if (est->next == NULL) {
> -               init_timer(&elist[est->interval].timer);
> -               elist[est->interval].timer.data = est->interval;
> -               elist[est->interval].timer.expires = jiffies + ((HZ<<est->interval)/4);
> -               elist[est->interval].timer.function = est_timer;
> -               add_timer(&elist[est->interval].timer);
> +       if (!elist[idx].timer.function) {
> +               INIT_LIST_HEAD(&elist[idx].list);
> +               setup_timer(&elist[idx].timer, est_timer, est->interval);


 -               setup_timer(&elist[idx].timer, est_timer, est->interval);
 +               setup_timer(&elist[idx].timer, est_timer, idx);

...
>  /**
>   * gen_kill_estimator - remove a rate estimator
>   * @bstats: basic statistics
> @@ -195,31 +201,28 @@ int gen_new_estimator(struct gnet_stats_basic *bstats,
>   *
>   * Removes the rate estimator specified by &bstats and &rate_est
>   * and deletes the timer.
> + *
> + * NOTE: Called under rtnl_mutex
>   */
>  void gen_kill_estimator(struct gnet_stats_basic *bstats,
>         struct gnet_stats_rate_est *rate_est)
>  {
...
> +               list_for_each_entry_safe(e, n, &elist[idx].list, list) {
> +                       if (e->rate_est != rate_est || e->bstats != bstats)
> +                               continue;
> 
> -                       kfree(est);
> -                       killed++;
> +                       list_del_rcu(&e->list);
> +                       call_rcu(&e->e_rcu, __gen_kill_estimator);
>                 }
> -               if (killed && elist[idx].list == NULL)
> -                       del_timer(&elist[idx].timer);
>         }
>  }

I made a bit of a mess last time, so maybe it was forgotten, but I
still think this kind of race is possible:

- gen_kill_estimator is called during qdisc_destroy under
  dev->queue_lock,
- est_timer is running and waiting on this lock just on the
  list entry of the destroyed class,
- gen_kill_estimator kills the entry and returns,
- in xxx_destroy_class kfree(cl) is done etc.,
- est_timer gets the lock and does nbytes = e->bstats->bytes or
  e->rate_est->bps = ... with freed memory.

Regards,
Jarek P.


* Re: [NET]: gen_estimator deadlock fix
  2007-07-12  7:37 ` Jarek Poplawski
@ 2007-07-12  9:18   ` Ranko Zivojnovic
  2007-07-12  9:40     ` Ranko Zivojnovic
  2007-07-12 10:46     ` Jarek Poplawski
  0 siblings, 2 replies; 18+ messages in thread
From: Ranko Zivojnovic @ 2007-07-12  9:18 UTC (permalink / raw)
  To: Jarek Poplawski; +Cc: netdev, Andrew Morton, Patrick McHardy

On Thu, 2007-07-12 at 09:37 +0200, Jarek Poplawski wrote:
> On Wed, Jul 11, 2007 at 04:41:37PM +0300, Ranko Zivojnovic wrote:
> > Signed-off-by: Ranko Zivojnovic <ranko@spidernet.net>
> 
> Maybe it's only my issue, but it seems there are no tabs: all spaces...

Nope - you are right - just noticed my mailer converts tabs into spaces
- sorry about this...

> 
> ...plus some old doubts:
> 

>  -               setup_timer(&elist[idx].timer, est_timer, est->interval);
>  +               setup_timer(&elist[idx].timer, est_timer, idx);
> 

I left this because setup_timer expects unsigned long.
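
(For reference, setup_timer() in kernels of that era was roughly the
following, so an int argument is silently converted to unsigned long
and no warning is expected either way:)

static inline void setup_timer(struct timer_list *timer,
			       void (*function)(unsigned long),
			       unsigned long data)
{
	timer->function = function;	/* handler called on expiry */
	timer->data = data;		/* passed as the handler's argument */
	init_timer(timer);
}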

> ...
> >  /**
> >   * gen_kill_estimator - remove a rate estimator
> >   * @bstats: basic statistics
> > @@ -195,31 +201,28 @@ int gen_new_estimator(struct gnet_stats_basic *bstats,
> >   *
> >   * Removes the rate estimator specified by &bstats and &rate_est
> >   * and deletes the timer.
> > + *
> > + * NOTE: Called under rtnl_mutex
> >   */
> >  void gen_kill_estimator(struct gnet_stats_basic *bstats,
> >         struct gnet_stats_rate_est *rate_est)
> >  {
> ...
> > +               list_for_each_entry_safe(e, n, &elist[idx].list, list) {
> > +                       if (e->rate_est != rate_est || e->bstats != bstats)
> > +                               continue;
> > 
> > -                       kfree(est);
> > -                       killed++;
> > +                       list_del_rcu(&e->list);
> > +                       call_rcu(&e->e_rcu, __gen_kill_estimator);
> >                 }
> > -               if (killed && elist[idx].list == NULL)
> > -                       del_timer(&elist[idx].timer);
> >         }
> >  }
> 
> I made a bit of a mess last time, so maybe it was forgotten, but I
> still think this kind of race is possible:
> 
> - gen_kill_estimator is called during qdisc_destroy under
>   dev->queue_lock,
> - est_timer is running and waiting on this lock just on the
>   list entry of the destroyed class,
> - gen_kill_estimator kills the entry and returns,
> - in xxx_destroy_class kfree(cl) is done etc.,
> - est_timer gets the lock and does nbytes = e->bstats->bytes or
>   e->rate_est->bps = ... with freed memory.

Unfortunately I cannot say for sure that _currently_ anything really bad
can happen here... someone with better knowledge will have to confirm
this - Patrick?

Here's why - the only updates done in est_timer are to the structure
pointed to by 'e'. Yes - it is possible that the values it refers to
could be random (freed memory), _however_ the 'e' structure itself is
still valid, as it is _not_ kfree()'d directly from gen_kill_estimator()
- i.e. it will not be kfree()'d until rcu_read_unlock() is called -
which is outside the loop.
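
In other words, the lifetimes in question look roughly like this
(hypothetical class layout, along the lines of the classful qdiscs):

struct xxx_class {		/* hypothetical, for illustration */
	struct gnet_stats_basic		bstats;   /* e->bstats points here   */
	struct gnet_stats_rate_est	rate_est; /* e->rate_est points here */
	/* ... */
};

/*
 * gen_kill_estimator() defers kfree(e) via call_rcu(), so 'e' itself
 * stays valid for the whole rcu_read_lock() section in est_timer();
 * but xxx_destroy_class() does a plain kfree(cl) right away, so
 * e->bstats and e->rate_est may already point into freed memory by
 * the time est_timer() takes e->stats_lock and reads through them.
 */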

I don't mind fixing all the classful qdiscs to use call_rcu() to
release their class structures, for consistency ... in fact ... that is
exactly what I will do in order to avoid any potential future mishaps.
Someone may decide in the future to add a callback to a user-defined
function that updates some qdisc/class-specific rates, and given this
inconsistency in handling qdiscs vs classes - it could backfire nastily.

Will have a new patch soon.

Regards,

Ranko



* Re: [NET]: gen_estimator deadlock fix
  2007-07-12  9:18   ` Ranko Zivojnovic
@ 2007-07-12  9:40     ` Ranko Zivojnovic
  2007-07-12 10:46     ` Jarek Poplawski
  1 sibling, 0 replies; 18+ messages in thread
From: Ranko Zivojnovic @ 2007-07-12  9:40 UTC (permalink / raw)
  To: Jarek Poplawski; +Cc: netdev, Andrew Morton, Patrick McHardy

On Thu, 2007-07-12 at 12:18 +0300, Ranko Zivojnovic wrote:
> > 
> > I made a bit of a mess last time, so maybe it was forgotten, but I
> > still think this kind of race is possible:
> > 
> > - gen_kill_estimator is called during qdisc_destroy under
> >   dev->queue_lock,
> > - est_timer is running and waiting on this lock just on the
> >   list entry of the destroyed class,
> > - gen_kill_estimator kills the entry and returns,
> > - in xxx_destroy_class kfree(cl) is done etc.,
> > - est_timer gets the lock and does nbytes = e->bstats->bytes or
> >   e->rate_est->bps = ... with freed memory.
> 
> Unfortunately I cannot say for sure that _currently_ anything really bad
> can happen here... someone with better knowledge will have to confirm
> this - Patrick?
> 

Oops - just saw it - and you even mention it up there: 

e->rate_est->bps = ...
          ^^^^
>
> Will have a new patch soon.
>

R.



* Re: [NET]: gen_estimator deadlock fix
  2007-07-12  9:18   ` Ranko Zivojnovic
  2007-07-12  9:40     ` Ranko Zivojnovic
@ 2007-07-12 10:46     ` Jarek Poplawski
  2007-07-12 11:47       ` Ranko Zivojnovic
  1 sibling, 1 reply; 18+ messages in thread
From: Jarek Poplawski @ 2007-07-12 10:46 UTC (permalink / raw)
  To: Ranko Zivojnovic; +Cc: netdev, Andrew Morton, Patrick McHardy

On Thu, Jul 12, 2007 at 12:18:23PM +0300, Ranko Zivojnovic wrote:
> On Thu, 2007-07-12 at 09:37 +0200, Jarek Poplawski wrote:
> > On Wed, Jul 11, 2007 at 04:41:37PM +0300, Ranko Zivojnovic wrote:
> > > Signed-off-by: Ranko Zivojnovic <ranko@spidernet.net>
> > 
> > Maybe it's only my issue, but it seems there are no tabs: all spaces...
> 
> Nope - you are right - just noticed my mailer converts tabs into spaces
> - sorry about this...
> 
> > 
> > ...plus some old doubts:
> > 
> 
> >  -               setup_timer(&elist[idx].timer, est_timer, est->interval);
> >  +               setup_timer(&elist[idx].timer, est_timer, idx);
> > 
> 
> I left this because setup_timer expects unsigned long.

Are there any warnings? It'll look strange without such a comment.
I didn't check this, but maybe idx should be unsigned too?

> 
> > ...
> > >  /**
> > >   * gen_kill_estimator - remove a rate estimator
> > >   * @bstats: basic statistics
> > > @@ -195,31 +201,28 @@ int gen_new_estimator(struct gnet_stats_basic *bstats,
> > >   *
> > >   * Removes the rate estimator specified by &bstats and &rate_est
> > >   * and deletes the timer.
> > > + *
> > > + * NOTE: Called under rtnl_mutex
> > >   */
> > >  void gen_kill_estimator(struct gnet_stats_basic *bstats,
> > >         struct gnet_stats_rate_est *rate_est)
> > >  {
> > ...
> > > +               list_for_each_entry_safe(e, n, &elist[idx].list, list) {
> > > +                       if (e->rate_est != rate_est || e->bstats != bstats)
> > > +                               continue;
> > > 
> > > -                       kfree(est);
> > > -                       killed++;
> > > +                       list_del_rcu(&e->list);
> > > +                       call_rcu(&e->e_rcu, __gen_kill_estimator);
> > >                 }
> > > -               if (killed && elist[idx].list == NULL)
> > > -                       del_timer(&elist[idx].timer);
> > >         }
> > >  }
> > 
> > I made a bit of a mess last time, so maybe it was forgotten, but I
> > still think this kind of race is possible:
> > 
> > - gen_kill_estimator is called during qdisc_destroy under
> >   dev->queue_lock,
> > - est_timer is running and waiting on this lock just on the
> >   list entry of the destroyed class,
> > - gen_kill_estimator kills the entry and returns,
> > - in xxx_destroy_class kfree(cl) is done etc.,
> > - est_timer gets the lock and does nbytes = e->bstats->bytes or
> >   e->rate_est->bps = ... with freed memory.
...
> I don't mind fixing all the classful qdiscs to call_rcu() to release
> their class structures for consistency purposes ... in fact ... that is
> exactly what I will do in order to avoid any potential future mishaps.
> One may actually decide in the future to add a callback to a user
> defined function to update some qdisc/class specific rates and given
> this inconsistency in handling qdiscs vs classes - it could have a nasty
> backfire.

I don't know if such broad changes are acceptable, or if it's even
required that these structs belong to a class struct.

IMHO, unless I'm missing something, they could be included in the
gen_estimator struct after some API change. BTW, maybe it would be
reasonable to return a pointer to such a gen_estimator from
gen_new_estimator; then lookups could be avoided in
gen_kill_estimator.
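
Roughly, illustrative prototypes only (the patch later in this thread
ends up doing essentially this, via a struct gen_estimator **
out-parameter):

struct gen_estimator *gen_new_estimator(struct rtattr *opt);
void gen_kill_estimator(struct gen_estimator *est);	/* no list lookup */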

Alas, there is probably more (of course very improbable): there
is no dev_hold for gen_estimator now, so I hope it'll always manage
to unlock in time - before dev is freed.

Jarek P.


* Re: [NET]: gen_estimator deadlock fix
  2007-07-12 10:46     ` Jarek Poplawski
@ 2007-07-12 11:47       ` Ranko Zivojnovic
  2007-07-12 12:07         ` Patrick McHardy
  0 siblings, 1 reply; 18+ messages in thread
From: Ranko Zivojnovic @ 2007-07-12 11:47 UTC (permalink / raw)
  To: Jarek Poplawski; +Cc: netdev, Andrew Morton, Patrick McHardy

On Thu, 2007-07-12 at 12:46 +0200, Jarek Poplawski wrote:
> On Thu, Jul 12, 2007 at 12:18:23PM +0300, Ranko Zivojnovic wrote:
> > On Thu, 2007-07-12 at 09:37 +0200, Jarek Poplawski wrote:
> > >  -               setup_timer(&elist[idx].timer, est_timer, est->interval);
> > >  +               setup_timer(&elist[idx].timer, est_timer, idx);
> > 
> > I left this because setup_timer expects unsigned long.
> 
> Are there any warnings? It'll look strange without such a comment.
> I didn't check this, but maybe idx should be unsigned too?
> 

I'll make idx unsigned.

> > 
> > > - gen_kill_estimator is called during qdisc_destroy under
> > >   dev->queue_lock,
> > > - est_timer is running and waiting on this lock just on the
> > >   list entry of the destroyed class,
> > > - gen_kill_estimator kills the entry and returns,
> > > - in xxx_destroy_class kfree(cl) is done etc.,
> > > - est_timer gets the lock and does nbytes = e->bstats->bytes or
> > >   e->rate_est->bps = ... with freed memory.
> ...
> > I don't mind fixing all the classful qdiscs to call_rcu() to release
> > their class structures for consistency purposes ... in fact ... that is
> > exactly what I will do in order to avoid any potential future mishaps.
> > One may actually decide in the future to add a callback to a user
> > defined function to update some qdisc/class specific rates and given
> > this inconsistency in handling qdiscs vs classes - it could have a nasty
> > backfire.
> 
> I don't know if such broad changes are acceptable, or if it's even
> required that these structs belong to a class struct.
> 
> IMHO, unless I'm missing something, they could be included in the
> gen_estimator struct after some API change. BTW, maybe it would be
> reasonable to return a pointer to such a gen_estimator from
> gen_new_estimator; then lookups could be avoided in
> gen_kill_estimator.
> 
> Alas, there is probably more (of course very improbable): there
> is no dev_hold for gen_estimator now, so I hope it'll always manage
> to unlock in time - before dev is freed.
> 

I agree - it does look like the most sensible thing to do - have
gnet_stats_basic and gnet_stats_rate_est allocated within the
gen_estimator struct rather than as pointers pointing here and there -
and provide an API to maintain those stats - it simplifies the picture.

Also - the stats_lock in this case could be local to the gen_estimator
struct, making the implementation completely "dev agnostic" - so it
will not break on dev removal.

All this, however, will require a rather intrusive "surgical procedure"
to fix all the references.

I don't mind implementing and testing it - given there's consensus from
maintainers on the "correctness".

Waiting for feedback.

R.




* Re: [NET]: gen_estimator deadlock fix
  2007-07-12 11:47       ` Ranko Zivojnovic
@ 2007-07-12 12:07         ` Patrick McHardy
  2007-07-12 17:48           ` Ranko Zivojnovic
  0 siblings, 1 reply; 18+ messages in thread
From: Patrick McHardy @ 2007-07-12 12:07 UTC (permalink / raw)
  To: Ranko Zivojnovic; +Cc: Jarek Poplawski, netdev

[Removed Andrew from CC]

Ranko Zivojnovic wrote:
> I agree - it does look like the most sensible thing to do - have
> gnet_stats_basic and gnet_stats_rate_est allocated within the
> gen_estimator struct rather than pointers looking here and there - and
> provide api to maintain those stats - it simplifies the picture.


The API is not very pretty, some improvement there would be welcome.

> Also - the stats_lock in this case could be local to gen_estimator
> struct, thus making the implementation completely "dev agnostic" - and
> will not break on dev removal.


The queue lock is used since that's what protects the qdisc counters.


* Re: [NET]: gen_estimator deadlock fix
  2007-07-12 12:07         ` Patrick McHardy
@ 2007-07-12 17:48           ` Ranko Zivojnovic
  2007-07-13 12:17             ` Jarek Poplawski
  0 siblings, 1 reply; 18+ messages in thread
From: Ranko Zivojnovic @ 2007-07-12 17:48 UTC (permalink / raw)
  To: Patrick McHardy; +Cc: Jarek Poplawski, netdev

[-- Attachment #1: Type: text/plain, Size: 1255 bytes --]

On Thu, 2007-07-12 at 14:07 +0200, Patrick McHardy wrote:
> [Removed Andrew from CC]
> 
> Ranko Zivojnovic wrote:
> > I agree - it does look like the most sensible thing to do - have
> > gnet_stats_basic and gnet_stats_rate_est allocated within the
> > gen_estimator struct rather than pointers looking here and there - and
> > provide api to maintain those stats - it simplifies the picture.
> 
> 
> The API is not very pretty, some improvement there would be welcome.
> 
> > Also - the stats_lock in this case could be local to gen_estimator
> > struct, thus making the implementation completely "dev agnostic" - and
> > will not break on dev removal.
> 
> 
> The queue lock is used since that's what protects the qdisc counters.

Ok - here's the patch for review - it compiles clean ... and that's as
much as it has been tested - I'll try to give it a run later today or
definitely tomorrow.

'bstats' and 'rate_est' are now embedded in gen_estimator, as is the
spin_lock that protects them.

Two new inline functions are introduced, gen_bstat_add() and
gen_bstat_skb(), for maintaining the bstats within the gen_estimator
structure.

Note that gen_bstat_skb() also includes the TSO check and effectively
propagates TSO support to all schedulers.
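
That is, the open-coded pattern that so far only sch_htb used in its
enqueue path collapses into the helper (compare the sch_htb hunk below):

	/* before - TSO-aware accounting open-coded in sch_htb only: */
	sch->bstats.packets += skb_is_gso(skb) ? skb_shinfo(skb)->gso_segs : 1;
	sch->bstats.bytes += skb->len;

	/* after - every caller gets the TSO-aware accounting: */
	gen_bstat_skb(sch->ge, skb);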

R.

[-- Attachment #2: t --]
[-- Type: text/plain, Size: 32294 bytes --]

 include/net/act_api.h     |    6 +-
 include/net/gen_stats.h   |   42 ++++++++++--
 include/net/sch_generic.h |    7 +-
 net/core/gen_estimator.c  |  159 +++++++++++++++++++--------------------------
 net/sched/act_api.c       |   23 ++++--
 net/sched/act_police.c    |   34 +++++-----
 net/sched/sch_api.c       |   16 ++--
 net/sched/sch_atm.c       |    3 +-
 net/sched/sch_cbq.c       |   39 ++++++------
 net/sched/sch_dsmark.c    |    3 +-
 net/sched/sch_generic.c   |    2 +-
 net/sched/sch_hfsc.c      |   34 +++++-----
 net/sched/sch_htb.c       |   36 +++++------
 net/sched/sch_ingress.c   |    9 +--
 net/sched/sch_netem.c     |    6 +-
 net/sched/sch_prio.c      |    5 +-
 net/sched/sch_red.c       |    3 +-
 net/sched/sch_sfq.c       |    3 +-
 net/sched/sch_tbf.c       |    3 +-
 net/sched/sch_teql.c      |    3 +-
 20 files changed, 214 insertions(+), 222 deletions(-)

diff --git a/include/net/act_api.h b/include/net/act_api.h
index 2f0273f..f5e0f05 100644
--- a/include/net/act_api.h
+++ b/include/net/act_api.h
@@ -16,9 +16,9 @@ struct tcf_common {
 	u32				tcfc_capab;
 	int				tcfc_action;
 	struct tcf_t			tcfc_tm;
+	struct gen_estimator		*tcfc_ge;
 	struct gnet_stats_basic		tcfc_bstats;
 	struct gnet_stats_queue		tcfc_qstats;
-	struct gnet_stats_rate_est	tcfc_rate_est;
 	spinlock_t			tcfc_lock;
 };
 #define tcf_next	common.tcfc_next
@@ -29,8 +29,10 @@ struct tcf_common {
 #define tcf_action	common.tcfc_action
 #define tcf_tm		common.tcfc_tm
 #define tcf_bstats	common.tcfc_bstats
+#define tcf_ge		common.tcfc_ge
+#define tcf_ge_bstats	common.tcfc_ge->bstats
+#define tcf_ge_rate_est	common.tcfc_ge->rate_est
 #define tcf_qstats	common.tcfc_qstats
-#define tcf_rate_est	common.tcfc_rate_est
 #define tcf_lock	common.tcfc_lock
 
 struct tcf_police {
diff --git a/include/net/gen_stats.h b/include/net/gen_stats.h
index 0b95cf0..1103ec8 100644
--- a/include/net/gen_stats.h
+++ b/include/net/gen_stats.h
@@ -20,6 +20,21 @@ struct gnet_dump
 	struct tc_stats   tc_stats;
 };
 
+struct gen_estimator
+{
+	struct list_head	list;
+	struct gnet_stats_basic	bstats;
+	struct gnet_stats_rate_est	rate_est;
+	spinlock_t		stats_lock; /* Protects the stats updates */
+	unsigned		interval;
+	int			ewma_log;
+	u64			last_bytes;
+	u32			last_packets;
+	u32			avpps;
+	u32			avbps;
+	struct rcu_head		e_rcu;
+};
+
 extern int gnet_stats_start_copy(struct sk_buff *skb, int type,
 				 spinlock_t *lock, struct gnet_dump *d);
 
@@ -37,13 +52,24 @@ extern int gnet_stats_copy_app(struct gnet_dump *d, void *st, int len);
 
 extern int gnet_stats_finish_copy(struct gnet_dump *d);
 
-extern int gen_new_estimator(struct gnet_stats_basic *bstats,
-			     struct gnet_stats_rate_est *rate_est,
-			     spinlock_t *stats_lock, struct rtattr *opt);
-extern void gen_kill_estimator(struct gnet_stats_basic *bstats,
-			       struct gnet_stats_rate_est *rate_est);
-extern int gen_replace_estimator(struct gnet_stats_basic *bstats,
-				 struct gnet_stats_rate_est *rate_est,
-				 spinlock_t *stats_lock, struct rtattr *opt);
+extern int gen_new_estimator(struct rtattr *opt, struct gen_estimator **ep);
+extern void gen_kill_estimator(struct gen_estimator *ep);
+extern int gen_replace_estimator(struct rtattr *opt, struct gen_estimator **ep);
+
+static inline void gen_bstat_add(struct gen_estimator *e,
+				  int bytes,
+				  int packets)
+{
+	spin_lock(&e->stats_lock);
+	e->bstats.bytes += bytes;
+	e->bstats.packets += packets;
+	spin_unlock(&e->stats_lock);
+}
+
+static inline void gen_bstat_skb(struct gen_estimator *e,
+				  struct sk_buff *skb)
+{
+	gen_bstat_add(e, skb->len, skb_is_gso(skb)?skb_shinfo(skb)->gso_segs:1);
+}
 
 #endif
diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h
index 1b8e351..ff13671 100644
--- a/include/net/sch_generic.h
+++ b/include/net/sch_generic.h
@@ -40,9 +40,9 @@ struct Qdisc
 	struct net_device	*dev;
 	struct list_head	list;
 
-	struct gnet_stats_basic	bstats;
+	struct gen_estimator	*ge;
+
 	struct gnet_stats_queue	qstats;
-	struct gnet_stats_rate_est	rate_est;
 	spinlock_t		*stats_lock;
 	struct rcu_head 	q_rcu;
 	int			(*reshape_fail)(struct sk_buff *skb,
@@ -185,8 +185,7 @@ static inline int __qdisc_enqueue_tail(struct sk_buff *skb, struct Qdisc *sch,
 {
 	__skb_queue_tail(list, skb);
 	sch->qstats.backlog += skb->len;
-	sch->bstats.bytes += skb->len;
-	sch->bstats.packets++;
+	gen_bstat_skb(sch->ge, skb);
 
 	return NET_XMIT_SUCCESS;
 }
diff --git a/net/core/gen_estimator.c b/net/core/gen_estimator.c
index cc84d8d..502deb3 100644
--- a/net/core/gen_estimator.c
+++ b/net/core/gen_estimator.c
@@ -77,82 +77,65 @@
 
 #define EST_MAX_INTERVAL	5
 
-struct gen_estimator
-{
-	struct gen_estimator	*next;
-	struct gnet_stats_basic	*bstats;
-	struct gnet_stats_rate_est	*rate_est;
-	spinlock_t		*stats_lock;
-	unsigned		interval;
-	int			ewma_log;
-	u64			last_bytes;
-	u32			last_packets;
-	u32			avpps;
-	u32			avbps;
-};
-
 struct gen_estimator_head
 {
 	struct timer_list	timer;
-	struct gen_estimator	*list;
+	struct list_head	list;
 };
 
 static struct gen_estimator_head elist[EST_MAX_INTERVAL+1];
 
-/* Estimator array lock */
-static DEFINE_RWLOCK(est_lock);
-
 static void est_timer(unsigned long arg)
 {
 	int idx = (int)arg;
 	struct gen_estimator *e;
 
-	read_lock(&est_lock);
-	for (e = elist[idx].list; e; e = e->next) {
+	rcu_read_lock();
+	list_for_each_entry_rcu(e, &elist[idx].list, list) {
 		u64 nbytes;
 		u32 npackets;
 		u32 rate;
 
-		spin_lock(e->stats_lock);
-		nbytes = e->bstats->bytes;
-		npackets = e->bstats->packets;
+		spin_lock(&e->stats_lock);
+		nbytes = e->bstats.bytes;
+		npackets = e->bstats.packets;
 		rate = (nbytes - e->last_bytes)<<(7 - idx);
 		e->last_bytes = nbytes;
 		e->avbps += ((long)rate - (long)e->avbps) >> e->ewma_log;
-		e->rate_est->bps = (e->avbps+0xF)>>5;
+		e->rate_est.bps = (e->avbps+0xF)>>5;
 
 		rate = (npackets - e->last_packets)<<(12 - idx);
 		e->last_packets = npackets;
 		e->avpps += ((long)rate - (long)e->avpps) >> e->ewma_log;
-		e->rate_est->pps = (e->avpps+0x1FF)>>10;
-		spin_unlock(e->stats_lock);
+		e->rate_est.pps = (e->avpps+0x1FF)>>10;
+		spin_unlock(&e->stats_lock);
 	}
 
-	if (elist[idx].list != NULL)
+	if (!list_empty(&elist[idx].list))
 		mod_timer(&elist[idx].timer, jiffies + ((HZ<<idx)/4));
-	read_unlock(&est_lock);
+	rcu_read_unlock();
 }
 
 /**
  * gen_new_estimator - create a new rate estimator
- * @bstats: basic statistics
- * @rate_est: rate estimator statistics
- * @stats_lock: statistics lock
  * @opt: rate estimator configuration TLV
+ * @ep: new gen_estimator struct is returned through this pointer on success
  *
- * Creates a new rate estimator with &bstats as source and &rate_est
- * as destination. A new timer with the interval specified in the
+ * Creates a new rate estimator with bstats as source and rate_est
+ * as destination. A timer with the interval specified in the
  * configuration TLV is created. Upon each interval, the latest statistics
- * will be read from &bstats and the estimated rate will be stored in
- * &rate_est with the statistics lock grabed during this period.
+ * will be read from bstats and the estimated rate will be stored in
+ * rate_est with the statistics lock grabed during this period.
  *
  * Returns 0 on success or a negative error code.
+ *
+ * NOTE: Called under rtnl_mutex
  */
-int gen_new_estimator(struct gnet_stats_basic *bstats,
-	struct gnet_stats_rate_est *rate_est, spinlock_t *stats_lock, struct rtattr *opt)
+int gen_new_estimator(struct rtattr *opt, struct gen_estimator **ep)
 {
 	struct gen_estimator *est;
 	struct gnet_estimator *parm = RTA_DATA(opt);
+	int idx;
 
 	if (RTA_PAYLOAD(opt) < sizeof(*parm))
 		return -EINVAL;
@@ -164,84 +147,74 @@ int gen_new_estimator(struct gnet_stats_basic *bstats,
 	if (est == NULL)
 		return -ENOBUFS;
 
-	est->interval = parm->interval + 2;
-	est->bstats = bstats;
-	est->rate_est = rate_est;
-	est->stats_lock = stats_lock;
+	spin_lock_init(&est->stats_lock);
+	est->interval = idx = parm->interval + 2;
 	est->ewma_log = parm->ewma_log;
-	est->last_bytes = bstats->bytes;
-	est->avbps = rate_est->bps<<5;
-	est->last_packets = bstats->packets;
-	est->avpps = rate_est->pps<<10;
-
-	est->next = elist[est->interval].list;
-	if (est->next == NULL) {
-		init_timer(&elist[est->interval].timer);
-		elist[est->interval].timer.data = est->interval;
-		elist[est->interval].timer.expires = jiffies + ((HZ<<est->interval)/4);
-		elist[est->interval].timer.function = est_timer;
-		add_timer(&elist[est->interval].timer);
+
+	if (*ep != NULL) {
+		est->last_bytes = (*ep)->bstats.bytes;
+		est->avbps = (*ep)->rate_est.bps<<5;
+		est->last_packets = (*ep)->bstats.packets;
+		est->avpps = (*ep)->rate_est.pps<<10;
+	}
+
+	if (!elist[idx].timer.function) {
+		INIT_LIST_HEAD(&elist[idx].list);
+		setup_timer(&elist[idx].timer, est_timer, est->interval);
 	}
-	write_lock_bh(&est_lock);
-	elist[est->interval].list = est;
-	write_unlock_bh(&est_lock);
+
+	if (list_empty(&elist[idx].list))
+		mod_timer(&elist[idx].timer, jiffies + ((HZ<<idx)/4));
+
+	list_add_rcu(&est->list, &elist[idx].list);
+
+	*ep = est;
+
 	return 0;
 }
 
+static void __gen_kill_estimator(struct rcu_head *head)
+{
+	struct gen_estimator *e = container_of(head,
+					struct gen_estimator, e_rcu);
+	kfree(e);
+}
+
 /**
  * gen_kill_estimator - remove a rate estimator
- * @bstats: basic statistics
- * @rate_est: rate estimator statistics
+ * @ep: rate estimator to kill
+ *
+ * Removes the rate estimator specified
  *
- * Removes the rate estimator specified by &bstats and &rate_est
- * and deletes the timer.
+ * NOTE: Called under rtnl_mutex
  */
-void gen_kill_estimator(struct gnet_stats_basic *bstats,
-	struct gnet_stats_rate_est *rate_est)
+void gen_kill_estimator(struct gen_estimator *e)
 {
-	int idx;
-	struct gen_estimator *est, **pest;
-
-	for (idx=0; idx <= EST_MAX_INTERVAL; idx++) {
-		int killed = 0;
-		pest = &elist[idx].list;
-		while ((est=*pest) != NULL) {
-			if (est->rate_est != rate_est || est->bstats != bstats) {
-				pest = &est->next;
-				continue;
-			}
-
-			write_lock_bh(&est_lock);
-			*pest = est->next;
-			write_unlock_bh(&est_lock);
-
-			kfree(est);
-			killed++;
-		}
-		if (killed && elist[idx].list == NULL)
-			del_timer(&elist[idx].timer);
-	}
+	list_del_rcu(&e->list);
+	call_rcu(&e->e_rcu, __gen_kill_estimator);
 }
 
 /**
  * gen_replace_estimator - replace rate estimator configruation
- * @bstats: basic statistics
- * @rate_est: rate estimator statistics
- * @stats_lock: statistics lock
  * @opt: rate estimator configuration TLV
+ * @ep: new gen_estimator struct is returned through this pointer on success
  *
  * Replaces the configuration of a rate estimator by calling
  * gen_kill_estimator() and gen_new_estimator().
  *
  * Returns 0 on success or a negative error code.
  */
-int
-gen_replace_estimator(struct gnet_stats_basic *bstats,
-	struct gnet_stats_rate_est *rate_est, spinlock_t *stats_lock,
-	struct rtattr *opt)
+int gen_replace_estimator(struct rtattr *opt, struct gen_estimator **ep)
 {
-    gen_kill_estimator(bstats, rate_est);
-    return gen_new_estimator(bstats, rate_est, stats_lock, opt);
+	struct gen_estimator *old = *ep;
+	int err;
+
+	err = gen_new_estimator(opt, ep);
+
+	if (!err)
+		gen_kill_estimator(old);
+
+	return err;
 }
 
 
diff --git a/net/sched/act_api.c b/net/sched/act_api.c
index feef366..bf94944 100644
--- a/net/sched/act_api.c
+++ b/net/sched/act_api.c
@@ -32,8 +32,7 @@ void tcf_hash_destroy(struct tcf_common *p, struct tcf_hashinfo *hinfo)
 			write_lock_bh(hinfo->lock);
 			*p1p = p->tcfc_next;
 			write_unlock_bh(hinfo->lock);
-			gen_kill_estimator(&p->tcfc_bstats,
-					   &p->tcfc_rate_est);
+			gen_kill_estimator(p->tcfc_ge);
 			kfree(p);
 			return;
 		}
@@ -209,7 +208,13 @@ struct tcf_common *tcf_hash_check(u32 index, struct tc_action *a, int bind,
 }
 EXPORT_SYMBOL(tcf_hash_check);
 
-struct tcf_common *tcf_hash_create(u32 index, struct rtattr *est, struct tc_action *a, int size, int bind, u32 *idx_gen, struct tcf_hashinfo *hinfo)
+struct tcf_common *tcf_hash_create(u32 index,
+				   struct rtattr *est,
+				   struct tc_action *a,
+				   int size,
+				   int bind,
+				   u32 *idx_gen,
+				   struct tcf_hashinfo *hinfo)
 {
 	struct tcf_common *p = kzalloc(size, GFP_KERNEL);
 
@@ -223,9 +228,11 @@ struct tcf_common *tcf_hash_create(u32 index, struct rtattr *est, struct tc_acti
 	p->tcfc_index = index ? index : tcf_hash_new_index(idx_gen, hinfo);
 	p->tcfc_tm.install = jiffies;
 	p->tcfc_tm.lastuse = jiffies;
-	if (est)
-		gen_new_estimator(&p->tcfc_bstats, &p->tcfc_rate_est,
-				  &p->tcfc_lock, est);
+	if (est) {
+		gen_new_estimator(est, &p->tcfc_ge);
+		/* XXX TO-DO: verify successful return from
+		   gen_new_estimator()! */
+	}
 	a->priv = (void *) p;
 	return p;
 }
@@ -598,8 +605,8 @@ int tcf_action_copy_stats(struct sk_buff *skb, struct tc_action *a,
 		if (a->ops->get_stats(skb, a) < 0)
 			goto errout;
 
-	if (gnet_stats_copy_basic(&d, &h->tcf_bstats) < 0 ||
-	    gnet_stats_copy_rate_est(&d, &h->tcf_rate_est) < 0 ||
+	if (gnet_stats_copy_basic(&d, &h->tcf_ge_bstats) < 0 ||
+	    gnet_stats_copy_rate_est(&d, &h->tcf_ge_rate_est) < 0 ||
 	    gnet_stats_copy_queue(&d, &h->tcf_qstats) < 0)
 		goto errout;
 
diff --git a/net/sched/act_police.c b/net/sched/act_police.c
index d204038..2a3e1d5 100644
--- a/net/sched/act_police.c
+++ b/net/sched/act_police.c
@@ -108,8 +108,7 @@ void tcf_police_destroy(struct tcf_police *p)
 			write_lock_bh(&police_lock);
 			*p1p = p->tcf_next;
 			write_unlock_bh(&police_lock);
-			gen_kill_estimator(&p->tcf_bstats,
-					   &p->tcf_rate_est);
+			gen_kill_estimator(p->tcf_ge);
 			if (p->tcfp_R_tab)
 				qdisc_put_rtab(p->tcfp_R_tab);
 			if (p->tcfp_P_tab)
@@ -217,10 +216,11 @@ override:
 	if (tb[TCA_POLICE_AVRATE-1])
 		police->tcfp_ewma_rate =
 			*(u32*)RTA_DATA(tb[TCA_POLICE_AVRATE-1]);
-	if (est)
-		gen_replace_estimator(&police->tcf_bstats,
-				      &police->tcf_rate_est,
-				      &police->tcf_lock, est);
+	if (est) {
+		gen_replace_estimator(est, &police->tcf_ge);
+		/* XXX TO-DO: verify successful return from
+		   gen_replace_estimator()! */
+	}
 
 	spin_unlock_bh(&police->tcf_lock);
 	if (ret != ACT_P_CREATED)
@@ -263,11 +263,10 @@ static int tcf_act_police(struct sk_buff *skb, struct tc_action *a,
 
 	spin_lock(&police->tcf_lock);
 
-	police->tcf_bstats.bytes += skb->len;
-	police->tcf_bstats.packets++;
+	gen_bstat_skb(police->tcf_ge, skb);
 
 	if (police->tcfp_ewma_rate &&
-	    police->tcf_rate_est.bps >= police->tcfp_ewma_rate) {
+	    police->tcf_ge_rate_est.bps >= police->tcfp_ewma_rate) {
 		police->tcf_qstats.overlimits++;
 		spin_unlock(&police->tcf_lock);
 		return police->tcf_action;
@@ -476,9 +475,11 @@ struct tcf_police *tcf_police_locate(struct rtattr *rta, struct rtattr *est)
 	police->tcf_index = parm->index ? parm->index :
 		tcf_police_new_index();
 	police->tcf_action = parm->action;
-	if (est)
-		gen_new_estimator(&police->tcf_bstats, &police->tcf_rate_est,
-				  &police->tcf_lock, est);
+	if (est) {
+		gen_new_estimator(est, &police->tcf_ge);
+		/* XXX TO-DO: verify successful return from
+		   gen_new_estimator()! */
+	}
 	h = tcf_hash(police->tcf_index, POL_TAB_MASK);
 	write_lock_bh(&police_lock);
 	police->tcf_next = tcf_police_ht[h];
@@ -501,11 +502,10 @@ int tcf_police(struct sk_buff *skb, struct tcf_police *police)
 
 	spin_lock(&police->tcf_lock);
 
-	police->tcf_bstats.bytes += skb->len;
-	police->tcf_bstats.packets++;
+	gen_bstat_skb(police->tcf_ge, skb);
 
 	if (police->tcfp_ewma_rate &&
-	    police->tcf_rate_est.bps >= police->tcfp_ewma_rate) {
+	    police->tcf_ge_rate_est.bps >= police->tcfp_ewma_rate) {
 		police->tcf_qstats.overlimits++;
 		spin_unlock(&police->tcf_lock);
 		return police->tcf_action;
@@ -583,8 +583,8 @@ int tcf_police_dump_stats(struct sk_buff *skb, struct tcf_police *police)
 					 &d) < 0)
 		goto errout;
 
-	if (gnet_stats_copy_basic(&d, &police->tcf_bstats) < 0 ||
-	    gnet_stats_copy_rate_est(&d, &police->tcf_rate_est) < 0 ||
+	if (gnet_stats_copy_basic(&d, &police->tcf_ge_bstats) < 0 ||
+	    gnet_stats_copy_rate_est(&d, &police->tcf_ge_rate_est) < 0 ||
 	    gnet_stats_copy_queue(&d, &police->tcf_qstats) < 0)
 		goto errout;
 
diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c
index d92ea26..dbcc846 100644
--- a/net/sched/sch_api.c
+++ b/net/sched/sch_api.c
@@ -504,9 +504,7 @@ qdisc_create(struct net_device *dev, u32 handle, struct rtattr **tca, int *errp)
 
 	if (!ops->init || (err = ops->init(sch, tca[TCA_OPTIONS-1])) == 0) {
 		if (tca[TCA_RATE-1]) {
-			err = gen_new_estimator(&sch->bstats, &sch->rate_est,
-						sch->stats_lock,
-						tca[TCA_RATE-1]);
+			err = gen_new_estimator(tca[TCA_RATE-1], &sch->ge);
 			if (err) {
 				/*
 				 * Any broken qdiscs that would require
@@ -545,9 +543,11 @@ static int qdisc_change(struct Qdisc *sch, struct rtattr **tca)
 		if (err)
 			return err;
 	}
-	if (tca[TCA_RATE-1])
-		gen_replace_estimator(&sch->bstats, &sch->rate_est,
-			sch->stats_lock, tca[TCA_RATE-1]);
+	if (tca[TCA_RATE-1]) {
+		gen_replace_estimator(tca[TCA_RATE-1], &sch->ge);
+		/* XXX TO-DO: verify successful return from
+			   gen_replace_estimator()! */
+	}
 	return 0;
 }
 
@@ -822,8 +822,8 @@ static int tc_fill_qdisc(struct sk_buff *skb, struct Qdisc *q, u32 clid,
 	if (q->ops->dump_stats && q->ops->dump_stats(q, &d) < 0)
 		goto rtattr_failure;
 
-	if (gnet_stats_copy_basic(&d, &q->bstats) < 0 ||
-	    gnet_stats_copy_rate_est(&d, &q->rate_est) < 0 ||
+	if (gnet_stats_copy_basic(&d, &q->ge->bstats) < 0 ||
+	    gnet_stats_copy_rate_est(&d, &q->ge->rate_est) < 0 ||
 	    gnet_stats_copy_queue(&d, &q->qstats) < 0)
 		goto rtattr_failure;
 
diff --git a/net/sched/sch_atm.c b/net/sched/sch_atm.c
index 54b92d2..35065f9 100644
--- a/net/sched/sch_atm.c
+++ b/net/sched/sch_atm.c
@@ -437,8 +437,7 @@ static int atm_tc_enqueue(struct sk_buff *skb,struct Qdisc *sch)
 		if (flow) flow->qstats.drops++;
 		return ret;
 	}
-	sch->bstats.bytes += skb->len;
-	sch->bstats.packets++;
+	gen_bstat_skb(sch->ge, skb);
 	flow->bstats.bytes += skb->len;
 	flow->bstats.packets++;
 	/*
diff --git a/net/sched/sch_cbq.c b/net/sched/sch_cbq.c
index b184c35..5c00954 100644
--- a/net/sched/sch_cbq.c
+++ b/net/sched/sch_cbq.c
@@ -129,9 +129,10 @@ struct cbq_class
 	long			avgidle;
 	long			deficit;	/* Saved deficit for WRR */
 	psched_time_t		penalized;
-	struct gnet_stats_basic bstats;
+
+	struct gen_estimator	*ge; 		/* byte and packet stats
+						   collected here */
 	struct gnet_stats_queue qstats;
-	struct gnet_stats_rate_est rate_est;
 	struct tc_cbq_xstats	xstats;
 
 	struct tcf_proto	*filter_list;
@@ -385,7 +386,6 @@ static int
 cbq_enqueue(struct sk_buff *skb, struct Qdisc *sch)
 {
 	struct cbq_sched_data *q = qdisc_priv(sch);
-	int len = skb->len;
 	int ret;
 	struct cbq_class *cl = cbq_classify(skb, sch, &ret);
 
@@ -404,8 +404,7 @@ cbq_enqueue(struct sk_buff *skb, struct Qdisc *sch)
 #endif
 	if ((ret = cl->q->enqueue(skb, cl->q)) == NET_XMIT_SUCCESS) {
 		sch->q.qlen++;
-		sch->bstats.packets++;
-		sch->bstats.bytes+=len;
+		gen_bstat_skb(sch->ge, skb);
 		cbq_mark_toplevel(q, cl);
 		if (!cl->next_alive)
 			cbq_activate_class(cl);
@@ -674,7 +673,6 @@ static enum hrtimer_restart cbq_undelay(struct hrtimer *timer)
 
 static int cbq_reshape_fail(struct sk_buff *skb, struct Qdisc *child)
 {
-	int len = skb->len;
 	struct Qdisc *sch = child->__parent;
 	struct cbq_sched_data *q = qdisc_priv(sch);
 	struct cbq_class *cl = q->rx_class;
@@ -690,8 +688,7 @@ static int cbq_reshape_fail(struct sk_buff *skb, struct Qdisc *child)
 
 		if (cl->q->enqueue(skb, cl->q) == 0) {
 			sch->q.qlen++;
-			sch->bstats.packets++;
-			sch->bstats.bytes+=len;
+			gen_bstat_skb(sch->ge, skb);
 			if (!cl->next_alive)
 				cbq_activate_class(cl);
 			return 0;
@@ -749,8 +746,7 @@ cbq_update(struct cbq_sched_data *q)
 		long avgidle = cl->avgidle;
 		long idle;
 
-		cl->bstats.packets++;
-		cl->bstats.bytes += len;
+		gen_bstat_add(cl->ge, len, 1);
 
 		/*
 		   (now - last) is total time between packet right edges.
@@ -1634,8 +1630,8 @@ cbq_dump_class_stats(struct Qdisc *sch, unsigned long arg,
 	if (cl->undertime != PSCHED_PASTPERFECT)
 		cl->xstats.undertime = cl->undertime - q->now;
 
-	if (gnet_stats_copy_basic(d, &cl->bstats) < 0 ||
-	    gnet_stats_copy_rate_est(d, &cl->rate_est) < 0 ||
+	if (gnet_stats_copy_basic(d, &cl->ge->bstats) < 0 ||
+	    gnet_stats_copy_rate_est(d, &cl->ge->rate_est) < 0 ||
 	    gnet_stats_copy_queue(d, &cl->qstats) < 0)
 		return -1;
 
@@ -1706,7 +1702,7 @@ static void cbq_destroy_class(struct Qdisc *sch, struct cbq_class *cl)
 	tcf_destroy_chain(cl->filter_list);
 	qdisc_destroy(cl->q);
 	qdisc_put_rtab(cl->R_tab);
-	gen_kill_estimator(&cl->bstats, &cl->rate_est);
+	gen_kill_estimator(cl->ge);
 	if (cl != &q->link)
 		kfree(cl);
 }
@@ -1851,10 +1847,11 @@ cbq_change_class(struct Qdisc *sch, u32 classid, u32 parentid, struct rtattr **t
 
 		sch_tree_unlock(sch);
 
-		if (tca[TCA_RATE-1])
-			gen_replace_estimator(&cl->bstats, &cl->rate_est,
-					      &sch->dev->queue_lock,
-					      tca[TCA_RATE-1]);
+		if (tca[TCA_RATE-1]) {
+			gen_replace_estimator(tca[TCA_RATE-1], &cl->ge);
+			/* XXX TO-DO: verify successful return from
+			   gen_replace_estimator()! */
+		}
 		return 0;
 	}
 
@@ -1939,9 +1936,11 @@ cbq_change_class(struct Qdisc *sch, u32 classid, u32 parentid, struct rtattr **t
 		cbq_set_fopt(cl, RTA_DATA(tb[TCA_CBQ_FOPT-1]));
 	sch_tree_unlock(sch);
 
-	if (tca[TCA_RATE-1])
-		gen_new_estimator(&cl->bstats, &cl->rate_est,
-				  &sch->dev->queue_lock, tca[TCA_RATE-1]);
+	if (tca[TCA_RATE-1]) {
+		gen_new_estimator(tca[TCA_RATE-1], &cl->ge);
+		/* XXX TO-DO: verify successful return from
+		   gen_new_estimator()! */
+	}
 
 	*arg = (unsigned long)cl;
 	return 0;
diff --git a/net/sched/sch_dsmark.c b/net/sched/sch_dsmark.c
index 4d2c233..8c44328 100644
--- a/net/sched/sch_dsmark.c
+++ b/net/sched/sch_dsmark.c
@@ -265,8 +265,7 @@ static int dsmark_enqueue(struct sk_buff *skb,struct Qdisc *sch)
 		return err;
 	}
 
-	sch->bstats.bytes += skb->len;
-	sch->bstats.packets++;
+	gen_bstat_skb(sch->ge, skb);
 	sch->q.qlen++;
 
 	return NET_XMIT_SUCCESS;
diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c
index c81649c..acc6d90 100644
--- a/net/sched/sch_generic.c
+++ b/net/sched/sch_generic.c
@@ -506,7 +506,7 @@ void qdisc_destroy(struct Qdisc *qdisc)
 		return;
 
 	list_del(&qdisc->list);
-	gen_kill_estimator(&qdisc->bstats, &qdisc->rate_est);
+	gen_kill_estimator(qdisc->ge);
 	if (ops->reset)
 		ops->reset(qdisc);
 	if (ops->destroy)
diff --git a/net/sched/sch_hfsc.c b/net/sched/sch_hfsc.c
index 874452c..2b95b04 100644
--- a/net/sched/sch_hfsc.c
+++ b/net/sched/sch_hfsc.c
@@ -116,9 +116,10 @@ struct hfsc_class
 	u32		classid;	/* class id */
 	unsigned int	refcnt;		/* usage count */
 
-	struct gnet_stats_basic bstats;
+	struct gen_estimator *ge;	/* byte and packet stats collected here
+					 */
 	struct gnet_stats_queue qstats;
-	struct gnet_stats_rate_est rate_est;
+
 	unsigned int	level;		/* class level in hierarchy */
 	struct tcf_proto *filter_list;	/* filter list */
 	unsigned int	filter_cnt;	/* filter count */
@@ -1050,10 +1051,11 @@ hfsc_change_class(struct Qdisc *sch, u32 classid, u32 parentid,
 		}
 		sch_tree_unlock(sch);
 
-		if (tca[TCA_RATE-1])
-			gen_replace_estimator(&cl->bstats, &cl->rate_est,
-					      &sch->dev->queue_lock,
-					      tca[TCA_RATE-1]);
+		if (tca[TCA_RATE-1]) {
+			gen_replace_estimator(tca[TCA_RATE-1], &cl->ge);
+			/* XXX TO-DO: verify successful return from
+			   gen_replace_estimator()! */
+		}
 		return 0;
 	}
 
@@ -1106,9 +1108,11 @@ hfsc_change_class(struct Qdisc *sch, u32 classid, u32 parentid,
 	cl->cl_pcvtoff = parent->cl_cvtoff;
 	sch_tree_unlock(sch);
 
-	if (tca[TCA_RATE-1])
-		gen_new_estimator(&cl->bstats, &cl->rate_est,
-				  &sch->dev->queue_lock, tca[TCA_RATE-1]);
+	if (tca[TCA_RATE-1]) {
+		gen_new_estimator(tca[TCA_RATE-1], &cl->ge);
+		/* XXX TO-DO: verify successful return from
+		   gen_new_estimator()! */
+	}
 	*arg = (unsigned long)cl;
 	return 0;
 }
@@ -1120,7 +1124,7 @@ hfsc_destroy_class(struct Qdisc *sch, struct hfsc_class *cl)
 
 	tcf_destroy_chain(cl->filter_list);
 	qdisc_destroy(cl->qdisc);
-	gen_kill_estimator(&cl->bstats, &cl->rate_est);
+	gen_kill_estimator(cl->ge);
 	if (cl != &q->root)
 		kfree(cl);
 }
@@ -1373,8 +1377,8 @@ hfsc_dump_class_stats(struct Qdisc *sch, unsigned long arg,
 	xstats.work    = cl->cl_total;
 	xstats.rtwork  = cl->cl_cumul;
 
-	if (gnet_stats_copy_basic(d, &cl->bstats) < 0 ||
-	    gnet_stats_copy_rate_est(d, &cl->rate_est) < 0 ||
+	if (gnet_stats_copy_basic(d, &cl->ge->bstats) < 0 ||
+	    gnet_stats_copy_rate_est(d, &cl->ge->rate_est) < 0 ||
 	    gnet_stats_copy_queue(d, &cl->qstats) < 0)
 		return -1;
 
@@ -1587,10 +1591,8 @@ hfsc_enqueue(struct sk_buff *skb, struct Qdisc *sch)
 	if (cl->qdisc->q.qlen == 1)
 		set_active(cl, len);
 
-	cl->bstats.packets++;
-	cl->bstats.bytes += len;
-	sch->bstats.packets++;
-	sch->bstats.bytes += len;
+	gen_bstat_skb(cl->ge, skb);
+	gen_bstat_skb(sch->ge, skb);
 	sch->q.qlen++;
 
 	return NET_XMIT_SUCCESS;
diff --git a/net/sched/sch_htb.c b/net/sched/sch_htb.c
index b417a95..25bddd8 100644
--- a/net/sched/sch_htb.c
+++ b/net/sched/sch_htb.c
@@ -71,9 +71,8 @@ enum htb_cmode {
 struct htb_class {
 	/* general class parameters */
 	u32 classid;
-	struct gnet_stats_basic bstats;
+	struct gen_estimator *ge; /* byte and packet stats collected here */
 	struct gnet_stats_queue qstats;
-	struct gnet_stats_rate_est rate_est;
 	struct tc_htb_xstats xstats;	/* our special stats */
 	int refcnt;		/* usage count of this class */
 
@@ -603,15 +602,12 @@ static int htb_enqueue(struct sk_buff *skb, struct Qdisc *sch)
 		cl->qstats.drops++;
 		return NET_XMIT_DROP;
 	} else {
-		cl->bstats.packets +=
-			skb_is_gso(skb)?skb_shinfo(skb)->gso_segs:1;
-		cl->bstats.bytes += skb->len;
+		gen_bstat_skb(cl->ge, skb);
 		htb_activate(q, cl);
 	}
 
 	sch->q.qlen++;
-	sch->bstats.packets += skb_is_gso(skb)?skb_shinfo(skb)->gso_segs:1;
-	sch->bstats.bytes += skb->len;
+	gen_bstat_skb(sch->ge, skb);
 	return NET_XMIT_SUCCESS;
 }
 
@@ -696,9 +692,7 @@ static void htb_charge_class(struct htb_sched *q, struct htb_class *cl,
 
 		/* update byte stats except for leaves which are already updated */
 		if (cl->level) {
-			cl->bstats.bytes += bytes;
-			cl->bstats.packets += skb_is_gso(skb)?
-					skb_shinfo(skb)->gso_segs:1;
+			gen_bstat_skb(cl->ge, skb);
 		}
 		cl = cl->parent;
 	}
@@ -1112,8 +1106,8 @@ htb_dump_class_stats(struct Qdisc *sch, unsigned long arg, struct gnet_dump *d)
 	cl->xstats.tokens = cl->tokens;
 	cl->xstats.ctokens = cl->ctokens;
 
-	if (gnet_stats_copy_basic(d, &cl->bstats) < 0 ||
-	    gnet_stats_copy_rate_est(d, &cl->rate_est) < 0 ||
+	if (gnet_stats_copy_basic(d, &cl->ge->bstats) < 0 ||
+	    gnet_stats_copy_rate_est(d, &cl->ge->rate_est) < 0 ||
 	    gnet_stats_copy_queue(d, &cl->qstats) < 0)
 		return -1;
 
@@ -1204,7 +1198,7 @@ static void htb_destroy_class(struct Qdisc *sch, struct htb_class *cl)
 		BUG_TRAP(cl->un.leaf.q);
 		qdisc_destroy(cl->un.leaf.q);
 	}
-	gen_kill_estimator(&cl->bstats, &cl->rate_est);
+	gen_kill_estimator(cl->ge);
 	qdisc_put_rtab(cl->rate);
 	qdisc_put_rtab(cl->ceil);
 
@@ -1357,9 +1351,10 @@ static int htb_change_class(struct Qdisc *sch, u32 classid,
 		if ((cl = kzalloc(sizeof(*cl), GFP_KERNEL)) == NULL)
 			goto failure;
 
-		gen_new_estimator(&cl->bstats, &cl->rate_est,
-				  &sch->dev->queue_lock,
-				  tca[TCA_RATE-1] ? : &est.rta);
+		gen_new_estimator(tca[TCA_RATE-1] ? : &est.rta, &cl->ge);
+		/* XXX TO-DO: verify successful return from
+		   gen_new_estimator()! */
+
 		cl->refcnt = 1;
 		INIT_LIST_HEAD(&cl->sibling);
 		INIT_HLIST_NODE(&cl->hlist);
@@ -1412,10 +1407,11 @@ static int htb_change_class(struct Qdisc *sch, u32 classid,
 		list_add_tail(&cl->sibling,
 			      parent ? &parent->children : &q->root);
 	} else {
-		if (tca[TCA_RATE-1])
-			gen_replace_estimator(&cl->bstats, &cl->rate_est,
-					      &sch->dev->queue_lock,
-					      tca[TCA_RATE-1]);
+		if (tca[TCA_RATE-1]) {
+			gen_replace_estimator(tca[TCA_RATE-1], &cl->ge);
+			/* XXX TO-DO: verify successful return from
+			   gen_replace_estimator()! */
+		}
 		sch_tree_lock(sch);
 	}
 
diff --git a/net/sched/sch_ingress.c b/net/sched/sch_ingress.c
index cd0aab6..7be51ad 100644
--- a/net/sched/sch_ingress.c
+++ b/net/sched/sch_ingress.c
@@ -145,8 +145,7 @@ static int ingress_enqueue(struct sk_buff *skb,struct Qdisc *sch)
 	 * firewall FW_* code.
 	 */
 #ifdef CONFIG_NET_CLS_ACT
-	sch->bstats.packets++;
-	sch->bstats.bytes += skb->len;
+	gen_bstat_skb(sch->ge, skb);
 	switch (result) {
 		case TC_ACT_SHOT:
 			result = TC_ACT_SHOT;
@@ -176,8 +175,7 @@ static int ingress_enqueue(struct sk_buff *skb,struct Qdisc *sch)
 		case TC_POLICE_OK:
 		case TC_POLICE_UNSPEC:
 		default:
-		sch->bstats.packets++;
-		sch->bstats.bytes += skb->len;
+		gen_bstat_skb(sch->ge, skb);
 		result = NF_ACCEPT;
 		break;
 	}
@@ -185,8 +183,7 @@ static int ingress_enqueue(struct sk_buff *skb,struct Qdisc *sch)
 #else
 	D2PRINTK("Overriding result to ACCEPT\n");
 	result = NF_ACCEPT;
-	sch->bstats.packets++;
-	sch->bstats.bytes += skb->len;
+	gen_bstat_skb(sch->ge, skb);
 #endif
 #endif
 
diff --git a/net/sched/sch_netem.c b/net/sched/sch_netem.c
index 9e5e87e..77c3575 100644
--- a/net/sched/sch_netem.c
+++ b/net/sched/sch_netem.c
@@ -231,8 +231,7 @@ static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch)
 
 	if (likely(ret == NET_XMIT_SUCCESS)) {
 		sch->q.qlen++;
-		sch->bstats.bytes += skb->len;
-		sch->bstats.packets++;
+		gen_bstat_skb(sch->ge, skb);
 	} else
 		sch->qstats.drops++;
 
@@ -506,8 +505,7 @@ static int tfifo_enqueue(struct sk_buff *nskb, struct Qdisc *sch)
 		__skb_queue_after(list, skb, nskb);
 
 		sch->qstats.backlog += nskb->len;
-		sch->bstats.bytes += nskb->len;
-		sch->bstats.packets++;
+		gen_bstat_skb(sch->ge, nskb);
 
 		return NET_XMIT_SUCCESS;
 	}
diff --git a/net/sched/sch_prio.c b/net/sched/sch_prio.c
index 2d8c084..5fb1954 100644
--- a/net/sched/sch_prio.c
+++ b/net/sched/sch_prio.c
@@ -88,8 +88,7 @@ prio_enqueue(struct sk_buff *skb, struct Qdisc *sch)
 #endif
 
 	if ((ret = qdisc->enqueue(skb, qdisc)) == NET_XMIT_SUCCESS) {
-		sch->bstats.bytes += skb->len;
-		sch->bstats.packets++;
+		gen_bstat_skb(sch->ge, skb);
 		sch->q.qlen++;
 		return NET_XMIT_SUCCESS;
 	}
@@ -430,7 +429,7 @@ static int prio_dump_class_stats(struct Qdisc *sch, unsigned long cl,
 	struct Qdisc *cl_q;
 
 	cl_q = q->queues[cl - 1];
-	if (gnet_stats_copy_basic(d, &cl_q->bstats) < 0 ||
+	if (gnet_stats_copy_basic(d, &cl_q->ge->bstats) < 0 ||
 	    gnet_stats_copy_queue(d, &cl_q->qstats) < 0)
 		return -1;
 
diff --git a/net/sched/sch_red.c b/net/sched/sch_red.c
index 9b95fef..7161a37 100644
--- a/net/sched/sch_red.c
+++ b/net/sched/sch_red.c
@@ -94,8 +94,7 @@ static int red_enqueue(struct sk_buff *skb, struct Qdisc* sch)
 
 	ret = child->enqueue(skb, child);
 	if (likely(ret == NET_XMIT_SUCCESS)) {
-		sch->bstats.bytes += skb->len;
-		sch->bstats.packets++;
+		gen_bstat_skb(sch->ge, skb);
 		sch->q.qlen++;
 	} else {
 		q->stats.pdrop++;
diff --git a/net/sched/sch_sfq.c b/net/sched/sch_sfq.c
index 9579573..4880299 100644
--- a/net/sched/sch_sfq.c
+++ b/net/sched/sch_sfq.c
@@ -271,8 +271,7 @@ sfq_enqueue(struct sk_buff *skb, struct Qdisc* sch)
 		}
 	}
 	if (++sch->q.qlen < q->limit-1) {
-		sch->bstats.bytes += skb->len;
-		sch->bstats.packets++;
+		gen_bstat_skb(sch->ge, skb);
 		return 0;
 	}
 
diff --git a/net/sched/sch_tbf.c b/net/sched/sch_tbf.c
index 22e431d..817eed2 100644
--- a/net/sched/sch_tbf.c
+++ b/net/sched/sch_tbf.c
@@ -139,8 +139,7 @@ static int tbf_enqueue(struct sk_buff *skb, struct Qdisc* sch)
 	}
 
 	sch->q.qlen++;
-	sch->bstats.bytes += skb->len;
-	sch->bstats.packets++;
+	gen_bstat_skb(sch->ge, skb);
 	return 0;
 }
 
diff --git a/net/sched/sch_teql.c b/net/sched/sch_teql.c
index 0968184..af6deaa 100644
--- a/net/sched/sch_teql.c
+++ b/net/sched/sch_teql.c
@@ -83,8 +83,7 @@ teql_enqueue(struct sk_buff *skb, struct Qdisc* sch)
 
 	if (q->q.qlen < dev->tx_queue_len) {
 		__skb_queue_tail(&q->q, skb);
-		sch->bstats.bytes += skb->len;
-		sch->bstats.packets++;
+		gen_bstat_skb(sch->ge, skb);
 		return 0;
 	}
 

^ permalink raw reply related	[flat|nested] 18+ messages in thread
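
A note on the helper used throughout the hunks above: gen_bstat_skb()
and the sch->ge pointer are introduced by this experimental API patch
itself and never reached mainline, so their definitions do not appear
in these hunks. A minimal sketch of what the helper is assumed to do,
inferred from the open-coded updates it replaces (a hypothetical
reconstruction, not mainline code):

#include <linux/skbuff.h>

static inline void gen_bstat_skb(struct gen_estimator *ge,
				 const struct sk_buff *skb)
{
	/* Account one packet into estimator-owned counters, replacing
	 * the old open-coded sch->bstats updates. Assumes bstats is
	 * embedded in struct gen_estimator, as the &cl_q->ge->bstats
	 * expression in the sch_prio hunk suggests. */
	ge->bstats.bytes += skb->len;
	ge->bstats.packets++;
}

Under that assumption the prio_dump_class_stats() hunk also reads
naturally: each Qdisc carries a 'ge' pointer to its estimator, and the
basic stats now live behind cl_q->ge->bstats rather than directly in
the Qdisc.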

* Re: [NET]: gen_estimator deadlock fix
  2007-07-12 17:48           ` Ranko Zivojnovic
@ 2007-07-13 12:17             ` Jarek Poplawski
  2007-07-13 12:26               ` Ranko Zivojnovic
  0 siblings, 1 reply; 18+ messages in thread
From: Jarek Poplawski @ 2007-07-13 12:17 UTC (permalink / raw)
  To: Ranko Zivojnovic; +Cc: Patrick McHardy, netdev

On Thu, Jul 12, 2007 at 08:48:45PM +0300, Ranko Zivojnovic wrote:
...
> Ok - here's the patch for a review - it compiles clean ... and that's as
> much as it has been tested - I'll try give it a run later today or
> definitely tomorrow.
...

Ranko, you have some powers!

Alas, I definitely need more time. I hope I'll manage to check this
by Monday. Of course, if testing goes OK and somebody checks and acks
this earlier, I see no reason to wait for my opinion.

Thanks & good weekend,
Jarek P.

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [NET]: gen_estimator deadlock fix
  2007-07-13 12:17             ` Jarek Poplawski
@ 2007-07-13 12:26               ` Ranko Zivojnovic
  2007-07-13 13:42                 ` Jarek Poplawski
  0 siblings, 1 reply; 18+ messages in thread
From: Ranko Zivojnovic @ 2007-07-13 12:26 UTC (permalink / raw)
  To: Jarek Poplawski; +Cc: Patrick McHardy, netdev

On Fri, 2007-07-13 at 14:17 +0200, Jarek Poplawski wrote:
> On Thu, Jul 12, 2007 at 08:48:45PM +0300, Ranko Zivojnovic wrote:
> ...
> > Ok - here's the patch for a review - it compiles clean ... and that's as
> > much as it has been tested - I'll try give it a run later today or
> > definitely tomorrow.
> ...
> 
> Ranko, you have some powers!
> 
> Alas, I definitely need more time. I hope I'll manage to check this
> by Monday. Of course, if testing goes OK and somebody checks and acks
> this earlier, I see no reason to wait for my opinion.

Don't bother - just tested and it is a no-go (it would have been a
wonder if it worked the first time :)) - I have to resolve the issues
with qdiscs that do not calculate/use rates...

I'm not sure if it is desirable to have rates available on all qdiscs -
even on pfifo_fast... that way, when you issue...

# tc -s qdisc show dev eth0
qdisc pfifo_fast 0: root bands 3 priomap  1 2 2 2 1 2 0 0 1 1 1 1 1 1 1 1
 Sent 115971059 bytes 196761 pkt (dropped 0, overlimits 0 requeues 0)
 rate 0bit 0pps backlog 0b 0p requeues 0

...you will not get the 'rate 0bit'.

R.


^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [NET]: gen_estimator deadlock fix
  2007-07-13 12:26               ` Ranko Zivojnovic
@ 2007-07-13 13:42                 ` Jarek Poplawski
  2007-07-16  7:00                   ` Jarek Poplawski
  0 siblings, 1 reply; 18+ messages in thread
From: Jarek Poplawski @ 2007-07-13 13:42 UTC (permalink / raw)
  To: Ranko Zivojnovic; +Cc: Patrick McHardy, netdev

On Fri, Jul 13, 2007 at 03:26:42PM +0300, Ranko Zivojnovic wrote:
> On Fri, 2007-07-13 at 14:17 +0200, Jarek Poplawski wrote:
> > On Thu, Jul 12, 2007 at 08:48:45PM +0300, Ranko Zivojnovic wrote:
> > ...
> > > Ok - here's the patch for a review - it compiles clean ... and that's as
> > > much as it has been tested - I'll try give it a run later today or
> > > definitely tomorrow.
> > ...
> > 
> > Ranko, you have some powers!
> > 
> > Alas, I definitely need more time. I hope I'll manage to check this
> > by Monday. Of course, if testing goes OK and somebody checks and acks
> > this earlier, I see no reason to wait for my opinion.
> 
> Don't bother - just tested and it is a no-go (it would have been a
> wonder if it worked the first time :)) - I have to resolve the issues
> with qdiscs that do not calculate/use rates...
> 
> I'm not sure if it is desirable to have rates available on all qdiscs -
> even on pfifo_fast... that way, when you issue...
> 
> # tc -s qdisc show dev eth0
> qdisc pfifo_fast 0: root bands 3 priomap  1 2 2 2 1 2 0 0 1 1 1 1 1 1 1 1
>  Sent 115971059 bytes 196761 pkt (dropped 0, overlimits 0 requeues 0)
>  rate 0bit 0pps backlog 0b 0p requeues 0
> 
> ...you will not get the 'rate 0bit'.
> 

I've been a bit tight on time today, and only now I see that maybe
you have done too much. Of course, you can do it your way, but I
think it would be easier not to change too much at once: if possible,
only embed structs bstats and rate_est into struct gen_estimator,
adjust the few sch_'s and act_'s which use them, and change all
accesses and allocs appropriately. So only 6 or 7 files should be
affected. It seems no changes to gen_stats are necessary now (next
patch?).
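
In code terms, the suggestion amounts to roughly the following layout
(a sketch only, inferring the embedded fields from the discussion; the
real experimental patch may differ):

#include <linux/list.h>
#include <linux/rcupdate.h>
#include <linux/gen_stats.h>

struct gen_estimator
{
	struct list_head		list;
	struct gnet_stats_basic		bstats;		/* embedded, was a pointer */
	struct gnet_stats_rate_est	rate_est;	/* embedded, was a pointer */
	spinlock_t			*stats_lock;
	int				ewma_log;
	u64				last_bytes;
	u32				last_packets;
	u32				avpps;
	u32				avbps;
	struct rcu_head			e_rcu;
};

Each qdisc or action would then hold a pointer to its estimator and
update the embedded bstats through it, so destroying an estimator
could never leave a dangling *bstats pointer behind.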

I really can't check more now (or answer until Monday).

Bye,
Jarek P.

PS: I'm in a little hurry now so I hope I'm not very wrong above...

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [NET]: gen_estimator deadlock fix
  2007-07-13 13:42                 ` Jarek Poplawski
@ 2007-07-16  7:00                   ` Jarek Poplawski
  2007-07-16 13:03                     ` Patrick McHardy
  0 siblings, 1 reply; 18+ messages in thread
From: Jarek Poplawski @ 2007-07-16  7:00 UTC (permalink / raw)
  To: Ranko Zivojnovic; +Cc: Patrick McHardy, netdev

On Fri, Jul 13, 2007 at 03:42:31PM +0200, Jarek Poplawski wrote:
...
> On Fri, Jul 13, 2007 at 03:26:42PM +0300, Ranko Zivojnovic wrote:
> I've been a bit tight on time today, and only now I see that maybe
> you have done too much. Of course, you can do it your way, but I
> think it would be easier not to change too much at once: if possible,
> only embed structs bstats and rate_est into struct gen_estimator,
> adjust the few sch_'s and act_'s which use them, and change all
> accesses and allocs appropriately. So only 6 or 7 files should be
> affected. It seems no changes to gen_stats are necessary now (next
> patch?).

Alas, I was wrong. This idea really requires changing all these files,
plus even more! If only I could have foreseen this...

There is probably quite an easy way to get rid of this one race
alone, e.g. by replacing the *bstats field with NULL in
gen_kill_estimator, and checking for this in est_timer just after
taking the lock.
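
In code, the proposal boils down to two small fragments (a sketch of
the idea only - the names mirror gen_estimator.c, but the actual patch
follows later in the thread):

#include <linux/spinlock.h>

static DEFINE_RWLOCK(est_lock);	/* now guards only the bstats pointer */

static void kill_side(struct gen_estimator *e)
{
	/* gen_kill_estimator(): invalidate instead of freeing. */
	write_lock_bh(&est_lock);
	e->bstats = NULL;	/* readers will now skip this entry */
	write_unlock_bh(&est_lock);
}

static void timer_side(struct gen_estimator *e)
{
	/* est_timer(): same outer stats_lock as before, then re-check
	 * the pointer under est_lock before dereferencing it. */
	spin_lock(e->stats_lock);
	read_lock(&est_lock);
	if (e->bstats != NULL)
		e->last_bytes = e->bstats->bytes;	/* sample as before */
	read_unlock(&est_lock);
	spin_unlock(e->stats_lock);
}

Note the nesting: the timer takes est_lock inside stats_lock for a few
instructions only, and the kill side takes est_lock alone, so both
sides acquire the two locks in the same order and the original
inversion cannot recur.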

The gain from an API change would be mainly faster gen_kill_
and gen_replace_estimator. But, on the other hand, you have it
almost done. Ranko, I think these changes need the opinion of more
maintainers as soon as possible - after all, they could have some
objections; IMHO at least: Jamal Hadi Salim, Thomas Graf (authors
plus act_), Stephen Hemminger (sch_netem) and Jiri Benc
(net/mac80211).

I also have some suggestions, but maybe it would be better to make
them against the current version (the main one is about this
*stats_lock - you probably misunderstood Patrick - but I hope you
found it yourself).

Regards,
Jarek P.

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [NET]: gen_estimator deadlock fix
  2007-07-16  7:00                   ` Jarek Poplawski
@ 2007-07-16 13:03                     ` Patrick McHardy
  2007-07-16 17:45                       ` Ranko Zivojnovic
  0 siblings, 1 reply; 18+ messages in thread
From: Patrick McHardy @ 2007-07-16 13:03 UTC (permalink / raw)
  To: Jarek Poplawski; +Cc: Ranko Zivojnovic, netdev

Jarek Poplawski wrote:
> There is probably quite an easy way to get rid of this one race
> alone, e.g. by replacing the *bstats field with NULL in
> gen_kill_estimator, and checking for this in est_timer just after
> taking the lock.
> 
> The gain from an API change would be mainly faster gen_kill_
> and gen_replace_estimator. But, on the other hand, you have it
> almost done. Ranko, I think these changes need the opinion of more
> maintainers as soon as possible - after all, they could have some
> objections; IMHO at least: Jamal Hadi Salim, Thomas Graf (authors
> plus act_), Stephen Hemminger (sch_netem) and Jiri Benc
> (net/mac80211).


Frankly, all we need is a final patch.

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [NET]: gen_estimator deadlock fix
  2007-07-16 13:03                     ` Patrick McHardy
@ 2007-07-16 17:45                       ` Ranko Zivojnovic
  2007-07-17  1:28                         ` David Miller
  2007-07-17 12:04                         ` Jarek Poplawski
  0 siblings, 2 replies; 18+ messages in thread
From: Ranko Zivojnovic @ 2007-07-16 17:45 UTC (permalink / raw)
  To: Patrick McHardy; +Cc: Jarek Poplawski, netdev

[-- Attachment #1: Type: text/plain, Size: 1901 bytes --]

Patrick McHardy wrote:
> Jarek Poplawski wrote:
> > There is probably quite an easy way to get rid of this one race
> > alone, e.g. by replacing the *bstats field with NULL in
> > gen_kill_estimator, and checking for this in est_timer just after
> > taking the lock.
> > 
You are absolutely right - I definitely overcomplicated the solution to
the actual problem at hand - the deadlock... Attached is the patch that
does exactly as described above and should put a final resolution to the
current gen_estimator issues... I've also given it a spin and so far it
looks OK. I will re-post it tomorrow evening for inclusion in the tree
if no objections come in the meantime.

> Jarek Poplawski wrote:
> > The gain from an API change would be mainly faster gen_kill_
> > and gen_replace_estimator. But, on the other hand, you have it
> > almost done. Ranko, I think these changes need the opinion of more
> > maintainers as soon as possible - after all, they could have some
> > objections; IMHO at least: Jamal Hadi Salim, Thomas Graf (authors
> > plus act_), Stephen Hemminger (sch_netem) and Jiri Benc
> > (net/mac80211).

From my point of view the API change would help in untying the knot
entirely... It would become an independent piece of code with a sole
focus on rate estimation. I am thinking of making it even more
independent than the previous patch I posted... A small, pure library
that can be configured to track rates with configurable algorithms
etc... It may prove to be useful for other things as well.

But now that the gen_estimator issues are hopefully resolved through
this simpler patch, the API change can proceed at its own pace.

Patrick McHardy wrote:
> Frankly, all we need is a final patch.

The patch that resolves the ABBA deadlock is attached. WRT the API
patch - although I have not faced any problems so far, I still need
to do more comprehensive testing...

Best regards and thanks for the patience,

R.



[-- Attachment #2: x --]
[-- Type: text/plain, Size: 5699 bytes --]

[NET] gen_estimator deadlock fix

-Fixes ABBA deadlock noted by Patrick McHardy <kaber@trash.net>:

> There is at least one ABBA deadlock, est_timer() does:
> read_lock(&est_lock)
> spin_lock(e->stats_lock) (which is dev->queue_lock)
>
> and qdisc_destroy calls htb_destroy under dev->queue_lock, which
> calls htb_destroy_class, then gen_kill_estimator and this
> write_locks est_lock.

To fix the ABBA deadlock, the rate estimators are now kept on an RCU list.

-The use of est_lock changes from protecting the list to protecting
the update of the 'bstats' pointer, in order to avoid NULL dereferencing.

-The 'interval' member of the gen_estimator structure is removed, as
it is no longer needed.

Signed-off-by: Ranko Zivojnovic <ranko@spidernet.net>

---

diff --git a/net/core/gen_estimator.c b/net/core/gen_estimator.c
index cc84d8d..590a767 100644
--- a/net/core/gen_estimator.c
+++ b/net/core/gen_estimator.c
@@ -79,27 +79,27 @@
 
 struct gen_estimator
 {
-	struct gen_estimator	*next;
+	struct list_head	list;
 	struct gnet_stats_basic	*bstats;
 	struct gnet_stats_rate_est	*rate_est;
 	spinlock_t		*stats_lock;
-	unsigned		interval;
 	int			ewma_log;
 	u64			last_bytes;
 	u32			last_packets;
 	u32			avpps;
 	u32			avbps;
+	struct rcu_head		e_rcu;
 };
 
 struct gen_estimator_head
 {
 	struct timer_list	timer;
-	struct gen_estimator	*list;
+	struct list_head	list;
 };
 
 static struct gen_estimator_head elist[EST_MAX_INTERVAL+1];
 
-/* Estimator array lock */
+/* Protects against NULL dereference */
 static DEFINE_RWLOCK(est_lock);
 
 static void est_timer(unsigned long arg)
@@ -107,13 +107,17 @@ static void est_timer(unsigned long arg)
 	int idx = (int)arg;
 	struct gen_estimator *e;
 
-	read_lock(&est_lock);
-	for (e = elist[idx].list; e; e = e->next) {
+	rcu_read_lock();
+	list_for_each_entry_rcu(e, &elist[idx].list, list) {
 		u64 nbytes;
 		u32 npackets;
 		u32 rate;
 
 		spin_lock(e->stats_lock);
+		read_lock(&est_lock);
+		if (e->bstats == NULL)
+			goto skip;
+
 		nbytes = e->bstats->bytes;
 		npackets = e->bstats->packets;
 		rate = (nbytes - e->last_bytes)<<(7 - idx);
@@ -125,12 +129,14 @@ static void est_timer(unsigned long arg)
 		e->last_packets = npackets;
 		e->avpps += ((long)rate - (long)e->avpps) >> e->ewma_log;
 		e->rate_est->pps = (e->avpps+0x1FF)>>10;
+skip:
+		read_unlock(&est_lock);
 		spin_unlock(e->stats_lock);
 	}
 
-	if (elist[idx].list != NULL)
+	if (!list_empty(&elist[idx].list))
 		mod_timer(&elist[idx].timer, jiffies + ((HZ<<idx)/4));
-	read_unlock(&est_lock);
+	rcu_read_unlock();
 }
 
 /**
@@ -147,12 +153,17 @@ static void est_timer(unsigned long arg)
  * &rate_est with the statistics lock grabed during this period.
  *
  * Returns 0 on success or a negative error code.
+ *
+ * NOTE: Called under rtnl_mutex
  */
 int gen_new_estimator(struct gnet_stats_basic *bstats,
-	struct gnet_stats_rate_est *rate_est, spinlock_t *stats_lock, struct rtattr *opt)
+		      struct gnet_stats_rate_est *rate_est,
+		      spinlock_t *stats_lock,
+		      struct rtattr *opt)
 {
 	struct gen_estimator *est;
 	struct gnet_estimator *parm = RTA_DATA(opt);
+	int idx;
 
 	if (RTA_PAYLOAD(opt) < sizeof(*parm))
 		return -EINVAL;
@@ -164,7 +175,7 @@ int gen_new_estimator(struct gnet_stats_basic *bstats,
 	if (est == NULL)
 		return -ENOBUFS;
 
-	est->interval = parm->interval + 2;
+	idx = parm->interval + 2;
 	est->bstats = bstats;
 	est->rate_est = rate_est;
 	est->stats_lock = stats_lock;
@@ -174,20 +185,25 @@ int gen_new_estimator(struct gnet_stats_basic *bstats,
 	est->last_packets = bstats->packets;
 	est->avpps = rate_est->pps<<10;
 
-	est->next = elist[est->interval].list;
-	if (est->next == NULL) {
-		init_timer(&elist[est->interval].timer);
-		elist[est->interval].timer.data = est->interval;
-		elist[est->interval].timer.expires = jiffies + ((HZ<<est->interval)/4);
-		elist[est->interval].timer.function = est_timer;
-		add_timer(&elist[est->interval].timer);
+	if (!elist[idx].timer.function) {
+		INIT_LIST_HEAD(&elist[idx].list);
+		setup_timer(&elist[idx].timer, est_timer, idx);
 	}
-	write_lock_bh(&est_lock);
-	elist[est->interval].list = est;
-	write_unlock_bh(&est_lock);
+
+	if (list_empty(&elist[idx].list))
+		mod_timer(&elist[idx].timer, jiffies + ((HZ<<idx)/4));
+
+	list_add_rcu(&est->list, &elist[idx].list);
 	return 0;
 }
 
+static void __gen_kill_estimator(struct rcu_head *head)
+{
+	struct gen_estimator *e = container_of(head,
+					struct gen_estimator, e_rcu);
+	kfree(e);
+}
+
 /**
  * gen_kill_estimator - remove a rate estimator
  * @bstats: basic statistics
@@ -195,31 +211,32 @@ int gen_new_estimator(struct gnet_stats_basic *bstats,
  *
  * Removes the rate estimator specified by &bstats and &rate_est
  * and deletes the timer.
+ *
+ * NOTE: Called under rtnl_mutex
  */
 void gen_kill_estimator(struct gnet_stats_basic *bstats,
 	struct gnet_stats_rate_est *rate_est)
 {
 	int idx;
-	struct gen_estimator *est, **pest;
+	struct gen_estimator *e, *n;
 
 	for (idx=0; idx <= EST_MAX_INTERVAL; idx++) {
-		int killed = 0;
-		pest = &elist[idx].list;
-		while ((est=*pest) != NULL) {
-			if (est->rate_est != rate_est || est->bstats != bstats) {
-				pest = &est->next;
+
+		/* Skip non initialized indexes */
+		if (!elist[idx].timer.function)
+			continue;
+
+		list_for_each_entry_safe(e, n, &elist[idx].list, list) {
+			if (e->rate_est != rate_est || e->bstats != bstats)
 				continue;
-			}
 
 			write_lock_bh(&est_lock);
-			*pest = est->next;
+			e->bstats = NULL;
 			write_unlock_bh(&est_lock);
 
-			kfree(est);
-			killed++;
+			list_del_rcu(&e->list);
+			call_rcu(&e->e_rcu, __gen_kill_estimator);
 		}
-		if (killed && elist[idx].list == NULL)
-			del_timer(&elist[idx].timer);
 	}
 }
 

^ permalink raw reply related	[flat|nested] 18+ messages in thread
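
The ordering in gen_kill_estimator() above is the crux of the fix.
Restated in one place (a condensed mirror of the per-entry work in the
patch, not new code; the hypothetical function name is for exposition
only):

static void kill_one_estimator(struct gen_estimator *e)
{
	/* 1. Make any concurrent est_timer() run skip this entry
	 *    instead of dereferencing soon-to-be-freed stats. */
	write_lock_bh(&est_lock);
	e->bstats = NULL;
	write_unlock_bh(&est_lock);

	/* 2. Unlink; walkers already inside rcu_read_lock() may still
	 *    see the entry, which is why step 1 must come first. */
	list_del_rcu(&e->list);

	/* 3. Free only after every such walker has finished. */
	call_rcu(&e->e_rcu, __gen_kill_estimator);
}

With this in place, est_timer() no longer holds est_lock across the
whole walk - only rcu_read_lock(), which never blocks and nests safely
against any spinlock - so the read_lock(&est_lock) ->
spin_lock(e->stats_lock) versus dev->queue_lock ->
write_lock(&est_lock) inversion from the original report is gone.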

* Re: [NET]: gen_estimator deadlock fix
  2007-07-16 17:45                       ` Ranko Zivojnovic
@ 2007-07-17  1:28                         ` David Miller
  2007-07-17 12:04                         ` Jarek Poplawski
  1 sibling, 0 replies; 18+ messages in thread
From: David Miller @ 2007-07-17  1:28 UTC (permalink / raw)
  To: ranko; +Cc: kaber, jarkao2, netdev

From: Ranko Zivojnovic <ranko@spidernet.net>
Date: Mon, 16 Jul 2007 20:45:05 +0300

> [NET] gen_estimator deadlock fix
> 
> -Fixes ABBA deadlock noted by Patrick McHardy <kaber@trash.net>:
> 
> > There is at least one ABBA deadlock, est_timer() does:
> > read_lock(&est_lock)
> > spin_lock(e->stats_lock) (which is dev->queue_lock)
> >
> > and qdisc_destroy calls htb_destroy under dev->queue_lock, which
> > calls htb_destroy_class, then gen_kill_estimator and this
> > write_locks est_lock.
> 
> To fix the ABBA deadlock, the rate estimators are now kept on an RCU list.
> 
> -The use of est_lock changes from protecting the list to protecting
> the update of the 'bstats' pointer, in order to avoid NULL dereferencing.
> 
> -The 'interval' member of the gen_estimator structure is removed, as
> it is no longer needed.
> 
> Signed-off-by: Ranko Zivojnovic <ranko@spidernet.net>

Applied, thanks.

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [NET]: gen_estimator deadlock fix
  2007-07-17 12:04                         ` Jarek Poplawski
@ 2007-07-17 12:01                           ` Patrick McHardy
  2007-07-17 12:28                             ` Jarek Poplawski
  0 siblings, 1 reply; 18+ messages in thread
From: Patrick McHardy @ 2007-07-17 12:01 UTC (permalink / raw)
  To: Jarek Poplawski; +Cc: Ranko Zivojnovic, netdev

Jarek Poplawski wrote:
> This patch looks fine, but while checking for this lock I've found
> another strange thing: for actions, tcfc_stats_lock is used here, which
> is equivalent to tcfc_lock; so in gen_kill_estimator we sometimes take
> this lock after dev->queue_lock, and this order is also possible during
> tc_classify if actions are used. On the other hand, act_mirred calls
> dev_queue_xmit under this lock, so dev->queue_lock is taken in the
> opposite order. I hope it's always with different devs, and no real
> deadlock is possible, but this all is a bit queer...


It *should* be a different device, but AFAIK nothing enforces this.
There are quite a few possible deadlocks with TC actions; mid-term,
things like the mirred action need to be redesigned to inject packets
from a different context.
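
The inversion in question is the classic ABBA pattern. Schematically
(the lock names below are stand-ins for dev->queue_lock and the
action's tcf_lock, not the real structures):

#include <linux/spinlock.h>

static DEFINE_SPINLOCK(queue_lock);	/* stands in for dev->queue_lock */
static DEFINE_SPINLOCK(action_lock);	/* stands in for act->tcf_lock   */

static void classify_path(void)		/* tc_classify -> action */
{
	spin_lock(&queue_lock);
	spin_lock(&action_lock);	/* order: queue, then action */
	spin_unlock(&action_lock);
	spin_unlock(&queue_lock);
}

static void mirred_path(void)		/* action -> dev_queue_xmit */
{
	spin_lock(&action_lock);
	spin_lock(&queue_lock);		/* order: action, then queue */
	spin_unlock(&queue_lock);
	spin_unlock(&action_lock);
}

If both paths ever race on the same device's locks, each can hold one
lock while waiting forever for the other; today only the fact that
mirred normally targets a *different* device keeps this from
deadlocking.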

> I don't know actions well enough, but if it's possible that they are
> always run only from tc_classify, under dev->queue_lock, maybe it
> would be simpler to use this lock for actions' stats with
> gen_estimator too.


The same action can be shared between devices, so they need separate
locks.

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [NET]: gen_estimator deadlock fix
  2007-07-16 17:45                       ` Ranko Zivojnovic
  2007-07-17  1:28                         ` David Miller
@ 2007-07-17 12:04                         ` Jarek Poplawski
  2007-07-17 12:01                           ` Patrick McHardy
  1 sibling, 1 reply; 18+ messages in thread
From: Jarek Poplawski @ 2007-07-17 12:04 UTC (permalink / raw)
  To: Patrick McHardy; +Cc: Ranko Zivojnovic, netdev

On Mon, Jul 16, 2007 at 08:45:05PM +0300, Ranko Zivojnovic wrote:
...
> [NET] gen_estimator deadlock fix
> 
> -Fixes ABBA deadlock noted by Patrick McHardy <kaber@trash.net>:
> 
> > There is at least one ABBA deadlock, est_timer() does:
> > read_lock(&est_lock)
> > spin_lock(e->stats_lock) (which is dev->queue_lock)
> >
> > and qdisc_destroy calls htb_destroy under dev->queue_lock, which
> > calls htb_destroy_class, then gen_kill_estimator and this
> > write_locks est_lock.
> 
> To fix the ABBA deadlock, the rate estimators are now kept on an RCU list.
> 
> -The use of est_lock changes from protecting the list to protecting
> the update of the 'bstats' pointer, in order to avoid NULL dereferencing.

This patch looks fine, but while checking for this lock I've found
another strange thing: for actions, tcfc_stats_lock is used here, which
is equivalent to tcfc_lock; so in gen_kill_estimator we sometimes take
this lock after dev->queue_lock, and this order is also possible during
tc_classify if actions are used. On the other hand, act_mirred calls
dev_queue_xmit under this lock, so dev->queue_lock is taken in the
opposite order. I hope it's always with different devs, and no real
deadlock is possible, but this all is a bit queer...

I don't know actions well enough, but if it's possible that they are
always run only from tc_classify, under dev->queue_lock, maybe it
would be simpler to use this lock for actions' stats with
gen_estimator too. And if gen_kill_estimator is sometimes run with
dev->queue_lock, maybe doing this always would make this locking
really understandable and less prone to such inversions (plus
est_lock could be forgotten)?

Regards,
Jarek P.

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [NET]: gen_estimator deadlock fix
  2007-07-17 12:01                           ` Patrick McHardy
@ 2007-07-17 12:28                             ` Jarek Poplawski
  0 siblings, 0 replies; 18+ messages in thread
From: Jarek Poplawski @ 2007-07-17 12:28 UTC (permalink / raw)
  To: Patrick McHardy; +Cc: Ranko Zivojnovic, netdev

On Tue, Jul 17, 2007 at 02:01:48PM +0200, Patrick McHardy wrote:
...

Thanks,
Jarek P.

^ permalink raw reply	[flat|nested] 18+ messages in thread

end of thread, other threads:[~2007-07-17 12:19 UTC | newest]

Thread overview: 18+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2007-07-11 13:41 [NET]: gen_estimator deadlock fix Ranko Zivojnovic
2007-07-12  7:37 ` Jarek Poplawski
2007-07-12  9:18   ` Ranko Zivojnovic
2007-07-12  9:40     ` Ranko Zivojnovic
2007-07-12 10:46     ` Jarek Poplawski
2007-07-12 11:47       ` Ranko Zivojnovic
2007-07-12 12:07         ` Patrick McHardy
2007-07-12 17:48           ` Ranko Zivojnovic
2007-07-13 12:17             ` Jarek Poplawski
2007-07-13 12:26               ` Ranko Zivojnovic
2007-07-13 13:42                 ` Jarek Poplawski
2007-07-16  7:00                   ` Jarek Poplawski
2007-07-16 13:03                     ` Patrick McHardy
2007-07-16 17:45                       ` Ranko Zivojnovic
2007-07-17  1:28                         ` David Miller
2007-07-17 12:04                         ` Jarek Poplawski
2007-07-17 12:01                           ` Patrick McHardy
2007-07-17 12:28                             ` Jarek Poplawski

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).