netdev.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [PATCH net-next-2.6] pkt_sched: gen_estimator: kill est_lock rwlock
@ 2010-06-07 14:32 Eric Dumazet
  2010-06-07 14:53 ` Changli Gao
  0 siblings, 1 reply; 35+ messages in thread
From: Eric Dumazet @ 2010-06-07 14:32 UTC (permalink / raw)
  To: David Miller; +Cc: netdev, Stephen Hemminger, Jarek Poplawski

We can use RCU in est_timer() and kill est_lock rwlock.

Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
---
 net/core/gen_estimator.c |   45 ++++++++++++++-----------------------
 1 file changed, 18 insertions(+), 27 deletions(-)

diff --git a/net/core/gen_estimator.c b/net/core/gen_estimator.c
index cf8e703..406d880 100644
--- a/net/core/gen_estimator.c
+++ b/net/core/gen_estimator.c
@@ -102,9 +102,6 @@ struct gen_estimator_head
 
 static struct gen_estimator_head elist[EST_MAX_INTERVAL+1];
 
-/* Protects against NULL dereference */
-static DEFINE_RWLOCK(est_lock);
-
 /* Protects against soft lockup during large deletion */
 static struct rb_root est_root = RB_ROOT;
 
@@ -115,29 +112,25 @@ static void est_timer(unsigned long arg)
 
 	rcu_read_lock();
 	list_for_each_entry_rcu(e, &elist[idx].list, list) {
-		u64 nbytes;
-		u64 brate;
-		u32 npackets;
-		u32 rate;
+		struct gnet_stats_basic_packed *bstats;
 
 		spin_lock(e->stats_lock);
-		read_lock(&est_lock);
-		if (e->bstats == NULL)
-			goto skip;
-
-		nbytes = e->bstats->bytes;
-		npackets = e->bstats->packets;
-		brate = (nbytes - e->last_bytes)<<(7 - idx);
-		e->last_bytes = nbytes;
-		e->avbps += (brate >> e->ewma_log) - (e->avbps >> e->ewma_log);
-		e->rate_est->bps = (e->avbps+0xF)>>5;
-
-		rate = (npackets - e->last_packets)<<(12 - idx);
-		e->last_packets = npackets;
-		e->avpps += (rate >> e->ewma_log) - (e->avpps >> e->ewma_log);
-		e->rate_est->pps = (e->avpps+0x1FF)>>10;
-skip:
-		read_unlock(&est_lock);
+		bstats = rcu_dereference(e->bstats);
+		if (bstats) {
+			u64 nbytes = ACCESS_ONCE(bstats->bytes);
+			u32 npackets = ACCESS_ONCE(bstats->packets);
+			u64 brate = (nbytes - e->last_bytes)<<(7 - idx);
+			u32 rate;
+
+			e->last_bytes = nbytes;
+			e->avbps += (brate >> e->ewma_log) - (e->avbps >> e->ewma_log);
+			e->rate_est->bps = (e->avbps+0xF)>>5;
+
+			rate = (npackets - e->last_packets)<<(12 - idx);
+			e->last_packets = npackets;
+			e->avpps += (rate >> e->ewma_log) - (e->avpps >> e->ewma_log);
+			e->rate_est->pps = (e->avpps+0x1FF)>>10;
+		}
 		spin_unlock(e->stats_lock);
 	}
 
@@ -271,9 +264,7 @@ void gen_kill_estimator(struct gnet_stats_basic_packed *bstats,
 	while ((e = gen_find_node(bstats, rate_est))) {
 		rb_erase(&e->node, &est_root);
 
-		write_lock_bh(&est_lock);
-		e->bstats = NULL;
-		write_unlock_bh(&est_lock);
+		rcu_assign_pointer(e->bstats, NULL);
 
 		list_del_rcu(&e->list);
 		call_rcu(&e->e_rcu, __gen_kill_estimator);



^ permalink raw reply related	[flat|nested] 35+ messages in thread

* Re: [PATCH net-next-2.6] pkt_sched: gen_estimator: kill est_lock rwlock
  2010-06-07 14:32 [PATCH net-next-2.6] pkt_sched: gen_estimator: kill est_lock rwlock Eric Dumazet
@ 2010-06-07 14:53 ` Changli Gao
  2010-06-07 15:30   ` Eric Dumazet
  0 siblings, 1 reply; 35+ messages in thread
From: Changli Gao @ 2010-06-07 14:53 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: David Miller, netdev, Stephen Hemminger, Jarek Poplawski

On Mon, Jun 7, 2010 at 10:32 PM, Eric Dumazet <eric.dumazet@gmail.com> wrote:
> We can use RCU in est_timer() and kill est_lock rwlock.
>
> Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
> ---
>  net/core/gen_estimator.c |   45 ++++++++++++++-----------------------
>  1 file changed, 18 insertions(+), 27 deletions(-)
>
> diff --git a/net/core/gen_estimator.c b/net/core/gen_estimator.c
> index cf8e703..406d880 100644
> --- a/net/core/gen_estimator.c
> +++ b/net/core/gen_estimator.c
> @@ -102,9 +102,6 @@ struct gen_estimator_head
>
>  static struct gen_estimator_head elist[EST_MAX_INTERVAL+1];
>
> -/* Protects against NULL dereference */
> -static DEFINE_RWLOCK(est_lock);
> -
>  /* Protects against soft lockup during large deletion */
>  static struct rb_root est_root = RB_ROOT;
>
> @@ -115,29 +112,25 @@ static void est_timer(unsigned long arg)
>
>        rcu_read_lock();
>        list_for_each_entry_rcu(e, &elist[idx].list, list) {
> -               u64 nbytes;
> -               u64 brate;
> -               u32 npackets;
> -               u32 rate;
> +               struct gnet_stats_basic_packed *bstats;
>
>                spin_lock(e->stats_lock);
> -               read_lock(&est_lock);
> -               if (e->bstats == NULL)
> -                       goto skip;
> -
> -               nbytes = e->bstats->bytes;
> -               npackets = e->bstats->packets;
> -               brate = (nbytes - e->last_bytes)<<(7 - idx);
> -               e->last_bytes = nbytes;
> -               e->avbps += (brate >> e->ewma_log) - (e->avbps >> e->ewma_log);
> -               e->rate_est->bps = (e->avbps+0xF)>>5;
> -
> -               rate = (npackets - e->last_packets)<<(12 - idx);
> -               e->last_packets = npackets;
> -               e->avpps += (rate >> e->ewma_log) - (e->avpps >> e->ewma_log);
> -               e->rate_est->pps = (e->avpps+0x1FF)>>10;
> -skip:
> -               read_unlock(&est_lock);
> +               bstats = rcu_dereference(e->bstats);
> +               if (bstats) {
> +                       u64 nbytes = ACCESS_ONCE(bstats->bytes);
> +                       u32 npackets = ACCESS_ONCE(bstats->packets);
> +                       u64 brate = (nbytes - e->last_bytes)<<(7 - idx);
> +                       u32 rate;
> +
> +                       e->last_bytes = nbytes;
> +                       e->avbps += (brate >> e->ewma_log) - (e->avbps >> e->ewma_log);
> +                       e->rate_est->bps = (e->avbps+0xF)>>5;
> +
> +                       rate = (npackets - e->last_packets)<<(12 - idx);
> +                       e->last_packets = npackets;
> +                       e->avpps += (rate >> e->ewma_log) - (e->avpps >> e->ewma_log);
> +                       e->rate_est->pps = (e->avpps+0x1FF)>>10;
> +               }
>                spin_unlock(e->stats_lock);
>        }
>
> @@ -271,9 +264,7 @@ void gen_kill_estimator(struct gnet_stats_basic_packed *bstats,
>        while ((e = gen_find_node(bstats, rate_est))) {
>                rb_erase(&e->node, &est_root);
>
> -               write_lock_bh(&est_lock);
> -               e->bstats = NULL;
> -               write_unlock_bh(&est_lock);
> +               rcu_assign_pointer(e->bstats, NULL);
>
>                list_del_rcu(&e->list);
>                call_rcu(&e->e_rcu, __gen_kill_estimator);
>

Hmm. I don't think it is correct.

Look at this code:

void xt_rateest_put(struct xt_rateest *est)
{
        mutex_lock(&xt_rateest_mutex);
        if (--est->refcnt == 0) {
                hlist_del(&est->list);
                gen_kill_estimator(&est->bstats, &est->rstats);
                kfree(est);
        }
        mutex_unlock(&xt_rateest_mutex);
}

est will be released after gen_kill_estimator() returns. After est is
freed, there may still be some other CPUs running in the branch:
     if (bstats) {
         ....
     }

-- 
Regards,
Changli Gao(xiaosuo@gmail.com)

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH net-next-2.6] pkt_sched: gen_estimator: kill est_lock rwlock
  2010-06-07 14:53 ` Changli Gao
@ 2010-06-07 15:30   ` Eric Dumazet
  2010-06-07 15:55     ` Eric Dumazet
  0 siblings, 1 reply; 35+ messages in thread
From: Eric Dumazet @ 2010-06-07 15:30 UTC (permalink / raw)
  To: Changli Gao; +Cc: David Miller, netdev, Stephen Hemminger, Jarek Poplawski

Le lundi 07 juin 2010 à 22:53 +0800, Changli Gao a écrit :

> Hmm. I don't think it is correct.
> 
> Look at this code:
> 
> void xt_rateest_put(struct xt_rateest *est)
> {
>         mutex_lock(&xt_rateest_mutex);
>         if (--est->refcnt == 0) {
>                 hlist_del(&est->list);
>                 gen_kill_estimator(&est->bstats, &est->rstats);
>                 kfree(est);
>         }
>         mutex_unlock(&xt_rateest_mutex);
> }
> 
> est will be released after gen_kill_estimator() returns. After est is
> freed, there may still be some other CPUs running in the branch:
>      if (bstats) {
>          ....
>      }
> 

Oh well, I think I knew this from a previous attempt, but then I
forgot :)

I'll provide an updated patch, so that no other attempt is tried next
yer, thanks !




^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH net-next-2.6] pkt_sched: gen_estimator: kill est_lock rwlock
  2010-06-07 15:30   ` Eric Dumazet
@ 2010-06-07 15:55     ` Eric Dumazet
  2010-06-07 16:56       ` [PATCH net-next-2.6 v2] " Eric Dumazet
  0 siblings, 1 reply; 35+ messages in thread
From: Eric Dumazet @ 2010-06-07 15:55 UTC (permalink / raw)
  To: Changli Gao; +Cc: David Miller, netdev, Stephen Hemminger, Jarek Poplawski

Le lundi 07 juin 2010 à 17:30 +0200, Eric Dumazet a écrit :
> Le lundi 07 juin 2010 à 22:53 +0800, Changli Gao a écrit :
> 
> > Hmm. I don't think it is correct.
> > 
> > Look at this code:
> > 
> > void xt_rateest_put(struct xt_rateest *est)
> > {
> >         mutex_lock(&xt_rateest_mutex);
> >         if (--est->refcnt == 0) {
> >                 hlist_del(&est->list);
> >                 gen_kill_estimator(&est->bstats, &est->rstats);
> >                 kfree(est);
> >         }
> >         mutex_unlock(&xt_rateest_mutex);
> > }
> > 
> > est will be released after gen_kill_estimator() returns. After est is
> > freed, there may still be some other CPUs running in the branch:
> >      if (bstats) {
> >          ....
> >      }
> > 
> 
> Oh well, I think I knew this from a previous attempt, but then I
> forgot :)
> 
> I'll provide an updated patch, so that no other attempt is tried next
> yer, thanks !
> 
> 

For your information, bug is already there before my patch.

So this est_lock is a wrong protection, in the sense its so convoluted
that nobody but you and me even noticed it was buggy in the first place.

(see commit 5d944c640b4 for a first patch)




^ permalink raw reply	[flat|nested] 35+ messages in thread

* [PATCH net-next-2.6 v2] pkt_sched: gen_estimator: kill est_lock rwlock
  2010-06-07 15:55     ` Eric Dumazet
@ 2010-06-07 16:56       ` Eric Dumazet
  2010-06-07 17:18         ` [PATCH net-2.6] pkt_sched: gen_estimator: add a new lock Eric Dumazet
                           ` (2 more replies)
  0 siblings, 3 replies; 35+ messages in thread
From: Eric Dumazet @ 2010-06-07 16:56 UTC (permalink / raw)
  To: Changli Gao, David Miller
  Cc: netdev, Stephen Hemminger, Jarek Poplawski, Patrick McHardy


> 
> For your information, bug is already there before my patch.
> 
> So this est_lock is a wrong protection, in the sense its so convoluted
> that nobody but you and me even noticed it was buggy in the first place.
> 
> (see commit 5d944c640b4 for a first patch)
> 
> 

Here is v2 of the patch.

Even if its a bug correction, I cooked it for net-next-2.6 since bug
probably never occured, and patch is too large to be sent to
net-2.6/linux-2.6 before testing.

Another bug comes from net/netfilter/xt_RATEEST.c : It apparently
calls gen_kill_estimator() / gen_new_estimator() without holding RTNL ?

So we should add another lock to protect things (est_root, elist[], ...)

David, I can send a net-2.6 patch for this one, since it should be small
enough. If yes, I'll respin this patch of course ;)

Thanks

[PATCH net-next-2.6] pkt_sched: gen_kill_estimator() rcu fixes

gen_kill_estimator() API is fundamentaly wrong, since caller should make
sure an RCU grace period is respected before freeing bstats or lock.

This was partially addressed in commit 5d944c640b4 (gen_estimator:
deadlock fix), but same problem exist for all gen_kill_estimator()
users.

Change its name to gen_kill_estimator_rcu() and change all callers to
use RCU grace period before freeing bstats container.

As a bonus, est_lock rwlock can be killed for good.

Special thanks to Changli Gao for commenting on a previous patch, this
made this bug clear to me.

Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
---
 include/net/act_api.h              |    2 
 include/net/gen_stats.h            |    2 
 include/net/netfilter/xt_rateest.h |    1 
 net/core/gen_estimator.c           |   58 ++++++++++++---------------
 net/netfilter/xt_RATEEST.c         |   10 +++-
 net/sched/act_api.c                |    9 +++-
 net/sched/act_police.c             |   12 ++++-
 net/sched/sch_cbq.c                |   11 ++++-
 net/sched/sch_drr.c                |   11 ++++-
 net/sched/sch_generic.c            |    4 -
 net/sched/sch_hfsc.c               |   11 ++++-
 net/sched/sch_htb.c                |   11 ++++-
 12 files changed, 92 insertions(+), 50 deletions(-)

diff --git a/include/net/act_api.h b/include/net/act_api.h
index c05fd71..bab385f 100644
--- a/include/net/act_api.h
+++ b/include/net/act_api.h
@@ -20,6 +20,7 @@ struct tcf_common {
 	struct gnet_stats_queue		tcfc_qstats;
 	struct gnet_stats_rate_est	tcfc_rate_est;
 	spinlock_t			tcfc_lock;
+	struct rcu_head			tcfc_rcu;
 };
 #define tcf_next	common.tcfc_next
 #define tcf_index	common.tcfc_index
@@ -32,6 +33,7 @@ struct tcf_common {
 #define tcf_qstats	common.tcfc_qstats
 #define tcf_rate_est	common.tcfc_rate_est
 #define tcf_lock	common.tcfc_lock
+#define tcf_rcu		common.tcfc_rcu
 
 struct tcf_police {
 	struct tcf_common	common;
diff --git a/include/net/gen_stats.h b/include/net/gen_stats.h
index fa15771..3545696 100644
--- a/include/net/gen_stats.h
+++ b/include/net/gen_stats.h
@@ -40,7 +40,7 @@ extern int gnet_stats_finish_copy(struct gnet_dump *d);
 extern int gen_new_estimator(struct gnet_stats_basic_packed *bstats,
 			     struct gnet_stats_rate_est *rate_est,
 			     spinlock_t *stats_lock, struct nlattr *opt);
-extern void gen_kill_estimator(struct gnet_stats_basic_packed *bstats,
+extern void gen_kill_estimator_rcu(struct gnet_stats_basic_packed *bstats,
 			       struct gnet_stats_rate_est *rate_est);
 extern int gen_replace_estimator(struct gnet_stats_basic_packed *bstats,
 				 struct gnet_stats_rate_est *rate_est,
diff --git a/include/net/netfilter/xt_rateest.h b/include/net/netfilter/xt_rateest.h
index ddbf37e..5e14277 100644
--- a/include/net/netfilter/xt_rateest.h
+++ b/include/net/netfilter/xt_rateest.h
@@ -9,6 +9,7 @@ struct xt_rateest {
 	struct gnet_estimator		params;
 	struct gnet_stats_rate_est	rstats;
 	struct gnet_stats_basic_packed	bstats;
+	struct rcu_head			rcu;
 };
 
 extern struct xt_rateest *xt_rateest_lookup(const char *name);
diff --git a/net/core/gen_estimator.c b/net/core/gen_estimator.c
index cf8e703..bd06b4a 100644
--- a/net/core/gen_estimator.c
+++ b/net/core/gen_estimator.c
@@ -102,9 +102,6 @@ struct gen_estimator_head
 
 static struct gen_estimator_head elist[EST_MAX_INTERVAL+1];
 
-/* Protects against NULL dereference */
-static DEFINE_RWLOCK(est_lock);
-
 /* Protects against soft lockup during large deletion */
 static struct rb_root est_root = RB_ROOT;
 
@@ -115,29 +112,25 @@ static void est_timer(unsigned long arg)
 
 	rcu_read_lock();
 	list_for_each_entry_rcu(e, &elist[idx].list, list) {
-		u64 nbytes;
-		u64 brate;
-		u32 npackets;
-		u32 rate;
+		struct gnet_stats_basic_packed *bstats;
 
 		spin_lock(e->stats_lock);
-		read_lock(&est_lock);
-		if (e->bstats == NULL)
-			goto skip;
-
-		nbytes = e->bstats->bytes;
-		npackets = e->bstats->packets;
-		brate = (nbytes - e->last_bytes)<<(7 - idx);
-		e->last_bytes = nbytes;
-		e->avbps += (brate >> e->ewma_log) - (e->avbps >> e->ewma_log);
-		e->rate_est->bps = (e->avbps+0xF)>>5;
-
-		rate = (npackets - e->last_packets)<<(12 - idx);
-		e->last_packets = npackets;
-		e->avpps += (rate >> e->ewma_log) - (e->avpps >> e->ewma_log);
-		e->rate_est->pps = (e->avpps+0x1FF)>>10;
-skip:
-		read_unlock(&est_lock);
+		bstats = rcu_dereference(e->bstats);
+		if (bstats) {
+			u64 nbytes = ACCESS_ONCE(bstats->bytes);
+			u32 npackets = ACCESS_ONCE(bstats->packets);
+			u64 brate = (nbytes - e->last_bytes)<<(7 - idx);
+			u32 rate;
+
+			e->last_bytes = nbytes;
+			e->avbps += (brate >> e->ewma_log) - (e->avbps >> e->ewma_log);
+			e->rate_est->bps = (e->avbps+0xF)>>5;
+
+			rate = (npackets - e->last_packets)<<(12 - idx);
+			e->last_packets = npackets;
+			e->avpps += (rate >> e->ewma_log) - (e->avpps >> e->ewma_log);
+			e->rate_est->pps = (e->avpps+0x1FF)>>10;
+		}
 		spin_unlock(e->stats_lock);
 	}
 
@@ -255,15 +248,18 @@ static void __gen_kill_estimator(struct rcu_head *head)
 }
 
 /**
- * gen_kill_estimator - remove a rate estimator
+ * gen_kill_estimator_rcu - remove a rate estimator
  * @bstats: basic statistics
  * @rate_est: rate estimator statistics
  *
  * Removes the rate estimator specified by &bstats and &rate_est.
  *
- * NOTE: Called under rtnl_mutex
+ * NOTES:
+ *	Called under rtnl_mutex
+ *	Because est_timer() requirements (RCU protection), caller
+ *	should respect an RCU grace period before freeing bstats
  */
-void gen_kill_estimator(struct gnet_stats_basic_packed *bstats,
+void gen_kill_estimator_rcu(struct gnet_stats_basic_packed *bstats,
 			struct gnet_stats_rate_est *rate_est)
 {
 	struct gen_estimator *e;
@@ -271,15 +267,13 @@ void gen_kill_estimator(struct gnet_stats_basic_packed *bstats,
 	while ((e = gen_find_node(bstats, rate_est))) {
 		rb_erase(&e->node, &est_root);
 
-		write_lock_bh(&est_lock);
-		e->bstats = NULL;
-		write_unlock_bh(&est_lock);
+		rcu_assign_pointer(e->bstats, NULL);
 
 		list_del_rcu(&e->list);
 		call_rcu(&e->e_rcu, __gen_kill_estimator);
 	}
 }
-EXPORT_SYMBOL(gen_kill_estimator);
+EXPORT_SYMBOL(gen_kill_estimator_rcu);
 
 /**
  * gen_replace_estimator - replace rate estimator configuration
@@ -297,7 +291,7 @@ int gen_replace_estimator(struct gnet_stats_basic_packed *bstats,
 			  struct gnet_stats_rate_est *rate_est,
 			  spinlock_t *stats_lock, struct nlattr *opt)
 {
-	gen_kill_estimator(bstats, rate_est);
+	gen_kill_estimator_rcu(bstats, rate_est);
 	return gen_new_estimator(bstats, rate_est, stats_lock, opt);
 }
 EXPORT_SYMBOL(gen_replace_estimator);
diff --git a/net/netfilter/xt_RATEEST.c b/net/netfilter/xt_RATEEST.c
index 69c01e1..55747cd 100644
--- a/net/netfilter/xt_RATEEST.c
+++ b/net/netfilter/xt_RATEEST.c
@@ -60,13 +60,18 @@ struct xt_rateest *xt_rateest_lookup(const char *name)
 }
 EXPORT_SYMBOL_GPL(xt_rateest_lookup);
 
+static void xt_rateeset_free_rcu(struct rcu_head *head)
+{
+	kfree(container_of(head, struct xt_rateest, rcu));
+}
+
 void xt_rateest_put(struct xt_rateest *est)
 {
 	mutex_lock(&xt_rateest_mutex);
 	if (--est->refcnt == 0) {
 		hlist_del(&est->list);
-		gen_kill_estimator(&est->bstats, &est->rstats);
-		kfree(est);
+		gen_kill_estimator_rcu(&est->bstats, &est->rstats);
+		call_rcu(&est->rcu, xt_rateeset_free_rcu);
 	}
 	mutex_unlock(&xt_rateest_mutex);
 }
@@ -179,6 +184,7 @@ static int __init xt_rateest_tg_init(void)
 static void __exit xt_rateest_tg_fini(void)
 {
 	xt_unregister_target(&xt_rateest_tg_reg);
+	rcu_barrier(); /* Wait for completion of call_rcu()'s */
 }
 
 
diff --git a/net/sched/act_api.c b/net/sched/act_api.c
index 972378f..597b260 100644
--- a/net/sched/act_api.c
+++ b/net/sched/act_api.c
@@ -26,6 +26,11 @@
 #include <net/act_api.h>
 #include <net/netlink.h>
 
+static void tcf_common_free_rcu(struct rcu_head *head)
+{
+	kfree(container_of(head, struct tcf_common, tcfc_rcu));
+}
+
 void tcf_hash_destroy(struct tcf_common *p, struct tcf_hashinfo *hinfo)
 {
 	unsigned int h = tcf_hash(p->tcfc_index, hinfo->hmask);
@@ -36,9 +41,9 @@ void tcf_hash_destroy(struct tcf_common *p, struct tcf_hashinfo *hinfo)
 			write_lock_bh(hinfo->lock);
 			*p1p = p->tcfc_next;
 			write_unlock_bh(hinfo->lock);
-			gen_kill_estimator(&p->tcfc_bstats,
+			gen_kill_estimator_rcu(&p->tcfc_bstats,
 					   &p->tcfc_rate_est);
-			kfree(p);
+			call_rcu(&p->tcfc_rcu, tcf_common_free_rcu);
 			return;
 		}
 	}
diff --git a/net/sched/act_police.c b/net/sched/act_police.c
index 654f73d..c3afcba 100644
--- a/net/sched/act_police.c
+++ b/net/sched/act_police.c
@@ -97,6 +97,11 @@ nla_put_failure:
 	goto done;
 }
 
+static void tcf_police_free_rcu(struct rcu_head *head)
+{
+	kfree(container_of(head, struct tcf_police, tcf_rcu));
+}
+
 static void tcf_police_destroy(struct tcf_police *p)
 {
 	unsigned int h = tcf_hash(p->tcf_index, POL_TAB_MASK);
@@ -107,13 +112,13 @@ static void tcf_police_destroy(struct tcf_police *p)
 			write_lock_bh(&police_lock);
 			*p1p = p->tcf_next;
 			write_unlock_bh(&police_lock);
-			gen_kill_estimator(&p->tcf_bstats,
-					   &p->tcf_rate_est);
+			gen_kill_estimator_rcu(&p->tcf_bstats,
+					       &p->tcf_rate_est);
 			if (p->tcfp_R_tab)
 				qdisc_put_rtab(p->tcfp_R_tab);
 			if (p->tcfp_P_tab)
 				qdisc_put_rtab(p->tcfp_P_tab);
-			kfree(p);
+			call_rcu(&p->tcf_rcu, tcf_police_free_rcu);
 			return;
 		}
 	}
@@ -397,6 +402,7 @@ static void __exit
 police_cleanup_module(void)
 {
 	tcf_unregister_action(&act_police_ops);
+	rcu_barrier(); /* Wait for completion of call_rcu()'s */
 }
 
 module_init(police_init_module);
diff --git a/net/sched/sch_cbq.c b/net/sched/sch_cbq.c
index 28c01ef..d61b3b3 100644
--- a/net/sched/sch_cbq.c
+++ b/net/sched/sch_cbq.c
@@ -140,6 +140,7 @@ struct cbq_class
 	int			filters;
 
 	struct cbq_class 	*defaults[TC_PRIO_MAX+1];
+	struct rcu_head		rcu;
 };
 
 struct cbq_sched_data
@@ -1671,6 +1672,11 @@ static unsigned long cbq_get(struct Qdisc *sch, u32 classid)
 	return 0;
 }
 
+static void cbq_class_free_rcu(struct rcu_head *head)
+{
+	kfree(container_of(head, struct cbq_class, rcu));
+}
+
 static void cbq_destroy_class(struct Qdisc *sch, struct cbq_class *cl)
 {
 	struct cbq_sched_data *q = qdisc_priv(sch);
@@ -1680,9 +1686,9 @@ static void cbq_destroy_class(struct Qdisc *sch, struct cbq_class *cl)
 	tcf_destroy_chain(&cl->filter_list);
 	qdisc_destroy(cl->q);
 	qdisc_put_rtab(cl->R_tab);
-	gen_kill_estimator(&cl->bstats, &cl->rate_est);
+	gen_kill_estimator_rcu(&cl->bstats, &cl->rate_est);
 	if (cl != &q->link)
-		kfree(cl);
+		call_rcu(&cl->rcu, cbq_class_free_rcu);
 }
 
 static void
@@ -2066,6 +2072,7 @@ static int __init cbq_module_init(void)
 static void __exit cbq_module_exit(void)
 {
 	unregister_qdisc(&cbq_qdisc_ops);
+	rcu_barrier(); /* Wait for completion of call_rcu()'s */
 }
 module_init(cbq_module_init)
 module_exit(cbq_module_exit)
diff --git a/net/sched/sch_drr.c b/net/sched/sch_drr.c
index b74046a..f0d5aae 100644
--- a/net/sched/sch_drr.c
+++ b/net/sched/sch_drr.c
@@ -31,6 +31,7 @@ struct drr_class {
 
 	u32				quantum;
 	u32				deficit;
+	struct rcu_head			rcu;
 };
 
 struct drr_sched {
@@ -136,11 +137,16 @@ static int drr_change_class(struct Qdisc *sch, u32 classid, u32 parentid,
 	return 0;
 }
 
+static void drr_class_free_rcu(struct rcu_head *head)
+{
+	kfree(container_of(head, struct drr_class, rcu));
+}
+
 static void drr_destroy_class(struct Qdisc *sch, struct drr_class *cl)
 {
-	gen_kill_estimator(&cl->bstats, &cl->rate_est);
+	gen_kill_estimator_rcu(&cl->bstats, &cl->rate_est);
 	qdisc_destroy(cl->qdisc);
-	kfree(cl);
+	call_rcu(&cl->rcu, drr_class_free_rcu);
 }
 
 static int drr_delete_class(struct Qdisc *sch, unsigned long arg)
@@ -522,6 +528,7 @@ static int __init drr_init(void)
 static void __exit drr_exit(void)
 {
 	unregister_qdisc(&drr_qdisc_ops);
+	rcu_barrier(); /* Wait for completion of call_rcu()'s */
 }
 
 module_init(drr_init);
diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c
index d20fcd2..22120d4 100644
--- a/net/sched/sch_generic.c
+++ b/net/sched/sch_generic.c
@@ -632,7 +632,7 @@ void qdisc_destroy(struct Qdisc *qdisc)
 
 	qdisc_put_stab(qdisc->stab);
 #endif
-	gen_kill_estimator(&qdisc->bstats, &qdisc->rate_est);
+	gen_kill_estimator_rcu(&qdisc->bstats, &qdisc->rate_est);
 	if (ops->reset)
 		ops->reset(qdisc);
 	if (ops->destroy)
@@ -643,7 +643,7 @@ void qdisc_destroy(struct Qdisc *qdisc)
 
 	kfree_skb(qdisc->gso_skb);
 	/*
-	 * gen_estimator est_timer() might access qdisc->q.lock,
+	 * gen_estimator est_timer() might access qdisc->q.lock or bstats
 	 * wait a RCU grace period before freeing qdisc.
 	 */
 	call_rcu(&qdisc->rcu_head, qdisc_rcu_free);
diff --git a/net/sched/sch_hfsc.c b/net/sched/sch_hfsc.c
index abd904b..c433aa8 100644
--- a/net/sched/sch_hfsc.c
+++ b/net/sched/sch_hfsc.c
@@ -174,6 +174,7 @@ struct hfsc_class
 	unsigned long	cl_vtperiod;	/* vt period sequence number */
 	unsigned long	cl_parentperiod;/* parent's vt period sequence number*/
 	unsigned long	cl_nactive;	/* number of active children */
+	struct rcu_head rcu;
 };
 
 struct hfsc_sched
@@ -1111,6 +1112,11 @@ hfsc_change_class(struct Qdisc *sch, u32 classid, u32 parentid,
 	return 0;
 }
 
+static void hfsc_class_free_rcu(struct rcu_head *head)
+{
+	kfree(container_of(head, struct hfsc_class, rcu));
+}
+
 static void
 hfsc_destroy_class(struct Qdisc *sch, struct hfsc_class *cl)
 {
@@ -1118,9 +1124,9 @@ hfsc_destroy_class(struct Qdisc *sch, struct hfsc_class *cl)
 
 	tcf_destroy_chain(&cl->filter_list);
 	qdisc_destroy(cl->qdisc);
-	gen_kill_estimator(&cl->bstats, &cl->rate_est);
+	gen_kill_estimator_rcu(&cl->bstats, &cl->rate_est);
 	if (cl != &q->root)
-		kfree(cl);
+		call_rcu(&cl->rcu, hfsc_class_free_rcu);
 }
 
 static int
@@ -1742,6 +1748,7 @@ static void __exit
 hfsc_cleanup(void)
 {
 	unregister_qdisc(&hfsc_qdisc_ops);
+	rcu_barrier(); /* Wait for completion of call_rcu()'s */
 }
 
 MODULE_LICENSE("GPL");
diff --git a/net/sched/sch_htb.c b/net/sched/sch_htb.c
index 0b52b8d..b7b5e29 100644
--- a/net/sched/sch_htb.c
+++ b/net/sched/sch_htb.c
@@ -123,6 +123,7 @@ struct htb_class {
 	psched_tdiff_t mbuffer;	/* max wait time */
 	long tokens, ctokens;	/* current number of tokens */
 	psched_time_t t_c;	/* checkpoint time */
+	struct rcu_head rcu;
 };
 
 struct htb_sched {
@@ -1190,18 +1191,23 @@ static void htb_parent_to_leaf(struct htb_sched *q, struct htb_class *cl,
 	parent->cmode = HTB_CAN_SEND;
 }
 
+static void htb_class_free_rcu(struct rcu_head *head)
+{
+	kfree(container_of(head, struct htb_class, rcu));
+}
+
 static void htb_destroy_class(struct Qdisc *sch, struct htb_class *cl)
 {
 	if (!cl->level) {
 		WARN_ON(!cl->un.leaf.q);
 		qdisc_destroy(cl->un.leaf.q);
 	}
-	gen_kill_estimator(&cl->bstats, &cl->rate_est);
+	gen_kill_estimator_rcu(&cl->bstats, &cl->rate_est);
 	qdisc_put_rtab(cl->rate);
 	qdisc_put_rtab(cl->ceil);
 
 	tcf_destroy_chain(&cl->filter_list);
-	kfree(cl);
+	call_rcu(&cl->rcu, htb_class_free_rcu);
 }
 
 static void htb_destroy(struct Qdisc *sch)
@@ -1573,6 +1579,7 @@ static int __init htb_module_init(void)
 static void __exit htb_module_exit(void)
 {
 	unregister_qdisc(&htb_qdisc_ops);
+	rcu_barrier(); /* Wait for completion of call_rcu()'s */
 }
 
 module_init(htb_module_init)



^ permalink raw reply related	[flat|nested] 35+ messages in thread

* [PATCH net-2.6] pkt_sched: gen_estimator: add a new lock
  2010-06-07 16:56       ` [PATCH net-next-2.6 v2] " Eric Dumazet
@ 2010-06-07 17:18         ` Eric Dumazet
  2010-06-08  1:00           ` Changli Gao
  2010-06-09  9:39           ` [PATCH net-2.6 v2] " Eric Dumazet
  2010-06-08 12:15         ` [PATCH net-next-2.6 v2] pkt_sched: gen_estimator: kill est_lock rwlock Jarek Poplawski
  2010-06-09  9:40         ` [PATCH] pkt_sched: gen_kill_estimator() rcu fixes Eric Dumazet
  2 siblings, 2 replies; 35+ messages in thread
From: Eric Dumazet @ 2010-06-07 17:18 UTC (permalink / raw)
  To: Changli Gao, David Miller
  Cc: netdev, Stephen Hemminger, Jarek Poplawski, Patrick McHardy

Le lundi 07 juin 2010 à 18:56 +0200, Eric Dumazet a écrit :
> > 
> > For your information, bug is already there before my patch.
> > 
> > So this est_lock is a wrong protection, in the sense its so convoluted
> > that nobody but you and me even noticed it was buggy in the first place.
> > 
> > (see commit 5d944c640b4 for a first patch)
> > 
> > 
> 
> Here is v2 of the patch.
> 
> Even if its a bug correction, I cooked it for net-next-2.6 since bug
> probably never occured, and patch is too large to be sent to
> net-2.6/linux-2.6 before testing.
> 
> Another bug comes from net/netfilter/xt_RATEEST.c : It apparently
> calls gen_kill_estimator() / gen_new_estimator() without holding RTNL ?
> 
> So we should add another lock to protect things (est_root, elist[], ...)
> 
> David, I can send a net-2.6 patch for this one, since it should be small
> enough. If yes, I'll respin this patch of course ;)

[PATCH net-2.6] pkt_sched: gen_estimator: add a new lock

gen_kill_estimator() / gen_new_estimator() is not always called with
RTNL held.

net/netfilter/xt_RATEEST.c is one user of these API that do not hold
RTNL, so random corruptions can occur between "tc" and "iptables"

Add a new fine grained lock instead of trying to use RTNL in xt_RATEEST

Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
---
 net/core/gen_estimator.c |    7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/net/core/gen_estimator.c b/net/core/gen_estimator.c
index cf8e703..3d11203 100644
--- a/net/core/gen_estimator.c
+++ b/net/core/gen_estimator.c
@@ -107,6 +107,7 @@ static DEFINE_RWLOCK(est_lock);
 
 /* Protects against soft lockup during large deletion */
 static struct rb_root est_root = RB_ROOT;
+static DEFINE_SPINLOCK(est_tree_lock);
 
 static void est_timer(unsigned long arg)
 {
@@ -201,7 +202,6 @@ struct gen_estimator *gen_find_node(const struct gnet_stats_basic_packed *bstats
  *
  * Returns 0 on success or a negative error code.
  *
- * NOTE: Called under rtnl_mutex
  */
 int gen_new_estimator(struct gnet_stats_basic_packed *bstats,
 		      struct gnet_stats_rate_est *rate_est,
@@ -222,6 +222,7 @@ int gen_new_estimator(struct gnet_stats_basic_packed *bstats,
 	if (est == NULL)
 		return -ENOBUFS;
 
+	spin_lock(&est_tree_lock);
 	idx = parm->interval + 2;
 	est->bstats = bstats;
 	est->rate_est = rate_est;
@@ -242,6 +243,7 @@ int gen_new_estimator(struct gnet_stats_basic_packed *bstats,
 
 	list_add_rcu(&est->list, &elist[idx].list);
 	gen_add_node(est);
+	spin_unlock(&est_tree_lock);
 
 	return 0;
 }
@@ -261,13 +263,13 @@ static void __gen_kill_estimator(struct rcu_head *head)
  *
  * Removes the rate estimator specified by &bstats and &rate_est.
  *
- * NOTE: Called under rtnl_mutex
  */
 void gen_kill_estimator(struct gnet_stats_basic_packed *bstats,
 			struct gnet_stats_rate_est *rate_est)
 {
 	struct gen_estimator *e;
 
+	spin_lock(&est_tree_lock);
 	while ((e = gen_find_node(bstats, rate_est))) {
 		rb_erase(&e->node, &est_root);
 
@@ -278,6 +280,7 @@ void gen_kill_estimator(struct gnet_stats_basic_packed *bstats,
 		list_del_rcu(&e->list);
 		call_rcu(&e->e_rcu, __gen_kill_estimator);
 	}
+	spin_unlock(&est_tree_lock);
 }
 EXPORT_SYMBOL(gen_kill_estimator);
 



^ permalink raw reply related	[flat|nested] 35+ messages in thread

* Re: [PATCH net-2.6] pkt_sched: gen_estimator: add a new lock
  2010-06-07 17:18         ` [PATCH net-2.6] pkt_sched: gen_estimator: add a new lock Eric Dumazet
@ 2010-06-08  1:00           ` Changli Gao
  2010-06-08  4:30             ` Eric Dumazet
  2010-06-08  4:58             ` Eric Dumazet
  2010-06-09  9:39           ` [PATCH net-2.6 v2] " Eric Dumazet
  1 sibling, 2 replies; 35+ messages in thread
From: Changli Gao @ 2010-06-08  1:00 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: David Miller, netdev, Stephen Hemminger, Jarek Poplawski,
	Patrick McHardy

On Tue, Jun 8, 2010 at 1:18 AM, Eric Dumazet <eric.dumazet@gmail.com> wrote:
>
>
> [PATCH net-2.6] pkt_sched: gen_estimator: add a new lock
>
> gen_kill_estimator() / gen_new_estimator() is not always called with
> RTNL held.
>
> net/netfilter/xt_RATEEST.c is one user of these API that do not hold
> RTNL, so random corruptions can occur between "tc" and "iptables"
>
> Add a new fine grained lock instead of trying to use RTNL in xt_RATEEST
>

Why not use RTNL in xt_RATEEST? It seems xt_RATEEST misuse the APIs.

and I think gen_replace_estimator is expected to be an atomic operation.

And gen_estimator_active() is also assumed to be called with RTNL locked.

-- 
Regards,
Changli Gao(xiaosuo@gmail.com)

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH net-2.6] pkt_sched: gen_estimator: add a new lock
  2010-06-08  1:00           ` Changli Gao
@ 2010-06-08  4:30             ` Eric Dumazet
  2010-06-08  4:57               ` Changli Gao
  2010-06-08  4:58             ` Eric Dumazet
  1 sibling, 1 reply; 35+ messages in thread
From: Eric Dumazet @ 2010-06-08  4:30 UTC (permalink / raw)
  To: Changli Gao
  Cc: David Miller, netdev, Stephen Hemminger, Jarek Poplawski,
	Patrick McHardy

Le mardi 08 juin 2010 à 09:00 +0800, Changli Gao a écrit :
> On Tue, Jun 8, 2010 at 1:18 AM, Eric Dumazet <eric.dumazet@gmail.com> wrote:
> >
> >
> > [PATCH net-2.6] pkt_sched: gen_estimator: add a new lock
> >
> > gen_kill_estimator() / gen_new_estimator() is not always called with
> > RTNL held.
> >
> > net/netfilter/xt_RATEEST.c is one user of these API that do not hold
> > RTNL, so random corruptions can occur between "tc" and "iptables"
> >
> > Add a new fine grained lock instead of trying to use RTNL in xt_RATEEST
> >
> 
> Why not use RTNL in xt_RATEEST? It seems xt_RATEEST misuse the APIs.
> 
> and I think gen_replace_estimator is expected to be an atomic operation.
> 
> And gen_estimator_active() is also assumed to be called with RTNL locked.
> 

Thank you for asking this question.

Because I want to kill RTNL when possible, I dont even want to try
adding RTNL to xt_RATEEST and solve all lock dependencies it might
raise.

RTNL = Big and Horrible Network LOCK

You never got blocked because of this RTNL thing, dont you ?

I did. And it sucks, because when you hit this, you are in a hurry and
locating the bottleneck takes lot of time.

RTNL is the thing we must hold during device register / unregister.
Its locked for long delays because of all synchronize_rcu() that must be
done, and that is already a big problem on some setups.

Every time someone adds a RTNL requirement, you can be sure another guy
will zap it during following ten years.

Let's do this right now, not later.

For an example of horrible rtnl behavior, take a look at following
construct :

if (!rtnl_trylock())
	return restart_syscall();

I saw hundred of udev looping, trying to get rtnl to dump some
information. (Patrick added a rtnl requirement to all dump operations,
and it sucks)




^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH net-2.6] pkt_sched: gen_estimator: add a new lock
  2010-06-08  4:30             ` Eric Dumazet
@ 2010-06-08  4:57               ` Changli Gao
  0 siblings, 0 replies; 35+ messages in thread
From: Changli Gao @ 2010-06-08  4:57 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: David Miller, netdev, Stephen Hemminger, Jarek Poplawski,
	Patrick McHardy

On Tue, Jun 8, 2010 at 12:30 PM, Eric Dumazet <eric.dumazet@gmail.com> wrote:
>
> Thank you for asking this question.
>
> Because I want to kill RTNL when possible, I dont even want to try
> adding RTNL to xt_RATEEST and solve all lock dependencies it might
> raise.
>
> RTNL = Big and Horrible Network LOCK
>

It is much like the BKL, and should be killed too. Thanks. :)


-- 
Regards,
Changli Gao(xiaosuo@gmail.com)

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH net-2.6] pkt_sched: gen_estimator: add a new lock
  2010-06-08  1:00           ` Changli Gao
  2010-06-08  4:30             ` Eric Dumazet
@ 2010-06-08  4:58             ` Eric Dumazet
  2010-06-08  5:20               ` Changli Gao
  1 sibling, 1 reply; 35+ messages in thread
From: Eric Dumazet @ 2010-06-08  4:58 UTC (permalink / raw)
  To: Changli Gao
  Cc: David Miller, netdev, Stephen Hemminger, Jarek Poplawski,
	Patrick McHardy

Le mardi 08 juin 2010 à 09:00 +0800, Changli Gao a écrit :

> and I think gen_replace_estimator is expected to be an atomic operation.
> 
> And gen_estimator_active() is also assumed to be called with RTNL locked.
> 

My patch fixes a bug of new/kill operators, regardless of RTNL being
held or not. Its should be small enough to be included in linux-2.6.35.

If what you say is right, all gen_replace_estimator() /
gen_estimator_active() callers should still holds RTNL.
I didnt change this part.
If you believe one caller doesnt hold RTNL, please submit another patch.

Then, in net-next-2.6, we can probably cleanup this to remove RTNL
requirement if possible for gen_replace_estimator() /
gen_estimator_active()

Yes, it sounds a bit difficult (three patches instead of a single one),
but this is the how things should be done, step by step.



^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH net-2.6] pkt_sched: gen_estimator: add a new lock
  2010-06-08  4:58             ` Eric Dumazet
@ 2010-06-08  5:20               ` Changli Gao
  2010-06-08  5:39                 ` Eric Dumazet
  0 siblings, 1 reply; 35+ messages in thread
From: Changli Gao @ 2010-06-08  5:20 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: David Miller, netdev, Stephen Hemminger, Jarek Poplawski,
	Patrick McHardy

On Tue, Jun 8, 2010 at 12:58 PM, Eric Dumazet <eric.dumazet@gmail.com> wrote:
> Le mardi 08 juin 2010 à 09:00 +0800, Changli Gao a écrit :
>
>> and I think gen_replace_estimator is expected to be an atomic operation.
>>
>> And gen_estimator_active() is also assumed to be called with RTNL locked.
>>
>
> My patch fixes a bug of new/kill operators, regardless of RTNL being
> held or not. Its should be small enough to be included in linux-2.6.35.
>
> If what you say is right, all gen_replace_estimator() /
> gen_estimator_active() callers should still holds RTNL.
> I didnt change this part.
> If you believe one caller doesnt hold RTNL, please submit another patch.
>
> Then, in net-next-2.6, we can probably cleanup this to remove RTNL
> requirement if possible for gen_replace_estimator() /
> gen_estimator_active()
>
> Yes, it sounds a bit difficult (three patches instead of a single one),
> but this is the how things should be done, step by step.
>

IMO, this bug should be fixed by adding rtnl_lock to xt_RATEEST.c.
Killing rtnl should be done in separated patches. They are different
things. Your patch introduces another locks, and it is extra overhead
for other users.

-- 
Regards,
Changli Gao(xiaosuo@gmail.com)

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH net-2.6] pkt_sched: gen_estimator: add a new lock
  2010-06-08  5:20               ` Changli Gao
@ 2010-06-08  5:39                 ` Eric Dumazet
  0 siblings, 0 replies; 35+ messages in thread
From: Eric Dumazet @ 2010-06-08  5:39 UTC (permalink / raw)
  To: Changli Gao
  Cc: David Miller, netdev, Stephen Hemminger, Jarek Poplawski,
	Patrick McHardy

Le mardi 08 juin 2010 à 13:20 +0800, Changli Gao a écrit :

> IMO, this bug should be fixed by adding rtnl_lock to xt_RATEEST.c.
> Killing rtnl should be done in separated patches. They are different
> things. Your patch introduces another locks, and it is extra overhead
> for other users.
> 

extra overhead, in new/kill estimators ? 

Are you kidding ?

RTNL is taken, taking an extra-uncontended spinlock is free.

Nope, I wont add rtnl lock to xt_RATEEST.c

I believe you dont really understood what I patiently explained to you.

Thats becoming rediculous.




^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH net-next-2.6 v2] pkt_sched: gen_estimator: kill est_lock rwlock
  2010-06-07 16:56       ` [PATCH net-next-2.6 v2] " Eric Dumazet
  2010-06-07 17:18         ` [PATCH net-2.6] pkt_sched: gen_estimator: add a new lock Eric Dumazet
@ 2010-06-08 12:15         ` Jarek Poplawski
  2010-06-08 12:27           ` Eric Dumazet
  2010-06-09  9:40         ` [PATCH] pkt_sched: gen_kill_estimator() rcu fixes Eric Dumazet
  2 siblings, 1 reply; 35+ messages in thread
From: Jarek Poplawski @ 2010-06-08 12:15 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: Changli Gao, David Miller, netdev, Stephen Hemminger,
	Patrick McHardy

Eric Dumazet wrote:
...
> [PATCH net-next-2.6] pkt_sched: gen_kill_estimator() rcu fixes
> 
> gen_kill_estimator() API is fundamentaly wrong, since caller should make
> sure an RCU grace period is respected before freeing bstats or lock.
> 
> This was partially addressed in commit 5d944c640b4 (gen_estimator:
> deadlock fix), but same problem exist for all gen_kill_estimator()
> users.
> 
> Change its name to gen_kill_estimator_rcu() and change all callers to
> use RCU grace period before freeing bstats container.
> 
> As a bonus, est_lock rwlock can be killed for good.
> 
> Special thanks to Changli Gao for commenting on a previous patch, this
> made this bug clear to me.
> 

Actually, I guess, Changli meant the bug introduced by your previous
patch by removing the est_lock. With this lock (and your commit 5d944)
bstats (and API) seem "fundamentaly" safe.

Jarek P.

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH net-next-2.6 v2] pkt_sched: gen_estimator: kill est_lock rwlock
  2010-06-08 12:15         ` [PATCH net-next-2.6 v2] pkt_sched: gen_estimator: kill est_lock rwlock Jarek Poplawski
@ 2010-06-08 12:27           ` Eric Dumazet
  2010-06-08 12:40             ` Jarek Poplawski
  0 siblings, 1 reply; 35+ messages in thread
From: Eric Dumazet @ 2010-06-08 12:27 UTC (permalink / raw)
  To: Jarek Poplawski
  Cc: Changli Gao, David Miller, netdev, Stephen Hemminger,
	Patrick McHardy

Le mardi 08 juin 2010 à 12:15 +0000, Jarek Poplawski a écrit :

> 
> Actually, I guess, Changli meant the bug introduced by your previous
> patch by removing the est_lock. With this lock (and your commit 5d944)
> bstats (and API) seem "fundamentaly" safe.
> 

Sorry, I have no idea of what you want to say, I cant find commit 5d944.




^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH net-next-2.6 v2] pkt_sched: gen_estimator: kill est_lock rwlock
  2010-06-08 12:27           ` Eric Dumazet
@ 2010-06-08 12:40             ` Jarek Poplawski
  2010-06-08 19:29               ` Jarek Poplawski
  0 siblings, 1 reply; 35+ messages in thread
From: Jarek Poplawski @ 2010-06-08 12:40 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: Changli Gao, David Miller, netdev, Stephen Hemminger,
	Patrick McHardy

On Tue, Jun 08, 2010 at 02:27:32PM +0200, Eric Dumazet wrote:
> Le mardi 08 juin 2010 ?? 12:15 +0000, Jarek Poplawski a écrit :
> 
> > 
> > Actually, I guess, Changli meant the bug introduced by your previous
> > patch by removing the est_lock. With this lock (and your commit 5d944)
> > bstats (and API) seem "fundamentaly" safe.
> > 
> 
> Sorry, I have no idea of what you want to say, I cant find commit 5d944.
> 

Sorry, I meant the commit mentioned in your changelog which was quoted.

> This was partially addressed in commit 5d944c640b4 (gen_estimator:
> deadlock fix), but same problem exist for all gen_kill_estimator()
> users.

Jarek P.

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH net-next-2.6 v2] pkt_sched: gen_estimator: kill est_lock rwlock
  2010-06-08 12:40             ` Jarek Poplawski
@ 2010-06-08 19:29               ` Jarek Poplawski
  2010-06-08 19:45                 ` Eric Dumazet
  0 siblings, 1 reply; 35+ messages in thread
From: Jarek Poplawski @ 2010-06-08 19:29 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: Changli Gao, David Miller, netdev, Stephen Hemminger,
	Patrick McHardy

Jarek Poplawski wrote, On 06/08/2010 02:40 PM:

> On Tue, Jun 08, 2010 at 02:27:32PM +0200, Eric Dumazet wrote:
>> Le mardi 08 juin 2010 ?? 12:15 +0000, Jarek Poplawski a écrit :
>>
>>> Actually, I guess, Changli meant the bug introduced by your previous
>>> patch by removing the est_lock. With this lock (and your commit 5d944)
>>> bstats (and API) seem "fundamentaly" safe.
>>>
>> Sorry, I have no idea of what you want to say, I cant find commit 5d944.
>>
> 
> Sorry, I meant the commit mentioned in your changelog which was quoted.
> 
>> This was partially addressed in commit 5d944c640b4 (gen_estimator:
>> deadlock fix), but same problem exist for all gen_kill_estimator()
>> users.

In case it's still unclear, I wanted to say that IMHO this patch's
title and changelog are misleading because most of its content are
est_lock to RCU changes - not fixes.

>> [PATCH net-next-2.6] pkt_sched: gen_kill_estimator() rcu fixes
>> 
>> gen_kill_estimator() API is fundamentaly wrong, since caller should make
>> sure an RCU grace period is respected before freeing bstats or lock.

Freeing bstats doesn't currently require RCU protection.

Jarek P.

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH net-next-2.6 v2] pkt_sched: gen_estimator: kill est_lock rwlock
  2010-06-08 19:29               ` Jarek Poplawski
@ 2010-06-08 19:45                 ` Eric Dumazet
  2010-06-08 20:24                   ` Jarek Poplawski
  0 siblings, 1 reply; 35+ messages in thread
From: Eric Dumazet @ 2010-06-08 19:45 UTC (permalink / raw)
  To: Jarek Poplawski
  Cc: Changli Gao, David Miller, netdev, Stephen Hemminger,
	Patrick McHardy

Le mardi 08 juin 2010 à 21:29 +0200, Jarek Poplawski a écrit :
> Jarek Poplawski wrote, On 06/08/2010 02:40 PM:
> 
> > On Tue, Jun 08, 2010 at 02:27:32PM +0200, Eric Dumazet wrote:
> >> Le mardi 08 juin 2010 ?? 12:15 +0000, Jarek Poplawski a écrit :
> >>
> >>> Actually, I guess, Changli meant the bug introduced by your previous
> >>> patch by removing the est_lock. With this lock (and your commit 5d944)
> >>> bstats (and API) seem "fundamentaly" safe.
> >>>
> >> Sorry, I have no idea of what you want to say, I cant find commit 5d944.
> >>
> > 
> > Sorry, I meant the commit mentioned in your changelog which was quoted.
> > 
> >> This was partially addressed in commit 5d944c640b4 (gen_estimator:
> >> deadlock fix), but same problem exist for all gen_kill_estimator()
> >> users.
> 
> In case it's still unclear, I wanted to say that IMHO this patch's
> title and changelog are misleading because most of its content are
> est_lock to RCU changes - not fixes.
> 

...

> >> [PATCH net-next-2.6] pkt_sched: gen_kill_estimator() rcu fixes
> >> 
> >> gen_kill_estimator() API is fundamentaly wrong, since caller should make
> >> sure an RCU grace period is respected before freeing bstats or lock.
> 
> Freeing bstats doesn't currently require RCU protection.
> 
> Jarek P.

So what ? No changes needed ?

I am really lost by your comments Jarek.

Maybe you could provide an alternative patch, so that we can make some
progress ?




^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH net-next-2.6 v2] pkt_sched: gen_estimator: kill est_lock rwlock
  2010-06-08 19:45                 ` Eric Dumazet
@ 2010-06-08 20:24                   ` Jarek Poplawski
  2010-06-08 20:52                     ` Eric Dumazet
  0 siblings, 1 reply; 35+ messages in thread
From: Jarek Poplawski @ 2010-06-08 20:24 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: Changli Gao, David Miller, netdev, Stephen Hemminger,
	Patrick McHardy

On Tue, Jun 08, 2010 at 09:45:29PM +0200, Eric Dumazet wrote:
> Le mardi 08 juin 2010 ?? 21:29 +0200, Jarek Poplawski a écrit :
> > Jarek Poplawski wrote, On 06/08/2010 02:40 PM:
> > 
> > > On Tue, Jun 08, 2010 at 02:27:32PM +0200, Eric Dumazet wrote:
> > >> Le mardi 08 juin 2010 ?? 12:15 +0000, Jarek Poplawski a écrit :
> > >>
> > >>> Actually, I guess, Changli meant the bug introduced by your previous
> > >>> patch by removing the est_lock. With this lock (and your commit 5d944)
> > >>> bstats (and API) seem "fundamentaly" safe.
> > >>>
> > >> Sorry, I have no idea of what you want to say, I cant find commit 5d944.
> > >>
> > > 
> > > Sorry, I meant the commit mentioned in your changelog which was quoted.
> > > 
> > >> This was partially addressed in commit 5d944c640b4 (gen_estimator:
> > >> deadlock fix), but same problem exist for all gen_kill_estimator()
> > >> users.
> > 
> > In case it's still unclear, I wanted to say that IMHO this patch's
> > title and changelog are misleading because most of its content are
> > est_lock to RCU changes - not fixes.
> > 
> 
> ...
> 
> > >> [PATCH net-next-2.6] pkt_sched: gen_kill_estimator() rcu fixes
> > >> 
> > >> gen_kill_estimator() API is fundamentaly wrong, since caller should make
> > >> sure an RCU grace period is respected before freeing bstats or lock.
> > 
> > Freeing bstats doesn't currently require RCU protection.
> > 
> > Jarek P.
> 
> So what ? No changes needed ?

Such a changelog is a documentation for future, just like this one,
crucial for this suject:
http://git.kernel.org/?p=linux/kernel/git/torvalds/linux-2.6.git;a=commit;h=0929c2dd83317813425b937fbc0041013b8685ff

There is no reason to make people (our?) life harder with describing
unexistent bugs.

> 
> I am really lost by your comments Jarek.
> 
> Maybe you could provide an alternative patch, so that we can make some
> progress ?
> 

No, I only meant fixing the title and changelog if you have any problem
with separating it for two parts.

Jarek P.

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH net-next-2.6 v2] pkt_sched: gen_estimator: kill est_lock rwlock
  2010-06-08 20:24                   ` Jarek Poplawski
@ 2010-06-08 20:52                     ` Eric Dumazet
  2010-06-08 21:18                       ` Jarek Poplawski
  2010-06-09  6:13                       ` pkt_sched: gen_estimator: more fuel for Jarek and Changli Eric Dumazet
  0 siblings, 2 replies; 35+ messages in thread
From: Eric Dumazet @ 2010-06-08 20:52 UTC (permalink / raw)
  To: Jarek Poplawski
  Cc: Changli Gao, David Miller, netdev, Stephen Hemminger,
	Patrick McHardy


> Such a changelog is a documentation for future, just like this one,
> crucial for this suject:
> http://git.kernel.org/?p=linux/kernel/git/torvalds/linux-2.6.git;a=commit;h=0929c2dd83317813425b937fbc0041013b8685ff
> 
> There is no reason to make people (our?) life harder with describing
> unexistent bugs.
> 

Now you are telling me there is no bug, and I am lying ?

> > 
> > I am really lost by your comments Jarek.
> > 
> > Maybe you could provide an alternative patch, so that we can make some
> > progress ?
> > 
> 
> No, I only meant fixing the title and changelog if you have any problem
> with separating it for two parts.

I am suggesting you rewrite this Changelog, as being part of the patch,
since I still miss your point, sorry, I must be very dumb tonight.

Then, add your Signed-off-by.

This is normal iterative process.



^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH net-next-2.6 v2] pkt_sched: gen_estimator: kill est_lock rwlock
  2010-06-08 20:52                     ` Eric Dumazet
@ 2010-06-08 21:18                       ` Jarek Poplawski
  2010-06-09  6:13                       ` pkt_sched: gen_estimator: more fuel for Jarek and Changli Eric Dumazet
  1 sibling, 0 replies; 35+ messages in thread
From: Jarek Poplawski @ 2010-06-08 21:18 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: Changli Gao, David Miller, netdev, Stephen Hemminger,
	Patrick McHardy

On Tue, Jun 08, 2010 at 10:52:34PM +0200, Eric Dumazet wrote:
> 
> > Such a changelog is a documentation for future, just like this one,
> > crucial for this suject:
> > http://git.kernel.org/?p=linux/kernel/git/torvalds/linux-2.6.git;a=commit;h=0929c2dd83317813425b937fbc0041013b8685ff
> > 
> > There is no reason to make people (our?) life harder with describing
> > unexistent bugs.
> > 
> 
> Now you are telling me there is no bug, and I am lying ?
> 

I didn't question freeing stats_lock (great catch BTW!).

> > > 
> > > I am really lost by your comments Jarek.
> > > 
> > > Maybe you could provide an alternative patch, so that we can make some
> > > progress ?
> > > 
> > 
> > No, I only meant fixing the title and changelog if you have any problem
> > with separating it for two parts.
> 
> I am suggesting you rewrite this Changelog, as being part of the patch,
> since I still miss your point, sorry, I must be very dumb tonight.
> 
> Then, add your Signed-off-by.
> 
> This is normal iterative process.
> 

No understado - no Englisch ;-)

Sorry,
Jarek P.

^ permalink raw reply	[flat|nested] 35+ messages in thread

* pkt_sched: gen_estimator: more fuel for Jarek and Changli
  2010-06-08 20:52                     ` Eric Dumazet
  2010-06-08 21:18                       ` Jarek Poplawski
@ 2010-06-09  6:13                       ` Eric Dumazet
  2010-06-09  6:51                         ` Jarek Poplawski
  1 sibling, 1 reply; 35+ messages in thread
From: Eric Dumazet @ 2010-06-09  6:13 UTC (permalink / raw)
  To: Jarek Poplawski
  Cc: Changli Gao, David Miller, netdev, Stephen Hemminger,
	Patrick McHardy


With un-modified kernel, I ran following scripts on my machine

taskset 01 sh -c "while :;do iptables  -I INPUT -i eth0 -j RATEEST --rateest-name eth0 --rateest-interval 250ms --rateest-ewmalog 1000ms; done" &
taskset 02 sh -c "while :;do iptables  -F INPUT; done" &
taskset 02 sh -c "while :;do tc qdisc del dev eth0 root 2>/dev/null;done" &
taskset 08 sh -c "while :;do tc qdisc add dev eth0 root handle 1: est 250msec 1sec cbq avpkt 1000 rate 1000Mbit bandwidth 1000Mbit 2>/dev/null;done" &


I got following oops in about 10 seconds, and my machine had to be
rebooted, rtnl being locked forever, so many commands block hard in
rtnl_lock()

root      6016  0.0  0.0  2040  536 pts/0    D    07:14   0:00 tc qdisc del dev eth0 root
root      6021  0.0  0.0  2040  676 pts/0    D    07:14   0:00 tc qdisc add dev eth0 root handle 1: est 250msec 1sec cbq avpkt 1000 rate 1
root     19358  0.0  0.0  1752  252 ?        D    07:45   0:00 ip -o link ls dev eth0

[  753.892107] BUG: unable to handle kernel NULL pointer dereference at (null)
[  753.892132] IP: [<c116b6c8>] rb_insert_color+0xc6/0xd0
[  753.892156] *pdpt = 0000000032827001 *pde = 0000000000000000 
[  753.892177] Oops: 0002 [#1] PREEMPT SMP 
[  753.892196] last sysfs file: /sys/devices/pci0000:00/0000:00:1e.0/0000:01:04.6/class
[  753.892218] Modules linked in: xt_RATEEST iptable_filter ip_tables x_tables ipmi_devintf ipmi_si ipmi_msghandler ipv6 dm_mod button battery ac ehci_hcd uhci_hcd tg3 libphy bnx2x crc32c libcrc32c mdio [last unloaded: x_tables]
[  753.892314] 
[  753.892321] Pid: 5951, comm: tc Not tainted 2.6.35-rc1-00208-g50e3a9a #68 /ProLiant BL460c G6
[  753.892341] EIP: 0060:[<c116b6c8>] EFLAGS: 00010202 CPU: 3
[  753.892356] EIP is at rb_insert_color+0xc6/0xd0
[  753.892368] EAX: 00000000 EBX: f34c1750 ECX: f34c1750 EDX: c1b5a1bc
[  753.892384] ESI: 00000001 EDI: f34c1ae0 EBP: f34a0c0c ESP: f34a0bf8
[  753.892399]  DS: 007b ES: 007b FS: 00d8 GS: 0033 SS: 0068
[  753.892413] Process tc (pid: 5951, ti=f34a0000 task=f43f2ac0 task.ti=f34a0000)
[  753.892430] Stack:
[  753.892465]  c1292899 c1b5a1bc f34c1aa8 f3ae47f4 f36baf78 f34a0c34 c1292a66 f36baf5c
[  753.892524] <0> 00000098 d8d43110 f36baf2c 00000000 f36baf00 f34a0ca0 00000000 f34a0c6c
[  753.892598] <0> c12aa80c d8d4310c c16ba5a0 00000000 f4160000 c1561fa0 f43f2a00 00000000
[  753.892681] Call Trace:
[  753.892707]  [<c1292899>] ? gen_new_estimator+0x55/0x247
[  753.892736]  [<c1292a66>] ? gen_new_estimator+0x222/0x247
[  753.892765]  [<c12aa80c>] ? qdisc_create+0x1e4/0x273
[  753.892793]  [<c12aabd8>] ? tc_modify_qdisc+0x33d/0x3be
[  753.892822]  [<c12aa89b>] ? tc_modify_qdisc+0x0/0x3be
[  753.892850]  [<c12a1c10>] ? rtnetlink_rcv_msg+0x197/0x1a6
[  753.892880]  [<c132d454>] ? mutex_lock_nested+0x26e/0x288
[  753.892909]  [<c12a1a79>] ? rtnetlink_rcv_msg+0x0/0x1a6
[  753.892938]  [<c12c74ec>] ? netlink_rcv_skb+0x32/0x73
[  753.892966]  [<c12a1a00>] ? rtnetlink_rcv+0x1b/0x22
[  753.892993]  [<c12c7045>] ? netlink_unicast+0x1b3/0x214
[  753.893021]  [<c12c72dc>] ? netlink_sendmsg+0x236/0x243
[  753.893050]  [<c1288262>] ? sock_sendmsg+0xc0/0xdb
[  753.893080]  [<c109f15a>] ? might_fault+0x36/0x70
[  753.893107]  [<c109f15a>] ? might_fault+0x36/0x70
[  753.893134]  [<c109f15a>] ? might_fault+0x36/0x70
[  753.893161]  [<c116f330>] ? _copy_from_user+0x39/0x4d
[  753.893189]  [<c1290a91>] ? verify_iovec+0x3e/0x6d
[  753.893217]  [<c1289b89>] ? sys_sendmsg+0x13f/0x18c
[  753.893244]  [<c12882cd>] ? sockfd_lookup_light+0x19/0x4b
[  753.893274]  [<c1094dea>] ? __lru_cache_add+0x64/0x7b
[  753.893302]  [<c102a200>] ? get_parent_ip+0x9/0x31
[  753.893332]  [<c105a62b>] ? lock_release_non_nested+0x88/0x245
[  753.893362]  [<c109f15a>] ? might_fault+0x36/0x70
[  753.893389]  [<c109f15a>] ? might_fault+0x36/0x70
[  753.893415]  [<c109f15a>] ? might_fault+0x36/0x70
[  753.893443]  [<c1289f62>] ? sys_socketcall+0x163/0x1a3
[  753.893472]  [<c116edd0>] ? trace_hardirqs_on_thunk+0xc/0x10
[  753.893501]  [<c100278c>] ? sysenter_do_call+0x12/0x32
[  753.893537] Code: cb 83 0b 01 89 f0 83 26 fe 8b 55 f0 e8 8e fe ff ff 8b 1f 83 e3 fc 74 0e 8b 33 f7 c6 01 00 00 00 0f 84 61 ff ff ff 8b 55 f0 8b 02 <83> 08 01 58 5a 5b 5e 5f 5d c3 55 89 e5 57 56 89 d6 53 89 c3 83 
[  753.893763] EIP: [<c116b6c8>] rb_insert_color+0xc6/0xd0 SS:ESP 0068:f34a0bf8
[  753.893799] CR2: 0000000000000000
[  753.894062] ---[ end trace da6bae989b9be023 ]---



Triggering the other bug is more difficult :

est_timer() should be interrupted 
(by hard irqs for example), right before spin_lock(e->stats_lock);

Then a caller of gen_kill_estimator() might freed stats_lock and 
est_timer() reference a freed spinlock.

This can be simulated with following patch, to inject a 100 ms delay.

diff --git a/net/core/gen_estimator.c b/net/core/gen_estimator.c
index cf8e703..55ba060 100644
--- a/net/core/gen_estimator.c
+++ b/net/core/gen_estimator.c
@@ -120,6 +120,8 @@ static void est_timer(unsigned long arg)
                u32 npackets;
                u32 rate;
 
+               for (rate = 0; rate < 100; rate++)
+                       udelay(1000);
                spin_lock(e->stats_lock);
                read_lock(&est_lock);
                if (e->bstats == NULL)

My machine crash almost instantly in spin_lock(e->stats_lock)

I'll post v3 of the patch, with updated Changelog



^ permalink raw reply related	[flat|nested] 35+ messages in thread

* Re: pkt_sched: gen_estimator: more fuel for Jarek and Changli
  2010-06-09  6:13                       ` pkt_sched: gen_estimator: more fuel for Jarek and Changli Eric Dumazet
@ 2010-06-09  6:51                         ` Jarek Poplawski
  2010-06-09  7:36                           ` Eric Dumazet
  0 siblings, 1 reply; 35+ messages in thread
From: Jarek Poplawski @ 2010-06-09  6:51 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: Changli Gao, David Miller, netdev, Stephen Hemminger,
	Patrick McHardy

On Wed, Jun 09, 2010 at 08:13:17AM +0200, Eric Dumazet wrote:
> 
> With un-modified kernel, I ran following scripts on my machine

Why not modified with your other patch quite obviously needed by
rateest?:

> [PATCH net-2.6] pkt_sched: gen_estimator: add a new lock
> 
> gen_kill_estimator() / gen_new_estimator() is not always called with
> RTNL held.

Btw, I agree with Changli that adding RTNL to rateest (if possible),
and doing the RTNL replacement later, seems more proper.

Jarek P.

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: pkt_sched: gen_estimator: more fuel for Jarek and Changli
  2010-06-09  6:51                         ` Jarek Poplawski
@ 2010-06-09  7:36                           ` Eric Dumazet
  2010-06-09  8:14                             ` Jarek Poplawski
  0 siblings, 1 reply; 35+ messages in thread
From: Eric Dumazet @ 2010-06-09  7:36 UTC (permalink / raw)
  To: Jarek Poplawski
  Cc: Changli Gao, David Miller, netdev, Stephen Hemminger,
	Patrick McHardy

Le mercredi 09 juin 2010 à 06:51 +0000, Jarek Poplawski a écrit :
> On Wed, Jun 09, 2010 at 08:13:17AM +0200, Eric Dumazet wrote:
> > 
> > With un-modified kernel, I ran following scripts on my machine
> 
> Why not modified with your other patch quite obviously needed by
> rateest?:
> 

First patch is obvious, but I cooked same script to trigger both bugs,
because you needed some evidences.

> > [PATCH net-2.6] pkt_sched: gen_estimator: add a new lock
> > 
> > gen_kill_estimator() / gen_new_estimator() is not always called with
> > RTNL held.
> 
> Btw, I agree with Changli that adding RTNL to rateest (if possible),
> and doing the RTNL replacement later, seems more proper.
> 

I wont be the guy adding RTNL to netfilter, thats for sure. That would
be a step backward. 
Sometimes, the 'obvious' fix is also a dumb one.
Do you really think I didnt had this idea too ?

xt_RATEEST is an unfortunate domain intersection (netfilter / sched).

We can solve this using a fine grained lock, instead of interesting
lockdep issues, yet to be discovered.

You can submit your patch, but I wont Ack it, I'll Nack it for all these
reasons.

Why dont we remove all locks we have 'because we can use RTNL and be
with it' ?

qdisc_mod_lock could be removed quite instantly, qdisc_base could be
protected by RTNL... And so on...




^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: pkt_sched: gen_estimator: more fuel for Jarek and Changli
  2010-06-09  7:36                           ` Eric Dumazet
@ 2010-06-09  8:14                             ` Jarek Poplawski
  0 siblings, 0 replies; 35+ messages in thread
From: Jarek Poplawski @ 2010-06-09  8:14 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: Changli Gao, David Miller, netdev, Stephen Hemminger,
	Patrick McHardy

On Wed, Jun 09, 2010 at 09:36:04AM +0200, Eric Dumazet wrote:
> Le mercredi 09 juin 2010 ?? 06:51 +0000, Jarek Poplawski a écrit :
> > On Wed, Jun 09, 2010 at 08:13:17AM +0200, Eric Dumazet wrote:
> > > 
> > > With un-modified kernel, I ran following scripts on my machine
> > 
> > Why not modified with your other patch quite obviously needed by
> > rateest?:
> > 
> 
> First patch is obvious, but I cooked same script to trigger both bugs,
> because you needed some evidences.
> 
> > > [PATCH net-2.6] pkt_sched: gen_estimator: add a new lock
> > > 
> > > gen_kill_estimator() / gen_new_estimator() is not always called with
> > > RTNL held.
> > 
> > Btw, I agree with Changli that adding RTNL to rateest (if possible),
> > and doing the RTNL replacement later, seems more proper.
> > 
> 
> I wont be the guy adding RTNL to netfilter, thats for sure. That would
> be a step backward. 
> Sometimes, the 'obvious' fix is also a dumb one.
> Do you really think I didnt had this idea too ?
> 
> xt_RATEEST is an unfortunate domain intersection (netfilter / sched).
> 
> We can solve this using a fine grained lock, instead of interesting
> lockdep issues, yet to be discovered.
> 
> You can submit your patch, but I wont Ack it, I'll Nack it for all these
> reasons.

I'm not going to submit my patch. The reason for using the RTNL is
the current gen_estimator API. If rateest were done properly it would
have to use it, and I'm astonished Patrick missed it, since it took
place just around this API fix (with Patrick's part) with the patch I
mentioned:
http://git.kernel.org/?p=linux/kernel/git/torvalds/linux-2.6.git;a=commit;h=0929c2dd83317813425b937fbc0041013b8685ff

So, the best would be to get Patrick's opinion. Otherwise, of course
your patch is the next alternative.

> 
> Why dont we remove all locks we have 'because we can use RTNL and be
> with it' ?
> 
> qdisc_mod_lock could be removed quite instantly, qdisc_base could be
> protected by RTNL... And so on...
> 

Yes, but it should be done independently from fixing bugs.

Jarek P.

^ permalink raw reply	[flat|nested] 35+ messages in thread

* [PATCH net-2.6 v2] pkt_sched: gen_estimator: add a new lock
  2010-06-07 17:18         ` [PATCH net-2.6] pkt_sched: gen_estimator: add a new lock Eric Dumazet
  2010-06-08  1:00           ` Changli Gao
@ 2010-06-09  9:39           ` Eric Dumazet
  2010-06-09 11:33             ` Jarek Poplawski
  2010-06-11  5:54             ` David Miller
  1 sibling, 2 replies; 35+ messages in thread
From: Eric Dumazet @ 2010-06-09  9:39 UTC (permalink / raw)
  To: Changli Gao, David Miller
  Cc: netdev, Stephen Hemminger, Jarek Poplawski, Patrick McHardy

Le lundi 07 juin 2010 à 19:18 +0200, Eric Dumazet a écrit :


> [PATCH net-2.6] pkt_sched: gen_estimator: add a new lock
> 
> gen_kill_estimator() / gen_new_estimator() is not always called with
> RTNL held.
> 
> net/netfilter/xt_RATEEST.c is one user of these API that do not hold
> RTNL, so random corruptions can occur between "tc" and "iptables"
> 
> Add a new fine grained lock instead of trying to use RTNL in xt_RATEEST


Here is v2 of patch, adding protection in gen_estimator_active() as
well.

[PATCH net-2.6 v2] pkt_sched: gen_estimator: add a new lock

gen_kill_estimator() / gen_new_estimator() is not always called with
RTNL held.

net/netfilter/xt_RATEEST.c is one user of these API that do not hold
RTNL, so random corruptions can occur between "tc" and "iptables".

Add a new fine grained lock instead of trying to use RTNL in netfilter.

Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
---
 net/core/gen_estimator.c |   15 ++++++++++++---
 1 file changed, 12 insertions(+), 3 deletions(-)

diff --git a/net/core/gen_estimator.c b/net/core/gen_estimator.c
index cf8e703..785e527 100644
--- a/net/core/gen_estimator.c
+++ b/net/core/gen_estimator.c
@@ -107,6 +107,7 @@ static DEFINE_RWLOCK(est_lock);
 
 /* Protects against soft lockup during large deletion */
 static struct rb_root est_root = RB_ROOT;
+static DEFINE_SPINLOCK(est_tree_lock);
 
 static void est_timer(unsigned long arg)
 {
@@ -201,7 +202,6 @@ struct gen_estimator *gen_find_node(const struct gnet_stats_basic_packed *bstats
  *
  * Returns 0 on success or a negative error code.
  *
- * NOTE: Called under rtnl_mutex
  */
 int gen_new_estimator(struct gnet_stats_basic_packed *bstats,
 		      struct gnet_stats_rate_est *rate_est,
@@ -232,6 +232,7 @@ int gen_new_estimator(struct gnet_stats_basic_packed *bstats,
 	est->last_packets = bstats->packets;
 	est->avpps = rate_est->pps<<10;
 
+	spin_lock(&est_tree_lock);
 	if (!elist[idx].timer.function) {
 		INIT_LIST_HEAD(&elist[idx].list);
 		setup_timer(&elist[idx].timer, est_timer, idx);
@@ -242,6 +243,7 @@ int gen_new_estimator(struct gnet_stats_basic_packed *bstats,
 
 	list_add_rcu(&est->list, &elist[idx].list);
 	gen_add_node(est);
+	spin_unlock(&est_tree_lock);
 
 	return 0;
 }
@@ -261,13 +263,13 @@ static void __gen_kill_estimator(struct rcu_head *head)
  *
  * Removes the rate estimator specified by &bstats and &rate_est.
  *
- * NOTE: Called under rtnl_mutex
  */
 void gen_kill_estimator(struct gnet_stats_basic_packed *bstats,
 			struct gnet_stats_rate_est *rate_est)
 {
 	struct gen_estimator *e;
 
+	spin_lock(&est_tree_lock);
 	while ((e = gen_find_node(bstats, rate_est))) {
 		rb_erase(&e->node, &est_root);
 
@@ -278,6 +280,7 @@ void gen_kill_estimator(struct gnet_stats_basic_packed *bstats,
 		list_del_rcu(&e->list);
 		call_rcu(&e->e_rcu, __gen_kill_estimator);
 	}
+	spin_unlock(&est_tree_lock);
 }
 EXPORT_SYMBOL(gen_kill_estimator);
 
@@ -312,8 +315,14 @@ EXPORT_SYMBOL(gen_replace_estimator);
 bool gen_estimator_active(const struct gnet_stats_basic_packed *bstats,
 			  const struct gnet_stats_rate_est *rate_est)
 {
+	bool res;
+
 	ASSERT_RTNL();
 
-	return gen_find_node(bstats, rate_est) != NULL;
+	spin_lock(&est_tree_lock);
+	res = gen_find_node(bstats, rate_est) != NULL;
+	spin_unlock(&est_tree_lock);
+
+	return res;
 }
 EXPORT_SYMBOL(gen_estimator_active);



^ permalink raw reply related	[flat|nested] 35+ messages in thread

* [PATCH] pkt_sched: gen_kill_estimator() rcu fixes
  2010-06-07 16:56       ` [PATCH net-next-2.6 v2] " Eric Dumazet
  2010-06-07 17:18         ` [PATCH net-2.6] pkt_sched: gen_estimator: add a new lock Eric Dumazet
  2010-06-08 12:15         ` [PATCH net-next-2.6 v2] pkt_sched: gen_estimator: kill est_lock rwlock Jarek Poplawski
@ 2010-06-09  9:40         ` Eric Dumazet
  2010-06-09  9:56           ` Eric Dumazet
  2 siblings, 1 reply; 35+ messages in thread
From: Eric Dumazet @ 2010-06-09  9:40 UTC (permalink / raw)
  To: Changli Gao, David Miller
  Cc: netdev, Stephen Hemminger, Jarek Poplawski, Patrick McHardy

Le lundi 07 juin 2010 à 18:56 +0200, Eric Dumazet a écrit :

> 
> Even if its a bug correction, I cooked it for net-next-2.6 since bug
> probably never occured, and patch is too large to be sent to
> net-2.6/linux-2.6 before testing.
> 
> Another bug comes from net/netfilter/xt_RATEEST.c : It apparently
> calls gen_kill_estimator() / gen_new_estimator() without holding RTNL ?
> 
> So we should add another lock to protect things (est_root, elist[], ...)
> 
> David, I can send a net-2.6 patch for this one, since it should be small
> enough. If yes, I'll respin this patch of course ;)
> 
> Thanks
> 



Here is v3 of the patch, reduced to bare minimum.

This patch should be applied after previous one (pkt_sched:
gen_estimator: add a new lock)

Thanks


[PATCH] pkt_sched: gen_kill_estimator() rcu fixes

gen_kill_estimator() API is a bit misleading, since caller
should make sure an RCU grace period is respected before
freeing stats_lock.

This was partially addressed in commit 5d944c640b4 
(gen_estimator: deadlock fix), but same problem exist for all
gen_kill_estimator() users, if lock they use is not already rcu
protected.

A code review shows xt_RATEEST.c, act_api.c, act_police.c have this
problem. Other are ok because they use qdisc lock.

Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
---
 include/net/act_api.h              |    2 ++
 include/net/netfilter/xt_rateest.h |    1 +
 net/core/gen_estimator.c           |    1 +
 net/netfilter/xt_RATEEST.c         |    8 +++++++-
 net/sched/act_api.c                |    7 ++++++-
 net/sched/act_police.c             |    7 +++++++
 6 files changed, 24 insertions(+), 2 deletions(-)

diff --git a/include/net/act_api.h b/include/net/act_api.h
index c05fd71..bab385f 100644
--- a/include/net/act_api.h
+++ b/include/net/act_api.h
@@ -20,6 +20,7 @@ struct tcf_common {
 	struct gnet_stats_queue		tcfc_qstats;
 	struct gnet_stats_rate_est	tcfc_rate_est;
 	spinlock_t			tcfc_lock;
+	struct rcu_head			tcfc_rcu;
 };
 #define tcf_next	common.tcfc_next
 #define tcf_index	common.tcfc_index
@@ -32,6 +33,7 @@ struct tcf_common {
 #define tcf_qstats	common.tcfc_qstats
 #define tcf_rate_est	common.tcfc_rate_est
 #define tcf_lock	common.tcfc_lock
+#define tcf_rcu		common.tcfc_rcu
 
 struct tcf_police {
 	struct tcf_common	common;
diff --git a/include/net/netfilter/xt_rateest.h b/include/net/netfilter/xt_rateest.h
index ddbf37e..5e14277 100644
--- a/include/net/netfilter/xt_rateest.h
+++ b/include/net/netfilter/xt_rateest.h
@@ -9,6 +9,7 @@ struct xt_rateest {
 	struct gnet_estimator		params;
 	struct gnet_stats_rate_est	rstats;
 	struct gnet_stats_basic_packed	bstats;
+	struct rcu_head			rcu;
 };
 
 extern struct xt_rateest *xt_rateest_lookup(const char *name);
diff --git a/net/core/gen_estimator.c b/net/core/gen_estimator.c
index 785e527..9fbe7f7 100644
--- a/net/core/gen_estimator.c
+++ b/net/core/gen_estimator.c
@@ -263,6 +263,7 @@ static void __gen_kill_estimator(struct rcu_head *head)
  *
  * Removes the rate estimator specified by &bstats and &rate_est.
  *
+ * Note : Caller should respect an RCU grace period before freeing stats_lock
  */
 void gen_kill_estimator(struct gnet_stats_basic_packed *bstats,
 			struct gnet_stats_rate_est *rate_est)
diff --git a/net/netfilter/xt_RATEEST.c b/net/netfilter/xt_RATEEST.c
index 69c01e1..096b18b 100644
--- a/net/netfilter/xt_RATEEST.c
+++ b/net/netfilter/xt_RATEEST.c
@@ -60,13 +60,18 @@ struct xt_rateest *xt_rateest_lookup(const char *name)
 }
 EXPORT_SYMBOL_GPL(xt_rateest_lookup);
 
+static void xt_rateeset_free_rcu(struct rcu_head *head)
+{
+	kfree(container_of(head, struct xt_rateest, rcu));
+}
+
 void xt_rateest_put(struct xt_rateest *est)
 {
 	mutex_lock(&xt_rateest_mutex);
 	if (--est->refcnt == 0) {
 		hlist_del(&est->list);
 		gen_kill_estimator(&est->bstats, &est->rstats);
-		kfree(est);
+		call_rcu(&est->rcu, xt_rateeset_free_rcu);
 	}
 	mutex_unlock(&xt_rateest_mutex);
 }
@@ -179,6 +184,7 @@ static int __init xt_rateest_tg_init(void)
 static void __exit xt_rateest_tg_fini(void)
 {
 	xt_unregister_target(&xt_rateest_tg_reg);
+	rcu_barrier(); /* Wait for completion of call_rcu()'s */
 }
 
 
diff --git a/net/sched/act_api.c b/net/sched/act_api.c
index 972378f..0e6fd2b 100644
--- a/net/sched/act_api.c
+++ b/net/sched/act_api.c
@@ -26,6 +26,11 @@
 #include <net/act_api.h>
 #include <net/netlink.h>
 
+static void tcf_common_free_rcu(struct rcu_head *head)
+{
+	kfree(container_of(head, struct tcf_common, tcfc_rcu));
+}
+
 void tcf_hash_destroy(struct tcf_common *p, struct tcf_hashinfo *hinfo)
 {
 	unsigned int h = tcf_hash(p->tcfc_index, hinfo->hmask);
@@ -38,7 +43,7 @@ void tcf_hash_destroy(struct tcf_common *p, struct tcf_hashinfo *hinfo)
 			write_unlock_bh(hinfo->lock);
 			gen_kill_estimator(&p->tcfc_bstats,
 					   &p->tcfc_rate_est);
-			kfree(p);
+			call_rcu(&p->tcfc_rcu, tcf_common_free_rcu);
 			return;
 		}
 	}
diff --git a/net/sched/act_police.c b/net/sched/act_police.c
index 654f73d..9df45ed 100644
--- a/net/sched/act_police.c
+++ b/net/sched/act_police.c
@@ -97,6 +97,11 @@ nla_put_failure:
 	goto done;
 }
 
+static void tcf_police_free_rcu(struct rcu_head *head)
+{
+	kfree(container_of(head, struct tcf_police, tcf_rcu));
+}
+
 static void tcf_police_destroy(struct tcf_police *p)
 {
 	unsigned int h = tcf_hash(p->tcf_index, POL_TAB_MASK);
@@ -113,6 +118,7 @@ static void tcf_police_destroy(struct tcf_police *p)
 				qdisc_put_rtab(p->tcfp_R_tab);
 			if (p->tcfp_P_tab)
 				qdisc_put_rtab(p->tcfp_P_tab);
+			call_rcu(&p->tcf_rcu, tcf_police_free_rcu);
 			kfree(p);
 			return;
 		}
@@ -397,6 +403,7 @@ static void __exit
 police_cleanup_module(void)
 {
 	tcf_unregister_action(&act_police_ops);
+	rcu_barrier(); /* Wait for completion of call_rcu()'s */
 }
 
 module_init(police_init_module);



^ permalink raw reply related	[flat|nested] 35+ messages in thread

* Re: [PATCH] pkt_sched: gen_kill_estimator() rcu fixes
  2010-06-09  9:40         ` [PATCH] pkt_sched: gen_kill_estimator() rcu fixes Eric Dumazet
@ 2010-06-09  9:56           ` Eric Dumazet
  2010-06-09 10:41             ` Jarek Poplawski
  0 siblings, 1 reply; 35+ messages in thread
From: Eric Dumazet @ 2010-06-09  9:56 UTC (permalink / raw)
  To: Changli Gao, David Miller
  Cc: netdev, Stephen Hemminger, Jarek Poplawski, Patrick McHardy



Oops, I missed  kfree() deletion in  :


>  static void tcf_police_destroy(struct tcf_police *p)
>  {
>  	unsigned int h = tcf_hash(p->tcf_index, POL_TAB_MASK);
> @@ -113,6 +118,7 @@ static void tcf_police_destroy(struct tcf_police *p)
>  				qdisc_put_rtab(p->tcfp_R_tab);
>  			if (p->tcfp_P_tab)
>  				qdisc_put_rtab(p->tcfp_P_tab);
> +			call_rcu(&p->tcf_rcu, tcf_police_free_rcu);
>  			kfree(p);
>  			return;

Updated patch follows :

[PATCH] pkt_sched: gen_kill_estimator() rcu fixes

gen_kill_estimator() API is a bit misleading, since caller
should make sure an RCU grace period is respected before
freeing stats_lock.

This was partially addressed in commit 5d944c640b4 
(gen_estimator: deadlock fix), but same problem exist for all
gen_kill_estimator() users, if lock they use is not already rcu
protected.

A code review shows xt_RATEEST.c, act_api.c, act_police.c have this
problem. Other are ok because they use qdisc lock.

Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
---
 include/net/act_api.h              |    2 ++
 include/net/netfilter/xt_rateest.h |    1 +
 net/core/gen_estimator.c           |    1 +
 net/netfilter/xt_RATEEST.c         |    8 +++++++-
 net/sched/act_api.c                |    7 ++++++-
 net/sched/act_police.c             |    8 +++++++-
 6 files changed, 24 insertions(+), 3 deletions(-)

diff --git a/include/net/act_api.h b/include/net/act_api.h
index c05fd71..bab385f 100644
--- a/include/net/act_api.h
+++ b/include/net/act_api.h
@@ -20,6 +20,7 @@ struct tcf_common {
 	struct gnet_stats_queue		tcfc_qstats;
 	struct gnet_stats_rate_est	tcfc_rate_est;
 	spinlock_t			tcfc_lock;
+	struct rcu_head			tcfc_rcu;
 };
 #define tcf_next	common.tcfc_next
 #define tcf_index	common.tcfc_index
@@ -32,6 +33,7 @@ struct tcf_common {
 #define tcf_qstats	common.tcfc_qstats
 #define tcf_rate_est	common.tcfc_rate_est
 #define tcf_lock	common.tcfc_lock
+#define tcf_rcu		common.tcfc_rcu
 
 struct tcf_police {
 	struct tcf_common	common;
diff --git a/include/net/netfilter/xt_rateest.h b/include/net/netfilter/xt_rateest.h
index ddbf37e..5e14277 100644
--- a/include/net/netfilter/xt_rateest.h
+++ b/include/net/netfilter/xt_rateest.h
@@ -9,6 +9,7 @@ struct xt_rateest {
 	struct gnet_estimator		params;
 	struct gnet_stats_rate_est	rstats;
 	struct gnet_stats_basic_packed	bstats;
+	struct rcu_head			rcu;
 };
 
 extern struct xt_rateest *xt_rateest_lookup(const char *name);
diff --git a/net/core/gen_estimator.c b/net/core/gen_estimator.c
index 785e527..9fbe7f7 100644
--- a/net/core/gen_estimator.c
+++ b/net/core/gen_estimator.c
@@ -263,6 +263,7 @@ static void __gen_kill_estimator(struct rcu_head *head)
  *
  * Removes the rate estimator specified by &bstats and &rate_est.
  *
+ * Note : Caller should respect an RCU grace period before freeing stats_lock
  */
 void gen_kill_estimator(struct gnet_stats_basic_packed *bstats,
 			struct gnet_stats_rate_est *rate_est)
diff --git a/net/netfilter/xt_RATEEST.c b/net/netfilter/xt_RATEEST.c
index 69c01e1..096b18b 100644
--- a/net/netfilter/xt_RATEEST.c
+++ b/net/netfilter/xt_RATEEST.c
@@ -60,13 +60,18 @@ struct xt_rateest *xt_rateest_lookup(const char *name)
 }
 EXPORT_SYMBOL_GPL(xt_rateest_lookup);
 
+static void xt_rateeset_free_rcu(struct rcu_head *head)
+{
+	kfree(container_of(head, struct xt_rateest, rcu));
+}
+
 void xt_rateest_put(struct xt_rateest *est)
 {
 	mutex_lock(&xt_rateest_mutex);
 	if (--est->refcnt == 0) {
 		hlist_del(&est->list);
 		gen_kill_estimator(&est->bstats, &est->rstats);
-		kfree(est);
+		call_rcu(&est->rcu, xt_rateeset_free_rcu);
 	}
 	mutex_unlock(&xt_rateest_mutex);
 }
@@ -179,6 +184,7 @@ static int __init xt_rateest_tg_init(void)
 static void __exit xt_rateest_tg_fini(void)
 {
 	xt_unregister_target(&xt_rateest_tg_reg);
+	rcu_barrier(); /* Wait for completion of call_rcu()'s */
 }
 
 
diff --git a/net/sched/act_api.c b/net/sched/act_api.c
index 972378f..0e6fd2b 100644
--- a/net/sched/act_api.c
+++ b/net/sched/act_api.c
@@ -26,6 +26,11 @@
 #include <net/act_api.h>
 #include <net/netlink.h>
 
+static void tcf_common_free_rcu(struct rcu_head *head)
+{
+	kfree(container_of(head, struct tcf_common, tcfc_rcu));
+}
+
 void tcf_hash_destroy(struct tcf_common *p, struct tcf_hashinfo *hinfo)
 {
 	unsigned int h = tcf_hash(p->tcfc_index, hinfo->hmask);
@@ -38,7 +43,7 @@ void tcf_hash_destroy(struct tcf_common *p, struct tcf_hashinfo *hinfo)
 			write_unlock_bh(hinfo->lock);
 			gen_kill_estimator(&p->tcfc_bstats,
 					   &p->tcfc_rate_est);
-			kfree(p);
+			call_rcu(&p->tcfc_rcu, tcf_common_free_rcu);
 			return;
 		}
 	}
diff --git a/net/sched/act_police.c b/net/sched/act_police.c
index 654f73d..04929ce 100644
--- a/net/sched/act_police.c
+++ b/net/sched/act_police.c
@@ -97,6 +97,11 @@ nla_put_failure:
 	goto done;
 }
 
+static void tcf_police_free_rcu(struct rcu_head *head)
+{
+	kfree(container_of(head, struct tcf_police, tcf_rcu));
+}
+
 static void tcf_police_destroy(struct tcf_police *p)
 {
 	unsigned int h = tcf_hash(p->tcf_index, POL_TAB_MASK);
@@ -113,7 +118,7 @@ static void tcf_police_destroy(struct tcf_police *p)
 				qdisc_put_rtab(p->tcfp_R_tab);
 			if (p->tcfp_P_tab)
 				qdisc_put_rtab(p->tcfp_P_tab);
-			kfree(p);
+			call_rcu(&p->tcf_rcu, tcf_police_free_rcu);
 			return;
 		}
 	}
@@ -397,6 +402,7 @@ static void __exit
 police_cleanup_module(void)
 {
 	tcf_unregister_action(&act_police_ops);
+	rcu_barrier(); /* Wait for completion of call_rcu()'s */
 }
 
 module_init(police_init_module);



^ permalink raw reply related	[flat|nested] 35+ messages in thread

* Re: [PATCH] pkt_sched: gen_kill_estimator() rcu fixes
  2010-06-09  9:56           ` Eric Dumazet
@ 2010-06-09 10:41             ` Jarek Poplawski
  2010-06-09 12:09               ` Eric Dumazet
  0 siblings, 1 reply; 35+ messages in thread
From: Jarek Poplawski @ 2010-06-09 10:41 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: Changli Gao, David Miller, netdev, Stephen Hemminger,
	Patrick McHardy

On Wed, Jun 09, 2010 at 11:56:56AM +0200, Eric Dumazet wrote:
...
> Updated patch follows :
> 
> [PATCH] pkt_sched: gen_kill_estimator() rcu fixes
> 
> gen_kill_estimator() API is a bit misleading, since caller

I'd call it incomplete or simply buggy. Plus a tiny note below.

> should make sure an RCU grace period is respected before
> freeing stats_lock.
> 
> This was partially addressed in commit 5d944c640b4 
> (gen_estimator: deadlock fix), but same problem exist for all
> gen_kill_estimator() users, if lock they use is not already rcu
> protected.
> 
> A code review shows xt_RATEEST.c, act_api.c, act_police.c have this
> problem. Other are ok because they use qdisc lock.
> 
> Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
> ---
...
> diff --git a/net/netfilter/xt_RATEEST.c b/net/netfilter/xt_RATEEST.c
> index 69c01e1..096b18b 100644
> --- a/net/netfilter/xt_RATEEST.c
> +++ b/net/netfilter/xt_RATEEST.c
> @@ -60,13 +60,18 @@ struct xt_rateest *xt_rateest_lookup(const char *name)
>  }
>  EXPORT_SYMBOL_GPL(xt_rateest_lookup);
>  
> +static void xt_rateeset_free_rcu(struct rcu_head *head)
> +{
> +	kfree(container_of(head, struct xt_rateest, rcu));
> +}
> +
>  void xt_rateest_put(struct xt_rateest *est)
>  {
>  	mutex_lock(&xt_rateest_mutex);
>  	if (--est->refcnt == 0) {
>  		hlist_del(&est->list);
>  		gen_kill_estimator(&est->bstats, &est->rstats);
> -		kfree(est);
> +		call_rcu(&est->rcu, xt_rateeset_free_rcu);

Spelling suggestion: xt_rateest_free_rcu?

Since it's only 3 places I'd consider adding little comments,
who needs this call_rcu (like in qdisc_destroy).

Anyway this patch looks OK to me.

Jarek P.

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH net-2.6 v2] pkt_sched: gen_estimator: add a new lock
  2010-06-09  9:39           ` [PATCH net-2.6 v2] " Eric Dumazet
@ 2010-06-09 11:33             ` Jarek Poplawski
  2010-06-09 11:55               ` Eric Dumazet
  2010-06-11  5:54             ` David Miller
  1 sibling, 1 reply; 35+ messages in thread
From: Jarek Poplawski @ 2010-06-09 11:33 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: Changli Gao, David Miller, netdev, Stephen Hemminger,
	Patrick McHardy

On Wed, Jun 09, 2010 at 11:39:10AM +0200, Eric Dumazet wrote:
...
> Here is v2 of patch, adding protection in gen_estimator_active() as
> well.
> 
> [PATCH net-2.6 v2] pkt_sched: gen_estimator: add a new lock
> 
> gen_kill_estimator() / gen_new_estimator() is not always called with
> RTNL held.
> 
> net/netfilter/xt_RATEEST.c is one user of these API that do not hold
> RTNL, so random corruptions can occur between "tc" and "iptables".
> 
> Add a new fine grained lock instead of trying to use RTNL in netfilter.

Btw, maybe it's a different case, but RTNL happens in netfilter already
(nf_conntrack_helper.c, nf_conntrack_proto.c).

It would be nice to mention Changli's argument that
gen_replace_estimator isn't atomic operation after this change.

> @@ -312,8 +315,14 @@ EXPORT_SYMBOL(gen_replace_estimator);
>  bool gen_estimator_active(const struct gnet_stats_basic_packed *bstats,
>  			  const struct gnet_stats_rate_est *rate_est)
>  {
> +	bool res;
> +
>  	ASSERT_RTNL();

This line should go away as well.

I'm OK with this patch unless there is an alternative xt_RATEEST RTNL
fix available.

Jarek P.

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH net-2.6 v2] pkt_sched: gen_estimator: add a new lock
  2010-06-09 11:33             ` Jarek Poplawski
@ 2010-06-09 11:55               ` Eric Dumazet
  0 siblings, 0 replies; 35+ messages in thread
From: Eric Dumazet @ 2010-06-09 11:55 UTC (permalink / raw)
  To: Jarek Poplawski
  Cc: Changli Gao, David Miller, netdev, Stephen Hemminger,
	Patrick McHardy

Le mercredi 09 juin 2010 à 11:33 +0000, Jarek Poplawski a écrit :
> On Wed, Jun 09, 2010 at 11:39:10AM +0200, Eric Dumazet wrote:
> ...
> > Here is v2 of patch, adding protection in gen_estimator_active() as
> > well.
> > 
> > [PATCH net-2.6 v2] pkt_sched: gen_estimator: add a new lock
> > 
> > gen_kill_estimator() / gen_new_estimator() is not always called with
> > RTNL held.
> > 
> > net/netfilter/xt_RATEEST.c is one user of these API that do not hold
> > RTNL, so random corruptions can occur between "tc" and "iptables".
> > 
> > Add a new fine grained lock instead of trying to use RTNL in netfilter.
> 
> Btw, maybe it's a different case, but RTNL happens in netfilter already
> (nf_conntrack_helper.c, nf_conntrack_proto.c).
> 
> It would be nice to mention Changli's argument that
> gen_replace_estimator isn't atomic operation after this change.
> 

See my next comment. This function is still used in qdisc domain only.
We could add ASSERT_RTNL() here, but its not a bug fix...

> > @@ -312,8 +315,14 @@ EXPORT_SYMBOL(gen_replace_estimator);
> >  bool gen_estimator_active(const struct gnet_stats_basic_packed *bstats,
> >  			  const struct gnet_stats_rate_est *rate_est)
> >  {
> > +	bool res;
> > +
> >  	ASSERT_RTNL();
> 
> This line should go away as well.
> 

Nope. This is really needed for safety. Of course, its a debugging knob
and you can remove it if you trust all callers to be good.

Caller should still use RTNL here, or else, as soon as we exit from
gen_estimator_active() with a 'true' result, some other qdisc user could
destroy the estimator.

Fortunatly up to 2.6.35, xt_RATEEST cannot reuse/delete an estimator
created by a qdisc user, and it doesnt use gen_estimator_active().

In a future patch, we should add RCU safety here instead of ASSERT_RTNL.
Thats to make sure caller of gen_estimator_active() is inside a
rcu_read_lock() section, and estimator cannot disappear under it, even
if another cpu/thread calls gen_kill_estimator() on same estimator.




^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH] pkt_sched: gen_kill_estimator() rcu fixes
  2010-06-09 10:41             ` Jarek Poplawski
@ 2010-06-09 12:09               ` Eric Dumazet
  2010-06-09 12:50                 ` Jarek Poplawski
  2010-06-12  1:39                 ` David Miller
  0 siblings, 2 replies; 35+ messages in thread
From: Eric Dumazet @ 2010-06-09 12:09 UTC (permalink / raw)
  To: Jarek Poplawski
  Cc: Changli Gao, David Miller, netdev, Stephen Hemminger,
	Patrick McHardy

Le mercredi 09 juin 2010 à 10:41 +0000, Jarek Poplawski a écrit :

> Spelling suggestion: xt_rateest_free_rcu?
> 
> Since it's only 3 places I'd consider adding little comments,
> who needs this call_rcu (like in qdisc_destroy).
> 
> Anyway this patch looks OK to me.
> 

Thanks for reviewing, you are right about typo and adding some comments.

What about following ?

[PATCH v2] pkt_sched: gen_kill_estimator() rcu fixes

gen_kill_estimator() API is incomplete or not well documented, since
caller should make sure an RCU grace period is respected before
freeing stats_lock.

This was partially addressed in commit 5d944c640b4 
(gen_estimator: deadlock fix), but same problem exist for all
gen_kill_estimator() users, if lock they use is not already RCU
protected.

A code review shows xt_RATEEST.c, act_api.c, act_police.c have this
problem. Other are ok because they use qdisc lock, already RCU
protected.

Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
---
 include/net/act_api.h              |    2 ++
 include/net/netfilter/xt_rateest.h |    1 +
 net/core/gen_estimator.c           |    1 +
 net/netfilter/xt_RATEEST.c         |   12 +++++++++++-
 net/sched/act_api.c                |   11 ++++++++++-
 net/sched/act_police.c             |   12 +++++++++++-
 6 files changed, 36 insertions(+), 3 deletions(-)

diff --git a/include/net/act_api.h b/include/net/act_api.h
index c05fd71..bab385f 100644
--- a/include/net/act_api.h
+++ b/include/net/act_api.h
@@ -20,6 +20,7 @@ struct tcf_common {
 	struct gnet_stats_queue		tcfc_qstats;
 	struct gnet_stats_rate_est	tcfc_rate_est;
 	spinlock_t			tcfc_lock;
+	struct rcu_head			tcfc_rcu;
 };
 #define tcf_next	common.tcfc_next
 #define tcf_index	common.tcfc_index
@@ -32,6 +33,7 @@ struct tcf_common {
 #define tcf_qstats	common.tcfc_qstats
 #define tcf_rate_est	common.tcfc_rate_est
 #define tcf_lock	common.tcfc_lock
+#define tcf_rcu		common.tcfc_rcu
 
 struct tcf_police {
 	struct tcf_common	common;
diff --git a/include/net/netfilter/xt_rateest.h b/include/net/netfilter/xt_rateest.h
index ddbf37e..5e14277 100644
--- a/include/net/netfilter/xt_rateest.h
+++ b/include/net/netfilter/xt_rateest.h
@@ -9,6 +9,7 @@ struct xt_rateest {
 	struct gnet_estimator		params;
 	struct gnet_stats_rate_est	rstats;
 	struct gnet_stats_basic_packed	bstats;
+	struct rcu_head			rcu;
 };
 
 extern struct xt_rateest *xt_rateest_lookup(const char *name);
diff --git a/net/core/gen_estimator.c b/net/core/gen_estimator.c
index 785e527..9fbe7f7 100644
--- a/net/core/gen_estimator.c
+++ b/net/core/gen_estimator.c
@@ -263,6 +263,7 @@ static void __gen_kill_estimator(struct rcu_head *head)
  *
  * Removes the rate estimator specified by &bstats and &rate_est.
  *
+ * Note : Caller should respect an RCU grace period before freeing stats_lock
  */
 void gen_kill_estimator(struct gnet_stats_basic_packed *bstats,
 			struct gnet_stats_rate_est *rate_est)
diff --git a/net/netfilter/xt_RATEEST.c b/net/netfilter/xt_RATEEST.c
index 69c01e1..de079ab 100644
--- a/net/netfilter/xt_RATEEST.c
+++ b/net/netfilter/xt_RATEEST.c
@@ -60,13 +60,22 @@ struct xt_rateest *xt_rateest_lookup(const char *name)
 }
 EXPORT_SYMBOL_GPL(xt_rateest_lookup);
 
+static void xt_rateest_free_rcu(struct rcu_head *head)
+{
+	kfree(container_of(head, struct xt_rateest, rcu));
+}
+
 void xt_rateest_put(struct xt_rateest *est)
 {
 	mutex_lock(&xt_rateest_mutex);
 	if (--est->refcnt == 0) {
 		hlist_del(&est->list);
 		gen_kill_estimator(&est->bstats, &est->rstats);
-		kfree(est);
+		/*
+		 * gen_estimator est_timer() might access est->lock or bstats,
+		 * wait a RCU grace period before freeing 'est'
+		 */
+		call_rcu(&est->rcu, xt_rateest_free_rcu);
 	}
 	mutex_unlock(&xt_rateest_mutex);
 }
@@ -179,6 +188,7 @@ static int __init xt_rateest_tg_init(void)
 static void __exit xt_rateest_tg_fini(void)
 {
 	xt_unregister_target(&xt_rateest_tg_reg);
+	rcu_barrier(); /* Wait for completion of call_rcu()'s (xt_rateest_free_rcu) */
 }
 
 
diff --git a/net/sched/act_api.c b/net/sched/act_api.c
index 972378f..23b25f8 100644
--- a/net/sched/act_api.c
+++ b/net/sched/act_api.c
@@ -26,6 +26,11 @@
 #include <net/act_api.h>
 #include <net/netlink.h>
 
+static void tcf_common_free_rcu(struct rcu_head *head)
+{
+	kfree(container_of(head, struct tcf_common, tcfc_rcu));
+}
+
 void tcf_hash_destroy(struct tcf_common *p, struct tcf_hashinfo *hinfo)
 {
 	unsigned int h = tcf_hash(p->tcfc_index, hinfo->hmask);
@@ -38,7 +43,11 @@ void tcf_hash_destroy(struct tcf_common *p, struct tcf_hashinfo *hinfo)
 			write_unlock_bh(hinfo->lock);
 			gen_kill_estimator(&p->tcfc_bstats,
 					   &p->tcfc_rate_est);
-			kfree(p);
+			/*
+			 * gen_estimator est_timer() might access p->tcfc_lock
+			 * or bstats, wait a RCU grace period before freeing p
+			 */
+			call_rcu(&p->tcfc_rcu, tcf_common_free_rcu);
 			return;
 		}
 	}
diff --git a/net/sched/act_police.c b/net/sched/act_police.c
index 654f73d..537a487 100644
--- a/net/sched/act_police.c
+++ b/net/sched/act_police.c
@@ -97,6 +97,11 @@ nla_put_failure:
 	goto done;
 }
 
+static void tcf_police_free_rcu(struct rcu_head *head)
+{
+	kfree(container_of(head, struct tcf_police, tcf_rcu));
+}
+
 static void tcf_police_destroy(struct tcf_police *p)
 {
 	unsigned int h = tcf_hash(p->tcf_index, POL_TAB_MASK);
@@ -113,7 +118,11 @@ static void tcf_police_destroy(struct tcf_police *p)
 				qdisc_put_rtab(p->tcfp_R_tab);
 			if (p->tcfp_P_tab)
 				qdisc_put_rtab(p->tcfp_P_tab);
-			kfree(p);
+			/*
+			 * gen_estimator est_timer() might access p->tcf_lock
+			 * or bstats, wait a RCU grace period before freeing p
+			 */
+			call_rcu(&p->tcf_rcu, tcf_police_free_rcu);
 			return;
 		}
 	}
@@ -397,6 +406,7 @@ static void __exit
 police_cleanup_module(void)
 {
 	tcf_unregister_action(&act_police_ops);
+	rcu_barrier(); /* Wait for completion of call_rcu()'s (tcf_police_free_rcu) */
 }
 
 module_init(police_init_module);



^ permalink raw reply related	[flat|nested] 35+ messages in thread

* Re: [PATCH] pkt_sched: gen_kill_estimator() rcu fixes
  2010-06-09 12:09               ` Eric Dumazet
@ 2010-06-09 12:50                 ` Jarek Poplawski
  2010-06-09 13:05                   ` Eric Dumazet
  2010-06-12  1:39                 ` David Miller
  1 sibling, 1 reply; 35+ messages in thread
From: Jarek Poplawski @ 2010-06-09 12:50 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: Changli Gao, David Miller, netdev, Stephen Hemminger,
	Patrick McHardy

On Wed, Jun 09, 2010 at 02:09:23PM +0200, Eric Dumazet wrote:
> Le mercredi 09 juin 2010 ?? 10:41 +0000, Jarek Poplawski a écrit :
> 
> > Spelling suggestion: xt_rateest_free_rcu?
> > 
> > Since it's only 3 places I'd consider adding little comments,
> > who needs this call_rcu (like in qdisc_destroy).
> > 
> > Anyway this patch looks OK to me.
> > 
> 
> Thanks for reviewing, you are right about typo and adding some comments.
> 
> What about following ?

Very nice! (But let's remember these comments aren't very precise
wrt. bstats at the moment ;-)

Thanks,
Jarek P.

> 
> [PATCH v2] pkt_sched: gen_kill_estimator() rcu fixes
> 
> gen_kill_estimator() API is incomplete or not well documented, since
> caller should make sure an RCU grace period is respected before
> freeing stats_lock.
> 
> This was partially addressed in commit 5d944c640b4 
> (gen_estimator: deadlock fix), but same problem exist for all
> gen_kill_estimator() users, if lock they use is not already RCU
> protected.
> 
> A code review shows xt_RATEEST.c, act_api.c, act_police.c have this
> problem. Other are ok because they use qdisc lock, already RCU
> protected.
> 
> Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
> ---
>  include/net/act_api.h              |    2 ++
>  include/net/netfilter/xt_rateest.h |    1 +
>  net/core/gen_estimator.c           |    1 +
>  net/netfilter/xt_RATEEST.c         |   12 +++++++++++-
>  net/sched/act_api.c                |   11 ++++++++++-
>  net/sched/act_police.c             |   12 +++++++++++-
>  6 files changed, 36 insertions(+), 3 deletions(-)
> 
> diff --git a/include/net/act_api.h b/include/net/act_api.h
> index c05fd71..bab385f 100644
> --- a/include/net/act_api.h
> +++ b/include/net/act_api.h
> @@ -20,6 +20,7 @@ struct tcf_common {
>  	struct gnet_stats_queue		tcfc_qstats;
>  	struct gnet_stats_rate_est	tcfc_rate_est;
>  	spinlock_t			tcfc_lock;
> +	struct rcu_head			tcfc_rcu;
>  };
>  #define tcf_next	common.tcfc_next
>  #define tcf_index	common.tcfc_index
> @@ -32,6 +33,7 @@ struct tcf_common {
>  #define tcf_qstats	common.tcfc_qstats
>  #define tcf_rate_est	common.tcfc_rate_est
>  #define tcf_lock	common.tcfc_lock
> +#define tcf_rcu		common.tcfc_rcu
>  
>  struct tcf_police {
>  	struct tcf_common	common;
> diff --git a/include/net/netfilter/xt_rateest.h b/include/net/netfilter/xt_rateest.h
> index ddbf37e..5e14277 100644
> --- a/include/net/netfilter/xt_rateest.h
> +++ b/include/net/netfilter/xt_rateest.h
> @@ -9,6 +9,7 @@ struct xt_rateest {
>  	struct gnet_estimator		params;
>  	struct gnet_stats_rate_est	rstats;
>  	struct gnet_stats_basic_packed	bstats;
> +	struct rcu_head			rcu;
>  };
>  
>  extern struct xt_rateest *xt_rateest_lookup(const char *name);
> diff --git a/net/core/gen_estimator.c b/net/core/gen_estimator.c
> index 785e527..9fbe7f7 100644
> --- a/net/core/gen_estimator.c
> +++ b/net/core/gen_estimator.c
> @@ -263,6 +263,7 @@ static void __gen_kill_estimator(struct rcu_head *head)
>   *
>   * Removes the rate estimator specified by &bstats and &rate_est.
>   *
> + * Note : Caller should respect an RCU grace period before freeing stats_lock
>   */
>  void gen_kill_estimator(struct gnet_stats_basic_packed *bstats,
>  			struct gnet_stats_rate_est *rate_est)
> diff --git a/net/netfilter/xt_RATEEST.c b/net/netfilter/xt_RATEEST.c
> index 69c01e1..de079ab 100644
> --- a/net/netfilter/xt_RATEEST.c
> +++ b/net/netfilter/xt_RATEEST.c
> @@ -60,13 +60,22 @@ struct xt_rateest *xt_rateest_lookup(const char *name)
>  }
>  EXPORT_SYMBOL_GPL(xt_rateest_lookup);
>  
> +static void xt_rateest_free_rcu(struct rcu_head *head)
> +{
> +	kfree(container_of(head, struct xt_rateest, rcu));
> +}
> +
>  void xt_rateest_put(struct xt_rateest *est)
>  {
>  	mutex_lock(&xt_rateest_mutex);
>  	if (--est->refcnt == 0) {
>  		hlist_del(&est->list);
>  		gen_kill_estimator(&est->bstats, &est->rstats);
> -		kfree(est);
> +		/*
> +		 * gen_estimator est_timer() might access est->lock or bstats,
> +		 * wait a RCU grace period before freeing 'est'
> +		 */
> +		call_rcu(&est->rcu, xt_rateest_free_rcu);
>  	}
>  	mutex_unlock(&xt_rateest_mutex);
>  }
> @@ -179,6 +188,7 @@ static int __init xt_rateest_tg_init(void)
>  static void __exit xt_rateest_tg_fini(void)
>  {
>  	xt_unregister_target(&xt_rateest_tg_reg);
> +	rcu_barrier(); /* Wait for completion of call_rcu()'s (xt_rateest_free_rcu) */
>  }
>  
>  
> diff --git a/net/sched/act_api.c b/net/sched/act_api.c
> index 972378f..23b25f8 100644
> --- a/net/sched/act_api.c
> +++ b/net/sched/act_api.c
> @@ -26,6 +26,11 @@
>  #include <net/act_api.h>
>  #include <net/netlink.h>
>  
> +static void tcf_common_free_rcu(struct rcu_head *head)
> +{
> +	kfree(container_of(head, struct tcf_common, tcfc_rcu));
> +}
> +
>  void tcf_hash_destroy(struct tcf_common *p, struct tcf_hashinfo *hinfo)
>  {
>  	unsigned int h = tcf_hash(p->tcfc_index, hinfo->hmask);
> @@ -38,7 +43,11 @@ void tcf_hash_destroy(struct tcf_common *p, struct tcf_hashinfo *hinfo)
>  			write_unlock_bh(hinfo->lock);
>  			gen_kill_estimator(&p->tcfc_bstats,
>  					   &p->tcfc_rate_est);
> -			kfree(p);
> +			/*
> +			 * gen_estimator est_timer() might access p->tcfc_lock
> +			 * or bstats, wait a RCU grace period before freeing p
> +			 */
> +			call_rcu(&p->tcfc_rcu, tcf_common_free_rcu);
>  			return;
>  		}
>  	}
> diff --git a/net/sched/act_police.c b/net/sched/act_police.c
> index 654f73d..537a487 100644
> --- a/net/sched/act_police.c
> +++ b/net/sched/act_police.c
> @@ -97,6 +97,11 @@ nla_put_failure:
>  	goto done;
>  }
>  
> +static void tcf_police_free_rcu(struct rcu_head *head)
> +{
> +	kfree(container_of(head, struct tcf_police, tcf_rcu));
> +}
> +
>  static void tcf_police_destroy(struct tcf_police *p)
>  {
>  	unsigned int h = tcf_hash(p->tcf_index, POL_TAB_MASK);
> @@ -113,7 +118,11 @@ static void tcf_police_destroy(struct tcf_police *p)
>  				qdisc_put_rtab(p->tcfp_R_tab);
>  			if (p->tcfp_P_tab)
>  				qdisc_put_rtab(p->tcfp_P_tab);
> -			kfree(p);
> +			/*
> +			 * gen_estimator est_timer() might access p->tcf_lock
> +			 * or bstats, wait a RCU grace period before freeing p
> +			 */
> +			call_rcu(&p->tcf_rcu, tcf_police_free_rcu);
>  			return;
>  		}
>  	}
> @@ -397,6 +406,7 @@ static void __exit
>  police_cleanup_module(void)
>  {
>  	tcf_unregister_action(&act_police_ops);
> +	rcu_barrier(); /* Wait for completion of call_rcu()'s (tcf_police_free_rcu) */
>  }
>  
>  module_init(police_init_module);
> 
> 

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH] pkt_sched: gen_kill_estimator() rcu fixes
  2010-06-09 12:50                 ` Jarek Poplawski
@ 2010-06-09 13:05                   ` Eric Dumazet
  0 siblings, 0 replies; 35+ messages in thread
From: Eric Dumazet @ 2010-06-09 13:05 UTC (permalink / raw)
  To: Jarek Poplawski
  Cc: Changli Gao, David Miller, netdev, Stephen Hemminger,
	Patrick McHardy

Le mercredi 09 juin 2010 à 12:50 +0000, Jarek Poplawski a écrit :
> On Wed, Jun 09, 2010 at 02:09:23PM +0200, Eric Dumazet wrote:
> > Le mercredi 09 juin 2010 ?? 10:41 +0000, Jarek Poplawski a écrit :
> > 
> > > Spelling suggestion: xt_rateest_free_rcu?
> > > 
> > > Since it's only 3 places I'd consider adding little comments,
> > > who needs this call_rcu (like in qdisc_destroy).
> > > 
> > > Anyway this patch looks OK to me.
> > > 
> > 
> > Thanks for reviewing, you are right about typo and adding some comments.
> > 
> > What about following ?
> 
> Very nice! (But let's remember these comments aren't very precise
> wrt. bstats at the moment ;-)

Yes, I anticipated a bit next patch ;)

BTW, I think using qdisc spinlock to update stats or reading them is not
really needed, particularly in xt_rateest/xt_RATEEST case, where
per_cpu bstats would be good

We could have one seqcount_t used by (bytes/packets) bstats producer,
and one seqcount_t used by est_timer when computing rates.


bstats updates :


write_seqcount_begin(&bstats->abs_seq);
bstats->bytes += len;
bstats->packets++;
write_seqcount_end(&bstats->abs_seq);


est_timer() would really run lockless, only rcu protected.

(No more spinlock, only the actual rcu_read_lock())


do {
	seq = read_seqcount_begin(&bstats->abs_seq);
	abs_bytes = bstats->bytes;
	abs_packets = bstats->packets;
} while (read_seqcount_retry(&bstats->abs_seq, seq));

write_seqcount_begin(&s->rate_seq);
/* compute rate estimation and store it */
...
write_seqcount_end(&s->rate_seq);

This would mean bstats should be rcu protected too...




^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH net-2.6 v2] pkt_sched: gen_estimator: add a new lock
  2010-06-09  9:39           ` [PATCH net-2.6 v2] " Eric Dumazet
  2010-06-09 11:33             ` Jarek Poplawski
@ 2010-06-11  5:54             ` David Miller
  1 sibling, 0 replies; 35+ messages in thread
From: David Miller @ 2010-06-11  5:54 UTC (permalink / raw)
  To: eric.dumazet; +Cc: xiaosuo, netdev, shemminger, jarkao2, kaber

From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Wed, 09 Jun 2010 11:39:10 +0200

> [PATCH net-2.6 v2] pkt_sched: gen_estimator: add a new lock
> 
> gen_kill_estimator() / gen_new_estimator() is not always called with
> RTNL held.
> 
> net/netfilter/xt_RATEEST.c is one user of these API that do not hold
> RTNL, so random corruptions can occur between "tc" and "iptables".
> 
> Add a new fine grained lock instead of trying to use RTNL in netfilter.
> 
> Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>

Wow, a lot of discussion to read and digest for this one :-)

Applied, thanks!

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH] pkt_sched: gen_kill_estimator() rcu fixes
  2010-06-09 12:09               ` Eric Dumazet
  2010-06-09 12:50                 ` Jarek Poplawski
@ 2010-06-12  1:39                 ` David Miller
  1 sibling, 0 replies; 35+ messages in thread
From: David Miller @ 2010-06-12  1:39 UTC (permalink / raw)
  To: eric.dumazet; +Cc: jarkao2, xiaosuo, netdev, shemminger, kaber

From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Wed, 09 Jun 2010 14:09:23 +0200

> [PATCH v2] pkt_sched: gen_kill_estimator() rcu fixes
> 
> gen_kill_estimator() API is incomplete or not well documented, since
> caller should make sure an RCU grace period is respected before
> freeing stats_lock.
> 
> This was partially addressed in commit 5d944c640b4 
> (gen_estimator: deadlock fix), but same problem exist for all
> gen_kill_estimator() users, if lock they use is not already RCU
> protected.
> 
> A code review shows xt_RATEEST.c, act_api.c, act_police.c have this
> problem. Other are ok because they use qdisc lock, already RCU
> protected.
> 
> Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>

Applied, thanks.

^ permalink raw reply	[flat|nested] 35+ messages in thread

end of thread, other threads:[~2010-06-12  1:38 UTC | newest]

Thread overview: 35+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2010-06-07 14:32 [PATCH net-next-2.6] pkt_sched: gen_estimator: kill est_lock rwlock Eric Dumazet
2010-06-07 14:53 ` Changli Gao
2010-06-07 15:30   ` Eric Dumazet
2010-06-07 15:55     ` Eric Dumazet
2010-06-07 16:56       ` [PATCH net-next-2.6 v2] " Eric Dumazet
2010-06-07 17:18         ` [PATCH net-2.6] pkt_sched: gen_estimator: add a new lock Eric Dumazet
2010-06-08  1:00           ` Changli Gao
2010-06-08  4:30             ` Eric Dumazet
2010-06-08  4:57               ` Changli Gao
2010-06-08  4:58             ` Eric Dumazet
2010-06-08  5:20               ` Changli Gao
2010-06-08  5:39                 ` Eric Dumazet
2010-06-09  9:39           ` [PATCH net-2.6 v2] " Eric Dumazet
2010-06-09 11:33             ` Jarek Poplawski
2010-06-09 11:55               ` Eric Dumazet
2010-06-11  5:54             ` David Miller
2010-06-08 12:15         ` [PATCH net-next-2.6 v2] pkt_sched: gen_estimator: kill est_lock rwlock Jarek Poplawski
2010-06-08 12:27           ` Eric Dumazet
2010-06-08 12:40             ` Jarek Poplawski
2010-06-08 19:29               ` Jarek Poplawski
2010-06-08 19:45                 ` Eric Dumazet
2010-06-08 20:24                   ` Jarek Poplawski
2010-06-08 20:52                     ` Eric Dumazet
2010-06-08 21:18                       ` Jarek Poplawski
2010-06-09  6:13                       ` pkt_sched: gen_estimator: more fuel for Jarek and Changli Eric Dumazet
2010-06-09  6:51                         ` Jarek Poplawski
2010-06-09  7:36                           ` Eric Dumazet
2010-06-09  8:14                             ` Jarek Poplawski
2010-06-09  9:40         ` [PATCH] pkt_sched: gen_kill_estimator() rcu fixes Eric Dumazet
2010-06-09  9:56           ` Eric Dumazet
2010-06-09 10:41             ` Jarek Poplawski
2010-06-09 12:09               ` Eric Dumazet
2010-06-09 12:50                 ` Jarek Poplawski
2010-06-09 13:05                   ` Eric Dumazet
2010-06-12  1:39                 ` David Miller

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).