netdev.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [PATCH] IPV4 : Move ip route cache flush (secret_rebuild) from softirq to workqueue
@ 2007-11-16 16:40 Eric Dumazet
  2007-11-16 19:50 ` Simon Horman
  2007-11-17  0:54 ` David Miller
  0 siblings, 2 replies; 12+ messages in thread
From: Eric Dumazet @ 2007-11-16 16:40 UTC (permalink / raw)
  To: David Miller; +Cc: netdev@vger.kernel.org

Hello David

This patch against net-2.6.25 is another step to get a more resistant ip route cache.

Thank you

[PATCH] IPV4 : Move ip route cache flush (secret_rebuild) from softirq to workqueue

Every 600 seconds (ip_rt_secret_interval), a softirq flush of the whole
ip route cache is triggered. On loaded machines, this can starve softirq for
many seconds and can eventually crash.

This patch moves this flush to a workqueue context, using the worker we
introduced in commit 39c90ece7565f5c47110c2fa77409d7a9478bd5b
(IPV4: Convert rt_check_expire() from softirq processing to workqueue.)

Also, immediate flushes (echo 0 >/proc/sys/net/ipv4/route/flush) are using
rt_do_flush() helper function, which takes care of rescheduling.

Next step will be to handle delayed flushes 
("echo -1 >/proc/sys/net/ipv4/route/flush"  or
"ip route flush cache")

Signed-off-by: Eric Dumazet <dada1@cosmosbay.com>

 net/ipv4/route.c |   89 ++++++++++++++++++++++++++++++++-------------
 1 file changed, 65 insertions(+), 24 deletions(-)

diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 856807c..5d74620 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -133,13 +133,14 @@ static int ip_rt_mtu_expires		= 10 * 60 * HZ;
 static int ip_rt_min_pmtu		= 512 + 20 + 20;
 static int ip_rt_min_advmss		= 256;
 static int ip_rt_secret_interval	= 10 * 60 * HZ;
+static int ip_rt_flush_expected;
 static unsigned long rt_deadline;
 
 #define RTprint(a...)	printk(KERN_DEBUG a)
 
 static struct timer_list rt_flush_timer;
-static void rt_check_expire(struct work_struct *work);
-static DECLARE_DELAYED_WORK(expires_work, rt_check_expire);
+static void rt_worker_func(struct work_struct *work);
+static DECLARE_DELAYED_WORK(expires_work, rt_worker_func);
 static struct timer_list rt_secret_timer;
 
 /*
@@ -561,7 +562,42 @@ static inline int compare_keys(struct flowi *fl1, struct flowi *fl2)
 		(fl1->iif ^ fl2->iif)) == 0;
 }
 
-static void rt_check_expire(struct work_struct *work)
+/*
+ * Perform a full scan of hash table and free all entries.
+ * Can be called by a softirq or a process.
+ * In the latter case, we want to be rescheduled if necessary
+ */
+static void rt_do_flush(int process_context)
+{
+	unsigned int i;
+	struct rtable *rth, *next;
+	unsigned long fake = 0, *flag_ptr;
+
+	flag_ptr = process_context ? &current_thread_info()->flags : &fake;
+
+	for (i = 0; i <= rt_hash_mask; i++) {
+		rth = rt_hash_table[i].chain;
+		if (rth) {
+			spin_lock_bh(rt_hash_lock_addr(i));
+			rth = rt_hash_table[i].chain;
+			rt_hash_table[i].chain = NULL;
+			spin_unlock_bh(rt_hash_lock_addr(i));
+		}
+		/*
+		 * This is a fast version of :
+		 * if (process_context && need_resched())
+		 */
+		if (unlikely(test_bit(TIF_NEED_RESCHED, flag_ptr)))
+			cond_resched();
+
+		for (; rth; rth = next) {
+			next = rth->u.dst.rt_next;
+			rt_free(rth);
+		}
+	}
+}
+
+static void rt_check_expire(void)
 {
 	static unsigned int rover;
 	unsigned int i = rover, goal;
@@ -607,33 +643,33 @@ static void rt_check_expire(struct work_struct *work)
 		spin_unlock_bh(rt_hash_lock_addr(i));
 	}
 	rover = i;
+}
+
+/*
+ * rt_worker_func() is run in process context.
+ * If a whole flush was scheduled, it is done.
+ * Else, we call rt_check_expire() to scan part of the hash table
+ */
+static void rt_worker_func(struct work_struct *work)
+{
+	if (ip_rt_flush_expected) {
+		ip_rt_flush_expected = 0;
+		rt_do_flush(1);
+	} else
+		rt_check_expire();
 	schedule_delayed_work(&expires_work, ip_rt_gc_interval);
 }
 
 /* This can run from both BH and non-BH contexts, the latter
  * in the case of a forced flush event.
  */
-static void rt_run_flush(unsigned long dummy)
+static void rt_run_flush(unsigned long process_context)
 {
-	int i;
-	struct rtable *rth, *next;
-
 	rt_deadline = 0;
 
 	get_random_bytes(&rt_hash_rnd, 4);
 
-	for (i = rt_hash_mask; i >= 0; i--) {
-		spin_lock_bh(rt_hash_lock_addr(i));
-		rth = rt_hash_table[i].chain;
-		if (rth)
-			rt_hash_table[i].chain = NULL;
-		spin_unlock_bh(rt_hash_lock_addr(i));
-
-		for (; rth; rth = next) {
-			next = rth->u.dst.rt_next;
-			rt_free(rth);
-		}
-	}
+	rt_do_flush(process_context);
 }
 
 static DEFINE_SPINLOCK(rt_flush_lock);
@@ -667,7 +703,7 @@ void rt_cache_flush(int delay)
 
 	if (delay <= 0) {
 		spin_unlock_bh(&rt_flush_lock);
-		rt_run_flush(0);
+		rt_run_flush(user_mode);
 		return;
 	}
 
@@ -678,12 +714,17 @@ void rt_cache_flush(int delay)
 	spin_unlock_bh(&rt_flush_lock);
 }
 
+/*
+ * We change rt_hash_rnd and ask next rt_worker_func() invocation
+ * to perform a flush in process context
+ */
 static void rt_secret_rebuild(unsigned long dummy)
 {
-	unsigned long now = jiffies;
-
-	rt_cache_flush(0);
-	mod_timer(&rt_secret_timer, now + ip_rt_secret_interval);
+	get_random_bytes(&rt_hash_rnd, 4);
+	ip_rt_flush_expected = 1;
+	cancel_delayed_work(&expires_work);
+	schedule_delayed_work(&expires_work, HZ/10);
+	mod_timer(&rt_secret_timer, jiffies + ip_rt_secret_interval);
 }
 
 /*

^ permalink raw reply related	[flat|nested] 12+ messages in thread

* Re: [PATCH] IPV4 : Move ip route cache flush (secret_rebuild) from softirq to workqueue
  2007-11-16 16:40 [PATCH] IPV4 : Move ip route cache flush (secret_rebuild) from softirq to workqueue Eric Dumazet
@ 2007-11-16 19:50 ` Simon Horman
  2007-11-16 21:23   ` Eric Dumazet
  2007-11-17  0:54 ` David Miller
  1 sibling, 1 reply; 12+ messages in thread
From: Simon Horman @ 2007-11-16 19:50 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: David Miller, netdev@vger.kernel.org

On Fri, Nov 16, 2007 at 05:40:27PM +0100, Eric Dumazet wrote:
> Hello David
> 
> This patch against net-2.6.25 is another step to get a more resistant ip route cache.
> 
> Thank you
> 
> [PATCH] IPV4 : Move ip route cache flush (secret_rebuild) from softirq to workqueue
> 
> Every 600 seconds (ip_rt_secret_interval), a softirq flush of the whole
> ip route cache is triggered. On loaded machines, this can starve softirq for
> many seconds and can eventually crash.
> 
> This patch moves this flush to a workqueue context, using the worker we
> intoduced in commit 39c90ece7565f5c47110c2fa77409d7a9478bd5b
> (IPV4: Convert rt_check_expire() from softirq processing to workqueue.)
> 
> Also, immediate flushes (echo 0 >/proc/sys/net/ipv4/route/flush) are using
> rt_do_flush() helper function, wich take attention to rescheduling.
> 
> Next step will be to handle delayed flushes 
> ("echo -1 >/proc/sys/net/ipv4/route/flush"  or
> "ip route flush cache")
> 
> Signed-off-by: Eric Dumazet <dada1@cosmosbay.com>
> 
>  net/ipv4/route.c |   89 ++++++++++++++++++++++++++++++++-------------
>  1 file changed, 65 insertions(+), 24 deletions(-)
> 
> diff --git a/net/ipv4/route.c b/net/ipv4/route.c
> index 856807c..5d74620 100644
> --- a/net/ipv4/route.c
> +++ b/net/ipv4/route.c
> @@ -133,13 +133,14 @@ static int ip_rt_mtu_expires		= 10 * 60 * HZ;
>  static int ip_rt_min_pmtu		= 512 + 20 + 20;
>  static int ip_rt_min_advmss		= 256;
>  static int ip_rt_secret_interval	= 10 * 60 * HZ;
> +static int ip_rt_flush_expected;
>  static unsigned long rt_deadline;
>  
>  #define RTprint(a...)	printk(KERN_DEBUG a)
>  
>  static struct timer_list rt_flush_timer;
> -static void rt_check_expire(struct work_struct *work);
> -static DECLARE_DELAYED_WORK(expires_work, rt_check_expire);
> +static void rt_worker_func(struct work_struct *work);
> +static DECLARE_DELAYED_WORK(expires_work, rt_worker_func);
>  static struct timer_list rt_secret_timer;
>  
>  /*
> @@ -561,7 +562,42 @@ static inline int compare_keys(struct flowi *fl1, struct flowi *fl2)
>  		(fl1->iif ^ fl2->iif)) == 0;
>  }
>  
> -static void rt_check_expire(struct work_struct *work)
> +/*
> + * Perform a full scan of hash table and free all entries.
> + * Can be called by a softirq or a process.
> + * In the later case, we want to be reschedule if necessary
> + */
> +static void rt_do_flush(int process_context)
> +{
> +	unsigned int i;
> +	struct rtable *rth, *next;
> +	unsigned long fake = 0, *flag_ptr;
> +
> +	flag_ptr = process_context ? &current_thread_info()->flags : &fake;
> +
> +	for (i = 0; i <= rt_hash_mask; i++) {
> +		rth = rt_hash_table[i].chain;
> +		if (rth) {
> +			spin_lock_bh(rt_hash_lock_addr(i));
> +			rth = rt_hash_table[i].chain;
> +			rt_hash_table[i].chain = NULL;
> +			spin_unlock_bh(rt_hash_lock_addr(i));
> +		}
> +		/*
> +		 * This is a fast version of :
> +		 * if (process_context && need_resched())
> +		 */
> +		if (unlikely(test_bit(TIF_NEED_RESCHED, flag_ptr)))
> +			cond_resched();
> +
> +		for (; rth; rth = next) {
> +			next = rth->u.dst.rt_next;
> +			rt_free(rth);
> +		}
> +	}
> +}

Is it ever necessary to call cond_resched() if rt_hash_table[i].chain
is NULL? If not, the following looks cleaner to my eyes:

	for (i = 0; i <= rt_hash_mask; i++) {
		rth = rt_hash_table[i].chain;
		if (!rth)
			continue;

		spin_lock_bh(rt_hash_lock_addr(i));
		rth = rt_hash_table[i].chain;
		rt_hash_table[i].chain = NULL;
		spin_unlock_bh(rt_hash_lock_addr(i));

		...

-- 
Horms, California Edition


^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH] IPV4 : Move ip route cache flush (secret_rebuild) from softirq to workqueue
  2007-11-16 19:50 ` Simon Horman
@ 2007-11-16 21:23   ` Eric Dumazet
  0 siblings, 0 replies; 12+ messages in thread
From: Eric Dumazet @ 2007-11-16 21:23 UTC (permalink / raw)
  To: Simon Horman; +Cc: David Miller, netdev@vger.kernel.org

Simon Horman a écrit :
> Is it ever neccessary to call cond_resched() if rt_hash_table[i].chain
> is NULL? If not, the following looks cleaner to my eyes:
> 
> 	for (i = 0; i <= rt_hash_mask; i++) {
> 		rth = rt_hash_table[i].chain;
> 		if (!rth)
> 			continue;
> 
> 		spin_lock_bh(rt_hash_lock_addr(i));
> 		rth = rt_hash_table[i].chain;
> 		rt_hash_table[i].chain = NULL;
> 		spin_unlock_bh(rt_hash_lock_addr(i));
> 
> 		...
> 

Well, it depends if the table is empty or not, and its size.

Some RT tasks want to be woken up every ms, for example.

With a 8MB hash table (one million slots on x86_64), the full scan is longer 
than 1 ms, even if no IRQ came and stole the cpu...


^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH] IPV4 : Move ip route cache flush (secret_rebuild) from softirq to workqueue
  2007-11-16 16:40 [PATCH] IPV4 : Move ip route cache flush (secret_rebuild) from softirq to workqueue Eric Dumazet
  2007-11-16 19:50 ` Simon Horman
@ 2007-11-17  0:54 ` David Miller
  2007-11-17  9:41   ` Eric Dumazet
  1 sibling, 1 reply; 12+ messages in thread
From: David Miller @ 2007-11-17  0:54 UTC (permalink / raw)
  To: dada1; +Cc: netdev

From: Eric Dumazet <dada1@cosmosbay.com>
Date: Fri, 16 Nov 2007 17:40:27 +0100

> +	unsigned long fake = 0, *flag_ptr;
 ...
> +		/*
> +		 * This is a fast version of :
> +		 * if (process_context && need_resched())
> +		 */
> +		if (unlikely(test_bit(TIF_NEED_RESCHED, flag_ptr)))
> +			cond_resched();

Too much exposure to internals for me to apply this, really.

I have no problem with the change conceptually at all, this
detail is just too dirty.

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH] IPV4 : Move ip route cache flush (secret_rebuild) from softirq to workqueue
  2007-11-17  0:54 ` David Miller
@ 2007-11-17  9:41   ` Eric Dumazet
  2007-11-17 14:36     ` Herbert Xu
  2007-11-20  6:43     ` David Miller
  0 siblings, 2 replies; 12+ messages in thread
From: Eric Dumazet @ 2007-11-17  9:41 UTC (permalink / raw)
  To: David Miller; +Cc: netdev

[-- Attachment #1: Type: text/plain, Size: 1545 bytes --]

David Miller a écrit :
> From: Eric Dumazet <dada1@cosmosbay.com>
> Date: Fri, 16 Nov 2007 17:40:27 +0100
> 
>> +	unsigned long fake = 0, *flag_ptr;
>  ...
>> +		/*
>> +		 * This is a fast version of :
>> +		 * if (process_context && need_resched())
>> +		 */
>> +		if (unlikely(test_bit(TIF_NEED_RESCHED, flag_ptr)))
>> +			cond_resched();
> 
> Too much exposure to internals for me to apply this, really.
> 
> I have no problem with the change conceptually at all, this
> detail is just too dirty.

Fair enough ;)

Re-sending this patch from a thunderbird on a winXP machine, I hope you won't 
mind...

Have a nice week end.

[PATCH] IPV4 : Move ip route cache flush (secret_rebuild) from softirq to 
workqueue

Every 600 seconds (ip_rt_secret_interval), a softirq flush of the whole
ip route cache is triggered. On loaded machines, this can starve softirq for
many seconds and can eventually crash.

This patch moves this flush to a workqueue context, using the worker we
introduced in commit 39c90ece7565f5c47110c2fa77409d7a9478bd5b
(IPV4: Convert rt_check_expire() from softirq processing to workqueue.)

Also, immediate flushes (echo 0 >/proc/sys/net/ipv4/route/flush) are using
rt_do_flush() helper function, which takes care of rescheduling.

Next step will be to handle delayed flushes
("echo -1 >/proc/sys/net/ipv4/route/flush"  or
"ip route flush cache")

Signed-off-by: Eric Dumazet <dada1@cosmosbay.com>

  net/ipv4/route.c |   83 +++++++++++++++++++++++++++++++--------------
  1 files changed, 59 insertions(+), 24 deletions(-)


[-- Attachment #2: ip_rt_secret.patch --]
[-- Type: text/plain, Size: 3721 bytes --]

diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 856807c..ad297f4 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -133,13 +133,14 @@ static int ip_rt_mtu_expires		= 10 * 60 * HZ;
 static int ip_rt_min_pmtu		= 512 + 20 + 20;
 static int ip_rt_min_advmss		= 256;
 static int ip_rt_secret_interval	= 10 * 60 * HZ;
+static int ip_rt_flush_expected;
 static unsigned long rt_deadline;
 
 #define RTprint(a...)	printk(KERN_DEBUG a)
 
 static struct timer_list rt_flush_timer;
-static void rt_check_expire(struct work_struct *work);
-static DECLARE_DELAYED_WORK(expires_work, rt_check_expire);
+static void rt_worker_func(struct work_struct *work);
+static DECLARE_DELAYED_WORK(expires_work, rt_worker_func);
 static struct timer_list rt_secret_timer;
 
 /*
@@ -561,7 +562,36 @@ static inline int compare_keys(struct flowi *fl1, struct flowi *fl2)
 		(fl1->iif ^ fl2->iif)) == 0;
 }
 
-static void rt_check_expire(struct work_struct *work)
+/*
+ * Perform a full scan of hash table and free all entries.
+ * Can be called by a softirq or a process.
+ * In the latter case, we want to be rescheduled if necessary
+ */
+static void rt_do_flush(int process_context)
+{
+	unsigned int i;
+	struct rtable *rth, *next;
+
+	for (i = 0; i <= rt_hash_mask; i++) {
+		if (process_context && need_resched())
+			cond_resched();
+		rth = rt_hash_table[i].chain;
+		if (!rth)
+			continue;
+
+		spin_lock_bh(rt_hash_lock_addr(i));
+		rth = rt_hash_table[i].chain;
+		rt_hash_table[i].chain = NULL;
+		spin_unlock_bh(rt_hash_lock_addr(i));
+
+		for (; rth; rth = next) {
+			next = rth->u.dst.rt_next;
+			rt_free(rth);
+		}
+	}
+}
+
+static void rt_check_expire(void)
 {
 	static unsigned int rover;
 	unsigned int i = rover, goal;
@@ -607,33 +637,33 @@ static void rt_check_expire(struct work_struct *work)
 		spin_unlock_bh(rt_hash_lock_addr(i));
 	}
 	rover = i;
+}
+
+/*
+ * rt_worker_func() is run in process context.
+ * If a whole flush was scheduled, it is done.
+ * Else, we call rt_check_expire() to scan part of the hash table
+ */
+static void rt_worker_func(struct work_struct *work)
+{
+	if (ip_rt_flush_expected) {
+		ip_rt_flush_expected = 0;
+		rt_do_flush(1);
+	} else
+		rt_check_expire();
 	schedule_delayed_work(&expires_work, ip_rt_gc_interval);
 }
 
 /* This can run from both BH and non-BH contexts, the latter
  * in the case of a forced flush event.
  */
-static void rt_run_flush(unsigned long dummy)
+static void rt_run_flush(unsigned long process_context)
 {
-	int i;
-	struct rtable *rth, *next;
-
 	rt_deadline = 0;
 
 	get_random_bytes(&rt_hash_rnd, 4);
 
-	for (i = rt_hash_mask; i >= 0; i--) {
-		spin_lock_bh(rt_hash_lock_addr(i));
-		rth = rt_hash_table[i].chain;
-		if (rth)
-			rt_hash_table[i].chain = NULL;
-		spin_unlock_bh(rt_hash_lock_addr(i));
-
-		for (; rth; rth = next) {
-			next = rth->u.dst.rt_next;
-			rt_free(rth);
-		}
-	}
+	rt_do_flush(process_context);
 }
 
 static DEFINE_SPINLOCK(rt_flush_lock);
@@ -667,7 +697,7 @@ void rt_cache_flush(int delay)
 
 	if (delay <= 0) {
 		spin_unlock_bh(&rt_flush_lock);
-		rt_run_flush(0);
+		rt_run_flush(user_mode);
 		return;
 	}
 
@@ -678,12 +708,17 @@ void rt_cache_flush(int delay)
 	spin_unlock_bh(&rt_flush_lock);
 }
 
+/*
+ * We change rt_hash_rnd and ask next rt_worker_func() invocation
+ * to perform a flush in process context
+ */
 static void rt_secret_rebuild(unsigned long dummy)
 {
-	unsigned long now = jiffies;
-
-	rt_cache_flush(0);
-	mod_timer(&rt_secret_timer, now + ip_rt_secret_interval);
+	get_random_bytes(&rt_hash_rnd, 4);
+	ip_rt_flush_expected = 1;
+	cancel_delayed_work(&expires_work);
+	schedule_delayed_work(&expires_work, HZ/10);
+	mod_timer(&rt_secret_timer, jiffies + ip_rt_secret_interval);
 }
 
 /*

^ permalink raw reply related	[flat|nested] 12+ messages in thread

* Re: [PATCH] IPV4 : Move ip route cache flush (secret_rebuild) from softirq to workqueue
  2007-11-17  9:41   ` Eric Dumazet
@ 2007-11-17 14:36     ` Herbert Xu
  2007-11-17 16:18       ` Eric Dumazet
  2007-11-20  6:43     ` David Miller
  1 sibling, 1 reply; 12+ messages in thread
From: Herbert Xu @ 2007-11-17 14:36 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: David Miller, netdev

On Sat, Nov 17, 2007 at 09:41:47AM +0000, Eric Dumazet wrote:
>
> [PATCH] IPV4 : Move ip route cache flush (secret_rebuild) from softirq to 
> workqueue

Thanks for your work on this Eric! It's very much needed.

> @@ -667,7 +697,7 @@ void rt_cache_flush(int delay)
>  
>  	if (delay <= 0) {
>  		spin_unlock_bh(&rt_flush_lock);
> -		rt_run_flush(0);
> +		rt_run_flush(user_mode);
>  		return;
>  	}

This seems to be the only potentially softirq caller of rt_run_flush.
However, I just checked the callers of it and most of them seem to
hold the RTNL which would indicate that they're in process context.

So do you know if we have any real softirq callers left?
If we do perhaps we can look at either moving them out or see
if they can cope with the flush occurring after the call returns.

If not we can get rid of the softirq special case.

Thanks,
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmV>HI~} <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH] IPV4 : Move ip route cache flush (secret_rebuild) from softirq to workqueue
  2007-11-17 14:36     ` Herbert Xu
@ 2007-11-17 16:18       ` Eric Dumazet
  2007-11-17 16:29         ` Herbert Xu
  0 siblings, 1 reply; 12+ messages in thread
From: Eric Dumazet @ 2007-11-17 16:18 UTC (permalink / raw)
  To: Herbert Xu; +Cc: David Miller, netdev

Herbert Xu a écrit :
> On Sat, Nov 17, 2007 at 09:41:47AM +0000, Eric Dumazet wrote:
>> [PATCH] IPV4 : Move ip route cache flush (secret_rebuild) from softirq to 
>> workqueue
> 
> Thanks for your work on this Eric! It's very much needed.

Thanks :)

> 
>> @@ -667,7 +697,7 @@ void rt_cache_flush(int delay)
>>  
>>  	if (delay <= 0) {
>>  		spin_unlock_bh(&rt_flush_lock);
>> -		rt_run_flush(0);
>> +		rt_run_flush(user_mode);
>>  		return;
>>  	}
> 
> This seems to be the only potentially softirq caller of rt_run_flush.
> However, I just checked the callers of it and most of them seem to
> hold the RTNL which would indicate that they're in process context.
> 
> So do you know if you we have any real softirq callers left?
> If we do perhaps we can look at either moving them out or see
> if they can cope with the flush occuring after the call returns.
> 
> If not we can get rid of the softirq special case.
> 

Unfortunately we have softirq callers left. But my goal is to move everything 
to process context yes. I choose small patches, so that they can be more 
easily reviewed and accepted.

The most common case is triggered by "ip route flush cache"
since it arms a 2 second timer (ip_rt_min_delay). When this
timer is fired (softirq), it is flushing the table.

Then, every calls to rt_cache_flush(-1) are asking the same thing, while 
rt_cache_flush(0) are synchronous (immediate flushing unless a flush already 
is in flight)

net/decnet/dn_table.c:621:                      dn_rt_cache_flush(-1);
net/decnet/dn_table.c:625:              dn_rt_cache_flush(-1);
net/decnet/dn_table.c:700:                              dn_rt_cache_flush(-1);
net/decnet/dn_table.c:707:                              dn_rt_cache_flush(-1);
net/decnet/dn_table.c:876:              dn_rt_cache_flush(-1);
net/decnet/dn_rules.c:234:      dn_rt_cache_flush(-1);
net/decnet/dn_fib.c:632:        dn_rt_cache_flush(0);
net/decnet/dn_fib.c:644:                        dn_rt_cache_flush(-1);
net/decnet/dn_fib.c:651:                                dn_rt_cache_flush(-1);
net/decnet/dn_route.c:339:void dn_rt_cache_flush(int delay)
net/ipv4/devinet.c:1344:        rt_cache_flush(0);
net/ipv4/devinet.c:1359:                        rt_cache_flush(0);
net/ipv4/devinet.c:1374:                rt_cache_flush(0);
net/ipv4/devinet.c:1387:                rt_cache_flush(0);
net/ipv4/fib_frontend.c:126:            rt_cache_flush(-1);
net/ipv4/fib_frontend.c:833:    rt_cache_flush(0);
net/ipv4/fib_frontend.c:847:            rt_cache_flush(-1);
net/ipv4/fib_frontend.c:857:                    rt_cache_flush(-1);
net/ipv4/fib_frontend.c:888:            rt_cache_flush(-1);
net/ipv4/fib_frontend.c:895:            rt_cache_flush(0);
net/ipv4/fib_rules.c:274:       rt_cache_flush(-1);
net/ipv4/fib_trie.c:1235:                               rt_cache_flush(-1);
net/ipv4/fib_trie.c:1288:       rt_cache_flush(-1);
net/ipv4/fib_trie.c:1654:               rt_cache_flush(-1);
net/ipv4/route.c:671:void rt_cache_flush(int delay)
net/ipv4/route.c:2688:  rt_cache_flush(0);
net/ipv4/route.c:2700:          rt_cache_flush(flush_delay);
net/ipv4/route.c:2720:  rt_cache_flush(delay);
net/ipv4/arp.c:1215:            rt_cache_flush(0);
net/ipv4/fib_hash.c:459:                                rt_cache_flush(-1);
net/ipv4/fib_hash.c:526:        rt_cache_flush(-1);
net/ipv4/fib_hash.c:608:                        rt_cache_flush(-1);


^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH] IPV4 : Move ip route cache flush (secret_rebuild) from softirq to workqueue
  2007-11-17 16:18       ` Eric Dumazet
@ 2007-11-17 16:29         ` Herbert Xu
  2007-11-18  0:45           ` David Miller
  0 siblings, 1 reply; 12+ messages in thread
From: Herbert Xu @ 2007-11-17 16:29 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: David Miller, netdev

On Sat, Nov 17, 2007 at 05:18:35PM +0100, Eric Dumazet wrote:
>
> >This seems to be the only potentially softirq caller of rt_run_flush.
> >However, I just checked the callers of it and most of them seem to
> >hold the RTNL which would indicate that they're in process context.
> >
> >So do you know if you we have any real softirq callers left?
> >If we do perhaps we can look at either moving them out or see
> >if they can cope with the flush occuring after the call returns.
> >
> >If not we can get rid of the softirq special case.
> 
> Unfortunatly we have softirq callers left. But my goal is to move 
> everything to process context yes. I choose small patches, so that they can 
> be more easyly reviewed and accepted.
> 
> The most common case is triggered by "ip route flush cache"
> Since it's arming a 2 second timer (ip_rt_min_delay) . When this
> timer is fired (softirq), it is flushing the table.
> 
> Then, every calls to rt_cache_flush(-1) are asking the same thing, while 
> rt_cache_flush(0) are synchronous (immediate flushing unless a flush 
> already is in flight)

Right.  Obviously the ones with non-zero arguments aren't an
issue because it's delayed anyway.

What I meant above is that which of the ones that call it with
zero are really in a softirq context.  As the ones I looked at
all seem to hold the RTNL it would suggest that most of them
are already in process context.

However, since you're already working on this as your next step
I can wait :)

Cheers,
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmV>HI~} <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH] IPV4 : Move ip route cache flush (secret_rebuild) from softirq to workqueue
  2007-11-17 16:29         ` Herbert Xu
@ 2007-11-18  0:45           ` David Miller
  2007-11-18  2:07             ` Herbert Xu
  0 siblings, 1 reply; 12+ messages in thread
From: David Miller @ 2007-11-18  0:45 UTC (permalink / raw)
  To: herbert; +Cc: dada1, netdev

From: Herbert Xu <herbert@gondor.apana.org.au>
Date: Sun, 18 Nov 2007 00:29:39 +0800

> However, since you're already working on this as your next step
> I can wait :)

Me too.

Herbert, you asked about just nop'ing out cond_resched() when we're
doing real preemption.

A lot of code goes:

	if (need_resched()) {
		/* drop some locks, etc. */
		cond_resched();
		/* reacquire locks, etc. */
	}

So it has to do something even with real preemption enabled.

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH] IPV4 : Move ip route cache flush (secret_rebuild) from softirq to workqueue
  2007-11-18  0:45           ` David Miller
@ 2007-11-18  2:07             ` Herbert Xu
  2007-11-18  2:13               ` David Miller
  0 siblings, 1 reply; 12+ messages in thread
From: Herbert Xu @ 2007-11-18  2:07 UTC (permalink / raw)
  To: David Miller; +Cc: dada1, netdev

On Sat, Nov 17, 2007 at 04:45:42PM -0800, David Miller wrote:
>
> Herbert, you asked about just nop'ing out cond_resched() when we're
> doing real preemption.
> 
> A lot of code goes:
> 
> 	if (need_resched()) {
> 		/* drop some locks, etc. */
> 		cond_resched();
> 		/* reacquire locks, etc. */
> 	}
> 
> So it has to do something even with real preemption enabled.

Actually that shouldn't be necessary.  Because things like spin_unlock
does preempt_enable which in turn does:

#define preempt_enable() \
do { \
        preempt_enable_no_resched(); \
        barrier(); \
        preempt_check_resched(); \
} while (0)

when CONFIG_PREEMPT is enabled.  So at least in this case the
cond_resched call is superfluous.

Cheers,
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmV>HI~} <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH] IPV4 : Move ip route cache flush (secret_rebuild) from softirq to workqueue
  2007-11-18  2:07             ` Herbert Xu
@ 2007-11-18  2:13               ` David Miller
  0 siblings, 0 replies; 12+ messages in thread
From: David Miller @ 2007-11-18  2:13 UTC (permalink / raw)
  To: herbert; +Cc: dada1, netdev

From: Herbert Xu <herbert@gondor.apana.org.au>
Date: Sun, 18 Nov 2007 10:07:37 +0800

> On Sat, Nov 17, 2007 at 04:45:42PM -0800, David Miller wrote:
> >
> > Herbert, you asked about just nop'ing out cond_resched() when we're
> > doing real preemption.
> > 
> > A lot of code goes:
> > 
> > 	if (need_resched()) {
> > 		/* drop some locks, etc. */
> > 		cond_resched();
> > 		/* reacquire locks, etc. */
> > 	}
> > 
> > So it has to do something even with real preemption enabled.
> 
> Actually that shouldn't be necessary.  Because things like spin_unlock
> does preempt_enable which in turn does:
> 
> #define preempt_enable() \
> do { \
>         preempt_enable_no_resched(); \
>         barrier(); \
>         preempt_check_resched(); \
> } while (0)
> 
> when CONFIG_PREEMPT is enabled.  So at least in this case the
> cond_resched call is superfluous.

I see what you mean, ok yes that would catch it.

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH] IPV4 : Move ip route cache flush (secret_rebuild) from softirq to workqueue
  2007-11-17  9:41   ` Eric Dumazet
  2007-11-17 14:36     ` Herbert Xu
@ 2007-11-20  6:43     ` David Miller
  1 sibling, 0 replies; 12+ messages in thread
From: David Miller @ 2007-11-20  6:43 UTC (permalink / raw)
  To: dada1; +Cc: netdev

From: Eric Dumazet <dada1@cosmosbay.com>
Date: Sat, 17 Nov 2007 10:41:47 +0100

> [PATCH] IPV4 : Move ip route cache flush (secret_rebuild) from softirq to workqueue
> 
> Every 600 seconds (ip_rt_secret_interval), a softirq flush of the whole
> ip route cache is triggered. On loaded machines, this can starve softirq for
> many seconds and can eventually crash.
> 
> This patch moves this flush to a workqueue context, using the worker we
> intoduced in commit 39c90ece7565f5c47110c2fa77409d7a9478bd5b
> (IPV4: Convert rt_check_expire() from softirq processing to workqueue.)
> 
> Also, immediate flushes (echo 0 >/proc/sys/net/ipv4/route/flush) are using
> rt_do_flush() helper function, wich take attention to rescheduling.
> 
> Next step will be to handle delayed flushes
> ("echo -1 >/proc/sys/net/ipv4/route/flush"  or
> "ip route flush cache")
> 
> Signed-off-by: Eric Dumazet <dada1@cosmosbay.com>

Applied, thanks Eric.

^ permalink raw reply	[flat|nested] 12+ messages in thread

end of thread, other threads:[~2007-11-20  6:43 UTC | newest]

Thread overview: 12+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2007-11-16 16:40 [PATCH] IPV4 : Move ip route cache flush (secret_rebuild) from softirq to workqueue Eric Dumazet
2007-11-16 19:50 ` Simon Horman
2007-11-16 21:23   ` Eric Dumazet
2007-11-17  0:54 ` David Miller
2007-11-17  9:41   ` Eric Dumazet
2007-11-17 14:36     ` Herbert Xu
2007-11-17 16:18       ` Eric Dumazet
2007-11-17 16:29         ` Herbert Xu
2007-11-18  0:45           ` David Miller
2007-11-18  2:07             ` Herbert Xu
2007-11-18  2:13               ` David Miller
2007-11-20  6:43     ` David Miller

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).