Re: [patch 1/2] rcu batch tuning

public inbox for linux-kernel@vger.kernel.org
 help / color / mirror / Atom feed

* Re: [patch 1/2] rcu batch tuning
  2006-01-26 18:40 [patch 0/2] RCU: fix various latency/oom issues Dipankar Sarma
@ 2006-01-26 18:41 ` Dipankar Sarma
  2006-01-26 19:33   ` Paul E. McKenney
  0 siblings, 1 reply; 23+ messages in thread
From: Dipankar Sarma @ 2006-01-26 18:41 UTC (permalink / raw)
  To: Andrew Morton; +Cc: Linus Torvalds, Paul E.McKenney, linux-kernel


This patch adds new tunables for RCU queue and finished batches.
There are two types of controls - number of completed RCU updates
invoked in a batch (blimit) and monitoring for high rate of
incoming RCUs on a cpu (qhimark, qlowmark). By default,
the per-cpu batch limit is set to a small value. If
the input RCU rate exceeds the high watermark, we do two things -
force quiescent state on all cpus and set the batch limit
of the CPU to INT_MAX. Setting batch limit to INT_MAX forces all
finished RCUs to be processed in one shot. If we have more than
INT_MAX RCUs queued up, then we have bigger problems anyway.
Once the incoming queued RCUs fall below the low watermark, the batch limit
is set to the default.

Signed-off-by: Dipankar Sarma <dipankar@in.ibm.com>
---


 include/linux/rcupdate.h |    6 +++
 kernel/rcupdate.c        |   76 +++++++++++++++++++++++++++++++++++------------
 2 files changed, 63 insertions(+), 19 deletions(-)

diff -puN include/linux/rcupdate.h~rcu-batch-tuning include/linux/rcupdate.h
--- linux-2.6.16-rc1-rcu/include/linux/rcupdate.h~rcu-batch-tuning	2006-01-25 00:09:54.000000000 +0530
+++ linux-2.6.16-rc1-rcu-dipankar/include/linux/rcupdate.h	2006-01-25 01:07:39.000000000 +0530
@@ -98,13 +98,17 @@ struct rcu_data {
 	long  	       	batch;           /* Batch # for current RCU batch */
 	struct rcu_head *nxtlist;
 	struct rcu_head **nxttail;
-	long            count; /* # of queued items */
+	long            qlen; 	 	 /* # of queued callbacks */
 	struct rcu_head *curlist;
 	struct rcu_head **curtail;
 	struct rcu_head *donelist;
 	struct rcu_head **donetail;
+	long		blimit;		 /* Upper limit on a processed batch */
 	int cpu;
 	struct rcu_head barrier;
+#ifdef CONFIG_SMP
+	long		last_rs_qlen;	 /* qlen during the last resched */
+#endif
 };
 
 DECLARE_PER_CPU(struct rcu_data, rcu_data);
diff -puN kernel/rcupdate.c~rcu-batch-tuning kernel/rcupdate.c
--- linux-2.6.16-rc1-rcu/kernel/rcupdate.c~rcu-batch-tuning	2006-01-25 00:09:54.000000000 +0530
+++ linux-2.6.16-rc1-rcu-dipankar/kernel/rcupdate.c	2006-01-25 23:08:03.000000000 +0530
@@ -67,7 +67,43 @@ DEFINE_PER_CPU(struct rcu_data, rcu_bh_d
 
 /* Fake initialization required by compiler */
 static DEFINE_PER_CPU(struct tasklet_struct, rcu_tasklet) = {NULL};
-static int maxbatch = 10000;
+static int blimit = 10;
+static int qhimark = 10000;
+static int qlowmark = 100;
+#ifdef CONFIG_SMP
+static int rsinterval = 1000;
+#endif
+
+static atomic_t rcu_barrier_cpu_count;
+static struct semaphore rcu_barrier_sema;
+static struct completion rcu_barrier_completion;
+
+#ifdef CONFIG_SMP
+static void force_quiescent_state(struct rcu_data *rdp,
+			struct rcu_ctrlblk *rcp)
+{
+	int cpu;
+	cpumask_t cpumask;
+	set_need_resched();
+	if (unlikely(rdp->qlen - rdp->last_rs_qlen > rsinterval)) {
+		rdp->last_rs_qlen = rdp->qlen;
+		/*
+		 * Don't send IPI to itself. With irqs disabled,
+		 * rdp->cpu is the current cpu.
+		 */
+		cpumask = rcp->cpumask;
+		cpu_clear(rdp->cpu, cpumask);
+		for_each_cpu_mask(cpu, cpumask)
+			smp_send_reschedule(cpu);
+	}
+}
+#else 
+static inline void force_quiescent_state(struct rcu_data *rdp,
+			struct rcu_ctrlblk *rcp)
+{
+	set_need_resched();
+}
+#endif
 
 /**
  * call_rcu - Queue an RCU callback for invocation after a grace period.
@@ -92,17 +128,13 @@ void fastcall call_rcu(struct rcu_head *
 	rdp = &__get_cpu_var(rcu_data);
 	*rdp->nxttail = head;
 	rdp->nxttail = &head->next;
-
-	if (unlikely(++rdp->count > 10000))
-		set_need_resched();
-
+	if (unlikely(++rdp->qlen > qhimark)) {
+		rdp->blimit = INT_MAX;
+		force_quiescent_state(rdp, &rcu_ctrlblk);
+	}
 	local_irq_restore(flags);
 }
 
-static atomic_t rcu_barrier_cpu_count;
-static struct semaphore rcu_barrier_sema;
-static struct completion rcu_barrier_completion;
-
 /**
  * call_rcu_bh - Queue an RCU for invocation after a quicker grace period.
  * @head: structure to be used for queueing the RCU updates.
@@ -131,12 +163,12 @@ void fastcall call_rcu_bh(struct rcu_hea
 	rdp = &__get_cpu_var(rcu_bh_data);
 	*rdp->nxttail = head;
 	rdp->nxttail = &head->next;
-	rdp->count++;
-/*
- *  Should we directly call rcu_do_batch() here ?
- *  if (unlikely(rdp->count > 10000))
- *      rcu_do_batch(rdp);
- */
+
+	if (unlikely(++rdp->qlen > qhimark)) {
+		rdp->blimit = INT_MAX;
+		force_quiescent_state(rdp, &rcu_bh_ctrlblk);
+	}
+
 	local_irq_restore(flags);
 }
 
@@ -199,10 +231,12 @@ static void rcu_do_batch(struct rcu_data
 		next = rdp->donelist = list->next;
 		list->func(list);
 		list = next;
-		rdp->count--;
-		if (++count >= maxbatch)
+		rdp->qlen--;
+		if (++count >= rdp->blimit)
 			break;
 	}
+	if (rdp->blimit == INT_MAX && rdp->qlen <= qlowmark)
+		rdp->blimit = blimit;
 	if (!rdp->donelist)
 		rdp->donetail = &rdp->donelist;
 	else
@@ -473,6 +507,7 @@ static void rcu_init_percpu_data(int cpu
 	rdp->quiescbatch = rcp->completed;
 	rdp->qs_pending = 0;
 	rdp->cpu = cpu;
+	rdp->blimit = blimit;
 }
 
 static void __devinit rcu_online_cpu(int cpu)
@@ -567,7 +602,12 @@ void synchronize_kernel(void)
 	synchronize_rcu();
 }
 
-module_param(maxbatch, int, 0);
+module_param(blimit, int, 0);
+module_param(qhimark, int, 0);
+module_param(qlowmark, int, 0);
+#ifdef CONFIG_SMP
+module_param(rsinterval, int, 0);
+#endif
 EXPORT_SYMBOL_GPL(rcu_batches_completed);
 EXPORT_SYMBOL(call_rcu);  /* WARNING: GPL-only in April 2006. */
 EXPORT_SYMBOL(call_rcu_bh);  /* WARNING: GPL-only in April 2006. */

_

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [patch 1/2] rcu batch tuning
  2006-01-26 18:41 ` [patch 1/2] rcu batch tuning Dipankar Sarma
@ 2006-01-26 19:33   ` Paul E. McKenney
  2006-01-26 19:42     ` Dipankar Sarma
  0 siblings, 1 reply; 23+ messages in thread
From: Paul E. McKenney @ 2006-01-26 19:33 UTC (permalink / raw)
  To: Dipankar Sarma; +Cc: Andrew Morton, Linus Torvalds, linux-kernel

On Fri, Jan 27, 2006 at 12:11:27AM +0530, Dipankar Sarma wrote:
> 
> This patch adds new tunables for RCU queue and finished batches.
> There are two types of controls - number of completed RCU updates
> invoked in a batch (blimit) and monitoring for high rate of
> incoming RCUs on a cpu (qhimark, qlowmark). By default,
> the per-cpu batch limit is set to a small value. If
> the input RCU rate exceeds the high watermark, we do two things -
> force quiescent state on all cpus and set the batch limit
> of the CPU to INTMAX. Setting batch limit to INTMAX forces all
> finished RCUs to be processed in one shot. If we have more than
> INTMAX RCUs queued up, then we have bigger problems anyway.
> Once the incoming queued RCUs fall below the low watermark, the batch limit
> is set to the default.

Looks good to me!  We might have to have more sophisticated adjustment
of blimit, but starting simple is definitely the right way to go.

						Thanx, Paul

Acked-by: <paulmck@us.ibm.com>
> Signed-off-by: Dipankar Sarma <dipankar@in.ibm.com>
> ---
> 
> 
>  include/linux/rcupdate.h |    6 +++
>  kernel/rcupdate.c        |   76 +++++++++++++++++++++++++++++++++++------------
>  2 files changed, 63 insertions(+), 19 deletions(-)
> 
> diff -puN include/linux/rcupdate.h~rcu-batch-tuning include/linux/rcupdate.h
> --- linux-2.6.16-rc1-rcu/include/linux/rcupdate.h~rcu-batch-tuning	2006-01-25 00:09:54.000000000 +0530
> +++ linux-2.6.16-rc1-rcu-dipankar/include/linux/rcupdate.h	2006-01-25 01:07:39.000000000 +0530
> @@ -98,13 +98,17 @@ struct rcu_data {
>  	long  	       	batch;           /* Batch # for current RCU batch */
>  	struct rcu_head *nxtlist;
>  	struct rcu_head **nxttail;
> -	long            count; /* # of queued items */
> +	long            qlen; 	 	 /* # of queued callbacks */
>  	struct rcu_head *curlist;
>  	struct rcu_head **curtail;
>  	struct rcu_head *donelist;
>  	struct rcu_head **donetail;
> +	long		blimit;		 /* Upper limit on a processed batch */
>  	int cpu;
>  	struct rcu_head barrier;
> +#ifdef CONFIG_SMP
> +	long		last_rs_qlen;	 /* qlen during the last resched */
> +#endif
>  };
>  
>  DECLARE_PER_CPU(struct rcu_data, rcu_data);
> diff -puN kernel/rcupdate.c~rcu-batch-tuning kernel/rcupdate.c
> --- linux-2.6.16-rc1-rcu/kernel/rcupdate.c~rcu-batch-tuning	2006-01-25 00:09:54.000000000 +0530
> +++ linux-2.6.16-rc1-rcu-dipankar/kernel/rcupdate.c	2006-01-25 23:08:03.000000000 +0530
> @@ -67,7 +67,43 @@ DEFINE_PER_CPU(struct rcu_data, rcu_bh_d
>  
>  /* Fake initialization required by compiler */
>  static DEFINE_PER_CPU(struct tasklet_struct, rcu_tasklet) = {NULL};
> -static int maxbatch = 10000;
> +static int blimit = 10;
> +static int qhimark = 10000;
> +static int qlowmark = 100;
> +#ifdef CONFIG_SMP
> +static int rsinterval = 1000;
> +#endif
> +
> +static atomic_t rcu_barrier_cpu_count;
> +static struct semaphore rcu_barrier_sema;
> +static struct completion rcu_barrier_completion;
> +
> +#ifdef CONFIG_SMP
> +static void force_quiescent_state(struct rcu_data *rdp,
> +			struct rcu_ctrlblk *rcp)
> +{
> +	int cpu;
> +	cpumask_t cpumask;
> +	set_need_resched();
> +	if (unlikely(rdp->qlen - rdp->last_rs_qlen > rsinterval)) {
> +		rdp->last_rs_qlen = rdp->qlen;
> +		/*
> +		 * Don't send IPI to itself. With irqs disabled,
> +		 * rdp->cpu is the current cpu.
> +		 */
> +		cpumask = rcp->cpumask;
> +		cpu_clear(rdp->cpu, cpumask);
> +		for_each_cpu_mask(cpu, cpumask)
> +			smp_send_reschedule(cpu);
> +	}
> +}
> +#else 
> +static inline void force_quiescent_state(struct rcu_data *rdp,
> +			struct rcu_ctrlblk *rcp)
> +{
> +	set_need_resched();
> +}
> +#endif
>  
>  /**
>   * call_rcu - Queue an RCU callback for invocation after a grace period.
> @@ -92,17 +128,13 @@ void fastcall call_rcu(struct rcu_head *
>  	rdp = &__get_cpu_var(rcu_data);
>  	*rdp->nxttail = head;
>  	rdp->nxttail = &head->next;
> -
> -	if (unlikely(++rdp->count > 10000))
> -		set_need_resched();
> -
> +	if (unlikely(++rdp->qlen > qhimark)) {
> +		rdp->blimit = INT_MAX;
> +		force_quiescent_state(rdp, &rcu_ctrlblk);
> +	}
>  	local_irq_restore(flags);
>  }
>  
> -static atomic_t rcu_barrier_cpu_count;
> -static struct semaphore rcu_barrier_sema;
> -static struct completion rcu_barrier_completion;
> -
>  /**
>   * call_rcu_bh - Queue an RCU for invocation after a quicker grace period.
>   * @head: structure to be used for queueing the RCU updates.
> @@ -131,12 +163,12 @@ void fastcall call_rcu_bh(struct rcu_hea
>  	rdp = &__get_cpu_var(rcu_bh_data);
>  	*rdp->nxttail = head;
>  	rdp->nxttail = &head->next;
> -	rdp->count++;
> -/*
> - *  Should we directly call rcu_do_batch() here ?
> - *  if (unlikely(rdp->count > 10000))
> - *      rcu_do_batch(rdp);
> - */
> +
> +	if (unlikely(++rdp->qlen > qhimark)) {
> +		rdp->blimit = INT_MAX;
> +		force_quiescent_state(rdp, &rcu_bh_ctrlblk);
> +	}
> +
>  	local_irq_restore(flags);
>  }
>  
> @@ -199,10 +231,12 @@ static void rcu_do_batch(struct rcu_data
>  		next = rdp->donelist = list->next;
>  		list->func(list);
>  		list = next;
> -		rdp->count--;
> -		if (++count >= maxbatch)
> +		rdp->qlen--;
> +		if (++count >= rdp->blimit)
>  			break;
>  	}
> +	if (rdp->blimit == INT_MAX && rdp->qlen <= qlowmark)
> +		rdp->blimit = blimit;
>  	if (!rdp->donelist)
>  		rdp->donetail = &rdp->donelist;
>  	else
> @@ -473,6 +507,7 @@ static void rcu_init_percpu_data(int cpu
>  	rdp->quiescbatch = rcp->completed;
>  	rdp->qs_pending = 0;
>  	rdp->cpu = cpu;
> +	rdp->blimit = blimit;
>  }
>  
>  static void __devinit rcu_online_cpu(int cpu)
> @@ -567,7 +602,12 @@ void synchronize_kernel(void)
>  	synchronize_rcu();
>  }
>  
> -module_param(maxbatch, int, 0);
> +module_param(blimit, int, 0);
> +module_param(qhimark, int, 0);
> +module_param(qlowmark, int, 0);
> +#ifdef CONFIG_SMP
> +module_param(rsinterval, int, 0);
> +#endif
>  EXPORT_SYMBOL_GPL(rcu_batches_completed);
>  EXPORT_SYMBOL(call_rcu);  /* WARNING: GPL-only in April 2006. */
>  EXPORT_SYMBOL(call_rcu_bh);  /* WARNING: GPL-only in April 2006. */
> 
> _
> 

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [patch 1/2] rcu batch tuning
  2006-01-26 19:33   ` Paul E. McKenney
@ 2006-01-26 19:42     ` Dipankar Sarma
  0 siblings, 0 replies; 23+ messages in thread
From: Dipankar Sarma @ 2006-01-26 19:42 UTC (permalink / raw)
  To: Paul E. McKenney; +Cc: Andrew Morton, Linus Torvalds, linux-kernel

On Thu, Jan 26, 2006 at 11:33:17AM -0800, Paul E. McKenney wrote:
> On Fri, Jan 27, 2006 at 12:11:27AM +0530, Dipankar Sarma wrote:
> > 
> > This patch adds new tunables for RCU queue and finished batches.
> > There are two types of controls - number of completed RCU updates
> > invoked in a batch (blimit) and monitoring for high rate of
> > incoming RCUs on a cpu (qhimark, qlowmark). By default,
> > the per-cpu batch limit is set to a small value. If
> > the input RCU rate exceeds the high watermark, we do two things -
> > force quiescent state on all cpus and set the batch limit
> > of the CPU to INTMAX. Setting batch limit to INTMAX forces all
> > finished RCUs to be processed in one shot. If we have more than
> > INTMAX RCUs queued up, then we have bigger problems anyway.
> > Once the incoming queued RCUs fall below the low watermark, the batch limit
> > is set to the default.
> 
> Looks good to me!  We might have to have more sophisticated adjustment
> of blimit, but starting simple is definitely the right way to go.

Yes. Once I am set up to do latency measurements, I will look at
the possible need for more sophisticated adjustment of blimit.

Thanks
Dipankar

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [patch 1/2] rcu batch tuning
@ 2006-01-27 19:57 Oleg Nesterov
  2006-01-27 23:42 ` Paul E. McKenney
  2006-01-28 17:08 ` Dipankar Sarma
  0 siblings, 2 replies; 23+ messages in thread
From: Oleg Nesterov @ 2006-01-27 19:57 UTC (permalink / raw)
  To: Dipankar Sarma
  Cc: linux-kernel, Paul E. McKenney, Andrew Morton, Linus Torvalds

Dipankar Sarma wrote:
>
> +static void force_quiescent_state(struct rcu_data *rdp,
> +			struct rcu_ctrlblk *rcp)
> +{
> +	int cpu;
> +	cpumask_t cpumask;
> +	set_need_resched();
> +	if (unlikely(rdp->qlen - rdp->last_rs_qlen > rsinterval)) {
> +		rdp->last_rs_qlen = rdp->qlen;
> +		/*
> +		 * Don't send IPI to itself. With irqs disabled,
> +		 * rdp->cpu is the current cpu.
> +		 */
> +		cpumask = rcp->cpumask;
> +		cpu_clear(rdp->cpu, cpumask);
> +		for_each_cpu_mask(cpu, cpumask)
> +			smp_send_reschedule(cpu);
> +	}
> +}
> [ ... snip ... ]
>
> @@ -92,17 +128,13 @@ void fastcall call_rcu(struct rcu_head *
>  	rdp = &__get_cpu_var(rcu_data);
>  	*rdp->nxttail = head;
>  	rdp->nxttail = &head->next;
> -
> -	if (unlikely(++rdp->count > 10000))
> -		set_need_resched();
> -
> +	if (unlikely(++rdp->qlen > qhimark)) {
> +		rdp->blimit = INT_MAX;
> +		force_quiescent_state(rdp, &rcu_ctrlblk);
> +	}

When ->qlen exceeds qhimark for the first time we send a reschedule IPI to
other CPUs and force_quiescent_state() records ->last_rs_qlen = ->qlen.
But we don't reset ->last_rs_qlen when ->qlen goes to 0; this means that
next time we need ++rdp->qlen > qhimark + rsinterval to force other CPUs
to pass a quiescent state, no?

Also, it seems to me it's better to have 2 counters, one for length(->donelist)
and another for length(->curlist + ->nxtlist). I think we don't need
force_quiescent_state() when all rcu callbacks are placed in ->donelist,
we only need to increase rdp->blimit in this case.

Oleg.

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [patch 1/2] rcu batch tuning
  2006-01-27 19:57 [patch " Oleg Nesterov
@ 2006-01-27 23:42 ` Paul E. McKenney
  2006-01-28 18:07   ` Oleg Nesterov
  2006-01-28 17:08 ` Dipankar Sarma
  1 sibling, 1 reply; 23+ messages in thread
From: Paul E. McKenney @ 2006-01-27 23:42 UTC (permalink / raw)
  To: Oleg Nesterov; +Cc: Dipankar Sarma, linux-kernel, Andrew Morton, Linus Torvalds

On Fri, Jan 27, 2006 at 10:57:59PM +0300, Oleg Nesterov wrote:
> Dipankar Sarma wrote:
> >
> > +static void force_quiescent_state(struct rcu_data *rdp,
> > +			struct rcu_ctrlblk *rcp)
> > +{
> > +	int cpu;
> > +	cpumask_t cpumask;
> > +	set_need_resched();
> > +	if (unlikely(rdp->qlen - rdp->last_rs_qlen > rsinterval)) {
> > +		rdp->last_rs_qlen = rdp->qlen;
> > +		/*
> > +		 * Don't send IPI to itself. With irqs disabled,
> > +		 * rdp->cpu is the current cpu.
> > +		 */
> > +		cpumask = rcp->cpumask;
> > +		cpu_clear(rdp->cpu, cpumask);
> > +		for_each_cpu_mask(cpu, cpumask)
> > +			smp_send_reschedule(cpu);
> > +	}
> > +}
> > [ ... snip ... ]
> >
> > @@ -92,17 +128,13 @@ void fastcall call_rcu(struct rcu_head *
> >  	rdp = &__get_cpu_var(rcu_data);
> >  	*rdp->nxttail = head;
> >  	rdp->nxttail = &head->next;
> > -
> > -	if (unlikely(++rdp->count > 10000))
> > -		set_need_resched();
> > -
> > +	if (unlikely(++rdp->qlen > qhimark)) {
> > +		rdp->blimit = INT_MAX;
> > +		force_quiescent_state(rdp, &rcu_ctrlblk);
> > +	}
> 
> When ->qlen exceeds qhimark for the first time we send reschedule IPI to
> other CPUs and force_quiescent_state() records ->last_rs_qlen = ->qlen.
> But we don't reset ->last_rs_qlen when ->qlen goes to 0, this means that
> next time we need ++rdp->qlen > qhimark + rsinterval to force other CPUS
> to pass quiescent state, no?

Good catch -- this could well explain Lee's continuing to hit
latency problems.  Although this would not cause the first
latency event, only subsequent ones, it seems to me that ->last_rs_qlen
should be reset whenever ->blimit is reset.

Dipankar, thoughts?

> Also, it seems to me it's better to have 2 counters, one for length(->donelist)
> and another for length(->curlist + ->nxtlist). I think we don't need
> force_quiescent_state() when all rcu callbacks are placed in ->donelist,
> we only need to increase rdp->blimit in this case.

True, currently the patch keeps the sum of the length of all three lists,
and takes both actions when the sum gets too large.  But the only way
you would get unneeded IPIs would be if callback processing was
stalled, but callback generation and grace-period processing was
still proceeding.  Seems at first glance to be an unusual corner
case, with the only downside being some extra IPIs.  Or am I missing
some aspect?

						Thanx, Paul

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [patch 1/2] rcu batch tuning
  2006-01-27 19:57 [patch " Oleg Nesterov
  2006-01-27 23:42 ` Paul E. McKenney
@ 2006-01-28 17:08 ` Dipankar Sarma
  1 sibling, 0 replies; 23+ messages in thread
From: Dipankar Sarma @ 2006-01-28 17:08 UTC (permalink / raw)
  To: Oleg Nesterov
  Cc: linux-kernel, Paul E. McKenney, Andrew Morton, Linus Torvalds

On Fri, Jan 27, 2006 at 10:57:59PM +0300, Oleg Nesterov wrote:
> Dipankar Sarma wrote:
> >
> > @@ -92,17 +128,13 @@ void fastcall call_rcu(struct rcu_head *
> >  	rdp = &__get_cpu_var(rcu_data);
> >  	*rdp->nxttail = head;
> >  	rdp->nxttail = &head->next;
> > -
> > -	if (unlikely(++rdp->count > 10000))
> > -		set_need_resched();
> > -
> > +	if (unlikely(++rdp->qlen > qhimark)) {
> > +		rdp->blimit = INT_MAX;
> > +		force_quiescent_state(rdp, &rcu_ctrlblk);
> > +	}
> 
> When ->qlen exceeds qhimark for the first time we send reschedule IPI to
> other CPUs and force_quiescent_state() records ->last_rs_qlen = ->qlen.
> But we don't reset ->last_rs_qlen when ->qlen goes to 0, this means that
> next time we need ++rdp->qlen > qhimark + rsinterval to force other CPUS
> to pass quiescent state, no?

Yes. I have fixed it in my code.

> 
> Also, it seems to me it's better to have 2 counters, one for length(->donelist)
> and another for length(->curlist + ->nxtlist). I think we don't need
> force_quiescent_state() when all rcu callbacks are placed in ->donelist,
> we only need to increase rdp->blimit in this case.

We could, but I am not convinced that ->qlen is a bad measure of
grace periods not happening. In fact, in the past, long donelists
often resulted in longer grace periods. So, no harm in forcing a
reschedule and allowing ksoftirqd to relinquish the cpu.

Thanks
Dipankar

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [patch 1/2] rcu batch tuning
  2006-01-27 23:42 ` Paul E. McKenney
@ 2006-01-28 18:07   ` Oleg Nesterov
  2006-01-30  3:30     ` Paul E. McKenney
  0 siblings, 1 reply; 23+ messages in thread
From: Oleg Nesterov @ 2006-01-28 18:07 UTC (permalink / raw)
  To: paulmck; +Cc: Dipankar Sarma, linux-kernel, Andrew Morton, Linus Torvalds

"Paul E. McKenney" wrote:
> 
> On Fri, Jan 27, 2006 at 10:57:59PM +0300, Oleg Nesterov wrote:
> >
> > When ->qlen exceeds qhimark for the first time we send reschedule IPI to
> > other CPUs and force_quiescent_state() records ->last_rs_qlen = ->qlen.
> > But we don't reset ->last_rs_qlen when ->qlen goes to 0, this means that
> > next time we need ++rdp->qlen > qhimark + rsinterval to force other CPUS
> > to pass quiescent state, no?
> 
> Good catch -- this could well explain Lee's continuing to hit
> latency problems.  Although this would not cause the first
> latency event, only subsequent ones, it seems to me that ->last_rs_qlen
> should be reset whenever ->blimit is reset.

Maybe it's better to do it another way?

struct rcu_ctrlblk {
	...
	int signaled;
	...
};

void force_quiescent_state(rdp, rcp)
{
	if (!rcp->signaled) {
		// racy, but tolerable
		rcp->signaled = 1;

		for_each_cpu_mask(cpu, cpumask)
			smp_send_reschedule(cpu);
	}
}

void rcu_start_batch(rcp, rdp)
{
	if (->next_pending && ->completed == ->cur) {
		...
		rcp->signaled = 0;
		...
	}
}

It probably also makes sense to tasklet_schedule(rcu_tasklet)
in call_rcu() when ++rdp->qlen > qhimark; this way we can detect
earlier that we need to start the next batch.

> > Also, it seems to me it's better to have 2 counters, one for length(->donelist)
> > and another for length(->curlist + ->nxtlist). I think we don't need
> > force_quiescent_state() when all rcu callbacks are placed in ->donelist,
> > we only need to increase rdp->blimit in this case.
>
> True, currently the patch keeps the sum of the length of all three lists,
> and takes both actions when the sum gets too large.  But the only way
> you would get unneeded IPIs would be if callback processing was
> stalled, but callback generation and grace-period processing was
> still proceeding.  Seems at first glance to be an unusual corner
> case, with the only downside being some extra IPIs.  Or am I missing
> some aspect?

Yes, it is probably not worth complicating the code.

Oleg.

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [patch 1/2] rcu batch tuning
  2006-01-28 18:07   ` Oleg Nesterov
@ 2006-01-30  3:30     ` Paul E. McKenney
  0 siblings, 0 replies; 23+ messages in thread
From: Paul E. McKenney @ 2006-01-30  3:30 UTC (permalink / raw)
  To: Oleg Nesterov; +Cc: Dipankar Sarma, linux-kernel, Andrew Morton, Linus Torvalds

On Sat, Jan 28, 2006 at 09:07:13PM +0300, Oleg Nesterov wrote:
> "Paul E. McKenney" wrote:
> > 
> > On Fri, Jan 27, 2006 at 10:57:59PM +0300, Oleg Nesterov wrote:
> > >
> > > When ->qlen exceeds qhimark for the first time we send reschedule IPI to
> > > other CPUs and force_quiescent_state() records ->last_rs_qlen = ->qlen.
> > > But we don't reset ->last_rs_qlen when ->qlen goes to 0, this means that
> > > next time we need ++rdp->qlen > qhimark + rsinterval to force other CPUS
> > > to pass quiescent state, no?
> > 
> > Good catch -- this could well explain Lee's continuing to hit
> > latency problems.  Although this would not cause the first
> > latency event, only subsequent ones, it seems to me that ->last_rs_qlen
> > should be reset whenever ->blimit is reset.
> 
> May be it's better to do it in other way?
> 
> struct rcu_ctrlblk {
> 	...
> 	int signaled;
> 	...
> };
> 
> void force_quiescent_state(rdp, rcp)
> {
> 	if (!rcp->signaled) {
> 		// racy, but tolerable
> 		rcp->signaled = 1;
> 
> 		for_each_cpu_mask(cpu, cpumask)
> 			smp_send_reschedule(cpu);
> 	}
> }
> 
> void rcu_start_batch(rcp, rdp)
> {
> 	if (->next_pending && ->completed == ->cur) {
> 		...
> 		rcp->signaled = 0;
> 		...
> 	}
> }

Possibly...  But the best thing would be for you and Dipankar to
get together to work out the best strategy for this.

							Thanx, Paul

> Probably it is also makes sense to tasklet_schedule(rcu_tasklet)
> in call_rcu() when ++rdp->qlen > qhimark, this way we can detect
> that we need to start the next batch earlier.
> 
> > > Also, it seems to me it's better to have 2 counters, one for length(->donelist)
> > > and another for length(->curlist + ->nxtlist). I think we don't need
> > > force_quiescent_state() when all rcu callbacks are placed in ->donelist,
> > > we only need to increase rdp->blimit in this case.
> >
> > True, currently the patch keeps the sum of the length of all three lists,
> > and takes both actions when the sum gets too large.  But the only way
> > you would get unneeded IPIs would be if callback processing was
> > stalled, but callback generation and grace-period processing was
> > still proceeding.  Seems at first glance to be an unusual corner
> > case, with the only downside being some extra IPIs.  Or am I missing
> > some aspect?
> 
> Yes, it is probably not worth to complicate the code.
> 
> Oleg.
> 

^ permalink raw reply	[flat|nested] 23+ messages in thread

* [PATCH 0/2] RCU updates
@ 2006-02-17 15:41 Dipankar Sarma
  2006-02-17 15:43 ` [PATCH 1/2] rcu batch tuning Dipankar Sarma
  0 siblings, 1 reply; 23+ messages in thread
From: Dipankar Sarma @ 2006-02-17 15:41 UTC (permalink / raw)
  To: Andrew Morton; +Cc: linux-kernel, Paul E.McKenney

Here are the patches that I have been testing that should help
some of the latency and OOM issues (file limit) that we had
discussed in the past. If the patchset looks ok,
we should queue them up in -mm for some testing before
merging. I have lightly tested the patchset on both ppc64 and x86_64
using ltp, dbench etc.

Update since the last time I published - file counting stuff uses
percpu_counter.

Thanks
Dipankar

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PATCH 1/2] rcu batch tuning
  2006-02-17 15:41 [PATCH 0/2] RCU updates Dipankar Sarma
@ 2006-02-17 15:43 ` Dipankar Sarma
  2006-02-17 15:46   ` [PATCH 2/2] fix file counting Dipankar Sarma
                     ` (2 more replies)
  0 siblings, 3 replies; 23+ messages in thread
From: Dipankar Sarma @ 2006-02-17 15:43 UTC (permalink / raw)
  To: Andrew Morton; +Cc: linux-kernel, Paul E.McKenney


This patch adds new tunables for RCU queue and finished batches.
There are two types of controls - number of completed RCU updates
invoked in a batch (blimit) and monitoring for high rate of
incoming RCUs on a cpu (qhimark, qlowmark). By default,
the per-cpu batch limit is set to a small value. If
the input RCU rate exceeds the high watermark, we do two things -
force quiescent state on all cpus and set the batch limit
of the CPU to INT_MAX. Setting batch limit to INT_MAX forces all
finished RCUs to be processed in one shot. If we have more than
INT_MAX RCUs queued up, then we have bigger problems anyway.
Once the incoming queued RCUs fall below the low watermark, the batch limit
is set to the default.

Signed-off-by: Dipankar Sarma <dipankar@in.ibm.com>
---


 include/linux/rcupdate.h |    6 +++
 kernel/rcupdate.c        |   76 +++++++++++++++++++++++++++++++++++------------
 2 files changed, 63 insertions(+), 19 deletions(-)

diff -puN include/linux/rcupdate.h~rcu-batch-tuning include/linux/rcupdate.h
--- linux-2.6.16-rc3-rcu/include/linux/rcupdate.h~rcu-batch-tuning	2006-02-15 16:00:02.000000000 +0530
+++ linux-2.6.16-rc3-rcu-dipankar/include/linux/rcupdate.h	2006-02-15 16:00:02.000000000 +0530
@@ -98,13 +98,17 @@ struct rcu_data {
 	long  	       	batch;           /* Batch # for current RCU batch */
 	struct rcu_head *nxtlist;
 	struct rcu_head **nxttail;
-	long            count; /* # of queued items */
+	long            qlen; 	 	 /* # of queued callbacks */
 	struct rcu_head *curlist;
 	struct rcu_head **curtail;
 	struct rcu_head *donelist;
 	struct rcu_head **donetail;
+	long		blimit;		 /* Upper limit on a processed batch */
 	int cpu;
 	struct rcu_head barrier;
+#ifdef CONFIG_SMP
+	long		last_rs_qlen;	 /* qlen during the last resched */
+#endif
 };
 
 DECLARE_PER_CPU(struct rcu_data, rcu_data);
diff -puN kernel/rcupdate.c~rcu-batch-tuning kernel/rcupdate.c
--- linux-2.6.16-rc3-rcu/kernel/rcupdate.c~rcu-batch-tuning	2006-02-15 16:00:02.000000000 +0530
+++ linux-2.6.16-rc3-rcu-dipankar/kernel/rcupdate.c	2006-02-15 16:00:02.000000000 +0530
@@ -67,7 +67,43 @@ DEFINE_PER_CPU(struct rcu_data, rcu_bh_d
 
 /* Fake initialization required by compiler */
 static DEFINE_PER_CPU(struct tasklet_struct, rcu_tasklet) = {NULL};
-static int maxbatch = 10000;
+static int blimit = 10;
+static int qhimark = 10000;
+static int qlowmark = 100;
+#ifdef CONFIG_SMP
+static int rsinterval = 1000;
+#endif
+
+static atomic_t rcu_barrier_cpu_count;
+static struct semaphore rcu_barrier_sema;
+static struct completion rcu_barrier_completion;
+
+#ifdef CONFIG_SMP
+static void force_quiescent_state(struct rcu_data *rdp,
+			struct rcu_ctrlblk *rcp)
+{
+	int cpu;
+	cpumask_t cpumask;
+	set_need_resched();
+	if (unlikely(rdp->qlen - rdp->last_rs_qlen > rsinterval)) {
+		rdp->last_rs_qlen = rdp->qlen;
+		/*
+		 * Don't send IPI to itself. With irqs disabled,
+		 * rdp->cpu is the current cpu.
+		 */
+		cpumask = rcp->cpumask;
+		cpu_clear(rdp->cpu, cpumask);
+		for_each_cpu_mask(cpu, cpumask)
+			smp_send_reschedule(cpu);
+	}
+}
+#else 
+static inline void force_quiescent_state(struct rcu_data *rdp,
+			struct rcu_ctrlblk *rcp)
+{
+	set_need_resched();
+}
+#endif
 
 /**
  * call_rcu - Queue an RCU callback for invocation after a grace period.
@@ -92,17 +128,13 @@ void fastcall call_rcu(struct rcu_head *
 	rdp = &__get_cpu_var(rcu_data);
 	*rdp->nxttail = head;
 	rdp->nxttail = &head->next;
-
-	if (unlikely(++rdp->count > 10000))
-		set_need_resched();
-
+	if (unlikely(++rdp->qlen > qhimark)) {
+		rdp->blimit = INT_MAX;
+		force_quiescent_state(rdp, &rcu_ctrlblk);
+	}
 	local_irq_restore(flags);
 }
 
-static atomic_t rcu_barrier_cpu_count;
-static struct semaphore rcu_barrier_sema;
-static struct completion rcu_barrier_completion;
-
 /**
  * call_rcu_bh - Queue an RCU for invocation after a quicker grace period.
  * @head: structure to be used for queueing the RCU updates.
@@ -131,12 +163,12 @@ void fastcall call_rcu_bh(struct rcu_hea
 	rdp = &__get_cpu_var(rcu_bh_data);
 	*rdp->nxttail = head;
 	rdp->nxttail = &head->next;
-	rdp->count++;
-/*
- *  Should we directly call rcu_do_batch() here ?
- *  if (unlikely(rdp->count > 10000))
- *      rcu_do_batch(rdp);
- */
+
+	if (unlikely(++rdp->qlen > qhimark)) {
+		rdp->blimit = INT_MAX;
+		force_quiescent_state(rdp, &rcu_bh_ctrlblk);
+	}
+
 	local_irq_restore(flags);
 }
 
@@ -199,10 +231,12 @@ static void rcu_do_batch(struct rcu_data
 		next = rdp->donelist = list->next;
 		list->func(list);
 		list = next;
-		rdp->count--;
-		if (++count >= maxbatch)
+		rdp->qlen--;
+		if (++count >= rdp->blimit)
 			break;
 	}
+	if (rdp->blimit == INT_MAX && rdp->qlen <= qlowmark)
+		rdp->blimit = blimit;
 	if (!rdp->donelist)
 		rdp->donetail = &rdp->donelist;
 	else
@@ -473,6 +507,7 @@ static void rcu_init_percpu_data(int cpu
 	rdp->quiescbatch = rcp->completed;
 	rdp->qs_pending = 0;
 	rdp->cpu = cpu;
+	rdp->blimit = blimit;
 }
 
 static void __devinit rcu_online_cpu(int cpu)
@@ -567,7 +602,12 @@ void synchronize_kernel(void)
 	synchronize_rcu();
 }
 
-module_param(maxbatch, int, 0);
+module_param(blimit, int, 0);
+module_param(qhimark, int, 0);
+module_param(qlowmark, int, 0);
+#ifdef CONFIG_SMP
+module_param(rsinterval, int, 0);
+#endif
 EXPORT_SYMBOL_GPL(rcu_batches_completed);
 EXPORT_SYMBOL(call_rcu);  /* WARNING: GPL-only in April 2006. */
 EXPORT_SYMBOL(call_rcu_bh);  /* WARNING: GPL-only in April 2006. */

_

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PATCH 2/2] fix file counting
  2006-02-17 15:43 ` [PATCH 1/2] rcu batch tuning Dipankar Sarma
@ 2006-02-17 15:46   ` Dipankar Sarma
  2006-02-18  9:04     ` Andrew Morton
  2006-02-17 20:33   ` [PATCH 1/2] rcu batch tuning Dipankar Sarma
  2006-02-18  8:45   ` Andrew Morton
  2 siblings, 1 reply; 23+ messages in thread
From: Dipankar Sarma @ 2006-02-17 15:46 UTC (permalink / raw)
  To: Andrew Morton; +Cc: linux-kernel, Paul E.McKenney, dada1

I have benchmarked this on an x86_64 NUMA system and see no
significant performance difference on kernbench. Tested on
both x86_64 and powerpc.

The way we do file struct accounting is not very suitable for batched
freeing. For scalability reasons, file accounting was constructor/destructor
based. This meant that nr_files was decremented only when
the object was removed from the slab cache. This is
susceptible to slab fragmentation. With RCU based file structure,
consequent batched freeing and a test program like Serge's,
we just speed this up and end up with a very fragmented slab -

llm22:~ # cat /proc/sys/fs/file-nr
587730  0       758844

At the same time, I see only 2000+ objects in the filp cache.
The following patch fixes this problem.

This patch changes the file counting by removing the filp_count_lock.
Instead we use a separate percpu counter, nr_files, for now and all
accesses to it are through get_nr_files() api. In the sysctl
handler for nr_files, we populate files_stat.nr_files before returning
to user.

Counting files as and when they are created and destroyed (as opposed
to inside slab) allows us to correctly count open files with RCU.

Signed-off-by: Dipankar Sarma <dipankar@in.ibm.com>
---




 fs/dcache.c          |    2 -
 fs/file_table.c      |   80 +++++++++++++++++++++++++++++++--------------------
 include/linux/file.h |    2 -
 include/linux/fs.h   |    2 +
 kernel/sysctl.c      |    5 ++-
 net/unix/af_unix.c   |    2 -
 6 files changed, 57 insertions(+), 36 deletions(-)

diff -puN fs/dcache.c~fix-file-counting fs/dcache.c
--- linux-2.6.16-rc3-rcu/fs/dcache.c~fix-file-counting	2006-02-15 16:00:02.000000000 +0530
+++ linux-2.6.16-rc3-rcu-dipankar/fs/dcache.c	2006-02-15 16:00:02.000000000 +0530
@@ -1736,7 +1736,7 @@ void __init vfs_caches_init(unsigned lon
 			SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL, NULL);
 
 	filp_cachep = kmem_cache_create("filp", sizeof(struct file), 0,
-			SLAB_HWCACHE_ALIGN|SLAB_PANIC, filp_ctor, filp_dtor);
+			SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL, NULL);
 
 	dcache_init(mempages);
 	inode_init(mempages);
diff -puN fs/file_table.c~fix-file-counting fs/file_table.c
--- linux-2.6.16-rc3-rcu/fs/file_table.c~fix-file-counting	2006-02-15 16:00:02.000000000 +0530
+++ linux-2.6.16-rc3-rcu-dipankar/fs/file_table.c	2006-02-16 21:52:35.000000000 +0530
@@ -5,6 +5,7 @@
  *  Copyright (C) 1997 David S. Miller (davem@caip.rutgers.edu)
  */
 
+#include <linux/config.h>
 #include <linux/string.h>
 #include <linux/slab.h>
 #include <linux/file.h>
@@ -19,52 +20,67 @@
 #include <linux/capability.h>
 #include <linux/cdev.h>
 #include <linux/fsnotify.h>
-
+#include <linux/sysctl.h>
+#include <linux/percpu_counter.h>
+#include <asm/atomic.h>
+  
 /* sysctl tunables... */
 struct files_stat_struct files_stat = {
 	.max_files = NR_FILE
 };
 
-EXPORT_SYMBOL(files_stat); /* Needed by unix.o */
-
 /* public. Not pretty! */
- __cacheline_aligned_in_smp DEFINE_SPINLOCK(files_lock);
+__cacheline_aligned_in_smp DEFINE_SPINLOCK(files_lock);
+  
+static struct percpu_counter nr_files __cacheline_aligned_in_smp;
 
-static DEFINE_SPINLOCK(filp_count_lock);
+static inline void file_free_rcu(struct rcu_head *head)
+{
+	struct file *f =  container_of(head, struct file, f_u.fu_rcuhead);
+	kmem_cache_free(filp_cachep, f);
+}
 
-/* slab constructors and destructors are called from arbitrary
- * context and must be fully threaded - use a local spinlock
- * to protect files_stat.nr_files
+static inline void file_free(struct file *f)
+{
+	percpu_counter_mod(&nr_files, -1L);
+	call_rcu(&f->f_u.fu_rcuhead, file_free_rcu);
+}
+  
+/*
+ * Return the total number of open files in the system
  */
-void filp_ctor(void *objp, struct kmem_cache *cachep, unsigned long cflags)
+int get_nr_files(void)
 {
-	if ((cflags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) ==
-	    SLAB_CTOR_CONSTRUCTOR) {
-		unsigned long flags;
-		spin_lock_irqsave(&filp_count_lock, flags);
-		files_stat.nr_files++;
-		spin_unlock_irqrestore(&filp_count_lock, flags);
-	}
+	return percpu_counter_read_positive(&nr_files);
 }
 
-void filp_dtor(void *objp, struct kmem_cache *cachep, unsigned long dflags)
+/*
+ * Return the maximum number of open files in the system
+ */
+int get_max_files(void)
 {
-	unsigned long flags;
-	spin_lock_irqsave(&filp_count_lock, flags);
-	files_stat.nr_files--;
-	spin_unlock_irqrestore(&filp_count_lock, flags);
+	return files_stat.max_files;
 }
 
-static inline void file_free_rcu(struct rcu_head *head)
+EXPORT_SYMBOL(get_max_files);
+
+/*
+ * Handle nr_files sysctl
+ */
+#if defined(CONFIG_SYSCTL) && defined(CONFIG_PROC_FS)
+int proc_nr_files(ctl_table *table, int write, struct file *filp,
+                     void __user *buffer, size_t *lenp, loff_t *ppos)
 {
-	struct file *f =  container_of(head, struct file, f_u.fu_rcuhead);
-	kmem_cache_free(filp_cachep, f);
+	files_stat.nr_files = get_nr_files();
+	return proc_dointvec(table, write, filp, buffer, lenp, ppos);
 }
-
-static inline void file_free(struct file *f)
+#else
+int proc_nr_files(ctl_table *table, int write, struct file *filp,
+                     void __user *buffer, size_t *lenp, loff_t *ppos)
 {
-	call_rcu(&f->f_u.fu_rcuhead, file_free_rcu);
+	return -ENOSYS;
 }
+#endif
 
 /* Find an unused file structure and return a pointer to it.
  * Returns NULL, if there are no more free file structures or
@@ -78,7 +94,7 @@ struct file *get_empty_filp(void)
 	/*
 	 * Privileged users can go above max_files
 	 */
-	if (files_stat.nr_files >= files_stat.max_files &&
+	if (get_nr_files() >= files_stat.max_files &&
 				!capable(CAP_SYS_ADMIN))
 		goto over;
 
@@ -86,6 +102,7 @@ struct file *get_empty_filp(void)
 	if (f == NULL)
 		goto fail;
 
+	percpu_counter_mod(&nr_files, 1L);
 	memset(f, 0, sizeof(*f));
 	if (security_file_alloc(f))
 		goto fail_sec;
@@ -101,10 +118,10 @@ struct file *get_empty_filp(void)
 
 over:
 	/* Ran out of filps - report that */
-	if (files_stat.nr_files > old_max) {
+	if (get_nr_files() > old_max) {
 		printk(KERN_INFO "VFS: file-max limit %d reached\n",
-					files_stat.max_files);
-		old_max = files_stat.nr_files;
+					get_max_files());
+		old_max = get_nr_files();
 	}
 	goto fail;
 
@@ -276,4 +293,5 @@ void __init files_init(unsigned long mem
 	if (files_stat.max_files < NR_FILE)
 		files_stat.max_files = NR_FILE;
 	files_defer_init();
+	percpu_counter_init(&nr_files);
 } 
diff -puN include/linux/file.h~fix-file-counting include/linux/file.h
--- linux-2.6.16-rc3-rcu/include/linux/file.h~fix-file-counting	2006-02-15 16:00:02.000000000 +0530
+++ linux-2.6.16-rc3-rcu-dipankar/include/linux/file.h	2006-02-15 16:00:02.000000000 +0530
@@ -60,8 +60,6 @@ extern void put_filp(struct file *);
 extern int get_unused_fd(void);
 extern void FASTCALL(put_unused_fd(unsigned int fd));
 struct kmem_cache;
-extern void filp_ctor(void * objp, struct kmem_cache *cachep, unsigned long cflags);
-extern void filp_dtor(void * objp, struct kmem_cache *cachep, unsigned long dflags);
 
 extern struct file ** alloc_fd_array(int);
 extern void free_fd_array(struct file **, int);
diff -puN include/linux/fs.h~fix-file-counting include/linux/fs.h
--- linux-2.6.16-rc3-rcu/include/linux/fs.h~fix-file-counting	2006-02-15 16:00:02.000000000 +0530
+++ linux-2.6.16-rc3-rcu-dipankar/include/linux/fs.h	2006-02-15 16:00:02.000000000 +0530
@@ -35,6 +35,8 @@ struct files_stat_struct {
 	int max_files;		/* tunable */
 };
 extern struct files_stat_struct files_stat;
+extern int get_nr_files(void);
+extern int get_max_files(void);
 
 struct inodes_stat_t {
 	int nr_inodes;
diff -puN kernel/sysctl.c~fix-file-counting kernel/sysctl.c
--- linux-2.6.16-rc3-rcu/kernel/sysctl.c~fix-file-counting	2006-02-15 16:00:02.000000000 +0530
+++ linux-2.6.16-rc3-rcu-dipankar/kernel/sysctl.c	2006-02-15 16:00:02.000000000 +0530
@@ -52,6 +52,9 @@
 #include <linux/nfs_fs.h>
 #endif
 
+extern int proc_nr_files(ctl_table *table, int write, struct file *filp,
+                     void __user *buffer, size_t *lenp, loff_t *ppos);
+
 #if defined(CONFIG_SYSCTL)
 
 /* External variables not in a header file. */
@@ -921,7 +924,7 @@ static ctl_table fs_table[] = {
 		.data		= &files_stat,
 		.maxlen		= 3*sizeof(int),
 		.mode		= 0444,
-		.proc_handler	= &proc_dointvec,
+		.proc_handler	= &proc_nr_files,
 	},
 	{
 		.ctl_name	= FS_MAXFILE,
diff -puN net/unix/af_unix.c~fix-file-counting net/unix/af_unix.c
--- linux-2.6.16-rc3-rcu/net/unix/af_unix.c~fix-file-counting	2006-02-15 16:00:02.000000000 +0530
+++ linux-2.6.16-rc3-rcu-dipankar/net/unix/af_unix.c	2006-02-15 16:00:02.000000000 +0530
@@ -547,7 +547,7 @@ static struct sock * unix_create1(struct
 	struct sock *sk = NULL;
 	struct unix_sock *u;
 
-	if (atomic_read(&unix_nr_socks) >= 2*files_stat.max_files)
+	if (atomic_read(&unix_nr_socks) >= 2*get_max_files())
 		goto out;
 
 	sk = sk_alloc(PF_UNIX, GFP_KERNEL, &unix_proto, 1);

_

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PATCH 1/2] rcu batch tuning
  2006-02-17 15:43 ` [PATCH 1/2] rcu batch tuning Dipankar Sarma
  2006-02-17 15:46   ` [PATCH 2/2] fix file counting Dipankar Sarma
@ 2006-02-17 20:33   ` Dipankar Sarma
  2006-02-18  8:45   ` Andrew Morton
  2 siblings, 0 replies; 23+ messages in thread
From: Dipankar Sarma @ 2006-02-17 20:33 UTC (permalink / raw)
  To: Andrew Morton; +Cc: linux-kernel, Paul E.McKenney

Andrew,

The earlier version of this patch was against 2.6.16-rc3 and didn't
apply cleanly to 2.6.16-rc3-mm1. This version applies cleanly
to 2.6.16-rc3-mm1.

This patch adds new tunables for RCU queue and finished batches.
There are two types of controls - number of completed RCU updates
invoked in a batch (blimit) and monitoring for high rate of
incoming RCUs on a cpu (qhimark, qlowmark). By default,
the per-cpu batch limit is set to a small value. If
the input RCU rate exceeds the high watermark, we do two things -
force quiescent state on all cpus and set the batch limit
of the CPU to INT_MAX. Setting batch limit to INT_MAX forces all
finished RCUs to be processed in one shot. If we have more than
INT_MAX RCUs queued up, then we have bigger problems anyway.
Once the incoming queued RCUs fall below the low watermark, the batch limit
is set to the default.

Signed-off-by: Dipankar Sarma <dipankar@in.ibm.com>
---


 include/linux/rcupdate.h |    6 +++
 kernel/rcupdate.c        |   74 ++++++++++++++++++++++++++++++++++++-----------
 2 files changed, 63 insertions(+), 17 deletions(-)

diff -puN include/linux/rcupdate.h~rcu-batch-tuning include/linux/rcupdate.h
--- linux-2.6.16-rc3-mm1-rcu/include/linux/rcupdate.h~rcu-batch-tuning	2006-02-18 01:18:24.000000000 +0530
+++ linux-2.6.16-rc3-mm1-rcu-dipankar/include/linux/rcupdate.h	2006-02-18 01:18:24.000000000 +0530
@@ -98,13 +98,17 @@ struct rcu_data {
 	long  	       	batch;           /* Batch # for current RCU batch */
 	struct rcu_head *nxtlist;
 	struct rcu_head **nxttail;
-	long            count; /* # of queued items */
+	long            qlen; 	 	 /* # of queued callbacks */
 	struct rcu_head *curlist;
 	struct rcu_head **curtail;
 	struct rcu_head *donelist;
 	struct rcu_head **donetail;
+	long		blimit;		 /* Upper limit on a processed batch */
 	int cpu;
 	struct rcu_head barrier;
+#ifdef CONFIG_SMP
+	long		last_rs_qlen;	 /* qlen during the last resched */
+#endif
 };
 
 DECLARE_PER_CPU(struct rcu_data, rcu_data);
diff -puN kernel/rcupdate.c~rcu-batch-tuning kernel/rcupdate.c
--- linux-2.6.16-rc3-mm1-rcu/kernel/rcupdate.c~rcu-batch-tuning	2006-02-18 01:18:24.000000000 +0530
+++ linux-2.6.16-rc3-mm1-rcu-dipankar/kernel/rcupdate.c	2006-02-18 01:24:07.000000000 +0530
@@ -68,7 +68,43 @@ DEFINE_PER_CPU(struct rcu_data, rcu_bh_d
 
 /* Fake initialization required by compiler */
 static DEFINE_PER_CPU(struct tasklet_struct, rcu_tasklet) = {NULL};
-static int maxbatch = 10000;
+static int blimit = 10;
+static int qhimark = 10000;
+static int qlowmark = 100;
+#ifdef CONFIG_SMP
+static int rsinterval = 1000;
+#endif
+
+static atomic_t rcu_barrier_cpu_count;
+static DEFINE_MUTEX(rcu_barrier_mutex);
+static struct completion rcu_barrier_completion;
+
+#ifdef CONFIG_SMP
+static void force_quiescent_state(struct rcu_data *rdp,
+			struct rcu_ctrlblk *rcp)
+{
+	int cpu;
+	cpumask_t cpumask;
+	set_need_resched();
+	if (unlikely(rdp->qlen - rdp->last_rs_qlen > rsinterval)) {
+		rdp->last_rs_qlen = rdp->qlen;
+		/*
+		 * Don't send IPI to itself. With irqs disabled,
+		 * rdp->cpu is the current cpu.
+		 */
+		cpumask = rcp->cpumask;
+		cpu_clear(rdp->cpu, cpumask);
+		for_each_cpu_mask(cpu, cpumask)
+			smp_send_reschedule(cpu);
+	}
+}
+#else 
+static inline void force_quiescent_state(struct rcu_data *rdp,
+			struct rcu_ctrlblk *rcp)
+{
+	set_need_resched();
+}
+#endif
 
 /**
  * call_rcu - Queue an RCU callback for invocation after a grace period.
@@ -94,16 +130,14 @@ void fastcall call_rcu(struct rcu_head *
 	*rdp->nxttail = head;
 	rdp->nxttail = &head->next;
 
-	if (unlikely(++rdp->count > 10000))
-		set_need_resched();
+	if (unlikely(++rdp->qlen > qhimark)) {
+		rdp->blimit = INT_MAX;
+		force_quiescent_state(rdp, &rcu_ctrlblk);
+	}
 
 	local_irq_restore(flags);
 }
 
-static atomic_t rcu_barrier_cpu_count;
-static DEFINE_MUTEX(rcu_barrier_mutex);
-static struct completion rcu_barrier_completion;
-
 /**
  * call_rcu_bh - Queue an RCU for invocation after a quicker grace period.
  * @head: structure to be used for queueing the RCU updates.
@@ -132,12 +166,12 @@ void fastcall call_rcu_bh(struct rcu_hea
 	rdp = &__get_cpu_var(rcu_bh_data);
 	*rdp->nxttail = head;
 	rdp->nxttail = &head->next;
-	rdp->count++;
-/*
- *  Should we directly call rcu_do_batch() here ?
- *  if (unlikely(rdp->count > 10000))
- *      rcu_do_batch(rdp);
- */
+
+	if (unlikely(++rdp->qlen > qhimark)) {
+		rdp->blimit = INT_MAX;
+		force_quiescent_state(rdp, &rcu_bh_ctrlblk);
+	}
+
 	local_irq_restore(flags);
 }
 
@@ -200,10 +234,12 @@ static void rcu_do_batch(struct rcu_data
 		next = rdp->donelist = list->next;
 		list->func(list);
 		list = next;
-		rdp->count--;
-		if (++count >= maxbatch)
+		rdp->qlen--;
+		if (++count >= rdp->blimit)
 			break;
 	}
+	if (rdp->blimit == INT_MAX && rdp->qlen <= qlowmark)
+		rdp->blimit = blimit;
 	if (!rdp->donelist)
 		rdp->donetail = &rdp->donelist;
 	else
@@ -473,6 +509,7 @@ static void rcu_init_percpu_data(int cpu
 	rdp->quiescbatch = rcp->completed;
 	rdp->qs_pending = 0;
 	rdp->cpu = cpu;
+	rdp->blimit = blimit;
 }
 
 static void __devinit rcu_online_cpu(int cpu)
@@ -566,7 +603,12 @@ void synchronize_kernel(void)
 	synchronize_rcu();
 }
 
-module_param(maxbatch, int, 0);
+module_param(blimit, int, 0);
+module_param(qhimark, int, 0);
+module_param(qlowmark, int, 0);
+#ifdef CONFIG_SMP
+module_param(rsinterval, int, 0);
+#endif
 EXPORT_SYMBOL_GPL(rcu_batches_completed);
 EXPORT_SYMBOL_GPL_FUTURE(call_rcu);	/* WARNING: GPL-only in April 2006. */
 EXPORT_SYMBOL_GPL_FUTURE(call_rcu_bh);	/* WARNING: GPL-only in April 2006. */

_

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PATCH 1/2] rcu batch tuning
  2006-02-17 15:43 ` [PATCH 1/2] rcu batch tuning Dipankar Sarma
  2006-02-17 15:46   ` [PATCH 2/2] fix file counting Dipankar Sarma
  2006-02-17 20:33   ` [PATCH 1/2] rcu batch tuning Dipankar Sarma
@ 2006-02-18  8:45   ` Andrew Morton
  2006-02-18  9:15     ` Dipankar Sarma
  2 siblings, 1 reply; 23+ messages in thread
From: Andrew Morton @ 2006-02-18  8:45 UTC (permalink / raw)
  To: dipankar; +Cc: linux-kernel, paulmck

Dipankar Sarma <dipankar@in.ibm.com> wrote:
>
> -module_param(maxbatch, int, 0);
>  +module_param(blimit, int, 0);
>  +module_param(qhimark, int, 0);
>  +module_param(qlowmark, int, 0);
>  +#ifdef CONFIG_SMP
>  +module_param(rsinterval, int, 0);
>  +#endif

It's a bit unusual to add boot-time tunables via module_param, but there's
no law against it.

But you do get arrested for not adding them to
Documentation/kernel-parameters.txt.  That's if you think they're permanent
(I hope they aren't).  If they are, they'll probably need a more extensive
description than kernel-parameters.txt entries normally provide.

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PATCH 2/2] fix file counting
  2006-02-17 15:46   ` [PATCH 2/2] fix file counting Dipankar Sarma
@ 2006-02-18  9:04     ` Andrew Morton
  2006-02-18  9:25       ` Dipankar Sarma
  0 siblings, 1 reply; 23+ messages in thread
From: Andrew Morton @ 2006-02-18  9:04 UTC (permalink / raw)
  To: dipankar; +Cc: linux-kernel, paulmck, dada1

Dipankar Sarma <dipankar@in.ibm.com> wrote:
>
> I have benchmarked this on an x86_64 NUMA system and see no
>  significant performance difference on kernbench. Tested on
>  both x86_64 and powerpc.
> 
>  The way we do file struct accounting is not very suitable for batched
>  freeing. For scalability reasons, file accounting was constructor/destructor
>  based. This meant that nr_files was decremented only when
>  the object was removed from the slab cache. This is
>  susceptible to slab fragmentation. With RCU based file structure,
>  consequent batched freeing and a test program like Serge's,
>  we just speed this up and end up with a very fragmented slab -
> 
>  llm22:~ # cat /proc/sys/fs/file-nr
>  587730  0       758844
> 
>  At the same time, I see only a 2000+ objects in filp cache.
>  The following patch I fixes this problem. 
> 
>  This patch changes the file counting by removing the filp_count_lock.
>  Instead we use a separate percpu counter, nr_files, for now and all
>  accesses to it are through get_nr_files() api. In the sysctl
>  handler for nr_files, we populate files_stat.nr_files before returning
>  to user.
> 
>  Counting files as an when they are created and destroyed (as opposed
>  to inside slab) allows us to correctly count open files with RCU.

Fair enough.

What do you think of these changes?




From: Andrew Morton <akpm@osdl.org>

- Nuke the blank line between "}" and EXPORT_SYMBOL().  That's never seemed
  pointful to me.

- Make the get_max_files export use _GPL - only unix.ko uses it.

- Use `-1' in the arg to percpu_counter_mod() rather than `-1L'.  The
  compiler will dtrt and we shouldn't be peering inside percpu_counter
  internals here anyway.

- Scrub that - use percpu_counter_dec() and percpu_counter_inc().

- percpu_counters can be inaccurate on big SMP.  Before we actually fail a
  get_empty_filp() attempt, use the (new in -mm) expensive
  percpu_counter_sum() to check whether we're really over the limit.

- Make get_nr_files() static.  Which is just as well - any callers might
  want the percpu_counter_sum() treatment.  In which case we'd be better off
  exporting some

	bool are_files_over_limit(how_many_i_want);

  API.

Cc: Dipankar Sarma <dipankar@in.ibm.com>
Cc: "Paul E. McKenney" <paulmck@us.ibm.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
---

 fs/file_table.c    |   21 +++++++++++++--------
 include/linux/fs.h |    1 -
 2 files changed, 13 insertions(+), 9 deletions(-)

diff -puN fs/file_table.c~fix-file-counting-fixes fs/file_table.c
--- devel/fs/file_table.c~fix-file-counting-fixes	2006-02-18 01:02:43.000000000 -0800
+++ devel-akpm/fs/file_table.c	2006-02-18 01:02:43.000000000 -0800
@@ -22,6 +22,7 @@
 #include <linux/fsnotify.h>
 #include <linux/sysctl.h>
 #include <linux/percpu_counter.h>
+
 #include <asm/atomic.h>
 
 /* sysctl tunables... */
@@ -42,14 +43,14 @@ static inline void file_free_rcu(struct 
 
 static inline void file_free(struct file *f)
 {
-	percpu_counter_mod(&nr_files, -1L);
+	percpu_counter_dec(&nr_files);
 	call_rcu(&f->f_u.fu_rcuhead, file_free_rcu);
 }
 
 /*
  * Return the total number of open files in the system
  */
-int get_nr_files(void)
+static int get_nr_files(void)
 {
 	return percpu_counter_read_positive(&nr_files);
 }
@@ -61,8 +62,7 @@ int get_max_files(void)
 {
 	return files_stat.max_files;
 }
-
-EXPORT_SYMBOL(get_max_files);
+EXPORT_SYMBOL_GPL(get_max_files);
 
 /*
  * Handle nr_files sysctl
@@ -95,15 +95,20 @@ struct file *get_empty_filp(void)
 	/*
 	 * Privileged users can go above max_files
 	 */
-	if (get_nr_files() >= files_stat.max_files &&
-				!capable(CAP_SYS_ADMIN))
-		goto over;
+	if (get_nr_files() >= files_stat.max_files && !capable(CAP_SYS_ADMIN)) {
+		/*
+		 * percpu_counters are inaccurate.  Do an expensive check before
+		 * we go and fail.
+		 */
+		if (percpu_counter_sum(&nr_files) >= files_stat.max_files)
+			goto over;
+	}
 
 	f = kmem_cache_alloc(filp_cachep, GFP_KERNEL);
 	if (f == NULL)
 		goto fail;
 
-	percpu_counter_mod(&nr_files, 1L);
+	percpu_counter_inc(&nr_files);
 	memset(f, 0, sizeof(*f));
 	if (security_file_alloc(f))
 		goto fail_sec;
diff -puN include/linux/fs.h~fix-file-counting-fixes include/linux/fs.h
--- devel/include/linux/fs.h~fix-file-counting-fixes	2006-02-18 01:02:43.000000000 -0800
+++ devel-akpm/include/linux/fs.h	2006-02-18 01:02:43.000000000 -0800
@@ -35,7 +35,6 @@ struct files_stat_struct {
 	int max_files;		/* tunable */
 };
 extern struct files_stat_struct files_stat;
-extern int get_nr_files(void);
 extern int get_max_files(void);
 
 struct inodes_stat_t {
_


^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PATCH 1/2] rcu batch tuning
  2006-02-18  8:45   ` Andrew Morton
@ 2006-02-18  9:15     ` Dipankar Sarma
  0 siblings, 0 replies; 23+ messages in thread
From: Dipankar Sarma @ 2006-02-18  9:15 UTC (permalink / raw)
  To: Andrew Morton; +Cc: linux-kernel, paulmck

On Sat, Feb 18, 2006 at 12:45:51AM -0800, Andrew Morton wrote:
> Dipankar Sarma <dipankar@in.ibm.com> wrote:
> >
> > -module_param(maxbatch, int, 0);
> >  +module_param(blimit, int, 0);
> >  +module_param(qhimark, int, 0);
> >  +module_param(qlowmark, int, 0);
> >  +#ifdef CONFIG_SMP
> >  +module_param(rsinterval, int, 0);
> >  +#endif
> 
> It's a bit unusual to add boot-time tunables via module_param, but there's
> no law against it.
> 
> But you do get arrested for not adding them to
> Documentation/kernel-parameters.txt.  That's if you think they're permanent
> (I hope they aren't).  If they are, they'll probably need a more extensive
> description than kernel-parameters.txt entries normally provide.

I hope that we will not need that many tunables eventually. But my
theory has been that widespread use of, and experiments with,
the tunables in the event of OOM and latency problems will allow
us to figure out what kind of automatic tuning really works.

Regardless, I think there should be documentation. I will send
a documentation patch.

Thanks
Dipankar


^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PATCH 2/2] fix file counting
  2006-02-18  9:04     ` Andrew Morton
@ 2006-02-18  9:25       ` Dipankar Sarma
  2006-02-18  9:45         ` Andrew Morton
  2006-02-18 12:14         ` Christoph Hellwig
  0 siblings, 2 replies; 23+ messages in thread
From: Dipankar Sarma @ 2006-02-18  9:25 UTC (permalink / raw)
  To: Andrew Morton; +Cc: linux-kernel, paulmck, dada1

On Sat, Feb 18, 2006 at 01:04:14AM -0800, Andrew Morton wrote:
> Dipankar Sarma <dipankar@in.ibm.com> wrote:
> Fair enough.
> 
> What do you think of these changes?
> 
> 
> - Nuke the blank line between "}" and EXPORT_SYMBOL().  That's never seemed
>   pointful to me.

Sounds good.

> 
> - Make the get_max_files export use _GPL - only unix.ko uses it.

Always good. I just didn't want to be the one :)

> - Use `-1' in the arg to percpu_counter_mod() rather than `-1L'.  The
>   compiler will dtrt and we shouldn't be peering inside percpu_counter
>   internals here anyway.
> 
> - Scrub that - use percpu_counter_dec() and percpu_counter_inc().

Gah! I missed those APIs. Makes perfect sense.

> 
> - percpu_counters can be inaccurate on big SMP.  Before we actually fail a
>   get_empty_filp() attempt, use the (new in -mm) expensive
>   percpu_counter_sum() to check whether we're really over the limit.


> -	if (get_nr_files() >= files_stat.max_files &&
> -				!capable(CAP_SYS_ADMIN))
> -		goto over;
> +	if (get_nr_files() >= files_stat.max_files && !capable(CAP_SYS_ADMIN)) {
> +		/*
> +		 * percpu_counters are inaccurate.  Do an expensive check before
> +		 * we go and fail.
> +		 */
> +		if (percpu_counter_sum(&nr_files) >= files_stat.max_files)
> +			goto over;
> +	}

Slight optimization -

	if (get_nr_files() >= files_stat.max_files) {
		if (capable(CAP_SYS_ADMIN)) {
		/*
		 * percpu_counters are inaccurate.  Do an expensive check before
		 * we go and fail.
		 */
			if (percpu_counter_sum(&nr_files) >= 
						files_stat.max_files)
				goto over;
		} else 
			goto over;
	}
	
>  
> 
> - Make get_nr_files() static.  Which is just as well - any callers might
>   want the percpu_counter_sum() treatment.  In which case we'd be better off
>   exporting some
> 
> 	bool are_files_over_limit(how_many_i_want);

Well, as of now there is none (xfs use was removed a few months ago).
So, I removed the EXPORT_SYMBOL for get_nr_files() and forgot to
make it static. Your patch does the right thing.

Thanks
Dipankar

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PATCH 2/2] fix file counting
  2006-02-18  9:25       ` Dipankar Sarma
@ 2006-02-18  9:45         ` Andrew Morton
  2006-02-18 10:06           ` Dipankar Sarma
  2006-02-18 12:14         ` Christoph Hellwig
  1 sibling, 1 reply; 23+ messages in thread
From: Andrew Morton @ 2006-02-18  9:45 UTC (permalink / raw)
  To: dipankar; +Cc: linux-kernel, paulmck, dada1

Dipankar Sarma <dipankar@in.ibm.com> wrote:
>
>  > -	if (get_nr_files() >= files_stat.max_files &&
>  > -				!capable(CAP_SYS_ADMIN))
>  > -		goto over;
>  > +	if (get_nr_files() >= files_stat.max_files && !capable(CAP_SYS_ADMIN)) {
>  > +		/*
>  > +		 * percpu_counters are inaccurate.  Do an expensive check before
>  > +		 * we go and fail.
>  > +		 */
>  > +		if (percpu_counter_sum(&nr_files) >= files_stat.max_files)
>  > +			goto over;
>  > +	}
> 
>  Slight optimization -
> 
>  	if (get_nr_files() >= files_stat.max_files) {
>  		if (capable(CAP_SYS_ADMIN)) {
>  		/*
>  		 * percpu_counters are inaccurate.  Do an expensive check before
>  		 * we go and fail.
>  		 */
>  			if (percpu_counter_sum(&nr_files) >= 
>  						files_stat.max_files)
>  				goto over;
>  		} else 
>  			goto over;
>  	}

That changes the behaviour for root.  Maybe you meant !capable(), but that
still changes the behaviour.  I'm all confused.



^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PATCH 2/2] fix file counting
  2006-02-18  9:45         ` Andrew Morton
@ 2006-02-18 10:06           ` Dipankar Sarma
  2006-02-18 10:10             ` Andrew Morton
  0 siblings, 1 reply; 23+ messages in thread
From: Dipankar Sarma @ 2006-02-18 10:06 UTC (permalink / raw)
  To: Andrew Morton; +Cc: linux-kernel, paulmck, dada1

On Sat, Feb 18, 2006 at 01:45:29AM -0800, Andrew Morton wrote:
> Dipankar Sarma <dipankar@in.ibm.com> wrote:
> >
> > 
> >  Slight optimization -
> > 
> >  	if (get_nr_files() >= files_stat.max_files) {
> >  		if (capable(CAP_SYS_ADMIN)) {
> >  		/*
> >  		 * percpu_counters are inaccurate.  Do an expensive check before
> >  		 * we go and fail.
> >  		 */
> >  			if (percpu_counter_sum(&nr_files) >= 
> >  						files_stat.max_files)
> >  				goto over;
> >  		} else 
> >  			goto over;
> >  	}
> 
> That changes the behaviour for root.  Maybe you meant !capable(), but that
> still changes the behaviour.  I'm all confused.

Hmm.. on second thoughts, there is no harm doing the expensive check
for both privileged and non-privileged users. It will correctly
allow non-privileged users to create the new file provided
the fast-path percpu counter value returned was greater than the 
slow path per-cpu counter value.

Just ignore my comment in the previous mail.

Thanks
Dipankar

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PATCH 2/2] fix file counting
  2006-02-18 10:06           ` Dipankar Sarma
@ 2006-02-18 10:10             ` Andrew Morton
  2006-02-18 10:44               ` Dipankar Sarma
  0 siblings, 1 reply; 23+ messages in thread
From: Andrew Morton @ 2006-02-18 10:10 UTC (permalink / raw)
  To: dipankar; +Cc: linux-kernel, paulmck, dada1

Dipankar Sarma <dipankar@in.ibm.com> wrote:
>
>  On Sat, Feb 18, 2006 at 01:45:29AM -0800, Andrew Morton wrote:
>  > Dipankar Sarma <dipankar@in.ibm.com> wrote:
>  > >
>  > > 
>  > >  Slight optimization -
>  > > 
>  > >  	if (get_nr_files() >= files_stat.max_files) {
>  > >  		if (capable(CAP_SYS_ADMIN)) {
>  > >  		/*
>  > >  		 * percpu_counters are inaccurate.  Do an expensive check before
>  > >  		 * we go and fail.
>  > >  		 */
>  > >  			if (percpu_counter_sum(&nr_files) >= 
>  > >  						files_stat.max_files)
>  > >  				goto over;
>  > >  		} else 
>  > >  			goto over;
>  > >  	}
>  > 
>  > That changes the behaviour for root.  Maybe you meant !capable(), but that
>  > still changes the behaviour.  I'm all confused.
> 
>  Hmm.. on second thoughts, there is no harm doing the expensive check
>  for both priviledged and non-priviledged user. It will correctly
>  allow non-priviledged users to create the new file provided
>  the fast-path percpu counter value returned was greater than the 
>  slow path per-cpu counter value.

Look closer ;)

	if (get_nr_files() >= files_stat.max_files && !capable(CAP_SYS_ADMIN)) {
		/*
		 * percpu_counters are inaccurate.  Do an expensive check before
		 * we go and fail.
		 */
		if (percpu_counter_sum(&nr_files) >= files_stat.max_files)
			goto over;
	}


^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PATCH 2/2] fix file counting
  2006-02-18 10:10             ` Andrew Morton
@ 2006-02-18 10:44               ` Dipankar Sarma
  0 siblings, 0 replies; 23+ messages in thread
From: Dipankar Sarma @ 2006-02-18 10:44 UTC (permalink / raw)
  To: Andrew Morton; +Cc: linux-kernel, paulmck, dada1

On Sat, Feb 18, 2006 at 02:10:22AM -0800, Andrew Morton wrote:
> Dipankar Sarma <dipankar@in.ibm.com> wrote:
> >
> >  On Sat, Feb 18, 2006 at 01:45:29AM -0800, Andrew Morton wrote:
> 
> Look closer ;)
> 
> 	if (get_nr_files() >= files_stat.max_files && !capable(CAP_SYS_ADMIN)) {
> 		/*
> 		 * percpu_counters are inaccurate.  Do an expensive check before
> 		 * we go and fail.
> 		 */
> 		if (percpu_counter_sum(&nr_files) >= files_stat.max_files)
> 			goto over;
> 	}

Ah, that check is not relevant for privileged users anyway. That
means I should just continue doing what my mind is on at the moment - running
the weekend errands :) And not look at code.

Thanks
Dipankar

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PATCH 2/2] fix file counting
  2006-02-18  9:25       ` Dipankar Sarma
  2006-02-18  9:45         ` Andrew Morton
@ 2006-02-18 12:14         ` Christoph Hellwig
  2006-02-18 12:31           ` Arjan van de Ven
  1 sibling, 1 reply; 23+ messages in thread
From: Christoph Hellwig @ 2006-02-18 12:14 UTC (permalink / raw)
  To: Dipankar Sarma; +Cc: Andrew Morton, linux-kernel, paulmck, dada1, davem, netdev

> > - Make the get_max_files export use _GPL - only unix.ko uses it.

The real question is, does af_unix really need to allow being built
as a module?  It's quite different from other network protocols and deeply
tied to the kernel due to things like descriptor passing or using
the filesystem namespace.  I already had to export another symbol that
really should be internal just for it, and if one module acquires lots
of such hacks it's usually a bad sign..

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PATCH 2/2] fix file counting
  2006-02-18 12:14         ` Christoph Hellwig
@ 2006-02-18 12:31           ` Arjan van de Ven
  2006-02-20 22:36             ` [2.6 patch] make UNIX a bool Adrian Bunk
  0 siblings, 1 reply; 23+ messages in thread
From: Arjan van de Ven @ 2006-02-18 12:31 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: Dipankar Sarma, Andrew Morton, linux-kernel, paulmck, dada1,
	davem, netdev

On Sat, 2006-02-18 at 12:14 +0000, Christoph Hellwig wrote:
> > > - Make the get_max_files export use _GPL - only unix.ko uses it.
> 
> The real question is, does af_unix really need to allow being built
> modular?  It's quite different from other network protocols and deeply
> tied to the kernel due to things like descriptor passing or using
> the filesystem namespace.  I already had to export another symbol that
> really should be internal just for it, and if one module acquires lots
> of such hacks it's usually a bad sign..

in 2.4 the answer would have been simple; modutils back then used
AF_UNIX stuff before it could load modules, so modular was in practice
impossible. 

Anyway I'd agree with making this non-modular... NOBODY will use this as
a module, or if they do, loading it somehow is the very first thing done.
You just can't live without this, so making it a module is nonsensical.



^ permalink raw reply	[flat|nested] 23+ messages in thread

* [2.6 patch] make UNIX a bool
  2006-02-18 12:31           ` Arjan van de Ven
@ 2006-02-20 22:36             ` Adrian Bunk
  0 siblings, 0 replies; 23+ messages in thread
From: Adrian Bunk @ 2006-02-20 22:36 UTC (permalink / raw)
  To: Arjan van de Ven
  Cc: Christoph Hellwig, Dipankar Sarma, Andrew Morton, linux-kernel,
	paulmck, dada1, davem, netdev

On Sat, Feb 18, 2006 at 01:31:30PM +0100, Arjan van de Ven wrote:
> On Sat, 2006-02-18 at 12:14 +0000, Christoph Hellwig wrote:
> > > > - Make the get_max_files export use _GPL - only unix.ko uses it.
> > 
> > The real question is, does af_unix really need to allow being built
> > modular?  It's quite different from other network protocols and deeply
> > tied to the kernel due to things like descriptor passing or using
> > the filesystem namespace.  I already had to export another symbol that
> > really should be internal just for it, and if one module acquires lots
> > of such hacks it's usually a bad sign..
> 
> in 2.4 the answer would have been simple; modutils back then used
> AF_UNIX stuff before it could load modules, so modular was in practice
> impossible. 
> 
> Anyway I'd agree with making this non-modular... NOBODY will use this as
> a module, or if they do, loading it somehow is the very first thing done.
> You just can't live without this, so making it a module is nonsensical.

So let's send a patch.  ;-)

cu
Adrian


<--  snip  -->


CONFIG_UNIX=m doesn't make much sense.


Signed-off-by: Adrian Bunk <bunk@stusta.de>

--- linux-2.6.16-rc4-mm1-full/net/unix/Kconfig.old	2006-02-20 14:40:19.000000000 +0100
+++ linux-2.6.16-rc4-mm1-full/net/unix/Kconfig	2006-02-20 14:40:27.000000000 +0100
@@ -3,7 +3,7 @@
 #
 
 config UNIX
-	tristate "Unix domain sockets"
+	bool "Unix domain sockets"
 	---help---
 	  If you say Y here, you will include support for Unix domain sockets;
 	  sockets are the standard Unix mechanism for establishing and


^ permalink raw reply	[flat|nested] 23+ messages in thread

end of thread, other threads:[~2006-02-20 22:37 UTC | newest]

Thread overview: 23+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2006-02-17 15:41 [PATCH 0/2] RCU updates Dipankar Sarma
2006-02-17 15:43 ` [PATCH 1/2] rcu batch tuning Dipankar Sarma
2006-02-17 15:46   ` [PATCH 2/2] fix file counting Dipankar Sarma
2006-02-18  9:04     ` Andrew Morton
2006-02-18  9:25       ` Dipankar Sarma
2006-02-18  9:45         ` Andrew Morton
2006-02-18 10:06           ` Dipankar Sarma
2006-02-18 10:10             ` Andrew Morton
2006-02-18 10:44               ` Dipankar Sarma
2006-02-18 12:14         ` Christoph Hellwig
2006-02-18 12:31           ` Arjan van de Ven
2006-02-20 22:36             ` [2.6 patch] make UNIX a bool Adrian Bunk
2006-02-17 20:33   ` [PATCH 1/2] rcu batch tuning Dipankar Sarma
2006-02-18  8:45   ` Andrew Morton
2006-02-18  9:15     ` Dipankar Sarma
  -- strict thread matches above, loose matches on Subject: below --
2006-01-27 19:57 [patch " Oleg Nesterov
2006-01-27 23:42 ` Paul E. McKenney
2006-01-28 18:07   ` Oleg Nesterov
2006-01-30  3:30     ` Paul E. McKenney
2006-01-28 17:08 ` Dipankar Sarma
2006-01-26 18:40 [patch 0/2] RCU: fix various latency/oom issues Dipankar Sarma
2006-01-26 18:41 ` [patch 1/2] rcu batch tuning Dipankar Sarma
2006-01-26 19:33   ` Paul E. McKenney
2006-01-26 19:42     ` Dipankar Sarma

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox