Re: [RFC][PATCH] remove rq->lock from cpuacct cgroup (Was Re: [PATCH] cpuacct: add a branch prediction

All of lore.kernel.org
 help / color / mirror / Atom feed

From: Peter Zijlstra <a.p.zijlstra@chello.nl>
To: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: paulmck@linux.vnet.ibm.com, Bharata B Rao <bharata.rao@gmail.com>,
	Li Zefan <lizf@cn.fujitsu.com>, Ingo Molnar <mingo@elte.hu>,
	Paul Menage <menage@google.com>,
	Balbir Singh <balbir@linux.vnet.ibm.com>,
	LKML <linux-kernel@vger.kernel.org>
Subject: Re: [RFC][PATCH] remove rq->lock from cpuacct cgroup (Was Re: [PATCH] cpuacct: add a branch prediction
Date: Mon, 02 Mar 2009 15:56:10 +0100	[thread overview]
Message-ID: <1236005770.5330.583.camel@laptop> (raw)
In-Reply-To: <20090227122239.875a3f56.kamezawa.hiroyu@jp.fujitsu.com>

On Fri, 2009-02-27 at 12:22 +0900, KAMEZAWA Hiroyuki wrote:

Comments below..

> From: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
> 
> cgroup/cpuacct subsystem counts cpu usage by 64bit coutnter in
> per-cpu object. In read-side (via cpuacct.usage file), for reading 64bit
> value in safe manner, it takes rq->lock of (other) cpus.
> 
> In general, taking rq->lock of other cpus from codes not for scheduler
> is not good. This patch tries to remove rq->lock used in read-side.
> 
> To read 64bit value in safe, this patch uses seqcounter.
> 
> Pros.
>   - rq->lock is not necessary.
> Cons.
>   - When updating counter, sequence number must be updated.
>     (I hope this per-cpu sequence number is on cache...)
>   - not simple.
> 
> Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
> ---
>  kernel/sched.c |  141 ++++++++++++++++++++++++++++++++++++++++++---------------
>  1 file changed, 105 insertions(+), 36 deletions(-)
> 
> Index: mmotm-2.6.29-Feb24/kernel/sched.c
> ===================================================================
> --- mmotm-2.6.29-Feb24.orig/kernel/sched.c
> +++ mmotm-2.6.29-Feb24/kernel/sched.c
> @@ -9581,6 +9581,67 @@ struct cgroup_subsys cpu_cgroup_subsys =
>  
>  #ifdef CONFIG_CGROUP_CPUACCT
>  
> +#ifndef CONFIG_64BIT
> +DEFINE_PER_CPU(struct seqcount, cpuacct_cgroup_seq);
> +
> +static inline void cpuacct_start_counter_update(void)
> +{
> +	/* This is called under rq->lock and IRQ is off */
> +	struct seqcount *s = &get_cpu_var(cpuacct_cgroup_seq);
> +
> +	write_seqcount_begin(s);
> +	put_cpu_var(cpuacct_cgroup_seq);
> +}
> +
> +static inline void cpuacct_end_counter_update(void)
> +{
> +	struct seqcount *s = &get_cpu_var(cpuacct_cgroup_seq);
> +
> +	write_seqcount_end(s);
> +	put_cpu_var(cpuacct_cgroup_seq);
> +}

It seems odd we disable/enable preemption in both, I would expect for
start to disable preemption, and have end enable it again, or use
__get_cpu_var() and assume preemption is already disabled (callsites are
under rq->lock, right?)

> +static inline u64
> +cpuacct_read_counter(u64 *val, int cpu)
> +{
> +	struct seqcount *s = &per_cpu(cpuacct_cgroup_seq, cpu);
> +	unsigned int seq;
> +	u64 data;
> +
> +	do {
> +		seq = read_seqcount_begin(s);
> +		data = *val;
> +	} while (read_seqcount_retry(s, seq));
> +	return data;
> +}
> +/* This is a special funtion called against "offline" cpus. */
> +static inline void cpuacct_reset_offline_counter(u64 *val, int cpu)
> +{
> +	struct seqcount *s = &per_cpu(cpuacct_cgroup_seq, cpu);
> +
> +	preempt_disable();
> +	write_seqcount_begin(s);
> +	*val = 0;
> +	write_seqcount_end(s);
> +	preempt_enable();
> +}

And here you double disable preemption, quite useless if you take a
remote cpu's per-cpu data.

> +#else
> +static inline void cpuacct_start_counter_update(void)
> +{
> +}
> +static inline void cpuacct_end_counter_update(void)
> +{
> +}
> +static inline u64 cpuacct_read_counter(u64 *val, int cpu)
> +{
> +	return *val;
> +}
> +static inline void cpuacct_reset_offline_counter(u64 *val, int cpu)
> +{
> +	*val = 0;
> +}
> +#endif
> +
>  /*
>   * CPU accounting code for task groups.
>   *
> @@ -9596,6 +9657,11 @@ struct cpuacct {
>  	struct cpuacct *parent;
>  };
>  
> +struct cpuacct_work {
> +	struct work_struct work;
> +	struct cpuacct *cpuacct;
> +};
> +
>  struct cgroup_subsys cpuacct_subsys;
>  
>  /* return cpu accounting group corresponding to this container */
> @@ -9643,39 +9709,29 @@ cpuacct_destroy(struct cgroup_subsys *ss
>  	kfree(ca);
>  }
>  
> +/* In 32bit enviroment, seqcounter is used for reading 64bit in safe way */
>  static u64 cpuacct_cpuusage_read(struct cpuacct *ca, int cpu)
>  {
>  	u64 *cpuusage = percpu_ptr(ca->cpuusage, cpu);
>  	u64 data;
>  
> -#ifndef CONFIG_64BIT
> -	/*
> -	 * Take rq->lock to make 64-bit read safe on 32-bit platforms.
> -	 */
> -	spin_lock_irq(&cpu_rq(cpu)->lock);
> -	data = *cpuusage;
> -	spin_unlock_irq(&cpu_rq(cpu)->lock);
> -#else
> -	data = *cpuusage;
> -#endif
> +	data = cpuacct_read_counter(cpuusage, cpu);
>  
>  	return data;
>  }
>  
> -static void cpuacct_cpuusage_write(struct cpuacct *ca, int cpu, u64 val)
> +/* called by per-cpu workqueue */
> +static void cpuacct_cpuusage_reset_cpu(struct work_struct *work)
>  {
> +	struct cpuacct_work *cw = container_of(work, struct cpuacct_work, work);
> +	struct cpuacct *ca = cw->cpuacct;
> +	int cpu = get_cpu();
>  	u64 *cpuusage = percpu_ptr(ca->cpuusage, cpu);
>  
> -#ifndef CONFIG_64BIT
> -	/*
> -	 * Take rq->lock to make 64-bit write safe on 32-bit platforms.
> -	 */
> -	spin_lock_irq(&cpu_rq(cpu)->lock);
> -	*cpuusage = val;
> -	spin_unlock_irq(&cpu_rq(cpu)->lock);
> -#else
> -	*cpuusage = val;
> -#endif
> +	cpuacct_start_counter_update();
> +	*cpuusage = 0;
> +	cpuacct_end_counter_update();
> +	put_cpu();
>  }
>  
>  /* return total cpu usage (in nanoseconds) of a group */
> @@ -9691,23 +9747,34 @@ static u64 cpuusage_read(struct cgroup *
>  	return totalcpuusage;
>  }
>  
> -static int cpuusage_write(struct cgroup *cgrp, struct cftype *cftype,
> -								u64 reset)
> +static int cpuacct_cpuusage_reset(struct cgroup *cgrp, unsigned int event)
>  {
>  	struct cpuacct *ca = cgroup_ca(cgrp);
> -	int err = 0;
> -	int i;
> -
> -	if (reset) {
> -		err = -EINVAL;
> -		goto out;
> +	int cpu;
> +	/*
> +	 * Reset All counters....doesn't need to be fast.
> +	 * "ca" will be stable while doing this. We are in write() syscall.
> +	 */
> +	get_online_cpus();
> +	/*
> +	 * Because we use alloc_percpu() for allocating counter, we have
> +	 * a counter per a possible cpu. Reset all online's by workqueue and
> +	 * reset offline cpu's directly.
> +	 */
> +	for_each_possible_cpu(cpu) {
> +		if (cpu_online(cpu)) {
> +			struct cpuacct_work cw;
> +			INIT_WORK(&cw.work, cpuacct_cpuusage_reset_cpu);
> +			cw.cpuacct = ca;
> +			schedule_work_on(cpu, &cw.work);
> +			flush_work(&cw.work);
> +		} else {
> +			u64 *cpuusage = percpu_ptr(ca->cpuusage, cpu);
> +			cpuacct_reset_offline_counter(cpuusage, cpu);
> +		}

I'm not particularly convinced this is the right way, schedule_work_on()
sounds way expensive for setting a variable to 0.

Furthermore, if you want something like schedule_work_on() for each cpu,
there's schedule_on_each_cpu().

>  	}
> -
> -	for_each_present_cpu(i)
> -		cpuacct_cpuusage_write(ca, i, 0);
> -
> -out:
> -	return err;
> +	put_online_cpus();
> +	return 0;
>  }
>  
>  static int cpuacct_percpu_seq_read(struct cgroup *cgroup, struct cftype *cft,
> @@ -9729,7 +9796,7 @@ static struct cftype files[] = {
>  	{
>  		.name = "usage",
>  		.read_u64 = cpuusage_read,
> -		.write_u64 = cpuusage_write,
> +		.trigger = cpuacct_cpuusage_reset,
>  	},
>  	{
>  		.name = "usage_percpu",
> @@ -9756,10 +9823,12 @@ static void cpuacct_charge(struct task_s
>  	cpu = task_cpu(tsk);
>  	ca = task_ca(tsk);
>  
> +	cpuacct_start_counter_update();
>  	for (; ca; ca = ca->parent) {
>  		u64 *cpuusage = percpu_ptr(ca->cpuusage, cpu);
>  		*cpuusage += cputime;
>  	}
> +	cpuacct_end_counter_update();
>  }
>  
>  struct cgroup_subsys cpuacct_subsys = {
>

next prev parent reply	other threads:[~2009-03-02 14:56 UTC|newest]

Thread overview: 42+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2009-02-26  7:40 [PATCH] cpuacct: add a branch prediction Li Zefan
2009-02-26  8:07 ` KAMEZAWA Hiroyuki
2009-02-26  8:17   ` Li Zefan
2009-02-26  8:22     ` KAMEZAWA Hiroyuki
2009-02-26  8:35       ` Li Zefan
2009-02-26  8:40         ` KAMEZAWA Hiroyuki
2009-02-26 10:10           ` Bharata B Rao
2009-02-26 10:28             ` KAMEZAWA Hiroyuki
2009-02-26 10:44               ` Peter Zijlstra
2009-02-26 10:55                 ` KAMEZAWA Hiroyuki
2009-02-26 11:22                   ` Peter Zijlstra
2009-02-26 11:17                 ` KAMEZAWA Hiroyuki
2009-02-26 11:28                   ` Peter Zijlstra
2009-02-26 12:06                     ` KAMEZAWA Hiroyuki
2009-02-26 12:20                       ` Peter Zijlstra
2009-02-26 12:26                         ` Ingo Molnar
2009-02-26 12:40                           ` Arnd Bergmann
2009-02-27  4:25                           ` Paul Mackerras
2009-02-26 16:45                       ` Paul E. McKenney
2009-02-27  0:58                         ` KAMEZAWA Hiroyuki
2009-02-27  1:29                           ` Paul E. McKenney
2009-02-27  3:22                             ` [RFC][PATCH] remove rq->lock from cpuacct cgroup (Was " KAMEZAWA Hiroyuki
2009-03-02 14:56                               ` Peter Zijlstra [this message]
2009-03-02 23:42                                 ` KAMEZAWA Hiroyuki
2009-03-03  7:51                                   ` Peter Zijlstra
2009-03-03  9:04                                     ` KAMEZAWA Hiroyuki
2009-03-03  9:40                                       ` Peter Zijlstra
2009-03-03 10:42                                         ` KAMEZAWA Hiroyuki
2009-03-03 10:44                                           ` KAMEZAWA Hiroyuki
2009-03-03 11:54                                           ` Peter Zijlstra
2009-03-04  6:32                                             ` [PATCH] remove rq->lock from cpuacct cgroup v2 KAMEZAWA Hiroyuki
2009-03-04  7:54                                               ` Bharata B Rao
2009-03-04  8:20                                                 ` KAMEZAWA Hiroyuki
2009-03-04  8:46                                                   ` KAMEZAWA Hiroyuki
2009-03-04 10:35                                                     ` Bharata B Rao
2009-03-04 12:11                                                   ` Bharata B Rao
2009-03-04 14:17                                                     ` KAMEZAWA Hiroyuki
2009-02-26  8:37 ` [PATCH] cpuacct: add a branch prediction Balbir Singh
2009-02-26  8:41   ` Li Zefan
2009-02-26 10:40     ` Balbir Singh
2009-02-26 10:43       ` Peter Zijlstra
2009-02-26  8:43   ` KAMEZAWA Hiroyuki

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=1236005770.5330.583.camel@laptop \
    --to=a.p.zijlstra@chello.nl \
    --cc=balbir@linux.vnet.ibm.com \
    --cc=bharata.rao@gmail.com \
    --cc=kamezawa.hiroyu@jp.fujitsu.com \
    --cc=linux-kernel@vger.kernel.org \
    --cc=lizf@cn.fujitsu.com \
    --cc=menage@google.com \
    --cc=mingo@elte.hu \
    --cc=paulmck@linux.vnet.ibm.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

Be sure your reply has a Subject: header at the top and a blank line before the message body.

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.