public inbox for linux-kernel@vger.kernel.org
 help / color / mirror / Atom feed
From: Andrew Morton <akpm@linux-foundation.org>
To: Balaji Rao <balajirrao@gmail.com>
Cc: dhaval@linux.vnet.ibm.com, linux-kernel@vger.kernel.org,
	containers@lists.osdl.org, menage@google.com, balbir@in.ibm.com,
	vatsa@linux.vnet.ibm.com, a.p.zijlstra@chello.nl
Subject: Re: [RFC][-mm] Simple stats for cpu resource controller v3
Date: Thu, 1 May 2008 14:00:26 -0700	[thread overview]
Message-ID: <20080501140026.6188065e.akpm@linux-foundation.org> (raw)
In-Reply-To: <200805012311.06797.balajirrao@gmail.com>

On Thu, 1 May 2008 23:11:06 +0530
Balaji Rao <balajirrao@gmail.com> wrote:

> This implements a couple of basic statistics for the CPU resource controller. 
> 
> v2->v3
> -------
> Proper locking while collecting stats. Thanks to Peter Zijlstra for suggesting
> the delta approach.
> 
> This applies against 2.6.25-mm1
> ---
> 
> Signed-off-by: Balaji Rao <balajirrao@gmail.com>
> 
> diff --git a/kernel/sched.c b/kernel/sched.c
> index bbdc32a..5bda75a 100644
> --- a/kernel/sched.c
> +++ b/kernel/sched.c
> @@ -248,10 +248,51 @@ struct cfs_rq;
>  
>  static LIST_HEAD(task_groups);
>  
> +#ifdef CONFIG_CGROUP_SCHED
> +
> +#define CPU_CGROUP_STAT_THRESHOLD 1 << 30

Needs a comment, and parentheses?

> +enum cpu_cgroup_stat_index {
> +	CPU_CGROUP_STAT_UTIME, /* Usertime of the task group */
> +	CPU_CGROUP_STAT_STIME, /* Kerneltime of the task group */
> +
> +	CPU_CGROUP_STAT_NSTATS,
> +};
> +
> +struct cpu_cgroup_stat_cpu {
> +	s64 count[CPU_CGROUP_STAT_NSTATS];
> +	u32 delta[CPU_CGROUP_STAT_NSTATS];
> +} ____cacheline_aligned_in_smp;

That's 128 bytes on ia64.

> +struct cpu_cgroup_stat {
> +	struct cpu_cgroup_stat_cpu cpustat[NR_CPUS];
> +	spinlock_t lock;
> +};

And with NR_CPUS=1024, we're starting to talk serious pigginess.

And there's one of these per task_group!  Chances are we just won't be able
to allocate that much contiguous memory, so that will solve the problem ;)

> +/* Called under irq disable. */
> +static void __cpu_cgroup_stat_add(struct cpu_cgroup_stat *stat,
> +		enum cpu_cgroup_stat_index idx, int val)
> +{
> +	int cpu = smp_processor_id();
> +	unsigned long flags;
> +
> +	BUG_ON(!irqs_disabled());

Ok...

> +	stat->cpustat[cpu].delta[idx] += val;
> +
> +	if (stat->cpustat[cpu].delta[idx] > CPU_CGROUP_STAT_THRESHOLD) {
> +		spin_lock_irqsave(&stat->lock, flags);

so we could have used plain old spin_lock() here.

> +		stat->cpustat[cpu].count[idx] += stat->cpustat[cpu].delta[idx];
> +		stat->cpustat[cpu].delta[idx] = 0;
> +		spin_unlock_irqrestore(&stat->lock, flags);
> +	}
> +}
> +#endif
> +
>  /* task group related information */
>  struct task_group {
>  #ifdef CONFIG_CGROUP_SCHED
>  	struct cgroup_subsys_state css;
> +	struct cpu_cgroup_stat stat;
>  #endif
>  
>  #ifdef CONFIG_FAIR_GROUP_SCHED
> @@ -3837,6 +3878,16 @@ void account_user_time(struct task_struct *p, cputime_t cputime)
>  		cpustat->nice = cputime64_add(cpustat->nice, tmp);
>  	else
>  		cpustat->user = cputime64_add(cpustat->user, tmp);
> +
> +	/* Charge the task's group */
> +#ifdef CONFIG_CGROUP_SCHED
> +	{
> +		struct task_group *tg;
> +		tg = task_group(p);
> +		__cpu_cgroup_stat_add(&tg->stat, CPU_CGROUP_STAT_UTIME,
> +				cputime_to_msecs(cputime));
> +	}
> +#endif
>  }
>  
>  /*
> @@ -3900,6 +3951,15 @@ void account_system_time(struct task_struct *p, int hardirq_offset,
>  		cpustat->idle = cputime64_add(cpustat->idle, tmp);
>  	/* Account for system time used */
>  	acct_update_integrals(p);
> +
> +#ifdef CONFIG_CGROUP_SCHED
> +	{
> +		struct task_group *tg;
> +		tg = task_group(p);
> +		__cpu_cgroup_stat_add(&tg->stat, CPU_CGROUP_STAT_STIME,
> +				cputime_to_msecs(cputime));
> +	}
> +#endif
>  }
>  
>  /*
> @@ -7838,7 +7898,9 @@ struct task_group *sched_create_group(void)
>  	}
>  	list_add_rcu(&tg->list, &task_groups);
>  	spin_unlock_irqrestore(&task_group_lock, flags);
> -
> +#ifdef CONFIG_CGROUP_SCHED
> +	spin_lock_init(&tg->stat.lock);
> +#endif
>  	return tg;
>  
>  err:
> @@ -8249,6 +8311,48 @@ static u64 cpu_shares_read_u64(struct cgroup *cgrp, struct cftype *cft)
>  
>  	return (u64) tg->shares;
>  }
> +
> +static s64 cpu_cgroup_read_stat(struct cpu_cgroup_stat *stat,
> +		enum cpu_cgroup_stat_index idx)
> +{
> +	int cpu;
> +	s64 ret = 0;
> +	unsigned long flags;
> +
> +	spin_lock_irqsave(&stat->lock, flags);
> +	for_each_possible_cpu(cpu) {
> +		stat->cpustat[cpu].count[idx] += stat->cpustat[cpu].delta[idx];
> +		stat->cpustat[cpu].delta[idx] = 0;
> +		ret += stat->cpustat[cpu].count[idx];
> +	}
> +	spin_unlock_irqrestore(&stat->lock, flags);
> +
> +	return ret;
> +}

That loop iterates 1024 times, under spin_lock_irqsave().  On a 2-way. 
Sad, no?

> +static const struct cpu_cgroup_stat_desc {
> +	const char *msg;
> +	u64 unit;
> +} cpu_cgroup_stat_desc[] = {
> +	[CPU_CGROUP_STAT_UTIME] = { "utime", 1, },
> +	[CPU_CGROUP_STAT_STIME] = { "stime", 1, },
> +};
> +
> +static int cpu_cgroup_stats_show(struct cgroup *cgrp, struct cftype *cft,
> +				struct cgroup_map_cb *cb)
> +{
> +	struct task_group *tg = cgroup_tg(cgrp);
> +	struct cpu_cgroup_stat *stat = &tg->stat;
> +	int i;
> +
> +	for (i = 0; i < ARRAY_SIZE(stat->cpustat[0].count); i++) {
> +		s64 val;
> +		val = cpu_cgroup_read_stat(stat, i);
> +		val *= cpu_cgroup_stat_desc[i].unit;
> +		cb->fill(cb, cpu_cgroup_stat_desc[i].msg, val);
> +	}
> +	return 0;
> +}
>  #endif
>  
>  #ifdef CONFIG_RT_GROUP_SCHED
> @@ -8295,6 +8399,10 @@ static struct cftype cpu_files[] = {
>  		.write_u64 = cpu_rt_period_write_uint,
>  	},
>  #endif
> +	{
> +		.name = "stat",
> +		.read_map = cpu_cgroup_stats_show,
> +	},
>  };
>  

Rethink, please...

There are numerous reasons here for implementing the counters as
dynamically-allocated, per-online-cpu things, with a cpu-hotplug notifier. 
All a bit of a hassle, but that's life.


Anyway, forget all that.

Did you consider using include/linux/percpu_counter.h?

If so, what was wrong with it?

Because it would be much better to fix per-cpu counters than to invent new
stuff.


  reply	other threads:[~2008-05-01 21:01 UTC|newest]

Thread overview: 24+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2008-04-05 18:09 [RFC][-mm] [1/2] Simple stats for cpu resource controller Balaji Rao
2008-04-05 18:56 ` Dhaval Giani
2008-04-05 19:09   ` Balaji Rao
2008-04-05 19:40 ` Dhaval Giani
2008-04-05 20:31   ` Balaji Rao
2008-04-05 20:59     ` Dhaval Giani
2008-04-05 21:21       ` Balaji Rao
2008-04-06  5:12         ` Balbir Singh
2008-04-07 13:24     ` Peter Zijlstra
2008-04-10 16:09       ` Balaji Rao
2008-04-10 16:25         ` Peter Zijlstra
2008-05-01 17:41           ` [RFC][-mm] Simple stats for cpu resource controller v3 Balaji Rao
2008-05-01 21:00             ` Andrew Morton [this message]
2008-05-02 19:40               ` Balaji Rao
2008-05-02 19:53                 ` Andrew Morton
2008-05-02 22:47                   ` Balaji Rao
2008-05-02 23:04                     ` Andrew Morton
     [not found] ` <200805030457.54073.balajirrao@gmail.com>
     [not found]   ` <20080502164133.9025a02c.akpm@linux-foundation.org>
2008-05-02 23:56     ` Balaji Rao
2008-05-03  0:19       ` Andrew Morton
2008-05-11 20:18         ` [RFC][-mm] Simple stats for cpu resource controller v4 Balaji Rao
2008-05-12  2:54           ` Li Zefan
2008-05-12  5:41             ` Balaji Rao
2008-05-12 23:48           ` Andrew Morton
2008-05-13  3:30             ` Balaji Rao

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20080501140026.6188065e.akpm@linux-foundation.org \
    --to=akpm@linux-foundation.org \
    --cc=a.p.zijlstra@chello.nl \
    --cc=balajirrao@gmail.com \
    --cc=balbir@in.ibm.com \
    --cc=containers@lists.osdl.org \
    --cc=dhaval@linux.vnet.ibm.com \
    --cc=linux-kernel@vger.kernel.org \
    --cc=menage@google.com \
    --cc=vatsa@linux.vnet.ibm.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox