From: Andrew Morton <akpm@linux-foundation.org>
To: Balaji Rao <balajirrao@gmail.com>
Cc: dhaval@linux.vnet.ibm.com, linux-kernel@vger.kernel.org,
containers@lists.osdl.org, menage@google.com, balbir@in.ibm.com,
vatsa@linux.vnet.ibm.com, a.p.zijlstra@chello.nl
Subject: Re: [RFC][-mm] Simple stats for cpu resource controller v3
Date: Thu, 1 May 2008 14:00:26 -0700 [thread overview]
Message-ID: <20080501140026.6188065e.akpm@linux-foundation.org> (raw)
In-Reply-To: <200805012311.06797.balajirrao@gmail.com>
On Thu, 1 May 2008 23:11:06 +0530
Balaji Rao <balajirrao@gmail.com> wrote:
> This implements a couple of basic statistics for the CPU resource controller.
>
> v2->v3
> -------
> Proper locking while collecting stats. Thanks to Peter Zijlstra for suggesting
> the delta approach.
>
> This applies against 2.6.25-mm1
> ---
>
> Signed-off-by: Balaji Rao <balajirrao@gmail.com>
>
> diff --git a/kernel/sched.c b/kernel/sched.c
> index bbdc32a..5bda75a 100644
> --- a/kernel/sched.c
> +++ b/kernel/sched.c
> @@ -248,10 +248,51 @@ struct cfs_rq;
>
> static LIST_HEAD(task_groups);
>
> +#ifdef CONFIG_CGROUP_SCHED
> +
> +#define CPU_CGROUP_STAT_THRESHOLD 1 << 30
Needs a comment, and parentheses?
> +enum cpu_cgroup_stat_index {
> + CPU_CGROUP_STAT_UTIME, /* Usertime of the task group */
> + CPU_CGROUP_STAT_STIME, /* Kerneltime of the task group */
> +
> + CPU_CGROUP_STAT_NSTATS,
> +};
> +
> +struct cpu_cgroup_stat_cpu {
> + s64 count[CPU_CGROUP_STAT_NSTATS];
> + u32 delta[CPU_CGROUP_STAT_NSTATS];
> +} ____cacheline_aligned_in_smp;
That's 128 bytes on ia64.
> +struct cpu_cgroup_stat {
> + struct cpu_cgroup_stat_cpu cpustat[NR_CPUS];
> + spinlock_t lock;
> +};
And with NR_CPUS=1024, we're starting to talk serious pigginess.
And there's one of these per task_group! Chances are we just won't be able
to allocate that much contiguous memory, so that will solve the problem ;)
> +/* Called under irq disable. */
> +static void __cpu_cgroup_stat_add(struct cpu_cgroup_stat *stat,
> + enum cpu_cgroup_stat_index idx, int val)
> +{
> + int cpu = smp_processor_id();
> + unsigned long flags;
> +
> + BUG_ON(!irqs_disabled());
Ok...
> + stat->cpustat[cpu].delta[idx] += val;
> +
> + if (stat->cpustat[cpu].delta[idx] > CPU_CGROUP_STAT_THRESHOLD) {
> + spin_lock_irqsave(&stat->lock, flags);
so we could have used plain old spin_lock() here.
> + stat->cpustat[cpu].count[idx] += stat->cpustat[cpu].delta[idx];
> + stat->cpustat[cpu].delta[idx] = 0;
> + spin_unlock_irqrestore(&stat->lock, flags);
> + }
> +}
> +#endif
> +
> /* task group related information */
> struct task_group {
> #ifdef CONFIG_CGROUP_SCHED
> struct cgroup_subsys_state css;
> + struct cpu_cgroup_stat stat;
> #endif
>
> #ifdef CONFIG_FAIR_GROUP_SCHED
> @@ -3837,6 +3878,16 @@ void account_user_time(struct task_struct *p, cputime_t cputime)
> cpustat->nice = cputime64_add(cpustat->nice, tmp);
> else
> cpustat->user = cputime64_add(cpustat->user, tmp);
> +
> + /* Charge the task's group */
> +#ifdef CONFIG_CGROUP_SCHED
> + {
> + struct task_group *tg;
> + tg = task_group(p);
> + __cpu_cgroup_stat_add(&tg->stat, CPU_CGROUP_STAT_UTIME,
> + cputime_to_msecs(cputime));
> + }
> +#endif
> }
>
> /*
> @@ -3900,6 +3951,15 @@ void account_system_time(struct task_struct *p, int hardirq_offset,
> cpustat->idle = cputime64_add(cpustat->idle, tmp);
> /* Account for system time used */
> acct_update_integrals(p);
> +
> +#ifdef CONFIG_CGROUP_SCHED
> + {
> + struct task_group *tg;
> + tg = task_group(p);
> + __cpu_cgroup_stat_add(&tg->stat, CPU_CGROUP_STAT_STIME,
> + cputime_to_msecs(cputime));
> + }
> +#endif
> }
>
> /*
> @@ -7838,7 +7898,9 @@ struct task_group *sched_create_group(void)
> }
> list_add_rcu(&tg->list, &task_groups);
> spin_unlock_irqrestore(&task_group_lock, flags);
> -
> +#ifdef CONFIG_CGROUP_SCHED
> + spin_lock_init(&tg->stat.lock);
> +#endif
> return tg;
>
> err:
> @@ -8249,6 +8311,48 @@ static u64 cpu_shares_read_u64(struct cgroup *cgrp, struct cftype *cft)
>
> return (u64) tg->shares;
> }
> +
> +static s64 cpu_cgroup_read_stat(struct cpu_cgroup_stat *stat,
> + enum cpu_cgroup_stat_index idx)
> +{
> + int cpu;
> + s64 ret = 0;
> + unsigned long flags;
> +
> + spin_lock_irqsave(&stat->lock, flags);
> + for_each_possible_cpu(cpu) {
> + stat->cpustat[cpu].count[idx] += stat->cpustat[cpu].delta[idx];
> + stat->cpustat[cpu].delta[idx] = 0;
> + ret += stat->cpustat[cpu].count[idx];
> + }
> + spin_unlock_irqrestore(&stat->lock, flags);
> +
> + return ret;
> +}
That loop iterates 1024 times, under spin_lock_irqsave(). On a 2-way.
Sad, no?
> +static const struct cpu_cgroup_stat_desc {
> + const char *msg;
> + u64 unit;
> +} cpu_cgroup_stat_desc[] = {
> + [CPU_CGROUP_STAT_UTIME] = { "utime", 1, },
> + [CPU_CGROUP_STAT_STIME] = { "stime", 1, },
> +};
> +
> +static int cpu_cgroup_stats_show(struct cgroup *cgrp, struct cftype *cft,
> + struct cgroup_map_cb *cb)
> +{
> + struct task_group *tg = cgroup_tg(cgrp);
> + struct cpu_cgroup_stat *stat = &tg->stat;
> + int i;
> +
> + for (i = 0; i < ARRAY_SIZE(stat->cpustat[0].count); i++) {
> + s64 val;
> + val = cpu_cgroup_read_stat(stat, i);
> + val *= cpu_cgroup_stat_desc[i].unit;
> + cb->fill(cb, cpu_cgroup_stat_desc[i].msg, val);
> + }
> + return 0;
> +}
> #endif
>
> #ifdef CONFIG_RT_GROUP_SCHED
> @@ -8295,6 +8399,10 @@ static struct cftype cpu_files[] = {
> .write_u64 = cpu_rt_period_write_uint,
> },
> #endif
> + {
> + .name = "stat",
> + .read_map = cpu_cgroup_stats_show,
> + },
> };
>
Rethink, please...
There are numerous reasons here for implementing the counters as
dynamically-allocated, per-online-cpu things, with a cpu-hotplug notifier.
All a bit of a hassle, but that's life.
Anyway, forget all that.
Did you consider using include/linux/percpu_counter.h?
If so, what was wrong with it?
Because it would be much better to fix per-cpu counters than to invent new
stuff.
next prev parent reply other threads:[~2008-05-01 21:00 UTC|newest]
Thread overview: 26+ messages / expand[flat|nested] mbox.gz Atom feed top
2008-04-05 18:09 [RFC][-mm] [1/2] Simple stats for cpu resource controller Balaji Rao
2008-04-05 18:56 ` Dhaval Giani
2008-04-05 19:09 ` Balaji Rao
2008-04-05 19:40 ` Dhaval Giani
2008-04-05 19:40 ` Dhaval Giani
2008-04-05 20:31 ` Balaji Rao
2008-04-05 20:59 ` Dhaval Giani
2008-04-05 21:21 ` Balaji Rao
2008-04-06 5:12 ` Balbir Singh
2008-04-07 13:24 ` Peter Zijlstra
2008-04-10 16:09 ` Balaji Rao
2008-04-10 16:25 ` Peter Zijlstra
2008-05-01 17:41 ` [RFC][-mm] Simple stats for cpu resource controller v3 Balaji Rao
2008-05-01 21:00 ` Andrew Morton [this message]
2008-05-02 19:40 ` Balaji Rao
[not found] ` <200805030110.29511.balajirrao-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org>
2008-05-02 19:53 ` Andrew Morton
2008-05-02 19:53 ` Andrew Morton
2008-05-02 22:47 ` Balaji Rao
2008-05-02 23:04 ` Andrew Morton
[not found] ` <200805030457.54073.balajirrao@gmail.com>
[not found] ` <20080502164133.9025a02c.akpm@linux-foundation.org>
2008-05-02 23:56 ` Balaji Rao
2008-05-03 0:19 ` Andrew Morton
2008-05-11 20:18 ` [RFC][-mm] Simple stats for cpu resource controller v4 Balaji Rao
2008-05-12 2:54 ` Li Zefan
2008-05-12 5:41 ` Balaji Rao
2008-05-12 23:48 ` Andrew Morton
2008-05-13 3:30 ` Balaji Rao
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20080501140026.6188065e.akpm@linux-foundation.org \
--to=akpm@linux-foundation.org \
--cc=a.p.zijlstra@chello.nl \
--cc=balajirrao@gmail.com \
--cc=balbir@in.ibm.com \
--cc=containers@lists.osdl.org \
--cc=dhaval@linux.vnet.ibm.com \
--cc=linux-kernel@vger.kernel.org \
--cc=menage@google.com \
--cc=vatsa@linux.vnet.ibm.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.