From: Andrea Righi <righiandr@users.sourceforge.net>
To: Shailabh Nagar <nagar@watson.ibm.com>, Mark Seger <Mark.Seger@hp.com>
Cc: Balbir Singh <balbir@linux.vnet.ibm.com>,
Jes Sorensen <jes@sgi.com>, Chris Sturtivant <csturtiv@sgi.com>,
Tony Ernst <tee@sgi.com>, LKML <linux-kernel@vger.kernel.org>
Subject: [PATCH] distinct tgid/tid I/O statistics (was: taskstats and /proc/.../io asymmetry?)
Date: Fri, 16 May 2008 22:38:24 +0200 [thread overview]
Message-ID: <482DF0C0.6050208@users.sourceforge.net> (raw)
Balbir Singh wrote:
> Mark Seger wrote:
>> If you look at /proc/pid/stat, you can get the total CPU consumed by a
>> process. If you look at /proc/pid/task/tid/stat you can get the cpu
>> consumed by a thread and if the tid is that of the parent you only gets
>> its consumption as opposed to all its children.
>>
>> I was surprised to see that the way process I/O is reported doesn't
>> follow this model. There are no /prod/pid/task/tid/io entries but
>> rather you need to look in /proc/tid/io. While I view this as a minor
>> inconvenience, I can certainly live with it. However, /proc/pid/io does
>> not show the aggregate I/O numbers for the whole process and that both
>> surprises and disappoints. This means if I have a process that starts a
>> bunch of worker threads that do the real work and I want to find the top
>> I/O consumers I can't simply walk the /proc/pid tree but rather have to
>> look at all the threads of each process and add them up.
>>
>> Or am I missing something?
>>
>
> I looked through the code and your argument seems to be correct. The behaviour
> is inconsistent w.r.t. other statistics like utime and stime. We currently
> accumulate tgid information in signal_struct, we need to do something similar
> for io as well. If nobody gets to it by the time I finish my backlog, I'll try
> and get to it.
Balbir, Mark,
what do you think about the following approach?
Patch against 2.6.25.4, tested in KVM.
-Andrea
--
Report per-thread I/O statistics in /proc/pid/task/tid/io and aggregate
parent I/O statistics in /proc/pid/io. This approach follows the same
model used to account per-process and per-thread CPU times.
As a practial application, this allows for example to quickly find the
top I/O consumer when a process spawns many child threads that perform
the actual I/O work, because the aggregated I/O statistics can always be
found in /proc/pid/io.
Bug reported by Mark Seger <Mark.Seger@hp.com>.
Signed-off-by: Andrea Righi <righiandr@users.sourceforge.net>
---
fs/proc/base.c | 83 +++++++++++++++++++++++++++++++++++++++++++------
include/linux/sched.h | 4 ++
kernel/exit.c | 27 ++++++++++++++++
kernel/fork.c | 6 +++
4 files changed, 110 insertions(+), 10 deletions(-)
diff --git a/fs/proc/base.c b/fs/proc/base.c
index 81d7d14..23763c5 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -2255,8 +2255,58 @@ static int proc_base_fill_cache(struct file *filp, void *dirent,
}
#ifdef CONFIG_TASK_IO_ACCOUNTING
-static int proc_pid_io_accounting(struct task_struct *task, char *buffer)
+static int do_io_accounting(struct task_struct *task, char *buffer, int whole)
{
+ unsigned long flags;
+ u64 rchar, wchar, syscr, syscw;
+ struct task_io_accounting ioac;
+
+ if (!whole) {
+#ifdef CONFIG_TASK_XACCT
+ rchar = task->rchar;
+ wchar = task->wchar;
+ syscr = task->syscr;
+ syscw = task->syscw;
+#endif
+ memcpy(&ioac, &task->ioac, sizeof(ioac));
+ } else {
+ rchar = wchar = syscr = syscw = 0;
+ memset(&ioac, 0, sizeof(ioac));
+ rcu_read_lock();
+ if (lock_task_sighand(task, &flags)) {
+ struct signal_struct *sig = task->signal;
+ struct task_struct *t = task;
+
+ do {
+#ifdef CONFIG_TASK_XACCT
+ rchar += t->rchar;
+ wchar += t->wchar;
+ syscr += t->syscr;
+ syscw += t->syscw;
+#endif
+ ioac.read_bytes += t->ioac.read_bytes;
+ ioac.write_bytes += t->ioac.write_bytes;
+ ioac.cancelled_write_bytes +=
+ t->ioac.cancelled_write_bytes;
+ t = next_thread(t);
+ } while (t != task);
+
+#ifdef CONFIG_TASK_XACCT
+ rchar += sig->rchar;
+ wchar += sig->wchar;
+ syscr += sig->syscr;
+ syscw += sig->syscw;
+#endif
+ ioac.read_bytes += sig->ioac.read_bytes;
+ ioac.write_bytes += sig->ioac.write_bytes;
+ ioac.cancelled_write_bytes +=
+ sig->ioac.cancelled_write_bytes;
+
+ unlock_task_sighand(task, &flags);
+ }
+ rcu_read_unlock();
+ }
+
return sprintf(buffer,
#ifdef CONFIG_TASK_XACCT
"rchar: %llu\n"
@@ -2268,16 +2318,26 @@ static int proc_pid_io_accounting(struct task_struct *task, char *buffer)
"write_bytes: %llu\n"
"cancelled_write_bytes: %llu\n",
#ifdef CONFIG_TASK_XACCT
- (unsigned long long)task->rchar,
- (unsigned long long)task->wchar,
- (unsigned long long)task->syscr,
- (unsigned long long)task->syscw,
+ (unsigned long long)rchar,
+ (unsigned long long)wchar,
+ (unsigned long long)syscr,
+ (unsigned long long)syscw,
#endif
- (unsigned long long)task->ioac.read_bytes,
- (unsigned long long)task->ioac.write_bytes,
- (unsigned long long)task->ioac.cancelled_write_bytes);
+ (unsigned long long)ioac.read_bytes,
+ (unsigned long long)ioac.write_bytes,
+ (unsigned long long)ioac.cancelled_write_bytes);
+}
+
+static int proc_tid_io_accounting(struct task_struct *task, char *buffer)
+{
+ return do_io_accounting(task, buffer, 0);
}
-#endif
+
+static int proc_tgid_io_accounting(struct task_struct *task, char *buffer)
+{
+ return do_io_accounting(task, buffer, 1);
+}
+#endif /* CONFIG_TASK_IO_ACCOUNTING */
/*
* Thread groups
@@ -2348,7 +2408,7 @@ static const struct pid_entry tgid_base_stuff[] = {
REG("coredump_filter", S_IRUGO|S_IWUSR, coredump_filter),
#endif
#ifdef CONFIG_TASK_IO_ACCOUNTING
- INF("io", S_IRUGO, pid_io_accounting),
+ INF("io", S_IRUGO, tgid_io_accounting),
#endif
};
@@ -2675,6 +2735,9 @@ static const struct pid_entry tid_base_stuff[] = {
#ifdef CONFIG_FAULT_INJECTION
REG("make-it-fail", S_IRUGO|S_IWUSR, fault_inject),
#endif
+#ifdef CONFIG_TASK_IO_ACCOUNTING
+ INF("io", S_IRUGO, tid_io_accounting),
+#endif
};
static int proc_tid_base_readdir(struct file * filp,
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 6a1e7af..ebf8b45 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -504,6 +504,10 @@ struct signal_struct {
unsigned long nvcsw, nivcsw, cnvcsw, cnivcsw;
unsigned long min_flt, maj_flt, cmin_flt, cmaj_flt;
unsigned long inblock, oublock, cinblock, coublock;
+#ifdef CONFIG_TASK_XACCT
+ u64 rchar, wchar, syscr, syscw;
+#endif
+ struct task_io_accounting ioac;
/*
* Cumulative ns of scheduled CPU time for dead threads in the
diff --git a/kernel/exit.c b/kernel/exit.c
index 073005b..1a35583 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -114,6 +114,18 @@ static void __exit_signal(struct task_struct *tsk)
sig->nivcsw += tsk->nivcsw;
sig->inblock += task_io_get_inblock(tsk);
sig->oublock += task_io_get_oublock(tsk);
+#ifdef CONFIG_TASK_XACCT
+ sig->rchar += tsk->rchar;
+ sig->wchar += tsk->wchar;
+ sig->syscr += tsk->syscr;
+ sig->syscw += tsk->syscw;
+#endif /* CONFIG_TASK_XACCT */
+#ifdef CONFIG_TASK_IO_ACCOUNTING
+ sig->ioac.read_bytes += tsk->ioac.read_bytes;
+ sig->ioac.write_bytes += tsk->ioac.write_bytes;
+ sig->ioac.cancelled_write_bytes +=
+ tsk->ioac.cancelled_write_bytes;
+#endif /* CONFIG_TASK_IO_ACCOUNTING */
sig->sum_sched_runtime += tsk->se.sum_exec_runtime;
sig = NULL; /* Marker for below. */
}
@@ -1250,6 +1262,21 @@ static int wait_task_zombie(struct task_struct *p, int noreap,
psig->coublock +=
task_io_get_oublock(p) +
sig->oublock + sig->coublock;
+#ifdef CONFIG_TASK_XACCT
+ psig->rchar += p->rchar + sig->rchar;
+ psig->wchar += p->wchar + sig->wchar;
+ psig->syscr += p->syscr + sig->syscr;
+ psig->syscw += p->syscw + sig->syscw;
+#endif /* CONFIG_TASK_XACCT */
+#ifdef CONFIG_TASK_IO_ACCOUNTING
+ psig->ioac.read_bytes +=
+ p->ioac.read_bytes + sig->ioac.read_bytes;
+ psig->ioac.write_bytes +=
+ p->ioac.write_bytes + sig->ioac.write_bytes;
+ psig->ioac.cancelled_write_bytes +=
+ p->ioac.cancelled_write_bytes +
+ sig->ioac.cancelled_write_bytes;
+#endif /* CONFIG_TASK_IO_ACCOUNTING */
spin_unlock_irq(&p->parent->sighand->siglock);
}
diff --git a/kernel/fork.c b/kernel/fork.c
index 9c042f9..b8f408e 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -924,6 +924,12 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
sig->nvcsw = sig->nivcsw = sig->cnvcsw = sig->cnivcsw = 0;
sig->min_flt = sig->maj_flt = sig->cmin_flt = sig->cmaj_flt = 0;
sig->inblock = sig->oublock = sig->cinblock = sig->coublock = 0;
+#ifdef CONFIG_TASK_XACCT
+ sig->rchar = sig->wchar = sig->syscr = sig->syscw = 0;
+#endif
+#ifdef CONFIG_TASK_IO_ACCOUNTING
+ memset(&sig->ioac, 0, sizeof(sig->ioac));
+#endif
sig->sum_sched_runtime = 0;
INIT_LIST_HEAD(&sig->cpu_timers[0]);
INIT_LIST_HEAD(&sig->cpu_timers[1]);
--
1.5.4.3
next reply other threads:[~2008-05-16 20:38 UTC|newest]
Thread overview: 6+ messages / expand[flat|nested] mbox.gz Atom feed top
2008-05-16 20:38 Andrea Righi [this message]
2008-05-19 16:36 ` [PATCH] distinct tgid/tid I/O statistics (was: taskstats and /proc/.../io asymmetry?) Andrea Righi
2008-05-19 17:49 ` Balbir Singh
2008-05-19 18:33 ` [PATCH] distinct tgid/tid I/O statistics Andrea Righi
2008-05-19 20:08 ` Andrea Righi
2008-05-19 21:57 ` [PATCH v2] " Andrea Righi
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=482DF0C0.6050208@users.sourceforge.net \
--to=righiandr@users.sourceforge.net \
--cc=Mark.Seger@hp.com \
--cc=balbir@linux.vnet.ibm.com \
--cc=csturtiv@sgi.com \
--cc=jes@sgi.com \
--cc=linux-kernel@vger.kernel.org \
--cc=nagar@watson.ibm.com \
--cc=tee@sgi.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.