public inbox for linux-kernel@vger.kernel.org
 help / color / mirror / Atom feed
* [PATCH] distinct tgid/tid I/O statistics (was: taskstats and /proc/.../io asymmetry?)
@ 2008-05-16 20:38 Andrea Righi
  2008-05-19 16:36 ` Andrea Righi
  2008-05-19 17:49 ` Balbir Singh
  0 siblings, 2 replies; 6+ messages in thread
From: Andrea Righi @ 2008-05-16 20:38 UTC (permalink / raw)
  To: Shailabh Nagar, Mark Seger
  Cc: Balbir Singh, Jes Sorensen, Chris Sturtivant, Tony Ernst, LKML

Balbir Singh wrote:
> Mark Seger wrote:
>> If you look at /proc/pid/stat, you can get the total CPU consumed by a
>> process.  If you look at /proc/pid/task/tid/stat you can get the cpu
>> consumed by a thread and if the tid is that of the parent you only gets
>> its consumption as opposed to all its children.
>>
>> I was surprised to see that the way process I/O is reported doesn't
>> follow this model.  There are no /prod/pid/task/tid/io entries but
>> rather you need to look in /proc/tid/io.  While I view this as a minor
>> inconvenience, I can certainly live with it.  However, /proc/pid/io does
>> not show the aggregate I/O numbers for the whole process and that both
>> surprises and disappoints.  This means if I have a process that starts a
>> bunch of worker threads that do the real work and I want to find the top
>> I/O consumers I can't simply walk the /proc/pid tree but rather have to
>> look at all the threads of each process and add them up.
>>
>> Or am I missing something?
>>
> 
> I looked through the code and your argument seems to be correct. The behaviour
> is inconsistent w.r.t. other statistics like utime and stime. We currently
> accumulate tgid information in signal_struct, we need to do something similar
> for io as well. If nobody gets to it by the time I finish my backlog, I'll try
> and get to it.

Balbir, Mark,

what do you think about the following approach?
Patch against 2.6.25.4, tested in KVM.

-Andrea

--
Report per-thread I/O statistics in /proc/pid/task/tid/io and aggregate
parent I/O statistics in /proc/pid/io. This approach follows the same
model used to account per-process and per-thread CPU times.

As a practial application, this allows for example to quickly find the
top I/O consumer when a process spawns many child threads that perform
the actual I/O work, because the aggregated I/O statistics can always be
found in /proc/pid/io.

Bug reported by Mark Seger <Mark.Seger@hp.com>.

Signed-off-by: Andrea Righi <righiandr@users.sourceforge.net>
---
 fs/proc/base.c        |   83 +++++++++++++++++++++++++++++++++++++++++++------
 include/linux/sched.h |    4 ++
 kernel/exit.c         |   27 ++++++++++++++++
 kernel/fork.c         |    6 +++
 4 files changed, 110 insertions(+), 10 deletions(-)

diff --git a/fs/proc/base.c b/fs/proc/base.c
index 81d7d14..23763c5 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -2255,8 +2255,58 @@ static int proc_base_fill_cache(struct file *filp, void *dirent,
 }
 
 #ifdef CONFIG_TASK_IO_ACCOUNTING
-static int proc_pid_io_accounting(struct task_struct *task, char *buffer)
+static int do_io_accounting(struct task_struct *task, char *buffer, int whole)
 {
+	unsigned long flags;
+	u64 rchar, wchar, syscr, syscw;
+	struct task_io_accounting ioac;
+
+	if (!whole) {
+#ifdef CONFIG_TASK_XACCT
+		rchar = task->rchar;
+		wchar = task->wchar;
+		syscr = task->syscr;
+		syscw = task->syscw;
+#endif
+		memcpy(&ioac, &task->ioac, sizeof(ioac));
+	} else {
+		rchar = wchar = syscr = syscw = 0;
+		memset(&ioac, 0, sizeof(ioac));
+		rcu_read_lock();
+		if (lock_task_sighand(task, &flags)) {
+		struct signal_struct *sig = task->signal;
+		struct task_struct *t = task;
+
+			do {
+#ifdef CONFIG_TASK_XACCT
+				rchar += t->rchar;
+				wchar += t->wchar;
+				syscr += t->syscr;
+				syscw += t->syscw;
+#endif
+				ioac.read_bytes += t->ioac.read_bytes;
+				ioac.write_bytes += t->ioac.write_bytes;
+				ioac.cancelled_write_bytes +=
+						t->ioac.cancelled_write_bytes;
+				t = next_thread(t);
+			} while (t != task);
+
+#ifdef CONFIG_TASK_XACCT
+			rchar += sig->rchar;
+			wchar += sig->wchar;
+			syscr += sig->syscr;
+			syscw += sig->syscw;
+#endif
+			ioac.read_bytes += sig->ioac.read_bytes;
+			ioac.write_bytes += sig->ioac.write_bytes;
+			ioac.cancelled_write_bytes +=
+					sig->ioac.cancelled_write_bytes;
+
+			unlock_task_sighand(task, &flags);
+		}
+		rcu_read_unlock();
+	}
+
 	return sprintf(buffer,
 #ifdef CONFIG_TASK_XACCT
 			"rchar: %llu\n"
@@ -2268,16 +2318,26 @@ static int proc_pid_io_accounting(struct task_struct *task, char *buffer)
 			"write_bytes: %llu\n"
 			"cancelled_write_bytes: %llu\n",
 #ifdef CONFIG_TASK_XACCT
-			(unsigned long long)task->rchar,
-			(unsigned long long)task->wchar,
-			(unsigned long long)task->syscr,
-			(unsigned long long)task->syscw,
+			(unsigned long long)rchar,
+			(unsigned long long)wchar,
+			(unsigned long long)syscr,
+			(unsigned long long)syscw,
 #endif
-			(unsigned long long)task->ioac.read_bytes,
-			(unsigned long long)task->ioac.write_bytes,
-			(unsigned long long)task->ioac.cancelled_write_bytes);
+			(unsigned long long)ioac.read_bytes,
+			(unsigned long long)ioac.write_bytes,
+			(unsigned long long)ioac.cancelled_write_bytes);
+}
+
+static int proc_tid_io_accounting(struct task_struct *task, char *buffer)
+{
+	return do_io_accounting(task, buffer, 0);
 }
-#endif
+
+static int proc_tgid_io_accounting(struct task_struct *task, char *buffer)
+{
+	return do_io_accounting(task, buffer, 1);
+}
+#endif /* CONFIG_TASK_IO_ACCOUNTING */
 
 /*
  * Thread groups
@@ -2348,7 +2408,7 @@ static const struct pid_entry tgid_base_stuff[] = {
 	REG("coredump_filter", S_IRUGO|S_IWUSR, coredump_filter),
 #endif
 #ifdef CONFIG_TASK_IO_ACCOUNTING
-	INF("io",	S_IRUGO, pid_io_accounting),
+	INF("io",	S_IRUGO, tgid_io_accounting),
 #endif
 };
 
@@ -2675,6 +2735,9 @@ static const struct pid_entry tid_base_stuff[] = {
 #ifdef CONFIG_FAULT_INJECTION
 	REG("make-it-fail", S_IRUGO|S_IWUSR, fault_inject),
 #endif
+#ifdef CONFIG_TASK_IO_ACCOUNTING
+	INF("io",	S_IRUGO, tid_io_accounting),
+#endif
 };
 
 static int proc_tid_base_readdir(struct file * filp,
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 6a1e7af..ebf8b45 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -504,6 +504,10 @@ struct signal_struct {
 	unsigned long nvcsw, nivcsw, cnvcsw, cnivcsw;
 	unsigned long min_flt, maj_flt, cmin_flt, cmaj_flt;
 	unsigned long inblock, oublock, cinblock, coublock;
+#ifdef CONFIG_TASK_XACCT
+	u64 rchar, wchar, syscr, syscw;
+#endif
+	struct task_io_accounting ioac;
 
 	/*
 	 * Cumulative ns of scheduled CPU time for dead threads in the
diff --git a/kernel/exit.c b/kernel/exit.c
index 073005b..1a35583 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -114,6 +114,18 @@ static void __exit_signal(struct task_struct *tsk)
 		sig->nivcsw += tsk->nivcsw;
 		sig->inblock += task_io_get_inblock(tsk);
 		sig->oublock += task_io_get_oublock(tsk);
+#ifdef CONFIG_TASK_XACCT
+		sig->rchar += tsk->rchar;
+		sig->wchar += tsk->wchar;
+		sig->syscr += tsk->syscr;
+		sig->syscw += tsk->syscw;
+#endif /* CONFIG_TASK_XACCT */
+#ifdef CONFIG_TASK_IO_ACCOUNTING
+		sig->ioac.read_bytes += tsk->ioac.read_bytes;
+		sig->ioac.write_bytes += tsk->ioac.write_bytes;
+		sig->ioac.cancelled_write_bytes +=
+					tsk->ioac.cancelled_write_bytes;
+#endif /* CONFIG_TASK_IO_ACCOUNTING */
 		sig->sum_sched_runtime += tsk->se.sum_exec_runtime;
 		sig = NULL; /* Marker for below. */
 	}
@@ -1250,6 +1262,21 @@ static int wait_task_zombie(struct task_struct *p, int noreap,
 		psig->coublock +=
 			task_io_get_oublock(p) +
 			sig->oublock + sig->coublock;
+#ifdef CONFIG_TASK_XACCT
+		psig->rchar += p->rchar + sig->rchar;
+		psig->wchar += p->wchar + sig->wchar;
+		psig->syscr += p->syscr + sig->syscr;
+		psig->syscw += p->syscw + sig->syscw;
+#endif /* CONFIG_TASK_XACCT */
+#ifdef CONFIG_TASK_IO_ACCOUNTING
+		psig->ioac.read_bytes +=
+			p->ioac.read_bytes + sig->ioac.read_bytes;
+		psig->ioac.write_bytes +=
+			p->ioac.write_bytes + sig->ioac.write_bytes;
+		psig->ioac.cancelled_write_bytes +=
+				p->ioac.cancelled_write_bytes +
+				sig->ioac.cancelled_write_bytes;
+#endif /* CONFIG_TASK_IO_ACCOUNTING */
 		spin_unlock_irq(&p->parent->sighand->siglock);
 	}
 
diff --git a/kernel/fork.c b/kernel/fork.c
index 9c042f9..b8f408e 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -924,6 +924,12 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
 	sig->nvcsw = sig->nivcsw = sig->cnvcsw = sig->cnivcsw = 0;
 	sig->min_flt = sig->maj_flt = sig->cmin_flt = sig->cmaj_flt = 0;
 	sig->inblock = sig->oublock = sig->cinblock = sig->coublock = 0;
+#ifdef CONFIG_TASK_XACCT
+	sig->rchar = sig->wchar = sig->syscr = sig->syscw = 0;
+#endif
+#ifdef CONFIG_TASK_IO_ACCOUNTING
+	memset(&sig->ioac, 0, sizeof(sig->ioac));
+#endif
 	sig->sum_sched_runtime = 0;
 	INIT_LIST_HEAD(&sig->cpu_timers[0]);
 	INIT_LIST_HEAD(&sig->cpu_timers[1]);
-- 
1.5.4.3

^ permalink raw reply related	[flat|nested] 6+ messages in thread

* Re: [PATCH] distinct tgid/tid I/O statistics (was: taskstats and /proc/.../io asymmetry?)
  2008-05-16 20:38 [PATCH] distinct tgid/tid I/O statistics (was: taskstats and /proc/.../io asymmetry?) Andrea Righi
@ 2008-05-19 16:36 ` Andrea Righi
  2008-05-19 17:49 ` Balbir Singh
  1 sibling, 0 replies; 6+ messages in thread
From: Andrea Righi @ 2008-05-19 16:36 UTC (permalink / raw)
  To: Shailabh Nagar
  Cc: Mark Seger, Balbir Singh, Jes Sorensen, Chris Sturtivant,
	Tony Ernst, LKML, linux-next

Andrea Righi wrote:
> Patch against 2.6.25.4, tested in KVM.

Tested in latest linux-next (next-20080519) as well.

-Andrea

^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [PATCH] distinct tgid/tid I/O statistics (was: taskstats and /proc/.../io asymmetry?)
  2008-05-16 20:38 [PATCH] distinct tgid/tid I/O statistics (was: taskstats and /proc/.../io asymmetry?) Andrea Righi
  2008-05-19 16:36 ` Andrea Righi
@ 2008-05-19 17:49 ` Balbir Singh
  2008-05-19 18:33   ` [PATCH] distinct tgid/tid I/O statistics Andrea Righi
  1 sibling, 1 reply; 6+ messages in thread
From: Balbir Singh @ 2008-05-19 17:49 UTC (permalink / raw)
  To: righiandr
  Cc: Shailabh Nagar, Mark Seger, Jes Sorensen, Chris Sturtivant,
	Tony Ernst, LKML

Andrea Righi wrote:
> Balbir Singh wrote:
>> Mark Seger wrote:
>>> If you look at /proc/pid/stat, you can get the total CPU consumed by a
>>> process.  If you look at /proc/pid/task/tid/stat you can get the cpu
>>> consumed by a thread and if the tid is that of the parent you only gets
>>> its consumption as opposed to all its children.
>>>
>>> I was surprised to see that the way process I/O is reported doesn't
>>> follow this model.  There are no /prod/pid/task/tid/io entries but
>>> rather you need to look in /proc/tid/io.  While I view this as a minor
>>> inconvenience, I can certainly live with it.  However, /proc/pid/io does
>>> not show the aggregate I/O numbers for the whole process and that both
>>> surprises and disappoints.  This means if I have a process that starts a
>>> bunch of worker threads that do the real work and I want to find the top
>>> I/O consumers I can't simply walk the /proc/pid tree but rather have to
>>> look at all the threads of each process and add them up.
>>>
>>> Or am I missing something?
>>>
>>
>> I looked through the code and your argument seems to be correct. The
>> behaviour
>> is inconsistent w.r.t. other statistics like utime and stime. We
>> currently
>> accumulate tgid information in signal_struct, we need to do something
>> similar
>> for io as well. If nobody gets to it by the time I finish my backlog,
>> I'll try
>> and get to it.
> 
> Balbir, Mark,
> 
> what do you think about the following approach?
> Patch against 2.6.25.4, tested in KVM.
> 
> -Andrea
> 
> -- 
> Report per-thread I/O statistics in /proc/pid/task/tid/io and aggregate
> parent I/O statistics in /proc/pid/io. This approach follows the same
> model used to account per-process and per-thread CPU times.
> 
> As a practial application, this allows for example to quickly find the
> top I/O consumer when a process spawns many child threads that perform
> the actual I/O work, because the aggregated I/O statistics can always be
> found in /proc/pid/io.
> 
> Bug reported by Mark Seger <Mark.Seger@hp.com>.
> 

Comments below. Overall looks pretty well covered. Have you tested the patches?
Mark could you please test these and see?

Acked-by: Balbir Singh <balbir@linux.vnet.ibm.com>


> Signed-off-by: Andrea Righi <righiandr@users.sourceforge.net>
> ---
> fs/proc/base.c        |   83
> +++++++++++++++++++++++++++++++++++++++++++------
> include/linux/sched.h |    4 ++
> kernel/exit.c         |   27 ++++++++++++++++
> kernel/fork.c         |    6 +++
> 4 files changed, 110 insertions(+), 10 deletions(-)
> 
> diff --git a/fs/proc/base.c b/fs/proc/base.c
> index 81d7d14..23763c5 100644
> --- a/fs/proc/base.c
> +++ b/fs/proc/base.c
> @@ -2255,8 +2255,58 @@ static int proc_base_fill_cache(struct file
> *filp, void *dirent,
> }
> 
> #ifdef CONFIG_TASK_IO_ACCOUNTING
> -static int proc_pid_io_accounting(struct task_struct *task, char *buffer)
> +static int do_io_accounting(struct task_struct *task, char *buffer, int
> whole)
> {
> +    unsigned long flags;
> +    u64 rchar, wchar, syscr, syscw;
> +    struct task_io_accounting ioac;
> +
> +    if (!whole) {
> +#ifdef CONFIG_TASK_XACCT

Can't we abstract this into a function for ease?

> +        rchar = task->rchar;
> +        wchar = task->wchar;
> +        syscr = task->syscr;
> +        syscw = task->syscw;
> +#endif
> +        memcpy(&ioac, &task->ioac, sizeof(ioac));
> +    } else {
> +        rchar = wchar = syscr = syscw = 0;
> +        memset(&ioac, 0, sizeof(ioac));
> +        rcu_read_lock();
> +        if (lock_task_sighand(task, &flags)) {

If lock(), what happens otherwise?

> +        struct signal_struct *sig = task->signal;
> +        struct task_struct *t = task;
> +
> +            do {
> +#ifdef CONFIG_TASK_XACCT
> +                rchar += t->rchar;
> +                wchar += t->wchar;
> +                syscr += t->syscr;
> +                syscw += t->syscw;
> +#endif
> +                ioac.read_bytes += t->ioac.read_bytes;
> +                ioac.write_bytes += t->ioac.write_bytes;
> +                ioac.cancelled_write_bytes +=
> +                        t->ioac.cancelled_write_bytes;
> +                t = next_thread(t);
> +            } while (t != task);
> +
> +#ifdef CONFIG_TASK_XACCT
> +            rchar += sig->rchar;
> +            wchar += sig->wchar;
> +            syscr += sig->syscr;
> +            syscw += sig->syscw;
> +#endif
> +            ioac.read_bytes += sig->ioac.read_bytes;
> +            ioac.write_bytes += sig->ioac.write_bytes;
> +            ioac.cancelled_write_bytes +=
> +                    sig->ioac.cancelled_write_bytes;
> +
> +            unlock_task_sighand(task, &flags);
> +        }
> +        rcu_read_unlock();
> +    }
> +
>     return sprintf(buffer,
> #ifdef CONFIG_TASK_XACCT
>             "rchar: %llu\n"
> @@ -2268,16 +2318,26 @@ static int proc_pid_io_accounting(struct
> task_struct *task, char *buffer)
>             "write_bytes: %llu\n"
>             "cancelled_write_bytes: %llu\n",
> #ifdef CONFIG_TASK_XACCT
> -            (unsigned long long)task->rchar,
> -            (unsigned long long)task->wchar,
> -            (unsigned long long)task->syscr,
> -            (unsigned long long)task->syscw,
> +            (unsigned long long)rchar,
> +            (unsigned long long)wchar,
> +            (unsigned long long)syscr,
> +            (unsigned long long)syscw,
> #endif
> -            (unsigned long long)task->ioac.read_bytes,
> -            (unsigned long long)task->ioac.write_bytes,
> -            (unsigned long long)task->ioac.cancelled_write_bytes);
> +            (unsigned long long)ioac.read_bytes,
> +            (unsigned long long)ioac.write_bytes,
> +            (unsigned long long)ioac.cancelled_write_bytes);
> +}
> +
> +static int proc_tid_io_accounting(struct task_struct *task, char *buffer)
> +{
> +    return do_io_accounting(task, buffer, 0);
> }
> -#endif
> +
> +static int proc_tgid_io_accounting(struct task_struct *task, char *buffer)
> +{
> +    return do_io_accounting(task, buffer, 1);
> +}
> +#endif /* CONFIG_TASK_IO_ACCOUNTING */
> 
> /*
>  * Thread groups
> @@ -2348,7 +2408,7 @@ static const struct pid_entry tgid_base_stuff[] = {
>     REG("coredump_filter", S_IRUGO|S_IWUSR, coredump_filter),
> #endif
> #ifdef CONFIG_TASK_IO_ACCOUNTING
> -    INF("io",    S_IRUGO, pid_io_accounting),
> +    INF("io",    S_IRUGO, tgid_io_accounting),
> #endif
> };
> 
> @@ -2675,6 +2735,9 @@ static const struct pid_entry tid_base_stuff[] = {
> #ifdef CONFIG_FAULT_INJECTION
>     REG("make-it-fail", S_IRUGO|S_IWUSR, fault_inject),
> #endif
> +#ifdef CONFIG_TASK_IO_ACCOUNTING
> +    INF("io",    S_IRUGO, tid_io_accounting),
> +#endif
> };
> 
> static int proc_tid_base_readdir(struct file * filp,
> diff --git a/include/linux/sched.h b/include/linux/sched.h
> index 6a1e7af..ebf8b45 100644
> --- a/include/linux/sched.h
> +++ b/include/linux/sched.h
> @@ -504,6 +504,10 @@ struct signal_struct {
>     unsigned long nvcsw, nivcsw, cnvcsw, cnivcsw;
>     unsigned long min_flt, maj_flt, cmin_flt, cmaj_flt;
>     unsigned long inblock, oublock, cinblock, coublock;
> +#ifdef CONFIG_TASK_XACCT
> +    u64 rchar, wchar, syscr, syscw;
> +#endif
> +    struct task_io_accounting ioac;
> 
>     /*
>      * Cumulative ns of scheduled CPU time for dead threads in the
> diff --git a/kernel/exit.c b/kernel/exit.c
> index 073005b..1a35583 100644
> --- a/kernel/exit.c
> +++ b/kernel/exit.c
> @@ -114,6 +114,18 @@ static void __exit_signal(struct task_struct *tsk)
>         sig->nivcsw += tsk->nivcsw;
>         sig->inblock += task_io_get_inblock(tsk);
>         sig->oublock += task_io_get_oublock(tsk);
> +#ifdef CONFIG_TASK_XACCT
> +        sig->rchar += tsk->rchar;
> +        sig->wchar += tsk->wchar;
> +        sig->syscr += tsk->syscr;
> +        sig->syscw += tsk->syscw;
> +#endif /* CONFIG_TASK_XACCT */
> +#ifdef CONFIG_TASK_IO_ACCOUNTING
> +        sig->ioac.read_bytes += tsk->ioac.read_bytes;
> +        sig->ioac.write_bytes += tsk->ioac.write_bytes;
> +        sig->ioac.cancelled_write_bytes +=
> +                    tsk->ioac.cancelled_write_bytes;
> +#endif /* CONFIG_TASK_IO_ACCOUNTING */
>         sig->sum_sched_runtime += tsk->se.sum_exec_runtime;
>         sig = NULL; /* Marker for below. */
>     }
> @@ -1250,6 +1262,21 @@ static int wait_task_zombie(struct task_struct
> *p, int noreap,
>         psig->coublock +=
>             task_io_get_oublock(p) +
>             sig->oublock + sig->coublock;
> +#ifdef CONFIG_TASK_XACCT
> +        psig->rchar += p->rchar + sig->rchar;
> +        psig->wchar += p->wchar + sig->wchar;
> +        psig->syscr += p->syscr + sig->syscr;
> +        psig->syscw += p->syscw + sig->syscw;
> +#endif /* CONFIG_TASK_XACCT */
> +#ifdef CONFIG_TASK_IO_ACCOUNTING
> +        psig->ioac.read_bytes +=
> +            p->ioac.read_bytes + sig->ioac.read_bytes;
> +        psig->ioac.write_bytes +=
> +            p->ioac.write_bytes + sig->ioac.write_bytes;
> +        psig->ioac.cancelled_write_bytes +=
> +                p->ioac.cancelled_write_bytes +
> +                sig->ioac.cancelled_write_bytes;
> +#endif /* CONFIG_TASK_IO_ACCOUNTING */
>         spin_unlock_irq(&p->parent->sighand->siglock);
>     }
> 
> diff --git a/kernel/fork.c b/kernel/fork.c
> index 9c042f9..b8f408e 100644
> --- a/kernel/fork.c
> +++ b/kernel/fork.c
> @@ -924,6 +924,12 @@ static int copy_signal(unsigned long clone_flags,
> struct task_struct *tsk)
>     sig->nvcsw = sig->nivcsw = sig->cnvcsw = sig->cnivcsw = 0;
>     sig->min_flt = sig->maj_flt = sig->cmin_flt = sig->cmaj_flt = 0;
>     sig->inblock = sig->oublock = sig->cinblock = sig->coublock = 0;
> +#ifdef CONFIG_TASK_XACCT
> +    sig->rchar = sig->wchar = sig->syscr = sig->syscw = 0;
> +#endif
> +#ifdef CONFIG_TASK_IO_ACCOUNTING
> +    memset(&sig->ioac, 0, sizeof(sig->ioac));
> +#endif
>     sig->sum_sched_runtime = 0;
>     INIT_LIST_HEAD(&sig->cpu_timers[0]);
>     INIT_LIST_HEAD(&sig->cpu_timers[1]);


-- 
	Warm Regards,
	Balbir Singh
	Linux Technology Center
	IBM, ISTL

^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [PATCH] distinct tgid/tid I/O statistics
  2008-05-19 17:49 ` Balbir Singh
@ 2008-05-19 18:33   ` Andrea Righi
  2008-05-19 20:08     ` Andrea Righi
  0 siblings, 1 reply; 6+ messages in thread
From: Andrea Righi @ 2008-05-19 18:33 UTC (permalink / raw)
  To: balbir
  Cc: Shailabh Nagar, Mark Seger, Jes Sorensen, Chris Sturtivant,
	Tony Ernst, LKML

Balbir Singh wrote:
>> +
>> +    if (!whole) {
>> +#ifdef CONFIG_TASK_XACCT
> 
> Can't we abstract this into a function for ease?

Maybe:

#ifdef CONFIG_TASK_XACCT
#define _task_xacct 1
#else
#define _task_xacct 0
#endif

and:

rchar = __task_xacct ? task->rchar : 0;
wchar = __task_xacct ? task->wchar : 0;
syscr = __task_xacct ? task->syscr : 0;
syscw = __task_xacct ? task->syscw : 0;

But it's very ugly anyway. It seems a similar problem has been raised
here:

http://lkml.org/lkml/2008/5/17/154

Apparently with no solution.

> 
>> +        rchar = task->rchar;
>> +        wchar = task->wchar;
>> +        syscr = task->syscr;
>> +        syscw = task->syscw;
>> +#endif
>> +        memcpy(&ioac, &task->ioac, sizeof(ioac));
>> +    } else {
>> +        rchar = wchar = syscr = syscw = 0;
>> +        memset(&ioac, 0, sizeof(ioac));
>> +        rcu_read_lock();
>> +        if (lock_task_sighand(task, &flags)) {
> 
> If lock(), what happens otherwise?
> 

mmmh... we must surely move the threads stats accounting code out of
the if statement. OK, I'll post a new patch soon.

Thanks,
-Andrea

>> +        struct signal_struct *sig = task->signal;
>> +        struct task_struct *t = task;
>> +
>> +            do {
>> +#ifdef CONFIG_TASK_XACCT
>> +                rchar += t->rchar;
>> +                wchar += t->wchar;
>> +                syscr += t->syscr;
>> +                syscw += t->syscw;
>> +#endif
>> +                ioac.read_bytes += t->ioac.read_bytes;
>> +                ioac.write_bytes += t->ioac.write_bytes;
>> +                ioac.cancelled_write_bytes +=
>> +                        t->ioac.cancelled_write_bytes;
>> +                t = next_thread(t);
>> +            } while (t != task);
>> +
>> +#ifdef CONFIG_TASK_XACCT
>> +            rchar += sig->rchar;
>> +            wchar += sig->wchar;
>> +            syscr += sig->syscr;
>> +            syscw += sig->syscw;
>> +#endif
>> +            ioac.read_bytes += sig->ioac.read_bytes;
>> +            ioac.write_bytes += sig->ioac.write_bytes;
>> +            ioac.cancelled_write_bytes +=
>> +                    sig->ioac.cancelled_write_bytes;
>> +
>> +            unlock_task_sighand(task, &flags);
>> +        }
>> +        rcu_read_unlock();
>> +    }

^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [PATCH] distinct tgid/tid I/O statistics
  2008-05-19 18:33   ` [PATCH] distinct tgid/tid I/O statistics Andrea Righi
@ 2008-05-19 20:08     ` Andrea Righi
  2008-05-19 21:57       ` [PATCH v2] " Andrea Righi
  0 siblings, 1 reply; 6+ messages in thread
From: Andrea Righi @ 2008-05-19 20:08 UTC (permalink / raw)
  To: balbir
  Cc: Shailabh Nagar, Mark Seger, Jes Sorensen, Chris Sturtivant,
	Tony Ernst, LKML

Andrea Righi wrote:
> Balbir Singh wrote:
>>> +
>>> +    if (!whole) {
>>> +#ifdef CONFIG_TASK_XACCT
>> Can't we abstract this into a function for ease?
> 

BTW, CONFIG_TASK_IO_ACCOUNTING depends on CONFIG_TASK_XACCT, so I think
we can just remove those #ifdefs inside CONFIG_TASK_IO_ACCOUNTING.

-Andrea

^ permalink raw reply	[flat|nested] 6+ messages in thread

* [PATCH v2] distinct tgid/tid I/O statistics
  2008-05-19 20:08     ` Andrea Righi
@ 2008-05-19 21:57       ` Andrea Righi
  0 siblings, 0 replies; 6+ messages in thread
From: Andrea Righi @ 2008-05-19 21:57 UTC (permalink / raw)
  To: Balbir Singh, Shailabh Nagar, Mark Seger
  Cc: Jes Sorensen, Chris Sturtivant, Tony Ernst, LKML

Changes based on Balbir Singh feedback:
- fix: correcly account children threads i/o activity
- removed CONFIG_TASK_XACCT #ifdefs inside CONFIG_TASK_IO_ACCOUNTING

Tested in latest Linus git.
---
Subject: [PATCH] distinct tgid/tid I/O statistics
From: Andrea Righi <righiandr@users.sourceforge.net>

Report per-thread I/O statistics in /proc/pid/task/tid/io and aggregate
parent I/O statistics in /proc/pid/io. This approach follows the same
model used to account per-process and per-thread CPU times.

As a practial application, this allows for example to quickly find the
top I/O consumer when a process spawns many child threads that perform
the actual I/O work, because the aggregated I/O statistics can always be
found in /proc/pid/io.

Bug reported by Mark Seger <Mark.Seger@hp.com>.

Signed-off-by: Andrea Righi <righiandr@users.sourceforge.net>
---
 fs/proc/base.c        |   84 ++++++++++++++++++++++++++++++++++++++++--------
 include/linux/sched.h |    4 ++
 kernel/exit.c         |   27 ++++++++++++++++
 kernel/fork.c         |    6 +++
 4 files changed, 107 insertions(+), 14 deletions(-)

diff --git a/fs/proc/base.c b/fs/proc/base.c
index 808cbdc..b905c6d 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -2356,29 +2356,82 @@ static int proc_base_fill_cache(struct file *filp, void *dirent,
 }
 
 #ifdef CONFIG_TASK_IO_ACCOUNTING
-static int proc_pid_io_accounting(struct task_struct *task, char *buffer)
+static int do_io_accounting(struct task_struct *task, char *buffer, int whole)
 {
+	unsigned long flags;
+	u64 rchar, wchar, syscr, syscw;
+	struct task_io_accounting ioac;
+
+	if (!whole) {
+		rchar = task->rchar;
+		wchar = task->wchar;
+		syscr = task->syscr;
+		syscw = task->syscw;
+		memcpy(&ioac, &task->ioac, sizeof(ioac));
+	} else {
+		struct task_struct *t = task;
+		rchar = wchar = syscr = syscw = 0;
+		memset(&ioac, 0, sizeof(ioac));
+
+		rcu_read_lock();
+		do {
+			rchar += t->rchar;
+			wchar += t->wchar;
+			syscr += t->syscr;
+			syscw += t->syscw;
+
+			ioac.read_bytes += t->ioac.read_bytes;
+			ioac.write_bytes += t->ioac.write_bytes;
+			ioac.cancelled_write_bytes +=
+					t->ioac.cancelled_write_bytes;
+			t = next_thread(t);
+		} while (t != task);
+
+		if (lock_task_sighand(task, &flags)) {
+			struct signal_struct *sig = task->signal;
+
+			rchar += sig->rchar;
+			wchar += sig->wchar;
+			syscr += sig->syscr;
+			syscw += sig->syscw;
+
+			ioac.read_bytes += sig->ioac.read_bytes;
+			ioac.write_bytes += sig->ioac.write_bytes;
+			ioac.cancelled_write_bytes +=
+					sig->ioac.cancelled_write_bytes;
+
+			unlock_task_sighand(task, &flags);
+		}
+		rcu_read_unlock();
+	}
+
 	return sprintf(buffer,
-#ifdef CONFIG_TASK_XACCT
 			"rchar: %llu\n"
 			"wchar: %llu\n"
 			"syscr: %llu\n"
 			"syscw: %llu\n"
-#endif
 			"read_bytes: %llu\n"
 			"write_bytes: %llu\n"
 			"cancelled_write_bytes: %llu\n",
-#ifdef CONFIG_TASK_XACCT
-			(unsigned long long)task->rchar,
-			(unsigned long long)task->wchar,
-			(unsigned long long)task->syscr,
-			(unsigned long long)task->syscw,
-#endif
-			(unsigned long long)task->ioac.read_bytes,
-			(unsigned long long)task->ioac.write_bytes,
-			(unsigned long long)task->ioac.cancelled_write_bytes);
+			(unsigned long long)rchar,
+			(unsigned long long)wchar,
+			(unsigned long long)syscr,
+			(unsigned long long)syscw,
+			(unsigned long long)ioac.read_bytes,
+			(unsigned long long)ioac.write_bytes,
+			(unsigned long long)ioac.cancelled_write_bytes);
 }
-#endif
+
+static int proc_tid_io_accounting(struct task_struct *task, char *buffer)
+{
+	return do_io_accounting(task, buffer, 0);
+}
+
+static int proc_tgid_io_accounting(struct task_struct *task, char *buffer)
+{
+	return do_io_accounting(task, buffer, 1);
+}
+#endif /* CONFIG_TASK_IO_ACCOUNTING */
 
 /*
  * Thread groups
@@ -2450,7 +2503,7 @@ static const struct pid_entry tgid_base_stuff[] = {
 	REG("coredump_filter", S_IRUGO|S_IWUSR, coredump_filter),
 #endif
 #ifdef CONFIG_TASK_IO_ACCOUNTING
-	INF("io",	S_IRUGO, pid_io_accounting),
+	INF("io",	S_IRUGO, tgid_io_accounting),
 #endif
 };
 
@@ -2778,6 +2831,9 @@ static const struct pid_entry tid_base_stuff[] = {
 #ifdef CONFIG_FAULT_INJECTION
 	REG("make-it-fail", S_IRUGO|S_IWUSR, fault_inject),
 #endif
+#ifdef CONFIG_TASK_IO_ACCOUNTING
+	INF("io",	S_IRUGO, tid_io_accounting),
+#endif
 };
 
 static int proc_tid_base_readdir(struct file * filp,
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 5395a61..d4d9adf 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -504,6 +504,10 @@ struct signal_struct {
 	unsigned long nvcsw, nivcsw, cnvcsw, cnivcsw;
 	unsigned long min_flt, maj_flt, cmin_flt, cmaj_flt;
 	unsigned long inblock, oublock, cinblock, coublock;
+#ifdef CONFIG_TASK_XACCT
+	u64 rchar, wchar, syscr, syscw;
+#endif
+	struct task_io_accounting ioac;
 
 	/*
 	 * Cumulative ns of scheduled CPU time for dead threads in the
diff --git a/kernel/exit.c b/kernel/exit.c
index 1510f78..1f3c0ec 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -120,6 +120,18 @@ static void __exit_signal(struct task_struct *tsk)
 		sig->nivcsw += tsk->nivcsw;
 		sig->inblock += task_io_get_inblock(tsk);
 		sig->oublock += task_io_get_oublock(tsk);
+#ifdef CONFIG_TASK_XACCT
+		sig->rchar += tsk->rchar;
+		sig->wchar += tsk->wchar;
+		sig->syscr += tsk->syscr;
+		sig->syscw += tsk->syscw;
+#endif /* CONFIG_TASK_XACCT */
+#ifdef CONFIG_TASK_IO_ACCOUNTING
+		sig->ioac.read_bytes += tsk->ioac.read_bytes;
+		sig->ioac.write_bytes += tsk->ioac.write_bytes;
+		sig->ioac.cancelled_write_bytes +=
+					tsk->ioac.cancelled_write_bytes;
+#endif /* CONFIG_TASK_IO_ACCOUNTING */
 		sig->sum_sched_runtime += tsk->se.sum_exec_runtime;
 		sig = NULL; /* Marker for below. */
 	}
@@ -1321,6 +1333,21 @@ static int wait_task_zombie(struct task_struct *p, int noreap,
 		psig->coublock +=
 			task_io_get_oublock(p) +
 			sig->oublock + sig->coublock;
+#ifdef CONFIG_TASK_XACCT
+		psig->rchar += p->rchar + sig->rchar;
+		psig->wchar += p->wchar + sig->wchar;
+		psig->syscr += p->syscr + sig->syscr;
+		psig->syscw += p->syscw + sig->syscw;
+#endif /* CONFIG_TASK_XACCT */
+#ifdef CONFIG_TASK_IO_ACCOUNTING
+		psig->ioac.read_bytes +=
+			p->ioac.read_bytes + sig->ioac.read_bytes;
+		psig->ioac.write_bytes +=
+			p->ioac.write_bytes + sig->ioac.write_bytes;
+		psig->ioac.cancelled_write_bytes +=
+				p->ioac.cancelled_write_bytes +
+				sig->ioac.cancelled_write_bytes;
+#endif /* CONFIG_TASK_IO_ACCOUNTING */
 		spin_unlock_irq(&p->parent->sighand->siglock);
 	}
 
diff --git a/kernel/fork.c b/kernel/fork.c
index 933e60e..bad9981 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -915,6 +915,12 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
 	sig->nvcsw = sig->nivcsw = sig->cnvcsw = sig->cnivcsw = 0;
 	sig->min_flt = sig->maj_flt = sig->cmin_flt = sig->cmaj_flt = 0;
 	sig->inblock = sig->oublock = sig->cinblock = sig->coublock = 0;
+#ifdef CONFIG_TASK_XACCT
+	sig->rchar = sig->wchar = sig->syscr = sig->syscw = 0;
+#endif
+#ifdef CONFIG_TASK_IO_ACCOUNTING
+	memset(&sig->ioac, 0, sizeof(sig->ioac));
+#endif
 	sig->sum_sched_runtime = 0;
 	INIT_LIST_HEAD(&sig->cpu_timers[0]);
 	INIT_LIST_HEAD(&sig->cpu_timers[1]);

^ permalink raw reply related	[flat|nested] 6+ messages in thread

end of thread, other threads:[~2008-05-19 21:58 UTC | newest]

Thread overview: 6+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2008-05-16 20:38 [PATCH] distinct tgid/tid I/O statistics (was: taskstats and /proc/.../io asymmetry?) Andrea Righi
2008-05-19 16:36 ` Andrea Righi
2008-05-19 17:49 ` Balbir Singh
2008-05-19 18:33   ` [PATCH] distinct tgid/tid I/O statistics Andrea Righi
2008-05-19 20:08     ` Andrea Righi
2008-05-19 21:57       ` [PATCH v2] " Andrea Righi

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox