* [patch v2 0/4] taskstats: Improve cumulative time accounting
@ 2010-11-29 16:42 Michael Holzheu
  2010-11-29 16:42 ` [patch v2 1/4] taskstats: Introduce "struct cdata" Michael Holzheu
                   ` (3 more replies)
  0 siblings, 4 replies; 19+ messages in thread
From: Michael Holzheu @ 2010-11-29 16:42 UTC (permalink / raw)
  To: Oleg Nesterov, Shailabh Nagar, Andrew Morton, Peter Zijlstra,
	John stultz, Thomas Gleixner, Balbir Singh, Martin Schwidefsky,
	Heiko Carstens, Roland McGrath, Valdis.Kletnieks
  Cc: linux-kernel, linux-s390

Version 2
---------
* Integrate review comments (see change logs of the individual patches)
* Add a sysctl to switch to the new behavior instead of adding an
  additional cdata_acct to struct signal (see patch 3)

According to POSIX.1-2001, the CPU time of a process is not accounted
to the cumulative time of its parent if the parent ignores SIGCHLD or
has set SA_NOCLDWAIT. This behavior has the major drawback that the
total CPU time consumed on a system cannot be calculated by looking at
the current tasks: CPU time can be lost.

This patch series adds a new sysctl "full_cdata" that allows switching
the kernel to a mode where the otherwise missing data is also accounted
to the cumulative counters.

PATCH SET OVERVIEW
------------------
Patches apply on:
git://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux-2.6.git

[1] Introduce "struct cdata"
[2] Introduce __account_cdata() function
[3] Introduce kernel.full_cdata sysctl
[4] Export "cdata_wait" CPU times with taskstats


* [patch v2 1/4] taskstats: Introduce "struct cdata"
  2010-11-29 16:42 [patch v2 0/4] taskstats: Improve cumulative time accounting Michael Holzheu
@ 2010-11-29 16:42 ` Michael Holzheu
  2010-11-29 16:42 ` [patch v2 2/4] taskstats: Introduce __account_cdata() function Michael Holzheu
                   ` (2 subsequent siblings)
  3 siblings, 0 replies; 19+ messages in thread
From: Michael Holzheu @ 2010-11-29 16:42 UTC (permalink / raw)
  To: Oleg Nesterov, Shailabh Nagar, Andrew Morton, Peter Zijlstra,
	John stultz, Thomas Gleixner, Balbir Singh, Martin Schwidefsky,
	Heiko Carstens, Roland McGrath, Valdis.Kletnieks
  Cc: linux-kernel, linux-s390

[-- Attachment #1: 01-taskstats-top-improve-ctime-add-cdata-struct.patch --]
[-- Type: text/plain, Size: 12485 bytes --]

From: Michael Holzheu <holzheu@linux.vnet.ibm.com>

Version 2
---------
* Add signal.c changes
* thread_group_cputime() cosmetics

Description
-----------
This patch introduces a new structure "struct cdata" that is used to
store cumulative resource counters for dead child processes and threads.

Note that there is one asymmetry:
For "struct task_io_accounting" (ioac) there is no extra accounting field
for dead threads. One field is used for both dead processes and dead
threads.

This patch introduces no functional change.
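
For orientation, this is the renaming scheme implemented by the hunks
below (old signal_struct field -> new field):

  cutime, cstime, cgtime    -> cdata_wait.utime, .stime, .gtime
  cmin_flt, cmaj_flt        -> cdata_wait.min_flt, .maj_flt
  cnvcsw, cnivcsw           -> cdata_wait.nvcsw, .nivcsw
  cinblock, coublock        -> cdata_wait.inblock, .oublock
  cmaxrss                   -> cdata_wait.maxrss
  utime, stime, gtime       -> cdata_threads.utime, .stime, .gtime
  min_flt, maj_flt          -> cdata_threads.min_flt, .maj_flt
  nvcsw, nivcsw             -> cdata_threads.nvcsw, .nivcsw
  inblock, oublock          -> cdata_threads.inblock, .oublock
  maxrss                    -> cdata_threads.maxrss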

Signed-off-by: Michael Holzheu <holzheu@linux.vnet.ibm.com>
Acked-by: Balbir Singh <balbir@linux.vnet.ibm.com>
---
 fs/binfmt_elf.c           |    4 +-
 fs/exec.c                 |    2 -
 fs/proc/array.c           |   16 ++++----
 include/linux/sched.h     |   22 +++++++----
 kernel/exit.c             |   86 ++++++++++++++++++++++++----------------------
 kernel/posix-cpu-timers.c |    8 ++--
 kernel/signal.c           |    4 +-
 kernel/sys.c              |   44 ++++++++++++-----------
 8 files changed, 100 insertions(+), 86 deletions(-)

--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -1296,8 +1296,8 @@ static void fill_prstatus(struct elf_prs
 		cputime_to_timeval(p->utime, &prstatus->pr_utime);
 		cputime_to_timeval(p->stime, &prstatus->pr_stime);
 	}
-	cputime_to_timeval(p->signal->cutime, &prstatus->pr_cutime);
-	cputime_to_timeval(p->signal->cstime, &prstatus->pr_cstime);
+	cputime_to_timeval(p->signal->cdata_wait.utime, &prstatus->pr_cutime);
+	cputime_to_timeval(p->signal->cdata_wait.stime, &prstatus->pr_cstime);
 }
 
 static int fill_psinfo(struct elf_prpsinfo *psinfo, struct task_struct *p,
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -894,7 +894,7 @@ static int de_thread(struct task_struct
 
 no_thread_group:
 	if (current->mm)
-		setmax_mm_hiwater_rss(&sig->maxrss, current->mm);
+		setmax_mm_hiwater_rss(&sig->cdata_threads.maxrss, current->mm);
 
 	exit_itimers(sig);
 	flush_itimer_signals();
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -413,11 +413,11 @@ static int do_task_stat(struct seq_file
 		num_threads = get_nr_threads(task);
 		collect_sigign_sigcatch(task, &sigign, &sigcatch);
 
-		cmin_flt = sig->cmin_flt;
-		cmaj_flt = sig->cmaj_flt;
-		cutime = sig->cutime;
-		cstime = sig->cstime;
-		cgtime = sig->cgtime;
+		cmin_flt = sig->cdata_wait.min_flt;
+		cmaj_flt = sig->cdata_wait.maj_flt;
+		cutime = sig->cdata_wait.utime;
+		cstime = sig->cdata_wait.stime;
+		cgtime = sig->cdata_wait.gtime;
 		rsslim = ACCESS_ONCE(sig->rlim[RLIMIT_RSS].rlim_cur);
 
 		/* add up live thread stats at the group level */
@@ -430,10 +430,10 @@ static int do_task_stat(struct seq_file
 				t = next_thread(t);
 			} while (t != task);
 
-			min_flt += sig->min_flt;
-			maj_flt += sig->maj_flt;
+			min_flt += sig->cdata_threads.min_flt;
+			maj_flt += sig->cdata_threads.maj_flt;
 			thread_group_times(task, &utime, &stime);
-			gtime = cputime_add(gtime, sig->gtime);
+			gtime = cputime_add(gtime, sig->cdata_threads.gtime);
 		}
 
 		sid = task_session_nr_ns(task, ns);
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -510,6 +510,17 @@ struct thread_group_cputimer {
 };
 
 /*
+ * Cumulative resource counters
+ */
+struct cdata {
+	cputime_t utime, stime, gtime;
+	unsigned long nvcsw, nivcsw;
+	unsigned long min_flt, maj_flt;
+	unsigned long inblock, oublock;
+	unsigned long maxrss;
+};
+
+/*
  * NOTE! "signal_struct" does not have it's own
  * locking, because a shared signal_struct always
  * implies a shared sighand_struct, so locking
@@ -582,17 +593,12 @@ struct signal_struct {
 	 * Live threads maintain their own counters and add to these
 	 * in __exit_signal, except for the group leader.
 	 */
-	cputime_t utime, stime, cutime, cstime;
-	cputime_t gtime;
-	cputime_t cgtime;
+	struct cdata cdata_wait;
+	struct cdata cdata_threads;
+	struct task_io_accounting ioac;
 #ifndef CONFIG_VIRT_CPU_ACCOUNTING
 	cputime_t prev_utime, prev_stime;
 #endif
-	unsigned long nvcsw, nivcsw, cnvcsw, cnivcsw;
-	unsigned long min_flt, maj_flt, cmin_flt, cmaj_flt;
-	unsigned long inblock, oublock, cinblock, coublock;
-	unsigned long maxrss, cmaxrss;
-	struct task_io_accounting ioac;
 
 	/*
 	 * Cumulative ns of schedule CPU time fo dead threads in the
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -95,6 +95,7 @@ static void __exit_signal(struct task_st
 		tty = sig->tty;
 		sig->tty = NULL;
 	} else {
+		struct cdata *tcd = &sig->cdata_threads;
 		/*
 		 * This can only happen if the caller is de_thread().
 		 * FIXME: this is the temporary hack, we should teach
@@ -122,15 +123,15 @@ static void __exit_signal(struct task_st
 		 * We won't ever get here for the group leader, since it
 		 * will have been the last reference on the signal_struct.
 		 */
-		sig->utime = cputime_add(sig->utime, tsk->utime);
-		sig->stime = cputime_add(sig->stime, tsk->stime);
-		sig->gtime = cputime_add(sig->gtime, tsk->gtime);
-		sig->min_flt += tsk->min_flt;
-		sig->maj_flt += tsk->maj_flt;
-		sig->nvcsw += tsk->nvcsw;
-		sig->nivcsw += tsk->nivcsw;
-		sig->inblock += task_io_get_inblock(tsk);
-		sig->oublock += task_io_get_oublock(tsk);
+		tcd->utime = cputime_add(tcd->utime, tsk->utime);
+		tcd->stime = cputime_add(tcd->stime, tsk->stime);
+		tcd->gtime = cputime_add(tcd->gtime, tsk->gtime);
+		tcd->min_flt += tsk->min_flt;
+		tcd->maj_flt += tsk->maj_flt;
+		tcd->nvcsw += tsk->nvcsw;
+		tcd->nivcsw += tsk->nivcsw;
+		tcd->inblock += task_io_get_inblock(tsk);
+		tcd->oublock += task_io_get_oublock(tsk);
 		task_io_accounting_add(&sig->ioac, &tsk->ioac);
 		sig->sum_sched_runtime += tsk->se.sum_exec_runtime;
 	}
@@ -960,10 +961,11 @@ NORET_TYPE void do_exit(long code)
 		sync_mm_rss(tsk, tsk->mm);
 	group_dead = atomic_dec_and_test(&tsk->signal->live);
 	if (group_dead) {
+		struct cdata *tcd = &tsk->signal->cdata_threads;
 		hrtimer_cancel(&tsk->signal->real_timer);
 		exit_itimers(tsk->signal);
 		if (tsk->mm)
-			setmax_mm_hiwater_rss(&tsk->signal->maxrss, tsk->mm);
+			setmax_mm_hiwater_rss(&tcd->maxrss, tsk->mm);
 	}
 	acct_collect(code, group_dead);
 	if (group_dead)
@@ -1225,8 +1227,7 @@ static int wait_task_zombie(struct wait_
 	 * !task_detached() to filter out sub-threads.
 	 */
 	if (likely(!traced) && likely(!task_detached(p))) {
-		struct signal_struct *psig;
-		struct signal_struct *sig;
+		struct cdata *cd, *pcd, *tcd;
 		unsigned long maxrss;
 		cputime_t tgutime, tgstime;
 
@@ -1251,40 +1252,43 @@ static int wait_task_zombie(struct wait_
 		 */
 		thread_group_times(p, &tgutime, &tgstime);
 		spin_lock_irq(&p->real_parent->sighand->siglock);
-		psig = p->real_parent->signal;
-		sig = p->signal;
-		psig->cutime =
-			cputime_add(psig->cutime,
+		pcd = &p->real_parent->signal->cdata_wait;
+		tcd = &p->signal->cdata_threads;
+		cd = &p->signal->cdata_wait;
+
+		pcd->utime =
+			cputime_add(pcd->utime,
 			cputime_add(tgutime,
-				    sig->cutime));
-		psig->cstime =
-			cputime_add(psig->cstime,
+				    cd->utime));
+		pcd->stime =
+			cputime_add(pcd->stime,
 			cputime_add(tgstime,
-				    sig->cstime));
-		psig->cgtime =
-			cputime_add(psig->cgtime,
+				    cd->stime));
+		pcd->gtime =
+			cputime_add(pcd->gtime,
 			cputime_add(p->gtime,
-			cputime_add(sig->gtime,
-				    sig->cgtime)));
-		psig->cmin_flt +=
-			p->min_flt + sig->min_flt + sig->cmin_flt;
-		psig->cmaj_flt +=
-			p->maj_flt + sig->maj_flt + sig->cmaj_flt;
-		psig->cnvcsw +=
-			p->nvcsw + sig->nvcsw + sig->cnvcsw;
-		psig->cnivcsw +=
-			p->nivcsw + sig->nivcsw + sig->cnivcsw;
-		psig->cinblock +=
+			cputime_add(tcd->gtime,
+				    cd->gtime)));
+		pcd->min_flt +=
+			p->min_flt + tcd->min_flt + cd->min_flt;
+		pcd->maj_flt +=
+			p->maj_flt + tcd->maj_flt + cd->maj_flt;
+		pcd->nvcsw +=
+			p->nvcsw + tcd->nvcsw + cd->nvcsw;
+		pcd->nivcsw +=
+			p->nivcsw + tcd->nivcsw + cd->nivcsw;
+		pcd->inblock +=
 			task_io_get_inblock(p) +
-			sig->inblock + sig->cinblock;
-		psig->coublock +=
+			tcd->inblock + cd->inblock;
+		pcd->oublock +=
 			task_io_get_oublock(p) +
-			sig->oublock + sig->coublock;
-		maxrss = max(sig->maxrss, sig->cmaxrss);
-		if (psig->cmaxrss < maxrss)
-			psig->cmaxrss = maxrss;
-		task_io_accounting_add(&psig->ioac, &p->ioac);
-		task_io_accounting_add(&psig->ioac, &sig->ioac);
+			tcd->oublock + cd->oublock;
+		maxrss = max(tcd->maxrss, cd->maxrss);
+		if (pcd->maxrss < maxrss)
+			pcd->maxrss = maxrss;
+		task_io_accounting_add(&p->real_parent->signal->ioac, &p->ioac);
+		task_io_accounting_add(&p->real_parent->signal->ioac,
+				       &p->signal->ioac);
 		spin_unlock_irq(&p->real_parent->sighand->siglock);
 	}
 
--- a/kernel/posix-cpu-timers.c
+++ b/kernel/posix-cpu-timers.c
@@ -235,8 +235,8 @@ void thread_group_cputime(struct task_st
 	struct signal_struct *sig = tsk->signal;
 	struct task_struct *t;
 
-	times->utime = sig->utime;
-	times->stime = sig->stime;
+	times->utime = sig->cdata_threads.utime;
+	times->stime = sig->cdata_threads.stime;
 	times->sum_exec_runtime = sig->sum_sched_runtime;
 
 	rcu_read_lock();
@@ -516,8 +516,8 @@ void posix_cpu_timers_exit_group(struct
 	struct signal_struct *const sig = tsk->signal;
 
 	cleanup_timers(tsk->signal->cpu_timers,
-		       cputime_add(tsk->utime, sig->utime),
-		       cputime_add(tsk->stime, sig->stime),
+		       cputime_add(tsk->utime, sig->cdata_threads.utime),
+		       cputime_add(tsk->stime, sig->cdata_threads.stime),
 		       tsk->se.sum_exec_runtime + sig->sum_sched_runtime);
 }
 
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -1476,9 +1476,9 @@ int do_notify_parent(struct task_struct
 	rcu_read_unlock();
 
 	info.si_utime = cputime_to_clock_t(cputime_add(tsk->utime,
-				tsk->signal->utime));
+				tsk->signal->cdata_threads.utime));
 	info.si_stime = cputime_to_clock_t(cputime_add(tsk->stime,
-				tsk->signal->stime));
+				tsk->signal->cdata_threads.stime));
 
 	info.si_status = tsk->exit_code & 0x7f;
 	if (tsk->exit_code & 0x80)
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -884,8 +884,8 @@ void do_sys_times(struct tms *tms)
 
 	spin_lock_irq(&current->sighand->siglock);
 	thread_group_times(current, &tgutime, &tgstime);
-	cutime = current->signal->cutime;
-	cstime = current->signal->cstime;
+	cutime = current->signal->cdata_wait.utime;
+	cstime = current->signal->cdata_wait.stime;
 	spin_unlock_irq(&current->sighand->siglock);
 	tms->tms_utime = cputime_to_clock_t(tgutime);
 	tms->tms_stime = cputime_to_clock_t(tgstime);
@@ -1490,6 +1490,7 @@ static void k_getrusage(struct task_stru
 	unsigned long flags;
 	cputime_t tgutime, tgstime, utime, stime;
 	unsigned long maxrss = 0;
+	struct cdata *cd;
 
 	memset((char *) r, 0, sizeof *r);
 	utime = stime = cputime_zero;
@@ -1497,7 +1498,7 @@ static void k_getrusage(struct task_stru
 	if (who == RUSAGE_THREAD) {
 		task_times(current, &utime, &stime);
 		accumulate_thread_rusage(p, r);
-		maxrss = p->signal->maxrss;
+		maxrss = p->signal->cdata_threads.maxrss;
 		goto out;
 	}
 
@@ -1507,31 +1508,34 @@ static void k_getrusage(struct task_stru
 	switch (who) {
 		case RUSAGE_BOTH:
 		case RUSAGE_CHILDREN:
-			utime = p->signal->cutime;
-			stime = p->signal->cstime;
-			r->ru_nvcsw = p->signal->cnvcsw;
-			r->ru_nivcsw = p->signal->cnivcsw;
-			r->ru_minflt = p->signal->cmin_flt;
-			r->ru_majflt = p->signal->cmaj_flt;
-			r->ru_inblock = p->signal->cinblock;
-			r->ru_oublock = p->signal->coublock;
-			maxrss = p->signal->cmaxrss;
+			cd = &p->signal->cdata_wait;
+			utime = cd->utime;
+			stime = cd->stime;
+			r->ru_nvcsw = cd->nvcsw;
+			r->ru_nivcsw = cd->nivcsw;
+			r->ru_minflt = cd->min_flt;
+			r->ru_majflt = cd->maj_flt;
+			r->ru_inblock = cd->inblock;
+			r->ru_oublock = cd->oublock;
+			maxrss = cd->maxrss;
 
 			if (who == RUSAGE_CHILDREN)
 				break;
 
 		case RUSAGE_SELF:
+			cd = &p->signal->cdata_threads;
 			thread_group_times(p, &tgutime, &tgstime);
 			utime = cputime_add(utime, tgutime);
 			stime = cputime_add(stime, tgstime);
-			r->ru_nvcsw += p->signal->nvcsw;
-			r->ru_nivcsw += p->signal->nivcsw;
-			r->ru_minflt += p->signal->min_flt;
-			r->ru_majflt += p->signal->maj_flt;
-			r->ru_inblock += p->signal->inblock;
-			r->ru_oublock += p->signal->oublock;
-			if (maxrss < p->signal->maxrss)
-				maxrss = p->signal->maxrss;
+			r->ru_nvcsw += cd->nvcsw;
+			r->ru_nivcsw += cd->nivcsw;
+			r->ru_minflt += cd->min_flt;
+			r->ru_majflt += cd->maj_flt;
+			r->ru_inblock += cd->inblock;
+			r->ru_oublock += cd->oublock;
+			if (maxrss < cd->maxrss)
+				maxrss = cd->maxrss;
+
 			t = p;
 			do {
 				accumulate_thread_rusage(t, r);



* [patch v2 2/4] taskstats: Introduce __account_cdata() function
  2010-11-29 16:42 [patch v2 0/4] taskstats: Improve cumulative time accounting Michael Holzheu
  2010-11-29 16:42 ` [patch v2 1/4] taskstats: Introduce "struct cdata" Michael Holzheu
@ 2010-11-29 16:42 ` Michael Holzheu
  2010-11-29 16:42 ` [patch v2 3/4] taskstats: Introduce kernel.full_cdata sysctl Michael Holzheu
  2010-11-29 16:42 ` [patch v2 4/4] taskstats: Export "cdata_wait" CPU times with taskstats Michael Holzheu
  3 siblings, 0 replies; 19+ messages in thread
From: Michael Holzheu @ 2010-11-29 16:42 UTC (permalink / raw)
  To: Oleg Nesterov, Shailabh Nagar, Andrew Morton, Peter Zijlstra,
	John stultz, Thomas Gleixner, Balbir Singh, Martin Schwidefsky,
	Heiko Carstens, Roland McGrath, Valdis.Kletnieks
  Cc: linux-kernel, linux-s390

[-- Attachment #1: 02-taskstats-top-improve-ctime-add_cdata-func.patch --]
[-- Type: text/plain, Size: 5535 bytes --]

From: Michael Holzheu <holzheu@linux.vnet.ibm.com>

Version 2
---------
* Move lock of siglock down in __account_cdata()

Description
-----------
This patch introduces the function __account_cdata() that does the
cumulative resource accounting for dead processes in sys_wait().
No functional change is made. This patch is a preparation for
the full cdata accounting (full_cdata sysctl).

Signed-off-by: Michael Holzheu <holzheu@linux.vnet.ibm.com>
---
 kernel/exit.c |  133 +++++++++++++++++++++++++++++-----------------------------
 1 file changed, 68 insertions(+), 65 deletions(-)

--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -74,6 +74,72 @@ static void __unhash_process(struct task
 	list_del_rcu(&p->thread_group);
 }
 
+static void __account_cdata(struct task_struct *p)
+{
+	struct cdata *cd, *pcd, *tcd;
+	unsigned long maxrss;
+	cputime_t tgutime, tgstime;
+
+	/*
+	 * The resource counters for the group leader are in its
+	 * own task_struct.  Those for dead threads in the group
+	 * are in its signal_struct, as are those for the child
+	 * processes it has previously reaped.  All these
+	 * accumulate in the parent's signal_struct c* fields.
+	 *
+	 * We don't bother to take a lock here to protect these
+	 * p->signal fields, because they are only touched by
+	 * __exit_signal, which runs with tasklist_lock
+	 * write-locked anyway, and so is excluded here.  We do
+	 * need to protect the access to parent->signal fields,
+	 * as other threads in the parent group can be right
+	 * here reaping other children at the same time.
+	 *
+	 * We use thread_group_times() to get times for the thread
+	 * group, which consolidates times for all threads in the
+	 * group including the group leader.
+	 */
+	thread_group_times(p, &tgutime, &tgstime);
+	pcd = &p->real_parent->signal->cdata_wait;
+	tcd = &p->signal->cdata_threads;
+	cd = &p->signal->cdata_wait;
+
+	spin_lock_irq(&p->real_parent->sighand->siglock);
+	pcd->utime =
+		cputime_add(pcd->utime,
+		cputime_add(tgutime,
+			    cd->utime));
+	pcd->stime =
+		cputime_add(pcd->stime,
+		cputime_add(tgstime,
+			    cd->stime));
+	pcd->gtime =
+		cputime_add(pcd->gtime,
+		cputime_add(p->gtime,
+		cputime_add(tcd->gtime,
+			    cd->gtime)));
+	pcd->min_flt +=
+		p->min_flt + tcd->min_flt + cd->min_flt;
+	pcd->maj_flt +=
+		p->maj_flt + tcd->maj_flt + cd->maj_flt;
+	pcd->nvcsw +=
+		p->nvcsw + tcd->nvcsw + cd->nvcsw;
+	pcd->nivcsw +=
+		p->nivcsw + tcd->nivcsw + cd->nivcsw;
+	pcd->inblock +=
+		task_io_get_inblock(p) +
+		tcd->inblock + cd->inblock;
+	pcd->oublock +=
+		task_io_get_oublock(p) +
+		tcd->oublock + cd->oublock;
+	maxrss = max(tcd->maxrss, cd->maxrss);
+	if (pcd->maxrss < maxrss)
+		pcd->maxrss = maxrss;
+	task_io_accounting_add(&p->real_parent->signal->ioac, &p->ioac);
+	task_io_accounting_add(&p->real_parent->signal->ioac, &p->signal->ioac);
+	spin_unlock_irq(&p->real_parent->sighand->siglock);
+}
+
 /*
  * This function expects the tasklist_lock write-locked.
  */
@@ -1226,71 +1292,8 @@ static int wait_task_zombie(struct wait_
 	 * It can be ptraced but not reparented, check
 	 * !task_detached() to filter out sub-threads.
 	 */
-	if (likely(!traced) && likely(!task_detached(p))) {
-		struct cdata *cd, *pcd, *tcd;
-		unsigned long maxrss;
-		cputime_t tgutime, tgstime;
-
-		/*
-		 * The resource counters for the group leader are in its
-		 * own task_struct.  Those for dead threads in the group
-		 * are in its signal_struct, as are those for the child
-		 * processes it has previously reaped.  All these
-		 * accumulate in the parent's signal_struct c* fields.
-		 *
-		 * We don't bother to take a lock here to protect these
-		 * p->signal fields, because they are only touched by
-		 * __exit_signal, which runs with tasklist_lock
-		 * write-locked anyway, and so is excluded here.  We do
-		 * need to protect the access to parent->signal fields,
-		 * as other threads in the parent group can be right
-		 * here reaping other children at the same time.
-		 *
-		 * We use thread_group_times() to get times for the thread
-		 * group, which consolidates times for all threads in the
-		 * group including the group leader.
-		 */
-		thread_group_times(p, &tgutime, &tgstime);
-		spin_lock_irq(&p->real_parent->sighand->siglock);
-		pcd = &p->real_parent->signal->cdata_wait;
-		tcd = &p->signal->cdata_threads;
-		cd = &p->signal->cdata_wait;
-
-		pcd->utime =
-			cputime_add(pcd->utime,
-			cputime_add(tgutime,
-				    cd->utime));
-		pcd->stime =
-			cputime_add(pcd->stime,
-			cputime_add(tgstime,
-				    cd->stime));
-		pcd->gtime =
-			cputime_add(pcd->gtime,
-			cputime_add(p->gtime,
-			cputime_add(tcd->gtime,
-				    cd->gtime)));
-		pcd->min_flt +=
-			p->min_flt + tcd->min_flt + cd->min_flt;
-		pcd->maj_flt +=
-			p->maj_flt + tcd->maj_flt + cd->maj_flt;
-		pcd->nvcsw +=
-			p->nvcsw + tcd->nvcsw + cd->nvcsw;
-		pcd->nivcsw +=
-			p->nivcsw + tcd->nivcsw + cd->nivcsw;
-		pcd->inblock +=
-			task_io_get_inblock(p) +
-			tcd->inblock + cd->inblock;
-		pcd->oublock +=
-			task_io_get_oublock(p) +
-			tcd->oublock + cd->oublock;
-		maxrss = max(tcd->maxrss, cd->maxrss);
-		if (pcd->maxrss < maxrss)
-			pcd->maxrss = maxrss;
-		task_io_accounting_add(&p->real_parent->signal->ioac, &p->ioac);
-		task_io_accounting_add(&p->real_parent->signal->ioac,
-				       &p->signal->ioac);
-		spin_unlock_irq(&p->real_parent->sighand->siglock);
-	}
+	if (likely(!traced) && likely(!task_detached(p)))
+		__account_cdata(p);
 
 	/*
 	 * Now we are sure this task is interesting, and no other



* [patch v2 3/4] taskstats: Introduce kernel.full_cdata sysctl
  2010-11-29 16:42 [patch v2 0/4] taskstats: Improve cumulative time accounting Michael Holzheu
  2010-11-29 16:42 ` [patch v2 1/4] taskstats: Introduce "struct cdata" Michael Holzheu
  2010-11-29 16:42 ` [patch v2 2/4] taskstats: Introduce __account_cdata() function Michael Holzheu
@ 2010-11-29 16:42 ` Michael Holzheu
  2010-11-29 16:42 ` [patch v2 4/4] taskstats: Export "cdata_wait" CPU times with taskstats Michael Holzheu
  3 siblings, 0 replies; 19+ messages in thread
From: Michael Holzheu @ 2010-11-29 16:42 UTC (permalink / raw)
  To: Oleg Nesterov, Shailabh Nagar, Andrew Morton, Peter Zijlstra,
	John stultz, Thomas Gleixner, Balbir Singh, Martin Schwidefsky,
	Heiko Carstens, Roland McGrath, Valdis.Kletnieks
  Cc: linux-kernel, linux-s390

[-- Attachment #1: 03-taskstats-top-improve-ctime-account-cdata_acct.patch --]
[-- Type: text/plain, Size: 4305 bytes --]

From: Michael Holzheu <holzheu@linux.vnet.ibm.com>

Version 2
---------
* Implement sysctl instead of adding parallel cdata_acct
* Call __account_cdata() in __exit_signal() without tsk siglock

Description
-----------
Currently the cumulative time accounting in Linux is not complete.
According to POSIX.1-2001, the CPU time of a process is not accounted
to the cumulative time of its parent if the parent ignores SIGCHLD or
has set SA_NOCLDWAIT. This behavior has the major drawback that the
total CPU time consumed on a system cannot be calculated by looking at
the current tasks: CPU time can be lost.

This patch adds a new sysctl "kernel.full_cdata" that allows switching
between the POSIX behavior and complete cumulative accounting.
The default is the POSIX semantics.
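
Usage sketch (assuming this patch is applied; the switch takes effect
for subsequently exiting tasks):

  # cat /proc/sys/kernel/full_cdata
  0
  # echo 1 > /proc/sys/kernel/full_cdata
  (or: sysctl -w kernel.full_cdata=1)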

Signed-off-by: Michael Holzheu <holzheu@linux.vnet.ibm.com>
---
 Documentation/sysctl/kernel.txt |   12 ++++++++++++
 kernel/exit.c                   |   13 +++++++++----
 kernel/sysctl.c                 |   10 ++++++++++
 3 files changed, 31 insertions(+), 4 deletions(-)

--- a/Documentation/sysctl/kernel.txt
+++ b/Documentation/sysctl/kernel.txt
@@ -247,6 +247,18 @@ see the hostname(1) man page.
 
 ==============================================================
 
+full_cdata:
+
+With "full_cdata = 0" (default) cumulative resource accounting
+under Linux is done according to POSIX.1-2001: The resource
+counters of processes are not accounted to the cumulative counters
+of their parents, if the parents ignore SIGCHLD or have set
+SA_NOCLDWAIT. With "full_cdata = 1" it is possible to enforce
+that all dead processes without exception are accounted to their
+parents.
+
+==============================================================
+
 hotplug:
 
 Path for the hotplug policy agent.
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -57,6 +57,8 @@
 #include <asm/pgtable.h>
 #include <asm/mmu_context.h>
 
+int full_cdata_enabled;
+
 static void exit_mm(struct task_struct * tsk);
 
 static void __unhash_process(struct task_struct *p, bool group_dead)
@@ -77,7 +79,7 @@ static void __unhash_process(struct task
 static void __account_cdata(struct task_struct *p)
 {
 	struct cdata *cd, *pcd, *tcd;
-	unsigned long maxrss;
+	unsigned long maxrss, flags;
 	cputime_t tgutime, tgstime;
 
 	/*
@@ -104,7 +106,7 @@ static void __account_cdata(struct task_
 	tcd = &p->signal->cdata_threads;
 	cd = &p->signal->cdata_wait;
 
-	spin_lock_irq(&p->real_parent->sighand->siglock);
+	spin_lock_irqsave(&p->real_parent->sighand->siglock, flags);
 	pcd->utime =
 		cputime_add(pcd->utime,
 		cputime_add(tgutime,
@@ -137,7 +139,7 @@ static void __account_cdata(struct task_
 		pcd->maxrss = maxrss;
 	task_io_accounting_add(&p->real_parent->signal->ioac, &p->ioac);
 	task_io_accounting_add(&p->real_parent->signal->ioac, &p->signal->ioac);
-	spin_unlock_irq(&p->real_parent->sighand->siglock);
+	spin_unlock_irqrestore(&p->real_parent->sighand->siglock, flags);
 }
 
 /*
@@ -150,6 +152,9 @@ static void __exit_signal(struct task_st
 	struct sighand_struct *sighand;
 	struct tty_struct *uninitialized_var(tty);
 
+	if (group_dead && full_cdata_enabled)
+		__account_cdata(tsk);
+
 	sighand = rcu_dereference_check(tsk->sighand,
 					rcu_read_lock_held() ||
 					lockdep_tasklist_lock_is_held());
@@ -1292,7 +1297,7 @@ static int wait_task_zombie(struct wait_
 	 * It can be ptraced but not reparented, check
 	 * !task_detached() to filter out sub-threads.
 	 */
-	if (likely(!traced) && likely(!task_detached(p)))
+	if (likely(!traced) && likely(!task_detached(p)) && !full_cdata_enabled)
 		__account_cdata(p);
 
 	/*
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -85,6 +85,7 @@
 #if defined(CONFIG_SYSCTL)
 
 /* External variables not in a header file. */
+extern int full_cdata_enabled;
 extern int sysctl_overcommit_memory;
 extern int sysctl_overcommit_ratio;
 extern int max_threads;
@@ -963,6 +964,15 @@ static struct ctl_table kern_table[] = {
 		.proc_handler	= proc_dointvec,
 	},
 #endif
+	{
+		.procname	= "full_cdata",
+		.data		= &full_cdata_enabled,
+		.maxlen		= sizeof(unsigned int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_minmax,
+		.extra1		= &zero,
+		.extra2		= &one,
+	},
 /*
  * NOTE: do not add new entries to this table unless you have read
  * Documentation/sysctl/ctl_unnumbered.txt



* [patch v2 4/4] taskstats: Export "cdata_wait" CPU times with taskstats
  2010-11-29 16:42 [patch v2 0/4] taskstats: Improve cumulative time accounting Michael Holzheu
                   ` (2 preceding siblings ...)
  2010-11-29 16:42 ` [patch v2 3/4] taskstats: Introduce kernel.full_cdata sysctl Michael Holzheu
@ 2010-11-29 16:42 ` Michael Holzheu
  2010-12-01 18:51   ` Oleg Nesterov
  3 siblings, 1 reply; 19+ messages in thread
From: Michael Holzheu @ 2010-11-29 16:42 UTC (permalink / raw)
  To: Oleg Nesterov, Shailabh Nagar, Andrew Morton, Peter Zijlstra,
	John stultz, Thomas Gleixner, Balbir Singh, Martin Schwidefsky,
	Heiko Carstens, Roland McGrath, Valdis.Kletnieks
  Cc: linux-kernel, linux-s390

[-- Attachment #1: 04-taskstats-top-improve-ctime-taskstats.patch --]
[-- Type: text/plain, Size: 2177 bytes --]

From: Michael Holzheu <holzheu@linux.vnet.ibm.com>

Version 2
---------
* Use thread_group_leader() instead of (tsk->pid == tsk->tgid)
* Use cdata_wait instead of cdata_acct
* I left the struct signal locking in the patch, because I think
  that in order to get consistent data this is necessary. See also
  do_task_stat() in fs/proc/array.c. One problem is that we
  report wrong values (zero) for cdata, if lock_task_sighand()
  fails. This will be the same behavior as for /proc/<pid>/stat.
  But maybe we should somehow return to userspace that the
  information is not correct. Any ideas?

Description
-----------
With this patch the cumulative CPU time is added to "struct taskstats".
The CPU time is returned only for the thread group leader.
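
For consumers, a minimal sketch of how the new fields could be read.
This assumes "ts" already points at a struct taskstats extracted from
the netlink reply; the genetlink plumbing is omitted, see
Documentation/accounting/getdelays.c for a complete client.

#include <linux/taskstats.h>
#include <stdio.h>

static void print_cdata(const struct taskstats *ts)
{
	if (ts->version < 8)	/* ac_cutime/ac_cstime exist from v8 on */
		return;
	printf("cutime: %llu usec, cstime: %llu usec\n",
	       (unsigned long long) ts->ac_cutime,
	       (unsigned long long) ts->ac_cstime);
}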

Signed-off-by: Michael Holzheu <holzheu@linux.vnet.ibm.com>
---
 include/linux/taskstats.h |    7 ++++++-
 kernel/tsacct.c           |    7 +++++++
 2 files changed, 13 insertions(+), 1 deletion(-)

--- a/include/linux/taskstats.h
+++ b/include/linux/taskstats.h
@@ -33,7 +33,7 @@
  */
 
 
-#define TASKSTATS_VERSION	7
+#define TASKSTATS_VERSION	8
 #define TS_COMM_LEN		32	/* should be >= TASK_COMM_LEN
 					 * in linux/sched.h */
 
@@ -163,6 +163,11 @@ struct taskstats {
 	/* Delay waiting for memory reclaim */
 	__u64	freepages_count;
 	__u64	freepages_delay_total;
+	/* version 7 ends here */
+
+	/* Cumulative CPU time of dead children */
+	__u64	ac_cutime;		/* User CPU time [usec] */
+	__u64	ac_cstime;		/* System CPU time [usec] */
 };
 
 
--- a/kernel/tsacct.c
+++ b/kernel/tsacct.c
@@ -30,6 +30,7 @@ void bacct_add_tsk(struct taskstats *sta
 {
 	const struct cred *tcred;
 	struct timespec uptime, ts;
+	unsigned long flags;
 	u64 ac_etime;
 
 	BUILD_BUG_ON(TS_COMM_LEN < TASK_COMM_LEN);
@@ -71,6 +72,12 @@ void bacct_add_tsk(struct taskstats *sta
 	stats->ac_majflt = tsk->maj_flt;
 
 	strncpy(stats->ac_comm, tsk->comm, sizeof(stats->ac_comm));
+	if (thread_group_leader(tsk) && lock_task_sighand(tsk, &flags)) {
+		struct cdata *cd = &tsk->signal->cdata_wait;
+		stats->ac_cutime = cputime_to_usecs(cd->utime);
+		stats->ac_cstime = cputime_to_usecs(cd->stime);
+		unlock_task_sighand(tsk, &flags);
+	}
 }
 
 



* Re: [patch v2 4/4] taskstats: Export "cdata_wait" CPU times with taskstats
  2010-11-29 16:42 ` [patch v2 4/4] taskstats: Export "cdata_wait" CPU times with taskstats Michael Holzheu
@ 2010-12-01 18:51   ` Oleg Nesterov
  2010-12-02 16:34     ` Michael Holzheu
                       ` (2 more replies)
  0 siblings, 3 replies; 19+ messages in thread
From: Oleg Nesterov @ 2010-12-01 18:51 UTC (permalink / raw)
  To: Michael Holzheu
  Cc: Shailabh Nagar, Andrew Morton, Peter Zijlstra, John stultz,
	Thomas Gleixner, Balbir Singh, Martin Schwidefsky, Heiko Carstens,
	Roland McGrath, Valdis.Kletnieks, linux-kernel, linux-s390

On 11/29, Michael Holzheu wrote:
>
> * I left the struct signal locking in the patch, because I think
>   that in order to get consistent data this is necessary.

OK, I guess you mean that we want to read utime/stime "atomically",
and thus we need ->siglock to prevent the race with __exit_signal().

> @@ -30,6 +30,7 @@ void bacct_add_tsk(struct taskstats *sta
>  {
>  	const struct cred *tcred;
>  	struct timespec uptime, ts;
> +	unsigned long flags;
>  	u64 ac_etime;
>
>  	BUILD_BUG_ON(TS_COMM_LEN < TASK_COMM_LEN);
> @@ -71,6 +72,12 @@ void bacct_add_tsk(struct taskstats *sta
>  	stats->ac_majflt = tsk->maj_flt;
>
>  	strncpy(stats->ac_comm, tsk->comm, sizeof(stats->ac_comm));
> +	if (thread_group_leader(tsk) && lock_task_sighand(tsk, &flags)) {
> +		struct cdata *cd = &tsk->signal->cdata_wait;
> +		stats->ac_cutime = cputime_to_usecs(cd->utime);
> +		stats->ac_cstime = cputime_to_usecs(cd->stime);

perhaps we need a small comment to explain ->siglock...

But in fact I don't really understand this anyway. This is called
before we reparent our children. This means that ac_cutime/ac_cstime
can be changed after that (multithreading, or full_cdata_enabled).

Say, taskstats_exit()->fill_stats()->bacct_add_tsk(). Every thread
does this, including the group_leader. But, it is possible that
group_leader exits first, before other threads. IOW, what
stats->ac_cXtime actually mean?


Nevermind, the whole series looks correct to me. Although I still
can't find the time to read it ;) But at first glance this version
addresses all concerns we discussed before.

Oleg.



* Re: [patch v2 4/4] taskstats: Export "cdata_wait" CPU times with taskstats
  2010-12-01 18:51   ` Oleg Nesterov
@ 2010-12-02 16:34     ` Michael Holzheu
  2010-12-06  9:06       ` Balbir Singh
  2010-12-08 20:23       ` Oleg Nesterov
  2010-12-03  7:33     ` Balbir Singh
  2010-12-07 10:45     ` Michael Holzheu
  2 siblings, 2 replies; 19+ messages in thread
From: Michael Holzheu @ 2010-12-02 16:34 UTC (permalink / raw)
  To: Oleg Nesterov
  Cc: Shailabh Nagar, Andrew Morton, Peter Zijlstra, John stultz,
	Thomas Gleixner, Balbir Singh, Martin Schwidefsky, Heiko Carstens,
	Roland McGrath, Valdis.Kletnieks, linux-kernel, linux-s390

On Wed, 2010-12-01 at 19:51 +0100, Oleg Nesterov wrote:
> But in fact I don't really understand this anyway. This is called
> before we reparent our children. This means that ac_cutime/ac_cstime
> can be changed after that (multithreading, or full_cdata_enabled).
> 
> Say, taskstats_exit()->fill_stats()->bacct_add_tsk(). Every thread
> does this, including the group_leader. But, it is possible that
> group_leader exits first, before other threads. IOW, what
> stats->ac_cXtime actually mean?

Because I worked mostly with the ptop tool, I was focused not so much
on the taskstats exit events as on the taskstats commands that query
data for running tasks.

For the query scenario stats->ac_cXtime means:

1) full_cdata=0: "Sum of CPU time of exited child processes where
   sys_wait() has been done (up to this time)"
2) full_cdata=1: "Sum of CPU time of exited child processes where
   sys_wait() has been done, plus exited child processes where
   the parents ignored SIGCHLD or have set SA_NOCLDWAIT (up to
   this time)"

Regarding taskstats_exit(): Do you have something like the following
scenario in mind?

1) You have a thread group with several threads
2) Thread group leader dies and reports cdata_wait in taskstats_exit()
3) Thread group leader stays around as zombie until the thread
   group dies
4) Other forked processes of this thread group die
5) cdata_wait of thread group is increased
6) The new cdata is not reported by any exit event of the thread group

So maybe we should remove the thread_group_leader() check and report
cdata_wait for all threads and not only for the thread group leader? We
also should add ac_tgid to taskstats so that userspace can find the
corresponding thread group for each thread.

When the last thread exits and the process/thread group dies,
taskstats_exit() sends an additional taskstats struct to userspace that
aggregates the thread accounting data. Currently only the delay
accounting data is aggregated (see
taskstats_exit->fill_tgid_exit->delayacct_add_tsk). Not sure, why the
other information is not aggregated. We perhaps also should include
ac_cXtime in the aggregated taskstats.

Michael





* Re: [patch v2 4/4] taskstats: Export "cdata_wait" CPU times with taskstats
  2010-12-01 18:51   ` Oleg Nesterov
  2010-12-02 16:34     ` Michael Holzheu
@ 2010-12-03  7:33     ` Balbir Singh
  2010-12-06 12:37       ` Michael Holzheu
  2010-12-07 10:45     ` Michael Holzheu
  2 siblings, 1 reply; 19+ messages in thread
From: Balbir Singh @ 2010-12-03  7:33 UTC (permalink / raw)
  To: Oleg Nesterov
  Cc: Michael Holzheu, Shailabh Nagar, Andrew Morton, Peter Zijlstra,
	John stultz, Thomas Gleixner, Martin Schwidefsky, Heiko Carstens,
	Roland McGrath, Valdis.Kletnieks, linux-kernel, linux-s390

* Oleg Nesterov <oleg@redhat.com> [2010-12-01 19:51:28]:

> On 11/29, Michael Holzheu wrote:
> >
> > * I left the struct signal locking in the patch, because I think
> >   that in order to get consistent data this is necessary.
> 
> OK, I guess you mean that we want to read utime/stime "atomically",
> and thus we need ->siglock to prevent the race with __exit_signal().
> 
> > @@ -30,6 +30,7 @@ void bacct_add_tsk(struct taskstats *sta
> >  {
> >  	const struct cred *tcred;
> >  	struct timespec uptime, ts;
> > +	unsigned long flags;
> >  	u64 ac_etime;
> >
> >  	BUILD_BUG_ON(TS_COMM_LEN < TASK_COMM_LEN);
> > @@ -71,6 +72,12 @@ void bacct_add_tsk(struct taskstats *sta
> >  	stats->ac_majflt = tsk->maj_flt;
> >
> >  	strncpy(stats->ac_comm, tsk->comm, sizeof(stats->ac_comm));
> > +	if (thread_group_leader(tsk) && lock_task_sighand(tsk, &flags)) {
> > +		struct cdata *cd = &tsk->signal->cdata_wait;
> > +		stats->ac_cutime = cputime_to_usecs(cd->utime);
> > +		stats->ac_cstime = cputime_to_usecs(cd->stime);
> 
> perhaps we need a small comment to explain ->siglock...
> 
> But in fact I don't really understand this anyway. This is called
> before we reparent our children. This means that ac_cutime/ac_cstime
> can be changed after that (multithreading, or full_cdata_enabled).
> 
> Say, taskstats_exit()->fill_stats()->bacct_add_tsk(). Every thread
> does this, including the group_leader. But, it is possible that
> group_leader exits first, before other threads. IOW, what
> stats->ac_cXtime actually mean?
> 

stats->ac_* time was designed only for tgid's to begin with, so I am
not sure if ac_cXtime makes sense for threads

> 
> Nevermind, the whole series looks correct to me. Although I still
> can't find the time to read it ;) But at first glance this version
> addresses all concerns we discussed before.
> 
> Oleg.
> 

-- 
	Three Cheers,
	Balbir


* Re: [patch v2 4/4] taskstats: Export "cdata_wait" CPU times with taskstats
  2010-12-02 16:34     ` Michael Holzheu
@ 2010-12-06  9:06       ` Balbir Singh
  2010-12-08 20:23       ` Oleg Nesterov
  1 sibling, 0 replies; 19+ messages in thread
From: Balbir Singh @ 2010-12-06  9:06 UTC (permalink / raw)
  To: Michael Holzheu
  Cc: Oleg Nesterov, Shailabh Nagar, Andrew Morton, Peter Zijlstra,
	John stultz, Thomas Gleixner, Martin Schwidefsky, Heiko Carstens,
	Roland McGrath, Valdis.Kletnieks, linux-kernel, linux-s390

* Michael Holzheu <holzheu@linux.vnet.ibm.com> [2010-12-02 17:34:01]:

> On Wed, 2010-12-01 at 19:51 +0100, Oleg Nesterov wrote:
> > But in fact I don't really understand this anyway. This is called
> > before we reparent our children. This means that ac_cutime/ac_cstime
> > can be changed after that (multithreading, or full_cdata_enabled).
> > 
> > Say, taskstats_exit()->fill_stats()->bacct_add_tsk(). Every thread
> > does this, including the group_leader. But, it is possible that
> > group_leader exits first, before other threads. IOW, what
> > stats->ac_cXtime actually mean?
> 
> Because I worked mostly with the ptop tool, I was focused not so much
> on the taskstats exit events as on the taskstats commands that query
> data for running tasks.
> 
> For the query scenario stats->ac_cXtime means:
> 
> 1) full_cdata=0: "Sum of CPU time of exited child processes where
>    sys_wait() has been done (up to this time)"
> 2) full_cdata=1: "Sum of CPU time of exited child processes where
>    sys_wait() has been done, plus exited child processes where
>    the parents ignored SIGCHLD or have set SA_NOCLDWAIT (up to
>    this time)"
> 
> Regarding taskstats_exit(): Do you have something like the following
> scenario in mind?
> 
> 1) You have a thread group with several threads
> 2) Thread group leader dies and reports cdata_wait in taskstats_exit()
> 3) Thread group leader stays around as zombie until the thread
>    group dies
> 4) Other forked processes of this thread group die
> 5) cdata_wait of thread group is increased
> 6) The new cdata is not reported by any exit event of the thread group
> 
> So maybe we should remove the thread_group_leader() check and report
> cdata_wait for all threads and not only for the thread group leader? We
> also should add ac_tgid to taskstats so that userspace can find the
> corresponding thread group for each thread.
> 
> When the last thread exits and the process/thread group dies,
> taskstats_exit() sends an additional taskstats struct to userspace that
> aggregates the thread accounting data. Currently only the delay
> accounting data is aggregated (see
> taskstats_exit->fill_tgid_exit->delayacct_add_tsk). Not sure, why the
> other information is not aggregated. We perhaps also should include
> ac_cXtime in the aggregated taskstats.
>

The delay accounting data is aggregated; the other part, tsacct, does
not care much about aggregation (IIRC). tsacct was added to export pid
data and never tgid data, again IIRC. This subsystem is the one that
owns ac_*; perhaps if tsacct supported tgid's, then what you say makes
sense.

-- 
	Three Cheers,
	Balbir


* Re: [patch v2 4/4] taskstats: Export "cdata_wait" CPU times with taskstats
  2010-12-03  7:33     ` Balbir Singh
@ 2010-12-06 12:37       ` Michael Holzheu
  2010-12-06 15:15         ` Balbir Singh
  0 siblings, 1 reply; 19+ messages in thread
From: Michael Holzheu @ 2010-12-06 12:37 UTC (permalink / raw)
  To: balbir
  Cc: Oleg Nesterov, Shailabh Nagar, Andrew Morton, Peter Zijlstra,
	John stultz, Thomas Gleixner, Martin Schwidefsky, Heiko Carstens,
	Roland McGrath, Valdis.Kletnieks, linux-kernel, linux-s390

Hello Balbir,

On Fri, 2010-12-03 at 13:03 +0530, Balbir Singh wrote:
> * Oleg Nesterov <oleg@redhat.com> [2010-12-01 19:51:28]:
> 
> > On 11/29, Michael Holzheu wrote:
[snip]
> > Say, taskstats_exit()->fill_stats()->bacct_add_tsk(). Every thread
> > does this, including the group_leader. But, it is possible that
> > group_leader exits first, before other threads. IOW, what
> > stats->ac_cXtime actually mean?
> > 
> 
> stats->ac_* time was designed only for tgid's to begin with,

You mean for tids (threads/tasks), no? stats->ac_* time is only reported
for threads in bacct_add_tsk() and not for tgids.

> so I am
> not sure if ac_cXtime makes sense for threads

I would suggest to do it the same way as /proc/<tgid>/tasks/<tid>/stat.
It reports the (same) cumulative time for each thread. See
do_task_stat() in fs/proc/array.c. So IMHO also for taskstats it makes
sense to include the cXtime for all threads and not only for the thread
group leader. 

Also I would include tgid in taskstats so that userspace can group the
tasks into thread groups. 

I am not sure regarding the aggregation for tgids in
TASKSTATS_CMD_ATTR_TGID and exit events with group_dead=1. Currently
only the delay accounting numbers are aggregated. If we did it
like /proc/<tgid>/stat (do_task_stat() with whole=1), we could also
aggregate the other values, e.g. the CPU time.

I think the following tgid data would make sense to be returned for
TASKSTATS_CMD_ATTR_TGID and exit events with group_dead=1:

bacct_add_tsk():
----------------
ac_pid         thread group leader (tsk->tgid)
ac_etime       thread group leader
ac_btime       thread group leader
ac_nice        thread group leader
ac_sched       thread group leader
ac_uid         thread group leader
ac_gid         thread group leader
ac_ppid        thread group leader
ac_comm        thread group leader
ac_exit_code   thread group leader
ac_flags       thread group leader
ac_utimescaled ??
ac_stimescaled ??
ac_utime       sum for all live threads + cdata_threads
ac_stime       sum for all live threads + cdata_threads
ac_minflt      sum for all live threads + cdata_threads
ac_majflt      sum for all live threads + cdata_threads

new:
ac_cutime      cdata_wait
ac_cstime      cdata_wait
ac_tgid        thread group leader

xacct_add_tsk():
----------------
coremem        ?
virtmem        ?
hiwater_rss    thread group leader
hiwater_vm     thread group leader
read_char      sum for all live threads + tsk->signal.ioac
write_char     sum for all live threads + tsk->signal.ioac
read_syscalls  sum for all live threads + tsk->signal.ioac
write_syscalls sum for all live threads + tsk->signal.ioac
read_bytes     sum for all live threads + tsk->signal.ioac
write bytes    sum for all live threads + tsk->signal.ioac
cancelled_write_bytes sum for all live threads + tsk->signal.ioac
  
If we leave everything as it currently is for tgid accounting, we
probably should also not include the cXtime for tgids and just include
the data for threads (TASKSTATS_CMD_ATTR_PID and task exit events).

What do you think?

Michael






* Re: [patch v2 4/4] taskstats: Export "cdata_wait" CPU times with taskstats
  2010-12-06 12:37       ` Michael Holzheu
@ 2010-12-06 15:15         ` Balbir Singh
  0 siblings, 0 replies; 19+ messages in thread
From: Balbir Singh @ 2010-12-06 15:15 UTC (permalink / raw)
  To: Michael Holzheu
  Cc: Oleg Nesterov, Shailabh Nagar, Andrew Morton, Peter Zijlstra,
	John stultz, Thomas Gleixner, Martin Schwidefsky, Heiko Carstens,
	Roland McGrath, Valdis.Kletnieks, linux-kernel, linux-s390

* Michael Holzheu <holzheu@linux.vnet.ibm.com> [2010-12-06 13:37:37]:

> Hello Balbir,
> 
> On Fri, 2010-12-03 at 13:03 +0530, Balbir Singh wrote:
> > * Oleg Nesterov <oleg@redhat.com> [2010-12-01 19:51:28]:
> > 
> > > On 11/29, Michael Holzheu wrote:
> [snip]
> > > Say, taskstats_exit()->fill_stats()->bacct_add_tsk(). Every thread
> > > does this, including the group_leader. But, it is possible that
> > > group_leader exits first, before other threads. IOW, what
> > > stats->ac_cXtime actually mean?
> > > 
> > 
> > stats->ac_* time was designed only for tgid's to begin with,
> 
> You mean for tids (threads/tasks), no? stats->ac_* time is only reported
> for threads in bacct_add_tsk() and not for tgids.

Sorry, I meant tid's or pid

> 
> > so I am
> > not sure if ac_cXtime makes sense for threads
> 
> I would suggest to do it the same way as /proc/<tgid>/tasks/<tid>/stat.
> It reports the (same) cumulative time for each thread. See
> do_task_stat() in fs/proc/array.c. So IMHO also for taskstats it makes
> sense to include the cXtime for all threads and not only for the thread
> group leader. 
> 
> Also I would include tgid in taskstats so that userspace can group the
> tasks into thread groups. 
> 
> I am not sure regarding the aggregation for tgids in
> TASKSTATS_CMD_ATTR_TGID and exit events with group_dead=1. Currently
> only the delay accounting numbers are aggregated. If we did it
> like /proc/<tgid>/stat (do_task_stat() with whole=1), we could also
> aggregate the other values, e.g. the CPU time.
> 
> I think the following tgid data would make sense to be returned for
> TASKSTATS_CMD_ATTR_TGID and exit events with group_dead=1:
> 
> bacct_add_tsk():
> ----------------
> ac_pid         thread group leader (tsk->tgid)
> ac_etime       thread group leader
> ac_btime       thread group leader
> ac_nice        thread group leader
> ac_sched       thread group leader
> ac_uid         thread group leader
> ac_gid         thread group leader
> ac_ppid        thread group leader
> ac_comm        thread group leader
> ac_exit_code   thread group leader
> ac_flags       thread group leader
> ac_utimescaled ??
> ac_stimescaled ??
> ac_utime       sum for all live threads + cdata_threads
> ac_stime       sum for all live threads + cdata_threads
> ac_minflt      sum for all live threads + cdata_threads
> ac_majflt      sum for all live threads + cdata_threads
> 
> new:
> ac_cutime      cdata_wait
> ac_cstime      cdata_wait
> ac_tgid        thread group leader
>

This seems to make sense.
 
> xacct_add_tsk():
> ----------------
> coremem        ?
> virtmem        ?
> hiwater_rss    thread group leader
> hiwater_vm     thread group leader
> read_char      sum for all live threads + tsk->signal.ioac
> write_char     sum for all live threads + tsk->signal.ioac
> read_syscalls  sum for all live threads + tsk->signal.ioac
> write_syscalls sum for all live threads + tsk->signal.ioac
> read_bytes     sum for all live threads + tsk->signal.ioac
> write bytes    sum for all live threads + tsk->signal.ioac
> cancelled_write_bytes sum for all live threads + tsk->signal.ioac
>   
> If we leave everything as it currently is for tgid accounting, we
> probably should also not include the cXtime for tgids and just include
> the data for threads (TASKSTATS_CMD_ATTR_PID and task exit events).
> 
> What do you think?
>

This makes sense to me; we'd need to bump up the API version if we are
going to make this change.

-- 
	Three Cheers,
	Balbir


* Re: [patch v2 4/4] taskstats: Export "cdata_wait" CPU times with taskstats
  2010-12-01 18:51   ` Oleg Nesterov
  2010-12-02 16:34     ` Michael Holzheu
  2010-12-03  7:33     ` Balbir Singh
@ 2010-12-07 10:45     ` Michael Holzheu
  2010-12-08 20:39       ` Oleg Nesterov
  2 siblings, 1 reply; 19+ messages in thread
From: Michael Holzheu @ 2010-12-07 10:45 UTC (permalink / raw)
  To: Oleg Nesterov
  Cc: Shailabh Nagar, Andrew Morton, Peter Zijlstra, John stultz,
	Thomas Gleixner, Balbir Singh, Martin Schwidefsky, Heiko Carstens,
	Roland McGrath, Valdis.Kletnieks, linux-kernel, linux-s390

On Wed, 2010-12-01 at 19:51 +0100, Oleg Nesterov wrote:
> On 11/29, Michael Holzheu wrote:
> >
> > * I left the struct signal locking in the patch, because I think
> >   that in order to get consistent data this is necessary.
> 
> OK, I guess you mean that we want to read utime/stime "atomically",
> and thus we need ->siglock to prevent the race with __exit_signal().

Yes.

But I changed my mind. If I understand the code right, currently no
locking is done for reading task statistics, in both the taskstats
and the procfs interface. So e.g. it is not guaranteed to get a
consistent set of data when reading /proc/<pid>/stat, because of the
race with account_user/system/whatever_time() and other accounting
functions.

I assume that was done intentionally. Or am I missing something?

Because you said that accessing the signal_struct is safe since
2.6.35, I think it is now not necessary to lock anything for cdata
either. As a result of this and the discussion regarding the group
leader and cdata, I would propose the following patch:
---
Subject: taskstats: Export "cdata_wait" CPU times with taskstats

From: Michael Holzheu <holzheu@linux.vnet.ibm.com>

Version 3
---------
* Remove signal struct locking because accessing signal_struct is safe
  since 2.6.35.
* Report cdata for all tasks, not only for the thread group leader.
  This is the same behavior as for /proc/<tgid>/tasks/<tid>/stat.
* Add tgid to taskstats to allow userspace to group threads.

Version 2
---------
* Use thread_group_leader() instead of (tsk->pid == tsk->tgid)
* Use cdata_wait instead of cdata_acct
* I left the struct signal locking in the patch, because I think
  that in order to get consistent data this is necessary. See also
  do_task_stat() in fs/proc/array.c. One problem is that we
  report wrong values (zero) for cdata, if lock_task_sighand()
  fails. This will be the same behavior as for /proc/<pid>/stat.
  But maybe we should somehow return to userspace that the
  information is not correct. Any ideas?

Description
-----------
With this patch the cumulative CPU time and the tgid (thread group ID)
are added to "struct taskstats".

Signed-off-by: Michael Holzheu <holzheu@linux.vnet.ibm.com>
---
 include/linux/taskstats.h |   10 +++++++++-
 kernel/tsacct.c           |    3 +++
 2 files changed, 12 insertions(+), 1 deletion(-)

--- a/include/linux/taskstats.h
+++ b/include/linux/taskstats.h
@@ -33,7 +33,7 @@
  */
 
 
-#define TASKSTATS_VERSION	7
+#define TASKSTATS_VERSION	8
 #define TS_COMM_LEN		32	/* should be >= TASK_COMM_LEN
 					 * in linux/sched.h */
 
@@ -163,6 +163,14 @@ struct taskstats {
 	/* Delay waiting for memory reclaim */
 	__u64	freepages_count;
 	__u64	freepages_delay_total;
+	/* version 7 ends here */
+
+	__u32	ac_tgid;		/* Thread group ID */
+
+	/* Cumulative CPU time of dead children */
+	__u64	ac_cutime;		/* User CPU time [usec] */
+	__u64	ac_cstime;		/* System CPU time [usec] */
+	/* version 8 ends here */
 };
 
 
--- a/kernel/tsacct.c
+++ b/kernel/tsacct.c
@@ -71,6 +71,9 @@ void bacct_add_tsk(struct taskstats *sta
 	stats->ac_majflt = tsk->maj_flt;
 
 	strncpy(stats->ac_comm, tsk->comm, sizeof(stats->ac_comm));
+	stats->ac_cutime = cputime_to_usecs(tsk->signal->cdata_wait.utime);
+	stats->ac_cstime = cputime_to_usecs(tsk->signal->cdata_wait.stime);
+	stats->ac_tgid = tsk->tgid;
 }
 
 




* Re: [patch v2 4/4] taskstats: Export "cdata_wait" CPU times with taskstats
  2010-12-02 16:34     ` Michael Holzheu
  2010-12-06  9:06       ` Balbir Singh
@ 2010-12-08 20:23       ` Oleg Nesterov
  2010-12-10 13:26         ` Michael Holzheu
  1 sibling, 1 reply; 19+ messages in thread
From: Oleg Nesterov @ 2010-12-08 20:23 UTC (permalink / raw)
  To: Michael Holzheu
  Cc: Shailabh Nagar, Andrew Morton, Peter Zijlstra, John stultz,
	Thomas Gleixner, Balbir Singh, Martin Schwidefsky, Heiko Carstens,
	Roland McGrath, Valdis.Kletnieks, linux-kernel, linux-s390

Oh, sorry for the delay. Again ;)

On 12/02, Michael Holzheu wrote:
>
> 1) You have a thread group with several threads
> 2) Thread group leader dies and reports cdata_wait in taskstats_exit()
> 3) Thread group leader stays around as zombie until the thread
>    group dies
> 4) Other forked processes of this thread group die
> 5) cdata_wait of thread group is increased
> 6) The new cdata is not reported by any exit event of the thread group

Yes.

> So maybe we should remove the thread_group_leader() check and report
> cdata_wait for all threads and not only for the thread group leader? We
> also should add ac_tgid to taskstats so that userspace can find the
> corresponding thread group for each thread.

I do not know. My only point was that this thread_group_leader() looks
a bit confusing, at least if we are talking about the do_exit() path.

> When the last thread exits and the process/thread group dies,
> taskstats_exit() sends an additional taskstats struct to userspace that
> aggregates the thread accounting data. Currently only the delay
> accounting data is aggregated (see
> taskstats_exit->fill_tgid_exit->delayacct_add_tsk). Not sure, why the
> other information is not aggregated. We perhaps also should include
> ac_cXtime in the aggregated taskstats.

Not sure I understand... Do you mean

	if (is_thread_group)
		fill_tgid_exit(tsk);

? Afaics, this is not "When the last thread exits", this is
"this program is multithreaded, it has (or had) other threads".

Oleg.



* Re: [patch v2 4/4] taskstats: Export "cdata_wait" CPU times with taskstats
  2010-12-07 10:45     ` Michael Holzheu
@ 2010-12-08 20:39       ` Oleg Nesterov
  0 siblings, 0 replies; 19+ messages in thread
From: Oleg Nesterov @ 2010-12-08 20:39 UTC (permalink / raw)
  To: Michael Holzheu
  Cc: Shailabh Nagar, Andrew Morton, Peter Zijlstra, John stultz,
	Thomas Gleixner, Balbir Singh, Martin Schwidefsky, Heiko Carstens,
	Roland McGrath, Valdis.Kletnieks, linux-kernel, linux-s390

On 12/07, Michael Holzheu wrote:
>
> On Wed, 2010-12-01 at 19:51 +0100, Oleg Nesterov wrote:
> > On 11/29, Michael Holzheu wrote:
> > >
> > > * I left the struct signal locking in the patch, because I think
> > >   that in order to get consistent data this is necessary.
> >
> > OK, I guess you mean that we want to read utime/stime "atomically",
> > and thus we need ->siglock to prevent the race with __exit_signal().
>
> Yes.
>
> But I changed my mind. If I understand the code right, currently no
> locking is done for reading task statistics, in both the taskstats
> and the procfs interface. So e.g. it is not guaranteed to get a
> consistent set of data when reading /proc/<pid>/stat, because of the
> race with account_user/system/whatever_time() and other accounting
> functions.

Not sure, I think account_user/system/whatever_time() is a bit different.
And, say, do_task_stat() takes ->siglock for thread_group_times(). Btw,
I tried to remove this lock_task_sighand(), and I still hope we can
do this ;) But the patch wasn't acked (iiuc) exactly because it can
make top unhappy..

> Because you said that accessing the signal_struct is safe since
> 2.6.35, I think it is now not necessary to lock anything for cdata
> either. As a result of this and the discussion regarding the group
> leader and cdata, I would propose the following patch:

To me, this certainly makes sense. I do not really understand why
it is so important to prevent the case when, say, bacct_add_tsk()
sees the updated .utime and !updated .stime.

If we can race with the exiting tasks, then these numbers are not
"final" anyway.

But again, this all is up to you.

Oleg.



* Re: [patch v2 4/4] taskstats: Export "cdata_wait" CPU times with taskstats
  2010-12-08 20:23       ` Oleg Nesterov
@ 2010-12-10 13:26         ` Michael Holzheu
       [not found]           ` <20101211173931.GA8084@redhat.com>
  0 siblings, 1 reply; 19+ messages in thread
From: Michael Holzheu @ 2010-12-10 13:26 UTC (permalink / raw)
  To: Oleg Nesterov
  Cc: Shailabh Nagar, Andrew Morton, Peter Zijlstra, John stultz,
	Thomas Gleixner, Balbir Singh, Martin Schwidefsky, Heiko Carstens,
	Roland McGrath, Valdis.Kletnieks, linux-kernel, linux-s390

Hello Oleg,

On Wed, 2010-12-08 at 21:23 +0100, Oleg Nesterov wrote:
> Oh, sorry for the delay. Again ;)

No problem, I know this can be exhausting ;-)

> On 12/02, Michael Holzheu wrote:
> > When the last thread exits and the process/thread group dies,
> > taskstats_exit() sends an additional taskstats struct to userspace that
> > aggregates the thread accounting data. Currently only the delay
> > accounting data is aggregated (see
> > taskstats_exit->fill_tgid_exit->delayacct_add_tsk). Not sure why the
> > other information is not aggregated. We perhaps also should include
> > ac_cXtime in the aggregated taskstats.
> 
> Not sure I understand... Do you mean
> 
> 	if (is_thread_group)
> 		fill_tgid_exit(tsk);
> 
> ? Afaics, this is not "When the last thread exits", this is
> "this program is multithreaded, it has (or had) other threads".

fill_tgid_exit() adds the data of the dying thread to the thread group
data (tsk->signal->stats). Currently only for delay accounting.

But the accumulated data is sent to userspace only after the last thread
has died:

        if (!is_thread_group || >>>!group_dead<<<)
                goto send;

        stats = mk_reply(rep_skb, TASKSTATS_TYPE_TGID, tsk->tgid);
        if (!stats)
                goto err;

Michael


^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [patch v2 4/4] taskstats: Export "cdata_wait" CPU times with taskstats
       [not found]           ` <20101211173931.GA8084@redhat.com>
@ 2010-12-13 13:05             ` Michael Holzheu
  2010-12-13 13:20               ` Michael Holzheu
  0 siblings, 1 reply; 19+ messages in thread
From: Michael Holzheu @ 2010-12-13 13:05 UTC (permalink / raw)
  To: Oleg Nesterov
  Cc: Shailabh Nagar, Andrew Morton, Peter Zijlstra, John stultz,
	Thomas Gleixner, Balbir Singh, Martin Schwidefsky, Heiko Carstens,
	Roland McGrath, Valdis.Kletnieks, linux-kernel, linux-s390

Hello Oleg,

On Sat, 2010-12-11 at 18:39 +0100, Oleg Nesterov wrote:
> On 12/10, Michael Holzheu wrote:
> >
> > > > When the last thread exits and the process/thread group dies,
> > > > taskstats_exit() sends an additional taskstats struct to userspace that
> > > > aggregates the thread accounting data. Currently only the delay
> > > > accounting data is aggregated (see
> > > > taskstats_exit->fill_tgid_exit->delayacct_add_tsk). Not sure why the
> > > > other information is not aggregated. We perhaps also should include
> > > > ac_cXtime in the aggregated taskstats.
> > >
> > > Not sure I understand... Do you mean
> > >
> > > 	if (is_thread_group)
> > > 		fill_tgid_exit(tsk);
> > >
> > > ? Afaics, this is not "When the last thread exits", this is
> > > "this program is multithreaded, it has (or had) other threads".
> >
> > fill_tgid_exit() adds the data of the dying thread to the thread group
> > data (tsk->signal->stats). Currently only for delay accounting.
> >
> > But the accumulated data is sent to userspace only after the last thread
> > has died:
> >
> >         if (!is_thread_group || >>>!group_dead<<<)
> >                 goto send;
> 
> Ah, right you are.
> 
> And this looks racy, or I missed something again. group_dead can be
> true, but this doesn't mean all other threads have already passed
> taskstats_exit()->fill_tgid_exit()->delayacct_add_tsk().

I think you are right.

One way to fix that could be to separate the aggregation from the
sending. We could call fill_tgid_exit()->delayacct_add_tsk() before
atomic_dec_and_test(&tsk->signal->live) in do_exit(), and call
taskstats_exit() with the sender part afterwards.
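
Roughly this ordering in do_exit() (just a sketch; the function names
here are what I would propose, see the follow-up mail for an actual
patch):

	/* aggregate this thread's stats into tsk->signal->stats first */
	taskstats_exit_add_thread(tsk);
	/* only then find out whether the whole group is gone */
	group_dead = atomic_dec_and_test(&tsk->signal->live);
	...
	/* the netlink send happens after all aggregation for tsk */
	taskstats_exit_notify(tsk, group_dead);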

> And: why does "if (is_thread_group)" do "size *= 2" unconditionally?
> IOW, perhaps the patch below makes sense?

Yes, I think the patch makes sense. Balbir?

> 
> Oleg.
> 
> --- x/kernel/taskstats.c
> +++ x/kernel/taskstats.c
> @@ -576,7 +576,8 @@ void taskstats_exit(struct task_struct *
>  	is_thread_group = !!taskstats_tgid_alloc(tsk);
>  	if (is_thread_group) {
>  		/* PID + STATS + TGID + STATS */
> -		size = 2 * size;
> +		if (group_dead)
> +			size *= 2;
>  		/* fill the tsk->signal->stats structure */
>  		fill_tgid_exit(tsk);
>  	}
> 



^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [patch v2 4/4] taskstats: Export "cdata_wait" CPU times with taskstats
  2010-12-13 13:05             ` Michael Holzheu
@ 2010-12-13 13:20               ` Michael Holzheu
  2010-12-13 14:33                 ` Oleg Nesterov
  0 siblings, 1 reply; 19+ messages in thread
From: Michael Holzheu @ 2010-12-13 13:20 UTC (permalink / raw)
  To: Oleg Nesterov
  Cc: Shailabh Nagar, Andrew Morton, Peter Zijlstra, John stultz,
	Thomas Gleixner, Balbir Singh, Martin Schwidefsky, Heiko Carstens,
	Roland McGrath, Valdis.Kletnieks, linux-kernel, linux-s390

On Mon, 2010-12-13 at 14:05 +0100, Michael Holzheu wrote:
> > And this looks racy, or I missed something again. group_dead can be
> > true, but this doesn't mean all other threads have already passed
> > taskstats_exit()->fill_tgid_exit()->delayacct_add_tsk().
> 
> I think you are right.
> 
> One way to fix that could be to separate the aggregation from the
> sending. We could call fill_tgid_exit()->delayacct_add_tsk() before
> > atomic_dec_and_test(&tsk->signal->live) in do_exit(), and call
> > taskstats_exit() with the sender part afterwards.

Something like the following patch...
---
 include/linux/taskstats_kern.h |    3 +
 kernel/exit.c                  |    3 +
 kernel/taskstats.c             |   62 +++++++++++++++++------------------------
 3 files changed, 31 insertions(+), 37 deletions(-)

--- a/include/linux/taskstats_kern.h
+++ b/include/linux/taskstats_kern.h
@@ -21,7 +21,8 @@ static inline void taskstats_tgid_free(s
 		kmem_cache_free(taskstats_cache, sig->stats);
 }
 
-extern void taskstats_exit(struct task_struct *, int group_dead);
+extern void taskstats_exit_notify(struct task_struct *, int group_dead);
+extern void taskstats_exit_add_thread(struct task_struct *);
 extern void taskstats_init_early(void);
 #else
 static inline void taskstats_exit(struct task_struct *tsk, int group_dead)
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -1030,6 +1030,7 @@ NORET_TYPE void do_exit(long code)
 	/* sync mm's RSS info before statistics gathering */
 	if (tsk->mm)
 		sync_mm_rss(tsk, tsk->mm);
+	taskstats_exit_add_thread(tsk);
 	group_dead = atomic_dec_and_test(&tsk->signal->live);
 	if (group_dead) {
 		struct cdata *tcd = &tsk->signal->cdata_threads;
@@ -1045,7 +1046,7 @@ NORET_TYPE void do_exit(long code)
 		audit_free(tsk);
 
 	tsk->exit_code = code;
-	taskstats_exit(tsk, group_dead);
+	taskstats_exit_notify(tsk, group_dead);
 
 	exit_mm(tsk);
 
--- a/kernel/taskstats.c
+++ b/kernel/taskstats.c
@@ -263,10 +263,32 @@ out:
 	return rc;
 }
 
-static void fill_tgid_exit(struct task_struct *tsk)
+static void alloc_signal_stats(struct task_struct *tsk)
+{
+	struct signal_struct *sig = tsk->signal;
+	struct taskstats *stats;
+
+	/* No problem if kmem_cache_zalloc() fails */
+	stats = kmem_cache_zalloc(taskstats_cache, GFP_KERNEL);
+
+	spin_lock_irq(&tsk->sighand->siglock);
+	if (!sig->stats) {
+		sig->stats = stats;
+		stats = NULL;
+	}
+	spin_unlock_irq(&tsk->sighand->siglock);
+
+	if (stats)
+		kmem_cache_free(taskstats_cache, stats);
+}
+
+void taskstats_exit_add_thread(struct task_struct *tsk)
 {
 	unsigned long flags;
 
+	if (tsk->signal->stats == NULL && !thread_group_empty(tsk))
+		alloc_signal_stats(tsk);
+
 	spin_lock_irqsave(&tsk->sighand->siglock, flags);
 	if (!tsk->signal->stats)
 		goto ret;
@@ -530,39 +552,14 @@ static int taskstats_user_cmd(struct sk_
 		return -EINVAL;
 }
 
-static struct taskstats *taskstats_tgid_alloc(struct task_struct *tsk)
-{
-	struct signal_struct *sig = tsk->signal;
-	struct taskstats *stats;
-
-	if (sig->stats || thread_group_empty(tsk))
-		goto ret;
-
-	/* No problem if kmem_cache_zalloc() fails */
-	stats = kmem_cache_zalloc(taskstats_cache, GFP_KERNEL);
-
-	spin_lock_irq(&tsk->sighand->siglock);
-	if (!sig->stats) {
-		sig->stats = stats;
-		stats = NULL;
-	}
-	spin_unlock_irq(&tsk->sighand->siglock);
-
-	if (stats)
-		kmem_cache_free(taskstats_cache, stats);
-ret:
-	return sig->stats;
-}
-
 /* Send pid data out on exit */
-void taskstats_exit(struct task_struct *tsk, int group_dead)
+void taskstats_exit_notify(struct task_struct *tsk, int group_dead)
 {
 	int rc;
 	struct listener_list *listeners;
 	struct taskstats *stats;
 	struct sk_buff *rep_skb;
 	size_t size;
-	int is_thread_group;
 
 	if (!family_registered)
 		return;
@@ -573,13 +570,8 @@ void taskstats_exit(struct task_struct *
 	size = nla_total_size(sizeof(u32)) +
 		nla_total_size(sizeof(struct taskstats)) + nla_total_size(0);
 
-	is_thread_group = !!taskstats_tgid_alloc(tsk);
-	if (is_thread_group) {
-		/* PID + STATS + TGID + STATS */
-		size = 2 * size;
-		/* fill the tsk->signal->stats structure */
-		fill_tgid_exit(tsk);
-	}
+	if (group_dead && tsk->signal->stats)
+		size = 2 * size; /* PID + STATS + TGID + STATS */
 
 	listeners = &__raw_get_cpu_var(listener_array);
 	if (list_empty(&listeners->list))
@@ -598,7 +590,7 @@ void taskstats_exit(struct task_struct *
 	/*
 	 * Doesn't matter if tsk is the leader or the last group member leaving
 	 */
-	if (!is_thread_group || !group_dead)
+	if (!group_dead || tsk->signal->stats == NULL)
 		goto send;
 
 	stats = mk_reply(rep_skb, TASKSTATS_TYPE_TGID, tsk->tgid);




^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [patch v2 4/4] taskstats: Export "cdata_wait" CPU times with taskstats
  2010-12-13 13:20               ` Michael Holzheu
@ 2010-12-13 14:33                 ` Oleg Nesterov
  2010-12-13 16:42                   ` Michael Holzheu
  0 siblings, 1 reply; 19+ messages in thread
From: Oleg Nesterov @ 2010-12-13 14:33 UTC (permalink / raw)
  To: Michael Holzheu
  Cc: Shailabh Nagar, Andrew Morton, Peter Zijlstra, John stultz,
	Thomas Gleixner, Balbir Singh, Martin Schwidefsky, Heiko Carstens,
	Roland McGrath, Valdis.Kletnieks, linux-kernel, linux-s390

On 12/13, Michael Holzheu wrote:
>
> On Mon, 2010-12-13 at 14:05 +0100, Michael Holzheu wrote:
> > > And this looks racy, or I missed something again. group_dead can be
> > > true, but this doesn't mean all other threads have already passed
> > > taskstats_exit()->fill_tgid_exit()->delayacct_add_tsk().
> >
> > I think you are right.
> >
> > One way to fix that could be to separate the aggregation from the
> > sending. We could call fill_tgid_exit()->delayacct_add_tsk() before
> > > atomic_dec_and_test(&tsk->signal->live) in do_exit(), and call
> > > taskstats_exit() with the sender part afterwards.

Yes, I think this should fix the race. Some nits below...

> --- a/include/linux/taskstats_kern.h
> +++ b/include/linux/taskstats_kern.h
> @@ -21,7 +21,8 @@ static inline void taskstats_tgid_free(s
>  		kmem_cache_free(taskstats_cache, sig->stats);
>  }
>  
> -extern void taskstats_exit(struct task_struct *, int group_dead);
> +extern void taskstats_exit_notify(struct task_struct *, int group_dead);
> +extern void taskstats_exit_add_thread(struct task_struct *);

You forgot to update the !CONFIG_TASKSTATS case ;)

> -static void fill_tgid_exit(struct task_struct *tsk)
> +static void alloc_signal_stats(struct task_struct *tsk)
> +{
> +	struct signal_struct *sig = tsk->signal;
> +	struct taskstats *stats;
> +
> +	/* No problem if kmem_cache_zalloc() fails */
> +	stats = kmem_cache_zalloc(taskstats_cache, GFP_KERNEL);
> +
> +	spin_lock_irq(&tsk->sighand->siglock);
> +	if (!sig->stats) {
> +		sig->stats = stats;
> +		stats = NULL;
> +	}
> +	spin_unlock_irq(&tsk->sighand->siglock);
> +
> +	if (stats)
> +		kmem_cache_free(taskstats_cache, stats);
> +}
> +
> +void taskstats_exit_add_thread(struct task_struct *tsk)
>  {
>  	unsigned long flags;
>  
> +	if (tsk->signal->stats == NULL && !thread_group_empty(tsk))
> +		alloc_signal_stats(tsk);
> +
>  	spin_lock_irqsave(&tsk->sighand->siglock, flags);
>  	if (!tsk->signal->stats)
>  		goto ret;

Well. I do not like the fact we take ->siglock unconditionally.
And _irqsave is not needed. And we take it twice if sig->stats == NULL.
And "if (!tsk->signal->stats)" under ->siglock in
taskstats_exit_add_thread() looks a bit ugly...

How about

	void taskstats_exit_add_thread(struct task_struct *tsk)
	{
		struct taskstats *stats = NULL;

		if (!tsk->signal->stats) {
			if (thread_group_empty(tsk))
				return;

			stats = kmem_cache_zalloc(taskstats_cache, GFP_KERNEL);
			if (!stats)
				return;
		}

		spin_lock_irq(&tsk->sighand->siglock);
		if (!tsk->signal->stats) {
			tsk->signal->stats = stats;
			stats = NULL;
		}
		/*
		 * Each accounting subsystem calls its functions here to
		 * accumulate its per-task stats for tsk into the per-tgid structure
		 *
		 *	per-task-foo(tsk->signal->stats, tsk);
		 */
		delayacct_add_tsk(tsk->signal->stats, tsk);
		spin_unlock_irq(&tsk->sighand->siglock);

		if (unlikely(stats))
			kmem_cache_free(taskstats_cache, stats);
	}

?

Note that it absorbs alloc_signal_stats().

But up to you; probably this is a matter of taste.

Oleg.


^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [patch v2 4/4] taskstats: Export "cdata_wait" CPU times with taskstats
  2010-12-13 14:33                 ` Oleg Nesterov
@ 2010-12-13 16:42                   ` Michael Holzheu
  0 siblings, 0 replies; 19+ messages in thread
From: Michael Holzheu @ 2010-12-13 16:42 UTC (permalink / raw)
  To: Oleg Nesterov
  Cc: Shailabh Nagar, Andrew Morton, Peter Zijlstra, John stultz,
	Thomas Gleixner, Balbir Singh, Martin Schwidefsky, Heiko Carstens,
	Roland McGrath, Valdis.Kletnieks, linux-kernel, linux-s390

On Mon, 2010-12-13 at 15:33 +0100, Oleg Nesterov wrote:
> > --- a/include/linux/taskstats_kern.h
> > +++ b/include/linux/taskstats_kern.h
> > @@ -21,7 +21,8 @@ static inline void taskstats_tgid_free(s
> >  		kmem_cache_free(taskstats_cache, sig->stats);
> >  }
> >  
> > -extern void taskstats_exit(struct task_struct *, int group_dead);
> > +extern void taskstats_exit_notify(struct task_struct *, int group_dead);
> > +extern void taskstats_exit_add_thread(struct task_struct *);
> You forgot to update the !CONFIG_TASKSTATS case ;)

... of course, thanks!

[snip]

> Well. I do not like the fact we take ->siglock unconditionally.
> And _irqsave is not needed. And we take it twice if sig->stats == NULL.
> And "if (!tsk->signal->stats)" under ->siglock in
> taskstats_exit_add_thread() looks a bit ugly...

Yes, I also saw that, but I didn't want to change too much. In fact your
code is much better. I added it to the patch and also replaced
_add_thread with _add_task, because "task" seems to be the term used for
a thread in the taskstats code. I also did some testing; everything
still seems to work well.

The new patch is attached below. Balbir, do you agree?

Michael
---
 include/linux/taskstats_kern.h |    8 +++-
 kernel/exit.c                  |    3 +
 kernel/taskstats.c             |   67 ++++++++++++++---------------------------
 3 files changed, 32 insertions(+), 46 deletions(-)

--- a/include/linux/taskstats_kern.h
+++ b/include/linux/taskstats_kern.h
@@ -21,10 +21,14 @@ static inline void taskstats_tgid_free(s
 		kmem_cache_free(taskstats_cache, sig->stats);
 }
 
-extern void taskstats_exit(struct task_struct *, int group_dead);
+extern void taskstats_exit_notify(struct task_struct *tsk, int group_dead);
+extern void taskstats_exit_add_task(struct task_struct *tsk);
 extern void taskstats_init_early(void);
 #else
-static inline void taskstats_exit(struct task_struct *tsk, int group_dead)
+static inline void taskstats_exit_notify(struct task_struct *tsk,
+					 int group_dead)
+{}
+static inline void taskstats_exit_add_task(struct task_struct *tsk)
 {}
 static inline void taskstats_tgid_free(struct signal_struct *sig)
 {}
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -1030,6 +1030,7 @@ NORET_TYPE void do_exit(long code)
 	/* sync mm's RSS info before statistics gathering */
 	if (tsk->mm)
 		sync_mm_rss(tsk, tsk->mm);
+	taskstats_exit_add_task(tsk);
 	group_dead = atomic_dec_and_test(&tsk->signal->live);
 	if (group_dead) {
 		struct cdata *tcd = &tsk->signal->cdata_threads;
@@ -1045,7 +1046,7 @@ NORET_TYPE void do_exit(long code)
 		audit_free(tsk);
 
 	tsk->exit_code = code;
-	taskstats_exit(tsk, group_dead);
+	taskstats_exit_notify(tsk, group_dead);
 
 	exit_mm(tsk);
 
--- a/kernel/taskstats.c
+++ b/kernel/taskstats.c
@@ -263,24 +263,35 @@ out:
 	return rc;
 }
 
-static void fill_tgid_exit(struct task_struct *tsk)
+void taskstats_exit_add_task(struct task_struct *tsk)
 {
-	unsigned long flags;
+	struct taskstats *stats = NULL;
 
-	spin_lock_irqsave(&tsk->sighand->siglock, flags);
-	if (!tsk->signal->stats)
-		goto ret;
+	if (!tsk->signal->stats) {
+		if (thread_group_empty(tsk))
+			return;
 
+		stats = kmem_cache_zalloc(taskstats_cache, GFP_KERNEL);
+		if (!stats)
+			return; /* Bad luck, we will lose this task */
+	}
+
+	spin_lock_irq(&tsk->sighand->siglock);
+	if (!tsk->signal->stats) {
+		tsk->signal->stats = stats;
+		stats = NULL;
+	}
 	/*
 	 * Each accounting subsystem calls its functions here to
 	 * accumalate its per-task stats for tsk, into the per-tgid structure
 	 *
-	 *	per-task-foo(tsk->signal->stats, tsk);
+	 *      per-task-foo(tsk->signal->stats, tsk);
 	 */
 	delayacct_add_tsk(tsk->signal->stats, tsk);
-ret:
-	spin_unlock_irqrestore(&tsk->sighand->siglock, flags);
-	return;
+	spin_unlock_irq(&tsk->sighand->siglock);
+
+	if (unlikely(stats))
+		kmem_cache_free(taskstats_cache, stats);
 }
 
 static int add_del_listener(pid_t pid, const struct cpumask *mask, int isadd)
@@ -530,39 +541,14 @@ static int taskstats_user_cmd(struct sk_
 		return -EINVAL;
 }
 
-static struct taskstats *taskstats_tgid_alloc(struct task_struct *tsk)
-{
-	struct signal_struct *sig = tsk->signal;
-	struct taskstats *stats;
-
-	if (sig->stats || thread_group_empty(tsk))
-		goto ret;
-
-	/* No problem if kmem_cache_zalloc() fails */
-	stats = kmem_cache_zalloc(taskstats_cache, GFP_KERNEL);
-
-	spin_lock_irq(&tsk->sighand->siglock);
-	if (!sig->stats) {
-		sig->stats = stats;
-		stats = NULL;
-	}
-	spin_unlock_irq(&tsk->sighand->siglock);
-
-	if (stats)
-		kmem_cache_free(taskstats_cache, stats);
-ret:
-	return sig->stats;
-}
-
 /* Send pid data out on exit */
-void taskstats_exit(struct task_struct *tsk, int group_dead)
+void taskstats_exit_notify(struct task_struct *tsk, int group_dead)
 {
 	int rc;
 	struct listener_list *listeners;
 	struct taskstats *stats;
 	struct sk_buff *rep_skb;
 	size_t size;
-	int is_thread_group;
 
 	if (!family_registered)
 		return;
@@ -573,13 +559,8 @@ void taskstats_exit(struct task_struct *
 	size = nla_total_size(sizeof(u32)) +
 		nla_total_size(sizeof(struct taskstats)) + nla_total_size(0);
 
-	is_thread_group = !!taskstats_tgid_alloc(tsk);
-	if (is_thread_group) {
-		/* PID + STATS + TGID + STATS */
-		size = 2 * size;
-		/* fill the tsk->signal->stats structure */
-		fill_tgid_exit(tsk);
-	}
+	if (group_dead && tsk->signal->stats)
+		size = 2 * size; /* PID + STATS + TGID + STATS */
 
 	listeners = &__raw_get_cpu_var(listener_array);
 	if (list_empty(&listeners->list))
@@ -598,7 +579,7 @@ void taskstats_exit(struct task_struct *
 	/*
 	 * Doesn't matter if tsk is the leader or the last group member leaving
 	 */
-	if (!is_thread_group || !group_dead)
+	if (!group_dead || tsk->signal->stats == NULL)
 		goto send;
 
 	stats = mk_reply(rep_skb, TASKSTATS_TYPE_TGID, tsk->tgid);



^ permalink raw reply	[flat|nested] 19+ messages in thread

end of thread, other threads:[~2010-12-13 16:42 UTC | newest]

Thread overview: 19+ messages
2010-11-29 16:42 [patch v2 0/4] taskstats: Improve cumulative time accounting Michael Holzheu
2010-11-29 16:42 ` [patch v2 1/4] taskstats: Introduce "struct cdata" Michael Holzheu
2010-11-29 16:42 ` [patch v2 2/4] taskstats: Introduce __account_cdata() function Michael Holzheu
2010-11-29 16:42 ` [patch v2 3/4] taskstats: Introduce kernel.full_cdata sysctl Michael Holzheu
2010-11-29 16:42 ` [patch v2 4/4] taskstats: Export "cdata_wait" CPU times with taskstats Michael Holzheu
2010-12-01 18:51   ` Oleg Nesterov
2010-12-02 16:34     ` Michael Holzheu
2010-12-06  9:06       ` Balbir Singh
2010-12-08 20:23       ` Oleg Nesterov
2010-12-10 13:26         ` Michael Holzheu
     [not found]           ` <20101211173931.GA8084@redhat.com>
2010-12-13 13:05             ` Michael Holzheu
2010-12-13 13:20               ` Michael Holzheu
2010-12-13 14:33                 ` Oleg Nesterov
2010-12-13 16:42                   ` Michael Holzheu
2010-12-03  7:33     ` Balbir Singh
2010-12-06 12:37       ` Michael Holzheu
2010-12-06 15:15         ` Balbir Singh
2010-12-07 10:45     ` Michael Holzheu
2010-12-08 20:39       ` Oleg Nesterov
