public inbox for linux-kernel@vger.kernel.org
 help / color / mirror / Atom feed
* [Patch 1/8] Setup
@ 2006-05-02  6:12 Balbir Singh
  2006-05-08 21:17 ` Andrew Morton
                   ` (2 more replies)
  0 siblings, 3 replies; 18+ messages in thread
From: Balbir Singh @ 2006-05-02  6:12 UTC (permalink / raw)
  To: linux-kernel; +Cc: lse-tech, jlan


Changelog

Fixes comments by akpm
- unnecessary initialization of delayacct_on
- use kmem_cache_zalloc
- redundant check in __delayacct_tsk_exit

Other changes
- do not mix tabs and spaces

delayacct-setup.patch

Initialization code related to collection of per-task "delay"
statistics which measure how long it had to wait for cpu,
sync block io, swapping etc. The collection of statistics and
the interface are in other patches. This patch sets up the data
structures and allows the statistics collection to be disabled
through a  kernel boot paramater.

Signed-off-by: Shailabh Nagar <nagar@watson.ibm.com>
Signed-off-by: Balbir Singh <balbir@in.ibm.com>
---

 Documentation/kernel-parameters.txt |    2 
 include/linux/delayacct.h           |   69 ++++++++++++++++++++++++++++
 include/linux/sched.h               |   20 ++++++++
 include/linux/time.h                |   10 ++++
 init/Kconfig                        |   10 ++++
 init/main.c                         |    2 
 kernel/Makefile                     |    1 
 kernel/delayacct.c                  |   87 ++++++++++++++++++++++++++++++++++++
 kernel/exit.c                       |    2 
 kernel/fork.c                       |    2 
 10 files changed, 205 insertions(+)

diff -puN Documentation/kernel-parameters.txt~delayacct-setup Documentation/kernel-parameters.txt
--- linux-2.6.17-rc3/Documentation/kernel-parameters.txt~delayacct-setup	2006-04-28 23:47:55.000000000 +0530
+++ linux-2.6.17-rc3-balbir/Documentation/kernel-parameters.txt	2006-04-28 23:47:55.000000000 +0530
@@ -430,6 +430,8 @@ running once the system is up.
 			Format: <area>[,<node>]
 			See also Documentation/networking/decnet.txt.
 
+	delayacct	[KNL] Enable per-task delay accounting
+
 	devfs=		[DEVFS]
 			See Documentation/filesystems/devfs/boot-options.
 
diff -puN /dev/null include/linux/delayacct.h
--- /dev/null	2004-06-24 23:34:38.000000000 +0530
+++ linux-2.6.17-rc3-balbir/include/linux/delayacct.h	2006-05-02 09:44:48.000000000 +0530
@@ -0,0 +1,69 @@
+/* delayacct.h - per-task delay accounting
+ *
+ * Copyright (C) Shailabh Nagar, IBM Corp. 2006
+ *
+ * This program is free software;  you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY;  without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See
+ * the GNU General Public License for more details.
+ *
+ */
+
+#ifndef _LINUX_DELAYACCT_H
+#define _LINUX_DELAYACCT_H
+
+#include <linux/sched.h>
+
+#ifdef CONFIG_TASK_DELAY_ACCT
+
+extern int delayacct_on;	/* Delay accounting turned on/off */
+extern kmem_cache_t *delayacct_cache;
+extern void delayacct_init(void);
+extern void __delayacct_tsk_init(struct task_struct *);
+extern void __delayacct_tsk_exit(struct task_struct *);
+
+static inline void delayacct_set_flag(int flag)
+{
+	if (current->delays)
+		current->delays->flags |= flag;
+}
+
+static inline void delayacct_clear_flag(int flag)
+{
+	if (current->delays)
+		current->delays->flags &= ~flag;
+}
+
+static inline void delayacct_tsk_init(struct task_struct *tsk)
+{
+	/* reinitialize in case parent's non-null pointer was dup'ed*/
+	tsk->delays = NULL;
+	if (unlikely(delayacct_on))
+		__delayacct_tsk_init(tsk);
+}
+
+static inline void delayacct_tsk_exit(struct task_struct *tsk)
+{
+	if (tsk->delays)
+		__delayacct_tsk_exit(tsk);
+}
+
+#else
+static inline void delayacct_set_flag(int flag)
+{}
+static inline void delayacct_clear_flag(int flag)
+{}
+static inline void delayacct_init(void)
+{}
+static inline void delayacct_tsk_init(struct task_struct *tsk)
+{}
+static inline void delayacct_tsk_exit(struct task_struct *tsk)
+{}
+#endif /* CONFIG_TASK_DELAY_ACCT */
+
+#endif
diff -puN include/linux/sched.h~delayacct-setup include/linux/sched.h
--- linux-2.6.17-rc3/include/linux/sched.h~delayacct-setup	2006-04-28 23:47:55.000000000 +0530
+++ linux-2.6.17-rc3-balbir/include/linux/sched.h	2006-05-02 09:44:48.000000000 +0530
@@ -536,6 +536,23 @@ struct sched_info {
 extern struct file_operations proc_schedstat_operations;
 #endif
 
+#ifdef CONFIG_TASK_DELAY_ACCT
+struct task_delay_info {
+	spinlock_t	lock;
+	unsigned int	flags;	/* Private per-task flags */
+
+	/* For each stat XXX, add following, aligned appropriately
+	 *
+	 * struct timespec XXX_start, XXX_end;
+	 * u64 XXX_delay;
+	 * u32 XXX_count;
+	 *
+	 * Atomicity of updates to XXX_delay, XXX_count protected by
+	 * single lock above (split into XXX_lock if contention is an issue).
+	 */
+};
+#endif
+
 enum idle_type
 {
 	SCHED_IDLE,
@@ -888,6 +905,9 @@ struct task_struct {
 	 * cache last used pipe for splice
 	 */
 	struct pipe_inode_info *splice_pipe;
+#ifdef	CONFIG_TASK_DELAY_ACCT
+	struct task_delay_info *delays;
+#endif
 };
 
 static inline pid_t process_group(struct task_struct *tsk)
diff -puN include/linux/time.h~delayacct-setup include/linux/time.h
--- linux-2.6.17-rc3/include/linux/time.h~delayacct-setup	2006-04-28 23:47:55.000000000 +0530
+++ linux-2.6.17-rc3-balbir/include/linux/time.h	2006-04-28 23:47:55.000000000 +0530
@@ -68,6 +68,16 @@ extern unsigned long mktime(const unsign
 extern void set_normalized_timespec(struct timespec *ts, time_t sec, long nsec);
 
 /*
+ * sub = end - start, in normalized form
+ */
+static inline void timespec_sub(struct timespec *start, struct timespec *end,
+				struct timespec *sub)
+{
+	set_normalized_timespec(sub, end->tv_sec - start->tv_sec,
+				end->tv_nsec - start->tv_nsec);
+}
+
+/*
  * Returns true if the timespec is norm, false if denorm:
  */
 #define timespec_valid(ts) \
diff -puN init/Kconfig~delayacct-setup init/Kconfig
--- linux-2.6.17-rc3/init/Kconfig~delayacct-setup	2006-04-28 23:47:55.000000000 +0530
+++ linux-2.6.17-rc3-balbir/init/Kconfig	2006-05-02 09:44:40.000000000 +0530
@@ -150,6 +150,16 @@ config BSD_PROCESS_ACCT_V3
 	  for processing it. A preliminary version of these tools is available
 	  at <http://www.physik3.uni-rostock.de/tim/kernel/utils/acct/>.
 
+config TASK_DELAY_ACCT
+	bool "Enable per-task delay accounting (EXPERIMENTAL)"
+	help
+	  Collect information on time spent by a task waiting for system
+	  resources like cpu, synchronous block I/O completion and swapping
+	  in pages. Such statistics can help in setting a task's priorities
+	  relative to other tasks for cpu, io, rss limits etc.
+
+	  Say N if unsure.
+
 config SYSCTL
 	bool "Sysctl support"
 	---help---
diff -puN init/main.c~delayacct-setup init/main.c
--- linux-2.6.17-rc3/init/main.c~delayacct-setup	2006-04-28 23:47:55.000000000 +0530
+++ linux-2.6.17-rc3-balbir/init/main.c	2006-05-02 09:44:21.000000000 +0530
@@ -47,6 +47,7 @@
 #include <linux/rmap.h>
 #include <linux/mempolicy.h>
 #include <linux/key.h>
+#include <linux/delayacct.h>
 
 #include <asm/io.h>
 #include <asm/bugs.h>
@@ -541,6 +542,7 @@ asmlinkage void __init start_kernel(void
 	proc_root_init();
 #endif
 	cpuset_init();
+	delayacct_init();
 
 	check_bugs();
 
diff -puN /dev/null kernel/delayacct.c
--- /dev/null	2004-06-24 23:34:38.000000000 +0530
+++ linux-2.6.17-rc3-balbir/kernel/delayacct.c	2006-05-02 09:46:26.000000000 +0530
@@ -0,0 +1,87 @@
+/* delayacct.c - per-task delay accounting
+ *
+ * Copyright (C) Shailabh Nagar, IBM Corp. 2006
+ *
+ * This program is free software;  you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
+ * the GNU General Public License for more details.
+ */
+
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/time.h>
+#include <linux/sysctl.h>
+#include <linux/delayacct.h>
+
+int delayacct_on __read_mostly;	/* Delay accounting turned on/off */
+kmem_cache_t *delayacct_cache;
+
+static int __init delayacct_setup_enable(char *str)
+{
+	delayacct_on = 1;
+	return 1;
+}
+__setup("delayacct", delayacct_setup_enable);
+
+void delayacct_init(void)
+{
+	delayacct_cache = kmem_cache_create("delayacct_cache",
+					sizeof(struct task_delay_info),
+					0,
+					SLAB_PANIC,
+					NULL, NULL);
+	delayacct_tsk_init(&init_task);
+}
+
+void __delayacct_tsk_init(struct task_struct *tsk)
+{
+	tsk->delays = kmem_cache_zalloc(delayacct_cache, SLAB_KERNEL);
+	if (tsk->delays)
+		spin_lock_init(&tsk->delays->lock);
+}
+
+void __delayacct_tsk_exit(struct task_struct *tsk)
+{
+	kmem_cache_free(delayacct_cache, tsk->delays);
+	tsk->delays = NULL;
+}
+
+/*
+ * Start accounting for a delay statistic using
+ * its starting timestamp (@start)
+ */
+
+static inline void delayacct_start(struct timespec *start)
+{
+	do_posix_clock_monotonic_gettime(start);
+}
+
+/*
+ * Finish delay accounting for a statistic using
+ * its timestamps (@start, @end), accumalator (@total) and @count
+ */
+
+static inline void delayacct_end(struct timespec *start, struct timespec *end,
+				u64 *total, u32 *count)
+{
+	struct timespec ts = {0, 0};
+	s64 ns;
+
+	do_posix_clock_monotonic_gettime(end);
+	timespec_sub(&ts, start, end);
+	ns = timespec_to_ns(&ts);
+	if (ns < 0)
+		return;
+
+	spin_lock(&current->delays->lock);
+	*total += ns;
+	(*count)++;
+	spin_unlock(&current->delays->lock);
+}
+
diff -puN kernel/exit.c~delayacct-setup kernel/exit.c
--- linux-2.6.17-rc3/kernel/exit.c~delayacct-setup	2006-04-28 23:47:55.000000000 +0530
+++ linux-2.6.17-rc3-balbir/kernel/exit.c	2006-05-02 09:44:21.000000000 +0530
@@ -35,6 +35,7 @@
 #include <linux/futex.h>
 #include <linux/compat.h>
 #include <linux/pipe_fs_i.h>
+#include <linux/delayacct.h>
 
 #include <asm/uaccess.h>
 #include <asm/unistd.h>
@@ -910,6 +911,7 @@ fastcall NORET_TYPE void do_exit(long co
 	if (unlikely(tsk->compat_robust_list))
 		compat_exit_robust_list(tsk);
 #endif
+	delayacct_tsk_exit(tsk);
 	exit_mm(tsk);
 
 	exit_sem(tsk);
diff -puN kernel/fork.c~delayacct-setup kernel/fork.c
--- linux-2.6.17-rc3/kernel/fork.c~delayacct-setup	2006-04-28 23:47:55.000000000 +0530
+++ linux-2.6.17-rc3-balbir/kernel/fork.c	2006-04-28 23:47:55.000000000 +0530
@@ -44,6 +44,7 @@
 #include <linux/rmap.h>
 #include <linux/acct.h>
 #include <linux/cn_proc.h>
+#include <linux/delayacct.h>
 
 #include <asm/pgtable.h>
 #include <asm/pgalloc.h>
@@ -984,6 +985,7 @@ static task_t *copy_process(unsigned lon
 		goto bad_fork_cleanup_put_domain;
 
 	p->did_exec = 0;
+	delayacct_tsk_init(p);	/* Must remain after dup_task_struct() */
 	copy_flags(clone_flags, p);
 	p->pid = pid;
 	retval = -EFAULT;
diff -puN kernel/Makefile~delayacct-setup kernel/Makefile
--- linux-2.6.17-rc3/kernel/Makefile~delayacct-setup	2006-04-28 23:47:55.000000000 +0530
+++ linux-2.6.17-rc3-balbir/kernel/Makefile	2006-05-02 09:44:21.000000000 +0530
@@ -38,6 +38,7 @@ obj-$(CONFIG_GENERIC_HARDIRQS) += irq/
 obj-$(CONFIG_SECCOMP) += seccomp.o
 obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o
 obj-$(CONFIG_RELAY) += relay.o
+obj-$(CONFIG_TASK_DELAY_ACCT) += delayacct.o
 
 ifneq ($(CONFIG_SCHED_NO_NO_OMIT_FRAME_POINTER),y)
 # According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is
_

^ permalink raw reply	[flat|nested] 18+ messages in thread
* [Patch 0/8] per-task delay accounting
@ 2006-04-22  2:16 Shailabh Nagar
  2006-04-22  2:23 ` [Patch 1/8] Setup Shailabh Nagar
  0 siblings, 1 reply; 18+ messages in thread
From: Shailabh Nagar @ 2006-04-22  2:16 UTC (permalink / raw)
  To: linux-kernel
  Cc: LSE, Jes Sorensen, Peter Chubb, Erich Focht, Levent Serinol,
	Jay Lan



Here are the delay accounting patches again. I'm not using the
earlier email thread due to code being refactored a bit.

The previous posting
    http://www.uwsg.indiana.edu/hypermail/linux/kernel/0603.3/1776.html
of these patches elicited several review comments from Andrew Morton
all of which have been addressed.

The other main thread of the comments was whether other accounting
stakeholders would be ok with this interface. Towards this end,
I'd posted an overview of what the other packages do (which didn't seem
to make the archives) and some of the stakeholders responded.

I'll repost the analysis as a reply to this post. Meanwhile, here's
the list of the stakeholders identified by Andrew and a summary of status
of their comments.


1. CSA accounting/PAGG/JOB: Jay Lan <jlan@engr.sgi.com>

Raised several points
       http://www.uwsg.indiana.edu/hypermail/linux/kernel/0604.1/0397.html
all of which have been addressed in this set of patches.

2. per-process IO statistics: Levent Serinol <lserinol@gmail.com>

No reponse.
I'd ascertained that its needs are a subset of CSA.

3. per-cpu time statistics: Erich Focht <efocht@ess.nec.de>

No response.
I'd ascertained that its needs can be met by taskstats
interface whenever these statistics are submitted for inclusion.

4. Microstate accounting: Peter Chubb <peterc@gelato.unsw.edu.au>

Mentioned overlap of patches with delay accounting
http://www.uwsg.indiana.edu/hypermail/linux/kernel/0603.3/2286.html

and also that a /proc interface was preferable due to convenience.
My position is that the netlink interface is a superset of /proc due to
former's ability to supply exit-time data.


5. ELSA:  Guillaume Thouvenin <guillaume.thouvenin@bull.net>

Confirmed that ELSA is not a direct user of a new kernel statistics
interface since it is a consumer of CSA or BSD accounting's statistics.


6. pnotify: Jes Sorensen <jes@sgi.com>
(taken over pnotify from Erik Jacobson)

Informed over private email that pnotify replacement is
being worked on.

I'd ascertained that pnotify (or its replacemenent) will not be
concerned with exporting data to userspace or collecting any stats.
Thats left to the kernel module that uses pnotify to get
notifications. CSA is one expected user of pnotify.
Hence CSA's concerns are the only ones relevant to pnotify as well.


7. Scalable statistics counters with /proc reporting:
 Ravikiran G Thirumalai, Dipankar Sarma <dipankar@in.ibm.com>

Confirmed these counters aren't relevant to this discussion.



--Shailabh


Series

delayacct-setup.patch
delayacct-blkio-swapin.patch
delayacct-schedstats.patch
genetlink-utils.patch
taskstats-setup.patch
delayacct-taskstats.patch
delayacct-doc.patch
delayacct-procfs.patch

^ permalink raw reply	[flat|nested] 18+ messages in thread
* [Patch 0/8] per-task delay accounting
@ 2006-03-30  0:32 Shailabh Nagar
  2006-03-30  0:35 ` [Patch 1/8] Setup Shailabh Nagar
  0 siblings, 1 reply; 18+ messages in thread
From: Shailabh Nagar @ 2006-03-30  0:32 UTC (permalink / raw)
  To: Andrew Morton; +Cc: Greg KH, Arjan van de Ven, Jamal, Andi Kleen, linux-kernel

Andrew,

Could you please include the following delay accounting patches
in -mm ?

The patches have gone through several iterations on lkml and
numerous comments raised by reviewers have been addressed

- several netlink interface comments (Jamal)
- block I/O collection method (Arjan)
- block I/O delays export through /proc (Andi)
- performance issues (Greg) (just addressed, see below)
- GPL headers (Arjan)

Most of the descriptions of the patches are either in the
patch itself or in the documentation patch at the end.

Thanks
--Shailabh


Patch series

delayacct-setup.patch
delayacct-blkio-swapin.patch
delayacct-schedstats.patch
genetlink-utils.patch
delayacct-genetlink.patch
delayacct-virtcpu.patch
delayacct-procfs.patch
delayacct-doc.patch



Results highlights

- No statistically significant performance degradation is seen in
  kernbench, hackbench and large OLTP benchmark when delay
  accounting is configured.

  The overheads of configuring delay accounting,
  without enabling at boot time, are statistically negligible
  for hackbench and a large OLTP benchmark and negative
  (i.e. performance improves) in kernbench.

- Similar lack of degradation is seen in kernbench and hackbench
  even when delay accounting is enabled at boot.

  No data could be collected for the large OLTP benchmark (efforts
  ongoing).

Legend

Base
	Vanilla 2.6.16 kernel
	without any patches applied
+patch	
	Delay accounting configured
	but not enabled at boot
+patch+enable
	Delay accounting enabled at boot
	but no stats read



Time	Elapsed time, averaged over 10 runs
Stddev	Standard deviation of elapsed times
Ovhd	% difference of elapsed time with respect to base kernel
t-value	Used to measure statistical significance
	of difference of two mean values (in this
	case mean elapsed time). Low t-values indicate
	insignificant difference. The t-values here were
	calculated at 95% confidence interval using the tool at
	http://www.polarismr.com/education/tools_stat_diff_means.html


Hackbench
---------
200 groups, using pipes
Elapsed time, in seconds, lower better

		Ovhd	Time 	Stddev	Ovhd significant (t-value)
Base		0%	43.483	0.178	na
+patch		0.1%	43.517	0.265	No (0.337)
+patch+enable	0.3%	43.629	0.167	No (1.892)

Kernbench
---------
Average of 10 iterations
Elapsed time, in seconds, lower better

		Ovhd	Time 	Stddev	Ovhd significant (t-value)
Base		0%	196.704	0.459	na
+patch		-0.5%	195.812	0.477	Yes (4.261)
+patch+enable	0.02%	196.752 0.356	No (0.261)


Large OLTP benchmark
--------------------
An industry standard large database online transaction processing
workload was run with delay accounting patches configured
ON and OFF.

The performance degradation of delay accounting was about 0.2%,
which was well within the normal range of variation between
similar runs.

No runs were taken with delay accounting enabled at boot time.

^ permalink raw reply	[flat|nested] 18+ messages in thread

end of thread, other threads:[~2006-05-10 11:02 UTC | newest]

Thread overview: 18+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2006-05-02  6:12 [Patch 1/8] Setup Balbir Singh
2006-05-08 21:17 ` Andrew Morton
2006-05-09  3:48   ` Balbir Singh
2006-05-10 10:16   ` [PATCH][delayacct] Fix the timespec_sub() interface (was Re: [Patch 1/8] Setup) Balbir Singh
2006-05-10 10:24     ` Andrew Morton
2006-05-10 10:27       ` Balbir Singh
2006-05-10 10:58         ` Balbir Singh
2006-05-08 21:23 ` [Patch 1/8] Setup Andrew Morton
2006-05-09  3:56   ` Balbir Singh
2006-05-10 10:18   ` [PATCH][delayacct] un-inline delayacct_end(), remove initialization of ts (was Re: [Patch 1/8] Setup) Balbir Singh
2006-05-09 11:46 ` [Patch 1/8] Setup Thomas Gleixner
2006-05-09 13:20   ` [Lse-tech] " Balbir Singh
  -- strict thread matches above, loose matches on Subject: below --
2006-04-22  2:16 [Patch 0/8] per-task delay accounting Shailabh Nagar
2006-04-22  2:23 ` [Patch 1/8] Setup Shailabh Nagar
2006-04-24  2:02   ` Randy.Dunlap
2006-04-24 17:26     ` Shailabh Nagar
2006-03-30  0:32 [Patch 0/8] per-task delay accounting Shailabh Nagar
2006-03-30  0:35 ` [Patch 1/8] Setup Shailabh Nagar
2006-03-30  5:03   ` Andrew Morton
2006-03-30 15:07     ` Shailabh Nagar

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox