[PATCH 1/7] Simple Performance Counters: Core Piece

public inbox for linux-kernel@vger.kernel.org
 help / color / mirror / Atom feed

* [PATCH 1/7] Simple Performance Counters: Core Piece
@ 2007-07-31 23:25 Christoph Lameter
  2007-08-01  1:30 ` Frank Ch. Eigler
  2007-08-17 16:42 ` Mathieu Desnoyers
  0 siblings, 2 replies; 5+ messages in thread
From: Christoph Lameter @ 2007-07-31 23:25 UTC (permalink / raw)
  To: linux-kernel; +Cc: Christoph Lameter

Simple performance counters are a way to measure the performance on code
paths in the Linux kernel. Code must be instrumented with calls that signal
the start and the stop of a measurement.

The beginning of a code path must have the following. Either:

	INIT_PC(var)

or
	struct pc var;

	...

	pc_start(&var);



and at the end of the segment of code to be measured either:

	pc_stop(&var, PC_xxx);

to just measure time intervals. Or

	pc_bytes(&var, bytes, PC_xxx)

to measure the amount of data that a code path can handle.


The data can then be viewed as the kernel runs via

cat /proc/perf/all

Which will show some timing and performance statistics. The numbers in ()
show 3 values: (minimum/average/maximum)

update_process_times                21370 14.8ms(194ns/693ns/9us)
alloc_pages                        297542 189.4ms(96ns/637ns/68.7us) 1.2gb(4.1kb/4.2kb/16.4kb)
kmem_cache_alloc                   637116 71.7ms(10ns/113ns/60.8us)
kmem_cache_free                    566426 39.2ms(19ns/69ns/7.1us)
kfree                               48622 4.1ms(19ns/84ns/3.7us)


update_process_times needed between 194ns and 9us. On average it needed 693 nanoseconds.
21370 measurements were performed.


Data can be zeroed by writing to /proc/perf/reset.

Typically one would zero the counters and then perform a kernel activity that
exercises the instrumented code path.



Data can be viewed in
/proc/perf

Special files:

/proc/perf/all		Shows a summary
/proc/perf/reset	Writing to this file resets counters
/proc/perf/0		Counters on processor  0

Signed-off-by: Christoph Lameter <clameter@sgi.com>
---
 include/linux/perf.h |   55 ++++++++
 init/Kconfig         |   10 ++
 kernel/Makefile      |    1 +
 kernel/perf.c        |  368 ++++++++++++++++++++++++++++++++++++++++++++++++++
 kernel/timer.c       |    3 +
 5 files changed, 437 insertions(+), 0 deletions(-)
 create mode 100644 include/linux/perf.h
 create mode 100644 kernel/perf.c

diff --git a/include/linux/perf.h b/include/linux/perf.h
new file mode 100644
index 0000000..2958c81
--- /dev/null
+++ b/include/linux/perf.h
@@ -0,0 +1,55 @@
+#ifndef __LINUX_PERF_H
+#define __LINUX_PERF_H
+
+#include <linux/timex.h>
+/*
+ * Time Stamp Performance Counters
+ *
+ * (C) 2007 Silicon Graphics, Inc.
+ *	Christoph Lameter <clameter@sgi.com>, April 2007
+ *
+ * Counters are calculated using the cycle counter. If a process
+ * is migrated to another cpu during the measurement then the measurement
+ * is invalid.
+ */
+
+enum pc_item {
+	PC_UPDATE_PROCESS_TIMES,
+	NR_PC_ITEMS
+};
+
+/*
+ * Information about the start of the measurement
+ */
+struct pc {
+	unsigned long time;	/* get_cycles() value taken at the start */
+	int processor;		/* cpu the measurement was started on */
+};
+
+#define pc_stop(__pc, __nr) pc_bytes(__pc, 0, __nr)
+
+#ifdef CONFIG_PERFCOUNT
+
+#define INIT_PC(__var) struct pc __var = \
+		{ get_cycles(), raw_smp_processor_id() }
+
+static inline void pc_start(struct pc *pc)
+{
+	pc->processor = raw_smp_processor_id();
+	pc->time = get_cycles();
+}
+
+void pc_bytes(struct pc *pc, unsigned long bytes, enum pc_item nr);
+
+void pc_stop_printk(struct pc *pc);
+
+#else
+
+#define INIT_PC(__var) char __var[0]
+#define pc_start(__var) do { } while (!__var)
+#define pc_bytes(__var, __b, __i) do { } while (!__var)
+#define pc_stop_printk(__var) do { } while (!__var)
+#endif
+
+#endif
+
diff --git a/init/Kconfig b/init/Kconfig
index 96b5459..affb532 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -206,6 +206,16 @@ config TASK_IO_ACCOUNTING
 
 	  Say N if unsure.
 
+config PERFCOUNT
+	bool "Time Stamp Counter based performance measurements"
+	help
+	  Enables performance counters based on the time stamp counters.
+	  These can be used to measure code paths in the kernel and also
+	  gauge their effectiveness in transferring bytes. The performance
+	  counters must be added by modifying code. The counters will then
+	  be visible via files in /proc/perf/*.
+
+
 config USER_NS
 	bool "User Namespaces (EXPERIMENTAL)"
 	default n
diff --git a/kernel/Makefile b/kernel/Makefile
index 2a99983..0f3eaa9 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -51,6 +51,7 @@ obj-$(CONFIG_RELAY) += relay.o
 obj-$(CONFIG_SYSCTL) += utsname_sysctl.o
 obj-$(CONFIG_TASK_DELAY_ACCT) += delayacct.o
 obj-$(CONFIG_TASKSTATS) += taskstats.o tsacct.o
+obj-$(CONFIG_PERFCOUNT) += perf.o
 
 ifneq ($(CONFIG_SCHED_NO_NO_OMIT_FRAME_POINTER),y)
 # According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is
diff --git a/kernel/perf.c b/kernel/perf.c
new file mode 100644
index 0000000..14cba9c
--- /dev/null
+++ b/kernel/perf.c
@@ -0,0 +1,368 @@
+/*
+ * Simple Performance Counter subsystem
+ *
+ * (C) 2007 Silicon Graphics, Inc.
+ *
+ * 	Christoph Lameter <clameter@sgi.com>
+ */
+
+#include <linux/module.h>
+#include <linux/percpu.h>
+#include <linux/seq_file.h>
+#include <linux/fs.h>
+#include <linux/proc_fs.h>
+#include <linux/cpumask.h>
+#include <linux/perf.h>
+
+#ifdef CONFIG_NUMA
+static int unsynced_get_cycles = 1;
+#else
+#define unsynced_get_cycles 0
+#endif
+
+const char *var_id[NR_PC_ITEMS] = {
+	"update_process_times",
+};
+
+struct perf_counter {
+	u32 events;	/* measurements accounted */
+	u32 mintime;	/* shortest interval in ns; 0 while unset */
+	u32 maxtime;	/* longest interval in ns */
+	u32 minbytes;	/* smallest nonzero byte count; 0 while unset */
+	u32 maxbytes;	/* largest byte count */
+	u32 skipped;	/* measurements dropped because the cpu changed */
+	u64 time;	/* sum of all intervals in ns */
+	u64 bytes;	/* sum of all byte counts */
+};
+
+static DEFINE_PER_CPU(struct perf_counter, perf_counters)[NR_PC_ITEMS];
+
+/* Add the time since *pc started (and bytes, if any) to this cpu's counter nr */
+void pc_bytes(struct pc *pc, unsigned long bytes, enum pc_item nr)
+{
+	unsigned long time = get_cycles();
+	unsigned long ns;
+	struct perf_counter *p = &get_cpu_var(perf_counters)[nr];
+
+	if (unlikely(nr >= NR_PC_ITEMS)) {
+		printk(KERN_CRIT "pc_bytes: item number "
+			"(%d) out of range\n", nr);
+		dump_stack();
+		goto out;
+	}
+
+	if (unlikely(unsynced_get_cycles &&
+			pc->processor != smp_processor_id())) {
+		/* On different processor. TSC measurement not possible. */
+		p->skipped++;
+		goto out;
+	}
+
+	ns = cycles_to_ns((unsigned long long)(time - pc->time));
+	p->time += ns;
+	p->events++;
+
+	if (ns > p->maxtime)
+		p->maxtime = ns;
+	/* mintime == 0 means that no measurement has been recorded yet */
+	if (p->mintime == 0 || ns < p->mintime)
+		p->mintime = ns;
+
+	if (bytes) {
+		p->bytes += bytes;
+		if (bytes > p->maxbytes)
+			p->maxbytes = bytes;
+		if (p->minbytes == 0 || bytes < p->minbytes)
+			p->minbytes = bytes;
+	}
+out:
+	put_cpu_var(perf_counters);
+}
+EXPORT_SYMBOL(pc_bytes);
+
+static void reset_perfcount_item(struct perf_counter *c)
+{
+	memset(c, 0, sizeof(struct perf_counter));
+}
+
+static void perfcount_reset(void) {
+	int cpu;
+	enum pc_item i;
+
+	for_each_online_cpu(cpu)
+		for (i = 0; i < NR_PC_ITEMS; i++)
+			reset_perfcount_item(
+				&per_cpu(perf_counters, cpu)[i]);
+}
+
+struct unit {
+	unsigned int n;
+	const char * s;
+};
+
+static const struct unit event_units[] = {
+	{ 1000, "" },
+	{ 1000, "K" },
+	{ 1000, "M" },
+	{ 1000, "G" },
+	{ 1000, "T" },
+	{ 1000, "P" },
+	{ 1000, "Q" },
+};
+
+
+static const struct unit time_units[] = {
+	{ 1000, "ns" },
+	{ 1000, "us" },
+	{ 1000, "ms" },
+	{ 60, "s" },
+	{ 60, "m" },
+	{ 24, "h" },
+	{ 365, "d" },
+	{ 1000, "y" },
+};
+
+static const struct unit byte_units[] = {
+	{ 1000, "b" },
+	{ 1000, "kb" },
+	{ 1000, "mb" },
+	{ 1000, "gb" },
+	{ 1000, "tb" },
+	{ 1000, "pb" },
+	{ 1000, "qb" }
+};
+
+/* Print x scaled by the units in u[], with one rounded decimal digit if nonzero */
+static void pval(struct seq_file *s, unsigned long x, const struct unit *u)
+{
+	unsigned n = 0;
+	unsigned rem = 0;
+	unsigned last_divisor = 0;
+
+	while (x >= u[n].n) {	/* NOTE(review): n is not checked against the table size */
+		last_divisor = u[n].n;
+		rem = x % last_divisor;
+		x = x / last_divisor;
+		n++;
+	}
+
+	if (last_divisor)
+		rem = (rem * 10 + last_divisor / 2) / last_divisor;	/* tenths, rounded */
+	else
+		rem = 0;
+
+	/*
+	 * Rounding may have resulted in the need to go
+	 * to the next number
+	 */
+	if (rem == 10) {
+		x++;
+		rem = 0;
+	};
+
+	seq_printf(s, "%lu", x);
+	if (rem) {
+		seq_putc(s, '.');
+		seq_putc(s, '0' + rem);
+	}
+	seq_puts(s, u[n].s);
+}
+
+/* printk the time elapsed since *pc was started, scaled using time_units */
+void pc_stop_printk(struct pc *pc)
+{
+	unsigned n = 0;
+	unsigned rem = 0;
+	unsigned last_divisor = 0;
+	const struct unit *u = time_units;
+	unsigned long x = cycles_to_ns(get_cycles() - pc->time);
+
+	while (x >= u[n].n) {
+		last_divisor = u[n].n;
+		rem = x % last_divisor;
+		x = x / last_divisor;
+		n++;
+	}
+
+	if (last_divisor)
+		rem = (rem * 10 + last_divisor / 2) / last_divisor;
+	else
+		rem = 0;
+
+	/*
+	 * Rounding may have resulted in the need to go
+	 * to the next number
+	 */
+	if (rem == 10) {
+		x++;
+		rem = 0;
+	}
+
+	printk("%lu", x);
+	if (rem) {
+		char frac[3] = ".0";
+
+		frac[1] += rem;
+		printk("%s", frac);	/* never pass data as the format string */
+	}
+	printk("%s", u[n].s);
+}
+
+/* Print a set of statistical values in the form sum(min/avg/max) */
+static void pc_print(struct seq_file *s, const struct unit *u,
+	unsigned long count, unsigned long sum,
+	unsigned long min, unsigned long max)
+{
+	pval(s, sum, u);
+	seq_putc(s,'(');
+	pval(s, min, u);
+	seq_putc(s,'/');
+	if (count)
+		pval(s, (sum + count / 2 ) / count, u);
+	else
+		pval(s, 0, u);
+	seq_putc(s,'/');
+	pval(s, max, u);
+	seq_putc(s,')');
+}
+
+
+/* Show one counter: a single cpu's values, or the sum over all online cpus */
+static int perf_show(struct seq_file *s, void *v)
+{
+	int cpu = (unsigned long)s->private;
+	enum pc_item counter = (unsigned long)v - 1;
+	struct perf_counter summary, *x;
+
+	if (cpu >= 0)
+		x = &per_cpu(perf_counters, cpu)[counter];
+	else {
+		/* Start from zero so that cpu 0 is not counted twice below */
+		memset(&summary, 0, sizeof(summary));
+		for_each_online_cpu(cpu) {
+			struct perf_counter *c =
+				&per_cpu(perf_counters, cpu)[counter];
+
+			summary.events += c->events;
+			summary.skipped += c->skipped;
+			summary.time += c->time;
+			summary.bytes += c->bytes;
+
+			if (summary.maxtime < c->maxtime)
+				summary.maxtime = c->maxtime;
+
+			if (summary.mintime == 0 ||
+				(c->mintime != 0 &&
+				summary.mintime > c->mintime))
+					summary.mintime = c->mintime;
+
+			if (summary.maxbytes < c->maxbytes)
+				summary.maxbytes = c->maxbytes;
+
+			if (summary.minbytes == 0 ||
+				(c->minbytes != 0 &&
+				summary.minbytes > c->minbytes))
+					summary.minbytes = c->minbytes;
+		}
+		x = &summary;
+	}
+
+	seq_printf(s, "%-30s %10u ", var_id[counter], x->events);
+	if (x->skipped)
+		seq_printf(s, "(+%3u) ", x->skipped);
+	pc_print(s, time_units, x->events, x->time, x->mintime, x->maxtime);
+	if (x->bytes) {
+		seq_putc(s,' ');
+		pc_print(s, byte_units, x->events, x->bytes,
+			x->minbytes, x->maxbytes);
+	}
+	seq_putc(s, '\n');
+	return 0;
+}
+
+static void *perf_start(struct seq_file *m, loff_t *pos)
+{
+	return (*pos < NR_PC_ITEMS) ? (void *)(*pos +1) : NULL;
+}
+
+static void *perf_next(struct seq_file *m, void *v, loff_t *pos)
+{
+	++*pos;
+	return perf_start(m, pos);
+}
+
+static void perf_stop(struct seq_file *m, void *v)
+{
+}
+
+struct seq_operations perf_data_ops = {
+	.start  = perf_start,
+	.next   = perf_next,
+	.stop   = perf_stop,
+	.show   = perf_show,
+};
+
+static int perf_data_open(struct inode *inode, struct file *file)
+{
+	int res;
+
+	res = seq_open(file, &perf_data_ops);
+	if (!res)
+		((struct seq_file *)file->private_data)->private =
+							PDE(inode)->data;
+
+	return res;
+}
+
+static struct file_operations perf_data_fops = {
+	.open		= perf_data_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= seq_release,
+};
+
+static int perf_reset_write(struct file *file, const char __user *buffer,
+	unsigned long count, void *data)
+{
+	perfcount_reset();
+	return count;
+}
+
+/* Create /proc/perf with "reset", "all" and a file per possible cpu; -ENOMEM on failure */
+static __init int init_perfcounter(void) {
+	int cpu;
+	struct proc_dir_entry *proc_perf, *perf_reset, *perf_all;
+
+	proc_perf = proc_mkdir("perf", NULL);
+	if (!proc_perf)
+		return -ENOMEM;
+
+	perf_reset = create_proc_entry("reset", S_IWUGO, proc_perf);
+	perf_all = create_proc_entry("all", S_IRUGO, proc_perf);
+	if (!perf_reset || !perf_all)
+		return -ENOMEM;
+	perf_reset->write_proc = perf_reset_write;
+	perf_all->proc_fops = &perf_data_fops;
+	perf_all->data = (void *)-1;	/* negative cpu selects the summary */
+
+	for_each_possible_cpu(cpu) {
+		char name[20];
+		struct proc_dir_entry *p;
+
+		sprintf(name, "%d", cpu);
+		p = create_proc_entry(name, S_IRUGO, proc_perf);
+		if (!p)		/* do not dereference a failed allocation */
+			return -ENOMEM;
+		p->proc_fops = &perf_data_fops;
+		p->data = (void *)(unsigned long)cpu;
+	}
+	perfcount_reset();
+
+#if defined(CONFIG_NUMA) && defined(CONFIG_X86_64)
+	if (!unsynchronized_tsc())
+		unsynced_get_cycles = 0;
+#endif
+	return 0;
+}
+
+__initcall(init_perfcounter);
+
diff --git a/kernel/timer.c b/kernel/timer.c
index 6ce1952..55d4619 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -36,6 +36,7 @@
 #include <linux/delay.h>
 #include <linux/tick.h>
 #include <linux/kallsyms.h>
+#include <linux/perf.h>
 
 #include <asm/uaccess.h>
 #include <asm/unistd.h>
@@ -824,6 +825,7 @@ void update_process_times(int user_tick)
 {
 	struct task_struct *p = current;
 	int cpu = smp_processor_id();
+	INIT_PC(pc);
 
 	/* Note: this timer irq context must be accounted for as well. */
 	if (user_tick)
@@ -835,6 +837,7 @@ void update_process_times(int user_tick)
 		rcu_check_callbacks(cpu, user_tick);
 	scheduler_tick();
 	run_posix_cpu_timers(p);
+	pc_stop(&pc, PC_UPDATE_PROCESS_TIMES);
 }
 
 /*
-- 
1.5.2.4


^ permalink raw reply related	[flat|nested] 5+ messages in thread

* Re: [PATCH 1/7] Simple Performance Counters: Core Piece
  2007-07-31 23:25 [PATCH 1/7] Simple Performance Counters: Core Piece Christoph Lameter
@ 2007-08-01  1:30 ` Frank Ch. Eigler
  2007-08-17 16:42 ` Mathieu Desnoyers
  1 sibling, 0 replies; 5+ messages in thread
From: Frank Ch. Eigler @ 2007-08-01  1:30 UTC (permalink / raw)
  To: Christoph Lameter; +Cc: linux-kernel

Christoph Lameter <clameter@sgi.com> writes:

> Simple performance counters are a way to measure the performance on
> code paths in the Linux kernel. Code must be instrumented with calls
> that signal the start and the stop of a measurement.  [...]

For what it's worth, this kind of measurement widget could be usefully
recast as a pure client of the lttng/systemtap markers that Mathieu is
still working on.  Instead of the custom pc_start/pc_stop functions,
you would have generic markers to identify the start/end spots, and a
bit of callback code to compute/export the statistics to procfs.

- FChE

^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: [PATCH 1/7] Simple Performance Counters: Core Piece
  2007-07-31 23:25 [PATCH 1/7] Simple Performance Counters: Core Piece Christoph Lameter
  2007-08-01  1:30 ` Frank Ch. Eigler
@ 2007-08-17 16:42 ` Mathieu Desnoyers
  2007-08-17 20:42   ` Christoph Lameter
  1 sibling, 1 reply; 5+ messages in thread
From: Mathieu Desnoyers @ 2007-08-17 16:42 UTC (permalink / raw)
  To: Christoph Lameter; +Cc: linux-kernel

Hi Christoph,

Actually, get_cycles(), at least on some AMD cpus, does not synchronize the
core, which can skew the results. You might want to use
get_cycles_sync() there.

Mathieu

* Christoph Lameter (clameter@sgi.com) wrote:
> Simple performance counters are a way to measure the performance on code
> paths in the Linux kernel. Code must be instrumented with calls that signal
> the start and the stop of a measurement.
> 
> The beginning of a code path must have the following. Either:
> 
> 	INIT_PC(var)
> 
> or
> 	struct pc var;
> 
> 	...
> 
> 	pc_start(&var);
> 
> 
> 
> and at the end of the segment of code to be measured either:
> 
> 	pc_stop(&var, PC_xxx);
> 
> to just measure time intervals. Or
> 
> 	pc_bytes(&var, bytes, PC_xxx)
> 
> to measure the amount of data that a code path can handle.
> 
> 
> The data can then be viewed as the kernel runs via
> 
> cat /proc/perf/all
> 
> Which will show some timing and performance statistics. The numbers in ()
> show 3 values: (mininum/average/maximum)
> 
> update_process_times                21370 14.8ms(194ns/693ns/9us)
> alloc_pages                        297542 189.4ms(96ns/637ns/68.7us) 1.2gb(4.1kb/4.2kb/16.4kb)
> kmem_cache_alloc                   637116 71.7ms(10ns/113ns/60.8us)
> kmem_cache_free                    566426 39.2ms(19ns/69ns/7.1us)
> kfree                               48622 4.1ms(19ns/84ns/3.7us)
> 
> 
> update_process_times needed between 194ns and 9us. On average is needsd 693 nanoseconds.
> 21370 measurements werer performed.
> 
> 
> Data can be zeroed by writing to /proc/perf/reset.
> 
> Typically one would zero the counters and then perform a kernel activity that
> exercises the instrumented code path.
> 
> 
> 
> Data can be viewed in
> /proc/perf
> 
> Special files:
> 
> /proc/perf/all		Shows a summary
> /proc/perf/reset	Writing to this file resets counters
> /proc/perf/0		Counters on processor  0
> 
> Signed-off-by: Christoph Lameter <clameter@sgi.com>
> ---
>  include/linux/perf.h |   55 ++++++++
>  init/Kconfig         |   10 ++
>  kernel/Makefile      |    1 +
>  kernel/perf.c        |  368 ++++++++++++++++++++++++++++++++++++++++++++++++++
>  kernel/timer.c       |    3 +
>  5 files changed, 437 insertions(+), 0 deletions(-)
>  create mode 100644 include/linux/perf.h
>  create mode 100644 kernel/perf.c
> 
> diff --git a/include/linux/perf.h b/include/linux/perf.h
> new file mode 100644
> index 0000000..2958c81
> --- /dev/null
> +++ b/include/linux/perf.h
> @@ -0,0 +1,55 @@
> +#ifndef __LINUX_PERF_H
> +#define __LINUX_PERF_H
> +
> +#include <linux/timex.h>
> +/*
> + * Time Stamp Performance Counters
> + *
> + * (C) 2007 Silicon Graphics, Inc.
> + *	Christoph Lameter <clameter@sgi.com>, April 2007
> + *
> + * Counters are calculated using the cycle counter. If a process
> + * is migrated to another cpu during the measurement then the measurement
> + * is invalid.
> + */
> +
> +enum pc_item {
> +	PC_UPDATE_PROCESS_TIMES,
> +	NR_PC_ITEMS
> +};
> +
> +/*
> + * Information about the start of the measurement
> + */
> +struct pc {
> +	unsigned long time;
> +	int processor;
> +};
> +
> +#define pc_stop(__pc, __nr) pc_bytes(__pc, 0, __nr)
> +
> +#ifdef CONFIG_PERFCOUNT
> +
> +#define INIT_PC(__var) struct pc __var = \
> +		{ get_cycles(), raw_smp_processor_id() }
> +
> +static inline void pc_start(struct pc *pc)
> +{
> +	pc->processor = raw_smp_processor_id();
> +	pc->time = get_cycles();
> +}
> +
> +void pc_bytes(struct pc *pc, unsigned long bytes, enum pc_item nr);
> +
> +void pc_stop_printk(struct pc *pc);
> +
> +#else
> +
> +#define INIT_PC(__var) char __var[0]
> +#define pc_start(__var) do { } while (!__var)
> +#define pc_bytes(__var, __b, __i) do { } while (!__var)
> +#define pc_stop_printk(__var) do { } while (!__var)
> +#endif
> +
> +#endif
> +
> diff --git a/init/Kconfig b/init/Kconfig
> index 96b5459..affb532 100644
> --- a/init/Kconfig
> +++ b/init/Kconfig
> @@ -206,6 +206,16 @@ config TASK_IO_ACCOUNTING
>  
>  	  Say N if unsure.
>  
> +config PERFCOUNT
> +	bool "Time Stamp Counter based performance measurements"
> +	help
> +	  Enables performance counters based on the time stamp counters.
> +	  These can be used to measure code paths in the kernel and also
> +	  gauge their effectiveness in transferring bytes. The performance
> +	  counters must be added by modifying code. The counters will then
> +	  be visible via files in /proc/perf/*.
> +
> +
>  config USER_NS
>  	bool "User Namespaces (EXPERIMENTAL)"
>  	default n
> diff --git a/kernel/Makefile b/kernel/Makefile
> index 2a99983..0f3eaa9 100644
> --- a/kernel/Makefile
> +++ b/kernel/Makefile
> @@ -51,6 +51,7 @@ obj-$(CONFIG_RELAY) += relay.o
>  obj-$(CONFIG_SYSCTL) += utsname_sysctl.o
>  obj-$(CONFIG_TASK_DELAY_ACCT) += delayacct.o
>  obj-$(CONFIG_TASKSTATS) += taskstats.o tsacct.o
> +obj-$(CONFIG_PERFCOUNT) += perf.o
>  
>  ifneq ($(CONFIG_SCHED_NO_NO_OMIT_FRAME_POINTER),y)
>  # According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is
> diff --git a/kernel/perf.c b/kernel/perf.c
> new file mode 100644
> index 0000000..14cba9c
> --- /dev/null
> +++ b/kernel/perf.c
> @@ -0,0 +1,368 @@
> +/*
> + * Simple Performance Counter subsystem
> + *
> + * (C) 2007 Silicong Graphics, Inc.
> + *
> + * 	Christoph Lameter <clameter@sgi.com>
> + */
> +
> +#include <linux/module.h>
> +#include <linux/percpu.h>
> +#include <linux/seq_file.h>
> +#include <linux/fs.h>
> +#include <linux/proc_fs.h>
> +#include <linux/cpumask.h>
> +#include <linux/perf.h>
> +
> +#ifdef CONFIG_NUMA
> +static int unsynced_get_cycles = 1;
> +#else
> +#define unsynced_get_cycles 0
> +#endif
> +
> +const char *var_id[NR_PC_ITEMS] = {
> +	"update_process_times",
> +};
> +
> +struct perf_counter {
> +	u32 events;
> +	u32 mintime;
> +	u32 maxtime;
> +	u32 minbytes;
> +	u32 maxbytes;
> +	u32 skipped;
> +	u64 time;
> +	u64 bytes;
> +};
> +
> +static DEFINE_PER_CPU(struct perf_counter, perf_counters)[NR_PC_ITEMS];
> +
> +void pc_bytes(struct pc *pc, unsigned long bytes, enum pc_item nr)
> +{
> +	unsigned long time = get_cycles();
> +	unsigned long ns;
> +	struct perf_counter *p = &get_cpu_var(perf_counters)[nr];
> +
> +	if (unlikely(nr >= NR_PC_ITEMS)) {
> +		printk(KERN_CRIT "pc_bytes: item number "
> +			"(%d) out of range\n", nr);
> +		dump_stack();
> +		goto out;
> +	}
> +
> +	if (unlikely(unsynced_get_cycles &&
> +			pc->processor != smp_processor_id())) {
> +		/* On different processor. TSC measurement not possible. */
> +		p->skipped++;
> +		goto out;
> +	}
> +
> +	ns = cycles_to_ns((unsigned long long)(time - pc->time));
> +	p->time += ns;
> +	p->events++;
> +
> +	if (ns > p->maxtime)
> +		p->maxtime = ns;
> +
> +	if (p->mintime == 0 || ns < p->mintime)
> +		p->mintime = ns;
> +
> +	if (bytes) {
> +		p->bytes += bytes;
> +		if (bytes > p->maxbytes)
> +			p->maxbytes = bytes;
> +		if (p->minbytes == 0 || bytes < p->minbytes)
> +			p->minbytes = bytes;
> +	}
> +out:
> +	put_cpu_var();
> +	return;
> +}
> +EXPORT_SYMBOL(pc_bytes);
> +
> +static void reset_perfcount_item(struct perf_counter *c)
> +{
> +	memset(c, 0, sizeof(struct perf_counter));
> +}
> +
> +static void perfcount_reset(void) {
> +	int cpu;
> +	enum pc_item i;
> +
> +	for_each_online_cpu(cpu)
> +		for (i = 0; i < NR_PC_ITEMS; i++)
> +			reset_perfcount_item(
> +				&per_cpu(perf_counters, cpu)[i]);
> +}
> +
> +struct unit {
> +	unsigned int n;
> +	const char * s;
> +};
> +
> +static const struct unit event_units[] = {
> +	{ 1000, "" },
> +	{ 1000, "K" },
> +	{ 1000, "M" },
> +	{ 1000, "G" },
> +	{ 1000, "T" },
> +	{ 1000, "P" },
> +	{ 1000, "Q" },
> +};
> +
> +
> +static const struct unit time_units[] = {
> +	{ 1000, "ns" },
> +	{ 1000, "us" },
> +	{ 1000, "ms" },
> +	{ 60, "s" },
> +	{ 60, "m" },
> +	{ 24, "h" },
> +	{ 365, "d" },
> +	{ 1000, "y" },
> +};
> +
> +static const struct unit byte_units[] = {
> +	{ 1000, "b" },
> +	{ 1000, "kb" },
> +	{ 1000, "mb" },
> +	{ 1000, "gb" },
> +	{ 1000, "tb" },
> +	{ 1000, "pb" },
> +	{ 1000, "qb" }
> +};
> +
> +/* Print a value using the given array of units and scale it properly */
> +static void pval(struct seq_file *s, unsigned long x, const struct unit *u)
> +{
> +	unsigned n = 0;
> +	unsigned rem = 0;
> +	unsigned last_divisor = 0;
> +
> +	while (x >= u[n].n) {
> +		last_divisor = u[n].n;
> +		rem = x % last_divisor;
> +		x = x / last_divisor;
> +		n++;
> +	}
> +
> +	if (last_divisor)
> +		rem = (rem * 10 + last_divisor / 2) / last_divisor;
> +	else
> +		rem = 0;
> +
> +	/*
> +	 * Rounding may have resulted in the need to go
> +	 * to the next number
> +	 */
> +	if (rem == 10) {
> +		x++;
> +		rem = 0;
> +	};
> +
> +	seq_printf(s, "%lu", x);
> +	if (rem) {
> +		seq_putc(s, '.');
> +		seq_putc(s, '0' + rem);
> +	}
> +	seq_puts(s, u[n].s);
> +}
> +
> +/* Print a value using the given array of units and scale it properly */
> +void pc_stop_printk(struct pc *pc)
> +{
> +	unsigned n = 0;
> +	unsigned rem = 0;
> +	unsigned last_divisor = 0;
> +	const struct unit *u = time_units;
> +	unsigned long x = cycles_to_ns(get_cycles() - pc->time);
> +
> +	while (x >= u[n].n) {
> +		last_divisor = u[n].n;
> +		rem = x % last_divisor;
> +		x = x / last_divisor;
> +		n++;
> +	}
> +
> +	if (last_divisor)
> +		rem = (rem * 10 + last_divisor / 2) / last_divisor;
> +	else
> +		rem = 0;
> +
> +	/*
> +	 * Rounding may have resulted in the need to go
> +	 * to the next number
> +	 */
> +	if (rem == 10) {
> +		x++;
> +		rem = 0;
> +	};
> +
> +	printk("%lu", x);
> +	if (rem) {
> +		char x[3] = ".0";
> +
> +		x[1] += rem;
> +		printk(x);
> +	}
> +	printk(u[n].s);
> +}
> +
> +/* Print a set of statistical values in the form sum(max/avg/min) */
> +static void pc_print(struct seq_file *s, const struct unit *u,
> +	unsigned long count, unsigned long sum,
> +	unsigned long min, unsigned long max)
> +{
> +	pval(s, sum, u);
> +	seq_putc(s,'(');
> +	pval(s, min, u);
> +	seq_putc(s,'/');
> +	if (count)
> +		pval(s, (sum + count / 2 ) / count, u);
> +	else
> +		pval(s, 0, u);
> +	seq_putc(s,'/');
> +	pval(s, max, u);
> +	seq_putc(s,')');
> +}
> +
> +
> +static int perf_show(struct seq_file *s, void *v)
> +{
> +	int cpu = (unsigned long)s->private;
> +	enum pc_item counter = (unsigned long)v - 1;
> +	struct perf_counter summary, *x;
> +
> +	if (cpu >= 0)
> +		x = &per_cpu(perf_counters, cpu)[counter];
> +	else {
> +		memcpy(&summary, &per_cpu(perf_counters, 0)[counter],
> +			sizeof(summary));
> +		for_each_online_cpu(cpu) {
> +			struct perf_counter *c =
> +				&per_cpu(perf_counters, 0)[counter];
> +
> +			summary.events += c->events;
> +			summary.skipped += c->skipped;
> +			summary.time += c->time;
> +			summary.bytes += c->bytes;
> +
> +			if (summary.maxtime < c->maxtime)
> +				summary.maxtime = c->maxtime;
> +
> +			if (summary.mintime == 0 ||
> +				(c->mintime != 0 &&
> +				summary.mintime > c->mintime))
> +					summary.mintime = c->mintime;
> +
> +			if (summary.maxbytes < c->maxbytes)
> +				summary.maxbytes = c->maxbytes;
> +
> +			if (summary.minbytes == 0 ||
> +				(c->minbytes != 0 &&
> +				summary.minbytes > c->minbytes))
> +					summary.minbytes = c->minbytes;
> +
> +		}
> +		x = &summary;
> +	}
> +
> +	seq_printf(s, "%-30s %10u ", var_id[counter], x->events);
> +	if (x->skipped)
> +		seq_printf(s, "(+%3u) ", x->skipped);
> +	pc_print(s, time_units, x->events, x->time, x->mintime, x->maxtime);
> +	if (x->bytes) {
> +		seq_putc(s,' ');
> +		pc_print(s, byte_units, x->events, x->bytes,
> +			x->minbytes, x->maxbytes);
> +	}
> +	seq_putc(s, '\n');
> +	return 0;
> +}
> +
> +static void *perf_start(struct seq_file *m, loff_t *pos)
> +{
> +	return (*pos < NR_PC_ITEMS) ? (void *)(*pos +1) : NULL;
> +}
> +
> +static void *perf_next(struct seq_file *m, void *v, loff_t *pos)
> +{
> +	++*pos;
> +	return perf_start(m, pos);
> +}
> +
> +static void perf_stop(struct seq_file *m, void *v)
> +{
> +}
> +
> +struct seq_operations perf_data_ops = {
> +	.start  = perf_start,
> +	.next   = perf_next,
> +	.stop   = perf_stop,
> +	.show   = perf_show,
> +};
> +
> +static int perf_data_open(struct inode *inode, struct file *file)
> +{
> +	int res;
> +
> +	res = seq_open(file, &perf_data_ops);
> +	if (!res)
> +		((struct seq_file *)file->private_data)->private =
> +							PDE(inode)->data;
> +
> +	return res;
> +}
> +
> +static struct file_operations perf_data_fops = {
> +	.open		= perf_data_open,
> +	.read		= seq_read,
> +	.llseek		= seq_lseek,
> +	.release	= seq_release,
> +};
> +
> +static int perf_reset_write(struct file *file, const char __user *buffer,
> +	unsigned long count, void *data)
> +{
> +	perfcount_reset();
> +	return count;
> +}
> +
> +static __init int init_perfcounter(void) {
> +	int cpu;
> +
> +	struct proc_dir_entry *proc_perf, *perf_reset, *perf_all;
> +
> +	proc_perf = proc_mkdir("perf", NULL);
> +	if (!proc_perf)
> +		return -ENOMEM;
> +
> +	perf_reset = create_proc_entry("reset", S_IWUGO, proc_perf);
> +	perf_reset->write_proc = perf_reset_write;
> +
> +	perf_all = create_proc_entry("all", S_IRUGO, proc_perf);
> +	perf_all->proc_fops = &perf_data_fops;
> +	perf_all->data = (void *)-1;
> +
> +	for_each_possible_cpu(cpu) {
> +		char name[20];
> +		struct proc_dir_entry *p;
> +
> +		sprintf(name, "%d", cpu);
> +		p = create_proc_entry(name, S_IRUGO, proc_perf);
> +
> +		p->proc_fops = &perf_data_fops;
> +		p->data = (void *)(unsigned long)cpu;
> +	}
> +
> +	perfcount_reset();
> +
> +#if defined(CONFIG_NUMA) && defined(CONFIG_X86_64)
> +	if (!unsynchronized_tsc())
> +		unsynced_get_cycles = 0;
> +#endif
> +	return 0;
> +}
> +
> +__initcall(init_perfcounter);
> +
> diff --git a/kernel/timer.c b/kernel/timer.c
> index 6ce1952..55d4619 100644
> --- a/kernel/timer.c
> +++ b/kernel/timer.c
> @@ -36,6 +36,7 @@
>  #include <linux/delay.h>
>  #include <linux/tick.h>
>  #include <linux/kallsyms.h>
> +#include <linux/perf.h>
>  
>  #include <asm/uaccess.h>
>  #include <asm/unistd.h>
> @@ -824,6 +825,7 @@ void update_process_times(int user_tick)
>  {
>  	struct task_struct *p = current;
>  	int cpu = smp_processor_id();
> +	INIT_PC(pc);
>  
>  	/* Note: this timer irq context must be accounted for as well. */
>  	if (user_tick)
> @@ -835,6 +837,7 @@ void update_process_times(int user_tick)
>  		rcu_check_callbacks(cpu, user_tick);
>  	scheduler_tick();
>  	run_posix_cpu_timers(p);
> +	pc_stop(&pc, PC_UPDATE_PROCESS_TIMES);
>  }
>  
>  /*
> -- 
> 1.5.2.4
> 
> -
> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> Please read the FAQ at  http://www.tux.org/lkml/
> 

-- 
Mathieu Desnoyers
Computer Engineering Ph.D. Student, Ecole Polytechnique de Montreal
OpenPGP key fingerprint: 8CD5 52C3 8E3C 4140 715F  BA06 3F25 A8FE 3BAE 9A68

^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: [PATCH 1/7] Simple Performance Counters: Core Piece
  2007-08-17 16:42 ` Mathieu Desnoyers
@ 2007-08-17 20:42   ` Christoph Lameter
  2007-08-18  3:25     ` Mathieu Desnoyers
  0 siblings, 1 reply; 5+ messages in thread
From: Christoph Lameter @ 2007-08-17 20:42 UTC (permalink / raw)
  To: Mathieu Desnoyers; +Cc: linux-kernel

On Fri, 17 Aug 2007, Mathieu Desnoyers wrote:

> Actually, get_cycles() at least on some AMD cpus, do not synchronize the
> core, which can skew the results. You might want to use
> get_cycles_sync() there.

get_cycle() results as used here are bound to a single processor. If we 
end up on a different processor at the end of the measurement then the 
result is discarded. So no need for get_cycles_sync.

^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: [PATCH 1/7] Simple Performance Counters: Core Piece
  2007-08-17 20:42   ` Christoph Lameter
@ 2007-08-18  3:25     ` Mathieu Desnoyers
  0 siblings, 0 replies; 5+ messages in thread
From: Mathieu Desnoyers @ 2007-08-18  3:25 UTC (permalink / raw)
  To: Christoph Lameter; +Cc: linux-kernel

* Christoph Lameter (clameter@sgi.com) wrote:
> On Fri, 17 Aug 2007, Mathieu Desnoyers wrote:
> 
> > Actually, get_cycles() at least on some AMD cpus, do not synchronize the
> > core, which can skew the results. You might want to use
> > get_cycles_sync() there.
> 
> get_cycle() results as used here are bound to a single processor. If we 
> end up on a different processor at the end of the measurement then the 
> result is discarded. So not need for get_cycles_sync.
> 

I may be wrong, but I think the UP case still needs to synchronize the
core to have precise measurement. This sync core will make sure that
rdtsc is not executed speculatively. It therefore makes sure it is not
misplaced in the instruction stream. I therefore don't think this is a
SMP special case.

Mathieu

-- 
Mathieu Desnoyers
Computer Engineering Ph.D. Student, Ecole Polytechnique de Montreal
OpenPGP key fingerprint: 8CD5 52C3 8E3C 4140 715F  BA06 3F25 A8FE 3BAE 9A68

^ permalink raw reply	[flat|nested] 5+ messages in thread

end of thread, other threads:[~2007-08-18  3:26 UTC | newest]

Thread overview: 5+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2007-07-31 23:25 [PATCH 1/7] Simple Performance Counters: Core Piece Christoph Lameter
2007-08-01  1:30 ` Frank Ch. Eigler
2007-08-17 16:42 ` Mathieu Desnoyers
2007-08-17 20:42   ` Christoph Lameter
2007-08-18  3:25     ` Mathieu Desnoyers

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox