public inbox for linux-kernel@vger.kernel.org
 help / color / mirror / Atom feed
* [RFC][Patch 1/1] Oprofile Multiplexing
@ 2008-05-27 20:08 Jason Yeh
  2008-05-29  8:10 ` Andrew Morton
  0 siblings, 1 reply; 4+ messages in thread
From: Jason Yeh @ 2008-05-27 20:08 UTC (permalink / raw)
  To: linux-kernel

hi,

This patch adds multiplexing functionality to Oprofile driver and also
the AMD arch specific handler. Basically, the mechanism schedules delayed
work and swaps the contents of event counter and event control in arch
specific handler when the scheduled work is executed.

Please let me know if you have any comment or question. Thanks.

Signed-off-by: Jason Yeh <jason.yeh@amd.com>

---

  arch/x86/oprofile/nmi_int.c         |   22 ++++++
  arch/x86/oprofile/op_counter.h      |    3
  arch/x86/oprofile/op_model_athlon.c |  125 +++++++++++++++++++++++++++++-------
  arch/x86/oprofile/op_x86_model.h    |    2
  drivers/oprofile/oprof.c            |   48 +++++++++++++
  drivers/oprofile/oprof.h            |    4 -
  drivers/oprofile/oprofile_files.c   |   35 +++++++++-
  include/linux/oprofile.h            |    3
  8 files changed, 215 insertions(+), 27 deletions(-)


diff --git a/arch/x86/oprofile/nmi_int.c b/arch/x86/oprofile/nmi_int.c
index cc48d3f..4995aff 100644
--- a/arch/x86/oprofile/nmi_int.c
+++ b/arch/x86/oprofile/nmi_int.c
@@ -80,6 +80,24 @@ static void exit_sysfs(void)
  #define exit_sysfs() do { } while (0)
  #endif /* CONFIG_PM */

+static void nmi_cpu_switch(void *dummy)
+{
+	struct op_msrs *msrs = &__get_cpu_var(cpu_msrs);
+	model->switch_ctrs(msrs);
+}
+
+static int nmi_switch_event(void)
+{
+	/* Checking CPU 0 should be sufficient */
+	struct op_msrs const *msrs = &per_cpu(cpu_msrs, 0);
+
+	if (model->check_multiplexing(msrs) < 0)
+		return -EINVAL;
+
+	on_each_cpu(nmi_cpu_switch, NULL, 0, 1);
+	return 0;
+}
+
  static int profile_exceptions_notify(struct notifier_block *self,
  				     unsigned long val, void *data)
  {
@@ -326,6 +344,7 @@ static int nmi_create_files(struct super_block *sb, struct dentry *root)
  		oprofilefs_create_ulong(sb, dir, "unit_mask", &counter_config[i].unit_mask);
  		oprofilefs_create_ulong(sb, dir, "kernel", &counter_config[i].kernel);
  		oprofilefs_create_ulong(sb, dir, "user", &counter_config[i].user);
+		counter_config[i].save_count_low = 0;
  	}

  	return 0;
@@ -419,7 +438,7 @@ int __init op_nmi_init(struct oprofile_operations *ops)
  			break;
  		case 0x10:
  			model = &op_athlon_spec;
-			cpu_type = "x86-64/family10";
+			cpu_type = "x86-64/family10h";
  			break;
  		}
  		break;
@@ -455,6 +474,7 @@ int __init op_nmi_init(struct oprofile_operations *ops)
  	ops->start = nmi_start;
  	ops->stop = nmi_stop;
  	ops->cpu_type = cpu_type;
+	ops->switch_events = nmi_switch_event;
  	printk(KERN_INFO "oprofile: using NMI interrupt.\n");
  	return 0;
  }
diff --git a/arch/x86/oprofile/op_counter.h b/arch/x86/oprofile/op_counter.h
index 2880b15..786d6e0 100644
--- a/arch/x86/oprofile/op_counter.h
+++ b/arch/x86/oprofile/op_counter.h
@@ -10,13 +10,14 @@
  #ifndef OP_COUNTER_H
  #define OP_COUNTER_H

-#define OP_MAX_COUNTER 8
+#define OP_MAX_COUNTER 32

  /* Per-perfctr configuration as set via
   * oprofilefs.
   */
  struct op_counter_config {
          unsigned long count;
+		unsigned long save_count_low;
          unsigned long enabled;
          unsigned long event;
          unsigned long kernel;
diff --git a/arch/x86/oprofile/op_model_athlon.c b/arch/x86/oprofile/op_model_athlon.c
index 3d53487..2684683 100644
--- a/arch/x86/oprofile/op_model_athlon.c
+++ b/arch/x86/oprofile/op_model_athlon.c
@@ -11,6 +11,7 @@
   */

  #include <linux/oprofile.h>
+#include <linux/percpu.h>
  #include <asm/ptrace.h>
  #include <asm/msr.h>
  #include <asm/nmi.h>
@@ -18,8 +19,10 @@
  #include "op_x86_model.h"
  #include "op_counter.h"

-#define NUM_COUNTERS 4
-#define NUM_CONTROLS 4
+#define NUM_COUNTERS 32
+#define NUM_HARDWARE_COUNTERS 4
+#define NUM_CONTROLS 32
+#define NUM_HARDWARE_CONTROLS 4

  #define CTR_IS_RESERVED(msrs, c) (msrs->counters[(c)].addr ? 1 : 0)
  #define CTR_READ(l, h, msrs, c) do {rdmsr(msrs->counters[(c)].addr, (l), (h)); } while (0)
@@ -43,21 +46,24 @@
  #define CTRL_SET_GUEST_ONLY(val, h) (val |= ((h & 1) << 8))

  static unsigned long reset_value[NUM_COUNTERS];
+DEFINE_PER_CPU(int, switch_index);

  static void athlon_fill_in_addresses(struct op_msrs * const msrs)
  {
  	int i;

  	for (i = 0; i < NUM_COUNTERS; i++) {
-		if (reserve_perfctr_nmi(MSR_K7_PERFCTR0 + i))
-			msrs->counters[i].addr = MSR_K7_PERFCTR0 + i;
+		int hw_counter = i % NUM_HARDWARE_COUNTERS;
+ 		if (reserve_perfctr_nmi(MSR_K7_PERFCTR0 + hw_counter))
+ 			msrs->counters[i].addr = MSR_K7_PERFCTR0 + hw_counter;
  		else
  			msrs->counters[i].addr = 0;
  	}

  	for (i = 0; i < NUM_CONTROLS; i++) {
-		if (reserve_evntsel_nmi(MSR_K7_EVNTSEL0 + i))
-			msrs->controls[i].addr = MSR_K7_EVNTSEL0 + i;
+		int hw_control = i % NUM_HARDWARE_CONTROLS;
+		if (reserve_evntsel_nmi(MSR_K7_EVNTSEL0 + hw_control))
+			msrs->controls[i].addr = MSR_K7_EVNTSEL0 + hw_control;
  		else
  			msrs->controls[i].addr = 0;
  	}
@@ -69,8 +75,15 @@ static void athlon_setup_ctrs(struct op_msrs const * const msrs)
  	unsigned int low, high;
  	int i;

+	for (i = 0; i < NUM_COUNTERS; ++i) {
+		if (counter_config[i].enabled)
+			reset_value[i] = counter_config[i].count;
+		else
+			reset_value[i] = 0;
+	}
+
  	/* clear all counters */
-	for (i = 0 ; i < NUM_CONTROLS; ++i) {
+	for (i = 0 ; i < NUM_HARDWARE_CONTROLS; ++i) {
  		if (unlikely(!CTRL_IS_RESERVED(msrs, i)))
  			continue;
  		CTRL_READ(low, high, msrs, i);
@@ -80,14 +93,14 @@ static void athlon_setup_ctrs(struct op_msrs const * const msrs)
  	}

  	/* avoid a false detection of ctr overflows in NMI handler */
-	for (i = 0; i < NUM_COUNTERS; ++i) {
+	for (i = 0; i < NUM_HARDWARE_COUNTERS; ++i) {
  		if (unlikely(!CTR_IS_RESERVED(msrs, i)))
  			continue;
  		CTR_WRITE(1, msrs, i);
  	}

  	/* enable active counters */
-	for (i = 0; i < NUM_COUNTERS; ++i) {
+	for (i = 0; i < NUM_HARDWARE_COUNTERS; ++i) {
  		if ((counter_config[i].enabled) && (CTR_IS_RESERVED(msrs, i))) {
  			reset_value[i] = counter_config[i].count;

@@ -106,26 +119,41 @@ static void athlon_setup_ctrs(struct op_msrs const * const msrs)
  			CTRL_SET_GUEST_ONLY(high, 0);

  			CTRL_WRITE(low, high, msrs, i);
-		} else {
-			reset_value[i] = 0;
  		}
  	}
  }


+/*
+ * Quick check to see if multiplexing is necessary.
+ * The check should be efficient since counters are used
+ * in order.
+ */
+static int athlon_check_multiplexing(struct op_msrs const * const msrs)
+{
+	int ret = 0;
+
+	if (!counter_config[NUM_HARDWARE_COUNTERS].count)
+		ret = -EINVAL;
+	
+	return ret;
+}
+
+
  static int athlon_check_ctrs(struct pt_regs * const regs,
  			     struct op_msrs const * const msrs)
  {
  	unsigned int low, high;
  	int i;

-	for (i = 0 ; i < NUM_COUNTERS; ++i) {
-		if (!reset_value[i])
+	for (i = 0 ; i < NUM_HARDWARE_COUNTERS ; ++i) {
+		int offset = i + __get_cpu_var(switch_index);
+		if (!reset_value[offset])
  			continue;
  		CTR_READ(low, high, msrs, i);
  		if (CTR_OVERFLOWED(low)) {
-			oprofile_add_sample(regs, i);
-			CTR_WRITE(reset_value[i], msrs, i);
+			oprofile_add_sample(regs, offset);
+			CTR_WRITE(reset_value[offset], msrs, i);
  		}
  	}

@@ -138,13 +166,14 @@ static void athlon_start(struct op_msrs const * const msrs)
  {
  	unsigned int low, high;
  	int i;
-	for (i = 0 ; i < NUM_COUNTERS ; ++i) {
+	for (i = 0 ; i < NUM_HARDWARE_COUNTERS ; ++i) {
  		if (reset_value[i]) {
  			CTRL_READ(low, high, msrs, i);
  			CTRL_SET_ACTIVE(low);
  			CTRL_WRITE(low, high, msrs, i);
  		}
  	}
+	__get_cpu_var(switch_index) = 0;
  }


@@ -155,8 +184,8 @@ static void athlon_stop(struct op_msrs const * const msrs)

  	/* Subtle: stop on all counters to avoid race with
  	 * setting our pm callback */
-	for (i = 0 ; i < NUM_COUNTERS ; ++i) {
-		if (!reset_value[i])
+	for (i = 0 ; i < NUM_HARDWARE_COUNTERS ; ++i) {
+		if (!reset_value[i + per_cpu(switch_index, smp_processor_id())])
  			continue;
  		CTRL_READ(low, high, msrs, i);
  		CTRL_SET_INACTIVE(low);
@@ -164,15 +193,67 @@ static void athlon_stop(struct op_msrs const * const msrs)
  	}
  }

+
+static void athlon_switch_ctrs(struct op_msrs const * const msrs)
+{
+	unsigned int low, high;
+	int i, s = per_cpu(switch_index, smp_processor_id());
+
+	athlon_stop(msrs);
+
+	/* save the current hw counts */
+	for (i = 0; i < NUM_HARDWARE_COUNTERS; ++i) {
+		int offset = i + s;
+		if (!reset_value[offset])
+			continue;
+		CTR_READ(low, high, msrs, i);
+		/* convert counter value to actual count, assume high = -1 */
+		counter_config[offset].save_count_low = (unsigned int)-1 - low - 1;
+	}
+
+
+	/* move to next eventset */
+	s += NUM_HARDWARE_COUNTERS;
+	if ((s > NUM_HARDWARE_COUNTERS) || (counter_config[s].count == 0)) {
+		per_cpu(switch_index, smp_processor_id()) = 0;
+		s = 0;
+	} else
+		per_cpu(switch_index, smp_processor_id()) = s;
+
+	/* enable next active counters */
+	for (i = 0; i < NUM_HARDWARE_COUNTERS; ++i) {
+		int offset = i + s;
+		if ((counter_config[offset].enabled) && (CTR_IS_RESERVED(msrs,i))) {
+			if (unlikely(!counter_config[offset].save_count_low))
+				counter_config[offset].save_count_low = counter_config[offset].count;
+			CTR_WRITE(counter_config[offset].save_count_low, msrs, i);
+			CTRL_READ(low, high, msrs, i);
+			CTRL_CLEAR_LO(low);
+			CTRL_CLEAR_HI(high);
+			CTRL_SET_ENABLE(low);
+			CTRL_SET_USR(low, counter_config[offset].user);
+			CTRL_SET_KERN(low, counter_config[offset].kernel);
+			CTRL_SET_UM(low, counter_config[offset].unit_mask);
+			CTRL_SET_EVENT_LOW(low, counter_config[offset].event);
+			CTRL_SET_EVENT_HIGH(high, counter_config[offset].event);
+			CTRL_SET_HOST_ONLY(high, 0);
+			CTRL_SET_GUEST_ONLY(high, 0);
+			CTRL_SET_ACTIVE(low);
+			CTRL_WRITE(low, high, msrs, i);
+		}
+	}
+}
+
+
  static void athlon_shutdown(struct op_msrs const * const msrs)
  {
  	int i;

-	for (i = 0 ; i < NUM_COUNTERS ; ++i) {
+	for (i = 0 ; i < NUM_HARDWARE_COUNTERS ; ++i) {
  		if (CTR_IS_RESERVED(msrs, i))
  			release_perfctr_nmi(MSR_K7_PERFCTR0 + i);
  	}
-	for (i = 0 ; i < NUM_CONTROLS ; ++i) {
+	for (i = 0 ; i < NUM_HARDWARE_COUNTERS ; ++i) {
  		if (CTRL_IS_RESERVED(msrs, i))
  			release_evntsel_nmi(MSR_K7_EVNTSEL0 + i);
  	}
@@ -186,5 +267,7 @@ struct op_x86_model_spec const op_athlon_spec = {
  	.check_ctrs = &athlon_check_ctrs,
  	.start = &athlon_start,
  	.stop = &athlon_stop,
-	.shutdown = &athlon_shutdown
+	.shutdown = &athlon_shutdown,
+	.switch_ctrs = &athlon_switch_ctrs,
+	.check_multiplexing = &athlon_check_multiplexing
  };
diff --git a/arch/x86/oprofile/op_x86_model.h b/arch/x86/oprofile/op_x86_model.h
index 45b605f..45003c2 100644
--- a/arch/x86/oprofile/op_x86_model.h
+++ b/arch/x86/oprofile/op_x86_model.h
@@ -41,6 +41,8 @@ struct op_x86_model_spec {
  	void (*start)(struct op_msrs const * const msrs);
  	void (*stop)(struct op_msrs const * const msrs);
  	void (*shutdown)(struct op_msrs const * const msrs);
+	void (*switch_ctrs)(struct op_msrs const * const msrs);
+	int (*check_multiplexing)(struct op_msrs const * const msrs);
  };

  extern struct op_x86_model_spec const op_ppro_spec;
diff --git a/drivers/oprofile/oprof.c b/drivers/oprofile/oprof.c
index 2c64517..ca5bd7b 100644
--- a/drivers/oprofile/oprof.c
+++ b/drivers/oprofile/oprof.c
@@ -12,6 +12,7 @@
  #include <linux/init.h>
  #include <linux/oprofile.h>
  #include <linux/moduleparam.h>
+#include <linux/workqueue.h>
  #include <asm/mutex.h>

  #include "oprof.h"
@@ -24,6 +25,9 @@ struct oprofile_operations oprofile_ops;

  unsigned long oprofile_started;
  unsigned long backtrace_depth;
+/* Multiplexing defaults to 5 seconds (time_slice is in microseconds) */
+unsigned long time_slice = 5 * USEC_PER_SEC;
+struct delayed_work switch_work;
  static unsigned long is_setup;
  static DEFINE_MUTEX(start_mutex);

@@ -87,6 +91,16 @@ out:
  	return err;
  }

+static void start_switch_worker(void)
+{
+	schedule_delayed_work(&switch_work, usecs_to_jiffies(time_slice));
+}
+
+static void switch_worker(unsigned long ptr)
+{
+	if (!oprofile_ops.switch_events())
+		start_switch_worker();
+}

  /* Actually start profiling (echo 1>/dev/oprofile/enable) */
  int oprofile_start(void)
@@ -94,7 +108,6 @@ int oprofile_start(void)
  	int err = -EINVAL;

  	mutex_lock(&start_mutex);
-
  	if (!is_setup)
  		goto out;

@@ -108,6 +121,9 @@ int oprofile_start(void)
  	if ((err = oprofile_ops.start()))
  		goto out;

+	if (oprofile_ops.switch_events)
+		start_switch_worker();
+
  	oprofile_started = 1;
  out:
  	mutex_unlock(&start_mutex);
@@ -123,6 +139,7 @@ void oprofile_stop(void)
  		goto out;
  	oprofile_ops.stop();
  	oprofile_started = 0;
+	cancel_delayed_work_sync(&switch_work);
  	/* wake up the daemon to read what remains */
  	wake_up_buffer_waiter();
  out:
@@ -155,6 +172,29 @@ post_sync:
  	mutex_unlock(&start_mutex);
  }

+int oprofile_set_time_slice(unsigned long val)
+{
+	int err = 0;
+
+	mutex_lock(&start_mutex);
+
+	if (oprofile_started) {
+		err = -EBUSY;
+		goto out;
+	}
+
+	if (!oprofile_ops.switch_events) {
+		err = -EINVAL;
+		goto out;
+	}
+
+	time_slice = val;
+
+out:
+	mutex_unlock(&start_mutex);
+	return err;
+
+}

  int oprofile_set_backtrace(unsigned long val)
  {
@@ -179,10 +219,16 @@ out:
  	return err;
  }

+static void __init oprofile_switch_timer_init(void)
+{
+	INIT_DELAYED_WORK(&switch_work, switch_worker);
+}
+
  static int __init oprofile_init(void)
  {
  	int err;

+	oprofile_switch_timer_init();
  	err = oprofile_arch_init(&oprofile_ops);

  	if (err < 0 || timer) {
diff --git a/drivers/oprofile/oprof.h b/drivers/oprofile/oprof.h
index 1832365..fc3a2bd 100644
--- a/drivers/oprofile/oprof.h
+++ b/drivers/oprofile/oprof.h
@@ -27,7 +27,8 @@ extern unsigned long fs_buffer_watershed;
  extern struct oprofile_operations oprofile_ops;
  extern unsigned long oprofile_started;
  extern unsigned long backtrace_depth;
-
+extern unsigned long time_slice;
+
  struct super_block;
  struct dentry;

@@ -35,5 +36,6 @@ void oprofile_create_files(struct super_block * sb, struct dentry * root);
  void oprofile_timer_init(struct oprofile_operations * ops);

  int oprofile_set_backtrace(unsigned long depth);
+int oprofile_set_time_slice(unsigned long time);

  #endif /* OPROF_H */
diff --git a/drivers/oprofile/oprofile_files.c b/drivers/oprofile/oprofile_files.c
index ef953ba..25c78dd 100644
--- a/drivers/oprofile/oprofile_files.c
+++ b/drivers/oprofile/oprofile_files.c
@@ -18,6 +18,37 @@ unsigned long fs_buffer_size = 131072;
  unsigned long fs_cpu_buffer_size = 8192;
  unsigned long fs_buffer_watershed = 32768; /* FIXME: tune */

+static ssize_t time_slice_read(struct file * file, char __user * buf, size_t count, loff_t * offset)
+{
+	return oprofilefs_ulong_to_user(time_slice, buf, count, offset);
+}
+
+
+static ssize_t time_slice_write(struct file * file, char const __user * buf, size_t count, loff_t * offset)
+{
+	unsigned long val;
+	int retval;
+
+	if (*offset)
+		return -EINVAL;
+
+	retval = oprofilefs_ulong_from_user(&val, buf, count);
+	if (retval)
+		return retval;
+
+	retval = oprofile_set_time_slice(val);
+
+	if (retval)
+		return retval;
+	return count;
+}
+
+static const struct file_operations time_slice_fops = {
+	.read		= time_slice_read,
+	.write		= time_slice_write
+};
+
+
  static ssize_t depth_read(struct file * file, char __user * buf, size_t count, loff_t * offset)
  {
  	return oprofilefs_ulong_to_user(backtrace_depth, buf, count, offset);
@@ -85,11 +116,10 @@ static ssize_t enable_write(struct file * file, char const __user * buf, size_t

  	if (*offset)
  		return -EINVAL;
-
  	retval = oprofilefs_ulong_from_user(&val, buf, count);
  	if (retval)
  		return retval;
-
+
  	if (val)
  		retval = oprofile_start();
  	else
@@ -129,6 +159,7 @@ void oprofile_create_files(struct super_block * sb, struct dentry * root)
  	oprofilefs_create_file(sb, root, "cpu_type", &cpu_type_fops);
  	oprofilefs_create_file(sb, root, "backtrace_depth", &depth_fops);
  	oprofilefs_create_file(sb, root, "pointer_size", &pointer_size_fops);
+	oprofilefs_create_file(sb, root, "time_slice", &time_slice_fops);
  	oprofile_create_stats_files(sb, root);
  	if (oprofile_ops.create_files)
  		oprofile_ops.create_files(sb, root);
diff --git a/include/linux/oprofile.h b/include/linux/oprofile.h
index 041bb31..6c764cb 100644
--- a/include/linux/oprofile.h
+++ b/include/linux/oprofile.h
@@ -65,6 +65,9 @@ struct oprofile_operations {

  	/* Initiate a stack backtrace. Optional. */
  	void (*backtrace)(struct pt_regs * const regs, unsigned int depth);
+
+	/* Multiplex between different events. Optional. */
+	int (*switch_events)(void);
  	/* CPU identification string. */
  	char * cpu_type;
  };


^ permalink raw reply related	[flat|nested] 4+ messages in thread

* Re: [RFC][Patch 1/1] Oprofile Multiplexing
  2008-05-27 20:08 [RFC][Patch 1/1] Oprofile Multiplexing Jason Yeh
@ 2008-05-29  8:10 ` Andrew Morton
  2008-05-29 12:25   ` Mathieu Desnoyers
  0 siblings, 1 reply; 4+ messages in thread
From: Andrew Morton @ 2008-05-29  8:10 UTC (permalink / raw)
  To: Jason Yeh; +Cc: linux-kernel, Philippe Elie

On Tue, 27 May 2008 15:08:56 -0500 Jason Yeh <jason.yeh@amd.com> wrote:

> hi,
> 
> This patch adds multiplexing functionality to Oprofile driver and also
> the AMD arch specific handler. Basically, the mechanism schedules delayed
> work and swaps the contents of event counter and event control in arch
> specific handler when the scheduled work is executed.
> 
> Please let me know if you have any comment or question. Thanks.
> 

In the next version, please fully describe the patch in the changelog. 
Such as: what "multiplexing" is!

Your email client has space-stuffed the patch. 
http://mbligh.org/linuxdocs/Email/Clients/Thunderbird has repair
instructions.

Your patch has a number of trivial coding-style errors.  Please feed it
through scripts/checkpatch.pl and consider the result.

> 
> ---
> 
>   arch/x86/oprofile/nmi_int.c         |   22 ++++++
>   arch/x86/oprofile/op_counter.h      |    3
>   arch/x86/oprofile/op_model_athlon.c |  125 +++++++++++++++++++++++++++++-------
>   arch/x86/oprofile/op_x86_model.h    |    2
>   drivers/oprofile/oprof.c            |   48 +++++++++++++
>   drivers/oprofile/oprof.h            |    4 -
>   drivers/oprofile/oprofile_files.c   |   35 +++++++++-
>   include/linux/oprofile.h            |    3

It's half an oprofile patch and half an x86 patch.  Please cc the x86
maintainers and myself on the next version and we'll work out who should
be merging it.

Please also cc the oprofile maintainer and mailing list.

>
> ...
>
>   static int profile_exceptions_notify(struct notifier_block *self,
>   				     unsigned long val, void *data)
>   {
> @@ -326,6 +344,7 @@ static int nmi_create_files(struct super_block *sb, struct dentry *root)
>   		oprofilefs_create_ulong(sb, dir, "unit_mask", &counter_config[i].unit_mask);
>   		oprofilefs_create_ulong(sb, dir, "kernel", &counter_config[i].kernel);
>   		oprofilefs_create_ulong(sb, dir, "user", &counter_config[i].user);
> +		counter_config[i].save_count_low = 0;
>   	}
> 
>   	return 0;
> @@ -419,7 +438,7 @@ int __init op_nmi_init(struct oprofile_operations *ops)
>   			break;
>   		case 0x10:
>   			model = &op_athlon_spec;
> -			cpu_type = "x86-64/family10";
> +			cpu_type = "x86-64/family10h";

Are there any userspace compatibility implications here?


>   			break;
>   		}
>   		break;
> @@ -455,6 +474,7 @@ int __init op_nmi_init(struct oprofile_operations *ops)
>   	ops->start = nmi_start;
>   	ops->stop = nmi_stop;
>   	ops->cpu_type = cpu_type;
> +	ops->switch_events = nmi_switch_event;
>   	printk(KERN_INFO "oprofile: using NMI interrupt.\n");
>   	return 0;
>   }
> diff --git a/arch/x86/oprofile/op_counter.h b/arch/x86/oprofile/op_counter.h
> index 2880b15..786d6e0 100644
> --- a/arch/x86/oprofile/op_counter.h
> +++ b/arch/x86/oprofile/op_counter.h
> @@ -10,13 +10,14 @@
>   #ifndef OP_COUNTER_H
>   #define OP_COUNTER_H
> 
> -#define OP_MAX_COUNTER 8
> +#define OP_MAX_COUNTER 32

What was the reason for changing this?

>   /* Per-perfctr configuration as set via
>    * oprofilefs.
>    */
>   struct op_counter_config {
>           unsigned long count;
> +		unsigned long save_count_low;

Extra tab.

>           unsigned long enabled;
>           unsigned long event;
>           unsigned long kernel;
> diff --git a/arch/x86/oprofile/op_model_athlon.c b/arch/x86/oprofile/op_model_athlon.c
> index 3d53487..2684683 100644
> --- a/arch/x86/oprofile/op_model_athlon.c
> +++ b/arch/x86/oprofile/op_model_athlon.c
> @@ -11,6 +11,7 @@
>    */
> 
>   #include <linux/oprofile.h>
> +#include <linux/percpu.h>
>   #include <asm/ptrace.h>
>   #include <asm/msr.h>
>   #include <asm/nmi.h>
> @@ -18,8 +19,10 @@
>   #include "op_x86_model.h"
>   #include "op_counter.h"
> 
> -#define NUM_COUNTERS 4
> -#define NUM_CONTROLS 4
> +#define NUM_COUNTERS 32
> +#define NUM_HARDWARE_COUNTERS 4
> +#define NUM_CONTROLS 32
> +#define NUM_HARDWARE_CONTROLS 4

reasons?

>   #define CTR_IS_RESERVED(msrs, c) (msrs->counters[(c)].addr ? 1 : 0)
>   #define CTR_READ(l, h, msrs, c) do {rdmsr(msrs->counters[(c)].addr, (l), (h)); } while (0)
> @@ -43,21 +46,24 @@
>   #define CTRL_SET_GUEST_ONLY(val, h) (val |= ((h & 1) << 8))
> 
>   static unsigned long reset_value[NUM_COUNTERS];
> +DEFINE_PER_CPU(int, switch_index);

Can be made static.

>   static void athlon_fill_in_addresses(struct op_msrs * const msrs)
>   {
>   	int i;
> 
>   	for (i = 0; i < NUM_COUNTERS; i++) {
> -		if (reserve_perfctr_nmi(MSR_K7_PERFCTR0 + i))
> -			msrs->counters[i].addr = MSR_K7_PERFCTR0 + i;
> +		int hw_counter = i % NUM_HARDWARE_COUNTERS;
> + 		if (reserve_perfctr_nmi(MSR_K7_PERFCTR0 + hw_counter))
> + 			msrs->counters[i].addr = MSR_K7_PERFCTR0 + hw_counter;
>   		else
>   			msrs->counters[i].addr = 0;
>   	}
> 
>   	for (i = 0; i < NUM_CONTROLS; i++) {
> -		if (reserve_evntsel_nmi(MSR_K7_EVNTSEL0 + i))
> -			msrs->controls[i].addr = MSR_K7_EVNTSEL0 + i;
> +		int hw_control = i % NUM_HARDWARE_CONTROLS;
> +		if (reserve_evntsel_nmi(MSR_K7_EVNTSEL0 + hw_control))
> +			msrs->controls[i].addr = MSR_K7_EVNTSEL0 + hw_control;
>   		else
>   			msrs->controls[i].addr = 0;
>   	}
> @@ -69,8 +75,15 @@ static void athlon_setup_ctrs(struct op_msrs const * const msrs)
>   	unsigned int low, high;
>   	int i;
> 
> +	for (i = 0; i < NUM_COUNTERS; ++i) {
> +		if (counter_config[i].enabled)
> +			reset_value[i] = counter_config[i].count;
> +		else
> +			reset_value[i] = 0;
> +	}

Maybe reset_value[] was all-zeroes here?

>   	/* clear all counters */
> -	for (i = 0 ; i < NUM_CONTROLS; ++i) {
> +	for (i = 0 ; i < NUM_HARDWARE_CONTROLS; ++i) {

checkpatch will complain, even though you're just retaining previous
bustage.  Feel free to unbust things as you alter them.

>
> ...
>
>   static int profile_exceptions_notify(struct notifier_block *self,
> +/*
> + * Quick check to see if multiplexing is necessary.
> + * The check should be efficient since counters are used
> + * in ordre.
> + */
> +static int athlon_check_multiplexing(struct op_msrs const * const msrs)
> +{
> +	int ret = 0;
> +
> +	if (!counter_config[NUM_HARDWARE_COUNTERS].count)
> +		ret = -EINVAL;
> +	
> +	return ret;
> +}

That was a bit verbose.

	return counter_config[NUM_HARDWARE_COUNTERS].count ? 0 : -EINVAL;

?

> +
>   static int athlon_check_ctrs(struct pt_regs * const regs,
>   			     struct op_msrs const * const msrs)
>   {
>   	unsigned int low, high;
>   	int i;
> 
> -	for (i = 0 ; i < NUM_COUNTERS; ++i) {
> -		if (!reset_value[i])
> +	for (i = 0 ; i < NUM_HARDWARE_COUNTERS ; ++i) {
> +		int offset = i + __get_cpu_var(switch_index);
> +		if (!reset_value[offset])
>   			continue;
>   		CTR_READ(low, high, msrs, i);
>   		if (CTR_OVERFLOWED(low)) {
> -			oprofile_add_sample(regs, i);
> -			CTR_WRITE(reset_value[i], msrs, i);
> +			oprofile_add_sample(regs, offset);
> +			CTR_WRITE(reset_value[offset], msrs, i);
>   		}
>   	}

Is preemption always disabled by the op_x86_model_spec.check_ctrs()
caller?  If not we have a race here and warnings will be generated if
suitable debugging options were enabled.

We this code runtime tested with all debugging options enabled? 
Documentation/SubmitChecklist has info about this.

> @@ -138,13 +166,14 @@ static void athlon_start(struct op_msrs const * const msrs)
>   {
>   	unsigned int low, high;
>   	int i;
> -	for (i = 0 ; i < NUM_COUNTERS ; ++i) {
> +	for (i = 0 ; i < NUM_HARDWARE_COUNTERS ; ++i) {
>   		if (reset_value[i]) {
>   			CTRL_READ(low, high, msrs, i);
>   			CTRL_SET_ACTIVE(low);
>   			CTRL_WRITE(low, high, msrs, i);
>   		}
>   	}
> +	__get_cpu_var(switch_index) = 0;
>   }

Ditto.

> 
> @@ -155,8 +184,8 @@ static void athlon_stop(struct op_msrs const * const msrs)
> 
>   	/* Subtle: stop on all counters to avoid race with
>   	 * setting our pm callback */
> -	for (i = 0 ; i < NUM_COUNTERS ; ++i) {
> -		if (!reset_value[i])
> +	for (i = 0 ; i < NUM_HARDWARE_COUNTERS ; ++i) {
> +		if (!reset_value[i + per_cpu(switch_index, smp_processor_id())])
>   			continue;
>   		CTRL_READ(low, high, msrs, i);
>   		CTRL_SET_INACTIVE(low);
> @@ -164,15 +193,67 @@ static void athlon_stop(struct op_msrs const * const msrs)
>   	}
>   }
> 
> +
> +static void athlon_switch_ctrs(struct op_msrs const * const msrs)
> +{
> +	unsigned int low, high;
> +	int i, s = per_cpu(switch_index, smp_processor_id());

More dittoes.

> +	athlon_stop(msrs);
> +
> +	/* save the current hw counts */
> +	for (i = 0; i < NUM_HARDWARE_COUNTERS; ++i) {
> +		int offset = i + s;
> +		if (!reset_value[offset])
> +			continue;
> +		CTR_READ(low, high, msrs, i);
> +		/* convert counter value to actual count, assume high = -1 */
> +		counter_config[offset].save_count_low = (unsigned int)-1 - low - 1;
> +	}
> +
> +
> +	/* move to next eventset */
> +	s += NUM_HARDWARE_COUNTERS;
> +	if ((s > NUM_HARDWARE_COUNTERS) || (counter_config[s].count == 0)) {
> +		per_cpu(switch_index, smp_processor_id()) = 0;
> +		s = 0;
> +	} else
> +		per_cpu(switch_index, smp_processor_id()) = s;
> +
> +	/* enable next active counters */
> +	for (i = 0; i < NUM_HARDWARE_COUNTERS; ++i) {
> +		int offset = i + s;
> +		if ((counter_config[offset].enabled) && (CTR_IS_RESERVED(msrs,i))) {
> +			if (unlikely(!counter_config[offset].save_count_low))
> +				counter_config[offset].save_count_low = counter_config[offset].count;
> +			CTR_WRITE(counter_config[offset].save_count_low, msrs, i);
> +			CTRL_READ(low, high, msrs, i);
> +			CTRL_CLEAR_LO(low);
> +			CTRL_CLEAR_HI(high);
> +			CTRL_SET_ENABLE(low);
> +			CTRL_SET_USR(low, counter_config[offset].user);
> +			CTRL_SET_KERN(low, counter_config[offset].kernel);
> +			CTRL_SET_UM(low, counter_config[offset].unit_mask);
> +			CTRL_SET_EVENT_LOW(low, counter_config[offset].event);
> +			CTRL_SET_EVENT_HIGH(high, counter_config[offset].event);
> +			CTRL_SET_HOST_ONLY(high, 0);
> +			CTRL_SET_GUEST_ONLY(high, 0);
> +			CTRL_SET_ACTIVE(low);
> +			CTRL_WRITE(low, high, msrs, i);
> +		}
> +	}
> +}
> +
>
> ...
>
>   static int profile_exceptions_notify(struct notifier_block *self,
> @@ -24,6 +25,9 @@ struct oprofile_operations oprofile_ops;
> 
>   unsigned long oprofile_started;
>   unsigned long backtrace_depth;
> +/* Multiplexing defaults at 5000 useconds */
> +unsigned long time_slice = 5 * USEC_PER_SEC	;

static?

> +struct delayed_work switch_work;

static?

Can we use DECLARE_DELAYED_WORK() here?

>   static unsigned long is_setup;
>   static DEFINE_MUTEX(start_mutex);
> 
>
> ...
>
>   static int profile_exceptions_notify(struct notifier_block *self,
> @@ -123,6 +139,7 @@ void oprofile_stop(void)
>   		goto out;
>   	oprofile_ops.stop();
>   	oprofile_started = 0;
> +	cancel_delayed_work_sync(&switch_work);

Whoa.  Yours is that one patch in twenty which remembered to do this.

>   	/* wake up the daemon to read what remains */
>   	wake_up_buffer_waiter();
>   out:
> @@ -155,6 +172,29 @@ post_sync:
>   	mutex_unlock(&start_mutex);
>   }
> 
> +int oprofile_set_time_slice(unsigned long val)
> +{
> +	int err = 0;
> +
> +	mutex_lock(&start_mutex);
> +
> +	if (oprofile_started) {
> +		err = -EBUSY;
> +		goto out;
> +	}
> +
> +	if (!oprofile_ops.switch_events) {
> +		err = -EINVAL;
> +		goto out;
> +	}
> +
> +	time_slice = val;
> +
> +out:
> +	mutex_unlock(&start_mutex);
> +	return err;
> +
> +}

Pity the poor reader who tries to work out what units `val' is in. 
Better choice of identifiers and addition of some comments would help
maintainability.

>   int oprofile_set_backtrace(unsigned long val)
>   {
> @@ -179,10 +219,16 @@ out:
>   	return err;
>   }
> 
> +static void __init oprofile_switch_timer_init(void)
> +{
> +	INIT_DELAYED_WORK(&switch_work, switch_worker);
> +}

Can be removed if DECLARE_DELAYED_WORK() is used.

>   static int __init oprofile_init(void)
>   {
>   	int err;
> 
> +	oprofile_switch_timer_init();

zap

>   	err = oprofile_arch_init(&oprofile_ops);
> 
>   	if (err < 0 || timer) {
> diff --git a/drivers/oprofile/oprof.h b/drivers/oprofile/oprof.h
> index 1832365..fc3a2bd 100644
> --- a/drivers/oprofile/oprof.h
> +++ b/drivers/oprofile/oprof.h
> @@ -27,7 +27,8 @@ extern unsigned long fs_buffer_watershed;
>   extern struct oprofile_operations oprofile_ops;
>   extern unsigned long oprofile_started;
>   extern unsigned long backtrace_depth;
> -
> +extern unsigned long time_slice;

It would really help if the identifier were to show the units also. 
time_slice_nsecs?  time_slice_fortnights?

>   struct super_block;
>   struct dentry;
> 
>
> ...
>
>   static int profile_exceptions_notify(struct notifier_block *self,
> +static ssize_t time_slice_write(struct file * file, char const __user * buf, size_t count, loff_t * offset)
> +{
> +	unsigned long val;
> +	int retval;
> +
> +	if (*offset)
> +		return -EINVAL;
> +
> +	retval = oprofilefs_ulong_from_user(&val, buf, count);
> +	if (retval)
> +		return retval;
> +
> +	retval = oprofile_set_time_slice(val);
> +
> +	if (retval)
> +		return retval;
> +	return count;
> +}
> +
> +static const struct file_operations time_slice_fops = {
> +	.read		= time_slice_read,
> +	.write		= time_slice_write

It's conventional to add a "," on the last entry here.  So that later
patches are simpler.

> +};

So we have userspace interface changes.  I assume that changes to
oprofile userspace are in the pipeline?

>
> ...
>
>   static int profile_exceptions_notify(struct notifier_block *self,
> @@ -65,6 +65,9 @@ struct oprofile_operations {
> 
>   	/* Initiate a stack backtrace. Optional. */
>   	void (*backtrace)(struct pt_regs * const regs, unsigned int depth);
> +
> +	/* Multiplex between different events. Optioinal. */

typo there.

> +	int (*switch_events)(void);
>   	/* CPU identification string. */
>   	char * cpu_type;
>   };


^ permalink raw reply	[flat|nested] 4+ messages in thread

* Re: [RFC][Patch 1/1] Oprofile Multiplexing
  2008-05-29  8:10 ` Andrew Morton
@ 2008-05-29 12:25   ` Mathieu Desnoyers
  2008-05-29 19:03     ` Jason Yeh
  0 siblings, 1 reply; 4+ messages in thread
From: Mathieu Desnoyers @ 2008-05-29 12:25 UTC (permalink / raw)
  To: Jason Yeh, Philippe Elie, Andrew Morton, linux-kernel

* Andrew Morton (akpm@linux-foundation.org) wrote:
> On Tue, 27 May 2008 15:08:56 -0500 Jason Yeh <jason.yeh@amd.com> wrote:
> 
> > hi,
> > 
> > This patch adds multiplexing functionality to Oprofile driver and also
> > the AMD arch specific handler. Basically, the mechanism schedules delayed
> > work and swaps the contents of event counter and event control in arch
> > specific handler when the scheduled work is executed.
> > 
> > Please let me know if you have any comment or question. Thanks.
> > 
> 
> In the next version, please fully describe the patch in the changelog. 
> Such as: what "multiplexing" is!
> 
> Your email client has space-stuffed the patch. 
> http://mbligh.org/linuxdocs/Email/Clients/Thunderbird has repair
> instructions.
> 
> Your patch has a number of trivial coding-style errors.  Please feed it
> through scripts/checkpatch.pl and consider the result.
> 

I have been contemplating the idea of using the performance counters in
the LTTng kernel tracer for quite a while, but I see the underlying
implementations often deal with those counters as if they were
per-process resources. Given those resources are limited amongst the
whole system, would it make sense to allow an in-kernel consumer to
connect to some of these counters on a per-CPU basis and return a
"resource not available" error to userspace when it tries to connect to
them? (and also to kick out a userspace reader when a higher-priority
in-kernel request is made, which implies that the "read" operation may
fail)

It could already be considered, but when I see such counter multiplexing
proposals, I am always afraid of how this is designed to work with
tracers interested in system-wide counters.

Thanks,

Mathieu

> > 
> > ---
> > 
> >   arch/x86/oprofile/nmi_int.c         |   22 ++++++
> >   arch/x86/oprofile/op_counter.h      |    3
> >   arch/x86/oprofile/op_model_athlon.c |  125 +++++++++++++++++++++++++++++-------
> >   arch/x86/oprofile/op_x86_model.h    |    2
> >   drivers/oprofile/oprof.c            |   48 +++++++++++++
> >   drivers/oprofile/oprof.h            |    4 -
> >   drivers/oprofile/oprofile_files.c   |   35 +++++++++-
> >   include/linux/oprofile.h            |    3
> 
> It's half an oprofile patch and half an x86 patch.  Please cc the x86
> maintainers and myself on the next version and we'll work out who should
> be merging it.
> 
> Please also cc the oprofile maintainer and mailing list.
> 
> >
> > ...
> >
> >   static int profile_exceptions_notify(struct notifier_block *self,
> >   				     unsigned long val, void *data)
> >   {
> > @@ -326,6 +344,7 @@ static int nmi_create_files(struct super_block *sb, struct dentry *root)
> >   		oprofilefs_create_ulong(sb, dir, "unit_mask", &counter_config[i].unit_mask);
> >   		oprofilefs_create_ulong(sb, dir, "kernel", &counter_config[i].kernel);
> >   		oprofilefs_create_ulong(sb, dir, "user", &counter_config[i].user);
> > +		counter_config[i].save_count_low = 0;
> >   	}
> > 
> >   	return 0;
> > @@ -419,7 +438,7 @@ int __init op_nmi_init(struct oprofile_operations *ops)
> >   			break;
> >   		case 0x10:
> >   			model = &op_athlon_spec;
> > -			cpu_type = "x86-64/family10";
> > +			cpu_type = "x86-64/family10h";
> 
> Are there any userspace compatibility implications here?
> 
> 
> >   			break;
> >   		}
> >   		break;
> > @@ -455,6 +474,7 @@ int __init op_nmi_init(struct oprofile_operations *ops)
> >   	ops->start = nmi_start;
> >   	ops->stop = nmi_stop;
> >   	ops->cpu_type = cpu_type;
> > +	ops->switch_events = nmi_switch_event;
> >   	printk(KERN_INFO "oprofile: using NMI interrupt.\n");
> >   	return 0;
> >   }
> > diff --git a/arch/x86/oprofile/op_counter.h b/arch/x86/oprofile/op_counter.h
> > index 2880b15..786d6e0 100644
> > --- a/arch/x86/oprofile/op_counter.h
> > +++ b/arch/x86/oprofile/op_counter.h
> > @@ -10,13 +10,14 @@
> >   #ifndef OP_COUNTER_H
> >   #define OP_COUNTER_H
> > 
> > -#define OP_MAX_COUNTER 8
> > +#define OP_MAX_COUNTER 32
> 
> What was the reason for changing this?
> 
> >   /* Per-perfctr configuration as set via
> >    * oprofilefs.
> >    */
> >   struct op_counter_config {
> >           unsigned long count;
> > +		unsigned long save_count_low;
> 
> Extra tab.
> 
> >           unsigned long enabled;
> >           unsigned long event;
> >           unsigned long kernel;
> > diff --git a/arch/x86/oprofile/op_model_athlon.c b/arch/x86/oprofile/op_model_athlon.c
> > index 3d53487..2684683 100644
> > --- a/arch/x86/oprofile/op_model_athlon.c
> > +++ b/arch/x86/oprofile/op_model_athlon.c
> > @@ -11,6 +11,7 @@
> >    */
> > 
> >   #include <linux/oprofile.h>
> > +#include <linux/percpu.h>
> >   #include <asm/ptrace.h>
> >   #include <asm/msr.h>
> >   #include <asm/nmi.h>
> > @@ -18,8 +19,10 @@
> >   #include "op_x86_model.h"
> >   #include "op_counter.h"
> > 
> > -#define NUM_COUNTERS 4
> > -#define NUM_CONTROLS 4
> > +#define NUM_COUNTERS 32
> > +#define NUM_HARDWARE_COUNTERS 4
> > +#define NUM_CONTROLS 32
> > +#define NUM_HARDWARE_CONTROLS 4
> 
> reasons?
> 
> >   #define CTR_IS_RESERVED(msrs, c) (msrs->counters[(c)].addr ? 1 : 0)
> >   #define CTR_READ(l, h, msrs, c) do {rdmsr(msrs->counters[(c)].addr, (l), (h)); } while (0)
> > @@ -43,21 +46,24 @@
> >   #define CTRL_SET_GUEST_ONLY(val, h) (val |= ((h & 1) << 8))
> > 
> >   static unsigned long reset_value[NUM_COUNTERS];
> > +DEFINE_PER_CPU(int, switch_index);
> 
> Can be made static.
> 
> >   static void athlon_fill_in_addresses(struct op_msrs * const msrs)
> >   {
> >   	int i;
> > 
> >   	for (i = 0; i < NUM_COUNTERS; i++) {
> > -		if (reserve_perfctr_nmi(MSR_K7_PERFCTR0 + i))
> > -			msrs->counters[i].addr = MSR_K7_PERFCTR0 + i;
> > +		int hw_counter = i % NUM_HARDWARE_COUNTERS;
> > + 		if (reserve_perfctr_nmi(MSR_K7_PERFCTR0 + hw_counter))
> > + 			msrs->counters[i].addr = MSR_K7_PERFCTR0 + hw_counter;
> >   		else
> >   			msrs->counters[i].addr = 0;
> >   	}
> > 
> >   	for (i = 0; i < NUM_CONTROLS; i++) {
> > -		if (reserve_evntsel_nmi(MSR_K7_EVNTSEL0 + i))
> > -			msrs->controls[i].addr = MSR_K7_EVNTSEL0 + i;
> > +		int hw_control = i % NUM_HARDWARE_CONTROLS;
> > +		if (reserve_evntsel_nmi(MSR_K7_EVNTSEL0 + hw_control))
> > +			msrs->controls[i].addr = MSR_K7_EVNTSEL0 + hw_control;
> >   		else
> >   			msrs->controls[i].addr = 0;
> >   	}
> > @@ -69,8 +75,15 @@ static void athlon_setup_ctrs(struct op_msrs const * const msrs)
> >   	unsigned int low, high;
> >   	int i;
> > 
> > +	for (i = 0; i < NUM_COUNTERS; ++i) {
> > +		if (counter_config[i].enabled)
> > +			reset_value[i] = counter_config[i].count;
> > +		else
> > +			reset_value[i] = 0;
> > +	}
> 
> Maybe reset_value[] was all-zeroes here?
> 
> >   	/* clear all counters */
> > -	for (i = 0 ; i < NUM_CONTROLS; ++i) {
> > +	for (i = 0 ; i < NUM_HARDWARE_CONTROLS; ++i) {
> 
> checkpatch will complain, even though you're just retaining previous
> bustage.  Feel free to unbust things as you alter them.
> 
> >
> > ...
> >
> >   static int profile_exceptions_notify(struct notifier_block *self,
> > +/*
> > + * Quick check to see if multiplexing is necessary.
> > + * The check should be efficient since counters are used
> > + * in ordre.
> > + */
> > +static int athlon_check_multiplexing(struct op_msrs const * const msrs)
> > +{
> > +	int ret = 0;
> > +
> > +	if (!counter_config[NUM_HARDWARE_COUNTERS].count)
> > +		ret = -EINVAL;
> > +	
> > +	return ret;
> > +}
> 
> That was a bit verbose.
> 
> 	return counter_config[NUM_HARDWARE_COUNTERS].count ? 0 : -EINVAL;
> 
> ?
> 
> > +
> >   static int athlon_check_ctrs(struct pt_regs * const regs,
> >   			     struct op_msrs const * const msrs)
> >   {
> >   	unsigned int low, high;
> >   	int i;
> > 
> > -	for (i = 0 ; i < NUM_COUNTERS; ++i) {
> > -		if (!reset_value[i])
> > +	for (i = 0 ; i < NUM_HARDWARE_COUNTERS ; ++i) {
> > +		int offset = i + __get_cpu_var(switch_index);
> > +		if (!reset_value[offset])
> >   			continue;
> >   		CTR_READ(low, high, msrs, i);
> >   		if (CTR_OVERFLOWED(low)) {
> > -			oprofile_add_sample(regs, i);
> > -			CTR_WRITE(reset_value[i], msrs, i);
> > +			oprofile_add_sample(regs, offset);
> > +			CTR_WRITE(reset_value[offset], msrs, i);
> >   		}
> >   	}
> 
> Is preemption always disabled by the op_x86_model_spec.check_ctrs()
> caller?  If not we have a race here and warnings will be generated if
> suitable debugging options were enabled.
> 
> We this code runtime tested with all debugging options enabled? 
> Documentation/SubmitChecklist has info about this.
> 
> > @@ -138,13 +166,14 @@ static void athlon_start(struct op_msrs const * const msrs)
> >   {
> >   	unsigned int low, high;
> >   	int i;
> > -	for (i = 0 ; i < NUM_COUNTERS ; ++i) {
> > +	for (i = 0 ; i < NUM_HARDWARE_COUNTERS ; ++i) {
> >   		if (reset_value[i]) {
> >   			CTRL_READ(low, high, msrs, i);
> >   			CTRL_SET_ACTIVE(low);
> >   			CTRL_WRITE(low, high, msrs, i);
> >   		}
> >   	}
> > +	__get_cpu_var(switch_index) = 0;
> >   }
> 
> Ditto.
> 
> > 
> > @@ -155,8 +184,8 @@ static void athlon_stop(struct op_msrs const * const msrs)
> > 
> >   	/* Subtle: stop on all counters to avoid race with
> >   	 * setting our pm callback */
> > -	for (i = 0 ; i < NUM_COUNTERS ; ++i) {
> > -		if (!reset_value[i])
> > +	for (i = 0 ; i < NUM_HARDWARE_COUNTERS ; ++i) {
> > +		if (!reset_value[i + per_cpu(switch_index, smp_processor_id())])
> >   			continue;
> >   		CTRL_READ(low, high, msrs, i);
> >   		CTRL_SET_INACTIVE(low);
> > @@ -164,15 +193,67 @@ static void athlon_stop(struct op_msrs const * const msrs)
> >   	}
> >   }
> > 
> > +
> > +static void athlon_switch_ctrs(struct op_msrs const * const msrs)
> > +{
> > +	unsigned int low, high;
> > +	int i, s = per_cpu(switch_index, smp_processor_id());
> 
> More dittoes.
> 
> > +	athlon_stop(msrs);
> > +
> > +	/* save the current hw counts */
> > +	for (i = 0; i < NUM_HARDWARE_COUNTERS; ++i) {
> > +		int offset = i + s;
> > +		if (!reset_value[offset])
> > +			continue;
> > +		CTR_READ(low, high, msrs, i);
> > +		/* convert counter value to actual count, assume high = -1 */
> > +		counter_config[offset].save_count_low = (unsigned int)-1 - low - 1;
> > +	}
> > +
> > +
> > +	/* move to next eventset */
> > +	s += NUM_HARDWARE_COUNTERS;
> > +	if ((s > NUM_HARDWARE_COUNTERS) || (counter_config[s].count == 0)) {
> > +		per_cpu(switch_index, smp_processor_id()) = 0;
> > +		s = 0;
> > +	} else
> > +		per_cpu(switch_index, smp_processor_id()) = s;
> > +
> > +	/* enable next active counters */
> > +	for (i = 0; i < NUM_HARDWARE_COUNTERS; ++i) {
> > +		int offset = i + s;
> > +		if ((counter_config[offset].enabled) && (CTR_IS_RESERVED(msrs,i))) {
> > +			if (unlikely(!counter_config[offset].save_count_low))
> > +				counter_config[offset].save_count_low = counter_config[offset].count;
> > +			CTR_WRITE(counter_config[offset].save_count_low, msrs, i);
> > +			CTRL_READ(low, high, msrs, i);
> > +			CTRL_CLEAR_LO(low);
> > +			CTRL_CLEAR_HI(high);
> > +			CTRL_SET_ENABLE(low);
> > +			CTRL_SET_USR(low, counter_config[offset].user);
> > +			CTRL_SET_KERN(low, counter_config[offset].kernel);
> > +			CTRL_SET_UM(low, counter_config[offset].unit_mask);
> > +			CTRL_SET_EVENT_LOW(low, counter_config[offset].event);
> > +			CTRL_SET_EVENT_HIGH(high, counter_config[offset].event);
> > +			CTRL_SET_HOST_ONLY(high, 0);
> > +			CTRL_SET_GUEST_ONLY(high, 0);
> > +			CTRL_SET_ACTIVE(low);
> > +			CTRL_WRITE(low, high, msrs, i);
> > +		}
> > +	}
> > +}
> > +
> >
> > ...
> >
> >   static int profile_exceptions_notify(struct notifier_block *self,
> > @@ -24,6 +25,9 @@ struct oprofile_operations oprofile_ops;
> > 
> >   unsigned long oprofile_started;
> >   unsigned long backtrace_depth;
> > +/* Multiplexing defaults at 5000 useconds */
> > +unsigned long time_slice = 5 * USEC_PER_SEC	;
> 
> static?
> 
> > +struct delayed_work switch_work;
> 
> static?
> 
> Can we use DECLARE_DELAYED_WORK() here?
> 
> >   static unsigned long is_setup;
> >   static DEFINE_MUTEX(start_mutex);
> > 
> >
> > ...
> >
> >   static int profile_exceptions_notify(struct notifier_block *self,
> > @@ -123,6 +139,7 @@ void oprofile_stop(void)
> >   		goto out;
> >   	oprofile_ops.stop();
> >   	oprofile_started = 0;
> > +	cancel_delayed_work_sync(&switch_work);
> 
> Whoa.  Yours is that one patch in twenty which remembered to do this.
> 
> >   	/* wake up the daemon to read what remains */
> >   	wake_up_buffer_waiter();
> >   out:
> > @@ -155,6 +172,29 @@ post_sync:
> >   	mutex_unlock(&start_mutex);
> >   }
> > 
> > +int oprofile_set_time_slice(unsigned long val)
> > +{
> > +	int err = 0;
> > +
> > +	mutex_lock(&start_mutex);
> > +
> > +	if (oprofile_started) {
> > +		err = -EBUSY;
> > +		goto out;
> > +	}
> > +
> > +	if (!oprofile_ops.switch_events) {
> > +		err = -EINVAL;
> > +		goto out;
> > +	}
> > +
> > +	time_slice = val;
> > +
> > +out:
> > +	mutex_unlock(&start_mutex);
> > +	return err;
> > +
> > +}
> 
> Pity the poor reader who tries to work out what units `val' is in. 
> Better choice of identifiers and addition of some comments would help
> maintainability.
> 
> >   int oprofile_set_backtrace(unsigned long val)
> >   {
> > @@ -179,10 +219,16 @@ out:
> >   	return err;
> >   }
> > 
> > +static void __init oprofile_switch_timer_init(void)
> > +{
> > +	INIT_DELAYED_WORK(&switch_work, switch_worker);
> > +}
> 
> Can be removed if DECLARE_DELAYED_WORK() is used.
> 
> >   static int __init oprofile_init(void)
> >   {
> >   	int err;
> > 
> > +	oprofile_switch_timer_init();
> 
> zap
> 
> >   	err = oprofile_arch_init(&oprofile_ops);
> > 
> >   	if (err < 0 || timer) {
> > diff --git a/drivers/oprofile/oprof.h b/drivers/oprofile/oprof.h
> > index 1832365..fc3a2bd 100644
> > --- a/drivers/oprofile/oprof.h
> > +++ b/drivers/oprofile/oprof.h
> > @@ -27,7 +27,8 @@ extern unsigned long fs_buffer_watershed;
> >   extern struct oprofile_operations oprofile_ops;
> >   extern unsigned long oprofile_started;
> >   extern unsigned long backtrace_depth;
> > -
> > +extern unsigned long time_slice;
> 
> It would really help if the identifier were to show the units also. 
> time_slice_nsecs?  time_slice_fortnights?
> 
> >   struct super_block;
> >   struct dentry;
> > 
> >
> > ...
> >
> >   static int profile_exceptions_notify(struct notifier_block *self,
> > +static ssize_t time_slice_write(struct file * file, char const __user * buf, size_t count, loff_t * offset)
> > +{
> > +	unsigned long val;
> > +	int retval;
> > +
> > +	if (*offset)
> > +		return -EINVAL;
> > +
> > +	retval = oprofilefs_ulong_from_user(&val, buf, count);
> > +	if (retval)
> > +		return retval;
> > +
> > +	retval = oprofile_set_time_slice(val);
> > +
> > +	if (retval)
> > +		return retval;
> > +	return count;
> > +}
> > +
> > +static const struct file_operations time_slice_fops = {
> > +	.read		= time_slice_read,
> > +	.write		= time_slice_write
> 
> It's conventional to add a "," on the last entry here.  So that later
> patches are simpler.
> 
> > +};
> 
> So we have userspace interface changes.  I assume that changes to
> oprofile userspace are in the pipeline?
> 
> >
> > ...
> >
> >   static int profile_exceptions_notify(struct notifier_block *self,
> > @@ -65,6 +65,9 @@ struct oprofile_operations {
> > 
> >   	/* Initiate a stack backtrace. Optional. */
> >   	void (*backtrace)(struct pt_regs * const regs, unsigned int depth);
> > +
> > +	/* Multiplex between different events. Optioinal. */
> 
> typo there.
> 
> > +	int (*switch_events)(void);
> >   	/* CPU identification string. */
> >   	char * cpu_type;
> >   };
> 
> --
> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> Please read the FAQ at  http://www.tux.org/lkml/
> 

-- 
Mathieu Desnoyers
OpenPGP key fingerprint: 8CD5 52C3 8E3C 4140 715F  BA06 3F25 A8FE 3BAE 9A68

^ permalink raw reply	[flat|nested] 4+ messages in thread

* Re: [RFC][Patch 1/1] Oprofile Multiplexing
  2008-05-29 12:25   ` Mathieu Desnoyers
@ 2008-05-29 19:03     ` Jason Yeh
  0 siblings, 0 replies; 4+ messages in thread
From: Jason Yeh @ 2008-05-29 19:03 UTC (permalink / raw)
  To: Mathieu Desnoyers; +Cc: linux-kernel


Mathieu Desnoyers wrote:
> 
> I have been contemplating the idea of using the performance counters in
> the LTTng kernel tracer for quite a while, but I see the underlying
> implementations often deal with those counters as if they were
> per-process resources. Given those resources are limited amongst the
> whole system, would it make sense to allow an in-kernel consumer to
> connect to some of these counters on a per-CPU basis and return a
> "resource not available" error to userspace when it tries to connect to
> them? (and also to kick out a userspace reader when a higher-priority
> in-kernel request is made, which implies that the "read" operation may
> fail)
> 

Mathieu,

It sounds like you are asking for a re-design of the current Oprofile. I agree
with you that some sort of abstraction is needed to arbitrate who gets to
connect to the counters. This is probably part of the objectives the Perfmon2
guys are working toward.

This patch is aimed at providing a quick way, based on the current
implementation, to profile more events than the number of hardware
counters.

Jason


^ permalink raw reply	[flat|nested] 4+ messages in thread

end of thread, other threads:[~2008-05-29 19:04 UTC | newest]

Thread overview: 4+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2008-05-27 20:08 [RFC][Patch 1/1] Oprofile Multiplexing Jason Yeh
2008-05-29  8:10 ` Andrew Morton
2008-05-29 12:25   ` Mathieu Desnoyers
2008-05-29 19:03     ` Jason Yeh

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox