[Cbe-oss-dev] [PATCH] Cell OProfile: SPU mutex lock fix

linuxppc-dev.lists.ozlabs.org archive mirror
 help / color / mirror / Atom feed

* [Cbe-oss-dev] [PATCH] Cell OProfile: SPU mutex lock fix
@ 2008-03-25 15:58 Carl Love
  2008-04-02  5:21 ` Arnd Bergmann
  0 siblings, 1 reply; 7+ messages in thread
From: Carl Love @ 2008-03-25 15:58 UTC (permalink / raw)
  To: cbe-oss-dev, linuxppc-dev, linux-kernel, cel

This patch fixes a bug in the code that records the SPU data and
context switches.  The buffer_mutex lock must be held when the
kernel is adding data to the buffer between the kernel and the
OProfile daemon.  The lock is not being held in the current code
base.  This patch fixes the bug using work queues.  The data to 
be passed to the daemon is caputured by the interrupt handler.  
The workqueue function is invoked to grab the buffer_mutex lock
and add the data to the buffer.  

Signed-off-by: Carl Love <cel@us.ibm.com>


Index: linux-2.6.25-rc4/arch/powerpc/oprofile/cell/spu_profiler.c
===================================================================
--- linux-2.6.25-rc4.orig/arch/powerpc/oprofile/cell/spu_profiler.c
+++ linux-2.6.25-rc4/arch/powerpc/oprofile/cell/spu_profiler.c
@@ -16,6 +16,7 @@
 #include <linux/smp.h>
 #include <linux/slab.h>
 #include <asm/cell-pmu.h>
+#include <linux/workqueue.h>
 #include "pr_util.h"
 
 #define TRACE_ARRAY_SIZE 1024
@@ -32,9 +33,19 @@ static unsigned int profiling_interval;
 
 #define SPU_PC_MASK	     0xFFFF
 
+/* The generic OProfile code uses the buffer_mutex to protect the buffer
+ * between the kernel and the daemon.  The SPU code needs to use the buffer
+ * to ensure that the kernel SPU writes complete as a single block before
+ * being consumed by the daemon.
+ */
+extern struct mutex buffer_mutex;
+
 static DEFINE_SPINLOCK(sample_array_lock);
 unsigned long sample_array_lock_flags;
 
+struct work_struct spu_record_wq;
+extern struct workqueue_struct *oprofile_spu_wq;
+
 void set_spu_profiling_frequency(unsigned int freq_khz, unsigned int cycles_reset)
 {
 	unsigned long ns_per_cyc;
@@ -123,14 +134,14 @@ static int cell_spu_pc_collection(int cp
 	return entry;
 }
 
-
-static enum hrtimer_restart profile_spus(struct hrtimer *timer)
-{
-	ktime_t kt;
+static void profile_spus_record_samples (struct work_struct *ws) {
+	/* This routine is called via schedule_work() to record the
+	 * spu data.  It must be run in a normal kernel mode to
+	 * grab the OProfile mutex lock.
+	 */
 	int cpu, node, k, num_samples, spu_num;
 
-	if (!spu_prof_running)
-		goto stop;
+	mutex_lock(&buffer_mutex);
 
 	for_each_online_cpu(cpu) {
 		if (cbe_get_hw_thread_id(cpu))
@@ -170,6 +181,20 @@ static enum hrtimer_restart profile_spus
 	smp_wmb();	/* insure spu event buffer updates are written */
 			/* don't want events intermingled... */
 
+	mutex_unlock(&buffer_mutex);
+}
+
+static enum hrtimer_restart profile_spus(struct hrtimer *timer)
+{
+	ktime_t kt;
+
+
+	if (!spu_prof_running)
+		goto stop;
+
+	/* schedule the funtion to record the data */
+	schedule_work(&spu_record_wq);
+
 	kt = ktime_set(0, profiling_interval);
 	if (!spu_prof_running)
 		goto stop;
@@ -209,6 +234,10 @@ int start_spu_profiling(unsigned int cyc
 	spu_prof_running = 1;
 	hrtimer_start(&timer, kt, HRTIMER_MODE_REL);
 
+	/* setup the workqueue for recording the SPU data */
+	INIT_WORK(&spu_record_wq,
+		  profile_spus_record_samples);
+
 	return 0;
 }
 
Index: linux-2.6.25-rc4/arch/powerpc/oprofile/cell/spu_task_sync.c
===================================================================
--- linux-2.6.25-rc4.orig/arch/powerpc/oprofile/cell/spu_task_sync.c
+++ linux-2.6.25-rc4/arch/powerpc/oprofile/cell/spu_task_sync.c
@@ -27,15 +27,44 @@
 #include <linux/numa.h>
 #include <linux/oprofile.h>
 #include <linux/spinlock.h>
+#include <linux/workqueue.h>
 #include "pr_util.h"
 
 #define RELEASE_ALL 9999
 
-static DEFINE_SPINLOCK(buffer_lock);
+extern struct mutex buffer_mutex;
+extern struct workqueue_struct *oprofile_spu_wq;
+extern int calls_to_record_switch;
+
 static DEFINE_SPINLOCK(cache_lock);
 static int num_spu_nodes;
+
 int spu_prof_num_nodes;
 int last_guard_val[MAX_NUMNODES * 8];
+int cnt_swtch_processed_flag[MAX_NUMNODES * 8];
+
+struct spus_profiling_code_data_s {
+	int num_spu_nodes;
+	struct work_struct spu_prof_code_wq;
+} spus_profiling_code_data;
+
+struct spu_context_switch_data_s {
+	struct spu *spu;
+	unsigned long spu_cookie;
+	unsigned long app_dcookie;
+	unsigned int offset;
+	unsigned long objectId;
+	int valid_entry;
+} spu_context_switch_data;
+
+int calls_to_record_switch = 0;
+int record_spu_start_flag = 0;
+
+struct spus_cntxt_sw_data_s {
+	int num_spu_nodes;
+	struct spu_context_switch_data_s spu_data[MAX_NUMNODES * 8];
+	struct work_struct spu_cntxt_work;
+} spus_cntxt_sw_data;
 
 /* Container for caching information about an active SPU task. */
 struct cached_info {
@@ -44,6 +73,8 @@ struct cached_info {
 	struct kref cache_ref;
 };
 
+struct workqueue_struct *oprofile_spu_wq;
+
 static struct cached_info *spu_info[MAX_NUMNODES * 8];
 
 static void destroy_cached_info(struct kref *kref)
@@ -283,39 +314,90 @@ fail_no_image_cookie:
  * passed SPU and records SPU context information into the OProfile
  * event buffer.
  */
+static void record_spu_process_switch(struct work_struct *ws) {
+	int spu;
+	int req_processed=0;
+	struct spus_cntxt_sw_data_s *data
+		= container_of(ws, struct spus_cntxt_sw_data_s,
+			       spu_cntxt_work);
+
+	mutex_lock(&buffer_mutex);
+
+	if (record_spu_start_flag) {
+		add_event_entry(ESCAPE_CODE);
+		add_event_entry(SPU_PROFILING_CODE);
+		add_event_entry(data->num_spu_nodes);
+		record_spu_start_flag = 0;
+	}
+
+	for (spu = 0; spu < data->num_spu_nodes; spu ++) {
+		/* If the cached info can be created, put the info
+		 * into the oprofile buffer for the daemon. Otherwise
+		 * toss the entry.
+		 */
+		if (data->spu_data[spu].valid_entry == 1) {
+			/* Record context info in event buffer */
+			req_processed++;
+			add_event_entry(ESCAPE_CODE);
+			add_event_entry(SPU_CTX_SWITCH_CODE);
+			add_event_entry(spu);
+			add_event_entry(data->spu_data[spu].spu->pid);
+			add_event_entry(data->spu_data[spu].spu->tgid);
+			add_event_entry(data->spu_data[spu].app_dcookie);
+			add_event_entry(data->spu_data[spu].spu_cookie);
+			add_event_entry(data->spu_data[spu].offset);
+
+			/* Context switch has been processed, can now
+			 * begin recording samples.
+			 */
+			cnt_swtch_processed_flag[spu] = 1;
+
+			 /* insure spu event buffer updates are written
+			  * don't want entries intermingled...
+			  */
+			smp_wmb();
+		}
+		data->spu_data[spu].valid_entry = 0;
+	}
+
+	mutex_unlock(&buffer_mutex);
+}
+
 static int process_context_switch(struct spu *spu, unsigned long objectId)
 {
-	unsigned long flags;
-	int retval;
+	int retval = 0;
 	unsigned int offset = 0;
 	unsigned long spu_cookie = 0, app_dcookie;
 
 	retval = prepare_cached_spu_info(spu, objectId);
-	if (retval)
+	if (retval) {
+		printk(KERN_ERR "SPU_PROF: "
+		       "%s, line %d: prepare_cached_spu_info call "
+		       "failed, returned %d\n",
+		       __FUNCTION__, __LINE__, retval);
 		goto out;
+	}
+
 
-	/* Get dcookie first because a mutex_lock is taken in that
-	 * code path, so interrupts must not be disabled.
-	 */
 	app_dcookie = get_exec_dcookie_and_offset(spu, &offset, &spu_cookie, objectId);
 	if (!app_dcookie || !spu_cookie) {
+		printk(KERN_ERR "SPU_PROF: "
+		       "%s, line %d: get_exec_dcookie_and_offset call "
+		       "failed, returned %lu\n",
+		       __FUNCTION__, __LINE__, app_dcookie);
 		retval  = -ENOENT;
 		goto out;
 	}
 
-	/* Record context info in event buffer */
-	spin_lock_irqsave(&buffer_lock, flags);
-	add_event_entry(ESCAPE_CODE);
-	add_event_entry(SPU_CTX_SWITCH_CODE);
-	add_event_entry(spu->number);
-	add_event_entry(spu->pid);
-	add_event_entry(spu->tgid);
-	add_event_entry(app_dcookie);
-	add_event_entry(spu_cookie);
-	add_event_entry(offset);
-	spin_unlock_irqrestore(&buffer_lock, flags);
-	smp_wmb();	/* insure spu event buffer updates are written */
-			/* don't want entries intermingled... */
+	/* save the spu contect info */
+	spus_cntxt_sw_data.spu_data[spu->number].spu = spu;
+	spus_cntxt_sw_data.spu_data[spu->number].app_dcookie = app_dcookie;
+	spus_cntxt_sw_data.spu_data[spu->number].spu_cookie = spu_cookie;
+	spus_cntxt_sw_data.spu_data[spu->number].offset = offset;
+	spus_cntxt_sw_data.spu_data[spu->number].objectId = objectId;
+	spus_cntxt_sw_data.spu_data[spu->number].valid_entry = 1;
+
+	queue_work(oprofile_spu_wq, &spus_cntxt_sw_data.spu_cntxt_work);
 out:
 	return retval;
 }
@@ -375,16 +457,30 @@ int spu_sync_start(void)
 	int k;
 	int ret = SKIP_GENERIC_SYNC;
 	int register_ret;
-	unsigned long flags = 0;
 
 	spu_prof_num_nodes = number_of_online_nodes();
 	num_spu_nodes = spu_prof_num_nodes * 8;
 
-	spin_lock_irqsave(&buffer_lock, flags);
-	add_event_entry(ESCAPE_CODE);
-	add_event_entry(SPU_PROFILING_CODE);
-	add_event_entry(num_spu_nodes);
-	spin_unlock_irqrestore(&buffer_lock, flags);
+	/* create private work queue, execution of work is time critical */
+	oprofile_spu_wq = create_workqueue("spu_oprofile");
+
+	/* due to a race when the spu is already running stuff, need to
+	 * set a flag to tell the spu context switch to record the start
+	 * before recording the context switches.
+	 */
+	record_spu_start_flag = 1;
+
+	spus_profiling_code_data.num_spu_nodes = num_spu_nodes;
+
+	/* setup work queue functiion for recording context switch info */
+	spus_cntxt_sw_data.num_spu_nodes = num_spu_nodes;
+	for (k = 0; k<(MAX_NUMNODES * 8); k++) {
+		spus_cntxt_sw_data.spu_data[k].valid_entry = 0;
+		cnt_swtch_processed_flag[k] = 0;
+	}
+
+	INIT_WORK(&spus_cntxt_sw_data.spu_cntxt_work,
+		  record_spu_process_switch);
 
 	/* Register for SPU events  */
 	register_ret = spu_switch_event_register(&spu_active);
@@ -413,6 +509,17 @@ void spu_sync_buffer(int spu_num, unsign
 	unsigned long long spu_num_shifted = spu_num_ll << 32;
 	struct cached_info *c_info;
 
+	/* Do not record any samples until after the context switch
+	 * for this SPU has been recorded.  The daemon gets really
+	 * confused.
+	 */
+	if (!(cnt_swtch_processed_flag[spu_num]))
+		return;
+
+	/* note, the mutex lock on buffer_mutex has already been
+	 * grabbed by the caller to this function.
+	 */
+
 	/* We need to obtain the cache_lock here because it's
 	 * possible that after getting the cached_info, the SPU job
 	 * corresponding to this cached_info may end, thus resulting
@@ -432,7 +539,7 @@ void spu_sync_buffer(int spu_num, unsign
 
 	map = c_info->map;
 	the_spu = c_info->the_spu;
-	spin_lock(&buffer_lock);
+
 	for (i = 0; i < num_samples; i++) {
 		unsigned int sample = *(samples+i);
 		int grd_val = 0;
@@ -452,9 +559,11 @@ void spu_sync_buffer(int spu_num, unsign
 			break;
 		}
 
+		/* add_event() protected by the mutex_lock(buffer_mutex) taken
+		 * in routine profile_spus_record_samples()
+		 */
 		add_event_entry(file_offset | spu_num_shifted);
 	}
-	spin_unlock(&buffer_lock);
 out:
 	spin_unlock_irqrestore(&cache_lock, flags);
 }
@@ -463,7 +572,9 @@ out:
 int spu_sync_stop(void)
 {
 	unsigned long flags = 0;
+	int k;
 	int ret = spu_switch_event_unregister(&spu_active);
+
 	if (ret) {
 		printk(KERN_ERR "SPU_PROF: "
 			"%s, line %d: spu_switch_event_unregister returned %d\n",
@@ -475,6 +586,15 @@ int spu_sync_stop(void)
 	ret = release_cached_info(RELEASE_ALL);
 	spin_unlock_irqrestore(&cache_lock, flags);
 out:
+	/* clear any pending spu cntx switch request */
+
+	for (k = 0; k<(MAX_NUMNODES * 8); k++)
+	        if (spus_cntxt_sw_data.spu_data[k].valid_entry == 1)
+			pr_debug ("spu_sync_stop -- removed "\
+				  "pending SPU sw req\n");
+
+		spus_cntxt_sw_data.spu_data[k].valid_entry = 0;
+
 	pr_debug("spu_sync_stop -- done.\n");
 	return ret;
 }

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [Cbe-oss-dev]  [PATCH] Cell OProfile: SPU mutex lock fix
  2008-03-25 15:58 [Cbe-oss-dev] [PATCH] Cell OProfile: SPU mutex lock fix Carl Love
@ 2008-04-02  5:21 ` Arnd Bergmann
  2008-04-02 16:42   ` Carl Love
  0 siblings, 1 reply; 7+ messages in thread
From: Arnd Bergmann @ 2008-04-02  5:21 UTC (permalink / raw)
  To: cbe-oss-dev; +Cc: linuxppc-dev, cel, linux-kernel, Carl Love

On Tuesday 25 March 2008, Carl Love wrote:
> This patch fixes a bug in the code that records the SPU data and
> context switches.  The buffer_mutex lock must be held when the
> kernel is adding data to the buffer between the kernel and the
> OProfile daemon.  The lock is not being held in the current code
> base.  This patch fixes the bug using work queues.  The data to 
> be passed to the daemon is caputured by the interrupt handler.  
> The workqueue function is invoked to grab the buffer_mutex lock
> and add the data to the buffer.  

So what was the exact bug you're fixing with this? There was no
buffer_mutex before, so why do you need it now? Can't this be a
spinlock so you can get it from interrupt context instead of
using a workqueue?

> Index: linux-2.6.25-rc4/arch/powerpc/oprofile/cell/spu_profiler.c
> ===================================================================
> --- linux-2.6.25-rc4.orig/arch/powerpc/oprofile/cell/spu_profiler.c
> +++ linux-2.6.25-rc4/arch/powerpc/oprofile/cell/spu_profiler.c
> @@ -16,6 +16,7 @@
>  #include <linux/smp.h>
>  #include <linux/slab.h>
>  #include <asm/cell-pmu.h>
> +#include <linux/workqueue.h>
>  #include "pr_util.h"
>  

Please keep #include statements in alphabetical order, with all linux/ files
before the asm/ files.

>  #define TRACE_ARRAY_SIZE 1024
> @@ -32,9 +33,19 @@ static unsigned int profiling_interval;
>  
>  #define SPU_PC_MASK	     0xFFFF
>  
> +/* The generic OProfile code uses the buffer_mutex to protect the buffer
> + * between the kernel and the daemon.  The SPU code needs to use the buffer
> + * to ensure that the kernel SPU writes complete as a single block before
> + * being consumed by the daemon.
> + */
> +extern struct mutex buffer_mutex;
> +
>  static DEFINE_SPINLOCK(sample_array_lock);
>  unsigned long sample_array_lock_flags;
>  
> +struct work_struct spu_record_wq;
> +extern struct workqueue_struct *oprofile_spu_wq;
> +
>  void set_spu_profiling_frequency(unsigned int freq_khz, unsigned int cycles_reset)
>  {
>  	unsigned long ns_per_cyc;

Never put extern statements in the implementation, they describe the
interface between two parts of the code and should be inside of a
common header.

Why do you want to have your own workqueue instead of using the
global one?

> @@ -123,14 +134,14 @@ static int cell_spu_pc_collection(int cp
>  	return entry;
>  }
>  
> -
> -static enum hrtimer_restart profile_spus(struct hrtimer *timer)
> -{
> -	ktime_t kt;
> +static void profile_spus_record_samples (struct work_struct *ws) {
> +	/* This routine is called via schedule_work() to record the
> +	 * spu data.  It must be run in a normal kernel mode to
> +	 * grab the OProfile mutex lock.
> +	 */
>  	int cpu, node, k, num_samples, spu_num;
>  
> -	if (!spu_prof_running)
> -		goto stop;
> +	mutex_lock(&buffer_mutex);
>  
>  	for_each_online_cpu(cpu) {
>  		if (cbe_get_hw_thread_id(cpu))
> @@ -170,6 +181,20 @@ static enum hrtimer_restart profile_spus
>  	smp_wmb();	/* insure spu event buffer updates are written */
>  			/* don't want events intermingled... */
>  
> +	mutex_unlock(&buffer_mutex);
> +}
> +
> +static enum hrtimer_restart profile_spus(struct hrtimer *timer)
> +{
> +	ktime_t kt;
> +
> +
> +	if (!spu_prof_running)
> +		goto stop;
> +
> +	/* schedule the funtion to record the data */
> +	schedule_work(&spu_record_wq);
> +
>  	kt = ktime_set(0, profiling_interval);
>  	if (!spu_prof_running)
>  		goto stop;

This looks like you want to use a delayed_work rather than building your
own out of hrtimer and work. Is there any point why you want to use
an hrtimer?

> -static DEFINE_SPINLOCK(buffer_lock);
> +extern struct mutex buffer_mutex;
> +extern struct workqueue_struct *oprofile_spu_wq;
> +extern int calls_to_record_switch;
> +

Again, public interfaces need to go to a header file, and should
have a name that identifies the interface. "buffer_mutex" is
certainly not a suitable name for a kernel-wide global variable!

>  static DEFINE_SPINLOCK(cache_lock);
>  static int num_spu_nodes;
> +
>  int spu_prof_num_nodes;
>  int last_guard_val[MAX_NUMNODES * 8];
> +int cnt_swtch_processed_flag[MAX_NUMNODES * 8];
> +
> +struct spus_profiling_code_data_s {
> +	int num_spu_nodes;
> +	struct work_struct spu_prof_code_wq;
> +} spus_profiling_code_data;
> +
> +struct spu_context_switch_data_s {
> +	struct spu *spu;
> +	unsigned long spu_cookie;
> +	unsigned long app_dcookie;
> +	unsigned int offset;
> +	unsigned long objectId;
> +	int valid_entry;
> +} spu_context_switch_data;

I don't understand what these variables are really doing, but
having e.g. just one spu_context_switch_data for all the SPUs
doesn't seem to make much sense. What happens when two SPUs do
a context switch at the same time?

> +int calls_to_record_switch = 0;
> +int record_spu_start_flag = 0;
> +
> +struct spus_cntxt_sw_data_s {
> +	int num_spu_nodes;
> +	struct spu_context_switch_data_s spu_data[MAX_NUMNODES * 8];
> +	struct work_struct spu_cntxt_work;
> +} spus_cntxt_sw_data;

Something is very wrong if you need so many global variables!

>  /* Container for caching information about an active SPU task. */
>  struct cached_info {
> @@ -44,6 +73,8 @@ struct cached_info {
>  	struct kref cache_ref;
>  };
>  
> +struct workqueue_struct *oprofile_spu_wq;
> +
>  static struct cached_info *spu_info[MAX_NUMNODES * 8];

While you're cleaning this up, I guess the cached_info should
be moved into a pointer from struct spu as well, instead of
having this global variable here.

> @@ -375,16 +457,30 @@ int spu_sync_start(void)
>  	int k;
>  	int ret = SKIP_GENERIC_SYNC;
>  	int register_ret;
> -	unsigned long flags = 0;
>  
>  	spu_prof_num_nodes = number_of_online_nodes();
>  	num_spu_nodes = spu_prof_num_nodes * 8;
>  
> -	spin_lock_irqsave(&buffer_lock, flags);
> -	add_event_entry(ESCAPE_CODE);
> -	add_event_entry(SPU_PROFILING_CODE);
> -	add_event_entry(num_spu_nodes);
> -	spin_unlock_irqrestore(&buffer_lock, flags);
> +	/* create private work queue, execution of work is time critical */
> +	oprofile_spu_wq = create_workqueue("spu_oprofile");
> +
> +	/* due to a race when the spu is already running stuff, need to
> +	 * set a flag to tell the spu context switch to record the start
> +	 * before recording the context switches.
> +	 */
> +	record_spu_start_flag = 1;
> +
> +	spus_profiling_code_data.num_spu_nodes = num_spu_nodes;
> +
> +	/* setup work queue functiion for recording context switch info */
> +	spus_cntxt_sw_data.num_spu_nodes = num_spu_nodes;
> +	for (k = 0; k<(MAX_NUMNODES * 8); k++) {
> +		spus_cntxt_sw_data.spu_data[k].valid_entry = 0;
> +		cnt_swtch_processed_flag[k] = 0;
> +	}
> +
> +	INIT_WORK(&spus_cntxt_sw_data.spu_cntxt_work,
> +		  record_spu_process_switch);

I would guess that you need one work struct per SPU instead of a global
one, if you want to pass the SPU pointer as an argument.

	Arnd <><

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [Cbe-oss-dev]  [PATCH] Cell OProfile: SPU mutex lock fix
  2008-04-02  5:21 ` Arnd Bergmann
@ 2008-04-02 16:42   ` Carl Love
  2008-04-02 17:02     ` Carl Love
  0 siblings, 1 reply; 7+ messages in thread
From: Carl Love @ 2008-04-02 16:42 UTC (permalink / raw)
  To: Arnd Bergmann; +Cc: linuxppc-dev, cel, cbe-oss-dev, linux-kernel


On Wed, 2008-04-02 at 07:21 +0200, Arnd Bergmann wrote:
> On Tuesday 25 March 2008, Carl Love wrote:
> > This patch fixes a bug in the code that records the SPU data and
> > context switches.  The buffer_mutex lock must be held when the
> > kernel is adding data to the buffer between the kernel and the
> > OProfile daemon.  The lock is not being held in the current code
> > base.  This patch fixes the bug using work queues.  The data to 
> > be passed to the daemon is caputured by the interrupt handler.  
> > The workqueue function is invoked to grab the buffer_mutex lock
> > and add the data to the buffer.  
> 
> So what was the exact bug you're fixing with this? There was no
> buffer_mutex before, so why do you need it now? Can't this be a
> spinlock so you can get it from interrupt context instead of
> using a workqueue?

The generic OProfile code defines a mutex lock, called buffer_mutex, to
protect the kernel/daemon data buffer from being writen by the kernal
and simultaneously read by the Daemon.  When adding a PPU sample the
oprofile routine  oprofile_add_ext_sample(pc, regs, i, is_kernel) is
called from the interrupt context to request the sample be stored.  The
generic oprofile code takes care of passing the data to a non interrupt
context where the mutex lock is held and the necessary sequence of data
is written into the kernel/daemon data buffer.  However, OProfile does
not have any built in functions for handling the SPU.  Hence, we have to
implement the code to capture the data in the interrupt context, pass it
to a non interrupt context and put it into the buffer.  This was not
done correctly in the original implementation.  Specifically, the mutex
lock was not being held.  

Writing data to the OProfile buffer consists of a sequence of items.
For example when writing an SPU entry, first comes the escape code so
the daemon knows this is a new entry.  The next item is the SPU context
switch code which says the data which will follow is the information
about a new context.  There is a different code to identify the data as
an address sample.  Finally the data about the SPU context switch is
entered into the buffer.  The issue is the OProfile daemon is read all
of the entire sequence of items then process the data.  Without the
mutex lock, the daemon may read part of the sequence try to process it
before everything is written into the buffer.  When the daemon reads
again, it doesn't see the escape code as the first item and isn't smart
enough to realize it is part of a previous sequence.  The generic
OProfile code defines the mutex lock and calls it buffer_mutex.  The
OProfile kernel/daemon API uses the mutex lock. The mutex lock can only
be held in a non interrupt context.  The current implementation uses a
spin lock to make sure the kernel writes each sequence if items into the
buffer but since the API does not use a spin lock we have no way to
prevent the daemon from reading the buffer until the entire sequence of
items has been written to the buffer.  Hence the need to hold the
buffer_mutex lock which prevents the daemon from accessing the buffer.

> 
> > Index: linux-2.6.25-rc4/arch/powerpc/oprofile/cell/spu_profiler.c
> > ===================================================================
> > --- linux-2.6.25-rc4.orig/arch/powerpc/oprofile/cell/spu_profiler.c
> > +++ linux-2.6.25-rc4/arch/powerpc/oprofile/cell/spu_profiler.c
> > @@ -16,6 +16,7 @@
> >  #include <linux/smp.h>
> >  #include <linux/slab.h>
> >  #include <asm/cell-pmu.h>
> > +#include <linux/workqueue.h>
> >  #include "pr_util.h"
> >  
> 
> Please keep #include statements in alphabetical order, with all linux/ files
> before the asm/ files.
> 
> >  #define TRACE_ARRAY_SIZE 1024
> > @@ -32,9 +33,19 @@ static unsigned int profiling_interval;
> >  
> >  #define SPU_PC_MASK	     0xFFFF
> >  
> > +/* The generic OProfile code uses the buffer_mutex to protect the buffer
> > + * between the kernel and the daemon.  The SPU code needs to use the buffer
> > + * to ensure that the kernel SPU writes complete as a single block before
> > + * being consumed by the daemon.
> > + */
> > +extern struct mutex buffer_mutex;
> > +
> >  static DEFINE_SPINLOCK(sample_array_lock);
> >  unsigned long sample_array_lock_flags;
> >  
> > +struct work_struct spu_record_wq;
> > +extern struct workqueue_struct *oprofile_spu_wq;
> > +
> >  void set_spu_profiling_frequency(unsigned int freq_khz, unsigned int cycles_reset)
> >  {
> >  	unsigned long ns_per_cyc;
> 
> Never put extern statements in the implementation, they describe the
> interface between two parts of the code and should be inside of a
> common header.
> 
> Why do you want to have your own workqueue instead of using the
> global one?
> 
> > @@ -123,14 +134,14 @@ static int cell_spu_pc_collection(int cp
> >  	return entry;
> >  }
> >  
> > -
> > -static enum hrtimer_restart profile_spus(struct hrtimer *timer)
> > -{
> > -	ktime_t kt;
> > +static void profile_spus_record_samples (struct work_struct *ws) {
> > +	/* This routine is called via schedule_work() to record the
> > +	 * spu data.  It must be run in a normal kernel mode to
> > +	 * grab the OProfile mutex lock.
> > +	 */
> >  	int cpu, node, k, num_samples, spu_num;
> >  
> > -	if (!spu_prof_running)
> > -		goto stop;
> > +	mutex_lock(&buffer_mutex);
> >  
> >  	for_each_online_cpu(cpu) {
> >  		if (cbe_get_hw_thread_id(cpu))
> > @@ -170,6 +181,20 @@ static enum hrtimer_restart profile_spus
> >  	smp_wmb();	/* insure spu event buffer updates are written */
> >  			/* don't want events intermingled... */
> >  
> > +	mutex_unlock(&buffer_mutex);
> > +}
> > +
> > +static enum hrtimer_restart profile_spus(struct hrtimer *timer)
> > +{
> > +	ktime_t kt;
> > +
> > +
> > +	if (!spu_prof_running)
> > +		goto stop;
> > +
> > +	/* schedule the funtion to record the data */
> > +	schedule_work(&spu_record_wq);
> > +
> >  	kt = ktime_set(0, profiling_interval);
> >  	if (!spu_prof_running)
> >  		goto stop;
> 
> This looks like you want to use a delayed_work rather than building your
> own out of hrtimer and work. Is there any point why you want to use
> an hrtimer?

The current implementation uses the hrtimer to schedule when to read the
trace buffer the next time.  This patch does not change how the
scheduling of the buffer reads is done.  Yes, you could change the
implementation to use workqueues instead.  If you feel that it is better
to use the workqueue then we could make that change.  Not sure that
making that change in this bug fix patch is appropriate.  I would need
to create a second patch for that change.
> 
> > -static DEFINE_SPINLOCK(buffer_lock);
> > +extern struct mutex buffer_mutex;
> > +extern struct workqueue_struct *oprofile_spu_wq;
> > +extern int calls_to_record_switch;
> > +
> 
> Again, public interfaces need to go to a header file, and should
> have a name that identifies the interface. "buffer_mutex" is
> certainly not a suitable name for a kernel-wide global variable!

As stated earlier, the generic OProfile code defines the variable
"buffer_mutex".  Changing the name in the generic OProfile code is
beyond the scope of this patch.

> 
> >  static DEFINE_SPINLOCK(cache_lock);
> >  static int num_spu_nodes;
> > +
> >  int spu_prof_num_nodes;
> >  int last_guard_val[MAX_NUMNODES * 8];
> > +int cnt_swtch_processed_flag[MAX_NUMNODES * 8];
> > +
> > +struct spus_profiling_code_data_s {
> > +	int num_spu_nodes;
> > +	struct work_struct spu_prof_code_wq;
> > +} spus_profiling_code_data;
> > +
> > +struct spu_context_switch_data_s {
> > +	struct spu *spu;
> > +	unsigned long spu_cookie;
> > +	unsigned long app_dcookie;
> > +	unsigned int offset;
> > +	unsigned long objectId;
> > +	int valid_entry;
> > +} spu_context_switch_data;
> 
> I don't understand what these variables are really doing, but
> having e.g. just one spu_context_switch_data for all the SPUs
> doesn't seem to make much sense. What happens when two SPUs do
> a context switch at the same time?

This is the data same data that was being put into the event buffer
directly from the interrupt context.  We need to store the data that is
only available in the interrupt context so the same data can be put into
the buffer by the work queue function in the non interrupt context.
This is the declaration of the data needed per SPU.  Below in the
spu_cntx_sw_data structure, we declare an array of entries so we can
store the switch
> 
> > +int calls_to_record_switch = 0;
> > +int record_spu_start_flag = 0;
> > +
> > +struct spus_cntxt_sw_data_s {
> > +	int num_spu_nodes;
> > +	struct spu_context_switch_data_s spu_data[MAX_NUMNODES * 8];
> > +	struct work_struct spu_cntxt_work;
> > +} spus_cntxt_sw_data;
> 
> Something is very wrong if you need so many global variables!
> 
> >  /* Container for caching information about an active SPU task. */
> >  struct cached_info {
> > @@ -44,6 +73,8 @@ struct cached_info {
> >  	struct kref cache_ref;
> >  };
> >  
> > +struct workqueue_struct *oprofile_spu_wq;
> > +
> >  static struct cached_info *spu_info[MAX_NUMNODES * 8];
> 
> While you're cleaning this up, I guess the cached_info should
> be moved into a pointer from struct spu as well, instead of
> having this global variable here.
> 
> > @@ -375,16 +457,30 @@ int spu_sync_start(void)
> >  	int k;
> >  	int ret = SKIP_GENERIC_SYNC;
> >  	int register_ret;
> > -	unsigned long flags = 0;
> >  
> >  	spu_prof_num_nodes = number_of_online_nodes();
> >  	num_spu_nodes = spu_prof_num_nodes * 8;
> >  
> > -	spin_lock_irqsave(&buffer_lock, flags);
> > -	add_event_entry(ESCAPE_CODE);
> > -	add_event_entry(SPU_PROFILING_CODE);
> > -	add_event_entry(num_spu_nodes);
> > -	spin_unlock_irqrestore(&buffer_lock, flags);
> > +	/* create private work queue, execution of work is time critical */
> > +	oprofile_spu_wq = create_workqueue("spu_oprofile");
> > +
> > +	/* due to a race when the spu is already running stuff, need to
> > +	 * set a flag to tell the spu context switch to record the start
> > +	 * before recording the context switches.
> > +	 */
> > +	record_spu_start_flag = 1;
> > +
> > +	spus_profiling_code_data.num_spu_nodes = num_spu_nodes;
> > +
> > +	/* setup work queue functiion for recording context switch info */
> > +	spus_cntxt_sw_data.num_spu_nodes = num_spu_nodes;
> > +	for (k = 0; k<(MAX_NUMNODES * 8); k++) {
> > +		spus_cntxt_sw_data.spu_data[k].valid_entry = 0;
> > +		cnt_swtch_processed_flag[k] = 0;
> > +	}
> > +
> > +	INIT_WORK(&spus_cntxt_sw_data.spu_cntxt_work,
> > +		  record_spu_process_switch);
> 
> I would guess that you need one work struct per SPU instead of a global
> one, if you want to pass the SPU pointer as an argument.
> 
> 	Arnd <><

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [Cbe-oss-dev]  [PATCH] Cell OProfile: SPU mutex lock fix
  2008-04-02 16:42   ` Carl Love
@ 2008-04-02 17:02     ` Carl Love
  2008-04-04  6:38       ` Arnd Bergmann
  0 siblings, 1 reply; 7+ messages in thread
From: Carl Love @ 2008-04-02 17:02 UTC (permalink / raw)
  To: Arnd Bergmann; +Cc: linuxppc-dev, cel, cbe-oss-dev, linux-kernel


On Wed, 2008-04-02 at 09:42 -0700, Carl Love wrote:
> On Wed, 2008-04-02 at 07:21 +0200, Arnd Bergmann wrote:
> > On Tuesday 25 March 2008, Carl Love wrote:
> > > This patch fixes a bug in the code that records the SPU data and
> > > context switches.  The buffer_mutex lock must be held when the
> > > kernel is adding data to the buffer between the kernel and the
> > > OProfile daemon.  The lock is not being held in the current code
> > > base.  This patch fixes the bug using work queues.  The data to 
> > > be passed to the daemon is caputured by the interrupt handler.  
> > > The workqueue function is invoked to grab the buffer_mutex lock
> > > and add the data to the buffer.  
> > 
> > So what was the exact bug you're fixing with this? There was no
> > buffer_mutex before, so why do you need it now? Can't this be a
> > spinlock so you can get it from interrupt context instead of
> > using a workqueue?
> 
> The generic OProfile code defines a mutex lock, called buffer_mutex, to
> protect the kernel/daemon data buffer from being writen by the kernal
> and simultaneously read by the Daemon.  When adding a PPU sample the
> oprofile routine  oprofile_add_ext_sample(pc, regs, i, is_kernel) is
> called from the interrupt context to request the sample be stored.  The
> generic oprofile code takes care of passing the data to a non interrupt
> context where the mutex lock is held and the necessary sequence of data
> is written into the kernel/daemon data buffer.  However, OProfile does
> not have any built in functions for handling the SPU.  Hence, we have to
> implement the code to capture the data in the interrupt context, pass it
> to a non interrupt context and put it into the buffer.  This was not
> done correctly in the original implementation.  Specifically, the mutex
> lock was not being held.  
> 
> Writing data to the OProfile buffer consists of a sequence of items.
> For example when writing an SPU entry, first comes the escape code so
> the daemon knows this is a new entry.  The next item is the SPU context
> switch code which says the data which will follow is the information
> about a new context.  There is a different code to identify the data as
> an address sample.  Finally the data about the SPU context switch is
> entered into the buffer.  The issue is the OProfile daemon is read all
> of the entire sequence of items then process the data.  Without the
> mutex lock, the daemon may read part of the sequence try to process it
> before everything is written into the buffer.  When the daemon reads
> again, it doesn't see the escape code as the first item and isn't smart
> enough to realize it is part of a previous sequence.  The generic
> OProfile code defines the mutex lock and calls it buffer_mutex.  The
> OProfile kernel/daemon API uses the mutex lock. The mutex lock can only
> be held in a non interrupt context.  The current implementation uses a
> spin lock to make sure the kernel writes each sequence if items into the
> buffer but since the API does not use a spin lock we have no way to
> prevent the daemon from reading the buffer until the entire sequence of
> items has been written to the buffer.  Hence the need to hold the
> buffer_mutex lock which prevents the daemon from accessing the buffer.
> 
> > 
> > > Index: linux-2.6.25-rc4/arch/powerpc/oprofile/cell/spu_profiler.c
> > > ===================================================================
> > > --- linux-2.6.25-rc4.orig/arch/powerpc/oprofile/cell/spu_profiler.c
> > > +++ linux-2.6.25-rc4/arch/powerpc/oprofile/cell/spu_profiler.c
> > > @@ -16,6 +16,7 @@
> > >  #include <linux/smp.h>
> > >  #include <linux/slab.h>
> > >  #include <asm/cell-pmu.h>
> > > +#include <linux/workqueue.h>
> > >  #include "pr_util.h"
> > >  
> > 
> > Please keep #include statements in alphabetical order, with all linux/ files
> > before the asm/ files.
> > 
> > >  #define TRACE_ARRAY_SIZE 1024
> > > @@ -32,9 +33,19 @@ static unsigned int profiling_interval;
> > >  
> > >  #define SPU_PC_MASK	     0xFFFF
> > >  
> > > +/* The generic OProfile code uses the buffer_mutex to protect the buffer
> > > + * between the kernel and the daemon.  The SPU code needs to use the buffer
> > > + * to ensure that the kernel SPU writes complete as a single block before
> > > + * being consumed by the daemon.
> > > + */
> > > +extern struct mutex buffer_mutex;
> > > +
> > >  static DEFINE_SPINLOCK(sample_array_lock);
> > >  unsigned long sample_array_lock_flags;
> > >  
> > > +struct work_struct spu_record_wq;
> > > +extern struct workqueue_struct *oprofile_spu_wq;
> > > +
> > >  void set_spu_profiling_frequency(unsigned int freq_khz, unsigned int cycles_reset)
> > >  {
> > >  	unsigned long ns_per_cyc;
> > 
> > Never put extern statements in the implementation, they describe the
> > interface between two parts of the code and should be inside of a
> > common header.
> > 
> > Why do you want to have your own workqueue instead of using the
> > global one?

It is important that the data get context switch data get recorded as
quickly as possible to avoid dropping data unnecessarily.  The PC
counter data for each SPU is ignored until the context switch record is
put into the kernel/daemon buffer.  The API documentation says that
using a private workqueue has better performance then using the global
workqueue.  There is a comment in the code about this, perhaps it is not
clear enough.

> > 
> > > @@ -123,14 +134,14 @@ static int cell_spu_pc_collection(int cp
> > >  	return entry;
> > >  }
> > >  
> > > -
> > > -static enum hrtimer_restart profile_spus(struct hrtimer *timer)
> > > -{
> > > -	ktime_t kt;
> > > +static void profile_spus_record_samples (struct work_struct *ws) {
> > > +	/* This routine is called via schedule_work() to record the
> > > +	 * spu data.  It must be run in a normal kernel mode to
> > > +	 * grab the OProfile mutex lock.
> > > +	 */
> > >  	int cpu, node, k, num_samples, spu_num;
> > >  
> > > -	if (!spu_prof_running)
> > > -		goto stop;
> > > +	mutex_lock(&buffer_mutex);
> > >  
> > >  	for_each_online_cpu(cpu) {
> > >  		if (cbe_get_hw_thread_id(cpu))
> > > @@ -170,6 +181,20 @@ static enum hrtimer_restart profile_spus
> > >  	smp_wmb();	/* insure spu event buffer updates are written */
> > >  			/* don't want events intermingled... */
> > >  
> > > +	mutex_unlock(&buffer_mutex);
> > > +}
> > > +
> > > +static enum hrtimer_restart profile_spus(struct hrtimer *timer)
> > > +{
> > > +	ktime_t kt;
> > > +
> > > +
> > > +	if (!spu_prof_running)
> > > +		goto stop;
> > > +
> > > +	/* schedule the funtion to record the data */
> > > +	schedule_work(&spu_record_wq);
> > > +
> > >  	kt = ktime_set(0, profiling_interval);
> > >  	if (!spu_prof_running)
> > >  		goto stop;
> > 
> > This looks like you want to use a delayed_work rather than building your
> > own out of hrtimer and work. Is there any point why you want to use
> > an hrtimer?
> 
> The current implementation uses the hrtimer to schedule when to read the
> trace buffer the next time.  This patch does not change how the
> scheduling of the buffer reads is done.  Yes, you could change the
> implementation to use workqueues instead.  If you feel that it is better
> to use the workqueue then we could make that change.  Not sure that
> making that change in this bug fix patch is appropriate.  I would need
> to create a second patch for that change.
> > 
> > > -static DEFINE_SPINLOCK(buffer_lock);
> > > +extern struct mutex buffer_mutex;
> > > +extern struct workqueue_struct *oprofile_spu_wq;
> > > +extern int calls_to_record_switch;
> > > +
> > 
> > Again, public interfaces need to go to a header file, and should
> > have a name that identifies the interface. "buffer_mutex" is
> > certainly not a suitable name for a kernel-wide global variable!
> 
> As stated earlier, the generic OProfile code defines the variable
> "buffer_mutex".  Changing the name in the generic OProfile code is
> beyond the scope of this patch.
> 
> > 
> > >  static DEFINE_SPINLOCK(cache_lock);
> > >  static int num_spu_nodes;
> > > +
> > >  int spu_prof_num_nodes;
> > >  int last_guard_val[MAX_NUMNODES * 8];
> > > +int cnt_swtch_processed_flag[MAX_NUMNODES * 8];
> > > +
> > > +struct spus_profiling_code_data_s {
> > > +	int num_spu_nodes;
> > > +	struct work_struct spu_prof_code_wq;
> > > +} spus_profiling_code_data;
> > > +
> > > +struct spu_context_switch_data_s {
> > > +	struct spu *spu;
> > > +	unsigned long spu_cookie;
> > > +	unsigned long app_dcookie;
> > > +	unsigned int offset;
> > > +	unsigned long objectId;
> > > +	int valid_entry;
> > > +} spu_context_switch_data;
> > 
> > I don't understand what these variables are really doing, but
> > having e.g. just one spu_context_switch_data for all the SPUs
> > doesn't seem to make much sense. What happens when two SPUs do
> > a context switch at the same time?
> 
> This is the data same data that was being put into the event buffer
> directly from the interrupt context.  We need to store the data that is
> only available in the interrupt context so the same data can be put into
> the buffer by the work queue function in the non interrupt context.
> This is the declaration of the data needed per SPU.  Below in the
> spu_cntx_sw_data structure, we declare an array of entries so we can
> store the switch

oops, accidentally hit save when I tried to move the window. Sorry.

data on a per SPU basis as you alluded to.  


> > 
> > > +int calls_to_record_switch = 0;
> > > +int record_spu_start_flag = 0;
> > > +
> > > +struct spus_cntxt_sw_data_s {
> > > +	int num_spu_nodes;
> > > +	struct spu_context_switch_data_s spu_data[MAX_NUMNODES * 8];
> > > +	struct work_struct spu_cntxt_work;
> > > +} spus_cntxt_sw_data;
> > 
> > Something is very wrong if you need so many global variables!

The calls_to_record_switch variable is not used, my mistake for not
getting it out of the patch.  The record_spu_stat_flag is used. It is
set in the spu_sync_start when SPU profiling is started.  The first time
the work function is called to record SPU context switches it sees the
flag is set and writes the initial record to the daemon/kernel buffer
stating that this is an SPU profile run not a PPU profile run.  The
daemon needs to know this as it effects how the postprocessing is done.
The initial record is only written once.  

The spus_context_sw_data structure has the array per SPU for all off the
interrupt context data that was recorded and needs to be written to the
kernel/daemon buffer.  
> > 
> > >  /* Container for caching information about an active SPU task. */
> > >  struct cached_info {
> > > @@ -44,6 +73,8 @@ struct cached_info {
> > >  	struct kref cache_ref;
> > >  };
> > >  
> > > +struct workqueue_struct *oprofile_spu_wq;
> > > +
> > >  static struct cached_info *spu_info[MAX_NUMNODES * 8];
> > 
> > While you're cleaning this up, I guess the cached_info should
> > be moved into a pointer from struct spu as well, instead of
> > having this global variable here.

This would be a functional change and it belongs in a functional change
patch not in a bug fix patch.

> > 
> > > @@ -375,16 +457,30 @@ int spu_sync_start(void)
> > >  	int k;
> > >  	int ret = SKIP_GENERIC_SYNC;
> > >  	int register_ret;
> > > -	unsigned long flags = 0;
> > >  
> > >  	spu_prof_num_nodes = number_of_online_nodes();
> > >  	num_spu_nodes = spu_prof_num_nodes * 8;
> > >  
> > > -	spin_lock_irqsave(&buffer_lock, flags);
> > > -	add_event_entry(ESCAPE_CODE);
> > > -	add_event_entry(SPU_PROFILING_CODE);
> > > -	add_event_entry(num_spu_nodes);
> > > -	spin_unlock_irqrestore(&buffer_lock, flags);
> > > +	/* create private work queue, execution of work is time critical */
> > > +	oprofile_spu_wq = create_workqueue("spu_oprofile");
> > > +
> > > +	/* due to a race when the spu is already running stuff, need to
> > > +	 * set a flag to tell the spu context switch to record the start
> > > +	 * before recording the context switches.
> > > +	 */
> > > +	record_spu_start_flag = 1;
> > > +
> > > +	spus_profiling_code_data.num_spu_nodes = num_spu_nodes;
> > > +
> > > +	/* setup work queue functiion for recording context switch info */
> > > +	spus_cntxt_sw_data.num_spu_nodes = num_spu_nodes;
> > > +	for (k = 0; k<(MAX_NUMNODES * 8); k++) {
> > > +		spus_cntxt_sw_data.spu_data[k].valid_entry = 0;
> > > +		cnt_swtch_processed_flag[k] = 0;
> > > +	}
> > > +
> > > +	INIT_WORK(&spus_cntxt_sw_data.spu_cntxt_work,
> > > +		  record_spu_process_switch);
> > 
> > I would guess that you need one work struct per SPU instead of a global
> > one, if you want to pass the SPU pointer as an argument.
> > 
> > 	Arnd <><

We only need one work struct because we have an array that contains the
data for each SPU that has done a context switch.  

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [Cbe-oss-dev] [PATCH] Cell OProfile: SPU mutex lock fix
  2008-04-02 17:02     ` Carl Love
@ 2008-04-04  6:38       ` Arnd Bergmann
  2008-04-08 20:21         ` Carl Love
  0 siblings, 1 reply; 7+ messages in thread
From: Arnd Bergmann @ 2008-04-04  6:38 UTC (permalink / raw)
  To: cbe-oss-dev; +Cc: linuxppc-dev, cel, linux-kernel, Carl Love

On Wednesday 02 April 2008, Carl Love wrote:
> On Wed, 2008-04-02 at 07:21 +0200, Arnd Bergmann wrote:
> > On Tuesday 25 March 2008, Carl Love wrote:
> > > This patch fixes a bug in the code that records the SPU data and
> > > context switches.  The buffer_mutex lock must be held when the
> > > kernel is adding data to the buffer between the kernel and the
> > > OProfile daemon.  The lock is not being held in the current code
> > > base.  This patch fixes the bug using work queues.  The data to 
> > > be passed to the daemon is caputured by the interrupt handler.  
> > > The workqueue function is invoked to grab the buffer_mutex lock
> > > and add the data to the buffer.  
> > 
> > So what was the exact bug you're fixing with this? There was no
> > buffer_mutex before, so why do you need it now? Can't this be a
> > spinlock so you can get it from interrupt context instead of
> > using a workqueue?
> 
> The generic OProfile code defines a mutex lock, called buffer_mutex, to
> protect the kernel/daemon data buffer from being writen by the kernal
> and simultaneously read by the Daemon.  When adding a PPU sample the
> oprofile routine  oprofile_add_ext_sample(pc, regs, i, is_kernel) is
> called from the interrupt context to request the sample be stored.  The
> generic oprofile code takes care of passing the data to a non interrupt
> context where the mutex lock is held and the necessary sequence of data
> is written into the kernel/daemon data buffer.  However, OProfile does
> not have any built in functions for handling the SPU.  Hence, we have to
> implement the code to capture the data in the interrupt context, pass it
> to a non interrupt context and put it into the buffer.  This was not
> done correctly in the original implementation.  Specifically, the mutex
> lock was not being held.  

Ok, I see.

However, I'm pretty sure that the switch notification does not get
called from an atomic context, so you don't need a workqueue for
bringing that into a process context. Doing the context switch
notification directly from the scheduler sounds much better regarding
the impact on the measurement.

> > Never put extern statements in the implementation, they describe the
> > interface between two parts of the code and should be inside of a
> > common header.
> > 
> > Why do you want to have your own workqueue instead of using the
> > global one?
> 
> It is important that the data get context switch data get recorded as
> quickly as possible to avoid dropping data unnecessarily.  The PC
> counter data for each SPU is ignored until the context switch record is
> put into the kernel/daemon buffer.  The API documentation says that
> using a private workqueue has better performance then using the global
> workqueue.  There is a comment in the code about this, perhaps it is not
> clear enough.

This sounds like an unrelated bug in the implementation. The PC
data should *not* be ignored in any case. As long as the records
get stored in the right order, everything should be fine here.


> > This looks like you want to use a delayed_work rather than building your
> > own out of hrtimer and work. Is there any point why you want to use
> > an hrtimer?
> 
> The current implementation uses the hrtimer to schedule when to read the
> trace buffer the next time.  This patch does not change how the
> scheduling of the buffer reads is done.  Yes, you could change the
> implementation to use workqueues instead.  If you feel that it is better
> to use the workqueue then we could make that change.  Not sure that
> making that change in this bug fix patch is appropriate.  I would need
> to create a second patch for that change.

I would guess that the change from hrtimer to delayed_workqueue is
smaller than the current patch changing from hrtimer to hrtimer plus
workqueue, so I would prefer to have only one changeset.

Since the timer only causes statistical data collection anyway, delaying
it a bit should not have any negative effect on the accuracy of the
measurement, unlike delaying the context switch notification.

> > > -static DEFINE_SPINLOCK(buffer_lock);
> > > +extern struct mutex buffer_mutex;
> > > +extern struct workqueue_struct *oprofile_spu_wq;
> > > +extern int calls_to_record_switch;
> > > +
> > 
> > Again, public interfaces need to go to a header file, and should
> > have a name that identifies the interface. "buffer_mutex" is
> > certainly not a suitable name for a kernel-wide global variable!
> 
> As stated earlier, the generic OProfile code defines the variable
> "buffer_mutex".  Changing the name in the generic OProfile code is
> beyond the scope of this patch.

Ok, didn't see that the name was already part of the main oprofile
driver. However, this makes it even worse: you are accessing data
structures that are clearly not meant to be shared with architecture
code. The fact that it was not declared in a global header file should
have told you that.

I think you should instead add a function to drivers/oprofile/buffer_sync.c
that takes care of moving the data to the common buffer under the right
mutex_lock.

> > 
> > >  static DEFINE_SPINLOCK(cache_lock);
> > >  static int num_spu_nodes;
> > > +
> > >  int spu_prof_num_nodes;
> > >  int last_guard_val[MAX_NUMNODES * 8];
> > > +int cnt_swtch_processed_flag[MAX_NUMNODES * 8];
> > > +
> > > +struct spus_profiling_code_data_s {
> > > +	int num_spu_nodes;
> > > +	struct work_struct spu_prof_code_wq;
> > > +} spus_profiling_code_data;
> > > +
> > > +struct spu_context_switch_data_s {
> > > +	struct spu *spu;
> > > +	unsigned long spu_cookie;
> > > +	unsigned long app_dcookie;
> > > +	unsigned int offset;
> > > +	unsigned long objectId;
> > > +	int valid_entry;
> > > +} spu_context_switch_data;
> > 
> > I don't understand what these variables are really doing, but
> > having e.g. just one spu_context_switch_data for all the SPUs
> > doesn't seem to make much sense. What happens when two SPUs do
> > a context switch at the same time?
> 
> This is the data same data that was being put into the event buffer
> directly from the interrupt context.  We need to store the data that is
> only available in the interrupt context so the same data can be put into
> the buffer by the work queue function in the non interrupt context.
> This is the declaration of the data needed per SPU.  Below in the
> spu_cntx_sw_data structure, we declare an array of entries so we can
> store the switch data on a per SPU basis as you alluded to.  

The spu_context_switch_data and spus_profiling_code_data variables
are also unused, or just write-only. They look like they were left
over after a conversion from a typedef.

> The calls_to_record_switch variable is not used, my mistake for not
> getting it out of the patch.  The record_spu_stat_flag is used. It is
> set in the spu_sync_start when SPU profiling is started.  The first time
> the work function is called to record SPU context switches it sees the
> flag is set and writes the initial record to the daemon/kernel buffer
> stating that this is an SPU profile run not a PPU profile run.  The
> daemon needs to know this as it effects how the postprocessing is done.
> The initial record is only written once.  
> 
> The spus_context_sw_data structure has the array per SPU for all of the
> interrupt context data that was recorded and needs to be written to the
> kernel/daemon buffer.  

An ideal driver should not have *any* global variables at all, but store
all data in the (reference counted) objects it is dealing with, or
just on the stack while it's processing the data.

Storing the context switch information in a global breaks down as soon
as there are multiple context switches taking place for a single
SPU without the workqueue running in between, which is a very likely
scenario if you have high-priority tasks on the SPU.

> > >  /* Container for caching information about an active SPU task. */
> > >  struct cached_info {
> > > @@ -44,6 +73,8 @@ struct cached_info {
> > >  	struct kref cache_ref;
> > >  };
> > >  
> > > +struct workqueue_struct *oprofile_spu_wq;
> > > +
> > >  static struct cached_info *spu_info[MAX_NUMNODES * 8];
> > 
> > While you're cleaning this up, I guess the cached_info should
> > be moved into a pointer from struct spu as well, instead of
> > having this global variable here.
>
> This would be a functional change and it belongs in a functional change
> patch not in a bug fix patch.

The patch is already far bigger than a simple bug fix, but you're right
in that this part should be separate. Upon reading through the code
again, I noticed that the cached_info is thrown away on each context
switch and rebuilt, which I guess makes it impossible to really profile
the context switch code. In the initial design phase for spu oprofile, we
decided that the information should be cached in the spu_context, which
would not only be much cleaner but also avoid the performance problem.

Do you have any idea why this idea was dropped? Is it still work in
progress to get that functionality right?

> > I would guess that you need one work struct per SPU instead of a global
> > one, if you want to pass the SPU pointer as an argument.
> 
> We only need one work struct because we have an array that contains the
> data for each SPU that has done a context switch.  

right, but as I explained, the global array is the real problem that should
be fixed.

	Arnd <><

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [Cbe-oss-dev] [PATCH] Cell OProfile: SPU mutex lock fix
  2008-04-04  6:38       ` Arnd Bergmann
@ 2008-04-08 20:21         ` Carl Love
  2008-04-09  2:08           ` Arnd Bergmann
  0 siblings, 1 reply; 7+ messages in thread
From: Carl Love @ 2008-04-08 20:21 UTC (permalink / raw)
  To: Arnd Bergmann; +Cc: linuxppc-dev, cel, cbe-oss-dev, linux-kernel


On Fri, 2008-04-04 at 08:38 +0200, Arnd Bergmann wrote: 
> On Wednesday 02 April 2008, Carl Love wrote:
> > On Wed, 2008-04-02 at 07:21 +0200, Arnd Bergmann wrote:
> > > On Tuesday 25 March 2008, Carl Love wrote:
> > > > This patch fixes a bug in the code that records the SPU data and
> > > > context switches.  The buffer_mutex lock must be held when the
> > > > kernel is adding data to the buffer between the kernel and the
> > > > OProfile daemon.  The lock is not being held in the current code
> > > > base.  This patch fixes the bug using work queues.  The data to 
> > > > be passed to the daemon is caputured by the interrupt handler.  
> > > > The workqueue function is invoked to grab the buffer_mutex lock
> > > > and add the data to the buffer.  
> > > 
> > > So what was the exact bug you're fixing with this? There was no
> > > buffer_mutex before, so why do you need it now? Can't this be a
> > > spinlock so you can get it from interrupt context instead of
> > > using a workqueue?
> > 
> > The generic OProfile code defines a mutex lock, called buffer_mutex, to
> > protect the kernel/daemon data buffer from being writen by the kernal
> > and simultaneously read by the Daemon.  When adding a PPU sample the
> > oprofile routine  oprofile_add_ext_sample(pc, regs, i, is_kernel) is
> > called from the interrupt context to request the sample be stored.  The
> > generic oprofile code takes care of passing the data to a non interrupt
> > context where the mutex lock is held and the necessary sequence of data
> > is written into the kernel/daemon data buffer.  However, OProfile does
> > not have any built in functions for handling the SPU.  Hence, we have to
> > implement the code to capture the data in the interrupt context, pass it
> > to a non interrupt context and put it into the buffer.  This was not
> > done correctly in the original implementation.  Specifically, the mutex
> > lock was not being held.  
> 
> Ok, I see.
> 
> However, I'm pretty sure that the switch notification does not get
> called from an atomic context, so you don't need a workqueue for
> bringing that into a process context. Doing the context switch
> notification directly from the scheduler sounds much better regarding
> the impact on the measurement.

Our first thought to fix the bug was to just grab the mutex lock when
adding the switch notification data to the queue.  The kernel gives us
an oops message saying something along the line of "could not call mutex
lock in interrupt context".  Hence we had to go to work queues so we
could access the lock outside of the SPU switch notification context.

Secondly, it is my understanding that if the schedule_work() call tries
to queue the same work function multiple times the subsequent requests
are dropped.  Thus we were not able to pass the context switch data as
part of the schedule work requests.  This forced us to have an array to
store the data for each SPU.   
> 
> > > Never put extern statements in the implementation, they describe the
> > > interface between two parts of the code and should be inside of a
> > > common header.
> > > 
> > > Why do you want to have your own workqueue instead of using the
> > > global one?
> > 
> > It is important that the data get context switch data get recorded as
> > quickly as possible to avoid dropping data unnecessarily.  The PC
> > counter data for each SPU is ignored until the context switch record is
> > put into the kernel/daemon buffer.  The API documentation says that
> > using a private workqueue has better performance then using the global
> > workqueue.  There is a comment in the code about this, perhaps it is not
> > clear enough.
> 
> This sounds like an unrelated bug in the implementation. The PC
> data should *not* be ignored in any case. As long as the records
> get stored in the right order, everything should be fine here.

Until the OProfile sees the context switch record, it does not know what
to do with the PC samples and just drops them.  The thought was using a
private work queue might help get the context switch records processed a
little earlier.  It probably doesn't make that much difference.  I can
just use the generic work queue.  
> 
> 
> > > This looks like you want to use a delayed_work rather than building your
> > > own out of hrtimer and work. Is there any point why you want to use
> > > an hrtimer?
> > 
> > The current implementation uses the hrtimer to schedule when to read the
> > trace buffer the next time.  This patch does not change how the
> > scheduling of the buffer reads is done.  Yes, you could change the
> > implementation to use workqueues instead.  If you feel that it is better
> > to use the workqueue then we could make that change.  Not sure that
> > making that change in this bug fix patch is appropriate.  I would need
> > to create a second patch for that change.
> 
> I would guess that the change from hrtimer to delayed_workqueue is
> smaller than the current patch changing from hrtimer to hrtimer plus
> workqueue, so I would prefer to have only one changeset.
> 
> Since the timer only causes statistical data collection anyway, delaying
> it a bit should not have any negative effect on the accuracy of the
> measurement, unlike delaying the context switch notification.

The goal is to be able to support very high sampling rates (small
periods).  The schedule_delayed_work() is based on jiffies which I
believe is 1/250 for this machine.  This only gives millisecond
resolution.  The goal is for the users to be able to specify a time
period of 60,000 cycles or less then 20 micro second sampling periods
when the real high resolution timers are available.  We can't achieve
the desired sampling rates with the schedule_dealyed_work() function.

> 
> > > > -static DEFINE_SPINLOCK(buffer_lock);
> > > > +extern struct mutex buffer_mutex;
> > > > +extern struct workqueue_struct *oprofile_spu_wq;
> > > > +extern int calls_to_record_switch;
> > > > +
> > > 
> > > Again, public interfaces need to go to a header file, and should
> > > have a name that identifies the interface. "buffer_mutex" is
> > > certainly not a suitable name for a kernel-wide global variable!
> > 
> > As stated earlier, the generic OProfile code defines the variable
> > "buffer_mutex".  Changing the name in the generic OProfile code is
> > beyond the scope of this patch.
> 
> Ok, didn't see that the name was already part of the main oprofile
> driver. However, this makes it even worse: you are accessing data
> structures that are clearly not meant to be shared with architecture
> code. The fact that it was not declared in a global header file should
> have told you that.
> 
> I think you should instead add a function to drivers/oprofile/buffer_sync.c
> that takes care of moving the data to the common buffer under the right
> mutex_lock.


Oprofile provides nice clean interfaces for recording kernel/user
switches and CPU data recording.  This is all that was needed by any
architecture until CELL came along. With CELL, we now have need to add
processor data plus SPU data to the queue.  The buffer_mutex variable
and the add_event_entry() were not visible outside of the OProfile
driver code.  The original SPU support added add_event_entry() to the
include/linux/oprofile.h file.  We can add the buffer_mutex as well
since there is now a need to access both of these.  

I have been looking to see how I could create a generic oprofile routine
which could take the data.  The routine would still have to work from an
interrupt context, so it will need to store the data and call a work
queue function.  The function would need to know how much data will be
needed, thus you would probably need to statically allocate data or use
a list and malloc the data as needed.  I don't really want to have to
malloc data from an interrupt context.  List management adds additional
overhead.  It would be possible to have an init function that you could
call at startup time telling it how much memory you need, in this case
we could allocate a buffer the size of spu_info (defined below) at
startup time.  The call could pass an array to the OProfile routine that
would put the data into the buffer and call the work function.  We still
have to allocate the storage, it does clean up the arch specific code.
Not sure if this really buys us much.  There is more copying of data
i.e. more overhead.  Not convinced the OProfile maintainers would accept
anything I have thought of so far.

Any suggestions?

> 
> > > 
> > > >  static DEFINE_SPINLOCK(cache_lock);
> > > >  static int num_spu_nodes;
> > > > +
> > > >  int spu_prof_num_nodes;
> > > >  int last_guard_val[MAX_NUMNODES * 8];
> > > > +int cnt_swtch_processed_flag[MAX_NUMNODES * 8];
> > > > +
> > > > +struct spus_profiling_code_data_s {
> > > > +	int num_spu_nodes;
> > > > +	struct work_struct spu_prof_code_wq;
> > > > +} spus_profiling_code_data;
> > > > +
> > > > +struct spu_context_switch_data_s {
> > > > +	struct spu *spu;
> > > > +	unsigned long spu_cookie;
> > > > +	unsigned long app_dcookie;
> > > > +	unsigned int offset;
> > > > +	unsigned long objectId;
> > > > +	int valid_entry;
> > > > +} spu_context_switch_data;
> > > 
> > > I don't understand what these variables are really doing, but
> > > having e.g. just one spu_context_switch_data for all the SPUs
> > > doesn't seem to make much sense. What happens when two SPUs do
> > > a context switch at the same time?
> > 
> > This is the data same data that was being put into the event buffer
> > directly from the interrupt context.  We need to store the data that is
> > only available in the interrupt context so the same data can be put into
> > the buffer by the work queue function in the non interrupt context.
> > This is the declaration of the data needed per SPU.  Below in the
> > spu_cntx_sw_data structure, we declare an array of entries so we can
> > store the switch data on a per SPU basis as you alluded to.  
> 
> The spu_context_switch_data and spus_profiling_code_data variables
> are also unused, or just write-only. They look like they were left
> over after a conversion from a typedef.

The spu_context_switch_data structure is the type of the array used in
struct spus_cntxt_sw_data_s which is the data used by the work queue.

Yes, spus_profiling_code_data seems to be unused.

> 
> > The calls_to_record_switch variable is not used, my mistake for not
> > getting it out of the patch.  The record_spu_stat_flag is used. It is
> > set in the spu_sync_start when SPU profiling is started.  The first time
> > the work function is called to record SPU context switches it sees the
> > flag is set and writes the initial record to the daemon/kernel buffer
> > stating that this is an SPU profile run not a PPU profile run.  The
> > daemon needs to know this as it effects how the postprocessing is done.
> > The initial record is only written once.  
> > 
> > The spus_context_sw_data structure has the array per SPU for all of the
> > interrupt context data that was recorded and needs to be written to the
> > kernel/daemon buffer.  
> 
> An ideal driver should not have *any* global variables at all, but store
> all data in the (reference counted) objects it is dealing with, or
> just on the stack while it's processing the data.
> 
> Storing the context switch information in a global breaks down as soon
> as there are multiple context switches taking place for a single
> SPU without the workqueue running in between, which is a very likely
> scenario if you have high-priority tasks on the SPU.

Yes, it was recognized that we could overwrite data.  The expectation is
that the workqueue function will run fairly quickly.  If the SPU context
was only loaded for a very short period of time at most a few samples
would be captured for that frame so the error would be down in the
noise.  OProfile is a statistical tool.  If we don't get enough samples
for the spu context, then the data is not statistically valid.  Losing
the context switch is not an issue.  The context must be loaded long
enough that we collect a reasonable number of samples for the output to
be meaningful.  

> 
> > > >  /* Container for caching information about an active SPU task. */
> > > >  struct cached_info {
> > > > @@ -44,6 +73,8 @@ struct cached_info {
> > > >  	struct kref cache_ref;
> > > >  };
> > > >  
> > > > +struct workqueue_struct *oprofile_spu_wq;
> > > > +
> > > >  static struct cached_info *spu_info[MAX_NUMNODES * 8];
> > > 
> > > While you're cleaning this up, I guess the cached_info should
> > > be moved into a pointer from struct spu as well, instead of
> > > having this global variable here.
> >
> > This would be a functional change and it belongs in a functional change
> > patch not in a bug fix patch.
> 
> The patch is already far bigger than a simple bug fix, but you're right
> in that this part should be separate. Upon reading through the code
> again, I noticed that the cached_info is thrown away on each context
> switch and rebuilt, which I guess makes it impossible to really profile
> the context switch code. In the initial design phase for spu oprofile, we
> decided that the information should be cached in the spu_context, which
> would not only be much cleaner but also avoid the performance problem.
> 
> Do you have any idea why this idea was dropped? Is it still work in
> progress to get that functionality right?

It was not dropped.  It was implemented.  The issue that I have is the
dcookie spu_cookie, app_dcookie, offset, objectId is not included in the
local cahced spu context data.  Previously, there was no need to save it
since it was being immediatly written to oprofile buffer (without
holding the mutex lock).  Now we need to store the data until the
workqueue function runs.  We can store it in the array as I have done or
you could put it in the spu context.  Functionally, it doesn't change
anything.  The data in the SPU context would get overwritten, just as it
does in the array, if there was an SPU context switch before the
workqueue function runs so that doesn't solve that issue.

> 
> > > I would guess that you need one work struct per SPU instead of a global
> > > one, if you want to pass the SPU pointer as an argument.
> > 
> > We only need one work struct because we have an array that contains the
> > data for each SPU that has done a context switch.  
> 
> right, but as I explained, the global array is the real problem that should
> be fixed.
> 
> 	Arnd <><

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [Cbe-oss-dev] [PATCH] Cell OProfile: SPU mutex lock fix
  2008-04-08 20:21         ` Carl Love
@ 2008-04-09  2:08           ` Arnd Bergmann
  0 siblings, 0 replies; 7+ messages in thread
From: Arnd Bergmann @ 2008-04-09  2:08 UTC (permalink / raw)
  To: cbe-oss-dev; +Cc: linuxppc-dev, cel, linux-kernel, Carl Love

On Tuesday 08 April 2008, Carl Love wrote:
> On Fri, 2008-04-04 at 08:38 +0200, Arnd Bergmann wrote: 
> Our first thought to fix the bug was to just grab the mutex lock when
> adding the switch notification data to the queue.  The kernel gives us
> an oops message saying something along the line of "could not call mutex
> lock in interrupt context".  Hence we had to go to work queues so we
> could access the lock outside of the SPU switch notification context.

By the time the notifier is called, the kernel is certainly not
in an atomic context. Maybe you were nesting the mutex lock inside
of your own spinlock?

> Secondly, it is my understanding that if the schedule_work() call tries
> to queue the same work function multiple times the subsequent requests
> are dropped.  Thus we were not able to pass the context switch data as
> part of the schedule work requests.  This forced us to have an array to
> store the data for each SPU.   

The way that work queues are designed, you embed the struct work in the
data you pass down, so that you are guaranteed to execute the work struct
exactly once per data element and you don't need any global pointers.

> > I would guess that the change from hrtimer to delayed_workqueue is
> > smaller than the current patch changing from hrtimer to hrtimer plus
> > workqueue, so I would prefer to have only one changeset.
> > 
> > Since the timer only causes statistical data collection anyway, delaying
> > it a bit should not have any negative effect on the accuracy of the
> > measurement, unlike delaying the context switch notification.
> 
> The goal is to be able to support very high sampling rates (small
> periods).  The schedule_delayed_work() is based on jiffies which I
> believe is 1/250 for this machine.  This only gives millisecond
> resolution.  The goal is for the users to be able to specify a time
> period of 60,000 cycles or less then 20 micro second sampling periods
> when the real high resolution timers are available.  We can't achieve
> the desired sampling rates with the schedule_dealyed_work() function.

You actually can't get anywhere close to that resolution if you do your
data collection in a work queue, because under high load (which is what
the only time when measuring really gets interesting) the work queue
is likely to be delayed by a few jiffies!

If you rely on high resolution timer precision, you need to look
at the performance counters from inside the timer function, and deal
with the problem of the work queue not being called for a number of
timer events.

> 
> Oprofile provides nice clean interfaces for recording kernel/user
> switches and CPU data recording.  This is all that was needed by any
> architecture until CELL came along. With CELL, we now have need to add
> processor data plus SPU data to the queue.  The buffer_mutex variable
> and the add_event_entry() were not visible outside of the OProfile
> driver code.  The original SPU support added add_event_entry() to the
> include/linux/oprofile.h file.  We can add the buffer_mutex as well
> since there is now a need to access both of these.  
> 
> I have been looking to see how I could create a generic oprofile routine
> which could take the data.  The routine would still have to work from an
> interrupt context, so it will need to store the data and call a work
> queue function.  The function would need to know how much data will be
> needed, thus you would probably need to statically allocate data or use
> a list and malloc the data as needed.  I don't really want to have to
> malloc data from an interrupt context.  List management adds additional
> overhead.  It would be possible to have an init function that you could
> call at startup time telling it how much memory you need, in this case
> we could allocate a buffer the size of spu_info (defined below) at
> startup time.  The call could pass an array to the OProfile routine that
> would put the data into the buffer and call the work function.  We still
> have to allocate the storage, it does clean up the arch specific code.
> Not sure if this really buys us much.  There is more copying of data
> i.e. more overhead.  Not convinced the OProfile maintainers would accept
> anything I have thought of so far.
> 
> Any suggestions?

My first attempt to do this would be to add this to the oprofile/cpu_buffer.c
infrastructure. Basically extend the add_sample() function to have
helpers you can call from the spu code to put entries into the per-cpu
buffer of the CPU that happens to execute the code at the time.

add_sample() can already be called from an atomic context since it uses
its own buffer, and it uses a clever ring buffer to get around the
need for locking against the event_buffer functions.
Only event_buffer needs the mutex, so it's best to leave that out
of the architecture code running at interrupt time altogether.

> > An ideal driver should not have *any* global variables at all, but store
> > all data in the (reference counted) objects it is dealing with, or
> > just on the stack while it's processing the data.
> > 
> > Storing the context switch information in a global breaks down as soon
> > as there are multiple context switches taking place for a single
> > SPU without the workqueue running in between, which is a very likely
> > scenario if you have high-priority tasks on the SPU.
> 
> Yes, it was recognized that we could overwrite data.  The expectation is
> that the workqueue function will run fairly quickly.  If the SPU context
> was only loaded for a very short period of time at most a few samples
> would be captured for that frame so the error would be down in the
> noise.  OProfile is a statistical tool.  If we don't get enough samples
> for the spu context, then the data is not statistically valid.  Losing
> the context switch is not an issue.  The context must be loaded long
> enough that we collect a reasonable number of samples for the output to
> be meaningful.  
 
Hmm, I wonder if you have a problem here when the context switch time
is not independent of the program you are analysing. Usually, context
switches happen when a program does a callback or syscall to the ppu
and the spu becomes idle. If you often lose events just after a context
switch, that means that the code that gets called after the context
is restored shows up in the profile as being called less often than
it actually is, right?

> > The patch is already far bigger than a simple bug fix, but you're right
> > in that this part should be separate. Upon reading through the code
> > again, I noticed that the cached_info is thrown away on each context
> > switch and rebuilt, which I guess makes it impossible to really profile
> > the context switch code. In the initial design phase for spu oprofile, we
> > decided that the information should be cached in the spu_context, which
> > would not only be much cleaner but also avoid the performance problem.
> > 
> > Do you have any idea why this idea was dropped? Is it still work in
> > progress to get that functionality right?
> 
> It was not dropped.  It was implemented.  The issue that I have is the
> dcookie spu_cookie, app_dcookie, offset, objectId is not included in the
> local cahced spu context data.  Previously, there was no need to save it
> since it was being immediatly written to oprofile buffer (without
> holding the mutex lock).  Now we need to store the data until the
> workqueue function runs.  We can store it in the array as I have done or
> you could put it in the spu context.  Functionally, it doesn't change
> anything.  The data in the SPU context would get overwritten, just as it
> does in the array, if there was an SPU context switch before the
> workqueue function runs so that doesn't solve that issue.

That's not what I meant. My point was that the information about the
location of overlays in the context remains constant over the contexts
life time. If you store it in the context, all the data will still
be there by the time you restore the context when it was switched
out and comes back.

	Arnd <><

^ permalink raw reply	[flat|nested] 7+ messages in thread

end of thread, other threads:[~2008-04-09  2:08 UTC | newest]

Thread overview: 7+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2008-03-25 15:58 [Cbe-oss-dev] [PATCH] Cell OProfile: SPU mutex lock fix Carl Love
2008-04-02  5:21 ` Arnd Bergmann
2008-04-02 16:42   ` Carl Love
2008-04-02 17:02     ` Carl Love
2008-04-04  6:38       ` Arnd Bergmann
2008-04-08 20:21         ` Carl Love
2008-04-09  2:08           ` Arnd Bergmann

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).