linuxppc-dev.lists.ozlabs.org archive mirror
 help / color / mirror / Atom feed
* [RFC/PATCH] Fix rtas_ibm_suspend_me bugs
@ 2007-11-06  4:43 Nathan Lynch
  2007-11-06 23:08 ` jschopp
                   ` (2 more replies)
  0 siblings, 3 replies; 7+ messages in thread
From: Nathan Lynch @ 2007-11-06  4:43 UTC (permalink / raw)
  To: linuxppc-dev

(very rfc for now, no sign-off, needs more testing)

There are a couple of bugs in the rtas_ibm_suspend_me() and
rtas_percpu_suspend_me() functions:

1.  rtas_ibm_suspend_me() uses on_each_cpu() to invoke
rtas_percpu_suspend_me() via IPI:

if (on_each_cpu(rtas_percpu_suspend_me, &data, 1, 0))
...

'data' is on the stack, and rtas_ibm_suspend_me() takes no measures to
ensure that all instances of rtas_percpu_suspend_me() are finished
accessing 'data' before returning.  This can result in the IPI'd cpus
accessing random stack data and getting stuck in H_JOIN.

Fix this by moving rtas_suspend_me_data off the stack, and protect it
with a mutex.  Use a completion to ensure that all cpus are done
accessing the data before unlocking.

2.  rtas_percpu_suspend_me passes the Linux logical cpu id to the
H_PROD hypervisor method, when it should be using the platform
interrupt server value for that cpu (hard_smp_processor_id).  In
practice, this probably causes problems only on partitions where
processors have been removed and added in a particular order.

---
 arch/powerpc/kernel/rtas.c |   64 ++++++++++++++++++++++++++++++++-----------
 1 files changed, 47 insertions(+), 17 deletions(-)

diff --git a/arch/powerpc/kernel/rtas.c b/arch/powerpc/kernel/rtas.c
index 2147807..24faaea 100644
--- a/arch/powerpc/kernel/rtas.c
+++ b/arch/powerpc/kernel/rtas.c
@@ -19,6 +19,9 @@
 #include <linux/init.h>
 #include <linux/capability.h>
 #include <linux/delay.h>
+#include <linux/mutex.h>
+#include <linux/completion.h>
+#include <linux/smp.h>
 
 #include <asm/prom.h>
 #include <asm/rtas.h>
@@ -34,6 +37,7 @@
 #include <asm/lmb.h>
 #include <asm/udbg.h>
 #include <asm/syscalls.h>
+#include <asm/atomic.h>
 
 struct rtas_t rtas = {
 	.lock = SPIN_LOCK_UNLOCKED
@@ -41,10 +45,21 @@ struct rtas_t rtas = {
 EXPORT_SYMBOL(rtas);
 
 struct rtas_suspend_me_data {
+	atomic_t working; /* number of cpus accessing rtas_suspend_me_data */
 	long waiting;
 	struct rtas_args *args;
+	struct completion done; /* wait on this until working == 0 */
 };
 
+static void rtas_suspend_me_data_init(struct rtas_suspend_me_data *rsmd,
+				      struct rtas_args *args)
+{
+	atomic_set(&rsmd->working, 0);
+	rsmd->waiting = 1;
+	rsmd->args = args;
+	init_completion(&rsmd->done);
+}
+
 DEFINE_SPINLOCK(rtas_data_buf_lock);
 EXPORT_SYMBOL(rtas_data_buf_lock);
 
@@ -671,36 +686,49 @@ static void rtas_percpu_suspend_me(void *info)
 	 * we set it to <0.
 	 */
 	local_irq_save(flags);
+	atomic_inc(&data->working);
 	do {
 		rc = plpar_hcall_norets(H_JOIN);
 		smp_rmb();
 	} while (rc == H_SUCCESS && data->waiting > 0);
 	if (rc == H_SUCCESS)
+		/* join is complete (or there was an error) and this
+		 * cpu was prodded
+		 */
 		goto out;
 
 	if (rc == H_CONTINUE) {
+		/* this cpu does the join */
 		data->waiting = 0;
 		data->args->args[data->args->nargs] =
 			rtas_call(ibm_suspend_me_token, 0, 1, NULL);
-		for_each_possible_cpu(i)
-			plpar_hcall_norets(H_PROD,i);
 	} else {
 		data->waiting = -EBUSY;
 		printk(KERN_ERR "Error on H_JOIN hypervisor call\n");
 	}
 
+	/* This cpu did the join or got an error, so we need to prod
+	 * everyone else.  Extra prods are harmless.
+	 */
+	for_each_online_cpu(i)
+		plpar_hcall_norets(H_PROD, get_hard_smp_processor_id(i));
+
 out:
+	if (atomic_dec_return(&data->working) == 0)
+		complete(&data->done);
 	local_irq_restore(flags);
 	return;
 }
 
+static DEFINE_MUTEX(rsm_lock); /* protects rsm_data */
+static struct rtas_suspend_me_data rsm_data;
+
 static int rtas_ibm_suspend_me(struct rtas_args *args)
 {
-	int i;
+	int err;
 	long state;
 	long rc;
 	unsigned long retbuf[PLPAR_HCALL_BUFSIZE];
-	struct rtas_suspend_me_data data;
 
 	/* Make sure the state is valid */
 	rc = plpar_hcall(H_VASI_STATE, retbuf,
@@ -721,25 +749,27 @@ static int rtas_ibm_suspend_me(struct rtas_args *args)
 		return 0;
 	}
 
-	data.waiting = 1;
-	data.args = args;
+	mutex_lock(&rsm_lock);
+
+	rtas_suspend_me_data_init(&rsm_data, args);
 
-	/* Call function on all CPUs.  One of us will make the
-	 * rtas call
+	/* Call function on all CPUs.  One of us (but not necessarily
+	 * this one) will make the ibm,suspend-me call.
 	 */
-	if (on_each_cpu(rtas_percpu_suspend_me, &data, 1, 0))
-		data.waiting = -EINVAL;
+	if (on_each_cpu(rtas_percpu_suspend_me, &rsm_data, 1, 0))
+		rsm_data.waiting = -EINVAL;
+
+	/* Must wait for all IPIs to complete before unlocking */
+	wait_for_completion(&rsm_data.done);
 
-	if (data.waiting != 0)
+	if (rsm_data.waiting != 0)
 		printk(KERN_ERR "Error doing global join\n");
 
-	/* Prod each CPU.  This won't hurt, and will wake
-	 * anyone we successfully put to sleep with H_JOIN.
-	 */
-	for_each_possible_cpu(i)
-		plpar_hcall_norets(H_PROD, i);
+	err = rsm_data.waiting;
+
+	mutex_unlock(&rsm_lock);
 
-	return data.waiting;
+	return err;
 }
 #else /* CONFIG_PPC_PSERIES */
 static int rtas_ibm_suspend_me(struct rtas_args *args)
-- 
1.5.3.4.56.ge720

^ permalink raw reply related	[flat|nested] 7+ messages in thread

* Re: [RFC/PATCH] Fix rtas_ibm_suspend_me bugs
  2007-11-06  4:43 [RFC/PATCH] Fix rtas_ibm_suspend_me bugs Nathan Lynch
@ 2007-11-06 23:08 ` jschopp
  2007-11-07 19:19 ` Nathan Lynch
  2007-11-09 20:44 ` [PATCH v2] fix multiple bugs in rtas_ibm_suspend_me code Nathan Lynch
  2 siblings, 0 replies; 7+ messages in thread
From: jschopp @ 2007-11-06 23:08 UTC (permalink / raw)
  To: Nathan Lynch; +Cc: linuxppc-dev

 > -		for_each_possible_cpu(i)
 > -			plpar_hcall_norets(H_PROD,i);

...

 > +	for_each_online_cpu(i)
 > +		plpar_hcall_norets(H_PROD, get_hard_smp_processor_id(i));

I assume this bit would be non-contriversial and could be sent up for 
immediate upstream inclusion.

Nathan Lynch wrote:
> (very rfc for now, no sign-off, needs more testing)
> 
> There are a couple of bugs in the rtas_ibm_suspend_me() and
> rtas_percpu_suspend_me() functions:
> 
> 1.  rtas_ibm_suspend_me() uses on_each_cpu() to invoke
> rtas_percpu_suspend_me() via IPI:
> 
> if (on_each_cpu(rtas_percpu_suspend_me, &data, 1, 0))
> ...
> 
> 'data' is on the stack, and rtas_ibm_suspend_me() takes no measures to
> ensure that all instances of rtas_percpu_suspend_me() are finished
> accessing 'data' before returning.  This can result in the IPI'd cpus
> accessing random stack data and getting stuck in H_JOIN.
> 
> Fix this by moving rtas_suspend_me_data off the stack, and protect it
> with a mutex.  Use a completion to ensure that all cpus are done
> accessing the data before unlocking.
> 
> 2.  rtas_percpu_suspend_me passes the Linux logical cpu id to the
> H_PROD hypervisor method, when it should be using the platform
> interrupt server value for that cpu (hard_smp_processor_id).  In
> practice, this probably causes problems only on partitions where
> processors have been removed and added in a particular order.
> 
> ---
>  arch/powerpc/kernel/rtas.c |   64 ++++++++++++++++++++++++++++++++-----------
>  1 files changed, 47 insertions(+), 17 deletions(-)
> 
> diff --git a/arch/powerpc/kernel/rtas.c b/arch/powerpc/kernel/rtas.c
> index 2147807..24faaea 100644
> --- a/arch/powerpc/kernel/rtas.c
> +++ b/arch/powerpc/kernel/rtas.c
> @@ -19,6 +19,9 @@
>  #include <linux/init.h>
>  #include <linux/capability.h>
>  #include <linux/delay.h>
> +#include <linux/mutex.h>
> +#include <linux/completion.h>
> +#include <linux/smp.h>
>  
>  #include <asm/prom.h>
>  #include <asm/rtas.h>
> @@ -34,6 +37,7 @@
>  #include <asm/lmb.h>
>  #include <asm/udbg.h>
>  #include <asm/syscalls.h>
> +#include <asm/atomic.h>
>  
>  struct rtas_t rtas = {
>  	.lock = SPIN_LOCK_UNLOCKED
> @@ -41,10 +45,21 @@ struct rtas_t rtas = {
>  EXPORT_SYMBOL(rtas);
>  
>  struct rtas_suspend_me_data {
> +	atomic_t working; /* number of cpus accessing rtas_suspend_me_data */
>  	long waiting;
>  	struct rtas_args *args;
> +	struct completion done; /* wait on this until working == 0 */
>  };
>  
> +static void rtas_suspend_me_data_init(struct rtas_suspend_me_data *rsmd,
> +				      struct rtas_args *args)
> +{
> +	atomic_set(&rsmd->working, 0);
> +	rsmd->waiting = 1;
> +	rsmd->args = args;
> +	init_completion(&rsmd->done);
> +}
> +
>  DEFINE_SPINLOCK(rtas_data_buf_lock);
>  EXPORT_SYMBOL(rtas_data_buf_lock);
>  
> @@ -671,36 +686,49 @@ static void rtas_percpu_suspend_me(void *info)
>  	 * we set it to <0.
>  	 */
>  	local_irq_save(flags);
> +	atomic_inc(&data->working);
>  	do {
>  		rc = plpar_hcall_norets(H_JOIN);
>  		smp_rmb();
>  	} while (rc == H_SUCCESS && data->waiting > 0);
>  	if (rc == H_SUCCESS)
> +		/* join is complete (or there was an error) and this
> +		 * cpu was prodded
> +		 */
>  		goto out;
>  
>  	if (rc == H_CONTINUE) {
> +		/* this cpu does the join */
>  		data->waiting = 0;
>  		data->args->args[data->args->nargs] =
>  			rtas_call(ibm_suspend_me_token, 0, 1, NULL);
> -		for_each_possible_cpu(i)
> -			plpar_hcall_norets(H_PROD,i);
>  	} else {
>  		data->waiting = -EBUSY;
>  		printk(KERN_ERR "Error on H_JOIN hypervisor call\n");
>  	}
>  
> +	/* This cpu did the join or got an error, so we need to prod
> +	 * everyone else.  Extra prods are harmless.
> +	 */
> +	for_each_online_cpu(i)
> +		plpar_hcall_norets(H_PROD, get_hard_smp_processor_id(i));
> +
>  out:
> +	if (atomic_dec_return(&data->working) == 0)
> +		complete(&data->done);
>  	local_irq_restore(flags);
>  	return;
>  }
>  
> +static DEFINE_MUTEX(rsm_lock); /* protects rsm_data */
> +static struct rtas_suspend_me_data rsm_data;
> +
>  static int rtas_ibm_suspend_me(struct rtas_args *args)
>  {
> -	int i;
> +	int err;
>  	long state;
>  	long rc;
>  	unsigned long retbuf[PLPAR_HCALL_BUFSIZE];
> -	struct rtas_suspend_me_data data;
>  
>  	/* Make sure the state is valid */
>  	rc = plpar_hcall(H_VASI_STATE, retbuf,
> @@ -721,25 +749,27 @@ static int rtas_ibm_suspend_me(struct rtas_args *args)
>  		return 0;
>  	}
>  
> -	data.waiting = 1;
> -	data.args = args;
> +	mutex_lock(&rsm_lock);
> +
> +	rtas_suspend_me_data_init(&rsm_data, args);
>  
> -	/* Call function on all CPUs.  One of us will make the
> -	 * rtas call
> +	/* Call function on all CPUs.  One of us (but not necessarily
> +	 * this one) will make the ibm,suspend-me call.
>  	 */
> -	if (on_each_cpu(rtas_percpu_suspend_me, &data, 1, 0))
> -		data.waiting = -EINVAL;
> +	if (on_each_cpu(rtas_percpu_suspend_me, &rsm_data, 1, 0))
> +		rsm_data.waiting = -EINVAL;
> +
> +	/* Must wait for all IPIs to complete before unlocking */
> +	wait_for_completion(&rsm_data.done);
>  
> -	if (data.waiting != 0)
> +	if (rsm_data.waiting != 0)
>  		printk(KERN_ERR "Error doing global join\n");
>  
> -	/* Prod each CPU.  This won't hurt, and will wake
> -	 * anyone we successfully put to sleep with H_JOIN.
> -	 */
> -	for_each_possible_cpu(i)
> -		plpar_hcall_norets(H_PROD, i);
> +	err = rsm_data.waiting;
> +
> +	mutex_unlock(&rsm_lock);
>  
> -	return data.waiting;
> +	return err;
>  }
>  #else /* CONFIG_PPC_PSERIES */
>  static int rtas_ibm_suspend_me(struct rtas_args *args)

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [RFC/PATCH] Fix rtas_ibm_suspend_me bugs
  2007-11-06  4:43 [RFC/PATCH] Fix rtas_ibm_suspend_me bugs Nathan Lynch
  2007-11-06 23:08 ` jschopp
@ 2007-11-07 19:19 ` Nathan Lynch
  2007-11-09 20:44 ` [PATCH v2] fix multiple bugs in rtas_ibm_suspend_me code Nathan Lynch
  2 siblings, 0 replies; 7+ messages in thread
From: Nathan Lynch @ 2007-11-07 19:19 UTC (permalink / raw)
  To: linuxppc-dev

Nathan Lynch wrote:
> (very rfc for now, no sign-off, needs more testing)
> 
> There are a couple of bugs in the rtas_ibm_suspend_me() and
> rtas_percpu_suspend_me() functions:
> 
> 1.  rtas_ibm_suspend_me() uses on_each_cpu() to invoke
> rtas_percpu_suspend_me() via IPI:
> 
> if (on_each_cpu(rtas_percpu_suspend_me, &data, 1, 0))
> ...
> 
> 'data' is on the stack, and rtas_ibm_suspend_me() takes no measures to
> ensure that all instances of rtas_percpu_suspend_me() are finished
> accessing 'data' before returning.  This can result in the IPI'd cpus
> accessing random stack data and getting stuck in H_JOIN.

Another possible issue is that H_JOIN requires MSR.EE to be off, but
lazy interrupt disabling could conceivably allow that constraint to be
violated if we end up doing H_JOIN on the cpu which calls on_each_cpu().
At least I think so...

^ permalink raw reply	[flat|nested] 7+ messages in thread

* [PATCH v2] fix multiple bugs in rtas_ibm_suspend_me code
  2007-11-06  4:43 [RFC/PATCH] Fix rtas_ibm_suspend_me bugs Nathan Lynch
  2007-11-06 23:08 ` jschopp
  2007-11-07 19:19 ` Nathan Lynch
@ 2007-11-09 20:44 ` Nathan Lynch
  2007-11-13  5:11   ` Paul Mackerras
  2 siblings, 1 reply; 7+ messages in thread
From: Nathan Lynch @ 2007-11-09 20:44 UTC (permalink / raw)
  To: linuxppc-dev; +Cc: Paul Mackerras

There are several issues with the rtas_ibm_suspend_me code, which
enables platform-assisted suspension of an LPAR for migration or
hibernation as covered in PAPR 2.2.

1.) rtas_ibm_suspend_me uses on_each_cpu() to invoke
rtas_percpu_suspend_me on all cpus via IPI:

if (on_each_cpu(rtas_percpu_suspend_me, &data, 1, 0))
...

'data' is on the calling task's stack, but rtas_ibm_suspend_me takes
no measures to ensure that all instances of rtas_percpu_suspend_me are
finished accessing 'data' before returning.  This can result in the
IPI'd cpus accessing random stack data and getting stuck in H_JOIN.

This is addressed by using an atomic count of workers and a completion
on the stack.

2.) rtas_percpu_suspend_me is needlessly calling H_JOIN in a loop.
The only event that can cause a cpu to return from H_JOIN is an H_PROD
from another cpu or a NMI/system reset.  Each cpu need call H_JOIN
only once per suspend operation.

Remove the loop and the now unnecessary 'waiting' state variable.

3.) H_JOIN must be called with MSR[EE] off, but lazy interrupt
disabling may cause the caller of rtas_ibm_suspend_me to call H_JOIN
with it on; the local_irq_disable() in on_each_cpu() is not
sufficient.

Fix this by explicitly saving the MSR and clearing the EE bit before
calling H_JOIN.

4.) H_PROD is being called with the Linux logical cpu number as the
parameter, not the platform interrupt server value.  (It's also being
called for all possible cpus, which is harmless, but unnecessary.)

This is fixed by calling H_PROD for each online cpu using
get_hard_smp_processor_id(cpu) for the argument.

Signed-off-by: Nathan Lynch <ntl@pobox.com>

---
 arch/powerpc/kernel/rtas.c |   94 +++++++++++++++++++++++++-------------------
 1 files changed, 53 insertions(+), 41 deletions(-)

Changes since v1:
- the completion is adequate for ensuring that all IPIs have
completed, so there's no need for a mutex and rtas_suspend_me_data
can be on the stack as before.

- add fix for hard-disabling interrupts before H_JOIN

- remove unnecessary H_JOIN loop and state machinery

diff --git a/arch/powerpc/kernel/rtas.c b/arch/powerpc/kernel/rtas.c
index 2147807..3afe771 100644
--- a/arch/powerpc/kernel/rtas.c
+++ b/arch/powerpc/kernel/rtas.c
@@ -41,8 +41,10 @@ struct rtas_t rtas = {
 EXPORT_SYMBOL(rtas);
 
 struct rtas_suspend_me_data {
-	long waiting;
-	struct rtas_args *args;
+	atomic_t working; /* number of cpus accessing this struct */
+	int token; /* ibm,suspend-me */
+	int error;
+	struct completion *complete; /* wait on this until working == 0 */
 };
 
 DEFINE_SPINLOCK(rtas_data_buf_lock);
@@ -657,50 +659,62 @@ static int ibm_suspend_me_token = RTAS_UNKNOWN_SERVICE;
 #ifdef CONFIG_PPC_PSERIES
 static void rtas_percpu_suspend_me(void *info)
 {
-	int i;
 	long rc;
-	long flags;
+	unsigned long msr_save;
+	int cpu;
 	struct rtas_suspend_me_data *data =
 		(struct rtas_suspend_me_data *)info;
 
-	/*
-	 * We use "waiting" to indicate our state.  As long
-	 * as it is >0, we are still trying to all join up.
-	 * If it goes to 0, we have successfully joined up and
-	 * one thread got H_CONTINUE.  If any error happens,
-	 * we set it to <0.
-	 */
-	local_irq_save(flags);
-	do {
-		rc = plpar_hcall_norets(H_JOIN);
-		smp_rmb();
-	} while (rc == H_SUCCESS && data->waiting > 0);
-	if (rc == H_SUCCESS)
-		goto out;
+	atomic_inc(&data->working);
+
+	/* really need to ensure MSR.EE is off for H_JOIN */
+	msr_save = mfmsr();
+	mtmsr(msr_save & ~(MSR_EE));
+
+	rc = plpar_hcall_norets(H_JOIN);
+
+	mtmsr(msr_save);
 
-	if (rc == H_CONTINUE) {
-		data->waiting = 0;
-		data->args->args[data->args->nargs] =
-			rtas_call(ibm_suspend_me_token, 0, 1, NULL);
-		for_each_possible_cpu(i)
-			plpar_hcall_norets(H_PROD,i);
+	if (rc == H_SUCCESS) {
+		/* This cpu was prodded and the suspend is complete. */
+		goto out;
+	} else if (rc == H_CONTINUE) {
+		/* All other cpus are in H_JOIN, this cpu does
+		 * the suspend.
+		 */
+		printk(KERN_DEBUG "calling ibm,suspend-me on cpu %i\n",
+		       smp_processor_id());
+		data->error = rtas_call(data->token, 0, 1, NULL);
+
+		if (data->error)
+			printk(KERN_DEBUG "ibm,suspend-me returned %d\n",
+			       data->error);
 	} else {
-		data->waiting = -EBUSY;
-		printk(KERN_ERR "Error on H_JOIN hypervisor call\n");
+		printk(KERN_ERR "H_JOIN on cpu %i failed with rc = %ld\n",
+		       smp_processor_id(), rc);
+		data->error = rc;
 	}
-
+	/* This cpu did the suspend or got an error; in either case,
+	 * we need to prod all other other cpus out of join state.
+	 * Extra prods are harmless.
+	 */
+	for_each_online_cpu(cpu)
+		plpar_hcall_norets(H_PROD, get_hard_smp_processor_id(cpu));
 out:
-	local_irq_restore(flags);
-	return;
+	if (atomic_dec_return(&data->working) == 0)
+		complete(data->complete);
 }
 
 static int rtas_ibm_suspend_me(struct rtas_args *args)
 {
-	int i;
 	long state;
 	long rc;
 	unsigned long retbuf[PLPAR_HCALL_BUFSIZE];
 	struct rtas_suspend_me_data data;
+	DECLARE_COMPLETION_ONSTACK(done);
+
+	if (!rtas_service_present("ibm,suspend-me"))
+		return -ENOSYS;
 
 	/* Make sure the state is valid */
 	rc = plpar_hcall(H_VASI_STATE, retbuf,
@@ -721,25 +735,23 @@ static int rtas_ibm_suspend_me(struct rtas_args *args)
 		return 0;
 	}
 
-	data.waiting = 1;
-	data.args = args;
+	atomic_set(&data.working, 0);
+	data.token = rtas_token("ibm,suspend-me");
+	data.error = 0;
+	data.complete = &done;
 
 	/* Call function on all CPUs.  One of us will make the
 	 * rtas call
 	 */
 	if (on_each_cpu(rtas_percpu_suspend_me, &data, 1, 0))
-		data.waiting = -EINVAL;
+		data.error = -EINVAL;
 
-	if (data.waiting != 0)
-		printk(KERN_ERR "Error doing global join\n");
+	wait_for_completion(&done);
 
-	/* Prod each CPU.  This won't hurt, and will wake
-	 * anyone we successfully put to sleep with H_JOIN.
-	 */
-	for_each_possible_cpu(i)
-		plpar_hcall_norets(H_PROD, i);
+	if (data.error != 0)
+		printk(KERN_ERR "Error doing global join\n");
 
-	return data.waiting;
+	return data.error;
 }
 #else /* CONFIG_PPC_PSERIES */
 static int rtas_ibm_suspend_me(struct rtas_args *args)
-- 
1.5.3.4.206.g58ba4

^ permalink raw reply related	[flat|nested] 7+ messages in thread

* Re: [PATCH v2] fix multiple bugs in rtas_ibm_suspend_me code
  2007-11-09 20:44 ` [PATCH v2] fix multiple bugs in rtas_ibm_suspend_me code Nathan Lynch
@ 2007-11-13  5:11   ` Paul Mackerras
  2007-11-13 16:15     ` [PATCH v3] " Nathan Lynch
  0 siblings, 1 reply; 7+ messages in thread
From: Paul Mackerras @ 2007-11-13  5:11 UTC (permalink / raw)
  To: Nathan Lynch; +Cc: linuxppc-dev

Nathan Lynch writes:

> There are several issues with the rtas_ibm_suspend_me code, which
> enables platform-assisted suspension of an LPAR for migration or
> hibernation as covered in PAPR 2.2.

On a uniprocessor configuration, with this patch I get:

  CC      arch/powerpc/kernel/rtas.o
/home/paulus/kernel/powerpc/arch/powerpc/kernel/rtas.c: In function ‘rtas_percpu_suspend_me’:
/home/paulus/kernel/powerpc/arch/powerpc/kernel/rtas.c:702: error: implicit declaration of function ‘get_hard_smp_processor_id’
make[2]: *** [arch/powerpc/kernel/rtas.o] Error 1

I think you need to #include <asm/smp.h> in rtas.c.

Paul.

^ permalink raw reply	[flat|nested] 7+ messages in thread

* [PATCH v3] fix multiple bugs in rtas_ibm_suspend_me code
  2007-11-13  5:11   ` Paul Mackerras
@ 2007-11-13 16:15     ` Nathan Lynch
  2007-11-13 16:25       ` Nathan Lynch
  0 siblings, 1 reply; 7+ messages in thread
From: Nathan Lynch @ 2007-11-13 16:15 UTC (permalink / raw)
  To: Paul Mackerras; +Cc: linuxppc-dev

There are several issues with the rtas_ibm_suspend_me code, which
enables platform-assisted suspension of an LPAR for migration or
hibernation as covered in PAPR 2.2.

1.) rtas_ibm_suspend_me uses on_each_cpu() to invoke
rtas_percpu_suspend_me on all cpus via IPI:

if (on_each_cpu(rtas_percpu_suspend_me, &data, 1, 0))
...

'data' is on the calling task's stack, but rtas_ibm_suspend_me takes
no measures to ensure that all instances of rtas_percpu_suspend_me are
finished accessing 'data' before returning.  This can result in the
IPI'd cpus accessing random stack data and getting stuck in H_JOIN.

This is addressed by using an atomic count of workers and a completion
on the stack.

2.) rtas_percpu_suspend_me is needlessly calling H_JOIN in a loop.
The only event that can cause a cpu to return from H_JOIN is an H_PROD
from another cpu or a NMI/system reset.  Each cpu need call H_JOIN
only once per suspend operation.

Remove the loop and the now unnecessary 'waiting' state variable.

3.) H_JOIN must be called with MSR[EE] off, but lazy interrupt
disabling may cause the caller of rtas_ibm_suspend_me to call H_JOIN
with it on; the local_irq_disable() in on_each_cpu() is not
sufficient.

Fix this by explicitly saving the MSR and clearing the EE bit before
calling H_JOIN.

4.) H_PROD is being called with the Linux logical cpu number as the
parameter, not the platform interrupt server value.  (It's also being
called for all possible cpus, which is harmless, but unnecessary.)

This is fixed by calling H_PROD for each online cpu using
get_hard_smp_processor_id(cpu) for the argument.

Signed-off-by: Nathan Lynch <ntl@pobox.com>

---

Paul Mackerras wrote:
>
> On a uniprocessor configuration, with this patch I get:
>
>   CC      arch/powerpc/kernel/rtas.o
> /home/paulus/kernel/powerpc/arch/powerpc/kernel/rtas.c: In function ‘rtas_percpu_suspend_me’:
> /home/paulus/kernel/powerpc/arch/powerpc/kernel/rtas.c:702: error: implicit declaration of function ‘get_hard_smp_processor_id
> make[2]: *** [arch/powerpc/kernel/rtas.o] Error 1
>
> I think you need to #include <asm/smp.h> in rtas.c.

Rather sloppy of me, sorry.

Changes since v2:
- Add appropriate #includes for APIs used, fixing SMP=n build

Changes since v1:
- the completion is adequate for ensuring that all IPIs have
completed, so there's no need for a mutex and rtas_suspend_me_data
can be on the stack as before.

- add fix for hard-disabling interrupts before H_JOIN

- remove unnecessary H_JOIN loop and state machinery


 arch/powerpc/kernel/rtas.c |   99 ++++++++++++++++++++++++++------------------
 1 files changed, 58 insertions(+), 41 deletions(-)

diff --git a/arch/powerpc/kernel/rtas.c b/arch/powerpc/kernel/rtas.c
index 2147807..52e95c2 100644
--- a/arch/powerpc/kernel/rtas.c
+++ b/arch/powerpc/kernel/rtas.c
@@ -19,6 +19,9 @@
 #include <linux/init.h>
 #include <linux/capability.h>
 #include <linux/delay.h>
+#include <linux/smp.h>
+#include <linux/completion.h>
+#include <linux/cpumask.h>

 #include <asm/prom.h>
 #include <asm/rtas.h>
@@ -34,6 +37,8 @@
 #include <asm/lmb.h>
 #include <asm/udbg.h>
 #include <asm/syscalls.h>
+#include <asm/smp.h>
+#include <asm/atomic.h>

 struct rtas_t rtas = {
 	.lock = SPIN_LOCK_UNLOCKED
@@ -41,8 +46,10 @@ struct rtas_t rtas = {
 EXPORT_SYMBOL(rtas);

 struct rtas_suspend_me_data {
-	long waiting;
-	struct rtas_args *args;
+	atomic_t working; /* number of cpus accessing this struct */
+	int token; /* ibm,suspend-me */
+	int error;
+	struct completion *complete; /* wait on this until working == 0 */
 };

 DEFINE_SPINLOCK(rtas_data_buf_lock);
@@ -657,50 +664,62 @@ static int ibm_suspend_me_token = RTAS_UNKNOWN_SERVICE;
 #ifdef CONFIG_PPC_PSERIES
 static void rtas_percpu_suspend_me(void *info)
 {
-	int i;
 	long rc;
-	long flags;
+	unsigned long msr_save;
+	int cpu;
 	struct rtas_suspend_me_data *data =
 		(struct rtas_suspend_me_data *)info;

-	/*
-	 * We use "waiting" to indicate our state.  As long
-	 * as it is >0, we are still trying to all join up.
-	 * If it goes to 0, we have successfully joined up and
-	 * one thread got H_CONTINUE.  If any error happens,
-	 * we set it to <0.
-	 */
-	local_irq_save(flags);
-	do {
-		rc = plpar_hcall_norets(H_JOIN);
-		smp_rmb();
-	} while (rc == H_SUCCESS && data->waiting > 0);
-	if (rc == H_SUCCESS)
-		goto out;
+	atomic_inc(&data->working);
+
+	/* really need to ensure MSR.EE is off for H_JOIN */
+	msr_save = mfmsr();
+	mtmsr(msr_save & ~(MSR_EE));
+
+	rc = plpar_hcall_norets(H_JOIN);
+
+	mtmsr(msr_save);

-	if (rc == H_CONTINUE) {
-		data->waiting = 0;
-		data->args->args[data->args->nargs] =
-			rtas_call(ibm_suspend_me_token, 0, 1, NULL);
-		for_each_possible_cpu(i)
-			plpar_hcall_norets(H_PROD,i);
+	if (rc == H_SUCCESS) {
+		/* This cpu was prodded and the suspend is complete. */
+		goto out;
+	} else if (rc == H_CONTINUE) {
+		/* All other cpus are in H_JOIN, this cpu does
+		 * the suspend.
+		 */
+		printk(KERN_DEBUG "calling ibm,suspend-me on cpu %i\n",
+		       smp_processor_id());
+		data->error = rtas_call(data->token, 0, 1, NULL);
+
+		if (data->error)
+			printk(KERN_DEBUG "ibm,suspend-me returned %d\n",
+			       data->error);
 	} else {
-		data->waiting = -EBUSY;
-		printk(KERN_ERR "Error on H_JOIN hypervisor call\n");
+		printk(KERN_ERR "H_JOIN on cpu %i failed with rc = %ld\n",
+		       smp_processor_id(), rc);
+		data->error = rc;
 	}
-
+	/* This cpu did the suspend or got an error; in either case,
+	 * we need to prod all other other cpus out of join state.
+	 * Extra prods are harmless.
+	 */
+	for_each_online_cpu(cpu)
+		plpar_hcall_norets(H_PROD, get_hard_smp_processor_id(cpu));
 out:
-	local_irq_restore(flags);
-	return;
+	if (atomic_dec_return(&data->working) == 0)
+		complete(data->complete);
 }

 static int rtas_ibm_suspend_me(struct rtas_args *args)
 {
-	int i;
 	long state;
 	long rc;
 	unsigned long retbuf[PLPAR_HCALL_BUFSIZE];
 	struct rtas_suspend_me_data data;
+	DECLARE_COMPLETION_ONSTACK(done);
+
+	if (!rtas_service_present("ibm,suspend-me"))
+		return -ENOSYS;

 	/* Make sure the state is valid */
 	rc = plpar_hcall(H_VASI_STATE, retbuf,
@@ -721,25 +740,23 @@ static int rtas_ibm_suspend_me(struct rtas_args *args)
 		return 0;
 	}

-	data.waiting = 1;
-	data.args = args;
+	atomic_set(&data.working, 0);
+	data.token = rtas_token("ibm,suspend-me");
+	data.error = 0;
+	data.complete = &done;

 	/* Call function on all CPUs.  One of us will make the
 	 * rtas call
 	 */
 	if (on_each_cpu(rtas_percpu_suspend_me, &data, 1, 0))
-		data.waiting = -EINVAL;
+		data.error = -EINVAL;

-	if (data.waiting != 0)
-		printk(KERN_ERR "Error doing global join\n");
+	wait_for_completion(&done);

-	/* Prod each CPU.  This won't hurt, and will wake
-	 * anyone we successfully put to sleep with H_JOIN.
-	 */
-	for_each_possible_cpu(i)
-		plpar_hcall_norets(H_PROD, i);
+	if (data.error != 0)
+		printk(KERN_ERR "Error doing global join\n");

-	return data.waiting;
+	return data.error;
 }
 #else /* CONFIG_PPC_PSERIES */
 static int rtas_ibm_suspend_me(struct rtas_args *args)
--
1.5.3.4.206.g58ba4

^ permalink raw reply related	[flat|nested] 7+ messages in thread

* Re: [PATCH v3] fix multiple bugs in rtas_ibm_suspend_me code
  2007-11-13 16:15     ` [PATCH v3] " Nathan Lynch
@ 2007-11-13 16:25       ` Nathan Lynch
  0 siblings, 0 replies; 7+ messages in thread
From: Nathan Lynch @ 2007-11-13 16:25 UTC (permalink / raw)
  To: Paul Mackerras; +Cc: linuxppc-dev

Nathan Lynch wrote:
> 
> 3.) H_JOIN must be called with MSR[EE] off, but lazy interrupt
> disabling may cause the caller of rtas_ibm_suspend_me to call H_JOIN
> with it on; the local_irq_disable() in on_each_cpu() is not
> sufficient.
> 
> Fix this by explicitly saving the MSR and clearing the EE bit before
> calling H_JOIN.

...

> +	atomic_inc(&data->working);
> +
> +	/* really need to ensure MSR.EE is off for H_JOIN */
> +	msr_save = mfmsr();
> +	mtmsr(msr_save & ~(MSR_EE));
> +
> +	rc = plpar_hcall_norets(H_JOIN);
> +
> +	mtmsr(msr_save);

BTW, I'm wondering if this is the right way to do this.  I think
there's the possibility that we could enter this routine hard-enabled
and take take an interrupt between the mfmsr and the first mtmsr, but
I haven't worked out all the implications.  Would hard_irq_disable be
better?

^ permalink raw reply	[flat|nested] 7+ messages in thread

end of thread, other threads:[~2007-11-13 16:25 UTC | newest]

Thread overview: 7+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2007-11-06  4:43 [RFC/PATCH] Fix rtas_ibm_suspend_me bugs Nathan Lynch
2007-11-06 23:08 ` jschopp
2007-11-07 19:19 ` Nathan Lynch
2007-11-09 20:44 ` [PATCH v2] fix multiple bugs in rtas_ibm_suspend_me code Nathan Lynch
2007-11-13  5:11   ` Paul Mackerras
2007-11-13 16:15     ` [PATCH v3] " Nathan Lynch
2007-11-13 16:25       ` Nathan Lynch

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).