All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH v2] x86: Fixup IRQs when CPUs go down during shutdown
@ 2015-12-02 13:46 Ross Lagerwall
  2015-12-02 13:53 ` Andrew Cooper
  2015-12-02 14:02 ` Jan Beulich
  0 siblings, 2 replies; 6+ messages in thread
From: Ross Lagerwall @ 2015-12-02 13:46 UTC (permalink / raw)
  To: xen-devel; +Cc: Andrew Cooper, Jan Beulich, Ross Lagerwall

Commit fc0c3fa2ad5c ("x86/IO-APIC: fix setup of Xen internally used IRQs
(take 2)") introduced a regression on some hardware where Xen would hang
during shutdown, repeating the following message:
APIC error on CPU0: 08(08), Receive accept error

This appears to be because an interrupt (in this case from the serial
console) destined for a CPU other than the boot CPU is left unhandled so
an APIC error on CPU 0 is generated instead.

To fix this, before taking down the non-boot CPUs, call fixup_irqs()
with a CPU mask of only the boot CPU to reset the IRQ affinities
correctly.

Signed-off-by: Ross Lagerwall <ross.lagerwall@citrix.com>
---
 xen/arch/x86/irq.c        | 36 ++++++++++++++++++++++--------------
 xen/arch/x86/smp.c        |  8 ++++++++
 xen/arch/x86/smpboot.c    |  3 ++-
 xen/include/asm-x86/irq.h |  5 +++--
 4 files changed, 35 insertions(+), 17 deletions(-)

diff --git a/xen/arch/x86/irq.c b/xen/arch/x86/irq.c
index 5f515a0..28337a1 100644
--- a/xen/arch/x86/irq.c
+++ b/xen/arch/x86/irq.c
@@ -2328,14 +2328,12 @@ static int __init setup_dump_irqs(void)
 }
 __initcall(setup_dump_irqs);
 
-/* A cpu has been removed from cpu_online_mask.  Re-set irq affinities. */
-void fixup_irqs(void)
+/* CPU(s) have been removed from mask.  Re-set irq affinities. */
+void fixup_irqs(const cpumask_t *mask, bool_t verbose)
 {
-    unsigned int irq, sp;
+    unsigned int irq;
     static int warned;
     struct irq_desc *desc;
-    irq_guest_action_t *action;
-    struct pending_eoi *peoi;
 
     for ( irq = 0; irq < nr_irqs; irq++ )
     {
@@ -2355,21 +2353,20 @@ void fixup_irqs(void)
         vector = irq_to_vector(irq);
         if ( vector >= FIRST_HIPRIORITY_VECTOR &&
              vector <= LAST_HIPRIORITY_VECTOR )
-            cpumask_and(desc->arch.cpu_mask, desc->arch.cpu_mask,
-                        &cpu_online_map);
+            cpumask_and(desc->arch.cpu_mask, desc->arch.cpu_mask, mask);
 
         cpumask_copy(&affinity, desc->affinity);
-        if ( !desc->action || cpumask_subset(&affinity, &cpu_online_map) )
+        if ( !desc->action || cpumask_subset(&affinity, mask) )
         {
             spin_unlock(&desc->lock);
             continue;
         }
 
-        cpumask_and(&affinity, &affinity, &cpu_online_map);
+        cpumask_and(&affinity, &affinity, mask);
         if ( cpumask_empty(&affinity) )
         {
             break_affinity = 1;
-            cpumask_copy(&affinity, &cpu_online_map);
+            cpumask_copy(&affinity, mask);
         }
 
         if ( desc->handler->disable )
@@ -2385,16 +2382,27 @@ void fixup_irqs(void)
 
         spin_unlock(&desc->lock);
 
-        if ( break_affinity && set_affinity )
-            printk("Broke affinity for irq %i\n", irq);
-        else if ( !set_affinity )
-            printk("Cannot set affinity for irq %i\n", irq);
+        if ( verbose )
+        {
+            if ( break_affinity && set_affinity )
+                printk("Broke affinity for irq %i\n", irq);
+            else if ( !set_affinity )
+                printk("Cannot set affinity for irq %i\n", irq);
+        }
     }
 
     /* That doesn't seem sufficient.  Give it 1ms. */
     local_irq_enable();
     mdelay(1);
     local_irq_disable();
+}
+
+void fixup_eoi(void)
+{
+    unsigned int irq, sp;
+    struct irq_desc *desc;
+    irq_guest_action_t *action;
+    struct pending_eoi *peoi;
 
     /* Clean up cpu_eoi_map of every interrupt to exclude this CPU. */
     for ( irq = 0; irq < nr_irqs; irq++ )
diff --git a/xen/arch/x86/smp.c b/xen/arch/x86/smp.c
index 50ff6c2..4446769 100644
--- a/xen/arch/x86/smp.c
+++ b/xen/arch/x86/smp.c
@@ -286,6 +286,7 @@ void __stop_this_cpu(void)
 
 static void stop_this_cpu(void *dummy)
 {
+    fixup_eoi();
     __stop_this_cpu();
     for ( ; ; )
         halt();
@@ -298,6 +299,13 @@ static void stop_this_cpu(void *dummy)
 void smp_send_stop(void)
 {
     int timeout = 10;
+    cpumask_t online;
+
+    cpumask_clear(&online);
+    cpumask_set_cpu(smp_processor_id(), &online);
+    local_irq_disable();
+    fixup_irqs(&online, 0);
+    local_irq_enable();
 
     smp_call_function(stop_this_cpu, NULL, 0);
 
diff --git a/xen/arch/x86/smpboot.c b/xen/arch/x86/smpboot.c
index 5d48bac..1eb16cb 100644
--- a/xen/arch/x86/smpboot.c
+++ b/xen/arch/x86/smpboot.c
@@ -922,7 +922,8 @@ void __cpu_disable(void)
 
     /* It's now safe to remove this processor from the online map */
     cpumask_clear_cpu(cpu, &cpu_online_map);
-    fixup_irqs();
+    fixup_irqs(&cpu_online_map, 1);
+    fixup_eoi();
 
     if ( cpu_disable_scheduler(cpu) )
         BUG();
diff --git a/xen/include/asm-x86/irq.h b/xen/include/asm-x86/irq.h
index fcf37a3..6c8f968 100644
--- a/xen/include/asm-x86/irq.h
+++ b/xen/include/asm-x86/irq.h
@@ -148,8 +148,9 @@ int map_domain_emuirq_pirq(struct domain *d, int pirq, int irq);
 int unmap_domain_pirq_emuirq(struct domain *d, int pirq);
 bool_t hvm_domain_use_pirq(const struct domain *, const struct pirq *);
 
-/* A cpu has been removed from cpu_online_mask.  Re-set irq affinities. */
-void fixup_irqs(void);
+/* CPU(s) have been removed from mask.  Re-set irq affinities. */
+void fixup_irqs(const cpumask_t *mask, bool_t verbose);
+void fixup_eoi(void);
 
 int  init_irq_data(void);
 
-- 
2.4.3

^ permalink raw reply related	[flat|nested] 6+ messages in thread

* Re: [PATCH v2] x86: Fixup IRQs when CPUs go down during shutdown
  2015-12-02 13:46 [PATCH v2] x86: Fixup IRQs when CPUs go down during shutdown Ross Lagerwall
@ 2015-12-02 13:53 ` Andrew Cooper
  2015-12-02 14:02 ` Jan Beulich
  1 sibling, 0 replies; 6+ messages in thread
From: Andrew Cooper @ 2015-12-02 13:53 UTC (permalink / raw)
  To: Ross Lagerwall, xen-devel; +Cc: Jan Beulich

On 02/12/15 13:46, Ross Lagerwall wrote:
> @@ -298,6 +299,13 @@ static void stop_this_cpu(void *dummy)
>  void smp_send_stop(void)
>  {
>      int timeout = 10;
> +    cpumask_t online;
> +
> +    cpumask_clear(&online);
> +    cpumask_set_cpu(smp_processor_id(), &online);

You can avoid this intermediate variable with cpumask_of(smp_processor_id())

There are a set of pre-canned cpumasks with a single cpu set.

~Andrew

^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [PATCH v2] x86: Fixup IRQs when CPUs go down during shutdown
  2015-12-02 13:46 [PATCH v2] x86: Fixup IRQs when CPUs go down during shutdown Ross Lagerwall
  2015-12-02 13:53 ` Andrew Cooper
@ 2015-12-02 14:02 ` Jan Beulich
  2015-12-02 15:09   ` Ross Lagerwall
  1 sibling, 1 reply; 6+ messages in thread
From: Jan Beulich @ 2015-12-02 14:02 UTC (permalink / raw)
  To: Ross Lagerwall; +Cc: Andrew Cooper, xen-devel

>>> On 02.12.15 at 14:46, <ross.lagerwall@citrix.com> wrote:
> Commit fc0c3fa2ad5c ("x86/IO-APIC: fix setup of Xen internally used IRQs
> (take 2)") introduced a regression on some hardware where Xen would hang
> during shutdown, repeating the following message:
> APIC error on CPU0: 08(08), Receive accept error
> 
> This appears to be because an interrupt (in this case from the serial
> console) destined for a CPU other than the boot CPU is left unhandled so
> an APIC error on CPU 0 is generated instead.
> 
> To fix this, before taking down the non-boot CPUs, call fixup_irqs()
> with a CPU mask of only the boot CPU to reset the IRQ affinities
> correctly.
> 
> Signed-off-by: Ross Lagerwall <ross.lagerwall@citrix.com>
> ---

Even though in this case interested people may know, missing info
on changes from previous version here.

> +/* CPU(s) have been removed from mask.  Re-set irq affinities. */
> +void fixup_irqs(const cpumask_t *mask, bool_t verbose)

The comment doesn't match reality. And I wonder whether it
wouldn't be reasonable to imply "verbose" (either from mask
equaling &cpu_online_map, or by introducing SYS_STATE_shutdown
and/or SYS_STATE_reboot).

> @@ -2385,16 +2382,27 @@ void fixup_irqs(void)
>  
>          spin_unlock(&desc->lock);
>  
> -        if ( break_affinity && set_affinity )
> -            printk("Broke affinity for irq %i\n", irq);
> -        else if ( !set_affinity )
> -            printk("Cannot set affinity for irq %i\n", irq);
> +        if ( verbose )
> +        {
> +            if ( break_affinity && set_affinity )
> +                printk("Broke affinity for irq %i\n", irq);
> +            else if ( !set_affinity )
> +                printk("Cannot set affinity for irq %i\n", irq);
> +        }

How about

        if ( !verbose )
           continue;

limiting churn on code?

> --- a/xen/arch/x86/smp.c
> +++ b/xen/arch/x86/smp.c
> @@ -286,6 +286,7 @@ void __stop_this_cpu(void)
>  
>  static void stop_this_cpu(void *dummy)
>  {
> +    fixup_eoi();
>      __stop_this_cpu();

Is this really needed during shutdown?

> @@ -298,6 +299,13 @@ static void stop_this_cpu(void *dummy)
>  void smp_send_stop(void)
>  {
>      int timeout = 10;
> +    cpumask_t online;
> +
> +    cpumask_clear(&online);
> +    cpumask_set_cpu(smp_processor_id(), &online);

That's what we have cpumask_of() for.

Jan

^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [PATCH v2] x86: Fixup IRQs when CPUs go down during shutdown
  2015-12-02 14:02 ` Jan Beulich
@ 2015-12-02 15:09   ` Ross Lagerwall
  2015-12-02 15:30     ` Jan Beulich
  0 siblings, 1 reply; 6+ messages in thread
From: Ross Lagerwall @ 2015-12-02 15:09 UTC (permalink / raw)
  To: Jan Beulich; +Cc: Andrew Cooper, xen-devel

On 12/02/2015 02:02 PM, Jan Beulich wrote:
>>>> On 02.12.15 at 14:46, <ross.lagerwall@citrix.com> wrote:
>> Commit fc0c3fa2ad5c ("x86/IO-APIC: fix setup of Xen internally used IRQs
>> (take 2)") introduced a regression on some hardware where Xen would hang
>> during shutdown, repeating the following message:
>> APIC error on CPU0: 08(08), Receive accept error
>>
>> This appears to be because an interrupt (in this case from the serial
>> console) destined for a CPU other than the boot CPU is left unhandled so
>> an APIC error on CPU 0 is generated instead.
>>
>> To fix this, before taking down the non-boot CPUs, call fixup_irqs()
>> with a CPU mask of only the boot CPU to reset the IRQ affinities
>> correctly.
>>
>> Signed-off-by: Ross Lagerwall <ross.lagerwall@citrix.com>
>> ---
>
> Even though in this case interested people may know, missing info
> on changes from previous version here.
>
>> +/* CPU(s) have been removed from mask.  Re-set irq affinities. */
>> +void fixup_irqs(const cpumask_t *mask, bool_t verbose)
>
> The comment doesn't match reality. And I wonder whether it
> wouldn't be reasonable to imply "verbose" (either from mask
> equaling &cpu_online_map, or by introducing SYS_STATE_shutdown
> and/or SYS_STATE_reboot).

I considered introducing a new SYS_STATE but decided that a function 
parameter was clearer rather than implying it from some other global state.

>
>> @@ -2385,16 +2382,27 @@ void fixup_irqs(void)
>>
>>           spin_unlock(&desc->lock);
>>
>> -        if ( break_affinity && set_affinity )
>> -            printk("Broke affinity for irq %i\n", irq);
>> -        else if ( !set_affinity )
>> -            printk("Cannot set affinity for irq %i\n", irq);
>> +        if ( verbose )
>> +        {
>> +            if ( break_affinity && set_affinity )
>> +                printk("Broke affinity for irq %i\n", irq);
>> +            else if ( !set_affinity )
>> +                printk("Cannot set affinity for irq %i\n", irq);
>> +        }
>
> How about
>
>          if ( !verbose )
>             continue;
>
> limiting churn on code?

OK.

>
>> --- a/xen/arch/x86/smp.c
>> +++ b/xen/arch/x86/smp.c
>> @@ -286,6 +286,7 @@ void __stop_this_cpu(void)
>>
>>   static void stop_this_cpu(void *dummy)
>>   {
>> +    fixup_eoi();
>>       __stop_this_cpu();
>
> Is this really needed during shutdown?

Possibly not, but I think it's cleaner to do the same as what is used 
for CPU down.

>
>> @@ -298,6 +299,13 @@ static void stop_this_cpu(void *dummy)
>>   void smp_send_stop(void)
>>   {
>>       int timeout = 10;
>> +    cpumask_t online;
>> +
>> +    cpumask_clear(&online);
>> +    cpumask_set_cpu(smp_processor_id(), &online);
>
> That's what we have cpumask_of() for.
>

OK.

-- 
Ross Lagerwall

^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [PATCH v2] x86: Fixup IRQs when CPUs go down during shutdown
  2015-12-02 15:09   ` Ross Lagerwall
@ 2015-12-02 15:30     ` Jan Beulich
  2015-12-04 10:46       ` Andrew Cooper
  0 siblings, 1 reply; 6+ messages in thread
From: Jan Beulich @ 2015-12-02 15:30 UTC (permalink / raw)
  To: Andrew Cooper, Ross Lagerwall; +Cc: xen-devel

>>> On 02.12.15 at 16:09, <ross.lagerwall@citrix.com> wrote:
> On 12/02/2015 02:02 PM, Jan Beulich wrote:
>>>>> On 02.12.15 at 14:46, <ross.lagerwall@citrix.com> wrote:
>>> --- a/xen/arch/x86/smp.c
>>> +++ b/xen/arch/x86/smp.c
>>> @@ -286,6 +286,7 @@ void __stop_this_cpu(void)
>>>
>>>   static void stop_this_cpu(void *dummy)
>>>   {
>>> +    fixup_eoi();
>>>       __stop_this_cpu();
>>
>> Is this really needed during shutdown?
> 
> Possibly not, but I think it's cleaner to do the same as what is used 
> for CPU down.

I'm not convinced. Andrew?

Jan

^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [PATCH v2] x86: Fixup IRQs when CPUs go down during shutdown
  2015-12-02 15:30     ` Jan Beulich
@ 2015-12-04 10:46       ` Andrew Cooper
  0 siblings, 0 replies; 6+ messages in thread
From: Andrew Cooper @ 2015-12-04 10:46 UTC (permalink / raw)
  To: Jan Beulich, Ross Lagerwall; +Cc: xen-devel

On 02/12/15 15:30, Jan Beulich wrote:
>>>> On 02.12.15 at 16:09, <ross.lagerwall@citrix.com> wrote:
>> On 12/02/2015 02:02 PM, Jan Beulich wrote:
>>>>>> On 02.12.15 at 14:46, <ross.lagerwall@citrix.com> wrote:
>>>> --- a/xen/arch/x86/smp.c
>>>> +++ b/xen/arch/x86/smp.c
>>>> @@ -286,6 +286,7 @@ void __stop_this_cpu(void)
>>>>
>>>>   static void stop_this_cpu(void *dummy)
>>>>   {
>>>> +    fixup_eoi();
>>>>       __stop_this_cpu();
>>> Is this really needed during shutdown?
>> Possibly not, but I think it's cleaner to do the same as what is used 
>> for CPU down.
> I'm not convinced. Andrew?

Suppose there is an oustanding line interrupt on the pending eoi stack. 
Without this fixup_eoi(), it could stay permanently attached to a cpu
which isn't processing anything further.

With the current use of smp_send_stop(), all cpus will end up in a state
where they can only be recovered with an #INIT, so I suppose it doesn't
actually matter.

Therefore, not performing redundant work is probably the best course of
action.

~Andrew

^ permalink raw reply	[flat|nested] 6+ messages in thread

end of thread, other threads:[~2015-12-04 10:46 UTC | newest]

Thread overview: 6+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2015-12-02 13:46 [PATCH v2] x86: Fixup IRQs when CPUs go down during shutdown Ross Lagerwall
2015-12-02 13:53 ` Andrew Cooper
2015-12-02 14:02 ` Jan Beulich
2015-12-02 15:09   ` Ross Lagerwall
2015-12-02 15:30     ` Jan Beulich
2015-12-04 10:46       ` Andrew Cooper

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.