* [PATCH v2] x86/mce: fix failed to reenable cmci when swiching to interrupt mode
@ 2015-08-12 2:51 Xie XiuQi
2015-08-12 9:54 ` Borislav Petkov
0 siblings, 1 reply; 3+ messages in thread
From: Xie XiuQi @ 2015-08-12 2:51 UTC (permalink / raw)
To: tony.luck, bp, tglx, mingo, hpa
Cc: x86, linux-edac, linux-kernel, zhangliguang, rui.xiang,
huawei.libin
Zhang Liguang reported a bug as follows:
1) the system detects a CMCI storm on the current CPU
2) the CMCI interrupt is disabled on the banks owned by the current CPU, and the CPU switches to poll mode
3) a few minutes later, the system switches back to interrupt mode on the current CPU
4) we expect the system to re-enable the CMCI interrupt on the banks owned by the current CPU
cmci_intel_adjust_timer
|-> cmci_reenable
|-> cmci_discover # but owned banks are ignored here
> static void cmci_discover(int banks)
> ...
> for (i = 0; i < banks; i++) {
> ...
> if (test_bit(i, owned)) # owned banks are skipped here
> continue;
In this patch, we add a function, cmci_storm_set_cmci(), which just enables
or disables CMCI on the banks owned by the current CPU without clearing the owned flags.
Reported-by: Zhang Liguang <zhangliguang@huawei.com>
Cc: stable@vger.kernel.org # v3.15+
Signed-off-by: Xie XiuQi <xiexiuqi@huawei.com>
---
arch/x86/kernel/cpu/mcheck/mce_intel.c | 41 +++++++++++++++++++---------------
1 file changed, 23 insertions(+), 18 deletions(-)
diff --git a/arch/x86/kernel/cpu/mcheck/mce_intel.c b/arch/x86/kernel/cpu/mcheck/mce_intel.c
index 844f56c..a20e18b 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_intel.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_intel.c
@@ -146,6 +146,27 @@ void mce_intel_hcpu_update(unsigned long cpu)
per_cpu(cmci_storm_state, cpu) = CMCI_STORM_NONE;
}
+static void cmci_storm_set_cmci(bool on)
+{
+ unsigned long flags, *owned;
+ int bank;
+ u64 val;
+
+ raw_spin_lock_irqsave(&cmci_discover_lock, flags);
+ owned = this_cpu_ptr(mce_banks_owned);
+ for_each_set_bit(bank, owned, MAX_NR_BANKS) {
+ rdmsrl(MSR_IA32_MCx_CTL2(bank), val);
+
+ if (on)
+ val |= MCI_CTL2_CMCI_EN;
+ else
+ val &= ~MCI_CTL2_CMCI_EN;
+
+ wrmsrl(MSR_IA32_MCx_CTL2(bank), val);
+ }
+ raw_spin_unlock_irqrestore(&cmci_discover_lock, flags);
+}
+
unsigned long cmci_intel_adjust_timer(unsigned long interval)
{
if ((this_cpu_read(cmci_backoff_cnt) > 0) &&
@@ -175,7 +196,7 @@ unsigned long cmci_intel_adjust_timer(unsigned long interval)
*/
if (!atomic_read(&cmci_storm_on_cpus)) {
__this_cpu_write(cmci_storm_state, CMCI_STORM_NONE);
- cmci_reenable();
+ cmci_storm_set_cmci(true);
cmci_recheck();
}
return CMCI_POLL_INTERVAL;
@@ -186,22 +207,6 @@ unsigned long cmci_intel_adjust_timer(unsigned long interval)
}
}
-static void cmci_storm_disable_banks(void)
-{
- unsigned long flags, *owned;
- int bank;
- u64 val;
-
- raw_spin_lock_irqsave(&cmci_discover_lock, flags);
- owned = this_cpu_ptr(mce_banks_owned);
- for_each_set_bit(bank, owned, MAX_NR_BANKS) {
- rdmsrl(MSR_IA32_MCx_CTL2(bank), val);
- val &= ~MCI_CTL2_CMCI_EN;
- wrmsrl(MSR_IA32_MCx_CTL2(bank), val);
- }
- raw_spin_unlock_irqrestore(&cmci_discover_lock, flags);
-}
-
static bool cmci_storm_detect(void)
{
unsigned int cnt = __this_cpu_read(cmci_storm_cnt);
@@ -223,7 +228,7 @@ static bool cmci_storm_detect(void)
if (cnt <= CMCI_STORM_THRESHOLD)
return false;
- cmci_storm_disable_banks();
+ cmci_storm_set_cmci(false);
__this_cpu_write(cmci_storm_state, CMCI_STORM_ACTIVE);
r = atomic_add_return(1, &cmci_storm_on_cpus);
mce_timer_kick(CMCI_STORM_INTERVAL);
--
2.0.0
^ permalink raw reply related [flat|nested] 3+ messages in thread
* Re: [PATCH v2] x86/mce: fix failed to reenable cmci when swiching to interrupt mode
2015-08-12 2:51 [PATCH v2] x86/mce: fix failed to reenable cmci when swiching to interrupt mode Xie XiuQi
@ 2015-08-12 9:54 ` Borislav Petkov
2015-08-12 11:52 ` Xie XiuQi
0 siblings, 1 reply; 3+ messages in thread
From: Borislav Petkov @ 2015-08-12 9:54 UTC (permalink / raw)
To: Xie XiuQi
Cc: tony.luck, tglx, mingo, hpa, x86, linux-edac, linux-kernel,
zhangliguang, rui.xiang, huawei.libin
On Wed, Aug 12, 2015 at 10:51:11AM +0800, Xie XiuQi wrote:
> Zhang Liguang report a bug as bellow:
> 1) system detected cmci storm on current cpu
> 2) disable cmci interrupt on banks ownd by current cpu, then swiching to poll mode
> 3) a few minites later, system swiching to interrupt mode on current cpu
> 4) we expect system to reenable cmci interrupt on banks ownd by current cpu
> mce_intel_adjust_timer
> |-> cmci_reenable
> |-> cmci_discover # but, ownd banks is ignore here
>
> > static void cmci_discover(int banks)
> > ...
> > for (i = 0; i < banks; i++) {
> > ...
> > if (test_bit(i, owned)) # ownd banks is ignore here
> > continue;
>
> In this patch, we add a func cmci_storm_set_cmci(), just to enable or
Yeah, that's too many "cmci"'s in the name. Here's what I committed:
---
From: Xie XiuQi <xiexiuqi@huawei.com>
Date: Wed, 12 Aug 2015 10:51:11 +0800
Subject: [PATCH] x86/mce: Reenable CMCI banks when switching back to interrupt mode
Zhang Liguang reported the following issue:
1) System detects a CMCI storm on the current CPU.
2) Kernel disables the CMCI interrupt on banks owned by the current CPU and
switches to poll mode
3) After the CMCI storm subsides, kernel switches back to interrupt mode
4) We expect the system to reenable the CMCI interrupt on banks owned by
the current CPU
mce_intel_adjust_timer
|-> cmci_reenable
|-> cmci_discover # owned banks are ignored here
static void cmci_discover(int banks)
...
for (i = 0; i < banks; i++) {
...
if (test_bit(i, owned)) # owned banks are ignored here
continue;
So convert cmci_storm_disable_banks() to cmci_toggle_interrupt_mode()
which controls whether to enable or disable CMCI interrupts with its
argument.
NB: We cannot clear the owned bit because the banks won't be polled,
otherwise. See
27f6c573e0f7 ("x86, CMCI: Add proper detection of end of CMCI storms")
for more info.
Reported-by: Zhang Liguang <zhangliguang@huawei.com>
Signed-off-by: Xie XiuQi <xiexiuqi@huawei.com>
Cc: <stable@vger.kernel.org> # v3.15+
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: huawei.libin@huawei.com
Cc: Ingo Molnar <mingo@redhat.com>
Cc: linux-edac <linux-edac@vger.kernel.org>
Cc: rui.xiang@huawei.com
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Tony Luck <tony.luck@intel.com>
Cc: x86-ml <x86@kernel.org>
Link: http://lkml.kernel.org/r/1439347871-2702-1-git-send-email-xiexiuqi@huawei.com
Signed-off-by: Borislav Petkov <bp@suse.de>
---
arch/x86/kernel/cpu/mcheck/mce_intel.c | 41 +++++++++++++++++++---------------
1 file changed, 23 insertions(+), 18 deletions(-)
diff --git a/arch/x86/kernel/cpu/mcheck/mce_intel.c b/arch/x86/kernel/cpu/mcheck/mce_intel.c
index c5c003291861..1e8bb6c94f14 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_intel.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_intel.c
@@ -146,6 +146,27 @@ void mce_intel_hcpu_update(unsigned long cpu)
per_cpu(cmci_storm_state, cpu) = CMCI_STORM_NONE;
}
+static void cmci_toggle_interrupt_mode(bool on)
+{
+ unsigned long flags, *owned;
+ int bank;
+ u64 val;
+
+ raw_spin_lock_irqsave(&cmci_discover_lock, flags);
+ owned = this_cpu_ptr(mce_banks_owned);
+ for_each_set_bit(bank, owned, MAX_NR_BANKS) {
+ rdmsrl(MSR_IA32_MCx_CTL2(bank), val);
+
+ if (on)
+ val |= MCI_CTL2_CMCI_EN;
+ else
+ val &= ~MCI_CTL2_CMCI_EN;
+
+ wrmsrl(MSR_IA32_MCx_CTL2(bank), val);
+ }
+ raw_spin_unlock_irqrestore(&cmci_discover_lock, flags);
+}
+
unsigned long cmci_intel_adjust_timer(unsigned long interval)
{
if ((this_cpu_read(cmci_backoff_cnt) > 0) &&
@@ -175,7 +196,7 @@ unsigned long cmci_intel_adjust_timer(unsigned long interval)
*/
if (!atomic_read(&cmci_storm_on_cpus)) {
__this_cpu_write(cmci_storm_state, CMCI_STORM_NONE);
- cmci_reenable();
+ cmci_toggle_interrupt_mode(true);
cmci_recheck();
}
return CMCI_POLL_INTERVAL;
@@ -186,22 +207,6 @@ unsigned long cmci_intel_adjust_timer(unsigned long interval)
}
}
-static void cmci_storm_disable_banks(void)
-{
- unsigned long flags, *owned;
- int bank;
- u64 val;
-
- raw_spin_lock_irqsave(&cmci_discover_lock, flags);
- owned = this_cpu_ptr(mce_banks_owned);
- for_each_set_bit(bank, owned, MAX_NR_BANKS) {
- rdmsrl(MSR_IA32_MCx_CTL2(bank), val);
- val &= ~MCI_CTL2_CMCI_EN;
- wrmsrl(MSR_IA32_MCx_CTL2(bank), val);
- }
- raw_spin_unlock_irqrestore(&cmci_discover_lock, flags);
-}
-
static bool cmci_storm_detect(void)
{
unsigned int cnt = __this_cpu_read(cmci_storm_cnt);
@@ -223,7 +228,7 @@ static bool cmci_storm_detect(void)
if (cnt <= CMCI_STORM_THRESHOLD)
return false;
- cmci_storm_disable_banks();
+ cmci_toggle_interrupt_mode(false);
__this_cpu_write(cmci_storm_state, CMCI_STORM_ACTIVE);
r = atomic_add_return(1, &cmci_storm_on_cpus);
mce_timer_kick(CMCI_STORM_INTERVAL);
--
2.5.0.rc2.28.g6003e7f
--
Regards/Gruss,
Boris.
ECO tip #101: Trim your mails when you reply.
--
^ permalink raw reply related [flat|nested] 3+ messages in thread
* Re: [PATCH v2] x86/mce: fix failed to reenable cmci when swiching to interrupt mode
2015-08-12 9:54 ` Borislav Petkov
@ 2015-08-12 11:52 ` Xie XiuQi
0 siblings, 0 replies; 3+ messages in thread
From: Xie XiuQi @ 2015-08-12 11:52 UTC (permalink / raw)
To: Borislav Petkov
Cc: tony.luck, tglx, mingo, hpa, x86, linux-edac, linux-kernel,
zhangliguang, rui.xiang, huawei.libin
On 2015/8/12 17:54, Borislav Petkov wrote:
> On Wed, Aug 12, 2015 at 10:51:11AM +0800, Xie XiuQi wrote:
>> Zhang Liguang report a bug as bellow:
>> 1) system detected cmci storm on current cpu
>> 2) disable cmci interrupt on banks ownd by current cpu, then swiching to poll mode
>> 3) a few minites later, system swiching to interrupt mode on current cpu
>> 4) we expect system to reenable cmci interrupt on banks ownd by current cpu
>> mce_intel_adjust_timer
>> |-> cmci_reenable
>> |-> cmci_discover # but, ownd banks is ignore here
>>
>>> static void cmci_discover(int banks)
>>> ...
>>> for (i = 0; i < banks; i++) {
>>> ...
>>> if (test_bit(i, owned)) # ownd banks is ignore here
>>> continue;
>>
>> In this patch, we add a func cmci_storm_set_cmci(), just to enable or
>
> Yeah, that's too many "cmci"'s in the name. Here's what I committed:
It looks much better than mine.
Thanks.
>
> ---
> From: Xie XiuQi <xiexiuqi@huawei.com>
> Date: Wed, 12 Aug 2015 10:51:11 +0800
> Subject: [PATCH] x86/mce: Reenable CMCI banks when swiching back to interrupt mode
>
> Zhang Liguang reported the following issue:
>
> 1) System detects a CMCI storm on the current CPU.
>
> 2) Kernel disables the CMCI interrupt on banks owned by the current CPU and
> switches to poll mode
>
> 3) After the CMCI storm subsides, kernel switches back to interrupt mode
>
> 4) We expect the system to reenable the CMCI interrupt on banks owned by
> the current CPU
>
> mce_intel_adjust_timer
> |-> cmci_reenable
> |-> cmci_discover # owned banks are ignored here
>
> static void cmci_discover(int banks)
> ...
> for (i = 0; i < banks; i++) {
> ...
> if (test_bit(i, owned)) # ownd banks is ignore here
> continue;
>
> So convert cmci_storm_disable_banks() to cmci_toggle_interrupt_mode()
> which controls whether to enable or disable CMCI interrupts with its
> argument.
>
> NB: We cannot clear the owned bit because the banks won't be polled,
> otherwise. See
>
> 27f6c573e0f7 ("x86, CMCI: Add proper detection of end of CMCI storms")
>
> for more info.
>
> Reported-by: Zhang Liguang <zhangliguang@huawei.com>
> Signed-off-by: Xie XiuQi <xiexiuqi@huawei.com>
> Cc: <stable@vger.kernel.org> # v3.15+
> Cc: "H. Peter Anvin" <hpa@zytor.com>
> Cc: huawei.libin@huawei.com
> Cc: Ingo Molnar <mingo@redhat.com>
> Cc: linux-edac <linux-edac@vger.kernel.org>
> Cc: rui.xiang@huawei.com
> Cc: Thomas Gleixner <tglx@linutronix.de>
> Cc: Tony Luck <tony.luck@intel.com>
> Cc: x86-ml <x86@kernel.org>
> Link: http://lkml.kernel.org/r/1439347871-2702-1-git-send-email-xiexiuqi@huawei.com
> Signed-off-by: Borislav Petkov <bp@suse.de>
> ---
> arch/x86/kernel/cpu/mcheck/mce_intel.c | 41 +++++++++++++++++++---------------
> 1 file changed, 23 insertions(+), 18 deletions(-)
>
> diff --git a/arch/x86/kernel/cpu/mcheck/mce_intel.c b/arch/x86/kernel/cpu/mcheck/mce_intel.c
> index c5c003291861..1e8bb6c94f14 100644
> --- a/arch/x86/kernel/cpu/mcheck/mce_intel.c
> +++ b/arch/x86/kernel/cpu/mcheck/mce_intel.c
> @@ -146,6 +146,27 @@ void mce_intel_hcpu_update(unsigned long cpu)
> per_cpu(cmci_storm_state, cpu) = CMCI_STORM_NONE;
> }
>
> +static void cmci_toggle_interrupt_mode(bool on)
> +{
> + unsigned long flags, *owned;
> + int bank;
> + u64 val;
> +
> + raw_spin_lock_irqsave(&cmci_discover_lock, flags);
> + owned = this_cpu_ptr(mce_banks_owned);
> + for_each_set_bit(bank, owned, MAX_NR_BANKS) {
> + rdmsrl(MSR_IA32_MCx_CTL2(bank), val);
> +
> + if (on)
> + val |= MCI_CTL2_CMCI_EN;
> + else
> + val &= ~MCI_CTL2_CMCI_EN;
> +
> + wrmsrl(MSR_IA32_MCx_CTL2(bank), val);
> + }
> + raw_spin_unlock_irqrestore(&cmci_discover_lock, flags);
> +}
> +
> unsigned long cmci_intel_adjust_timer(unsigned long interval)
> {
> if ((this_cpu_read(cmci_backoff_cnt) > 0) &&
> @@ -175,7 +196,7 @@ unsigned long cmci_intel_adjust_timer(unsigned long interval)
> */
> if (!atomic_read(&cmci_storm_on_cpus)) {
> __this_cpu_write(cmci_storm_state, CMCI_STORM_NONE);
> - cmci_reenable();
> + cmci_toggle_interrupt_mode(true);
> cmci_recheck();
> }
> return CMCI_POLL_INTERVAL;
> @@ -186,22 +207,6 @@ unsigned long cmci_intel_adjust_timer(unsigned long interval)
> }
> }
>
> -static void cmci_storm_disable_banks(void)
> -{
> - unsigned long flags, *owned;
> - int bank;
> - u64 val;
> -
> - raw_spin_lock_irqsave(&cmci_discover_lock, flags);
> - owned = this_cpu_ptr(mce_banks_owned);
> - for_each_set_bit(bank, owned, MAX_NR_BANKS) {
> - rdmsrl(MSR_IA32_MCx_CTL2(bank), val);
> - val &= ~MCI_CTL2_CMCI_EN;
> - wrmsrl(MSR_IA32_MCx_CTL2(bank), val);
> - }
> - raw_spin_unlock_irqrestore(&cmci_discover_lock, flags);
> -}
> -
> static bool cmci_storm_detect(void)
> {
> unsigned int cnt = __this_cpu_read(cmci_storm_cnt);
> @@ -223,7 +228,7 @@ static bool cmci_storm_detect(void)
> if (cnt <= CMCI_STORM_THRESHOLD)
> return false;
>
> - cmci_storm_disable_banks();
> + cmci_toggle_interrupt_mode(false);
> __this_cpu_write(cmci_storm_state, CMCI_STORM_ACTIVE);
> r = atomic_add_return(1, &cmci_storm_on_cpus);
> mce_timer_kick(CMCI_STORM_INTERVAL);
>
^ permalink raw reply [flat|nested] 3+ messages in thread
end of thread, other threads:[~2015-08-12 11:52 UTC | newest]
Thread overview: 3+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2015-08-12 2:51 [PATCH v2] x86/mce: fix failed to reenable cmci when swiching to interrupt mode Xie XiuQi
2015-08-12 9:54 ` Borislav Petkov
2015-08-12 11:52 ` Xie XiuQi
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox