public inbox for linux-kernel@vger.kernel.org
 help / color / mirror / Atom feed
* [PATCH] kmemcheck: SMP support
@ 2008-05-23 14:17 Vegard Nossum
  2008-05-23 15:06 ` Ingo Molnar
                   ` (3 more replies)
  0 siblings, 4 replies; 15+ messages in thread
From: Vegard Nossum @ 2008-05-23 14:17 UTC (permalink / raw)
  To: Ingo Molnar, Pekka Enberg; +Cc: linux-kernel

Hi,

This works on real hw, but not on qemu. It seems to get stuck waiting for one
of the atomic values to change. Don't know why yet, it might just be yet
another bug in qemu... (we've hit at least two of them so far. And they were
real bugs too.)

But do you think this approach is feasible? It will kill interactivity,
that's for sure, though only when kmemcheck is run-time enabled and number
of CPUs > 1 (the more the worse).


Vegard


From: Vegard Nossum <vegard.nossum@gmail.com>
Date: Fri, 23 May 2008 15:53:03 +0200
Subject: [PATCH] kmemcheck: SMP support

This patch adds SMP support to kmemcheck, that is, the ability to boot
more than one processor even when kmemcheck is enabled. (Previously,
only one CPU would be booted even if more were present in the system.)

On page fault, kmemcheck needs to pause all the other CPUs in the system
in order to guarantee that no other CPU will modify the same memory
location (which will otherwise be unprotected after we set the present
bit of the PTE).

Since the page fault can be taken with any irq state (i.e. enabled or
disabled), we can't send a normal IPI broadcast since this can deadlock.

Instead, we send an NMI. This is guaranteed to be received _except_ if
the processor is already inside the NMI handler.

This is of course not very efficient, and booting with maxcpus=1 is
recommended, however, this allows the kernel to be configured with
CONFIG_KMEMCHECK=y with close to zero overhead when kmemcheck is
disabled. (It can still be runtime-enabled at any time, though.)

The patch has been tested on real hardware.

Signed-off-by: Vegard Nossum <vegard.nossum@gmail.com>
---
 arch/x86/kernel/kmemcheck.c |  108 +++++++++++++++++++++++++++++++++++++++----
 1 files changed, 98 insertions(+), 10 deletions(-)

diff --git a/arch/x86/kernel/kmemcheck.c b/arch/x86/kernel/kmemcheck.c
index c0045e8..fdf8acb 100644
--- a/arch/x86/kernel/kmemcheck.c
+++ b/arch/x86/kernel/kmemcheck.c
@@ -16,6 +16,7 @@
 #include <linux/kmemcheck.h>
 #include <linux/mm.h>
 #include <linux/module.h>
+#include <linux/notifier.h>
 #include <linux/page-flags.h>
 #include <linux/percpu.h>
 #include <linux/stacktrace.h>
@@ -23,9 +24,12 @@
 #include <asm/cacheflush.h>
 #include <asm/kmemcheck.h>
 #include <asm/pgtable.h>
+#include <asm/smp.h>
 #include <asm/string.h>
 #include <asm/tlbflush.h>
 
+#include <mach_ipi.h>
+
 enum shadow {
 	SHADOW_UNALLOCATED,
 	SHADOW_UNINITIALIZED,
@@ -240,18 +244,91 @@ static void do_wakeup(unsigned long data)
 	}
 }
 
+static atomic_t nmi_wait;
+static atomic_t nmi_resume;
+static atomic_t started;
+static atomic_t finished;
+
+static int nmi_notifier(struct notifier_block *self,
+	unsigned long val, void *data)
+{
+	struct die_args *args = (struct die_args *) data;
+
+	if (val != DIE_NMI_IPI || !atomic_read(&nmi_wait))
+		return NOTIFY_DONE;
+
+	atomic_inc(&started);
+
+	/* Pause until the fault has been handled */
+	while (!atomic_read(&nmi_resume))
+		cpu_relax();
+
+	atomic_inc(&finished);
+
+	return NOTIFY_STOP;
+}
+
+static void
+pause_allbutself(void)
+{
+#ifdef CONFIG_SMP
+	static spinlock_t nmi_spinlock;
+
+	int cpus;
+	cpumask_t mask = cpu_online_map;
+
+	spin_lock(&nmi_spinlock);
+
+	cpus = num_online_cpus() - 1;
+
+	atomic_set(&started, 0);
+	atomic_set(&finished, 0);
+	atomic_set(&nmi_wait, 1);
+	atomic_set(&nmi_resume, 0);
+
+	cpu_clear(safe_smp_processor_id(), mask);
+	if (!cpus_empty(mask))
+		send_IPI_mask(mask, NMI_VECTOR);
+
+	while (atomic_read(&started) != cpus)
+		cpu_relax();
+
+	atomic_set(&nmi_wait, 0);
+
+	spin_unlock(&nmi_spinlock);
+#endif
+}
+
+static void
+resume(void)
+{
+#ifdef CONFIG_SMP
+	int cpus;
+
+	cpus = num_online_cpus() - 1;
+
+	atomic_set(&nmi_resume, 1);
+
+	while (atomic_read(&finished) != cpus)
+		cpu_relax();
+#endif
+}
+
+#ifdef CONFIG_SMP
+static struct notifier_block nmi_nb = {
+	.notifier_call = &nmi_notifier,
+};
+#endif
+
 void __init kmemcheck_init(void)
 {
+	int err;
+
 	printk(KERN_INFO "kmemcheck: \"Bugs, beware!\"\n");
 
 #ifdef CONFIG_SMP
-	/* Limit SMP to use a single CPU. We rely on the fact that this code
-	 * runs before SMP is set up. */
-	if (setup_max_cpus > 1) {
-		printk(KERN_INFO
-			"kmemcheck: Limiting number of CPUs to 1.\n");
-		setup_max_cpus = 1;
-	}
+	err = register_die_notifier(&nmi_nb);
+	BUG_ON(err);
 #endif
 }
 
@@ -376,11 +453,14 @@ void kmemcheck_show(struct pt_regs *regs)
 
 	BUG_ON(!irqs_disabled());
 
+	pause_allbutself();
+
 	if (unlikely(data->balance != 0)) {
 		show_addr(data->addr1);
 		show_addr(data->addr2);
 		error_save_bug(regs);
 		data->balance = 0;
+		resume();
 		return;
 	}
 
@@ -390,8 +470,10 @@ void kmemcheck_show(struct pt_regs *regs)
 
 	/* None of the addresses actually belonged to kmemcheck. Note that
 	 * this is not an error. */
-	if (n == 0)
+	if (n == 0) {
+		resume();
 		return;
+	}
 
 	++data->balance;
 
@@ -421,8 +503,10 @@ void kmemcheck_hide(struct pt_regs *regs)
 
 	BUG_ON(!irqs_disabled());
 
-	if (data->balance == 0)
+	if (data->balance == 0) {
+		resume();
 		return;
+	}
 
 	if (unlikely(data->balance != 1)) {
 		show_addr(data->addr1);
@@ -436,6 +520,7 @@ void kmemcheck_hide(struct pt_regs *regs)
 			regs->flags &= ~X86_EFLAGS_TF;
 		if (data->flags & X86_EFLAGS_IF)
 			regs->flags |= X86_EFLAGS_IF;
+		resume();
 		return;
 	}
 
@@ -448,8 +533,10 @@ void kmemcheck_hide(struct pt_regs *regs)
 		n += show_addr(data->addr2);
 	}
 
-	if (n == 0)
+	if (n == 0) {
+		resume();
 		return;
+	}
 
 	--data->balance;
 
@@ -460,6 +547,7 @@ void kmemcheck_hide(struct pt_regs *regs)
 		regs->flags &= ~X86_EFLAGS_TF;
 	if (data->flags & X86_EFLAGS_IF)
 		regs->flags |= X86_EFLAGS_IF;
+	resume();
 }
 
 void kmemcheck_show_pages(struct page *p, unsigned int n)
-- 
1.5.4.1


^ permalink raw reply related	[flat|nested] 15+ messages in thread

* Re: [PATCH] kmemcheck: SMP support
  2008-05-23 14:17 [PATCH] kmemcheck: SMP support Vegard Nossum
@ 2008-05-23 15:06 ` Ingo Molnar
  2008-05-23 15:30   ` Vegard Nossum
  2008-05-23 15:40 ` Jeremy Fitzhardinge
                   ` (2 subsequent siblings)
  3 siblings, 1 reply; 15+ messages in thread
From: Ingo Molnar @ 2008-05-23 15:06 UTC (permalink / raw)
  To: Vegard Nossum; +Cc: Pekka Enberg, linux-kernel


* Vegard Nossum <vegard.nossum@gmail.com> wrote:

> This works on real hw, but not on qemu. It seems to get stuck waiting 
> for one of the atomic values to change. Don't know why yet, it might 
> just be yet another bug in qemu... (we've hit at least two of them so 
> far. And they were real bugs too.)
> 
> But do you think this approach is feasible? It will kill 
> interactivity, that's for sure, though only when kmemcheck is run-time 
> enabled and number of CPUs > 1 (the more the worse).

i think we definitely want it - if for no other purpose but for people 
to become disgusted at the overhead and fixing it ;-)

Vegard, wanna have a look at introducing per CPU kernel pagetables? I 
tried that once in the past and it wasnt too horrible. (the patches are 
gone though) We could do it before bringing other CPUs online, i.e. much 
of the really yucky boot time pagetable juggling phase would be over 
already. Hm?

	Ingo

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH] kmemcheck: SMP support
  2008-05-23 15:06 ` Ingo Molnar
@ 2008-05-23 15:30   ` Vegard Nossum
  2008-05-23 16:13     ` Jeremy Fitzhardinge
                       ` (2 more replies)
  0 siblings, 3 replies; 15+ messages in thread
From: Vegard Nossum @ 2008-05-23 15:30 UTC (permalink / raw)
  To: Ingo Molnar; +Cc: Pekka Enberg, linux-kernel, Pekka Paalanen

On Fri, May 23, 2008 at 5:06 PM, Ingo Molnar <mingo@elte.hu> wrote:
>
> Vegard, wanna have a look at introducing per CPU kernel pagetables? I
> tried that once in the past and it wasnt too horrible. (the patches are
> gone though) We could do it before bringing other CPUs online, i.e. much
> of the really yucky boot time pagetable juggling phase would be over
> already. Hm?

Ingo.

It really doesn't matter how easy it was for you.

You're one of the x86 maintainers.

And I think you're forgetting how hard these things are for a newbie.
I don't even know which one comes first of pmds and puds.

Per-cpu page tables sounds about on the same scale of as, say,
rewriting the VM or some other major subsystem. Epic!


I'm glad to hear from you, though.

Pekka suggested that per-cpu page tables might help NUMA systems too.
Does that sound right to you? Would anybody else benefit from having
per-CPU page tables? If not, I have a hard time believing it will ever
get merged.

(Oh. mmio-trace might. But that's also a hacking tool and doesn't really count.)


Vegard

-- 
"The animistic metaphor of the bug that maliciously sneaked in while
the programmer was not looking is intellectually dishonest as it
disguises that the error is the programmer's own creation."
	-- E. W. Dijkstra, EWD1036

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH] kmemcheck: SMP support
  2008-05-23 14:17 [PATCH] kmemcheck: SMP support Vegard Nossum
  2008-05-23 15:06 ` Ingo Molnar
@ 2008-05-23 15:40 ` Jeremy Fitzhardinge
  2008-05-23 15:51   ` Vegard Nossum
  2008-05-23 16:09 ` Johannes Weiner
       [not found] ` <19f34abd0805230719j1ce0e2eje6da7c1f963fdf75@mail.gmail.com>
  3 siblings, 1 reply; 15+ messages in thread
From: Jeremy Fitzhardinge @ 2008-05-23 15:40 UTC (permalink / raw)
  To: Vegard Nossum; +Cc: Ingo Molnar, Pekka Enberg, linux-kernel

Vegard Nossum wrote:
> This works on real hw, but not on qemu. It seems to get stuck waiting for one
> of the atomic values to change. Don't know why yet, it might just be yet
> another bug in qemu... (we've hit at least two of them so far. And they were
> real bugs too.)
>   

I've noticed that qemu mis-reports the eip of cmpxchg if it faults (it 
reports the eip of the start of the basic block, I think).  Does that 
match what you're seeing?

    J

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH] kmemcheck: SMP support
  2008-05-23 15:40 ` Jeremy Fitzhardinge
@ 2008-05-23 15:51   ` Vegard Nossum
  2008-05-23 17:12     ` Jan Kiszka
  0 siblings, 1 reply; 15+ messages in thread
From: Vegard Nossum @ 2008-05-23 15:51 UTC (permalink / raw)
  To: Jeremy Fitzhardinge; +Cc: Ingo Molnar, Pekka Enberg, linux-kernel

On Fri, May 23, 2008 at 5:40 PM, Jeremy Fitzhardinge <jeremy@goop.org> wrote:
> Vegard Nossum wrote:
>>
>> This works on real hw, but not on qemu. It seems to get stuck waiting for
>> one
>> of the atomic values to change. Don't know why yet, it might just be yet
>> another bug in qemu... (we've hit at least two of them so far. And they
>> were
>> real bugs too.)
>>
>
> I've noticed that qemu mis-reports the eip of cmpxchg if it faults (it
> reports the eip of the start of the basic block, I think).  Does that match
> what you're seeing?

You mean the EIP that gets pushed on the stack for the page fault?
(That would be bad news for kmemcheck. I suppose the rest of the
kernel never page faults on cmpxchg addresses?)

Or do you mean the EIP that shows up in gdb?

But no, it seems to be unrelated. What I hit so far were (in 0.9.0):

1. qemu doesn't set the single-stepping flag of DR6 on single-step
debug exceptions.
2. qemu triggers int 0 (divide error) instead of int 2 on NMI IPIs.

But both of these were fixed in the latest 0.9.1.

I don't yet know if what I'm hitting now is really an error with qemu.
But I usually trust the real hardware more :-)


Vegard

-- 
"The animistic metaphor of the bug that maliciously sneaked in while
the programmer was not looking is intellectually dishonest as it
disguises that the error is the programmer's own creation."
	-- E. W. Dijkstra, EWD1036

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH] kmemcheck: SMP support
  2008-05-23 14:17 [PATCH] kmemcheck: SMP support Vegard Nossum
  2008-05-23 15:06 ` Ingo Molnar
  2008-05-23 15:40 ` Jeremy Fitzhardinge
@ 2008-05-23 16:09 ` Johannes Weiner
  2008-05-23 17:10   ` Vegard Nossum
       [not found] ` <19f34abd0805230719j1ce0e2eje6da7c1f963fdf75@mail.gmail.com>
  3 siblings, 1 reply; 15+ messages in thread
From: Johannes Weiner @ 2008-05-23 16:09 UTC (permalink / raw)
  To: Vegard Nossum; +Cc: Ingo Molnar, Pekka Enberg, linux-kernel

Hi Vegard,

Vegard Nossum <vegard.nossum@gmail.com> writes:

> From: Vegard Nossum <vegard.nossum@gmail.com>
> Date: Fri, 23 May 2008 15:53:03 +0200
> Subject: [PATCH] kmemcheck: SMP support
>
> This patch adds SMP support to kmemcheck, that is, the ability to boot
> more than one processor even when kmemcheck is enabled. (Previously,
> only one CPU would be booted even if more were present in the system.)
>
> On page fault, kmemcheck needs to pause all the other CPUs in the system
> in order to guarantee that no other CPU will modify the same memory
> location (which will otherwise be unprotected after we set the present
> bit of the PTE).
>
> Since the page fault can be taken with any irq state (i.e. enabled or
> disabled), we can't send a normal IPI broadcast since this can deadlock.
>
> Instead, we send an NMI. This is guaranteed to be received _except_ if
> the processor is already inside the NMI handler.
>
> This is of course not very efficient, and booting with maxcpus=1 is
> recommended, however, this allows the kernel to be configured with
> CONFIG_KMEMCHECK=y with close to zero overhead when kmemcheck is
> disabled. (It can still be runtime-enabled at any time, though.)
>
> The patch has been tested on real hardware.
>
> Signed-off-by: Vegard Nossum <vegard.nossum@gmail.com>
> ---
>  arch/x86/kernel/kmemcheck.c |  108 +++++++++++++++++++++++++++++++++++++++----
>  1 files changed, 98 insertions(+), 10 deletions(-)
>
> diff --git a/arch/x86/kernel/kmemcheck.c b/arch/x86/kernel/kmemcheck.c
> index c0045e8..fdf8acb 100644
> --- a/arch/x86/kernel/kmemcheck.c
> +++ b/arch/x86/kernel/kmemcheck.c
> @@ -16,6 +16,7 @@
>  #include <linux/kmemcheck.h>
>  #include <linux/mm.h>
>  #include <linux/module.h>
> +#include <linux/notifier.h>
>  #include <linux/page-flags.h>
>  #include <linux/percpu.h>
>  #include <linux/stacktrace.h>
> @@ -23,9 +24,12 @@
>  #include <asm/cacheflush.h>
>  #include <asm/kmemcheck.h>
>  #include <asm/pgtable.h>
> +#include <asm/smp.h>
>  #include <asm/string.h>
>  #include <asm/tlbflush.h>
>  
> +#include <mach_ipi.h>
> +
>  enum shadow {
>  	SHADOW_UNALLOCATED,
>  	SHADOW_UNINITIALIZED,
> @@ -240,18 +244,91 @@ static void do_wakeup(unsigned long data)
>  	}
>  }
>  
> +static atomic_t nmi_wait;
> +static atomic_t nmi_resume;
> +static atomic_t started;
> +static atomic_t finished;
> +
> +static int nmi_notifier(struct notifier_block *self,
> +	unsigned long val, void *data)
> +{
> +	struct die_args *args = (struct die_args *) data;
> +
> +	if (val != DIE_NMI_IPI || !atomic_read(&nmi_wait))
> +		return NOTIFY_DONE;
> +
> +	atomic_inc(&started);
> +
> +	/* Pause until the fault has been handled */
> +	while (!atomic_read(&nmi_resume))
> +		cpu_relax();
> +
> +	atomic_inc(&finished);
> +
> +	return NOTIFY_STOP;
> +}
> +
> +static void
> +pause_allbutself(void)
> +{
> +#ifdef CONFIG_SMP
> +	static spinlock_t nmi_spinlock;
> +
> +	int cpus;
> +	cpumask_t mask = cpu_online_map;
> +
> +	spin_lock(&nmi_spinlock);
> +
> +	cpus = num_online_cpus() - 1;
> +
> +	atomic_set(&started, 0);
> +	atomic_set(&finished, 0);
> +	atomic_set(&nmi_wait, 1);
> +	atomic_set(&nmi_resume, 0);
> +
> +	cpu_clear(safe_smp_processor_id(), mask);
> +	if (!cpus_empty(mask))
> +		send_IPI_mask(mask, NMI_VECTOR);
> +
> +	while (atomic_read(&started) != cpus)
> +		cpu_relax();
> +
> +	atomic_set(&nmi_wait, 0);
> +
> +	spin_unlock(&nmi_spinlock);
> +#endif
> +}
> +
> +static void
> +resume(void)
> +{
> +#ifdef CONFIG_SMP
> +	int cpus;
> +
> +	cpus = num_online_cpus() - 1;
> +
> +	atomic_set(&nmi_resume, 1);
> +
> +	while (atomic_read(&finished) != cpus)
> +		cpu_relax();
> +#endif
> +}

How about merging finished and started into one?  I.e. `paused'.

The notifiers increases `paused' before the waiting-loop and decreases
it again afterwards.

pause_allbutself() sends the IPIs and waits until `paused' reached the
number of CPUS.

resume() justs waits until `paused' reaches zero.

Would this work?  Will the NMI handler finish even when the CPU is
removed while the handler runs?

	Hannes

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH] kmemcheck: SMP support
  2008-05-23 15:30   ` Vegard Nossum
@ 2008-05-23 16:13     ` Jeremy Fitzhardinge
  2008-05-26  9:11     ` Ingo Molnar
  2008-05-26  9:29     ` Avi Kivity
  2 siblings, 0 replies; 15+ messages in thread
From: Jeremy Fitzhardinge @ 2008-05-23 16:13 UTC (permalink / raw)
  To: Vegard Nossum; +Cc: Ingo Molnar, Pekka Enberg, linux-kernel, Pekka Paalanen

Vegard Nossum wrote:
> On Fri, May 23, 2008 at 5:06 PM, Ingo Molnar <mingo@elte.hu> wrote:
>   
>> Vegard, wanna have a look at introducing per CPU kernel pagetables? I
>> tried that once in the past and it wasnt too horrible. (the patches are
>> gone though) We could do it before bringing other CPUs online, i.e. much
>> of the really yucky boot time pagetable juggling phase would be over
>> already. Hm?
>>     
>
> Ingo.
>
> It really doesn't matter how easy it was for you.
>
> You're one of the x86 maintainers.
>
> And I think you're forgetting how hard these things are for a newbie.
> I don't even know which one comes first of pmds and puds.
>
> Per-cpu page tables sounds about on the same scale of as, say,
> rewriting the VM or some other major subsystem. Epic!
>   

No, I don't think it would really be all that bad, if you just make the 
kernel parts of the pagetable percpu;  userspace might be a bit 
trickier.  But kernel mappings change sufficiently rarely that keeping 
them all in sync isn't a huge problem.

If your requirement is that you want to be able to set page permissions 
on kernel mappings on a per-cpu basis, then it might be easiest to do it 
on-demand.  Start off with a single shared pagetable, and when you want 
to make a per-cpu page protection change, clone the pagetable from the 
root down to the page you're changing and then do your update.

There's certainly enough hooking places in which you could implement it 
without much effect on the core kernel code.

> I'm glad to hear from you, though.
>
> Pekka suggested that per-cpu page tables might help NUMA systems too.
> Does that sound right to you? Would anybody else benefit from having
> per-CPU page tables? If not, I have a hard time believing it will ever
> get merged.
>
> (Oh. mmio-trace might. But that's also a hacking tool and doesn't really count.)
>   

I'd need to think about it a bit, but its possible that 64-bit Xen might 
be able to take advantage of it.  Yeah, it could be useful.

    J

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH] kmemcheck: SMP support
  2008-05-23 16:09 ` Johannes Weiner
@ 2008-05-23 17:10   ` Vegard Nossum
  0 siblings, 0 replies; 15+ messages in thread
From: Vegard Nossum @ 2008-05-23 17:10 UTC (permalink / raw)
  To: Johannes Weiner; +Cc: Ingo Molnar, Pekka Enberg, linux-kernel

On Fri, May 23, 2008 at 6:09 PM, Johannes Weiner <hannes@saeurebad.de> wrote:
> Hi Vegard,
>
> Vegard Nossum <vegard.nossum@gmail.com> writes:
>
>> +static void
>> +resume(void)
>> +{
>> +#ifdef CONFIG_SMP
>> +     int cpus;
>> +
>> +     cpus = num_online_cpus() - 1;
>> +
>> +     atomic_set(&nmi_resume, 1);
>> +
>> +     while (atomic_read(&finished) != cpus)
>> +             cpu_relax();
>> +#endif
>> +}
>
> How about merging finished and started into one?  I.e. `paused'.
>
> The notifiers increases `paused' before the waiting-loop and decreases
> it again afterwards.
>
> pause_allbutself() sends the IPIs and waits until `paused' reached the
> number of CPUS.
>
> resume() justs waits until `paused' reaches zero.
>
> Would this work?  Will the NMI handler finish even when the CPU is
> removed while the handler runs?

Yup, that works perfectly. No CPU will be removed or added, since the
IPI is broadcast to all CPUs and the only CPU running anything besides
the NMI handler itself is waiting for the other CPUs... (At least I
think this is the case. It won't change the way the code currently
works I think...)

Thanks for the tip! :-)

Vegard

-- 
"The animistic metaphor of the bug that maliciously sneaked in while
the programmer was not looking is intellectually dishonest as it
disguises that the error is the programmer's own creation."
	-- E. W. Dijkstra, EWD1036

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH] kmemcheck: SMP support
  2008-05-23 15:51   ` Vegard Nossum
@ 2008-05-23 17:12     ` Jan Kiszka
  2008-05-23 17:32       ` Vegard Nossum
  0 siblings, 1 reply; 15+ messages in thread
From: Jan Kiszka @ 2008-05-23 17:12 UTC (permalink / raw)
  To: Vegard Nossum
  Cc: Jeremy Fitzhardinge, Ingo Molnar, Pekka Enberg, linux-kernel

[-- Attachment #1: Type: text/plain, Size: 1996 bytes --]

Vegard Nossum wrote:
> On Fri, May 23, 2008 at 5:40 PM, Jeremy Fitzhardinge <jeremy@goop.org> wrote:
>> Vegard Nossum wrote:
>>> This works on real hw, but not on qemu. It seems to get stuck waiting for
>>> one
>>> of the atomic values to change. Don't know why yet, it might just be yet
>>> another bug in qemu... (we've hit at least two of them so far. And they
>>> were
>>> real bugs too.)
>>>
>> I've noticed that qemu mis-reports the eip of cmpxchg if it faults (it
>> reports the eip of the start of the basic block, I think).  Does that match
>> what you're seeing?
> 
> You mean the EIP that gets pushed on the stack for the page fault?
> (That would be bad news for kmemcheck. I suppose the rest of the
> kernel never page faults on cmpxchg addresses?)
> 
> Or do you mean the EIP that shows up in gdb?
> 
> But no, it seems to be unrelated. What I hit so far were (in 0.9.0):
> 
> 1. qemu doesn't set the single-stepping flag of DR6 on single-step
> debug exceptions.
> 2. qemu triggers int 0 (divide error) instead of int 2 on NMI IPIs.
> 
> But both of these were fixed in the latest 0.9.1.

I guess you mean trunk - NMI IPIs didn't came with "old" 0.9.1.

> 
> I don't yet know if what I'm hitting now is really an error with qemu.
> But I usually trust the real hardware more :-)

Try KVM as well. It is, of course, must faster than QEMU, and it comes
with true SMP (given you have a SMP host). With in-kernel irqchip
(that's default), KVM now also supports NMI IPIs. And debug registers
should be fine with my latest patch.

I'm currently trying to get debug support straight for upstream KVM and,
where also required, QEMU. SMP debugging is a common issue, but already
usable with KVM. So testers are welcome, an overview on required patches
can be provided.

Jan

PS: Some cmpxchg changes where committed to qemu-trunk recently, maybe
only related to the restructuring of the code generator, but maybe also
fixing an older bug.


[-- Attachment #2: OpenPGP digital signature --]
[-- Type: application/pgp-signature, Size: 254 bytes --]

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH] kmemcheck: SMP support
  2008-05-23 17:12     ` Jan Kiszka
@ 2008-05-23 17:32       ` Vegard Nossum
  2008-05-23 17:54         ` Jan Kiszka
  2008-05-23 20:54         ` Jeremy Fitzhardinge
  0 siblings, 2 replies; 15+ messages in thread
From: Vegard Nossum @ 2008-05-23 17:32 UTC (permalink / raw)
  To: Jan Kiszka; +Cc: Jeremy Fitzhardinge, Ingo Molnar, Pekka Enberg, linux-kernel

On Fri, May 23, 2008 at 7:12 PM, Jan Kiszka <jan.kiszka@web.de> wrote:
> Vegard Nossum wrote:
>> On Fri, May 23, 2008 at 5:40 PM, Jeremy Fitzhardinge <jeremy@goop.org> wrote:
>>> Vegard Nossum wrote:
>>>> This works on real hw, but not on qemu. It seems to get stuck waiting for
>>>> one
>>>> of the atomic values to change. Don't know why yet, it might just be yet
>>>> another bug in qemu... (we've hit at least two of them so far. And they
>>>> were
>>>> real bugs too.)
>>>>
>>> I've noticed that qemu mis-reports the eip of cmpxchg if it faults (it
>>> reports the eip of the start of the basic block, I think).  Does that match
>>> what you're seeing?
>>
>> You mean the EIP that gets pushed on the stack for the page fault?
>> (That would be bad news for kmemcheck. I suppose the rest of the
>> kernel never page faults on cmpxchg addresses?)
>>
>> Or do you mean the EIP that shows up in gdb?
>>
>> But no, it seems to be unrelated. What I hit so far were (in 0.9.0):
>>
>> 1. qemu doesn't set the single-stepping flag of DR6 on single-step
>> debug exceptions.
>> 2. qemu triggers int 0 (divide error) instead of int 2 on NMI IPIs.
>>
>> But both of these were fixed in the latest 0.9.1.
>
> I guess you mean trunk - NMI IPIs didn't came with "old" 0.9.1.

Are you sure? It does in fact deliver the NMI IPI as far as I can see
and I am running from a qemu-0.9.1.tar.gz... E.g. for "-smp 3" on this
0.9.1 qemu:

(first number is smp_processor_id())

[0 pause all] <-- in page fault handler
[1 paused] <-- in nmi handler
[2 paused]
[0 resume all] <-- in debug exception handler
[2 resuming, paused = 1] <-- still in nmi handler, now exiting
[1 resuming, paused = 0]

But maybe I should try the trunk and see if that fixes the problem I was seeing!

>
>>
>> I don't yet know if what I'm hitting now is really an error with qemu.
>> But I usually trust the real hardware more :-)
>
> Try KVM as well. It is, of course, must faster than QEMU, and it comes
> with true SMP (given you have a SMP host). With in-kernel irqchip
> (that's default), KVM now also supports NMI IPIs. And debug registers
> should be fine with my latest patch.
>
> I'm currently trying to get debug support straight for upstream KVM and,
> where also required, QEMU. SMP debugging is a common issue, but already
> usable with KVM. So testers are welcome, an overview on required patches
> can be provided.
>

Hm. Doesn't KVM require special hardware? I have just a cheap laptop
(Pentium Dual-Core) and I doubt I will be able to run it... :-(

Thanks for the tip, though!


Vegard

-- 
"The animistic metaphor of the bug that maliciously sneaked in while
the programmer was not looking is intellectually dishonest as it
disguises that the error is the programmer's own creation."
	-- E. W. Dijkstra, EWD1036

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH] kmemcheck: SMP support
  2008-05-23 17:32       ` Vegard Nossum
@ 2008-05-23 17:54         ` Jan Kiszka
  2008-05-23 20:54         ` Jeremy Fitzhardinge
  1 sibling, 0 replies; 15+ messages in thread
From: Jan Kiszka @ 2008-05-23 17:54 UTC (permalink / raw)
  To: Vegard Nossum
  Cc: Jeremy Fitzhardinge, Ingo Molnar, Pekka Enberg, linux-kernel

[-- Attachment #1: Type: text/plain, Size: 3177 bytes --]

Vegard Nossum wrote:
> On Fri, May 23, 2008 at 7:12 PM, Jan Kiszka <jan.kiszka@web.de> wrote:
>> Vegard Nossum wrote:
>>> On Fri, May 23, 2008 at 5:40 PM, Jeremy Fitzhardinge <jeremy@goop.org> wrote:
>>>> Vegard Nossum wrote:
>>>>> This works on real hw, but not on qemu. It seems to get stuck waiting for
>>>>> one
>>>>> of the atomic values to change. Don't know why yet, it might just be yet
>>>>> another bug in qemu... (we've hit at least two of them so far. And they
>>>>> were
>>>>> real bugs too.)
>>>>>
>>>> I've noticed that qemu mis-reports the eip of cmpxchg if it faults (it
>>>> reports the eip of the start of the basic block, I think).  Does that match
>>>> what you're seeing?
>>> You mean the EIP that gets pushed on the stack for the page fault?
>>> (That would be bad news for kmemcheck. I suppose the rest of the
>>> kernel never page faults on cmpxchg addresses?)
>>>
>>> Or do you mean the EIP that shows up in gdb?
>>>
>>> But no, it seems to be unrelated. What I hit so far were (in 0.9.0):
>>>
>>> 1. qemu doesn't set the single-stepping flag of DR6 on single-step
>>> debug exceptions.
>>> 2. qemu triggers int 0 (divide error) instead of int 2 on NMI IPIs.
>>>
>>> But both of these were fixed in the latest 0.9.1.
>> I guess you mean trunk - NMI IPIs didn't came with "old" 0.9.1.
> 
> Are you sure? It does in fact deliver the NMI IPI as far as I can see
> and I am running from a qemu-0.9.1.tar.gz... E.g. for "-smp 3" on this
> 0.9.1 qemu:
> 
> (first number is smp_processor_id())
> 
> [0 pause all] <-- in page fault handler
> [1 paused] <-- in nmi handler
> [2 paused]
> [0 resume all] <-- in debug exception handler
> [2 resuming, paused = 1] <-- still in nmi handler, now exiting
> [1 resuming, paused = 0]

Revision 4205 (2008-04-13) introduced the NMI abstraction to QEMU, 4206
added NMI IPIs - while 0.9.1 was released in January. Find /me confused
about what triggers the handler.

> 
> But maybe I should try the trunk and see if that fixes the problem I was seeing!

And even if that hangs, either the internal gdbstub or an external gdb
(on the qemu process) may reveal where things got stuck. Keep in mind
that QEMU is fairly good in widening tiny race windows ;). But if it's
too obscure, just report to qemu-devel.

> 
>>> I don't yet know if what I'm hitting now is really an error with qemu.
>>> But I usually trust the real hardware more :-)
>> Try KVM as well. It is, of course, must faster than QEMU, and it comes
>> with true SMP (given you have a SMP host). With in-kernel irqchip
>> (that's default), KVM now also supports NMI IPIs. And debug registers
>> should be fine with my latest patch.
>>
>> I'm currently trying to get debug support straight for upstream KVM and,
>> where also required, QEMU. SMP debugging is a common issue, but already
>> usable with KVM. So testers are welcome, an overview on required patches
>> can be provided.
>>
> 
> Hm. Doesn't KVM require special hardware? I have just a cheap laptop
> (Pentium Dual-Core) and I doubt I will be able to run it... :-(

Yeah, forgot to mention that "minor" precondition...

Jan


[-- Attachment #2: OpenPGP digital signature --]
[-- Type: application/pgp-signature, Size: 254 bytes --]

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH] kmemcheck: SMP support
  2008-05-23 17:32       ` Vegard Nossum
  2008-05-23 17:54         ` Jan Kiszka
@ 2008-05-23 20:54         ` Jeremy Fitzhardinge
  1 sibling, 0 replies; 15+ messages in thread
From: Jeremy Fitzhardinge @ 2008-05-23 20:54 UTC (permalink / raw)
  To: Vegard Nossum; +Cc: Jan Kiszka, Ingo Molnar, Pekka Enberg, linux-kernel

Vegard Nossum wrote:
> Hm. Doesn't KVM require special hardware? I have just a cheap laptop
> (Pentium Dual-Core) and I doubt I will be able to run it... :-(
>   

If you have an Intel dual-core CPU from the last couple of years, it 
almost certainly has the VT capability (look for "vmx" in 
/proc/cpuinfo).  It may not be usable unless the BIOS enables it though...

    J

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: Fwd: [PATCH] kmemcheck: SMP support
       [not found] ` <19f34abd0805230719j1ce0e2eje6da7c1f963fdf75@mail.gmail.com>
@ 2008-05-25 14:30   ` Pekka Paalanen
  0 siblings, 0 replies; 15+ messages in thread
From: Pekka Paalanen @ 2008-05-25 14:30 UTC (permalink / raw)
  To: Vegard Nossum; +Cc: Ingo Molnar, Pekka Enberg, linux-kernel

On Fri, 23 May 2008 16:19:43 +0200
"Vegard Nossum" <vegard.nossum@gmail.com> wrote:

> Oops, forgot to put you on Cc. I thought you might find this
> interesting as well... :-)

Yes, thanks. My opinion is that mmiotrace can live well enough with
runtime-disabling extra cpus when tracing starts. Multi-cpu effects to
hardware access are not really in the focus but seeing the access in
the first place. I'd rather wait to see if the per-cpu page table
feature starts to evolve.

I quickly read through your code and tried to come up with race
scenarios, but failed. Ok, one question which might be far fetched:
is there a window for things to go wrong, when one cpu has faulted
and is submitting the NMI, but another cpu is managing cpus' online
state at the time? Is there a place in the cpu state management
that might fault at a point where cpu maps are inconsistent?
I doubt, but I don't know. And of course this could be a problem
only when something is bringing cpus on- or offline.

Btw. isn't reserving a cpumask_t variable from the stack discouraged
because it might have to deal with thousands of cpus and consume a
lot of memory?


Thanks.

> ---------- Forwarded message ----------
> From: Vegard Nossum <vegard.nossum@gmail.com>
> Date: Fri, May 23, 2008 at 4:17 PM
> Subject: [PATCH] kmemcheck: SMP support
> To: Ingo Molnar <mingo@elte.hu>, Pekka Enberg <penberg@cs.helsinki.fi>
> Cc: linux-kernel@vger.kernel.org
> 
> 
> Hi,
> 
> This works on real hw, but not on qemu. It seems to get stuck waiting for one
> of the atomic values to change. Don't know why yet, it might just be yet
> another bug in qemu... (we've hit at least two of them so far. And they were
> real bugs too.)
> 
> But do you think this approach is feasible? It will kill interactivity,
> that's for sure, though only when kmemcheck is run-time enabled and number
> of CPUs > 1 (the more the worse).
> 

-- 
Pekka Paalanen
http://www.iki.fi/pq/

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH] kmemcheck: SMP support
  2008-05-23 15:30   ` Vegard Nossum
  2008-05-23 16:13     ` Jeremy Fitzhardinge
@ 2008-05-26  9:11     ` Ingo Molnar
  2008-05-26  9:29     ` Avi Kivity
  2 siblings, 0 replies; 15+ messages in thread
From: Ingo Molnar @ 2008-05-26  9:11 UTC (permalink / raw)
  To: Vegard Nossum; +Cc: Pekka Enberg, linux-kernel, Pekka Paalanen


* Vegard Nossum <vegard.nossum@gmail.com> wrote:

> On Fri, May 23, 2008 at 5:06 PM, Ingo Molnar <mingo@elte.hu> wrote:
> >
> > Vegard, wanna have a look at introducing per CPU kernel pagetables? I
> > tried that once in the past and it wasnt too horrible. (the patches are
> > gone though) We could do it before bringing other CPUs online, i.e. much
> > of the really yucky boot time pagetable juggling phase would be over
> > already. Hm?
> 
> Ingo.
> 
> It really doesn't matter how easy it was for you.
> 
> You're one of the x86 maintainers.
> 
> And I think you're forgetting how hard these things are for a newbie. 
> I don't even know which one comes first of pmds and puds.

hey i kept mixing that up myself, until i had to fix a bug in it once.

> Per-cpu page tables sounds about on the same scale of as, say, 
> rewriting the VM or some other major subsystem. Epic!

just to make sure: this is an optional "would be really cool" thing. It 
does not impact the mergability of kmemcheck in any way. [ It impacts 
the _usability_ of it :-) ]

> I'm glad to hear from you, though.
> 
> Pekka suggested that per-cpu page tables might help NUMA systems too. 
> Does that sound right to you? Would anybody else benefit from having 
> per-CPU page tables? If not, I have a hard time believing it will ever 
> get merged.

i tried to implement it once for 4G:4G split kernel. My early results 
indicated that especially on PAE there was a significant SMP speedup 
from it. I have not gone down that path so i dont know it for sure, but 
perhaps it was because the CPU sets the accessed bit in kernel PTEs and 
dirties those cachelines which creates overhead on SMP. That happened 
even if there was no kernel pagetable remapping activities. (That was 
many years ago and newer CPUs might be better at it though.)

> (Oh. mmio-trace might. But that's also a hacking tool and doesn't 
> really count.)

even if mmiotrace were just a hacking too (but mmiotrace is a system 
diagnostics tool as well), that could still be a strong reason to do it.

	Ingo

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH] kmemcheck: SMP support
  2008-05-23 15:30   ` Vegard Nossum
  2008-05-23 16:13     ` Jeremy Fitzhardinge
  2008-05-26  9:11     ` Ingo Molnar
@ 2008-05-26  9:29     ` Avi Kivity
  2 siblings, 0 replies; 15+ messages in thread
From: Avi Kivity @ 2008-05-26  9:29 UTC (permalink / raw)
  To: Vegard Nossum; +Cc: Ingo Molnar, Pekka Enberg, linux-kernel, Pekka Paalanen

Vegard Nossum wrote:
> On Fri, May 23, 2008 at 5:06 PM, Ingo Molnar <mingo@elte.hu> wrote:
>   
>> Vegard, wanna have a look at introducing per CPU kernel pagetables? I
>> tried that once in the past and it wasnt too horrible. (the patches are
>> gone though) We could do it before bringing other CPUs online, i.e. much
>> of the really yucky boot time pagetable juggling phase would be over
>> already. Hm?
>>     
>
> Ingo.
>
> It really doesn't matter how easy it was for you.
>
> You're one of the x86 maintainers.
>
> And I think you're forgetting how hard these things are for a newbie.
> I don't even know which one comes first of pmds and puds.
>
> Per-cpu page tables sounds about on the same scale of as, say,
> rewriting the VM or some other major subsystem. Epic!
>
>   

You might be able to pull off a simple implementation using 
paravirt_ops, without impacting the core VM.

Basically, you keep the current global pagetables, but never set them as 
real pagetables.  Instead you keep per-cpu copies of these pagetables 
and sync them from the master pagetable as needed.

-- 
error compiling committee.c: too many arguments to function


^ permalink raw reply	[flat|nested] 15+ messages in thread

end of thread, other threads:[~2008-05-26  9:30 UTC | newest]

Thread overview: 15+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2008-05-23 14:17 [PATCH] kmemcheck: SMP support Vegard Nossum
2008-05-23 15:06 ` Ingo Molnar
2008-05-23 15:30   ` Vegard Nossum
2008-05-23 16:13     ` Jeremy Fitzhardinge
2008-05-26  9:11     ` Ingo Molnar
2008-05-26  9:29     ` Avi Kivity
2008-05-23 15:40 ` Jeremy Fitzhardinge
2008-05-23 15:51   ` Vegard Nossum
2008-05-23 17:12     ` Jan Kiszka
2008-05-23 17:32       ` Vegard Nossum
2008-05-23 17:54         ` Jan Kiszka
2008-05-23 20:54         ` Jeremy Fitzhardinge
2008-05-23 16:09 ` Johannes Weiner
2008-05-23 17:10   ` Vegard Nossum
     [not found] ` <19f34abd0805230719j1ce0e2eje6da7c1f963fdf75@mail.gmail.com>
2008-05-25 14:30   ` Fwd: " Pekka Paalanen

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox