* [PATCH] NMI request/release
From: Corey Minyard @ 2002-10-22 1:32 UTC (permalink / raw)
To: linux-kernel
[-- Attachment #1: Type: text/plain, Size: 518 bytes --]
The attached patch implements a way to request to receive an NMI if it
comes from an otherwise unknown source. I needed this for handling NMIs
with the IPMI watchdog. This feature was discussed a while back on the
mailing list, but nothing was done about it, so I implemented a
request/release interface for NMIs. The code is in
arch/i386/kernel/traps.c, but it's pretty generic and could possibly
live under kernel/ instead. I'm not sure, though.
This is relative to 2.5.44. I have a 2.4 version of it, too.
-Corey
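
For illustration, using it looks something like this (the IPMI names
here are just an example, not part of the patch):

	#include <linux/nmi.h>

	/* Return 1 if we recognized (and handled) the NMI, 0 to let
	   any other registered handlers look at it. */
	static int ipmi_wd_nmi(void *dev_id, struct pt_regs *regs)
	{
		/* ... ask the hardware whether it raised this NMI ... */
		return 1;
	}

	static struct nmi_handler ipmi_wd_nmi_handler = {
		.dev_name = "ipmi_watchdog",
		.dev_id   = NULL,
		.handler  = ipmi_wd_nmi,
	};

	int ipmi_wd_setup(void)
	{
		if (request_nmi(&ipmi_wd_nmi_handler) != 0)
			return -EBUSY;	/* already registered */
		return 0;
	}

	void ipmi_wd_teardown(void)
	{
		release_nmi(&ipmi_wd_nmi_handler);
	}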
[-- Attachment #2: linux-nmi.diff --]
[-- Type: text/plain, Size: 4761 bytes --]
diff -urN linux.orig/arch/i386/kernel/i386_ksyms.c linux/arch/i386/kernel/i386_ksyms.c
--- linux.orig/arch/i386/kernel/i386_ksyms.c	Mon Oct 21 13:25:58 2002
+++ linux/arch/i386/kernel/i386_ksyms.c	Mon Oct 21 20:04:35 2002
@@ -14,6 +14,7 @@
 #include <linux/kernel.h>
 #include <linux/string.h>
 #include <linux/tty.h>
+#include <linux/nmi.h>
 #include <linux/highmem.h>
 
 #include <asm/semaphore.h>
@@ -89,6 +90,9 @@
 EXPORT_SYMBOL(get_cmos_time);
 EXPORT_SYMBOL(cpu_khz);
 EXPORT_SYMBOL(apm_info);
+
+EXPORT_SYMBOL(request_nmi);
+EXPORT_SYMBOL(release_nmi);
 
 #ifdef CONFIG_DEBUG_IOVIRT
 EXPORT_SYMBOL(__io_virt_debug);
diff -urN linux.orig/arch/i386/kernel/traps.c linux/arch/i386/kernel/traps.c
--- linux.orig/arch/i386/kernel/traps.c	Mon Oct 21 13:25:45 2002
+++ linux/arch/i386/kernel/traps.c	Mon Oct 21 20:06:43 2002
@@ -485,6 +485,97 @@
 	printk("Do you have a strange power saving mode enabled?\n");
 }
 
+/* A list of handlers for NMIs.  This list will be called in order
+   when an NMI from an otherwise unidentifiable source comes in.  If
+   one of these handles the NMI, it should return 1.  NMI handlers
+   cannot claim spinlocks, so we handle mutual exclusion in a
+   different manner.  A spinlock protects the list from multiple
+   writers.  When something is removed from the list, it will check
+   to see that calling_nmi_handlers is 0 before returning.  If it is
+   not zero, another processor is handling an NMI, and it should wait
+   until it goes to zero to return and allow the user to free that
+   data. */
+static volatile struct nmi_handler *nmi_handler_list = NULL;
+static spinlock_t nmi_handler_lock = SPIN_LOCK_UNLOCKED;
+static atomic_t calling_nmi_handlers = ATOMIC_INIT(0);
+
+int request_nmi(struct nmi_handler *handler)
+{
+	volatile struct nmi_handler *curr;
+
+	spin_lock(&nmi_handler_lock);
+
+	/* Make sure the thing is not already in the list. */
+	curr = nmi_handler_list;
+	while (curr) {
+		if (curr == handler) {
+			break;
+		}
+		curr = curr->next;
+	}
+	if (curr) {
+		/* Must drop the lock before bailing out. */
+		spin_unlock(&nmi_handler_lock);
+		return EBUSY;
+	}
+
+	handler->next = nmi_handler_list;
+	xchg(&nmi_handler_list, handler);
+
+	spin_unlock(&nmi_handler_lock);
+	return 0;
+}
+
+void release_nmi(struct nmi_handler *handler)
+{
+	volatile struct nmi_handler *curr, *prev;
+
+	spin_lock(&nmi_handler_lock);
+
+	/* Find it in the list. */
+	curr = nmi_handler_list;
+	prev = NULL;
+	while (curr) {
+		if (curr == handler) {
+			break;
+		}
+		prev = curr;
+		curr = curr->next;
+	}
+
+	if (curr) {
+		/* If it was found, remove it from the list.  We
+		   assume the write operation here is atomic. */
+		if (prev)
+			xchg(&(prev->next), curr->next);
+		else
+			xchg(&nmi_handler_list, curr->next);
+
+		/* If some other part of the kernel is handling an
+		   NMI, we make sure that we don't release the handler
+		   (or really, allow the user to release the handler)
+		   until it has finished handling the NMI. */
+		while (atomic_read(&calling_nmi_handlers))
+			;
+	}
+	spin_unlock(&nmi_handler_lock);
+}
+
+static int call_nmi_handlers(struct pt_regs * regs)
+{
+	volatile struct nmi_handler *curr;
+	int handled = 0;
+
+	atomic_inc(&calling_nmi_handlers);
+	smp_mb();
+	curr = nmi_handler_list;
+	while (curr) {
+		handled |= curr->handler(curr->dev_id, regs);
+		curr = curr->next;
+	}
+	smp_mb();
+	atomic_dec(&calling_nmi_handlers);
+	return handled;
+}
+
 static void default_do_nmi(struct pt_regs * regs)
 {
 	unsigned char reason = inb(0x61);
@@ -500,6 +591,12 @@
 		return;
 	}
 #endif
+
+	/* Check our handler list to see if anyone can handle this
+	   nmi. */
+	if (call_nmi_handlers(regs))
+		return;
+
 	unknown_nmi_error(reason, regs);
 	return;
 }
diff -urN linux.orig/include/asm-i386/irq.h linux/include/asm-i386/irq.h
--- linux.orig/include/asm-i386/irq.h	Mon Oct 21 13:24:41 2002
+++ linux/include/asm-i386/irq.h	Mon Oct 21 20:03:52 2002
@@ -28,4 +28,16 @@
 #define ARCH_HAS_NMI_WATCHDOG /* See include/linux/nmi.h */
 #endif
 
+
+/* Structure for handling NMIs */
+#define HAVE_NMI_HANDLER 1
+struct nmi_handler
+{
+	char *dev_name;
+	void *dev_id;
+	int (*handler)(void *dev_id, struct pt_regs *regs);
+
+	volatile struct nmi_handler *next;
+};
+
 #endif /* _ASM_IRQ_H */
diff -urN linux.orig/include/linux/nmi.h linux/include/linux/nmi.h
--- linux.orig/include/linux/nmi.h	Thu Jun 20 17:53:40 2002
+++ linux/include/linux/nmi.h	Mon Oct 21 20:03:53 2002
@@ -19,4 +19,11 @@
 # define touch_nmi_watchdog() do { } while(0)
 #endif
 
+/**
+ * Register a handler to get called when an NMI occurs.  If the handler
+ * actually handles the NMI, it should return 1.
+ */
+int request_nmi(struct nmi_handler *handler);
+void release_nmi(struct nmi_handler *handler);
+
 #endif

* Re: [PATCH] NMI request/release
From: John Levon @ 2002-10-22  2:10 UTC (permalink / raw)
To: linux-kernel; +Cc: cminyard

On Mon, Oct 21, 2002 at 08:32:47PM -0500, Corey Minyard wrote:

> The attached patch implements a way to request to receive an NMI if it
> comes from an otherwise unknown source. I needed this for handling NMIs
> with the IPMI watchdog. This feature was discussed a while back

Then the NMI watchdog and oprofile should be changed to use this too. We
also need priority and/or an equivalent of NOTIFY_STOP_MASK so we can
break out of calling all the handlers. Actually, why do you continue if
one of the handlers returns 1 anyway?

> +	atomic_inc(&calling_nmi_handlers);

Isn't this going to cause cacheline ping-pong?

> +	curr = nmi_handler_list;
> +	while (curr) {
> +		handled |= curr->handler(curr->dev_id, regs);

dev_name is never used at all. What is it for? Also, it would be nice
to do an smp_processor_id() just once and pass that in to prevent
multiple calls to get_current().

Couldn't you modify the notifier code to do the xchg()s (though that's
not available on all CPU types ...)?

> +#define HAVE_NMI_HANDLER 1

What uses this?

> +	volatile struct nmi_handler *next;

Hmm ...

Is it not possible to use linux/rcupdate.h for this stuff?

regards
john

--
"Lots of companies would love to be in our hole."
	- Scott McNealy
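
The "notifier code" John refers to is the chain mechanism in
linux/notifier.h. As a rough sketch of that pattern (shape recalled
from the era's interface, not a verbatim copy), registration threads a
priority-sorted singly linked list, and the caller walks it until a
callee sets NOTIFY_STOP_MASK:

	struct notifier_block
	{
		int (*notifier_call)(struct notifier_block *self,
				     unsigned long event, void *data);
		struct notifier_block *next;
		int priority;		/* chain kept sorted on this */
	};

	/* The walker stops early when a callee ORs in NOTIFY_STOP_MASK. */
	int notifier_call_chain(struct notifier_block **n,
				unsigned long event, void *data)
	{
		int ret = NOTIFY_DONE;
		struct notifier_block *nb = *n;

		while (nb) {
			ret = nb->notifier_call(nb, event, data);
			if (ret & NOTIFY_STOP_MASK)
				break;
			nb = nb->next;
		}
		return ret;
	}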

* Re: [PATCH] NMI request/release
From: Corey Minyard @ 2002-10-22  2:32 UTC (permalink / raw)
To: John Levon; +Cc: linux-kernel

John Levon wrote:

>On Mon, Oct 21, 2002 at 08:32:47PM -0500, Corey Minyard wrote:
>
>>The attached patch implements a way to request to receive an NMI if it
>>comes from an otherwise unknown source. I needed this for handling NMIs
>>with the IPMI watchdog. This feature was discussed a while back
>>
>Then the NMI watchdog and oprofile should be changed to use this too.
>
Maybe so. If we get the infrastructure into place, we can change that
afterwards.

>We also need priority and/or an equivalent of NOTIFY_STOP_MASK so we
>can break out of calling all the handlers. Actually, why do you
>continue if one of the handlers returns 1 anyway?
>
What if there's an NMI from multiple things? Not that it's likely, but
it's possible, right? I could easily add priority and sort the list by
it, and add a NOTIFY_STOP_MASK, if there is some benefit.

>>+	atomic_inc(&calling_nmi_handlers);
>>
>Isn't this going to cause cacheline ping-pong?
>
This is an NMI, does it really matter? And is there another way to do
this without locks?

>>+	curr = nmi_handler_list;
>>+	while (curr) {
>>+		handled |= curr->handler(curr->dev_id, regs);
>>
>dev_name is never used at all. What is it for? Also, it would be nice
>to do an smp_processor_id() just once and pass that in to prevent
>multiple calls to get_current().
>
dev_name could be removed, although it would be nice for reporting
later. Basically, it's there for the same reason it's there for
interrupts. And again, this is an NMI, I don't think performance really
matters (unless we perhaps add the NMI watchdog to this).

>Couldn't you modify the notifier code to do the xchg()s (though that's
>not available on all CPU types ...)?
>
I don't understand. The xchg()s are for atomicity between the
request/release code and the NMI handler. How could the notifier code
do it?

>>+#define HAVE_NMI_HANDLER 1
>>
>What uses this?
>
This is so the user code can know if it's available or not.

>>+	volatile struct nmi_handler *next;
>>
>Hmm ...
>
>Is it not possible to use linux/rcupdate.h for this stuff?
>
I'm not sure. It looks possible, but remember, this is an NMI, normal
rules may not apply. Particularly, you cannot block or spin waiting for
something else, the NMI code has to run. An NMI can happen at ANY time.
If the rcu code can handle this, I could use it, but I have not looked
to see if it can.

Thanks,

-Corey
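
Concretely, the insertion pattern being defended is (a sketch against
the v1 list head; on i386, xchg() implies a full memory barrier, which
is what makes the publish safe for a concurrent NMI walker):

	/* Build the node completely, then publish it with one atomic
	   pointer update, so an NMI walking the list concurrently sees
	   either the old head or a fully initialized new node. */
	static void publish_nmi_handler(struct nmi_handler *handler)
	{
		handler->next = nmi_handler_list;  /* node complete first */
		/* xchg() is a full barrier on i386, so the store above
		   is visible before the new head is. */
		xchg(&nmi_handler_list, handler);
	}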

* Re: [PATCH] NMI request/release
From: John Levon @ 2002-10-22  2:53 UTC (permalink / raw)
To: Corey Minyard; +Cc: linux-kernel

On Mon, Oct 21, 2002 at 09:32:07PM -0500, Corey Minyard wrote:

> This is an NMI, does it really matter?

Yes. Both for oprofile and the NMI watchdog (which was firing awfully
often last time I checked). The handler needs to be as streamlined as
possible.

> dev_name could be removed, although it would be nice for reporting
> later.

Reporting what? From where?

> >Couldn't you modify the notifier code to do the xchg()s (though that's
> >not available on all CPU types ...)?
> >
> I don't understand. The xchg()s are for atomicity between the
> request/release code and the NMI handler. How could the notifier code
> do it?

You are using the xchg()s in an attempt to thread onto/off the list
safely, no?

> >>+#define HAVE_NMI_HANDLER 1
>
> This is so the user code can know if it's available or not.

If we had that for every API or API change, the kernel would be mostly
HAVE_*. It's either available or it's not. If you're maintaining an
external module, then autoconf or similar is the proper way to check for
its existence.

> >Is it not possible to use linux/rcupdate.h for this stuff?
> >
> I'm not sure. It looks possible, but remember, this is an NMI, normal
> rules may not apply. Particularly, you cannot block or spin waiting for
> something else, the NMI code has to run. An NMI can happen at ANY time.

Believe me, I know :)

> If the rcu code can handle this, I could use it, but I have not looked
> to see if it can.

If it's possible (and I have no idea, not having looked at RCU at all)
it seems the right way.

regards
john

--
"Lots of companies would love to be in our hole."
	- Scott McNealy

* Re: [PATCH] NMI request/release
From: Corey Minyard @ 2002-10-22 13:02 UTC (permalink / raw)
To: John Levon; +Cc: linux-kernel

John Levon wrote:

>On Mon, Oct 21, 2002 at 09:32:07PM -0500, Corey Minyard wrote:
>
>>This is an NMI, does it really matter?
>>
>Yes. Both for oprofile and the NMI watchdog (which was firing awfully
>often last time I checked). The handler needs to be as streamlined as
>possible.
>
Ok. I'd be inclined to leave the high-usage things where they are,
although it would be nice to be able to make the NMI watchdog a module.
oprofile should probably stay where it is. Do you have an alternate
implementation that would be more efficient?

>>dev_name could be removed, although it would be nice for reporting
>>later.
>>
>Reporting what? From where?
>
Registered NMI users in procfs.

>>>Couldn't you modify the notifier code to do the xchg()s (though that's
>>>not available on all CPU types ...)?
>>>
>>I don't understand. The xchg()s are for atomicity between the
>>request/release code and the NMI handler. How could the notifier code
>>do it?
>>
>You are using the xchg()s in an attempt to thread onto/off the list
>safely, no?
>
Yes. But I don't understand why they would be used in the notifier code.

>>>>+#define HAVE_NMI_HANDLER 1
>>>>
>>This is so the user code can know if it's available or not.
>>
>If we had that for every API or API change, the kernel would be mostly
>HAVE_*. It's either available or it's not. If you're maintaining an
>external module, then autoconf or similar is the proper way to check for
>its existence.
>
I'm not worried about kernel versions so much as processor capability.
Some processors may not have NMIs, or may not be capable of doing this.
A few of these exist (like __HAVE_ARCH_CMPXCHG). The name's probably
bad; maybe it should be "__HAVE_ARCH_NMI_HANDLER"?

>>If the rcu code can handle this, I could use it, but I have not looked
>>to see if it can.
>>
>If it's possible (and I have no idea, not having looked at RCU at all)
>it seems the right way.
>
I looked, and the rcu code relies on turning off interrupts to avoid
preemption. So it won't work.

Thanks again,

-Corey

* Re: [PATCH] NMI request/release
From: John Levon @ 2002-10-22 15:09 UTC (permalink / raw)
To: Corey Minyard; +Cc: linux-kernel

On Tue, Oct 22, 2002 at 08:02:11AM -0500, Corey Minyard wrote:

> Ok. I'd be inclined to leave the high-usage things where they are,
> although it would be nice to be able to make the NMI watchdog a module.
> oprofile should probably stay where it is. Do you have an alternate
> implementation that would be more efficient?

I'm beginning to think you're right. You should ask Keith Owens if kdb
etc. can use your API successfully.

> >>dev_name could be removed, although it would be nice for reporting
> >>later.
> >>
> >Reporting what? From where?
> >
> Registered NMI users in procfs.

Then if you add such code, you can add dev_name ... I hate code that
does nothing ...

> Yes. But I don't understand why they would be used in the notifier code.

I'm trying to reduce code duplication - you do basically the same thing
notifier register/unregister does.

btw, the stuff you add to header files should all be in asm-i386/nmi.h
IMHO. It would make it clear that there's a fast-path "set nmi handler"
and the slow one, and you can document the difference there, if that's
what we're going to do.

regards
john

--
"Lots of companies would love to be in our hole."
	- Scott McNealy

* Re: [PATCH] NMI request/release
From: Corey Minyard @ 2002-10-22 16:03 UTC (permalink / raw)
To: John Levon; +Cc: Corey Minyard, linux-kernel

John Levon wrote:

>On Tue, Oct 22, 2002 at 08:02:11AM -0500, Corey Minyard wrote:
>
>>Ok. I'd be inclined to leave the high-usage things where they are,
>>although it would be nice to be able to make the NMI watchdog a module.
>>oprofile should probably stay where it is. Do you have an alternate
>>implementation that would be more efficient?
>>
>I'm beginning to think you're right. You should ask Keith Owens if kdb
>etc. can use your API successfully.
>
Ok. Good thought, that would decouple kdb a little.

>>>>dev_name could be removed, although it would be nice for reporting
>>>>later.
>>>>
>>>Reporting what? From where?
>>>
>>Registered NMI users in procfs.
>>
>Then if you add such code, you can add dev_name ... I hate code that
>does nothing ...
>
Ok, I'll add a procfs interface then :-). IMHO, there's a difference
between stuff in an interface that is looking forward and dead code,
though. If I added it later, I would break all the users. But there is
a balance.

>>Yes. But I don't understand why they would be used in the notifier code.
>>
>I'm trying to reduce code duplication - you do basically the same thing
>notifier register/unregister does.
>
Ah. Yes, there is some stuff that looks the same but is subtly
different. I'll see what I can do.

>btw, the stuff you add to header files should all be in asm-i386/nmi.h
>IMHO.
>
Ok, I agree.

>It would make it clear that there's a fast-path "set nmi handler" and
>the slow one, and you can document the difference there, if that's what
>we're going to do.
>
Thanks,

-Corey

* Re: [PATCH] NMI request/release
From: Robert Love @ 2002-10-22 17:23 UTC (permalink / raw)
To: Corey Minyard; +Cc: John Levon, linux-kernel

On Tue, 2002-10-22 at 09:02, Corey Minyard wrote:

> I looked, and the rcu code relies on turning off interrupts to avoid
> preemption. So it won't work.

At least on the variant of RCU that is in 2.5, the RCU code does the
read side by disabling preemption. Nothing else.

The write side is the same with or without preemption - wait until all
readers are quiescent, change the copy, etc.

But anyhow, disabling interrupts should not affect NMIs, no?

	Robert Love

* Re: [PATCH] NMI request/release
From: Corey Minyard @ 2002-10-22 18:08 UTC (permalink / raw)
To: Robert Love; +Cc: John Levon, linux-kernel

Robert Love wrote:

>On Tue, 2002-10-22 at 09:02, Corey Minyard wrote:
>
>>I looked, and the rcu code relies on turning off interrupts to avoid
>>preemption. So it won't work.
>>
>At least on the variant of RCU that is in 2.5, the RCU code does the
>read side by disabling preemption. Nothing else.
>
In 2.5.44, stock from kernel.org, rcu_process_callbacks() calls
local_irq_disable(). Is that just preemption disabling, now?

>The write side is the same with or without preemption - wait until all
>readers are quiescent, change the copy, etc.
>
>But anyhow, disabling interrupts should not affect NMIs, no?
>
You are correct. Disabling preemption or interrupts has no effect on
NMIs.

-Corey

* Re: [PATCH] NMI request/release
From: Robert Love @ 2002-10-22 18:16 UTC (permalink / raw)
To: Corey Minyard; +Cc: John Levon, linux-kernel

On Tue, 2002-10-22 at 14:08, Corey Minyard wrote:

> In 2.5.44, stock from kernel.org, rcu_process_callbacks() calls
> local_irq_disable(). Is that just preemption disabling, now?

No, but rcu_process_callbacks() is for the copy update part. Look at
rcu_read_lock() and rcu_read_unlock()... those are the read paths.

Of course, I can be very wrong about some of this, RCU
grosses^Wconfuses me. But the read paths do just seem to call
rcu_read_lock/unlock, which just do a preempt_disable/enable. Otherwise
the read path is entirely transparent.

	Robert Love
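
A small sketch of what Robert describes; in this era the read-side
primitives reduce to preemption control (shapes assumed from
linux/rcupdate.h of the period, shown only for illustration):

	/* Rough shape of the 2.5 read-side primitives. */
	#define rcu_read_lock()		preempt_disable()
	#define rcu_read_unlock()	preempt_enable()

	/* So a reader-side walk takes no lock at all: */
	void reader_walk(void)
	{
		rcu_read_lock();	/* only disables preemption */
		/* ... traverse the RCU-protected list; nodes cannot be
		   freed under us, because updaters defer their frees
		   via call_rcu() until a grace period passes ... */
		rcu_read_unlock();
	}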

* Re: [PATCH] NMI request/release
From: Dipankar Sarma @ 2002-10-22 20:04 UTC (permalink / raw)
To: Corey Minyard; +Cc: Robert Love, linux-kernel

On Tue, Oct 22, 2002 at 08:30:16PM +0200, Corey Minyard wrote:

> Robert Love wrote:
> >At least on the variant of RCU that is in 2.5, the RCU code does the
> >read side by disabling preemption. Nothing else.
> >
> In 2.5.44, stock from kernel.org, rcu_process_callbacks() calls
> local_irq_disable(). Is that just preemption disabling, now?

No, that is to allow queueing of callbacks from irq context - see
call_rcu(). Since the queues are per-CPU, we don't need any spinlock.
rcu_process_callbacks() is always invoked from the RCU per-CPU tasklet,
so preemption doesn't come into the picture. But irq disabling is
needed so that it doesn't race with call_rcu().

The only preemption-related issue with RCU is that on the reader side
(in your case, traversing the nmi handler list for invocation), there
should not be any preemption (not possible anyway in your case). This
is achieved by rcu_read_lock()/rcu_read_unlock(), which essentially
disable/enable preemption. The idea is that if you get preempted while
holding a reference to some RCU-protected data, it is not safe to
invoke the RCU callback. Once you get preempted, you can run on a
different CPU, and keeping track of the preempted tasks becomes
difficult. Besides, preempted tasks with low priority can delay RCU
updates for long periods. Hence the disabling of preemption, which is
no worse than locks.

> >But anyhow, disabling interrupts should not affect NMIs, no?
> >
> You are correct. Disabling preemption or interrupts has no effect on
> NMIs.

Yes.

Thanks
Dipankar

* Re: [PATCH] NMI request/release
From: Dipankar Sarma @ 2002-10-22 17:53 UTC (permalink / raw)
To: Corey Minyard; +Cc: linux-kernel, levon

On Tue, Oct 22, 2002 at 03:10:06PM +0200, Corey Minyard wrote:

> >If it's possible (and I have no idea, not having looked at RCU at all)
> >it seems the right way.
> >
> I looked, and the rcu code relies on turning off interrupts to avoid
> preemption. So it won't work.

Hmm.. Let me see -

You need to walk the list in call_nmi_handlers from the nmi interrupt
handler, where preemption is not an issue anyway. Using RCU you can
possibly do a safe walk of the nmi handlers. To do this, your update
side code (request/release nmi) will still have to be serialized
(spinlock), but you should not need to wait for completion of any other
CPU executing the nmi handler; instead, provide wrappers for
nmi_handler allocation/free and there free the nmi_handler using an RCU
callback (call_rcu()). The nmi_handler will not be freed until all the
CPUs have done a context switch, executed user-level code, or been
idle. This guarantees that *this* nmi_handler is not in execution and
can safely be freed.

This of course is a very simplistic view of things; there could be
complications that I may have overlooked. But I would be happy to help
out on this if you want.

Thanks
Dipankar
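
A sketch of the scheme Dipankar outlines; nmi_handler_lock is the
existing update-side lock, and nmi_handler_free() is a hypothetical
stand-in for however the caller frees its handler:

	static void nmi_handler_rcu_free(void *arg)
	{
		struct nmi_handler *h = arg;

		/* By the time this callback runs, every CPU has passed
		   through a quiescent state, so no CPU can still be
		   inside h->handler(). */
		nmi_handler_free(h);
	}

	void example_release_nmi(struct nmi_handler *h)
	{
		spin_lock(&nmi_handler_lock);	/* serialize updaters only */
		/* ... unlink h from the handler list ... */
		spin_unlock(&nmi_handler_lock);

		/* Defer the actual free past the grace period; no
		   waiting for other CPUs here. */
		call_rcu(&h->rcu, nmi_handler_rcu_free, h);
	}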

* Re: [PATCH] NMI request/release
From: Corey Minyard @ 2002-10-22 18:05 UTC (permalink / raw)
To: dipankar; +Cc: linux-kernel, levon

Dipankar Sarma wrote:

>On Tue, Oct 22, 2002 at 03:10:06PM +0200, Corey Minyard wrote:
>
>>>If it's possible (and I have no idea, not having looked at RCU at all)
>>>it seems the right way.
>>>
>>I looked, and the rcu code relies on turning off interrupts to avoid
>>preemption. So it won't work.
>>
>Hmm.. Let me see -
>
>You need to walk the list in call_nmi_handlers from the nmi interrupt
>handler, where preemption is not an issue anyway. Using RCU you can
>possibly do a safe walk of the nmi handlers. To do this, your update
>side code (request/release nmi) will still have to be serialized
>(spinlock), but you should not need to wait for completion of any other
>CPU executing the nmi handler; instead, provide wrappers for
>nmi_handler allocation/free and there free the nmi_handler using an RCU
>callback (call_rcu()). The nmi_handler will not be freed until all the
>CPUs have done a context switch, executed user-level code, or been
>idle. This guarantees that *this* nmi_handler is not in execution and
>can safely be freed.
>
>This of course is a very simplistic view of things; there could be
>complications that I may have overlooked. But I would be happy to help
>out on this if you want.
>
This doesn't sound any simpler than what I am doing right now. In fact,
it sounds more complex. Am I correct? What I am doing is pretty simple
and correct. Maybe more complexity would be required if you couldn't
atomically update a pointer, but I think simplicity should win here.

Thanks,

-Corey

* Re: [PATCH] NMI request/release
From: Dipankar Sarma @ 2002-10-22 18:08 UTC (permalink / raw)
To: Corey Minyard; +Cc: linux-kernel, levon

On Tue, Oct 22, 2002 at 01:05:57PM -0500, Corey Minyard wrote:

> >You need to walk the list in call_nmi_handlers from the nmi interrupt
> >handler, where preemption is not an issue anyway. Using RCU you can
> >possibly do a safe walk of the nmi handlers. To do this, your update
> >side code (request/release nmi) will still have to be serialized
> >(spinlock), but you should not need to wait for completion of any
> >other CPU executing the nmi handler; instead, provide wrappers for
> >nmi_handler allocation/free and there free the nmi_handler using an
> >RCU callback (call_rcu()). The nmi_handler will not be freed until
> >all the CPUs have done a context switch, executed user-level code,
> >or been idle. This guarantees that *this* nmi_handler is not in
> >execution and can safely be freed.
> >
> >This of course is a very simplistic view of things; there could be
> >complications that I may have overlooked. But I would be happy to
> >help out on this if you want.
> >
> This doesn't sound any simpler than what I am doing right now. In fact,
> it sounds more complex. Am I correct? What I am doing is pretty simple
> and correct. Maybe more complexity would be required if you couldn't
> atomically update a pointer, but I think simplicity should win here.

I would vote for simplicity and would normally agree with you here. But
it seems to me that using RCU, you can avoid atomic operations and
cache line bouncing of calling_nmi_handlers in the fast path (the nmi
interrupt handler). One could argue whether it is really a fast path or
not, but if you are using it for profiling, I would say it is. No?

Thanks
Dipankar

* Re: [PATCH] NMI request/release
From: Corey Minyard @ 2002-10-22 18:29 UTC (permalink / raw)
To: dipankar; +Cc: linux-kernel, levon

Dipankar Sarma wrote:

>On Tue, Oct 22, 2002 at 01:05:57PM -0500, Corey Minyard wrote:
>
>>This doesn't sound any simpler than what I am doing right now. In fact,
>>it sounds more complex. Am I correct? What I am doing is pretty simple
>>and correct. Maybe more complexity would be required if you couldn't
>>atomically update a pointer, but I think simplicity should win here.
>>
>I would vote for simplicity and would normally agree with you here. But
>it seems to me that using RCU, you can avoid atomic operations and
>cache line bouncing of calling_nmi_handlers in the fast path (the nmi
>interrupt handler). One could argue whether it is really a fast path or
>not, but if you are using it for profiling, I would say it is. No?
>
I would vote against using it for profiling; profiling has its own
special fast-path, anyway. The NMI watchdog only gets hit once every
minute or so, it seems, so that seems suitable for this.

I've looked at the RCU code a little more, and I think I understand it
better. I think your scenario will work, if it's true that it won't be
called until all CPUs have done what you say. I'll look at it a little
more.

-Corey

* Re: [PATCH] NMI request/release
From: John Levon @ 2002-10-22 19:08 UTC (permalink / raw)
To: linux-kernel; +Cc: dipankar, cminyard

On Tue, Oct 22, 2002 at 01:29:55PM -0500, Corey Minyard wrote:

> I would vote against using it for profiling; profiling has its own
> special fast-path, anyway.

But it would be good (less code, simpler, and even possibly for keeping
the NMI watchdog ticking when oprofile is running) if we could merge
the two cases.

> The NMI watchdog only gets hit once every
> minute or so, it seems, so that seems suitable for this.

It can easily be much more frequent than that (though you could argue
this is a mis-setup).

> I've looked at the RCU code a little more, and I think I understand it
> better. I think your scenario will work, if it's true that it won't be
> called until all CPUs have done what you say. I'll look at it a little
> more.

Thanks for looking into this ...

regards
john

--
"This is mindless pedantism up with which I will not put."
	- Donald Knuth on Pascal's lack of default: case statement

* [PATCH] NMI request/release, version 3
From: Corey Minyard @ 2002-10-22 21:36 UTC (permalink / raw)
To: linux-kernel; +Cc: John Levon, dipankar

[-- Attachment #1: Type: text/plain, Size: 326 bytes --]

Add a request/release mechanism to the kernel (x86 only for now) for
NMIs. This version uses the rcupdate mechanism to free the link items,
instead of using the previous algorithm for interacting with the NMI
handler code to free the item. It should be more scalable.

Thanks to everyone who helped me with this.

-Corey

[-- Attachment #2: linux-nmi-v3.diff --]
[-- Type: text/plain, Size: 10179 bytes --]

diff -ur linux.orig/arch/i386/kernel/i386_ksyms.c linux/arch/i386/kernel/i386_ksyms.c
--- linux.orig/arch/i386/kernel/i386_ksyms.c	Mon Oct 21 13:25:58 2002
+++ linux/arch/i386/kernel/i386_ksyms.c	Tue Oct 22 12:19:59 2002
@@ -90,6 +90,9 @@
 EXPORT_SYMBOL(cpu_khz);
 EXPORT_SYMBOL(apm_info);
 
+EXPORT_SYMBOL(request_nmi);
+EXPORT_SYMBOL(release_nmi);
+
 #ifdef CONFIG_DEBUG_IOVIRT
 EXPORT_SYMBOL(__io_virt_debug);
 #endif
diff -ur linux.orig/arch/i386/kernel/irq.c linux/arch/i386/kernel/irq.c
--- linux.orig/arch/i386/kernel/irq.c	Mon Oct 21 13:25:58 2002
+++ linux/arch/i386/kernel/irq.c	Tue Oct 22 12:08:20 2002
@@ -131,6 +131,8 @@
  * Generic, controller-independent functions:
  */
 
+extern void nmi_append_user_names(struct seq_file *p);
+
 int show_interrupts(struct seq_file *p, void *v)
 {
 	int i, j;
@@ -166,6 +168,8 @@
 		for (j = 0; j < NR_CPUS; j++)
 			if (cpu_online(j))
 				p += seq_printf(p, "%10u ", nmi_count(j));
+		seq_printf(p, "  ");
+		nmi_append_user_names(p);
 		seq_putc(p, '\n');
 #if CONFIG_X86_LOCAL_APIC
 	seq_printf(p, "LOC: ");
diff -ur linux.orig/arch/i386/kernel/nmi.c linux/arch/i386/kernel/nmi.c
--- linux.orig/arch/i386/kernel/nmi.c	Mon Oct 21 13:25:45 2002
+++ linux/arch/i386/kernel/nmi.c	Tue Oct 22 16:18:00 2002
@@ -20,6 +20,7 @@
 #include <linux/interrupt.h>
 #include <linux/mc146818rtc.h>
 #include <linux/kernel_stat.h>
+#include <linux/notifier.h>
 
 #include <asm/smp.h>
 #include <asm/mtrr.h>
@@ -102,6 +103,17 @@
 	return 0;
 }
 
+static int nmi_watchdog_tick (void * dev_id, struct pt_regs * regs);
+
+static struct nmi_handler nmi_watchdog_handler =
+{
+	.dev_name = "nmi_watchdog",
+	.dev_id   = NULL,
+	.handler  = nmi_watchdog_tick,
+	.priority = 255, /* We want to be relatively high priority. */
+	.freed    = NULL
+};
+
 static int __init setup_nmi_watchdog(char *str)
 {
 	int nmi;
@@ -110,6 +122,12 @@
 
 	if (nmi >= NMI_INVALID)
 		return 0;
+
+	if (request_nmi(&nmi_watchdog_handler) != 0) {
+		/* Couldn't add a watchdog handler, give up. */
+		return 0;
+	}
+
 	if (nmi == NMI_NONE)
 		nmi_watchdog = nmi;
 	/*
@@ -350,7 +368,7 @@
 		alert_counter[i] = 0;
 }
 
-void nmi_watchdog_tick (struct pt_regs * regs)
+static int nmi_watchdog_tick (void * dev_id, struct pt_regs * regs)
 {
 
 	/*
@@ -401,4 +419,6 @@
 		}
 		wrmsr(nmi_perfctr_msr, -(cpu_khz/nmi_hz*1000), -1);
 	}
+
+	return NOTIFY_OK;
 }
diff -ur linux.orig/arch/i386/kernel/traps.c linux/arch/i386/kernel/traps.c
--- linux.orig/arch/i386/kernel/traps.c	Mon Oct 21 13:25:45 2002
+++ linux/arch/i386/kernel/traps.c	Tue Oct 22 16:01:33 2002
@@ -23,6 +23,10 @@
 #include <linux/spinlock.h>
 #include <linux/interrupt.h>
 #include <linux/highmem.h>
+#include <linux/notifier.h>
+#include <linux/seq_file.h>
+#include <linux/slab.h>
+#include <linux/rcupdate.h>
 
 #ifdef CONFIG_EISA
 #include <linux/ioport.h>
@@ -485,21 +489,193 @@
 	printk("Do you have a strange power saving mode enabled?\n");
 }
 
+/*
+ * A list of handlers for NMIs.  This list will be called in order
+ * when an NMI from an otherwise unidentifiable source comes in.  If
+ * one of these handles the NMI, it should return NOTIFY_OK, otherwise
+ * it should return NOTIFY_DONE.  NMI handlers cannot claim spinlocks,
+ * so we have to handle freeing these in a different manner.  A
+ * spinlock protects the list from multiple writers.  When something
+ * is removed from the list, it is thrown into another list (with
+ * another link, so the "next" element stays valid) and scheduled to
+ * run as an rcu.  When the rcu runs, it is guaranteed that nothing in
+ * the NMI code will be using it.
+ */
+static struct nmi_handler *nmi_handler_list = NULL;
+static spinlock_t nmi_handler_lock = SPIN_LOCK_UNLOCKED;
+static struct nmi_handler *nmi_to_free_list = NULL;
+static spinlock_t nmi_to_free_lock = SPIN_LOCK_UNLOCKED;
+
+struct rcu_head nmi_rcu;
+
+/*
+ * To free the list item, we use an rcu.  The rcu-function will not
+ * run until all processors have done a context switch, gone idle, or
+ * gone to a user process, so it's guaranteed that when this runs, any
+ * NMI handler running at release time has completed and the list item
+ * can be safely freed.
+ */
+static void really_free_nmi_list(void *unused)
+{
+	unsigned long flags;
+	struct nmi_handler *item;
+
+	spin_lock_irqsave(&nmi_to_free_lock, flags);
+	while (nmi_to_free_list) {
+		item = nmi_to_free_list;
+		nmi_to_free_list = item->link2;
+		item->freed(item);
+	}
+	spin_unlock_irqrestore(&nmi_to_free_lock, flags);
+}
+
+static inline void free_nmi_handler(struct nmi_handler *item)
+{
+	unsigned long flags;
+
+	if (!item->freed)
+		return;
+
+	spin_lock_irqsave(&nmi_to_free_lock, flags);
+	/* We only have one copy of nmi_rcu, so we only want to add it
+	   once.  If there are items in the list, then it has already
+	   been added. */
+	if (nmi_to_free_list == NULL)
+		call_rcu(&nmi_rcu, really_free_nmi_list, NULL);
+	item->link2 = nmi_to_free_list;
+	nmi_to_free_list = item;
+	spin_unlock_irqrestore(&nmi_to_free_lock, flags);
+}
+
+static inline struct nmi_handler *find_nmi_handler(
+	struct nmi_handler *handler,
+	struct nmi_handler **rprev)
+{
+	struct nmi_handler *curr, *prev;
+
+	curr = nmi_handler_list;
+	prev = NULL;
+	while (curr) {
+		if (curr == handler) {
+			break;
+		}
+		prev = curr;
+		curr = curr->next;
+	}
+
+	if (rprev)
+		*rprev = prev;
+
+	return curr;
+}
+
+int request_nmi(struct nmi_handler *handler)
+{
+	struct nmi_handler *curr;
+
+	spin_lock(&nmi_handler_lock);
+
+	/* Make sure the thing is not already in the list. */
+	if (find_nmi_handler(handler, NULL)) {
+		/* Must drop the lock before bailing out. */
+		spin_unlock(&nmi_handler_lock);
+		return EBUSY;
+	}
+
+	/* Add it into the list in priority order. */
+	curr = nmi_handler_list;
+	if ((!curr) || (curr->priority < handler->priority)) {
+		handler->next = nmi_handler_list;
+		smp_mb();
+		xchg(&nmi_handler_list, handler);
+	} else {
+		while (curr->next &&
+		       (curr->next->priority > handler->priority))
+		{
+			curr = curr->next;
+		}
+		handler->next = curr->next;
+		smp_mb();
+		xchg(&(curr->next), handler);
+	}
+
+	spin_unlock(&nmi_handler_lock);
+	return 0;
+}
+
+void release_nmi(struct nmi_handler *handler)
+{
+	struct nmi_handler *curr, *prev;
+
+	spin_lock(&nmi_handler_lock);
+
+	/* Find it in the list. */
+	curr = find_nmi_handler(handler, &prev);
+
+	if (curr) {
+		/* If it was found, remove it from the list.  We
+		   assume the write operation here is atomic. */
+		smp_mb();
+		if (prev)
+			xchg(&(prev->next), curr->next);
+		else
+			xchg(&nmi_handler_list, curr->next);
+
+		free_nmi_handler(curr);
+	}
+	spin_unlock(&nmi_handler_lock);
+}
+
+static int call_nmi_handlers(struct pt_regs * regs)
+{
+	struct nmi_handler *curr, *next;
+	int handled = 0;
+	int val;
+
+	curr = nmi_handler_list;
+	while (curr) {
+		next = curr->next;
+		val = curr->handler(curr->dev_id, regs);
+		switch (val & ~NOTIFY_STOP_MASK) {
+		case NOTIFY_OK:
+			handled = 1;
+			break;

+		case NOTIFY_DONE:
+		default:
+		}
+		if (val & NOTIFY_STOP_MASK)
+			break;
+		curr = next;
+	}
+
+	return handled;
+}
+
+void nmi_append_user_names(struct seq_file *p)
+{
+	struct nmi_handler *curr;
+
+	spin_lock(&nmi_handler_lock);
+	curr = nmi_handler_list;
+	while (curr) {
+		if (curr->dev_name)
+			p += seq_printf(p, " %s", curr->dev_name);
+		curr = curr->next;
+	}
+	spin_unlock(&nmi_handler_lock);
+}
+
 static void default_do_nmi(struct pt_regs * regs)
 {
 	unsigned char reason = inb(0x61);
 
 	if (!(reason & 0xc0)) {
-#if CONFIG_X86_LOCAL_APIC
 		/*
-		 * Ok, so this is none of the documented NMI sources,
-		 * so it must be the NMI watchdog.
+		 * Check our handler list to see if anyone can handle this
+		 * nmi.
 		 */
-		if (nmi_watchdog) {
-			nmi_watchdog_tick(regs);
+		if (call_nmi_handlers(regs))
 			return;
-		}
-#endif
+
 		unknown_nmi_error(reason, regs);
 		return;
 	}
diff -ur linux.orig/include/asm-i386/apic.h linux/include/asm-i386/apic.h
--- linux.orig/include/asm-i386/apic.h	Mon Oct 21 13:26:04 2002
+++ linux/include/asm-i386/apic.h	Tue Oct 22 12:40:16 2002
@@ -79,7 +79,6 @@
 extern void setup_boot_APIC_clock (void);
 extern void setup_secondary_APIC_clock (void);
 extern void setup_apic_nmi_watchdog (void);
-extern inline void nmi_watchdog_tick (struct pt_regs * regs);
 extern int APIC_init_uniprocessor (void);
 extern void disable_APIC_timer(void);
 extern void enable_APIC_timer(void);
diff -ur linux.orig/include/asm-i386/irq.h linux/include/asm-i386/irq.h
--- linux.orig/include/asm-i386/irq.h	Mon Oct 21 13:24:41 2002
+++ linux/include/asm-i386/irq.h	Tue Oct 22 16:13:30 2002
@@ -28,4 +28,43 @@
 #define ARCH_HAS_NMI_WATCHDOG /* See include/linux/nmi.h */
 #endif
 
+
+/**
+ * Register a handler to get called when an NMI occurs.  If the
+ * handler actually handles the NMI, it should return NOTIFY_OK.  If
+ * it did not handle the NMI, it should return NOTIFY_DONE.  It may "or"
+ * on NOTIFY_STOP_MASK to the return value if it does not want other
+ * handlers after it to be notified.  Note that this is a slow-path
+ * handler, if you have a fast-path handler there's another tie-in
+ * for you.  See the oprofile code.
+ *
+ * Note that when you release the handler, you may NOT immediately
+ * free or reuse the handler item, and you may not unload the module
+ * of any handler, because it still may be referenced in an NMI call.
+ * Instead, you should wait until the supplied "freed" callback is
+ * called.
+ */
+#define HAVE_NMI_HANDLER 1
+struct nmi_handler
+{
+	char *dev_name;
+	void *dev_id;
+	int (*handler)(void *dev_id, struct pt_regs *regs);
+	int priority; /* Handlers called in priority order. */
+
+	/* If "freed" is not NULL, this will be called when the item
+	   is no longer in use. */
+	void (*freed)(const void *arg);
+
+	/* Don't mess with anything below here. */
+
+	struct nmi_handler *next;
+
+	/* This is for linking into the list of things released in
+	   the rcu callback.  We can't use next because we can't
+	   touch it until the rcu callback runs. */
+	struct nmi_handler *link2;
+};
+
+int request_nmi(struct nmi_handler *handler);
+void release_nmi(struct nmi_handler *handler);
+
 #endif /* _ASM_IRQ_H */

* Re: [PATCH] NMI request/release, version 3
From: Dipankar Sarma @ 2002-10-23 17:33 UTC (permalink / raw)
To: Corey Minyard; +Cc: linux-kernel, John Levon

On Tue, Oct 22, 2002 at 04:36:51PM -0500, Corey Minyard wrote:
> +static struct nmi_handler *nmi_handler_list = NULL;
> +static spinlock_t nmi_handler_lock = SPIN_LOCK_UNLOCKED;
> +static struct nmi_handler *nmi_to_free_list = NULL;
> +static spinlock_t nmi_to_free_lock = SPIN_LOCK_UNLOCKED;
> +
> +struct rcu_head nmi_rcu;
> +
> +/*
> + * To free the list item, we use an rcu.  The rcu-function will not
> + * run until all processors have done a context switch, gone idle, or
> + * gone to a user process, so it's guaranteed that when this runs, any
> + * NMI handler running at release time has completed and the list item
> + * can be safely freed.
> + */
> +static void really_free_nmi_list(void *unused)
> +{
> +	unsigned long flags;
> +	struct nmi_handler *item;
> +
> +	spin_lock_irqsave(&nmi_to_free_lock, flags);
> +	while (nmi_to_free_list) {
> +		item = nmi_to_free_list;
> +		nmi_to_free_list = item->link2;
> +		item->freed(item);
> +	}
> +	spin_unlock_irqrestore(&nmi_to_free_lock, flags);
> +}
> +
> +static inline void free_nmi_handler(struct nmi_handler *item)
> +{
> +	unsigned long flags;
> +
> +	if (!item->freed)
> +		return;
> +
> +	spin_lock_irqsave(&nmi_to_free_lock, flags);
> +	/* We only have one copy of nmi_rcu, so we only want to add it
> +	   once.  If there are items in the list, then it has already
> +	   been added. */
> +	if (nmi_to_free_list == NULL)
> +		call_rcu(&nmi_rcu, really_free_nmi_list, NULL);
> +	item->link2 = nmi_to_free_list;
> +	nmi_to_free_list = item;
> +	spin_unlock_irqrestore(&nmi_to_free_lock, flags);
> +}

Hmm... I am not sure if this is correct. Your grace period starts from
the moment you do a call_rcu(). So, this example sequence might result
in a problem here -

CPU#0                  CPU#1      CPU#2             CPU#3

free_nmi_handler(X)
call_rcu
                       cswitch
cswitch
                                  cswitch
                                  nmi_handler(Y)
free_nmi_handler(Y)
[gets queued in the
 same free list as X]
                                                    cswitch
really_free_nmi_list
[nmi_handler still
 executing]

Since a context switch has happened on all the CPUs since call_rcu(),
the real update may happen and free the nmi_handler Y while it is
executing in CPU#2.

IOW, once you start one RCU grace period, you cannot add to that list,
since the adding CPU may already have participated in RCU and indicated
that it doesn't have any references. Once you hand over stuff to RCU,
you must use a new call_rcu() for that new batch.

I am wondering if we could do something like this -

static struct list_head nmi_handler_list = LIST_INIT_HEAD(nmi_handler_list);

struct nmi_handler
{
	struct list_head link;
	char *dev_name;
	void *dev_id;
	int (*handler)(void *dev_id, struct pt_regs *regs);
	int priority;
	void (*freed)(const void *arg);
	struct rcu_head rcu;
};

static void really_free_nmi_list(void *arg)
{
	struct nmi_handler *handler = arg;

	list_head_init(&handler->link);
	if (handler->freed)
		handler->freed(handler);
}

void release_nmi(struct nmi_handler *handler)
{
	if (handler == NULL)
		return;
	spin_lock(&nmi_handler_lock);
	list_del_rcu(&handler->link);
	spin_unlock(&nmi_handler_lock);
	call_rcu(&handler->rcu, really_free_nmi_handler, handler);
}

int request_nmi(struct nmi_handler *handler)
{
	struct list_head *head, *curr;
	struct nmi_handler *curr_h;

	/* Make sure the thing is not already in the list. */
	if (!list_empty(&handler->link))
		return EBUSY;

	spin_lock(&nmi_handler_lock);

	/* Add it into the list in priority order. */
	head = &nmi_handler_list;
	__list_for_each(curr, head) {
		curr_h = list_entry(curr, struct nmi_handler, link);
		if (curr_h->priority <= handler->priority)
			break;
	}

	/* list_add_rcu takes care of memory barrier */
	list_add_rcu(&handler->link, curr->link.prev);
	spin_unlock(&nmi_handler_lock);
	return 0;
}

static int call_nmi_handlers(struct pt_regs * regs)
{
	struct list_head *head, *curr;
	int handled = 0;
	int val;

	head = &nmi_handler_list;
	/* __list_for_each_rcu takes care of memory barriers */
	__list_for_each_rcu(curr, head) {
		curr_h = list_entry(curr, struct nmi_handler, link);
		val = curr_h->handler(curr_h->dev_id, regs);
		switch (val & ~NOTIFY_STOP_MASK) {
		case NOTIFY_OK:
			handled = 1;
			break;

		case NOTIFY_DONE:
		default:
		}
		if (val & NOTIFY_STOP_MASK)
			break;
	}
	return handled;
}

I probably have missed quite a few things here in these changes, but it
would be interesting to see if they could be made to work. One clear
problem - someone does a release_nmi() and then a request_nmi() on the
same handler while it is waiting for its RCU grace period to be over.
Oh well :-)

Thanks
Dipankar

* Re: [PATCH] NMI request/release, version 3
From: Corey Minyard @ 2002-10-23 18:03 UTC (permalink / raw)
To: dipankar; +Cc: linux-kernel, John Levon

Dipankar Sarma wrote:

>On Tue, Oct 22, 2002 at 04:36:51PM -0500, Corey Minyard wrote:
>
>Hmm... I am not sure if this is correct.
>
Yes, it's not correct, I will fix it. I didn't realize there was an
rcu-safe list. That should make things simpler.

>I probably have missed quite a few things here in these changes, but it
>would be interesting to see if they could be made to work. One clear
>problem - someone does a release_nmi() and then a request_nmi() on the
>same handler while it is waiting for its RCU grace period to be over.
>Oh well :-)
>
Well, the documentation said "Don't do that!". But I think you are
right, I will make release_nmi() block until the item is free.

Thanks,

-Corey

* Re: [PATCH] NMI request/release, version 3
From: Dipankar Sarma @ 2002-10-23 18:57 UTC (permalink / raw)
To: Corey Minyard; +Cc: linux-kernel, John Levon

On Wed, Oct 23, 2002 at 01:03:11PM -0500, Corey Minyard wrote:

> Yes, it's not correct, I will fix it. I didn't realize there was an
> rcu-safe list. That should make things simpler.

Yeah, RCU documentation needs some serious updating :(

One other thing: it might be better to do all the handler checks in
request_nmi and release_nmi inside the spinlock-serialized critical
section - things like the list_empty() check etc. - unlike in the code
snippet I mailed. That way you need to look at memory barrier issues in
only one piece of lock-free code: call_nmi_handlers.

Thanks
Dipankar
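
A sketch of that suggestion applied to request_nmi(): the busy check
moves under the same lock as the insert, leaving call_nmi_handlers()
as the only lock-free path to audit:

	int request_nmi(struct nmi_handler *handler)
	{
		spin_lock(&nmi_handler_lock);

		if (!list_empty(&handler->link)) {
			/* Checked under the lock, so no ordering
			   games with a concurrent register/release. */
			spin_unlock(&nmi_handler_lock);
			return EBUSY;
		}

		/* ... priority-ordered list_add_rcu(), as before ... */

		spin_unlock(&nmi_handler_lock);
		return 0;
	}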
* [PATCH] NMI request/release, version 4 2002-10-23 18:57 ` Dipankar Sarma @ 2002-10-23 20:14 ` Corey Minyard 2002-10-23 20:50 ` Dipankar Sarma 2002-10-24 7:50 ` Dipankar Sarma 0 siblings, 2 replies; 43+ messages in thread From: Corey Minyard @ 2002-10-23 20:14 UTC (permalink / raw) To: dipankar; +Cc: linux-kernel, John Levon [-- Attachment #1: Type: text/plain, Size: 539 bytes --] Ok, here's try number 4. This one fixes the race condition that Dipankar pointed out, and modifies the release_nmi() call to block until the rcu finishes. I have noticed that the rcu callback can be delayed a long time, sometimes up to 3 seconds. That seems odd. I have verified that the delay happens there. Also, does the __wait_for_event() code in release_nmi() need any memory barriers? Also, the code in traps.c is pretty generic. Perhaps it could be moved to a common place? Or is that worth it? Thanks again, -Corey [-- Attachment #2: linux-nmi-v4.diff --] [-- Type: text/plain, Size: 9106 bytes --] diff -ur linux.orig/arch/i386/kernel/i386_ksyms.c linux/arch/i386/kernel/i386_ksyms.c --- linux.orig/arch/i386/kernel/i386_ksyms.c Mon Oct 21 13:25:58 2002 +++ linux/arch/i386/kernel/i386_ksyms.c Tue Oct 22 12:19:59 2002 @@ -90,6 +90,9 @@ EXPORT_SYMBOL(cpu_khz); EXPORT_SYMBOL(apm_info); +EXPORT_SYMBOL(request_nmi); +EXPORT_SYMBOL(release_nmi); + #ifdef CONFIG_DEBUG_IOVIRT EXPORT_SYMBOL(__io_virt_debug); #endif diff -ur linux.orig/arch/i386/kernel/irq.c linux/arch/i386/kernel/irq.c --- linux.orig/arch/i386/kernel/irq.c Mon Oct 21 13:25:58 2002 +++ linux/arch/i386/kernel/irq.c Tue Oct 22 12:08:20 2002 @@ -131,6 +131,8 @@ * Generic, controller-independent functions: */ +extern void nmi_append_user_names(struct seq_file *p); + int show_interrupts(struct seq_file *p, void *v) { int i, j; @@ -166,6 +168,8 @@ for (j = 0; j < NR_CPUS; j++) if (cpu_online(j)) p += seq_printf(p, "%10u ", nmi_count(j)); + seq_printf(p, " "); + nmi_append_user_names(p); seq_putc(p, '\n'); #if CONFIG_X86_LOCAL_APIC seq_printf(p, "LOC: "); diff -ur linux.orig/arch/i386/kernel/nmi.c linux/arch/i386/kernel/nmi.c --- linux.orig/arch/i386/kernel/nmi.c Mon Oct 21 13:25:45 2002 +++ linux/arch/i386/kernel/nmi.c Wed Oct 23 13:57:55 2002 @@ -20,6 +20,7 @@ #include <linux/interrupt.h> #include <linux/mc146818rtc.h> #include <linux/kernel_stat.h> +#include <linux/notifier.h> #include <asm/smp.h> #include <asm/mtrr.h> @@ -102,6 +103,17 @@ return 0; } +static int nmi_watchdog_tick (void * dev_id, struct pt_regs * regs); + +static struct nmi_handler nmi_watchdog_handler = +{ + .link = LIST_HEAD_INIT(nmi_watchdog_handler.link), + .dev_name = "nmi_watchdog", + .dev_id = NULL, + .handler = nmi_watchdog_tick, + .priority = 255, /* We want to be relatively high priority. */ +}; + static int __init setup_nmi_watchdog(char *str) { int nmi; @@ -110,6 +122,12 @@ if (nmi >= NMI_INVALID) return 0; + + if (request_nmi(&nmi_watchdog_handler) != 0) { + /* Couldn't add a watchdog handler, give up. 
*/
+		return 0;
+	}
+
 	if (nmi == NMI_NONE)
 		nmi_watchdog = nmi;
 	/*
@@ -350,7 +368,7 @@
 	alert_counter[i] = 0;
 }
 
-void nmi_watchdog_tick (struct pt_regs * regs)
+static int nmi_watchdog_tick (void * dev_id, struct pt_regs * regs)
 {
 
 	/*
@@ -401,4 +419,6 @@
 		}
 		wrmsr(nmi_perfctr_msr, -(cpu_khz/nmi_hz*1000), -1);
 	}
+
+	return NOTIFY_OK;
 }
diff -ur linux.orig/arch/i386/kernel/traps.c linux/arch/i386/kernel/traps.c
--- linux.orig/arch/i386/kernel/traps.c	Mon Oct 21 13:25:45 2002
+++ linux/arch/i386/kernel/traps.c	Wed Oct 23 14:58:09 2002
@@ -23,6 +23,10 @@
 #include <linux/spinlock.h>
 #include <linux/interrupt.h>
 #include <linux/highmem.h>
+#include <linux/notifier.h>
+#include <linux/seq_file.h>
+#include <linux/slab.h>
+#include <linux/rcupdate.h>
 
 #ifdef CONFIG_EISA
 #include <linux/ioport.h>
@@ -485,21 +489,139 @@
 	printk("Do you have a strange power saving mode enabled?\n");
 }
 
+/*
+ * A list of handlers for NMIs.  This list will be called in order
+ * when an NMI from an otherwise unidentifiable source comes in.  If
+ * one of these handles the NMI, it should return NOTIFY_OK, otherwise
+ * it should return NOTIFY_DONE.  NMI handlers cannot claim spinlocks,
+ * so we have to handle freeing these in a different manner.  A
+ * spinlock protects the list from multiple writers.  When something
+ * is removed from the list, it is thrown into another list (with
+ * another link, so the "next" element stays valid) and scheduled to
+ * run as an rcu.  When the rcu runs, it is guaranteed that nothing in
+ * the NMI code will be using it.
+ */
+static struct list_head nmi_handler_list = LIST_HEAD_INIT(nmi_handler_list);
+static spinlock_t nmi_handler_lock = SPIN_LOCK_UNLOCKED;
+
+/*
+ * To free the list item, we use an rcu.  The rcu-function will not
+ * run until all processors have done a context switch, gone idle, or
+ * gone to a user process, so it's guaranteed that when this runs, any
+ * NMI handler running at release time has completed and the list item
+ * can be safely freed.
+ */
+static void free_nmi_handler(void *arg)
+{
+	struct nmi_handler *handler = arg;
+
+	INIT_LIST_HEAD(&(handler->link));
+	wmb();
+	wake_up(&(handler->wait));
+}
+
+int request_nmi(struct nmi_handler *handler)
+{
+	struct list_head   *curr;
+	struct nmi_handler *curr_h = NULL;
+
+	if (!list_empty(&(handler->link)))
+		return EBUSY;
+
+	spin_lock(&nmi_handler_lock);
+
+	__list_for_each(curr, &nmi_handler_list) {
+		curr_h = list_entry(curr, struct nmi_handler, link);
+		if (curr_h->priority <= handler->priority)
+			break;
+	}
+
+	if (curr_h)
+		/* list_add_rcu takes care of memory barrier */
+		list_add_rcu(&(handler->link), curr_h->link.prev);
+	else
+		list_add_rcu(&(handler->link), &nmi_handler_list);
+
+	spin_unlock(&nmi_handler_lock);
+	return 0;
+}
+
+void release_nmi(struct nmi_handler *handler)
+{
+	wait_queue_t  q_ent;
+	unsigned long flags;
+
+	spin_lock_irqsave(&nmi_handler_lock, flags);
+	list_del_rcu(&(handler->link));
+
+	/* Wait for handler to finish being freed.  This can't be
+	   interrupted, we must wait until it finishes. */
+	init_waitqueue_head(&(handler->wait));
+	init_waitqueue_entry(&q_ent, current);
+	add_wait_queue(&(handler->wait), &q_ent);
+	call_rcu(&(handler->rcu), free_nmi_handler, handler);
+	for (;;) {
+		set_current_state(TASK_UNINTERRUPTIBLE);
+		if (list_empty(&(handler->link)))
+			break;
+		spin_unlock_irqrestore(&nmi_handler_lock, flags);
+		schedule();
+		spin_lock_irqsave(&nmi_handler_lock, flags);
+	}
+	remove_wait_queue(&(handler->wait), &q_ent);
+	spin_unlock_irqrestore(&nmi_handler_lock, flags);
+}
+
+static int call_nmi_handlers(struct pt_regs * regs)
+{
+	struct list_head   *curr;
+	struct nmi_handler *curr_h;
+	int                handled = 0;
+	int                val;
+
+	__list_for_each_rcu(curr, &nmi_handler_list) {
+		curr_h = list_entry(curr, struct nmi_handler, link);
+		val = curr_h->handler(curr_h->dev_id, regs);
+		switch (val & ~NOTIFY_STOP_MASK) {
+		case NOTIFY_OK:
+			handled = 1;
+			break;
+
+		case NOTIFY_DONE:
+		default:
+		}
+		if (val & NOTIFY_STOP_MASK)
+			break;
+	}
+	return handled;
+}
+
+void nmi_append_user_names(struct seq_file *p)
+{
+	struct list_head   *curr;
+	struct nmi_handler *curr_h;
+
+	spin_lock(&nmi_handler_lock);
+	__list_for_each(curr, &nmi_handler_list) {
+		curr_h = list_entry(curr, struct nmi_handler, link);
+		if (curr_h->dev_name)
+			p += seq_printf(p, " %s", curr_h->dev_name);
+	}
+	spin_unlock(&nmi_handler_lock);
+}
+
 static void default_do_nmi(struct pt_regs * regs)
 {
 	unsigned char reason = inb(0x61);
 
 	if (!(reason & 0xc0)) {
-#if CONFIG_X86_LOCAL_APIC
 		/*
-		 * Ok, so this is none of the documented NMI sources,
-		 * so it must be the NMI watchdog.
+		 * Check the handler list to see if anyone can handle this
+		 * nmi.
 		 */
-		if (nmi_watchdog) {
-			nmi_watchdog_tick(regs);
+		if (call_nmi_handlers(regs))
 			return;
-		}
-#endif
+
 		unknown_nmi_error(reason, regs);
 		return;
 	}
diff -ur linux.orig/include/asm-i386/apic.h linux/include/asm-i386/apic.h
--- linux.orig/include/asm-i386/apic.h	Mon Oct 21 13:26:04 2002
+++ linux/include/asm-i386/apic.h	Tue Oct 22 12:40:16 2002
@@ -79,7 +79,6 @@
 extern void setup_boot_APIC_clock (void);
 extern void setup_secondary_APIC_clock (void);
 extern void setup_apic_nmi_watchdog (void);
-extern inline void nmi_watchdog_tick (struct pt_regs * regs);
 extern int APIC_init_uniprocessor (void);
 extern void disable_APIC_timer(void);
 extern void enable_APIC_timer(void);
diff -ur linux.orig/include/asm-i386/irq.h linux/include/asm-i386/irq.h
--- linux.orig/include/asm-i386/irq.h	Mon Oct 21 13:24:41 2002
+++ linux/include/asm-i386/irq.h	Wed Oct 23 14:15:59 2002
@@ -12,6 +12,7 @@
 
 #include <linux/config.h>
 #include <linux/sched.h>
+#include <linux/rcupdate.h>
 /* include comes from machine specific directory */
 #include "irq_vectors.h"
 
@@ -27,5 +28,36 @@
 #ifdef CONFIG_X86_LOCAL_APIC
 #define ARCH_HAS_NMI_WATCHDOG	/* See include/linux/nmi.h */
 #endif
+
+
+/**
+ * Register a handler to get called when an NMI occurs.  If the
+ * handler actually handles the NMI, it should return NOTIFY_OK.  If
+ * it did not handle the NMI, it should return NOTIFY_DONE.  It may "or"
+ * on NOTIFY_STOP_MASK to the return value if it does not want other
+ * handlers after it to be notified.  Note that this is a slow-path
+ * handler, if you have a fast-path handler there's another tie-in
+ * for you.  See the oprofile code.
+ */
+#define HAVE_NMI_HANDLER	1
+struct nmi_handler
+{
+	struct list_head link; /* You must init this before use. */
+
+	char             *dev_name;
+	void             *dev_id;
+	int              (*handler)(void *dev_id, struct pt_regs *regs);
+	int              priority; /* Handlers called in priority order. */
+
+	/* Don't mess with anything below here. */
+
+	struct rcu_head   rcu;
+	wait_queue_head_t wait;
+};
+
+int request_nmi(struct nmi_handler *handler);
+
+/* Release will block until the handler is completely free. */
+void release_nmi(struct nmi_handler *handler);
 #endif /* _ASM_IRQ_H */

^ permalink raw reply	[flat|nested] 43+ messages in thread
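For orientation, here is a minimal sketch of how a driver might use the interface above. This is not code from any posted patch: the handler body, the device type, the detection test, and the priority value are all hypothetical placeholders.

	static int my_nmi_tick(void *dev_id, struct pt_regs *regs)
	{
		struct my_device *dev = dev_id;	/* hypothetical device type */

		if (!my_device_asserted_nmi(dev))	/* placeholder hardware test */
			return NOTIFY_DONE;	/* not ours; let other handlers look */

		/* ... service the device; no spinlocks may be taken here ... */
		return NOTIFY_OK;		/* we handled it */
	}

	static struct nmi_handler my_nmi_handler = {
		.link     = LIST_HEAD_INIT(my_nmi_handler.link),
		.dev_name = "my_driver",
		.dev_id   = &my_device,		/* hypothetical */
		.handler  = my_nmi_tick,
		.priority = 10,			/* arbitrary illustrative value */
	};

	/* At module init (process context): */
	if (request_nmi(&my_nmi_handler))
		return -EBUSY;

	/* At module exit (process context only; release_nmi() can block): */
	release_nmi(&my_nmi_handler);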
* Re: [PATCH] NMI request/release, version 4
  2002-10-23 20:14           ` [PATCH] NMI request/release, version 4 Corey Minyard
@ 2002-10-23 20:50             ` Dipankar Sarma
  2002-10-23 21:53               ` Corey Minyard
  2002-10-24  7:50             ` Dipankar Sarma
  1 sibling, 1 reply; 43+ messages in thread
From: Dipankar Sarma @ 2002-10-23 20:50 UTC (permalink / raw)
  To: Corey Minyard; +Cc: linux-kernel, John Levon

Well, I haven't looked at the whole patch yet, but some quick
responses -

On Wed, Oct 23, 2002 at 03:14:52PM -0500, Corey Minyard wrote:
> I have noticed that the rcu callback can be delayed a long time,
> sometimes up to 3 seconds.  That seems odd.  I have verified that the
> delay happens there.

That kind of latency is really bad. Could you please check the latency
with this change in kernel/rcupdate.c -

 void rcu_check_callbacks(int cpu, int user)
 {
 	if (user ||
-	    (idle_cpu(cpu) && !in_softirq() && hardirq_count() <= 1))
+	    (idle_cpu(cpu) && !in_softirq() &&
+	     hardirq_count() <= (1 << HARDIRQ_SHIFT)))
 		RCU_qsctr(cpu)++;
 	tasklet_schedule(&RCU_tasklet(cpu));

After local_irq_count() went away, the idle CPU check was broken
and that meant that if you had an idle CPU, it could hold up RCU
grace period completion.

> +void release_nmi(struct nmi_handler *handler)
> +{
> +	wait_queue_t  q_ent;
> +	unsigned long flags;
> +
> +	spin_lock_irqsave(&nmi_handler_lock, flags);
> +	list_del_rcu(&(handler->link));
> +
> +	/* Wait for handler to finish being freed.  This can't be
> +	   interrupted, we must wait until it finishes. */
> +	init_waitqueue_head(&(handler->wait));
> +	init_waitqueue_entry(&q_ent, current);
> +	add_wait_queue(&(handler->wait), &q_ent);
> +	call_rcu(&(handler->rcu), free_nmi_handler, handler);
> +	for (;;) {
> +		set_current_state(TASK_UNINTERRUPTIBLE);
> +		if (list_empty(&(handler->link)))
> +			break;
> +		spin_unlock_irqrestore(&nmi_handler_lock, flags);
> +		schedule();
> +		spin_lock_irqsave(&nmi_handler_lock, flags);
> +	}
> +	remove_wait_queue(&(handler->wait), &q_ent);
> +	spin_unlock_irqrestore(&nmi_handler_lock, flags);
> +}

It might just be simpler to use completions instead -

	call_rcu(&(handler->rcu), free_nmi_handler, handler);
	init_completion(&handler->completion);
	spin_unlock_irqrestore(&nmi_handler_lock, flags);
	wait_for_completion(&handler->completion);

and do

	complete(&handler->completion);

in the RCU callback.

Also, now I think your original idea of "Don't do this!" :) was right.
Waiting until an nmi handler is seen unlinked could make a task
block for a long time if another task re-installs it. You should
probably just fail installation of a busy handler (checked under
lock).

Thanks
Dipankar

^ permalink raw reply	[flat|nested] 43+ messages in thread
* Re: [PATCH] NMI request/release, version 4
  2002-10-23 20:50             ` Dipankar Sarma
@ 2002-10-23 21:53               ` Corey Minyard
  2002-10-24  7:41                 ` Dipankar Sarma
  0 siblings, 1 reply; 43+ messages in thread
From: Corey Minyard @ 2002-10-23 21:53 UTC (permalink / raw)
  To: dipankar; +Cc: linux-kernel, John Levon

Dipankar Sarma wrote:

>Well, I haven't looked at the whole patch yet, but some quick
>responses -
>
>On Wed, Oct 23, 2002 at 03:14:52PM -0500, Corey Minyard wrote:
>
>>I have noticed that the rcu callback can be delayed a long time,
>>sometimes up to 3 seconds.  That seems odd.  I have verified that the
>>delay happens there.
>
>That kind of latency is really bad. Could you please check the latency
>with this change in kernel/rcupdate.c -
>
> void rcu_check_callbacks(int cpu, int user)
> {
> 	if (user ||
>-	    (idle_cpu(cpu) && !in_softirq() && hardirq_count() <= 1))
>+	    (idle_cpu(cpu) && !in_softirq() &&
>+	     hardirq_count() <= (1 << HARDIRQ_SHIFT)))
> 		RCU_qsctr(cpu)++;
> 	tasklet_schedule(&RCU_tasklet(cpu));
>
>After local_irq_count() went away, the idle CPU check was broken
>and that meant that if you had an idle CPU, it could hold up RCU
>grace period completion.

Ah, much better.  That seems to fix it.

>It might just be simpler to use completions instead -
>
>	call_rcu(&(handler->rcu), free_nmi_handler, handler);
>	init_completion(&handler->completion);
>	spin_unlock_irqrestore(&nmi_handler_lock, flags);
>	wait_for_completion(&handler->completion);
>
>and do
>
>	complete(&handler->completion);
>
>in the RCU callback.

I was working under the assumption that the spinlocks were needed.  But
now I see that there are spinlocks in wait_for_completion.  You did get
init_completion() and call_rcu() backwards; they would need to be in the
opposite order, I think.

>Also, now I think your original idea of "Don't do this!" :) was right.
>Waiting until an nmi handler is seen unlinked could make a task
>block for a long time if another task re-installs it. You should
>probably just fail installation of a busy handler (checked under
>lock).

Since just about all of these will be in modules at unload time, I'm
thinking that the way it is now is better.  Otherwise, someone will mess
it up.  IMHO, it's much more likely that someone doesn't handle the
callback correctly than someone reuses the value before the call that
releases it returns.  I'd prefer to leave it the way it is now.

-Corey

^ permalink raw reply	[flat|nested] 43+ messages in thread
* Re: [PATCH] NMI request/release, version 4
  2002-10-23 21:53               ` Corey Minyard
@ 2002-10-24  7:41                 ` Dipankar Sarma
  2002-10-24 13:08                   ` Corey Minyard
  0 siblings, 1 reply; 43+ messages in thread
From: Dipankar Sarma @ 2002-10-24  7:41 UTC (permalink / raw)
  To: Corey Minyard; +Cc: linux-kernel, John Levon

On Wed, Oct 23, 2002 at 04:53:34PM -0500, Corey Minyard wrote:
> >After local_irq_count() went away, the idle CPU check was broken
> >and that meant that if you had an idle CPU, it could hold up RCU
> >grace period completion.
> >
> Ah, much better.  That seems to fix it.

Great! Do you have any latency numbers ? Just curious.

> >It might just be simpler to use completions instead -
> >
> >	call_rcu(&(handler->rcu), free_nmi_handler, handler);
> >	init_completion(&handler->completion);
> >	spin_unlock_irqrestore(&nmi_handler_lock, flags);
> >	wait_for_completion(&handler->completion);
> >
> >and do
> >
> >	complete(&handler->completion);
> >
> >in the RCU callback.
> >
> I was working under the assumption that the spinlocks were needed.  But
> now I see that there are spinlocks in wait_for_completion.  You did get
> init_completion() and call_rcu() backwards; they would need to be in the
> opposite order, I think.

AFAICS, the ordering of call_rcu() and init_completion() should not
matter, because the CPU that is executing them would not have gone
through a quiescent state, and thus the RCU callback cannot happen.
Only after a context switch in wait_for_completion() is the callback
possible.

> >Also, now I think your original idea of "Don't do this!" :) was right.
> >Waiting until an nmi handler is seen unlinked could make a task
> >block for a long time if another task re-installs it. You should
> >probably just fail installation of a busy handler (checked under
> >lock).
> >
> Since just about all of these will be in modules at unload time, I'm
> thinking that the way it is now is better.  Otherwise, someone will mess
> it up.  IMHO, it's much more likely that someone doesn't handle the
> callback correctly than someone reuses the value before the call that
> releases it returns.  I'd prefer to leave it the way it is now.

Oh, I didn't mean the part about waiting in release_nmi() until the
release is complete (RCU callback done). That is absolutely necessary.
I was talking about looping until the handler is not busy any more.
I think it is safe to just do a wait_for_completion() and return
in release_nmi().

Thanks
Dipankar

^ permalink raw reply	[flat|nested] 43+ messages in thread
* Re: [PATCH] NMI request/release, version 4
  2002-10-24  7:41                 ` Dipankar Sarma
@ 2002-10-24 13:08                   ` Corey Minyard
  0 siblings, 0 replies; 43+ messages in thread
From: Corey Minyard @ 2002-10-24 13:08 UTC (permalink / raw)
  To: dipankar; +Cc: linux-kernel

Dipankar Sarma wrote:

>On Wed, Oct 23, 2002 at 04:53:34PM -0500, Corey Minyard wrote:
>
>>>After local_irq_count() went away, the idle CPU check was broken
>>>and that meant that if you had an idle CPU, it could hold up RCU
>>>grace period completion.
>>>
>>Ah, much better.  That seems to fix it.
>
>Great! Do you have any latency numbers ? Just curious.

Unfortunately not.  3 seconds is well within the realm of human
observation with printk.

>>>It might just be simpler to use completions instead -
>>>
>>>	call_rcu(&(handler->rcu), free_nmi_handler, handler);
>>>	init_completion(&handler->completion);
>>>	spin_unlock_irqrestore(&nmi_handler_lock, flags);
>>>	wait_for_completion(&handler->completion);
>>>
>>>and do
>>>
>>>	complete(&handler->completion);
>>>
>>>in the RCU callback.
>>>
>>I was working under the assumption that the spinlocks were needed.  But
>>now I see that there are spinlocks in wait_for_completion.  You did get
>>init_completion() and call_rcu() backwards; they would need to be in the
>>opposite order, I think.
>
>AFAICS, the ordering of call_rcu() and init_completion() should not
>matter, because the CPU that is executing them would not have gone
>through a quiescent state, and thus the RCU callback cannot happen.
>Only after a context switch in wait_for_completion() is the callback
>possible.

Yes, I think you are right.  I'm still not used to the RCUs :-).

-Corey

^ permalink raw reply	[flat|nested] 43+ messages in thread
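To make the ordering point concrete, the agreed-upon release sequence can be sketched as follows. This is a sketch only, combining Dipankar's fragments with the surrounding locking; the field name `complete` follows the later v5 patch rather than Dipankar's `completion`.

	void release_nmi(struct nmi_handler *handler)
	{
		spin_lock(&nmi_handler_lock);
		list_del_rcu(&(handler->link));
		/*
		 * init_completion() before or after call_rcu() is equally
		 * safe here: the grace period cannot end until this CPU
		 * passes through a quiescent state, and its first chance
		 * to do so is the context switch inside
		 * wait_for_completion() below.
		 */
		init_completion(&(handler->complete));
		call_rcu(&(handler->rcu), free_nmi_handler, handler);
		spin_unlock(&nmi_handler_lock);

		wait_for_completion(&(handler->complete));
	}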
* Re: [PATCH] NMI request/release, version 4
  2002-10-23 20:14           ` [PATCH] NMI request/release, version 4 Corey Minyard
  2002-10-23 20:50             ` Dipankar Sarma
@ 2002-10-24  7:50             ` Dipankar Sarma
  2002-10-24 13:05               ` Corey Minyard
  2002-10-24 13:28               ` [PATCH] NMI request/release, version 5 - I think this one's ready Corey Minyard
  1 sibling, 2 replies; 43+ messages in thread
From: Dipankar Sarma @ 2002-10-24  7:50 UTC (permalink / raw)
  To: Corey Minyard; +Cc: linux-kernel, John Levon

Ok, some more comments -

On Wed, Oct 23, 2002 at 03:14:52PM -0500, Corey Minyard wrote:
> +void release_nmi(struct nmi_handler *handler)
> +{
> +	wait_queue_t  q_ent;
> +	unsigned long flags;
> +
> +	spin_lock_irqsave(&nmi_handler_lock, flags);
> +	list_del_rcu(&(handler->link));
> +
> +	/* Wait for handler to finish being freed.  This can't be
> +	   interrupted, we must wait until it finishes. */
> +	init_waitqueue_head(&(handler->wait));
> +	init_waitqueue_entry(&q_ent, current);
> +	add_wait_queue(&(handler->wait), &q_ent);
> +	call_rcu(&(handler->rcu), free_nmi_handler, handler);
> +	for (;;) {
> +		set_current_state(TASK_UNINTERRUPTIBLE);
> +		if (list_empty(&(handler->link)))
> +			break;
> +		spin_unlock_irqrestore(&nmi_handler_lock, flags);
> +		schedule();
> +		spin_lock_irqsave(&nmi_handler_lock, flags);
> +	}
> +	remove_wait_queue(&(handler->wait), &q_ent);
> +	spin_unlock_irqrestore(&nmi_handler_lock, flags);
> +}

Can release_nmi() be done from irq context ? If not, I don't see
why spin_lock_irqsave() is required here. If it can be called
from irq context, then I can't see how you can schedule()
(or wait_for_completion() for that matter :)).

Thanks
Dipankar

^ permalink raw reply	[flat|nested] 43+ messages in thread
* Re: [PATCH] NMI request/release, version 4
  2002-10-24  7:50             ` Dipankar Sarma
@ 2002-10-24 13:05               ` Corey Minyard
  0 siblings, 0 replies; 43+ messages in thread
From: Corey Minyard @ 2002-10-24 13:05 UTC (permalink / raw)
  To: dipankar; +Cc: linux-kernel, John Levon

Dipankar Sarma wrote:

>Ok, some more comments -
>
>On Wed, Oct 23, 2002 at 03:14:52PM -0500, Corey Minyard wrote:
>
>Can release_nmi() be done from irq context ? If not, I don't see
>why spin_lock_irqsave() is required here. If it can be called
>from irq context, then I can't see how you can schedule()
>(or wait_for_completion() for that matter :)).

I originally was using nmi_handler_lock in the rcu routine (which runs
at interrupt level).  You have to turn off interrupts if someone else
can claim the lock at interrupt level.  However, I'm not any more, so
it can go away.

-Corey

^ permalink raw reply	[flat|nested] 43+ messages in thread
* [PATCH] NMI request/release, version 5 - I think this one's ready
  2002-10-24  7:50             ` Dipankar Sarma
  2002-10-24 13:05               ` Corey Minyard
@ 2002-10-24 13:28               ` Corey Minyard
  2002-10-24 14:46                 ` John Levon
  1 sibling, 1 reply; 43+ messages in thread
From: Corey Minyard @ 2002-10-24 13:28 UTC (permalink / raw)
  To: dipankar; +Cc: linux-kernel

[-- Attachment #1: Type: text/plain, Size: 143 bytes --]

Thanks to Dipankar, I think I am pretty much done with the code to do a
request/release NMI.  This patch is relative to stock 2.5.44.

-Corey

[-- Attachment #2: linux-nmi-v5.diff --]
[-- Type: text/plain, Size: 8713 bytes --]

diff -ur linux.orig/arch/i386/kernel/i386_ksyms.c linux/arch/i386/kernel/i386_ksyms.c
--- linux.orig/arch/i386/kernel/i386_ksyms.c	Mon Oct 21 13:25:58 2002
+++ linux/arch/i386/kernel/i386_ksyms.c	Tue Oct 22 12:19:59 2002
@@ -90,6 +90,9 @@
 EXPORT_SYMBOL(cpu_khz);
 EXPORT_SYMBOL(apm_info);
 
+EXPORT_SYMBOL(request_nmi);
+EXPORT_SYMBOL(release_nmi);
+
 #ifdef CONFIG_DEBUG_IOVIRT
 EXPORT_SYMBOL(__io_virt_debug);
 #endif
diff -ur linux.orig/arch/i386/kernel/irq.c linux/arch/i386/kernel/irq.c
--- linux.orig/arch/i386/kernel/irq.c	Mon Oct 21 13:25:58 2002
+++ linux/arch/i386/kernel/irq.c	Tue Oct 22 12:08:20 2002
@@ -131,6 +131,8 @@
  * Generic, controller-independent functions:
  */
 
+extern void nmi_append_user_names(struct seq_file *p);
+
 int show_interrupts(struct seq_file *p, void *v)
 {
 	int i, j;
@@ -166,6 +168,8 @@
 	for (j = 0; j < NR_CPUS; j++)
 		if (cpu_online(j))
 			p += seq_printf(p, "%10u ", nmi_count(j));
+	seq_printf(p, " ");
+	nmi_append_user_names(p);
 	seq_putc(p, '\n');
 #if CONFIG_X86_LOCAL_APIC
 	seq_printf(p, "LOC: ");
diff -ur linux.orig/arch/i386/kernel/nmi.c linux/arch/i386/kernel/nmi.c
--- linux.orig/arch/i386/kernel/nmi.c	Mon Oct 21 13:25:45 2002
+++ linux/arch/i386/kernel/nmi.c	Wed Oct 23 13:57:55 2002
@@ -20,6 +20,7 @@
 #include <linux/interrupt.h>
 #include <linux/mc146818rtc.h>
 #include <linux/kernel_stat.h>
+#include <linux/notifier.h>
 
 #include <asm/smp.h>
 #include <asm/mtrr.h>
@@ -102,6 +103,17 @@
 	return 0;
 }
 
+static int nmi_watchdog_tick (void * dev_id, struct pt_regs * regs);
+
+static struct nmi_handler nmi_watchdog_handler =
+{
+	.link     = LIST_HEAD_INIT(nmi_watchdog_handler.link),
+	.dev_name = "nmi_watchdog",
+	.dev_id   = NULL,
+	.handler  = nmi_watchdog_tick,
+	.priority = 255, /* We want to be relatively high priority. */
+};
+
 static int __init setup_nmi_watchdog(char *str)
 {
 	int nmi;
@@ -110,6 +122,12 @@
 
 	if (nmi >= NMI_INVALID)
 		return 0;
+
+	if (request_nmi(&nmi_watchdog_handler) != 0) {
+		/* Couldn't add a watchdog handler, give up. */
+		return 0;
+	}
+
 	if (nmi == NMI_NONE)
 		nmi_watchdog = nmi;
 	/*
@@ -350,7 +368,7 @@
 	alert_counter[i] = 0;
 }
 
-void nmi_watchdog_tick (struct pt_regs * regs)
+static int nmi_watchdog_tick (void * dev_id, struct pt_regs * regs)
 {
 
 	/*
@@ -401,4 +419,6 @@
 		}
 		wrmsr(nmi_perfctr_msr, -(cpu_khz/nmi_hz*1000), -1);
 	}
+
+	return NOTIFY_OK;
 }
diff -ur linux.orig/arch/i386/kernel/traps.c linux/arch/i386/kernel/traps.c
--- linux.orig/arch/i386/kernel/traps.c	Mon Oct 21 13:25:45 2002
+++ linux/arch/i386/kernel/traps.c	Thu Oct 24 08:11:14 2002
@@ -23,6 +23,10 @@
 #include <linux/spinlock.h>
 #include <linux/interrupt.h>
 #include <linux/highmem.h>
+#include <linux/notifier.h>
+#include <linux/seq_file.h>
+#include <linux/slab.h>
+#include <linux/rcupdate.h>
 
 #ifdef CONFIG_EISA
 #include <linux/ioport.h>
@@ -485,21 +489,125 @@
 	printk("Do you have a strange power saving mode enabled?\n");
 }
 
+/*
+ * A list of handlers for NMIs.  This list will be called in order
+ * when an NMI from an otherwise unidentifiable source comes in.  If
+ * one of these handles the NMI, it should return NOTIFY_OK, otherwise
+ * it should return NOTIFY_DONE.  NMI handlers cannot claim spinlocks,
+ * so we have to handle freeing these in a different manner.  A
+ * spinlock protects the list from multiple writers.  When something
+ * is removed from the list, it is thrown into another list (with
+ * another link, so the "next" element stays valid) and scheduled to
+ * run as an rcu.  When the rcu runs, it is guaranteed that nothing in
+ * the NMI code will be using it.
+ */
+static struct list_head nmi_handler_list = LIST_HEAD_INIT(nmi_handler_list);
+static spinlock_t nmi_handler_lock = SPIN_LOCK_UNLOCKED;
+
+/*
+ * To free the list item, we use an rcu.  The rcu-function will not
+ * run until all processors have done a context switch, gone idle, or
+ * gone to a user process, so it's guaranteed that when this runs, any
+ * NMI handler running at release time has completed and the list item
+ * can be safely freed.
+ */
+static void free_nmi_handler(void *arg)
+{
+	struct nmi_handler *handler = arg;
+
+	INIT_LIST_HEAD(&(handler->link));
+	complete(&(handler->complete));
+}
+
+int request_nmi(struct nmi_handler *handler)
+{
+	struct list_head   *curr;
+	struct nmi_handler *curr_h = NULL;
+
+	if (!list_empty(&(handler->link)))
+		return EBUSY;
+
+	spin_lock(&nmi_handler_lock);
+
+	__list_for_each(curr, &nmi_handler_list) {
+		curr_h = list_entry(curr, struct nmi_handler, link);
+		if (curr_h->priority <= handler->priority)
+			break;
+	}
+
+	if (curr_h)
+		/* list_add_rcu takes care of memory barrier */
+		list_add_rcu(&(handler->link), curr_h->link.prev);
+	else
+		list_add_rcu(&(handler->link), &nmi_handler_list);
+
+	spin_unlock(&nmi_handler_lock);
+	return 0;
+}
+
+void release_nmi(struct nmi_handler *handler)
+{
+	spin_lock(&nmi_handler_lock);
+	list_del_rcu(&(handler->link));
+	init_completion(&(handler->complete));
+	call_rcu(&(handler->rcu), free_nmi_handler, handler);
+	spin_unlock(&nmi_handler_lock);
+
+	/* Wait for handler to finish being freed.  This can't be
+	   interrupted, we must wait until it finishes. */
+	wait_for_completion(&(handler->complete));
+}
+
+static int call_nmi_handlers(struct pt_regs * regs)
+{
+	struct list_head   *curr;
+	struct nmi_handler *curr_h;
+	int                handled = 0;
+	int                val;
+
+	__list_for_each_rcu(curr, &nmi_handler_list) {
+		curr_h = list_entry(curr, struct nmi_handler, link);
+		val = curr_h->handler(curr_h->dev_id, regs);
+		switch (val & ~NOTIFY_STOP_MASK) {
+		case NOTIFY_OK:
+			handled = 1;
+			break;

+		case NOTIFY_DONE:
+		default:
+		}
+		if (val & NOTIFY_STOP_MASK)
+			break;
+	}
+	return handled;
+}
+
+void nmi_append_user_names(struct seq_file *p)
+{
+	struct list_head   *curr;
+	struct nmi_handler *curr_h;
+
+	spin_lock(&nmi_handler_lock);
+	__list_for_each(curr, &nmi_handler_list) {
+		curr_h = list_entry(curr, struct nmi_handler, link);
+		if (curr_h->dev_name)
+			p += seq_printf(p, " %s", curr_h->dev_name);
+	}
+	spin_unlock(&nmi_handler_lock);
+}
+
 static void default_do_nmi(struct pt_regs * regs)
 {
 	unsigned char reason = inb(0x61);
 
 	if (!(reason & 0xc0)) {
-#if CONFIG_X86_LOCAL_APIC
 		/*
-		 * Ok, so this is none of the documented NMI sources,
-		 * so it must be the NMI watchdog.
+		 * Check the handler list to see if anyone can handle this
+		 * nmi.
 		 */
-		if (nmi_watchdog) {
-			nmi_watchdog_tick(regs);
+		if (call_nmi_handlers(regs))
 			return;
-		}
-#endif
+
 		unknown_nmi_error(reason, regs);
 		return;
 	}
diff -ur linux.orig/include/asm-i386/apic.h linux/include/asm-i386/apic.h
--- linux.orig/include/asm-i386/apic.h	Mon Oct 21 13:26:04 2002
+++ linux/include/asm-i386/apic.h	Tue Oct 22 12:40:16 2002
@@ -79,7 +79,6 @@
 extern void setup_boot_APIC_clock (void);
 extern void setup_secondary_APIC_clock (void);
 extern void setup_apic_nmi_watchdog (void);
-extern inline void nmi_watchdog_tick (struct pt_regs * regs);
 extern int APIC_init_uniprocessor (void);
 extern void disable_APIC_timer(void);
 extern void enable_APIC_timer(void);
diff -ur linux.orig/include/asm-i386/irq.h linux/include/asm-i386/irq.h
--- linux.orig/include/asm-i386/irq.h	Mon Oct 21 13:24:41 2002
+++ linux/include/asm-i386/irq.h	Wed Oct 23 16:47:24 2002
@@ -12,6 +12,7 @@
 
 #include <linux/config.h>
 #include <linux/sched.h>
+#include <linux/rcupdate.h>
 /* include comes from machine specific directory */
 #include "irq_vectors.h"
 
@@ -27,5 +28,36 @@
 #ifdef CONFIG_X86_LOCAL_APIC
 #define ARCH_HAS_NMI_WATCHDOG	/* See include/linux/nmi.h */
 #endif
+
+
+/**
+ * Register a handler to get called when an NMI occurs.  If the
+ * handler actually handles the NMI, it should return NOTIFY_OK.  If
+ * it did not handle the NMI, it should return NOTIFY_DONE.  It may "or"
+ * on NOTIFY_STOP_MASK to the return value if it does not want other
+ * handlers after it to be notified.  Note that this is a slow-path
+ * handler, if you have a fast-path handler there's another tie-in
+ * for you.  See the oprofile code.
+ */
+#define HAVE_NMI_HANDLER	1
+struct nmi_handler
+{
+	struct list_head link; /* You must init this before use. */
+
+	char              *dev_name;
+	void              *dev_id;
+	int               (*handler)(void *dev_id, struct pt_regs *regs);
+	int               priority; /* Handlers called in priority order. */
+
+	/* Don't mess with anything below here. */
+
+	struct rcu_head   rcu;
+	struct completion complete;
+};
+
+int request_nmi(struct nmi_handler *handler);
+
+/* Release will block until the handler is completely free. */
+void release_nmi(struct nmi_handler *handler);
 #endif /* _ASM_IRQ_H */

^ permalink raw reply	[flat|nested] 43+ messages in thread
* Re: [PATCH] NMI request/release, version 5 - I think this one's ready
  2002-10-24 13:28               ` [PATCH] NMI request/release, version 5 - I think this one's ready Corey Minyard
@ 2002-10-24 14:46                 ` John Levon
  2002-10-24 15:36                   ` Corey Minyard
  0 siblings, 1 reply; 43+ messages in thread
From: John Levon @ 2002-10-24 14:46 UTC (permalink / raw)
  To: Corey Minyard; +Cc: dipankar, linux-kernel

On Thu, Oct 24, 2002 at 08:28:20AM -0500, Corey Minyard wrote:

> diff -ur linux.orig/arch/i386/kernel/traps.c linux/arch/i386/kernel/traps.c
> --- linux.orig/arch/i386/kernel/traps.c	Mon Oct 21 13:25:45 2002
> +++ linux/arch/i386/kernel/traps.c	Thu Oct 24 08:11:14 2002

At this point I'd quite like to see :

	mv nmi.c nmi_watchdog.c

and put all this stuff in always-compiled nmi.c. traps.c is getting
bloated.

> static void default_do_nmi(struct pt_regs * regs)
> {
> 	unsigned char reason = inb(0x61);
> 
> 	if (!(reason & 0xc0)) {
> -#if CONFIG_X86_LOCAL_APIC
> 		/*
> -		 * Ok, so this is none of the documented NMI sources,
> -		 * so it must be the NMI watchdog.
> +		 * Check the handler list to see if anyone can handle this
> +		 * nmi.
> 		 */
> -		if (nmi_watchdog) {
> -			nmi_watchdog_tick(regs);
> +		if (call_nmi_handlers(regs))

Now you're using RCU, it's a real pity that we have the inb() first -
if it wasn't for that, there would be no reason at all to have the "fast
path" setting code too (the latter code is ugly, which is one reason I
want to ditch it).

How about adding default_do_nmi as the minimal-priority handler, then
add the watchdog with higher priority above that ? Then oprofile can add
itself on top of those both and return NOTIFY_OK to indicate it should
break out of the loop. As a bonus, you lose the inb() for the watchdog
too.

> +++ linux/include/asm-i386/irq.h	Wed Oct 23 16:47:24 2002

I thought you agreed the stuff should be in asm/nmi.h ?

regards
john

-- 
"This is playing, not work, therefore it's not a waste of time."
	- Zath

^ permalink raw reply	[flat|nested] 43+ messages in thread
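John's layering idea is not implemented in any patch in this thread; a rough sketch of what the registrations might look like follows (the wrapper name, struct layout from the v5 patch, and the priority values are all invented for illustration):

	/* Hypothetical wrapper around the old reason-port decode. */
	static int default_nmi_decode(void *dev_id, struct pt_regs *regs);

	/* Lowest priority: consulted only if nothing else claimed the NMI. */
	static struct nmi_handler default_nmi = {
		.link     = LIST_HEAD_INIT(default_nmi.link),
		.dev_name = "default_do_nmi",
		.handler  = default_nmi_decode,
		.priority = 0,
	};

	/* Middle priority: the NMI watchdog sits above the default decode. */
	static struct nmi_handler watchdog_nmi = {
		.link     = LIST_HEAD_INIT(watchdog_nmi.link),
		.dev_name = "nmi_watchdog",
		.handler  = nmi_watchdog_tick,
		.priority = 128,
	};

	/*
	 * oprofile would then register with a still higher priority and
	 * return NOTIFY_OK | NOTIFY_STOP_MASK, so neither the watchdog
	 * check nor the inb(0x61) runs on its (very frequent) NMIs.
	 */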
* Re: [PATCH] NMI request/release, version 5 - I think this one's ready
  2002-10-24 14:46                 ` John Levon
@ 2002-10-24 15:36                   ` Corey Minyard
  2002-10-24 17:18                     ` John Levon
  0 siblings, 1 reply; 43+ messages in thread
From: Corey Minyard @ 2002-10-24 15:36 UTC (permalink / raw)
  To: John Levon; +Cc: dipankar, linux-kernel

John Levon wrote:

>On Thu, Oct 24, 2002 at 08:28:20AM -0500, Corey Minyard wrote:
>
>>diff -ur linux.orig/arch/i386/kernel/traps.c linux/arch/i386/kernel/traps.c
>>--- linux.orig/arch/i386/kernel/traps.c	Mon Oct 21 13:25:45 2002
>>+++ linux/arch/i386/kernel/traps.c	Thu Oct 24 08:11:14 2002
>
>At this point I'd quite like to see :
>
>	mv nmi.c nmi_watchdog.c
>
>and put all this stuff in always-compiled nmi.c. traps.c is getting
>bloated.

I agree.

>> static void default_do_nmi(struct pt_regs * regs)
>> {
>> 	unsigned char reason = inb(0x61);
>>
>> 	if (!(reason & 0xc0)) {
>>-#if CONFIG_X86_LOCAL_APIC
>> 		/*
>>-		 * Ok, so this is none of the documented NMI sources,
>>-		 * so it must be the NMI watchdog.
>>+		 * Check the handler list to see if anyone can handle this
>>+		 * nmi.
>> 		 */
>>-		if (nmi_watchdog) {
>>-			nmi_watchdog_tick(regs);
>>+		if (call_nmi_handlers(regs))
>
>Now you're using RCU, it's a real pity that we have the inb() first -
>if it wasn't for that, there would be no reason at all to have the "fast
>path" setting code too (the latter code is ugly, which is one reason I
>want to ditch it).
>
>How about adding default_do_nmi as the minimal-priority handler, then
>add the watchdog with higher priority above that ? Then oprofile can add
>itself on top of those both and return NOTIFY_OK to indicate it should
>break out of the loop. As a bonus, you lose the inb() for the watchdog
>too.

Is there any way to detect if the nmi watchdog actually caused the
timeout?  I don't understand the hardware well enough to do it without
some work, but it would be a VERY good idea to make sure the nmi
watchdog actually caused the operation.

Plus, can't you get more than one cause of an NMI?  Shouldn't you check
them all?

>>+++ linux/include/asm-i386/irq.h	Wed Oct 23 16:47:24 2002
>
>I thought you agreed the stuff should be in asm/nmi.h ?

I will (I had forgotten), and I will move nmi.h to nmi_watchdog.h.

-Corey

^ permalink raw reply	[flat|nested] 43+ messages in thread
* Re: [PATCH] NMI request/release, version 5 - I think this one's ready
  2002-10-24 15:36                   ` Corey Minyard
@ 2002-10-24 17:18                     ` John Levon
  2002-10-24 17:43                       ` Corey Minyard
  2002-10-24 20:03                       ` Corey Minyard
  0 siblings, 2 replies; 43+ messages in thread
From: John Levon @ 2002-10-24 17:18 UTC (permalink / raw)
  To: Corey Minyard; +Cc: dipankar, linux-kernel

On Thu, Oct 24, 2002 at 10:36:22AM -0500, Corey Minyard wrote:

> Is there any way to detect if the nmi watchdog actually caused the
> timeout? I don't understand the hardware well enough to do it without

You can check if the counter used overflowed :

#define CTR_OVERFLOWED(n) (!((n) & (1U<<31)))
#define CTRL_READ(l,h,msrs,c) do {rdmsr(MSR_P6_PERFCTR0, (l), (h));} while (0)

	CTR_READ(low, high, msrs, i);
	if (CTR_OVERFLOWED(low)) {
		... found

like oprofile does.

I've accidentally deleted your patch, but weren't you unconditionally
returning "break out of loop" from the watchdog ? I'm not very clear on
the difference between NOTIFY_DONE and NOTIFY_OK anyway...

> Plus, can't you get more than one cause of an NMI? Shouldn't you check
> them all?

Shouldn't the NMI stay asserted ? At least with perfctr, two counters
causes two interrupts (actually there's a bug in mainline oprofile on
that that I'll fix when Linus is back)

regards
john

-- 
"This is playing, not work, therefore it's not a waste of time."
	- Zath

^ permalink raw reply	[flat|nested] 43+ messages in thread
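Folded into the watchdog handler, the overflow check might look like the sketch below. This is an assumption-laden sketch, not code from any posted patch: it assumes a P6-family CPU with the watchdog driven by performance counter 0, and the I/O APIC route (discussed later in the thread) has no equivalent check.

	#define CTR_OVERFLOWED(n) (!((n) & (1U << 31)))

	static int nmi_watchdog_tick(void *dev_id, struct pt_regs *regs)
	{
		unsigned int low, high;

		/* Read the counter the watchdog programmed (assumed here
		   to be MSR_P6_PERFCTR0). */
		rdmsr(MSR_P6_PERFCTR0, low, high);
		if (!CTR_OVERFLOWED(low))
			return NOTIFY_DONE;	/* still counting: not our NMI */

		/* ... existing stuck-CPU detection via alert_counter ... */

		/* Re-arm the counter for the next watchdog period. */
		wrmsr(nmi_perfctr_msr, -(cpu_khz/nmi_hz*1000), -1);
		return NOTIFY_OK;
	}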
* Re: [PATCH] NMI request/release, version 5 - I think this one's ready
  2002-10-24 17:18                     ` John Levon
@ 2002-10-24 17:43                       ` Corey Minyard
  2002-10-24 18:04                         ` John Levon
  0 siblings, 1 reply; 43+ messages in thread
From: Corey Minyard @ 2002-10-24 17:43 UTC (permalink / raw)
  To: John Levon; +Cc: dipankar, linux-kernel

John Levon wrote:

>On Thu, Oct 24, 2002 at 10:36:22AM -0500, Corey Minyard wrote:
>
>>Is there any way to detect if the nmi watchdog actually caused the
>>timeout?  I don't understand the hardware well enough to do it without
>
>You can check if the counter used overflowed :
>
>#define CTR_OVERFLOWED(n) (!((n) & (1U<<31)))
>#define CTRL_READ(l,h,msrs,c) do {rdmsr(MSR_P6_PERFCTR0, (l), (h));} while (0)
>
>	CTR_READ(low, high, msrs, i);
>	if (CTR_OVERFLOWED(low)) {
>		... found
>
>like oprofile does.

Ok, thanks, I'll add that to the nmi_watchdog code.

>I've accidentally deleted your patch, but weren't you unconditionally
>returning "break out of loop" from the watchdog ? I'm not very clear on
>the difference between NOTIFY_DONE and NOTIFY_OK anyway...

The comments on these are:

#define NOTIFY_DONE		0x0000		/* Don't care */
#define NOTIFY_OK		0x0001		/* Suits me */
#define NOTIFY_STOP_MASK	0x8000		/* Don't call further */

I'm taking these to mean that NOTIFY_DONE means you didn't handle it,
NOTIFY_OK means you did handle it, and you "or" on NOTIFY_STOP_MASK if
you want it to stop.  I'm thinking that stopping is probably a bad idea,
if the NMI is really edge triggered.

>>Plus, can't you get more than one cause of an NMI?  Shouldn't you check
>>them all?
>
>Shouldn't the NMI stay asserted ? At least with perfctr, two counters
>causes two interrupts (actually there's a bug in mainline oprofile on
>that that I'll fix when Linus is back)

There's a comment in do_nmi() that says that the NMI is edge triggered.
If it is, then you have to call everything.  I'd really like a manual on
how the timer chip works, I'll see if I can hunt one down.

-Corey

^ permalink raw reply	[flat|nested] 43+ messages in thread
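Spelled out, that reading gives a handler three possible returns. The sketch below is illustrative only; the three "placeholder" tests stand in for real hardware checks and policy.

	static int example_handler(void *dev_id, struct pt_regs *regs)
	{
		if (!device_raised_nmi(dev_id))		/* placeholder test */
			return NOTIFY_DONE;		/* didn't handle it */

		service_device(dev_id);			/* placeholder */

		if (want_exclusive_handling)		/* placeholder policy */
			return NOTIFY_OK | NOTIFY_STOP_MASK; /* handled; stop walk */

		return NOTIFY_OK;	/* handled; let other handlers look too */
	}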
* Re: [PATCH] NMI request/release, version 5 - I think this one's ready
  2002-10-24 17:43                       ` Corey Minyard
@ 2002-10-24 18:04                         ` John Levon
  2002-10-24 18:32                           ` Corey Minyard
  0 siblings, 1 reply; 43+ messages in thread
From: John Levon @ 2002-10-24 18:04 UTC (permalink / raw)
  To: Corey Minyard; +Cc: dipankar, linux-kernel

On Thu, Oct 24, 2002 at 12:43:50PM -0500, Corey Minyard wrote:

> #define NOTIFY_DONE		0x0000		/* Don't care */
> #define NOTIFY_OK		0x0001		/* Suits me */
> #define NOTIFY_STOP_MASK	0x8000		/* Don't call further */
> 
> I'm taking these to mean that NOTIFY_DONE means you didn't handle it,
> NOTIFY_OK means you did handle it, and you "or" on NOTIFY_STOP_MASK if
> you want it to stop. I'm thinking that stopping is probably a bad idea,
> if the NMI is really edge triggered.

> There's a comment in do_nmi() that says that the NMI is edge triggered.

So there is. Darn. You could special case default_do_nmi, only printing
out "Unknown NMI" iff the reason inb() check fails, /and/ no previous
handlers set handled. Theoretically we have to take the cost of the
inb() every time otherwise we can lose one of the NMISC-based things in
default_do_nmi ... I can always see if this makes a noticeable practical
difference for oprofile under high interrupt load

(I also need to be able to remove the NMI watchdog handler, but that's
an oprofile-specific problem).

Perhaps things will end up being best to leave the current fast-path
thing, but we should make sure that we've explored the possibilities of
removing it first ;)

thanks
john

-- 
"This is playing, not work, therefore it's not a waste of time."
	- Zath

^ permalink raw reply	[flat|nested] 43+ messages in thread
* Re: [PATCH] NMI request/release, version 5 - I think this one's ready
  2002-10-24 18:04                         ` John Levon
@ 2002-10-24 18:32                           ` Corey Minyard
  2002-10-24 18:47                             ` John Levon
  0 siblings, 1 reply; 43+ messages in thread
From: Corey Minyard @ 2002-10-24 18:32 UTC (permalink / raw)
  To: John Levon; +Cc: dipankar, linux-kernel

John Levon wrote:

>On Thu, Oct 24, 2002 at 12:43:50PM -0500, Corey Minyard wrote:
>
>>There's a comment in do_nmi() that says that the NMI is edge triggered.
>
>So there is. Darn. You could special case default_do_nmi, only printing
>out "Unknown NMI" iff the reason inb() check fails, /and/ no previous
>handlers set handled. Theoretically we have to take the cost of the
>inb() every time otherwise we can lose one of the NMISC-based things in
>default_do_nmi ... I can always see if this makes a noticeable practical
>difference for oprofile under high interrupt load
>
>(I also need to be able to remove the NMI watchdog handler, but that's
>an oprofile-specific problem).
>
>Perhaps things will end up being best to leave the current fast-path
>thing, but we should make sure that we've explored the possibilities of
>removing it first ;)

This also means that the current code is actually wrong, since the
current code just returns at the first thing it finds.  I'll work on
redoing this all in one list.

Do you know if nmi_shutdown in oprofile/nmi_int.c can be called from
interrupt context?  Because nmi_release() can block, so I couldn't use
it as-is if it ran in interrupt context.

-Corey

^ permalink raw reply	[flat|nested] 43+ messages in thread
* Re: [PATCH] NMI request/release, version 5 - I think this one's ready
  2002-10-24 18:32                           ` Corey Minyard
@ 2002-10-24 18:47                             ` John Levon
  0 siblings, 0 replies; 43+ messages in thread
From: John Levon @ 2002-10-24 18:47 UTC (permalink / raw)
  To: linux-kernel; +Cc: cminyard

On Thu, Oct 24, 2002 at 01:32:40PM -0500, Corey Minyard wrote:

> Do you know if nmi_shutdown in oprofile/nmi_int.c can be called from

It can only be called in process context, from the event buffer
release() method (and open(), on the failure path)

regards
john

-- 
"This is playing, not work, therefore it's not a waste of time."
	- Zath

^ permalink raw reply	[flat|nested] 43+ messages in thread
* Re: [PATCH] NMI request/release, version 5 - I think this one's ready
  2002-10-24 17:18                     ` John Levon
  2002-10-24 17:43                       ` Corey Minyard
@ 2002-10-24 20:03                       ` Corey Minyard
  2002-10-24 20:29                         ` John Levon
  1 sibling, 1 reply; 43+ messages in thread
From: Corey Minyard @ 2002-10-24 20:03 UTC (permalink / raw)
  To: John Levon; +Cc: dipankar, linux-kernel

John Levon wrote:

>On Thu, Oct 24, 2002 at 10:36:22AM -0500, Corey Minyard wrote:
>
>>Is there any way to detect if the nmi watchdog actually caused the
>>timeout?  I don't understand the hardware well enough to do it without
>
>You can check if the counter used overflowed :
>
>#define CTR_OVERFLOWED(n) (!((n) & (1U<<31)))
>#define CTRL_READ(l,h,msrs,c) do {rdmsr(MSR_P6_PERFCTR0, (l), (h));} while (0)
>
>	CTR_READ(low, high, msrs, i);
>	if (CTR_OVERFLOWED(low)) {
>		... found

Any way to do this for the IO_APIC case?

-Corey

^ permalink raw reply	[flat|nested] 43+ messages in thread
* Re: [PATCH] NMI request/release, version 5 - I think this one's ready
  2002-10-24 20:03                       ` Corey Minyard
@ 2002-10-24 20:29                         ` John Levon
  2002-10-25  1:22                           ` [PATCH] NMI request/release, version 6 - "Well I thought the last one was ready" Corey Minyard
  0 siblings, 1 reply; 43+ messages in thread
From: John Levon @ 2002-10-24 20:29 UTC (permalink / raw)
  To: Corey Minyard; +Cc: dipankar, linux-kernel

On Thu, Oct 24, 2002 at 03:03:31PM -0500, Corey Minyard wrote:

> >	if (CTR_OVERFLOWED(low)) {
> >		... found
> >
> Any way to do this for the IO_APIC case?

Ugh, forgot about that. I have no idea, sorry

regards
john

-- 
"This is playing, not work, therefore it's not a waste of time."
	- Zath

^ permalink raw reply	[flat|nested] 43+ messages in thread
* [PATCH] NMI request/release, version 6 - "Well I thought the last one was ready"
  2002-10-24 20:29                         ` John Levon
@ 2002-10-25  1:22                           ` Corey Minyard
  2002-10-25  1:39                             ` John Levon
  0 siblings, 1 reply; 43+ messages in thread
From: Corey Minyard @ 2002-10-25  1:22 UTC (permalink / raw)
  To: John Levon; +Cc: dipankar, linux-kernel

Ok, I have reworked all the NMI code, moved it to its own file, and
converted all the NMI users to use the NMI request/release code.

The current code will call the NMI handlers on an NMI; interested
parties may detect that their NMI occurred and handle it.  Since the
NMI into the CPU is edge-triggered, I think that's the correct way to
handle it (although it is slower).  If someone figures out another way,
that would be very helpful.

The include/linux/nmi.h and arch/i386/kernel/nmi.c files were renamed
to nmi_watchdog.h and nmi_watchdog.c.

The biggest hole (that I know of) in these changes is that there is no
way that I can tell to detect if an nmi_watchdog has occurred if the
NMI source is the I/O APIC.  That code will assume that if no one else
has handled the NMI, it was the source, which is a bad assumption (but
the same assumption that was being made before my changes, so it's no
worse).  Most things should be using the local APIC, anyway.

It's now too big to include in an email, so it's at
http://home.attbi.com/~minyard/linux-nmi-v6.diff.

-Corey

^ permalink raw reply	[flat|nested] 43+ messages in thread
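The v6 patch itself is only available at the URL above; from the description, the dispatch loop would look roughly like the sketch below. This is not the actual v6 code: the four-argument handler signature is inferred from the nmi_std() line John quotes in his review below, and the rest is an assumption.

	static int call_nmi_handlers(struct pt_regs *regs, int cpu)
	{
		struct list_head   *curr;
		struct nmi_handler *curr_h;
		int                handled = 0;

		/*
		 * Call every handler: the NMI into the CPU is edge-
		 * triggered, so two sources firing together may deliver
		 * only one NMI, and stopping at the first claimant could
		 * lose the second source.
		 */
		__list_for_each_rcu(curr, &nmi_handler_list) {
			curr_h = list_entry(curr, struct nmi_handler, link);
			if (curr_h->handler(curr_h->dev_id, regs, cpu,
					    handled) == NOTIFY_OK)
				handled = 1;
		}
		return handled;
	}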
* Re: [PATCH] NMI request/release, version 6 - "Well I thought the last one was ready"
  2002-10-25  1:22                           ` [PATCH] NMI request/release, version 6 - "Well I thought the last one was ready" Corey Minyard
@ 2002-10-25  1:39                             ` John Levon
  2002-10-25  1:58                               ` Jeff Garzik
  2002-10-25  2:01                               ` [PATCH] NMI request/release, version 7 - minor cleanups Corey Minyard
  0 siblings, 2 replies; 43+ messages in thread
From: John Levon @ 2002-10-25  1:39 UTC (permalink / raw)
  To: Corey Minyard; +Cc: dipankar, linux-kernel

On Thu, Oct 24, 2002 at 08:22:33PM -0500, Corey Minyard wrote:

> http://home.attbi.com/~minyard/linux-nmi-v6.diff.

		case NOTIFY_DONE:
		default:
		}

This needs to be :

		case NOTIFY_DONE:
		default:;
		}

or later gccs whine.

+ * handler, if you have a fast-path handler there's another tie-in
+ * for you.  See the oprofile code.

You forgot to remove this comment.

	if (!list_empty(&(handler->link)))
		return EBUSY;

This wants to be -EBUSY (it eventually gets returned to userspace via
oprofile, and is more common anyway).

static int nmi_std (void * dev_id, struct pt_regs * regs, int cpu, int handled)

I don't think this matches the usual kernel style.

I think the rest looks OK, I'll find some time to play with the
oprofile side of this.

thanks
john

-- 
"This is playing, not work, therefore it's not a waste of time."
	- Zath

^ permalink raw reply	[flat|nested] 43+ messages in thread
* Re: [PATCH] NMI request/release, version 6 - "Well I thought the last one was ready"
  2002-10-25  1:39                             ` John Levon
@ 2002-10-25  1:58                               ` Jeff Garzik
  0 siblings, 0 replies; 43+ messages in thread
From: Jeff Garzik @ 2002-10-25  1:58 UTC (permalink / raw)
  To: John Levon; +Cc: Corey Minyard, dipankar, linux-kernel

John Levon wrote:

>On Thu, Oct 24, 2002 at 08:22:33PM -0500, Corey Minyard wrote:
>
>>http://home.attbi.com/~minyard/linux-nmi-v6.diff.
>
>		case NOTIFY_DONE:
>		default:
>		}
>
>This needs to be :
>
>		case NOTIFY_DONE:
>		default:;
>		}
>
>or later gccs whine.

or the even-more-readable:

	default:
		/* do nothing */
		break;

^ permalink raw reply	[flat|nested] 43+ messages in thread
* [PATCH] NMI request/release, version 7 - minor cleanups
  2002-10-25  1:39                             ` John Levon
  2002-10-25  1:58                               ` Jeff Garzik
@ 2002-10-25  2:01                               ` Corey Minyard
  2002-10-25 13:26                                 ` [PATCH] NMI request/release, version 8 Corey Minyard
  1 sibling, 1 reply; 43+ messages in thread
From: Corey Minyard @ 2002-10-25  2:01 UTC (permalink / raw)
  To: John Levon; +Cc: dipankar, linux-kernel

This fixes all the problems John Levon pointed out.  It's at
http://home.attbi.com/~minyard/linux-nmi-v7.diff

Thanks for everyone's help.

-Corey

^ permalink raw reply	[flat|nested] 43+ messages in thread
* [PATCH] NMI request/release, version 8
  2002-10-25  2:01                               ` [PATCH] NMI request/release, version 7 - minor cleanups Corey Minyard
@ 2002-10-25 13:26                                 ` Corey Minyard
  0 siblings, 0 replies; 43+ messages in thread
From: Corey Minyard @ 2002-10-25 13:26 UTC (permalink / raw)
  To: linux-kernel; +Cc: John Levon, dipankar

[-- Attachment #1: Type: text/plain, Size: 285 bytes --]

I realized that the piece of code at the end of the old NMI handler was
still necessary.  I have attached a patch to the previous version to
fix the problem.  The full version of the patch with this fix is on my
web page at http://home.attbi.com/~minyard/linux-nmi-v8.diff.

-Corey

[-- Attachment #2: linux-nmi-v7-v8.diff --]
[-- Type: text/plain, Size: 1701 bytes --]

--- linux.v8/arch/i386/kernel/nmi.c	Thu Oct 24 20:53:04 2002
+++ linux/arch/i386/kernel/nmi.c	Fri Oct 25 08:21:22 2002
@@ -208,30 +208,29 @@
 
 	if (!handled)
 		unknown_nmi_error(regs, cpu);
-#if 0
-	/*
-	 * It MAY be possible to only call handlers until one returns
-	 * that it handled the NMI, if, we do the following.  Assuming
-	 * that all the incoming signals causing NMIs are
-	 * level-triggered, this code will cause another NMI
-	 * immediately if the incoming signal is still asserted.  I
-	 * don't know if the assumption is correct or if it's better
-	 * to call all the handlers or do the I/O.
-	 *
-	 * I'm pretty sure that this won't work with the performance
-	 * registers NMI output, so I'm guessing that this won't work.
-	 */
 	else {
 		/*
 		 * Reassert NMI in case it became active meanwhile
-		 * as it's edge-triggered.
+		 * as it's edge-triggered.  Don't do this if the NMI
+		 * wasn't handled, to avoid an infinite NMI loop.
+		 *
+		 * This is necessary in case we have another external
+		 * NMI while processing this one.  The external NMIs
+		 * are level-generated, but into the processor NMIs
+		 * are edge-triggered, so if you have one NMI source
+		 * come in while another is already there, the level
+		 * will never go down to cause another edge, and
+		 * no more NMIs will happen.  This does NOT apply
+		 * to internally generated NMIs, though, so you
+		 * can't use the same trick to only call one handler
+		 * at a time.  Otherwise, if two internal NMIs came
+		 * in at the same time you might miss one.
		 */
 		outb(0x8f, 0x70);
 		inb(0x71);		/* dummy */
 		outb(0x0f, 0x70);
 		inb(0x71);		/* dummy */
 	}
-#endif
 }
 
 void __init init_nmi(void)

^ permalink raw reply	[flat|nested] 43+ messages in thread
* Re: [PATCH] NMI request/release
  2002-10-22  2:10 ` John Levon
  2002-10-22  2:32   ` Corey Minyard
@ 2002-10-22 12:23   ` Suparna Bhattacharya
  0 siblings, 0 replies; 43+ messages in thread
From: Suparna Bhattacharya @ 2002-10-22 12:23 UTC (permalink / raw)

On Tue, 22 Oct 2002 07:46:49 +0530, John Levon wrote:

> On Mon, Oct 21, 2002 at 08:32:47PM -0500, Corey Minyard wrote:
>
>> The attached patch implements a way to request to receive an NMI if it
>> comes from an otherwise unknown source. I needed this for handling
>> NMIs with the IPMI watchdog. This function was discussed a little a
>> while
>
> Then NMI watchdog and oprofile should be changed to use this too.

Perhaps even LKCD can make use of such a framework if it works out.

> We also need priority and/or equivalent of NOTIFY_STOP_MASK so we can
> break out of calling all the handlers. Actually, why do you continue if
> one of the handlers returns 1 anyway ?
>
>> + atomic_inc(&calling_nmi_handlers);
>
> Isn't this going to cause cacheline ping pong ?
>
>> + curr = nmi_handler_list;
>> + while (curr) {
>> + handled |= curr->handler(curr->dev_id, regs);
>
> dev_name is never used at all. What is it for ? Also, would be nice to
> do an smp_processor_id() just once and pass that in to prevent multiple
> calls to get_current().
>
> Couldn't you modify the notifier code to do the xchg()s (though that's
> not available on all CPU types ...)
>
>> +#define HAVE_NMI_HANDLER 1
>
> What uses this ?
>
>> + volatile struct nmi_handler *next;
>
> Hmm ...
>
> Is it not possible to use linux/rcupdate.h for this stuff ?
>
> regards
> john

^ permalink raw reply	[flat|nested] 43+ messages in thread
end of thread, other threads:[~2002-10-25 13:19 UTC | newest]

Thread overview: 43+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2002-10-22  1:32 [PATCH] NMI request/release Corey Minyard
2002-10-22  2:10 ` John Levon
2002-10-22  2:32   ` Corey Minyard
2002-10-22  2:53     ` John Levon
2002-10-22 13:02       ` Corey Minyard
2002-10-22 15:09         ` John Levon
2002-10-22 16:03           ` Corey Minyard
2002-10-22 17:23             ` Robert Love
2002-10-22 18:08               ` Corey Minyard
2002-10-22 18:16                 ` Robert Love
2002-10-22 20:04                   ` Dipankar Sarma
2002-10-22 17:53             ` Dipankar Sarma
2002-10-22 18:05               ` Corey Minyard
2002-10-22 18:08                 ` Dipankar Sarma
2002-10-22 18:29                   ` Corey Minyard
2002-10-22 19:08                     ` John Levon
2002-10-22 21:36                       ` [PATCH] NMI request/release, version 3 Corey Minyard
2002-10-23 17:33                         ` Dipankar Sarma
2002-10-23 18:03                           ` Corey Minyard
2002-10-23 18:57                             ` Dipankar Sarma
2002-10-23 20:14                               ` [PATCH] NMI request/release, version 4 Corey Minyard
2002-10-23 20:50                                 ` Dipankar Sarma
2002-10-23 21:53                                   ` Corey Minyard
2002-10-24  7:41                                     ` Dipankar Sarma
2002-10-24 13:08                                       ` Corey Minyard
2002-10-24  7:50                                 ` Dipankar Sarma
2002-10-24 13:05                                   ` Corey Minyard
2002-10-24 13:28                                   ` [PATCH] NMI request/release, version 5 - I think this one's ready Corey Minyard
2002-10-24 14:46                                     ` John Levon
2002-10-24 15:36                                       ` Corey Minyard
2002-10-24 17:18                                         ` John Levon
2002-10-24 17:43                                           ` Corey Minyard
2002-10-24 18:04                                             ` John Levon
2002-10-24 18:32                                               ` Corey Minyard
2002-10-24 18:47                                                 ` John Levon
2002-10-24 20:03                                           ` Corey Minyard
2002-10-24 20:29                                             ` John Levon
2002-10-25  1:22                                               ` [PATCH] NMI request/release, version 6 - "Well I thought the last one was ready" Corey Minyard
2002-10-25  1:39                                                 ` John Levon
2002-10-25  1:58                                                   ` Jeff Garzik
2002-10-25  2:01                                                   ` [PATCH] NMI request/release, version 7 - minor cleanups Corey Minyard
2002-10-25 13:26                                                     ` [PATCH] NMI request/release, version 8 Corey Minyard
2002-10-22 12:23 ` [PATCH] NMI request/release Suparna Bhattacharya
This is a public inbox; see mirroring instructions for how to clone and mirror all data and code used for this inbox, as well as URLs for NNTP newsgroup(s).