* [PATCH] make lost-tick detection more informative
@ 2003-01-30 18:56 Dave Hansen
2003-01-30 21:14 ` Andrew Morton
0 siblings, 1 reply; 4+ messages in thread
From: Dave Hansen @ 2003-01-30 18:56 UTC (permalink / raw)
To: john stultz; +Cc: Linux Kernel Mailing List, Andrew Morton
[-- Attachment #1: Type: text/plain, Size: 804 bytes --]
The new lost-tick detection code is pretty cool, but it can be
relatively uninformative when something using the
common_interrupt->do_IRQ path keeps interrupts disabled for a long time.
This patch will record the last interrupt on the current CPU, and spit
it out along with the stack trace.
There are no guarantees here. If preemption is enabled, all bets are
off, and this data could be incorrect. It would be relatively silly to
disable preemption, just to get some debugging data to be more accurate.
Warning! Detected 4094446 micro-second gap between interrupts.
Compensating for 4093 lost ticks.
Call Trace:
[<c010ab60>] handle_IRQ_event+0x28/0x50
[<c010ad44>] do_IRQ+0xa0/0x10c
[<c01097b3>] common_interrupt+0x43/0x58
Last run common irq: 24: eth2
--
Dave Hansen
haveblue@us.ibm.com
[-- Attachment #2: lost-tick-culprit-2.5.59-mjb2-1.patch --]
[-- Type: text/plain, Size: 1998 bytes --]
diff -ru linux-2.5.59-mjb2-clean/arch/i386/kernel/irq.c linux-2.5.59-mjb2-8way/arch/i386/kernel/irq.c
--- linux-2.5.59-mjb2-clean/arch/i386/kernel/irq.c Wed Jan 29 19:02:39 2003
+++ linux-2.5.59-mjb2-8way/arch/i386/kernel/irq.c Wed Jan 29 22:54:22 2003
@@ -68,6 +68,9 @@
irq_desc_t irq_desc[NR_IRQS] __cacheline_aligned =
{ [0 ... NR_IRQS-1] = { 0, &no_irq_type, NULL, 0, SPIN_LOCK_UNLOCKED}};
+/* used to determine the culprits who disabled interrupts for a long time */
+int last_do_IRQ_interrupt[NR_CPUS] = { [0 ... NR_CPUS-1] = -1 };
+
static void register_irq_proc (unsigned int irq);
/*
@@ -330,6 +333,9 @@
struct irqaction * action;
unsigned int status;
+ if(irq) /* don't count the timer */
+ last_do_IRQ_interrupt[cpu] = irq;
+
irq_enter();
#ifdef CONFIG_DEBUG_STACKOVERFLOW
diff -ru linux-2.5.59-mjb2-clean/arch/i386/kernel/time.c linux-2.5.59-mjb2-8way/arch/i386/kernel/time.c
--- linux-2.5.59-mjb2-clean/arch/i386/kernel/time.c Wed Jan 29 19:02:39 2003
+++ linux-2.5.59-mjb2-8way/arch/i386/kernel/time.c Thu Jan 30 10:51:55 2003
@@ -267,7 +267,7 @@
#endif
}
-
+extern int last_do_IRQ_interrupt[];
/* Lost tick detection and compensation */
static inline void detect_lost_tick(void)
{
@@ -280,12 +280,21 @@
/* only print debug info first 5 times */
if(dbg_print < 5){
+ int last_irq = last_do_IRQ_interrupt[smp_processor_id()];
+ struct irqaction * action;
printk(KERN_WARNING "\nWarning! Detected %lu micro-second"
" gap between interrupts.\n",delta);
printk(KERN_WARNING " Compensating for %lu lost ticks.\n",
delta/(1000000/HZ)-1);
/* dump trace info */
show_trace(NULL);
+ if(last_irq >= 0 && (action = irq_desc[last_irq].action)) {
+ printk(KERN_WARNING " Last run common irq: %d: %s",
+ last_irq, action->name);
+ for (action=action->next; action; action = action->next)
+ printk(", %s", action->name);
+ printk("\n");
+ }
dbg_print++;
}
/* calculate number of missed ticks */
^ permalink raw reply [flat|nested] 4+ messages in thread
* Re: [PATCH] make lost-tick detection more informative
2003-01-30 18:56 Dave Hansen
@ 2003-01-30 21:14 ` Andrew Morton
0 siblings, 0 replies; 4+ messages in thread
From: Andrew Morton @ 2003-01-30 21:14 UTC (permalink / raw)
To: Dave Hansen; +Cc: john stultz, Linux Kernel Mailing List
Dave Hansen wrote:
>
> The new lost-tick detection code is pretty cool,
I've actually disabled it in -mmnext, because I get too
much mail already.
If you want to slap this on top of that and some new
kernel-hacking debug config option, that would be neat.
arch/i386/kernel/time.c | 36 ++++++++++++++++++++++++++++++++++++
1 files changed, 36 insertions(+)
diff -puN arch/i386/kernel/time.c~lost-tick arch/i386/kernel/time.c
--- 25/arch/i386/kernel/time.c~lost-tick Wed Jan 29 14:12:23 2003
+++ 25-akpm/arch/i386/kernel/time.c Wed Jan 29 14:39:52 2003
@@ -266,6 +266,41 @@ static inline void do_timer_interrupt(in
}
/*
+ * Lost tick detection and compensation
+ */
+static inline void detect_lost_tick(void)
+{
+ /* read time since last interrupt */
+ unsigned long delta = timer->get_offset();
+ static unsigned long dbg_print;
+
+ /* check if delta is greater then two ticks */
+ if(delta >= 2*(1000000/HZ)){
+
+ /*
+ * only print debug info first 5 times
+ */
+ /*
+ * AKPM: disable this for now; it's nice, but irritating.
+ */
+ if (0 && dbg_print < 5) {
+ printk(KERN_WARNING "\nWarning! Detected %lu "
+ "micro-second gap between interrupts.\n",
+ delta);
+ printk(KERN_WARNING " Compensating for %lu lost "
+ "ticks.\n",
+ delta/(1000000/HZ)-1);
+ dump_stack();
+ dbg_print++;
+ }
+ /* calculate number of missed ticks */
+ delta = delta/(1000000/HZ)-1;
+ jiffies += delta;
+ }
+
+}
+
+/*
* This is the same as the above, except we _also_ save the current
* Time Stamp Counter value at the time of the timer interrupt, so that
* we later on can estimate the time of day more exactly.
@@ -281,6 +316,7 @@ void timer_interrupt(int irq, void *dev_
*/
write_lock(&xtime_lock);
+ detect_lost_tick();
timer->mark_offset();
do_timer_interrupt(irq, NULL, regs);
_
^ permalink raw reply [flat|nested] 4+ messages in thread
* Re: [PATCH] make lost-tick detection more informative
[not found] ` <3E3995A9.E44AD77A@digeo.com.suse.lists.linux.kernel>
@ 2003-01-30 21:22 ` Andi Kleen
0 siblings, 0 replies; 4+ messages in thread
From: Andi Kleen @ 2003-01-30 21:22 UTC (permalink / raw)
To: Andrew Morton; +Cc: haveblue, linux-kernel
Andrew Morton <akpm@digeo.com> writes:
> I've actually disabled it in -mmnext, because I get too
> much mail already.
>
> If you want to slap this on top of that and some new
> kernel-hacking debug config option, that would be neat.
On x86-64/2.4 it is a kernel option "report_lost_ticks"
The overhead is low (single if check in the timer handler), so this is fine.
If you suspect something goes wrong due to bad latency you can just tell
the user to turn it on.
-Andi
^ permalink raw reply [flat|nested] 4+ messages in thread
* [PATCH] make lost-tick detection more informative
@ 2003-02-04 21:36 Dave Hansen
0 siblings, 0 replies; 4+ messages in thread
From: Dave Hansen @ 2003-02-04 21:36 UTC (permalink / raw)
To: Linus Torvalds; +Cc: john stultz, Linux Kernel Mailing List
[-- Attachment #1: Type: text/plain, Size: 1152 bytes --]
Linus,
Andrew Morton mentioned this when he sent you the lost tick detection
patch.
The new lost-tick detection code can be relatively uninformative when
something using the common_interrupt->do_IRQ path keeps interrupts
disabled for a long time. This patch will record the last interrupt on
the current CPU, and spit it out along with the stack trace. I ignore
the timer interrupt, because it is always running when this detection
occurs, and would overwrite the previous interrupt that took a long time.
You can now turn this on with a boot-time option with the same name as
x86-64: "report_lost_ticks". I don't like the default of 5, so I let it
take a plain report_lost_ticks, or report_lost_ticks=100, if you want.
Warning! Detected 4094446 micro-second gap between interrupts.
Compensating for 4093 lost ticks.
Call Trace:
[<c010ab60>] handle_IRQ_event+0x28/0x50
[<c010ad44>] do_IRQ+0xa0/0x10c
[<c01097b3>] common_interrupt+0x43/0x58
Last run common irq: 24: eth2
irq.c | 6 ++++++
time.c | 41 ++++++++++++++++++++++++++++++++---------
2 files changed, 38 insertions(+), 9 deletions(-)
--
Dave Hansen
haveblue@us.ibm.com
[-- Attachment #2: lost-tick-culprit-2.5.59-bk.patch --]
[-- Type: text/plain, Size: 3241 bytes --]
# This is a BitKeeper generated patch for the following project:
# Project Name: Linux kernel tree
# This patch format is intended for GNU patch command version 2.5 or higher.
# This patch includes the following deltas:
# ChangeSet 1.981 -> 1.983
# arch/i386/kernel/irq.c 1.25 -> 1.26
# arch/i386/kernel/time.c 1.25 -> 1.27
#
# The following is the BitKeeper ChangeSet Log
# --------------------------------------------
# 03/02/04 haveblue@elm3b96.(none) 1.982
# report better lock-tick information
# --------------------------------------------
# 03/02/04 haveblue@elm3b96.(none) 1.983
# irq.c:
# add better last irq description
# time.c:
# add missing variable
# --------------------------------------------
#
diff -Nru a/arch/i386/kernel/irq.c b/arch/i386/kernel/irq.c
--- a/arch/i386/kernel/irq.c Tue Feb 4 13:30:21 2003
+++ b/arch/i386/kernel/irq.c Tue Feb 4 13:30:21 2003
@@ -68,6 +68,9 @@
irq_desc_t irq_desc[NR_IRQS] __cacheline_aligned =
{ [0 ... NR_IRQS-1] = { 0, &no_irq_type, NULL, 0, SPIN_LOCK_UNLOCKED}};
+/* used to determine the culprits who disabled interrupts for a long time */
+int last_do_IRQ_interrupt[NR_CPUS] = { [0 ... NR_CPUS-1] = -1 };
+
static void register_irq_proc (unsigned int irq);
/*
@@ -328,6 +331,9 @@
irq_desc_t *desc = irq_desc + irq;
struct irqaction * action;
unsigned int status;
+
+ if(irq) /* don't count the timer */
+ last_do_IRQ_interrupt[cpu] = irq;
irq_enter();
diff -Nru a/arch/i386/kernel/time.c b/arch/i386/kernel/time.c
--- a/arch/i386/kernel/time.c Tue Feb 4 13:30:21 2003
+++ b/arch/i386/kernel/time.c Tue Feb 4 13:30:21 2003
@@ -265,25 +265,39 @@
#endif
}
+static int report_lost_ticks;
+int __init report_lost_ticks_setup(char *str)
+{
+ char* numstr;
+
+ if (str[0] == '=') {
+ numstr = &str[1];
+ return get_option(&numstr,&report_lost_ticks) ? 1 : 0;
+ } else if (str[0] == '\0') {
+ report_lost_ticks = 5;
+ return 1;
+ }
+
+ return 0;
+}
+
+__setup("report_lost_ticks", report_lost_ticks_setup);
+
/*
* Lost tick detection and compensation
*/
+extern int last_do_IRQ_interrupt[];
static inline void detect_lost_tick(void)
{
/* read time since last interrupt */
unsigned long delta = timer->get_offset();
- static unsigned long dbg_print;
/* check if delta is greater then two ticks */
if(delta >= 2*(1000000/HZ)){
- /*
- * only print debug info first 5 times
- */
- /*
- * AKPM: disable this for now; it's nice, but irritating.
- */
- if (0 && dbg_print < 5) {
+ if(report_lost_ticks > 0){
+ int last_irq = last_do_IRQ_interrupt[smp_processor_id()];
+ struct irqaction * action;
printk(KERN_WARNING "\nWarning! Detected %lu "
"micro-second gap between interrupts.\n",
delta);
@@ -291,7 +305,16 @@
"ticks.\n",
delta/(1000000/HZ)-1);
dump_stack();
- dbg_print++;
+
+ if(last_irq >= 0 && (action = irq_desc[last_irq].action)) {
+ printk(KERN_WARNING " Last run common irq: %d: %s",
+ last_irq, action->name);
+ for (action=action->next; action; action = action->next)
+ printk(", %s", action->name);
+ printk("\n");
+ }
+
+ report_lost_ticks--;
}
/* calculate number of missed ticks */
delta = delta/(1000000/HZ)-1;
^ permalink raw reply [flat|nested] 4+ messages in thread
end of thread, other threads:[~2003-02-04 21:27 UTC | newest]
Thread overview: 4+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2003-02-04 21:36 [PATCH] make lost-tick detection more informative Dave Hansen
[not found] <3E39756F.6080400@us.ibm.com.suse.lists.linux.kernel>
[not found] ` <3E3995A9.E44AD77A@digeo.com.suse.lists.linux.kernel>
2003-01-30 21:22 ` Andi Kleen
-- strict thread matches above, loose matches on Subject: below --
2003-01-30 18:56 Dave Hansen
2003-01-30 21:14 ` Andrew Morton
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox