All of lore.kernel.org
 help / color / mirror / Atom feed
* [Xenomai-core] [BUG?] stalled xeno domain
@ 2006-04-08 15:12 Jan Kiszka
  2006-04-08 16:38 ` Jan Kiszka
  0 siblings, 1 reply; 12+ messages in thread
From: Jan Kiszka @ 2006-04-08 15:12 UTC (permalink / raw)
  To: Philippe Gerum; +Cc: xenomai-core


[-- Attachment #1.1: Type: text/plain, Size: 8289 bytes --]

Hi Philippe,

debugging is nice, tracing is still nicer. As you suggested, I extended
the tracer with per-domain stall flags (needs some output clean-up,
preliminary patch attached nevertheless).

And here is the result (full trace attached):

> :|    (0x51) 0x000000c8 -1113+   1.112  __ipipe_sync_stage+0x142 (ipipe_suspend_domain+0x56)
> :|    fn                -1112+   1.789  __ipipe_sync_stage+0xe (ipipe_suspend_domain+0x56)
> :|   *(0x50) 0x00000064 -1110+   1.293  __ipipe_sync_stage+0x97 (ipipe_suspend_domain+0x56)
> :    *fn                -1109+   1.398  do_IRQ+0x8 (__ipipe_sync_stage+0xcf)
> :    *fn                -1107+   2.105  __do_IRQ+0xc (do_IRQ+0x21)
> :    *fn                -1105+   1.631  handle_IRQ_event+0xd (__do_IRQ+0x9e)
> :    *fn                -1104+   2.413  timer_interrupt+0x9 (handle_IRQ_event+0x40)
> :    *fn                -1101+   3.022  mark_offset_tsc+0xe (timer_interrupt+0x31)
> :    *fn                -1098+   2.721  do_timer+0x9 (timer_interrupt+0x37)
> :|   *fn                -1096+   2.112  __ipipe_handle_irq+0xe (common_interrupt+0x18)
> :|   *fn                -1093+   1.210  __ipipe_ack_common_irq+0xc (__ipipe_handle_irq+0xc0)
> :|   *(0x50) 0x00000064 -1092+   1.218  __ipipe_ack_common_irq+0x31 (__ipipe_handle_irq+0xc0)
> :|   *fn                -1091+   1.834  mask_and_ack_8259A+0xb (__ipipe_ack_common_irq+0x5d)
> :|   *(0x50) 0x00000064 -1089+   1.345  __ipipe_ack_common_irq+0x9d (__ipipe_handle_irq+0xc0)
> :|   *fn                -1088    0.924  ipipe_suspend_domain+0xb (__ipipe_handle_irq+0x174)
> :|   *(0x51) 0x000000c8 -1087+   1.172  ipipe_suspend_domain+0x26 (__ipipe_handle_irq+0x174)
> :|   *fn                -1086+   2.030  __ipipe_sync_stage+0xe (ipipe_suspend_domain+0x56)
> :|  **(0x50) 0x000000c8 -1084+   1.330  __ipipe_sync_stage+0x97 (ipipe_suspend_domain+0x56)
> :|  **fn                -1082    0.879  xnintr_clock_handler+0x8 (__ipipe_sync_stage+0x12b)
> :|  **fn                -1082+   1.218  xnintr_irq_handler+0xb (xnintr_clock_handler+0x18)
> :|  **fn                -1080+   1.233  xnpod_announce_tick+0x9 (xnintr_irq_handler+0x2a)
> :|  **(0x50) 0x000000c8 -1079+   1.736  xnpod_announce_tick+0x20 (xnintr_irq_handler+0x2a)
> :|  **fn                -1077+   2.984  xntimer_do_tick_aperiodic+0xe (xnpod_announce_tick+0x36)
> :|  **fn                -1074+   2.751  xnthread_timeout_handler+0x8 (xntimer_do_tick_aperiodic+0x18d)
> :|  **fn                -1072+   1.864  xnpod_resume_thread+0xe (xnthread_timeout_handler+0x1d)
> :|  **(0x50) 0x000000c8 -1070+   4.699  xnpod_resume_thread+0x2b (xnthread_timeout_handler+0x1d)
> :|  **(0x50) 0x000000c8 -1065+   1.586  xnpod_resume_thread+0x2bf (xnthread_timeout_handler+0x1d)
> :|  **(0x01) 0x00001237 -1063+   2.661  xntimer_do_tick_aperiodic+0x309 (xnpod_announce_tick+0x36)
> :|  **(0x50) 0x000000c8 -1061+   1.263  xnpod_announce_tick+0x4f (xnintr_irq_handler+0x2a)
> :|  **fn                -1060+   1.255  rthal_irq_end+0x8 (xnintr_irq_handler+0x46)
> :|  **fn                -1058+   2.007  enable_8259A_irq+0x9 (rthal_irq_end+0x2e)
> :|  **fn                -1056+   1.924  xnpod_schedule+0xe (xnintr_irq_handler+0x6e)
> :|  **(0x50) 0x000000c8 -1054!  15.368  xnpod_schedule+0x53 (xnintr_irq_handler+0x6e)
> :|  **fn                -1039!  13.300  __switch_to+0xc (xnpod_schedule+0x689)
> :|  **(0x50) 0x000000c8 -1026+   1.766  xnpod_schedule+0x951 (xnpod_suspend_thread+0x27c)
> :|  **(0x50) 0x000000c8 -1024!  18.195  xnpod_suspend_thread+0x2bb (rt_task_sleep+0x54)
> :   **fn                -1006+   1.624  __ipipe_syscall_root+0x9 (system_call+0x20)
> :   **fn                -1004+   1.406  __ipipe_dispatch_event+0xe (__ipipe_syscall_root+0x58)
> :   **fn                -1003+   4.323  hisyscall_event+0xe (__ipipe_dispatch_event+0x5e)
> :   **fn                 -998+   1.398  __rt_task_sleep+0xa (hisyscall_event+0x23c)
> :   **fn                 -997+   1.789  __copy_from_user_ll+0xa (__rt_task_sleep+0x1a)
> :   **fn                 -995!  15.345  rt_task_sleep+0xa (__rt_task_sleep+0x25)
> :   **fn                 -980    0.879  __ipipe_syscall_root+0x9 (system_call+0x20)
> :   **fn                 -979+   1.308  __ipipe_dispatch_event+0xe (__ipipe_syscall_root+0x58)
> :   **fn                 -978+   2.451  hisyscall_event+0xe (__ipipe_dispatch_event+0x5e)
> :   **fn                 -975+   1.631  sys_rtdm_ioctl+0x8 (hisyscall_event+0x23c)
> :   **fn                 -974+   1.255  _rtdm_ioctl+0xc (sys_rtdm_ioctl+0x1b)
> :   **fn                 -972+   4.180  rtdm_context_get+0xa (_rtdm_ioctl+0x20)
> :   **fn                 -968+   1.203  __ipipe_syscall_root+0x9 (system_call+0x20)
> :   **fn                 -967+   1.345  __ipipe_dispatch_event+0xe (__ipipe_syscall_root+0x58)

The '*' mark a stalled domain, root domain on the right. It seems to me
that xnpod_suspend_thread() leaves the xeno domain stalled on wake up.
This gets recovered somehow later.

But here we see a different effect which finally causes the tick to get
frozen:

> :     fn                 -504+   2.075  sched_clock+0xd (schedule+0x112)
> :     fn                 -502    0.992  __ipipe_stall_root+0x8 (schedule+0x18e)
> :    *(0x50) 0x00000064  -501+   4.022  __ipipe_stall_root+0x32 (schedule+0x18e)
> :    *fn                 -497+   1.977  __ipipe_dispatch_event+0xe (schedule+0x412)
> :    *fn                 -495+   4.210  schedule_event+0xd (__ipipe_dispatch_event+0x5e)
> :   **(0x50) 0x000000c8  -491+   1.428  schedule_event+0x261 (__ipipe_dispatch_event+0x5e)
> :|  **fn                 -490+   2.067  xnpod_schedule_runnable+0xe (schedule_event+0x28b)
> :|  **fn                 -488    0.954  ipipe_unstall_pipeline_from+0xc (schedule_event+0x2c1)
> :|   *(0x51) 0x000000c8  -487+   4.646  ipipe_unstall_pipeline_from+0x25 (schedule_event+0x2c1)
> :|   *fn                 -482+   7.248  __switch_to+0xc (xnpod_schedule+0x689)
> :|  **(0x50) 0x000000c8  -475+   1.421  xnpod_schedule+0x77f (xnpod_suspend_thread+0x27c)
> :|  **(0x50) 0x000000c8  -473+   1.481  xnpod_schedule+0x951 (xnpod_suspend_thread+0x27c)
> :|  **(0x50) 0x000000c8  -472+   1.654  xnpod_suspend_thread+0x2bb (xnshadow_relax+0x136)
> :|  **(0x50) 0x000000c8  -470+   2.917  xnshadow_relax+0x154 (hisyscall_event+0x2ec)
> :|  **fn                 -467+   1.375  ipipe_reenter_root+0xb (xnshadow_relax+0x204)
> :|  **fn                 -466    0.954  __ipipe_unstall_root+0x8 (ipipe_reenter_root+0x26)
> :|  * (0x51) 0x00000064  -465+   3.789  __ipipe_unstall_root+0x25 (ipipe_reenter_root+0x26)
> :   * fn                 -461+   1.578  send_sig+0x8 (xnshadow_relax+0x228)
> :   * fn                 -460+   1.496  send_sig_info+0xb (send_sig+0x1d)
> :   * fn                 -458    0.909  __ipipe_test_and_stall_root+0x9 (send_sig_info+0x38)
> :   **(0x50) 0x00000064  -457+   1.699  __ipipe_test_and_stall_root+0x27 (send_sig_info+0x38)
> :   **fn                 -455+   1.203  specific_send_sig_info+0xb (send_sig_info+0x69)
> :   **fn                 -454+   1.706  __ipipe_test_root+0x8 (specific_send_sig_info+0x16)
> :   **fn                 -453+   3.360  sig_ignored+0x9 (specific_send_sig_info+0x29)
> :   **fn                 -449+   1.714  send_signal+0xb (specific_send_sig_info+0x55)
> :   **fn                 -447+   3.142  __sigqueue_alloc+0x9 (send_signal+0x3f)
> :   **fn                 -444+   1.210  kmem_cache_alloc+0xa (__sigqueue_alloc+0x42)
> :   **fn                 -443    0.909  __ipipe_test_and_stall_root+0x9 (kmem_cache_alloc+0x12)
> :   **(0x50) 0x00000064  -442+   2.180  __ipipe_test_and_stall_root+0x27 (kmem_cache_alloc+0x12)
> :   **fn                 -440+   1.218  __ipipe_restore_root+0x8 (kmem_cache_alloc+0x52)
> :   **fn                 -439+   1.315  __ipipe_stall_root+0x8 (__ipipe_restore_root+0x11)

No more recovery from the xeno stall, no more timer ticks.

The special traces were generated via

#define ipipe_mark_domain_stall(ipd,cpuid)     \
	ipipe_trace_special(0x50, ipd->priority)
#define ipipe_mark_domain_unstall(ipd,cpuid)   \
	ipipe_trace_special(0x51, ipd->priority)

Any ideas? I do not find anything fishy yet that we may have introduced
to cause this problem.

Jan

[-- Attachment #1.2: ipipe-dom-state.patch --]
[-- Type: text/plain, Size: 2592 bytes --]

Index: linux-2.6.15.5/kernel/ipipe/core.c
===================================================================
--- linux-2.6.15.5.orig/kernel/ipipe/core.c	2006-04-08 13:36:34.000000000 +0200
+++ linux-2.6.15.5/kernel/ipipe/core.c	2006-04-08 16:39:30.000000000 +0200
@@ -40,7 +40,7 @@
 
 ipipe_spinlock_t __ipipe_pipelock = IPIPE_SPIN_LOCK_UNLOCKED;
 
-struct list_head __ipipe_pipeline;
+LIST_HEAD(__ipipe_pipeline);
 
 unsigned long __ipipe_virtual_irq_map = 0;
 
@@ -66,8 +66,6 @@
 	 * secondary CPUs are still lost in space.
 	 */
 
-	INIT_LIST_HEAD(&__ipipe_pipeline);
-
 	ipd->name = "Linux";
 	ipd->domid = IPIPE_ROOT_ID;
 	ipd->priority = IPIPE_ROOT_PRIO;
Index: linux-2.6.15.5/kernel/ipipe/tracer.c
===================================================================
--- linux-2.6.15.5.orig/kernel/ipipe/tracer.c	2006-04-08 13:37:21.000000000 +0200
+++ linux-2.6.15.5/kernel/ipipe/tracer.c	2006-04-08 16:52:07.000000000 +0200
@@ -51,6 +51,7 @@
 
 #define IPIPE_TFLG_HWIRQ_OFF        0x0100
 #define IPIPE_TFLG_FREEZING         0x0200
+#define IPIPE_TFLG_DOMSTATE_SHIFT   12   /* bits 12..15: domain stalled? */
 
 
 struct ipipe_trace_point{
@@ -118,6 +119,24 @@
 static void __ipipe_print_symname(struct seq_file *m, unsigned long eip);
 
 
+static notrace void
+__ipipe_store_domain_states(struct ipipe_trace_point *point, int cpu_id)
+{
+	int i = IPIPE_TFLG_DOMSTATE_SHIFT;
+	struct list_head *pos;
+
+	list_for_each_prev(pos, &__ipipe_pipeline) {
+		struct ipipe_domain *ipd =
+			list_entry(pos, struct ipipe_domain, p_link);
+
+		if (test_bit(IPIPE_STALL_FLAG, &ipd->cpudata[cpu_id].status))
+			point->flags |= (1 << i);
+
+		if (++i > IPIPE_TFLG_DOMSTATE_SHIFT+3)
+			break;
+	}
+}
+
 static notrace int __ipipe_get_free_trace_path(int old, int cpu_id)
 {
 	int new_active = old;
@@ -282,6 +301,8 @@
 	point->v = v;
 	ipipe_read_tsc(point->timestamp);
 
+	__ipipe_store_domain_states(point, cpu_id);
+
 	/* forward to next point buffer */
 	next_pos = WRAP_POINT_NO(pos+1);
 	tp->trace_pos = next_pos;
@@ -617,6 +638,7 @@
 {
 	char mark = ' ';
 	int point_no = point - print_path->point;
+	int i;
 
 	if (print_path->end == point_no)
 		mark = '<';
@@ -626,6 +648,12 @@
 		mark = ':';
 	seq_printf(m, "%c%c", mark,
 	           (point->flags & IPIPE_TFLG_HWIRQ_OFF) ? '|' : ' ');
+
+	if (verbose_trace)
+		for (i = IPIPE_TFLG_DOMSTATE_SHIFT + 3;
+		     i >= IPIPE_TFLG_DOMSTATE_SHIFT; i--)
+			seq_printf(m, "%c",
+			           (point->flags & (1 << i)) ? '*' : ' ');
 }
 
 static void

[-- Attachment #1.3: frozen-timer3.bz2 --]
[-- Type: application/octet-stream, Size: 9808 bytes --]

[-- Attachment #2: OpenPGP digital signature --]
[-- Type: application/pgp-signature, Size: 250 bytes --]

^ permalink raw reply	[flat|nested] 12+ messages in thread

end of thread, other threads:[~2006-04-17 16:21 UTC | newest]

Thread overview: 12+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2006-04-08 15:12 [Xenomai-core] [BUG?] stalled xeno domain Jan Kiszka
2006-04-08 16:38 ` Jan Kiszka
2006-04-09 10:52   ` Philippe Gerum
2006-04-10 11:37   ` Jan Kiszka
2006-04-10 12:39     ` Philippe Gerum
2006-04-10 13:00       ` Jan Kiszka
2006-04-10 13:28         ` Philippe Gerum
2006-04-10 17:21           ` Jan Kiszka
2006-04-10 17:47             ` Philippe Gerum
2006-04-16 17:09             ` Philippe Gerum
2006-04-17 15:58               ` Jan Kiszka
2006-04-17 16:21                 ` Philippe Gerum

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.