All of lore.kernel.org
 help / color / mirror / Atom feed
* [Adeos-main] [PATCH] Detect leaking stalls of topmost domain
@ 2008-08-27 12:59 Jan Kiszka
  0 siblings, 0 replies; only message in thread
From: Jan Kiszka @ 2008-08-27 12:59 UTC (permalink / raw)
  To: adeos-main

I just managed to create this infamous bug pattern once again:

function()
{
	stall_topmost_domain();
	...
	if (condition)
		return;
	...
	unstall_topmost_domain();
}

The result is often a locked-up system; specifically, the root domain no
longer receives IRQs. Unless you find the bug quickly by code
inspection, debugging/instrumenting can take quite some time.

To catch such issues earlier, I therefore propose the following
extension of ipipe_check_context. It is based on the assumption that the
topmost domain should never be stalled when lower domains execute that
check. Checking only the topmost domain specifically avoids breaking
Xenomai's IRQ shield (a mid-prio domain that intentionally blocks Linux
IRQs).

This is how this patch sees my bug:

I-pipe: Detected stalled topmost domain, probably caused by a bug.
        A critical section may have been left unterminated.
Pid: 4483, comm: cyclictest Tainted: G        W 2.6.26.2-xeno_64 #55

Call Trace:
 [<ffffffff8026b61b>] ipipe_check_context+0x11e/0x128
 [<ffffffff80474849>] down_write+0x1d/0x2e
 [<ffffffff802c9686>] ipipe_disable_ondemand_mappings+0x41/0x3b3
 [<ffffffff8021ed3c>] ? mcount+0x4c/0x72
 [<ffffffff80283b93>] xnshadow_map+0x65/0x2a6
 [<ffffffff8021ed3c>] ? mcount+0x4c/0x72
 [<ffffffff802b5bc5>] __pthread_setschedparam+0xd3/0x3b9
 [<ffffffff80283840>] losyscall_event+0x11f/0x1ee
 [<ffffffff80283721>] ? losyscall_event+0x0/0x1ee
 [<ffffffff8026c6ad>] __ipipe_dispatch_event+0x127/0x255
 [<ffffffff8021e922>] __ipipe_syscall_root+0xa2/0x194
 [<ffffffff8047555a>] __ipipe_syscall_root_thunk+0x35/0x6a
 [<ffffffff8020c034>] ? system_call_after_swapgs+0x54/0x94

I-pipe tracer log (100 points):
 |  *+func                    0 ipipe_trace_panic_freeze+0xe (ipipe_check_context+0xab)
 |  *+func                    0 find_next_bit+0x9 (__next_cpu+0x1e)
 |  *+func                    0 __next_cpu+0x9 (ipipe_check_context+0x9f)
 |  *+func                   -1 find_first_bit+0x9 (__first_cpu+0x13)
 |  *+func                   -1 __first_cpu+0x9 (ipipe_check_context+0x79)
 |  *+func                   -1 ipipe_check_context+0xc (down_write+0x1d)
 |  *+func                   -1 down_write+0xe (ipipe_disable_ondemand_mappings+0x41)
 |  *+func                   -2 _spin_lock+0x9 (get_task_mm+0x1d)
 |  *+func                   -2 get_task_mm+0xe (ipipe_disable_ondemand_mappings+0x1e)
 |  *+func                   -3 ipipe_disable_ondemand_mappings+0x16 (xnshadow_map+0x65)
 |  *+func                   -3 xnshadow_map+0x12 (__pthread_setschedparam+0xd3)
 |  *+func                   -4 xnsynch_init+0x9 (xnregistry_enter+0xf8)
 |   +begin   0x80000000     -5 xnregistry_enter+0x5b (pthread_create+0x321)
     +func                   -5 strchr+0x9 (xnregistry_enter+0x40)
     +func                   -6 xnregistry_enter+0x16 (pthread_create+0x321)
 |   +end     0x80000000     -6 __ipipe_restore_pipeline_head+0xea (pthread_create+0x309)
 |  *+func                   -6 __ipipe_restore_pipeline_head+0xe (pthread_create+0x309)
 |  *+func                   -7 ppd_lookup_inner+0xe (xnshadow_ppd_get+0x5d)
 |  *+func                   -8 xnshadow_ppd_get+0xd (pthread_create+0x28c)
 |   +begin   0x80000000     -8 pthread_create+0x227 (__pthread_setschedparam+0xb6)

(xnregistry_enter is left with nklock still held. Fix committed.)

Jan

---
 kernel/ipipe/core.c |   20 +++++++++++++++-----
 1 file changed, 15 insertions(+), 5 deletions(-)

Index: b/kernel/ipipe/core.c
===================================================================
--- a/kernel/ipipe/core.c
+++ b/kernel/ipipe/core.c
@@ -1567,7 +1567,9 @@ void ipipe_check_context(struct ipipe_do
 	/* Note: We don't make the per_cpu access atomic. We assume that code
 	   which temporarily disables the check does this in atomic context
 	   only. */
-	if (likely(ipipe_current_domain->priority <= border_ipd->priority) ||
+	if (likely(ipipe_current_domain->priority <= border_ipd->priority &&
+		   !test_bit(IPIPE_STALL_FLAG,
+			     &ipipe_head_cpudom_var(status))) ||
 	    !per_cpu(ipipe_percpu_context_check, ipipe_processor_id()))
 		return;
 
@@ -1575,10 +1577,18 @@ void ipipe_check_context(struct ipipe_do
 
 	ipipe_trace_panic_freeze();
 	ipipe_set_printk_sync(ipipe_current_domain);
-	printk(KERN_ERR "I-pipe: Detected illicit call from domain '%s'\n"
-	       KERN_ERR "        into a service reserved for domain '%s' and "
-			"below.\n",
-	       ipipe_current_domain->name, border_ipd->name);
+
+	if (ipipe_current_domain->priority > border_ipd->priority)
+		printk(KERN_ERR "I-pipe: Detected illicit call from domain "
+				"'%s'\n"
+		       KERN_ERR "        into a service reserved for domain "
+				"'%s' and below.\n",
+		       ipipe_current_domain->name, border_ipd->name);
+	else
+		printk(KERN_ERR "I-pipe: Detected stalled topmost domain, "
+				"probably caused by a bug.\n"
+				"        A critical section may have been "
+				"left unterminated.\n");
 	dump_stack();
 	ipipe_trace_panic_dump();
 }


^ permalink raw reply	[flat|nested] only message in thread

only message in thread, other threads:[~2008-08-27 12:59 UTC | newest]

Thread overview: (only message) (download: mbox.gz follow: Atom feed)
-- links below jump to the message on this page --
2008-08-27 12:59 [Adeos-main] [PATCH] Detect leaking stalls of topmost domain Jan Kiszka

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.