From: Yi Yang <yi.y.yang@intel.com>
To: Oleg Nesterov <oleg@tv-sign.ru>
Cc: Gautham R Shenoy <ego@in.ibm.com>, Ingo Molnar <mingo@elte.hu>,
akpm@linux-foundation.org, linux-kernel@vger.kernel.org,
"Rafael J. Wysocki" <rjw@sisk.pl>,
Thomas Gleixner <tglx@linutronix.de>
Subject: Re: [BUG 2.6.25-rc3] scheduler/hotplug: some processes are dealocked when cpu is set to offline
Date: Tue, 04 Mar 2008 22:37:23 +0800 [thread overview]
Message-ID: <1204641443.3842.46.camel@yangyi-dev.bj.intel.com> (raw)
In-Reply-To: <20080304150107.GA564@tv-sign.ru>
On Tue, 2008-03-04 at 18:01 +0300, Oleg Nesterov wrote:
> On 03/04, Gautham R Shenoy wrote:
> >
> > So at times, the callback thread is blocked on kthread_stop(k) in
> > softlockup.c, while other time, it was blocked in
> > cleanup_workqueue_threads() in workqueue.c.
>
> From another message:
> >
> > However, it remains in R< state
>
> What about cwq->thread? Was it TASK_RUNNING too?
>
> Perhaps, for some reason the task can't get CPU after migrating from
> the now dead CPU.
>
> I can't reproduce this problem on my one cpu P4-ht, perhaps you can
> try something like the untested/uncompiled patch below?
You need a multiprocessor system to reproduce it; 4 or more CPUs work
best.
At first, I thought Detect Soft Lockups
(CONFIG_DETECT_SOFTLOCKUP) was the culprit because it added a lot of code to
detect lock dependencies and deadlocks, but the issue is still there
after I disabled soft lockup detection; it can still be reproduced, but it
takes much longer.
At this point, group_balance and kblockd are in the "R<" state.
[root@harwich-rh ~]# ps aux | grep kstop
root 15466 0.0 0.0 0 0 ? S< 01:03 0:00 [kstopmachine]
root 15467 0.0 0.0 0 0 ? Z< 01:03 0:00 [kstopmachine] <defunct>
root 15468 0.0 0.0 0 0 ? Z< 01:03 0:00 [kstopmachine] <defunct>
root 15469 0.0 0.0 0 0 ? Z< 01:03 0:00 [kstopmachine] <defunct>
root 15470 0.0 0.0 0 0 ? Z< 01:03 0:00 [kstopmachine] <defunct>
root 15471 0.0 0.0 0 0 ? Z< 01:03 0:00 [kstopmachine] <defunct>
root 15472 0.0 0.0 0 0 ? Z< 01:03 0:00 [kstopmachine] <defunct>
root 15473 0.0 0.0 0 0 ? Z< 01:03 0:00 [kstopmachine] <defunct>
root 15474 0.0 0.0 0 0 ? Z< 01:03 0:00 [kstopmachine] <defunct>
root 15475 0.0 0.0 0 0 ? Z< 01:03 0:00 [kstopmachine] <defunct>
root 15476 0.0 0.0 0 0 ? Z< 01:03 0:00 [kstopmachine] <defunct>
root 17155 0.0 0.0 61148 708 pts/3 R+ 14:47 0:00 grep kstop
[root@harwich-rh ~]# ps aux | grep "R<"
root 35 0.0 0.0 0 0 ? R< Mar13 0:00 [group_balance]
root 182 0.1 0.0 0 0 ? R< Mar13 1:09 [kblockd/0]
root 17157 0.0 0.0 61148 704 pts/3 R+ 14:47 0:00 grep R<
[root@harwich-rh ~]#
> Oleg.
>
> --- include/linux/sched.h 2008-02-15 16:59:17.000000000 +0300
> +++ include/linux/sched.h 2008-03-04 17:44:53.136738605 +0300
> @@ -1121,6 +1121,7 @@ struct task_struct {
> /* hung task detection */
> unsigned long last_switch_timestamp;
> unsigned long last_switch_count;
> + unsigned long xxx;
> #endif
> /* CPU-specific state of this task */
> struct thread_struct thread;
> --- kernel/fork.c 2008-02-15 16:59:17.000000000 +0300
> +++ kernel/fork.c 2008-03-04 17:45:14.773033839 +0300
> @@ -1097,6 +1097,7 @@ static struct task_struct *copy_process(
> #ifdef CONFIG_DETECT_SOFTLOCKUP
> p->last_switch_count = 0;
> p->last_switch_timestamp = 0;
> + p->xxx = 0;
> #endif
>
> #ifdef CONFIG_TASK_XACCT
> --- kernel/sched.c 2008-02-15 16:59:17.000000000 +0300
> +++ kernel/sched.c 2008-03-04 17:48:42.308798646 +0300
> @@ -1291,6 +1291,7 @@ static void enqueue_task(struct rq *rq,
> sched_info_queued(p);
> p->sched_class->enqueue_task(rq, p, wakeup);
> p->se.on_rq = 1;
> + p->xxx = jiffies | 1;
> }
>
> static void dequeue_task(struct rq *rq, struct task_struct *p, int sleep)
> @@ -3944,6 +3945,8 @@ need_resched_nonpreemptible:
> preempt_enable_no_resched();
> if (unlikely(test_thread_flag(TIF_NEED_RESCHED)))
> goto need_resched;
> +
> + current->xxx = 0;
> }
> EXPORT_SYMBOL(schedule);
>
> --- kernel/softlockup.c 2008-02-15 16:59:17.000000000 +0300
> +++ kernel/softlockup.c 2008-03-04 17:49:05.584414763 +0300
> @@ -174,6 +174,27 @@ static void check_hung_task(struct task_
> touch_nmi_watchdog();
> }
>
> +static void check_running_task(struct task_struct *t, unsigned long now)
> +{
> + if (!sysctl_hung_task_timeout_secs)
> + return;
> +
> + if (time_before(now, t->xxx + HZ * sysctl_hung_task_timeout_secs)
> + return;
> +
> + printk(KERN_ERR "INFO: task %s:%d can't get CPU for more than "
> + "%ld seconds.\n", t->comm, t->pid,
> + sysctl_hung_task_timeout_secs);
> +
> + if (!cpus_intersects(t->cpus_allowed, cpu_online_map))
> + printk(KERN_ERR "bad ->cpus_allowed\n");
> + if (!cpu_online(task_cpu(t)))
> + printk(KERN_ERR "bad ->cpu\n");
> +
> + sched_show_task(t);
> + touch_nmi_watchdog();
> +}
> +
> /*
> * Check whether a TASK_UNINTERRUPTIBLE does not get woken up for
> * a really long time (120 seconds). If that happens, print out
> @@ -183,6 +204,7 @@ static void check_hung_uninterruptible_t
> {
> int max_count = sysctl_hung_task_check_count;
> unsigned long now = get_timestamp(this_cpu);
> + unsigned long jiff = jiffies;
> struct task_struct *g, *t;
>
> /*
> @@ -192,15 +214,17 @@ static void check_hung_uninterruptible_t
> if ((tainted & TAINT_DIE) || did_panic)
> return;
>
> - read_lock(&tasklist_lock);
> + rcu_read_lock();
> do_each_thread(g, t) {
> if (!--max_count)
> goto unlock;
> if (t->state & TASK_UNINTERRUPTIBLE)
> check_hung_task(t, now);
> + if (!t->xxx)
> + check_running_task(t, jiff);
> } while_each_thread(g, t);
> unlock:
> - read_unlock(&tasklist_lock);
> + rcu_read_unlock();
> }
>
> /*
>
next prev parent reply other threads:[~2008-03-05 2:25 UTC|newest]
Thread overview: 54+ messages / expand[flat|nested] mbox.gz Atom feed top
2008-03-02 18:42 [BUG 2.6.25-rc3] scheduler/hotplug: some processes are dealocked when cpu is set to offline Yi Yang
2008-03-03 11:54 ` Dmitry Adamushko
2008-03-03 11:56 ` Ingo Molnar
2008-03-03 12:02 ` Dmitry Adamushko
2008-03-03 14:53 ` Yi Yang
2008-03-03 17:37 ` Yi Yang
2008-03-03 15:31 ` Gautham R Shenoy
2008-03-03 14:45 ` Yi Yang
2008-03-04 5:26 ` Gautham R Shenoy
2008-03-04 9:09 ` Gautham R Shenoy
2008-03-03 21:56 ` Yi Yang
2008-03-04 15:01 ` Oleg Nesterov
2008-03-04 14:37 ` Yi Yang [this message]
2008-03-06 20:05 ` Yi Yang
2008-03-05 10:05 ` Gautham R Shenoy
2008-03-05 13:53 ` Oleg Nesterov
2008-03-06 11:15 ` Gautham R Shenoy
2008-03-06 12:22 ` Gautham R Shenoy
2008-03-06 13:44 ` Gautham R Shenoy
2008-03-07 2:54 ` Oleg Nesterov
2008-03-07 9:10 ` Gautham R Shenoy
2008-03-07 10:51 ` Gautham R Shenoy
2008-03-06 23:20 ` Yi Yang
2008-03-07 13:02 ` Dmitry Adamushko
2008-03-07 13:55 ` Gautham R Shenoy
2008-03-07 15:50 ` Gautham R Shenoy
2008-03-07 19:14 ` [BUG 2.6.25-rc3] scheduler/hotplug: some processes aredealocked " Suresh Siddha
2008-03-07 20:18 ` [BUG 2.6.25-rc3] scheduler/hotplug: some processes are dealocked " Andrew Morton
2008-03-07 21:36 ` Rafael J. Wysocki
2008-03-07 23:01 ` Suresh Siddha
2008-03-07 23:29 ` Andrew Morton
2008-03-07 23:43 ` Rafael J. Wysocki
2008-03-08 1:50 ` Suresh Siddha
2008-03-08 2:09 ` Andrew Morton
2008-03-08 5:10 ` [PATCH] adjust root-domain->online span in response to hotplug event Gregory Haskins
2008-03-08 8:41 ` Ingo Molnar
2008-03-08 17:50 ` [PATCH] adjust root-domain->online span in response to hotplugevent Gregory Haskins
2008-03-09 0:31 ` Dmitry Adamushko
2008-03-10 14:12 ` Gregory Haskins
2008-03-09 2:35 ` [PATCH] adjust root-domain->online span in response to hotplug event Suresh Siddha
2008-03-10 12:41 ` Gregory Haskins
2008-03-10 8:14 ` Gautham R Shenoy
2008-03-10 13:13 ` [PATCH] cpu-hotplug: Register update_sched_domains() notifier with higher prio Gautham R Shenoy
2008-03-10 22:25 ` Andrew Morton
2008-03-10 13:39 ` [PATCH] keep rd->online and cpu_online_map in sync Gregory Haskins
2008-03-10 14:21 ` Gautham R Shenoy
2008-03-10 18:12 ` Suresh Siddha
2008-03-10 22:03 ` Rafael J. Wysocki
2008-03-10 22:00 ` Gregory Haskins
2008-03-10 22:10 ` Suresh Siddha
2008-03-10 21:59 ` [PATCH v2] " Gregory Haskins
2008-03-10 23:36 ` Andrew Morton
2008-03-11 1:34 ` Suresh Siddha
2008-03-11 4:39 ` Gautham R Shenoy
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=1204641443.3842.46.camel@yangyi-dev.bj.intel.com \
--to=yi.y.yang@intel.com \
--cc=akpm@linux-foundation.org \
--cc=ego@in.ibm.com \
--cc=linux-kernel@vger.kernel.org \
--cc=mingo@elte.hu \
--cc=oleg@tv-sign.ru \
--cc=rjw@sisk.pl \
--cc=tglx@linutronix.de \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.