All of lore.kernel.org
 help / color / mirror / Atom feed
From: Yi Yang <yi.y.yang@intel.com>
To: Oleg Nesterov <oleg@tv-sign.ru>
Cc: Gautham R Shenoy <ego@in.ibm.com>, Ingo Molnar <mingo@elte.hu>,
	akpm@linux-foundation.org, linux-kernel@vger.kernel.org,
	"Rafael J. Wysocki" <rjw@sisk.pl>,
	Thomas Gleixner <tglx@linutronix.de>
Subject: Re: [BUG 2.6.25-rc3] scheduler/hotplug: some processes are dealocked when cpu is set to offline
Date: Tue, 04 Mar 2008 22:37:23 +0800	[thread overview]
Message-ID: <1204641443.3842.46.camel@yangyi-dev.bj.intel.com> (raw)
In-Reply-To: <20080304150107.GA564@tv-sign.ru>

On Tue, 2008-03-04 at 18:01 +0300, Oleg Nesterov wrote:
> On 03/04, Gautham R Shenoy wrote:
> >
> > So at times, the callback thread is blocked on kthread_stop(k) in
> > softlockup.c, while other time, it was blocked in
> > cleanup_workqueue_threads() in workqueue.c. 
> 
> From another message:
> >
> > However, it remains in R< state
> 
> What about cwq->thread? Was it TASK_RUNNING too?
> 
> Perhaps, for some reason the task can't get CPU after migrating from
> the now dead CPU.
> 
> I can't reproduce this problem on my one cpu P4-ht, perhaps you can
> try something like the untested/uncompiled patch below?
You need a multiprocessor system to reproduce it; 4 CPUs or more are
best.

At first, I thought Detect Soft Lockups
(CONFIG_DETECT_SOFTLOCKUP) was the culprit because it added a lot of code to
detect lock dependencies and deadlocks, but the issue is still there
after I disabled soft lockup detection; it can still be reproduced, but it
takes much longer.

At this time, group_balance and kblockd are in "R<" state.

[root@harwich-rh ~]# ps aux | grep kstop
root     15466  0.0  0.0      0     0 ?        S<   01:03   0:00 [kstopmachine]
root     15467  0.0  0.0      0     0 ?        Z<   01:03   0:00 [kstopmachine] <defunct>
root     15468  0.0  0.0      0     0 ?        Z<   01:03   0:00 [kstopmachine] <defunct>
root     15469  0.0  0.0      0     0 ?        Z<   01:03   0:00 [kstopmachine] <defunct>
root     15470  0.0  0.0      0     0 ?        Z<   01:03   0:00 [kstopmachine] <defunct>
root     15471  0.0  0.0      0     0 ?        Z<   01:03   0:00 [kstopmachine] <defunct>
root     15472  0.0  0.0      0     0 ?        Z<   01:03   0:00 [kstopmachine] <defunct>
root     15473  0.0  0.0      0     0 ?        Z<   01:03   0:00 [kstopmachine] <defunct>
root     15474  0.0  0.0      0     0 ?        Z<   01:03   0:00 [kstopmachine] <defunct>
root     15475  0.0  0.0      0     0 ?        Z<   01:03   0:00 [kstopmachine] <defunct>
root     15476  0.0  0.0      0     0 ?        Z<   01:03   0:00 [kstopmachine] <defunct>
root     17155  0.0  0.0  61148   708 pts/3    R+   14:47   0:00 grep kstop
[root@harwich-rh ~]# ps aux | grep "R<"
root        35  0.0  0.0      0     0 ?        R<   Mar13   0:00 [group_balance]
root       182  0.1  0.0      0     0 ?        R<   Mar13   1:09 [kblockd/0]
root     17157  0.0  0.0  61148   704 pts/3    R+   14:47   0:00 grep R<
[root@harwich-rh ~]#



> Oleg.
> 
> --- include/linux/sched.h	2008-02-15 16:59:17.000000000 +0300
> +++ include/linux/sched.h	2008-03-04 17:44:53.136738605 +0300
> @@ -1121,6 +1121,7 @@ struct task_struct {
>  /* hung task detection */
>  	unsigned long last_switch_timestamp;
>  	unsigned long last_switch_count;
> +	unsigned long xxx;
>  #endif
>  /* CPU-specific state of this task */
>  	struct thread_struct thread;
> --- kernel/fork.c	2008-02-15 16:59:17.000000000 +0300
> +++ kernel/fork.c	2008-03-04 17:45:14.773033839 +0300
> @@ -1097,6 +1097,7 @@ static struct task_struct *copy_process(
>  #ifdef CONFIG_DETECT_SOFTLOCKUP
>  	p->last_switch_count = 0;
>  	p->last_switch_timestamp = 0;
> +	p->xxx = 0;
>  #endif
>  
>  #ifdef CONFIG_TASK_XACCT
> --- kernel/sched.c	2008-02-15 16:59:17.000000000 +0300
> +++ kernel/sched.c	2008-03-04 17:48:42.308798646 +0300
> @@ -1291,6 +1291,7 @@ static void enqueue_task(struct rq *rq, 
>  	sched_info_queued(p);
>  	p->sched_class->enqueue_task(rq, p, wakeup);
>  	p->se.on_rq = 1;
> +	p->xxx = jiffies | 1;
>  }
>  
>  static void dequeue_task(struct rq *rq, struct task_struct *p, int sleep)
> @@ -3944,6 +3945,8 @@ need_resched_nonpreemptible:
>  	preempt_enable_no_resched();
>  	if (unlikely(test_thread_flag(TIF_NEED_RESCHED)))
>  		goto need_resched;
> +
> +	current->xxx = 0;
>  }
>  EXPORT_SYMBOL(schedule);
>  
> --- kernel/softlockup.c	2008-02-15 16:59:17.000000000 +0300
> +++ kernel/softlockup.c	2008-03-04 17:49:05.584414763 +0300
> @@ -174,6 +174,27 @@ static void check_hung_task(struct task_
>  	touch_nmi_watchdog();
>  }
>  
> +static void check_running_task(struct task_struct *t, unsigned long now)
> +{
> +	if (!sysctl_hung_task_timeout_secs)
> +		return;
> +
> +	if (time_before(now, t->xxx + HZ * sysctl_hung_task_timeout_secs)
> +		return;
> +
> +	printk(KERN_ERR "INFO: task %s:%d can't get CPU for more than "
> +			"%ld seconds.\n", t->comm, t->pid,
> +			sysctl_hung_task_timeout_secs);
> +
> +	if (!cpus_intersects(t->cpus_allowed, cpu_online_map))
> +		printk(KERN_ERR "bad ->cpus_allowed\n");
> +	if (!cpu_online(task_cpu(t)))
> +		printk(KERN_ERR "bad ->cpu\n");
> +
> +	sched_show_task(t);
> +	touch_nmi_watchdog();
> +}
> +
>  /*
>   * Check whether a TASK_UNINTERRUPTIBLE does not get woken up for
>   * a really long time (120 seconds). If that happens, print out
> @@ -183,6 +204,7 @@ static void check_hung_uninterruptible_t
>  {
>  	int max_count = sysctl_hung_task_check_count;
>  	unsigned long now = get_timestamp(this_cpu);
> +	unsigned long jiff = jiffies;
>  	struct task_struct *g, *t;
>  
>  	/*
> @@ -192,15 +214,17 @@ static void check_hung_uninterruptible_t
>  	if ((tainted & TAINT_DIE) || did_panic)
>  		return;
>  
> -	read_lock(&tasklist_lock);
> +	rcu_read_lock();
>  	do_each_thread(g, t) {
>  		if (!--max_count)
>  			goto unlock;
>  		if (t->state & TASK_UNINTERRUPTIBLE)
>  			check_hung_task(t, now);
> +		if (!t->xxx)
> +			check_running_task(t, jiff);
>  	} while_each_thread(g, t);
>   unlock:
> -	read_unlock(&tasklist_lock);
> +	rcu_read_unlock();
>  }
>  
>  /*
> 


  reply	other threads:[~2008-03-05  2:25 UTC|newest]

Thread overview: 54+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2008-03-02 18:42 [BUG 2.6.25-rc3] scheduler/hotplug: some processes are dealocked when cpu is set to offline Yi Yang
2008-03-03 11:54 ` Dmitry Adamushko
2008-03-03 11:56   ` Ingo Molnar
2008-03-03 12:02     ` Dmitry Adamushko
2008-03-03 14:53       ` Yi Yang
2008-03-03 17:37         ` Yi Yang
2008-03-03 15:31 ` Gautham R Shenoy
2008-03-03 14:45   ` Yi Yang
2008-03-04  5:26     ` Gautham R Shenoy
2008-03-04  9:09       ` Gautham R Shenoy
2008-03-03 21:56         ` Yi Yang
2008-03-04 15:01       ` Oleg Nesterov
2008-03-04 14:37         ` Yi Yang [this message]
2008-03-06 20:05           ` Yi Yang
2008-03-05 10:05         ` Gautham R Shenoy
2008-03-05 13:53           ` Oleg Nesterov
2008-03-06 11:15             ` Gautham R Shenoy
2008-03-06 12:22               ` Gautham R Shenoy
2008-03-06 13:44         ` Gautham R Shenoy
2008-03-07  2:54           ` Oleg Nesterov
2008-03-07  9:10             ` Gautham R Shenoy
2008-03-07 10:51               ` Gautham R Shenoy
2008-03-06 23:20                 ` Yi Yang
2008-03-07 13:02                 ` Dmitry Adamushko
2008-03-07 13:55                   ` Gautham R Shenoy
2008-03-07 15:50                     ` Gautham R Shenoy
2008-03-07 19:14                       ` [BUG 2.6.25-rc3] scheduler/hotplug: some processes aredealocked " Suresh Siddha
2008-03-07 20:18                   ` [BUG 2.6.25-rc3] scheduler/hotplug: some processes are dealocked " Andrew Morton
2008-03-07 21:36                     ` Rafael J. Wysocki
2008-03-07 23:01                       ` Suresh Siddha
2008-03-07 23:29                         ` Andrew Morton
2008-03-07 23:43                           ` Rafael J. Wysocki
2008-03-08  1:50                             ` Suresh Siddha
2008-03-08  2:09                               ` Andrew Morton
2008-03-08  5:10                               ` [PATCH] adjust root-domain->online span in response to hotplug event Gregory Haskins
2008-03-08  8:41                                 ` Ingo Molnar
2008-03-08 17:50                                   ` [PATCH] adjust root-domain->online span in response to hotplugevent Gregory Haskins
2008-03-09  0:31                                     ` Dmitry Adamushko
2008-03-10 14:12                                       ` Gregory Haskins
2008-03-09  2:35                                 ` [PATCH] adjust root-domain->online span in response to hotplug event Suresh Siddha
2008-03-10 12:41                                   ` Gregory Haskins
2008-03-10  8:14                                 ` Gautham R Shenoy
2008-03-10 13:13                                   ` [PATCH] cpu-hotplug: Register update_sched_domains() notifier with higher prio Gautham R Shenoy
2008-03-10 22:25                                     ` Andrew Morton
2008-03-10 13:39                                   ` [PATCH] keep rd->online and cpu_online_map in sync Gregory Haskins
2008-03-10 14:21                                     ` Gautham R Shenoy
2008-03-10 18:12                                     ` Suresh Siddha
2008-03-10 22:03                                       ` Rafael J. Wysocki
2008-03-10 22:00                                         ` Gregory Haskins
2008-03-10 22:10                                           ` Suresh Siddha
2008-03-10 21:59                                             ` [PATCH v2] " Gregory Haskins
2008-03-10 23:36                                               ` Andrew Morton
2008-03-11  1:34                                                 ` Suresh Siddha
2008-03-11  4:39                                                   ` Gautham R Shenoy

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=1204641443.3842.46.camel@yangyi-dev.bj.intel.com \
    --to=yi.y.yang@intel.com \
    --cc=akpm@linux-foundation.org \
    --cc=ego@in.ibm.com \
    --cc=linux-kernel@vger.kernel.org \
    --cc=mingo@elte.hu \
    --cc=oleg@tv-sign.ru \
    --cc=rjw@sisk.pl \
    --cc=tglx@linutronix.de \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.