* [PATCH RESEND] sched/fair: Fix wrong cpu selecting from isolated domain
@ 2020-09-24 6:48 Xunlei Pang
2020-09-24 7:18 ` Vincent Guittot
2020-09-29 7:56 ` [tip: sched/core] " tip-bot2 for Xunlei Pang
0 siblings, 2 replies; 4+ messages in thread
From: Xunlei Pang @ 2020-09-24 6:48 UTC (permalink / raw)
To: Ingo Molnar, Peter Zijlstra, Juri Lelli, Vincent Guittot,
Jiang Biao
Cc: Wetp Zhang, linux-kernel
We've hit a problem where tasks with a full cpumask
(e.g. by putting them into a cpuset or setting full affinity)
were occasionally migrated to our isolated cpus in our production environment.
After some analysis, we found that it is due to the current
select_idle_smt() not considering the sched_domain mask.
Steps to reproduce on my 32-CPU hyperthreaded machine:
1. with boot parameter: "isolcpus=domain,2-31"
(thread lists: 0,16 and 1,17)
2. cgcreate -g cpu:test; cgexec -g cpu:test "test_threads"
3. some threads will be migrated to the isolated cpu16~17.
Fix it by checking the valid domain mask in select_idle_smt().
Fixes: 10e2f1acd010 ("sched/core: Rewrite and improve select_idle_siblings()")
Reported-by: Wetp Zhang <wetp.zy@linux.alibaba.com>
Reviewed-by: Jiang Biao <benbjiang@tencent.com>
Signed-off-by: Xunlei Pang <xlpang@linux.alibaba.com>
---
kernel/sched/fair.c | 9 +++++----
1 file changed, 5 insertions(+), 4 deletions(-)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 1a68a05..fa942c4 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -6075,7 +6075,7 @@ static int select_idle_core(struct task_struct *p, struct sched_domain *sd, int
/*
* Scan the local SMT mask for idle CPUs.
*/
-static int select_idle_smt(struct task_struct *p, int target)
+static int select_idle_smt(struct task_struct *p, struct sched_domain *sd, int target)
{
int cpu;
@@ -6083,7 +6083,8 @@ static int select_idle_smt(struct task_struct *p, int target)
return -1;
for_each_cpu(cpu, cpu_smt_mask(target)) {
- if (!cpumask_test_cpu(cpu, p->cpus_ptr))
+ if (!cpumask_test_cpu(cpu, p->cpus_ptr) ||
+ !cpumask_test_cpu(cpu, sched_domain_span(sd)))
continue;
if (available_idle_cpu(cpu) || sched_idle_cpu(cpu))
return cpu;
@@ -6099,7 +6100,7 @@ static inline int select_idle_core(struct task_struct *p, struct sched_domain *s
return -1;
}
-static inline int select_idle_smt(struct task_struct *p, int target)
+static inline int select_idle_smt(struct task_struct *p, struct sched_domain *sd, int target)
{
return -1;
}
@@ -6274,7 +6275,7 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
if ((unsigned)i < nr_cpumask_bits)
return i;
- i = select_idle_smt(p, target);
+ i = select_idle_smt(p, sd, target);
if ((unsigned)i < nr_cpumask_bits)
return i;
--
1.8.3.1
^ permalink raw reply related [flat|nested] 4+ messages in thread
* Re: [PATCH RESEND] sched/fair: Fix wrong cpu selecting from isolated domain
2020-09-24 6:48 [PATCH RESEND] sched/fair: Fix wrong cpu selecting from isolated domain Xunlei Pang
@ 2020-09-24 7:18 ` Vincent Guittot
2020-09-24 8:54 ` Xunlei Pang
2020-09-29 7:56 ` [tip: sched/core] " tip-bot2 for Xunlei Pang
1 sibling, 1 reply; 4+ messages in thread
From: Vincent Guittot @ 2020-09-24 7:18 UTC (permalink / raw)
To: Xunlei Pang
Cc: Ingo Molnar, Peter Zijlstra, Juri Lelli, Jiang Biao, Wetp Zhang,
linux-kernel
On Thu, 24 Sep 2020 at 08:48, Xunlei Pang <xlpang@linux.alibaba.com> wrote:
>
> We've met problems that occasionally tasks with full cpumask
> (e.g. by putting it into a cpuset or setting to full affinity)
> were migrated to our isolated cpus in production environment.
>
> After some analysis, we found that it is due to the current
> select_idle_smt() not considering the sched_domain mask.
>
> Steps to reproduce on my 31-CPU hyperthreads machine:
> 1. with boot parameter: "isolcpus=domain,2-31"
> (thread lists: 0,16 and 1,17)
> 2. cgcreate -g cpu:test; cgexec -g cpu:test "test_threads"
> 3. some threads will be migrated to the isolated cpu16~17.
>
> Fix it by checking the valid domain mask in select_idle_smt().
>
> Fixes: 10e2f1acd010 ("sched/core: Rewrite and improve select_idle_siblings())
> Reported-by: Wetp Zhang <wetp.zy@linux.alibaba.com>
> Reviewed-by: Jiang Biao <benbjiang@tencent.com>
> Signed-off-by: Xunlei Pang <xlpang@linux.alibaba.com>
Reviewed-by: Vincent Guittot <vincent.guittot@linaro.org>
> ---
> kernel/sched/fair.c | 9 +++++----
> 1 file changed, 5 insertions(+), 4 deletions(-)
>
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index 1a68a05..fa942c4 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -6075,7 +6075,7 @@ static int select_idle_core(struct task_struct *p, struct sched_domain *sd, int
> /*
> * Scan the local SMT mask for idle CPUs.
> */
> -static int select_idle_smt(struct task_struct *p, int target)
> +static int select_idle_smt(struct task_struct *p, struct sched_domain *sd, int target)
> {
> int cpu;
>
> @@ -6083,7 +6083,8 @@ static int select_idle_smt(struct task_struct *p, int target)
> return -1;
>
> for_each_cpu(cpu, cpu_smt_mask(target)) {
> - if (!cpumask_test_cpu(cpu, p->cpus_ptr))
> + if (!cpumask_test_cpu(cpu, p->cpus_ptr) ||
> + !cpumask_test_cpu(cpu, sched_domain_span(sd)))
> continue;
> if (available_idle_cpu(cpu) || sched_idle_cpu(cpu))
> return cpu;
> @@ -6099,7 +6100,7 @@ static inline int select_idle_core(struct task_struct *p, struct sched_domain *s
> return -1;
> }
>
> -static inline int select_idle_smt(struct task_struct *p, int target)
> +static inline int select_idle_smt(struct task_struct *p, struct sched_domain *sd, int target)
> {
> return -1;
> }
> @@ -6274,7 +6275,7 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
> if ((unsigned)i < nr_cpumask_bits)
> return i;
>
> - i = select_idle_smt(p, target);
> + i = select_idle_smt(p, sd, target);
> if ((unsigned)i < nr_cpumask_bits)
> return i;
>
> --
> 1.8.3.1
>
^ permalink raw reply [flat|nested] 4+ messages in thread
* Re: [PATCH RESEND] sched/fair: Fix wrong cpu selecting from isolated domain
2020-09-24 7:18 ` Vincent Guittot
@ 2020-09-24 8:54 ` Xunlei Pang
0 siblings, 0 replies; 4+ messages in thread
From: Xunlei Pang @ 2020-09-24 8:54 UTC (permalink / raw)
To: Vincent Guittot, Xunlei Pang
Cc: Ingo Molnar, Peter Zijlstra, Juri Lelli, Jiang Biao, Wetp Zhang,
linux-kernel
On 9/24/20 3:18 PM, Vincent Guittot wrote:
> On Thu, 24 Sep 2020 at 08:48, Xunlei Pang <xlpang@linux.alibaba.com> wrote:
>>
>> We've met problems that occasionally tasks with full cpumask
>> (e.g. by putting it into a cpuset or setting to full affinity)
>> were migrated to our isolated cpus in production environment.
>>
>> After some analysis, we found that it is due to the current
>> select_idle_smt() not considering the sched_domain mask.
>>
>> Steps to reproduce on my 31-CPU hyperthreads machine:
>> 1. with boot parameter: "isolcpus=domain,2-31"
>> (thread lists: 0,16 and 1,17)
>> 2. cgcreate -g cpu:test; cgexec -g cpu:test "test_threads"
>> 3. some threads will be migrated to the isolated cpu16~17.
>>
>> Fix it by checking the valid domain mask in select_idle_smt().
>>
>> Fixes: 10e2f1acd010 ("sched/core: Rewrite and improve select_idle_siblings())
>> Reported-by: Wetp Zhang <wetp.zy@linux.alibaba.com>
>> Reviewed-by: Jiang Biao <benbjiang@tencent.com>
>> Signed-off-by: Xunlei Pang <xlpang@linux.alibaba.com>
>
> Reviewed-by: Vincent Guittot <vincent.guittot@linaro.org>
>
Thanks, Vincent :-)
^ permalink raw reply [flat|nested] 4+ messages in thread
* [tip: sched/core] sched/fair: Fix wrong cpu selecting from isolated domain
2020-09-24 6:48 [PATCH RESEND] sched/fair: Fix wrong cpu selecting from isolated domain Xunlei Pang
2020-09-24 7:18 ` Vincent Guittot
@ 2020-09-29 7:56 ` tip-bot2 for Xunlei Pang
1 sibling, 0 replies; 4+ messages in thread
From: tip-bot2 for Xunlei Pang @ 2020-09-29 7:56 UTC (permalink / raw)
To: linux-tip-commits
Cc: Wetp Zhang, Xunlei Pang, Peter Zijlstra (Intel), Jiang Biao,
Vincent Guittot, x86, LKML
The following commit has been merged into the sched/core branch of tip:
Commit-ID: df3cb4ea1fb63ff326488efd671ba3c39034255e
Gitweb: https://git.kernel.org/tip/df3cb4ea1fb63ff326488efd671ba3c39034255e
Author: Xunlei Pang <xlpang@linux.alibaba.com>
AuthorDate: Thu, 24 Sep 2020 14:48:47 +08:00
Committer: Peter Zijlstra <peterz@infradead.org>
CommitterDate: Fri, 25 Sep 2020 14:23:25 +02:00
sched/fair: Fix wrong cpu selecting from isolated domain
We've met problems that occasionally tasks with full cpumask
(e.g. by putting it into a cpuset or setting to full affinity)
were migrated to our isolated cpus in production environment.
After some analysis, we found that it is due to the current
select_idle_smt() not considering the sched_domain mask.
Steps to reproduce on my 32-CPU hyperthreaded machine:
1. with boot parameter: "isolcpus=domain,2-31"
(thread lists: 0,16 and 1,17)
2. cgcreate -g cpu:test; cgexec -g cpu:test "test_threads"
3. some threads will be migrated to the isolated cpu16~17.
Fix it by checking the valid domain mask in select_idle_smt().
Fixes: 10e2f1acd010 ("sched/core: Rewrite and improve select_idle_siblings()")
Reported-by: Wetp Zhang <wetp.zy@linux.alibaba.com>
Signed-off-by: Xunlei Pang <xlpang@linux.alibaba.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Jiang Biao <benbjiang@tencent.com>
Reviewed-by: Vincent Guittot <vincent.guittot@linaro.org>
Link: https://lkml.kernel.org/r/1600930127-76857-1-git-send-email-xlpang@linux.alibaba.com
---
kernel/sched/fair.c | 9 +++++----
1 file changed, 5 insertions(+), 4 deletions(-)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index a15deb2..9613e5d 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -6080,7 +6080,7 @@ static int select_idle_core(struct task_struct *p, struct sched_domain *sd, int
/*
* Scan the local SMT mask for idle CPUs.
*/
-static int select_idle_smt(struct task_struct *p, int target)
+static int select_idle_smt(struct task_struct *p, struct sched_domain *sd, int target)
{
int cpu;
@@ -6088,7 +6088,8 @@ static int select_idle_smt(struct task_struct *p, int target)
return -1;
for_each_cpu(cpu, cpu_smt_mask(target)) {
- if (!cpumask_test_cpu(cpu, p->cpus_ptr))
+ if (!cpumask_test_cpu(cpu, p->cpus_ptr) ||
+ !cpumask_test_cpu(cpu, sched_domain_span(sd)))
continue;
if (available_idle_cpu(cpu) || sched_idle_cpu(cpu))
return cpu;
@@ -6104,7 +6105,7 @@ static inline int select_idle_core(struct task_struct *p, struct sched_domain *s
return -1;
}
-static inline int select_idle_smt(struct task_struct *p, int target)
+static inline int select_idle_smt(struct task_struct *p, struct sched_domain *sd, int target)
{
return -1;
}
@@ -6279,7 +6280,7 @@ symmetric:
if ((unsigned)i < nr_cpumask_bits)
return i;
- i = select_idle_smt(p, target);
+ i = select_idle_smt(p, sd, target);
if ((unsigned)i < nr_cpumask_bits)
return i;
^ permalink raw reply related [flat|nested] 4+ messages in thread
end of thread, other threads:[~2020-09-29 7:57 UTC | newest]
Thread overview: 4+ messages (download: mbox.gz follow: Atom feed)
-- links below jump to the message on this page --
2020-09-24 6:48 [PATCH RESEND] sched/fair: Fix wrong cpu selecting from isolated domain Xunlei Pang
2020-09-24 7:18 ` Vincent Guittot
2020-09-24 8:54 ` Xunlei Pang
2020-09-29 7:56 ` [tip: sched/core] " tip-bot2 for Xunlei Pang
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.