* [PATCH] sched/fair: Add missing unthrottle_cfs_rq()
[not found] <0004fb54-cdee-2197-1cbf-6e2111d39ed9@arm.com>
@ 2019-08-20 10:54 ` Valentin Schneider
2019-08-22 9:21 ` Peter Zijlstra
2019-08-22 18:48 ` bsegall
0 siblings, 2 replies; 7+ messages in thread
From: Valentin Schneider @ 2019-08-20 10:54 UTC
To: linux-kernel; +Cc: mingo, peterz, liangyan.peng, shanpeic, xlpang, pjt, stable
Turns out a cfs_rq->runtime_remaining can become positive in
assign_cfs_rq_runtime(), but this codepath has no call to
unthrottle_cfs_rq().
This can leave us in a situation where we have a throttled cfs_rq with
positive ->runtime_remaining, which breaks the math in
distribute_cfs_runtime(): this function expects a negative value so that
it may safely negate it into a positive value.
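For illustration, the arithmetic in question in distribute_cfs_runtime() is
roughly:

	runtime = -cfs_rq->runtime_remaining + 1;
	if (runtime > remaining)
		runtime = remaining;
	remaining -= runtime;

With the expected runtime_remaining == -5 this hands out 6 units of budget.
With a (supposedly impossible) positive runtime_remaining, the negation wraps
in the unsigned 'runtime', which then gets clamped to the whole of
'remaining': a single cfs_rq swallows the entire pool.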
Add the missing unthrottle_cfs_rq(). While at it, add a WARN_ON where
we expect negative values, and pull in a comment from the mailing list
that didn't make it in [1].
[1]: https://lkml.kernel.org/r/BANLkTi=NmCxKX6EbDQcJYDJ5kKyG2N1ssw@mail.gmail.com
Cc: <stable@vger.kernel.org>
Fixes: ec12cb7f31e2 ("sched: Accumulate per-cfs_rq cpu usage and charge against bandwidth")
Reported-by: Liangyan <liangyan.peng@linux.alibaba.com>
Signed-off-by: Valentin Schneider <valentin.schneider@arm.com>
---
kernel/sched/fair.c | 17 ++++++++++++-----
1 file changed, 12 insertions(+), 5 deletions(-)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 1054d2cf6aaa..219ff3f328e5 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -4385,6 +4385,11 @@ static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq)
 	return rq_clock_task(rq_of(cfs_rq)) - cfs_rq->throttled_clock_task_time;
 }
 
+static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq)
+{
+	return cfs_bandwidth_used() && cfs_rq->throttled;
+}
+
 /* returns 0 on failure to allocate runtime */
 static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq)
 {
@@ -4411,6 +4416,9 @@ static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq)
 
 	cfs_rq->runtime_remaining += amount;
 
+	if (cfs_rq->runtime_remaining > 0 && cfs_rq_throttled(cfs_rq))
+		unthrottle_cfs_rq(cfs_rq);
+
 	return cfs_rq->runtime_remaining > 0;
 }
 
@@ -4439,11 +4447,6 @@ void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec)
 	__account_cfs_rq_runtime(cfs_rq, delta_exec);
 }
 
-static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq)
-{
-	return cfs_bandwidth_used() && cfs_rq->throttled;
-}
-
 /* check whether cfs_rq, or any parent, is throttled */
 static inline int throttled_hierarchy(struct cfs_rq *cfs_rq)
 {
@@ -4628,6 +4631,10 @@ static u64 distribute_cfs_runtime(struct cfs_bandwidth *cfs_b, u64 remaining)
 		if (!cfs_rq_throttled(cfs_rq))
 			goto next;
 
+		/* By the above check, this should never be true */
+		WARN_ON(cfs_rq->runtime_remaining > 0);
+
+		/* Pick the minimum amount to return to a positive quota state */
 		runtime = -cfs_rq->runtime_remaining + 1;
 		if (runtime > remaining)
 			runtime = remaining;
--
2.22.0
* Re: [PATCH] sched/fair: Add missing unthrottle_cfs_rq()
2019-08-20 10:54 ` [PATCH] sched/fair: Add missing unthrottle_cfs_rq() Valentin Schneider
@ 2019-08-22 9:21 ` Peter Zijlstra
2019-08-22 17:43 ` bsegall
2019-08-22 18:48 ` bsegall
1 sibling, 1 reply; 7+ messages in thread
From: Peter Zijlstra @ 2019-08-22 9:21 UTC
To: Valentin Schneider
Cc: linux-kernel, mingo, liangyan.peng, shanpeic, xlpang, pjt, stable,
Ben Segall
On Tue, Aug 20, 2019 at 11:54:20AM +0100, Valentin Schneider wrote:
> Turns out a cfs_rq->runtime_remaining can become positive in
> assign_cfs_rq_runtime(), but this codepath has no call to
> unthrottle_cfs_rq().
>
> This can leave us in a situation where we have a throttled cfs_rq with
> positive ->runtime_remaining, which breaks the math in
> distribute_cfs_runtime(): this function expects a negative value so that
> it may safely negate it into a positive value.
>
> Add the missing unthrottle_cfs_rq(). While at it, add a WARN_ON where
> we expect negative values, and pull in a comment from the mailing list
> that didn't make it in [1].
>
> [1]: https://lkml.kernel.org/r/BANLkTi=NmCxKX6EbDQcJYDJ5kKyG2N1ssw@mail.gmail.com
>
> Cc: <stable@vger.kernel.org>
> Fixes: ec12cb7f31e2 ("sched: Accumulate per-cfs_rq cpu usage and charge against bandwidth")
> Reported-by: Liangyan <liangyan.peng@linux.alibaba.com>
> Signed-off-by: Valentin Schneider <valentin.schneider@arm.com>
Thanks!
> ---
> kernel/sched/fair.c | 17 ++++++++++++-----
> 1 file changed, 12 insertions(+), 5 deletions(-)
>
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index 1054d2cf6aaa..219ff3f328e5 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -4385,6 +4385,11 @@ static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq)
>  	return rq_clock_task(rq_of(cfs_rq)) - cfs_rq->throttled_clock_task_time;
>  }
>
> +static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq)
> +{
> +	return cfs_bandwidth_used() && cfs_rq->throttled;
> +}
> +
>  /* returns 0 on failure to allocate runtime */
>  static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq)
>  {
> @@ -4411,6 +4416,9 @@ static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq)
>
>  	cfs_rq->runtime_remaining += amount;
>
> +	if (cfs_rq->runtime_remaining > 0 && cfs_rq_throttled(cfs_rq))
> +		unthrottle_cfs_rq(cfs_rq);
> +
>  	return cfs_rq->runtime_remaining > 0;
>  }
>
> @@ -4439,11 +4447,6 @@ void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec)
>  	__account_cfs_rq_runtime(cfs_rq, delta_exec);
>  }
>
> -static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq)
> -{
> -	return cfs_bandwidth_used() && cfs_rq->throttled;
> -}
> -
>  /* check whether cfs_rq, or any parent, is throttled */
>  static inline int throttled_hierarchy(struct cfs_rq *cfs_rq)
>  {
> @@ -4628,6 +4631,10 @@ static u64 distribute_cfs_runtime(struct cfs_bandwidth *cfs_b, u64 remaining)
>  		if (!cfs_rq_throttled(cfs_rq))
>  			goto next;
>
> +		/* By the above check, this should never be true */
> +		WARN_ON(cfs_rq->runtime_remaining > 0);
> +
> +		/* Pick the minimum amount to return to a positive quota state */
>  		runtime = -cfs_rq->runtime_remaining + 1;
>  		if (runtime > remaining)
>  			runtime = remaining;
> --
> 2.22.0
>
* Re: [PATCH] sched/fair: Add missing unthrottle_cfs_rq()
2019-08-22 9:21 ` Peter Zijlstra
@ 2019-08-22 17:43 ` bsegall
0 siblings, 0 replies; 7+ messages in thread
From: bsegall @ 2019-08-22 17:43 UTC
To: Peter Zijlstra
Cc: Valentin Schneider, linux-kernel, mingo, liangyan.peng, shanpeic,
xlpang, pjt, stable
Peter Zijlstra <peterz@infradead.org> writes:
> On Tue, Aug 20, 2019 at 11:54:20AM +0100, Valentin Schneider wrote:
>> Turns out a cfs_rq->runtime_remaining can become positive in
>> assign_cfs_rq_runtime(), but this codepath has no call to
>> unthrottle_cfs_rq().
>>
>> This can leave us in a situation where we have a throttled cfs_rq with
>> positive ->runtime_remaining, which breaks the math in
>> distribute_cfs_runtime(): this function expects a negative value so that
>> it may safely negate it into a positive value.
>>
>> Add the missing unthrottle_cfs_rq(). While at it, add a WARN_ON where
>> we expect negative values, and pull in a comment from the mailing list
>> that didn't make it in [1].
This didn't exist because it's not supposed to be possible to call
account_cfs_rq_runtime on a throttled cfs_rq at all, so that's the
invariant being violated. Do you know what the code path causing this
looks like?
This would allow both list del and add while distribute is doing a
foreach, but I think that the racing behavior would just be to restart
the distribute loop, which is fine.
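For context, the loop in question (condensed from distribute_cfs_runtime()
of that era) looks something like:

	list_for_each_entry_rcu(cfs_rq, &cfs_b->throttled_cfs_rq,
				throttled_list) {
		...
		if (!cfs_rq_throttled(cfs_rq))
			goto next;
		...
		/* unthrottle_cfs_rq() removes cfs_rq from this list */
		if (cfs_rq->runtime_remaining > 0)
			unthrottle_cfs_rq(cfs_rq);
	next:
		...
	}

so with the patch, an unthrottle coming from assign_cfs_rq_runtime() means
entries can now also be deleted (and re-added) underneath this iteration.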
>>
>> [1]: https://lkml.kernel.org/r/BANLkTi=NmCxKX6EbDQcJYDJ5kKyG2N1ssw@mail.gmail.com
>>
>> Cc: <stable@vger.kernel.org>
>> Fixes: ec12cb7f31e2 ("sched: Accumulate per-cfs_rq cpu usage and charge against bandwidth")
>> Reported-by: Liangyan <liangyan.peng@linux.alibaba.com>
>> Signed-off-by: Valentin Schneider <valentin.schneider@arm.com>
>
> Thanks!
>
>> ---
>> kernel/sched/fair.c | 17 ++++++++++++-----
>> 1 file changed, 12 insertions(+), 5 deletions(-)
>>
>> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
>> index 1054d2cf6aaa..219ff3f328e5 100644
>> --- a/kernel/sched/fair.c
>> +++ b/kernel/sched/fair.c
>> @@ -4385,6 +4385,11 @@ static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq)
>>  	return rq_clock_task(rq_of(cfs_rq)) - cfs_rq->throttled_clock_task_time;
>>  }
>>
>> +static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq)
>> +{
>> +	return cfs_bandwidth_used() && cfs_rq->throttled;
>> +}
>> +
>>  /* returns 0 on failure to allocate runtime */
>>  static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq)
>>  {
>> @@ -4411,6 +4416,9 @@ static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq)
>>
>>  	cfs_rq->runtime_remaining += amount;
>>
>> +	if (cfs_rq->runtime_remaining > 0 && cfs_rq_throttled(cfs_rq))
>> +		unthrottle_cfs_rq(cfs_rq);
>> +
>>  	return cfs_rq->runtime_remaining > 0;
>>  }
>>
>> @@ -4439,11 +4447,6 @@ void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec)
>>  	__account_cfs_rq_runtime(cfs_rq, delta_exec);
>>  }
>>
>> -static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq)
>> -{
>> -	return cfs_bandwidth_used() && cfs_rq->throttled;
>> -}
>> -
>>  /* check whether cfs_rq, or any parent, is throttled */
>>  static inline int throttled_hierarchy(struct cfs_rq *cfs_rq)
>>  {
>> @@ -4628,6 +4631,10 @@ static u64 distribute_cfs_runtime(struct cfs_bandwidth *cfs_b, u64 remaining)
>>  		if (!cfs_rq_throttled(cfs_rq))
>>  			goto next;
>>
>> +		/* By the above check, this should never be true */
>> +		WARN_ON(cfs_rq->runtime_remaining > 0);
>> +
>> +		/* Pick the minimum amount to return to a positive quota state */
>>  		runtime = -cfs_rq->runtime_remaining + 1;
>>  		if (runtime > remaining)
>>  			runtime = remaining;
>> --
>> 2.22.0
>>
* Re: [PATCH] sched/fair: Add missing unthrottle_cfs_rq()
2019-08-20 10:54 ` [PATCH] sched/fair: Add missing unthrottle_cfs_rq() Valentin Schneider
2019-08-22 9:21 ` Peter Zijlstra
@ 2019-08-22 18:48 ` bsegall
2019-08-22 20:40 ` Valentin Schneider
2019-08-23 7:22 ` Liangyan
1 sibling, 2 replies; 7+ messages in thread
From: bsegall @ 2019-08-22 18:48 UTC
To: Valentin Schneider
Cc: linux-kernel, mingo, peterz, liangyan.peng, shanpeic, xlpang, pjt,
stable
Valentin Schneider <valentin.schneider@arm.com> writes:
> Turns out a cfs_rq->runtime_remaining can become positive in
> assign_cfs_rq_runtime(), but this codepath has no call to
> unthrottle_cfs_rq().
>
> This can leave us in a situation where we have a throttled cfs_rq with
> positive ->runtime_remaining, which breaks the math in
> distribute_cfs_runtime(): this function expects a negative value so that
> it may safely negate it into a positive value.
>
> Add the missing unthrottle_cfs_rq(). While at it, add a WARN_ON where
> we expect negative values, and pull in a comment from the mailing list
> that didn't make it in [1].
>
> [1]: https://lkml.kernel.org/r/BANLkTi=NmCxKX6EbDQcJYDJ5kKyG2N1ssw@mail.gmail.com
>
> Cc: <stable@vger.kernel.org>
> Fixes: ec12cb7f31e2 ("sched: Accumulate per-cfs_rq cpu usage and charge against bandwidth")
> Reported-by: Liangyan <liangyan.peng@linux.alibaba.com>
> Signed-off-by: Valentin Schneider <valentin.schneider@arm.com>
Having now seen the rest of the thread:
Could you send the repro, as it doesn't seem to have reached lkml, so
that I can confirm my guess as to what's going on?
It seems most likely we throttle during one of the remove-change-adds in
set_cpus_allowed and friends or during the put half of pick_next_task
followed by idle balance to drop the lock. Then distribute races with a
later assign_cfs_rq_runtime so that the account finds runtime in the
cfs_b.
Re clock_task: it's only frozen for the purposes of PELT, not delta_exec.

The other possible way to fix this would be to skip assign if throttled,
since the only time it could succeed is if we're racing with a
distribute that will unthrottle us anyway.

The main advantage of that is avoiding the risk of screwy behavior due to
unthrottling in the middle of pick_next/put_prev. The disadvantage is that
here we already hold the lock, so if the unthrottle works we don't need an
IPI to trigger a preempt, etc. (But I think one of the issues is that we
may trigger the preempt on the previous task, not the next, and I'm not
100% sure that will carry over correctly.)
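A minimal sketch of that alternative (the same idea as the commented-out
check in Liangyan's debug patch further down the thread; untested here,
details elided with ...):

	static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec)
	{
		cfs_rq->runtime_remaining -= delta_exec;
		...
		if (likely(cfs_rq->runtime_remaining > 0))
			return;

		/* already throttled: don't pull runtime, a racing
		 * distribute_cfs_runtime() will unthrottle us anyway */
		if (cfs_rq_throttled(cfs_rq))
			return;

		/*
		 * if we're unable to extend our runtime we resched so that the
		 * active hierarchy can be throttled
		 */
		if (!assign_cfs_rq_runtime(cfs_rq) && likely(cfs_rq->curr))
			resched_curr(rq_of(cfs_rq));
	}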
> ---
> kernel/sched/fair.c | 17 ++++++++++++-----
> 1 file changed, 12 insertions(+), 5 deletions(-)
>
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index 1054d2cf6aaa..219ff3f328e5 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -4385,6 +4385,11 @@ static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq)
>  	return rq_clock_task(rq_of(cfs_rq)) - cfs_rq->throttled_clock_task_time;
>  }
>
> +static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq)
> +{
> +	return cfs_bandwidth_used() && cfs_rq->throttled;
> +}
> +
>  /* returns 0 on failure to allocate runtime */
>  static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq)
>  {
> @@ -4411,6 +4416,9 @@ static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq)
>
>  	cfs_rq->runtime_remaining += amount;
>
> +	if (cfs_rq->runtime_remaining > 0 && cfs_rq_throttled(cfs_rq))
> +		unthrottle_cfs_rq(cfs_rq);
> +
>  	return cfs_rq->runtime_remaining > 0;
>  }
>
> @@ -4439,11 +4447,6 @@ void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec)
>  	__account_cfs_rq_runtime(cfs_rq, delta_exec);
>  }
>
> -static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq)
> -{
> -	return cfs_bandwidth_used() && cfs_rq->throttled;
> -}
> -
>  /* check whether cfs_rq, or any parent, is throttled */
>  static inline int throttled_hierarchy(struct cfs_rq *cfs_rq)
>  {
> @@ -4628,6 +4631,10 @@ static u64 distribute_cfs_runtime(struct cfs_bandwidth *cfs_b, u64 remaining)
>  		if (!cfs_rq_throttled(cfs_rq))
>  			goto next;
>
> +		/* By the above check, this should never be true */
> +		WARN_ON(cfs_rq->runtime_remaining > 0);
> +
> +		/* Pick the minimum amount to return to a positive quota state */
>  		runtime = -cfs_rq->runtime_remaining + 1;
>  		if (runtime > remaining)
>  			runtime = remaining;
* Re: [PATCH] sched/fair: Add missing unthrottle_cfs_rq()
2019-08-22 18:48 ` bsegall
@ 2019-08-22 20:40 ` Valentin Schneider
2019-08-22 21:10 ` Valentin Schneider
2019-08-23 7:22 ` Liangyan
1 sibling, 1 reply; 7+ messages in thread
From: Valentin Schneider @ 2019-08-22 20:40 UTC
To: bsegall
Cc: linux-kernel, mingo, peterz, liangyan.peng, shanpeic, xlpang, pjt,
stable
On 22/08/2019 19:48, bsegall@google.com wrote:
> Having now seen the rest of the thread:
>
> Could you send the repro, as it doesn't seem to have reached lkml, so
> that I can confirm my guess as to what's going on?
>
Huh, odd. Here's the thing:
delay.c:
#include <stdlib.h>
#include <stdio.h>
#include <unistd.h>
#include <pthread.h>

unsigned long NUM_LOOPS = 2500000*250;

/* simple loop based delay: */
static void delay_loop(unsigned long loops)
{
	asm volatile(
		"	test %0,%0	\n"
		"	jz 3f		\n"
		"	jmp 1f		\n"

		".align 16		\n"
		"1:	jmp 2f		\n"

		".align 16		\n"
		"2:	dec %0		\n"
		"	jnz 2b		\n"
		"3:	dec %0		\n"

		: /* we don't need output */
		: "a" (loops)
	);
}

void __const_udelay(unsigned long xloops)
{
	int d0;

	xloops *= 4;
	asm("mull %%edx"
		: "=d" (xloops), "=&a" (d0)
		: "1" (xloops), "0" (NUM_LOOPS));

	delay_loop(++xloops);
}

void __udelay(unsigned long usecs)
{
	__const_udelay(usecs * 0x000010c7); /* 2**32 / 1000000 (rounded up) */
}

static void *thread_start(void *arg)
{
	while (1) {
		__udelay((unsigned long)arg);
		usleep(7000);
	}
}

int main(int argc, char *argv[])
{
	int i;
	int thread;
	unsigned long timeout;
	pthread_t new_th;

	if (argc != 3) {
		printf("./delay nr_thread work_loop\n");
		exit(-1);
	}

	thread = atoi(argv[1]);
	timeout = (unsigned long)atoi(argv[2]);

	for (i = 0; i < thread; i++) {
		pthread_create(&new_th, NULL, thread_start, (void *)timeout);
		usleep(100);
	}

	while (1)
		sleep(10);
}
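(The thread doesn't include a build line; something like

	gcc -pthread -o delay delay.c

should do.)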
do-delay.sh:
#!/bin/bash

mkdir /sys/fs/cgroup/cpu/test1
echo 100000 > /sys/fs/cgroup/cpu/test1/cpu.cfs_period_us
echo 1600000 > /sys/fs/cgroup/cpu/test1/cpu.cfs_quota_us
./delay 500 1000 &
echo $! > /sys/fs/cgroup/cpu/test1/cgroup.procs

mkdir /sys/fs/cgroup/cpu/test2
echo 100000 > /sys/fs/cgroup/cpu/test2/cpu.cfs_period_us
echo 800000 > /sys/fs/cgroup/cpu/test2/cpu.cfs_quota_us
./delay 500 1000 &
echo $! > /sys/fs/cgroup/cpu/test2/cgroup.procs

prev=0; while true; do curr=`cat /sys/fs/cgroup/cpu/test1/cpuacct.usage` && echo $(($curr-$prev)) && prev=$curr && sleep 1; done
I ran the thing on a dual-socket x86 test box and could trigger the issue
quite rapidly (w/ the WARN_ON in distribute_cfs_runtime()).
> It seems most likely we throttle during one of the remove-change-adds in
> set_cpus_allowed and friends or during the put half of pick_next_task
> followed by idle balance to drop the lock. Then distribute races with a
> later assign_cfs_rq_runtime so that the account finds runtime in the
> cfs_b.
>
I should still have a trace lying around, let me have a look.
> Re clock_task: it's only frozen for the purposes of PELT, not delta_exec.
>
Noted, thanks. But then we shouldn't expect throttled rq's to call into
update_curr(), right? Maybe just right after they've been throttled, but not
beyond that. Otherwise I fail to see how that would make sense.
> The other possible way to fix this would be to skip assign if throttled,
> since the only time it could succeed is if we're racing with a
> distribute that will unthrottle us anyway.
>
So pretty much the change Liangyan originally proposed? (so much for trying
to help :p)
> The main advantage of that is avoiding the risk of screwy behavior due to
> unthrottling in the middle of pick_next/put_prev. The disadvantage is that
> here we already hold the lock, so if the unthrottle works we don't need an
> IPI to trigger a preempt, etc. (But I think one of the issues is that we
> may trigger the preempt on the previous task, not the next, and I'm not
> 100% sure that will carry over correctly.)
>
* Re: [PATCH] sched/fair: Add missing unthrottle_cfs_rq()
2019-08-22 20:40 ` Valentin Schneider
@ 2019-08-22 21:10 ` Valentin Schneider
0 siblings, 0 replies; 7+ messages in thread
From: Valentin Schneider @ 2019-08-22 21:10 UTC
To: bsegall
Cc: linux-kernel, mingo, peterz, liangyan.peng, shanpeic, xlpang, pjt,
stable
On 22/08/2019 21:40, Valentin Schneider wrote:
> On 22/08/2019 19:48, bsegall@google.com wrote:
Re we shouldn't get account_cfs_rq_runtime() called on throttled cfs_rq's,
with this:
---
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 171eef3f08f9..1acb88024cad 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -4385,6 +4385,11 @@ static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq)
 	return rq_clock_task(rq_of(cfs_rq)) - cfs_rq->throttled_clock_task_time;
 }
 
+static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq)
+{
+	return cfs_bandwidth_used() && cfs_rq->throttled;
+}
+
 /* returns 0 on failure to allocate runtime */
 static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq)
 {
@@ -4411,6 +4416,8 @@ static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq)
 
 	cfs_rq->runtime_remaining += amount;
 
+	WARN_ON(cfs_rq_throttled(cfs_rq) && cfs_rq->runtime_remaining > 0);
+
 	return cfs_rq->runtime_remaining > 0;
 }
 
@@ -4436,12 +4443,9 @@ void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec)
 	if (!cfs_bandwidth_used() || !cfs_rq->runtime_enabled)
 		return;
 
-	__account_cfs_rq_runtime(cfs_rq, delta_exec);
-}
+	WARN_ON(cfs_rq_throttled(cfs_rq));
 
-static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq)
-{
-	return cfs_bandwidth_used() && cfs_rq->throttled;
+	__account_cfs_rq_runtime(cfs_rq, delta_exec);
 }
 
 /* check whether cfs_rq, or any parent, is throttled */
---
I get this:
[ 204.798643] Call Trace:
[ 204.798645] put_prev_entity+0x8d/0x100
[ 204.798647] put_prev_task_fair+0x22/0x40
[ 204.798648] pick_next_task_idle+0x36/0x50
[ 204.798650] __schedule+0x61d/0x6c0
[ 204.798651] schedule+0x2d/0x90
[ 204.798653] exit_to_usermode_loop+0x61/0x100
[ 204.798654] prepare_exit_to_usermode+0x91/0xa0
[ 204.798656] retint_user+0x8/0x8
(this is a hit on the account_cfs_rq_runtime() WARN_ON)
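For reference, put_prev_entity() as of that kernel is roughly:

	static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev)
	{
		/*
		 * If still on the runqueue then deactivate_task()
		 * was not called and update_curr() has to be done:
		 */
		if (prev->on_rq)
			update_curr(cfs_rq);	/* -> account_cfs_rq_runtime() */

		/* throttle cfs_rqs exceeding runtime */
		check_cfs_rq_runtime(cfs_rq);
		...
	}

i.e. putting the previous entity can account runtime (and attempt an assign)
right after pick_next has throttled the cfs_rq.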
* Re: [PATCH] sched/fair: Add missing unthrottle_cfs_rq()
2019-08-22 18:48 ` bsegall
2019-08-22 20:40 ` Valentin Schneider
@ 2019-08-23 7:22 ` Liangyan
1 sibling, 0 replies; 7+ messages in thread
From: Liangyan @ 2019-08-23 7:22 UTC
To: bsegall, Valentin Schneider
Cc: linux-kernel, mingo, peterz, shanpeic, xlpang, pjt, stable
Resend.
Sorry, my previous email had a formatting issue.
On 19/8/23 2:48 AM, bsegall@google.com wrote:
> Valentin Schneider <valentin.schneider@arm.com> writes:
>
>> Turns out a cfs_rq->runtime_remaining can become positive in
>> assign_cfs_rq_runtime(), but this codepath has no call to
>> unthrottle_cfs_rq().
>>
>> This can leave us in a situation where we have a throttled cfs_rq with
>> positive ->runtime_remaining, which breaks the math in
>> distribute_cfs_runtime(): this function expects a negative value so that
>> it may safely negate it into a positive value.
>>
>> Add the missing unthrottle_cfs_rq(). While at it, add a WARN_ON where
>> we expect negative values, and pull in a comment from the mailing list
>> that didn't make it in [1].
>>
>> [1]: https://lkml.kernel.org/r/BANLkTi=NmCxKX6EbDQcJYDJ5kKyG2N1ssw@mail.gmail.com
>>
>> Cc: <stable@vger.kernel.org>
>> Fixes: ec12cb7f31e2 ("sched: Accumulate per-cfs_rq cpu usage and charge against bandwidth")
>> Reported-by: Liangyan <liangyan.peng@linux.alibaba.com>
>> Signed-off-by: Valentin Schneider <valentin.schneider@arm.com>
>
> Having now seen the rest of the thread:
>
> Could you send the repro, as it doesn't seem to have reached lkml, so
> that I can confirm my guess as to what's going on?
>
> It seems most likely we throttle during one of the remove-change-adds in
> set_cpus_allowed and friends or during the put half of pick_next_task
> followed by idle balance to drop the lock. Then distribute races with a
> later assign_cfs_rq_runtime so that the account finds runtime in the
> cfs_b.
>
pick_next_task_fair() calls update_curr(), but gets zero runtime because
cfs_b->runtime == 0; it then throttles the current cfs_rq because
cfs_rq->runtime_remaining == 0. It then calls put_prev_entity(), which
reaches __account_cfs_rq_runtime() and assigns new runtime, because
dequeue_entity() on another CPU has just called return_cfs_rq_runtime()
and returned some runtime to cfs_b->runtime.
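Rough timeline of the above (as I read the traces below):

	CPU 20: pick_next_task_fair()
	          update_curr() -> no runtime left (cfs_b->runtime == 0)
	          check_cfs_rq_runtime() -> throttle_cfs_rq()  /* ->throttled = 1 */
	other CPU: dequeue_entity() -> return_cfs_rq_runtime() /* cfs_b->runtime > 0 */
	CPU 20: put_prev_entity() -> update_curr()
	          -> __account_cfs_rq_runtime() -> assign succeeds, and
	             runtime_remaining goes positive while still throttled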
Please check the debug patch and log below to trace this logic.
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index f35930f..0da556c 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -4395,6 +4395,12 @@ static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec)
 	if (likely(cfs_rq->runtime_remaining > 0))
 		return;
 
+	if (cfs_rq->throttled && smp_processor_id()==20) {
+		pr_err("__account_cfs_rq_runtime:cfs_rq=%p,comm=%s,pid=%d,cpu=%d\n",cfs_rq,current->comm,current->pid,smp_processor_id());
+		dump_stack();
+	}
+	//if (cfs_rq->throttled)
+	//	return;
 	/*
 	 * if we're unable to extend our runtime we resched so that the active
 	 * hierarchy can be throttled
@@ -4508,6 +4514,13 @@ static void throttle_cfs_rq(struct cfs_rq *cfs_rq)
 		sub_nr_running(rq, task_delta);
 
 	cfs_rq->throttled = 1;
+	{
+		if (cfs_rq->nr_running > 0 && smp_processor_id()==20) {
+			pr_err("throttle_cfs_rq:cfs_rq=%p,comm=%s,pid=%d,cpu=%d\n",cfs_rq,current->comm,current->pid,smp_processor_id());
+			dump_stack();
+		}
+	}
 	cfs_rq->throttled_clock = rq_clock(rq);
 	raw_spin_lock(&cfs_b->lock);
 	empty = list_empty(&cfs_b->throttled_cfs_rq);
[ 257.406397] throttle_cfs_rq:cfs_rq=00000000b012d731,comm=delay,pid=4307,cpu=20
[ 257.407251] CPU: 20 PID: 4307 Comm: delay Tainted: G W E 5.2.0-rc3+ #9
[ 257.408795] Call Trace:
[ 257.409085] dump_stack+0x5c/0x7b
[ 257.409482] throttle_cfs_rq+0x2c3/0x2d0
[ 257.409940] check_cfs_rq_runtime+0x2f/0x50
[ 257.410430] pick_next_task_fair+0xb1/0x740
[ 257.410918] __schedule+0x104/0x670
[ 257.411333] schedule+0x33/0x90
[ 257.411706] exit_to_usermode_loop+0x6a/0xf0
[ 257.412201] prepare_exit_to_usermode+0x80/0xc0
[ 257.412734] retint_user+0x8/0x8
[ 257.413114] RIP: 0033:0x4006d0
[ 257.413480] Code: 7d f8 48 8b 45 f8 48 85 c0 74 24 eb 0d 0f 1f 00 66 2e 0f 1f 84 00 00 00 00 00 eb 0e 0f 1f 40 00 66 2e 0f 1f 84 00 00 00 00 00 <48> ff c8 75 fb 48 ff c8 90 5d c3 55 48 89 e5 48 83 ec 18 48 89 7d
[ 257.415630] RSP: 002b:00007f9b74abbe90 EFLAGS: 00000206 ORIG_RAX: ffffffffffffff13
[ 257.416508] RAX: 0000000000060f7b RBX: 0000000000000000 RCX: 0000000000000000
[ 257.417333] RDX: 00000000002625b3 RSI: 0000000000000000 RDI: 00000000002625b4
[ 257.418155] RBP: 00007f9b74abbe90 R08: 00007f9b74abc700 R09: 00007f9b74abc700
[ 257.418983] R10: 00007f9b74abc9d0 R11: 0000000000000000 R12: 00007ffee72e1afe
[ 257.419813] R13: 00007ffee72e1aff R14: 00007ffee72e1b00 R15: 0000000000000000
[ 257.420718] __account_cfs_rq_runtime:cfs_rq=00000000b012d731,comm=delay,pid=4307,cpu=20
[ 257.421646] CPU: 20 PID: 4307 Comm: delay Tainted: G W E 5.2.0-rc3+ #9
[ 257.422538] Call Trace:
[ 257.424712] dump_stack+0x5c/0x7b
[ 257.425101] __account_cfs_rq_runtime+0x1d7/0x1e0
[ 257.425656] put_prev_entity+0x90/0x100
[ 257.426102] pick_next_task_fair+0x334/0x740
[ 257.426605] __schedule+0x104/0x670
[ 257.427013] schedule+0x33/0x90
[ 257.427384] exit_to_usermode_loop+0x6a/0xf0
[ 257.427879] prepare_exit_to_usermode+0x80/0xc0
[ 257.428406] retint_user+0x8/0x8
[ 257.428783] RIP: 0033:0x4006d0
> Re clock_task: it's only frozen for the purposes of PELT, not delta_exec.
>
> The other possible way to fix this would be to skip assign if throttled,
> since the only time it could succeed is if we're racing with a
> distribute that will unthrottle us anyway.
I posted a similar patch here earlier:
https://lkml.org/lkml/2019/8/14/1176
>
> The main advantage of that is avoiding the risk of screwy behavior due to
> unthrottling in the middle of pick_next/put_prev. The disadvantage is that
> here we already hold the lock, so if the unthrottle works we don't need an
> IPI to trigger a preempt, etc. (But I think one of the issues is that we
> may trigger the preempt on the previous task, not the next, and I'm not
> 100% sure that will carry over correctly.)
>
>
>
>> ---
>> kernel/sched/fair.c | 17 ++++++++++++-----
>> 1 file changed, 12 insertions(+), 5 deletions(-)
>>
>> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
>> index 1054d2cf6aaa..219ff3f328e5 100644
>> --- a/kernel/sched/fair.c
>> +++ b/kernel/sched/fair.c
>> @@ -4385,6 +4385,11 @@ static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq)
>>  	return rq_clock_task(rq_of(cfs_rq)) - cfs_rq->throttled_clock_task_time;
>>  }
>>
>> +static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq)
>> +{
>> +	return cfs_bandwidth_used() && cfs_rq->throttled;
>> +}
>> +
>>  /* returns 0 on failure to allocate runtime */
>>  static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq)
>>  {
>> @@ -4411,6 +4416,9 @@ static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq)
>>
>>  	cfs_rq->runtime_remaining += amount;
>>
>> +	if (cfs_rq->runtime_remaining > 0 && cfs_rq_throttled(cfs_rq))
>> +		unthrottle_cfs_rq(cfs_rq);
>> +
>>  	return cfs_rq->runtime_remaining > 0;
>>  }
>>
>> @@ -4439,11 +4447,6 @@ void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec)
>>  	__account_cfs_rq_runtime(cfs_rq, delta_exec);
>>  }
>>
>> -static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq)
>> -{
>> -	return cfs_bandwidth_used() && cfs_rq->throttled;
>> -}
>> -
>>  /* check whether cfs_rq, or any parent, is throttled */
>>  static inline int throttled_hierarchy(struct cfs_rq *cfs_rq)
>>  {
>> @@ -4628,6 +4631,10 @@ static u64 distribute_cfs_runtime(struct cfs_bandwidth *cfs_b, u64 remaining)
>>  		if (!cfs_rq_throttled(cfs_rq))
>>  			goto next;
>>
>> +		/* By the above check, this should never be true */
>> +		WARN_ON(cfs_rq->runtime_remaining > 0);
>> +
>> +		/* Pick the minimum amount to return to a positive quota state */
>>  		runtime = -cfs_rq->runtime_remaining + 1;
>>  		if (runtime > remaining)
>>  			runtime = remaining;