[PATCH v3] sched: async unthrottling for cfs bandwidth

public inbox for linux-kernel@vger.kernel.org
 help / color / mirror / Atom feed

* [PATCH v3] sched: async unthrottling for cfs bandwidth
@ 2022-11-17  0:54 Josh Don
  2022-11-18 12:47 ` Peter Zijlstra
  2022-12-27 12:13 ` [tip: sched/core] sched: Async " tip-bot2 for Josh Don
  0 siblings, 2 replies; 21+ messages in thread
From: Josh Don @ 2022-11-17  0:54 UTC (permalink / raw)
  To: Ingo Molnar, Peter Zijlstra, Juri Lelli, Vincent Guittot
  Cc: Dietmar Eggemann, Steven Rostedt, Ben Segall, Mel Gorman,
	Daniel Bristot de Oliveira, Valentin Schneider, linux-kernel,
	Tejun Heo, Michal Koutný, Christian Brauner, Zefan Li,
	Josh Don

CFS bandwidth currently distributes new runtime and unthrottles cfs_rq's
inline in an hrtimer callback. Runtime distribution is a per-cpu
operation, and unthrottling is a per-cgroup operation, since a tg walk
is required. On machines with a large number of cpus and large cgroup
hierarchies, this cpus*cgroups work can be too much to do in a single
hrtimer callback: since IRQ are disabled, hard lockups may easily occur.
Specifically, we've found this scalability issue on configurations with
256 cpus, O(1000) cgroups in the hierarchy being throttled, and high
memory bandwidth usage.

To fix this, we can instead unthrottle cfs_rq's asynchronously via a
CSD. Each cpu is responsible for unthrottling itself, thus sharding the
total work more fairly across the system, and avoiding hard lockups.

Signed-off-by: Josh Don <joshdon@google.com>
---
v2: Fixed !CONFIG_SMP build errors
v3: Removed the throttled_csd_count atomic

 kernel/sched/fair.c  | 127 ++++++++++++++++++++++++++++++++++++++++---
 kernel/sched/sched.h |   8 +++
 2 files changed, 126 insertions(+), 9 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 4cc56c91e06e..012ec9d03811 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -5449,10 +5449,77 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
 		resched_curr(rq);
 }
 
-static void distribute_cfs_runtime(struct cfs_bandwidth *cfs_b)
+#ifdef CONFIG_SMP
+static void __cfsb_csd_unthrottle(void *arg)
+{
+	struct rq *rq = arg;
+	struct rq_flags rf;
+	struct cfs_rq *cursor, *tmp;
+
+	rq_lock(rq, &rf);
+
+	/*
+	 * Since we hold rq lock we're safe from concurrent manipulation of
+	 * the CSD list. However, this RCU critical section annotates the
+	 * fact that we pair with sched_free_group_rcu(), so that we cannot
+	 * race with group being freed in the window between removing it
+	 * from the list and advancing to the next entry in the list.
+	 */
+	rcu_read_lock();
+
+	list_for_each_entry_safe(cursor, tmp, &rq->cfsb_csd_list,
+				 throttled_csd_list) {
+		list_del_init(&cursor->throttled_csd_list);
+
+		if (cfs_rq_throttled(cursor))
+			unthrottle_cfs_rq(cursor);
+	}
+
+	rcu_read_unlock();
+
+	rq_unlock(rq, &rf);
+}
+
+static inline void __unthrottle_cfs_rq_async(struct cfs_rq *cfs_rq)
+{
+	struct rq *rq = rq_of(cfs_rq);
+
+	if (rq == this_rq()) {
+		unthrottle_cfs_rq(cfs_rq);
+		return;
+	}
+
+	/* Already enqueued */
+	if (SCHED_WARN_ON(!list_empty(&cfs_rq->throttled_csd_list)))
+		return;
+
+	list_add_tail(&cfs_rq->throttled_csd_list, &rq->cfsb_csd_list);
+
+	smp_call_function_single_async(cpu_of(rq), &rq->cfsb_csd);
+}
+#else
+static inline void __unthrottle_cfs_rq_async(struct cfs_rq *cfs_rq)
+{
+	unthrottle_cfs_rq(cfs_rq);
+}
+#endif
+
+static void unthrottle_cfs_rq_async(struct cfs_rq *cfs_rq)
+{
+	lockdep_assert_rq_held(rq_of(cfs_rq));
+
+	if (SCHED_WARN_ON(!cfs_rq_throttled(cfs_rq) ||
+	    cfs_rq->runtime_remaining <= 0))
+		return;
+
+	__unthrottle_cfs_rq_async(cfs_rq);
+}
+
+static bool distribute_cfs_runtime(struct cfs_bandwidth *cfs_b)
 {
 	struct cfs_rq *cfs_rq;
 	u64 runtime, remaining = 1;
+	bool throttled = false;
 
 	rcu_read_lock();
 	list_for_each_entry_rcu(cfs_rq, &cfs_b->throttled_cfs_rq,
@@ -5460,11 +5527,22 @@ static void distribute_cfs_runtime(struct cfs_bandwidth *cfs_b)
 		struct rq *rq = rq_of(cfs_rq);
 		struct rq_flags rf;
 
+		if (!remaining) {
+			throttled = true;
+			break;
+		}
+
 		rq_lock_irqsave(rq, &rf);
 		if (!cfs_rq_throttled(cfs_rq))
 			goto next;
 
-		/* By the above check, this should never be true */
+#ifdef CONFIG_SMP
+		/* Already queued for async unthrottle */
+		if (!list_empty(&cfs_rq->throttled_csd_list))
+			goto next;
+#endif
+
+		/* By the above checks, this should never be true */
 		SCHED_WARN_ON(cfs_rq->runtime_remaining > 0);
 
 		raw_spin_lock(&cfs_b->lock);
@@ -5479,15 +5557,14 @@ static void distribute_cfs_runtime(struct cfs_bandwidth *cfs_b)
 
 		/* we check whether we're throttled above */
 		if (cfs_rq->runtime_remaining > 0)
-			unthrottle_cfs_rq(cfs_rq);
+			unthrottle_cfs_rq_async(cfs_rq);
 
 next:
 		rq_unlock_irqrestore(rq, &rf);
-
-		if (!remaining)
-			break;
 	}
 	rcu_read_unlock();
+
+	return throttled;
 }
 
 /*
@@ -5532,10 +5609,8 @@ static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun, u
 	while (throttled && cfs_b->runtime > 0) {
 		raw_spin_unlock_irqrestore(&cfs_b->lock, flags);
 		/* we can't nest cfs_b->lock while distributing bandwidth */
-		distribute_cfs_runtime(cfs_b);
+		throttled = distribute_cfs_runtime(cfs_b);
 		raw_spin_lock_irqsave(&cfs_b->lock, flags);
-
-		throttled = !list_empty(&cfs_b->throttled_cfs_rq);
 	}
 
 	/*
@@ -5812,6 +5887,9 @@ static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq)
 {
 	cfs_rq->runtime_enabled = 0;
 	INIT_LIST_HEAD(&cfs_rq->throttled_list);
+#ifdef CONFIG_SMP
+	INIT_LIST_HEAD(&cfs_rq->throttled_csd_list);
+#endif
 }
 
 void start_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
@@ -5828,12 +5906,38 @@ void start_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
 
 static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
 {
+	int __maybe_unused i;
+
 	/* init_cfs_bandwidth() was not called */
 	if (!cfs_b->throttled_cfs_rq.next)
 		return;
 
 	hrtimer_cancel(&cfs_b->period_timer);
 	hrtimer_cancel(&cfs_b->slack_timer);
+
+	/*
+	 * It is possible that we still have some cfs_rq's pending on a CSD
+	 * list, though this race is very rare. In order for this to occur, we
+	 * must have raced with the last task leaving the group while there
+	 * exist throttled cfs_rq(s), and the period_timer must have queued the
+	 * CSD item but the remote cpu has not yet processed it. To handle this,
+	 * we can simply flush all pending CSD work inline here. We're
+	 * guaranteed at this point that no additional cfs_rq of this group can
+	 * join a CSD list.
+	 */
+#ifdef CONFIG_SMP
+	for_each_possible_cpu(i) {
+		struct rq *rq = cpu_rq(i);
+		unsigned long flags;
+
+		if (list_empty(&rq->cfsb_csd_list))
+			continue;
+
+		local_irq_save(flags);
+		__cfsb_csd_unthrottle(rq);
+		local_irq_restore(flags);
+	}
+#endif
 }
 
 /*
@@ -12462,6 +12566,11 @@ __init void init_sched_fair_class(void)
 	for_each_possible_cpu(i) {
 		zalloc_cpumask_var_node(&per_cpu(load_balance_mask, i), GFP_KERNEL, cpu_to_node(i));
 		zalloc_cpumask_var_node(&per_cpu(select_rq_mask,    i), GFP_KERNEL, cpu_to_node(i));
+
+#ifdef CONFIG_CFS_BANDWIDTH
+		INIT_CSD(&cpu_rq(i)->cfsb_csd, __cfsb_csd_unthrottle, cpu_rq(i));
+		INIT_LIST_HEAD(&cpu_rq(i)->cfsb_csd_list);
+#endif
 	}
 
 	open_softirq(SCHED_SOFTIRQ, run_rebalance_domains);
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 771f8ddb7053..b3d6e819127c 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -645,6 +645,9 @@ struct cfs_rq {
 	int			throttled;
 	int			throttle_count;
 	struct list_head	throttled_list;
+#ifdef CONFIG_SMP
+	struct list_head	throttled_csd_list;
+#endif
 #endif /* CONFIG_CFS_BANDWIDTH */
 #endif /* CONFIG_FAIR_GROUP_SCHED */
 };
@@ -1154,6 +1157,11 @@ struct rq {
 
 	/* Scratch cpumask to be temporarily used under rq_lock */
 	cpumask_var_t		scratch_mask;
+
+#if defined(CONFIG_CFS_BANDWIDTH) && defined(CONFIG_SMP)
+	call_single_data_t	cfsb_csd;
+	struct list_head	cfsb_csd_list;
+#endif
 };
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
-- 
2.38.1.431.g37b22c650d-goog


^ permalink raw reply related	[flat|nested] 21+ messages in thread

* Re: [PATCH v3] sched: async unthrottling for cfs bandwidth
  2022-11-17  0:54 [PATCH v3] sched: async unthrottling for cfs bandwidth Josh Don
@ 2022-11-18 12:47 ` Peter Zijlstra
  2022-11-18 19:25   ` Josh Don
  2022-12-27 12:13 ` [tip: sched/core] sched: Async " tip-bot2 for Josh Don
  1 sibling, 1 reply; 21+ messages in thread
From: Peter Zijlstra @ 2022-11-18 12:47 UTC (permalink / raw)
  To: Josh Don
  Cc: Ingo Molnar, Juri Lelli, Vincent Guittot, Dietmar Eggemann,
	Steven Rostedt, Ben Segall, Mel Gorman,
	Daniel Bristot de Oliveira, Valentin Schneider, linux-kernel,
	Tejun Heo, Michal Koutný, Christian Brauner, Zefan Li

On Wed, Nov 16, 2022 at 04:54:18PM -0800, Josh Don wrote:

> +#ifdef CONFIG_SMP
> +static void __cfsb_csd_unthrottle(void *arg)
> +{
> +	struct rq *rq = arg;
> +	struct rq_flags rf;
> +	struct cfs_rq *cursor, *tmp;
> +
> +	rq_lock(rq, &rf);
> +
> +	/*
> +	 * Since we hold rq lock we're safe from concurrent manipulation of
> +	 * the CSD list. However, this RCU critical section annotates the
> +	 * fact that we pair with sched_free_group_rcu(), so that we cannot
> +	 * race with group being freed in the window between removing it
> +	 * from the list and advancing to the next entry in the list.
> +	 */
> +	rcu_read_lock();

preempt_disable() -- through rq->lock -- also holds off rcu. Strictly
speaking this here is superfluous. But if you want it as an annotation,
that's fine I suppose.

> +
> +	list_for_each_entry_safe(cursor, tmp, &rq->cfsb_csd_list,
> +				 throttled_csd_list) {
> +		list_del_init(&cursor->throttled_csd_list);
> +
> +		if (cfs_rq_throttled(cursor))
> +			unthrottle_cfs_rq(cursor);
> +	}
> +
> +	rcu_read_unlock();
> +
> +	rq_unlock(rq, &rf);
> +}
> +
> +static inline void __unthrottle_cfs_rq_async(struct cfs_rq *cfs_rq)
> +{
> +	struct rq *rq = rq_of(cfs_rq);
> +
> +	if (rq == this_rq()) {
> +		unthrottle_cfs_rq(cfs_rq);
> +		return;
> +	}

Ideally we'd first queue all the remotes and then process local, but
given how all this is organized that doesn't seem trivial to arrange.

Maybe have this function return false when local and save that cfs_rq in
a local var to process again later, dunno, that might turn messy.

> +
> +	/* Already enqueued */
> +	if (SCHED_WARN_ON(!list_empty(&cfs_rq->throttled_csd_list)))
> +		return;
> +
> +	list_add_tail(&cfs_rq->throttled_csd_list, &rq->cfsb_csd_list);
> +
> +	smp_call_function_single_async(cpu_of(rq), &rq->cfsb_csd);

Hurmph.. so I was expecting something like:

	first = list_empty(&rq->cfsb_csd_list);
	list_add_tail(&cfs_rq->throttled_csd_list, &rq->cfsb_csd_list);
	if (first)
		smp_call_function_single_async(cpu_of(rq), &rq->cfsb_csd);

But I suppose I'm remembering the 'old' version. I don't think it is
broken as written. There's a very narrow window where you'll end up
sending a second IPI for naught, but meh.

> +}

Let me go queue this thing, we can always improve upon matters later.

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH v3] sched: async unthrottling for cfs bandwidth
  2022-11-18 12:47 ` Peter Zijlstra
@ 2022-11-18 19:25   ` Josh Don
  2022-11-20  2:22     ` Chengming Zhou
                       ` (2 more replies)
  0 siblings, 3 replies; 21+ messages in thread
From: Josh Don @ 2022-11-18 19:25 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Ingo Molnar, Juri Lelli, Vincent Guittot, Dietmar Eggemann,
	Steven Rostedt, Ben Segall, Mel Gorman,
	Daniel Bristot de Oliveira, Valentin Schneider, linux-kernel,
	Tejun Heo, Michal Koutný, Christian Brauner, Zefan Li

On Fri, Nov 18, 2022 at 4:47 AM Peter Zijlstra <peterz@infradead.org> wrote:
>
> preempt_disable() -- through rq->lock -- also holds off rcu. Strictly
> speaking this here is superfluous. But if you want it as an annotation,
> that's fine I suppose.

Yep, I purely added this as extra annotation for future readers.

> Ideally we'd first queue all the remotes and then process local, but
> given how all this is organized that doesn't seem trivial to arrange.
>
> Maybe have this function return false when local and save that cfs_rq in
> a local var to process again later, dunno, that might turn messy.

Maybe something like this? Apologies for inline diff formatting.

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 012ec9d03811..100dae6023da 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -5520,12 +5520,15 @@ static bool distribute_cfs_runtime(struct
cfs_bandwidth *cfs_b)
        struct cfs_rq *cfs_rq;
        u64 runtime, remaining = 1;
        bool throttled = false;
+       int this_cpu = smp_processor_id();
+       struct cfs_rq *local_unthrottle = NULL;
+       struct rq *rq;
+       struct rq_flags rf;

        rcu_read_lock();
        list_for_each_entry_rcu(cfs_rq, &cfs_b->throttled_cfs_rq,
                                throttled_list) {
-               struct rq *rq = rq_of(cfs_rq);
-               struct rq_flags rf;
+               rq = rq_of(cfs_rq);

                if (!remaining) {
                        throttled = true;
@@ -5556,14 +5559,36 @@ static bool distribute_cfs_runtime(struct
cfs_bandwidth *cfs_b)
                cfs_rq->runtime_remaining += runtime;

                /* we check whether we're throttled above */
-               if (cfs_rq->runtime_remaining > 0)
-                       unthrottle_cfs_rq_async(cfs_rq);
+               if (cfs_rq->runtime_remaining > 0) {
+                       if (cpu_of(rq) != this_cpu ||
+                           SCHED_WARN_ON(local_unthrottle)) {
+                               unthrottle_cfs_rq_async(cfs_rq);
+                       } else {
+                               local_unthrottle = cfs_rq;
+                       }
+               } else {
+                       throttled = true;
+               }

 next:
                rq_unlock_irqrestore(rq, &rf);
        }
        rcu_read_unlock();

+       /*
+        * We prefer to stage the async unthrottles of all the remote cpus
+        * before we do the inline unthrottle locally. Note that
+        * unthrottle_cfs_rq_async() on the local cpu is actually synchronous,
+        * but it includes extra WARNs to make sure the cfs_rq really is
+        * still throttled.
+        */
+       if (local_unthrottle) {
+               rq = cpu_rq(this_cpu);
+               rq_lock_irqsave(rq, &rf);
+               unthrottle_cfs_rq_async(local_unthrottle);
+               rq_unlock_irqrestore(rq, &rf);
+       }
+
        return throttled;
 }

Note that one change we definitely want is the extra setting of
throttled = true in the case that cfs_rq->runtime_remaining <= 0, to
catch the case where we run out of runtime to distribute on the last
entity in the list.

> > +
> > +     /* Already enqueued */
> > +     if (SCHED_WARN_ON(!list_empty(&cfs_rq->throttled_csd_list)))
> > +             return;
> > +
> > +     list_add_tail(&cfs_rq->throttled_csd_list, &rq->cfsb_csd_list);
> > +
> > +     smp_call_function_single_async(cpu_of(rq), &rq->cfsb_csd);
>
> Hurmph.. so I was expecting something like:
>
>         first = list_empty(&rq->cfsb_csd_list);
>         list_add_tail(&cfs_rq->throttled_csd_list, &rq->cfsb_csd_list);
>         if (first)
>                 smp_call_function_single_async(cpu_of(rq), &rq->cfsb_csd);
>
> But I suppose I'm remembering the 'old' version. I don't think it is
> broken as written. There's a very narrow window where you'll end up
> sending a second IPI for naught, but meh.

The CSD doesn't get  unlocked until right before we call the func().
But you're right that that's a (very) narrow window for an  extra IPI.
Please feel free to modify the patch with that diff if you like.

>
> > +}
>
> Let me go queue this thing, we can always improve upon matters later.

Thanks! Please add at least the extra assignment of 'throttled = true'
from the diff above, but feel free to squash both the diffs if it
makes sense to you.

^ permalink raw reply related	[flat|nested] 21+ messages in thread

* Re: [PATCH v3] sched: async unthrottling for cfs bandwidth
  2022-11-18 19:25   ` Josh Don
@ 2022-11-20  2:22     ` Chengming Zhou
  2022-11-21 11:58       ` Peter Zijlstra
  2022-11-21 12:34     ` Peter Zijlstra
  2022-11-22  6:08     ` Aaron Lu
  2 siblings, 1 reply; 21+ messages in thread
From: Chengming Zhou @ 2022-11-20  2:22 UTC (permalink / raw)
  To: Josh Don, Peter Zijlstra
  Cc: Ingo Molnar, Juri Lelli, Vincent Guittot, Dietmar Eggemann,
	Steven Rostedt, Ben Segall, Mel Gorman,
	Daniel Bristot de Oliveira, Valentin Schneider, linux-kernel,
	Tejun Heo, Michal Koutný, Christian Brauner, Zefan Li

On 2022/11/19 03:25, Josh Don wrote:
> On Fri, Nov 18, 2022 at 4:47 AM Peter Zijlstra <peterz@infradead.org> wrote:
>>
>> preempt_disable() -- through rq->lock -- also holds off rcu. Strictly
>> speaking this here is superfluous. But if you want it as an annotation,
>> that's fine I suppose.
> 
> Yep, I purely added this as extra annotation for future readers.
> 
>> Ideally we'd first queue all the remotes and then process local, but
>> given how all this is organized that doesn't seem trivial to arrange.
>>
>> Maybe have this function return false when local and save that cfs_rq in
>> a local var to process again later, dunno, that might turn messy.
> 
> Maybe something like this? Apologies for inline diff formatting.
> 
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index 012ec9d03811..100dae6023da 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -5520,12 +5520,15 @@ static bool distribute_cfs_runtime(struct
> cfs_bandwidth *cfs_b)
>         struct cfs_rq *cfs_rq;
>         u64 runtime, remaining = 1;
>         bool throttled = false;
> +       int this_cpu = smp_processor_id();
> +       struct cfs_rq *local_unthrottle = NULL;
> +       struct rq *rq;
> +       struct rq_flags rf;
> 
>         rcu_read_lock();
>         list_for_each_entry_rcu(cfs_rq, &cfs_b->throttled_cfs_rq,
>                                 throttled_list) {
> -               struct rq *rq = rq_of(cfs_rq);
> -               struct rq_flags rf;
> +               rq = rq_of(cfs_rq);
> 
>                 if (!remaining) {
>                         throttled = true;
> @@ -5556,14 +5559,36 @@ static bool distribute_cfs_runtime(struct
> cfs_bandwidth *cfs_b)
>                 cfs_rq->runtime_remaining += runtime;
> 
>                 /* we check whether we're throttled above */
> -               if (cfs_rq->runtime_remaining > 0)
> -                       unthrottle_cfs_rq_async(cfs_rq);
> +               if (cfs_rq->runtime_remaining > 0) {
> +                       if (cpu_of(rq) != this_cpu ||
> +                           SCHED_WARN_ON(local_unthrottle)) {
> +                               unthrottle_cfs_rq_async(cfs_rq);
> +                       } else {
> +                               local_unthrottle = cfs_rq;
> +                       }
> +               } else {
> +                       throttled = true;
> +               }

Hello,

I don't get the point why local unthrottle is put after all the remote cpus,
since this list is FIFO? (earliest throttled cfs_rq is at the head)

Should we distribute runtime in the FIFO order?

Thanks.

> 
>  next:
>                 rq_unlock_irqrestore(rq, &rf);
>         }
>         rcu_read_unlock();
> 
> +       /*
> +        * We prefer to stage the async unthrottles of all the remote cpus
> +        * before we do the inline unthrottle locally. Note that
> +        * unthrottle_cfs_rq_async() on the local cpu is actually synchronous,
> +        * but it includes extra WARNs to make sure the cfs_rq really is
> +        * still throttled.
> +        */
> +       if (local_unthrottle) {
> +               rq = cpu_rq(this_cpu);
> +               rq_lock_irqsave(rq, &rf);
> +               unthrottle_cfs_rq_async(local_unthrottle);
> +               rq_unlock_irqrestore(rq, &rf);
> +       }
> +
>         return throttled;
>  }
> 
> Note that one change we definitely want is the extra setting of
> throttled = true in the case that cfs_rq->runtime_remaining <= 0, to
> catch the case where we run out of runtime to distribute on the last
> entity in the list.
> 
>>> +
>>> +     /* Already enqueued */
>>> +     if (SCHED_WARN_ON(!list_empty(&cfs_rq->throttled_csd_list)))
>>> +             return;
>>> +
>>> +     list_add_tail(&cfs_rq->throttled_csd_list, &rq->cfsb_csd_list);
>>> +
>>> +     smp_call_function_single_async(cpu_of(rq), &rq->cfsb_csd);
>>
>> Hurmph.. so I was expecting something like:
>>
>>         first = list_empty(&rq->cfsb_csd_list);
>>         list_add_tail(&cfs_rq->throttled_csd_list, &rq->cfsb_csd_list);
>>         if (first)
>>                 smp_call_function_single_async(cpu_of(rq), &rq->cfsb_csd);
>>
>> But I suppose I'm remembering the 'old' version. I don't think it is
>> broken as written. There's a very narrow window where you'll end up
>> sending a second IPI for naught, but meh.
> 
> The CSD doesn't get  unlocked until right before we call the func().
> But you're right that that's a (very) narrow window for an  extra IPI.
> Please feel free to modify the patch with that diff if you like.
> 
>>
>>> +}
>>
>> Let me go queue this thing, we can always improve upon matters later.
> 
> Thanks! Please add at least the extra assignment of 'throttled = true'
> from the diff above, but feel free to squash both the diffs if it
> makes sense to you.

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH v3] sched: async unthrottling for cfs bandwidth
  2022-11-20  2:22     ` Chengming Zhou
@ 2022-11-21 11:58       ` Peter Zijlstra
  2022-11-21 19:37         ` Josh Don
  0 siblings, 1 reply; 21+ messages in thread
From: Peter Zijlstra @ 2022-11-21 11:58 UTC (permalink / raw)
  To: Chengming Zhou
  Cc: Josh Don, Ingo Molnar, Juri Lelli, Vincent Guittot,
	Dietmar Eggemann, Steven Rostedt, Ben Segall, Mel Gorman,
	Daniel Bristot de Oliveira, Valentin Schneider, linux-kernel,
	Tejun Heo, Michal Koutný, Christian Brauner, Zefan Li

On Sun, Nov 20, 2022 at 10:22:40AM +0800, Chengming Zhou wrote:
> > +               if (cfs_rq->runtime_remaining > 0) {
> > +                       if (cpu_of(rq) != this_cpu ||
> > +                           SCHED_WARN_ON(local_unthrottle)) {
> > +                               unthrottle_cfs_rq_async(cfs_rq);
> > +                       } else {
> > +                               local_unthrottle = cfs_rq;
> > +                       }
> > +               } else {
> > +                       throttled = true;
> > +               }
> 
> Hello,
> 
> I don't get the point why local unthrottle is put after all the remote cpus,
> since this list is FIFO? (earliest throttled cfs_rq is at the head)

Let the local completion time for a CPU be W. Then if we queue a remote
work after the local synchronous work, the lower bound for total
completion is at least 2W.

OTOH, if we first queue all remote work and then process the local
synchronous work, the lower bound for total completion is W.

The practical difference is that all relevant CPUs get unthrottled
rougly at the same point in time, unlike with the original case, where
some CPUs have the opportunity to consume W runtime while another is
still throttled.


^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH v3] sched: async unthrottling for cfs bandwidth
  2022-11-21 11:58       ` Peter Zijlstra
@ 2022-11-21 19:37         ` Josh Don
  2022-11-22 10:35           ` Peter Zijlstra
  0 siblings, 1 reply; 21+ messages in thread
From: Josh Don @ 2022-11-21 19:37 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Chengming Zhou, Ingo Molnar, Juri Lelli, Vincent Guittot,
	Dietmar Eggemann, Steven Rostedt, Ben Segall, Mel Gorman,
	Daniel Bristot de Oliveira, Valentin Schneider, linux-kernel,
	Tejun Heo, Michal Koutný, Christian Brauner, Zefan Li

On Mon, Nov 21, 2022 at 3:58 AM Peter Zijlstra <peterz@infradead.org> wrote:
>
> On Sun, Nov 20, 2022 at 10:22:40AM +0800, Chengming Zhou wrote:
> > > +               if (cfs_rq->runtime_remaining > 0) {
> > > +                       if (cpu_of(rq) != this_cpu ||
> > > +                           SCHED_WARN_ON(local_unthrottle)) {
> > > +                               unthrottle_cfs_rq_async(cfs_rq);
> > > +                       } else {
> > > +                               local_unthrottle = cfs_rq;
> > > +                       }
> > > +               } else {
> > > +                       throttled = true;
> > > +               }
> >
> > Hello,
> >
> > I don't get the point why local unthrottle is put after all the remote cpus,
> > since this list is FIFO? (earliest throttled cfs_rq is at the head)
>
> Let the local completion time for a CPU be W. Then if we queue a remote
> work after the local synchronous work, the lower bound for total
> completion is at least 2W.
>
> OTOH, if we first queue all remote work and then process the local
> synchronous work, the lower bound for total completion is W.
>
> The practical difference is that all relevant CPUs get unthrottled
> rougly at the same point in time, unlike with the original case, where
> some CPUs have the opportunity to consume W runtime while another is
> still throttled.

Yep, this tradeoff feels "best", but there are some edge cases where
this could potentially disrupt fairness. For example, if we have
non-trivial W, a lot of cpus to iterate through for dispatching remote
unthrottle, and quota is small. Doesn't help that the timer is pinned
so that this will continually hit the same cpu. But as I alluded to, I
think the net benefit here is greater with the local unthrottling
ordered last.

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH v3] sched: async unthrottling for cfs bandwidth
  2022-11-21 19:37         ` Josh Don
@ 2022-11-22 10:35           ` Peter Zijlstra
  2022-11-25  8:57             ` Peter Zijlstra
  0 siblings, 1 reply; 21+ messages in thread
From: Peter Zijlstra @ 2022-11-22 10:35 UTC (permalink / raw)
  To: Josh Don
  Cc: Chengming Zhou, Ingo Molnar, Juri Lelli, Vincent Guittot,
	Dietmar Eggemann, Steven Rostedt, Ben Segall, Mel Gorman,
	Daniel Bristot de Oliveira, Valentin Schneider, linux-kernel,
	Tejun Heo, Michal Koutný, Christian Brauner, Zefan Li

On Mon, Nov 21, 2022 at 11:37:14AM -0800, Josh Don wrote:
> Yep, this tradeoff feels "best", but there are some edge cases where
> this could potentially disrupt fairness. For example, if we have
> non-trivial W, a lot of cpus to iterate through for dispatching remote
> unthrottle, and quota is small. Doesn't help that the timer is pinned
> so that this will continually hit the same cpu.

We could -- if we wanted to -- manually rotate the timer around the
relevant CPUs. Doing that sanely would require a bit of hrtimer surgery
though I'm afraid.

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH v3] sched: async unthrottling for cfs bandwidth
  2022-11-22 10:35           ` Peter Zijlstra
@ 2022-11-25  8:57             ` Peter Zijlstra
  2022-11-25  8:59               ` Peter Zijlstra
  2022-11-29  1:32               ` Josh Don
  0 siblings, 2 replies; 21+ messages in thread
From: Peter Zijlstra @ 2022-11-25  8:57 UTC (permalink / raw)
  To: Josh Don
  Cc: Chengming Zhou, Ingo Molnar, Juri Lelli, Vincent Guittot,
	Dietmar Eggemann, Steven Rostedt, Ben Segall, Mel Gorman,
	Daniel Bristot de Oliveira, Valentin Schneider, linux-kernel,
	Tejun Heo, Michal Koutný, Christian Brauner, Zefan Li,
	Thomas Gleixner, Frederic Weisbecker, anna-maria

On Tue, Nov 22, 2022 at 11:35:48AM +0100, Peter Zijlstra wrote:
> On Mon, Nov 21, 2022 at 11:37:14AM -0800, Josh Don wrote:
> > Yep, this tradeoff feels "best", but there are some edge cases where
> > this could potentially disrupt fairness. For example, if we have
> > non-trivial W, a lot of cpus to iterate through for dispatching remote
> > unthrottle, and quota is small. Doesn't help that the timer is pinned
> > so that this will continually hit the same cpu.
> 
> We could -- if we wanted to -- manually rotate the timer around the
> relevant CPUs. Doing that sanely would require a bit of hrtimer surgery
> though I'm afraid.

Here; something like so should enable us to cycle the bandwidth timer.
Just need to figure out a way to find another CPU or something.

---
diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h
index 0ee140176f10..f8bd200d678a 100644
--- a/include/linux/hrtimer.h
+++ b/include/linux/hrtimer.h
@@ -63,8 +63,10 @@ enum hrtimer_mode {
  * Return values for the callback function
  */
 enum hrtimer_restart {
-	HRTIMER_NORESTART,	/* Timer is not restarted */
-	HRTIMER_RESTART,	/* Timer must be restarted */
+	HRTIMER_RESTART = -1,		/* Timer must be restarted */
+	HRTIMER_NORESTART = 0,		/* Timer is not restarted */
+	HRTIMER_RESTART_MIGRATE = 1,
+	HRTIMER_RESTART_MIGRATE_MAX = HRTIMER_RESTART_MIGRATE + NR_CPUS,
 };
 
 /*
diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c
index 3ae661ab6260..e75033f78a19 100644
--- a/kernel/time/hrtimer.c
+++ b/kernel/time/hrtimer.c
@@ -1621,6 +1621,16 @@ bool hrtimer_active(const struct hrtimer *timer)
 }
 EXPORT_SYMBOL_GPL(hrtimer_active);
 
+static void raw_spin_lock_double(raw_spinlock_t *a, raw_spinlock_t *b)
+{
+	if (b < a)
+		swap(a, b);
+
+	raw_spin_lock(a);
+	if (b != a)
+		raw_spin_lock_nested(b, SINGLE_DEPTH_NESTING);
+}
+
 /*
  * The write_seqcount_barrier()s in __run_hrtimer() split the thing into 3
  * distinct sections:
@@ -1644,6 +1654,8 @@ static void __run_hrtimer(struct hrtimer_cpu_base *cpu_base,
 			  struct hrtimer *timer, ktime_t *now,
 			  unsigned long flags) __must_hold(&cpu_base->lock)
 {
+	struct hrtimer_cpu_base *new_cpu_base = cpu_base;
+	struct hrtimer_clock_base *new_base = base;
 	enum hrtimer_restart (*fn)(struct hrtimer *);
 	bool expires_in_hardirq;
 	int restart;
@@ -1686,7 +1698,17 @@ static void __run_hrtimer(struct hrtimer_cpu_base *cpu_base,
 
 	lockdep_hrtimer_exit(expires_in_hardirq);
 	trace_hrtimer_expire_exit(timer);
-	raw_spin_lock_irq(&cpu_base->lock);
+
+	local_irq_disable();
+
+	if (restart >= HRTIMER_RESTART_MIGRATE) {
+		int cpu = restart - HRTIMER_RESTART_MIGRATE;
+		int b = base - cpu_base->clock_base;
+
+		new_cpu_base = &per_cpu(hrtimer_bases, cpu);
+		new_base = new_cpu_base->clock_base[b];
+	}
+	raw_spin_lock_double(&cpu_base->lock, &new_cpu_base->lock);
 
 	/*
 	 * Note: We clear the running state after enqueue_hrtimer and
@@ -1698,8 +1720,16 @@ static void __run_hrtimer(struct hrtimer_cpu_base *cpu_base,
 	 * for us already.
 	 */
 	if (restart != HRTIMER_NORESTART &&
-	    !(timer->state & HRTIMER_STATE_ENQUEUED))
-		enqueue_hrtimer(timer, base, HRTIMER_MODE_ABS);
+	    !(timer->state & HRTIMER_STATE_ENQUEUED)) {
+
+		if (new_cpu_base != cpu_base) {
+			timer->base = new_base;
+			enqueue_hrtimer(timer, new_base, HRTIMER_MODE_ABS);
+			raw_spin_unlock(&new_cpu_base->lock);
+		} else {
+			enqueue_hrtimer(timer, base, HRTIMER_MODE_ABS);
+		}
+	}
 
 	/*
 	 * Separate the ->running assignment from the ->state assignment.
@@ -2231,12 +2261,8 @@ int hrtimers_dead_cpu(unsigned int scpu)
 	local_irq_disable();
 	old_base = &per_cpu(hrtimer_bases, scpu);
 	new_base = this_cpu_ptr(&hrtimer_bases);
-	/*
-	 * The caller is globally serialized and nobody else
-	 * takes two locks at once, deadlock is not possible.
-	 */
-	raw_spin_lock(&new_base->lock);
-	raw_spin_lock_nested(&old_base->lock, SINGLE_DEPTH_NESTING);
+
+	raw_spin_lock_double(&old_base->lock, &new_base->lock);
 
 	for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) {
 		migrate_hrtimer_list(&old_base->clock_base[i],

^ permalink raw reply related	[flat|nested] 21+ messages in thread

* Re: [PATCH v3] sched: async unthrottling for cfs bandwidth
  2022-11-25  8:57             ` Peter Zijlstra
@ 2022-11-25  8:59               ` Peter Zijlstra
  2022-11-25  9:12                 ` Peter Zijlstra
  2022-11-29  1:32               ` Josh Don
  1 sibling, 1 reply; 21+ messages in thread
From: Peter Zijlstra @ 2022-11-25  8:59 UTC (permalink / raw)
  To: Josh Don
  Cc: Chengming Zhou, Ingo Molnar, Juri Lelli, Vincent Guittot,
	Dietmar Eggemann, Steven Rostedt, Ben Segall, Mel Gorman,
	Daniel Bristot de Oliveira, Valentin Schneider, linux-kernel,
	Tejun Heo, Michal Koutný, Christian Brauner, Zefan Li,
	Thomas Gleixner, Frederic Weisbecker, anna-maria

On Fri, Nov 25, 2022 at 09:57:09AM +0100, Peter Zijlstra wrote:
> On Tue, Nov 22, 2022 at 11:35:48AM +0100, Peter Zijlstra wrote:
> > On Mon, Nov 21, 2022 at 11:37:14AM -0800, Josh Don wrote:
> > > Yep, this tradeoff feels "best", but there are some edge cases where
> > > this could potentially disrupt fairness. For example, if we have
> > > non-trivial W, a lot of cpus to iterate through for dispatching remote
> > > unthrottle, and quota is small. Doesn't help that the timer is pinned
> > > so that this will continually hit the same cpu.
> > 
> > We could -- if we wanted to -- manually rotate the timer around the
> > relevant CPUs. Doing that sanely would require a bit of hrtimer surgery
> > though I'm afraid.
> 
> Here; something like so should enable us to cycle the bandwidth timer.
> Just need to figure out a way to find another CPU or something.

Some more preparation...

--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -5617,7 +5617,7 @@ static int do_sched_cfs_period_timer(str
 	if (!throttled) {
 		/* mark as potentially idle for the upcoming period */
 		cfs_b->idle = 1;
-		return 0;
+		return HRTIMER_RESTART;
 	}
 
 	/* account preceding periods in which throttling occurred */
@@ -5641,10 +5641,10 @@ static int do_sched_cfs_period_timer(str
 	 */
 	cfs_b->idle = 0;
 
-	return 0;
+	return HRTIMER_RESTART;
 
 out_deactivate:
-	return 1;
+	return HRTIMER_NORESTART;
 }
 
 /* a cfs_rq won't donate quota below this amount */
@@ -5836,9 +5836,9 @@ static enum hrtimer_restart sched_cfs_pe
 {
 	struct cfs_bandwidth *cfs_b =
 		container_of(timer, struct cfs_bandwidth, period_timer);
+	int restart = HRTIMER_RESTART;
 	unsigned long flags;
 	int overrun;
-	int idle = 0;
 	int count = 0;
 
 	raw_spin_lock_irqsave(&cfs_b->lock, flags);
@@ -5847,7 +5847,7 @@ static enum hrtimer_restart sched_cfs_pe
 		if (!overrun)
 			break;
 
-		idle = do_sched_cfs_period_timer(cfs_b, overrun, flags);
+		restart = do_sched_cfs_period_timer(cfs_b, overrun, flags);
 
 		if (++count > 3) {
 			u64 new, old = ktime_to_ns(cfs_b->period);
@@ -5880,11 +5880,11 @@ static enum hrtimer_restart sched_cfs_pe
 			count = 0;
 		}
 	}
-	if (idle)
+	if (restart == HRTIMER_NORESTART)
 		cfs_b->period_active = 0;
 	raw_spin_unlock_irqrestore(&cfs_b->lock, flags);
 
-	return idle ? HRTIMER_NORESTART : HRTIMER_RESTART;
+	return restart;
 }
 
 void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b)

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH v3] sched: async unthrottling for cfs bandwidth
  2022-11-25  8:59               ` Peter Zijlstra
@ 2022-11-25  9:12                 ` Peter Zijlstra
  2022-11-29  1:38                   ` Josh Don
  0 siblings, 1 reply; 21+ messages in thread
From: Peter Zijlstra @ 2022-11-25  9:12 UTC (permalink / raw)
  To: Josh Don
  Cc: Chengming Zhou, Ingo Molnar, Juri Lelli, Vincent Guittot,
	Dietmar Eggemann, Steven Rostedt, Ben Segall, Mel Gorman,
	Daniel Bristot de Oliveira, Valentin Schneider, linux-kernel,
	Tejun Heo, Michal Koutný, Christian Brauner, Zefan Li,
	Thomas Gleixner, Frederic Weisbecker, anna-maria

On Fri, Nov 25, 2022 at 09:59:23AM +0100, Peter Zijlstra wrote:
> On Fri, Nov 25, 2022 at 09:57:09AM +0100, Peter Zijlstra wrote:
> > On Tue, Nov 22, 2022 at 11:35:48AM +0100, Peter Zijlstra wrote:
> > > On Mon, Nov 21, 2022 at 11:37:14AM -0800, Josh Don wrote:
> > > > Yep, this tradeoff feels "best", but there are some edge cases where
> > > > this could potentially disrupt fairness. For example, if we have
> > > > non-trivial W, a lot of cpus to iterate through for dispatching remote
> > > > unthrottle, and quota is small. Doesn't help that the timer is pinned
> > > > so that this will continually hit the same cpu.
> > > 
> > > We could -- if we wanted to -- manually rotate the timer around the
> > > relevant CPUs. Doing that sanely would require a bit of hrtimer surgery
> > > though I'm afraid.
> > 
> > Here; something like so should enable us to cycle the bandwidth timer.
> > Just need to figure out a way to find another CPU or something.
> 
> Some more preparation...

And then I think something like so.. That migrates the timer to the CPU
of the first throttled entry -- possibly not the best heuristic, but its
the simplest.

NOTE: none of this has seen a compiler up close.

---
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -5595,13 +5595,21 @@ static bool distribute_cfs_runtime(struc
  */
 static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun, unsigned long flags)
 {
-	int throttled;
+	struct cfs_rq *first_cfs_rq;
+	int throttled = 0;
+	int cpu;
 
 	/* no need to continue the timer with no bandwidth constraint */
 	if (cfs_b->quota == RUNTIME_INF)
 		goto out_deactivate;
 
-	throttled = !list_empty(&cfs_b->throttled_cfs_rq);
+	first_cfs_rq = list_first_entry_or_null(&cfs_b->throttled_cfs_rq,
+						struct cfs_rq, throttled_list);
+	if (first_cfs_rq) {
+		throttled = 1;
+		cpu = cpu_of(rq_of(first_cfs_rq));
+	}
+
 	cfs_b->nr_periods += overrun;
 
 	/* Refill extra burst quota even if cfs_b->idle */
@@ -5641,7 +5649,7 @@ static int do_sched_cfs_period_timer(str
 	 */
 	cfs_b->idle = 0;
 
-	return HRTIMER_RESTART;
+	return HRTIMER_RESTART_MIGRATE + cpu;
 
 out_deactivate:
 	return HRTIMER_NORESTART;

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH v3] sched: async unthrottling for cfs bandwidth
  2022-11-25  9:12                 ` Peter Zijlstra
@ 2022-11-29  1:38                   ` Josh Don
  0 siblings, 0 replies; 21+ messages in thread
From: Josh Don @ 2022-11-29  1:38 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Chengming Zhou, Ingo Molnar, Juri Lelli, Vincent Guittot,
	Dietmar Eggemann, Steven Rostedt, Ben Segall, Mel Gorman,
	Daniel Bristot de Oliveira, Valentin Schneider, linux-kernel,
	Tejun Heo, Michal Koutný, Christian Brauner, Zefan Li,
	Thomas Gleixner, Frederic Weisbecker, anna-maria

On Fri, Nov 25, 2022 at 1:12 AM Peter Zijlstra <peterz@infradead.org> wrote:
>
> On Fri, Nov 25, 2022 at 09:59:23AM +0100, Peter Zijlstra wrote:
> > On Fri, Nov 25, 2022 at 09:57:09AM +0100, Peter Zijlstra wrote:
> > > On Tue, Nov 22, 2022 at 11:35:48AM +0100, Peter Zijlstra wrote:
> > > > On Mon, Nov 21, 2022 at 11:37:14AM -0800, Josh Don wrote:
> > > > > Yep, this tradeoff feels "best", but there are some edge cases where
> > > > > this could potentially disrupt fairness. For example, if we have
> > > > > non-trivial W, a lot of cpus to iterate through for dispatching remote
> > > > > unthrottle, and quota is small. Doesn't help that the timer is pinned
> > > > > so that this will continually hit the same cpu.
> > > >
> > > > We could -- if we wanted to -- manually rotate the timer around the
> > > > relevant CPUs. Doing that sanely would require a bit of hrtimer surgery
> > > > though I'm afraid.
> > >
> > > Here; something like so should enable us to cycle the bandwidth timer.
> > > Just need to figure out a way to find another CPU or something.
> >
> > Some more preparation...
>
> And then I think something like so.. That migrates the timer to the CPU
> of the first throttled entry -- possibly not the best heuristic, but its
> the simplest.
>
> NOTE: none of this has seen a compiler up close.

Thanks Peter, this overall looks good to me. One question though: I
was expecting to see that when we migrate the timer, we adjust expiry
to account for clock shift between cpus. Was this just not part of the
initial draft here, or is this somehow already accounted for?

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH v3] sched: async unthrottling for cfs bandwidth
  2022-11-25  8:57             ` Peter Zijlstra
  2022-11-25  8:59               ` Peter Zijlstra
@ 2022-11-29  1:32               ` Josh Don
  1 sibling, 0 replies; 21+ messages in thread
From: Josh Don @ 2022-11-29  1:32 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Chengming Zhou, Ingo Molnar, Juri Lelli, Vincent Guittot,
	Dietmar Eggemann, Steven Rostedt, Ben Segall, Mel Gorman,
	Daniel Bristot de Oliveira, Valentin Schneider, linux-kernel,
	Tejun Heo, Michal Koutný, Christian Brauner, Zefan Li,
	Thomas Gleixner, Frederic Weisbecker, anna-maria

> @@ -1686,7 +1698,17 @@ static void __run_hrtimer(struct hrtimer_cpu_base *cpu_base,
>
>         lockdep_hrtimer_exit(expires_in_hardirq);
>         trace_hrtimer_expire_exit(timer);
> -       raw_spin_lock_irq(&cpu_base->lock);
> +
> +       local_irq_disable();
> +
> +       if (restart >= HRTIMER_RESTART_MIGRATE) {
> +               int cpu = restart - HRTIMER_RESTART_MIGRATE;

I know this is just a rough draft, but just noting that this wants a
check against MIGRATE_MAX :)

> +               if (new_cpu_base != cpu_base) {
> +                       timer->base = new_base;
> +                       enqueue_hrtimer(timer, new_base, HRTIMER_MODE_ABS);
> +                       raw_spin_unlock(&new_cpu_base->lock);

unlock the old base->lock right?

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH v3] sched: async unthrottling for cfs bandwidth
  2022-11-18 19:25   ` Josh Don
  2022-11-20  2:22     ` Chengming Zhou
@ 2022-11-21 12:34     ` Peter Zijlstra
  2022-11-21 18:02       ` Michal Koutný
  2022-11-21 19:31       ` Josh Don
  2022-11-22  6:08     ` Aaron Lu
  2 siblings, 2 replies; 21+ messages in thread
From: Peter Zijlstra @ 2022-11-21 12:34 UTC (permalink / raw)
  To: Josh Don
  Cc: Ingo Molnar, Juri Lelli, Vincent Guittot, Dietmar Eggemann,
	Steven Rostedt, Ben Segall, Mel Gorman,
	Daniel Bristot de Oliveira, Valentin Schneider, linux-kernel,
	Tejun Heo, Michal Koutný, Christian Brauner, Zefan Li

On Fri, Nov 18, 2022 at 11:25:09AM -0800, Josh Don wrote:

> > Maybe have this function return false when local and save that cfs_rq in
> > a local var to process again later, dunno, that might turn messy.
> 
> Maybe something like this? Apologies for inline diff formatting.

That looks entirely reasonable, not nearly as horrible as I feared. Let
me go make that happen.

> Note that one change we definitely want is the extra setting of
> throttled = true in the case that cfs_rq->runtime_remaining <= 0, to
> catch the case where we run out of runtime to distribute on the last
> entity in the list.

Done.

> > > +
> > > +     /* Already enqueued */
> > > +     if (SCHED_WARN_ON(!list_empty(&cfs_rq->throttled_csd_list)))
> > > +             return;
> > > +
> > > +     list_add_tail(&cfs_rq->throttled_csd_list, &rq->cfsb_csd_list);
> > > +
> > > +     smp_call_function_single_async(cpu_of(rq), &rq->cfsb_csd);
> >
> > Hurmph.. so I was expecting something like:
> >
> >         first = list_empty(&rq->cfsb_csd_list);
> >         list_add_tail(&cfs_rq->throttled_csd_list, &rq->cfsb_csd_list);
> >         if (first)
> >                 smp_call_function_single_async(cpu_of(rq), &rq->cfsb_csd);
> >
> > But I suppose I'm remembering the 'old' version. I don't think it is
> > broken as written. There's a very narrow window where you'll end up
> > sending a second IPI for naught, but meh.
> 
> The CSD doesn't get  unlocked until right before we call the func().
> But you're right that that's a (very) narrow window for an  extra IPI.
> Please feel free to modify the patch with that diff if you like.

Since I was manually editing things, I did that too.

Please test the final version as found here:

  https://git.kernel.org/pub/scm/linux/kernel/git/peterz/queue.git/commit/?h=sched/core&id=4e3c1b7b489e218dfa576cd6af0680b975b8743e

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH v3] sched: async unthrottling for cfs bandwidth
  2022-11-21 12:34     ` Peter Zijlstra
@ 2022-11-21 18:02       ` Michal Koutný
  2022-11-21 19:31       ` Josh Don
  1 sibling, 0 replies; 21+ messages in thread
From: Michal Koutný @ 2022-11-21 18:02 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Josh Don, Ingo Molnar, Juri Lelli, Vincent Guittot,
	Dietmar Eggemann, Steven Rostedt, Ben Segall, Mel Gorman,
	Daniel Bristot de Oliveira, Valentin Schneider, linux-kernel,
	Tejun Heo, Christian Brauner, Zefan Li

[-- Attachment #1: Type: text/plain, Size: 463 bytes --]

On Mon, Nov 21, 2022 at 01:34:33PM +0100, Peter Zijlstra <peterz@infradead.org> wrote:
> Please test the final version as found here:
> 
>   https://git.kernel.org/pub/scm/linux/kernel/git/peterz/queue.git/commit/?h=sched/core&id=4e3c1b7b489e218dfa576cd6af0680b975b8743e

I went through the patch and I claim it's
Reviewed-by: Michal Koutný <mkoutny@suse.com>

(not tested though. And thanks Josh for the notice  about the not-double
queueing check.)


[-- Attachment #2: Digital signature --]
[-- Type: application/pgp-signature, Size: 228 bytes --]

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH v3] sched: async unthrottling for cfs bandwidth
  2022-11-21 12:34     ` Peter Zijlstra
  2022-11-21 18:02       ` Michal Koutný
@ 2022-11-21 19:31       ` Josh Don
  2022-11-22  5:55         ` Aaron Lu
  2022-11-22 10:30         ` Peter Zijlstra
  1 sibling, 2 replies; 21+ messages in thread
From: Josh Don @ 2022-11-21 19:31 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Ingo Molnar, Juri Lelli, Vincent Guittot, Dietmar Eggemann,
	Steven Rostedt, Ben Segall, Mel Gorman,
	Daniel Bristot de Oliveira, Valentin Schneider, linux-kernel,
	Tejun Heo, Michal Koutný, Christian Brauner, Zefan Li

> Please test the final version as found here:
>
>   https://git.kernel.org/pub/scm/linux/kernel/git/peterz/queue.git/commit/?h=sched/core&id=4e3c1b7b489e218dfa576cd6af0680b975b8743e

Thanks Peter, the patch looks good to me and continues to work
correctly. Note that I needed to make two edits to get it to build
though on sched/core:

- local_cfq_rq doesn't match variable name used in function (local_unthrottle)
- rq_lock_irqrestore -> rq_unlock_irqrestore

Best,
Josh

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH v3] sched: async unthrottling for cfs bandwidth
  2022-11-21 19:31       ` Josh Don
@ 2022-11-22  5:55         ` Aaron Lu
  2022-11-22 10:30         ` Peter Zijlstra
  1 sibling, 0 replies; 21+ messages in thread
From: Aaron Lu @ 2022-11-22  5:55 UTC (permalink / raw)
  To: Josh Don
  Cc: Peter Zijlstra, Ingo Molnar, Juri Lelli, Vincent Guittot,
	Dietmar Eggemann, Steven Rostedt, Ben Segall, Mel Gorman,
	Daniel Bristot de Oliveira, Valentin Schneider, linux-kernel,
	Tejun Heo, Michal Koutný, Christian Brauner, Zefan Li

On Mon, Nov 21, 2022 at 11:31:20AM -0800, Josh Don wrote:
> > Please test the final version as found here:
> >
> >   https://git.kernel.org/pub/scm/linux/kernel/git/peterz/queue.git/commit/?h=sched/core&id=4e3c1b7b489e218dfa576cd6af0680b975b8743e
> 
> Thanks Peter, the patch looks good to me and continues to work
> correctly. Note that I needed to make two edits to get it to build
> though on sched/core:
> 
> - local_cfq_rq doesn't match variable name used in function (local_unthrottle)
> - rq_lock_irqrestore -> rq_unlock_irqrestore

I also tested it on a desktop bare metal with 800 cgroups under 2
cgroups that have quota set. Before this patch, the majority of durations
for sched_cfs_period_timer() are in the range of 512us-1ms, with outliers
in the range of 1ms-4ms; after this patch, the majority are in the range
of 128us-512us with no outliers above 512us.

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH v3] sched: async unthrottling for cfs bandwidth
  2022-11-21 19:31       ` Josh Don
  2022-11-22  5:55         ` Aaron Lu
@ 2022-11-22 10:30         ` Peter Zijlstra
  1 sibling, 0 replies; 21+ messages in thread
From: Peter Zijlstra @ 2022-11-22 10:30 UTC (permalink / raw)
  To: Josh Don
  Cc: Ingo Molnar, Juri Lelli, Vincent Guittot, Dietmar Eggemann,
	Steven Rostedt, Ben Segall, Mel Gorman,
	Daniel Bristot de Oliveira, Valentin Schneider, linux-kernel,
	Tejun Heo, Michal Koutný, Christian Brauner, Zefan Li

On Mon, Nov 21, 2022 at 11:31:20AM -0800, Josh Don wrote:
> > Please test the final version as found here:
> >
> >   https://git.kernel.org/pub/scm/linux/kernel/git/peterz/queue.git/commit/?h=sched/core&id=4e3c1b7b489e218dfa576cd6af0680b975b8743e
> 
> Thanks Peter, the patch looks good to me and continues to work
> correctly. Note that I needed to make two edits to get it to build
> though on sched/core:
> 
> - local_cfq_rq doesn't match variable name used in function (local_unthrottle)
> - rq_lock_irqrestore -> rq_unlock_irqrestore

Bah; and here I throught the .config I build actually had this crud
enabled. Oh well.. fixed that, will push out shortly.

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH v3] sched: async unthrottling for cfs bandwidth
  2022-11-18 19:25   ` Josh Don
  2022-11-20  2:22     ` Chengming Zhou
  2022-11-21 12:34     ` Peter Zijlstra
@ 2022-11-22  6:08     ` Aaron Lu
  2022-11-22 19:41       ` Josh Don
  2 siblings, 1 reply; 21+ messages in thread
From: Aaron Lu @ 2022-11-22  6:08 UTC (permalink / raw)
  To: Josh Don
  Cc: Peter Zijlstra, Ingo Molnar, Juri Lelli, Vincent Guittot,
	Dietmar Eggemann, Steven Rostedt, Ben Segall, Mel Gorman,
	Daniel Bristot de Oliveira, Valentin Schneider, linux-kernel,
	Tejun Heo, Michal Koutný, Christian Brauner, Zefan Li

On Fri, Nov 18, 2022 at 11:25:09AM -0800, Josh Don wrote:
> On Fri, Nov 18, 2022 at 4:47 AM Peter Zijlstra <peterz@infradead.org> wrote:
> >
> > preempt_disable() -- through rq->lock -- also holds off rcu. Strictly
> > speaking this here is superfluous. But if you want it as an annotation,
> > that's fine I suppose.
> 
> Yep, I purely added this as extra annotation for future readers.
> 
> > Ideally we'd first queue all the remotes and then process local, but
> > given how all this is organized that doesn't seem trivial to arrange.
> >
> > Maybe have this function return false when local and save that cfs_rq in
> > a local var to process again later, dunno, that might turn messy.
> 
> Maybe something like this? Apologies for inline diff formatting.
> 
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index 012ec9d03811..100dae6023da 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -5520,12 +5520,15 @@ static bool distribute_cfs_runtime(struct
> cfs_bandwidth *cfs_b)
>         struct cfs_rq *cfs_rq;
>         u64 runtime, remaining = 1;
>         bool throttled = false;
> +       int this_cpu = smp_processor_id();
> +       struct cfs_rq *local_unthrottle = NULL;
> +       struct rq *rq;
> +       struct rq_flags rf;
> 
>         rcu_read_lock();
>         list_for_each_entry_rcu(cfs_rq, &cfs_b->throttled_cfs_rq,
>                                 throttled_list) {
> -               struct rq *rq = rq_of(cfs_rq);
> -               struct rq_flags rf;
> +               rq = rq_of(cfs_rq);
> 
>                 if (!remaining) {
>                         throttled = true;
> @@ -5556,14 +5559,36 @@ static bool distribute_cfs_runtime(struct
> cfs_bandwidth *cfs_b)
>                 cfs_rq->runtime_remaining += runtime;
> 
>                 /* we check whether we're throttled above */
> -               if (cfs_rq->runtime_remaining > 0)
> -                       unthrottle_cfs_rq_async(cfs_rq);
> +               if (cfs_rq->runtime_remaining > 0) {
> +                       if (cpu_of(rq) != this_cpu ||
> +                           SCHED_WARN_ON(local_unthrottle)) {
> +                               unthrottle_cfs_rq_async(cfs_rq);
> +                       } else {
> +                               local_unthrottle = cfs_rq;
> +                       }
> +               } else {
> +                       throttled = true;
> +               }
> 
>  next:
>                 rq_unlock_irqrestore(rq, &rf);
>         }
>         rcu_read_unlock();
> 
> +       /*
> +        * We prefer to stage the async unthrottles of all the remote cpus
> +        * before we do the inline unthrottle locally. Note that
> +        * unthrottle_cfs_rq_async() on the local cpu is actually synchronous,
> +        * but it includes extra WARNs to make sure the cfs_rq really is
> +        * still throttled.

With this said ->

> +        */
> +       if (local_unthrottle) {
> +               rq = cpu_rq(this_cpu);
> +               rq_lock_irqsave(rq, &rf);

Should we add:
		if (cfs_rq_throttled(local_unthrottle))

before calling into unthrottle_cfs_rq_async(local_unthrottle) to avoid a
potential WARN?

As for whether the local cfs_rq can be unthrottled now after rq lock is
re-acquired, I suppose it can be. e.g. another user sets a new quota to
this task group during the window of rq lock gets dropped in the above
loop and re-acquired here IIUC.

> +               unthrottle_cfs_rq_async(local_unthrottle);
> +               rq_unlock_irqrestore(rq, &rf);
> +       }
> +
>         return throttled;
>  }
> 

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH v3] sched: async unthrottling for cfs bandwidth
  2022-11-22  6:08     ` Aaron Lu
@ 2022-11-22 19:41       ` Josh Don
  2022-11-24  9:12         ` Peter Zijlstra
  0 siblings, 1 reply; 21+ messages in thread
From: Josh Don @ 2022-11-22 19:41 UTC (permalink / raw)
  To: Aaron Lu, Peter Zijlstra
  Cc: Ingo Molnar, Juri Lelli, Vincent Guittot, Dietmar Eggemann,
	Steven Rostedt, Ben Segall, Mel Gorman,
	Daniel Bristot de Oliveira, Valentin Schneider, linux-kernel,
	Tejun Heo, Michal Koutný, Christian Brauner, Zefan Li

> > +        */
> > +       if (local_unthrottle) {
> > +               rq = cpu_rq(this_cpu);
> > +               rq_lock_irqsave(rq, &rf);
>
> Should we add:
>                 if (cfs_rq_throttled(local_unthrottle))
>
> before calling into unthrottle_cfs_rq_async(local_unthrottle) to avoid a
> potential WARN?
>
> As for whether the local cfs_rq can be unthrottled now after rq lock is
> re-acquired, I suppose it can be. e.g. another user sets a new quota to
> this task group during the window of rq lock gets dropped in the above
> loop and re-acquired here IIUC.
>
> > +               unthrottle_cfs_rq_async(local_unthrottle);
> > +               rq_unlock_irqrestore(rq, &rf);
> > +       }
> > +
> >         return throttled;
> >  }

Yes, we should add that check due to the case you described with a
user concurrently configuring bandwidth. And as long as we're doing
that, we might as well make this unthrottle_cfs_rq() instead and snip
the comment. Peter, would you mind adding that delta?

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH v3] sched: async unthrottling for cfs bandwidth
  2022-11-22 19:41       ` Josh Don
@ 2022-11-24  9:12         ` Peter Zijlstra
  0 siblings, 0 replies; 21+ messages in thread
From: Peter Zijlstra @ 2022-11-24  9:12 UTC (permalink / raw)
  To: Josh Don
  Cc: Aaron Lu, Ingo Molnar, Juri Lelli, Vincent Guittot,
	Dietmar Eggemann, Steven Rostedt, Ben Segall, Mel Gorman,
	Daniel Bristot de Oliveira, Valentin Schneider, linux-kernel,
	Tejun Heo, Michal Koutný, Christian Brauner, Zefan Li

On Tue, Nov 22, 2022 at 11:41:04AM -0800, Josh Don wrote:
> > > +        */
> > > +       if (local_unthrottle) {
> > > +               rq = cpu_rq(this_cpu);
> > > +               rq_lock_irqsave(rq, &rf);
> >
> > Should we add:
> >                 if (cfs_rq_throttled(local_unthrottle))
> >
> > before calling into unthrottle_cfs_rq_async(local_unthrottle) to avoid a
> > potential WARN?
> >
> > As for whether the local cfs_rq can be unthrottled now after rq lock is
> > re-acquired, I suppose it can be. e.g. another user sets a new quota to
> > this task group during the window of rq lock gets dropped in the above
> > loop and re-acquired here IIUC.
> >
> > > +               unthrottle_cfs_rq_async(local_unthrottle);
> > > +               rq_unlock_irqrestore(rq, &rf);
> > > +       }
> > > +
> > >         return throttled;
> > >  }
> 
> Yes, we should add that check due to the case you described with a
> user concurrently configuring bandwidth. And as long as we're doing
> that, we might as well make this unthrottle_cfs_rq() instead and snip
> the comment. Peter, would you mind adding that delta?

Done, should be pushed into the queue.git thing momentarily.

^ permalink raw reply	[flat|nested] 21+ messages in thread

* [tip: sched/core] sched: Async unthrottling for cfs bandwidth
  2022-11-17  0:54 [PATCH v3] sched: async unthrottling for cfs bandwidth Josh Don
  2022-11-18 12:47 ` Peter Zijlstra
@ 2022-12-27 12:13 ` tip-bot2 for Josh Don
  1 sibling, 0 replies; 21+ messages in thread
From: tip-bot2 for Josh Don @ 2022-12-27 12:13 UTC (permalink / raw)
  To: linux-tip-commits; +Cc: Josh Don, Peter Zijlstra (Intel), x86, linux-kernel

The following commit has been merged into the sched/core branch of tip:

Commit-ID:     8ad075c2eb1f6b4b33436144ea1ef2619f3b6398
Gitweb:        https://git.kernel.org/tip/8ad075c2eb1f6b4b33436144ea1ef2619f3b6398
Author:        Josh Don <joshdon@google.com>
AuthorDate:    Wed, 16 Nov 2022 16:54:18 -08:00
Committer:     Peter Zijlstra <peterz@infradead.org>
CommitterDate: Tue, 27 Dec 2022 12:52:09 +01:00

sched: Async unthrottling for cfs bandwidth

CFS bandwidth currently distributes new runtime and unthrottles cfs_rq's
inline in an hrtimer callback. Runtime distribution is a per-cpu
operation, and unthrottling is a per-cgroup operation, since a tg walk
is required. On machines with a large number of cpus and large cgroup
hierarchies, this cpus*cgroups work can be too much to do in a single
hrtimer callback: since IRQ are disabled, hard lockups may easily occur.
Specifically, we've found this scalability issue on configurations with
256 cpus, O(1000) cgroups in the hierarchy being throttled, and high
memory bandwidth usage.

To fix this, we can instead unthrottle cfs_rq's asynchronously via a
CSD. Each cpu is responsible for unthrottling itself, thus sharding the
total work more fairly across the system, and avoiding hard lockups.

Signed-off-by: Josh Don <joshdon@google.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lkml.kernel.org/r/20221117005418.3499691-1-joshdon@google.com
---
 kernel/sched/fair.c  | 155 ++++++++++++++++++++++++++++++++++++++----
 kernel/sched/sched.h |   8 ++-
 2 files changed, 150 insertions(+), 13 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index c36aa54..ea81d48 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -5461,22 +5461,105 @@ unthrottle_throttle:
 		resched_curr(rq);
 }
 
-static void distribute_cfs_runtime(struct cfs_bandwidth *cfs_b)
+#ifdef CONFIG_SMP
+static void __cfsb_csd_unthrottle(void *arg)
 {
-	struct cfs_rq *cfs_rq;
+	struct cfs_rq *cursor, *tmp;
+	struct rq *rq = arg;
+	struct rq_flags rf;
+
+	rq_lock(rq, &rf);
+
+	/*
+	 * Since we hold rq lock we're safe from concurrent manipulation of
+	 * the CSD list. However, this RCU critical section annotates the
+	 * fact that we pair with sched_free_group_rcu(), so that we cannot
+	 * race with group being freed in the window between removing it
+	 * from the list and advancing to the next entry in the list.
+	 */
+	rcu_read_lock();
+
+	list_for_each_entry_safe(cursor, tmp, &rq->cfsb_csd_list,
+				 throttled_csd_list) {
+		list_del_init(&cursor->throttled_csd_list);
+
+		if (cfs_rq_throttled(cursor))
+			unthrottle_cfs_rq(cursor);
+	}
+
+	rcu_read_unlock();
+
+	rq_unlock(rq, &rf);
+}
+
+static inline void __unthrottle_cfs_rq_async(struct cfs_rq *cfs_rq)
+{
+	struct rq *rq = rq_of(cfs_rq);
+	bool first;
+
+	if (rq == this_rq()) {
+		unthrottle_cfs_rq(cfs_rq);
+		return;
+	}
+
+	/* Already enqueued */
+	if (SCHED_WARN_ON(!list_empty(&cfs_rq->throttled_csd_list)))
+		return;
+
+	first = list_empty(&rq->cfsb_csd_list);
+	list_add_tail(&cfs_rq->throttled_csd_list, &rq->cfsb_csd_list);
+	if (first)
+		smp_call_function_single_async(cpu_of(rq), &rq->cfsb_csd);
+}
+#else
+static inline void __unthrottle_cfs_rq_async(struct cfs_rq *cfs_rq)
+{
+	unthrottle_cfs_rq(cfs_rq);
+}
+#endif
+
+static void unthrottle_cfs_rq_async(struct cfs_rq *cfs_rq)
+{
+	lockdep_assert_rq_held(rq_of(cfs_rq));
+
+	if (SCHED_WARN_ON(!cfs_rq_throttled(cfs_rq) ||
+	    cfs_rq->runtime_remaining <= 0))
+		return;
+
+	__unthrottle_cfs_rq_async(cfs_rq);
+}
+
+static bool distribute_cfs_runtime(struct cfs_bandwidth *cfs_b)
+{
+	struct cfs_rq *local_unthrottle = NULL;
+	int this_cpu = smp_processor_id();
 	u64 runtime, remaining = 1;
+	bool throttled = false;
+	struct cfs_rq *cfs_rq;
+	struct rq_flags rf;
+	struct rq *rq;
 
 	rcu_read_lock();
 	list_for_each_entry_rcu(cfs_rq, &cfs_b->throttled_cfs_rq,
 				throttled_list) {
-		struct rq *rq = rq_of(cfs_rq);
-		struct rq_flags rf;
+		rq = rq_of(cfs_rq);
+
+		if (!remaining) {
+			throttled = true;
+			break;
+		}
 
 		rq_lock_irqsave(rq, &rf);
 		if (!cfs_rq_throttled(cfs_rq))
 			goto next;
 
-		/* By the above check, this should never be true */
+#ifdef CONFIG_SMP
+		/* Already queued for async unthrottle */
+		if (!list_empty(&cfs_rq->throttled_csd_list))
+			goto next;
+#endif
+
+		/* By the above checks, this should never be true */
 		SCHED_WARN_ON(cfs_rq->runtime_remaining > 0);
 
 		raw_spin_lock(&cfs_b->lock);
@@ -5490,16 +5573,30 @@ static void distribute_cfs_runtime(struct cfs_bandwidth *cfs_b)
 		cfs_rq->runtime_remaining += runtime;
 
 		/* we check whether we're throttled above */
-		if (cfs_rq->runtime_remaining > 0)
-			unthrottle_cfs_rq(cfs_rq);
+		if (cfs_rq->runtime_remaining > 0) {
+			if (cpu_of(rq) != this_cpu ||
+			    SCHED_WARN_ON(local_unthrottle))
+				unthrottle_cfs_rq_async(cfs_rq);
+			else
+				local_unthrottle = cfs_rq;
+		} else {
+			throttled = true;
+		}
 
 next:
 		rq_unlock_irqrestore(rq, &rf);
-
-		if (!remaining)
-			break;
 	}
 	rcu_read_unlock();
+
+	if (local_unthrottle) {
+		rq = cpu_rq(this_cpu);
+		rq_lock_irqsave(rq, &rf);
+		if (cfs_rq_throttled(local_unthrottle))
+			unthrottle_cfs_rq(local_unthrottle);
+		rq_unlock_irqrestore(rq, &rf);
+	}
+
+	return throttled;
 }
 
 /*
@@ -5544,10 +5641,8 @@ static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun, u
 	while (throttled && cfs_b->runtime > 0) {
 		raw_spin_unlock_irqrestore(&cfs_b->lock, flags);
 		/* we can't nest cfs_b->lock while distributing bandwidth */
-		distribute_cfs_runtime(cfs_b);
+		throttled = distribute_cfs_runtime(cfs_b);
 		raw_spin_lock_irqsave(&cfs_b->lock, flags);
-
-		throttled = !list_empty(&cfs_b->throttled_cfs_rq);
 	}
 
 	/*
@@ -5824,6 +5919,9 @@ static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq)
 {
 	cfs_rq->runtime_enabled = 0;
 	INIT_LIST_HEAD(&cfs_rq->throttled_list);
+#ifdef CONFIG_SMP
+	INIT_LIST_HEAD(&cfs_rq->throttled_csd_list);
+#endif
 }
 
 void start_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
@@ -5840,12 +5938,38 @@ void start_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
 
 static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
 {
+	int __maybe_unused i;
+
 	/* init_cfs_bandwidth() was not called */
 	if (!cfs_b->throttled_cfs_rq.next)
 		return;
 
 	hrtimer_cancel(&cfs_b->period_timer);
 	hrtimer_cancel(&cfs_b->slack_timer);
+
+	/*
+	 * It is possible that we still have some cfs_rq's pending on a CSD
+	 * list, though this race is very rare. In order for this to occur, we
+	 * must have raced with the last task leaving the group while there
+	 * exist throttled cfs_rq(s), and the period_timer must have queued the
+	 * CSD item but the remote cpu has not yet processed it. To handle this,
+	 * we can simply flush all pending CSD work inline here. We're
+	 * guaranteed at this point that no additional cfs_rq of this group can
+	 * join a CSD list.
+	 */
+#ifdef CONFIG_SMP
+	for_each_possible_cpu(i) {
+		struct rq *rq = cpu_rq(i);
+		unsigned long flags;
+
+		if (list_empty(&rq->cfsb_csd_list))
+			continue;
+
+		local_irq_save(flags);
+		__cfsb_csd_unthrottle(rq);
+		local_irq_restore(flags);
+	}
+#endif
 }
 
 /*
@@ -12474,6 +12598,11 @@ __init void init_sched_fair_class(void)
 	for_each_possible_cpu(i) {
 		zalloc_cpumask_var_node(&per_cpu(load_balance_mask, i), GFP_KERNEL, cpu_to_node(i));
 		zalloc_cpumask_var_node(&per_cpu(select_rq_mask,    i), GFP_KERNEL, cpu_to_node(i));
+
+#ifdef CONFIG_CFS_BANDWIDTH
+		INIT_CSD(&cpu_rq(i)->cfsb_csd, __cfsb_csd_unthrottle, cpu_rq(i));
+		INIT_LIST_HEAD(&cpu_rq(i)->cfsb_csd_list);
+#endif
 	}
 
 	open_softirq(SCHED_SOFTIRQ, run_rebalance_domains);
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 771f8dd..b3d6e81 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -645,6 +645,9 @@ struct cfs_rq {
 	int			throttled;
 	int			throttle_count;
 	struct list_head	throttled_list;
+#ifdef CONFIG_SMP
+	struct list_head	throttled_csd_list;
+#endif
 #endif /* CONFIG_CFS_BANDWIDTH */
 #endif /* CONFIG_FAIR_GROUP_SCHED */
 };
@@ -1154,6 +1157,11 @@ struct rq {
 
 	/* Scratch cpumask to be temporarily used under rq_lock */
 	cpumask_var_t		scratch_mask;
+
+#if defined(CONFIG_CFS_BANDWIDTH) && defined(CONFIG_SMP)
+	call_single_data_t	cfsb_csd;
+	struct list_head	cfsb_csd_list;
+#endif
 };
 
 #ifdef CONFIG_FAIR_GROUP_SCHED

^ permalink raw reply related	[flat|nested] 21+ messages in thread

end of thread, other threads:[~2022-12-27 12:17 UTC | newest]

Thread overview: 21+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2022-11-17  0:54 [PATCH v3] sched: async unthrottling for cfs bandwidth Josh Don
2022-11-18 12:47 ` Peter Zijlstra
2022-11-18 19:25   ` Josh Don
2022-11-20  2:22     ` Chengming Zhou
2022-11-21 11:58       ` Peter Zijlstra
2022-11-21 19:37         ` Josh Don
2022-11-22 10:35           ` Peter Zijlstra
2022-11-25  8:57             ` Peter Zijlstra
2022-11-25  8:59               ` Peter Zijlstra
2022-11-25  9:12                 ` Peter Zijlstra
2022-11-29  1:38                   ` Josh Don
2022-11-29  1:32               ` Josh Don
2022-11-21 12:34     ` Peter Zijlstra
2022-11-21 18:02       ` Michal Koutný
2022-11-21 19:31       ` Josh Don
2022-11-22  5:55         ` Aaron Lu
2022-11-22 10:30         ` Peter Zijlstra
2022-11-22  6:08     ` Aaron Lu
2022-11-22 19:41       ` Josh Don
2022-11-24  9:12         ` Peter Zijlstra
2022-12-27 12:13 ` [tip: sched/core] sched: Async " tip-bot2 for Josh Don

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox