linux-kernel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
From: Shrikanth Hegde <sshegde@linux.ibm.com>
To: Tim Chen <tim.c.chen@linux.intel.com>,
	Peter Zijlstra <peterz@infradead.org>,
	Ingo Molnar <mingo@redhat.com>,
	K Prateek Nayak <kprateek.nayak@amd.com>,
	"Gautham R . Shenoy" <gautham.shenoy@amd.com>
Cc: Chen Yu <yu.c.chen@intel.com>, Juri Lelli <juri.lelli@redhat.com>,
	Dietmar Eggemann <dietmar.eggemann@arm.com>,
	Steven Rostedt <rostedt@goodmis.org>,
	Ben Segall <bsegall@google.com>, Mel Gorman <mgorman@suse.de>,
	Valentin Schneider <vschneid@redhat.com>,
	Tim Chen <tim.c.chen@intel.com>,
	Vincent Guittot <vincent.guittot@linaro.org>,
	Libo Chen <libo.chen@oracle.com>,
	Abel Wu <wuyun.abel@bytedance.com>,
	Madadi Vineeth Reddy <vineethr@linux.ibm.com>,
	Hillf Danton <hdanton@sina.com>, Len Brown <len.brown@intel.com>,
	linux-kernel@vger.kernel.org
Subject: Re: [RFC patch v3 02/20] sched: Several fixes for cache aware scheduling
Date: Fri, 4 Jul 2025 01:03:13 +0530	[thread overview]
Message-ID: <398a83d7-0a8c-42cb-af66-5974582cc2ae@linux.ibm.com> (raw)
In-Reply-To: <d73418022de76dab9f60c0c5432d783b3b2833dc.1750268218.git.tim.c.chen@linux.intel.com>



On 6/18/25 23:57, Tim Chen wrote:
> From: Chen Yu <yu.c.chen@intel.com>
> 
> 1. Fix compile error on percpu allocation.
> 2. Enqueue to the target CPU rather than the current CPU.
> 3. NULL LLC sched domain check(Libo Chen).
> 4. Introduce sched feature SCHED_CACHE to control cache aware scheduling
> 5. Fix unsigned occupancy initialization to -1.
> 6. If there is only 1 thread in the process, no need to enable cache
>     awareness
> 7. Add __maybe_unused to __migrate_degrades_locality() to
>     avoid compile warnings.
> 
> Signed-off-by: Chen Yu <yu.c.chen@intel.com>
> ---
>   include/linux/mm_types.h |  4 ++--
>   kernel/sched/fair.c      | 27 ++++++++++++++++-----------
>   kernel/sched/features.h  |  1 +
>   3 files changed, 19 insertions(+), 13 deletions(-)
> 
> diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
> index 013291c6aaa2..9de4a0a13c4d 100644
> --- a/include/linux/mm_types.h
> +++ b/include/linux/mm_types.h
> @@ -1411,11 +1411,11 @@ static inline void mm_set_cpus_allowed(struct mm_struct *mm, const struct cpumas
>   #endif /* CONFIG_SCHED_MM_CID */
>   
>   #ifdef CONFIG_SCHED_CACHE
> -extern void mm_init_sched(struct mm_struct *mm, struct mm_sched *pcpu_sched);
> +extern void mm_init_sched(struct mm_struct *mm, struct mm_sched __percpu *pcpu_sched);
>   
>   static inline int mm_alloc_sched_noprof(struct mm_struct *mm)
>   {
> -	struct mm_sched *pcpu_sched = alloc_percpu_noprof(struct mm_sched);
> +	struct mm_sched __percpu *pcpu_sched = alloc_percpu_noprof(struct mm_sched);
>   	if (!pcpu_sched)
>   		return -ENOMEM;
>   
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index df7d4a324fbe..89db97f8ef02 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -1175,7 +1175,7 @@ static s64 update_curr_se(struct rq *rq, struct sched_entity *curr)
>   #define EPOCH_PERIOD	(HZ/100)	/* 10 ms */
>   #define EPOCH_OLD	5		/* 50 ms */
>   
> -void mm_init_sched(struct mm_struct *mm, struct mm_sched *_pcpu_sched)
> +void mm_init_sched(struct mm_struct *mm, struct mm_sched __percpu *_pcpu_sched)
>   {
>   	unsigned long epoch;
>   	int i;
> @@ -1186,7 +1186,7 @@ void mm_init_sched(struct mm_struct *mm, struct mm_sched *_pcpu_sched)
>   
>   		pcpu_sched->runtime = 0;
>   		pcpu_sched->epoch = epoch = rq->cpu_epoch;
> -		pcpu_sched->occ = -1;
> +		pcpu_sched->occ = 0;
>   	}
>   
>   	raw_spin_lock_init(&mm->mm_sched_lock);
> @@ -1254,7 +1254,7 @@ void account_mm_sched(struct rq *rq, struct task_struct *p, s64 delta_exec)
>   	if (!mm || !mm->pcpu_sched)
>   		return;
>   
> -	pcpu_sched = this_cpu_ptr(p->mm->pcpu_sched);
> +	pcpu_sched = per_cpu_ptr(p->mm->pcpu_sched, cpu_of(rq));
>   
>   	scoped_guard (raw_spinlock, &rq->cpu_epoch_lock) {
>   		__update_mm_sched(rq, pcpu_sched);
> @@ -1264,12 +1264,14 @@ void account_mm_sched(struct rq *rq, struct task_struct *p, s64 delta_exec)
>   	}
>   
>   	/*
> -	 * If this task hasn't hit task_cache_work() for a while, invalidate
> +	 * If this task hasn't hit task_cache_work() for a while, or it
> +	 * has only 1 thread, invalidate
>   	 * it's preferred state.
>   	 */
> -	if (epoch - READ_ONCE(mm->mm_sched_epoch) > EPOCH_OLD) {
> +	if (epoch - READ_ONCE(mm->mm_sched_epoch) > EPOCH_OLD ||
> +	    get_nr_threads(p) <= 1) {
>   		mm->mm_sched_cpu = -1;
> -		pcpu_sched->occ = -1;
> +		pcpu_sched->occ = 0;
>   	}
>   }
>   
> @@ -1286,9 +1288,6 @@ static void task_tick_cache(struct rq *rq, struct task_struct *p)
>   
>   	guard(raw_spinlock)(&mm->mm_sched_lock);
>   
> -	if (mm->mm_sched_epoch == rq->cpu_epoch)
> -		return;
> -
>   	if (work->next == work) {
>   		task_work_add(p, work, TWA_RESUME);
>   		WRITE_ONCE(mm->mm_sched_epoch, rq->cpu_epoch);
> @@ -1322,6 +1321,9 @@ static void task_cache_work(struct callback_head *work)
>   			unsigned long occ, m_occ = 0, a_occ = 0;
>   			int m_cpu = -1, nr = 0, i;
>   
> +			if (!sd)
> +				continue;
> +
>   			for_each_cpu(i, sched_domain_span(sd)) {
>   				occ = fraction_mm_sched(cpu_rq(i),
>   							per_cpu_ptr(mm->pcpu_sched, i));
> @@ -8801,6 +8803,9 @@ static int select_cache_cpu(struct task_struct *p, int prev_cpu)
>   	struct mm_struct *mm = p->mm;
>   	int cpu;
>   
> +	if (!sched_feat(SCHED_CACHE))
> +		return prev_cpu;
> +
>   	if (!mm || p->nr_cpus_allowed == 1)
>   		return prev_cpu;
>   
> @@ -9555,7 +9560,7 @@ static int task_hot(struct task_struct *p, struct lb_env *env)
>   		return 0;
>   
>   #ifdef CONFIG_SCHED_CACHE
> -	if (p->mm && p->mm->pcpu_sched) {
> +	if (sched_feat(SCHED_CACHE) && p->mm && p->mm->pcpu_sched) {
>   		/*
>   		 * XXX things like Skylake have non-inclusive L3 and might not
>   		 * like this L3 centric view. What to do about L2 stickyness ?
> @@ -9633,7 +9638,7 @@ static long migrate_degrades_locality(struct task_struct *p, struct lb_env *env)
>   }
>   
>   #else
> -static long __migrate_degrades_locality(struct task_struct *p, int src_cpu, int dst_cpu, bool idle)
> +static __maybe_unused long __migrate_degrades_locality(struct task_struct *p, int src_cpu, int dst_cpu, bool idle)
>   {
>   	return 0;
>   }
> diff --git a/kernel/sched/features.h b/kernel/sched/features.h
> index 3c12d9f93331..d2af7bfd36bf 100644
> --- a/kernel/sched/features.h
> +++ b/kernel/sched/features.h
> @@ -87,6 +87,7 @@ SCHED_FEAT(TTWU_QUEUE, true)
>    */
>   SCHED_FEAT(SIS_UTIL, true)
>   
> +SCHED_FEAT(SCHED_CACHE, true)

Having both SCHED_FEAT and CONFIG_SCHED_CACHE seems like overkill.
Is it really necessary to have both?

Also, given the complexity it brings, and since only workloads that spawn threads
with data sharing among them benefit, it could be false by default.

>   /*
>    * Issue a WARN when we do multiple update_rq_clock() calls
>    * in a single rq->lock section. Default disabled because the


  reply	other threads:[~2025-07-03 19:33 UTC|newest]

Thread overview: 68+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2025-06-18 18:27 [RFC patch v3 00/20] Cache aware scheduling Tim Chen
2025-06-18 18:27 ` [RFC patch v3 01/20] sched: Cache aware load-balancing Tim Chen
2025-06-26 12:23   ` Jianyong Wu
2025-06-26 13:32     ` Chen, Yu C
2025-06-27  0:10       ` Tim Chen
2025-06-27  2:13         ` Jianyong Wu
2025-07-03 19:29   ` Shrikanth Hegde
2025-07-04  8:40     ` Chen, Yu C
2025-07-04  8:45       ` Peter Zijlstra
2025-07-04  8:54         ` Shrikanth Hegde
2025-07-07 19:57     ` Tim Chen
2025-06-18 18:27 ` [RFC patch v3 02/20] sched: Several fixes for cache aware scheduling Tim Chen
2025-07-03 19:33   ` Shrikanth Hegde [this message]
2025-07-07 21:02     ` Tim Chen
2025-07-08  1:15   ` Libo Chen
2025-07-08  7:54     ` Chen, Yu C
2025-07-08 15:47       ` Libo Chen
2025-06-18 18:27 ` [RFC patch v3 03/20] sched: Avoid task migration within its preferred LLC Tim Chen
2025-06-18 18:27 ` [RFC patch v3 04/20] sched: Avoid calculating the cpumask if the system is overloaded Tim Chen
2025-07-03 19:39   ` Shrikanth Hegde
2025-07-07 14:57     ` Tim Chen
2025-06-18 18:27 ` [RFC patch v3 05/20] sched: Add hysteresis to switch a task's preferred LLC Tim Chen
2025-07-02  6:47   ` Madadi Vineeth Reddy
2025-07-02 21:47     ` Tim Chen
2025-06-18 18:27 ` [RFC patch v3 06/20] sched: Save the per LLC utilization for better cache aware scheduling Tim Chen
2025-06-18 18:27 ` [RFC patch v3 07/20] sched: Add helper function to decide whether to allow " Tim Chen
2025-07-08  0:41   ` Libo Chen
2025-07-08  8:29     ` Chen, Yu C
2025-07-08 17:22       ` Libo Chen
2025-07-09 14:41         ` Chen, Yu C
2025-07-09 21:31           ` Libo Chen
2025-07-08 21:59     ` Tim Chen
2025-07-09 21:22       ` Libo Chen
2025-06-18 18:27 ` [RFC patch v3 08/20] sched: Set up LLC indexing Tim Chen
2025-07-03 19:44   ` Shrikanth Hegde
2025-07-04  9:36     ` Chen, Yu C
2025-06-18 18:27 ` [RFC patch v3 09/20] sched: Introduce task preferred LLC field Tim Chen
2025-06-18 18:27 ` [RFC patch v3 10/20] sched: Calculate the number of tasks that have LLC preference on a runqueue Tim Chen
2025-07-03 19:45   ` Shrikanth Hegde
2025-07-04 15:00     ` Chen, Yu C
2025-06-18 18:27 ` [RFC patch v3 11/20] sched: Introduce per runqueue task LLC preference counter Tim Chen
2025-06-18 18:28 ` [RFC patch v3 12/20] sched: Calculate the total number of preferred LLC tasks during load balance Tim Chen
2025-06-18 18:28 ` [RFC patch v3 13/20] sched: Tag the sched group as llc_balance if it has tasks prefer other LLC Tim Chen
2025-06-18 18:28 ` [RFC patch v3 14/20] sched: Introduce update_llc_busiest() to deal with groups having preferred LLC tasks Tim Chen
2025-07-03 19:52   ` Shrikanth Hegde
2025-07-05  2:26     ` Chen, Yu C
2025-06-18 18:28 ` [RFC patch v3 15/20] sched: Introduce a new migration_type to track the preferred LLC load balance Tim Chen
2025-06-18 18:28 ` [RFC patch v3 16/20] sched: Consider LLC locality for active balance Tim Chen
2025-06-18 18:28 ` [RFC patch v3 17/20] sched: Consider LLC preference when picking tasks from busiest queue Tim Chen
2025-06-18 18:28 ` [RFC patch v3 18/20] sched: Do not migrate task if it is moving out of its preferred LLC Tim Chen
2025-06-18 18:28 ` [RFC patch v3 19/20] sched: Introduce SCHED_CACHE_LB to control cache aware load balance Tim Chen
2025-06-18 18:28 ` [RFC patch v3 20/20] sched: Introduce SCHED_CACHE_WAKE to control LLC aggregation on wake up Tim Chen
2025-06-19  6:39 ` [RFC patch v3 00/20] Cache aware scheduling Yangyu Chen
2025-06-19 13:21   ` Chen, Yu C
2025-06-19 14:12     ` Yangyu Chen
2025-06-20 19:25 ` Madadi Vineeth Reddy
2025-06-22  0:39   ` Chen, Yu C
2025-06-24 17:47     ` Madadi Vineeth Reddy
2025-06-23 16:45   ` Tim Chen
2025-06-24  5:00 ` K Prateek Nayak
2025-06-24 12:16   ` Chen, Yu C
2025-06-25  4:19     ` K Prateek Nayak
2025-06-25  0:30   ` Tim Chen
2025-06-25  4:30     ` K Prateek Nayak
2025-07-03 20:00   ` Shrikanth Hegde
2025-07-04 10:09     ` Chen, Yu C
2025-07-09 19:39 ` Madadi Vineeth Reddy
2025-07-10  3:33   ` Chen, Yu C

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=398a83d7-0a8c-42cb-af66-5974582cc2ae@linux.ibm.com \
    --to=sshegde@linux.ibm.com \
    --cc=bsegall@google.com \
    --cc=dietmar.eggemann@arm.com \
    --cc=gautham.shenoy@amd.com \
    --cc=hdanton@sina.com \
    --cc=juri.lelli@redhat.com \
    --cc=kprateek.nayak@amd.com \
    --cc=len.brown@intel.com \
    --cc=libo.chen@oracle.com \
    --cc=linux-kernel@vger.kernel.org \
    --cc=mgorman@suse.de \
    --cc=mingo@redhat.com \
    --cc=peterz@infradead.org \
    --cc=rostedt@goodmis.org \
    --cc=tim.c.chen@intel.com \
    --cc=tim.c.chen@linux.intel.com \
    --cc=vincent.guittot@linaro.org \
    --cc=vineethr@linux.ibm.com \
    --cc=vschneid@redhat.com \
    --cc=wuyun.abel@bytedance.com \
    --cc=yu.c.chen@intel.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).