All of lore.kernel.org
 help / color / mirror / Atom feed
From: Andrea Righi <arighi@nvidia.com>
To: Peter Zijlstra <peterz@infradead.org>
Cc: Ingo Molnar <mingo@redhat.com>,
	Juri Lelli <juri.lelli@redhat.com>,
	Vincent Guittot <vincent.guittot@linaro.org>,
	Dietmar Eggemann <dietmar.eggemann@arm.com>,
	Steven Rostedt <rostedt@goodmis.org>,
	Ben Segall <bsegall@google.com>, Mel Gorman <mgorman@suse.de>,
	Valentin Schneider <vschneid@redhat.com>,
	K Prateek Nayak <kprateek.nayak@amd.com>,
	Christian Loehle <christian.loehle@arm.com>,
	Phil Auld <pauld@redhat.com>, Koba Ko <kobak@nvidia.com>,
	Felix Abecassis <fabecassis@nvidia.com>,
	Balbir Singh <balbirs@nvidia.com>,
	Joel Fernandes <joelagnelf@nvidia.com>,
	Shrikanth Hegde <sshegde@linux.ibm.com>,
	linux-kernel@vger.kernel.org, tim.c.chen@linux.intel.com,
	yu.c.chen@intel.com
Subject: Re: [PATCH v2 2/5] sched/fair: Attach sched_domain_shared to sd_asym_cpucapacity
Date: Mon, 18 May 2026 23:31:03 +0200	[thread overview]
Message-ID: <aguFF205TXg232l_@gpd4> (raw)
In-Reply-To: <20260518205859.GY3126523@noisy.programming.kicks-ass.net>

Hi Peter,

On Mon, May 18, 2026 at 10:58:59PM +0200, Peter Zijlstra wrote:
> On Sat, May 16, 2026 at 07:58:50AM +0200, Andrea Righi wrote:
...
> Right, so I just merged a branch that has this series with a branch that
> has the cache aware load balancing stuff on, and the result ain't
> pretty.
> 
> That cache aware thing really wants sd_llc_shared. Now, I imagine that
> for now the intersection between ASYM and SCHED_CACHE is not that
> interesting, but at the same time, I'm fairly sure that is something
> people will end up looking at.
> 
> For now, I've stomped on things and the merge holds the below. It
> builds, not tested much beyond that.
> 
> I've pushed out the whole pile into queue/sched/core.

Conceptually makes sense to me. IIUC, the cache-aware code necessarily needs
per-LLC util_avg/capacity, while the asym path needs has_idle_cores at the asym
span, so you basically restored sd_llc_shared alongside sd_balance_shared.

I'll re-run my tests with your sched/core branch and report back.

Thanks!
-Andrea

> 
> diff --cc kernel/sched/topology.c
> index f96d50131495,e47a3f72eb72..000000000000
> --- a/kernel/sched/topology.c
> +++ b/kernel/sched/topology.c
> @@@ -663,9 -670,9 +670,10 @@@ static void destroy_sched_domains(struc
>    */
>   DEFINE_PER_CPU(struct sched_domain __rcu *, sd_llc);
>   DEFINE_PER_CPU(int, sd_llc_size);
> - DEFINE_PER_CPU(int, sd_llc_id);
> + DEFINE_PER_CPU(int, sd_llc_id) = -1;
>   DEFINE_PER_CPU(int, sd_share_id);
> + DEFINE_PER_CPU(struct sched_domain_shared __rcu *, sd_llc_shared);
>  +DEFINE_PER_CPU(struct sched_domain_shared __rcu *, sd_balance_shared);
>   DEFINE_PER_CPU(struct sched_domain __rcu *, sd_numa);
>   DEFINE_PER_CPU(struct sched_domain __rcu *, sd_asym_packing);
>   DEFINE_PER_CPU(struct sched_domain __rcu *, sd_asym_cpucapacity);
> @@@ -729,6 -717,9 +718,20 @@@ static void update_top_cache_domain(in
>   
>   	sd = highest_flag_domain(cpu, SD_ASYM_PACKING);
>   	rcu_assign_pointer(per_cpu(sd_asym_packing, cpu), sd);
> + 
> + 	sd = lowest_flag_domain(cpu, SD_ASYM_CPUCAPACITY_FULL);
> ++	/*
> ++	 * The shared object is attached to sd_asym_cpucapacity only when the
> ++	 * asym domain is non-overlapping (i.e., not built from SD_NUMA).
> ++	 * On overlapping (NUMA) asym domains we fall back to letting the
> ++	 * SD_SHARE_LLC path own the shared object, so sd->shared may be NULL
> ++	 * here.
> ++	 */
> ++	if (sd && sd->shared)
> ++		sds = sd->shared;
> ++
> + 	rcu_assign_pointer(per_cpu(sd_asym_cpucapacity, cpu), sd);
> ++	rcu_assign_pointer(per_cpu(sd_balance_shared, cpu), sds);
>   }
>   
>   /*
> @@@ -2663,54 -2906,61 +2916,109 @@@ static void adjust_numa_imbalance(struc
>   	}
>   }
>   
>  +static void init_sched_domain_shared(struct s_data *d, struct sched_domain *sd)
>  +{
>  +	int sd_id = cpumask_first(sched_domain_span(sd));
>  +
>  +	sd->shared = *per_cpu_ptr(d->sds, sd_id);
>  +	/*
>  +	 * nr_busy_cpus is consumed only by the NOHZ kick path via
>  +	 * sd_balance_shared; on the asym-capacity path it is initialized but
>  +	 * never read.
>  +	 */
>  +	atomic_set(&sd->shared->nr_busy_cpus, sd->span_weight);
>  +	atomic_inc(&sd->shared->ref);
>  +}
>  +
>  +/*
>  + * For asymmetric CPU capacity, attach sched_domain_shared on the innermost
>  + * SD_ASYM_CPUCAPACITY_FULL ancestor of @cpu's base domain when that ancestor is
>  + * not an overlapping NUMA-built domain (then LLC should claim shared).
>  + *
>  + * A CPU may lack any FULL ancestor (e.g., exclusive cpuset symmetric island),
>  + * then LLC must claim shared instead.
>  + *
>  + * Note: SD_ASYM_CPUCAPACITY_FULL is only set when all CPU capacity values
>  + * are present in the domain span, so the asym domain we attach to cannot
>  + * degenerate into a single-capacity group. The relevant edge cases are instead
>  + * covered by the caveats above.
>  + *
>  + * Return true if this CPU's asym path claimed sd->shared, false otherwise.
>  + */
>  +static bool claim_asym_sched_domain_shared(struct s_data *d, int cpu)
>  +{
>  +	struct sched_domain *sd = *per_cpu_ptr(d->sd, cpu);
>  +	struct sched_domain *sd_asym;
>  +
>  +	if (!sd)
>  +		return false;
>  +
>  +	sd_asym = sd;
>  +	while (sd_asym && !(sd_asym->flags & SD_ASYM_CPUCAPACITY_FULL))
>  +		sd_asym = sd_asym->parent;
>  +
>  +	if (!sd_asym || (sd_asym->flags & SD_NUMA))
>  +		return false;
>  +
>  +	init_sched_domain_shared(d, sd_asym);
>  +	return true;
>  +}
>  +
> + static int __sched_domains_alloc_llc_id(void)
> + {
> + 	int lid, max;
> + 
> + 	lockdep_assert_held(&sched_domains_mutex);
> + 
> + 	lid = cpumask_first_zero(sched_domains_llc_id_allocmask);
> + 	/*
> + 	 * llc_id space should never grow larger than the
> + 	 * possible number of CPUs in the system.
> + 	 */
> + 	if (lid >= nr_cpu_ids)
> + 		return -1;
> + 
> + 	__cpumask_set_cpu(lid, sched_domains_llc_id_allocmask);
> + 	max = cpumask_last(sched_domains_llc_id_allocmask);
> + 	if (max > max_lid)
> + 		max_lid = max;
> + 
> + 	return lid;
> + }
> + 
> + static void __sched_domains_free_llc_id(int cpu)
> + {
> + 	int i, lid, max;
> + 
> + 	lockdep_assert_held(&sched_domains_mutex);
> + 
> + 	lid = per_cpu(sd_llc_id, cpu);
> + 	if (lid == -1 || lid >= nr_cpu_ids)
> + 		return;
> + 
> + 	per_cpu(sd_llc_id, cpu) = -1;
> + 
> + 	for_each_cpu(i, llc_mask(cpu)) {
> + 		/* An online CPU owns the llc_id. */
> + 		if (per_cpu(sd_llc_id, i) == lid)
> + 			return;
> + 	}
> + 
> + 	__cpumask_clear_cpu(lid, sched_domains_llc_id_allocmask);
> + 
> + 	max = cpumask_last(sched_domains_llc_id_allocmask);
> + 	/* shrink max lid to save memory */
> + 	if (max < max_lid)
> + 		max_lid = max;
> + }
> + 
> + void sched_domains_free_llc_id(int cpu)
> + {
> + 	sched_domains_mutex_lock();
> + 	__sched_domains_free_llc_id(cpu);
> + 	sched_domains_mutex_unlock();
> + }
> + 
>   /*
>    * Build sched domains for a given set of CPUs and attach the sched domains
>    * to the individual CPUs
> @@@ -2775,20 -3049,16 +3107,15 @@@ build_sched_domains(const struct cpumas
>   		if (!sd)
>   			continue;
>   
>  +		if (has_asym)
> - 			asym_claimed = claim_asym_sched_domain_shared(&d, i);
> ++			claim_asym_sched_domain_shared(&d, i);
>  +
>   		/* First, find the topmost SD_SHARE_LLC domain */
>   		while (sd->parent && (sd->parent->flags & SD_SHARE_LLC))
>   			sd = sd->parent;
>   
>   		if (sd->flags & SD_SHARE_LLC) {
> - 			/*
> - 			 * Initialize the sd->shared for SD_SHARE_LLC unless
> - 			 * the asym path above already claimed it.
> - 			 */
> - 			if (!asym_claimed)
> - 				init_sched_domain_shared(&d, sd);
>  -			int sd_id = cpumask_first(sched_domain_span(sd));
>  -
>  -			sd->shared = *per_cpu_ptr(d.sds, sd_id);
>  -			atomic_set(&sd->shared->nr_busy_cpus, sd->span_weight);
>  -			atomic_inc(&sd->shared->ref);
> ++			init_sched_domain_shared(&d, sd);
>   
>   			/*
>   			 * In presence of higher domains, adjust the
> diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
> index 9dd4a94801c9..300320b0248a 100644
> --- a/kernel/sched/sched.h
> +++ b/kernel/sched/sched.h
> @@ -2191,6 +2191,7 @@ DECLARE_PER_CPU(struct sched_domain __rcu *, sd_llc);
>  DECLARE_PER_CPU(int, sd_llc_size);
>  DECLARE_PER_CPU(int, sd_llc_id);
>  DECLARE_PER_CPU(int, sd_share_id);
> +DECLARE_PER_CPU(struct sched_domain_shared __rcu *, sd_llc_shared);
>  DECLARE_PER_CPU(struct sched_domain_shared __rcu *, sd_balance_shared);
>  DECLARE_PER_CPU(struct sched_domain __rcu *, sd_numa);
>  DECLARE_PER_CPU(struct sched_domain __rcu *, sd_asym_packing);

  reply	other threads:[~2026-05-18 21:31 UTC|newest]

Thread overview: 38+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2026-05-09 18:07 [PATCH v6 0/5 RESEND] sched/fair: SMT-aware asymmetric CPU capacity Andrea Righi
2026-05-09 18:07 ` [PATCH 1/5] sched/fair: Drop redundant RCU read lock in NOHZ kick path Andrea Righi
2026-05-11 13:04   ` Vincent Guittot
2026-05-15  6:49   ` Shrikanth Hegde
2026-05-16  5:45     ` Andrea Righi
2026-05-16 17:15       ` Shrikanth Hegde
2026-05-20  8:34   ` [tip: sched/core] " tip-bot2 for Andrea Righi
2026-05-21 19:47   ` [PATCH 1/5] " Marek Szyprowski
2026-05-21 20:13     ` Andrea Righi
2026-05-09 18:07 ` [PATCH 2/5] sched/fair: Attach sched_domain_shared to sd_asym_cpucapacity Andrea Righi
2026-05-11 13:04   ` Vincent Guittot
2026-05-15 10:05   ` Shrikanth Hegde
2026-05-16  5:58     ` [PATCH v2 " Andrea Righi
2026-05-16 17:19       ` Shrikanth Hegde
2026-05-18 20:58       ` Peter Zijlstra
2026-05-18 21:31         ` Andrea Righi [this message]
2026-05-19  5:52         ` K Prateek Nayak
2026-05-19  6:43           ` Andrea Righi
2026-05-19  7:47             ` K Prateek Nayak
2026-05-19  7:54               ` Andrea Righi
2026-05-19  8:46           ` Peter Zijlstra
2026-05-19 11:27             ` K Prateek Nayak
2026-05-19 11:47               ` Peter Zijlstra
2026-05-25  8:30                 ` Chen, Yu C
2026-05-20  8:34       ` [tip: sched/core] " tip-bot2 for K Prateek Nayak
2026-05-09 18:07 ` [PATCH 3/5] sched/fair: Prefer fully-idle SMT cores in asym-capacity idle selection Andrea Righi
2026-05-11 13:07   ` Vincent Guittot
2026-05-11 13:45     ` Andrea Righi
2026-05-11 14:25     ` [PATCH v2 " Andrea Righi
2026-05-20  8:34       ` [tip: sched/core] " tip-bot2 for Andrea Righi
2026-05-09 18:07 ` [PATCH 4/5] sched/fair: Reject misfit pulls onto busy SMT siblings on asym-capacity Andrea Righi
2026-05-11 13:07   ` Vincent Guittot
2026-05-15 10:09   ` Shrikanth Hegde
2026-05-16  9:04     ` Andrea Righi
2026-05-20  8:34   ` [tip: sched/core] " tip-bot2 for Andrea Righi
2026-05-09 18:07 ` [PATCH 5/5] sched/fair: Add SIS_UTIL support to select_idle_capacity() Andrea Righi
2026-05-11 13:08   ` Vincent Guittot
2026-05-20  8:34   ` [tip: sched/core] " tip-bot2 for K Prateek Nayak

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=aguFF205TXg232l_@gpd4 \
    --to=arighi@nvidia.com \
    --cc=balbirs@nvidia.com \
    --cc=bsegall@google.com \
    --cc=christian.loehle@arm.com \
    --cc=dietmar.eggemann@arm.com \
    --cc=fabecassis@nvidia.com \
    --cc=joelagnelf@nvidia.com \
    --cc=juri.lelli@redhat.com \
    --cc=kobak@nvidia.com \
    --cc=kprateek.nayak@amd.com \
    --cc=linux-kernel@vger.kernel.org \
    --cc=mgorman@suse.de \
    --cc=mingo@redhat.com \
    --cc=pauld@redhat.com \
    --cc=peterz@infradead.org \
    --cc=rostedt@goodmis.org \
    --cc=sshegde@linux.ibm.com \
    --cc=tim.c.chen@linux.intel.com \
    --cc=vincent.guittot@linaro.org \
    --cc=vschneid@redhat.com \
    --cc=yu.c.chen@intel.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.