Re: [PATCH 3/4] sched_ext: Introduce SCX_OPS_NODE_BUILTIN_IDLE

All of lore.kernel.org
 help / color / mirror / Atom feed

From: Yury Norov <yury.norov@gmail.com>
To: Andrea Righi <arighi@nvidia.com>
Cc: Tejun Heo <tj@kernel.org>, David Vernet <void@manifault.com>,
	Changwoo Min <changwoo@igalia.com>,
	linux-kernel@vger.kernel.org
Subject: Re: [PATCH 3/4] sched_ext: Introduce SCX_OPS_NODE_BUILTIN_IDLE
Date: Wed, 11 Dec 2024 10:21:49 -0800	[thread overview]
Message-ID: <Z1nYPWeJvmizCvJJ@yury-ThinkPad> (raw)
In-Reply-To: <20241209104632.718085-4-arighi@nvidia.com>

On Mon, Dec 09, 2024 at 11:40:57AM +0100, Andrea Righi wrote:
> Add a flag (SCX_OPS_NODE_BUILTIN_IDLE) to toggle between a global flat
> idle cpumask and multiple per-node CPU masks.
> 
> This allows each sched_ext scheduler to choose between the new or old
> model, preserving backward compatibility and preventing disruptions to
> existing behavior.
> 
> Signed-off-by: Andrea Righi <arighi@nvidia.com>
> ---
>  kernel/sched/ext.c | 56 +++++++++++++++++++++++++++++++++++++++++-----
>  1 file changed, 50 insertions(+), 6 deletions(-)
> 
> diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
> index ed2f0d13915c..d0d57323bcfc 100644
> --- a/kernel/sched/ext.c
> +++ b/kernel/sched/ext.c
> @@ -122,6 +122,12 @@ enum scx_ops_flags {
>  	 */
>  	SCX_OPS_SWITCH_PARTIAL	= 1LLU << 3,
>  
> +	/*
> +	 * If set, enable per-node idle cpumasks. If clear, use a single global
> +	 * flat idle cpumask.
> +	 */
> +	SCX_OPS_BUILTIN_IDLE_PER_NODE = 1LLU << 4,
> +
>  	/*
>  	 * CPU cgroup support flags
>  	 */
> @@ -131,6 +137,7 @@ enum scx_ops_flags {
>  				  SCX_OPS_ENQ_LAST |
>  				  SCX_OPS_ENQ_EXITING |
>  				  SCX_OPS_SWITCH_PARTIAL |
> +				  SCX_OPS_BUILTIN_IDLE_PER_NODE |
>  				  SCX_OPS_HAS_CGROUP_WEIGHT,
>  };
>  
> @@ -886,6 +893,7 @@ static DEFINE_STATIC_KEY_FALSE(scx_builtin_idle_enabled);
>  
>  #ifdef CONFIG_SMP
>  static DEFINE_STATIC_KEY_FALSE(scx_selcpu_topo_llc);
> +static DEFINE_STATIC_KEY_FALSE(scx_builtin_idle_per_node);
>  #endif
>  
>  static struct static_key_false scx_has_op[SCX_OPI_END] =
> @@ -929,18 +937,32 @@ static struct delayed_work scx_watchdog_work;
>  #define CL_ALIGNED_IF_ONSTACK __cacheline_aligned_in_smp
>  #endif
>  
> -static struct {
> +struct idle_cpumask {
>  	cpumask_var_t cpu;
>  	cpumask_var_t smt;
> -} **idle_masks CL_ALIGNED_IF_ONSTACK;
> +};
> +
> +/*
> + * cpumasks to track idle CPUs within each NUMA node.
> + *
> + * If SCX_OPS_BUILTIN_IDLE_PER_NODE is not specified, a single flat cpumask
> + * from node 0 is used to track all idle CPUs system-wide.
> + */
> +static struct idle_cpumask **idle_masks CL_ALIGNED_IF_ONSTACK;
>  
>  static struct cpumask *get_idle_cpumask_node(int node)
>  {
> +	if (!static_branch_maybe(CONFIG_NUMA, &scx_builtin_idle_per_node))
> +		return idle_masks[0]->cpu;
> +
>  	return idle_masks[node]->cpu;
>  }
>  
>  static struct cpumask *get_idle_smtmask_node(int node)
>  {
> +	if (!static_branch_maybe(CONFIG_NUMA, &scx_builtin_idle_per_node))
> +		return idle_masks[0]->smt;
> +
>  	return idle_masks[node]->smt;
>  }
>  
> @@ -3423,7 +3445,7 @@ static bool llc_numa_mismatch(void)
>   * CPU belongs to a single LLC domain, and that each LLC domain is entirely
>   * contained within a single NUMA node.
>   */
> -static void update_selcpu_topology(void)
> +static void update_selcpu_topology(struct sched_ext_ops *ops)
>  {
>  	bool enable_llc = false;
>  	unsigned int nr_cpus;
> @@ -3442,8 +3464,16 @@ static void update_selcpu_topology(void)
>  	rcu_read_lock();
>  	nr_cpus = llc_weight(cpu);
>  	if (nr_cpus > 0) {
> -		if ((nr_cpus < num_online_cpus()) && llc_numa_mismatch())
> +		if (nr_cpus < num_online_cpus())
>  			enable_llc = true;
> +		/*
> +		 * No need to enable LLC optimization if the LLC domains are
> +		 * perfectly overlapping with the NUMA domains when per-node
> +		 * cpumasks are enabled.
> +		 */
> +		if ((ops->flags & SCX_OPS_BUILTIN_IDLE_PER_NODE) &&
> +		    !llc_numa_mismatch())
> +			enable_llc = false;
>  		pr_debug("sched_ext: LLC=%*pb weight=%u\n",
>  			 cpumask_pr_args(llc_span(cpu)), llc_weight(cpu));
>  	}
> @@ -3456,6 +3486,14 @@ static void update_selcpu_topology(void)
>  		static_branch_enable_cpuslocked(&scx_selcpu_topo_llc);
>  	else
>  		static_branch_disable_cpuslocked(&scx_selcpu_topo_llc);
> +
> +	/*
> +	 * Check if we need to enable per-node cpumasks.
> +	 */
> +	if (ops->flags & SCX_OPS_BUILTIN_IDLE_PER_NODE)
> +		static_branch_enable_cpuslocked(&scx_builtin_idle_per_node);
> +	else
> +		static_branch_disable_cpuslocked(&scx_builtin_idle_per_node);
>  }

The patch that introduces the flag should go the very first in the series,
but should unconditionally disable scx_builtin_idle_per_node.

The following patches should add all the machinery you need. The machinery
should be conditional on the scx_builtin_idle_per_node, i.e. disabled for
a while.

Doing that, you'll be able to introduce your functionality as a whole:

   static struct cpumask *get_idle_cpumask_node(int node)
   {
   	if (!static_branch_maybe(CONFIG_NUMA, &scx_builtin_idle_per_node))
   		return idle_masks[0]->cpu;
   
   	return idle_masks[node]->cpu;
   }

Much better than patching just introduced code, right?

The very last patch should only be a chunk that enables scx_builtin_idle_per_node
based on SCX_OPS_BUILTIN_IDLE_PER_NODE.

This way, when your feature will get merged, from git-bisect perspective
it will be enabled atomically by the very last patch, but those interested
in internals will have nice coherent history.

Thanks,
Yury

>  
>  /*
> @@ -3683,6 +3721,12 @@ static void reset_idle_masks(void)
>  {
>  	int node;
>  
> +	if (!static_branch_maybe(CONFIG_NUMA, &scx_builtin_idle_per_node)) {
> +		cpumask_copy(get_idle_cpumask_node(0), cpu_online_mask);
> +		cpumask_copy(get_idle_smtmask_node(0), cpu_online_mask);
> +		return;
> +	}
> +
>  	/*
>  	 * Consider all online cpus idle. Should converge to the actual state
>  	 * quickly.
> @@ -3740,7 +3784,7 @@ static void handle_hotplug(struct rq *rq, bool online)
>  	atomic_long_inc(&scx_hotplug_seq);
>  
>  	if (scx_enabled())
> -		update_selcpu_topology();
> +		update_selcpu_topology(&scx_ops);
>  
>  	if (online && SCX_HAS_OP(cpu_online))
>  		SCX_CALL_OP(SCX_KF_UNLOCKED, cpu_online, cpu);
> @@ -5618,7 +5662,7 @@ static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link)
>  
>  	check_hotplug_seq(ops);
>  #ifdef CONFIG_SMP
> -	update_selcpu_topology();
> +	update_selcpu_topology(ops);
>  #endif
>  	cpus_read_unlock();
>  
> -- 
> 2.47.1

next prev parent reply	other threads:[~2024-12-11 18:21 UTC|newest]

Thread overview: 20+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2024-12-09 10:40 [PATCHSET v5 sched_ext/for-6.14] sched_ext: split global idle cpumask into per-NUMA cpumasks Andrea Righi
2024-12-09 10:40 ` [PATCH 1/4] sched_ext: Introduce per-NUMA idle cpumasks Andrea Righi
2024-12-09 19:32   ` Yury Norov
2024-12-09 20:40     ` Andrea Righi
2024-12-10  0:14     ` Andrea Righi
2024-12-10  2:10       ` Yury Norov
2024-12-14  6:05         ` Andrea Righi
2024-12-11 17:46   ` Yury Norov
2024-12-09 10:40 ` [PATCH 2/4] sched_ext: Get rid of the scx_selcpu_topo_numa logic Andrea Righi
2024-12-11  8:05   ` Changwoo Min
2024-12-11 12:22     ` Andrea Righi
2024-12-09 10:40 ` [PATCH 3/4] sched_ext: Introduce SCX_OPS_NODE_BUILTIN_IDLE Andrea Righi
2024-12-11 18:21   ` Yury Norov [this message]
2024-12-11 19:59     ` Andrea Righi
2024-12-09 10:40 ` [PATCH 4/4] sched_ext: Introduce NUMA aware idle cpu kfunc helpers Andrea Righi
2024-12-11 17:43   ` Yury Norov
2024-12-11 20:20     ` Andrea Righi
2024-12-11 20:47       ` Yury Norov
2024-12-11 20:55         ` Andrea Righi
  -- strict thread matches above, loose matches on Subject: below --
2024-12-05 21:00 [PATCHSET v4 sched_ext/for-6.14] sched_ext: split global idle cpumask into per-NUMA cpumasks Andrea Righi
2024-12-05 21:00 ` [PATCH 3/4] sched_ext: Introduce SCX_OPS_NODE_BUILTIN_IDLE Andrea Righi

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=Z1nYPWeJvmizCvJJ@yury-ThinkPad \
    --to=yury.norov@gmail.com \
    --cc=arighi@nvidia.com \
    --cc=changwoo@igalia.com \
    --cc=linux-kernel@vger.kernel.org \
    --cc=tj@kernel.org \
    --cc=void@manifault.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

Be sure your reply has a Subject: header at the top and a blank line before the message body.

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.