Re: [PATCH 6/8] sched_ext: idle: Introduce SCX_OPS_BUILTIN_IDLE_PER_NODE

All of lore.kernel.org
 help / color / mirror / Atom feed

From: Yury Norov <yury.norov@gmail.com>
To: Andrea Righi <arighi@nvidia.com>
Cc: Tejun Heo <tj@kernel.org>, David Vernet <void@manifault.com>,
	Changwoo Min <changwoo@igalia.com>,
	Ingo Molnar <mingo@redhat.com>,
	Peter Zijlstra <peterz@infradead.org>,
	Juri Lelli <juri.lelli@redhat.com>,
	Vincent Guittot <vincent.guittot@linaro.org>,
	Dietmar Eggemann <dietmar.eggemann@arm.com>,
	Steven Rostedt <rostedt@goodmis.org>,
	Ben Segall <bsegall@google.com>, Mel Gorman <mgorman@suse.de>,
	Valentin Schneider <vschneid@redhat.com>,
	Joel Fernandes <joel@joelfernandes.org>,
	Ian May <ianm@nvidia.com>,
	bpf@vger.kernel.org, linux-kernel@vger.kernel.org
Subject: Re: [PATCH 6/8] sched_ext: idle: Introduce SCX_OPS_BUILTIN_IDLE_PER_NODE
Date: Fri, 14 Feb 2025 16:18:23 -0500	[thread overview]
Message-ID: <Z6-zH3Gh87KC0ykb@thinkpad> (raw)
In-Reply-To: <20250214194134.658939-7-arighi@nvidia.com>

On Fri, Feb 14, 2025 at 08:40:05PM +0100, Andrea Righi wrote:
> Add the new scheduler flag SCX_OPS_BUILTIN_IDLE_PER_NODE, which allows
> BPF schedulers to select between using a global flat idle cpumask or
> multiple per-node cpumasks.
> 
> This only introduces the flag and the mechanism to enable/disable this
> feature without affecting any scheduling behavior.
> 
> Cc: Yury Norov [NVIDIA] <yury.norov@gmail.com>
> Signed-off-by: Andrea Righi <arighi@nvidia.com>

Reviewed-by: Yury Norov [NVIDIA] <yury.norov@gmail.com>

> ---
>  kernel/sched/ext.c                   | 21 ++++++++++++++++++--
>  kernel/sched/ext_idle.c              | 29 +++++++++++++++++++++-------
>  kernel/sched/ext_idle.h              |  4 ++--
>  tools/sched_ext/include/scx/compat.h |  3 +++
>  4 files changed, 46 insertions(+), 11 deletions(-)
> 
> diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
> index 7c17e05ed15b1..330a359d79301 100644
> --- a/kernel/sched/ext.c
> +++ b/kernel/sched/ext.c
> @@ -154,6 +154,12 @@ enum scx_ops_flags {
>  	 */
>  	SCX_OPS_ALLOW_QUEUED_WAKEUP	= 1LLU << 5,
>  
> +	/*
> +	 * If set, enable per-node idle cpumasks. If clear, use a single global
> +	 * flat idle cpumask.
> +	 */
> +	SCX_OPS_BUILTIN_IDLE_PER_NODE	= 1LLU << 6,
> +
>  	/*
>  	 * CPU cgroup support flags
>  	 */
> @@ -165,6 +171,7 @@ enum scx_ops_flags {
>  				  SCX_OPS_ENQ_MIGRATION_DISABLED |
>  				  SCX_OPS_ALLOW_QUEUED_WAKEUP |
>  				  SCX_OPS_SWITCH_PARTIAL |
> +				  SCX_OPS_BUILTIN_IDLE_PER_NODE |
>  				  SCX_OPS_HAS_CGROUP_WEIGHT,
>  };
>  
> @@ -3427,7 +3434,7 @@ static void handle_hotplug(struct rq *rq, bool online)
>  	atomic_long_inc(&scx_hotplug_seq);
>  
>  	if (scx_enabled())
> -		scx_idle_update_selcpu_topology();
> +		scx_idle_update_selcpu_topology(&scx_ops);
>  
>  	if (online && SCX_HAS_OP(cpu_online))
>  		SCX_CALL_OP(SCX_KF_UNLOCKED, cpu_online, cpu);
> @@ -5228,6 +5235,16 @@ static int validate_ops(const struct sched_ext_ops *ops)
>  		return -EINVAL;
>  	}
>  
> +	/*
> +	 * SCX_OPS_BUILTIN_IDLE_PER_NODE requires the built-in CPU idle
> +	 * selection policy to be enabled.
> +	 */
> +	if ((ops->flags & SCX_OPS_BUILTIN_IDLE_PER_NODE) &&
> +	    (ops->update_idle && !(ops->flags & SCX_OPS_KEEP_BUILTIN_IDLE))) {
> +		scx_ops_error("SCX_OPS_BUILTIN_IDLE_PER_NODE requires CPU idle selection enabled");
> +		return -EINVAL;
> +	}
> +
>  	return 0;
>  }
>  
> @@ -5352,7 +5369,7 @@ static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link)
>  			static_branch_enable_cpuslocked(&scx_has_op[i]);
>  
>  	check_hotplug_seq(ops);
> -	scx_idle_update_selcpu_topology();
> +	scx_idle_update_selcpu_topology(ops);
>  
>  	cpus_read_unlock();
>  
> diff --git a/kernel/sched/ext_idle.c b/kernel/sched/ext_idle.c
> index ed1804506585b..0912f94b95cdc 100644
> --- a/kernel/sched/ext_idle.c
> +++ b/kernel/sched/ext_idle.c
> @@ -14,6 +14,9 @@
>  /* Enable/disable built-in idle CPU selection policy */
>  static DEFINE_STATIC_KEY_FALSE(scx_builtin_idle_enabled);
>  
> +/* Enable/disable per-node idle cpumasks */
> +static DEFINE_STATIC_KEY_FALSE(scx_builtin_idle_per_node);
> +
>  #ifdef CONFIG_SMP
>  #ifdef CONFIG_CPUMASK_OFFSTACK
>  #define CL_ALIGNED_IF_ONSTACK
> @@ -204,7 +207,7 @@ static bool llc_numa_mismatch(void)
>   * CPU belongs to a single LLC domain, and that each LLC domain is entirely
>   * contained within a single NUMA node.
>   */
> -void scx_idle_update_selcpu_topology(void)
> +void scx_idle_update_selcpu_topology(struct sched_ext_ops *ops)
>  {
>  	bool enable_llc = false, enable_numa = false;
>  	unsigned int nr_cpus;
> @@ -237,13 +240,19 @@ void scx_idle_update_selcpu_topology(void)
>  	 * If all CPUs belong to the same NUMA node and the same LLC domain,
>  	 * enabling both NUMA and LLC optimizations is unnecessary, as checking
>  	 * for an idle CPU in the same domain twice is redundant.
> +	 *
> +	 * If SCX_OPS_BUILTIN_IDLE_PER_NODE is enabled, ignore the NUMA
> +	 * optimization, as we would naturally select idle CPUs within
> +	 * specific NUMA nodes by querying the corresponding per-node cpumask.
>  	 */
> -	nr_cpus = numa_weight(cpu);
> -	if (nr_cpus > 0) {
> -		if (nr_cpus < num_online_cpus() && llc_numa_mismatch())
> -			enable_numa = true;
> -		pr_debug("sched_ext: NUMA=%*pb weight=%u\n",
> -			 cpumask_pr_args(numa_span(cpu)), numa_weight(cpu));
> +	if (!(ops->flags & SCX_OPS_BUILTIN_IDLE_PER_NODE)) {
> +		nr_cpus = numa_weight(cpu);
> +		if (nr_cpus > 0) {
> +			if (nr_cpus < num_online_cpus() && llc_numa_mismatch())
> +				enable_numa = true;
> +			pr_debug("sched_ext: NUMA=%*pb weight=%u\n",
> +				 cpumask_pr_args(numa_span(cpu)), nr_cpus);
> +		}
>  	}
>  	rcu_read_unlock();
>  
> @@ -530,6 +539,11 @@ void scx_idle_enable(struct sched_ext_ops *ops)
>  	}
>  	static_branch_enable(&scx_builtin_idle_enabled);
>  
> +	if (ops->flags & SCX_OPS_BUILTIN_IDLE_PER_NODE)
> +		static_branch_enable(&scx_builtin_idle_per_node);
> +	else
> +		static_branch_disable(&scx_builtin_idle_per_node);
> +
>  #ifdef CONFIG_SMP
>  	/*
>  	 * Consider all online cpus idle. Should converge to the actual state
> @@ -543,6 +557,7 @@ void scx_idle_enable(struct sched_ext_ops *ops)
>  void scx_idle_disable(void)
>  {
>  	static_branch_disable(&scx_builtin_idle_enabled);
> +	static_branch_disable(&scx_builtin_idle_per_node);
>  }
>  
>  /********************************************************************************
> diff --git a/kernel/sched/ext_idle.h b/kernel/sched/ext_idle.h
> index bbac0fd9a5ddd..339b6ec9c4cb7 100644
> --- a/kernel/sched/ext_idle.h
> +++ b/kernel/sched/ext_idle.h
> @@ -13,12 +13,12 @@
>  struct sched_ext_ops;
>  
>  #ifdef CONFIG_SMP
> -void scx_idle_update_selcpu_topology(void);
> +void scx_idle_update_selcpu_topology(struct sched_ext_ops *ops);
>  void scx_idle_init_masks(void);
>  bool scx_idle_test_and_clear_cpu(int cpu);
>  s32 scx_pick_idle_cpu(const struct cpumask *cpus_allowed, u64 flags);
>  #else /* !CONFIG_SMP */
> -static inline void scx_idle_update_selcpu_topology(void) {}
> +static inline void scx_idle_update_selcpu_topology(struct sched_ext_ops *ops) {}
>  static inline void scx_idle_init_masks(void) {}
>  static inline bool scx_idle_test_and_clear_cpu(int cpu) { return false; }
>  static inline s32 scx_pick_idle_cpu(const struct cpumask *cpus_allowed, u64 flags)
> diff --git a/tools/sched_ext/include/scx/compat.h b/tools/sched_ext/include/scx/compat.h
> index b50280e2ba2ba..d63cf40be8eee 100644
> --- a/tools/sched_ext/include/scx/compat.h
> +++ b/tools/sched_ext/include/scx/compat.h
> @@ -109,6 +109,9 @@ static inline bool __COMPAT_struct_has_field(const char *type, const char *field
>  #define SCX_OPS_SWITCH_PARTIAL							\
>  	__COMPAT_ENUM_OR_ZERO("scx_ops_flags", "SCX_OPS_SWITCH_PARTIAL")
>  
> +#define SCX_OPS_BUILTIN_IDLE_PER_NODE						\
> +	__COMPAT_ENUM_OR_ZERO("scx_ops_flags", "SCX_OPS_BUILTIN_IDLE_PER_NODE")
> +
>  static inline long scx_hotplug_seq(void)
>  {
>  	int fd;
> -- 
> 2.48.1

next prev parent reply	other threads:[~2025-02-14 21:18 UTC|newest]

Thread overview: 23+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2025-02-14 19:39 [PATCHSET v12 sched_ext/for-6.15] sched_ext: split global idle cpumask into per-NUMA cpumasks Andrea Righi
2025-02-14 19:40 ` [PATCH 1/8] nodemask: add nodes_copy() Andrea Righi
2025-02-14 19:40 ` [PATCH 2/8] nodemask: numa: reorganize inclusion path Andrea Righi
2025-02-14 19:40 ` [PATCH 3/8] mm/numa: Introduce nearest_node_nodemask() Andrea Righi
2025-02-14 21:14   ` Yury Norov
2025-02-14 19:40 ` [PATCH 4/8] sched/topology: Introduce for_each_node_numadist() iterator Andrea Righi
2025-02-14 21:16   ` Yury Norov
2025-02-14 21:29     ` Tejun Heo
2025-02-14 21:30       ` Yury Norov
2025-02-16 16:12         ` Tejun Heo
2025-02-14 19:40 ` [PATCH 5/8] sched_ext: idle: Make idle static keys private Andrea Righi
2025-02-14 19:40 ` [PATCH 6/8] sched_ext: idle: Introduce SCX_OPS_BUILTIN_IDLE_PER_NODE Andrea Righi
2025-02-14 21:18   ` Yury Norov [this message]
2025-02-14 19:40 ` [PATCH 7/8] sched_ext: idle: Per-node idle cpumasks Andrea Righi
2025-02-14 21:21   ` Yury Norov
2025-02-14 19:40 ` [PATCH 8/8] sched_ext: idle: Introduce node-aware idle cpu kfunc helpers Andrea Righi
2025-02-14 21:28   ` Yury Norov
2025-02-17 13:41     ` Andrea Righi
2025-02-17 17:24       ` Yury Norov
2025-02-17 17:27         ` Andrea Righi
2025-02-16 16:57   ` Tejun Heo
2025-02-16 19:54     ` Andrea Righi
2025-02-16 16:54 ` [PATCHSET v12 sched_ext/for-6.15] sched_ext: split global idle cpumask into per-NUMA cpumasks Tejun Heo

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=Z6-zH3Gh87KC0ykb@thinkpad \
    --to=yury.norov@gmail.com \
    --cc=arighi@nvidia.com \
    --cc=bpf@vger.kernel.org \
    --cc=bsegall@google.com \
    --cc=changwoo@igalia.com \
    --cc=dietmar.eggemann@arm.com \
    --cc=ianm@nvidia.com \
    --cc=joel@joelfernandes.org \
    --cc=juri.lelli@redhat.com \
    --cc=linux-kernel@vger.kernel.org \
    --cc=mgorman@suse.de \
    --cc=mingo@redhat.com \
    --cc=peterz@infradead.org \
    --cc=rostedt@goodmis.org \
    --cc=tj@kernel.org \
    --cc=vincent.guittot@linaro.org \
    --cc=void@manifault.com \
    --cc=vschneid@redhat.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

Be sure your reply has a Subject: header at the top and a blank line before the message body.

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.