All of lore.kernel.org
 help / color / mirror / Atom feed
From: Andrea Righi <arighi@nvidia.com>
To: Yury Norov <yury.norov@gmail.com>
Cc: Tejun Heo <tj@kernel.org>, David Vernet <void@manifault.com>,
	Changwoo Min <changwoo@igalia.com>,
	linux-kernel@vger.kernel.org
Subject: Re: [PATCH 1/4] sched_ext: Introduce per-NUMA idle cpumasks
Date: Tue, 10 Dec 2024 01:14:43 +0100	[thread overview]
Message-ID: <Z1eH8_rP16IyJ8LI@gpd3> (raw)
In-Reply-To: <Z1dF6HuEI2nyUD2V@yury-ThinkPad>

On Mon, Dec 09, 2024 at 11:32:56AM -0800, Yury Norov wrote:
...
> > +static struct cpumask *get_idle_cpumask_node(int node)
> > +{
> > +	return idle_masks[node]->cpu;
> > +}
> > +
> > +static struct cpumask *get_idle_smtmask_node(int node)
> > +{
> > +	return idle_masks[node]->smt;
> > +}
> 
> This implies that NUMA_NO_NODE, which is -1, will never be passed.
> Is that indeed impossible?

In PATCH 4/4 I adds some checks to make sure "node" is valid and return
NULL otherwise, but thinking more about it, it seems better to return
cpu_none_mask.

> 
> > +
> > +static struct cpumask *get_curr_idle_cpumask(void)
> > +{
> > +	int node = cpu_to_node(smp_processor_id());
> > +
> > +	return get_idle_cpumask_node(node);
> > +}
> > +
> > +static struct cpumask *get_curr_idle_smtmask(void)
> > +{
> > +	int node = cpu_to_node(smp_processor_id());
> > +
> > +	if (sched_smt_active())
> > +		return get_idle_smtmask_node(node);
> > +	else
> > +		return get_idle_cpumask_node(node);
> > +}
> > +
> > +static void idle_masks_init(void)
> > +{
> > +	int node;
> > +
> > +	idle_masks = kcalloc(num_possible_nodes(), sizeof(*idle_masks), GFP_KERNEL);
> > +	BUG_ON(!idle_masks);
> > +
> > +	for_each_node_state(node, N_POSSIBLE) {
> 
> for_each_node(node)

Ok.

> 
> > +		idle_masks[node] = kzalloc_node(sizeof(**idle_masks), GFP_KERNEL, node);
> > +		BUG_ON(!idle_masks[node]);
> > +
> > +		BUG_ON(!alloc_cpumask_var_node(&idle_masks[node]->cpu, GFP_KERNEL, node));
> > +		BUG_ON(!alloc_cpumask_var_node(&idle_masks[node]->smt, GFP_KERNEL, node));
> > +	}
> > +}
> > +#else	/* !CONFIG_SMP */
> > +static struct cpumask *get_curr_idle_cpumask(void)
> > +{
> > +	return cpu_none_mask;
> > +}
> > +
> > +static struct cpumask *get_curr_idle_smtmask(void)
> > +{
> > +	return cpu_none_mask;
> > +}
> >  #endif	/* CONFIG_SMP */
> >  
> >  /* for %SCX_KICK_WAIT */
> > @@ -3166,6 +3218,9 @@ bool scx_prio_less(const struct task_struct *a, const struct task_struct *b,
> >  
> >  static bool test_and_clear_cpu_idle(int cpu)
> >  {
> > +	int node = cpu_to_node(cpu);
> > +	struct cpumask *idle_cpu = get_idle_cpumask_node(node);
> > +
> >  #ifdef CONFIG_SCHED_SMT
> >  	/*
> >  	 * SMT mask should be cleared whether we can claim @cpu or not. The SMT
> > @@ -3174,29 +3229,34 @@ static bool test_and_clear_cpu_idle(int cpu)
> >  	 */
> >  	if (sched_smt_active()) {
> >  		const struct cpumask *smt = cpu_smt_mask(cpu);
> > +		struct cpumask *idle_smt = get_idle_smtmask_node(node);
> >  
> >  		/*
> >  		 * If offline, @cpu is not its own sibling and
> >  		 * scx_pick_idle_cpu() can get caught in an infinite loop as
> > -		 * @cpu is never cleared from idle_masks.smt. Ensure that @cpu
> > -		 * is eventually cleared.
> > +		 * @cpu is never cleared from the idle SMT mask. Ensure that
> > +		 * @cpu is eventually cleared.
> > +		 *
> > +		 * NOTE: Use cpumask_intersects() and cpumask_test_cpu() to
> > +		 * reduce memory writes, which may help alleviate cache
> > +		 * coherence pressure.
> >  		 */
> > -		if (cpumask_intersects(smt, idle_masks.smt))
> > -			cpumask_andnot(idle_masks.smt, idle_masks.smt, smt);
> > -		else if (cpumask_test_cpu(cpu, idle_masks.smt))
> > -			__cpumask_clear_cpu(cpu, idle_masks.smt);
> > +		if (cpumask_intersects(smt, idle_smt))
> > +			cpumask_andnot(idle_smt, idle_smt, smt);
> > +		else if (cpumask_test_cpu(cpu, idle_smt))
> > +			__cpumask_clear_cpu(cpu, idle_smt);
> >  	}
> >  #endif
> > -	return cpumask_test_and_clear_cpu(cpu, idle_masks.cpu);
> > +	return cpumask_test_and_clear_cpu(cpu, idle_cpu);
> >  }
> >  
> > -static s32 scx_pick_idle_cpu(const struct cpumask *cpus_allowed, u64 flags)
> > +static s32 scx_pick_idle_cpu_from_node(int node, const struct cpumask *cpus_allowed, u64 flags)
> >  {
> >  	int cpu;
> >  
> >  retry:
> >  	if (sched_smt_active()) {
> > -		cpu = cpumask_any_and_distribute(idle_masks.smt, cpus_allowed);
> > +		cpu = cpumask_any_and_distribute(get_idle_smtmask_node(node), cpus_allowed);
> >  		if (cpu < nr_cpu_ids)
> >  			goto found;
> >  
> > @@ -3204,15 +3264,66 @@ static s32 scx_pick_idle_cpu(const struct cpumask *cpus_allowed, u64 flags)
> >  			return -EBUSY;
> >  	}
> >  
> > -	cpu = cpumask_any_and_distribute(idle_masks.cpu, cpus_allowed);
> > -	if (cpu >= nr_cpu_ids)
> > -		return -EBUSY;
> > +	cpu = cpumask_any_and_distribute(get_idle_cpumask_node(node), cpus_allowed);
> > +	if (cpu < nr_cpu_ids)
> > +		goto found;
> > +
> > +	return -EBUSY;
> 
> What for did you change the error handling logic? Can you keep the
> original?

I changed that to follow the same pattern as the one inside
sched_smt_active(), it should be equivalent, but I can keep the
original.

> 
> >  
> >  found:
> >  	if (test_and_clear_cpu_idle(cpu))
> >  		return cpu;
> >  	else
> >  		goto retry;
> > +
> > +}
> > +
> > +static s32 scx_pick_idle_cpu(const struct cpumask *cpus_allowed, s32 prev_cpu, u64 flags)
> > +{
> > +	const struct cpumask *node_mask;
> > +	s32 cpu;
> > +
> > +	/*
> > +	 * Only node 0 is used if per-node idle cpumasks are disabled.
> > +	 */
> > +	if (!static_branch_maybe(CONFIG_NUMA, &scx_builtin_idle_per_node))
> > +		return scx_pick_idle_cpu_from_node(0, cpus_allowed, flags);
> > +
> > +	/*
> > +	 * Traverse all nodes in order of increasing distance, starting from
> > +	 * prev_cpu's node.
> > +	 */
> > +	rcu_read_lock();
> > +	for_each_numa_hop_mask(node_mask, cpu_to_node(prev_cpu)) {
> 
> This 'node_mask' misleads. This is not a nodemask. This is a cpumask -
> all cpus in the hop. Can you name it 'hop_cpus', or similar?

Ack.

> 
> > +		/*
> > +		 * scx_pick_idle_cpu_from_node() can be expensive and redundant
> > +		 * if none of the CPUs in the NUMA node can be used (according
> > +		 * to cpus_allowed).
> > +		 *
> > +		 * Therefore, check if the NUMA node is usable in advance to
> > +		 * save some CPU cycles.
> > +		 */
> > +		if (!cpumask_intersects(node_mask, cpus_allowed))
> > +			continue;
> > +
> > +		/*
> > +		 * It would be nice to have a "node" iterator, instead of the
> > +		 * cpumask, to get rid of the cpumask_first() to determine the
> > +		 * node.
> > +		 */
> 
> I'm not aware about such case. And this one doesn't look like that because you
> filter against a cpumask (cpus_allowed). Right?

I'm interested to get the node id here to use it with
scx_pick_idle_cpu_from_node().

> 
> > +		cpu = cpumask_first(node_mask);
> > +		if (cpu >= nr_cpu_ids)
> > +			continue;
> 
> for_each_numa_hop_mask() doesn't traverse per-node CPUs. It traverses
> per-hop CPUs. The difference is that the hop may include more than one
> node if distances from a given node to those belonging the hop are the
> same.

Hm... I was missing this aspect. Thanks for clarifying this.

> 
> So, imagine we have nodes 1 and 2 in the same hop, but cpus_allowed
> has only cpus from node 2. With this configuration you pass the
> cpumask_intersects() check, but when picking a CPU, you ignore the
> cpus_allowed and pick node 1. This is wrong, I guess.

Yeah, that's not what I would like to do.

> 
> Instead, I would make a single check:
> 
>         cpu = cpumask_first_and(node_mask, cpus_allowed);
>         if (cpu >= nr_cpu_ids)
>                 continue;

Ok, but at this point I'm not sure that for_each_numa_hop_mask() is the
most efficient way to iterate NUMA nodes.

> 
> > +
> > +		cpu = scx_pick_idle_cpu_from_node(cpu_to_node(cpu), cpus_allowed, flags);
> > +		if (cpu >= 0)
> > +			goto out_unlock;
> 
> The code in scx_pick_idle_cpu_from_node() is written such that CPUs
> that your picks are well distributed across the nodes. Your code above
> always picks 1st node in the hop. I think here you should use 
> 
>         cpumask_any_and_distribute()
>  
> like the scx_pick_idle_cpu_from_node() does with idle_masks.

Right, that's because I was (incorrectly) assuming that each hop was a
single node.

> 
> Another problem is that high-level hops include CPUs from lower-level
> ones. It means that you will traverse the same lower-level CPUs again
> for nothing. So, you need to mask them out.
> 
> Another-another problem: if your hop has more than one node, you should
> not jump to the next hop until you tried every node from the current hop.
> This brings another loop, but doesn't increase complexity if you
> carefully mask-out all visited nodes.
> 
> > +	}
> > +	cpu = -EBUSY;
> 
> This hides the actual error returned from scx_pick_idle_cpu_from_node().

Right, scx_pick_idle_cpu_from_node() can only returns -EBUSY at the
moment, but it'd be nicer to pass the returned error.

> 
> > +
> > +out_unlock:
> > +	rcu_read_unlock();
> > +	return cpu;
> >  }
> 
> And altogether this should look like:
> 
>  int scx_pick_idle_cpu_from_hop(struct cpumask *hop_cpus, struct cpumask *cpus_allowed)
>  {
>          int node, cpu, random_cpu;
> 
>          do {
> 
>                  /* Pick a 'random' CPU in the hop */
>                  random_cpu = cpumask_any_and_distribute(hop_cpus, cpus_allowed);
>                  if (random_cpu >= nr_cpu_ids)
>                          continue;
> 
>                  node = cpu_to_node(random_cpu);
> 
>                  /* Find an idle CPU in the same node */
>                  cpu = scx_pick_idle_cpu_from_node(node, cpus_allowed, flags);
>                  if (cpu >= 0)
>                          break;
> 
>                  /* No luck? Try other nodes */
>          } while (cpumask_andnot(hop_cpus, hop_cpus, cpumask_of_node(node)));
> 
>          return cpu;
>  }
> 
>  static s32 scx_pick_idle_cpu(const struct cpumask *cpus_allowed, s32 prev_cpu, u64 flags)
>  {
>         const struct cpumask *next, *prev = cpu_none_mask;
>         int prev_node = cpu_to_node(prev_cpu);
>  ...
> 	for_each_numa_hop_mask(next, prev_node) {
>                 cpumask_andnot(hop_cpus, next, prev);
>                 cpu = scx_pick_idle_cpu_from_hop(hop_cpus, cpus_allowed);
>                 prev = next;
>         }
>  ...
>  }
> 
> Not tested, but should work.

Makes sense to me, I'll do some testing with this.

> > @@ -3636,17 +3748,27 @@ static void set_cpus_allowed_scx(struct task_struct *p,
> >  
> >  static void reset_idle_masks(void)
> >  {
> > +	int node;
> > +
> >  	/*
> >  	 * Consider all online cpus idle. Should converge to the actual state
> >  	 * quickly.
> >  	 */
> > -	cpumask_copy(idle_masks.cpu, cpu_online_mask);
> > -	cpumask_copy(idle_masks.smt, cpu_online_mask);
> > +	for_each_node_state(node, N_POSSIBLE) {
> 
> for_each_node()
> 

Ok.

And I think I haven't missed any comment this time, sorry about that and
thanks again for the review!

-Andrea

  parent reply	other threads:[~2024-12-10  0:14 UTC|newest]

Thread overview: 20+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2024-12-09 10:40 [PATCHSET v5 sched_ext/for-6.14] sched_ext: split global idle cpumask into per-NUMA cpumasks Andrea Righi
2024-12-09 10:40 ` [PATCH 1/4] sched_ext: Introduce per-NUMA idle cpumasks Andrea Righi
2024-12-09 19:32   ` Yury Norov
2024-12-09 20:40     ` Andrea Righi
2024-12-10  0:14     ` Andrea Righi [this message]
2024-12-10  2:10       ` Yury Norov
2024-12-14  6:05         ` Andrea Righi
2024-12-11 17:46   ` Yury Norov
2024-12-09 10:40 ` [PATCH 2/4] sched_ext: Get rid of the scx_selcpu_topo_numa logic Andrea Righi
2024-12-11  8:05   ` Changwoo Min
2024-12-11 12:22     ` Andrea Righi
2024-12-09 10:40 ` [PATCH 3/4] sched_ext: Introduce SCX_OPS_NODE_BUILTIN_IDLE Andrea Righi
2024-12-11 18:21   ` Yury Norov
2024-12-11 19:59     ` Andrea Righi
2024-12-09 10:40 ` [PATCH 4/4] sched_ext: Introduce NUMA aware idle cpu kfunc helpers Andrea Righi
2024-12-11 17:43   ` Yury Norov
2024-12-11 20:20     ` Andrea Righi
2024-12-11 20:47       ` Yury Norov
2024-12-11 20:55         ` Andrea Righi
  -- strict thread matches above, loose matches on Subject: below --
2024-12-05 21:00 [PATCHSET v4 sched_ext/for-6.14] sched_ext: split global idle cpumask into per-NUMA cpumasks Andrea Righi
2024-12-05 21:00 ` [PATCH 1/4] sched_ext: Introduce per-NUMA idle cpumasks Andrea Righi

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=Z1eH8_rP16IyJ8LI@gpd3 \
    --to=arighi@nvidia.com \
    --cc=changwoo@igalia.com \
    --cc=linux-kernel@vger.kernel.org \
    --cc=tj@kernel.org \
    --cc=void@manifault.com \
    --cc=yury.norov@gmail.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.