From: Yury Norov <yury.norov@gmail.com>
To: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Cc: Peter Zijlstra <peterz@infradead.org>,
linux-kernel@vger.kernel.org,
Thomas Gleixner <tglx@linutronix.de>,
"Paul E . McKenney" <paulmck@kernel.org>,
Boqun Feng <boqun.feng@gmail.com>,
"H . Peter Anvin" <hpa@zytor.com>, Paul Turner <pjt@google.com>,
linux-api@vger.kernel.org, Christian Brauner <brauner@kernel.org>,
Florian Weimer <fw@deneb.enyo.de>,
David.Laight@aculab.com, carlos@redhat.com,
Peter Oskolkov <posk@posk.io>,
Alexander Mikhalitsyn <alexander@mihalicyn.com>,
Chris Kennelly <ckennelly@google.com>
Subject: Re: [PATCH 24/30] sched: NUMA-aware per-memory-map concurrency ID
Date: Tue, 21 Nov 2023 09:43:30 -0800
Message-ID: <ZVzsQl2vakb9xSWe@yury-ThinkPad>
In-Reply-To: <20221122203932.231377-25-mathieu.desnoyers@efficios.com>
On Tue, Nov 22, 2022 at 03:39:26PM -0500, Mathieu Desnoyers wrote:
> Keep track of a NUMA-aware concurrency ID. On NUMA systems, when a
> NUMA-aware concurrency ID is observed by user-space to be associated
> with a NUMA node, it is guaranteed to never change NUMA node unless a
> kernel-level NUMA configuration change happens.
>
> Exposing a NUMA-aware concurrency ID is useful in situations where a
> process, or a set of processes belonging to a cpuset, is pinned to a set
> of cores which belong to a subset of the system's NUMA nodes. In those
> situations, it is possible to benefit from the compactness of
> concurrency IDs over CPU IDs, while keeping NUMA locality, for indexing
> a per-CPU data structure which takes NUMA locality into account.
>
> Signed-off-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
> ---
> include/linux/mm.h | 18 +++++
> include/linux/mm_types.h | 68 +++++++++++++++-
> include/linux/sched.h | 3 +
> kernel/fork.c | 3 +
> kernel/sched/core.c | 8 +-
> kernel/sched/sched.h | 168 +++++++++++++++++++++++++++++++++++----
> 6 files changed, 245 insertions(+), 23 deletions(-)
>
> diff --git a/include/linux/mm.h b/include/linux/mm.h
> index e0fba52de3e2..c7dfdf4c9d08 100644
> --- a/include/linux/mm.h
> +++ b/include/linux/mm.h
> @@ -3484,6 +3484,20 @@ static inline int task_mm_cid(struct task_struct *t)
> {
> return t->mm_cid;
> }
> +#ifdef CONFIG_NUMA
> +static inline int task_mm_numa_cid(struct task_struct *t)
> +{
> + if (num_possible_nodes() == 1)
> + return task_mm_cid(t);
> + else
> + return t->mm_numa_cid;
> +}
> +#else
> +static inline int task_mm_numa_cid(struct task_struct *t)
> +{
> + return task_mm_cid(t);
> +}
> +#endif
> #else
> static inline void sched_mm_cid_before_execve(struct task_struct *t) { }
> static inline void sched_mm_cid_after_execve(struct task_struct *t) { }
> @@ -3498,6 +3512,10 @@ static inline int task_mm_cid(struct task_struct *t)
> */
> return raw_smp_processor_id();
> }
> +static inline int task_mm_numa_cid(struct task_struct *t)
> +{
> + return task_mm_cid(t);
> +}
> #endif
>
> #endif /* _LINUX_MM_H */
> diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
> index dabb42d26bb9..8c9afe8ce603 100644
> --- a/include/linux/mm_types.h
> +++ b/include/linux/mm_types.h
> @@ -18,6 +18,7 @@
> #include <linux/page-flags-layout.h>
> #include <linux/workqueue.h>
> #include <linux/seqlock.h>
> +#include <linux/nodemask.h>
>
> #include <asm/mmu.h>
>
> @@ -847,15 +848,80 @@ static inline cpumask_t *mm_cidmask(struct mm_struct *mm)
> return (struct cpumask *)cid_bitmap;
> }
>
> +#ifdef CONFIG_NUMA
> +/*
> + * Layout of node cidmasks:
> + * - mm_numa cidmask: cpumask of the currently used mm_numa cids.
> + * - node_alloc cidmask: cpumask tracking which cid were
> + * allocated (across nodes) in this
> + * memory map.
> + * - node cidmask[nr_node_ids]: per-node cpumask tracking which cid
> + * were allocated in this memory map.
> + */
> +static inline cpumask_t *mm_numa_cidmask(struct mm_struct *mm)
> +{
> + unsigned long cid_bitmap = (unsigned long)mm_cidmask(mm);
> +
> + /* Skip mm_cidmask */
> + cid_bitmap += cpumask_size();
> + return (struct cpumask *)cid_bitmap;
> +}
> +
> +static inline cpumask_t *mm_node_alloc_cidmask(struct mm_struct *mm)
> +{
> + unsigned long cid_bitmap = (unsigned long)mm_numa_cidmask(mm);
> +
> + /* Skip mm_numa_cidmask */
> + cid_bitmap += cpumask_size();
> + return (struct cpumask *)cid_bitmap;
> +}
> +
> +static inline cpumask_t *mm_node_cidmask(struct mm_struct *mm, unsigned int node)
> +{
> + unsigned long cid_bitmap = (unsigned long)mm_node_alloc_cidmask(mm);
> +
> + /* Skip node alloc cidmask */
> + cid_bitmap += cpumask_size();
> + cid_bitmap += node * cpumask_size();
> + return (struct cpumask *)cid_bitmap;
> +}
> +
> +static inline void mm_init_node_cidmask(struct mm_struct *mm)
> +{
> + unsigned int node;
> +
> + if (num_possible_nodes() == 1)
> + return;
> + cpumask_clear(mm_numa_cidmask(mm));
> + cpumask_clear(mm_node_alloc_cidmask(mm));
> + for (node = 0; node < nr_node_ids; node++)
> + cpumask_clear(mm_node_cidmask(mm, node));
Can you use for_each_node() here? If so, I think it's worth doing.
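For reference, a minimal untested sketch of what I mean, assuming it's fine
to skip node ids that are not possible (those should never get cids
allocated anyway):

	static inline void mm_init_node_cidmask(struct mm_struct *mm)
	{
		int node;

		if (num_possible_nodes() == 1)
			return;

		cpumask_clear(mm_numa_cidmask(mm));
		cpumask_clear(mm_node_alloc_cidmask(mm));

		/* Clear the per-node cidmask of every possible node. */
		for_each_node(node)
			cpumask_clear(mm_node_cidmask(mm, node));
	}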
> +}
> +
> +static inline unsigned int mm_node_cidmask_size(void)
> +{
> + if (num_possible_nodes() == 1)
> + return 0;
> + return (nr_node_ids + 2) * cpumask_size();
> +}
> +#else /* CONFIG_NUMA */
> +static inline void mm_init_node_cidmask(struct mm_struct *mm) { }
> +static inline unsigned int mm_node_cidmask_size(void)
> +{
> + return 0;
> +}
> +#endif /* CONFIG_NUMA */
> +
> static inline void mm_init_cid(struct mm_struct *mm)
> {
> raw_spin_lock_init(&mm->cid_lock);
> cpumask_clear(mm_cidmask(mm));
> + mm_init_node_cidmask(mm);
> }
>
> static inline unsigned int mm_cid_size(void)
> {
> - return cpumask_size();
> + return cpumask_size() + mm_node_cidmask_size();
> }
> #else /* CONFIG_SCHED_MM_CID */
> static inline void mm_init_cid(struct mm_struct *mm) { }
> diff --git a/include/linux/sched.h b/include/linux/sched.h
> index c7e3c27e0e2e..990ef3d4b22b 100644
> --- a/include/linux/sched.h
> +++ b/include/linux/sched.h
> @@ -1317,6 +1317,9 @@ struct task_struct {
> #ifdef CONFIG_SCHED_MM_CID
> int mm_cid; /* Current cid in mm */
> int mm_cid_active; /* Whether cid bitmap is active */
> +#ifdef CONFIG_NUMA
> + int mm_numa_cid; /* Current numa_cid in mm */
> +#endif
> #endif
>
> struct tlbflush_unmap_batch tlb_ubc;
> diff --git a/kernel/fork.c b/kernel/fork.c
> index d48dedc4be75..364f4c62b1a4 100644
> --- a/kernel/fork.c
> +++ b/kernel/fork.c
> @@ -1050,6 +1050,9 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node)
> #ifdef CONFIG_SCHED_MM_CID
> tsk->mm_cid = -1;
> tsk->mm_cid_active = 0;
> +#ifdef CONFIG_NUMA
> + tsk->mm_numa_cid = -1;
> +#endif
> #endif
> return tsk;
>
> diff --git a/kernel/sched/core.c b/kernel/sched/core.c
> index ef0cc40cca6b..095b5eb35d3d 100644
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -11284,8 +11284,7 @@ void sched_mm_cid_exit_signals(struct task_struct *t)
> if (!mm)
> return;
> local_irq_save(flags);
> - mm_cid_put(mm, t->mm_cid);
> - t->mm_cid = -1;
> + mm_cid_put(mm, t);
> t->mm_cid_active = 0;
> local_irq_restore(flags);
> }
> @@ -11298,8 +11297,7 @@ void sched_mm_cid_before_execve(struct task_struct *t)
> if (!mm)
> return;
> local_irq_save(flags);
> - mm_cid_put(mm, t->mm_cid);
> - t->mm_cid = -1;
> + mm_cid_put(mm, t);
> t->mm_cid_active = 0;
> local_irq_restore(flags);
> }
> @@ -11312,7 +11310,7 @@ void sched_mm_cid_after_execve(struct task_struct *t)
> WARN_ON_ONCE((t->flags & PF_KTHREAD) || !t->mm);
>
> local_irq_save(flags);
> - t->mm_cid = mm_cid_get(mm);
> + mm_cid_get(mm, t);
> t->mm_cid_active = 1;
> local_irq_restore(flags);
> rseq_set_notify_resume(t);
> diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
> index 0096dc22926e..87f61f926e88 100644
> --- a/kernel/sched/sched.h
> +++ b/kernel/sched/sched.h
> @@ -3262,38 +3262,174 @@ static inline void update_current_exec_runtime(struct task_struct *curr,
> }
>
> #ifdef CONFIG_SCHED_MM_CID
> -static inline int __mm_cid_get(struct mm_struct *mm)
> +#ifdef CONFIG_NUMA
> +static inline void __mm_numa_cid_get(struct mm_struct *mm, struct task_struct *t)
> +{
> + struct cpumask *cpumask = mm_numa_cidmask(mm),
> + *node_cpumask = mm_node_cidmask(mm, numa_node_id()),
> + *node_alloc_cpumask = mm_node_alloc_cidmask(mm);
> + unsigned int node;
> + int cid;
> +
> + if (num_possible_nodes() == 1) {
> + cid = -1;
> + goto end;
> + }
> +
> + /*
> + * Try to reserve lowest available cid number within those already
> + * reserved for this NUMA node.
> + */
> + cid = cpumask_first_andnot(node_cpumask, cpumask);
> + if (cid >= nr_cpu_ids)
> + goto alloc_numa;
> + __cpumask_set_cpu(cid, cpumask);
> + goto end;
The 'end' label is just a 't->mm_numa_cid = cid;'. I think it would be less
confusing if you drop it:

	cid = cpumask_first_andnot(node_cpumask, cpumask);
	if (cid < nr_cpu_ids) {
		t->mm_numa_cid = cid;
		return;
	}

This would also let you remove the 'alloc_numa' label.
> +
> +alloc_numa:
> + /*
> + * Try to reserve lowest available cid number within those not already
> + * allocated for numa nodes.
> + */
> + cid = cpumask_first_notandnot(node_alloc_cpumask, cpumask);
> + if (cid >= nr_cpu_ids)
> + goto numa_update;
> + __cpumask_set_cpu(cid, cpumask);
> + __cpumask_set_cpu(cid, node_cpumask);
> + __cpumask_set_cpu(cid, node_alloc_cpumask);
> + goto end;
Same here: reverse the conditional and return from the if() branch.
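For example (untested), following the same pattern:

	cid = cpumask_first_notandnot(node_alloc_cpumask, cpumask);
	if (cid < nr_cpu_ids) {
		__cpumask_set_cpu(cid, cpumask);
		__cpumask_set_cpu(cid, node_cpumask);
		__cpumask_set_cpu(cid, node_alloc_cpumask);
		t->mm_numa_cid = cid;
		return;
	}

	/* Fall through to the node-reconfiguration path. */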
> +numa_update:
> + /*
> + * NUMA node id configuration changed for at least one CPU in the system.
> + * We need to steal a currently unused cid from an overprovisioned
> + * node for our current node. Userspace must handle the fact that the
> + * node id associated with this cid may change due to node ID
> + * reconfiguration.
> + *
> + * Count how many possible cpus are attached to each (other) node id,
> + * and compare this with the per-mm node cidmask cpu count. Find one
> + * which has too many cpus in its mask to steal from.
> + */
> + for (node = 0; node < nr_node_ids; node++) {
Again, for_each_node()?
> + struct cpumask *iter_cpumask;
> +
> + if (node == numa_node_id())
> + continue;
> + iter_cpumask = mm_node_cidmask(mm, node);
> + if (nr_cpus_node(node) < cpumask_weight(iter_cpumask)) {
> + /* Try to steal from this node. */
> + cid = cpumask_first_andnot(iter_cpumask, cpumask);
> + if (cid >= nr_cpu_ids)
> + goto steal_fail;
> + __cpumask_set_cpu(cid, cpumask);
> + __cpumask_clear_cpu(cid, iter_cpumask);
> + __cpumask_set_cpu(cid, node_cpumask);
> + goto end;
> + }
Same here: you can reverse the conditional and drop the gotos.
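Combined with for_each_node(), an untested sketch:

	for_each_node(node) {
		struct cpumask *iter_cpumask;

		if (node == numa_node_id())
			continue;

		iter_cpumask = mm_node_cidmask(mm, node);

		/* Only consider overprovisioned nodes. */
		if (nr_cpus_node(node) >= cpumask_weight(iter_cpumask))
			continue;

		/* Try to steal from this node. */
		cid = cpumask_first_andnot(iter_cpumask, cpumask);
		if (cid >= nr_cpu_ids)
			break;

		__cpumask_set_cpu(cid, cpumask);
		__cpumask_clear_cpu(cid, iter_cpumask);
		__cpumask_set_cpu(cid, node_cpumask);
		t->mm_numa_cid = cid;
		return;
	}

	/* Fall through to the steal_fail fallback. */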
> + }
> +
> +steal_fail:
> + /*
> + * Our attempt at gracefully stealing a cid from another
> + * overprovisioned NUMA node failed. Fallback to grabbing the first
> + * available cid.
> + */
> + cid = cpumask_first_zero(cpumask);
> + if (cid >= nr_cpu_ids) {
> + cid = -1;
> + goto end;
> + }
> + __cpumask_set_cpu(cid, cpumask);
> + /* Steal cid from its numa node mask. */
> + for (node = 0; node < nr_node_ids; node++) {
> + struct cpumask *iter_cpumask;
> +
> + if (node == numa_node_id())
> + continue;
> + iter_cpumask = mm_node_cidmask(mm, node);
> + if (cpumask_test_cpu(cid, iter_cpumask)) {
> + __cpumask_clear_cpu(cid, iter_cpumask);
> + break;
> + }
> + }
> + __cpumask_set_cpu(cid, node_cpumask);
> +end:
> + t->mm_numa_cid = cid;
> +}
> +
> +static inline void __mm_numa_cid_put(struct mm_struct *mm, struct task_struct *t)
> +{
> + int cid = t->mm_numa_cid;
> +
> + if (num_possible_nodes() == 1)
> + return;
> + if (cid < 0)
> + return;
> + __cpumask_clear_cpu(cid, mm_numa_cidmask(mm));
> + t->mm_numa_cid = -1;
> +}
> +
> +static inline void mm_numa_transfer_cid_prev_next(struct task_struct *prev, struct task_struct *next)
> +{
> + next->mm_numa_cid = prev->mm_numa_cid;
> + prev->mm_numa_cid = -1;
> +}
> +#else
> +static inline void __mm_numa_cid_get(struct mm_struct *mm, struct task_struct *t) { }
> +static inline void __mm_numa_cid_put(struct mm_struct *mm, struct task_struct *t) { }
> +static inline void mm_numa_transfer_cid_prev_next(struct task_struct *prev, struct task_struct *next) { }
> +#endif
> +
> +static inline void __mm_cid_get(struct mm_struct *mm, struct task_struct *t)
> {
> struct cpumask *cpumask;
> int cid;
>
> cpumask = mm_cidmask(mm);
> cid = cpumask_first_zero(cpumask);
> - if (cid >= nr_cpu_ids)
> - return -1;
> + if (cid >= nr_cpu_ids) {
> + cid = -1;
> + goto end;
> + }
> __cpumask_set_cpu(cid, cpumask);
> - return cid;
> +end:
> + t->mm_cid = cid;
> }
>
> -static inline void mm_cid_put(struct mm_struct *mm, int cid)
> +static inline void mm_cid_get(struct mm_struct *mm, struct task_struct *t)
> {
> lockdep_assert_irqs_disabled();
> - if (cid < 0)
> - return;
> raw_spin_lock(&mm->cid_lock);
> - __cpumask_clear_cpu(cid, mm_cidmask(mm));
> + __mm_cid_get(mm, t);
> + __mm_numa_cid_get(mm, t);
> raw_spin_unlock(&mm->cid_lock);
> }
>
> -static inline int mm_cid_get(struct mm_struct *mm)
> +static inline void __mm_cid_put(struct mm_struct *mm, struct task_struct *t)
> {
> - int ret;
> + int cid = t->mm_cid;
> +
> + if (cid < 0)
> + return;
> + __cpumask_clear_cpu(cid, mm_cidmask(mm));
> + t->mm_cid = -1;
> +}
>
> +static inline void mm_cid_put(struct mm_struct *mm, struct task_struct *t)
> +{
> lockdep_assert_irqs_disabled();
> raw_spin_lock(&mm->cid_lock);
> - ret = __mm_cid_get(mm);
> + __mm_cid_put(mm, t);
> + __mm_numa_cid_put(mm, t);
> raw_spin_unlock(&mm->cid_lock);
> - return ret;
> +}
> +
> +static inline void mm_transfer_cid_prev_next(struct task_struct *prev, struct task_struct *next)
> +{
> + next->mm_cid = prev->mm_cid;
> + prev->mm_cid = -1;
> + mm_numa_transfer_cid_prev_next(prev, next);
> }
>
> static inline void switch_mm_cid(struct task_struct *prev, struct task_struct *next)
> @@ -3304,15 +3440,13 @@ static inline void switch_mm_cid(struct task_struct *prev, struct task_struct *n
> * Context switch between threads in same mm, hand over
> * the mm_cid from prev to next.
> */
> - next->mm_cid = prev->mm_cid;
> - prev->mm_cid = -1;
> + mm_transfer_cid_prev_next(prev, next);
> return;
> }
> - mm_cid_put(prev->mm, prev->mm_cid);
> - prev->mm_cid = -1;
> + mm_cid_put(prev->mm, prev);
> }
> if (next->mm_cid_active)
> - next->mm_cid = mm_cid_get(next->mm);
> + mm_cid_get(next->mm, next);
> }
>
> #else
> --
> 2.25.1
>