From: Tejun Heo <tj@kernel.org>
To: torvalds@linux-foundation.org, mingo@redhat.com,
peterz@infradead.org, juri.lelli@redhat.com,
vincent.guittot@linaro.org, dietmar.eggemann@arm.com,
rostedt@goodmis.org, bsegall@google.com, mgorman@suse.de,
bristot@redhat.com, vschneid@redhat.com, ast@kernel.org,
daniel@iogearbox.net, andrii@kernel.org, martin.lau@kernel.org,
joshdon@google.com, brho@google.com, pjt@google.com,
derkling@google.com, haoluo@google.com, dvernet@meta.com,
dschatzberg@meta.com, dskarlat@cs.cmu.edu, riel@surriel.com,
changwoo@igalia.com, himadrics@inria.fr, memxor@gmail.com
Cc: linux-kernel@vger.kernel.org, bpf@vger.kernel.org,
kernel-team@meta.com, Tejun Heo <tj@kernel.org>
Subject: [PATCH 19/36] sched_ext: Implement scx_bpf_kick_cpu() and task preemption support
Date: Fri, 10 Nov 2023 16:47:45 -1000 [thread overview]
Message-ID: <20231111024835.2164816-20-tj@kernel.org> (raw)
In-Reply-To: <20231111024835.2164816-1-tj@kernel.org>
It's often useful to wake up and/or trigger reschedule on other CPUs. This
patch adds scx_bpf_kick_cpu() kfunc helper that BPF scheduler can call to
kick the target CPU into the scheduling path.
As a sched_ext task relinquishes its CPU only after its slice is depleted,
this patch also adds SCX_KICK_PREEMPT and SCX_ENQ_PREEMPT which clears the
slice of the target CPU's current task to guarantee that sched_ext's
scheduling path runs on the CPU.
v4: * Move example scheduler to its own patch.
v3: * Make scx_example_central switch all tasks by default.
* Convert to BPF inline iterators.
v2: * Julia Lawall reported that scx_example_central can overflow the
dispatch buffer and malfunction. As scheduling for other CPUs can't be
handled by the automatic retry mechanism, fix by implementing an
explicit overflow and retry handling.
* Updated to use generic BPF cpumask helpers.
Signed-off-by: Tejun Heo <tj@kernel.org>
Reviewed-by: David Vernet <dvernet@meta.com>
Acked-by: Josh Don <joshdon@google.com>
Acked-by: Hao Luo <haoluo@google.com>
Acked-by: Barret Rhoden <brho@google.com>
---
include/linux/sched/ext.h | 4 ++
kernel/sched/ext.c | 81 ++++++++++++++++++++++++++++++--
kernel/sched/ext.h | 12 +++++
kernel/sched/sched.h | 3 ++
tools/sched_ext/scx_common.bpf.h | 1 +
5 files changed, 98 insertions(+), 3 deletions(-)
diff --git a/include/linux/sched/ext.h b/include/linux/sched/ext.h
index 55f649bd065c..d6ebfa6163a1 100644
--- a/include/linux/sched/ext.h
+++ b/include/linux/sched/ext.h
@@ -408,6 +408,10 @@ struct sched_ext_entity {
* scx_bpf_dispatch() but can also be modified directly by the BPF
* scheduler. Automatically decreased by SCX as the task executes. On
* depletion, a scheduling event is triggered.
+ *
+ * This value is cleared to zero if the task is preempted by
+ * %SCX_KICK_PREEMPT and shouldn't be used to determine how long the
+ * task ran. Use p->se.sum_exec_runtime instead.
*/
u64 slice;
diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index 65ee99ea111b..c18a67791bc7 100644
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -507,7 +507,7 @@ static void dispatch_enqueue(struct scx_dispatch_q *dsq, struct task_struct *p,
}
}
- if (enq_flags & SCX_ENQ_HEAD)
+ if (enq_flags & (SCX_ENQ_HEAD | SCX_ENQ_PREEMPT))
list_add(&p->scx.dsq_node, &dsq->fifo);
else
list_add_tail(&p->scx.dsq_node, &dsq->fifo);
@@ -523,8 +523,16 @@ static void dispatch_enqueue(struct scx_dispatch_q *dsq, struct task_struct *p,
if (is_local) {
struct rq *rq = container_of(dsq, struct rq, scx.local_dsq);
+ bool preempt = false;
- if (sched_class_above(&ext_sched_class, rq->curr->sched_class))
+ if ((enq_flags & SCX_ENQ_PREEMPT) && p != rq->curr &&
+ rq->curr->sched_class == &ext_sched_class) {
+ rq->curr->scx.slice = 0;
+ preempt = true;
+ }
+
+ if (preempt || sched_class_above(&ext_sched_class,
+ rq->curr->sched_class))
resched_curr(rq);
} else {
raw_spin_unlock(&dsq->lock);
@@ -1941,7 +1949,8 @@ int scx_check_setscheduler(struct task_struct *p, int policy)
* Omitted operations:
*
* - wakeup_preempt: NOOP as it isn't useful in the wakeup path because the task
- * isn't tied to the CPU at that point.
+ * isn't tied to the CPU at that point. Preemption is implemented by resetting
+ * the victim task's slice to 0 and triggering reschedule on the target CPU.
*
* - migrate_task_rq: Unncessary as task to cpu mapping is transient.
*
@@ -2787,6 +2796,32 @@ static const struct sysrq_key_op sysrq_sched_ext_reset_op = {
.enable_mask = SYSRQ_ENABLE_RTNICE,
};
+static void kick_cpus_irq_workfn(struct irq_work *irq_work)
+{
+ struct rq *this_rq = this_rq();
+ int this_cpu = cpu_of(this_rq);
+ int cpu;
+
+ for_each_cpu(cpu, this_rq->scx.cpus_to_kick) {
+ struct rq *rq = cpu_rq(cpu);
+ unsigned long flags;
+
+ raw_spin_rq_lock_irqsave(rq, flags);
+
+ if (cpu_online(cpu) || cpu == this_cpu) {
+ if (cpumask_test_cpu(cpu, this_rq->scx.cpus_to_preempt) &&
+ rq->curr->sched_class == &ext_sched_class)
+ rq->curr->scx.slice = 0;
+ resched_curr(rq);
+ }
+
+ raw_spin_rq_unlock_irqrestore(rq, flags);
+ }
+
+ cpumask_clear(this_rq->scx.cpus_to_kick);
+ cpumask_clear(this_rq->scx.cpus_to_preempt);
+}
+
/**
* print_scx_info - print out sched_ext scheduler state
* @log_lvl: the log level to use when printing
@@ -2855,6 +2890,10 @@ void __init init_sched_ext_class(void)
init_dsq(&rq->scx.local_dsq, SCX_DSQ_LOCAL);
INIT_LIST_HEAD(&rq->scx.watchdog_list);
+
+ BUG_ON(!zalloc_cpumask_var(&rq->scx.cpus_to_kick, GFP_KERNEL));
+ BUG_ON(!zalloc_cpumask_var(&rq->scx.cpus_to_preempt, GFP_KERNEL));
+ init_irq_work(&rq->scx.kick_cpus_irq_work, kick_cpus_irq_workfn);
}
register_sysrq_key('S', &sysrq_sched_ext_reset_op);
@@ -3089,6 +3128,41 @@ static const struct btf_kfunc_id_set scx_kfunc_set_dispatch = {
.set = &scx_kfunc_ids_dispatch,
};
+/**
+ * scx_bpf_kick_cpu - Trigger reschedule on a CPU
+ * @cpu: cpu to kick
+ * @flags: %SCX_KICK_* flags
+ *
+ * Kick @cpu into rescheduling. This can be used to wake up an idle CPU or
+ * trigger rescheduling on a busy CPU. This can be called from any online
+ * scx_ops operation and the actual kicking is performed asynchronously through
+ * an irq work.
+ */
+void scx_bpf_kick_cpu(s32 cpu, u64 flags)
+{
+ struct rq *rq;
+
+ if (!ops_cpu_valid(cpu)) {
+ scx_ops_error("invalid cpu %d", cpu);
+ return;
+ }
+
+ preempt_disable();
+ rq = this_rq();
+
+ /*
+ * Actual kicking is bounced to kick_cpus_irq_workfn() to avoid nesting
+ * rq locks. We can probably be smarter and avoid bouncing if called
+ * from ops which don't hold a rq lock.
+ */
+ cpumask_set_cpu(cpu, rq->scx.cpus_to_kick);
+ if (flags & SCX_KICK_PREEMPT)
+ cpumask_set_cpu(cpu, rq->scx.cpus_to_preempt);
+
+ irq_work_queue(&rq->scx.kick_cpus_irq_work);
+ preempt_enable();
+}
+
/**
* scx_bpf_dsq_nr_queued - Return the number of queued tasks
* @dsq_id: id of the DSQ
@@ -3353,6 +3427,7 @@ s32 scx_bpf_task_cpu(const struct task_struct *p)
}
BTF_SET8_START(scx_kfunc_ids_ops_only)
+BTF_ID_FLAGS(func, scx_bpf_kick_cpu)
BTF_ID_FLAGS(func, scx_bpf_dsq_nr_queued)
BTF_ID_FLAGS(func, scx_bpf_test_and_clear_cpu_idle)
BTF_ID_FLAGS(func, scx_bpf_pick_idle_cpu, KF_RCU)
diff --git a/kernel/sched/ext.h b/kernel/sched/ext.h
index 1cdef69a6855..d246e5c2d3c7 100644
--- a/kernel/sched/ext.h
+++ b/kernel/sched/ext.h
@@ -19,6 +19,14 @@ enum scx_enq_flags {
/* high 32bits are SCX specific */
+ /*
+ * Set the following to trigger preemption when calling
+ * scx_bpf_dispatch() with a local dsq as the target. The slice of the
+ * current task is cleared to zero and the CPU is kicked into the
+ * scheduling path. Implies %SCX_ENQ_HEAD.
+ */
+ SCX_ENQ_PREEMPT = 1LLU << 32,
+
/*
* The task being enqueued is the only task available for the cpu. By
* default, ext core keeps executing such tasks but when
@@ -55,6 +63,10 @@ enum scx_pick_idle_cpu_flags {
SCX_PICK_IDLE_CORE = 1LLU << 0, /* pick a CPU whose SMT siblings are also idle */
};
+enum scx_kick_flags {
+ SCX_KICK_PREEMPT = 1LLU << 0, /* force scheduling on the CPU */
+};
+
#ifdef CONFIG_SCHED_CLASS_EXT
struct sched_enq_and_set_ctx {
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index aec09e99cdb0..fc8e23f94e0a 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -679,6 +679,9 @@ struct scx_rq {
unsigned long ops_qseq;
u64 extra_enq_flags; /* see move_task_to_local_dsq() */
u32 nr_running;
+ cpumask_var_t cpus_to_kick;
+ cpumask_var_t cpus_to_preempt;
+ struct irq_work kick_cpus_irq_work;
};
#endif /* CONFIG_SCHED_CLASS_EXT */
diff --git a/tools/sched_ext/scx_common.bpf.h b/tools/sched_ext/scx_common.bpf.h
index 81e484defd9b..590c84ac602d 100644
--- a/tools/sched_ext/scx_common.bpf.h
+++ b/tools/sched_ext/scx_common.bpf.h
@@ -58,6 +58,7 @@ s32 scx_bpf_create_dsq(u64 dsq_id, s32 node) __ksym;
bool scx_bpf_consume(u64 dsq_id) __ksym;
u32 scx_bpf_dispatch_nr_slots(void) __ksym;
void scx_bpf_dispatch(struct task_struct *p, u64 dsq_id, u64 slice, u64 enq_flags) __ksym;
+void scx_bpf_kick_cpu(s32 cpu, u64 flags) __ksym;
s32 scx_bpf_dsq_nr_queued(u64 dsq_id) __ksym;
bool scx_bpf_test_and_clear_cpu_idle(s32 cpu) __ksym;
s32 scx_bpf_pick_idle_cpu(const cpumask_t *cpus_allowed, u64 flags) __ksym;
--
2.42.0
next prev parent reply other threads:[~2023-11-11 2:51 UTC|newest]
Thread overview: 54+ messages / expand[flat|nested] mbox.gz Atom feed top
2023-11-11 2:47 [PATCHSET v5] sched: Implement BPF extensible scheduler class Tejun Heo
2023-11-11 2:47 ` [PATCH 01/36] cgroup: Implement cgroup_show_cftypes() Tejun Heo
2023-11-11 2:47 ` [PATCH 02/36] sched: Restructure sched_class order sanity checks in sched_init() Tejun Heo
2023-11-11 2:47 ` [PATCH 03/36] sched: Allow sched_cgroup_fork() to fail and introduce sched_cancel_fork() Tejun Heo
2023-11-11 2:47 ` [PATCH 04/36] sched: Add sched_class->reweight_task() Tejun Heo
2023-11-11 2:47 ` [PATCH 05/36] sched: Add sched_class->switching_to() and expose check_class_changing/changed() Tejun Heo
2023-11-11 2:47 ` [PATCH 06/36] sched: Factor out cgroup weight conversion functions Tejun Heo
2023-11-11 2:47 ` [PATCH 07/36] sched: Expose css_tg() and __setscheduler_prio() Tejun Heo
2023-11-11 2:47 ` [PATCH 08/36] sched: Enumerate CPU cgroup file types Tejun Heo
2023-11-11 2:47 ` [PATCH 09/36] sched: Add @reason to sched_class->rq_{on|off}line() Tejun Heo
2023-11-11 2:47 ` [PATCH 10/36] sched: Add normal_policy() Tejun Heo
2023-11-11 2:47 ` [PATCH 11/36] sched_ext: Add boilerplate for extensible scheduler class Tejun Heo
2023-11-11 2:47 ` [PATCH 12/36] sched_ext: Implement BPF " Tejun Heo
2023-11-13 13:34 ` Changwoo Min
2023-11-14 19:07 ` Tejun Heo
2023-11-13 20:04 ` Andrea Righi
2023-11-14 19:07 ` Tejun Heo
2023-11-23 8:07 ` Andrea Righi
2023-11-25 19:59 ` Tejun Heo
2023-11-26 9:05 ` Andrea Righi
2023-12-07 2:04 ` [PATCH] scx: set p->scx.ops_state using atomic_long_set_release Changwoo Min
2023-12-08 0:16 ` Tejun Heo
2024-03-23 2:37 ` [PATCH 12/36] sched_ext: Implement BPF extensible scheduler class Joel Fernandes
2024-03-23 22:12 ` Tejun Heo
2024-04-25 21:28 ` Joel Fernandes
2024-04-26 16:57 ` Barret Rhoden
2024-04-26 21:58 ` Tejun Heo
2023-11-11 2:47 ` [PATCH 13/36] sched_ext: Add scx_simple and scx_example_qmap example schedulers Tejun Heo
2023-11-12 4:17 ` kernel test robot
2023-11-12 18:06 ` Tejun Heo
2023-11-11 2:47 ` [PATCH 14/36] sched_ext: Add sysrq-S which disables the BPF scheduler Tejun Heo
2023-11-11 2:47 ` [PATCH 15/36] sched_ext: Implement runnable task stall watchdog Tejun Heo
2023-11-11 2:47 ` [PATCH 16/36] sched_ext: Allow BPF schedulers to disallow specific tasks from joining SCHED_EXT Tejun Heo
2023-11-11 2:47 ` [PATCH 17/36] sched_ext: Allow BPF schedulers to switch all eligible tasks into sched_ext Tejun Heo
2023-11-11 2:47 ` [PATCH 18/36] sched_ext: Print sched_ext info when dumping stack Tejun Heo
2023-11-14 19:23 ` [PATCH v2 " Tejun Heo
2023-11-11 2:47 ` Tejun Heo [this message]
2023-11-11 2:47 ` [PATCH 20/36] sched_ext: Add a central scheduler which makes all scheduling decisions on one CPU Tejun Heo
2023-11-11 2:47 ` [PATCH 21/36] sched_ext: Make watchdog handle ops.dispatch() looping stall Tejun Heo
2023-11-11 2:47 ` [PATCH 22/36] sched_ext: Add task state tracking operations Tejun Heo
2023-11-11 2:47 ` [PATCH 23/36] sched_ext: Implement tickless support Tejun Heo
2023-11-11 2:47 ` [PATCH 24/36] sched_ext: Track tasks that are subjects of the in-flight SCX operation Tejun Heo
2023-11-11 2:47 ` [PATCH 25/36] sched_ext: Add cgroup support Tejun Heo
2023-11-11 2:47 ` [PATCH 26/36] sched_ext: Add a cgroup-based core-scheduling scheduler Tejun Heo
2023-11-11 2:47 ` [PATCH 27/36] sched_ext: Add a cgroup scheduler which uses flattened hierarchy Tejun Heo
2023-11-11 2:47 ` [PATCH 28/36] sched_ext: Implement SCX_KICK_WAIT Tejun Heo
2023-11-11 2:47 ` [PATCH 29/36] sched_ext: Implement sched_ext_ops.cpu_acquire/release() Tejun Heo
2023-11-11 2:47 ` [PATCH 30/36] sched_ext: Implement sched_ext_ops.cpu_online/offline() Tejun Heo
2023-11-11 2:47 ` [PATCH 31/36] sched_ext: Implement core-sched support Tejun Heo
2023-11-11 2:47 ` [PATCH 32/36] sched_ext: Add vtime-ordered priority queue to dispatch_q's Tejun Heo
2023-11-11 2:47 ` [PATCH 33/36] sched_ext: Documentation: scheduler: Document extensible scheduler class Tejun Heo
2023-11-11 2:48 ` [PATCH 34/36] sched_ext: Add a basic, userland vruntime scheduler Tejun Heo
2023-11-11 2:48 ` [PATCH 35/36] sched_ext: Add scx_rusty, a rust userspace hybrid scheduler Tejun Heo
2023-11-11 2:48 ` [PATCH 36/36] sched_ext: Add scx_layered, a highly configurable multi-layer scheduler Tejun Heo
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20231111024835.2164816-20-tj@kernel.org \
--to=tj@kernel.org \
--cc=andrii@kernel.org \
--cc=ast@kernel.org \
--cc=bpf@vger.kernel.org \
--cc=brho@google.com \
--cc=bristot@redhat.com \
--cc=bsegall@google.com \
--cc=changwoo@igalia.com \
--cc=daniel@iogearbox.net \
--cc=derkling@google.com \
--cc=dietmar.eggemann@arm.com \
--cc=dschatzberg@meta.com \
--cc=dskarlat@cs.cmu.edu \
--cc=dvernet@meta.com \
--cc=haoluo@google.com \
--cc=himadrics@inria.fr \
--cc=joshdon@google.com \
--cc=juri.lelli@redhat.com \
--cc=kernel-team@meta.com \
--cc=linux-kernel@vger.kernel.org \
--cc=martin.lau@kernel.org \
--cc=memxor@gmail.com \
--cc=mgorman@suse.de \
--cc=mingo@redhat.com \
--cc=peterz@infradead.org \
--cc=pjt@google.com \
--cc=riel@surriel.com \
--cc=rostedt@goodmis.org \
--cc=torvalds@linux-foundation.org \
--cc=vincent.guittot@linaro.org \
--cc=vschneid@redhat.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.