linux-kernel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [PATCH v2] include SCX_OPS_TRACK_MIGRATION
@ 2025-06-23  4:32 Henry Huang
  2025-06-23  4:32 ` [PATCH v2] sched_ext: " Henry Huang
  0 siblings, 1 reply; 3+ messages in thread
From: Henry Huang @ 2025-06-23  4:32 UTC (permalink / raw)
  To: changwoo, arighi, tj, void
  Cc: 谈鉴锋, Yan Yan(cailing), linux-kernel,
	sched-ext, Henry Huang

In our environment, we need to track task migrations in order to update
a per-CPU map.
Attaching fentry programs to enqueue_task_scx() and dequeue_task_scx()
is a feasible solution, but it has some limitations:
1. Can't modify p->scx.xxx
2. It only works as long as enqueue_task_scx() and dequeue_task_scx()
are not subject to certain compiler optimizations.
3. Has more overhead compared to struct_ops
So we introduce SCX_OPS_TRACK_MIGRATION to support tracking task
migrations.
If SCX_OPS_TRACK_MIGRATION is set, ops.runnable() and ops.quiescent()
are called regardless of whether the task is migrating.

Changes in v2:
1. If task_on_rq_migrating(p) is true:
   - set DEQUEUE_MIGRATING in deq_flags in dequeue_task_scx()
   - set ENQUEUE_MIGRATING in enq_flags in enqueue_task_scx()

Henry Huang (1):
  sched_ext: include SCX_OPS_TRACK_MIGRATION

 kernel/sched/ext.c | 24 ++++++++++++++++++++++--
 1 file changed, 22 insertions(+), 2 deletions(-)

-- 
Henry


^ permalink raw reply	[flat|nested] 3+ messages in thread

* [PATCH v2] sched_ext: include SCX_OPS_TRACK_MIGRATION
  2025-06-23  4:32 [PATCH v2] include SCX_OPS_TRACK_MIGRATION Henry Huang
@ 2025-06-23  4:32 ` Henry Huang
  2025-06-23  5:15   ` Andrea Righi
  0 siblings, 1 reply; 3+ messages in thread
From: Henry Huang @ 2025-06-23  4:32 UTC (permalink / raw)
  To: changwoo, arighi, tj, void
  Cc: 谈鉴锋, Yan Yan(cailing), linux-kernel,
	sched-ext, Henry Huang

For some BPF-schedulers, they should do something when
task is doing migration, such as updating per-cpu map.
If SCX_OPS_TRACK_MIGRATION is set, runnable/quiescent
would be called whether task is doing migration or not.

Signed-off-by: Henry Huang <henry.hj@antgroup.com>
---
 kernel/sched/ext.c | 24 ++++++++++++++++++++++--
 1 file changed, 22 insertions(+), 2 deletions(-)

diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index b498d86..42c5251 100644
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -161,6 +161,12 @@ enum scx_ops_flags {
 	SCX_OPS_BUILTIN_IDLE_PER_NODE	= 1LLU << 6,
 
 	/*
+	 * If set, runnable/quiescent ops would be called whether the task is
+	 * doing migration or not.
+	 */
+	SCX_OPS_TRACK_MIGRATION		= 1LLU << 7,
+
+	/*
 	 * CPU cgroup support flags
 	 */
 	SCX_OPS_HAS_CGROUP_WEIGHT	= 1LLU << 16,	/* DEPRECATED, will be removed on 6.18 */
@@ -172,6 +178,7 @@ enum scx_ops_flags {
 					  SCX_OPS_ALLOW_QUEUED_WAKEUP |
 					  SCX_OPS_SWITCH_PARTIAL |
 					  SCX_OPS_BUILTIN_IDLE_PER_NODE |
+					  SCX_OPS_TRACK_MIGRATION |
 					  SCX_OPS_HAS_CGROUP_WEIGHT,
 
 	/* high 8 bits are internal, don't include in SCX_OPS_ALL_FLAGS */
@@ -2390,7 +2397,11 @@ static void enqueue_task_scx(struct rq *rq, struct task_struct *p, int enq_flags
 	rq->scx.nr_running++;
 	add_nr_running(rq, 1);
 
-	if (SCX_HAS_OP(sch, runnable) && !task_on_rq_migrating(p))
+	if (task_on_rq_migrating(p))
+		enq_flags |= ENQUEUE_MIGRATING;
+
+	if (SCX_HAS_OP(sch, runnable) &&
+	    ((sch->ops.flags & SCX_OPS_TRACK_MIGRATION) || !(enq_flags & ENQUEUE_MIGRATING)))
 		SCX_CALL_OP_TASK(sch, SCX_KF_REST, runnable, rq, p, enq_flags);
 
 	if (enq_flags & SCX_ENQ_WAKEUP)
@@ -2463,6 +2474,9 @@ static bool dequeue_task_scx(struct rq *rq, struct task_struct *p, int deq_flags
 		return true;
 	}
 
+	if (task_on_rq_migrating(p))
+		deq_flags |= DEQUEUE_MIGRATING;
+
 	ops_dequeue(rq, p, deq_flags);
 
 	/*
@@ -2482,7 +2496,8 @@ static bool dequeue_task_scx(struct rq *rq, struct task_struct *p, int deq_flags
 		SCX_CALL_OP_TASK(sch, SCX_KF_REST, stopping, rq, p, false);
 	}
 
-	if (SCX_HAS_OP(sch, quiescent) && !task_on_rq_migrating(p))
+	if (SCX_HAS_OP(sch, quiescent) &&
+	    ((sch->ops.flags & SCX_OPS_TRACK_MIGRATION) || !(deq_flags & DEQUEUE_MIGRATING)))
 		SCX_CALL_OP_TASK(sch, SCX_KF_REST, quiescent, rq, p, deq_flags);
 
 	if (deq_flags & SCX_DEQ_SLEEP)
@@ -5495,6 +5510,11 @@ static int validate_ops(struct scx_sched *sch, const struct sched_ext_ops *ops)
 		return -EINVAL;
 	}
 
+	if ((ops->flags & SCX_OPS_TRACK_MIGRATION) && (!ops->runnable || !ops->quiescent)) {
+		scx_error(sch, "SCX_OPS_TRACK_MIGRATION requires ops.runnable() and ops.quiescent() to be implemented");
+		return -EINVAL;
+	}
+
 	if (ops->flags & SCX_OPS_HAS_CGROUP_WEIGHT)
 		pr_warn("SCX_OPS_HAS_CGROUP_WEIGHT is deprecated and a noop\n");
 
-- 
Henry


^ permalink raw reply related	[flat|nested] 3+ messages in thread

* Re: [PATCH v2] sched_ext: include SCX_OPS_TRACK_MIGRATION
  2025-06-23  4:32 ` [PATCH v2] sched_ext: " Henry Huang
@ 2025-06-23  5:15   ` Andrea Righi
  0 siblings, 0 replies; 3+ messages in thread
From: Andrea Righi @ 2025-06-23  5:15 UTC (permalink / raw)
  To: Henry Huang
  Cc: changwoo, tj, void, 谈鉴锋, Yan Yan(cailing),
	linux-kernel, sched-ext

Hi Henry,

On Mon, Jun 23, 2025 at 12:32:20PM +0800, Henry Huang wrote:
> For some BPF-schedulers, they should do something when
> task is doing migration, such as updating per-cpu map.
> If SCX_OPS_TRACK_MIGRATION is set, runnable/quiescent
> would be called whether task is doing migration or not.
> 
> Signed-off-by: Henry Huang <henry.hj@antgroup.com>

Looks good. One last thing, since we're exposing ENQUEUE_MIGRATING and
DEQUEUE_MIGRATING to BPF, can you introduce some scx enums, similar to what
we're doing with DEQUEUE_SLEEP? Something like the following.

With that you can add my:

Reviewed-by: Andrea Righi <arighi@nvidia.com>

Thanks,
-Andrea

 kernel/sched/ext.c | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index 2f0ab232a2786..bf04cde71b34a 100644
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -918,8 +918,11 @@ enum scx_enq_flags {
 };
 
 enum scx_deq_flags {
+	/* expose select ENQUEUE_* flags as enums */
+	SCX_ENQ_MIGRATING	= ENQUEUE_MIGRATING,
 	/* expose select DEQUEUE_* flags as enums */
 	SCX_DEQ_SLEEP		= DEQUEUE_SLEEP,
+	SCX_DEQ_MIGRATING	= DEQUEUE_MIGRATING,
 
 	/* high 32bits are SCX specific */
 
@@ -2385,10 +2388,10 @@ static void enqueue_task_scx(struct rq *rq, struct task_struct *p, int enq_flags
 	add_nr_running(rq, 1);
 
 	if (task_on_rq_migrating(p))
-		enq_flags |= ENQUEUE_MIGRATING;
+		enq_flags |= SCX_ENQ_MIGRATING;
 
 	if (SCX_HAS_OP(sch, runnable) &&
-	    ((sch->ops.flags & SCX_OPS_TRACK_MIGRATION) || !(enq_flags & ENQUEUE_MIGRATING)))
+	    ((sch->ops.flags & SCX_OPS_TRACK_MIGRATION) || !(enq_flags & SCX_ENQ_MIGRATING)))
 		SCX_CALL_OP_TASK(sch, SCX_KF_REST, runnable, rq, p, enq_flags);
 
 	if (enq_flags & SCX_ENQ_WAKEUP)
@@ -2471,7 +2474,7 @@ static bool dequeue_task_scx(struct rq *rq, struct task_struct *p, int deq_flags
 	}
 
 	if (task_on_rq_migrating(p))
-		deq_flags |= DEQUEUE_MIGRATING;
+		deq_flags |= SCX_DEQ_MIGRATING;
 
 	ops_dequeue(rq, p, deq_flags);
 
@@ -2493,7 +2496,7 @@ static bool dequeue_task_scx(struct rq *rq, struct task_struct *p, int deq_flags
 	}
 
 	if (SCX_HAS_OP(sch, quiescent) &&
-	    ((sch->ops.flags & SCX_OPS_TRACK_MIGRATION) || !(deq_flags & DEQUEUE_MIGRATING)))
+	    ((sch->ops.flags & SCX_OPS_TRACK_MIGRATION) || !(deq_flags & SCX_DEQ_MIGRATING)))
 		SCX_CALL_OP_TASK(sch, SCX_KF_REST, quiescent, rq, p, deq_flags);
 
 	if (deq_flags & SCX_DEQ_SLEEP)

> ---
>  kernel/sched/ext.c | 24 ++++++++++++++++++++++--
>  1 file changed, 22 insertions(+), 2 deletions(-)
> 
> diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
> index b498d86..42c5251 100644
> --- a/kernel/sched/ext.c
> +++ b/kernel/sched/ext.c
> @@ -161,6 +161,12 @@ enum scx_ops_flags {
>  	SCX_OPS_BUILTIN_IDLE_PER_NODE	= 1LLU << 6,
>  
>  	/*
> +	 * If set, runnable/quiescent ops would be called whether the task is
> +	 * doing migration or not.
> +	 */
> +	SCX_OPS_TRACK_MIGRATION		= 1LLU << 7,
> +
> +	/*
>  	 * CPU cgroup support flags
>  	 */
>  	SCX_OPS_HAS_CGROUP_WEIGHT	= 1LLU << 16,	/* DEPRECATED, will be removed on 6.18 */
> @@ -172,6 +178,7 @@ enum scx_ops_flags {
>  					  SCX_OPS_ALLOW_QUEUED_WAKEUP |
>  					  SCX_OPS_SWITCH_PARTIAL |
>  					  SCX_OPS_BUILTIN_IDLE_PER_NODE |
> +					  SCX_OPS_TRACK_MIGRATION |
>  					  SCX_OPS_HAS_CGROUP_WEIGHT,
>  
>  	/* high 8 bits are internal, don't include in SCX_OPS_ALL_FLAGS */
> @@ -2390,7 +2397,11 @@ static void enqueue_task_scx(struct rq *rq, struct task_struct *p, int enq_flags
>  	rq->scx.nr_running++;
>  	add_nr_running(rq, 1);
>  
> -	if (SCX_HAS_OP(sch, runnable) && !task_on_rq_migrating(p))
> +	if (task_on_rq_migrating(p))
> +		enq_flags |= ENQUEUE_MIGRATING;
> +
> +	if (SCX_HAS_OP(sch, runnable) &&
> +	    ((sch->ops.flags & SCX_OPS_TRACK_MIGRATION) || !(enq_flags & ENQUEUE_MIGRATING)))
>  		SCX_CALL_OP_TASK(sch, SCX_KF_REST, runnable, rq, p, enq_flags);
>  
>  	if (enq_flags & SCX_ENQ_WAKEUP)
> @@ -2463,6 +2474,9 @@ static bool dequeue_task_scx(struct rq *rq, struct task_struct *p, int deq_flags
>  		return true;
>  	}
>  
> +	if (task_on_rq_migrating(p))
> +		deq_flags |= DEQUEUE_MIGRATING;
> +
>  	ops_dequeue(rq, p, deq_flags);
>  
>  	/*
> @@ -2482,7 +2496,8 @@ static bool dequeue_task_scx(struct rq *rq, struct task_struct *p, int deq_flags
>  		SCX_CALL_OP_TASK(sch, SCX_KF_REST, stopping, rq, p, false);
>  	}
>  
> -	if (SCX_HAS_OP(sch, quiescent) && !task_on_rq_migrating(p))
> +	if (SCX_HAS_OP(sch, quiescent) &&
> +	    ((sch->ops.flags & SCX_OPS_TRACK_MIGRATION) || !(deq_flags & DEQUEUE_MIGRATING)))
>  		SCX_CALL_OP_TASK(sch, SCX_KF_REST, quiescent, rq, p, deq_flags);
>  
>  	if (deq_flags & SCX_DEQ_SLEEP)
> @@ -5495,6 +5510,11 @@ static int validate_ops(struct scx_sched *sch, const struct sched_ext_ops *ops)
>  		return -EINVAL;
>  	}
>  
> +	if ((ops->flags & SCX_OPS_TRACK_MIGRATION) && (!ops->runnable || !ops->quiescent)) {
> +		scx_error(sch, "SCX_OPS_TRACK_MIGRATION requires ops.runnable() and ops.quiescent() to be implemented");
> +		return -EINVAL;
> +	}
> +
>  	if (ops->flags & SCX_OPS_HAS_CGROUP_WEIGHT)
>  		pr_warn("SCX_OPS_HAS_CGROUP_WEIGHT is deprecated and a noop\n");
>  
> -- 
> Henry
> 

^ permalink raw reply related	[flat|nested] 3+ messages in thread

end of thread, other threads:[~2025-06-23  5:15 UTC | newest]

Thread overview: 3+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2025-06-23  4:32 [PATCH v2] include SCX_OPS_TRACK_MIGRATION Henry Huang
2025-06-23  4:32 ` [PATCH v2] sched_ext: " Henry Huang
2025-06-23  5:15   ` Andrea Righi

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).