* linux-next: manual merge of the sched-ext tree with the tip tree
@ 2026-03-09 18:49 Mark Brown
2026-03-09 20:02 ` Tejun Heo
2026-03-09 20:11 ` [PATCH sched_ext/for-7.1] sched_ext: Replace system_unbound_wq with system_dfl_wq in scx_kobj_release() Tejun Heo
0 siblings, 2 replies; 3+ messages in thread
From: Mark Brown @ 2026-03-09 18:49 UTC (permalink / raw)
To: Tejun Heo
Cc: Ingo Molnar, Linux Kernel Mailing List, Linux Next Mailing List,
Marco Crivellari, Peter Zijlstra
[-- Attachment #1: Type: text/plain, Size: 9292 bytes --]
Hi all,
Today's linux-next merge of the sched-ext tree got a conflict in:
kernel/sched/ext.c
between commits:
c2a57380df9dd ("sched: Replace use of system_unbound_wq with system_dfl_wq")
from the tip tree and commit:
cde94c032b32b ("sched_ext: Make watchdog sub-sched aware")
from the sched-ext tree.
I fixed it up (see below) and can carry the fix as necessary. This
is now fixed as far as linux-next is concerned, but any non-trivial
conflicts should be mentioned to your upstream maintainer when your tree
is submitted for merging. You may also want to consider cooperating
with the maintainer of the conflicting tree to minimise any particularly
complex conflicts.
I do note there's another system_unbound_wq usage there which for some
reason wasn't updated...
diff --cc kernel/sched/ext.c
index 7278d57496478,d6d8073370130..0000000000000
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@@ -2766,8 -3205,11 +3205,11 @@@ static void scx_watchdog_workfn(struct
cond_resched();
}
- queue_delayed_work(system_dfl_wq, to_delayed_work(work),
- READ_ONCE(scx_watchdog_timeout) / 2);
+
+ intv = READ_ONCE(scx_watchdog_interval);
+ if (intv < ULONG_MAX)
- queue_delayed_work(system_unbound_wq, to_delayed_work(work),
++ queue_delayed_work(system_dfl_wq, to_delayed_work(work),
+ intv);
}
void scx_tick(struct rq *rq)
@@@ -4282,9 -5218,247 +5218,247 @@@ static void free_kick_syncs(void
}
}
- static void scx_disable_workfn(struct kthread_work *work)
+ static void refresh_watchdog(void)
+ {
+ struct scx_sched *sch;
+ unsigned long intv = ULONG_MAX;
+
+ /* take the shortest timeout and use its half for watchdog interval */
+ rcu_read_lock();
+ list_for_each_entry_rcu(sch, &scx_sched_all, all)
+ intv = max(min(intv, sch->watchdog_timeout / 2), 1);
+ rcu_read_unlock();
+
+ WRITE_ONCE(scx_watchdog_timestamp, jiffies);
+ WRITE_ONCE(scx_watchdog_interval, intv);
+
+ if (intv < ULONG_MAX)
- mod_delayed_work(system_unbound_wq, &scx_watchdog_work, intv);
++ mod_delayed_work(system_dfl_wq, &scx_watchdog_work, intv);
+ else
+ cancel_delayed_work_sync(&scx_watchdog_work);
+ }
+
+ static s32 scx_link_sched(struct scx_sched *sch)
+ {
+ scoped_guard(raw_spinlock_irq, &scx_sched_lock) {
+ #ifdef CONFIG_EXT_SUB_SCHED
+ struct scx_sched *parent = scx_parent(sch);
+ s32 ret;
+
+ if (parent) {
+ ret = rhashtable_lookup_insert_fast(&scx_sched_hash,
+ &sch->hash_node, scx_sched_hash_params);
+ if (ret) {
+ scx_error(sch, "failed to insert into scx_sched_hash (%d)", ret);
+ return ret;
+ }
+
+ list_add_tail(&sch->sibling, &parent->children);
+ }
+ #endif /* CONFIG_EXT_SUB_SCHED */
+
+ list_add_tail_rcu(&sch->all, &scx_sched_all);
+ }
+
+ refresh_watchdog();
+ return 0;
+ }
+
+ static void scx_unlink_sched(struct scx_sched *sch)
+ {
+ scoped_guard(raw_spinlock_irq, &scx_sched_lock) {
+ #ifdef CONFIG_EXT_SUB_SCHED
+ if (scx_parent(sch)) {
+ rhashtable_remove_fast(&scx_sched_hash, &sch->hash_node,
+ scx_sched_hash_params);
+ list_del_init(&sch->sibling);
+ }
+ #endif /* CONFIG_EXT_SUB_SCHED */
+ list_del_rcu(&sch->all);
+ }
+
+ refresh_watchdog();
+ }
+
+ #ifdef CONFIG_EXT_SUB_SCHED
+ static DECLARE_WAIT_QUEUE_HEAD(scx_unlink_waitq);
+
+ static void drain_descendants(struct scx_sched *sch)
+ {
+ /*
+ * Child scheds that finished the critical part of disabling will take
+ * themselves off @sch->children. Wait for it to drain. As propagation
+ * is recursive, empty @sch->children means that all proper descendant
+ * scheds reached unlinking stage.
+ */
+ wait_event(scx_unlink_waitq, list_empty(&sch->children));
+ }
+
+ static void scx_fail_parent(struct scx_sched *sch,
+ struct task_struct *failed, s32 fail_code)
+ {
+ struct scx_sched *parent = scx_parent(sch);
+ struct scx_task_iter sti;
+ struct task_struct *p;
+
+ scx_error(parent, "ops.init_task() failed (%d) for %s[%d] while disabling a sub-scheduler",
+ fail_code, failed->comm, failed->pid);
+
+ /*
+ * Once $parent is bypassed, it's safe to put SCX_TASK_NONE tasks into
+ * it. This may cause downstream failures on the BPF side but $parent is
+ * dying anyway.
+ */
+ scx_bypass(parent, true);
+
+ scx_task_iter_start(&sti, sch->cgrp);
+ while ((p = scx_task_iter_next_locked(&sti))) {
+ if (scx_task_on_sched(parent, p))
+ continue;
+
+ scoped_guard (sched_change, p, DEQUEUE_SAVE | DEQUEUE_MOVE) {
+ scx_disable_and_exit_task(sch, p);
+ rcu_assign_pointer(p->scx.sched, parent);
+ }
+ }
+ scx_task_iter_stop(&sti);
+ }
+
+ static void scx_sub_disable(struct scx_sched *sch)
+ {
+ struct scx_sched *parent = scx_parent(sch);
+ struct scx_task_iter sti;
+ struct task_struct *p;
+ int ret;
+
+ /*
+ * Guarantee forward progress and wait for descendants to be disabled.
+ * To limit disruptions, $parent is not bypassed. Tasks are fully
+ * prepped and then inserted back into $parent.
+ */
+ scx_bypass(sch, true);
+ drain_descendants(sch);
+
+ /*
+ * Here, every runnable task is guaranteed to make forward progress and
+ * we can safely use blocking synchronization constructs. Actually
+ * disable ops.
+ */
+ mutex_lock(&scx_enable_mutex);
+ percpu_down_write(&scx_fork_rwsem);
+ scx_cgroup_lock();
+
+ set_cgroup_sched(sch_cgroup(sch), parent);
+
+ scx_task_iter_start(&sti, sch->cgrp);
+ while ((p = scx_task_iter_next_locked(&sti))) {
+ struct rq *rq;
+ struct rq_flags rf;
+
+ /* filter out duplicate visits */
+ if (scx_task_on_sched(parent, p))
+ continue;
+
+ /*
+ * By the time control reaches here, all descendant schedulers
+ * should already have been disabled.
+ */
+ WARN_ON_ONCE(!scx_task_on_sched(sch, p));
+
+ /*
+ * If $p is about to be freed, nothing prevents $sch from
+ * unloading before $p reaches sched_ext_free(). Disable and
+ * exit $p right away.
+ */
+ if (!tryget_task_struct(p)) {
+ scx_disable_and_exit_task(sch, p);
+ continue;
+ }
+
+ scx_task_iter_unlock(&sti);
+
+ /*
+ * $p is READY or ENABLED on @sch. Initialize for $parent,
+ * disable and exit from @sch, and then switch over to $parent.
+ *
+ * If a task fails to initialize for $parent, the only available
+ * action is disabling $parent too. While this allows disabling
+ * of a child sched to cause the parent scheduler to fail, the
+ * failure can only originate from ops.init_task() of the
+ * parent. A child can't directly affect the parent through its
+ * own failures.
+ */
+ ret = __scx_init_task(parent, p, false);
+ if (ret) {
+ scx_fail_parent(sch, p, ret);
+ put_task_struct(p);
+ break;
+ }
+
+ rq = task_rq_lock(p, &rf);
+ scoped_guard (sched_change, p, DEQUEUE_SAVE | DEQUEUE_MOVE) {
+ /*
+ * $p is initialized for $parent and still attached to
+ * @sch. Disable and exit for @sch, switch over to
+ * $parent, override the state to READY to account for
+ * $p having already been initialized, and then enable.
+ */
+ scx_disable_and_exit_task(sch, p);
+ scx_set_task_state(p, SCX_TASK_INIT);
+ rcu_assign_pointer(p->scx.sched, parent);
+ scx_set_task_state(p, SCX_TASK_READY);
+ scx_enable_task(parent, p);
+ }
+ task_rq_unlock(rq, p, &rf);
+
+ put_task_struct(p);
+ }
+ scx_task_iter_stop(&sti);
+
+ scx_cgroup_unlock();
+ percpu_up_write(&scx_fork_rwsem);
+
+ /*
+ * All tasks are moved off of @sch but there may still be on-going
+ * operations (e.g. ops.select_cpu()). Drain them by flushing RCU. Use
+ * the expedited version as ancestors may be waiting in bypass mode.
+ * Also, tell the parent that there is no need to keep running bypass
+ * DSQs for us.
+ */
+ synchronize_rcu_expedited();
+ disable_bypass_dsp(sch);
+
+ scx_unlink_sched(sch);
+
+ mutex_unlock(&scx_enable_mutex);
+
+ /*
+ * @sch is now unlinked from the parent's children list. Notify and call
+ * ops.sub_detach/exit(). Note that ops.sub_detach/exit() must be called
+ * after unlinking and releasing all locks. See scx_claim_exit().
+ */
+ wake_up_all(&scx_unlink_waitq);
+
+ if (sch->ops.sub_detach && sch->sub_attached) {
+ struct scx_sub_detach_args sub_detach_args = {
+ .ops = &sch->ops,
+ .cgroup_path = sch->cgrp_path,
+ };
+ SCX_CALL_OP(parent, SCX_KF_UNLOCKED, sub_detach, NULL,
+ &sub_detach_args);
+ }
+
+ if (sch->ops.exit)
+ SCX_CALL_OP(sch, SCX_KF_UNLOCKED, exit, NULL, sch->exit_info);
+ kobject_del(&sch->kobj);
+ }
+ #else /* CONFIG_EXT_SUB_SCHED */
+ static void drain_descendants(struct scx_sched *sch) { }
+ static void scx_sub_disable(struct scx_sched *sch) { }
+ #endif /* CONFIG_EXT_SUB_SCHED */
+
+ static void scx_root_disable(struct scx_sched *sch)
{
- struct scx_sched *sch = container_of(work, struct scx_sched, disable_work);
struct scx_exit_info *ei = sch->exit_info;
struct scx_task_iter sti;
struct task_struct *p;
[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 488 bytes --]
^ permalink raw reply [flat|nested] 3+ messages in thread
* Re: linux-next: manual merge of the sched-ext tree with the tip tree
2026-03-09 18:49 linux-next: manual merge of the sched-ext tree with the tip tree Mark Brown
@ 2026-03-09 20:02 ` Tejun Heo
2026-03-09 20:11 ` [PATCH sched_ext/for-7.1] sched_ext: Replace system_unbound_wq with system_dfl_wq in scx_kobj_release() Tejun Heo
1 sibling, 0 replies; 3+ messages in thread
From: Tejun Heo @ 2026-03-09 20:02 UTC (permalink / raw)
To: Mark Brown
Cc: Ingo Molnar, Linux Kernel Mailing List, Linux Next Mailing List,
Marco Crivellari, Peter Zijlstra
On Mon, Mar 09, 2026 at 06:49:44PM +0000, Mark Brown wrote:
> Hi all,
>
> Today's linux-next merge of the sched-ext tree got a conflict in:
>
> kernel/sched/ext.c
>
> between commits:
>
> c2a57380df9dd ("sched: Replace use of system_unbound_wq with system_dfl_wq")
>
> from the tip tree and commit:
>
> cde94c032b32b ("sched_ext: Make watchdog sub-sched aware")
>
> from the sched-ext tree.
>
> I fixed it up (see below) and can carry the fix as necessary. This
> is now fixed as far as linux-next is concerned, but any non trivial
> conflicts should be mentioned to your upstream maintainer when your tree
> is submitted for merging. You may also want to consider cooperating
> with the maintainer of the conflicting tree to minimise any particularly
> complex conflicts.
I pulled in sched/core into sched_ext/for-7.1 and resolved the conflict.
> I do note there's another system_unbound_wq usage there which for some
> reason wasn't updated...
Yeah, I'll queue a patch to convert that one too.
Thanks.
--
tejun
^ permalink raw reply [flat|nested] 3+ messages in thread
* [PATCH sched_ext/for-7.1] sched_ext: Replace system_unbound_wq with system_dfl_wq in scx_kobj_release()
2026-03-09 18:49 linux-next: manual merge of the sched-ext tree with the tip tree Mark Brown
2026-03-09 20:02 ` Tejun Heo
@ 2026-03-09 20:11 ` Tejun Heo
1 sibling, 0 replies; 3+ messages in thread
From: Tejun Heo @ 2026-03-09 20:11 UTC (permalink / raw)
To: Mark Brown, Ingo Molnar, Linux Kernel Mailing List,
Linux Next Mailing List, Marco Crivellari, Peter Zijlstra
Cc: David Vernet, Andrea Righi, Changwoo Min, Emil Tsalapatis,
sched-ext
c2a57380df9d ("sched: Replace use of system_unbound_wq with system_dfl_wq")
converted system_unbound_wq usages in ext.c but missed the queue_rcu_work()
call in scx_kobj_release() which was added later by the dynamic scx_sched
allocation conversion. Apply the same conversion.
Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Marco Crivellari <marco.crivellari@suse.com>
---
Applied to sched_ext/for-7.1.
kernel/sched/ext.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index b35b98020f3b..07476355bfd5 100644
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -4546,7 +4546,7 @@ static void scx_kobj_release(struct kobject *kobj)
struct scx_sched *sch = container_of(kobj, struct scx_sched, kobj);
INIT_RCU_WORK(&sch->rcu_work, scx_sched_free_rcu_work);
- queue_rcu_work(system_unbound_wq, &sch->rcu_work);
+ queue_rcu_work(system_dfl_wq, &sch->rcu_work);
}
static ssize_t scx_attr_ops_show(struct kobject *kobj,
--
2.53.0
--
tejun
^ permalink raw reply related [flat|nested] 3+ messages in thread
end of thread, other threads:[~2026-03-09 20:11 UTC | newest]
Thread overview: 3+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2026-03-09 18:49 linux-next: manual merge of the sched-ext tree with the tip tree Mark Brown
2026-03-09 20:02 ` Tejun Heo
2026-03-09 20:11 ` [PATCH sched_ext/for-7.1] sched_ext: Replace system_unbound_wq with system_dfl_wq in scx_kobj_release() Tejun Heo
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox