[PATCH-RT sched v3 0/2] Optimize the RT group scheduling

public inbox for linux-kernel@vger.kernel.org
 help / color / mirror / Atom feed

* [PATCH-RT sched v3 0/2] Optimize the RT group scheduling
@ 2024-07-16  5:25 Xavier
  2024-07-16  5:25 ` [PATCH-RT sched v3 1/2] RT SCHED: Optimize the enqueue and dequeue operations for rt_se Xavier
                   ` (2 more replies)
  0 siblings, 3 replies; 7+ messages in thread
From: Xavier @ 2024-07-16  5:25 UTC (permalink / raw)
  To: mingo, peterz, juri.lelli, vincent.guittot
  Cc: dietmar.eggemann, rostedt, bsegall, mgorman, bristot, vschneid,
	linux-kernel, oliver.sang, Xavier

Hi all,

Version 3 of this series fixes the issue with handling tasks with prio
set to 0, which was seen during the execution of blktests.

Kindly review.

Best Regards,
Xavier

Xavier (2):
  RT SCHED: Optimize the enqueue and dequeue operations for rt_se
  RT test: Adding test cases for RT group scheduling

 MAINTAINERS                                   |   7 +
 kernel/sched/debug.c                          |  48 +++
 kernel/sched/rt.c                             | 287 +++++++++++++++---
 kernel/sched/sched.h                          |   1 +
 tools/testing/selftests/sched/Makefile        |   4 +-
 tools/testing/selftests/sched/deadloop.c      | 192 ++++++++++++
 .../selftests/sched/rt_group_sched_test.sh    | 119 ++++++++
 7 files changed, 618 insertions(+), 40 deletions(-)
 create mode 100644 tools/testing/selftests/sched/deadloop.c
 create mode 100755 tools/testing/selftests/sched/rt_group_sched_test.sh

-- 
2.45.2


^ permalink raw reply	[flat|nested] 7+ messages in thread

* [PATCH-RT sched v3 1/2] RT SCHED: Optimize the enqueue and dequeue operations for rt_se
  2024-07-16  5:25 [PATCH-RT sched v3 0/2] Optimize the RT group scheduling Xavier
@ 2024-07-16  5:25 ` Xavier
  2024-07-16  5:25 ` [PATCH-RT sched v3 2/2] RT test: Adding test cases for RT group scheduling Xavier
  2024-07-16  8:59 ` [PATCH-RT sched v3 0/2] Optimize the " Peter Zijlstra
  2 siblings, 0 replies; 7+ messages in thread
From: Xavier @ 2024-07-16  5:25 UTC (permalink / raw)
  To: mingo, peterz, juri.lelli, vincent.guittot
  Cc: dietmar.eggemann, rostedt, bsegall, mgorman, bristot, vschneid,
	linux-kernel, oliver.sang, Xavier

This patch optimizes the enqueue and dequeue of rt_se by employing a
bottom-up removal approach. Specifically, when removing an rt_se at a
certain level, if it is determined that the highest priority of the rq
associated with that rt_se has not changed, there is no need to continue
removing rt_se at higher levels. At this point, only the total number
of removed rt_se needs to be recorded, and the rt_nr_running count of
the higher-level rqs is reduced accordingly.

Signed-off-by: Xavier <xavier_qy@163.com>
---
 kernel/sched/debug.c |  48 ++++++++
 kernel/sched/rt.c    | 287 +++++++++++++++++++++++++++++++++++++------
 kernel/sched/sched.h |   1 +
 3 files changed, 298 insertions(+), 38 deletions(-)

diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index c1eb9a1afd13..bf9edba5e87b 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -712,6 +712,54 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
 #endif
 }
 
+void print_rt_se(struct seq_file *m, struct sched_rt_entity *rt_se)
+{
+	struct task_struct *task;
+
+#ifdef CONFIG_RT_GROUP_SCHED
+	if (rt_se->my_q) {
+		SEQ_printf_task_group_path(m, rt_se->my_q->tg, "%s\n");
+		return;
+	}
+#endif
+	task = container_of(rt_se, struct task_struct, rt);
+	SEQ_printf(m, "	prio-%d, pid-%d, %s\n", task->prio, task->pid, task->comm);
+}
+
+/* Must be called with the rq lock held. */
+void print_rt_rq_task(struct seq_file *m, struct rt_rq *rt_rq)
+{
+	struct rt_prio_array *array = &rt_rq->active;
+	struct sched_rt_entity *rt_se;
+	struct list_head *queue, *head;
+	unsigned long bitmap[2];
+	int idx;
+	int count = 0;
+
+	if (!rt_rq->rt_nr_running)
+		return;
+
+	memcpy(bitmap, array->bitmap, sizeof(unsigned long) * 2);
+	idx = sched_find_first_bit(bitmap);
+	WARN_ON_ONCE(idx >= MAX_RT_PRIO);
+
+	while (1) {
+		clear_bit(idx, bitmap);
+		queue = array->queue + idx;
+		head = queue;
+		queue = queue->next;
+		do {
+			rt_se = list_entry(queue, struct sched_rt_entity, run_list);
+			print_rt_se(m, rt_se);
+			queue = queue->next;
+			count++;
+		} while (queue != head);
+		idx = sched_find_first_bit(bitmap);
+		if (idx >= MAX_RT_PRIO)
+			break;
+	}
+}
+
 void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq)
 {
 #ifdef CONFIG_RT_GROUP_SCHED
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index aa4c1c874fa4..b18c424a50d2 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -1113,7 +1113,7 @@ void dec_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio) {}
 #endif /* CONFIG_SMP */
 
 #if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
-static void
+static int
 inc_rt_prio(struct rt_rq *rt_rq, int prio)
 {
 	int prev_prio = rt_rq->highest_prio.curr;
@@ -1122,9 +1122,11 @@ inc_rt_prio(struct rt_rq *rt_rq, int prio)
 		rt_rq->highest_prio.curr = prio;
 
 	inc_rt_prio_smp(rt_rq, prio, prev_prio);
+
+	return prev_prio > prio;
 }
 
-static void
+static int
 dec_rt_prio(struct rt_rq *rt_rq, int prio)
 {
 	int prev_prio = rt_rq->highest_prio.curr;
@@ -1149,12 +1151,22 @@ dec_rt_prio(struct rt_rq *rt_rq, int prio)
 	}
 
 	dec_rt_prio_smp(rt_rq, prio, prev_prio);
+	if (rt_rq->highest_prio.curr > prio)
+		return prio;
+	else
+		return MAX_RT_PRIO;
 }
 
 #else
 
-static inline void inc_rt_prio(struct rt_rq *rt_rq, int prio) {}
-static inline void dec_rt_prio(struct rt_rq *rt_rq, int prio) {}
+static inline int inc_rt_prio(struct rt_rq *rt_rq, int prio)
+{
+	return 0;
+}
+static inline int dec_rt_prio(struct rt_rq *rt_rq, int prio)
+{
+	return 0;
+}
 
 #endif /* CONFIG_SMP || CONFIG_RT_GROUP_SCHED */
 
@@ -1218,28 +1230,31 @@ unsigned int rt_se_rr_nr_running(struct sched_rt_entity *rt_se)
 }
 
 static inline
-void inc_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
+int inc_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
 {
 	int prio = rt_se_prio(rt_se);
+	int prio_change;
 
 	WARN_ON(!rt_prio(prio));
 	rt_rq->rt_nr_running += rt_se_nr_running(rt_se);
 	rt_rq->rr_nr_running += rt_se_rr_nr_running(rt_se);
 
-	inc_rt_prio(rt_rq, prio);
+	prio_change = inc_rt_prio(rt_rq, prio);
 	inc_rt_group(rt_se, rt_rq);
+	return prio_change;
 }
 
 static inline
-void dec_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
+int dec_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq, int prio)
 {
+	int prio_changed;
 	WARN_ON(!rt_prio(rt_se_prio(rt_se)));
-	WARN_ON(!rt_rq->rt_nr_running);
 	rt_rq->rt_nr_running -= rt_se_nr_running(rt_se);
 	rt_rq->rr_nr_running -= rt_se_rr_nr_running(rt_se);
 
-	dec_rt_prio(rt_rq, rt_se_prio(rt_se));
+	prio_changed = dec_rt_prio(rt_rq, prio);
 	dec_rt_group(rt_se, rt_rq);
+	return prio_changed;
 }
 
 /*
@@ -1255,12 +1270,13 @@ static inline bool move_entity(unsigned int flags)
 	return true;
 }
 
-static void __delist_rt_entity(struct sched_rt_entity *rt_se, struct rt_prio_array *array)
+static void __delist_rt_entity(struct sched_rt_entity *rt_se,
+						struct rt_prio_array *array, int last_prio)
 {
 	list_del_init(&rt_se->run_list);
 
-	if (list_empty(array->queue + rt_se_prio(rt_se)))
-		__clear_bit(rt_se_prio(rt_se), array->bitmap);
+	if (list_empty(array->queue + last_prio))
+		__clear_bit(last_prio, array->bitmap);
 
 	rt_se->on_list = 0;
 }
@@ -1371,7 +1387,12 @@ update_stats_dequeue_rt(struct rt_rq *rt_rq, struct sched_rt_entity *rt_se,
 	}
 }
 
-static void __enqueue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags)
+/*
+ * Returns: -1 indicates that rt_se was not enqueued, 0 indicates that the highest
+ * priority of the rq did not change after enqueue, and 1 indicates that the highest
+ * priority of the rq changed after enqueue.
+ */
+static int __enqueue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags)
 {
 	struct rt_rq *rt_rq = rt_rq_of_se(rt_se);
 	struct rt_prio_array *array = &rt_rq->active;
@@ -1386,8 +1407,8 @@ static void __enqueue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flag
 	 */
 	if (group_rq && (rt_rq_throttled(group_rq) || !group_rq->rt_nr_running)) {
 		if (rt_se->on_list)
-			__delist_rt_entity(rt_se, array);
-		return;
+			__delist_rt_entity(rt_se, array, rt_se_prio(rt_se));
+		return -1;
 	}
 
 	if (move_entity(flags)) {
@@ -1402,73 +1423,263 @@ static void __enqueue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flag
 	}
 	rt_se->on_rq = 1;
 
-	inc_rt_tasks(rt_se, rt_rq);
+	return inc_rt_tasks(rt_se, rt_rq);
 }
 
-static void __dequeue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags)
+/**
+ * delete rt_se from rt_rq
+ *
+ * @rt_se		Nodes to be deleted
+ * @last_prio	The highest priority of this rt_se before the previous round
+ *				of deletion
+ * @flags		operation flags
+ *
+ * Returns: =0 indicates that the highest priority of the current rq did not
+ * change during this deletion. >0 indicates it changed, and it returns the
+ * previous highest priority to use in the next round of deletion.
+ */
+static int __dequeue_rt_entity(struct sched_rt_entity *rt_se, int last_prio,
+									unsigned int flags)
 {
 	struct rt_rq *rt_rq = rt_rq_of_se(rt_se);
 	struct rt_prio_array *array = &rt_rq->active;
 
 	if (move_entity(flags)) {
 		WARN_ON_ONCE(!rt_se->on_list);
-		__delist_rt_entity(rt_se, array);
+		__delist_rt_entity(rt_se, array, last_prio);
 	}
 	rt_se->on_rq = 0;
 
-	dec_rt_tasks(rt_se, rt_rq);
+	return dec_rt_tasks(rt_se, rt_rq, last_prio);
+}
+
+static inline void dec_rq_nr_running(struct sched_rt_entity *rt_se,
+						unsigned int rt, unsigned int rr)
+{
+	struct rt_rq *rt_rq = rt_rq_of_se(rt_se);
+
+	rt_rq->rt_nr_running -= rt;
+	rt_rq->rr_nr_running -= rr;
+}
+
+static inline void add_rq_nr_running(struct sched_rt_entity *rt_se,
+						unsigned int rt, unsigned int rr)
+{
+	struct rt_rq *rt_rq = rt_rq_of_se(rt_se);
+
+	rt_rq->rt_nr_running += rt;
+	rt_rq->rr_nr_running += rr;
+}
+
+static inline bool on_top_rt_rq(struct sched_rt_entity *rt_se)
+{
+#ifdef CONFIG_RT_GROUP_SCHED
+	if (rt_se->parent)
+		return false;
+#endif
+	return true;
 }
 
 /*
- * Because the prio of an upper entry depends on the lower
- * entries, we must remove entries top - down.
+ * To optimize the enqueue and dequeue of rt_se, this strategy employs a
+ * bottom-up removal approach. Specifically, when removing an rt_se at a
+ * certain level, if it is determined that the highest priority of the rq
+ * associated with that rt_se has not changed, there is no need to continue
+ * removing rt_se at higher levels. At this point, only the total number
+ * of removed rt_se needs to be recorded, and the rt_nr_running count of
+ * higher-level rq should be removed accordingly.
+ *
+ * For enqueue operations, if an rt_se at a certain level is in the rq,
+ * it is still necessary to check the priority of the higher-level rq.
+ * If the priority of the higher-level rq is found to be lower than that
+ * of the rt_se to be added, it should be removed, as updating the highest
+ * priority of the rq during addition will cause the rq to be repositioned
+ * in the parent rq.
+ *
+ * Conversely, for dequeue operations, if an rt_se at a certain level is
+ * not in the rq, the operation can be exited immediately to reduce
+ * unnecessary checks and handling.
+ *
+ * The return value refers to the last rt_se that was removed for enqueue
+ * operations. And for dequeue operations, it refers to the last rt_se
+ * that was either removed or had its rt_nr_running updated.
  */
-static void dequeue_rt_stack(struct sched_rt_entity *rt_se, unsigned int flags)
+static struct sched_rt_entity *dequeue_rt_stack(struct sched_rt_entity *rt_se,
+						unsigned int flags, int for_enqueue)
 {
-	struct sched_rt_entity *back = NULL;
-	unsigned int rt_nr_running;
+	struct sched_rt_entity *last = rt_se;
+	struct sched_rt_entity *origin = rt_se;
+	unsigned int del_rt_nr = 0;
+	unsigned int del_rr_nr = 0;
+	int prio_changed = rt_se_prio(rt_se);
+	int sub_on_rq = 1;
 
 	for_each_sched_rt_entity(rt_se) {
-		rt_se->back = back;
-		back = rt_se;
-	}
+		if (on_rt_rq(rt_se)) {
+			if (sub_on_rq) {
+				/*
+				 * The number of tasks removed from the sub-level rt_se also needs
+				 * to be subtracted from the rq of the current rt_se, as the current
+				 * rt_se's rq no longer includes the number of removed tasks.
+				 */
+				dec_rq_nr_running(rt_se, del_rt_nr, del_rr_nr);
+				if ((prio_changed != MAX_RT_PRIO) ||
+					(rt_se_prio(rt_se) > rt_se_prio(origin))) {
+					/*
+					 * If the removal of the lower-level rt_se causes the
+					 * highest priority of the current rq to change, or if the
+					 * priority of current rq is lower than the rt_se to be
+					 * added, then the current rt_se also needs to be removed
+					 * from its parent rq, and the number of deleted tasks
+					 * should be accumulated.
+					 */
+					if (prio_changed == MAX_RT_PRIO)
+						prio_changed = rt_se_prio(rt_se);
+					del_rt_nr += rt_se_nr_running(rt_se);
+					del_rr_nr += rt_se_rr_nr_running(rt_se);
+					prio_changed = __dequeue_rt_entity(rt_se,
+									prio_changed, flags);
+					last = rt_se;
+				} else if (!for_enqueue) {
+					/* For dequeue, last may only rt_nr_running was modified.*/
+					last = rt_se;
+				}
+			} else {
+				/*
+				 * Entering this branch must be for enqueue, as dequeue would break
+				 * if an rt_se is not online.
+				 * If the sub-level node is not online, and the current rt_se's
+				 * priority is lower than the one being added, current rt_se need
+				 * to be removed.
+				 */
+				prio_changed = rt_se_prio(rt_se);
+				if (prio_changed > rt_se_prio(origin)) {
+					del_rt_nr += rt_se_nr_running(rt_se);
+					del_rr_nr += rt_se_rr_nr_running(rt_se);
+					prio_changed = __dequeue_rt_entity(rt_se,
+									prio_changed, flags);
+					last = rt_se;
+				} else {
+					prio_changed = MAX_RT_PRIO;
+				}
+			}
 
-	rt_nr_running = rt_rq_of_se(back)->rt_nr_running;
+			/*
+			 * If the current rt_se is on the top rt_rq, then the already deleted
+			 * nodes, plus the count of the rt_rq where current rt_se located,
+			 * need to be removed from the top_rt_rq.
+			 */
+			if (on_top_rt_rq(rt_se)) {
+				dequeue_top_rt_rq(rt_rq_of_se(rt_se),
+						del_rt_nr + rt_rq_of_se(rt_se)->rt_nr_running);
+			}
+			sub_on_rq = 1;
+		} else if (for_enqueue) {
+			struct rt_rq *group_rq = group_rt_rq(rt_se);
 
-	for (rt_se = back; rt_se; rt_se = rt_se->back) {
-		if (on_rt_rq(rt_se))
-			__dequeue_rt_entity(rt_se, flags);
+			/*
+			 * In the case of an enqueue operation, if a certain level is found to be
+			 * not online, then the previous counts need to be reset to zero.
+			 */
+			prio_changed = MAX_RT_PRIO;
+			sub_on_rq = 0;
+			del_rt_nr = 0;
+			del_rr_nr = 0;
+
+			/*
+			 * If the current group is being throttled, then there is no need to check
+			 * higher levels since enqueueing will not affect higher-level nodes.
+			 */
+			if (group_rq && rt_rq_throttled(group_rq))
+				break;
+
+			if (on_top_rt_rq(rt_se))
+				dequeue_top_rt_rq(rt_rq_of_se(rt_se),
+						rt_rq_of_se(rt_se)->rt_nr_running);
+		} else {
+			last = rt_se;
+			break;
+		}
 	}
 
-	dequeue_top_rt_rq(rt_rq_of_se(back), rt_nr_running);
+	return last;
 }
 
 static void enqueue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags)
 {
 	struct rq *rq = rq_of_rt_se(rt_se);
+	struct sched_rt_entity *last;
+	unsigned int add_rt_nr = 0;
+	unsigned int add_rr_nr = 0;
+	int enqueue = 1;
+	int prio_change = 1;
 
 	update_stats_enqueue_rt(rt_rq_of_se(rt_se), rt_se, flags);
 
-	dequeue_rt_stack(rt_se, flags);
-	for_each_sched_rt_entity(rt_se)
-		__enqueue_rt_entity(rt_se, flags);
+	last = dequeue_rt_stack(rt_se, flags, 1);
+
+	for_each_sched_rt_entity(rt_se) {
+		if (enqueue || !on_rt_rq(rt_se) || (prio_change == 1)) {
+			prio_change = __enqueue_rt_entity(rt_se, flags);
+			if (prio_change >= 0) {
+				add_rt_nr = rt_se_nr_running(rt_se);
+				add_rr_nr = rt_se_rr_nr_running(rt_se);
+			} else {
+				add_rt_nr = add_rr_nr = 0;
+			}
+		} else {
+			add_rq_nr_running(rt_se, add_rt_nr, add_rr_nr);
+		}
+
+		if (rt_se == last)
+			enqueue = 0;
+	}
+
 	enqueue_top_rt_rq(&rq->rt);
 }
 
 static void dequeue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags)
 {
 	struct rq *rq = rq_of_rt_se(rt_se);
+	struct sched_rt_entity *last;
+	unsigned int add_rt_nr = 0;
+	unsigned int add_rr_nr = 0;
+	int prio_change = 1;
 
 	update_stats_dequeue_rt(rt_rq_of_se(rt_se), rt_se, flags);
 
-	dequeue_rt_stack(rt_se, flags);
+	last = dequeue_rt_stack(rt_se, flags, 0);
 
 	for_each_sched_rt_entity(rt_se) {
 		struct rt_rq *rt_rq = group_rt_rq(rt_se);
+		if (rt_rq && rt_rq->rt_nr_running) {
+			if (on_rt_rq(rt_se)) {
+				add_rq_nr_running(rt_se, add_rt_nr, add_rr_nr);
+			} else {
+				prio_change = __enqueue_rt_entity(rt_se, flags);
+				if (prio_change == 0) {
+					/*
+					 * If enqueue is successful and the priority of the rq has
+					 * not changed, then the parent node only needs to add the
+					 * count of the current rt_se. Otherwise, the parent node
+					 * will also need to enqueue.
+					 */
+					add_rt_nr = rt_se_nr_running(rt_se);
+					add_rr_nr = rt_se_rr_nr_running(rt_se);
+				}
+			}
+		} else {
+			add_rt_nr = add_rr_nr = 0;
+		}
 
-		if (rt_rq && rt_rq->rt_nr_running)
-			__enqueue_rt_entity(rt_se, flags);
+		/*
+		 * last is the rt_se of the last deletion or modification of the
+		 * count, so the subsequent rt_se does not need to be updated.
+		 */
+		if (rt_se == last)
+			break;
 	}
+
 	enqueue_top_rt_rq(&rq->rt);
 }
 
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index a831af102070..b634153aacf0 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -2878,6 +2878,7 @@ extern void print_rt_stats(struct seq_file *m, int cpu);
 extern void print_dl_stats(struct seq_file *m, int cpu);
 extern void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq);
 extern void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq);
+extern void print_rt_rq_task(struct seq_file *m, struct rt_rq *rt_rq);
 extern void print_dl_rq(struct seq_file *m, int cpu, struct dl_rq *dl_rq);
 
 extern void resched_latency_warn(int cpu, u64 latency);
-- 
2.45.2


^ permalink raw reply related	[flat|nested] 7+ messages in thread

* [PATCH-RT sched v3 2/2] RT test: Adding test cases for RT group scheduling
  2024-07-16  5:25 [PATCH-RT sched v3 0/2] Optimize the RT group scheduling Xavier
  2024-07-16  5:25 ` [PATCH-RT sched v3 1/2] RT SCHED: Optimize the enqueue and dequeue operations for rt_se Xavier
@ 2024-07-16  5:25 ` Xavier
  2024-07-16  8:59 ` [PATCH-RT sched v3 0/2] Optimize the " Peter Zijlstra
  2 siblings, 0 replies; 7+ messages in thread
From: Xavier @ 2024-07-16  5:25 UTC (permalink / raw)
  To: mingo, peterz, juri.lelli, vincent.guittot
  Cc: dietmar.eggemann, rostedt, bsegall, mgorman, bristot, vschneid,
	linux-kernel, oliver.sang, Xavier

Add test cases for RT group scheduling: create some RT infinite-loop
processes/threads, set them to the same or different priorities, place
them in different RT task groups, run them for a period of time, and
finally count the number of loop iterations executed by all tasks.

Signed-off-by: Xavier <xavier_qy@163.com>
---
 MAINTAINERS                                   |   7 +
 tools/testing/selftests/sched/Makefile        |   4 +-
 tools/testing/selftests/sched/deadloop.c      | 192 ++++++++++++++++++
 .../selftests/sched/rt_group_sched_test.sh    | 119 +++++++++++
 4 files changed, 320 insertions(+), 2 deletions(-)
 create mode 100644 tools/testing/selftests/sched/deadloop.c
 create mode 100755 tools/testing/selftests/sched/rt_group_sched_test.sh

diff --git a/MAINTAINERS b/MAINTAINERS
index 43353b705988..d29effe57bf8 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -19480,6 +19480,13 @@ L:	linux-remoteproc@vger.kernel.org
 S:	Maintained
 F:	drivers/tty/rpmsg_tty.c
 
+RT GROUP SCHED TEST
+M:	Xavier <xavier_qy@163.com>
+L:	linux-kernel@vger.kernel.org
+S:	Maintained
+F:	tools/testing/selftests/sched/deadloop.c
+F:	tools/testing/selftests/sched/rt_group_sched_test.sh
+
 RTL2830 MEDIA DRIVER
 L:	linux-media@vger.kernel.org
 S:	Orphan
diff --git a/tools/testing/selftests/sched/Makefile b/tools/testing/selftests/sched/Makefile
index 099ee9213557..96decb58bf35 100644
--- a/tools/testing/selftests/sched/Makefile
+++ b/tools/testing/selftests/sched/Makefile
@@ -8,7 +8,7 @@ CFLAGS += -O2 -Wall -g -I./ $(KHDR_INCLUDES) -Wl,-rpath=./ \
 	  $(CLANG_FLAGS)
 LDLIBS += -lpthread
 
-TEST_GEN_FILES := cs_prctl_test
-TEST_PROGS := cs_prctl_test
+TEST_GEN_FILES := cs_prctl_test deadloop
+TEST_PROGS := cs_prctl_test deadloop
 
 include ../lib.mk
diff --git a/tools/testing/selftests/sched/deadloop.c b/tools/testing/selftests/sched/deadloop.c
new file mode 100644
index 000000000000..d850a3e2a0ab
--- /dev/null
+++ b/tools/testing/selftests/sched/deadloop.c
@@ -0,0 +1,192 @@
+// SPDX-License-Identifier: GPL-2.0-only
+
+#define _GNU_SOURCE
+#include <stdlib.h>
+#include <sys/types.h>
+#include <stdio.h>
+#include <string.h>
+#include <errno.h>
+#include <unistd.h>
+#include <pthread.h>
+#include <signal.h>
+
+/*
+ * Create multiple infinite loop threads based on the passed parameters
+ * Usage: deadloop num policy prio
+ *	num: the number of child threads
+ *	policy: the scheduling policy of the child threads, 0-fair, 1-fifo, 2-rr
+ *	prio: the priority
+ * If this process is killed, it will print the loop count of all child threads
+ * to the OUTPUT_FILE
+ *
+ * Date: June 27, 2024
+ * Author: Xavier <xavier_qy@163.com>
+ */
+
+#define OUTPUT_FILE "rt_group_sched_test.log"
+
+#if __GLIBC_PREREQ(2, 30) == 0
+#include <sys/syscall.h>
+static pid_t gettid(void)
+{
+	return syscall(SYS_gettid);
+}
+#endif
+
+#define do_err(x) \
+do { \
+	if ((x) < 0) {  \
+		printf("test BUG_ON func %s, line %d %ld\n", \
+			__func__, __LINE__, (long)(x) \
+		); \
+		while (1) \
+			sleep(1); \
+	} \
+} while (0)
+
+#define do_false(x) \
+do { \
+	if ((x) == 1) { \
+		printf("test BUG_ON func %s, line %d %d\n", \
+			__func__, __LINE__, (x) \
+		); \
+		while (1) \
+			sleep(1); \
+	} \
+} while (0)
+
+
+struct thread_data {
+	pthread_t thread;
+	int index;
+	int pid;
+	unsigned long cnt;
+};
+
+static struct thread_data *pdata;
+static int thread_num = 1;
+
+static void create_thread_posix(void *entry, pthread_t *thread, int *para,
+								 int policy, int prio)
+{
+	int					ret;
+	struct sched_param	param;
+	pthread_attr_t		attr;
+
+	memset(&param, 0, sizeof(param));
+	ret = pthread_attr_init(&attr);
+	do_err(ret);
+
+	ret = pthread_attr_setinheritsched(&attr, PTHREAD_EXPLICIT_SCHED);
+	do_err(ret);
+
+	param.sched_priority = prio;
+
+	ret = pthread_attr_setschedpolicy(&attr, policy);
+	do_err(ret);
+
+	ret = pthread_attr_setschedparam(&attr, &param);
+	do_err(ret);
+
+	ret = pthread_create(thread, &attr, entry, para);
+	do_err(ret);
+}
+
+static void *dead_loop_entry(void *arg)
+{
+	int index = *(int *)arg;
+	struct sched_param param;
+	int cur = gettid();
+
+	sched_getparam(cur, &param);
+	pdata[index].pid = cur;
+	printf("cur:%d prio:%d\n", cur, param.sched_priority);
+
+	while (1) {
+		asm volatile("" ::: "memory");
+		pdata[index].cnt++;
+	}
+	return NULL;
+}
+
+static void handle_signal(int signal)
+{
+	int cnt = 0;
+
+	if (signal == SIGTERM) {
+		FILE *file = freopen(OUTPUT_FILE, "a", stdout);
+
+		if (file == NULL) {
+			perror("freopen");
+			exit(0);
+		}
+
+		while (cnt < thread_num) {
+			printf("pid:%d cnt:%ld\n", pdata[cnt].pid, pdata[cnt].cnt);
+			cnt++;
+		}
+		fclose(file);
+		exit(0);
+	}
+}
+
+static int dead_loop_create(int policy, int prio)
+{
+	int cnt = 0;
+	int ret;
+	void *status;
+	struct sched_param param;
+
+	param.sched_priority = prio;
+	pdata = malloc(thread_num * sizeof(struct thread_data));
+	do_false(!pdata);
+
+	if (policy) {
+		ret = sched_setscheduler(0, policy, &param);
+		do_err(ret);
+	}
+
+	while (cnt < thread_num) {
+		pdata[cnt].index = cnt;
+		create_thread_posix(dead_loop_entry, &pdata[cnt].thread,
+								 &pdata[cnt].index, policy, prio);
+		cnt++;
+	}
+
+	signal(SIGTERM, handle_signal);
+
+	cnt = 0;
+	while (cnt < thread_num) {
+		pthread_join(pdata[cnt].thread, &status);
+		cnt++;
+	}
+
+	free(pdata);
+	return 0;
+}
+
+int main(int argc, char **argv)
+{
+	int policy = 2;
+	int prio = 50;
+
+	if (argc == 2)
+		thread_num = atoi(argv[1]);
+
+	if (argc == 3) {
+		thread_num = atoi(argv[1]);
+		policy = atoi(argv[2]);
+		if (policy > 0)
+			prio = 50;
+	}
+
+	if (argc == 4) {
+		thread_num = atoi(argv[1]);
+		policy = atoi(argv[2]);
+		prio = atoi(argv[3]);
+	}
+
+	dead_loop_create(policy, prio);
+
+	return 0;
+}
diff --git a/tools/testing/selftests/sched/rt_group_sched_test.sh b/tools/testing/selftests/sched/rt_group_sched_test.sh
new file mode 100755
index 000000000000..9031250a2684
--- /dev/null
+++ b/tools/testing/selftests/sched/rt_group_sched_test.sh
@@ -0,0 +1,119 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+# Test for rt group scheduling
+# Date: June 27, 2024
+# Author: Xavier <xavier_qy@163.com>
+
+# Record the list of child process PIDs
+PIDS=()
+
+# File for redirected output
+LOGFILE="rt_group_sched_test.log"
+
+# Cleanup function: kill all recorded child processes and unmount the cgroup
+function cleanup() {
+	echo "Cleaning up..."
+	for pid in "${PIDS[@]}"; do
+		if kill -0 $pid 2>/dev/null; then
+			kill -TERM $pid
+		fi
+	done
+
+	# Sleep for a while to ensure the processes are properly killed
+	sleep 2
+
+	# Unmount the cgroup filesystem
+	umount /sys/fs/cgroup/cpu 2>/dev/null
+	umount /sys/fs/cgroup 2>/dev/null
+	echo "Cleanup completed."
+
+	# Ensure the LOGFILE exists and is correct
+	if [ ! -f "$LOGFILE" ]; then
+		echo "$LOGFILE not found!"
+		exit 1
+	fi
+
+	# Initialize the total count variable
+	total=0
+
+	# Read matching lines and calculate the total sum
+	while IFS= read -r line
+	do
+		# Use grep to match lines containing 'pid:' and 'cnt:', and extract the value of cnt
+		if echo "$line" | grep -q '^pid:[[:digit:]]\+ cnt:[[:digit:]]\+'; then
+			cnt=$(echo "$line" | sed -n \
+			  's/^pid:[[:digit:]]\+ cnt:\([[:digit:]]\+\)/\1/p')
+			total=$((total + cnt))
+		fi
+	done < "$LOGFILE"
+
+	# Print the total sum
+	echo "Total cnt: $total"
+	echo "Finished processing."
+}
+
+# Capture actions when interrupted or terminated by a signal
+trap cleanup EXIT
+
+# Start the cgroup filesystem and create the necessary directories
+function setup_cgroups() {
+	mount -t tmpfs -o mode=755 cgroup_root /sys/fs/cgroup
+	mkdir -p /sys/fs/cgroup/cpu
+	mount -t cgroup -o cpu none /sys/fs/cgroup/cpu
+}
+
+# Create cgroup subdirectories and configure their settings
+function create_child_cgroup() {
+	local base_dir=$1
+	local name=$2
+	local rt_period=$3
+	local rt_runtime=$4
+	mkdir -p "$base_dir/$name"
+	echo $rt_period > "$base_dir/$name/cpu.rt_period_us"
+	echo $rt_runtime > "$base_dir/$name/cpu.rt_runtime_us"
+}
+# Launch a process and add it to the specified cgroup
+function launch_process() {
+	local process_name=$1
+
+	# Three parameters representing the number of child threads, scheduling policy, and priority
+	local args=$2
+	local cgroup_path=$3
+
+	# Launch the process
+	exec -a $process_name ./deadloop $args &
+	local pid=$!
+	PIDS+=($pid)
+
+	# Short sleep to ensure the process starts
+	sleep 1
+
+	# Check if the process started successfully
+	if ! pgrep -x $process_name > /dev/null; then
+		echo "Error: No process found with name $process_name."
+		exit 1
+	fi
+
+	echo $pid > "$cgroup_path/cgroup.procs"
+	echo "Process $process_name with PID $pid added to cgroup $cgroup_path"
+}
+
+# Main function running all tasks
+function main() {
+	echo "The test needs 30 seconds..."
+	rm -f "$LOGFILE"
+	setup_cgroups
+	create_child_cgroup "/sys/fs/cgroup/cpu" "child1" 1000000 800000
+	create_child_cgroup "/sys/fs/cgroup/cpu/child1" "child2" 1000000 700000
+	create_child_cgroup "/sys/fs/cgroup/cpu/child1/child2" "child3" 1000000 600000
+	launch_process "child1" "3 2 50" "/sys/fs/cgroup/cpu/child1"
+	launch_process "child2" "3 2 50" "/sys/fs/cgroup/cpu/child1/child2"
+	launch_process "child3" "1 2 50" "/sys/fs/cgroup/cpu/child1/child2/child3"
+	launch_process "tg_root" "1 2 50" "/sys/fs/cgroup/cpu"
+
+	# Run for 30 seconds
+	sleep 30
+}
+
+# Execute the main function
+main
-- 
2.45.2


^ permalink raw reply related	[flat|nested] 7+ messages in thread

* [PATCH-RT sched v3 0/2] Optimize the RT group scheduling
  2024-06-29 11:28 [PATCH-RT sched v2 " Xavier
@ 2024-07-16  6:05 ` Xavier
  0 siblings, 0 replies; 7+ messages in thread
From: Xavier @ 2024-07-16  6:05 UTC (permalink / raw)
  To: mingo, peterz, juri.lelli, vincent.guittot
  Cc: dietmar.eggemann, rostedt, bsegall, mgorman, bristot, vschneid,
	linux-kernel, oliver.sang, Xavier

Hi all,

Version 3 of this series fixes the issue with handling tasks with prio
set to 0, which was seen during the execution of blktests.

Kindly review.

Best Regards,
Xavier

Xavier (2):
  RT SCHED: Optimize the enqueue and dequeue operations for rt_se
  RT test: Adding test cases for RT group scheduling

 MAINTAINERS                                   |   7 +
 kernel/sched/debug.c                          |  48 +++
 kernel/sched/rt.c                             | 287 +++++++++++++++---
 kernel/sched/sched.h                          |   1 +
 tools/testing/selftests/sched/Makefile        |   4 +-
 tools/testing/selftests/sched/deadloop.c      | 192 ++++++++++++
 .../selftests/sched/rt_group_sched_test.sh    | 119 ++++++++
 7 files changed, 618 insertions(+), 40 deletions(-)
 create mode 100644 tools/testing/selftests/sched/deadloop.c
 create mode 100755 tools/testing/selftests/sched/rt_group_sched_test.sh

-- 
2.45.2


^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [PATCH-RT sched v3 0/2] Optimize the RT group scheduling
  2024-07-16  5:25 [PATCH-RT sched v3 0/2] Optimize the RT group scheduling Xavier
  2024-07-16  5:25 ` [PATCH-RT sched v3 1/2] RT SCHED: Optimize the enqueue and dequeue operations for rt_se Xavier
  2024-07-16  5:25 ` [PATCH-RT sched v3 2/2] RT test: Adding test cases for RT group scheduling Xavier
@ 2024-07-16  8:59 ` Peter Zijlstra
  2024-07-16  9:17   ` Xavier
  2 siblings, 1 reply; 7+ messages in thread
From: Peter Zijlstra @ 2024-07-16  8:59 UTC (permalink / raw)
  To: Xavier
  Cc: mingo, juri.lelli, vincent.guittot, dietmar.eggemann, rostedt,
	bsegall, mgorman, bristot, vschneid, linux-kernel, oliver.sang

On Tue, Jul 16, 2024 at 01:25:41PM +0800, Xavier wrote:
> Hi all,
> 
> Patch 3 fixed the issue with handling tasks with prio set to 0 during
> the execution of blktests.

*sigh*... are you actually using this horror show?

The plan was to scrap this code -- and replace it with something based
on deadline servers. Sadly not a lot of people are able to work on that
:/

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re:Re: [PATCH-RT sched v3 0/2] Optimize the RT group scheduling
  2024-07-16  8:59 ` [PATCH-RT sched v3 0/2] Optimize the " Peter Zijlstra
@ 2024-07-16  9:17   ` Xavier
  2024-07-16 10:25     ` Peter Zijlstra
  0 siblings, 1 reply; 7+ messages in thread
From: Xavier @ 2024-07-16  9:17 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: mingo, juri.lelli, vincent.guittot, dietmar.eggemann, rostedt,
	bsegall, mgorman, bristot, vschneid, linux-kernel, oliver.sang


Hi Peter,

Do you mean that the RT scheduling policy will be removed? I see that the
current kernel already includes the deadline and EEVDF scheduling policies.
I noticed that the enqueue and dequeue operations for RT are quite
complicated, so I proposed this patch for optimization.



At 2024-07-16 16:59:26, "Peter Zijlstra" <peterz@infradead.org> wrote:
>On Tue, Jul 16, 2024 at 01:25:41PM +0800, Xavier wrote:
>> Hi all,
>> 
>> Patch 3 fixed the issue with handling tasks with prio set to 0 during
>> the execution of blktests.
>
>*sigh*... are you actually using this horror show?
>
>The plan was to scrap this code -- and replace it with something based
>on deadline servers. Sadly not a lot of people are able to work on that
>:/

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: Re: [PATCH-RT sched v3 0/2] Optimize the RT group scheduling
  2024-07-16  9:17   ` Xavier
@ 2024-07-16 10:25     ` Peter Zijlstra
  0 siblings, 0 replies; 7+ messages in thread
From: Peter Zijlstra @ 2024-07-16 10:25 UTC (permalink / raw)
  To: Xavier
  Cc: mingo, juri.lelli, vincent.guittot, dietmar.eggemann, rostedt,
	bsegall, mgorman, bristot, vschneid, linux-kernel, oliver.sang

On Tue, Jul 16, 2024 at 05:17:50PM +0800, Xavier wrote:
> 
> Hi Peter,
> 
> Your meaning is, will the RT scheduling policy be removed? I see that the
> current kernel already includes the deadline and EEVDF scheduling policies.
> I noticed that the enqueue and dequeue operations for RT are quite
> complicated, so I proposed this patch for optimization.

No, not the policy, but we would really like to completely redo the
group scheduling part for RT.

For UP it is trivial, but we're not completely sure how to best do it
for SMP yet.

Anyway, I had gotten the impression that very few people indeed were
using the RT group scheduling feature, so I was surprised to see your
patches.

^ permalink raw reply	[flat|nested] 7+ messages in thread

end of thread, other threads:[~2024-07-16 10:25 UTC | newest]

Thread overview: 7+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2024-07-16  5:25 [PATCH-RT sched v3 0/2] Optimize the RT group scheduling Xavier
2024-07-16  5:25 ` [PATCH-RT sched v3 1/2] RT SCHED: Optimize the enqueue and dequeue operations for rt_se Xavier
2024-07-16  5:25 ` [PATCH-RT sched v3 2/2] RT test: Adding test cases for RT group scheduling Xavier
2024-07-16  8:59 ` [PATCH-RT sched v3 0/2] Optimize the " Peter Zijlstra
2024-07-16  9:17   ` Xavier
2024-07-16 10:25     ` Peter Zijlstra
  -- strict thread matches above, loose matches on Subject: below --
2024-06-29 11:28 [PATCH-RT sched v2 " Xavier
2024-07-16  6:05 ` [PATCH-RT sched v3 " Xavier

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox