All of lore.kernel.org
 help / color / mirror / Atom feed
From: Tianchen Ding <dtcccc@linux.alibaba.com>
To: linux-kernel@vger.kernel.org
Cc: Ingo Molnar <mingo@redhat.com>,
	Peter Zijlstra <peterz@infradead.org>,
	Juri Lelli <juri.lelli@redhat.com>,
	Vincent Guittot <vincent.guittot@linaro.org>,
	Dietmar Eggemann <dietmar.eggemann@arm.com>,
	Steven Rostedt <rostedt@goodmis.org>,
	Ben Segall <bsegall@google.com>, Mel Gorman <mgorman@suse.de>,
	Valentin Schneider <vschneid@redhat.com>,
	Tejun Heo <tj@kernel.org>
Subject: [RFC PATCH 2/2] sched/eevdf: Introduce a cgroup interface for slice
Date: Mon, 28 Oct 2024 14:33:13 +0800	[thread overview]
Message-ID: <20241028063313.8039-3-dtcccc@linux.alibaba.com> (raw)
In-Reply-To: <20241028063313.8039-1-dtcccc@linux.alibaba.com>

Introduce "cpu.fair_slice" for cgroup v2 and "cpu.fair_slice_us" for v1,
following the naming convention of each hierarchy. The unit is microseconds
in both cases.

A cgroup with a shorter slice can preempt others more easily. This could be
useful in container scenarios.

By default, cpu.fair_slice is 0, which means the slice of the group's se is
taken from the min_slice of its cfs_rq. If cpu.fair_slice is set to a
non-zero value, that value (clamped to the range 0.1ms - 100ms) overrides
se->slice instead.

Signed-off-by: Tianchen Ding <dtcccc@linux.alibaba.com>
---
CC Tejun: do we need this slice interface for sched_ext as well, and could it
be reused there?
---
 kernel/sched/core.c  | 34 ++++++++++++++++++++++++++++++
 kernel/sched/fair.c  | 49 +++++++++++++++++++++++++++++++++++++++-----
 kernel/sched/sched.h |  3 +++
 3 files changed, 81 insertions(+), 5 deletions(-)

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 114adac5a9c8..8d57b7d88d18 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -9690,6 +9690,24 @@ static int cpu_idle_write_s64(struct cgroup_subsys_state *css,
 }
 #endif
 
+#ifdef CONFIG_FAIR_GROUP_SCHED
+static u64 cpu_fair_slice_read_u64(struct cgroup_subsys_state *css,
+				   struct cftype *cft)
+{
+	u64 fair_slice_us = css_tg(css)->slice;
+
+	do_div(fair_slice_us, NSEC_PER_USEC);
+
+	return fair_slice_us;
+}
+
+static int cpu_fair_slice_write_u64(struct cgroup_subsys_state *css,
+				    struct cftype *cftype, u64 fair_slice_us)
+{
+	return sched_group_set_slice(css_tg(css), fair_slice_us);
+}
+#endif
+
 static struct cftype cpu_legacy_files[] = {
 #ifdef CONFIG_GROUP_SCHED_WEIGHT
 	{
@@ -9703,6 +9721,14 @@ static struct cftype cpu_legacy_files[] = {
 		.write_s64 = cpu_idle_write_s64,
 	},
 #endif
+#ifdef CONFIG_FAIR_GROUP_SCHED
+	{
+		.name = "fair_slice_us",
+		.flags = CFTYPE_NOT_ON_ROOT,
+		.read_u64 = cpu_fair_slice_read_u64,
+		.write_u64 = cpu_fair_slice_write_u64,
+	},
+#endif
 #ifdef CONFIG_CFS_BANDWIDTH
 	{
 		.name = "cfs_quota_us",
@@ -9943,6 +9969,14 @@ static struct cftype cpu_files[] = {
 		.write_s64 = cpu_idle_write_s64,
 	},
 #endif
+#ifdef CONFIG_FAIR_GROUP_SCHED
+	{
+		.name = "fair_slice",
+		.flags = CFTYPE_NOT_ON_ROOT,
+		.read_u64 = cpu_fair_slice_read_u64,
+		.write_u64 = cpu_fair_slice_write_u64,
+	},
+#endif
 #ifdef CONFIG_CFS_BANDWIDTH
 	{
 		.name = "max",
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 7dc90a6e6e26..694dc0655719 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -797,6 +797,11 @@ static inline u64 cfs_rq_min_slice(struct cfs_rq *cfs_rq)
 	return min_slice;
 }
 
+static inline u64 cfs_rq_slice(struct cfs_rq *cfs_rq)
+{
+	return cfs_rq->tg->slice ? : cfs_rq_min_slice(cfs_rq);
+}
+
 static inline bool __entity_less(struct rb_node *a, const struct rb_node *b)
 {
 	return entity_before(__node_2_se(a), __node_2_se(b));
@@ -6994,7 +6999,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 			se->custom_slice = 1;
 		}
 		enqueue_entity(cfs_rq, se, flags);
-		slice = cfs_rq_min_slice(cfs_rq);
+		slice = cfs_rq_slice(cfs_rq);
 
 		cfs_rq->h_nr_running++;
 		cfs_rq->idle_h_nr_running += idle_h_nr_running;
@@ -7018,7 +7023,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 
 		se->slice = slice;
 		min_vruntime_cb_propagate(&se->run_node, NULL);
-		slice = cfs_rq_min_slice(cfs_rq);
+		slice = cfs_rq_slice(cfs_rq);
 
 		cfs_rq->h_nr_running++;
 		cfs_rq->idle_h_nr_running += idle_h_nr_running;
@@ -7093,7 +7098,7 @@ static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags)
 		idle_h_nr_running = task_has_idle_policy(p);
 	} else {
 		cfs_rq = group_cfs_rq(se);
-		slice = cfs_rq_min_slice(cfs_rq);
+		slice = cfs_rq_slice(cfs_rq);
 	}
 
 	for_each_sched_entity(se) {
@@ -7118,7 +7123,7 @@ static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags)
 
 		/* Don't dequeue parent if it has other entities besides us */
 		if (cfs_rq->load.weight) {
-			slice = cfs_rq_min_slice(cfs_rq);
+			slice = cfs_rq_slice(cfs_rq);
 
 			/* Avoid re-evaluating load for this entity: */
 			se = parent_entity(se);
@@ -7143,7 +7148,7 @@ static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags)
 
 		se->slice = slice;
 		min_vruntime_cb_propagate(&se->run_node, NULL);
-		slice = cfs_rq_min_slice(cfs_rq);
+		slice = cfs_rq_slice(cfs_rq);
 
 		cfs_rq->h_nr_running -= h_nr_running;
 		cfs_rq->idle_h_nr_running -= idle_h_nr_running;
@@ -13535,6 +13540,40 @@ int sched_group_set_idle(struct task_group *tg, long idle)
 	return 0;
 }
 
+int sched_group_set_slice(struct task_group *tg, u64 fair_slice_us)
+{
+	u64 slice = 0;
+	int i;
+
+	if (fair_slice_us > U64_MAX / NSEC_PER_USEC)
+		return -EINVAL;
+
+	if (fair_slice_us) {
+		slice = clamp_t(u64, fair_slice_us * NSEC_PER_USEC,
+				NSEC_PER_MSEC / 10,	/* HZ = 1000 * 10 */
+				NSEC_PER_MSEC * 100);	/* HZ = 100 / 10 */
+	}
+
+	if (slice == tg->slice)
+		return 0;
+
+	tg->slice = slice;
+
+	for_each_possible_cpu(i) {
+		struct sched_entity *se = tg->se[i];
+		struct rq *rq = cpu_rq(i);
+
+		guard(rq_lock_irqsave)(rq);
+		for_each_sched_entity(se) {
+			se->custom_slice = 1;
+			se->slice = cfs_rq_slice(group_cfs_rq(se));
+			min_vruntime_cb_propagate(&se->run_node, NULL);
+		}
+	}
+
+	return 0;
+}
+
 #endif /* CONFIG_FAIR_GROUP_SCHED */
 
 
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 7b139016cbd9..e02f8715bc04 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -443,6 +443,7 @@ struct task_group {
 	/* runqueue "owned" by this group on each CPU */
 	struct cfs_rq		**cfs_rq;
 	unsigned long		shares;
+	u64			slice;
 #ifdef	CONFIG_SMP
 	/*
 	 * load_avg can be heavily contended at clock tick time, so put
@@ -574,6 +575,8 @@ extern int sched_group_set_shares(struct task_group *tg, unsigned long shares);
 
 extern int sched_group_set_idle(struct task_group *tg, long idle);
 
+extern int sched_group_set_slice(struct task_group *tg, u64 fair_slice_us);
+
 #ifdef CONFIG_SMP
 extern void set_task_rq_fair(struct sched_entity *se,
 			     struct cfs_rq *prev, struct cfs_rq *next);
-- 
2.39.3


  parent reply	other threads:[~2024-10-28  6:33 UTC|newest]

Thread overview: 21+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2024-10-28  6:33 [RFC PATCH 0/2] sched/eevdf: Introduce a cgroup interface for slice Tianchen Ding
2024-10-28  6:33 ` [PATCH] sched/eevdf: Force propagating min_slice of cfs_rq when a task changing slice Tianchen Ding
2024-10-30  8:18   ` kernel test robot
2024-10-30  9:11     ` Tianchen Ding
2024-10-31  9:48   ` [PATCH v2] " Tianchen Ding
2024-11-12  3:25     ` Tianchen Ding
2024-11-13 11:50       ` 回复: " 解 咏梅
2024-11-14  2:45         ` Tianchen Ding
2024-11-14  6:06           ` 回复: " 解 咏梅
2024-11-14  6:36             ` Tianchen Ding
     [not found]               ` <ME0P300MB041447EBB0A17918745695898E5B2@ME0P300MB0414.AUSP300.PROD.OUTLOOK.COM>
2024-11-14  7:47                 ` Tianchen Ding
2024-11-14 13:44                   ` 回复: " 解 咏梅
2024-10-28  6:33 ` Tianchen Ding [this message]
2024-10-28 17:37   ` [RFC PATCH 2/2] sched/eevdf: Introduce a cgroup interface for slice Tejun Heo
2024-10-29  2:07     ` Tianchen Ding
2024-10-29  6:18       ` Tejun Heo
2024-10-29  6:49         ` Tianchen Ding
2024-10-29 20:39           ` Tejun Heo
     [not found]   ` <ME0P300MB0414F63E895B2F343EE740258E4B2@ME0P300MB0414.AUSP300.PROD.OUTLOOK.COM>
2024-10-29  4:26     ` 回复: " 解 咏梅
2024-10-30 11:00   ` Peter Zijlstra
2024-10-30 14:54     ` Tianchen Ding

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20241028063313.8039-3-dtcccc@linux.alibaba.com \
    --to=dtcccc@linux.alibaba.com \
    --cc=bsegall@google.com \
    --cc=dietmar.eggemann@arm.com \
    --cc=juri.lelli@redhat.com \
    --cc=linux-kernel@vger.kernel.org \
    --cc=mgorman@suse.de \
    --cc=mingo@redhat.com \
    --cc=peterz@infradead.org \
    --cc=rostedt@goodmis.org \
    --cc=tj@kernel.org \
    --cc=vincent.guittot@linaro.org \
    --cc=vschneid@redhat.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.