From: David Dai <davidai@google.com>
To: "Rafael J. Wysocki" <rafael@kernel.org>,
Viresh Kumar <viresh.kumar@linaro.org>,
Rob Herring <robh+dt@kernel.org>,
Krzysztof Kozlowski <krzysztof.kozlowski+dt@linaro.org>,
Paolo Bonzini <pbonzini@redhat.com>,
Jonathan Corbet <corbet@lwn.net>, Marc Zyngier <maz@kernel.org>,
Oliver Upton <oliver.upton@linux.dev>,
James Morse <james.morse@arm.com>,
Suzuki K Poulose <suzuki.poulose@arm.com>,
Zenghui Yu <yuzenghui@huawei.com>,
Catalin Marinas <catalin.marinas@arm.com>,
Will Deacon <will@kernel.org>,
Mark Rutland <mark.rutland@arm.com>,
Lorenzo Pieralisi <lpieralisi@kernel.org>,
Sudeep Holla <sudeep.holla@arm.com>,
Ingo Molnar <mingo@redhat.com>,
Peter Zijlstra <peterz@infradead.org>,
Juri Lelli <juri.lelli@redhat.com>,
Vincent Guittot <vincent.guittot@linaro.org>,
Dietmar Eggemann <dietmar.eggemann@arm.com>,
Steven Rostedt <rostedt@goodmis.org>,
Ben Segall <bsegall@google.com>, Mel Gorman <mgorman@suse.de>,
Daniel Bristot de Oliveira <bristot@redhat.com>,
Valentin Schneider <vschneid@redhat.com>,
David Dai <davidai@google.com>
Cc: Saravana Kannan <saravanak@google.com>,
kernel-team@android.com, linux-pm@vger.kernel.org,
devicetree@vger.kernel.org, linux-kernel@vger.kernel.org,
kvm@vger.kernel.org, linux-doc@vger.kernel.org,
linux-arm-kernel@lists.infradead.org, kvmarm@lists.linux.dev
Subject: [RFC PATCH v2 1/6] sched/fair: Add util_guest for tasks
Date: Thu, 30 Mar 2023 18:43:45 -0700 [thread overview]
Message-ID: <20230331014356.1033759-2-davidai@google.com> (raw)
In-Reply-To: <20230331014356.1033759-1-davidai@google.com>
For virtualization usecases, util_est and util_avg currently tracked
on the host aren't sufficient to accurately represent the workload on
vCPU threads, which results in poor frequency selection and performance.
For example, when a large workload migrates from a busy vCPU thread to
an idle vCPU thread, it incurs additional DVFS ramp-up latencies
as util accumulates.
Introduce a new "util_guest" member as an additional PELT signal that's
independently updated by the guest. When used, it's max aggregated to
provide a boost to both task_util and task_util_est.
Updating task_util and task_util_est will ensure:
-Better task placement decisions for vCPU threads on the host
-Correctly updating util_est.ewma during dequeue
-Additive util with other threads on the same runqueue for more
accurate frequency responses
Co-developed-by: Saravana Kannan <saravanak@google.com>
Signed-off-by: Saravana Kannan <saravanak@google.com>
Signed-off-by: David Dai <davidai@google.com>
---
include/linux/sched.h | 11 +++++++++++
kernel/sched/core.c | 18 +++++++++++++++++-
kernel/sched/fair.c | 15 +++++++++++++--
3 files changed, 41 insertions(+), 3 deletions(-)
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 63d242164b1a..d8c346fcdf52 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -445,6 +445,16 @@ struct util_est {
#define UTIL_AVG_UNCHANGED 0x80000000
} __attribute__((__aligned__(sizeof(u64))));
+/*
+ * For sched_setattr_nocheck() (kernel) only
+ *
+ * Allow vCPU threads to use UTIL_GUEST as a way to hint the scheduler with more
+ * accurate utilization info. This is useful when guest kernels have some way of
+ * tracking its own runqueue's utilization.
+ *
+ */
+#define SCHED_FLAG_UTIL_GUEST 0x20000000
+
/*
* The load/runnable/util_avg accumulates an infinite geometric series
* (see __update_load_avg_cfs_rq() in kernel/sched/pelt.c).
@@ -499,6 +509,7 @@ struct sched_avg {
unsigned long load_avg;
unsigned long runnable_avg;
unsigned long util_avg;
+ unsigned long util_guest;
struct util_est util_est;
} ____cacheline_aligned;
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 0d18c3969f90..7700ef5610c1 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2024,6 +2024,16 @@ static inline void uclamp_post_fork(struct task_struct *p) { }
static inline void init_uclamp(void) { }
#endif /* CONFIG_UCLAMP_TASK */
+static void __setscheduler_task_util(struct task_struct *p,
+ const struct sched_attr *attr)
+{
+
+ if (likely(!(attr->sched_flags & SCHED_FLAG_UTIL_GUEST)))
+ return;
+
+ p->se.avg.util_guest = attr->sched_util_min;
+}
+
bool sched_task_on_rq(struct task_struct *p)
{
return task_on_rq_queued(p);
@@ -7561,7 +7571,7 @@ static int __sched_setscheduler(struct task_struct *p,
return -EINVAL;
}
- if (attr->sched_flags & ~(SCHED_FLAG_ALL | SCHED_FLAG_SUGOV))
+ if (attr->sched_flags & ~(SCHED_FLAG_ALL | SCHED_FLAG_SUGOV | SCHED_FLAG_UTIL_GUEST))
return -EINVAL;
/*
@@ -7583,6 +7593,9 @@ static int __sched_setscheduler(struct task_struct *p,
if (attr->sched_flags & SCHED_FLAG_SUGOV)
return -EINVAL;
+ if (attr->sched_flags & SCHED_FLAG_UTIL_GUEST)
+ return -EINVAL;
+
retval = security_task_setscheduler(p);
if (retval)
return retval;
@@ -7629,6 +7642,8 @@ static int __sched_setscheduler(struct task_struct *p,
goto change;
if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP)
goto change;
+ if (attr->sched_flags & SCHED_FLAG_UTIL_GUEST)
+ goto change;
p->sched_reset_on_fork = reset_on_fork;
retval = 0;
@@ -7718,6 +7733,7 @@ static int __sched_setscheduler(struct task_struct *p,
__setscheduler_prio(p, newprio);
}
__setscheduler_uclamp(p, attr);
+ __setscheduler_task_util(p, attr);
if (queued) {
/*
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 6986ea31c984..998649554344 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -4276,14 +4276,16 @@ static int newidle_balance(struct rq *this_rq, struct rq_flags *rf);
static inline unsigned long task_util(struct task_struct *p)
{
- return READ_ONCE(p->se.avg.util_avg);
+ return max(READ_ONCE(p->se.avg.util_avg),
+ READ_ONCE(p->se.avg.util_guest));
}
static inline unsigned long _task_util_est(struct task_struct *p)
{
struct util_est ue = READ_ONCE(p->se.avg.util_est);
- return max(ue.ewma, (ue.enqueued & ~UTIL_AVG_UNCHANGED));
+ return max_t(unsigned long, READ_ONCE(p->se.avg.util_guest),
+ max(ue.ewma, (ue.enqueued & ~UTIL_AVG_UNCHANGED)));
}
static inline unsigned long task_util_est(struct task_struct *p)
@@ -6242,6 +6244,15 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
*/
util_est_enqueue(&rq->cfs, p);
+ /*
+ * The normal code path for host thread enqueue doesn't take into
+ * account guest task migrations when updating cpufreq util.
+ * So, always update the cpufreq when a vCPU thread has a
+ * non-zero util_guest value.
+ */
+ if (READ_ONCE(p->se.avg.util_guest))
+ cpufreq_update_util(rq, 0);
+
/*
* If in_iowait is set, the code below may not trigger any cpufreq
* utilization updates, so do it here explicitly with the IOWAIT flag
--
2.40.0.348.gf938b09366-goog
_______________________________________________
linux-arm-kernel mailing list
linux-arm-kernel@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/linux-arm-kernel
next prev parent reply other threads:[~2023-03-31 1:45 UTC|newest]
Thread overview: 15+ messages / expand[flat|nested] mbox.gz Atom feed top
2023-03-31 1:43 [RFC PATCH v2 0/6] Improve VM CPUfreq and task placement behavior David Dai
2023-03-31 1:43 ` David Dai [this message]
2023-03-31 1:43 ` [RFC PATCH v2 2/6] kvm: arm64: Add support for get_cur_cpufreq service David Dai
2023-04-01 3:12 ` Bagas Sanjaya
2023-04-01 3:16 ` Bagas Sanjaya
2023-03-31 1:43 ` [RFC PATCH v2 3/6] kvm: arm64: Add support for util_hint service David Dai
2023-04-01 3:22 ` Bagas Sanjaya
2023-03-31 1:43 ` [RFC PATCH v2 4/6] kvm: arm64: Add support for get_freqtbl service David Dai
2023-04-01 3:28 ` Bagas Sanjaya
2023-03-31 1:43 ` [RFC PATCH v2 5/6] dt-bindings: cpufreq: add bindings for virtual kvm cpufreq David Dai
2023-03-31 8:55 ` Krzysztof Kozlowski
2023-03-31 12:42 ` Rob Herring
2023-03-31 12:46 ` Rob Herring
2023-04-05 22:07 ` Saravana Kannan
2023-03-31 1:43 ` [RFC PATCH v2 6/6] cpufreq: add kvm-cpufreq driver David Dai
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20230331014356.1033759-2-davidai@google.com \
--to=davidai@google.com \
--cc=bristot@redhat.com \
--cc=bsegall@google.com \
--cc=catalin.marinas@arm.com \
--cc=corbet@lwn.net \
--cc=devicetree@vger.kernel.org \
--cc=dietmar.eggemann@arm.com \
--cc=james.morse@arm.com \
--cc=juri.lelli@redhat.com \
--cc=kernel-team@android.com \
--cc=krzysztof.kozlowski+dt@linaro.org \
--cc=kvm@vger.kernel.org \
--cc=kvmarm@lists.linux.dev \
--cc=linux-arm-kernel@lists.infradead.org \
--cc=linux-doc@vger.kernel.org \
--cc=linux-kernel@vger.kernel.org \
--cc=linux-pm@vger.kernel.org \
--cc=lpieralisi@kernel.org \
--cc=mark.rutland@arm.com \
--cc=maz@kernel.org \
--cc=mgorman@suse.de \
--cc=mingo@redhat.com \
--cc=oliver.upton@linux.dev \
--cc=pbonzini@redhat.com \
--cc=peterz@infradead.org \
--cc=rafael@kernel.org \
--cc=robh+dt@kernel.org \
--cc=rostedt@goodmis.org \
--cc=saravanak@google.com \
--cc=sudeep.holla@arm.com \
--cc=suzuki.poulose@arm.com \
--cc=vincent.guittot@linaro.org \
--cc=viresh.kumar@linaro.org \
--cc=vschneid@redhat.com \
--cc=will@kernel.org \
--cc=yuzenghui@huawei.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).