public inbox for linux-kernel@vger.kernel.org
 help / color / mirror / Atom feed
From: Daniel Bristot de Oliveira <bristot@kernel.org>
To: linux-kernel@vger.kernel.org, Ingo Molnar <mingo@redhat.com>,
	Peter Zijlstra <peterz@infradead.org>
Cc: Daniel Bristot de Oliveira <bristot@kernel.org>,
	Juri Lelli <juri.lelli@redhat.com>,
	Vincent Guittot <vincent.guittot@linaro.org>,
	Dietmar Eggemann <dietmar.eggemann@arm.com>,
	Steven Rostedt <rostedt@goodmis.org>,
	Ben Segall <bsegall@google.com>, Mel Gorman <mgorman@suse.de>,
	Daniel Bristot de Oliveira <bristot@redhat.com>,
	Valentin Schneider <vschneid@redhat.com>,
	Joe Mario <jmario@redhat.com>
Subject: [PATCH] sched/idle: Make idle poll dynamic per-cpu
Date: Thu, 12 Jan 2023 17:24:26 +0100	[thread overview]
Message-ID: <20230112162426.217522-1-bristot@kernel.org> (raw)

idle=poll is frequently used on ultra-low-latency systems. Examples of
such systems are high-performance trading and 5G NVRAM. The performance
gain is given by avoiding the idle driver machinery and by keeping the
CPU is always in an active state - avoiding (odd) hardware heuristics that
are out of the control of the OS.

Currently, idle=poll is an all-or-nothing static option defined at
boot time. The motivation for creating this option dynamic and per-cpu
are two:

  1) Reduce the power usage/heat by allowing only selected CPUs to
     do idle polling;
  2) Allow multi-tenant systems (e.g., Kubernetes) to enable idle
     polling only when ultra-low-latency applications are present
     on specific CPUs.

Joe Mario did some experiments with this option enabled, and the results
were significant. For example, by using dynamic idle polling on
selected CPUs, cyclictest performance is optimal (like when using
idle=poll), but cpu power consumption drops from 381 to 233 watts.

Also, limiting idle=poll to the set of CPUs that benefits from
it allows other CPUs to benefit from frequency boosts. Joe also
shows that the results can be in the order of 80nsec round trip
improvement when system-wide idle=poll was not used.

The user can enable idle polling with this command:
  # echo 1 > /sys/devices/system/cpu/cpu{CPU_ID}/idle_poll

And disable it via:
  # echo 0 > /sys/devices/system/cpu/cpu{CPU_ID}/idle_poll

By default, all CPUs have idle polling disabled (the current behavior).
A static key avoids the CPU mask check overhead when no idle polling
is enabled.

Cc: Ingo Molnar <mingo@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Juri Lelli <juri.lelli@redhat.com>
Cc: Vincent Guittot <vincent.guittot@linaro.org>
Cc: Dietmar Eggemann <dietmar.eggemann@arm.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Ben Segall <bsegall@google.com>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Daniel Bristot de Oliveira <bristot@redhat.com>
Cc: Valentin Schneider <vschneid@redhat.com>
Cc: Joe Mario <jmario@redhat.com>
Signed-off-by: Daniel Bristot de Oliveira <bristot@kernel.org>
---
 kernel/sched/idle.c | 97 +++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 93 insertions(+), 4 deletions(-)

diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c
index f26ab2675f7d..c6ef1322d549 100644
--- a/kernel/sched/idle.c
+++ b/kernel/sched/idle.c
@@ -10,6 +10,91 @@
 /* Linker adds these: start and end of __cpuidle functions */
 extern char __cpuidle_text_start[], __cpuidle_text_end[];
 
+/*
+ * per-cpu idle polling selector.
+ */
+static struct cpumask cpu_poll_mask;
+DEFINE_STATIC_KEY_FALSE(cpu_poll_enabled);
+
+/*
+ * Protects the mask/static key relation.
+ */
+DEFINE_MUTEX(cpu_poll_mutex);
+
+static ssize_t idle_poll_store(struct device *dev, struct device_attribute *attr,
+			       const char *buf, size_t count)
+{
+	int cpu = dev->id;
+	int retval, set;
+	bool val;
+
+	retval = kstrtobool(buf, &val);
+	if (retval)
+		return retval;
+
+	mutex_lock(&cpu_poll_mutex);
+
+	if (val) {
+		set = cpumask_test_and_set_cpu(cpu, &cpu_poll_mask);
+
+		/*
+		 * If the CPU was already on, do not increase the static key usage.
+		 */
+		if (!set)
+			static_branch_inc(&cpu_poll_enabled);
+	} else {
+		set = cpumask_test_and_clear_cpu(cpu, &cpu_poll_mask);
+
+		/*
+		 * If the CPU was already off, do not decrease the static key usage.
+		 */
+		if (set)
+			static_branch_dec(&cpu_poll_enabled);
+	}
+
+	mutex_unlock(&cpu_poll_mutex);
+
+	return count;
+}
+
+static ssize_t idle_poll_show(struct device *dev, struct device_attribute *attr, char *buf)
+{
+	return sprintf(buf, "%d\n", cpumask_test_cpu(dev->id, &cpu_poll_mask));
+}
+
+static DEVICE_ATTR_RW(idle_poll);
+
+static const struct attribute *idle_poll_attrs[] = {
+	&dev_attr_idle_poll.attr,
+	NULL
+};
+
+static int __init idle_poll_sysfs_init(void)
+{
+	int cpu, retval;
+
+	for_each_possible_cpu(cpu) {
+		struct device *dev = get_cpu_device(cpu);
+
+		if (!dev)
+			continue;
+		retval = sysfs_create_files(&dev->kobj, idle_poll_attrs);
+		if (retval)
+			return retval;
+	}
+
+	return 0;
+}
+device_initcall(idle_poll_sysfs_init);
+
+static int is_cpu_idle_poll(int cpu)
+{
+	if (static_branch_unlikely(&cpu_poll_enabled))
+		return cpumask_test_cpu(cpu, &cpu_poll_mask);
+
+	return 0;
+}
+
 /**
  * sched_idle_set_state - Record idle state for the current CPU.
  * @idle_state: State to record.
@@ -51,18 +136,21 @@ __setup("hlt", cpu_idle_nopoll_setup);
 
 static noinline int __cpuidle cpu_idle_poll(void)
 {
-	trace_cpu_idle(0, smp_processor_id());
+	int cpu = smp_processor_id();
+
+	trace_cpu_idle(0, cpu);
 	stop_critical_timings();
 	ct_idle_enter();
 	local_irq_enable();
 
 	while (!tif_need_resched() &&
-	       (cpu_idle_force_poll || tick_check_broadcast_expired()))
+	       (cpu_idle_force_poll || tick_check_broadcast_expired()
+		|| is_cpu_idle_poll(cpu)))
 		cpu_relax();
 
 	ct_idle_exit();
 	start_critical_timings();
-	trace_cpu_idle(PWR_EVENT_EXIT, smp_processor_id());
+	trace_cpu_idle(PWR_EVENT_EXIT, cpu);
 
 	return 1;
 }
@@ -296,7 +384,8 @@ static void do_idle(void)
 		 * broadcast device expired for us, we don't want to go deep
 		 * idle as we know that the IPI is going to arrive right away.
 		 */
-		if (cpu_idle_force_poll || tick_check_broadcast_expired()) {
+		if (cpu_idle_force_poll || tick_check_broadcast_expired()
+		    || is_cpu_idle_poll(cpu)) {
 			tick_nohz_idle_restart_tick();
 			cpu_idle_poll();
 		} else {
-- 
2.38.1


             reply	other threads:[~2023-01-12 16:27 UTC|newest]

Thread overview: 11+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2023-01-12 16:24 Daniel Bristot de Oliveira [this message]
2023-01-15  9:15 ` [PATCH] sched/idle: Make idle poll dynamic per-cpu Ingo Molnar
2023-01-17 11:20   ` Daniel Bristot de Oliveira
2023-01-16  1:43 ` Chen Yu
2023-01-16  8:53 ` Peter Zijlstra
2023-01-16  9:02   ` Ingo Molnar
2023-01-16  9:28     ` Ingo Molnar
2023-01-16  9:51       ` Vincent Guittot
2023-01-16 10:11         ` Daniel Bristot de Oliveira
2023-01-16 10:06       ` Daniel Bristot de Oliveira
2023-01-16 11:54       ` Peter Zijlstra

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20230112162426.217522-1-bristot@kernel.org \
    --to=bristot@kernel.org \
    --cc=bristot@redhat.com \
    --cc=bsegall@google.com \
    --cc=dietmar.eggemann@arm.com \
    --cc=jmario@redhat.com \
    --cc=juri.lelli@redhat.com \
    --cc=linux-kernel@vger.kernel.org \
    --cc=mgorman@suse.de \
    --cc=mingo@redhat.com \
    --cc=peterz@infradead.org \
    --cc=rostedt@goodmis.org \
    --cc=vincent.guittot@linaro.org \
    --cc=vschneid@redhat.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox