public inbox for linux-kernel@vger.kernel.org
 help / color / mirror / Atom feed
From: Nicholas Piggin <npiggin@gmail.com>
To: Frederic Weisbecker <fweisbec@gmail.com>,
	Thomas Gleixner <tglx@linutronix.de>
Cc: Nicholas Piggin <npiggin@gmail.com>,
	linux-kernel@vger.kernel.org, Michael Neuling <mikey@neuling.org>
Subject: [RFC PATCH] time/nohz: allow the boot CPU to be nohz_full
Date: Mon, 14 Jan 2019 16:47:45 +1000	[thread overview]
Message-ID: <20190114064745.27306-1-npiggin@gmail.com> (raw)

We have a supercomputer site testing nohz_full to reduce jitter with
good results, but they want CPU0 to be nohz_full. That happens to be
the boot CPU, which is disallowed by the nohz_full code.

They have existing job scheduling code which wants this. I don't know
much detail beyond that, but I hope the kernel can be made to work
with their config.

This patch has the boot CPU take over the jiffies update in the low
res timer before SMP is brought up, after which a housekeeping
(!nohz_full) CPU will take over.

It also modifies the housekeeping check code a bit to ensure at least
one !nohz CPU is in the present map so it comes up at boot, rather
than having the nohz code take the boot CPU out of the nohz mask.

This keeps jiffies incrementing on the nohz_full boot CPU before SMP
init, but I'm not sure if this covers all races and platform
considerations. Sorry, I don't know the timer code too well; I would
appreciate any help.

Thanks,
Nick
---
 kernel/sched/isolation.c  | 18 ++++++++++++------
 kernel/time/tick-common.c | 30 +++++++++++++++++++++++++-----
 kernel/time/tick-sched.c  | 14 +++++---------
 3 files changed, 42 insertions(+), 20 deletions(-)

diff --git a/kernel/sched/isolation.c b/kernel/sched/isolation.c
index e6802181900f..6f8ac38d39a0 100644
--- a/kernel/sched/isolation.c
+++ b/kernel/sched/isolation.c
@@ -65,6 +65,7 @@ void __init housekeeping_init(void)
 static int __init housekeeping_setup(char *str, enum hk_flags flags)
 {
 	cpumask_var_t non_housekeeping_mask;
+	cpumask_var_t tmp;
 	int err;
 
 	alloc_bootmem_cpumask_var(&non_housekeeping_mask);
@@ -75,25 +76,30 @@ static int __init housekeeping_setup(char *str, enum hk_flags flags)
 		return 0;
 	}
 
+	alloc_bootmem_cpumask_var(&tmp);
 	if (!housekeeping_flags) {
 		alloc_bootmem_cpumask_var(&housekeeping_mask);
 		cpumask_andnot(housekeeping_mask,
 			       cpu_possible_mask, non_housekeeping_mask);
-		if (cpumask_empty(housekeeping_mask))
+		cpumask_andnot(tmp, cpu_present_mask, non_housekeeping_mask);
+		if (cpumask_empty(tmp)) {
+			pr_warn("Housekeeping: must include one present CPU, "
+				"using boot CPU:%d\n", smp_processor_id());
 			cpumask_set_cpu(smp_processor_id(), housekeeping_mask);
+			cpumask_clear_cpu(smp_processor_id(), non_housekeeping_mask);
+		}
 	} else {
-		cpumask_var_t tmp;
-
-		alloc_bootmem_cpumask_var(&tmp);
-		cpumask_andnot(tmp, cpu_possible_mask, non_housekeeping_mask);
+		cpumask_andnot(tmp, cpu_present_mask, non_housekeeping_mask);
+		if (cpumask_empty(tmp))
+			cpumask_set_cpu(smp_processor_id(), tmp);
 		if (!cpumask_equal(tmp, housekeeping_mask)) {
 			pr_warn("Housekeeping: nohz_full= must match isolcpus=\n");
 			free_bootmem_cpumask_var(tmp);
 			free_bootmem_cpumask_var(non_housekeeping_mask);
 			return 0;
 		}
-		free_bootmem_cpumask_var(tmp);
 	}
+	free_bootmem_cpumask_var(tmp);
 
 	if ((flags & HK_FLAG_TICK) && !(housekeeping_flags & HK_FLAG_TICK)) {
 		if (IS_ENABLED(CONFIG_NO_HZ_FULL)) {
diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c
index 14de3727b18e..c971278dbe95 100644
--- a/kernel/time/tick-common.c
+++ b/kernel/time/tick-common.c
@@ -50,6 +50,9 @@ ktime_t tick_period;
  *    procedure also covers cpu hotplug.
  */
 int tick_do_timer_cpu __read_mostly = TICK_DO_TIMER_BOOT;
+#ifdef CONFIG_NO_HZ_FULL
+static int tick_do_timer_boot_cpu __read_mostly = -1;
+#endif
 
 /*
  * Debugging: see timer_list.c
@@ -78,7 +81,11 @@ int tick_is_oneshot_available(void)
  */
 static void tick_periodic(int cpu)
 {
+#ifdef CONFIG_NO_HZ_FULL
+	if (tick_do_timer_cpu == cpu || tick_do_timer_boot_cpu == cpu) {
+#else
 	if (tick_do_timer_cpu == cpu) {
+#endif
 		write_seqlock(&jiffies_lock);
 
 		/* Keep track of the next tick event */
@@ -190,12 +197,25 @@ static void tick_setup_device(struct tick_device *td,
 		 * this cpu:
 		 */
 		if (tick_do_timer_cpu == TICK_DO_TIMER_BOOT) {
-			if (!tick_nohz_full_cpu(cpu))
+			/*
+			 * The boot CPU may be nohz_full, in which case just
+			 * leave it set to TICK_DO_TIMER_BOOT for the next
+			 * CPU. tick_do_timer_boot_cpu is set to run the
+			 * tick at early boot until the housekeeping CPU
+			 * comes up.
+			 */
+			if (!tick_nohz_full_cpu(cpu)) {
 				tick_do_timer_cpu = cpu;
-			else
-				tick_do_timer_cpu = TICK_DO_TIMER_NONE;
-			tick_next_period = ktime_get();
-			tick_period = NSEC_PER_SEC / HZ;
+#ifdef CONFIG_NO_HZ_FULL
+				tick_do_timer_boot_cpu = -1;
+			} else {
+				tick_do_timer_boot_cpu = cpu;
+#endif
+			}
+			if (!tick_period) {
+				tick_period = NSEC_PER_SEC / HZ;
+				tick_next_period = ktime_get();
+			}
 		}
 
 		/*
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 69e673b88474..42f77231d0dc 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -398,8 +398,8 @@ void __init tick_nohz_full_setup(cpumask_var_t cpumask)
 static int tick_nohz_cpu_down(unsigned int cpu)
 {
 	/*
-	 * The boot CPU handles housekeeping duty (unbound timers,
-	 * workqueues, timekeeping, ...) on behalf of full dynticks
+	 * The tick_do_timer_cpu CPU handles housekeeping duty (unbound
+	 * timers, workqueues, timekeeping, ...) on behalf of full dynticks
 	 * CPUs. It must remain online when nohz full is enabled.
 	 */
 	if (tick_nohz_full_running && tick_do_timer_cpu == cpu)
@@ -428,12 +428,6 @@ void __init tick_nohz_init(void)
 
 	cpu = smp_processor_id();
 
-	if (cpumask_test_cpu(cpu, tick_nohz_full_mask)) {
-		pr_warn("NO_HZ: Clearing %d from nohz_full range for timekeeping\n",
-			cpu);
-		cpumask_clear_cpu(cpu, tick_nohz_full_mask);
-	}
-
 	for_each_cpu(cpu, tick_nohz_full_mask)
 		context_tracking_cpu_set(cpu);
 
@@ -907,8 +901,10 @@ static bool can_stop_idle_tick(int cpu, struct tick_sched *ts)
 		/*
 		 * Boot safety: make sure the timekeeping duty has been
 		 * assigned before entering dyntick-idle mode,
+		 * tick_do_timer_cpu is TICK_DO_TIMER_NONE or
+		 * TICK_DO_TIMER_BOOT
 		 */
-		if (tick_do_timer_cpu == TICK_DO_TIMER_NONE)
+		if (tick_do_timer_cpu < 0)
 			return false;
 	}
 
-- 
2.18.0


             reply	other threads:[~2019-01-14  6:48 UTC|newest]

Thread overview: 4+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2019-01-14  6:47 Nicholas Piggin [this message]
2019-01-16 17:54 ` [RFC PATCH] time/nohz: allow the boot CPU to be nohz_full Frederic Weisbecker
2019-01-23  8:25   ` Nicholas Piggin
2019-01-23 17:11     ` Frederic Weisbecker

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20190114064745.27306-1-npiggin@gmail.com \
    --to=npiggin@gmail.com \
    --cc=fweisbec@gmail.com \
    --cc=linux-kernel@vger.kernel.org \
    --cc=mikey@neuling.org \
    --cc=tglx@linutronix.de \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link.

Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox