* [RFC PATCH] PM: Optionally block user fork during freeze to improve performance
@ 2025-06-06 6:25 Zihuan Zhang
2025-06-06 7:20 ` David Hildenbrand
2025-06-06 8:22 ` Peter Zijlstra
0 siblings, 2 replies; 16+ messages in thread
From: Zihuan Zhang @ 2025-06-06 6:25 UTC (permalink / raw)
To: rafael, len.brown, pavel, kees, mingo, peterz, juri.lelli,
vincent.guittot, dietmar.eggemann, rostedt, bsegall, mgorman,
vschneid, akpm, david, lorenzo.stoakes, Liam.Howlett, vbabka,
rppt, surenb, mhocko
Cc: linux-pm, linux-kernel, linux-mm, Zihuan Zhang
Currently, the freezer traverses all tasks to freeze them during
system suspend or hibernation. If a user process forks during this
window, the new child may escape freezing and require a second
traversal or retry, adding non-trivial overhead.
This patch introduces a CONFIG_PM_DISABLE_USER_FORK_DURING_FREEZE
option. When enabled, it prevents user processes from creating new
processes (via fork/clone) during the freezing period. This guarantees
a stable task list and avoids re-traversing the process list due to
late-created user tasks, thereby improving performance.
The restriction is only active during the window when the system is
freezing user tasks. Once all tasks are frozen, or if the system aborts
the suspend/hibernate process, the restriction is lifted.
No kernel threads are affected, and kernel_create_* functions remain
unrestricted.
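With the patch applied, the behaviour is off by default and can be toggled
at runtime through the sysfs knob the patch adds (as created by
power_attr(strict_fork)):

	echo 1 > /sys/power/strict_fork    # block user fork while freezing
	echo 0 > /sys/power/strict_fork    # restore normal behaviour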
Signed-off-by: Zihuan Zhang <zhangzihuan@kylinos.cn>
---
include/linux/suspend.h | 8 ++++++++
kernel/fork.c | 6 ++++++
kernel/power/Kconfig | 10 ++++++++++
kernel/power/main.c | 44 +++++++++++++++++++++++++++++++++++++++++
kernel/power/power.h | 4 ++++
kernel/power/process.c | 7 +++++++
6 files changed, 79 insertions(+)
diff --git a/include/linux/suspend.h b/include/linux/suspend.h
index b1c76c8f2c82..2dd8b3eb50f0 100644
--- a/include/linux/suspend.h
+++ b/include/linux/suspend.h
@@ -591,4 +591,12 @@ enum suspend_stat_step {
void dpm_save_failed_dev(const char *name);
void dpm_save_failed_step(enum suspend_stat_step step);
+#ifdef CONFIG_PM_DISABLE_USER_FORK_DURING_FREEZE
+extern bool pm_block_user_fork;
+bool pm_should_block_fork(void);
+bool pm_freeze_process_in_progress(void);
+#else
+static inline bool pm_should_block_fork(void) { return false; };
+static inline bool pm_freeze_process_in_progress(void) { return false; };
+#endif /* CONFIG_PM_DISABLE_USER_FORK_DURING_FREEZE */
#endif /* _LINUX_SUSPEND_H */
diff --git a/kernel/fork.c b/kernel/fork.c
index 1ee8eb11f38b..b0bd0206b644 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -105,6 +105,7 @@
#include <uapi/linux/pidfd.h>
#include <linux/pidfs.h>
#include <linux/tick.h>
+#include <linux/suspend.h>
#include <asm/pgalloc.h>
#include <linux/uaccess.h>
@@ -2596,6 +2597,11 @@ pid_t kernel_clone(struct kernel_clone_args *args)
trace = 0;
}
+#ifdef CONFIG_PM_DISABLE_USER_FORK_DURING_FREEZE
+ if (pm_should_block_fork() && !(current->flags & PF_KTHREAD))
+ return -EBUSY;
+#endif
+
p = copy_process(NULL, trace, NUMA_NO_NODE, args);
add_latent_entropy();
diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig
index 54a623680019..d3d4d23b8f04 100644
--- a/kernel/power/Kconfig
+++ b/kernel/power/Kconfig
@@ -375,6 +375,16 @@ config PM_GENERIC_DOMAINS_OF
def_bool y
depends on PM_GENERIC_DOMAINS && OF
+config PM_DISABLE_USER_FORK_DURING_FREEZE
+ bool "Disable user fork during process freeze"
+ depends on PM
+ default n
+ help
+ If enabled, user space processes will be forbidden from creating
+ new tasks (via fork/clone) during the process freezing stage of
+ system suspend/hibernate.
+ This can avoid process list races and reduce retries during suspend.
+
config CPU_PM
bool
diff --git a/kernel/power/main.c b/kernel/power/main.c
index 3d484630505a..99f5689dc8ac 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -994,6 +994,47 @@ static ssize_t freeze_filesystems_store(struct kobject *kobj,
power_attr(freeze_filesystems);
#endif /* CONFIG_SUSPEND || CONFIG_HIBERNATION */
+#ifdef CONFIG_PM_DISABLE_USER_FORK_DURING_FREEZE
+bool strict_fork_enabled;
+bool pm_block_user_fork;
+
+bool pm_freeze_process_in_progress(void)
+{
+ return pm_block_user_fork;
+}
+
+bool pm_should_block_fork(void)
+{
+ return strict_fork_enabled && pm_freeze_process_in_progress();
+}
+EXPORT_SYMBOL_GPL(pm_should_block_fork);
+
+static ssize_t strict_fork_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ return sysfs_emit(buf, "%d\n", strict_fork_enabled);
+}
+
+static ssize_t strict_fork_store(struct kobject *kobj,
+ struct kobj_attribute *attr,
+ const char *buf, size_t n)
+{
+ unsigned long val;
+
+ if (kstrtoul(buf, 10, &val))
+ return -EINVAL;
+
+ if (val > 1)
+ return -EINVAL;
+
+ strict_fork_enabled = !!val;
+ return n;
+}
+
+power_attr(strict_fork);
+
+#endif /* CONFIG_PM_DISABLE_USER_FORK_DURING_FREEZE */
+
static struct attribute * g[] = {
&state_attr.attr,
#ifdef CONFIG_PM_TRACE
@@ -1026,6 +1067,9 @@ static struct attribute * g[] = {
#endif
#if defined(CONFIG_SUSPEND) || defined(CONFIG_HIBERNATION)
&freeze_filesystems_attr.attr,
+#endif
+#ifdef CONFIG_PM_DISABLE_USER_FORK_DURING_FREEZE
+ &strict_fork_attr.attr,
#endif
NULL,
};
diff --git a/kernel/power/power.h b/kernel/power/power.h
index cb1d71562002..45a52d7b899d 100644
--- a/kernel/power/power.h
+++ b/kernel/power/power.h
@@ -22,6 +22,10 @@ struct swsusp_info {
extern bool filesystem_freeze_enabled;
#endif
+#ifdef CONFIG_PM_DISABLE_USER_FORK_DURING_FREEZE
+extern bool strict_fork_enabled;
+#endif
+
#ifdef CONFIG_HIBERNATION
/* kernel/power/snapshot.c */
extern void __init hibernate_reserved_size_init(void);
diff --git a/kernel/power/process.c b/kernel/power/process.c
index dc0dfc349f22..a6f7ba2d283d 100644
--- a/kernel/power/process.c
+++ b/kernel/power/process.c
@@ -134,7 +134,14 @@ int freeze_processes(void)
pm_wakeup_clear(0);
pm_freezing = true;
+
+#ifdef CONFIG_PM_DISABLE_USER_FORK_DURING_FREEZE
+ pm_block_user_fork = true;
+#endif
error = try_to_freeze_tasks(true);
+#ifdef CONFIG_PM_DISABLE_USER_FORK_DURING_FREEZE
+ pm_block_user_fork = false;
+#endif
if (!error)
__usermodehelper_set_disable_depth(UMH_DISABLED);
--
2.25.1
* Re: [RFC PATCH] PM: Optionally block user fork during freeze to improve performance
2025-06-06 6:25 [RFC PATCH] PM: Optionally block user fork during freeze to improve performance Zihuan Zhang
@ 2025-06-06 7:20 ` David Hildenbrand
2025-06-08 7:22 ` zhangzihuan
2025-06-06 8:22 ` Peter Zijlstra
1 sibling, 1 reply; 16+ messages in thread
From: David Hildenbrand @ 2025-06-06 7:20 UTC (permalink / raw)
To: Zihuan Zhang, rafael, len.brown, pavel, kees, mingo, peterz,
juri.lelli, vincent.guittot, dietmar.eggemann, rostedt, bsegall,
mgorman, vschneid, akpm, lorenzo.stoakes, Liam.Howlett, vbabka,
rppt, surenb, mhocko
Cc: linux-pm, linux-kernel, linux-mm
Hi,
On 06.06.25 08:25, Zihuan Zhang wrote:
> Currently, the freezer traverses all tasks to freeze them during
> system suspend or hibernation. If a user process forks during this
> window, the new child may escape freezing and require a second
> traversal or retry, adding non-trivial overhead.
>
> This patch introduces a CONFIG_PM_DISABLE_USER_FORK_DURING_FREEZE
Not sure if a Kconfig is really the right choice here ...
> option. When enabled, it prevents user processes from creating new
> processes (via fork/clone) during the freezing period. This guarantees
> a stable task list and avoids re-traversing the process list due to
> late-created user tasks, thereby improving performance.
Any performance numbers to back your claims?
>
> The restriction is only active during the window when the system is
> freezing user tasks. Once all tasks are frozen, or if the system aborts
> the suspend/hibernate process, the restriction is lifted.
> No kernel threads are affected, and kernel_create_* functions remain
> unrestricted.
>
> Signed-off-by: Zihuan Zhang <zhangzihuan@kylinos.cn>
> ---
> include/linux/suspend.h | 8 ++++++++
> kernel/fork.c | 6 ++++++
> kernel/power/Kconfig | 10 ++++++++++
> kernel/power/main.c | 44 +++++++++++++++++++++++++++++++++++++++++
> kernel/power/power.h | 4 ++++
> kernel/power/process.c | 7 +++++++
> 6 files changed, 79 insertions(+)
>
> diff --git a/include/linux/suspend.h b/include/linux/suspend.h
> index b1c76c8f2c82..2dd8b3eb50f0 100644
> --- a/include/linux/suspend.h
> +++ b/include/linux/suspend.h
> @@ -591,4 +591,12 @@ enum suspend_stat_step {
> void dpm_save_failed_dev(const char *name);
> void dpm_save_failed_step(enum suspend_stat_step step);
>
> +#ifdef CONFIG_PM_DISABLE_USER_FORK_DURING_FREEZE
> +extern bool pm_block_user_fork;
> +bool pm_should_block_fork(void);
> +bool pm_freeze_process_in_progress(void);
> +#else
> +static inline bool pm_should_block_fork(void) { return false; };
> +static inline bool pm_freeze_process_in_progress(void) { return false; };
> +#endif /* CONFIG_PM_DISABLE_USER_FORK_DURING_FREEZE */
> #endif /* _LINUX_SUSPEND_H */
> diff --git a/kernel/fork.c b/kernel/fork.c
> index 1ee8eb11f38b..b0bd0206b644 100644
> --- a/kernel/fork.c
> +++ b/kernel/fork.c
> @@ -105,6 +105,7 @@
> #include <uapi/linux/pidfd.h>
> #include <linux/pidfs.h>
> #include <linux/tick.h>
> +#include <linux/suspend.h>
>
> #include <asm/pgalloc.h>
> #include <linux/uaccess.h>
> @@ -2596,6 +2597,11 @@ pid_t kernel_clone(struct kernel_clone_args *args)
> trace = 0;
> }
>
> +#ifdef CONFIG_PM_DISABLE_USER_FORK_DURING_FREEZE
> + if (pm_should_block_fork() && !(current->flags & PF_KTHREAD))
> + return -EBUSY;
> +#endif
fork() is not documented to return EBUSY and for clone3() it's
documented to only happen in specific cases.
So user space is not prepared for that.
--
Cheers,
David / dhildenb
* Re: [RFC PATCH] PM: Optionally block user fork during freeze to improve performance
2025-06-06 6:25 [RFC PATCH] PM: Optionally block user fork during freeze to improve performance Zihuan Zhang
2025-06-06 7:20 ` David Hildenbrand
@ 2025-06-06 8:22 ` Peter Zijlstra
2025-06-09 4:05 ` zhangzihuan
1 sibling, 1 reply; 16+ messages in thread
From: Peter Zijlstra @ 2025-06-06 8:22 UTC (permalink / raw)
To: Zihuan Zhang
Cc: rafael, len.brown, pavel, kees, mingo, juri.lelli,
vincent.guittot, dietmar.eggemann, rostedt, bsegall, mgorman,
vschneid, akpm, david, lorenzo.stoakes, Liam.Howlett, vbabka,
rppt, surenb, mhocko, linux-pm, linux-kernel, linux-mm
On Fri, Jun 06, 2025 at 02:25:02PM +0800, Zihuan Zhang wrote:
> @@ -2596,6 +2597,11 @@ pid_t kernel_clone(struct kernel_clone_args *args)
> trace = 0;
> }
>
> +#ifdef CONFIG_PM_DISABLE_USER_FORK_DURING_FREEZE
> + if (pm_should_block_fork() && !(current->flags & PF_KTHREAD))
> + return -EBUSY;
> +#endif
> +
> p = copy_process(NULL, trace, NUMA_NO_NODE, args);
> add_latent_entropy();
This isn't blocking fork(), this is failing fork(). Huge difference.
Also problematic, because -EBUSY is not a recognised return value of
fork(). As such, no existing software will adequately handle it.
* Re: [RFC PATCH] PM: Optionally block user fork during freeze to improve performance
2025-06-06 7:20 ` David Hildenbrand
@ 2025-06-08 7:22 ` zhangzihuan
2025-06-08 15:50 ` Mateusz Guzik
0 siblings, 1 reply; 16+ messages in thread
From: zhangzihuan @ 2025-06-08 7:22 UTC (permalink / raw)
To: David Hildenbrand, rafael, len.brown, pavel, kees, mingo, peterz,
juri.lelli, vincent.guittot, dietmar.eggemann, rostedt, bsegall,
mgorman, vschneid, akpm, lorenzo.stoakes, Liam.Howlett, vbabka,
rppt, surenb, mhocko
Cc: linux-pm, linux-kernel, linux-mm
Hi David,
Thanks for your feedback!
On 2025/6/6 15:20, David Hildenbrand wrote:
> Hi,
>
> On 06.06.25 08:25, Zihuan Zhang wrote:
>> Currently, the freezer traverses all tasks to freeze them during
>> system suspend or hibernation. If a user process forks during this
>> window, the new child may escape freezing and require a second
>> traversal or retry, adding non-trivial overhead.
>>
>> This patch introduces a CONFIG_PM_DISABLE_USER_FORK_DURING_FREEZE
>
> Not sure if a Kconfig is really the right choice here ...
>
I understand your concern. My initial thinking was to provide an opt-in
configuration so that platforms sensitive to resume performance (or
under constrained suspend time budgets) can selectively enable this
behavior.
However, I agree that a runtime mechanism or a default-on behavior gated
by suspend state might be cleaner. I'm happy to rework it in that
direction — e.g., based on pm_freezing or a similar runtime flag.
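Roughly what I have in mind (an untested sketch that drops the Kconfig and
keys the check off the existing pm_freezing flag plus the sysfs knob from
this patch):

	/* untested sketch: runtime gate instead of a Kconfig option */
	bool pm_should_block_fork(void)
	{
		/* strict_fork_enabled is the sysfs knob added by this patch */
		return strict_fork_enabled && pm_freezing;
	}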
>> option. When enabled, it prevents user processes from creating new
>> processes (via fork/clone) during the freezing period. This guarantees
>> a stable task list and avoids re-traversing the process list due to
>> late-created user tasks, thereby improving performance.
>
> Any performance numbers to back your claims?
>
We’ve completed the performance testing. To simulate a process escape
scenario, we created a test environment where a large number of fork
operations are triggered right before the freeze phase begins.
A few details worth mentioning:
• We avoided creating too many processes at once to prevent
resource exhaustion.
• To increase the likelihood of hitting the freeze window
precisely, we skipped past the filesystem freezing phase in the simulation.
• We also added a small delay to each process, ensuring they
don’t all complete their fork operations before the system enters
suspend/hibernate.
• Before starting the tests, we also added a debug print in
try_to_freeze_tasks() to log the remaining todo count and the number of
freeze retry attempts.
--- begin test code ---
#define _GNU_SOURCE
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <sys/wait.h>
#include <sys/types.h>
#include <errno.h>
#include <string.h>
#include <time.h>

#define TOTAL_FORKS 1000        // total number
#define BATCH_SIZE 10           // Number of forks per batch
#define FORK_INTERVAL_US 300    // Each fork interval (microseconds)
#define CHILD_LIFETIME_SEC 60   // Subprocess runtime (seconds)

void usleep_precise(int usec) {
    struct timespec ts;
    ts.tv_sec = usec / 1000000;
    ts.tv_nsec = (usec % 1000000) * 1000;
    nanosleep(&ts, NULL);
}

void random_delay_ms(int min_ms, int max_ms) {
    int delay = min_ms + rand() % (max_ms - min_ms + 1);
    usleep_precise(delay * 1000);
}

void random_delay_us(int min_us, int max_us) {
    int delay = min_us + rand() % (max_us - min_us + 1);
    usleep_precise(delay);
}

int main() {
    int count = 0;
    int batch = 0;

    printf("Starting enhanced fork storm test...\n");
    printf(" Total forks: %d\n", TOTAL_FORKS);
    printf(" Batch size: %d\n", BATCH_SIZE);
    printf(" Fork interval: %d us\n", FORK_INTERVAL_US);
    printf(" Child lifetime: %d sec\n\n", CHILD_LIFETIME_SEC);

    random_delay_ms(2, 10); // Skip Filesystem freeze

    while (count < TOTAL_FORKS) {
        printf("Starting batch %d...\n", ++batch);
        for (int i = 0; i < BATCH_SIZE && count < TOTAL_FORKS; i++, count++) {
            pid_t pid = fork();
            if (pid == 0) {
                printf("Child #%d (pid=%d) started\n", count, getpid());
                // sleep(CHILD_LIFETIME_SEC);
                pause();
                exit(0);
            } else if (pid < 0) {
                fprintf(stderr, "fork failed at %d: %s\n", count, strerror(errno));
                exit(1);
            }
        }
        usleep_precise(50);
        printf("Batch %d completed. Total forked so far: %d\n", batch, count);
    }

    printf("All %d children created. Parent process sleeping...\n", TOTAL_FORKS);
    pause();
    return 0;
}
--- end test code ---
Then compile the code and run the test script.
gcc -o slow_fork slow_fork.c
--- begin test code ---
#!/bin/bash

LOOPS=20
DELAY_BETWEEN_RUNS=1
NUM_FORKS_PER_ROUND=10
FORK_PIDS=()

echo freezer > /sys/power/pm_test
echo 3 > /sys/module/suspend/parameters/pm_test_delay

for ((i=1; i<=LOOPS; i++)); do
    echo "===== Test round $i/$LOOPS ====="
    FORK_PIDS=()

    for ((j=1; j<=NUM_FORKS_PER_ROUND; j++)); do
        ./slow_fork &
        FORK_PIDS+=($!)
        echo " Launched slow_fork #$j (pid=${FORK_PIDS[-1]})"
    done

    echo mem > /sys/power/state

    for pid in "${FORK_PIDS[@]}"; do
        kill "$pid" 2>/dev/null
    done
    for pid in "${FORK_PIDS[@]}"; do
        wait "$pid" 2>/dev/null
    done

    echo "Round $i complete. Waiting ${DELAY_BETWEEN_RUNS}s..."
    sleep $DELAY_BETWEEN_RUNS
done

pkill slow_fork
echo "==== All $LOOPS rounds complete ===="
--- end test code ---
The result looks like this:
dmesg | grep -E 'elap|Files|retry'
[ 585.255784] Filesystems sync: 0.010 seconds
[ 585.261620] Freezing user space processes completed (elapsed 0.005
seconds)
[ 585.263530] Freezing remaining freezable tasks completed (elapsed
0.001 seconds)
[ 589.323691] Filesystems sync: 0.012 seconds
[ 589.336983] Freeing user space processes todo:0 retry:2
[ 589.336996] Freezing user space processes completed (elapsed 0.013
seconds)
[ 589.342628] Freezing remaining freezable tasks completed (elapsed
0.005 seconds)
[ 593.424317] Filesystems sync: 0.011 seconds
[ 593.446210] Freeing user space processes todo:0 retry:2
[ 593.446227] Freezing user space processes completed (elapsed 0.021
seconds)
[ 593.454303] Freezing remaining freezable tasks completed (elapsed
0.008 seconds)
[ 597.528491] Filesystems sync: 0.012 seconds
[ 597.561179] Freeing user space processes todo:0 retry:2
[ 597.561200] Freezing user space processes completed (elapsed 0.032
seconds)
[ 597.570157] Freezing remaining freezable tasks completed (elapsed
0.008 seconds)
[ 601.645391] Filesystems sync: 0.010 seconds
[ 601.682653] Freeing user space processes todo:0 retry:2
[ 601.682671] Freezing user space processes completed (elapsed 0.037
seconds)
[ 601.694401] Freezing remaining freezable tasks completed (elapsed
0.011 seconds)
[ 605.789844] Filesystems sync: 0.011 seconds
[ 605.830030] Freezing user space processes completed (elapsed 0.039
seconds)
[ 605.843602] Freezing remaining freezable tasks completed (elapsed
0.013 seconds)
[ 609.942143] Filesystems sync: 0.017 seconds
[ 609.997859] Freeing user space processes todo:0 retry:2
[ 609.997875] Freezing user space processes completed (elapsed 0.055
seconds)
[ 610.016413] Freezing remaining freezable tasks completed (elapsed
0.018 seconds)
[ 614.123700] Filesystems sync: 0.016 seconds
[ 614.187743] Freeing user space processes todo:0 retry:2
[ 614.187764] Freezing user space processes completed (elapsed 0.063
seconds)
[ 614.205004] Freezing remaining freezable tasks completed (elapsed
0.017 seconds)
[ 618.323268] Filesystems sync: 0.013 seconds
[ 618.393868] Freeing user space processes todo:0 retry:2
[ 618.393886] Freezing user space processes completed (elapsed 0.070
seconds)
[ 618.413420] Freezing remaining freezable tasks completed (elapsed
0.019 seconds)
[ 622.584589] Filesystems sync: 0.009 seconds
[ 622.676274] Freeing user space processes todo:0 retry:2
[ 622.676294] Freezing user space processes completed (elapsed 0.091
seconds)
[ 622.702762] Freezing remaining freezable tasks completed (elapsed
0.026 seconds)
[ 626.836610] Filesystems sync: 0.009 seconds
[ 626.935583] Freeing user space processes todo:0 retry:2
[ 626.935603] Freezing user space processes completed (elapsed 0.098
seconds)
[ 626.966460] Freezing remaining freezable tasks completed (elapsed
0.030 seconds)
[ 631.131669] Filesystems sync: 0.010 seconds
[ 631.249412] Freeing user space processes todo:0 retry:2
[ 631.249432] Freezing user space processes completed (elapsed 0.117
seconds)
[ 631.283333] Freezing remaining freezable tasks completed (elapsed
0.033 seconds)
[ 635.459169] Filesystems sync: 0.014 seconds
[ 635.574913] Freeing user space processes todo:0 retry:2
[ 635.574928] Freezing user space processes completed (elapsed 0.115
seconds)
[ 635.613557] Freezing remaining freezable tasks completed (elapsed
0.038 seconds)
[ 639.801842] Filesystems sync: 0.014 seconds
[ 639.949023] Freeing user space processes todo:0 retry:2
[ 639.949047] Freezing user space processes completed (elapsed 0.146
seconds)
[ 639.998032] Freezing remaining freezable tasks completed (elapsed
0.048 seconds)
[ 644.151229] Filesystems sync: 0.011 seconds
[ 644.303744] Freeing user space processes todo:0 retry:2
[ 644.303765] Freezing user space processes completed (elapsed 0.152
seconds)
[ 644.347925] Freezing remaining freezable tasks completed (elapsed
0.043 seconds)
[ 648.506472] Filesystems sync: 0.010 seconds
[ 648.647752] Freeing user space processes todo:192 retry:2
[ 648.670978] Freeing user space processes todo:0 retry:3
[ 648.670997] Freezing user space processes completed (elapsed 0.164
seconds)
[ 648.724734] Freezing remaining freezable tasks completed (elapsed
0.053 seconds)
[ 652.947466] Filesystems sync: 0.021 seconds
[ 653.112034] Freeing user space processes todo:0 retry:2
[ 653.112055] Freezing user space processes completed (elapsed 0.164
seconds)
[ 653.163845] Freezing remaining freezable tasks completed (elapsed
0.051 seconds)
[ 657.364792] Filesystems sync: 0.012 seconds
[ 657.510491] Freezing user space processes completed (elapsed 0.145
seconds)
[ 657.570268] Freezing remaining freezable tasks completed (elapsed
0.059 seconds)
[ 661.779728] Filesystems sync: 0.011 seconds
[ 661.975654] Freeing user space processes todo:0 retry:2
[ 661.975686] Freezing user space processes completed (elapsed 0.195
seconds)
[ 662.050074] Freezing remaining freezable tasks completed (elapsed
0.074 seconds)
[ 666.273377] Filesystems sync: 0.010 seconds
[ 666.481081] Freeing user space processes todo:0 retry:2
[ 666.481117] Freezing user space processes completed (elapsed 0.207
seconds)
[ 666.564340] Freezing remaining freezable tasks completed (elapsed
0.083 seconds)
We observed the following log during one of the test runs:
[ 648.647752] Freeing user space processes todo:192 retry:2
However, since the reproduction rate is currently low, it's still
difficult to quantify exactly how much performance improvement the patch
brings.
>>
>> The restriction is only active during the window when the system is
>> freezing user tasks. Once all tasks are frozen, or if the system aborts
>> the suspend/hibernate process, the restriction is lifted.
>> No kernel threads are affected, and kernel_create_* functions remain
>> unrestricted.
>>
>> Signed-off-by: Zihuan Zhang <zhangzihuan@kylinos.cn>
>> ---
>> include/linux/suspend.h | 8 ++++++++
>> kernel/fork.c | 6 ++++++
>> kernel/power/Kconfig | 10 ++++++++++
>> kernel/power/main.c | 44 +++++++++++++++++++++++++++++++++++++++++
>> kernel/power/power.h | 4 ++++
>> kernel/power/process.c | 7 +++++++
>> 6 files changed, 79 insertions(+)
>>
>> diff --git a/include/linux/suspend.h b/include/linux/suspend.h
>> index b1c76c8f2c82..2dd8b3eb50f0 100644
>> --- a/include/linux/suspend.h
>> +++ b/include/linux/suspend.h
>> @@ -591,4 +591,12 @@ enum suspend_stat_step {
>> void dpm_save_failed_dev(const char *name);
>> void dpm_save_failed_step(enum suspend_stat_step step);
>> +#ifdef CONFIG_PM_DISABLE_USER_FORK_DURING_FREEZE
>> +extern bool pm_block_user_fork;
>> +bool pm_should_block_fork(void);
>> +bool pm_freeze_process_in_progress(void);
>> +#else
>> +static inline bool pm_should_block_fork(void) { return false; };
>> +static inline bool pm_freeze_process_in_progress(void) { return
>> false; };
>> +#endif /* CONFIG_PM_DISABLE_USER_FORK_DURING_FREEZE */
>> #endif /* _LINUX_SUSPEND_H */
>> diff --git a/kernel/fork.c b/kernel/fork.c
>> index 1ee8eb11f38b..b0bd0206b644 100644
>> --- a/kernel/fork.c
>> +++ b/kernel/fork.c
>> @@ -105,6 +105,7 @@
>> #include <uapi/linux/pidfd.h>
>> #include <linux/pidfs.h>
>> #include <linux/tick.h>
>> +#include <linux/suspend.h>
>> #include <asm/pgalloc.h>
>> #include <linux/uaccess.h>
>> @@ -2596,6 +2597,11 @@ pid_t kernel_clone(struct kernel_clone_args
>> *args)
>> trace = 0;
>> }
>> +#ifdef CONFIG_PM_DISABLE_USER_FORK_DURING_FREEZE
>> + if (pm_should_block_fork() && !(current->flags & PF_KTHREAD))
>> + return -EBUSY;
>> +#endif
>
You're absolutely right — returning -EBUSY is not part of the documented
interface for fork/clone3, and user space libraries like glibc are
likely not prepared to handle that gracefully.
One alternative could be to block in kernel_clone() until freezing ends,
instead of returning an error. That way, fork() would not fail, just
potentially block briefly (similar to memory pressure or cgroup limits).
Do you think that's more acceptable?
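Something along these lines is what I have in mind (a rough, untested
sketch; pm_fork_wait is a hypothetical waitqueue and error handling is
omitted):

	static DECLARE_WAIT_QUEUE_HEAD(pm_fork_wait);

	/* in kernel_clone(), before copy_process(): */
	if (!(current->flags & PF_KTHREAD))
		wait_event_freezable(pm_fork_wait,
				     !pm_freeze_process_in_progress());

	/* once the freeze window ends or suspend is aborted: */
	wake_up_all(&pm_fork_wait);

A freezable wait would also let the forking task itself be frozen while it
sleeps, so it cannot hold up the freezer.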
I’ll draft an updated version reflecting your suggestions. Really
appreciate your time and review!
Best regards,
Zihuan Zhang
> fork() is not documented to return EBUSY and for clone3() it's
> documented to only happen in specific cases.
>
> So user space is not prepared for that.
>
* Re: [RFC PATCH] PM: Optionally block user fork during freeze to improve performance
2025-06-08 7:22 ` zhangzihuan
@ 2025-06-08 15:50 ` Mateusz Guzik
2025-06-09 3:46 ` zhangzihuan
0 siblings, 1 reply; 16+ messages in thread
From: Mateusz Guzik @ 2025-06-08 15:50 UTC (permalink / raw)
To: zhangzihuan
Cc: David Hildenbrand, rafael, len.brown, pavel, kees, mingo, peterz,
juri.lelli, vincent.guittot, dietmar.eggemann, rostedt, bsegall,
mgorman, vschneid, akpm, lorenzo.stoakes, Liam.Howlett, vbabka,
rppt, surenb, mhocko, linux-pm, linux-kernel, linux-mm
On Sun, Jun 08, 2025 at 03:22:20PM +0800, zhangzihuan wrote:
> One alternative could be to block in kernel_clone() until freezing ends,
> instead of returning an error. That way, fork() would not fail, just
> potentially block briefly (similar to memory pressure or cgroup limits). Do
> you think that's more acceptable?
So I had a look at the freezing loop and it operates with
tasklist_lock held, meaning it already stalls clone().
try_to_freeze_tasks() in kernel/power/process.c contains:
todo = 0;
read_lock(&tasklist_lock);
for_each_process_thread(g, p) {
if (p == current || !freeze_task(p))
continue;
todo++;
}
read_unlock(&tasklist_lock);
I don't get where the assumption that fork itself is a factor is coming
from.
Looking at freezing itself it seems to me perf trouble starts with tons
of processes existing to begin with in arbitrary states (not with racing
against fork), requiring a retry preceded by a sleep:
/*
* We need to retry, but first give the freezing tasks some
* time to enter the refrigerator. Start with an initial
* 1 ms sleep followed by exponential backoff until 8 ms.
*/
usleep_range(sleep_usecs / 2, sleep_usecs);
if (sleep_usecs < 8 * USEC_PER_MSEC)
sleep_usecs *= 2;
For a race against fork to have any effect, the new thread has to be
linked into the global list -- otherwise the todo var won't get bumped.
But then if it gets added in a state which is freezable, the racing fork
did not cause any trouble.
If it gets added in a state which is *NOT* freezable by the current
code, maybe it should be patched to be freezable.
All in all I'm not confident any of this warrants any work -- do you
have a setup where the above causes a real problem?
* Re: [RFC PATCH] PM: Optionally block user fork during freeze to improve performance
2025-06-08 15:50 ` Mateusz Guzik
@ 2025-06-09 3:46 ` zhangzihuan
0 siblings, 0 replies; 16+ messages in thread
From: zhangzihuan @ 2025-06-09 3:46 UTC (permalink / raw)
To: Mateusz Guzik
Cc: David Hildenbrand, rafael, len.brown, pavel, kees, mingo, peterz,
juri.lelli, vincent.guittot, dietmar.eggemann, rostedt, bsegall,
mgorman, vschneid, akpm, lorenzo.stoakes, Liam.Howlett, vbabka,
rppt, surenb, mhocko, linux-pm, linux-kernel, linux-mm
Hi Mateusz,
Thanks again for your detailed input.
You’re absolutely right that try_to_freeze_tasks() holds the
tasklist_lock during the main freeze loop, which temporarily blocks
kernel_clone() at that stage. However, based on our observations and
logs, the problem arises just after this loop, in the short window
before the system enters suspend (e.g., around the brief usleep()
period), when the lock is released and fork is once again possible.
To illustrate this, I’d like to share some dmesg logs gathered during a
series of S3 suspend attempts. In most suspend cycles, we intentionally
run a user-space process that forks rapidly during suspend, and we
observe multiple retries during the “freezing user space processes”
phase. Below are selected entries.
On 2025/6/8 23:50, Mateusz Guzik wrote:
> On Sun, Jun 08, 2025 at 03:22:20PM +0800, zhangzihuan wrote:
>> One alternative could be to block in kernel_clone() until freezing ends,
>> instead of returning an error. That way, fork() would not fail, just
>> potentially block briefly (similar to memory pressure or cgroup limits). Do
>> you think that's more acceptable?
> So I had a look at the freezing loop and it operates with
> tasklist_lock held, meaning it already stalls clone().
>
> try_to_freeze_tasks() in kernel/power/process.c contains:
>
> todo = 0;
> read_lock(&tasklist_lock);
> for_each_process_thread(g, p) {
> if (p == current || !freeze_task(p))
> continue;
>
> todo++;
> }
> read_unlock(&tasklist_lock);
>
> I don't get where the assumption that fork itself is a factor is coming
> from.
>
> Looking at freezing itself it seems to me perf trouble starts with tons
> of processes existing to begin with in arbitrary states (not with racing
> against fork), requring a retry with preceeded by a sleep:
>
> /*
> * We need to retry, but first give the freezing tasks some
> * time to enter the refrigerator. Start with an initial
> * 1 ms sleep followed by exponential backoff until 8 ms.
> */
> usleep_range(sleep_usecs / 2, sleep_usecs);
> if (sleep_usecs < 8 * USEC_PER_MSEC)
> sleep_usecs *= 2;
>
> For a race against fork to have any effect, the new thread has to be
> linked in to the global list -- otherwise the todo var wont get bumped.
>
> But then if it gets added in a state which is freezable, the racing fork
> did not cause any trouble.
>
> If it gets added in a state which is *NOT* freezable by the current
> code, maybe it should be patched to be freezable.
>
> All in all I'm not confident any of this warrants any work -- do you
> have a setup where the above causes a real problem?
Here is the log:
dmesg | grep -E 'elap|Files|retry'
[ 2556.566183] Filesystems sync: 0.012 seconds
[ 2556.570653] Freeing user space processes todo:1181 retry:0
[ 2556.572719] Freeing user space processes todo:0 retry:1
[ 2556.572730] Freezing user space processes completed (elapsed 0.006
seconds)
[ 2556.573243] Freeing remaining freezable tasks todo:13 retry:0
[ 2556.574326] Freeing remaining freezable tasks todo:0 retry:1
[ 2556.574333] Freezing remaining freezable tasks completed (elapsed
0.001 seconds)
[ 2560.647576] Filesystems sync: 0.018 seconds
[ 2560.656691] Freeing user space processes todo:2656 retry:0
[ 2560.661194] Freeing user space processes todo:327 retry:1
[ 2560.664130] Freeing user space processes todo:0 retry:2
[ 2560.664139] Freezing user space processes completed (elapsed 0.016
seconds)
[ 2560.665475] Freeing remaining freezable tasks todo:13 retry:0
[ 2560.667159] Freeing remaining freezable tasks todo:0 retry:1
[ 2560.667170] Freezing remaining freezable tasks completed (elapsed
0.003 seconds)
[ 2564.746592] Filesystems sync: 0.013 seconds
[ 2564.761025] Freeing user space processes todo:4192 retry:0
[ 2564.768048] Freeing user space processes todo:252 retry:1
[ 2564.773774] Freeing user space processes todo:0 retry:2
[ 2564.773801] Freezing user space processes completed (elapsed 0.026
seconds)
[ 2564.776704] Freeing remaining freezable tasks todo:13 retry:0
[ 2564.781867] Freeing remaining freezable tasks todo:0 retry:1
[ 2564.781887] Freezing remaining freezable tasks completed (elapsed
0.008 seconds)
[ 2568.872805] Filesystems sync: 0.010 seconds
[ 2568.893397] Freeing user space processes todo:5897 retry:0
[ 2568.903089] Freeing user space processes todo:0 retry:1
[ 2568.903102] Freezing user space processes completed (elapsed 0.030
seconds)
[ 2568.907681] Freeing remaining freezable tasks todo:13 retry:0
[ 2568.914721] Freeing remaining freezable tasks todo:0 retry:1
[ 2568.914743] Freezing remaining freezable tasks completed (elapsed
0.011 seconds)
[ 2573.019240] Filesystems sync: 0.018 seconds
[ 2573.044573] Freeing user space processes todo:7536 retry:0
[ 2573.056378] Freeing user space processes todo:261 retry:1
[ 2573.062016] Freeing user space processes todo:0 retry:2
[ 2573.062024] Freezing user space processes completed (elapsed 0.042
seconds)
[ 2573.067114] Freeing remaining freezable tasks todo:13 retry:0
[ 2573.072597] Freeing remaining freezable tasks todo:0 retry:1
[ 2573.072604] Freezing remaining freezable tasks completed (elapsed
0.010 seconds)
[ 2577.176003] Filesystems sync: 0.013 seconds
[ 2577.210773] Freeing user space processes todo:9042 retry:0
[ 2577.226116] Freeing user space processes todo:637 retry:1
[ 2577.233723] Freeing user space processes todo:0 retry:2
[ 2577.233733] Freezing user space processes completed (elapsed 0.057
seconds)
[ 2577.240897] Freeing remaining freezable tasks todo:13 retry:0
[ 2577.250898] Freeing remaining freezable tasks todo:0 retry:1
[ 2577.250928] Freezing remaining freezable tasks completed (elapsed
0.017 seconds)
[ 2581.358613] Filesystems sync: 0.014 seconds
[ 2581.397288] Freeing user space processes todo:10397 retry:0
[ 2581.415191] Freeing user space processes todo:107 retry:1
[ 2581.423085] Freeing user space processes todo:0 retry:2
[ 2581.423094] Freezing user space processes completed (elapsed 0.064
seconds)
[ 2581.431079] Freeing remaining freezable tasks todo:13 retry:0
[ 2581.441576] Freeing remaining freezable tasks todo:0 retry:1
[ 2581.441596] Freezing remaining freezable tasks completed (elapsed
0.018 seconds)
[ 2585.572128] Filesystems sync: 0.016 seconds
[ 2585.617543] Freeing user space processes todo:12330 retry:0
[ 2585.638997] Freeing user space processes todo:1227 retry:1
[ 2585.648592] Freeing user space processes todo:0 retry:2
[ 2585.648602] Freezing user space processes completed (elapsed 0.076
seconds)
[ 2585.658063] Freeing remaining freezable tasks todo:13 retry:0
[ 2585.670385] Freeing remaining freezable tasks todo:0 retry:1
[ 2585.670405] Freezing remaining freezable tasks completed (elapsed
0.021 seconds)
[ 2589.810371] Filesystems sync: 0.014 seconds
[ 2589.865483] Freeing user space processes todo:14036 retry:0
[ 2589.893513] Freeing user space processes todo:1288 retry:1
[ 2589.904032] Freeing user space processes todo:0 retry:2
[ 2589.904040] Freezing user space processes completed (elapsed 0.093
seconds)
[ 2589.914322] Freeing remaining freezable tasks todo:13 retry:0
[ 2589.925185] Freeing remaining freezable tasks todo:0 retry:1
[ 2589.925191] Freezing remaining freezable tasks completed (elapsed
0.021 seconds)
[ 2594.088171] Filesystems sync: 0.013 seconds
[ 2594.145012] Freeing user space processes todo:15947 retry:0
[ 2594.175153] Freeing user space processes todo:1521 retry:1
[ 2594.187060] Freeing user space processes todo:0 retry:2
[ 2594.187071] Freezing user space processes completed (elapsed 0.098
seconds)
[ 2594.199270] Freeing remaining freezable tasks todo:13 retry:0
[ 2594.215446] Freeing remaining freezable tasks todo:0 retry:1
[ 2594.215468] Freezing remaining freezable tasks completed (elapsed
0.028 seconds)
However, in the last suspend cycle, we did not execute the fork script,
and the result is quite different:
[ 2678.840809] Filesystems sync: 0.010 seconds
[ 2678.928107] Freeing user space processes todo:16673 retry:0
[ 2678.950744] Freeing user space processes todo:0 retry:1
[ 2678.950759] Freezing user space processes completed (elapsed 0.109
seconds)
[ 2678.971389] Freeing remaining freezable tasks todo:13 retry:0
[ 2678.996021] Freeing remaining freezable tasks todo:0 retry:1
[ 2678.996043] Freezing remaining freezable tasks completed (elapsed
0.045 seconds)
(Note that this cycle needed only a single retry, e.g.:
[ 2678.928107] Freeing user space processes todo:16673 retry:0
[ 2678.950744] Freeing user space processes todo:0 retry:1)
This pattern is repeatable: when fork is allowed during the
freeze/suspend window, we consistently hit multiple retries; when fork
is disabled during that time, the freeze proceeds quickly and smoothly
with just one retry.
This indicates that new user processes created after the freezing loop
begins are interfering with the suspend, which is consistent with a fork
escape scenario. Since the current code doesn’t prevent forks once
tasklist_lock is released, a new child process can be created and escape
freezing altogether — leading to the need for retries and sometimes
suspend failure.
Hope this helps clarify the issue. Happy to provide further logs or
testing as needed.
* Re: [RFC PATCH] PM: Optionally block user fork during freeze to improve performance
2025-06-06 8:22 ` Peter Zijlstra
@ 2025-06-09 4:05 ` zhangzihuan
2025-06-10 10:50 ` David Hildenbrand
0 siblings, 1 reply; 16+ messages in thread
From: zhangzihuan @ 2025-06-09 4:05 UTC (permalink / raw)
To: Peter Zijlstra
Cc: rafael, len.brown, pavel, kees, mingo, juri.lelli,
vincent.guittot, dietmar.eggemann, rostedt, bsegall, mgorman,
vschneid, akpm, david, lorenzo.stoakes, Liam.Howlett, vbabka,
rppt, surenb, mhocko, linux-pm, linux-kernel, linux-mm
Hi Peter,
Thanks a lot for the feedback!
On 2025/6/6 16:22, Peter Zijlstra wrote:
> This isn't blocking fork(), this is failing fork(). Huge difference.
> Also problematic, because -EBUSY is not a recognised return value of
> fork(). As such, no existing software will adequately handle it.
I completely agree there's a significant difference between failing
and blocking fork().
The intent was to prevent late-created user tasks from interfering with
the freezing process, but you're right: returning -EBUSY is not valid
for fork(), and existing user-space programs wouldn't expect or handle
that properly.
As a next step, I'm considering switching to a blocking mechanism
instead — that is, have user fork() temporarily sleep if it's attempted
during the freeze window. That should avoid breaking user-space
expectations while still helping maintain freeze stability.
Would that be more acceptable?
Thanks again for the insight,
Zihuan Zhang
* Re: [RFC PATCH] PM: Optionally block user fork during freeze to improve performance
2025-06-09 4:05 ` zhangzihuan
@ 2025-06-10 10:50 ` David Hildenbrand
2025-06-13 2:37 ` Zihuan Zhang
0 siblings, 1 reply; 16+ messages in thread
From: David Hildenbrand @ 2025-06-10 10:50 UTC (permalink / raw)
To: zhangzihuan, Peter Zijlstra
Cc: rafael, len.brown, pavel, kees, mingo, juri.lelli,
vincent.guittot, dietmar.eggemann, rostedt, bsegall, mgorman,
vschneid, akpm, lorenzo.stoakes, Liam.Howlett, vbabka, rppt,
surenb, mhocko, linux-pm, linux-kernel, linux-mm
On 09.06.25 06:05, zhangzihuan wrote:
> Hi Peter,
> Thanks a lot for the feedback!
>
> On 2025/6/6 16:22, Peter Zijlstra wrote:
>> This isn't blocking fork(), this is failing fork(). Huge difference.
>> Also problematic, because -EBUSY is not a recognised return value of
>> fork(). As such, no existing software will adequately handle it.
> I completely agree there's a significant difference between failing
> and blocking fork().
> The intent was to prevent late-created user tasks from interfering with
> the freezing process, but you're right: returning -EBUSY is not valid
> for fork(), and existing user-space programs wouldn't expect or handle
> that properly.
> As a next step, I'm considering switching to a blocking mechanism
> instead — that is, have user fork() temporarily sleep if it's attempted
> during the freeze window. That should avoid breaking user-space
> expectations while still helping maintain freeze stability.
> Would that be more acceptable?
Can't this problem be mitigated by simply not scheduling the new fork'ed
process while the system is frozen?
Or what exact scenario are you worried about?
--
Cheers,
David / dhildenb
* Re: [RFC PATCH] PM: Optionally block user fork during freeze to improve performance
2025-06-10 10:50 ` David Hildenbrand
@ 2025-06-13 2:37 ` Zihuan Zhang
2025-06-13 7:05 ` Michal Hocko
0 siblings, 1 reply; 16+ messages in thread
From: Zihuan Zhang @ 2025-06-13 2:37 UTC (permalink / raw)
To: David Hildenbrand, Peter Zijlstra
Cc: rafael, len.brown, pavel, kees, mingo, juri.lelli,
vincent.guittot, dietmar.eggemann, rostedt, bsegall, mgorman,
vschneid, akpm, lorenzo.stoakes, Liam.Howlett, vbabka, rppt,
surenb, mhocko, linux-pm, linux-kernel, linux-mm
Hi David,
Thanks for your advice!
On 2025/6/10 18:50, David Hildenbrand wrote:
>
> Can't this problem be mitigated by simply not scheduling the new fork'ed
> process while the system is frozen?
>
> Or what exact scenario are you worried about?
Let me revisit the core issue for clarity. Under normal conditions, most
processes in the system are in a sleep state, and only a few are
runnable. So even with thousands of processes, the freezer generally
works reliably and completes within a reasonable time.
However, in our fork-based test scenario, we observed repeated freeze
retries. This is not due to process count directly, but rather due to a
scheduling behavior during the freeze phase. Specifically, the freezer
retry logic contains the following snippet, which introduces the yield:

	/*
	 * We need to retry, but first give the freezing tasks some
	 * time to enter the refrigerator. Start with an initial
	 * 1 ms sleep followed by exponential backoff until 8 ms.
	 */
	usleep_range(sleep_usecs / 2, sleep_usecs);
	if (sleep_usecs < 8 * USEC_PER_MSEC)
		sleep_usecs *= 2;
This mechanism is usually effective because most tasks are sleeping and
quickly enter the frozen state. But with concurrent fork() bombs, we
observed that relinquishing the CPU here gives new child processes a
chance to run, delaying or blocking the freezer's progress.
When only a single fork loop is running, it’s often frozen before the
next retry. But when multiple forkers compete for CPU, we observed an
increase in the todo count and repeated retries.
So while preventing the scheduling of newly forked processes would solve
the problem at its root, it would require deeper architectural changes
(e.g., task-level flags or restrictions at the scheduler level).
We initially considered whether replacing usleep_range() with a
non-yielding wait might reduce this contention window. However, this
approach turned out to be counterproductive — it starves other normal
user tasks that need CPU time to reach their try_to_freeze() checkpoint,
ultimately making the freeze process slower.
You’re right — blocking fork() is quite intrusive, so it’s worth
exploring alternatives. We’ll try implementing your idea of preventing
the newly forked process from being scheduled while the system is
freezing, rather than failing the fork() call outright.
This may allow us to maintain compatibility with existing userspace
while avoiding interference with the freezer traversal. We’ll evaluate
whether this approach can reliably mitigate the issue (especially the
scheduling race window between copy_process() and freeze_task()), and
report back with results.
Best regards,
Zihuan Zhang
* Re: [RFC PATCH] PM: Optionally block user fork during freeze to improve performance
2025-06-13 2:37 ` Zihuan Zhang
@ 2025-06-13 7:05 ` Michal Hocko
2025-06-16 3:46 ` Zihuan Zhang
0 siblings, 1 reply; 16+ messages in thread
From: Michal Hocko @ 2025-06-13 7:05 UTC (permalink / raw)
To: Zihuan Zhang
Cc: David Hildenbrand, Peter Zijlstra, rafael, len.brown, pavel, kees,
mingo, juri.lelli, vincent.guittot, dietmar.eggemann, rostedt,
bsegall, mgorman, vschneid, akpm, lorenzo.stoakes, Liam.Howlett,
vbabka, rppt, surenb, linux-pm, linux-kernel, linux-mm
On Fri 13-06-25 10:37:42, Zihuan Zhang wrote:
> Hi David,
> Thanks for your advice!
>
> On 2025/6/10 18:50, David Hildenbrand wrote:
> >
> > Can't this problem be mitigated by simply not scheduling the new fork'ed
> > process while the system is frozen?
> >
> > Or what exact scenario are you worried about?
>
> Let me revisit the core issue for clarity. Under normal conditions, most
> processes in the system are in a sleep state, and only a few are runnable.
> So even with thousands of processes, the freezer generally works reliably
> and completes within a reasonable time
How do you define reasonable time?
> However, in our fork-based test scenario, we observed repeated freeze
> retries.
Does this represent any real life scenario that happens on your system?
In other words, how often do you miss your "reasonable time" threshold
while running a regular workload. Does the freezer ever fail?
[...]
> You’re right — blocking fork() is quite intrusive, so it’s worth exploring
> alternatives. We’ll try implementing your idea of preventing the newly
> forked process from being scheduled while the system is freezing, rather
> than failing the fork() call outright.
Just curious, are you interested in global freezer only or is the cgroup
freezer involved as well?
--
Michal Hocko
SUSE Labs
* Re: [RFC PATCH] PM: Optionally block user fork during freeze to improve performance
2025-06-13 7:05 ` Michal Hocko
@ 2025-06-16 3:46 ` Zihuan Zhang
2025-06-16 7:45 ` David Hildenbrand
0 siblings, 1 reply; 16+ messages in thread
From: Zihuan Zhang @ 2025-06-16 3:46 UTC (permalink / raw)
To: Michal Hocko
Cc: David Hildenbrand, Peter Zijlstra, rafael, len.brown, pavel, kees,
mingo, juri.lelli, vincent.guittot, dietmar.eggemann, rostedt,
bsegall, mgorman, vschneid, akpm, lorenzo.stoakes, Liam.Howlett,
vbabka, rppt, surenb, linux-pm, linux-kernel, linux-mm
Hi Michal,
Thanks for the question.
On 2025/6/13 15:05, Michal Hocko wrote:
> On Fri 13-06-25 10:37:42, Zihuan Zhang wrote:
>> Hi David,
>> Thanks for your advice!
>>
>> On 2025/6/10 18:50, David Hildenbrand wrote:
>>>
>>> Can't this problem be mitigated by simply not scheduling the new fork'ed
>>> process while the system is frozen?
>>>
>>> Or what exact scenario are you worried about?
>> Let me revisit the core issue for clarity. Under normal conditions, most
>> processes in the system are in a sleep state, and only a few are runnable.
>> So even with thousands of processes, the freezer generally works reliably
>> and completes within a reasonable time
> How do you define reasonable time?
>
To clarify: freezing a process typically takes only a few dozen
microseconds. In our tests, the freezer includes a usleep_range() delay
between retries, which is about 1ms in the first round and doubles in
subsequent rounds. Despite this delay, we observed that around 10% of
the processes were not frozen during the first pass and had to be retried.
This suggests that even with a reasonably sufficient delay, some newly
forked processes do not get frozen in time during the first iteration,
simply due to timing. The freeze latency itself remains small, but not
all processes are caught on the first try.
>> However, in our fork-based test scenario, we observed repeated freeze
>> retries.
> Does this represent any real life scenario that happens on your system?
> In other words how often do you miss your "reasonable time" treshold
> while running a regular workload. Does the freezer ever fail?
>
> [...]
In our test scenario, although new processes can indeed be created
during the usleep_range() intervals between freeze iterations, it’s
actually difficult to make the freezer fail outright. This is because
user processes are forcibly frozen: when they return to user space and
check for pending signals, they enter try_to_freeze() and transition
into the refrigerator.
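To be concrete, the checkpoint we rely on here is the existing freezer
hook on the signal-delivery path, roughly (simplified, nothing new here):

	/* simplified: run when a task processes its pending signals */
	try_to_freeze();	/* parks in __refrigerator() while freezing(current) */

so a late-forked child is eventually caught too, once it runs far enough
to reach that check.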
However, since the scheduler is fair by design, it gives both newly
forked tasks and yet-to-be-frozen tasks a chance to run. This
competition for CPU time can slightly delay the overall freeze process.
While this typically doesn’t lead to failure, it does cause more retries
than necessary, especially under CPU pressure.
Given that freezing is a clearly defined and semantically critical state
transition, we believe it makes sense to prioritize the execution of
tasks that are pending freezing over newly forked ones, particularly in
resource-constrained environments.
>> You’re right — blocking fork() is quite intrusive, so it’s worth exploring
>> alternatives. We’ll try implementing your idea of preventing the newly
>> forked process from being scheduled while the system is freezing, rather
>> than failing the fork() call outright.
> Just curious, are you interested in global freezer only or is the cgroup
> freezer involved as well?
>
At this stage, our focus is mainly on the global freezer during system
suspend and hibernate (S3/S4). However, the patch itself is based on the
generic freezing() and freeze_task() logic, so it should work with the
cgroup freezer as well.
* Re: [RFC PATCH] PM: Optionally block user fork during freeze to improve performance
2025-06-16 3:46 ` Zihuan Zhang
@ 2025-06-16 7:45 ` David Hildenbrand
2025-06-16 11:24 ` Michal Hocko
2025-06-18 11:30 ` Zihuan Zhang
0 siblings, 2 replies; 16+ messages in thread
From: David Hildenbrand @ 2025-06-16 7:45 UTC (permalink / raw)
To: Zihuan Zhang, Michal Hocko
Cc: Peter Zijlstra, rafael, len.brown, pavel, kees, mingo, juri.lelli,
vincent.guittot, dietmar.eggemann, rostedt, bsegall, mgorman,
vschneid, akpm, lorenzo.stoakes, Liam.Howlett, vbabka, rppt,
surenb, linux-pm, linux-kernel, linux-mm
>> [...]
> In our test scenario, although new processes can indeed be created
> during the usleep_range() intervals between freeze iterations, it’s
> actually difficult to make the freezer fail outright. This is because
> user processes are forcibly frozen: when they return to user space and
> check for pending signals, they enter try_to_freeze() and transition
> into the refrigerator.
>
> However, since the scheduler is fair by design, it gives both newly
> forked tasks and yet-to-be-frozen tasks a chance to run. This
> competition for CPU time can slightly delay the overall freeze process.
> While this typically doesn’t lead to failure, it does cause more retries
> than necessary, especially under CPU pressure.
I think that goes back to my original comment: why are we even allowing
fork children to run at all when we are currently freezing all tasks?
I would imagine that try_to_freeze_tasks() should force any new
processes (forked children) to start in the frozen state directly and
not get scheduled in the first place.
--
Cheers,
David / dhildenb
* Re: [RFC PATCH] PM: Optionally block user fork during freeze to improve performance
2025-06-16 7:45 ` David Hildenbrand
@ 2025-06-16 11:24 ` Michal Hocko
2025-06-18 11:30 ` Zihuan Zhang
1 sibling, 0 replies; 16+ messages in thread
From: Michal Hocko @ 2025-06-16 11:24 UTC (permalink / raw)
To: David Hildenbrand
Cc: Zihuan Zhang, Peter Zijlstra, rafael, len.brown, pavel, kees,
mingo, juri.lelli, vincent.guittot, dietmar.eggemann, rostedt,
bsegall, mgorman, vschneid, akpm, lorenzo.stoakes, Liam.Howlett,
vbabka, rppt, surenb, linux-pm, linux-kernel, linux-mm
On Mon 16-06-25 09:45:59, David Hildenbrand wrote:
>
> > > [...]
> > In our test scenario, although new processes can indeed be created
> > during the usleep_range() intervals between freeze iterations, it’s
> > actually difficult to make the freezer fail outright. This is because
> > user processes are forcibly frozen: when they return to user space and
> > check for pending signals, they enter try_to_freeze() and transition
> > into the refrigerator.
> >
> > However, since the scheduler is fair by design, it gives both newly
> > forked tasks and yet-to-be-frozen tasks a chance to run. This
> > competition for CPU time can slightly delay the overall freeze process.
> > While this typically doesn’t lead to failure, it does cause more retries
> > than necessary, especially under CPU pressure.
>
> I think that goes back to my original comment: why are we even allowing fork
> children to run at all when we are currently freezing all tasks?
The same should be the case for cgroup freezer as well.
--
Michal Hocko
SUSE Labs
* Re: [RFC PATCH] PM: Optionally block user fork during freeze to improve performance
2025-06-16 7:45 ` David Hildenbrand
2025-06-16 11:24 ` Michal Hocko
@ 2025-06-18 11:30 ` Zihuan Zhang
2025-06-18 11:54 ` David Hildenbrand
1 sibling, 1 reply; 16+ messages in thread
From: Zihuan Zhang @ 2025-06-18 11:30 UTC (permalink / raw)
To: David Hildenbrand, Michal Hocko
Cc: Peter Zijlstra, rafael, len.brown, pavel, kees, mingo, juri.lelli,
vincent.guittot, dietmar.eggemann, rostedt, bsegall, mgorman,
vschneid, akpm, lorenzo.stoakes, Liam.Howlett, vbabka, rppt,
surenb, linux-pm, linux-kernel, linux-mm
Hi David,
On 2025/6/16 15:45, David Hildenbrand wrote:
>
>>> [...]
>> In our test scenario, although new processes can indeed be created
>> during the usleep_range() intervals between freeze iterations, it’s
>> actually difficult to make the freezer fail outright. This is because
>> user processes are forcibly frozen: when they return to user space and
>> check for pending signals, they enter try_to_freeze() and transition
>> into the refrigerator.
>>
>> However, since the scheduler is fair by design, it gives both newly
>> forked tasks and yet-to-be-frozen tasks a chance to run. This
>> competition for CPU time can slightly delay the overall freeze process.
>> While this typically doesn’t lead to failure, it does cause more retries
>> than necessary, especially under CPU pressure.
>
> I think that goes back to my original comment: why are we even
> allowing fork children to run at all when we are currently freezing
> all tasks?
>
> I would imagine that try_to_freeze_tasks() should force any new
> processes (forked children) to start in the frozen state directly and
> not get scheduled in the first place.
>
Thanks again for your comments and suggestion.
We understand the motivation behind your idea: ideally, newly forked
tasks during freezing should either be immediately frozen or prevented
from running at all, to avoid unnecessary retries and delays. That makes
perfect sense.
However, implementing this seems non-trivial under the current freezer
model, as it relies on voluntary transitions and lacks a mechanism to
block forked children from being scheduled.
Any insights or pointers would be greatly appreciated.
Best regards,
Zihuan Zhang
* Re: [RFC PATCH] PM: Optionally block user fork during freeze to improve performance
2025-06-18 11:30 ` Zihuan Zhang
@ 2025-06-18 11:54 ` David Hildenbrand
2025-07-28 13:06 ` Zihuan Zhang
0 siblings, 1 reply; 16+ messages in thread
From: David Hildenbrand @ 2025-06-18 11:54 UTC (permalink / raw)
To: Zihuan Zhang, Michal Hocko
Cc: Peter Zijlstra, rafael, len.brown, pavel, kees, mingo, juri.lelli,
vincent.guittot, dietmar.eggemann, rostedt, bsegall, mgorman,
vschneid, akpm, lorenzo.stoakes, Liam.Howlett, vbabka, rppt,
surenb, linux-pm, linux-kernel, linux-mm
On 18.06.25 13:30, Zihuan Zhang wrote:
> Hi David,
>
>> On 2025/6/16 15:45, David Hildenbrand wrote:
>>
>>>> [...]
>>> In our test scenario, although new processes can indeed be created
>>> during the usleep_range() intervals between freeze iterations, it’s
>>> actually difficult to make the freezer fail outright. This is because
>>> user processes are forcibly frozen: when they return to user space and
>>> check for pending signals, they enter try_to_freeze() and transition
>>> into the refrigerator.
>>>
>>> However, since the scheduler is fair by design, it gives both newly
>>> forked tasks and yet-to-be-frozen tasks a chance to run. This
>>> competition for CPU time can slightly delay the overall freeze process.
>>> While this typically doesn’t lead to failure, it does cause more retries
>>> than necessary, especially under CPU pressure.
>>
>> I think that goes back to my original comment: why are we even
>> allowing fork children to run at all when we are currently freezing
>> all tasks?
>>
>> I would imagine that try_to_freeze_tasks() should force any new
>> processes (forked children) to start in the frozen state directly and
>> not get scheduled in the first place.
>>
> Thanks again for your comments and suggestion.
>
> We understand the motivation behind your idea: ideally, newly forked
> tasks during freezing should either be immediately frozen or prevented
> from running at all, to avoid unnecessary retries and delays. That makes
> perfect sense.
>
> However, implementing this seems non-trivial under the current freezer
> model, as it relies on voluntary transitions and lacks a mechanism to
> block forked children from being scheduled.
>
> Any insights or pointers would be greatly appreciated.
I'm afraid I can't provide too much guidance on scheduler logic.
Apparently we have this freezer_active global that forces existing tasks
being frozen to enter the freezing_slow_path().
There, we perform multiple checks, including "pm_freezing && !(p->flags
& PF_KTHREAD)".
I would have thought that we would want to make fork()/clone() children
while freezing also result in freezing_slow_path()==true, and stop them
from getting scheduled in the first place.
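Very roughly, something like the below from the fork side (completely
untested, only to illustrate the direction; it does not literally prevent
scheduling, it just marks the child so it freezes as soon as it runs):

	/* in kernel_clone(), after copy_process() and before
	 * wake_up_new_task() -- untested illustration only */
	if (freezing(p))
		freeze_task(p);	/* send the child straight towards the refrigerator */

Truly keeping the child off the runqueue until the freeze completes would
need help from the scheduler side.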
Again, no scheduler expert, but that's something I would look into.
--
Cheers,
David / dhildenb
* Re: [RFC PATCH] PM: Optionally block user fork during freeze to improve performance
2025-06-18 11:54 ` David Hildenbrand
@ 2025-07-28 13:06 ` Zihuan Zhang
0 siblings, 0 replies; 16+ messages in thread
From: Zihuan Zhang @ 2025-07-28 13:06 UTC (permalink / raw)
To: David Hildenbrand, Michal Hocko
Cc: Peter Zijlstra, rafael, len.brown, pavel, kees, mingo, juri.lelli,
vincent.guittot, dietmar.eggemann, rostedt, bsegall, mgorman,
vschneid, akpm, lorenzo.stoakes, Liam.Howlett, vbabka, rppt,
surenb, linux-pm, linux-kernel, linux-mm
Hi,
On 2025/6/18 19:54, David Hildenbrand wrote:
> On 18.06.25 13:30, Zihuan Zhang wrote:
>> Hi David,
>>
>>> On 2025/6/16 15:45, David Hildenbrand wrote:
>>>
>>>>> [...]
>>>> In our test scenario, although new processes can indeed be created
>>>> during the usleep_range() intervals between freeze iterations, it’s
>>>> actually difficult to make the freezer fail outright. This is because
>>>> user processes are forcibly frozen: when they return to user space and
>>>> check for pending signals, they enter try_to_freeze() and transition
>>>> into the refrigerator.
>>>>
>>>> However, since the scheduler is fair by design, it gives both newly
>>>> forked tasks and yet-to-be-frozen tasks a chance to run. This
>>>> competition for CPU time can slightly delay the overall freeze
>>>> process.
>>>> While this typically doesn’t lead to failure, it does cause more
>>>> retries
>>>> than necessary, especially under CPU pressure.
>>>
>>> I think that goes back to my original comment: why are we even
>>> allowing fork children to run at all when we are currently freezing
>>> all tasks?
>>>
>>> I would imagine that try_to_freeze_tasks() should force any new
>>> processes (forked children) to start in the frozen state directly and
>>> not get scheduled in the first place.
>>>
>> Thanks again for your comments and suggestion.
>>
>> We understand the motivation behind your idea: ideally, newly forked
>> tasks during freezing should either be immediately frozen or prevented
>> from running at all, to avoid unnecessary retries and delays. That makes
>> perfect sense.
>>
>> However, implementing this seems non-trivial under the current freezer
>> model, as it relies on voluntary transitions and lacks a mechanism to
>> block forked children from being scheduled.
>>
>> Any insights or pointers would be greatly appreciated.
>
> I'm afraid I can't provide too much guidance on scheduler logic.
>
> Apparently we have this freezer_active global that forces existing
> frozen pages to enter the freezing_slow_path().
>
> There, we perform multiple checks, including "pm_freezing &&
> !(p->flags & PF_KTHREAD)".
>
> I would have thought that we would want to make fork()/clone()
> children while freezing also result in freezing_slow_path()==true, and
> stop them from getting scheduled in the first place.
>
> Again, no scheduler expert, but that's something I would look into.
>
We’re currently working on a new freeze priority mechanism, which allows
the freezer to freeze user processes in layers rather than treating all
tasks equally.
With our priority-based model, we can ensure that key processes are
frozen in the correct order to avoid this class of problems entirely. I
believe this approach will address the issue in a more robust and
general way.
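As a very rough illustration of the shape of it (hypothetical pseudo-code
only, not the actual patchset; task_freeze_prio() and the per-layer wait
are made-up helpers):

	/* hypothetical sketch: freeze user tasks layer by layer */
	for (prio = FREEZE_PRIO_MAX; prio >= 0; prio--) {
		read_lock(&tasklist_lock);
		for_each_process_thread(g, p)
			if (task_freeze_prio(p) == prio)
				freeze_task(p);
		read_unlock(&tasklist_lock);
		wait_for_layer_frozen(prio);	/* made-up helper */
	}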
I’ll share the patchset for feedback in a few weeks.