public inbox for linux-kernel@vger.kernel.org
 help / color / mirror / Atom feed
* [PATCH RESEND 1/2] watchdog/hardlockup/perf: Fix perf_event memory leak
@ 2024-10-21 19:30 Li Huafei
  2024-10-21 19:30 ` [PATCH RESEND 2/2] watchdog/hardlockup/perf: Warn if watchdog_ev is overwritten Li Huafei
                   ` (2 more replies)
  0 siblings, 3 replies; 5+ messages in thread
From: Li Huafei @ 2024-10-21 19:30 UTC (permalink / raw)
  To: tglx, peterz
  Cc: akpm, linux, song, dianders, j.granados, liusong, lizhe.67, yaoma,
	dzickus, mingo, linux-kernel, lihuafei1

During the stress test, we found a kmemleak report for perf_event:

  unreferenced object 0xff110001410a33e0 (size 1328):
    comm "kworker/4:11", pid 288, jiffies 4294916004
    hex dump (first 32 bytes):
      b8 be c2 3b 02 00 11 ff 22 01 00 00 00 00 ad de  ...;....".......
      f0 33 0a 41 01 00 11 ff f0 33 0a 41 01 00 11 ff  .3.A.....3.A....
    backtrace (crc 24eb7b3a):
      [<00000000e211b653>] kmem_cache_alloc_node_noprof+0x269/0x2e0
      [<000000009d0985fa>] perf_event_alloc+0x5f/0xcf0
      [<00000000084ad4a2>] perf_event_create_kernel_counter+0x38/0x1b0
      [<00000000fde96401>] hardlockup_detector_event_create+0x50/0xe0
      [<0000000051183158>] watchdog_hardlockup_enable+0x17/0x70
      [<00000000ac89727f>] softlockup_start_fn+0x15/0x40
      ...

Our stress test includes CPU online and offline, and updating the
watchdog configuration.  After reading the code, I found that there may
be a race between cleaning up perf_event after updating watchdog and
disabling event when the CPU goes offline:

  CPU0                          CPU1                           CPU2
  (update watchdog)                                            (hotplug offline CPU1)

  ...                                                          _cpu_down(CPU1)
  cpus_read_lock()                                             // waiting for cpu lock
    softlockup_start_all
      smp_call_on_cpu(CPU1)
                                softlockup_start_fn
                                ...
                                  watchdog_hardlockup_enable(CPU1)
                                    perf create E1
                                    watchdog_ev[CPU1] = E1
  cpus_read_unlock()
                                                               cpus_write_lock()
                                                               cpuhp_kick_ap_work(CPU1)
                                cpuhp_thread_fun
                                ...
                                  watchdog_hardlockup_disable(CPU1)
                                    watchdog_ev[CPU1] = NULL
                                    dead_event[CPU1] = E1
  __lockup_detector_cleanup
    for each dead_events_mask
      release each dead_event
      /*
       * CPU1 has not been added to
       * dead_events_mask, then E1
       * will not be released
       */
                                    CPU1 -> dead_events_mask
    cpumask_clear(&dead_events_mask)
    // dead_events_mask is cleared, E1 is leaked

In this case, the leaked perf_event E1 matches the perf_event leak
reported by kmemleak. Due to the low probability of problem recurrence
(only reported once), I added some hack delays in the code:

  static void __lockup_detector_reconfigure(void)
  {
    ...
          watchdog_hardlockup_start();
          cpus_read_unlock();
  +       mdelay(100);
          /*
           * Must be called outside the cpus locked section to prevent
           * recursive locking in the perf code.
    ...
  }

  void watchdog_hardlockup_disable(unsigned int cpu)
  {
    ...
                  perf_event_disable(event);
                  this_cpu_write(watchdog_ev, NULL);
                  this_cpu_write(dead_event, event);
  +               mdelay(100);
                  cpumask_set_cpu(smp_processor_id(), &dead_events_mask);
                  atomic_dec(&watchdog_cpus);
    ...
  }

  void hardlockup_detector_perf_cleanup(void)
  {
    ...
                          perf_event_release_kernel(event);
                  per_cpu(dead_event, cpu) = NULL;
          }
  +       mdelay(100);
          cpumask_clear(&dead_events_mask);
  }

Then, simultaneously performing CPU on/off and switching watchdog, it is
almost certain to reproduce this leak.

The problem here is that releasing perf_event is not within the CPU
hotplug read-write lock. The commit 941154bd6937
("watchdog/hardlockup/perf: Prevent CPU hotplug deadlock") introduced
deferred release to solve the deadlock caused by calling
get_online_cpus() when releasing perf_event. Later, the commit
efe951d3de91 ("perf/x86: Fix perf,x86,cpuhp deadlock") removed the
get_online_cpus() call on the perf_event release path to solve another
deadlock problem.

Therefore, it is now possible to move the release of perf_event back
into the CPU hotplug read-write lock, and release the event immediately
after disabling it.

Fixes: 941154bd6937 ("watchdog/hardlockup/perf: Prevent CPU hotplug deadlock")
Signed-off-by: Li Huafei <lihuafei1@huawei.com>
---
 include/linux/nmi.h    |  4 ----
 kernel/cpu.c           |  5 -----
 kernel/watchdog.c      | 25 -------------------------
 kernel/watchdog_perf.c | 28 +---------------------------
 4 files changed, 1 insertion(+), 61 deletions(-)

diff --git a/include/linux/nmi.h b/include/linux/nmi.h
index a8dfb38c9bb6..e78fa535f61d 100644
--- a/include/linux/nmi.h
+++ b/include/linux/nmi.h
@@ -17,7 +17,6 @@
 void lockup_detector_init(void);
 void lockup_detector_retry_init(void);
 void lockup_detector_soft_poweroff(void);
-void lockup_detector_cleanup(void);
 
 extern int watchdog_user_enabled;
 extern int watchdog_thresh;
@@ -37,7 +36,6 @@ extern int sysctl_hardlockup_all_cpu_backtrace;
 static inline void lockup_detector_init(void) { }
 static inline void lockup_detector_retry_init(void) { }
 static inline void lockup_detector_soft_poweroff(void) { }
-static inline void lockup_detector_cleanup(void) { }
 #endif /* !CONFIG_LOCKUP_DETECTOR */
 
 #ifdef CONFIG_SOFTLOCKUP_DETECTOR
@@ -104,12 +102,10 @@ void watchdog_hardlockup_check(unsigned int cpu, struct pt_regs *regs);
 #if defined(CONFIG_HARDLOCKUP_DETECTOR_PERF)
 extern void hardlockup_detector_perf_stop(void);
 extern void hardlockup_detector_perf_restart(void);
-extern void hardlockup_detector_perf_cleanup(void);
 extern void hardlockup_config_perf_event(const char *str);
 #else
 static inline void hardlockup_detector_perf_stop(void) { }
 static inline void hardlockup_detector_perf_restart(void) { }
-static inline void hardlockup_detector_perf_cleanup(void) { }
 static inline void hardlockup_config_perf_event(const char *str) { }
 #endif
 
diff --git a/kernel/cpu.c b/kernel/cpu.c
index d293d52a3e00..5dc197f5ca65 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -1452,11 +1452,6 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen,
 
 out:
 	cpus_write_unlock();
-	/*
-	 * Do post unplug cleanup. This is still protected against
-	 * concurrent CPU hotplug via cpu_add_remove_lock.
-	 */
-	lockup_detector_cleanup();
 	arch_smt_update();
 	return ret;
 }
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index 262691ba62b7..4dc72540c3b0 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -347,8 +347,6 @@ static int __init watchdog_thresh_setup(char *str)
 }
 __setup("watchdog_thresh=", watchdog_thresh_setup);
 
-static void __lockup_detector_cleanup(void);
-
 #ifdef CONFIG_SOFTLOCKUP_DETECTOR_INTR_STORM
 enum stats_per_group {
 	STATS_SYSTEM,
@@ -878,11 +876,6 @@ static void __lockup_detector_reconfigure(void)
 
 	watchdog_hardlockup_start();
 	cpus_read_unlock();
-	/*
-	 * Must be called outside the cpus locked section to prevent
-	 * recursive locking in the perf code.
-	 */
-	__lockup_detector_cleanup();
 }
 
 void lockup_detector_reconfigure(void)
@@ -932,24 +925,6 @@ static inline void lockup_detector_setup(void)
 }
 #endif /* !CONFIG_SOFTLOCKUP_DETECTOR */
 
-static void __lockup_detector_cleanup(void)
-{
-	lockdep_assert_held(&watchdog_mutex);
-	hardlockup_detector_perf_cleanup();
-}
-
-/**
- * lockup_detector_cleanup - Cleanup after cpu hotplug or sysctl changes
- *
- * Caller must not hold the cpu hotplug rwsem.
- */
-void lockup_detector_cleanup(void)
-{
-	mutex_lock(&watchdog_mutex);
-	__lockup_detector_cleanup();
-	mutex_unlock(&watchdog_mutex);
-}
-
 /**
  * lockup_detector_soft_poweroff - Interface to stop lockup detector(s)
  *
diff --git a/kernel/watchdog_perf.c b/kernel/watchdog_perf.c
index 59c1d86a73a2..2fdb96eaf493 100644
--- a/kernel/watchdog_perf.c
+++ b/kernel/watchdog_perf.c
@@ -21,8 +21,6 @@
 #include <linux/perf_event.h>
 
 static DEFINE_PER_CPU(struct perf_event *, watchdog_ev);
-static DEFINE_PER_CPU(struct perf_event *, dead_event);
-static struct cpumask dead_events_mask;
 
 static atomic_t watchdog_cpus = ATOMIC_INIT(0);
 
@@ -181,36 +179,12 @@ void watchdog_hardlockup_disable(unsigned int cpu)
 
 	if (event) {
 		perf_event_disable(event);
+		perf_event_release_kernel(event);
 		this_cpu_write(watchdog_ev, NULL);
-		this_cpu_write(dead_event, event);
-		cpumask_set_cpu(smp_processor_id(), &dead_events_mask);
 		atomic_dec(&watchdog_cpus);
 	}
 }
 
-/**
- * hardlockup_detector_perf_cleanup - Cleanup disabled events and destroy them
- *
- * Called from lockup_detector_cleanup(). Serialized by the caller.
- */
-void hardlockup_detector_perf_cleanup(void)
-{
-	int cpu;
-
-	for_each_cpu(cpu, &dead_events_mask) {
-		struct perf_event *event = per_cpu(dead_event, cpu);
-
-		/*
-		 * Required because for_each_cpu() reports  unconditionally
-		 * CPU0 as set on UP kernels. Sigh.
-		 */
-		if (event)
-			perf_event_release_kernel(event);
-		per_cpu(dead_event, cpu) = NULL;
-	}
-	cpumask_clear(&dead_events_mask);
-}
-
 /**
  * hardlockup_detector_perf_stop - Globally stop watchdog events
  *
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 5+ messages in thread

* [PATCH RESEND 2/2] watchdog/hardlockup/perf: Warn if watchdog_ev is overwritten
  2024-10-21 19:30 [PATCH RESEND 1/2] watchdog/hardlockup/perf: Fix perf_event memory leak Li Huafei
@ 2024-10-21 19:30 ` Li Huafei
  2025-03-06 11:19   ` [tip: perf/core] watchdog/hardlockup/perf: Warn if watchdog_ev is leaked tip-bot2 for Li Huafei
  2024-11-06 10:35 ` [PATCH RESEND 1/2] watchdog/hardlockup/perf: Fix perf_event memory leak Li Huafei
  2025-03-06 11:19 ` [tip: perf/core] " tip-bot2 for Li Huafei
  2 siblings, 1 reply; 5+ messages in thread
From: Li Huafei @ 2024-10-21 19:30 UTC (permalink / raw)
  To: tglx, peterz
  Cc: akpm, linux, song, dianders, j.granados, liusong, lizhe.67, yaoma,
	dzickus, mingo, linux-kernel, lihuafei1

When creating a new perf_event, it should not happen that the old
perf_event is not released. If it does, make a warning to sense the
problem in time.

Signed-off-by: Li Huafei <lihuafei1@huawei.com>
---
 kernel/watchdog_perf.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/kernel/watchdog_perf.c b/kernel/watchdog_perf.c
index 2fdb96eaf493..09236586b8c3 100644
--- a/kernel/watchdog_perf.c
+++ b/kernel/watchdog_perf.c
@@ -144,6 +144,7 @@ static int hardlockup_detector_event_create(void)
 			 PTR_ERR(evt));
 		return PTR_ERR(evt);
 	}
+	WARN_ON(this_cpu_read(watchdog_ev));
 	this_cpu_write(watchdog_ev, evt);
 	return 0;
 }
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 5+ messages in thread

* Re: [PATCH RESEND 1/2] watchdog/hardlockup/perf: Fix perf_event memory leak
  2024-10-21 19:30 [PATCH RESEND 1/2] watchdog/hardlockup/perf: Fix perf_event memory leak Li Huafei
  2024-10-21 19:30 ` [PATCH RESEND 2/2] watchdog/hardlockup/perf: Warn if watchdog_ev is overwritten Li Huafei
@ 2024-11-06 10:35 ` Li Huafei
  2025-03-06 11:19 ` [tip: perf/core] " tip-bot2 for Li Huafei
  2 siblings, 0 replies; 5+ messages in thread
From: Li Huafei @ 2024-11-06 10:35 UTC (permalink / raw)
  To: tglx, peterz
  Cc: akpm, linux, song, dianders, j.granados, liusong, lizhe.67, yaoma,
	dzickus, mingo, linux-kernel

Hi Thomas, can you help review my patch? Thank you very much :)


On 2024/10/22 3:30, Li Huafei wrote:
> During the stress test, we found a kmemleak report for perf_event:
> 
>   unreferenced object 0xff110001410a33e0 (size 1328):
>     comm "kworker/4:11", pid 288, jiffies 4294916004
>     hex dump (first 32 bytes):
>       b8 be c2 3b 02 00 11 ff 22 01 00 00 00 00 ad de  ...;....".......
>       f0 33 0a 41 01 00 11 ff f0 33 0a 41 01 00 11 ff  .3.A.....3.A....
>     backtrace (crc 24eb7b3a):
>       [<00000000e211b653>] kmem_cache_alloc_node_noprof+0x269/0x2e0
>       [<000000009d0985fa>] perf_event_alloc+0x5f/0xcf0
>       [<00000000084ad4a2>] perf_event_create_kernel_counter+0x38/0x1b0
>       [<00000000fde96401>] hardlockup_detector_event_create+0x50/0xe0
>       [<0000000051183158>] watchdog_hardlockup_enable+0x17/0x70
>       [<00000000ac89727f>] softlockup_start_fn+0x15/0x40
>       ...
> 
> Our stress test includes CPU online and offline, and updating the
> watchdog configuration.  After reading the code, I found that there may
> be a race between cleaning up perf_event after updating watchdog and
> disabling event when the CPU goes offline:
> 
>   CPU0                          CPU1                           CPU2
>   (update watchdog)                                            (hotplug offline CPU1)
> 
>   ...                                                          _cpu_down(CPU1)
>   cpus_read_lock()                                             // waiting for cpu lock
>     softlockup_start_all
>       smp_call_on_cpu(CPU1)
>                                 softlockup_start_fn
>                                 ...
>                                   watchdog_hardlockup_enable(CPU1)
>                                     perf create E1
>                                     watchdog_ev[CPU1] = E1
>   cpus_read_unlock()
>                                                                cpus_write_lock()
>                                                                cpuhp_kick_ap_work(CPU1)
>                                 cpuhp_thread_fun
>                                 ...
>                                   watchdog_hardlockup_disable(CPU1)
>                                     watchdog_ev[CPU1] = NULL
>                                     dead_event[CPU1] = E1
>   __lockup_detector_cleanup
>     for each dead_events_mask
>       release each dead_event
>       /*
>        * CPU1 has not been added to
>        * dead_events_mask, then E1
>        * will not be released
>        */
>                                     CPU1 -> dead_events_mask
>     cpumask_clear(&dead_events_mask)
>     // dead_events_mask is cleared, E1 is leaked
> 
> In this case, the leaked perf_event E1 matches the perf_event leak
> reported by kmemleak. Due to the low probability of problem recurrence
> (only reported once), I added some hack delays in the code:
> 
>   static void __lockup_detector_reconfigure(void)
>   {
>     ...
>           watchdog_hardlockup_start();
>           cpus_read_unlock();
>   +       mdelay(100);
>           /*
>            * Must be called outside the cpus locked section to prevent
>            * recursive locking in the perf code.
>     ...
>   }
> 
>   void watchdog_hardlockup_disable(unsigned int cpu)
>   {
>     ...
>                   perf_event_disable(event);
>                   this_cpu_write(watchdog_ev, NULL);
>                   this_cpu_write(dead_event, event);
>   +               mdelay(100);
>                   cpumask_set_cpu(smp_processor_id(), &dead_events_mask);
>                   atomic_dec(&watchdog_cpus);
>     ...
>   }
> 
>   void hardlockup_detector_perf_cleanup(void)
>   {
>     ...
>                           perf_event_release_kernel(event);
>                   per_cpu(dead_event, cpu) = NULL;
>           }
>   +       mdelay(100);
>           cpumask_clear(&dead_events_mask);
>   }
> 
> Then, simultaneously performing CPU on/off and switching watchdog, it is
> almost certain to reproduce this leak.
> 
> The problem here is that releasing perf_event is not within the CPU
> hotplug read-write lock. The commit 941154bd6937
> ("watchdog/hardlockup/perf: Prevent CPU hotplug deadlock") introduced
> deferred release to solve the deadlock caused by calling
> get_online_cpus() when releasing perf_event. Later, the commit
> efe951d3de91 ("perf/x86: Fix perf,x86,cpuhp deadlock") removed the
> get_online_cpus() call on the perf_event release path to solve another
> deadlock problem.
> 
> Therefore, it is now possible to move the release of perf_event back
> into the CPU hotplug read-write lock, and release the event immediately
> after disabling it.
> 
> Fixes: 941154bd6937 ("watchdog/hardlockup/perf: Prevent CPU hotplug deadlock")
> Signed-off-by: Li Huafei <lihuafei1@huawei.com>
> ---
>  include/linux/nmi.h    |  4 ----
>  kernel/cpu.c           |  5 -----
>  kernel/watchdog.c      | 25 -------------------------
>  kernel/watchdog_perf.c | 28 +---------------------------
>  4 files changed, 1 insertion(+), 61 deletions(-)
> 
> diff --git a/include/linux/nmi.h b/include/linux/nmi.h
> index a8dfb38c9bb6..e78fa535f61d 100644
> --- a/include/linux/nmi.h
> +++ b/include/linux/nmi.h
> @@ -17,7 +17,6 @@
>  void lockup_detector_init(void);
>  void lockup_detector_retry_init(void);
>  void lockup_detector_soft_poweroff(void);
> -void lockup_detector_cleanup(void);
>  
>  extern int watchdog_user_enabled;
>  extern int watchdog_thresh;
> @@ -37,7 +36,6 @@ extern int sysctl_hardlockup_all_cpu_backtrace;
>  static inline void lockup_detector_init(void) { }
>  static inline void lockup_detector_retry_init(void) { }
>  static inline void lockup_detector_soft_poweroff(void) { }
> -static inline void lockup_detector_cleanup(void) { }
>  #endif /* !CONFIG_LOCKUP_DETECTOR */
>  
>  #ifdef CONFIG_SOFTLOCKUP_DETECTOR
> @@ -104,12 +102,10 @@ void watchdog_hardlockup_check(unsigned int cpu, struct pt_regs *regs);
>  #if defined(CONFIG_HARDLOCKUP_DETECTOR_PERF)
>  extern void hardlockup_detector_perf_stop(void);
>  extern void hardlockup_detector_perf_restart(void);
> -extern void hardlockup_detector_perf_cleanup(void);
>  extern void hardlockup_config_perf_event(const char *str);
>  #else
>  static inline void hardlockup_detector_perf_stop(void) { }
>  static inline void hardlockup_detector_perf_restart(void) { }
> -static inline void hardlockup_detector_perf_cleanup(void) { }
>  static inline void hardlockup_config_perf_event(const char *str) { }
>  #endif
>  
> diff --git a/kernel/cpu.c b/kernel/cpu.c
> index d293d52a3e00..5dc197f5ca65 100644
> --- a/kernel/cpu.c
> +++ b/kernel/cpu.c
> @@ -1452,11 +1452,6 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen,
>  
>  out:
>  	cpus_write_unlock();
> -	/*
> -	 * Do post unplug cleanup. This is still protected against
> -	 * concurrent CPU hotplug via cpu_add_remove_lock.
> -	 */
> -	lockup_detector_cleanup();
>  	arch_smt_update();
>  	return ret;
>  }
> diff --git a/kernel/watchdog.c b/kernel/watchdog.c
> index 262691ba62b7..4dc72540c3b0 100644
> --- a/kernel/watchdog.c
> +++ b/kernel/watchdog.c
> @@ -347,8 +347,6 @@ static int __init watchdog_thresh_setup(char *str)
>  }
>  __setup("watchdog_thresh=", watchdog_thresh_setup);
>  
> -static void __lockup_detector_cleanup(void);
> -
>  #ifdef CONFIG_SOFTLOCKUP_DETECTOR_INTR_STORM
>  enum stats_per_group {
>  	STATS_SYSTEM,
> @@ -878,11 +876,6 @@ static void __lockup_detector_reconfigure(void)
>  
>  	watchdog_hardlockup_start();
>  	cpus_read_unlock();
> -	/*
> -	 * Must be called outside the cpus locked section to prevent
> -	 * recursive locking in the perf code.
> -	 */
> -	__lockup_detector_cleanup();
>  }
>  
>  void lockup_detector_reconfigure(void)
> @@ -932,24 +925,6 @@ static inline void lockup_detector_setup(void)
>  }
>  #endif /* !CONFIG_SOFTLOCKUP_DETECTOR */
>  
> -static void __lockup_detector_cleanup(void)
> -{
> -	lockdep_assert_held(&watchdog_mutex);
> -	hardlockup_detector_perf_cleanup();
> -}
> -
> -/**
> - * lockup_detector_cleanup - Cleanup after cpu hotplug or sysctl changes
> - *
> - * Caller must not hold the cpu hotplug rwsem.
> - */
> -void lockup_detector_cleanup(void)
> -{
> -	mutex_lock(&watchdog_mutex);
> -	__lockup_detector_cleanup();
> -	mutex_unlock(&watchdog_mutex);
> -}
> -
>  /**
>   * lockup_detector_soft_poweroff - Interface to stop lockup detector(s)
>   *
> diff --git a/kernel/watchdog_perf.c b/kernel/watchdog_perf.c
> index 59c1d86a73a2..2fdb96eaf493 100644
> --- a/kernel/watchdog_perf.c
> +++ b/kernel/watchdog_perf.c
> @@ -21,8 +21,6 @@
>  #include <linux/perf_event.h>
>  
>  static DEFINE_PER_CPU(struct perf_event *, watchdog_ev);
> -static DEFINE_PER_CPU(struct perf_event *, dead_event);
> -static struct cpumask dead_events_mask;
>  
>  static atomic_t watchdog_cpus = ATOMIC_INIT(0);
>  
> @@ -181,36 +179,12 @@ void watchdog_hardlockup_disable(unsigned int cpu)
>  
>  	if (event) {
>  		perf_event_disable(event);
> +		perf_event_release_kernel(event);
>  		this_cpu_write(watchdog_ev, NULL);
> -		this_cpu_write(dead_event, event);
> -		cpumask_set_cpu(smp_processor_id(), &dead_events_mask);
>  		atomic_dec(&watchdog_cpus);
>  	}
>  }
>  
> -/**
> - * hardlockup_detector_perf_cleanup - Cleanup disabled events and destroy them
> - *
> - * Called from lockup_detector_cleanup(). Serialized by the caller.
> - */
> -void hardlockup_detector_perf_cleanup(void)
> -{
> -	int cpu;
> -
> -	for_each_cpu(cpu, &dead_events_mask) {
> -		struct perf_event *event = per_cpu(dead_event, cpu);
> -
> -		/*
> -		 * Required because for_each_cpu() reports  unconditionally
> -		 * CPU0 as set on UP kernels. Sigh.
> -		 */
> -		if (event)
> -			perf_event_release_kernel(event);
> -		per_cpu(dead_event, cpu) = NULL;
> -	}
> -	cpumask_clear(&dead_events_mask);
> -}
> -
>  /**
>   * hardlockup_detector_perf_stop - Globally stop watchdog events
>   *
> 

^ permalink raw reply	[flat|nested] 5+ messages in thread

* [tip: perf/core] watchdog/hardlockup/perf: Warn if watchdog_ev is leaked
  2024-10-21 19:30 ` [PATCH RESEND 2/2] watchdog/hardlockup/perf: Warn if watchdog_ev is overwritten Li Huafei
@ 2025-03-06 11:19   ` tip-bot2 for Li Huafei
  0 siblings, 0 replies; 5+ messages in thread
From: tip-bot2 for Li Huafei @ 2025-03-06 11:19 UTC (permalink / raw)
  To: linux-tip-commits
  Cc: Li Huafei, Ingo Molnar, Thomas Gleixner, Peter Zijlstra, x86,
	linux-kernel

The following commit has been merged into the perf/core branch of tip:

Commit-ID:     05763885e327f0e257ee8b96b30ac1b95f7dd532
Gitweb:        https://git.kernel.org/tip/05763885e327f0e257ee8b96b30ac1b95f7dd532
Author:        Li Huafei <lihuafei1@huawei.com>
AuthorDate:    Tue, 22 Oct 2024 03:30:04 +08:00
Committer:     Ingo Molnar <mingo@kernel.org>
CommitterDate: Thu, 06 Mar 2025 12:07:39 +01:00

watchdog/hardlockup/perf: Warn if watchdog_ev is leaked

When creating a new perf_event for the hardlockup watchdog, it should not
happen that the old perf_event is not released.

Introduce a WARN_ONCE() that should never trigger.

[ mingo: Changed the type of the warning to WARN_ONCE(). ]

Signed-off-by: Li Huafei <lihuafei1@huawei.com>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Link: https://lore.kernel.org/r/20241021193004.308303-2-lihuafei1@huawei.com
---
 kernel/watchdog_perf.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/kernel/watchdog_perf.c b/kernel/watchdog_perf.c
index 2fdb96e..a78ff09 100644
--- a/kernel/watchdog_perf.c
+++ b/kernel/watchdog_perf.c
@@ -144,6 +144,7 @@ static int hardlockup_detector_event_create(void)
 			 PTR_ERR(evt));
 		return PTR_ERR(evt);
 	}
+	WARN_ONCE(this_cpu_read(watchdog_ev), "unexpected watchdog_ev leak");
 	this_cpu_write(watchdog_ev, evt);
 	return 0;
 }

^ permalink raw reply related	[flat|nested] 5+ messages in thread

* [tip: perf/core] watchdog/hardlockup/perf: Fix perf_event memory leak
  2024-10-21 19:30 [PATCH RESEND 1/2] watchdog/hardlockup/perf: Fix perf_event memory leak Li Huafei
  2024-10-21 19:30 ` [PATCH RESEND 2/2] watchdog/hardlockup/perf: Warn if watchdog_ev is overwritten Li Huafei
  2024-11-06 10:35 ` [PATCH RESEND 1/2] watchdog/hardlockup/perf: Fix perf_event memory leak Li Huafei
@ 2025-03-06 11:19 ` tip-bot2 for Li Huafei
  2 siblings, 0 replies; 5+ messages in thread
From: tip-bot2 for Li Huafei @ 2025-03-06 11:19 UTC (permalink / raw)
  To: linux-tip-commits
  Cc: Li Huafei, Ingo Molnar, Thomas Gleixner, Peter Zijlstra, x86,
	linux-kernel

The following commit has been merged into the perf/core branch of tip:

Commit-ID:     d6834d9c990333bfa433bc1816e2417f268eebbe
Gitweb:        https://git.kernel.org/tip/d6834d9c990333bfa433bc1816e2417f268eebbe
Author:        Li Huafei <lihuafei1@huawei.com>
AuthorDate:    Tue, 22 Oct 2024 03:30:03 +08:00
Committer:     Ingo Molnar <mingo@kernel.org>
CommitterDate: Thu, 06 Mar 2025 12:05:33 +01:00

watchdog/hardlockup/perf: Fix perf_event memory leak

During stress-testing, we found a kmemleak report for perf_event:

  unreferenced object 0xff110001410a33e0 (size 1328):
    comm "kworker/4:11", pid 288, jiffies 4294916004
    hex dump (first 32 bytes):
      b8 be c2 3b 02 00 11 ff 22 01 00 00 00 00 ad de  ...;....".......
      f0 33 0a 41 01 00 11 ff f0 33 0a 41 01 00 11 ff  .3.A.....3.A....
    backtrace (crc 24eb7b3a):
      [<00000000e211b653>] kmem_cache_alloc_node_noprof+0x269/0x2e0
      [<000000009d0985fa>] perf_event_alloc+0x5f/0xcf0
      [<00000000084ad4a2>] perf_event_create_kernel_counter+0x38/0x1b0
      [<00000000fde96401>] hardlockup_detector_event_create+0x50/0xe0
      [<0000000051183158>] watchdog_hardlockup_enable+0x17/0x70
      [<00000000ac89727f>] softlockup_start_fn+0x15/0x40
      ...

Our stress test includes CPU online and offline cycles, and updating the
watchdog configuration.

After reading the code, I found that there may be a race between cleaning up
perf_event after updating watchdog and disabling event when the CPU goes offline:

  CPU0                          CPU1                           CPU2
  (update watchdog)                                            (hotplug offline CPU1)

  ...                                                          _cpu_down(CPU1)
  cpus_read_lock()                                             // waiting for cpu lock
    softlockup_start_all
      smp_call_on_cpu(CPU1)
                                softlockup_start_fn
                                ...
                                  watchdog_hardlockup_enable(CPU1)
                                    perf create E1
                                    watchdog_ev[CPU1] = E1
  cpus_read_unlock()
                                                               cpus_write_lock()
                                                               cpuhp_kick_ap_work(CPU1)
                                cpuhp_thread_fun
                                ...
                                  watchdog_hardlockup_disable(CPU1)
                                    watchdog_ev[CPU1] = NULL
                                    dead_event[CPU1] = E1
  __lockup_detector_cleanup
    for each dead_events_mask
      release each dead_event
      /*
       * CPU1 has not been added to
       * dead_events_mask, then E1
       * will not be released
       */
                                    CPU1 -> dead_events_mask
    cpumask_clear(&dead_events_mask)
    // dead_events_mask is cleared, E1 is leaked

In this case, the leaked perf_event E1 matches the perf_event leak
reported by kmemleak. Due to the low probability of problem recurrence
(only reported once), I added some hack delays in the code:

  static void __lockup_detector_reconfigure(void)
  {
    ...
          watchdog_hardlockup_start();
          cpus_read_unlock();
  +       mdelay(100);
          /*
           * Must be called outside the cpus locked section to prevent
           * recursive locking in the perf code.
    ...
  }

  void watchdog_hardlockup_disable(unsigned int cpu)
  {
    ...
                  perf_event_disable(event);
                  this_cpu_write(watchdog_ev, NULL);
                  this_cpu_write(dead_event, event);
  +               mdelay(100);
                  cpumask_set_cpu(smp_processor_id(), &dead_events_mask);
                  atomic_dec(&watchdog_cpus);
    ...
  }

  void hardlockup_detector_perf_cleanup(void)
  {
    ...
                          perf_event_release_kernel(event);
                  per_cpu(dead_event, cpu) = NULL;
          }
  +       mdelay(100);
          cpumask_clear(&dead_events_mask);
  }

Then, simultaneously performing CPU on/off and switching watchdog, it is
almost certain to reproduce this leak.

The problem here is that releasing perf_event is not within the CPU
hotplug read-write lock. Commit:

  941154bd6937 ("watchdog/hardlockup/perf: Prevent CPU hotplug deadlock")

introduced deferred release to solve the deadlock caused by calling
get_online_cpus() when releasing perf_event. Later, commit:

  efe951d3de91 ("perf/x86: Fix perf,x86,cpuhp deadlock")

removed the get_online_cpus() call on the perf_event release path to solve
another deadlock problem.

Therefore, it is now possible to move the release of perf_event back
into the CPU hotplug read-write lock, and release the event immediately
after disabling it.

Fixes: 941154bd6937 ("watchdog/hardlockup/perf: Prevent CPU hotplug deadlock")
Signed-off-by: Li Huafei <lihuafei1@huawei.com>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Link: https://lore.kernel.org/r/20241021193004.308303-1-lihuafei1@huawei.com
---
 include/linux/nmi.h    |  4 ----
 kernel/cpu.c           |  5 -----
 kernel/watchdog.c      | 25 -------------------------
 kernel/watchdog_perf.c | 28 +---------------------------
 4 files changed, 1 insertion(+), 61 deletions(-)

diff --git a/include/linux/nmi.h b/include/linux/nmi.h
index a8dfb38..e78fa53 100644
--- a/include/linux/nmi.h
+++ b/include/linux/nmi.h
@@ -17,7 +17,6 @@
 void lockup_detector_init(void);
 void lockup_detector_retry_init(void);
 void lockup_detector_soft_poweroff(void);
-void lockup_detector_cleanup(void);
 
 extern int watchdog_user_enabled;
 extern int watchdog_thresh;
@@ -37,7 +36,6 @@ extern int sysctl_hardlockup_all_cpu_backtrace;
 static inline void lockup_detector_init(void) { }
 static inline void lockup_detector_retry_init(void) { }
 static inline void lockup_detector_soft_poweroff(void) { }
-static inline void lockup_detector_cleanup(void) { }
 #endif /* !CONFIG_LOCKUP_DETECTOR */
 
 #ifdef CONFIG_SOFTLOCKUP_DETECTOR
@@ -104,12 +102,10 @@ void watchdog_hardlockup_check(unsigned int cpu, struct pt_regs *regs);
 #if defined(CONFIG_HARDLOCKUP_DETECTOR_PERF)
 extern void hardlockup_detector_perf_stop(void);
 extern void hardlockup_detector_perf_restart(void);
-extern void hardlockup_detector_perf_cleanup(void);
 extern void hardlockup_config_perf_event(const char *str);
 #else
 static inline void hardlockup_detector_perf_stop(void) { }
 static inline void hardlockup_detector_perf_restart(void) { }
-static inline void hardlockup_detector_perf_cleanup(void) { }
 static inline void hardlockup_config_perf_event(const char *str) { }
 #endif
 
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 07455d2..ad755db 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -1453,11 +1453,6 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen,
 
 out:
 	cpus_write_unlock();
-	/*
-	 * Do post unplug cleanup. This is still protected against
-	 * concurrent CPU hotplug via cpu_add_remove_lock.
-	 */
-	lockup_detector_cleanup();
 	arch_smt_update();
 	return ret;
 }
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index b2da7de..1815602 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -347,8 +347,6 @@ static int __init watchdog_thresh_setup(char *str)
 }
 __setup("watchdog_thresh=", watchdog_thresh_setup);
 
-static void __lockup_detector_cleanup(void);
-
 #ifdef CONFIG_SOFTLOCKUP_DETECTOR_INTR_STORM
 enum stats_per_group {
 	STATS_SYSTEM,
@@ -886,11 +884,6 @@ static void __lockup_detector_reconfigure(void)
 
 	watchdog_hardlockup_start();
 	cpus_read_unlock();
-	/*
-	 * Must be called outside the cpus locked section to prevent
-	 * recursive locking in the perf code.
-	 */
-	__lockup_detector_cleanup();
 }
 
 void lockup_detector_reconfigure(void)
@@ -940,24 +933,6 @@ static inline void lockup_detector_setup(void)
 }
 #endif /* !CONFIG_SOFTLOCKUP_DETECTOR */
 
-static void __lockup_detector_cleanup(void)
-{
-	lockdep_assert_held(&watchdog_mutex);
-	hardlockup_detector_perf_cleanup();
-}
-
-/**
- * lockup_detector_cleanup - Cleanup after cpu hotplug or sysctl changes
- *
- * Caller must not hold the cpu hotplug rwsem.
- */
-void lockup_detector_cleanup(void)
-{
-	mutex_lock(&watchdog_mutex);
-	__lockup_detector_cleanup();
-	mutex_unlock(&watchdog_mutex);
-}
-
 /**
  * lockup_detector_soft_poweroff - Interface to stop lockup detector(s)
  *
diff --git a/kernel/watchdog_perf.c b/kernel/watchdog_perf.c
index 59c1d86..2fdb96e 100644
--- a/kernel/watchdog_perf.c
+++ b/kernel/watchdog_perf.c
@@ -21,8 +21,6 @@
 #include <linux/perf_event.h>
 
 static DEFINE_PER_CPU(struct perf_event *, watchdog_ev);
-static DEFINE_PER_CPU(struct perf_event *, dead_event);
-static struct cpumask dead_events_mask;
 
 static atomic_t watchdog_cpus = ATOMIC_INIT(0);
 
@@ -181,37 +179,13 @@ void watchdog_hardlockup_disable(unsigned int cpu)
 
 	if (event) {
 		perf_event_disable(event);
+		perf_event_release_kernel(event);
 		this_cpu_write(watchdog_ev, NULL);
-		this_cpu_write(dead_event, event);
-		cpumask_set_cpu(smp_processor_id(), &dead_events_mask);
 		atomic_dec(&watchdog_cpus);
 	}
 }
 
 /**
- * hardlockup_detector_perf_cleanup - Cleanup disabled events and destroy them
- *
- * Called from lockup_detector_cleanup(). Serialized by the caller.
- */
-void hardlockup_detector_perf_cleanup(void)
-{
-	int cpu;
-
-	for_each_cpu(cpu, &dead_events_mask) {
-		struct perf_event *event = per_cpu(dead_event, cpu);
-
-		/*
-		 * Required because for_each_cpu() reports  unconditionally
-		 * CPU0 as set on UP kernels. Sigh.
-		 */
-		if (event)
-			perf_event_release_kernel(event);
-		per_cpu(dead_event, cpu) = NULL;
-	}
-	cpumask_clear(&dead_events_mask);
-}
-
-/**
  * hardlockup_detector_perf_stop - Globally stop watchdog events
  *
  * Special interface for x86 to handle the perf HT bug.

^ permalink raw reply related	[flat|nested] 5+ messages in thread

end of thread, other threads:[~2025-03-06 11:19 UTC | newest]

Thread overview: 5+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2024-10-21 19:30 [PATCH RESEND 1/2] watchdog/hardlockup/perf: Fix perf_event memory leak Li Huafei
2024-10-21 19:30 ` [PATCH RESEND 2/2] watchdog/hardlockup/perf: Warn if watchdog_ev is overwritten Li Huafei
2025-03-06 11:19   ` [tip: perf/core] watchdog/hardlockup/perf: Warn if watchdog_ev is leaked tip-bot2 for Li Huafei
2024-11-06 10:35 ` [PATCH RESEND 1/2] watchdog/hardlockup/perf: Fix perf_event memory leak Li Huafei
2025-03-06 11:19 ` [tip: perf/core] " tip-bot2 for Li Huafei

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox