Linux Power Management development
 help / color / mirror / Atom feed
* [PATCH v2] thermal: use a custom lock class for intel x86_pkg_temp
@ 2025-06-24 13:24 Benjamin Berg
  2025-06-25  9:44 ` Hans de Goede
  0 siblings, 1 reply; 5+ messages in thread
From: Benjamin Berg @ 2025-06-24 13:24 UTC (permalink / raw)
  To: linux-pm; +Cc: Zhang Rui, Benjamin Berg, Hans de Goede

From: Benjamin Berg <benjamin.berg@intel.com>

The intel driver has code paths that will take the tz->lock while the
cpuhp_state-up lock is held. As the cpuhp_state-up lock is used in other
code paths, it may happen that lockdep detects possible deadlocks
through unrelated thermal zone devices.

Fix these false positives by using a separate lockdep class for the
x86_pkg_temp thermal device.

Reported-by: Hans de Goede <hansg@kernel.org>
Closes: https://lore.kernel.org/linux-pm/e9d7ef79-6a24-4515-aa35-d1f2357da798@kernel.org/
Signed-off-by: Benjamin Berg <benjamin.berg@intel.com>

---

Hi,

I believe that this should solve the lockdep warning that Hans was
seeing. That said, I have not tested it much.

v2:
- Fix function name
- Mark lock class variable static

Benjamin
---
 drivers/thermal/intel/x86_pkg_temp_thermal.c | 2 ++
 drivers/thermal/thermal_core.c               | 7 +++++++
 include/linux/thermal.h                      | 8 ++++++++
 3 files changed, 17 insertions(+)

diff --git a/drivers/thermal/intel/x86_pkg_temp_thermal.c b/drivers/thermal/intel/x86_pkg_temp_thermal.c
index 3fc679b6f11b..15d3c904eaa3 100644
--- a/drivers/thermal/intel/x86_pkg_temp_thermal.c
+++ b/drivers/thermal/intel/x86_pkg_temp_thermal.c
@@ -310,6 +310,7 @@ static int pkg_temp_thermal_trips_init(int cpu, int tj_max,
 
 static int pkg_temp_thermal_device_add(unsigned int cpu)
 {
+	static struct lock_class_key x86_pkg_temp_class;
 	struct thermal_trip trips[MAX_NUMBER_OF_TRIPS] = { 0 };
 	int id = topology_logical_die_id(cpu);
 	u32 eax, ebx, ecx, edx;
@@ -349,6 +350,7 @@ static int pkg_temp_thermal_device_add(unsigned int cpu)
 		err = PTR_ERR(zonedev->tzone);
 		goto out_kfree_zonedev;
 	}
+	thermal_zone_device_set_lock_class(zonedev->tzone, &x86_pkg_temp_class);
 	err = thermal_zone_device_enable(zonedev->tzone);
 	if (err)
 		goto out_unregister_tz;
diff --git a/drivers/thermal/thermal_core.c b/drivers/thermal/thermal_core.c
index 17ca5c082643..ff5c2e01904a 100644
--- a/drivers/thermal/thermal_core.c
+++ b/drivers/thermal/thermal_core.c
@@ -1657,6 +1657,13 @@ struct thermal_zone_device *thermal_tripless_zone_device_register(
 }
 EXPORT_SYMBOL_GPL(thermal_tripless_zone_device_register);
 
+void thermal_zone_device_set_lock_class(struct thermal_zone_device *tz,
+					struct lock_class_key *lock_class)
+{
+	lockdep_set_class_and_name(&tz->lock, lock_class, tz->type);
+}
+EXPORT_SYMBOL_GPL(thermal_zone_device_set_lock_class);
+
 void *thermal_zone_device_priv(struct thermal_zone_device *tzd)
 {
 	return tzd->devdata;
diff --git a/include/linux/thermal.h b/include/linux/thermal.h
index 0b5ed6821080..3cb4cf60b66d 100644
--- a/include/linux/thermal.h
+++ b/include/linux/thermal.h
@@ -240,6 +240,9 @@ struct thermal_zone_device *thermal_tripless_zone_device_register(
 					const struct thermal_zone_device_ops *ops,
 					const struct thermal_zone_params *tzp);
 
+void thermal_zone_device_set_lock_class(struct thermal_zone_device *tz,
+					struct lock_class_key *lock_class);
+
 void thermal_zone_device_unregister(struct thermal_zone_device *tz);
 
 void *thermal_zone_device_priv(struct thermal_zone_device *tzd);
@@ -290,6 +293,11 @@ static inline struct thermal_zone_device *thermal_tripless_zone_device_register(
 					const struct thermal_zone_params *tzp)
 { return ERR_PTR(-ENODEV); }
 
+static inline void
+thermal_zone_device_set_lock_class(struct thermal_zone_device *tz,
+				   struct lock_class_key *lock_class)
+{ }
+
 static inline void thermal_zone_device_unregister(struct thermal_zone_device *tz)
 { }
 
-- 
2.50.0


^ permalink raw reply related	[flat|nested] 5+ messages in thread

* Re: [PATCH v2] thermal: use a custom lock class for intel x86_pkg_temp
  2025-06-24 13:24 [PATCH v2] thermal: use a custom lock class for intel x86_pkg_temp Benjamin Berg
@ 2025-06-25  9:44 ` Hans de Goede
  2025-06-25 10:15   ` Berg, Benjamin
  0 siblings, 1 reply; 5+ messages in thread
From: Hans de Goede @ 2025-06-25  9:44 UTC (permalink / raw)
  To: Benjamin Berg, linux-pm; +Cc: Zhang Rui, Benjamin Berg

Hi Benjamin,

On 24-Jun-25 3:24 PM, Benjamin Berg wrote:
> From: Benjamin Berg <benjamin.berg@intel.com>
> 
> The intel driver has code paths that will take the tz->lock while the
> cpuhp_state-up lock is held. As the cpuhp_state-up lock is used in other
> code paths, it may happen that lockdep detects possible deadlocks
> through unrelated thermal zone devices.
> 
> Fix these false positives by using a separate lockdep class for the
> x86_pkg_temp thermal device.
> 
> Reported-by: Hans de Goede <hansg@kernel.org>
> Closes: https://lore.kernel.org/linux-pm/e9d7ef79-6a24-4515-aa35-d1f2357da798@kernel.org/
> Signed-off-by: Benjamin Berg <benjamin.berg@intel.com>

Thank you for trying to fix this, the fix does get rid of cpuhp_state-up
in the lockdep report, only for it to be replaced with the thermal_list_lock
unfortunately (full lockdep report below).

It seems that the real issue is trying to lock wiphy.mtx while holding
tz->lock.

Or maybe we need this patch + a patch to not hold wiphy.mtx while
registering the thermalzone ?

Regards,

Hans






[   19.759432] ======================================================
[   19.759434] WARNING: possible circular locking dependency detected
[   19.759436] 6.16.0-rc3+ #11 Not tainted
[   19.759439] ------------------------------------------------------
[   19.759440] modprobe/881 is trying to acquire lock:
[   19.759442] ffff897b4c940768 (&rdev->wiphy.mtx){+.+.}-{4:4}, at: iwl_mld_tzone_get_temp+0x2f/0x1d0 [iwlmld]
[   19.759485] 
               but task is already holding lock:
[   19.759486] ffff897b0204e708 (&tz->lock){+.+.}-{4:4}, at: thermal_zone_device_set_mode+0x20/0xa0
[   19.759498] 
               which lock already depends on the new lock.

[   19.759500] 
               the existing dependency chain (in reverse order) is:
[   19.759501] 
               -> #5 (&tz->lock){+.+.}-{4:4}:
[   19.759506]        __mutex_lock+0x9f/0xed0
[   19.759513]        thermal_zone_device_register_with_trips+0x573/0x640
[   19.759517]        thermal_tripless_zone_device_register+0x1b/0x30
[   19.759521]        int3400_thermal_probe+0x21d/0x4f0 [int3400_thermal]
[   19.759526]        platform_probe+0x3f/0xa0
[   19.759531]        really_probe+0xdb/0x340
[   19.759535]        __driver_probe_device+0x78/0x140
[   19.759539]        driver_probe_device+0x1f/0xa0
[   19.759542]        __driver_attach+0xcb/0x1e0
[   19.759546]        bus_for_each_dev+0x63/0xa0
[   19.759549]        bus_add_driver+0x10a/0x1f0
[   19.759552]        driver_register+0x71/0xe0
[   19.759556]        do_one_initcall+0x54/0x390
[   19.759561]        do_init_module+0x62/0x240
[   19.759565]        __do_sys_init_module+0x164/0x190
[   19.759568]        do_syscall_64+0x94/0x3d0
[   19.759571]        entry_SYSCALL_64_after_hwframe+0x76/0x7e
[   19.759574] 
               -> #4 (thermal_list_lock){+.+.}-{4:4}:
[   19.759578]        __mutex_lock+0x9f/0xed0
[   19.759581]        __thermal_cooling_device_register.part.0+0x16e/0x2a0
[   19.759585]        acpi_processor_thermal_init+0x24/0xb0
[   19.759591]        acpi_soft_cpu_online+0xa7/0x140
[   19.759593]        cpuhp_invoke_callback+0x1ab/0x660
[   19.759599]        cpuhp_thread_fun+0x187/0x270
[   19.759602]        smpboot_thread_fn+0x12a/0x2e0
[   19.759607]        kthread+0x108/0x240
[   19.759612]        ret_from_fork+0x232/0x2a0
[   19.759617]        ret_from_fork_asm+0x1a/0x30
[   19.759621] 
               -> #3 (cpuhp_state-up){+.+.}-{0:0}:
[   19.759625]        cpuhp_thread_fun+0x99/0x270
[   19.759628]        smpboot_thread_fn+0x12a/0x2e0
[   19.759631]        kthread+0x108/0x240
[   19.759635]        ret_from_fork+0x232/0x2a0
[   19.759638]        ret_from_fork_asm+0x1a/0x30
[   19.759642] 
               -> #2 (cpu_hotplug_lock){++++}-{0:0}:
[   19.759645]        cpus_read_lock+0x3c/0xe0
[   19.759649]        static_key_slow_inc+0x12/0x30
[   19.759656]        __nf_register_net_hook+0xb7/0x210
[   19.759662]        nf_register_net_hook+0x2d/0x90
[   19.759665]        nf_tables_addchain.constprop.0+0x2dd/0x6f0 [nf_tables]
[   19.759686]        nf_tables_newchain+0x78f/0xb10 [nf_tables]
[   19.759701]        nfnetlink_rcv_batch+0x7a5/0xc50 [nfnetlink]
[   19.759705]        nfnetlink_rcv+0x12d/0x150 [nfnetlink]
[   19.759708]        netlink_unicast+0x1bf/0x2b0
[   19.759712]        netlink_sendmsg+0x211/0x430
[   19.759714]        ____sys_sendmsg+0x373/0x3b0
[   19.759719]        ___sys_sendmsg+0x7d/0xc0
[   19.759722]        __sys_sendmsg+0x5e/0xb0
[   19.759724]        do_syscall_64+0x94/0x3d0
[   19.759727]        entry_SYSCALL_64_after_hwframe+0x76/0x7e
[   19.759729] 
               -> #1 (&nft_net->commit_mutex){+.+.}-{4:4}:
[   19.759733]        __mutex_lock+0x9f/0xed0
[   19.759735]        nf_tables_netdev_event+0x59/0xc0 [nf_tables]
[   19.759751]        notifier_call_chain+0x3d/0x100
[   19.759755]        register_netdevice+0x731/0x8f0
[   19.759759]        cfg80211_register_netdevice+0x4c/0xf0 [cfg80211]
[   19.759828]        ieee80211_if_add+0x475/0x740 [mac80211]
[   19.759921]        ieee80211_register_hw+0xd6b/0xdb0 [mac80211]
[   19.759967]        iwl_op_mode_mld_start+0x438/0x4b0 [iwlmld]
[   19.759989]        _iwl_op_mode_start+0x67/0x100 [iwlwifi]
[   19.760012]        iwl_opmode_register+0x6b/0xc0 [iwlwifi]
[   19.760028]        iwl_mld_init+0x19/0xff0 [iwlmld]
[   19.760043]        do_one_initcall+0x54/0x390
[   19.760046]        do_init_module+0x62/0x240
[   19.760049]        __do_sys_init_module+0x164/0x190
[   19.760052]        do_syscall_64+0x94/0x3d0
[   19.760055]        entry_SYSCALL_64_after_hwframe+0x76/0x7e
[   19.760057] 
               -> #0 (&rdev->wiphy.mtx){+.+.}-{4:4}:
[   19.760061]        __lock_acquire+0x1481/0x2270
[   19.760067]        lock_acquire+0xc9/0x2c0
[   19.760070]        __mutex_lock+0x9f/0xed0
[   19.760074]        iwl_mld_tzone_get_temp+0x2f/0x1d0 [iwlmld]
[   19.760098]        __thermal_zone_get_temp+0x29/0x90
[   19.760102]        __thermal_zone_device_update+0x69/0x480
[   19.760106]        thermal_zone_device_set_mode+0x52/0xa0
[   19.760110]        iwl_mld_thermal_zone_register+0x144/0x1d0 [iwlmld]
[   19.760126]        iwl_op_mode_mld_start+0x460/0x4b0 [iwlmld]
[   19.760140]        _iwl_op_mode_start+0x67/0x100 [iwlwifi]
[   19.760157]        iwl_opmode_register+0x6b/0xc0 [iwlwifi]
[   19.760173]        iwl_mld_init+0x19/0xff0 [iwlmld]
[   19.760187]        do_one_initcall+0x54/0x390
[   19.760190]        do_init_module+0x62/0x240
[   19.760193]        __do_sys_init_module+0x164/0x190
[   19.760196]        do_syscall_64+0x94/0x3d0
[   19.760198]        entry_SYSCALL_64_after_hwframe+0x76/0x7e
[   19.760201] 
               other info that might help us debug this:

[   19.760202] Chain exists of:
                 &rdev->wiphy.mtx --> thermal_list_lock --> &tz->lock

[   19.760208]  Possible unsafe locking scenario:

[   19.760209]        CPU0                    CPU1
[   19.760211]        ----                    ----
[   19.760212]   lock(&tz->lock);
[   19.760215]                                lock(thermal_list_lock);
[   19.760218]                                lock(&tz->lock);
[   19.760220]   lock(&rdev->wiphy.mtx);
[   19.760223] 
                *** DEADLOCK ***

[   19.760224] 2 locks held by modprobe/881:
[   19.760227]  #0: ffffffffc1914c68 (iwlwifi_opmode_table_mtx){+.+.}-{4:4}, at: iwl_opmode_register+0x21/0xc0 [iwlwifi]
[   19.760247]  #1: ffff897b0204e708 (&tz->lock){+.+.}-{4:4}, at: thermal_zone_device_set_mode+0x20/0xa0
[   19.760255] 
               stack backtrace:
[   19.760259] CPU: 11 UID: 0 PID: 881 Comm: modprobe Not tainted 6.16.0-rc3+ #11 PREEMPT(lazy) 
[   19.760262] Hardware name: Dell Inc. XPS 16 9640/09CK4V, BIOS 1.12.0 02/10/2025
[   19.760264] Call Trace:
[   19.760266]  <TASK>
[   19.760267]  dump_stack_lvl+0x68/0x90
[   19.760271]  print_circular_bug.cold+0x185/0x1d0
[   19.760276]  check_noncircular+0x10f/0x130
[   19.760279]  ? __kernel_text_address+0xe/0x30
[   19.760282]  ? unwind_get_return_address+0x26/0x50
[   19.760287]  __lock_acquire+0x1481/0x2270
[   19.760293]  lock_acquire+0xc9/0x2c0
[   19.760296]  ? iwl_mld_tzone_get_temp+0x2f/0x1d0 [iwlmld]
[   19.760314]  __mutex_lock+0x9f/0xed0
[   19.760317]  ? iwl_mld_tzone_get_temp+0x2f/0x1d0 [iwlmld]
[   19.760332]  ? iwl_mld_tzone_get_temp+0x2f/0x1d0 [iwlmld]
[   19.760345]  ? lock_acquire+0xc9/0x2c0
[   19.760348]  ? thermal_zone_device_set_mode+0x20/0xa0
[   19.760352]  ? lock_acquire+0xd9/0x2c0
[   19.760356]  ? iwl_mld_tzone_get_temp+0x2f/0x1d0 [iwlmld]
[   19.760368]  iwl_mld_tzone_get_temp+0x2f/0x1d0 [iwlmld]
[   19.760382]  ? lock_is_held_type+0xd5/0x140
[   19.760386]  __thermal_zone_get_temp+0x29/0x90
[   19.760389]  __thermal_zone_device_update+0x69/0x480
[   19.760395]  thermal_zone_device_set_mode+0x52/0xa0
[   19.760399]  iwl_mld_thermal_zone_register+0x144/0x1d0 [iwlmld]
[   19.760416]  iwl_op_mode_mld_start+0x460/0x4b0 [iwlmld]
[   19.760435]  _iwl_op_mode_start+0x67/0x100 [iwlwifi]
[   19.760453]  iwl_opmode_register+0x6b/0xc0 [iwlwifi]
[   19.760468]  ? __pfx_iwl_mld_init+0x10/0x10 [iwlmld]
[   19.760482]  iwl_mld_init+0x19/0xff0 [iwlmld]
[   19.760495]  do_one_initcall+0x54/0x390
[   19.760499]  do_init_module+0x62/0x240
[   19.760502]  ? __do_sys_init_module+0x164/0x190
[   19.760504]  __do_sys_init_module+0x164/0x190
[   19.760510]  do_syscall_64+0x94/0x3d0
[   19.760513]  ? lock_acquire+0xc9/0x2c0
[   19.760516]  ? __folio_batch_add_and_move+0x8f/0x2f0
[   19.760519]  ? lock_acquire+0xd9/0x2c0
[   19.760522]  ? find_held_lock+0x2b/0x80
[   19.760524]  ? find_held_lock+0x2b/0x80
[   19.760526]  ? find_held_lock+0x2b/0x80
[   19.760529]  ? rcu_read_unlock+0x17/0x60
[   19.760533]  ? lock_release+0x1a0/0x2d0
[   19.760537]  ? __lock_acquire+0x45f/0x2270
[   19.760541]  ? __handle_mm_fault+0xaf4/0xe20
[   19.760545]  ? lock_acquire+0xc9/0x2c0
[   19.760549]  ? find_held_lock+0x2b/0x80
[   19.760551]  ? rcu_read_unlock+0x17/0x60
[   19.760553]  ? lock_release+0x1a0/0x2d0
[   19.760556]  ? find_held_lock+0x2b/0x80
[   19.760559]  ? exc_page_fault+0x8c/0x240
[   19.760561]  ? lock_release+0x1a0/0x2d0
[   19.760564]  ? do_user_addr_fault+0x370/0x6b0
[   19.760568]  entry_SYSCALL_64_after_hwframe+0x76/0x7e
[   19.760571] RIP: 0033:0x7f531b702bae
[   19.760574] Code: 48 8b 0d 5d 32 0f 00 f7 d8 64 89 01 48 83 c8 ff c3 66 2e 0f 1f 84 00 00 00 00 00 90 f3 0f 1e fa 49 89 ca b8 af 00 00 00 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 8b 0d 2a 32 0f 00 f7 d8 64 89 01 48
[   19.760576] RSP: 002b:00007ffe4c645798 EFLAGS: 00000246 ORIG_RAX: 00000000000000af
[   19.760580] RAX: ffffffffffffffda RBX: 00005600c715be60 RCX: 00007f531b702bae
[   19.760581] RDX: 00005600a44015ee RSI: 00000000000cbf71 RDI: 00005600c7d0acd0
[   19.760583] RBP: 00007ffe4c645850 R08: 00005600c715bd40 R09: 00007f531b7f6ac0
[   19.760584] R10: 00005600c715b010 R11: 0000000000000246 R12: 0000000000040000
[   19.760585] R13: 00005600c715bdc0 R14: 00005600a44015ee R15: 0000000000000000
[   19.760590]  </TASK>
[   19.760846] iwlwifi 0000:03:00.0: Registered PHC clock: iwlwifi-PTP, with index: 0


Regards,

Hans




> 
> ---
> 
> Hi,
> 
> I believe that this should solve the lockdep warning that Hans was
> seeing. That said, I have not tested it much.
> 
> v2:
> - Fix function name
> - Mark lock class variable static
> 
> Benjamin
> ---
>  drivers/thermal/intel/x86_pkg_temp_thermal.c | 2 ++
>  drivers/thermal/thermal_core.c               | 7 +++++++
>  include/linux/thermal.h                      | 8 ++++++++
>  3 files changed, 17 insertions(+)
> 
> diff --git a/drivers/thermal/intel/x86_pkg_temp_thermal.c b/drivers/thermal/intel/x86_pkg_temp_thermal.c
> index 3fc679b6f11b..15d3c904eaa3 100644
> --- a/drivers/thermal/intel/x86_pkg_temp_thermal.c
> +++ b/drivers/thermal/intel/x86_pkg_temp_thermal.c
> @@ -310,6 +310,7 @@ static int pkg_temp_thermal_trips_init(int cpu, int tj_max,
>  
>  static int pkg_temp_thermal_device_add(unsigned int cpu)
>  {
> +	static struct lock_class_key x86_pkg_temp_class;
>  	struct thermal_trip trips[MAX_NUMBER_OF_TRIPS] = { 0 };
>  	int id = topology_logical_die_id(cpu);
>  	u32 eax, ebx, ecx, edx;
> @@ -349,6 +350,7 @@ static int pkg_temp_thermal_device_add(unsigned int cpu)
>  		err = PTR_ERR(zonedev->tzone);
>  		goto out_kfree_zonedev;
>  	}
> +	thermal_zone_device_set_lock_class(zonedev->tzone, &x86_pkg_temp_class);
>  	err = thermal_zone_device_enable(zonedev->tzone);
>  	if (err)
>  		goto out_unregister_tz;
> diff --git a/drivers/thermal/thermal_core.c b/drivers/thermal/thermal_core.c
> index 17ca5c082643..ff5c2e01904a 100644
> --- a/drivers/thermal/thermal_core.c
> +++ b/drivers/thermal/thermal_core.c
> @@ -1657,6 +1657,13 @@ struct thermal_zone_device *thermal_tripless_zone_device_register(
>  }
>  EXPORT_SYMBOL_GPL(thermal_tripless_zone_device_register);
>  
> +void thermal_zone_device_set_lock_class(struct thermal_zone_device *tz,
> +					struct lock_class_key *lock_class)
> +{
> +	lockdep_set_class_and_name(&tz->lock, lock_class, tz->type);
> +}
> +EXPORT_SYMBOL_GPL(thermal_zone_device_set_lock_class);
> +
>  void *thermal_zone_device_priv(struct thermal_zone_device *tzd)
>  {
>  	return tzd->devdata;
> diff --git a/include/linux/thermal.h b/include/linux/thermal.h
> index 0b5ed6821080..3cb4cf60b66d 100644
> --- a/include/linux/thermal.h
> +++ b/include/linux/thermal.h
> @@ -240,6 +240,9 @@ struct thermal_zone_device *thermal_tripless_zone_device_register(
>  					const struct thermal_zone_device_ops *ops,
>  					const struct thermal_zone_params *tzp);
>  
> +void thermal_zone_device_set_lock_class(struct thermal_zone_device *tz,
> +					struct lock_class_key *lock_class);
> +
>  void thermal_zone_device_unregister(struct thermal_zone_device *tz);
>  
>  void *thermal_zone_device_priv(struct thermal_zone_device *tzd);
> @@ -290,6 +293,11 @@ static inline struct thermal_zone_device *thermal_tripless_zone_device_register(
>  					const struct thermal_zone_params *tzp)
>  { return ERR_PTR(-ENODEV); }
>  
> +static inline void
> +thermal_zone_device_set_lock_class(struct thermal_zone_device *tz,
> +				   struct lock_class_key *lock_class)
> +{ }
> +
>  static inline void thermal_zone_device_unregister(struct thermal_zone_device *tz)
>  { }
>  


^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: [PATCH v2] thermal: use a custom lock class for intel x86_pkg_temp
  2025-06-25  9:44 ` Hans de Goede
@ 2025-06-25 10:15   ` Berg, Benjamin
  2025-06-25 11:34     ` Hans de Goede
  2025-06-25 13:23     ` Hans de Goede
  0 siblings, 2 replies; 5+ messages in thread
From: Berg, Benjamin @ 2025-06-25 10:15 UTC (permalink / raw)
  To: hansg@kernel.org, linux-pm@vger.kernel.org; +Cc: Zhang, Rui

On Wed, 2025-06-25 at 11:44 +0200, Hans de Goede wrote:
> Hi Benjamin,
> 
> On 24-Jun-25 3:24 PM, Benjamin Berg wrote:
> > From: Benjamin Berg <benjamin.berg@intel.com>
> > 
> > The intel driver has code paths that will take the tz->lock while the
> > cpuhp_state-up lock is held. As the cpuhp_state-up lock is used in other
> > code paths, it may happen that lockdep detects possible deadlocks
> > through unrelated thermal zone devices.
> > 
> > Fix these false positives by using a separate lockdep class for the
> > x86_pkg_temp thermal device.
> > 
> > Reported-by: Hans de Goede <hansg@kernel.org>
> > Closes: https://lore.kernel.org/linux-pm/e9d7ef79-6a24-4515-aa35-d1f2357da798@kernel.org/
> > Signed-off-by: Benjamin Berg <benjamin.berg@intel.com>
> 
> Thank you for trying to fix this, the fix does get rid of cpuhp_state-up
> in the lockdep report, only for it to be replaced with the thermal_list_lock
> unfortunately (full lockdep report below).
> 
> It seems that the real issue is trying to lock wiphy.mtx while holding
> tz->lock.
> 
> Or maybe we need this patch + a patch to not hold wiphy.mtx while
> registering the thermalzone ?

I think my patch simply did not work, because
thermal_zone_device_register_with_trips is already taking &tz->lock
before the lock class is changed.

What might work is to instead pass the lock class in when registering
the thermal zone device. i.e. just add a new helper with an additional
argument for the lock class and set it right after the mutex_init call
for &tz->lock.

Benjamin

> [   19.759432] ======================================================
> [   19.759434] WARNING: possible circular locking dependency detected
> [   19.759436] 6.16.0-rc3+ #11 Not tainted
> [   19.759439] ------------------------------------------------------
> [   19.759440] modprobe/881 is trying to acquire lock:
> [   19.759442] ffff897b4c940768 (&rdev->wiphy.mtx){+.+.}-{4:4}, at: iwl_mld_tzone_get_temp+0x2f/0x1d0 [iwlmld]
> [   19.759485] 
>                but task is already holding lock:
> [   19.759486] ffff897b0204e708 (&tz->lock){+.+.}-{4:4}, at: thermal_zone_device_set_mode+0x20/0xa0
> [   19.759498] 
>                which lock already depends on the new lock.
> 
> [   19.759500] 
>                the existing dependency chain (in reverse order) is:
> [   19.759501] 
>                -> #5 (&tz->lock){+.+.}-{4:4}:
> [   19.759506]        __mutex_lock+0x9f/0xed0
> [   19.759513]        thermal_zone_device_register_with_trips+0x573/0x640
> [   19.759517]        thermal_tripless_zone_device_register+0x1b/0x30
> [   19.759521]        int3400_thermal_probe+0x21d/0x4f0 [int3400_thermal]
> [   19.759526]        platform_probe+0x3f/0xa0
> [   19.759531]        really_probe+0xdb/0x340
> [   19.759535]        __driver_probe_device+0x78/0x140
> [   19.759539]        driver_probe_device+0x1f/0xa0
> [   19.759542]        __driver_attach+0xcb/0x1e0
> [   19.759546]        bus_for_each_dev+0x63/0xa0
> [   19.759549]        bus_add_driver+0x10a/0x1f0
> [   19.759552]        driver_register+0x71/0xe0
> [   19.759556]        do_one_initcall+0x54/0x390
> [   19.759561]        do_init_module+0x62/0x240
> [   19.759565]        __do_sys_init_module+0x164/0x190
> [   19.759568]        do_syscall_64+0x94/0x3d0
> [   19.759571]        entry_SYSCALL_64_after_hwframe+0x76/0x7e
> [   19.759574] 
>                -> #4 (thermal_list_lock){+.+.}-{4:4}:
> [   19.759578]        __mutex_lock+0x9f/0xed0
> [   19.759581]        __thermal_cooling_device_register.part.0+0x16e/0x2a0
> [   19.759585]        acpi_processor_thermal_init+0x24/0xb0
> [   19.759591]        acpi_soft_cpu_online+0xa7/0x140
> [   19.759593]        cpuhp_invoke_callback+0x1ab/0x660
> [   19.759599]        cpuhp_thread_fun+0x187/0x270
> [   19.759602]        smpboot_thread_fn+0x12a/0x2e0
> [   19.759607]        kthread+0x108/0x240
> [   19.759612]        ret_from_fork+0x232/0x2a0
> [   19.759617]        ret_from_fork_asm+0x1a/0x30
> [   19.759621] 
>                -> #3 (cpuhp_state-up){+.+.}-{0:0}:
> [   19.759625]        cpuhp_thread_fun+0x99/0x270
> [   19.759628]        smpboot_thread_fn+0x12a/0x2e0
> [   19.759631]        kthread+0x108/0x240
> [   19.759635]        ret_from_fork+0x232/0x2a0
> [   19.759638]        ret_from_fork_asm+0x1a/0x30
> [   19.759642] 
>                -> #2 (cpu_hotplug_lock){++++}-{0:0}:
> [   19.759645]        cpus_read_lock+0x3c/0xe0
> [   19.759649]        static_key_slow_inc+0x12/0x30
> [   19.759656]        __nf_register_net_hook+0xb7/0x210
> [   19.759662]        nf_register_net_hook+0x2d/0x90
> [   19.759665]        nf_tables_addchain.constprop.0+0x2dd/0x6f0 [nf_tables]
> [   19.759686]        nf_tables_newchain+0x78f/0xb10 [nf_tables]
> [   19.759701]        nfnetlink_rcv_batch+0x7a5/0xc50 [nfnetlink]
> [   19.759705]        nfnetlink_rcv+0x12d/0x150 [nfnetlink]
> [   19.759708]        netlink_unicast+0x1bf/0x2b0
> [   19.759712]        netlink_sendmsg+0x211/0x430
> [   19.759714]        ____sys_sendmsg+0x373/0x3b0
> [   19.759719]        ___sys_sendmsg+0x7d/0xc0
> [   19.759722]        __sys_sendmsg+0x5e/0xb0
> [   19.759724]        do_syscall_64+0x94/0x3d0
> [   19.759727]        entry_SYSCALL_64_after_hwframe+0x76/0x7e
> [   19.759729] 
>                -> #1 (&nft_net->commit_mutex){+.+.}-{4:4}:
> [   19.759733]        __mutex_lock+0x9f/0xed0
> [   19.759735]        nf_tables_netdev_event+0x59/0xc0 [nf_tables]
> [   19.759751]        notifier_call_chain+0x3d/0x100
> [   19.759755]        register_netdevice+0x731/0x8f0
> [   19.759759]        cfg80211_register_netdevice+0x4c/0xf0 [cfg80211]
> [   19.759828]        ieee80211_if_add+0x475/0x740 [mac80211]
> [   19.759921]        ieee80211_register_hw+0xd6b/0xdb0 [mac80211]
> [   19.759967]        iwl_op_mode_mld_start+0x438/0x4b0 [iwlmld]
> [   19.759989]        _iwl_op_mode_start+0x67/0x100 [iwlwifi]
> [   19.760012]        iwl_opmode_register+0x6b/0xc0 [iwlwifi]
> [   19.760028]        iwl_mld_init+0x19/0xff0 [iwlmld]
> [   19.760043]        do_one_initcall+0x54/0x390
> [   19.760046]        do_init_module+0x62/0x240
> [   19.760049]        __do_sys_init_module+0x164/0x190
> [   19.760052]        do_syscall_64+0x94/0x3d0
> [   19.760055]        entry_SYSCALL_64_after_hwframe+0x76/0x7e
> [   19.760057] 
>                -> #0 (&rdev->wiphy.mtx){+.+.}-{4:4}:
> [   19.760061]        __lock_acquire+0x1481/0x2270
> [   19.760067]        lock_acquire+0xc9/0x2c0
> [   19.760070]        __mutex_lock+0x9f/0xed0
> [   19.760074]        iwl_mld_tzone_get_temp+0x2f/0x1d0 [iwlmld]
> [   19.760098]        __thermal_zone_get_temp+0x29/0x90
> [   19.760102]        __thermal_zone_device_update+0x69/0x480
> [   19.760106]        thermal_zone_device_set_mode+0x52/0xa0
> [   19.760110]        iwl_mld_thermal_zone_register+0x144/0x1d0 [iwlmld]
> [   19.760126]        iwl_op_mode_mld_start+0x460/0x4b0 [iwlmld]
> [   19.760140]        _iwl_op_mode_start+0x67/0x100 [iwlwifi]
> [   19.760157]        iwl_opmode_register+0x6b/0xc0 [iwlwifi]
> [   19.760173]        iwl_mld_init+0x19/0xff0 [iwlmld]
> [   19.760187]        do_one_initcall+0x54/0x390
> [   19.760190]        do_init_module+0x62/0x240
> [   19.760193]        __do_sys_init_module+0x164/0x190
> [   19.760196]        do_syscall_64+0x94/0x3d0
> [   19.760198]        entry_SYSCALL_64_after_hwframe+0x76/0x7e
> [   19.760201] 
>                other info that might help us debug this:
> 
> [   19.760202] Chain exists of:
>                  &rdev->wiphy.mtx --> thermal_list_lock --> &tz->lock
> 
> [   19.760208]  Possible unsafe locking scenario:
> 
> [   19.760209]        CPU0                    CPU1
> [   19.760211]        ----                    ----
> [   19.760212]   lock(&tz->lock);
> [   19.760215]                                lock(thermal_list_lock);
> [   19.760218]                                lock(&tz->lock);
> [   19.760220]   lock(&rdev->wiphy.mtx);
> [   19.760223] 
>                 *** DEADLOCK ***
> 
> [   19.760224] 2 locks held by modprobe/881:
> [   19.760227]  #0: ffffffffc1914c68 (iwlwifi_opmode_table_mtx){+.+.}-{4:4}, at: iwl_opmode_register+0x21/0xc0 [iwlwifi]
> [   19.760247]  #1: ffff897b0204e708 (&tz->lock){+.+.}-{4:4}, at: thermal_zone_device_set_mode+0x20/0xa0
> [   19.760255] 
>                stack backtrace:
> [   19.760259] CPU: 11 UID: 0 PID: 881 Comm: modprobe Not tainted 6.16.0-rc3+ #11 PREEMPT(lazy) 
> [   19.760262] Hardware name: Dell Inc. XPS 16 9640/09CK4V, BIOS 1.12.0 02/10/2025
> [   19.760264] Call Trace:
> [   19.760266]  <TASK>
> [   19.760267]  dump_stack_lvl+0x68/0x90
> [   19.760271]  print_circular_bug.cold+0x185/0x1d0
> [   19.760276]  check_noncircular+0x10f/0x130
> [   19.760279]  ? __kernel_text_address+0xe/0x30
> [   19.760282]  ? unwind_get_return_address+0x26/0x50
> [   19.760287]  __lock_acquire+0x1481/0x2270
> [   19.760293]  lock_acquire+0xc9/0x2c0
> [   19.760296]  ? iwl_mld_tzone_get_temp+0x2f/0x1d0 [iwlmld]
> [   19.760314]  __mutex_lock+0x9f/0xed0
> [   19.760317]  ? iwl_mld_tzone_get_temp+0x2f/0x1d0 [iwlmld]
> [   19.760332]  ? iwl_mld_tzone_get_temp+0x2f/0x1d0 [iwlmld]
> [   19.760345]  ? lock_acquire+0xc9/0x2c0
> [   19.760348]  ? thermal_zone_device_set_mode+0x20/0xa0
> [   19.760352]  ? lock_acquire+0xd9/0x2c0
> [   19.760356]  ? iwl_mld_tzone_get_temp+0x2f/0x1d0 [iwlmld]
> [   19.760368]  iwl_mld_tzone_get_temp+0x2f/0x1d0 [iwlmld]
> [   19.760382]  ? lock_is_held_type+0xd5/0x140
> [   19.760386]  __thermal_zone_get_temp+0x29/0x90
> [   19.760389]  __thermal_zone_device_update+0x69/0x480
> [   19.760395]  thermal_zone_device_set_mode+0x52/0xa0
> [   19.760399]  iwl_mld_thermal_zone_register+0x144/0x1d0 [iwlmld]
> [   19.760416]  iwl_op_mode_mld_start+0x460/0x4b0 [iwlmld]
> [   19.760435]  _iwl_op_mode_start+0x67/0x100 [iwlwifi]
> [   19.760453]  iwl_opmode_register+0x6b/0xc0 [iwlwifi]
> [   19.760468]  ? __pfx_iwl_mld_init+0x10/0x10 [iwlmld]
> [   19.760482]  iwl_mld_init+0x19/0xff0 [iwlmld]
> [   19.760495]  do_one_initcall+0x54/0x390
> [   19.760499]  do_init_module+0x62/0x240
> [   19.760502]  ? __do_sys_init_module+0x164/0x190
> [   19.760504]  __do_sys_init_module+0x164/0x190
> [   19.760510]  do_syscall_64+0x94/0x3d0
> [   19.760513]  ? lock_acquire+0xc9/0x2c0
> [   19.760516]  ? __folio_batch_add_and_move+0x8f/0x2f0
> [   19.760519]  ? lock_acquire+0xd9/0x2c0
> [   19.760522]  ? find_held_lock+0x2b/0x80
> [   19.760524]  ? find_held_lock+0x2b/0x80
> [   19.760526]  ? find_held_lock+0x2b/0x80
> [   19.760529]  ? rcu_read_unlock+0x17/0x60
> [   19.760533]  ? lock_release+0x1a0/0x2d0
> [   19.760537]  ? __lock_acquire+0x45f/0x2270
> [   19.760541]  ? __handle_mm_fault+0xaf4/0xe20
> [   19.760545]  ? lock_acquire+0xc9/0x2c0
> [   19.760549]  ? find_held_lock+0x2b/0x80
> [   19.760551]  ? rcu_read_unlock+0x17/0x60
> [   19.760553]  ? lock_release+0x1a0/0x2d0
> [   19.760556]  ? find_held_lock+0x2b/0x80
> [   19.760559]  ? exc_page_fault+0x8c/0x240
> [   19.760561]  ? lock_release+0x1a0/0x2d0
> [   19.760564]  ? do_user_addr_fault+0x370/0x6b0
> [   19.760568]  entry_SYSCALL_64_after_hwframe+0x76/0x7e
> [   19.760571] RIP: 0033:0x7f531b702bae
> [   19.760574] Code: 48 8b 0d 5d 32 0f 00 f7 d8 64 89 01 48 83 c8 ff c3 66 2e 0f 1f 84 00 00 00 00 00 90 f3 0f 1e fa 49 89 ca b8 af 00 00 00 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 8b 0d 2a 32 0f 00 f7 d8 64 89 01 48
> [   19.760576] RSP: 002b:00007ffe4c645798 EFLAGS: 00000246 ORIG_RAX: 00000000000000af
> [   19.760580] RAX: ffffffffffffffda RBX: 00005600c715be60 RCX: 00007f531b702bae
> [   19.760581] RDX: 00005600a44015ee RSI: 00000000000cbf71 RDI: 00005600c7d0acd0
> [   19.760583] RBP: 00007ffe4c645850 R08: 00005600c715bd40 R09: 00007f531b7f6ac0
> [   19.760584] R10: 00005600c715b010 R11: 0000000000000246 R12: 0000000000040000
> [   19.760585] R13: 00005600c715bdc0 R14: 00005600a44015ee R15: 0000000000000000
> [   19.760590]  </TASK>
> [   19.760846] iwlwifi 0000:03:00.0: Registered PHC clock: iwlwifi-PTP, with index: 0
> 
> 
> Regards,
> 
> Hans
> 
> 
> 
> 
> > 
> > ---
> > 
> > Hi,
> > 
> > I believe that this should solve the lockdep warning that Hans was
> > seeing. That said, I have not tested it much.
> > 
> > v2:
> > - Fix function name
> > - Mark lock class variable static
> > 
> > Benjamin
> > ---
> >  drivers/thermal/intel/x86_pkg_temp_thermal.c | 2 ++
> >  drivers/thermal/thermal_core.c               | 7 +++++++
> >  include/linux/thermal.h                      | 8 ++++++++
> >  3 files changed, 17 insertions(+)
> > 
> > diff --git a/drivers/thermal/intel/x86_pkg_temp_thermal.c b/drivers/thermal/intel/x86_pkg_temp_thermal.c
> > index 3fc679b6f11b..15d3c904eaa3 100644
> > --- a/drivers/thermal/intel/x86_pkg_temp_thermal.c
> > +++ b/drivers/thermal/intel/x86_pkg_temp_thermal.c
> > @@ -310,6 +310,7 @@ static int pkg_temp_thermal_trips_init(int cpu, int tj_max,
> >  
> >  static int pkg_temp_thermal_device_add(unsigned int cpu)
> >  {
> > +	static struct lock_class_key x86_pkg_temp_class;
> >  	struct thermal_trip trips[MAX_NUMBER_OF_TRIPS] = { 0 };
> >  	int id = topology_logical_die_id(cpu);
> >  	u32 eax, ebx, ecx, edx;
> > @@ -349,6 +350,7 @@ static int pkg_temp_thermal_device_add(unsigned int cpu)
> >  		err = PTR_ERR(zonedev->tzone);
> >  		goto out_kfree_zonedev;
> >  	}
> > +	thermal_zone_device_set_lock_class(zonedev->tzone, &x86_pkg_temp_class);
> >  	err = thermal_zone_device_enable(zonedev->tzone);
> >  	if (err)
> >  		goto out_unregister_tz;
> > diff --git a/drivers/thermal/thermal_core.c b/drivers/thermal/thermal_core.c
> > index 17ca5c082643..ff5c2e01904a 100644
> > --- a/drivers/thermal/thermal_core.c
> > +++ b/drivers/thermal/thermal_core.c
> > @@ -1657,6 +1657,13 @@ struct thermal_zone_device *thermal_tripless_zone_device_register(
> >  }
> >  EXPORT_SYMBOL_GPL(thermal_tripless_zone_device_register);
> >  
> > +void thermal_zone_device_set_lock_class(struct thermal_zone_device *tz,
> > +					struct lock_class_key *lock_class)
> > +{
> > +	lockdep_set_class_and_name(&tz->lock, lock_class, tz->type);
> > +}
> > +EXPORT_SYMBOL_GPL(thermal_zone_device_set_lock_class);
> > +
> >  void *thermal_zone_device_priv(struct thermal_zone_device *tzd)
> >  {
> >  	return tzd->devdata;
> > diff --git a/include/linux/thermal.h b/include/linux/thermal.h
> > index 0b5ed6821080..3cb4cf60b66d 100644
> > --- a/include/linux/thermal.h
> > +++ b/include/linux/thermal.h
> > @@ -240,6 +240,9 @@ struct thermal_zone_device *thermal_tripless_zone_device_register(
> >  					const struct thermal_zone_device_ops *ops,
> >  					const struct thermal_zone_params *tzp);
> >  
> > +void thermal_zone_device_set_lock_class(struct thermal_zone_device *tz,
> > +					struct lock_class_key *lock_class);
> > +
> >  void thermal_zone_device_unregister(struct thermal_zone_device *tz);
> >  
> >  void *thermal_zone_device_priv(struct thermal_zone_device *tzd);
> > @@ -290,6 +293,11 @@ static inline struct thermal_zone_device *thermal_tripless_zone_device_register(
> >  					const struct thermal_zone_params *tzp)
> >  { return ERR_PTR(-ENODEV); }
> >  
> > +static inline void
> > +thermal_zone_device_set_lock_class(struct thermal_zone_device *tz,
> > +				   struct lock_class_key *lock_class)
> > +{ }
> > +
> >  static inline void thermal_zone_device_unregister(struct thermal_zone_device *tz)
> >  { }
> >  
> 

Intel Deutschland GmbH
Registered Address: Am Campeon 10, 85579 Neubiberg, Germany
Tel: +49 89 99 8853-0, www.intel.de
Managing Directors: Sean Fennelly, Jeffrey Schneiderman, Tiffany Doon Silva
Chairperson of the Supervisory Board: Nicole Lau
Registered Office: Munich
Commercial Register: Amtsgericht Muenchen HRB 186928

^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: [PATCH v2] thermal: use a custom lock class for intel x86_pkg_temp
  2025-06-25 10:15   ` Berg, Benjamin
@ 2025-06-25 11:34     ` Hans de Goede
  2025-06-25 13:23     ` Hans de Goede
  1 sibling, 0 replies; 5+ messages in thread
From: Hans de Goede @ 2025-06-25 11:34 UTC (permalink / raw)
  To: Berg, Benjamin, linux-pm@vger.kernel.org; +Cc: Zhang, Rui

Hi Benjamin,

On 25-Jun-25 12:15 PM, Berg, Benjamin wrote:
> On Wed, 2025-06-25 at 11:44 +0200, Hans de Goede wrote:
>> Hi Benjamin,
>>
>> On 24-Jun-25 3:24 PM, Benjamin Berg wrote:
>>> From: Benjamin Berg <benjamin.berg@intel.com>
>>>
>>> The intel driver has code paths that will take the tz->lock while the
>>> cpuhp_state-up lock is held. As the cpuhp_state-up lock is used in other
>>> code paths, it may happen that lockdep detects possible deadlocks
>>> through unrelated thermal zone devices.
>>>
>>> Fix these false positives by using a separate lockdep class for the
>>> x86_pkg_temp thermal device.
>>>
>>> Reported-by: Hans de Goede <hansg@kernel.org>
>>> Closes: https://lore.kernel.org/linux-pm/e9d7ef79-6a24-4515-aa35-d1f2357da798@kernel.org/
>>> Signed-off-by: Benjamin Berg <benjamin.berg@intel.com>
>>
>> Thank you for trying to fix this, the fix does get rid of cpuhp_state-up
>> in the lockdep report, only for it to be replaced with the thermal_list_lock
>> unfortunately (full lockdep report below).
>>
>> It seems that the real issue is trying to lock wiphy.mtx while holding
>> tz->lock.
>>
>> Or maybe we need this patch + a patch to not hold wiphy.mtx while
>> registering the thermalzone ?
> 
> I think my patch simply did not work, because
> thermal_zone_device_register_with_trips is already taking &tz->lock
> before the lock class is changed.
> 
> What might work is to instead pass the lock class in when registering
> the thermal zone device. i.e. just add a new helper with an additional
> argument for the lock class and set it right after the mutex_init call
> for &tz->lock.

Ok, I'll give this a try.

Regards,

Hans




>> [   19.759432] ======================================================
>> [   19.759434] WARNING: possible circular locking dependency detected
>> [   19.759436] 6.16.0-rc3+ #11 Not tainted
>> [   19.759439] ------------------------------------------------------
>> [   19.759440] modprobe/881 is trying to acquire lock:
>> [   19.759442] ffff897b4c940768 (&rdev->wiphy.mtx){+.+.}-{4:4}, at: iwl_mld_tzone_get_temp+0x2f/0x1d0 [iwlmld]
>> [   19.759485] 
>>                but task is already holding lock:
>> [   19.759486] ffff897b0204e708 (&tz->lock){+.+.}-{4:4}, at: thermal_zone_device_set_mode+0x20/0xa0
>> [   19.759498] 
>>                which lock already depends on the new lock.
>>
>> [   19.759500] 
>>                the existing dependency chain (in reverse order) is:
>> [   19.759501] 
>>                -> #5 (&tz->lock){+.+.}-{4:4}:
>> [   19.759506]        __mutex_lock+0x9f/0xed0
>> [   19.759513]        thermal_zone_device_register_with_trips+0x573/0x640
>> [   19.759517]        thermal_tripless_zone_device_register+0x1b/0x30
>> [   19.759521]        int3400_thermal_probe+0x21d/0x4f0 [int3400_thermal]
>> [   19.759526]        platform_probe+0x3f/0xa0
>> [   19.759531]        really_probe+0xdb/0x340
>> [   19.759535]        __driver_probe_device+0x78/0x140
>> [   19.759539]        driver_probe_device+0x1f/0xa0
>> [   19.759542]        __driver_attach+0xcb/0x1e0
>> [   19.759546]        bus_for_each_dev+0x63/0xa0
>> [   19.759549]        bus_add_driver+0x10a/0x1f0
>> [   19.759552]        driver_register+0x71/0xe0
>> [   19.759556]        do_one_initcall+0x54/0x390
>> [   19.759561]        do_init_module+0x62/0x240
>> [   19.759565]        __do_sys_init_module+0x164/0x190
>> [   19.759568]        do_syscall_64+0x94/0x3d0
>> [   19.759571]        entry_SYSCALL_64_after_hwframe+0x76/0x7e
>> [   19.759574] 
>>                -> #4 (thermal_list_lock){+.+.}-{4:4}:
>> [   19.759578]        __mutex_lock+0x9f/0xed0
>> [   19.759581]        __thermal_cooling_device_register.part.0+0x16e/0x2a0
>> [   19.759585]        acpi_processor_thermal_init+0x24/0xb0
>> [   19.759591]        acpi_soft_cpu_online+0xa7/0x140
>> [   19.759593]        cpuhp_invoke_callback+0x1ab/0x660
>> [   19.759599]        cpuhp_thread_fun+0x187/0x270
>> [   19.759602]        smpboot_thread_fn+0x12a/0x2e0
>> [   19.759607]        kthread+0x108/0x240
>> [   19.759612]        ret_from_fork+0x232/0x2a0
>> [   19.759617]        ret_from_fork_asm+0x1a/0x30
>> [   19.759621] 
>>                -> #3 (cpuhp_state-up){+.+.}-{0:0}:
>> [   19.759625]        cpuhp_thread_fun+0x99/0x270
>> [   19.759628]        smpboot_thread_fn+0x12a/0x2e0
>> [   19.759631]        kthread+0x108/0x240
>> [   19.759635]        ret_from_fork+0x232/0x2a0
>> [   19.759638]        ret_from_fork_asm+0x1a/0x30
>> [   19.759642] 
>>                -> #2 (cpu_hotplug_lock){++++}-{0:0}:
>> [   19.759645]        cpus_read_lock+0x3c/0xe0
>> [   19.759649]        static_key_slow_inc+0x12/0x30
>> [   19.759656]        __nf_register_net_hook+0xb7/0x210
>> [   19.759662]        nf_register_net_hook+0x2d/0x90
>> [   19.759665]        nf_tables_addchain.constprop.0+0x2dd/0x6f0 [nf_tables]
>> [   19.759686]        nf_tables_newchain+0x78f/0xb10 [nf_tables]
>> [   19.759701]        nfnetlink_rcv_batch+0x7a5/0xc50 [nfnetlink]
>> [   19.759705]        nfnetlink_rcv+0x12d/0x150 [nfnetlink]
>> [   19.759708]        netlink_unicast+0x1bf/0x2b0
>> [   19.759712]        netlink_sendmsg+0x211/0x430
>> [   19.759714]        ____sys_sendmsg+0x373/0x3b0
>> [   19.759719]        ___sys_sendmsg+0x7d/0xc0
>> [   19.759722]        __sys_sendmsg+0x5e/0xb0
>> [   19.759724]        do_syscall_64+0x94/0x3d0
>> [   19.759727]        entry_SYSCALL_64_after_hwframe+0x76/0x7e
>> [   19.759729] 
>>                -> #1 (&nft_net->commit_mutex){+.+.}-{4:4}:
>> [   19.759733]        __mutex_lock+0x9f/0xed0
>> [   19.759735]        nf_tables_netdev_event+0x59/0xc0 [nf_tables]
>> [   19.759751]        notifier_call_chain+0x3d/0x100
>> [   19.759755]        register_netdevice+0x731/0x8f0
>> [   19.759759]        cfg80211_register_netdevice+0x4c/0xf0 [cfg80211]
>> [   19.759828]        ieee80211_if_add+0x475/0x740 [mac80211]
>> [   19.759921]        ieee80211_register_hw+0xd6b/0xdb0 [mac80211]
>> [   19.759967]        iwl_op_mode_mld_start+0x438/0x4b0 [iwlmld]
>> [   19.759989]        _iwl_op_mode_start+0x67/0x100 [iwlwifi]
>> [   19.760012]        iwl_opmode_register+0x6b/0xc0 [iwlwifi]
>> [   19.760028]        iwl_mld_init+0x19/0xff0 [iwlmld]
>> [   19.760043]        do_one_initcall+0x54/0x390
>> [   19.760046]        do_init_module+0x62/0x240
>> [   19.760049]        __do_sys_init_module+0x164/0x190
>> [   19.760052]        do_syscall_64+0x94/0x3d0
>> [   19.760055]        entry_SYSCALL_64_after_hwframe+0x76/0x7e
>> [   19.760057] 
>>                -> #0 (&rdev->wiphy.mtx){+.+.}-{4:4}:
>> [   19.760061]        __lock_acquire+0x1481/0x2270
>> [   19.760067]        lock_acquire+0xc9/0x2c0
>> [   19.760070]        __mutex_lock+0x9f/0xed0
>> [   19.760074]        iwl_mld_tzone_get_temp+0x2f/0x1d0 [iwlmld]
>> [   19.760098]        __thermal_zone_get_temp+0x29/0x90
>> [   19.760102]        __thermal_zone_device_update+0x69/0x480
>> [   19.760106]        thermal_zone_device_set_mode+0x52/0xa0
>> [   19.760110]        iwl_mld_thermal_zone_register+0x144/0x1d0 [iwlmld]
>> [   19.760126]        iwl_op_mode_mld_start+0x460/0x4b0 [iwlmld]
>> [   19.760140]        _iwl_op_mode_start+0x67/0x100 [iwlwifi]
>> [   19.760157]        iwl_opmode_register+0x6b/0xc0 [iwlwifi]
>> [   19.760173]        iwl_mld_init+0x19/0xff0 [iwlmld]
>> [   19.760187]        do_one_initcall+0x54/0x390
>> [   19.760190]        do_init_module+0x62/0x240
>> [   19.760193]        __do_sys_init_module+0x164/0x190
>> [   19.760196]        do_syscall_64+0x94/0x3d0
>> [   19.760198]        entry_SYSCALL_64_after_hwframe+0x76/0x7e
>> [   19.760201] 
>>                other info that might help us debug this:
>>
>> [   19.760202] Chain exists of:
>>                  &rdev->wiphy.mtx --> thermal_list_lock --> &tz->lock
>>
>> [   19.760208]  Possible unsafe locking scenario:
>>
>> [   19.760209]        CPU0                    CPU1
>> [   19.760211]        ----                    ----
>> [   19.760212]   lock(&tz->lock);
>> [   19.760215]                                lock(thermal_list_lock);
>> [   19.760218]                                lock(&tz->lock);
>> [   19.760220]   lock(&rdev->wiphy.mtx);
>> [   19.760223] 
>>                 *** DEADLOCK ***
>>
>> [   19.760224] 2 locks held by modprobe/881:
>> [   19.760227]  #0: ffffffffc1914c68 (iwlwifi_opmode_table_mtx){+.+.}-{4:4}, at: iwl_opmode_register+0x21/0xc0 [iwlwifi]
>> [   19.760247]  #1: ffff897b0204e708 (&tz->lock){+.+.}-{4:4}, at: thermal_zone_device_set_mode+0x20/0xa0
>> [   19.760255] 
>>                stack backtrace:
>> [   19.760259] CPU: 11 UID: 0 PID: 881 Comm: modprobe Not tainted 6.16.0-rc3+ #11 PREEMPT(lazy) 
>> [   19.760262] Hardware name: Dell Inc. XPS 16 9640/09CK4V, BIOS 1.12.0 02/10/2025
>> [   19.760264] Call Trace:
>> [   19.760266]  <TASK>
>> [   19.760267]  dump_stack_lvl+0x68/0x90
>> [   19.760271]  print_circular_bug.cold+0x185/0x1d0
>> [   19.760276]  check_noncircular+0x10f/0x130
>> [   19.760279]  ? __kernel_text_address+0xe/0x30
>> [   19.760282]  ? unwind_get_return_address+0x26/0x50
>> [   19.760287]  __lock_acquire+0x1481/0x2270
>> [   19.760293]  lock_acquire+0xc9/0x2c0
>> [   19.760296]  ? iwl_mld_tzone_get_temp+0x2f/0x1d0 [iwlmld]
>> [   19.760314]  __mutex_lock+0x9f/0xed0
>> [   19.760317]  ? iwl_mld_tzone_get_temp+0x2f/0x1d0 [iwlmld]
>> [   19.760332]  ? iwl_mld_tzone_get_temp+0x2f/0x1d0 [iwlmld]
>> [   19.760345]  ? lock_acquire+0xc9/0x2c0
>> [   19.760348]  ? thermal_zone_device_set_mode+0x20/0xa0
>> [   19.760352]  ? lock_acquire+0xd9/0x2c0
>> [   19.760356]  ? iwl_mld_tzone_get_temp+0x2f/0x1d0 [iwlmld]
>> [   19.760368]  iwl_mld_tzone_get_temp+0x2f/0x1d0 [iwlmld]
>> [   19.760382]  ? lock_is_held_type+0xd5/0x140
>> [   19.760386]  __thermal_zone_get_temp+0x29/0x90
>> [   19.760389]  __thermal_zone_device_update+0x69/0x480
>> [   19.760395]  thermal_zone_device_set_mode+0x52/0xa0
>> [   19.760399]  iwl_mld_thermal_zone_register+0x144/0x1d0 [iwlmld]
>> [   19.760416]  iwl_op_mode_mld_start+0x460/0x4b0 [iwlmld]
>> [   19.760435]  _iwl_op_mode_start+0x67/0x100 [iwlwifi]
>> [   19.760453]  iwl_opmode_register+0x6b/0xc0 [iwlwifi]
>> [   19.760468]  ? __pfx_iwl_mld_init+0x10/0x10 [iwlmld]
>> [   19.760482]  iwl_mld_init+0x19/0xff0 [iwlmld]
>> [   19.760495]  do_one_initcall+0x54/0x390
>> [   19.760499]  do_init_module+0x62/0x240
>> [   19.760502]  ? __do_sys_init_module+0x164/0x190
>> [   19.760504]  __do_sys_init_module+0x164/0x190
>> [   19.760510]  do_syscall_64+0x94/0x3d0
>> [   19.760513]  ? lock_acquire+0xc9/0x2c0
>> [   19.760516]  ? __folio_batch_add_and_move+0x8f/0x2f0
>> [   19.760519]  ? lock_acquire+0xd9/0x2c0
>> [   19.760522]  ? find_held_lock+0x2b/0x80
>> [   19.760524]  ? find_held_lock+0x2b/0x80
>> [   19.760526]  ? find_held_lock+0x2b/0x80
>> [   19.760529]  ? rcu_read_unlock+0x17/0x60
>> [   19.760533]  ? lock_release+0x1a0/0x2d0
>> [   19.760537]  ? __lock_acquire+0x45f/0x2270
>> [   19.760541]  ? __handle_mm_fault+0xaf4/0xe20
>> [   19.760545]  ? lock_acquire+0xc9/0x2c0
>> [   19.760549]  ? find_held_lock+0x2b/0x80
>> [   19.760551]  ? rcu_read_unlock+0x17/0x60
>> [   19.760553]  ? lock_release+0x1a0/0x2d0
>> [   19.760556]  ? find_held_lock+0x2b/0x80
>> [   19.760559]  ? exc_page_fault+0x8c/0x240
>> [   19.760561]  ? lock_release+0x1a0/0x2d0
>> [   19.760564]  ? do_user_addr_fault+0x370/0x6b0
>> [   19.760568]  entry_SYSCALL_64_after_hwframe+0x76/0x7e
>> [   19.760571] RIP: 0033:0x7f531b702bae
>> [   19.760574] Code: 48 8b 0d 5d 32 0f 00 f7 d8 64 89 01 48 83 c8 ff c3 66 2e 0f 1f 84 00 00 00 00 00 90 f3 0f 1e fa 49 89 ca b8 af 00 00 00 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 8b 0d 2a 32 0f 00 f7 d8 64 89 01 48
>> [   19.760576] RSP: 002b:00007ffe4c645798 EFLAGS: 00000246 ORIG_RAX: 00000000000000af
>> [   19.760580] RAX: ffffffffffffffda RBX: 00005600c715be60 RCX: 00007f531b702bae
>> [   19.760581] RDX: 00005600a44015ee RSI: 00000000000cbf71 RDI: 00005600c7d0acd0
>> [   19.760583] RBP: 00007ffe4c645850 R08: 00005600c715bd40 R09: 00007f531b7f6ac0
>> [   19.760584] R10: 00005600c715b010 R11: 0000000000000246 R12: 0000000000040000
>> [   19.760585] R13: 00005600c715bdc0 R14: 00005600a44015ee R15: 0000000000000000
>> [   19.760590]  </TASK>
>> [   19.760846] iwlwifi 0000:03:00.0: Registered PHC clock: iwlwifi-PTP, with index: 0
>>
>>
>> Regards,
>>
>> Hans
>>
>>
>>
>>
>>>
>>> ---
>>>
>>> Hi,
>>>
>>> I believe that this should solve the lockdep warning that Hans was
>>> seeing. That said, I have not tested it much.
>>>
>>> v2:
>>> - Fix function name
>>> - Mark lock class variable static
>>>
>>> Benjamin
>>> ---
>>>  drivers/thermal/intel/x86_pkg_temp_thermal.c | 2 ++
>>>  drivers/thermal/thermal_core.c               | 7 +++++++
>>>  include/linux/thermal.h                      | 8 ++++++++
>>>  3 files changed, 17 insertions(+)
>>>
>>> diff --git a/drivers/thermal/intel/x86_pkg_temp_thermal.c b/drivers/thermal/intel/x86_pkg_temp_thermal.c
>>> index 3fc679b6f11b..15d3c904eaa3 100644
>>> --- a/drivers/thermal/intel/x86_pkg_temp_thermal.c
>>> +++ b/drivers/thermal/intel/x86_pkg_temp_thermal.c
>>> @@ -310,6 +310,7 @@ static int pkg_temp_thermal_trips_init(int cpu, int tj_max,
>>>  
>>>  static int pkg_temp_thermal_device_add(unsigned int cpu)
>>>  {
>>> +	static struct lock_class_key x86_pkg_temp_class;
>>>  	struct thermal_trip trips[MAX_NUMBER_OF_TRIPS] = { 0 };
>>>  	int id = topology_logical_die_id(cpu);
>>>  	u32 eax, ebx, ecx, edx;
>>> @@ -349,6 +350,7 @@ static int pkg_temp_thermal_device_add(unsigned int cpu)
>>>  		err = PTR_ERR(zonedev->tzone);
>>>  		goto out_kfree_zonedev;
>>>  	}
>>> +	thermal_zone_device_set_lock_class(zonedev->tzone, &x86_pkg_temp_class);
>>>  	err = thermal_zone_device_enable(zonedev->tzone);
>>>  	if (err)
>>>  		goto out_unregister_tz;
>>> diff --git a/drivers/thermal/thermal_core.c b/drivers/thermal/thermal_core.c
>>> index 17ca5c082643..ff5c2e01904a 100644
>>> --- a/drivers/thermal/thermal_core.c
>>> +++ b/drivers/thermal/thermal_core.c
>>> @@ -1657,6 +1657,13 @@ struct thermal_zone_device *thermal_tripless_zone_device_register(
>>>  }
>>>  EXPORT_SYMBOL_GPL(thermal_tripless_zone_device_register);
>>>  
>>> +void thermal_zone_device_set_lock_class(struct thermal_zone_device *tz,
>>> +					struct lock_class_key *lock_class)
>>> +{
>>> +	lockdep_set_class_and_name(&tz->lock, lock_class, tz->type);
>>> +}
>>> +EXPORT_SYMBOL_GPL(thermal_zone_device_set_lock_class);
>>> +
>>>  void *thermal_zone_device_priv(struct thermal_zone_device *tzd)
>>>  {
>>>  	return tzd->devdata;
>>> diff --git a/include/linux/thermal.h b/include/linux/thermal.h
>>> index 0b5ed6821080..3cb4cf60b66d 100644
>>> --- a/include/linux/thermal.h
>>> +++ b/include/linux/thermal.h
>>> @@ -240,6 +240,9 @@ struct thermal_zone_device *thermal_tripless_zone_device_register(
>>>  					const struct thermal_zone_device_ops *ops,
>>>  					const struct thermal_zone_params *tzp);
>>>  
>>> +void thermal_zone_device_set_lock_class(struct thermal_zone_device *tz,
>>> +					struct lock_class_key *lock_class);
>>> +
>>>  void thermal_zone_device_unregister(struct thermal_zone_device *tz);
>>>  
>>>  void *thermal_zone_device_priv(struct thermal_zone_device *tzd);
>>> @@ -290,6 +293,11 @@ static inline struct thermal_zone_device *thermal_tripless_zone_device_register(
>>>  					const struct thermal_zone_params *tzp)
>>>  { return ERR_PTR(-ENODEV); }
>>>  
>>> +static inline void
>>> +thermal_zone_device_set_lock_class(struct thermal_zone_device *tz,
>>> +				   struct lock_class_key *lock_class)
>>> +{ }
>>> +
>>>  static inline void thermal_zone_device_unregister(struct thermal_zone_device *tz)
>>>  { }
>>>  
>>
> 
> Intel Deutschland GmbH
> Registered Address: Am Campeon 10, 85579 Neubiberg, Germany
> Tel: +49 89 99 8853-0, www.intel.de
> Managing Directors: Sean Fennelly, Jeffrey Schneiderman, Tiffany Doon Silva
> Chairperson of the Supervisory Board: Nicole Lau
> Registered Office: Munich
> Commercial Register: Amtsgericht Muenchen HRB 186928


^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: [PATCH v2] thermal: use a custom lock class for intel x86_pkg_temp
  2025-06-25 10:15   ` Berg, Benjamin
  2025-06-25 11:34     ` Hans de Goede
@ 2025-06-25 13:23     ` Hans de Goede
  1 sibling, 0 replies; 5+ messages in thread
From: Hans de Goede @ 2025-06-25 13:23 UTC (permalink / raw)
  To: Berg, Benjamin, linux-pm@vger.kernel.org; +Cc: Zhang, Rui

[-- Attachment #1: Type: text/plain, Size: 11388 bytes --]

Hi Benjamin,

On 25-Jun-25 12:15 PM, Berg, Benjamin wrote:
> On Wed, 2025-06-25 at 11:44 +0200, Hans de Goede wrote:
>> Hi Benjamin,
>>
>> On 24-Jun-25 3:24 PM, Benjamin Berg wrote:
>>> From: Benjamin Berg <benjamin.berg@intel.com>
>>>
>>> The intel driver has code paths that will take the tz->lock while the
>>> cpuhp_state-up lock is held. As the cpuhp_state-up lock is used in other
>>> code paths, it may happen that lockdep detects possible deadlocks
>>> through unrelated thermal zone devices.
>>>
>>> Fix these false positives by using a separate lockdep class for the
>>> x86_pkg_temp thermal device.
>>>
>>> Reported-by: Hans de Goede <hansg@kernel.org>
>>> Closes: https://lore.kernel.org/linux-pm/e9d7ef79-6a24-4515-aa35-d1f2357da798@kernel.org/
>>> Signed-off-by: Benjamin Berg <benjamin.berg@intel.com>
>>
>> Thank you for trying to fix this, the fix does get rid of cpuhp_state-up
>> in the lockdep report, only for it to be replaced with the thermal_list_lock
>> unfortunately (full lockdep report below).
>>
>> It seems that the real issue is trying to lock wiphy.mtx while holding
>> tz->lock.
>>
>> Or maybe we need this patch + a patch to not hold wiphy.mtx while
>> registering the thermalzone ?
> 
> I think my patch simply did not work, because
> thermal_zone_device_register_with_trips is already taking &tz->lock
> before the lock class is changed.
> 
> What might work is to instead pass the lock class in when registering
> the thermal zone device. i.e. just add a new helper with an additional
> argument for the lock class and set it right after the mutex_init call
> for &tz->lock.

I wrote the attached patch to give each call-site (so each tz driver)
its own lock_class_key. Unfortunately this does not help and AFAICT
the lockdep report is unchanged from your v2 patch. I'll put the new
full lockdep report below.

Regards,

Hans


[   19.526864] ======================================================
[   19.526866] WARNING: possible circular locking dependency detected
[   19.526868] 6.16.0-rc3+ #17 Tainted: G        W          
[   19.526871] ------------------------------------------------------
[   19.526873] modprobe/893 is trying to acquire lock:
[   19.526875] ffff8c5c17570768 (&rdev->wiphy.mtx){+.+.}-{4:4}, at: iwl_mld_tzone_get_temp+0x2f/0x1d0 [iwlmld]
[   19.526915] 
               but task is already holding lock:
[   19.526916] ffff8c5bee900708 (&tz->lock#5){+.+.}-{4:4}, at: thermal_zone_device_set_mode+0x20/0xa0
[   19.526929] 
               which lock already depends on the new lock.

[   19.526931] 
               the existing dependency chain (in reverse order) is:
[   19.526932] 
               -> #5 (&tz->lock#5){+.+.}-{4:4}:
[   19.526938]        __mutex_lock+0x9f/0xed0
[   19.526944]        thermal_zone_device_register_with_trips_and_lockkey+0x571/0x640
[   19.526948]        iwl_mld_thermal_zone_register+0x130/0x1e0 [iwlmld]
[   19.526967]        iwl_op_mode_mld_start+0x460/0x4b0 [iwlmld]
[   19.526992]        _iwl_op_mode_start+0x67/0x100 [iwlwifi]
[   19.527013]        iwl_opmode_register+0x6b/0xc0 [iwlwifi]
[   19.527029]        iwl_mld_init+0x19/0xff0 [iwlmld]
[   19.527045]        do_one_initcall+0x54/0x390
[   19.527051]        do_init_module+0x62/0x240
[   19.527056]        __do_sys_init_module+0x164/0x190
[   19.527058]        do_syscall_64+0x94/0x3d0
[   19.527062]        entry_SYSCALL_64_after_hwframe+0x76/0x7e
[   19.527065] 
               -> #4 (thermal_list_lock){+.+.}-{4:4}:
[   19.527069]        __mutex_lock+0x9f/0xed0
[   19.527072]        __thermal_cooling_device_register.part.0+0x16e/0x2a0
[   19.527076]        acpi_processor_thermal_init+0x24/0xb0
[   19.527082]        acpi_soft_cpu_online+0xa7/0x140
[   19.527085]        cpuhp_invoke_callback+0x1ab/0x660
[   19.527090]        cpuhp_thread_fun+0x187/0x270
[   19.527094]        smpboot_thread_fn+0x12a/0x2e0
[   19.527099]        kthread+0x108/0x240
[   19.527103]        ret_from_fork+0x232/0x2a0
[   19.527109]        ret_from_fork_asm+0x1a/0x30
[   19.527113] 
               -> #3 (cpuhp_state-up){+.+.}-{0:0}:
[   19.527117]        cpuhp_thread_fun+0x99/0x270
[   19.527121]        smpboot_thread_fn+0x12a/0x2e0
[   19.527124]        kthread+0x108/0x240
[   19.527127]        ret_from_fork+0x232/0x2a0
[   19.527131]        ret_from_fork_asm+0x1a/0x30
[   19.527134] 
               -> #2 (cpu_hotplug_lock){++++}-{0:0}:
[   19.527138]        cpus_read_lock+0x3c/0xe0
[   19.527142]        static_key_slow_inc+0x12/0x30
[   19.527148]        __nf_register_net_hook+0xb7/0x210
[   19.527154]        nf_register_net_hook+0x2d/0x90
[   19.527157]        nf_tables_addchain.constprop.0+0x2dd/0x6f0 [nf_tables]
[   19.527180]        nf_tables_newchain+0x78f/0xb10 [nf_tables]
[   19.527194]        nfnetlink_rcv_batch+0x7a5/0xc50 [nfnetlink]
[   19.527199]        nfnetlink_rcv+0x12d/0x150 [nfnetlink]
[   19.527202]        netlink_unicast+0x1bf/0x2b0
[   19.527205]        netlink_sendmsg+0x211/0x430
[   19.527207]        ____sys_sendmsg+0x373/0x3b0
[   19.527212]        ___sys_sendmsg+0x7d/0xc0
[   19.527216]        __sys_sendmsg+0x5e/0xb0
[   19.527218]        do_syscall_64+0x94/0x3d0
[   19.527221]        entry_SYSCALL_64_after_hwframe+0x76/0x7e
[   19.527223] 
               -> #1 (&nft_net->commit_mutex){+.+.}-{4:4}:
[   19.527228]        __mutex_lock+0x9f/0xed0
[   19.527230]        nf_tables_netdev_event+0x59/0xc0 [nf_tables]
[   19.527246]        notifier_call_chain+0x3d/0x100
[   19.527250]        register_netdevice+0x731/0x8f0
[   19.527254]        cfg80211_register_netdevice+0x4c/0xf0 [cfg80211]
[   19.527318]        ieee80211_if_add+0x475/0x740 [mac80211]
[   19.527406]        ieee80211_register_hw+0xd6b/0xdb0 [mac80211]
[   19.527453]        iwl_op_mode_mld_start+0x438/0x4b0 [iwlmld]
[   19.527476]        _iwl_op_mode_start+0x67/0x100 [iwlwifi]
[   19.527494]        iwl_opmode_register+0x6b/0xc0 [iwlwifi]
[   19.527507] Adding alias for supply vdd-amp,(null) -> vdd-amp,sdw:0:0:01fa:4243:01
[   19.527510]        iwl_mld_init+0x19/0xff0 [iwlmld]
[   19.527526]        do_one_initcall+0x54/0x390
[   19.527530]        do_init_module+0x62/0x240
[   19.527533]        __do_sys_init_module+0x164/0x190
[   19.527535]        do_syscall_64+0x94/0x3d0
[   19.527538]        entry_SYSCALL_64_after_hwframe+0x76/0x7e
[   19.527540] 
               -> #0 (&rdev->wiphy.mtx){+.+.}-{4:4}:
[   19.527545]        __lock_acquire+0x1481/0x2270
[   19.527550]        lock_acquire+0xc9/0x2c0
[   19.527554]        __mutex_lock+0x9f/0xed0
[   19.527557]        iwl_mld_tzone_get_temp+0x2f/0x1d0 [iwlmld]
[   19.527580]        __thermal_zone_get_temp+0x29/0x90
[   19.527584]        __thermal_zone_device_update+0x69/0x480
[   19.527588]        thermal_zone_device_set_mode+0x52/0xa0
[   19.527592]        iwl_mld_thermal_zone_register+0x14b/0x1e0 [iwlmld]
[   19.527608]        iwl_op_mode_mld_start+0x460/0x4b0 [iwlmld]
[   19.527626]        _iwl_op_mode_start+0x67/0x100 [iwlwifi]
[   19.527642]        iwl_opmode_register+0x6b/0xc0 [iwlwifi]
[   19.527658]        iwl_mld_init+0x19/0xff0 [iwlmld]
[   19.527672]        do_one_initcall+0x54/0x390
[   19.527675]        do_init_module+0x62/0x240
[   19.527678]        __do_sys_init_module+0x164/0x190
[   19.527681]        do_syscall_64+0x94/0x3d0
[   19.527683]        entry_SYSCALL_64_after_hwframe+0x76/0x7e
[   19.527685] 
               other info that might help us debug this:

[   19.527687] Chain exists of:
                 &rdev->wiphy.mtx --> thermal_list_lock --> &tz->lock#5

[   19.527694]  Possible unsafe locking scenario:

[   19.527695]        CPU0                    CPU1
[   19.527697]        ----                    ----
[   19.527698]   lock(&tz->lock#5);
[   19.527701]                                lock(thermal_list_lock);
[   19.527704]                                lock(&tz->lock#5);
[   19.527707]   lock(&rdev->wiphy.mtx);
[   19.527710] 
                *** DEADLOCK ***

[   19.527711] 2 locks held by modprobe/893:
[   19.527714]  #0: ffffffffc1a47c68 (iwlwifi_opmode_table_mtx){+.+.}-{4:4}, at: iwl_opmode_register+0x21/0xc0 [iwlwifi]
[   19.527735]  #1: ffff8c5bee900708 (&tz->lock#5){+.+.}-{4:4}, at: thermal_zone_device_set_mode+0x20/0xa0
[   19.527743] 
               stack backtrace:
[   19.527747] CPU: 8 UID: 0 PID: 893 Comm: modprobe Tainted: G        W           6.16.0-rc3+ #17 PREEMPT(lazy) 
[   19.527751] Tainted: [W]=WARN
[   19.527752] Hardware name: Dell Inc. XPS 16 9640/09CK4V, BIOS 1.12.0 02/10/2025
[   19.527754] Call Trace:
[   19.527756]  <TASK>
[   19.527757]  dump_stack_lvl+0x68/0x90
[   19.527762]  print_circular_bug.cold+0x185/0x1d0
[   19.527767]  check_noncircular+0x10f/0x130
[   19.527770]  ? __kernel_text_address+0xe/0x30
[   19.527773]  ? unwind_get_return_address+0x26/0x50
[   19.527778]  __lock_acquire+0x1481/0x2270
[   19.527784]  lock_acquire+0xc9/0x2c0
[   19.527787]  ? iwl_mld_tzone_get_temp+0x2f/0x1d0 [iwlmld]
[   19.527806]  __mutex_lock+0x9f/0xed0
[   19.527809]  ? iwl_mld_tzone_get_temp+0x2f/0x1d0 [iwlmld]
[   19.527824]  ? iwl_mld_tzone_get_temp+0x2f/0x1d0 [iwlmld]
[   19.527839]  ? lock_acquire+0xc9/0x2c0
[   19.527843]  ? thermal_zone_device_set_mode+0x20/0xa0
[   19.527846]  ? lock_acquire+0xd9/0x2c0
[   19.527850]  ? iwl_mld_tzone_get_temp+0x2f/0x1d0 [iwlmld]
[   19.527864]  iwl_mld_tzone_get_temp+0x2f/0x1d0 [iwlmld]
[   19.527885]  ? lock_is_held_type+0xd5/0x140
[   19.527889]  __thermal_zone_get_temp+0x29/0x90
[   19.527892]  __thermal_zone_device_update+0x69/0x480
[   19.527898]  thermal_zone_device_set_mode+0x52/0xa0
[   19.527901]  iwl_mld_thermal_zone_register+0x14b/0x1e0 [iwlmld]
[   19.527920]  iwl_op_mode_mld_start+0x460/0x4b0 [iwlmld]
[   19.527943]  _iwl_op_mode_start+0x67/0x100 [iwlwifi]
[   19.527963]  iwl_opmode_register+0x6b/0xc0 [iwlwifi]
[   19.527979]  ? __pfx_iwl_mld_init+0x10/0x10 [iwlmld]
[   19.527994]  iwl_mld_init+0x19/0xff0 [iwlmld]
[   19.528008]  do_one_initcall+0x54/0x390
[   19.528013]  do_init_module+0x62/0x240
[   19.528015]  ? __do_sys_init_module+0x164/0x190
[   19.528018]  __do_sys_init_module+0x164/0x190
[   19.528025]  do_syscall_64+0x94/0x3d0
[   19.528028]  ? lock_acquire+0xc9/0x2c0
[   19.528032]  ? find_held_lock+0x2b/0x80
[   19.528035]  ? rcu_read_unlock+0x17/0x60
[   19.528040]  ? lock_release+0x1a0/0x2d0
[   19.528043]  ? find_held_lock+0x2b/0x80
[   19.528045]  ? exc_page_fault+0x8c/0x240
[   19.528047]  ? lock_release+0x1a0/0x2d0
[   19.528052]  ? do_user_addr_fault+0x370/0x6b0
[   19.528056]  entry_SYSCALL_64_after_hwframe+0x76/0x7e
[   19.528058] RIP: 0033:0x7f7cf7f02bae
[   19.528063] Code: 48 8b 0d 5d 32 0f 00 f7 d8 64 89 01 48 83 c8 ff c3 66 2e 0f 1f 84 00 00 00 00 00 90 f3 0f 1e fa 49 89 ca b8 af 00 00 00 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 8b 0d 2a 32 0f 00 f7 d8 64 89 01 48
[   19.528065] RSP: 002b:00007ffeadb3b848 EFLAGS: 00000246 ORIG_RAX: 00000000000000af
[   19.528068] RAX: ffffffffffffffda RBX: 00005560c13fae60 RCX: 00007f7cf7f02bae
[   19.528070] RDX: 00005560a17a75ee RSI: 00000000000cbfb9 RDI: 00005560c1fa9cd0
[   19.528071] RBP: 00007ffeadb3b900 R08: 00005560c13fad40 R09: 00007f7cf7ff6ac0
[   19.528072] R10: 00005560c13fa010 R11: 0000000000000246 R12: 0000000000040000
[   19.528073] R13: 00005560c13fadc0 R14: 00005560a17a75ee R15: 0000000000000000
[   19.528078]  </TASK>
[   19.528285] iwlwifi 0000:03:00.0: Registered PHC clock: iwlwifi-PTP, with index: 0





[-- Attachment #2: 0001-thermal-core-Give-each-thermal-zone-driver-its-own-l.patch --]
[-- Type: text/x-patch, Size: 6510 bytes --]

From abb5f1c30ae77381bc36f4e42241c9789309e028 Mon Sep 17 00:00:00 2001
From: Hans de Goede <hansg@kernel.org>
Date: Wed, 25 Jun 2025 13:40:19 +0200
Subject: [PATCH] thermal: core: Give each thermal-zone driver its own
 lock_class_key

Different thermal-zone devices sharing the same lock_class_key can lead
to false-positive lockdep reports.

Change thermal_zone_device_register_with_trips() into
thermal_zone_device_register_with_trips_and_lockkey() and make
thermal_zone_device_register_with_trips() a wrapper macro around
this including giving each thermal_zone_device_register_with_trips()
call-site their own lock_class_key.

Also turn thermal_tripless_zone_device_register() into a macro, so that its
callers also get their own lock_class_key.

Suggested-by: Benjamin Berg <benjamin.berg@intel.com>
Signed-off-by: Hans de Goede <hansg@kernel.org>
---
 drivers/thermal/thermal_core.c | 35 ++++++++++++-----------------
 include/linux/thermal.h        | 40 +++++++++++++++++++++++-----------
 2 files changed, 41 insertions(+), 34 deletions(-)

diff --git a/drivers/thermal/thermal_core.c b/drivers/thermal/thermal_core.c
index 17ca5c082643..003342c46628 100644
--- a/drivers/thermal/thermal_core.c
+++ b/drivers/thermal/thermal_core.c
@@ -1470,7 +1470,7 @@ static void thermal_zone_init_complete(struct thermal_zone_device *tz)
 }
 
 /**
- * thermal_zone_device_register_with_trips() - register a new thermal zone device
+ * thermal_zone_device_register_with_trips_and_lockkey() - register a new thermal zone device
  * @type:	the thermal zone device type
  * @trips:	a pointer to an array of thermal trips
  * @num_trips:	the number of trip points the thermal zone support
@@ -1482,6 +1482,9 @@ static void thermal_zone_init_complete(struct thermal_zone_device *tz)
  * @polling_delay: number of milliseconds to wait between polls when checking
  *		   whether trip points have been crossed (0 for interrupt
  *		   driven systems)
+ * @lockkey:	lock_class_key to use for this thermal zone's lock, note drivers
+ *		should use the thermal_zone_device_register_with_trips() macro
+ *		instead of specifying this
  *
  * This interface function adds a new thermal zone device (sensor) to
  * /sys/class/thermal folder as thermal_zone[0-*]. It tries to bind all the
@@ -1494,13 +1497,14 @@ static void thermal_zone_init_complete(struct thermal_zone_device *tz)
  * IS_ERR*() helpers.
  */
 struct thermal_zone_device *
-thermal_zone_device_register_with_trips(const char *type,
-					const struct thermal_trip *trips,
-					int num_trips, void *devdata,
-					const struct thermal_zone_device_ops *ops,
-					const struct thermal_zone_params *tzp,
-					unsigned int passive_delay,
-					unsigned int polling_delay)
+thermal_zone_device_register_with_trips_and_lockkey(const char *type,
+						    const struct thermal_trip *trips,
+						    int num_trips, void *devdata,
+						    const struct thermal_zone_device_ops *ops,
+						    const struct thermal_zone_params *tzp,
+						    unsigned int passive_delay,
+						    unsigned int polling_delay,
+						    struct lock_class_key *lockkey)
 {
 	const struct thermal_trip *trip = trips;
 	struct thermal_zone_device *tz;
@@ -1555,7 +1559,7 @@ thermal_zone_device_register_with_trips(const char *type,
 	INIT_LIST_HEAD(&tz->trips_reached);
 	INIT_LIST_HEAD(&tz->trips_invalid);
 	ida_init(&tz->ida);
-	mutex_init(&tz->lock);
+	mutex_init_with_key(&tz->lock, lockkey);
 	init_completion(&tz->removal);
 	init_completion(&tz->resume);
 	id = ida_alloc(&thermal_tz_ida, GFP_KERNEL);
@@ -1644,18 +1648,7 @@ thermal_zone_device_register_with_trips(const char *type,
 	kfree(tz);
 	return ERR_PTR(result);
 }
-EXPORT_SYMBOL_GPL(thermal_zone_device_register_with_trips);
-
-struct thermal_zone_device *thermal_tripless_zone_device_register(
-					const char *type,
-					void *devdata,
-					const struct thermal_zone_device_ops *ops,
-					const struct thermal_zone_params *tzp)
-{
-	return thermal_zone_device_register_with_trips(type, NULL, 0, devdata,
-						       ops, tzp, 0, 0);
-}
-EXPORT_SYMBOL_GPL(thermal_tripless_zone_device_register);
+EXPORT_SYMBOL_GPL(thermal_zone_device_register_with_trips_and_lockkey);
 
 void *thermal_zone_device_priv(struct thermal_zone_device *tzd)
 {
diff --git a/include/linux/thermal.h b/include/linux/thermal.h
index 0b5ed6821080..f35dab95001d 100644
--- a/include/linux/thermal.h
+++ b/include/linux/thermal.h
@@ -13,6 +13,7 @@
 #include <linux/of.h>
 #include <linux/idr.h>
 #include <linux/device.h>
+#include <linux/lockdep_types.h>
 #include <linux/sysfs.h>
 #include <linux/workqueue.h>
 #include <uapi/linux/thermal.h>
@@ -225,20 +226,33 @@ void thermal_zone_set_trip_temp(struct thermal_zone_device *tz,
 int thermal_zone_get_crit_temp(struct thermal_zone_device *tz, int *temp);
 
 #ifdef CONFIG_THERMAL
-struct thermal_zone_device *thermal_zone_device_register_with_trips(
-					const char *type,
-					const struct thermal_trip *trips,
-					int num_trips, void *devdata,
-					const struct thermal_zone_device_ops *ops,
-					const struct thermal_zone_params *tzp,
-					unsigned int passive_delay,
-					unsigned int polling_delay);
+#define thermal_zone_device_register_with_trips(type, trips, num_trips, devdata,	\
+						ops, tzp, passive_delay, polling_delay)	\
+({											\
+	static struct lock_class_key __key;						\
+	thermal_zone_device_register_with_trips_and_lockkey(type, trips, num_trips,	\
+							    devdata, ops, tzp,		\
+							    passive_delay,		\
+							    polling_delay, &__key);	\
+})
 
-struct thermal_zone_device *thermal_tripless_zone_device_register(
-					const char *type,
-					void *devdata,
-					const struct thermal_zone_device_ops *ops,
-					const struct thermal_zone_params *tzp);
+#define thermal_tripless_zone_device_register(type, devdata, ops, tzp)			\
+({											\
+	static struct lock_class_key __key;						\
+	thermal_zone_device_register_with_trips_and_lockkey(type, NULL, 0,		\
+							    devdata, ops, tzp,		\
+							    0, 0, &__key);		\
+})
+
+struct thermal_zone_device *
+thermal_zone_device_register_with_trips_and_lockkey(const char *type,
+						    const struct thermal_trip *trips,
+						    int num_trips, void *devdata,
+						    const struct thermal_zone_device_ops *ops,
+						    const struct thermal_zone_params *tzp,
+						    unsigned int passive_delay,
+						    unsigned int polling_delay,
+						    struct lock_class_key *lockkey);
 
 void thermal_zone_device_unregister(struct thermal_zone_device *tz);
 
-- 
2.49.0


^ permalink raw reply related	[flat|nested] 5+ messages in thread

end of thread, other threads:[~2025-06-25 13:23 UTC | newest]

Thread overview: 5+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2025-06-24 13:24 [PATCH v2] thermal: use a custom lock class for intel x86_pkg_temp Benjamin Berg
2025-06-25  9:44 ` Hans de Goede
2025-06-25 10:15   ` Berg, Benjamin
2025-06-25 11:34     ` Hans de Goede
2025-06-25 13:23     ` Hans de Goede

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox