* [PATCH] PCI: vmd: Use raw spinlock for cfg_lock
@ 2024-06-19 11:27 Jiwei Sun
2024-06-19 20:00 ` Bjorn Helgaas
0 siblings, 1 reply; 5+ messages in thread
From: Jiwei Sun @ 2024-06-19 11:27 UTC (permalink / raw)
To: nirmal.patel, jonathan.derrick
Cc: lpieralisi, kw, robh, bhelgaas, linux-pci, linux-kernel, sunjw10,
ahuang12, sunjw10
From: Jiwei Sun <sunjw10@lenovo.com>
If the kernel is built with the following configurations and booting
CONFIG_VMD=y
CONFIG_DEBUG_LOCKDEP=y
CONFIG_DEBUG_SPINLOCK=y
CONFIG_PROVE_LOCKING=y
CONFIG_PROVE_RAW_LOCK_NESTING=y
The following log appears,
=============================
[ BUG: Invalid wait context ]
6.10.0-rc4 #80 Not tainted
-----------------------------
kworker/18:2/633 is trying to lock:
ffff888c474e5648 (&vmd->cfg_lock){....}-{3:3}, at: vmd_pci_write+0x185/0x2a0
other info that might help us debug this:
context-{5:5}
4 locks held by kworker/18:2/633:
#0: ffff888100108958 ((wq_completion)events){+.+.}-{0:0}, at: process_one_work+0xf78/0x1920
#1: ffffc9000ae1fd90 ((work_completion)(&wfc.work)){+.+.}-{0:0}, at: process_one_work+0x7fe/0x1920
#2: ffff888c483508a8 (&md->mutex){+.+.}-{4:4}, at: __pci_enable_msi_range+0x208/0x800
#3: ffff888c48329bd8 (&dev->msi_lock){....}-{2:2}, at: pci_msi_update_mask+0x91/0x170
stack backtrace:
CPU: 18 PID: 633 Comm: kworker/18:2 Not tainted 6.10.0-rc4 #80 7c0f2526417bfbb7579e3c3442683c5961773c75
Hardware name: Lenovo ThinkSystem SR630/-[7X01RCZ000]-, BIOS IVEL60O-2.71 09/28/2020
Workqueue: events work_for_cpu_fn
Call Trace:
<TASK>
dump_stack_lvl+0x7c/0xc0
__lock_acquire+0x9e5/0x1ed0
lock_acquire+0x194/0x490
_raw_spin_lock_irqsave+0x42/0x90
vmd_pci_write+0x185/0x2a0
pci_msi_update_mask+0x10c/0x170
__pci_enable_msi_range+0x291/0x800
pci_alloc_irq_vectors_affinity+0x13e/0x1d0
pcie_portdrv_probe+0x570/0xe60
local_pci_probe+0xdc/0x190
work_for_cpu_fn+0x4e/0xa0
process_one_work+0x86d/0x1920
process_scheduled_works+0xd7/0x140
worker_thread+0x3e9/0xb90
kthread+0x2e9/0x3d0
ret_from_fork+0x2d/0x60
ret_from_fork_asm+0x1a/0x30
</TASK>
The root cause is that the dev->msi_lock is a raw spinlock, but
vmd->cfg_lock is a spinlock.
Signed-off-by: Jiwei Sun<sunjw10@lenovo.com>
Suggested-by: Adrian Huang <ahuang12@lenovo.com>
---
drivers/pci/controller/vmd.c | 12 ++++++------
1 file changed, 6 insertions(+), 6 deletions(-)
diff --git a/drivers/pci/controller/vmd.c b/drivers/pci/controller/vmd.c
index 87b7856f375a..45d0ebf96adc 100644
--- a/drivers/pci/controller/vmd.c
+++ b/drivers/pci/controller/vmd.c
@@ -125,7 +125,7 @@ struct vmd_irq_list {
struct vmd_dev {
struct pci_dev *dev;
- spinlock_t cfg_lock;
+ raw_spinlock_t cfg_lock;
void __iomem *cfgbar;
int msix_count;
@@ -402,7 +402,7 @@ static int vmd_pci_read(struct pci_bus *bus, unsigned int devfn, int reg,
if (!addr)
return -EFAULT;
- spin_lock_irqsave(&vmd->cfg_lock, flags);
+ raw_spin_lock_irqsave(&vmd->cfg_lock, flags);
switch (len) {
case 1:
*value = readb(addr);
@@ -417,7 +417,7 @@ static int vmd_pci_read(struct pci_bus *bus, unsigned int devfn, int reg,
ret = -EINVAL;
break;
}
- spin_unlock_irqrestore(&vmd->cfg_lock, flags);
+ raw_spin_unlock_irqrestore(&vmd->cfg_lock, flags);
return ret;
}
@@ -437,7 +437,7 @@ static int vmd_pci_write(struct pci_bus *bus, unsigned int devfn, int reg,
if (!addr)
return -EFAULT;
- spin_lock_irqsave(&vmd->cfg_lock, flags);
+ raw_spin_lock_irqsave(&vmd->cfg_lock, flags);
switch (len) {
case 1:
writeb(value, addr);
@@ -455,7 +455,7 @@ static int vmd_pci_write(struct pci_bus *bus, unsigned int devfn, int reg,
ret = -EINVAL;
break;
}
- spin_unlock_irqrestore(&vmd->cfg_lock, flags);
+ raw_spin_unlock_irqrestore(&vmd->cfg_lock, flags);
return ret;
}
@@ -1015,7 +1015,7 @@ static int vmd_probe(struct pci_dev *dev, const struct pci_device_id *id)
if (features & VMD_FEAT_OFFSET_FIRST_VECTOR)
vmd->first_vec = 1;
- spin_lock_init(&vmd->cfg_lock);
+ raw_spin_lock_init(&vmd->cfg_lock);
pci_set_drvdata(dev, vmd);
err = vmd_enable_domain(vmd, features);
if (err)
--
2.27.0
* Re: [PATCH] PCI: vmd: Use raw spinlock for cfg_lock
2024-06-19 11:27 [PATCH] PCI: vmd: Use raw spinlock for cfg_lock Jiwei Sun
@ 2024-06-19 20:00 ` Bjorn Helgaas
2024-06-20 8:57 ` Jiwei Sun
0 siblings, 1 reply; 5+ messages in thread
From: Bjorn Helgaas @ 2024-06-19 20:00 UTC (permalink / raw)
To: Jiwei Sun
Cc: nirmal.patel, jonathan.derrick, lpieralisi, kw, robh, bhelgaas,
linux-pci, linux-kernel, sunjw10, ahuang12, Thomas Gleixner,
Keith Busch
[+cc Thomas in case he has msi_lock comment, Keith in case he has
cfg_lock comment]
On Wed, Jun 19, 2024 at 07:27:59PM +0800, Jiwei Sun wrote:
> From: Jiwei Sun <sunjw10@lenovo.com>
>
> If the kernel is built with the following configurations and booting
> CONFIG_VMD=y
> CONFIG_DEBUG_LOCKDEP=y
> CONFIG_DEBUG_SPINLOCK=y
> CONFIG_PROVE_LOCKING=y
> CONFIG_PROVE_RAW_LOCK_NESTING=y
>
> The following log appears,
>
> =============================
> [ BUG: Invalid wait context ]
> 6.10.0-rc4 #80 Not tainted
> -----------------------------
> kworker/18:2/633 is trying to lock:
> ffff888c474e5648 (&vmd->cfg_lock){....}-{3:3}, at: vmd_pci_write+0x185/0x2a0
> other info that might help us debug this:
> context-{5:5}
> 4 locks held by kworker/18:2/633:
> #0: ffff888100108958 ((wq_completion)events){+.+.}-{0:0}, at: process_one_work+0xf78/0x1920
> #1: ffffc9000ae1fd90 ((work_completion)(&wfc.work)){+.+.}-{0:0}, at: process_one_work+0x7fe/0x1920
> #2: ffff888c483508a8 (&md->mutex){+.+.}-{4:4}, at: __pci_enable_msi_range+0x208/0x800
> #3: ffff888c48329bd8 (&dev->msi_lock){....}-{2:2}, at: pci_msi_update_mask+0x91/0x170
> stack backtrace:
> CPU: 18 PID: 633 Comm: kworker/18:2 Not tainted 6.10.0-rc4 #80 7c0f2526417bfbb7579e3c3442683c5961773c75
> Hardware name: Lenovo ThinkSystem SR630/-[7X01RCZ000]-, BIOS IVEL60O-2.71 09/28/2020
> Workqueue: events work_for_cpu_fn
> Call Trace:
> <TASK>
> dump_stack_lvl+0x7c/0xc0
> __lock_acquire+0x9e5/0x1ed0
> lock_acquire+0x194/0x490
> _raw_spin_lock_irqsave+0x42/0x90
> vmd_pci_write+0x185/0x2a0
> pci_msi_update_mask+0x10c/0x170
> __pci_enable_msi_range+0x291/0x800
> pci_alloc_irq_vectors_affinity+0x13e/0x1d0
> pcie_portdrv_probe+0x570/0xe60
> local_pci_probe+0xdc/0x190
> work_for_cpu_fn+0x4e/0xa0
> process_one_work+0x86d/0x1920
> process_scheduled_works+0xd7/0x140
> worker_thread+0x3e9/0xb90
> kthread+0x2e9/0x3d0
> ret_from_fork+0x2d/0x60
> ret_from_fork_asm+0x1a/0x30
> </TASK>
>
> The root cause is that the dev->msi_lock is a raw spinlock, but
> vmd->cfg_lock is a spinlock.
Can you expand this a little bit? This isn't enough unless one
already knows the difference between raw_spinlock_t and spinlock_t,
which I didn't.
Documentation/locking/locktypes.rst says they are the same except when
CONFIG_PREEMPT_RT is set (might be worth mentioning with the config
list above?), but that with CONFIG_PREEMPT_RT, spinlock_t is based on
rt_mutex.
And I guess there's a rule that you can't acquire rt_mutex while
holding a raw_spinlock.
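If I understand that rule correctly, the forbidden pattern is roughly the
following (a hypothetical sketch for illustration, not code from the tree;
the lock names are made up):

/* Hypothetical illustration of the nesting rule, not actual kernel code. */
static DEFINE_RAW_SPINLOCK(outer_lock);	/* always a real spinning lock */
static DEFINE_SPINLOCK(inner_lock);	/* rt_mutex-based when PREEMPT_RT=y */

static void invalid_nesting(void)
{
	unsigned long flags, flags2;

	raw_spin_lock_irqsave(&outer_lock, flags);
	/*
	 * With PREEMPT_RT, spin_lock_irqsave() can block on an rt_mutex and
	 * schedule, but scheduling is not allowed while a raw spinlock is
	 * held.  CONFIG_PROVE_RAW_LOCK_NESTING reports this nesting as
	 * "BUG: Invalid wait context" even on a non-RT kernel.
	 */
	spin_lock_irqsave(&inner_lock, flags2);
	spin_unlock_irqrestore(&inner_lock, flags2);
	raw_spin_unlock_irqrestore(&outer_lock, flags);
}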
The dev->msi_lock was added by 77e89afc25f3 ("PCI/MSI: Protect
msi_desc::masked for multi-MSI") and only used in
pci_msi_update_mask():
raw_spin_lock_irqsave(lock, flags);
desc->pci.msi_mask &= ~clear;
desc->pci.msi_mask |= set;
pci_write_config_dword(msi_desc_to_pci_dev(desc), desc->pci.mask_pos,
desc->pci.msi_mask);
raw_spin_unlock_irqrestore(lock, flags);
The vmd->cfg_lock was added by 185a383ada2e ("x86/PCI: Add driver for
Intel Volume Management Device (VMD)") and is only used around VMD
config accesses, e.g.,
* CPU may deadlock if config space is not serialized on some versions of this
* hardware, so all config space access is done under a spinlock.
static int vmd_pci_read(...)
{
spin_lock_irqsave(&vmd->cfg_lock, flags);
switch (len) {
case 1:
*value = readb(addr);
break;
case 2:
*value = readw(addr);
break;
case 4:
*value = readl(addr);
break;
default:
ret = -EINVAL;
break;
}
spin_unlock_irqrestore(&vmd->cfg_lock, flags);
}
IIUC those reads turn into single PCIe MMIO reads, so I wouldn't
expect any concurrency issues there that need locking.
But apparently there's something weird that can deadlock the CPU.
> Signed-off-by: Jiwei Sun<sunjw10@lenovo.com>
> Suggested-by: Adrian Huang <ahuang12@lenovo.com>
> ---
> drivers/pci/controller/vmd.c | 12 ++++++------
> 1 file changed, 6 insertions(+), 6 deletions(-)
>
> diff --git a/drivers/pci/controller/vmd.c b/drivers/pci/controller/vmd.c
> index 87b7856f375a..45d0ebf96adc 100644
> --- a/drivers/pci/controller/vmd.c
> +++ b/drivers/pci/controller/vmd.c
> @@ -125,7 +125,7 @@ struct vmd_irq_list {
> struct vmd_dev {
> struct pci_dev *dev;
>
> - spinlock_t cfg_lock;
> + raw_spinlock_t cfg_lock;
> void __iomem *cfgbar;
>
> int msix_count;
> @@ -402,7 +402,7 @@ static int vmd_pci_read(struct pci_bus *bus, unsigned int devfn, int reg,
> if (!addr)
> return -EFAULT;
>
> - spin_lock_irqsave(&vmd->cfg_lock, flags);
> + raw_spin_lock_irqsave(&vmd->cfg_lock, flags);
> switch (len) {
> case 1:
> *value = readb(addr);
> @@ -417,7 +417,7 @@ static int vmd_pci_read(struct pci_bus *bus, unsigned int devfn, int reg,
> ret = -EINVAL;
> break;
> }
> - spin_unlock_irqrestore(&vmd->cfg_lock, flags);
> + raw_spin_unlock_irqrestore(&vmd->cfg_lock, flags);
> return ret;
> }
>
> @@ -437,7 +437,7 @@ static int vmd_pci_write(struct pci_bus *bus, unsigned int devfn, int reg,
> if (!addr)
> return -EFAULT;
>
> - spin_lock_irqsave(&vmd->cfg_lock, flags);
> + raw_spin_lock_irqsave(&vmd->cfg_lock, flags);
> switch (len) {
> case 1:
> writeb(value, addr);
> @@ -455,7 +455,7 @@ static int vmd_pci_write(struct pci_bus *bus, unsigned int devfn, int reg,
> ret = -EINVAL;
> break;
> }
> - spin_unlock_irqrestore(&vmd->cfg_lock, flags);
> + raw_spin_unlock_irqrestore(&vmd->cfg_lock, flags);
> return ret;
> }
>
> @@ -1015,7 +1015,7 @@ static int vmd_probe(struct pci_dev *dev, const struct pci_device_id *id)
> if (features & VMD_FEAT_OFFSET_FIRST_VECTOR)
> vmd->first_vec = 1;
>
> - spin_lock_init(&vmd->cfg_lock);
> + raw_spin_lock_init(&vmd->cfg_lock);
> pci_set_drvdata(dev, vmd);
> err = vmd_enable_domain(vmd, features);
> if (err)
> --
> 2.27.0
>
* Re: [PATCH] PCI: vmd: Use raw spinlock for cfg_lock
2024-06-19 20:00 ` Bjorn Helgaas
@ 2024-06-20 8:57 ` Jiwei Sun
2024-06-20 15:10 ` Paul M Stillwell Jr
0 siblings, 1 reply; 5+ messages in thread
From: Jiwei Sun @ 2024-06-20 8:57 UTC (permalink / raw)
To: Bjorn Helgaas
Cc: nirmal.patel, jonathan.derrick, lpieralisi, kw, robh, bhelgaas,
linux-pci, linux-kernel, sunjw10, ahuang12, Thomas Gleixner,
Keith Busch
On 6/20/24 04:00, Bjorn Helgaas wrote:
> [+cc Thomas in case he has msi_lock comment, Keith in case he has
> cfg_lock comment]
>
> On Wed, Jun 19, 2024 at 07:27:59PM +0800, Jiwei Sun wrote:
>> From: Jiwei Sun <sunjw10@lenovo.com>
>>
>> If the kernel is built with the following configurations and booting
>> CONFIG_VMD=y
>> CONFIG_DEBUG_LOCKDEP=y
>> CONFIG_DEBUG_SPINLOCK=y
>> CONFIG_PROVE_LOCKING=y
>> CONFIG_PROVE_RAW_LOCK_NESTING=y
>>
>> The following log appears,
>>
>> =============================
>> [ BUG: Invalid wait context ]
>> 6.10.0-rc4 #80 Not tainted
>> -----------------------------
>> kworker/18:2/633 is trying to lock:
>> ffff888c474e5648 (&vmd->cfg_lock){....}-{3:3}, at: vmd_pci_write+0x185/0x2a0
>> other info that might help us debug this:
>> context-{5:5}
>> 4 locks held by kworker/18:2/633:
>> #0: ffff888100108958 ((wq_completion)events){+.+.}-{0:0}, at: process_one_work+0xf78/0x1920
>> #1: ffffc9000ae1fd90 ((work_completion)(&wfc.work)){+.+.}-{0:0}, at: process_one_work+0x7fe/0x1920
>> #2: ffff888c483508a8 (&md->mutex){+.+.}-{4:4}, at: __pci_enable_msi_range+0x208/0x800
>> #3: ffff888c48329bd8 (&dev->msi_lock){....}-{2:2}, at: pci_msi_update_mask+0x91/0x170
>> stack backtrace:
>> CPU: 18 PID: 633 Comm: kworker/18:2 Not tainted 6.10.0-rc4 #80 7c0f2526417bfbb7579e3c3442683c5961773c75
>> Hardware name: Lenovo ThinkSystem SR630/-[7X01RCZ000]-, BIOS IVEL60O-2.71 09/28/2020
>> Workqueue: events work_for_cpu_fn
>> Call Trace:
>> <TASK>
>> dump_stack_lvl+0x7c/0xc0
>> __lock_acquire+0x9e5/0x1ed0
>> lock_acquire+0x194/0x490
>> _raw_spin_lock_irqsave+0x42/0x90
>> vmd_pci_write+0x185/0x2a0
>> pci_msi_update_mask+0x10c/0x170
>> __pci_enable_msi_range+0x291/0x800
>> pci_alloc_irq_vectors_affinity+0x13e/0x1d0
>> pcie_portdrv_probe+0x570/0xe60
>> local_pci_probe+0xdc/0x190
>> work_for_cpu_fn+0x4e/0xa0
>> process_one_work+0x86d/0x1920
>> process_scheduled_works+0xd7/0x140
>> worker_thread+0x3e9/0xb90
>> kthread+0x2e9/0x3d0
>> ret_from_fork+0x2d/0x60
>> ret_from_fork_asm+0x1a/0x30
>> </TASK>
>>
>> The root cause is that the dev->msi_lock is a raw spinlock, but
>> vmd->cfg_lock is a spinlock.
>
> Can you expand this a little bit? This isn't enough unless one
> already knows the difference between raw_spinlock_t and spinlock_t,
> which I didn't.
>
> Documentation/locking/locktypes.rst says they are the same except when
> CONFIG_PREEMPT_RT is set (might be worth mentioning with the config
> list above?), but that with CONFIG_PREEMPT_RT, spinlock_t is based on
> rt_mutex.
>
> And I guess there's a rule that you can't acquire rt_mutex while
> holding a raw_spinlock.
Thanks for your review and comments. Sorry for not explaining this clearly.
Yes, you are right: if CONFIG_PREEMPT_RT is not set, spinlock_t is based on
raw_spinlock, so there is no real problem in the above call trace.

But as you mentioned, if CONFIG_PREEMPT_RT is set, spinlock_t is based on
rt_mutex, and a task may be scheduled out while waiting for the lock. For
example, if two threads try to take the same rt_mutex and A takes it first,
B is scheduled out in rtlock_slowlock_locked() until A releases the lock.
A raw_spinlock, by contrast, is a real spinning lock; the owner of a
raw_spinlock must not be scheduled out inside its critical section. In
other words, we must not acquire an rt_mutex-based lock inside a
raw_spinlock critical section when CONFIG_PREEMPT_RT is set.

The CONFIG_PROVE_LOCKING and CONFIG_PROVE_RAW_LOCK_NESTING options detect
this invalid lock nesting (the raw_spinlock vs. spinlock nesting checks)
[1]. Here is the call path:

pci_msi_update_mask                ---> holds raw_spinlock dev->msi_lock
  pci_write_config_dword
    pci_bus_write_config_dword
      vmd_pci_write                ---> takes spinlock_t vmd->cfg_lock

This call path is invalid lock nesting because the vmd driver tries to
acquire the vmd->cfg_lock spinlock within the raw_spinlock region
(dev->msi_lock). That's why the message "BUG: Invalid wait context" is
shown.

[1] https://lore.kernel.org/lkml/YBBA81osV7cHN2fb@hirez.programming.kicks-ass.net/
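
For illustration, here is a simplified sketch of the pattern after the
conversion (not the actual driver code; the names below just stand in for
dev->msi_lock, vmd->cfg_lock and the VMD config write). Once cfg_lock is a
raw spinlock the nesting is raw-in-raw, which is allowed because neither
lock can sleep:

/* Simplified sketch, not the real driver sources. */
static DEFINE_RAW_SPINLOCK(msi_lock);	/* stands in for dev->msi_lock */
static DEFINE_RAW_SPINLOCK(cfg_lock);	/* was spinlock_t before this patch */

static void cfg_write(u32 val, void __iomem *addr)
{
	unsigned long flags;

	raw_spin_lock_irqsave(&cfg_lock, flags);
	writel(val, addr);		/* stands in for the VMD config write */
	raw_spin_unlock_irqrestore(&cfg_lock, flags);
}

static void update_mask(u32 val, void __iomem *addr)
{
	unsigned long flags;

	raw_spin_lock_irqsave(&msi_lock, flags);
	cfg_write(val, addr);	/* raw spinlock nested in raw spinlock: OK */
	raw_spin_unlock_irqrestore(&msi_lock, flags);
}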
Thanks,
Regards,
Jiwei
>
> The dev->msi_lock was added by 77e89afc25f3 ("PCI/MSI: Protect
> msi_desc::masked for multi-MSI") and only used in
> pci_msi_update_mask():
>
> raw_spin_lock_irqsave(lock, flags);
> desc->pci.msi_mask &= ~clear;
> desc->pci.msi_mask |= set;
> pci_write_config_dword(msi_desc_to_pci_dev(desc), desc->pci.mask_pos,
> desc->pci.msi_mask);
> raw_spin_unlock_irqrestore(lock, flags);
>
> The vmd->cfg_lock was added by 185a383ada2e ("x86/PCI: Add driver for
> Intel Volume Management Device (VMD)") and is only used around VMD
> config accesses, e.g.,
>
> * CPU may deadlock if config space is not serialized on some versions of this
> * hardware, so all config space access is done under a spinlock.
>
> static int vmd_pci_read(...)
> {
> spin_lock_irqsave(&vmd->cfg_lock, flags);
> switch (len) {
> case 1:
> *value = readb(addr);
> break;
> case 2:
> *value = readw(addr);
> break;
> case 4:
> *value = readl(addr);
> break;
> default:
> ret = -EINVAL;
> break;
> }
> spin_unlock_irqrestore(&vmd->cfg_lock, flags);
> }
>
> IIUC those reads turn into single PCIe MMIO reads, so I wouldn't
> expect any concurrency issues there that need locking.
>
> But apparently there's something weird that can deadlock the CPU.
>
>> Signed-off-by: Jiwei Sun<sunjw10@lenovo.com>
>> Suggested-by: Adrian Huang <ahuang12@lenovo.com>
>> ---
>> drivers/pci/controller/vmd.c | 12 ++++++------
>> 1 file changed, 6 insertions(+), 6 deletions(-)
>>
>> diff --git a/drivers/pci/controller/vmd.c b/drivers/pci/controller/vmd.c
>> index 87b7856f375a..45d0ebf96adc 100644
>> --- a/drivers/pci/controller/vmd.c
>> +++ b/drivers/pci/controller/vmd.c
>> @@ -125,7 +125,7 @@ struct vmd_irq_list {
>> struct vmd_dev {
>> struct pci_dev *dev;
>>
>> - spinlock_t cfg_lock;
>> + raw_spinlock_t cfg_lock;
>> void __iomem *cfgbar;
>>
>> int msix_count;
>> @@ -402,7 +402,7 @@ static int vmd_pci_read(struct pci_bus *bus, unsigned int devfn, int reg,
>> if (!addr)
>> return -EFAULT;
>>
>> - spin_lock_irqsave(&vmd->cfg_lock, flags);
>> + raw_spin_lock_irqsave(&vmd->cfg_lock, flags);
>> switch (len) {
>> case 1:
>> *value = readb(addr);
>> @@ -417,7 +417,7 @@ static int vmd_pci_read(struct pci_bus *bus, unsigned int devfn, int reg,
>> ret = -EINVAL;
>> break;
>> }
>> - spin_unlock_irqrestore(&vmd->cfg_lock, flags);
>> + raw_spin_unlock_irqrestore(&vmd->cfg_lock, flags);
>> return ret;
>> }
>>
>> @@ -437,7 +437,7 @@ static int vmd_pci_write(struct pci_bus *bus, unsigned int devfn, int reg,
>> if (!addr)
>> return -EFAULT;
>>
>> - spin_lock_irqsave(&vmd->cfg_lock, flags);
>> + raw_spin_lock_irqsave(&vmd->cfg_lock, flags);
>> switch (len) {
>> case 1:
>> writeb(value, addr);
>> @@ -455,7 +455,7 @@ static int vmd_pci_write(struct pci_bus *bus, unsigned int devfn, int reg,
>> ret = -EINVAL;
>> break;
>> }
>> - spin_unlock_irqrestore(&vmd->cfg_lock, flags);
>> + raw_spin_unlock_irqrestore(&vmd->cfg_lock, flags);
>> return ret;
>> }
>>
>> @@ -1015,7 +1015,7 @@ static int vmd_probe(struct pci_dev *dev, const struct pci_device_id *id)
>> if (features & VMD_FEAT_OFFSET_FIRST_VECTOR)
>> vmd->first_vec = 1;
>>
>> - spin_lock_init(&vmd->cfg_lock);
>> + raw_spin_lock_init(&vmd->cfg_lock);
>> pci_set_drvdata(dev, vmd);
>> err = vmd_enable_domain(vmd, features);
>> if (err)
>> --
>> 2.27.0
>>
* Re: [PATCH] PCI: vmd: Use raw spinlock for cfg_lock
2024-06-20 8:57 ` Jiwei Sun
@ 2024-06-20 15:10 ` Paul M Stillwell Jr
2024-06-21 9:12 ` Jiwei Sun
0 siblings, 1 reply; 5+ messages in thread
From: Paul M Stillwell Jr @ 2024-06-20 15:10 UTC (permalink / raw)
To: Jiwei Sun, Bjorn Helgaas
Cc: nirmal.patel, jonathan.derrick, lpieralisi, kw, robh, bhelgaas,
linux-pci, linux-kernel, sunjw10, ahuang12, Thomas Gleixner,
Keith Busch
On 6/20/2024 1:57 AM, Jiwei Sun wrote:
>
>
> On 6/20/24 04:00, Bjorn Helgaas wrote:
>> [+cc Thomas in case he has msi_lock comment, Keith in case he has
>> cfg_lock comment]
>>
>> On Wed, Jun 19, 2024 at 07:27:59PM +0800, Jiwei Sun wrote:
>>> From: Jiwei Sun <sunjw10@lenovo.com>
>>>
>>> If the kernel is built with the following configurations and booting
>>> CONFIG_VMD=y
>>> CONFIG_DEBUG_LOCKDEP=y
>>> CONFIG_DEBUG_SPINLOCK=y
>>> CONFIG_PROVE_LOCKING=y
>>> CONFIG_PROVE_RAW_LOCK_NESTING=y
>>>
>>> The following log appears,
>>>
>>> =============================
>>> [ BUG: Invalid wait context ]
>>> 6.10.0-rc4 #80 Not tainted
>>> -----------------------------
>>> kworker/18:2/633 is trying to lock:
>>> ffff888c474e5648 (&vmd->cfg_lock){....}-{3:3}, at: vmd_pci_write+0x185/0x2a0
>>> other info that might help us debug this:
>>> context-{5:5}
>>> 4 locks held by kworker/18:2/633:
>>> #0: ffff888100108958 ((wq_completion)events){+.+.}-{0:0}, at: process_one_work+0xf78/0x1920
>>> #1: ffffc9000ae1fd90 ((work_completion)(&wfc.work)){+.+.}-{0:0}, at: process_one_work+0x7fe/0x1920
>>> #2: ffff888c483508a8 (&md->mutex){+.+.}-{4:4}, at: __pci_enable_msi_range+0x208/0x800
>>> #3: ffff888c48329bd8 (&dev->msi_lock){....}-{2:2}, at: pci_msi_update_mask+0x91/0x170
>>> stack backtrace:
>>> CPU: 18 PID: 633 Comm: kworker/18:2 Not tainted 6.10.0-rc4 #80 7c0f2526417bfbb7579e3c3442683c5961773c75
>>> Hardware name: Lenovo ThinkSystem SR630/-[7X01RCZ000]-, BIOS IVEL60O-2.71 09/28/2020
>>> Workqueue: events work_for_cpu_fn
>>> Call Trace:
>>> <TASK>
>>> dump_stack_lvl+0x7c/0xc0
>>> __lock_acquire+0x9e5/0x1ed0
>>> lock_acquire+0x194/0x490
>>> _raw_spin_lock_irqsave+0x42/0x90
>>> vmd_pci_write+0x185/0x2a0
>>> pci_msi_update_mask+0x10c/0x170
>>> __pci_enable_msi_range+0x291/0x800
>>> pci_alloc_irq_vectors_affinity+0x13e/0x1d0
>>> pcie_portdrv_probe+0x570/0xe60
>>> local_pci_probe+0xdc/0x190
>>> work_for_cpu_fn+0x4e/0xa0
>>> process_one_work+0x86d/0x1920
>>> process_scheduled_works+0xd7/0x140
>>> worker_thread+0x3e9/0xb90
>>> kthread+0x2e9/0x3d0
>>> ret_from_fork+0x2d/0x60
>>> ret_from_fork_asm+0x1a/0x30
>>> </TASK>
>>>
>>> The root cause is that the dev->msi_lock is a raw spinlock, but
>>> vmd->cfg_lock is a spinlock.
>>
>> Can you expand this a little bit? This isn't enough unless one
>> already knows the difference between raw_spinlock_t and spinlock_t,
>> which I didn't.
>>
>> Documentation/locking/locktypes.rst says they are the same except when
>> CONFIG_PREEMPT_RT is set (might be worth mentioning with the config
>> list above?), but that with CONFIG_PREEMPT_RT, spinlock_t is based on
>> rt_mutex.
>>
>> And I guess there's a rule that you can't acquire rt_mutex while
>> holding a raw_spinlock.
>
> Thanks for your review and comments. Sorry for not explaining this clearly.
> Yes, you are right: if CONFIG_PREEMPT_RT is not set, spinlock_t is based on
> raw_spinlock, so there is no real problem in the above call trace.
>
> But as you mentioned, if CONFIG_PREEMPT_RT is set, spinlock_t is based on
> rt_mutex, and a task may be scheduled out while waiting for the lock. For
> example, if two threads try to take the same rt_mutex and A takes it first,
> B is scheduled out in rtlock_slowlock_locked() until A releases the lock.
> A raw_spinlock, by contrast, is a real spinning lock; the owner of a
> raw_spinlock must not be scheduled out inside its critical section. In
> other words, we must not acquire an rt_mutex-based lock inside a
> raw_spinlock critical section when CONFIG_PREEMPT_RT is set.
>
> The CONFIG_PROVE_LOCKING and CONFIG_PROVE_RAW_LOCK_NESTING options detect
> this invalid lock nesting (the raw_spinlock vs. spinlock nesting checks)
> [1]. Here is the call path:
>
> pci_msi_update_mask                ---> holds raw_spinlock dev->msi_lock
>   pci_write_config_dword
>     pci_bus_write_config_dword
>       vmd_pci_write                ---> takes spinlock_t vmd->cfg_lock
>
> This call path is invalid lock nesting because the vmd driver tries to
> acquire the vmd->cfg_lock spinlock within the raw_spinlock region
> (dev->msi_lock). That's why the message "BUG: Invalid wait context" is
> shown.
>
It looks like this only happens when CONFIG_PREEMPT_RT is set, so I would
mention that in the commit message (as Bjorn mentioned). I also think this
level of detail is helpful and should be in the commit message as well,
since it's not obvious to the casual observer :)
Paul
> [1] https://lore.kernel.org/lkml/YBBA81osV7cHN2fb@hirez.programming.kicks-ass.net/
>
> Thanks,
> Regards,
> Jiwei
>
>>
>> The dev->msi_lock was added by 77e89afc25f3 ("PCI/MSI: Protect
>> msi_desc::masked for multi-MSI") and only used in
>> pci_msi_update_mask():
>>
>> raw_spin_lock_irqsave(lock, flags);
>> desc->pci.msi_mask &= ~clear;
>> desc->pci.msi_mask |= set;
>> pci_write_config_dword(msi_desc_to_pci_dev(desc), desc->pci.mask_pos,
>> desc->pci.msi_mask);
>> raw_spin_unlock_irqrestore(lock, flags);
>>
>> The vmd->cfg_lock was added by 185a383ada2e ("x86/PCI: Add driver for
>> Intel Volume Management Device (VMD)") and is only used around VMD
>> config accesses, e.g.,
>>
>> * CPU may deadlock if config space is not serialized on some versions of this
>> * hardware, so all config space access is done under a spinlock.
>>
>> static int vmd_pci_read(...)
>> {
>> spin_lock_irqsave(&vmd->cfg_lock, flags);
>> switch (len) {
>> case 1:
>> *value = readb(addr);
>> break;
>> case 2:
>> *value = readw(addr);
>> break;
>> case 4:
>> *value = readl(addr);
>> break;
>> default:
>> ret = -EINVAL;
>> break;
>> }
>> spin_unlock_irqrestore(&vmd->cfg_lock, flags);
>> }
>>
>> IIUC those reads turn into single PCIe MMIO reads, so I wouldn't
>> expect any concurrency issues there that need locking.
>>
>> But apparently there's something weird that can deadlock the CPU.
>>
>>> Signed-off-by: Jiwei Sun<sunjw10@lenovo.com>
>>> Suggested-by: Adrian Huang <ahuang12@lenovo.com>
>>> ---
>>> drivers/pci/controller/vmd.c | 12 ++++++------
>>> 1 file changed, 6 insertions(+), 6 deletions(-)
>>>
>>> diff --git a/drivers/pci/controller/vmd.c b/drivers/pci/controller/vmd.c
>>> index 87b7856f375a..45d0ebf96adc 100644
>>> --- a/drivers/pci/controller/vmd.c
>>> +++ b/drivers/pci/controller/vmd.c
>>> @@ -125,7 +125,7 @@ struct vmd_irq_list {
>>> struct vmd_dev {
>>> struct pci_dev *dev;
>>>
>>> - spinlock_t cfg_lock;
>>> + raw_spinlock_t cfg_lock;
>>> void __iomem *cfgbar;
>>>
>>> int msix_count;
>>> @@ -402,7 +402,7 @@ static int vmd_pci_read(struct pci_bus *bus, unsigned int devfn, int reg,
>>> if (!addr)
>>> return -EFAULT;
>>>
>>> - spin_lock_irqsave(&vmd->cfg_lock, flags);
>>> + raw_spin_lock_irqsave(&vmd->cfg_lock, flags);
>>> switch (len) {
>>> case 1:
>>> *value = readb(addr);
>>> @@ -417,7 +417,7 @@ static int vmd_pci_read(struct pci_bus *bus, unsigned int devfn, int reg,
>>> ret = -EINVAL;
>>> break;
>>> }
>>> - spin_unlock_irqrestore(&vmd->cfg_lock, flags);
>>> + raw_spin_unlock_irqrestore(&vmd->cfg_lock, flags);
>>> return ret;
>>> }
>>>
>>> @@ -437,7 +437,7 @@ static int vmd_pci_write(struct pci_bus *bus, unsigned int devfn, int reg,
>>> if (!addr)
>>> return -EFAULT;
>>>
>>> - spin_lock_irqsave(&vmd->cfg_lock, flags);
>>> + raw_spin_lock_irqsave(&vmd->cfg_lock, flags);
>>> switch (len) {
>>> case 1:
>>> writeb(value, addr);
>>> @@ -455,7 +455,7 @@ static int vmd_pci_write(struct pci_bus *bus, unsigned int devfn, int reg,
>>> ret = -EINVAL;
>>> break;
>>> }
>>> - spin_unlock_irqrestore(&vmd->cfg_lock, flags);
>>> + raw_spin_unlock_irqrestore(&vmd->cfg_lock, flags);
>>> return ret;
>>> }
>>>
>>> @@ -1015,7 +1015,7 @@ static int vmd_probe(struct pci_dev *dev, const struct pci_device_id *id)
>>> if (features & VMD_FEAT_OFFSET_FIRST_VECTOR)
>>> vmd->first_vec = 1;
>>>
>>> - spin_lock_init(&vmd->cfg_lock);
>>> + raw_spin_lock_init(&vmd->cfg_lock);
>>> pci_set_drvdata(dev, vmd);
>>> err = vmd_enable_domain(vmd, features);
>>> if (err)
>>> --
>>> 2.27.0
>>>
>
* Re: [PATCH] PCI: vmd: Use raw spinlock for cfg_lock
2024-06-20 15:10 ` Paul M Stillwell Jr
@ 2024-06-21 9:12 ` Jiwei Sun
0 siblings, 0 replies; 5+ messages in thread
From: Jiwei Sun @ 2024-06-21 9:12 UTC (permalink / raw)
To: Paul M Stillwell Jr, Bjorn Helgaas
Cc: nirmal.patel, jonathan.derrick, lpieralisi, kw, robh, bhelgaas,
linux-pci, linux-kernel, sunjw10, ahuang12, Thomas Gleixner,
Keith Busch
On 6/20/24 23:10, Paul M Stillwell Jr wrote:
> On 6/20/2024 1:57 AM, Jiwei Sun wrote:
>>
>>
>> On 6/20/24 04:00, Bjorn Helgaas wrote:
>>> [+cc Thomas in case he has msi_lock comment, Keith in case he has
>>> cfg_lock comment]
>>>
>>> On Wed, Jun 19, 2024 at 07:27:59PM +0800, Jiwei Sun wrote:
>>>> From: Jiwei Sun <sunjw10@lenovo.com>
>>>>
>>>> If the kernel is built with the following configurations and booting
>>>> CONFIG_VMD=y
>>>> CONFIG_DEBUG_LOCKDEP=y
>>>> CONFIG_DEBUG_SPINLOCK=y
>>>> CONFIG_PROVE_LOCKING=y
>>>> CONFIG_PROVE_RAW_LOCK_NESTING=y
>>>>
>>>> The following log appears,
>>>>
>>>> =============================
>>>> [ BUG: Invalid wait context ]
>>>> 6.10.0-rc4 #80 Not tainted
>>>> -----------------------------
>>>> kworker/18:2/633 is trying to lock:
>>>> ffff888c474e5648 (&vmd->cfg_lock){....}-{3:3}, at: vmd_pci_write+0x185/0x2a0
>>>> other info that might help us debug this:
>>>> context-{5:5}
>>>> 4 locks held by kworker/18:2/633:
>>>> #0: ffff888100108958 ((wq_completion)events){+.+.}-{0:0}, at: process_one_work+0xf78/0x1920
>>>> #1: ffffc9000ae1fd90 ((work_completion)(&wfc.work)){+.+.}-{0:0}, at: process_one_work+0x7fe/0x1920
>>>> #2: ffff888c483508a8 (&md->mutex){+.+.}-{4:4}, at: __pci_enable_msi_range+0x208/0x800
>>>> #3: ffff888c48329bd8 (&dev->msi_lock){....}-{2:2}, at: pci_msi_update_mask+0x91/0x170
>>>> stack backtrace:
>>>> CPU: 18 PID: 633 Comm: kworker/18:2 Not tainted 6.10.0-rc4 #80 7c0f2526417bfbb7579e3c3442683c5961773c75
>>>> Hardware name: Lenovo ThinkSystem SR630/-[7X01RCZ000]-, BIOS IVEL60O-2.71 09/28/2020
>>>> Workqueue: events work_for_cpu_fn
>>>> Call Trace:
>>>> <TASK>
>>>> dump_stack_lvl+0x7c/0xc0
>>>> __lock_acquire+0x9e5/0x1ed0
>>>> lock_acquire+0x194/0x490
>>>> _raw_spin_lock_irqsave+0x42/0x90
>>>> vmd_pci_write+0x185/0x2a0
>>>> pci_msi_update_mask+0x10c/0x170
>>>> __pci_enable_msi_range+0x291/0x800
>>>> pci_alloc_irq_vectors_affinity+0x13e/0x1d0
>>>> pcie_portdrv_probe+0x570/0xe60
>>>> local_pci_probe+0xdc/0x190
>>>> work_for_cpu_fn+0x4e/0xa0
>>>> process_one_work+0x86d/0x1920
>>>> process_scheduled_works+0xd7/0x140
>>>> worker_thread+0x3e9/0xb90
>>>> kthread+0x2e9/0x3d0
>>>> ret_from_fork+0x2d/0x60
>>>> ret_from_fork_asm+0x1a/0x30
>>>> </TASK>
>>>>
>>>> The root cause is that the dev->msi_lock is a raw spinlock, but
>>>> vmd->cfg_lock is a spinlock.
>>>
>>> Can you expand this a little bit? This isn't enough unless one
>>> already knows the difference between raw_spinlock_t and spinlock_t,
>>> which I didn't.
>>>
>>> Documentation/locking/locktypes.rst says they are the same except when
>>> CONFIG_PREEMPT_RT is set (might be worth mentioning with the config
>>> list above?), but that with CONFIG_PREEMPT_RT, spinlock_t is based on
>>> rt_mutex.
>>>
>>> And I guess there's a rule that you can't acquire rt_mutex while
>>> holding a raw_spinlock.
>>
>> Thanks for your review and comments. Sorry for not explaining this clearly.
>> Yes, you are right: if CONFIG_PREEMPT_RT is not set, spinlock_t is based on
>> raw_spinlock, so there is no real problem in the above call trace.
>>
>> But as you mentioned, if CONFIG_PREEMPT_RT is set, spinlock_t is based on
>> rt_mutex, and a task may be scheduled out while waiting for the lock. For
>> example, if two threads try to take the same rt_mutex and A takes it first,
>> B is scheduled out in rtlock_slowlock_locked() until A releases the lock.
>> A raw_spinlock, by contrast, is a real spinning lock; the owner of a
>> raw_spinlock must not be scheduled out inside its critical section. In
>> other words, we must not acquire an rt_mutex-based lock inside a
>> raw_spinlock critical section when CONFIG_PREEMPT_RT is set.
>>
>> The CONFIG_PROVE_LOCKING and CONFIG_PROVE_RAW_LOCK_NESTING options detect
>> this invalid lock nesting (the raw_spinlock vs. spinlock nesting checks)
>> [1]. Here is the call path:
>>
>> pci_msi_update_mask                ---> holds raw_spinlock dev->msi_lock
>>   pci_write_config_dword
>>     pci_bus_write_config_dword
>>       vmd_pci_write                ---> takes spinlock_t vmd->cfg_lock
>>
>> This call path is invalid lock nesting because the vmd driver tries to
>> acquire the vmd->cfg_lock spinlock within the raw_spinlock region
>> (dev->msi_lock). That's why the message "BUG: Invalid wait context" is
>> shown.
>>
>
> It looks like this only happens when CONFIG_PREEMPT_RT is set, so I would mention that in the commit message (as Bjorn mentioned). I also think this level of detail is helpful and should be in the commit message as well, since it's not obvious to the casual observer :)
Thanks for your suggestions and comments; I totally agree.
I will add that key information to the v2 patch commit message.
Thanks,
Regards,
Jiwei
>
> Paul
>
>> [1] https://lore.kernel.org/lkml/YBBA81osV7cHN2fb@hirez.programming.kicks-ass.net/
>>
>> Thanks,
>> Regards,
>> Jiwei
>>
>>>
>>> The dev->msi_lock was added by 77e89afc25f3 ("PCI/MSI: Protect
>>> msi_desc::masked for multi-MSI") and only used in
>>> pci_msi_update_mask():
>>>
>>> raw_spin_lock_irqsave(lock, flags);
>>> desc->pci.msi_mask &= ~clear;
>>> desc->pci.msi_mask |= set;
>>> pci_write_config_dword(msi_desc_to_pci_dev(desc), desc->pci.mask_pos,
>>> desc->pci.msi_mask);
>>> raw_spin_unlock_irqrestore(lock, flags);
>>>
>>> The vmd->cfg_lock was added by 185a383ada2e ("x86/PCI: Add driver for
>>> Intel Volume Management Device (VMD)") and is only used around VMD
>>> config accesses, e.g.,
>>>
>>> * CPU may deadlock if config space is not serialized on some versions of this
>>> * hardware, so all config space access is done under a spinlock.
>>>
>>> static int vmd_pci_read(...)
>>> {
>>> spin_lock_irqsave(&vmd->cfg_lock, flags);
>>> switch (len) {
>>> case 1:
>>> *value = readb(addr);
>>> break;
>>> case 2:
>>> *value = readw(addr);
>>> break;
>>> case 4:
>>> *value = readl(addr);
>>> break;
>>> default:
>>> ret = -EINVAL;
>>> break;
>>> }
>>> spin_unlock_irqrestore(&vmd->cfg_lock, flags);
>>> }
>>>
>>> IIUC those reads turn into single PCIe MMIO reads, so I wouldn't
>>> expect any concurrency issues there that need locking.
>>>
>>> But apparently there's something weird that can deadlock the CPU.
>>>
>>>> Signed-off-by: Jiwei Sun<sunjw10@lenovo.com>
>>>> Suggested-by: Adrian Huang <ahuang12@lenovo.com>
>>>> ---
>>>> drivers/pci/controller/vmd.c | 12 ++++++------
>>>> 1 file changed, 6 insertions(+), 6 deletions(-)
>>>>
>>>> diff --git a/drivers/pci/controller/vmd.c b/drivers/pci/controller/vmd.c
>>>> index 87b7856f375a..45d0ebf96adc 100644
>>>> --- a/drivers/pci/controller/vmd.c
>>>> +++ b/drivers/pci/controller/vmd.c
>>>> @@ -125,7 +125,7 @@ struct vmd_irq_list {
>>>> struct vmd_dev {
>>>> struct pci_dev *dev;
>>>> - spinlock_t cfg_lock;
>>>> + raw_spinlock_t cfg_lock;
>>>> void __iomem *cfgbar;
>>>> int msix_count;
>>>> @@ -402,7 +402,7 @@ static int vmd_pci_read(struct pci_bus *bus, unsigned int devfn, int reg,
>>>> if (!addr)
>>>> return -EFAULT;
>>>> - spin_lock_irqsave(&vmd->cfg_lock, flags);
>>>> + raw_spin_lock_irqsave(&vmd->cfg_lock, flags);
>>>> switch (len) {
>>>> case 1:
>>>> *value = readb(addr);
>>>> @@ -417,7 +417,7 @@ static int vmd_pci_read(struct pci_bus *bus, unsigned int devfn, int reg,
>>>> ret = -EINVAL;
>>>> break;
>>>> }
>>>> - spin_unlock_irqrestore(&vmd->cfg_lock, flags);
>>>> + raw_spin_unlock_irqrestore(&vmd->cfg_lock, flags);
>>>> return ret;
>>>> }
>>>> @@ -437,7 +437,7 @@ static int vmd_pci_write(struct pci_bus *bus, unsigned int devfn, int reg,
>>>> if (!addr)
>>>> return -EFAULT;
>>>> - spin_lock_irqsave(&vmd->cfg_lock, flags);
>>>> + raw_spin_lock_irqsave(&vmd->cfg_lock, flags);
>>>> switch (len) {
>>>> case 1:
>>>> writeb(value, addr);
>>>> @@ -455,7 +455,7 @@ static int vmd_pci_write(struct pci_bus *bus, unsigned int devfn, int reg,
>>>> ret = -EINVAL;
>>>> break;
>>>> }
>>>> - spin_unlock_irqrestore(&vmd->cfg_lock, flags);
>>>> + raw_spin_unlock_irqrestore(&vmd->cfg_lock, flags);
>>>> return ret;
>>>> }
>>>> @@ -1015,7 +1015,7 @@ static int vmd_probe(struct pci_dev *dev, const struct pci_device_id *id)
>>>> if (features & VMD_FEAT_OFFSET_FIRST_VECTOR)
>>>> vmd->first_vec = 1;
>>>> - spin_lock_init(&vmd->cfg_lock);
>>>> + raw_spin_lock_init(&vmd->cfg_lock);
>>>> pci_set_drvdata(dev, vmd);
>>>> err = vmd_enable_domain(vmd, features);
>>>> if (err)
>>>> --
>>>> 2.27.0
>>>>
>>
>