* [PATCH] vfio/pci: Allow disabling idle D3 on a per-device basis
@ 2026-04-22 8:13 lirongqing
2026-04-22 21:51 ` Alex Williamson
0 siblings, 1 reply; 2+ messages in thread
From: lirongqing @ 2026-04-22 8:13 UTC (permalink / raw)
To: Alex Williamson, Jason Gunthorpe, Kevin Tian, Ankit Agrawal,
Leon Romanovsky, Alistair Popple, kvm, linux-kernel
Cc: Li RongQing
From: Li RongQing <lirongqing@baidu.com>
The disable_idle_d3 module parameter currently toggles idle D3 power
management for all devices handled by vfio-pci. This is too coarse for
environments where only specific devices (e.g., certain GPUs or NICs)
have issues with D3 state transition.
For example, some PCIe devices exhibit hardware bugs or firmware issues
when entering or exiting D3 state. These devices may experience PCIe link
speed degradation after transitioning out of D3, reducing from Gen4/Gen5
to lower speeds, which can significantly impact I/O bandwidth. In such
cases, only these problematic devices need to have idle D3 disabled,
rather than all devices globally.
Introduce a new module parameter 'disable_idle_d3_ids' to allow users to
specify a list of vendor:device IDs that should have idle D3 disabled.
To support this, add a 'disable_idle_d3' flag to struct
vfio_pci_core_device. This flag is initialized during device probe
based on both the global 'disable_idle_d3' parameter and the new
'disable_idle_d3_ids' list. All runtime PM decisions are then shifted
to use this per-device flag.
In vfio_pci_dev_set_try_reset(), update the logic to iterate through
all devices in the dev_set and respect their individual D3 settings
when performing a bus reset.
Signed-off-by: Li RongQing <lirongqing@baidu.com>
---
drivers/vfio/pci/vfio_pci.c | 7 ++-
drivers/vfio/pci/vfio_pci_core.c | 109 +++++++++++++++++++++++++++++++++++----
include/linux/vfio_pci_core.h | 3 +-
3 files changed, 107 insertions(+), 12 deletions(-)
diff --git a/drivers/vfio/pci/vfio_pci.c b/drivers/vfio/pci/vfio_pci.c
index 0c771064c..fd55776 100644
--- a/drivers/vfio/pci/vfio_pci.c
+++ b/drivers/vfio/pci/vfio_pci.c
@@ -60,6 +60,10 @@ static bool disable_denylist;
module_param(disable_denylist, bool, 0444);
MODULE_PARM_DESC(disable_denylist, "Disable use of device denylist. Disabling the denylist allows binding to devices with known errata that may lead to exploitable stability or security issues when accessed by untrusted users.");
+static char disable_idle_d3_ids[1024];
+module_param_string(disable_idle_d3_ids, disable_idle_d3_ids, sizeof(disable_idle_d3_ids), 0444);
+MODULE_PARM_DESC(disable_idle_d3_ids, "Comma-separated list of vendor:device IDs to disable idle D3");
+
static bool vfio_pci_dev_in_denylist(struct pci_dev *pdev)
{
switch (pdev->vendor) {
@@ -262,7 +266,8 @@ static int __init vfio_pci_init(void)
is_disable_vga = disable_vga;
#endif
- vfio_pci_core_set_params(nointxmask, is_disable_vga, disable_idle_d3);
+ vfio_pci_core_set_params(nointxmask, is_disable_vga, disable_idle_d3,
+ disable_idle_d3_ids);
/* Register and scan for devices */
ret = pci_register_driver(&vfio_pci_driver);
diff --git a/drivers/vfio/pci/vfio_pci_core.c b/drivers/vfio/pci/vfio_pci_core.c
index ad52abc..ac037a7 100644
--- a/drivers/vfio/pci/vfio_pci_core.c
+++ b/drivers/vfio/pci/vfio_pci_core.c
@@ -42,6 +42,73 @@ static bool nointxmask;
static bool disable_vga;
static bool disable_idle_d3;
+struct vfio_pci_d3_info {
+ struct list_head list;
+ unsigned int vendor;
+ unsigned int device;
+};
+
+/*
+ * disable_idle_d3_list is built in vfio_pci_core_set_params() before
+ * pci_register_driver(), and is read-only after that, so no locking is
+ * needed. It is freed in vfio_pci_core_cleanup() after
+ * pci_unregister_driver() completes.
+ */
+static LIST_HEAD(disable_idle_d3_list);
+
+static void vfio_pci_parse_d3_ids(const char *disable_idle_d3_ids)
+{
+ char *tmp, *p, *id_str;
+
+ if (*disable_idle_d3_ids == '\0')
+ return;
+
+ tmp = kstrdup(disable_idle_d3_ids, GFP_KERNEL);
+ if (!tmp)
+ return;
+
+ p = tmp;
+ while ((id_str = strsep(&p, ","))) {
+ unsigned int v, d;
+ struct vfio_pci_d3_info *info;
+
+ if (*id_str == '\0')
+ continue;
+
+ if (sscanf(id_str, "%x:%x", &v, &d) == 2) {
+ info = kzalloc_obj(*info, GFP_KERNEL);
+ if (!info)
+ break;
+ info->vendor = v;
+ info->device = d;
+ list_add_tail(&info->list, &disable_idle_d3_list);
+ } else
+ pr_warn("vfio-pci: invalid ids '%s'\n", id_str);
+ }
+ kfree(tmp);
+}
+
+static void vfio_pci_free_d3_ids(void)
+{
+ struct vfio_pci_d3_info *info, *next;
+
+ list_for_each_entry_safe(info, next, &disable_idle_d3_list, list) {
+ list_del(&info->list);
+ kfree(info);
+ }
+}
+
+static bool vfio_pci_dev_in_d3_list(struct pci_dev *pdev)
+{
+ struct vfio_pci_d3_info *info;
+
+ list_for_each_entry(info, &disable_idle_d3_list, list) {
+ if (pdev->vendor == info->vendor && pdev->device == info->device)
+ return true;
+ }
+ return false;
+}
+
static void vfio_pci_eventfd_rcu_free(struct rcu_head *rcu)
{
struct vfio_pci_eventfd *eventfd =
@@ -501,7 +568,7 @@ int vfio_pci_core_enable(struct vfio_pci_core_device *vdev)
u16 cmd;
u8 msix_pos;
- if (!disable_idle_d3) {
+ if (!vdev->disable_idle_d3) {
ret = pm_runtime_resume_and_get(&pdev->dev);
if (ret < 0)
return ret;
@@ -579,7 +646,7 @@ int vfio_pci_core_enable(struct vfio_pci_core_device *vdev)
out_disable_device:
pci_disable_device(pdev);
out_power:
- if (!disable_idle_d3)
+ if (!vdev->disable_idle_d3)
pm_runtime_put(&pdev->dev);
return ret;
}
@@ -715,7 +782,7 @@ void vfio_pci_core_disable(struct vfio_pci_core_device *vdev)
vfio_pci_dev_set_try_reset(vdev->vdev.dev_set);
/* Put the pm-runtime usage counter acquired during enable */
- if (!disable_idle_d3)
+ if (!vdev->disable_idle_d3)
pm_runtime_put(&pdev->dev);
}
EXPORT_SYMBOL_GPL(vfio_pci_core_disable);
@@ -2107,6 +2174,9 @@ int vfio_pci_core_init_dev(struct vfio_device *core_vdev)
init_rwsem(&vdev->memory_lock);
xa_init(&vdev->ctx);
+ vdev->disable_idle_d3 = disable_idle_d3 ||
+ vfio_pci_dev_in_d3_list(vdev->pdev);
+
return 0;
}
EXPORT_SYMBOL_GPL(vfio_pci_core_init_dev);
@@ -2202,7 +2272,7 @@ int vfio_pci_core_register_device(struct vfio_pci_core_device *vdev)
dev->driver->pm = &vfio_pci_core_pm_ops;
pm_runtime_allow(dev);
- if (!disable_idle_d3)
+ if (!vdev->disable_idle_d3)
pm_runtime_put(dev);
ret = vfio_register_group_dev(&vdev->vdev);
@@ -2211,7 +2281,7 @@ int vfio_pci_core_register_device(struct vfio_pci_core_device *vdev)
return 0;
out_power:
- if (!disable_idle_d3)
+ if (!vdev->disable_idle_d3)
pm_runtime_get_noresume(dev);
pm_runtime_forbid(dev);
@@ -2230,7 +2300,7 @@ void vfio_pci_core_unregister_device(struct vfio_pci_core_device *vdev)
vfio_pci_vf_uninit(vdev);
vfio_pci_vga_uninit(vdev);
- if (!disable_idle_d3)
+ if (!vdev->disable_idle_d3)
pm_runtime_get_noresume(&vdev->pdev->dev);
pm_runtime_forbid(&vdev->pdev->dev);
@@ -2541,6 +2611,7 @@ static void vfio_pci_dev_set_try_reset(struct vfio_device_set *dev_set)
struct vfio_pci_core_device *cur;
struct pci_dev *pdev;
bool reset_done = false;
+ int ret;
if (!vfio_pci_dev_set_needs_reset(dev_set))
return;
@@ -2554,8 +2625,16 @@ static void vfio_pci_dev_set_try_reset(struct vfio_device_set *dev_set)
* state. Increment the usage count for all the devices in the dev_set
* before reset and decrement the same after reset.
*/
- if (!disable_idle_d3 && vfio_pci_dev_set_pm_runtime_get(dev_set))
- return;
+ list_for_each_entry(cur, &dev_set->device_list, vdev.dev_set_list) {
+ if (!cur->disable_idle_d3) {
+ ret = pm_runtime_resume_and_get(&cur->pdev->dev);
+ if (ret < 0) {
+ pci_warn(cur->pdev,
+ "failed to resume device for bus reset, ret=%d\n", ret);
+ goto out;
+ }
+ }
+ }
if (!pci_reset_bus(pdev))
reset_done = true;
@@ -2564,23 +2643,33 @@ static void vfio_pci_dev_set_try_reset(struct vfio_device_set *dev_set)
if (reset_done)
cur->needs_reset = false;
- if (!disable_idle_d3)
+ if (!cur->disable_idle_d3)
+ pm_runtime_put(&cur->pdev->dev);
+ }
+ return;
+
+out:
+ list_for_each_entry_continue_reverse(cur, &dev_set->device_list, vdev.dev_set_list) {
+ if (!cur->disable_idle_d3)
pm_runtime_put(&cur->pdev->dev);
}
}
void vfio_pci_core_set_params(bool is_nointxmask, bool is_disable_vga,
- bool is_disable_idle_d3)
+ bool is_disable_idle_d3, const char *ids)
{
nointxmask = is_nointxmask;
disable_vga = is_disable_vga;
disable_idle_d3 = is_disable_idle_d3;
+ vfio_pci_free_d3_ids();
+ vfio_pci_parse_d3_ids(ids);
}
EXPORT_SYMBOL_GPL(vfio_pci_core_set_params);
static void vfio_pci_core_cleanup(void)
{
vfio_pci_uninit_perm_bits();
+ vfio_pci_free_d3_ids();
}
static int __init vfio_pci_core_init(void)
diff --git a/include/linux/vfio_pci_core.h b/include/linux/vfio_pci_core.h
index 2ebba74..2062543 100644
--- a/include/linux/vfio_pci_core.h
+++ b/include/linux/vfio_pci_core.h
@@ -127,6 +127,7 @@ struct vfio_pci_core_device {
bool needs_pm_restore:1;
bool pm_intx_masked:1;
bool pm_runtime_engaged:1;
+ bool disable_idle_d3:1;
struct pci_saved_state *pci_saved_state;
struct pci_saved_state *pm_save;
int ioeventfds_nr;
@@ -157,7 +158,7 @@ int vfio_pci_core_register_dev_region(struct vfio_pci_core_device *vdev,
const struct vfio_pci_regops *ops,
size_t size, u32 flags, void *data);
void vfio_pci_core_set_params(bool nointxmask, bool is_disable_vga,
- bool is_disable_idle_d3);
+ bool is_disable_idle_d3, const char *ids);
void vfio_pci_core_close_device(struct vfio_device *core_vdev);
int vfio_pci_core_init_dev(struct vfio_device *core_vdev);
void vfio_pci_core_release_dev(struct vfio_device *core_vdev);
--
2.9.4
^ permalink raw reply related [flat|nested] 2+ messages in thread
* Re: [PATCH] vfio/pci: Allow disabling idle D3 on a per-device basis
2026-04-22 8:13 [PATCH] vfio/pci: Allow disabling idle D3 on a per-device basis lirongqing
@ 2026-04-22 21:51 ` Alex Williamson
0 siblings, 0 replies; 2+ messages in thread
From: Alex Williamson @ 2026-04-22 21:51 UTC (permalink / raw)
To: lirongqing
Cc: Jason Gunthorpe, Kevin Tian, Ankit Agrawal, Leon Romanovsky,
Alistair Popple, kvm, linux-kernel, alex
On Wed, 22 Apr 2026 04:13:07 -0400
lirongqing <lirongqing@baidu.com> wrote:
> From: Li RongQing <lirongqing@baidu.com>
>
> The disable_idle_d3 module parameter currently toggles idle D3 power
> management for all devices handled by vfio-pci. This is too coarse for
> environments where only specific devices (e.g., certain GPUs or NICs)
> have issues with D3 state transition.
>
> For example, some PCIe devices exhibit hardware bugs or firmware issues
> when entering or exiting D3 state. These devices may experience PCIe link
> speed degradation after transitioning out of D3, reducing from Gen4/Gen5
> to lower speeds, which can significantly impact I/O bandwidth. In such
> cases, only these problematic devices need to have idle D3 disabled,
> rather than all devices globally.
>
> Introduce a new module parameter 'disable_idle_d3_ids' to allow users to
> specify a list of vendor:device IDs that should have idle D3 disabled.
>
> To support this, add a 'disable_idle_d3' flag to struct
> vfio_pci_core_device. This flag is initialized during device probe
> based on both the global 'disable_idle_d3' parameter and the new
> 'disable_idle_d3_ids' list. All runtime PM decisions are then shifted
> to use this per-device flag.
>
> In vfio_pci_dev_set_try_reset(), update the logic to iterate through
> all devices in the dev_set and respect their individual D3 settings
> when performing a bus reset.
There are device flags that can be set by quirks to handle this:
enum pci_dev_flags {
...
/* Device configuration is irrevocably lost if disabled into D3 */
PCI_DEV_FLAGS_NO_D3 = (__force pci_dev_flags_t) (1 << 1),
...
/* Do not use PM reset even if device advertises NoSoftRst- */
PCI_DEV_FLAGS_NO_PM_RESET = (__force pci_dev_flags_t) (1 << 7),
Ideally vfio-pci.disable_idle_d3 would be your debug tool for
evaluating issues with device level D3 support. If an incompatible
device is found, we should attempt to resolve issues, like link
re-training, or at least contribute a quirk for the device so that all
users benefit, not just those with a magic list of broken devices.
You also have the reset_method sysfs attribute at your disposal to
manage how we trigger a function scoped reset. Thanks,
Alex
^ permalink raw reply [flat|nested] 2+ messages in thread
end of thread, other threads:[~2026-04-22 21:52 UTC | newest]
Thread overview: 2+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --)
2026-04-22 8:13 [PATCH] vfio/pci: Allow disabling idle D3 on a per-device basis lirongqing
2026-04-22 21:51 ` Alex Williamson
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox