kvm.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
From: Alex Williamson <alex.williamson@redhat.com>
To: Chaitanya Kulkarni <kch@nvidia.com>
Cc: <kbusch@kernel.org>, <axboe@fb.com>, <hch@lst.de>,
	<sagi@grimberg.me>, <cohuck@redhat.com>, <jgg@ziepe.ca>,
	<yishaih@nvidia.com>, <shameerali.kolothum.thodi@huawei.com>,
	<kevin.tian@intel.com>, <mjrosato@linux.ibm.com>,
	<mgurtovoy@nvidia.com>, <linux-nvme@lists.infradead.org>,
	<kvm@vger.kernel.org>, <Konrad.wilk@oracle.com>,
	<martin.petersen@oracle.com>, <jmeneghi@redhat.com>,
	<arnd@arndb.de>, <schnelle@linux.ibm.com>, <bhelgaas@google.com>,
	<joao.m.martins@oracle.com>, Lei Rao <lei.rao@intel.com>
Subject: Re: [RFC PATCH 1/4] vfio-nvme: add vfio-nvme lm driver infrastructure
Date: Mon, 4 Aug 2025 11:15:56 -0600	[thread overview]
Message-ID: <20250804111556.3ba2c832.alex.williamson@redhat.com> (raw)
In-Reply-To: <20250803024705.10256-2-kch@nvidia.com>

On Sat, 2 Aug 2025 19:47:02 -0700
Chaitanya Kulkarni <kch@nvidia.com> wrote:

> Add foundational infrastructure for vfio-nvme, enabling support for live
> migration of NVMe devices via the VFIO framework. The following
> components are included:
> 
> - Core driver skeleton for vfio-nvme support under drivers/vfio/pci/nvme/
> - Definitions of basic data structures used in live migration
>   (e.g., nvmevf_pci_core_device and nvmevf_migration_file)
> - Implementation of helper routines for managing migration file state
> - Integration of PCI driver callbacks and error handling logic
> - Registration with vfio-pci-core through nvmevf_pci_ops
> - Initial support for VFIO migration states and device open/close flows
> 
> Subsequent patches will build upon this base to implement actual live
> migration commands and complete the vfio device state handling logic.
> 
> Signed-off-by: Lei Rao <lei.rao@intel.com>
> Signed-off-by: Max Gurtovoy <mgurtovoy@nvidia.com>
> Signed-off-by: Chaitanya Kulkarni <kch@nvidia.com>
> ---
>  drivers/vfio/pci/Kconfig       |   2 +
>  drivers/vfio/pci/Makefile      |   2 +
>  drivers/vfio/pci/nvme/Kconfig  |  10 ++
>  drivers/vfio/pci/nvme/Makefile |   3 +
>  drivers/vfio/pci/nvme/nvme.c   | 196 +++++++++++++++++++++++++++++++++
>  drivers/vfio/pci/nvme/nvme.h   |  36 ++++++
>  6 files changed, 249 insertions(+)
>  create mode 100644 drivers/vfio/pci/nvme/Kconfig
>  create mode 100644 drivers/vfio/pci/nvme/Makefile
>  create mode 100644 drivers/vfio/pci/nvme/nvme.c
>  create mode 100644 drivers/vfio/pci/nvme/nvme.h
> 
> diff --git a/drivers/vfio/pci/Kconfig b/drivers/vfio/pci/Kconfig
> index 2b0172f54665..8f94429e7adc 100644
> --- a/drivers/vfio/pci/Kconfig
> +++ b/drivers/vfio/pci/Kconfig
> @@ -67,4 +67,6 @@ source "drivers/vfio/pci/nvgrace-gpu/Kconfig"
>  
>  source "drivers/vfio/pci/qat/Kconfig"
>  
> +source "drivers/vfio/pci/nvme/Kconfig"
> +
>  endmenu
> diff --git a/drivers/vfio/pci/Makefile b/drivers/vfio/pci/Makefile
> index cf00c0a7e55c..be8c4b5ee0ba 100644
> --- a/drivers/vfio/pci/Makefile
> +++ b/drivers/vfio/pci/Makefile
> @@ -10,6 +10,8 @@ obj-$(CONFIG_VFIO_PCI) += vfio-pci.o
>  
>  obj-$(CONFIG_MLX5_VFIO_PCI)           += mlx5/
>  
> +obj-$(CONFIG_NVME_VFIO_PCI) += nvme/
> +
>  obj-$(CONFIG_HISI_ACC_VFIO_PCI) += hisilicon/
>  
>  obj-$(CONFIG_PDS_VFIO_PCI) += pds/
> diff --git a/drivers/vfio/pci/nvme/Kconfig b/drivers/vfio/pci/nvme/Kconfig
> new file mode 100644
> index 000000000000..12e0eaba0de1
> --- /dev/null
> +++ b/drivers/vfio/pci/nvme/Kconfig
> @@ -0,0 +1,10 @@
> +# SPDX-License-Identifier: GPL-2.0-only
> +config NVME_VFIO_PCI
> +	tristate "VFIO support for NVMe PCI devices"
> +	depends on NVME_CORE
> +	depends on VFIO_PCI_CORE
> +	help
> +	  This provides migration support for NVMe devices using the
> +	  VFIO framework.
> +
> +	  If you don't know what to do here, say N.
> diff --git a/drivers/vfio/pci/nvme/Makefile b/drivers/vfio/pci/nvme/Makefile
> new file mode 100644
> index 000000000000..2f4a0ad3d9cf
> --- /dev/null
> +++ b/drivers/vfio/pci/nvme/Makefile
> @@ -0,0 +1,3 @@
> +# SPDX-License-Identifier: GPL-2.0-only
> +obj-$(CONFIG_NVME_VFIO_PCI) += nvme-vfio-pci.o
> +nvme-vfio-pci-y := nvme.o
> diff --git a/drivers/vfio/pci/nvme/nvme.c b/drivers/vfio/pci/nvme/nvme.c
> new file mode 100644
> index 000000000000..08bee3274207
> --- /dev/null
> +++ b/drivers/vfio/pci/nvme/nvme.c
> @@ -0,0 +1,196 @@
> +// SPDX-License-Identifier: GPL-2.0-only
> +/*
> + * Copyright (c) 2022, INTEL CORPORATION. All rights reserved
> + * Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved
> + */
> +
> +#include <linux/device.h>
> +#include <linux/eventfd.h>
> +#include <linux/file.h>
> +#include <linux/interrupt.h>
> +#include <linux/module.h>
> +#include <linux/mutex.h>
> +#include <linux/pci.h>
> +#include <linux/types.h>
> +#include <linux/vfio.h>
> +#include <linux/anon_inodes.h>
> +#include <linux/kernel.h>
> +#include <linux/vfio_pci_core.h>
> +
> +#include "nvme.h"
> +
> +static void nvmevf_disable_fd(struct nvmevf_migration_file *migf)
> +{
> +	mutex_lock(&migf->lock);
> +
> +	/* release the device states buffer */
> +	kvfree(migf->vf_data);
> +	migf->vf_data = NULL;
> +	migf->disabled = true;
> +	migf->total_length = 0;
> +	migf->filp->f_pos = 0;
> +	mutex_unlock(&migf->lock);
> +}
> +
> +static void nvmevf_disable_fds(struct nvmevf_pci_core_device *nvmevf_dev)
> +{
> +	if (nvmevf_dev->resuming_migf) {
> +		nvmevf_disable_fd(nvmevf_dev->resuming_migf);
> +		fput(nvmevf_dev->resuming_migf->filp);
> +		nvmevf_dev->resuming_migf = NULL;
> +	}
> +
> +	if (nvmevf_dev->saving_migf) {
> +		nvmevf_disable_fd(nvmevf_dev->saving_migf);
> +		fput(nvmevf_dev->saving_migf->filp);
> +		nvmevf_dev->saving_migf = NULL;
> +	}
> +}
> +
> +static void nvmevf_state_mutex_unlock(struct nvmevf_pci_core_device *nvmevf_dev)
> +{
> +	lockdep_assert_held(&nvmevf_dev->state_mutex);
> +again:
> +	spin_lock(&nvmevf_dev->reset_lock);
> +	if (nvmevf_dev->deferred_reset) {
> +		nvmevf_dev->deferred_reset = false;
> +		spin_unlock(&nvmevf_dev->reset_lock);
> +		nvmevf_dev->mig_state = VFIO_DEVICE_STATE_RUNNING;
> +		nvmevf_disable_fds(nvmevf_dev);
> +		goto again;
> +	}
> +	mutex_unlock(&nvmevf_dev->state_mutex);
> +	spin_unlock(&nvmevf_dev->reset_lock);
> +}
> +
> +static struct nvmevf_pci_core_device *nvmevf_drvdata(struct pci_dev *pdev)
> +{
> +	struct vfio_pci_core_device *core_device = dev_get_drvdata(&pdev->dev);
> +
> +	return container_of(core_device, struct nvmevf_pci_core_device,
> +			    core_device);
> +}
> +
> +static int nvmevf_pci_open_device(struct vfio_device *core_vdev)
> +{
> +	struct nvmevf_pci_core_device *nvmevf_dev;
> +	struct vfio_pci_core_device *vdev;
> +	int ret;
> +
> +	nvmevf_dev = container_of(core_vdev, struct nvmevf_pci_core_device,
> +			core_device.vdev);
> +	vdev = &nvmevf_dev->core_device;
> +
> +	ret = vfio_pci_core_enable(vdev);
> +	if (ret)
> +		return ret;
> +
> +	if (nvmevf_dev->migrate_cap)
> +		nvmevf_dev->mig_state = VFIO_DEVICE_STATE_RUNNING;
> +	vfio_pci_core_finish_enable(vdev);
> +	return 0;
> +}
> +
> +static void nvmevf_pci_close_device(struct vfio_device *core_vdev)
> +{
> +	struct nvmevf_pci_core_device *nvmevf_dev;
> +
> +	nvmevf_dev = container_of(core_vdev, struct nvmevf_pci_core_device,
> +			core_device.vdev);
> +
> +	if (nvmevf_dev->migrate_cap) {
> +		mutex_lock(&nvmevf_dev->state_mutex);
> +		nvmevf_disable_fds(nvmevf_dev);
> +		nvmevf_state_mutex_unlock(nvmevf_dev);
> +	}
> +
> +	vfio_pci_core_close_device(core_vdev);
> +}
> +
> +static const struct vfio_device_ops nvmevf_pci_ops = {
> +	.name = "nvme-vfio-pci",
> +	.release = vfio_pci_core_release_dev,
> +	.open_device = nvmevf_pci_open_device,
> +	.close_device = nvmevf_pci_close_device,
> +	.ioctl = vfio_pci_core_ioctl,
> +	.device_feature = vfio_pci_core_ioctl_feature,
> +	.read = vfio_pci_core_read,
> +	.write = vfio_pci_core_write,
> +	.mmap = vfio_pci_core_mmap,
> +	.request = vfio_pci_core_request,
> +	.match = vfio_pci_core_match,
> +};
> +
> +static int nvmevf_pci_probe(struct pci_dev *pdev,
> +			    const struct pci_device_id *id)
> +{
> +	struct nvmevf_pci_core_device *nvmevf_dev;
> +	int ret;
> +
> +	nvmevf_dev = vfio_alloc_device(nvmevf_pci_core_device, core_device.vdev,
> +				       &pdev->dev, &nvmevf_pci_ops);
> +	if (IS_ERR(nvmevf_dev))
> +		return PTR_ERR(nvmevf_dev);
> +
> +	dev_set_drvdata(&pdev->dev, &nvmevf_dev->core_device);
> +	ret = vfio_pci_core_register_device(&nvmevf_dev->core_device);
> +	if (ret)
> +		goto out_put_dev;
> +
> +	return 0;
> +
> +out_put_dev:
> +	vfio_put_device(&nvmevf_dev->core_device.vdev);
> +	return ret;
> +}
> +
> +static void nvmevf_pci_remove(struct pci_dev *pdev)
> +{
> +	struct nvmevf_pci_core_device *nvmevf_dev = nvmevf_drvdata(pdev);
> +
> +	vfio_pci_core_unregister_device(&nvmevf_dev->core_device);
> +	vfio_put_device(&nvmevf_dev->core_device.vdev);
> +}
> +
> +static void nvmevf_pci_aer_reset_done(struct pci_dev *pdev)
> +{
> +	struct nvmevf_pci_core_device *nvmevf_dev = nvmevf_drvdata(pdev);
> +
> +	if (!nvmevf_dev->migrate_cap)
> +		return;
> +
> +	/*
> +	 * As the higher VFIO layers are holding locks across reset and using
> +	 * those same locks with the mm_lock we need to prevent ABBA deadlock
> +	 * with the state_mutex and mm_lock.
> +	 * In case the state_mutex was taken already we defer the cleanup work
> +	 * to the unlock flow of the other running context.
> +	 */
> +	spin_lock(&nvmevf_dev->reset_lock);
> +	nvmevf_dev->deferred_reset = true;
> +	if (!mutex_trylock(&nvmevf_dev->state_mutex)) {
> +		spin_unlock(&nvmevf_dev->reset_lock);
> +		return;
> +	}
> +	spin_unlock(&nvmevf_dev->reset_lock);
> +	nvmevf_state_mutex_unlock(nvmevf_dev);
> +}
> +
> +static const struct pci_error_handlers nvmevf_err_handlers = {
> +	.reset_done = nvmevf_pci_aer_reset_done,
> +	.error_detected = vfio_pci_core_aer_err_detected,
> +};
> +
> +static struct pci_driver nvmevf_pci_driver = {
> +	.name = KBUILD_MODNAME,
> +	.probe = nvmevf_pci_probe,
> +	.remove = nvmevf_pci_remove,
> +	.err_handler = &nvmevf_err_handlers,
> +	.driver_managed_dma = true,
> +};
> +
> +module_pci_driver(nvmevf_pci_driver);
> +
> +MODULE_LICENSE("GPL");
> +MODULE_AUTHOR("Chaitanya Kulkarni <kch@nvidia.com>");
> +MODULE_DESCRIPTION("NVMe VFIO PCI - VFIO PCI driver with live migration support for NVMe");

Without a MODULE_DEVICE_TABLE, what devices are ever going to use this
driver?  Userspace needs to be given a clue when to use this driver vs
vfio-pci.  We also don't have a fallback mechanism to try a driver
until it fails, so this driver likely needs to take over de facto
support for all NVMe devices from vfio-pci, rather than later rejecting
those that don't support migration as patch 4/ implements in the .init
callback.  Thanks,

Alex

> diff --git a/drivers/vfio/pci/nvme/nvme.h b/drivers/vfio/pci/nvme/nvme.h
> new file mode 100644
> index 000000000000..ee602254679e
> --- /dev/null
> +++ b/drivers/vfio/pci/nvme/nvme.h
> @@ -0,0 +1,36 @@
> +/* SPDX-License-Identifier: GPL-2.0-only */
> +/*
> + * Copyright (c) 2022, INTEL CORPORATION. All rights reserved
> + * Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved
> + */
> +
> +#ifndef NVME_VFIO_PCI_H
> +#define NVME_VFIO_PCI_H
> +
> +#include <linux/kernel.h>
> +#include <linux/vfio_pci_core.h>
> +#include <linux/nvme.h>
> +
> +struct nvmevf_migration_file {
> +	struct file *filp;
> +	struct mutex lock;
> +	bool disabled;
> +	u8 *vf_data;
> +	size_t total_length;
> +};
> +
> +struct nvmevf_pci_core_device {
> +	struct vfio_pci_core_device core_device;
> +	int vf_id;
> +	u8 migrate_cap:1;
> +	u8 deferred_reset:1;
> +	/* protect migration state */
> +	struct mutex state_mutex;
> +	enum vfio_device_mig_state mig_state;
> +	/* protect the reset_done flow */
> +	spinlock_t reset_lock;
> +	struct nvmevf_migration_file *resuming_migf;
> +	struct nvmevf_migration_file *saving_migf;
> +};
> +
> +#endif /* NVME_VFIO_PCI_H */


  parent reply	other threads:[~2025-08-04 17:16 UTC|newest]

Thread overview: 9+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2025-08-03  2:47 [RFC PATCH 0/4] Add new VFIO PCI driver for NVMe devices Chaitanya Kulkarni
2025-08-03  2:47 ` [RFC PATCH 1/4] vfio-nvme: add vfio-nvme lm driver infrastructure Chaitanya Kulkarni
2025-08-04 15:20   ` Shameerali Kolothum Thodi
2025-08-04 16:43   ` Bjorn Helgaas
2025-08-04 17:15   ` Alex Williamson [this message]
2025-08-03  2:47 ` [RFC PATCH 2/4] nvme: add live migration TP 4159 definitions Chaitanya Kulkarni
2025-08-03  2:47 ` [RFC PATCH 3/4] nvme: export helpers to implement vfio-nvme lm Chaitanya Kulkarni
2025-08-03  2:47 ` [RFC PATCH 4/4] vfio-nvme: implement TP4159 live migration cmds Chaitanya Kulkarni
2025-08-04 16:41   ` Bjorn Helgaas

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20250804111556.3ba2c832.alex.williamson@redhat.com \
    --to=alex.williamson@redhat.com \
    --cc=Konrad.wilk@oracle.com \
    --cc=arnd@arndb.de \
    --cc=axboe@fb.com \
    --cc=bhelgaas@google.com \
    --cc=cohuck@redhat.com \
    --cc=hch@lst.de \
    --cc=jgg@ziepe.ca \
    --cc=jmeneghi@redhat.com \
    --cc=joao.m.martins@oracle.com \
    --cc=kbusch@kernel.org \
    --cc=kch@nvidia.com \
    --cc=kevin.tian@intel.com \
    --cc=kvm@vger.kernel.org \
    --cc=lei.rao@intel.com \
    --cc=linux-nvme@lists.infradead.org \
    --cc=martin.petersen@oracle.com \
    --cc=mgurtovoy@nvidia.com \
    --cc=mjrosato@linux.ibm.com \
    --cc=sagi@grimberg.me \
    --cc=schnelle@linux.ibm.com \
    --cc=shameerali.kolothum.thodi@huawei.com \
    --cc=yishaih@nvidia.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).