All of lore.kernel.org
 help / color / mirror / Atom feed
From: Chaitanya Kulkarni <kch@nvidia.com>
To: <kbusch@kernel.org>, <axboe@fb.com>, <hch@lst.de>,
	<sagi@grimberg.me>, <alex.williamson@redhat.com>,
	<cohuck@redhat.com>, <jgg@ziepe.ca>, <yishaih@nvidia.com>,
	<shameerali.kolothum.thodi@huawei.com>, <kevin.tian@intel.com>,
	<mjrosato@linux.ibm.com>, <mgurtovoy@nvidia.com>
Cc: <linux-nvme@lists.infradead.org>, <kvm@vger.kernel.org>,
	<Konrad.wilk@oracle.com>, <martin.petersen@oracle.com>,
	<jmeneghi@redhat.com>, <arnd@arndb.de>, <schnelle@linux.ibm.com>,
	<bhelgaas@google.com>, <joao.m.martins@oracle.com>,
	Chaitanya Kulkarni <kch@nvidia.com>, Lei Rao <lei.rao@intel.com>
Subject: [RFC PATCH 1/4] vfio-nvme: add vfio-nvme lm driver infrastructure
Date: Sat, 2 Aug 2025 19:47:02 -0700	[thread overview]
Message-ID: <20250803024705.10256-2-kch@nvidia.com> (raw)
In-Reply-To: <20250803024705.10256-1-kch@nvidia.com>

Add foundational infrastructure for vfio-nvme, enabling support for live
migration of NVMe devices via the VFIO framework. The following
components are included:

- Core driver skeleton for vfio-nvme support under drivers/vfio/pci/nvme/
- Definitions of basic data structures used in live migration
  (e.g., nvmevf_pci_core_device and nvmevf_migration_file)
- Implementation of helper routines for managing migration file state
- Integration of PCI driver callbacks and error handling logic
- Registration with vfio-pci-core through nvmevf_pci_ops
- Initial support for VFIO migration states and device open/close flows

Subsequent patches will build upon this base to implement actual live
migration commands and complete the vfio device state handling logic.

Signed-off-by: Lei Rao <lei.rao@intel.com>
Signed-off-by: Max Gurtovoy <mgurtovoy@nvidia.com>
Signed-off-by: Chaitanya Kulkarni <kch@nvidia.com>
---
 drivers/vfio/pci/Kconfig       |   2 +
 drivers/vfio/pci/Makefile      |   2 +
 drivers/vfio/pci/nvme/Kconfig  |  10 ++
 drivers/vfio/pci/nvme/Makefile |   3 +
 drivers/vfio/pci/nvme/nvme.c   | 196 +++++++++++++++++++++++++++++++++
 drivers/vfio/pci/nvme/nvme.h   |  36 ++++++
 6 files changed, 249 insertions(+)
 create mode 100644 drivers/vfio/pci/nvme/Kconfig
 create mode 100644 drivers/vfio/pci/nvme/Makefile
 create mode 100644 drivers/vfio/pci/nvme/nvme.c
 create mode 100644 drivers/vfio/pci/nvme/nvme.h

diff --git a/drivers/vfio/pci/Kconfig b/drivers/vfio/pci/Kconfig
index 2b0172f54665..8f94429e7adc 100644
--- a/drivers/vfio/pci/Kconfig
+++ b/drivers/vfio/pci/Kconfig
@@ -67,4 +67,6 @@ source "drivers/vfio/pci/nvgrace-gpu/Kconfig"
 
 source "drivers/vfio/pci/qat/Kconfig"
 
+source "drivers/vfio/pci/nvme/Kconfig"
+
 endmenu
diff --git a/drivers/vfio/pci/Makefile b/drivers/vfio/pci/Makefile
index cf00c0a7e55c..be8c4b5ee0ba 100644
--- a/drivers/vfio/pci/Makefile
+++ b/drivers/vfio/pci/Makefile
@@ -10,6 +10,8 @@ obj-$(CONFIG_VFIO_PCI) += vfio-pci.o
 
 obj-$(CONFIG_MLX5_VFIO_PCI)           += mlx5/
 
+obj-$(CONFIG_NVME_VFIO_PCI) += nvme/
+
 obj-$(CONFIG_HISI_ACC_VFIO_PCI) += hisilicon/
 
 obj-$(CONFIG_PDS_VFIO_PCI) += pds/
diff --git a/drivers/vfio/pci/nvme/Kconfig b/drivers/vfio/pci/nvme/Kconfig
new file mode 100644
index 000000000000..12e0eaba0de1
--- /dev/null
+++ b/drivers/vfio/pci/nvme/Kconfig
@@ -0,0 +1,10 @@
+# SPDX-License-Identifier: GPL-2.0-only
+config NVME_VFIO_PCI
+	tristate "VFIO support for NVMe PCI devices"
+	depends on NVME_CORE
+	depends on VFIO_PCI_CORE
+	help
+	  This provides migration support for NVMe devices using the
+	  VFIO framework.
+
+	  If you don't know what to do here, say N.
diff --git a/drivers/vfio/pci/nvme/Makefile b/drivers/vfio/pci/nvme/Makefile
new file mode 100644
index 000000000000..2f4a0ad3d9cf
--- /dev/null
+++ b/drivers/vfio/pci/nvme/Makefile
@@ -0,0 +1,3 @@
+# SPDX-License-Identifier: GPL-2.0-only
+obj-$(CONFIG_NVME_VFIO_PCI) += nvme-vfio-pci.o
+nvme-vfio-pci-y := nvme.o
diff --git a/drivers/vfio/pci/nvme/nvme.c b/drivers/vfio/pci/nvme/nvme.c
new file mode 100644
index 000000000000..08bee3274207
--- /dev/null
+++ b/drivers/vfio/pci/nvme/nvme.c
@@ -0,0 +1,196 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (c) 2022, INTEL CORPORATION. All rights reserved
+ * Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved
+ */
+
+#include <linux/device.h>
+#include <linux/eventfd.h>
+#include <linux/file.h>
+#include <linux/interrupt.h>
+#include <linux/module.h>
+#include <linux/mutex.h>
+#include <linux/pci.h>
+#include <linux/types.h>
+#include <linux/vfio.h>
+#include <linux/anon_inodes.h>
+#include <linux/kernel.h>
+#include <linux/vfio_pci_core.h>
+
+#include "nvme.h"
+
+static void nvmevf_disable_fd(struct nvmevf_migration_file *migf)
+{
+	mutex_lock(&migf->lock);
+
+	/* release the device states buffer */
+	kvfree(migf->vf_data);
+	migf->vf_data = NULL;
+	migf->disabled = true;
+	migf->total_length = 0;
+	migf->filp->f_pos = 0;
+	mutex_unlock(&migf->lock);
+}
+
+static void nvmevf_disable_fds(struct nvmevf_pci_core_device *nvmevf_dev)
+{
+	if (nvmevf_dev->resuming_migf) {
+		nvmevf_disable_fd(nvmevf_dev->resuming_migf);
+		fput(nvmevf_dev->resuming_migf->filp);
+		nvmevf_dev->resuming_migf = NULL;
+	}
+
+	if (nvmevf_dev->saving_migf) {
+		nvmevf_disable_fd(nvmevf_dev->saving_migf);
+		fput(nvmevf_dev->saving_migf->filp);
+		nvmevf_dev->saving_migf = NULL;
+	}
+}
+
+static void nvmevf_state_mutex_unlock(struct nvmevf_pci_core_device *nvmevf_dev)
+{
+	lockdep_assert_held(&nvmevf_dev->state_mutex);
+again:
+	spin_lock(&nvmevf_dev->reset_lock);
+	if (nvmevf_dev->deferred_reset) {
+		nvmevf_dev->deferred_reset = false;
+		spin_unlock(&nvmevf_dev->reset_lock);
+		nvmevf_dev->mig_state = VFIO_DEVICE_STATE_RUNNING;
+		nvmevf_disable_fds(nvmevf_dev);
+		goto again;
+	}
+	mutex_unlock(&nvmevf_dev->state_mutex);
+	spin_unlock(&nvmevf_dev->reset_lock);
+}
+
+static struct nvmevf_pci_core_device *nvmevf_drvdata(struct pci_dev *pdev)
+{
+	struct vfio_pci_core_device *core_device = dev_get_drvdata(&pdev->dev);
+
+	return container_of(core_device, struct nvmevf_pci_core_device,
+			    core_device);
+}
+
+static int nvmevf_pci_open_device(struct vfio_device *core_vdev)
+{
+	struct nvmevf_pci_core_device *nvmevf_dev;
+	struct vfio_pci_core_device *vdev;
+	int ret;
+
+	nvmevf_dev = container_of(core_vdev, struct nvmevf_pci_core_device,
+			core_device.vdev);
+	vdev = &nvmevf_dev->core_device;
+
+	ret = vfio_pci_core_enable(vdev);
+	if (ret)
+		return ret;
+
+	if (nvmevf_dev->migrate_cap)
+		nvmevf_dev->mig_state = VFIO_DEVICE_STATE_RUNNING;
+	vfio_pci_core_finish_enable(vdev);
+	return 0;
+}
+
+static void nvmevf_pci_close_device(struct vfio_device *core_vdev)
+{
+	struct nvmevf_pci_core_device *nvmevf_dev;
+
+	nvmevf_dev = container_of(core_vdev, struct nvmevf_pci_core_device,
+			core_device.vdev);
+
+	if (nvmevf_dev->migrate_cap) {
+		mutex_lock(&nvmevf_dev->state_mutex);
+		nvmevf_disable_fds(nvmevf_dev);
+		nvmevf_state_mutex_unlock(nvmevf_dev);
+	}
+
+	vfio_pci_core_close_device(core_vdev);
+}
+
+static const struct vfio_device_ops nvmevf_pci_ops = {
+	.name = "nvme-vfio-pci",
+	.release = vfio_pci_core_release_dev,
+	.open_device = nvmevf_pci_open_device,
+	.close_device = nvmevf_pci_close_device,
+	.ioctl = vfio_pci_core_ioctl,
+	.device_feature = vfio_pci_core_ioctl_feature,
+	.read = vfio_pci_core_read,
+	.write = vfio_pci_core_write,
+	.mmap = vfio_pci_core_mmap,
+	.request = vfio_pci_core_request,
+	.match = vfio_pci_core_match,
+};
+
+static int nvmevf_pci_probe(struct pci_dev *pdev,
+			    const struct pci_device_id *id)
+{
+	struct nvmevf_pci_core_device *nvmevf_dev;
+	int ret;
+
+	nvmevf_dev = vfio_alloc_device(nvmevf_pci_core_device, core_device.vdev,
+				       &pdev->dev, &nvmevf_pci_ops);
+	if (IS_ERR(nvmevf_dev))
+		return PTR_ERR(nvmevf_dev);
+
+	dev_set_drvdata(&pdev->dev, &nvmevf_dev->core_device);
+	ret = vfio_pci_core_register_device(&nvmevf_dev->core_device);
+	if (ret)
+		goto out_put_dev;
+
+	return 0;
+
+out_put_dev:
+	vfio_put_device(&nvmevf_dev->core_device.vdev);
+	return ret;
+}
+
+static void nvmevf_pci_remove(struct pci_dev *pdev)
+{
+	struct nvmevf_pci_core_device *nvmevf_dev = nvmevf_drvdata(pdev);
+
+	vfio_pci_core_unregister_device(&nvmevf_dev->core_device);
+	vfio_put_device(&nvmevf_dev->core_device.vdev);
+}
+
+static void nvmevf_pci_aer_reset_done(struct pci_dev *pdev)
+{
+	struct nvmevf_pci_core_device *nvmevf_dev = nvmevf_drvdata(pdev);
+
+	if (!nvmevf_dev->migrate_cap)
+		return;
+
+	/*
+	 * As the higher VFIO layers are holding locks across reset and using
+	 * those same locks with the mm_lock we need to prevent ABBA deadlock
+	 * with the state_mutex and mm_lock.
+	 * In case the state_mutex was taken already we defer the cleanup work
+	 * to the unlock flow of the other running context.
+	 */
+	spin_lock(&nvmevf_dev->reset_lock);
+	nvmevf_dev->deferred_reset = true;
+	if (!mutex_trylock(&nvmevf_dev->state_mutex)) {
+		spin_unlock(&nvmevf_dev->reset_lock);
+		return;
+	}
+	spin_unlock(&nvmevf_dev->reset_lock);
+	nvmevf_state_mutex_unlock(nvmevf_dev);
+}
+
+static const struct pci_error_handlers nvmevf_err_handlers = {
+	.reset_done = nvmevf_pci_aer_reset_done,
+	.error_detected = vfio_pci_core_aer_err_detected,
+};
+
+static struct pci_driver nvmevf_pci_driver = {
+	.name = KBUILD_MODNAME,
+	.probe = nvmevf_pci_probe,
+	.remove = nvmevf_pci_remove,
+	.err_handler = &nvmevf_err_handlers,
+	.driver_managed_dma = true,
+};
+
+module_pci_driver(nvmevf_pci_driver);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Chaitanya Kulkarni <kch@nvidia.com>");
+MODULE_DESCRIPTION("NVMe VFIO PCI - VFIO PCI driver with live migration support for NVMe");
diff --git a/drivers/vfio/pci/nvme/nvme.h b/drivers/vfio/pci/nvme/nvme.h
new file mode 100644
index 000000000000..ee602254679e
--- /dev/null
+++ b/drivers/vfio/pci/nvme/nvme.h
@@ -0,0 +1,36 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright (c) 2022, INTEL CORPORATION. All rights reserved
+ * Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved
+ */
+
+#ifndef NVME_VFIO_PCI_H
+#define NVME_VFIO_PCI_H
+
+#include <linux/kernel.h>
+#include <linux/vfio_pci_core.h>
+#include <linux/nvme.h>
+
+struct nvmevf_migration_file {
+	struct file *filp;
+	struct mutex lock;
+	bool disabled;
+	u8 *vf_data;
+	size_t total_length;
+};
+
+struct nvmevf_pci_core_device {
+	struct vfio_pci_core_device core_device;
+	int vf_id;
+	u8 migrate_cap:1;
+	u8 deferred_reset:1;
+	/* protect migration state */
+	struct mutex state_mutex;
+	enum vfio_device_mig_state mig_state;
+	/* protect the reset_done flow */
+	spinlock_t reset_lock;
+	struct nvmevf_migration_file *resuming_migf;
+	struct nvmevf_migration_file *saving_migf;
+};
+
+#endif /* NVME_VFIO_PCI_H */
-- 
2.40.0


  reply	other threads:[~2025-08-03  2:48 UTC|newest]

Thread overview: 11+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2025-08-03  2:47 [RFC PATCH 0/4] Add new VFIO PCI driver for NVMe devices Chaitanya Kulkarni
2025-08-03  2:47 ` Chaitanya Kulkarni [this message]
2025-08-04 15:20   ` [RFC PATCH 1/4] vfio-nvme: add vfio-nvme lm driver infrastructure Shameerali Kolothum Thodi
2025-08-04 16:43   ` Bjorn Helgaas
2025-08-04 17:15   ` Alex Williamson
2025-08-03  2:47 ` [RFC PATCH 2/4] nvme: add live migration TP 4159 definitions Chaitanya Kulkarni
2025-08-03 23:04   ` kernel test robot
2025-08-03  2:47 ` [RFC PATCH 3/4] nvme: export helpers to implement vfio-nvme lm Chaitanya Kulkarni
2025-08-03  2:47 ` [RFC PATCH 4/4] vfio-nvme: implement TP4159 live migration cmds Chaitanya Kulkarni
2025-08-03 20:40   ` kernel test robot
2025-08-04 16:41   ` Bjorn Helgaas

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20250803024705.10256-2-kch@nvidia.com \
    --to=kch@nvidia.com \
    --cc=Konrad.wilk@oracle.com \
    --cc=alex.williamson@redhat.com \
    --cc=arnd@arndb.de \
    --cc=axboe@fb.com \
    --cc=bhelgaas@google.com \
    --cc=cohuck@redhat.com \
    --cc=hch@lst.de \
    --cc=jgg@ziepe.ca \
    --cc=jmeneghi@redhat.com \
    --cc=joao.m.martins@oracle.com \
    --cc=kbusch@kernel.org \
    --cc=kevin.tian@intel.com \
    --cc=kvm@vger.kernel.org \
    --cc=lei.rao@intel.com \
    --cc=linux-nvme@lists.infradead.org \
    --cc=martin.petersen@oracle.com \
    --cc=mgurtovoy@nvidia.com \
    --cc=mjrosato@linux.ibm.com \
    --cc=sagi@grimberg.me \
    --cc=schnelle@linux.ibm.com \
    --cc=shameerali.kolothum.thodi@huawei.com \
    --cc=yishaih@nvidia.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.