On Thu, Apr 09, 2026 at 11:34:51PM +0200, mr-083 wrote: > Add hotplug support for nvme-ns devices on the NvmeBus. This enables > namespace-level hot-swap without removing the NVMe controller, matching > the behavior of physical NVMe drives hot-swapped in the same PCIe slot. If we rely purely on NVMe's AEN, then this is not equivalent to swapping physical drives in the same PCIe slot. Maybe adjust the wording to reflect that this is NVMe-level Namespace hotplug? > Mark nvme-ns devices as hotpluggable and register the NvmeBus as a > hotplug handler with proper plug and unplug callbacks: > > - plug: attach namespace to all started controllers and send an > Asynchronous Event Notification (AEN) with NS_ATTR_CHANGED so > the guest kernel rescans namespaces and adds the block device > - unplug: detach from all controllers, send AEN, remove from > subsystem, then unrealize the device. The guest kernel rescans > and removes the block device. > > The plug handler skips controllers that haven't started yet > (qs_created == false) to avoid interfering with boot-time namespace > attachment in nvme_start_ctrl(). > > Both the controller bus and subsystem bus are configured as hotplug > handlers via qbus_set_bus_hotplug_handler() since nvme-ns devices > may reparent to the subsystem bus during realize. > > Example hot-swap sequence using the NVMe subsystem model: > > # Boot with: -device nvme-subsys,id=subsys0 > # -device nvme,id=ctrl0,subsys=subsys0 > # -device nvme-ns,id=ns0,drive=drv0,bus=ctrl0,nsid=1 > > device_del ns0 # guest receives AEN, removes /dev/nvme0n1 > drive_del drv0 > drive_add 0 file=disk.qcow2,format=qcow2,id=drv0,if=none > device_add nvme-ns,id=ns0,drive=drv0,bus=ctrl0,nsid=1 > # guest receives AEN, adds /dev/nvme0n1 > > Tested with Linux 6.1 guest (NVMe driver processes AEN and rescans > namespace list automatically). Did you test with a Windows Server guest? If not, I can try that next week in case there are any surprises. 
> > Signed-off-by: Matthieu Receveur > --- > hw/nvme/ctrl.c | 85 ++++++++++++++++++++++++++++++++++++++++++++++++ > hw/nvme/ns.c | 1 + > hw/nvme/subsys.c | 2 ++ > 3 files changed, 88 insertions(+) > > diff --git a/hw/nvme/ctrl.c b/hw/nvme/ctrl.c > index be6c7028cb..5502e4ea2b 100644 > --- a/hw/nvme/ctrl.c > +++ b/hw/nvme/ctrl.c > @@ -206,6 +206,7 @@ > #include "system/hostmem.h" > #include "hw/pci/msix.h" > #include "hw/pci/pcie_sriov.h" > +#include "hw/core/qdev.h" > #include "system/spdm-socket.h" > #include "migration/vmstate.h" > > @@ -9293,6 +9294,7 @@ static void nvme_realize(PCIDevice *pci_dev, Error **errp) > } > > qbus_init(&n->bus, sizeof(NvmeBus), TYPE_NVME_BUS, dev, dev->id); > + qbus_set_bus_hotplug_handler(BUS(&n->bus)); > > if (nvme_init_subsys(n, errp)) { > return; > @@ -9553,10 +9555,93 @@ static const TypeInfo nvme_info = { > }, > }; > > +static void nvme_ns_hot_plug(HotplugHandler *hotplug_dev, DeviceState *dev, > + Error **errp) > +{ > + NvmeNamespace *ns = NVME_NS(dev); > + NvmeSubsystem *subsys = ns->subsys; > + uint32_t nsid = ns->params.nsid; > + int i; > + > + /* > + * Attach to all started controllers and notify via AEN. > + * Skip controllers that haven't started yet (boot-time realize) — > + * nvme_start_ctrl() will attach namespaces during controller init. 
> + */ > + for (i = 0; i < NVME_MAX_CONTROLLERS; i++) { > + NvmeCtrl *ctrl = nvme_subsys_ctrl(subsys, i); > + if (!ctrl || !ctrl->qs_created) { > + continue; > + } > + > + if (nvme_csi_supported(ctrl, ns->csi) && !ns->params.detached) { > + nvme_attach_ns(ctrl, ns); > + nvme_update_dsm_limits(ctrl, ns); > + > + if (!test_and_set_bit(nsid, ctrl->changed_nsids)) { > + nvme_enqueue_event(ctrl, NVME_AER_TYPE_NOTICE, > + NVME_AER_INFO_NOTICE_NS_ATTR_CHANGED, > + NVME_LOG_CHANGED_NSLIST); > + } > + } > + } > +} > + > +static void nvme_ns_hot_unplug(HotplugHandler *hotplug_dev, DeviceState *dev, > + Error **errp) > +{ > + NvmeNamespace *ns = NVME_NS(dev); > + NvmeSubsystem *subsys = ns->subsys; > + uint32_t nsid = ns->params.nsid; > + int i; While there is qdev_unrealize -> nvme_ns_unrealize -> nvme_ns_drain -> blk_drain to quiesce I/O requests at the end of this function, I wonder whether it's safe to start removing the namespace before I/O has been drained. Did you test hot unplug while the Namespace is under heavy I/O (e.g. fio job running inside the guest with lots of queued I/O requests)? It might be necessary to stop I/O first before tearing down the namespace. > + > + /* > + * Detach from all controllers and notify the guest via AEN. > + * Must happen before unrealize to avoid use-after-free when the > + * guest sends I/O to a freed namespace. > + */ > + for (i = 0; i < NVME_MAX_CONTROLLERS; i++) { > + NvmeCtrl *ctrl = nvme_subsys_ctrl(subsys, i); > + if (!ctrl || !nvme_ns(ctrl, nsid)) { > + continue; > + } > + > + nvme_detach_ns(ctrl, ns); > + nvme_update_dsm_limits(ctrl, NULL); > + > + if (!test_and_set_bit(nsid, ctrl->changed_nsids)) { > + nvme_enqueue_event(ctrl, NVME_AER_TYPE_NOTICE, > + NVME_AER_INFO_NOTICE_NS_ATTR_CHANGED, > + NVME_LOG_CHANGED_NSLIST); > + } > + } > + > + /* Remove from subsystem namespace list. 
*/ > + subsys->namespaces[nsid] = NULL; The dual of this operation is done in nvme_ns_realize(): subsys->namespaces[nsid] = ns; Maybe nvme_ns_unrealize() should remove the namespace from the subsystem for consistency? I guess the lack of removal was never an issue before hot unplug, but now it would be nice to implement the lifecycle. > + > + /* > + * Unrealize: drain I/O, flush, cleanup structures, remove from QOM. > + * nvme_ns_unrealize() handles drain/shutdown/cleanup internally. > + */ > + qdev_unrealize(dev); > +} > + > +static void nvme_bus_class_init(ObjectClass *klass, const void *data) > +{ > + HotplugHandlerClass *hc = HOTPLUG_HANDLER_CLASS(klass); > + hc->plug = nvme_ns_hot_plug; > + hc->unplug = nvme_ns_hot_unplug; > +} > + > static const TypeInfo nvme_bus_info = { > .name = TYPE_NVME_BUS, > .parent = TYPE_BUS, > .instance_size = sizeof(NvmeBus), > + .class_init = nvme_bus_class_init, > + .interfaces = (const InterfaceInfo[]) { > + { TYPE_HOTPLUG_HANDLER }, > + { } > + }, > }; > > static void nvme_register_types(void) > diff --git a/hw/nvme/ns.c b/hw/nvme/ns.c > index b0106eaa5c..eb628c0734 100644 > --- a/hw/nvme/ns.c > +++ b/hw/nvme/ns.c > @@ -937,6 +937,7 @@ static void nvme_ns_class_init(ObjectClass *oc, const void *data) > dc->bus_type = TYPE_NVME_BUS; > dc->realize = nvme_ns_realize; > dc->unrealize = nvme_ns_unrealize; > + dc->hotpluggable = true; > device_class_set_props(dc, nvme_ns_props); > dc->desc = "Virtual NVMe namespace"; > } > diff --git a/hw/nvme/subsys.c b/hw/nvme/subsys.c > index 777e1c620f..fa35055d3c 100644 > --- a/hw/nvme/subsys.c > +++ b/hw/nvme/subsys.c > @@ -9,6 +9,7 @@ > #include "qemu/osdep.h" > #include "qemu/units.h" > #include "qapi/error.h" > +#include "hw/core/qdev.h" > > #include "nvme.h" > > @@ -205,6 +206,7 @@ static void nvme_subsys_realize(DeviceState *dev, Error **errp) > NvmeSubsystem *subsys = NVME_SUBSYS(dev); > > qbus_init(&subsys->bus, sizeof(NvmeBus), TYPE_NVME_BUS, dev, dev->id); > + 
qbus_set_bus_hotplug_handler(BUS(&subsys->bus)); > > nvme_subsys_setup(subsys, errp); > } > -- > 2.53.0 >