From: Klaus Jensen <its@irrelevant.dk>
To: Stefan Hajnoczi <stefanha@redhat.com>
Cc: qemu-block@nongnu.org, "Klaus Jensen" <k.jensen@samsung.com>,
"Markus Armbruster" <armbru@redhat.com>,
qemu-devel@nongnu.org, "Hannes Reinecke" <hare@suse.de>,
"Keith Busch" <kbusch@kernel.org>,
"Philippe Mathieu-Daudé" <philmd@redhat.com>
Subject: Re: [PATCH 4/4] hw/nvme: fix controller hot unplugging
Date: Wed, 7 Jul 2021 18:47:50 +0200 [thread overview]
Message-ID: <YOXatgr6weL1zy9s@apples.localdomain> (raw)
In-Reply-To: <YOXS4QFXOPXLW5//@stefanha-x1.localdomain>
[-- Attachment #1: Type: text/plain, Size: 11858 bytes --]
On Jul 7 17:14, Stefan Hajnoczi wrote:
>On Wed, Jul 07, 2021 at 12:43:56PM +0200, Hannes Reinecke wrote:
>> On 7/7/21 11:53 AM, Klaus Jensen wrote:
>> > On Jul 7 09:49, Hannes Reinecke wrote:
>> > > On 7/6/21 11:33 AM, Klaus Jensen wrote:
>> > > > From: Klaus Jensen <k.jensen@samsung.com>
>> > > >
>> > > > Prior to this patch the nvme-ns devices are always children of the
>> > > > NvmeBus owned by the NvmeCtrl. This causes the namespaces to be
>> > > > unrealized when the parent device is removed. However, when subsystems
>> > > > are involved, this is not what we want since the namespaces may be
>> > > > attached to other controllers as well.
>> > > >
>> > > > This patch adds an additional NvmeBus on the subsystem device. When
>> > > > nvme-ns devices are realized, if the parent controller device is linked
>> > > > to a subsystem, the parent bus is set to the subsystem one instead. This
>> > > > makes sure that namespaces are kept alive and not unrealized.
>> > > >
>> > > > Signed-off-by: Klaus Jensen <k.jensen@samsung.com>
>> > > > ---
>> > > > hw/nvme/nvme.h | 18 ++++++++++--------
>> > > > hw/nvme/ctrl.c | 8 +++++---
>> > > > hw/nvme/ns.c | 32 +++++++++++++++++++++++++-------
>> > > > hw/nvme/subsys.c | 4 ++++
>> > > > 4 files changed, 44 insertions(+), 18 deletions(-)
>> > > >
>> > > > diff --git a/hw/nvme/nvme.h b/hw/nvme/nvme.h
>> > > > index c4065467d877..9401e212f9f7 100644
>> > > > --- a/hw/nvme/nvme.h
>> > > > +++ b/hw/nvme/nvme.h
>> > > > @@ -33,12 +33,21 @@ QEMU_BUILD_BUG_ON(NVME_MAX_NAMESPACES >
>> > > > NVME_NSID_BROADCAST - 1);
>> > > > typedef struct NvmeCtrl NvmeCtrl;
>> > > > typedef struct NvmeNamespace NvmeNamespace;
>> > > > +#define TYPE_NVME_BUS "nvme-bus"
>> > > > +OBJECT_DECLARE_SIMPLE_TYPE(NvmeBus, NVME_BUS)
>> > > > +
>> > > > +typedef struct NvmeBus {
>> > > > + BusState parent_bus;
>> > > > + bool is_subsys;
>> > > > +} NvmeBus;
>> > > > +
>> > > > #define TYPE_NVME_SUBSYS "nvme-subsys"
>> > > > #define NVME_SUBSYS(obj) \
>> > > > OBJECT_CHECK(NvmeSubsystem, (obj), TYPE_NVME_SUBSYS)
>> > > > typedef struct NvmeSubsystem {
>> > > > DeviceState parent_obj;
>> > > > + NvmeBus bus;
>> > > > uint8_t subnqn[256];
>> > > > NvmeCtrl *ctrls[NVME_MAX_CONTROLLERS];
>> > > > @@ -365,13 +374,6 @@ typedef struct NvmeCQueue {
>> > > > QTAILQ_HEAD(, NvmeRequest) req_list;
>> > > > } NvmeCQueue;
>> > > > -#define TYPE_NVME_BUS "nvme-bus"
>> > > > -#define NVME_BUS(obj) OBJECT_CHECK(NvmeBus, (obj), TYPE_NVME_BUS)
>> > > > -
>> > > > -typedef struct NvmeBus {
>> > > > - BusState parent_bus;
>> > > > -} NvmeBus;
>> > > > -
>> > > > #define TYPE_NVME "nvme"
>> > > > #define NVME(obj) \
>> > > > OBJECT_CHECK(NvmeCtrl, (obj), TYPE_NVME)
>> > > > @@ -463,7 +465,7 @@ typedef struct NvmeCtrl {
>> > > > static inline NvmeNamespace *nvme_ns(NvmeCtrl *n, uint32_t nsid)
>> > > > {
>> > > > - if (!nsid || nsid > NVME_MAX_NAMESPACES) {
>> > > > + if (!n || !nsid || nsid > NVME_MAX_NAMESPACES) {
>> > > > return NULL;
>> > > > }
>> > > > diff --git a/hw/nvme/ctrl.c b/hw/nvme/ctrl.c
>> > > > index 90e3ee2b70ee..7c8fca36d9a5 100644
>> > > > --- a/hw/nvme/ctrl.c
>> > > > +++ b/hw/nvme/ctrl.c
>> > > > @@ -6516,11 +6516,13 @@ static void nvme_exit(PCIDevice *pci_dev)
>> > > > for (i = 1; i <= NVME_MAX_NAMESPACES; i++) {
>> > > > ns = nvme_ns(n, i);
>> > > > - if (!ns) {
>> > > > - continue;
>> > > > + if (ns) {
>> > > > + ns->attached--;
>> > > > }
>> > > > + }
>> > > > - nvme_ns_cleanup(ns);
>> > > > + if (n->subsys) {
>> > > > + nvme_subsys_unregister_ctrl(n->subsys, n);
>> > > > }
>> > > > if (n->subsys) {
>> > > > diff --git a/hw/nvme/ns.c b/hw/nvme/ns.c
>> > > > index 3c4f5b8c714a..612a2786d75d 100644
>> > > > --- a/hw/nvme/ns.c
>> > > > +++ b/hw/nvme/ns.c
>> > > > @@ -444,13 +444,29 @@ void nvme_ns_cleanup(NvmeNamespace *ns)
>> > > > static void nvme_ns_realize(DeviceState *dev, Error **errp)
>> > > > {
>> > > > NvmeNamespace *ns = NVME_NS(dev);
>> > > > - BusState *s = qdev_get_parent_bus(dev);
>> > > > - NvmeCtrl *n = NVME(s->parent);
>> > > > - NvmeSubsystem *subsys = n->subsys;
>> > > > + BusState *qbus = qdev_get_parent_bus(dev);
>> > > > + NvmeBus *bus = NVME_BUS(qbus);
>> > > > + NvmeCtrl *n = NULL;
>> > > > + NvmeSubsystem *subsys = NULL;
>> > > > uint32_t nsid = ns->params.nsid;
>> > > > int i;
>> > > > - if (!n->subsys) {
>> > > > + if (bus->is_subsys) {
>> > > > + subsys = NVME_SUBSYS(qbus->parent);
>> > > > + } else {
>> > > > + n = NVME(qbus->parent);
>> > > > + subsys = n->subsys;
>> > > > + }
>> > > > +
>> > > > + if (subsys) {
>> > > > + /*
>> > > > + * If this namespace belongs to a subsystem (through a
>> > > > link on the
>> > > > + * controller device), reparent the device.
>> > > > + */
>> > > > + if (!qdev_set_parent_bus(dev, &subsys->bus.parent_bus, errp)) {
>> > > > + return;
>> > > > + }
>> > > > + } else {
>> > > > if (ns->params.detached) {
>> > > > error_setg(errp, "detached requires that the nvme
>> > > > device is "
>> > > > "linked to an nvme-subsys device");
>> > > > @@ -470,7 +486,7 @@ static void nvme_ns_realize(DeviceState
>> > > > *dev, Error **errp)
>> > > > if (!nsid) {
>> > > > for (i = 1; i <= NVME_MAX_NAMESPACES; i++) {
>> > > > - if (nvme_ns(n, i) || nvme_subsys_ns(subsys, i)) {
>> > > > + if (nvme_subsys_ns(subsys, i) || nvme_ns(n, i)) {
>> > > > continue;
>> > > > }
>> > > > @@ -483,7 +499,7 @@ static void nvme_ns_realize(DeviceState
>> > > > *dev, Error **errp)
>> > > > return;
>> > > > }
>> > > > } else {
>> > > > - if (nvme_ns(n, nsid) || nvme_subsys_ns(subsys, nsid)) {
>> > > > + if (nvme_subsys_ns(subsys, nsid) || nvme_ns(n, nsid)) {
>> > > > error_setg(errp, "namespace id '%d' already
>> > > > allocated", nsid);
>> > > > return;
>> > > > }
>> > > > @@ -509,7 +525,9 @@ static void nvme_ns_realize(DeviceState
>> > > > *dev, Error **errp)
>> > > > }
>> > > > }
>> > > > - nvme_attach_ns(n, ns);
>> > > > + if (n) {
>> > > > + nvme_attach_ns(n, ns);
>> > > > + }
>> > > > }
>> > > > static Property nvme_ns_props[] = {
>> > > > diff --git a/hw/nvme/subsys.c b/hw/nvme/subsys.c
>> > > > index 92caa604a280..fb7c3a7c55fc 100644
>> > > > --- a/hw/nvme/subsys.c
>> > > > +++ b/hw/nvme/subsys.c
>> > > > @@ -50,6 +50,10 @@ static void nvme_subsys_realize(DeviceState
>> > > > *dev, Error **errp)
>> > > > {
>> > > > NvmeSubsystem *subsys = NVME_SUBSYS(dev);
>> > > > + qbus_create_inplace(&subsys->bus, sizeof(NvmeBus),
>> > > > TYPE_NVME_BUS, dev,
>> > > > + dev->id);
>> > > > + subsys->bus.is_subsys = true;
>> > > > +
>> > > > nvme_subsys_setup(subsys);
>> > > > }
>> > > >
>> > > Why not always create a subsystem, and distinguish between 'shared'
>> > > and 'non-shared' subsystems instead of the ->subsys pointer?
>> > > That way all namespaces can be children of the subsystem, we won't
>> > > need any reparenting, and the whole thing will be more in-line with
>> > > qdev, no?
>> > >
>> >
>> > Hi Hannes,
>> >
>> > Thanks for your reviews and comments!
>> >
>> > So, I have actually considered what you suggest.
>> >
>> > The problem is that the nvme-ns device currently expects to plug into a
>> > bus (an 'nvme-bus') and we cannot really get rid of that 'bus' parameter
>> > without breaking compatibility. I experimented with removing the bus
>> > from the nvme device and instead creating it in the nvme-subsys device.
>> > My idea here was to implicitly always create an nvme-subsys if not
>> > explicitly created by the user, but since nvme-subsys does not plug into
>> > any bus itself, it is not exposed in the qtree and thus not reachable
>> > (i.e., when realizing the nvme-ns device, it will complain that there
>> > are no 'nvme-bus' available to plug into). To make this work, I believe
>> > we'd have to have the nvme-subsys plug into the main system bus (or some
>> > other bus rooted at main-system-bus), and I'd prefer not to do that
>> > since this is already a flawed design and I think it would be more
>> > intrusive than what my patch does.
>> >
>> Sigh.
>> _Again_.
>>
>> I seem to trip over issues for which no patch can possibly be accepted as
>> first the infrastructure has to be reworked.
>>
>> ... there is a reason why I'm avoiding posting patches to qemu-devel :-(
>>
>> > I raised these issues on the mailinglist some time ago[1]. We didn't
>> > really find a good solution, but I think the conclusion is that the
>> > bus/device design of subsystems and namespaces is flawed. That's why I
>> > am working on an "objectification" of the two devices as suggested by
>> > Stefan[2] as a proper fix for this mess.
>> >
>> Actually, I _do_ think that it would be good to have an nvme-subsystem bus,
>> providing a list of all namespaces for that subsystem.
>> Creating a controller would then mean that one has to create a controller
>> and a namespace _separately_, as then the namespace would _not_ be a child
>> of the controller.
>> But that's arguably a good thing, as the namespaces _do_ have a separate
>> lifetime from controllers.
>> And the lifetime of the subsystem could be completely self-contained; the
>> controller would be looking up subsystems via the subsystem NQN if present;
>> I guess we'll need to create ad-hoc subsystems for PCIe.
>> But nothing insurmountable.
>> And certainly nothing which require a large-scale rework of qemu internals,
>> I would think.
>
>I don't remember the reason to use --object instead of --device. Maybe
>the --device approach is viable and it makes some sense because this is
>a "device" (like a SCSI LUN) that exposed to the guest. It would be
>necessary to have a nvme-subsys device on the system bus to act as the
>bus for nvme-ns devices as Klaus described.
>
A namespace (-device nvme-ns) does not expose anything on its own to the
guest. Whatever the namespace provides (basically the association to a
-drive) is accessed through the pcie controller (-device nvme).
Similarly, the subsystem is just a concept for allowing multiple
controllers to access the same namespace. And this is where the model
clash because the parent/child relationship doesnt fit. Reversing the
relationship (controller is a child of a namespace) also doesnt work
since a contrller may have multiple namespace attached. A many-to-many
relationship is required which cannot be expresseed with QDEV as far as
I understand.
Talk is cheap, but I have a WIP that shims the -device's and moves the
core functionality of namespaces and subsystems to -objects. This makes
the setup much more flexible and fits QDEV/QOM much better.
[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 488 bytes --]
next prev parent reply other threads:[~2021-07-07 16:51 UTC|newest]
Thread overview: 16+ messages / expand[flat|nested] mbox.gz Atom feed top
2021-07-06 9:33 [PATCH 0/4] hw/nvme: fix controller hotplugging Klaus Jensen
2021-07-06 9:33 ` [PATCH 1/4] hw/nvme: remove NvmeCtrl parameter from ns setup/check functions Klaus Jensen
2021-07-07 7:45 ` Hannes Reinecke
2021-07-06 9:33 ` [PATCH 2/4] hw/nvme: mark nvme-subsys non-hotpluggable Klaus Jensen
2021-07-07 7:45 ` Hannes Reinecke
2021-07-06 9:33 ` [PATCH 3/4] hw/nvme: unregister controller with subsystem at exit Klaus Jensen
2021-07-07 7:46 ` Hannes Reinecke
2021-07-06 9:33 ` [PATCH 4/4] hw/nvme: fix controller hot unplugging Klaus Jensen
2021-07-07 7:49 ` Hannes Reinecke
2021-07-07 9:53 ` Klaus Jensen
2021-07-07 10:43 ` Hannes Reinecke
2021-07-07 15:41 ` Klaus Jensen
2021-07-07 16:14 ` Stefan Hajnoczi
2021-07-07 16:47 ` Klaus Jensen [this message]
2021-07-08 6:15 ` Hannes Reinecke
2021-07-07 10:02 ` Klaus Jensen
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=YOXatgr6weL1zy9s@apples.localdomain \
--to=its@irrelevant.dk \
--cc=armbru@redhat.com \
--cc=hare@suse.de \
--cc=k.jensen@samsung.com \
--cc=kbusch@kernel.org \
--cc=philmd@redhat.com \
--cc=qemu-block@nongnu.org \
--cc=qemu-devel@nongnu.org \
--cc=stefanha@redhat.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).