* [PATCH v9 1/4] driver core: don't always lock parent in shutdown
2024-10-09 17:57 [PATCH v9 0/4] shut down devices asynchronously Stuart Hayes
@ 2024-10-09 17:57 ` Stuart Hayes
2024-10-09 17:57 ` [PATCH v9 2/4] driver core: separate function to shutdown one device Stuart Hayes
` (3 subsequent siblings)
4 siblings, 0 replies; 16+ messages in thread
From: Stuart Hayes @ 2024-10-09 17:57 UTC (permalink / raw)
To: linux-kernel, Greg Kroah-Hartman, Rafael J . Wysocki,
Martin Belanger, Oliver O'Halloran, Daniel Wagner,
Keith Busch, Lukas Wunner, David Jeffery, Jeremy Allison,
Jens Axboe, Christoph Hellwig, Sagi Grimberg, linux-nvme,
Nathan Chancellor, Jan Kiszka, Bert Karwatzki
Cc: Stuart Hayes
Don't lock a parent device unless it is needed in device_shutdown. This
is in preparation for making device shutdown asynchronous, when it will
be needed to allow children of a common parent to shut down
simultaneously.
Signed-off-by: Stuart Hayes <stuart.w.hayes@gmail.com>
Signed-off-by: David Jeffery <djeffery@redhat.com>
---
drivers/base/core.c | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/drivers/base/core.c b/drivers/base/core.c
index a4c853411a6b..2bf9730db056 100644
--- a/drivers/base/core.c
+++ b/drivers/base/core.c
@@ -4816,7 +4816,7 @@ void device_shutdown(void)
spin_unlock(&devices_kset->list_lock);
/* hold lock to avoid race with probe/release */
- if (parent)
+ if (parent && dev->bus && dev->bus->need_parent_lock)
device_lock(parent);
device_lock(dev);
@@ -4840,7 +4840,7 @@ void device_shutdown(void)
}
device_unlock(dev);
- if (parent)
+ if (parent && dev->bus && dev->bus->need_parent_lock)
device_unlock(parent);
put_device(dev);
--
2.39.3
^ permalink raw reply related [flat|nested] 16+ messages in thread* [PATCH v9 2/4] driver core: separate function to shutdown one device
2024-10-09 17:57 [PATCH v9 0/4] shut down devices asynchronously Stuart Hayes
2024-10-09 17:57 ` [PATCH v9 1/4] driver core: don't always lock parent in shutdown Stuart Hayes
@ 2024-10-09 17:57 ` Stuart Hayes
2024-10-09 17:57 ` [PATCH v9 3/4] driver core: shut down devices asynchronously Stuart Hayes
` (2 subsequent siblings)
4 siblings, 0 replies; 16+ messages in thread
From: Stuart Hayes @ 2024-10-09 17:57 UTC (permalink / raw)
To: linux-kernel, Greg Kroah-Hartman, Rafael J . Wysocki,
Martin Belanger, Oliver O'Halloran, Daniel Wagner,
Keith Busch, Lukas Wunner, David Jeffery, Jeremy Allison,
Jens Axboe, Christoph Hellwig, Sagi Grimberg, linux-nvme,
Nathan Chancellor, Jan Kiszka, Bert Karwatzki
Cc: Stuart Hayes
Make a separate function for the part of device_shutdown() that does the
shutdown for a single device. This is in preparation for making device
shutdown asynchronous.
Signed-off-by: Stuart Hayes <stuart.w.hayes@gmail.com>
Signed-off-by: David Jeffery <djeffery@redhat.com>
---
drivers/base/core.c | 66 ++++++++++++++++++++++++---------------------
1 file changed, 36 insertions(+), 30 deletions(-)
diff --git a/drivers/base/core.c b/drivers/base/core.c
index 2bf9730db056..4482382fb947 100644
--- a/drivers/base/core.c
+++ b/drivers/base/core.c
@@ -4779,6 +4779,41 @@ int device_change_owner(struct device *dev, kuid_t kuid, kgid_t kgid)
}
EXPORT_SYMBOL_GPL(device_change_owner);
+static void shutdown_one_device(struct device *dev)
+{
+ /* hold lock to avoid race with probe/release */
+ if (dev->parent && dev->bus && dev->bus->need_parent_lock)
+ device_lock(dev->parent);
+ device_lock(dev);
+
+ /* Don't allow any more runtime suspends */
+ pm_runtime_get_noresume(dev);
+ pm_runtime_barrier(dev);
+
+ if (dev->class && dev->class->shutdown_pre) {
+ if (initcall_debug)
+ dev_info(dev, "shutdown_pre\n");
+ dev->class->shutdown_pre(dev);
+ }
+ if (dev->bus && dev->bus->shutdown) {
+ if (initcall_debug)
+ dev_info(dev, "shutdown\n");
+ dev->bus->shutdown(dev);
+ } else if (dev->driver && dev->driver->shutdown) {
+ if (initcall_debug)
+ dev_info(dev, "shutdown\n");
+ dev->driver->shutdown(dev);
+ }
+
+ device_unlock(dev);
+ if (dev->parent && dev->bus && dev->bus->need_parent_lock)
+ device_unlock(dev->parent);
+
+ put_device(dev);
+ if (dev->parent)
+ put_device(dev->parent);
+}
+
/**
* device_shutdown - call ->shutdown() on each device to shutdown.
*/
@@ -4815,36 +4850,7 @@ void device_shutdown(void)
list_del_init(&dev->kobj.entry);
spin_unlock(&devices_kset->list_lock);
- /* hold lock to avoid race with probe/release */
- if (parent && dev->bus && dev->bus->need_parent_lock)
- device_lock(parent);
- device_lock(dev);
-
- /* Don't allow any more runtime suspends */
- pm_runtime_get_noresume(dev);
- pm_runtime_barrier(dev);
-
- if (dev->class && dev->class->shutdown_pre) {
- if (initcall_debug)
- dev_info(dev, "shutdown_pre\n");
- dev->class->shutdown_pre(dev);
- }
- if (dev->bus && dev->bus->shutdown) {
- if (initcall_debug)
- dev_info(dev, "shutdown\n");
- dev->bus->shutdown(dev);
- } else if (dev->driver && dev->driver->shutdown) {
- if (initcall_debug)
- dev_info(dev, "shutdown\n");
- dev->driver->shutdown(dev);
- }
-
- device_unlock(dev);
- if (parent && dev->bus && dev->bus->need_parent_lock)
- device_unlock(parent);
-
- put_device(dev);
- put_device(parent);
+ shutdown_one_device(dev);
spin_lock(&devices_kset->list_lock);
}
--
2.39.3
^ permalink raw reply related [flat|nested] 16+ messages in thread* [PATCH v9 3/4] driver core: shut down devices asynchronously
2024-10-09 17:57 [PATCH v9 0/4] shut down devices asynchronously Stuart Hayes
2024-10-09 17:57 ` [PATCH v9 1/4] driver core: don't always lock parent in shutdown Stuart Hayes
2024-10-09 17:57 ` [PATCH v9 2/4] driver core: separate function to shutdown one device Stuart Hayes
@ 2024-10-09 17:57 ` Stuart Hayes
2024-10-09 17:57 ` [PATCH v9 4/4] nvme-pci: Make driver prefer asynchronous shutdown Stuart Hayes
2024-10-11 4:22 ` [PATCH v9 0/4] shut down devices asynchronously Michael Kelley
4 siblings, 0 replies; 16+ messages in thread
From: Stuart Hayes @ 2024-10-09 17:57 UTC (permalink / raw)
To: linux-kernel, Greg Kroah-Hartman, Rafael J . Wysocki,
Martin Belanger, Oliver O'Halloran, Daniel Wagner,
Keith Busch, Lukas Wunner, David Jeffery, Jeremy Allison,
Jens Axboe, Christoph Hellwig, Sagi Grimberg, linux-nvme,
Nathan Chancellor, Jan Kiszka, Bert Karwatzki
Cc: Stuart Hayes
Add code to allow asynchronous shutdown of devices, ensuring that each
device is shut down before its parents & suppliers. Any devices that are
ordered in the devices_kset list such that a parent or supplier would
shut down before its child or consumer are shut down synchronously (as
they are without this patch).
Only devices with drivers that have async_shutdown_enable enabled will be
shut down asynchronously.
This can dramatically reduce system shutdown/reboot time on systems that
have multiple devices that take many seconds to shut down (like certain
NVMe drives). On one system tested, the shutdown time went from 11 minutes
without this patch to 55 seconds with the patch.
Signed-off-by: Stuart Hayes <stuart.w.hayes@gmail.com>
Signed-off-by: David Jeffery <djeffery@redhat.com>
---
drivers/base/base.h | 4 ++
drivers/base/core.c | 81 ++++++++++++++++++++++++++++++++++-
include/linux/device/driver.h | 2 +
3 files changed, 86 insertions(+), 1 deletion(-)
diff --git a/drivers/base/base.h b/drivers/base/base.h
index 8cf04a557bdb..ea18aa70f151 100644
--- a/drivers/base/base.h
+++ b/drivers/base/base.h
@@ -10,6 +10,7 @@
* shared outside of the drivers/base/ directory.
*
*/
+#include <linux/async.h>
#include <linux/notifier.h>
/**
@@ -97,6 +98,8 @@ struct driver_private {
* the device; typically because it depends on another driver getting
* probed first.
* @async_driver - pointer to device driver awaiting probe via async_probe
+ * @shutdown_after - used during device shutdown to ensure correct shutdown
+ * ordering.
* @device - pointer back to the struct device that this structure is
* associated with.
* @dead - This device is currently either in the process of or has been
@@ -114,6 +117,7 @@ struct device_private {
struct list_head deferred_probe;
const struct device_driver *async_driver;
char *deferred_probe_reason;
+ async_cookie_t shutdown_after;
struct device *device;
u8 dead:1;
};
diff --git a/drivers/base/core.c b/drivers/base/core.c
index 4482382fb947..bde3fdeafbdb 100644
--- a/drivers/base/core.c
+++ b/drivers/base/core.c
@@ -9,6 +9,7 @@
*/
#include <linux/acpi.h>
+#include <linux/async.h>
#include <linux/blkdev.h>
#include <linux/cleanup.h>
#include <linux/cpufreq.h>
@@ -3524,6 +3525,7 @@ static int device_private_init(struct device *dev)
klist_init(&dev->p->klist_children, klist_children_get,
klist_children_put);
INIT_LIST_HEAD(&dev->p->deferred_probe);
+ dev->p->shutdown_after = 0;
return 0;
}
@@ -4779,6 +4781,8 @@ int device_change_owner(struct device *dev, kuid_t kuid, kgid_t kgid)
}
EXPORT_SYMBOL_GPL(device_change_owner);
+static ASYNC_DOMAIN(sd_domain);
+
static void shutdown_one_device(struct device *dev)
{
/* hold lock to avoid race with probe/release */
@@ -4814,12 +4818,61 @@ static void shutdown_one_device(struct device *dev)
put_device(dev->parent);
}
+static bool device_wants_async_shutdown(struct device *dev)
+{
+ if (dev->driver && dev->driver->async_shutdown_enable)
+ return true;
+
+ return false;
+}
+
+/**
+ * shutdown_one_device_async
+ * @data: the pointer to the struct device to be shutdown
+ * @cookie: not used
+ *
+ * Shuts down one device, after waiting for previous device to shut down (for
+ * synchronous shutdown) or waiting for device's last child or consumer to
+ * be shutdown (for async shutdown).
+ *
+ * shutdown_after is set to the shutdown cookie of the last child or consumer
+ * of this device (if any).
+ */
+static void shutdown_one_device_async(void *data, async_cookie_t cookie)
+{
+ struct device *dev = data;
+ async_cookie_t wait = cookie;
+
+ if (device_wants_async_shutdown(dev)) {
+ wait = dev->p->shutdown_after + 1;
+ /*
+ * To prevent system hang, revert to sync shutdown in the event
+ * that shutdown_after would make this shutdown wait for a
+ * shutdown that hasn't been scheduled yet.
+ *
+ * This can happen if a parent or supplier is not ordered in the
+ * devices_kset list before a child or consumer, which is not
+ * expected.
+ */
+ if (wait > cookie) {
+ wait = cookie;
+ dev_warn(dev, "Unsafe shutdown ordering, forcing sync order\n");
+ }
+ }
+
+ async_synchronize_cookie_domain(wait, &sd_domain);
+ shutdown_one_device(dev);
+}
+
/**
* device_shutdown - call ->shutdown() on each device to shutdown.
*/
void device_shutdown(void)
{
struct device *dev, *parent;
+ async_cookie_t cookie;
+ struct device_link *link;
+ int idx;
wait_for_device_probe();
device_block_probing();
@@ -4850,11 +4903,37 @@ void device_shutdown(void)
list_del_init(&dev->kobj.entry);
spin_unlock(&devices_kset->list_lock);
- shutdown_one_device(dev);
+ get_device(dev);
+ get_device(parent);
+
+ cookie = async_schedule_domain(shutdown_one_device_async,
+ dev, &sd_domain);
+ /*
+ * Ensure any parent & suppliers will wait for this device to
+ * shut down
+ */
+ if (parent) {
+ parent->p->shutdown_after = cookie;
+ put_device(parent);
+ }
+
+ idx = device_links_read_lock();
+ list_for_each_entry_rcu(link, &dev->links.suppliers, c_node,
+ device_links_read_lock_held()) {
+ /*
+ * sync_state_only devlink consumers aren't dependent on
+ * suppliers
+ */
+ if (!device_link_flag_is_sync_state_only(link->flags))
+ link->supplier->p->shutdown_after = cookie;
+ }
+ device_links_read_unlock(idx);
+ put_device(dev);
spin_lock(&devices_kset->list_lock);
}
spin_unlock(&devices_kset->list_lock);
+ async_synchronize_full_domain(&sd_domain);
}
/*
diff --git a/include/linux/device/driver.h b/include/linux/device/driver.h
index 5c04b8e3833b..14c9211b82d6 100644
--- a/include/linux/device/driver.h
+++ b/include/linux/device/driver.h
@@ -56,6 +56,7 @@ enum probe_type {
* @mod_name: Used for built-in modules.
* @suppress_bind_attrs: Disables bind/unbind via sysfs.
* @probe_type: Type of the probe (synchronous or asynchronous) to use.
+ * @async_shutdown_enable: Enables devices to be shutdown asynchronously.
* @of_match_table: The open firmware table.
* @acpi_match_table: The ACPI match table.
* @probe: Called to query the existence of a specific device,
@@ -102,6 +103,7 @@ struct device_driver {
bool suppress_bind_attrs; /* disables bind/unbind via sysfs */
enum probe_type probe_type;
+ bool async_shutdown_enable;
const struct of_device_id *of_match_table;
const struct acpi_device_id *acpi_match_table;
--
2.39.3
^ permalink raw reply related [flat|nested] 16+ messages in thread* [PATCH v9 4/4] nvme-pci: Make driver prefer asynchronous shutdown
2024-10-09 17:57 [PATCH v9 0/4] shut down devices asynchronously Stuart Hayes
` (2 preceding siblings ...)
2024-10-09 17:57 ` [PATCH v9 3/4] driver core: shut down devices asynchronously Stuart Hayes
@ 2024-10-09 17:57 ` Stuart Hayes
2024-10-11 4:22 ` [PATCH v9 0/4] shut down devices asynchronously Michael Kelley
4 siblings, 0 replies; 16+ messages in thread
From: Stuart Hayes @ 2024-10-09 17:57 UTC (permalink / raw)
To: linux-kernel, Greg Kroah-Hartman, Rafael J . Wysocki,
Martin Belanger, Oliver O'Halloran, Daniel Wagner,
Keith Busch, Lukas Wunner, David Jeffery, Jeremy Allison,
Jens Axboe, Christoph Hellwig, Sagi Grimberg, linux-nvme,
Nathan Chancellor, Jan Kiszka, Bert Karwatzki
Cc: Stuart Hayes
Set the driver default to enable asynchronous shutdown.
Signed-off-by: Stuart Hayes <stuart.w.hayes@gmail.com>
Signed-off-by: David Jeffery <djeffery@redhat.com>
---
drivers/nvme/host/pci.c | 1 +
1 file changed, 1 insertion(+)
diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index 7990c3f22ecf..1cbff7537788 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -3597,6 +3597,7 @@ static struct pci_driver nvme_driver = {
.shutdown = nvme_shutdown,
.driver = {
.probe_type = PROBE_PREFER_ASYNCHRONOUS,
+ .async_shutdown_enable = true,
#ifdef CONFIG_PM_SLEEP
.pm = &nvme_dev_pm_ops,
#endif
--
2.39.3
^ permalink raw reply related [flat|nested] 16+ messages in thread* RE: [PATCH v9 0/4] shut down devices asynchronously
2024-10-09 17:57 [PATCH v9 0/4] shut down devices asynchronously Stuart Hayes
` (3 preceding siblings ...)
2024-10-09 17:57 ` [PATCH v9 4/4] nvme-pci: Make driver prefer asynchronous shutdown Stuart Hayes
@ 2024-10-11 4:22 ` Michael Kelley
2024-10-11 15:52 ` Laurence Oberman
2024-10-18 3:26 ` Michael Kelley
4 siblings, 2 replies; 16+ messages in thread
From: Michael Kelley @ 2024-10-11 4:22 UTC (permalink / raw)
To: Stuart Hayes, linux-kernel@vger.kernel.org, Greg Kroah-Hartman,
Rafael J . Wysocki, Martin Belanger, Oliver O'Halloran,
Daniel Wagner, Keith Busch, Lukas Wunner, David Jeffery,
Jeremy Allison, Jens Axboe, Christoph Hellwig, Sagi Grimberg,
linux-nvme@lists.infradead.org, Nathan Chancellor, Jan Kiszka,
Bert Karwatzki
From: Stuart Hayes <stuart.w.hayes@gmail.com> Sent: Wednesday, October 9, 2024 10:58 AM
>
> This adds the ability for the kernel to shutdown devices asynchronously.
>
> Only devices with drivers that enable it are shut down asynchronously.
>
> This can dramatically reduce system shutdown/reboot time on systems that
> have multiple devices that take many seconds to shut down (like certain
> NVMe drives). On one system tested, the shutdown time went from 11 minutes
> without this patch to 55 seconds with the patch.
I've been testing this series against a 6.11.0 kernel in an Azure VM, which
is running as a guest on the Microsoft Hyper-V hypervisor. The VM has 16 vCPUs,
128 Gbytes of memory, and two physical NVMe controllers that are mapped
into the VM so that it can access them directly.
But I wanted to confirm that the two NVMe controllers are being shutdown
in parallel. So before doing a shutdown, I set
/sys/module/kernel/parameters/initcall_debug to "Y" so the shutdown
of each device is recorded in the console output. Here's the full set of
device shutdown messages:
172.609825 platform intel_rapl_msr.0: shutdown
172.611940 mlx5_ib.rdma mlx5_core.rdma.0: shutdown
172.613931 mlx5_core.eth mlx5_core.eth.0: shutdown
172.618116 nvme c2b7:00:00.0: shutdown
172.618262 nvme 132e:00:00.0: shutdown
172.618349 mlx5_core 1610:00:02.0: shutdown
172.618359 mlx5_core 1610:00:02.0: Shutdown was called
172.782768 hv_pci ba152dae-1610-4c67-b925-81ac4902e4ce: shutdown
172.786405 sd 0:0:0:1: shutdown
172.788788 sd 0:0:0:0: shutdown
172.789949 sd 0:0:0:0: [sda] Synchronizing SCSI cache
172.794209 atkbd serio0: shutdown
172.795974 hv_utils 242ff919-07db-4180-9c2e-b86cb68c8c55: shutdown
172.800432 hv_pci 0cdfe983-132e-434b-8025-fc9ab43c0fc5: shutdown
172.802812 hv_pci 2394da4f-c2b7-43bd-b72f-d3482ef6850a: shutdown
172.805145 hv_netvsc 0022487e-1043-0022-487e-10430022487e: shutdown
172.807575 hyperv_fb 5620e0c7-8062-4dce-aeb7-520c7ef76171: shutdown
172.810026 hyperv_keyboard d34b2567-b9b6-42b9-8778-0a4ec0b955bf: shutdown
172.812522 hid_hyperv 58f75a6d-d949-4320-99e1-a2a2576d581c: shutdown
172.814982 hv_balloon 1eccfd72-4b41-45ef-b73a-4a6e44c12924: shutdown
172.817376 vmbus c4e5e7d1-d748-4afc-979d-683167910a55: shutdown
172.819789 hv_storvsc f8b3781b-1e82-4818-a1c3-63d806ec15bb: shutdown
172.822324 hv_storvsc f8b3781a-1e82-4818-a1c3-63d806ec15bb: shutdown
172.824813 hv_utils 2dd1ce17-079e-403c-b352-a1921ee207ee: shutdown
172.827199 hv_utils b6650ff7-33bc-4840-8048-e0676786f393: shutdown
172.829653 hv_utils fd149e91-82e0-4a7d-afa6-2a4166cbd7c0: shutdown
172.836408 platform eisa.0: shutdown
172.838558 alarmtimer alarmtimer.0.auto: shutdown
172.842461 platform Fixed MDIO bus.0: shutdown
172.864709 kgdboc kgdboc: shutdown
172.878009 serial8250 serial8250: shutdown
172.889725 platform pcspkr: shutdown
172.904386 rtc_cmos 00:02: shutdown
172.906217 serial 00:01: shutdown
172.907799 serial 00:00: shutdown
172.910427 platform efivars.0: shutdown
172.913341 platform rtc-efi.0: shutdown
172.915470 vmgenid HYPER_V_GEN_COUNTER_V1:00: shutdown
172.917479 vmbus VMBUS:00: shutdown
172.919012 platform PNP0003:00: shutdown
172.926707 reg-dummy reg-dummy: shutdown
172.961360 ACPI: PM: Preparing to enter system sleep state S5
You see the Mellanox CX-5 NIC, the two NVMe devices, various Hyper-V
virtual devices, and platform devices being shutdown. Everything seems to
work properly, so that's good. The two NVMe devices are shutdown very
close in time, so they are probably being done in parallel.
As a comparison, I did the same thing with an unmodified 6.11.0 kernel.
Indeed, the NVMe device shutdowns are significantly more apart in time
(110 milliseconds). That's not noticeably slow like the NVMe devices you
were dealing with, but doing them in parallel helps a little bit.
But here's the kicker: The overall process of shutting down the devices
took *longer* with the patch set than without. Here's the same output
from a 6.11.0 kernel without the patch set:
745.455493 platform intel_rapl_msr.0: shutdown
745.456999 mlx5_ib.rdma mlx5_core.rdma.0: shutdown
745.458557 mlx5_core.eth mlx5_core.eth.0: shutdown
745.460166 mlx5_core 1610:00:02.0: shutdown
745.461570 mlx5_core 1610:00:02.0: Shutdown was called
745.466053 nvme 132e:00:00.0: shutdown
745.579284 nvme c2b7:00:00.0: shutdown
745.718739 hv_pci ba152dae-1610-4c67-b925-81ac4902e4ce: shutdown
745.721114 sd 0:0:0:1: shutdown
745.722254 sd 0:0:0:0: shutdown
745.723357 sd 0:0:0:0: [sda] Synchronizing SCSI cache
745.725259 atkbd serio0: shutdown
745.726405 hv_utils 242ff919-07db-4180-9c2e-b86cb68c8c55: shutdown
745.728375 hv_pci 0cdfe983-132e-434b-8025-fc9ab43c0fc5: shutdown
745.730347 hv_pci 2394da4f-c2b7-43bd-b72f-d3482ef6850a: shutdown
745.732281 hv_netvsc 0022487e-1043-0022-487e-10430022487e: shutdown
745.734318 hyperv_fb 5620e0c7-8062-4dce-aeb7-520c7ef76171: shutdown
745.736488 hyperv_keyboard d34b2567-b9b6-42b9-8778-0a4ec0b955bf: shutdown
745.738628 hid_hyperv 58f75a6d-d949-4320-99e1-a2a2576d581c: shutdown
745.740770 hv_balloon 1eccfd72-4b41-45ef-b73a-4a6e44c12924: shutdown
745.742835 vmbus c4e5e7d1-d748-4afc-979d-683167910a55: shutdown
745.744765 hv_storvsc f8b3781b-1e82-4818-a1c3-63d806ec15bb: shutdown
745.746861 hv_storvsc f8b3781a-1e82-4818-a1c3-63d806ec15bb: shutdown
745.748907 hv_utils 2dd1ce17-079e-403c-b352-a1921ee207ee: shutdown
745.750948 hv_utils b6650ff7-33bc-4840-8048-e0676786f393: shutdown
745.753012 hv_utils fd149e91-82e0-4a7d-afa6-2a4166cbd7c0: shutdown
745.755000 platform eisa.0: shutdown
745.756266 alarmtimer alarmtimer.0.auto: shutdown
745.757868 platform Fixed MDIO bus.0: shutdown
745.759447 kgdboc kgdboc: shutdown
745.760679 serial8250 serial8250: shutdown
745.762110 platform pcspkr: shutdown
745.763387 rtc_cmos 00:02: shutdown
745.764726 serial 00:01: shutdown
745.765898 serial 00:00: shutdown
745.767036 platform efivars.0: shutdown
745.768783 platform rtc-efi.0: shutdown
745.770240 vmgenid HYPER_V_GEN_COUNTER_V1:00: shutdown
745.771949 vmbus VMBUS:00: shutdown
745.773197 platform PNP0003:00: shutdown
745.774540 reg-dummy reg-dummy: shutdown
745.775964 ACPI: PM: Preparing to enter system sleep state S5
There's some modest variability in the individual steps, but the 110 ms
saved on the NVMe device seems to be given back on some other
devices. I did the comparison twice with similar results. (I have the
full data set with comparisons in an Excel spreadsheet.)
Any thoughts on what might be causing this? I haven't gone into the
details of your algorithms for parallelizing, but is there any extra
overhead that could be adding to the time? Or maybe this is
something unique to Hyper-V guests. The overall difference is only
a few 10's of milliseconds, so not that big of a deal. But maybe it's
an indicator that something unexpected is happening that we should
understand.
I'll keep thinking about the issue and see if I can get any more insight.
Michael Kelley
>
> Changes from V8:
>
> Deal with shutdown hangs resulting when a parent/supplier device is
> later in the devices_kset list than its children/consumers:
> * Ignore sync_state_only devlinks for shutdown dependencies
> * Ignore shutdown_after for devices that don't want async shutdown
> * Add a sanity check to revert to sync shutdown for any device that
> would otherwise wait for a child/consumer shutdown that hasn't
> already been scheduled
>
> Changes from V7:
>
> Do not expose driver async_shutdown_enable in sysfs.
> Wrapped a long line.
>
> Changes from V6:
>
> Removed a sysfs attribute that allowed the async device shutdown to be
> "on" (with driver opt-out), "safe" (driver opt-in), or "off"... what was
> previously "safe" is now the only behavior, so drivers now only need to
> have the option to enable or disable async shutdown.
>
> Changes from V5:
>
> Separated into multiple patches to make review easier.
> Reworked some code to make it more readable
> Made devices wait for consumers to shut down, not just children
> (suggested by David Jeffery)
>
> Changes from V4:
>
> Change code to use cookies for synchronization rather than async domains
> Allow async shutdown to be disabled via sysfs, and allow driver opt-in or
> opt-out of async shutdown (when not disabled), with ability to control
> driver opt-in/opt-out via sysfs
>
> Changes from V3:
>
> Bug fix (used "parent" not "dev->parent" in device_shutdown)
>
> Changes from V2:
>
> Removed recursive functions to schedule children to be shutdown before
> parents, since existing device_shutdown loop will already do this
>
> Changes from V1:
>
> Rewritten using kernel async code (suggested by Lukas Wunner)
>
>
> Stuart Hayes (4):
> driver core: don't always lock parent in shutdown
> driver core: separate function to shutdown one device
> driver core: shut down devices asynchronously
> nvme-pci: Make driver prefer asynchronous shutdown
>
> drivers/base/base.h | 4 +
> drivers/base/core.c | 137 +++++++++++++++++++++++++++-------
> drivers/nvme/host/pci.c | 1 +
> include/linux/device/driver.h | 2 +
> 4 files changed, 118 insertions(+), 26 deletions(-)
>
> --
> 2.39.3
>
^ permalink raw reply [flat|nested] 16+ messages in thread* Re: [PATCH v9 0/4] shut down devices asynchronously
2024-10-11 4:22 ` [PATCH v9 0/4] shut down devices asynchronously Michael Kelley
@ 2024-10-11 15:52 ` Laurence Oberman
2024-10-18 3:26 ` Michael Kelley
1 sibling, 0 replies; 16+ messages in thread
From: Laurence Oberman @ 2024-10-11 15:52 UTC (permalink / raw)
To: Michael Kelley, Stuart Hayes, linux-kernel@vger.kernel.org,
Greg Kroah-Hartman, Rafael J . Wysocki, Martin Belanger,
Oliver O'Halloran, Daniel Wagner, Keith Busch, Lukas Wunner,
David Jeffery, Jeremy Allison, Jens Axboe, Christoph Hellwig,
Sagi Grimberg, linux-nvme@lists.infradead.org, Nathan Chancellor,
Jan Kiszka, Bert Karwatzki
On Fri, 2024-10-11 at 04:22 +0000, Michael Kelley wrote:
> From: Stuart Hayes <stuart.w.hayes@gmail.com> Sent: Wednesday,
> October 9, 2024 10:58 AM
> >
> > This adds the ability for the kernel to shutdown devices
> > asynchronously.
> >
> > Only devices with drivers that enable it are shut down
> > asynchronously.
> >
> > This can dramatically reduce system shutdown/reboot time on systems
> > that
> > have multiple devices that take many seconds to shut down (like
> > certain
> > NVMe drives). On one system tested, the shutdown time went from 11
> > minutes
> > without this patch to 55 seconds with the patch.
>
> I've been testing this series against a 6.11.0 kernel in an Azure VM,
> which
> is running as a guest on the Microsoft Hyper-V hypervisor. The VM has
> 16 vCPUs,
> 128 Gbytes of memory, and two physical NVMe controllers that are
> mapped
> into the VM so that it can access them directly.
>
> But I wanted to confirm that the two NVMe controllers are being
> shutdown
> in parallel. So before doing a shutdown, I set
> /sys/module/kernel/parameters/initcall_debug to "Y" so the shutdown
> of each device is recorded in the console output. Here's the full
> set of
> device shutdown messages:
>
> 172.609825 platform intel_rapl_msr.0: shutdown
> 172.611940 mlx5_ib.rdma mlx5_core.rdma.0: shutdown
> 172.613931 mlx5_core.eth mlx5_core.eth.0: shutdown
> 172.618116 nvme c2b7:00:00.0: shutdown
> 172.618262 nvme 132e:00:00.0: shutdown
> 172.618349 mlx5_core 1610:00:02.0: shutdown
> 172.618359 mlx5_core 1610:00:02.0: Shutdown was called
> 172.782768 hv_pci ba152dae-1610-4c67-b925-81ac4902e4ce: shutdown
> 172.786405 sd 0:0:0:1: shutdown
> 172.788788 sd 0:0:0:0: shutdown
> 172.789949 sd 0:0:0:0: [sda] Synchronizing SCSI cache
> 172.794209 atkbd serio0: shutdown
> 172.795974 hv_utils 242ff919-07db-4180-9c2e-b86cb68c8c55:
> shutdown
> 172.800432 hv_pci 0cdfe983-132e-434b-8025-fc9ab43c0fc5: shutdown
> 172.802812 hv_pci 2394da4f-c2b7-43bd-b72f-d3482ef6850a: shutdown
> 172.805145 hv_netvsc 0022487e-1043-0022-487e-10430022487e:
> shutdown
> 172.807575 hyperv_fb 5620e0c7-8062-4dce-aeb7-520c7ef76171:
> shutdown
> 172.810026 hyperv_keyboard d34b2567-b9b6-42b9-8778-0a4ec0b955bf:
> shutdown
> 172.812522 hid_hyperv 58f75a6d-d949-4320-99e1-a2a2576d581c:
> shutdown
> 172.814982 hv_balloon 1eccfd72-4b41-45ef-b73a-4a6e44c12924:
> shutdown
> 172.817376 vmbus c4e5e7d1-d748-4afc-979d-683167910a55: shutdown
> 172.819789 hv_storvsc f8b3781b-1e82-4818-a1c3-63d806ec15bb:
> shutdown
> 172.822324 hv_storvsc f8b3781a-1e82-4818-a1c3-63d806ec15bb:
> shutdown
> 172.824813 hv_utils 2dd1ce17-079e-403c-b352-a1921ee207ee:
> shutdown
> 172.827199 hv_utils b6650ff7-33bc-4840-8048-e0676786f393:
> shutdown
> 172.829653 hv_utils fd149e91-82e0-4a7d-afa6-2a4166cbd7c0:
> shutdown
> 172.836408 platform eisa.0: shutdown
> 172.838558 alarmtimer alarmtimer.0.auto: shutdown
> 172.842461 platform Fixed MDIO bus.0: shutdown
> 172.864709 kgdboc kgdboc: shutdown
> 172.878009 serial8250 serial8250: shutdown
> 172.889725 platform pcspkr: shutdown
> 172.904386 rtc_cmos 00:02: shutdown
> 172.906217 serial 00:01: shutdown
> 172.907799 serial 00:00: shutdown
> 172.910427 platform efivars.0: shutdown
> 172.913341 platform rtc-efi.0: shutdown
> 172.915470 vmgenid HYPER_V_GEN_COUNTER_V1:00: shutdown
> 172.917479 vmbus VMBUS:00: shutdown
> 172.919012 platform PNP0003:00: shutdown
> 172.926707 reg-dummy reg-dummy: shutdown
> 172.961360 ACPI: PM: Preparing to enter system sleep state S5
>
> You see the Mellanox CX-5 NIC, the two NVMe devices, various Hyper-V
> virtual devices, and platform devices being shutdown. Everything
> seems to
> work properly, so that's good. The two NVMe devices are shutdown very
> close in time, so they are probably being done in parallel.
>
> As a comparison, I did the same thing with an unmodified 6.11.0
> kernel.
> Indeed, the NVMe device shutdowns are significantly more apart in
> time
> (110 milliseconds). That's not noticeably slow like the NVMe devices
> you
> were dealing with, but doing them in parallel helps a little bit.
>
> But here's the kicker: The overall process of shutting down the
> devices
> took *longer* with the patch set than without. Here's the same
> output
> from a 6.11.0 kernel without the patch set:
>
> 745.455493 platform intel_rapl_msr.0: shutdown
> 745.456999 mlx5_ib.rdma mlx5_core.rdma.0: shutdown
> 745.458557 mlx5_core.eth mlx5_core.eth.0: shutdown
> 745.460166 mlx5_core 1610:00:02.0: shutdown
> 745.461570 mlx5_core 1610:00:02.0: Shutdown was
> called
> 745.466053 nvme 132e:00:00.0: shutdown
> 745.579284 nvme c2b7:00:00.0: shutdown
> 745.718739 hv_pci ba152dae-1610-4c67-b925-81ac4902e4ce:
> shutdown
> 745.721114 sd 0:0:0:1: shutdown
> 745.722254 sd 0:0:0:0: shutdown
> 745.723357 sd 0:0:0:0: [sda] Synchronizing SCSI
> cache
> 745.725259 atkbd serio0: shutdown
> 745.726405 hv_utils 242ff919-07db-4180-9c2e-b86cb68c8c55:
> shutdown
> 745.728375 hv_pci 0cdfe983-132e-434b-8025-fc9ab43c0fc5:
> shutdown
> 745.730347 hv_pci 2394da4f-c2b7-43bd-b72f-d3482ef6850a:
> shutdown
> 745.732281 hv_netvsc 0022487e-1043-0022-487e-10430022487e:
> shutdown
> 745.734318 hyperv_fb 5620e0c7-8062-4dce-aeb7-520c7ef76171:
> shutdown
> 745.736488 hyperv_keyboard d34b2567-b9b6-42b9-8778-0a4ec0b955bf:
> shutdown
> 745.738628 hid_hyperv 58f75a6d-d949-4320-99e1-a2a2576d581c:
> shutdown
> 745.740770 hv_balloon 1eccfd72-4b41-45ef-b73a-4a6e44c12924:
> shutdown
> 745.742835 vmbus c4e5e7d1-d748-4afc-979d-683167910a55:
> shutdown
> 745.744765 hv_storvsc f8b3781b-1e82-4818-a1c3-63d806ec15bb:
> shutdown
> 745.746861 hv_storvsc f8b3781a-1e82-4818-a1c3-63d806ec15bb:
> shutdown
> 745.748907 hv_utils 2dd1ce17-079e-403c-b352-a1921ee207ee:
> shutdown
> 745.750948 hv_utils b6650ff7-33bc-4840-8048-e0676786f393:
> shutdown
> 745.753012 hv_utils fd149e91-82e0-4a7d-afa6-2a4166cbd7c0:
> shutdown
> 745.755000 platform eisa.0: shutdown
> 745.756266 alarmtimer alarmtimer.0.auto: shutdown
> 745.757868 platform Fixed MDIO bus.0: shutdown
> 745.759447 kgdboc kgdboc: shutdown
> 745.760679 serial8250 serial8250: shutdown
> 745.762110 platform pcspkr: shutdown
> 745.763387 rtc_cmos 00:02: shutdown
> 745.764726 serial 00:01: shutdown
> 745.765898 serial 00:00: shutdown
> 745.767036 platform efivars.0: shutdown
> 745.768783 platform rtc-efi.0: shutdown
> 745.770240 vmgenid HYPER_V_GEN_COUNTER_V1:00:
> shutdown
> 745.771949 vmbus VMBUS:00: shutdown
> 745.773197 platform PNP0003:00: shutdown
> 745.774540 reg-dummy reg-dummy: shutdown
> 745.775964 ACPI: PM: Preparing to enter system sleep state S5
>
> There's some modest variability in the individual steps, but the 110
> ms
> saved on the NVMe device seems to be given back on some other
> devices. I did the comparison twice with similar results. (I have the
> full data set with comparisons in an Excel spreadsheet.)
>
> Any thoughts on what might be causing this? I haven't gone into the
> details of your algorithms for parallelizing, but is there any extra
> overhead that could be adding to the time? Or maybe this is
> something unique to Hyper-V guests. The overall difference is only
> a few 10's of milliseconds, so not that big of a deal. But maybe it's
> an indicator that something unexpected is happening that we should
> understand.
>
> I'll keep thinking about the issue and see if I can get any more
> insight.
>
> Michael Kelley
>
> >
> > Changes from V8:
> >
> > Deal with shutdown hangs resulting when a parent/supplier device is
> > later in the devices_kset list than its children/consumers:
> > * Ignore sync_state_only devlinks for shutdown dependencies
> > * Ignore shutdown_after for devices that don't want async
> > shutdown
> > * Add a sanity check to revert to sync shutdown for any device
> > that
> > would otherwise wait for a child/consumer shutdown that hasn't
> > already been scheduled
> >
> > Changes from V7:
> >
> > Do not expose driver async_shutdown_enable in sysfs.
> > Wrapped a long line.
> >
> > Changes from V6:
> >
> > Removed a sysfs attribute that allowed the async device shutdown to
> > be
> > "on" (with driver opt-out), "safe" (driver opt-in), or "off"...
> > what was
> > previously "safe" is now the only behavior, so drivers now only
> > need to
> > have the option to enable or disable async shutdown.
> >
> > Changes from V5:
> >
> > Separated into multiple patches to make review easier.
> > Reworked some code to make it more readable
> > Made devices wait for consumers to shut down, not just children
> > (suggested by David Jeffery)
> >
> > Changes from V4:
> >
> > Change code to use cookies for synchronization rather than async
> > domains
> > Allow async shutdown to be disabled via sysfs, and allow driver
> > opt-in or
> > opt-out of async shutdown (when not disabled), with ability to
> > control
> > driver opt-in/opt-out via sysfs
> >
> > Changes from V3:
> >
> > Bug fix (used "parent" not "dev->parent" in device_shutdown)
> >
> > Changes from V2:
> >
> > Removed recursive functions to schedule children to be shutdown
> > before
> > parents, since existing device_shutdown loop will already do this
> >
> > Changes from V1:
> >
> > Rewritten using kernel async code (suggested by Lukas Wunner)
> >
> >
> > Stuart Hayes (4):
> > driver core: don't always lock parent in shutdown
> > driver core: separate function to shutdown one device
> > driver core: shut down devices asynchronously
> > nvme-pci: Make driver prefer asynchronous shutdown
> >
> > drivers/base/base.h | 4 +
> > drivers/base/core.c | 137 +++++++++++++++++++++++++++---
> > ----
> > drivers/nvme/host/pci.c | 1 +
> > include/linux/device/driver.h | 2 +
> > 4 files changed, 118 insertions(+), 26 deletions(-)
> >
> > --
> > 2.39.3
> >
>
>
Hopefully helpful.
Interesting, once again I tested PATCH bundle V9 in the Red Hat lab and
I again see a great improvement of 6 to 8 time faster on a 24 nvme
device server.
Measuring this using dmesg timestamps gets me a shutdown in about 8s
versus 50+ seconds.
The problem with my testing is that I don't have all the hardware. For
example, what failed last time on V8 (SOC board) and was reported is
not covered by the Red Hat lab testing.
So for what it is worth.
Tested-by: Laurence Oberman <loberman@redhat.com>
^ permalink raw reply [flat|nested] 16+ messages in thread
* RE: [PATCH v9 0/4] shut down devices asynchronously
2024-10-11 4:22 ` [PATCH v9 0/4] shut down devices asynchronously Michael Kelley
2024-10-11 15:52 ` Laurence Oberman
@ 2024-10-18 3:26 ` Michael Kelley
2024-10-18 5:49 ` Greg Kroah-Hartman
1 sibling, 1 reply; 16+ messages in thread
From: Michael Kelley @ 2024-10-18 3:26 UTC (permalink / raw)
To: Michael Kelley, Stuart Hayes, linux-kernel@vger.kernel.org,
Greg Kroah-Hartman, Rafael J . Wysocki, Martin Belanger,
Oliver O'Halloran, Daniel Wagner, Keith Busch, Lukas Wunner,
David Jeffery, Jeremy Allison, Jens Axboe, Christoph Hellwig,
Sagi Grimberg, linux-nvme@lists.infradead.org, Nathan Chancellor,
Jan Kiszka, Bert Karwatzki
From: Michael Kelley <mhklinux@outlook.com> Sent: Thursday, October 10, 2024 9:22 PM
>
> From: Stuart Hayes <stuart.w.hayes@gmail.com> Sent: Wednesday, October 9, 2024 10:58 AM
> >
> > This adds the ability for the kernel to shutdown devices asynchronously.
> >
> > Only devices with drivers that enable it are shut down asynchronously.
> >
> > This can dramatically reduce system shutdown/reboot time on systems that
> > have multiple devices that take many seconds to shut down (like certain
> > NVMe drives). On one system tested, the shutdown time went from 11 minutes
> > without this patch to 55 seconds with the patch.
>
> I've been testing this series against a 6.11.0 kernel in an Azure VM, which
> is running as a guest on the Microsoft Hyper-V hypervisor. The VM has 16 vCPUs,
> 128 Gbytes of memory, and two physical NVMe controllers that are mapped
> into the VM so that it can access them directly.
>
[snip]
>
> But here's the kicker: The overall process of shutting down the devices
> took *longer* with the patch set than without. Here's the same output
> from a 6.11.0 kernel without the patch set:
>
[snip]
>
> Any thoughts on what might be causing this? I haven't gone into the
> details of your algorithms for parallelizing, but is there any extra
> overhead that could be adding to the time? Or maybe this is
> something unique to Hyper-V guests. The overall difference is only
> a few 10's of milliseconds, so not that big of a deal. But maybe it's
> an indicator that something unexpected is happening that we should
> understand.
>
> I'll keep thinking about the issue and see if I can get any more insight.
I've debugged this enough to now know what is happening. I see the
same behavior in two different environments:
1) A Hyper-V VM with 8 vCPUs running on my local laptop. It has
no NVMe devices, so there's no opportunity for parallelism with this
patch set.
2) A Raspberry Pi 5 with 4 CPUs. Linux is running on the bare metal.
It has one NVMe device via the Pi 5 M.2 HAT.
In both cases, the loop in device_shutdown() goes through a lot of
devices: 492 in my Hyper-V VM, and 577 in the Pi 5. With the code
in this patch set, all the devices get added to the async_wq in
kernel/async.c. So async_wq quickly gets heavily populated.
In the process, the workqueue code spins up additional worker threads
to handle the load. On the Hyper-V VM, 210 to 230 new kernel
threads are created during device_shutdown(), depending on the
timing. On the Pi 5, 253 are created. The max for this workqueue is
WQ_DFL_ACTIVE (256).
Creating all these new worker threads, and scheduling and
synchronizing them takes 30 to 40 milliseconds of additional time
compared to the original code. On the Hyper-V VM, device_shutdown()
completes in about 5 milliseconds with the original code, and +/- 40
milliseconds with the new async code. The Pi 5 results are more
variable, but also have roughly 30 to 40 milliseconds additional.
Is all this extra work a problem? I don't know. Clearly, there's big
benefit in parallelizing the NVMe shutdown, and in those situations
the extra 30 to 40 milliseconds can be ignored. But I wonder if there
are other situations where the extra elapsed time and CPU
consumption might be a problem.
I also wonder about scalability. Does everything still work if
device_shutdown is processing 5000 devices? Is there a possibility
of deadlock if async_wq can only have 256 active items out of
5000 queued ones?
Michael Kelley
>
> >
> > Changes from V8:
> >
> > Deal with shutdown hangs resulting when a parent/supplier device is
> > later in the devices_kset list than its children/consumers:
> > * Ignore sync_state_only devlinks for shutdown dependencies
> > * Ignore shutdown_after for devices that don't want async shutdown
> > * Add a sanity check to revert to sync shutdown for any device that
> > would otherwise wait for a child/consumer shutdown that hasn't
> > already been scheduled
> >
> > Changes from V7:
> >
> > Do not expose driver async_shutdown_enable in sysfs.
> > Wrapped a long line.
> >
> > Changes from V6:
> >
> > Removed a sysfs attribute that allowed the async device shutdown to be
> > "on" (with driver opt-out), "safe" (driver opt-in), or "off"... what was
> > previously "safe" is now the only behavior, so drivers now only need to
> > have the option to enable or disable async shutdown.
> >
> > Changes from V5:
> >
> > Separated into multiple patches to make review easier.
> > Reworked some code to make it more readable
> > Made devices wait for consumers to shut down, not just children
> > (suggested by David Jeffery)
> >
> > Changes from V4:
> >
> > Change code to use cookies for synchronization rather than async domains
> > Allow async shutdown to be disabled via sysfs, and allow driver opt-in or
> > opt-out of async shutdown (when not disabled), with ability to control
> > driver opt-in/opt-out via sysfs
> >
> > Changes from V3:
> >
> > Bug fix (used "parent" not "dev->parent" in device_shutdown)
> >
> > Changes from V2:
> >
> > Removed recursive functions to schedule children to be shutdown before
> > parents, since existing device_shutdown loop will already do this
> >
> > Changes from V1:
> >
> > Rewritten using kernel async code (suggested by Lukas Wunner)
> >
> >
> > Stuart Hayes (4):
> > driver core: don't always lock parent in shutdown
> > driver core: separate function to shutdown one device
> > driver core: shut down devices asynchronously
> > nvme-pci: Make driver prefer asynchronous shutdown
> >
> > drivers/base/base.h | 4 +
> > drivers/base/core.c | 137 +++++++++++++++++++++++++++-------
> > drivers/nvme/host/pci.c | 1 +
> > include/linux/device/driver.h | 2 +
> > 4 files changed, 118 insertions(+), 26 deletions(-)
> >
> > --
> > 2.39.3
> >
>
^ permalink raw reply [flat|nested] 16+ messages in thread
* Re: [PATCH v9 0/4] shut down devices asynchronously
2024-10-18 3:26 ` Michael Kelley
@ 2024-10-18 5:49 ` Greg Kroah-Hartman
2024-10-18 9:14 ` Lukas Wunner
0 siblings, 1 reply; 16+ messages in thread
From: Greg Kroah-Hartman @ 2024-10-18 5:49 UTC (permalink / raw)
To: Michael Kelley
Cc: Stuart Hayes, linux-kernel@vger.kernel.org, Rafael J . Wysocki,
Martin Belanger, Oliver O'Halloran, Daniel Wagner,
Keith Busch, Lukas Wunner, David Jeffery, Jeremy Allison,
Jens Axboe, Christoph Hellwig, Sagi Grimberg,
linux-nvme@lists.infradead.org, Nathan Chancellor, Jan Kiszka,
Bert Karwatzki
On Fri, Oct 18, 2024 at 03:26:05AM +0000, Michael Kelley wrote:
> From: Michael Kelley <mhklinux@outlook.com> Sent: Thursday, October 10, 2024 9:22 PM
> >
> > From: Stuart Hayes <stuart.w.hayes@gmail.com> Sent: Wednesday, October 9, 2024 10:58 AM
> > >
> > > This adds the ability for the kernel to shutdown devices asynchronously.
> > >
> > > Only devices with drivers that enable it are shut down asynchronously.
> > >
> > > This can dramatically reduce system shutdown/reboot time on systems that
> > > have multiple devices that take many seconds to shut down (like certain
> > > NVMe drives). On one system tested, the shutdown time went from 11 minutes
> > > without this patch to 55 seconds with the patch.
> >
> > I've been testing this series against a 6.11.0 kernel in an Azure VM, which
> > is running as a guest on the Microsoft Hyper-V hypervisor. The VM has 16 vCPUs,
> > 128 Gbytes of memory, and two physical NVMe controllers that are mapped
> > into the VM so that it can access them directly.
> >
> [snip]
> >
> > But here's the kicker: The overall process of shutting down the devices
> > took *longer* with the patch set than without. Here's the same output
> > from a 6.11.0 kernel without the patch set:
> >
> [snip]
> >
> > Any thoughts on what might be causing this? I haven't gone into the
> > details of your algorithms for parallelizing, but is there any extra
> > overhead that could be adding to the time? Or maybe this is
> > something unique to Hyper-V guests. The overall difference is only
> > a few 10's of milliseconds, so not that big of a deal. But maybe it's
> > an indicator that something unexpected is happening that we should
> > understand.
> >
> > I'll keep thinking about the issue and see if I can get any more insight.
>
> I've debugged this enough to now know what is happening. I see the
> same behavior in two different environments:
>
> 1) A Hyper-V VM with 8 vCPUs running on my local laptop. It has
> no NVMe devices, so there's no opportunity for parallelism with this
> patch set.
>
> 2) A Raspberry Pi 5 with 4 CPUs. Linux is running on the bare metal.
> It has one NVMe device via the Pi 5 M.2 HAT.
>
> In both cases, the loop in device_shutdown() goes through a lot of
> devices: 492 in my Hyper-V VM, and 577 in the Pi 5. With the code
> in this patch set, all the devices get added to the async_wq in
> kernel/async.c. So async_wq quickly gets heavily populated.
>
> In the process, the workqueue code spins up additional worker threads
> to handle the load. On the Hyper-V VM, 210 to 230 new kernel
> threads are created during device_shutdown(), depending on the
> timing. On the Pi 5, 253 are created. The max for this workqueue is
> WQ_DFL_ACTIVE (256).
>
> Creating all these new worker threads, and scheduling and
> synchronizing them takes 30 to 40 milliseconds of additional time
> compared to the original code. On the Hyper-V VM, device_shutdown()
> completes in about 5 millisecond with the original code, and +/- 40
> milliseconds with the new async code. The Pi 5 results are more
> variable, but also have roughly 30 to 40 milliseconds additional.
>
> Is all this extra work a problem? I don't know. Clearly, there's big
> benefit in parallelizing the NVMe shutdown, and in those situations
> the extra 30 to 40 milliseconds can be ignored. But I wonder if there
> are other situations whether the extra elapsed time and CPU
> consumption might be a problem.
>
> I also wonder about scalability. Does everything still work if
> device_shutdown is processing 5000 devices? Is there a possibility
> of deadlock if async_wq can only have 256 active items out of
> 5000 queued ones?
In talking with someone off-list about this yesterday, we guessed that
this was the "thundering herd" problem that might be happening, and you
went and proved it was so! Thanks so much for doing this work.
I don't think we can put this type of load on all systems just to handle
one specific type of "bad" hardware that takes long periods of time to
shutdown, sorry.
Also think of systems with tens of thousands of devices connected to
them, with tiny 31bit processors (i.e. s390), shutting those down and
spinning up a thread for all of those devices is sure to cause us real
problems.
thanks,
greg k-h
^ permalink raw reply [flat|nested] 16+ messages in thread
* Re: [PATCH v9 0/4] shut down devices asynchronously
2024-10-18 5:49 ` Greg Kroah-Hartman
@ 2024-10-18 9:14 ` Lukas Wunner
2024-10-18 9:37 ` Greg Kroah-Hartman
0 siblings, 1 reply; 16+ messages in thread
From: Lukas Wunner @ 2024-10-18 9:14 UTC (permalink / raw)
To: Greg Kroah-Hartman
Cc: Michael Kelley, Stuart Hayes, linux-kernel@vger.kernel.org,
Rafael J . Wysocki, Martin Belanger, Oliver O'Halloran,
Daniel Wagner, Keith Busch, David Jeffery, Jeremy Allison,
Jens Axboe, Christoph Hellwig, Sagi Grimberg,
linux-nvme@lists.infradead.org, Nathan Chancellor, Jan Kiszka,
Bert Karwatzki
On Fri, Oct 18, 2024 at 07:49:51AM +0200, Greg Kroah-Hartman wrote:
> On Fri, Oct 18, 2024 at 03:26:05AM +0000, Michael Kelley wrote:
> > In the process, the workqueue code spins up additional worker threads
> > to handle the load. On the Hyper-V VM, 210 to 230 new kernel
> > threads are created during device_shutdown(), depending on the
> > timing. On the Pi 5, 253 are created. The max for this workqueue is
> > WQ_DFL_ACTIVE (256).
[...]
> I don't think we can put this type of load on all systems just to handle
> one specific type of "bad" hardware that takes long periods of time to
> shutdown, sorry.
Parallelizing shutdown means shorter reboot times, less downtime,
less cost for CSPs.
Modern servers (e.g. Sierra Forest with 288 cores) should handle
this load easily and may see significant benefits from parallelization.
Perhaps a solution is to cap async shutdown based on the number of cores,
but always use async for certain device classes (e.g. nvme_subsys_class)?
Thanks,
Lukas
^ permalink raw reply [flat|nested] 16+ messages in thread
* Re: [PATCH v9 0/4] shut down devices asynchronously
2024-10-18 9:14 ` Lukas Wunner
@ 2024-10-18 9:37 ` Greg Kroah-Hartman
2024-10-19 0:27 ` stuart hayes
0 siblings, 1 reply; 16+ messages in thread
From: Greg Kroah-Hartman @ 2024-10-18 9:37 UTC (permalink / raw)
To: Lukas Wunner
Cc: Michael Kelley, Stuart Hayes, linux-kernel@vger.kernel.org,
Rafael J . Wysocki, Martin Belanger, Oliver O'Halloran,
Daniel Wagner, Keith Busch, David Jeffery, Jeremy Allison,
Jens Axboe, Christoph Hellwig, Sagi Grimberg,
linux-nvme@lists.infradead.org, Nathan Chancellor, Jan Kiszka,
Bert Karwatzki
On Fri, Oct 18, 2024 at 11:14:51AM +0200, Lukas Wunner wrote:
> On Fri, Oct 18, 2024 at 07:49:51AM +0200, Greg Kroah-Hartman wrote:
> > On Fri, Oct 18, 2024 at 03:26:05AM +0000, Michael Kelley wrote:
> > > In the process, the workqueue code spins up additional worker threads
> > > to handle the load. On the Hyper-V VM, 210 to 230 new kernel
> > > threads are created during device_shutdown(), depending on the
> > > timing. On the Pi 5, 253 are created. The max for this workqueue is
> > > WQ_DFL_ACTIVE (256).
> [...]
> > I don't think we can put this type of load on all systems just to handle
> > one specific type of "bad" hardware that takes long periods of time to
> > shutdown, sorry.
>
> Parallelizing shutdown means shorter reboot times, less downtime,
> less cost for CSPs.
For some systems, yes, but as have been seen here, it comes at the
offset of a huge CPU load at shutdown, with sometimes longer reboot
times.
> Modern servers (e.g. Sierra Forest with 288 cores) should handle
> this load easily and may see significant benefits from parallelization.
"may see", can you test this?
> Perhaps a solution is to cap async shutdown based on the number of cores,
> but always use async for certain device classes (e.g. nvme_subsys_class)?
Maybe, but as-is, we can't take the changes this way, sorry. That is a
regression from the situation of working hardware that many people have.
thanks,
greg k-h
^ permalink raw reply [flat|nested] 16+ messages in thread
* Re: [PATCH v9 0/4] shut down devices asynchronously
2024-10-18 9:37 ` Greg Kroah-Hartman
@ 2024-10-19 0:27 ` stuart hayes
2024-10-20 0:24 ` Michael Kelley
0 siblings, 1 reply; 16+ messages in thread
From: stuart hayes @ 2024-10-19 0:27 UTC (permalink / raw)
To: Greg Kroah-Hartman, Lukas Wunner
Cc: Michael Kelley, linux-kernel@vger.kernel.org, Rafael J . Wysocki,
Martin Belanger, Oliver O'Halloran, Daniel Wagner,
Keith Busch, David Jeffery, Jeremy Allison, Jens Axboe,
Christoph Hellwig, Sagi Grimberg, linux-nvme@lists.infradead.org,
Nathan Chancellor, Jan Kiszka, Bert Karwatzki
On 10/18/2024 4:37 AM, Greg Kroah-Hartman wrote:
> On Fri, Oct 18, 2024 at 11:14:51AM +0200, Lukas Wunner wrote:
>> On Fri, Oct 18, 2024 at 07:49:51AM +0200, Greg Kroah-Hartman wrote:
>>> On Fri, Oct 18, 2024 at 03:26:05AM +0000, Michael Kelley wrote:
>>>> In the process, the workqueue code spins up additional worker threads
>>>> to handle the load. On the Hyper-V VM, 210 to 230 new kernel
>>>> threads are created during device_shutdown(), depending on the
>>>> timing. On the Pi 5, 253 are created. The max for this workqueue is
>>>> WQ_DFL_ACTIVE (256).
>> [...]
>>> I don't think we can put this type of load on all systems just to handle
>>> one specific type of "bad" hardware that takes long periods of time to
>>> shutdown, sorry.
>>
>> Parallelizing shutdown means shorter reboot times, less downtime,
>> less cost for CSPs.
>
> For some systems, yes, but as have been seen here, it comes at the
> offset of a huge CPU load at shutdown, with sometimes longer reboot
> times.
>
>> Modern servers (e.g. Sierra Forest with 288 cores) should handle
>> this load easily and may see significant benefits from parallelization.
>
> "may see", can you test this?
>
>> Perhaps a solution is to cap async shutdown based on the number of cores,
>> but always use async for certain device classes (e.g. nvme_subsys_class)?
>
> Maybe, but as-is, we can't take the changes this way, sorry. That is a
> regression from the situation of working hardware that many people have.
>
> thanks,
>
> greg k-h
Thank you both for your time and effort considering this. It didn't
occur to me that an extra few 10s of milliseconds (or maxing out the
async workqueue) would be an issue.
To answer your earlier question (Michael), there shouldn't be a
possibility of deadlock regardless of the number of devices. While the
device shutdowns are scheduled on a workqueue rather than run in a loop,
they are still scheduled in the same order as they are without this
patch, and any device that is scheduled for shutdown should never have
to wait for device that hasn't yet been scheduled. So even if only one
device shutdown could be scheduled at a time, it should still work
without deadlocking--it just wouldn't be able to do shutdowns in parallel.
And I believe there is still a benefit to having async shutdown enabled
even with one core. The NVMe shutdowns that take a while involve waiting
for drives to finish commands, so they are mostly just sleeping.
Workqueues will schedule another worker if one worker sleeps, so even a
single core system should be able to get a number of NVMe drives started
on their shutdowns in parallel.
I'll see what I can do to limit the amount of stuff that gets put on the
workqueue, though. I can likely limit it to just the asynchronous
device shutdowns (NVMe shutdowns), plus any devices that have to wait
for them (i.e., any devices of which they are dependents or consumers).
^ permalink raw reply [flat|nested] 16+ messages in thread
* RE: [PATCH v9 0/4] shut down devices asynchronously
2024-10-19 0:27 ` stuart hayes
@ 2024-10-20 0:24 ` Michael Kelley
2024-10-29 15:32 ` Christoph Hellwig
2025-01-30 3:06 ` [PATCH] driver core: optimize async device shutdown Sultan Alsawaf
0 siblings, 2 replies; 16+ messages in thread
From: Michael Kelley @ 2024-10-20 0:24 UTC (permalink / raw)
To: stuart hayes, Greg Kroah-Hartman, Lukas Wunner
Cc: linux-kernel@vger.kernel.org, Rafael J . Wysocki, Martin Belanger,
Oliver O'Halloran, Daniel Wagner, Keith Busch, David Jeffery,
Jeremy Allison, Jens Axboe, Christoph Hellwig, Sagi Grimberg,
linux-nvme@lists.infradead.org, Nathan Chancellor, Jan Kiszka,
Bert Karwatzki
From: stuart hayes <stuart.w.hayes@gmail.com> Sent: Friday, October 18, 2024 5:27 PM
>
> On 10/18/2024 4:37 AM, Greg Kroah-Hartman wrote:
> > On Fri, Oct 18, 2024 at 11:14:51AM +0200, Lukas Wunner wrote:
> >> On Fri, Oct 18, 2024 at 07:49:51AM +0200, Greg Kroah-Hartman wrote:
> >>> On Fri, Oct 18, 2024 at 03:26:05AM +0000, Michael Kelley wrote:
> >>>> In the process, the workqueue code spins up additional worker threads
> >>>> to handle the load. On the Hyper-V VM, 210 to 230 new kernel
> >>>> threads are created during device_shutdown(), depending on the
> >>>> timing. On the Pi 5, 253 are created. The max for this workqueue is
> >>>> WQ_DFL_ACTIVE (256).
> >> [...]
> >>> I don't think we can put this type of load on all systems just to handle
> >>> one specific type of "bad" hardware that takes long periods of time to
> >>> shutdown, sorry.
> >>
> >> Parallelizing shutdown means shorter reboot times, less downtime,
> >> less cost for CSPs.
> >
> > For some systems, yes, but as have been seen here, it comes at the
> > offset of a huge CPU load at shutdown, with sometimes longer reboot
> > times.
> >
> >> Modern servers (e.g. Sierra Forest with 288 cores) should handle
> >> this load easily and may see significant benefits from parallelization.
> >
> > "may see", can you test this?
> >
> >> Perhaps a solution is to cap async shutdown based on the number of cores,
> >> but always use async for certain device classes (e.g. nvme_subsys_class)?
> >
> > Maybe, but as-is, we can't take the changes this way, sorry. That is a
> > regression from the situation of working hardware that many people have.
> >
> > thanks,
> >
> > greg k-h
>
> Thank you both for your time and effort considering this. It didn't
> occur to me that an extra few 10s of milliseconds (or maxing out the
> async workqueue) would be an issue.
>
> To answer your earlier question (Michael), there shouldn't be a
> possibility of deadlock regardless of the number of devices. While the
> device shutdowns are scheduled on a workqueue rather than run in a loop,
> they are still scheduled in the same order as they are without this
> patch, any any device that is scheduled for shutdown should never have
> to wait for device that hasn't yet been scheduled. So even if only one
> device shutdown could be scheduled at a time, it should still work
> without deadlocking--it just wouldn't be able to do shutdowns in parallel.
OK -- makes sense.
>
> And I believe there is still a benefit to having async shutdown enabled
> even with one core. The NVMe shutdowns that take a while involve waiting
> for drives to finish commands, so they are mostly just sleeping.
> Workqueues will schedule another worker if one worker sleeps, so even a
> single core system should be able to get a number of NVMe drives started
> on their shutdowns in parallel.
Yes, I agree that a single core system should be able to get multiple
NVMe drives shutting down in parallel. The parallelism would be
governed by the number of worker processes that the workqueue
decides are needed. I didn't look at how it makes that decision.
>
> I'll see what I can to do limit the amount of stuff that gets put on the
> workqueue, though. I can likely limit it to just the asynchronous
> device shutdowns (NVMe shutdowns), plus any devices that have to wait
> for them (i.e., any devices of which they are dependents or consumers).
>
Yes, based on what I saw, that should eliminate the extra overhead
when there are no NVMe devices, or only a small number. If a system has
only a couple of NVMe devices (plus some dependent/consumer devices),
putting a few entries in the aysnc workqueue should be negligible. And when
there are many NVMe devices, the extra overhead of more kernel threads
is more than offset by the parallelism.
Michael
^ permalink raw reply [flat|nested] 16+ messages in thread
* Re: [PATCH v9 0/4] shut down devices asynchronously
2024-10-20 0:24 ` Michael Kelley
@ 2024-10-29 15:32 ` Christoph Hellwig
2025-01-30 3:06 ` [PATCH] driver core: optimize async device shutdown Sultan Alsawaf
1 sibling, 0 replies; 16+ messages in thread
From: Christoph Hellwig @ 2024-10-29 15:32 UTC (permalink / raw)
To: Michael Kelley
Cc: stuart hayes, Greg Kroah-Hartman, Lukas Wunner,
linux-kernel@vger.kernel.org, Rafael J . Wysocki, Martin Belanger,
Oliver O'Halloran, Daniel Wagner, Keith Busch, David Jeffery,
Jeremy Allison, Jens Axboe, Christoph Hellwig, Sagi Grimberg,
linux-nvme@lists.infradead.org, Nathan Chancellor, Jan Kiszka,
Bert Karwatzki
On Sun, Oct 20, 2024 at 12:24:32AM +0000, Michael Kelley wrote:
> Yes, I agree that a single core system should be able to get multiple
> NVMe drives shutting down in parallel. The parallelism would be
> governed by the number of worker processes that the workqueue
> decides are needed. I didn't look at how it makes that decision.
Or we could just go back to the old design where one methods kicks off
the shutdown, and then another one waits for it, which requires no
extra threads at all.
^ permalink raw reply [flat|nested] 16+ messages in thread
* [PATCH] driver core: optimize async device shutdown
2024-10-20 0:24 ` Michael Kelley
2024-10-29 15:32 ` Christoph Hellwig
@ 2025-01-30 3:06 ` Sultan Alsawaf
2025-01-30 17:12 ` [PATCH v2] " Sultan Alsawaf
1 sibling, 1 reply; 16+ messages in thread
From: Sultan Alsawaf @ 2025-01-30 3:06 UTC (permalink / raw)
To: mhklinux
Cc: Martin.Belanger, axboe, djeffery, dwagner, gregkh, hch, jallison,
jan.kiszka, kbusch, linux-kernel, linux-nvme, lukas, nathan,
oohall, rafael, sagi, spasswolf, stuart.w.hayes
From: Sultan Alsawaf <sultan@kerneltoast.com>
The async device shutdown scaffolding assumes any device may be async and
thus schedules an async worker for all devices. This can result in massive
workqueue overhead at shutdown time, even when no async devices are
present.
Optimize async device shutdown by only scheduling async workers for devices
advertising async shutdown support.
This also fixes the (data) race on shutdown_after so that shutdown_after
isn't modified for a device which has already been scheduled for async
shutdown.
Signed-off-by: Sultan Alsawaf <sultan@kerneltoast.com>
---
Hi all,
I've created this follow-on patch to address the overhead problem. With this
patch applied on top of the v9 series, workers are only ever spawned for devices
which request async shutdown. This should also improve performance in the async
shutdown case.
Cheers,
Sultan
---
drivers/base/base.h | 4 +-
drivers/base/core.c | 136 ++++++++++++++++++++++++++++----------------
2 files changed, 89 insertions(+), 51 deletions(-)
diff --git a/drivers/base/base.h b/drivers/base/base.h
index ea18aa70f151..28eb9ffe3ff6 100644
--- a/drivers/base/base.h
+++ b/drivers/base/base.h
@@ -105,6 +105,7 @@ struct driver_private {
* @dead - This device is currently either in the process of or has been
* removed from the system. Any asynchronous events scheduled for this
* device should exit without taking any action.
+ * @async_shutdown_queued - indicates async shutdown is enqueued for this device
*
* Nothing outside of the driver core should ever touch these fields.
*/
@@ -119,7 +120,8 @@ struct device_private {
char *deferred_probe_reason;
async_cookie_t shutdown_after;
struct device *device;
- u8 dead:1;
+ u8 dead:1,
+ async_shutdown_queued:1;
};
#define to_device_private_parent(obj) \
container_of(obj, struct device_private, knode_parent)
diff --git a/drivers/base/core.c b/drivers/base/core.c
index df0df531b561..993285bc0d1a 100644
--- a/drivers/base/core.c
+++ b/drivers/base/core.c
@@ -3515,7 +3515,6 @@ static int device_private_init(struct device *dev)
klist_init(&dev->p->klist_children, klist_children_get,
klist_children_put);
INIT_LIST_HEAD(&dev->p->deferred_probe);
- dev->p->shutdown_after = 0;
return 0;
}
@@ -4856,37 +4855,66 @@ static bool device_wants_async_shutdown(struct device *dev)
* @data: the pointer to the struct device to be shutdown
* @cookie: not used
*
- * Shuts down one device, after waiting for previous device to shut down (for
- * synchronous shutdown) or waiting for device's last child or consumer to
- * be shutdown (for async shutdown).
+ * Shuts down one device, after waiting for device's last child or consumer to
+ * be shutdown.
*
* shutdown_after is set to the shutdown cookie of the last child or consumer
* of this device (if any).
*/
static void shutdown_one_device_async(void *data, async_cookie_t cookie)
{
- struct device *dev = data;
- async_cookie_t wait = cookie;
+ struct device_private *p = data;
- if (device_wants_async_shutdown(dev)) {
- wait = dev->p->shutdown_after + 1;
+ if (p->shutdown_after)
+ async_synchronize_cookie_domain(p->shutdown_after, &sd_domain);
+
+ shutdown_one_device(p->device);
+}
+
+/* Assumes an extra reference to @dev and @dev->parent are acquired first */
+static void queue_device_async_shutdown(struct device *dev)
+{
+ struct device *parent = dev->parent;
+ struct device_link *link;
+ async_cookie_t cookie;
+ int idx;
+
+ /*
+ * Add one to this device's cookie so that when shutdown_after is passed
+ * to async_synchronize_cookie_domain(), it will wait until *after*
+ * shutdown_one_device_async() is finished running for this device.
+ */
+ cookie = async_schedule_domain(shutdown_one_device_async, dev->p,
+ &sd_domain) + 1;
+
+ /*
+ * Set async_shutdown_queued to avoid overwriting a parent's
+ * shutdown_after while the parent is shutting down. This can happen if
+ * a parent or supplier is not ordered in the devices_kset list before a
+ * child or consumer, which is not expected.
+ */
+ dev->p->async_shutdown_queued = 1;
+
+ /* Ensure any parent & suppliers wait for this device to shut down */
+ if (parent) {
+ if (!parent->p->async_shutdown_queued)
+ parent->p->shutdown_after = cookie;
+ put_device(parent);
+ }
+
+ idx = device_links_read_lock();
+ list_for_each_entry_rcu(link, &dev->links.suppliers, c_node,
+ device_links_read_lock_held()) {
/*
- * To prevent system hang, revert to sync shutdown in the event
- * that shutdown_after would make this shutdown wait for a
- * shutdown that hasn't been scheduled yet.
- *
- * This can happen if a parent or supplier is not ordered in the
- * devices_kset list before a child or consumer, which is not
- * expected.
+ * sync_state_only devlink consumers aren't dependent on
+ * suppliers
*/
- if (wait > cookie) {
- wait = cookie;
- dev_warn(dev, "Unsafe shutdown ordering, forcing sync order\n");
- }
+ if (!device_link_flag_is_sync_state_only(link->flags) &&
+ !link->supplier->p->async_shutdown_queued)
+ link->supplier->p->shutdown_after = cookie;
}
-
- async_synchronize_cookie_domain(wait, &sd_domain);
- shutdown_one_device(dev);
+ device_links_read_unlock(idx);
+ put_device(dev);
}
/**
@@ -4894,16 +4922,37 @@ static void shutdown_one_device_async(void *data, async_cookie_t cookie)
*/
void device_shutdown(void)
{
- struct device *dev, *parent;
- async_cookie_t cookie;
- struct device_link *link;
- int idx;
+ struct device *dev, *parent, *tmp;
+ LIST_HEAD(async_list);
+ bool wait_for_async;
wait_for_device_probe();
device_block_probing();
cpufreq_suspend();
+ /*
+ * Find devices which can shut down asynchronously, and move them from
+ * the devices list onto the async list in reverse order.
+ */
+ spin_lock(&devices_kset->list_lock);
+ list_for_each_entry_safe(dev, tmp, &devices_kset->list, kobj.entry) {
+ if (device_wants_async_shutdown(dev)) {
+ get_device(dev->parent);
+ get_device(dev);
+ list_move(&dev->kobj.entry, &async_list);
+ }
+ }
+ spin_unlock(&devices_kset->list_lock);
+
+ /*
+ * Dispatch asynchronous shutdowns first so they don't have to wait
+ * behind any synchronous shutdowns.
+ */
+ wait_for_async = !list_empty(&async_list);
+ list_for_each_entry_safe(dev, tmp, &async_list, kobj.entry)
+ queue_device_async_shutdown(dev);
+
spin_lock(&devices_kset->list_lock);
/*
* Walk the devices list backward, shutting down each in turn.
@@ -4928,37 +4977,24 @@ void device_shutdown(void)
list_del_init(&dev->kobj.entry);
spin_unlock(&devices_kset->list_lock);
- get_device(dev);
- get_device(parent);
-
- cookie = async_schedule_domain(shutdown_one_device_async,
- dev, &sd_domain);
/*
- * Ensure any parent & suppliers will wait for this device to
- * shut down
+ * Dispatch an async shutdown if this device has a child or
+ * consumer that is async. Otherwise, shut down synchronously.
*/
- if (parent) {
- parent->p->shutdown_after = cookie;
- put_device(parent);
- }
-
- idx = device_links_read_lock();
- list_for_each_entry_rcu(link, &dev->links.suppliers, c_node,
- device_links_read_lock_held()) {
- /*
- * sync_state_only devlink consumers aren't dependent on
- * suppliers
- */
- if (!device_link_flag_is_sync_state_only(link->flags))
- link->supplier->p->shutdown_after = cookie;
+ if (dev->p->shutdown_after) {
+ get_device(parent);
+ get_device(dev);
+ queue_device_async_shutdown(dev);
+ } else {
+ shutdown_one_device(dev);
}
- device_links_read_unlock(idx);
- put_device(dev);
spin_lock(&devices_kset->list_lock);
}
spin_unlock(&devices_kset->list_lock);
- async_synchronize_full_domain(&sd_domain);
+
+ if (wait_for_async)
+ async_synchronize_full_domain(&sd_domain);
}
/*
--
2.48.1
^ permalink raw reply related [flat|nested] 16+ messages in thread* [PATCH v2] driver core: optimize async device shutdown
2025-01-30 3:06 ` [PATCH] driver core: optimize async device shutdown Sultan Alsawaf
@ 2025-01-30 17:12 ` Sultan Alsawaf
0 siblings, 0 replies; 16+ messages in thread
From: Sultan Alsawaf @ 2025-01-30 17:12 UTC (permalink / raw)
To: mhklinux
Cc: Martin.Belanger, axboe, djeffery, dwagner, gregkh, hch, jallison,
jan.kiszka, kbusch, linux-kernel, linux-nvme, lukas, nathan,
oohall, rafael, sagi, spasswolf, stuart.w.hayes
From: Sultan Alsawaf <sultan@kerneltoast.com>
The async device shutdown scaffolding assumes any device may be async and
thus schedules an async worker for all devices. This can result in massive
workqueue overhead at shutdown time, even when no async devices are
present.
Optimize async device shutdown by only scheduling async workers for devices
advertising async shutdown support.
This also fixes the (data) race on shutdown_after so that shutdown_after
isn't modified for a device which has already been scheduled for async
shutdown.
Signed-off-by: Sultan Alsawaf <sultan@kerneltoast.com>
---
Changes in v2:
* Fixed the unbalanced device refcounts, since queue_device_async_shutdown()
itself needs an extra set of refcounts on top of the refcounts required for
taking a device off of the devices_kset list
Sultan
---
drivers/base/base.h | 4 +-
drivers/base/core.c | 137 +++++++++++++++++++++++++++-----------------
2 files changed, 89 insertions(+), 52 deletions(-)
diff --git a/drivers/base/base.h b/drivers/base/base.h
index ea18aa70f151..28eb9ffe3ff6 100644
--- a/drivers/base/base.h
+++ b/drivers/base/base.h
@@ -105,6 +105,7 @@ struct driver_private {
* @dead - This device is currently either in the process of or has been
* removed from the system. Any asynchronous events scheduled for this
* device should exit without taking any action.
+ * @async_shutdown_queued - indicates async shutdown is enqueued for this device
*
* Nothing outside of the driver core should ever touch these fields.
*/
@@ -119,7 +120,8 @@ struct device_private {
char *deferred_probe_reason;
async_cookie_t shutdown_after;
struct device *device;
- u8 dead:1;
+ u8 dead:1,
+ async_shutdown_queued:1;
};
#define to_device_private_parent(obj) \
container_of(obj, struct device_private, knode_parent)
diff --git a/drivers/base/core.c b/drivers/base/core.c
index df0df531b561..a697c13a6787 100644
--- a/drivers/base/core.c
+++ b/drivers/base/core.c
@@ -3515,7 +3515,6 @@ static int device_private_init(struct device *dev)
klist_init(&dev->p->klist_children, klist_children_get,
klist_children_put);
INIT_LIST_HEAD(&dev->p->deferred_probe);
- dev->p->shutdown_after = 0;
return 0;
}
@@ -4856,37 +4855,68 @@ static bool device_wants_async_shutdown(struct device *dev)
* @data: the pointer to the struct device to be shutdown
* @cookie: not used
*
- * Shuts down one device, after waiting for previous device to shut down (for
- * synchronous shutdown) or waiting for device's last child or consumer to
- * be shutdown (for async shutdown).
+ * Shuts down one device, after waiting for device's last child or consumer to
+ * be shutdown.
*
* shutdown_after is set to the shutdown cookie of the last child or consumer
* of this device (if any).
*/
static void shutdown_one_device_async(void *data, async_cookie_t cookie)
{
- struct device *dev = data;
- async_cookie_t wait = cookie;
+ struct device_private *p = data;
- if (device_wants_async_shutdown(dev)) {
- wait = dev->p->shutdown_after + 1;
+ if (p->shutdown_after)
+ async_synchronize_cookie_domain(p->shutdown_after, &sd_domain);
+
+ shutdown_one_device(p->device);
+}
+
+static void queue_device_async_shutdown(struct device *dev)
+{
+ struct device_link *link;
+ struct device *parent;
+ async_cookie_t cookie;
+ int idx;
+
+ parent = get_device(dev->parent);
+ get_device(dev);
+
+ /*
+ * Add one to this device's cookie so that when shutdown_after is passed
+ * to async_synchronize_cookie_domain(), it will wait until *after*
+ * shutdown_one_device_async() is finished running for this device.
+ */
+ cookie = async_schedule_domain(shutdown_one_device_async, dev->p,
+ &sd_domain) + 1;
+
+ /*
+ * Set async_shutdown_queued to avoid overwriting a parent's
+ * shutdown_after while the parent is shutting down. This can happen if
+ * a parent or supplier is not ordered in the devices_kset list before a
+ * child or consumer, which is not expected.
+ */
+ dev->p->async_shutdown_queued = 1;
+
+ /* Ensure any parent & suppliers wait for this device to shut down */
+ if (parent) {
+ if (!parent->p->async_shutdown_queued)
+ parent->p->shutdown_after = cookie;
+ put_device(parent);
+ }
+
+ idx = device_links_read_lock();
+ list_for_each_entry_rcu(link, &dev->links.suppliers, c_node,
+ device_links_read_lock_held()) {
/*
- * To prevent system hang, revert to sync shutdown in the event
- * that shutdown_after would make this shutdown wait for a
- * shutdown that hasn't been scheduled yet.
- *
- * This can happen if a parent or supplier is not ordered in the
- * devices_kset list before a child or consumer, which is not
- * expected.
+ * sync_state_only devlink consumers aren't dependent on
+ * suppliers
*/
- if (wait > cookie) {
- wait = cookie;
- dev_warn(dev, "Unsafe shutdown ordering, forcing sync order\n");
- }
+ if (!device_link_flag_is_sync_state_only(link->flags) &&
+ !link->supplier->p->async_shutdown_queued)
+ link->supplier->p->shutdown_after = cookie;
}
-
- async_synchronize_cookie_domain(wait, &sd_domain);
- shutdown_one_device(dev);
+ device_links_read_unlock(idx);
+ put_device(dev);
}
/**
@@ -4894,16 +4924,37 @@ static void shutdown_one_device_async(void *data, async_cookie_t cookie)
*/
void device_shutdown(void)
{
- struct device *dev, *parent;
- async_cookie_t cookie;
- struct device_link *link;
- int idx;
+ struct device *dev, *parent, *tmp;
+ LIST_HEAD(async_list);
+ bool wait_for_async;
wait_for_device_probe();
device_block_probing();
cpufreq_suspend();
+ /*
+ * Find devices which can shut down asynchronously, and move them from
+ * the devices list onto the async list in reverse order.
+ */
+ spin_lock(&devices_kset->list_lock);
+ list_for_each_entry_safe(dev, tmp, &devices_kset->list, kobj.entry) {
+ if (device_wants_async_shutdown(dev)) {
+ get_device(dev->parent);
+ get_device(dev);
+ list_move(&dev->kobj.entry, &async_list);
+ }
+ }
+ spin_unlock(&devices_kset->list_lock);
+
+ /*
+ * Dispatch asynchronous shutdowns first so they don't have to wait
+ * behind any synchronous shutdowns.
+ */
+ wait_for_async = !list_empty(&async_list);
+ list_for_each_entry_safe(dev, tmp, &async_list, kobj.entry)
+ queue_device_async_shutdown(dev);
+
spin_lock(&devices_kset->list_lock);
/*
* Walk the devices list backward, shutting down each in turn.
@@ -4928,37 +4979,21 @@ void device_shutdown(void)
list_del_init(&dev->kobj.entry);
spin_unlock(&devices_kset->list_lock);
- get_device(dev);
- get_device(parent);
-
- cookie = async_schedule_domain(shutdown_one_device_async,
- dev, &sd_domain);
/*
- * Ensure any parent & suppliers will wait for this device to
- * shut down
+ * Dispatch an async shutdown if this device has a child or
+ * consumer that is async. Otherwise, shut down synchronously.
*/
- if (parent) {
- parent->p->shutdown_after = cookie;
- put_device(parent);
- }
-
- idx = device_links_read_lock();
- list_for_each_entry_rcu(link, &dev->links.suppliers, c_node,
- device_links_read_lock_held()) {
- /*
- * sync_state_only devlink consumers aren't dependent on
- * suppliers
- */
- if (!device_link_flag_is_sync_state_only(link->flags))
- link->supplier->p->shutdown_after = cookie;
- }
- device_links_read_unlock(idx);
- put_device(dev);
+ if (dev->p->shutdown_after)
+ queue_device_async_shutdown(dev);
+ else
+ shutdown_one_device(dev);
spin_lock(&devices_kset->list_lock);
}
spin_unlock(&devices_kset->list_lock);
- async_synchronize_full_domain(&sd_domain);
+
+ if (wait_for_async)
+ async_synchronize_full_domain(&sd_domain);
}
/*
--
2.48.1
^ permalink raw reply related [flat|nested] 16+ messages in thread