* [PATCHv2 0/2] blk-mq: add CPU latency limit control
@ 2024-10-18 7:30 Tero Kristo
2024-10-18 7:30 ` [PATCHv2 1/2] block/genhd: add sysfs knobs for the CPU latency PM QoS settings Tero Kristo
` (2 more replies)
0 siblings, 3 replies; 7+ messages in thread
From: Tero Kristo @ 2024-10-18 7:30 UTC (permalink / raw)
To: axboe; +Cc: hch, linux-block, linux-kernel
Hello,
Try #2 of the patches here. I sent earlier an RFC version against block/bio
[1], and then an isolated patch for NVMe driver only [2].
After feedback from maintainers, I've reworked the patches again to block
layer, this time to multiqueue support only. Once a blk-mq request is about
to be dispatched to the driver layer, the PM QoS variables for active CPUs
are tweaked based on configuration, and a timeout is launched as a delayed
work that drops the PM QoS limits once the queue is idle. The mechanism is
disabled by default, and only enabled once the user activates it via the
provided sysfs knobs.
Some measurement data provided below as a reference for the
results, measured with 'fio' on an Intel Icelake Xeon platform, with an
extra NVMe card on the system. Both latency and bandwidth values are
provided, to showcase that the latency is reduced and bandwidth is not
impacted negatively due to overhead. C6 residency measurement is not
very accurate in my test, leading to a somewhat glitchy result for its
value (c6%).
key:
slat: start latency max, in us
clat: completion latency max, in us
lat: overall latency max, in us
bw: min-avg-max bandwidth values
c6%: c6 (deep idle) residency for the active CPU during the test
cpu_lat_limit_us=10 (enabled)
slat: 63, clat: 107, lat: 115, bw: 1177-1367-1397, c6%: 11.9
slat: 30, clat: 129, lat: 137, bw: 1196-1380-1409, c6%: 0.9
slat: 60, clat: 101, lat: 109, bw: 1193-1372-1407, c6%: 0.9
slat: 29, clat: 135, lat: 143, bw: 1184-1369-1398, c6%: 1.0
slat: 29, clat: 112, lat: 120, bw: 1188-1368-1397, c6%: 1.0
cpu_lat_limit_us=-1 (disabled)
slat: 106, clat: 281, lat: 353, bw: 1183-1363-1403, c6%: 79.9
slat: 107, clat: 270, lat: 319, bw: 1192-1370-1406, c6%: 79.8
slat: 156, clat: 269, lat: 323, bw: 1187-1363-1398, c6%: 80.4
slat: 106, clat: 267, lat: 316, bw: 1183-1367-1402, c6%: 80.5
slat: 108, clat: 247, lat: 313, bw: 1186-1368-1404, c6%: 80.0
slat: 107, clat: 274, lat: 323, bw: 1188-1361-1399, c6%: 80.0
-Tero
[1] https://lore.kernel.org/lkml/ZtHWkn2FJhAa+Vvo@fedora/T/
[2] https://lore.kernel.org/lkml/20241004101014.3716006-1-tero.kristo@linux.intel.com/
^ permalink raw reply [flat|nested] 7+ messages in thread
* [PATCHv2 1/2] block/genhd: add sysfs knobs for the CPU latency PM QoS settings
2024-10-18 7:30 [PATCHv2 0/2] blk-mq: add CPU latency limit control Tero Kristo
@ 2024-10-18 7:30 ` Tero Kristo
2024-10-18 7:30 ` [PATCHv2 2/2] blk-mq: add support for CPU latency limits Tero Kristo
2024-10-23 14:06 ` [PATCH v3 " Tero Kristo
2 siblings, 0 replies; 7+ messages in thread
From: Tero Kristo @ 2024-10-18 7:30 UTC (permalink / raw)
To: axboe; +Cc: hch, linux-block, linux-kernel
Add sysfs knobs for the following parameters:
cpu_lat_limit_us: for limiting the CPU latency to given value when block IO
is running
cpu_lat_timeout_ms: for clearing up the CPU latency limit after block IO
is complete
This can be used to prevent the CPU from entering deep idle states when
block IO is running and waiting for an interrupt, potentially causing
large latencies to the operation.
Signed-off-by: Tero Kristo <tero.kristo@linux.intel.com>
---
block/genhd.c | 47 ++++++++++++++++++++++++++++++++++++++++++
include/linux/blkdev.h | 3 +++
2 files changed, 50 insertions(+)
diff --git a/block/genhd.c b/block/genhd.c
index 1c05dd4c6980..e60af2639136 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -1046,6 +1046,48 @@ static ssize_t partscan_show(struct device *dev,
return sprintf(buf, "%u\n", disk_has_partscan(dev_to_disk(dev)));
}
+static ssize_t cpu_lat_limit_us_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct gendisk *disk = dev_to_disk(dev);
+
+ return sprintf(buf, "%d\n", disk->cpu_lat_limit);
+}
+
+static ssize_t cpu_lat_limit_us_store(struct device *dev,
+ struct device_attribute *attr,
+ const char *buf, size_t count)
+{
+ struct gendisk *disk = dev_to_disk(dev);
+ int i;
+
+ if (count > 0 && !kstrtoint(buf, 10, &i))
+ disk->cpu_lat_limit = i;
+
+ return count;
+}
+
+static ssize_t cpu_lat_timeout_ms_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct gendisk *disk = dev_to_disk(dev);
+
+ return sprintf(buf, "%d\n", disk->cpu_lat_timeout);
+}
+
+static ssize_t cpu_lat_timeout_ms_store(struct device *dev,
+ struct device_attribute *attr,
+ const char *buf, size_t count)
+{
+ struct gendisk *disk = dev_to_disk(dev);
+ int i;
+
+ if (count > 0 && !kstrtoint(buf, 10, &i))
+ disk->cpu_lat_timeout = i;
+
+ return count;
+}
+
static DEVICE_ATTR(range, 0444, disk_range_show, NULL);
static DEVICE_ATTR(ext_range, 0444, disk_ext_range_show, NULL);
static DEVICE_ATTR(removable, 0444, disk_removable_show, NULL);
@@ -1060,6 +1102,8 @@ static DEVICE_ATTR(inflight, 0444, part_inflight_show, NULL);
static DEVICE_ATTR(badblocks, 0644, disk_badblocks_show, disk_badblocks_store);
static DEVICE_ATTR(diskseq, 0444, diskseq_show, NULL);
static DEVICE_ATTR(partscan, 0444, partscan_show, NULL);
+static DEVICE_ATTR_RW(cpu_lat_limit_us);
+static DEVICE_ATTR_RW(cpu_lat_timeout_ms);
#ifdef CONFIG_FAIL_MAKE_REQUEST
ssize_t part_fail_show(struct device *dev,
@@ -1111,6 +1155,8 @@ static struct attribute *disk_attrs[] = {
&dev_attr_events_poll_msecs.attr,
&dev_attr_diskseq.attr,
&dev_attr_partscan.attr,
+ &dev_attr_cpu_lat_limit_us.attr,
+ &dev_attr_cpu_lat_timeout_ms.attr,
#ifdef CONFIG_FAIL_MAKE_REQUEST
&dev_attr_fail.attr,
#endif
@@ -1377,6 +1423,7 @@ struct gendisk *__alloc_disk_node(struct request_queue *q, int node_id,
#ifdef CONFIG_BLOCK_HOLDER_DEPRECATED
INIT_LIST_HEAD(&disk->slave_bdevs);
#endif
+ disk->cpu_lat_limit = -1;
return disk;
out_erase_part0:
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 50c3b959da28..8bf76da2efac 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -213,6 +213,9 @@ struct gendisk {
u64 diskseq;
blk_mode_t open_mode;
+ int cpu_lat_limit;
+ int cpu_lat_timeout;
+
/*
* Independent sector access ranges. This is always NULL for
* devices that do not have multiple independent access ranges.
--
2.43.1
^ permalink raw reply related [flat|nested] 7+ messages in thread
* [PATCHv2 2/2] blk-mq: add support for CPU latency limits
2024-10-18 7:30 [PATCHv2 0/2] blk-mq: add CPU latency limit control Tero Kristo
2024-10-18 7:30 ` [PATCHv2 1/2] block/genhd: add sysfs knobs for the CPU latency PM QoS settings Tero Kristo
@ 2024-10-18 7:30 ` Tero Kristo
2024-10-18 14:21 ` Jens Axboe
2024-10-23 14:06 ` [PATCH v3 " Tero Kristo
2 siblings, 1 reply; 7+ messages in thread
From: Tero Kristo @ 2024-10-18 7:30 UTC (permalink / raw)
To: axboe; +Cc: hch, linux-block, linux-kernel
Add support for setting CPU latency limits when a request is dispatched
to driver layer, and removing it once the device is idle. The latency
limits use the dev PM QoS framework for setting per-cpu limits for
active CPUs. The feature is user configurable via sysfs knobs under the
block device.
Signed-off-by: Tero Kristo <tero.kristo@linux.intel.com>
---
block/blk-mq.c | 54 ++++++++++++++++++++++++++++++++++++++++++
include/linux/blk-mq.h | 12 ++++++++++
2 files changed, 66 insertions(+)
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 4b2c8e940f59..f8906e2aff6d 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -29,6 +29,7 @@
#include <linux/blk-crypto.h>
#include <linux/part_stat.h>
#include <linux/sched/isolation.h>
+#include <linux/pm_qos.h>
#include <trace/events/block.h>
@@ -2700,11 +2701,62 @@ static void blk_mq_plug_issue_direct(struct blk_plug *plug)
static void __blk_mq_flush_plug_list(struct request_queue *q,
struct blk_plug *plug)
{
+ struct request *req, *next;
+ struct blk_mq_hw_ctx *hctx;
+ int cpu;
+
if (blk_queue_quiesced(q))
return;
+
+ rq_list_for_each_safe(&plug->mq_list, req, next) {
+ hctx = req->mq_hctx;
+
+ if (next && next->mq_hctx == hctx)
+ continue;
+
+ if (q->disk->cpu_lat_limit < 0)
+ continue;
+
+ hctx->last_active = jiffies + msecs_to_jiffies(q->disk->cpu_lat_timeout);
+
+ if (!hctx->cpu_lat_limit_active) {
+ hctx->cpu_lat_limit_active = true;
+ for_each_cpu(cpu, hctx->cpumask) {
+ struct dev_pm_qos_request *qos;
+
+ qos = per_cpu_ptr(hctx->cpu_lat_qos, cpu);
+ dev_pm_qos_add_request(get_cpu_device(cpu), qos,
+ DEV_PM_QOS_RESUME_LATENCY,
+ q->disk->cpu_lat_limit);
+ }
+ schedule_delayed_work(&hctx->cpu_latency_work,
+ msecs_to_jiffies(q->disk->cpu_lat_timeout));
+ }
+ }
+
q->mq_ops->queue_rqs(&plug->mq_list);
}
+static void blk_mq_cpu_latency_work(struct work_struct *work)
+{
+ struct blk_mq_hw_ctx *hctx = container_of(work, struct blk_mq_hw_ctx,
+ cpu_latency_work.work);
+ int cpu;
+
+ if (time_after(jiffies, hctx->last_active)) {
+ for_each_cpu(cpu, hctx->cpumask) {
+ struct dev_pm_qos_request *qos;
+
+ qos = per_cpu_ptr(hctx->cpu_lat_qos, cpu);
+ dev_pm_qos_remove_request(qos);
+ }
+ hctx->cpu_lat_limit_active = false;
+ } else {
+ schedule_delayed_work(&hctx->cpu_latency_work,
+ msecs_to_jiffies(hctx->queue->disk->cpu_lat_timeout));
+ }
+}
+
static void blk_mq_dispatch_plug_list(struct blk_plug *plug, bool from_sched)
{
struct blk_mq_hw_ctx *this_hctx = NULL;
@@ -3729,6 +3778,11 @@ static int blk_mq_init_hctx(struct request_queue *q,
if (xa_insert(&q->hctx_table, hctx_idx, hctx, GFP_KERNEL))
goto exit_flush_rq;
+ hctx->cpu_lat_qos = alloc_percpu(struct dev_pm_qos_request);
+ if (!hctx->cpu_lat_qos)
+ goto exit_flush_rq;
+ INIT_DELAYED_WORK(&hctx->cpu_latency_work, blk_mq_cpu_latency_work);
+
return 0;
exit_flush_rq:
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index b751cc92209b..2b61942490d6 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -435,6 +435,18 @@ struct blk_mq_hw_ctx {
/** @kobj: Kernel object for sysfs. */
struct kobject kobj;
+ /** @cpu_latency_work: Work to handle CPU latency PM limits. */
+ struct delayed_work cpu_latency_work;
+
+ /** @cpu_lat_limit_active: If CPU latency limits are active or not. */
+ bool cpu_lat_limit_active;
+
+ /** @last_active: Jiffies value when the queue was last active. */
+ unsigned long last_active;
+
+ /** @cpu_lat_qos: PM QoS latency limits for individual CPUs. */
+ struct dev_pm_qos_request __percpu *cpu_lat_qos;
+
#ifdef CONFIG_BLK_DEBUG_FS
/**
* @debugfs_dir: debugfs directory for this hardware queue. Named
--
2.43.1
^ permalink raw reply related [flat|nested] 7+ messages in thread
* Re: [PATCHv2 2/2] blk-mq: add support for CPU latency limits
2024-10-18 7:30 ` [PATCHv2 2/2] blk-mq: add support for CPU latency limits Tero Kristo
@ 2024-10-18 14:21 ` Jens Axboe
2024-10-23 13:26 ` Tero Kristo
0 siblings, 1 reply; 7+ messages in thread
From: Jens Axboe @ 2024-10-18 14:21 UTC (permalink / raw)
To: Tero Kristo; +Cc: hch, linux-block, linux-kernel
On 10/18/24 1:30 AM, Tero Kristo wrote:
> @@ -2700,11 +2701,62 @@ static void blk_mq_plug_issue_direct(struct blk_plug *plug)
> static void __blk_mq_flush_plug_list(struct request_queue *q,
> struct blk_plug *plug)
> {
> + struct request *req, *next;
> + struct blk_mq_hw_ctx *hctx;
> + int cpu;
> +
> if (blk_queue_quiesced(q))
> return;
> +
> + rq_list_for_each_safe(&plug->mq_list, req, next) {
> + hctx = req->mq_hctx;
> +
> + if (next && next->mq_hctx == hctx)
> + continue;
> +
> + if (q->disk->cpu_lat_limit < 0)
> + continue;
> +
> + hctx->last_active = jiffies + msecs_to_jiffies(q->disk->cpu_lat_timeout);
> +
> + if (!hctx->cpu_lat_limit_active) {
> + hctx->cpu_lat_limit_active = true;
> + for_each_cpu(cpu, hctx->cpumask) {
> + struct dev_pm_qos_request *qos;
> +
> + qos = per_cpu_ptr(hctx->cpu_lat_qos, cpu);
> + dev_pm_qos_add_request(get_cpu_device(cpu), qos,
> + DEV_PM_QOS_RESUME_LATENCY,
> + q->disk->cpu_lat_limit);
> + }
> + schedule_delayed_work(&hctx->cpu_latency_work,
> + msecs_to_jiffies(q->disk->cpu_lat_timeout));
> + }
> + }
> +
This is, quite literally, an insane amount of cycles to add to the hot
issue path. You're iterating each request in the list, and then each CPU
in the mask of the hardware context for each request.
This just won't fly, not at all. Like the previous feedback, please
figure out a way to make this cheaper. This means don't iterate a bunch
of stuff.
Outside of that, lots of styling issues here too, but none of that
really matters until the base mechanism is at least half way sane.
--
Jens Axboe
^ permalink raw reply [flat|nested] 7+ messages in thread
* Re: [PATCHv2 2/2] blk-mq: add support for CPU latency limits
2024-10-18 14:21 ` Jens Axboe
@ 2024-10-23 13:26 ` Tero Kristo
2024-10-23 13:48 ` Jens Axboe
0 siblings, 1 reply; 7+ messages in thread
From: Tero Kristo @ 2024-10-23 13:26 UTC (permalink / raw)
To: Jens Axboe; +Cc: hch, linux-block, linux-kernel
On Fri, 2024-10-18 at 08:21 -0600, Jens Axboe wrote:
> On 10/18/24 1:30 AM, Tero Kristo wrote:
> > @@ -2700,11 +2701,62 @@ static void blk_mq_plug_issue_direct(struct
> > blk_plug *plug)
> > static void __blk_mq_flush_plug_list(struct request_queue *q,
> > struct blk_plug *plug)
> > {
> > + struct request *req, *next;
> > + struct blk_mq_hw_ctx *hctx;
> > + int cpu;
> > +
> > if (blk_queue_quiesced(q))
> > return;
> > +
> > + rq_list_for_each_safe(&plug->mq_list, req, next) {
> > + hctx = req->mq_hctx;
> > +
> > + if (next && next->mq_hctx == hctx)
> > + continue;
> > +
> > + if (q->disk->cpu_lat_limit < 0)
> > + continue;
> > +
> > + hctx->last_active = jiffies + msecs_to_jiffies(q-
> > >disk->cpu_lat_timeout);
> > +
> > + if (!hctx->cpu_lat_limit_active) {
> > + hctx->cpu_lat_limit_active = true;
> > + for_each_cpu(cpu, hctx->cpumask) {
> > + struct dev_pm_qos_request *qos;
> > +
> > + qos = per_cpu_ptr(hctx-
> > >cpu_lat_qos, cpu);
> > + dev_pm_qos_add_request(get_cpu_dev
> > ice(cpu), qos,
> > +
> > DEV_PM_QOS_RESUME_LATENCY,
> > + q->disk-
> > >cpu_lat_limit);
> > + }
> > + schedule_delayed_work(&hctx-
> > >cpu_latency_work,
> > + msecs_to_jiffies(q-
> > >disk->cpu_lat_timeout));
> > + }
> > + }
> > +
>
> This is, quite literally, an insane amount of cycles to add to the
> hot
> issue path. You're iterating each request in the list, and then each
> CPU
> in the mask of the hardware context for each request.
Ok, I made some optimizations to the code, sending v3 shortly. In this,
all the PM QoS handling and iteration of lists is moved to the
workqueue, and happens in the background. The initial block requests
(until the workqueue fires) may run with higher latency, but that is
most likely an okay compromise.
PS: Please bear with me, my knowledge of the block layer and/or NVMe is
pretty limited. I am sorry if these patches make you frustrated, that
is not my intention.
-Tero
>
> This just won't fly, not at all. Like the previous feedback, please
> figure out a way to make this cheaper. This means don't iterate a
> bunch
> of stuff.
>
> Outside of that, lots of styling issues here too, but none of that
> really matters until the base mechanism is at least half way sane.
>
^ permalink raw reply [flat|nested] 7+ messages in thread
* Re: [PATCHv2 2/2] blk-mq: add support for CPU latency limits
2024-10-23 13:26 ` Tero Kristo
@ 2024-10-23 13:48 ` Jens Axboe
0 siblings, 0 replies; 7+ messages in thread
From: Jens Axboe @ 2024-10-23 13:48 UTC (permalink / raw)
To: Tero Kristo; +Cc: hch, linux-block, linux-kernel
On 10/23/24 7:26 AM, Tero Kristo wrote:
> On Fri, 2024-10-18 at 08:21 -0600, Jens Axboe wrote:
>> On 10/18/24 1:30 AM, Tero Kristo wrote:
>>> @@ -2700,11 +2701,62 @@ static void blk_mq_plug_issue_direct(struct
>>> blk_plug *plug)
>>> static void __blk_mq_flush_plug_list(struct request_queue *q,
>>> struct blk_plug *plug)
>>> {
>>> + struct request *req, *next;
>>> + struct blk_mq_hw_ctx *hctx;
>>> + int cpu;
>>> +
>>> if (blk_queue_quiesced(q))
>>> return;
>>> +
>>> + rq_list_for_each_safe(&plug->mq_list, req, next) {
>>> + hctx = req->mq_hctx;
>>> +
>>> + if (next && next->mq_hctx == hctx)
>>> + continue;
>>> +
>>> + if (q->disk->cpu_lat_limit < 0)
>>> + continue;
>>> +
>>> + hctx->last_active = jiffies + msecs_to_jiffies(q-
>>>> disk->cpu_lat_timeout);
>>> +
>>> + if (!hctx->cpu_lat_limit_active) {
>>> + hctx->cpu_lat_limit_active = true;
>>> + for_each_cpu(cpu, hctx->cpumask) {
>>> + struct dev_pm_qos_request *qos;
>>> +
>>> + qos = per_cpu_ptr(hctx-
>>>> cpu_lat_qos, cpu);
>>> + dev_pm_qos_add_request(get_cpu_dev
>>> ice(cpu), qos,
>>> +
>>> DEV_PM_QOS_RESUME_LATENCY,
>>> + q->disk-
>>>> cpu_lat_limit);
>>> + }
>>> + schedule_delayed_work(&hctx-
>>>> cpu_latency_work,
>>> + msecs_to_jiffies(q-
>>>> disk->cpu_lat_timeout));
>>> + }
>>> + }
>>> +
>>
>> This is, quite literally, an insane amount of cycles to add to the
>> hot
>> issue path. You're iterating each request in the list, and then each
>> CPU
>> in the mask of the hardware context for each request.
>
> Ok, I made some optimizations to the code, sending v3 shortly. In this,
> all the PM QoS handling and iteration of lists is moved to the
> workqueue, and happens in the background. The initial block requests
> (until the workqueue fires) may run with higher latency, but that is
> most likely an okay compromise.
>
> PS: Please bear with me, my knowledge of the block layer and/or NVMe is
> pretty limited. I am sorry if these patches make you frustrated, that
> is not my intention.
That's fine, but I'd much rather you ask for clarification if there's
something that you don't understand, rather than keep adding really
expensive code to the issue path. Pushing the iteration to the workqueue
indeed sounds like the much better approach.
--
Jens Axboe
^ permalink raw reply [flat|nested] 7+ messages in thread
* [PATCH v3 2/2] blk-mq: add support for CPU latency limits
2024-10-18 7:30 [PATCHv2 0/2] blk-mq: add CPU latency limit control Tero Kristo
2024-10-18 7:30 ` [PATCHv2 1/2] block/genhd: add sysfs knobs for the CPU latency PM QoS settings Tero Kristo
2024-10-18 7:30 ` [PATCHv2 2/2] blk-mq: add support for CPU latency limits Tero Kristo
@ 2024-10-23 14:06 ` Tero Kristo
2 siblings, 0 replies; 7+ messages in thread
From: Tero Kristo @ 2024-10-23 14:06 UTC (permalink / raw)
To: axboe; +Cc: hch, linux-block, linux-kernel
Add support for setting CPU latency limits when a request is dispatched
to driver layer, and removing it once the device is idle. A delayed work
is scheduled from the first block layer activity, and the workqueue ticks
with the configurable timeout period, checking if there has been any
activity. After the initial kick of the workqueue, only the last activity
time is updated with the current jiffies value, minimizing overhead.
The feature is user configurable via sysfs knobs under each individual
block device.
Signed-off-by: Tero Kristo <tero.kristo@linux.intel.com>
---
v2:
* moved implementation back to block layer, to the request queue
dispatch section
v3:
* further optimization; fast path now only updates the jiffies value,
and kicks off the workqueue for handling the PM QoS activities if
not already active
* moved the fast path handling under individual request handling, to
avoid iterating the whole request queue
block/blk-mq.c | 51 ++++++++++++++++++++++++++++++++++++++++++
include/linux/blk-mq.h | 12 ++++++++++
2 files changed, 63 insertions(+)
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 4b2c8e940f59..e8d82601471d 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -29,6 +29,7 @@
#include <linux/blk-crypto.h>
#include <linux/part_stat.h>
#include <linux/sched/isolation.h>
+#include <linux/pm_qos.h>
#include <trace/events/block.h>
@@ -1303,6 +1304,12 @@ static void blk_add_rq_to_plug(struct blk_plug *plug, struct request *rq)
rq->rq_next = NULL;
rq_list_add(&plug->mq_list, rq);
plug->rq_count++;
+
+ if (rq->q->disk->cpu_lat_limit >= 0) {
+ rq->mq_hctx->last_active = jiffies;
+ if (!delayed_work_pending(&rq->mq_hctx->cpu_latency_work))
+ schedule_delayed_work(&rq->mq_hctx->cpu_latency_work, 0);
+ }
}
/**
@@ -2705,6 +2712,45 @@ static void __blk_mq_flush_plug_list(struct request_queue *q,
q->mq_ops->queue_rqs(&plug->mq_list);
}
+static void blk_mq_cpu_latency_work(struct work_struct *work)
+{
+ struct blk_mq_hw_ctx *hctx = container_of(work, struct blk_mq_hw_ctx,
+ cpu_latency_work.work);
+ int cpu;
+ bool add_req = false;
+ bool remove_req = false;
+ unsigned long timeout;
+
+ timeout = msecs_to_jiffies(hctx->queue->disk->cpu_lat_timeout);
+
+ if (time_after(jiffies, hctx->last_active + timeout)) {
+ remove_req = true;
+ hctx->cpu_lat_limit_active = false;
+ } else {
+ if (!hctx->cpu_lat_limit_active) {
+ hctx->cpu_lat_limit_active = true;
+ add_req = true;
+ }
+ schedule_delayed_work(&hctx->cpu_latency_work,
+ hctx->last_active + timeout - jiffies);
+ }
+
+ if (!add_req && !remove_req)
+ return;
+
+ for_each_cpu(cpu, hctx->cpumask) {
+ struct dev_pm_qos_request *qos;
+
+ qos = per_cpu_ptr(hctx->cpu_lat_qos, cpu);
+ if (add_req)
+ dev_pm_qos_add_request(get_cpu_device(cpu), qos,
+ DEV_PM_QOS_RESUME_LATENCY,
+ hctx->queue->disk->cpu_lat_limit);
+ else
+ dev_pm_qos_remove_request(qos);
+ }
+}
+
static void blk_mq_dispatch_plug_list(struct blk_plug *plug, bool from_sched)
{
struct blk_mq_hw_ctx *this_hctx = NULL;
@@ -3729,6 +3775,11 @@ static int blk_mq_init_hctx(struct request_queue *q,
if (xa_insert(&q->hctx_table, hctx_idx, hctx, GFP_KERNEL))
goto exit_flush_rq;
+ hctx->cpu_lat_qos = alloc_percpu(struct dev_pm_qos_request);
+ if (!hctx->cpu_lat_qos)
+ goto exit_flush_rq;
+ INIT_DELAYED_WORK(&hctx->cpu_latency_work, blk_mq_cpu_latency_work);
+
return 0;
exit_flush_rq:
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index 4fecf46ef681..4442c18bf3d9 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -435,6 +435,18 @@ struct blk_mq_hw_ctx {
/** @kobj: Kernel object for sysfs. */
struct kobject kobj;
+ /** @cpu_latency_work: Work to handle CPU latency PM limits. */
+ struct delayed_work cpu_latency_work;
+
+ /** @cpu_lat_limit_active: If CPU latency limits are active or not. */
+ bool cpu_lat_limit_active;
+
+ /** @last_active: Jiffies value when the queue was last active. */
+ unsigned long last_active;
+
+ /** @cpu_lat_qos: PM QoS latency limits for individual CPUs. */
+ struct dev_pm_qos_request __percpu *cpu_lat_qos;
+
#ifdef CONFIG_BLK_DEBUG_FS
/**
* @debugfs_dir: debugfs directory for this hardware queue. Named
--
2.43.1
^ permalink raw reply related [flat|nested] 7+ messages in thread
end of thread, other threads:[~2024-10-23 14:07 UTC | newest]
Thread overview: 7+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2024-10-18 7:30 [PATCHv2 0/2] blk-mq: add CPU latency limit control Tero Kristo
2024-10-18 7:30 ` [PATCHv2 1/2] block/genhd: add sysfs knobs for the CPU latency PM QoS settings Tero Kristo
2024-10-18 7:30 ` [PATCHv2 2/2] blk-mq: add support for CPU latency limits Tero Kristo
2024-10-18 14:21 ` Jens Axboe
2024-10-23 13:26 ` Tero Kristo
2024-10-23 13:48 ` Jens Axboe
2024-10-23 14:06 ` [PATCH v3 " Tero Kristo
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox