[PATCH RFC v2 2/3] aio-poll: refine iothread polling using weighted handler intervals

All of lore.kernel.org
 help / color / mirror / Atom feed

From: Jaehoon Kim <jhkim@linux.ibm.com>
To: qemu-devel@nongnu.org, qemu-block@nongnu.org
Cc: mjrosato@linux.ibm.com, farman@linux.ibm.com,
	pbonzini@redhat.com, stefanha@redhat.com, fam@euphon.net,
	armbru@redhat.com, eblake@redhat.com, berrange@redhat.com,
	eduardo@habkost.net, dave@treblig.org, sw@weilnetz.de,
	Jaehoon Kim <jhkim@linux.ibm.com>
Subject: [PATCH RFC v2 2/3] aio-poll: refine iothread polling using weighted handler intervals
Date: Mon, 23 Mar 2026 08:54:50 -0500	[thread overview]
Message-ID: <20260323135451.579655-3-jhkim@linux.ibm.com> (raw)
In-Reply-To: <20260323135451.579655-1-jhkim@linux.ibm.com>

Refine adaptive polling in aio_poll by updating iothread polling
duration based on weighted AioHandler event intervals.

Each AioHandler's poll.ns is updated as a weighted average of its
previous value and the current block_ns whenever an event occurs on
that handler. Idle handlers instead add block_ns to poll.ns until it
exceeds poll_max_ns, at which point poll.ns is reset to 0. This
prevents sporadically active handlers from unnecessarily prolonging
iothread polling.

The iothread polling duration is set based on the largest poll.ns among
active handlers. The shrink divider defaults to 2, matching the grow
rate, to reduce frequent poll_ns resets for slow devices.

The default weight factor (POLL_WEIGHT_SHIFT=3, meaning the current
interval contributes 12.5% to the weighted average) was selected based
on extensive testing comparing QEMU 10.0.0 baseline vs poll-weight=2
and poll-weight=3 across various workloads.

The table below compares two configurations:
- Baseline: Host: RHEL 10.1 GA + qemu-10.0.0-14.el10_1,
            Guest: RHEL 9.6 GA
- Patched:  Host: RHEL 10.1 GA + qemu-10.0.0-14.el10_1 (w=2/w=3),
            Guest: RHEL 9.6 GA
for FIO FCP and FICON with 1 iothread and 8 iothreads.
The values shown are the averages over numjobs 1, 4, and 8.

Summary of results (% change vs baseline):

                    | poll-weight=2      | poll-weight=3
--------------------|--------------------|-----------------
Throughput avg      | -2.4% (all tests)  | -2.2% (all tests)
CPU consumption avg | -10.9% (all tests) | -9.4% (all tests)

Both weight=2 and weight=3 show significant CPU consumption reduction
(~10%) compared to baseline, which addresses the CPU utilization
regression observed in QEMU 10.0.0. The throughput impact is minimal
for both (~2%).

Weight=3 is selected as the default because it incurs a slightly smaller
throughput loss (-2.2% vs -2.4%) while still achieving substantial CPU
savings (-9.4%). The difference between weight=2 and weight=3 is small,
but weight=3 offers a better balance for general-purpose workloads.

Signed-off-by: Jaehoon Kim <jhkim@linux.ibm.com>
---
 include/qemu/aio.h |   4 +-
 util/aio-posix.c   | 135 +++++++++++++++++++++++++++++++--------------
 util/async.c       |   1 +
 3 files changed, 99 insertions(+), 41 deletions(-)

diff --git a/include/qemu/aio.h b/include/qemu/aio.h
index 8cca2360d1..6c77a190e9 100644
--- a/include/qemu/aio.h
+++ b/include/qemu/aio.h
@@ -195,7 +195,8 @@ struct BHListSlice {
 typedef QSLIST_HEAD(, AioHandler) AioHandlerSList;
 
 typedef struct AioPolledEvent {
-    int64_t ns;        /* current polling time in nanoseconds */
+    bool has_event; /* Flag to indicate if an event has occurred */
+    int64_t ns;     /* estimated block time in nanoseconds */
 } AioPolledEvent;
 
 struct AioContext {
@@ -306,6 +307,7 @@ struct AioContext {
     int poll_disable_cnt;
 
     /* Polling mode parameters */
+    int64_t poll_ns;        /* current polling time in nanoseconds */
     int64_t poll_max_ns;    /* maximum polling time in nanoseconds */
     int64_t poll_grow;      /* polling time growth factor */
     int64_t poll_shrink;    /* polling time shrink factor */
diff --git a/util/aio-posix.c b/util/aio-posix.c
index b02beb0505..2b3522f2f9 100644
--- a/util/aio-posix.c
+++ b/util/aio-posix.c
@@ -29,9 +29,11 @@
 
 /* Stop userspace polling on a handler if it isn't active for some time */
 #define POLL_IDLE_INTERVAL_NS (7 * NANOSECONDS_PER_SECOND)
+#define POLL_WEIGHT_SHIFT   (3)
 
-static void adjust_polling_time(AioContext *ctx, AioPolledEvent *poll,
-                                int64_t block_ns);
+static void adjust_block_ns(AioContext *ctx, int64_t block_ns);
+static void grow_polling_time(AioContext *ctx, int64_t block_ns);
+static void shrink_polling_time(AioContext *ctx, int64_t block_ns);
 
 bool aio_poll_disabled(AioContext *ctx)
 {
@@ -373,7 +375,7 @@ static bool aio_dispatch_ready_handlers(AioContext *ctx,
          * add the handler to ctx->poll_aio_handlers.
          */
         if (ctx->poll_max_ns && QLIST_IS_INSERTED(node, node_poll)) {
-            adjust_polling_time(ctx, &node->poll, block_ns);
+            node->poll.has_event = true;
         }
     }
 
@@ -560,18 +562,13 @@ static bool run_poll_handlers(AioContext *ctx, AioHandlerList *ready_list,
 static bool try_poll_mode(AioContext *ctx, AioHandlerList *ready_list,
                           int64_t *timeout)
 {
-    AioHandler *node;
     int64_t max_ns;
 
     if (QLIST_EMPTY_RCU(&ctx->poll_aio_handlers)) {
         return false;
     }
 
-    max_ns = 0;
-    QLIST_FOREACH(node, &ctx->poll_aio_handlers, node_poll) {
-        max_ns = MAX(max_ns, node->poll.ns);
-    }
-    max_ns = qemu_soonest_timeout(*timeout, max_ns);
+    max_ns = qemu_soonest_timeout(*timeout, ctx->poll_ns);
 
     if (max_ns && !ctx->fdmon_ops->need_wait(ctx)) {
         /*
@@ -587,46 +584,98 @@ static bool try_poll_mode(AioContext *ctx, AioHandlerList *ready_list,
     return false;
 }
 
-static void adjust_polling_time(AioContext *ctx, AioPolledEvent *poll,
-                                int64_t block_ns)
+static void shrink_polling_time(AioContext *ctx, int64_t block_ns)
 {
-    if (block_ns <= poll->ns) {
-        /* This is the sweet spot, no adjustment needed */
-    } else if (block_ns > ctx->poll_max_ns) {
-        /* We'd have to poll for too long, poll less */
-        int64_t old = poll->ns;
-
-        if (ctx->poll_shrink) {
-            poll->ns /= ctx->poll_shrink;
-        } else {
-            poll->ns = 0;
-        }
+    /*
+     * Reduce polling time only if block_ns is below the current poll_ns
+     * divided by the shrink factor (a shrink of 0 is treated as 2).
+     */
+    int64_t old = ctx->poll_ns;
+    int64_t shrink = ctx->poll_shrink;
 
-        trace_poll_shrink(ctx, old, poll->ns);
-    } else if (poll->ns < ctx->poll_max_ns &&
-               block_ns < ctx->poll_max_ns) {
-        /* There is room to grow, poll longer */
-        int64_t old = poll->ns;
-        int64_t grow = ctx->poll_grow;
+    if (shrink == 0) {
+        shrink = 2;
+    }
 
-        if (grow == 0) {
-            grow = 2;
-        }
+    if (block_ns < (ctx->poll_ns / shrink)) {
+        ctx->poll_ns /= shrink;
+    }
 
-        if (poll->ns) {
-            poll->ns *= grow;
-        } else {
-            poll->ns = 4000; /* start polling at 4 microseconds */
-        }
+    trace_poll_shrink(ctx, old, ctx->poll_ns);
+}
 
-        if (poll->ns > ctx->poll_max_ns) {
-            poll->ns = ctx->poll_max_ns;
-        }
+static void grow_polling_time(AioContext *ctx, int64_t block_ns)
+{
+    /* There is room to grow, poll longer */
+    int64_t old = ctx->poll_ns;
+    int64_t grow = ctx->poll_grow;
 
-        trace_poll_grow(ctx, old, poll->ns);
+    if (grow == 0) {
+        grow = 2;
     }
+
+    if (block_ns > ctx->poll_ns * grow) {
+        ctx->poll_ns = block_ns;
+    } else {
+        ctx->poll_ns *= grow;
+    }
+
+    if (ctx->poll_ns > ctx->poll_max_ns) {
+        ctx->poll_ns = ctx->poll_max_ns;
+    }
+
+    trace_poll_grow(ctx, old, ctx->poll_ns);
 }
 
+static void adjust_block_ns(AioContext *ctx, int64_t block_ns)
+{
+    AioHandler *node;
+    int64_t adj_block_ns = -1;
+
+    QLIST_FOREACH(node, &ctx->poll_aio_handlers, node_poll) {
+        if (node->poll.has_event) {
+            /*
+             * Update poll.ns for the node with an event.
+             * Uses a weighted average of the current block_ns and the previous
+             * poll.ns to smooth out polling time adjustments.
+             */
+            node->poll.ns = node->poll.ns
+                ? (node->poll.ns - (node->poll.ns >> POLL_WEIGHT_SHIFT))
+                + (block_ns >> POLL_WEIGHT_SHIFT) : block_ns;
+
+            if (node->poll.ns > ctx->poll_max_ns) {
+                node->poll.ns = 0;
+            }
+            /*
+             * To avoid excessive polling time increase, update adj_block_ns
+             * for nodes with the event flag set to true
+             */
+            adj_block_ns = MAX(adj_block_ns, node->poll.ns);
+            node->poll.has_event = false;
+         } else {
+            /*
+             * No event now, but was active before.
+             * If it waits longer than poll_max_ns, poll.ns will stay 0
+             * until the next event arrives.
+             */
+            if (node->poll.ns != 0) {
+                node->poll.ns += block_ns;
+                if (node->poll.ns > ctx->poll_max_ns) {
+                    node->poll.ns = 0;
+                }
+            }
+        }
+    }
+
+    if (adj_block_ns >= 0) {
+        if (adj_block_ns > ctx->poll_ns) {
+            grow_polling_time(ctx, adj_block_ns);
+        } else {
+            shrink_polling_time(ctx, adj_block_ns);
+         }
+     }
+ }
+
 bool aio_poll(AioContext *ctx, bool blocking)
 {
     AioHandlerList ready_list = QLIST_HEAD_INITIALIZER(ready_list);
@@ -723,6 +772,10 @@ bool aio_poll(AioContext *ctx, bool blocking)
 
     aio_free_deleted_handlers(ctx);
 
+    if (ctx->poll_max_ns) {
+        adjust_block_ns(ctx, block_ns);
+    }
+
     qemu_lockcnt_dec(&ctx->list_lock);
 
     progress |= timerlistgroup_run_timers(&ctx->tlg);
@@ -784,6 +837,7 @@ void aio_context_set_poll_params(AioContext *ctx, int64_t max_ns,
 
     qemu_lockcnt_inc(&ctx->list_lock);
     QLIST_FOREACH(node, &ctx->aio_handlers, node) {
+        node->poll.has_event = false;
         node->poll.ns = 0;
     }
     qemu_lockcnt_dec(&ctx->list_lock);
@@ -794,6 +848,7 @@ void aio_context_set_poll_params(AioContext *ctx, int64_t max_ns,
     ctx->poll_max_ns = max_ns;
     ctx->poll_grow = grow;
     ctx->poll_shrink = shrink;
+    ctx->poll_ns = 0;
 
     aio_notify(ctx);
 }
diff --git a/util/async.c b/util/async.c
index 80d6b01a8a..9d3627566f 100644
--- a/util/async.c
+++ b/util/async.c
@@ -606,6 +606,7 @@ AioContext *aio_context_new(Error **errp)
     timerlistgroup_init(&ctx->tlg, aio_timerlist_notify, ctx);
 
     ctx->poll_max_ns = 0;
+    ctx->poll_ns = 0;
     ctx->poll_grow = 0;
     ctx->poll_shrink = 0;
 
-- 
2.50.1

next prev parent reply	other threads:[~2026-03-23 13:56 UTC|newest]

Thread overview: 18+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2026-03-23 13:54 [PATCH RFC v2 0/3] improve aio-polling efficiency Jaehoon Kim
2026-03-23 13:54 ` [PATCH RFC v2 1/3] aio-poll: avoid unnecessary polling time computation Jaehoon Kim
2026-03-25 17:22   ` Stefan Hajnoczi
2026-03-26 18:17     ` JAEHOON KIM
2026-03-26 18:34       ` Stefan Hajnoczi
2026-03-23 13:54 ` Jaehoon Kim [this message]
2026-03-25 20:37   ` [PATCH RFC v2 2/3] aio-poll: refine iothread polling using weighted handler intervals Stefan Hajnoczi
2026-03-27  5:02     ` JAEHOON KIM
2026-03-30 19:17       ` Stefan Hajnoczi
2026-03-31 20:42         ` JAEHOON KIM
2026-03-23 13:54 ` [PATCH RFC v2 3/3] qapi/iothread: introduce poll-weight parameter for aio-poll Jaehoon Kim
2026-03-25 14:04   ` Markus Armbruster
2026-03-26 15:55     ` JAEHOON KIM
2026-03-27  5:49       ` Markus Armbruster
2026-03-27 14:23         ` JAEHOON KIM
2026-03-25 16:52   ` Stefan Hajnoczi
2026-03-25 16:56   ` Stefan Hajnoczi
2026-03-26 16:13     ` JAEHOON KIM

find likely ancestor, descendant, or conflicting patches for this message:
( dfblob:8cca2360d dfblob:6c77a190e dfblob:b02beb050 dfblob:2b3522f2f
dfblob:80d6b01a8 dfblob:9d3627566 )
 OR (
bs:"[PATCH RFC v2 2/3] aio-poll: refine iothread polling using weighted handler intervals" )
	(help)

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20260323135451.579655-3-jhkim@linux.ibm.com \
    --to=jhkim@linux.ibm.com \
    --cc=armbru@redhat.com \
    --cc=berrange@redhat.com \
    --cc=dave@treblig.org \
    --cc=eblake@redhat.com \
    --cc=eduardo@habkost.net \
    --cc=fam@euphon.net \
    --cc=farman@linux.ibm.com \
    --cc=mjrosato@linux.ibm.com \
    --cc=pbonzini@redhat.com \
    --cc=qemu-block@nongnu.org \
    --cc=qemu-devel@nongnu.org \
    --cc=stefanha@redhat.com \
    --cc=sw@weilnetz.de \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

Be sure your reply has a Subject: header at the top and a blank line before the message body.

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.