On Fri, Mar 27, 2026 at 12:02:21AM -0500, JAEHOON KIM wrote: > On 3/25/2026 3:37 PM, Stefan Hajnoczi wrote: > > On Mon, Mar 23, 2026 at 08:54:50AM -0500, Jaehoon Kim wrote: > > > Refine adaptive polling in aio_poll by updating iothread polling > > > duration based on weighted AioHandler event intervals. > > > > > > Each AioHandler's poll.ns is updated using a weighted factor when an > > > event occurs. Idle handlers accumulate block_ns until poll_max_ns and > > > then reset to 0, preventing sporadically active handlers from > > > unnecessarily prolonging iothread polling. > > > > > > The iothread polling duration is set based on the largest poll.ns among > > > active handlers. The shrink divider defaults to 2, matching the grow > > > rate, to reduce frequent poll_ns resets for slow devices. > > > > > > The default weight factor (POLL_WEIGHT_SHIFT=3, meaning the current > > > interval contributes 12.5% to the weighted average) was selected based > > > on extensive testing comparing QEMU 10.0.0 baseline vs poll-weight=2 > > > and poll-weight=3 across various workloads. > > > > > > The table below shows a comparison between: > > > -Host: RHEL 10.1 GA + qemu-10.0.0-14.el10_1, Guest: RHEL 9.6GA vs. > > > -Host: RHEL 10.1 GA + qemu-10.0.0-14.el10_1 (w=2/w=3), Guest: RHEL 9.6GA > > > for FIO FCP and FICON with 1 iothread and 8 iothreads. > > > The values shown are the averages for numjobs 1, 4, and 8. > > > > > > Summary of results (% change vs baseline): > > > > > > | poll-weight=2 | poll-weight=3 > > > --------------------|--------------------|----------------- > > > Throughput avg | -2.4% (all tests) | -2.2% (all tests) > > > CPU consumption avg | -10.9% (all tests) | -9.4% (all tests) > > > > > > Both weight=2 and weight=3 show significant CPU consumption reduction > > > (~10%) compared to baseline, which addresses the CPU utilization > > > regression observed in QEMU 10.0.0. The throughput impact is minimal > > > for both (~2%). 
> > > > > > Weight=3 is selected as the default because it provides slightly better > > > throughput (-2.2% vs -2.4%) while still achieving substantial CPU > > > savings (-9.4%). The difference between weight=2 and weight=3 is small, > > > but weight=3 offers a better balance for general-purpose workloads. > > > > > > Signed-off-by: Jaehoon Kim > > > --- > > > include/qemu/aio.h | 4 +- > > > util/aio-posix.c | 135 +++++++++++++++++++++++++++++++-------------- > > > util/async.c | 1 + > > > 3 files changed, 99 insertions(+), 41 deletions(-) > > > > > > diff --git a/include/qemu/aio.h b/include/qemu/aio.h > > > index 8cca2360d1..6c77a190e9 100644 > > > --- a/include/qemu/aio.h > > > +++ b/include/qemu/aio.h > > > @@ -195,7 +195,8 @@ struct BHListSlice { > > > typedef QSLIST_HEAD(, AioHandler) AioHandlerSList; > > > typedef struct AioPolledEvent { > > > - int64_t ns; /* current polling time in nanoseconds */ > > > + bool has_event; /* Flag to indicate if an event has occurred */ > > > + int64_t ns; /* estimated block time in nanoseconds */ > > > } AioPolledEvent; > > > struct AioContext { > > > @@ -306,6 +307,7 @@ struct AioContext { > > > int poll_disable_cnt; > > > /* Polling mode parameters */ > > > + int64_t poll_ns; /* current polling time in nanoseconds */ > > > int64_t poll_max_ns; /* maximum polling time in nanoseconds */ > > > int64_t poll_grow; /* polling time growth factor */ > > > int64_t poll_shrink; /* polling time shrink factor */ > > > diff --git a/util/aio-posix.c b/util/aio-posix.c > > > index b02beb0505..2b3522f2f9 100644 > > > --- a/util/aio-posix.c > > > +++ b/util/aio-posix.c > > > @@ -29,9 +29,11 @@ > > > /* Stop userspace polling on a handler if it isn't active for some time */ > > > #define POLL_IDLE_INTERVAL_NS (7 * NANOSECONDS_PER_SECOND) > > > +#define POLL_WEIGHT_SHIFT (3) > > > -static void adjust_polling_time(AioContext *ctx, AioPolledEvent *poll, > > > - int64_t block_ns); > > > +static void adjust_block_ns(AioContext *ctx, int64_t 
block_ns); > > > +static void grow_polling_time(AioContext *ctx, int64_t block_ns); > > > +static void shrink_polling_time(AioContext *ctx, int64_t block_ns); > > > bool aio_poll_disabled(AioContext *ctx) > > > { > > > @@ -373,7 +375,7 @@ static bool aio_dispatch_ready_handlers(AioContext *ctx, > > > * add the handler to ctx->poll_aio_handlers. > > This comment refers to adjusting the polling time. The code no longer > > does this and the comment should be updated. > The comment about adjusting polling time is no longer accurate. > I will update it in the next version. > > > */ > > > if (ctx->poll_max_ns && QLIST_IS_INSERTED(node, node_poll)) { > > > - adjust_polling_time(ctx, &node->poll, block_ns); > > aio_dispatch_ready_handlers() no longer uses the block_ns argument. It > > can be removed. > I will remove the block_ns argument in the next version. > > > + node->poll.has_event = true; > > > } > > > } > > > @@ -560,18 +562,13 @@ static bool run_poll_handlers(AioContext *ctx, AioHandlerList *ready_list, > > > static bool try_poll_mode(AioContext *ctx, AioHandlerList *ready_list, > > > int64_t *timeout) > > > { > > > - AioHandler *node; > > > int64_t max_ns; > > > if (QLIST_EMPTY_RCU(&ctx->poll_aio_handlers)) { > > > return false; > > > } > > > - max_ns = 0; > > > - QLIST_FOREACH(node, &ctx->poll_aio_handlers, node_poll) { > > > - max_ns = MAX(max_ns, node->poll.ns); > > > - } > > > - max_ns = qemu_soonest_timeout(*timeout, max_ns); > > > + max_ns = qemu_soonest_timeout(*timeout, ctx->poll_ns); > > > if (max_ns && !ctx->fdmon_ops->need_wait(ctx)) { > > > /* > > > @@ -587,46 +584,98 @@ static bool try_poll_mode(AioContext *ctx, AioHandlerList *ready_list, > > > return false; > > > } > > > -static void adjust_polling_time(AioContext *ctx, AioPolledEvent *poll, > > > - int64_t block_ns) > > > +static void shrink_polling_time(AioContext *ctx, int64_t block_ns) > > > { > > > - if (block_ns <= poll->ns) { > > > - /* This is the sweet spot, no adjustment needed */ > > > - 
} else if (block_ns > ctx->poll_max_ns) { > > > - /* We'd have to poll for too long, poll less */ > > > - int64_t old = poll->ns; > > > - > > > - if (ctx->poll_shrink) { > > > - poll->ns /= ctx->poll_shrink; > > > - } else { > > > - poll->ns = 0; > > > - } > > > + /* > > > + * Reduce polling time if the block_ns is zero or > > > + * less than the current poll_ns. > > > + */ > > > + int64_t old = ctx->poll_ns; > > > + int64_t shrink = ctx->poll_shrink; > > > - trace_poll_shrink(ctx, old, poll->ns); > > > - } else if (poll->ns < ctx->poll_max_ns && > > > - block_ns < ctx->poll_max_ns) { > > > - /* There is room to grow, poll longer */ > > > - int64_t old = poll->ns; > > > - int64_t grow = ctx->poll_grow; > > > + if (shrink == 0) { > > > + shrink = 2; > > > + } > > > - if (grow == 0) { > > > - grow = 2; > > > - } > > > + if (block_ns < (ctx->poll_ns / shrink)) { > > > + ctx->poll_ns /= shrink; > > > + } > > > - if (poll->ns) { > > > - poll->ns *= grow; > > > - } else { > > > - poll->ns = 4000; /* start polling at 4 microseconds */ > > > - } > > > + trace_poll_shrink(ctx, old, ctx->poll_ns); > > This trace event should be inside if (block_ns < (ctx->poll_ns / > > shrink)) like it was before this patch. > > > > > +} > > > - if (poll->ns > ctx->poll_max_ns) { > > > - poll->ns = ctx->poll_max_ns; > > > - } > > > +static void grow_polling_time(AioContext *ctx, int64_t block_ns) > > > +{ > > > + /* There is room to grow, poll longer */ > > > + int64_t old = ctx->poll_ns; > > > + int64_t grow = ctx->poll_grow; > > > - trace_poll_grow(ctx, old, poll->ns); > > > + if (grow == 0) { > > > + grow = 2; > > > } > > > + > > > + if (block_ns > ctx->poll_ns * grow) { > > > + ctx->poll_ns = block_ns; > > > + } else { > > > + ctx->poll_ns *= grow; > > > + } > > > + > > > + if (ctx->poll_ns > ctx->poll_max_ns) { > > > + ctx->poll_ns = ctx->poll_max_ns; > > > + } > > > + > > > + trace_poll_grow(ctx, old, ctx->poll_ns); > > Same here. 
> I will move the trace_poll_xxx functions inside the if condition in the > next version. > > > > > } > > > +static void adjust_block_ns(AioContext *ctx, int64_t block_ns) > > > +{ > > > + AioHandler *node; > > > + int64_t adj_block_ns = -1; > > > + > > > + QLIST_FOREACH(node, &ctx->poll_aio_handlers, node_poll) { > > > + if (node->poll.has_event) { > > Did you consider unifying node->poll.has_event with > > node->poll_idle_timeout, which is assigned now + POLL_IDLE_INTERVAL_NS > > every time ->io_poll() detects an event? > > > > For instance, rename node->poll_idle_timeout to > > node->last_event_timestamp and assign now without adding > > POLL_IDLE_INTERVAL_NS. Then use the field for both idle node removal and > > adjust_block_ns() (pass in now). > Thank you for the suggestion, I think this is a good idea. > After testing, it seems that node->poll_idle_timeout can be reused as you > suggested, > although a few adjustments are needed. > > Currently, an event is detected and the AioHandler is added to the > ready_list in > three cases: run_poll_handler_once(), ctx->fdmon_ops->wait(), and > poll_set_started().
> > To accurately track the last event timestamp of an AioHandler, it seems > necessary to > update the timestamp in the following two functions: > > @@ -45,6 +45,7 @@ void aio_add_ready_handler(AioHandlerList *ready_list, >  { >      QLIST_SAFE_REMOVE(node, node_ready); /* remove from nested parent's > list */ >      node->pfd.revents = revents; > +    node->poll_idle_timeout = qemu_clock_get_ns(QEMU_CLOCK_REALTIME); >      QLIST_INSERT_HEAD(ready_list, node, node_ready); >  } > > @@ -53,6 +54,7 @@ static void aio_add_poll_ready_handler(AioHandlerList > *ready_list, >  { >      QLIST_SAFE_REMOVE(node, node_ready); /* remove from nested parent's > list */ >      node->poll_ready = true; > +    node->poll_idle_timeout = qemu_clock_get_ns(QEMU_CLOCK_REALTIME); >      QLIST_INSERT_HEAD(ready_list, node, node_ready); >  } > > In addition, remove_idle_poll_handler() would need some adjustments as well. > If this approach aligns with what you had in mind, I believe it can be > incorporated in > the next version without any issues. That works but I worry a bit about calling qemu_clock_get_ns() for every event. The timestamp does not need to be precise down to the nano-, micro-, or even milli-second. I think you could instead pass 'now' into aio_dispatch_ready_handlers() and assign the new node->last_dispatch_timestamp field there. > > > + /* > > > + * Update poll.ns for the node with an event. > > > + * Uses a weighted average of the current block_ns and the previous > > > + * poll.ns to smooth out polling time adjustments. > > > + */ > > > + node->poll.ns = node->poll.ns > > > + ? (node->poll.ns - (node->poll.ns >> POLL_WEIGHT_SHIFT)) > > > + + (block_ns >> POLL_WEIGHT_SHIFT) : block_ns; > > > + > > > + if (node->poll.ns > ctx->poll_max_ns) { > > > + node->poll.ns = 0; > > > + } > > Previously: > > - if (poll->ns > ctx->poll_max_ns) { > > - poll->ns = ctx->poll_max_ns; > > - } > > > > Was this causing excessive CPU consumption in your benchmarks? 
> > > > Can you explain the rationale for zeroing the poll time? Aside from > > reducing CPU consumption, it also reduces the chance that polling will > > succeed and could therefore impact performance. > > > > I'm asking about this because this patch makes several changes at once > > and I'm not sure how the CPU usage and performance changes are > > attributed to these multiple changes. I want to make sure the changes > > merged are minimal and the best set - sometimes when multiple things are > > changed at the same time, not all of them are beneficial. > > This snippet you quoted under "Previously" is also reflected in the logic > within grow_polling_time() where the same approach is used. > > The difference is that previously, all AioHandlers in the ready_list would > update their own poll.ns using block_ns. As in the snippet below, > if block_ns exceeded poll_max_ns, it would effectively be reset to 0 anyway. > > -    } else if (block_ns > ctx->poll_max_ns) { > -        /* We'd have to poll for too long, poll less */ > -        int64_t old = poll->ns; > - > -        if (ctx->poll_shrink) { > -            poll->ns /= ctx->poll_shrink; > -        } else { > -            poll->ns = 0; > -        } > > I think I did not explain this part clearly enough in the commit message. > Here’s a more detailed explanation of the current polling logic problem and > approach: > > Problem: > Starting from QEMU 10.0, poll.ns was introduced per event handler to > mitigate excessive > fluctuations in IOThread polling times observed in earlier versions (QEMU > 9.x). > > However, in the current design, poll.ns is updated only when an event > occurs, making it > difficult to treat block_ns as a reliable event interval. Also, the > IOThread’s next > polling time is determined by the maximum poll.ns among all AioHandlers, > which means > idle AioHandlers with high poll.ns can have an outsized impact on the > polling duration. 
> > For io_uring, idle AioHandlers are cleared after POLL_IDLE_INTERVAL_NS (7s), > but for > ppoll/epoll, there is no such mechanism, so CPU consumption due to idle > nodes can > increase even more. > > Approach: > To address this, we treat block_ns as an event interval and update each > AioHandler’s > poll.ns using a weighted factor. This smooths out polling time adjustments, > preventing > excessive fluctuations and ensuring that recent event intervals are properly > reflected, > which helps maintain performance while lowering CPU utilization. > > To use block_ns as an event interval, we update polling times for both event > and non-event AioHandlers in each loop iteration. Non-event AioHandlers do > not require > a weighted factor; this allows for rapid isolation of idle nodes, while > ensuring that > poll.ns can increase more responsively when an event occurs within a few > subsequent loops. Thanks for this information. Please include it in the commit description. > > > + /* > > > + * To avoid excessive polling time increase, update adj_block_ns > > > + * for nodes with the event flag set to true > > > + */ > > > + adj_block_ns = MAX(adj_block_ns, node->poll.ns); > > adj_block_ns is not the blocking time, it's the maximum current poll > > time across all nodes. It would be clearer to change the variable name. > You're right. I will rename it to max_poll_ns to better reflect its purpose. > > > + node->poll.has_event = false; > > > + } else { > > 4-space indentation should be used. > I will also fix the indentation. > > > > > + /* > > > + * No event now, but was active before. > > > + * If it waits longer than poll_max_ns, poll.ns will stay 0 > > > + * until the next event arrives. > > > + */ > > > + if (node->poll.ns != 0) { > > > + node->poll.ns += block_ns; > > Why is block_ns being added to a recently inactive node's polling time? > > Here node->poll.ns no longer measures the weighted time until the > > handler had an event. 
> > > > If the goal is to get rid of inactive nodes, then maybe the idle handler > > removal mechanism should be made more aggressive instead? > > > > > + if (node->poll.ns > ctx->poll_max_ns) { > > > + node->poll.ns = 0; > > > + } > > > + } > > > + } > > > + } > > > + > > > + if (adj_block_ns >= 0) { > > > + if (adj_block_ns > ctx->poll_ns) { > > > + grow_polling_time(ctx, adj_block_ns); > > > + } else { > > > + shrink_polling_time(ctx, adj_block_ns); > > > + } > > > + } > > > + } > > > + > > > bool aio_poll(AioContext *ctx, bool blocking) > > > { > > > AioHandlerList ready_list = QLIST_HEAD_INITIALIZER(ready_list); > > > @@ -723,6 +772,10 @@ bool aio_poll(AioContext *ctx, bool blocking) > > > aio_free_deleted_handlers(ctx); > > > + if (ctx->poll_max_ns) { > > > + adjust_block_ns(ctx, block_ns); > > > + } > > > + > > > qemu_lockcnt_dec(&ctx->list_lock); > > > progress |= timerlistgroup_run_timers(&ctx->tlg); > > > @@ -784,6 +837,7 @@ void aio_context_set_poll_params(AioContext *ctx, int64_t max_ns, > > > qemu_lockcnt_inc(&ctx->list_lock); > > > QLIST_FOREACH(node, &ctx->aio_handlers, node) { > > > + node->poll.has_event = false; > > > node->poll.ns = 0; > > > } > > > qemu_lockcnt_dec(&ctx->list_lock); > > > @@ -794,6 +848,7 @@ void aio_context_set_poll_params(AioContext *ctx, int64_t max_ns, > > > ctx->poll_max_ns = max_ns; > > > ctx->poll_grow = grow; > > > ctx->poll_shrink = shrink; > > > + ctx->poll_ns = 0; > > > aio_notify(ctx); > > > } > > > diff --git a/util/async.c b/util/async.c > > > index 80d6b01a8a..9d3627566f 100644 > > > --- a/util/async.c > > > +++ b/util/async.c > > > @@ -606,6 +606,7 @@ AioContext *aio_context_new(Error **errp) > > > timerlistgroup_init(&ctx->tlg, aio_timerlist_notify, ctx); > > > ctx->poll_max_ns = 0; > > > + ctx->poll_ns = 0; > > > ctx->poll_grow = 0; > > > ctx->poll_shrink = 0; > > > -- > > > 2.50.1 > > > >