From: Paolo Bonzini <pbonzini@redhat.com>
To: qemu-devel@nongnu.org
Cc: qemu-block@nongnu.org
Subject: [PATCH] block: split qemu/aio.h out of block/aio.h
Date: Fri, 28 Nov 2025 11:15:55 +0100 [thread overview]
Message-ID: <20251128101555.227630-1-pbonzini@redhat.com> (raw)
Rust bindings are broken up roughly according to the subdirectories of
include/ (the mapping is not exact, but that is the aim). However,
block/aio.h contains both block layer-specific concepts (BlockAIOCB,
BlockCompletionFunc) and AioContext-related declarations that are
used by qemu/main-loop.h.
Break out the latter into their own header file, and use that to
break the inclusion of block/ from qemu/main-loop.h.
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
Based on top of
https://lore.kernel.org/qemu-devel/20251127131516.80807-3-pbonzini@redhat.com/
include/block/aio.h | 838 +-------------------------------------
include/qemu/aio.h | 852 +++++++++++++++++++++++++++++++++++++++
include/qemu/main-loop.h | 4 +-
3 files changed, 857 insertions(+), 837 deletions(-)
create mode 100644 include/qemu/aio.h
diff --git a/include/block/aio.h b/include/block/aio.h
index cc3d5f25a24..dba423f896e 100644
--- a/include/block/aio.h
+++ b/include/block/aio.h
@@ -11,22 +11,13 @@
*
*/
-#ifndef QEMU_AIO_H
-#define QEMU_AIO_H
+#ifndef QEMU_BLOCK_AIO_H
+#define QEMU_BLOCK_AIO_H
-#ifdef CONFIG_LINUX_IO_URING
-#include <liburing.h>
-#endif
-#include "qemu/coroutine-core.h"
-#include "qemu/queue.h"
-#include "qemu/event_notifier.h"
-#include "qemu/lockcnt.h"
-#include "qemu/thread.h"
-#include "qemu/timer.h"
+#include "qemu/aio.h"
#include "block/graph-lock.h"
#include "hw/core/qdev.h"
-
typedef struct BlockAIOCB BlockAIOCB;
typedef void BlockCompletionFunc(void *opaque, int ret);
@@ -48,827 +39,4 @@ void *qemu_aio_get(const AIOCBInfo *aiocb_info, BlockDriverState *bs,
void qemu_aio_unref(void *p);
void qemu_aio_ref(void *p);
-typedef struct AioHandler AioHandler;
-typedef QLIST_HEAD(, AioHandler) AioHandlerList;
-typedef void QEMUBHFunc(void *opaque);
-typedef bool AioPollFn(void *opaque);
-typedef void IOHandler(void *opaque);
-
-struct ThreadPoolAio;
-struct LinuxAioState;
-typedef struct LuringState LuringState;
-
-/* Is polling disabled? */
-bool aio_poll_disabled(AioContext *ctx);
-
-#ifdef CONFIG_LINUX_IO_URING
-/*
- * Each io_uring request must have a unique CqeHandler that processes the cqe.
- * The lifetime of a CqeHandler must be at least from aio_add_sqe() until
- * ->cb() invocation.
- */
-typedef struct CqeHandler CqeHandler;
-struct CqeHandler {
- /* Called by the AioContext when the request has completed */
- void (*cb)(CqeHandler *handler);
-
- /* Used internally, do not access this */
- QSIMPLEQ_ENTRY(CqeHandler) next;
-
- /* This field is filled in before ->cb() is called */
- struct io_uring_cqe cqe;
-};
-
-typedef QSIMPLEQ_HEAD(, CqeHandler) CqeHandlerSimpleQ;
-#endif /* CONFIG_LINUX_IO_URING */
-
-/* Callbacks for file descriptor monitoring implementations */
-typedef struct {
- /*
- * update:
- * @ctx: the AioContext
- * @old_node: the existing handler or NULL if this file descriptor is being
- * monitored for the first time
- * @new_node: the new handler or NULL if this file descriptor is being
- * removed
- *
- * Add/remove/modify a monitored file descriptor.
- *
- * Called with ctx->list_lock acquired.
- */
- void (*update)(AioContext *ctx, AioHandler *old_node, AioHandler *new_node);
-
- /*
- * wait:
- * @ctx: the AioContext
- * @ready_list: list for handlers that become ready
- * @timeout: maximum duration to wait, in nanoseconds
- *
- * Wait for file descriptors to become ready and place them on ready_list.
- *
- * Called with ctx->list_lock incremented but not locked.
- *
- * Returns: number of ready file descriptors.
- */
- int (*wait)(AioContext *ctx, AioHandlerList *ready_list, int64_t timeout);
-
- /*
- * need_wait:
- * @ctx: the AioContext
- *
- * Tell aio_poll() when to stop userspace polling early because ->wait()
- * has fds ready.
- *
- * File descriptor monitoring implementations that cannot poll fd readiness
- * from userspace should use aio_poll_disabled() here. This ensures that
- * file descriptors are not starved by handlers that frequently make
- * progress via userspace polling.
- *
- * Returns: true if ->wait() should be called, false otherwise.
- */
- bool (*need_wait)(AioContext *ctx);
-
- /*
- * dispatch:
- * @ctx: the AioContext
- *
- * Dispatch any work that is specific to this file descriptor monitoring
- * implementation. Usually the event loop's generic file descriptor
- * monitoring, BH, and timer dispatching code is sufficient, but file
- * descriptor monitoring implementations offering additional functionality
- * may need to implement this function for custom behavior. Called at a
- * point in the event loop when it is safe to invoke user-defined
- * callbacks.
- *
- * This function is optional and may be NULL.
- *
- * Returns: true if progress was made (see aio_poll()'s return value),
- * false otherwise.
- */
- bool (*dispatch)(AioContext *ctx);
-
- /*
- * gsource_prepare:
- * @ctx: the AioContext
- *
- * Prepare for the glib event loop to wait for events instead of the usual
- * ->wait() call. See glib's GSourceFuncs->prepare().
- */
- void (*gsource_prepare)(AioContext *ctx);
-
- /*
- * gsource_check:
- * @ctx: the AioContext
- *
- * Called by the glib event loop from glib's GSourceFuncs->check() after
- * waiting for events.
- *
- * Returns: true when ready to be dispatched.
- */
- bool (*gsource_check)(AioContext *ctx);
-
- /*
- * gsource_dispatch:
- * @ctx: the AioContext
- * @ready_list: list for handlers that become ready
- *
- * Place ready AioHandlers on ready_list. Called as part of the glib event
- * loop from glib's GSourceFuncs->dispatch().
- *
- * Called with list_lock incremented.
- */
- void (*gsource_dispatch)(AioContext *ctx, AioHandlerList *ready_list);
-
-#ifdef CONFIG_LINUX_IO_URING
- /**
- * add_sqe: Add an io_uring sqe for submission.
- * @prep_sqe: invoked with an sqe that should be prepared for submission
- * @opaque: user-defined argument to @prep_sqe()
- * @cqe_handler: the unique cqe handler associated with this request
- *
- * The caller's @prep_sqe() function is invoked to fill in the details of
- * the sqe. Do not call io_uring_sqe_set_data() on this sqe.
- *
- * The kernel may see the sqe as soon as @prep_sqe() returns or it may take
- * until the next event loop iteration.
- *
- * This function is called from the current AioContext and is not
- * thread-safe.
- */
- void (*add_sqe)(AioContext *ctx,
- void (*prep_sqe)(struct io_uring_sqe *sqe, void *opaque),
- void *opaque, CqeHandler *cqe_handler);
-#endif /* CONFIG_LINUX_IO_URING */
-} FDMonOps;
-
-/*
- * Each aio_bh_poll() call carves off a slice of the BH list, so that newly
- * scheduled BHs are not processed until the next aio_bh_poll() call. All
- * active aio_bh_poll() calls chain their slices together in a list, so that
- * nested aio_bh_poll() calls process all scheduled bottom halves.
- */
-typedef QSLIST_HEAD(, QEMUBH) BHList;
-typedef struct BHListSlice BHListSlice;
-struct BHListSlice {
- BHList bh_list;
- QSIMPLEQ_ENTRY(BHListSlice) next;
-};
-
-typedef QSLIST_HEAD(, AioHandler) AioHandlerSList;
-
-typedef struct AioPolledEvent {
- int64_t ns; /* current polling time in nanoseconds */
-} AioPolledEvent;
-
-struct AioContext {
- GSource source;
-
- /* Used by AioContext users to protect from multi-threaded access. */
- QemuRecMutex lock;
-
- /*
- * Keep track of readers and writers of the block layer graph.
- * This is essential to avoid performing additions and removal
- * of nodes and edges from block graph while some
- * other thread is traversing it.
- */
- BdrvGraphRWlock *bdrv_graph;
-
- /* The list of registered AIO handlers. Protected by ctx->list_lock. */
- AioHandlerList aio_handlers;
-
- /* The list of AIO handlers to be deleted. Protected by ctx->list_lock. */
- AioHandlerList deleted_aio_handlers;
-
- /* Used to avoid unnecessary event_notifier_set calls in aio_notify;
- * only written from the AioContext home thread, or under the BQL in
- * the case of the main AioContext. However, it is read from any
- * thread so it is still accessed with atomic primitives.
- *
- * If this field is 0, everything (file descriptors, bottom halves,
- * timers) will be re-evaluated before the next blocking poll() or
- * io_uring wait; therefore, the event_notifier_set call can be
- * skipped. If it is non-zero, you may need to wake up a concurrent
- * aio_poll or the glib main event loop, making event_notifier_set
- * necessary.
- *
- * Bit 0 is reserved for GSource usage of the AioContext, and is 1
- * between a call to aio_ctx_prepare and the next call to aio_ctx_check.
- * Bits 1-31 simply count the number of active calls to aio_poll
- * that are in the prepare or poll phase.
- *
- * The GSource and aio_poll must use a different mechanism because
- * there is no certainty that a call to GSource's prepare callback
- * (via g_main_context_prepare) is indeed followed by check and
- * dispatch. It's not clear whether this would be a bug, but let's
- * play safe and allow it---it will just cause extra calls to
- * event_notifier_set until the next call to dispatch.
- *
- * Instead, the aio_poll calls include both the prepare and the
- * dispatch phase, hence a simple counter is enough for them.
- */
- uint32_t notify_me;
-
- /* A lock to protect between QEMUBH and AioHandler adders and deleter,
- * and to ensure that no callbacks are removed while we're walking and
- * dispatching them.
- */
- QemuLockCnt list_lock;
-
- /* Bottom Halves pending aio_bh_poll() processing */
- BHList bh_list;
-
- /* Chained BH list slices for each nested aio_bh_poll() call */
- QSIMPLEQ_HEAD(, BHListSlice) bh_slice_list;
-
- /* Used by aio_notify.
- *
- * "notified" is used to avoid expensive event_notifier_test_and_clear
- * calls. When it is clear, the EventNotifier is clear, or one thread
- * is going to clear "notified" before processing more events. False
- * positives are possible, i.e. "notified" could be set even though the
- * EventNotifier is clear.
- *
- * Note that event_notifier_set *cannot* be optimized the same way. For
- * more information on the problem that would result, see "#ifdef BUG2"
- * in the docs/aio_notify_accept.promela formal model.
- */
- bool notified;
- EventNotifier notifier;
-
- QSLIST_HEAD(, Coroutine) scheduled_coroutines;
- QEMUBH *co_schedule_bh;
-
- int thread_pool_min;
- int thread_pool_max;
- /* Thread pool for performing work and receiving completion callbacks.
- * Has its own locking.
- */
- struct ThreadPoolAio *thread_pool;
-
-#ifdef CONFIG_LINUX_AIO
- struct LinuxAioState *linux_aio;
-#endif
-#ifdef CONFIG_LINUX_IO_URING
- /* State for file descriptor monitoring using Linux io_uring */
- struct io_uring fdmon_io_uring;
- AioHandlerSList submit_list;
- void *io_uring_fd_tag;
-
- /* Pending callback state for cqe handlers */
- CqeHandlerSimpleQ cqe_handler_ready_list;
-#endif /* CONFIG_LINUX_IO_URING */
-
- /* TimerLists for calling timers - one per clock type. Has its own
- * locking.
- */
- QEMUTimerListGroup tlg;
-
- /* Number of AioHandlers without .io_poll() */
- int poll_disable_cnt;
-
- /* Polling mode parameters */
- int64_t poll_max_ns; /* maximum polling time in nanoseconds */
- int64_t poll_grow; /* polling time growth factor */
- int64_t poll_shrink; /* polling time shrink factor */
-
- /* AIO engine parameters */
- int64_t aio_max_batch; /* maximum number of requests in a batch */
-
- /*
- * List of handlers participating in userspace polling. Protected by
- * ctx->list_lock. Iterated and modified mostly by the event loop thread
- * from aio_poll() with ctx->list_lock incremented. aio_set_fd_handler()
- * only touches the list to delete nodes if ctx->list_lock's count is zero.
- */
- AioHandlerList poll_aio_handlers;
-
- /* Are we in polling mode or monitoring file descriptors? */
- bool poll_started;
-
- /* epoll(7) state used when built with CONFIG_EPOLL */
- int epollfd;
-
- /* The GSource unix fd tag for epollfd */
- void *epollfd_tag;
-
- const FDMonOps *fdmon_ops;
-
- /* Was aio_context_new() successful? */
- bool initialized;
-};
-
-/**
- * aio_context_new: Allocate a new AioContext.
- *
- * AioContext provide a mini event-loop that can be waited on synchronously.
- * They also provide bottom halves, a service to execute a piece of code
- * as soon as possible.
- */
-AioContext *aio_context_new(Error **errp);
-
-/**
- * aio_context_ref:
- * @ctx: The AioContext to operate on.
- *
- * Add a reference to an AioContext.
- */
-void aio_context_ref(AioContext *ctx);
-
-/**
- * aio_context_unref:
- * @ctx: The AioContext to operate on.
- *
- * Drop a reference to an AioContext.
- */
-void aio_context_unref(AioContext *ctx);
-
-/**
- * aio_bh_schedule_oneshot_full: Allocate a new bottom half structure that will
- * run only once and as soon as possible.
- *
- * @name: A human-readable identifier for debugging purposes.
- */
-void aio_bh_schedule_oneshot_full(AioContext *ctx, QEMUBHFunc *cb, void *opaque,
- const char *name);
-
-/**
- * aio_bh_schedule_oneshot: Allocate a new bottom half structure that will run
- * only once and as soon as possible.
- *
- * A convenience wrapper for aio_bh_schedule_oneshot_full() that uses cb as the
- * name string.
- */
-#define aio_bh_schedule_oneshot(ctx, cb, opaque) \
- aio_bh_schedule_oneshot_full((ctx), (cb), (opaque), (stringify(cb)))
-
-/**
- * aio_bh_new_full: Allocate a new bottom half structure.
- *
- * Bottom halves are lightweight callbacks whose invocation is guaranteed
- * to be wait-free, thread-safe and signal-safe. The #QEMUBH structure
- * is opaque and must be allocated prior to its use.
- *
- * @name: A human-readable identifier for debugging purposes.
- * @reentrancy_guard: A guard set when entering a cb to prevent
- * device-reentrancy issues
- */
-QEMUBH *aio_bh_new_full(AioContext *ctx, QEMUBHFunc *cb, void *opaque,
- const char *name, MemReentrancyGuard *reentrancy_guard);
-
-/**
- * aio_bh_new: Allocate a new bottom half structure
- *
- * A convenience wrapper for aio_bh_new_full() that uses the cb as the name
- * string.
- */
-#define aio_bh_new(ctx, cb, opaque) \
- aio_bh_new_full((ctx), (cb), (opaque), (stringify(cb)), NULL)
-
-/**
- * aio_bh_new_guarded: Allocate a new bottom half structure with a
- * reentrancy_guard
- *
- * A convenience wrapper for aio_bh_new_full() that uses the cb as the name
- * string.
- */
-#define aio_bh_new_guarded(ctx, cb, opaque, guard) \
- aio_bh_new_full((ctx), (cb), (opaque), (stringify(cb)), guard)
-
-/**
- * aio_notify: Force processing of pending events.
- *
- * Similar to signaling a condition variable, aio_notify forces
- * aio_poll to exit, so that the next call will re-examine pending events.
- * The caller of aio_notify will usually call aio_poll again very soon,
- * or go through another iteration of the GLib main loop. Hence, aio_notify
- * also has the side effect of recalculating the sets of file descriptors
- * that the main loop waits for.
- *
- * Calling aio_notify is rarely necessary, because for example scheduling
- * a bottom half calls it already.
- */
-void aio_notify(AioContext *ctx);
-
-/**
- * aio_notify_accept: Acknowledge receiving an aio_notify.
- *
- * aio_notify() uses an EventNotifier in order to wake up a sleeping
- * aio_poll() or g_main_context_iteration(). Calls to aio_notify() are
- * usually rare, but the AioContext has to clear the EventNotifier on
- * every aio_poll() or g_main_context_iteration() in order to avoid
- * busy waiting. This event_notifier_test_and_clear() cannot be done
- * using the usual aio_context_set_event_notifier(), because it must
- * be done before processing all events (file descriptors, bottom halves,
- * timers).
- *
- * aio_notify_accept() is an optimized event_notifier_test_and_clear()
- * that is specific to an AioContext's notifier; it is used internally
- * to clear the EventNotifier only if aio_notify() had been called.
- */
-void aio_notify_accept(AioContext *ctx);
-
-/**
- * aio_bh_call: Executes callback function of the specified BH.
- */
-void aio_bh_call(QEMUBH *bh);
-
-/**
- * aio_bh_poll: Poll bottom halves for an AioContext.
- *
- * These are internal functions used by the QEMU main loop.
- * And notice that multiple occurrences of aio_bh_poll cannot
- * be called concurrently
- */
-int aio_bh_poll(AioContext *ctx);
-
-/**
- * qemu_bh_schedule: Schedule a bottom half.
- *
- * Scheduling a bottom half interrupts the main loop and causes the
- * execution of the callback that was passed to qemu_bh_new.
- *
- * Bottom halves that are scheduled from a bottom half handler are instantly
- * invoked. This can create an infinite loop if a bottom half handler
- * schedules itself.
- *
- * @bh: The bottom half to be scheduled.
- */
-void qemu_bh_schedule(QEMUBH *bh);
-
-/**
- * qemu_bh_cancel: Cancel execution of a bottom half.
- *
- * Canceling execution of a bottom half undoes the effect of calls to
- * qemu_bh_schedule without freeing its resources yet. While cancellation
- * itself is also wait-free and thread-safe, it can of course race with the
- * loop that executes bottom halves unless you are holding the iothread
- * mutex. This makes it mostly useless if you are not holding the mutex.
- *
- * @bh: The bottom half to be canceled.
- */
-void qemu_bh_cancel(QEMUBH *bh);
-
-/**
- *qemu_bh_delete: Cancel execution of a bottom half and free its resources.
- *
- * Deleting a bottom half frees the memory that was allocated for it by
- * qemu_bh_new. It also implies canceling the bottom half if it was
- * scheduled.
- * This func is async. The bottom half will do the delete action at the finial
- * end.
- *
- * @bh: The bottom half to be deleted.
- */
-void qemu_bh_delete(QEMUBH *bh);
-
-/* Return whether there are any pending callbacks from the GSource
- * attached to the AioContext, before g_poll is invoked.
- *
- * This is used internally in the implementation of the GSource.
- */
-bool aio_prepare(AioContext *ctx);
-
-/* Return whether there are any pending callbacks from the GSource
- * attached to the AioContext, after g_poll is invoked.
- *
- * This is used internally in the implementation of the GSource.
- */
-bool aio_pending(AioContext *ctx);
-
-/* Dispatch any pending callbacks from the GSource attached to the AioContext.
- *
- * This is used internally in the implementation of the GSource.
- */
-void aio_dispatch(AioContext *ctx);
-
-/* Progress in completing AIO work to occur. This can issue new pending
- * aio as a result of executing I/O completion or bh callbacks.
- *
- * Return whether any progress was made by executing AIO or bottom half
- * handlers. If @blocking == true, this should always be true except
- * if someone called aio_notify.
- *
- * If there are no pending bottom halves, but there are pending AIO
- * operations, it may not be possible to make any progress without
- * blocking. If @blocking is true, this function will wait until one
- * or more AIO events have completed, to ensure something has moved
- * before returning.
- */
-bool no_coroutine_fn aio_poll(AioContext *ctx, bool blocking);
-
-/* Register a file descriptor and associated callbacks. Behaves very similarly
- * to qemu_set_fd_handler. Unlike qemu_set_fd_handler, these callbacks will
- * be invoked when using aio_poll().
- *
- * Code that invokes AIO completion functions should rely on this function
- * instead of qemu_set_fd_handler[2].
- */
-void aio_set_fd_handler(AioContext *ctx,
- int fd,
- IOHandler *io_read,
- IOHandler *io_write,
- AioPollFn *io_poll,
- IOHandler *io_poll_ready,
- void *opaque);
-
-/* Register an event notifier and associated callbacks. Behaves very similarly
- * to event_notifier_set_handler. Unlike event_notifier_set_handler, these callbacks
- * will be invoked when using aio_poll().
- *
- * Code that invokes AIO completion functions should rely on this function
- * instead of event_notifier_set_handler.
- */
-void aio_set_event_notifier(AioContext *ctx,
- EventNotifier *notifier,
- EventNotifierHandler *io_read,
- AioPollFn *io_poll,
- EventNotifierHandler *io_poll_ready);
-
-/*
- * Set polling begin/end callbacks for an event notifier that has already been
- * registered with aio_set_event_notifier. Do nothing if the event notifier is
- * not registered.
- *
- * Note that if the io_poll_end() callback (or the entire notifier) is removed
- * during polling, it will not be called, so an io_poll_begin() is not
- * necessarily always followed by an io_poll_end().
- */
-void aio_set_event_notifier_poll(AioContext *ctx,
- EventNotifier *notifier,
- EventNotifierHandler *io_poll_begin,
- EventNotifierHandler *io_poll_end);
-
-/* Return a GSource that lets the main loop poll the file descriptors attached
- * to this AioContext.
- */
-GSource *aio_get_g_source(AioContext *ctx);
-
-/* Return the ThreadPoolAio bound to this AioContext */
-struct ThreadPoolAio *aio_get_thread_pool(AioContext *ctx);
-
-/* Setup the LinuxAioState bound to this AioContext */
-struct LinuxAioState *aio_setup_linux_aio(AioContext *ctx, Error **errp);
-
-/* Return the LinuxAioState bound to this AioContext */
-struct LinuxAioState *aio_get_linux_aio(AioContext *ctx);
-
-/**
- * aio_timer_new_with_attrs:
- * @ctx: the aio context
- * @type: the clock type
- * @scale: the scale
- * @attributes: 0, or one to multiple OR'ed QEMU_TIMER_ATTR_<id> values
- * to assign
- * @cb: the callback to call on timer expiry
- * @opaque: the opaque pointer to pass to the callback
- *
- * Allocate a new timer (with attributes) attached to the context @ctx.
- * The function is responsible for memory allocation.
- *
- * The preferred interface is aio_timer_init or aio_timer_init_with_attrs.
- * Use that unless you really need dynamic memory allocation.
- *
- * Returns: a pointer to the new timer
- */
-static inline QEMUTimer *aio_timer_new_with_attrs(AioContext *ctx,
- QEMUClockType type,
- int scale, int attributes,
- QEMUTimerCB *cb, void *opaque)
-{
- return timer_new_full(&ctx->tlg, type, scale, attributes, cb, opaque);
-}
-
-/**
- * aio_timer_new:
- * @ctx: the aio context
- * @type: the clock type
- * @scale: the scale
- * @cb: the callback to call on timer expiry
- * @opaque: the opaque pointer to pass to the callback
- *
- * Allocate a new timer attached to the context @ctx.
- * See aio_timer_new_with_attrs for details.
- *
- * Returns: a pointer to the new timer
- */
-static inline QEMUTimer *aio_timer_new(AioContext *ctx, QEMUClockType type,
- int scale,
- QEMUTimerCB *cb, void *opaque)
-{
- return timer_new_full(&ctx->tlg, type, scale, 0, cb, opaque);
-}
-
-/**
- * aio_timer_init_with_attrs:
- * @ctx: the aio context
- * @ts: the timer
- * @type: the clock type
- * @scale: the scale
- * @attributes: 0, or one to multiple OR'ed QEMU_TIMER_ATTR_<id> values
- * to assign
- * @cb: the callback to call on timer expiry
- * @opaque: the opaque pointer to pass to the callback
- *
- * Initialise a new timer (with attributes) attached to the context @ctx.
- * The caller is responsible for memory allocation.
- */
-static inline void aio_timer_init_with_attrs(AioContext *ctx,
- QEMUTimer *ts, QEMUClockType type,
- int scale, int attributes,
- QEMUTimerCB *cb, void *opaque)
-{
- timer_init_full(ts, &ctx->tlg, type, scale, attributes, cb, opaque);
-}
-
-/**
- * aio_timer_init:
- * @ctx: the aio context
- * @ts: the timer
- * @type: the clock type
- * @scale: the scale
- * @cb: the callback to call on timer expiry
- * @opaque: the opaque pointer to pass to the callback
- *
- * Initialise a new timer attached to the context @ctx.
- * See aio_timer_init_with_attrs for details.
- */
-static inline void aio_timer_init(AioContext *ctx,
- QEMUTimer *ts, QEMUClockType type,
- int scale,
- QEMUTimerCB *cb, void *opaque)
-{
- timer_init_full(ts, &ctx->tlg, type, scale, 0, cb, opaque);
-}
-
-/**
- * aio_compute_timeout:
- * @ctx: the aio context
- *
- * Compute the timeout that a blocking aio_poll should use.
- */
-int64_t aio_compute_timeout(AioContext *ctx);
-
-/**
- * aio_co_schedule:
- * @ctx: the aio context
- * @co: the coroutine
- *
- * Start a coroutine on a remote AioContext.
- *
- * The coroutine must not be entered by anyone else while aio_co_schedule()
- * is active. In addition the coroutine must have yielded unless ctx
- * is the context in which the coroutine is running (i.e. the value of
- * qemu_get_current_aio_context() from the coroutine itself).
- */
-void aio_co_schedule(AioContext *ctx, Coroutine *co);
-
-/**
- * aio_co_reschedule_self:
- * @new_ctx: the new context
- *
- * Move the currently running coroutine to new_ctx. If the coroutine is already
- * running in new_ctx, do nothing.
- *
- * Note that this function cannot reschedule from iohandler_ctx to
- * qemu_aio_context.
- */
-void coroutine_fn aio_co_reschedule_self(AioContext *new_ctx);
-
-/**
- * aio_co_wake:
- * @co: the coroutine
- *
- * Restart a coroutine on the AioContext where it was running last, thus
- * preventing coroutines from jumping from one context to another when they
- * go to sleep.
- *
- * aio_co_wake may be executed either in coroutine or non-coroutine
- * context. The coroutine must not be entered by anyone else while
- * aio_co_wake() is active.
- *
- * If `co`'s AioContext differs from the current AioContext, this will call
- * aio_co_schedule(), which makes this safe to use even when `co` has not
- * yielded yet. In such a case, it will be entered once it yields.
- *
- * In contrast, if `co`'s AioContext is equal to the current one, it is
- * required for `co` to currently be yielding. This is generally the case
- * if the caller is not in `co` (i.e. invoked by `co`), because the only
- * other way for the caller to be running then is for `co` to currently be
- * yielding.
- *
- * Therefore, if there is no way for the caller to be invoked/entered by
- * `co`, it is generally safe to call this regardless of whether `co` is
- * known to already be yielding or not -- it only has to yield at some
- * point.
- */
-void aio_co_wake(Coroutine *co);
-
-/**
- * aio_co_enter:
- * @ctx: the context to run the coroutine
- * @co: the coroutine to run
- *
- * Enter a coroutine in the specified AioContext.
- */
-void aio_co_enter(AioContext *ctx, Coroutine *co);
-
-/**
- * Return the AioContext whose event loop runs in the current thread.
- *
- * If called from an IOThread this will be the IOThread's AioContext. If
- * called from the main thread or with the "big QEMU lock" taken it
- * will be the main loop AioContext.
- *
- * Note that the return value is never the main loop's iohandler_ctx and the
- * return value is the main loop AioContext instead.
- */
-AioContext *qemu_get_current_aio_context(void);
-
-void qemu_set_current_aio_context(AioContext *ctx);
-
-/**
- * aio_context_setup:
- * @ctx: the aio context
- * @errp: error pointer
- *
- * Initialize the aio context.
- *
- * Returns: true on success, false otherwise
- */
-bool aio_context_setup(AioContext *ctx, Error **errp);
-
-/**
- * aio_context_destroy:
- * @ctx: the aio context
- *
- * Destroy the aio context.
- */
-void aio_context_destroy(AioContext *ctx);
-
-/**
- * aio_context_set_poll_params:
- * @ctx: the aio context
- * @max_ns: how long to busy poll for, in nanoseconds
- * @grow: polling time growth factor
- * @shrink: polling time shrink factor
- *
- * Poll mode can be disabled by setting poll_max_ns to 0.
- */
-void aio_context_set_poll_params(AioContext *ctx, int64_t max_ns,
- int64_t grow, int64_t shrink,
- Error **errp);
-
-/**
- * aio_context_set_aio_params:
- * @ctx: the aio context
- * @max_batch: maximum number of requests in a batch, 0 means that the
- * engine will use its default
- */
-void aio_context_set_aio_params(AioContext *ctx, int64_t max_batch);
-
-/**
- * aio_context_set_thread_pool_params:
- * @ctx: the aio context
- * @min: min number of threads to have readily available in the thread pool
- * @min: max number of threads the thread pool can contain
- */
-void aio_context_set_thread_pool_params(AioContext *ctx, int64_t min,
- int64_t max, Error **errp);
-
-#ifdef CONFIG_LINUX_IO_URING
-/**
- * aio_has_io_uring: Return whether io_uring is available.
- *
- * io_uring is either available in all AioContexts or in none, so this only
- * needs to be called once from within any thread's AioContext.
- */
-static inline bool aio_has_io_uring(void)
-{
- AioContext *ctx = qemu_get_current_aio_context();
- return ctx->fdmon_ops->add_sqe;
-}
-
-/**
- * aio_add_sqe: Add an io_uring sqe for submission.
- * @prep_sqe: invoked with an sqe that should be prepared for submission
- * @opaque: user-defined argument to @prep_sqe()
- * @cqe_handler: the unique cqe handler associated with this request
- *
- * The caller's @prep_sqe() function is invoked to fill in the details of the
- * sqe. Do not call io_uring_sqe_set_data() on this sqe.
- *
- * The sqe is submitted by the current AioContext. The kernel may see the sqe
- * as soon as @prep_sqe() returns or it may take until the next event loop
- * iteration.
- *
- * When the AioContext is destroyed, pending sqes are ignored and their
- * CqeHandlers are not invoked.
- *
- * This function must be called only when aio_has_io_uring() returns true.
- */
-void aio_add_sqe(void (*prep_sqe)(struct io_uring_sqe *sqe, void *opaque),
- void *opaque, CqeHandler *cqe_handler);
-#endif /* CONFIG_LINUX_IO_URING */
-
#endif
diff --git a/include/qemu/aio.h b/include/qemu/aio.h
new file mode 100644
index 00000000000..8cca2360d1a
--- /dev/null
+++ b/include/qemu/aio.h
@@ -0,0 +1,852 @@
+/*
+ * QEMU aio implementation
+ *
+ * Copyright IBM, Corp. 2008
+ *
+ * Authors:
+ * Anthony Liguori <aliguori@us.ibm.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2. See
+ * the COPYING file in the top-level directory.
+ *
+ */
+
+#ifndef QEMU_AIO_H
+#define QEMU_AIO_H
+
+#ifdef CONFIG_LINUX_IO_URING
+#include <liburing.h>
+#endif
+#include "qemu/coroutine-core.h"
+#include "qemu/queue.h"
+#include "qemu/event_notifier.h"
+#include "qemu/lockcnt.h"
+#include "qemu/thread.h"
+#include "qemu/timer.h"
+
+struct MemReentrancyGuard;
+
+typedef struct AioHandler AioHandler;
+typedef QLIST_HEAD(, AioHandler) AioHandlerList;
+typedef void QEMUBHFunc(void *opaque);
+typedef bool AioPollFn(void *opaque);
+typedef void IOHandler(void *opaque);
+
+struct ThreadPoolAio;
+struct LinuxAioState;
+typedef struct LuringState LuringState;
+
+/* Is polling disabled? */
+bool aio_poll_disabled(AioContext *ctx);
+
+#ifdef CONFIG_LINUX_IO_URING
+/*
+ * Each io_uring request must have a unique CqeHandler that processes the cqe.
+ * The lifetime of a CqeHandler must be at least from aio_add_sqe() until
+ * ->cb() invocation.
+ */
+typedef struct CqeHandler CqeHandler;
+struct CqeHandler {
+ /* Called by the AioContext when the request has completed */
+ void (*cb)(CqeHandler *handler);
+
+ /* Used internally, do not access this */
+ QSIMPLEQ_ENTRY(CqeHandler) next;
+
+ /* This field is filled in before ->cb() is called */
+ struct io_uring_cqe cqe;
+};
+
+typedef QSIMPLEQ_HEAD(, CqeHandler) CqeHandlerSimpleQ;
+#endif /* CONFIG_LINUX_IO_URING */
+
+/* Callbacks for file descriptor monitoring implementations */
+typedef struct {
+ /*
+ * update:
+ * @ctx: the AioContext
+ * @old_node: the existing handler or NULL if this file descriptor is being
+ * monitored for the first time
+ * @new_node: the new handler or NULL if this file descriptor is being
+ * removed
+ *
+ * Add/remove/modify a monitored file descriptor.
+ *
+ * Called with ctx->list_lock acquired.
+ */
+ void (*update)(AioContext *ctx, AioHandler *old_node, AioHandler *new_node);
+
+ /*
+ * wait:
+ * @ctx: the AioContext
+ * @ready_list: list for handlers that become ready
+ * @timeout: maximum duration to wait, in nanoseconds
+ *
+ * Wait for file descriptors to become ready and place them on ready_list.
+ *
+ * Called with ctx->list_lock incremented but not locked.
+ *
+ * Returns: number of ready file descriptors.
+ */
+ int (*wait)(AioContext *ctx, AioHandlerList *ready_list, int64_t timeout);
+
+ /*
+ * need_wait:
+ * @ctx: the AioContext
+ *
+ * Tell aio_poll() when to stop userspace polling early because ->wait()
+ * has fds ready.
+ *
+ * File descriptor monitoring implementations that cannot poll fd readiness
+ * from userspace should use aio_poll_disabled() here. This ensures that
+ * file descriptors are not starved by handlers that frequently make
+ * progress via userspace polling.
+ *
+ * Returns: true if ->wait() should be called, false otherwise.
+ */
+ bool (*need_wait)(AioContext *ctx);
+
+ /*
+ * dispatch:
+ * @ctx: the AioContext
+ *
+ * Dispatch any work that is specific to this file descriptor monitoring
+ * implementation. Usually the event loop's generic file descriptor
+ * monitoring, BH, and timer dispatching code is sufficient, but file
+ * descriptor monitoring implementations offering additional functionality
+ * may need to implement this function for custom behavior. Called at a
+ * point in the event loop when it is safe to invoke user-defined
+ * callbacks.
+ *
+ * This function is optional and may be NULL.
+ *
+ * Returns: true if progress was made (see aio_poll()'s return value),
+ * false otherwise.
+ */
+ bool (*dispatch)(AioContext *ctx);
+
+ /*
+ * gsource_prepare:
+ * @ctx: the AioContext
+ *
+ * Prepare for the glib event loop to wait for events instead of the usual
+ * ->wait() call. See glib's GSourceFuncs->prepare().
+ */
+ void (*gsource_prepare)(AioContext *ctx);
+
+ /*
+ * gsource_check:
+ * @ctx: the AioContext
+ *
+ * Called by the glib event loop from glib's GSourceFuncs->check() after
+ * waiting for events.
+ *
+ * Returns: true when ready to be dispatched.
+ */
+ bool (*gsource_check)(AioContext *ctx);
+
+ /*
+ * gsource_dispatch:
+ * @ctx: the AioContext
+ * @ready_list: list for handlers that become ready
+ *
+ * Place ready AioHandlers on ready_list. Called as part of the glib event
+ * loop from glib's GSourceFuncs->dispatch().
+ *
+ * Called with list_lock incremented.
+ */
+ void (*gsource_dispatch)(AioContext *ctx, AioHandlerList *ready_list);
+
+#ifdef CONFIG_LINUX_IO_URING
+ /**
+ * add_sqe: Add an io_uring sqe for submission.
+ * @prep_sqe: invoked with an sqe that should be prepared for submission
+ * @opaque: user-defined argument to @prep_sqe()
+ * @cqe_handler: the unique cqe handler associated with this request
+ *
+ * The caller's @prep_sqe() function is invoked to fill in the details of
+ * the sqe. Do not call io_uring_sqe_set_data() on this sqe.
+ *
+ * The kernel may see the sqe as soon as @prep_sqe() returns or it may take
+ * until the next event loop iteration.
+ *
+ * This function is called from the current AioContext and is not
+ * thread-safe.
+ */
+ void (*add_sqe)(AioContext *ctx,
+ void (*prep_sqe)(struct io_uring_sqe *sqe, void *opaque),
+ void *opaque, CqeHandler *cqe_handler);
+#endif /* CONFIG_LINUX_IO_URING */
+} FDMonOps;
+
+/*
+ * Each aio_bh_poll() call carves off a slice of the BH list, so that newly
+ * scheduled BHs are not processed until the next aio_bh_poll() call. All
+ * active aio_bh_poll() calls chain their slices together in a list, so that
+ * nested aio_bh_poll() calls process all scheduled bottom halves.
+ */
+typedef QSLIST_HEAD(, QEMUBH) BHList;
+typedef struct BHListSlice BHListSlice;
+struct BHListSlice {
+ BHList bh_list;
+ QSIMPLEQ_ENTRY(BHListSlice) next;
+};
+
+typedef QSLIST_HEAD(, AioHandler) AioHandlerSList;
+
+typedef struct AioPolledEvent {
+ int64_t ns; /* current polling time in nanoseconds */
+} AioPolledEvent;
+
+struct AioContext {
+ GSource source;
+
+ /* Used by AioContext users to protect from multi-threaded access. */
+ QemuRecMutex lock;
+
+ /*
+ * Keep track of readers and writers of the block layer graph.
+ * This is essential to avoid performing additions and removal
+ * of nodes and edges from block graph while some
+ * other thread is traversing it.
+ */
+ struct BdrvGraphRWlock *bdrv_graph;
+
+ /* The list of registered AIO handlers. Protected by ctx->list_lock. */
+ AioHandlerList aio_handlers;
+
+ /* The list of AIO handlers to be deleted. Protected by ctx->list_lock. */
+ AioHandlerList deleted_aio_handlers;
+
+ /* Used to avoid unnecessary event_notifier_set calls in aio_notify;
+ * only written from the AioContext home thread, or under the BQL in
+ * the case of the main AioContext. However, it is read from any
+ * thread so it is still accessed with atomic primitives.
+ *
+ * If this field is 0, everything (file descriptors, bottom halves,
+ * timers) will be re-evaluated before the next blocking poll() or
+ * io_uring wait; therefore, the event_notifier_set call can be
+ * skipped. If it is non-zero, you may need to wake up a concurrent
+ * aio_poll or the glib main event loop, making event_notifier_set
+ * necessary.
+ *
+ * Bit 0 is reserved for GSource usage of the AioContext, and is 1
+ * between a call to aio_ctx_prepare and the next call to aio_ctx_check.
+ * Bits 1-31 simply count the number of active calls to aio_poll
+ * that are in the prepare or poll phase.
+ *
+ * The GSource and aio_poll must use a different mechanism because
+ * there is no certainty that a call to GSource's prepare callback
+ * (via g_main_context_prepare) is indeed followed by check and
+ * dispatch. It's not clear whether this would be a bug, but let's
+ * play safe and allow it---it will just cause extra calls to
+ * event_notifier_set until the next call to dispatch.
+ *
+ * Instead, the aio_poll calls include both the prepare and the
+ * dispatch phase, hence a simple counter is enough for them.
+ */
+ uint32_t notify_me;
+
+ /* A lock to protect between QEMUBH and AioHandler adders and deleters,
+ * and to ensure that no callbacks are removed while we're walking and
+ * dispatching them.
+ */
+ QemuLockCnt list_lock;
+
+ /* Bottom Halves pending aio_bh_poll() processing */
+ BHList bh_list;
+
+ /* Chained BH list slices for each nested aio_bh_poll() call */
+ QSIMPLEQ_HEAD(, BHListSlice) bh_slice_list;
+
+ /* Used by aio_notify.
+ *
+ * "notified" is used to avoid expensive event_notifier_test_and_clear
+ * calls. When it is clear, the EventNotifier is clear, or one thread
+ * is going to clear "notified" before processing more events. False
+ * positives are possible, i.e. "notified" could be set even though the
+ * EventNotifier is clear.
+ *
+ * Note that event_notifier_set *cannot* be optimized the same way. For
+ * more information on the problem that would result, see "#ifdef BUG2"
+ * in the docs/aio_notify_accept.promela formal model.
+ */
+ bool notified;
+ EventNotifier notifier;
+
+ QSLIST_HEAD(, Coroutine) scheduled_coroutines;
+ QEMUBH *co_schedule_bh;
+
+ int thread_pool_min;
+ int thread_pool_max;
+ /* Thread pool for performing work and receiving completion callbacks.
+ * Has its own locking.
+ */
+ struct ThreadPoolAio *thread_pool;
+
+#ifdef CONFIG_LINUX_AIO
+ struct LinuxAioState *linux_aio;
+#endif
+#ifdef CONFIG_LINUX_IO_URING
+ /* State for file descriptor monitoring using Linux io_uring */
+ struct io_uring fdmon_io_uring;
+ AioHandlerSList submit_list;
+ void *io_uring_fd_tag;
+
+ /* Pending callback state for cqe handlers */
+ CqeHandlerSimpleQ cqe_handler_ready_list;
+#endif /* CONFIG_LINUX_IO_URING */
+
+ /* TimerLists for calling timers - one per clock type. Has its own
+ * locking.
+ */
+ QEMUTimerListGroup tlg;
+
+ /* Number of AioHandlers without .io_poll() */
+ int poll_disable_cnt;
+
+ /* Polling mode parameters */
+ int64_t poll_max_ns; /* maximum polling time in nanoseconds */
+ int64_t poll_grow; /* polling time growth factor */
+ int64_t poll_shrink; /* polling time shrink factor */
+
+ /* AIO engine parameters */
+ int64_t aio_max_batch; /* maximum number of requests in a batch */
+
+ /*
+ * List of handlers participating in userspace polling. Protected by
+ * ctx->list_lock. Iterated and modified mostly by the event loop thread
+ * from aio_poll() with ctx->list_lock incremented. aio_set_fd_handler()
+ * only touches the list to delete nodes if ctx->list_lock's count is zero.
+ */
+ AioHandlerList poll_aio_handlers;
+
+ /* Are we in polling mode or monitoring file descriptors? */
+ bool poll_started;
+
+ /* epoll(7) state used when built with CONFIG_EPOLL */
+ int epollfd;
+
+ /* The GSource unix fd tag for epollfd */
+ void *epollfd_tag;
+
+ const FDMonOps *fdmon_ops;
+
+ /* Was aio_context_new() successful? */
+ bool initialized;
+};
+
+/**
+ * aio_context_new: Allocate a new AioContext.
+ *
+ * AioContexts provide a mini event-loop that can be waited on synchronously.
+ * They also provide bottom halves, a service to execute a piece of code
+ * as soon as possible.
+ */
+AioContext *aio_context_new(Error **errp);
+
+/**
+ * aio_context_ref:
+ * @ctx: The AioContext to operate on.
+ *
+ * Add a reference to an AioContext.
+ */
+void aio_context_ref(AioContext *ctx);
+
+/**
+ * aio_context_unref:
+ * @ctx: The AioContext to operate on.
+ *
+ * Drop a reference to an AioContext.
+ */
+void aio_context_unref(AioContext *ctx);
+
+/**
+ * aio_bh_schedule_oneshot_full: Allocate a new bottom half structure that will
+ * run only once and as soon as possible.
+ *
+ * @name: A human-readable identifier for debugging purposes.
+ */
+void aio_bh_schedule_oneshot_full(AioContext *ctx, QEMUBHFunc *cb, void *opaque,
+ const char *name);
+
+/**
+ * aio_bh_schedule_oneshot: Allocate a new bottom half structure that will run
+ * only once and as soon as possible.
+ *
+ * A convenience wrapper for aio_bh_schedule_oneshot_full() that uses cb as the
+ * name string.
+ */
+#define aio_bh_schedule_oneshot(ctx, cb, opaque) \
+ aio_bh_schedule_oneshot_full((ctx), (cb), (opaque), (stringify(cb)))
+
+/**
+ * aio_bh_new_full: Allocate a new bottom half structure.
+ *
+ * Bottom halves are lightweight callbacks whose invocation is guaranteed
+ * to be wait-free, thread-safe and signal-safe. The #QEMUBH structure
+ * is opaque and must be allocated prior to its use.
+ *
+ * @name: A human-readable identifier for debugging purposes.
+ * @reentrancy_guard: A guard set when entering a cb to prevent
+ * device-reentrancy issues
+ */
+QEMUBH *aio_bh_new_full(AioContext *ctx, QEMUBHFunc *cb, void *opaque,
+ const char *name, struct MemReentrancyGuard *reentrancy_guard);
+
+/**
+ * aio_bh_new: Allocate a new bottom half structure
+ *
+ * A convenience wrapper for aio_bh_new_full() that uses the cb as the name
+ * string.
+ */
+#define aio_bh_new(ctx, cb, opaque) \
+ aio_bh_new_full((ctx), (cb), (opaque), (stringify(cb)), NULL)
+
+/**
+ * aio_bh_new_guarded: Allocate a new bottom half structure with a
+ * reentrancy_guard
+ *
+ * A convenience wrapper for aio_bh_new_full() that uses the cb as the name
+ * string.
+ */
+#define aio_bh_new_guarded(ctx, cb, opaque, guard) \
+ aio_bh_new_full((ctx), (cb), (opaque), (stringify(cb)), guard)
+
+/**
+ * aio_notify: Force processing of pending events.
+ *
+ * Similar to signaling a condition variable, aio_notify forces
+ * aio_poll to exit, so that the next call will re-examine pending events.
+ * The caller of aio_notify will usually call aio_poll again very soon,
+ * or go through another iteration of the GLib main loop. Hence, aio_notify
+ * also has the side effect of recalculating the sets of file descriptors
+ * that the main loop waits for.
+ *
+ * Calling aio_notify is rarely necessary, because for example scheduling
+ * a bottom half calls it already.
+ */
+void aio_notify(AioContext *ctx);
+
+/**
+ * aio_notify_accept: Acknowledge receiving an aio_notify.
+ *
+ * aio_notify() uses an EventNotifier in order to wake up a sleeping
+ * aio_poll() or g_main_context_iteration(). Calls to aio_notify() are
+ * usually rare, but the AioContext has to clear the EventNotifier on
+ * every aio_poll() or g_main_context_iteration() in order to avoid
+ * busy waiting. This event_notifier_test_and_clear() cannot be done
+ * using the usual aio_context_set_event_notifier(), because it must
+ * be done before processing all events (file descriptors, bottom halves,
+ * timers).
+ *
+ * aio_notify_accept() is an optimized event_notifier_test_and_clear()
+ * that is specific to an AioContext's notifier; it is used internally
+ * to clear the EventNotifier only if aio_notify() had been called.
+ */
+void aio_notify_accept(AioContext *ctx);
+
+/**
+ * aio_bh_call: Executes callback function of the specified BH.
+ */
+void aio_bh_call(QEMUBH *bh);
+
+/**
+ * aio_bh_poll: Poll bottom halves for an AioContext.
+ *
+ * These are internal functions used by the QEMU main loop.
+ * Note that multiple invocations of aio_bh_poll must not be
+ * called concurrently.
+ */
+int aio_bh_poll(AioContext *ctx);
+
+/**
+ * qemu_bh_schedule: Schedule a bottom half.
+ *
+ * Scheduling a bottom half interrupts the main loop and causes the
+ * execution of the callback that was passed to qemu_bh_new.
+ *
+ * Bottom halves that are scheduled from a bottom half handler are instantly
+ * invoked. This can create an infinite loop if a bottom half handler
+ * schedules itself.
+ *
+ * @bh: The bottom half to be scheduled.
+ */
+void qemu_bh_schedule(QEMUBH *bh);
+
+/**
+ * qemu_bh_cancel: Cancel execution of a bottom half.
+ *
+ * Canceling execution of a bottom half undoes the effect of calls to
+ * qemu_bh_schedule without freeing its resources yet. While cancellation
+ * itself is also wait-free and thread-safe, it can of course race with the
+ * loop that executes bottom halves unless you are holding the iothread
+ * mutex. This makes it mostly useless if you are not holding the mutex.
+ *
+ * @bh: The bottom half to be canceled.
+ */
+void qemu_bh_cancel(QEMUBH *bh);
+
+/**
+ * qemu_bh_delete: Cancel execution of a bottom half and free its resources.
+ *
+ * Deleting a bottom half frees the memory that was allocated for it by
+ * qemu_bh_new. It also implies canceling the bottom half if it was
+ * scheduled.
+ * This function is asynchronous. The bottom half will do the delete action at
+ * the final end.
+ *
+ * @bh: The bottom half to be deleted.
+ */
+void qemu_bh_delete(QEMUBH *bh);
+
+/* Return whether there are any pending callbacks from the GSource
+ * attached to the AioContext, before g_poll is invoked.
+ *
+ * This is used internally in the implementation of the GSource.
+ */
+bool aio_prepare(AioContext *ctx);
+
+/* Return whether there are any pending callbacks from the GSource
+ * attached to the AioContext, after g_poll is invoked.
+ *
+ * This is used internally in the implementation of the GSource.
+ */
+bool aio_pending(AioContext *ctx);
+
+/* Dispatch any pending callbacks from the GSource attached to the AioContext.
+ *
+ * This is used internally in the implementation of the GSource.
+ */
+void aio_dispatch(AioContext *ctx);
+
+/* Make progress in completing AIO work. This can issue new pending
+ * aio as a result of executing I/O completion or bh callbacks.
+ *
+ * Return whether any progress was made by executing AIO or bottom half
+ * handlers. If @blocking == true, this should always be true except
+ * if someone called aio_notify.
+ *
+ * If there are no pending bottom halves, but there are pending AIO
+ * operations, it may not be possible to make any progress without
+ * blocking. If @blocking is true, this function will wait until one
+ * or more AIO events have completed, to ensure something has moved
+ * before returning.
+ */
+bool no_coroutine_fn aio_poll(AioContext *ctx, bool blocking);
+
+/* Register a file descriptor and associated callbacks. Behaves very similarly
+ * to qemu_set_fd_handler. Unlike qemu_set_fd_handler, these callbacks will
+ * be invoked when using aio_poll().
+ *
+ * Code that invokes AIO completion functions should rely on this function
+ * instead of qemu_set_fd_handler[2].
+ */
+void aio_set_fd_handler(AioContext *ctx,
+ int fd,
+ IOHandler *io_read,
+ IOHandler *io_write,
+ AioPollFn *io_poll,
+ IOHandler *io_poll_ready,
+ void *opaque);
+
+/* Register an event notifier and associated callbacks. Behaves very similarly
+ * to event_notifier_set_handler. Unlike event_notifier_set_handler, these callbacks
+ * will be invoked when using aio_poll().
+ *
+ * Code that invokes AIO completion functions should rely on this function
+ * instead of event_notifier_set_handler.
+ */
+void aio_set_event_notifier(AioContext *ctx,
+ EventNotifier *notifier,
+ EventNotifierHandler *io_read,
+ AioPollFn *io_poll,
+ EventNotifierHandler *io_poll_ready);
+
+/*
+ * Set polling begin/end callbacks for an event notifier that has already been
+ * registered with aio_set_event_notifier. Do nothing if the event notifier is
+ * not registered.
+ *
+ * Note that if the io_poll_end() callback (or the entire notifier) is removed
+ * during polling, it will not be called, so an io_poll_begin() is not
+ * necessarily always followed by an io_poll_end().
+ */
+void aio_set_event_notifier_poll(AioContext *ctx,
+ EventNotifier *notifier,
+ EventNotifierHandler *io_poll_begin,
+ EventNotifierHandler *io_poll_end);
+
+/* Return a GSource that lets the main loop poll the file descriptors attached
+ * to this AioContext.
+ */
+GSource *aio_get_g_source(AioContext *ctx);
+
+/* Return the ThreadPoolAio bound to this AioContext */
+struct ThreadPoolAio *aio_get_thread_pool(AioContext *ctx);
+
+/* Setup the LinuxAioState bound to this AioContext */
+struct LinuxAioState *aio_setup_linux_aio(AioContext *ctx, Error **errp);
+
+/* Return the LinuxAioState bound to this AioContext */
+struct LinuxAioState *aio_get_linux_aio(AioContext *ctx);
+
+/**
+ * aio_timer_new_with_attrs:
+ * @ctx: the aio context
+ * @type: the clock type
+ * @scale: the scale
+ * @attributes: 0, or one to multiple OR'ed QEMU_TIMER_ATTR_<id> values
+ * to assign
+ * @cb: the callback to call on timer expiry
+ * @opaque: the opaque pointer to pass to the callback
+ *
+ * Allocate a new timer (with attributes) attached to the context @ctx.
+ * The function is responsible for memory allocation.
+ *
+ * The preferred interface is aio_timer_init or aio_timer_init_with_attrs.
+ * Use that unless you really need dynamic memory allocation.
+ *
+ * Returns: a pointer to the new timer
+ */
+static inline QEMUTimer *aio_timer_new_with_attrs(AioContext *ctx,
+ QEMUClockType type,
+ int scale, int attributes,
+ QEMUTimerCB *cb, void *opaque)
+{
+ return timer_new_full(&ctx->tlg, type, scale, attributes, cb, opaque);
+}
+
+/**
+ * aio_timer_new:
+ * @ctx: the aio context
+ * @type: the clock type
+ * @scale: the scale
+ * @cb: the callback to call on timer expiry
+ * @opaque: the opaque pointer to pass to the callback
+ *
+ * Allocate a new timer attached to the context @ctx.
+ * See aio_timer_new_with_attrs for details.
+ *
+ * Returns: a pointer to the new timer
+ */
+static inline QEMUTimer *aio_timer_new(AioContext *ctx, QEMUClockType type,
+ int scale,
+ QEMUTimerCB *cb, void *opaque)
+{
+ return timer_new_full(&ctx->tlg, type, scale, 0, cb, opaque);
+}
+
+/**
+ * aio_timer_init_with_attrs:
+ * @ctx: the aio context
+ * @ts: the timer
+ * @type: the clock type
+ * @scale: the scale
+ * @attributes: 0, or one to multiple OR'ed QEMU_TIMER_ATTR_<id> values
+ * to assign
+ * @cb: the callback to call on timer expiry
+ * @opaque: the opaque pointer to pass to the callback
+ *
+ * Initialise a new timer (with attributes) attached to the context @ctx.
+ * The caller is responsible for memory allocation.
+ */
+static inline void aio_timer_init_with_attrs(AioContext *ctx,
+ QEMUTimer *ts, QEMUClockType type,
+ int scale, int attributes,
+ QEMUTimerCB *cb, void *opaque)
+{
+ timer_init_full(ts, &ctx->tlg, type, scale, attributes, cb, opaque);
+}
+
+/**
+ * aio_timer_init:
+ * @ctx: the aio context
+ * @ts: the timer
+ * @type: the clock type
+ * @scale: the scale
+ * @cb: the callback to call on timer expiry
+ * @opaque: the opaque pointer to pass to the callback
+ *
+ * Initialise a new timer attached to the context @ctx.
+ * See aio_timer_init_with_attrs for details.
+ */
+static inline void aio_timer_init(AioContext *ctx,
+ QEMUTimer *ts, QEMUClockType type,
+ int scale,
+ QEMUTimerCB *cb, void *opaque)
+{
+ timer_init_full(ts, &ctx->tlg, type, scale, 0, cb, opaque);
+}
+
+/**
+ * aio_compute_timeout:
+ * @ctx: the aio context
+ *
+ * Compute the timeout that a blocking aio_poll should use.
+ */
+int64_t aio_compute_timeout(AioContext *ctx);
+
+/**
+ * aio_co_schedule:
+ * @ctx: the aio context
+ * @co: the coroutine
+ *
+ * Start a coroutine on a remote AioContext.
+ *
+ * The coroutine must not be entered by anyone else while aio_co_schedule()
+ * is active. In addition the coroutine must have yielded unless ctx
+ * is the context in which the coroutine is running (i.e. the value of
+ * qemu_get_current_aio_context() from the coroutine itself).
+ */
+void aio_co_schedule(AioContext *ctx, Coroutine *co);
+
+/**
+ * aio_co_reschedule_self:
+ * @new_ctx: the new context
+ *
+ * Move the currently running coroutine to new_ctx. If the coroutine is already
+ * running in new_ctx, do nothing.
+ *
+ * Note that this function cannot reschedule from iohandler_ctx to
+ * qemu_aio_context.
+ */
+void coroutine_fn aio_co_reschedule_self(AioContext *new_ctx);
+
+/**
+ * aio_co_wake:
+ * @co: the coroutine
+ *
+ * Restart a coroutine on the AioContext where it was running last, thus
+ * preventing coroutines from jumping from one context to another when they
+ * go to sleep.
+ *
+ * aio_co_wake may be executed either in coroutine or non-coroutine
+ * context. The coroutine must not be entered by anyone else while
+ * aio_co_wake() is active.
+ *
+ * If `co`'s AioContext differs from the current AioContext, this will call
+ * aio_co_schedule(), which makes this safe to use even when `co` has not
+ * yielded yet. In such a case, it will be entered once it yields.
+ *
+ * In contrast, if `co`'s AioContext is equal to the current one, it is
+ * required for `co` to currently be yielding. This is generally the case
+ * if the caller is not in `co` (i.e. invoked by `co`), because the only
+ * other way for the caller to be running then is for `co` to currently be
+ * yielding.
+ *
+ * Therefore, if there is no way for the caller to be invoked/entered by
+ * `co`, it is generally safe to call this regardless of whether `co` is
+ * known to already be yielding or not -- it only has to yield at some
+ * point.
+ */
+void aio_co_wake(Coroutine *co);
+
+/**
+ * aio_co_enter:
+ * @ctx: the context to run the coroutine
+ * @co: the coroutine to run
+ *
+ * Enter a coroutine in the specified AioContext.
+ */
+void aio_co_enter(AioContext *ctx, Coroutine *co);
+
+/**
+ * Return the AioContext whose event loop runs in the current thread.
+ *
+ * If called from an IOThread this will be the IOThread's AioContext. If
+ * called from the main thread or with the "big QEMU lock" taken it
+ * will be the main loop AioContext.
+ *
+ * Note that the return value is never the main loop's iohandler_ctx and the
+ * return value is the main loop AioContext instead.
+ */
+AioContext *qemu_get_current_aio_context(void);
+
+void qemu_set_current_aio_context(AioContext *ctx);
+
+/**
+ * aio_context_setup:
+ * @ctx: the aio context
+ * @errp: error pointer
+ *
+ * Initialize the aio context.
+ *
+ * Returns: true on success, false otherwise
+ */
+bool aio_context_setup(AioContext *ctx, Error **errp);
+
+/**
+ * aio_context_destroy:
+ * @ctx: the aio context
+ *
+ * Destroy the aio context.
+ */
+void aio_context_destroy(AioContext *ctx);
+
+/**
+ * aio_context_set_poll_params:
+ * @ctx: the aio context
+ * @max_ns: how long to busy poll for, in nanoseconds
+ * @grow: polling time growth factor
+ * @shrink: polling time shrink factor
+ *
+ * Poll mode can be disabled by setting poll_max_ns to 0.
+ */
+void aio_context_set_poll_params(AioContext *ctx, int64_t max_ns,
+ int64_t grow, int64_t shrink,
+ Error **errp);
+
+/**
+ * aio_context_set_aio_params:
+ * @ctx: the aio context
+ * @max_batch: maximum number of requests in a batch, 0 means that the
+ * engine will use its default
+ */
+void aio_context_set_aio_params(AioContext *ctx, int64_t max_batch);
+
+/**
+ * aio_context_set_thread_pool_params:
+ * @ctx: the aio context
+ * @min: min number of threads to have readily available in the thread pool
+ * @max: max number of threads the thread pool can contain
+ */
+void aio_context_set_thread_pool_params(AioContext *ctx, int64_t min,
+ int64_t max, Error **errp);
+
+#ifdef CONFIG_LINUX_IO_URING
+/**
+ * aio_has_io_uring: Return whether io_uring is available.
+ *
+ * io_uring is either available in all AioContexts or in none, so this only
+ * needs to be called once from within any thread's AioContext.
+ */
+static inline bool aio_has_io_uring(void)
+{
+ AioContext *ctx = qemu_get_current_aio_context();
+ return ctx->fdmon_ops->add_sqe;
+}
+
+/**
+ * aio_add_sqe: Add an io_uring sqe for submission.
+ * @prep_sqe: invoked with an sqe that should be prepared for submission
+ * @opaque: user-defined argument to @prep_sqe()
+ * @cqe_handler: the unique cqe handler associated with this request
+ *
+ * The caller's @prep_sqe() function is invoked to fill in the details of the
+ * sqe. Do not call io_uring_sqe_set_data() on this sqe.
+ *
+ * The sqe is submitted by the current AioContext. The kernel may see the sqe
+ * as soon as @prep_sqe() returns or it may take until the next event loop
+ * iteration.
+ *
+ * When the AioContext is destroyed, pending sqes are ignored and their
+ * CqeHandlers are not invoked.
+ *
+ * This function must be called only when aio_has_io_uring() returns true.
+ */
+void aio_add_sqe(void (*prep_sqe)(struct io_uring_sqe *sqe, void *opaque),
+ void *opaque, CqeHandler *cqe_handler);
+#endif /* CONFIG_LINUX_IO_URING */
+
+#endif
diff --git a/include/qemu/main-loop.h b/include/qemu/main-loop.h
index 0d55c636b21..8c1241a2c11 100644
--- a/include/qemu/main-loop.h
+++ b/include/qemu/main-loop.h
@@ -25,7 +25,7 @@
#ifndef QEMU_MAIN_LOOP_H
#define QEMU_MAIN_LOOP_H
-#include "block/aio.h"
+#include "qemu/aio.h"
#include "qom/object.h"
#include "system/event-loop-base.h"
@@ -431,7 +431,7 @@ void qemu_cond_timedwait_bql(QemuCond *cond, int ms);
#define qemu_bh_new(cb, opaque) \
qemu_bh_new_full((cb), (opaque), (stringify(cb)), NULL)
QEMUBH *qemu_bh_new_full(QEMUBHFunc *cb, void *opaque, const char *name,
- MemReentrancyGuard *reentrancy_guard);
+ struct MemReentrancyGuard *reentrancy_guard);
void qemu_bh_schedule_idle(QEMUBH *bh);
enum {
--
2.51.1
next reply other threads:[~2025-11-28 10:16 UTC|newest]
Thread overview: 5+ messages / expand[flat|nested] mbox.gz Atom feed top
2025-11-28 10:15 Paolo Bonzini [this message]
2025-11-28 13:14 ` [PATCH] block: split qemu/aio.h out of block/aio.h Prasad Pandit
2025-12-02 7:42 ` Paolo Bonzini
2025-12-02 16:03 ` Kevin Wolf
2025-12-02 16:09 ` Paolo Bonzini
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20251128101555.227630-1-pbonzini@redhat.com \
--to=pbonzini@redhat.com \
--cc=qemu-block@nongnu.org \
--cc=qemu-devel@nongnu.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).