[PATCH] block: split qemu/aio.h out of block/aio.h

qemu-devel.nongnu.org archive mirror
 help / color / mirror / Atom feed

From: Paolo Bonzini <pbonzini@redhat.com>
To: qemu-devel@nongnu.org
Cc: qemu-block@nongnu.org
Subject: [PATCH] block: split qemu/aio.h out of block/aio.h
Date: Fri, 28 Nov 2025 11:15:55 +0100	[thread overview]
Message-ID: <20251128101555.227630-1-pbonzini@redhat.com> (raw)

Rust bindings are roughly broken up according to subdirectories of
include/ (that's not exact, but it's roughly an aim).  However,
block/aio.h contains both block layer-specific concepts (BlockAIOCB,
BlockCompletionFunc) and AioContext-related declarations that are
used by qemu/main-loop.h.

Break out the latter into their own header file, and use that to
break the inclusion of block/ from qemu/main-loop.h.

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
	Based on top of
	https://lore.kernel.org/qemu-devel/20251127131516.80807-3-pbonzini@redhat.com/

 include/block/aio.h      | 838 +-------------------------------------
 include/qemu/aio.h       | 852 +++++++++++++++++++++++++++++++++++++++
 include/qemu/main-loop.h |   4 +-
 3 files changed, 857 insertions(+), 837 deletions(-)
 create mode 100644 include/qemu/aio.h

diff --git a/include/block/aio.h b/include/block/aio.h
index cc3d5f25a24..dba423f896e 100644
--- a/include/block/aio.h
+++ b/include/block/aio.h
@@ -11,22 +11,13 @@
  *
  */
 
-#ifndef QEMU_AIO_H
-#define QEMU_AIO_H
+#ifndef QEMU_BLOCK_AIO_H
+#define QEMU_BLOCK_AIO_H
 
-#ifdef CONFIG_LINUX_IO_URING
-#include <liburing.h>
-#endif
-#include "qemu/coroutine-core.h"
-#include "qemu/queue.h"
-#include "qemu/event_notifier.h"
-#include "qemu/lockcnt.h"
-#include "qemu/thread.h"
-#include "qemu/timer.h"
+#include "qemu/aio.h"
 #include "block/graph-lock.h"
 #include "hw/core/qdev.h"
 
-
 typedef struct BlockAIOCB BlockAIOCB;
 typedef void BlockCompletionFunc(void *opaque, int ret);
 
@@ -48,827 +39,4 @@ void *qemu_aio_get(const AIOCBInfo *aiocb_info, BlockDriverState *bs,
 void qemu_aio_unref(void *p);
 void qemu_aio_ref(void *p);
 
-typedef struct AioHandler AioHandler;
-typedef QLIST_HEAD(, AioHandler) AioHandlerList;
-typedef void QEMUBHFunc(void *opaque);
-typedef bool AioPollFn(void *opaque);
-typedef void IOHandler(void *opaque);
-
-struct ThreadPoolAio;
-struct LinuxAioState;
-typedef struct LuringState LuringState;
-
-/* Is polling disabled? */
-bool aio_poll_disabled(AioContext *ctx);
-
-#ifdef CONFIG_LINUX_IO_URING
-/*
- * Each io_uring request must have a unique CqeHandler that processes the cqe.
- * The lifetime of a CqeHandler must be at least from aio_add_sqe() until
- * ->cb() invocation.
- */
-typedef struct CqeHandler CqeHandler;
-struct CqeHandler {
-    /* Called by the AioContext when the request has completed */
-    void (*cb)(CqeHandler *handler);
-
-    /* Used internally, do not access this */
-    QSIMPLEQ_ENTRY(CqeHandler) next;
-
-    /* This field is filled in before ->cb() is called */
-    struct io_uring_cqe cqe;
-};
-
-typedef QSIMPLEQ_HEAD(, CqeHandler) CqeHandlerSimpleQ;
-#endif /* CONFIG_LINUX_IO_URING */
-
-/* Callbacks for file descriptor monitoring implementations */
-typedef struct {
-    /*
-     * update:
-     * @ctx: the AioContext
-     * @old_node: the existing handler or NULL if this file descriptor is being
-     *            monitored for the first time
-     * @new_node: the new handler or NULL if this file descriptor is being
-     *            removed
-     *
-     * Add/remove/modify a monitored file descriptor.
-     *
-     * Called with ctx->list_lock acquired.
-     */
-    void (*update)(AioContext *ctx, AioHandler *old_node, AioHandler *new_node);
-
-    /*
-     * wait:
-     * @ctx: the AioContext
-     * @ready_list: list for handlers that become ready
-     * @timeout: maximum duration to wait, in nanoseconds
-     *
-     * Wait for file descriptors to become ready and place them on ready_list.
-     *
-     * Called with ctx->list_lock incremented but not locked.
-     *
-     * Returns: number of ready file descriptors.
-     */
-    int (*wait)(AioContext *ctx, AioHandlerList *ready_list, int64_t timeout);
-
-    /*
-     * need_wait:
-     * @ctx: the AioContext
-     *
-     * Tell aio_poll() when to stop userspace polling early because ->wait()
-     * has fds ready.
-     *
-     * File descriptor monitoring implementations that cannot poll fd readiness
-     * from userspace should use aio_poll_disabled() here.  This ensures that
-     * file descriptors are not starved by handlers that frequently make
-     * progress via userspace polling.
-     *
-     * Returns: true if ->wait() should be called, false otherwise.
-     */
-    bool (*need_wait)(AioContext *ctx);
-
-    /*
-     * dispatch:
-     * @ctx: the AioContext
-     *
-     * Dispatch any work that is specific to this file descriptor monitoring
-     * implementation. Usually the event loop's generic file descriptor
-     * monitoring, BH, and timer dispatching code is sufficient, but file
-     * descriptor monitoring implementations offering additional functionality
-     * may need to implement this function for custom behavior. Called at a
-     * point in the event loop when it is safe to invoke user-defined
-     * callbacks.
-     *
-     * This function is optional and may be NULL.
-     *
-     * Returns: true if progress was made (see aio_poll()'s return value),
-     * false otherwise.
-     */
-    bool (*dispatch)(AioContext *ctx);
-
-    /*
-     * gsource_prepare:
-     * @ctx: the AioContext
-     *
-     * Prepare for the glib event loop to wait for events instead of the usual
-     * ->wait() call. See glib's GSourceFuncs->prepare().
-     */
-    void (*gsource_prepare)(AioContext *ctx);
-
-    /*
-     * gsource_check:
-     * @ctx: the AioContext
-     *
-     * Called by the glib event loop from glib's GSourceFuncs->check() after
-     * waiting for events.
-     *
-     * Returns: true when ready to be dispatched.
-     */
-    bool (*gsource_check)(AioContext *ctx);
-
-    /*
-     * gsource_dispatch:
-     * @ctx: the AioContext
-     * @ready_list: list for handlers that become ready
-     *
-     * Place ready AioHandlers on ready_list. Called as part of the glib event
-     * loop from glib's GSourceFuncs->dispatch().
-     *
-     * Called with list_lock incremented.
-     */
-    void (*gsource_dispatch)(AioContext *ctx, AioHandlerList *ready_list);
-
-#ifdef CONFIG_LINUX_IO_URING
-    /**
-     * add_sqe: Add an io_uring sqe for submission.
-     * @prep_sqe: invoked with an sqe that should be prepared for submission
-     * @opaque: user-defined argument to @prep_sqe()
-     * @cqe_handler: the unique cqe handler associated with this request
-     *
-     * The caller's @prep_sqe() function is invoked to fill in the details of
-     * the sqe. Do not call io_uring_sqe_set_data() on this sqe.
-     *
-     * The kernel may see the sqe as soon as @prep_sqe() returns or it may take
-     * until the next event loop iteration.
-     *
-     * This function is called from the current AioContext and is not
-     * thread-safe.
-     */
-    void (*add_sqe)(AioContext *ctx,
-                    void (*prep_sqe)(struct io_uring_sqe *sqe, void *opaque),
-                    void *opaque, CqeHandler *cqe_handler);
-#endif /* CONFIG_LINUX_IO_URING */
-} FDMonOps;
-
-/*
- * Each aio_bh_poll() call carves off a slice of the BH list, so that newly
- * scheduled BHs are not processed until the next aio_bh_poll() call.  All
- * active aio_bh_poll() calls chain their slices together in a list, so that
- * nested aio_bh_poll() calls process all scheduled bottom halves.
- */
-typedef QSLIST_HEAD(, QEMUBH) BHList;
-typedef struct BHListSlice BHListSlice;
-struct BHListSlice {
-    BHList bh_list;
-    QSIMPLEQ_ENTRY(BHListSlice) next;
-};
-
-typedef QSLIST_HEAD(, AioHandler) AioHandlerSList;
-
-typedef struct AioPolledEvent {
-    int64_t ns;        /* current polling time in nanoseconds */
-} AioPolledEvent;
-
-struct AioContext {
-    GSource source;
-
-    /* Used by AioContext users to protect from multi-threaded access.  */
-    QemuRecMutex lock;
-
-    /*
-     * Keep track of readers and writers of the block layer graph.
-     * This is essential to avoid performing additions and removal
-     * of nodes and edges from block graph while some
-     * other thread is traversing it.
-     */
-    BdrvGraphRWlock *bdrv_graph;
-
-    /* The list of registered AIO handlers.  Protected by ctx->list_lock. */
-    AioHandlerList aio_handlers;
-
-    /* The list of AIO handlers to be deleted.  Protected by ctx->list_lock. */
-    AioHandlerList deleted_aio_handlers;
-
-    /* Used to avoid unnecessary event_notifier_set calls in aio_notify;
-     * only written from the AioContext home thread, or under the BQL in
-     * the case of the main AioContext.  However, it is read from any
-     * thread so it is still accessed with atomic primitives.
-     *
-     * If this field is 0, everything (file descriptors, bottom halves,
-     * timers) will be re-evaluated before the next blocking poll() or
-     * io_uring wait; therefore, the event_notifier_set call can be
-     * skipped.  If it is non-zero, you may need to wake up a concurrent
-     * aio_poll or the glib main event loop, making event_notifier_set
-     * necessary.
-     *
-     * Bit 0 is reserved for GSource usage of the AioContext, and is 1
-     * between a call to aio_ctx_prepare and the next call to aio_ctx_check.
-     * Bits 1-31 simply count the number of active calls to aio_poll
-     * that are in the prepare or poll phase.
-     *
-     * The GSource and aio_poll must use a different mechanism because
-     * there is no certainty that a call to GSource's prepare callback
-     * (via g_main_context_prepare) is indeed followed by check and
-     * dispatch.  It's not clear whether this would be a bug, but let's
-     * play safe and allow it---it will just cause extra calls to
-     * event_notifier_set until the next call to dispatch.
-     *
-     * Instead, the aio_poll calls include both the prepare and the
-     * dispatch phase, hence a simple counter is enough for them.
-     */
-    uint32_t notify_me;
-
-    /* A lock to protect between QEMUBH and AioHandler adders and deleter,
-     * and to ensure that no callbacks are removed while we're walking and
-     * dispatching them.
-     */
-    QemuLockCnt list_lock;
-
-    /* Bottom Halves pending aio_bh_poll() processing */
-    BHList bh_list;
-
-    /* Chained BH list slices for each nested aio_bh_poll() call */
-    QSIMPLEQ_HEAD(, BHListSlice) bh_slice_list;
-
-    /* Used by aio_notify.
-     *
-     * "notified" is used to avoid expensive event_notifier_test_and_clear
-     * calls.  When it is clear, the EventNotifier is clear, or one thread
-     * is going to clear "notified" before processing more events.  False
-     * positives are possible, i.e. "notified" could be set even though the
-     * EventNotifier is clear.
-     *
-     * Note that event_notifier_set *cannot* be optimized the same way.  For
-     * more information on the problem that would result, see "#ifdef BUG2"
-     * in the docs/aio_notify_accept.promela formal model.
-     */
-    bool notified;
-    EventNotifier notifier;
-
-    QSLIST_HEAD(, Coroutine) scheduled_coroutines;
-    QEMUBH *co_schedule_bh;
-
-    int thread_pool_min;
-    int thread_pool_max;
-    /* Thread pool for performing work and receiving completion callbacks.
-     * Has its own locking.
-     */
-    struct ThreadPoolAio *thread_pool;
-
-#ifdef CONFIG_LINUX_AIO
-    struct LinuxAioState *linux_aio;
-#endif
-#ifdef CONFIG_LINUX_IO_URING
-    /* State for file descriptor monitoring using Linux io_uring */
-    struct io_uring fdmon_io_uring;
-    AioHandlerSList submit_list;
-    void *io_uring_fd_tag;
-
-    /* Pending callback state for cqe handlers */
-    CqeHandlerSimpleQ cqe_handler_ready_list;
-#endif /* CONFIG_LINUX_IO_URING */
-
-    /* TimerLists for calling timers - one per clock type.  Has its own
-     * locking.
-     */
-    QEMUTimerListGroup tlg;
-
-    /* Number of AioHandlers without .io_poll() */
-    int poll_disable_cnt;
-
-    /* Polling mode parameters */
-    int64_t poll_max_ns;    /* maximum polling time in nanoseconds */
-    int64_t poll_grow;      /* polling time growth factor */
-    int64_t poll_shrink;    /* polling time shrink factor */
-
-    /* AIO engine parameters */
-    int64_t aio_max_batch;  /* maximum number of requests in a batch */
-
-    /*
-     * List of handlers participating in userspace polling.  Protected by
-     * ctx->list_lock.  Iterated and modified mostly by the event loop thread
-     * from aio_poll() with ctx->list_lock incremented.  aio_set_fd_handler()
-     * only touches the list to delete nodes if ctx->list_lock's count is zero.
-     */
-    AioHandlerList poll_aio_handlers;
-
-    /* Are we in polling mode or monitoring file descriptors? */
-    bool poll_started;
-
-    /* epoll(7) state used when built with CONFIG_EPOLL */
-    int epollfd;
-
-    /* The GSource unix fd tag for epollfd */
-    void *epollfd_tag;
-
-    const FDMonOps *fdmon_ops;
-
-    /* Was aio_context_new() successful? */
-    bool initialized;
-};
-
-/**
- * aio_context_new: Allocate a new AioContext.
- *
- * AioContext provide a mini event-loop that can be waited on synchronously.
- * They also provide bottom halves, a service to execute a piece of code
- * as soon as possible.
- */
-AioContext *aio_context_new(Error **errp);
-
-/**
- * aio_context_ref:
- * @ctx: The AioContext to operate on.
- *
- * Add a reference to an AioContext.
- */
-void aio_context_ref(AioContext *ctx);
-
-/**
- * aio_context_unref:
- * @ctx: The AioContext to operate on.
- *
- * Drop a reference to an AioContext.
- */
-void aio_context_unref(AioContext *ctx);
-
-/**
- * aio_bh_schedule_oneshot_full: Allocate a new bottom half structure that will
- * run only once and as soon as possible.
- *
- * @name: A human-readable identifier for debugging purposes.
- */
-void aio_bh_schedule_oneshot_full(AioContext *ctx, QEMUBHFunc *cb, void *opaque,
-                                  const char *name);
-
-/**
- * aio_bh_schedule_oneshot: Allocate a new bottom half structure that will run
- * only once and as soon as possible.
- *
- * A convenience wrapper for aio_bh_schedule_oneshot_full() that uses cb as the
- * name string.
- */
-#define aio_bh_schedule_oneshot(ctx, cb, opaque) \
-    aio_bh_schedule_oneshot_full((ctx), (cb), (opaque), (stringify(cb)))
-
-/**
- * aio_bh_new_full: Allocate a new bottom half structure.
- *
- * Bottom halves are lightweight callbacks whose invocation is guaranteed
- * to be wait-free, thread-safe and signal-safe.  The #QEMUBH structure
- * is opaque and must be allocated prior to its use.
- *
- * @name: A human-readable identifier for debugging purposes.
- * @reentrancy_guard: A guard set when entering a cb to prevent
- * device-reentrancy issues
- */
-QEMUBH *aio_bh_new_full(AioContext *ctx, QEMUBHFunc *cb, void *opaque,
-                        const char *name, MemReentrancyGuard *reentrancy_guard);
-
-/**
- * aio_bh_new: Allocate a new bottom half structure
- *
- * A convenience wrapper for aio_bh_new_full() that uses the cb as the name
- * string.
- */
-#define aio_bh_new(ctx, cb, opaque) \
-    aio_bh_new_full((ctx), (cb), (opaque), (stringify(cb)), NULL)
-
-/**
- * aio_bh_new_guarded: Allocate a new bottom half structure with a
- * reentrancy_guard
- *
- * A convenience wrapper for aio_bh_new_full() that uses the cb as the name
- * string.
- */
-#define aio_bh_new_guarded(ctx, cb, opaque, guard) \
-    aio_bh_new_full((ctx), (cb), (opaque), (stringify(cb)), guard)
-
-/**
- * aio_notify: Force processing of pending events.
- *
- * Similar to signaling a condition variable, aio_notify forces
- * aio_poll to exit, so that the next call will re-examine pending events.
- * The caller of aio_notify will usually call aio_poll again very soon,
- * or go through another iteration of the GLib main loop.  Hence, aio_notify
- * also has the side effect of recalculating the sets of file descriptors
- * that the main loop waits for.
- *
- * Calling aio_notify is rarely necessary, because for example scheduling
- * a bottom half calls it already.
- */
-void aio_notify(AioContext *ctx);
-
-/**
- * aio_notify_accept: Acknowledge receiving an aio_notify.
- *
- * aio_notify() uses an EventNotifier in order to wake up a sleeping
- * aio_poll() or g_main_context_iteration().  Calls to aio_notify() are
- * usually rare, but the AioContext has to clear the EventNotifier on
- * every aio_poll() or g_main_context_iteration() in order to avoid
- * busy waiting.  This event_notifier_test_and_clear() cannot be done
- * using the usual aio_context_set_event_notifier(), because it must
- * be done before processing all events (file descriptors, bottom halves,
- * timers).
- *
- * aio_notify_accept() is an optimized event_notifier_test_and_clear()
- * that is specific to an AioContext's notifier; it is used internally
- * to clear the EventNotifier only if aio_notify() had been called.
- */
-void aio_notify_accept(AioContext *ctx);
-
-/**
- * aio_bh_call: Executes callback function of the specified BH.
- */
-void aio_bh_call(QEMUBH *bh);
-
-/**
- * aio_bh_poll: Poll bottom halves for an AioContext.
- *
- * These are internal functions used by the QEMU main loop.
- * And notice that multiple occurrences of aio_bh_poll cannot
- * be called concurrently
- */
-int aio_bh_poll(AioContext *ctx);
-
-/**
- * qemu_bh_schedule: Schedule a bottom half.
- *
- * Scheduling a bottom half interrupts the main loop and causes the
- * execution of the callback that was passed to qemu_bh_new.
- *
- * Bottom halves that are scheduled from a bottom half handler are instantly
- * invoked.  This can create an infinite loop if a bottom half handler
- * schedules itself.
- *
- * @bh: The bottom half to be scheduled.
- */
-void qemu_bh_schedule(QEMUBH *bh);
-
-/**
- * qemu_bh_cancel: Cancel execution of a bottom half.
- *
- * Canceling execution of a bottom half undoes the effect of calls to
- * qemu_bh_schedule without freeing its resources yet.  While cancellation
- * itself is also wait-free and thread-safe, it can of course race with the
- * loop that executes bottom halves unless you are holding the iothread
- * mutex.  This makes it mostly useless if you are not holding the mutex.
- *
- * @bh: The bottom half to be canceled.
- */
-void qemu_bh_cancel(QEMUBH *bh);
-
-/**
- *qemu_bh_delete: Cancel execution of a bottom half and free its resources.
- *
- * Deleting a bottom half frees the memory that was allocated for it by
- * qemu_bh_new.  It also implies canceling the bottom half if it was
- * scheduled.
- * This func is async. The bottom half will do the delete action at the finial
- * end.
- *
- * @bh: The bottom half to be deleted.
- */
-void qemu_bh_delete(QEMUBH *bh);
-
-/* Return whether there are any pending callbacks from the GSource
- * attached to the AioContext, before g_poll is invoked.
- *
- * This is used internally in the implementation of the GSource.
- */
-bool aio_prepare(AioContext *ctx);
-
-/* Return whether there are any pending callbacks from the GSource
- * attached to the AioContext, after g_poll is invoked.
- *
- * This is used internally in the implementation of the GSource.
- */
-bool aio_pending(AioContext *ctx);
-
-/* Dispatch any pending callbacks from the GSource attached to the AioContext.
- *
- * This is used internally in the implementation of the GSource.
- */
-void aio_dispatch(AioContext *ctx);
-
-/* Progress in completing AIO work to occur.  This can issue new pending
- * aio as a result of executing I/O completion or bh callbacks.
- *
- * Return whether any progress was made by executing AIO or bottom half
- * handlers.  If @blocking == true, this should always be true except
- * if someone called aio_notify.
- *
- * If there are no pending bottom halves, but there are pending AIO
- * operations, it may not be possible to make any progress without
- * blocking.  If @blocking is true, this function will wait until one
- * or more AIO events have completed, to ensure something has moved
- * before returning.
- */
-bool no_coroutine_fn aio_poll(AioContext *ctx, bool blocking);
-
-/* Register a file descriptor and associated callbacks.  Behaves very similarly
- * to qemu_set_fd_handler.  Unlike qemu_set_fd_handler, these callbacks will
- * be invoked when using aio_poll().
- *
- * Code that invokes AIO completion functions should rely on this function
- * instead of qemu_set_fd_handler[2].
- */
-void aio_set_fd_handler(AioContext *ctx,
-                        int fd,
-                        IOHandler *io_read,
-                        IOHandler *io_write,
-                        AioPollFn *io_poll,
-                        IOHandler *io_poll_ready,
-                        void *opaque);
-
-/* Register an event notifier and associated callbacks.  Behaves very similarly
- * to event_notifier_set_handler.  Unlike event_notifier_set_handler, these callbacks
- * will be invoked when using aio_poll().
- *
- * Code that invokes AIO completion functions should rely on this function
- * instead of event_notifier_set_handler.
- */
-void aio_set_event_notifier(AioContext *ctx,
-                            EventNotifier *notifier,
-                            EventNotifierHandler *io_read,
-                            AioPollFn *io_poll,
-                            EventNotifierHandler *io_poll_ready);
-
-/*
- * Set polling begin/end callbacks for an event notifier that has already been
- * registered with aio_set_event_notifier.  Do nothing if the event notifier is
- * not registered.
- *
- * Note that if the io_poll_end() callback (or the entire notifier) is removed
- * during polling, it will not be called, so an io_poll_begin() is not
- * necessarily always followed by an io_poll_end().
- */
-void aio_set_event_notifier_poll(AioContext *ctx,
-                                 EventNotifier *notifier,
-                                 EventNotifierHandler *io_poll_begin,
-                                 EventNotifierHandler *io_poll_end);
-
-/* Return a GSource that lets the main loop poll the file descriptors attached
- * to this AioContext.
- */
-GSource *aio_get_g_source(AioContext *ctx);
-
-/* Return the ThreadPoolAio bound to this AioContext */
-struct ThreadPoolAio *aio_get_thread_pool(AioContext *ctx);
-
-/* Setup the LinuxAioState bound to this AioContext */
-struct LinuxAioState *aio_setup_linux_aio(AioContext *ctx, Error **errp);
-
-/* Return the LinuxAioState bound to this AioContext */
-struct LinuxAioState *aio_get_linux_aio(AioContext *ctx);
-
-/**
- * aio_timer_new_with_attrs:
- * @ctx: the aio context
- * @type: the clock type
- * @scale: the scale
- * @attributes: 0, or one to multiple OR'ed QEMU_TIMER_ATTR_<id> values
- *              to assign
- * @cb: the callback to call on timer expiry
- * @opaque: the opaque pointer to pass to the callback
- *
- * Allocate a new timer (with attributes) attached to the context @ctx.
- * The function is responsible for memory allocation.
- *
- * The preferred interface is aio_timer_init or aio_timer_init_with_attrs.
- * Use that unless you really need dynamic memory allocation.
- *
- * Returns: a pointer to the new timer
- */
-static inline QEMUTimer *aio_timer_new_with_attrs(AioContext *ctx,
-                                                  QEMUClockType type,
-                                                  int scale, int attributes,
-                                                  QEMUTimerCB *cb, void *opaque)
-{
-    return timer_new_full(&ctx->tlg, type, scale, attributes, cb, opaque);
-}
-
-/**
- * aio_timer_new:
- * @ctx: the aio context
- * @type: the clock type
- * @scale: the scale
- * @cb: the callback to call on timer expiry
- * @opaque: the opaque pointer to pass to the callback
- *
- * Allocate a new timer attached to the context @ctx.
- * See aio_timer_new_with_attrs for details.
- *
- * Returns: a pointer to the new timer
- */
-static inline QEMUTimer *aio_timer_new(AioContext *ctx, QEMUClockType type,
-                                       int scale,
-                                       QEMUTimerCB *cb, void *opaque)
-{
-    return timer_new_full(&ctx->tlg, type, scale, 0, cb, opaque);
-}
-
-/**
- * aio_timer_init_with_attrs:
- * @ctx: the aio context
- * @ts: the timer
- * @type: the clock type
- * @scale: the scale
- * @attributes: 0, or one to multiple OR'ed QEMU_TIMER_ATTR_<id> values
- *              to assign
- * @cb: the callback to call on timer expiry
- * @opaque: the opaque pointer to pass to the callback
- *
- * Initialise a new timer (with attributes) attached to the context @ctx.
- * The caller is responsible for memory allocation.
- */
-static inline void aio_timer_init_with_attrs(AioContext *ctx,
-                                             QEMUTimer *ts, QEMUClockType type,
-                                             int scale, int attributes,
-                                             QEMUTimerCB *cb, void *opaque)
-{
-    timer_init_full(ts, &ctx->tlg, type, scale, attributes, cb, opaque);
-}
-
-/**
- * aio_timer_init:
- * @ctx: the aio context
- * @ts: the timer
- * @type: the clock type
- * @scale: the scale
- * @cb: the callback to call on timer expiry
- * @opaque: the opaque pointer to pass to the callback
- *
- * Initialise a new timer attached to the context @ctx.
- * See aio_timer_init_with_attrs for details.
- */
-static inline void aio_timer_init(AioContext *ctx,
-                                  QEMUTimer *ts, QEMUClockType type,
-                                  int scale,
-                                  QEMUTimerCB *cb, void *opaque)
-{
-    timer_init_full(ts, &ctx->tlg, type, scale, 0, cb, opaque);
-}
-
-/**
- * aio_compute_timeout:
- * @ctx: the aio context
- *
- * Compute the timeout that a blocking aio_poll should use.
- */
-int64_t aio_compute_timeout(AioContext *ctx);
-
-/**
- * aio_co_schedule:
- * @ctx: the aio context
- * @co: the coroutine
- *
- * Start a coroutine on a remote AioContext.
- *
- * The coroutine must not be entered by anyone else while aio_co_schedule()
- * is active.  In addition the coroutine must have yielded unless ctx
- * is the context in which the coroutine is running (i.e. the value of
- * qemu_get_current_aio_context() from the coroutine itself).
- */
-void aio_co_schedule(AioContext *ctx, Coroutine *co);
-
-/**
- * aio_co_reschedule_self:
- * @new_ctx: the new context
- *
- * Move the currently running coroutine to new_ctx. If the coroutine is already
- * running in new_ctx, do nothing.
- *
- * Note that this function cannot reschedule from iohandler_ctx to
- * qemu_aio_context.
- */
-void coroutine_fn aio_co_reschedule_self(AioContext *new_ctx);
-
-/**
- * aio_co_wake:
- * @co: the coroutine
- *
- * Restart a coroutine on the AioContext where it was running last, thus
- * preventing coroutines from jumping from one context to another when they
- * go to sleep.
- *
- * aio_co_wake may be executed either in coroutine or non-coroutine
- * context.  The coroutine must not be entered by anyone else while
- * aio_co_wake() is active.
- *
- * If `co`'s AioContext differs from the current AioContext, this will call
- * aio_co_schedule(), which makes this safe to use even when `co` has not
- * yielded yet.  In such a case, it will be entered once it yields.
- *
- * In contrast, if `co`'s AioContext is equal to the current one, it is
- * required for `co` to currently be yielding.  This is generally the case
- * if the caller is not in `co` (i.e. invoked by `co`), because the only
- * other way for the caller to be running then is for `co` to currently be
- * yielding.
- *
- * Therefore, if there is no way for the caller to be invoked/entered by
- * `co`, it is generally safe to call this regardless of whether `co` is
- * known to already be yielding or not -- it only has to yield at some
- * point.
- */
-void aio_co_wake(Coroutine *co);
-
-/**
- * aio_co_enter:
- * @ctx: the context to run the coroutine
- * @co: the coroutine to run
- *
- * Enter a coroutine in the specified AioContext.
- */
-void aio_co_enter(AioContext *ctx, Coroutine *co);
-
-/**
- * Return the AioContext whose event loop runs in the current thread.
- *
- * If called from an IOThread this will be the IOThread's AioContext.  If
- * called from the main thread or with the "big QEMU lock" taken it
- * will be the main loop AioContext.
- *
- * Note that the return value is never the main loop's iohandler_ctx and the
- * return value is the main loop AioContext instead.
- */
-AioContext *qemu_get_current_aio_context(void);
-
-void qemu_set_current_aio_context(AioContext *ctx);
-
-/**
- * aio_context_setup:
- * @ctx: the aio context
- * @errp: error pointer
- *
- * Initialize the aio context.
- *
- * Returns: true on success, false otherwise
- */
-bool aio_context_setup(AioContext *ctx, Error **errp);
-
-/**
- * aio_context_destroy:
- * @ctx: the aio context
- *
- * Destroy the aio context.
- */
-void aio_context_destroy(AioContext *ctx);
-
-/**
- * aio_context_set_poll_params:
- * @ctx: the aio context
- * @max_ns: how long to busy poll for, in nanoseconds
- * @grow: polling time growth factor
- * @shrink: polling time shrink factor
- *
- * Poll mode can be disabled by setting poll_max_ns to 0.
- */
-void aio_context_set_poll_params(AioContext *ctx, int64_t max_ns,
-                                 int64_t grow, int64_t shrink,
-                                 Error **errp);
-
-/**
- * aio_context_set_aio_params:
- * @ctx: the aio context
- * @max_batch: maximum number of requests in a batch, 0 means that the
- *             engine will use its default
- */
-void aio_context_set_aio_params(AioContext *ctx, int64_t max_batch);
-
-/**
- * aio_context_set_thread_pool_params:
- * @ctx: the aio context
- * @min: min number of threads to have readily available in the thread pool
- * @min: max number of threads the thread pool can contain
- */
-void aio_context_set_thread_pool_params(AioContext *ctx, int64_t min,
-                                        int64_t max, Error **errp);
-
-#ifdef CONFIG_LINUX_IO_URING
-/**
- * aio_has_io_uring: Return whether io_uring is available.
- *
- * io_uring is either available in all AioContexts or in none, so this only
- * needs to be called once from within any thread's AioContext.
- */
-static inline bool aio_has_io_uring(void)
-{
-    AioContext *ctx = qemu_get_current_aio_context();
-    return ctx->fdmon_ops->add_sqe;
-}
-
-/**
- * aio_add_sqe: Add an io_uring sqe for submission.
- * @prep_sqe: invoked with an sqe that should be prepared for submission
- * @opaque: user-defined argument to @prep_sqe()
- * @cqe_handler: the unique cqe handler associated with this request
- *
- * The caller's @prep_sqe() function is invoked to fill in the details of the
- * sqe. Do not call io_uring_sqe_set_data() on this sqe.
- *
- * The sqe is submitted by the current AioContext. The kernel may see the sqe
- * as soon as @prep_sqe() returns or it may take until the next event loop
- * iteration.
- *
- * When the AioContext is destroyed, pending sqes are ignored and their
- * CqeHandlers are not invoked.
- *
- * This function must be called only when aio_has_io_uring() returns true.
- */
-void aio_add_sqe(void (*prep_sqe)(struct io_uring_sqe *sqe, void *opaque),
-                 void *opaque, CqeHandler *cqe_handler);
-#endif /* CONFIG_LINUX_IO_URING */
-
 #endif
diff --git a/include/qemu/aio.h b/include/qemu/aio.h
new file mode 100644
index 00000000000..8cca2360d1a
--- /dev/null
+++ b/include/qemu/aio.h
@@ -0,0 +1,852 @@
+/*
+ * QEMU aio implementation
+ *
+ * Copyright IBM, Corp. 2008
+ *
+ * Authors:
+ *  Anthony Liguori   <aliguori@us.ibm.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2.  See
+ * the COPYING file in the top-level directory.
+ *
+ */
+
+#ifndef QEMU_AIO_H
+#define QEMU_AIO_H
+
+#ifdef CONFIG_LINUX_IO_URING
+#include <liburing.h>
+#endif
+#include "qemu/coroutine-core.h"
+#include "qemu/queue.h"
+#include "qemu/event_notifier.h"
+#include "qemu/lockcnt.h"
+#include "qemu/thread.h"
+#include "qemu/timer.h"
+
+struct MemReentrancyGuard;
+
+typedef struct AioHandler AioHandler;
+typedef QLIST_HEAD(, AioHandler) AioHandlerList;
+typedef void QEMUBHFunc(void *opaque);
+typedef bool AioPollFn(void *opaque);
+typedef void IOHandler(void *opaque);
+
+struct ThreadPoolAio;
+struct LinuxAioState;
+typedef struct LuringState LuringState;
+
+/* Is polling disabled? */
+bool aio_poll_disabled(AioContext *ctx);
+
+#ifdef CONFIG_LINUX_IO_URING
+/*
+ * Each io_uring request must have a unique CqeHandler that processes the cqe.
+ * The lifetime of a CqeHandler must be at least from aio_add_sqe() until
+ * ->cb() invocation.
+ */
+typedef struct CqeHandler CqeHandler;
+struct CqeHandler {
+    /* Called by the AioContext when the request has completed */
+    void (*cb)(CqeHandler *handler);
+
+    /* Used internally, do not access this */
+    QSIMPLEQ_ENTRY(CqeHandler) next;
+
+    /* This field is filled in before ->cb() is called */
+    struct io_uring_cqe cqe;
+};
+
+typedef QSIMPLEQ_HEAD(, CqeHandler) CqeHandlerSimpleQ;
+#endif /* CONFIG_LINUX_IO_URING */
+
+/* Callbacks for file descriptor monitoring implementations */
+typedef struct {
+    /*
+     * update:
+     * @ctx: the AioContext
+     * @old_node: the existing handler or NULL if this file descriptor is being
+     *            monitored for the first time
+     * @new_node: the new handler or NULL if this file descriptor is being
+     *            removed
+     *
+     * Add/remove/modify a monitored file descriptor.
+     *
+     * Called with ctx->list_lock acquired.
+     */
+    void (*update)(AioContext *ctx, AioHandler *old_node, AioHandler *new_node);
+
+    /*
+     * wait:
+     * @ctx: the AioContext
+     * @ready_list: list for handlers that become ready
+     * @timeout: maximum duration to wait, in nanoseconds
+     *
+     * Wait for file descriptors to become ready and place them on ready_list.
+     *
+     * Called with ctx->list_lock incremented but not locked.
+     *
+     * Returns: number of ready file descriptors.
+     */
+    int (*wait)(AioContext *ctx, AioHandlerList *ready_list, int64_t timeout);
+
+    /*
+     * need_wait:
+     * @ctx: the AioContext
+     *
+     * Tell aio_poll() when to stop userspace polling early because ->wait()
+     * has fds ready.
+     *
+     * File descriptor monitoring implementations that cannot poll fd readiness
+     * from userspace should use aio_poll_disabled() here.  This ensures that
+     * file descriptors are not starved by handlers that frequently make
+     * progress via userspace polling.
+     *
+     * Returns: true if ->wait() should be called, false otherwise.
+     */
+    bool (*need_wait)(AioContext *ctx);
+
+    /*
+     * dispatch:
+     * @ctx: the AioContext
+     *
+     * Dispatch any work that is specific to this file descriptor monitoring
+     * implementation. Usually the event loop's generic file descriptor
+     * monitoring, BH, and timer dispatching code is sufficient, but file
+     * descriptor monitoring implementations offering additional functionality
+     * may need to implement this function for custom behavior. Called at a
+     * point in the event loop when it is safe to invoke user-defined
+     * callbacks.
+     *
+     * This function is optional and may be NULL.
+     *
+     * Returns: true if progress was made (see aio_poll()'s return value),
+     * false otherwise.
+     */
+    bool (*dispatch)(AioContext *ctx);
+
+    /*
+     * gsource_prepare:
+     * @ctx: the AioContext
+     *
+     * Prepare for the glib event loop to wait for events instead of the usual
+     * ->wait() call. See glib's GSourceFuncs->prepare().
+     */
+    void (*gsource_prepare)(AioContext *ctx);
+
+    /*
+     * gsource_check:
+     * @ctx: the AioContext
+     *
+     * Called by the glib event loop from glib's GSourceFuncs->check() after
+     * waiting for events.
+     *
+     * Returns: true when ready to be dispatched.
+     */
+    bool (*gsource_check)(AioContext *ctx);
+
+    /*
+     * gsource_dispatch:
+     * @ctx: the AioContext
+     * @ready_list: list for handlers that become ready
+     *
+     * Place ready AioHandlers on ready_list. Called as part of the glib event
+     * loop from glib's GSourceFuncs->dispatch().
+     *
+     * Called with list_lock incremented.
+     */
+    void (*gsource_dispatch)(AioContext *ctx, AioHandlerList *ready_list);
+
+#ifdef CONFIG_LINUX_IO_URING
+    /**
+     * add_sqe: Add an io_uring sqe for submission.
+     * @prep_sqe: invoked with an sqe that should be prepared for submission
+     * @opaque: user-defined argument to @prep_sqe()
+     * @cqe_handler: the unique cqe handler associated with this request
+     *
+     * The caller's @prep_sqe() function is invoked to fill in the details of
+     * the sqe. Do not call io_uring_sqe_set_data() on this sqe.
+     *
+     * The kernel may see the sqe as soon as @prep_sqe() returns or it may take
+     * until the next event loop iteration.
+     *
+     * This function is called from the current AioContext and is not
+     * thread-safe.
+     */
+    void (*add_sqe)(AioContext *ctx,
+                    void (*prep_sqe)(struct io_uring_sqe *sqe, void *opaque),
+                    void *opaque, CqeHandler *cqe_handler);
+#endif /* CONFIG_LINUX_IO_URING */
+} FDMonOps;
+
+/*
+ * Each aio_bh_poll() call carves off a slice of the BH list, so that newly
+ * scheduled BHs are not processed until the next aio_bh_poll() call.  All
+ * active aio_bh_poll() calls chain their slices together in a list, so that
+ * nested aio_bh_poll() calls process all scheduled bottom halves.
+ */
+typedef QSLIST_HEAD(, QEMUBH) BHList;
+typedef struct BHListSlice BHListSlice;
+struct BHListSlice {
+    BHList bh_list;
+    QSIMPLEQ_ENTRY(BHListSlice) next;
+};
+
+typedef QSLIST_HEAD(, AioHandler) AioHandlerSList;
+
+typedef struct AioPolledEvent {
+    int64_t ns;        /* current polling time in nanoseconds */
+} AioPolledEvent;
+
+struct AioContext {
+    GSource source;
+
+    /* Used by AioContext users to protect from multi-threaded access.  */
+    QemuRecMutex lock;
+
+    /*
+     * Keep track of readers and writers of the block layer graph.
+     * This is essential to avoid performing additions and removal
+     * of nodes and edges from block graph while some
+     * other thread is traversing it.
+     */
+    struct BdrvGraphRWlock *bdrv_graph;
+
+    /* The list of registered AIO handlers.  Protected by ctx->list_lock. */
+    AioHandlerList aio_handlers;
+
+    /* The list of AIO handlers to be deleted.  Protected by ctx->list_lock. */
+    AioHandlerList deleted_aio_handlers;
+
+    /* Used to avoid unnecessary event_notifier_set calls in aio_notify;
+     * only written from the AioContext home thread, or under the BQL in
+     * the case of the main AioContext.  However, it is read from any
+     * thread so it is still accessed with atomic primitives.
+     *
+     * If this field is 0, everything (file descriptors, bottom halves,
+     * timers) will be re-evaluated before the next blocking poll() or
+     * io_uring wait; therefore, the event_notifier_set call can be
+     * skipped.  If it is non-zero, you may need to wake up a concurrent
+     * aio_poll or the glib main event loop, making event_notifier_set
+     * necessary.
+     *
+     * Bit 0 is reserved for GSource usage of the AioContext, and is 1
+     * between a call to aio_ctx_prepare and the next call to aio_ctx_check.
+     * Bits 1-31 simply count the number of active calls to aio_poll
+     * that are in the prepare or poll phase.
+     *
+     * The GSource and aio_poll must use a different mechanism because
+     * there is no certainty that a call to GSource's prepare callback
+     * (via g_main_context_prepare) is indeed followed by check and
+     * dispatch.  It's not clear whether this would be a bug, but let's
+     * play safe and allow it---it will just cause extra calls to
+     * event_notifier_set until the next call to dispatch.
+     *
+     * Instead, the aio_poll calls include both the prepare and the
+     * dispatch phase, hence a simple counter is enough for them.
+     */
+    uint32_t notify_me;
+
+    /* A lock to protect between QEMUBH and AioHandler adders and deleters,
+     * and to ensure that no callbacks are removed while we're walking and
+     * dispatching them.
+     */
+    QemuLockCnt list_lock;
+
+    /* Bottom Halves pending aio_bh_poll() processing */
+    BHList bh_list;
+
+    /* Chained BH list slices for each nested aio_bh_poll() call */
+    QSIMPLEQ_HEAD(, BHListSlice) bh_slice_list;
+
+    /* Used by aio_notify.
+     *
+     * "notified" is used to avoid expensive event_notifier_test_and_clear
+     * calls.  When it is clear, the EventNotifier is clear, or one thread
+     * is going to clear "notified" before processing more events.  False
+     * positives are possible, i.e. "notified" could be set even though the
+     * EventNotifier is clear.
+     *
+     * Note that event_notifier_set *cannot* be optimized the same way.  For
+     * more information on the problem that would result, see "#ifdef BUG2"
+     * in the docs/aio_notify_accept.promela formal model.
+     */
+    bool notified;
+    EventNotifier notifier;
+
+    QSLIST_HEAD(, Coroutine) scheduled_coroutines;
+    QEMUBH *co_schedule_bh;
+
+    int thread_pool_min;
+    int thread_pool_max;
+    /* Thread pool for performing work and receiving completion callbacks.
+     * Has its own locking.
+     */
+    struct ThreadPoolAio *thread_pool;
+
+#ifdef CONFIG_LINUX_AIO
+    struct LinuxAioState *linux_aio;
+#endif
+#ifdef CONFIG_LINUX_IO_URING
+    /* State for file descriptor monitoring using Linux io_uring */
+    struct io_uring fdmon_io_uring;
+    AioHandlerSList submit_list;
+    void *io_uring_fd_tag;
+
+    /* Pending callback state for cqe handlers */
+    CqeHandlerSimpleQ cqe_handler_ready_list;
+#endif /* CONFIG_LINUX_IO_URING */
+
+    /* TimerLists for calling timers - one per clock type.  Has its own
+     * locking.
+     */
+    QEMUTimerListGroup tlg;
+
+    /* Number of AioHandlers without .io_poll() */
+    int poll_disable_cnt;
+
+    /* Polling mode parameters */
+    int64_t poll_max_ns;    /* maximum polling time in nanoseconds */
+    int64_t poll_grow;      /* polling time growth factor */
+    int64_t poll_shrink;    /* polling time shrink factor */
+
+    /* AIO engine parameters */
+    int64_t aio_max_batch;  /* maximum number of requests in a batch */
+
+    /*
+     * List of handlers participating in userspace polling.  Protected by
+     * ctx->list_lock.  Iterated and modified mostly by the event loop thread
+     * from aio_poll() with ctx->list_lock incremented.  aio_set_fd_handler()
+     * only touches the list to delete nodes if ctx->list_lock's count is zero.
+     */
+    AioHandlerList poll_aio_handlers;
+
+    /* Are we in polling mode or monitoring file descriptors? */
+    bool poll_started;
+
+    /* epoll(7) state used when built with CONFIG_EPOLL */
+    int epollfd;
+
+    /* The GSource unix fd tag for epollfd */
+    void *epollfd_tag;
+
+    const FDMonOps *fdmon_ops;
+
+    /* Was aio_context_new() successful? */
+    bool initialized;
+};
+
+/**
+ * aio_context_new: Allocate a new AioContext.
+ *
+ * AioContexts provide a mini event-loop that can be waited on synchronously.
+ * They also provide bottom halves, a service to execute a piece of code
+ * as soon as possible.
+ */
+AioContext *aio_context_new(Error **errp);
+
+/**
+ * aio_context_ref:
+ * @ctx: The AioContext to operate on.
+ *
+ * Add a reference to an AioContext.
+ */
+void aio_context_ref(AioContext *ctx);
+
+/**
+ * aio_context_unref:
+ * @ctx: The AioContext to operate on.
+ *
+ * Drop a reference to an AioContext.
+ */
+void aio_context_unref(AioContext *ctx);
+
+/**
+ * aio_bh_schedule_oneshot_full: Allocate a new bottom half structure that will
+ * run only once and as soon as possible.
+ *
+ * @name: A human-readable identifier for debugging purposes.
+ */
+void aio_bh_schedule_oneshot_full(AioContext *ctx, QEMUBHFunc *cb, void *opaque,
+                                  const char *name);
+
+/**
+ * aio_bh_schedule_oneshot: Allocate a new bottom half structure that will run
+ * only once and as soon as possible.
+ *
+ * A convenience wrapper for aio_bh_schedule_oneshot_full() that uses cb as the
+ * name string.
+ */
+#define aio_bh_schedule_oneshot(ctx, cb, opaque) \
+    aio_bh_schedule_oneshot_full((ctx), (cb), (opaque), (stringify(cb)))
+
+/**
+ * aio_bh_new_full: Allocate a new bottom half structure.
+ *
+ * Bottom halves are lightweight callbacks whose invocation is guaranteed
+ * to be wait-free, thread-safe and signal-safe.  The #QEMUBH structure
+ * is opaque and must be allocated prior to its use.
+ *
+ * @name: A human-readable identifier for debugging purposes.
+ * @reentrancy_guard: A guard set when entering a cb to prevent
+ * device-reentrancy issues
+ */
+QEMUBH *aio_bh_new_full(AioContext *ctx, QEMUBHFunc *cb, void *opaque,
+                        const char *name, struct MemReentrancyGuard *reentrancy_guard);
+
+/**
+ * aio_bh_new: Allocate a new bottom half structure
+ *
+ * A convenience wrapper for aio_bh_new_full() that uses the cb as the name
+ * string.
+ */
+#define aio_bh_new(ctx, cb, opaque) \
+    aio_bh_new_full((ctx), (cb), (opaque), (stringify(cb)), NULL)
+
+/**
+ * aio_bh_new_guarded: Allocate a new bottom half structure with a
+ * reentrancy_guard
+ *
+ * A convenience wrapper for aio_bh_new_full() that uses the cb as the name
+ * string.
+ */
+#define aio_bh_new_guarded(ctx, cb, opaque, guard) \
+    aio_bh_new_full((ctx), (cb), (opaque), (stringify(cb)), guard)
+
+/**
+ * aio_notify: Force processing of pending events.
+ *
+ * Similar to signaling a condition variable, aio_notify forces
+ * aio_poll to exit, so that the next call will re-examine pending events.
+ * The caller of aio_notify will usually call aio_poll again very soon,
+ * or go through another iteration of the GLib main loop.  Hence, aio_notify
+ * also has the side effect of recalculating the sets of file descriptors
+ * that the main loop waits for.
+ *
+ * Calling aio_notify is rarely necessary, because for example scheduling
+ * a bottom half calls it already.
+ */
+void aio_notify(AioContext *ctx);
+
+/**
+ * aio_notify_accept: Acknowledge receiving an aio_notify.
+ *
+ * aio_notify() uses an EventNotifier in order to wake up a sleeping
+ * aio_poll() or g_main_context_iteration().  Calls to aio_notify() are
+ * usually rare, but the AioContext has to clear the EventNotifier on
+ * every aio_poll() or g_main_context_iteration() in order to avoid
+ * busy waiting.  This event_notifier_test_and_clear() cannot be done
+ * using the usual aio_context_set_event_notifier(), because it must
+ * be done before processing all events (file descriptors, bottom halves,
+ * timers).
+ *
+ * aio_notify_accept() is an optimized event_notifier_test_and_clear()
+ * that is specific to an AioContext's notifier; it is used internally
+ * to clear the EventNotifier only if aio_notify() had been called.
+ */
+void aio_notify_accept(AioContext *ctx);
+
+/**
+ * aio_bh_call: Executes callback function of the specified BH.
+ */
+void aio_bh_call(QEMUBH *bh);
+
+/**
+ * aio_bh_poll: Poll bottom halves for an AioContext.
+ *
+ * This is an internal function used by the QEMU main loop.
+ * Note that multiple invocations of aio_bh_poll must not run
+ * concurrently.
+ */
+int aio_bh_poll(AioContext *ctx);
+
+/**
+ * qemu_bh_schedule: Schedule a bottom half.
+ *
+ * Scheduling a bottom half interrupts the main loop and causes the
+ * execution of the callback that was passed to qemu_bh_new.
+ *
+ * Bottom halves that are scheduled from a bottom half handler are instantly
+ * invoked.  This can create an infinite loop if a bottom half handler
+ * schedules itself.
+ *
+ * @bh: The bottom half to be scheduled.
+ */
+void qemu_bh_schedule(QEMUBH *bh);
+
+/**
+ * qemu_bh_cancel: Cancel execution of a bottom half.
+ *
+ * Canceling execution of a bottom half undoes the effect of calls to
+ * qemu_bh_schedule without freeing its resources yet.  While cancellation
+ * itself is also wait-free and thread-safe, it can of course race with the
+ * loop that executes bottom halves unless you are holding the iothread
+ * mutex.  This makes it mostly useless if you are not holding the mutex.
+ *
+ * @bh: The bottom half to be canceled.
+ */
+void qemu_bh_cancel(QEMUBH *bh);
+
+/**
+ * qemu_bh_delete: Cancel execution of a bottom half and free its resources.
+ *
+ * Deleting a bottom half frees the memory that was allocated for it by
+ * qemu_bh_new.  It also implies canceling the bottom half if it was
+ * scheduled.
+ * This function is asynchronous: the bottom half is actually deleted at a
+ * later point.
+ *
+ * @bh: The bottom half to be deleted.
+ */
+void qemu_bh_delete(QEMUBH *bh);
+
+/* Return whether there are any pending callbacks from the GSource
+ * attached to the AioContext, before g_poll is invoked.
+ *
+ * This is used internally in the implementation of the GSource.
+ */
+bool aio_prepare(AioContext *ctx);
+
+/* Return whether there are any pending callbacks from the GSource
+ * attached to the AioContext, after g_poll is invoked.
+ *
+ * This is used internally in the implementation of the GSource.
+ */
+bool aio_pending(AioContext *ctx);
+
+/* Dispatch any pending callbacks from the GSource attached to the AioContext.
+ *
+ * This is used internally in the implementation of the GSource.
+ */
+void aio_dispatch(AioContext *ctx);
+
+/* Make progress in completing AIO work.  This can issue new pending
+ * aio as a result of executing I/O completion or bh callbacks.
+ *
+ * Return whether any progress was made by executing AIO or bottom half
+ * handlers.  If @blocking == true, this should always be true except
+ * if someone called aio_notify.
+ *
+ * If there are no pending bottom halves, but there are pending AIO
+ * operations, it may not be possible to make any progress without
+ * blocking.  If @blocking is true, this function will wait until one
+ * or more AIO events have completed, to ensure something has moved
+ * before returning.
+ */
+bool no_coroutine_fn aio_poll(AioContext *ctx, bool blocking);
+
+/* Register a file descriptor and associated callbacks.  Behaves very similarly
+ * to qemu_set_fd_handler.  Unlike qemu_set_fd_handler, these callbacks will
+ * be invoked when using aio_poll().
+ *
+ * Code that invokes AIO completion functions should rely on this function
+ * instead of qemu_set_fd_handler[2].
+ */
+void aio_set_fd_handler(AioContext *ctx,
+                        int fd,
+                        IOHandler *io_read,
+                        IOHandler *io_write,
+                        AioPollFn *io_poll,
+                        IOHandler *io_poll_ready,
+                        void *opaque);
+
+/* Register an event notifier and associated callbacks.  Behaves very similarly
+ * to event_notifier_set_handler.  Unlike event_notifier_set_handler, these callbacks
+ * will be invoked when using aio_poll().
+ *
+ * Code that invokes AIO completion functions should rely on this function
+ * instead of event_notifier_set_handler.
+ */
+void aio_set_event_notifier(AioContext *ctx,
+                            EventNotifier *notifier,
+                            EventNotifierHandler *io_read,
+                            AioPollFn *io_poll,
+                            EventNotifierHandler *io_poll_ready);
+
+/*
+ * Set polling begin/end callbacks for an event notifier that has already been
+ * registered with aio_set_event_notifier.  Do nothing if the event notifier is
+ * not registered.
+ *
+ * Note that if the io_poll_end() callback (or the entire notifier) is removed
+ * during polling, it will not be called, so an io_poll_begin() is not
+ * necessarily always followed by an io_poll_end().
+ */
+void aio_set_event_notifier_poll(AioContext *ctx,
+                                 EventNotifier *notifier,
+                                 EventNotifierHandler *io_poll_begin,
+                                 EventNotifierHandler *io_poll_end);
+
+/* Return a GSource that lets the main loop poll the file descriptors attached
+ * to this AioContext.
+ */
+GSource *aio_get_g_source(AioContext *ctx);
+
+/* Return the ThreadPoolAio bound to this AioContext */
+struct ThreadPoolAio *aio_get_thread_pool(AioContext *ctx);
+
+/* Setup the LinuxAioState bound to this AioContext */
+struct LinuxAioState *aio_setup_linux_aio(AioContext *ctx, Error **errp);
+
+/* Return the LinuxAioState bound to this AioContext */
+struct LinuxAioState *aio_get_linux_aio(AioContext *ctx);
+
+/**
+ * aio_timer_new_with_attrs:
+ * @ctx: the aio context
+ * @type: the clock type
+ * @scale: the scale
+ * @attributes: 0, or one to multiple OR'ed QEMU_TIMER_ATTR_<id> values
+ *              to assign
+ * @cb: the callback to call on timer expiry
+ * @opaque: the opaque pointer to pass to the callback
+ *
+ * Allocate a new timer (with attributes) attached to the context @ctx.
+ * The function is responsible for memory allocation.
+ *
+ * The preferred interface is aio_timer_init or aio_timer_init_with_attrs.
+ * Use that unless you really need dynamic memory allocation.
+ *
+ * Returns: a pointer to the new timer
+ */
+static inline QEMUTimer *aio_timer_new_with_attrs(AioContext *ctx,
+                                                  QEMUClockType type,
+                                                  int scale, int attributes,
+                                                  QEMUTimerCB *cb, void *opaque)
+{
+    return timer_new_full(&ctx->tlg, type, scale, attributes, cb, opaque);
+}
+
+/**
+ * aio_timer_new:
+ * @ctx: the aio context
+ * @type: the clock type
+ * @scale: the scale
+ * @cb: the callback to call on timer expiry
+ * @opaque: the opaque pointer to pass to the callback
+ *
+ * Allocate a new timer attached to the context @ctx.
+ * See aio_timer_new_with_attrs for details.
+ *
+ * Returns: a pointer to the new timer
+ */
+static inline QEMUTimer *aio_timer_new(AioContext *ctx, QEMUClockType type,
+                                       int scale,
+                                       QEMUTimerCB *cb, void *opaque)
+{
+    return timer_new_full(&ctx->tlg, type, scale, 0, cb, opaque);
+}
+
+/**
+ * aio_timer_init_with_attrs:
+ * @ctx: the aio context
+ * @ts: the timer
+ * @type: the clock type
+ * @scale: the scale
+ * @attributes: 0, or one to multiple OR'ed QEMU_TIMER_ATTR_<id> values
+ *              to assign
+ * @cb: the callback to call on timer expiry
+ * @opaque: the opaque pointer to pass to the callback
+ *
+ * Initialise a new timer (with attributes) attached to the context @ctx.
+ * The caller is responsible for memory allocation.
+ */
+static inline void aio_timer_init_with_attrs(AioContext *ctx,
+                                             QEMUTimer *ts, QEMUClockType type,
+                                             int scale, int attributes,
+                                             QEMUTimerCB *cb, void *opaque)
+{
+    timer_init_full(ts, &ctx->tlg, type, scale, attributes, cb, opaque);
+}
+
+/**
+ * aio_timer_init:
+ * @ctx: the aio context
+ * @ts: the timer
+ * @type: the clock type
+ * @scale: the scale
+ * @cb: the callback to call on timer expiry
+ * @opaque: the opaque pointer to pass to the callback
+ *
+ * Initialise a new timer attached to the context @ctx.
+ * See aio_timer_init_with_attrs for details.
+ */
+static inline void aio_timer_init(AioContext *ctx,
+                                  QEMUTimer *ts, QEMUClockType type,
+                                  int scale,
+                                  QEMUTimerCB *cb, void *opaque)
+{
+    timer_init_full(ts, &ctx->tlg, type, scale, 0, cb, opaque);
+}
+
+/**
+ * aio_compute_timeout:
+ * @ctx: the aio context
+ *
+ * Compute the timeout that a blocking aio_poll should use.
+ */
+int64_t aio_compute_timeout(AioContext *ctx);
+
+/**
+ * aio_co_schedule:
+ * @ctx: the aio context
+ * @co: the coroutine
+ *
+ * Start a coroutine on a remote AioContext.
+ *
+ * The coroutine must not be entered by anyone else while aio_co_schedule()
+ * is active.  In addition the coroutine must have yielded unless ctx
+ * is the context in which the coroutine is running (i.e. the value of
+ * qemu_get_current_aio_context() from the coroutine itself).
+ */
+void aio_co_schedule(AioContext *ctx, Coroutine *co);
+
+/**
+ * aio_co_reschedule_self:
+ * @new_ctx: the new context
+ *
+ * Move the currently running coroutine to new_ctx. If the coroutine is already
+ * running in new_ctx, do nothing.
+ *
+ * Note that this function cannot reschedule from iohandler_ctx to
+ * qemu_aio_context.
+ */
+void coroutine_fn aio_co_reschedule_self(AioContext *new_ctx);
+
+/**
+ * aio_co_wake:
+ * @co: the coroutine
+ *
+ * Restart a coroutine on the AioContext where it was running last, thus
+ * preventing coroutines from jumping from one context to another when they
+ * go to sleep.
+ *
+ * aio_co_wake may be executed either in coroutine or non-coroutine
+ * context.  The coroutine must not be entered by anyone else while
+ * aio_co_wake() is active.
+ *
+ * If `co`'s AioContext differs from the current AioContext, this will call
+ * aio_co_schedule(), which makes this safe to use even when `co` has not
+ * yielded yet.  In such a case, it will be entered once it yields.
+ *
+ * In contrast, if `co`'s AioContext is equal to the current one, it is
+ * required for `co` to currently be yielding.  This is generally the case
+ * if the caller is not in `co` (i.e. not invoked by `co`), because the only
+ * other way for the caller to be running then is for `co` to currently be
+ * yielding.
+ *
+ * Therefore, if there is no way for the caller to be invoked/entered by
+ * `co`, it is generally safe to call this regardless of whether `co` is
+ * known to already be yielding or not -- it only has to yield at some
+ * point.
+ */
+void aio_co_wake(Coroutine *co);
+
+/**
+ * aio_co_enter:
+ * @ctx: the context to run the coroutine
+ * @co: the coroutine to run
+ *
+ * Enter a coroutine in the specified AioContext.
+ */
+void aio_co_enter(AioContext *ctx, Coroutine *co);
+
+/**
+ * Return the AioContext whose event loop runs in the current thread.
+ *
+ * If called from an IOThread this will be the IOThread's AioContext.  If
+ * called from the main thread or with the "big QEMU lock" taken it
+ * will be the main loop AioContext.
+ *
+ * Note that the return value is never the main loop's iohandler_ctx and the
+ * return value is the main loop AioContext instead.
+ */
+AioContext *qemu_get_current_aio_context(void);
+
+void qemu_set_current_aio_context(AioContext *ctx);
+
+/**
+ * aio_context_setup:
+ * @ctx: the aio context
+ * @errp: error pointer
+ *
+ * Initialize the aio context.
+ *
+ * Returns: true on success, false otherwise
+ */
+bool aio_context_setup(AioContext *ctx, Error **errp);
+
+/**
+ * aio_context_destroy:
+ * @ctx: the aio context
+ *
+ * Destroy the aio context.
+ */
+void aio_context_destroy(AioContext *ctx);
+
+/**
+ * aio_context_set_poll_params:
+ * @ctx: the aio context
+ * @max_ns: how long to busy poll for, in nanoseconds
+ * @grow: polling time growth factor
+ * @shrink: polling time shrink factor
+ *
+ * Poll mode can be disabled by setting poll_max_ns to 0.
+ */
+void aio_context_set_poll_params(AioContext *ctx, int64_t max_ns,
+                                 int64_t grow, int64_t shrink,
+                                 Error **errp);
+
+/**
+ * aio_context_set_aio_params:
+ * @ctx: the aio context
+ * @max_batch: maximum number of requests in a batch, 0 means that the
+ *             engine will use its default
+ */
+void aio_context_set_aio_params(AioContext *ctx, int64_t max_batch);
+
+/**
+ * aio_context_set_thread_pool_params:
+ * @ctx: the aio context
+ * @min: min number of threads to have readily available in the thread pool
+ * @max: max number of threads the thread pool can contain
+ */
+void aio_context_set_thread_pool_params(AioContext *ctx, int64_t min,
+                                        int64_t max, Error **errp);
+
+#ifdef CONFIG_LINUX_IO_URING
+/**
+ * aio_has_io_uring: Return whether io_uring is available.
+ *
+ * io_uring is either available in all AioContexts or in none, so this only
+ * needs to be called once from within any thread's AioContext.
+ */
+static inline bool aio_has_io_uring(void)
+{
+    AioContext *ctx = qemu_get_current_aio_context();
+    return ctx->fdmon_ops->add_sqe;
+}
+
+/**
+ * aio_add_sqe: Add an io_uring sqe for submission.
+ * @prep_sqe: invoked with an sqe that should be prepared for submission
+ * @opaque: user-defined argument to @prep_sqe()
+ * @cqe_handler: the unique cqe handler associated with this request
+ *
+ * The caller's @prep_sqe() function is invoked to fill in the details of the
+ * sqe. Do not call io_uring_sqe_set_data() on this sqe.
+ *
+ * The sqe is submitted by the current AioContext. The kernel may see the sqe
+ * as soon as @prep_sqe() returns or it may take until the next event loop
+ * iteration.
+ *
+ * When the AioContext is destroyed, pending sqes are ignored and their
+ * CqeHandlers are not invoked.
+ *
+ * This function must be called only when aio_has_io_uring() returns true.
+ */
+void aio_add_sqe(void (*prep_sqe)(struct io_uring_sqe *sqe, void *opaque),
+                 void *opaque, CqeHandler *cqe_handler);
+#endif /* CONFIG_LINUX_IO_URING */
+
+#endif
diff --git a/include/qemu/main-loop.h b/include/qemu/main-loop.h
index 0d55c636b21..8c1241a2c11 100644
--- a/include/qemu/main-loop.h
+++ b/include/qemu/main-loop.h
@@ -25,7 +25,7 @@
 #ifndef QEMU_MAIN_LOOP_H
 #define QEMU_MAIN_LOOP_H
 
-#include "block/aio.h"
+#include "qemu/aio.h"
 #include "qom/object.h"
 #include "system/event-loop-base.h"
 
@@ -431,7 +431,7 @@ void qemu_cond_timedwait_bql(QemuCond *cond, int ms);
 #define qemu_bh_new(cb, opaque) \
     qemu_bh_new_full((cb), (opaque), (stringify(cb)), NULL)
 QEMUBH *qemu_bh_new_full(QEMUBHFunc *cb, void *opaque, const char *name,
-                         MemReentrancyGuard *reentrancy_guard);
+                         struct MemReentrancyGuard *reentrancy_guard);
 void qemu_bh_schedule_idle(QEMUBH *bh);
 
 enum {
-- 
2.51.1

next             reply	other threads:[~2025-11-28 10:16 UTC|newest]

Thread overview: 5+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2025-11-28 10:15 Paolo Bonzini [this message]
2025-11-28 13:14 ` [PATCH] block: split qemu/aio.h out of block/aio.h Prasad Pandit
2025-12-02  7:42   ` Paolo Bonzini
2025-12-02 16:03 ` Kevin Wolf
2025-12-02 16:09   ` Paolo Bonzini

find likely ancestor, descendant, or conflicting patches for this message:
( dfblob:cc3d5f25a2 dfblob:dba423f896 dfblob:8cca2360d1
dfblob:0d55c636b2 dfblob:8c1241a2c1 )
 OR (
bs:"[PATCH] block: split qemu/aio.h out of block/aio.h" )
	(help)

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20251128101555.227630-1-pbonzini@redhat.com \
    --to=pbonzini@redhat.com \
    --cc=qemu-block@nongnu.org \
    --cc=qemu-devel@nongnu.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

Be sure your reply has a Subject: header at the top and a blank line before the message body.

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).