* [PATCH 1/2] io_uring/sqpoll: switch away from getrusage() for CPU accounting
From: Jens Axboe @ 2025-10-21 17:55 UTC
To: io-uring; +Cc: changfengnan, xiaobing.li, lidiangang, Jens Axboe, stable
getrusage() does a lot more than what the SQPOLL accounting needs; the
latter only cares about (and uses) the stime. Rather than do a full
RUSAGE_SELF summation, just query the used stime instead.
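In practice this is a small helper around task_cputime_adjusted() from
<linux/sched/cputime.h>, condensed from the diff below:

	struct timespec64 io_sq_cpu_time(struct task_struct *tsk)
	{
		u64 utime, stime;

		/* Fills in both, but SQPOLL accounting only consumes the stime */
		task_cputime_adjusted(tsk, &utime, &stime);
		return ns_to_timespec64(stime);
	}

Callers then convert the timespec64 to microseconds (tv_sec * 1000000 +
tv_nsec / 1000), matching what the old ru_stime based math reported.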
Cc: stable@vger.kernel.org
Fixes: 3fcb9d17206e ("io_uring/sqpoll: statistics of the true utilization of sq threads")
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
io_uring/fdinfo.c | 9 +++++----
io_uring/sqpoll.c | 34 ++++++++++++++++++++--------------
io_uring/sqpoll.h | 1 +
3 files changed, 26 insertions(+), 18 deletions(-)
diff --git a/io_uring/fdinfo.c b/io_uring/fdinfo.c
index ff3364531c77..966e06b078f6 100644
--- a/io_uring/fdinfo.c
+++ b/io_uring/fdinfo.c
@@ -59,7 +59,6 @@ static void __io_uring_show_fdinfo(struct io_ring_ctx *ctx, struct seq_file *m)
{
struct io_overflow_cqe *ocqe;
struct io_rings *r = ctx->rings;
- struct rusage sq_usage;
unsigned int sq_mask = ctx->sq_entries - 1, cq_mask = ctx->cq_entries - 1;
unsigned int sq_head = READ_ONCE(r->sq.head);
unsigned int sq_tail = READ_ONCE(r->sq.tail);
@@ -152,14 +151,16 @@ static void __io_uring_show_fdinfo(struct io_ring_ctx *ctx, struct seq_file *m)
* thread termination.
*/
if (tsk) {
+ struct timespec64 ts;
+
get_task_struct(tsk);
rcu_read_unlock();
- getrusage(tsk, RUSAGE_SELF, &sq_usage);
+ ts = io_sq_cpu_time(tsk);
put_task_struct(tsk);
sq_pid = sq->task_pid;
sq_cpu = sq->sq_cpu;
- sq_total_time = (sq_usage.ru_stime.tv_sec * 1000000
- + sq_usage.ru_stime.tv_usec);
+ sq_total_time = (ts.tv_sec * 1000000
+ + ts.tv_nsec / 1000);
sq_work_time = sq->work_time;
} else {
rcu_read_unlock();
diff --git a/io_uring/sqpoll.c b/io_uring/sqpoll.c
index a3f11349ce06..8705b0aa82e0 100644
--- a/io_uring/sqpoll.c
+++ b/io_uring/sqpoll.c
@@ -11,6 +11,7 @@
#include <linux/audit.h>
#include <linux/security.h>
#include <linux/cpuset.h>
+#include <linux/sched/cputime.h>
#include <linux/io_uring.h>
#include <uapi/linux/io_uring.h>
@@ -169,6 +170,22 @@ static inline bool io_sqd_events_pending(struct io_sq_data *sqd)
return READ_ONCE(sqd->state);
}
+struct timespec64 io_sq_cpu_time(struct task_struct *tsk)
+{
+ u64 utime, stime;
+
+ task_cputime_adjusted(tsk, &utime, &stime);
+ return ns_to_timespec64(stime);
+}
+
+static void io_sq_update_worktime(struct io_sq_data *sqd, struct timespec64 start)
+{
+ struct timespec64 ts;
+
+ ts = timespec64_sub(io_sq_cpu_time(current), start);
+ sqd->work_time += ts.tv_sec * 1000000 + ts.tv_nsec / 1000;
+}
+
static int __io_sq_thread(struct io_ring_ctx *ctx, bool cap_entries)
{
unsigned int to_submit;
@@ -255,23 +272,12 @@ static bool io_sq_tw_pending(struct llist_node *retry_list)
return retry_list || !llist_empty(&tctx->task_list);
}
-static void io_sq_update_worktime(struct io_sq_data *sqd, struct rusage *start)
-{
- struct rusage end;
-
- getrusage(current, RUSAGE_SELF, &end);
- end.ru_stime.tv_sec -= start->ru_stime.tv_sec;
- end.ru_stime.tv_usec -= start->ru_stime.tv_usec;
-
- sqd->work_time += end.ru_stime.tv_usec + end.ru_stime.tv_sec * 1000000;
-}
-
static int io_sq_thread(void *data)
{
struct llist_node *retry_list = NULL;
struct io_sq_data *sqd = data;
struct io_ring_ctx *ctx;
- struct rusage start;
+ struct timespec64 start;
unsigned long timeout = 0;
char buf[TASK_COMM_LEN] = {};
DEFINE_WAIT(wait);
@@ -317,7 +323,7 @@ static int io_sq_thread(void *data)
}
cap_entries = !list_is_singular(&sqd->ctx_list);
- getrusage(current, RUSAGE_SELF, &start);
+ start = io_sq_cpu_time(current);
list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) {
int ret = __io_sq_thread(ctx, cap_entries);
@@ -333,7 +339,7 @@ static int io_sq_thread(void *data)
if (sqt_spin || !time_after(jiffies, timeout)) {
if (sqt_spin) {
- io_sq_update_worktime(sqd, &start);
+ io_sq_update_worktime(sqd, start);
timeout = jiffies + sqd->sq_thread_idle;
}
if (unlikely(need_resched())) {
diff --git a/io_uring/sqpoll.h b/io_uring/sqpoll.h
index b83dcdec9765..84ed2b312e88 100644
--- a/io_uring/sqpoll.h
+++ b/io_uring/sqpoll.h
@@ -29,6 +29,7 @@ void io_sq_thread_unpark(struct io_sq_data *sqd);
void io_put_sq_data(struct io_sq_data *sqd);
void io_sqpoll_wait_sq(struct io_ring_ctx *ctx);
int io_sqpoll_wq_cpu_affinity(struct io_ring_ctx *ctx, cpumask_var_t mask);
+struct timespec64 io_sq_cpu_time(struct task_struct *tsk);
static inline struct task_struct *sqpoll_task_locked(struct io_sq_data *sqd)
{
--
2.51.0
* [PATCH 2/2] io_uring/sqpoll: be smarter on when to update the stime usage
From: Jens Axboe @ 2025-10-21 17:55 UTC
To: io-uring; +Cc: changfengnan, xiaobing.li, lidiangang, Jens Axboe, stable
The current approach is a bit naive and hence queries the CPU time way
too often. Only start the "doing work" timer when there's actual work to
do, and then use that information to terminate (and account) the work
time once done. This greatly reduces the frequency of these calls, as
they are now skipped entirely when the accounted time cannot have
changed anyway.
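Concretely, the start timestamp becomes a small armed/timestamp pair
that is only sampled once actual work shows up, and only read back if it
was armed. Condensed from the diff below:

	struct io_sq_time {
		bool started;
		struct timespec64 ts;
	};

	static void io_sq_start_worktime(struct io_sq_time *ist)
	{
		/* Sample the CPU time at most once per loop iteration */
		if (ist->started)
			return;
		ist->started = true;
		ist->ts = io_sq_cpu_time(current);
	}

	static void io_sq_update_worktime(struct io_sq_data *sqd,
					  struct io_sq_time *ist)
	{
		struct timespec64 ts;

		/* No work was started this iteration, nothing to account */
		if (!ist->started)
			return;
		ist->started = false;
		ts = timespec64_sub(io_sq_cpu_time(current), ist->ts);
		sqd->work_time += ts.tv_sec * 1000000 + ts.tv_nsec / 1000;
	}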
Running a basic random reader that is set up to use SQPOLL, a profile
before this change shows these as the top cycle consumers:
+ 32.60% iou-sqp-1074 [kernel.kallsyms] [k] thread_group_cputime_adjusted
+ 19.97% iou-sqp-1074 [kernel.kallsyms] [k] thread_group_cputime
+ 12.20% io_uring io_uring [.] submitter_uring_fn
+ 4.13% iou-sqp-1074 [kernel.kallsyms] [k] getrusage
+ 2.45% iou-sqp-1074 [kernel.kallsyms] [k] io_submit_sqes
+ 2.18% iou-sqp-1074 [kernel.kallsyms] [k] __pi_memset_generic
+ 2.09% iou-sqp-1074 [kernel.kallsyms] [k] cputime_adjust
and after this change, the top of the profile looks as follows:
+ 36.23% io_uring io_uring [.] submitter_uring_fn
+ 23.26% iou-sqp-819 [kernel.kallsyms] [k] io_sq_thread
+ 10.14% iou-sqp-819 [kernel.kallsyms] [k] io_sq_tw
+ 6.52% iou-sqp-819 [kernel.kallsyms] [k] tctx_task_work_run
+ 4.82% iou-sqp-819 [kernel.kallsyms] [k] nvme_submit_cmds.part.0
+ 2.91% iou-sqp-819 [kernel.kallsyms] [k] io_submit_sqes
[...]
0.02% iou-sqp-819 [kernel.kallsyms] [k] cputime_adjust
where it's spending the cycles on things that actually matter.
Reported-by: Fengnan Chang <changfengnan@bytedance.com>
Cc: stable@vger.kernel.org
Fixes: 3fcb9d17206e ("io_uring/sqpoll: statistics of the true utilization of sq threads")
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
io_uring/sqpoll.c | 43 ++++++++++++++++++++++++++++++++-----------
1 file changed, 32 insertions(+), 11 deletions(-)
diff --git a/io_uring/sqpoll.c b/io_uring/sqpoll.c
index 8705b0aa82e0..f6916f46c047 100644
--- a/io_uring/sqpoll.c
+++ b/io_uring/sqpoll.c
@@ -170,6 +170,11 @@ static inline bool io_sqd_events_pending(struct io_sq_data *sqd)
return READ_ONCE(sqd->state);
}
+struct io_sq_time {
+ bool started;
+ struct timespec64 ts;
+};
+
struct timespec64 io_sq_cpu_time(struct task_struct *tsk)
{
u64 utime, stime;
@@ -178,15 +183,27 @@ struct timespec64 io_sq_cpu_time(struct task_struct *tsk)
return ns_to_timespec64(stime);
}
-static void io_sq_update_worktime(struct io_sq_data *sqd, struct timespec64 start)
+static void io_sq_update_worktime(struct io_sq_data *sqd, struct io_sq_time *ist)
{
struct timespec64 ts;
- ts = timespec64_sub(io_sq_cpu_time(current), start);
+ if (!ist->started)
+ return;
+ ist->started = false;
+ ts = timespec64_sub(io_sq_cpu_time(current), ist->ts);
sqd->work_time += ts.tv_sec * 1000000 + ts.tv_nsec / 1000;
}
-static int __io_sq_thread(struct io_ring_ctx *ctx, bool cap_entries)
+static void io_sq_start_worktime(struct io_sq_time *ist)
+{
+ if (ist->started)
+ return;
+ ist->started = true;
+ ist->ts = io_sq_cpu_time(current);
+}
+
+static int __io_sq_thread(struct io_ring_ctx *ctx, struct io_sq_data *sqd,
+ bool cap_entries, struct io_sq_time *ist)
{
unsigned int to_submit;
int ret = 0;
@@ -199,6 +216,8 @@ static int __io_sq_thread(struct io_ring_ctx *ctx, bool cap_entries)
if (to_submit || !wq_list_empty(&ctx->iopoll_list)) {
const struct cred *creds = NULL;
+ io_sq_start_worktime(ist);
+
if (ctx->sq_creds != current_cred())
creds = override_creds(ctx->sq_creds);
@@ -277,7 +296,6 @@ static int io_sq_thread(void *data)
struct llist_node *retry_list = NULL;
struct io_sq_data *sqd = data;
struct io_ring_ctx *ctx;
- struct timespec64 start;
unsigned long timeout = 0;
char buf[TASK_COMM_LEN] = {};
DEFINE_WAIT(wait);
@@ -315,6 +333,7 @@ static int io_sq_thread(void *data)
mutex_lock(&sqd->lock);
while (1) {
bool cap_entries, sqt_spin = false;
+ struct io_sq_time ist = { };
if (io_sqd_events_pending(sqd) || signal_pending(current)) {
if (io_sqd_handle_event(sqd))
@@ -323,9 +342,8 @@ static int io_sq_thread(void *data)
}
cap_entries = !list_is_singular(&sqd->ctx_list);
- start = io_sq_cpu_time(current);
list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) {
- int ret = __io_sq_thread(ctx, cap_entries);
+ int ret = __io_sq_thread(ctx, sqd, cap_entries, &ist);
if (!sqt_spin && (ret > 0 || !wq_list_empty(&ctx->iopoll_list)))
sqt_spin = true;
@@ -333,15 +351,18 @@ static int io_sq_thread(void *data)
if (io_sq_tw(&retry_list, IORING_TW_CAP_ENTRIES_VALUE))
sqt_spin = true;
- list_for_each_entry(ctx, &sqd->ctx_list, sqd_list)
- if (io_napi(ctx))
+ list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) {
+ if (io_napi(ctx)) {
+ io_sq_start_worktime(&ist);
io_napi_sqpoll_busy_poll(ctx);
+ }
+ }
+
+ io_sq_update_worktime(sqd, &ist);
if (sqt_spin || !time_after(jiffies, timeout)) {
- if (sqt_spin) {
- io_sq_update_worktime(sqd, start);
+ if (sqt_spin)
timeout = jiffies + sqd->sq_thread_idle;
- }
if (unlikely(need_resched())) {
mutex_unlock(&sqd->lock);
cond_resched();
--
2.51.0