* [PATCH 1/4] sched/core: add helpers for iowait handling
2024-04-16 12:11 [PATCHSET v4 0/4] Split iowait into two states Jens Axboe
@ 2024-04-16 12:11 ` Jens Axboe
0 siblings, 0 replies; 29+ messages in thread
From: Jens Axboe @ 2024-04-16 12:11 UTC (permalink / raw)
To: linux-kernel; +Cc: peterz, tglx, Jens Axboe
Adds helpers to inc/dec the runqueue iowait count, based on the task, and
use those in the spots where the count is manipulated.
Adds an rq_iowait() helper, to abstract out how the per-rq stats are read.
No functional changes in this patch, just in preparation for switching
the type of 'nr_iowait'.
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
kernel/sched/core.c | 23 +++++++++++++++++++----
kernel/sched/cputime.c | 3 +--
kernel/sched/sched.h | 2 ++
3 files changed, 22 insertions(+), 6 deletions(-)
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 7019a40457a6..977bb08a33d2 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -3721,6 +3721,21 @@ static inline cpumask_t *alloc_user_cpus_ptr(int node)
#endif /* !CONFIG_SMP */
+static void task_iowait_inc(struct task_struct *p)
+{
+ atomic_inc(&task_rq(p)->nr_iowait);
+}
+
+static void task_iowait_dec(struct task_struct *p)
+{
+ atomic_dec(&task_rq(p)->nr_iowait);
+}
+
+int rq_iowait(struct rq *rq)
+{
+ return atomic_read(&rq->nr_iowait);
+}
+
static void
ttwu_stat(struct task_struct *p, int cpu, int wake_flags)
{
@@ -3787,7 +3802,7 @@ ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags,
#endif
if (p->in_iowait) {
delayacct_blkio_end(p);
- atomic_dec(&task_rq(p)->nr_iowait);
+ task_iowait_dec(p);
}
activate_task(rq, p, en_flags);
@@ -4364,7 +4379,7 @@ int try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
if (task_cpu(p) != cpu) {
if (p->in_iowait) {
delayacct_blkio_end(p);
- atomic_dec(&task_rq(p)->nr_iowait);
+ task_iowait_dec(p);
}
wake_flags |= WF_MIGRATED;
@@ -5472,7 +5487,7 @@ unsigned long long nr_context_switches(void)
unsigned int nr_iowait_cpu(int cpu)
{
- return atomic_read(&cpu_rq(cpu)->nr_iowait);
+ return rq_iowait(cpu_rq(cpu));
}
/*
@@ -6692,7 +6707,7 @@ static void __sched notrace __schedule(unsigned int sched_mode)
deactivate_task(rq, prev, DEQUEUE_SLEEP | DEQUEUE_NOCLOCK);
if (prev->in_iowait) {
- atomic_inc(&rq->nr_iowait);
+ task_iowait_inc(prev);
delayacct_blkio_start();
}
}
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index af7952f12e6c..7d9423df7779 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -222,9 +222,8 @@ void account_steal_time(u64 cputime)
void account_idle_time(u64 cputime)
{
u64 *cpustat = kcpustat_this_cpu->cpustat;
- struct rq *rq = this_rq();
- if (atomic_read(&rq->nr_iowait) > 0)
+ if (rq_iowait(this_rq()) > 0)
cpustat[CPUTIME_IOWAIT] += cputime;
else
cpustat[CPUTIME_IDLE] += cputime;
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index d2242679239e..387f67ddf18a 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -3473,4 +3473,6 @@ static inline void init_sched_mm_cid(struct task_struct *t) { }
extern u64 avg_vruntime(struct cfs_rq *cfs_rq);
extern int entity_eligible(struct cfs_rq *cfs_rq, struct sched_entity *se);
+int rq_iowait(struct rq *rq);
+
#endif /* _KERNEL_SCHED_SCHED_H */
--
2.43.0
^ permalink raw reply related [flat|nested] 29+ messages in thread
* [PATCH 1/4] sched/core: add helpers for iowait handling
2024-08-17 20:45 [PATCHSET v5 " Jens Axboe
@ 2024-08-17 20:45 ` Jens Axboe
0 siblings, 0 replies; 29+ messages in thread
From: Jens Axboe @ 2024-08-17 20:45 UTC (permalink / raw)
To: linux-kernel; +Cc: peterz, tglx, Jens Axboe
Adds helpers to inc/dec the runqueue iowait count, based on the task, and
use those in the spots where the count is manipulated.
Adds an rq_iowait() helper, to abstract out how the per-rq stats are read.
No functional changes in this patch, just in preparation for switching
the type of 'nr_iowait'.
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
kernel/sched/core.c | 23 +++++++++++++++++++----
kernel/sched/cputime.c | 3 +--
kernel/sched/sched.h | 2 ++
3 files changed, 22 insertions(+), 6 deletions(-)
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index ab50100363ca..9bf1b67818d0 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -3541,6 +3541,21 @@ static inline bool rq_has_pinned_tasks(struct rq *rq)
#endif /* !CONFIG_SMP */
+static void task_iowait_inc(struct task_struct *p)
+{
+ atomic_inc(&task_rq(p)->nr_iowait);
+}
+
+static void task_iowait_dec(struct task_struct *p)
+{
+ atomic_dec(&task_rq(p)->nr_iowait);
+}
+
+int rq_iowait(struct rq *rq)
+{
+ return atomic_read(&rq->nr_iowait);
+}
+
static void
ttwu_stat(struct task_struct *p, int cpu, int wake_flags)
{
@@ -3607,7 +3622,7 @@ ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags,
#endif
if (p->in_iowait) {
delayacct_blkio_end(p);
- atomic_dec(&task_rq(p)->nr_iowait);
+ task_iowait_dec(p);
}
activate_task(rq, p, en_flags);
@@ -4184,7 +4199,7 @@ int try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
if (task_cpu(p) != cpu) {
if (p->in_iowait) {
delayacct_blkio_end(p);
- atomic_dec(&task_rq(p)->nr_iowait);
+ task_iowait_dec(p);
}
wake_flags |= WF_MIGRATED;
@@ -5282,7 +5297,7 @@ unsigned long long nr_context_switches(void)
unsigned int nr_iowait_cpu(int cpu)
{
- return atomic_read(&cpu_rq(cpu)->nr_iowait);
+ return rq_iowait(cpu_rq(cpu));
}
/*
@@ -6512,7 +6527,7 @@ static void __sched notrace __schedule(unsigned int sched_mode)
deactivate_task(rq, prev, DEQUEUE_SLEEP | DEQUEUE_NOCLOCK);
if (prev->in_iowait) {
- atomic_inc(&rq->nr_iowait);
+ task_iowait_inc(prev);
delayacct_blkio_start();
}
}
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index 0bed0fa1acd9..b826267714de 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -222,9 +222,8 @@ void account_steal_time(u64 cputime)
void account_idle_time(u64 cputime)
{
u64 *cpustat = kcpustat_this_cpu->cpustat;
- struct rq *rq = this_rq();
- if (atomic_read(&rq->nr_iowait) > 0)
+ if (rq_iowait(this_rq()) > 0)
cpustat[CPUTIME_IOWAIT] += cputime;
else
cpustat[CPUTIME_IDLE] += cputime;
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 1e1d1b467af2..b6b3b565bcb1 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -3579,6 +3579,8 @@ static inline void init_sched_mm_cid(struct task_struct *t) { }
extern u64 avg_vruntime(struct cfs_rq *cfs_rq);
extern int entity_eligible(struct cfs_rq *cfs_rq, struct sched_entity *se);
+int rq_iowait(struct rq *rq);
+
#ifdef CONFIG_RT_MUTEXES
static inline int __rt_effective_prio(struct task_struct *pi_task, int prio)
--
2.43.0
^ permalink raw reply related [flat|nested] 29+ messages in thread
* [PATCHSET v6 0/4] Split iowait into two states
@ 2024-08-19 15:39 Jens Axboe
2024-08-19 15:39 ` [PATCH 1/4] sched/core: add helpers for iowait handling Jens Axboe
` (6 more replies)
0 siblings, 7 replies; 29+ messages in thread
From: Jens Axboe @ 2024-08-19 15:39 UTC (permalink / raw)
To: linux-kernel; +Cc: peterz, tglx
Hi,
This is v6 of the patchset where the current in_iowait state is split
into two parts:
1) The "task is sleeping waiting on IO", and would like cpufreq goodness
in terms of sleep and wakeup latencies.
2) The above, and also accounted as such in the iowait stats.
The current ->in_iowait covers both, this series splits it into two types
of state so that each can be controlled separately.
Patches 1..3 are prep patches, changing the type of
rq->nr_iowait and adding helpers to manipulate the iowait counts.
Patch 4 does the actual splitting.
This has been sitting for a while, would be nice to get this queued up
for 6.12. Comments welcome!
arch/s390/appldata/appldata_base.c | 2 +-
arch/s390/appldata/appldata_os.c | 2 +-
block/blk-cgroup.c | 2 +-
fs/proc/stat.c | 2 +-
include/linux/sched.h | 10 ++++-
include/linux/sched/stat.h | 5 ++-
kernel/locking/mutex.c | 4 +-
kernel/locking/rtmutex_api.c | 4 +-
kernel/sched/core.c | 68 ++++++++++++++++++++++++------
kernel/sched/cputime.c | 3 +-
kernel/sched/sched.h | 5 ++-
kernel/time/tick-sched.c | 6 +--
12 files changed, 81 insertions(+), 32 deletions(-)
Since v5:
- Make nr_iowait atomic_long_t unconditionally, as 32-bit archs have
it as a 32-bit type. This avoids the ifdef stuff in sched/core.c.
Thanks to Zhang Qiao for that suggestion.
--
Jens Axboe
^ permalink raw reply [flat|nested] 29+ messages in thread
* [PATCH 1/4] sched/core: add helpers for iowait handling
2024-08-19 15:39 [PATCHSET v6 0/4] Split iowait into two states Jens Axboe
@ 2024-08-19 15:39 ` Jens Axboe
2024-08-19 15:39 ` [PATCH 2/4] sched/core: change rq->nr_iowait type to atomic_long_t Jens Axboe
` (5 subsequent siblings)
6 siblings, 0 replies; 29+ messages in thread
From: Jens Axboe @ 2024-08-19 15:39 UTC (permalink / raw)
To: linux-kernel; +Cc: peterz, tglx, Jens Axboe
Adds helpers to inc/dec the runqueue iowait count, based on the task, and
use those in the spots where the count is manipulated.
Adds an rq_iowait() helper, to abstract out how the per-rq stats are read.
No functional changes in this patch, just in preparation for switching
the type of 'nr_iowait'.
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
kernel/sched/core.c | 23 +++++++++++++++++++----
kernel/sched/cputime.c | 3 +--
kernel/sched/sched.h | 2 ++
3 files changed, 22 insertions(+), 6 deletions(-)
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index ab50100363ca..9bf1b67818d0 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -3541,6 +3541,21 @@ static inline bool rq_has_pinned_tasks(struct rq *rq)
#endif /* !CONFIG_SMP */
+static void task_iowait_inc(struct task_struct *p)
+{
+ atomic_inc(&task_rq(p)->nr_iowait);
+}
+
+static void task_iowait_dec(struct task_struct *p)
+{
+ atomic_dec(&task_rq(p)->nr_iowait);
+}
+
+int rq_iowait(struct rq *rq)
+{
+ return atomic_read(&rq->nr_iowait);
+}
+
static void
ttwu_stat(struct task_struct *p, int cpu, int wake_flags)
{
@@ -3607,7 +3622,7 @@ ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags,
#endif
if (p->in_iowait) {
delayacct_blkio_end(p);
- atomic_dec(&task_rq(p)->nr_iowait);
+ task_iowait_dec(p);
}
activate_task(rq, p, en_flags);
@@ -4184,7 +4199,7 @@ int try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
if (task_cpu(p) != cpu) {
if (p->in_iowait) {
delayacct_blkio_end(p);
- atomic_dec(&task_rq(p)->nr_iowait);
+ task_iowait_dec(p);
}
wake_flags |= WF_MIGRATED;
@@ -5282,7 +5297,7 @@ unsigned long long nr_context_switches(void)
unsigned int nr_iowait_cpu(int cpu)
{
- return atomic_read(&cpu_rq(cpu)->nr_iowait);
+ return rq_iowait(cpu_rq(cpu));
}
/*
@@ -6512,7 +6527,7 @@ static void __sched notrace __schedule(unsigned int sched_mode)
deactivate_task(rq, prev, DEQUEUE_SLEEP | DEQUEUE_NOCLOCK);
if (prev->in_iowait) {
- atomic_inc(&rq->nr_iowait);
+ task_iowait_inc(prev);
delayacct_blkio_start();
}
}
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index 0bed0fa1acd9..b826267714de 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -222,9 +222,8 @@ void account_steal_time(u64 cputime)
void account_idle_time(u64 cputime)
{
u64 *cpustat = kcpustat_this_cpu->cpustat;
- struct rq *rq = this_rq();
- if (atomic_read(&rq->nr_iowait) > 0)
+ if (rq_iowait(this_rq()) > 0)
cpustat[CPUTIME_IOWAIT] += cputime;
else
cpustat[CPUTIME_IDLE] += cputime;
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 1e1d1b467af2..b6b3b565bcb1 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -3579,6 +3579,8 @@ static inline void init_sched_mm_cid(struct task_struct *t) { }
extern u64 avg_vruntime(struct cfs_rq *cfs_rq);
extern int entity_eligible(struct cfs_rq *cfs_rq, struct sched_entity *se);
+int rq_iowait(struct rq *rq);
+
#ifdef CONFIG_RT_MUTEXES
static inline int __rt_effective_prio(struct task_struct *pi_task, int prio)
--
2.43.0
^ permalink raw reply related [flat|nested] 29+ messages in thread
* [PATCH 2/4] sched/core: change rq->nr_iowait type to atomic_long_t
2024-08-19 15:39 [PATCHSET v6 0/4] Split iowait into two states Jens Axboe
2024-08-19 15:39 ` [PATCH 1/4] sched/core: add helpers for iowait handling Jens Axboe
@ 2024-08-19 15:39 ` Jens Axboe
2024-08-20 2:14 ` Zhang Qiao
2024-08-19 15:39 ` [PATCH 3/4] sched/core: have io_schedule_prepare() return a long Jens Axboe
` (4 subsequent siblings)
6 siblings, 1 reply; 29+ messages in thread
From: Jens Axboe @ 2024-08-19 15:39 UTC (permalink / raw)
To: linux-kernel; +Cc: peterz, tglx, Jens Axboe
In preparation for storing two separate iowait states in there, bump the
size from a 32-bit to a 64-bit size, for 64-bit kernels. On 32-bit
archs, the type remains a 32-bit size.
Note that on 32-bit, the number of tasks is limited to 0x8000, which
fits just fine in even half of the existing 32-bit atomic_t. For 64-bit,
no such limit exists, hence play it safe and make it a 64-bit atomic.
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
kernel/sched/core.c | 8 ++++----
kernel/sched/sched.h | 2 +-
2 files changed, 5 insertions(+), 5 deletions(-)
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 9bf1b67818d0..7e04b84dcc55 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -3543,17 +3543,17 @@ static inline bool rq_has_pinned_tasks(struct rq *rq)
static void task_iowait_inc(struct task_struct *p)
{
- atomic_inc(&task_rq(p)->nr_iowait);
+ atomic_long_inc(&task_rq(p)->nr_iowait);
}
static void task_iowait_dec(struct task_struct *p)
{
- atomic_dec(&task_rq(p)->nr_iowait);
+ atomic_long_dec(&task_rq(p)->nr_iowait);
}
int rq_iowait(struct rq *rq)
{
- return atomic_read(&rq->nr_iowait);
+ return atomic_long_read(&rq->nr_iowait);
}
static void
@@ -8372,7 +8372,7 @@ void __init sched_init(void)
#endif
#endif /* CONFIG_SMP */
hrtick_rq_init(rq);
- atomic_set(&rq->nr_iowait, 0);
+ atomic_long_set(&rq->nr_iowait, 0);
fair_server_init(rq);
#ifdef CONFIG_SCHED_CORE
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index b6b3b565bcb1..da2e67621f39 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1082,7 +1082,7 @@ struct rq {
u64 clock_idle_copy;
#endif
- atomic_t nr_iowait;
+ atomic_long_t nr_iowait;
#ifdef CONFIG_SCHED_DEBUG
u64 last_seen_need_resched_ns;
--
2.43.0
^ permalink raw reply related [flat|nested] 29+ messages in thread
* [PATCH 3/4] sched/core: have io_schedule_prepare() return a long
2024-08-19 15:39 [PATCHSET v6 0/4] Split iowait into two states Jens Axboe
2024-08-19 15:39 ` [PATCH 1/4] sched/core: add helpers for iowait handling Jens Axboe
2024-08-19 15:39 ` [PATCH 2/4] sched/core: change rq->nr_iowait type to atomic_long_t Jens Axboe
@ 2024-08-19 15:39 ` Jens Axboe
2024-09-05 9:57 ` Peter Zijlstra
2024-08-19 15:39 ` [PATCH 4/4] sched/core: split iowait state into two states Jens Axboe
` (3 subsequent siblings)
6 siblings, 1 reply; 29+ messages in thread
From: Jens Axboe @ 2024-08-19 15:39 UTC (permalink / raw)
To: linux-kernel; +Cc: peterz, tglx, Jens Axboe
In preparation for needing more state than 32-bit on 64-bit archs,
switch it to a long instead.
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
block/blk-cgroup.c | 2 +-
include/linux/sched.h | 4 ++--
kernel/locking/mutex.c | 4 ++--
kernel/locking/rtmutex_api.c | 4 ++--
kernel/sched/core.c | 6 +++---
5 files changed, 10 insertions(+), 10 deletions(-)
diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index 69e70964398c..f8e6220c66a7 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -1884,7 +1884,7 @@ static void blkcg_maybe_throttle_blkg(struct blkcg_gq *blkg, bool use_memdelay)
u64 now = blk_time_get_ns();
u64 exp;
u64 delay_nsec = 0;
- int tok;
+ long tok;
while (blkg->parent) {
int use_delay = atomic_read(&blkg->use_delay);
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 2c1b4ee3234f..c1a65e19a3ac 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -319,8 +319,8 @@ asmlinkage void preempt_schedule_irq(void);
extern void schedule_rtlock(void);
#endif
-extern int __must_check io_schedule_prepare(void);
-extern void io_schedule_finish(int token);
+extern long __must_check io_schedule_prepare(void);
+extern void io_schedule_finish(long token);
extern long io_schedule_timeout(long timeout);
extern void io_schedule(void);
diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c
index cbae8c0b89ab..4a86ea6c7f19 100644
--- a/kernel/locking/mutex.c
+++ b/kernel/locking/mutex.c
@@ -830,7 +830,7 @@ EXPORT_SYMBOL_GPL(mutex_lock_interruptible_nested);
void __sched
mutex_lock_io_nested(struct mutex *lock, unsigned int subclass)
{
- int token;
+ long token;
might_sleep();
@@ -1026,7 +1026,7 @@ EXPORT_SYMBOL(mutex_lock_killable);
*/
void __sched mutex_lock_io(struct mutex *lock)
{
- int token;
+ long token;
token = io_schedule_prepare();
mutex_lock(lock);
diff --git a/kernel/locking/rtmutex_api.c b/kernel/locking/rtmutex_api.c
index a6974d044593..ddf7f7f3f0b5 100644
--- a/kernel/locking/rtmutex_api.c
+++ b/kernel/locking/rtmutex_api.c
@@ -547,7 +547,7 @@ EXPORT_SYMBOL_GPL(mutex_lock_killable_nested);
void __sched mutex_lock_io_nested(struct mutex *lock, unsigned int subclass)
{
- int token;
+ long token;
might_sleep();
@@ -579,7 +579,7 @@ EXPORT_SYMBOL(mutex_lock_killable);
void __sched mutex_lock_io(struct mutex *lock)
{
- int token = io_schedule_prepare();
+ long token = io_schedule_prepare();
__mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, 0, NULL, _RET_IP_);
io_schedule_finish(token);
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 7e04b84dcc55..d6c494e33206 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -7415,16 +7415,16 @@ static inline void preempt_dynamic_init(void) { }
#endif /* CONFIG_PREEMPT_DYNAMIC */
-int io_schedule_prepare(void)
+long io_schedule_prepare(void)
{
- int old_iowait = current->in_iowait;
+ long old_iowait = current->in_iowait;
current->in_iowait = 1;
blk_flush_plug(current->plug, true);
return old_iowait;
}
-void io_schedule_finish(int token)
+void io_schedule_finish(long token)
{
current->in_iowait = token;
}
--
2.43.0
^ permalink raw reply related [flat|nested] 29+ messages in thread
* [PATCH 4/4] sched/core: split iowait state into two states
2024-08-19 15:39 [PATCHSET v6 0/4] Split iowait into two states Jens Axboe
` (2 preceding siblings ...)
2024-08-19 15:39 ` [PATCH 3/4] sched/core: have io_schedule_prepare() return a long Jens Axboe
@ 2024-08-19 15:39 ` Jens Axboe
2024-09-05 10:55 ` Peter Zijlstra
2024-08-21 14:54 ` [PATCHSET v6 0/4] Split iowait " Christian Loehle
` (2 subsequent siblings)
6 siblings, 1 reply; 29+ messages in thread
From: Jens Axboe @ 2024-08-19 15:39 UTC (permalink / raw)
To: linux-kernel; +Cc: peterz, tglx, Jens Axboe
iowait is a bogus metric, but it's helpful in the sense that it allows
short waits to not enter sleep states that have a higher exit latency
than would've otherwise have been picked for iowait'ing tasks. However,
it's harmful in that lots of applications and monitoring assume that
iowait is busy time, or otherwise use it as a health metric.
Particularly for async IO it's entirely nonsensical.
Split the iowait part into two parts - one that tracks whether the task
needs boosting for short waits, and one that marks whether the task
needs to be accounted as such. ->in_iowait_acct nests inside of
->in_iowait, both for efficiency reasons, but also so that the
relationship between the two is clear. A waiter may set ->in_iowait alone
and not care about the accounting.
Existing users of nr_iowait() for accounting purposes are switched to
use nr_iowait_acct(), which leaves the governor using nr_iowait() as it
only cares about iowaiters, not the accounting side.
Utilize that there's enough space in rq->nr_iowait to store both values
in there, shifting the accounting side by half the size of the type.
Thank you to Thomas Gleixner for that [1] suggestion.
[1] https://lore.kernel.org/lkml/87sf1b6o9w.ffs@tglx/
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
arch/s390/appldata/appldata_base.c | 2 +-
arch/s390/appldata/appldata_os.c | 2 +-
fs/proc/stat.c | 2 +-
include/linux/sched.h | 6 ++++
include/linux/sched/stat.h | 5 ++--
kernel/sched/core.c | 45 +++++++++++++++++++++++-------
kernel/sched/sched.h | 1 +
kernel/time/tick-sched.c | 6 ++--
8 files changed, 51 insertions(+), 18 deletions(-)
diff --git a/arch/s390/appldata/appldata_base.c b/arch/s390/appldata/appldata_base.c
index 91a30e017d65..5b21225405fd 100644
--- a/arch/s390/appldata/appldata_base.c
+++ b/arch/s390/appldata/appldata_base.c
@@ -423,4 +423,4 @@ EXPORT_SYMBOL_GPL(si_swapinfo);
#endif
EXPORT_SYMBOL_GPL(nr_threads);
EXPORT_SYMBOL_GPL(nr_running);
-EXPORT_SYMBOL_GPL(nr_iowait);
+EXPORT_SYMBOL_GPL(nr_iowait_acct);
diff --git a/arch/s390/appldata/appldata_os.c b/arch/s390/appldata/appldata_os.c
index a363d30ce739..fa4b278aca6c 100644
--- a/arch/s390/appldata/appldata_os.c
+++ b/arch/s390/appldata/appldata_os.c
@@ -100,7 +100,7 @@ static void appldata_get_os_data(void *data)
os_data->nr_threads = nr_threads;
os_data->nr_running = nr_running();
- os_data->nr_iowait = nr_iowait();
+ os_data->nr_iowait = nr_iowait_acct();
os_data->avenrun[0] = avenrun[0] + (FIXED_1/200);
os_data->avenrun[1] = avenrun[1] + (FIXED_1/200);
os_data->avenrun[2] = avenrun[2] + (FIXED_1/200);
diff --git a/fs/proc/stat.c b/fs/proc/stat.c
index da60956b2915..149be7a884fb 100644
--- a/fs/proc/stat.c
+++ b/fs/proc/stat.c
@@ -180,7 +180,7 @@ static int show_stat(struct seq_file *p, void *v)
(unsigned long long)boottime.tv_sec,
total_forks,
nr_running(),
- nr_iowait());
+ nr_iowait_acct());
seq_put_decimal_ull(p, "softirq ", (unsigned long long)sum_softirq);
diff --git a/include/linux/sched.h b/include/linux/sched.h
index c1a65e19a3ac..1b40847b0a80 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -953,7 +953,13 @@ struct task_struct {
/* Bit to tell TOMOYO we're in execve(): */
unsigned in_execve:1;
+ /* task is in iowait */
unsigned in_iowait:1;
+ /*
+ * task is in iowait and should be accounted as such. can only be set
+ * if ->in_iowait is also set.
+ */
+ unsigned in_iowait_acct:1;
#ifndef TIF_RESTORE_SIGMASK
unsigned restore_sigmask:1;
#endif
diff --git a/include/linux/sched/stat.h b/include/linux/sched/stat.h
index 0108a38bb64d..31e8a44b3d71 100644
--- a/include/linux/sched/stat.h
+++ b/include/linux/sched/stat.h
@@ -19,8 +19,9 @@ DECLARE_PER_CPU(unsigned long, process_counts);
extern int nr_processes(void);
extern unsigned int nr_running(void);
extern bool single_task_running(void);
-extern unsigned int nr_iowait(void);
-extern unsigned int nr_iowait_cpu(int cpu);
+unsigned int nr_iowait_acct(void);
+unsigned int nr_iowait_acct_cpu(int cpu);
+unsigned int nr_iowait_cpu(int cpu);
static inline int sched_info_on(void)
{
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index d6c494e33206..a082919caaf8 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -3541,19 +3541,33 @@ static inline bool rq_has_pinned_tasks(struct rq *rq)
#endif /* !CONFIG_SMP */
+/* Shift half the size of the type, atomic_long_t */
+#define IOWAIT_SHIFT (4 * sizeof(atomic_long_t))
+
+/*
+ * Store iowait and iowait_acct state in the same variable. The lower bits
+ * hold the iowait state, and the upper bits hold the iowait_acct state.
+ */
static void task_iowait_inc(struct task_struct *p)
{
- atomic_long_inc(&task_rq(p)->nr_iowait);
+ long val = 1 + ((long) p->in_iowait_acct << IOWAIT_SHIFT);
+ atomic_long_add(val, &task_rq(p)->nr_iowait);
}
static void task_iowait_dec(struct task_struct *p)
{
- atomic_long_dec(&task_rq(p)->nr_iowait);
+ long val = 1 + ((long) p->in_iowait_acct << IOWAIT_SHIFT);
+ atomic_long_sub(val, &task_rq(p)->nr_iowait);
}
int rq_iowait(struct rq *rq)
{
- return atomic_long_read(&rq->nr_iowait);
+ return atomic_long_read(&rq->nr_iowait) & ((1UL << IOWAIT_SHIFT) - 1);
+}
+
+int rq_iowait_acct(struct rq *rq)
+{
+ return atomic_long_read(&rq->nr_iowait) >> IOWAIT_SHIFT;
}
static void
@@ -5295,7 +5309,12 @@ unsigned long long nr_context_switches(void)
* it does become runnable.
*/
-unsigned int nr_iowait_cpu(int cpu)
+unsigned int nr_iowait_acct_cpu(int cpu)
+{
+ return rq_iowait_acct(cpu_rq(cpu));
+}
+
+unsigned nr_iowait_cpu(int cpu)
{
return rq_iowait(cpu_rq(cpu));
}
@@ -5330,12 +5349,12 @@ unsigned int nr_iowait_cpu(int cpu)
* Task CPU affinities can make all that even more 'interesting'.
*/
-unsigned int nr_iowait(void)
+unsigned int nr_iowait_acct(void)
{
unsigned int i, sum = 0;
for_each_possible_cpu(i)
- sum += nr_iowait_cpu(i);
+ sum += nr_iowait_acct_cpu(i);
return sum;
}
@@ -7415,18 +7434,24 @@ static inline void preempt_dynamic_init(void) { }
#endif /* CONFIG_PREEMPT_DYNAMIC */
+/*
+ * Returns a token which is comprised of the two bits of iowait wait state -
+ * one is whether we're marking ourselves as in iowait for cpufreq reasons,
+ * and the other is if the task should be accounted as such.
+ */
long io_schedule_prepare(void)
{
- long old_iowait = current->in_iowait;
-
+ long token = current->in_iowait + ((long) current->in_iowait_acct << IOWAIT_SHIFT);
current->in_iowait = 1;
+ current->in_iowait_acct = 1;
blk_flush_plug(current->plug, true);
- return old_iowait;
+ return token;
}
void io_schedule_finish(long token)
{
- current->in_iowait = token;
+ current->in_iowait = token & 0x01;
+ current->in_iowait_acct = token >> IOWAIT_SHIFT;
}
/*
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index da2e67621f39..be377ecd1301 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -3580,6 +3580,7 @@ extern u64 avg_vruntime(struct cfs_rq *cfs_rq);
extern int entity_eligible(struct cfs_rq *cfs_rq, struct sched_entity *se);
int rq_iowait(struct rq *rq);
+int rq_iowait_acct(struct rq *rq);
#ifdef CONFIG_RT_MUTEXES
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 753a184c7090..17a82771c3df 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -732,7 +732,7 @@ static void tick_nohz_stop_idle(struct tick_sched *ts, ktime_t now)
delta = ktime_sub(now, ts->idle_entrytime);
write_seqcount_begin(&ts->idle_sleeptime_seq);
- if (nr_iowait_cpu(smp_processor_id()) > 0)
+ if (nr_iowait_acct_cpu(smp_processor_id()) > 0)
ts->iowait_sleeptime = ktime_add(ts->iowait_sleeptime, delta);
else
ts->idle_sleeptime = ktime_add(ts->idle_sleeptime, delta);
@@ -805,7 +805,7 @@ u64 get_cpu_idle_time_us(int cpu, u64 *last_update_time)
struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
return get_cpu_sleep_time_us(ts, &ts->idle_sleeptime,
- !nr_iowait_cpu(cpu), last_update_time);
+ !nr_iowait_acct_cpu(cpu), last_update_time);
}
EXPORT_SYMBOL_GPL(get_cpu_idle_time_us);
@@ -831,7 +831,7 @@ u64 get_cpu_iowait_time_us(int cpu, u64 *last_update_time)
struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
return get_cpu_sleep_time_us(ts, &ts->iowait_sleeptime,
- nr_iowait_cpu(cpu), last_update_time);
+ nr_iowait_acct_cpu(cpu), last_update_time);
}
EXPORT_SYMBOL_GPL(get_cpu_iowait_time_us);
--
2.43.0
^ permalink raw reply related [flat|nested] 29+ messages in thread
* Re: [PATCH 2/4] sched/core: change rq->nr_iowait type to atomic_long_t
2024-08-19 15:39 ` [PATCH 2/4] sched/core: change rq->nr_iowait type to atomic_long_t Jens Axboe
@ 2024-08-20 2:14 ` Zhang Qiao
0 siblings, 0 replies; 29+ messages in thread
From: Zhang Qiao @ 2024-08-20 2:14 UTC (permalink / raw)
To: Jens Axboe, linux-kernel; +Cc: peterz, tglx
在 2024/8/19 23:39, Jens Axboe 写道:
> In preparation for storing two separate iowait states in there, bump the
> size from a 32-bit to a 64-bit size, for 64-bit kernels. On 32-bit
> archs, the type remains a 32-bit size.
>
> Note that on 32-bit, the number of tasks are limited to 0x8000, which
> fits just fine in even half of the existing 32-bit atomic_t. For 64-bit,
> no such limit exists, hence play it safe and make it a 64-bit atomic.
>
> Signed-off-by: Jens Axboe <axboe@kernel.dk>
Reviewed-by: Zhang Qiao <zhangqiao22@huawei.com>
> ---
> kernel/sched/core.c | 8 ++++----
> kernel/sched/sched.h | 2 +-
> 2 files changed, 5 insertions(+), 5 deletions(-)
>
> diff --git a/kernel/sched/core.c b/kernel/sched/core.c
> index 9bf1b67818d0..7e04b84dcc55 100644
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -3543,17 +3543,17 @@ static inline bool rq_has_pinned_tasks(struct rq *rq)
>
> static void task_iowait_inc(struct task_struct *p)
> {
> - atomic_inc(&task_rq(p)->nr_iowait);
> + atomic_long_inc(&task_rq(p)->nr_iowait);
> }
>
> static void task_iowait_dec(struct task_struct *p)
> {
> - atomic_dec(&task_rq(p)->nr_iowait);
> + atomic_long_dec(&task_rq(p)->nr_iowait);
> }
>
> int rq_iowait(struct rq *rq)
> {
> - return atomic_read(&rq->nr_iowait);
> + return atomic_long_read(&rq->nr_iowait);
> }
>
> static void
> @@ -8372,7 +8372,7 @@ void __init sched_init(void)
> #endif
> #endif /* CONFIG_SMP */
> hrtick_rq_init(rq);
> - atomic_set(&rq->nr_iowait, 0);
> + atomic_long_set(&rq->nr_iowait, 0);
> fair_server_init(rq);
>
> #ifdef CONFIG_SCHED_CORE
> diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
> index b6b3b565bcb1..da2e67621f39 100644
> --- a/kernel/sched/sched.h
> +++ b/kernel/sched/sched.h
> @@ -1082,7 +1082,7 @@ struct rq {
> u64 clock_idle_copy;
> #endif
>
> - atomic_t nr_iowait;
> + atomic_long_t nr_iowait;>
> #ifdef CONFIG_SCHED_DEBUG
> u64 last_seen_need_resched_ns;
^ permalink raw reply [flat|nested] 29+ messages in thread
* Re: [PATCHSET v6 0/4] Split iowait into two states
2024-08-19 15:39 [PATCHSET v6 0/4] Split iowait into two states Jens Axboe
` (3 preceding siblings ...)
2024-08-19 15:39 ` [PATCH 4/4] sched/core: split iowait state into two states Jens Axboe
@ 2024-08-21 14:54 ` Christian Loehle
2024-08-21 15:04 ` Jens Axboe
2024-09-04 14:28 ` Peter Zijlstra
2025-03-31 9:02 ` Pavel Begunkov
6 siblings, 1 reply; 29+ messages in thread
From: Christian Loehle @ 2024-08-21 14:54 UTC (permalink / raw)
To: Jens Axboe, linux-kernel
Cc: peterz, tglx, Vincent Guittot, Qais Yousef, Rafael J. Wysocki
On 8/19/24 16:39, Jens Axboe wrote:
> Hi,
>
> This is v6 of the patchset where the current in_iowait state is split
> into two parts:
>
> 1) The "task is sleeping waiting on IO", and would like cpufreq goodness
> in terms of sleep and wakeup latencies.
> 2) The above, and also accounted as such in the iowait stats.
>
> The current ->in_iowait covers both, this series splits it into two types
> of state so that each can be controlled separately.
Hi Jens,
I wanted to give a brief update on where I think we're at in terms
of iowait behavior regarding cpuidle and cpufreq.
I'm still working on getting both removed, given the discussions had
on the list [0] and at OSPM [1] this seems realistic and the best way
forward IMO.
That would then naturally make this series and the iowait workaround in
io_uring/io_uring.c unnecessary.
1. For cpuidle:
Main issue with relying on nr_iowaiters is that there is no guarantee
whatsoever that these tasks will wakeup where they went to sleep so if
we can achieve the same throughput without nr_iowaiters it shouldn't
be relevant.
I spent quite some time in fixing teo [2], because untangling nr_iowaiters
from menu seems hard, essentially nobody has worked on menu seriously for
a while now. Thus the plan here is to replace menu by teo eventually.
For your io_uring workloads I see throughput on par for teo (doesn't rely
on iowait) and menu.
# echo teo > /sys/devices/system/cpu/cpuidle/current_governor
# ./io_uring -r 5 -X0 -d 1 -s 1 -c 1 -p 0 -S 1 -R 0 /dev/nvme0n1
submitter=0, tid=206, file=/dev/nvme0n1, node=-1
polled=0, fixedbufs=1/0, register_files=1, buffered=0, QD=1
Engine=preadv2
IOPS=22500, BW=87MiB/s, IOS/call=0/0
IOPS=21916, BW=85MiB/s, IOS/call=1/0
IOPS=21774, BW=85MiB/s, IOS/call=1/0
IOPS=22467, BW=87MiB/s, IOS/call=1/0
Exiting on timeout
Maximum IOPS=22500
# echo menu > /sys/devices/system/cpu/cpuidle/current_governor
[ 178.754571] cpuidle: using governor menu
# ./io_uring -r 5 -X0 -d 1 -s 1 -c 1 -p 0 -S 1 -R 0 /dev/nvme0n1
submitter=0, tid=209, file=/dev/nvme0n1, node=-1
polled=0, fixedbufs=1/0, register_files=1, buffered=0, QD=1
Engine=preadv2
IOPS=21452, BW=83MiB/s, IOS/call=0/0
IOPS=21778, BW=85MiB/s, IOS/call=1/0
IOPS=21120, BW=82MiB/s, IOS/call=1/0
IOPS=20903, BW=81MiB/s, IOS/call=1/0
Exiting on timeout
Maximum IOPS=21778
Please do give it a try for yourself as well!
2. For cpufreq:
Main issue for IO-bound workloads with iowait boosting is we're punishing
the 'good' workloads (that don't have iowait sleeps in their throughput-critical
part, which is already bad because of the scheduling overhead induced) by
making them energy-inefficient to make synthetic benchmarks happy.
A study of more realistic workloads shows that they don't suffer from a problem
of building up utilization, not util_est anyway, so they don't actually benefit
from a cpufreq boost.
This leads me to the conclusion that cpufreq iowait boosting can be scrapped
altogether if we accept some degradation of benchmarks like
./io_uring -r 5 -X0 -d 1 -s 1 -c 1 -p 0 -S 1 -R 0 /dev/nvme0n1
or
fio --name=fio --rw=randread --bs=4k --runtime=5 --time_based --filename=/dev/nvme0n1 --iodepth=1 --numjobs=1
(non-io_uring) for that matter.
For io_uring where the expected case is probably not single-threaded sync IO
(or iodepth=1) the cpufreq iowait boost is just hurting use-cases by pushing
it to less efficient frequencies that might not be needed.
I know you want your problem (io_uring showing up as 100% busy even though it's
just sleeping) to be solved like yesterday and my opinion on a future timeline
might not be enough to convince you of much. I wanted to share it anyway.
I don't see an issue with the actual code you're proposing, but it does feel
like a step in the wrong direction to me.
[0] https://lore.kernel.org/lkml/20240304201625.100619-1-christian.loehle@arm.com/
v2: https://lore.kernel.org/lkml/20240518113947.2127802-1-christian.loehle@arm.com/
[1] https://www.youtube.com/watch?v=MSQGEsSziZ4
[2] https://lore.kernel.org/lkml/20240628095955.34096-1-christian.loehle@arm.com/
Regards,
Christian
> [snip]
^ permalink raw reply [flat|nested] 29+ messages in thread
* Re: [PATCHSET v6 0/4] Split iowait into two states
2024-08-21 14:54 ` [PATCHSET v6 0/4] Split iowait " Christian Loehle
@ 2024-08-21 15:04 ` Jens Axboe
2024-08-21 15:57 ` Christian Loehle
0 siblings, 1 reply; 29+ messages in thread
From: Jens Axboe @ 2024-08-21 15:04 UTC (permalink / raw)
To: Christian Loehle, linux-kernel
Cc: peterz, tglx, Vincent Guittot, Qais Yousef, Rafael J. Wysocki
On 8/21/24 8:54 AM, Christian Loehle wrote:
> On 8/19/24 16:39, Jens Axboe wrote:
>> Hi,
>>
>> This is v6 of the patchset where the current in_iowait state is split
>> into two parts:
>>
>> 1) The "task is sleeping waiting on IO", and would like cpufreq goodness
>> in terms of sleep and wakeup latencies.
>> 2) The above, and also accounted as such in the iowait stats.
>>
>> The current ->in_iowait covers both, this series splits it into two types
>> of state so that each can be controlled separately.
>
> Hi Jens,
> I wanted to give a brief update on where I think we're at in terms
> of iowait behavior regarding cpuidle and cpufreq.
> I'm still working on getting both removed, given the discussions had
> on the list [0] and at OSPM [1] this seems realistic and the best way
> forward IMO.
> That would then naturally make this series and the iowait workaround in
> io_uring/io_uring.c unnecessary.
>
> 1. For cpuidle:
> Main issue with relying on nr_iowaiters is that there is no guarantee
> whatsoever that these tasks will wakeup where they went to sleep so if
> we can achieve the same throughput without nr_iowaiters it shouldn't
> be relevant.
> I spent quite some time in fixing teo [2], because untangling nr_iowaiters
> from menu seems hard, essentially nobody has worked on menu seriously for
> a while now. Thus the plan here is to replace menu by teo eventually.
> For your io_uring workloads I see throughput on par for teo (doesn't rely
> on iowait) and menu.
>
> # echo teo > /sys/devices/system/cpu/cpuidle/current_governor
> # ./io_uring -r 5 -X0 -d 1 -s 1 -c 1 -p 0 -S 1 -R 0 /dev/nvme0n1
> submitter=0, tid=206, file=/dev/nvme0n1, node=-1
> polled=0, fixedbufs=1/0, register_files=1, buffered=0, QD=1
> Engine=preadv2
> IOPS=22500, BW=87MiB/s, IOS/call=0/0
> IOPS=21916, BW=85MiB/s, IOS/call=1/0
> IOPS=21774, BW=85MiB/s, IOS/call=1/0
> IOPS=22467, BW=87MiB/s, IOS/call=1/0
> Exiting on timeout
> Maximum IOPS=22500
> # echo menu > /sys/devices/system/cpu/cpuidle/current_governor
> [ 178.754571] cpuidle: using governor menu
> # ./io_uring -r 5 -X0 -d 1 -s 1 -c 1 -p 0 -S 1 -R 0 /dev/nvme0n1
> submitter=0, tid=209, file=/dev/nvme0n1, node=-1
> polled=0, fixedbufs=1/0, register_files=1, buffered=0, QD=1
> Engine=preadv2
> IOPS=21452, BW=83MiB/s, IOS/call=0/0
> IOPS=21778, BW=85MiB/s, IOS/call=1/0
> IOPS=21120, BW=82MiB/s, IOS/call=1/0
> IOPS=20903, BW=81MiB/s, IOS/call=1/0
> Exiting on timeout
> Maximum IOPS=21778
>
> Please do give it a try for yourself as well!
>
> 2. For cpufreq:
> Main issue for IO-bound workloads with iowait boosting is we're punishing
> the 'good' workloads (that don't have iowait sleeps in their throughput-critical
> part, which is already bad because of the scheduling overhead induced) by
> making them energy-inefficient to make synthetic benchmarks happy.
> A study of more realistic workloads show that they don't suffer from a problem
> of building up utilization, not util_est anyway, so they don't actually benefit
> from a cpufreq boost.
> This leads me to the conclusion that cpufreq iowait boosting can be scrapped
> altogether if we accept some degradation of benchmarks like
> ./io_uring -r 5 -X0 -d 1 -s 1 -c 1 -p 0 -S 1 -R 0 /dev/nvme0n1
> or
> fio --name=fio --rw=randread --bs=4k --runtime=5 --time_based --filename=/dev/nvme0n1 --iodepth=1 --numjobs=1
> (non-io_uring) for that matter.
The original iowait addition came because a big regression was seen
compared to not setting iowait, it was around 20% iirc. That's big, and
not in the realm of "some degradation" that will be acceptable. And that
will largely depend on the system being used. On some systems, it'll be
less, and on some it'll be more.
> For io_uring where the expected case is probably not single-threaded
> sync IO (or iodepth=1) the cpufreq iowait boost is just hurting
> use-cases by pushing it to less efficient frequencies that might not
> be needed.
People do all sorts of things, and sync (or low queue depth) IO is
certainly one of the use cases. In fact that's where the above report
came from, on the postgres aio side.
> I know you want your problem (io_uring showing up as 100% busy even
> though it's just sleeping) to be solved like yesterday and my opinion
> on a future timeline might not be enough to convince you of much. I
> wanted to share it anyway. I don't see an issue with the actual code
> you're proposing, but it does feel like a step in the wrong direction
> to me.
As mentioned in my original reply, I view this as entirely orthogonal,
and while I appreciate your efforts in this area, I'm a little tired of
this being brought up as a gatekeeping metric when it's not there.
If we can eliminate iowait for boosting down the line, then I'm all for
it. But this has now been pending for > 6 months and I don't think it's
fair to keep stringing this along on a future promise. This isn't a lot
of code and it solves the issue for now, if the code will get removed
down the line as not needed, then that's certainly fine. For now, we
need it.
--
Jens Axboe
^ permalink raw reply [flat|nested] 29+ messages in thread
* Re: [PATCHSET v6 0/4] Split iowait into two states
2024-08-21 15:04 ` Jens Axboe
@ 2024-08-21 15:57 ` Christian Loehle
2024-08-24 15:34 ` Jens Axboe
0 siblings, 1 reply; 29+ messages in thread
From: Christian Loehle @ 2024-08-21 15:57 UTC (permalink / raw)
To: Jens Axboe, linux-kernel
Cc: peterz, tglx, Vincent Guittot, Qais Yousef, Rafael J. Wysocki
On 8/21/24 16:04, Jens Axboe wrote:
> On 8/21/24 8:54 AM, Christian Loehle wrote:
>> On 8/19/24 16:39, Jens Axboe wrote:
>>> Hi,
>>>
>>> This is v6 of the patchset where the current in_iowait state is split
>>> into two parts:
>>>
>>> 1) The "task is sleeping waiting on IO", and would like cpufreq goodness
>>> in terms of sleep and wakeup latencies.
>>> 2) The above, and also accounted as such in the iowait stats.
>>>
>>> The current ->in_iowait covers both, this series splits it into two types
>>> of state so that each can be controlled seperately.
>>
>> Hi Jens,
>> I wanted to give a brief update on where I think we're at in terms
>> of iowait behavior regarding cpuidle and cpufreq.
>> I'm still working on getting both removed, given the discussions had
>> on the list [0] and at OSPM [1] this seems realistic and the best way
>> forward IMO.
>> That would then naturally make this series and the iowait workaround in
>> io_uring/io_uring.c unnecessary.
>>
>> 1. For cpuidle:
>> Main issue with relying on nr_iowaiters is that there is no guarantee
>> whatsoever that these tasks will wakeup where they went to sleep so if
>> we can achieve the same throughput without nr_iowaiters it shouldn't
>> be relevant.
>> I spent quite some time in fixing teo [2], because untangling nr_iowaiters
>> from menu seems hard, essentially nobody has worked on menu seriously for
>> a while now. Thus the plan here is to replace menu by teo eventually.
>> For your io_uring workloads I see throughput on par for teo (doesn't rely
>> on iowait) and menu.
>>
>> # echo teo > /sys/devices/system/cpu/cpuidle/current_governor
>> # ./io_uring -r 5 -X0 -d 1 -s 1 -c 1 -p 0 -S 1 -R 0 /dev/nvme0n1
>> submitter=0, tid=206, file=/dev/nvme0n1, node=-1
>> polled=0, fixedbufs=1/0, register_files=1, buffered=0, QD=1
>> Engine=preadv2
>> IOPS=22500, BW=87MiB/s, IOS/call=0/0
>> IOPS=21916, BW=85MiB/s, IOS/call=1/0
>> IOPS=21774, BW=85MiB/s, IOS/call=1/0
>> IOPS=22467, BW=87MiB/s, IOS/call=1/0
>> Exiting on timeout
>> Maximum IOPS=22500
>> # echo menu > /sys/devices/system/cpu/cpuidle/current_governor
>> [ 178.754571] cpuidle: using governor menu
>> # ./io_uring -r 5 -X0 -d 1 -s 1 -c 1 -p 0 -S 1 -R 0 /dev/nvme0n1
>> submitter=0, tid=209, file=/dev/nvme0n1, node=-1
>> polled=0, fixedbufs=1/0, register_files=1, buffered=0, QD=1
>> Engine=preadv2
>> IOPS=21452, BW=83MiB/s, IOS/call=0/0
>> IOPS=21778, BW=85MiB/s, IOS/call=1/0
>> IOPS=21120, BW=82MiB/s, IOS/call=1/0
>> IOPS=20903, BW=81MiB/s, IOS/call=1/0
>> Exiting on timeout
>> Maximum IOPS=21778
>>
>> Please do give it a try for yourself as well!
>>
>> 2. For cpufreq:
>> Main issue for IO-bound workloads with iowait boosting is we're punishing
>> the 'good' workloads (that don't have iowait sleeps in their throughput-critical
>> part, which is already bad because of the scheduling overhead induced) by
>> making them energy-inefficient to make synthetic benchmarks happy.
>> A study of more realistic workloads show that they don't suffer from a problem
>> of building up utilization, not util_est anyway, so they don't actually benefit
>> from a cpufreq boost.
>> This leads me to the conclusion that cpufreq iowait boosting can be scrapped
>> altogether if we accept some degradation of benchmarks like
>> ./io_uring -r 5 -X0 -d 1 -s 1 -c 1 -p 0 -S 1 -R 0 /dev/nvme0n1
>> or
>> fio --name=fio --rw=randread --bs=4k --runtime=5 --time_based --filename=/dev/nvme0n1 --iodepth=1 --numjobs=1
>> (non-io_uring) for that matter.
>
> The original iowait addition came because a big regression was seen
> compared to not setting iowait, it was around 20% iirc. That's big, and
> not in the realm of "some degradation" that will be acceptable. And that
> will largely depend on the system being used. On some systems, it'll be
> less, and on some it'll be more.
We are also talking about power regressions of 1000% easily FWIW for e.g.
fio --name=fio --rw=randread --bs=4k --runtime=10 --time_based --filename=/dev/nvme0n1 --iodepth=32 --numjobs=nr_cpus --ioengine=io_uring
(without any throughput gain).
>
>> For io_uring where the expected case is probably not single-threaded
>> sync IO (or iodepth=1) the cpufreq iowait boost is just hurting
>> use-cases by pushing it to less efficient frequencies that might not
>> be needed.
>
> People do all sorts of things, and sync (or low queue depth) IO is
> certainly one of the use cases. In fact that's where the above report
> came from, on the postgres aio side.
I have looked at that and (on the platforms I've tested) that was indeed
from cpuidle FWIW. Moving away from menu did remedy this with the
mainlined teo fixes.
>> I know you want your problem (io_uring showing up as 100% busy even
>> though it's just sleeping) to be solved like yesterday and my opinion
>> on a future timeline might not be enough to convince you of much. I
>> wanted to share it anyway. I don't see an issue with the actual code
>> you're proposing, but it does feel like a step in the wrong direction
>> to me.
>
> As mentioned in my original reply, I view this as entirely orthogonal,
> and while I appreciate your efforts in this area, I'm a little tired of
> this being brought up as a gatekeeping metric when it's not there.
I can understand you being tired of me bringing this up, but I'm not
gatekeeping this series, not intentionally anyway.
Just trying to give some perspective on the entire iowait behavior
future.
>
> If we can eliminate iowait for boosting down the line, then I'm all for
> it. But this has now been pending for > 6 months and I don't think it's
> far to keep stringing this along on a future promise. This isn't a lot
> of code and it solves the issue for now, if the code will get removed
> down the line as not needed, then that's certainly fine. For now, we
> need it.
I'm fine with carrying a revert of the series along my patchset.
Regards,
Christian
^ permalink raw reply [flat|nested] 29+ messages in thread
* Re: [PATCHSET v6 0/4] Split iowait into two states
2024-08-21 15:57 ` Christian Loehle
@ 2024-08-24 15:34 ` Jens Axboe
0 siblings, 0 replies; 29+ messages in thread
From: Jens Axboe @ 2024-08-24 15:34 UTC (permalink / raw)
To: Christian Loehle, linux-kernel
Cc: peterz, tglx, Vincent Guittot, Qais Yousef, Rafael J. Wysocki
On 8/21/24 9:57 AM, Christian Loehle wrote:
> On 8/21/24 16:04, Jens Axboe wrote:
>> On 8/21/24 8:54 AM, Christian Loehle wrote:
>>> On 8/19/24 16:39, Jens Axboe wrote:
>>>> Hi,
>>>>
>>>> This is v6 of the patchset where the current in_iowait state is split
>>>> into two parts:
>>>>
>>>> 1) The "task is sleeping waiting on IO", and would like cpufreq goodness
>>>> in terms of sleep and wakeup latencies.
>>>> 2) The above, and also accounted as such in the iowait stats.
>>>>
>>>> The current ->in_iowait covers both, this series splits it into two types
>>>> of state so that each can be controlled seperately.
>>>
>>> Hi Jens,
>>> I wanted to give a brief update on where I think we're at in terms
>>> of iowait behavior regarding cpuidle and cpufreq.
>>> I'm still working on getting both removed, given the discussions had
>>> on the list [0] and at OSPM [1] this seems realistic and the best way
>>> forward IMO.
>>> That would then naturally make this series and the iowait workaround in
>>> io_uring/io_uring.c unnecessary.
>>>
>>> 1. For cpuidle:
>>> Main issue with relying on nr_iowaiters is that there is no guarantee
>>> whatsoever that these tasks will wakeup where they went to sleep so if
>>> we can achieve the same throughput without nr_iowaiters it shouldn't
>>> be relevant.
>>> I spent quite some time in fixing teo [2], because untangling nr_iowaiters
>>> from menu seems hard, essentially nobody has worked on menu seriously for
>>> a while now. Thus the plan here is to replace menu by teo eventually.
>>> For your io_uring workloads I see throughput on par for teo (doesn't rely
>>> on iowait) and menu.
>>>
>>> # echo teo > /sys/devices/system/cpu/cpuidle/current_governor
>>> # ./io_uring -r 5 -X0 -d 1 -s 1 -c 1 -p 0 -S 1 -R 0 /dev/nvme0n1
>>> submitter=0, tid=206, file=/dev/nvme0n1, node=-1
>>> polled=0, fixedbufs=1/0, register_files=1, buffered=0, QD=1
>>> Engine=preadv2
>>> IOPS=22500, BW=87MiB/s, IOS/call=0/0
>>> IOPS=21916, BW=85MiB/s, IOS/call=1/0
>>> IOPS=21774, BW=85MiB/s, IOS/call=1/0
>>> IOPS=22467, BW=87MiB/s, IOS/call=1/0
>>> Exiting on timeout
>>> Maximum IOPS=22500
>>> # echo menu > /sys/devices/system/cpu/cpuidle/current_governor
>>> [ 178.754571] cpuidle: using governor menu
>>> # ./io_uring -r 5 -X0 -d 1 -s 1 -c 1 -p 0 -S 1 -R 0 /dev/nvme0n1
>>> submitter=0, tid=209, file=/dev/nvme0n1, node=-1
>>> polled=0, fixedbufs=1/0, register_files=1, buffered=0, QD=1
>>> Engine=preadv2
>>> IOPS=21452, BW=83MiB/s, IOS/call=0/0
>>> IOPS=21778, BW=85MiB/s, IOS/call=1/0
>>> IOPS=21120, BW=82MiB/s, IOS/call=1/0
>>> IOPS=20903, BW=81MiB/s, IOS/call=1/0
>>> Exiting on timeout
>>> Maximum IOPS=21778
>>>
>>> Please do give it a try for yourself as well!
>>>
>>> 2. For cpufreq:
>>> Main issue for IO-bound workloads with iowait boosting is we're punishing
>>> the 'good' workloads (that don't have iowait sleeps in their throughput-critical
>>> part, which is already bad because of the scheduling overhead induced) by
>>> making them energy-inefficient to make synthetic benchmarks happy.
>>> A study of more realistic workloads show that they don't suffer from a problem
>>> of building up utilization, not util_est anyway, so they don't actually benefit
>>> from a cpufreq boost.
>>> This leads me to the conclusion that cpufreq iowait boosting can be scrapped
>>> altogether if we accept some degradation of benchmarks like
>>> ./io_uring -r 5 -X0 -d 1 -s 1 -c 1 -p 0 -S 1 -R 0 /dev/nvme0n1
>>> or
>>> fio --name=fio --rw=randread --bs=4k --runtime=5 --time_based --filename=/dev/nvme0n1 --iodepth=1 --numjobs=1
>>> (non-io_uring) for that matter.
>>
>> The original iowait addition came because a big regression was seen
>> compared to not setting iowait, it was around 20% iirc. That's big, and
>> not in the realm of "some degradation" that will be acceptable. And that
>> will largely depend on the system being used. On some systems, it'll be
>> less, and on some it'll be more.
>
> We are also talking about power regressions of 1000% easily FWIW for
> e.g. fio --name=fio --rw=randread --bs=4k --runtime=10 --time_based
> --filename=/dev/nvme0n1 --iodepth=32 --numjobs=nr_cpus
> --ioengine=io_uring (without any throughput gain).
Oh I believe it, for some embedded or low power cpus. And it is on our
list, to make this selectable. Ideally what I think should happen is
that the application gives you a hint on how long it expects to sleep,
and we'll pass that on and let the lower layers decide what's the most
appropriate state to enter. The current iowait usage isn't very pretty
(in io_uring or otherwise, it's too coarse of a hint), but it's what we
have/had, and we needed it to solve a problem that would otherwise be a
regression on a much more common setup than really lower power devices.
>>> For io_uring where the expected case is probably not single-threaded
>>> sync IO (or iodepth=1) the cpufreq iowait boost is just hurting
>>> use-cases by pushing it to less efficient frequencies that might not
>>> be needed.
>>
>> People do all sorts of things, and sync (or low queue depth) IO is
>> certainly one of the use cases. In fact that's where the above report
>> came from, on the postgres aio side.
>
> I have looked at that and (on the platforms I've tested) that was indeed
> from cpuidle FWIW. Moving away from menu did remedy this with the
> mainlined teo fixes.
>
>>> I know you want your problem (io_uring showing up as 100% busy even
>>> though it's just sleeping) to be solved like yesterday and my opinion
>>> on a future timeline might not be enough to convince you of much. I
>>> wanted to share it anyway. I don't see an issue with the actual code
>>> you're proposing, but it does feel like a step in the wrong direction
>>> to me.
>>
>> As mentioned in my original reply, I view this as entirely orthogonal,
>> and while I appreciate your efforts in this area, I'm a little tired of
>> this being brought up as a gatekeeping metric when it's not there.
>
> I can understand you being tired of me bringing this up, but I'm not
> gatekeeping this series, not intentionally anyway.
Well it does feel like that, because this orthogonal (imho) development
is being brought up as a means to not needing to do this. Not just this
posting, but past ones too. Meanwhile, I'd like this problem solved, and
this just adds noise to it as far as I'm concerned. It would be a lot
better to split those two discussions up.
>> If we can eliminate iowait for boosting down the line, then I'm all for
>> it. But this has now been pending for > 6 months and I don't think it's
>> fair to keep stringing this along on a future promise. This isn't a lot
>> of code and it solves the issue for now, if the code will get removed
>> down the line as not needed, then that's certainly fine. For now, we
>> need it.
>
> I'm fine with carrying a revert of the series along my patchset.
OK that's fine, and let's hope we end up in a place down the line that's
a lot better than the iowait on/off we have now, with guesswork based on
past behavior (iow, mostly wrong) on the other end on how long we
expect to sleep. I'd certainly be all for that, I just don't want future
promises to stop fixing a real issue we have now. If this series goes
away down the line because we don't need it, I surely won't cry over it!
--
Jens Axboe
^ permalink raw reply [flat|nested] 29+ messages in thread
* Re: [PATCHSET v6 0/4] Split iowait into two states
2024-08-19 15:39 [PATCHSET v6 0/4] Split iowait into two states Jens Axboe
` (4 preceding siblings ...)
2024-08-21 14:54 ` [PATCHSET v6 0/4] Split iowait " Christian Loehle
@ 2024-09-04 14:28 ` Peter Zijlstra
2024-09-04 14:41 ` Jens Axboe
2024-09-04 14:42 ` Rafael J. Wysocki
2025-03-31 9:02 ` Pavel Begunkov
6 siblings, 2 replies; 29+ messages in thread
From: Peter Zijlstra @ 2024-09-04 14:28 UTC (permalink / raw)
To: Jens Axboe; +Cc: linux-kernel, tglx, rafael, daniel.lezcano, linux-pm
On Mon, Aug 19, 2024 at 09:39:45AM -0600, Jens Axboe wrote:
> Hi,
>
> This is v6 of the patchset where the current in_iowait state is split
> into two parts:
>
> 1) The "task is sleeping waiting on IO", and would like cpufreq goodness
> in terms of sleep and wakeup latencies.
> 2) The above, and also accounted as such in the iowait stats.
>
> The current ->in_iowait covers both, this series splits it into two types
> of state so that each can be controlled seperately.
Yeah, but *WHY* !?!? I have some vague memories from last time around,
but patches should really keep this information.
> Patches 1..3 are prep patches, changing the type of
> task_struct->nr_iowait and adding helpers to manipulate the iowait counts.
>
> Patch 4 does the actual splitting.
>
> This has been sitting for a while, would be nice to get this queued up
> for 6.12. Comments welcome!
Ufff, and all this because menu-governor does something insane :-(
Rafael, why can't we simply remove this from menu? All the nr_iowait*()
users are basically broken and I would much rather fix broken rather
than work around broken like this.
That is, from where I'm sitting this all makes the io-wait situation far
worse instead of better.
^ permalink raw reply [flat|nested] 29+ messages in thread
* Re: [PATCHSET v6 0/4] Split iowait into two states
2024-09-04 14:28 ` Peter Zijlstra
@ 2024-09-04 14:41 ` Jens Axboe
2024-09-04 14:49 ` Jens Axboe
2024-09-05 9:51 ` Peter Zijlstra
2024-09-04 14:42 ` Rafael J. Wysocki
1 sibling, 2 replies; 29+ messages in thread
From: Jens Axboe @ 2024-09-04 14:41 UTC (permalink / raw)
To: Peter Zijlstra; +Cc: linux-kernel, tglx, rafael, daniel.lezcano, linux-pm
On 9/4/24 8:28 AM, Peter Zijlstra wrote:
> On Mon, Aug 19, 2024 at 09:39:45AM -0600, Jens Axboe wrote:
>> Hi,
>>
>> This is v6 of the patchset where the current in_iowait state is split
>> into two parts:
>>
>> 1) The "task is sleeping waiting on IO", and would like cpufreq goodness
>> in terms of sleep and wakeup latencies.
>> 2) The above, and also accounted as such in the iowait stats.
>>
>> The current ->in_iowait covers both, this series splits it into two types
>> of state so that each can be controlled seperately.
>
> Yeah, but *WHY* !?!? I have some vague memories from last time around,
> but patches should really keep this information.
To decouple the frequency boost on short waits from the accounting side,
as lots of tooling equates iowait time with busy time and reports it as
such. Yeah that's garbage and a reporting issue, but decades of
education hasn't really improved on that. We should've dumped iowait
once we moved away from 1-2 processor system or had preemptible kernels,
but alas we did not and here we are in 2024.
>> Patches 1..3 are prep patches, changing the type of
>> task_struct->nr_iowait and adding helpers to manipulate the iowait counts.
>>
>> Patch 4 does the actual splitting.
>>
>> This has been sitting for a while, would be nice to get this queued up
>> for 6.12. Comments welcome!
>
> Ufff, and all this because menu-governor does something insane :-(
>
> Rafael, why can't we simply remove this from menu? All the nr_iowait*()
> users are basically broken and I would much rather fix broken rather
> than work around broken like this.
>
> That is, from where I'm sitting this all makes the io-wait situation far
> worse instead of better.
IMHO what we need is a way to propagate expected wait times for a
sleeper. Right now iowait serves this purpose in a very crude way, in
that it doesn't really tell you the expected wait, just that it's a
short one.
If we simply remove iowait frequency boosting, then we'll have big
regressions particularly for low/sync storage IO.
--
Jens Axboe
^ permalink raw reply [flat|nested] 29+ messages in thread
* Re: [PATCHSET v6 0/4] Split iowait into two states
2024-09-04 14:28 ` Peter Zijlstra
2024-09-04 14:41 ` Jens Axboe
@ 2024-09-04 14:42 ` Rafael J. Wysocki
2024-09-04 15:18 ` Rafael J. Wysocki
1 sibling, 1 reply; 29+ messages in thread
From: Rafael J. Wysocki @ 2024-09-04 14:42 UTC (permalink / raw)
To: Peter Zijlstra, Jens Axboe; +Cc: linux-kernel, tglx, daniel.lezcano, linux-pm
On Wed, Sep 4, 2024 at 4:28 PM Peter Zijlstra <peterz@infradead.org> wrote:
>
> On Mon, Aug 19, 2024 at 09:39:45AM -0600, Jens Axboe wrote:
> > Hi,
> >
> > This is v6 of the patchset where the current in_iowait state is split
> > into two parts:
> >
> > 1) The "task is sleeping waiting on IO", and would like cpufreq goodness
> > in terms of sleep and wakeup latencies.
> > 2) The above, and also accounted as such in the iowait stats.
> >
> > The current ->in_iowait covers both, this series splits it into two types
> > of state so that each can be controlled seperately.
>
> Yeah, but *WHY* !?!? I have some vague memories from last time around,
> but patches should really keep this information.
>
> > Patches 1..3 are prep patches, changing the type of
> > task_struct->nr_iowait and adding helpers to manipulate the iowait counts.
> >
> > Patch 4 does the actual splitting.
> >
> > This has been sitting for a while, would be nice to get this queued up
> > for 6.12. Comments welcome!
>
> Ufff, and all this because menu-governor does something insane :-(
>
> Rafael, why can't we simply remove this from menu?
Same reason as before: people use it and refuse to stop.
But this is mostly about the schedutil cpufreq governor that uses
iowait boosting.
^ permalink raw reply [flat|nested] 29+ messages in thread
* Re: [PATCHSET v6 0/4] Split iowait into two states
2024-09-04 14:41 ` Jens Axboe
@ 2024-09-04 14:49 ` Jens Axboe
2024-09-05 9:51 ` Peter Zijlstra
1 sibling, 0 replies; 29+ messages in thread
From: Jens Axboe @ 2024-09-04 14:49 UTC (permalink / raw)
To: Peter Zijlstra; +Cc: linux-kernel, tglx, rafael, daniel.lezcano, linux-pm
On 9/4/24 8:41 AM, Jens Axboe wrote:
> On 9/4/24 8:28 AM, Peter Zijlstra wrote:
>> On Mon, Aug 19, 2024 at 09:39:45AM -0600, Jens Axboe wrote:
>>> Hi,
>>>
>>> This is v6 of the patchset where the current in_iowait state is split
>>> into two parts:
>>>
>>> 1) The "task is sleeping waiting on IO", and would like cpufreq goodness
>>> in terms of sleep and wakeup latencies.
>>> 2) The above, and also accounted as such in the iowait stats.
>>>
>>> The current ->in_iowait covers both, this series splits it into two types
>>> of state so that each can be controlled seperately.
>>
>> Yeah, but *WHY* !?!? I have some vague memories from last time around,
>> but patches should really keep this information.
>
> To decouple the frequency boost on short waits from the accounting side,
> as lots of tooling equates iowait time with busy time and reports it as
> such. Yeah that's garbage and a reporting issue, but decades of
> education hasn't really improved on that. We should've dumped iowait
> once we moved away from 1-2 processor system or had preemptible kernels,
> but alas we did not and here we are in 2024.
Forgot to mention, it's not *just* an educational thing - lots of
services do mixed network and disk IO, obviously, and they do have
some interest in retaining iowait metrics on the disk side.
--
Jens Axboe
^ permalink raw reply [flat|nested] 29+ messages in thread
* Re: [PATCHSET v6 0/4] Split iowait into two states
2024-09-04 14:42 ` Rafael J. Wysocki
@ 2024-09-04 15:18 ` Rafael J. Wysocki
2024-09-05 9:29 ` Christian Loehle
2024-09-05 9:36 ` Peter Zijlstra
0 siblings, 2 replies; 29+ messages in thread
From: Rafael J. Wysocki @ 2024-09-04 15:18 UTC (permalink / raw)
To: Peter Zijlstra, Jens Axboe
Cc: linux-kernel, tglx, daniel.lezcano, linux-pm, Christian Loehle
On Wed, Sep 4, 2024 at 4:42 PM Rafael J. Wysocki <rafael@kernel.org> wrote:
>
> On Wed, Sep 4, 2024 at 4:28 PM Peter Zijlstra <peterz@infradead.org> wrote:
> >
> > On Mon, Aug 19, 2024 at 09:39:45AM -0600, Jens Axboe wrote:
> > > Hi,
> > >
> > > This is v6 of the patchset where the current in_iowait state is split
> > > into two parts:
> > >
> > > 1) The "task is sleeping waiting on IO", and would like cpufreq goodness
> > > in terms of sleep and wakeup latencies.
> > > 2) The above, and also accounted as such in the iowait stats.
> > >
> > > The current ->in_iowait covers both, this series splits it into two types
> > > of state so that each can be controlled seperately.
> >
> > Yeah, but *WHY* !?!? I have some vague memories from last time around,
> > but patches should really keep this information.
> >
> > > Patches 1..3 are prep patches, changing the type of
> > > task_struct->nr_iowait and adding helpers to manipulate the iowait counts.
> > >
> > > Patch 4 does the actual splitting.
> > >
> > > This has been sitting for a while, would be nice to get this queued up
> > > for 6.12. Comments welcome!
> >
> > Ufff, and all this because menu-governor does something insane :-(
> >
> > Rafael, why can't we simply remove this from menu?
>
> Same reason as before: people use it and refuse to stop.
>
> But this is mostly about the schedutil cpufreq governor that uses
> iowait boosting.
To be more precise, there are two different uses of "iowait" in PM.
One is the nr_iowait_cpu() call in menu_select() and the result of it
is used for two purposes: (1) select different sets of statistics
depending on whether or not this number is zero and (2) set a limit
for the idle state's exit latency that depends on this number (but
note that it only takes effect when the "iowait" statistics are used
in the first place). Both of these are arguably questionable and it
is unclear to me whether or not they actually help and how much.
The other use is boosting CPU frequency in schedutil and intel_pstate
if SCHED_CPUFREQ_IOWAIT is passed to them which in turn depends on the
p->in_iowait value in enqueue_task_fair().
AFAICS, the latter makes a major difference.
^ permalink raw reply [flat|nested] 29+ messages in thread
* Re: [PATCHSET v6 0/4] Split iowait into two states
2024-09-04 15:18 ` Rafael J. Wysocki
@ 2024-09-05 9:29 ` Christian Loehle
2024-09-05 10:40 ` Rafael J. Wysocki
2024-09-05 9:36 ` Peter Zijlstra
1 sibling, 1 reply; 29+ messages in thread
From: Christian Loehle @ 2024-09-05 9:29 UTC (permalink / raw)
To: Rafael J. Wysocki, Peter Zijlstra, Jens Axboe
Cc: linux-kernel, tglx, daniel.lezcano, linux-pm
On 9/4/24 16:18, Rafael J. Wysocki wrote:
> On Wed, Sep 4, 2024 at 4:42 PM Rafael J. Wysocki <rafael@kernel.org> wrote:
>>
>> On Wed, Sep 4, 2024 at 4:28 PM Peter Zijlstra <peterz@infradead.org> wrote:
>>>
>>> On Mon, Aug 19, 2024 at 09:39:45AM -0600, Jens Axboe wrote:
>>>> Hi,
>>>>
>>>> This is v6 of the patchset where the current in_iowait state is split
>>>> into two parts:
>>>>
>>>> 1) The "task is sleeping waiting on IO", and would like cpufreq goodness
>>>> in terms of sleep and wakeup latencies.
>>>> 2) The above, and also accounted as such in the iowait stats.
>>>>
>>>> The current ->in_iowait covers both, this series splits it into two types
> >>>> of state so that each can be controlled separately.
>>>
>>> Yeah, but *WHY* !?!? I have some vague memories from last time around,
>>> but patches should really keep this information.
>>>
>>>> Patches 1..3 are prep patches, changing the type of
>>>> task_struct->nr_iowait and adding helpers to manipulate the iowait counts.
>>>>
>>>> Patch 4 does the actual splitting.
>>>>
>>>> This has been sitting for a while, would be nice to get this queued up
>>>> for 6.12. Comments welcome!
>>>
>>> Ufff, and all this because menu-governor does something insane :-(
>>>
>>> Rafael, why can't we simply remove this from menu?
>>
>> Same reason as before: people use it and refuse to stop.
>>
>> But this is mostly about the schedutil cpufreq governor that uses
>> iowait boosting.
>
> To be more precise, there are two different uses of "iowait" in PM.
>
> One is the nr_iowait_cpu() call in menu_select() and the result of it
> is used for two purposes: (1) select different sets of statistics
> depending on whether or not this number is zero and (2) set a limit
> for the idle state's exit latency that depends on this number (but
> note that it only takes effect when the "iowait" statistics are used
> in the first place). Both of these are arguably questionable and it
> is unclear to me whether or not they actually help and how much.
So from my perspective it doesn't, not significantly to justify its
existence anyway. Either it doesn't actually matter for menu, or teo
is able to compete / outperform without relying on it.
Some caution is advised though this really depends on:
- Which idle states are available for the kernel to select.
- How accurate the kernel's view of the idle states is.
Both vary wildly.
>
> The other use is boosting CPU frequency in schedutil and intel_pstate
> if SCHED_CPUFREQ_IOWAIT is passed to them which in turn depends on the
> p->in_iowait value in enqueue_task_fair().
>
> AFAICS, the latter makes a major difference.
Indeed, fortunately the impact is quite limited here.
But please, Rafael, Jens and Peter, feel free to share your comments
over here too:
https://lore.kernel.org/lkml/20240905092645.2885200-1-christian.loehle@arm.com/
^ permalink raw reply [flat|nested] 29+ messages in thread
* Re: [PATCHSET v6 0/4] Split iowait into two states
2024-09-04 15:18 ` Rafael J. Wysocki
2024-09-05 9:29 ` Christian Loehle
@ 2024-09-05 9:36 ` Peter Zijlstra
2024-09-05 10:31 ` Christian Loehle
1 sibling, 1 reply; 29+ messages in thread
From: Peter Zijlstra @ 2024-09-05 9:36 UTC (permalink / raw)
To: Rafael J. Wysocki
Cc: Jens Axboe, linux-kernel, tglx, daniel.lezcano, linux-pm,
Christian Loehle
On Wed, Sep 04, 2024 at 05:18:57PM +0200, Rafael J. Wysocki wrote:
> To be more precise, there are two different uses of "iowait" in PM.
>
> One is the nr_iowait_cpu() call in menu_select() and the result of it
> is used for two purposes: (1) select different sets of statistics
> depending on whether or not this number is zero and (2) set a limit
> for the idle state's exit latency that depends on this number (but
> note that it only takes effect when the "iowait" statistics are used
> in the first place). Both of these are arguably questionable and it
> is unclear to me whether or not they actually help and how much.
So this one is very dubious, it relies on tasks getting back on the CPU
they went to sleep on -- not guaranteed at all.
> The other use is boosting CPU frequency in schedutil and intel_pstate
> if SCHED_CPUFREQ_IOWAIT is passed to them which in turn depends on the
> p->in_iowait value in enqueue_task_fair().
This one is fine and makes sense. At this point we know that p is going
to run and where it is going to run.
^ permalink raw reply [flat|nested] 29+ messages in thread
* Re: [PATCHSET v6 0/4] Split iowait into two states
2024-09-04 14:41 ` Jens Axboe
2024-09-04 14:49 ` Jens Axboe
@ 2024-09-05 9:51 ` Peter Zijlstra
1 sibling, 0 replies; 29+ messages in thread
From: Peter Zijlstra @ 2024-09-05 9:51 UTC (permalink / raw)
To: Jens Axboe; +Cc: linux-kernel, tglx, rafael, daniel.lezcano, linux-pm
On Wed, Sep 04, 2024 at 08:41:23AM -0600, Jens Axboe wrote:
> > Yeah, but *WHY* !?!? I have some vague memories from last time around,
> > but patches should really keep this information.
>
> To decouple the frequency boost on short waits from the accounting side,
> as lots of tooling equates iowait time with busy time and reports it as
> such. Yeah that's garbage and a reporting issue, but decades of
> education hasn't really improved on that. We should've dumped iowait
> once we moved away from 1-2 processor system or had preemptible kernels,
> but alas we did not and here we are in 2024.
There's 'WAIT' in the name, what broken piece of garbage reports it as
busy time? That has *NEVER* been right. Even on UP systems where IO-wait
is actually a sensible number, it is explicitly the time it *could* have
been busy, if only the IO were faster.
And are we really going to make the whole kernel situation worse just
because there's a bunch of broken userspace?
> >> Patches 1..3 are prep patches, changing the type of
> >> task_struct->nr_iowait and adding helpers to manipulate the iowait counts.
> >>
> >> Patch 4 does the actual splitting.
> >>
> >> This has been sitting for a while, would be nice to get this queued up
> >> for 6.12. Comments welcome!
> >
> > Ufff, and all this because menu-governor does something insane :-(
> >
> > Rafael, why can't we simply remove this from menu? All the nr_iowait*()
> > users are basically broken and I would much rather fix broken rather
> > than work around broken like this.
> >
> > That is, from where I'm sitting this all makes the io-wait situation far
> > worse instead of better.
>
> IMHO what we need is a way to propagate expected wait times for a
> sleeper. Right now iowait serves this purpose in a very crude way, in
> that it doesn't really tell you the expected wait, just that it's a
> short one.
Expected wait time is one thing, but you then *still* have no clue what
CPU it will get back on. Very typically it will be another CPU in the
same cache cluster. One that had no consideration of it when it went to
sleep.
A sleeping task is not associated with a CPU. There is a fundamental
mismatch there.
Using io-wait for idle state selection is very tricky because of this.
> If we simply remove iowait frequency boosting, then we'll have big
> regressions particularly for low/sync storage IO.
The frequency boosting thing I don't object to. That happens on wakeup
after we know that and where a task is going to run.
^ permalink raw reply [flat|nested] 29+ messages in thread
* Re: [PATCH 3/4] sched/core: have io_schedule_prepare() return a long
2024-08-19 15:39 ` [PATCH 3/4] sched/core: have io_schedule_prepare() return a long Jens Axboe
@ 2024-09-05 9:57 ` Peter Zijlstra
0 siblings, 0 replies; 29+ messages in thread
From: Peter Zijlstra @ 2024-09-05 9:57 UTC (permalink / raw)
To: Jens Axboe; +Cc: linux-kernel, tglx
On Mon, Aug 19, 2024 at 09:39:48AM -0600, Jens Axboe wrote:
> In preparation for needing more state than 32-bit on 64-bit archs,
> switch it to a long instead.
I'm confused, afaict you'll need all of *2* bits in the next patch.
^ permalink raw reply [flat|nested] 29+ messages in thread
* Re: [PATCHSET v6 0/4] Split iowait into two states
2024-09-05 9:36 ` Peter Zijlstra
@ 2024-09-05 10:31 ` Christian Loehle
2024-09-05 11:00 ` Peter Zijlstra
0 siblings, 1 reply; 29+ messages in thread
From: Christian Loehle @ 2024-09-05 10:31 UTC (permalink / raw)
To: Peter Zijlstra, Rafael J. Wysocki
Cc: Jens Axboe, linux-kernel, tglx, daniel.lezcano, linux-pm
On 9/5/24 10:36, Peter Zijlstra wrote:
> On Wed, Sep 04, 2024 at 05:18:57PM +0200, Rafael J. Wysocki wrote:
>
>> To be more precise, there are two different uses of "iowait" in PM.
>>
>> One is the nr_iowait_cpu() call in menu_select() and the result of it
>> is used for two purposes: (1) select different sets of statistics
>> depending on whether or not this number is zero and (2) set a limit
>> for the idle state's exit latency that depends on this number (but
>> note that it only takes effect when the "iowait" statistics are used
>> in the first place). Both of these are arguably questionable and it
>> is unclear to me whether or not they actually help and how much.
>
> So this one is very dubious, it relies on tasks getting back on the CPU
> they went to sleep on -- not guaranteed at all.
>
>> The other use is boosting CPU frequency in schedutil and intel_pstate
>> if SCHED_CPUFREQ_IOWAIT is passed to them which in turn depends on the
>> p->in_iowait value in enqueue_task_fair().
>
> This one is fine and makes sense. At this point we know that p is going
> to run and where it is going to run.
On any even remotely realistic scenario and hardware though the boost
isn't effective until the next enqueue-dequeue-cycle, so if your above
objection is based on that, I would object here too, using your argument.
^ permalink raw reply [flat|nested] 29+ messages in thread
* Re: [PATCHSET v6 0/4] Split iowait into two states
2024-09-05 9:29 ` Christian Loehle
@ 2024-09-05 10:40 ` Rafael J. Wysocki
0 siblings, 0 replies; 29+ messages in thread
From: Rafael J. Wysocki @ 2024-09-05 10:40 UTC (permalink / raw)
To: Christian Loehle
Cc: Rafael J. Wysocki, Peter Zijlstra, Jens Axboe, linux-kernel, tglx,
daniel.lezcano, linux-pm
On Thu, Sep 5, 2024 at 11:29 AM Christian Loehle
<christian.loehle@arm.com> wrote:
>
> On 9/4/24 16:18, Rafael J. Wysocki wrote:
> > On Wed, Sep 4, 2024 at 4:42 PM Rafael J. Wysocki <rafael@kernel.org> wrote:
> >>
> >> On Wed, Sep 4, 2024 at 4:28 PM Peter Zijlstra <peterz@infradead.org> wrote:
> >>>
> >>> On Mon, Aug 19, 2024 at 09:39:45AM -0600, Jens Axboe wrote:
> >>>> Hi,
> >>>>
> >>>> This is v6 of the patchset where the current in_iowait state is split
> >>>> into two parts:
> >>>>
> >>>> 1) The "task is sleeping waiting on IO", and would like cpufreq goodness
> >>>> in terms of sleep and wakeup latencies.
> >>>> 2) The above, and also accounted as such in the iowait stats.
> >>>>
> >>>> The current ->in_iowait covers both, this series splits it into two types
> >>>> of state so that each can be controlled separately.
> >>>
> >>> Yeah, but *WHY* !?!? I have some vague memories from last time around,
> >>> but patches should really keep this information.
> >>>
> >>>> Patches 1..3 are prep patches, changing the type of
> >>>> task_struct->nr_iowait and adding helpers to manipulate the iowait counts.
> >>>>
> >>>> Patch 4 does the actual splitting.
> >>>>
> >>>> This has been sitting for a while, would be nice to get this queued up
> >>>> for 6.12. Comments welcome!
> >>>
> >>> Ufff, and all this because menu-governor does something insane :-(
> >>>
> >>> Rafael, why can't we simply remove this from menu?
> >>
> >> Same reason as before: people use it and refuse to stop.
> >>
> >> But this is mostly about the schedutil cpufreq governor that uses
> >> iowait boosting.
> >
> > To be more precise, there are two different uses of "iowait" in PM.
> >
> > One is the nr_iowait_cpu() call in menu_select() and the result of it
> > is used for two purposes: (1) select different sets of statistics
> > depending on whether or not this number is zero and (2) set a limit
> > for the idle state's exit latency that depends on this number (but
> > note that it only takes effect when the "iowait" statistics are used
> > in the first place). Both of these are arguably questionable and it
> > is unclear to me whether or not they actually help and how much.
>
> So from my perspective it doesn't, not significantly to justify its
> existence anyway. Either it doesn't actually matter for menu, or teo
> is able to compete / outperform without relying on it.
Thanks for this feedback!
I'm actually going to try to remove that stuff from menu and see if
anyone cries bloody murder.
> Some caution is advised though this really depends on:
> - Which idle states are available for the kernel to select.
> - How accurate the kernel's view of the idle states is.
>
> Both vary wildly.
True, but let's see what the feedback is.
> > The other use is boosting CPU frequency in schedutil and intel_pstate
> > if SCHED_CPUFREQ_IOWAIT is passed to them which in turn depends on the
> > p->in_iowait value in enqueue_task_fair().
> >
> > AFAICS, the latter makes a major difference.
>
>
> Indeed, fortunately the impact is quite limited here.
> But please, Rafael, Jens and Peter, feel free to share your comments
> over here too:
>
> https://lore.kernel.org/lkml/20240905092645.2885200-1-christian.loehle@arm.com/
I will.
Thanks!
^ permalink raw reply [flat|nested] 29+ messages in thread
* Re: [PATCH 4/4] sched/core: split iowait state into two states
2024-08-19 15:39 ` [PATCH 4/4] sched/core: split iowait state into two states Jens Axboe
@ 2024-09-05 10:55 ` Peter Zijlstra
0 siblings, 0 replies; 29+ messages in thread
From: Peter Zijlstra @ 2024-09-05 10:55 UTC (permalink / raw)
To: Jens Axboe; +Cc: linux-kernel, tglx
On Mon, Aug 19, 2024 at 09:39:49AM -0600, Jens Axboe wrote:
> iowait is a bogus metric, but it's helpful in the sense that it allows
> short waits to not enter sleep states that have a higher exit latency
> than would've otherwise have been picked for iowait'ing tasks. However,
> it's harmful in that lots of applications and monitoring assumes that
> iowait is busy time, or otherwise use it as a health metric.
> Particularly for async IO it's entirely nonsensical.
Holdup... I should've remembered this. You're using ->in_iowait for
io_uring. And that is what started all this.
Now, having in_iowait set when you're waiting on a futex is utterly
insane, but looking at commit 7b72d661f1f2 ("io_uring: gate iowait
schedule on having pending requests") you're now only actually setting
in_iowait when you have pending IO.
And I don't think that is nonsensical as you write above.
You are after all still actually waiting while there is pending IO, no?
^ permalink raw reply [flat|nested] 29+ messages in thread
* Re: [PATCHSET v6 0/4] Split iowait into two states
2024-09-05 10:31 ` Christian Loehle
@ 2024-09-05 11:00 ` Peter Zijlstra
2024-09-05 11:09 ` Christian Loehle
0 siblings, 1 reply; 29+ messages in thread
From: Peter Zijlstra @ 2024-09-05 11:00 UTC (permalink / raw)
To: Christian Loehle
Cc: Rafael J. Wysocki, Jens Axboe, linux-kernel, tglx, daniel.lezcano,
linux-pm
On Thu, Sep 05, 2024 at 11:31:09AM +0100, Christian Loehle wrote:
> On 9/5/24 10:36, Peter Zijlstra wrote:
> > On Wed, Sep 04, 2024 at 05:18:57PM +0200, Rafael J. Wysocki wrote:
> >
> >> To be more precise, there are two different uses of "iowait" in PM.
> >>
> >> One is the nr_iowait_cpu() call in menu_select() and the result of it
> >> is used for two purposes: (1) select different sets of statistics
> >> depending on whether or not this number is zero and (2) set a limit
> >> for the idle state's exit latency that depends on this number (but
> >> note that it only takes effect when the "iowait" statistics are used
> >> in the first place). Both of these are arguably questionable and it
> >> is unclear to me whether or not they actually help and how much.
> >
> > So this one is very dubious, it relies on tasks getting back on the CPU
> > they went to sleep on -- not guaranteed at all.
> >
> >> The other use is boosting CPU frequency in schedutil and intel_pstate
> >> if SCHED_CPUFREQ_IOWAIT is passed to them which in turn depends on the
> >> p->in_iowait value in enqueue_task_fair().
> >
> > This one is fine and makes sense. At this point we know that p is going
> > to run and where it is going to run.
>
> On any even remotely realistic scenario and hardware though the boost
> isn't effective until the next enqueue-dequeue-cycle, so if your above
> objection is based on that, I would object here too, using your argument.
That is a quality of implementation issue with schedutil no?
The whole notion that the wait was for feeding external hardware, and
thus the normal utilization metric doesn't work right thing is still
valid.
^ permalink raw reply [flat|nested] 29+ messages in thread
* Re: [PATCHSET v6 0/4] Split iowait into two states
2024-09-05 11:00 ` Peter Zijlstra
@ 2024-09-05 11:09 ` Christian Loehle
0 siblings, 0 replies; 29+ messages in thread
From: Christian Loehle @ 2024-09-05 11:09 UTC (permalink / raw)
To: Peter Zijlstra
Cc: Rafael J. Wysocki, Jens Axboe, linux-kernel, tglx, daniel.lezcano,
linux-pm
On 9/5/24 12:00, Peter Zijlstra wrote:
> On Thu, Sep 05, 2024 at 11:31:09AM +0100, Christian Loehle wrote:
>> On 9/5/24 10:36, Peter Zijlstra wrote:
>>> On Wed, Sep 04, 2024 at 05:18:57PM +0200, Rafael J. Wysocki wrote:
>>>
>>>> To be more precise, there are two different uses of "iowait" in PM.
>>>>
>>>> One is the nr_iowait_cpu() call in menu_select() and the result of it
>>>> is used for two purposes: (1) select different sets of statistics
>>>> depending on whether or not this number is zero and (2) set a limit
>>>> for the idle state's exit latency that depends on this number (but
>>>> note that it only takes effect when the "iowait" statistics are used
>>>> in the first place). Both of these are arguably questionable and it
>>>> is unclear to me whether or not they actually help and how much.
>>>
>>> So this one is very dubious, it relies on tasks getting back on the CPU
>>> they went to sleep on -- not guaranteed at all.
>>>
>>>> The other use is boosting CPU frequency in schedutil and intel_pstate
>>>> if SCHED_CPUFREQ_IOWAIT is passed to them which in turn depends on the
>>>> p->in_iowait value in enqueue_task_fair().
>>>
>>> This one is fine and makes sense. At this point we know that p is going
>>> to run and where it is going to run.
>>
>> On any even remotely realistic scenario and hardware though the boost
>> isn't effective until the next enqueue-dequeue-cycle, so if your above
>> objection is based on that, I would object here too, using your argument.
>
> That is a quality of implementation issue with schedutil no?
Is it? So there is a latency from requesting a new frequency and actually
running on it, for both x86 and arm platforms out there that should still
be a few usecs at least during which the task is running. The task will
dequeue quite soon (otherwise it will build up utilization and then it's
not one we consider problematic wrt to this io utilization problem anyway).
Just to be clear, I'm assuming fast_switch here and then I think schedutil's
implementation isn't the problem, rather the premise of the underlying
problem is.
I have tried to elaborate on that in the RFC I've posted and linked though.
^ permalink raw reply [flat|nested] 29+ messages in thread
* Re: [PATCHSET v6 0/4] Split iowait into two states
2024-08-19 15:39 [PATCHSET v6 0/4] Split iowait into two states Jens Axboe
` (5 preceding siblings ...)
2024-09-04 14:28 ` Peter Zijlstra
@ 2025-03-31 9:02 ` Pavel Begunkov
2025-03-31 10:33 ` Christian Loehle
6 siblings, 1 reply; 29+ messages in thread
From: Pavel Begunkov @ 2025-03-31 9:02 UTC (permalink / raw)
To: Jens Axboe, linux-kernel
Cc: peterz, tglx, Rafael J. Wysocki, Christian Loehle
On 8/19/24 16:39, Jens Axboe wrote:
> Hi,
>
> This is v6 of the patchset where the current in_iowait state is split
> into two parts:
>
> 1) The "task is sleeping waiting on IO", and would like cpufreq goodness
> in terms of sleep and wakeup latencies.
> 2) The above, and also accounted as such in the iowait stats.
>
> The current ->in_iowait covers both, this series splits it into two types
> of state so that each can be controlled separately.
>
> Patches 1..3 are prep patches, changing the type of
> task_struct->nr_iowait and adding helpers to manipulate the iowait counts.
>
> Patch 4 does the actual splitting.
>
> This has been sitting for a while, would be nice to get this queued up
> for 6.12. Comments welcome!
Good day,
Did anything good happen with these patches or related work?
Christian?
Reminder: the goal is to let io_uring to keep using iowait boosting
but avoid reporting it in the iowait stats, because the jump in the
stat spooks users. I know at least several users carrying out of tree
patches to work it around. And, apparently, disabling the boosting
causes perf regressions.
I'm reading through the thread, but unless I missed something, it looks
like the patchset is actually aligned with future plans on iowait
mentioned in the thread, in a sense that it reduces the exposure to
the user space, and, when it's time, a better approach will be able
to replace it with no visible effect to the user.
On the other hand, there seems to be a workaround io_uring patch
queued, which I quite dislike from the io_uring perspective but also
because it exposes even more of iowait to the user.
I can understand why it's there, it has been over a year since v1,
but maybe we can figure something out before it's released? Would
it be fine to have something similar to this series? Any other
ideas?
--
Pavel Begunkov
^ permalink raw reply [flat|nested] 29+ messages in thread
* Re: [PATCHSET v6 0/4] Split iowait into two states
2025-03-31 9:02 ` Pavel Begunkov
@ 2025-03-31 10:33 ` Christian Loehle
2025-04-01 8:21 ` Pavel Begunkov
0 siblings, 1 reply; 29+ messages in thread
From: Christian Loehle @ 2025-03-31 10:33 UTC (permalink / raw)
To: Pavel Begunkov, Jens Axboe, linux-kernel; +Cc: peterz, tglx, Rafael J. Wysocki
On 3/31/25 10:02, Pavel Begunkov wrote:
> On 8/19/24 16:39, Jens Axboe wrote:
>> Hi,
>>
>> This is v6 of the patchset where the current in_iowait state is split
>> into two parts:
>>
>> 1) The "task is sleeping waiting on IO", and would like cpufreq goodness
>> in terms of sleep and wakeup latencies.
>> 2) The above, and also accounted as such in the iowait stats.
>>
>> The current ->in_iowait covers both, this series splits it into two types
>> of state so that each can be controlled separately.
>>
>> Patches 1..3 are prep patches, changing the type of
>> task_struct->nr_iowait and adding helpers to manipulate the iowait counts.
>>
>> Patch 4 does the actual splitting.
>>
>> This has been sitting for a while, would be nice to get this queued up
>> for 6.12. Comments welcome!
>
> Good day,
>
> Did anything good happened with these patches or related work?
> Christian>
Hi Pavel,
so for cpuidle part we've had commit ("38f83090f515 cpuidle: menu: Remove iowait influence")
for a while now without much complaints, hopefully that means it stays in.
So I'd really like to know how the results still compare for relevant workloads.
cpufreq iowait boosting is still a thing in schedutil and intel_pstate,
and so far I've failed to convince Rafael and Peter to get rid of it.
I still think that is the right thing to do, but it does come with a
regression in most of the simple synthetic fio tests.
> Reminder: the goal is to let io_uring to keep using iowait boosting
> but avoid reporting it in the iowait stats, because the jump in the
> stat spooks users. I know at least several users carrying out of tree
> patches to work it around. And, apparently, disabling the boosting
> causes perf regressions.
Details would be appreciated, I looked at the postgres workload that
justified it initially and that was on cpuidle iowait which is no
longer a thing.
>
> I'm reading through the thread, but unless I missed something, it looks
> like the patchset is actually aligned with future plans on iowait
> mentioned in the thread, in a sense that it reduces the exposure to
> the user space, and, when it's time, a better approach will be able
> replaces it with no visible effect to the user.
I'm not against $subject necessarily, it's clearly a hack papering
over this but as I've mentioned I'm fine carrying a revert of $subject
for a future series on iowait boosting.
>
> On the other hand, there seems to be a work around io_uring patch
> queued for, which I quite dislike from io_uring perspective but also
> because it exposes even more of iowait to the user.
> I can understand why it's there, it has been over a year since v1,
> but maybe we can figure something out before it's released? Would
> it be fine to have something similar to this series? Any other
> ideas?
Ah thank you, I've missed this
https://lore.kernel.org/io-uring/f548f142-d6f3-46d8-9c58-6cf595c968fb@kernel.dk/
Would be nice if this lead to more numbers comparing the two at least.
^ permalink raw reply [flat|nested] 29+ messages in thread
* Re: [PATCHSET v6 0/4] Split iowait into two states
2025-03-31 10:33 ` Christian Loehle
@ 2025-04-01 8:21 ` Pavel Begunkov
0 siblings, 0 replies; 29+ messages in thread
From: Pavel Begunkov @ 2025-04-01 8:21 UTC (permalink / raw)
To: Christian Loehle, Jens Axboe, linux-kernel
Cc: peterz, tglx, Rafael J. Wysocki
On 3/31/25 11:33, Christian Loehle wrote:
> On 3/31/25 10:02, Pavel Begunkov wrote:
>> On 8/19/24 16:39, Jens Axboe wrote:
>>> Hi,
>>>
>>> This is v6 of the patchset where the current in_iowait state is split
>>> into two parts:
>>>
>>> 1) The "task is sleeping waiting on IO", and would like cpufreq goodness
>>> in terms of sleep and wakeup latencies.
>>> 2) The above, and also accounted as such in the iowait stats.
>>>
>>> The current ->in_iowait covers both, this series splits it into two types
>>> of state so that each can be controlled separately.
>>>
>>> Patches 1..3 are prep patches, changing the type of
>>> task_struct->nr_iowait and adding helpers to manipulate the iowait counts.
>>>
>>> Patch 4 does the actual splitting.
>>>
>>> This has been sitting for a while, would be nice to get this queued up
>>> for 6.12. Comments welcome!
>>
>> Good day,
>>
>> Did anything good happened with these patches or related work?
>> Christian>
>
> Hi Pavel,
> so for cpuidle part we've had commit ("38f83090f515 cpuidle: menu: Remove iowait influence")
> for a while now without much complaints, hopefully that means it stays in.
> So I'd really like to know how the results still compare for relevant workloads.
Sounds great
> cpufreq iowait boosting is still a thing in schedutil and intel_pstate,
> and so far I've failed to convince Rafael and Peter to get rid of it.
> I still think that is the right thing to do, but it does come with a
> regression in most of the simple synthetic fio tests.
IOW, from the io_uring iowait stat problem perspective it got stuck
and is unlikely to move short term.
>> Reminder: the goal is to let io_uring to keep using iowait boosting
>> but avoid reporting it in the iowait stats, because the jump in the
>> stat spooks users. I know at least several users carrying out of tree
>> patches to work it around. And, apparently, disabling the boosting
>> causes perf regressions.
>
> Details would be appreciated, I looked at the postgres workload that
> justified it initially and that was on cpuidle iowait which is no
> longer a thing.
I wasn't involved and I'm afraid I don't have any extra numbers.
>> I'm reading through the thread, but unless I missed something, it looks
>> like the patchset is actually aligned with future plans on iowait
>> mentioned in the thread, in a sense that it reduces the exposure to
>> the user space, and, when it's time, a better approach will be able
>> replaces it with no visible effect to the user.
>
> I'm not against $subject necessarily, it's clearly a hack tapering
> over this but as I've mentioned I'm fine carrying a revert of $subject
> for a future series on iowait boosting.
>
>>
>> On the other hand, there seems to be a work around io_uring patch
>> queued for, which I quite dislike from io_uring perspective but also
>> because it exposes even more of iowait to the user.
>> I can understand why it's there, it has been over a year since v1,
>> but maybe we can figure something out before it's released? Would
>> it be fine to have something similar to this series? Any other
>> ideas?
>
> Ah thank you, I've missed this
> https://lore.kernel.org/io-uring/f548f142-d6f3-46d8-9c58-6cf595c968fb@kernel.dk/
> Would be nice if this lead to more numbers comparing the two at least.
Sure, but I'd rather avoid adding this type of a uapi just to test
it and solve the problem a different way after.
--
Pavel Begunkov
^ permalink raw reply [flat|nested] 29+ messages in thread
end of thread, other threads:[~2025-04-01 8:19 UTC | newest]
Thread overview: 29+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2024-08-19 15:39 [PATCHSET v6 0/4] Split iowait into two states Jens Axboe
2024-08-19 15:39 ` [PATCH 1/4] sched/core: add helpers for iowait handling Jens Axboe
2024-08-19 15:39 ` [PATCH 2/4] sched/core: change rq->nr_iowait type to atomic_long_t Jens Axboe
2024-08-20 2:14 ` Zhang Qiao
2024-08-19 15:39 ` [PATCH 3/4] sched/core: have io_schedule_prepare() return a long Jens Axboe
2024-09-05 9:57 ` Peter Zijlstra
2024-08-19 15:39 ` [PATCH 4/4] sched/core: split iowait state into two states Jens Axboe
2024-09-05 10:55 ` Peter Zijlstra
2024-08-21 14:54 ` [PATCHSET v6 0/4] Split iowait " Christian Loehle
2024-08-21 15:04 ` Jens Axboe
2024-08-21 15:57 ` Christian Loehle
2024-08-24 15:34 ` Jens Axboe
2024-09-04 14:28 ` Peter Zijlstra
2024-09-04 14:41 ` Jens Axboe
2024-09-04 14:49 ` Jens Axboe
2024-09-05 9:51 ` Peter Zijlstra
2024-09-04 14:42 ` Rafael J. Wysocki
2024-09-04 15:18 ` Rafael J. Wysocki
2024-09-05 9:29 ` Christian Loehle
2024-09-05 10:40 ` Rafael J. Wysocki
2024-09-05 9:36 ` Peter Zijlstra
2024-09-05 10:31 ` Christian Loehle
2024-09-05 11:00 ` Peter Zijlstra
2024-09-05 11:09 ` Christian Loehle
2025-03-31 9:02 ` Pavel Begunkov
2025-03-31 10:33 ` Christian Loehle
2025-04-01 8:21 ` Pavel Begunkov
-- strict thread matches above, loose matches on Subject: below --
2024-08-17 20:45 [PATCHSET v5 " Jens Axboe
2024-08-17 20:45 ` [PATCH 1/4] sched/core: add helpers for iowait handling Jens Axboe
2024-04-16 12:11 [PATCHSET v4 0/4] Split iowait into two states Jens Axboe
2024-04-16 12:11 ` [PATCH 1/4] sched/core: add helpers for iowait handling Jens Axboe
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox