* [RFC] io_uring: fix the dead lock between io_uring and core dump
@ 2025-02-26 11:39 Haifeng Xu
2025-02-27 13:09 ` kernel test robot
2025-02-27 18:24 ` kernel test robot
0 siblings, 2 replies; 4+ messages in thread
From: Haifeng Xu @ 2025-02-26 11:39 UTC (permalink / raw)
To: asml.silence, axboe, ebiederm; +Cc: olivier, io-uring, linux-kernel, Haifeng Xu
In our production environment, we found many hung tasks.
Thread A (exit_mm)
...
if (core_state) {
struct core_thread self;
mmap_read_unlock(mm);
self.task = current;
if (self.task->flags & PF_SIGNALED)
self.next = xchg(&core_state->dumper.next, &self);
else
self.task = NULL;
/*
* Implies mb(), the result of xchg() must be visible
* to core_state->dumper.
*/
if (atomic_dec_and_test(&core_state->nr_threads))
complete(&core_state->startup);
for (;;) {
set_current_state(TASK_UNINTERRUPTIBLE);
if (!self.task) /* see coredump_finish() */
break;
freezable_schedule();
}
__set_current_state(TASK_RUNNING);
mmap_read_lock(mm);
}
...
Thread B (coredump_wait)
...
if (core_waiters > 0) {
struct core_thread *ptr;
freezer_do_not_count();
wait_for_completion(&core_state->startup);
freezer_count();
/*
* Wait for all the threads to become inactive, so that
* all the thread context (extended register state, like
* fpu etc) gets copied to the memory.
*/
ptr = core_state->dumper.next;
while (ptr != NULL) {
wait_task_inactive(ptr->task, 0);
ptr = ptr->next;
}
...
Thread C (io_worker_exit)
...
if (refcount_dec_and_test(&worker->ref))
complete(&worker->ref_done);
wait_for_completion(&worker->ref_done);
...
Thread A is waiting for Thread B to finish the core dump, but Thread B found that there is
still one thread which hasn't stepped into exit_mm() to decrement core_state->nr_threads.
That thread is Thread C; it has submitted a task_work (create_worker_cb) to Thread B
and then waits for Thread B to execute or cancel the work. So this causes a deadlock between
io_uring and core dump.
Our kernel version is stable 5.15.125, and the commit 1d5f5ea7cb7d ("io-wq: remove worker to owner tw dependency")
is included. When the last io worker exits, it doesn't find any callback. Once scheduled out,
it will invoke io_wq_worker_sleeping() to submit a task work to the master thread. So the
commit 1d5f5ea7cb7d ("io-wq: remove worker to owner tw dependency") won't help in this case.
For the core dump thread, we can set a timeout to check whether the task_work callback exists.
If needed, cancel the task_work and wake up the io worker, so the deadlock will be resolved.
Signed-off-by: Haifeng Xu <haifeng.xu@shopee.com>
---
fs/coredump.c | 6 ++++--
include/linux/completion.h | 2 ++
io_uring/io-wq.c | 3 +--
io_uring/io-wq.h | 1 +
kernel/sched/completion.c | 11 +++++++++++
kernel/sched/core.c | 6 ++++++
6 files changed, 25 insertions(+), 4 deletions(-)
diff --git a/fs/coredump.c b/fs/coredump.c
index 591700e1b2ce..1d972d5882f0 100644
--- a/fs/coredump.c
+++ b/fs/coredump.c
@@ -42,6 +42,7 @@
#include <linux/path.h>
#include <linux/timekeeping.h>
#include <linux/sysctl.h>
+#include <linux/sched/sysctl.h>
#include <linux/elf.h>
#include <linux/uaccess.h>
@@ -406,6 +407,7 @@ static int coredump_wait(int exit_code, struct core_state *core_state)
{
struct task_struct *tsk = current;
int core_waiters = -EBUSY;
+ unsigned long hang_check = sysctl_hung_task_timeout_secs;
init_completion(&core_state->startup);
core_state->dumper.task = tsk;
@@ -415,8 +417,8 @@ static int coredump_wait(int exit_code, struct core_state *core_state)
if (core_waiters > 0) {
struct core_thread *ptr;
- wait_for_completion_state(&core_state->startup,
- TASK_UNINTERRUPTIBLE|TASK_FREEZABLE);
+ wait_for_completion_state_timeout(&core_state->startup, TASK_UNINTERRUPTIBLE|TASK_FREEZABLE,
+ hang_check * (HZ/2));
/*
* Wait for all the threads to become inactive, so that
* all the thread context (extended register state, like
diff --git a/include/linux/completion.h b/include/linux/completion.h
index fb2915676574..432de8ecc32d 100644
--- a/include/linux/completion.h
+++ b/include/linux/completion.h
@@ -104,6 +104,8 @@ extern void wait_for_completion_io(struct completion *);
extern int wait_for_completion_interruptible(struct completion *x);
extern int wait_for_completion_killable(struct completion *x);
extern int wait_for_completion_state(struct completion *x, unsigned int state);
+extern int wait_for_completion_state_timeout(struct completion *x, unsigned int state,
+ unsigned long timeout);
extern unsigned long wait_for_completion_timeout(struct completion *x,
unsigned long timeout);
extern unsigned long wait_for_completion_io_timeout(struct completion *x,
diff --git a/io_uring/io-wq.c b/io_uring/io-wq.c
index 91019b4d0308..1c03dc57a3b3 100644
--- a/io_uring/io-wq.c
+++ b/io_uring/io-wq.c
@@ -141,7 +141,6 @@ static bool io_acct_cancel_pending_work(struct io_wq *wq,
struct io_wq_acct *acct,
struct io_cb_cancel_data *match);
static void create_worker_cb(struct callback_head *cb);
-static void io_wq_cancel_tw_create(struct io_wq *wq);
static bool io_worker_get(struct io_worker *worker)
{
@@ -1230,7 +1229,7 @@ void io_wq_exit_start(struct io_wq *wq)
set_bit(IO_WQ_BIT_EXIT, &wq->state);
}
-static void io_wq_cancel_tw_create(struct io_wq *wq)
+void io_wq_cancel_tw_create(struct io_wq *wq)
{
struct callback_head *cb;
diff --git a/io_uring/io-wq.h b/io_uring/io-wq.h
index b3b004a7b625..48ba66b5d0bd 100644
--- a/io_uring/io-wq.h
+++ b/io_uring/io-wq.h
@@ -43,6 +43,7 @@ struct io_wq_data {
free_work_fn *free_work;
};
+void io_wq_cancel_tw_create(struct io_wq *wq);
struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data);
void io_wq_exit_start(struct io_wq *wq);
void io_wq_put_and_exit(struct io_wq *wq);
diff --git a/kernel/sched/completion.c b/kernel/sched/completion.c
index 3561ab533dd4..9e7936a3cad4 100644
--- a/kernel/sched/completion.c
+++ b/kernel/sched/completion.c
@@ -269,6 +269,17 @@ int __sched wait_for_completion_state(struct completion *x, unsigned int state)
}
EXPORT_SYMBOL(wait_for_completion_state);
+int __sched wait_for_completion_state_timeout(struct completion *x, unsigned int state,
+ unsigned long timeout)
+{
+ long t = wait_for_common(x, timeout, state);
+
+ if (t == -ERESTARTSYS)
+ return t;
+ return 0;
+}
+EXPORT_SYMBOL(wait_for_completion_state_timeout);
+
/**
* wait_for_completion_killable_timeout: - waits for completion of a task (w/(to,killable))
* @x: holds the state of this particular completion
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 9aecd914ac69..1cbe48559163 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -6790,6 +6790,7 @@ static inline void sched_submit_work(struct task_struct *tsk)
{
static DEFINE_WAIT_OVERRIDE_MAP(sched_map, LD_WAIT_CONFIG);
unsigned int task_flags;
+ struct io_uring_task *io_uring = tsk->io_uring;
/*
* Establish LD_WAIT_CONFIG context to ensure none of the code called
@@ -6806,6 +6807,11 @@ static inline void sched_submit_work(struct task_struct *tsk)
wq_worker_sleeping(tsk);
else if (task_flags & PF_IO_WORKER)
io_wq_worker_sleeping(tsk);
+ else if ((task_flags & PF_DUMPCORE) && io_uring) {
+ struct io_wq *wq = io_uring->io_wq;
+
+ io_wq_cancel_tw_create(wq);
+ }
/*
* spinlock and rwlock must not flush block requests. This will
--
2.43.0
^ permalink raw reply related [flat|nested] 4+ messages in thread* Re: [RFC] io_uring: fix the dead lock between io_uring and core dump
2025-02-26 11:39 [RFC] io_uring: fix the dead lock between io_uring and core dump Haifeng Xu
@ 2025-02-27 13:09 ` kernel test robot
2025-02-27 18:24 ` kernel test robot
1 sibling, 0 replies; 4+ messages in thread
From: kernel test robot @ 2025-02-27 13:09 UTC (permalink / raw)
To: Haifeng Xu; +Cc: oe-kbuild-all
Hi Haifeng,
[This is a private test report for your RFC patch.]
kernel test robot noticed the following build errors:
[auto build test ERROR on tip/sched/core]
[also build test ERROR on brauner-vfs/vfs.all linus/master v6.14-rc4]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch#_base_tree_information]
url: https://github.com/intel-lab-lkp/linux/commits/Haifeng-Xu/io_uring-fix-the-dead-lock-between-io_uring-and-core-dump/20250226-194045
base: tip/sched/core
patch link: https://lore.kernel.org/r/20250226113936.385747-1-haifeng.xu%40shopee.com
patch subject: [RFC] io_uring: fix the dead lock between io_uring and core dump
config: i386-buildonly-randconfig-001-20250227 (https://download.01.org/0day-ci/archive/20250227/202502272014.3MUGtfQ4-lkp@intel.com/config)
compiler: gcc-12 (Debian 12.2.0-14) 12.2.0
reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20250227/202502272014.3MUGtfQ4-lkp@intel.com/reproduce)
If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <lkp@intel.com>
| Closes: https://lore.kernel.org/oe-kbuild-all/202502272014.3MUGtfQ4-lkp@intel.com/
All errors (new ones prefixed by >>):
kernel/sched/core.c: In function 'sched_submit_work':
>> kernel/sched/core.c:6791:45: error: 'struct task_struct' has no member named 'io_uring'
6791 | struct io_uring_task *io_uring = tsk->io_uring;
| ^~
vim +6791 kernel/sched/core.c
6786
6787 static inline void sched_submit_work(struct task_struct *tsk)
6788 {
6789 static DEFINE_WAIT_OVERRIDE_MAP(sched_map, LD_WAIT_CONFIG);
6790 unsigned int task_flags;
> 6791 struct io_uring_task *io_uring = tsk->io_uring;
6792
6793 /*
6794 * Establish LD_WAIT_CONFIG context to ensure none of the code called
6795 * will use a blocking primitive -- which would lead to recursion.
6796 */
6797 lock_map_acquire_try(&sched_map);
6798
6799 task_flags = tsk->flags;
6800 /*
6801 * If a worker goes to sleep, notify and ask workqueue whether it
6802 * wants to wake up a task to maintain concurrency.
6803 */
6804 if (task_flags & PF_WQ_WORKER)
6805 wq_worker_sleeping(tsk);
6806 else if (task_flags & PF_IO_WORKER)
6807 io_wq_worker_sleeping(tsk);
6808 else if ((task_flags & PF_DUMPCORE) && io_uring) {
6809 struct io_wq *wq = io_uring->io_wq;
6810
6811 io_wq_cancel_tw_create(wq);
6812 }
6813
6814 /*
6815 * spinlock and rwlock must not flush block requests. This will
6816 * deadlock if the callback attempts to acquire a lock which is
6817 * already acquired.
6818 */
6819 SCHED_WARN_ON(current->__state & TASK_RTLOCK_WAIT);
6820
6821 /*
6822 * If we are going to sleep and we have plugged IO queued,
6823 * make sure to submit it to avoid deadlocks.
6824 */
6825 blk_flush_plug(tsk->plug, true);
6826
6827 lock_map_release(&sched_map);
6828 }
6829
--
0-DAY CI Kernel Test Service
https://github.com/intel/lkp-tests/wiki
^ permalink raw reply [flat|nested] 4+ messages in thread* Re: [RFC] io_uring: fix the dead lock between io_uring and core dump
2025-02-26 11:39 [RFC] io_uring: fix the dead lock between io_uring and core dump Haifeng Xu
2025-02-27 13:09 ` kernel test robot
@ 2025-02-27 18:24 ` kernel test robot
2025-02-28 9:40 ` Haifeng Xu
1 sibling, 1 reply; 4+ messages in thread
From: kernel test robot @ 2025-02-27 18:24 UTC (permalink / raw)
To: Haifeng Xu; +Cc: llvm, oe-kbuild-all
Hi Haifeng,
[This is a private test report for your RFC patch.]
kernel test robot noticed the following build errors:
[auto build test ERROR on tip/sched/core]
[also build test ERROR on brauner-vfs/vfs.all linus/master v6.14-rc4 next-20250227]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch#_base_tree_information]
url: https://github.com/intel-lab-lkp/linux/commits/Haifeng-Xu/io_uring-fix-the-dead-lock-between-io_uring-and-core-dump/20250226-194045
base: tip/sched/core
patch link: https://lore.kernel.org/r/20250226113936.385747-1-haifeng.xu%40shopee.com
patch subject: [RFC] io_uring: fix the dead lock between io_uring and core dump
config: i386-buildonly-randconfig-003-20250227 (https://download.01.org/0day-ci/archive/20250228/202502280245.WT5bW0xf-lkp@intel.com/config)
compiler: clang version 19.1.7 (https://github.com/llvm/llvm-project cd708029e0b2869e80abe31ddb175f7c35361f90)
reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20250228/202502280245.WT5bW0xf-lkp@intel.com/reproduce)
If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <lkp@intel.com>
| Closes: https://lore.kernel.org/oe-kbuild-all/202502280245.WT5bW0xf-lkp@intel.com/
All errors (new ones prefixed by >>):
In file included from kernel/sched/core.c:10:
In file included from include/linux/highmem.h:8:
In file included from include/linux/cacheflush.h:5:
In file included from arch/x86/include/asm/cacheflush.h:5:
In file included from include/linux/mm.h:2224:
include/linux/vmstat.h:504:43: warning: arithmetic between different enumeration types ('enum zone_stat_item' and 'enum numa_stat_item') [-Wenum-enum-conversion]
504 | return vmstat_text[NR_VM_ZONE_STAT_ITEMS +
| ~~~~~~~~~~~~~~~~~~~~~ ^
505 | item];
| ~~~~
include/linux/vmstat.h:511:43: warning: arithmetic between different enumeration types ('enum zone_stat_item' and 'enum numa_stat_item') [-Wenum-enum-conversion]
511 | return vmstat_text[NR_VM_ZONE_STAT_ITEMS +
| ~~~~~~~~~~~~~~~~~~~~~ ^
512 | NR_VM_NUMA_EVENT_ITEMS +
| ~~~~~~~~~~~~~~~~~~~~~~
>> kernel/sched/core.c:6791:40: error: no member named 'io_uring' in 'struct task_struct'
6791 | struct io_uring_task *io_uring = tsk->io_uring;
| ~~~ ^
2 warnings and 1 error generated.
vim +6791 kernel/sched/core.c
6786
6787 static inline void sched_submit_work(struct task_struct *tsk)
6788 {
6789 static DEFINE_WAIT_OVERRIDE_MAP(sched_map, LD_WAIT_CONFIG);
6790 unsigned int task_flags;
> 6791 struct io_uring_task *io_uring = tsk->io_uring;
6792
6793 /*
6794 * Establish LD_WAIT_CONFIG context to ensure none of the code called
6795 * will use a blocking primitive -- which would lead to recursion.
6796 */
6797 lock_map_acquire_try(&sched_map);
6798
6799 task_flags = tsk->flags;
6800 /*
6801 * If a worker goes to sleep, notify and ask workqueue whether it
6802 * wants to wake up a task to maintain concurrency.
6803 */
6804 if (task_flags & PF_WQ_WORKER)
6805 wq_worker_sleeping(tsk);
6806 else if (task_flags & PF_IO_WORKER)
6807 io_wq_worker_sleeping(tsk);
6808 else if ((task_flags & PF_DUMPCORE) && io_uring) {
6809 struct io_wq *wq = io_uring->io_wq;
6810
6811 io_wq_cancel_tw_create(wq);
6812 }
6813
6814 /*
6815 * spinlock and rwlock must not flush block requests. This will
6816 * deadlock if the callback attempts to acquire a lock which is
6817 * already acquired.
6818 */
6819 SCHED_WARN_ON(current->__state & TASK_RTLOCK_WAIT);
6820
6821 /*
6822 * If we are going to sleep and we have plugged IO queued,
6823 * make sure to submit it to avoid deadlocks.
6824 */
6825 blk_flush_plug(tsk->plug, true);
6826
6827 lock_map_release(&sched_map);
6828 }
6829
--
0-DAY CI Kernel Test Service
https://github.com/intel/lkp-tests/wiki
^ permalink raw reply [flat|nested] 4+ messages in thread* Re: [RFC] io_uring: fix the dead lock between io_uring and core dump
2025-02-27 18:24 ` kernel test robot
@ 2025-02-28 9:40 ` Haifeng Xu
0 siblings, 0 replies; 4+ messages in thread
From: Haifeng Xu @ 2025-02-28 9:40 UTC (permalink / raw)
To: kernel test robot; +Cc: llvm, oe-kbuild-all
On 2025/2/28 02:24, kernel test robot wrote:
> Hi Haifeng,
>
> [This is a private test report for your RFC patch.]
> kernel test robot noticed the following build errors:
>
> [auto build test ERROR on tip/sched/core]
> [also build test ERROR on brauner-vfs/vfs.all linus/master v6.14-rc4 next-20250227]
> [If your patch is applied to the wrong git tree, kindly drop us a note.
> And when submitting patch, we suggest to use '--base' as documented in
> https://git-scm.com/docs/git-format-patch#_base_tree_information]
>
> url: https://github.com/intel-lab-lkp/linux/commits/Haifeng-Xu/io_uring-fix-the-dead-lock-between-io_uring-and-core-dump/20250226-194045
> base: tip/sched/core
> patch link: https://lore.kernel.org/r/20250226113936.385747-1-haifeng.xu%40shopee.com
> patch subject: [RFC] io_uring: fix the dead lock between io_uring and core dump
> config: i386-buildonly-randconfig-003-20250227 (https://download.01.org/0day-ci/archive/20250228/202502280245.WT5bW0xf-lkp@intel.com/config)
> compiler: clang version 19.1.7 (https://github.com/llvm/llvm-project cd708029e0b2869e80abe31ddb175f7c35361f90)
> reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20250228/202502280245.WT5bW0xf-lkp@intel.com/reproduce)
>
> If you fix the issue in a separate patch/commit (i.e. not just a new version of
> the same patch/commit), kindly add following tags
> | Reported-by: kernel test robot <lkp@intel.com>
> | Closes: https://lore.kernel.org/oe-kbuild-all/202502280245.WT5bW0xf-lkp@intel.com/
>
> All errors (new ones prefixed by >>):
>
> In file included from kernel/sched/core.c:10:
> In file included from include/linux/highmem.h:8:
> In file included from include/linux/cacheflush.h:5:
> In file included from arch/x86/include/asm/cacheflush.h:5:
> In file included from include/linux/mm.h:2224:
> include/linux/vmstat.h:504:43: warning: arithmetic between different enumeration types ('enum zone_stat_item' and 'enum numa_stat_item') [-Wenum-enum-conversion]
> 504 | return vmstat_text[NR_VM_ZONE_STAT_ITEMS +
> | ~~~~~~~~~~~~~~~~~~~~~ ^
> 505 | item];
> | ~~~~
> include/linux/vmstat.h:511:43: warning: arithmetic between different enumeration types ('enum zone_stat_item' and 'enum numa_stat_item') [-Wenum-enum-conversion]
> 511 | return vmstat_text[NR_VM_ZONE_STAT_ITEMS +
> | ~~~~~~~~~~~~~~~~~~~~~ ^
> 512 | NR_VM_NUMA_EVENT_ITEMS +
> | ~~~~~~~~~~~~~~~~~~~~~~
>>> kernel/sched/core.c:6791:40: error: no member named 'io_uring' in 'struct task_struct'
> 6791 | struct io_uring_task *io_uring = tsk->io_uring;
> | ~~~ ^
> 2 warnings and 1 error generated.
>
>
CONFIG_IO_URING isn't enabled, so 'io_uring' can't be found in 'struct task_struct'.
I'll post a new version later.
> vim +6791 kernel/sched/core.c
>
> 6786
> 6787 static inline void sched_submit_work(struct task_struct *tsk)
> 6788 {
> 6789 static DEFINE_WAIT_OVERRIDE_MAP(sched_map, LD_WAIT_CONFIG);
> 6790 unsigned int task_flags;
>> 6791 struct io_uring_task *io_uring = tsk->io_uring;
> 6792
> 6793 /*
> 6794 * Establish LD_WAIT_CONFIG context to ensure none of the code called
> 6795 * will use a blocking primitive -- which would lead to recursion.
> 6796 */
> 6797 lock_map_acquire_try(&sched_map);
> 6798
> 6799 task_flags = tsk->flags;
> 6800 /*
> 6801 * If a worker goes to sleep, notify and ask workqueue whether it
> 6802 * wants to wake up a task to maintain concurrency.
> 6803 */
> 6804 if (task_flags & PF_WQ_WORKER)
> 6805 wq_worker_sleeping(tsk);
> 6806 else if (task_flags & PF_IO_WORKER)
> 6807 io_wq_worker_sleeping(tsk);
> 6808 else if ((task_flags & PF_DUMPCORE) && io_uring) {
> 6809 struct io_wq *wq = io_uring->io_wq;
> 6810
> 6811 io_wq_cancel_tw_create(wq);
> 6812 }
> 6813
> 6814 /*
> 6815 * spinlock and rwlock must not flush block requests. This will
> 6816 * deadlock if the callback attempts to acquire a lock which is
> 6817 * already acquired.
> 6818 */
> 6819 SCHED_WARN_ON(current->__state & TASK_RTLOCK_WAIT);
> 6820
> 6821 /*
> 6822 * If we are going to sleep and we have plugged IO queued,
> 6823 * make sure to submit it to avoid deadlocks.
> 6824 */
> 6825 blk_flush_plug(tsk->plug, true);
> 6826
> 6827 lock_map_release(&sched_map);
> 6828 }
> 6829
>
^ permalink raw reply [flat|nested] 4+ messages in thread
end of thread, other threads:[~2025-02-28 9:40 UTC | newest]
Thread overview: 4+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2025-02-26 11:39 [RFC] io_uring: fix the dead lock between io_uring and core dump Haifeng Xu
2025-02-27 13:09 ` kernel test robot
2025-02-27 18:24 ` kernel test robot
2025-02-28 9:40 ` Haifeng Xu
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.