From: Sasha Levin <sashal@kernel.org>
To: linux-kernel@vger.kernel.org, stable@vger.kernel.org
Cc: Jens Axboe <axboe@kernel.dk>, Daniel Wagner <dwagner@suse.de>,
Peter Zijlstra <peterz@infradead.org>,
Sasha Levin <sashal@kernel.org>,
io-uring@vger.kernel.org, netdev@vger.kernel.org,
bpf@vger.kernel.org
Subject: [PATCH AUTOSEL 5.14 41/47] io-wq: remove GFP_ATOMIC allocation off schedule out path
Date: Sun, 5 Sep 2021 21:19:45 -0400 [thread overview]
Message-ID: <20210906011951.928679-41-sashal@kernel.org> (raw)
In-Reply-To: <20210906011951.928679-1-sashal@kernel.org>
From: Jens Axboe <axboe@kernel.dk>
[ Upstream commit d3e9f732c415cf22faa33d6f195e291ad82dc92e ]
Daniel reports that the v5.14-rc4-rt4 kernel throws a BUG when running
stress-ng:
| [ 90.202543] BUG: sleeping function called from invalid context at kernel/locking/spinlock_rt.c:35
| [ 90.202549] in_atomic(): 1, irqs_disabled(): 1, non_block: 0, pid: 2047, name: iou-wrk-2041
| [ 90.202555] CPU: 5 PID: 2047 Comm: iou-wrk-2041 Tainted: G W 5.14.0-rc4-rt4+ #89
| [ 90.202559] Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.14.0-2 04/01/2014
| [ 90.202561] Call Trace:
| [ 90.202577] dump_stack_lvl+0x34/0x44
| [ 90.202584] ___might_sleep.cold+0x87/0x94
| [ 90.202588] rt_spin_lock+0x19/0x70
| [ 90.202593] ___slab_alloc+0xcb/0x7d0
| [ 90.202598] ? newidle_balance.constprop.0+0xf5/0x3b0
| [ 90.202603] ? dequeue_entity+0xc3/0x290
| [ 90.202605] ? io_wqe_dec_running.isra.0+0x98/0xe0
| [ 90.202610] ? pick_next_task_fair+0xb9/0x330
| [ 90.202612] ? __schedule+0x670/0x1410
| [ 90.202615] ? io_wqe_dec_running.isra.0+0x98/0xe0
| [ 90.202618] kmem_cache_alloc_trace+0x79/0x1f0
| [ 90.202621] io_wqe_dec_running.isra.0+0x98/0xe0
| [ 90.202625] io_wq_worker_sleeping+0x37/0x50
| [ 90.202628] schedule+0x30/0xd0
| [ 90.202630] schedule_timeout+0x8f/0x1a0
| [ 90.202634] ? __bpf_trace_tick_stop+0x10/0x10
| [ 90.202637] io_wqe_worker+0xfd/0x320
| [ 90.202641] ? finish_task_switch.isra.0+0xd3/0x290
| [ 90.202644] ? io_worker_handle_work+0x670/0x670
| [ 90.202646] ? io_worker_handle_work+0x670/0x670
| [ 90.202649] ret_from_fork+0x22/0x30
which is due to the RT kernel not liking a GFP_ATOMIC allocation inside
a raw spinlock. Besides that not working on RT, doing any kind of
allocation from inside schedule() is kind of nasty and should be avoided
if at all possible.
This particular path happens when an io-wq worker goes to sleep, and we
need a new worker to handle pending work. We currently allocate a small
data item to hold the information we need to create a new worker, but we
can instead include this data in the io_worker struct itself and just
protect it with a single bit lock. We only really need one per worker
anyway, as we will have run pending work between two sleep cycles.
https://lore.kernel.org/lkml/20210804082418.fbibprcwtzyt5qax@beryllium.lan/
Reported-by: Daniel Wagner <dwagner@suse.de>
Tested-by: Daniel Wagner <dwagner@suse.de>
Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
Signed-off-by: Sasha Levin <sashal@kernel.org>
---
fs/io-wq.c | 72 ++++++++++++++++++++++++++++++------------------------
1 file changed, 40 insertions(+), 32 deletions(-)
diff --git a/fs/io-wq.c b/fs/io-wq.c
index 7d2ed8c7dd31..4ce83bb48021 100644
--- a/fs/io-wq.c
+++ b/fs/io-wq.c
@@ -51,6 +51,10 @@ struct io_worker {
struct completion ref_done;
+ unsigned long create_state;
+ struct callback_head create_work;
+ int create_index;
+
struct rcu_head rcu;
};
@@ -272,24 +276,18 @@ static void io_wqe_inc_running(struct io_worker *worker)
atomic_inc(&acct->nr_running);
}
-struct create_worker_data {
- struct callback_head work;
- struct io_wqe *wqe;
- int index;
-};
-
static void create_worker_cb(struct callback_head *cb)
{
- struct create_worker_data *cwd;
+ struct io_worker *worker;
struct io_wq *wq;
struct io_wqe *wqe;
struct io_wqe_acct *acct;
bool do_create = false, first = false;
- cwd = container_of(cb, struct create_worker_data, work);
- wqe = cwd->wqe;
+ worker = container_of(cb, struct io_worker, create_work);
+ wqe = worker->wqe;
wq = wqe->wq;
- acct = &wqe->acct[cwd->index];
+ acct = &wqe->acct[worker->create_index];
raw_spin_lock_irq(&wqe->lock);
if (acct->nr_workers < acct->max_workers) {
if (!acct->nr_workers)
@@ -299,33 +297,42 @@ static void create_worker_cb(struct callback_head *cb)
}
raw_spin_unlock_irq(&wqe->lock);
if (do_create) {
- create_io_worker(wq, wqe, cwd->index, first);
+ create_io_worker(wq, wqe, worker->create_index, first);
} else {
atomic_dec(&acct->nr_running);
io_worker_ref_put(wq);
}
- kfree(cwd);
+ clear_bit_unlock(0, &worker->create_state);
+ io_worker_release(worker);
}
-static void io_queue_worker_create(struct io_wqe *wqe, struct io_wqe_acct *acct)
+static void io_queue_worker_create(struct io_wqe *wqe, struct io_worker *worker,
+ struct io_wqe_acct *acct)
{
- struct create_worker_data *cwd;
struct io_wq *wq = wqe->wq;
/* raced with exit, just ignore create call */
if (test_bit(IO_WQ_BIT_EXIT, &wq->state))
goto fail;
+ if (!io_worker_get(worker))
+ goto fail;
+ /*
+ * create_state manages ownership of create_work/index. We should
+ * only need one entry per worker, as the worker going to sleep
+ * will trigger the condition, and waking will clear it once it
+ * runs the task_work.
+ */
+ if (test_bit(0, &worker->create_state) ||
+ test_and_set_bit_lock(0, &worker->create_state))
+ goto fail_release;
- cwd = kmalloc(sizeof(*cwd), GFP_ATOMIC);
- if (cwd) {
- init_task_work(&cwd->work, create_worker_cb);
- cwd->wqe = wqe;
- cwd->index = acct->index;
- if (!task_work_add(wq->task, &cwd->work, TWA_SIGNAL))
- return;
-
- kfree(cwd);
- }
+ init_task_work(&worker->create_work, create_worker_cb);
+ worker->create_index = acct->index;
+ if (!task_work_add(wq->task, &worker->create_work, TWA_SIGNAL))
+ return;
+ clear_bit_unlock(0, &worker->create_state);
+fail_release:
+ io_worker_release(worker);
fail:
atomic_dec(&acct->nr_running);
io_worker_ref_put(wq);
@@ -343,7 +350,7 @@ static void io_wqe_dec_running(struct io_worker *worker)
if (atomic_dec_and_test(&acct->nr_running) && io_wqe_run_queue(wqe)) {
atomic_inc(&acct->nr_running);
atomic_inc(&wqe->wq->worker_refs);
- io_queue_worker_create(wqe, acct);
+ io_queue_worker_create(wqe, worker, acct);
}
}
@@ -1004,12 +1011,12 @@ struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data)
static bool io_task_work_match(struct callback_head *cb, void *data)
{
- struct create_worker_data *cwd;
+ struct io_worker *worker;
if (cb->func != create_worker_cb)
return false;
- cwd = container_of(cb, struct create_worker_data, work);
- return cwd->wqe->wq == data;
+ worker = container_of(cb, struct io_worker, create_work);
+ return worker->wqe->wq == data;
}
void io_wq_exit_start(struct io_wq *wq)
@@ -1026,12 +1033,13 @@ static void io_wq_exit_workers(struct io_wq *wq)
return;
while ((cb = task_work_cancel_match(wq->task, io_task_work_match, wq)) != NULL) {
- struct create_worker_data *cwd;
+ struct io_worker *worker;
- cwd = container_of(cb, struct create_worker_data, work);
- atomic_dec(&cwd->wqe->acct[cwd->index].nr_running);
+ worker = container_of(cb, struct io_worker, create_work);
+ atomic_dec(&worker->wqe->acct[worker->create_index].nr_running);
io_worker_ref_put(wq);
- kfree(cwd);
+ clear_bit_unlock(0, &worker->create_state);
+ io_worker_release(worker);
}
rcu_read_lock();
--
2.30.2
next prev parent reply other threads:[~2021-09-06 1:21 UTC|newest]
Thread overview: 47+ messages / expand[flat|nested] mbox.gz Atom feed top
2021-09-06 1:19 [PATCH AUTOSEL 5.14 01/47] locking/mutex: Fix HANDOFF condition Sasha Levin
2021-09-06 1:19 ` [PATCH AUTOSEL 5.14 02/47] regmap: fix the offset of register error log Sasha Levin
2021-09-06 1:19 ` [PATCH AUTOSEL 5.14 03/47] regulator: tps65910: Silence deferred probe error Sasha Levin
2021-09-06 1:19 ` [PATCH AUTOSEL 5.14 04/47] crypto: mxs-dcp - Check for DMA mapping errors Sasha Levin
2021-09-06 1:19 ` [PATCH AUTOSEL 5.14 05/47] sched/deadline: Fix reset_on_fork reporting of DL tasks Sasha Levin
2021-09-06 1:19 ` [PATCH AUTOSEL 5.14 06/47] power: supply: axp288_fuel_gauge: Report register-address on readb / writeb errors Sasha Levin
2021-09-06 1:19 ` [PATCH AUTOSEL 5.14 07/47] crypto: omap-sham - clear dma flags only after omap_sham_update_dma_stop() Sasha Levin
2021-09-06 1:19 ` [PATCH AUTOSEL 5.14 08/47] sched/deadline: Fix missing clock update in migrate_task_rq_dl() Sasha Levin
2021-09-06 1:19 ` [PATCH AUTOSEL 5.14 09/47] rcu/tree: Handle VM stoppage in stall detection Sasha Levin
2021-09-06 1:19 ` [PATCH AUTOSEL 5.14 10/47] EDAC/mce_amd: Do not load edac_mce_amd module on guests Sasha Levin
2021-09-06 1:19 ` [PATCH AUTOSEL 5.14 11/47] posix-cpu-timers: Force next expiration recalc after itimer reset Sasha Levin
2021-09-06 1:19 ` [PATCH AUTOSEL 5.14 12/47] hrtimer: Avoid double reprogramming in __hrtimer_start_range_ns() Sasha Levin
2021-09-06 1:19 ` [PATCH AUTOSEL 5.14 13/47] hrtimer: Ensure timerfd notification for HIGHRES=n Sasha Levin
2021-09-06 1:19 ` [PATCH AUTOSEL 5.14 14/47] udf: Check LVID earlier Sasha Levin
2021-09-06 1:19 ` [PATCH AUTOSEL 5.14 15/47] udf: Fix iocharset=utf8 mount option Sasha Levin
2021-09-06 1:19 ` [PATCH AUTOSEL 5.14 16/47] isofs: joliet: " Sasha Levin
2021-09-06 1:19 ` [PATCH AUTOSEL 5.14 17/47] bcache: add proper error unwinding in bcache_device_init Sasha Levin
2021-09-06 1:19 ` [PATCH AUTOSEL 5.14 18/47] nbd: add the check to prevent overflow in __nbd_ioctl() Sasha Levin
2021-09-06 1:19 ` [PATCH AUTOSEL 5.14 19/47] blk-throtl: optimize IOPS throttle for large IO scenarios Sasha Levin
2021-09-06 1:19 ` [PATCH AUTOSEL 5.14 20/47] nvme-tcp: don't update queue count when failing to set io queues Sasha Levin
2021-09-06 1:19 ` [PATCH AUTOSEL 5.14 21/47] nvme-rdma: " Sasha Levin
2021-09-06 1:19 ` [PATCH AUTOSEL 5.14 22/47] nvmet: pass back cntlid on successful completion Sasha Levin
2021-09-06 1:19 ` [PATCH AUTOSEL 5.14 23/47] power: supply: smb347-charger: Add missing pin control activation Sasha Levin
2021-09-06 1:19 ` [PATCH AUTOSEL 5.14 24/47] power: supply: max17042_battery: fix typo in MAx17042_TOFF Sasha Levin
2021-09-06 1:19 ` [PATCH AUTOSEL 5.14 25/47] s390/cio: add dev_busid sysfs entry for each subchannel Sasha Levin
2021-09-06 1:19 ` [PATCH AUTOSEL 5.14 26/47] s390/zcrypt: fix wrong offset index for APKA master key valid state Sasha Levin
2021-09-06 1:19 ` [PATCH AUTOSEL 5.14 27/47] libata: fix ata_host_start() Sasha Levin
2021-09-06 1:19 ` [PATCH AUTOSEL 5.14 28/47] sched/topology: Skip updating masks for non-online nodes Sasha Levin
2021-09-06 1:19 ` [PATCH AUTOSEL 5.14 29/47] crypto: omap - Fix inconsistent locking of device lists Sasha Levin
2021-09-06 1:19 ` [PATCH AUTOSEL 5.14 30/47] crypto: qat - do not ignore errors from enable_vf2pf_comms() Sasha Levin
2021-09-06 1:19 ` [PATCH AUTOSEL 5.14 31/47] crypto: qat - handle both source of interrupt in VF ISR Sasha Levin
2021-09-06 1:19 ` [PATCH AUTOSEL 5.14 32/47] crypto: qat - fix reuse of completion variable Sasha Levin
2021-09-06 1:19 ` [PATCH AUTOSEL 5.14 33/47] crypto: qat - fix naming for init/shutdown VF to PF notifications Sasha Levin
2021-09-06 1:19 ` [PATCH AUTOSEL 5.14 34/47] crypto: qat - do not export adf_iov_putmsg() Sasha Levin
2021-09-06 1:19 ` [PATCH AUTOSEL 5.14 35/47] crypto: hisilicon/sec - fix the abnormal exiting process Sasha Levin
2021-09-06 1:19 ` [PATCH AUTOSEL 5.14 36/47] crypto: hisilicon/sec - modify the hardware endian configuration Sasha Levin
2021-09-06 1:19 ` [PATCH AUTOSEL 5.14 37/47] crypto: tcrypt - Fix missing return value check Sasha Levin
2021-09-06 1:19 ` [PATCH AUTOSEL 5.14 38/47] fcntl: fix potential deadlocks for &fown_struct.lock Sasha Levin
2021-09-06 1:19 ` [PATCH AUTOSEL 5.14 39/47] fcntl: fix potential deadlock for &fasync_struct.fa_lock Sasha Levin
2021-09-06 1:19 ` [PATCH AUTOSEL 5.14 40/47] udf_get_extendedattr() had no boundary checks Sasha Levin
2021-09-06 1:19 ` Sasha Levin [this message]
2021-09-06 1:19 ` [PATCH AUTOSEL 5.14 42/47] s390/kasan: fix large PMD pages address alignment check Sasha Levin
2021-09-06 1:19 ` [PATCH AUTOSEL 5.14 43/47] s390/pci: fix misleading rc in clp_set_pci_fn() Sasha Levin
2021-09-06 1:19 ` [PATCH AUTOSEL 5.14 44/47] s390/debug: keep debug data on resize Sasha Levin
2021-09-06 1:19 ` [PATCH AUTOSEL 5.14 45/47] s390/debug: fix debug area life cycle Sasha Levin
2021-09-06 1:19 ` [PATCH AUTOSEL 5.14 46/47] s390/ap: fix state machine hang after failure to enable irq Sasha Levin
2021-09-06 1:19 ` [PATCH AUTOSEL 5.14 47/47] s390/smp: enable DAT before CPU restart callback is called Sasha Levin
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20210906011951.928679-41-sashal@kernel.org \
--to=sashal@kernel.org \
--cc=axboe@kernel.dk \
--cc=bpf@vger.kernel.org \
--cc=dwagner@suse.de \
--cc=io-uring@vger.kernel.org \
--cc=linux-kernel@vger.kernel.org \
--cc=netdev@vger.kernel.org \
--cc=peterz@infradead.org \
--cc=stable@vger.kernel.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox