From: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
To: linux-kernel@vger.kernel.org
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>,
stable@vger.kernel.org,
Sebastian Andrzej Siewior <bigeasy@linutronix.de>,
Jens Axboe <axboe@kernel.dk>
Subject: [PATCH 5.8 11/70] io_wq: Make io_wqe::lock a raw_spinlock_t
Date: Sat, 31 Oct 2020 12:35:43 +0100 [thread overview]
Message-ID: <20201031113500.044462247@linuxfoundation.org> (raw)
In-Reply-To: <20201031113459.481803250@linuxfoundation.org>
From: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
commit 95da84659226d75698a1ab958be0af21d9cc2a9c upstream.
During a context switch the scheduler invokes wq_worker_sleeping() with
disabled preemption. Disabling preemption is needed because it protects
access to `worker->sleeping'. As an optimisation it avoids invoking
schedule() within the schedule path as part of possible wake up (thus
preempt_enable_no_resched() afterwards).
The io-wq has been added to the mix in the same section with disabled
preemption. This breaks on PREEMPT_RT because io_wq_worker_sleeping()
acquires a spinlock_t. Also within the schedule() the spinlock_t must be
acquired after tsk_is_pi_blocked() otherwise it will block on the
sleeping lock again while scheduling out.
While playing with `io_uring-bench' I didn't notice a significant
latency spike after converting io_wqe::lock to a raw_spinlock_t. The
latency was more or less the same.
In order to keep the spinlock_t it would have to be moved after the
tsk_is_pi_blocked() check which would introduce a branch instruction
into the hot path.
The lock is used to maintain the `work_list' and wakes one task up at
most.
Should io_wqe_cancel_pending_work() cause latency spikes, while
searching for a specific item, then it would need to drop the lock
during iterations.
revert_creds() is also invoked under the lock. According to debug
cred::non_rcu is 0. Otherwise it should be moved outside of the locked
section because put_cred_rcu()->free_uid() acquires a sleeping lock.
Convert io_wqe::lock to a raw_spinlock_t.c
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
fs/io-wq.c | 52 ++++++++++++++++++++++++++--------------------------
1 file changed, 26 insertions(+), 26 deletions(-)
--- a/fs/io-wq.c
+++ b/fs/io-wq.c
@@ -88,7 +88,7 @@ enum {
*/
struct io_wqe {
struct {
- spinlock_t lock;
+ raw_spinlock_t lock;
struct io_wq_work_list work_list;
unsigned long hash_map;
unsigned flags;
@@ -149,7 +149,7 @@ static bool __io_worker_unuse(struct io_
if (current->files != worker->restore_files) {
__acquire(&wqe->lock);
- spin_unlock_irq(&wqe->lock);
+ raw_spin_unlock_irq(&wqe->lock);
dropped_lock = true;
task_lock(current);
@@ -168,7 +168,7 @@ static bool __io_worker_unuse(struct io_
if (worker->mm) {
if (!dropped_lock) {
__acquire(&wqe->lock);
- spin_unlock_irq(&wqe->lock);
+ raw_spin_unlock_irq(&wqe->lock);
dropped_lock = true;
}
__set_current_state(TASK_RUNNING);
@@ -222,17 +222,17 @@ static void io_worker_exit(struct io_wor
worker->flags = 0;
preempt_enable();
- spin_lock_irq(&wqe->lock);
+ raw_spin_lock_irq(&wqe->lock);
hlist_nulls_del_rcu(&worker->nulls_node);
list_del_rcu(&worker->all_list);
if (__io_worker_unuse(wqe, worker)) {
__release(&wqe->lock);
- spin_lock_irq(&wqe->lock);
+ raw_spin_lock_irq(&wqe->lock);
}
acct->nr_workers--;
nr_workers = wqe->acct[IO_WQ_ACCT_BOUND].nr_workers +
wqe->acct[IO_WQ_ACCT_UNBOUND].nr_workers;
- spin_unlock_irq(&wqe->lock);
+ raw_spin_unlock_irq(&wqe->lock);
/* all workers gone, wq exit can proceed */
if (!nr_workers && refcount_dec_and_test(&wqe->wq->refs))
@@ -508,7 +508,7 @@ get_next:
else if (!wq_list_empty(&wqe->work_list))
wqe->flags |= IO_WQE_FLAG_STALLED;
- spin_unlock_irq(&wqe->lock);
+ raw_spin_unlock_irq(&wqe->lock);
if (!work)
break;
io_assign_current_work(worker, work);
@@ -543,7 +543,7 @@ get_next:
io_wqe_enqueue(wqe, linked);
if (hash != -1U && !next_hashed) {
- spin_lock_irq(&wqe->lock);
+ raw_spin_lock_irq(&wqe->lock);
wqe->hash_map &= ~BIT_ULL(hash);
wqe->flags &= ~IO_WQE_FLAG_STALLED;
/* dependent work is not hashed */
@@ -551,11 +551,11 @@ get_next:
/* skip unnecessary unlock-lock wqe->lock */
if (!work)
goto get_next;
- spin_unlock_irq(&wqe->lock);
+ raw_spin_unlock_irq(&wqe->lock);
}
} while (work);
- spin_lock_irq(&wqe->lock);
+ raw_spin_lock_irq(&wqe->lock);
} while (1);
}
@@ -570,7 +570,7 @@ static int io_wqe_worker(void *data)
while (!test_bit(IO_WQ_BIT_EXIT, &wq->state)) {
set_current_state(TASK_INTERRUPTIBLE);
loop:
- spin_lock_irq(&wqe->lock);
+ raw_spin_lock_irq(&wqe->lock);
if (io_wqe_run_queue(wqe)) {
__set_current_state(TASK_RUNNING);
io_worker_handle_work(worker);
@@ -581,7 +581,7 @@ loop:
__release(&wqe->lock);
goto loop;
}
- spin_unlock_irq(&wqe->lock);
+ raw_spin_unlock_irq(&wqe->lock);
if (signal_pending(current))
flush_signals(current);
if (schedule_timeout(WORKER_IDLE_TIMEOUT))
@@ -593,11 +593,11 @@ loop:
}
if (test_bit(IO_WQ_BIT_EXIT, &wq->state)) {
- spin_lock_irq(&wqe->lock);
+ raw_spin_lock_irq(&wqe->lock);
if (!wq_list_empty(&wqe->work_list))
io_worker_handle_work(worker);
else
- spin_unlock_irq(&wqe->lock);
+ raw_spin_unlock_irq(&wqe->lock);
}
io_worker_exit(worker);
@@ -637,9 +637,9 @@ void io_wq_worker_sleeping(struct task_s
worker->flags &= ~IO_WORKER_F_RUNNING;
- spin_lock_irq(&wqe->lock);
+ raw_spin_lock_irq(&wqe->lock);
io_wqe_dec_running(wqe, worker);
- spin_unlock_irq(&wqe->lock);
+ raw_spin_unlock_irq(&wqe->lock);
}
static bool create_io_worker(struct io_wq *wq, struct io_wqe *wqe, int index)
@@ -663,7 +663,7 @@ static bool create_io_worker(struct io_w
return false;
}
- spin_lock_irq(&wqe->lock);
+ raw_spin_lock_irq(&wqe->lock);
hlist_nulls_add_head_rcu(&worker->nulls_node, &wqe->free_list);
list_add_tail_rcu(&worker->all_list, &wqe->all_list);
worker->flags |= IO_WORKER_F_FREE;
@@ -672,7 +672,7 @@ static bool create_io_worker(struct io_w
if (!acct->nr_workers && (worker->flags & IO_WORKER_F_BOUND))
worker->flags |= IO_WORKER_F_FIXED;
acct->nr_workers++;
- spin_unlock_irq(&wqe->lock);
+ raw_spin_unlock_irq(&wqe->lock);
if (index == IO_WQ_ACCT_UNBOUND)
atomic_inc(&wq->user->processes);
@@ -727,12 +727,12 @@ static int io_wq_manager(void *data)
if (!node_online(node))
continue;
- spin_lock_irq(&wqe->lock);
+ raw_spin_lock_irq(&wqe->lock);
if (io_wqe_need_worker(wqe, IO_WQ_ACCT_BOUND))
fork_worker[IO_WQ_ACCT_BOUND] = true;
if (io_wqe_need_worker(wqe, IO_WQ_ACCT_UNBOUND))
fork_worker[IO_WQ_ACCT_UNBOUND] = true;
- spin_unlock_irq(&wqe->lock);
+ raw_spin_unlock_irq(&wqe->lock);
if (fork_worker[IO_WQ_ACCT_BOUND])
create_io_worker(wq, wqe, IO_WQ_ACCT_BOUND);
if (fork_worker[IO_WQ_ACCT_UNBOUND])
@@ -829,10 +829,10 @@ static void io_wqe_enqueue(struct io_wqe
}
work_flags = work->flags;
- spin_lock_irqsave(&wqe->lock, flags);
+ raw_spin_lock_irqsave(&wqe->lock, flags);
io_wqe_insert_work(wqe, work);
wqe->flags &= ~IO_WQE_FLAG_STALLED;
- spin_unlock_irqrestore(&wqe->lock, flags);
+ raw_spin_unlock_irqrestore(&wqe->lock, flags);
if ((work_flags & IO_WQ_WORK_CONCURRENT) ||
!atomic_read(&acct->nr_running))
@@ -959,13 +959,13 @@ static void io_wqe_cancel_pending_work(s
unsigned long flags;
retry:
- spin_lock_irqsave(&wqe->lock, flags);
+ raw_spin_lock_irqsave(&wqe->lock, flags);
wq_list_for_each(node, prev, &wqe->work_list) {
work = container_of(node, struct io_wq_work, list);
if (!match->fn(work, match->data))
continue;
io_wqe_remove_pending(wqe, work, prev);
- spin_unlock_irqrestore(&wqe->lock, flags);
+ raw_spin_unlock_irqrestore(&wqe->lock, flags);
io_run_cancel(work, wqe);
match->nr_pending++;
if (!match->cancel_all)
@@ -974,7 +974,7 @@ retry:
/* not safe to continue after unlock */
goto retry;
}
- spin_unlock_irqrestore(&wqe->lock, flags);
+ raw_spin_unlock_irqrestore(&wqe->lock, flags);
}
static void io_wqe_cancel_running_work(struct io_wqe *wqe,
@@ -1082,7 +1082,7 @@ struct io_wq *io_wq_create(unsigned boun
}
atomic_set(&wqe->acct[IO_WQ_ACCT_UNBOUND].nr_running, 0);
wqe->wq = wq;
- spin_lock_init(&wqe->lock);
+ raw_spin_lock_init(&wqe->lock);
INIT_WQ_LIST(&wqe->work_list);
INIT_HLIST_NULLS_HEAD(&wqe->free_list, 0);
INIT_LIST_HEAD(&wqe->all_list);
next prev parent reply other threads:[~2020-10-31 11:40 UTC|newest]
Thread overview: 73+ messages / expand[flat|nested] mbox.gz Atom feed top
2020-10-31 11:35 [PATCH 5.8 00/70] 5.8.18-rc1 review Greg Kroah-Hartman
2020-10-31 11:35 ` [PATCH 5.8 01/70] netfilter: nftables_offload: KASAN slab-out-of-bounds Read in nft_flow_rule_create Greg Kroah-Hartman
2020-10-31 11:35 ` [PATCH 5.8 02/70] io_uring: dont run task work on an exiting task Greg Kroah-Hartman
2020-10-31 11:35 ` [PATCH 5.8 03/70] io_uring: allow timeout/poll/files killing to take task into account Greg Kroah-Hartman
2020-10-31 11:35 ` [PATCH 5.8 04/70] io_uring: move dropping of files into separate helper Greg Kroah-Hartman
2020-10-31 11:35 ` [PATCH 5.8 05/70] io_uring: stash ctx task reference for SQPOLL Greg Kroah-Hartman
2020-10-31 11:35 ` [PATCH 5.8 06/70] io_uring: unconditionally grab req->task Greg Kroah-Hartman
2020-10-31 11:35 ` [PATCH 5.8 07/70] io_uring: return cancelation status from poll/timeout/files handlers Greg Kroah-Hartman
2020-10-31 11:35 ` [PATCH 5.8 08/70] io_uring: enable task/files specific overflow flushing Greg Kroah-Hartman
2020-10-31 11:35 ` [PATCH 5.8 09/70] io_uring: dont rely on weak ->files references Greg Kroah-Hartman
2020-10-31 11:35 ` [PATCH 5.8 10/70] io_uring: reference ->nsproxy for file table commands Greg Kroah-Hartman
2020-10-31 11:35 ` Greg Kroah-Hartman [this message]
2020-10-31 11:35 ` [PATCH 5.8 12/70] io-wq: fix use-after-free in io_wq_worker_running Greg Kroah-Hartman
2020-10-31 11:35 ` [PATCH 5.8 13/70] io_uring: no need to call xa_destroy() on empty xarray Greg Kroah-Hartman
2020-10-31 11:35 ` [PATCH 5.8 14/70] io_uring: Fix use of XArray in __io_uring_files_cancel Greg Kroah-Hartman
2020-10-31 11:35 ` [PATCH 5.8 15/70] io_uring: Fix XArray usage in io_uring_add_task_file Greg Kroah-Hartman
2020-10-31 11:35 ` [PATCH 5.8 16/70] io_uring: Convert advanced XArray uses to the normal API Greg Kroah-Hartman
2020-10-31 11:35 ` [PATCH 5.8 17/70] scripts/setlocalversion: make git describe output more reliable Greg Kroah-Hartman
2020-10-31 11:35 ` [PATCH 5.8 18/70] efi/arm64: libstub: Deal gracefully with EFI_RNG_PROTOCOL failure Greg Kroah-Hartman
2020-10-31 11:35 ` [PATCH 5.8 19/70] fs/kernel_read_file: Remove FIRMWARE_EFI_EMBEDDED enum Greg Kroah-Hartman
2020-10-31 11:35 ` [PATCH 5.8 20/70] arm64: Run ARCH_WORKAROUND_1 enabling code on all CPUs Greg Kroah-Hartman
2020-10-31 11:35 ` [PATCH 5.8 21/70] arm64: Run ARCH_WORKAROUND_2 " Greg Kroah-Hartman
2020-10-31 11:35 ` [PATCH 5.8 22/70] arm64: link with -z norelro regardless of CONFIG_RELOCATABLE Greg Kroah-Hartman
2020-10-31 11:35 ` [PATCH 5.8 23/70] x86/PCI: Fix intel_mid_pci.c build error when ACPI is not enabled Greg Kroah-Hartman
2020-10-31 11:35 ` [PATCH 5.8 24/70] x86, powerpc: Rename memcpy_mcsafe() to copy_mc_to_{user, kernel}() Greg Kroah-Hartman
2020-10-31 11:35 ` [PATCH 5.8 25/70] x86/copy_mc: Introduce copy_mc_enhanced_fast_string() Greg Kroah-Hartman
2020-10-31 11:35 ` [PATCH 5.8 26/70] efivarfs: Replace invalid slashes with exclamation marks in dentries Greg Kroah-Hartman
2020-10-31 11:35 ` [PATCH 5.8 27/70] bnxt_en: Check abort error state in bnxt_open_nic() Greg Kroah-Hartman
2020-10-31 11:36 ` [PATCH 5.8 28/70] bnxt_en: Fix regression in workqueue cleanup logic in bnxt_remove_one() Greg Kroah-Hartman
2020-10-31 11:36 ` [PATCH 5.8 29/70] bnxt_en: Invoke cancel_delayed_work_sync() for PFs also Greg Kroah-Hartman
2020-10-31 11:36 ` [PATCH 5.8 30/70] bnxt_en: Re-write PCI BARs after PCI fatal error Greg Kroah-Hartman
2020-10-31 11:36 ` [PATCH 5.8 31/70] bnxt_en: Send HWRM_FUNC_RESET fw command unconditionally Greg Kroah-Hartman
2020-10-31 11:36 ` [PATCH 5.8 32/70] chelsio/chtls: fix deadlock issue Greg Kroah-Hartman
2020-10-31 11:36 ` [PATCH 5.8 33/70] chelsio/chtls: fix memory leaks in CPL handlers Greg Kroah-Hartman
2020-10-31 11:36 ` [PATCH 5.8 34/70] chelsio/chtls: fix tls record info to user Greg Kroah-Hartman
2020-10-31 11:36 ` [PATCH 5.8 35/70] cxgb4: set up filter action after rewrites Greg Kroah-Hartman
2020-10-31 11:36 ` [PATCH 5.8 36/70] gtp: fix an use-before-init in gtp_newlink() Greg Kroah-Hartman
2020-10-31 11:36 ` [PATCH 5.8 37/70] ibmveth: Fix use of ibmveth in a bridge Greg Kroah-Hartman
2020-10-31 11:36 ` [PATCH 5.8 38/70] ibmvnic: fix ibmvnic_set_mac Greg Kroah-Hartman
2020-10-31 11:36 ` [PATCH 5.8 39/70] mlxsw: core: Fix memory leak on module removal Greg Kroah-Hartman
2020-10-31 11:36 ` [PATCH 5.8 40/70] netem: fix zero division in tabledist Greg Kroah-Hartman
2020-10-31 11:36 ` [PATCH 5.8 41/70] net: hns3: Clear the CMDQ registers before unmapping BAR region Greg Kroah-Hartman
2020-10-31 11:36 ` [PATCH 5.8 42/70] net: ipa: command payloads already mapped Greg Kroah-Hartman
2020-10-31 11:36 ` [PATCH 5.8 43/70] net/sched: act_mpls: Add softdep on mpls_gso.ko Greg Kroah-Hartman
2020-10-31 11:36 ` [PATCH 5.8 44/70] r8169: fix issue with forced threading in combination with shared interrupts Greg Kroah-Hartman
2020-10-31 11:36 ` [PATCH 5.8 45/70] ravb: Fix bit fields checking in ravb_hwtstamp_get() Greg Kroah-Hartman
2020-10-31 11:36 ` [PATCH 5.8 46/70] tcp: Prevent low rmem stalls with SO_RCVLOWAT Greg Kroah-Hartman
2020-10-31 11:36 ` [PATCH 5.8 47/70] tipc: fix memory leak caused by tipc_buf_append() Greg Kroah-Hartman
2020-10-31 11:36 ` [PATCH 5.8 48/70] net: protect tcf_block_unbind with block lock Greg Kroah-Hartman
2020-10-31 11:36 ` [PATCH 5.8 49/70] erofs: avoid duplicated permission check for "trusted." xattrs Greg Kroah-Hartman
2020-10-31 11:36 ` [PATCH 5.8 50/70] arch/x86/amd/ibs: Fix re-arming IBS Fetch Greg Kroah-Hartman
2020-10-31 11:36 ` [PATCH 5.8 51/70] x86/traps: Fix #DE Oops message regression Greg Kroah-Hartman
2020-10-31 11:36 ` [PATCH 5.8 52/70] x86/xen: disable Firmware First mode for correctable memory errors Greg Kroah-Hartman
2020-10-31 11:36 ` [PATCH 5.8 53/70] PCI: aardvark: Fix initialization with old Marvells Arm Trusted Firmware Greg Kroah-Hartman
2020-10-31 11:36 ` [PATCH 5.8 54/70] ata: ahci: mvebu: Make SATA PHY optional for Armada 3720 Greg Kroah-Hartman
2020-10-31 11:36 ` [PATCH 5.8 55/70] fuse: fix page dereference after free Greg Kroah-Hartman
2020-10-31 11:36 ` [PATCH 5.8 56/70] bpf: Fix comment for helper bpf_current_task_under_cgroup() Greg Kroah-Hartman
2020-10-31 11:36 ` [PATCH 5.8 57/70] evm: Check size of security.evm before using it Greg Kroah-Hartman
2020-10-31 11:36 ` [PATCH 5.8 58/70] p54: avoid accessing the data mapped to streaming DMA Greg Kroah-Hartman
2020-10-31 11:36 ` [PATCH 5.8 59/70] cxl: Rework error message for incompatible slots Greg Kroah-Hartman
2020-10-31 11:36 ` [PATCH 5.8 60/70] RDMA/addr: Fix race with netevent_callback()/rdma_addr_cancel() Greg Kroah-Hartman
2020-10-31 11:36 ` [PATCH 5.8 61/70] mtd: lpddr: Fix bad logic in print_drs_error Greg Kroah-Hartman
2020-10-31 11:36 ` [PATCH 5.8 62/70] drm/i915/gem: Serialise debugfs i915_gem_objects with ctx->mutex Greg Kroah-Hartman
2020-10-31 11:36 ` [PATCH 5.8 63/70] serial: qcom_geni_serial: To correct QUP Version detection logic Greg Kroah-Hartman
2020-10-31 11:36 ` [PATCH 5.8 64/70] serial: pl011: Fix lockdep splat when handling magic-sysrq interrupt Greg Kroah-Hartman
2020-10-31 11:36 ` [PATCH 5.8 65/70] PM: runtime: Fix timer_expires data type on 32-bit arches Greg Kroah-Hartman
2020-10-31 11:36 ` [PATCH 5.8 66/70] ata: sata_rcar: Fix DMA boundary mask Greg Kroah-Hartman
2020-10-31 11:36 ` [PATCH 5.8 67/70] xen/gntdev.c: Mark pages as dirty Greg Kroah-Hartman
2020-10-31 11:36 ` [PATCH 5.8 68/70] openrisc: Fix issue with get_user for 64-bit values Greg Kroah-Hartman
2020-10-31 11:36 ` [PATCH 5.8 69/70] misc: rtsx: do not setting OC_POWER_DOWN reg in rtsx_pci_init_ocp() Greg Kroah-Hartman
2020-10-31 11:36 ` [PATCH 5.8 70/70] phy: marvell: comphy: Convert internal SMCC firmware return codes to errno Greg Kroah-Hartman
2020-10-31 20:08 ` [PATCH 5.8 00/70] 5.8.18-rc1 review Guenter Roeck
2020-11-01 7:19 ` Naresh Kamboju
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20201031113500.044462247@linuxfoundation.org \
--to=gregkh@linuxfoundation.org \
--cc=axboe@kernel.dk \
--cc=bigeasy@linutronix.de \
--cc=linux-kernel@vger.kernel.org \
--cc=stable@vger.kernel.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox