From: Sasha Levin <sashal@kernel.org>
To: linux-kernel@vger.kernel.org, stable@vger.kernel.org
Cc: Tejun Heo <tj@kernel.org>,
Andrea Righi <andrea.righi@canonical.com>,
Geert Uytterhoeven <geert@linux-m68k.org>,
Greg Kroah-Hartman <gregkh@linuxfoundation.org>,
Sasha Levin <sashal@kernel.org>,
bpf@vger.kernel.org
Subject: [PATCH AUTOSEL 6.6 04/20] kernfs: RCU protect kernfs_nodes and avoid kernfs_idr_lock in kernfs_find_and_get_node_by_id()
Date: Wed, 3 Apr 2024 13:17:45 -0400 [thread overview]
Message-ID: <20240403171815.342668-4-sashal@kernel.org> (raw)
In-Reply-To: <20240403171815.342668-1-sashal@kernel.org>
From: Tejun Heo <tj@kernel.org>
[ Upstream commit 4207b556e62f0a8915afc5da4c5d5ad915a253a5 ]
The BPF helper bpf_cgroup_from_id() calls kernfs_find_and_get_node_by_id()
which acquires kernfs_idr_lock, which is an non-raw non-IRQ-safe lock. This
can lead to deadlocks as bpf_cgroup_from_id() can be called from any BPF
programs including e.g. the ones that attach to functions which are holding
the scheduler rq lock.
Consider the following BPF program:
SEC("fentry/__set_cpus_allowed_ptr_locked")
int BPF_PROG(__set_cpus_allowed_ptr_locked, struct task_struct *p,
struct affinity_context *affn_ctx, struct rq *rq, struct rq_flags *rf)
{
struct cgroup *cgrp = bpf_cgroup_from_id(p->cgroups->dfl_cgrp->kn->id);
if (cgrp) {
bpf_printk("%d[%s] in %s", p->pid, p->comm, cgrp->kn->name);
bpf_cgroup_release(cgrp);
}
return 0;
}
__set_cpus_allowed_ptr_locked() is called with rq lock held and the above
BPF program calls bpf_cgroup_from_id() within leading to the following
lockdep warning:
=====================================================
WARNING: HARDIRQ-safe -> HARDIRQ-unsafe lock order detected
6.7.0-rc3-work-00053-g07124366a1d7-dirty #147 Not tainted
-----------------------------------------------------
repro/1620 [HC0[0]:SC0[0]:HE0:SE1] is trying to acquire:
ffffffff833b3688 (kernfs_idr_lock){+.+.}-{2:2}, at: kernfs_find_and_get_node_by_id+0x1e/0x70
and this task is already holding:
ffff888237ced698 (&rq->__lock){-.-.}-{2:2}, at: task_rq_lock+0x4e/0xf0
which would create a new lock dependency:
(&rq->__lock){-.-.}-{2:2} -> (kernfs_idr_lock){+.+.}-{2:2}
...
Possible interrupt unsafe locking scenario:
CPU0 CPU1
---- ----
lock(kernfs_idr_lock);
local_irq_disable();
lock(&rq->__lock);
lock(kernfs_idr_lock);
<Interrupt>
lock(&rq->__lock);
*** DEADLOCK ***
...
Call Trace:
dump_stack_lvl+0x55/0x70
dump_stack+0x10/0x20
__lock_acquire+0x781/0x2a40
lock_acquire+0xbf/0x1f0
_raw_spin_lock+0x2f/0x40
kernfs_find_and_get_node_by_id+0x1e/0x70
cgroup_get_from_id+0x21/0x240
bpf_cgroup_from_id+0xe/0x20
bpf_prog_98652316e9337a5a___set_cpus_allowed_ptr_locked+0x96/0x11a
bpf_trampoline_6442545632+0x4f/0x1000
__set_cpus_allowed_ptr_locked+0x5/0x5a0
sched_setaffinity+0x1b3/0x290
__x64_sys_sched_setaffinity+0x4f/0x60
do_syscall_64+0x40/0xe0
entry_SYSCALL_64_after_hwframe+0x46/0x4e
Let's fix it by protecting kernfs_node and kernfs_root with RCU and making
kernfs_find_and_get_node_by_id() acquire rcu_read_lock() instead of
kernfs_idr_lock.
This adds an rcu_head to kernfs_node making it larger by 16 bytes on 64bit.
Combined with the preceding rearrange patch, the net increase is 8 bytes.
Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Andrea Righi <andrea.righi@canonical.com>
Cc: Geert Uytterhoeven <geert@linux-m68k.org>
Link: https://lore.kernel.org/r/20240109214828.252092-4-tj@kernel.org
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Signed-off-by: Sasha Levin <sashal@kernel.org>
---
fs/kernfs/dir.c | 31 ++++++++++++++++++++-----------
fs/kernfs/kernfs-internal.h | 2 ++
include/linux/kernfs.h | 2 ++
3 files changed, 24 insertions(+), 11 deletions(-)
diff --git a/fs/kernfs/dir.c b/fs/kernfs/dir.c
index 62d39ecf0a466..2405aeb39b9a2 100644
--- a/fs/kernfs/dir.c
+++ b/fs/kernfs/dir.c
@@ -529,6 +529,20 @@ void kernfs_get(struct kernfs_node *kn)
}
EXPORT_SYMBOL_GPL(kernfs_get);
+static void kernfs_free_rcu(struct rcu_head *rcu)
+{
+ struct kernfs_node *kn = container_of(rcu, struct kernfs_node, rcu);
+
+ kfree_const(kn->name);
+
+ if (kn->iattr) {
+ simple_xattrs_free(&kn->iattr->xattrs, NULL);
+ kmem_cache_free(kernfs_iattrs_cache, kn->iattr);
+ }
+
+ kmem_cache_free(kernfs_node_cache, kn);
+}
+
/**
* kernfs_put - put a reference count on a kernfs_node
* @kn: the target kernfs_node
@@ -557,16 +571,11 @@ void kernfs_put(struct kernfs_node *kn)
if (kernfs_type(kn) == KERNFS_LINK)
kernfs_put(kn->symlink.target_kn);
- kfree_const(kn->name);
-
- if (kn->iattr) {
- simple_xattrs_free(&kn->iattr->xattrs, NULL);
- kmem_cache_free(kernfs_iattrs_cache, kn->iattr);
- }
spin_lock(&kernfs_idr_lock);
idr_remove(&root->ino_idr, (u32)kernfs_ino(kn));
spin_unlock(&kernfs_idr_lock);
- kmem_cache_free(kernfs_node_cache, kn);
+
+ call_rcu(&kn->rcu, kernfs_free_rcu);
kn = parent;
if (kn) {
@@ -575,7 +584,7 @@ void kernfs_put(struct kernfs_node *kn)
} else {
/* just released the root kn, free @root too */
idr_destroy(&root->ino_idr);
- kfree(root);
+ kfree_rcu(root, rcu);
}
}
EXPORT_SYMBOL_GPL(kernfs_put);
@@ -715,7 +724,7 @@ struct kernfs_node *kernfs_find_and_get_node_by_id(struct kernfs_root *root,
ino_t ino = kernfs_id_ino(id);
u32 gen = kernfs_id_gen(id);
- spin_lock(&kernfs_idr_lock);
+ rcu_read_lock();
kn = idr_find(&root->ino_idr, (u32)ino);
if (!kn)
@@ -739,10 +748,10 @@ struct kernfs_node *kernfs_find_and_get_node_by_id(struct kernfs_root *root,
if (unlikely(!__kernfs_active(kn) || !atomic_inc_not_zero(&kn->count)))
goto err_unlock;
- spin_unlock(&kernfs_idr_lock);
+ rcu_read_unlock();
return kn;
err_unlock:
- spin_unlock(&kernfs_idr_lock);
+ rcu_read_unlock();
return NULL;
}
diff --git a/fs/kernfs/kernfs-internal.h b/fs/kernfs/kernfs-internal.h
index a9b854cdfdb5f..210dac7e9ee25 100644
--- a/fs/kernfs/kernfs-internal.h
+++ b/fs/kernfs/kernfs-internal.h
@@ -49,6 +49,8 @@ struct kernfs_root {
struct rw_semaphore kernfs_rwsem;
struct rw_semaphore kernfs_iattr_rwsem;
struct rw_semaphore kernfs_supers_rwsem;
+
+ struct rcu_head rcu;
};
/* +1 to avoid triggering overflow warning when negating it */
diff --git a/include/linux/kernfs.h b/include/linux/kernfs.h
index 2a36f3218b510..5a952d00ea159 100644
--- a/include/linux/kernfs.h
+++ b/include/linux/kernfs.h
@@ -223,6 +223,8 @@ struct kernfs_node {
unsigned short flags;
umode_t mode;
struct kernfs_iattrs *iattr;
+
+ struct rcu_head rcu;
};
/*
--
2.43.0
next prev parent reply other threads:[~2024-04-03 17:18 UTC|newest]
Thread overview: 20+ messages / expand[flat|nested] mbox.gz Atom feed top
2024-04-03 17:17 [PATCH AUTOSEL 6.6 01/20] tools: iio: replace seekdir() in iio_generic_buffer Sasha Levin
2024-04-03 17:17 ` [PATCH AUTOSEL 6.6 02/20] usb: storage: sddr55: fix sloppy typing in sddr55_{read|write}_data() Sasha Levin
2024-04-03 17:17 ` [PATCH AUTOSEL 6.6 03/20] bus: mhi: host: Add MHI_PM_SYS_ERR_FAIL state Sasha Levin
2024-04-03 17:17 ` Sasha Levin [this message]
2024-04-03 17:17 ` [PATCH AUTOSEL 6.6 05/20] usb: gadget: uvc: mark incomplete frames with UVC_STREAM_ERR Sasha Levin
2024-04-03 17:17 ` [PATCH AUTOSEL 6.6 06/20] usb: typec: ucsi: Limit read size on v1.2 Sasha Levin
2024-04-03 17:17 ` [PATCH AUTOSEL 6.6 07/20] thunderbolt: Keep the domain powered when USB4 port is in redrive mode Sasha Levin
2024-04-03 17:17 ` [PATCH AUTOSEL 6.6 08/20] usb: typec: tcpci: add generic tcpci fallback compatible Sasha Levin
2024-04-03 17:17 ` [PATCH AUTOSEL 6.6 09/20] usb: sl811-hcd: only defined function checkdone if QUIRK2 is defined Sasha Levin
2024-04-03 17:17 ` [PATCH AUTOSEL 6.6 10/20] ASoC: amd: yc: Fix non-functional mic on ASUS M7600RE Sasha Levin
2024-04-03 17:17 ` [PATCH AUTOSEL 6.6 11/20] thermal/of: Assume polling-delay(-passive) 0 when absent Sasha Levin
2024-04-03 17:17 ` [PATCH AUTOSEL 6.6 12/20] ASoC: soc-core.c: Skip dummy codec when adding platforms Sasha Levin
2024-04-03 17:17 ` [PATCH AUTOSEL 6.6 13/20] x86/xen: attempt to inflate the memory balloon on PVH Sasha Levin
2024-04-03 17:17 ` [PATCH AUTOSEL 6.6 14/20] fbdev: viafb: fix typo in hw_bitblt_1 and hw_bitblt_2 Sasha Levin
2024-04-03 17:17 ` [PATCH AUTOSEL 6.6 15/20] io_uring: clear opcode specific data for an early failure Sasha Levin
2024-04-03 17:17 ` [PATCH AUTOSEL 6.6 16/20] modpost: fix null pointer dereference Sasha Levin
2024-04-03 17:17 ` [PATCH AUTOSEL 6.6 17/20] drivers/nvme: Add quirks for device 126f:2262 Sasha Levin
2024-04-03 17:17 ` [PATCH AUTOSEL 6.6 18/20] fbmon: prevent division by zero in fb_videomode_from_videomode() Sasha Levin
2024-04-03 17:18 ` [PATCH AUTOSEL 6.6 19/20] ALSA: hda/realtek: Add quirks for some Clevo laptops Sasha Levin
2024-04-03 17:18 ` [PATCH AUTOSEL 6.6 20/20] drm/amdgpu: fix use-after-free bug Sasha Levin
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20240403171815.342668-4-sashal@kernel.org \
--to=sashal@kernel.org \
--cc=andrea.righi@canonical.com \
--cc=bpf@vger.kernel.org \
--cc=geert@linux-m68k.org \
--cc=gregkh@linuxfoundation.org \
--cc=linux-kernel@vger.kernel.org \
--cc=stable@vger.kernel.org \
--cc=tj@kernel.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox