From: Andrii Nakryiko <andrii@kernel.org>
To: linux-trace-kernel@vger.kernel.org, peterz@infradead.org,
oleg@redhat.com
Cc: rostedt@goodmis.org, mhiramat@kernel.org, bpf@vger.kernel.org,
linux-kernel@vger.kernel.org, jolsa@kernel.org,
paulmck@kernel.org, willy@infradead.org, surenb@google.com,
akpm@linux-foundation.org, linux-mm@kvack.org,
Andrii Nakryiko <andrii@kernel.org>
Subject: [PATCH v3 02/13] uprobes: protected uprobe lifetime with SRCU
Date: Mon, 12 Aug 2024 21:29:06 -0700 [thread overview]
Message-ID: <20240813042917.506057-3-andrii@kernel.org> (raw)
In-Reply-To: <20240813042917.506057-1-andrii@kernel.org>
To avoid unnecessarily taking a (brief) refcount on uprobe during
breakpoint handling in handle_swbp for entry uprobes, make find_uprobe()
not take refcount, but protect the lifetime of a uprobe instance with
RCU. This improves scalability, as refcount gets quite expensive due to
cache line bouncing between multiple CPUs.
Specifically, we utilize our own uprobe-specific SRCU instance for this
RCU protection. put_uprobe() will delay actual kfree() using call_srcu().
For now, uretprobe and single-stepping handling will still acquire
refcount as necessary. We'll address these issues in follow up patches
by making them use SRCU with timeout.
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
---
kernel/events/uprobes.c | 94 +++++++++++++++++++++++------------------
1 file changed, 54 insertions(+), 40 deletions(-)
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index 147561c19d57..3e3595753e2c 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -41,6 +41,8 @@ static struct rb_root uprobes_tree = RB_ROOT;
static DEFINE_RWLOCK(uprobes_treelock); /* serialize rbtree access */
+DEFINE_STATIC_SRCU(uprobes_srcu);
+
#define UPROBES_HASH_SZ 13
/* serialize uprobe->pending_list */
static struct mutex uprobes_mmap_mutex[UPROBES_HASH_SZ];
@@ -59,6 +61,7 @@ struct uprobe {
struct list_head pending_list;
struct uprobe_consumer *consumers;
struct inode *inode; /* Also hold a ref to inode */
+ struct rcu_head rcu;
loff_t offset;
loff_t ref_ctr_offset;
unsigned long flags;
@@ -617,6 +620,13 @@ static inline bool uprobe_is_active(struct uprobe *uprobe)
return !RB_EMPTY_NODE(&uprobe->rb_node);
}
+static void uprobe_free_rcu(struct rcu_head *rcu)
+{
+ struct uprobe *uprobe = container_of(rcu, struct uprobe, rcu);
+
+ kfree(uprobe);
+}
+
static void put_uprobe(struct uprobe *uprobe)
{
if (!refcount_dec_and_test(&uprobe->ref))
@@ -638,7 +648,7 @@ static void put_uprobe(struct uprobe *uprobe)
delayed_uprobe_remove(uprobe, NULL);
mutex_unlock(&delayed_uprobe_lock);
- kfree(uprobe);
+ call_srcu(&uprobes_srcu, &uprobe->rcu, uprobe_free_rcu);
}
static __always_inline
@@ -680,33 +690,25 @@ static inline int __uprobe_cmp(struct rb_node *a, const struct rb_node *b)
return uprobe_cmp(u->inode, u->offset, __node_2_uprobe(b));
}
-static struct uprobe *__find_uprobe(struct inode *inode, loff_t offset)
+/*
+ * Assumes being inside RCU protected region.
+ * No refcount is taken on returned uprobe.
+ */
+static struct uprobe *find_uprobe_rcu(struct inode *inode, loff_t offset)
{
struct __uprobe_key key = {
.inode = inode,
.offset = offset,
};
- struct rb_node *node = rb_find(&key, &uprobes_tree, __uprobe_cmp_key);
-
- if (node)
- return try_get_uprobe(__node_2_uprobe(node));
-
- return NULL;
-}
+ struct rb_node *node;
-/*
- * Find a uprobe corresponding to a given inode:offset
- * Acquires uprobes_treelock
- */
-static struct uprobe *find_uprobe(struct inode *inode, loff_t offset)
-{
- struct uprobe *uprobe;
+ lockdep_assert(srcu_read_lock_held(&uprobes_srcu));
read_lock(&uprobes_treelock);
- uprobe = __find_uprobe(inode, offset);
+ node = rb_find(&key, &uprobes_tree, __uprobe_cmp_key);
read_unlock(&uprobes_treelock);
- return uprobe;
+ return node ? __node_2_uprobe(node) : NULL;
}
/*
@@ -1080,10 +1082,10 @@ register_for_each_vma(struct uprobe *uprobe, struct uprobe_consumer *new)
goto free;
/*
* We take mmap_lock for writing to avoid the race with
- * find_active_uprobe() which takes mmap_lock for reading.
+ * find_active_uprobe_rcu() which takes mmap_lock for reading.
* Thus this install_breakpoint() can not make
- * is_trap_at_addr() true right after find_uprobe()
- * returns NULL in find_active_uprobe().
+ * is_trap_at_addr() true right after find_uprobe_rcu()
+ * returns NULL in find_active_uprobe_rcu().
*/
mmap_write_lock(mm);
vma = find_vma(mm, info->vaddr);
@@ -1885,9 +1887,13 @@ static void prepare_uretprobe(struct uprobe *uprobe, struct pt_regs *regs)
return;
}
+ /* we need to bump refcount to store uprobe in utask */
+ if (!try_get_uprobe(uprobe))
+ return;
+
ri = kmalloc(sizeof(struct return_instance), GFP_KERNEL);
if (!ri)
- return;
+ goto fail;
trampoline_vaddr = uprobe_get_trampoline_vaddr();
orig_ret_vaddr = arch_uretprobe_hijack_return_addr(trampoline_vaddr, regs);
@@ -1914,11 +1920,7 @@ static void prepare_uretprobe(struct uprobe *uprobe, struct pt_regs *regs)
}
orig_ret_vaddr = utask->return_instances->orig_ret_vaddr;
}
- /*
- * uprobe's refcnt is positive, held by caller, so it's safe to
- * unconditionally bump it one more time here
- */
- ri->uprobe = get_uprobe(uprobe);
+ ri->uprobe = uprobe;
ri->func = instruction_pointer(regs);
ri->stack = user_stack_pointer(regs);
ri->orig_ret_vaddr = orig_ret_vaddr;
@@ -1929,8 +1931,9 @@ static void prepare_uretprobe(struct uprobe *uprobe, struct pt_regs *regs)
utask->return_instances = ri;
return;
- fail:
+fail:
kfree(ri);
+ put_uprobe(uprobe);
}
/* Prepare to single-step probed instruction out of line. */
@@ -1945,9 +1948,14 @@ pre_ssout(struct uprobe *uprobe, struct pt_regs *regs, unsigned long bp_vaddr)
if (!utask)
return -ENOMEM;
+ if (!try_get_uprobe(uprobe))
+ return -EINVAL;
+
xol_vaddr = xol_get_insn_slot(uprobe);
- if (!xol_vaddr)
- return -ENOMEM;
+ if (!xol_vaddr) {
+ err = -ENOMEM;
+ goto err_out;
+ }
utask->xol_vaddr = xol_vaddr;
utask->vaddr = bp_vaddr;
@@ -1955,12 +1963,15 @@ pre_ssout(struct uprobe *uprobe, struct pt_regs *regs, unsigned long bp_vaddr)
err = arch_uprobe_pre_xol(&uprobe->arch, regs);
if (unlikely(err)) {
xol_free_insn_slot(current);
- return err;
+ goto err_out;
}
utask->active_uprobe = uprobe;
utask->state = UTASK_SSTEP;
return 0;
+err_out:
+ put_uprobe(uprobe);
+ return err;
}
/*
@@ -2043,7 +2054,8 @@ static int is_trap_at_addr(struct mm_struct *mm, unsigned long vaddr)
return is_trap_insn(&opcode);
}
-static struct uprobe *find_active_uprobe(unsigned long bp_vaddr, int *is_swbp)
+/* assumes being inside RCU protected region */
+static struct uprobe *find_active_uprobe_rcu(unsigned long bp_vaddr, int *is_swbp)
{
struct mm_struct *mm = current->mm;
struct uprobe *uprobe = NULL;
@@ -2056,7 +2068,7 @@ static struct uprobe *find_active_uprobe(unsigned long bp_vaddr, int *is_swbp)
struct inode *inode = file_inode(vma->vm_file);
loff_t offset = vaddr_to_offset(vma, bp_vaddr);
- uprobe = find_uprobe(inode, offset);
+ uprobe = find_uprobe_rcu(inode, offset);
}
if (!uprobe)
@@ -2202,13 +2214,15 @@ static void handle_swbp(struct pt_regs *regs)
{
struct uprobe *uprobe;
unsigned long bp_vaddr;
- int is_swbp;
+ int is_swbp, srcu_idx;
bp_vaddr = uprobe_get_swbp_addr(regs);
if (bp_vaddr == uprobe_get_trampoline_vaddr())
return uprobe_handle_trampoline(regs);
- uprobe = find_active_uprobe(bp_vaddr, &is_swbp);
+ srcu_idx = srcu_read_lock(&uprobes_srcu);
+
+ uprobe = find_active_uprobe_rcu(bp_vaddr, &is_swbp);
if (!uprobe) {
if (is_swbp > 0) {
/* No matching uprobe; signal SIGTRAP. */
@@ -2224,7 +2238,7 @@ static void handle_swbp(struct pt_regs *regs)
*/
instruction_pointer_set(regs, bp_vaddr);
}
- return;
+ goto out;
}
/* change it in advance for ->handler() and restart */
@@ -2259,12 +2273,12 @@ static void handle_swbp(struct pt_regs *regs)
if (arch_uprobe_skip_sstep(&uprobe->arch, regs))
goto out;
- if (!pre_ssout(uprobe, regs, bp_vaddr))
- return;
+ if (pre_ssout(uprobe, regs, bp_vaddr))
+ goto out;
- /* arch_uprobe_skip_sstep() succeeded, or restart if can't singlestep */
out:
- put_uprobe(uprobe);
+ /* arch_uprobe_skip_sstep() succeeded, or restart if can't singlestep */
+ srcu_read_unlock(&uprobes_srcu, srcu_idx);
}
/*
--
2.43.5
next prev parent reply other threads:[~2024-08-13 4:29 UTC|newest]
Thread overview: 40+ messages / expand[flat|nested] mbox.gz Atom feed top
2024-08-13 4:29 [PATCH v3 00/13] uprobes: RCU-protected hot path optimizations Andrii Nakryiko
2024-08-13 4:29 ` [PATCH v3 01/13] uprobes: revamp uprobe refcounting and lifetime management Andrii Nakryiko
2024-08-13 4:29 ` Andrii Nakryiko [this message]
2024-08-13 4:29 ` [PATCH v3 03/13] uprobes: get rid of enum uprobe_filter_ctx in uprobe filter callbacks Andrii Nakryiko
2024-08-13 4:29 ` [PATCH v3 04/13] uprobes: travers uprobe's consumer list locklessly under SRCU protection Andrii Nakryiko
2024-08-22 14:22 ` Jiri Olsa
2024-08-22 16:59 ` Andrii Nakryiko
2024-08-22 17:35 ` Jiri Olsa
2024-08-22 17:51 ` Andrii Nakryiko
2024-08-13 4:29 ` [PATCH v3 05/13] perf/uprobe: split uprobe_unregister() Andrii Nakryiko
2024-08-13 4:29 ` [PATCH v3 06/13] rbtree: provide rb_find_rcu() / rb_find_add_rcu() Andrii Nakryiko
2024-08-13 4:29 ` [PATCH v3 07/13] uprobes: perform lockless SRCU-protected uprobes_tree lookup Andrii Nakryiko
2024-08-13 4:29 ` [PATCH v3 08/13] uprobes: switch to RCU Tasks Trace flavor for better performance Andrii Nakryiko
2024-08-13 4:29 ` [PATCH RFC v3 09/13] uprobes: SRCU-protect uretprobe lifetime (with timeout) Andrii Nakryiko
2024-08-19 13:41 ` Oleg Nesterov
2024-08-19 20:34 ` Andrii Nakryiko
2024-08-20 15:05 ` Oleg Nesterov
2024-08-20 18:01 ` Andrii Nakryiko
2024-08-13 4:29 ` [PATCH RFC v3 10/13] uprobes: implement SRCU-protected lifetime for single-stepped uprobe Andrii Nakryiko
2024-08-13 4:29 ` [PATCH RFC v3 11/13] mm: introduce mmap_lock_speculation_{start|end} Andrii Nakryiko
2024-08-13 4:29 ` [PATCH RFC v3 12/13] mm: add SLAB_TYPESAFE_BY_RCU to files_cache Andrii Nakryiko
2024-08-13 6:07 ` Mateusz Guzik
2024-08-13 14:49 ` Suren Baghdasaryan
2024-08-13 18:15 ` Andrii Nakryiko
2024-08-13 4:29 ` [PATCH RFC v3 13/13] uprobes: add speculative lockless VMA to inode resolution Andrii Nakryiko
2024-08-13 6:17 ` Mateusz Guzik
2024-08-13 15:36 ` Suren Baghdasaryan
2024-08-15 13:44 ` Mateusz Guzik
2024-08-15 16:47 ` Andrii Nakryiko
2024-08-15 17:45 ` Suren Baghdasaryan
2024-08-15 18:24 ` Mateusz Guzik
2024-08-15 18:58 ` Jann Horn
2024-08-15 19:07 ` Mateusz Guzik
2024-08-15 19:17 ` Arnaldo Carvalho de Melo
2024-08-15 19:18 ` Arnaldo Carvalho de Melo
2024-08-15 19:44 ` Suren Baghdasaryan
2024-08-15 20:17 ` Andrii Nakryiko
2024-08-15 13:24 ` [PATCH v3 00/13] uprobes: RCU-protected hot path optimizations Oleg Nesterov
2024-08-15 16:49 ` Andrii Nakryiko
2024-08-21 16:41 ` Andrii Nakryiko
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20240813042917.506057-3-andrii@kernel.org \
--to=andrii@kernel.org \
--cc=akpm@linux-foundation.org \
--cc=bpf@vger.kernel.org \
--cc=jolsa@kernel.org \
--cc=linux-kernel@vger.kernel.org \
--cc=linux-mm@kvack.org \
--cc=linux-trace-kernel@vger.kernel.org \
--cc=mhiramat@kernel.org \
--cc=oleg@redhat.com \
--cc=paulmck@kernel.org \
--cc=peterz@infradead.org \
--cc=rostedt@goodmis.org \
--cc=surenb@google.com \
--cc=willy@infradead.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).