* [PATCH 00/10] perf/uprobe: Optimize uprobes
@ 2024-07-08 9:12 Peter Zijlstra
2024-07-08 9:12 ` [PATCH 01/10] perf/uprobe: Re-indent labels Peter Zijlstra
` (10 more replies)
0 siblings, 11 replies; 84+ messages in thread
From: Peter Zijlstra @ 2024-07-08 9:12 UTC (permalink / raw)
To: mingo, andrii
Cc: linux-kernel, peterz, rostedt, mhiramat, oleg, jolsa, clm,
paulmck
Hi!
These patches implement the (S)RCU based proposal to optimize uprobes.
On my c^Htrusty old IVB-EP -- where each (of the 40) CPU calls 'func' in a
tight loop:
perf probe -x ./uprobes test=func
perf stat -ae probe_uprobe:test -- sleep 1
perf probe -x ./uprobes test=func%return
perf stat -ae probe_uprobe:test__return -- sleep 1
PRE:
4,038,804 probe_uprobe:test
2,356,275 probe_uprobe:test__return
POST:
7,216,579 probe_uprobe:test
6,744,786 probe_uprobe:test__return
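For reference, the './uprobes' test binary only needs to spin on an empty
function; a minimal sketch of such a loop (an illustration, not the actual
test program used above):

	/* uprobes.c -- every CPU spins calling func() so the probe fires constantly */
	/* build (assumed): gcc -O2 -pthread -o uprobes uprobes.c */
	#include <pthread.h>
	#include <unistd.h>

	__attribute__((noinline)) void func(void)
	{
		asm volatile("");	/* keep the call and the symbol around */
	}

	static void *spin(void *arg)
	{
		for (;;)
			func();
		return NULL;
	}

	int main(void)
	{
		long i, ncpus = sysconf(_SC_NPROCESSORS_ONLN);
		pthread_t tid;

		for (i = 1; i < ncpus; i++)	/* one spinner per CPU */
			pthread_create(&tid, NULL, spin, NULL);
		spin(NULL);
		return 0;
	}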
Patches also available here:
git://git.kernel.org/pub/scm/linux/kernel/git/peterz/queue.git perf/uprobes
* [PATCH 01/10] perf/uprobe: Re-indent labels
2024-07-08 9:12 [PATCH 00/10] perf/uprobe: Optimize uprobes Peter Zijlstra
@ 2024-07-08 9:12 ` Peter Zijlstra
2024-07-08 9:12 ` [PATCH 02/10] perf/uprobe: Remove spurious whitespace Peter Zijlstra
` (9 subsequent siblings)
10 siblings, 0 replies; 84+ messages in thread
From: Peter Zijlstra @ 2024-07-08 9:12 UTC (permalink / raw)
To: mingo, andrii
Cc: linux-kernel, peterz, rostedt, mhiramat, oleg, jolsa, clm,
paulmck
Remove the silly label indenting.
s/^\ \([[:alnum:]]*\):$/\1:/g
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
kernel/events/uprobes.c | 26 +++++++++++++-------------
1 file changed, 13 insertions(+), 13 deletions(-)
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -205,7 +205,7 @@ static int __replace_page(struct vm_area
folio_put(old_folio);
err = 0;
- unlock:
+unlock:
mmu_notifier_invalidate_range_end(&range);
folio_unlock(old_folio);
return err;
@@ -857,7 +857,7 @@ static int prepare_uprobe(struct uprobe
smp_wmb(); /* pairs with the smp_rmb() in handle_swbp() */
set_bit(UPROBE_COPY_INSN, &uprobe->flags);
- out:
+out:
up_write(&uprobe->consumer_rwsem);
return ret;
@@ -965,7 +965,7 @@ build_map_info(struct address_space *map
struct map_info *info;
int more = 0;
- again:
+again:
i_mmap_lock_read(mapping);
vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) {
if (!valid_vma(vma, is_register))
@@ -1019,7 +1019,7 @@ build_map_info(struct address_space *map
} while (--more);
goto again;
- out:
+out:
while (prev)
prev = free_map_info(prev);
return curr;
@@ -1068,13 +1068,13 @@ register_for_each_vma(struct uprobe *upr
err |= remove_breakpoint(uprobe, mm, info->vaddr);
}
- unlock:
+unlock:
mmap_write_unlock(mm);
- free:
+free:
mmput(mm);
info = free_map_info(info);
}
- out:
+out:
percpu_up_write(&dup_mmap_sem);
return err;
}
@@ -1159,7 +1159,7 @@ static int __uprobe_register(struct inod
if (!IS_ALIGNED(ref_ctr_offset, sizeof(short)))
return -EINVAL;
- retry:
+retry:
uprobe = alloc_uprobe(inode, offset, ref_ctr_offset);
if (!uprobe)
return -ENOMEM;
@@ -1468,7 +1468,7 @@ static int xol_add_vma(struct mm_struct
ret = 0;
/* pairs with get_xol_area() */
smp_store_release(&mm->uprobes_state.xol_area, area); /* ^^^ */
- fail:
+fail:
mmap_write_unlock(mm);
return ret;
@@ -1512,7 +1512,7 @@ static struct xol_area *__create_xol_are
kfree(area->bitmap);
free_area:
kfree(area);
- out:
+out:
return NULL;
}
@@ -1915,7 +1915,7 @@ static void prepare_uretprobe(struct upr
utask->return_instances = ri;
return;
- fail:
+fail:
kfree(ri);
}
@@ -2031,7 +2031,7 @@ static int is_trap_at_addr(struct mm_str
copy_from_page(page, vaddr, &opcode, UPROBE_SWBP_INSN_SIZE);
put_page(page);
- out:
+out:
/* This needs to return true for any variant of the trap insn */
return is_trap_insn(&opcode);
}
@@ -2159,7 +2159,7 @@ static void handle_trampoline(struct pt_
utask->return_instances = ri;
return;
- sigill:
+sigill:
uprobe_warn(current, "handle uretprobe, sending SIGILL.");
force_sig(SIGILL);
* [PATCH 02/10] perf/uprobe: Remove spurious whitespace
2024-07-08 9:12 [PATCH 00/10] perf/uprobe: Optimize uprobes Peter Zijlstra
2024-07-08 9:12 ` [PATCH 01/10] perf/uprobe: Re-indent labels Peter Zijlstra
@ 2024-07-08 9:12 ` Peter Zijlstra
2024-07-08 9:12 ` [PATCH 03/10] rbtree: Provide rb_find_rcu() / rb_find_add_rcu() Peter Zijlstra
` (8 subsequent siblings)
10 siblings, 0 replies; 84+ messages in thread
From: Peter Zijlstra @ 2024-07-08 9:12 UTC (permalink / raw)
To: mingo, andrii
Cc: linux-kernel, peterz, rostedt, mhiramat, oleg, jolsa, clm,
paulmck
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
kernel/events/uprobes.c | 12 ++++++------
1 file changed, 6 insertions(+), 6 deletions(-)
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -67,7 +67,7 @@ struct uprobe {
* The generic code assumes that it has two members of unknown type
* owned by the arch-specific code:
*
- * insn - copy_insn() saves the original instruction here for
+ * insn - copy_insn() saves the original instruction here for
* arch_uprobe_analyze_insn().
*
* ixol - potentially modified instruction to execute out of
@@ -95,18 +95,18 @@ static LIST_HEAD(delayed_uprobe_list);
* allocated.
*/
struct xol_area {
- wait_queue_head_t wq; /* if all slots are busy */
- atomic_t slot_count; /* number of in-use slots */
- unsigned long *bitmap; /* 0 = free slot */
+ wait_queue_head_t wq; /* if all slots are busy */
+ atomic_t slot_count; /* number of in-use slots */
+ unsigned long *bitmap; /* 0 = free slot */
struct vm_special_mapping xol_mapping;
- struct page *pages[2];
+ struct page *pages[2];
/*
* We keep the vma's vm_start rather than a pointer to the vma
* itself. The probed process or a naughty kernel module could make
* the vma go away, and we must handle that reasonably gracefully.
*/
- unsigned long vaddr; /* Page(s) of instruction slots */
+ unsigned long vaddr; /* Page(s) of instruction slots */
};
/*
* [PATCH 03/10] rbtree: Provide rb_find_rcu() / rb_find_add_rcu()
2024-07-08 9:12 [PATCH 00/10] perf/uprobe: Optimize uprobes Peter Zijlstra
2024-07-08 9:12 ` [PATCH 01/10] perf/uprobe: Re-indent labels Peter Zijlstra
2024-07-08 9:12 ` [PATCH 02/10] perf/uprobe: Remove spurious whitespace Peter Zijlstra
@ 2024-07-08 9:12 ` Peter Zijlstra
2024-07-10 1:29 ` Masami Hiramatsu
2024-07-08 9:12 ` [PATCH 04/10] perf/uprobe: RCU-ify find_uprobe() Peter Zijlstra
` (7 subsequent siblings)
10 siblings, 1 reply; 84+ messages in thread
From: Peter Zijlstra @ 2024-07-08 9:12 UTC (permalink / raw)
To: mingo, andrii
Cc: linux-kernel, peterz, rostedt, mhiramat, oleg, jolsa, clm,
paulmck
Much like latch_tree, add two RCU methods for the regular RB-tree,
which can be used in conjunction with a seqcount to provide lockless
lookups.
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
include/linux/rbtree.h | 67 +++++++++++++++++++++++++++++++++++++++++++++++++
1 file changed, 67 insertions(+)
--- a/include/linux/rbtree.h
+++ b/include/linux/rbtree.h
@@ -245,6 +245,42 @@ rb_find_add(struct rb_node *node, struct
}
/**
+ * rb_find_add_rcu() - find equivalent @node in @tree, or add @node
+ * @node: node to look-for / insert
+ * @tree: tree to search / modify
+ * @cmp: operator defining the node order
+ *
+ * Adds a Store-Release for link_node.
+ *
+ * Returns the rb_node matching @node, or NULL when no match is found and @node
+ * is inserted.
+ */
+static __always_inline struct rb_node *
+rb_find_add_rcu(struct rb_node *node, struct rb_root *tree,
+ int (*cmp)(struct rb_node *, const struct rb_node *))
+{
+ struct rb_node **link = &tree->rb_node;
+ struct rb_node *parent = NULL;
+ int c;
+
+ while (*link) {
+ parent = *link;
+ c = cmp(node, parent);
+
+ if (c < 0)
+ link = &parent->rb_left;
+ else if (c > 0)
+ link = &parent->rb_right;
+ else
+ return parent;
+ }
+
+ rb_link_node_rcu(node, parent, link);
+ rb_insert_color(node, tree);
+ return NULL;
+}
+
+/**
* rb_find() - find @key in tree @tree
* @key: key to match
* @tree: tree to search
@@ -268,6 +304,37 @@ rb_find(const void *key, const struct rb
else
return node;
}
+
+ return NULL;
+}
+
+/**
+ * rb_find_rcu() - find @key in tree @tree
+ * @key: key to match
+ * @tree: tree to search
+ * @cmp: operator defining the node order
+ *
+ * Notably, tree descent vs concurrent tree rotations is unsound and can result
+ * in false-negatives.
+ *
+ * Returns the rb_node matching @key or NULL.
+ */
+static __always_inline struct rb_node *
+rb_find_rcu(const void *key, const struct rb_root *tree,
+ int (*cmp)(const void *key, const struct rb_node *))
+{
+ struct rb_node *node = tree->rb_node;
+
+ while (node) {
+ int c = cmp(key, node);
+
+ if (c < 0)
+ node = rcu_dereference_raw(node->rb_left);
+ else if (c > 0)
+ node = rcu_dereference_raw(node->rb_right);
+ else
+ return node;
+ }
return NULL;
}
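As a usage sketch, this is how the two helpers pair with a seqcount for
lockless lookups, along the lines of what the next patch does for the
uprobes tree; all example_* names, including the comparators, are
placeholders and not part of this patch:

	/* comparators over the embedded rb_node; bodies omitted */
	extern int example_cmp(struct rb_node *a, const struct rb_node *b);
	extern int example_cmp_key(const void *key, const struct rb_node *n);

	static DEFINE_RWLOCK(example_lock);
	static seqcount_rwlock_t example_seq = SEQCNT_RWLOCK_ZERO(example_seq, &example_lock);
	static struct rb_root example_tree = RB_ROOT;

	/* writer side: modify the tree under the lock and bump the seqcount */
	static void example_insert(struct rb_node *node)
	{
		write_lock(&example_lock);
		write_seqcount_begin(&example_seq);
		if (rb_find_add_rcu(node, &example_tree, example_cmp)) {
			/* an equivalent node already exists; not handled here */
		}
		write_seqcount_end(&example_seq);
		write_unlock(&example_lock);
	}

	/* reader side: lockless; only a negative lookup needs re-validation */
	static struct rb_node *example_find(const void *key)
	{
		struct rb_node *node;
		unsigned int seq;

		guard(rcu)();
		do {
			seq = read_seqcount_begin(&example_seq);
			node = rb_find_rcu(key, &example_tree, example_cmp_key);
			if (node)
				return node;
		} while (read_seqcount_retry(&example_seq, seq));

		return NULL;
	}

The retry loop only guards against the false-negatives mentioned in the
kerneldoc above; a hit can be returned immediately.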
* [PATCH 04/10] perf/uprobe: RCU-ify find_uprobe()
2024-07-08 9:12 [PATCH 00/10] perf/uprobe: Optimize uprobes Peter Zijlstra
` (2 preceding siblings ...)
2024-07-08 9:12 ` [PATCH 03/10] rbtree: Provide rb_find_rcu() / rb_find_add_rcu() Peter Zijlstra
@ 2024-07-08 9:12 ` Peter Zijlstra
2024-07-08 16:35 ` Oleg Nesterov
2024-07-08 9:12 ` [PATCH 05/10] perf/uprobe: SRCU-ify uprobe->consumer list Peter Zijlstra
` (6 subsequent siblings)
10 siblings, 1 reply; 84+ messages in thread
From: Peter Zijlstra @ 2024-07-08 9:12 UTC (permalink / raw)
To: mingo, andrii
Cc: linux-kernel, peterz, rostedt, mhiramat, oleg, jolsa, clm,
paulmck
With handle_swbp() triggering concurrently on (all) CPUs,
uprobes_treelock becomes a bottleneck. Avoid the treelock by doing RCU
lookups of the uprobe.
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
kernel/events/uprobes.c | 40 ++++++++++++++++++++++++++++++++--------
1 file changed, 32 insertions(+), 8 deletions(-)
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -40,6 +40,7 @@ static struct rb_root uprobes_tree = RB_
#define no_uprobe_events() RB_EMPTY_ROOT(&uprobes_tree)
static DEFINE_RWLOCK(uprobes_treelock); /* serialize rbtree access */
+static seqcount_rwlock_t uprobes_seqcount = SEQCNT_RWLOCK_ZERO(uprobes_seqcount, &uprobes_treelock);
#define UPROBES_HASH_SZ 13
/* serialize uprobe->pending_list */
@@ -54,6 +55,7 @@ DEFINE_STATIC_PERCPU_RWSEM(dup_mmap_sem)
struct uprobe {
struct rb_node rb_node; /* node in the rb tree */
refcount_t ref;
+ struct rcu_head rcu;
struct rw_semaphore register_rwsem;
struct rw_semaphore consumer_rwsem;
struct list_head pending_list;
@@ -593,6 +595,12 @@ static struct uprobe *get_uprobe(struct
return uprobe;
}
+static void uprobe_free_rcu(struct rcu_head *rcu)
+{
+ struct uprobe *uprobe = container_of(rcu, struct uprobe, rcu);
+ kfree(uprobe);
+}
+
static void put_uprobe(struct uprobe *uprobe)
{
if (refcount_dec_and_test(&uprobe->ref)) {
@@ -604,7 +612,7 @@ static void put_uprobe(struct uprobe *up
mutex_lock(&delayed_uprobe_lock);
delayed_uprobe_remove(uprobe, NULL);
mutex_unlock(&delayed_uprobe_lock);
- kfree(uprobe);
+ call_rcu(&uprobe->rcu, uprobe_free_rcu);
}
}
@@ -653,7 +661,7 @@ static struct uprobe *__find_uprobe(stru
.inode = inode,
.offset = offset,
};
- struct rb_node *node = rb_find(&key, &uprobes_tree, __uprobe_cmp_key);
+ struct rb_node *node = rb_find_rcu(&key, &uprobes_tree, __uprobe_cmp_key);
if (node)
return get_uprobe(__node_2_uprobe(node));
@@ -667,20 +675,32 @@ static struct uprobe *__find_uprobe(stru
*/
static struct uprobe *find_uprobe(struct inode *inode, loff_t offset)
{
- struct uprobe *uprobe;
+ unsigned int seq;
- read_lock(&uprobes_treelock);
- uprobe = __find_uprobe(inode, offset);
- read_unlock(&uprobes_treelock);
+ guard(rcu)();
- return uprobe;
+ do {
+ seq = read_seqcount_begin(&uprobes_seqcount);
+ struct uprobe *uprobe = __find_uprobe(inode, offset);
+ if (uprobe) {
+ /*
+ * Lockless RB-tree lookups are prone to false-negatives.
+ * If they find something, it's good. If they do not find,
+ * it needs to be validated.
+ */
+ return uprobe;
+ }
+ } while (read_seqcount_retry(&uprobes_seqcount, seq));
+
+ /* Really didn't find anything. */
+ return NULL;
}
static struct uprobe *__insert_uprobe(struct uprobe *uprobe)
{
struct rb_node *node;
- node = rb_find_add(&uprobe->rb_node, &uprobes_tree, __uprobe_cmp);
+ node = rb_find_add_rcu(&uprobe->rb_node, &uprobes_tree, __uprobe_cmp);
if (node)
return get_uprobe(__node_2_uprobe(node));
@@ -702,7 +722,9 @@ static struct uprobe *insert_uprobe(stru
struct uprobe *u;
write_lock(&uprobes_treelock);
+ write_seqcount_begin(&uprobes_seqcount);
u = __insert_uprobe(uprobe);
+ write_seqcount_end(&uprobes_seqcount);
write_unlock(&uprobes_treelock);
return u;
@@ -936,7 +958,9 @@ static void delete_uprobe(struct uprobe
return;
write_lock(&uprobes_treelock);
+ write_seqcount_begin(&uprobes_seqcount);
rb_erase(&uprobe->rb_node, &uprobes_tree);
+ write_seqcount_end(&uprobes_seqcount);
write_unlock(&uprobes_treelock);
RB_CLEAR_NODE(&uprobe->rb_node); /* for uprobe_is_active() */
put_uprobe(uprobe);
* [PATCH 05/10] perf/uprobe: SRCU-ify uprobe->consumer list
2024-07-08 9:12 [PATCH 00/10] perf/uprobe: Optimize uprobes Peter Zijlstra
` (3 preceding siblings ...)
2024-07-08 9:12 ` [PATCH 04/10] perf/uprobe: RCU-ify find_uprobe() Peter Zijlstra
@ 2024-07-08 9:12 ` Peter Zijlstra
2024-07-09 12:05 ` Peter Zijlstra
2024-07-08 9:12 ` [PATCH 06/10] perf/uprobe: Split uprobe_unregister() Peter Zijlstra
` (5 subsequent siblings)
10 siblings, 1 reply; 84+ messages in thread
From: Peter Zijlstra @ 2024-07-08 9:12 UTC (permalink / raw)
To: mingo, andrii
Cc: linux-kernel, peterz, rostedt, mhiramat, oleg, jolsa, clm,
paulmck
With handle_swbp() hitting concurrently on (all) CPUs the
uprobe->register_rwsem can get very contended. Replace it with
SRCU+mutex.
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
kernel/events/uprobes.c | 37 +++++++++++++++++++++++--------------
1 file changed, 23 insertions(+), 14 deletions(-)
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -26,6 +26,7 @@
#include <linux/task_work.h>
#include <linux/shmem_fs.h>
#include <linux/khugepaged.h>
+#include <linux/srcu.h>
#include <linux/uprobes.h>
@@ -49,6 +50,8 @@ static struct mutex uprobes_mmap_mutex[U
DEFINE_STATIC_PERCPU_RWSEM(dup_mmap_sem);
+DEFINE_STATIC_SRCU(uprobes_srcu);
+
/* Have a copy of original instruction */
#define UPROBE_COPY_INSN 0
@@ -56,7 +59,7 @@ struct uprobe {
struct rb_node rb_node; /* node in the rb tree */
refcount_t ref;
struct rcu_head rcu;
- struct rw_semaphore register_rwsem;
+ struct mutex register_mutex;
struct rw_semaphore consumer_rwsem;
struct list_head pending_list;
struct uprobe_consumer *consumers;
@@ -750,7 +753,7 @@ static struct uprobe *alloc_uprobe(struc
uprobe->inode = inode;
uprobe->offset = offset;
uprobe->ref_ctr_offset = ref_ctr_offset;
- init_rwsem(&uprobe->register_rwsem);
+ mutex_init(&uprobe->register_mutex);
init_rwsem(&uprobe->consumer_rwsem);
/* add to uprobes_tree, sorted on inode:offset */
@@ -1129,10 +1132,12 @@ void uprobe_unregister(struct inode *ino
if (WARN_ON(!uprobe))
return;
- down_write(&uprobe->register_rwsem);
+ mutex_lock(&uprobe->register_mutex);
__uprobe_unregister(uprobe, uc);
- up_write(&uprobe->register_rwsem);
+ mutex_unlock(&uprobe->register_mutex);
put_uprobe(uprobe);
+
+ synchronize_srcu(&uprobes_srcu);
}
EXPORT_SYMBOL_GPL(uprobe_unregister);
@@ -1192,7 +1197,7 @@ static int __uprobe_register(struct inod
* We can race with uprobe_unregister()->delete_uprobe().
* Check uprobe_is_active() and retry if it is false.
*/
- down_write(&uprobe->register_rwsem);
+ mutex_lock(&uprobe->register_mutex);
ret = -EAGAIN;
if (likely(uprobe_is_active(uprobe))) {
consumer_add(uprobe, uc);
@@ -1200,7 +1205,7 @@ static int __uprobe_register(struct inod
if (ret)
__uprobe_unregister(uprobe, uc);
}
- up_write(&uprobe->register_rwsem);
+ mutex_unlock(&uprobe->register_mutex);
put_uprobe(uprobe);
if (unlikely(ret == -EAGAIN))
@@ -1240,12 +1245,12 @@ int uprobe_apply(struct inode *inode, lo
if (WARN_ON(!uprobe))
return ret;
- down_write(&uprobe->register_rwsem);
+ mutex_lock(&uprobe->register_mutex);
for (con = uprobe->consumers; con && con != uc ; con = con->next)
;
if (con)
ret = register_for_each_vma(uprobe, add ? uc : NULL);
- up_write(&uprobe->register_rwsem);
+ mutex_unlock(&uprobe->register_mutex);
put_uprobe(uprobe);
return ret;
@@ -2087,14 +2092,19 @@ static struct uprobe *find_active_uprobe
return uprobe;
}
+#define for_each_consumer_rcu(pos, head) \
+ for (pos = rcu_dereference_raw(head); pos; \
+ pos = rcu_dereference_raw(pos->next))
+
static void handler_chain(struct uprobe *uprobe, struct pt_regs *regs)
{
struct uprobe_consumer *uc;
int remove = UPROBE_HANDLER_REMOVE;
bool need_prep = false; /* prepare return uprobe, when needed */
- down_read(&uprobe->register_rwsem);
- for (uc = uprobe->consumers; uc; uc = uc->next) {
+ guard(srcu)(&uprobes_srcu);
+
+ for_each_consumer_rcu(uc, uprobe->consumers) {
int rc = 0;
if (uc->handler) {
@@ -2116,7 +2126,6 @@ static void handler_chain(struct uprobe
WARN_ON(!uprobe_is_active(uprobe));
unapply_uprobe(uprobe, current->mm);
}
- up_read(&uprobe->register_rwsem);
}
static void
@@ -2125,12 +2134,12 @@ handle_uretprobe_chain(struct return_ins
struct uprobe *uprobe = ri->uprobe;
struct uprobe_consumer *uc;
- down_read(&uprobe->register_rwsem);
- for (uc = uprobe->consumers; uc; uc = uc->next) {
+ guard(srcu)(&uprobes_srcu);
+
+ for_each_consumer_rcu(uc, uprobe->consumers) {
if (uc->ret_handler)
uc->ret_handler(uc, ri->func, regs);
}
- up_read(&uprobe->register_rwsem);
}
static struct return_instance *find_next_ret_chain(struct return_instance *ri)
* [PATCH 06/10] perf/uprobe: Split uprobe_unregister()
2024-07-08 9:12 [PATCH 00/10] perf/uprobe: Optimize uprobes Peter Zijlstra
` (4 preceding siblings ...)
2024-07-08 9:12 ` [PATCH 05/10] perf/uprobe: SRCU-ify uprobe->consumer list Peter Zijlstra
@ 2024-07-08 9:12 ` Peter Zijlstra
2024-07-10 1:40 ` Masami Hiramatsu
2024-07-08 9:12 ` [PATCH 07/10] perf/uprobe: Convert (some) uprobe->refcount to SRCU Peter Zijlstra
` (4 subsequent siblings)
10 siblings, 1 reply; 84+ messages in thread
From: Peter Zijlstra @ 2024-07-08 9:12 UTC (permalink / raw)
To: mingo, andrii
Cc: linux-kernel, peterz, rostedt, mhiramat, oleg, jolsa, clm,
paulmck
With uprobe_unregister() having grown a synchronize_srcu(), it becomes
fairly slow to call. Esp. since both users of this API call it in a
loop.
Peel off the sync_srcu() and do it once, after the loop.
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
include/linux/uprobes.h | 8 ++++++--
kernel/events/uprobes.c | 8 ++++++--
kernel/trace/bpf_trace.c | 6 ++++--
kernel/trace/trace_uprobe.c | 6 +++++-
4 files changed, 21 insertions(+), 7 deletions(-)
--- a/include/linux/uprobes.h
+++ b/include/linux/uprobes.h
@@ -113,7 +113,8 @@ extern int uprobe_write_opcode(struct ar
extern int uprobe_register(struct inode *inode, loff_t offset, struct uprobe_consumer *uc);
extern int uprobe_register_refctr(struct inode *inode, loff_t offset, loff_t ref_ctr_offset, struct uprobe_consumer *uc);
extern int uprobe_apply(struct inode *inode, loff_t offset, struct uprobe_consumer *uc, bool);
-extern void uprobe_unregister(struct inode *inode, loff_t offset, struct uprobe_consumer *uc);
+extern void uprobe_unregister_nosync(struct inode *inode, loff_t offset, struct uprobe_consumer *uc);
+extern void uprobe_unregister_sync(void);
extern int uprobe_mmap(struct vm_area_struct *vma);
extern void uprobe_munmap(struct vm_area_struct *vma, unsigned long start, unsigned long end);
extern void uprobe_start_dup_mmap(void);
@@ -163,7 +164,10 @@ uprobe_apply(struct inode *inode, loff_t
return -ENOSYS;
}
static inline void
-uprobe_unregister(struct inode *inode, loff_t offset, struct uprobe_consumer *uc)
+uprobe_unregister_nosync(struct inode *inode, loff_t offset, struct uprobe_consumer *uc)
+{
+}
+static inline void uprobe_unregister_sync(void)
{
}
static inline int uprobe_mmap(struct vm_area_struct *vma)
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -1126,7 +1126,7 @@ __uprobe_unregister(struct uprobe *uprob
* @offset: offset from the start of the file.
* @uc: identify which probe if multiple probes are colocated.
*/
-void uprobe_unregister(struct inode *inode, loff_t offset, struct uprobe_consumer *uc)
+void uprobe_unregister_nosync(struct inode *inode, loff_t offset, struct uprobe_consumer *uc)
{
struct uprobe *uprobe;
@@ -1138,10 +1138,14 @@ void uprobe_unregister(struct inode *ino
__uprobe_unregister(uprobe, uc);
mutex_unlock(&uprobe->register_mutex);
put_uprobe(uprobe);
+}
+EXPORT_SYMBOL_GPL(uprobe_unregister_nosync);
+void uprobe_unregister_sync(void)
+{
synchronize_srcu(&uprobes_srcu);
}
-EXPORT_SYMBOL_GPL(uprobe_unregister);
+EXPORT_SYMBOL_GPL(uprobe_unregister_sync);
/*
* __uprobe_register - register a probe
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -3181,9 +3181,11 @@ static void bpf_uprobe_unregister(struct
u32 i;
for (i = 0; i < cnt; i++) {
- uprobe_unregister(d_real_inode(path->dentry), uprobes[i].offset,
- &uprobes[i].consumer);
+ uprobe_unregister_nosync(d_real_inode(path->dentry), uprobes[i].offset,
+ &uprobes[i].consumer);
}
+ if (cnt)
+ uprobe_unregister_sync();
}
static void bpf_uprobe_multi_link_release(struct bpf_link *link)
--- a/kernel/trace/trace_uprobe.c
+++ b/kernel/trace/trace_uprobe.c
@@ -1104,6 +1104,7 @@ static int trace_uprobe_enable(struct tr
static void __probe_event_disable(struct trace_probe *tp)
{
struct trace_uprobe *tu;
+ bool sync = false;
tu = container_of(tp, struct trace_uprobe, tp);
WARN_ON(!uprobe_filter_is_empty(tu->tp.event->filter));
@@ -1112,9 +1113,12 @@ static void __probe_event_disable(struct
if (!tu->inode)
continue;
- uprobe_unregister(tu->inode, tu->offset, &tu->consumer);
+ uprobe_unregister_nosync(tu->inode, tu->offset, &tu->consumer);
+ sync = true;
tu->inode = NULL;
}
+ if (sync)
+ uprobe_unregister_sync();
}
static int probe_event_enable(struct trace_event_call *call,
* [PATCH 07/10] perf/uprobe: Convert (some) uprobe->refcount to SRCU
2024-07-08 9:12 [PATCH 00/10] perf/uprobe: Optimize uprobes Peter Zijlstra
` (5 preceding siblings ...)
2024-07-08 9:12 ` [PATCH 06/10] perf/uprobe: Split uprobe_unregister() Peter Zijlstra
@ 2024-07-08 9:12 ` Peter Zijlstra
2024-07-08 9:12 ` [PATCH 08/10] srcu: Add __srcu_clone_read_lock() Peter Zijlstra
` (3 subsequent siblings)
10 siblings, 0 replies; 84+ messages in thread
From: Peter Zijlstra @ 2024-07-08 9:12 UTC (permalink / raw)
To: mingo, andrii
Cc: linux-kernel, peterz, rostedt, mhiramat, oleg, jolsa, clm,
paulmck
With handle_swbp() hitting concurrently on (all) CPUs, potentially on
the same uprobe, the uprobe->refcount can get *very* hot. Move the
struct uprobe lifetime into uprobes_srcu such that it covers both the
uprobe and the uprobe->consumers list.
With this, handle_swbp() can use a single large SRCU critical section
to avoid taking a refcount on the uprobe for its duration.
Notably, the single-step and uretprobe paths need a reference that
leaves handle_swbp() and will, for now, still use ->refcount.
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
kernel/events/uprobes.c | 31 +++++++++++++++----------------
1 file changed, 15 insertions(+), 16 deletions(-)
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -615,7 +615,7 @@ static void put_uprobe(struct uprobe *up
mutex_lock(&delayed_uprobe_lock);
delayed_uprobe_remove(uprobe, NULL);
mutex_unlock(&delayed_uprobe_lock);
- call_rcu(&uprobe->rcu, uprobe_free_rcu);
+ call_srcu(&uprobes_srcu, &uprobe->rcu, uprobe_free_rcu);
}
}
@@ -667,7 +667,7 @@ static struct uprobe *__find_uprobe(stru
struct rb_node *node = rb_find_rcu(&key, &uprobes_tree, __uprobe_cmp_key);
if (node)
- return get_uprobe(__node_2_uprobe(node));
+ return __node_2_uprobe(node);
return NULL;
}
@@ -680,7 +680,7 @@ static struct uprobe *find_uprobe(struct
{
unsigned int seq;
- guard(rcu)();
+ lockdep_assert(srcu_read_lock_held(&uprobes_srcu));
do {
seq = read_seqcount_begin(&uprobes_seqcount);
@@ -1130,6 +1130,8 @@ void uprobe_unregister_nosync(struct ino
{
struct uprobe *uprobe;
+ guard(srcu)(&uprobes_srcu);
+
uprobe = find_uprobe(inode, offset);
if (WARN_ON(!uprobe))
return;
@@ -1137,7 +1139,6 @@ void uprobe_unregister_nosync(struct ino
mutex_lock(&uprobe->register_mutex);
__uprobe_unregister(uprobe, uc);
mutex_unlock(&uprobe->register_mutex);
- put_uprobe(uprobe);
}
EXPORT_SYMBOL_GPL(uprobe_unregister_nosync);
@@ -1247,6 +1248,8 @@ int uprobe_apply(struct inode *inode, lo
struct uprobe_consumer *con;
int ret = -ENOENT;
+ guard(srcu)(&uprobes_srcu);
+
uprobe = find_uprobe(inode, offset);
if (WARN_ON(!uprobe))
return ret;
@@ -1257,7 +1260,6 @@ int uprobe_apply(struct inode *inode, lo
if (con)
ret = register_for_each_vma(uprobe, add ? uc : NULL);
mutex_unlock(&uprobe->register_mutex);
- put_uprobe(uprobe);
return ret;
}
@@ -1977,7 +1979,7 @@ pre_ssout(struct uprobe *uprobe, struct
return err;
}
- utask->active_uprobe = uprobe;
+ utask->active_uprobe = get_uprobe(uprobe);
utask->state = UTASK_SSTEP;
return 0;
}
@@ -2108,7 +2110,7 @@ static void handler_chain(struct uprobe
int remove = UPROBE_HANDLER_REMOVE;
bool need_prep = false; /* prepare return uprobe, when needed */
- guard(srcu)(&uprobes_srcu);
+ lockdep_assert(srcu_read_lock_held(&uprobes_srcu));
for_each_consumer_rcu(uc, uprobe->consumers) {
int rc = 0;
@@ -2227,6 +2229,8 @@ static void handle_swbp(struct pt_regs *
if (bp_vaddr == get_trampoline_vaddr())
return handle_trampoline(regs);
+ guard(srcu)(&uprobes_srcu);
+
uprobe = find_active_uprobe(bp_vaddr, &is_swbp);
if (!uprobe) {
if (is_swbp > 0) {
@@ -2255,7 +2259,7 @@ static void handle_swbp(struct pt_regs *
* new and not-yet-analyzed uprobe at the same address, restart.
*/
if (unlikely(!test_bit(UPROBE_COPY_INSN, &uprobe->flags)))
- goto out;
+ return;
/*
* Pairs with the smp_wmb() in prepare_uprobe().
@@ -2268,22 +2272,17 @@ static void handle_swbp(struct pt_regs *
/* Tracing handlers use ->utask to communicate with fetch methods */
if (!get_utask())
- goto out;
+ return;
if (arch_uprobe_ignore(&uprobe->arch, regs))
- goto out;
+ return;
handler_chain(uprobe, regs);
if (arch_uprobe_skip_sstep(&uprobe->arch, regs))
- goto out;
-
- if (!pre_ssout(uprobe, regs, bp_vaddr))
return;
- /* arch_uprobe_skip_sstep() succeeded, or restart if can't singlestep */
-out:
- put_uprobe(uprobe);
+ pre_ssout(uprobe, regs, bp_vaddr);
}
/*
* [PATCH 08/10] srcu: Add __srcu_clone_read_lock()
2024-07-08 9:12 [PATCH 00/10] perf/uprobe: Optimize uprobes Peter Zijlstra
` (6 preceding siblings ...)
2024-07-08 9:12 ` [PATCH 07/10] perf/uprobe: Convert (some) uprobe->refcount to SRCU Peter Zijlstra
@ 2024-07-08 9:12 ` Peter Zijlstra
2024-07-10 5:48 ` Boqun Feng
2024-07-08 9:12 ` [PATCH 09/10] perf/uprobe: Convert single-step and uretprobe to SRCU Peter Zijlstra
` (2 subsequent siblings)
10 siblings, 1 reply; 84+ messages in thread
From: Peter Zijlstra @ 2024-07-08 9:12 UTC (permalink / raw)
To: mingo, andrii
Cc: linux-kernel, peterz, rostedt, mhiramat, oleg, jolsa, clm,
paulmck
In order to support carrying an srcu_read_lock() section across fork,
where both the parent and the child process will do srcu_read_unlock(),
the extra decrement needs to be balanced with an extra increment at
fork time.
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
include/linux/srcu.h | 1 +
include/linux/srcutiny.h | 10 ++++++++++
kernel/rcu/srcutree.c | 5 +++++
3 files changed, 16 insertions(+)
--- a/include/linux/srcu.h
+++ b/include/linux/srcu.h
@@ -55,6 +55,7 @@ void call_srcu(struct srcu_struct *ssp,
void (*func)(struct rcu_head *head));
void cleanup_srcu_struct(struct srcu_struct *ssp);
int __srcu_read_lock(struct srcu_struct *ssp) __acquires(ssp);
+void __srcu_clone_read_lock(struct srcu_struct *ssp, int idx);
void __srcu_read_unlock(struct srcu_struct *ssp, int idx) __releases(ssp);
void synchronize_srcu(struct srcu_struct *ssp);
unsigned long get_state_synchronize_srcu(struct srcu_struct *ssp);
--- a/include/linux/srcutiny.h
+++ b/include/linux/srcutiny.h
@@ -71,6 +71,16 @@ static inline int __srcu_read_lock(struc
return idx;
}
+static inline void __srcu_clone_read_lock(struct srcu_struct *ssp, int idx)
+{
+ int newval;
+
+ preempt_disable(); // Needed for PREEMPT_AUTO
+ newval = READ_ONCE(ssp->srcu_lock_nesting[idx]) + 1;
+ WRITE_ONCE(ssp->srcu_lock_nesting[idx], newval);
+ preempt_enable();
+}
+
static inline void synchronize_srcu_expedited(struct srcu_struct *ssp)
{
synchronize_srcu(ssp);
--- a/kernel/rcu/srcutree.c
+++ b/kernel/rcu/srcutree.c
@@ -720,6 +720,11 @@ int __srcu_read_lock(struct srcu_struct
}
EXPORT_SYMBOL_GPL(__srcu_read_lock);
+void __srcu_clone_read_lock(struct srcu_struct *ssp, int idx)
+{
+ this_cpu_inc(ssp->sda->srcu_lock_count[idx].counter);
+}
+
/*
* Removes the count for the old reader from the appropriate per-CPU
* element of the srcu_struct. Note that this may well be a different
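As a usage sketch, the intended pattern around fork looks roughly like the
below; the uretprobes_srcu domain and the srcu_idx field are what the next
patch adds, while the example_* helpers are illustrative only:

	/* probe hit in the parent: pin the object via SRCU instead of a refcount */
	static void example_pin(struct return_instance *ri)
	{
		ri->srcu_idx = srcu_read_lock(&uretprobes_srcu);
	}

	/* fork: the child inherits a copy of the instance and will eventually do
	 * its own srcu_read_unlock(), so account for that with an extra increment */
	static void example_clone(struct return_instance *child_ri,
				  const struct return_instance *parent_ri)
	{
		child_ri->srcu_idx = parent_ri->srcu_idx;
		__srcu_clone_read_lock(&uretprobes_srcu, child_ri->srcu_idx);
	}

	/* teardown in either task: each copy drops its own 'reference' */
	static void example_unpin(struct return_instance *ri)
	{
		srcu_read_unlock(&uretprobes_srcu, ri->srcu_idx);
	}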
* [PATCH 09/10] perf/uprobe: Convert single-step and uretprobe to SRCU
2024-07-08 9:12 [PATCH 00/10] perf/uprobe: Optimize uprobes Peter Zijlstra
` (7 preceding siblings ...)
2024-07-08 9:12 ` [PATCH 08/10] srcu: Add __srcu_clone_read_lock() Peter Zijlstra
@ 2024-07-08 9:12 ` Peter Zijlstra
2024-07-08 9:12 ` [PATCH 10/10] perf/uprobe: Add uretprobe timer Peter Zijlstra
2024-07-08 22:56 ` [PATCH 00/10] perf/uprobe: Optimize uprobes Masami Hiramatsu
10 siblings, 0 replies; 84+ messages in thread
From: Peter Zijlstra @ 2024-07-08 9:12 UTC (permalink / raw)
To: mingo, andrii
Cc: linux-kernel, peterz, rostedt, mhiramat, oleg, jolsa, clm,
paulmck
Both single-step and uretprobes take a refcount on struct uprobe in
handle_swbp() in order to ensure struct uprobe stays extant until a
next trap.
Since uprobe_unregister() only cares about the uprobe_consumer
life-time, and these intra-trap sections can be arbitrarily large,
create a second SRCU domain to cover these.
Notably, a uretprobe with a registered return_instance that never
triggers -- because userspace -- will currently pin the
return_instance and related uprobe until the task dies. With this
conversion to SRCU, this behaviour will inhibit freeing of all uprobes.
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
include/linux/uprobes.h | 2 ++
kernel/events/uprobes.c | 23 ++++++++++++++++-------
2 files changed, 18 insertions(+), 7 deletions(-)
--- a/include/linux/uprobes.h
+++ b/include/linux/uprobes.h
@@ -78,6 +78,7 @@ struct uprobe_task {
struct return_instance *return_instances;
unsigned int depth;
+ unsigned int active_srcu_idx;
};
struct return_instance {
@@ -86,6 +87,7 @@ struct return_instance {
unsigned long stack; /* stack pointer */
unsigned long orig_ret_vaddr; /* original return address */
bool chained; /* true, if instance is nested */
+ int srcu_idx;
struct return_instance *next; /* keep as stack */
};
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -51,6 +51,7 @@ static struct mutex uprobes_mmap_mutex[U
DEFINE_STATIC_PERCPU_RWSEM(dup_mmap_sem);
DEFINE_STATIC_SRCU(uprobes_srcu);
+DEFINE_STATIC_SRCU(uretprobes_srcu);
/* Have a copy of original instruction */
#define UPROBE_COPY_INSN 0
@@ -598,12 +599,18 @@ static struct uprobe *get_uprobe(struct
return uprobe;
}
-static void uprobe_free_rcu(struct rcu_head *rcu)
+static void uprobe_free_stage2(struct rcu_head *rcu)
{
struct uprobe *uprobe = container_of(rcu, struct uprobe, rcu);
kfree(uprobe);
}
+static void uprobe_free_stage1(struct rcu_head *rcu)
+{
+ struct uprobe *uprobe = container_of(rcu, struct uprobe, rcu);
+ call_srcu(&uretprobes_srcu, &uprobe->rcu, uprobe_free_stage2);
+}
+
static void put_uprobe(struct uprobe *uprobe)
{
if (refcount_dec_and_test(&uprobe->ref)) {
@@ -615,7 +622,7 @@ static void put_uprobe(struct uprobe *up
mutex_lock(&delayed_uprobe_lock);
delayed_uprobe_remove(uprobe, NULL);
mutex_unlock(&delayed_uprobe_lock);
- call_srcu(&uprobes_srcu, &uprobe->rcu, uprobe_free_rcu);
+ call_srcu(&uprobes_srcu, &uprobe->rcu, uprobe_free_stage1);
}
}
@@ -1735,7 +1742,7 @@ unsigned long uprobe_get_trap_addr(struc
static struct return_instance *free_ret_instance(struct return_instance *ri)
{
struct return_instance *next = ri->next;
- put_uprobe(ri->uprobe);
+ srcu_read_unlock(&uretprobes_srcu, ri->srcu_idx);
kfree(ri);
return next;
}
@@ -1753,7 +1760,7 @@ void uprobe_free_utask(struct task_struc
return;
if (utask->active_uprobe)
- put_uprobe(utask->active_uprobe);
+ srcu_read_unlock(&uretprobes_srcu, utask->active_srcu_idx);
ri = utask->return_instances;
while (ri)
@@ -1796,7 +1803,7 @@ static int dup_utask(struct task_struct
return -ENOMEM;
*n = *o;
- get_uprobe(n->uprobe);
+ __srcu_clone_read_lock(&uretprobes_srcu, n->srcu_idx);
n->next = NULL;
*p = n;
@@ -1939,7 +1946,8 @@ static void prepare_uretprobe(struct upr
orig_ret_vaddr = utask->return_instances->orig_ret_vaddr;
}
- ri->uprobe = get_uprobe(uprobe);
+ ri->srcu_idx = srcu_read_lock(&uretprobes_srcu);
+ ri->uprobe = uprobe;
ri->func = instruction_pointer(regs);
ri->stack = user_stack_pointer(regs);
ri->orig_ret_vaddr = orig_ret_vaddr;
@@ -1979,7 +1987,8 @@ pre_ssout(struct uprobe *uprobe, struct
return err;
}
- utask->active_uprobe = get_uprobe(uprobe);
+ utask->active_srcu_idx = srcu_read_lock(&uretprobes_srcu);
+ utask->active_uprobe = uprobe;
utask->state = UTASK_SSTEP;
return 0;
}
* [PATCH 10/10] perf/uprobe: Add uretprobe timer
2024-07-08 9:12 [PATCH 00/10] perf/uprobe: Optimize uprobes Peter Zijlstra
` (8 preceding siblings ...)
2024-07-08 9:12 ` [PATCH 09/10] perf/uprobe: Convert single-step and uretprobe to SRCU Peter Zijlstra
@ 2024-07-08 9:12 ` Peter Zijlstra
2024-07-08 22:56 ` [PATCH 00/10] perf/uprobe: Optimize uprobes Masami Hiramatsu
10 siblings, 0 replies; 84+ messages in thread
From: Peter Zijlstra @ 2024-07-08 9:12 UTC (permalink / raw)
To: mingo, andrii
Cc: linux-kernel, peterz, rostedt, mhiramat, oleg, jolsa, clm,
paulmck
In order to put a bound on the uretprobe_srcu critical section, add a
timer to uprobe_task. Every time an RI is added or removed, the timer is
pushed forward to now + 1s. If the timer were ever to fire, it would
convert the SRCU 'reference' to a refcount reference if possible.
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
include/linux/uprobes.h | 8 +++++
kernel/events/uprobes.c | 66 ++++++++++++++++++++++++++++++++++++++++++++----
2 files changed, 68 insertions(+), 6 deletions(-)
--- a/include/linux/uprobes.h
+++ b/include/linux/uprobes.h
@@ -15,6 +15,7 @@
#include <linux/rbtree.h>
#include <linux/types.h>
#include <linux/wait.h>
+#include <linux/timer.h>
struct vm_area_struct;
struct mm_struct;
@@ -79,6 +80,10 @@ struct uprobe_task {
struct return_instance *return_instances;
unsigned int depth;
unsigned int active_srcu_idx;
+
+ struct timer_list ri_timer;
+ struct callback_head ri_task_work;
+ struct task_struct *task;
};
struct return_instance {
@@ -86,7 +91,8 @@ struct return_instance {
unsigned long func;
unsigned long stack; /* stack pointer */
unsigned long orig_ret_vaddr; /* original return address */
- bool chained; /* true, if instance is nested */
+ u8 chained; /* true, if instance is nested */
+ u8 has_ref;
int srcu_idx;
struct return_instance *next; /* keep as stack */
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -1742,7 +1742,12 @@ unsigned long uprobe_get_trap_addr(struc
static struct return_instance *free_ret_instance(struct return_instance *ri)
{
struct return_instance *next = ri->next;
- srcu_read_unlock(&uretprobes_srcu, ri->srcu_idx);
+ if (ri->uprobe) {
+ if (ri->has_ref)
+ put_uprobe(ri->uprobe);
+ else
+ srcu_read_unlock(&uretprobes_srcu, ri->srcu_idx);
+ }
kfree(ri);
return next;
}
@@ -1766,11 +1771,48 @@ void uprobe_free_utask(struct task_struc
while (ri)
ri = free_ret_instance(ri);
+ timer_delete_sync(&utask->ri_timer);
+ task_work_cancel(utask->task, &utask->ri_task_work);
xol_free_insn_slot(t);
kfree(utask);
t->utask = NULL;
}
+static void return_instance_task_work(struct callback_head *head)
+{
+ struct uprobe_task *utask = container_of(head, struct uprobe_task, ri_task_work);
+ struct return_instance *ri;
+
+ for (ri = utask->return_instances; ri; ri = ri->next) {
+ if (!ri->uprobe)
+ continue;
+ if (ri->has_ref)
+ continue;
+ if (refcount_inc_not_zero(&ri->uprobe->ref))
+ ri->has_ref = true;
+ else
+ ri->uprobe = NULL;
+ srcu_read_unlock(&uretprobes_srcu, ri->srcu_idx);
+ }
+}
+
+static void return_instance_timer(struct timer_list *timer)
+{
+ struct uprobe_task *utask = container_of(timer, struct uprobe_task, ri_timer);
+ task_work_add(utask->task, &utask->ri_task_work, TWA_SIGNAL);
+}
+
+static struct uprobe_task *alloc_utask(struct task_struct *task)
+{
+ struct uprobe_task *utask = kzalloc(sizeof(struct uprobe_task), GFP_KERNEL);
+ if (!utask)
+ return NULL;
+ timer_setup(&utask->ri_timer, return_instance_timer, 0);
+ init_task_work(&utask->ri_task_work, return_instance_task_work);
+ utask->task = task;
+ return utask;
+}
+
/*
* Allocate a uprobe_task object for the task if necessary.
* Called when the thread hits a breakpoint.
@@ -1782,7 +1824,7 @@ void uprobe_free_utask(struct task_struc
static struct uprobe_task *get_utask(void)
{
if (!current->utask)
- current->utask = kzalloc(sizeof(struct uprobe_task), GFP_KERNEL);
+ current->utask = alloc_utask(current);
return current->utask;
}
@@ -1791,7 +1833,7 @@ static int dup_utask(struct task_struct
struct uprobe_task *n_utask;
struct return_instance **p, *o, *n;
- n_utask = kzalloc(sizeof(struct uprobe_task), GFP_KERNEL);
+ n_utask = alloc_utask(t);
if (!n_utask)
return -ENOMEM;
t->utask = n_utask;
@@ -1803,13 +1845,20 @@ static int dup_utask(struct task_struct
return -ENOMEM;
*n = *o;
- __srcu_clone_read_lock(&uretprobes_srcu, n->srcu_idx);
+ if (n->uprobe) {
+ if (n->has_ref)
+ get_uprobe(n->uprobe);
+ else
+ __srcu_clone_read_lock(&uretprobes_srcu, n->srcu_idx);
+ }
n->next = NULL;
*p = n;
p = &n->next;
n_utask->depth++;
}
+ if (n_utask->return_instances)
+ mod_timer(&n_utask->ri_timer, jiffies + HZ);
return 0;
}
@@ -1957,6 +2006,8 @@ static void prepare_uretprobe(struct upr
ri->next = utask->return_instances;
utask->return_instances = ri;
+ mod_timer(&utask->ri_timer, jiffies + HZ);
+
return;
fail:
kfree(ri);
@@ -2151,6 +2202,9 @@ handle_uretprobe_chain(struct return_ins
struct uprobe *uprobe = ri->uprobe;
struct uprobe_consumer *uc;
+ if (!uprobe)
+ return;
+
guard(srcu)(&uprobes_srcu);
for_each_consumer_rcu(uc, uprobe->consumers) {
@@ -2197,8 +2251,10 @@ static void handle_trampoline(struct pt_
instruction_pointer_set(regs, ri->orig_ret_vaddr);
do {
- if (valid)
+ if (valid) {
handle_uretprobe_chain(ri, regs);
+ mod_timer(&utask->ri_timer, jiffies + HZ);
+ }
ri = free_ret_instance(ri);
utask->depth--;
} while (ri != next);
* Re: [PATCH 04/10] perf/uprobe: RCU-ify find_uprobe()
2024-07-08 9:12 ` [PATCH 04/10] perf/uprobe: RCU-ify find_uprobe() Peter Zijlstra
@ 2024-07-08 16:35 ` Oleg Nesterov
2024-07-08 18:08 ` Peter Zijlstra
0 siblings, 1 reply; 84+ messages in thread
From: Oleg Nesterov @ 2024-07-08 16:35 UTC (permalink / raw)
To: Peter Zijlstra
Cc: mingo, andrii, linux-kernel, rostedt, mhiramat, jolsa, clm,
paulmck
I hate to say this again, but I'll try to read this series later ;)
But let me ask...
On 07/08, Peter Zijlstra wrote:
>
> +static void uprobe_free_rcu(struct rcu_head *rcu)
> +{
> + struct uprobe *uprobe = container_of(rcu, struct uprobe, rcu);
> + kfree(uprobe);
> +}
> +
> static void put_uprobe(struct uprobe *uprobe)
> {
> if (refcount_dec_and_test(&uprobe->ref)) {
> @@ -604,7 +612,7 @@ static void put_uprobe(struct uprobe *up
> mutex_lock(&delayed_uprobe_lock);
> delayed_uprobe_remove(uprobe, NULL);
> mutex_unlock(&delayed_uprobe_lock);
> - kfree(uprobe);
> + call_rcu(&uprobe->rcu, uprobe_free_rcu);
kfree_rcu() ?
> static struct uprobe *find_uprobe(struct inode *inode, loff_t offset)
> {
> - struct uprobe *uprobe;
> + unsigned int seq;
>
> - read_lock(&uprobes_treelock);
> - uprobe = __find_uprobe(inode, offset);
> - read_unlock(&uprobes_treelock);
> + guard(rcu)();
>
> - return uprobe;
> + do {
> + seq = read_seqcount_begin(&uprobes_seqcount);
> + struct uprobe *uprobe = __find_uprobe(inode, offset);
> + if (uprobe) {
> + /*
> + * Lockless RB-tree lookups are prone to false-negatives.
> + * If they find something, it's good.
Is it true in this case?
Suppose we have uprobe U which has no extra refs, so uprobe_unregister()
called by the task X should remove it from uprobes_tree and kfree.
Suppose that the task T hits the breakpoint and enters handle_swbp().
Now,
- X calls find_uprobe(), this increments U->ref from 1 to 2
register_for_each_vma() succeeds
X enters delete_uprobe()
- T calls find_active_uprobe() -> find_uprobe()
read_seqcount_begin() returns an even number
__find_uprobe() -> rb_find_rcu() succeeds
- X continues and returns from delete_uprobe(), U->ref == 1
then it does the final uprobe_unregister()->put_uprobe(U),
refcount_dec_and_test() succeeds, X calls call_rcu(uprobe_free_rcu).
- T does get_uprobe() which changes U->ref from 0 to 1, __find_uprobe()
returns, find_uprobe() doesn't check read_seqcount_retry().
- T returns from find_active_uprobe() and plays with the "soon to be
freed" U.
Looks like __find_uprobe() needs refcount_inc_not_zero() or return NULL, but
I am not sure this is the only problem...
No?
Oleg.
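A rough sketch of the guard Oleg is suggesting (illustrative only, not part
of this series): a lookup-side helper that refuses to resurrect a uprobe
whose refcount already dropped to zero:

	static struct uprobe *try_get_uprobe(struct uprobe *uprobe)
	{
		/* Fails once the last reference is gone and the uprobe is only
		 * kept alive by the pending RCU grace period. */
		if (refcount_inc_not_zero(&uprobe->ref))
			return uprobe;
		return NULL;
	}

__find_uprobe() would then return try_get_uprobe(__node_2_uprobe(node))
instead of get_uprobe(...), and find_uprobe() would treat the NULL like a
failed lookup and fall back to the read_seqcount_retry() path.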
* Re: [PATCH 04/10] perf/uprobe: RCU-ify find_uprobe()
2024-07-08 16:35 ` Oleg Nesterov
@ 2024-07-08 18:08 ` Peter Zijlstra
2024-07-09 14:32 ` Oleg Nesterov
0 siblings, 1 reply; 84+ messages in thread
From: Peter Zijlstra @ 2024-07-08 18:08 UTC (permalink / raw)
To: Oleg Nesterov
Cc: mingo, andrii, linux-kernel, rostedt, mhiramat, jolsa, clm,
paulmck
On Mon, Jul 08, 2024 at 06:35:45PM +0200, Oleg Nesterov wrote:
> I hate to say this again, but I'll try to read this series later ;)
>
> But let me ask...
:-)
> On 07/08, Peter Zijlstra wrote:
> >
> > +static void uprobe_free_rcu(struct rcu_head *rcu)
> > +{
> > + struct uprobe *uprobe = container_of(rcu, struct uprobe, rcu);
> > + kfree(uprobe);
> > +}
> > +
> > static void put_uprobe(struct uprobe *uprobe)
> > {
> > if (refcount_dec_and_test(&uprobe->ref)) {
> > @@ -604,7 +612,7 @@ static void put_uprobe(struct uprobe *up
> > mutex_lock(&delayed_uprobe_lock);
> > delayed_uprobe_remove(uprobe, NULL);
> > mutex_unlock(&delayed_uprobe_lock);
> > - kfree(uprobe);
> > + call_rcu(&uprobe->rcu, uprobe_free_rcu);
>
> kfree_rcu() ?
I can never remember how that works, also this will very soon be
call_srcu().
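For reference, Oleg's kfree_rcu() form would be the one-liner below against
the hunk quoted above -- a sketch only, and moot once this becomes
call_srcu():

	mutex_lock(&delayed_uprobe_lock);
	delayed_uprobe_remove(uprobe, NULL);
	mutex_unlock(&delayed_uprobe_lock);
	/* queue a kfree() after a grace period, using the rcu_head
	 * ('rcu') this patch adds to struct uprobe */
	kfree_rcu(uprobe, rcu);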
> > static struct uprobe *find_uprobe(struct inode *inode, loff_t offset)
> > {
> > - struct uprobe *uprobe;
> > + unsigned int seq;
> >
> > - read_lock(&uprobes_treelock);
> > - uprobe = __find_uprobe(inode, offset);
> > - read_unlock(&uprobes_treelock);
> > + guard(rcu)();
> >
> > - return uprobe;
> > + do {
> > + seq = read_seqcount_begin(&uprobes_seqcount);
> > + struct uprobe *uprobe = __find_uprobe(inode, offset);
> > + if (uprobe) {
> > + /*
> > + * Lockless RB-tree lookups are prone to false-negatives.
> > + * If they find something, it's good.
>
> Is it true in this case?
>
> Suppose we have uprobe U which has no extra refs, so uprobe_unregister()
> called by the task X should remove it from uprobes_tree and kfree.
>
> Suppose that the task T hits the breakpoint and enters handle_swbp().
>
> Now,
>
> - X calls find_uprobe(), this increments U->ref from 1 to 2
>
> register_for_each_vma() succeeds
>
> X enters delete_uprobe()
>
> - T calls find_active_uprobe() -> find_uprobe()
>
> read_seqcount_begin() returns an even number
>
> __find_uprobe() -> rb_find_rcu() succeeds
>
> - X continues and returns from delete_uprobe(), U->ref == 1
>
> then it does the final uprobe_unregister()->put_uprobe(U),
> refcount_dec_and_test() succeeds, X calls call_rcu(uprobe_free_rcu).
>
> - T does get_uprobe() which changes U->ref from 0 to 1, __find_uprobe()
> returns, find_uprobe() doesn't check read_seqcount_retry().
I think you're right. However, this get_uprobe() will go away in a few
patches.
But yeah, I should perhaps have been more careful when splitting things
up :/
* Re: [PATCH 00/10] perf/uprobe: Optimize uprobes
2024-07-08 9:12 [PATCH 00/10] perf/uprobe: Optimize uprobes Peter Zijlstra
` (9 preceding siblings ...)
2024-07-08 9:12 ` [PATCH 10/10] perf/uprobe: Add uretprobe timer Peter Zijlstra
@ 2024-07-08 22:56 ` Masami Hiramatsu
2024-07-09 0:25 ` Andrii Nakryiko
2024-07-09 7:11 ` Peter Zijlstra
10 siblings, 2 replies; 84+ messages in thread
From: Masami Hiramatsu @ 2024-07-08 22:56 UTC (permalink / raw)
To: Peter Zijlstra
Cc: mingo, andrii, linux-kernel, rostedt, mhiramat, oleg, jolsa, clm,
paulmck
On Mon, 08 Jul 2024 11:12:41 +0200
Peter Zijlstra <peterz@infradead.org> wrote:
> Hi!
>
> These patches implement the (S)RCU based proposal to optimize uprobes.
>
> On my c^Htrusty old IVB-EP -- where each (of the 40) CPU calls 'func' in a
> tight loop:
>
> perf probe -x ./uprobes test=func
> perf stat -ae probe_uprobe:test -- sleep 1
>
> perf probe -x ./uprobes test=func%return
> perf stat -ae probe_uprobe:test__return -- sleep 1
>
> PRE:
>
> 4,038,804 probe_uprobe:test
> 2,356,275 probe_uprobe:test__return
>
> POST:
>
> 7,216,579 probe_uprobe:test
> 6,744,786 probe_uprobe:test__return
>
Good results! So this is another series of Andrii's batch register?
(but maybe it becomes simpler)
Thank you,
>
> Patches also available here:
>
> git://git.kernel.org/pub/scm/linux/kernel/git/peterz/queue.git perf/uprobes
>
>
--
Masami Hiramatsu (Google) <mhiramat@kernel.org>
* Re: [PATCH 00/10] perf/uprobe: Optimize uprobes
2024-07-08 22:56 ` [PATCH 00/10] perf/uprobe: Optimize uprobes Masami Hiramatsu
@ 2024-07-09 0:25 ` Andrii Nakryiko
2024-07-09 9:01 ` Peter Zijlstra
` (2 more replies)
2024-07-09 7:11 ` Peter Zijlstra
1 sibling, 3 replies; 84+ messages in thread
From: Andrii Nakryiko @ 2024-07-09 0:25 UTC (permalink / raw)
To: Masami Hiramatsu, Peter Zijlstra
Cc: mingo, andrii, linux-kernel, rostedt, oleg, jolsa, clm, paulmck,
bpf
On Mon, Jul 8, 2024 at 3:56 PM Masami Hiramatsu <mhiramat@kernel.org> wrote:
>
> On Mon, 08 Jul 2024 11:12:41 +0200
> Peter Zijlstra <peterz@infradead.org> wrote:
>
> > Hi!
> >
> > These patches implement the (S)RCU based proposal to optimize uprobes.
> >
> > On my c^Htrusty old IVB-EP -- where each (of the 40) CPU calls 'func' in a
> > tight loop:
> >
> > perf probe -x ./uprobes test=func
> > perf stat -ae probe_uprobe:test -- sleep 1
> >
> > perf probe -x ./uprobes test=func%return
> > perf stat -ae probe_uprobe:test__return -- sleep 1
> >
> > PRE:
> >
> > 4,038,804 probe_uprobe:test
> > 2,356,275 probe_uprobe:test__return
> >
> > POST:
> >
> > 7,216,579 probe_uprobe:test
> > 6,744,786 probe_uprobe:test__return
> >
>
> Good results! So this is another series of Andrii's batch register?
> (but maybe it becomes simpler)
yes, this would be an alternative to my patches
Peter,
I didn't have time to look at the patches just yet, but I managed to
run a quick benchmark (using bench tool we have as part of BPF
selftests) to see both single-threaded performance and how the
performance scales with CPUs (now that we are not bottlenecked on
register_rwsem). Here are some results:
[root@kerneltest003.10.atn6 ~]# for num_threads in {1..20}; do ./bench \
-a -d10 -p $num_threads trig-uprobe-nop | grep Summary; done
Summary: hits 3.278 ± 0.021M/s ( 3.278M/prod)
Summary: hits 4.364 ± 0.005M/s ( 2.182M/prod)
Summary: hits 6.517 ± 0.011M/s ( 2.172M/prod)
Summary: hits 8.203 ± 0.004M/s ( 2.051M/prod)
Summary: hits 9.520 ± 0.012M/s ( 1.904M/prod)
Summary: hits 8.316 ± 0.007M/s ( 1.386M/prod)
Summary: hits 7.893 ± 0.037M/s ( 1.128M/prod)
Summary: hits 8.490 ± 0.014M/s ( 1.061M/prod)
Summary: hits 8.022 ± 0.005M/s ( 0.891M/prod)
Summary: hits 8.471 ± 0.019M/s ( 0.847M/prod)
Summary: hits 8.156 ± 0.021M/s ( 0.741M/prod)
...
(numbers in the first column are total throughput, and xxx/prod is
per-thread throughput). Single-threaded performance (about 3.3 mln/s)
is on par with what I had with my patches. And clearly it scales
better with more thread now that register_rwsem is gone, though,
unfortunately, it doesn't really scale linearly.
Quick profiling for the 8-threaded benchmark shows that we spend >20%
in mmap_read_lock/mmap_read_unlock in find_active_uprobe. I think
that's what would prevent uprobes from scaling linearly. If you have
some good ideas on how to get rid of that, I think it would be
extremely beneficial. We also spend about 14% of the time in
srcu_read_lock(). The rest is in interrupt handling overhead, actual
user-space function overhead, and in uprobe_dispatcher() calls.
Ramping this up to 16 threads shows that mmap_rwsem is getting more
costly, up to 45% of CPU. SRCU is also growing a bit slower to 19% of
CPU. Is this expected? (I'm not familiar with the implementation
details)
P.S. Would you be able to rebase your patches on top of latest
probes/for-next, which include Jiri's sys_uretprobe changes. Right now
uretprobe benchmarks are quite unrepresentative because of that.
Thanks!
>
> Thank you,
>
> >
> > Patches also available here:
> >
> > git://git.kernel.org/pub/scm/linux/kernel/git/peterz/queue.git perf/uprobes
> >
> >
>
>
> --
> Masami Hiramatsu (Google) <mhiramat@kernel.org>
* Re: [PATCH 00/10] perf/uprobe: Optimize uprobes
2024-07-08 22:56 ` [PATCH 00/10] perf/uprobe: Optimize uprobes Masami Hiramatsu
2024-07-09 0:25 ` Andrii Nakryiko
@ 2024-07-09 7:11 ` Peter Zijlstra
1 sibling, 0 replies; 84+ messages in thread
From: Peter Zijlstra @ 2024-07-09 7:11 UTC (permalink / raw)
To: Masami Hiramatsu
Cc: mingo, andrii, linux-kernel, rostedt, oleg, jolsa, clm, paulmck
On Tue, Jul 09, 2024 at 07:56:51AM +0900, Masami Hiramatsu wrote:
> On Mon, 08 Jul 2024 11:12:41 +0200
> Peter Zijlstra <peterz@infradead.org> wrote:
>
> > Hi!
> >
> > These patches implement the (S)RCU based proposal to optimize uprobes.
> >
> > On my c^Htrusty old IVB-EP -- where each (of the 40) CPU calls 'func' in a
> > tight loop:
> >
> > perf probe -x ./uprobes test=func
> > perf stat -ae probe_uprobe:test -- sleep 1
> >
> > perf probe -x ./uprobes test=func%return
> > perf stat -ae probe_uprobe:test__return -- sleep 1
> >
> > PRE:
> >
> > 4,038,804 probe_uprobe:test
> > 2,356,275 probe_uprobe:test__return
> >
> > POST:
> >
> > 7,216,579 probe_uprobe:test
> > 6,744,786 probe_uprobe:test__return
> >
>
> Good results! So this is another series of Andrii's batch register?
Yeah, it is my counter proposal. I didn't much like the refcounting
thing he ended up with, and his own numbers show the refcounting remains
a significant problem.
These patches mostly do away with the refcounting entirely -- except for
the extremely rare case where you let a return probe sit for over a
second without anything else happening.
* Re: [PATCH 00/10] perf/uprobe: Optimize uprobes
2024-07-09 0:25 ` Andrii Nakryiko
@ 2024-07-09 9:01 ` Peter Zijlstra
2024-07-09 14:11 ` Paul E. McKenney
2024-07-09 9:03 ` Peter Zijlstra
2024-07-09 21:47 ` Andrii Nakryiko
2 siblings, 1 reply; 84+ messages in thread
From: Peter Zijlstra @ 2024-07-09 9:01 UTC (permalink / raw)
To: Andrii Nakryiko
Cc: Masami Hiramatsu, mingo, andrii, linux-kernel, rostedt, oleg,
jolsa, clm, paulmck, bpf, willy
On Mon, Jul 08, 2024 at 05:25:14PM -0700, Andrii Nakryiko wrote:
> Quick profiling for the 8-threaded benchmark shows that we spend >20%
> in mmap_read_lock/mmap_read_unlock in find_active_uprobe. I think
> that's what would prevent uprobes from scaling linearly. If you have
> some good ideas on how to get rid of that, I think it would be
> extremely beneficial.
That's find_vma() and friends. I started RCU-ifying that a *long* time
ago when I started the speculative page fault patches. I sorta lost
track of that effort. Willy, where are we with that?
Specifically, how feasible would it be to get a simple RCU based
find_vma() version sorted these days?
* Re: [PATCH 00/10] perf/uprobe: Optimize uprobes
2024-07-09 0:25 ` Andrii Nakryiko
2024-07-09 9:01 ` Peter Zijlstra
@ 2024-07-09 9:03 ` Peter Zijlstra
2024-07-09 10:01 ` Jiri Olsa
2024-07-10 0:55 ` Masami Hiramatsu
2024-07-09 21:47 ` Andrii Nakryiko
2 siblings, 2 replies; 84+ messages in thread
From: Peter Zijlstra @ 2024-07-09 9:03 UTC (permalink / raw)
To: Andrii Nakryiko
Cc: Masami Hiramatsu, mingo, andrii, linux-kernel, rostedt, oleg,
jolsa, clm, paulmck, bpf
On Mon, Jul 08, 2024 at 05:25:14PM -0700, Andrii Nakryiko wrote:
> Ramping this up to 16 threads shows that mmap_rwsem is getting more
> costly, up to 45% of CPU. SRCU is also growing a bit slower to 19% of
> CPU. Is this expected? (I'm not familiar with the implementation
> details)
SRCU getting more expensive is a bit unexpected, it's just a per-cpu
inc/dec and a full barrier.
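For reference, the tree-SRCU read side is roughly the following (abridged
from kernel/rcu/srcutree.c as of this series' base); the unlock side
similarly bumps a per-cpu unlock counter after a full barrier:

	int __srcu_read_lock(struct srcu_struct *ssp)
	{
		int idx;

		idx = READ_ONCE(ssp->srcu_idx) & 0x1;
		this_cpu_inc(ssp->sda->srcu_lock_count[idx].counter);
		smp_mb(); /* B */  /* Avoid leaking the critical section. */
		return idx;
	}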
> P.S. Would you be able to rebase your patches on top of latest
> probes/for-next, which include Jiri's sys_uretprobe changes. Right now
> uretprobe benchmarks are quite unrepresentative because of that.
What branch is that? kernel/events/ stuff usually goes through tip, no?
* Re: [PATCH 00/10] perf/uprobe: Optimize uprobes
2024-07-09 9:03 ` Peter Zijlstra
@ 2024-07-09 10:01 ` Jiri Olsa
2024-07-09 10:16 ` Peter Zijlstra
2024-07-10 0:55 ` Masami Hiramatsu
1 sibling, 1 reply; 84+ messages in thread
From: Jiri Olsa @ 2024-07-09 10:01 UTC (permalink / raw)
To: Peter Zijlstra
Cc: Andrii Nakryiko, Masami Hiramatsu, mingo, andrii, linux-kernel,
rostedt, oleg, clm, paulmck, bpf
On Tue, Jul 09, 2024 at 11:03:04AM +0200, Peter Zijlstra wrote:
> On Mon, Jul 08, 2024 at 05:25:14PM -0700, Andrii Nakryiko wrote:
>
> > Ramping this up to 16 threads shows that mmap_rwsem is getting more
> > costly, up to 45% of CPU. SRCU is also growing a bit slower to 19% of
> > CPU. Is this expected? (I'm not familiar with the implementation
> > details)
>
> SRCU getting more expensive is a bit unexpected, it's just a per-cpu
> inc/dec and a full barrier.
>
> > P.S. Would you be able to rebase your patches on top of latest
> > probes/for-next, which include Jiri's sys_uretprobe changes. Right now
> > uretprobe benchmarks are quite unrepresentative because of that.
>
> What branch is that? kernel/events/ stuff usually goes through tip, no?
it went through the trace tree:
https://git.kernel.org/pub/scm/linux/kernel/git/trace/linux-trace.git probes/for-next
and it's in linux-next/master already
jirka
* Re: [PATCH 00/10] perf/uprobe: Optimize uprobes
2024-07-09 10:01 ` Jiri Olsa
@ 2024-07-09 10:16 ` Peter Zijlstra
2024-07-09 22:10 ` Masami Hiramatsu
0 siblings, 1 reply; 84+ messages in thread
From: Peter Zijlstra @ 2024-07-09 10:16 UTC (permalink / raw)
To: Jiri Olsa
Cc: Andrii Nakryiko, Masami Hiramatsu, mingo, andrii, linux-kernel,
rostedt, oleg, clm, paulmck, bpf
On Tue, Jul 09, 2024 at 12:01:03PM +0200, Jiri Olsa wrote:
> On Tue, Jul 09, 2024 at 11:03:04AM +0200, Peter Zijlstra wrote:
> > On Mon, Jul 08, 2024 at 05:25:14PM -0700, Andrii Nakryiko wrote:
> >
> > > Ramping this up to 16 threads shows that mmap_rwsem is getting more
> > > costly, up to 45% of CPU. SRCU is also growing a bit slower to 19% of
> > > CPU. Is this expected? (I'm not familiar with the implementation
> > > details)
> >
> > SRCU getting more expensive is a bit unexpected, it's just a per-cpu
> > inc/dec and a full barrier.
> >
> > > P.S. Would you be able to rebase your patches on top of latest
> > > probes/for-next, which include Jiri's sys_uretprobe changes. Right now
> > > uretprobe benchmarks are quite unrepresentative because of that.
> >
> > What branch is that? kernel/events/ stuff usually goes through tip, no?
>
> it went through the trace tree:
>
> https://git.kernel.org/pub/scm/linux/kernel/git/trace/linux-trace.git probes/for-next
>
> and it's in linux-next/master already
FFS :-/ That touches all sorts and doesn't have any perf ack on it.
Masami, what gives?
* Re: [PATCH 05/10] perf/uprobe: SRCU-ify uprobe->consumer list
2024-07-08 9:12 ` [PATCH 05/10] perf/uprobe: SRCU-ify uprobe->consumer list Peter Zijlstra
@ 2024-07-09 12:05 ` Peter Zijlstra
2024-07-09 13:33 ` Oleg Nesterov
0 siblings, 1 reply; 84+ messages in thread
From: Peter Zijlstra @ 2024-07-09 12:05 UTC (permalink / raw)
To: mingo, andrii; +Cc: linux-kernel, rostedt, mhiramat, oleg, jolsa, clm, paulmck
On Mon, Jul 08, 2024 at 11:12:46AM +0200, Peter Zijlstra wrote:
> @@ -2087,14 +2092,19 @@ static struct uprobe *find_active_uprobe
> return uprobe;
> }
>
> +#define for_each_consumer_rcu(pos, head) \
> + for (pos = rcu_dereference_raw(head); pos; \
> + pos = rcu_dereference_raw(pos->next))
> +
> static void handler_chain(struct uprobe *uprobe, struct pt_regs *regs)
> {
> struct uprobe_consumer *uc;
> int remove = UPROBE_HANDLER_REMOVE;
> bool need_prep = false; /* prepare return uprobe, when needed */
>
> - down_read(&uprobe->register_rwsem);
> - for (uc = uprobe->consumers; uc; uc = uc->next) {
> + guard(srcu)(&uprobes_srcu);
> +
> + for_each_consumer_rcu(uc, uprobe->consumers) {
> int rc = 0;
>
> if (uc->handler) {
> @@ -2116,7 +2126,6 @@ static void handler_chain(struct uprobe
> WARN_ON(!uprobe_is_active(uprobe));
> unapply_uprobe(uprobe, current->mm);
^^^ this remove case needs more thought.
> }
> - up_read(&uprobe->register_rwsem);
> }
^ permalink raw reply [flat|nested] 84+ messages in thread
* Re: [PATCH 05/10] perf/uprobe: SRCU-ify uprobe->consumer list
2024-07-09 12:05 ` Peter Zijlstra
@ 2024-07-09 13:33 ` Oleg Nesterov
2024-07-09 14:32 ` Peter Zijlstra
2024-07-09 14:32 ` Peter Zijlstra
0 siblings, 2 replies; 84+ messages in thread
From: Oleg Nesterov @ 2024-07-09 13:33 UTC (permalink / raw)
To: Peter Zijlstra
Cc: mingo, andrii, linux-kernel, rostedt, mhiramat, jolsa, clm,
paulmck
On 07/09, Peter Zijlstra wrote:
>
> > + guard(srcu)(&uprobes_srcu);
> > +
> > + for_each_consumer_rcu(uc, uprobe->consumers) {
> > int rc = 0;
> >
> > if (uc->handler) {
> > @@ -2116,7 +2126,6 @@ static void handler_chain(struct uprobe
> > WARN_ON(!uprobe_is_active(uprobe));
> > unapply_uprobe(uprobe, current->mm);
>
> ^^^ this remove case needs more thought.
Yeah... that is why the current code doesn't use ->consumer_rwsem, iirc.
And consumer_add/del need some changes. Say, consumer_add() does
uc->next = uprobe->consumers;
uprobe->consumers = uc;
I guess it should do
WRITE_ONCE(uc->next, uprobe->consumers);
rcu_assign_pointer(uprobe->consumers, uc);
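i.e. roughly this, as a sketch (writers keep serializing on
->consumer_rwsem as they do today):

	static void consumer_add(struct uprobe *uprobe, struct uprobe_consumer *uc)
	{
		down_write(&uprobe->consumer_rwsem);
		WRITE_ONCE(uc->next, uprobe->consumers);
		/* publish only after ->next is set, so RCU walkers see a complete node */
		rcu_assign_pointer(uprobe->consumers, uc);
		up_write(&uprobe->consumer_rwsem);
	}

and consumer_del() would likewise need (at least) WRITE_ONCE() when
unlinking, so a concurrent for_each_consumer_rcu() walk never sees a
torn pointer.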
Oleg.
^ permalink raw reply [flat|nested] 84+ messages in thread
* Re: [PATCH 00/10] perf/uprobe: Optimize uprobes
2024-07-09 9:01 ` Peter Zijlstra
@ 2024-07-09 14:11 ` Paul E. McKenney
2024-07-09 14:29 ` Peter Zijlstra
2024-07-09 16:42 ` Andrii Nakryiko
0 siblings, 2 replies; 84+ messages in thread
From: Paul E. McKenney @ 2024-07-09 14:11 UTC (permalink / raw)
To: Peter Zijlstra
Cc: Andrii Nakryiko, Masami Hiramatsu, mingo, andrii, linux-kernel,
rostedt, oleg, jolsa, clm, bpf, willy
On Tue, Jul 09, 2024 at 11:01:53AM +0200, Peter Zijlstra wrote:
> On Mon, Jul 08, 2024 at 05:25:14PM -0700, Andrii Nakryiko wrote:
>
> > Quick profiling for the 8-threaded benchmark shows that we spend >20%
> > in mmap_read_lock/mmap_read_unlock in find_active_uprobe. I think
> > that's what would prevent uprobes from scaling linearly. If you have
> > some good ideas on how to get rid of that, I think it would be
> > extremely beneficial.
>
> That's find_vma() and friends. I started RCU-ifying that a *long* time
> ago when I started the speculative page fault patches. I sorta lost
> track of that effort, Willy where are we with that?
>
> Specifically, how feasible would it be to get a simple RCU based
> find_vma() version sorted these days?
Liam's and Willy's Maple Tree work, combined with Suren's per-VMA locking
combined with some of Vlastimil's slab work is pushing in that direction.
I believe that things are getting pretty close.
As with your work in 2010 and the MIT guy's work in 2013, corner-case
correctness and corner-case performance regressions continue to provide
quite a bit of good clean fun.
Thanx, Paul
^ permalink raw reply [flat|nested] 84+ messages in thread
* Re: [PATCH 00/10] perf/uprobe: Optimize uprobes
2024-07-09 14:11 ` Paul E. McKenney
@ 2024-07-09 14:29 ` Peter Zijlstra
2024-07-09 14:36 ` Paul E. McKenney
2024-07-09 16:10 ` Matthew Wilcox
2024-07-09 16:42 ` Andrii Nakryiko
1 sibling, 2 replies; 84+ messages in thread
From: Peter Zijlstra @ 2024-07-09 14:29 UTC (permalink / raw)
To: Paul E. McKenney
Cc: Andrii Nakryiko, Masami Hiramatsu, mingo, andrii, linux-kernel,
rostedt, oleg, jolsa, clm, bpf, willy
On Tue, Jul 09, 2024 at 07:11:23AM -0700, Paul E. McKenney wrote:
> On Tue, Jul 09, 2024 at 11:01:53AM +0200, Peter Zijlstra wrote:
> > On Mon, Jul 08, 2024 at 05:25:14PM -0700, Andrii Nakryiko wrote:
> >
> > > Quick profiling for the 8-threaded benchmark shows that we spend >20%
> > > in mmap_read_lock/mmap_read_unlock in find_active_uprobe. I think
> > > that's what would prevent uprobes from scaling linearly. If you have
> > > some good ideas on how to get rid of that, I think it would be
> > > extremely beneficial.
> >
> > That's find_vma() and friends. I started RCU-ifying that a *long* time
> > ago when I started the speculative page fault patches. I sorta lost
> > track of that effort, Willy where are we with that?
> >
> > Specifically, how feasible would it be to get a simple RCU based
> > find_vma() version sorted these days?
>
> Liam's and Willy's Maple Tree work, combined with Suren's per-VMA locking
> combined with some of Vlastimil's slab work is pushing in that direction.
> I believe that things are getting pretty close.
So I fundamentally do not believe in per-VMA locking. Specifically for
this case that would be trading one hot line for another. I tried
telling people that, but it doesn't seem to stick :/
Per VMA refcounts or per VMA locks are a complete fail IMO.
I suppose I should go dig out the latest versions of those patches to
see where they're at :/
^ permalink raw reply [flat|nested] 84+ messages in thread
* Re: [PATCH 05/10] perf/uprobe: SRCU-ify uprobe->consumer list
2024-07-09 13:33 ` Oleg Nesterov
@ 2024-07-09 14:32 ` Peter Zijlstra
2024-07-09 15:05 ` Oleg Nesterov
2024-07-09 14:32 ` Peter Zijlstra
1 sibling, 1 reply; 84+ messages in thread
From: Peter Zijlstra @ 2024-07-09 14:32 UTC (permalink / raw)
To: Oleg Nesterov
Cc: mingo, andrii, linux-kernel, rostedt, mhiramat, jolsa, clm,
paulmck
On Tue, Jul 09, 2024 at 03:33:49PM +0200, Oleg Nesterov wrote:
> On 07/09, Peter Zijlstra wrote:
> >
> > > + guard(srcu)(&uprobes_srcu);
> > > +
> > > + for_each_consumer_rcu(uc, uprobe->consumers) {
> > > int rc = 0;
> > >
> > > if (uc->handler) {
> > > @@ -2116,7 +2126,6 @@ static void handler_chain(struct uprobe
> > > WARN_ON(!uprobe_is_active(uprobe));
> > > unapply_uprobe(uprobe, current->mm);
> >
> > ^^^ this remove case needs more thought.
>
> Yeah... that is why the current code doesn't use ->consumer_rwsem, iirc.
AFAICT something like the below should work. Concurrent
remove_breakpoint() should already be possible today and doesn't appear
to be a problem.
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -1260,6 +1260,10 @@ int uprobe_apply(struct inode *inode, lo
return ret;
}
+/*
+ * Can race against uprobe_unregister() / register_for_each_vma(), and relies
+ * on duplicate remove_breakpoint() being a no-op.
+ */
static int unapply_uprobe(struct uprobe *uprobe, struct mm_struct *mm)
{
VMA_ITERATOR(vmi, mm, 0);
@@ -2101,6 +2105,7 @@ static void handler_chain(struct uprobe
struct uprobe_consumer *uc;
int remove = UPROBE_HANDLER_REMOVE;
bool need_prep = false; /* prepare return uprobe, when needed */
+ bool had_handler = false;
down_read(&uprobe->register_rwsem);
for (uc = uprobe->consumers; uc; uc = uc->next) {
@@ -2115,16 +2120,26 @@ static void handler_chain(struct uprobe
if (uc->ret_handler)
need_prep = true;
+ /*
+ * A single handler that does not mask out REMOVE, means the
+ * probe stays.
+ */
+ had_handler = true;
remove &= rc;
}
+ /*
+ * If there were no handlers called, nobody asked for it to be removed
+ * but also nobody got to mask the value. Fix it up.
+ */
+ if (!had_handler)
+ remove = 0;
+
if (need_prep && !remove)
prepare_uretprobe(uprobe, regs); /* put bp at return */
- if (remove && uprobe->consumers) {
- WARN_ON(!uprobe_is_active(uprobe));
+ if (remove)
unapply_uprobe(uprobe, current->mm);
- }
up_read(&uprobe->register_rwsem);
}
^ permalink raw reply [flat|nested] 84+ messages in thread
* Re: [PATCH 04/10] perf/uprobe: RCU-ify find_uprobe()
2024-07-08 18:08 ` Peter Zijlstra
@ 2024-07-09 14:32 ` Oleg Nesterov
2024-07-09 15:02 ` Peter Zijlstra
0 siblings, 1 reply; 84+ messages in thread
From: Oleg Nesterov @ 2024-07-09 14:32 UTC (permalink / raw)
To: Peter Zijlstra
Cc: mingo, andrii, linux-kernel, rostedt, mhiramat, jolsa, clm,
paulmck
On 07/08, Peter Zijlstra wrote:
>
> On Mon, Jul 08, 2024 at 06:35:45PM +0200, Oleg Nesterov wrote:
> >
> > Suppose we have uprobe U which has no extra refs, so uprobe_unregister()
> > called by the task X should remove it from uprobes_tree and kfree.
> >
> > Suppose that the task T hits the breakpoint and enters handle_swbp().
> >
> > Now,
> >
> > - X calls find_uprobe(), this increments U->ref from 1 to 2
> >
> > register_for_each_vma() succeeds
> >
> > X enters delete_uprobe()
> >
> > - T calls find_active_uprobe() -> find_uprobe()
> >
> > __read_seqcount_begin() returns an even number
> >
> > __find_uprobe() -> rb_find_rcu() succeeds
> >
> > - X continues and returns from delete_uprobe(), U->ref == 1
> >
> > then it does the final uprobe_unregister()->put_uprobe(U),
> > refcount_dec_and_test() succeeds, X calls call_rcu(uprobe_free_rcu).
> >
> > - T does get_uprobe() which changes U->ref from 0 to 1, __find_uprobe()
> > returns, find_uprobe() doesn't check read_seqcount_retry().
>
> I think you're right. However, this get_uprobe() will go away in a few
> patches.
OK, I am looking at 7/10 perf/uprobe: Convert (some) uprobe->refcount to SRCU
Yes, __find_uprobe() no longer does get_uprobe(). But at first glance we have
the same problem with this change
@@ -1977,7 +1979,7 @@ pre_ssout(struct uprobe *uprobe, struct
return err;
}
- utask->active_uprobe = uprobe;
+ utask->active_uprobe = get_uprobe(uprobe);
utask->state = UTASK_SSTEP;
return 0;
}
from 7/10 above. It can change uprobe->ref from 0 to 1 when
call_srcu(&uprobes_srcu, uprobe_free_rcu) was already scheduled.
Once guard(srcu)(&uprobes_srcu) in handle_swbp() drops the uprobes_srcu lock,
utask->active_uprobe can be freed.
Oleg.
^ permalink raw reply [flat|nested] 84+ messages in thread
* Re: [PATCH 05/10] perf/uprobe: SRCU-ify uprobe->consumer list
2024-07-09 13:33 ` Oleg Nesterov
2024-07-09 14:32 ` Peter Zijlstra
@ 2024-07-09 14:32 ` Peter Zijlstra
1 sibling, 0 replies; 84+ messages in thread
From: Peter Zijlstra @ 2024-07-09 14:32 UTC (permalink / raw)
To: Oleg Nesterov
Cc: mingo, andrii, linux-kernel, rostedt, mhiramat, jolsa, clm,
paulmck
On Tue, Jul 09, 2024 at 03:33:49PM +0200, Oleg Nesterov wrote:
> I guess it should do
>
> WRITE_ONCE(uc->next, uprobe->consumers);
> rcu_assign_pointer(uprobe->consumers, uc);
Yes...
^ permalink raw reply [flat|nested] 84+ messages in thread
* Re: [PATCH 00/10] perf/uprobe: Optimize uprobes
2024-07-09 14:29 ` Peter Zijlstra
@ 2024-07-09 14:36 ` Paul E. McKenney
2024-07-09 15:31 ` Peter Zijlstra
2024-07-09 16:10 ` Matthew Wilcox
1 sibling, 1 reply; 84+ messages in thread
From: Paul E. McKenney @ 2024-07-09 14:36 UTC (permalink / raw)
To: Peter Zijlstra
Cc: Andrii Nakryiko, Masami Hiramatsu, mingo, andrii, linux-kernel,
rostedt, oleg, jolsa, clm, bpf, willy
On Tue, Jul 09, 2024 at 04:29:43PM +0200, Peter Zijlstra wrote:
> On Tue, Jul 09, 2024 at 07:11:23AM -0700, Paul E. McKenney wrote:
> > On Tue, Jul 09, 2024 at 11:01:53AM +0200, Peter Zijlstra wrote:
> > > On Mon, Jul 08, 2024 at 05:25:14PM -0700, Andrii Nakryiko wrote:
> > >
> > > > Quick profiling for the 8-threaded benchmark shows that we spend >20%
> > > > in mmap_read_lock/mmap_read_unlock in find_active_uprobe. I think
> > > > that's what would prevent uprobes from scaling linearly. If you have
> > > > some good ideas on how to get rid of that, I think it would be
> > > > extremely beneficial.
> > >
> > > That's find_vma() and friends. I started RCU-ifying that a *long* time
> > > ago when I started the speculative page fault patches. I sorta lost
> > > track of that effort, Willy where are we with that?
> > >
> > > Specifically, how feasible would it be to get a simple RCU based
> > > find_vma() version sorted these days?
> >
> > Liam's and Willy's Maple Tree work, combined with Suren's per-VMA locking
> > combined with some of Vlastimil's slab work is pushing in that direction.
> > I believe that things are getting pretty close.
>
> So I fundamentally do not believe in per-VMA locking. Specifically for
> this case that would be trading one hot line for another. I tried
> telling people that, but it doesn't seem to stick :/
>
> Per VMA refcounts or per VMA locks are a complete fail IMO.
Not even to allow concurrent updates of the address space by different
threads of a process?
For me, per-VMA locking's need to RCU-protect the VMA is a good step
towards permitting RCU-protected scans of the Maple Tree, which then
gets lockless lookup.
> I suppose I should go dig out the latest versions of those patches to
> see where they're at :/
It would not be a bad thing to get another set of eyes on it.
Thanx, Paul
^ permalink raw reply [flat|nested] 84+ messages in thread
* Re: [PATCH 04/10] perf/uprobe: RCU-ify find_uprobe()
2024-07-09 14:32 ` Oleg Nesterov
@ 2024-07-09 15:02 ` Peter Zijlstra
2024-07-09 15:23 ` Oleg Nesterov
0 siblings, 1 reply; 84+ messages in thread
From: Peter Zijlstra @ 2024-07-09 15:02 UTC (permalink / raw)
To: Oleg Nesterov
Cc: mingo, andrii, linux-kernel, rostedt, mhiramat, jolsa, clm,
paulmck
On Tue, Jul 09, 2024 at 04:32:55PM +0200, Oleg Nesterov wrote:
> Once guard(srcu)(&uprobes_srcu) in handle_swbp() drops the uprobes_srcu lock,
> utask->active_uprobe can be freed.
Yeah, I've fixed all those already. It's a bit of churn, adding
inc_not_zero all over the place and then removing it again, but yeah, it
makes the individual patches better.
Let me finish the complete set and I'll push out.
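For reference, the inc_not_zero pattern in question is essentially this
(hypothetical helper name, just a sketch):

	static struct uprobe *try_get_uprobe(struct uprobe *uprobe)
	{
		/* only take a reference if the uprobe isn't already on its way out */
		if (refcount_inc_not_zero(&uprobe->ref))
			return uprobe;
		return NULL;
	}

so a lookup done under (S)RCU can only pin an uprobe whose refcount has
not already hit zero; otherwise the lookup has to be treated as a miss.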
^ permalink raw reply [flat|nested] 84+ messages in thread
* Re: [PATCH 05/10] perf/uprobe: SRCU-ify uprobe->consumer list
2024-07-09 14:32 ` Peter Zijlstra
@ 2024-07-09 15:05 ` Oleg Nesterov
2024-07-09 15:19 ` Peter Zijlstra
0 siblings, 1 reply; 84+ messages in thread
From: Oleg Nesterov @ 2024-07-09 15:05 UTC (permalink / raw)
To: Peter Zijlstra
Cc: mingo, andrii, linux-kernel, rostedt, mhiramat, jolsa, clm,
paulmck
On 07/09, Peter Zijlstra wrote:
>
> On Tue, Jul 09, 2024 at 03:33:49PM +0200, Oleg Nesterov wrote:
> > On 07/09, Peter Zijlstra wrote:
> > >
> > > > + guard(srcu)(&uprobes_srcu);
> > > > +
> > > > + for_each_consumer_rcu(uc, uprobe->consumers) {
> > > > int rc = 0;
> > > >
> > > > if (uc->handler) {
> > > > @@ -2116,7 +2126,6 @@ static void handler_chain(struct uprobe
> > > > WARN_ON(!uprobe_is_active(uprobe));
> > > > unapply_uprobe(uprobe, current->mm);
> > >
> > > ^^^ this remove case needs more thought.
> >
> > Yeah... that is why the current code doesn't use ->consumer_rwsem, iirc.
>
> AFAICT something like the below should work. Concurrent
> remove_breakpoint() should already be possible today and doesn't appear
> to be a problem.
Sorry, I don't understand how this patch can help. Yes, it removes the
uprobe->consumers != NULL check, but this is minor.
To simplify, suppose we have a single consumer which is not interested
in this task/mm, it returns UPROBE_HANDLER_REMOVE.
For example, event->hw.target != NULL and the current task is the forked
child which hits the breakpoint copied by dup_mmap().
Now. We need to ensure that another (say system-wide) consumer can't come
and call register_for_each_vma() before unapply_uprobe().
But perhaps I missed your point...
Oleg.
> --- a/kernel/events/uprobes.c
> +++ b/kernel/events/uprobes.c
> @@ -1260,6 +1260,10 @@ int uprobe_apply(struct inode *inode, lo
> return ret;
> }
>
> +/*
> + * Can race against uprobe_unregister() / register_for_each_vma(), and relies
> + * on duplicate remove_breakpoint() being a no-op.
> + */
> static int unapply_uprobe(struct uprobe *uprobe, struct mm_struct *mm)
> {
> VMA_ITERATOR(vmi, mm, 0);
> @@ -2101,6 +2105,7 @@ static void handler_chain(struct uprobe
> struct uprobe_consumer *uc;
> int remove = UPROBE_HANDLER_REMOVE;
> bool need_prep = false; /* prepare return uprobe, when needed */
> + bool had_handler = false;
>
> down_read(&uprobe->register_rwsem);
> for (uc = uprobe->consumers; uc; uc = uc->next) {
> @@ -2115,16 +2120,26 @@ static void handler_chain(struct uprobe
> if (uc->ret_handler)
> need_prep = true;
>
> + /*
> + * A single handler that does not mask out REMOVE, means the
> + * probe stays.
> + */
> + had_handler = true;
> remove &= rc;
> }
>
> + /*
> + * If there were no handlers called, nobody asked for it to be removed
> + * but also nobody got to mask the value. Fix it up.
> + */
> + if (!had_handler)
> + remove = 0;
> +
> if (need_prep && !remove)
> prepare_uretprobe(uprobe, regs); /* put bp at return */
>
> - if (remove && uprobe->consumers) {
> - WARN_ON(!uprobe_is_active(uprobe));
> + if (remove)
> unapply_uprobe(uprobe, current->mm);
> - }
> up_read(&uprobe->register_rwsem);
> }
>
>
^ permalink raw reply [flat|nested] 84+ messages in thread
* Re: [PATCH 05/10] perf/uprobe: SRCU-ify uprobe->consumer list
2024-07-09 15:05 ` Oleg Nesterov
@ 2024-07-09 15:19 ` Peter Zijlstra
2024-07-09 15:34 ` Oleg Nesterov
0 siblings, 1 reply; 84+ messages in thread
From: Peter Zijlstra @ 2024-07-09 15:19 UTC (permalink / raw)
To: Oleg Nesterov
Cc: mingo, andrii, linux-kernel, rostedt, mhiramat, jolsa, clm,
paulmck
On Tue, Jul 09, 2024 at 05:05:04PM +0200, Oleg Nesterov wrote:
> To simplify, suppose we have a single consumer which is not interested
> in this task/mm, it returns UPROBE_HANDLER_REMOVE.
>
> For example, event->hw.target != NULL and the current task is the forked
> child which hits the breakpoint copied by dup_mmap().
>
> Now. We need to ensure that another (say system-wide) consumer can't come
> and call register_for_each_vma() before unapply_uprobe().
>
> But perhaps I missed your point...
Ooh, I see. I failed to consider that particular case. This is
interleaving uprobe_register() and handler_chain(). Silly me only looked
at uprobe_unregister() and handler_chain().
Hmm, easiest would be to add a seqcount to register_mutex and simply
skip the remove case when odd.
Then the handler might get a few extra (unwanted) calls, but it should
be able to handle them; they're fundamentally not different from the
first one where it says REMOVE. Right?
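Something along these lines, perhaps; a rough sketch with a made-up
->register_seq field (a seqcount tied to register_mutex), not a tested
patch:

	/* registration side, under uprobe->register_mutex */
	write_seqcount_begin(&uprobe->register_seq);
	ret = register_for_each_vma(uprobe, uc);
	write_seqcount_end(&uprobe->register_seq);

	/* handler_chain() side, instead of removing unconditionally */
	if (remove) {
		/* odd count: a registration is in flight, skip the removal */
		if (!(raw_read_seqcount(&uprobe->register_seq) & 1))
			unapply_uprobe(uprobe, current->mm);
	}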
^ permalink raw reply [flat|nested] 84+ messages in thread
* Re: [PATCH 04/10] perf/uprobe: RCU-ify find_uprobe()
2024-07-09 15:02 ` Peter Zijlstra
@ 2024-07-09 15:23 ` Oleg Nesterov
0 siblings, 0 replies; 84+ messages in thread
From: Oleg Nesterov @ 2024-07-09 15:23 UTC (permalink / raw)
To: Peter Zijlstra
Cc: mingo, andrii, linux-kernel, rostedt, mhiramat, jolsa, clm,
paulmck
On 07/09, Peter Zijlstra wrote:
> On Tue, Jul 09, 2024 at 04:32:55PM +0200, Oleg Nesterov wrote:
>
> > Once guard(srcu)(&uprobes_srcu) in handle_swbp() drops the uprobes_srcu lock,
> > utask->active_uprobe can be freed.
>
> Yeah, I've fixed all those already. It's a bit of churn, adding
> inc_not_zero all over the place
I am wondering if we can move delayed_uprobe_remove() from put_uprobe()
to delete_uprobe()... probably not, I forgot everything.
But if we can, then we can probably do
put_uprobe(uprobe)
{
        if (refcount_dec_and_test(&uprobe->ref))
                kfree(uprobe);
}

uprobe_put_rcu(struct rcu_head *rcu)
{
        uprobe = container_of(...);
        put_uprobe(uprobe);
}

delete_uprobe(uprobe)
{
        rb_erase(...);
        delayed_uprobe_remove(...);
        ...
        call_srcu(&uprobes_srcu, &uprobe->rcu, uprobe_put_rcu);
}
and avoid inc_not_zero.
Not sure, I am already exhausted ;)
Oleg.
^ permalink raw reply [flat|nested] 84+ messages in thread
* Re: [PATCH 00/10] perf/uprobe: Optimize uprobes
2024-07-09 14:36 ` Paul E. McKenney
@ 2024-07-09 15:31 ` Peter Zijlstra
2024-07-09 15:56 ` Paul E. McKenney
0 siblings, 1 reply; 84+ messages in thread
From: Peter Zijlstra @ 2024-07-09 15:31 UTC (permalink / raw)
To: Paul E. McKenney
Cc: Andrii Nakryiko, Masami Hiramatsu, mingo, andrii, linux-kernel,
rostedt, oleg, jolsa, clm, bpf, willy
On Tue, Jul 09, 2024 at 07:36:41AM -0700, Paul E. McKenney wrote:
> > Per VMA refcounts or per VMA locks are a complete fail IMO.
>
> Not even to allow concurrent updates of the address space by different
> threads of a process?
Well, I'm sure it helps some workloads. But for others it is just moving
the problem.
> For me, per-VMA locking's need to RCU-protect the VMA is a good step
> towards permitting RCU-protected scans of the Maple Tree, which then
> gets lockless lookup.
Right, the question is if the VMA lock is required to be stable against
splitting. If that is the case, we're hosed :/
At the time I added a seqcount for that, but I'm also remembering that's
one of the things people complained about for single threaded
performance.
^ permalink raw reply [flat|nested] 84+ messages in thread
* Re: [PATCH 05/10] perf/uprobe: SRCU-ify uprobe->consumer list
2024-07-09 15:19 ` Peter Zijlstra
@ 2024-07-09 15:34 ` Oleg Nesterov
0 siblings, 0 replies; 84+ messages in thread
From: Oleg Nesterov @ 2024-07-09 15:34 UTC (permalink / raw)
To: Peter Zijlstra
Cc: mingo, andrii, linux-kernel, rostedt, mhiramat, jolsa, clm,
paulmck
On 07/09, Peter Zijlstra wrote:
>
> Hmm, easiest would be to add a seqcount to register_mutex and simply
> skip the remove case when odd.
probably yes,
> Then the handler might get a few extra (unwanted) calls, but it should
> be able to handle them, they're fundamentally not different from the
> first one where it says REMOVE. Right?
plus the handler can never assume that UPROBE_HANDLER_REMOVE won't be
ignored; another consumer may want to trace this task.
Oleg.
^ permalink raw reply [flat|nested] 84+ messages in thread
* Re: [PATCH 00/10] perf/uprobe: Optimize uprobes
2024-07-09 15:31 ` Peter Zijlstra
@ 2024-07-09 15:56 ` Paul E. McKenney
0 siblings, 0 replies; 84+ messages in thread
From: Paul E. McKenney @ 2024-07-09 15:56 UTC (permalink / raw)
To: Peter Zijlstra
Cc: Andrii Nakryiko, Masami Hiramatsu, mingo, andrii, linux-kernel,
rostedt, oleg, jolsa, clm, bpf, willy
On Tue, Jul 09, 2024 at 05:31:32PM +0200, Peter Zijlstra wrote:
> On Tue, Jul 09, 2024 at 07:36:41AM -0700, Paul E. McKenney wrote:
>
> > > Per VMA refcounts or per VMA locks are a complete fail IMO.
> >
> > Not even to allow concurrent updates of the address space by different
> > threads of a process?
>
> Well, I'm sure it helps some workloads. But for others it is just moving
> the problem.
From where I sit, helping a wide range of workloads is a good thing. ;-)
> > For me, per-VMA locking's need to RCU-protect the VMA is a good step
> > towards permitting RCU-protected scans of the Maple Tree, which then
> > gets lockless lookup.
>
> Right, the question is if the VMA lock is required to be stable against
> splitting. If that is the case, we're hosed :/
Let's just say that VMA splitting and merging has consumed much time
and effort from the usual suspects over the past while.
> At the time I added a seqcount for that, but I'm also remembering that's
> one of the things people complained about for single threaded
> performance.
Sequence locks are lighter weight these days, but again, this has been
very much a long-term whack-a-mole exercise with odd regressions.
Thanx, Paul
^ permalink raw reply [flat|nested] 84+ messages in thread
* Re: [PATCH 00/10] perf/uprobe: Optimize uprobes
2024-07-09 14:29 ` Peter Zijlstra
2024-07-09 14:36 ` Paul E. McKenney
@ 2024-07-09 16:10 ` Matthew Wilcox
2024-07-09 16:30 ` Matthew Wilcox
2024-07-10 9:16 ` Peter Zijlstra
1 sibling, 2 replies; 84+ messages in thread
From: Matthew Wilcox @ 2024-07-09 16:10 UTC (permalink / raw)
To: Peter Zijlstra
Cc: Paul E. McKenney, Andrii Nakryiko, Masami Hiramatsu, mingo,
andrii, linux-kernel, rostedt, oleg, jolsa, clm, bpf
On Tue, Jul 09, 2024 at 04:29:43PM +0200, Peter Zijlstra wrote:
> On Tue, Jul 09, 2024 at 07:11:23AM -0700, Paul E. McKenney wrote:
> > On Tue, Jul 09, 2024 at 11:01:53AM +0200, Peter Zijlstra wrote:
> > > On Mon, Jul 08, 2024 at 05:25:14PM -0700, Andrii Nakryiko wrote:
> > >
> > > > Quick profiling for the 8-threaded benchmark shows that we spend >20%
> > > > in mmap_read_lock/mmap_read_unlock in find_active_uprobe. I think
> > > > that's what would prevent uprobes from scaling linearly. If you have
> > > > some good ideas on how to get rid of that, I think it would be
> > > > extremely beneficial.
> > >
> > > That's find_vma() and friends. I started RCU-ifying that a *long* time
> > > ago when I started the speculative page fault patches. I sorta lost
> > > track of that effort, Willy where are we with that?
Probably best to start with lock_vma_under_rcu() in mm/memory.c.
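For the uprobe path the shape would be roughly the below; a sketch only
(error handling and the fallback details omitted), mirroring how the
page-fault path uses it:

	vma = lock_vma_under_rcu(mm, bp_vaddr);
	if (vma) {
		/* per-VMA read lock held, mmap_lock not taken */
		if (valid_vma(vma, false) && vma->vm_file)
			uprobe = find_uprobe(file_inode(vma->vm_file),
					     vaddr_to_offset(vma, bp_vaddr));
		vma_end_read(vma);
	} else {
		/* fall back to the existing mmap_read_lock() path */
		mmap_read_lock(mm);
		...
	}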
> > > Specifically, how feasible would it be to get a simple RCU based
> > > find_vma() version sorted these days?
> >
> > Liam's and Willy's Maple Tree work, combined with Suren's per-VMA locking
> > combined with some of Vlastimil's slab work is pushing in that direction.
> > I believe that things are getting pretty close.
>
> So I fundamentally do not believe in per-VMA locking. Specifically for
> this case that would be trading one hot line for another. I tried
> telling people that, but it doesn't seem to stick :/
SRCU also had its own performance problems, so we've got problems one
way or the other. The per-VMA lock probably doesn't work quite the way
you think it does, but it absolutely can be a hot cacheline.
I did propose a store-free variant at LSFMM 2022 and again at 2023,
but was voted down. https://lwn.net/Articles/932298/
I don't think the door is completely closed to a migration to that,
but it's a harder sell than what we've got. Of course, data helps ...
> Per VMA refcounts or per VMA locks are a complete fail IMO.
>
> I suppose I should go dig out the latest versions of those patches to
> see where they're at :/
Merged in v6.4 ;-P
^ permalink raw reply [flat|nested] 84+ messages in thread
* Re: [PATCH 00/10] perf/uprobe: Optimize uprobes
2024-07-09 16:10 ` Matthew Wilcox
@ 2024-07-09 16:30 ` Matthew Wilcox
2024-07-09 16:57 ` Paul E. McKenney
2024-07-10 9:16 ` Peter Zijlstra
1 sibling, 1 reply; 84+ messages in thread
From: Matthew Wilcox @ 2024-07-09 16:30 UTC (permalink / raw)
To: Peter Zijlstra
Cc: Paul E. McKenney, Andrii Nakryiko, Masami Hiramatsu, mingo,
andrii, linux-kernel, rostedt, oleg, jolsa, clm, bpf
On Tue, Jul 09, 2024 at 05:10:45PM +0100, Matthew Wilcox wrote:
> > So I fundamentally do not believe in per-VMA locking. Specifically for
> > this case that would be trading one hot line for another. I tried
> > telling people that, but it doesn't seem to stick :/
>
> SRCU also had its own performance problems, so we've got problems one
> way or the other. The per-VMA lock probably doesn't work quite the way
> you think it does, but it absolutely can be a hot cacheline.
>
> I did propose a store-free variant at LSFMM 2022 and again at 2023,
> but was voted down. https://lwn.net/Articles/932298/
Actually, the 2022 version has a bit more of the flavour of the
argument: https://lwn.net/Articles/893906/
^ permalink raw reply [flat|nested] 84+ messages in thread
* Re: [PATCH 00/10] perf/uprobe: Optimize uprobes
2024-07-09 14:11 ` Paul E. McKenney
2024-07-09 14:29 ` Peter Zijlstra
@ 2024-07-09 16:42 ` Andrii Nakryiko
1 sibling, 0 replies; 84+ messages in thread
From: Andrii Nakryiko @ 2024-07-09 16:42 UTC (permalink / raw)
To: paulmck
Cc: Peter Zijlstra, Masami Hiramatsu, mingo, andrii, linux-kernel,
rostedt, oleg, jolsa, clm, bpf, willy
On Tue, Jul 9, 2024 at 7:11 AM Paul E. McKenney <paulmck@kernel.org> wrote:
>
> On Tue, Jul 09, 2024 at 11:01:53AM +0200, Peter Zijlstra wrote:
> > On Mon, Jul 08, 2024 at 05:25:14PM -0700, Andrii Nakryiko wrote:
> >
> > > Quick profiling for the 8-threaded benchmark shows that we spend >20%
> > > in mmap_read_lock/mmap_read_unlock in find_active_uprobe. I think
> > > that's what would prevent uprobes from scaling linearly. If you have
> > > some good ideas on how to get rid of that, I think it would be
> > > extremely beneficial.
> >
> > That's find_vma() and friends. I started RCU-ifying that a *long* time
> > ago when I started the speculative page fault patches. I sorta lost
> > track of that effort, Willy where are we with that?
> >
> > Specifically, how feasible would it be to get a simple RCU based
> > find_vma() version sorted these days?
>
> Liam's and Willy's Maple Tree work, combined with Suren's per-VMA locking
> combined with some of Vlastimil's slab work is pushing in that direction.
> I believe that things are getting pretty close.
Yep, I realized that this might be a solution right after asking the
question :) I've been recently exposed to per-VMA locking, so I might
take a look at this later.
>
> As with your work in 2010 and MIT guy's work in 2013, corner-case
> correctness and corner-case performance regressions continue to provide
> quite a bit of good clean fun.
>
> Thanx, Paul
^ permalink raw reply [flat|nested] 84+ messages in thread
* Re: [PATCH 00/10] perf/uprobe: Optimize uprobes
2024-07-09 16:30 ` Matthew Wilcox
@ 2024-07-09 16:57 ` Paul E. McKenney
0 siblings, 0 replies; 84+ messages in thread
From: Paul E. McKenney @ 2024-07-09 16:57 UTC (permalink / raw)
To: Matthew Wilcox
Cc: Peter Zijlstra, Andrii Nakryiko, Masami Hiramatsu, mingo, andrii,
linux-kernel, rostedt, oleg, jolsa, clm, bpf
On Tue, Jul 09, 2024 at 05:30:38PM +0100, Matthew Wilcox wrote:
> On Tue, Jul 09, 2024 at 05:10:45PM +0100, Matthew Wilcox wrote:
> > > So I fundamentally do not believe in per-VMA locking. Specifically for
> > > this case that would be trading one hot line for another. I tried
> > > telling people that, but it doesn't seem to stick :/
> >
> > SRCU also had its own performance problems, so we've got problems one
> > way or the other. The per-VMA lock probably doesn't work quite the way
> > you think it does, but it absoutely can be a hot cacheline.
> >
> > I did propose a store-free variant at LSFMM 2022 and again at 2023,
> > but was voted down. https://lwn.net/Articles/932298/
>
> Actually, the 2022 version has a bit more of the flavour of the
> argument: https://lwn.net/Articles/893906/
Thank you for the citations!
From what I can see, part of the problem is that applications commonly
running on Linux have worked around many of these limitations one way or
another, which makes it harder to find a real-world use case for which
the store-free lookup gives substantial benefits. Perhaps this uprobes
case will be helpful in that regard. (Hey, I can dream, can't I?)
Thanx, Paul
^ permalink raw reply [flat|nested] 84+ messages in thread
* Re: [PATCH 00/10] perf/uprobe: Optimize uprobes
2024-07-09 0:25 ` Andrii Nakryiko
2024-07-09 9:01 ` Peter Zijlstra
2024-07-09 9:03 ` Peter Zijlstra
@ 2024-07-09 21:47 ` Andrii Nakryiko
2024-07-10 10:12 ` Peter Zijlstra
2 siblings, 1 reply; 84+ messages in thread
From: Andrii Nakryiko @ 2024-07-09 21:47 UTC (permalink / raw)
To: Masami Hiramatsu, Peter Zijlstra
Cc: mingo, andrii, linux-kernel, rostedt, oleg, jolsa, clm, paulmck,
bpf
On Mon, Jul 8, 2024 at 5:25 PM Andrii Nakryiko
<andrii.nakryiko@gmail.com> wrote:
>
> On Mon, Jul 8, 2024 at 3:56 PM Masami Hiramatsu <mhiramat@kernel.org> wrote:
> >
> > On Mon, 08 Jul 2024 11:12:41 +0200
> > Peter Zijlstra <peterz@infradead.org> wrote:
> >
> > > Hi!
> > >
> > > These patches implement the (S)RCU based proposal to optimize uprobes.
> > >
> > > On my c^Htrusty old IVB-EP -- where each (of the 40) CPU calls 'func' in a
> > > tight loop:
> > >
> > > perf probe -x ./uprobes test=func
> > > perf stat -ae probe_uprobe:test -- sleep 1
> > >
> > > perf probe -x ./uprobes test=func%return
> > > perf stat -ae probe_uprobe:test__return -- sleep 1
> > >
> > > PRE:
> > >
> > > 4,038,804 probe_uprobe:test
> > > 2,356,275 probe_uprobe:test__return
> > >
> > > POST:
> > >
> > > 7,216,579 probe_uprobe:test
> > > 6,744,786 probe_uprobe:test__return
> > >
> >
> > Good results! So this is another series of Andrii's batch register?
> > (but maybe it becomes simpler)
>
> yes, this would be an alternative to my patches
>
>
> Peter,
>
> I didn't have time to look at the patches just yet, but I managed to
> run a quick benchmark (using bench tool we have as part of BPF
> selftests) to see both single-threaded performance and how the
> performance scales with CPUs (now that we are not bottlenecked on
> register_rwsem). Here are some results:
Running in my local VM with debugging config, I'm getting the
following. Please check if that's something new or it's just another
symptom of the issues that Oleg pointed out already.
[ 11.213834] ================================================
[ 11.214225] WARNING: lock held when returning to user space!
[ 11.214603] 6.10.0-rc6-gd3f5cbffe86b #1263 Tainted: G OE
[ 11.215040] ------------------------------------------------
[ 11.215426] urandom_read/2412 is leaving the kernel with locks still held!
[ 11.215876] 1 lock held by urandom_read/2412:
[ 11.216175] #0: ffffffff835ce8f0 (uretprobes_srcu){.+.+}-{0:0},
at: srcu_read_lock+0x31/0x3f
[ 11.262797] ------------[ cut here ]------------
[ 11.263162] refcount_t: underflow; use-after-free.
[ 11.263474] WARNING: CPU: 1 PID: 2409 at lib/refcount.c:28
refcount_warn_saturate+0x99/0xda
[ 11.263995] Modules linked in: bpf_testmod(OE) aesni_intel(E)
crypto_simd(E) floppy(E) cryptd(E) i2c_piix4(E) crc32c_intel(E)
button(E) i2c_core(E) i6300esb(E) pcspkr(E) serio_raw(E)
[ 11.265105] CPU: 1 PID: 2409 Comm: test_progs Tainted: G
OE 6.10.0-rc6-gd3f5cbffe86b #1263
[ 11.265740] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996),
BIOS rel-1.14.0-0-g155821a1990b-prebuilt.qemu.org 04/01/2014
[ 11.266507] RIP: 0010:refcount_warn_saturate+0x99/0xda
[ 11.266862] Code: 05 ba 29 1d 02 01 e8 e2 c0 b4 ff 0f 0b c3 80 3d
aa 29 1d 02 00 75 53 48 c7 c7 20 59 50 82 c6 05 9a 29 1d 02 01 e8 c3
c0 b4 ff <0f> 0b c3 80 3d 8a 29 1d 02 00 75 34 a
[ 11.268099] RSP: 0018:ffffc90001fbbd60 EFLAGS: 00010282
[ 11.268451] RAX: 0000000000000026 RBX: ffff88810f333000 RCX: 0000000000000027
[ 11.268931] RDX: 0000000000000000 RSI: ffffffff82580a45 RDI: 00000000ffffffff
[ 11.269417] RBP: ffff888105937818 R08: 0000000000000000 R09: 0000000000000000
[ 11.269910] R10: 00000000756f6366 R11: 0000000063666572 R12: ffff88810f333030
[ 11.270387] R13: ffffc90001fbbb80 R14: ffff888100535190 R15: dead000000000100
[ 11.270870] FS: 00007fc938bd2d00(0000) GS:ffff88881f680000(0000)
knlGS:0000000000000000
[ 11.271363] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[ 11.271725] CR2: 000000000073a005 CR3: 00000001127d5004 CR4: 0000000000370ef0
[ 11.272220] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
[ 11.272693] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
[ 11.273182] Call Trace:
[ 11.273370] <TASK>
[ 11.273518] ? __warn+0x8b/0x14d
[ 11.273753] ? report_bug+0xdb/0x151
[ 11.273997] ? refcount_warn_saturate+0x99/0xda
[ 11.274326] ? handle_bug+0x3c/0x5b
[ 11.274564] ? exc_invalid_op+0x13/0x5c
[ 11.274831] ? asm_exc_invalid_op+0x16/0x20
[ 11.275119] ? refcount_warn_saturate+0x99/0xda
[ 11.275428] uprobe_unregister_nosync+0x61/0x7c
[ 11.275768] __probe_event_disable+0x5d/0x7d
[ 11.276069] probe_event_disable+0x50/0x58
[ 11.276350] trace_uprobe_register+0x4f/0x1a7
[ 11.276667] perf_trace_event_unreg.isra.0+0x1e/0x6b
[ 11.277031] perf_uprobe_destroy+0x26/0x4b
[ 11.277312] _free_event+0x2ad/0x311
[ 11.277558] perf_event_release_kernel+0x210/0x221
[ 11.277887] ? lock_acquire+0x8d/0x266
[ 11.278147] perf_release+0x11/0x14
[ 11.278384] __fput+0x133/0x1fb
[ 11.278604] task_work_run+0x67/0x8b
[ 11.278858] resume_user_mode_work+0x22/0x4a
[ 11.279153] syscall_exit_to_user_mode+0x86/0xdd
[ 11.279465] do_syscall_64+0xa1/0xfb
[ 11.279733] entry_SYSCALL_64_after_hwframe+0x4b/0x53
[ 11.280074] RIP: 0033:0x7fc938da5a94
[ 11.280322] Code: 00 f7 d8 64 89 01 48 83 c8 ff c3 66 2e 0f 1f 84
00 00 00 00 00 90 f3 0f 1e fa 80 3d d5 18 0e 00 00 74 13 b8 03 00 00
00 0f 05 <48> 3d 00 f0 ff ff 77 44 c3 0f 1f 00 3
[ 11.281538] RSP: 002b:00007ffd3553e6a8 EFLAGS: 00000202 ORIG_RAX:
0000000000000003
[ 11.282003] RAX: 0000000000000000 RBX: 00007ffd3553eb58 RCX: 00007fc938da5a94
[ 11.282498] RDX: 0000000000000011 RSI: 0000000000002401 RDI: 0000000000000012
[ 11.282981] RBP: 00007ffd3553e6e0 R08: 0000000004a90425 R09: 0000000000000007
[ 11.283451] R10: 0000000004a95050 R11: 0000000000000202 R12: 0000000000000000
[ 11.283894] R13: 00007ffd3553eb88 R14: 00007fc938f01000 R15: 0000000000ffad90
[ 11.284387] </TASK>
[ 11.284540] irq event stamp: 70926
[ 11.284779] hardirqs last enabled at (70925): [<ffffffff81c93d2c>]
_raw_spin_unlock_irqrestore+0x30/0x5f
[ 11.285404] hardirqs last disabled at (70926): [<ffffffff81c8e5bd>]
__schedule+0x1ad/0xcab
[ 11.285943] softirqs last enabled at (70898): [<ffffffff811f0992>]
bpf_link_settle+0x2a/0x3b
[ 11.286506] softirqs last disabled at (70896): [<ffffffff811f097d>]
bpf_link_settle+0x15/0x3b
[ 11.287071] ---[ end trace 0000000000000000 ]---
[ 11.400528] ------------[ cut here ]------------
[ 11.400877] refcount_t: saturated; leaking memory.
[ 11.401214] WARNING: CPU: 2 PID: 2409 at lib/refcount.c:22
refcount_warn_saturate+0x5b/0xda
[ 11.401778] Modules linked in: bpf_testmod(OE) aesni_intel(E)
crypto_simd(E) floppy(E) cryptd(E) i2c_piix4(E) crc32c_intel(E)
button(E) i2c_core(E) i6300esb(E) pcspkr(E) serio_raw(E)
[ 11.402822] CPU: 2 PID: 2409 Comm: test_progs Tainted: G W
OE 6.10.0-rc6-gd3f5cbffe86b #1263
[ 11.403396] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996),
BIOS rel-1.14.0-0-g155821a1990b-prebuilt.qemu.org 04/01/2014
[ 11.404091] RIP: 0010:refcount_warn_saturate+0x5b/0xda
[ 11.404404] Code: 02 01 e8 24 c1 b4 ff 0f 0b c3 80 3d ee 29 1d 02
00 0f 85 91 00 00 00 48 c7 c7 bd 58 50 82 c6 05 da 29 1d 02 01 e8 01
c1 b4 ff <0f> 0b c3 80 3d ca 29 1d 02 00 75 72 a
[ 11.405578] RSP: 0018:ffffc90001fbbca0 EFLAGS: 00010286
[ 11.405906] RAX: 0000000000000026 RBX: ffff888103fb9400 RCX: 0000000000000027
[ 11.406397] RDX: 0000000000000000 RSI: ffffffff82580a45 RDI: 00000000ffffffff
[ 11.406875] RBP: ffff8881029b3400 R08: 0000000000000000 R09: 0000000000000000
[ 11.407345] R10: 00000000756f6366 R11: 0000000063666572 R12: ffff8881015ab5d8
[ 11.407827] R13: ffff888105d9b208 R14: 0000000000078060 R15: 0000000000000000
[ 11.408311] FS: 00007fc938bd2d00(0000) GS:ffff88881f700000(0000)
knlGS:0000000000000000
[ 11.408853] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[ 11.409254] CR2: 00007fffffffe080 CR3: 00000001127d5006 CR4: 0000000000370ef0
[ 11.409688] Call Trace:
[ 11.409852] <TASK>
[ 11.409988] ? __warn+0x8b/0x14d
[ 11.410199] ? report_bug+0xdb/0x151
[ 11.410422] ? refcount_warn_saturate+0x5b/0xda
[ 11.410748] ? handle_bug+0x3c/0x5b
[ 11.410987] ? exc_invalid_op+0x13/0x5c
[ 11.411254] ? asm_exc_invalid_op+0x16/0x20
[ 11.411538] ? refcount_warn_saturate+0x5b/0xda
[ 11.411857] __uprobe_register+0x185/0x2a2
[ 11.412140] ? __uprobe_perf_filter+0x3f/0x3f
[ 11.412465] probe_event_enable+0x265/0x2b8
[ 11.412753] perf_trace_event_init+0x174/0x1fd
[ 11.413029] perf_uprobe_init+0x8f/0xbc
[ 11.413268] perf_uprobe_event_init+0x52/0x64
[ 11.413538] perf_try_init_event+0x5c/0xe7
[ 11.413799] perf_event_alloc+0x4e1/0xaf5
[ 11.414047] ? _raw_spin_unlock+0x29/0x3a
[ 11.414299] ? alloc_fd+0x190/0x1a3
[ 11.414520] __do_sys_perf_event_open+0x28a/0x9c6
[ 11.414823] do_syscall_64+0x85/0xfb
[ 11.415047] entry_SYSCALL_64_after_hwframe+0x4b/0x53
[ 11.415403] RIP: 0033:0x7fc938db573d
[ 11.415648] Code: ff c3 66 2e 0f 1f 84 00 00 00 00 00 90 f3 0f 1e
fa 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24
08 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 8b 0d 8
[ 11.417177] RSP: 002b:00007ffd3553c328 EFLAGS: 00000286 ORIG_RAX:
000000000000012a
[ 11.417724] RAX: ffffffffffffffda RBX: 00007ffd3553eb58 RCX: 00007fc938db573d
[ 11.418233] RDX: 0000000000000000 RSI: 00000000ffffffff RDI: 00007ffd3553c3e0
[ 11.418726] RBP: 00007ffd3553c480 R08: 0000000000000008 R09: 0000000000000000
[ 11.419202] R10: 00000000ffffffff R11: 0000000000000286 R12: 0000000000000000
[ 11.419671] R13: 00007ffd3553eb88 R14: 00007fc938f01000 R15: 0000000000ffad90
[ 11.420151] </TASK>
[ 11.420291] irq event stamp: 70926
[ 11.420502] hardirqs last enabled at (70925): [<ffffffff81c93d2c>]
_raw_spin_unlock_irqrestore+0x30/0x5f
[ 11.421162] hardirqs last disabled at (70926): [<ffffffff81c8e5bd>]
__schedule+0x1ad/0xcab
[ 11.421717] softirqs last enabled at (70898): [<ffffffff811f0992>]
bpf_link_settle+0x2a/0x3b
[ 11.422286] softirqs last disabled at (70896): [<ffffffff811f097d>]
bpf_link_settle+0x15/0x3b
[ 11.422829] ---[ end trace 0000000000000000 ]---
>
> [root@kerneltest003.10.atn6 ~]# for num_threads in {1..20}; do ./bench \
> -a -d10 -p $num_threads trig-uprobe-nop | grep Summary; done
> Summary: hits 3.278 ± 0.021M/s ( 3.278M/prod)
> Summary: hits 4.364 ± 0.005M/s ( 2.182M/prod)
> Summary: hits 6.517 ± 0.011M/s ( 2.172M/prod)
> Summary: hits 8.203 ± 0.004M/s ( 2.051M/prod)
> Summary: hits 9.520 ± 0.012M/s ( 1.904M/prod)
> Summary: hits 8.316 ± 0.007M/s ( 1.386M/prod)
> Summary: hits 7.893 ± 0.037M/s ( 1.128M/prod)
> Summary: hits 8.490 ± 0.014M/s ( 1.061M/prod)
> Summary: hits 8.022 ± 0.005M/s ( 0.891M/prod)
> Summary: hits 8.471 ± 0.019M/s ( 0.847M/prod)
> Summary: hits 8.156 ± 0.021M/s ( 0.741M/prod)
> ...
>
>
> (the number in the first column is total throughput, and xxx/prod is
> per-thread throughput). Single-threaded performance (about 3.3 mln/s)
> is on par with what I had with my patches. And clearly it scales
> better with more threads now that register_rwsem is gone, though,
> unfortunately, it doesn't really scale linearly.
>
> Quick profiling for the 8-threaded benchmark shows that we spend >20%
> in mmap_read_lock/mmap_read_unlock in find_active_uprobe. I think
> that's what would prevent uprobes from scaling linearly. If you have
> some good ideas on how to get rid of that, I think it would be
> extremely beneficial. We also spend about 14% of the time in
> srcu_read_lock(). The rest is in interrupt handling overhead, actual
> user-space function overhead, and in uprobe_dispatcher() calls.
>
> Ramping this up to 16 threads shows that mmap_rwsem is getting more
> costly, up to 45% of CPU. SRCU is also growing a bit slower to 19% of
> CPU. Is this expected? (I'm not familiar with the implementation
> details)
>
> P.S. Would you be able to rebase your patches on top of latest
> probes/for-next, which include Jiri's sys_uretprobe changes. Right now
> uretprobe benchmarks are quite unrepresentative because of that.
> Thanks!
>
>
> >
> > Thank you,
> >
> > >
> > > Patches also available here:
> > >
> > > git://git.kernel.org/pub/scm/linux/kernel/git/peterz/queue.git perf/uprobes
> > >
> > >
> >
> >
> > --
> > Masami Hiramatsu (Google) <mhiramat@kernel.org>
^ permalink raw reply [flat|nested] 84+ messages in thread
* Re: [PATCH 00/10] perf/uprobe: Optimize uprobes
2024-07-09 10:16 ` Peter Zijlstra
@ 2024-07-09 22:10 ` Masami Hiramatsu
2024-07-10 10:10 ` Peter Zijlstra
0 siblings, 1 reply; 84+ messages in thread
From: Masami Hiramatsu @ 2024-07-09 22:10 UTC (permalink / raw)
To: Peter Zijlstra
Cc: Jiri Olsa, Andrii Nakryiko, Masami Hiramatsu, mingo, andrii,
linux-kernel, rostedt, oleg, clm, paulmck, bpf
On Tue, 9 Jul 2024 12:16:34 +0200
Peter Zijlstra <peterz@infradead.org> wrote:
> On Tue, Jul 09, 2024 at 12:01:03PM +0200, Jiri Olsa wrote:
> > On Tue, Jul 09, 2024 at 11:03:04AM +0200, Peter Zijlstra wrote:
> > > On Mon, Jul 08, 2024 at 05:25:14PM -0700, Andrii Nakryiko wrote:
> > >
> > > > Ramping this up to 16 threads shows that mmap_rwsem is getting more
> > > > costly, up to 45% of CPU. SRCU is also growing a bit slower to 19% of
> > > > CPU. Is this expected? (I'm not familiar with the implementation
> > > > details)
> > >
> > > SRCU getting more expensive is a bit unexpected, it's just a per-cpu
> > > inc/dec and a full barrier.
> > >
> > > > P.S. Would you be able to rebase your patches on top of latest
> > > > probes/for-next, which include Jiri's sys_uretprobe changes. Right now
> > > > uretprobe benchmarks are quite unrepresentative because of that.
> > >
> > > What branch is that? kernel/events/ stuff usually goes through tip, no?
> >
> > it went through the trace tree:
> >
> > https://git.kernel.org/pub/scm/linux/kernel/git/trace/linux-trace.git probes/for-next
> >
> > and it's in linux-next/master already
>
> FFS :-/ That touches all sorts and doesn't have any perf ack on. Masami
> what gives?
This is managing *probes and the related dynamic trace-events. Those have
been moved from tip. Could you also add the linux-trace-kernel@vger ML to CC?
Thank you,
--
Masami Hiramatsu (Google) <mhiramat@kernel.org>
^ permalink raw reply [flat|nested] 84+ messages in thread
* Re: [PATCH 00/10] perf/uprobe: Optimize uprobes
2024-07-09 9:03 ` Peter Zijlstra
2024-07-09 10:01 ` Jiri Olsa
@ 2024-07-10 0:55 ` Masami Hiramatsu
1 sibling, 0 replies; 84+ messages in thread
From: Masami Hiramatsu @ 2024-07-10 0:55 UTC (permalink / raw)
To: Peter Zijlstra
Cc: Andrii Nakryiko, Masami Hiramatsu, mingo, andrii, linux-kernel,
rostedt, oleg, jolsa, clm, paulmck, bpf
On Tue, 9 Jul 2024 11:03:04 +0200
Peter Zijlstra <peterz@infradead.org> wrote:
> On Mon, Jul 08, 2024 at 05:25:14PM -0700, Andrii Nakryiko wrote:
>
> > Ramping this up to 16 threads shows that mmap_rwsem is getting more
> > costly, up to 45% of CPU. SRCU is also growing a bit slower to 19% of
> > CPU. Is this expected? (I'm not familiar with the implementation
> > details)
>
> SRCU getting more expensive is a bit unexpected, it's just a per-cpu
> inc/dec and a full barrier.
>
> > P.S. Would you be able to rebase your patches on top of latest
> > probes/for-next, which include Jiri's sys_uretprobe changes. Right now
> > uretprobe benchmarks are quite unrepresentative because of that.
>
> What branch is that? kernel/events/ stuff usually goes through tip, no?
I'm handling uprobe patches in the linux-trace tree, because they are a
kind of probe in the kernel. It was never really clarified which tree
uprobes should go through; I had asked to handle kprobes in linux-trace,
but uprobes were left ambiguous. Is it OK with you to handle uprobes in
linux-trace?
Thank you,
--
Masami Hiramatsu (Google) <mhiramat@kernel.org>
^ permalink raw reply [flat|nested] 84+ messages in thread
* Re: [PATCH 03/10] rbtree: Provide rb_find_rcu() / rb_find_add_rcu()
2024-07-08 9:12 ` [PATCH 03/10] rbtree: Provide rb_find_rcu() / rb_find_add_rcu() Peter Zijlstra
@ 2024-07-10 1:29 ` Masami Hiramatsu
2024-07-10 7:55 ` Peter Zijlstra
0 siblings, 1 reply; 84+ messages in thread
From: Masami Hiramatsu @ 2024-07-10 1:29 UTC (permalink / raw)
To: Peter Zijlstra
Cc: mingo, andrii, linux-kernel, rostedt, mhiramat, oleg, jolsa, clm,
paulmck
On Mon, 08 Jul 2024 11:12:44 +0200
Peter Zijlstra <peterz@infradead.org> wrote:
> Much like latch_tree, add two RCU methods for the regular RB-tree,
> which can be used in conjunction with a seqcount to provide lockless
> lookups.
>
> Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
> ---
> include/linux/rbtree.h | 67 +++++++++++++++++++++++++++++++++++++++++++++++++
> 1 file changed, 67 insertions(+)
>
> --- a/include/linux/rbtree.h
> +++ b/include/linux/rbtree.h
> @@ -245,6 +245,42 @@ rb_find_add(struct rb_node *node, struct
> }
>
> /**
> + * rb_find_add_rcu() - find equivalent @node in @tree, or add @node
> + * @node: node to look-for / insert
> + * @tree: tree to search / modify
> + * @cmp: operator defining the node order
> + *
> + * Adds a Store-Release for link_node.
> + *
> + * Returns the rb_node matching @node, or NULL when no match is found and @node
> + * is inserted.
> + */
> +static __always_inline struct rb_node *
> +rb_find_add_rcu(struct rb_node *node, struct rb_root *tree,
> + int (*cmp)(struct rb_node *, const struct rb_node *))
> +{
> + struct rb_node **link = &tree->rb_node;
> + struct rb_node *parent = NULL;
> + int c;
> +
> + while (*link) {
Don't we need to use rcu_dereference_raw(*link) here?
> + parent = *link;
> + c = cmp(node, parent);
> +
> + if (c < 0)
> + link = &parent->rb_left;
> + else if (c > 0)
> + link = &parent->rb_right;
> + else
> + return parent;
> + }
> +
> + rb_link_node_rcu(node, parent, link);
> + rb_insert_color(node, tree);
> + return NULL;
> +}
> +
> +/**
> * rb_find() - find @key in tree @tree
> * @key: key to match
> * @tree: tree to search
> @@ -268,6 +304,37 @@ rb_find(const void *key, const struct rb
> else
> return node;
> }
> +
> + return NULL;
> +}
> +
> +/**
> + * rb_find_rcu() - find @key in tree @tree
> + * @key: key to match
> + * @tree: tree to search
> + * @cmp: operator defining the node order
> + *
> + * Notably, tree descent vs concurrent tree rotations is unsound and can result
> + * in false-negatives.
> + *
> + * Returns the rb_node matching @key or NULL.
> + */
> +static __always_inline struct rb_node *
> +rb_find_rcu(const void *key, const struct rb_root *tree,
> + int (*cmp)(const void *key, const struct rb_node *))
> +{
> + struct rb_node *node = tree->rb_node;
> +
> + while (node) {
> + int c = cmp(key, node);
> +
> + if (c < 0)
> + node = rcu_dereference_raw(node->rb_left);
> + else if (c > 0)
> + node = rcu_dereference_raw(node->rb_right);
> + else
> + return node;
> + }
>
> return NULL;
> }
>
>
--
Masami Hiramatsu (Google) <mhiramat@kernel.org>
^ permalink raw reply [flat|nested] 84+ messages in thread
* Re: [PATCH 06/10] perf/uprobe: Split uprobe_unregister()
2024-07-08 9:12 ` [PATCH 06/10] perf/uprobe: Split uprobe_unregister() Peter Zijlstra
@ 2024-07-10 1:40 ` Masami Hiramatsu
0 siblings, 0 replies; 84+ messages in thread
From: Masami Hiramatsu @ 2024-07-10 1:40 UTC (permalink / raw)
To: Peter Zijlstra
Cc: mingo, andrii, linux-kernel, rostedt, mhiramat, oleg, jolsa, clm,
paulmck
On Mon, 08 Jul 2024 11:12:47 +0200
Peter Zijlstra <peterz@infradead.org> wrote:
> With uprobe_unregister() having grown a synchronize_srcu(), it becomes
> fairly slow to call. Esp. since both users of this API call it in a
> loop.
>
> Peel off the sync_srcu() and do it once, after the loop.
>
> Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Looks good to me.
Acked-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
Thanks!
> ---
> include/linux/uprobes.h | 8 ++++++--
> kernel/events/uprobes.c | 8 ++++++--
> kernel/trace/bpf_trace.c | 6 ++++--
> kernel/trace/trace_uprobe.c | 6 +++++-
> 4 files changed, 21 insertions(+), 7 deletions(-)
>
> --- a/include/linux/uprobes.h
> +++ b/include/linux/uprobes.h
> @@ -113,7 +113,8 @@ extern int uprobe_write_opcode(struct ar
> extern int uprobe_register(struct inode *inode, loff_t offset, struct uprobe_consumer *uc);
> extern int uprobe_register_refctr(struct inode *inode, loff_t offset, loff_t ref_ctr_offset, struct uprobe_consumer *uc);
> extern int uprobe_apply(struct inode *inode, loff_t offset, struct uprobe_consumer *uc, bool);
> -extern void uprobe_unregister(struct inode *inode, loff_t offset, struct uprobe_consumer *uc);
> +extern void uprobe_unregister_nosync(struct inode *inode, loff_t offset, struct uprobe_consumer *uc);
> +extern void uprobe_unregister_sync(void);
> extern int uprobe_mmap(struct vm_area_struct *vma);
> extern void uprobe_munmap(struct vm_area_struct *vma, unsigned long start, unsigned long end);
> extern void uprobe_start_dup_mmap(void);
> @@ -163,7 +164,10 @@ uprobe_apply(struct inode *inode, loff_t
> return -ENOSYS;
> }
> static inline void
> -uprobe_unregister(struct inode *inode, loff_t offset, struct uprobe_consumer *uc)
> +uprobe_unregister_nosync(struct inode *inode, loff_t offset, struct uprobe_consumer *uc)
> +{
> +}
> +static inline void uprobe_unregister_sync(void)
> {
> }
> static inline int uprobe_mmap(struct vm_area_struct *vma)
> --- a/kernel/events/uprobes.c
> +++ b/kernel/events/uprobes.c
> @@ -1126,7 +1126,7 @@ __uprobe_unregister(struct uprobe *uprob
> * @offset: offset from the start of the file.
> * @uc: identify which probe if multiple probes are colocated.
> */
> -void uprobe_unregister(struct inode *inode, loff_t offset, struct uprobe_consumer *uc)
> +void uprobe_unregister_nosync(struct inode *inode, loff_t offset, struct uprobe_consumer *uc)
> {
> struct uprobe *uprobe;
>
> @@ -1138,10 +1138,14 @@ void uprobe_unregister(struct inode *ino
> __uprobe_unregister(uprobe, uc);
> mutex_unlock(&uprobe->register_mutex);
> put_uprobe(uprobe);
> +}
> +EXPORT_SYMBOL_GPL(uprobe_unregister_nosync);
>
> +void uprobe_unregister_sync(void)
> +{
> synchronize_srcu(&uprobes_srcu);
> }
> -EXPORT_SYMBOL_GPL(uprobe_unregister);
> +EXPORT_SYMBOL_GPL(uprobe_unregister_sync);
>
> /*
> * __uprobe_register - register a probe
> --- a/kernel/trace/bpf_trace.c
> +++ b/kernel/trace/bpf_trace.c
> @@ -3181,9 +3181,11 @@ static void bpf_uprobe_unregister(struct
> u32 i;
>
> for (i = 0; i < cnt; i++) {
> - uprobe_unregister(d_real_inode(path->dentry), uprobes[i].offset,
> - &uprobes[i].consumer);
> + uprobe_unregister_nosync(d_real_inode(path->dentry), uprobes[i].offset,
> + &uprobes[i].consumer);
> }
> + if (cnt)
> + uprobe_unregister_sync();
> }
>
> static void bpf_uprobe_multi_link_release(struct bpf_link *link)
> --- a/kernel/trace/trace_uprobe.c
> +++ b/kernel/trace/trace_uprobe.c
> @@ -1104,6 +1104,7 @@ static int trace_uprobe_enable(struct tr
> static void __probe_event_disable(struct trace_probe *tp)
> {
> struct trace_uprobe *tu;
> + bool sync = false;
>
> tu = container_of(tp, struct trace_uprobe, tp);
> WARN_ON(!uprobe_filter_is_empty(tu->tp.event->filter));
> @@ -1112,9 +1113,12 @@ static void __probe_event_disable(struct
> if (!tu->inode)
> continue;
>
> - uprobe_unregister(tu->inode, tu->offset, &tu->consumer);
> + uprobe_unregister_nosync(tu->inode, tu->offset, &tu->consumer);
> + sync = true;
> tu->inode = NULL;
> }
> + if (sync)
> + uprobe_unregister_sync();
> }
>
> static int probe_event_enable(struct trace_event_call *call,
>
>
--
Masami Hiramatsu (Google) <mhiramat@kernel.org>
^ permalink raw reply [flat|nested] 84+ messages in thread
* Re: [PATCH 08/10] srcu: Add __srcu_clone_read_lock()
2024-07-08 9:12 ` [PATCH 08/10] srcu: Add __srcu_clone_read_lock() Peter Zijlstra
@ 2024-07-10 5:48 ` Boqun Feng
2024-07-10 10:02 ` Peter Zijlstra
0 siblings, 1 reply; 84+ messages in thread
From: Boqun Feng @ 2024-07-10 5:48 UTC (permalink / raw)
To: Peter Zijlstra
Cc: mingo, andrii, linux-kernel, rostedt, mhiramat, oleg, jolsa, clm,
paulmck
On Mon, Jul 08, 2024 at 11:12:49AM +0200, Peter Zijlstra wrote:
> In order to support carrying an srcu_read_lock() section across fork,
> where both the parent and child process will do: srcu_read_unlock(),
> it is needed to account for the extra decrement with an extra
> increment at fork time.
>
We also need to dup the per-task lock held stack in order to maintain
consistent data for lockdep, right?
Regards,
Boqun
> Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
> ---
> include/linux/srcu.h | 1 +
> include/linux/srcutiny.h | 10 ++++++++++
> kernel/rcu/srcutree.c | 5 +++++
> 3 files changed, 16 insertions(+)
>
> --- a/include/linux/srcu.h
> +++ b/include/linux/srcu.h
> @@ -55,6 +55,7 @@ void call_srcu(struct srcu_struct *ssp,
> void (*func)(struct rcu_head *head));
> void cleanup_srcu_struct(struct srcu_struct *ssp);
> int __srcu_read_lock(struct srcu_struct *ssp) __acquires(ssp);
> +void __srcu_clone_read_lock(struct srcu_struct *ssp, int idx);
> void __srcu_read_unlock(struct srcu_struct *ssp, int idx) __releases(ssp);
> void synchronize_srcu(struct srcu_struct *ssp);
> unsigned long get_state_synchronize_srcu(struct srcu_struct *ssp);
> --- a/include/linux/srcutiny.h
> +++ b/include/linux/srcutiny.h
> @@ -71,6 +71,16 @@ static inline int __srcu_read_lock(struc
> return idx;
> }
>
> +static inline void __srcu_clone_read_lock(struct srcu_struct *ssp, int idx)
> +{
> + int newval;
> +
> + preempt_disable(); // Needed for PREEMPT_AUTO
> + newval = READ_ONCE(ssp->srcu_lock_nesting[idx]) + 1;
> + WRITE_ONCE(ssp->srcu_lock_nesting[idx], newval);
> + preempt_enable();
> +}
> +
> static inline void synchronize_srcu_expedited(struct srcu_struct *ssp)
> {
> synchronize_srcu(ssp);
> --- a/kernel/rcu/srcutree.c
> +++ b/kernel/rcu/srcutree.c
> @@ -720,6 +720,11 @@ int __srcu_read_lock(struct srcu_struct
> }
> EXPORT_SYMBOL_GPL(__srcu_read_lock);
>
> +void __srcu_clone_read_lock(struct srcu_struct *ssp, int idx)
> +{
> + this_cpu_inc(ssp->sda->srcu_lock_count[idx].counter);
> +}
> +
> /*
> * Removes the count for the old reader from the appropriate per-CPU
> * element of the srcu_struct. Note that this may well be a different
>
>
^ permalink raw reply [flat|nested] 84+ messages in thread
* Re: [PATCH 03/10] rbtree: Provide rb_find_rcu() / rb_find_add_rcu()
2024-07-10 1:29 ` Masami Hiramatsu
@ 2024-07-10 7:55 ` Peter Zijlstra
2024-07-10 14:05 ` Masami Hiramatsu
0 siblings, 1 reply; 84+ messages in thread
From: Peter Zijlstra @ 2024-07-10 7:55 UTC (permalink / raw)
To: Masami Hiramatsu
Cc: mingo, andrii, linux-kernel, rostedt, oleg, jolsa, clm, paulmck
On Wed, Jul 10, 2024 at 10:29:59AM +0900, Masami Hiramatsu wrote:
> On Mon, 08 Jul 2024 11:12:44 +0200
> Peter Zijlstra <peterz@infradead.org> wrote:
>
> > Much like latch_tree, add two RCU methods for the regular RB-tree,
> > which can be used in conjunction with a seqcount to provide lockless
> > lookups.
> >
> > Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
> > ---
> > include/linux/rbtree.h | 67 +++++++++++++++++++++++++++++++++++++++++++++++++
> > 1 file changed, 67 insertions(+)
> >
> > --- a/include/linux/rbtree.h
> > +++ b/include/linux/rbtree.h
> > @@ -245,6 +245,42 @@ rb_find_add(struct rb_node *node, struct
> > }
> >
> > /**
> > + * rb_find_add_rcu() - find equivalent @node in @tree, or add @node
> > + * @node: node to look-for / insert
> > + * @tree: tree to search / modify
> > + * @cmp: operator defining the node order
> > + *
> > + * Adds a Store-Release for link_node.
> > + *
> > + * Returns the rb_node matching @node, or NULL when no match is found and @node
> > + * is inserted.
> > + */
> > +static __always_inline struct rb_node *
> > +rb_find_add_rcu(struct rb_node *node, struct rb_root *tree,
> > + int (*cmp)(struct rb_node *, const struct rb_node *))
> > +{
> > + struct rb_node **link = &tree->rb_node;
> > + struct rb_node *parent = NULL;
> > + int c;
> > +
> > + while (*link) {
>
> Don't we need to use rcu_dereference_raw(*link) here?
This is a modifying operation and as such we can assume operation under
the exclusive lock. IOW the tree should be stable here.
> > + parent = *link;
> > + c = cmp(node, parent);
> > +
> > + if (c < 0)
> > + link = &parent->rb_left;
> > + else if (c > 0)
> > + link = &parent->rb_right;
> > + else
> > + return parent;
> > + }
> > +
> > + rb_link_node_rcu(node, parent, link);
Only the link operation needs the rcu_assign_pointer() thing for
publishing our new node.
> > + rb_insert_color(node, tree);
The rotations use WRITE_ONCE() to avoid tearing.
> > + return NULL;
> > +}
^ permalink raw reply [flat|nested] 84+ messages in thread
* Re: [PATCH 00/10] perf/uprobe: Optimize uprobes
2024-07-09 16:10 ` Matthew Wilcox
2024-07-09 16:30 ` Matthew Wilcox
@ 2024-07-10 9:16 ` Peter Zijlstra
2024-07-10 9:40 ` Peter Zijlstra
1 sibling, 1 reply; 84+ messages in thread
From: Peter Zijlstra @ 2024-07-10 9:16 UTC (permalink / raw)
To: Matthew Wilcox
Cc: Paul E. McKenney, Andrii Nakryiko, Masami Hiramatsu, mingo,
andrii, linux-kernel, rostedt, oleg, jolsa, clm, bpf
On Tue, Jul 09, 2024 at 05:10:44PM +0100, Matthew Wilcox wrote:
> Probably best to start with lock_vma_under_rcu() in mm/memory.c.
So close and yet so far :/
> > > > Specifically, how feasible would it be to get a simple RCU based
> > > > find_vma() version sorted these days?
> > >
> > > Liam's and Willy's Maple Tree work, combined with Suren's per-VMA locking
> > > combined with some of Vlastimil's slab work is pushing in that direction.
> > > I believe that things are getting pretty close.
> >
> > So I fundamentally do not believe in per-VMA locking. Specifically for
> > this case that would be trading one hot line for another. I tried
> > telling people that, but it doesn't seem to stick :/
>
> SRCU also had its own performance problems, so we've got problems one
> way or the other. The per-VMA lock probably doesn't work quite the way
> you think it does, but it absolutely can be a hot cacheline.
>
> I did propose a store-free variant at LSFMM 2022 and again at 2023,
> but was voted down. https://lwn.net/Articles/932298/
That seems to be lacking much detail. Anyway, page-tables are sorta RCU
freed -- per GUP fast requirements. Making it actually RCU isn't
hard, it just increases the delay.
> I don't think the door is completely closed to a migration to that,
> but it's a harder sell than what we've got. Of course, data helps ...
Current 'problem' is a heavily multi-threaded application using uprobes.
Executable is typically one VMA (per DSO) and all the probes will live
in that one VMA. Then have all the CPUs concurrently hit probes, and
they're all hammering the same VMA in order to translate
{mm,vaddr}->{inode,offset}.
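(That translation itself is cheap -- roughly what vaddr_to_offset()
already does:

	/* sketch; {vma,vaddr} -> {inode,offset} */
	struct inode *inode = file_inode(vma->vm_file);
	loff_t offset = ((loff_t)vma->vm_pgoff << PAGE_SHIFT) +
			(bp_vaddr - vma->vm_start);

the expensive part is the mmap_rwsem needed to make looking at the VMA
safe in the first place.)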
After fixing a ton of uprobe things, Andrii is seeing 45% of CPU time
spent in mmap_rwsem:
https://lkml.kernel.org/r/CAEf4BzY6tXrDGkW6mkxCY551pZa1G+Sgxeuex==nvHUEp9ynpg@mail.gmail.com
Given it's all typically one VMA, anything per VMA is not going to help
much.
Anyway, back to this PER_VMA_LOCK code, it took me a fair while to
digest because it lacks a coherent comment. It took me extra time
because I assumed (silly me) that _seq would mean an actual sequence
count.
If it were an actual sequence count, I could make it work, but sadly,
not. Also, vma_end_write() seems to be missing :-( If anything it could
be used to lockdep annotate the thing.
Mooo.. I need to stare more at this to see if perhaps it can be made to
work, but so far, no joy :/
^ permalink raw reply [flat|nested] 84+ messages in thread
* Re: [PATCH 00/10] perf/uprobe: Optimize uprobes
2024-07-10 9:16 ` Peter Zijlstra
@ 2024-07-10 9:40 ` Peter Zijlstra
2024-07-22 19:09 ` Suren Baghdasaryan
0 siblings, 1 reply; 84+ messages in thread
From: Peter Zijlstra @ 2024-07-10 9:40 UTC (permalink / raw)
To: Matthew Wilcox
Cc: Paul E. McKenney, Andrii Nakryiko, Masami Hiramatsu, mingo,
andrii, linux-kernel, rostedt, oleg, jolsa, clm, bpf
On Wed, Jul 10, 2024 at 11:16:31AM +0200, Peter Zijlstra wrote:
> If it were an actual sequence count, I could make it work, but sadly,
> not. Also, vma_end_write() seems to be missing :-( If anything it could
> be used to lockdep annotate the thing.
>
> Mooo.. I need to stare more at this to see if perhaps it can be made to
> work, but so far, no joy :/
See, this is what I want, except I can't close the race against VMA
modification because of that crazy locking scheme :/
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -2146,11 +2146,58 @@ static int is_trap_at_addr(struct mm_str
return is_trap_insn(&opcode);
}
-static struct uprobe *find_active_uprobe(unsigned long bp_vaddr, int *is_swbp)
+#ifndef CONFIG_PER_VMA_LOCK
+static struct uprobe *__find_active_uprobe(unsigned long bp_vaddr)
+{
+ return NULL;
+}
+#else
+static struct uprobe *__find_active_uprobe(unsigned long bp_vaddr)
{
struct mm_struct *mm = current->mm;
struct uprobe *uprobe = NULL;
struct vm_area_struct *vma;
+ MA_STATE(mas, &mm->mm_mt, bp_vaddr, bp_vaddr);
+
+ guard(rcu)();
+
+again:
+ vma = mas_walk(&mas);
+ if (!vma)
+ return NULL;
+
+ /* vma_write_start() -- in progress */
+ if (READ_ONCE(vma->vm_lock_seq) == READ_ONCE(vma->vm_mm->mm_lock_seq))
+ return NULL;
+
+ /*
+ * Completely broken, because of the crazy vma locking scheme you
+ * cannot avoid the per-vma rwlock and doing so means you're racy
+ * against modifications.
+ *
> +	 * A simple actual seqcount would've been cheaper and more useful.
+ */
+
+ if (!valid_vma(vma, false))
+ return NULL;
+
> +	struct inode *inode = file_inode(vma->vm_file);
+ loff_t offset = vaddr_to_offset(vma, bp_vaddr);
+
+ // XXX: if (vma_seq_retry(...)) goto again;
+
+ return find_uprobe(inode, offset);
+}
+#endif
+
+static struct uprobe *find_active_uprobe(unsigned long bp_vaddr, int *is_swbp)
+{
> +	struct uprobe *uprobe = __find_active_uprobe(bp_vaddr);
+ struct mm_struct *mm = current->mm;
+ struct vm_area_struct *vma;
+
+ if (uprobe)
+ return uprobe;
mmap_read_lock(mm);
vma = vma_lookup(mm, bp_vaddr);
^ permalink raw reply [flat|nested] 84+ messages in thread
* Re: [PATCH 08/10] srcu: Add __srcu_clone_read_lock()
2024-07-10 5:48 ` Boqun Feng
@ 2024-07-10 10:02 ` Peter Zijlstra
2024-07-10 12:05 ` Peter Zijlstra
0 siblings, 1 reply; 84+ messages in thread
From: Peter Zijlstra @ 2024-07-10 10:02 UTC (permalink / raw)
To: Boqun Feng
Cc: mingo, andrii, linux-kernel, rostedt, mhiramat, oleg, jolsa, clm,
paulmck
On Tue, Jul 09, 2024 at 10:48:39PM -0700, Boqun Feng wrote:
> On Mon, Jul 08, 2024 at 11:12:49AM +0200, Peter Zijlstra wrote:
> > In order to support carrying an srcu_read_lock() section across fork,
> > where both the parent and child process will do: srcu_read_unlock(),
> > it is needed to account for the extra decrement with an extra
> > increment at fork time.
> >
>
> We also need to dup the per-task lock held stack in order to maintain
> consistent data for lockdep, right?
Urgh, not the whole stack, but yeah, we need to stick an entry on there.
Let me see if I can frob that somehow.
^ permalink raw reply [flat|nested] 84+ messages in thread
* Re: [PATCH 00/10] perf/uprobe: Optimize uprobes
2024-07-09 22:10 ` Masami Hiramatsu
@ 2024-07-10 10:10 ` Peter Zijlstra
2024-07-10 14:56 ` Masami Hiramatsu
0 siblings, 1 reply; 84+ messages in thread
From: Peter Zijlstra @ 2024-07-10 10:10 UTC (permalink / raw)
To: Masami Hiramatsu
Cc: Jiri Olsa, Andrii Nakryiko, mingo, andrii, linux-kernel, rostedt,
oleg, clm, paulmck, bpf
On Wed, Jul 10, 2024 at 07:10:46AM +0900, Masami Hiramatsu wrote:
> > FFS :-/ That touches all sorts and doesn't have any perf ack on. Masami
> > what gives?
>
> This is managing *probes and related dynamic trace-events. Those has been
> moved from tip. Could you also add linux-trace-kernel@vger ML to CC?
./scripts/get_maintainer.pl -f kernel/events/uprobes.c
disagrees with that, also things like:
https://git.kernel.org/pub/scm/linux/kernel/git/trace/linux-trace.git/commit/?h=probes/for-next&id=4a365eb8a6d9940e838739935f1ce21f1ec8e33f
touch common perf stuff, and very much would require at least an ack
from the perf folks.
Not cool.
^ permalink raw reply [flat|nested] 84+ messages in thread
* Re: [PATCH 00/10] perf/uprobe: Optimize uprobes
2024-07-09 21:47 ` Andrii Nakryiko
@ 2024-07-10 10:12 ` Peter Zijlstra
2024-07-10 12:34 ` Peter Zijlstra
0 siblings, 1 reply; 84+ messages in thread
From: Peter Zijlstra @ 2024-07-10 10:12 UTC (permalink / raw)
To: Andrii Nakryiko
Cc: Masami Hiramatsu, mingo, andrii, linux-kernel, rostedt, oleg,
jolsa, clm, paulmck, bpf
On Tue, Jul 09, 2024 at 02:47:12PM -0700, Andrii Nakryiko wrote:
> Running in my local VM with debugging config, I'm getting the
> following. Please check if that's something new or it's just another
> symptom of the issues that Oleg pointed out already.
>
>
> [ 11.213834] ================================================
> [ 11.214225] WARNING: lock held when returning to user space!
> [ 11.214603] 6.10.0-rc6-gd3f5cbffe86b #1263 Tainted: G OE
> [ 11.215040] ------------------------------------------------
> [ 11.215426] urandom_read/2412 is leaving the kernel with locks still held!
> [ 11.215876] 1 lock held by urandom_read/2412:
> [ 11.216175] #0: ffffffff835ce8f0 (uretprobes_srcu){.+.+}-{0:0},
> at: srcu_read_lock+0x31/0x3f
Bah, I forgot the SRCU thing had lockdep on.
> [ 11.262797] ------------[ cut here ]------------
> [ 11.263162] refcount_t: underflow; use-after-free.
> [ 11.263474] WARNING: CPU: 1 PID: 2409 at lib/refcount.c:28
> refcount_warn_saturate+0x99/0xda
> [ 11.263995] Modules linked in: bpf_testmod(OE) aesni_intel(E)
> crypto_simd(E) floppy(E) cryptd(E) i2c_piix4(E) crc32c_intel(E)
> button(E) i2c_core(E) i6300esb(E) pcspkr(E) serio_raw(E)
> [ 11.265105] CPU: 1 PID: 2409 Comm: test_progs Tainted: G
> OE 6.10.0-rc6-gd3f5cbffe86b #1263
> [ 11.265740] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996),
> BIOS rel-1.14.0-0-g155821a1990b-prebuilt.qemu.org 04/01/2014
> [ 11.266507] RIP: 0010:refcount_warn_saturate+0x99/0xda
> [ 11.266862] Code: 05 ba 29 1d 02 01 e8 e2 c0 b4 ff 0f 0b c3 80 3d
> aa 29 1d 02 00 75 53 48 c7 c7 20 59 50 82 c6 05 9a 29 1d 02 01 e8 c3
> c0 b4 ff <0f> 0b c3 80 3d 8a 29 1d 02 00 75 34 a
> [ 11.268099] RSP: 0018:ffffc90001fbbd60 EFLAGS: 00010282
> [ 11.268451] RAX: 0000000000000026 RBX: ffff88810f333000 RCX: 0000000000000027
> [ 11.268931] RDX: 0000000000000000 RSI: ffffffff82580a45 RDI: 00000000ffffffff
> [ 11.269417] RBP: ffff888105937818 R08: 0000000000000000 R09: 0000000000000000
> [ 11.269910] R10: 00000000756f6366 R11: 0000000063666572 R12: ffff88810f333030
> [ 11.270387] R13: ffffc90001fbbb80 R14: ffff888100535190 R15: dead000000000100
> [ 11.270870] FS: 00007fc938bd2d00(0000) GS:ffff88881f680000(0000)
> knlGS:0000000000000000
> [ 11.271363] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
> [ 11.271725] CR2: 000000000073a005 CR3: 00000001127d5004 CR4: 0000000000370ef0
> [ 11.272220] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
> [ 11.272693] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
> [ 11.273182] Call Trace:
> [ 11.273370] <TASK>
> [ 11.273518] ? __warn+0x8b/0x14d
> [ 11.273753] ? report_bug+0xdb/0x151
> [ 11.273997] ? refcount_warn_saturate+0x99/0xda
> [ 11.274326] ? handle_bug+0x3c/0x5b
> [ 11.274564] ? exc_invalid_op+0x13/0x5c
> [ 11.274831] ? asm_exc_invalid_op+0x16/0x20
> [ 11.275119] ? refcount_warn_saturate+0x99/0xda
> [ 11.275428] uprobe_unregister_nosync+0x61/0x7c
> [ 11.275768] __probe_event_disable+0x5d/0x7d
> [ 11.276069] probe_event_disable+0x50/0x58
This I'll have to stare at for a bit.
Thanks!
^ permalink raw reply [flat|nested] 84+ messages in thread
* Re: [PATCH 08/10] srcu: Add __srcu_clone_read_lock()
2024-07-10 10:02 ` Peter Zijlstra
@ 2024-07-10 12:05 ` Peter Zijlstra
0 siblings, 0 replies; 84+ messages in thread
From: Peter Zijlstra @ 2024-07-10 12:05 UTC (permalink / raw)
To: Boqun Feng
Cc: mingo, andrii, linux-kernel, rostedt, mhiramat, oleg, jolsa, clm,
paulmck
On Wed, Jul 10, 2024 at 12:02:02PM +0200, Peter Zijlstra wrote:
> On Tue, Jul 09, 2024 at 10:48:39PM -0700, Boqun Feng wrote:
> > On Mon, Jul 08, 2024 at 11:12:49AM +0200, Peter Zijlstra wrote:
> > > In order to support carrying an srcu_read_lock() section across fork,
> > > where both the parent and child process will do: srcu_read_unlock(),
> > > it is needed to account for the extra decrement with an extra
> > > increment at fork time.
> > >
> >
> > We also need to dup the per-task lock held stack in order to maintain
> > consistent data for lockdep, right?
>
> Urgh, not the whole stack, but yeah, we need to stick an entry on there.
>
> Let me see if I can frob that somehow.
Ah, since this all is across userspace, the easiest solution is to also
use __srcu_read_{,un}lock() and ignore lockdep entirely.
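That is, something like the below -- sketch only, and it assumes the
return_instance grows a srcu_idx field to carry the index around:

	/* prepare_uretprobe(): take the SRCU ref without lockdep */
	ri->srcu_idx = __srcu_read_lock(&uretprobes_srcu);

	/* uprobe_copy_process(): the child gets a copy of the list,
	 * account for its extra unlock */
	__srcu_clone_read_lock(&uretprobes_srcu, ri->srcu_idx);

	/* trampoline / exit path, in whoever consumes the
	 * return_instance (parent or child) */
	__srcu_read_unlock(&uretprobes_srcu, ri->srcu_idx);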
^ permalink raw reply [flat|nested] 84+ messages in thread
* Re: [PATCH 00/10] perf/uprobe: Optimize uprobes
2024-07-10 10:12 ` Peter Zijlstra
@ 2024-07-10 12:34 ` Peter Zijlstra
0 siblings, 0 replies; 84+ messages in thread
From: Peter Zijlstra @ 2024-07-10 12:34 UTC (permalink / raw)
To: Andrii Nakryiko
Cc: Masami Hiramatsu, mingo, andrii, linux-kernel, rostedt, oleg,
jolsa, clm, paulmck, bpf
On Wed, Jul 10, 2024 at 12:12:39PM +0200, Peter Zijlstra wrote:
> > [ 11.262797] ------------[ cut here ]------------
> > [ 11.263162] refcount_t: underflow; use-after-free.
> > [ 11.263474] WARNING: CPU: 1 PID: 2409 at lib/refcount.c:28
> > refcount_warn_saturate+0x99/0xda
> > [ 11.263995] Modules linked in: bpf_testmod(OE) aesni_intel(E)
> > crypto_simd(E) floppy(E) cryptd(E) i2c_piix4(E) crc32c_intel(E)
> > button(E) i2c_core(E) i6300esb(E) pcspkr(E) serio_raw(E)
> > [ 11.265105] CPU: 1 PID: 2409 Comm: test_progs Tainted: G
> > OE 6.10.0-rc6-gd3f5cbffe86b #1263
> > [ 11.265740] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996),
> > BIOS rel-1.14.0-0-g155821a1990b-prebuilt.qemu.org 04/01/2014
> > [ 11.266507] RIP: 0010:refcount_warn_saturate+0x99/0xda
> > [ 11.266862] Code: 05 ba 29 1d 02 01 e8 e2 c0 b4 ff 0f 0b c3 80 3d
> > aa 29 1d 02 00 75 53 48 c7 c7 20 59 50 82 c6 05 9a 29 1d 02 01 e8 c3
> > c0 b4 ff <0f> 0b c3 80 3d 8a 29 1d 02 00 75 34 a
> > [ 11.268099] RSP: 0018:ffffc90001fbbd60 EFLAGS: 00010282
> > [ 11.268451] RAX: 0000000000000026 RBX: ffff88810f333000 RCX: 0000000000000027
> > [ 11.268931] RDX: 0000000000000000 RSI: ffffffff82580a45 RDI: 00000000ffffffff
> > [ 11.269417] RBP: ffff888105937818 R08: 0000000000000000 R09: 0000000000000000
> > [ 11.269910] R10: 00000000756f6366 R11: 0000000063666572 R12: ffff88810f333030
> > [ 11.270387] R13: ffffc90001fbbb80 R14: ffff888100535190 R15: dead000000000100
> > [ 11.270870] FS: 00007fc938bd2d00(0000) GS:ffff88881f680000(0000)
> > knlGS:0000000000000000
> > [ 11.271363] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
> > [ 11.271725] CR2: 000000000073a005 CR3: 00000001127d5004 CR4: 0000000000370ef0
> > [ 11.272220] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
> > [ 11.272693] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
> > [ 11.273182] Call Trace:
> > [ 11.273370] <TASK>
> > [ 11.273518] ? __warn+0x8b/0x14d
> > [ 11.273753] ? report_bug+0xdb/0x151
> > [ 11.273997] ? refcount_warn_saturate+0x99/0xda
> > [ 11.274326] ? handle_bug+0x3c/0x5b
> > [ 11.274564] ? exc_invalid_op+0x13/0x5c
> > [ 11.274831] ? asm_exc_invalid_op+0x16/0x20
> > [ 11.275119] ? refcount_warn_saturate+0x99/0xda
> > [ 11.275428] uprobe_unregister_nosync+0x61/0x7c
> > [ 11.275768] __probe_event_disable+0x5d/0x7d
> > [ 11.276069] probe_event_disable+0x50/0x58
>
> This I'll have to stare at for a bit.
I found one put_uprobe() that should've now been a srcu_read_unlock().
That might explain things.
^ permalink raw reply [flat|nested] 84+ messages in thread
* Re: [PATCH 03/10] rbtree: Provide rb_find_rcu() / rb_find_add_rcu()
2024-07-10 7:55 ` Peter Zijlstra
@ 2024-07-10 14:05 ` Masami Hiramatsu
0 siblings, 0 replies; 84+ messages in thread
From: Masami Hiramatsu @ 2024-07-10 14:05 UTC (permalink / raw)
To: Peter Zijlstra
Cc: mingo, andrii, linux-kernel, rostedt, oleg, jolsa, clm, paulmck
On Wed, 10 Jul 2024 09:55:57 +0200
Peter Zijlstra <peterz@infradead.org> wrote:
> On Wed, Jul 10, 2024 at 10:29:59AM +0900, Masami Hiramatsu wrote:
> > On Mon, 08 Jul 2024 11:12:44 +0200
> > Peter Zijlstra <peterz@infradead.org> wrote:
> >
> > > Much like latch_tree, add two RCU methods for the regular RB-tree,
> > > which can be used in conjunction with a seqcount to provide lockless
> > > lookups.
> > >
> > > Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
> > > ---
> > > include/linux/rbtree.h | 67 +++++++++++++++++++++++++++++++++++++++++++++++++
> > > 1 file changed, 67 insertions(+)
> > >
> > > --- a/include/linux/rbtree.h
> > > +++ b/include/linux/rbtree.h
> > > @@ -245,6 +245,42 @@ rb_find_add(struct rb_node *node, struct
> > > }
> > >
> > > /**
> > > + * rb_find_add_rcu() - find equivalent @node in @tree, or add @node
> > > + * @node: node to look-for / insert
> > > + * @tree: tree to search / modify
> > > + * @cmp: operator defining the node order
> > > + *
> > > + * Adds a Store-Release for link_node.
> > > + *
> > > + * Returns the rb_node matching @node, or NULL when no match is found and @node
> > > + * is inserted.
> > > + */
> > > +static __always_inline struct rb_node *
> > > +rb_find_add_rcu(struct rb_node *node, struct rb_root *tree,
> > > + int (*cmp)(struct rb_node *, const struct rb_node *))
> > > +{
> > > + struct rb_node **link = &tree->rb_node;
> > > + struct rb_node *parent = NULL;
> > > + int c;
> > > +
> > > + while (*link) {
> >
> > Don't we need to use rcu_dereference_raw(*link) here?
>
> This is a modifying operation and as such we can assume operation under
> the exclusive lock. IOW the tree should be stable here.
Ah, got it.
>
> > > + parent = *link;
> > > + c = cmp(node, parent);
> > > +
> > > + if (c < 0)
> > > + link = &parent->rb_left;
> > > + else if (c > 0)
> > > + link = &parent->rb_right;
> > > + else
> > > + return parent;
> > > + }
> > > +
> > > + rb_link_node_rcu(node, parent, link);
>
> Only the link operation needs the rcu_assign_pointer() thing for
> publishing our new node.
Yes.
>
> > > + rb_insert_color(node, tree);
>
> The rotations use WRITE_ONCE() to avoid tearing.
OK, thanks for confirmation.
Reviewed-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
Thank you,
>
> > > + return NULL;
> > > +}
--
Masami Hiramatsu (Google) <mhiramat@kernel.org>
^ permalink raw reply [flat|nested] 84+ messages in thread
* Re: [PATCH 00/10] perf/uprobe: Optimize uprobes
2024-07-10 10:10 ` Peter Zijlstra
@ 2024-07-10 14:56 ` Masami Hiramatsu
2024-07-10 18:40 ` Andrii Nakryiko
0 siblings, 1 reply; 84+ messages in thread
From: Masami Hiramatsu @ 2024-07-10 14:56 UTC (permalink / raw)
To: Peter Zijlstra
Cc: Jiri Olsa, Andrii Nakryiko, mingo, andrii, linux-kernel, rostedt,
oleg, clm, paulmck, bpf
On Wed, 10 Jul 2024 12:10:03 +0200
Peter Zijlstra <peterz@infradead.org> wrote:
> On Wed, Jul 10, 2024 at 07:10:46AM +0900, Masami Hiramatsu wrote:
>
> > > FFS :-/ That touches all sorts and doesn't have any perf ack on. Masami
> > > what gives?
> >
> > This is managing *probes and related dynamic trace-events. Those has been
> > moved from tip. Could you also add linux-trace-kernel@vger ML to CC?
>
> ./scripts/get_maintainer.pl -f kernel/events/uprobes.c
>
> disagrees with that, also things like:
>
> https://git.kernel.org/pub/scm/linux/kernel/git/trace/linux-trace.git/commit/?h=probes/for-next&id=4a365eb8a6d9940e838739935f1ce21f1ec8e33f
>
> touch common perf stuff, and very much would require at least an ack
> from the perf folks.
Hmm, indeed. I'm OK with passing those patches (except for the trace_uprobe
things) to -tip if you can take them.
>
> Not cool.
Yeah, the probe things are on the boundary.
BTW, IMHO, there could be dependency issues with *probes. Those are usually used
by ftrace/perf/bpf, which are managed by different trees. This means a series
can span multiple trees. Is mutual review the solution?
Thank you,
--
Masami Hiramatsu (Google) <mhiramat@kernel.org>
^ permalink raw reply [flat|nested] 84+ messages in thread
* Re: [PATCH 00/10] perf/uprobe: Optimize uprobes
2024-07-10 14:56 ` Masami Hiramatsu
@ 2024-07-10 18:40 ` Andrii Nakryiko
2024-07-11 8:51 ` Peter Zijlstra
0 siblings, 1 reply; 84+ messages in thread
From: Andrii Nakryiko @ 2024-07-10 18:40 UTC (permalink / raw)
To: Masami Hiramatsu
Cc: Peter Zijlstra, Jiri Olsa, mingo, andrii, linux-kernel, rostedt,
oleg, clm, paulmck, bpf
On Wed, Jul 10, 2024 at 7:56 AM Masami Hiramatsu <mhiramat@kernel.org> wrote:
>
> On Wed, 10 Jul 2024 12:10:03 +0200
> Peter Zijlstra <peterz@infradead.org> wrote:
>
> > On Wed, Jul 10, 2024 at 07:10:46AM +0900, Masami Hiramatsu wrote:
> >
> > > > FFS :-/ That touches all sorts and doesn't have any perf ack on. Masami
> > > > what gives?
> > >
> > > This is managing *probes and related dynamic trace-events. Those has been
> > > moved from tip. Could you also add linux-trace-kernel@vger ML to CC?
> >
> > ./scripts/get_maintainer.pl -f kernel/events/uprobes.c
> >
> > disagrees with that, also things like:
> >
> > https://git.kernel.org/pub/scm/linux/kernel/git/trace/linux-trace.git/commit/?h=probes/for-next&id=4a365eb8a6d9940e838739935f1ce21f1ec8e33f
> >
> > touch common perf stuff, and very much would require at least an ack
> > from the perf folks.
>
> Hmm, indeed. I'm OK to pass those patches (except for trace_uprobe things)
> to -tip if you can.
>
> >
> > Not cool.
>
You were aware of this patch and cc'ed personally (just like
linux-perf-users@vger.kernel.org) on all revisions of it. I addressed
your concerns in [0]; you went silent after that, and the patches were
sitting idle for more than a month.
But regardless, if you'd like me to do any adjustments, please let me know.
[0] https://lore.kernel.org/all/CAEf4Bzazi7YMz9n0V46BU7xthQjNdQL_zma5vzgCm_7C-_CvmQ@mail.gmail.com/
> Yeah, the probe things are boundary.
> BTW, IMHO, there could be dependency issues on *probes. Those are usually used
> by ftrace/perf/bpf, which are managed by different trees. This means a series
> can span multiple trees. Mutually reviewing is the solution?
>
I agree, there is no one best tree for stuff like this. So as long as
relevant people and mailing lists are CC'ed we hopefully should be
fine?
> Thank you,
>
> --
> Masami Hiramatsu (Google) <mhiramat@kernel.org>
^ permalink raw reply [flat|nested] 84+ messages in thread
* Re: [PATCH 00/10] perf/uprobe: Optimize uprobes
2024-07-10 18:40 ` Andrii Nakryiko
@ 2024-07-11 8:51 ` Peter Zijlstra
2024-07-11 15:17 ` Masami Hiramatsu
0 siblings, 1 reply; 84+ messages in thread
From: Peter Zijlstra @ 2024-07-11 8:51 UTC (permalink / raw)
To: Andrii Nakryiko
Cc: Masami Hiramatsu, Jiri Olsa, mingo, andrii, linux-kernel, rostedt,
oleg, clm, paulmck, bpf
On Wed, Jul 10, 2024 at 11:40:17AM -0700, Andrii Nakryiko wrote:
> On Wed, Jul 10, 2024 at 7:56 AM Masami Hiramatsu <mhiramat@kernel.org> wrote:
> >
> > On Wed, 10 Jul 2024 12:10:03 +0200
> > Peter Zijlstra <peterz@infradead.org> wrote:
> >
> > > On Wed, Jul 10, 2024 at 07:10:46AM +0900, Masami Hiramatsu wrote:
> > >
> > > > > FFS :-/ That touches all sorts and doesn't have any perf ack on. Masami
> > > > > what gives?
> > > >
> > > > This is managing *probes and related dynamic trace-events. Those has been
> > > > moved from tip. Could you also add linux-trace-kernel@vger ML to CC?
> > >
> > > ./scripts/get_maintainer.pl -f kernel/events/uprobes.c
> > >
> > > disagrees with that, also things like:
> > >
> > > https://git.kernel.org/pub/scm/linux/kernel/git/trace/linux-trace.git/commit/?h=probes/for-next&id=4a365eb8a6d9940e838739935f1ce21f1ec8e33f
> > >
> > > touch common perf stuff, and very much would require at least an ack
> > > from the perf folks.
> >
> > Hmm, indeed. I'm OK to pass those patches (except for trace_uprobe things)
> > to -tip if you can.
> >
> > >
> > > Not cool.
> >
>
> You were aware of this patch and cc'ed personally (just like
> linux-perf-users@vger.kernel.org) on all revisions of it. I addressed
> your concerns in [0], you went silent after that and patches were
> sitting idle for more than a month.
Yeah, I remember seeing it. But I was surprised it got applied. If I'm
tardy -- this can happen, more so of late since I'm still recovering
from injury and I get far more email than I could hope to process in a
work day -- please ping.
(also, being 'forced' into using a split keyboard means I'm also
re-learning how to type, further slowing me down -- training muscle
memory takes a while)
Taking patches that touch other trees is fairly common, but in all those
cases an ACK is 'required'.
(also also, I'm not the only maintainer there)
> But regardless, if you'd like me to do any adjustments, please let me know.
>
> [0] https://lore.kernel.org/all/CAEf4Bzazi7YMz9n0V46BU7xthQjNdQL_zma5vzgCm_7C-_CvmQ@mail.gmail.com/
>
I'll check, it might be fine; it's just the surprise of having it show up
in some random tree that set me off.
> > Yeah, the probe things are boundary.
> > BTW, IMHO, there could be dependency issues on *probes. Those are usually used
> > by ftrace/perf/bpf, which are managed by different trees. This means a series
> > can span multiple trees. Mutually reviewing is the solution?
> >
>
> I agree, there is no one best tree for stuff like this. So as long as
> relevant people and mailing lists are CC'ed we hopefully should be
> fine?
Typically, yeah, that should work just fine.
But if Masami wants to do uprobes, then it might be prudent to add a
MAINTAINERS entry for it.
A solution might be to add a UPROBES entry and add masami, oleg (if he
wants) and myself as maintainers -- did I forget anyone? Git seems to
suggest it's mostly been Oleg carrying this thing.
That is, one way or another I think we should get
./scripts/get_maintainer.pl to emit more people for the relevant files.
^ permalink raw reply [flat|nested] 84+ messages in thread
* Re: [PATCH 00/10] perf/uprobe: Optimize uprobes
2024-07-11 8:51 ` Peter Zijlstra
@ 2024-07-11 15:17 ` Masami Hiramatsu
2024-07-11 15:22 ` Peter Zijlstra
0 siblings, 1 reply; 84+ messages in thread
From: Masami Hiramatsu @ 2024-07-11 15:17 UTC (permalink / raw)
To: Peter Zijlstra
Cc: Andrii Nakryiko, Masami Hiramatsu, Jiri Olsa, mingo, andrii,
linux-kernel, rostedt, oleg, clm, paulmck, bpf
On Thu, 11 Jul 2024 10:51:18 +0200
Peter Zijlstra <peterz@infradead.org> wrote:
> On Wed, Jul 10, 2024 at 11:40:17AM -0700, Andrii Nakryiko wrote:
> > On Wed, Jul 10, 2024 at 7:56 AM Masami Hiramatsu <mhiramat@kernel.org> wrote:
> > >
> > > On Wed, 10 Jul 2024 12:10:03 +0200
> > > Peter Zijlstra <peterz@infradead.org> wrote:
> > >
> > > > On Wed, Jul 10, 2024 at 07:10:46AM +0900, Masami Hiramatsu wrote:
> > > >
> > > > > > FFS :-/ That touches all sorts and doesn't have any perf ack on. Masami
> > > > > > what gives?
> > > > >
> > > > > This is managing *probes and related dynamic trace-events. Those has been
> > > > > moved from tip. Could you also add linux-trace-kernel@vger ML to CC?
> > > >
> > > > ./scripts/get_maintainer.pl -f kernel/events/uprobes.c
> > > >
> > > > disagrees with that, also things like:
> > > >
> > > > https://git.kernel.org/pub/scm/linux/kernel/git/trace/linux-trace.git/commit/?h=probes/for-next&id=4a365eb8a6d9940e838739935f1ce21f1ec8e33f
> > > >
> > > > touch common perf stuff, and very much would require at least an ack
> > > > from the perf folks.
> > >
> > > Hmm, indeed. I'm OK to pass those patches (except for trace_uprobe things)
> > > to -tip if you can.
> > >
> > > >
> > > > Not cool.
> > >
> >
> > You were aware of this patch and cc'ed personally (just like
> > linux-perf-users@vger.kernel.org) on all revisions of it. I addressed
> > your concerns in [0], you went silent after that and patches were
> > sitting idle for more than a month.
>
> Yeah, I remember seeing it. But I was surprised it got applied. If I'm
> tardy -- this can happen, more so of late since I'm still recovering
> from injury and I get far more email than I could hope to process in a
> work day -- please ping.
I also forgot to ping you. I'll ask you next time.
>
> (also, being 'forced' into using a split keyboard means I'm also
> re-learning how to type, further slowing me down -- training muscle
> memory takes a while)
>
> Taking patches that touch other trees is fairly common, but in all those
> cases an ACK is 'required'.
OK, should I wait for your Ack for other patches on probes/for-next?
>
> (also also, I'm not the only maintainer there)
>
> > But regardless, if you'd like me to do any adjustments, please let me know.
> >
> > [0] https://lore.kernel.org/all/CAEf4Bzazi7YMz9n0V46BU7xthQjNdQL_zma5vzgCm_7C-_CvmQ@mail.gmail.com/
> >
>
> I'll check, it might be fine, its just the surprise of having it show up
> in some random tree that set me off.
>
> > > Yeah, the probe things are boundary.
> > > BTW, IMHO, there could be dependency issues on *probes. Those are usually used
> > > by ftrace/perf/bpf, which are managed by different trees. This means a series
> > > can span multiple trees. Mutually reviewing is the solution?
> > >
> >
> > I agree, there is no one best tree for stuff like this. So as long as
> > relevant people and mailing lists are CC'ed we hopefully should be
> > fine?
>
> Typically, yeah, that should work just fine.
>
> But if Masami wants to do uprobes, then it might be prudent to add a
> MAINTAINERS entry for it.
>
> A solution might be to add a UPROBES entry and add masami, oleg (if he
> wants) and myself as maintainers -- did I forget anyone? Git seems to
> suggest it's mostly been Oleg carrying this thing.
That sounds good to me. Like this?
From 87dfb9c0e7660e83debd69a0c7e28bc61d214fa8 Mon Sep 17 00:00:00 2001
From: "Masami Hiramatsu (Google)" <mhiramat@kernel.org>
Date: Fri, 12 Jul 2024 00:08:30 +0900
Subject: [PATCH] MAINTAINERS: Add uprobes entry
Add a uprobes entry to MAINTAINERS and move its maintenance to the linux-trace
tree, the same as the other probes.
Signed-off-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
---
MAINTAINERS | 12 ++++++++++++
1 file changed, 12 insertions(+)
diff --git a/MAINTAINERS b/MAINTAINERS
index da5352dbd4f3..7f6285d98b39 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -23105,6 +23105,18 @@ F: drivers/mtd/ubi/
F: include/linux/mtd/ubi.h
F: include/uapi/mtd/ubi-user.h
+UPROBES
+M: Masami Hiramatsu <mhiramat@kernel.org>
+M: Oleg Nesterov <oleg@redhat.com>
+M: Peter Zijlstra <peterz@infradead.org>
+L: linux-kernel@vger.kernel.org
+L: linux-trace-kernel@vger.kernel.org
+S: Maintained
+Q: https://patchwork.kernel.org/project/linux-trace-kernel/list/
+T: git git://git.kernel.org/pub/scm/linux/kernel/git/trace/linux-trace.git
+F: include/linux/uprobes.h
+F: kernel/events/uprobes.c
+
USB "USBNET" DRIVER FRAMEWORK
M: Oliver Neukum <oneukum@suse.com>
L: netdev@vger.kernel.org
--
2.34.1
Thanks,
--
Masami Hiramatsu (Google) <mhiramat@kernel.org>
^ permalink raw reply related [flat|nested] 84+ messages in thread
* Re: [PATCH 00/10] perf/uprobe: Optimize uprobes
2024-07-11 15:17 ` Masami Hiramatsu
@ 2024-07-11 15:22 ` Peter Zijlstra
2024-07-11 17:47 ` Steven Rostedt
0 siblings, 1 reply; 84+ messages in thread
From: Peter Zijlstra @ 2024-07-11 15:22 UTC (permalink / raw)
To: Masami Hiramatsu
Cc: Andrii Nakryiko, Jiri Olsa, mingo, andrii, linux-kernel, rostedt,
oleg, clm, paulmck, bpf
On Fri, Jul 12, 2024 at 12:17:18AM +0900, Masami Hiramatsu wrote:
> From 87dfb9c0e7660e83debd69a0c7e28bc61d214fa8 Mon Sep 17 00:00:00 2001
> From: "Masami Hiramatsu (Google)" <mhiramat@kernel.org>
> Date: Fri, 12 Jul 2024 00:08:30 +0900
> Subject: [PATCH] MAINTAINERS: Add uprobes entry
>
> Add uprobes entry to MAINTAINERS and move its maintenance on the linux-trace
> tree as same as other probes.
>
> Signed-off-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
> ---
> MAINTAINERS | 12 ++++++++++++
> 1 file changed, 12 insertions(+)
>
> diff --git a/MAINTAINERS b/MAINTAINERS
> index da5352dbd4f3..7f6285d98b39 100644
> --- a/MAINTAINERS
> +++ b/MAINTAINERS
> @@ -23105,6 +23105,18 @@ F: drivers/mtd/ubi/
> F: include/linux/mtd/ubi.h
> F: include/uapi/mtd/ubi-user.h
>
> +UPROBES
> +M: Masami Hiramatsu <mhiramat@kernel.org>
> +M: Oleg Nesterov <oleg@redhat.com>
> +M: Peter Zijlstra <peterz@infradead.org>
> +L: linux-kernel@vger.kernel.org
> +L: linux-trace-kernel@vger.kernel.org
> +S: Maintained
> +Q: https://patchwork.kernel.org/project/linux-trace-kernel/list/
> +T: git git://git.kernel.org/pub/scm/linux/kernel/git/trace/linux-trace.git
> +F: include/linux/uprobes.h
> +F: kernel/events/uprobes.c
Maybe no Q/T. Neither Oleg nor I have write access to that git tree.
Also, I think you want:
F: arch/*/kernel/uprobes.c
F: arch/*/kernel/probes/uprobes.c
F: arch/*/include/asm/uprobes.h
This is just to ensure get_maintainers.sh gets our email addresses for
all uprobes stuff.
^ permalink raw reply [flat|nested] 84+ messages in thread
* Re: [PATCH 00/10] perf/uprobe: Optimize uprobes
2024-07-11 15:22 ` Peter Zijlstra
@ 2024-07-11 17:47 ` Steven Rostedt
2024-07-11 23:59 ` Masami Hiramatsu
0 siblings, 1 reply; 84+ messages in thread
From: Steven Rostedt @ 2024-07-11 17:47 UTC (permalink / raw)
To: Peter Zijlstra
Cc: Masami Hiramatsu, Andrii Nakryiko, Jiri Olsa, mingo, andrii,
linux-kernel, oleg, clm, paulmck, bpf
On Thu, 11 Jul 2024 17:22:38 +0200
Peter Zijlstra <peterz@infradead.org> wrote:
> > +UPROBES
> > +M: Masami Hiramatsu <mhiramat@kernel.org>
> > +M: Oleg Nesterov <oleg@redhat.com>
> > +M: Peter Zijlstra <peterz@infradead.org>
> > +L: linux-kernel@vger.kernel.org
> > +L: linux-trace-kernel@vger.kernel.org
> > +S: Maintained
> > +Q: https://patchwork.kernel.org/project/linux-trace-kernel/list/
> > +T: git git://git.kernel.org/pub/scm/linux/kernel/git/trace/linux-trace.git
> > +F: include/linux/uprobes.h
> > +F: kernel/events/uprobes.c
>
> Maybe no Q/T. Neither Oleg nor me have write access to that git tree.
>
> Also, I think you want:
>
> F: arch/*/kernel/uprobes.c
> F: arch/*/kernel/probes/uprobes.c
> F: arch/*/include/asm/uprobes.h
>
>
> This is just to ensure get_maintainers.sh gets our email addresses for
> all uprobes stuff.
Agreed. As those files can go through other trees, it's best not to add
linux-trace.git or patchwork to the MAINTAINERS file. The entry is just there
to make sure the proper people are Cc'd.
-- Steve
^ permalink raw reply [flat|nested] 84+ messages in thread
* Re: [PATCH 00/10] perf/uprobe: Optimize uprobes
2024-07-11 17:47 ` Steven Rostedt
@ 2024-07-11 23:59 ` Masami Hiramatsu
0 siblings, 0 replies; 84+ messages in thread
From: Masami Hiramatsu @ 2024-07-11 23:59 UTC (permalink / raw)
To: Steven Rostedt
Cc: Peter Zijlstra, Masami Hiramatsu, Andrii Nakryiko, Jiri Olsa,
mingo, andrii, linux-kernel, oleg, clm, paulmck, bpf
On Thu, 11 Jul 2024 13:47:03 -0400
Steven Rostedt <rostedt@goodmis.org> wrote:
> On Thu, 11 Jul 2024 17:22:38 +0200
> Peter Zijlstra <peterz@infradead.org> wrote:
>
> > > +UPROBES
> > > +M: Masami Hiramatsu <mhiramat@kernel.org>
> > > +M: Oleg Nesterov <oleg@redhat.com>
> > > +M: Peter Zijlstra <peterz@infradead.org>
> > > +L: linux-kernel@vger.kernel.org
> > > +L: linux-trace-kernel@vger.kernel.org
> > > +S: Maintained
> > > +Q: https://patchwork.kernel.org/project/linux-trace-kernel/list/
> > > +T: git git://git.kernel.org/pub/scm/linux/kernel/git/trace/linux-trace.git
> > > +F: include/linux/uprobes.h
> > > +F: kernel/events/uprobes.c
> >
> > Maybe no Q/T. Neither Oleg nor me have write access to that git tree.
Aah right...
> >
> > Also, I think you want:
> >
> > F: arch/*/kernel/uprobes.c
> > F: arch/*/kernel/probes/uprobes.c
> > F: arch/*/include/asm/uprobes.h
OK, I confirmed it covers all arches.
> > This is just to ensure get_maintainers.sh gets our email addresses for
> > all uprobes stuff.
>
> Agreed. As those files can go through other trees, it's best not to add
> linux-trace.git nor patchwork to MAINTAINERS file. It's just there to make
> sure the proper people are Cc'd.
OK, let me update it.
Thank you!
>
> -- Steve
>
--
Masami Hiramatsu (Google) <mhiramat@kernel.org>
^ permalink raw reply [flat|nested] 84+ messages in thread
* Re: [PATCH 00/10] perf/uprobe: Optimize uprobes
2024-07-10 9:40 ` Peter Zijlstra
@ 2024-07-22 19:09 ` Suren Baghdasaryan
2024-07-27 0:20 ` Andrii Nakryiko
2024-07-30 13:46 ` Peter Zijlstra
0 siblings, 2 replies; 84+ messages in thread
From: Suren Baghdasaryan @ 2024-07-22 19:09 UTC (permalink / raw)
To: Peter Zijlstra
Cc: Matthew Wilcox, Paul E. McKenney, Andrii Nakryiko,
Masami Hiramatsu, mingo, andrii, linux-kernel, rostedt, oleg,
jolsa, clm, bpf
On Wed, Jul 10, 2024 at 2:40 AM Peter Zijlstra <peterz@infradead.org> wrote:
>
> On Wed, Jul 10, 2024 at 11:16:31AM +0200, Peter Zijlstra wrote:
>
> > If it were an actual sequence count, I could make it work, but sadly,
> > not. Also, vma_end_write() seems to be missing :-( If anything it could
> > be used to lockdep annotate the thing.
Thanks Matthew for forwarding me this discussion!
> >
> > Mooo.. I need to stare more at this to see if perhaps it can be made to
> > work, but so far, no joy :/
>
> See, this is what I want, except I can't close the race against VMA
> modification because of that crazy locking scheme :/
Happy to explain more about this crazy locking scheme. The catch is
that we can write-lock a VMA only while holding mmap_lock for write
and we unlock all write-locked VMAs together when we drop that
mmap_lock:
mmap_write_lock(mm);
vma_start_write(vma1);
vma_start_write(vma2);
...
mmap_write_unlock(mm); -> vma_end_write_all(mm); // unlocks all locked vmas
This is done because oftentimes we need to lock multiple VMAs when
modifying the address space (vma merge/split) and unlocking them
individually would be more expensive than unlocking them in bulk by
incrementing mm->mm_lock_seq.
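In simplified form (the real vma_start_write() also takes the per-VMA
rw_semaphore, omitted here) the scheme is roughly:

	/* vma_start_write(): record the current mm sequence */
	vma->vm_lock_seq = mm->mm_lock_seq;

	/* mmap_write_unlock() -> vma_end_write_all(): one increment
	 * "unlocks" every write-locked VMA at once */
	WRITE_ONCE(mm->mm_lock_seq, mm->mm_lock_seq + 1);

so a lockless reader that sees vma->vm_lock_seq == mm->mm_lock_seq has
to treat the VMA as being written to, which is the check in your sketch
below.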
>
>
> --- a/kernel/events/uprobes.c
> +++ b/kernel/events/uprobes.c
> @@ -2146,11 +2146,58 @@ static int is_trap_at_addr(struct mm_str
> return is_trap_insn(&opcode);
> }
>
> -static struct uprobe *find_active_uprobe(unsigned long bp_vaddr, int *is_swbp)
> +#ifndef CONFIG_PER_VMA_LOCK
> +static struct uprobe *__find_active_uprobe(unsigned long bp_vaddr)
> +{
> + return NULL;
> +}
> +#else
IIUC your code below, you want to get vma->vm_file without locking the
VMA. I think under RCU that would have been possible if vma->vm_file
were RCU-safe, which it's not (we had discussions with Paul and
Matthew about that in
https://lore.kernel.org/all/CAJuCfpHW2=Zu+CHXL+5fjWxGk=CVix=C66ra+DmXgn6r3+fsXg@mail.gmail.com/).
Otherwise you could store the value of vma->vm_lock_seq before
comparing it with mm->mm_lock_seq, then do get_file(vma->vm_file) and
then compare your locally stored vm_lock_seq against vma->vm_lock_seq
to see if the VMA got locked for modification after we got the file. So,
unless I'm missing some other race, I think the VMA locking sequence does
not preclude you from implementing __find_active_uprobe(), but
accessing vma->vm_file would be unsafe without some kind of locking.
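I.e., roughly (sketch only, made-up helper name; as noted, the
get_file() itself is the step that isn't safe today because vm_file
isn't RCU-safe):

	static struct file *try_get_vma_file(struct mm_struct *mm,
					     unsigned long bp_vaddr)
	{
		MA_STATE(mas, &mm->mm_mt, bp_vaddr, bp_vaddr);
		struct vm_area_struct *vma;
		struct file *file;
		int seq;

		rcu_read_lock();
		vma = mas_walk(&mas);			/* as in your sketch below */
		if (!vma)
			goto fallback;

		seq = READ_ONCE(vma->vm_lock_seq);
		if (seq == READ_ONCE(mm->mm_lock_seq))	/* write in progress */
			goto fallback;

		file = get_file(vma->vm_file);		/* <-- not RCU-safe today */
		if (READ_ONCE(vma->vm_lock_seq) != seq) {
			fput(file);			/* VMA changed under us */
			goto fallback;
		}
		rcu_read_unlock();
		return file;				/* caller does fput() */

	fallback:
		rcu_read_unlock();
		return NULL;			/* take the mmap_read_lock() path */
	}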
> +static struct uprobe *__find_active_uprobe(unsigned long bp_vaddr)
> {
> struct mm_struct *mm = current->mm;
> struct uprobe *uprobe = NULL;
> struct vm_area_struct *vma;
> + MA_STATE(mas, &mm->mm_mt, bp_vaddr, bp_vaddr);
> +
> + guard(rcu)();
> +
> +again:
> + vma = mas_walk(&mas);
> + if (!vma)
> + return NULL;
> +
> + /* vma_write_start() -- in progress */
> + if (READ_ONCE(vma->vm_lock_seq) == READ_ONCE(vma->vm_mm->mm_lock_seq))
> + return NULL;
> +
> + /*
> + * Completely broken, because of the crazy vma locking scheme you
> + * cannot avoid the per-vma rwlock and doing so means you're racy
> + * against modifications.
> + *
> + * A simple actual seqcount would'be been cheaper and more usefull.
> + */
> +
> + if (!valid_vma(vma, false))
> + return NULL;
> +
> + struct inode = file_inode(vma->vm_file);
> + loff_t offset = vaddr_to_offset(vma, bp_vaddr);
> +
> + // XXX: if (vma_seq_retry(...)) goto again;
> +
> + return find_uprobe(inode, offset);
> +}
> +#endif
> +
> +static struct uprobe *find_active_uprobe(unsigned long bp_vaddr, int *is_swbp)
> +{
> + struct uprobe *uprobe = __find_active_uprobe(bp_vaddr)
> + struct mm_struct *mm = current->mm;
> + struct vm_area_struct *vma;
> +
> + if (uprobe)
> + return uprobe;
>
> mmap_read_lock(mm);
> vma = vma_lookup(mm, bp_vaddr);
>
^ permalink raw reply [flat|nested] 84+ messages in thread
* Re: [PATCH 00/10] perf/uprobe: Optimize uprobes
2024-07-22 19:09 ` Suren Baghdasaryan
@ 2024-07-27 0:20 ` Andrii Nakryiko
2024-07-27 1:29 ` Suren Baghdasaryan
2024-07-30 13:46 ` Peter Zijlstra
1 sibling, 1 reply; 84+ messages in thread
From: Andrii Nakryiko @ 2024-07-27 0:20 UTC (permalink / raw)
To: Suren Baghdasaryan
Cc: Peter Zijlstra, Matthew Wilcox, Paul E. McKenney,
Masami Hiramatsu, mingo, andrii, linux-kernel, rostedt, oleg,
jolsa, clm, bpf
On Mon, Jul 22, 2024 at 12:09 PM Suren Baghdasaryan <surenb@google.com> wrote:
>
> On Wed, Jul 10, 2024 at 2:40 AM Peter Zijlstra <peterz@infradead.org> wrote:
> >
> > On Wed, Jul 10, 2024 at 11:16:31AM +0200, Peter Zijlstra wrote:
> >
> > > If it were an actual sequence count, I could make it work, but sadly,
> > > not. Also, vma_end_write() seems to be missing :-( If anything it could
> > > be used to lockdep annotate the thing.
>
> Thanks Matthew for forwarding me this discussion!
>
> > >
> > > Mooo.. I need to stare more at this to see if perhaps it can be made to
> > > work, but so far, no joy :/
> >
> > See, this is what I want, except I can't close the race against VMA
> > modification because of that crazy locking scheme :/
>
> Happy to explain more about this crazy locking scheme. The catch is
> that we can write-lock a VMA only while holding mmap_lock for write
> and we unlock all write-locked VMAs together when we drop that
> mmap_lock:
>
> mmap_write_lock(mm);
> vma_start_write(vma1);
> vma_start_write(vma2);
> ...
> mmap_write_unlock(mm); -> vma_end_write_all(mm); // unlocks all locked vmas
>
> This is done because oftentimes we need to lock multiple VMAs when
> modifying the address space (vma merge/split) and unlocking them
> individually would be more expensive than unlocking them in bulk by
> incrementing mm->mm_lock_seq.
>
> >
> >
> > --- a/kernel/events/uprobes.c
> > +++ b/kernel/events/uprobes.c
> > @@ -2146,11 +2146,58 @@ static int is_trap_at_addr(struct mm_str
> > return is_trap_insn(&opcode);
> > }
> >
> > -static struct uprobe *find_active_uprobe(unsigned long bp_vaddr, int *is_swbp)
> > +#ifndef CONFIG_PER_VMA_LOCK
> > +static struct uprobe *__find_active_uprobe(unsigned long bp_vaddr)
> > +{
> > + return NULL;
> > +}
> > +#else
>
> IIUC your code below, you want to get vma->vm_file without locking the
> VMA. I think under RCU that would have been possible if vma->vm_file
> were RCU-safe, which it's not (we had discussions with Paul and
> Matthew about that in
> https://lore.kernel.org/all/CAJuCfpHW2=Zu+CHXL+5fjWxGk=CVix=C66ra+DmXgn6r3+fsXg@mail.gmail.com/).
> Otherwise you could store the value of vma->vm_lock_seq before
> comparing it with mm->mm_lock_seq, then do get_file(vma->file) and
> then compare your locally stored vm_lock_seq against vma->vm_lock_seq
> to see if VMA got locked for modification after we got the file. So,
> unless I miss some other race, I think the VMA locking sequence does
> not preclude you from implementing __find_active_uprobe() but
> accessing vma->vm_file would be unsafe without some kind of locking.
Hey Suren!
I haven't yet dug properly into this, but from quick checking
around I think for the hot path (where this all matters), we really
only want to get vma's underlying inode. vm_file itself is just a
means to that end. If there is some clever way to do
vma->vm_file->f_inode under RCU and without mmap_read_lock, that would
be good enough, I think.
>
> > +static struct uprobe *__find_active_uprobe(unsigned long bp_vaddr)
> > {
> > struct mm_struct *mm = current->mm;
> > struct uprobe *uprobe = NULL;
> > struct vm_area_struct *vma;
> > + MA_STATE(mas, &mm->mm_mt, bp_vaddr, bp_vaddr);
> > +
> > + guard(rcu)();
> > +
> > +again:
> > + vma = mas_walk(&mas);
> > + if (!vma)
> > + return NULL;
> > +
> > + /* vma_write_start() -- in progress */
> > + if (READ_ONCE(vma->vm_lock_seq) == READ_ONCE(vma->vm_mm->mm_lock_seq))
> > + return NULL;
> > +
> > + /*
> > + * Completely broken, because of the crazy vma locking scheme you
> > + * cannot avoid the per-vma rwlock and doing so means you're racy
> > + * against modifications.
> > + *
> > + * A simple actual seqcount would'be been cheaper and more usefull.
> > + */
> > +
> > + if (!valid_vma(vma, false))
> > + return NULL;
> > +
> > + struct inode = file_inode(vma->vm_file);
> > + loff_t offset = vaddr_to_offset(vma, bp_vaddr);
> > +
> > + // XXX: if (vma_seq_retry(...)) goto again;
> > +
> > + return find_uprobe(inode, offset);
> > +}
> > +#endif
> > +
> > +static struct uprobe *find_active_uprobe(unsigned long bp_vaddr, int *is_swbp)
> > +{
> > + struct uprobe *uprobe = __find_active_uprobe(bp_vaddr)
> > + struct mm_struct *mm = current->mm;
> > + struct vm_area_struct *vma;
> > +
> > + if (uprobe)
> > + return uprobe;
> >
> > mmap_read_lock(mm);
> > vma = vma_lookup(mm, bp_vaddr);
> >
^ permalink raw reply [flat|nested] 84+ messages in thread
* Re: [PATCH 00/10] perf/uprobe: Optimize uprobes
2024-07-27 0:20 ` Andrii Nakryiko
@ 2024-07-27 1:29 ` Suren Baghdasaryan
2024-07-27 3:45 ` Matthew Wilcox
0 siblings, 1 reply; 84+ messages in thread
From: Suren Baghdasaryan @ 2024-07-27 1:29 UTC (permalink / raw)
To: Andrii Nakryiko
Cc: Peter Zijlstra, Matthew Wilcox, Paul E. McKenney,
Masami Hiramatsu, mingo, andrii, linux-kernel, rostedt, oleg,
jolsa, clm, bpf
On Fri, Jul 26, 2024 at 5:20 PM Andrii Nakryiko
<andrii.nakryiko@gmail.com> wrote:
>
> On Mon, Jul 22, 2024 at 12:09 PM Suren Baghdasaryan <surenb@google.com> wrote:
> >
> > On Wed, Jul 10, 2024 at 2:40 AM Peter Zijlstra <peterz@infradead.org> wrote:
> > >
> > > On Wed, Jul 10, 2024 at 11:16:31AM +0200, Peter Zijlstra wrote:
> > >
> > > > If it were an actual sequence count, I could make it work, but sadly,
> > > > not. Also, vma_end_write() seems to be missing :-( If anything it could
> > > > be used to lockdep annotate the thing.
> >
> > Thanks Matthew for forwarding me this discussion!
> >
> > > >
> > > > Mooo.. I need to stare more at this to see if perhaps it can be made to
> > > > work, but so far, no joy :/
> > >
> > > See, this is what I want, except I can't close the race against VMA
> > > modification because of that crazy locking scheme :/
> >
> > Happy to explain more about this crazy locking scheme. The catch is
> > that we can write-lock a VMA only while holding mmap_lock for write
> > and we unlock all write-locked VMAs together when we drop that
> > mmap_lock:
> >
> > mmap_write_lock(mm);
> > vma_start_write(vma1);
> > vma_start_write(vma2);
> > ...
> > mmap_write_unlock(mm); -> vma_end_write_all(mm); // unlocks all locked vmas
> >
> > This is done because oftentimes we need to lock multiple VMAs when
> > modifying the address space (vma merge/split) and unlocking them
> > individually would be more expensive than unlocking them in bulk by
> > incrementing mm->mm_lock_seq.
> >
> > >
> > >
> > > --- a/kernel/events/uprobes.c
> > > +++ b/kernel/events/uprobes.c
> > > @@ -2146,11 +2146,58 @@ static int is_trap_at_addr(struct mm_str
> > > return is_trap_insn(&opcode);
> > > }
> > >
> > > -static struct uprobe *find_active_uprobe(unsigned long bp_vaddr, int *is_swbp)
> > > +#ifndef CONFIG_PER_VMA_LOCK
> > > +static struct uprobe *__find_active_uprobe(unsigned long bp_vaddr)
> > > +{
> > > + return NULL;
> > > +}
> > > +#else
> >
> > IIUC your code below, you want to get vma->vm_file without locking the
> > VMA. I think under RCU that would have been possible if vma->vm_file
> > were RCU-safe, which it's not (we had discussions with Paul and
> > Matthew about that in
> > https://lore.kernel.org/all/CAJuCfpHW2=Zu+CHXL+5fjWxGk=CVix=C66ra+DmXgn6r3+fsXg@mail.gmail.com/).
> > Otherwise you could store the value of vma->vm_lock_seq before
> > comparing it with mm->mm_lock_seq, then do get_file(vma->file) and
> > then compare your locally stored vm_lock_seq against vma->vm_lock_seq
> > to see if VMA got locked for modification after we got the file. So,
> > unless I miss some other race, I think the VMA locking sequence does
> > not preclude you from implementing __find_active_uprobe() but
> > accessing vma->vm_file would be unsafe without some kind of locking.
>
> Hey Suren!
>
> I've haven't yet dug properly into this, but from quick checking
> around I think for the hot path (where this all matters), we really
> only want to get vma's underlying inode. vm_file itself is just a
> means to that end. If there is some clever way to do
> vma->vm_file->f_inode under RCU and without mmap_read_lock, that would
> be good enough, I think.
Hi Andrii,
Sorry, I'm not aware of any other way to get the inode from vma. Maybe
Matthew with his FS background can find a way?
>
> >
> > > +static struct uprobe *__find_active_uprobe(unsigned long bp_vaddr)
> > > {
> > > struct mm_struct *mm = current->mm;
> > > struct uprobe *uprobe = NULL;
> > > struct vm_area_struct *vma;
> > > + MA_STATE(mas, &mm->mm_mt, bp_vaddr, bp_vaddr);
> > > +
> > > + guard(rcu)();
> > > +
> > > +again:
> > > + vma = mas_walk(&mas);
> > > + if (!vma)
> > > + return NULL;
> > > +
> > > + /* vma_write_start() -- in progress */
> > > + if (READ_ONCE(vma->vm_lock_seq) == READ_ONCE(vma->vm_mm->mm_lock_seq))
> > > + return NULL;
> > > +
> > > + /*
> > > + * Completely broken, because of the crazy vma locking scheme you
> > > + * cannot avoid the per-vma rwlock and doing so means you're racy
> > > + * against modifications.
> > > + *
> > > + * A simple actual seqcount would'be been cheaper and more usefull.
> > > + */
> > > +
> > > + if (!valid_vma(vma, false))
> > > + return NULL;
> > > +
> > > + struct inode = file_inode(vma->vm_file);
> > > + loff_t offset = vaddr_to_offset(vma, bp_vaddr);
> > > +
> > > + // XXX: if (vma_seq_retry(...)) goto again;
> > > +
> > > + return find_uprobe(inode, offset);
> > > +}
> > > +#endif
> > > +
> > > +static struct uprobe *find_active_uprobe(unsigned long bp_vaddr, int *is_swbp)
> > > +{
> > > + struct uprobe *uprobe = __find_active_uprobe(bp_vaddr)
> > > + struct mm_struct *mm = current->mm;
> > > + struct vm_area_struct *vma;
> > > +
> > > + if (uprobe)
> > > + return uprobe;
> > >
> > > mmap_read_lock(mm);
> > > vma = vma_lookup(mm, bp_vaddr);
> > >
^ permalink raw reply [flat|nested] 84+ messages in thread
* Re: [PATCH 00/10] perf/uprobe: Optimize uprobes
2024-07-27 1:29 ` Suren Baghdasaryan
@ 2024-07-27 3:45 ` Matthew Wilcox
2024-07-30 3:18 ` Andrii Nakryiko
2024-07-30 13:10 ` Peter Zijlstra
0 siblings, 2 replies; 84+ messages in thread
From: Matthew Wilcox @ 2024-07-27 3:45 UTC (permalink / raw)
To: Suren Baghdasaryan
Cc: Andrii Nakryiko, Peter Zijlstra, Paul E. McKenney,
Masami Hiramatsu, mingo, andrii, linux-kernel, rostedt, oleg,
jolsa, clm, bpf
On Fri, Jul 26, 2024 at 06:29:44PM -0700, Suren Baghdasaryan wrote:
> On Fri, Jul 26, 2024 at 5:20 PM Andrii Nakryiko
> <andrii.nakryiko@gmail.com> wrote:
> >
> > On Mon, Jul 22, 2024 at 12:09 PM Suren Baghdasaryan <surenb@google.com> wrote:
> > >
> > > On Wed, Jul 10, 2024 at 2:40 AM Peter Zijlstra <peterz@infradead.org> wrote:
> > > >
> > > > On Wed, Jul 10, 2024 at 11:16:31AM +0200, Peter Zijlstra wrote:
> > > >
> > > > > If it were an actual sequence count, I could make it work, but sadly,
> > > > > not. Also, vma_end_write() seems to be missing :-( If anything it could
> > > > > be used to lockdep annotate the thing.
> > >
> > > Thanks Matthew for forwarding me this discussion!
> > >
> > > > >
> > > > > Mooo.. I need to stare more at this to see if perhaps it can be made to
> > > > > work, but so far, no joy :/
> > > >
> > > > See, this is what I want, except I can't close the race against VMA
> > > > modification because of that crazy locking scheme :/
> > >
> > > Happy to explain more about this crazy locking scheme. The catch is
> > > that we can write-lock a VMA only while holding mmap_lock for write
> > > and we unlock all write-locked VMAs together when we drop that
> > > mmap_lock:
> > >
> > > mmap_write_lock(mm);
> > > vma_start_write(vma1);
> > > vma_start_write(vma2);
> > > ...
> > > mmap_write_unlock(mm); -> vma_end_write_all(mm); // unlocks all locked vmas
> > >
> > > This is done because oftentimes we need to lock multiple VMAs when
> > > modifying the address space (vma merge/split) and unlocking them
> > > individually would be more expensive than unlocking them in bulk by
> > > incrementing mm->mm_lock_seq.
> > >
> > > >
> > > >
> > > > --- a/kernel/events/uprobes.c
> > > > +++ b/kernel/events/uprobes.c
> > > > @@ -2146,11 +2146,58 @@ static int is_trap_at_addr(struct mm_str
> > > > return is_trap_insn(&opcode);
> > > > }
> > > >
> > > > -static struct uprobe *find_active_uprobe(unsigned long bp_vaddr, int *is_swbp)
> > > > +#ifndef CONFIG_PER_VMA_LOCK
> > > > +static struct uprobe *__find_active_uprobe(unsigned long bp_vaddr)
> > > > +{
> > > > + return NULL;
> > > > +}
> > > > +#else
> > >
> > > IIUC your code below, you want to get vma->vm_file without locking the
> > > VMA. I think under RCU that would have been possible if vma->vm_file
> > > were RCU-safe, which it's not (we had discussions with Paul and
> > > Matthew about that in
> > > https://lore.kernel.org/all/CAJuCfpHW2=Zu+CHXL+5fjWxGk=CVix=C66ra+DmXgn6r3+fsXg@mail.gmail.com/).
> > > Otherwise you could store the value of vma->vm_lock_seq before
> > > comparing it with mm->mm_lock_seq, then do get_file(vma->file) and
> > > then compare your locally stored vm_lock_seq against vma->vm_lock_seq
> > > to see if VMA got locked for modification after we got the file. So,
> > > unless I miss some other race, I think the VMA locking sequence does
> > > not preclude you from implementing __find_active_uprobe() but
> > > accessing vma->vm_file would be unsafe without some kind of locking.
> >
> > Hey Suren!
> >
> > I've haven't yet dug properly into this, but from quick checking
> > around I think for the hot path (where this all matters), we really
> > only want to get vma's underlying inode. vm_file itself is just a
> > means to that end. If there is some clever way to do
> > vma->vm_file->f_inode under RCU and without mmap_read_lock, that would
> > be good enough, I think.
>
> Hi Andrii,
> Sorry, I'm not aware of any other way to get the inode from vma. Maybe
> Matthew with his FS background can find a way?
Hum. What if we added SLAB_TYPESAFE_BY_RCU to files_cachep? That way
we could do:
	inode = NULL;
	rcu_read_lock();
	vma = find_vma(mm, address);
	if (!vma)
		goto unlock;
	file = READ_ONCE(vma->vm_file);
	if (!file)
		goto unlock;
	inode = file->f_inode;
	if (file != READ_ONCE(vma->vm_file))
		inode = NULL;
unlock:
	rcu_read_unlock();

	if (inode)
		return inode;

	mmap_read_lock();
	vma = find_vma(mm, address);
	...
I think this would be safe because 'vma' will not be reused while we
hold the read lock, and while 'file' might be reused, whatever f_inode
points to won't be used if vm_file is no longer what it once was.
On the other hand, it's quarter to midnight on Friday, and I have a
terrible virus that I'm struggling through, so not ideal circumstances
for me to be reasoning about RCU guarantees.
^ permalink raw reply [flat|nested] 84+ messages in thread
* Re: [PATCH 00/10] perf/uprobe: Optimize uprobes
2024-07-27 3:45 ` Matthew Wilcox
@ 2024-07-30 3:18 ` Andrii Nakryiko
2024-07-30 13:10 ` Peter Zijlstra
1 sibling, 0 replies; 84+ messages in thread
From: Andrii Nakryiko @ 2024-07-30 3:18 UTC (permalink / raw)
To: Matthew Wilcox
Cc: Suren Baghdasaryan, Peter Zijlstra, Paul E. McKenney,
Masami Hiramatsu, mingo, andrii, linux-kernel, rostedt, oleg,
jolsa, clm, bpf
On Fri, Jul 26, 2024 at 8:45 PM Matthew Wilcox <willy@infradead.org> wrote:
>
> On Fri, Jul 26, 2024 at 06:29:44PM -0700, Suren Baghdasaryan wrote:
> > On Fri, Jul 26, 2024 at 5:20 PM Andrii Nakryiko
> > <andrii.nakryiko@gmail.com> wrote:
> > >
> > > On Mon, Jul 22, 2024 at 12:09 PM Suren Baghdasaryan <surenb@google.com> wrote:
> > > >
> > > > On Wed, Jul 10, 2024 at 2:40 AM Peter Zijlstra <peterz@infradead.org> wrote:
> > > > >
> > > > > On Wed, Jul 10, 2024 at 11:16:31AM +0200, Peter Zijlstra wrote:
> > > > >
> > > > > > If it were an actual sequence count, I could make it work, but sadly,
> > > > > > not. Also, vma_end_write() seems to be missing :-( If anything it could
> > > > > > be used to lockdep annotate the thing.
> > > >
> > > > Thanks Matthew for forwarding me this discussion!
> > > >
> > > > > >
> > > > > > Mooo.. I need to stare more at this to see if perhaps it can be made to
> > > > > > work, but so far, no joy :/
> > > > >
> > > > > See, this is what I want, except I can't close the race against VMA
> > > > > modification because of that crazy locking scheme :/
> > > >
> > > > Happy to explain more about this crazy locking scheme. The catch is
> > > > that we can write-lock a VMA only while holding mmap_lock for write
> > > > and we unlock all write-locked VMAs together when we drop that
> > > > mmap_lock:
> > > >
> > > > mmap_write_lock(mm);
> > > > vma_start_write(vma1);
> > > > vma_start_write(vma2);
> > > > ...
> > > > mmap_write_unlock(mm); -> vma_end_write_all(mm); // unlocks all locked vmas
> > > >
> > > > This is done because oftentimes we need to lock multiple VMAs when
> > > > modifying the address space (vma merge/split) and unlocking them
> > > > individually would be more expensive than unlocking them in bulk by
> > > > incrementing mm->mm_lock_seq.
> > > >
> > > > >
> > > > >
> > > > > --- a/kernel/events/uprobes.c
> > > > > +++ b/kernel/events/uprobes.c
> > > > > @@ -2146,11 +2146,58 @@ static int is_trap_at_addr(struct mm_str
> > > > > return is_trap_insn(&opcode);
> > > > > }
> > > > >
> > > > > -static struct uprobe *find_active_uprobe(unsigned long bp_vaddr, int *is_swbp)
> > > > > +#ifndef CONFIG_PER_VMA_LOCK
> > > > > +static struct uprobe *__find_active_uprobe(unsigned long bp_vaddr)
> > > > > +{
> > > > > + return NULL;
> > > > > +}
> > > > > +#else
> > > >
> > > > IIUC your code below, you want to get vma->vm_file without locking the
> > > > VMA. I think under RCU that would have been possible if vma->vm_file
> > > > were RCU-safe, which it's not (we had discussions with Paul and
> > > > Matthew about that in
> > > > https://lore.kernel.org/all/CAJuCfpHW2=Zu+CHXL+5fjWxGk=CVix=C66ra+DmXgn6r3+fsXg@mail.gmail.com/).
> > > > Otherwise you could store the value of vma->vm_lock_seq before
> > > > comparing it with mm->mm_lock_seq, then do get_file(vma->file) and
> > > > then compare your locally stored vm_lock_seq against vma->vm_lock_seq
> > > > to see if VMA got locked for modification after we got the file. So,
> > > > unless I miss some other race, I think the VMA locking sequence does
> > > > not preclude you from implementing __find_active_uprobe() but
> > > > accessing vma->vm_file would be unsafe without some kind of locking.
> > >
> > > Hey Suren!
> > >
> > > I haven't yet dug properly into this, but from quick checking
> > > around I think for the hot path (where this all matters), we really
> > > only want to get vma's underlying inode. vm_file itself is just a
> > > means to that end. If there is some clever way to do
> > > vma->vm_file->f_inode under RCU and without mmap_read_lock, that would
> > > be good enough, I think.
> >
> > Hi Andrii,
> > Sorry, I'm not aware of any other way to get the inode from vma. Maybe
> > Matthew with his FS background can find a way?
>
> Hum. What if we added SLAB_TYPESAFE_BY_RCU to files_cachep? That way
> we could do:
>
> inode = NULL;
> rcu_read_lock();
> vma = find_vma(mm, address);
> if (!vma)
> goto unlock;
> file = READ_ONCE(vma->vm_file);
> if (!file)
> goto unlock;
> inode = file->f_inode;
> if (file != READ_ONCE(vma->vm_file))
> inode = NULL;
> unlock:
> rcu_read_unlock();
>
> if (inode)
> return inode;
> mmap_read_lock();
> vma = find_vma(mm, address);
> ...
>
> I think this would be safe because 'vma' will not be reused while we
> hold the read lock, and while 'file' might be reused, whatever f_inode
> points to won't be used if vm_file is no longer what it once was.
>
> On the other hand, it's quarter to midnight on Friday, and I have a
> terrible virus that I'm struggling through, so not ideal circumstances
> for me to be reasoning about RCU guarantees.
Hi Matthew,
Hopefully you got some rest over the weekend and are feeling better!
What you proposed above with SLAB_TYPESAFE_BY_RCU (assuming it is safe
and correct) I think would solve the uprobe scalability problem when
it comes to mmap_lock. So did you get a chance to think this through
some more? And if that still seems correct, how should we go about
making this happen?
^ permalink raw reply [flat|nested] 84+ messages in thread
* Re: [PATCH 00/10] perf/uprobe: Optimize uprobes
2024-07-27 3:45 ` Matthew Wilcox
2024-07-30 3:18 ` Andrii Nakryiko
@ 2024-07-30 13:10 ` Peter Zijlstra
2024-07-30 18:10 ` Suren Baghdasaryan
1 sibling, 1 reply; 84+ messages in thread
From: Peter Zijlstra @ 2024-07-30 13:10 UTC (permalink / raw)
To: Matthew Wilcox
Cc: Suren Baghdasaryan, Andrii Nakryiko, Paul E. McKenney,
Masami Hiramatsu, mingo, andrii, linux-kernel, rostedt, oleg,
jolsa, clm, bpf
On Sat, Jul 27, 2024 at 04:45:53AM +0100, Matthew Wilcox wrote:
> Hum. What if we added SLAB_TYPESAFE_BY_RCU to files_cachep? That way
> we could do:
>
> inode = NULL;
> rcu_read_lock();
> vma = find_vma(mm, address);
> if (!vma)
> goto unlock;
> file = READ_ONCE(vma->vm_file);
> if (!file)
> goto unlock;
> inode = file->f_inode;
> if (file != READ_ONCE(vma->vm_file))
> inode = NULL;
remove_vma() does not clear vm_file, nor do I think we ever re-assign
this field after it is set on creation.
That is, I'm struggling to see what this would do. AFAICT this can still
happen:
        rcu_read_lock();
        vma = find_vma();
                                        remove_vma()
                                          fput(vma->vm_file);
                                        dup_fd()
                                          newf = kmem_cache_alloc(...)
                                          newf->f_inode = blah

        file = READ_ONCE(vma->vm_file);
        inode = file->f_inode;                  // blah
        if (file != READ_ONCE(vma->vm_file))    // still match
> unlock:
> rcu_read_unlock();
>
> if (inode)
> return inode;
> mmap_read_lock();
> vma = find_vma(mm, address);
> ...
>
> I think this would be safe because 'vma' will not be reused while we
> hold the read lock, and while 'file' might be reused, whatever f_inode
> points to won't be used if vm_file is no longer what it once was.
Also, we need vaddr_to_offset() which needs additional serialization
against vma_lock.
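For reference, vaddr_to_offset() is roughly the following helper (a sketch of the in-tree definition; the point being that vm_pgoff and vm_start have to be read consistently with each other):

        static loff_t vaddr_to_offset(struct vm_area_struct *vma, unsigned long vaddr)
        {
                return ((loff_t)vma->vm_pgoff << PAGE_SHIFT) + (vaddr - vma->vm_start);
        }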
^ permalink raw reply [flat|nested] 84+ messages in thread
* Re: [PATCH 00/10] perf/uprobe: Optimize uprobes
2024-07-22 19:09 ` Suren Baghdasaryan
2024-07-27 0:20 ` Andrii Nakryiko
@ 2024-07-30 13:46 ` Peter Zijlstra
2024-07-30 18:16 ` Suren Baghdasaryan
1 sibling, 1 reply; 84+ messages in thread
From: Peter Zijlstra @ 2024-07-30 13:46 UTC (permalink / raw)
To: Suren Baghdasaryan
Cc: Matthew Wilcox, Paul E. McKenney, Andrii Nakryiko,
Masami Hiramatsu, mingo, andrii, linux-kernel, rostedt, oleg,
jolsa, clm, bpf
On Mon, Jul 22, 2024 at 12:09:21PM -0700, Suren Baghdasaryan wrote:
> On Wed, Jul 10, 2024 at 2:40 AM Peter Zijlstra <peterz@infradead.org> wrote:
> >
> > On Wed, Jul 10, 2024 at 11:16:31AM +0200, Peter Zijlstra wrote:
> >
> > > If it were an actual sequence count, I could make it work, but sadly,
> > > not. Also, vma_end_write() seems to be missing :-( If anything it could
> > > be used to lockdep annotate the thing.
>
> Thanks Matthew for forwarding me this discussion!
>
> > >
> > > Mooo.. I need to stare more at this to see if perhaps it can be made to
> > > work, but so far, no joy :/
> >
> > See, this is what I want, except I can't close the race against VMA
> > modification because of that crazy locking scheme :/
>
> Happy to explain more about this crazy locking scheme. The catch is
> that we can write-lock a VMA only while holding mmap_lock for write
> and we unlock all write-locked VMAs together when we drop that
> mmap_lock:
>
> mmap_write_lock(mm);
> vma_start_write(vma1);
> vma_start_write(vma2);
> ...
> mmap_write_unlock(mm); -> vma_end_write_all(mm); // unlocks all locked vmas
>
> This is done because oftentimes we need to lock multiple VMAs when
> modifying the address space (vma merge/split) and unlocking them
> individually would be more expensive than unlocking them in bulk by
> incrementing mm->mm_lock_seq.
Right, but you can do that without having it quite this insane.
You can still make mm_lock_seq a proper seqcount, and still have
vma_end_write() -- even if it's an empty stub only used for validation.
That is, something like the below, which adds a light barrier, ensures
that mm_lock_seq is a proper sequence count.
diff --git a/include/linux/mmap_lock.h b/include/linux/mmap_lock.h
index de9dc20b01ba..daa19d1a3022 100644
--- a/include/linux/mmap_lock.h
+++ b/include/linux/mmap_lock.h
@@ -104,6 +104,8 @@ static inline void mmap_write_lock(struct mm_struct *mm)
 {
        __mmap_lock_trace_start_locking(mm, true);
        down_write(&mm->mmap_lock);
+       WRITE_ONCE(mm->mm_lock_seq, mm->mm_lock_seq+1);
+       smp_wmb();
        __mmap_lock_trace_acquire_returned(mm, true, true);
 }
With the above addition we could write (although I think we still need
the RCU_SLAB thing on files_cachep):
static struct uprobe *__find_active_uprobe(unsigned long bp_vaddr)
{
        struct mm_struct *mm = current->mm;
        struct uprobe *uprobe = NULL;
        struct vm_area_struct *vma;
        struct inode *inode;
        loff_t offset;
        int seq;

        guard(rcu)();

        seq = READ_ONCE(mm->mm_lock_seq);
        smp_rmb();
        do {
                vma = find_vma(mm, bp_vaddr);
                if (!vma)
                        return NULL;

                if (!valid_vma(vma, false))
                        return NULL;

                inode = file_inode(vma->vm_file);
                offset = vaddr_to_offset(vma, bp_vaddr);

        } while (smp_rmb(), seq != READ_ONCE(mm->mm_lock_seq));

        return find_uprobe(inode, offset);
}
^ permalink raw reply related [flat|nested] 84+ messages in thread
* Re: [PATCH 00/10] perf/uprobe: Optimize uprobes
2024-07-30 13:10 ` Peter Zijlstra
@ 2024-07-30 18:10 ` Suren Baghdasaryan
2024-08-03 5:47 ` Andrii Nakryiko
0 siblings, 1 reply; 84+ messages in thread
From: Suren Baghdasaryan @ 2024-07-30 18:10 UTC (permalink / raw)
To: Peter Zijlstra
Cc: Matthew Wilcox, Andrii Nakryiko, Paul E. McKenney,
Masami Hiramatsu, mingo, andrii, linux-kernel, rostedt, oleg,
jolsa, clm, bpf
On Tue, Jul 30, 2024 at 6:11 AM Peter Zijlstra <peterz@infradead.org> wrote:
>
> On Sat, Jul 27, 2024 at 04:45:53AM +0100, Matthew Wilcox wrote:
>
> > Hum. What if we added SLAB_TYPESAFE_BY_RCU to files_cachep? That way
> > we could do:
> >
> > inode = NULL;
> > rcu_read_lock();
> > vma = find_vma(mm, address);
> > if (!vma)
> > goto unlock;
> > file = READ_ONCE(vma->vm_file);
> > if (!file)
> > goto unlock;
> > inode = file->f_inode;
> > if (file != READ_ONCE(vma->vm_file))
> > inode = NULL;
>
> remove_vma() does not clear vm_file, nor do I think we ever re-assign
> this field after it is set on creation.
Quite correct, and even if we clear vm_file in remove_vma() and/or
reset it on creation, I don't think that would be enough. IIUC the
warning about SLAB_TYPESAFE_BY_RCU here:
https://elixir.bootlin.com/linux/v6.10.2/source/include/linux/slab.h#L98
means that the vma object can be reused in the same RCU grace period.
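The pattern that comment prescribes for such caches is roughly the following (a paraphrased sketch; lockless_lookup()/try_get_ref()/put_ref() are placeholders, not real kernel APIs):

        rcu_read_lock();
again:
        obj = lockless_lookup(key);
        if (obj) {
                if (!try_get_ref(obj))          /* may fail for a freed/reused object */
                        goto again;
                if (obj->key != key) {          /* object was reused for something else */
                        put_ref(obj);
                        goto again;
                }
        }
        rcu_read_unlock();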
>
> That is, I'm struggling to see what this would do. AFAICT this can still
> happen:
>
> rcu_read_lock();
> vma = find_vma();
> remove_vma()
> fput(vma->vm_file);
> dup_fd)
> newf = kmem_cache_alloc(...)
> newf->f_inode = blah
>
Imagine that the vma got freed and reused at this point. Then
vma->vm_file might be pointing to a valid but completely different
file.
> file = READ_ONCE(vma->vm_file);
> inode = file->f_inode; // blah
> if (file != READ_ONCE(vma->vm_file)) // still match
I think we should also check that the VMA represents the same area
after we obtained the inode.
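Something like the following on top of Matthew's snippet (purely an illustrative sketch; 'address' is the lookup address from that snippet):

        inode = file->f_inode;
        /* re-validate both the file and the range this VMA covers */
        if (file != READ_ONCE(vma->vm_file) ||
            address <  READ_ONCE(vma->vm_start) ||
            address >= READ_ONCE(vma->vm_end))
                inode = NULL;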
>
>
> > unlock:
> > rcu_read_unlock();
> >
> > if (inode)
> > return inode;
> > mmap_read_lock();
> > vma = find_vma(mm, address);
> > ...
> >
> > I think this would be safe because 'vma' will not be reused while we
> > hold the read lock, and while 'file' might be reused, whatever f_inode
> > points to won't be used if vm_file is no longer what it once was.
>
>
> Also, we need vaddr_to_offset() which needs additional serialization
> against vma_lock.
^ permalink raw reply [flat|nested] 84+ messages in thread
* Re: [PATCH 00/10] perf/uprobe: Optimize uprobes
2024-07-30 13:46 ` Peter Zijlstra
@ 2024-07-30 18:16 ` Suren Baghdasaryan
0 siblings, 0 replies; 84+ messages in thread
From: Suren Baghdasaryan @ 2024-07-30 18:16 UTC (permalink / raw)
To: Peter Zijlstra
Cc: Matthew Wilcox, Paul E. McKenney, Andrii Nakryiko,
Masami Hiramatsu, mingo, andrii, linux-kernel, rostedt, oleg,
jolsa, clm, bpf
On Tue, Jul 30, 2024 at 6:46 AM Peter Zijlstra <peterz@infradead.org> wrote:
>
> On Mon, Jul 22, 2024 at 12:09:21PM -0700, Suren Baghdasaryan wrote:
> > On Wed, Jul 10, 2024 at 2:40 AM Peter Zijlstra <peterz@infradead.org> wrote:
> > >
> > > On Wed, Jul 10, 2024 at 11:16:31AM +0200, Peter Zijlstra wrote:
> > >
> > > > If it were an actual sequence count, I could make it work, but sadly,
> > > > not. Also, vma_end_write() seems to be missing :-( If anything it could
> > > > be used to lockdep annotate the thing.
> >
> > Thanks Matthew for forwarding me this discussion!
> >
> > > >
> > > > Mooo.. I need to stare more at this to see if perhaps it can be made to
> > > > work, but so far, no joy :/
> > >
> > > See, this is what I want, except I can't close the race against VMA
> > > modification because of that crazy locking scheme :/
> >
> > Happy to explain more about this crazy locking scheme. The catch is
> > that we can write-lock a VMA only while holding mmap_lock for write
> > and we unlock all write-locked VMAs together when we drop that
> > mmap_lock:
> >
> > mmap_write_lock(mm);
> > vma_start_write(vma1);
> > vma_start_write(vma2);
> > ...
> > mmap_write_unlock(mm); -> vma_end_write_all(mm); // unlocks all locked vmas
> >
> > This is done because oftentimes we need to lock multiple VMAs when
> > modifying the address space (vma merge/split) and unlocking them
> > individually would be more expensive than unlocking them in bulk by
> > incrementing mm->mm_lock_seq.
>
> Right, but you can do that without having it quite this insane.
I'm happy to take any suggestions that would improve the current mechanism.
>
> You can still make mm_lock_seq a proper seqcount, and still have
> vma_end_write() -- even if its an empty stub only used for validation.
It's doable but what will we be validating here? That the vma is indeed locked?
>
> That is, something like the below, which adds a light barrier, ensures
> that mm_lock_seq is a proper sequence count.
>
> diff --git a/include/linux/mmap_lock.h b/include/linux/mmap_lock.h
> index de9dc20b01ba..daa19d1a3022 100644
> --- a/include/linux/mmap_lock.h
> +++ b/include/linux/mmap_lock.h
> @@ -104,6 +104,8 @@ static inline void mmap_write_lock(struct mm_struct *mm)
> {
> __mmap_lock_trace_start_locking(mm, true);
> down_write(&mm->mmap_lock);
> + WRITE_ONCE(mm->mm_lock_seq, mm->mm_lock_seq+1);
> + smp_wmb();
> __mmap_lock_trace_acquire_returned(mm, true, true);
> }
Ok, I'll try the above change and check the benchmarks for any regressions.
Thanks for the suggestions, Peter!
>
>
> With the above addition we could write (although I think we still need
> the RCU_SLAB thing on files_cachep):
>
> static struct uprobe *__find_active_uprobe(unsigned long bp_vaddr)
> {
> struct mm_struct *mm = current->mm;
> struct uprobe *uprobe = NULL;
> struct vm_area_struct *vma;
> struct inode *inode;
> loff_t offset;
> int seq;
>
> guard(rcu)();
>
> seq = READ_ONCE(mm->mm_lock_seq);
> smp_rmb();
> do {
> vma = find_vma(mm, bp_vaddr);
> if (!vma)
> return NULL;
>
> if (!valid_vma(vma, false))
> return NULL;
>
> inode = file_inode(vma->vm_file);
> offset = vaddr_to_offset(vma, bp_vaddr);
>
> } while (smp_rmb(), seq != READ_ONCE(mm->mm_lock_seq));
>
> return find_uprobe(inode, offset);
> }
>
^ permalink raw reply [flat|nested] 84+ messages in thread
* Re: [PATCH 00/10] perf/uprobe: Optimize uprobes
2024-07-30 18:10 ` Suren Baghdasaryan
@ 2024-08-03 5:47 ` Andrii Nakryiko
2024-08-03 8:53 ` Peter Zijlstra
0 siblings, 1 reply; 84+ messages in thread
From: Andrii Nakryiko @ 2024-08-03 5:47 UTC (permalink / raw)
To: Suren Baghdasaryan
Cc: Peter Zijlstra, Matthew Wilcox, Paul E. McKenney,
Masami Hiramatsu, mingo, andrii, linux-kernel, rostedt, oleg,
jolsa, clm, bpf
On Tue, Jul 30, 2024 at 11:10 AM Suren Baghdasaryan <surenb@google.com> wrote:
>
> On Tue, Jul 30, 2024 at 6:11 AM Peter Zijlstra <peterz@infradead.org> wrote:
> >
> > On Sat, Jul 27, 2024 at 04:45:53AM +0100, Matthew Wilcox wrote:
> >
> > > Hum. What if we added SLAB_TYPESAFE_BY_RCU to files_cachep? That way
> > > we could do:
> > >
> > > inode = NULL;
> > > rcu_read_lock();
> > > vma = find_vma(mm, address);
> > > if (!vma)
> > > goto unlock;
> > > file = READ_ONCE(vma->vm_file);
> > > if (!file)
> > > goto unlock;
> > > inode = file->f_inode;
> > > if (file != READ_ONCE(vma->vm_file))
> > > inode = NULL;
> >
> > remove_vma() does not clear vm_file, nor do I think we ever re-assign
> > this field after it is set on creation.
>
> Quite correct and even if we clear vm_file in remove_vma() and/or
> reset it on creation I don't think that would be enough. IIUC the
> warning about SLAB_TYPESAFE_BY_RCU here:
> https://elixir.bootlin.com/linux/v6.10.2/source/include/linux/slab.h#L98
> means that the vma object can be reused in the same RCU grace period.
>
> >
> > That is, I'm struggling to see what this would do. AFAICT this can still
> > happen:
> >
> > rcu_read_lock();
> > vma = find_vma();
> > remove_vma()
> > fput(vma->vm_file);
> > dup_fd)
> > newf = kmem_cache_alloc(...)
> > newf->f_inode = blah
> >
>
> Imagine that the vma got freed and reused at this point. Then
> vma->vm_file might be pointing to a valid but a completely different
> file.
>
> > file = READ_ONCE(vma->vm_file);
> > inode = file->f_inode; // blah
> > if (file != READ_ONCE(vma->vm_file)) // still match
>
> I think we should also check that the VMA represents the same area
> after we obtained the inode.
>
> >
> >
> > > unlock:
> > > rcu_read_unlock();
> > >
> > > if (inode)
> > > return inode;
> > > mmap_read_lock();
> > > vma = find_vma(mm, address);
> > > ...
> > >
> > > I think this would be safe because 'vma' will not be reused while we
> > > hold the read lock, and while 'file' might be reused, whatever f_inode
> > > points to won't be used if vm_file is no longer what it once was.
> >
> >
> > Also, we need vaddr_to_offset() which needs additional serialization
> > against vma_lock.
Is there any reason why the approach below won't work? I basically
open-coded the logic in find_active_uprobe(), doing find_vma() under
RCU lock, then fetching vma->vm_file->f_inode. If at any point any
check fails, we fall back to mmap_read_lock-protected logic, but if
not, we just validate that vma->vm_lock_seq didn't change in between
all this.
Note that we don't need to grab an inode refcount; we already keep
references to inodes in uprobes_tree, so if we find a match (using the
find_uprobe() call), then we are good. As long as that
vma->vm_file->f_inode chain didn't change, of course.
Is vma->vm_lock_seq updated on any change to a VMA? This seems to work
fine in practice, but would be good to know for sure.
Author: Andrii Nakryiko <andrii@kernel.org>
Date: Fri Aug 2 22:16:40 2024 -0700
uprobes: add speculative lockless VMA to inode resolution
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index 8be9e34e786a..e21b68a39f13 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -2251,6 +2251,52 @@ static struct uprobe *find_active_uprobe_rcu(unsigned long bp_vaddr, int *is_swbp)
struct uprobe *uprobe = NULL;
struct vm_area_struct *vma;
+#ifdef CONFIG_PER_VMA_LOCK
+ vm_flags_t flags = VM_HUGETLB | VM_MAYEXEC | VM_MAYSHARE, vm_flags;
+ struct file *vm_file;
+ struct inode *vm_inode;
+ unsigned long vm_pgoff, vm_start, vm_end;
+ int vm_lock_seq;
+ loff_t offset;
+
+ rcu_read_lock();
+
+ vma = vma_lookup(mm, bp_vaddr);
+ if (!vma)
+ goto retry_with_lock;
+
+ vm_lock_seq = READ_ONCE(vma->vm_lock_seq);
+
+ vm_file = READ_ONCE(vma->vm_file);
+ vm_flags = READ_ONCE(vma->vm_flags);
+ if (!vm_file || (vm_flags & flags) != VM_MAYEXEC)
+ goto retry_with_lock;
+
+ vm_inode = READ_ONCE(vm_file->f_inode);
+ vm_pgoff = READ_ONCE(vma->vm_pgoff);
+ vm_start = READ_ONCE(vma->vm_start);
+ vm_end = READ_ONCE(vma->vm_end);
+ if (bp_vaddr < vm_start || bp_vaddr >= vm_end)
+ goto retry_with_lock;
+
+ offset = (loff_t)(vm_pgoff << PAGE_SHIFT) + (bp_vaddr - vm_start);
+ uprobe = find_uprobe_rcu(vm_inode, offset);
+ if (!uprobe)
+ goto retry_with_lock;
+
+ /* now double check that nothing about VMA changed */
+ if (vm_lock_seq != READ_ONCE(vma->vm_lock_seq))
+ goto retry_with_lock;
+
+ /* happy case, we speculated successfully */
+ rcu_read_unlock();
+ return uprobe;
+
+retry_with_lock:
+ rcu_read_unlock();
+ uprobe = NULL;
+#endif
+
mmap_read_lock(mm);
vma = vma_lookup(mm, bp_vaddr);
if (vma) {
diff --git a/kernel/fork.c b/kernel/fork.c
index cc760491f201..211a84ee92b4 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -3160,7 +3160,7 @@ void __init proc_caches_init(void)
NULL);
files_cachep = kmem_cache_create("files_cache",
sizeof(struct files_struct), 0,
- SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_ACCOUNT,
+ SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_ACCOUNT|SLAB_TYPESAFE_BY_RCU,
NULL);
fs_cachep = kmem_cache_create("fs_cache",
sizeof(struct fs_struct), 0,
^ permalink raw reply related [flat|nested] 84+ messages in thread
* Re: [PATCH 00/10] perf/uprobe: Optimize uprobes
2024-08-03 5:47 ` Andrii Nakryiko
@ 2024-08-03 8:53 ` Peter Zijlstra
2024-08-04 23:22 ` Andrii Nakryiko
0 siblings, 1 reply; 84+ messages in thread
From: Peter Zijlstra @ 2024-08-03 8:53 UTC (permalink / raw)
To: Andrii Nakryiko
Cc: Suren Baghdasaryan, Matthew Wilcox, Paul E. McKenney,
Masami Hiramatsu, mingo, andrii, linux-kernel, rostedt, oleg,
jolsa, clm, bpf
On Fri, Aug 02, 2024 at 10:47:15PM -0700, Andrii Nakryiko wrote:
> Is there any reason why the approach below won't work?
> diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
> index 8be9e34e786a..e21b68a39f13 100644
> --- a/kernel/events/uprobes.c
> +++ b/kernel/events/uprobes.c
> @@ -2251,6 +2251,52 @@ static struct uprobe
> *find_active_uprobe_rcu(unsigned long bp_vaddr, int *is_swb
> struct uprobe *uprobe = NULL;
> struct vm_area_struct *vma;
>
> +#ifdef CONFIG_PER_VMA_LOCK
> + vm_flags_t flags = VM_HUGETLB | VM_MAYEXEC | VM_MAYSHARE, vm_flags;
> + struct file *vm_file;
> + struct inode *vm_inode;
> + unsigned long vm_pgoff, vm_start, vm_end;
> + int vm_lock_seq;
> + loff_t offset;
> +
> + rcu_read_lock();
> +
> + vma = vma_lookup(mm, bp_vaddr);
> + if (!vma)
> + goto retry_with_lock;
> +
> + vm_lock_seq = READ_ONCE(vma->vm_lock_seq);
So vma->vm_lock_seq is only updated on vma_start_write()
> +
> + vm_file = READ_ONCE(vma->vm_file);
> + vm_flags = READ_ONCE(vma->vm_flags);
> + if (!vm_file || (vm_flags & flags) != VM_MAYEXEC)
> + goto retry_with_lock;
> +
> + vm_inode = READ_ONCE(vm_file->f_inode);
> + vm_pgoff = READ_ONCE(vma->vm_pgoff);
> + vm_start = READ_ONCE(vma->vm_start);
> + vm_end = READ_ONCE(vma->vm_end);
None of those are written with WRITE_ONCE(), so this buys you nothing.
Compiler could be updating them one byte at a time while you load some
franken-update.
Also, if you're in the middle of split_vma() you might not get a
consistent set.
> + if (bp_vaddr < vm_start || bp_vaddr >= vm_end)
> + goto retry_with_lock;
> +
> + offset = (loff_t)(vm_pgoff << PAGE_SHIFT) + (bp_vaddr - vm_start);
> + uprobe = find_uprobe_rcu(vm_inode, offset);
> + if (!uprobe)
> + goto retry_with_lock;
> +
> + /* now double check that nothing about VMA changed */
> + if (vm_lock_seq != READ_ONCE(vma->vm_lock_seq))
> + goto retry_with_lock;
Since vma->vm_lock_seq is only ever updated at vma_start_write(), you're
checking that you're in or after the same modification cycle.
The point of sequence locks is to check whether you're *IN* a modification cycle
and retry if you are. You're now explicitly continuing if you're in a
modification.
You really need:

        seq++;
        wmb();

        ... do modification

        wmb();
        seq++;

vs

        do {
                s = READ_ONCE(seq) & ~1;
                rmb();

                ... read stuff

        } while (rmb(), seq != s);
The thing to note is that seq will be odd while inside a modification
and even outside; further, if the pre and post seq are both even but not
identical, you've crossed a modification and also need to retry.
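For comparison, the same pattern expressed with the kernel's generic seqcount primitives looks roughly like this (a sketch only; mm_lock_seq is not a seqcount_t today):

        seqcount_t seq;         /* initialized elsewhere with seqcount_init() */

        /* writer */
        write_seqcount_begin(&seq);             /* count becomes odd */
        /* ... do modification ... */
        write_seqcount_end(&seq);               /* count becomes even again */

        /* reader */
        unsigned int s;
        do {
                s = read_seqcount_begin(&seq);  /* waits out an odd (in-progress) count */
                /* ... read stuff ... */
        } while (read_seqcount_retry(&seq, s)); /* retry if a writer ran meanwhile */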
> +
> + /* happy case, we speculated successfully */
> + rcu_read_unlock();
> + return uprobe;
> +
> +retry_with_lock:
> + rcu_read_unlock();
> + uprobe = NULL;
> +#endif
> +
> mmap_read_lock(mm);
> vma = vma_lookup(mm, bp_vaddr);
> if (vma) {
> diff --git a/kernel/fork.c b/kernel/fork.c
> index cc760491f201..211a84ee92b4 100644
> --- a/kernel/fork.c
> +++ b/kernel/fork.c
> @@ -3160,7 +3160,7 @@ void __init proc_caches_init(void)
> NULL);
> files_cachep = kmem_cache_create("files_cache",
> sizeof(struct files_struct), 0,
> - SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_ACCOUNT,
> + SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_ACCOUNT|SLAB_TYPESAFE_BY_RCU,
> NULL);
> fs_cachep = kmem_cache_create("fs_cache",
> sizeof(struct fs_struct), 0,
^ permalink raw reply [flat|nested] 84+ messages in thread
* Re: [PATCH 00/10] perf/uprobe: Optimize uprobes
2024-08-03 8:53 ` Peter Zijlstra
@ 2024-08-04 23:22 ` Andrii Nakryiko
2024-08-06 4:08 ` Andrii Nakryiko
0 siblings, 1 reply; 84+ messages in thread
From: Andrii Nakryiko @ 2024-08-04 23:22 UTC (permalink / raw)
To: Peter Zijlstra, rostedt
Cc: Suren Baghdasaryan, Matthew Wilcox, Paul E. McKenney,
Masami Hiramatsu, mingo, andrii, linux-kernel, oleg, jolsa, clm,
bpf
On Sat, Aug 3, 2024 at 1:53 AM Peter Zijlstra <peterz@infradead.org> wrote:
>
> On Fri, Aug 02, 2024 at 10:47:15PM -0700, Andrii Nakryiko wrote:
>
> > Is there any reason why the approach below won't work?
>
> > diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
> > index 8be9e34e786a..e21b68a39f13 100644
> > --- a/kernel/events/uprobes.c
> > +++ b/kernel/events/uprobes.c
> > @@ -2251,6 +2251,52 @@ static struct uprobe
> > *find_active_uprobe_rcu(unsigned long bp_vaddr, int *is_swb
> > struct uprobe *uprobe = NULL;
> > struct vm_area_struct *vma;
> >
> > +#ifdef CONFIG_PER_VMA_LOCK
> > + vm_flags_t flags = VM_HUGETLB | VM_MAYEXEC | VM_MAYSHARE, vm_flags;
> > + struct file *vm_file;
> > + struct inode *vm_inode;
> > + unsigned long vm_pgoff, vm_start, vm_end;
> > + int vm_lock_seq;
> > + loff_t offset;
> > +
> > + rcu_read_lock();
> > +
> > + vma = vma_lookup(mm, bp_vaddr);
> > + if (!vma)
> > + goto retry_with_lock;
> > +
> > + vm_lock_seq = READ_ONCE(vma->vm_lock_seq);
>
> So vma->vm_lock_seq is only updated on vma_start_write()
yep, I've looked a bit more at the implementation now
>
> > +
> > + vm_file = READ_ONCE(vma->vm_file);
> > + vm_flags = READ_ONCE(vma->vm_flags);
> > + if (!vm_file || (vm_flags & flags) != VM_MAYEXEC)
> > + goto retry_with_lock;
> > +
> > + vm_inode = READ_ONCE(vm_file->f_inode);
> > + vm_pgoff = READ_ONCE(vma->vm_pgoff);
> > + vm_start = READ_ONCE(vma->vm_start);
> > + vm_end = READ_ONCE(vma->vm_end);
>
> None of those are written with WRITE_ONCE(), so this buys you nothing.
> Compiler could be updating them one byte at a time while you load some
> franken-update.
>
> Also, if you're in the middle of split_vma() you might not get a
> consistent set.
I used READ_ONCE() only to prevent the compiler from re-reading those
values. We assume those values are garbage anyways and double-check
everything, so lack of WRITE_ONCE doesn't matter. Same for
inconsistency if we are in the middle of split_vma().
We use the result of all this speculative calculation only if we find
a valid uprobe (which could be a false positive) *and* if we detect
that nothing about VMA changed (which is what I got wrong, but
honestly I was actually betting on others to help me get this right
anyways).
>
> > + if (bp_vaddr < vm_start || bp_vaddr >= vm_end)
> > + goto retry_with_lock;
> > +
> > + offset = (loff_t)(vm_pgoff << PAGE_SHIFT) + (bp_vaddr - vm_start);
> > + uprobe = find_uprobe_rcu(vm_inode, offset);
> > + if (!uprobe)
> > + goto retry_with_lock;
> > +
> > + /* now double check that nothing about VMA changed */
> > + if (vm_lock_seq != READ_ONCE(vma->vm_lock_seq))
> > + goto retry_with_lock;
>
> Since vma->vma_lock_seq is only ever updated at vma_start_write() you're
> checking you're in or after the same modification cycle.
>
> The point of sequence locks is to check you *IN* a modification cycle
> and retry if you are. You're now explicitly continuing if you're in a
> modification.
>
> You really need:
>
> seq++;
> wmb();
>
> ... do modification
>
> wmb();
> seq++;
>
> vs
>
> do {
> s = READ_ONCE(seq) & ~1;
> rmb();
>
> ... read stuff
>
> } while (rmb(), seq != s);
>
>
> The thing to note is that seq will be odd while inside a modification
> and even outside, further if the pre and post seq are both even but not
> identical, you've crossed a modification and also need to retry.
>
Ok, I don't think I got everything you have written above, sorry. But
let me explain what I think I need to do and please correct what I
(still) got wrong.
a) before starting speculation,
a.1) read and remember current->mm->mm_lock_seq (using
smp_load_acquire(), right?)
a.2) read vma->vm_lock_seq (using smp_load_acquire() I presume)
a.3) if vm_lock_seq is odd, we are already modifying VMA, so bail
out, try with proper mmap_lock
b) proceed with the inode pointer fetch and offset calculation as I've coded it
c) lookup uprobe by inode+offset, if failed -- bail out (if succeeded,
this could still be wrong)
d) re-read vma->vm_lock_seq, if it changed, we started modifying/have
already modified VMA, bail out
e) re-read mm->mm_lock_seq, if that changed -- presume VMA got
modified, bail out
At this point we should have a guarantee that nothing about mm
changed and that the VMA didn't start being modified during our speculative
calculation+uprobe lookup. So if we found a valid uprobe, it must be the
correct one that we need.
Is that enough? Any holes in the approach? And thanks for thoroughly
thinking about this, btw!
P.S. This is basically the last big blocker towards linear uprobes
scalability with the number of active CPUs. I have
uretprobe+SRCU+timeout implemented and it seems to work fine, will
post soon-ish.
P.P.S. Also, funny enough, below was another big scalability limiter
(and the last one) :) I'm not sure if we can just drop it or if I should
use a per-CPU counter, but with the below change and speculative VMA
lookup (however buggy, it works ok for benchmarking), I finally get
linear scaling of uprobe triggering throughput with the number of CPUs. We
are very close.
diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c
index f7443e996b1b..64c2bc316a08 100644
--- a/kernel/trace/trace_uprobe.c
+++ b/kernel/trace/trace_uprobe.c
@@ -1508,7 +1508,7 @@ static int uprobe_dispatcher(struct uprobe_consumer *con, struct pt_regs *regs)
int ret = 0;
tu = container_of(con, struct trace_uprobe, consumer);
- tu->nhit++;
+ //tu->nhit++;
udd.tu = tu;
udd.bp_addr = instruction_pointer(regs);
> > +
> > + /* happy case, we speculated successfully */
> > + rcu_read_unlock();
> > + return uprobe;
> > +
> > +retry_with_lock:
> > + rcu_read_unlock();
> > + uprobe = NULL;
> > +#endif
> > +
> > mmap_read_lock(mm);
> > vma = vma_lookup(mm, bp_vaddr);
> > if (vma) {
> > diff --git a/kernel/fork.c b/kernel/fork.c
> > index cc760491f201..211a84ee92b4 100644
> > --- a/kernel/fork.c
> > +++ b/kernel/fork.c
> > @@ -3160,7 +3160,7 @@ void __init proc_caches_init(void)
> > NULL);
> > files_cachep = kmem_cache_create("files_cache",
> > sizeof(struct files_struct), 0,
> > - SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_ACCOUNT,
> > + SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_ACCOUNT|SLAB_TYPESAFE_BY_RCU,
> > NULL);
> > fs_cachep = kmem_cache_create("fs_cache",
> > sizeof(struct fs_struct), 0,
^ permalink raw reply related [flat|nested] 84+ messages in thread
* Re: [PATCH 00/10] perf/uprobe: Optimize uprobes
2024-08-04 23:22 ` Andrii Nakryiko
@ 2024-08-06 4:08 ` Andrii Nakryiko
2024-08-06 14:50 ` Suren Baghdasaryan
2024-08-07 1:36 ` Suren Baghdasaryan
0 siblings, 2 replies; 84+ messages in thread
From: Andrii Nakryiko @ 2024-08-06 4:08 UTC (permalink / raw)
To: Peter Zijlstra, rostedt
Cc: Suren Baghdasaryan, Matthew Wilcox, Paul E. McKenney,
Masami Hiramatsu, mingo, andrii, linux-kernel, oleg, jolsa, clm,
bpf
On Sun, Aug 4, 2024 at 4:22 PM Andrii Nakryiko
<andrii.nakryiko@gmail.com> wrote:
>
> On Sat, Aug 3, 2024 at 1:53 AM Peter Zijlstra <peterz@infradead.org> wrote:
> >
> > On Fri, Aug 02, 2024 at 10:47:15PM -0700, Andrii Nakryiko wrote:
> >
> > > Is there any reason why the approach below won't work?
> >
> > > diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
> > > index 8be9e34e786a..e21b68a39f13 100644
> > > --- a/kernel/events/uprobes.c
> > > +++ b/kernel/events/uprobes.c
> > > @@ -2251,6 +2251,52 @@ static struct uprobe
> > > *find_active_uprobe_rcu(unsigned long bp_vaddr, int *is_swb
> > > struct uprobe *uprobe = NULL;
> > > struct vm_area_struct *vma;
> > >
> > > +#ifdef CONFIG_PER_VMA_LOCK
> > > + vm_flags_t flags = VM_HUGETLB | VM_MAYEXEC | VM_MAYSHARE, vm_flags;
> > > + struct file *vm_file;
> > > + struct inode *vm_inode;
> > > + unsigned long vm_pgoff, vm_start, vm_end;
> > > + int vm_lock_seq;
> > > + loff_t offset;
> > > +
> > > + rcu_read_lock();
> > > +
> > > + vma = vma_lookup(mm, bp_vaddr);
> > > + if (!vma)
> > > + goto retry_with_lock;
> > > +
> > > + vm_lock_seq = READ_ONCE(vma->vm_lock_seq);
> >
> > So vma->vm_lock_seq is only updated on vma_start_write()
>
> yep, I've looked a bit more at the implementation now
>
> >
> > > +
> > > + vm_file = READ_ONCE(vma->vm_file);
> > > + vm_flags = READ_ONCE(vma->vm_flags);
> > > + if (!vm_file || (vm_flags & flags) != VM_MAYEXEC)
> > > + goto retry_with_lock;
> > > +
> > > + vm_inode = READ_ONCE(vm_file->f_inode);
> > > + vm_pgoff = READ_ONCE(vma->vm_pgoff);
> > > + vm_start = READ_ONCE(vma->vm_start);
> > > + vm_end = READ_ONCE(vma->vm_end);
> >
> > None of those are written with WRITE_ONCE(), so this buys you nothing.
> > Compiler could be updating them one byte at a time while you load some
> > franken-update.
> >
> > Also, if you're in the middle of split_vma() you might not get a
> > consistent set.
>
> I used READ_ONCE() only to prevent the compiler from re-reading those
> values. We assume those values are garbage anyways and double-check
> everything, so lack of WRITE_ONCE doesn't matter. Same for
> inconsistency if we are in the middle of split_vma().
>
> We use the result of all this speculative calculation only if we find
> a valid uprobe (which could be a false positive) *and* if we detect
> that nothing about VMA changed (which is what I got wrong, but
> honestly I was actually betting on others to help me get this right
> anyways).
>
> >
> > > + if (bp_vaddr < vm_start || bp_vaddr >= vm_end)
> > > + goto retry_with_lock;
> > > +
> > > + offset = (loff_t)(vm_pgoff << PAGE_SHIFT) + (bp_vaddr - vm_start);
> > > + uprobe = find_uprobe_rcu(vm_inode, offset);
> > > + if (!uprobe)
> > > + goto retry_with_lock;
> > > +
> > > + /* now double check that nothing about VMA changed */
> > > + if (vm_lock_seq != READ_ONCE(vma->vm_lock_seq))
> > > + goto retry_with_lock;
> >
> > Since vma->vma_lock_seq is only ever updated at vma_start_write() you're
> > checking you're in or after the same modification cycle.
> >
> > The point of sequence locks is to check you *IN* a modification cycle
> > and retry if you are. You're now explicitly continuing if you're in a
> > modification.
> >
> > You really need:
> >
> > seq++;
> > wmb();
> >
> > ... do modification
> >
> > wmb();
> > seq++;
> >
> > vs
> >
> > do {
> > s = READ_ONCE(seq) & ~1;
> > rmb();
> >
> > ... read stuff
> >
> > } while (rmb(), seq != s);
> >
> >
> > The thing to note is that seq will be odd while inside a modification
> > and even outside, further if the pre and post seq are both even but not
> > identical, you've crossed a modification and also need to retry.
> >
>
> Ok, I don't think I got everything you have written above, sorry. But
> let me explain what I think I need to do and please correct what I
> (still) got wrong.
>
> a) before starting speculation,
> a.1) read and remember current->mm->mm_lock_seq (using
> smp_load_acquire(), right?)
> a.2) read vma->vm_lock_seq (using smp_load_acquire() I presume)
> a.3) if vm_lock_seq is odd, we are already modifying VMA, so bail
> out, try with proper mmap_lock
> b) proceed with the inode pointer fetch and offset calculation as I've coded it
> c) lookup uprobe by inode+offset, if failed -- bail out (if succeeded,
> this could still be wrong)
> d) re-read vma->vm_lock_seq, if it changed, we started modifying/have
> already modified VMA, bail out
> e) re-read mm->mm_lock_seq, if that changed -- presume VMA got
> modified, bail out
>
> At this point we should have a guarantee that nothing about mm
> changed, nor that VMA started being modified during our speculative
> calculation+uprobe lookup. So if we found a valid uprobe, it must be a
> correct one that we need.
>
> Is that enough? Any holes in the approach? And thanks for thoroughly
> thinking about this, btw!
Ok, with slight modifications to the details of the above (e.g., there
is actually no "odd means VMA is being modified" thing with
vm_lock_seq), I ended up with the implementation below. Basically we
validate that mm->mm_lock_seq didn't change and that vm_lock_seq !=
mm_lock_seq (which otherwise would mean "VMA is being modified").
There is a possibility that vm_lock_seq == mm_lock_seq just by
accident, which is not a correctness problem; we'll just fall back to the
locked implementation until something about the VMA or mm_struct itself
changes. Which is fine, and if mm folks ever change this locking
schema, this might go away.
If this seems on the right track, I think we can just move
mm_start_vma_speculation()/mm_end_vma_speculation() into
include/linux/mm.h.
And after thinking a bit more about READ_ONCE() usage, I changed them
to data_race() to not trigger KCSAN warnings. Initially I kept
READ_ONCE() only around vma->vm_file access, but given we never change
it until vma is freed and reused (which would be prevented by
guard(rcu)), I dropped READ_ONCE() and only added data_race(). And
even data_race() is probably not necessary.
Anyways, please see the patch below. Would be nice if mm folks
(Suren?) could confirm that this is not broken.
Author: Andrii Nakryiko <andrii@kernel.org>
Date: Fri Aug 2 22:16:40 2024 -0700
uprobes: add speculative lockless VMA to inode resolution
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index 3de311c56d47..bee7a929ff02 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -2244,6 +2244,70 @@ static int is_trap_at_addr(struct mm_struct *mm, unsigned long vaddr)
return is_trap_insn(&opcode);
}
+#ifdef CONFIG_PER_VMA_LOCK
+static inline void mm_start_vma_speculation(struct mm_struct *mm, int *mm_lock_seq)
+{
+ *mm_lock_seq = smp_load_acquire(&mm->mm_lock_seq);
+}
+
+/* returns true if speculation was safe (no mm and vma modification happened) */
+static inline bool mm_end_vma_speculation(struct vm_area_struct *vma, int mm_lock_seq)
+{
+ int mm_seq, vma_seq;
+
+ mm_seq = smp_load_acquire(&vma->vm_mm->mm_lock_seq);
+ vma_seq = READ_ONCE(vma->vm_lock_seq);
+
+ return mm_seq == mm_lock_seq && vma_seq != mm_seq;
+}
+
+static struct uprobe *find_active_uprobe_speculative(unsigned long bp_vaddr)
+{
+ const vm_flags_t flags = VM_HUGETLB | VM_MAYEXEC | VM_MAYSHARE;
+ struct mm_struct *mm = current->mm;
+ struct uprobe *uprobe;
+ struct vm_area_struct *vma;
+ struct file *vm_file;
+ struct inode *vm_inode;
+ unsigned long vm_pgoff, vm_start;
+ int mm_lock_seq;
+ loff_t offset;
+
+ guard(rcu)();
+
+ mm_start_vma_speculation(mm, &mm_lock_seq);
+
+ vma = vma_lookup(mm, bp_vaddr);
+ if (!vma)
+ return NULL;
+
+ vm_file = data_race(vma->vm_file);
+ if (!vm_file || (vma->vm_flags & flags) != VM_MAYEXEC)
+ return NULL;
+
+ vm_inode = data_race(vm_file->f_inode);
+ vm_pgoff = data_race(vma->vm_pgoff);
+ vm_start = data_race(vma->vm_start);
+
+ offset = (loff_t)(vm_pgoff << PAGE_SHIFT) + (bp_vaddr - vm_start);
+ uprobe = find_uprobe_rcu(vm_inode, offset);
+ if (!uprobe)
+ return NULL;
+
+ /* now double check that nothing about MM and VMA changed */
+ if (!mm_end_vma_speculation(vma, mm_lock_seq))
+ return NULL;
+
+ /* happy case, we speculated successfully */
+ return uprobe;
+}
+#else /* !CONFIG_PER_VMA_LOCK */
+static struct uprobe *find_active_uprobe_speculative(unsigned long bp_vaddr)
+{
+ return NULL;
+}
+#endif /* CONFIG_PER_VMA_LOCK */
+
/* assumes being inside RCU protected region */
static struct uprobe *find_active_uprobe_rcu(unsigned long bp_vaddr, int *is_swbp)
{
@@ -2251,6 +2315,10 @@ static struct uprobe *find_active_uprobe_rcu(unsigned long bp_vaddr, int *is_swbp)
struct uprobe *uprobe = NULL;
struct vm_area_struct *vma;
+ uprobe = find_active_uprobe_speculative(bp_vaddr);
+ if (uprobe)
+ return uprobe;
+
mmap_read_lock(mm);
vma = vma_lookup(mm, bp_vaddr);
if (vma) {
diff --git a/kernel/fork.c b/kernel/fork.c
index cc760491f201..211a84ee92b4 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -3160,7 +3160,7 @@ void __init proc_caches_init(void)
NULL);
files_cachep = kmem_cache_create("files_cache",
sizeof(struct files_struct), 0,
- SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_ACCOUNT,
+ SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_ACCOUNT|SLAB_TYPESAFE_BY_RCU,
NULL);
fs_cachep = kmem_cache_create("fs_cache",
sizeof(struct fs_struct), 0,
>
> P.S. This is basically the last big blocker towards linear uprobes
> scalability with the number of active CPUs. I have
> uretprobe+SRCU+timeout implemented and it seems to work fine, will
> post soon-ish.
>
> P.P.S Also, funny enough, below was another big scalability limiter
> (and the last one) :) I'm not sure if we can just drop it, or I should
> use per-CPU counter, but with the below change and speculative VMA
> lookup (however buggy, works ok for benchmarking), I finally get
> linear scaling of uprobe triggering throughput with number of CPUs. We
> are very close.
>
> diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c
> index f7443e996b1b..64c2bc316a08 100644
> --- a/kernel/trace/trace_uprobe.c
> +++ b/kernel/trace/trace_uprobe.c
> @@ -1508,7 +1508,7 @@ static int uprobe_dispatcher(struct
> uprobe_consumer *con, struct pt_regs *regs)
> int ret = 0;
>
> tu = container_of(con, struct trace_uprobe, consumer);
> - tu->nhit++;
> + //tu->nhit++;
>
> udd.tu = tu;
> udd.bp_addr = instruction_pointer(regs);
>
>
> > > +
> > > + /* happy case, we speculated successfully */
> > > + rcu_read_unlock();
> > > + return uprobe;
> > > +
> > > +retry_with_lock:
> > > + rcu_read_unlock();
> > > + uprobe = NULL;
> > > +#endif
> > > +
> > > mmap_read_lock(mm);
> > > vma = vma_lookup(mm, bp_vaddr);
> > > if (vma) {
> > > diff --git a/kernel/fork.c b/kernel/fork.c
> > > index cc760491f201..211a84ee92b4 100644
> > > --- a/kernel/fork.c
> > > +++ b/kernel/fork.c
> > > @@ -3160,7 +3160,7 @@ void __init proc_caches_init(void)
> > > NULL);
> > > files_cachep = kmem_cache_create("files_cache",
> > > sizeof(struct files_struct), 0,
> > > - SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_ACCOUNT,
> > > + SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_ACCOUNT|SLAB_TYPESAFE_BY_RCU,
> > > NULL);
> > > fs_cachep = kmem_cache_create("fs_cache",
> > > sizeof(struct fs_struct), 0,
^ permalink raw reply related [flat|nested] 84+ messages in thread
* Re: [PATCH 00/10] perf/uprobe: Optimize uprobes
2024-08-06 4:08 ` Andrii Nakryiko
@ 2024-08-06 14:50 ` Suren Baghdasaryan
2024-08-06 17:40 ` Andrii Nakryiko
2024-08-07 1:36 ` Suren Baghdasaryan
1 sibling, 1 reply; 84+ messages in thread
From: Suren Baghdasaryan @ 2024-08-06 14:50 UTC (permalink / raw)
To: Andrii Nakryiko
Cc: Peter Zijlstra, rostedt, Matthew Wilcox, Paul E. McKenney,
Masami Hiramatsu, mingo, andrii, linux-kernel, oleg, jolsa, clm,
bpf
On Mon, Aug 5, 2024 at 9:08 PM Andrii Nakryiko
<andrii.nakryiko@gmail.com> wrote:
>
> On Sun, Aug 4, 2024 at 4:22 PM Andrii Nakryiko
> <andrii.nakryiko@gmail.com> wrote:
> >
> > On Sat, Aug 3, 2024 at 1:53 AM Peter Zijlstra <peterz@infradead.org> wrote:
> > >
> > > On Fri, Aug 02, 2024 at 10:47:15PM -0700, Andrii Nakryiko wrote:
> > >
> > > > Is there any reason why the approach below won't work?
> > >
> > > > diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
> > > > index 8be9e34e786a..e21b68a39f13 100644
> > > > --- a/kernel/events/uprobes.c
> > > > +++ b/kernel/events/uprobes.c
> > > > @@ -2251,6 +2251,52 @@ static struct uprobe
> > > > *find_active_uprobe_rcu(unsigned long bp_vaddr, int *is_swb
> > > > struct uprobe *uprobe = NULL;
> > > > struct vm_area_struct *vma;
> > > >
> > > > +#ifdef CONFIG_PER_VMA_LOCK
> > > > + vm_flags_t flags = VM_HUGETLB | VM_MAYEXEC | VM_MAYSHARE, vm_flags;
> > > > + struct file *vm_file;
> > > > + struct inode *vm_inode;
> > > > + unsigned long vm_pgoff, vm_start, vm_end;
> > > > + int vm_lock_seq;
> > > > + loff_t offset;
> > > > +
> > > > + rcu_read_lock();
> > > > +
> > > > + vma = vma_lookup(mm, bp_vaddr);
> > > > + if (!vma)
> > > > + goto retry_with_lock;
> > > > +
> > > > + vm_lock_seq = READ_ONCE(vma->vm_lock_seq);
> > >
> > > So vma->vm_lock_seq is only updated on vma_start_write()
> >
> > yep, I've looked a bit more at the implementation now
> >
> > >
> > > > +
> > > > + vm_file = READ_ONCE(vma->vm_file);
> > > > + vm_flags = READ_ONCE(vma->vm_flags);
> > > > + if (!vm_file || (vm_flags & flags) != VM_MAYEXEC)
> > > > + goto retry_with_lock;
> > > > +
> > > > + vm_inode = READ_ONCE(vm_file->f_inode);
> > > > + vm_pgoff = READ_ONCE(vma->vm_pgoff);
> > > > + vm_start = READ_ONCE(vma->vm_start);
> > > > + vm_end = READ_ONCE(vma->vm_end);
> > >
> > > None of those are written with WRITE_ONCE(), so this buys you nothing.
> > > Compiler could be updating them one byte at a time while you load some
> > > franken-update.
> > >
> > > Also, if you're in the middle of split_vma() you might not get a
> > > consistent set.
> >
> > I used READ_ONCE() only to prevent the compiler from re-reading those
> > values. We assume those values are garbage anyways and double-check
> > everything, so lack of WRITE_ONCE doesn't matter. Same for
> > inconsistency if we are in the middle of split_vma().
> >
> > We use the result of all this speculative calculation only if we find
> > a valid uprobe (which could be a false positive) *and* if we detect
> > that nothing about VMA changed (which is what I got wrong, but
> > honestly I was actually betting on others to help me get this right
> > anyways).
> >
> > >
> > > > + if (bp_vaddr < vm_start || bp_vaddr >= vm_end)
> > > > + goto retry_with_lock;
> > > > +
> > > > + offset = (loff_t)(vm_pgoff << PAGE_SHIFT) + (bp_vaddr - vm_start);
> > > > + uprobe = find_uprobe_rcu(vm_inode, offset);
> > > > + if (!uprobe)
> > > > + goto retry_with_lock;
> > > > +
> > > > + /* now double check that nothing about VMA changed */
> > > > + if (vm_lock_seq != READ_ONCE(vma->vm_lock_seq))
> > > > + goto retry_with_lock;
> > >
> > > Since vma->vma_lock_seq is only ever updated at vma_start_write() you're
> > > checking you're in or after the same modification cycle.
> > >
> > > The point of sequence locks is to check you *IN* a modification cycle
> > > and retry if you are. You're now explicitly continuing if you're in a
> > > modification.
> > >
> > > You really need:
> > >
> > > seq++;
> > > wmb();
> > >
> > > ... do modification
> > >
> > > wmb();
> > > seq++;
> > >
> > > vs
> > >
> > > do {
> > > s = READ_ONCE(seq) & ~1;
> > > rmb();
> > >
> > > ... read stuff
> > >
> > > } while (rmb(), seq != s);
> > >
> > >
> > > The thing to note is that seq will be odd while inside a modification
> > > and even outside, further if the pre and post seq are both even but not
> > > identical, you've crossed a modification and also need to retry.
> > >
> >
> > Ok, I don't think I got everything you have written above, sorry. But
> > let me explain what I think I need to do and please correct what I
> > (still) got wrong.
> >
> > a) before starting speculation,
> > a.1) read and remember current->mm->mm_lock_seq (using
> > smp_load_acquire(), right?)
> > a.2) read vma->vm_lock_seq (using smp_load_acquire() I presume)
> > a.3) if vm_lock_seq is odd, we are already modifying VMA, so bail
> > out, try with proper mmap_lock
> > b) proceed with the inode pointer fetch and offset calculation as I've coded it
> > c) lookup uprobe by inode+offset, if failed -- bail out (if succeeded,
> > this could still be wrong)
> > d) re-read vma->vm_lock_seq, if it changed, we started modifying/have
> > already modified VMA, bail out
> > e) re-read mm->mm_lock_seq, if that changed -- presume VMA got
> > modified, bail out
> >
> > At this point we should have a guarantee that nothing about mm
> > changed, nor that VMA started being modified during our speculative
> > calculation+uprobe lookup. So if we found a valid uprobe, it must be a
> > correct one that we need.
> >
> > Is that enough? Any holes in the approach? And thanks for thoroughly
> > thinking about this, btw!
>
> Ok, with slight modifications to the details of the above (e.g., there
> is actually no "odd means VMA is being modified" thing with
> vm_lock_seq), I ended up with the implementation below. Basically we
> validate that mm->mm_lock_seq didn't change and that vm_lock_seq !=
> mm_lock_seq (which otherwise would mean "VMA is being modified").
> There is a possibility that vm_lock_seq == mm_lock_seq just by
> accident, which is not a correctness problem, we'll just fallback to
> locked implementation until something about VMA or mm_struct itself
> changes. Which is fine, and if mm folks ever change this locking
> schema, this might go away.
>
> If this seems on the right track, I think we can just move
> mm_start_vma_specuation()/mm_end_vma_speculation() into
> include/linux/mm.h.
>
> And after thinking a bit more about READ_ONCE() usage, I changed them
> to data_race() to not trigger KCSAN warnings. Initially I kept
> READ_ONCE() only around vma->vm_file access, but given we never change
> it until vma is freed and reused (which would be prevented by
> guard(rcu)), I dropped READ_ONCE() and only added data_race(). And
> even data_race() is probably not necessary.
>
> Anyways, please see the patch below. Would be nice if mm folks
> (Suren?) could confirm that this is not broken.
Hi Andrii,
Sorry, I'm catching up on my emails and will get back to this to read
through the discussion later today.
One obvious thing that is problematic in this whole schema is the fact
that vm_file is not RCU-safe as I mentioned before. See below.
>
>
>
> Author: Andrii Nakryiko <andrii@kernel.org>
> Date: Fri Aug 2 22:16:40 2024 -0700
>
> uprobes: add speculative lockless VMA to inode resolution
>
> Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
>
> diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
> index 3de311c56d47..bee7a929ff02 100644
> --- a/kernel/events/uprobes.c
> +++ b/kernel/events/uprobes.c
> @@ -2244,6 +2244,70 @@ static int is_trap_at_addr(struct mm_struct
> *mm, unsigned long vaddr)
> return is_trap_insn(&opcode);
> }
>
> +#ifdef CONFIG_PER_VMA_LOCK
> +static inline void mm_start_vma_speculation(struct mm_struct *mm, int
> *mm_lock_seq)
> +{
> + *mm_lock_seq = smp_load_acquire(&mm->mm_lock_seq);
> +}
> +
> +/* returns true if speculation was safe (no mm and vma modification
> happened) */
> +static inline bool mm_end_vma_speculation(struct vm_area_struct *vma,
> int mm_lock_seq)
> +{
> + int mm_seq, vma_seq;
> +
> + mm_seq = smp_load_acquire(&vma->vm_mm->mm_lock_seq);
> + vma_seq = READ_ONCE(vma->vm_lock_seq);
> +
> + return mm_seq == mm_lock_seq && vma_seq != mm_seq;
> +}
> +
> +static struct uprobe *find_active_uprobe_speculative(unsigned long bp_vaddr)
> +{
> + const vm_flags_t flags = VM_HUGETLB | VM_MAYEXEC | VM_MAYSHARE;
> + struct mm_struct *mm = current->mm;
> + struct uprobe *uprobe;
> + struct vm_area_struct *vma;
> + struct file *vm_file;
> + struct inode *vm_inode;
> + unsigned long vm_pgoff, vm_start;
> + int mm_lock_seq;
> + loff_t offset;
> +
> + guard(rcu)();
> +
> + mm_start_vma_speculation(mm, &mm_lock_seq);
> +
> + vma = vma_lookup(mm, bp_vaddr);
> + if (!vma)
> + return NULL;
> +
> + vm_file = data_race(vma->vm_file);
Consider what happens if remove_vma() is racing with this code and at
this point it calls fput(vma->vm_file). Your vma will be valid (it's
freed only after an RCU grace period), but vma->vm_file can be freed from
under you. For this to work, vma->vm_file should be made RCU-safe.
Thanks,
Suren.
> + if (!vm_file || (vma->vm_flags & flags) != VM_MAYEXEC)
> + return NULL;
> +
> + vm_inode = data_race(vm_file->f_inode);
> + vm_pgoff = data_race(vma->vm_pgoff);
> + vm_start = data_race(vma->vm_start);
> +
> + offset = (loff_t)(vm_pgoff << PAGE_SHIFT) + (bp_vaddr - vm_start);
> + uprobe = find_uprobe_rcu(vm_inode, offset);
> + if (!uprobe)
> + return NULL;
> +
> + /* now double check that nothing about MM and VMA changed */
> + if (!mm_end_vma_speculation(vma, mm_lock_seq))
> + return NULL;
> +
> + /* happy case, we speculated successfully */
> + return uprobe;
> +}
> +#else /* !CONFIG_PER_VMA_LOCK */
> +static struct uprobe *find_active_uprobe_speculative(unsigned long bp_vaddr)
> +{
> + return NULL;
> +}
> +#endif /* CONFIG_PER_VMA_LOCK */
> +
> /* assumes being inside RCU protected region */
> static struct uprobe *find_active_uprobe_rcu(unsigned long bp_vaddr,
> int *is_swbp)
> {
> @@ -2251,6 +2315,10 @@ static struct uprobe
> *find_active_uprobe_rcu(unsigned long bp_vaddr, int *is_swb
> struct uprobe *uprobe = NULL;
> struct vm_area_struct *vma;
>
> + uprobe = find_active_uprobe_speculative(bp_vaddr);
> + if (uprobe)
> + return uprobe;
> +
> mmap_read_lock(mm);
> vma = vma_lookup(mm, bp_vaddr);
> if (vma) {
> diff --git a/kernel/fork.c b/kernel/fork.c
> index cc760491f201..211a84ee92b4 100644
> --- a/kernel/fork.c
> +++ b/kernel/fork.c
> @@ -3160,7 +3160,7 @@ void __init proc_caches_init(void)
> NULL);
> files_cachep = kmem_cache_create("files_cache",
> sizeof(struct files_struct), 0,
> - SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_ACCOUNT,
> +
> SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_ACCOUNT|SLAB_TYPESAFE_BY_RCU,
> NULL);
> fs_cachep = kmem_cache_create("fs_cache",
> sizeof(struct fs_struct), 0,
>
>
> >
> > P.S. This is basically the last big blocker towards linear uprobes
> > scalability with the number of active CPUs. I have
> > uretprobe+SRCU+timeout implemented and it seems to work fine, will
> > post soon-ish.
> >
> > P.P.S Also, funny enough, below was another big scalability limiter
> > (and the last one) :) I'm not sure if we can just drop it, or I should
> > use per-CPU counter, but with the below change and speculative VMA
> > lookup (however buggy, works ok for benchmarking), I finally get
> > linear scaling of uprobe triggering throughput with number of CPUs. We
> > are very close.
> >
> > diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c
> > index f7443e996b1b..64c2bc316a08 100644
> > --- a/kernel/trace/trace_uprobe.c
> > +++ b/kernel/trace/trace_uprobe.c
> > @@ -1508,7 +1508,7 @@ static int uprobe_dispatcher(struct
> > uprobe_consumer *con, struct pt_regs *regs)
> > int ret = 0;
> >
> > tu = container_of(con, struct trace_uprobe, consumer);
> > - tu->nhit++;
> > + //tu->nhit++;
> >
> > udd.tu = tu;
> > udd.bp_addr = instruction_pointer(regs);
> >
> >
> > > > +
> > > > + /* happy case, we speculated successfully */
> > > > + rcu_read_unlock();
> > > > + return uprobe;
> > > > +
> > > > +retry_with_lock:
> > > > + rcu_read_unlock();
> > > > + uprobe = NULL;
> > > > +#endif
> > > > +
> > > > mmap_read_lock(mm);
> > > > vma = vma_lookup(mm, bp_vaddr);
> > > > if (vma) {
> > > > diff --git a/kernel/fork.c b/kernel/fork.c
> > > > index cc760491f201..211a84ee92b4 100644
> > > > --- a/kernel/fork.c
> > > > +++ b/kernel/fork.c
> > > > @@ -3160,7 +3160,7 @@ void __init proc_caches_init(void)
> > > > NULL);
> > > > files_cachep = kmem_cache_create("files_cache",
> > > > sizeof(struct files_struct), 0,
> > > > - SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_ACCOUNT,
> > > > + SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_ACCOUNT|SLAB_TYPESAFE_BY_RCU,
> > > > NULL);
> > > > fs_cachep = kmem_cache_create("fs_cache",
> > > > sizeof(struct fs_struct), 0,
^ permalink raw reply [flat|nested] 84+ messages in thread
* Re: [PATCH 00/10] perf/uprobe: Optimize uprobes
2024-08-06 14:50 ` Suren Baghdasaryan
@ 2024-08-06 17:40 ` Andrii Nakryiko
2024-08-06 17:44 ` Suren Baghdasaryan
0 siblings, 1 reply; 84+ messages in thread
From: Andrii Nakryiko @ 2024-08-06 17:40 UTC (permalink / raw)
To: Suren Baghdasaryan
Cc: Peter Zijlstra, rostedt, Matthew Wilcox, Paul E. McKenney,
Masami Hiramatsu, mingo, andrii, linux-kernel, oleg, jolsa, clm,
bpf
On Tue, Aug 6, 2024 at 7:51 AM Suren Baghdasaryan <surenb@google.com> wrote:
>
> On Mon, Aug 5, 2024 at 9:08 PM Andrii Nakryiko
> <andrii.nakryiko@gmail.com> wrote:
> >
> > On Sun, Aug 4, 2024 at 4:22 PM Andrii Nakryiko
> > <andrii.nakryiko@gmail.com> wrote:
> > >
> > > On Sat, Aug 3, 2024 at 1:53 AM Peter Zijlstra <peterz@infradead.org> wrote:
> > > >
> > > > On Fri, Aug 02, 2024 at 10:47:15PM -0700, Andrii Nakryiko wrote:
> > > >
> > > > > Is there any reason why the approach below won't work?
> > > >
> > > > > diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
> > > > > index 8be9e34e786a..e21b68a39f13 100644
> > > > > --- a/kernel/events/uprobes.c
> > > > > +++ b/kernel/events/uprobes.c
> > > > > @@ -2251,6 +2251,52 @@ static struct uprobe
> > > > > *find_active_uprobe_rcu(unsigned long bp_vaddr, int *is_swb
> > > > > struct uprobe *uprobe = NULL;
> > > > > struct vm_area_struct *vma;
> > > > >
> > > > > +#ifdef CONFIG_PER_VMA_LOCK
> > > > > + vm_flags_t flags = VM_HUGETLB | VM_MAYEXEC | VM_MAYSHARE, vm_flags;
> > > > > + struct file *vm_file;
> > > > > + struct inode *vm_inode;
> > > > > + unsigned long vm_pgoff, vm_start, vm_end;
> > > > > + int vm_lock_seq;
> > > > > + loff_t offset;
> > > > > +
> > > > > + rcu_read_lock();
> > > > > +
> > > > > + vma = vma_lookup(mm, bp_vaddr);
> > > > > + if (!vma)
> > > > > + goto retry_with_lock;
> > > > > +
> > > > > + vm_lock_seq = READ_ONCE(vma->vm_lock_seq);
> > > >
> > > > So vma->vm_lock_seq is only updated on vma_start_write()
> > >
> > > yep, I've looked a bit more at the implementation now
> > >
> > > >
> > > > > +
> > > > > + vm_file = READ_ONCE(vma->vm_file);
> > > > > + vm_flags = READ_ONCE(vma->vm_flags);
> > > > > + if (!vm_file || (vm_flags & flags) != VM_MAYEXEC)
> > > > > + goto retry_with_lock;
> > > > > +
> > > > > + vm_inode = READ_ONCE(vm_file->f_inode);
> > > > > + vm_pgoff = READ_ONCE(vma->vm_pgoff);
> > > > > + vm_start = READ_ONCE(vma->vm_start);
> > > > > + vm_end = READ_ONCE(vma->vm_end);
> > > >
> > > > None of those are written with WRITE_ONCE(), so this buys you nothing.
> > > > Compiler could be updating them one byte at a time while you load some
> > > > franken-update.
> > > >
> > > > Also, if you're in the middle of split_vma() you might not get a
> > > > consistent set.
> > >
> > > I used READ_ONCE() only to prevent the compiler from re-reading those
> > > values. We assume those values are garbage anyways and double-check
> > > everything, so lack of WRITE_ONCE doesn't matter. Same for
> > > inconsistency if we are in the middle of split_vma().
> > >
> > > We use the result of all this speculative calculation only if we find
> > > a valid uprobe (which could be a false positive) *and* if we detect
> > > that nothing about VMA changed (which is what I got wrong, but
> > > honestly I was actually betting on others to help me get this right
> > > anyways).
> > >
> > > >
> > > > > + if (bp_vaddr < vm_start || bp_vaddr >= vm_end)
> > > > > + goto retry_with_lock;
> > > > > +
> > > > > + offset = (loff_t)(vm_pgoff << PAGE_SHIFT) + (bp_vaddr - vm_start);
> > > > > + uprobe = find_uprobe_rcu(vm_inode, offset);
> > > > > + if (!uprobe)
> > > > > + goto retry_with_lock;
> > > > > +
> > > > > + /* now double check that nothing about VMA changed */
> > > > > + if (vm_lock_seq != READ_ONCE(vma->vm_lock_seq))
> > > > > + goto retry_with_lock;
> > > >
> > > > Since vma->vma_lock_seq is only ever updated at vma_start_write() you're
> > > > checking you're in or after the same modification cycle.
> > > >
> > > > The point of sequence locks is to check you *IN* a modification cycle
> > > > and retry if you are. You're now explicitly continuing if you're in a
> > > > modification.
> > > >
> > > > You really need:
> > > >
> > > > seq++;
> > > > wmb();
> > > >
> > > > ... do modification
> > > >
> > > > wmb();
> > > > seq++;
> > > >
> > > > vs
> > > >
> > > > do {
> > > > s = READ_ONCE(seq) & ~1;
> > > > rmb();
> > > >
> > > > ... read stuff
> > > >
> > > > } while (rmb(), seq != s);
> > > >
> > > >
> > > > The thing to note is that seq will be odd while inside a modification
> > > > and even outside, further if the pre and post seq are both even but not
> > > > identical, you've crossed a modification and also need to retry.
> > > >
> > >
> > > Ok, I don't think I got everything you have written above, sorry. But
> > > let me explain what I think I need to do and please correct what I
> > > (still) got wrong.
> > >
> > > a) before starting speculation,
> > > a.1) read and remember current->mm->mm_lock_seq (using
> > > smp_load_acquire(), right?)
> > > a.2) read vma->vm_lock_seq (using smp_load_acquire() I presume)
> > > a.3) if vm_lock_seq is odd, we are already modifying VMA, so bail
> > > out, try with proper mmap_lock
> > > b) proceed with the inode pointer fetch and offset calculation as I've coded it
> > > c) lookup uprobe by inode+offset, if failed -- bail out (if succeeded,
> > > this could still be wrong)
> > > d) re-read vma->vm_lock_seq, if it changed, we started modifying/have
> > > already modified VMA, bail out
> > > e) re-read mm->mm_lock_seq, if that changed -- presume VMA got
> > > modified, bail out
> > >
> > > At this point we should have a guarantee that nothing about mm
> > > changed, nor that VMA started being modified during our speculative
> > > calculation+uprobe lookup. So if we found a valid uprobe, it must be a
> > > correct one that we need.
> > >
> > > Is that enough? Any holes in the approach? And thanks for thoroughly
> > > thinking about this, btw!
> >
> > Ok, with slight modifications to the details of the above (e.g., there
> > is actually no "odd means VMA is being modified" thing with
> > vm_lock_seq), I ended up with the implementation below. Basically we
> > validate that mm->mm_lock_seq didn't change and that vm_lock_seq !=
> > mm_lock_seq (which otherwise would mean "VMA is being modified").
> > There is a possibility that vm_lock_seq == mm_lock_seq just by
> > accident, which is not a correctness problem, we'll just fallback to
> > locked implementation until something about VMA or mm_struct itself
> > changes. Which is fine, and if mm folks ever change this locking
> > schema, this might go away.
> >
> > If this seems on the right track, I think we can just move
> > mm_start_vma_specuation()/mm_end_vma_speculation() into
> > include/linux/mm.h.
> >
> > And after thinking a bit more about READ_ONCE() usage, I changed them
> > to data_race() to not trigger KCSAN warnings. Initially I kept
> > READ_ONCE() only around vma->vm_file access, but given we never change
> > it until vma is freed and reused (which would be prevented by
> > guard(rcu)), I dropped READ_ONCE() and only added data_race(). And
> > even data_race() is probably not necessary.
> >
> > Anyways, please see the patch below. Would be nice if mm folks
> > (Suren?) could confirm that this is not broken.
>
> Hi Andrii,
> Sorry, I'm catching up on my emails and will get back to this to read
> through the discussion later today.
Great, thank you! I appreciate you taking a thorough look!
> One obvious thing that is problematic in this whole schema is the fact
> that vm_file is not RCU-safe as I mentioned before. See below.
>
> >
> >
> >
> > Author: Andrii Nakryiko <andrii@kernel.org>
> > Date: Fri Aug 2 22:16:40 2024 -0700
> >
> > uprobes: add speculative lockless VMA to inode resolution
> >
> > Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
> >
> > diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
> > index 3de311c56d47..bee7a929ff02 100644
> > --- a/kernel/events/uprobes.c
> > +++ b/kernel/events/uprobes.c
> > @@ -2244,6 +2244,70 @@ static int is_trap_at_addr(struct mm_struct
> > *mm, unsigned long vaddr)
> > return is_trap_insn(&opcode);
> > }
> >
> > +#ifdef CONFIG_PER_VMA_LOCK
> > +static inline void mm_start_vma_speculation(struct mm_struct *mm, int
> > *mm_lock_seq)
> > +{
> > + *mm_lock_seq = smp_load_acquire(&mm->mm_lock_seq);
> > +}
> > +
> > +/* returns true if speculation was safe (no mm and vma modification
> > happened) */
> > +static inline bool mm_end_vma_speculation(struct vm_area_struct *vma,
> > int mm_lock_seq)
> > +{
> > + int mm_seq, vma_seq;
> > +
> > + mm_seq = smp_load_acquire(&vma->vm_mm->mm_lock_seq);
> > + vma_seq = READ_ONCE(vma->vm_lock_seq);
> > +
> > + return mm_seq == mm_lock_seq && vma_seq != mm_seq;
> > +}
> > +
> > +static struct uprobe *find_active_uprobe_speculative(unsigned long bp_vaddr)
> > +{
> > + const vm_flags_t flags = VM_HUGETLB | VM_MAYEXEC | VM_MAYSHARE;
> > + struct mm_struct *mm = current->mm;
> > + struct uprobe *uprobe;
> > + struct vm_area_struct *vma;
> > + struct file *vm_file;
> > + struct inode *vm_inode;
> > + unsigned long vm_pgoff, vm_start;
> > + int mm_lock_seq;
> > + loff_t offset;
> > +
> > + guard(rcu)();
> > +
> > + mm_start_vma_speculation(mm, &mm_lock_seq);
> > +
> > + vma = vma_lookup(mm, bp_vaddr);
> > + if (!vma)
> > + return NULL;
> > +
> > + vm_file = data_race(vma->vm_file);
>
> Consider what happens if remove_vma() is racing with this code and at
> this point it calls fput(vma->vm_file). Your vma will be valid (it's
> freed after RCU grace period) but vma->vm_file can be freed from under
> you. For this to work vma->vm_file should be made RCU-safe.
Note that I'm adding SLAB_TYPESAFE_BY_RCU to files_cachep, as
suggested by Matthew. I thought that would be enough to ensure that
vm_file's memory won't be freed until after the RCU grace period. It's
ok if the file struct itself is reused for another file (we should
detect that, since any vma/mm change is caught by the sequence number
checks).
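To make the assumption explicit, this is the access pattern I have in
mind -- an illustrative sketch with made-up names (struct blob, gen),
not the actual uprobes code. With SLAB_TYPESAFE_BY_RCU the object's
memory stays valid for RCU readers, but the object itself may be
reused right away, so anything read from it is tentative until it is
revalidated:

#include <linux/slab.h>
#include <linux/rcupdate.h>

struct blob {
	unsigned long gen;	/* hypothetical token bumped on reuse */
	int value;
};

static struct kmem_cache *blob_cachep;

static int blob_cache_init(void)
{
	blob_cachep = kmem_cache_create("blob_cache", sizeof(struct blob), 0,
					SLAB_TYPESAFE_BY_RCU, NULL);
	return blob_cachep ? 0 : -ENOMEM;
}

/* read @slot under RCU; detect a freed-and-reused object via @gen */
static bool blob_read_speculative(struct blob __rcu **slot,
				  unsigned long gen, int *value)
{
	struct blob *b;
	bool ok = false;

	rcu_read_lock();
	b = rcu_dereference(*slot);
	if (b) {
		*value = data_race(b->value);	/* may belong to a reused object */
		ok = READ_ONCE(b->gen) == gen;	/* only trust it if not reused */
	}
	rcu_read_unlock();
	return ok;
}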
> Thanks,
> Suren.
>
> > + if (!vm_file || (vma->vm_flags & flags) != VM_MAYEXEC)
> > + return NULL;
> > +
> > + vm_inode = data_race(vm_file->f_inode);
> > + vm_pgoff = data_race(vma->vm_pgoff);
> > + vm_start = data_race(vma->vm_start);
> > +
> > + offset = (loff_t)(vm_pgoff << PAGE_SHIFT) + (bp_vaddr - vm_start);
> > + uprobe = find_uprobe_rcu(vm_inode, offset);
> > + if (!uprobe)
> > + return NULL;
> > +
> > + /* now double check that nothing about MM and VMA changed */
> > + if (!mm_end_vma_speculation(vma, mm_lock_seq))
> > + return NULL;
> > +
> > + /* happy case, we speculated successfully */
> > + return uprobe;
> > +}
> > +#else /* !CONFIG_PER_VMA_LOCK */
> > +static struct uprobe *find_active_uprobe_speculative(unsigned long bp_vaddr)
> > +{
> > + return NULL;
> > +}
> > +#endif /* CONFIG_PER_VMA_LOCK */
> > +
> > /* assumes being inside RCU protected region */
> > static struct uprobe *find_active_uprobe_rcu(unsigned long bp_vaddr,
> > int *is_swbp)
> > {
> > @@ -2251,6 +2315,10 @@ static struct uprobe
> > *find_active_uprobe_rcu(unsigned long bp_vaddr, int *is_swb
> > struct uprobe *uprobe = NULL;
> > struct vm_area_struct *vma;
> >
> > + uprobe = find_active_uprobe_speculative(bp_vaddr);
> > + if (uprobe)
> > + return uprobe;
> > +
> > mmap_read_lock(mm);
> > vma = vma_lookup(mm, bp_vaddr);
> > if (vma) {
> > diff --git a/kernel/fork.c b/kernel/fork.c
> > index cc760491f201..211a84ee92b4 100644
> > --- a/kernel/fork.c
> > +++ b/kernel/fork.c
> > @@ -3160,7 +3160,7 @@ void __init proc_caches_init(void)
> > NULL);
> > files_cachep = kmem_cache_create("files_cache",
> > sizeof(struct files_struct), 0,
> > - SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_ACCOUNT,
> > +                        SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_ACCOUNT|SLAB_TYPESAFE_BY_RCU,
> > NULL);
> > fs_cachep = kmem_cache_create("fs_cache",
> > sizeof(struct fs_struct), 0,
> >
> >
> > >
> > > P.S. This is basically the last big blocker towards linear uprobes
> > > scalability with the number of active CPUs. I have
> > > uretprobe+SRCU+timeout implemented and it seems to work fine, will
> > > post soon-ish.
> > >
> > > P.P.S Also, funny enough, below was another big scalability limiter
> > > (and the last one) :) I'm not sure if we can just drop it, or I should
> > > use per-CPU counter, but with the below change and speculative VMA
> > > lookup (however buggy, works ok for benchmarking), I finally get
> > > linear scaling of uprobe triggering throughput with number of CPUs. We
> > > are very close.
> > >
> > > diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c
> > > index f7443e996b1b..64c2bc316a08 100644
> > > --- a/kernel/trace/trace_uprobe.c
> > > +++ b/kernel/trace/trace_uprobe.c
> > > @@ -1508,7 +1508,7 @@ static int uprobe_dispatcher(struct
> > > uprobe_consumer *con, struct pt_regs *regs)
> > > int ret = 0;
> > >
> > > tu = container_of(con, struct trace_uprobe, consumer);
> > > - tu->nhit++;
> > > + //tu->nhit++;
> > >
> > > udd.tu = tu;
> > > udd.bp_addr = instruction_pointer(regs);
> > >
> > >
> > > > > +
> > > > > + /* happy case, we speculated successfully */
> > > > > + rcu_read_unlock();
> > > > > + return uprobe;
> > > > > +
> > > > > +retry_with_lock:
> > > > > + rcu_read_unlock();
> > > > > + uprobe = NULL;
> > > > > +#endif
> > > > > +
> > > > > mmap_read_lock(mm);
> > > > > vma = vma_lookup(mm, bp_vaddr);
> > > > > if (vma) {
> > > > > diff --git a/kernel/fork.c b/kernel/fork.c
> > > > > index cc760491f201..211a84ee92b4 100644
> > > > > --- a/kernel/fork.c
> > > > > +++ b/kernel/fork.c
> > > > > @@ -3160,7 +3160,7 @@ void __init proc_caches_init(void)
> > > > > NULL);
> > > > > files_cachep = kmem_cache_create("files_cache",
> > > > > sizeof(struct files_struct), 0,
> > > > > - SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_ACCOUNT,
> > > > > + SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_ACCOUNT|SLAB_TYPESAFE_BY_RCU,
> > > > > NULL);
> > > > > fs_cachep = kmem_cache_create("fs_cache",
> > > > > sizeof(struct fs_struct), 0,
^ permalink raw reply [flat|nested] 84+ messages in thread
* Re: [PATCH 00/10] perf/uprobe: Optimize uprobes
2024-08-06 17:40 ` Andrii Nakryiko
@ 2024-08-06 17:44 ` Suren Baghdasaryan
0 siblings, 0 replies; 84+ messages in thread
From: Suren Baghdasaryan @ 2024-08-06 17:44 UTC (permalink / raw)
To: Andrii Nakryiko
Cc: Peter Zijlstra, rostedt, Matthew Wilcox, Paul E. McKenney,
Masami Hiramatsu, mingo, andrii, linux-kernel, oleg, jolsa, clm,
bpf
On Tue, Aug 6, 2024 at 10:40 AM Andrii Nakryiko
<andrii.nakryiko@gmail.com> wrote:
>
> On Tue, Aug 6, 2024 at 7:51 AM Suren Baghdasaryan <surenb@google.com> wrote:
> >
> > On Mon, Aug 5, 2024 at 9:08 PM Andrii Nakryiko
> > <andrii.nakryiko@gmail.com> wrote:
> > >
> > > On Sun, Aug 4, 2024 at 4:22 PM Andrii Nakryiko
> > > <andrii.nakryiko@gmail.com> wrote:
> > > >
> > > > On Sat, Aug 3, 2024 at 1:53 AM Peter Zijlstra <peterz@infradead.org> wrote:
> > > > >
> > > > > On Fri, Aug 02, 2024 at 10:47:15PM -0700, Andrii Nakryiko wrote:
> > > > >
> > > > > > Is there any reason why the approach below won't work?
> > > > >
> > > > > > diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
> > > > > > index 8be9e34e786a..e21b68a39f13 100644
> > > > > > --- a/kernel/events/uprobes.c
> > > > > > +++ b/kernel/events/uprobes.c
> > > > > > @@ -2251,6 +2251,52 @@ static struct uprobe
> > > > > > *find_active_uprobe_rcu(unsigned long bp_vaddr, int *is_swb
> > > > > > struct uprobe *uprobe = NULL;
> > > > > > struct vm_area_struct *vma;
> > > > > >
> > > > > > +#ifdef CONFIG_PER_VMA_LOCK
> > > > > > + vm_flags_t flags = VM_HUGETLB | VM_MAYEXEC | VM_MAYSHARE, vm_flags;
> > > > > > + struct file *vm_file;
> > > > > > + struct inode *vm_inode;
> > > > > > + unsigned long vm_pgoff, vm_start, vm_end;
> > > > > > + int vm_lock_seq;
> > > > > > + loff_t offset;
> > > > > > +
> > > > > > + rcu_read_lock();
> > > > > > +
> > > > > > + vma = vma_lookup(mm, bp_vaddr);
> > > > > > + if (!vma)
> > > > > > + goto retry_with_lock;
> > > > > > +
> > > > > > + vm_lock_seq = READ_ONCE(vma->vm_lock_seq);
> > > > >
> > > > > So vma->vm_lock_seq is only updated on vma_start_write()
> > > >
> > > > yep, I've looked a bit more at the implementation now
> > > >
> > > > >
> > > > > > +
> > > > > > + vm_file = READ_ONCE(vma->vm_file);
> > > > > > + vm_flags = READ_ONCE(vma->vm_flags);
> > > > > > + if (!vm_file || (vm_flags & flags) != VM_MAYEXEC)
> > > > > > + goto retry_with_lock;
> > > > > > +
> > > > > > + vm_inode = READ_ONCE(vm_file->f_inode);
> > > > > > + vm_pgoff = READ_ONCE(vma->vm_pgoff);
> > > > > > + vm_start = READ_ONCE(vma->vm_start);
> > > > > > + vm_end = READ_ONCE(vma->vm_end);
> > > > >
> > > > > None of those are written with WRITE_ONCE(), so this buys you nothing.
> > > > > Compiler could be updating them one byte at a time while you load some
> > > > > franken-update.
> > > > >
> > > > > Also, if you're in the middle of split_vma() you might not get a
> > > > > consistent set.
> > > >
> > > > I used READ_ONCE() only to prevent the compiler from re-reading those
> > > > values. We assume those values are garbage anyways and double-check
> > > > everything, so lack of WRITE_ONCE doesn't matter. Same for
> > > > inconsistency if we are in the middle of split_vma().
> > > >
> > > > We use the result of all this speculative calculation only if we find
> > > > a valid uprobe (which could be a false positive) *and* if we detect
> > > > that nothing about VMA changed (which is what I got wrong, but
> > > > honestly I was actually betting on others to help me get this right
> > > > anyways).
> > > >
> > > > >
> > > > > > + if (bp_vaddr < vm_start || bp_vaddr >= vm_end)
> > > > > > + goto retry_with_lock;
> > > > > > +
> > > > > > + offset = (loff_t)(vm_pgoff << PAGE_SHIFT) + (bp_vaddr - vm_start);
> > > > > > + uprobe = find_uprobe_rcu(vm_inode, offset);
> > > > > > + if (!uprobe)
> > > > > > + goto retry_with_lock;
> > > > > > +
> > > > > > + /* now double check that nothing about VMA changed */
> > > > > > + if (vm_lock_seq != READ_ONCE(vma->vm_lock_seq))
> > > > > > + goto retry_with_lock;
> > > > >
> > > > > Since vma->vma_lock_seq is only ever updated at vma_start_write() you're
> > > > > checking you're in or after the same modification cycle.
> > > > >
> > > > > The point of sequence locks is to check you *IN* a modification cycle
> > > > > and retry if you are. You're now explicitly continuing if you're in a
> > > > > modification.
> > > > >
> > > > > You really need:
> > > > >
> > > > > seq++;
> > > > > wmb();
> > > > >
> > > > > ... do modification
> > > > >
> > > > > wmb();
> > > > > seq++;
> > > > >
> > > > > vs
> > > > >
> > > > > do {
> > > > > s = READ_ONCE(seq) & ~1;
> > > > > rmb();
> > > > >
> > > > > ... read stuff
> > > > >
> > > > > } while (rmb(), seq != s);
> > > > >
> > > > >
> > > > > The thing to note is that seq will be odd while inside a modification
> > > > > and even outside, further if the pre and post seq are both even but not
> > > > > identical, you've crossed a modification and also need to retry.
> > > > >
> > > >
> > > > Ok, I don't think I got everything you have written above, sorry. But
> > > > let me explain what I think I need to do and please correct what I
> > > > (still) got wrong.
> > > >
> > > > a) before starting speculation,
> > > > a.1) read and remember current->mm->mm_lock_seq (using
> > > > smp_load_acquire(), right?)
> > > > a.2) read vma->vm_lock_seq (using smp_load_acquire() I presume)
> > > > a.3) if vm_lock_seq is odd, we are already modifying VMA, so bail
> > > > out, try with proper mmap_lock
> > > > b) proceed with the inode pointer fetch and offset calculation as I've coded it
> > > > c) lookup uprobe by inode+offset, if failed -- bail out (if succeeded,
> > > > this could still be wrong)
> > > > d) re-read vma->vm_lock_seq, if it changed, we started modifying/have
> > > > already modified VMA, bail out
> > > > e) re-read mm->mm_lock_seq, if that changed -- presume VMA got
> > > > modified, bail out
> > > >
> > > > At this point we should have a guarantee that nothing about mm
> > > > changed, nor that VMA started being modified during our speculative
> > > > calculation+uprobe lookup. So if we found a valid uprobe, it must be a
> > > > correct one that we need.
> > > >
> > > > Is that enough? Any holes in the approach? And thanks for thoroughly
> > > > thinking about this, btw!
> > >
> > > Ok, with slight modifications to the details of the above (e.g., there
> > > is actually no "odd means VMA is being modified" thing with
> > > vm_lock_seq), I ended up with the implementation below. Basically we
> > > validate that mm->mm_lock_seq didn't change and that vm_lock_seq !=
> > > mm_lock_seq (which otherwise would mean "VMA is being modified").
> > > There is a possibility that vm_lock_seq == mm_lock_seq just by
> > > accident, which is not a correctness problem, we'll just fallback to
> > > locked implementation until something about VMA or mm_struct itself
> > > changes. Which is fine, and if mm folks ever change this locking
> > > schema, this might go away.
> > >
> > > If this seems on the right track, I think we can just move
> > > mm_start_vma_specuation()/mm_end_vma_speculation() into
> > > include/linux/mm.h.
> > >
> > > And after thinking a bit more about READ_ONCE() usage, I changed them
> > > to data_race() to not trigger KCSAN warnings. Initially I kept
> > > READ_ONCE() only around vma->vm_file access, but given we never change
> > > it until vma is freed and reused (which would be prevented by
> > > guard(rcu)), I dropped READ_ONCE() and only added data_race(). And
> > > even data_race() is probably not necessary.
> > >
> > > Anyways, please see the patch below. Would be nice if mm folks
> > > (Suren?) could confirm that this is not broken.
> >
> > Hi Andrii,
> > Sorry, I'm catching up on my emails and will get back to this to read
> > through the discussion later today.
>
> Great, thank you! I appreciate you taking a thorough look!
>
> > One obvious thing that is problematic in this whole schema is the fact
> > that vm_file is not RCU-safe as I mentioned before. See below.
> >
> > >
> > >
> > >
> > > Author: Andrii Nakryiko <andrii@kernel.org>
> > > Date: Fri Aug 2 22:16:40 2024 -0700
> > >
> > > uprobes: add speculative lockless VMA to inode resolution
> > >
> > > Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
> > >
> > > diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
> > > index 3de311c56d47..bee7a929ff02 100644
> > > --- a/kernel/events/uprobes.c
> > > +++ b/kernel/events/uprobes.c
> > > @@ -2244,6 +2244,70 @@ static int is_trap_at_addr(struct mm_struct
> > > *mm, unsigned long vaddr)
> > > return is_trap_insn(&opcode);
> > > }
> > >
> > > +#ifdef CONFIG_PER_VMA_LOCK
> > > +static inline void mm_start_vma_speculation(struct mm_struct *mm, int
> > > *mm_lock_seq)
> > > +{
> > > + *mm_lock_seq = smp_load_acquire(&mm->mm_lock_seq);
> > > +}
> > > +
> > > +/* returns true if speculation was safe (no mm and vma modification
> > > happened) */
> > > +static inline bool mm_end_vma_speculation(struct vm_area_struct *vma,
> > > int mm_lock_seq)
> > > +{
> > > + int mm_seq, vma_seq;
> > > +
> > > + mm_seq = smp_load_acquire(&vma->vm_mm->mm_lock_seq);
> > > + vma_seq = READ_ONCE(vma->vm_lock_seq);
> > > +
> > > + return mm_seq == mm_lock_seq && vma_seq != mm_seq;
> > > +}
> > > +
> > > +static struct uprobe *find_active_uprobe_speculative(unsigned long bp_vaddr)
> > > +{
> > > + const vm_flags_t flags = VM_HUGETLB | VM_MAYEXEC | VM_MAYSHARE;
> > > + struct mm_struct *mm = current->mm;
> > > + struct uprobe *uprobe;
> > > + struct vm_area_struct *vma;
> > > + struct file *vm_file;
> > > + struct inode *vm_inode;
> > > + unsigned long vm_pgoff, vm_start;
> > > + int mm_lock_seq;
> > > + loff_t offset;
> > > +
> > > + guard(rcu)();
> > > +
> > > + mm_start_vma_speculation(mm, &mm_lock_seq);
> > > +
> > > + vma = vma_lookup(mm, bp_vaddr);
> > > + if (!vma)
> > > + return NULL;
> > > +
> > > + vm_file = data_race(vma->vm_file);
> >
> > Consider what happens if remove_vma() is racing with this code and at
> > this point it calls fput(vma->vm_file). Your vma will be valid (it's
> > freed after RCU grace period) but vma->vm_file can be freed from under
> > you. For this to work vma->vm_file should be made RCU-safe.
>
> Note that I'm adding SLAB_TYPESAFE_BY_RCU to files_cachep, as
> suggested by Matthew. I thought that would be enough to ensure that
> vm_file's memory won't be freed until after the RCU grace period. It's
> ok if file struct itself is reused for another file (we should detect
> that as vma/mm change with sequence numbers).
Ah, I see SLAB_TYPESAFE_BY_RCU now in the previous version which I
haven't reviewed yet. Ok, I'll go over the whole thread before
replying. Thanks!
>
> > Thanks,
> > Suren.
> >
> > > + if (!vm_file || (vma->vm_flags & flags) != VM_MAYEXEC)
> > > + return NULL;
> > > +
> > > + vm_inode = data_race(vm_file->f_inode);
> > > + vm_pgoff = data_race(vma->vm_pgoff);
> > > + vm_start = data_race(vma->vm_start);
> > > +
> > > + offset = (loff_t)(vm_pgoff << PAGE_SHIFT) + (bp_vaddr - vm_start);
> > > + uprobe = find_uprobe_rcu(vm_inode, offset);
> > > + if (!uprobe)
> > > + return NULL;
> > > +
> > > + /* now double check that nothing about MM and VMA changed */
> > > + if (!mm_end_vma_speculation(vma, mm_lock_seq))
> > > + return NULL;
> > > +
> > > + /* happy case, we speculated successfully */
> > > + return uprobe;
> > > +}
> > > +#else /* !CONFIG_PER_VMA_LOCK */
> > > +static struct uprobe *find_active_uprobe_speculative(unsigned long bp_vaddr)
> > > +{
> > > + return NULL;
> > > +}
> > > +#endif /* CONFIG_PER_VMA_LOCK */
> > > +
> > > /* assumes being inside RCU protected region */
> > > static struct uprobe *find_active_uprobe_rcu(unsigned long bp_vaddr,
> > > int *is_swbp)
> > > {
> > > @@ -2251,6 +2315,10 @@ static struct uprobe
> > > *find_active_uprobe_rcu(unsigned long bp_vaddr, int *is_swb
> > > struct uprobe *uprobe = NULL;
> > > struct vm_area_struct *vma;
> > >
> > > + uprobe = find_active_uprobe_speculative(bp_vaddr);
> > > + if (uprobe)
> > > + return uprobe;
> > > +
> > > mmap_read_lock(mm);
> > > vma = vma_lookup(mm, bp_vaddr);
> > > if (vma) {
> > > diff --git a/kernel/fork.c b/kernel/fork.c
> > > index cc760491f201..211a84ee92b4 100644
> > > --- a/kernel/fork.c
> > > +++ b/kernel/fork.c
> > > @@ -3160,7 +3160,7 @@ void __init proc_caches_init(void)
> > > NULL);
> > > files_cachep = kmem_cache_create("files_cache",
> > > sizeof(struct files_struct), 0,
> > > - SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_ACCOUNT,
> > > +                        SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_ACCOUNT|SLAB_TYPESAFE_BY_RCU,
> > > NULL);
> > > fs_cachep = kmem_cache_create("fs_cache",
> > > sizeof(struct fs_struct), 0,
> > >
> > >
> > > >
> > > > P.S. This is basically the last big blocker towards linear uprobes
> > > > scalability with the number of active CPUs. I have
> > > > uretprobe+SRCU+timeout implemented and it seems to work fine, will
> > > > post soon-ish.
> > > >
> > > > P.P.S Also, funny enough, below was another big scalability limiter
> > > > (and the last one) :) I'm not sure if we can just drop it, or I should
> > > > use per-CPU counter, but with the below change and speculative VMA
> > > > lookup (however buggy, works ok for benchmarking), I finally get
> > > > linear scaling of uprobe triggering throughput with number of CPUs. We
> > > > are very close.
> > > >
> > > > diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c
> > > > index f7443e996b1b..64c2bc316a08 100644
> > > > --- a/kernel/trace/trace_uprobe.c
> > > > +++ b/kernel/trace/trace_uprobe.c
> > > > @@ -1508,7 +1508,7 @@ static int uprobe_dispatcher(struct
> > > > uprobe_consumer *con, struct pt_regs *regs)
> > > > int ret = 0;
> > > >
> > > > tu = container_of(con, struct trace_uprobe, consumer);
> > > > - tu->nhit++;
> > > > + //tu->nhit++;
> > > >
> > > > udd.tu = tu;
> > > > udd.bp_addr = instruction_pointer(regs);
> > > >
> > > >
> > > > > > +
> > > > > > + /* happy case, we speculated successfully */
> > > > > > + rcu_read_unlock();
> > > > > > + return uprobe;
> > > > > > +
> > > > > > +retry_with_lock:
> > > > > > + rcu_read_unlock();
> > > > > > + uprobe = NULL;
> > > > > > +#endif
> > > > > > +
> > > > > > mmap_read_lock(mm);
> > > > > > vma = vma_lookup(mm, bp_vaddr);
> > > > > > if (vma) {
> > > > > > diff --git a/kernel/fork.c b/kernel/fork.c
> > > > > > index cc760491f201..211a84ee92b4 100644
> > > > > > --- a/kernel/fork.c
> > > > > > +++ b/kernel/fork.c
> > > > > > @@ -3160,7 +3160,7 @@ void __init proc_caches_init(void)
> > > > > > NULL);
> > > > > > files_cachep = kmem_cache_create("files_cache",
> > > > > > sizeof(struct files_struct), 0,
> > > > > > - SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_ACCOUNT,
> > > > > > + SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_ACCOUNT|SLAB_TYPESAFE_BY_RCU,
> > > > > > NULL);
> > > > > > fs_cachep = kmem_cache_create("fs_cache",
> > > > > > sizeof(struct fs_struct), 0,
^ permalink raw reply [flat|nested] 84+ messages in thread
* Re: [PATCH 00/10] perf/uprobe: Optimize uprobes
2024-08-06 4:08 ` Andrii Nakryiko
2024-08-06 14:50 ` Suren Baghdasaryan
@ 2024-08-07 1:36 ` Suren Baghdasaryan
2024-08-07 5:13 ` Suren Baghdasaryan
1 sibling, 1 reply; 84+ messages in thread
From: Suren Baghdasaryan @ 2024-08-07 1:36 UTC (permalink / raw)
To: Andrii Nakryiko
Cc: Peter Zijlstra, rostedt, Matthew Wilcox, Paul E. McKenney,
Masami Hiramatsu, mingo, andrii, linux-kernel, oleg, jolsa, clm,
bpf
On Mon, Aug 5, 2024 at 9:08 PM Andrii Nakryiko
<andrii.nakryiko@gmail.com> wrote:
>
> On Sun, Aug 4, 2024 at 4:22 PM Andrii Nakryiko
> <andrii.nakryiko@gmail.com> wrote:
> >
> > On Sat, Aug 3, 2024 at 1:53 AM Peter Zijlstra <peterz@infradead.org> wrote:
> > >
> > > On Fri, Aug 02, 2024 at 10:47:15PM -0700, Andrii Nakryiko wrote:
> > >
> > > > Is there any reason why the approach below won't work?
> > >
> > > > diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
> > > > index 8be9e34e786a..e21b68a39f13 100644
> > > > --- a/kernel/events/uprobes.c
> > > > +++ b/kernel/events/uprobes.c
> > > > @@ -2251,6 +2251,52 @@ static struct uprobe
> > > > *find_active_uprobe_rcu(unsigned long bp_vaddr, int *is_swb
> > > > struct uprobe *uprobe = NULL;
> > > > struct vm_area_struct *vma;
> > > >
> > > > +#ifdef CONFIG_PER_VMA_LOCK
> > > > + vm_flags_t flags = VM_HUGETLB | VM_MAYEXEC | VM_MAYSHARE, vm_flags;
> > > > + struct file *vm_file;
> > > > + struct inode *vm_inode;
> > > > + unsigned long vm_pgoff, vm_start, vm_end;
> > > > + int vm_lock_seq;
> > > > + loff_t offset;
> > > > +
> > > > + rcu_read_lock();
> > > > +
> > > > + vma = vma_lookup(mm, bp_vaddr);
> > > > + if (!vma)
> > > > + goto retry_with_lock;
> > > > +
> > > > + vm_lock_seq = READ_ONCE(vma->vm_lock_seq);
> > >
> > > So vma->vm_lock_seq is only updated on vma_start_write()
> >
> > yep, I've looked a bit more at the implementation now
> >
> > >
> > > > +
> > > > + vm_file = READ_ONCE(vma->vm_file);
> > > > + vm_flags = READ_ONCE(vma->vm_flags);
> > > > + if (!vm_file || (vm_flags & flags) != VM_MAYEXEC)
> > > > + goto retry_with_lock;
> > > > +
> > > > + vm_inode = READ_ONCE(vm_file->f_inode);
> > > > + vm_pgoff = READ_ONCE(vma->vm_pgoff);
> > > > + vm_start = READ_ONCE(vma->vm_start);
> > > > + vm_end = READ_ONCE(vma->vm_end);
> > >
> > > None of those are written with WRITE_ONCE(), so this buys you nothing.
> > > Compiler could be updating them one byte at a time while you load some
> > > franken-update.
> > >
> > > Also, if you're in the middle of split_vma() you might not get a
> > > consistent set.
> >
> > I used READ_ONCE() only to prevent the compiler from re-reading those
> > values. We assume those values are garbage anyways and double-check
> > everything, so lack of WRITE_ONCE doesn't matter. Same for
> > inconsistency if we are in the middle of split_vma().
> >
> > We use the result of all this speculative calculation only if we find
> > a valid uprobe (which could be a false positive) *and* if we detect
> > that nothing about VMA changed (which is what I got wrong, but
> > honestly I was actually betting on others to help me get this right
> > anyways).
> >
> > >
> > > > + if (bp_vaddr < vm_start || bp_vaddr >= vm_end)
> > > > + goto retry_with_lock;
> > > > +
> > > > + offset = (loff_t)(vm_pgoff << PAGE_SHIFT) + (bp_vaddr - vm_start);
> > > > + uprobe = find_uprobe_rcu(vm_inode, offset);
> > > > + if (!uprobe)
> > > > + goto retry_with_lock;
> > > > +
> > > > + /* now double check that nothing about VMA changed */
> > > > + if (vm_lock_seq != READ_ONCE(vma->vm_lock_seq))
> > > > + goto retry_with_lock;
> > >
> > > Since vma->vma_lock_seq is only ever updated at vma_start_write() you're
> > > checking you're in or after the same modification cycle.
> > >
> > > The point of sequence locks is to check you *IN* a modification cycle
> > > and retry if you are. You're now explicitly continuing if you're in a
> > > modification.
> > >
> > > You really need:
> > >
> > > seq++;
> > > wmb();
> > >
> > > ... do modification
> > >
> > > wmb();
> > > seq++;
> > >
> > > vs
> > >
> > > do {
> > > s = READ_ONCE(seq) & ~1;
> > > rmb();
> > >
> > > ... read stuff
> > >
> > > } while (rmb(), seq != s);
> > >
> > >
> > > The thing to note is that seq will be odd while inside a modification
> > > and even outside, further if the pre and post seq are both even but not
> > > identical, you've crossed a modification and also need to retry.
> > >
> >
> > Ok, I don't think I got everything you have written above, sorry. But
> > let me explain what I think I need to do and please correct what I
> > (still) got wrong.
> >
> > a) before starting speculation,
> > a.1) read and remember current->mm->mm_lock_seq (using
> > smp_load_acquire(), right?)
> > a.2) read vma->vm_lock_seq (using smp_load_acquire() I presume)
> > a.3) if vm_lock_seq is odd, we are already modifying VMA, so bail
> > out, try with proper mmap_lock
> > b) proceed with the inode pointer fetch and offset calculation as I've coded it
> > c) lookup uprobe by inode+offset, if failed -- bail out (if succeeded,
> > this could still be wrong)
> > d) re-read vma->vm_lock_seq, if it changed, we started modifying/have
> > already modified VMA, bail out
> > e) re-read mm->mm_lock_seq, if that changed -- presume VMA got
> > modified, bail out
> >
> > At this point we should have a guarantee that nothing about mm
> > changed, nor that VMA started being modified during our speculative
> > calculation+uprobe lookup. So if we found a valid uprobe, it must be a
> > correct one that we need.
> >
> > Is that enough? Any holes in the approach? And thanks for thoroughly
> > thinking about this, btw!
>
> Ok, with slight modifications to the details of the above (e.g., there
> is actually no "odd means VMA is being modified" thing with
> vm_lock_seq),
Correct. Instead of that, (vma->vm_lock_seq == mm->mm_lock_seq)
means your VMA is write-locked and is being modified.
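For reference, the invariant in (simplified) sketch form -- this is a
sketch of the scheme, not the exact mm implementation:

/* simplified: how a VMA gets marked write-locked under CONFIG_PER_VMA_LOCK */
static void vma_mark_write_locked(struct vm_area_struct *vma)
{
	/* done by vma_start_write() with mmap_lock held for write */
	WRITE_ONCE(vma->vm_lock_seq, vma->vm_mm->mm_lock_seq);
}

/* "being modified" == the two sequence counts currently match */
static bool vma_is_write_locked(struct vm_area_struct *vma)
{
	return READ_ONCE(vma->vm_lock_seq) ==
	       READ_ONCE(vma->vm_mm->mm_lock_seq);
}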
> I ended up with the implementation below. Basically we
> validate that mm->mm_lock_seq didn't change and that vm_lock_seq !=
> mm_lock_seq (which otherwise would mean "VMA is being modified").
Validating that mm->mm_lock_seq did not change does not provide you
with much useful information. A change in mm->mm_lock_seq only means
that, between the point where you recorded it and the point where you
check it, there was an mmap_write_unlock() or mmap_write_downgrade()
call. Your VMA might not even have been part of the modification for
which mmap_lock was taken.
In theory what you need is simpler (simplified code for explanation only):
int vm_lock_seq = vma->vm_lock_seq;

if (vm_lock_seq == mm->mm_lock_seq)
	goto bail_out; /* VMA is write-locked */

/* copy required VMA attributes */

if (vm_lock_seq != vma->vm_lock_seq)
	goto bail_out; /* VMA got write-locked */
But this would require proper ACQUIRE/RELEASE semantics for
vma->vm_lock_seq which is currently not there because all reads/writes
to vma->vm_lock_seq that matter are done under vma->vm_lock->lock
protection, so additional ordering is not required. If you decide to
add that semantics for vma->vm_lock_seq, please make sure that
pagefault path performance does not regress.
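To make the ordering concrete, the reader side would have to look
roughly like the sketch below -- a sketch only, and it assumes the
write side publishes vm_lock_seq with release semantics, which is not
the case today:

/* sketch: speculative VMA field reads gated on vm_lock_seq */
static bool vma_copy_speculative(struct vm_area_struct *vma,
				 unsigned long *pgoff, unsigned long *start)
{
	int seq;

	seq = smp_load_acquire(&vma->vm_lock_seq);  /* orders the reads below */
	if (seq == READ_ONCE(vma->vm_mm->mm_lock_seq))
		return false;			    /* VMA is write-locked */

	*pgoff = data_race(vma->vm_pgoff);	    /* tentative values ... */
	*start = data_race(vma->vm_start);

	smp_rmb();				    /* ... read before the recheck */
	return seq == READ_ONCE(vma->vm_lock_seq);  /* unchanged => no writer locked it */
}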
> There is a possibility that vm_lock_seq == mm_lock_seq just by
> accident, which is not a correctness problem, we'll just fallback to
> locked implementation until something about VMA or mm_struct itself
> changes. Which is fine, and if mm folks ever change this locking
> schema, this might go away.
>
> If this seems on the right track, I think we can just move
> mm_start_vma_specuation()/mm_end_vma_speculation() into
> include/linux/mm.h.
>
> And after thinking a bit more about READ_ONCE() usage, I changed them
> to data_race() to not trigger KCSAN warnings. Initially I kept
> READ_ONCE() only around vma->vm_file access, but given we never change
> it until vma is freed and reused (which would be prevented by
> guard(rcu)), I dropped READ_ONCE() and only added data_race(). And
> even data_race() is probably not necessary.
>
> Anyways, please see the patch below. Would be nice if mm folks
> (Suren?) could confirm that this is not broken.
>
>
>
> Author: Andrii Nakryiko <andrii@kernel.org>
> Date: Fri Aug 2 22:16:40 2024 -0700
>
> uprobes: add speculative lockless VMA to inode resolution
>
> Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
>
> diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
> index 3de311c56d47..bee7a929ff02 100644
> --- a/kernel/events/uprobes.c
> +++ b/kernel/events/uprobes.c
> @@ -2244,6 +2244,70 @@ static int is_trap_at_addr(struct mm_struct
> *mm, unsigned long vaddr)
> return is_trap_insn(&opcode);
> }
>
> +#ifdef CONFIG_PER_VMA_LOCK
> +static inline void mm_start_vma_speculation(struct mm_struct *mm, int
> *mm_lock_seq)
> +{
> + *mm_lock_seq = smp_load_acquire(&mm->mm_lock_seq);
> +}
> +
> +/* returns true if speculation was safe (no mm and vma modification
> happened) */
> +static inline bool mm_end_vma_speculation(struct vm_area_struct *vma,
> int mm_lock_seq)
> +{
> + int mm_seq, vma_seq;
> +
> + mm_seq = smp_load_acquire(&vma->vm_mm->mm_lock_seq);
> + vma_seq = READ_ONCE(vma->vm_lock_seq);
> +
> + return mm_seq == mm_lock_seq && vma_seq != mm_seq;
After spending some time on this I think what you do here is
semantically correct but sub-optimal.
This check means that there was no call to
mmap_write_unlock()/mmap_write_downgrade() since
mm_start_vma_speculation() and that the vma is not currently locked. To
unlock a write-locked VMA you do need to call
mmap_write_unlock()/mmap_write_downgrade(), so I think this check would
guarantee that your vma was not locked and modified from under us.
However, this will also trigger false positives if
mmap_write_unlock()/mmap_write_downgrade() was called but the vma you
are using was never locked. So it will bail out more often than
necessary. Maybe it's ok?
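In other words, annotating the check from the patch above (same logic
as mm_end_vma_speculation(), comments only):

	return mm_seq == mm_lock_seq	/* no mmap_write_unlock()/_downgrade()
					 * since mm_start_vma_speculation(),
					 * i.e. no writer finished in the
					 * window (for *any* VMA) ...
					 */
	    && vma_seq != mm_seq;	/* ... and this VMA is not write-locked
					 * right now.  An unlock for an
					 * unrelated VMA also bumps mm_lock_seq
					 * and fails the first test -- that is
					 * where the false positives come from.
					 */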
> +}
> +
> +static struct uprobe *find_active_uprobe_speculative(unsigned long bp_vaddr)
> +{
> + const vm_flags_t flags = VM_HUGETLB | VM_MAYEXEC | VM_MAYSHARE;
> + struct mm_struct *mm = current->mm;
> + struct uprobe *uprobe;
> + struct vm_area_struct *vma;
> + struct file *vm_file;
> + struct inode *vm_inode;
> + unsigned long vm_pgoff, vm_start;
> + int mm_lock_seq;
> + loff_t offset;
> +
> + guard(rcu)();
> +
> + mm_start_vma_speculation(mm, &mm_lock_seq);
> +
> + vma = vma_lookup(mm, bp_vaddr);
> + if (!vma)
> + return NULL;
> +
> + vm_file = data_race(vma->vm_file);
> + if (!vm_file || (vma->vm_flags & flags) != VM_MAYEXEC)
> + return NULL;
> +
> + vm_inode = data_race(vm_file->f_inode);
> + vm_pgoff = data_race(vma->vm_pgoff);
> + vm_start = data_race(vma->vm_start);
> +
> + offset = (loff_t)(vm_pgoff << PAGE_SHIFT) + (bp_vaddr - vm_start);
> + uprobe = find_uprobe_rcu(vm_inode, offset);
> + if (!uprobe)
> + return NULL;
> +
> + /* now double check that nothing about MM and VMA changed */
> + if (!mm_end_vma_speculation(vma, mm_lock_seq))
> + return NULL;
> +
> + /* happy case, we speculated successfully */
> + return uprobe;
> +}
> +#else /* !CONFIG_PER_VMA_LOCK */
> +static struct uprobe *find_active_uprobe_speculative(unsigned long bp_vaddr)
> +{
> + return NULL;
> +}
> +#endif /* CONFIG_PER_VMA_LOCK */
> +
> /* assumes being inside RCU protected region */
> static struct uprobe *find_active_uprobe_rcu(unsigned long bp_vaddr,
> int *is_swbp)
> {
> @@ -2251,6 +2315,10 @@ static struct uprobe
> *find_active_uprobe_rcu(unsigned long bp_vaddr, int *is_swb
> struct uprobe *uprobe = NULL;
> struct vm_area_struct *vma;
>
> + uprobe = find_active_uprobe_speculative(bp_vaddr);
> + if (uprobe)
> + return uprobe;
> +
> mmap_read_lock(mm);
> vma = vma_lookup(mm, bp_vaddr);
> if (vma) {
> diff --git a/kernel/fork.c b/kernel/fork.c
> index cc760491f201..211a84ee92b4 100644
> --- a/kernel/fork.c
> +++ b/kernel/fork.c
> @@ -3160,7 +3160,7 @@ void __init proc_caches_init(void)
> NULL);
> files_cachep = kmem_cache_create("files_cache",
> sizeof(struct files_struct), 0,
> - SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_ACCOUNT,
> +                        SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_ACCOUNT|SLAB_TYPESAFE_BY_RCU,
> NULL);
> fs_cachep = kmem_cache_create("fs_cache",
> sizeof(struct fs_struct), 0,
>
>
> >
> > P.S. This is basically the last big blocker towards linear uprobes
> > scalability with the number of active CPUs. I have
> > uretprobe+SRCU+timeout implemented and it seems to work fine, will
> > post soon-ish.
> >
> > P.P.S Also, funny enough, below was another big scalability limiter
> > (and the last one) :) I'm not sure if we can just drop it, or I should
> > use per-CPU counter, but with the below change and speculative VMA
> > lookup (however buggy, works ok for benchmarking), I finally get
> > linear scaling of uprobe triggering throughput with number of CPUs. We
> > are very close.
> >
> > diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c
> > index f7443e996b1b..64c2bc316a08 100644
> > --- a/kernel/trace/trace_uprobe.c
> > +++ b/kernel/trace/trace_uprobe.c
> > @@ -1508,7 +1508,7 @@ static int uprobe_dispatcher(struct
> > uprobe_consumer *con, struct pt_regs *regs)
> > int ret = 0;
> >
> > tu = container_of(con, struct trace_uprobe, consumer);
> > - tu->nhit++;
> > + //tu->nhit++;
> >
> > udd.tu = tu;
> > udd.bp_addr = instruction_pointer(regs);
> >
> >
> > > > +
> > > > + /* happy case, we speculated successfully */
> > > > + rcu_read_unlock();
> > > > + return uprobe;
> > > > +
> > > > +retry_with_lock:
> > > > + rcu_read_unlock();
> > > > + uprobe = NULL;
> > > > +#endif
> > > > +
> > > > mmap_read_lock(mm);
> > > > vma = vma_lookup(mm, bp_vaddr);
> > > > if (vma) {
> > > > diff --git a/kernel/fork.c b/kernel/fork.c
> > > > index cc760491f201..211a84ee92b4 100644
> > > > --- a/kernel/fork.c
> > > > +++ b/kernel/fork.c
> > > > @@ -3160,7 +3160,7 @@ void __init proc_caches_init(void)
> > > > NULL);
> > > > files_cachep = kmem_cache_create("files_cache",
> > > > sizeof(struct files_struct), 0,
> > > > - SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_ACCOUNT,
> > > > + SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_ACCOUNT|SLAB_TYPESAFE_BY_RCU,
> > > > NULL);
> > > > fs_cachep = kmem_cache_create("fs_cache",
> > > > sizeof(struct fs_struct), 0,
^ permalink raw reply [flat|nested] 84+ messages in thread
* Re: [PATCH 00/10] perf/uprobe: Optimize uprobes
2024-08-07 1:36 ` Suren Baghdasaryan
@ 2024-08-07 5:13 ` Suren Baghdasaryan
2024-08-07 17:49 ` Andrii Nakryiko
0 siblings, 1 reply; 84+ messages in thread
From: Suren Baghdasaryan @ 2024-08-07 5:13 UTC (permalink / raw)
To: Andrii Nakryiko
Cc: Peter Zijlstra, rostedt, Matthew Wilcox, Paul E. McKenney,
Masami Hiramatsu, mingo, andrii, linux-kernel, oleg, jolsa, clm,
bpf
On Tue, Aug 6, 2024 at 6:36 PM Suren Baghdasaryan <surenb@google.com> wrote:
>
> On Mon, Aug 5, 2024 at 9:08 PM Andrii Nakryiko
> <andrii.nakryiko@gmail.com> wrote:
> >
> > On Sun, Aug 4, 2024 at 4:22 PM Andrii Nakryiko
> > <andrii.nakryiko@gmail.com> wrote:
> > >
> > > On Sat, Aug 3, 2024 at 1:53 AM Peter Zijlstra <peterz@infradead.org> wrote:
> > > >
> > > > On Fri, Aug 02, 2024 at 10:47:15PM -0700, Andrii Nakryiko wrote:
> > > >
> > > > > Is there any reason why the approach below won't work?
> > > >
> > > > > diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
> > > > > index 8be9e34e786a..e21b68a39f13 100644
> > > > > --- a/kernel/events/uprobes.c
> > > > > +++ b/kernel/events/uprobes.c
> > > > > @@ -2251,6 +2251,52 @@ static struct uprobe
> > > > > *find_active_uprobe_rcu(unsigned long bp_vaddr, int *is_swb
> > > > > struct uprobe *uprobe = NULL;
> > > > > struct vm_area_struct *vma;
> > > > >
> > > > > +#ifdef CONFIG_PER_VMA_LOCK
> > > > > + vm_flags_t flags = VM_HUGETLB | VM_MAYEXEC | VM_MAYSHARE, vm_flags;
> > > > > + struct file *vm_file;
> > > > > + struct inode *vm_inode;
> > > > > + unsigned long vm_pgoff, vm_start, vm_end;
> > > > > + int vm_lock_seq;
> > > > > + loff_t offset;
> > > > > +
> > > > > + rcu_read_lock();
> > > > > +
> > > > > + vma = vma_lookup(mm, bp_vaddr);
> > > > > + if (!vma)
> > > > > + goto retry_with_lock;
> > > > > +
> > > > > + vm_lock_seq = READ_ONCE(vma->vm_lock_seq);
> > > >
> > > > So vma->vm_lock_seq is only updated on vma_start_write()
> > >
> > > yep, I've looked a bit more at the implementation now
> > >
> > > >
> > > > > +
> > > > > + vm_file = READ_ONCE(vma->vm_file);
> > > > > + vm_flags = READ_ONCE(vma->vm_flags);
> > > > > + if (!vm_file || (vm_flags & flags) != VM_MAYEXEC)
> > > > > + goto retry_with_lock;
> > > > > +
> > > > > + vm_inode = READ_ONCE(vm_file->f_inode);
> > > > > + vm_pgoff = READ_ONCE(vma->vm_pgoff);
> > > > > + vm_start = READ_ONCE(vma->vm_start);
> > > > > + vm_end = READ_ONCE(vma->vm_end);
> > > >
> > > > None of those are written with WRITE_ONCE(), so this buys you nothing.
> > > > Compiler could be updating them one byte at a time while you load some
> > > > franken-update.
> > > >
> > > > Also, if you're in the middle of split_vma() you might not get a
> > > > consistent set.
> > >
> > > I used READ_ONCE() only to prevent the compiler from re-reading those
> > > values. We assume those values are garbage anyways and double-check
> > > everything, so lack of WRITE_ONCE doesn't matter. Same for
> > > inconsistency if we are in the middle of split_vma().
> > >
> > > We use the result of all this speculative calculation only if we find
> > > a valid uprobe (which could be a false positive) *and* if we detect
> > > that nothing about VMA changed (which is what I got wrong, but
> > > honestly I was actually betting on others to help me get this right
> > > anyways).
> > >
> > > >
> > > > > + if (bp_vaddr < vm_start || bp_vaddr >= vm_end)
> > > > > + goto retry_with_lock;
> > > > > +
> > > > > + offset = (loff_t)(vm_pgoff << PAGE_SHIFT) + (bp_vaddr - vm_start);
> > > > > + uprobe = find_uprobe_rcu(vm_inode, offset);
> > > > > + if (!uprobe)
> > > > > + goto retry_with_lock;
> > > > > +
> > > > > + /* now double check that nothing about VMA changed */
> > > > > + if (vm_lock_seq != READ_ONCE(vma->vm_lock_seq))
> > > > > + goto retry_with_lock;
> > > >
> > > > Since vma->vma_lock_seq is only ever updated at vma_start_write() you're
> > > > checking you're in or after the same modification cycle.
> > > >
> > > > The point of sequence locks is to check you *IN* a modification cycle
> > > > and retry if you are. You're now explicitly continuing if you're in a
> > > > modification.
> > > >
> > > > You really need:
> > > >
> > > > seq++;
> > > > wmb();
> > > >
> > > > ... do modification
> > > >
> > > > wmb();
> > > > seq++;
> > > >
> > > > vs
> > > >
> > > > do {
> > > > s = READ_ONCE(seq) & ~1;
> > > > rmb();
> > > >
> > > > ... read stuff
> > > >
> > > > } while (rmb(), seq != s);
> > > >
> > > >
> > > > The thing to note is that seq will be odd while inside a modification
> > > > and even outside, further if the pre and post seq are both even but not
> > > > identical, you've crossed a modification and also need to retry.
> > > >
> > >
> > > Ok, I don't think I got everything you have written above, sorry. But
> > > let me explain what I think I need to do and please correct what I
> > > (still) got wrong.
> > >
> > > a) before starting speculation,
> > > a.1) read and remember current->mm->mm_lock_seq (using
> > > smp_load_acquire(), right?)
> > > a.2) read vma->vm_lock_seq (using smp_load_acquire() I presume)
> > > a.3) if vm_lock_seq is odd, we are already modifying VMA, so bail
> > > out, try with proper mmap_lock
> > > b) proceed with the inode pointer fetch and offset calculation as I've coded it
> > > c) lookup uprobe by inode+offset, if failed -- bail out (if succeeded,
> > > this could still be wrong)
> > > d) re-read vma->vm_lock_seq, if it changed, we started modifying/have
> > > already modified VMA, bail out
> > > e) re-read mm->mm_lock_seq, if that changed -- presume VMA got
> > > modified, bail out
> > >
> > > At this point we should have a guarantee that nothing about mm
> > > changed, nor that VMA started being modified during our speculative
> > > calculation+uprobe lookup. So if we found a valid uprobe, it must be a
> > > correct one that we need.
> > >
> > > Is that enough? Any holes in the approach? And thanks for thoroughly
> > > thinking about this, btw!
> >
> > Ok, with slight modifications to the details of the above (e.g., there
> > is actually no "odd means VMA is being modified" thing with
> > vm_lock_seq),
>
> Correct. Instead of that, (vma->vm_lock_seq == mm->mm_lock_seq)
> means your VMA is write-locked and is being modified.
>
> > I ended up with the implementation below. Basically we
> > validate that mm->mm_lock_seq didn't change and that vm_lock_seq !=
> > mm_lock_seq (which otherwise would mean "VMA is being modified").
>
> Validating that mm->mm_lock_seq did not change does not provide you
> with useful information. It only means that between the point where
> you recorded mm->mm_lock_seq and where you are checking it, there was
> an mmap_write_unlock() or mmap_write_downgrade() call. Your VMA might
> not have even been part of that modification for which mmap_lock was
> taken.
>
> In theory what you need is simpler (simplified code for explanation only):
>
> int vm_lock_seq = vma->vm_lock_seq;
> if (vm_lock_seq == mm->mm_lock_seq)
> goto bail_out; /* VMA is write-locked */
>
> /* copy required VMA attributes */
>
> if (vm_lock_seq != vma->vm_lock_seq)
> goto bail_out; /* VMA got write-locked */
>
> But this would require proper ACQUIRE/RELEASE semantics for
> vma->vm_lock_seq which is currently not there because all reads/writes
> to vma->vm_lock_seq that matter are done under vma->vm_lock->lock
> protection, so additional ordering is not required. If you decide to
> add that semantics for vma->vm_lock_seq, please make sure that
> pagefault path performance does not regress.
>
> > There is a possibility that vm_lock_seq == mm_lock_seq just by
> > accident, which is not a correctness problem, we'll just fallback to
> > locked implementation until something about VMA or mm_struct itself
> > changes. Which is fine, and if mm folks ever change this locking
> > schema, this might go away.
> >
> > If this seems on the right track, I think we can just move
> > mm_start_vma_specuation()/mm_end_vma_speculation() into
> > include/linux/mm.h.
> >
> > And after thinking a bit more about READ_ONCE() usage, I changed them
> > to data_race() to not trigger KCSAN warnings. Initially I kept
> > READ_ONCE() only around vma->vm_file access, but given we never change
> > it until vma is freed and reused (which would be prevented by
> > guard(rcu)), I dropped READ_ONCE() and only added data_race(). And
> > even data_race() is probably not necessary.
> >
> > Anyways, please see the patch below. Would be nice if mm folks
> > (Suren?) could confirm that this is not broken.
> >
> >
> >
> > Author: Andrii Nakryiko <andrii@kernel.org>
> > Date: Fri Aug 2 22:16:40 2024 -0700
> >
> > uprobes: add speculative lockless VMA to inode resolution
> >
> > Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
> >
> > diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
> > index 3de311c56d47..bee7a929ff02 100644
> > --- a/kernel/events/uprobes.c
> > +++ b/kernel/events/uprobes.c
> > @@ -2244,6 +2244,70 @@ static int is_trap_at_addr(struct mm_struct
> > *mm, unsigned long vaddr)
> > return is_trap_insn(&opcode);
> > }
> >
> > +#ifdef CONFIG_PER_VMA_LOCK
> > +static inline void mm_start_vma_speculation(struct mm_struct *mm, int
> > *mm_lock_seq)
> > +{
> > + *mm_lock_seq = smp_load_acquire(&mm->mm_lock_seq);
> > +}
> > +
> > +/* returns true if speculation was safe (no mm and vma modification
> > happened) */
> > +static inline bool mm_end_vma_speculation(struct vm_area_struct *vma,
> > int mm_lock_seq)
> > +{
> > + int mm_seq, vma_seq;
> > +
> > + mm_seq = smp_load_acquire(&vma->vm_mm->mm_lock_seq);
> > + vma_seq = READ_ONCE(vma->vm_lock_seq);
> > +
> > + return mm_seq == mm_lock_seq && vma_seq != mm_seq;
>
> After spending some time on this I think what you do here is
> semantically correct but sub-optimal.
Actually, after staring at this code some more I think
vma->vm_lock_seq not having proper ACQUIRE/RELEASE semantics would
bite us here as well. The entire find_active_uprobe_speculative()
might be executing while mmap_lock is write-locked (so, mm_seq ==
mm_lock_seq is satisfied) and we might miss that the VMA is locked due
to vma->vm_lock_seq read/write reordering. Though it's late and I
might have missed some memory barriers which would prevent this
scenario...
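Concretely, the kind of interleaving I'm worried about (hypothetical;
no claim about exactly which barrier is missing where):

/*
 * CPU 0: find_active_uprobe_speculative()	CPU 1: writer
 * ---------------------------------------	--------------------------------
 *						mmap_write_lock(mm)
 *						vma_start_write(vma):
 *						  vma->vm_lock_seq = mm_lock_seq
 *						...modifies vm_start/vm_pgoff...
 * mm_start_vma_speculation()
 * reads vm_pgoff/vm_start (new or torn values)
 * reads vma->vm_lock_seq -- without ACQUIRE/RELEASE this load and the
 *			     writer's store can be reordered against the
 *			     field accesses, so the old (unlocked) value
 *			     may still be observed => vma_seq != mm_seq
 * mm_lock_seq is unchanged (mmap_write_unlock() has not run yet), so
 * mm_end_vma_speculation() passes and we use a half-updated VMA
 */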
> This check means that there was no call to
> mmap_write_unlock()/mmap_write_downgrade() since
> mm_start_vma_speculation() and the vma is not currently locked. To
> unlock a write-locked VMA you do need to call
> mmap_write_unlock()/mmap_write_downgrade(), so I think this check would
> guarantee that your vma was not locked and modified from under us.
> However this will also trigger false positives if
> mmap_write_unlock()/mmap_write_downgrade() was called but the vma you
> are using was never locked. So, it will bail out more than necessary.
> Maybe it's ok?
>
> > +}
> > +
> > +static struct uprobe *find_active_uprobe_speculative(unsigned long bp_vaddr)
> > +{
> > + const vm_flags_t flags = VM_HUGETLB | VM_MAYEXEC | VM_MAYSHARE;
> > + struct mm_struct *mm = current->mm;
> > + struct uprobe *uprobe;
> > + struct vm_area_struct *vma;
> > + struct file *vm_file;
> > + struct inode *vm_inode;
> > + unsigned long vm_pgoff, vm_start;
> > + int mm_lock_seq;
> > + loff_t offset;
> > +
> > + guard(rcu)();
> > +
> > + mm_start_vma_speculation(mm, &mm_lock_seq);
> > +
> > + vma = vma_lookup(mm, bp_vaddr);
> > + if (!vma)
> > + return NULL;
> > +
> > + vm_file = data_race(vma->vm_file);
> > + if (!vm_file || (vma->vm_flags & flags) != VM_MAYEXEC)
> > + return NULL;
> > +
> > + vm_inode = data_race(vm_file->f_inode);
> > + vm_pgoff = data_race(vma->vm_pgoff);
> > + vm_start = data_race(vma->vm_start);
> > +
> > + offset = (loff_t)(vm_pgoff << PAGE_SHIFT) + (bp_vaddr - vm_start);
> > + uprobe = find_uprobe_rcu(vm_inode, offset);
> > + if (!uprobe)
> > + return NULL;
> > +
> > + /* now double check that nothing about MM and VMA changed */
> > + if (!mm_end_vma_speculation(vma, mm_lock_seq))
> > + return NULL;
> > +
> > + /* happy case, we speculated successfully */
> > + return uprobe;
> > +}
> > +#else /* !CONFIG_PER_VMA_LOCK */
> > +static struct uprobe *find_active_uprobe_speculative(unsigned long bp_vaddr)
> > +{
> > + return NULL;
> > +}
> > +#endif /* CONFIG_PER_VMA_LOCK */
> > +
> > /* assumes being inside RCU protected region */
> > static struct uprobe *find_active_uprobe_rcu(unsigned long bp_vaddr,
> > int *is_swbp)
> > {
> > @@ -2251,6 +2315,10 @@ static struct uprobe
> > *find_active_uprobe_rcu(unsigned long bp_vaddr, int *is_swb
> > struct uprobe *uprobe = NULL;
> > struct vm_area_struct *vma;
> >
> > + uprobe = find_active_uprobe_speculative(bp_vaddr);
> > + if (uprobe)
> > + return uprobe;
> > +
> > mmap_read_lock(mm);
> > vma = vma_lookup(mm, bp_vaddr);
> > if (vma) {
> > diff --git a/kernel/fork.c b/kernel/fork.c
> > index cc760491f201..211a84ee92b4 100644
> > --- a/kernel/fork.c
> > +++ b/kernel/fork.c
> > @@ -3160,7 +3160,7 @@ void __init proc_caches_init(void)
> > NULL);
> > files_cachep = kmem_cache_create("files_cache",
> > sizeof(struct files_struct), 0,
> > - SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_ACCOUNT,
> > +                        SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_ACCOUNT|SLAB_TYPESAFE_BY_RCU,
> > NULL);
> > fs_cachep = kmem_cache_create("fs_cache",
> > sizeof(struct fs_struct), 0,
> >
> >
> > >
> > > P.S. This is basically the last big blocker towards linear uprobes
> > > scalability with the number of active CPUs. I have
> > > uretprobe+SRCU+timeout implemented and it seems to work fine, will
> > > post soon-ish.
> > >
> > > P.P.S Also, funny enough, below was another big scalability limiter
> > > (and the last one) :) I'm not sure if we can just drop it, or I should
> > > use per-CPU counter, but with the below change and speculative VMA
> > > lookup (however buggy, works ok for benchmarking), I finally get
> > > linear scaling of uprobe triggering throughput with number of CPUs. We
> > > are very close.
> > >
> > > diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c
> > > index f7443e996b1b..64c2bc316a08 100644
> > > --- a/kernel/trace/trace_uprobe.c
> > > +++ b/kernel/trace/trace_uprobe.c
> > > @@ -1508,7 +1508,7 @@ static int uprobe_dispatcher(struct
> > > uprobe_consumer *con, struct pt_regs *regs)
> > > int ret = 0;
> > >
> > > tu = container_of(con, struct trace_uprobe, consumer);
> > > - tu->nhit++;
> > > + //tu->nhit++;
> > >
> > > udd.tu = tu;
> > > udd.bp_addr = instruction_pointer(regs);
> > >
> > >
> > > > > +
> > > > > + /* happy case, we speculated successfully */
> > > > > + rcu_read_unlock();
> > > > > + return uprobe;
> > > > > +
> > > > > +retry_with_lock:
> > > > > + rcu_read_unlock();
> > > > > + uprobe = NULL;
> > > > > +#endif
> > > > > +
> > > > > mmap_read_lock(mm);
> > > > > vma = vma_lookup(mm, bp_vaddr);
> > > > > if (vma) {
> > > > > diff --git a/kernel/fork.c b/kernel/fork.c
> > > > > index cc760491f201..211a84ee92b4 100644
> > > > > --- a/kernel/fork.c
> > > > > +++ b/kernel/fork.c
> > > > > @@ -3160,7 +3160,7 @@ void __init proc_caches_init(void)
> > > > > NULL);
> > > > > files_cachep = kmem_cache_create("files_cache",
> > > > > sizeof(struct files_struct), 0,
> > > > > - SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_ACCOUNT,
> > > > > + SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_ACCOUNT|SLAB_TYPESAFE_BY_RCU,
> > > > > NULL);
> > > > > fs_cachep = kmem_cache_create("fs_cache",
> > > > > sizeof(struct fs_struct), 0,
^ permalink raw reply [flat|nested] 84+ messages in thread
* Re: [PATCH 00/10] perf/uprobe: Optimize uprobes
2024-08-07 5:13 ` Suren Baghdasaryan
@ 2024-08-07 17:49 ` Andrii Nakryiko
2024-08-07 18:04 ` Suren Baghdasaryan
0 siblings, 1 reply; 84+ messages in thread
From: Andrii Nakryiko @ 2024-08-07 17:49 UTC (permalink / raw)
To: Suren Baghdasaryan
Cc: Peter Zijlstra, rostedt, Matthew Wilcox, Paul E. McKenney,
Masami Hiramatsu, mingo, andrii, linux-kernel, oleg, jolsa, clm,
bpf
On Tue, Aug 6, 2024 at 10:13 PM Suren Baghdasaryan <surenb@google.com> wrote:
>
> On Tue, Aug 6, 2024 at 6:36 PM Suren Baghdasaryan <surenb@google.com> wrote:
> >
> > On Mon, Aug 5, 2024 at 9:08 PM Andrii Nakryiko
> > <andrii.nakryiko@gmail.com> wrote:
> > >
> > > On Sun, Aug 4, 2024 at 4:22 PM Andrii Nakryiko
> > > <andrii.nakryiko@gmail.com> wrote:
> > > >
> > > > On Sat, Aug 3, 2024 at 1:53 AM Peter Zijlstra <peterz@infradead.org> wrote:
> > > > >
> > > > > On Fri, Aug 02, 2024 at 10:47:15PM -0700, Andrii Nakryiko wrote:
> > > > >
> > > > > > Is there any reason why the approach below won't work?
> > > > >
> > > > > > diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
> > > > > > index 8be9e34e786a..e21b68a39f13 100644
> > > > > > --- a/kernel/events/uprobes.c
> > > > > > +++ b/kernel/events/uprobes.c
> > > > > > @@ -2251,6 +2251,52 @@ static struct uprobe
> > > > > > *find_active_uprobe_rcu(unsigned long bp_vaddr, int *is_swb
> > > > > > struct uprobe *uprobe = NULL;
> > > > > > struct vm_area_struct *vma;
> > > > > >
> > > > > > +#ifdef CONFIG_PER_VMA_LOCK
> > > > > > + vm_flags_t flags = VM_HUGETLB | VM_MAYEXEC | VM_MAYSHARE, vm_flags;
> > > > > > + struct file *vm_file;
> > > > > > + struct inode *vm_inode;
> > > > > > + unsigned long vm_pgoff, vm_start, vm_end;
> > > > > > + int vm_lock_seq;
> > > > > > + loff_t offset;
> > > > > > +
> > > > > > + rcu_read_lock();
> > > > > > +
> > > > > > + vma = vma_lookup(mm, bp_vaddr);
> > > > > > + if (!vma)
> > > > > > + goto retry_with_lock;
> > > > > > +
> > > > > > + vm_lock_seq = READ_ONCE(vma->vm_lock_seq);
> > > > >
> > > > > So vma->vm_lock_seq is only updated on vma_start_write()
> > > >
> > > > yep, I've looked a bit more at the implementation now
> > > >
> > > > >
> > > > > > +
> > > > > > + vm_file = READ_ONCE(vma->vm_file);
> > > > > > + vm_flags = READ_ONCE(vma->vm_flags);
> > > > > > + if (!vm_file || (vm_flags & flags) != VM_MAYEXEC)
> > > > > > + goto retry_with_lock;
> > > > > > +
> > > > > > + vm_inode = READ_ONCE(vm_file->f_inode);
> > > > > > + vm_pgoff = READ_ONCE(vma->vm_pgoff);
> > > > > > + vm_start = READ_ONCE(vma->vm_start);
> > > > > > + vm_end = READ_ONCE(vma->vm_end);
> > > > >
> > > > > None of those are written with WRITE_ONCE(), so this buys you nothing.
> > > > > Compiler could be updating them one byte at a time while you load some
> > > > > franken-update.
> > > > >
> > > > > Also, if you're in the middle of split_vma() you might not get a
> > > > > consistent set.
> > > >
> > > > I used READ_ONCE() only to prevent the compiler from re-reading those
> > > > values. We assume those values are garbage anyways and double-check
> > > > everything, so lack of WRITE_ONCE doesn't matter. Same for
> > > > inconsistency if we are in the middle of split_vma().
> > > >
> > > > We use the result of all this speculative calculation only if we find
> > > > a valid uprobe (which could be a false positive) *and* if we detect
> > > > that nothing about VMA changed (which is what I got wrong, but
> > > > honestly I was actually betting on others to help me get this right
> > > > anyways).
> > > >
> > > > >
> > > > > > + if (bp_vaddr < vm_start || bp_vaddr >= vm_end)
> > > > > > + goto retry_with_lock;
> > > > > > +
> > > > > > + offset = (loff_t)(vm_pgoff << PAGE_SHIFT) + (bp_vaddr - vm_start);
> > > > > > + uprobe = find_uprobe_rcu(vm_inode, offset);
> > > > > > + if (!uprobe)
> > > > > > + goto retry_with_lock;
> > > > > > +
> > > > > > + /* now double check that nothing about VMA changed */
> > > > > > + if (vm_lock_seq != READ_ONCE(vma->vm_lock_seq))
> > > > > > + goto retry_with_lock;
> > > > >
> > > > > Since vma->vma_lock_seq is only ever updated at vma_start_write() you're
> > > > > checking you're in or after the same modification cycle.
> > > > >
> > > > > The point of sequence locks is to check you *IN* a modification cycle
> > > > > and retry if you are. You're now explicitly continuing if you're in a
> > > > > modification.
> > > > >
> > > > > You really need:
> > > > >
> > > > > seq++;
> > > > > wmb();
> > > > >
> > > > > ... do modification
> > > > >
> > > > > wmb();
> > > > > seq++;
> > > > >
> > > > > vs
> > > > >
> > > > > do {
> > > > > s = READ_ONCE(seq) & ~1;
> > > > > rmb();
> > > > >
> > > > > ... read stuff
> > > > >
> > > > > } while (rmb(), seq != s);
> > > > >
> > > > >
> > > > > The thing to note is that seq will be odd while inside a modification
> > > > > and even outside, further if the pre and post seq are both even but not
> > > > > identical, you've crossed a modification and also need to retry.
> > > > >
> > > >
> > > > Ok, I don't think I got everything you have written above, sorry. But
> > > > let me explain what I think I need to do and please correct what I
> > > > (still) got wrong.
> > > >
> > > > a) before starting speculation,
> > > > a.1) read and remember current->mm->mm_lock_seq (using
> > > > smp_load_acquire(), right?)
> > > > a.2) read vma->vm_lock_seq (using smp_load_acquire() I presume)
> > > > a.3) if vm_lock_seq is odd, we are already modifying VMA, so bail
> > > > out, try with proper mmap_lock
> > > > b) proceed with the inode pointer fetch and offset calculation as I've coded it
> > > > c) lookup uprobe by inode+offset, if failed -- bail out (if succeeded,
> > > > this could still be wrong)
> > > > d) re-read vma->vm_lock_seq, if it changed, we started modifying/have
> > > > already modified VMA, bail out
> > > > e) re-read mm->mm_lock_seq, if that changed -- presume VMA got
> > > > modified, bail out
> > > >
> > > > At this point we should have a guarantee that nothing about mm
> > > > changed, nor that VMA started being modified during our speculative
> > > > calculation+uprobe lookup. So if we found a valid uprobe, it must be a
> > > > correct one that we need.
> > > >
> > > > Is that enough? Any holes in the approach? And thanks for thoroughly
> > > > thinking about this, btw!
> > >
> > > Ok, with slight modifications to the details of the above (e.g., there
> > > is actually no "odd means VMA is being modified" thing with
> > > vm_lock_seq),
> >
> > Correct. Instead of that (vma->vm_lock_seq == mm->mm_lock_seq)
> > means your VMA is write-locked and is being modified.
> >
> > > I ended up with the implementation below. Basically we
> > > validate that mm->mm_lock_seq didn't change and that vm_lock_seq !=
> > > mm_lock_seq (which otherwise would mean "VMA is being modified").
> >
> > Validating that mm->mm_lock_seq did not change does not provide you
> > with useful information. It only means that between the point where
> > you recorded mm->mm_lock_seq and where you are checking it, there was
> > an mmap_write_unlock() or mmap_write_downgrade() call. Your VMA might
> > not have even been part of that modification for which mmap_lock was
> > taken.
> >
> > In theory what you need is simpler (simplified code for explanation only):
> >
> > int vm_lock_seq = vma->vm_lock_seq;
> > if (vm_lock_seq == mm->mm_lock_seq)
> > goto bail_out; /* VMA is write-locked */
> >
> > /* copy required VMA attributes */
> >
> > if (vm_lock_seq != vma->vm_lock_seq)
> > goto bail_out; /* VMA got write-locked */
> >
> > But this would require proper ACQUIRE/RELEASE semantics for
> > vma->vm_lock_seq which is currently not there because all reads/writes
> > to vma->vm_lock_seq that matter are done under vma->vm_lock->lock
> > protection, so additional ordering is not required. If you decide to
> > add that semantics for vma->vm_lock_seq, please make sure that
> > pagefault path performance does not regress.
> >
> > > There is a possibility that vm_lock_seq == mm_lock_seq just by
> > > accident, which is not a correctness problem, we'll just fallback to
> > > locked implementation until something about VMA or mm_struct itself
> > > changes. Which is fine, and if mm folks ever change this locking
> > > schema, this might go away.
> > >
> > > If this seems on the right track, I think we can just move
> > > mm_start_vma_specuation()/mm_end_vma_speculation() into
> > > include/linux/mm.h.
> > >
> > > And after thinking a bit more about READ_ONCE() usage, I changed them
> > > to data_race() to not trigger KCSAN warnings. Initially I kept
> > > READ_ONCE() only around vma->vm_file access, but given we never change
> > > it until vma is freed and reused (which would be prevented by
> > > guard(rcu)), I dropped READ_ONCE() and only added data_race(). And
> > > even data_race() is probably not necessary.
> > >
> > > Anyways, please see the patch below. Would be nice if mm folks
> > > (Suren?) could confirm that this is not broken.
> > >
> > >
> > >
> > > Author: Andrii Nakryiko <andrii@kernel.org>
> > > Date: Fri Aug 2 22:16:40 2024 -0700
> > >
> > > uprobes: add speculative lockless VMA to inode resolution
> > >
> > > Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
> > >
> > > diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
> > > index 3de311c56d47..bee7a929ff02 100644
> > > --- a/kernel/events/uprobes.c
> > > +++ b/kernel/events/uprobes.c
> > > @@ -2244,6 +2244,70 @@ static int is_trap_at_addr(struct mm_struct
> > > *mm, unsigned long vaddr)
> > > return is_trap_insn(&opcode);
> > > }
> > >
> > > +#ifdef CONFIG_PER_VMA_LOCK
> > > +static inline void mm_start_vma_speculation(struct mm_struct *mm, int
> > > *mm_lock_seq)
> > > +{
> > > + *mm_lock_seq = smp_load_acquire(&mm->mm_lock_seq);
> > > +}
> > > +
> > > +/* returns true if speculation was safe (no mm and vma modification
> > > happened) */
> > > +static inline bool mm_end_vma_speculation(struct vm_area_struct *vma,
> > > int mm_lock_seq)
> > > +{
> > > + int mm_seq, vma_seq;
> > > +
> > > + mm_seq = smp_load_acquire(&vma->vm_mm->mm_lock_seq);
> > > + vma_seq = READ_ONCE(vma->vm_lock_seq);
> > > +
> > > + return mm_seq == mm_lock_seq && vma_seq != mm_seq;
> >
> > After spending some time on this I think what you do here is
> > semantically correct but sub-optimal.
>
Yes, requiring that mm_lock_seq doesn't change is too pessimistic, but
relative to the frequency of uprobe/uretprobe triggering (and how fast
the lookup is) this won't matter much. The absolute majority of uprobe
lookups will manage to succeed while none of mm's VMAs change at all,
so I felt that is OK, at least for starters.

My goal is to minimize intrusion into purely mm-related code; this
whole uprobe work is already pretty large and sprawling, and I don't
want to go on another quest to change VMA locking semantics if I don't
absolutely have to :) But see below for adjusted logic based on your
comments.
> Actually, after staring at this code some more I think
> vma->vm_lock_seq not having proper ACQUIRE/RELEASE semantics would
> bite us here as well. The entire find_active_uprobe_speculative()
> might be executing while mmap_lock is write-locked (so, mm_seq ==
> mm_lock_seq is satisfied) and we might miss that the VMA is locked due
> to vma->vm_lock_seq read/write reordering. Though it's late and I
> might have missed some memory barriers which would prevent this
> scenario...
So, please bear with me if it's a stupid question. But don't all
locks have implicit ACQUIRE and RELEASE semantics already? At least
that's my reading of Documentation/memory-barriers.txt.

So with that, wouldn't it be OK to just change
READ_ONCE(vma->vm_lock_seq) to smp_load_acquire(&vma->vm_lock_seq) and
mitigate the issue you pointed out?
So maybe something like below:

  rcu_read_lock()

  vma = find_vma(...)
  if (!vma) /* bail */

  vm_lock_seq = smp_load_acquire(&vma->vm_lock_seq);
  mm_lock_seq = smp_load_acquire(&vma->vm_mm->mm_lock_seq);
  /* I think vm_lock_seq has to be loaded first to avoid the race */
  if (mm_lock_seq == vm_lock_seq)
          /* bail, vma is write-locked */

  ... perform uprobe lookup logic based on vma->vm_file->f_inode ...

  if (smp_load_acquire(&vma->vm_lock_seq) != vm_lock_seq)
          /* bail, VMA might have changed */

Thoughts?
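
To make that concrete, here's a minimal C sketch of what I have in
mind, reusing the flag check and find_uprobe_rcu() from the earlier
patch. This assumes the ACQUIRE load on vma->vm_lock_seq provides
enough ordering, which is exactly the open question, and it drops the
data_race() annotations for brevity:

  static struct uprobe *find_active_uprobe_speculative(unsigned long bp_vaddr)
  {
          const vm_flags_t flags = VM_HUGETLB | VM_MAYEXEC | VM_MAYSHARE;
          struct mm_struct *mm = current->mm;
          struct vm_area_struct *vma;
          struct uprobe *uprobe;
          int vm_lock_seq, mm_lock_seq;
          loff_t offset;

          guard(rcu)();

          vma = vma_lookup(mm, bp_vaddr);
          if (!vma)
                  return NULL;

          /* read the per-VMA seq before the mm-wide one to avoid the race */
          vm_lock_seq = smp_load_acquire(&vma->vm_lock_seq);
          mm_lock_seq = smp_load_acquire(&vma->vm_mm->mm_lock_seq);
          if (mm_lock_seq == vm_lock_seq)
                  return NULL;    /* VMA is (or looks) write-locked, bail */

          if (!vma->vm_file || (vma->vm_flags & flags) != VM_MAYEXEC)
                  return NULL;

          offset = (loff_t)(vma->vm_pgoff << PAGE_SHIFT) + (bp_vaddr - vma->vm_start);
          uprobe = find_uprobe_rcu(vma->vm_file->f_inode, offset);
          if (!uprobe)
                  return NULL;

          /* re-check that the VMA was not write-locked in the meantime */
          if (smp_load_acquire(&vma->vm_lock_seq) != vm_lock_seq)
                  return NULL;

          return uprobe;
  }

If the VMA did get write-locked anywhere in between, the final re-read
of vm_lock_seq should catch it and we fall back to the mmap_read_lock()
path as before.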
>
> > This check means that there was no call to
> > mmap_write_unlock()/mmap_write_downgrade() since
> > mm_start_vma_speculation() and the vma is not currently locked. To
> > unlock a write-locked VMA you do need to call
> > mmap_write_unlock()/mmap_write_downgrade(), so I think this check would
> > guarantee that your vma was not locked and modified from under us.
> > However this will also trigger false positives if
> > mmap_write_unlock()/mmap_write_downgrade() was called but the vma you
> > are using was never locked. So, it will bail out more than necessary.
> > Maybe it's ok?
> >
> > > +}
> > > +
[...]
^ permalink raw reply [flat|nested] 84+ messages in thread
* Re: [PATCH 00/10] perf/uprobe: Optimize uprobes
2024-08-07 17:49 ` Andrii Nakryiko
@ 2024-08-07 18:04 ` Suren Baghdasaryan
2024-08-07 18:30 ` Andrii Nakryiko
2024-08-07 18:33 ` Suren Baghdasaryan
0 siblings, 2 replies; 84+ messages in thread
From: Suren Baghdasaryan @ 2024-08-07 18:04 UTC (permalink / raw)
To: Andrii Nakryiko
Cc: Peter Zijlstra, rostedt, Matthew Wilcox, Paul E. McKenney,
Masami Hiramatsu, mingo, andrii, linux-kernel, oleg, jolsa, clm,
bpf
On Wed, Aug 7, 2024 at 5:49 PM Andrii Nakryiko
<andrii.nakryiko@gmail.com> wrote:
>
> On Tue, Aug 6, 2024 at 10:13 PM Suren Baghdasaryan <surenb@google.com> wrote:
> >
> > On Tue, Aug 6, 2024 at 6:36 PM Suren Baghdasaryan <surenb@google.com> wrote:
> > >
> > > On Mon, Aug 5, 2024 at 9:08 PM Andrii Nakryiko
> > > <andrii.nakryiko@gmail.com> wrote:
> > > >
> > > > On Sun, Aug 4, 2024 at 4:22 PM Andrii Nakryiko
> > > > <andrii.nakryiko@gmail.com> wrote:
> > > > >
> > > > > On Sat, Aug 3, 2024 at 1:53 AM Peter Zijlstra <peterz@infradead.org> wrote:
> > > > > >
> > > > > > On Fri, Aug 02, 2024 at 10:47:15PM -0700, Andrii Nakryiko wrote:
> > > > > >
> > > > > > > Is there any reason why the approach below won't work?
> > > > > >
> > > > > > > diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
> > > > > > > index 8be9e34e786a..e21b68a39f13 100644
> > > > > > > --- a/kernel/events/uprobes.c
> > > > > > > +++ b/kernel/events/uprobes.c
> > > > > > > @@ -2251,6 +2251,52 @@ static struct uprobe
> > > > > > > *find_active_uprobe_rcu(unsigned long bp_vaddr, int *is_swb
> > > > > > > struct uprobe *uprobe = NULL;
> > > > > > > struct vm_area_struct *vma;
> > > > > > >
> > > > > > > +#ifdef CONFIG_PER_VMA_LOCK
> > > > > > > + vm_flags_t flags = VM_HUGETLB | VM_MAYEXEC | VM_MAYSHARE, vm_flags;
> > > > > > > + struct file *vm_file;
> > > > > > > + struct inode *vm_inode;
> > > > > > > + unsigned long vm_pgoff, vm_start, vm_end;
> > > > > > > + int vm_lock_seq;
> > > > > > > + loff_t offset;
> > > > > > > +
> > > > > > > + rcu_read_lock();
> > > > > > > +
> > > > > > > + vma = vma_lookup(mm, bp_vaddr);
> > > > > > > + if (!vma)
> > > > > > > + goto retry_with_lock;
> > > > > > > +
> > > > > > > + vm_lock_seq = READ_ONCE(vma->vm_lock_seq);
> > > > > >
> > > > > > So vma->vm_lock_seq is only updated on vma_start_write()
> > > > >
> > > > > yep, I've looked a bit more at the implementation now
> > > > >
> > > > > >
> > > > > > > +
> > > > > > > + vm_file = READ_ONCE(vma->vm_file);
> > > > > > > + vm_flags = READ_ONCE(vma->vm_flags);
> > > > > > > + if (!vm_file || (vm_flags & flags) != VM_MAYEXEC)
> > > > > > > + goto retry_with_lock;
> > > > > > > +
> > > > > > > + vm_inode = READ_ONCE(vm_file->f_inode);
> > > > > > > + vm_pgoff = READ_ONCE(vma->vm_pgoff);
> > > > > > > + vm_start = READ_ONCE(vma->vm_start);
> > > > > > > + vm_end = READ_ONCE(vma->vm_end);
> > > > > >
> > > > > > None of those are written with WRITE_ONCE(), so this buys you nothing.
> > > > > > Compiler could be updating them one byte at a time while you load some
> > > > > > franken-update.
> > > > > >
> > > > > > Also, if you're in the middle of split_vma() you might not get a
> > > > > > consistent set.
> > > > >
> > > > > I used READ_ONCE() only to prevent the compiler from re-reading those
> > > > > values. We assume those values are garbage anyways and double-check
> > > > > everything, so lack of WRITE_ONCE doesn't matter. Same for
> > > > > inconsistency if we are in the middle of split_vma().
> > > > >
> > > > > We use the result of all this speculative calculation only if we find
> > > > > a valid uprobe (which could be a false positive) *and* if we detect
> > > > > that nothing about VMA changed (which is what I got wrong, but
> > > > > honestly I was actually betting on others to help me get this right
> > > > > anyways).
> > > > >
> > > > > >
> > > > > > > + if (bp_vaddr < vm_start || bp_vaddr >= vm_end)
> > > > > > > + goto retry_with_lock;
> > > > > > > +
> > > > > > > + offset = (loff_t)(vm_pgoff << PAGE_SHIFT) + (bp_vaddr - vm_start);
> > > > > > > + uprobe = find_uprobe_rcu(vm_inode, offset);
> > > > > > > + if (!uprobe)
> > > > > > > + goto retry_with_lock;
> > > > > > > +
> > > > > > > + /* now double check that nothing about VMA changed */
> > > > > > > + if (vm_lock_seq != READ_ONCE(vma->vm_lock_seq))
> > > > > > > + goto retry_with_lock;
> > > > > >
> > > > > > Since vma->vma_lock_seq is only ever updated at vma_start_write() you're
> > > > > > checking you're in or after the same modification cycle.
> > > > > >
> > > > > > The point of sequence locks is to check you *IN* a modification cycle
> > > > > > and retry if you are. You're now explicitly continuing if you're in a
> > > > > > modification.
> > > > > >
> > > > > > You really need:
> > > > > >
> > > > > > seq++;
> > > > > > wmb();
> > > > > >
> > > > > > ... do modification
> > > > > >
> > > > > > wmb();
> > > > > > seq++;
> > > > > >
> > > > > > vs
> > > > > >
> > > > > > do {
> > > > > > s = READ_ONCE(seq) & ~1;
> > > > > > rmb();
> > > > > >
> > > > > > ... read stuff
> > > > > >
> > > > > > } while (rmb(), seq != s);
> > > > > >
> > > > > >
> > > > > > The thing to note is that seq will be odd while inside a modification
> > > > > > and even outside, further if the pre and post seq are both even but not
> > > > > > identical, you've crossed a modification and also need to retry.
> > > > > >
> > > > >
> > > > > Ok, I don't think I got everything you have written above, sorry. But
> > > > > let me explain what I think I need to do and please correct what I
> > > > > (still) got wrong.
> > > > >
> > > > > a) before starting speculation,
> > > > > a.1) read and remember current->mm->mm_lock_seq (using
> > > > > smp_load_acquire(), right?)
> > > > > a.2) read vma->vm_lock_seq (using smp_load_acquire() I presume)
> > > > > a.3) if vm_lock_seq is odd, we are already modifying VMA, so bail
> > > > > out, try with proper mmap_lock
> > > > > b) proceed with the inode pointer fetch and offset calculation as I've coded it
> > > > > c) lookup uprobe by inode+offset, if failed -- bail out (if succeeded,
> > > > > this could still be wrong)
> > > > > d) re-read vma->vm_lock_seq, if it changed, we started modifying/have
> > > > > already modified VMA, bail out
> > > > > e) re-read mm->mm_lock_seq, if that changed -- presume VMA got
> > > > > modified, bail out
> > > > >
> > > > > At this point we should have a guarantee that nothing about mm
> > > > > changed, nor that VMA started being modified during our speculative
> > > > > calculation+uprobe lookup. So if we found a valid uprobe, it must be a
> > > > > correct one that we need.
> > > > >
> > > > > Is that enough? Any holes in the approach? And thanks for thoroughly
> > > > > thinking about this, btw!
> > > >
> > > > Ok, with slight modifications to the details of the above (e.g., there
> > > > is actually no "odd means VMA is being modified" thing with
> > > > vm_lock_seq),
> > >
> > > Correct. Instead of that (vm_lock_seq->vm_lock_seq == mm->mm_lock_seq)
> > > means your VMA is write-locked and is being modified.
> > >
> > > > I ended up with the implementation below. Basically we
> > > > validate that mm->mm_lock_seq didn't change and that vm_lock_seq !=
> > > > mm_lock_seq (which otherwise would mean "VMA is being modified").
> > >
> > > Validating that mm->mm_lock_seq did not change does not provide you
> > > with useful information. It only means that between the point where
> > > you recorded mm->mm_lock_seq and where you are checking it, there was
> > > an mmap_write_unlock() or mmap_write_downgrade() call. Your VMA might
> > > not have even been part of that modification for which mmap_lock was
> > > taken.
> > >
> > > In theory what you need is simpler (simplified code for explanation only):
> > >
> > > int vm_lock_seq = vma->vm_lock_seq;
> > > if (vm_lock_seq == mm->mm_lock_seq)
> > > goto bail_out; /* VMA is write-locked */
> > >
> > > /* copy required VMA attributes */
> > >
> > > if (vm_lock_seq != vma->vm_lock_seq)
> > > goto bail_out; /* VMA got write-locked */
> > >
> > > But this would require proper ACQUIRE/RELEASE semantics for
> > > vma->vm_lock_seq which is currently not there because all reads/writes
> > > to vma->vm_lock_seq that matter are done under vma->vm_lock->lock
> > > protection, so additional ordering is not required. If you decide to
> > > add that semantics for vma->vm_lock_seq, please make sure that
> > > pagefault path performance does not regress.
> > >
> > > > There is a possibility that vm_lock_seq == mm_lock_seq just by
> > > > accident, which is not a correctness problem, we'll just fallback to
> > > > locked implementation until something about VMA or mm_struct itself
> > > > changes. Which is fine, and if mm folks ever change this locking
> > > > schema, this might go away.
> > > >
> > > > If this seems on the right track, I think we can just move
> > > > mm_start_vma_specuation()/mm_end_vma_speculation() into
> > > > include/linux/mm.h.
> > > >
> > > > And after thinking a bit more about READ_ONCE() usage, I changed them
> > > > to data_race() to not trigger KCSAN warnings. Initially I kept
> > > > READ_ONCE() only around vma->vm_file access, but given we never change
> > > > it until vma is freed and reused (which would be prevented by
> > > > guard(rcu)), I dropped READ_ONCE() and only added data_race(). And
> > > > even data_race() is probably not necessary.
> > > >
> > > > Anyways, please see the patch below. Would be nice if mm folks
> > > > (Suren?) could confirm that this is not broken.
> > > >
> > > >
> > > >
> > > > Author: Andrii Nakryiko <andrii@kernel.org>
> > > > Date: Fri Aug 2 22:16:40 2024 -0700
> > > >
> > > > uprobes: add speculative lockless VMA to inode resolution
> > > >
> > > > Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
> > > >
> > > > diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
> > > > index 3de311c56d47..bee7a929ff02 100644
> > > > --- a/kernel/events/uprobes.c
> > > > +++ b/kernel/events/uprobes.c
> > > > @@ -2244,6 +2244,70 @@ static int is_trap_at_addr(struct mm_struct
> > > > *mm, unsigned long vaddr)
> > > > return is_trap_insn(&opcode);
> > > > }
> > > >
> > > > +#ifdef CONFIG_PER_VMA_LOCK
> > > > +static inline void mm_start_vma_speculation(struct mm_struct *mm, int
> > > > *mm_lock_seq)
> > > > +{
> > > > + *mm_lock_seq = smp_load_acquire(&mm->mm_lock_seq);
> > > > +}
> > > > +
> > > > +/* returns true if speculation was safe (no mm and vma modification
> > > > happened) */
> > > > +static inline bool mm_end_vma_speculation(struct vm_area_struct *vma,
> > > > int mm_lock_seq)
> > > > +{
> > > > + int mm_seq, vma_seq;
> > > > +
> > > > + mm_seq = smp_load_acquire(&vma->vm_mm->mm_lock_seq);
> > > > + vma_seq = READ_ONCE(vma->vm_lock_seq);
> > > > +
> > > > + return mm_seq == mm_lock_seq && vma_seq != mm_seq;
> > >
> > > After spending some time on this I think what you do here is
> > > semantically correct but sub-optimal.
> >
>
> Yes, requiring that mm_lock_seq doesn't change is too pessimistic, but
> relative to the frequency of uprobe/uretprobe triggering (and how fast
> the lookup is) this won't matter much. Absolute majority of uprobe
> lookups will manage to succeed while none of mm's VMAs change at all.
> So I felt like that's ok, at least for starters.
>
> My goal is to minimize intrusion into purely mm-related code, this
> whole uprobe work is already pretty large and sprawling, I don't want
> to go on another quest to change locking semantics for vma, if I don't
> absolutely have to :) But see below for adjusted logic based on your
> comments.
>
> > Actually, after staring at this code some more I think
> > vma->vm_lock_seq not having proper ACQUIRE/RELEASE semantics would
> > bite us here as well. The entire find_active_uprobe_speculative()
> > might be executing while mmap_lock is write-locked (so, mm_seq ==
> > mm_lock_seq is satisfied) and we might miss that the VMA is locked due
> > to vma->vm_lock_seq read/write reordering. Though it's late and I
> > might have missed some memory barriers which would prevent this
> > scenario...
>
> So, please bear with me, if it's a stupid question. But don't all
> locks have implicit ACQUIRE and RELEASE semantics already? At least
> that's my reading of Documentation/memory-barriers.txt.
>
> So with that, wouldn't it be OK to just change
> READ_ONCE(vma->vm_lock_seq) to smp_load_acquire(&vma->vm_lock_seq) and
> mitigate the issue you pointed out?
>
>
> So maybe something like below:
>
> rcu_read_lock()
>
> vma = find_vma(...)
> if (!vma) /* bail */
>
> vm_lock_seq = smp_load_acquire(&vma->vm_lock_seq);
> mm_lock_seq = smp_load_acquire(&vma->mm->mm_lock_seq);
> /* I think vm_lock has to be acquired first to avoid the race */
> if (mm_lock_seq == vm_lock_seq)
> /* bail, vma is write-locked */
>
> ... perform uprobe lookup logic based on vma->vm_file->f_inode ...
>
> if (smp_load_acquire(&vma->vm_lock_seq) != vm_lock_seq)
> /* bail, VMA might have changed */
>
> Thoughts?
Hi Andrii,
I've prepared a quick patch following Peter's suggestion in [1] to
make mm->mm_lock_seq a proper seqcount and will post it shortly as an
RFC so you can try it out; I think that would be a much cleaner
solution. I'll share the link here once it's out.
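
For context, a proper seqcount gives you the usual read-side pattern
below. This is purely illustrative (it assumes mm->mm_lock_seq becomes
a seqcount_t; the RFC wraps this differently):

  unsigned int seq;

  /* illustrative only: mm->mm_lock_seq as a seqcount_t */
  seq = read_seqcount_begin(&mm->mm_lock_seq);

  /* ... speculatively look up the VMA and resolve the uprobe ... */

  if (read_seqcount_retry(&mm->mm_lock_seq, seq)) {
          /* a writer ran in between; fall back to mmap_read_lock() */
  }

Note that read_seqcount_begin() will spin on an active writer; for the
uprobe fast path you'd rather detect that and bail to the locked slow
path immediately, which is why the RFC wraps this in helpers that can
fail instead of spinning.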
Thanks,
Suren.
[1] https://lore.kernel.org/all/20240730134605.GO33588@noisy.programming.kicks-ass.net/
>
> >
> > > This check means that there was no call to
> > > mmap_write_unlock()/mmap_write_downgrade() since
> > > mm_start_vma_speculation() and the vma is not currently locked. To
> > > unlock a write-locked VMA you do need to call
> > > map_write_unlock()/mmap_write_downgrade(), so I think this check would
> > > guarantee that your vma was not locked and modified from under us.
> > > However this will also trigger false positives if
> > > mmap_write_unlock()/mmap_write_downgrade() was called but the vma you
> > > are using was never locked. So, it will bail out more than necessary.
> > > Maybe it's ok?
> > >
> > > > +}
> > > > +
>
> [...]
^ permalink raw reply [flat|nested] 84+ messages in thread
* Re: [PATCH 00/10] perf/uprobe: Optimize uprobes
2024-08-07 18:04 ` Suren Baghdasaryan
@ 2024-08-07 18:30 ` Andrii Nakryiko
2024-08-07 18:33 ` Suren Baghdasaryan
1 sibling, 0 replies; 84+ messages in thread
From: Andrii Nakryiko @ 2024-08-07 18:30 UTC (permalink / raw)
To: Suren Baghdasaryan
Cc: Peter Zijlstra, rostedt, Matthew Wilcox, Paul E. McKenney,
Masami Hiramatsu, mingo, andrii, linux-kernel, oleg, jolsa, clm,
bpf
On Wed, Aug 7, 2024 at 11:05 AM Suren Baghdasaryan <surenb@google.com> wrote:
>
> On Wed, Aug 7, 2024 at 5:49 PM Andrii Nakryiko
> <andrii.nakryiko@gmail.com> wrote:
> >
> > On Tue, Aug 6, 2024 at 10:13 PM Suren Baghdasaryan <surenb@google.com> wrote:
> > >
> > > On Tue, Aug 6, 2024 at 6:36 PM Suren Baghdasaryan <surenb@google.com> wrote:
> > > >
> > > > On Mon, Aug 5, 2024 at 9:08 PM Andrii Nakryiko
> > > > <andrii.nakryiko@gmail.com> wrote:
> > > > >
> > > > > On Sun, Aug 4, 2024 at 4:22 PM Andrii Nakryiko
> > > > > <andrii.nakryiko@gmail.com> wrote:
> > > > > >
> > > > > > On Sat, Aug 3, 2024 at 1:53 AM Peter Zijlstra <peterz@infradead.org> wrote:
> > > > > > >
> > > > > > > On Fri, Aug 02, 2024 at 10:47:15PM -0700, Andrii Nakryiko wrote:
> > > > > > >
> > > > > > > > Is there any reason why the approach below won't work?
> > > > > > >
> > > > > > > > diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
> > > > > > > > index 8be9e34e786a..e21b68a39f13 100644
> > > > > > > > --- a/kernel/events/uprobes.c
> > > > > > > > +++ b/kernel/events/uprobes.c
> > > > > > > > @@ -2251,6 +2251,52 @@ static struct uprobe
> > > > > > > > *find_active_uprobe_rcu(unsigned long bp_vaddr, int *is_swb
> > > > > > > > struct uprobe *uprobe = NULL;
> > > > > > > > struct vm_area_struct *vma;
> > > > > > > >
> > > > > > > > +#ifdef CONFIG_PER_VMA_LOCK
> > > > > > > > + vm_flags_t flags = VM_HUGETLB | VM_MAYEXEC | VM_MAYSHARE, vm_flags;
> > > > > > > > + struct file *vm_file;
> > > > > > > > + struct inode *vm_inode;
> > > > > > > > + unsigned long vm_pgoff, vm_start, vm_end;
> > > > > > > > + int vm_lock_seq;
> > > > > > > > + loff_t offset;
> > > > > > > > +
> > > > > > > > + rcu_read_lock();
> > > > > > > > +
> > > > > > > > + vma = vma_lookup(mm, bp_vaddr);
> > > > > > > > + if (!vma)
> > > > > > > > + goto retry_with_lock;
> > > > > > > > +
> > > > > > > > + vm_lock_seq = READ_ONCE(vma->vm_lock_seq);
> > > > > > >
> > > > > > > So vma->vm_lock_seq is only updated on vma_start_write()
> > > > > >
> > > > > > yep, I've looked a bit more at the implementation now
> > > > > >
> > > > > > >
> > > > > > > > +
> > > > > > > > + vm_file = READ_ONCE(vma->vm_file);
> > > > > > > > + vm_flags = READ_ONCE(vma->vm_flags);
> > > > > > > > + if (!vm_file || (vm_flags & flags) != VM_MAYEXEC)
> > > > > > > > + goto retry_with_lock;
> > > > > > > > +
> > > > > > > > + vm_inode = READ_ONCE(vm_file->f_inode);
> > > > > > > > + vm_pgoff = READ_ONCE(vma->vm_pgoff);
> > > > > > > > + vm_start = READ_ONCE(vma->vm_start);
> > > > > > > > + vm_end = READ_ONCE(vma->vm_end);
> > > > > > >
> > > > > > > None of those are written with WRITE_ONCE(), so this buys you nothing.
> > > > > > > Compiler could be updating them one byte at a time while you load some
> > > > > > > franken-update.
> > > > > > >
> > > > > > > Also, if you're in the middle of split_vma() you might not get a
> > > > > > > consistent set.
> > > > > >
> > > > > > I used READ_ONCE() only to prevent the compiler from re-reading those
> > > > > > values. We assume those values are garbage anyways and double-check
> > > > > > everything, so lack of WRITE_ONCE doesn't matter. Same for
> > > > > > inconsistency if we are in the middle of split_vma().
> > > > > >
> > > > > > We use the result of all this speculative calculation only if we find
> > > > > > a valid uprobe (which could be a false positive) *and* if we detect
> > > > > > that nothing about VMA changed (which is what I got wrong, but
> > > > > > honestly I was actually betting on others to help me get this right
> > > > > > anyways).
> > > > > >
> > > > > > >
> > > > > > > > + if (bp_vaddr < vm_start || bp_vaddr >= vm_end)
> > > > > > > > + goto retry_with_lock;
> > > > > > > > +
> > > > > > > > + offset = (loff_t)(vm_pgoff << PAGE_SHIFT) + (bp_vaddr - vm_start);
> > > > > > > > + uprobe = find_uprobe_rcu(vm_inode, offset);
> > > > > > > > + if (!uprobe)
> > > > > > > > + goto retry_with_lock;
> > > > > > > > +
> > > > > > > > + /* now double check that nothing about VMA changed */
> > > > > > > > + if (vm_lock_seq != READ_ONCE(vma->vm_lock_seq))
> > > > > > > > + goto retry_with_lock;
> > > > > > >
> > > > > > > Since vma->vma_lock_seq is only ever updated at vma_start_write() you're
> > > > > > > checking you're in or after the same modification cycle.
> > > > > > >
> > > > > > > The point of sequence locks is to check you *IN* a modification cycle
> > > > > > > and retry if you are. You're now explicitly continuing if you're in a
> > > > > > > modification.
> > > > > > >
> > > > > > > You really need:
> > > > > > >
> > > > > > > seq++;
> > > > > > > wmb();
> > > > > > >
> > > > > > > ... do modification
> > > > > > >
> > > > > > > wmb();
> > > > > > > seq++;
> > > > > > >
> > > > > > > vs
> > > > > > >
> > > > > > > do {
> > > > > > > s = READ_ONCE(seq) & ~1;
> > > > > > > rmb();
> > > > > > >
> > > > > > > ... read stuff
> > > > > > >
> > > > > > > } while (rmb(), seq != s);
> > > > > > >
> > > > > > >
> > > > > > > The thing to note is that seq will be odd while inside a modification
> > > > > > > and even outside, further if the pre and post seq are both even but not
> > > > > > > identical, you've crossed a modification and also need to retry.
> > > > > > >
> > > > > >
> > > > > > Ok, I don't think I got everything you have written above, sorry. But
> > > > > > let me explain what I think I need to do and please correct what I
> > > > > > (still) got wrong.
> > > > > >
> > > > > > a) before starting speculation,
> > > > > > a.1) read and remember current->mm->mm_lock_seq (using
> > > > > > smp_load_acquire(), right?)
> > > > > > a.2) read vma->vm_lock_seq (using smp_load_acquire() I presume)
> > > > > > a.3) if vm_lock_seq is odd, we are already modifying VMA, so bail
> > > > > > out, try with proper mmap_lock
> > > > > > b) proceed with the inode pointer fetch and offset calculation as I've coded it
> > > > > > c) lookup uprobe by inode+offset, if failed -- bail out (if succeeded,
> > > > > > this could still be wrong)
> > > > > > d) re-read vma->vm_lock_seq, if it changed, we started modifying/have
> > > > > > already modified VMA, bail out
> > > > > > e) re-read mm->mm_lock_seq, if that changed -- presume VMA got
> > > > > > modified, bail out
> > > > > >
> > > > > > At this point we should have a guarantee that nothing about mm
> > > > > > changed, nor that VMA started being modified during our speculative
> > > > > > calculation+uprobe lookup. So if we found a valid uprobe, it must be a
> > > > > > correct one that we need.
> > > > > >
> > > > > > Is that enough? Any holes in the approach? And thanks for thoroughly
> > > > > > thinking about this, btw!
> > > > >
> > > > > Ok, with slight modifications to the details of the above (e.g., there
> > > > > is actually no "odd means VMA is being modified" thing with
> > > > > vm_lock_seq),
> > > >
> > > > Correct. Instead of that (vm_lock_seq->vm_lock_seq == mm->mm_lock_seq)
> > > > means your VMA is write-locked and is being modified.
> > > >
> > > > > I ended up with the implementation below. Basically we
> > > > > validate that mm->mm_lock_seq didn't change and that vm_lock_seq !=
> > > > > mm_lock_seq (which otherwise would mean "VMA is being modified").
> > > >
> > > > Validating that mm->mm_lock_seq did not change does not provide you
> > > > with useful information. It only means that between the point where
> > > > you recorded mm->mm_lock_seq and where you are checking it, there was
> > > > an mmap_write_unlock() or mmap_write_downgrade() call. Your VMA might
> > > > not have even been part of that modification for which mmap_lock was
> > > > taken.
> > > >
> > > > In theory what you need is simpler (simplified code for explanation only):
> > > >
> > > > int vm_lock_seq = vma->vm_lock_seq;
> > > > if (vm_lock_seq == mm->mm_lock_seq)
> > > > goto bail_out; /* VMA is write-locked */
> > > >
> > > > /* copy required VMA attributes */
> > > >
> > > > if (vm_lock_seq != vma->vm_lock_seq)
> > > > goto bail_out; /* VMA got write-locked */
> > > >
> > > > But this would require proper ACQUIRE/RELEASE semantics for
> > > > vma->vm_lock_seq which is currently not there because all reads/writes
> > > > to vma->vm_lock_seq that matter are done under vma->vm_lock->lock
> > > > protection, so additional ordering is not required. If you decide to
> > > > add that semantics for vma->vm_lock_seq, please make sure that
> > > > pagefault path performance does not regress.
> > > >
> > > > > There is a possibility that vm_lock_seq == mm_lock_seq just by
> > > > > accident, which is not a correctness problem, we'll just fallback to
> > > > > locked implementation until something about VMA or mm_struct itself
> > > > > changes. Which is fine, and if mm folks ever change this locking
> > > > > schema, this might go away.
> > > > >
> > > > > If this seems on the right track, I think we can just move
> > > > > mm_start_vma_specuation()/mm_end_vma_speculation() into
> > > > > include/linux/mm.h.
> > > > >
> > > > > And after thinking a bit more about READ_ONCE() usage, I changed them
> > > > > to data_race() to not trigger KCSAN warnings. Initially I kept
> > > > > READ_ONCE() only around vma->vm_file access, but given we never change
> > > > > it until vma is freed and reused (which would be prevented by
> > > > > guard(rcu)), I dropped READ_ONCE() and only added data_race(). And
> > > > > even data_race() is probably not necessary.
> > > > >
> > > > > Anyways, please see the patch below. Would be nice if mm folks
> > > > > (Suren?) could confirm that this is not broken.
> > > > >
> > > > >
> > > > >
> > > > > Author: Andrii Nakryiko <andrii@kernel.org>
> > > > > Date: Fri Aug 2 22:16:40 2024 -0700
> > > > >
> > > > > uprobes: add speculative lockless VMA to inode resolution
> > > > >
> > > > > Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
> > > > >
> > > > > diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
> > > > > index 3de311c56d47..bee7a929ff02 100644
> > > > > --- a/kernel/events/uprobes.c
> > > > > +++ b/kernel/events/uprobes.c
> > > > > @@ -2244,6 +2244,70 @@ static int is_trap_at_addr(struct mm_struct
> > > > > *mm, unsigned long vaddr)
> > > > > return is_trap_insn(&opcode);
> > > > > }
> > > > >
> > > > > +#ifdef CONFIG_PER_VMA_LOCK
> > > > > +static inline void mm_start_vma_speculation(struct mm_struct *mm, int
> > > > > *mm_lock_seq)
> > > > > +{
> > > > > + *mm_lock_seq = smp_load_acquire(&mm->mm_lock_seq);
> > > > > +}
> > > > > +
> > > > > +/* returns true if speculation was safe (no mm and vma modification
> > > > > happened) */
> > > > > +static inline bool mm_end_vma_speculation(struct vm_area_struct *vma,
> > > > > int mm_lock_seq)
> > > > > +{
> > > > > + int mm_seq, vma_seq;
> > > > > +
> > > > > + mm_seq = smp_load_acquire(&vma->vm_mm->mm_lock_seq);
> > > > > + vma_seq = READ_ONCE(vma->vm_lock_seq);
> > > > > +
> > > > > + return mm_seq == mm_lock_seq && vma_seq != mm_seq;
> > > >
> > > > After spending some time on this I think what you do here is
> > > > semantically correct but sub-optimal.
> > >
> >
> > Yes, requiring that mm_lock_seq doesn't change is too pessimistic, but
> > relative to the frequency of uprobe/uretprobe triggering (and how fast
> > the lookup is) this won't matter much. Absolute majority of uprobe
> > lookups will manage to succeed while none of mm's VMAs change at all.
> > So I felt like that's ok, at least for starters.
> >
> > My goal is to minimize intrusion into purely mm-related code, this
> > whole uprobe work is already pretty large and sprawling, I don't want
> > to go on another quest to change locking semantics for vma, if I don't
> > absolutely have to :) But see below for adjusted logic based on your
> > comments.
> >
> > > Actually, after staring at this code some more I think
> > > vma->vm_lock_seq not having proper ACQUIRE/RELEASE semantics would
> > > bite us here as well. The entire find_active_uprobe_speculative()
> > > might be executing while mmap_lock is write-locked (so, mm_seq ==
> > > mm_lock_seq is satisfied) and we might miss that the VMA is locked due
> > > to vma->vm_lock_seq read/write reordering. Though it's late and I
> > > might have missed some memory barriers which would prevent this
> > > scenario...
> >
> > So, please bear with me, if it's a stupid question. But don't all
> > locks have implicit ACQUIRE and RELEASE semantics already? At least
> > that's my reading of Documentation/memory-barriers.txt.
> >
> > So with that, wouldn't it be OK to just change
> > READ_ONCE(vma->vm_lock_seq) to smp_load_acquire(&vma->vm_lock_seq) and
> > mitigate the issue you pointed out?
> >
> >
> > So maybe something like below:
> >
> > rcu_read_lock()
> >
> > vma = find_vma(...)
> > if (!vma) /* bail */
> >
> > vm_lock_seq = smp_load_acquire(&vma->vm_lock_seq);
> > mm_lock_seq = smp_load_acquire(&vma->mm->mm_lock_seq);
> > /* I think vm_lock has to be acquired first to avoid the race */
> > if (mm_lock_seq == vm_lock_seq)
> > /* bail, vma is write-locked */
> >
> > ... perform uprobe lookup logic based on vma->vm_file->f_inode ...
> >
> > if (smp_load_acquire(&vma->vm_lock_seq) != vm_lock_seq)
> > /* bail, VMA might have changed */
> >
> > Thoughts?
>
> Hi Andrii,
> I've prepared a quick patch following Peter's suggestion in [1] to
> make mm->mm_lock_seq a proper seqcount. I'll post it shortly as RFC so
> you can try it out. I think that would be a much cleaner solution.
happy to try, thanks!
> I'll post a link to it shortly.
> Thanks,
> Suren.
>
> [1] https://lore.kernel.org/all/20240730134605.GO33588@noisy.programming.kicks-ass.net/
>
>
> >
> > >
> > > > This check means that there was no call to
> > > > mmap_write_unlock()/mmap_write_downgrade() since
> > > > mm_start_vma_speculation() and the vma is not currently locked. To
> > > > unlock a write-locked VMA you do need to call
> > > > map_write_unlock()/mmap_write_downgrade(), so I think this check would
> > > > guarantee that your vma was not locked and modified from under us.
> > > > However this will also trigger false positives if
> > > > mmap_write_unlock()/mmap_write_downgrade() was called but the vma you
> > > > are using was never locked. So, it will bail out more than necessary.
> > > > Maybe it's ok?
> > > >
> > > > > +}
> > > > > +
> >
> > [...]
^ permalink raw reply [flat|nested] 84+ messages in thread
* Re: [PATCH 00/10] perf/uprobe: Optimize uprobes
2024-08-07 18:04 ` Suren Baghdasaryan
2024-08-07 18:30 ` Andrii Nakryiko
@ 2024-08-07 18:33 ` Suren Baghdasaryan
2024-08-08 0:47 ` Andrii Nakryiko
1 sibling, 1 reply; 84+ messages in thread
From: Suren Baghdasaryan @ 2024-08-07 18:33 UTC (permalink / raw)
To: Andrii Nakryiko
Cc: Peter Zijlstra, rostedt, Matthew Wilcox, Paul E. McKenney,
Masami Hiramatsu, mingo, andrii, linux-kernel, oleg, jolsa, clm,
bpf
On Wed, Aug 7, 2024 at 11:04 AM Suren Baghdasaryan <surenb@google.com> wrote:
>
> On Wed, Aug 7, 2024 at 5:49 PM Andrii Nakryiko
> <andrii.nakryiko@gmail.com> wrote:
> >
> > On Tue, Aug 6, 2024 at 10:13 PM Suren Baghdasaryan <surenb@google.com> wrote:
> > >
> > > On Tue, Aug 6, 2024 at 6:36 PM Suren Baghdasaryan <surenb@google.com> wrote:
> > > >
> > > > On Mon, Aug 5, 2024 at 9:08 PM Andrii Nakryiko
> > > > <andrii.nakryiko@gmail.com> wrote:
> > > > >
> > > > > On Sun, Aug 4, 2024 at 4:22 PM Andrii Nakryiko
> > > > > <andrii.nakryiko@gmail.com> wrote:
> > > > > >
> > > > > > On Sat, Aug 3, 2024 at 1:53 AM Peter Zijlstra <peterz@infradead.org> wrote:
> > > > > > >
> > > > > > > On Fri, Aug 02, 2024 at 10:47:15PM -0700, Andrii Nakryiko wrote:
> > > > > > >
> > > > > > > > Is there any reason why the approach below won't work?
> > > > > > >
> > > > > > > > diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
> > > > > > > > index 8be9e34e786a..e21b68a39f13 100644
> > > > > > > > --- a/kernel/events/uprobes.c
> > > > > > > > +++ b/kernel/events/uprobes.c
> > > > > > > > @@ -2251,6 +2251,52 @@ static struct uprobe
> > > > > > > > *find_active_uprobe_rcu(unsigned long bp_vaddr, int *is_swb
> > > > > > > > struct uprobe *uprobe = NULL;
> > > > > > > > struct vm_area_struct *vma;
> > > > > > > >
> > > > > > > > +#ifdef CONFIG_PER_VMA_LOCK
> > > > > > > > + vm_flags_t flags = VM_HUGETLB | VM_MAYEXEC | VM_MAYSHARE, vm_flags;
> > > > > > > > + struct file *vm_file;
> > > > > > > > + struct inode *vm_inode;
> > > > > > > > + unsigned long vm_pgoff, vm_start, vm_end;
> > > > > > > > + int vm_lock_seq;
> > > > > > > > + loff_t offset;
> > > > > > > > +
> > > > > > > > + rcu_read_lock();
> > > > > > > > +
> > > > > > > > + vma = vma_lookup(mm, bp_vaddr);
> > > > > > > > + if (!vma)
> > > > > > > > + goto retry_with_lock;
> > > > > > > > +
> > > > > > > > + vm_lock_seq = READ_ONCE(vma->vm_lock_seq);
> > > > > > >
> > > > > > > So vma->vm_lock_seq is only updated on vma_start_write()
> > > > > >
> > > > > > yep, I've looked a bit more at the implementation now
> > > > > >
> > > > > > >
> > > > > > > > +
> > > > > > > > + vm_file = READ_ONCE(vma->vm_file);
> > > > > > > > + vm_flags = READ_ONCE(vma->vm_flags);
> > > > > > > > + if (!vm_file || (vm_flags & flags) != VM_MAYEXEC)
> > > > > > > > + goto retry_with_lock;
> > > > > > > > +
> > > > > > > > + vm_inode = READ_ONCE(vm_file->f_inode);
> > > > > > > > + vm_pgoff = READ_ONCE(vma->vm_pgoff);
> > > > > > > > + vm_start = READ_ONCE(vma->vm_start);
> > > > > > > > + vm_end = READ_ONCE(vma->vm_end);
> > > > > > >
> > > > > > > None of those are written with WRITE_ONCE(), so this buys you nothing.
> > > > > > > Compiler could be updating them one byte at a time while you load some
> > > > > > > franken-update.
> > > > > > >
> > > > > > > Also, if you're in the middle of split_vma() you might not get a
> > > > > > > consistent set.
> > > > > >
> > > > > > I used READ_ONCE() only to prevent the compiler from re-reading those
> > > > > > values. We assume those values are garbage anyways and double-check
> > > > > > everything, so lack of WRITE_ONCE doesn't matter. Same for
> > > > > > inconsistency if we are in the middle of split_vma().
> > > > > >
> > > > > > We use the result of all this speculative calculation only if we find
> > > > > > a valid uprobe (which could be a false positive) *and* if we detect
> > > > > > that nothing about VMA changed (which is what I got wrong, but
> > > > > > honestly I was actually betting on others to help me get this right
> > > > > > anyways).
> > > > > >
> > > > > > >
> > > > > > > > + if (bp_vaddr < vm_start || bp_vaddr >= vm_end)
> > > > > > > > + goto retry_with_lock;
> > > > > > > > +
> > > > > > > > + offset = (loff_t)(vm_pgoff << PAGE_SHIFT) + (bp_vaddr - vm_start);
> > > > > > > > + uprobe = find_uprobe_rcu(vm_inode, offset);
> > > > > > > > + if (!uprobe)
> > > > > > > > + goto retry_with_lock;
> > > > > > > > +
> > > > > > > > + /* now double check that nothing about VMA changed */
> > > > > > > > + if (vm_lock_seq != READ_ONCE(vma->vm_lock_seq))
> > > > > > > > + goto retry_with_lock;
> > > > > > >
> > > > > > > Since vma->vma_lock_seq is only ever updated at vma_start_write() you're
> > > > > > > checking you're in or after the same modification cycle.
> > > > > > >
> > > > > > > The point of sequence locks is to check you *IN* a modification cycle
> > > > > > > and retry if you are. You're now explicitly continuing if you're in a
> > > > > > > modification.
> > > > > > >
> > > > > > > You really need:
> > > > > > >
> > > > > > > seq++;
> > > > > > > wmb();
> > > > > > >
> > > > > > > ... do modification
> > > > > > >
> > > > > > > wmb();
> > > > > > > seq++;
> > > > > > >
> > > > > > > vs
> > > > > > >
> > > > > > > do {
> > > > > > > s = READ_ONCE(seq) & ~1;
> > > > > > > rmb();
> > > > > > >
> > > > > > > ... read stuff
> > > > > > >
> > > > > > > } while (rmb(), seq != s);
> > > > > > >
> > > > > > >
> > > > > > > The thing to note is that seq will be odd while inside a modification
> > > > > > > and even outside, further if the pre and post seq are both even but not
> > > > > > > identical, you've crossed a modification and also need to retry.
> > > > > > >
> > > > > >
> > > > > > Ok, I don't think I got everything you have written above, sorry. But
> > > > > > let me explain what I think I need to do and please correct what I
> > > > > > (still) got wrong.
> > > > > >
> > > > > > a) before starting speculation,
> > > > > > a.1) read and remember current->mm->mm_lock_seq (using
> > > > > > smp_load_acquire(), right?)
> > > > > > a.2) read vma->vm_lock_seq (using smp_load_acquire() I presume)
> > > > > > a.3) if vm_lock_seq is odd, we are already modifying VMA, so bail
> > > > > > out, try with proper mmap_lock
> > > > > > b) proceed with the inode pointer fetch and offset calculation as I've coded it
> > > > > > c) lookup uprobe by inode+offset, if failed -- bail out (if succeeded,
> > > > > > this could still be wrong)
> > > > > > d) re-read vma->vm_lock_seq, if it changed, we started modifying/have
> > > > > > already modified VMA, bail out
> > > > > > e) re-read mm->mm_lock_seq, if that changed -- presume VMA got
> > > > > > modified, bail out
> > > > > >
> > > > > > At this point we should have a guarantee that nothing about mm
> > > > > > changed, nor that VMA started being modified during our speculative
> > > > > > calculation+uprobe lookup. So if we found a valid uprobe, it must be a
> > > > > > correct one that we need.
> > > > > >
> > > > > > Is that enough? Any holes in the approach? And thanks for thoroughly
> > > > > > thinking about this, btw!
> > > > >
> > > > > Ok, with slight modifications to the details of the above (e.g., there
> > > > > is actually no "odd means VMA is being modified" thing with
> > > > > vm_lock_seq),
> > > >
> > > > Correct. Instead of that (vm_lock_seq->vm_lock_seq == mm->mm_lock_seq)
> > > > means your VMA is write-locked and is being modified.
> > > >
> > > > > I ended up with the implementation below. Basically we
> > > > > validate that mm->mm_lock_seq didn't change and that vm_lock_seq !=
> > > > > mm_lock_seq (which otherwise would mean "VMA is being modified").
> > > >
> > > > Validating that mm->mm_lock_seq did not change does not provide you
> > > > with useful information. It only means that between the point where
> > > > you recorded mm->mm_lock_seq and where you are checking it, there was
> > > > an mmap_write_unlock() or mmap_write_downgrade() call. Your VMA might
> > > > not have even been part of that modification for which mmap_lock was
> > > > taken.
> > > >
> > > > In theory what you need is simpler (simplified code for explanation only):
> > > >
> > > > int vm_lock_seq = vma->vm_lock_seq;
> > > > if (vm_lock_seq == mm->mm_lock_seq)
> > > > goto bail_out; /* VMA is write-locked */
> > > >
> > > > /* copy required VMA attributes */
> > > >
> > > > if (vm_lock_seq != vma->vm_lock_seq)
> > > > goto bail_out; /* VMA got write-locked */
> > > >
> > > > But this would require proper ACQUIRE/RELEASE semantics for
> > > > vma->vm_lock_seq which is currently not there because all reads/writes
> > > > to vma->vm_lock_seq that matter are done under vma->vm_lock->lock
> > > > protection, so additional ordering is not required. If you decide to
> > > > add that semantics for vma->vm_lock_seq, please make sure that
> > > > pagefault path performance does not regress.
> > > >
> > > > > There is a possibility that vm_lock_seq == mm_lock_seq just by
> > > > > accident, which is not a correctness problem, we'll just fallback to
> > > > > locked implementation until something about VMA or mm_struct itself
> > > > > changes. Which is fine, and if mm folks ever change this locking
> > > > > schema, this might go away.
> > > > >
> > > > > If this seems on the right track, I think we can just move
> > > > > mm_start_vma_specuation()/mm_end_vma_speculation() into
> > > > > include/linux/mm.h.
> > > > >
> > > > > And after thinking a bit more about READ_ONCE() usage, I changed them
> > > > > to data_race() to not trigger KCSAN warnings. Initially I kept
> > > > > READ_ONCE() only around vma->vm_file access, but given we never change
> > > > > it until vma is freed and reused (which would be prevented by
> > > > > guard(rcu)), I dropped READ_ONCE() and only added data_race(). And
> > > > > even data_race() is probably not necessary.
> > > > >
> > > > > Anyways, please see the patch below. Would be nice if mm folks
> > > > > (Suren?) could confirm that this is not broken.
> > > > >
> > > > >
> > > > >
> > > > > Author: Andrii Nakryiko <andrii@kernel.org>
> > > > > Date: Fri Aug 2 22:16:40 2024 -0700
> > > > >
> > > > > uprobes: add speculative lockless VMA to inode resolution
> > > > >
> > > > > Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
> > > > >
> > > > > diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
> > > > > index 3de311c56d47..bee7a929ff02 100644
> > > > > --- a/kernel/events/uprobes.c
> > > > > +++ b/kernel/events/uprobes.c
> > > > > @@ -2244,6 +2244,70 @@ static int is_trap_at_addr(struct mm_struct
> > > > > *mm, unsigned long vaddr)
> > > > > return is_trap_insn(&opcode);
> > > > > }
> > > > >
> > > > > +#ifdef CONFIG_PER_VMA_LOCK
> > > > > +static inline void mm_start_vma_speculation(struct mm_struct *mm, int
> > > > > *mm_lock_seq)
> > > > > +{
> > > > > + *mm_lock_seq = smp_load_acquire(&mm->mm_lock_seq);
> > > > > +}
> > > > > +
> > > > > +/* returns true if speculation was safe (no mm and vma modification
> > > > > happened) */
> > > > > +static inline bool mm_end_vma_speculation(struct vm_area_struct *vma,
> > > > > int mm_lock_seq)
> > > > > +{
> > > > > + int mm_seq, vma_seq;
> > > > > +
> > > > > + mm_seq = smp_load_acquire(&vma->vm_mm->mm_lock_seq);
> > > > > + vma_seq = READ_ONCE(vma->vm_lock_seq);
> > > > > +
> > > > > + return mm_seq == mm_lock_seq && vma_seq != mm_seq;
> > > >
> > > > After spending some time on this I think what you do here is
> > > > semantically correct but sub-optimal.
> > >
> >
> > Yes, requiring that mm_lock_seq doesn't change is too pessimistic, but
> > relative to the frequency of uprobe/uretprobe triggering (and how fast
> > the lookup is) this won't matter much. Absolute majority of uprobe
> > lookups will manage to succeed while none of mm's VMAs change at all.
> > So I felt like that's ok, at least for starters.
> >
> > My goal is to minimize intrusion into purely mm-related code, this
> > whole uprobe work is already pretty large and sprawling, I don't want
> > to go on another quest to change locking semantics for vma, if I don't
> > absolutely have to :) But see below for adjusted logic based on your
> > comments.
> >
> > > Actually, after staring at this code some more I think
> > > vma->vm_lock_seq not having proper ACQUIRE/RELEASE semantics would
> > > bite us here as well. The entire find_active_uprobe_speculative()
> > > might be executing while mmap_lock is write-locked (so, mm_seq ==
> > > mm_lock_seq is satisfied) and we might miss that the VMA is locked due
> > > to vma->vm_lock_seq read/write reordering. Though it's late and I
> > > might have missed some memory barriers which would prevent this
> > > scenario...
> >
> > So, please bear with me, if it's a stupid question. But don't all
> > locks have implicit ACQUIRE and RELEASE semantics already? At least
> > that's my reading of Documentation/memory-barriers.txt.
> >
> > So with that, wouldn't it be OK to just change
> > READ_ONCE(vma->vm_lock_seq) to smp_load_acquire(&vma->vm_lock_seq) and
> > mitigate the issue you pointed out?
> >
> >
> > So maybe something like below:
> >
> > rcu_read_lock()
> >
> > vma = find_vma(...)
> > if (!vma) /* bail */
> >
> > vm_lock_seq = smp_load_acquire(&vma->vm_lock_seq);
> > mm_lock_seq = smp_load_acquire(&vma->vm_mm->mm_lock_seq);
> > /* I think vm_lock has to be acquired first to avoid the race */
> > if (mm_lock_seq == vm_lock_seq)
> > /* bail, vma is write-locked */
> >
> > ... perform uprobe lookup logic based on vma->vm_file->f_inode ...
> >
> > if (smp_load_acquire(&vma->vm_lock_seq) != vm_lock_seq)
> > /* bail, VMA might have changed */
> >
> > Thoughts?
>
> Hi Andrii,
> I've prepared a quick patch following Peter's suggestion in [1] to
> make mm->mm_lock_seq a proper seqcount. I'll post it shortly as an RFC
> so you can try it out; I think that would be a much cleaner solution.
The RFC is posted at
https://lore.kernel.org/all/20240807182325.2585582-1-surenb@google.com/.
With that patch you can do:

bool success = false;
int seq;

if (!mmap_lock_speculation_start(mm, &seq)) /* bail out */

rcu_read_lock()
vma = find_vma(...)
if (!vma) /* rcu_read_unlock and bail out */
/* obtain vma->vm_file->f_inode */
rcu_read_unlock();

if (!mmap_lock_speculation_end(mm, seq)) /* bail out */
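Filling that pseudocode in, a rough sketch of the resulting lookup could look like the below. This is only an illustration, not the final implementation: mmap_lock_speculation_start()/mmap_lock_speculation_end() are assumed to behave as in the RFC above (returning false when speculation is impossible or was invalidated), find_uprobe_rcu() and the VM_MAYEXEC filtering follow the earlier patches in this thread, and uprobe lifetime is assumed to be covered by the surrounding (S)RCU as elsewhere in the series.

/* sketch for kernel/events/uprobes.c; NULL means "fall back to the locked path" */
static struct uprobe *find_active_uprobe_speculative(unsigned long bp_vaddr)
{
        const vm_flags_t flags = VM_HUGETLB | VM_MAYEXEC | VM_MAYSHARE;
        struct mm_struct *mm = current->mm;
        struct uprobe *uprobe = NULL;
        struct vm_area_struct *vma;
        struct file *vm_file;
        loff_t offset;
        int seq;

        /* bail if mmap_lock is currently write-locked */
        if (!mmap_lock_speculation_start(mm, &seq))
                return NULL;

        rcu_read_lock();

        vma = vma_lookup(mm, bp_vaddr);
        if (!vma)
                goto out;

        vm_file = data_race(vma->vm_file);
        if (!vm_file || (vma->vm_flags & flags) != VM_MAYEXEC)
                goto out;

        offset = (loff_t)(vma->vm_pgoff << PAGE_SHIFT) + (bp_vaddr - vma->vm_start);
        uprobe = find_uprobe_rcu(vm_file->f_inode, offset);
out:
        rcu_read_unlock();

        /* was any mm modification published while we were looking? */
        if (!mmap_lock_speculation_end(mm, seq))
                return NULL;

        return uprobe;
}

The caller would treat a NULL return as "retry with mmap_lock", so a torn or stale read here only costs a fallback, never a wrong result.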
> Thanks,
> Suren.
>
> [1] https://lore.kernel.org/all/20240730134605.GO33588@noisy.programming.kicks-ass.net/
>
>
> >
> > >
> > > > This check means that there was no call to
> > > > mmap_write_unlock()/mmap_write_downgrade() since
> > > > mm_start_vma_speculation() and the vma is not currently locked. To
> > > > unlock a write-locked VMA you do need to call
> > > > mmap_write_unlock()/mmap_write_downgrade(), so I think this check would
> > > > guarantee that your vma was not locked and modified from under us.
> > > > However this will also trigger false positives if
> > > > mmap_write_unlock()/mmap_write_downgrade() was called but the vma you
> > > > are using was never locked. So, it will bail out more than necessary.
> > > > Maybe it's ok?
> > > >
> > > > > +}
> > > > > +
> >
> > [...]
^ permalink raw reply [flat|nested] 84+ messages in thread
* Re: [PATCH 00/10] perf/uprobe: Optimize uprobes
2024-08-07 18:33 ` Suren Baghdasaryan
@ 2024-08-08 0:47 ` Andrii Nakryiko
0 siblings, 0 replies; 84+ messages in thread
From: Andrii Nakryiko @ 2024-08-08 0:47 UTC (permalink / raw)
To: Suren Baghdasaryan
Cc: Peter Zijlstra, rostedt, Matthew Wilcox, Paul E. McKenney,
Masami Hiramatsu, mingo, andrii, linux-kernel, oleg, jolsa, clm,
bpf
On Wed, Aug 7, 2024 at 11:34 AM Suren Baghdasaryan <surenb@google.com> wrote:
>
> On Wed, Aug 7, 2024 at 11:04 AM Suren Baghdasaryan <surenb@google.com> wrote:
> >
> > On Wed, Aug 7, 2024 at 5:49 PM Andrii Nakryiko
> > <andrii.nakryiko@gmail.com> wrote:
> > >
> > > On Tue, Aug 6, 2024 at 10:13 PM Suren Baghdasaryan <surenb@google.com> wrote:
> > > >
> > > > On Tue, Aug 6, 2024 at 6:36 PM Suren Baghdasaryan <surenb@google.com> wrote:
> > > > >
> > > > > On Mon, Aug 5, 2024 at 9:08 PM Andrii Nakryiko
> > > > > <andrii.nakryiko@gmail.com> wrote:
> > > > > >
> > > > > > On Sun, Aug 4, 2024 at 4:22 PM Andrii Nakryiko
> > > > > > <andrii.nakryiko@gmail.com> wrote:
> > > > > > >
> > > > > > > On Sat, Aug 3, 2024 at 1:53 AM Peter Zijlstra <peterz@infradead.org> wrote:
> > > > > > > >
> > > > > > > > On Fri, Aug 02, 2024 at 10:47:15PM -0700, Andrii Nakryiko wrote:
> > > > > > > >
> > > > > > > > > Is there any reason why the approach below won't work?
> > > > > > > >
> > > > > > > > > diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
> > > > > > > > > index 8be9e34e786a..e21b68a39f13 100644
> > > > > > > > > --- a/kernel/events/uprobes.c
> > > > > > > > > +++ b/kernel/events/uprobes.c
> > > > > > > > > @@ -2251,6 +2251,52 @@ static struct uprobe
> > > > > > > > > *find_active_uprobe_rcu(unsigned long bp_vaddr, int *is_swb
> > > > > > > > > struct uprobe *uprobe = NULL;
> > > > > > > > > struct vm_area_struct *vma;
> > > > > > > > >
> > > > > > > > > +#ifdef CONFIG_PER_VMA_LOCK
> > > > > > > > > + vm_flags_t flags = VM_HUGETLB | VM_MAYEXEC | VM_MAYSHARE, vm_flags;
> > > > > > > > > + struct file *vm_file;
> > > > > > > > > + struct inode *vm_inode;
> > > > > > > > > + unsigned long vm_pgoff, vm_start, vm_end;
> > > > > > > > > + int vm_lock_seq;
> > > > > > > > > + loff_t offset;
> > > > > > > > > +
> > > > > > > > > + rcu_read_lock();
> > > > > > > > > +
> > > > > > > > > + vma = vma_lookup(mm, bp_vaddr);
> > > > > > > > > + if (!vma)
> > > > > > > > > + goto retry_with_lock;
> > > > > > > > > +
> > > > > > > > > + vm_lock_seq = READ_ONCE(vma->vm_lock_seq);
> > > > > > > >
> > > > > > > > So vma->vm_lock_seq is only updated on vma_start_write()
> > > > > > >
> > > > > > > yep, I've looked a bit more at the implementation now
> > > > > > >
> > > > > > > >
> > > > > > > > > +
> > > > > > > > > + vm_file = READ_ONCE(vma->vm_file);
> > > > > > > > > + vm_flags = READ_ONCE(vma->vm_flags);
> > > > > > > > > + if (!vm_file || (vm_flags & flags) != VM_MAYEXEC)
> > > > > > > > > + goto retry_with_lock;
> > > > > > > > > +
> > > > > > > > > + vm_inode = READ_ONCE(vm_file->f_inode);
> > > > > > > > > + vm_pgoff = READ_ONCE(vma->vm_pgoff);
> > > > > > > > > + vm_start = READ_ONCE(vma->vm_start);
> > > > > > > > > + vm_end = READ_ONCE(vma->vm_end);
> > > > > > > >
> > > > > > > > None of those are written with WRITE_ONCE(), so this buys you nothing.
> > > > > > > > Compiler could be updating them one byte at a time while you load some
> > > > > > > > franken-update.
> > > > > > > >
> > > > > > > > Also, if you're in the middle of split_vma() you might not get a
> > > > > > > > consistent set.
> > > > > > >
> > > > > > > I used READ_ONCE() only to prevent the compiler from re-reading those
> > > > > > > values. We assume those values are garbage anyways and double-check
> > > > > > > everything, so lack of WRITE_ONCE doesn't matter. Same for
> > > > > > > inconsistency if we are in the middle of split_vma().
> > > > > > >
> > > > > > > We use the result of all this speculative calculation only if we find
> > > > > > > a valid uprobe (which could be a false positive) *and* if we detect
> > > > > > > that nothing about VMA changed (which is what I got wrong, but
> > > > > > > honestly I was actually betting on others to help me get this right
> > > > > > > anyways).
> > > > > > >
> > > > > > > >
> > > > > > > > > + if (bp_vaddr < vm_start || bp_vaddr >= vm_end)
> > > > > > > > > + goto retry_with_lock;
> > > > > > > > > +
> > > > > > > > > + offset = (loff_t)(vm_pgoff << PAGE_SHIFT) + (bp_vaddr - vm_start);
> > > > > > > > > + uprobe = find_uprobe_rcu(vm_inode, offset);
> > > > > > > > > + if (!uprobe)
> > > > > > > > > + goto retry_with_lock;
> > > > > > > > > +
> > > > > > > > > + /* now double check that nothing about VMA changed */
> > > > > > > > > + if (vm_lock_seq != READ_ONCE(vma->vm_lock_seq))
> > > > > > > > > + goto retry_with_lock;
> > > > > > > >
> > > > > > > > Since vma->vma_lock_seq is only ever updated at vma_start_write() you're
> > > > > > > > checking you're in or after the same modification cycle.
> > > > > > > >
> > > > > > > > The point of sequence locks is to check you *IN* a modification cycle
> > > > > > > > and retry if you are. You're now explicitly continuing if you're in a
> > > > > > > > modification.
> > > > > > > >
> > > > > > > > You really need:
> > > > > > > >
> > > > > > > > seq++;
> > > > > > > > wmb();
> > > > > > > >
> > > > > > > > ... do modification
> > > > > > > >
> > > > > > > > wmb();
> > > > > > > > seq++;
> > > > > > > >
> > > > > > > > vs
> > > > > > > >
> > > > > > > > do {
> > > > > > > > s = READ_ONCE(seq) & ~1;
> > > > > > > > rmb();
> > > > > > > >
> > > > > > > > ... read stuff
> > > > > > > >
> > > > > > > > } while (rmb(), seq != s);
> > > > > > > >
> > > > > > > >
> > > > > > > > The thing to note is that seq will be odd while inside a modification
> > > > > > > > and even outside, further if the pre and post seq are both even but not
> > > > > > > > identical, you've crossed a modification and also need to retry.
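Spelled out as a self-contained example of that pattern (struct foo and its fields are invented here purely for illustration; the kernel's seqcount_t helpers package up exactly this):

struct foo {
        unsigned int seq;       /* odd while a modification is in progress */
        unsigned long a, b;     /* data protected by seq */
};

static void foo_write_begin(struct foo *f)
{
        WRITE_ONCE(f->seq, f->seq + 1); /* even -> odd */
        smp_wmb();                      /* order seq update before data writes */
}

static void foo_write_end(struct foo *f)
{
        smp_wmb();                      /* order data writes before seq update */
        WRITE_ONCE(f->seq, f->seq + 1); /* odd -> even */
}

static void foo_read(struct foo *f, unsigned long *a, unsigned long *b)
{
        unsigned int s;

        do {
                s = READ_ONCE(f->seq) & ~1U;    /* snapshot, rounded down to even */
                smp_rmb();

                *a = READ_ONCE(f->a);           /* ... read stuff ... */
                *b = READ_ONCE(f->b);

                smp_rmb();
        } while (READ_ONCE(f->seq) != s);       /* odd, or crossed a write: retry */
}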
> > > > > > > >
> > > > > > >
> > > > > > > Ok, I don't think I got everything you have written above, sorry. But
> > > > > > > let me explain what I think I need to do and please correct what I
> > > > > > > (still) got wrong.
> > > > > > >
> > > > > > > a) before starting speculation,
> > > > > > > a.1) read and remember current->mm->mm_lock_seq (using
> > > > > > > smp_load_acquire(), right?)
> > > > > > > a.2) read vma->vm_lock_seq (using smp_load_acquire() I presume)
> > > > > > > a.3) if vm_lock_seq is odd, we are already modifying VMA, so bail
> > > > > > > out, try with proper mmap_lock
> > > > > > > b) proceed with the inode pointer fetch and offset calculation as I've coded it
> > > > > > > c) lookup uprobe by inode+offset, if failed -- bail out (if succeeded,
> > > > > > > this could still be wrong)
> > > > > > > d) re-read vma->vm_lock_seq, if it changed, we started modifying/have
> > > > > > > already modified VMA, bail out
> > > > > > > e) re-read mm->mm_lock_seq, if that changed -- presume VMA got
> > > > > > > modified, bail out
> > > > > > >
> > > > > > > At this point we should have a guarantee that nothing about mm
> > > > > > > changed, nor that VMA started being modified during our speculative
> > > > > > > calculation+uprobe lookup. So if we found a valid uprobe, it must be a
> > > > > > > correct one that we need.
> > > > > > >
> > > > > > > Is that enough? Any holes in the approach? And thanks for thoroughly
> > > > > > > thinking about this, btw!
> > > > > >
> > > > > > Ok, with slight modifications to the details of the above (e.g., there
> > > > > > is actually no "odd means VMA is being modified" thing with
> > > > > > vm_lock_seq),
> > > > >
> > > > > Correct. Instead of that, (vma->vm_lock_seq == mm->mm_lock_seq)
> > > > > means your VMA is write-locked and is being modified.
> > > > >
> > > > > > I ended up with the implementation below. Basically we
> > > > > > validate that mm->mm_lock_seq didn't change and that vm_lock_seq !=
> > > > > > mm_lock_seq (which otherwise would mean "VMA is being modified").
> > > > >
> > > > > Validating that mm->mm_lock_seq did not change does not provide you
> > > > > with useful information. It only means that between the point where
> > > > > you recorded mm->mm_lock_seq and where you are checking it, there was
> > > > > an mmap_write_unlock() or mmap_write_downgrade() call. Your VMA might
> > > > > not have even been part of that modification for which mmap_lock was
> > > > > taken.
> > > > >
> > > > > In theory what you need is simpler (simplified code for explanation only):
> > > > >
> > > > > int vm_lock_seq = vma->vm_lock_seq;
> > > > > if (vm_lock_seq == mm->mm_lock_seq)
> > > > > goto bail_out; /* VMA is write-locked */
> > > > >
> > > > > /* copy required VMA attributes */
> > > > >
> > > > > if (vm_lock_seq != vma->vm_lock_seq)
> > > > > goto bail_out; /* VMA got write-locked */
> > > > >
> > > > > But this would require proper ACQUIRE/RELEASE semantics for
> > > > > vma->vm_lock_seq which is currently not there because all reads/writes
> > > > > to vma->vm_lock_seq that matter are done under vma->vm_lock->lock
> > > > > protection, so additional ordering is not required. If you decide to
> > > > > add that semantics for vma->vm_lock_seq, please make sure that
> > > > > pagefault path performance does not regress.
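A hedged sketch of that "in theory" version is below. The helper name vma_read_inode_speculative() is invented for illustration, it assumes vma->vm_lock_seq gains the acquire/release semantics discussed here (which it currently does not have), and the exact barriers are precisely what is under debate in this thread, so treat them as placeholders rather than a worked-out answer.

static bool vma_read_inode_speculative(struct vm_area_struct *vma,
                                       unsigned long bp_vaddr,
                                       struct inode **inode, loff_t *offset)
{
        int vm_lock_seq = smp_load_acquire(&vma->vm_lock_seq);

        /* vm_lock_seq == mm_lock_seq means the VMA is write-locked */
        if (vm_lock_seq == READ_ONCE(vma->vm_mm->mm_lock_seq))
                return false;

        if (!vma->vm_file)
                return false;

        /* copy the VMA attributes we need */
        *inode = vma->vm_file->f_inode;
        *offset = (loff_t)(vma->vm_pgoff << PAGE_SHIFT) + (bp_vaddr - vma->vm_start);

        /* did the VMA get write-locked (and possibly modified) meanwhile? */
        smp_rmb();      /* exact ordering here is the open question in this thread */
        return READ_ONCE(vma->vm_lock_seq) == vm_lock_seq;
}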
> > > > >
> > > > > > There is a possibility that vm_lock_seq == mm_lock_seq just by
> > > > > > accident, which is not a correctness problem; we'll just fall back to
> > > > > > the locked implementation until something about the VMA or the
> > > > > > mm_struct itself changes. Which is fine, and if mm folks ever change
> > > > > > this locking scheme, this might go away.
> > > > > >
> > > > > > If this seems on the right track, I think we can just move
> > > > > > mm_start_vma_speculation()/mm_end_vma_speculation() into
> > > > > > include/linux/mm.h.
> > > > > >
> > > > > > And after thinking a bit more about READ_ONCE() usage, I changed them
> > > > > > to data_race() to not trigger KCSAN warnings. Initially I kept
> > > > > > READ_ONCE() only around vma->vm_file access, but given we never change
> > > > > > it until vma is freed and reused (which would be prevented by
> > > > > > guard(rcu)), I dropped READ_ONCE() and only added data_race(). And
> > > > > > even data_race() is probably not necessary.
> > > > > >
> > > > > > Anyways, please see the patch below. Would be nice if mm folks
> > > > > > (Suren?) could confirm that this is not broken.
> > > > > >
> > > > > >
> > > > > >
> > > > > > Author: Andrii Nakryiko <andrii@kernel.org>
> > > > > > Date: Fri Aug 2 22:16:40 2024 -0700
> > > > > >
> > > > > > uprobes: add speculative lockless VMA to inode resolution
> > > > > >
> > > > > > Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
> > > > > >
> > > > > > diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
> > > > > > index 3de311c56d47..bee7a929ff02 100644
> > > > > > --- a/kernel/events/uprobes.c
> > > > > > +++ b/kernel/events/uprobes.c
> > > > > > @@ -2244,6 +2244,70 @@ static int is_trap_at_addr(struct mm_struct *mm, unsigned long vaddr)
> > > > > > return is_trap_insn(&opcode);
> > > > > > }
> > > > > >
> > > > > > +#ifdef CONFIG_PER_VMA_LOCK
> > > > > > +static inline void mm_start_vma_speculation(struct mm_struct *mm, int *mm_lock_seq)
> > > > > > +{
> > > > > > + *mm_lock_seq = smp_load_acquire(&mm->mm_lock_seq);
> > > > > > +}
> > > > > > +
> > > > > > +/* returns true if speculation was safe (no mm and vma modification happened) */
> > > > > > +static inline bool mm_end_vma_speculation(struct vm_area_struct *vma, int mm_lock_seq)
> > > > > > +{
> > > > > > + int mm_seq, vma_seq;
> > > > > > +
> > > > > > + mm_seq = smp_load_acquire(&vma->vm_mm->mm_lock_seq);
> > > > > > + vma_seq = READ_ONCE(vma->vm_lock_seq);
> > > > > > +
> > > > > > + return mm_seq == mm_lock_seq && vma_seq != mm_seq;
> > > > >
> > > > > After spending some time on this I think what you do here is
> > > > > semantically correct but sub-optimal.
> > > >
> > >
> > > Yes, requiring that mm_lock_seq doesn't change is too pessimistic, but
> > > relative to the frequency of uprobe/uretprobe triggering (and how fast
> > > the lookup is) this won't matter much. The absolute majority of uprobe
> > > lookups will succeed while none of the mm's VMAs change at all. So I
> > > felt like that's OK, at least for starters.
> > >
> > > My goal is to minimize intrusion into purely mm-related code; this
> > > whole uprobe effort is already pretty large and sprawling, and I don't
> > > want to go on another quest to change the VMA locking semantics if I
> > > don't absolutely have to :) But see below for adjusted logic based on
> > > your comments.
> > >
> > > > Actually, after staring at this code some more I think
> > > > vma->vm_lock_seq not having proper ACQUIRE/RELEASE semantics would
> > > > bite us here as well. The entire find_active_uprobe_speculative()
> > > > might be executing while mmap_lock is write-locked (so, mm_seq ==
> > > > mm_lock_seq is satisfied) and we might miss that the VMA is locked due
> > > > to vma->vm_lock_seq read/write reordering. Though it's late and I
> > > > might have missed some memory barriers which would prevent this
> > > > scenario...
> > >
> > > So, please bear with me, if it's a stupid question. But don't all
> > > locks have implicit ACQUIRE and RELEASE semantics already? At least
> > > that's my reading of Documentation/memory-barriers.txt.
> > >
> > > So with that, wouldn't it be OK to just change
> > > READ_ONCE(vma->vm_lock_seq) to smp_load_acquire(&vma->vm_lock_seq) and
> > > mitigate the issue you pointed out?
> > >
> > >
> > > So maybe something like below:
> > >
> > > rcu_read_lock()
> > >
> > > vma = find_vma(...)
> > > if (!vma) /* bail */
> > >
> > > vm_lock_seq = smp_load_acquire(&vma->vm_lock_seq);
> > > mm_lock_seq = smp_load_acquire(&vma->vm_mm->mm_lock_seq);
> > > /* I think vm_lock has to be acquired first to avoid the race */
> > > if (mm_lock_seq == vm_lock_seq)
> > > /* bail, vma is write-locked */
> > >
> > > ... perform uprobe lookup logic based on vma->vm_file->f_inode ...
> > >
> > > if (smp_load_acquire(&vma->vm_lock_seq) != vm_lock_seq)
> > > /* bail, VMA might have changed */
> > >
> > > Thoughts?
> >
> > Hi Andrii,
> > I've prepared a quick patch following Peter's suggestion in [1] to
> > make mm->mm_lock_seq a proper seqcount. I'll post it shortly as an RFC
> > so you can try it out; I think that would be a much cleaner solution.
>
> The RFC is posted at
> https://lore.kernel.org/all/20240807182325.2585582-1-surenb@google.com/.
Yep, looks good, thanks a lot! Applied locally and will be running
tests and benchmarks. If anything comes up, I'll let you know.
> With that patch you can do:
>
> bool success = false;
> int seq;
>
> if (!mmap_lock_speculation_start(mm, &seq)) /* bail out */
>
> rcu_read_lock()
> vma = find_vma(...)
> if (!vma) /* rcu_read_unlock and bail out */
> /* obtain vma->vm_file->f_inode */
> rcu_read_unlock();
>
> if (!mmap_lock_speculation_end(mm, seq)) /* bail out */
>
> > Thanks,
> > Suren.
> >
> > [1] https://lore.kernel.org/all/20240730134605.GO33588@noisy.programming.kicks-ass.net/
> >
> >
> > >
> > > >
> > > > > This check means that there was no call to
> > > > > mmap_write_unlock()/mmap_write_downgrade() since
> > > > > mm_start_vma_speculation() and the vma is not currently locked. To
> > > > > unlock a write-locked VMA you do need to call
> > > > > mmap_write_unlock()/mmap_write_downgrade(), so I think this check would
> > > > > guarantee that your vma was not locked and modified from under us.
> > > > > However this will also trigger false positives if
> > > > > mmap_write_unlock()/mmap_write_downgrade() was called but the vma you
> > > > > are using was never locked. So, it will bail out more than necessary.
> > > > > Maybe it's ok?
> > > > >
> > > > > > +}
> > > > > > +
> > >
> > > [...]
^ permalink raw reply [flat|nested] 84+ messages in thread
end of thread, other threads: [~2024-08-08 0:47 UTC | newest]
Thread overview: 84+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2024-07-08 9:12 [PATCH 00/10] perf/uprobe: Optimize uprobes Peter Zijlstra
2024-07-08 9:12 ` [PATCH 01/10] perf/uprobe: Re-indent labels Peter Zijlstra
2024-07-08 9:12 ` [PATCH 02/10] perf/uprobe: Remove spurious whitespace Peter Zijlstra
2024-07-08 9:12 ` [PATCH 03/10] rbtree: Provide rb_find_rcu() / rb_find_add_rcu() Peter Zijlstra
2024-07-10 1:29 ` Masami Hiramatsu
2024-07-10 7:55 ` Peter Zijlstra
2024-07-10 14:05 ` Masami Hiramatsu
2024-07-08 9:12 ` [PATCH 04/10] perf/uprobe: RCU-ify find_uprobe() Peter Zijlstra
2024-07-08 16:35 ` Oleg Nesterov
2024-07-08 18:08 ` Peter Zijlstra
2024-07-09 14:32 ` Oleg Nesterov
2024-07-09 15:02 ` Peter Zijlstra
2024-07-09 15:23 ` Oleg Nesterov
2024-07-08 9:12 ` [PATCH 05/10] perf/uprobe: SRCU-ify uprobe->consumer list Peter Zijlstra
2024-07-09 12:05 ` Peter Zijlstra
2024-07-09 13:33 ` Oleg Nesterov
2024-07-09 14:32 ` Peter Zijlstra
2024-07-09 15:05 ` Oleg Nesterov
2024-07-09 15:19 ` Peter Zijlstra
2024-07-09 15:34 ` Oleg Nesterov
2024-07-09 14:32 ` Peter Zijlstra
2024-07-08 9:12 ` [PATCH 06/10] perf/uprobe: Split uprobe_unregister() Peter Zijlstra
2024-07-10 1:40 ` Masami Hiramatsu
2024-07-08 9:12 ` [PATCH 07/10] perf/uprobe: Convert (some) uprobe->refcount to SRCU Peter Zijlstra
2024-07-08 9:12 ` [PATCH 08/10] srcu: Add __srcu_clone_read_lock() Peter Zijlstra
2024-07-10 5:48 ` Boqun Feng
2024-07-10 10:02 ` Peter Zijlstra
2024-07-10 12:05 ` Peter Zijlstra
2024-07-08 9:12 ` [PATCH 09/10] perf/uprobe: Convert single-step and uretprobe to SRCU Peter Zijlstra
2024-07-08 9:12 ` [PATCH 10/10] perf/uprobe: Add uretprobe timer Peter Zijlstra
2024-07-08 22:56 ` [PATCH 00/10] perf/uprobe: Optimize uprobes Masami Hiramatsu
2024-07-09 0:25 ` Andrii Nakryiko
2024-07-09 9:01 ` Peter Zijlstra
2024-07-09 14:11 ` Paul E. McKenney
2024-07-09 14:29 ` Peter Zijlstra
2024-07-09 14:36 ` Paul E. McKenney
2024-07-09 15:31 ` Peter Zijlstra
2024-07-09 15:56 ` Paul E. McKenney
2024-07-09 16:10 ` Matthew Wilcox
2024-07-09 16:30 ` Matthew Wilcox
2024-07-09 16:57 ` Paul E. McKenney
2024-07-10 9:16 ` Peter Zijlstra
2024-07-10 9:40 ` Peter Zijlstra
2024-07-22 19:09 ` Suren Baghdasaryan
2024-07-27 0:20 ` Andrii Nakryiko
2024-07-27 1:29 ` Suren Baghdasaryan
2024-07-27 3:45 ` Matthew Wilcox
2024-07-30 3:18 ` Andrii Nakryiko
2024-07-30 13:10 ` Peter Zijlstra
2024-07-30 18:10 ` Suren Baghdasaryan
2024-08-03 5:47 ` Andrii Nakryiko
2024-08-03 8:53 ` Peter Zijlstra
2024-08-04 23:22 ` Andrii Nakryiko
2024-08-06 4:08 ` Andrii Nakryiko
2024-08-06 14:50 ` Suren Baghdasaryan
2024-08-06 17:40 ` Andrii Nakryiko
2024-08-06 17:44 ` Suren Baghdasaryan
2024-08-07 1:36 ` Suren Baghdasaryan
2024-08-07 5:13 ` Suren Baghdasaryan
2024-08-07 17:49 ` Andrii Nakryiko
2024-08-07 18:04 ` Suren Baghdasaryan
2024-08-07 18:30 ` Andrii Nakryiko
2024-08-07 18:33 ` Suren Baghdasaryan
2024-08-08 0:47 ` Andrii Nakryiko
2024-07-30 13:46 ` Peter Zijlstra
2024-07-30 18:16 ` Suren Baghdasaryan
2024-07-09 16:42 ` Andrii Nakryiko
2024-07-09 9:03 ` Peter Zijlstra
2024-07-09 10:01 ` Jiri Olsa
2024-07-09 10:16 ` Peter Zijlstra
2024-07-09 22:10 ` Masami Hiramatsu
2024-07-10 10:10 ` Peter Zijlstra
2024-07-10 14:56 ` Masami Hiramatsu
2024-07-10 18:40 ` Andrii Nakryiko
2024-07-11 8:51 ` Peter Zijlstra
2024-07-11 15:17 ` Masami Hiramatsu
2024-07-11 15:22 ` Peter Zijlstra
2024-07-11 17:47 ` Steven Rostedt
2024-07-11 23:59 ` Masami Hiramatsu
2024-07-10 0:55 ` Masami Hiramatsu
2024-07-09 21:47 ` Andrii Nakryiko
2024-07-10 10:12 ` Peter Zijlstra
2024-07-10 12:34 ` Peter Zijlstra
2024-07-09 7:11 ` Peter Zijlstra