* [RFC 01/10] ftrace: Make alloc_and_copy_ftrace_hash direct friendly
2025-07-29 10:28 [RFC 00/10] ftrace,bpf: Use single direct ops for bpf trampolines Jiri Olsa
@ 2025-07-29 10:28 ` Jiri Olsa
2025-07-29 10:28 ` [RFC 02/10] ftrace: Add register_ftrace_direct_hash function Jiri Olsa
` (9 subsequent siblings)
10 siblings, 0 replies; 19+ messages in thread
From: Jiri Olsa @ 2025-07-29 10:28 UTC (permalink / raw)
To: Steven Rostedt, Florent Revest, Mark Rutland
Cc: bpf, linux-kernel, linux-trace-kernel, linux-arm-kernel,
Alexei Starovoitov, Daniel Borkmann, Andrii Nakryiko,
Menglong Dong
Make alloc_and_copy_ftrace_hash also copy the direct address
for each hash entry.
Signed-off-by: Jiri Olsa <jolsa@kernel.org>
---
kernel/trace/ftrace.c | 11 +++++++++--
1 file changed, 9 insertions(+), 2 deletions(-)
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 4203fad56b6c..5b8f565a1258 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -1190,7 +1190,7 @@ static void __add_hash_entry(struct ftrace_hash *hash,
}
static struct ftrace_func_entry *
-add_hash_entry(struct ftrace_hash *hash, unsigned long ip)
+add_hash_entry_direct(struct ftrace_hash *hash, unsigned long ip, unsigned long direct)
{
struct ftrace_func_entry *entry;
@@ -1199,11 +1199,18 @@ add_hash_entry(struct ftrace_hash *hash, unsigned long ip)
return NULL;
entry->ip = ip;
+ entry->direct = direct;
__add_hash_entry(hash, entry);
return entry;
}
+static struct ftrace_func_entry *
+add_hash_entry(struct ftrace_hash *hash, unsigned long ip)
+{
+ return add_hash_entry_direct(hash, ip, 0);
+}
+
static void
free_hash_entry(struct ftrace_hash *hash,
struct ftrace_func_entry *entry)
@@ -1376,7 +1383,7 @@ alloc_and_copy_ftrace_hash(int size_bits, struct ftrace_hash *hash)
size = 1 << hash->size_bits;
for (i = 0; i < size; i++) {
hlist_for_each_entry(entry, &hash->buckets[i], hlist) {
- if (add_hash_entry(new_hash, entry->ip) == NULL)
+ if (add_hash_entry_direct(new_hash, entry->ip, entry->direct) == NULL)
goto free_hash;
}
}
--
2.50.1
* [RFC 02/10] ftrace: Add register_ftrace_direct_hash function
2025-07-29 10:28 [RFC 00/10] ftrace,bpf: Use single direct ops for bpf trampolines Jiri Olsa
2025-07-29 10:28 ` [RFC 01/10] ftrace: Make alloc_and_copy_ftrace_hash direct friendly Jiri Olsa
@ 2025-07-29 10:28 ` Jiri Olsa
2025-07-29 10:28 ` [RFC 03/10] ftrace: Add unregister_ftrace_direct_hash function Jiri Olsa
` (8 subsequent siblings)
10 siblings, 0 replies; 19+ messages in thread
From: Jiri Olsa @ 2025-07-29 10:28 UTC (permalink / raw)
To: Steven Rostedt, Florent Revest, Mark Rutland
Cc: bpf, linux-kernel, linux-trace-kernel, linux-arm-kernel,
Alexei Starovoitov, Daniel Borkmann, Andrii Nakryiko,
Menglong Dong
Add a register_ftrace_direct_hash function that registers
all entries (ip -> direct) provided in the hash argument.
The differences from the current register_ftrace_direct are:
- the hash argument allows registering multiple ip -> direct
entries at once
- register_ftrace_direct_hash can be called multiple times on the
same ftrace_ops object, because after the first registration with
register_ftrace_function_nolock, it uses ftrace_update_ops to
update the ftrace_ops object
This change will allow us to have a simple ftrace_ops for all bpf
direct interface users in following changes.
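For illustration only (not part of the patch), batching two attachments
could look roughly like the hypothetical helper below; it assumes the ips
were already resolved via ftrace_location() and uses the hash helpers that
a later patch in this series exports:

    static struct ftrace_ops direct_ops;

    static int attach_pair(unsigned long ip1, unsigned long tramp1,
                           unsigned long ip2, unsigned long tramp2)
    {
            struct ftrace_hash *hash;
            int err;

            /* build a temporary hash holding both ip -> direct entries */
            hash = alloc_ftrace_hash(FTRACE_HASH_DEFAULT_BITS);
            if (!hash)
                    return -ENOMEM;
            if (!add_hash_entry_direct(hash, ip1, tramp1) ||
                !add_hash_entry_direct(hash, ip2, tramp2)) {
                    free_ftrace_hash(hash);
                    return -ENOMEM;
            }

            /* both entries get registered with a single call */
            err = register_ftrace_direct_hash(&direct_ops, hash);

            /* entries are copied internally, the local hash can be freed */
            free_ftrace_hash(hash);
            return err;
    }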
Signed-off-by: Jiri Olsa <jolsa@kernel.org>
---
include/linux/ftrace.h | 7 +++
kernel/trace/ftrace.c | 123 +++++++++++++++++++++++++++++++++++++++++
2 files changed, 130 insertions(+)
diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h
index b672ca15f265..e45bcc9de53b 100644
--- a/include/linux/ftrace.h
+++ b/include/linux/ftrace.h
@@ -526,6 +526,8 @@ int unregister_ftrace_direct(struct ftrace_ops *ops, unsigned long addr,
int modify_ftrace_direct(struct ftrace_ops *ops, unsigned long addr);
int modify_ftrace_direct_nolock(struct ftrace_ops *ops, unsigned long addr);
+int register_ftrace_direct_hash(struct ftrace_ops *ops, struct ftrace_hash *hash);
+
void ftrace_stub_direct_tramp(void);
#else
@@ -552,6 +554,11 @@ static inline int modify_ftrace_direct_nolock(struct ftrace_ops *ops, unsigned l
return -ENODEV;
}
+static inline int register_ftrace_direct_hash(struct ftrace_ops *ops, struct ftrace_hash *hash)
+{
+ return -ENODEV;
+}
+
/*
* This must be implemented by the architecture.
* It is the way the ftrace direct_ops helper, when called
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 5b8f565a1258..1dbb113f4e9c 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -6219,6 +6219,129 @@ int modify_ftrace_direct(struct ftrace_ops *ops, unsigned long addr)
return err;
}
EXPORT_SYMBOL_GPL(modify_ftrace_direct);
+
+static unsigned long hash_count(struct ftrace_hash *hash)
+{
+ return hash ? hash->count : 0;
+}
+
+/**
+ * hash_add - adds two struct ftrace_hash objects and returns the result
+ * @a: struct ftrace_hash object
+ * @b: struct ftrace_hash object
+ *
+ * Returns struct ftrace_hash object on success, NULL on error.
+ */
+static struct ftrace_hash *hash_add(struct ftrace_hash *a, struct ftrace_hash *b)
+{
+ struct ftrace_func_entry *entry;
+ struct ftrace_hash *add;
+ int size, i;
+
+ size = hash_count(a) + hash_count(b);
+ if (size > 32)
+ size = 32;
+
+ add = alloc_and_copy_ftrace_hash(fls(size), a);
+ if (!add)
+ goto error;
+
+ size = 1 << b->size_bits;
+ for (i = 0; i < size; i++) {
+ hlist_for_each_entry(entry, &b->buckets[i], hlist) {
+ if (add_hash_entry_direct(add, entry->ip, entry->direct) == NULL)
+ goto error;
+ }
+ }
+ return add;
+
+ error:
+ free_ftrace_hash(add);
+ return NULL;
+}
+
+static void call_direct_funcs_hash(unsigned long ip, unsigned long pip,
+ struct ftrace_ops *ops, struct ftrace_regs *fregs)
+{
+ unsigned long addr;
+
+ addr = ftrace_find_rec_direct(ip);
+ if (!addr)
+ return;
+
+ arch_ftrace_set_direct_caller(fregs, addr);
+}
+
+int register_ftrace_direct_hash(struct ftrace_ops *ops, struct ftrace_hash *hash)
+{
+ struct ftrace_hash *filter_hash = NULL, *new_hash = NULL, *free_hash = NULL;
+ struct ftrace_func_entry *entry;
+ int i, size, err = -EINVAL;
+ bool reg;
+
+ if (!hash_count(hash))
+ return 0;
+
+ mutex_lock(&direct_mutex);
+
+ /* Make sure requested entry is not already registered. */
+ size = 1 << hash->size_bits;
+ for (i = 0; i < size; i++) {
+ hlist_for_each_entry(entry, &hash->buckets[i], hlist) {
+ if (__ftrace_lookup_ip(direct_functions, entry->ip))
+ goto out_unlock;
+ }
+ }
+
+ filter_hash = ops->func_hash ? ops->func_hash->filter_hash : NULL;
+
+ /* If there's nothing in filter_hash we need to register the ops. */
+ reg = hash_count(filter_hash) == 0;
+ if (reg) {
+ if (ops->func || ops->trampoline)
+ goto out_unlock;
+ if (ops->flags & FTRACE_OPS_FL_ENABLED)
+ goto out_unlock;
+ }
+
+ filter_hash = hash_add(filter_hash, hash);
+ if (!filter_hash)
+ goto out_unlock;
+
+ new_hash = hash_add(direct_functions, hash);
+ if (!new_hash)
+ goto out_unlock;
+
+ free_hash = direct_functions;
+ rcu_assign_pointer(direct_functions, new_hash);
+ new_hash = NULL;
+
+ if (reg) {
+ ops->func = call_direct_funcs_hash;
+ ops->flags = MULTI_FLAGS;
+ ops->trampoline = FTRACE_REGS_ADDR;
+ ops->local_hash.filter_hash = filter_hash;
+
+ err = register_ftrace_function_nolock(ops);
+ if (!err)
+ filter_hash = NULL;
+ } else {
+ err = ftrace_update_ops(ops, filter_hash, EMPTY_HASH);
+ }
+
+ out_unlock:
+ mutex_unlock(&direct_mutex);
+
+ if (free_hash && free_hash != EMPTY_HASH)
+ call_rcu_tasks(&free_hash->rcu, register_ftrace_direct_cb);
+
+ if (filter_hash)
+ free_ftrace_hash(filter_hash);
+
+ return err;
+}
+EXPORT_SYMBOL_GPL(register_ftrace_direct_hash);
+
#endif /* CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS */
/**
--
2.50.1
* [RFC 03/10] ftrace: Add unregister_ftrace_direct_hash function
2025-07-29 10:28 [RFC 00/10] ftrace,bpf: Use single direct ops for bpf trampolines Jiri Olsa
2025-07-29 10:28 ` [RFC 01/10] ftrace: Make alloc_and_copy_ftrace_hash direct friendly Jiri Olsa
2025-07-29 10:28 ` [RFC 02/10] ftrace: Add register_ftrace_direct_hash function Jiri Olsa
@ 2025-07-29 10:28 ` Jiri Olsa
2025-07-29 10:28 ` [RFC 04/10] ftrace: Add modify_ftrace_direct_hash function Jiri Olsa
` (7 subsequent siblings)
10 siblings, 0 replies; 19+ messages in thread
From: Jiri Olsa @ 2025-07-29 10:28 UTC (permalink / raw)
To: Steven Rostedt, Florent Revest, Mark Rutland
Cc: bpf, linux-kernel, linux-trace-kernel, linux-arm-kernel,
Alexei Starovoitov, Daniel Borkmann, Andrii Nakryiko,
Menglong Dong
Add an unregister_ftrace_direct_hash function that unregisters
all entries (ip -> direct) provided in the hash argument.
The differences from the current unregister_ftrace_direct are:
- the hash argument allows unregistering multiple ip -> direct
entries at once
- unregister_ftrace_direct_hash can be called multiple times on the
same ftrace_ops object, because we do not need to unregister
all entries at once, we can do it gradually with the help of
the ftrace_update_ops function
This change will allow us to have a simple ftrace_ops for all bpf
direct interface users in following changes.
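Again just as an illustration (not part of the patch), a hypothetical
helper detaching a single previously registered entry could look roughly
like this; entries not listed in the hash stay attached to the same
ftrace_ops:

    static int detach_one(struct ftrace_ops *ops, unsigned long ip,
                          unsigned long tramp)
    {
            struct ftrace_hash *hash;
            int err;

            hash = alloc_ftrace_hash(FTRACE_HASH_DEFAULT_BITS);
            if (!hash || !add_hash_entry_direct(hash, ip, tramp)) {
                    free_ftrace_hash(hash);
                    return -ENOMEM;
            }

            /*
             * Removes only (ip -> tramp); if other entries remain in the
             * ops filter hash, the ops stays registered and is updated
             * via ftrace_update_ops(), otherwise it gets unregistered.
             */
            err = unregister_ftrace_direct_hash(ops, hash);
            free_ftrace_hash(hash);
            return err;
    }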
Signed-off-by: Jiri Olsa <jolsa@kernel.org>
---
include/linux/ftrace.h | 6 +++
kernel/trace/ftrace.c | 98 ++++++++++++++++++++++++++++++++++++++++++
2 files changed, 104 insertions(+)
diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h
index e45bcc9de53b..7ff6004498c0 100644
--- a/include/linux/ftrace.h
+++ b/include/linux/ftrace.h
@@ -527,6 +527,7 @@ int modify_ftrace_direct(struct ftrace_ops *ops, unsigned long addr);
int modify_ftrace_direct_nolock(struct ftrace_ops *ops, unsigned long addr);
int register_ftrace_direct_hash(struct ftrace_ops *ops, struct ftrace_hash *hash);
+int unregister_ftrace_direct_hash(struct ftrace_ops *ops, struct ftrace_hash *hash);
void ftrace_stub_direct_tramp(void);
@@ -559,6 +560,11 @@ int register_ftrace_direct_hash(struct ftrace_ops *ops, struct ftrace_hash *hash
return -ENODEV;
}
+static inline int unregister_ftrace_direct_hash(struct ftrace_ops *ops, struct ftrace_hash *hash)
+{
+ return -ENODEV;
+}
+
/*
* This must be implemented by the architecture.
* It is the way the ftrace direct_ops helper, when called
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 1dbb113f4e9c..d761237ec70f 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -6342,6 +6342,104 @@ int register_ftrace_direct_hash(struct ftrace_ops *ops, struct ftrace_hash *hash
}
EXPORT_SYMBOL_GPL(register_ftrace_direct_hash);
+/**
+ * hash_sub - subtracts @b from @a and returns the result
+ * @a: struct ftrace_hash object
+ * @b: struct ftrace_hash object
+ *
+ * Returns struct ftrace_hash object on success, NULL on error.
+ */
+static struct ftrace_hash *hash_sub(struct ftrace_hash *a, struct ftrace_hash *b)
+{
+ struct ftrace_func_entry *entry, *del;
+ struct ftrace_hash *sub;
+ int size, i;
+
+ sub = alloc_and_copy_ftrace_hash(a->size_bits, a);
+ if (!sub)
+ goto error;
+
+ size = 1 << b->size_bits;
+ for (i = 0; i < size; i++) {
+ hlist_for_each_entry(entry, &b->buckets[i], hlist) {
+ del = __ftrace_lookup_ip(sub, entry->ip);
+ if (WARN_ON_ONCE(!del))
+ goto error;
+ remove_hash_entry(sub, del);
+ kfree(del);
+ }
+ }
+ return sub;
+
+ error:
+ free_ftrace_hash(sub);
+ return NULL;
+}
+
+int unregister_ftrace_direct_hash(struct ftrace_ops *ops, struct ftrace_hash *hash)
+{
+ struct ftrace_hash *new_hash = NULL, *filter_hash = NULL, *free_hash = NULL;
+ struct ftrace_func_entry *del, *entry;
+ unsigned long size, i;
+ int err = -EINVAL;
+
+ if (!hash_count(hash))
+ return 0;
+ if (check_direct_multi(ops))
+ return -EINVAL;
+ if (!(ops->flags & FTRACE_OPS_FL_ENABLED))
+ return -EINVAL;
+ if (direct_functions == EMPTY_HASH)
+ return -EINVAL;
+
+ mutex_lock(&direct_mutex);
+
+ /* Make sure requested entries are already registered. */
+ size = 1 << hash->size_bits;
+ for (i = 0; i < size; i++) {
+ hlist_for_each_entry(entry, &hash->buckets[i], hlist) {
+ del = __ftrace_lookup_ip(direct_functions, entry->ip);
+ if (!del || del->direct != entry->direct)
+ goto out_unlock;
+ }
+ }
+
+ filter_hash = hash_sub(ops->func_hash->filter_hash, hash);
+ if (!filter_hash)
+ goto out_unlock;
+
+ new_hash = hash_sub(direct_functions, hash);
+ if (!new_hash)
+ goto out_unlock;
+
+ /* If there's nothing left, we need to unregister the ops. */
+ if (ftrace_hash_empty(filter_hash)) {
+ err = unregister_ftrace_function(ops);
+ /* cleanup for possible another register call */
+ ops->func = NULL;
+ ops->trampoline = 0;
+ ftrace_free_filter(ops);
+ ops->func_hash->filter_hash = NULL;
+ } else {
+ err = ftrace_update_ops(ops, filter_hash, EMPTY_HASH);
+ }
+
+ free_hash = direct_functions;
+ rcu_assign_pointer(direct_functions, new_hash);
+
+ out_unlock:
+ mutex_unlock(&direct_mutex);
+
+ if (free_hash && free_hash != EMPTY_HASH)
+ call_rcu_tasks(&free_hash->rcu, register_ftrace_direct_cb);
+
+ if (filter_hash)
+ free_ftrace_hash(filter_hash);
+
+ return err;
+}
+EXPORT_SYMBOL_GPL(unregister_ftrace_direct_hash);
+
#endif /* CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS */
/**
--
2.50.1
* [RFC 04/10] ftrace: Add modify_ftrace_direct_hash function
2025-07-29 10:28 [RFC 00/10] ftrace,bpf: Use single direct ops for bpf trampolines Jiri Olsa
` (2 preceding siblings ...)
2025-07-29 10:28 ` [RFC 03/10] ftrace: Add unregister_ftrace_direct_hash function Jiri Olsa
@ 2025-07-29 10:28 ` Jiri Olsa
2025-07-29 10:28 ` [RFC 05/10] ftrace: Export some of hash related functions Jiri Olsa
` (6 subsequent siblings)
10 siblings, 0 replies; 19+ messages in thread
From: Jiri Olsa @ 2025-07-29 10:28 UTC (permalink / raw)
To: Steven Rostedt, Florent Revest, Mark Rutland
Cc: bpf, linux-kernel, linux-trace-kernel, linux-arm-kernel,
Alexei Starovoitov, Daniel Borkmann, Andrii Nakryiko,
Menglong Dong
Add a modify_ftrace_direct_hash function that modifies
all entries (ip -> direct) provided in the hash argument.
The difference from the current modify_ftrace_direct is
the hash argument, which allows modifying multiple ip -> direct
entries at once.
This change will allow us to have a simple ftrace_ops for all bpf
direct interface users in following changes.
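For illustration only, a hypothetical helper redirecting one already
attached ip to a new trampoline could look roughly like this, with true
asking the function to take direct_mutex itself:

    static int retarget_one(struct ftrace_ops *ops, unsigned long ip,
                            unsigned long new_tramp)
    {
            struct ftrace_hash *hash;
            int err;

            hash = alloc_ftrace_hash(FTRACE_HASH_DEFAULT_BITS);
            if (!hash || !add_hash_entry_direct(hash, ip, new_tramp)) {
                    free_ftrace_hash(hash);
                    return -ENOMEM;
            }

            /* update the direct address for an already registered ip */
            err = modify_ftrace_direct_hash(ops, hash, true);
            free_ftrace_hash(hash);
            return err;
    }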
Signed-off-by: Jiri Olsa <jolsa@kernel.org>
---
include/linux/ftrace.h | 6 +++++
kernel/trace/ftrace.c | 58 ++++++++++++++++++++++++++++++++++++++++++
2 files changed, 64 insertions(+)
diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h
index 7ff6004498c0..8761765d9abc 100644
--- a/include/linux/ftrace.h
+++ b/include/linux/ftrace.h
@@ -528,6 +528,7 @@ int modify_ftrace_direct_nolock(struct ftrace_ops *ops, unsigned long addr);
int register_ftrace_direct_hash(struct ftrace_ops *ops, struct ftrace_hash *hash);
int unregister_ftrace_direct_hash(struct ftrace_ops *ops, struct ftrace_hash *hash);
+int modify_ftrace_direct_hash(struct ftrace_ops *ops, struct ftrace_hash *hash, bool do_direct_lock);
void ftrace_stub_direct_tramp(void);
@@ -565,6 +566,11 @@ int unregister_ftrace_direct_hash(struct ftrace_ops *ops, struct ftrace_hash *ha
return -ENODEV;
}
+static inline int modify_ftrace_direct_hash(struct ftrace_ops *ops, struct ftrace_hash *hash, bool do_direct_lock)
+{
+ return -ENODEV;
+}
+
/*
* This must be implemented by the architecture.
* It is the way the ftrace direct_ops helper, when called
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index d761237ec70f..755d5550ac44 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -6440,6 +6440,64 @@ int unregister_ftrace_direct_hash(struct ftrace_ops *ops, struct ftrace_hash *ha
}
EXPORT_SYMBOL_GPL(unregister_ftrace_direct_hash);
+int modify_ftrace_direct_hash(struct ftrace_ops *ops, struct ftrace_hash *hash, bool do_direct_lock)
+{
+ struct ftrace_func_entry *entry, *tmp;
+ static struct ftrace_ops tmp_ops = {
+ .func = ftrace_stub,
+ .flags = FTRACE_OPS_FL_STUB,
+ };
+ unsigned long size, i;
+ int err;
+
+ if (!hash_count(hash))
+ return 0;
+ if (check_direct_multi(ops))
+ return -EINVAL;
+ if (!(ops->flags & FTRACE_OPS_FL_ENABLED))
+ return -EINVAL;
+ if (direct_functions == EMPTY_HASH)
+ return -EINVAL;
+
+ if (do_direct_lock)
+ mutex_lock(&direct_mutex);
+
+ /* Enable the tmp_ops to have the same functions as the direct ops */
+ ftrace_ops_init(&tmp_ops);
+ tmp_ops.func_hash = ops->func_hash;
+
+ err = register_ftrace_function_nolock(&tmp_ops);
+ if (err)
+ goto unlock;
+
+ /*
+ * Now the ftrace_ops_list_func() is called to do the direct callers.
+ * We can safely change the direct functions attached to each entry.
+ */
+ mutex_lock(&ftrace_lock);
+
+ size = 1 << hash->size_bits;
+ for (i = 0; i < size; i++) {
+ hlist_for_each_entry(entry, &hash->buckets[i], hlist) {
+ tmp = __ftrace_lookup_ip(direct_functions, entry->ip);
+ if (!tmp)
+ continue;
+ tmp->direct = entry->direct;
+ }
+ }
+
+ mutex_unlock(&ftrace_lock);
+
+ /* Removing the tmp_ops will add the updated direct callers to the functions */
+ unregister_ftrace_function(&tmp_ops);
+
+unlock:
+ if (do_direct_lock)
+ mutex_unlock(&direct_mutex);
+ return err;
+}
+EXPORT_SYMBOL_GPL(modify_ftrace_direct_hash);
+
#endif /* CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS */
/**
--
2.50.1
* [RFC 05/10] ftrace: Export some of hash related functions
2025-07-29 10:28 [RFC 00/10] ftrace,bpf: Use single direct ops for bpf trampolines Jiri Olsa
` (3 preceding siblings ...)
2025-07-29 10:28 ` [RFC 04/10] ftrace: Add modify_ftrace_direct_hash function Jiri Olsa
@ 2025-07-29 10:28 ` Jiri Olsa
2025-07-29 10:28 ` [RFC 06/10] ftrace: Use direct hash interface in direct functions Jiri Olsa
` (5 subsequent siblings)
10 siblings, 0 replies; 19+ messages in thread
From: Jiri Olsa @ 2025-07-29 10:28 UTC (permalink / raw)
To: Steven Rostedt, Florent Revest, Mark Rutland
Cc: bpf, linux-kernel, linux-trace-kernel, linux-arm-kernel,
Alexei Starovoitov, Daniel Borkmann, Andrii Nakryiko,
Menglong Dong
We are going to use these functions in following changes.
Signed-off-by: Jiri Olsa <jolsa@kernel.org>
---
include/linux/ftrace.h | 16 ++++++++++++++++
kernel/trace/ftrace.c | 7 +++----
kernel/trace/trace.h | 8 --------
3 files changed, 19 insertions(+), 12 deletions(-)
diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h
index 8761765d9abc..9a6fcdafeda2 100644
--- a/include/linux/ftrace.h
+++ b/include/linux/ftrace.h
@@ -397,6 +397,22 @@ enum ftrace_ops_cmd {
typedef int (*ftrace_ops_func_t)(struct ftrace_ops *op, enum ftrace_ops_cmd cmd);
#ifdef CONFIG_DYNAMIC_FTRACE
+
+#define FTRACE_HASH_DEFAULT_BITS 10
+
+struct ftrace_hash {
+ unsigned long size_bits;
+ struct hlist_head *buckets;
+ unsigned long count;
+ unsigned long flags;
+ struct rcu_head rcu;
+};
+
+struct ftrace_hash *alloc_ftrace_hash(int size_bits);
+void free_ftrace_hash(struct ftrace_hash *hash);
+struct ftrace_func_entry *add_hash_entry_direct(struct ftrace_hash *hash,
+ unsigned long ip, unsigned long direct);
+
/* The hash used to know what functions callbacks trace */
struct ftrace_ops_hash {
struct ftrace_hash __rcu *notrace_hash;
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 755d5550ac44..fcb8f2d3172b 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -68,7 +68,6 @@
})
/* hash bits for specific function selection */
-#define FTRACE_HASH_DEFAULT_BITS 10
#define FTRACE_HASH_MAX_BITS 12
#ifdef CONFIG_DYNAMIC_FTRACE
@@ -1189,7 +1188,7 @@ static void __add_hash_entry(struct ftrace_hash *hash,
hash->count++;
}
-static struct ftrace_func_entry *
+struct ftrace_func_entry *
add_hash_entry_direct(struct ftrace_hash *hash, unsigned long ip, unsigned long direct)
{
struct ftrace_func_entry *entry;
@@ -1269,7 +1268,7 @@ static void clear_ftrace_mod_list(struct list_head *head)
mutex_unlock(&ftrace_lock);
}
-static void free_ftrace_hash(struct ftrace_hash *hash)
+void free_ftrace_hash(struct ftrace_hash *hash)
{
if (!hash || hash == EMPTY_HASH)
return;
@@ -1309,7 +1308,7 @@ void ftrace_free_filter(struct ftrace_ops *ops)
}
EXPORT_SYMBOL_GPL(ftrace_free_filter);
-static struct ftrace_hash *alloc_ftrace_hash(int size_bits)
+struct ftrace_hash *alloc_ftrace_hash(int size_bits)
{
struct ftrace_hash *hash;
int size;
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index bd084953a98b..74ef7755f361 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -899,14 +899,6 @@ enum {
FTRACE_HASH_FL_MOD = (1 << 0),
};
-struct ftrace_hash {
- unsigned long size_bits;
- struct hlist_head *buckets;
- unsigned long count;
- unsigned long flags;
- struct rcu_head rcu;
-};
-
struct ftrace_func_entry *
ftrace_lookup_ip(struct ftrace_hash *hash, unsigned long ip);
--
2.50.1
* [RFC 06/10] ftrace: Use direct hash interface in direct functions
2025-07-29 10:28 [RFC 00/10] ftrace,bpf: Use single direct ops for bpf trampolines Jiri Olsa
` (4 preceding siblings ...)
2025-07-29 10:28 ` [RFC 05/10] ftrace: Export some of hash related functions Jiri Olsa
@ 2025-07-29 10:28 ` Jiri Olsa
2025-07-29 10:28 ` [RFC 07/10] bpf: Add trampoline ip hash table Jiri Olsa
` (4 subsequent siblings)
10 siblings, 0 replies; 19+ messages in thread
From: Jiri Olsa @ 2025-07-29 10:28 UTC (permalink / raw)
To: Steven Rostedt, Florent Revest, Mark Rutland
Cc: bpf, linux-kernel, linux-trace-kernel, linux-arm-kernel,
Alexei Starovoitov, Daniel Borkmann, Andrii Nakryiko,
Menglong Dong
Implement the current *_ftrace_direct functions using their *_hash
counterparts.
Signed-off-by: Jiri Olsa <jolsa@kernel.org>
---
include/linux/ftrace.h | 17 +--
kernel/bpf/trampoline.c | 10 +-
kernel/trace/ftrace.c | 242 +++++-----------------------------
kernel/trace/trace_selftest.c | 5 +-
4 files changed, 45 insertions(+), 229 deletions(-)
diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h
index 9a6fcdafeda2..85f4ab1a1e72 100644
--- a/include/linux/ftrace.h
+++ b/include/linux/ftrace.h
@@ -536,11 +536,10 @@ struct ftrace_func_entry {
#ifdef CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS
unsigned long ftrace_find_rec_direct(unsigned long ip);
-int register_ftrace_direct(struct ftrace_ops *ops, unsigned long addr);
-int unregister_ftrace_direct(struct ftrace_ops *ops, unsigned long addr,
+int register_ftrace_direct(struct ftrace_ops *ops, unsigned long ip, unsigned long addr);
+int unregister_ftrace_direct(struct ftrace_ops *ops, unsigned long ip, unsigned long addr,
bool free_filters);
-int modify_ftrace_direct(struct ftrace_ops *ops, unsigned long addr);
-int modify_ftrace_direct_nolock(struct ftrace_ops *ops, unsigned long addr);
+int modify_ftrace_direct(struct ftrace_ops *ops, unsigned long ip, unsigned long addr, bool lock_direct_mutex);
int register_ftrace_direct_hash(struct ftrace_ops *ops, struct ftrace_hash *hash);
int unregister_ftrace_direct_hash(struct ftrace_ops *ops, struct ftrace_hash *hash);
@@ -554,20 +553,16 @@ static inline unsigned long ftrace_find_rec_direct(unsigned long ip)
{
return 0;
}
-static inline int register_ftrace_direct(struct ftrace_ops *ops, unsigned long addr)
+static inline int register_ftrace_direct(struct ftrace_ops *ops, unsigned long ip, unsigned long addr)
{
return -ENODEV;
}
-static inline int unregister_ftrace_direct(struct ftrace_ops *ops, unsigned long addr,
+static inline int unregister_ftrace_direct(struct ftrace_ops *ops, unsigned long ip, unsigned long addr,
bool free_filters)
{
return -ENODEV;
}
-static inline int modify_ftrace_direct(struct ftrace_ops *ops, unsigned long addr)
-{
- return -ENODEV;
-}
-static inline int modify_ftrace_direct_nolock(struct ftrace_ops *ops, unsigned long addr)
+static inline int modify_ftrace_direct(struct ftrace_ops *ops, unsigned long ip, unsigned long addr, bool lock_direct_mutex)
{
return -ENODEV;
}
diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c
index 0e364614c3a2..6bf272715f0e 100644
--- a/kernel/bpf/trampoline.c
+++ b/kernel/bpf/trampoline.c
@@ -181,7 +181,7 @@ static int unregister_fentry(struct bpf_trampoline *tr, void *old_addr)
int ret;
if (tr->func.ftrace_managed)
- ret = unregister_ftrace_direct(tr->fops, (long)old_addr, false);
+ ret = unregister_ftrace_direct(tr->fops, (unsigned long) ip, (long)old_addr, false);
else
ret = bpf_arch_text_poke(ip, BPF_MOD_CALL, old_addr, NULL);
@@ -195,10 +195,7 @@ static int modify_fentry(struct bpf_trampoline *tr, void *old_addr, void *new_ad
int ret;
if (tr->func.ftrace_managed) {
- if (lock_direct_mutex)
- ret = modify_ftrace_direct(tr->fops, (long)new_addr);
- else
- ret = modify_ftrace_direct_nolock(tr->fops, (long)new_addr);
+ ret = modify_ftrace_direct(tr->fops, (unsigned long) ip, (long)new_addr, lock_direct_mutex);
} else {
ret = bpf_arch_text_poke(ip, BPF_MOD_CALL, old_addr, new_addr);
}
@@ -220,8 +217,7 @@ static int register_fentry(struct bpf_trampoline *tr, void *new_addr)
}
if (tr->func.ftrace_managed) {
- ftrace_set_filter_ip(tr->fops, (unsigned long)ip, 0, 1);
- ret = register_ftrace_direct(tr->fops, (long)new_addr);
+ ret = register_ftrace_direct(tr->fops, (unsigned long)ip, (long)new_addr);
} else {
ret = bpf_arch_text_poke(ip, BPF_MOD_CALL, NULL, new_addr);
}
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index fcb8f2d3172b..151ca94f496a 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -2593,16 +2593,6 @@ unsigned long ftrace_find_rec_direct(unsigned long ip)
return entry->direct;
}
-static void call_direct_funcs(unsigned long ip, unsigned long pip,
- struct ftrace_ops *ops, struct ftrace_regs *fregs)
-{
- unsigned long addr = READ_ONCE(ops->direct_call);
-
- if (!addr)
- return;
-
- arch_ftrace_set_direct_caller(fregs, addr);
-}
#endif /* CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS */
/**
@@ -5935,28 +5925,24 @@ static int check_direct_multi(struct ftrace_ops *ops)
return 0;
}
-static void remove_direct_functions_hash(struct ftrace_hash *hash, unsigned long addr)
+static void register_ftrace_direct_cb(struct rcu_head *rhp)
{
- struct ftrace_func_entry *entry, *del;
- int size, i;
+ struct ftrace_hash *fhp = container_of(rhp, struct ftrace_hash, rcu);
- size = 1 << hash->size_bits;
- for (i = 0; i < size; i++) {
- hlist_for_each_entry(entry, &hash->buckets[i], hlist) {
- del = __ftrace_lookup_ip(direct_functions, entry->ip);
- if (del && del->direct == addr) {
- remove_hash_entry(direct_functions, del);
- kfree(del);
- }
- }
- }
+ free_ftrace_hash(fhp);
}
-static void register_ftrace_direct_cb(struct rcu_head *rhp)
+static struct ftrace_hash *hash_from_ip(unsigned long ip, unsigned long addr)
{
- struct ftrace_hash *fhp = container_of(rhp, struct ftrace_hash, rcu);
+ struct ftrace_hash *hash;
- free_ftrace_hash(fhp);
+ ip = ftrace_location(ip);
+ if (!ip)
+ return NULL;
+ hash = alloc_ftrace_hash(FTRACE_HASH_DEFAULT_BITS);
+ if (!hash || !add_hash_entry_direct(hash, ip, addr))
+ return NULL;
+ return hash;
}
/**
@@ -5981,89 +5967,17 @@ static void register_ftrace_direct_cb(struct rcu_head *rhp)
* -ENODEV - @ip does not point to a ftrace nop location (or not supported)
* -ENOMEM - There was an allocation failure.
*/
-int register_ftrace_direct(struct ftrace_ops *ops, unsigned long addr)
+int register_ftrace_direct(struct ftrace_ops *ops, unsigned long ip, unsigned long addr)
{
- struct ftrace_hash *hash, *new_hash = NULL, *free_hash = NULL;
- struct ftrace_func_entry *entry, *new;
- int err = -EBUSY, size, i;
-
- if (ops->func || ops->trampoline)
- return -EINVAL;
- if (!(ops->flags & FTRACE_OPS_FL_INITIALIZED))
- return -EINVAL;
- if (ops->flags & FTRACE_OPS_FL_ENABLED)
- return -EINVAL;
-
- hash = ops->func_hash->filter_hash;
- if (ftrace_hash_empty(hash))
- return -EINVAL;
-
- mutex_lock(&direct_mutex);
-
- /* Make sure requested entries are not already registered.. */
- size = 1 << hash->size_bits;
- for (i = 0; i < size; i++) {
- hlist_for_each_entry(entry, &hash->buckets[i], hlist) {
- if (ftrace_find_rec_direct(entry->ip))
- goto out_unlock;
- }
- }
-
- err = -ENOMEM;
-
- /* Make a copy hash to place the new and the old entries in */
- size = hash->count + direct_functions->count;
- size = fls(size);
- if (size > FTRACE_HASH_MAX_BITS)
- size = FTRACE_HASH_MAX_BITS;
- new_hash = alloc_ftrace_hash(size);
- if (!new_hash)
- goto out_unlock;
-
- /* Now copy over the existing direct entries */
- size = 1 << direct_functions->size_bits;
- for (i = 0; i < size; i++) {
- hlist_for_each_entry(entry, &direct_functions->buckets[i], hlist) {
- new = add_hash_entry(new_hash, entry->ip);
- if (!new)
- goto out_unlock;
- new->direct = entry->direct;
- }
- }
-
- /* ... and add the new entries */
- size = 1 << hash->size_bits;
- for (i = 0; i < size; i++) {
- hlist_for_each_entry(entry, &hash->buckets[i], hlist) {
- new = add_hash_entry(new_hash, entry->ip);
- if (!new)
- goto out_unlock;
- /* Update both the copy and the hash entry */
- new->direct = addr;
- entry->direct = addr;
- }
- }
-
- free_hash = direct_functions;
- rcu_assign_pointer(direct_functions, new_hash);
- new_hash = NULL;
-
- ops->func = call_direct_funcs;
- ops->flags = MULTI_FLAGS;
- ops->trampoline = FTRACE_REGS_ADDR;
- ops->direct_call = addr;
-
- err = register_ftrace_function_nolock(ops);
-
- out_unlock:
- mutex_unlock(&direct_mutex);
-
- if (free_hash && free_hash != EMPTY_HASH)
- call_rcu_tasks(&free_hash->rcu, register_ftrace_direct_cb);
+ struct ftrace_hash *hash;
+ int err;
- if (new_hash)
- free_ftrace_hash(new_hash);
+ hash = hash_from_ip(ip, addr);
+ if (!hash)
+ return -ENOMEM;
+ err = register_ftrace_direct_hash(ops, hash);
+ free_ftrace_hash(hash);
return err;
}
EXPORT_SYMBOL_GPL(register_ftrace_direct);
@@ -6083,111 +5997,24 @@ EXPORT_SYMBOL_GPL(register_ftrace_direct);
* 0 on success
* -EINVAL - The @ops object was not properly registered.
*/
-int unregister_ftrace_direct(struct ftrace_ops *ops, unsigned long addr,
+int unregister_ftrace_direct(struct ftrace_ops *ops, unsigned long ip, unsigned long addr,
bool free_filters)
{
- struct ftrace_hash *hash = ops->func_hash->filter_hash;
+ struct ftrace_hash *hash;
int err;
- if (check_direct_multi(ops))
- return -EINVAL;
- if (!(ops->flags & FTRACE_OPS_FL_ENABLED))
- return -EINVAL;
-
- mutex_lock(&direct_mutex);
- err = unregister_ftrace_function(ops);
- remove_direct_functions_hash(hash, addr);
- mutex_unlock(&direct_mutex);
-
- /* cleanup for possible another register call */
- ops->func = NULL;
- ops->trampoline = 0;
+ hash = hash_from_ip(ip, addr);
+ if (!hash)
+ return -ENOMEM;
+ err = unregister_ftrace_direct_hash(ops, hash);
+ free_ftrace_hash(hash);
if (free_filters)
ftrace_free_filter(ops);
return err;
}
EXPORT_SYMBOL_GPL(unregister_ftrace_direct);
-static int
-__modify_ftrace_direct(struct ftrace_ops *ops, unsigned long addr)
-{
- struct ftrace_hash *hash;
- struct ftrace_func_entry *entry, *iter;
- static struct ftrace_ops tmp_ops = {
- .func = ftrace_stub,
- .flags = FTRACE_OPS_FL_STUB,
- };
- int i, size;
- int err;
-
- lockdep_assert_held_once(&direct_mutex);
-
- /* Enable the tmp_ops to have the same functions as the direct ops */
- ftrace_ops_init(&tmp_ops);
- tmp_ops.func_hash = ops->func_hash;
- tmp_ops.direct_call = addr;
-
- err = register_ftrace_function_nolock(&tmp_ops);
- if (err)
- return err;
-
- /*
- * Now the ftrace_ops_list_func() is called to do the direct callers.
- * We can safely change the direct functions attached to each entry.
- */
- mutex_lock(&ftrace_lock);
-
- hash = ops->func_hash->filter_hash;
- size = 1 << hash->size_bits;
- for (i = 0; i < size; i++) {
- hlist_for_each_entry(iter, &hash->buckets[i], hlist) {
- entry = __ftrace_lookup_ip(direct_functions, iter->ip);
- if (!entry)
- continue;
- entry->direct = addr;
- }
- }
- /* Prevent store tearing if a trampoline concurrently accesses the value */
- WRITE_ONCE(ops->direct_call, addr);
-
- mutex_unlock(&ftrace_lock);
-
- /* Removing the tmp_ops will add the updated direct callers to the functions */
- unregister_ftrace_function(&tmp_ops);
-
- return err;
-}
-
-/**
- * modify_ftrace_direct_nolock - Modify an existing direct 'multi' call
- * to call something else
- * @ops: The address of the struct ftrace_ops object
- * @addr: The address of the new trampoline to call at @ops functions
- *
- * This is used to unregister currently registered direct caller and
- * register new one @addr on functions registered in @ops object.
- *
- * Note there's window between ftrace_shutdown and ftrace_startup calls
- * where there will be no callbacks called.
- *
- * Caller should already have direct_mutex locked, so we don't lock
- * direct_mutex here.
- *
- * Returns: zero on success. Non zero on error, which includes:
- * -EINVAL - The @ops object was not properly registered.
- */
-int modify_ftrace_direct_nolock(struct ftrace_ops *ops, unsigned long addr)
-{
- if (check_direct_multi(ops))
- return -EINVAL;
- if (!(ops->flags & FTRACE_OPS_FL_ENABLED))
- return -EINVAL;
-
- return __modify_ftrace_direct(ops, addr);
-}
-EXPORT_SYMBOL_GPL(modify_ftrace_direct_nolock);
-
/**
* modify_ftrace_direct - Modify an existing direct 'multi' call
* to call something else
@@ -6203,18 +6030,17 @@ EXPORT_SYMBOL_GPL(modify_ftrace_direct_nolock);
* Returns: zero on success. Non zero on error, which includes:
* -EINVAL - The @ops object was not properly registered.
*/
-int modify_ftrace_direct(struct ftrace_ops *ops, unsigned long addr)
+int modify_ftrace_direct(struct ftrace_ops *ops, unsigned long ip, unsigned long addr, bool lock_direct_mutex)
{
+ struct ftrace_hash *hash;
int err;
- if (check_direct_multi(ops))
- return -EINVAL;
- if (!(ops->flags & FTRACE_OPS_FL_ENABLED))
- return -EINVAL;
+ hash = hash_from_ip(ip, addr);
+ if (!hash)
+ return -ENOMEM;
- mutex_lock(&direct_mutex);
- err = __modify_ftrace_direct(ops, addr);
- mutex_unlock(&direct_mutex);
+ err = modify_ftrace_direct_hash(ops, hash, lock_direct_mutex);
+ free_ftrace_hash(hash);
return err;
}
EXPORT_SYMBOL_GPL(modify_ftrace_direct);
diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c
index d88c44f1dfa5..37f5eb1f252b 100644
--- a/kernel/trace/trace_selftest.c
+++ b/kernel/trace/trace_selftest.c
@@ -1135,8 +1135,7 @@ trace_selftest_startup_function_graph(struct tracer *trace,
* Register direct function together with graph tracer
* and make sure we get graph trace.
*/
- ftrace_set_filter_ip(&direct, (unsigned long)DYN_FTRACE_TEST_NAME, 0, 0);
- ret = register_ftrace_direct(&direct,
+ ret = register_ftrace_direct(&direct, (unsigned long)DYN_FTRACE_TEST_NAME,
(unsigned long)ftrace_stub_direct_tramp);
if (ret)
goto out;
@@ -1159,7 +1158,7 @@ trace_selftest_startup_function_graph(struct tracer *trace,
unregister_ftrace_graph(&fgraph_ops);
- ret = unregister_ftrace_direct(&direct,
+ ret = unregister_ftrace_direct(&direct, (unsigned long)DYN_FTRACE_TEST_NAME,
(unsigned long)ftrace_stub_direct_tramp,
true);
if (ret)
--
2.50.1
* [RFC 07/10] bpf: Add trampoline ip hash table
2025-07-29 10:28 [RFC 00/10] ftrace,bpf: Use single direct ops for bpf trampolines Jiri Olsa
` (5 preceding siblings ...)
2025-07-29 10:28 ` [RFC 06/10] ftrace: Use direct hash interface in direct functions Jiri Olsa
@ 2025-07-29 10:28 ` Jiri Olsa
2025-07-29 10:28 ` [RFC 08/10] ftrace: Factor ftrace_ops ops_func interface Jiri Olsa
` (3 subsequent siblings)
10 siblings, 0 replies; 19+ messages in thread
From: Jiri Olsa @ 2025-07-29 10:28 UTC (permalink / raw)
To: Steven Rostedt, Florent Revest, Mark Rutland
Cc: bpf, linux-kernel, linux-trace-kernel, linux-arm-kernel,
Alexei Starovoitov, Daniel Borkmann, Andrii Nakryiko,
Menglong Dong
Following changes need to look up the trampoline based on its ip address,
so add a hash table for that.
Signed-off-by: Jiri Olsa <jolsa@kernel.org>
---
include/linux/bpf.h | 7 +++++--
kernel/bpf/trampoline.c | 30 +++++++++++++++++++-----------
2 files changed, 24 insertions(+), 13 deletions(-)
diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index f9cd2164ed23..c14bde400d97 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -1253,14 +1253,17 @@ struct bpf_tramp_image {
};
struct bpf_trampoline {
- /* hlist for trampoline_table */
- struct hlist_node hlist;
+ /* hlist for trampoline_key_table */
+ struct hlist_node hlist_key;
+ /* hlist for trampoline_ip_table */
+ struct hlist_node hlist_ip;
struct ftrace_ops *fops;
/* serializes access to fields of this trampoline */
struct mutex mutex;
refcount_t refcnt;
u32 flags;
u64 key;
+ unsigned long ip;
struct {
struct btf_func_model model;
void *addr;
diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c
index 6bf272715f0e..84bcd9f6bd74 100644
--- a/kernel/bpf/trampoline.c
+++ b/kernel/bpf/trampoline.c
@@ -24,9 +24,10 @@ const struct bpf_prog_ops bpf_extension_prog_ops = {
#define TRAMPOLINE_HASH_BITS 10
#define TRAMPOLINE_TABLE_SIZE (1 << TRAMPOLINE_HASH_BITS)
-static struct hlist_head trampoline_table[TRAMPOLINE_TABLE_SIZE];
+static struct hlist_head trampoline_key_table[TRAMPOLINE_TABLE_SIZE];
+static struct hlist_head trampoline_ip_table[TRAMPOLINE_TABLE_SIZE];
-/* serializes access to trampoline_table */
+/* serializes access to trampoline tables */
static DEFINE_MUTEX(trampoline_mutex);
#ifdef CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS
@@ -135,15 +136,15 @@ void bpf_image_ksym_del(struct bpf_ksym *ksym)
PAGE_SIZE, true, ksym->name);
}
-static struct bpf_trampoline *bpf_trampoline_lookup(u64 key)
+static struct bpf_trampoline *bpf_trampoline_lookup(u64 key, unsigned long ip)
{
struct bpf_trampoline *tr;
struct hlist_head *head;
int i;
mutex_lock(&trampoline_mutex);
- head = &trampoline_table[hash_64(key, TRAMPOLINE_HASH_BITS)];
- hlist_for_each_entry(tr, head, hlist) {
+ head = &trampoline_key_table[hash_64(key, TRAMPOLINE_HASH_BITS)];
+ hlist_for_each_entry(tr, head, hlist_key) {
if (tr->key == key) {
refcount_inc(&tr->refcnt);
goto out;
@@ -164,8 +165,12 @@ static struct bpf_trampoline *bpf_trampoline_lookup(u64 key)
#endif
tr->key = key;
- INIT_HLIST_NODE(&tr->hlist);
- hlist_add_head(&tr->hlist, head);
+ tr->ip = ip;
+ INIT_HLIST_NODE(&tr->hlist_key);
+ INIT_HLIST_NODE(&tr->hlist_ip);
+ hlist_add_head(&tr->hlist_key, head);
+ head = &trampoline_ip_table[hash_64(ip, TRAMPOLINE_HASH_BITS)];
+ hlist_add_head(&tr->hlist_ip, head);
refcount_set(&tr->refcnt, 1);
mutex_init(&tr->mutex);
for (i = 0; i < BPF_TRAMP_MAX; i++)
@@ -800,7 +805,7 @@ void bpf_trampoline_unlink_cgroup_shim(struct bpf_prog *prog)
prog->aux->attach_btf_id);
bpf_lsm_find_cgroup_shim(prog, &bpf_func);
- tr = bpf_trampoline_lookup(key);
+ tr = bpf_trampoline_lookup(key, 0);
if (WARN_ON_ONCE(!tr))
return;
@@ -820,7 +825,7 @@ struct bpf_trampoline *bpf_trampoline_get(u64 key,
{
struct bpf_trampoline *tr;
- tr = bpf_trampoline_lookup(key);
+ tr = bpf_trampoline_lookup(key, tgt_info->tgt_addr);
if (!tr)
return NULL;
@@ -856,7 +861,8 @@ void bpf_trampoline_put(struct bpf_trampoline *tr)
* fexit progs. The fentry-only trampoline will be freed via
* multiple rcu callbacks.
*/
- hlist_del(&tr->hlist);
+ hlist_del(&tr->hlist_key);
+ hlist_del(&tr->hlist_ip);
if (tr->fops) {
ftrace_free_filter(tr->fops);
kfree(tr->fops);
@@ -1135,7 +1141,9 @@ static int __init init_trampolines(void)
int i;
for (i = 0; i < TRAMPOLINE_TABLE_SIZE; i++)
- INIT_HLIST_HEAD(&trampoline_table[i]);
+ INIT_HLIST_HEAD(&trampoline_key_table[i]);
+ for (i = 0; i < TRAMPOLINE_TABLE_SIZE; i++)
+ INIT_HLIST_HEAD(&trampoline_ip_table[i]);
return 0;
}
late_initcall(init_trampolines);
--
2.50.1
* [RFC 08/10] ftrace: Factor ftrace_ops ops_func interface
2025-07-29 10:28 [RFC 00/10] ftrace,bpf: Use single direct ops for bpf trampolines Jiri Olsa
` (6 preceding siblings ...)
2025-07-29 10:28 ` [RFC 07/10] bpf: Add trampoline ip hash table Jiri Olsa
@ 2025-07-29 10:28 ` Jiri Olsa
2025-07-29 10:28 ` [RFC 09/10] bpf: Remove ftrace_ops from bpf_trampoline object Jiri Olsa
` (2 subsequent siblings)
10 siblings, 0 replies; 19+ messages in thread
From: Jiri Olsa @ 2025-07-29 10:28 UTC (permalink / raw)
To: Steven Rostedt, Florent Revest, Mark Rutland
Cc: bpf, linux-kernel, linux-trace-kernel, linux-arm-kernel,
Alexei Starovoitov, Daniel Borkmann, Andrii Nakryiko,
Menglong Dong
We are going to remove the "ftrace_ops->private == bpf_trampoline" setup
in following changes.
Add an ip argument to the ftrace_ops_func_t callback function, so we can
use it to look up the trampoline.
Signed-off-by: Jiri Olsa <jolsa@kernel.org>
---
include/linux/ftrace.h | 2 +-
kernel/bpf/trampoline.c | 26 ++++++++++++++++++++++++--
kernel/trace/ftrace.c | 6 +++---
3 files changed, 28 insertions(+), 6 deletions(-)
diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h
index 85f4ab1a1e72..1a61f969550d 100644
--- a/include/linux/ftrace.h
+++ b/include/linux/ftrace.h
@@ -394,7 +394,7 @@ enum ftrace_ops_cmd {
* Negative on failure. The return value is dependent on the
* callback.
*/
-typedef int (*ftrace_ops_func_t)(struct ftrace_ops *op, enum ftrace_ops_cmd cmd);
+typedef int (*ftrace_ops_func_t)(struct ftrace_ops *op, unsigned long ip, enum ftrace_ops_cmd cmd);
#ifdef CONFIG_DYNAMIC_FTRACE
diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c
index 84bcd9f6bd74..398c1a722d83 100644
--- a/kernel/bpf/trampoline.c
+++ b/kernel/bpf/trampoline.c
@@ -33,11 +33,33 @@ static DEFINE_MUTEX(trampoline_mutex);
#ifdef CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS
static int bpf_trampoline_update(struct bpf_trampoline *tr, bool lock_direct_mutex);
-static int bpf_tramp_ftrace_ops_func(struct ftrace_ops *ops, enum ftrace_ops_cmd cmd)
+static struct bpf_trampoline *bpf_trampoline_ip_lookup(unsigned long ip)
{
- struct bpf_trampoline *tr = ops->private;
+ struct hlist_head *head_ip;
+ struct bpf_trampoline *tr;
+
+ mutex_lock(&trampoline_mutex);
+ head_ip = &trampoline_ip_table[hash_64(ip, TRAMPOLINE_HASH_BITS)];
+ hlist_for_each_entry(tr, head_ip, hlist_ip) {
+ if (tr->func.addr == (void *) ip)
+ goto out;
+ }
+ tr = NULL;
+out:
+ mutex_unlock(&trampoline_mutex);
+ return tr;
+}
+
+static int bpf_tramp_ftrace_ops_func(struct ftrace_ops *ops, unsigned long ip,
+ enum ftrace_ops_cmd cmd)
+{
+ struct bpf_trampoline *tr;
int ret = 0;
+ tr = bpf_trampoline_ip_lookup(ip);
+ if (!tr)
+ return -EINVAL;
+
if (cmd == FTRACE_OPS_CMD_ENABLE_SHARE_IPMODIFY_SELF) {
/* This is called inside register_ftrace_direct_multi(), so
* tr->mutex is already locked.
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 151ca94f496a..943feabdd5e6 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -2040,7 +2040,7 @@ static int __ftrace_hash_update_ipmodify(struct ftrace_ops *ops,
*/
if (!ops->ops_func)
return -EBUSY;
- ret = ops->ops_func(ops, FTRACE_OPS_CMD_ENABLE_SHARE_IPMODIFY_SELF);
+ ret = ops->ops_func(ops, rec->ip, FTRACE_OPS_CMD_ENABLE_SHARE_IPMODIFY_SELF);
if (ret)
return ret;
} else if (is_ipmodify) {
@@ -8746,7 +8746,7 @@ static int prepare_direct_functions_for_ipmodify(struct ftrace_ops *ops)
if (!op->ops_func)
return -EBUSY;
- ret = op->ops_func(op, FTRACE_OPS_CMD_ENABLE_SHARE_IPMODIFY_PEER);
+ ret = op->ops_func(op, ip, FTRACE_OPS_CMD_ENABLE_SHARE_IPMODIFY_PEER);
if (ret)
return ret;
}
@@ -8793,7 +8793,7 @@ static void cleanup_direct_functions_after_ipmodify(struct ftrace_ops *ops)
/* The cleanup is optional, ignore any errors */
if (found_op && op->ops_func)
- op->ops_func(op, FTRACE_OPS_CMD_DISABLE_SHARE_IPMODIFY_PEER);
+ op->ops_func(op, ip, FTRACE_OPS_CMD_DISABLE_SHARE_IPMODIFY_PEER);
}
}
mutex_unlock(&direct_mutex);
--
2.50.1
* [RFC 09/10] bpf: Remove ftrace_ops from bpf_trampoline object
2025-07-29 10:28 [RFC 00/10] ftrace,bpf: Use single direct ops for bpf trampolines Jiri Olsa
` (7 preceding siblings ...)
2025-07-29 10:28 ` [RFC 08/10] ftrace: Factor ftrace_ops ops_func interface Jiri Olsa
@ 2025-07-29 10:28 ` Jiri Olsa
2025-07-29 10:28 ` [RFC 10/10] Revert "ftrace: Store direct called addresses in their ops" Jiri Olsa
2025-07-29 17:57 ` [RFC 00/10] ftrace,bpf: Use single direct ops for bpf trampolines Mark Rutland
10 siblings, 0 replies; 19+ messages in thread
From: Jiri Olsa @ 2025-07-29 10:28 UTC (permalink / raw)
To: Steven Rostedt, Florent Revest, Mark Rutland
Cc: bpf, linux-kernel, linux-trace-kernel, linux-arm-kernel,
Alexei Starovoitov, Daniel Borkmann, Andrii Nakryiko,
Menglong Dong
We no longer need a ftrace_ops in each bpf_trampoline object;
we can manage with just a single ftrace_ops for all direct
trampoline attachments.
Signed-off-by: Jiri Olsa <jolsa@kernel.org>
---
include/linux/bpf.h | 1 -
kernel/bpf/trampoline.c | 34 ++++++++++------------------------
2 files changed, 10 insertions(+), 25 deletions(-)
diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index c14bde400d97..bad29fe38a12 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -1257,7 +1257,6 @@ struct bpf_trampoline {
struct hlist_node hlist_key;
/* hlist for trampoline_ip_table */
struct hlist_node hlist_ip;
- struct ftrace_ops *fops;
/* serializes access to fields of this trampoline */
struct mutex mutex;
refcount_t refcnt;
diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c
index 398c1a722d83..e6a0e7b20bb6 100644
--- a/kernel/bpf/trampoline.c
+++ b/kernel/bpf/trampoline.c
@@ -175,16 +175,6 @@ static struct bpf_trampoline *bpf_trampoline_lookup(u64 key, unsigned long ip)
tr = kzalloc(sizeof(*tr), GFP_KERNEL);
if (!tr)
goto out;
-#ifdef CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS
- tr->fops = kzalloc(sizeof(struct ftrace_ops), GFP_KERNEL);
- if (!tr->fops) {
- kfree(tr);
- tr = NULL;
- goto out;
- }
- tr->fops->private = tr;
- tr->fops->ops_func = bpf_tramp_ftrace_ops_func;
-#endif
tr->key = key;
tr->ip = ip;
@@ -202,13 +192,19 @@ static struct bpf_trampoline *bpf_trampoline_lookup(u64 key, unsigned long ip)
return tr;
}
+struct ftrace_ops direct_ops = {
+#ifdef CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS
+ .ops_func = bpf_tramp_ftrace_ops_func,
+#endif
+};
+
static int unregister_fentry(struct bpf_trampoline *tr, void *old_addr)
{
void *ip = tr->func.addr;
int ret;
if (tr->func.ftrace_managed)
- ret = unregister_ftrace_direct(tr->fops, (unsigned long) ip, (long)old_addr, false);
+ ret = unregister_ftrace_direct(&direct_ops, (unsigned long) ip, (long)old_addr, false);
else
ret = bpf_arch_text_poke(ip, BPF_MOD_CALL, old_addr, NULL);
@@ -222,7 +218,7 @@ static int modify_fentry(struct bpf_trampoline *tr, void *old_addr, void *new_ad
int ret;
if (tr->func.ftrace_managed) {
- ret = modify_ftrace_direct(tr->fops, (unsigned long) ip, (long)new_addr, lock_direct_mutex);
+ ret = modify_ftrace_direct(&direct_ops, (unsigned long) ip, (long)new_addr, lock_direct_mutex);
} else {
ret = bpf_arch_text_poke(ip, BPF_MOD_CALL, old_addr, new_addr);
}
@@ -237,14 +233,11 @@ static int register_fentry(struct bpf_trampoline *tr, void *new_addr)
int ret;
faddr = ftrace_location((unsigned long)ip);
- if (faddr) {
- if (!tr->fops)
- return -ENOTSUPP;
+ if (faddr)
tr->func.ftrace_managed = true;
- }
if (tr->func.ftrace_managed) {
- ret = register_ftrace_direct(tr->fops, (unsigned long)ip, (long)new_addr);
+ ret = register_ftrace_direct(&direct_ops, (unsigned long)ip, (long)new_addr);
} else {
ret = bpf_arch_text_poke(ip, BPF_MOD_CALL, NULL, new_addr);
}
@@ -502,9 +495,6 @@ static int bpf_trampoline_update(struct bpf_trampoline *tr, bool lock_direct_mut
* BPF_TRAMP_F_SHARE_IPMODIFY is set, we can generate the
* trampoline again, and retry register.
*/
- /* reset fops->func and fops->trampoline for re-register */
- tr->fops->func = NULL;
- tr->fops->trampoline = 0;
/* free im memory and reallocate later */
bpf_tramp_image_free(im);
@@ -885,10 +875,6 @@ void bpf_trampoline_put(struct bpf_trampoline *tr)
*/
hlist_del(&tr->hlist_key);
hlist_del(&tr->hlist_ip);
- if (tr->fops) {
- ftrace_free_filter(tr->fops);
- kfree(tr->fops);
- }
kfree(tr);
out:
mutex_unlock(&trampoline_mutex);
--
2.50.1
* [RFC 10/10] Revert "ftrace: Store direct called addresses in their ops"
2025-07-29 10:28 [RFC 00/10] ftrace,bpf: Use single direct ops for bpf trampolines Jiri Olsa
` (8 preceding siblings ...)
2025-07-29 10:28 ` [RFC 09/10] bpf: Remove ftrace_ops from bpf_trampoline object Jiri Olsa
@ 2025-07-29 10:28 ` Jiri Olsa
2025-07-29 17:57 ` [RFC 00/10] ftrace,bpf: Use single direct ops for bpf trampolines Mark Rutland
10 siblings, 0 replies; 19+ messages in thread
From: Jiri Olsa @ 2025-07-29 10:28 UTC (permalink / raw)
To: Steven Rostedt, Florent Revest, Mark Rutland
Cc: bpf, linux-kernel, linux-trace-kernel, linux-arm-kernel,
Alexei Starovoitov, Daniel Borkmann, Andrii Nakryiko,
Menglong Dong
This reverts commit dbaccb618fabde8b8596e341f8d76da63a9b0c2f.
Current code uses the ip address to look up the trampoline and we need the
ops to point to multiple trampolines, hence this is no longer needed.
TODO: this probably breaks arm.
Signed-off-by: Jiri Olsa <jolsa@kernel.org>
---
include/linux/ftrace.h | 3 ---
1 file changed, 3 deletions(-)
diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h
index 1a61f969550d..27b26a87231c 100644
--- a/include/linux/ftrace.h
+++ b/include/linux/ftrace.h
@@ -457,9 +457,6 @@ struct ftrace_ops {
struct list_head subop_list;
ftrace_ops_func_t ops_func;
struct ftrace_ops *managed;
-#ifdef CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS
- unsigned long direct_call;
-#endif
#endif
};
--
2.50.1
* Re: [RFC 00/10] ftrace,bpf: Use single direct ops for bpf trampolines
2025-07-29 10:28 [RFC 00/10] ftrace,bpf: Use single direct ops for bpf trampolines Jiri Olsa
` (9 preceding siblings ...)
2025-07-29 10:28 ` [RFC 10/10] Revert "ftrace: Store direct called addresses in their ops" Jiri Olsa
@ 2025-07-29 17:57 ` Mark Rutland
2025-07-30 11:19 ` Jiri Olsa
10 siblings, 1 reply; 19+ messages in thread
From: Mark Rutland @ 2025-07-29 17:57 UTC (permalink / raw)
To: Jiri Olsa
Cc: Steven Rostedt, Florent Revest, bpf, linux-kernel,
linux-trace-kernel, linux-arm-kernel, Alexei Starovoitov,
Daniel Borkmann, Andrii Nakryiko, Menglong Dong, Naveen N Rao,
Michael Ellerman, Björn Töpel, Andy Chiu,
Alexandre Ghiti, Palmer Dabbelt
Hi Jiri,
[adding some powerpc and riscv folk, see below]
On Tue, Jul 29, 2025 at 12:28:03PM +0200, Jiri Olsa wrote:
> hi,
> while poking the multi-tracing interface I ended up with just one
> ftrace_ops object to attach all trampolines.
>
> This change allows to use less direct API calls during the attachment
> changes in the future code, so in effect speeding up the attachment.
How important is that, and what sort of speedup does this result in? I
ask due to potential performance hits noted below, and I'm lacking
context as to why we want to do this in the first place -- what is this
intended to enable/improve?
> However having just single ftrace_ops object removes direct_call
> field from ftrace_ops, which is needed by arm, so I'm not sure
> it's the right path forward.
It's also needed by powerpc and riscv since commits:
a52f6043a2238d65 ("powerpc/ftrace: Add support for DYNAMIC_FTRACE_WITH_DIRECT_CALLS")
b21cdb9523e5561b ("riscv: ftrace: support direct call using call_ops")
> Mark, Florent,
> any idea how hard would it be to for arm to get rid of direct_call field?
For architectures which follow the arm64 style of implementation, it's
pretty hard to get rid of it without introducing a performance hit to
the call and/or a hit to attachment/detachment/modification. It would
also end up being a fair amount more complicated.
There's some historical rationale at:
https://lore.kernel.org/lkml/ZfBbxPDd0rz6FN2T@FVFF77S0Q05N/
... but the gist is that for several reasons we want the ops pointer in
the callsite, and for atomic modification of this to switch everything
dependent on that ops atomically, as this keeps the call logic and
attachment/detachment/modification logic simple and pretty fast.
If we remove the direct_call pointer from the ftrace_ops, then IIUC our
options include:
* Point the callsite pointer at some intermediate structure which points
to the ops (e.g. the dyn_ftrace for the callsite). That introduces an
additional dependent load per call that needs the ops, and introduces
potential incoherency with other fields in that structure, requiring
more synchronization overhead for attachment/detachment/modification.
* Point the callsite pointer at a trampoline which can generate the ops
pointer. This requires that every ops has a trampoline even for
non-direct usage, which then requires more memory / I$, has more
potential failure points, and is generally more complicated. The
performance here will vary by architecture and platform, on some this
might be faster, on some it might be slower.
Note that we probably still need to bounce through an intermediary
trampoline here to actually load from the callsite pointer and
indirectly branch to it.
... but I'm not really keen on either unless we really have to remove
the ftrace_ops::direct_call field, since they come with a substantial
jump in complexity.
Mark.
>
> thougts? thanks,
> jirka
>
>
> ---
> Jiri Olsa (10):
> ftrace: Make alloc_and_copy_ftrace_hash direct friendly
> ftrace: Add register_ftrace_direct_hash function
> ftrace: Add unregister_ftrace_direct_hash function
> ftrace: Add modify_ftrace_direct_hash function
> ftrace: Export some of hash related functions
> ftrace: Use direct hash interface in direct functions
> bpf: Add trampoline ip hash table
> ftrace: Factor ftrace_ops ops_func interface
> bpf: Remove ftrace_ops from bpf_trampoline object
> Revert "ftrace: Store direct called addresses in their ops"
>
> include/linux/bpf.h | 8 +-
> include/linux/ftrace.h | 51 ++++++++++---
> kernel/bpf/trampoline.c | 94 +++++++++++++-----------
> kernel/trace/ftrace.c | 481 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-----------------------------------------------
> kernel/trace/trace.h | 8 --
> kernel/trace/trace_selftest.c | 5 +-
> 6 files changed, 395 insertions(+), 252 deletions(-)
* Re: [RFC 00/10] ftrace,bpf: Use single direct ops for bpf trampolines
2025-07-29 17:57 ` [RFC 00/10] ftrace,bpf: Use single direct ops for bpf trampolines Mark Rutland
@ 2025-07-30 11:19 ` Jiri Olsa
2025-07-30 13:56 ` Steven Rostedt
2025-08-01 9:49 ` Mark Rutland
0 siblings, 2 replies; 19+ messages in thread
From: Jiri Olsa @ 2025-07-30 11:19 UTC (permalink / raw)
To: Mark Rutland
Cc: Steven Rostedt, Florent Revest, bpf, linux-kernel,
linux-trace-kernel, linux-arm-kernel, Alexei Starovoitov,
Daniel Borkmann, Andrii Nakryiko, Menglong Dong, Naveen N Rao,
Michael Ellerman, Björn Töpel, Andy Chiu,
Alexandre Ghiti, Palmer Dabbelt
On Tue, Jul 29, 2025 at 06:57:40PM +0100, Mark Rutland wrote:
> Hi Jiri,
>
> [adding some powerpc and riscv folk, see below]
>
> On Tue, Jul 29, 2025 at 12:28:03PM +0200, Jiri Olsa wrote:
> > hi,
> > while poking the multi-tracing interface I ended up with just one
> > ftrace_ops object to attach all trampolines.
> >
> > This change allows to use less direct API calls during the attachment
> > changes in the future code, so in effect speeding up the attachment.
>
> How important is that, and what sort of speedup does this result in? I
> ask due to potential performance hits noted below, and I'm lacking
> context as to why we want to do this in the first place -- what is this
> intended to enable/improve?
so it's all work at the PoC stage, the idea is to be able to attach many
(like 20,30,40k) functions to their trampolines quickly, which at the
moment is slow because all the involved interfaces work with just a single
function/trampoline relation
there's ongoing development by Menglong [1] to organize such attachment
for multiple functions and trampolines, but still at the end we have to use
ftrace direct interface to do the attachment for each involved ftrace_ops
so at the point of attachment it helps to have as few ftrace_ops objects
as possible, in my test code I ended up with just a single ftrace_ops object
and I see attachment time for 20k functions of around 3 seconds
IIUC Menglong's change needs 12 ftrace_ops objects so we need to do around
12 ftrace direct API calls .. so probably not that bad, but still
it would be faster with just a single ftrace_ops involved
[1] https://lore.kernel.org/bpf/20250703121521.1874196-1-dongml2@chinatelecom.cn/
>
> > However having just single ftrace_ops object removes direct_call
> > field from ftrace_ops, which is needed by arm, so I'm not sure
> > it's the right path forward.
>
> It's also needed by powerpc and riscv since commits:
>
> a52f6043a2238d65 ("powerpc/ftrace: Add support for DYNAMIC_FTRACE_WITH_DIRECT_CALLS")
> b21cdb9523e5561b ("riscv: ftrace: support direct call using call_ops")
>
> > Mark, Florent,
> > any idea how hard would it be to for arm to get rid of direct_call field?
>
> For architectures which follow the arm64 style of implementation, it's
> pretty hard to get rid of it without introducing a performance hit to
> the call and/or a hit to attachment/detachment/modification. It would
> also end up being a fair amount more complicated.
>
> There's some historical rationale at:
>
> https://lore.kernel.org/lkml/ZfBbxPDd0rz6FN2T@FVFF77S0Q05N/
>
> ... but the gist is that for several reasons we want the ops pointer in
> the callsite, and for atomic modification of this to switch everything
> dependent on that ops atomically, as this keeps the call logic and
> attachment/detachment/modification logic simple and pretty fast.
>
> If we remove the direct_call pointer from the ftrace_ops, then IIUC our
> options include:
>
> * Point the callsite pointer at some intermediate structure which points
> to the ops (e.g. the dyn_ftrace for the callsite). That introduces an
> additional dependent load per call that needs the ops, and introduces
> potential incoherency with other fields in that structure, requiring
> more synchronization overhead for attachment/detachment/modification.
>
> * Point the callsite pointer at a trampoline which can generate the ops
> pointer. This requires that every ops has a trampoline even for
> non-direct usage, which then requires more memory / I$, has more
> potential failure points, and is generally more complicated. The
> performance here will vary by architecture and platform, on some this
> might be faster, on some it might be slower.
>
> Note that we probably still need to bounce through an intermediary
> trampoline here to actually load from the callsite pointer and
> indirectly branch to it.
>
> ... but I'm not really keen on either unless we really have to remove
> the ftrace_ops::direct_call field, since they come with a substantial
> jump in complexity.
ok, that sounds bad.. thanks for the details
Steven, please correct me if/when I'm wrong ;-)
IIUC on x86_64, IF there's just a single ftrace_ops defined for the function,
it will bypass the ftrace trampoline and call the direct trampoline
for the function directly, like:
<foo>:
call direct_trampoline
...
IF there are other ftrace_ops 'users' on the same function, we execute
each of them like:
<foo>:
call ftrace_trampoline
call ftrace_ops_1->func
call ftrace_ops_2->func
...
with our direct ftrace_ops->func currently using ftrace_ops->direct_call
to return the direct trampoline for the function:
-static void call_direct_funcs(unsigned long ip, unsigned long pip,
- struct ftrace_ops *ops, struct ftrace_regs *fregs)
-{
- unsigned long addr = READ_ONCE(ops->direct_call);
-
- if (!addr)
- return;
-
- arch_ftrace_set_direct_caller(fregs, addr);
-}
in the new changes it will do a hash lookup (based on ip) for the direct
trampoline we want to execute:
+static void call_direct_funcs_hash(unsigned long ip, unsigned long pip,
+ struct ftrace_ops *ops, struct ftrace_regs *fregs)
+{
+ unsigned long addr;
+
+ addr = ftrace_find_rec_direct(ip);
+ if (!addr)
+ return;
+
+ arch_ftrace_set_direct_caller(fregs, addr);
+}
still this is the slow path for the case where multiple ftrace_ops objects use
the same function.. for the fast path we have the direct attachment as described above
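for reference, the per-ip lookup above boils down to a search in the global
direct_functions hash; a minimal sketch of what it amounts to (simplified,
the real ftrace_find_rec_direct() in kernel/trace/ftrace.c also has
locking/RCU rules omitted here):
  unsigned long ftrace_find_rec_direct(unsigned long ip)
  {
          struct ftrace_func_entry *entry;
          /* direct_functions is the global ip -> direct trampoline hash */
          entry = __ftrace_lookup_ip(direct_functions, ip);
          if (!entry)
                  return 0;
          return entry->direct;
  }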
sorry I probably forgot/missed the discussion on this, but doing the fast path
like on x86_64 is not an option on arm, right?
thanks,
jirka
^ permalink raw reply [flat|nested] 19+ messages in thread
* Re: [RFC 00/10] ftrace,bpf: Use single direct ops for bpf trampolines
2025-07-30 11:19 ` Jiri Olsa
@ 2025-07-30 13:56 ` Steven Rostedt
2025-07-31 20:40 ` Jiri Olsa
2025-08-01 9:49 ` Mark Rutland
1 sibling, 1 reply; 19+ messages in thread
From: Steven Rostedt @ 2025-07-30 13:56 UTC (permalink / raw)
To: Jiri Olsa
Cc: Mark Rutland, Steven Rostedt, Florent Revest, bpf, linux-kernel,
linux-trace-kernel, linux-arm-kernel, Alexei Starovoitov,
Daniel Borkmann, Andrii Nakryiko, Menglong Dong, Naveen N Rao,
Michael Ellerman, Björn Töpel, Andy Chiu,
Alexandre Ghiti, Palmer Dabbelt
On Wed, 30 Jul 2025 13:19:51 +0200
Jiri Olsa <olsajiri@gmail.com> wrote:
> so it's all work on PoC stage, the idea is to be able to attach many
> (like 20,30,40k) functions to their trampolines quickly, which at the
> moment is slow because all the involved interfaces work with just single
> function/tracempoline relation
Sounds like you are reinventing the ftrace mechanism itself. Which I warned
against when I first introduced direct trampolines, which were purposely
designed to do a few functions, not thousands. But, oh well.
> Steven, please correct me if/when I'm wrong ;-)
>
> IIUC in x86_64, IF there's just single ftrace_ops defined for the function,
> it will bypass ftrace trampoline and call directly the direct trampoline
> for the function, like:
>
> <foo>:
> call direct_trampoline
> ...
Yes.
And it will also do the same for normal ftrace functions. If you have:
struct ftrace_ops my_ops = {
.func = myfunc,
};
It will create a trampoline that has:
<tramp>
...
call myfunc
...
ret
On x86, I believe the ftrace_ops for myfunc is added to the trampoline,
whereas on arm, it's part of the function header. Modifying it requires
converting to the list operation (which ignores the ops parameter); then
the ops at the function gets changed before it goes to the new function.
And if it is the only ops attached to a function foo, the function foo
would have:
<foo>
call tramp
...
But what's nice about this is that if you have 12 different ftrace_ops that
each attach to 1000 different functions, but no two ftrace_ops attach to
the same function, they all do the above. No hash needed!
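(For reference, a minimal sketch of how an ops like the my_ops above gets
wired up; myfunc and some_func are illustrative names, only
ftrace_set_filter_ip() and register_ftrace_function() are the real APIs:)
  /* callback matching the signature ftrace expects for ops->func */
  static void myfunc(unsigned long ip, unsigned long parent_ip,
                     struct ftrace_ops *ops, struct ftrace_regs *fregs)
  {
          /* runs for every function the ops is filtered on */
  }
  /* filter on a single function, then register the ops */
  ftrace_set_filter_ip(&my_ops, (unsigned long)some_func, 0, 0);
  register_ftrace_function(&my_ops);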
>
> IF there are other ftrace_ops 'users' on the same function, we execute
> each of them like:
>
> <foo>:
> call ftrace_trampoline
> call ftrace_ops_1->func
> call ftrace_ops_2->func
> ...
>
> with our direct ftrace_ops->func currently using ftrace_ops->direct_call
> to return direct trampoline for the function:
>
> -static void call_direct_funcs(unsigned long ip, unsigned long pip,
> - struct ftrace_ops *ops, struct ftrace_regs *fregs)
> -{
> - unsigned long addr = READ_ONCE(ops->direct_call);
> -
> - if (!addr)
> - return;
> -
> - arch_ftrace_set_direct_caller(fregs, addr);
> -}
>
> in the new changes it will do hash lookup (based on ip) for the direct
> trampoline we want to execute:
>
> +static void call_direct_funcs_hash(unsigned long ip, unsigned long pip,
> + struct ftrace_ops *ops, struct ftrace_regs *fregs)
> +{
> + unsigned long addr;
> +
> + addr = ftrace_find_rec_direct(ip);
> + if (!addr)
> + return;
> +
> + arch_ftrace_set_direct_caller(fregs, addr);
> +}
I think the above will work.
>
> still this is the slow path for the case where multiple ftrace_ops objects use
> same function.. for the fast path we have the direct attachment as described above
>
> sorry I probably forgot/missed discussion on this, but doing the fast path like in
> x86_64 is not an option in arm, right?
That's a question for Mark, right?
-- Steve
^ permalink raw reply [flat|nested] 19+ messages in thread
* Re: [RFC 00/10] ftrace,bpf: Use single direct ops for bpf trampolines
2025-07-30 13:56 ` Steven Rostedt
@ 2025-07-31 20:40 ` Jiri Olsa
0 siblings, 0 replies; 19+ messages in thread
From: Jiri Olsa @ 2025-07-31 20:40 UTC (permalink / raw)
To: Steven Rostedt
Cc: Jiri Olsa, Mark Rutland, Steven Rostedt, Florent Revest, bpf,
linux-kernel, linux-trace-kernel, linux-arm-kernel,
Alexei Starovoitov, Daniel Borkmann, Andrii Nakryiko,
Menglong Dong, Naveen N Rao, Michael Ellerman,
Björn Töpel, Andy Chiu, Alexandre Ghiti, Palmer Dabbelt
On Wed, Jul 30, 2025 at 09:56:41AM -0400, Steven Rostedt wrote:
> On Wed, 30 Jul 2025 13:19:51 +0200
> Jiri Olsa <olsajiri@gmail.com> wrote:
>
> > so it's all work on PoC stage, the idea is to be able to attach many
> > (like 20,30,40k) functions to their trampolines quickly, which at the
> > moment is slow because all the involved interfaces work with just single
> > function/tracempoline relation
>
> Sounds like you are reinventing the ftrace mechanism itself. Which I warned
> against when I first introduced direct trampolines, which were purposely
> designed to do a few functions, not thousands. But, oh well.
>
>
> > Steven, please correct me if/when I'm wrong ;-)
> >
> > IIUC in x86_64, IF there's just single ftrace_ops defined for the function,
> > it will bypass ftrace trampoline and call directly the direct trampoline
> > for the function, like:
> >
> > <foo>:
> > call direct_trampoline
> > ...
>
> Yes.
>
> And it will also do the same for normal ftrace functions. If you have:
>
> struct ftrace_ops {
> .func = myfunc;
> };
>
> It will create a trampoline that has:
>
> <tramp>
> ...
> call myfunc
> ...
> ret
>
> On x86, I believe the ftrace_ops for myfunc is added to the trampoline,
> where as in arm, it's part of the function header. To modify it, it
> requires converting to the list operation (which ignores the ops
> parameter), then the ops at the function gets changed before it goes to the
> new function.
>
> And if it is the only ops attached to a function foo, the function foo
> would have:
>
> <foo>
> call tramp
> ...
>
> But what's nice about this is that if you have 12 different ftrace_ops that
> each attach to a 1000 different functions, but no two ftrace_ops attach to
> the same function, they all do the above. No hash needed!
>
> >
> > IF there are other ftrace_ops 'users' on the same function, we execute
> > each of them like:
> >
> > <foo>:
> > call ftrace_trampoline
> > call ftrace_ops_1->func
> > call ftrace_ops_2->func
> > ...
> >
> > with our direct ftrace_ops->func currently using ftrace_ops->direct_call
> > to return direct trampoline for the function:
> >
> > -static void call_direct_funcs(unsigned long ip, unsigned long pip,
> > - struct ftrace_ops *ops, struct ftrace_regs *fregs)
> > -{
> > - unsigned long addr = READ_ONCE(ops->direct_call);
> > -
> > - if (!addr)
> > - return;
> > -
> > - arch_ftrace_set_direct_caller(fregs, addr);
> > -}
> >
> > in the new changes it will do hash lookup (based on ip) for the direct
> > trampoline we want to execute:
> >
> > +static void call_direct_funcs_hash(unsigned long ip, unsigned long pip,
> > + struct ftrace_ops *ops, struct ftrace_regs *fregs)
> > +{
> > + unsigned long addr;
> > +
> > + addr = ftrace_find_rec_direct(ip);
> > + if (!addr)
> > + return;
> > +
> > + arch_ftrace_set_direct_caller(fregs, addr);
> > +}
>
> I think the above will work.
>
> >
> > still this is the slow path for the case where multiple ftrace_ops objects use
> > same function.. for the fast path we have the direct attachment as described above
> >
> > sorry I probably forgot/missed discussion on this, but doing the fast path like in
> > x86_64 is not an option in arm, right?
>
> That's a question for Mark, right?
yes, thanks for the other details
jirka
^ permalink raw reply [flat|nested] 19+ messages in thread
* Re: [RFC 00/10] ftrace,bpf: Use single direct ops for bpf trampolines
2025-07-30 11:19 ` Jiri Olsa
2025-07-30 13:56 ` Steven Rostedt
@ 2025-08-01 9:49 ` Mark Rutland
2025-08-02 21:26 ` Jiri Olsa
1 sibling, 1 reply; 19+ messages in thread
From: Mark Rutland @ 2025-08-01 9:49 UTC (permalink / raw)
To: Jiri Olsa
Cc: Steven Rostedt, Florent Revest, bpf, linux-kernel,
linux-trace-kernel, linux-arm-kernel, Alexei Starovoitov,
Daniel Borkmann, Andrii Nakryiko, Menglong Dong, Naveen N Rao,
Michael Ellerman, Björn Töpel, Andy Chiu,
Alexandre Ghiti, Palmer Dabbelt
On Wed, Jul 30, 2025 at 01:19:51PM +0200, Jiri Olsa wrote:
> On Tue, Jul 29, 2025 at 06:57:40PM +0100, Mark Rutland wrote:
> > Hi Jiri,
> >
> > [adding some powerpc and riscv folk, see below]
> >
> > On Tue, Jul 29, 2025 at 12:28:03PM +0200, Jiri Olsa wrote:
> > > hi,
> > > while poking the multi-tracing interface I ended up with just one
> > > ftrace_ops object to attach all trampolines.
> > >
> > > This change allows to use less direct API calls during the attachment
> > > changes in the future code, so in effect speeding up the attachment.
> >
> > How important is that, and what sort of speedup does this result in? I
> > ask due to potential performance hits noted below, and I'm lacking
> > context as to why we want to do this in the first place -- what is this
> > intended to enable/improve?
>
> so it's all work on PoC stage, the idea is to be able to attach many
> (like 20,30,40k) functions to their trampolines quickly, which at the
> moment is slow because all the involved interfaces work with just single
> function/tracempoline relation
Do you know which aspect of that is slow? e.g. is that because you have
to update each ftrace_ops independently, and pay the synchronization
overhead per-ops?
I ask because it might be possible to do some more batching there, at
least for architectures like arm64 that use the CALL_OPS approach.
> there's ongoing development by Menglong [1] to organize such attachment
> for multiple functions and trampolines, but still at the end we have to use
> ftrace direct interface to do the attachment for each involved ftrace_ops
>
> so at the point of attachment it helps to have as few ftrace_ops objects
> as possible, in my test code I ended up with just single ftrace_ops object
> and I see attachment time for 20k functions to be around 3 seconds
>
> IIUC Menglong's change needs 12 ftrace_ops objects so we need to do around
> 12 direct ftrace_ops direct calls .. so probably not that bad, but still
> it would be faster with just single ftrace_ops involved
>
> [1] https://lore.kernel.org/bpf/20250703121521.1874196-1-dongml2@chinatelecom.cn/
>
> >
> > > However having just single ftrace_ops object removes direct_call
> > > field from direct_call, which is needed by arm, so I'm not sure
> > > it's the right path forward.
> >
> > It's also needed by powerpc and riscv since commits:
> >
> > a52f6043a2238d65 ("powerpc/ftrace: Add support for DYNAMIC_FTRACE_WITH_DIRECT_CALLS")
> > b21cdb9523e5561b ("riscv: ftrace: support direct call using call_ops")
> >
> > > Mark, Florent,
> > > any idea how hard would it be to for arm to get rid of direct_call field?
> >
> > For architectures which follow the arm64 style of implementation, it's
> > pretty hard to get rid of it without introducing a performance hit to
> > the call and/or a hit to attachment/detachment/modification. It would
> > also end up being a fair amount more complicated.
> >
> > There's some historical rationale at:
> >
> > https://lore.kernel.org/lkml/ZfBbxPDd0rz6FN2T@FVFF77S0Q05N/
> >
> > ... but the gist is that for several reasons we want the ops pointer in
> > the callsite, and for atomic modification of this to switch everything
> > dependent on that ops atomically, as this keeps the call logic and
> > attachment/detachment/modification logic simple and pretty fast.
> >
> > If we remove the direct_call pointer from the ftrace_ops, then IIUC our
> > options include:
> >
> > * Point the callsite pointer at some intermediate structure which points
> > to the ops (e.g. the dyn_ftrace for the callsite). That introduces an
> > additional dependent load per call that needs the ops, and introduces
> > potential incoherency with other fields in that structure, requiring
> > more synchronization overhead for attachment/detachment/modification.
> >
> > * Point the callsite pointer at a trampoline which can generate the ops
> > pointer. This requires that every ops has a trampoline even for
> > non-direct usage, which then requires more memory / I$, has more
> > potential failure points, and is generally more complicated. The
> > performance here will vary by architecture and platform, on some this
> > might be faster, on some it might be slower.
> >
> > Note that we probably still need to bounce through an intermediary
> > trampoline here to actually load from the callsite pointer and
> > indirectly branch to it.
> >
> > ... but I'm not really keen on either unless we really have to remove
> > the ftrace_ops::direct_call field, since they come with a substantial
> > jump in complexity.
>
> ok, that sounds bad.. thanks for the details
>
> Steven, please correct me if/when I'm wrong ;-)
>
> IIUC in x86_64, IF there's just single ftrace_ops defined for the function,
> it will bypass ftrace trampoline and call directly the direct trampoline
> for the function, like:
>
> <foo>:
> call direct_trampoline
> ...
More details at the end of this reply; arm64 can sometimes do this, but
not always, and even when there's a single ftrace_ops we may need to
bounce through a common trampoline (which can still be cheap).
> IF there are other ftrace_ops 'users' on the same function, we execute
> each of them like:
>
> <foo>:
> call ftrace_trampoline
> call ftrace_ops_1->func
> call ftrace_ops_2->func
> ...
More details at the end of this reply; arm64 does essentially the same
thing via the ftrace_list_ops and ftrace_ops_list_func().
> with our direct ftrace_ops->func currently using ftrace_ops->direct_call
> to return direct trampoline for the function:
>
> -static void call_direct_funcs(unsigned long ip, unsigned long pip,
> - struct ftrace_ops *ops, struct ftrace_regs *fregs)
> -{
> - unsigned long addr = READ_ONCE(ops->direct_call);
> -
> - if (!addr)
> - return;
> -
> - arch_ftrace_set_direct_caller(fregs, addr);
> -}
More details at the end of this reply; at present, when an instrumented
function has a single ops, arm64 can call ops->direct_call directly from
the common trampoline, and only needs to fall back to
call_direct_funcs() when there are multiple ops.
> in the new changes it will do hash lookup (based on ip) for the direct
> trampoline we want to execute:
>
> +static void call_direct_funcs_hash(unsigned long ip, unsigned long pip,
> + struct ftrace_ops *ops, struct ftrace_regs *fregs)
> +{
> + unsigned long addr;
> +
> + addr = ftrace_find_rec_direct(ip);
> + if (!addr)
> + return;
> +
> + arch_ftrace_set_direct_caller(fregs, addr);
> +}
>
> still this is the slow path for the case where multiple ftrace_ops objects use
> same function.. for the fast path we have the direct attachment as described above
>
> sorry I probably forgot/missed discussion on this, but doing the fast path like in
> x86_64 is not an option in arm, right?
On arm64 we have a fast path, BUT branch range limitations mean that we
cannot always branch directly from the instrumented function to the
direct func with a single branch instruction. We use ops->direct_call to
handle that case within a common trampoline, which is significantly
cheaper than iterating over the ops and/or looking up the direct func
from a hash.
With CALL_OPS, we place a pointer to the ops immediately before the
instrumented function, and have the instrumented function branch to a
common trampoline which can load that pointer (and can then branch to
any direct func as necessary).
The instrumented function looks like:
# Aligned to 8 bytes
func - 8:
< pointer to ops >
func:
BTI // optional
MOV X9, LR // save original return address
NOP // patched to `BL ftrace_caller`
func_body:
... and then in ftrace_caller we can recover the 'ops' pointer with:
BIC <tmp>, LR, 0x7 // align down (skips BTI)
LDR <ops>, [<tmp>, #-16] // load ops pointer
LDR <direct>, [<ops>, #FTRACE_OPS_DIRECT_CALL] // load ops->direct_call
CBNZ <direct>, ftrace_caller_direct // if !NULL, make direct call
< fall through to non-direct func case here >
Having the ops (and ops->direct_call) means that getting to the direct
func is significantly cheaper than having to look up the direct func via
the hash.
Where an instrumented function has a single ops, this can get to the
direct func with a low constant overhead, significantly cheaper than
looking up the direct func via the hash.
Where an instrumented function has multiple ops, the ops pointer is
pointed at a common ftrace_list_ops, where ftrace_ops_list_func()
iterates over all the other relevant ops.
Mark.
^ permalink raw reply [flat|nested] 19+ messages in thread
* Re: [RFC 00/10] ftrace,bpf: Use single direct ops for bpf trampolines
2025-08-01 9:49 ` Mark Rutland
@ 2025-08-02 21:26 ` Jiri Olsa
2025-08-06 10:20 ` Mark Rutland
0 siblings, 1 reply; 19+ messages in thread
From: Jiri Olsa @ 2025-08-02 21:26 UTC (permalink / raw)
To: Mark Rutland
Cc: Jiri Olsa, Steven Rostedt, Florent Revest, bpf, linux-kernel,
linux-trace-kernel, linux-arm-kernel, Alexei Starovoitov,
Daniel Borkmann, Andrii Nakryiko, Menglong Dong, Naveen N Rao,
Michael Ellerman, Björn Töpel, Andy Chiu,
Alexandre Ghiti, Palmer Dabbelt
On Fri, Aug 01, 2025 at 10:49:56AM +0100, Mark Rutland wrote:
> On Wed, Jul 30, 2025 at 01:19:51PM +0200, Jiri Olsa wrote:
> > On Tue, Jul 29, 2025 at 06:57:40PM +0100, Mark Rutland wrote:
> > > Hi Jiri,
> > >
> > > [adding some powerpc and riscv folk, see below]
> > >
> > > On Tue, Jul 29, 2025 at 12:28:03PM +0200, Jiri Olsa wrote:
> > > > hi,
> > > > while poking the multi-tracing interface I ended up with just one
> > > > ftrace_ops object to attach all trampolines.
> > > >
> > > > This change allows to use less direct API calls during the attachment
> > > > changes in the future code, so in effect speeding up the attachment.
> > >
> > > How important is that, and what sort of speedup does this result in? I
> > > ask due to potential performance hits noted below, and I'm lacking
> > > context as to why we want to do this in the first place -- what is this
> > > intended to enable/improve?
> >
> > so it's all work on PoC stage, the idea is to be able to attach many
> > (like 20,30,40k) functions to their trampolines quickly, which at the
> > moment is slow because all the involved interfaces work with just single
> > function/tracempoline relation
>
> Do you know which aspect of that is slow? e.g. is that becuase you have
> to update each ftrace_ops independently, and pay the synchronization
> overhead per-ops?
>
> I ask because it might be possible to do some more batching there, at
> least for architectures like arm64 that use the CALL_OPS approach.
IIRC it's the rcu sync in register_ftrace_direct and ftrace_shutdown
I'll try to profile that case again, there might have been changes
since the last time we did that
>
> > there's ongoing development by Menglong [1] to organize such attachment
> > for multiple functions and trampolines, but still at the end we have to use
> > ftrace direct interface to do the attachment for each involved ftrace_ops
> >
> > so at the point of attachment it helps to have as few ftrace_ops objects
> > as possible, in my test code I ended up with just single ftrace_ops object
> > and I see attachment time for 20k functions to be around 3 seconds
> >
> > IIUC Menglong's change needs 12 ftrace_ops objects so we need to do around
> > 12 direct ftrace_ops direct calls .. so probably not that bad, but still
> > it would be faster with just single ftrace_ops involved
> >
> > [1] https://lore.kernel.org/bpf/20250703121521.1874196-1-dongml2@chinatelecom.cn/
> >
> > >
> > > > However having just single ftrace_ops object removes direct_call
> > > > field from direct_call, which is needed by arm, so I'm not sure
> > > > it's the right path forward.
> > >
> > > It's also needed by powerpc and riscv since commits:
> > >
> > > a52f6043a2238d65 ("powerpc/ftrace: Add support for DYNAMIC_FTRACE_WITH_DIRECT_CALLS")
> > > b21cdb9523e5561b ("riscv: ftrace: support direct call using call_ops")
> > >
> > > > Mark, Florent,
> > > > any idea how hard would it be to for arm to get rid of direct_call field?
> > >
> > > For architectures which follow the arm64 style of implementation, it's
> > > pretty hard to get rid of it without introducing a performance hit to
> > > the call and/or a hit to attachment/detachment/modification. It would
> > > also end up being a fair amount more complicated.
> > >
> > > There's some historical rationale at:
> > >
> > > https://lore.kernel.org/lkml/ZfBbxPDd0rz6FN2T@FVFF77S0Q05N/
> > >
> > > ... but the gist is that for several reasons we want the ops pointer in
> > > the callsite, and for atomic modification of this to switch everything
> > > dependent on that ops atomically, as this keeps the call logic and
> > > attachment/detachment/modification logic simple and pretty fast.
> > >
> > > If we remove the direct_call pointer from the ftrace_ops, then IIUC our
> > > options include:
> > >
> > > * Point the callsite pointer at some intermediate structure which points
> > > to the ops (e.g. the dyn_ftrace for the callsite). That introduces an
> > > additional dependent load per call that needs the ops, and introduces
> > > potential incoherency with other fields in that structure, requiring
> > > more synchronization overhead for attachment/detachment/modification.
> > >
> > > * Point the callsite pointer at a trampoline which can generate the ops
> > > pointer. This requires that every ops has a trampoline even for
> > > non-direct usage, which then requires more memory / I$, has more
> > > potential failure points, and is generally more complicated. The
> > > performance here will vary by architecture and platform, on some this
> > > might be faster, on some it might be slower.
> > >
> > > Note that we probably still need to bounce through an intermediary
> > > trampoline here to actually load from the callsite pointer and
> > > indirectly branch to it.
> > >
> > > ... but I'm not really keen on either unless we really have to remove
> > > the ftrace_ops::direct_call field, since they come with a substantial
> > > jump in complexity.
> >
> > ok, that sounds bad.. thanks for the details
> >
> > Steven, please correct me if/when I'm wrong ;-)
> >
> > IIUC in x86_64, IF there's just single ftrace_ops defined for the function,
> > it will bypass ftrace trampoline and call directly the direct trampoline
> > for the function, like:
> >
> > <foo>:
> > call direct_trampoline
> > ...
>
> More details at the end of this reply; arm64 can sometimes do this, but
> not always, and even when there's a single ftrace_ops we may need to
> bounce through a common trampoline (which can still be cheap).
>
> > IF there are other ftrace_ops 'users' on the same function, we execute
> > each of them like:
> >
> > <foo>:
> > call ftrace_trampoline
> > call ftrace_ops_1->func
> > call ftrace_ops_2->func
> > ...
>
> More details at the end of this reply; arm64 does essentially the same
> thing via the ftrace_list_ops and ftrace_ops_list_func().
>
> > with our direct ftrace_ops->func currently using ftrace_ops->direct_call
> > to return direct trampoline for the function:
> >
> > -static void call_direct_funcs(unsigned long ip, unsigned long pip,
> > - struct ftrace_ops *ops, struct ftrace_regs *fregs)
> > -{
> > - unsigned long addr = READ_ONCE(ops->direct_call);
> > -
> > - if (!addr)
> > - return;
> > -
> > - arch_ftrace_set_direct_caller(fregs, addr);
> > -}
>
> More details at the end of this reply; at present, when an instrumented
> function has a single ops, arm64 can call ops->direct_call directly from
> the common trampoline, and only needs to fall back to
> call_direct_funcs() when there are multiple ops.
>
> > in the new changes it will do hash lookup (based on ip) for the direct
> > trampoline we want to execute:
> >
> > +static void call_direct_funcs_hash(unsigned long ip, unsigned long pip,
> > + struct ftrace_ops *ops, struct ftrace_regs *fregs)
> > +{
> > + unsigned long addr;
> > +
> > + addr = ftrace_find_rec_direct(ip);
> > + if (!addr)
> > + return;
> > +
> > + arch_ftrace_set_direct_caller(fregs, addr);
> > +}
> >
> > still this is the slow path for the case where multiple ftrace_ops objects use
> > same function.. for the fast path we have the direct attachment as described above
> >
> > sorry I probably forgot/missed discussion on this, but doing the fast path like in
> > x86_64 is not an option in arm, right?
>
> On arm64 we have a fast path, BUT branch range limitations means that we
> cannot always branch directly from the instrumented function to the
> direct func with a single branch instruction. We use ops->direct_call to
> handle that case within a common trampoline, which is significantly
> cheaper that iterating over the ops and/or looking up the direct func
> from a hash.
>
> With CALL_OPS, we place a pointer to the ops immediately before the
> instrumented function, and have the instrumented function branch to a
> common trampoline which can load that pointer (and can then branch to
> any direct func as necessary).
>
> The instrumented function looks like:
>
> # Aligned to 8 bytes
> func - 8:
> < pointer to ops >
stupid question.. so there's an ftrace_ops pointer stored for each function at
`func - 8`? why not store the func's direct trampoline address in there?
> func:
> BTI // optional
> MOV X9, LR // save original return address
> NOP // patched to `BL ftrace_caller`
> func_body:
>
> ... and then in ftrace_caller we can recover the 'ops' pointer with:
>
> BIC <tmp>, LR, 0x7 // align down (skips BTI)
> LDR <ops>, [<tmp>, #-16] // load ops pointer
>
> LDR <direct>, [<ops>, #FTRACE_OPS_DIRECT_CALL] // load ops->direct_call
> CBNZ <direct>, ftrace_caller_direct // if !NULL, make direct call
>
> < fall through to non-direct func case here >
>
> Having the ops (and ops->direct_call) means that getting to the direct
> func is significantly cheaper than having to lookup the direct func via
> the hash.
>
> Where an instrumented function has a single ops, this can get to the
> direct func with a low constant overhead, significantly cheaper than
> looking up the direct func via the hash.
>
> Where an instrumented function has multiple ops, the ops pointer is
> pointed at a common ftrace_list_ops, where ftrace_ops_list_func()
> iterates over all the other relevant ops.
thanks for all the details, I'll check if both the new change and ops->direct_call
could live together for x86 and other arch, but it will probably complicate
things a lot more
jirka
^ permalink raw reply [flat|nested] 19+ messages in thread
* Re: [RFC 00/10] ftrace,bpf: Use single direct ops for bpf trampolines
2025-08-02 21:26 ` Jiri Olsa
@ 2025-08-06 10:20 ` Mark Rutland
2025-08-13 11:09 ` Jiri Olsa
0 siblings, 1 reply; 19+ messages in thread
From: Mark Rutland @ 2025-08-06 10:20 UTC (permalink / raw)
To: Jiri Olsa
Cc: Steven Rostedt, Florent Revest, bpf, linux-kernel,
linux-trace-kernel, linux-arm-kernel, Alexei Starovoitov,
Daniel Borkmann, Andrii Nakryiko, Menglong Dong, Naveen N Rao,
Michael Ellerman, Björn Töpel, Andy Chiu,
Alexandre Ghiti, Palmer Dabbelt
On Sat, Aug 02, 2025 at 11:26:46PM +0200, Jiri Olsa wrote:
> On Fri, Aug 01, 2025 at 10:49:56AM +0100, Mark Rutland wrote:
> > On Wed, Jul 30, 2025 at 01:19:51PM +0200, Jiri Olsa wrote:
> > > On Tue, Jul 29, 2025 at 06:57:40PM +0100, Mark Rutland wrote:
> > > >
> > > > On Tue, Jul 29, 2025 at 12:28:03PM +0200, Jiri Olsa wrote:
> > > > > hi,
> > > > > while poking the multi-tracing interface I ended up with just one
> > > > > ftrace_ops object to attach all trampolines.
> > > > >
> > > > > This change allows to use less direct API calls during the attachment
> > > > > changes in the future code, so in effect speeding up the attachment.
> > > >
> > > > How important is that, and what sort of speedup does this result in? I
> > > > ask due to potential performance hits noted below, and I'm lacking
> > > > context as to why we want to do this in the first place -- what is this
> > > > intended to enable/improve?
> > >
> > > so it's all work on PoC stage, the idea is to be able to attach many
> > > (like 20,30,40k) functions to their trampolines quickly, which at the
> > > moment is slow because all the involved interfaces work with just single
> > > function/tracempoline relation
> >
> > Do you know which aspect of that is slow? e.g. is that becuase you have
> > to update each ftrace_ops independently, and pay the synchronization
> > overhead per-ops?
> >
> > I ask because it might be possible to do some more batching there, at
> > least for architectures like arm64 that use the CALL_OPS approach.
>
> IIRC it's the rcu sync in register_ftrace_direct and ftrace_shutdown
> I'll try to profile that case again, there might have been changes
> since the last time we did that
Do you mean synchronize_rcu_tasks()?
The call in register_ftrace_direct() was removed in commit:
33f137143e651321 ("ftrace: Use asynchronous grace period for register_ftrace_direct()")
... but in ftrace_shutdown() we still have a call to synchronize_rcu_tasks(),
and to synchronize_rcu_tasks_rude().
The call to synchronize_rcu_tasks() is still necessary, but we might be
able to batch that better with API changes.
I think we might be able to remove the call to
synchronize_rcu_tasks_rude() on architectures with ARCH_WANTS_NO_INSTR,
since there shouldn't be any instrumentable functions called with RCU
not watching. That'd need to be checked.
[...]
> > > sorry I probably forgot/missed discussion on this, but doing the fast path like in
> > > x86_64 is not an option in arm, right?
> >
> > On arm64 we have a fast path, BUT branch range limitations means that we
> > cannot always branch directly from the instrumented function to the
> > direct func with a single branch instruction. We use ops->direct_call to
> > handle that case within a common trampoline, which is significantly
> > cheaper that iterating over the ops and/or looking up the direct func
> > from a hash.
> >
> > With CALL_OPS, we place a pointer to the ops immediately before the
> > instrumented function, and have the instrumented function branch to a
> > common trampoline which can load that pointer (and can then branch to
> > any direct func as necessary).
> >
> > The instrumented function looks like:
> >
> > # Aligned to 8 bytes
> > func - 8:
> > < pointer to ops >
>
> stupid question.. so there's ftrace_ops pointer stored for each function at
> 'func - 8` ? why not store the func's direct trampoline address in there?
One reason is that today we don't have trampolines for all ops. Since
branch range limitations can require bouncing through the common ops,
it's simpler/better to bounce from that to the regular call than to
bounce from that to a trampoline which makes the regular call.
We *could* consider adding trampolines, but that comes with a jump in
complexity that we originally tried to avoid, and a potential
performance hit for regular ftrace calls. IIUC that will require similar
synchronization to what we have today, so it's not clearly a win
generally.
I'd like to better understand what the real bottleneck is; AFAICT it's
the tasks-rcu synchronization, and sharing the hash means that you only
need to do that once. I think that it should be possible to share that
synchronization across multiple ops updates with some API changes (e.g.
something like the batching of text_poke on x86).
If we can do that, it might benefit other users too (e.g.
live-patching), even if trampolines aren't being used, and would keep
the arch bits simple/maintainable.
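(As a purely hypothetical sketch of what such batching could look like for a
caller such as the bpf trampoline code -- none of these begin/add/commit
helpers exist today, they only illustrate paying the tasks-RCU
synchronization once per batch instead of once per ops update:)
  /* hypothetical batching API, loosely modelled on the text_poke batching
   * mentioned above; not an existing kernel interface */
  ftrace_direct_batch_begin();
  for (i = 0; i < ntramps; i++)
          /* queue each ip -> trampoline update, no per-update sync */
          ftrace_direct_batch_add(&ops, ips[i], addrs[i]);
  /* single synchronize_rcu_tasks() etc. for the whole batch */
  ftrace_direct_batch_commit();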
[...]
> thanks for all the details, I'll check if both the new change and ops->direct_call
> could live together for x86 and other arch, but it will probably complicate
> things a lot more
Thanks; please let me know if there are any challenges there!
Mark.
^ permalink raw reply [flat|nested] 19+ messages in thread
* Re: [RFC 00/10] ftrace,bpf: Use single direct ops for bpf trampolines
2025-08-06 10:20 ` Mark Rutland
@ 2025-08-13 11:09 ` Jiri Olsa
0 siblings, 0 replies; 19+ messages in thread
From: Jiri Olsa @ 2025-08-13 11:09 UTC (permalink / raw)
To: Mark Rutland
Cc: Jiri Olsa, Steven Rostedt, Florent Revest, bpf, linux-kernel,
linux-trace-kernel, linux-arm-kernel, Alexei Starovoitov,
Daniel Borkmann, Andrii Nakryiko, Menglong Dong, Naveen N Rao,
Michael Ellerman, Björn Töpel, Andy Chiu,
Alexandre Ghiti, Palmer Dabbelt
On Wed, Aug 06, 2025 at 11:20:08AM +0100, Mark Rutland wrote:
> On Sat, Aug 02, 2025 at 11:26:46PM +0200, Jiri Olsa wrote:
> > On Fri, Aug 01, 2025 at 10:49:56AM +0100, Mark Rutland wrote:
> > > On Wed, Jul 30, 2025 at 01:19:51PM +0200, Jiri Olsa wrote:
> > > > On Tue, Jul 29, 2025 at 06:57:40PM +0100, Mark Rutland wrote:
> > > > >
> > > > > On Tue, Jul 29, 2025 at 12:28:03PM +0200, Jiri Olsa wrote:
> > > > > > hi,
> > > > > > while poking the multi-tracing interface I ended up with just one
> > > > > > ftrace_ops object to attach all trampolines.
> > > > > >
> > > > > > This change allows to use less direct API calls during the attachment
> > > > > > changes in the future code, so in effect speeding up the attachment.
> > > > >
> > > > > How important is that, and what sort of speedup does this result in? I
> > > > > ask due to potential performance hits noted below, and I'm lacking
> > > > > context as to why we want to do this in the first place -- what is this
> > > > > intended to enable/improve?
> > > >
> > > > so it's all work on PoC stage, the idea is to be able to attach many
> > > > (like 20,30,40k) functions to their trampolines quickly, which at the
> > > > moment is slow because all the involved interfaces work with just single
> > > > function/tracempoline relation
> > >
> > > Do you know which aspect of that is slow? e.g. is that becuase you have
> > > to update each ftrace_ops independently, and pay the synchronization
> > > overhead per-ops?
> > >
> > > I ask because it might be possible to do some more batching there, at
> > > least for architectures like arm64 that use the CALL_OPS approach.
> >
> > IIRC it's the rcu sync in register_ftrace_direct and ftrace_shutdown
> > I'll try to profile that case again, there might have been changes
> > since the last time we did that
>
> Do you mean synchronize_rcu_tasks()?
>
> The call in register_ftrace_direct() was removed in commit:
>
> 33f137143e651321 ("ftrace: Use asynchronous grace period for register_ftrace_direct()")
>
> ... but in ftrace_shutdown() we still have a call to synchronize_rcu_tasks(),
> and to synchronize_rcu_tasks_rude().
>
> The call to synchronize_rcu_tasks() is still necessary, but we might be
> abel to batch that better with API changes.
>
> I think we might be able to remove the call to
> synchronize_rcu_tasks_rude() on architectures with ARCH_WANTS_NO_INSTR,
> since there shouldn't be any instrumentable functions called with RCU
> not watching. That'd need to be checked.
>
> [...]
>
> > > > sorry I probably forgot/missed discussion on this, but doing the fast path like in
> > > > x86_64 is not an option in arm, right?
> > >
> > > On arm64 we have a fast path, BUT branch range limitations means that we
> > > cannot always branch directly from the instrumented function to the
> > > direct func with a single branch instruction. We use ops->direct_call to
> > > handle that case within a common trampoline, which is significantly
> > > cheaper that iterating over the ops and/or looking up the direct func
> > > from a hash.
> > >
> > > With CALL_OPS, we place a pointer to the ops immediately before the
> > > instrumented function, and have the instrumented function branch to a
> > > common trampoline which can load that pointer (and can then branch to
> > > any direct func as necessary).
> > >
> > > The instrumented function looks like:
> > >
> > > # Aligned to 8 bytes
> > > func - 8:
> > > < pointer to ops >
> >
> > stupid question.. so there's ftrace_ops pointer stored for each function at
> > 'func - 8` ? why not store the func's direct trampoline address in there?
>
> Once reason is that today we don't have trampolines for all ops. Since
> branch range limitations can require bouncing through the common ops,
> it's simpler/better to bounce from that to the regular call than to
> bounce from that to a trampoline which makes the regular call.
>
> We *could* consider adding trampolines, but that comes with a jump in
> complexity that we originally tried to avoid, and a potential
> performance hit for regular ftrace calls. IIUC that will require similar
> synchronization to what we have today, so it's not clearly a win
> generally.
>
> I'd like to better understand what the real bottleneck is; AFAICT it's
> the tasks-rcu synchronization, and sharing the hash means that you only
> need to do that once. I think that it should be possible to share that
> synchronization across multiple ops updates with some API changes (e.g.
> something like the batching of text_poke on x86).
yea, so rcu does not seem to be the cause anymore (IIRC that was the
case some time ago), it looks like now the time is spent in the ftrace
internals that iterate and update the call sites
the test was a loop of attach/detach of a fentry program
31.48% test_progs [kernel.kallsyms] [k] ftrace_replace_code
10.98% test_progs [kernel.kallsyms] [k] __ftrace_hash_update_ipmodify
6.41% test_progs [kernel.kallsyms] [k] __ftrace_hash_rec_update
4.69% test_progs [kernel.kallsyms] [k] ftrace_check_record
4.59% test_progs [kernel.kallsyms] [k] ftrace_lookup_ip
3.65% swapper [kernel.kallsyms] [k] acpi_os_read_port
3.40% test_progs [kernel.kallsyms] [k] srso_alias_return_thunk
2.97% test_progs [kernel.kallsyms] [k] srso_alias_safe_ret
2.67% test_progs [kernel.kallsyms] [k] ftrace_rec_iter_record
2.05% test_progs [kernel.kallsyms] [k] ftrace_test_record
1.83% test_progs [kernel.kallsyms] [k] ftrace_rec_iter_next
1.76% test_progs [kernel.kallsyms] [k] smp_call_function_many_cond
1.05% rcu_tasks_kthre [kernel.kallsyms] [k] rcu_tasks_pertask
0.70% test_progs [kernel.kallsyms] [k] btf_find_by_name_kind
0.61% swapper [kernel.kallsyms] [k] srso_alias_safe_ret
0.55% swapper [kernel.kallsyms] [k] io_idle
so by sharing the hash we do that (iterate and update functions)
just once
jirka
---
31.48% test_progs [kernel.kallsyms] [k] ftrace_replace_code
|
|--11.54%--ftrace_replace_code
| ftrace_modify_all_code
| |
| |--6.06%--ftrace_shutdown.part.0
| | unregister_ftrace_function
| | unregister_ftrace_direct
| | bpf_trampoline_update
| | bpf_trampoline_unlink_prog
| | bpf_tracing_link_release
| | bpf_link_free
| | bpf_link_release
| | __fput
| | __x64_sys_close
| | do_syscall_64
| | entry_SYSCALL_64_after_hwframe
| | __syscall_cancel_arch_end
| | __syscall_cancel
| | __close
| | fentry_test_common
| | fentry_test
| | test_fentry_test
| | run_one_test
| | main
| | __libc_start_call_main
| | __libc_start_main@@GLIBC_2.34
| | _start
| |
| --5.47%--ftrace_startup
| register_ftrace_function_nolock
| register_ftrace_direct
| bpf_trampoline_update
| __bpf_trampoline_link_prog
| bpf_trampoline_link_prog
| bpf_tracing_prog_attach
| bpf_raw_tp_link_attach
| __sys_bpf
| __x64_sys_bpf
| do_syscall_64
| entry_SYSCALL_64_after_hwframe
| syscall
| skel_raw_tracepoint_open
| fentry_test_lskel__test1__attach
| fentry_test_common
| fentry_test
| test_fentry_test
| run_one_test
| main
| __libc_start_call_main
| __libc_start_main@@GLIBC_2.34
| _start
|
|--8.81%--ftrace_check_record
| ftrace_replace_code
| ftrace_modify_all_code
| |
| |--4.72%--ftrace_shutdown.part.0
| | unregister_ftrace_function
| | unregister_ftrace_direct
| | bpf_trampoline_update
| | bpf_trampoline_unlink_prog
| | bpf_tracing_link_release
| | bpf_link_free
| | bpf_link_release
| | __fput
| | __x64_sys_close
| | do_syscall_64
| | entry_SYSCALL_64_after_hwframe
| | __syscall_cancel_arch_end
| | __syscall_cancel
| | __close
| | fentry_test_common
| | fentry_test
| | test_fentry_test
| | run_one_test
| | main
| | __libc_start_call_main
| | __libc_start_main@@GLIBC_2.34
| | _start
| |
| --4.10%--ftrace_startup
| register_ftrace_function_nolock
| register_ftrace_direct
| bpf_trampoline_update
| __bpf_trampoline_link_prog
| bpf_trampoline_link_prog
| bpf_tracing_prog_attach
| bpf_raw_tp_link_attach
| __sys_bpf
| __x64_sys_bpf
| do_syscall_64
| entry_SYSCALL_64_after_hwframe
| syscall
| skel_raw_tracepoint_open
| fentry_test_lskel__test1__attach
| fentry_test_common
| fentry_test
| test_fentry_test
| run_one_test
| main
| __libc_start_call_main
| __libc_start_main@@GLIBC_2.34
| _start
|
|--3.60%--ftrace_rec_iter_record
| ftrace_replace_code
| ftrace_modify_all_code
| |
| |--1.91%--ftrace_shutdown.part.0
| | unregister_ftrace_function
| | unregister_ftrace_direct
| | bpf_trampoline_update
| | bpf_trampoline_unlink_prog
| | bpf_tracing_link_release
| | bpf_link_free
| | bpf_link_release
| | __fput
| | __x64_sys_close
| | do_syscall_64
| | entry_SYSCALL_64_after_hwframe
| | __syscall_cancel_arch_end
| | __syscall_cancel
| | __close
| | fentry_test_common
| | fentry_test
| | test_fentry_test
| | run_one_test
| | main
| | __libc_start_call_main
| | __libc_start_main@@GLIBC_2.34
| | _start
| |
| --1.69%--ftrace_startup
| register_ftrace_function_nolock
| register_ftrace_direct
| bpf_trampoline_update
| __bpf_trampoline_link_prog
| bpf_trampoline_link_prog
| bpf_tracing_prog_attach
| bpf_raw_tp_link_attach
| __sys_bpf
| __x64_sys_bpf
| do_syscall_64
| entry_SYSCALL_64_after_hwframe
| syscall
| skel_raw_tracepoint_open
| fentry_test_lskel__test1__attach
| fentry_test_common
| fentry_test
| test_fentry_test
| run_one_test
| main
| __libc_start_call_main
| __libc_start_main@@GLIBC_2.34
| _start
|
|--3.50%--ftrace_rec_iter_next
| ftrace_replace_code
| ftrace_modify_all_code
| |
| |--2.08%--ftrace_startup
| | register_ftrace_function_nolock
| | register_ftrace_direct
| | bpf_trampoline_update
| | __bpf_trampoline_link_prog
| | bpf_trampoline_link_prog
| | bpf_tracing_prog_attach
| | bpf_raw_tp_link_attach
| | __sys_bpf
| | __x64_sys_bpf
| | do_syscall_64
| | entry_SYSCALL_64_after_hwframe
| | syscall
| | skel_raw_tracepoint_open
| | fentry_test_lskel__test1__attach
| | fentry_test_common
| | fentry_test
| | test_fentry_test
| | run_one_test
| | main
| | __libc_start_call_main
| | __libc_start_main@@GLIBC_2.34
| | _start
| |
| --1.42%--ftrace_shutdown.part.0
| unregister_ftrace_function
| unregister_ftrace_direct
| bpf_trampoline_update
| bpf_trampoline_unlink_prog
| bpf_tracing_link_release
| bpf_link_free
| bpf_link_release
| __fput
| __x64_sys_close
| do_syscall_64
| entry_SYSCALL_64_after_hwframe
| __syscall_cancel_arch_end
| __syscall_cancel
| __close
| fentry_test_common
| fentry_test
| test_fentry_test
| run_one_test
| main
| __libc_start_call_main
| __libc_start_main@@GLIBC_2.34
| _start
|
|--2.44%--srso_alias_safe_ret
| srso_alias_return_thunk
| ftrace_replace_code
| ftrace_modify_all_code
| |
| |--1.36%--ftrace_shutdown.part.0
| | unregister_ftrace_function
| | unregister_ftrace_direct
| | bpf_trampoline_update
| | bpf_trampoline_unlink_prog
| | bpf_tracing_link_release
| | bpf_link_free
| | bpf_link_release
| | __fput
| | __x64_sys_close
| | do_syscall_64
| | entry_SYSCALL_64_after_hwframe
| | __syscall_cancel_arch_end
| | __syscall_cancel
| | __close
| | fentry_test_common
| | fentry_test
| | test_fentry_test
| | run_one_test
| | main
| | __libc_start_call_main
| | __libc_start_main@@GLIBC_2.34
| | _start
| |
| --1.07%--ftrace_startup
| register_ftrace_function_nolock
| register_ftrace_direct
| bpf_trampoline_update
| __bpf_trampoline_link_prog
| bpf_trampoline_link_prog
| bpf_tracing_prog_attach
| bpf_raw_tp_link_attach
| __sys_bpf
| __x64_sys_bpf
| do_syscall_64
| entry_SYSCALL_64_after_hwframe
| syscall
| skel_raw_tracepoint_open
| fentry_test_lskel__test1__attach
| fentry_test_common
| fentry_test
| test_fentry_test
| run_one_test
| main
| __libc_start_call_main
| __libc_start_main@@GLIBC_2.34
| _start
|
--1.59%--ftrace_test_record
ftrace_replace_code
ftrace_modify_all_code
|
|--0.87%--ftrace_startup
| register_ftrace_function_nolock
| register_ftrace_direct
| bpf_trampoline_update
| __bpf_trampoline_link_prog
| bpf_trampoline_link_prog
| bpf_tracing_prog_attach
| bpf_raw_tp_link_attach
| __sys_bpf
| __x64_sys_bpf
| do_syscall_64
| entry_SYSCALL_64_after_hwframe
| syscall
| skel_raw_tracepoint_open
| fentry_test_lskel__test1__attach
| fentry_test_common
| fentry_test
| test_fentry_test
| run_one_test
| main
| __libc_start_call_main
| __libc_start_main@@GLIBC_2.34
| _start
|
--0.72%--ftrace_shutdown.part.0
unregister_ftrace_function
unregister_ftrace_direct
bpf_trampoline_update
bpf_trampoline_unlink_prog
bpf_tracing_link_release
bpf_link_free
bpf_link_release
__fput
__x64_sys_close
do_syscall_64
entry_SYSCALL_64_after_hwframe
__syscall_cancel_arch_end
__syscall_cancel
__close
fentry_test_common
fentry_test
test_fentry_test
run_one_test
main
__libc_start_call_main
__libc_start_main@@GLIBC_2.34
_start
10.98% test_progs [kernel.kallsyms] [k] __ftrace_hash_update_ipmodify
|
|--7.90%--__ftrace_hash_update_ipmodify
| |
| |--4.27%--ftrace_shutdown.part.0
| | unregister_ftrace_function
| | unregister_ftrace_direct
| | bpf_trampoline_update
| | bpf_trampoline_unlink_prog
| | bpf_tracing_link_release
| | bpf_link_free
| | bpf_link_release
| | __fput
| | __x64_sys_close
| | do_syscall_64
| | entry_SYSCALL_64_after_hwframe
| | __syscall_cancel_arch_end
| | __syscall_cancel
| | __close
| | fentry_test_common
| | fentry_test
| | test_fentry_test
| | run_one_test
| | main
| | __libc_start_call_main
| | __libc_start_main@@GLIBC_2.34
| | _start
| |
| --3.63%--ftrace_startup
| register_ftrace_function_nolock
| register_ftrace_direct
| bpf_trampoline_update
| __bpf_trampoline_link_prog
| bpf_trampoline_link_prog
| bpf_tracing_prog_attach
| bpf_raw_tp_link_attach
| __sys_bpf
| __x64_sys_bpf
| do_syscall_64
| entry_SYSCALL_64_after_hwframe
| syscall
| skel_raw_tracepoint_open
| fentry_test_lskel__test1__attach
| fentry_test_common
| fentry_test
| test_fentry_test
| run_one_test
| main
| __libc_start_call_main
| __libc_start_main@@GLIBC_2.34
| _start
|
--3.06%--ftrace_lookup_ip
__ftrace_hash_update_ipmodify
|
|--1.92%--ftrace_startup
| register_ftrace_function_nolock
| register_ftrace_direct
| bpf_trampoline_update
| __bpf_trampoline_link_prog
| bpf_trampoline_link_prog
| bpf_tracing_prog_attach
| bpf_raw_tp_link_attach
| __sys_bpf
| __x64_sys_bpf
| do_syscall_64
| entry_SYSCALL_64_after_hwframe
| syscall
| skel_raw_tracepoint_open
| fentry_test_lskel__test1__attach
| fentry_test_common
| fentry_test
| test_fentry_test
| run_one_test
| main
| __libc_start_call_main
| __libc_start_main@@GLIBC_2.34
| _start
|
--1.14%--ftrace_shutdown.part.0
unregister_ftrace_function
unregister_ftrace_direct
bpf_trampoline_update
bpf_trampoline_unlink_prog
bpf_tracing_link_release
bpf_link_free
bpf_link_release
__fput
__x64_sys_close
do_syscall_64
entry_SYSCALL_64_after_hwframe
__syscall_cancel_arch_end
__syscall_cancel
__close
fentry_test_common
fentry_test
test_fentry_test
run_one_test
main
__libc_start_call_main
__libc_start_main@@GLIBC_2.34
_start
6.41% test_progs [kernel.kallsyms] [k] __ftrace_hash_rec_update
|
|--3.37%--__ftrace_hash_rec_update
| |
| |--1.90%--ftrace_startup
| | register_ftrace_function_nolock
| | register_ftrace_direct
| | bpf_trampoline_update
| | __bpf_trampoline_link_prog
| | bpf_trampoline_link_prog
| | bpf_tracing_prog_attach
| | bpf_raw_tp_link_attach
| | __sys_bpf
| | __x64_sys_bpf
| | do_syscall_64
| | entry_SYSCALL_64_after_hwframe
| | syscall
| | skel_raw_tracepoint_open
| | fentry_test_lskel__test1__attach
| | fentry_test_common
| | fentry_test
| | test_fentry_test
| | run_one_test
| | main
| | __libc_start_call_main
| | __libc_start_main@@GLIBC_2.34
| | _start
| |
| --1.47%--ftrace_shutdown.part.0
| unregister_ftrace_function
| unregister_ftrace_direct
| bpf_trampoline_update
| bpf_trampoline_unlink_prog
| bpf_tracing_link_release
| bpf_link_free
| bpf_link_release
| __fput
| __x64_sys_close
| do_syscall_64
| entry_SYSCALL_64_after_hwframe
| __syscall_cancel_arch_end
| __syscall_cancel
| __close
| fentry_test_common
| fentry_test
| test_fentry_test
| run_one_test
| main
| __libc_start_call_main
| __libc_start_main@@GLIBC_2.34
| _start
|
|--2.16%--ftrace_lookup_ip
| __ftrace_hash_rec_update
| |
| |--1.16%--ftrace_shutdown.part.0
| | unregister_ftrace_function
| | unregister_ftrace_direct
| | bpf_trampoline_update
| | bpf_trampoline_unlink_prog
| | bpf_tracing_link_release
| | bpf_link_free
| | bpf_link_release
| | __fput
| | __x64_sys_close
| | do_syscall_64
| | entry_SYSCALL_64_after_hwframe
| | __syscall_cancel_arch_end
| | __syscall_cancel
| | __close
| | fentry_test_common
| | fentry_test
| | test_fentry_test
| | run_one_test
| | main
| | __libc_start_call_main
| | __libc_start_main@@GLIBC_2.34
| | _start
| |
| --0.99%--ftrace_startup
| register_ftrace_function_nolock
| register_ftrace_direct
| bpf_trampoline_update
| __bpf_trampoline_link_prog
| bpf_trampoline_link_prog
| bpf_tracing_prog_attach
| bpf_raw_tp_link_attach
| __sys_bpf
| __x64_sys_bpf
| do_syscall_64
| entry_SYSCALL_64_after_hwframe
| syscall
| skel_raw_tracepoint_open
| fentry_test_lskel__test1__attach
| fentry_test_common
| fentry_test
| test_fentry_test
| run_one_test
| main
| __libc_start_call_main
| __libc_start_main@@GLIBC_2.34
| _start
|
--0.88%--srso_alias_safe_ret
|
--0.79%--__ftrace_hash_rec_update
|
--0.52%--ftrace_shutdown.part.0
unregister_ftrace_function
unregister_ftrace_direct
bpf_trampoline_update
bpf_trampoline_unlink_prog
bpf_tracing_link_release
bpf_link_free
bpf_link_release
__fput
__x64_sys_close
do_syscall_64
entry_SYSCALL_64_after_hwframe
__syscall_cancel_arch_end
__syscall_cancel
__close
fentry_test_common
fentry_test
test_fentry_test
run_one_test
main
__libc_start_call_main
__libc_start_main@@GLIBC_2.34
_start
4.69% test_progs [kernel.kallsyms] [k] ftrace_check_record
|
|--2.04%--ftrace_check_record
| ftrace_replace_code
| ftrace_modify_all_code
| |
| |--1.06%--ftrace_startup
| | register_ftrace_function_nolock
| | register_ftrace_direct
| | bpf_trampoline_update
| | __bpf_trampoline_link_prog
| | bpf_trampoline_link_prog
| | bpf_tracing_prog_attach
| | bpf_raw_tp_link_attach
| | __sys_bpf
| | __x64_sys_bpf
| | do_syscall_64
| | entry_SYSCALL_64_after_hwframe
| | syscall
| | skel_raw_tracepoint_open
| | fentry_test_lskel__test1__attach
| | fentry_test_common
| | fentry_test
| | test_fentry_test
| | run_one_test
| | main
| | __libc_start_call_main
| | __libc_start_main@@GLIBC_2.34
| | _start
| |
| --0.98%--ftrace_shutdown.part.0
| unregister_ftrace_function
| unregister_ftrace_direct
| bpf_trampoline_update
| bpf_trampoline_unlink_prog
| bpf_tracing_link_release
| bpf_link_free
| bpf_link_release
| __fput
| __x64_sys_close
| do_syscall_64
| entry_SYSCALL_64_after_hwframe
| __syscall_cancel_arch_end
| __syscall_cancel
| __close
| fentry_test_common
| fentry_test
| test_fentry_test
| run_one_test
| main
| __libc_start_call_main
| __libc_start_main@@GLIBC_2.34
| _start
|
--1.28%--ftrace_replace_code
ftrace_modify_all_code
|
--0.81%--ftrace_startup
register_ftrace_function_nolock
register_ftrace_direct
bpf_trampoline_update
__bpf_trampoline_link_prog
bpf_trampoline_link_prog
bpf_tracing_prog_attach
bpf_raw_tp_link_attach
__sys_bpf
__x64_sys_bpf
do_syscall_64
entry_SYSCALL_64_after_hwframe
syscall
skel_raw_tracepoint_open
fentry_test_lskel__test1__attach
fentry_test_common
fentry_test
test_fentry_test
run_one_test
main
__libc_start_call_main
__libc_start_main@@GLIBC_2.34
_start
4.59% test_progs [kernel.kallsyms] [k] ftrace_lookup_ip
|
|--1.99%--__ftrace_hash_update_ipmodify
| |
| |--1.03%--ftrace_shutdown.part.0
| | unregister_ftrace_function
| | unregister_ftrace_direct
| | bpf_trampoline_update
| | bpf_trampoline_unlink_prog
| | bpf_tracing_link_release
| | bpf_link_free
| | bpf_link_release
| | __fput
| | __x64_sys_close
| | do_syscall_64
| | entry_SYSCALL_64_after_hwframe
| | __syscall_cancel_arch_end
| | __syscall_cancel
| | __close
| | fentry_test_common
| | fentry_test
| | test_fentry_test
| | run_one_test
| | main
| | __libc_start_call_main
| | __libc_start_main@@GLIBC_2.34
| | _start
| |
| --0.96%--ftrace_startup
| register_ftrace_function_nolock
| register_ftrace_direct
| bpf_trampoline_update
| __bpf_trampoline_link_prog
| bpf_trampoline_link_prog
| bpf_tracing_prog_attach
| bpf_raw_tp_link_attach
| __sys_bpf
| __x64_sys_bpf
| do_syscall_64
| entry_SYSCALL_64_after_hwframe
| syscall
| skel_raw_tracepoint_open
| fentry_test_lskel__test1__attach
| fentry_test_common
| fentry_test
| test_fentry_test
| run_one_test
| main
| __libc_start_call_main
| __libc_start_main@@GLIBC_2.34
| _start
|
|--1.67%--ftrace_lookup_ip
| |
| --1.19%--__ftrace_hash_update_ipmodify
| |
| |--0.60%--ftrace_shutdown.part.0
| | unregister_ftrace_function
| | unregister_ftrace_direct
| | bpf_trampoline_update
| | bpf_trampoline_unlink_prog
| | bpf_tracing_link_release
| | bpf_link_free
| | bpf_link_release
| | __fput
| | __x64_sys_close
| | do_syscall_64
| | entry_SYSCALL_64_after_hwframe
| | __syscall_cancel_arch_end
| | __syscall_cancel
| | __close
| | fentry_test_common
| | fentry_test
| | test_fentry_test
| | run_one_test
| | main
| | __libc_start_call_main
| | __libc_start_main@@GLIBC_2.34
| | _start
| |
| --0.59%--ftrace_startup
| register_ftrace_function_nolock
| register_ftrace_direct
| bpf_trampoline_update
| __bpf_trampoline_link_prog
| bpf_trampoline_link_prog
| bpf_tracing_prog_attach
| bpf_raw_tp_link_attach
| __sys_bpf
| __x64_sys_bpf
| do_syscall_64
| entry_SYSCALL_64_after_hwframe
| syscall
| skel_raw_tracepoint_open
| fentry_test_lskel__test1__attach
| fentry_test_common
| fentry_test
| test_fentry_test
| run_one_test
| main
| __libc_start_call_main
| __libc_start_main@@GLIBC_2.34
| _start
|
--0.81%--__ftrace_hash_rec_update
3.65% swapper [kernel.kallsyms] [k] acpi_os_read_port
|
|--1.03%--acpi_os_read_port
| acpi_hw_read_port
| acpi_hw_read
| acpi_hw_register_read
| acpi_read_bit_register
| acpi_idle_enter_bm
| cpuidle_enter_state
| cpuidle_enter
| do_idle
| cpu_startup_entry
| |
| --0.97%--start_secondary
| common_startup_64
|
|--0.82%--srso_alias_safe_ret
|
--0.74%--acpi_hw_read
acpi_hw_register_read
acpi_read_bit_register
acpi_idle_enter_bm
cpuidle_enter_state
cpuidle_enter
do_idle
cpu_startup_entry
|
--0.74%--start_secondary
common_startup_64
3.40% test_progs [kernel.kallsyms] [k] srso_alias_return_thunk
|
|--0.85%--ftrace_replace_code
| ftrace_modify_all_code
| |
| --0.51%--ftrace_startup
| register_ftrace_function_nolock
| register_ftrace_direct
| bpf_trampoline_update
| __bpf_trampoline_link_prog
| bpf_trampoline_link_prog
| bpf_tracing_prog_attach
| bpf_raw_tp_link_attach
| __sys_bpf
| __x64_sys_bpf
| do_syscall_64
| entry_SYSCALL_64_after_hwframe
| syscall
| skel_raw_tracepoint_open
| fentry_test_lskel__test1__attach
| fentry_test_common
| fentry_test
| test_fentry_test
| run_one_test
| main
| __libc_start_call_main
| __libc_start_main@@GLIBC_2.34
| _start
|
--0.64%--ftrace_check_record
ftrace_replace_code
ftrace_modify_all_code
2.97% test_progs [kernel.kallsyms] [k] srso_alias_safe_ret
|
|--0.73%--ftrace_check_record
| ftrace_replace_code
| ftrace_modify_all_code
|
--0.69%--ftrace_replace_code
ftrace_modify_all_code
2.67% test_progs [kernel.kallsyms] [k] ftrace_rec_iter_record
|
|--1.19%--ftrace_replace_code
| ftrace_modify_all_code
| |
| |--0.68%--ftrace_startup
| | register_ftrace_function_nolock
| | register_ftrace_direct
| | bpf_trampoline_update
| | __bpf_trampoline_link_prog
| | bpf_trampoline_link_prog
| | bpf_tracing_prog_attach
| | bpf_raw_tp_link_attach
| | __sys_bpf
| | __x64_sys_bpf
| | do_syscall_64
| | entry_SYSCALL_64_after_hwframe
| | syscall
| | skel_raw_tracepoint_open
| | fentry_test_lskel__test1__attach
| | fentry_test_common
| | fentry_test
| | test_fentry_test
| | run_one_test
| | main
| | __libc_start_call_main
| | __libc_start_main@@GLIBC_2.34
| | _start
| |
| --0.51%--ftrace_shutdown.part.0
| unregister_ftrace_function
| unregister_ftrace_direct
| bpf_trampoline_update
| bpf_trampoline_unlink_prog
| bpf_tracing_link_release
| bpf_link_free
| bpf_link_release
| __fput
| __x64_sys_close
| do_syscall_64
| entry_SYSCALL_64_after_hwframe
| __syscall_cancel_arch_end
| __syscall_cancel
| __close
| fentry_test_common
| fentry_test
| test_fentry_test
| run_one_test
| main
| __libc_start_call_main
| __libc_start_main@@GLIBC_2.34
| _start
|
--0.69%--ftrace_check_record
ftrace_replace_code
ftrace_modify_all_code
2.05% test_progs [kernel.kallsyms] [k] ftrace_test_record
|
--0.79%--ftrace_replace_code
ftrace_modify_all_code
1.83% test_progs [kernel.kallsyms] [k] ftrace_rec_iter_next
|
--0.87%--ftrace_replace_code
ftrace_modify_all_code
|
--0.51%--ftrace_startup
register_ftrace_function_nolock
register_ftrace_direct
bpf_trampoline_update
__bpf_trampoline_link_prog
bpf_trampoline_link_prog
bpf_tracing_prog_attach
bpf_raw_tp_link_attach
__sys_bpf
__x64_sys_bpf
do_syscall_64
entry_SYSCALL_64_after_hwframe
syscall
skel_raw_tracepoint_open
fentry_test_lskel__test1__attach
fentry_test_common
fentry_test
test_fentry_test
run_one_test
main
__libc_start_call_main
__libc_start_main@@GLIBC_2.34
_start
1.76% test_progs [kernel.kallsyms] [k] smp_call_function_many_cond
|
--1.73%--smp_call_function_many_cond
on_each_cpu_cond_mask
|
--1.57%--smp_text_poke_batch_finish
|
--1.55%--ftrace_modify_all_code
|
|--0.91%--ftrace_shutdown.part.0
| unregister_ftrace_function
| unregister_ftrace_direct
| bpf_trampoline_update
| bpf_trampoline_unlink_prog
| bpf_tracing_link_release
| bpf_link_free
| bpf_link_release
| __fput
| __x64_sys_close
| do_syscall_64
| entry_SYSCALL_64_after_hwframe
| __syscall_cancel_arch_end
| __syscall_cancel
| __close
| fentry_test_common
| fentry_test
| test_fentry_test
| run_one_test
| main
| __libc_start_call_main
| __libc_start_main@@GLIBC_2.34
| _start
|
--0.64%--ftrace_startup
register_ftrace_function_nolock
register_ftrace_direct
bpf_trampoline_update
__bpf_trampoline_link_prog
bpf_trampoline_link_prog
bpf_tracing_prog_attach
bpf_raw_tp_link_attach
__sys_bpf
__x64_sys_bpf
do_syscall_64
entry_SYSCALL_64_after_hwframe
syscall
skel_raw_tracepoint_open
fentry_test_lskel__test1__attach
fentry_test_common
fentry_test
test_fentry_test
run_one_test
main
__libc_start_call_main
__libc_start_main@@GLIBC_2.34
_start
1.05% rcu_tasks_kthre [kernel.kallsyms] [k] rcu_tasks_pertask
|
--0.65%--rcu_tasks_wait_gp
rcu_tasks_one_gp
rcu_tasks_kthread
kthread
ret_from_fork
ret_from_fork_asm
0.70% test_progs [kernel.kallsyms] [k] btf_find_by_name_kind
|
--0.59%--btf_find_by_name_kind