* [PATCH 1/2] Add a cpuinfos BPF map
@ 2025-03-31 21:45 eugene.loh
2025-03-31 21:45 ` [PATCH 2/2] Clean up sched provider trampoline FIXMEs eugene.loh
2025-04-01 5:36 ` [DTrace-devel] [PATCH 1/2] Add a cpuinfos BPF map Kris Van Hees
0 siblings, 2 replies; 8+ messages in thread
From: eugene.loh @ 2025-03-31 21:45 UTC (permalink / raw)
To: dtrace, dtrace-devel
From: Eugene Loh <eugene.loh@oracle.com>
The cpuinfo BPF map is a per-CPU map that has CPU information
on each CPU for that CPU.
Add a cpuinfos BPF map that allows any CPU to access information
for any other CPU.
For now, we retain the older per-CPU map. If desired, a future
patch can migrate existing uses of the per-CPU map to the new
map, decommissioning the old one. This would include map set up:
*) libdtrace/dt_dlibs.c: DT_BPF_SYMBOL(cpuinfo, DT_IDENT_PTR),
*) libdtrace/dt_impl.h: int dt_cpumap_fd;
*) libdtrace/dt_bpf.c: dtp->dt_cpumap_fd = ...
libdtrace/dt_bpf.c: CREATE_MAP(cpuinfo)
and map use:
*) bpf/get_agg.c
*) bpf/get_bvar.c
*) libdtrace/dt_cg.c
*) libdtrace/dt_prov_lockstat.c
Signed-off-by: Eugene Loh <eugene.loh@oracle.com>
---
libdtrace/dt_bpf.c | 13 +++++++++++++
libdtrace/dt_dlibs.c | 1 +
libdtrace/dt_impl.h | 1 +
3 files changed, 15 insertions(+)
diff --git a/libdtrace/dt_bpf.c b/libdtrace/dt_bpf.c
index 6d42a96c7..8da51d6b9 100644
--- a/libdtrace/dt_bpf.c
+++ b/libdtrace/dt_bpf.c
@@ -786,7 +786,20 @@ gmap_create_cpuinfo(dtrace_hdl_t *dtp)
if (dtp->dt_cpumap_fd == -1)
return -1;
+ dtp->dt_cpusmap_fd = create_gmap(dtp, "cpuinfos",
+ BPF_MAP_TYPE_HASH,
+ sizeof(uint32_t),
+ sizeof(dt_bpf_cpuinfo_t), ncpus);
+ if (dtp->dt_cpusmap_fd == -1)
+ return -1;
+
rc = dt_bpf_map_update(dtp->dt_cpumap_fd, &key, data);
+
+ for (i = 0, ci = &conf->cpus[0]; i < ncpus && rc != -1; i++, ci++) {
+ key = ci->cpu_id;
+ rc = dt_bpf_map_update(dtp->dt_cpusmap_fd, &key, &data[ci->cpu_id]);
+ }
+
dt_free(dtp, data);
if (rc == -1)
return dt_bpf_error(dtp,
diff --git a/libdtrace/dt_dlibs.c b/libdtrace/dt_dlibs.c
index 21df22a8a..0f19f3566 100644
--- a/libdtrace/dt_dlibs.c
+++ b/libdtrace/dt_dlibs.c
@@ -61,6 +61,7 @@ static const dt_ident_t dt_bpf_symbols[] = {
DT_BPF_SYMBOL(agggen, DT_IDENT_PTR),
DT_BPF_SYMBOL(buffers, DT_IDENT_PTR),
DT_BPF_SYMBOL(cpuinfo, DT_IDENT_PTR),
+ DT_BPF_SYMBOL(cpuinfos, DT_IDENT_PTR),
DT_BPF_SYMBOL(dvars, DT_IDENT_PTR),
DT_BPF_SYMBOL(gvars, DT_IDENT_PTR),
DT_BPF_SYMBOL(lvars, DT_IDENT_PTR),
diff --git a/libdtrace/dt_impl.h b/libdtrace/dt_impl.h
index 68fb8ec53..a5e42801c 100644
--- a/libdtrace/dt_impl.h
+++ b/libdtrace/dt_impl.h
@@ -390,6 +390,7 @@ struct dtrace_hdl {
int dt_aggmap_fd; /* file descriptor for the 'aggs' BPF map */
int dt_genmap_fd; /* file descriptor for the 'agggen' BPF map */
int dt_cpumap_fd; /* file descriptor for the 'cpuinfo' BPF map */
+ int dt_cpusmap_fd; /* file descriptor for the 'cpuinfos' BPF map */
int dt_usdt_pridsmap_fd; /* file descriptor for the 'usdt_prids' BPF map */
int dt_usdt_namesmap_fd; /* file descriptor for the 'usdt_names' BPF map */
dtrace_handle_err_f *dt_errhdlr; /* error handler, if any */
--
2.43.5
^ permalink raw reply related [flat|nested] 8+ messages in thread
* [PATCH 2/2] Clean up sched provider trampoline FIXMEs
2025-03-31 21:45 [PATCH 1/2] Add a cpuinfos BPF map eugene.loh
@ 2025-03-31 21:45 ` eugene.loh
2025-04-01 5:36 ` [DTrace-devel] [PATCH 1/2] Add a cpuinfos BPF map Kris Van Hees
1 sibling, 0 replies; 8+ messages in thread
From: eugene.loh @ 2025-03-31 21:45 UTC (permalink / raw)
To: dtrace, dtrace-devel
From: Eugene Loh <eugene.loh@oracle.com>
The sched provider trampoline for enqueue and dequeue probes had
pending FIXMEs for providing a cpuinfo_t* for the cpu associated
with the run queue. Implement the missing code.
Signed-off-by: Eugene Loh <eugene.loh@oracle.com>
---
libdtrace/dt_prov_sched.c | 74 +++++++++++++++++++++++++------
test/unittest/sched/tst.enqueue.d | 1 -
2 files changed, 60 insertions(+), 15 deletions(-)
diff --git a/libdtrace/dt_prov_sched.c b/libdtrace/dt_prov_sched.c
index 3a218f3cb..d626b27be 100644
--- a/libdtrace/dt_prov_sched.c
+++ b/libdtrace/dt_prov_sched.c
@@ -84,6 +84,40 @@ static int populate(dtrace_hdl_t *dtp)
probe_args, probes);
}
+/*
+ * Get a pointer to the cpuinfo_t structure for the CPU associated
+ * with the runqueue that is in arg0.
+ *
+ * Clobbers %r1 through %r5
+ * Stores pointer to cpuinfo_t struct in %r0
+ */
+static void get_cpuinfo(dtrace_hdl_t *dtp, dt_irlist_t *dlp, uint_t exitlbl)
+{
+ dt_ident_t *idp = dt_dlib_get_map(dtp, "cpuinfos");
+
+ assert(idp != NULL);
+
+ /* Put the runqueue pointer from mst->arg0 into %r3. */
+ emit(dlp, BPF_LOAD(BPF_DW, BPF_REG_3, BPF_REG_7, DMST_ARG(0)));
+
+ /* Turn it into a pointer to its cpu member. */
+ emit(dlp, BPF_ALU64_IMM(BPF_ADD, BPF_REG_3, dt_cg_ctf_offsetof("struct rq", "cpu", NULL, 1)));
+
+ /* Call bpf_probe_read_kernel(%fp + DT_TRAMP_SP_SLOT[0], sizeof(int), %r3) */
+ emit(dlp, BPF_MOV_IMM(BPF_REG_2, (int) sizeof(int)));
+ emit(dlp, BPF_MOV_REG(BPF_REG_1, BPF_REG_FP));
+ emit(dlp, BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, DT_TRAMP_SP_SLOT(0)));
+ emit(dlp, BPF_CALL_HELPER(BPF_FUNC_probe_read_kernel));
+ emit(dlp, BPF_BRANCH_IMM(BPF_JNE, BPF_REG_0, 0, exitlbl));
+
+ /* Now look up the corresponding cpuinfo_t. */
+ dt_cg_xsetx(dlp, idp, DT_LBL_NONE, BPF_REG_1, idp->di_id);
+ emit(dlp, BPF_MOV_REG(BPF_REG_2, BPF_REG_FP));
+ emit(dlp, BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, DT_TRAMP_SP_SLOT(0)));
+ emit(dlp, BPF_CALL_HELPER(BPF_FUNC_map_lookup_elem));
+ emit(dlp, BPF_BRANCH_IMM(BPF_JEQ, BPF_REG_0, 0, exitlbl));
+}
+
/*
* Generate a BPF trampoline for a SDT probe.
*
@@ -98,18 +132,39 @@ static int populate(dtrace_hdl_t *dtp)
*/
static int trampoline(dt_pcb_t *pcb, uint_t exitlbl)
{
+ dtrace_hdl_t *dtp = pcb->pcb_hdl;
dt_irlist_t *dlp = &pcb->pcb_ir;
dt_probe_t *prp = pcb->pcb_probe;
if (strcmp(prp->desc->prb, "dequeue") == 0) {
- emit(dlp, BPF_LOAD(BPF_DW, BPF_REG_0, BPF_REG_7, DMST_ARG(1)));
- emit(dlp, BPF_STORE(BPF_DW, BPF_REG_7, DMST_ARG(0), BPF_REG_0));
/*
- * FIXME: arg1 should be a pointer to cpuinfo_t for the CPU
- * associated with the runqueue.
+ * Get the runqueue from arg0 and place a cpuinfo_t* into %r0.
+ */
+ get_cpuinfo(dtp, dlp, exitlbl);
+
+ /*
+ * Copy arg1 into arg0.
*/
- emit(dlp, BPF_STORE_IMM(BPF_DW, BPF_REG_7, DMST_ARG(1), 0));
+ emit(dlp, BPF_LOAD(BPF_DW, BPF_REG_3, BPF_REG_7, DMST_ARG(1)));
+ emit(dlp, BPF_STORE(BPF_DW, BPF_REG_7, DMST_ARG(0), BPF_REG_3));
+
+ /* Store the cpuinfo_t* in %r0 into arg1. */
+ emit(dlp, BPF_STORE(BPF_DW, BPF_REG_7, DMST_ARG(1), BPF_REG_0));
} else if (strcmp(prp->desc->prb, "enqueue") == 0) {
+ /*
+ * Get the runqueue from arg0 and place a cpuinfo_t* into %r0.
+ */
+ get_cpuinfo(dtp, dlp, exitlbl);
+
+ /*
+ * Copy arg1 into arg0.
+ */
+ emit(dlp, BPF_LOAD(BPF_DW, BPF_REG_3, BPF_REG_7, DMST_ARG(1)));
+ emit(dlp, BPF_STORE(BPF_DW, BPF_REG_7, DMST_ARG(0), BPF_REG_3));
+
+ /* Store the cpuinfo_t* in %r0 into arg1. */
+ emit(dlp, BPF_STORE(BPF_DW, BPF_REG_7, DMST_ARG(1), BPF_REG_0));
+
/*
* This is ugly but necessary... enqueue_task() takes a flags argument and the
* ENQUEUE_HEAD flag is used to indicate that the task is to be placed at the
@@ -120,15 +175,6 @@ static int trampoline(dt_pcb_t *pcb, uint_t exitlbl)
* outside the kernel source tree.
*/
#define ENQUEUE_HEAD 0x10
-
- emit(dlp, BPF_LOAD(BPF_DW, BPF_REG_0, BPF_REG_7, DMST_ARG(1)));
- emit(dlp, BPF_STORE(BPF_DW, BPF_REG_7, DMST_ARG(0), BPF_REG_0));
- /*
- * FIXME: arg1 should be a pointer to cpuinfo_t for the CPU
- * associated with the runqueue.
- */
- emit(dlp, BPF_STORE_IMM(BPF_DW, BPF_REG_7, DMST_ARG(1), 0));
-
emit(dlp, BPF_LOAD(BPF_DW, BPF_REG_0, BPF_REG_7, DMST_ARG(2)));
emit(dlp, BPF_ALU64_IMM(BPF_AND, BPF_REG_0, ENQUEUE_HEAD));
emit(dlp, BPF_STORE(BPF_DW, BPF_REG_7, DMST_ARG(2), BPF_REG_0));
diff --git a/test/unittest/sched/tst.enqueue.d b/test/unittest/sched/tst.enqueue.d
index f445ac843..28dcace8c 100644
--- a/test/unittest/sched/tst.enqueue.d
+++ b/test/unittest/sched/tst.enqueue.d
@@ -4,7 +4,6 @@
* Licensed under the Universal Permissive License v 1.0 as shown at
* http://oss.oracle.com/licenses/upl.
*/
-/* @@xfail: dtv2 */
#pragma D option switchrate=100hz
#pragma D option destructive
--
2.43.5
^ permalink raw reply related [flat|nested] 8+ messages in thread
* Re: [DTrace-devel] [PATCH 1/2] Add a cpuinfos BPF map
2025-03-31 21:45 [PATCH 1/2] Add a cpuinfos BPF map eugene.loh
2025-03-31 21:45 ` [PATCH 2/2] Clean up sched provider trampoline FIXMEs eugene.loh
@ 2025-04-01 5:36 ` Kris Van Hees
2025-04-01 22:54 ` Eugene Loh
1 sibling, 1 reply; 8+ messages in thread
From: Kris Van Hees @ 2025-04-01 5:36 UTC (permalink / raw)
To: eugene.loh; +Cc: dtrace, dtrace-devel
This is not the way to go about this. If, in order to implement the cpuinfo_t
argument to sched probes, a regular BPF array map is needed so that cpuinfo
data can be accessed for any given CPU id, then the existing map should be
replaced with the new one, and its use updated to access the new one. That
way you can also keep the name of the map, etc...
Introducing this new map with exactly the same data, and then hoping to
deprecate the old one later is making things more messy.
On Mon, Mar 31, 2025 at 05:45:00PM -0400, eugene.loh--- via DTrace-devel wrote:
> From: Eugene Loh <eugene.loh@oracle.com>
>
> The cpuinfo BPF map is a per-CPU map that has CPU information
> on each CPU for that CPU.
>
> Add a cpuinfos BPF map that allows any CPU to access information
> for any other CPU.
>
> For now, we retain the older per-CPU map. If desired, a future
> patch can migrate existing uses of the per-CPU map to the new
> map, decommissioning the old one. This would include map set up:
>
> *) libdtrace/dt_dlibs.c: DT_BPF_SYMBOL(cpuinfo, DT_IDENT_PTR),
>
> *) libdtrace/dt_impl.h: int dt_cpumap_fd;
>
> *) libdtrace/dt_bpf.c: dtp->dt_cpumap_fd = ...
> libdtrace/dt_bpf.c: CREATE_MAP(cpuinfo)
>
> and map use:
>
> *) bpf/get_agg.c
> *) bpf/get_bvar.c
> *) libdtrace/dt_cg.c
> *) libdtrace/dt_prov_lockstat.c
>
> Signed-off-by: Eugene Loh <eugene.loh@oracle.com>
> ---
> libdtrace/dt_bpf.c | 13 +++++++++++++
> libdtrace/dt_dlibs.c | 1 +
> libdtrace/dt_impl.h | 1 +
> 3 files changed, 15 insertions(+)
>
> diff --git a/libdtrace/dt_bpf.c b/libdtrace/dt_bpf.c
> index 6d42a96c7..8da51d6b9 100644
> --- a/libdtrace/dt_bpf.c
> +++ b/libdtrace/dt_bpf.c
> @@ -786,7 +786,20 @@ gmap_create_cpuinfo(dtrace_hdl_t *dtp)
> if (dtp->dt_cpumap_fd == -1)
> return -1;
>
> + dtp->dt_cpusmap_fd = create_gmap(dtp, "cpuinfos",
> + BPF_MAP_TYPE_HASH,
> + sizeof(uint32_t),
> + sizeof(dt_bpf_cpuinfo_t), ncpus);
> + if (dtp->dt_cpusmap_fd == -1)
> + return -1;
> +
> rc = dt_bpf_map_update(dtp->dt_cpumap_fd, &key, data);
> +
> + for (i = 0, ci = &conf->cpus[0]; i < ncpus && rc != -1; i++, ci++) {
> + key = ci->cpu_id;
> + rc = dt_bpf_map_update(dtp->dt_cpusmap_fd, &key, &data[ci->cpu_id]);
> + }
> +
> dt_free(dtp, data);
> if (rc == -1)
> return dt_bpf_error(dtp,
> diff --git a/libdtrace/dt_dlibs.c b/libdtrace/dt_dlibs.c
> index 21df22a8a..0f19f3566 100644
> --- a/libdtrace/dt_dlibs.c
> +++ b/libdtrace/dt_dlibs.c
> @@ -61,6 +61,7 @@ static const dt_ident_t dt_bpf_symbols[] = {
> DT_BPF_SYMBOL(agggen, DT_IDENT_PTR),
> DT_BPF_SYMBOL(buffers, DT_IDENT_PTR),
> DT_BPF_SYMBOL(cpuinfo, DT_IDENT_PTR),
> + DT_BPF_SYMBOL(cpuinfos, DT_IDENT_PTR),
> DT_BPF_SYMBOL(dvars, DT_IDENT_PTR),
> DT_BPF_SYMBOL(gvars, DT_IDENT_PTR),
> DT_BPF_SYMBOL(lvars, DT_IDENT_PTR),
> diff --git a/libdtrace/dt_impl.h b/libdtrace/dt_impl.h
> index 68fb8ec53..a5e42801c 100644
> --- a/libdtrace/dt_impl.h
> +++ b/libdtrace/dt_impl.h
> @@ -390,6 +390,7 @@ struct dtrace_hdl {
> int dt_aggmap_fd; /* file descriptor for the 'aggs' BPF map */
> int dt_genmap_fd; /* file descriptor for the 'agggen' BPF map */
> int dt_cpumap_fd; /* file descriptor for the 'cpuinfo' BPF map */
> + int dt_cpusmap_fd; /* file descriptor for the 'cpuinfos' BPF map */
> int dt_usdt_pridsmap_fd; /* file descriptor for the 'usdt_prids' BPF map */
> int dt_usdt_namesmap_fd; /* file descriptor for the 'usdt_names' BPF map */
> dtrace_handle_err_f *dt_errhdlr; /* error handler, if any */
> --
> 2.43.5
>
>
> _______________________________________________
> DTrace-devel mailing list
> DTrace-devel@oss.oracle.com
> https://oss.oracle.com/mailman/listinfo/dtrace-devel
^ permalink raw reply [flat|nested] 8+ messages in thread
* Re: [DTrace-devel] [PATCH 1/2] Add a cpuinfos BPF map
2025-04-01 5:36 ` [DTrace-devel] [PATCH 1/2] Add a cpuinfos BPF map Kris Van Hees
@ 2025-04-01 22:54 ` Eugene Loh
2025-04-01 23:03 ` Kris Van Hees
2025-04-02 9:37 ` Alan Maguire
0 siblings, 2 replies; 8+ messages in thread
From: Eugene Loh @ 2025-04-01 22:54 UTC (permalink / raw)
To: Kris Van Hees; +Cc: dtrace, dtrace-devel
Here is a proposal. First, two observations:
1. (As Alan pointed out to me in a facepalm moment), one can write a
simple D script to check enqueue_task_*()'s rq->cpu against the current
CPU. He and I both find that the two CPUs are generally -- but not
always -- the same. So, the strictly correct thing to do is use the
rq->cpu value, even though you can just use the current CPU and be
correct "99%" of the time.
2. A BPF program can access per-cpu-array values on other CPUs. Well, I
guess you need commit 0734311 ("bpf: add bpf_map_lookup_percpu_elem for
percpu map"). That's in 5.18. That is, UEK9.
So my proposal is to leave the per-cpu cpuinfo BPF map alone. Perform a
runtime test whether bpf_map_lookup_percpu_elem() is available. If so,
do that cross-CPU lookup -- the 2/2 patch I posted -- but using the new
helper function. If not, use a simpler on-CPU lookup, which should be
right "99%" of the time. (I have a simple patch that uses the current
CPU. Pretty simple.)
On 4/1/25 01:36, Kris Van Hees wrote:
> This is not the way to go about this. If, in order to implement the cpuinfo_t
> argument to sched probes, a regular BPF array map is needed so that cpuinfo
> data can be accessed for any given CPU id, then the existing map should be
> replaced with the new one, and its use updated to access the new one. That
> way you can also keep the name of the map, etc...
>
> Introducing this new map with exactly the same data, and then hoping to
> deprecate the old one later is making things more messy.
>
> On Mon, Mar 31, 2025 at 05:45:00PM -0400, eugene.loh--- via DTrace-devel wrote:
>> From: Eugene Loh <eugene.loh@oracle.com>
>>
>> The cpuinfo BPF map is a per-CPU map that has CPU information
>> on each CPU for that CPU.
>>
>> Add a cpuinfos BPF map that allows any CPU to access information
>> for any other CPU.
>>
>> For now, we retain the older per-CPU map. If desired, a future
>> patch can migrate existing uses of the per-CPU map to the new
>> map, decommissioning the old one. This would include map set up:
>>
>> *) libdtrace/dt_dlibs.c: DT_BPF_SYMBOL(cpuinfo, DT_IDENT_PTR),
>>
>> *) libdtrace/dt_impl.h: int dt_cpumap_fd;
>>
>> *) libdtrace/dt_bpf.c: dtp->dt_cpumap_fd = ...
>> libdtrace/dt_bpf.c: CREATE_MAP(cpuinfo)
>>
>> and map use:
>>
>> *) bpf/get_agg.c
>> *) bpf/get_bvar.c
>> *) libdtrace/dt_cg.c
>> *) libdtrace/dt_prov_lockstat.c
>>
>> Signed-off-by: Eugene Loh <eugene.loh@oracle.com>
>> ---
>> libdtrace/dt_bpf.c | 13 +++++++++++++
>> libdtrace/dt_dlibs.c | 1 +
>> libdtrace/dt_impl.h | 1 +
>> 3 files changed, 15 insertions(+)
>>
>> diff --git a/libdtrace/dt_bpf.c b/libdtrace/dt_bpf.c
>> index 6d42a96c7..8da51d6b9 100644
>> --- a/libdtrace/dt_bpf.c
>> +++ b/libdtrace/dt_bpf.c
>> @@ -786,7 +786,20 @@ gmap_create_cpuinfo(dtrace_hdl_t *dtp)
>> if (dtp->dt_cpumap_fd == -1)
>> return -1;
>>
>> + dtp->dt_cpusmap_fd = create_gmap(dtp, "cpuinfos",
>> + BPF_MAP_TYPE_HASH,
>> + sizeof(uint32_t),
>> + sizeof(dt_bpf_cpuinfo_t), ncpus);
>> + if (dtp->dt_cpusmap_fd == -1)
>> + return -1;
>> +
>> rc = dt_bpf_map_update(dtp->dt_cpumap_fd, &key, data);
>> +
>> + for (i = 0, ci = &conf->cpus[0]; i < ncpus && rc != -1; i++, ci++) {
>> + key = ci->cpu_id;
>> + rc = dt_bpf_map_update(dtp->dt_cpusmap_fd, &key, &data[ci->cpu_id]);
>> + }
>> +
>> dt_free(dtp, data);
>> if (rc == -1)
>> return dt_bpf_error(dtp,
>> diff --git a/libdtrace/dt_dlibs.c b/libdtrace/dt_dlibs.c
>> index 21df22a8a..0f19f3566 100644
>> --- a/libdtrace/dt_dlibs.c
>> +++ b/libdtrace/dt_dlibs.c
>> @@ -61,6 +61,7 @@ static const dt_ident_t dt_bpf_symbols[] = {
>> DT_BPF_SYMBOL(agggen, DT_IDENT_PTR),
>> DT_BPF_SYMBOL(buffers, DT_IDENT_PTR),
>> DT_BPF_SYMBOL(cpuinfo, DT_IDENT_PTR),
>> + DT_BPF_SYMBOL(cpuinfos, DT_IDENT_PTR),
>> DT_BPF_SYMBOL(dvars, DT_IDENT_PTR),
>> DT_BPF_SYMBOL(gvars, DT_IDENT_PTR),
>> DT_BPF_SYMBOL(lvars, DT_IDENT_PTR),
>> diff --git a/libdtrace/dt_impl.h b/libdtrace/dt_impl.h
>> index 68fb8ec53..a5e42801c 100644
>> --- a/libdtrace/dt_impl.h
>> +++ b/libdtrace/dt_impl.h
>> @@ -390,6 +390,7 @@ struct dtrace_hdl {
>> int dt_aggmap_fd; /* file descriptor for the 'aggs' BPF map */
>> int dt_genmap_fd; /* file descriptor for the 'agggen' BPF map */
>> int dt_cpumap_fd; /* file descriptor for the 'cpuinfo' BPF map */
>> + int dt_cpusmap_fd; /* file descriptor for the 'cpuinfos' BPF map */
>> int dt_usdt_pridsmap_fd; /* file descriptor for the 'usdt_prids' BPF map */
>> int dt_usdt_namesmap_fd; /* file descriptor for the 'usdt_names' BPF map */
>> dtrace_handle_err_f *dt_errhdlr; /* error handler, if any */
>> --
>> 2.43.5
>>
>>
>> _______________________________________________
>> DTrace-devel mailing list
>> DTrace-devel@oss.oracle.com
>> https://oss.oracle.com/mailman/listinfo/dtrace-devel
^ permalink raw reply [flat|nested] 8+ messages in thread
* Re: [DTrace-devel] [PATCH 1/2] Add a cpuinfos BPF map
2025-04-01 22:54 ` Eugene Loh
@ 2025-04-01 23:03 ` Kris Van Hees
2025-04-02 23:50 ` Eugene Loh
2025-04-02 9:37 ` Alan Maguire
1 sibling, 1 reply; 8+ messages in thread
From: Kris Van Hees @ 2025-04-01 23:03 UTC (permalink / raw)
To: Eugene Loh; +Cc: Kris Van Hees, dtrace, dtrace-devel
On Tue, Apr 01, 2025 at 06:54:29PM -0400, Eugene Loh wrote:
> Here is a proposal. First, two observations:
>
> 1. (As Alan pointed out to me in a facepalm moment), one can write a simple
> D script to check enqueue_task_*()'s rq->cpu against the current CPU. He
> and I both find that the two CPUs are generally -- but not always -- the
> same. So, the strictly correct thing to do is use the rq->cpu value, even
> though you can just use the current CPU and be correct "99%" of the time.
Sort of what I expected. But nice to see it confirmed.
> 2. A BPF program can access per-cpu-array values on other CPUs. Well, I
> guess you need commit 0734311 ("bpf: add bpf_map_lookup_percpu_elem for
> percpu map"). That's in 5.18. That is, UEK9.
We oculd get that backported I bet but that doesn't help upstream. So, not
really worth asking for a backport I think.
> So my proposal is to leave the per-cpu cpuinfo BPF map alone. Perform a
> runtime test whether bpf_map_lookup_percpu_elem() is available. If so, do
> that cross-CPU lookup -- the 2/2 patch I posted -- but using the new helper
> function. If not, use a simpler on-CPU lookup, which should be right "99%"
> of the time. (I have a simple patch that uses the current CPU. Pretty
> simple.)
But... 95% correct of the time doesn't quite cut it. I could see some quite
useful case for using these probes to specifically capture the times when it
does *not* originate from the same CPU. Settling for 95% correctness seems
like an odd tradeoff to me. Especally since we can get it right 100% of the
time without too much trouble. Just convert the cpuinfo map to a regular
array map, and instead of indexing it with a 0 key all the time, index it with
the value returned by bpf_get_smp_processor_id((). Then we have code that will
work on all kernels - no special casing. And I do not see there being much
performance difference by going this route.
> On 4/1/25 01:36, Kris Van Hees wrote:
>
> > This is not the way to go about this. If, in order to implement the cpuinfo_t
> > argument to sched probes, a regular BPF array map is needed so that cpuinfo
> > data can be accessed for any given CPU id, then the existing map should be
> > replaced with the new one, and its use updated to access the new one. That
> > way you can also keep the name of the map, etc...
> >
> > Introducing this new map with exactly the same data, and then hoping to
> > deprecate the old one later is making things more messy.
> >
> > On Mon, Mar 31, 2025 at 05:45:00PM -0400, eugene.loh--- via DTrace-devel wrote:
> > > From: Eugene Loh <eugene.loh@oracle.com>
> > >
> > > The cpuinfo BPF map is a per-CPU map that has CPU information
> > > on each CPU for that CPU.
> > >
> > > Add a cpuinfos BPF map that allows any CPU to access information
> > > for any other CPU.
> > >
> > > For now, we retain the older per-CPU map. If desired, a future
> > > patch can migrate existing uses of the per-CPU map to the new
> > > map, decommissioning the old one. This would include map set up:
> > >
> > > *) libdtrace/dt_dlibs.c: DT_BPF_SYMBOL(cpuinfo, DT_IDENT_PTR),
> > >
> > > *) libdtrace/dt_impl.h: int dt_cpumap_fd;
> > >
> > > *) libdtrace/dt_bpf.c: dtp->dt_cpumap_fd = ...
> > > libdtrace/dt_bpf.c: CREATE_MAP(cpuinfo)
> > >
> > > and map use:
> > >
> > > *) bpf/get_agg.c
> > > *) bpf/get_bvar.c
> > > *) libdtrace/dt_cg.c
> > > *) libdtrace/dt_prov_lockstat.c
> > >
> > > Signed-off-by: Eugene Loh <eugene.loh@oracle.com>
> > > ---
> > > libdtrace/dt_bpf.c | 13 +++++++++++++
> > > libdtrace/dt_dlibs.c | 1 +
> > > libdtrace/dt_impl.h | 1 +
> > > 3 files changed, 15 insertions(+)
> > >
> > > diff --git a/libdtrace/dt_bpf.c b/libdtrace/dt_bpf.c
> > > index 6d42a96c7..8da51d6b9 100644
> > > --- a/libdtrace/dt_bpf.c
> > > +++ b/libdtrace/dt_bpf.c
> > > @@ -786,7 +786,20 @@ gmap_create_cpuinfo(dtrace_hdl_t *dtp)
> > > if (dtp->dt_cpumap_fd == -1)
> > > return -1;
> > > + dtp->dt_cpusmap_fd = create_gmap(dtp, "cpuinfos",
> > > + BPF_MAP_TYPE_HASH,
> > > + sizeof(uint32_t),
> > > + sizeof(dt_bpf_cpuinfo_t), ncpus);
> > > + if (dtp->dt_cpusmap_fd == -1)
> > > + return -1;
> > > +
> > > rc = dt_bpf_map_update(dtp->dt_cpumap_fd, &key, data);
> > > +
> > > + for (i = 0, ci = &conf->cpus[0]; i < ncpus && rc != -1; i++, ci++) {
> > > + key = ci->cpu_id;
> > > + rc = dt_bpf_map_update(dtp->dt_cpusmap_fd, &key, &data[ci->cpu_id]);
> > > + }
> > > +
> > > dt_free(dtp, data);
> > > if (rc == -1)
> > > return dt_bpf_error(dtp,
> > > diff --git a/libdtrace/dt_dlibs.c b/libdtrace/dt_dlibs.c
> > > index 21df22a8a..0f19f3566 100644
> > > --- a/libdtrace/dt_dlibs.c
> > > +++ b/libdtrace/dt_dlibs.c
> > > @@ -61,6 +61,7 @@ static const dt_ident_t dt_bpf_symbols[] = {
> > > DT_BPF_SYMBOL(agggen, DT_IDENT_PTR),
> > > DT_BPF_SYMBOL(buffers, DT_IDENT_PTR),
> > > DT_BPF_SYMBOL(cpuinfo, DT_IDENT_PTR),
> > > + DT_BPF_SYMBOL(cpuinfos, DT_IDENT_PTR),
> > > DT_BPF_SYMBOL(dvars, DT_IDENT_PTR),
> > > DT_BPF_SYMBOL(gvars, DT_IDENT_PTR),
> > > DT_BPF_SYMBOL(lvars, DT_IDENT_PTR),
> > > diff --git a/libdtrace/dt_impl.h b/libdtrace/dt_impl.h
> > > index 68fb8ec53..a5e42801c 100644
> > > --- a/libdtrace/dt_impl.h
> > > +++ b/libdtrace/dt_impl.h
> > > @@ -390,6 +390,7 @@ struct dtrace_hdl {
> > > int dt_aggmap_fd; /* file descriptor for the 'aggs' BPF map */
> > > int dt_genmap_fd; /* file descriptor for the 'agggen' BPF map */
> > > int dt_cpumap_fd; /* file descriptor for the 'cpuinfo' BPF map */
> > > + int dt_cpusmap_fd; /* file descriptor for the 'cpuinfos' BPF map */
> > > int dt_usdt_pridsmap_fd; /* file descriptor for the 'usdt_prids' BPF map */
> > > int dt_usdt_namesmap_fd; /* file descriptor for the 'usdt_names' BPF map */
> > > dtrace_handle_err_f *dt_errhdlr; /* error handler, if any */
> > > --
> > > 2.43.5
> > >
> > >
> > > _______________________________________________
> > > DTrace-devel mailing list
> > > DTrace-devel@oss.oracle.com
> > > https://oss.oracle.com/mailman/listinfo/dtrace-devel
^ permalink raw reply [flat|nested] 8+ messages in thread
* Re: [DTrace-devel] [PATCH 1/2] Add a cpuinfos BPF map
2025-04-01 22:54 ` Eugene Loh
2025-04-01 23:03 ` Kris Van Hees
@ 2025-04-02 9:37 ` Alan Maguire
2025-04-02 19:18 ` Eugene Loh
1 sibling, 1 reply; 8+ messages in thread
From: Alan Maguire @ 2025-04-02 9:37 UTC (permalink / raw)
To: Eugene Loh, Kris Van Hees; +Cc: dtrace, dtrace-devel
On 01/04/2025 23:54, Eugene Loh wrote:
> Here is a proposal. First, two observations:
>
> 1. (As Alan pointed out to me in a facepalm moment), one can write a
> simple D script to check enqueue_task_*()'s rq->cpu against the current
> CPU. He and I both find that the two CPUs are generally -- but not
> always -- the same. So, the strictly correct thing to do is use the rq-
>>cpu value, even though you can just use the current CPU and be correct
> "99%" of the time.
>
> 2. A BPF program can access per-cpu-array values on other CPUs. Well, I
> guess you need commit 0734311 ("bpf: add bpf_map_lookup_percpu_elem for
> percpu map"). That's in 5.18. That is, UEK9.
>
Nice find; this commit looks relatively standalone so you could file a
bug to request backport to UEK7U3 if it'd help. No guarantees of course
but it's not too distant from 5.15 and we've backported helpers before
and managed to deal with kABI issues.
> So my proposal is to leave the per-cpu cpuinfo BPF map alone. Perform a
> runtime test whether bpf_map_lookup_percpu_elem() is available. If so,
> do that cross-CPU lookup -- the 2/2 patch I posted -- but using the new
> helper function. If not, use a simpler on-CPU lookup, which should be
> right "99%" of the time. (I have a simple patch that uses the current
> CPU. Pretty simple.)
For what it's worth, I think it'd probably be more valuable to preserve
an accurate CPU id and worry less about the other fields in the
cpuinfo_t; i.e. when tracing, I mostly care about accurate cpu id info
and never look at the other data in a cpuinfo_t. So if it wasn't
possible to retrieve accurate cpuinfo_t info via a cross-cpu lookup via
the 5.19 helper, it might be better to fake up a cpuinfo_t with a
correct cpu id and other fields unset. I'm probably missing it, but I
don't see where those fields are populated currently; tried this a few
times and they look to be unset for me aside from cpu id:
# dtrace -n 'BEGIN { print((cpuinfo_t *)curcpu); } '
dtrace: description 'BEGIN ' matched 1 probe
CPU ID FUNCTION:NAME
5 1 :BEGIN 0xffffe05e7fb61b00 = *
(cpuinfo_t) {
.cpu_id = (processorid_t)5,
}
^C
# dtrace -n 'BEGIN { print((cpuinfo_t *)curcpu); } '
dtrace: description 'BEGIN ' matched 1 probe
CPU ID FUNCTION:NAME
7 1 :BEGIN 0xffffe05e7fbe1b00 = *
(cpuinfo_t) {
.cpu_id = (processorid_t)7,
}
^C
If those other fields are unset, maybe there would be a way to invoke a
translator to create a cpuinfo_t from just the cpu id? Not sure about
the mechanics here, but my worry would be that it could be exactly the
times where we are on cpu x and enqueueing on cpu y we might be
interested in, and if that info wasn't preserved we might miss something
valuable about how the system was behaving. Anyway thanks for fixing up
the sched provider, it's really useful!
Alan
^ permalink raw reply [flat|nested] 8+ messages in thread
* Re: [DTrace-devel] [PATCH 1/2] Add a cpuinfos BPF map
2025-04-02 9:37 ` Alan Maguire
@ 2025-04-02 19:18 ` Eugene Loh
0 siblings, 0 replies; 8+ messages in thread
From: Eugene Loh @ 2025-04-02 19:18 UTC (permalink / raw)
To: Alan Maguire, Kris Van Hees; +Cc: dtrace, dtrace-devel
On 4/2/25 05:37, Alan Maguire wrote:
> On 01/04/2025 23:54, Eugene Loh wrote:
>> Here is a proposal. First, two observations:
>>
>> 1. (As Alan pointed out to me in a facepalm moment), one can write a
>> simple D script to check enqueue_task_*()'s rq->cpu against the current
>> CPU. He and I both find that the two CPUs are generally -- but not
>> always -- the same. So, the strictly correct thing to do is use the rq-
>>> cpu value, even though you can just use the current CPU and be correct
>> "99%" of the time.
>>
>> 2. A BPF program can access per-cpu-array values on other CPUs. Well, I
>> guess you need commit 0734311 ("bpf: add bpf_map_lookup_percpu_elem for
>> percpu map"). That's in 5.18. That is, UEK9.
>>
> Nice find; this commit looks relatively standalone so you could file a
> bug to request backport to UEK7U3 if it'd help. No guarantees of course
> but it's not too distant from 5.15 and we've backported helpers before
> and managed to deal with kABI issues.
Kris was leaning to not relying on this helper since it's not ubiquitous
and I agree. In particular, it's not too hard just to have a global map
that one accesses by cpuid (self or other).
>> So my proposal is to leave the per-cpu cpuinfo BPF map alone. Perform a
>> runtime test whether bpf_map_lookup_percpu_elem() is available. If so,
>> do that cross-CPU lookup -- the 2/2 patch I posted -- but using the new
>> helper function. If not, use a simpler on-CPU lookup, which should be
>> right "99%" of the time. (I have a simple patch that uses the current
>> CPU. Pretty simple.)
> For what it's worth, I think it'd probably be more valuable to preserve
> an accurate CPU id
Kris agrees and I'm on board.
> and worry less about the other fields in the
> cpuinfo_t; i.e. when tracing, I mostly care about accurate cpu id info
> and never look at the other data in a cpuinfo_t. So if it wasn't
> possible to retrieve accurate cpuinfo_t info via a cross-cpu lookup via
> the 5.19 helper, it might be better to fake up a cpuinfo_t with a
> correct cpu id and other fields unset. I'm probably missing it, but I
> don't see where those fields are populated currently;
dt_conf.c sets sets cpu_id and cpu_chip.
https://docs.oracle.com/en/operating-systems/oracle-linux/dtrace-guide/dtrace-ref-DTraceProviders.html#dt_schedargs_prov
says cpu_pset and cpu_lgrp are unsupported.
Anyhow, I have a patch (will probably post today) that changes the
per-cpu array to a global map and so we should be able to get the CPUs
right.
> tried this a few
> times and they look to be unset for me aside from cpu id:
>
> # dtrace -n 'BEGIN { print((cpuinfo_t *)curcpu); } '
> dtrace: description 'BEGIN ' matched 1 probe
> CPU ID FUNCTION:NAME
> 5 1 :BEGIN 0xffffe05e7fb61b00 = *
> (cpuinfo_t) {
> .cpu_id = (processorid_t)5,
> }
>
> ^C
>
> # dtrace -n 'BEGIN { print((cpuinfo_t *)curcpu); } '
> dtrace: description 'BEGIN ' matched 1 probe
> CPU ID FUNCTION:NAME
> 7 1 :BEGIN 0xffffe05e7fbe1b00 = *
> (cpuinfo_t) {
> .cpu_id = (processorid_t)7,
> }
>
> ^C
>
> If those other fields are unset, maybe there would be a way to invoke a
> translator to create a cpuinfo_t from just the cpu id? Not sure about
> the mechanics here, but my worry would be that it could be exactly the
> times where we are on cpu x and enqueueing on cpu y we might be
> interested in, and if that info wasn't preserved we might miss something
> valuable about how the system was behaving. Anyway thanks for fixing up
> the sched provider, it's really useful!
>
> Alan
^ permalink raw reply [flat|nested] 8+ messages in thread
* Re: [DTrace-devel] [PATCH 1/2] Add a cpuinfos BPF map
2025-04-01 23:03 ` Kris Van Hees
@ 2025-04-02 23:50 ` Eugene Loh
0 siblings, 0 replies; 8+ messages in thread
From: Eugene Loh @ 2025-04-02 23:50 UTC (permalink / raw)
To: dtrace-devel, dtrace
On 4/1/25 19:03, Kris Van Hees wrote:
> On Tue, Apr 01, 2025 at 06:54:29PM -0400, Eugene Loh wrote:
>> Here is a proposal. First, two observations:
>>
>> 1. (As Alan pointed out to me in a facepalm moment), one can write a simple
>> D script to check enqueue_task_*()'s rq->cpu against the current CPU. He
>> and I both find that the two CPUs are generally -- but not always -- the
>> same. So, the strictly correct thing to do is use the rq->cpu value, even
>> though you can just use the current CPU and be correct "99%" of the time.
> Sort of what I expected. But nice to see it confirmed.
>
>> 2. A BPF program can access per-cpu-array values on other CPUs. Well, I
>> guess you need commit 0734311 ("bpf: add bpf_map_lookup_percpu_elem for
>> percpu map"). That's in 5.18. That is, UEK9.
> We oculd get that backported I bet but that doesn't help upstream. So, not
> really worth asking for a backport I think.
>
>> So my proposal is to leave the per-cpu cpuinfo BPF map alone. Perform a
>> runtime test whether bpf_map_lookup_percpu_elem() is available. If so, do
>> that cross-CPU lookup -- the 2/2 patch I posted -- but using the new helper
>> function. If not, use a simpler on-CPU lookup, which should be right "99%"
>> of the time. (I have a simple patch that uses the current CPU. Pretty
>> simple.)
> But... 95% correct of the time doesn't quite cut it. I could see some quite
> useful case for using these probes to specifically capture the times when it
> does *not* originate from the same CPU. Settling for 95% correctness seems
> like an odd tradeoff to me. Especally since we can get it right 100% of the
> time without too much trouble. Just convert the cpuinfo map to a regular
> array map, and instead of indexing it with a 0 key all the time, index it with
> the value returned by bpf_get_smp_processor_id((). Then we have code that will
> work on all kernels - no special casing. And I do not see there being much
> performance difference by going this route.
This patch is withdrawn.
^ permalink raw reply [flat|nested] 8+ messages in thread
end of thread, other threads:[~2025-04-02 23:51 UTC | newest]
Thread overview: 8+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2025-03-31 21:45 [PATCH 1/2] Add a cpuinfos BPF map eugene.loh
2025-03-31 21:45 ` [PATCH 2/2] Clean up sched provider trampoline FIXMEs eugene.loh
2025-04-01 5:36 ` [DTrace-devel] [PATCH 1/2] Add a cpuinfos BPF map Kris Van Hees
2025-04-01 22:54 ` Eugene Loh
2025-04-01 23:03 ` Kris Van Hees
2025-04-02 23:50 ` Eugene Loh
2025-04-02 9:37 ` Alan Maguire
2025-04-02 19:18 ` Eugene Loh
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.