From: eugene.loh@oracle.com
To: dtrace@lists.linux.dev, dtrace-devel@oss.oracle.com
Subject: [PATCH 2/2] Extend the USDT bit mask to multiple words
Date: Wed, 19 Feb 2025 23:43:50 -0500 [thread overview]
Message-ID: <20250220044350.14953-2-eugene.loh@oracle.com> (raw)
In-Reply-To: <20250220044350.14953-1-eugene.loh@oracle.com>
From: Eugene Loh <eugene.loh@oracle.com>
Currently, USDT is limited to 64 probe descriptions since the
underlying probe uses a 64-bit mask to decide which probes to execute.
Change to a multi-word bit mask that can be extended to however many
probe descriptions there are.
Also, change the mask words to be 32-bit rather than 64-bit. The reason
is that, commonly, there will be fewer than 32 probe descriptions. In
this case, we shorten the value of the "usdt_prids" BPF map from 16 bytes:
    uint32_t prid;
    long long mask[1];
down to 8 bytes:
    uint32_t prid;
    uint32_t mask[1];
(The second member is smaller and no longer costs extra padding.)
We also add an
extern int usdt_prids_map_val_extra_bytes;
to denote how many extra bytes will be needed for the extended mask.
This value is computed by usdt_prids_map_val_extra_bytes_init().
Currently, this function is awkwardly called in gmap_create_usdt(),
just before the value is needed. Such a call to a provider-specific
function is clumsy, but there are no other calls to the provider
between compilation (where the number of statements is determined)
and this map creation.
Signed-off-by: Eugene Loh <eugene.loh@oracle.com>
---
libdtrace/dt_bpf.c | 6 +-
libdtrace/dt_bpf_maps.h | 3 +-
libdtrace/dt_prov_uprobe.c | 81 ++++++++---
.../usdt/tst.many_probe_descriptions.r | 1 +
.../usdt/tst.many_probe_descriptions.sh | 64 +++++++++
.../usdt/tst.many_probe_descriptions2.r | 1 +
.../usdt/tst.many_probe_descriptions2.sh | 127 ++++++++++++++++++
7 files changed, 260 insertions(+), 23 deletions(-)
create mode 100644 test/unittest/usdt/tst.many_probe_descriptions.r
create mode 100755 test/unittest/usdt/tst.many_probe_descriptions.sh
create mode 100644 test/unittest/usdt/tst.many_probe_descriptions2.r
create mode 100755 test/unittest/usdt/tst.many_probe_descriptions2.sh
diff --git a/libdtrace/dt_bpf.c b/libdtrace/dt_bpf.c
index 662fd81a4..1ed9376ea 100644
--- a/libdtrace/dt_bpf.c
+++ b/libdtrace/dt_bpf.c
@@ -940,6 +940,7 @@ gmap_create_probes(dtrace_hdl_t *dtp)
return 0;
}
+void usdt_prids_map_val_extra_bytes_init(dtrace_hdl_t *dtp);
/*
* Create the 'usdt_names' and 'usdt_prids' BPF maps.
*
@@ -965,8 +966,11 @@ gmap_create_usdt(dtrace_hdl_t *dtp)
if (dtp->dt_usdt_namesmap_fd == -1)
return -1;
+ usdt_prids_map_val_extra_bytes_init(dtp);
+
dtp->dt_usdt_pridsmap_fd = create_gmap(dtp, "usdt_prids", BPF_MAP_TYPE_HASH,
- sizeof(usdt_prids_map_key_t), sizeof(usdt_prids_map_val_t), nusdtprobes);
+ sizeof(usdt_prids_map_key_t),
+ sizeof(usdt_prids_map_val_t) + usdt_prids_map_val_extra_bytes, nusdtprobes);
if (dtp->dt_usdt_pridsmap_fd == -1)
return -1;
diff --git a/libdtrace/dt_bpf_maps.h b/libdtrace/dt_bpf_maps.h
index 884dc3983..ba17d8942 100644
--- a/libdtrace/dt_bpf_maps.h
+++ b/libdtrace/dt_bpf_maps.h
@@ -48,8 +48,9 @@ typedef struct usdt_prids_map_key {
} usdt_prids_map_key_t;
typedef struct usdt_prids_map_val {
uint32_t prid; /* should be dtrace_id_t, sys/dtrace_types.h */
- long long mask;
+ uint32_t mask[1];
} usdt_prids_map_val_t;
+extern int usdt_prids_map_val_extra_bytes;
#ifdef __cplusplus
}
diff --git a/libdtrace/dt_prov_uprobe.c b/libdtrace/dt_prov_uprobe.c
index f1323cc31..2a5b0ce91 100644
--- a/libdtrace/dt_prov_uprobe.c
+++ b/libdtrace/dt_prov_uprobe.c
@@ -76,6 +76,8 @@ typedef struct list_key {
usdt_prids_map_key_t key;
} list_key_t;
+int usdt_prids_map_val_extra_bytes;
+
static const dtrace_pattr_t pattr = {
{ DTRACE_STABILITY_EVOLVING, DTRACE_STABILITY_EVOLVING, DTRACE_CLASS_ISA },
{ DTRACE_STABILITY_PRIVATE, DTRACE_STABILITY_PRIVATE, DTRACE_CLASS_UNKNOWN },
@@ -175,7 +177,7 @@ clean_usdt_probes(dtrace_hdl_t *dtp)
int fdprids = dtp->dt_usdt_pridsmap_fd;
int fdnames = dtp->dt_usdt_namesmap_fd;
usdt_prids_map_key_t key, nxt;
- usdt_prids_map_val_t val;
+ usdt_prids_map_val_t *val = alloca(sizeof(usdt_prids_map_val_t) + usdt_prids_map_val_extra_bytes);
list_key_t keys_to_delete, *elem, *elem_next;
dt_probe_t *prp, *prp_next;
@@ -190,7 +192,7 @@ clean_usdt_probes(dtrace_hdl_t *dtp)
while (dt_bpf_map_next_key(fdprids, &key, &nxt) == 0) {
memcpy(&key, &nxt, sizeof(usdt_prids_map_key_t));
- if (dt_bpf_map_lookup(fdprids, &key, &val) == -1)
+ if (dt_bpf_map_lookup(fdprids, &key, val) == -1)
return dt_set_errno(dtp, EDT_BPF);
/* Check if the process is still running. */
@@ -203,7 +205,7 @@ clean_usdt_probes(dtrace_hdl_t *dtp)
* we might delete the same usdt_names entry
* multiple times. That's okay.
*/
- dt_bpf_map_delete(fdnames, &val.prid);
+ dt_bpf_map_delete(fdnames, &val->prid);
/*
* Delete the usdt_prids entry.
@@ -224,7 +226,7 @@ clean_usdt_probes(dtrace_hdl_t *dtp)
* FIXME. There might be another case, where the process
* is still running, but some of its USDT probes are gone?
* So maybe we have to check for the existence of one of
- * dtrace_probedesc_t *pdp = dtp->dt_probes[val.prid]->desc;
+ * dtrace_probedesc_t *pdp = dtp->dt_probes[val->prid]->desc;
* char *prv = ...pdp->prv minus the numerial part;
*
* /run/dtrace/probes/$pid/$pdp->prv/$pdp->mod/$pdp->fun/$pdp->prb
@@ -346,6 +348,33 @@ ignore_clause(dtrace_hdl_t *dtp, int n, const dt_probe_t *uprp)
return 0;
}
+/*
+ * Set usdt_prids_map_val_extra_bytes, the number of bytes a "usdt_prids"
+ * map value needs beyond sizeof(usdt_prids_map_val_t).
+ *
+ * The bit mask needs one bit per statement that is not ignored; the first
+ * mask word is already part of the struct, so only later words count.
+ */
+void
+usdt_prids_map_val_extra_bytes_init(dtrace_hdl_t *dtp)
+{
+	int	wbytes = sizeof(((usdt_prids_map_val_t *)0)->mask[0]);
+	int	wbits = CHAR_BIT * wbytes;
+	int	id, nbits = 0, nwords;
+
+	/* One bit for each statement not ignored when uprp is NULL. */
+	for (id = 0; id < dtp->dt_stmt_nextid; id++) {
+		if (dtp->dt_stmts[id] != NULL && !ignore_clause(dtp, id, NULL))
+			nbits++;
+	}
+
+	/* Round up to whole mask words. */
+	nwords = (nbits + wbits - 1) / wbits;
+
+	/* Anything past the first word is extra. */
+	usdt_prids_map_val_extra_bytes = nwords > 1 ? (nwords - 1) * wbytes : 0;
+}
+
static int add_probe_uprobe(dtrace_hdl_t *dtp, dt_probe_t *prp)
{
dtrace_difo_t *dp;
@@ -416,6 +445,7 @@ static int add_probe_usdt(dtrace_hdl_t *dtp, dt_probe_t *prp)
int fd = dtp->dt_usdt_namesmap_fd;
pid_t pid;
list_probe_t *pup;
+ usdt_prids_map_val_t *val;
/* Add probe name elements to usdt_names map. */
p = probnam;
@@ -451,11 +481,11 @@ static int add_probe_usdt(dtrace_hdl_t *dtp, dt_probe_t *prp)
}
/* Add prid and bit mask to usdt_prids map. */
+ val = alloca(sizeof(usdt_prids_map_val_t) + usdt_prids_map_val_extra_bytes);
for (pup = prp->prv_data; pup != NULL; pup = dt_list_next(pup)) {
dt_probe_t *uprp = pup->probe;
- long long mask = 0, bit = 1;
+ uint32_t iword = 0, mask = 0, bit = 1;
usdt_prids_map_key_t key;
- usdt_prids_map_val_t val;
dt_uprobe_t *upp = uprp->prv_data;
/*
@@ -473,11 +503,15 @@ static int add_probe_usdt(dtrace_hdl_t *dtp, dt_probe_t *prp)
dtrace_stmtdesc_t *stp;
stp = dtp->dt_stmts[n];
- if (stp == NULL)
+ if (stp == NULL || ignore_clause(dtp, n, uprp))
continue;
- if (ignore_clause(dtp, n, uprp))
- continue;
+ if (bit == 0) {
+ val->mask[iword] = mask;
+ mask = 0;
+ iword++;
+ bit = 1;
+ }
if (dt_gmatch(prp->desc->prv, stp->dtsd_ecbdesc->dted_probe.prv) &&
dt_gmatch(prp->desc->mod, stp->dtsd_ecbdesc->dted_probe.mod) &&
@@ -492,11 +526,11 @@ static int add_probe_usdt(dtrace_hdl_t *dtp, dt_probe_t *prp)
key.pid = pid;
key.uprid = uprp->desc->id;
- val.prid = prp->desc->id;
- val.mask = mask;
+ val->prid = prp->desc->id;
+ val->mask[iword] = mask;
// FIXME Check return value, but how should errors be handled?
- dt_bpf_map_update(dtp->dt_usdt_pridsmap_fd, &key, &val);
+ dt_bpf_map_update(dtp->dt_usdt_pridsmap_fd, &key, val);
}
return 0;
@@ -922,7 +956,7 @@ static int trampoline(dt_pcb_t *pcb, uint_t exitlbl)
const list_probe_t *pop;
uint_t lbl_exit = pcb->pcb_exitlbl;
dt_ident_t *usdt_prids = dt_dlib_get_map(dtp, "usdt_prids");
- int n;
+ int n, ibit, w = CHAR_BIT * sizeof(((usdt_prids_map_val_t *)0)->mask[0]);
assert(usdt_prids != NULL);
@@ -1020,7 +1054,8 @@ static int trampoline(dt_pcb_t *pcb, uint_t exitlbl)
*/
assert(sizeof(usdt_prids_map_key_t) <= DT_STK_SLOT_SZ);
emit(dlp, BPF_STORE(BPF_W, BPF_REG_FP, DT_TRAMP_SP_SLOT(0), BPF_REG_0));
- emit(dlp, BPF_STORE_IMM(BPF_W, BPF_REG_FP, DT_TRAMP_SP_SLOT(0) + sizeof(pid_t), uprp->desc->id));
+ emit(dlp, BPF_STORE_IMM(BPF_W, BPF_REG_FP,
+ DT_TRAMP_SP_SLOT(0) + (int)sizeof(pid_t), uprp->desc->id));
dt_cg_xsetx(dlp, usdt_prids, DT_LBL_NONE, BPF_REG_1, usdt_prids->di_id);
emit(dlp, BPF_MOV_REG(BPF_REG_2, BPF_REG_FP));
emit(dlp, BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, DT_TRAMP_SP_SLOT(0)));
@@ -1054,27 +1089,30 @@ static int trampoline(dt_pcb_t *pcb, uint_t exitlbl)
emit(dlp, BPF_LOAD(BPF_W, BPF_REG_1, BPF_REG_0, 0));
emit(dlp, BPF_STORE(BPF_W, BPF_REG_7, DMST_PRID, BPF_REG_1));
- /* Read the bit mask from the table lookup in %r6. */ // FIXME someday, extend this past 64 bits
- emit(dlp, BPF_LOAD(BPF_DW, BPF_REG_6, BPF_REG_0, offsetof(usdt_prids_map_val_t, mask)));
+ /* Save the pointer to the map value (in %r0) for reuse. */
+ emit(dlp, BPF_STORE(BPF_DW, BPF_REG_FP, DT_TRAMP_SP_SLOT(0), BPF_REG_0));
/*
* Hold the bit mask in %r6 between clause calls.
*/
- for (n = 0; n < dtp->dt_stmt_nextid; n++) {
+ for (ibit = n = 0; n < dtp->dt_stmt_nextid; n++) {
dtrace_stmtdesc_t *stp;
dt_ident_t *idp;
uint_t lbl_next;
stp = dtp->dt_stmts[n];
- if (stp == NULL)
- continue;
-
- if (ignore_clause(dtp, n, uprp))
+ if (stp == NULL || ignore_clause(dtp, n, uprp))
continue;
idp = stp->dtsd_clause;
lbl_next = dt_irlist_label(dlp);
+ /* Load the next word of the bit mask into %r6. */
+ if (ibit % w == 0) {
+ emit(dlp, BPF_LOAD(BPF_DW, BPF_REG_0, BPF_REG_FP, DT_TRAMP_SP_SLOT(0)));
+ emit(dlp, BPF_LOAD(BPF_W, BPF_REG_6, BPF_REG_0, offsetof(usdt_prids_map_val_t, mask[ibit / w])));
+ }
+
/* If the lowest %r6 bit is 0, skip over this clause. */
emit(dlp, BPF_MOV_REG(BPF_REG_1, BPF_REG_6));
emit(dlp, BPF_ALU64_IMM(BPF_AND, BPF_REG_1, 1));
@@ -1102,6 +1140,7 @@ static int trampoline(dt_pcb_t *pcb, uint_t exitlbl)
/* Right-shift %r6. */
emit(dlp, BPF_ALU64_IMM(BPF_RSH, BPF_REG_6, 1));
+ ibit++;
}
out:
diff --git a/test/unittest/usdt/tst.many_probe_descriptions.r b/test/unittest/usdt/tst.many_probe_descriptions.r
new file mode 100644
index 000000000..2e9ba477f
--- /dev/null
+++ b/test/unittest/usdt/tst.many_probe_descriptions.r
@@ -0,0 +1 @@
+success
diff --git a/test/unittest/usdt/tst.many_probe_descriptions.sh b/test/unittest/usdt/tst.many_probe_descriptions.sh
new file mode 100755
index 000000000..92a61d5b7
--- /dev/null
+++ b/test/unittest/usdt/tst.many_probe_descriptions.sh
@@ -0,0 +1,64 @@
+#!/bin/bash
+#
+# Oracle Linux DTrace.
+# Copyright (c) 2025, Oracle and/or its affiliates. All rights reserved.
+# Licensed under the Universal Permissive License v 1.0 as shown at
+# http://oss.oracle.com/licenses/upl.
+
+dtrace=$1
+TRIGGER=$PWD/test/triggers/usdt-tst-args
+
+# Work in a scratch directory; bail out rather than litter the current one.
+DIRNAME="$tmpdir/usdt-many_probe_descriptions.$$.$RANDOM"
+if ! mkdir -p "$DIRNAME" || ! cd "$DIRNAME"; then
+	echo "ERROR: cannot set up work directory $DIRNAME"
+	exit 1
+fi
+
+# Construct the D scripts and output files.
+# We stick 80 probe descriptions in each of 3 scripts to test
+# USDT's ability to handle hundreds of probe descriptions.
+# Clause $x of script $d prints "$d$x", so the expected output is
+# those labels in script order.
+for d in 0 1 2; do
+for x in $(seq -w 0 79); do
+	echo 'test_prov$target:::place { printf("'$d$x'\n"); }' >> D$d.d
+	echo $d$x >> expect.txt
+done
+done
+
+# The very last clause (appended to the last script) ends tracing.
+echo 'test_prov$target:::place { exit(0); }' >> D2.d
+echo >> expect.txt
+
+# Run DTrace.
+
+$dtrace $dt_flags -c $TRIGGER -q -s D0.d -s D1.d -s D2.d >& actual.txt
+if [ $? -eq 0 ]; then
+	if diff -q expect.txt actual.txt > /dev/null; then
+		echo success
+		exit 0
+	else
+		echo ERROR: did not get expected results
+		echo === expect.txt
+		cat expect.txt
+		echo === actual.txt
+		cat actual.txt
+		echo === diff
+		diff expect.txt actual.txt
+	fi
+else
+	echo ERROR: dtrace error
+	echo ==== output
+	cat actual.txt
+fi
+
+# Reached only on failure: show the scripts that were run.
+echo ==== script D0.d
+cat D0.d
+echo ==== script D1.d
+cat D1.d
+echo ==== script D2.d
+cat D2.d
+
+exit 1
diff --git a/test/unittest/usdt/tst.many_probe_descriptions2.r b/test/unittest/usdt/tst.many_probe_descriptions2.r
new file mode 100644
index 000000000..2e9ba477f
--- /dev/null
+++ b/test/unittest/usdt/tst.many_probe_descriptions2.r
@@ -0,0 +1 @@
+success
diff --git a/test/unittest/usdt/tst.many_probe_descriptions2.sh b/test/unittest/usdt/tst.many_probe_descriptions2.sh
new file mode 100755
index 000000000..cc8821c6e
--- /dev/null
+++ b/test/unittest/usdt/tst.many_probe_descriptions2.sh
@@ -0,0 +1,127 @@
+#!/bin/bash
+#
+# Oracle Linux DTrace.
+# Copyright (c) 2025, Oracle and/or its affiliates. All rights reserved.
+# Licensed under the Universal Permissive License v 1.0 as shown at
+# http://oss.oracle.com/licenses/upl.
+
+# This test uses many probes and probe descriptions. Therefore, the
+# number of BPF programs to load into the kernel -- dt_bpf_load_prog()
+# calling prp->prov->impl->load_prog(), which is dt_bpf_prog_load() --
+# and the duration of each load are both increasing.
+# @@timeout: 400
+
+dtrace=$1
+
+# Work in a scratch directory; bail out rather than litter the current one.
+DIRNAME="$tmpdir/usdt-many_probe_descriptions2.$$.$RANDOM"
+if ! mkdir -p "$DIRNAME" || ! cd "$DIRNAME"; then
+	echo "failed to set up work directory $DIRNAME" >&2
+	exit 1
+fi
+
+# Set the lists.
+# - The probes will be foo$x$y.
+# - The probe descriptions will be foo$x* and foo*$y, for each $d.
+# So if there are nx items in xlist, ny in ylist, and nd in dlist,
+# - there will be roughly nx*ny probes
+# - there will be roughly (nx+ny)*nd probe descriptions
+
+xlist="a b c d e f g h i j k l m"
+ylist="n o p q r s t u v w x y z"
+dlist="0 1 2 3 4 5 6 7 8"
+
+# Make the trigger: Preambles.
+
+echo "provider testprov {" > prov.d
+
+echo '#include "prov.h"' > main.c
+echo 'int main(int argc, char **argv) {' >> main.c
+
+# Make the trigger: Loop over the probes.
+
+for x in $xlist; do
+for y in $ylist; do
+	echo "probe foo$x$y();" >> prov.d
+	echo "TESTPROV_FOO$x$y();" | awk '{ print(toupper($1)) }' >> main.c
+done
+done
+
+# Make the trigger: Epilogues.
+
+echo "};" >> prov.d
+echo "return 0; }" >> main.c
+
+# Build the trigger.
+
+if ! $dtrace $dt_flags -h -s prov.d; then
+	echo "failed to generate header file" >&2
+	cat prov.d
+	exit 1
+fi
+if ! gcc $test_cppflags -c main.c; then
+	echo "failed to compile test" >&2
+	cat main.c
+	exit 1
+fi
+if ! $dtrace $dt_flags -G -64 -s prov.d main.o; then
+	echo "failed to create DOF" >&2
+	exit 1
+fi
+if ! gcc $test_ldflags -o main main.o prov.o; then
+	echo "failed to link final executable" >&2
+	exit 1
+fi
+
+# Prepare the D script, generating the probe descriptions.
+
+rm -f D.d
+for d in $dlist; do
+	for x in $xlist; do
+		echo 'testprov$target:::foo'$x'* { printf("'$d' '$x'* %s\n", probename) }' >> D.d
+	done
+	for y in $ylist; do
+		echo 'testprov$target:::foo*'$y' { printf("'$d' *'$y' %s\n", probename) }' >> D.d
+	done
+done
+
+# Prepare the expected output: probes in the order main() fires them and,
+# for each probe, the matching clauses in the order they appear in D.d.
+
+for x in $xlist; do
+for y in $ylist; do
+for d in $dlist; do
+	echo $d $x'*' foo$x$y >> expect.txt
+	echo $d '*'$y foo$x$y >> expect.txt
+done
+done
+done
+echo >> expect.txt
+
+# Run DTrace.
+
+$dtrace $dt_flags -c ./main -qs D.d >& actual.txt
+if [ $? -ne 0 ]; then
+	echo ERROR: dtrace error
+	echo "==== D script"
+	cat D.d
+	echo "==== output"
+	cat actual.txt
+	exit 1
+fi
+
+# Check results.
+
+if diff -q expect.txt actual.txt; then
+	echo success
+	exit 0
+else
+	echo ERROR: unexpected results
+	echo "==== expect"
+	cat expect.txt
+	echo "==== actual"
+	cat actual.txt
+	echo "==== diff"
+	diff expect.txt actual.txt
+	exit 1
+fi
--
2.43.5
next prev parent reply other threads:[~2025-02-20 4:43 UTC|newest]
Thread overview: 4+ messages / expand[flat|nested] mbox.gz Atom feed top
2025-02-20 4:43 [PATCH 1/2] Clarify how the usdt_prids key is stored on the BPF stack eugene.loh
2025-02-20 4:43 ` eugene.loh [this message]
[not found] ` <Z9rXYvNQxZqODC3o@kvh-deb-bpf.us.oracle.com>
2025-03-19 15:18 ` [DTrace-devel] " Kris Van Hees
2025-03-19 16:30 ` Eugene Loh
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20250220044350.14953-2-eugene.loh@oracle.com \
--to=eugene.loh@oracle.com \
--cc=dtrace-devel@oss.oracle.com \
--cc=dtrace@lists.linux.dev \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.