* [PATCH] io: adjust io provider for NFS tracepoint variants
@ 2025-01-07 22:44 Kris Van Hees
2025-01-08 1:47 ` Elena Zannoni
2025-01-08 6:01 ` Eugene Loh
0 siblings, 2 replies; 6+ messages in thread
From: Kris Van Hees @ 2025-01-07 22:44 UTC (permalink / raw)
To: DTrace mailing lists, dtrace-devel
Kernels prior to 5.6.0 pass 3 arguments (derived from the NFS hdr)
to the nfs_initiate_read raw tracepoint, whereas kernels as of 5.6.0
pass just the NFS hdr.
Signed-off-by: Kris Van Hees <kris.van.hees@oracle.com>
---
libdtrace/dt_prov_io.c | 127 +++++++++++++++++++++++++++++++++++------
1 file changed, 108 insertions(+), 19 deletions(-)
diff --git a/libdtrace/dt_prov_io.c b/libdtrace/dt_prov_io.c
index 385dc792..45cb875a 100644
--- a/libdtrace/dt_prov_io.c
+++ b/libdtrace/dt_prov_io.c
@@ -63,17 +63,9 @@ static probe_dep_t probes[] = {
{ "start",
DTRACE_PROBESPEC_NAME, "rawtp:block::block_bio_queue" },
{ "start",
- DTRACE_PROBESPEC_NAME, "rawtp:nfs::nfs_initiate_read",
- DT_VERSION_NUMBER(5, 6, 0), },
+ DTRACE_PROBESPEC_NAME, "rawtp:nfs::nfs_initiate_read" },
{ "start",
- DTRACE_PROBESPEC_NAME, "fbt:nfs:nfs_initiate_read:entry",
- 0, DT_VERSION_NUMBER(5, 5, 19) },
- { "start",
- DTRACE_PROBESPEC_NAME, "rawtp:nfs::nfs_initiate_write",
- DT_VERSION_NUMBER(5, 6, 0), },
- { "start",
- DTRACE_PROBESPEC_NAME, "fbt:nfs:nfs_initiate_write:entry",
- 0, DT_VERSION_NUMBER(5, 5, 19) },
+ DTRACE_PROBESPEC_NAME, "rawtp:nfs::nfs_initiate_write" },
{ NULL, }
};
@@ -155,12 +147,109 @@ static void deref_r3(dt_irlist_t *dlp, uint_t exitlbl, int addend, int width,
emit(dlp, BPF_LOAD(width, reg, BPF_REG_FP, DT_TRAMP_SP_SLOT(0)));
}
+/*
+ * For NFS events, we have to construct a fake struct bio, which we have to
+ * populate from the inode (arg0) and hdr->good_bytes (arg2) arguments the
+ * underlying probe provides.
+ */
+static void io_nfs_args_v1(dt_pcb_t *pcb, dt_irlist_t *dlp, uint_t exitlbl,
+ const char *prb, const char *uprb)
+{
+ int off;
+ size_t siz;
+
+ /*
+ * Determine the various sizes and offsets we want.
+ *
+ * // Access these fields relative to &bio.
+ * struct bio bio = {
+ * .bi_opf = ...,
+ * .bi_iter.bi_size = ..., // struct bvec_iter bi_iter
+ * .bi_iter.bi_sector = ...,
+ * .bi_bdev = 0, // -or- .bi_disk = 0
+ * };
+ *
+ * // Access these fields relative to hdr.
+ * struct nfs_pgio_header *hdr;
+ * ... = hdr->res.count; // struct nfs_pgio_res res
+ */
+
+ /*
+ * Declare the -io-bio variable and store its address in %r6.
+ */
+ dt_cg_tramp_decl_var(pcb, &v_bio);
+ dt_cg_tramp_get_var(pcb, "this->-io-bio", 1, BPF_REG_6);
+
+ /* Fill in bi_opf */
+ off = dt_cg_ctf_offsetof("struct bio", "bi_opf", &siz, 0);
+ siz = bpf_ldst_size(siz, 1);
+ if (strstr(uprb, "read"))
+ emit(dlp, BPF_STORE_IMM(siz, BPF_REG_6, off, REQ_OP_READ));
+ else
+ emit(dlp, BPF_STORE_IMM(siz, BPF_REG_6, off, REQ_OP_WRITE));
+
+ /*
+ * bio.bi_iter.bi_size = hdr->foo.count;
+ *
+ * For the 'start' probe, count is arg2
+ * For the 'done' probe, count is hdr->res.count (hdr in arg1)
+ */
+ if (strcmp(prb, "start") == 0) {
+ emit(dlp, BPF_LOAD(BPF_DW, BPF_REG_0, BPF_REG_7, DMST_ARG(2)));
+ } else {
+ emit(dlp, BPF_LOAD(BPF_DW, BPF_REG_3, BPF_REG_7, DMST_ARG(1)));
+ off = dt_cg_ctf_offsetof("struct nfs_pgio_header", "res", NULL, 0)
+ + dt_cg_ctf_offsetof("struct nfs_pgio_res", "count", &siz, 0);
+ deref_r3(dlp, exitlbl, off, siz, BPF_REG_0);
+ }
+
+ off = dt_cg_ctf_offsetof("struct bio", "bi_iter", NULL, 0)
+ + dt_cg_ctf_offsetof("struct bvec_iter", "bi_size", &siz, 0);
+ siz = bpf_ldst_size(siz, 1);
+ emit(dlp, BPF_STORE(siz, BPF_REG_6, off, BPF_REG_0));
+
+ /*
+ * bio.bi_iter.bi_sector = inode;
+ */
+ if (strcmp(prb, "start") == 0) {
+ /* inode is arg0 */
+ emit(dlp, BPF_LOAD(BPF_DW, BPF_REG_3, BPF_REG_7, DMST_ARG(0)));
+ } else {
+ /* use hdr->inode, hdr is arg1 */
+ emit(dlp, BPF_LOAD(BPF_DW, BPF_REG_3, BPF_REG_7, DMST_ARG(1)));
+
+ off = dt_cg_ctf_offsetof("struct nfs_pgio_header", "inode", &siz, 0);
+ deref_r3(dlp, exitlbl, off, siz, BPF_REG_3);
+ }
+
+ off = dt_cg_ctf_offsetof("struct nfs_inode", "fileid", &siz, 0)
+ - dt_cg_ctf_offsetof("struct nfs_inode", "vfs_inode", NULL, 0);
+ deref_r3(dlp, exitlbl, off, siz, BPF_REG_0);
+
+ off = dt_cg_ctf_offsetof("struct bio", "bi_iter", NULL, 0)
+ + dt_cg_ctf_offsetof("struct bvec_iter", "bi_sector", &siz, 0);
+ siz = bpf_ldst_size(siz, 1);
+ emit(dlp, BPF_STORE(siz, BPF_REG_6, off, BPF_REG_0));
+
+ /*
+ * bio.bi_bdev = 0;
+ */
+ off = dt_cg_ctf_offsetof("struct bio", "bi_bdev", &siz, 1);
+ if (off == -1)
+ off = dt_cg_ctf_offsetof("struct bio", "bi_disk", &siz, 0);
+ siz = bpf_ldst_size(siz, 1);
+ emit(dlp, BPF_STORE_IMM(siz, BPF_REG_6, off, 0));
+
+ /* Store a pointer to the fake bio in arg0. */
+ emit(dlp, BPF_STORE(BPF_DW, BPF_REG_7, DMST_ARG(0), BPF_REG_6));
+}
+
/*
* For NFS events, we have to construct a fake struct bio, which we have to
* populate from the nfs_pgio_header argument the underlying probe provides.
*/
-static void io_nfs_args(dt_pcb_t *pcb, dt_irlist_t *dlp, uint_t exitlbl,
- const char *prb, const char *uprb)
+static void io_nfs_args_v2(dt_pcb_t *pcb, dt_irlist_t *dlp, uint_t exitlbl,
+ const char *prb, const char *uprb)
{
int off;
size_t siz;
@@ -411,6 +500,7 @@ static void io_xfs_args(dt_pcb_t *pcb, dt_irlist_t *dlp, uint_t exitlbl)
*/
static int trampoline(dt_pcb_t *pcb, uint_t exitlbl)
{
+ dtrace_hdl_t *dtp = pcb->pcb_hdl;
dt_irlist_t *dlp = &pcb->pcb_ir;
dt_probe_t *prp = pcb->pcb_probe;
dt_probe_t *uprp = pcb->pcb_parent_probe;
@@ -420,13 +510,12 @@ static int trampoline(dt_pcb_t *pcb, uint_t exitlbl)
* we need to synthesize one.
*/
if (strcmp(uprp->desc->mod, "nfs") == 0) {
- /*
- * If the underlying probe is an FBT probe, we pass function
- * name. Otherwise, pass probe name.
- */
- io_nfs_args(pcb, dlp, exitlbl, prp->desc->prb,
- strcmp(uprp->desc->prb, "entry") == 0
- ? uprp->desc->fun : uprp->desc->prb);
+ if (dtp->dt_kernver < DT_VERSION_NUMBER(5, 6, 0))
+ io_nfs_args_v1(pcb, dlp, exitlbl, prp->desc->prb,
+ uprp->desc->prb);
+ else
+ io_nfs_args_v2(pcb, dlp, exitlbl, prp->desc->prb,
+ uprp->desc->prb);
goto done;
} else if (strcmp(uprp->desc->mod, "xfs") == 0) {
io_xfs_args(pcb, dlp, exitlbl);
--
2.45.2
^ permalink raw reply related [flat|nested] 6+ messages in thread
* Re: [PATCH] io: adjust io provider for NFS tracepoint variants
2025-01-07 22:44 [PATCH] io: adjust io provider for NFS tracepoint variants Kris Van Hees
@ 2025-01-08 1:47 ` Elena Zannoni
2025-01-08 6:01 ` Eugene Loh
1 sibling, 0 replies; 6+ messages in thread
From: Elena Zannoni @ 2025-01-08 1:47 UTC (permalink / raw)
To: Kris Van Hees, DTrace mailing lists, dtrace-devel
Reviewed-by: Elena Zannoni <elena.zannoni@oracle.com>
elena
On 1/7/25 15:44, Kris Van Hees wrote:
> Kernels prior to 5.6.0 pass 3 arguments (derived from the NFS hdr)
> to the nfs_initiate_read raw tracepoint, whereas kernels as of 5.6.0
> pass just the NFS hdr.
>
> Signed-off-by: Kris Van Hees <kris.van.hees@oracle.com>
> ---
> libdtrace/dt_prov_io.c | 127 +++++++++++++++++++++++++++++++++++------
> 1 file changed, 108 insertions(+), 19 deletions(-)
>
> diff --git a/libdtrace/dt_prov_io.c b/libdtrace/dt_prov_io.c
> index 385dc792..45cb875a 100644
> --- a/libdtrace/dt_prov_io.c
> +++ b/libdtrace/dt_prov_io.c
> @@ -63,17 +63,9 @@ static probe_dep_t probes[] = {
> { "start",
> DTRACE_PROBESPEC_NAME, "rawtp:block::block_bio_queue" },
> { "start",
> - DTRACE_PROBESPEC_NAME, "rawtp:nfs::nfs_initiate_read",
> - DT_VERSION_NUMBER(5, 6, 0), },
> + DTRACE_PROBESPEC_NAME, "rawtp:nfs::nfs_initiate_read" },
> { "start",
> - DTRACE_PROBESPEC_NAME, "fbt:nfs:nfs_initiate_read:entry",
> - 0, DT_VERSION_NUMBER(5, 5, 19) },
> - { "start",
> - DTRACE_PROBESPEC_NAME, "rawtp:nfs::nfs_initiate_write",
> - DT_VERSION_NUMBER(5, 6, 0), },
> - { "start",
> - DTRACE_PROBESPEC_NAME, "fbt:nfs:nfs_initiate_write:entry",
> - 0, DT_VERSION_NUMBER(5, 5, 19) },
> + DTRACE_PROBESPEC_NAME, "rawtp:nfs::nfs_initiate_write" },
> { NULL, }
> };
>
> @@ -155,12 +147,109 @@ static void deref_r3(dt_irlist_t *dlp, uint_t exitlbl, int addend, int width,
> emit(dlp, BPF_LOAD(width, reg, BPF_REG_FP, DT_TRAMP_SP_SLOT(0)));
> }
>
> +/*
> + * For NFS events, we have to construct a fake struct bio, which we have to
> + * populate from the inode (arg0) and hdr->good_bytes (arg2) arguments the
> + * underlying probe provides.
> + */
> +static void io_nfs_args_v1(dt_pcb_t *pcb, dt_irlist_t *dlp, uint_t exitlbl,
> + const char *prb, const char *uprb)
> +{
> + int off;
> + size_t siz;
> +
> + /*
> + * Determine the various sizes and offsets we want.
> + *
> + * // Access these fields relative to &bio.
> + * struct bio bio = {
> + * .bi_opf = ...,
> + * .bi_iter.bi_size = ..., // struct bvec_iter bi_iter
> + * .bi_iter.bi_sector = ...,
> + * .bi_bdev = 0, // -or- .bi_disk = 0
> + * };
> + *
> + * // Access these fields relative to hdr.
> + * struct nfs_pgio_header *hdr;
> + * ... = hdr->res.count; // struct nfs_pgio_res res
> + */
> +
> + /*
> + * Declare the -io-bio variable and store its address in %r6.
> + */
> + dt_cg_tramp_decl_var(pcb, &v_bio);
> + dt_cg_tramp_get_var(pcb, "this->-io-bio", 1, BPF_REG_6);
> +
> + /* Fill in bi_opf */
> + off = dt_cg_ctf_offsetof("struct bio", "bi_opf", &siz, 0);
> + siz = bpf_ldst_size(siz, 1);
> + if (strstr(uprb, "read"))
> + emit(dlp, BPF_STORE_IMM(siz, BPF_REG_6, off, REQ_OP_READ));
> + else
> + emit(dlp, BPF_STORE_IMM(siz, BPF_REG_6, off, REQ_OP_WRITE));
> +
> + /*
> + * bio.bi_iter.bi_size = hdr->foo.count;
> + *
> + * For the 'start' probe, count is arg2
> + * For the 'done' probe, count is hdr->res.count (hdr in arg1)
> + */
> + if (strcmp(prb, "start") == 0) {
> + emit(dlp, BPF_LOAD(BPF_DW, BPF_REG_0, BPF_REG_7, DMST_ARG(2)));
> + } else {
> + emit(dlp, BPF_LOAD(BPF_DW, BPF_REG_3, BPF_REG_7, DMST_ARG(1)));
> + off = dt_cg_ctf_offsetof("struct nfs_pgio_header", "res", NULL, 0)
> + + dt_cg_ctf_offsetof("struct nfs_pgio_res", "count", &siz, 0);
> + deref_r3(dlp, exitlbl, off, siz, BPF_REG_0);
> + }
> +
> + off = dt_cg_ctf_offsetof("struct bio", "bi_iter", NULL, 0)
> + + dt_cg_ctf_offsetof("struct bvec_iter", "bi_size", &siz, 0);
> + siz = bpf_ldst_size(siz, 1);
> + emit(dlp, BPF_STORE(siz, BPF_REG_6, off, BPF_REG_0));
> +
> + /*
> + * bio.bi_iter.bi_sector = inode;
> + */
> + if (strcmp(prb, "start") == 0) {
> + /* inode is arg0 */
> + emit(dlp, BPF_LOAD(BPF_DW, BPF_REG_3, BPF_REG_7, DMST_ARG(0)));
> + } else {
> + /* use hdr->inode, hdr is arg1 */
> + emit(dlp, BPF_LOAD(BPF_DW, BPF_REG_3, BPF_REG_7, DMST_ARG(1)));
> +
> + off = dt_cg_ctf_offsetof("struct nfs_pgio_header", "inode", &siz, 0);
> + deref_r3(dlp, exitlbl, off, siz, BPF_REG_3);
> + }
> +
> + off = dt_cg_ctf_offsetof("struct nfs_inode", "fileid", &siz, 0)
> + - dt_cg_ctf_offsetof("struct nfs_inode", "vfs_inode", NULL, 0);
> + deref_r3(dlp, exitlbl, off, siz, BPF_REG_0);
> +
> + off = dt_cg_ctf_offsetof("struct bio", "bi_iter", NULL, 0)
> + + dt_cg_ctf_offsetof("struct bvec_iter", "bi_sector", &siz, 0);
> + siz = bpf_ldst_size(siz, 1);
> + emit(dlp, BPF_STORE(siz, BPF_REG_6, off, BPF_REG_0));
> +
> + /*
> + * bio.bi_bdev = 0;
> + */
> + off = dt_cg_ctf_offsetof("struct bio", "bi_bdev", &siz, 1);
> + if (off == -1)
> + off = dt_cg_ctf_offsetof("struct bio", "bi_disk", &siz, 0);
> + siz = bpf_ldst_size(siz, 1);
> + emit(dlp, BPF_STORE_IMM(siz, BPF_REG_6, off, 0));
> +
> + /* Store a pointer to the fake bio in arg0. */
> + emit(dlp, BPF_STORE(BPF_DW, BPF_REG_7, DMST_ARG(0), BPF_REG_6));
> +}
> +
> /*
> * For NFS events, we have to construct a fake struct bio, which we have to
> * populate from the nfs_pgio_header argument the underlying probe provides.
> */
> -static void io_nfs_args(dt_pcb_t *pcb, dt_irlist_t *dlp, uint_t exitlbl,
> - const char *prb, const char *uprb)
> +static void io_nfs_args_v2(dt_pcb_t *pcb, dt_irlist_t *dlp, uint_t exitlbl,
> + const char *prb, const char *uprb)
> {
> int off;
> size_t siz;
> @@ -411,6 +500,7 @@ static void io_xfs_args(dt_pcb_t *pcb, dt_irlist_t *dlp, uint_t exitlbl)
> */
> static int trampoline(dt_pcb_t *pcb, uint_t exitlbl)
> {
> + dtrace_hdl_t *dtp = pcb->pcb_hdl;
> dt_irlist_t *dlp = &pcb->pcb_ir;
> dt_probe_t *prp = pcb->pcb_probe;
> dt_probe_t *uprp = pcb->pcb_parent_probe;
> @@ -420,13 +510,12 @@ static int trampoline(dt_pcb_t *pcb, uint_t exitlbl)
> * we need to synthesize one.
> */
> if (strcmp(uprp->desc->mod, "nfs") == 0) {
> - /*
> - * If the underlying probe is an FBT probe, we pass function
> - * name. Otherwise, pass probe name.
> - */
> - io_nfs_args(pcb, dlp, exitlbl, prp->desc->prb,
> - strcmp(uprp->desc->prb, "entry") == 0
> - ? uprp->desc->fun : uprp->desc->prb);
> + if (dtp->dt_kernver < DT_VERSION_NUMBER(5, 6, 0))
> + io_nfs_args_v1(pcb, dlp, exitlbl, prp->desc->prb,
> + uprp->desc->prb);
> + else
> + io_nfs_args_v2(pcb, dlp, exitlbl, prp->desc->prb,
> + uprp->desc->prb);
> goto done;
> } else if (strcmp(uprp->desc->mod, "xfs") == 0) {
> io_xfs_args(pcb, dlp, exitlbl);
^ permalink raw reply [flat|nested] 6+ messages in thread
* Re: [PATCH] io: adjust io provider for NFS tracepoint variants
2025-01-07 22:44 [PATCH] io: adjust io provider for NFS tracepoint variants Kris Van Hees
2025-01-08 1:47 ` Elena Zannoni
@ 2025-01-08 6:01 ` Eugene Loh
2025-01-08 11:34 ` Nick Alcock
2025-01-09 18:15 ` Kris Van Hees
1 sibling, 2 replies; 6+ messages in thread
From: Eugene Loh @ 2025-01-08 6:01 UTC (permalink / raw)
To: Kris Van Hees, DTrace mailing lists, dtrace-devel
You already have a R-b, it seems the code is right (woo hoo! cool), and
we're trying to get a release out the door. But here are my "late to
the party" comments. Feel free to ignore.
The names v1 and v2 strike me as funny; they just make up new numbers.
Not a big deal, but ideally the names might more closely reflect the
actual version changes we're tracking.
Breaking the function out into a new "v1" version is some unneeded
copy-and-paste code bloat, I think. The function is kind of large and
the only difference between v1 and v2 is I guess in the two, relatively
short, "start" sections. So, keeping this stuff as one function -- with
extra logic in the two "start" sections -- might be more compact and
clearer than having two rather similar functions.
On 1/7/25 17:44, Kris Van Hees wrote:
> Kernels prior to 5.6.0 pass 3 arguments (derived from the NFS hdr)
> to the nfs_initiate_read raw tracepoint, whereas kernels as of 5.6.0
> pass just the NFS hdr.
Is the same true of write? If so, then maybe say so and point out that
what we're really doing is changing the handling of nfs "start" while
leaving nfs "done" alone... that would correspond more closely to what
is happening in the code, which talks of start and done.
> Signed-off-by: Kris Van Hees <kris.van.hees@oracle.com>
> ---
> libdtrace/dt_prov_io.c | 127 +++++++++++++++++++++++++++++++++++------
> 1 file changed, 108 insertions(+), 19 deletions(-)
>
> diff --git a/libdtrace/dt_prov_io.c b/libdtrace/dt_prov_io.c
> @@ -155,12 +147,109 @@ static void deref_r3(dt_irlist_t *dlp, uint_t exitlbl, int addend, int width,
> +/*
> + * For NFS events, we have to construct a fake struct bio, which we have to
> + * populate from the inode (arg0) and hdr->good_bytes (arg2) arguments the
> + * underlying probe provides.
> + */
> +static void io_nfs_args_v1(dt_pcb_t *pcb, dt_irlist_t *dlp, uint_t exitlbl,
> + const char *prb, const char *uprb)
> +{
> + int off;
> + size_t siz;
> +
> + /*
> + * Determine the various sizes and offsets we want.
> + *
> + * // Access these fields relative to &bio.
> + * struct bio bio = {
> + * .bi_opf = ...,
> + * .bi_iter.bi_size = ..., // struct bvec_iter bi_iter
> + * .bi_iter.bi_sector = ...,
> + * .bi_bdev = 0, // -or- .bi_disk = 0
> + * };
> + *
> + * // Access these fields relative to hdr.
> + * struct nfs_pgio_header *hdr;
> + * ... = hdr->res.count; // struct nfs_pgio_res res
> + */
> +
> + /*
> + * Declare the -io-bio variable and store its address in %r6.
> + */
> + dt_cg_tramp_decl_var(pcb, &v_bio);
> + dt_cg_tramp_get_var(pcb, "this->-io-bio", 1, BPF_REG_6);
> +
> + /* Fill in bi_opf */
> + off = dt_cg_ctf_offsetof("struct bio", "bi_opf", &siz, 0);
> + siz = bpf_ldst_size(siz, 1);
> + if (strstr(uprb, "read"))
> + emit(dlp, BPF_STORE_IMM(siz, BPF_REG_6, off, REQ_OP_READ));
> + else
> + emit(dlp, BPF_STORE_IMM(siz, BPF_REG_6, off, REQ_OP_WRITE));
> +
> + /*
> + * bio.bi_iter.bi_size = hdr->foo.count;
If "v1" is broken out, then this hdr->foo.count no longer makes sense.
Plus, the comment is more or less superseded by the following two
comment lines.
Or, mimic how the comments are handled below for inode.
> + *
> + * For the 'start' probe, count is arg2
> + * For the 'done' probe, count is hdr->res.count (hdr in arg1)
> + */
> + if (strcmp(prb, "start") == 0) {
> + emit(dlp, BPF_LOAD(BPF_DW, BPF_REG_0, BPF_REG_7, DMST_ARG(2)));
> + } else {
> + emit(dlp, BPF_LOAD(BPF_DW, BPF_REG_3, BPF_REG_7, DMST_ARG(1)));
> + off = dt_cg_ctf_offsetof("struct nfs_pgio_header", "res", NULL, 0)
> + + dt_cg_ctf_offsetof("struct nfs_pgio_res", "count", &siz, 0);
> + deref_r3(dlp, exitlbl, off, siz, BPF_REG_0);
> + }
> +
> + off = dt_cg_ctf_offsetof("struct bio", "bi_iter", NULL, 0)
> + + dt_cg_ctf_offsetof("struct bvec_iter", "bi_size", &siz, 0);
> + siz = bpf_ldst_size(siz, 1);
> + emit(dlp, BPF_STORE(siz, BPF_REG_6, off, BPF_REG_0));
> +
> + /*
> + * bio.bi_iter.bi_sector = inode;
> + */
> + if (strcmp(prb, "start") == 0) {
> + /* inode is arg0 */
> + emit(dlp, BPF_LOAD(BPF_DW, BPF_REG_3, BPF_REG_7, DMST_ARG(0)));
> + } else {
> + /* use hdr->inode, hdr is arg1 */
> + emit(dlp, BPF_LOAD(BPF_DW, BPF_REG_3, BPF_REG_7, DMST_ARG(1)));
> +
> + off = dt_cg_ctf_offsetof("struct nfs_pgio_header", "inode", &siz, 0);
> + deref_r3(dlp, exitlbl, off, siz, BPF_REG_3);
> + }
> +
> + off = dt_cg_ctf_offsetof("struct nfs_inode", "fileid", &siz, 0)
> + - dt_cg_ctf_offsetof("struct nfs_inode", "vfs_inode", NULL, 0);
> + deref_r3(dlp, exitlbl, off, siz, BPF_REG_0);
> +
> + off = dt_cg_ctf_offsetof("struct bio", "bi_iter", NULL, 0)
> + + dt_cg_ctf_offsetof("struct bvec_iter", "bi_sector", &siz, 0);
> + siz = bpf_ldst_size(siz, 1);
> + emit(dlp, BPF_STORE(siz, BPF_REG_6, off, BPF_REG_0));
> +
> + /*
> + * bio.bi_bdev = 0;
> + */
> + off = dt_cg_ctf_offsetof("struct bio", "bi_bdev", &siz, 1);
> + if (off == -1)
> + off = dt_cg_ctf_offsetof("struct bio", "bi_disk", &siz, 0);
> + siz = bpf_ldst_size(siz, 1);
> + emit(dlp, BPF_STORE_IMM(siz, BPF_REG_6, off, 0));
> +
> + /* Store a pointer to the fake bio in arg0. */
> + emit(dlp, BPF_STORE(BPF_DW, BPF_REG_7, DMST_ARG(0), BPF_REG_6));
> +}
^ permalink raw reply [flat|nested] 6+ messages in thread
* Re: [PATCH] io: adjust io provider for NFS tracepoint variants
2025-01-08 6:01 ` Eugene Loh
@ 2025-01-08 11:34 ` Nick Alcock
2025-01-09 18:18 ` Kris Van Hees
2025-01-09 18:15 ` Kris Van Hees
1 sibling, 1 reply; 6+ messages in thread
From: Nick Alcock @ 2025-01-08 11:34 UTC (permalink / raw)
To: Eugene Loh; +Cc: Kris Van Hees, DTrace mailing lists, dtrace-devel
On 8 Jan 2025, Eugene Loh uttered the following:
> You already have a R-b, it seems the code is right (woo hoo! cool), and we're trying to get a release out the door. But here are my
> "late to the party" comments. Feel free to ignore.
Ditto :)
> The names v1 and v2 strike me as funny; they just make up new numbers. Not a big deal, but ideally the names might more closely
> reflect the actual version changes we're tracking.
Agreed.
> Breaking the function out into a new "v1" version is some unneeded copy-and-paste code bloat, I think. The function is kind of
> large and the only difference between v1 and v2 is I guess in the two, relatively short, "start" sections. So, keeping this stuff
> as one function -- with extra logic in the two "start" sections -- might be more compact and clearer than having two rather similar
> functions.
Agreed, but more generally it seems we are growing increasingly many
fairly similar fake bio functions in the io provider: with this I think
we're up to three. Maybe it's time to refactor some of this into some
sort of shared fake bio machinery? Certainly if we grow more I think we
should (and given the number of network filesystems, that seems quite
likely to happen).
> On 1/7/25 17:44, Kris Van Hees wrote:
>> Kernels prior to 5.6.0 pass 3 arguments (derived from the NFS hdr)
>> to the nfs_initiate_read raw tracepoint, whereas kernels as of 5.6.0
>> pass just the NFS hdr.
>
> Is the same true of write? If so, then maybe say so and point out that what we're really doing is changing the handling of nfs
> "start" while leaving nfs "done" alone... that would correspond more closely to what is happening in the code, which talks of start
> and done.
Good thinkiing. That would be commit
5bb2a7cb9fe58d2b1efedd6058d442c7871c45ec ("NFS: Clean up generic
writeback tracepoints"), also by Trond, also first landing in 5.6-rc1.
Same sort of thing.
^ permalink raw reply [flat|nested] 6+ messages in thread
* Re: [PATCH] io: adjust io provider for NFS tracepoint variants
2025-01-08 6:01 ` Eugene Loh
2025-01-08 11:34 ` Nick Alcock
@ 2025-01-09 18:15 ` Kris Van Hees
1 sibling, 0 replies; 6+ messages in thread
From: Kris Van Hees @ 2025-01-09 18:15 UTC (permalink / raw)
To: Eugene Loh; +Cc: Kris Van Hees, DTrace mailing lists, dtrace-devel
On Wed, Jan 08, 2025 at 01:01:28AM -0500, Eugene Loh wrote:
> You already have a R-b, it seems the code is right (woo hoo! cool), and
> we're trying to get a release out the door. But here are my "late to the
> party" comments. Feel free to ignore.
>
> The names v1 and v2 strike me as funny; they just make up new numbers. Not
> a big deal, but ideally the names might more closely reflect the actual
> version changes we're tracking.
The point of using v1 and v2 is really to reflect that there are two variants
of the tracepoints. I deliberately did not associate them (through their name)
with specific kernel versions because that fails very quickly when people start
doing things like backports. With the current naming, the implementation is
reflecting the 1st variant of the tracepoints vs the 2nd variant. The logic
to use one vs the other is handled where that choice needs to be made.
This naming also allows for a future choice to perhaps differentiate based on
e.g. prototype of the tracepoint rather than a kernel version (again, that
would handle backported patches much more graceafully).
> Breaking the function out into a new "v1" version is some unneeded
> copy-and-paste code bloat, I think. The function is kind of large and the
> only difference between v1 and v2 is I guess in the two, relatively short,
> "start" sections. So, keeping this stuff as one function -- with extra
> logic in the two "start" sections -- might be more compact and clearer than
> having two rather similar functions.
The main point of the bloat is to make it very clear what we are doing here
and why. There certainly is opportunity for refactoring the code, but this was
not the time for it.
> On 1/7/25 17:44, Kris Van Hees wrote:
> > Kernels prior to 5.6.0 pass 3 arguments (derived from the NFS hdr)
> > to the nfs_initiate_read raw tracepoint, whereas kernels as of 5.6.0
> > pass just the NFS hdr.
>
> Is the same true of write? If so, then maybe say so and point out that what
> we're really doing is changing the handling of nfs "start" while leaving nfs
> "done" alone... that would correspond more closely to what is happening in
> the code, which talks of start and done.
Yes, and the code handles that also. So yes, should have mentioned write also.
Mea culpa.
> > Signed-off-by: Kris Van Hees <kris.van.hees@oracle.com>
> > ---
> > libdtrace/dt_prov_io.c | 127 +++++++++++++++++++++++++++++++++++------
> > 1 file changed, 108 insertions(+), 19 deletions(-)
> >
> > diff --git a/libdtrace/dt_prov_io.c b/libdtrace/dt_prov_io.c
> > @@ -155,12 +147,109 @@ static void deref_r3(dt_irlist_t *dlp, uint_t exitlbl, int addend, int width,
> > +/*
> > + * For NFS events, we have to construct a fake struct bio, which we have to
> > + * populate from the inode (arg0) and hdr->good_bytes (arg2) arguments the
> > + * underlying probe provides.
> > + */
> > +static void io_nfs_args_v1(dt_pcb_t *pcb, dt_irlist_t *dlp, uint_t exitlbl,
> > + const char *prb, const char *uprb)
> > +{
> > + int off;
> > + size_t siz;
> > +
> > + /*
> > + * Determine the various sizes and offsets we want.
> > + *
> > + * // Access these fields relative to &bio.
> > + * struct bio bio = {
> > + * .bi_opf = ...,
> > + * .bi_iter.bi_size = ..., // struct bvec_iter bi_iter
> > + * .bi_iter.bi_sector = ...,
> > + * .bi_bdev = 0, // -or- .bi_disk = 0
> > + * };
> > + *
> > + * // Access these fields relative to hdr.
> > + * struct nfs_pgio_header *hdr;
> > + * ... = hdr->res.count; // struct nfs_pgio_res res
> > + */
> > +
> > + /*
> > + * Declare the -io-bio variable and store its address in %r6.
> > + */
> > + dt_cg_tramp_decl_var(pcb, &v_bio);
> > + dt_cg_tramp_get_var(pcb, "this->-io-bio", 1, BPF_REG_6);
> > +
> > + /* Fill in bi_opf */
> > + off = dt_cg_ctf_offsetof("struct bio", "bi_opf", &siz, 0);
> > + siz = bpf_ldst_size(siz, 1);
> > + if (strstr(uprb, "read"))
> > + emit(dlp, BPF_STORE_IMM(siz, BPF_REG_6, off, REQ_OP_READ));
> > + else
> > + emit(dlp, BPF_STORE_IMM(siz, BPF_REG_6, off, REQ_OP_WRITE));
> > +
> > + /*
> > + * bio.bi_iter.bi_size = hdr->foo.count;
>
> If "v1" is broken out, then this hdr->foo.count no longer makes sense.
> Plus, the comment is more or less superseded by the following two comment
> lines.
>
> Or, mimic how the comments are handled below for inode.
Yes, clean up will be needed.
> > + *
> > + * For the 'start' probe, count is arg2
> > + * For the 'done' probe, count is hdr->res.count (hdr in arg1)
> > + */
> > + if (strcmp(prb, "start") == 0) {
> > + emit(dlp, BPF_LOAD(BPF_DW, BPF_REG_0, BPF_REG_7, DMST_ARG(2)));
> > + } else {
> > + emit(dlp, BPF_LOAD(BPF_DW, BPF_REG_3, BPF_REG_7, DMST_ARG(1)));
> > + off = dt_cg_ctf_offsetof("struct nfs_pgio_header", "res", NULL, 0)
> > + + dt_cg_ctf_offsetof("struct nfs_pgio_res", "count", &siz, 0);
> > + deref_r3(dlp, exitlbl, off, siz, BPF_REG_0);
> > + }
> > +
> > + off = dt_cg_ctf_offsetof("struct bio", "bi_iter", NULL, 0)
> > + + dt_cg_ctf_offsetof("struct bvec_iter", "bi_size", &siz, 0);
> > + siz = bpf_ldst_size(siz, 1);
> > + emit(dlp, BPF_STORE(siz, BPF_REG_6, off, BPF_REG_0));
> > +
> > + /*
> > + * bio.bi_iter.bi_sector = inode;
> > + */
> > + if (strcmp(prb, "start") == 0) {
> > + /* inode is arg0 */
> > + emit(dlp, BPF_LOAD(BPF_DW, BPF_REG_3, BPF_REG_7, DMST_ARG(0)));
> > + } else {
> > + /* use hdr->inode, hdr is arg1 */
> > + emit(dlp, BPF_LOAD(BPF_DW, BPF_REG_3, BPF_REG_7, DMST_ARG(1)));
> > +
> > + off = dt_cg_ctf_offsetof("struct nfs_pgio_header", "inode", &siz, 0);
> > + deref_r3(dlp, exitlbl, off, siz, BPF_REG_3);
> > + }
> > +
> > + off = dt_cg_ctf_offsetof("struct nfs_inode", "fileid", &siz, 0)
> > + - dt_cg_ctf_offsetof("struct nfs_inode", "vfs_inode", NULL, 0);
> > + deref_r3(dlp, exitlbl, off, siz, BPF_REG_0);
> > +
> > + off = dt_cg_ctf_offsetof("struct bio", "bi_iter", NULL, 0)
> > + + dt_cg_ctf_offsetof("struct bvec_iter", "bi_sector", &siz, 0);
> > + siz = bpf_ldst_size(siz, 1);
> > + emit(dlp, BPF_STORE(siz, BPF_REG_6, off, BPF_REG_0));
> > +
> > + /*
> > + * bio.bi_bdev = 0;
> > + */
> > + off = dt_cg_ctf_offsetof("struct bio", "bi_bdev", &siz, 1);
> > + if (off == -1)
> > + off = dt_cg_ctf_offsetof("struct bio", "bi_disk", &siz, 0);
> > + siz = bpf_ldst_size(siz, 1);
> > + emit(dlp, BPF_STORE_IMM(siz, BPF_REG_6, off, 0));
> > +
> > + /* Store a pointer to the fake bio in arg0. */
> > + emit(dlp, BPF_STORE(BPF_DW, BPF_REG_7, DMST_ARG(0), BPF_REG_6));
> > +}
^ permalink raw reply [flat|nested] 6+ messages in thread
* Re: [PATCH] io: adjust io provider for NFS tracepoint variants
2025-01-08 11:34 ` Nick Alcock
@ 2025-01-09 18:18 ` Kris Van Hees
0 siblings, 0 replies; 6+ messages in thread
From: Kris Van Hees @ 2025-01-09 18:18 UTC (permalink / raw)
To: Nick Alcock; +Cc: Eugene Loh, Kris Van Hees, DTrace mailing lists, dtrace-devel
On Wed, Jan 08, 2025 at 11:34:36AM +0000, Nick Alcock wrote:
> On 8 Jan 2025, Eugene Loh uttered the following:
>
> > You already have a R-b, it seems the code is right (woo hoo! cool), and we're trying to get a release out the door. But here are my
> > "late to the party" comments. Feel free to ignore.
>
> Ditto :)
>
> > The names v1 and v2 strike me as funny; they just make up new numbers. Not a big deal, but ideally the names might more closely
> > reflect the actual version changes we're tracking.
>
> Agreed.
See my rationale in my reply to Eugene.
> > Breaking the function out into a new "v1" version is some unneeded copy-and-paste code bloat, I think. The function is kind of
> > large and the only difference between v1 and v2 is I guess in the two, relatively short, "start" sections. So, keeping this stuff
> > as one function -- with extra logic in the two "start" sections -- might be more compact and clearer than having two rather similar
> > functions.
>
> Agreed, but more generally it seems we are growing increasingly many
> fairly similar fake bio functions in the io provider: with this I think
> we're up to three. Maybe it's time to refactor some of this into some
> sort of shared fake bio machinery? Certainly if we grow more I think we
> should (and given the number of network filesystems, that seems quite
> likely to happen).
The trouble is that they are also sufficiently different, and with how we need
to generate code to trace down pointer chains etc in various ways, consolidated
code can get more convoluted and hard to read. It will remain a balance
between sharing code with possible lack of clarity and more complexity, or
code duplication with more clarity and reduces complexity. Sometimes it gets
hard to judge which is better, but this certainly can be looked at and we can
go from there.
> > On 1/7/25 17:44, Kris Van Hees wrote:
> >> Kernels prior to 5.6.0 pass 3 arguments (derived from the NFS hdr)
> >> to the nfs_initiate_read raw tracepoint, whereas kernels as of 5.6.0
> >> pass just the NFS hdr.
> >
> > Is the same true of write? If so, then maybe say so and point out that what we're really doing is changing the handling of nfs
> > "start" while leaving nfs "done" alone... that would correspond more closely to what is happening in the code, which talks of start
> > and done.
>
> Good thinkiing. That would be commit
> 5bb2a7cb9fe58d2b1efedd6058d442c7871c45ec ("NFS: Clean up generic
> writeback tracepoints"), also by Trond, also first landing in 5.6-rc1.
> Same sort of thing.
^ permalink raw reply [flat|nested] 6+ messages in thread
end of thread, other threads:[~2025-01-09 18:18 UTC | newest]
Thread overview: 6+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2025-01-07 22:44 [PATCH] io: adjust io provider for NFS tracepoint variants Kris Van Hees
2025-01-08 1:47 ` Elena Zannoni
2025-01-08 6:01 ` Eugene Loh
2025-01-08 11:34 ` Nick Alcock
2025-01-09 18:18 ` Kris Van Hees
2025-01-09 18:15 ` Kris Van Hees
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox