From: Kris Van Hees <kris.van.hees@oracle.com>
To: Nick Alcock <nick.alcock@oracle.com>
Cc: dtrace@lists.linux.dev, dtrace-devel@oss.oracle.com,
eugene.loh@oracle.com
Subject: Re: [PATCH 3/5] libproc: debugging improvements
Date: Fri, 6 Dec 2024 23:38:34 -0500 [thread overview]
Message-ID: <Z1PRShEInAMO3WPy@oracle.com> (raw)
In-Reply-To: <20241203113610.75104-4-nick.alcock@oracle.com>
On Tue, Dec 03, 2024 at 11:36:08AM +0000, Nick Alcock wrote:
> Attempting to track down the intermittent failures in
> test/unittest/proc/tst.multitrace.sh is rendered difficult by the fact that
> multiple dtraces are running at once, tracing multiple processes, but the
> debugging messages emitted by libproc do not provide either the TID of the
> tracing thread or the PID of the process being traced in too many cases.
>
> Worse yet, if you do turn debugging on, both dtraces emit debugging output
> simultaneously to the same stderr stream. The interleaving is bad enough,
> but very often this causes lines to be emitted to stderr that do not start
> with the standard libproc DEBUG time-since-epoch: string, because it got
> interspersed into the previous line. runtest then partitions that partial
> line off into a separate part of the log, rendering everything entirely
> incomprehensible.
>
> So emit more PID-related info, including the TID of the process-control
> thread; and arrange for tst.multitrace.sh to capture DTrace debugging output
> itself and dump it as two separated pieces: if a dtrace exits nonzero, note
> which one exited as well as dumping its debug output.
>
> Also, when this-should-never-happen conditions like a Pwait() returning
> -ECHILD happen (usually indicating that we were not the victim's tracer
> after all) by dumping the entire /proc/$pid/status of that process to
> the debug stream, so we can tell what the tracer *was* and whether the
> process was even stopped. (No impact at all if this condition never
> happens, which it never should, of course, or if debugging is off.)
>
> Signed-off-by: Nick Alcock <nick.alcock@oracle.com>
Reviewed-by: Kris Van Hees <kris.van.hees@oracle.com>
> ---
> libdtrace/dt_proc.c | 6 ++-
> libproc/Pcontrol.c | 57 +++++++++++++++++++++++++---
> libproc/rtld_db.c | 2 +-
> test/unittest/usdt/tst.multitrace.sh | 17 ++++++++-
> 4 files changed, 73 insertions(+), 9 deletions(-)
>
> diff --git a/libdtrace/dt_proc.c b/libdtrace/dt_proc.c
> index 41b148e807f84..6ad57cc5f12ef 100644
> --- a/libdtrace/dt_proc.c
> +++ b/libdtrace/dt_proc.c
> @@ -870,6 +870,9 @@ dt_proc_control(void *arg)
> int err;
> jmp_buf exec_jmp;
>
> + dt_dprintf("%i: process control thread %i starting.\n", dpr->dpr_pid,
> + gettid());
> +
> /*
> * Set up global libproc hooks that must be active before any processes
> * are grabbed or created.
> @@ -1421,7 +1424,8 @@ dt_proc_control_cleanup(void *arg)
> * out by ptrace() wrappers above us in the call stack, since the whole
> * thread is going away.
> */
> - dt_dprintf("%i: process control thread going away.\n", dpr->dpr_pid);
> + dt_dprintf("%i: process control thread %i going away.\n", dpr->dpr_pid,
> + gettid());
> if(dpr->dpr_lock_count_ctrl == 0 ||
> !pthread_equal(dpr->dpr_lock_holder, pthread_self()))
> dt_proc_lock(dpr);
> diff --git a/libproc/Pcontrol.c b/libproc/Pcontrol.c
> index c96a410b70b3f..a32a362eaa1fb 100644
> --- a/libproc/Pcontrol.c
> +++ b/libproc/Pcontrol.c
> @@ -66,6 +66,7 @@ static int add_bkpt(struct ps_prochandle *P, uintptr_t addr,
> void *data);
> static void delete_bkpt_handler(struct bkpt *bkpt);
> static jmp_buf **single_thread_unwinder_pad(struct ps_prochandle *unused);
> +static void Pdump_proc_status(pid_t pid);
>
> static ptrace_lock_hook_fun *ptrace_lock_hook;
> static waitpid_lock_hook_fun *waitpid_lock_hook;
> @@ -332,10 +333,12 @@ Pgrab(pid_t pid, int noninvasiveness, int already_ptraced, void *wrap_arg,
>
> if (*perr < 0) {
> if (noninvasiveness < 1) {
> + _dprintf("%i: Pgrab(): not grabbed.\n", P->pid);
> Pfree_internal(P);
> return NULL;
> }
> close(P->memfd);
> + _dprintf("%i: Pgrab(): grabbed noninvasively.\n", P->pid);
> noninvasiveness = 2;
> }
> } else {
> @@ -719,7 +722,9 @@ Pwait_internal(struct ps_prochandle *P, boolean_t block, int *return_early)
> return 0;
>
> if (errno == ECHILD) {
> + _dprintf("%i: Pwait: got ECHILD from waitpid(), state %i, trace count %i, halted %i\n", P->pid, P->state, P->ptrace_count, P->ptrace_halted);
> P->state = PS_DEAD;
> + Pdump_proc_status(P->pid);
> return 0;
> }
>
> @@ -1262,7 +1267,7 @@ Ppush_state(struct ps_prochandle *P, int state)
>
> s->state = state;
> dt_list_prepend(&P->ptrace_states, s);
> - _dprintf("Ppush_state(): ptrace_count %i, state %i\n", P->ptrace_count, s->state);
> + _dprintf("%i: Ppush_state(): ptrace_count %i, state %i\n", P->pid, P->ptrace_count, s->state);
>
> return s;
> }
> @@ -1278,7 +1283,7 @@ Ppop_state(struct ps_prochandle *P)
>
> s = dt_list_next(&P->ptrace_states);
> dt_list_delete(&P->ptrace_states, s);
> - _dprintf("Ppop_state(): ptrace_count %i, state %i\n", P->ptrace_count+1, s->state);
> + _dprintf("%i: Ppop_state(): ptrace_count %i, state %i\n", P->pid, P->ptrace_count+1, s->state);
> state = s->state;
> free(s);
> return state;
> @@ -1422,6 +1427,9 @@ err_nostate:
> if (P->ptrace_count == 0 && ptrace_lock_hook)
> ptrace_lock_hook(P, P->wrap_arg, 0);
>
> + _dprintf("Ptrace(): error return (possibly other tracer), trace count now %i: %s\n",
> + P->ptrace_count, strerror(errno));
> +
> if (err != -ECHILD)
> return err;
> else
> @@ -1511,8 +1519,11 @@ Puntrace(struct ps_prochandle *P, int leave_stopped)
> P->state = PS_RUN;
> P->ptraced = FALSE;
> if ((wrapped_ptrace(P, PTRACE_DETACH, P->pid, 0, 0) < 0) &&
> - (errno == ESRCH))
> + (errno == ESRCH)) {
> + _dprintf("%i: Punbkpt(): -ESRCH, process is dead.\n",
> + P->pid);
> P->state = PS_DEAD;
> + }
> P->ptrace_halted = FALSE;
> P->info_valid = 0;
> }
> @@ -1812,7 +1823,7 @@ Punbkpt(struct ps_prochandle *P, uintptr_t addr)
> if (Preset_bkpt_ip(P, P->tracing_bkpt) < 0)
> switch (errno) {
> case ESRCH:
> - _dprintf("%i: -ESRCH, process is dead.\n",
> + _dprintf("%i: Punbkpt(): -ESRCH, process is dead.\n",
> P->pid);
> P->state = PS_DEAD;
> return;
> @@ -1870,7 +1881,8 @@ Punbkpt_child_poke(struct ps_prochandle *P, pid_t pid, bkpt_t *bkpt)
> bkpt->bkpt_addr, bkpt->orig_insn) < 0)
> switch (errno) {
> case ESRCH:
> - _dprintf("%i: -ESRCH, process is dead.\n", child_pid);
> + _dprintf("%i: Punbkpt_child_poke(): -ESRCH, process is dead.\n",
> + child_pid);
> if (!pid)
> P->state = PS_DEAD;
> return;
> @@ -2181,8 +2193,11 @@ Pbkpt_continue(struct ps_prochandle *P)
> if (wrapped_ptrace(P, PTRACE_CONT, P->pid, 0, 0) < 0) {
> int err = errno;
> if (err == ESRCH) {
> - if ((kill(P->pid, 0) < 0) && errno == ESRCH)
> + if ((kill(P->pid, 0) < 0) && errno == ESRCH) {
> + _dprintf("%i: Pbpkt_continue(): Got ESRCH, process is dead.\n",
> + P->pid);
> P->state = PS_DEAD;
> + }
> }
> /*
> * Since we must have an outstanding Ptrace() anyway,
> @@ -2760,6 +2775,36 @@ Phastty(pid_t pid)
> return tty != 0;
> }
>
> +/*
> + * Dump /proc/$pid/status into the debug log.
> + */
> +static void
> +Pdump_proc_status(pid_t pid)
> +{
> + char status[PATH_MAX];
> + FILE *fp;
> + char *line = NULL;
> + size_t len;
> +
> + snprintf(status, sizeof(status), "/proc/%i/status", pid);
> +
> + if ((fp = fopen(status, "r")) == NULL) {
> + _dprintf("Process is dead.\n");
> + return;
> + }
> +
> + while (getline(&line, &len, fp) >= 0) {
> + if (strlen(line) > 0) {
> + if (line[strlen(line) - 1] == '\n')
> + line[strlen(line)-1] = '\0';
> + _dprintf("%li: %s\n", (long)pid, line);
> + }
> + }
> + free(line);
> + fclose(fp);
> + return;
> +}
> +
> /*
> * Get a specific field out of /proc/$pid/status and return the portion after
> * the colon in a new dynamically allocated string, or NULL if no field matches.
> diff --git a/libproc/rtld_db.c b/libproc/rtld_db.c
> index b6d33ce38425a..9eb63138326dc 100644
> --- a/libproc/rtld_db.c
> +++ b/libproc/rtld_db.c
> @@ -1771,7 +1771,7 @@ rd_loadobj_iter(rd_agent_t *rd, rl_iter_f *fun, void *state)
> if (rd->P->state == PS_DEAD) {
> *jmp_pad = old_exec_jmp;
>
> - _dprintf("%i: link map iteration failed: process is dead..\n",
> + _dprintf("%i: link map iteration failed: process is dead.\n",
> rd->P->pid);
> return RD_ERR;
> }
> diff --git a/test/unittest/usdt/tst.multitrace.sh b/test/unittest/usdt/tst.multitrace.sh
> index 4e53dbb992508..262c782821ef0 100755
> --- a/test/unittest/usdt/tst.multitrace.sh
> +++ b/test/unittest/usdt/tst.multitrace.sh
> @@ -73,7 +73,7 @@ if [ $? -ne 0 ]; then
> fi
>
> script() {
> - exec $dtrace $dt_flags -qws /dev/stdin $1 $2 $3 <<'EOF'
> + exec $dtrace $dt_flags -qws /dev/stdin $1 $2 $3 2> debug.$3 <<'EOF'
> int fired[pid_t];
> int exited[pid_t];
>
> @@ -142,13 +142,28 @@ DONE=$!
> script $ONE $TWO 2 &
> DTWO=$!
>
> +dump_debug() {
> + if [[ -n $DTRACE_DEBUG ]]; then
> + echo "runtest DEBUG $(date +%s): Debug output of first dtrace so far, PID $DONE" >&2
> + cat debug.1 >&2
> +
> + echo "runtest DEBUG $(date +%s): Debug output of second dtrace so far, PID $DTWO" >&2
> + cat debug.2 >&2
> + fi
> +}
> +
> if ! wait $DONE; then
> + dump_debug
> + echo "first dtrace exited nonzero at $(date +%s)" >&2
> exit 1
> fi
>
> if ! wait $DTWO; then
> + dump_debug
> + echo "second dtrace exited nonzero at $(date +%s)" >&2
> exit 1
> fi
> +dump_debug
>
> wait $ONE $TWO
>
> --
> 2.47.1.279.g84c5f4e78e
>
next prev parent reply other threads:[~2024-12-07 4:38 UTC|newest]
Thread overview: 16+ messages / expand[flat|nested] mbox.gz Atom feed top
2024-12-03 11:36 [PATCH 0/5] fix test/unittest/usdt/tst.multitrace.sh Nick Alcock
2024-12-03 11:36 ` [PATCH 1/5] Revert "Tweak self-armouring" Nick Alcock
2024-12-07 4:37 ` [DTrace-devel] " Kris Van Hees
2024-12-03 11:36 ` [PATCH 2/5] proc: more self-grab improvements Nick Alcock
2024-12-03 13:43 ` Nick Alcock
2024-12-05 12:58 ` Nick Alcock
2024-12-05 13:43 ` [DTrace-devel] " Nick Alcock
2024-12-07 4:38 ` Kris Van Hees
2024-12-03 11:36 ` [PATCH 3/5] libproc: debugging improvements Nick Alcock
2024-12-07 4:38 ` Kris Van Hees [this message]
2024-12-03 11:36 ` [PATCH 4/5] libproc: guard against Puntrace() of terminated processes Nick Alcock
2024-12-03 18:06 ` [DTrace-devel] " Nick Alcock
2024-12-03 18:09 ` Nick Alcock
2024-12-07 4:38 ` Kris Van Hees
2024-12-03 11:36 ` [PATCH 5/5] libproc: drop Pgrab() special cases in Ptrace() Nick Alcock
2024-12-07 4:40 ` Kris Van Hees
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=Z1PRShEInAMO3WPy@oracle.com \
--to=kris.van.hees@oracle.com \
--cc=dtrace-devel@oss.oracle.com \
--cc=dtrace@lists.linux.dev \
--cc=eugene.loh@oracle.com \
--cc=nick.alcock@oracle.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.