From: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
To: Steven Rostedt <rostedt@goodmis.org>,
	linux-kernel@vger.kernel.org, linux-trace-kernel@vger.kernel.org
Cc: Masami Hiramatsu <mhiramat@kernel.org>,
	Mark Rutland <mark.rutland@arm.com>,
	Andrew Morton <akpm@linux-foundation.org>,
	Josh Poimboeuf <jpoimboe@kernel.org>,
	x86@kernel.org, Peter Zijlstra <peterz@infradead.org>,
	Ingo Molnar <mingo@kernel.org>,
	Arnaldo Carvalho de Melo <acme@kernel.org>,
	Indu Bhagat <indu.bhagat@oracle.com>,
	Alexander Shishkin <alexander.shishkin@linux.intel.com>,
	Jiri Olsa <jolsa@kernel.org>, Namhyung Kim <namhyung@kernel.org>,
	Ian Rogers <irogers@google.com>,
	Adrian Hunter <adrian.hunter@intel.com>,
	linux-perf-users@vger.kernel.org, Mark Brown <broonie@kernel.org>,
	linux-toolchains@vger.kernel.org, Jordan Rome <jordalgo@meta.com>,
	Sam James <sam@gentoo.org>,
	Andrii Nakryiko <andrii.nakryiko@gmail.com>,
	Jens Remus <jremus@linux.ibm.com>,
	Florian Weimer <fweimer@redhat.com>,
	Andy Lutomirski <luto@kernel.org>, Weinan Liu <wnliu@google.com>,
	Blake Jones <blakejones@google.com>,
	Beau Belgrave <beaub@linux.microsoft.com>,
	"Jose E. Marchesi" <jemarch@gnu.org>,
	Alexander Aring <aahringo@redhat.com>
Subject: Re: [PATCH v5 3/9] unwind deferred: Use bitmask to determine which callbacks to call
Date: Mon, 28 Apr 2025 12:33:50 -0400
Message-ID: <02339b93-de9a-49e4-8dbb-137d02fc6ea8@efficios.com>
In-Reply-To: <20250424192612.844558089@goodmis.org>

On 2025-04-24 15:24, Steven Rostedt wrote:
> From: Steven Rostedt <rostedt@goodmis.org>
> 
> In order to know which registered callback requested a stacktrace for when
> the task goes back to user space, add a bitmask for all registered
> tracers. The bitmask is the size of log, which means that on a 32 bit

Typo: "size of log" should be "size of long".

> machine, it can have at most 32 registered tracers, and on 64 bit, it can
> have at most 64 registered tracers. This should not be an issue as there
> should not be more than 10 (unless BPF can abuse this?).
> 
> When a tracer registers with unwind_deferred_init() it will get a bit
> number assigned to it. When a tracer requests a stacktrace, it will have
> its bit set within the task_struct. When the task returns back to user
> space, it will call the callbacks for all the registered tracers where
> their bits are set in the task's mask.
> 
> When a tracer is removed by the unwind_deferred_cancel() all current tasks
> will clear the associated bit, just in case another tracer gets registered
> immediately afterward and then gets their callback called unexpectedly.
> 
> Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
> ---
>   include/linux/sched.h           |  1 +
>   include/linux/unwind_deferred.h |  1 +
>   kernel/unwind/deferred.c        | 44 ++++++++++++++++++++++++++++++---
>   3 files changed, 42 insertions(+), 4 deletions(-)
> 
> diff --git a/include/linux/sched.h b/include/linux/sched.h
> index a1e1c07cadfb..d3ee0c5405d6 100644
> --- a/include/linux/sched.h
> +++ b/include/linux/sched.h
> @@ -1649,6 +1649,7 @@ struct task_struct {
>   
>   #ifdef CONFIG_UNWIND_USER
>   	struct unwind_task_info		unwind_info;
> +	unsigned long			unwind_mask;
>   #endif
>   
>   	/* CPU-specific state of this task: */
> diff --git a/include/linux/unwind_deferred.h b/include/linux/unwind_deferred.h
> index d36784cae658..719a7cfb3164 100644
> --- a/include/linux/unwind_deferred.h
> +++ b/include/linux/unwind_deferred.h
> @@ -13,6 +13,7 @@ typedef void (*unwind_callback_t)(struct unwind_work *work, struct unwind_stackt
>   struct unwind_work {
>   	struct list_head		list;
>   	unwind_callback_t		func;
> +	int				bit;

int or unsigned int?

Rename "bit" to "requester_id"?

>   };
>   
>   #ifdef CONFIG_UNWIND_USER
> diff --git a/kernel/unwind/deferred.c b/kernel/unwind/deferred.c
> index 2afd197da2ef..f505cb1766de 100644
> --- a/kernel/unwind/deferred.c
> +++ b/kernel/unwind/deferred.c
> @@ -26,6 +26,7 @@ static DEFINE_PER_CPU(u64, unwind_ctx_ctr);
>   /* Guards adding to and reading the list of callbacks */
>   static DEFINE_MUTEX(callback_mutex);
>   static LIST_HEAD(callbacks);
> +static unsigned long unwind_mask;

Perhaps "reserved_unwind_mask"?

>   
>   /*
>    * The context cookie is a unique identifier that is assigned to a user
> @@ -135,6 +136,7 @@ static void unwind_deferred_task_work(struct callback_head *head)
>   	struct unwind_task_info *info = container_of(head, struct unwind_task_info, work);
>   	struct unwind_stacktrace trace;
>   	struct unwind_work *work;
> +	struct task_struct *task = current;
>   	u64 cookie;
>   
>   	if (WARN_ON_ONCE(!info->pending))
> @@ -156,7 +158,10 @@ static void unwind_deferred_task_work(struct callback_head *head)
>   
>   	guard(mutex)(&callback_mutex);
>   	list_for_each_entry(work, &callbacks, list) {
> -		work->func(work, &trace, cookie);
> +		if (task->unwind_mask & (1UL << work->bit)) {
> +			work->func(work, &trace, cookie);
> +			clear_bit(work->bit, &current->unwind_mask);
> +		}

You could replace this list of callbacks with an array of pointers,
indexed by "requester_id".

Then iterate over each set bit in task->unwind_mask, and each set bit
calls the callback at the matching array index.
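
For example (a completely untested sketch; the callbacks[] array and
its population at registration time are hypothetical):

	/* One slot per requester id, instead of the list: */
	static struct unwind_work *callbacks[BITS_PER_LONG];

	/* ... then in unwind_deferred_task_work(): */
	unsigned int bit;

	for_each_set_bit(bit, &task->unwind_mask, BITS_PER_LONG) {
		callbacks[bit]->func(callbacks[bit], &trace, cookie);
		clear_bit(bit, &task->unwind_mask);
	}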

>   	}
>   	barrier();
>   	/* If another task work is pending, reuse the cookie and stack trace */
> @@ -194,9 +199,12 @@ static int unwind_deferred_request_nmi(struct unwind_work *work, u64 *cookie)
>   		*cookie = info->nmi_cookie;
>   	}
>   
> -	if (info->pending)
> +	if (current->unwind_mask & (1UL << work->bit))
>   		return 0;
>   
> +	if (info->pending)
> +		goto out;
> +
>   	ret = task_work_add(current, &info->work, TWA_NMI_CURRENT);
>   	if (ret) {
>   		if (inited_cookie)
> @@ -205,6 +213,8 @@ static int unwind_deferred_request_nmi(struct unwind_work *work, u64 *cookie)
>   	}
>   
>   	info->pending = 1;
> + out:
> +	set_bit(work->bit, &current->unwind_mask);
>   
>   	return 0;
>   }
> @@ -244,14 +254,18 @@ int unwind_deferred_request(struct unwind_work *work, u64 *cookie)
>   
>   	*cookie = get_cookie(info);
>   
> +	/* This is already queued */
> +	if (current->unwind_mask & (1UL << work->bit))
> +		return 0;
> +
>   	/* callback already pending? */
>   	pending = READ_ONCE(info->pending);
>   	if (pending)
> -		return 0;
> +		goto out;
>   
>   	/* Claim the work unless an NMI just now swooped in to do so. */
>   	if (!try_cmpxchg(&info->pending, &pending, 1))

Not that it necessarily matters performance-wise here, but could this
be a try_cmpxchg_local(), given that we are working on the task struct
and only expect interruption from NMIs?
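
i.e. something like this (untested, assuming info->pending is only
ever written by the owning task and its NMIs):

	/* Claim the work unless an NMI just now swooped in to do so. */
	if (!try_cmpxchg_local(&info->pending, &pending, 1))
		goto out;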

> -		return 0;
> +		goto out;
>   
>   	/* The work has been claimed, now schedule it. */
>   	ret = task_work_add(current, &info->work, TWA_RESUME);
> @@ -260,16 +274,29 @@ int unwind_deferred_request(struct unwind_work *work, u64 *cookie)
>   		return ret;
>   	}
>   
> + out:
> +	set_bit(work->bit, &current->unwind_mask);
> +
>   	return 0;
>   }
>   
>   void unwind_deferred_cancel(struct unwind_work *work)
>   {
> +	struct task_struct *g, *t;
> +
>   	if (!work)
>   		return;
>   
>   	guard(mutex)(&callback_mutex);
>   	list_del(&work->list);
> +
> +	clear_bit(work->bit, &unwind_mask);
> +
> +	guard(rcu)();
> +	/* Clear this bit from all threads */
> +	for_each_process_thread(g, t) {
> +		clear_bit(work->bit, &t->unwind_mask);
> +	}

Is it enough to guard with RCU? See syscall_regfunc() in
tracepoint.c, where we do:

                 read_lock(&tasklist_lock);
                 for_each_process_thread(p, t) {
                         set_task_syscall_work(t, SYSCALL_TRACEPOINT);
                 }
                 read_unlock(&tasklist_lock);

Holding tasklist_lock prevents a concurrent fork from adding threads
while we iterate, which would otherwise open the possibility of
missing a clear due to a concurrent fork + set bit.
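
Applied here, that would be something like (untested):

	read_lock(&tasklist_lock);
	/* Clear this bit from all threads */
	for_each_process_thread(g, t)
		clear_bit(work->bit, &t->unwind_mask);
	read_unlock(&tasklist_lock);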

Thanks,

Mathieu

>   }
>   
>   int unwind_deferred_init(struct unwind_work *work, unwind_callback_t func)
> @@ -277,6 +304,14 @@ int unwind_deferred_init(struct unwind_work *work, unwind_callback_t func)
>   	memset(work, 0, sizeof(*work));
>   
>   	guard(mutex)(&callback_mutex);
> +
> +	/* See if there's a bit in the mask available */
> +	if (unwind_mask == ~0UL)
> +		return -EBUSY;
> +
> +	work->bit = ffz(unwind_mask);
> +	unwind_mask |= 1UL << work->bit;
> +
>   	list_add(&work->list, &callbacks);
>   	work->func = func;
>   	return 0;
> @@ -288,6 +323,7 @@ void unwind_task_init(struct task_struct *task)
>   
>   	memset(info, 0, sizeof(*info));
>   	init_task_work(&info->work, unwind_deferred_task_work);
> +	task->unwind_mask = 0;
>   }
>   
>   void unwind_task_free(struct task_struct *task)


-- 
Mathieu Desnoyers
EfficiOS Inc.
https://www.efficios.com

Thread overview: 16+ messages
2025-04-24 19:24 [PATCH v5 0/9] tracing: Deferred unwinding of user space stack traces Steven Rostedt
2025-04-24 19:24 ` [PATCH v5 1/9] unwind_user/deferred: Add deferred unwinding interface Steven Rostedt
2025-04-24 19:24 ` [PATCH v5 2/9] unwind_user/deferred: Make unwind deferral requests NMI-safe Steven Rostedt
2025-04-24 19:24 ` [PATCH v5 3/9] unwind deferred: Use bitmask to determine which callbacks to call Steven Rostedt
2025-04-28 16:33   ` Mathieu Desnoyers [this message]
2025-04-28 16:56     ` Steven Rostedt
2025-04-28 18:00       ` Mathieu Desnoyers
2025-04-28 18:12         ` Steven Rostedt
2025-04-28 18:13           ` Mathieu Desnoyers
2025-04-24 19:25 ` [PATCH v5 4/9] tracing: Do not bother getting user space stacktraces for kernel threads Steven Rostedt
2025-04-24 19:25 ` [PATCH v5 5/9] tracing: Rename __dynamic_array() to __dynamic_field() for ftrace events Steven Rostedt
2025-04-24 19:25 ` [PATCH v5 6/9] tracing: Implement deferred user space stacktracing Steven Rostedt
2025-04-24 19:25 ` [PATCH v5 7/9] mm: Add guard for mmap_read_lock Steven Rostedt
2025-04-24 19:25 ` [PATCH v5 8/9] tracing: Have deferred user space stacktrace show file offsets Steven Rostedt
2025-04-24 19:25 ` [PATCH v5 9/9] tracing: Show inode and device major:minor in deferred user space stacktrace Steven Rostedt
2025-04-24 19:29 ` [PATCH v5 0/9] tracing: Deferred unwinding of user space stack traces Steven Rostedt
