All of lore.kernel.org
 help / color / mirror / Atom feed
From: Oleg Nesterov <oleg@redhat.com>
To: Mateusz Guzik <mjguzik@gmail.com>
Cc: brauner@kernel.org, akpm@linux-foundation.org,
	linux-mm@kvack.org, linux-kernel@vger.kernel.org,
	"Eric W. Biederman" <ebiederm@xmission.com>
Subject: Re: [PATCH v2] exit: perform randomness and pid work without tasklist_lock
Date: Tue, 28 Jan 2025 19:29:33 +0100	[thread overview]
Message-ID: <20250128182932.GC24845@redhat.com> (raw)
In-Reply-To: <20250128160743.3142544-1-mjguzik@gmail.com>

(Add Eric).


On 01/28, Mateusz Guzik wrote:
>
> Both add_device_randomness() and attach_pid()/detach_pid()

So afaics this patch does 2 different things, and I do think this needs
2 separate patches. Can you split this change please?

As for add_device_randomness(). I must have missed something, but I still can't
understand why we can't simply shift add_device_randomness(p->sum_exec_runtime)
to release_release_task() and avoid release_task_post->randomness.

You said:

	I wanted to keep the load where it was

but why??? Again, I must have missed something, but to me this simply adds the
unnecessary complications. Either way, ->sum_exec_runtime is not stable even if
task-to-release != current, so what is the point?

Oleg.

> have their
> own synchronisation mechanisms.
>
> The clone side calls them *without* the tasklist_lock held, meaning
> parallel calls can contend on their locks.
>
> The exit side calls them *with* the tasklist_lock lock, which means the
> hold time is avoidably extended by waiting on either of the 2 locks, in
> turn exacerbating contention on tasklist_lock itself.
>
> Postponing the work until after the lock is dropped bumps thread
> creation/destruction rate by 15% on a 24 core vm.
>
> Bench (plop into will-it-scale):
> $ cat tests/threadspawn1.c
>
> char *testcase_description = "Thread creation and teardown";
>
> static void *worker(void *arg)
> {
>         return (NULL);
> }
>
> void testcase(unsigned long long *iterations, unsigned long nr)
> {
>         pthread_t thread;
>         int error;
>
>         while (1) {
>                 error = pthread_create(&thread, NULL, worker, NULL);
>                 assert(error == 0);
>                 error = pthread_join(thread, NULL);
>                 assert(error == 0);
>                 (*iterations)++;
>         }
> }
>
> Run:
> $ ./threadspawn1_processes -t 24
>
> Signed-off-by: Mateusz Guzik <mjguzik@gmail.com>
> ---
>
> v2:
> - introduce a struct to collect work
> - move out pids as well
>
> there is more which can be pulled out
>
> this may look suspicious:
> +	proc_flush_pid(p->thread_pid);
>
> AFAICS this is constant for the duration of the lifetime, so i don't
> there is a problem
>
>  include/linux/pid.h |  1 +
>  kernel/exit.c       | 53 +++++++++++++++++++++++++++++++++------------
>  kernel/pid.c        | 23 +++++++++++++++-----
>  3 files changed, 58 insertions(+), 19 deletions(-)
>
> diff --git a/include/linux/pid.h b/include/linux/pid.h
> index 98837a1ff0f3..6e9fcacd02cd 100644
> --- a/include/linux/pid.h
> +++ b/include/linux/pid.h
> @@ -101,6 +101,7 @@ extern struct pid *get_task_pid(struct task_struct *task, enum pid_type type);
>   * these helpers must be called with the tasklist_lock write-held.
>   */
>  extern void attach_pid(struct task_struct *task, enum pid_type);
> +extern struct pid *detach_pid_return(struct task_struct *task, enum pid_type);
>  extern void detach_pid(struct task_struct *task, enum pid_type);
>  extern void change_pid(struct task_struct *task, enum pid_type,
>  			struct pid *pid);
> diff --git a/kernel/exit.c b/kernel/exit.c
> index 1dcddfe537ee..4e452d3e3a89 100644
> --- a/kernel/exit.c
> +++ b/kernel/exit.c
> @@ -122,14 +122,23 @@ static __init int kernel_exit_sysfs_init(void)
>  late_initcall(kernel_exit_sysfs_init);
>  #endif
>
> -static void __unhash_process(struct task_struct *p, bool group_dead)
> +/*
> + * For things release_task would like to do *after* tasklist_lock is released.
> + */
> +struct release_task_post {
> +	unsigned long long randomness;
> +	struct pid *pids[PIDTYPE_MAX];
> +};
> +
> +static void __unhash_process(struct release_task_post *post, struct task_struct *p,
> +			     bool group_dead)
>  {
>  	nr_threads--;
> -	detach_pid(p, PIDTYPE_PID);
> +	post->pids[PIDTYPE_PID] = detach_pid_return(p, PIDTYPE_PID);
>  	if (group_dead) {
> -		detach_pid(p, PIDTYPE_TGID);
> -		detach_pid(p, PIDTYPE_PGID);
> -		detach_pid(p, PIDTYPE_SID);
> +		post->pids[PIDTYPE_TGID] = detach_pid_return(p, PIDTYPE_TGID);
> +		post->pids[PIDTYPE_PGID] = detach_pid_return(p, PIDTYPE_PGID);
> +		post->pids[PIDTYPE_SID] = detach_pid_return(p, PIDTYPE_SID);
>
>  		list_del_rcu(&p->tasks);
>  		list_del_init(&p->sibling);
> @@ -141,7 +150,8 @@ static void __unhash_process(struct task_struct *p, bool group_dead)
>  /*
>   * This function expects the tasklist_lock write-locked.
>   */
> -static void __exit_signal(struct task_struct *tsk)
> +static void __exit_signal(struct release_task_post *post,
> +			  struct task_struct *tsk)
>  {
>  	struct signal_struct *sig = tsk->signal;
>  	bool group_dead = thread_group_leader(tsk);
> @@ -174,8 +184,7 @@ static void __exit_signal(struct task_struct *tsk)
>  			sig->curr_target = next_thread(tsk);
>  	}
>
> -	add_device_randomness((const void*) &tsk->se.sum_exec_runtime,
> -			      sizeof(unsigned long long));
> +	post->randomness = tsk->se.sum_exec_runtime;
>
>  	/*
>  	 * Accumulate here the counters for all threads as they die. We could
> @@ -197,7 +206,7 @@ static void __exit_signal(struct task_struct *tsk)
>  	task_io_accounting_add(&sig->ioac, &tsk->ioac);
>  	sig->sum_sched_runtime += tsk->se.sum_exec_runtime;
>  	sig->nr_threads--;
> -	__unhash_process(tsk, group_dead);
> +	__unhash_process(post, tsk, group_dead);
>  	write_sequnlock(&sig->stats_lock);
>
>  	/*
> @@ -240,9 +249,13 @@ void __weak release_thread(struct task_struct *dead_task)
>  void release_task(struct task_struct *p)
>  {
>  	struct task_struct *leader;
> -	struct pid *thread_pid;
>  	int zap_leader;
> +	struct release_task_post post;
>  repeat:
> +	memset(&post, 0, sizeof(post));
> +
> +	proc_flush_pid(p->thread_pid);
> +
>  	/* don't need to get the RCU readlock here - the process is dead and
>  	 * can't be modifying its own credentials. But shut RCU-lockdep up */
>  	rcu_read_lock();
> @@ -253,8 +266,7 @@ void release_task(struct task_struct *p)
>
>  	write_lock_irq(&tasklist_lock);
>  	ptrace_release_task(p);
> -	thread_pid = get_pid(p->thread_pid);
> -	__exit_signal(p);
> +	__exit_signal(&post, p);
>
>  	/*
>  	 * If we are the last non-leader member of the thread
> @@ -276,8 +288,21 @@ void release_task(struct task_struct *p)
>  	}
>
>  	write_unlock_irq(&tasklist_lock);
> -	proc_flush_pid(thread_pid);
> -	put_pid(thread_pid);
> +
> +	/*
> +	 * Process clean up deferred to after we drop the tasklist lock.
> +	 */
> +	add_device_randomness((const void*) &post.randomness,
> +			      sizeof(unsigned long long));
> +	if (post.pids[PIDTYPE_PID])
> +		free_pid(post.pids[PIDTYPE_PID]);
> +	if (post.pids[PIDTYPE_TGID])
> +		free_pid(post.pids[PIDTYPE_TGID]);
> +	if (post.pids[PIDTYPE_PGID])
> +		free_pid(post.pids[PIDTYPE_PGID]);
> +	if (post.pids[PIDTYPE_SID])
> +		free_pid(post.pids[PIDTYPE_SID]);
> +
>  	release_thread(p);
>  	put_task_struct_rcu_user(p);
>
> diff --git a/kernel/pid.c b/kernel/pid.c
> index 3a10a7b6fcf8..047cdbcef5cf 100644
> --- a/kernel/pid.c
> +++ b/kernel/pid.c
> @@ -343,7 +343,7 @@ void attach_pid(struct task_struct *task, enum pid_type type)
>  	hlist_add_head_rcu(&task->pid_links[type], &pid->tasks[type]);
>  }
>
> -static void __change_pid(struct task_struct *task, enum pid_type type,
> +static struct pid *__change_pid(struct task_struct *task, enum pid_type type,
>  			struct pid *new)
>  {
>  	struct pid **pid_ptr = task_pid_ptr(task, type);
> @@ -362,20 +362,33 @@ static void __change_pid(struct task_struct *task, enum pid_type type,
>
>  	for (tmp = PIDTYPE_MAX; --tmp >= 0; )
>  		if (pid_has_task(pid, tmp))
> -			return;
> +			return NULL;
>
> -	free_pid(pid);
> +	return pid;
> +}
> +
> +struct pid *detach_pid_return(struct task_struct *task, enum pid_type type)
> +{
> +	return __change_pid(task, type, NULL);
>  }
>
>  void detach_pid(struct task_struct *task, enum pid_type type)
>  {
> -	__change_pid(task, type, NULL);
> +	struct pid *pid;
> +
> +	pid = detach_pid_return(task, type);
> +	if (pid)
> +		free_pid(pid);
>  }
>
>  void change_pid(struct task_struct *task, enum pid_type type,
>  		struct pid *pid)
>  {
> -	__change_pid(task, type, pid);
> +	struct pid *opid;
> +
> +	opid = __change_pid(task, type, pid);
> +	if (opid)
> +		free_pid(opid);
>  	attach_pid(task, type);
>  }
>
> --
> 2.43.0
>



  reply	other threads:[~2025-01-28 18:30 UTC|newest]

Thread overview: 9+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2025-01-28 16:07 [PATCH v2] exit: perform randomness and pid work without tasklist_lock Mateusz Guzik
2025-01-28 18:29 ` Oleg Nesterov [this message]
2025-01-28 18:38   ` Mateusz Guzik
2025-01-28 19:22     ` Oleg Nesterov
2025-01-30 11:01       ` Mateusz Guzik
2025-01-31 20:55 ` Eric W. Biederman
2025-01-31 22:31   ` Mateusz Guzik
2025-01-31 23:09     ` Eric W. Biederman
2025-02-01 14:03       ` Mateusz Guzik

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20250128182932.GC24845@redhat.com \
    --to=oleg@redhat.com \
    --cc=akpm@linux-foundation.org \
    --cc=brauner@kernel.org \
    --cc=ebiederm@xmission.com \
    --cc=linux-kernel@vger.kernel.org \
    --cc=linux-mm@kvack.org \
    --cc=mjguzik@gmail.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.