All of lore.kernel.org
 help / color / mirror / Atom feed
From: "Michael S. Tsirkin" <mst@redhat.com>
To: Mike Christie <michael.christie@oracle.com>
Cc: axboe@kernel.dk, brauner@kernel.org,
	linux-kernel@vger.kernel.org, oleg@redhat.com,
	linux@leemhuis.info, ebiederm@xmission.com, stefanha@redhat.com,
	nicolas.dichtel@6wind.com,
	virtualization@lists.linux-foundation.org,
	torvalds@linux-foundation.org
Subject: Re: [PATCH 3/3] fork, vhost: Use CLONE_THREAD to fix freezer/ps regression
Date: Mon, 22 May 2023 15:40:48 -0400	[thread overview]
Message-ID: <20230522153852-mutt-send-email-mst@kernel.org> (raw)
In-Reply-To: <20230522025124.5863-4-michael.christie@oracle.com>

On Sun, May 21, 2023 at 09:51:24PM -0500, Mike Christie wrote:
> When switching from kthreads to vhost_tasks two bugs were added:
> 1. The vhost worker tasks's now show up as processes so scripts doing ps
> or ps a would not incorrectly detect the vhost task as another process.
> 2. kthreads disabled freeze by setting PF_NOFREEZE, but vhost tasks's
> didn't disable or add support for them.
> 
> To fix both bugs, this switches the vhost task to be thread in the
> process that does the VHOST_SET_OWNER ioctl, and has vhost_worker call
> get_signal to support SIGKILL/SIGSTOP and freeze signals. Note that
> SIGKILL/STOP support is required because CLONE_THREAD requires
> CLONE_SIGHAND which requires those 2 signals to be suppported.
> 
> This a modified version of patch originally written by Linus which
> handles his review comment to himself to rename ignore_signals to
> block_signals to better represent what it now does. And it includes a
> change to vhost_worker() to support SIGSTOP/KILL and freeze, and it
> drops the wait use per Oleg's review comment that it's no longer needed
> when using CLONE_THREAD.
> 
> Fixes: 6e890c5d5021 ("vhost: use vhost_tasks for worker threads")
> Signed-off-by: Mike Christie <michael.christie@oracle.com>
> ---
>  drivers/vhost/vhost.c      | 17 ++++++++++++++++-
>  include/linux/sched/task.h |  2 +-
>  kernel/fork.c              | 12 +++---------
>  kernel/signal.c            |  1 +
>  kernel/vhost_task.c        | 16 ++++------------
>  5 files changed, 25 insertions(+), 23 deletions(-)
> 
> diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
> index a92af08e7864..bf83e9340e40 100644
> --- a/drivers/vhost/vhost.c
> +++ b/drivers/vhost/vhost.c
> @@ -338,6 +338,7 @@ static int vhost_worker(void *data)
>  	struct vhost_worker *worker = data;
>  	struct vhost_work *work, *work_next;
>  	struct llist_node *node;
> +	bool dead = false;
>  
>  	for (;;) {
>  		/* mb paired w/ kthread_stop */
> @@ -349,8 +350,22 @@ static int vhost_worker(void *data)
>  		}
>  
>  		node = llist_del_all(&worker->work_list);
> -		if (!node)
> +		if (!node) {
>  			schedule();
> +			/*
> +			 * When we get a SIGKILL our release function will
> +			 * be called. That will stop new IOs from being queued
> +			 * and check for outstanding cmd responses. It will then
> +			 * call vhost_task_stop to tell us to return and exit.
> +			 */
> +			if (!dead && signal_pending(current)) {
> +				struct ksignal ksig;
> +
> +				dead = get_signal(&ksig);
> +				if (dead)
> +					clear_thread_flag(TIF_SIGPENDING);


Does get_signal actually return true only on SIGKILL then?

> +			}
> +		}
>  
>  		node = llist_reverse_order(node);
>  		/* make sure flag is seen after deletion */
> diff --git a/include/linux/sched/task.h b/include/linux/sched/task.h
> index 537cbf9a2ade..249a5ece9def 100644
> --- a/include/linux/sched/task.h
> +++ b/include/linux/sched/task.h
> @@ -29,7 +29,7 @@ struct kernel_clone_args {
>  	u32 io_thread:1;
>  	u32 user_worker:1;
>  	u32 no_files:1;
> -	u32 ignore_signals:1;
> +	u32 block_signals:1;
>  	unsigned long stack;
>  	unsigned long stack_size;
>  	unsigned long tls;
> diff --git a/kernel/fork.c b/kernel/fork.c
> index ed4e01daccaa..9e04ab5c3946 100644
> --- a/kernel/fork.c
> +++ b/kernel/fork.c
> @@ -2338,14 +2338,10 @@ __latent_entropy struct task_struct *copy_process(
>  		p->flags |= PF_KTHREAD;
>  	if (args->user_worker)
>  		p->flags |= PF_USER_WORKER;
> -	if (args->io_thread) {
> -		/*
> -		 * Mark us an IO worker, and block any signal that isn't
> -		 * fatal or STOP
> -		 */
> +	if (args->io_thread)
>  		p->flags |= PF_IO_WORKER;
> +	if (args->block_signals)
>  		siginitsetinv(&p->blocked, sigmask(SIGKILL)|sigmask(SIGSTOP));
> -	}
>  
>  	if (args->name)
>  		strscpy_pad(p->comm, args->name, sizeof(p->comm));
> @@ -2517,9 +2513,6 @@ __latent_entropy struct task_struct *copy_process(
>  	if (retval)
>  		goto bad_fork_cleanup_io;
>  
> -	if (args->ignore_signals)
> -		ignore_signals(p);
> -
>  	stackleak_task_init(p);
>  
>  	if (pid != &init_struct_pid) {
> @@ -2861,6 +2854,7 @@ struct task_struct *create_io_thread(int (*fn)(void *), void *arg, int node)
>  		.fn_arg		= arg,
>  		.io_thread	= 1,
>  		.user_worker	= 1,
> +		.block_signals	= 1,
>  	};
>  
>  	return copy_process(NULL, 0, node, &args);
> diff --git a/kernel/signal.c b/kernel/signal.c
> index 8050fe23c732..a0f00a078cbb 100644
> --- a/kernel/signal.c
> +++ b/kernel/signal.c
> @@ -2891,6 +2891,7 @@ bool get_signal(struct ksignal *ksig)
>  
>  	return ksig->sig > 0;
>  }
> +EXPORT_SYMBOL_GPL(get_signal);

If you are exporting this, could you add documentation please?


>  /**
>   * signal_delivered - called after signal delivery to update blocked signals
> diff --git a/kernel/vhost_task.c b/kernel/vhost_task.c
> index b7cbd66f889e..7a2d7d9fe772 100644
> --- a/kernel/vhost_task.c
> +++ b/kernel/vhost_task.c
> @@ -31,22 +31,13 @@ static int vhost_task_fn(void *data)
>   */
>  void vhost_task_stop(struct vhost_task *vtsk)
>  {
> -	pid_t pid = vtsk->task->pid;
> -
>  	set_bit(VHOST_TASK_FLAGS_STOP, &vtsk->flags);
>  	wake_up_process(vtsk->task);
>  	/*
>  	 * Make sure vhost_task_fn is no longer accessing the vhost_task before
> -	 * freeing it below. If userspace crashed or exited without closing,
> -	 * then the vhost_task->task could already be marked dead so
> -	 * kernel_wait will return early.
> +	 * freeing it below.
>  	 */
>  	wait_for_completion(&vtsk->exited);
> -	/*
> -	 * If we are just closing/removing a device and the parent process is
> -	 * not exiting then reap the task.
> -	 */
> -	kernel_wait4(pid, NULL, __WCLONE, NULL);
>  	kfree(vtsk);
>  }
>  EXPORT_SYMBOL_GPL(vhost_task_stop);
> @@ -75,13 +66,14 @@ struct vhost_task *vhost_task_create(int (*fn)(void *), void *arg,
>  				     const char *name)
>  {
>  	struct kernel_clone_args args = {
> -		.flags		= CLONE_FS | CLONE_UNTRACED | CLONE_VM,
> +		.flags		= CLONE_FS | CLONE_UNTRACED | CLONE_VM |
> +				  CLONE_THREAD | CLONE_SIGHAND,
>  		.exit_signal	= 0,
>  		.fn		= vhost_task_fn,
>  		.name		= name,
>  		.user_worker	= 1,
>  		.no_files	= 1,
> -		.ignore_signals	= 1,
> +		.block_signals	= 1,
>  	};
>  	struct vhost_task *vtsk;
>  	struct task_struct *tsk;
> -- 
> 2.25.1

_______________________________________________
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization

WARNING: multiple messages have this Message-ID (diff)
From: "Michael S. Tsirkin" <mst@redhat.com>
To: Mike Christie <michael.christie@oracle.com>
Cc: oleg@redhat.com, linux@leemhuis.info, nicolas.dichtel@6wind.com,
	axboe@kernel.dk, ebiederm@xmission.com,
	torvalds@linux-foundation.org, linux-kernel@vger.kernel.org,
	virtualization@lists.linux-foundation.org, sgarzare@redhat.com,
	jasowang@redhat.com, stefanha@redhat.com, brauner@kernel.org
Subject: Re: [PATCH 3/3] fork, vhost: Use CLONE_THREAD to fix freezer/ps regression
Date: Mon, 22 May 2023 15:40:48 -0400	[thread overview]
Message-ID: <20230522153852-mutt-send-email-mst@kernel.org> (raw)
In-Reply-To: <20230522025124.5863-4-michael.christie@oracle.com>

On Sun, May 21, 2023 at 09:51:24PM -0500, Mike Christie wrote:
> When switching from kthreads to vhost_tasks two bugs were added:
> 1. The vhost worker tasks's now show up as processes so scripts doing ps
> or ps a would not incorrectly detect the vhost task as another process.
> 2. kthreads disabled freeze by setting PF_NOFREEZE, but vhost tasks's
> didn't disable or add support for them.
> 
> To fix both bugs, this switches the vhost task to be thread in the
> process that does the VHOST_SET_OWNER ioctl, and has vhost_worker call
> get_signal to support SIGKILL/SIGSTOP and freeze signals. Note that
> SIGKILL/STOP support is required because CLONE_THREAD requires
> CLONE_SIGHAND which requires those 2 signals to be suppported.
> 
> This a modified version of patch originally written by Linus which
> handles his review comment to himself to rename ignore_signals to
> block_signals to better represent what it now does. And it includes a
> change to vhost_worker() to support SIGSTOP/KILL and freeze, and it
> drops the wait use per Oleg's review comment that it's no longer needed
> when using CLONE_THREAD.
> 
> Fixes: 6e890c5d5021 ("vhost: use vhost_tasks for worker threads")
> Signed-off-by: Mike Christie <michael.christie@oracle.com>
> ---
>  drivers/vhost/vhost.c      | 17 ++++++++++++++++-
>  include/linux/sched/task.h |  2 +-
>  kernel/fork.c              | 12 +++---------
>  kernel/signal.c            |  1 +
>  kernel/vhost_task.c        | 16 ++++------------
>  5 files changed, 25 insertions(+), 23 deletions(-)
> 
> diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
> index a92af08e7864..bf83e9340e40 100644
> --- a/drivers/vhost/vhost.c
> +++ b/drivers/vhost/vhost.c
> @@ -338,6 +338,7 @@ static int vhost_worker(void *data)
>  	struct vhost_worker *worker = data;
>  	struct vhost_work *work, *work_next;
>  	struct llist_node *node;
> +	bool dead = false;
>  
>  	for (;;) {
>  		/* mb paired w/ kthread_stop */
> @@ -349,8 +350,22 @@ static int vhost_worker(void *data)
>  		}
>  
>  		node = llist_del_all(&worker->work_list);
> -		if (!node)
> +		if (!node) {
>  			schedule();
> +			/*
> +			 * When we get a SIGKILL our release function will
> +			 * be called. That will stop new IOs from being queued
> +			 * and check for outstanding cmd responses. It will then
> +			 * call vhost_task_stop to tell us to return and exit.
> +			 */
> +			if (!dead && signal_pending(current)) {
> +				struct ksignal ksig;
> +
> +				dead = get_signal(&ksig);
> +				if (dead)
> +					clear_thread_flag(TIF_SIGPENDING);


Does get_signal actually return true only on SIGKILL then?

> +			}
> +		}
>  
>  		node = llist_reverse_order(node);
>  		/* make sure flag is seen after deletion */
> diff --git a/include/linux/sched/task.h b/include/linux/sched/task.h
> index 537cbf9a2ade..249a5ece9def 100644
> --- a/include/linux/sched/task.h
> +++ b/include/linux/sched/task.h
> @@ -29,7 +29,7 @@ struct kernel_clone_args {
>  	u32 io_thread:1;
>  	u32 user_worker:1;
>  	u32 no_files:1;
> -	u32 ignore_signals:1;
> +	u32 block_signals:1;
>  	unsigned long stack;
>  	unsigned long stack_size;
>  	unsigned long tls;
> diff --git a/kernel/fork.c b/kernel/fork.c
> index ed4e01daccaa..9e04ab5c3946 100644
> --- a/kernel/fork.c
> +++ b/kernel/fork.c
> @@ -2338,14 +2338,10 @@ __latent_entropy struct task_struct *copy_process(
>  		p->flags |= PF_KTHREAD;
>  	if (args->user_worker)
>  		p->flags |= PF_USER_WORKER;
> -	if (args->io_thread) {
> -		/*
> -		 * Mark us an IO worker, and block any signal that isn't
> -		 * fatal or STOP
> -		 */
> +	if (args->io_thread)
>  		p->flags |= PF_IO_WORKER;
> +	if (args->block_signals)
>  		siginitsetinv(&p->blocked, sigmask(SIGKILL)|sigmask(SIGSTOP));
> -	}
>  
>  	if (args->name)
>  		strscpy_pad(p->comm, args->name, sizeof(p->comm));
> @@ -2517,9 +2513,6 @@ __latent_entropy struct task_struct *copy_process(
>  	if (retval)
>  		goto bad_fork_cleanup_io;
>  
> -	if (args->ignore_signals)
> -		ignore_signals(p);
> -
>  	stackleak_task_init(p);
>  
>  	if (pid != &init_struct_pid) {
> @@ -2861,6 +2854,7 @@ struct task_struct *create_io_thread(int (*fn)(void *), void *arg, int node)
>  		.fn_arg		= arg,
>  		.io_thread	= 1,
>  		.user_worker	= 1,
> +		.block_signals	= 1,
>  	};
>  
>  	return copy_process(NULL, 0, node, &args);
> diff --git a/kernel/signal.c b/kernel/signal.c
> index 8050fe23c732..a0f00a078cbb 100644
> --- a/kernel/signal.c
> +++ b/kernel/signal.c
> @@ -2891,6 +2891,7 @@ bool get_signal(struct ksignal *ksig)
>  
>  	return ksig->sig > 0;
>  }
> +EXPORT_SYMBOL_GPL(get_signal);

If you are exporting this, could you add documentation please?


>  /**
>   * signal_delivered - called after signal delivery to update blocked signals
> diff --git a/kernel/vhost_task.c b/kernel/vhost_task.c
> index b7cbd66f889e..7a2d7d9fe772 100644
> --- a/kernel/vhost_task.c
> +++ b/kernel/vhost_task.c
> @@ -31,22 +31,13 @@ static int vhost_task_fn(void *data)
>   */
>  void vhost_task_stop(struct vhost_task *vtsk)
>  {
> -	pid_t pid = vtsk->task->pid;
> -
>  	set_bit(VHOST_TASK_FLAGS_STOP, &vtsk->flags);
>  	wake_up_process(vtsk->task);
>  	/*
>  	 * Make sure vhost_task_fn is no longer accessing the vhost_task before
> -	 * freeing it below. If userspace crashed or exited without closing,
> -	 * then the vhost_task->task could already be marked dead so
> -	 * kernel_wait will return early.
> +	 * freeing it below.
>  	 */
>  	wait_for_completion(&vtsk->exited);
> -	/*
> -	 * If we are just closing/removing a device and the parent process is
> -	 * not exiting then reap the task.
> -	 */
> -	kernel_wait4(pid, NULL, __WCLONE, NULL);
>  	kfree(vtsk);
>  }
>  EXPORT_SYMBOL_GPL(vhost_task_stop);
> @@ -75,13 +66,14 @@ struct vhost_task *vhost_task_create(int (*fn)(void *), void *arg,
>  				     const char *name)
>  {
>  	struct kernel_clone_args args = {
> -		.flags		= CLONE_FS | CLONE_UNTRACED | CLONE_VM,
> +		.flags		= CLONE_FS | CLONE_UNTRACED | CLONE_VM |
> +				  CLONE_THREAD | CLONE_SIGHAND,
>  		.exit_signal	= 0,
>  		.fn		= vhost_task_fn,
>  		.name		= name,
>  		.user_worker	= 1,
>  		.no_files	= 1,
> -		.ignore_signals	= 1,
> +		.block_signals	= 1,
>  	};
>  	struct vhost_task *vtsk;
>  	struct task_struct *tsk;
> -- 
> 2.25.1


  parent reply	other threads:[~2023-05-22 19:41 UTC|newest]

Thread overview: 103+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2023-05-22  2:51 [PATCH 0/3] vhost: Fix freezer/ps regressions Mike Christie
2023-05-22  2:51 ` Mike Christie
2023-05-22  2:51 ` [PATCH 1/3] signal: Don't always put SIGKILL in shared_pending Mike Christie
2023-05-22  2:51   ` Mike Christie
2023-05-23 15:30   ` Eric W. Biederman
2023-05-23 15:30     ` Eric W. Biederman
2023-05-22  2:51 ` [PATCH 2/3] signal: Don't exit for PF_USER_WORKER tasks Mike Christie
2023-05-22  2:51   ` Mike Christie
2023-05-22  2:51 ` [PATCH 3/3] fork, vhost: Use CLONE_THREAD to fix freezer/ps regression Mike Christie
2023-05-22  2:51   ` Mike Christie
2023-05-22 12:30   ` Oleg Nesterov
2023-05-22 12:30     ` Oleg Nesterov
2023-05-22 17:00     ` Mike Christie
2023-05-22 17:00       ` Mike Christie
2023-05-22 17:47       ` Oleg Nesterov
2023-05-22 17:47         ` Oleg Nesterov
2023-05-23 12:15         ` Oleg Nesterov
2023-05-23 12:15           ` Oleg Nesterov
2023-05-23 15:57           ` Eric W. Biederman
2023-05-23 15:57             ` Eric W. Biederman
2023-05-24 14:10             ` Oleg Nesterov
2023-05-24 14:10               ` Oleg Nesterov
2023-05-24 14:44               ` Eric W. Biederman
2023-05-24 14:44                 ` Eric W. Biederman
2023-05-25 11:55                 ` Oleg Nesterov
2023-05-25 11:55                   ` Oleg Nesterov
2023-05-25 15:30                   ` Eric W. Biederman
2023-05-25 15:30                     ` Eric W. Biederman
2023-05-25 16:20                     ` Linus Torvalds
2023-05-25 16:20                       ` Linus Torvalds
2023-05-27  9:49                       ` Eric W. Biederman
2023-05-27  9:49                         ` Eric W. Biederman
2023-05-27 16:12                         ` Linus Torvalds
2023-05-27 16:12                           ` Linus Torvalds
2023-05-28  1:17                           ` Eric W. Biederman
2023-05-28  1:17                             ` Eric W. Biederman
2023-05-28  1:21                             ` Linus Torvalds
2023-05-28  1:21                               ` Linus Torvalds
2023-05-29 11:19                             ` Oleg Nesterov
2023-05-29 11:19                               ` Oleg Nesterov
2023-05-29 16:09                               ` michael.christie
2023-05-29 16:09                                 ` michael.christie
2023-05-29 17:46                                 ` Oleg Nesterov
2023-05-29 17:46                                   ` Oleg Nesterov
2023-05-29 17:54                                   ` Oleg Nesterov
2023-05-29 17:54                                     ` Oleg Nesterov
2023-05-29 19:03                                     ` Mike Christie
2023-05-29 19:03                                       ` Mike Christie
2023-05-29 19:35                                   ` Mike Christie
2023-05-29 19:35                                     ` Mike Christie
2023-05-29 19:46                                     ` michael.christie
2023-05-29 19:46                                       ` michael.christie
2023-05-30  2:48                                       ` Eric W. Biederman
2023-05-30  2:48                                         ` Eric W. Biederman
2023-05-30  2:38                                 ` Eric W. Biederman
2023-05-30  2:38                                   ` Eric W. Biederman
2023-05-30 15:34                                   ` Mike Christie
2023-05-30 15:34                                     ` Mike Christie
2023-05-31  3:30                                   ` Mike Christie
2023-05-31  3:30                                     ` Mike Christie
2023-05-29 16:11                               ` michael.christie
2023-05-29 16:11                                 ` michael.christie
2023-05-30 14:15                               ` Christian Brauner
2023-05-30 17:55                                 ` Oleg Nesterov
2023-05-30 17:55                                   ` Oleg Nesterov
2023-05-30 15:01                         ` Eric W. Biederman
2023-05-30 15:01                           ` Eric W. Biederman
2023-05-31  5:22             ` Jason Wang
2023-05-31  5:22               ` Jason Wang
2023-05-24  0:02           ` Mike Christie
2023-05-24  0:02             ` Mike Christie
2023-05-25 16:15           ` Mike Christie
2023-05-25 16:15             ` Mike Christie
2023-05-28  1:41             ` Eric W. Biederman
2023-05-28  1:41               ` Eric W. Biederman
2023-05-28 19:29               ` Mike Christie
2023-05-28 19:29                 ` Mike Christie
2023-05-31  5:22           ` Jason Wang
2023-05-31  5:22             ` Jason Wang
2023-05-31  7:25             ` Oleg Nesterov
2023-05-31  7:25               ` Oleg Nesterov
2023-05-31  8:17               ` Jason Wang
2023-05-31  8:17                 ` Jason Wang
2023-05-31  9:14                 ` Oleg Nesterov
2023-05-31  9:14                   ` Oleg Nesterov
2023-06-01  2:44                   ` Jason Wang
2023-06-01  2:44                     ` Jason Wang
2023-06-01  7:43                     ` Oleg Nesterov
2023-06-01  7:43                       ` Oleg Nesterov
2023-06-02  5:03                       ` Jason Wang
2023-06-02  5:03                         ` Jason Wang
2023-06-02 17:58                         ` Oleg Nesterov
2023-06-02 17:58                           ` Oleg Nesterov
2023-06-02 20:07                           ` Linus Torvalds
2023-06-02 20:07                             ` Linus Torvalds
2023-06-05 14:20                             ` Oleg Nesterov
2023-06-05 14:20                               ` Oleg Nesterov
2023-05-22 19:40   ` Michael S. Tsirkin [this message]
2023-05-22 19:40     ` Michael S. Tsirkin
2023-05-23 15:39     ` Eric W. Biederman
2023-05-23 15:39       ` Eric W. Biederman
2023-05-23 15:48     ` Mike Christie
2023-05-23 15:48       ` Mike Christie

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20230522153852-mutt-send-email-mst@kernel.org \
    --to=mst@redhat.com \
    --cc=axboe@kernel.dk \
    --cc=brauner@kernel.org \
    --cc=ebiederm@xmission.com \
    --cc=linux-kernel@vger.kernel.org \
    --cc=linux@leemhuis.info \
    --cc=michael.christie@oracle.com \
    --cc=nicolas.dichtel@6wind.com \
    --cc=oleg@redhat.com \
    --cc=stefanha@redhat.com \
    --cc=torvalds@linux-foundation.org \
    --cc=virtualization@lists.linux-foundation.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.