All of lore.kernel.org
 help / color / mirror / Atom feed
From: "Daniel P. Berrangé" <berrange@redhat.com>
To: Stefan Hajnoczi <stefanha@redhat.com>
Cc: virtio-fs@redhat.com, qemu-devel@nongnu.org
Subject: Re: [Virtio-fs] [PATCH 2/2] virtiofsd: move to a new pid namespace
Date: Thu, 24 Oct 2019 11:26:11 +0100	[thread overview]
Message-ID: <20191024102611.GF3700@redhat.com> (raw)
In-Reply-To: <20191016160157.12414-3-stefanha@redhat.com>

On Wed, Oct 16, 2019 at 05:01:57PM +0100, Stefan Hajnoczi wrote:
> virtiofsd needs access to /proc/self/fd.  Let's move to a new pid
> namespace so that a compromised process cannot see any other
> processes running on the system.
> 
> One wrinkle in this approach: unshare(CLONE_NEWPID) affects *child*
> processes and not the current process.  Therefore we need to fork the
> pid 1 process that will actually run virtiofsd and leave a parent in
> waitpid(2).  This is not the same thing as daemonization and parent
> processes should not notice a difference.
> 
> Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
> ---
>  contrib/virtiofsd/passthrough_ll.c | 95 ++++++++++++++++++++++--------
>  1 file changed, 72 insertions(+), 23 deletions(-)
> 
> diff --git a/contrib/virtiofsd/passthrough_ll.c b/contrib/virtiofsd/passthrough_ll.c
> index c27ff7d800..b6ee9b2e90 100644
> --- a/contrib/virtiofsd/passthrough_ll.c
> +++ b/contrib/virtiofsd/passthrough_ll.c
> @@ -56,9 +56,12 @@
>  #include <sys/xattr.h>
>  #include <sys/mman.h>
>  #include <sys/socket.h>
> +#include <sys/types.h>
>  #include <sys/un.h>
> +#include <sys/wait.h>
>  #include <sys/capability.h>
>  
> +
>  #include "ireg.h"
>  #include <sys/mount.h>
>  #include <sys/resource.h>
> @@ -2749,6 +2752,72 @@ static void setup_net_namespace(void)
>  	}
>  }
>  
> +/*
> + * Move to a new pid namespace to prevent access to other processes if this
> + * process is compromised.
> + */
> +static void setup_pid_namespace(void)
> +{
> +	pid_t child;
> +
> +	/*
> +	 * Create a new pid namespace for *child* processes.  We'll have to
> +	 * fork in order to enter the new pid namespace.  A new mount namespace
> +	 * is also needed so that we can remount /proc for the new pid
> +	 * namespace.
> +	 */
> +	if (unshare(CLONE_NEWPID | CLONE_NEWNS) != 0) {
> +		fuse_log(FUSE_LOG_ERR, "unshare(CLONE_NEWPID | CLONE_NEWNS): %m\n");
> +		exit(1);
> +	}
> +
> +	child = fork();
> +	if (child < 0) {
> +		fuse_log(FUSE_LOG_ERR, "fork() failed: %m\n");
> +		exit(1);
> +	}
> +	if (child > 0) {
> +		pid_t waited;
> +		int wstatus;
> +
> +		/* The parent waits for the child */
> +		do {
> +			waited = waitpid(child, &wstatus, 0);
> +		} while (waited < 0 && errno == EINTR);
> +
> +		if (WIFEXITED(wstatus)) {
> +			exit(WEXITSTATUS(wstatus));
> +		}
> +
> +		exit(1);
> +	}

It might be useful to call prctl(PR_SET_PDEATHSIG) here, so that
if the parent process exits for any reason, the child will be killed
off too.

> +
> +	/*
> +	 * If the mounts have shared propagation then we want to opt out so our
> +	 * mount changes don't affect the parent mount namespace.
> +	 */
> +	if (mount(NULL, "/", NULL, MS_REC|MS_SLAVE, NULL) < 0) {
> +		fuse_log(FUSE_LOG_ERR, "mount(/, MS_REC|MS_SLAVE): %m\n");
> +		exit(1);
> +	}
> +
> +	/* The child must remount /proc to use the new pid namespace */
> +	if (mount("proc", "/proc", "proc",
> +		  MS_NODEV | MS_NOEXEC | MS_NOSUID | MS_RELATIME, NULL) < 0) {
> +		fuse_log(FUSE_LOG_ERR, "mount(/proc): %m\n");
> +		exit(1);
> +	}
> +}

I feel like this is making things a bit misleading.

 setup_pid_namespace()

is now creating the mount namespace and pid namespace, and doing
some mount point config

 setup_mount_namespace()

is not creating the mount namespace, but is doing some more mount
point config.

And then there's setup_net_namespace() too.

I think there could be a single

  setup_namespaces()

method that does the unshare(CLONE_NEWNS|CLONE_NEWNET|CLONE_NEWPID)
and forks the child.

And a setup_mounts()

method that does all the mount() calls.

> +
> +static void setup_proc_self_fd(struct lo_data *lo)
> +{
> +	lo->proc_self_fd = open("/proc/self/fd", O_PATH);
> +	if (lo->proc_self_fd == -1) {
> +		fuse_log(FUSE_LOG_ERR, "open(/proc/self/fd, O_PATH): %m\n");
> +		exit(1);
> +	}
> +}
> +
>  /* This magic is based on lxc's lxc_pivot_root() */
>  static void setup_pivot_root(const char *source)
>  {
> @@ -2803,20 +2872,10 @@ static void setup_pivot_root(const char *source)
>  
>  /*
>   * Make the source directory our root so symlinks cannot escape and no other
> - * files are accessible.
> + * files are accessible.  Assumes unshare(CLONE_NEWNS) was already called.
>   */
>  static void setup_mount_namespace(const char *source)
>  {
> -	if (unshare(CLONE_NEWNS) != 0) {
> -		fuse_log(FUSE_LOG_ERR, "unshare(CLONE_NEWNS): %m\n");
> -		exit(1);
> -	}
> -
> -	if (mount(NULL, "/", NULL, MS_REC|MS_SLAVE, NULL) < 0) {
> -		fuse_log(FUSE_LOG_ERR, "mount(/, MS_REC|MS_PRIVATE): %m\n");
> -		exit(1);
> -	}
> -
>  	if (mount(source, source, NULL, MS_BIND, NULL) < 0) {
>  		fuse_log(FUSE_LOG_ERR, "mount(%s, %s, MS_BIND): %m\n", source, source);
>  		exit(1);
> @@ -2831,6 +2890,8 @@ static void setup_mount_namespace(const char *source)
>   */
>  static void setup_sandbox(struct lo_data *lo, bool enable_syslog)
>  {
> +	setup_pid_namespace();
> +	setup_proc_self_fd(lo);
>  	setup_net_namespace();
>  	setup_mount_namespace(lo->source);
>  	setup_seccomp(enable_syslog);
> @@ -2860,15 +2921,6 @@ static void setup_root(struct lo_data *lo, struct lo_inode *root)
>  	g_atomic_int_set(&root->refcount, 2);
>  }
>  
> -static void setup_proc_self_fd(struct lo_data *lo)
> -{
> -	lo->proc_self_fd = open("/proc/self/fd", O_PATH);
> -	if (lo->proc_self_fd == -1) {
> -		fuse_log(FUSE_LOG_ERR, "open(/proc/self/fd, O_PATH): %m\n");
> -		exit(1);
> -	}
> -}
> -
>  /* Raise the maximum number of open file descriptors to the system limit */
>  static void setup_nofile_rlimit(void)
>  {
> @@ -3110,9 +3162,6 @@ int main(int argc, char *argv[])
>  		get_shared(&lo, &lo.root);
>  	}
>  
> -	/* Must be after daemonize to get the right /proc/self/fd */
> -	setup_proc_self_fd(&lo);
> -
>  	setup_sandbox(&lo, opts.syslog);
>  
>  	setup_root(&lo, &lo.root);
> -- 
> 2.21.0
> 
> 

Regards,
Daniel
-- 
|: https://berrange.com      -o-    https://www.flickr.com/photos/dberrange :|
|: https://libvirt.org         -o-            https://fstop138.berrange.com :|
|: https://entangle-photo.org    -o-    https://www.instagram.com/dberrange :|


WARNING: multiple messages have this Message-ID (diff)
From: "Daniel P. Berrangé" <berrange@redhat.com>
To: Stefan Hajnoczi <stefanha@redhat.com>
Cc: virtio-fs@redhat.com, qemu-devel@nongnu.org,
	"Dr. David Alan Gilbert" <dgilbert@redhat.com>
Subject: Re: [PATCH 2/2] virtiofsd: move to a new pid namespace
Date: Thu, 24 Oct 2019 11:26:11 +0100	[thread overview]
Message-ID: <20191024102611.GF3700@redhat.com> (raw)
In-Reply-To: <20191016160157.12414-3-stefanha@redhat.com>

On Wed, Oct 16, 2019 at 05:01:57PM +0100, Stefan Hajnoczi wrote:
> virtiofsd needs access to /proc/self/fd.  Let's move to a new pid
> namespace so that a compromised process cannot see any other
> processes running on the system.
> 
> One wrinkle in this approach: unshare(CLONE_NEWPID) affects *child*
> processes and not the current process.  Therefore we need to fork the
> pid 1 process that will actually run virtiofsd and leave a parent in
> waitpid(2).  This is not the same thing as daemonization and parent
> processes should not notice a difference.
> 
> Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
> ---
>  contrib/virtiofsd/passthrough_ll.c | 95 ++++++++++++++++++++++--------
>  1 file changed, 72 insertions(+), 23 deletions(-)
> 
> diff --git a/contrib/virtiofsd/passthrough_ll.c b/contrib/virtiofsd/passthrough_ll.c
> index c27ff7d800..b6ee9b2e90 100644
> --- a/contrib/virtiofsd/passthrough_ll.c
> +++ b/contrib/virtiofsd/passthrough_ll.c
> @@ -56,9 +56,12 @@
>  #include <sys/xattr.h>
>  #include <sys/mman.h>
>  #include <sys/socket.h>
> +#include <sys/types.h>
>  #include <sys/un.h>
> +#include <sys/wait.h>
>  #include <sys/capability.h>
>  
> +
>  #include "ireg.h"
>  #include <sys/mount.h>
>  #include <sys/resource.h>
> @@ -2749,6 +2752,72 @@ static void setup_net_namespace(void)
>  	}
>  }
>  
> +/*
> + * Move to a new pid namespace to prevent access to other processes if this
> + * process is compromised.
> + */
> +static void setup_pid_namespace(void)
> +{
> +	pid_t child;
> +
> +	/*
> +	 * Create a new pid namespace for *child* processes.  We'll have to
> +	 * fork in order to enter the new pid namespace.  A new mount namespace
> +	 * is also needed so that we can remount /proc for the new pid
> +	 * namespace.
> +	 */
> +	if (unshare(CLONE_NEWPID | CLONE_NEWNS) != 0) {
> +		fuse_log(FUSE_LOG_ERR, "unshare(CLONE_NEWPID | CLONE_NEWNS): %m\n");
> +		exit(1);
> +	}
> +
> +	child = fork();
> +	if (child < 0) {
> +		fuse_log(FUSE_LOG_ERR, "fork() failed: %m\n");
> +		exit(1);
> +	}
> +	if (child > 0) {
> +		pid_t waited;
> +		int wstatus;
> +
> +		/* The parent waits for the child */
> +		do {
> +			waited = waitpid(child, &wstatus, 0);
> +		} while (waited < 0 && errno == EINTR);
> +
> +		if (WIFEXITED(wstatus)) {
> +			exit(WEXITSTATUS(wstatus));
> +		}
> +
> +		exit(1);
> +	}

It might be useful to call prctl(PR_SET_PDEATHSIG) here, so that
if the parent process exits for any reason, the child will be killed
off too.

> +
> +	/*
> +	 * If the mounts have shared propagation then we want to opt out so our
> +	 * mount changes don't affect the parent mount namespace.
> +	 */
> +	if (mount(NULL, "/", NULL, MS_REC|MS_SLAVE, NULL) < 0) {
> +		fuse_log(FUSE_LOG_ERR, "mount(/, MS_REC|MS_SLAVE): %m\n");
> +		exit(1);
> +	}
> +
> +	/* The child must remount /proc to use the new pid namespace */
> +	if (mount("proc", "/proc", "proc",
> +		  MS_NODEV | MS_NOEXEC | MS_NOSUID | MS_RELATIME, NULL) < 0) {
> +		fuse_log(FUSE_LOG_ERR, "mount(/proc): %m\n");
> +		exit(1);
> +	}
> +}

I feel like this is making things a bit misleading.

 setup_pid_namespace()

is now creating the mount namespace and pid namespace, and doing
some mount point config

 setup_mount_namespace()

is not creating the mount namespace, but is doing some more mount
point config.

And then there's setup_net_namespace() too.

I think there could be a single

  setup_namespaces()

method that does the unshare(CLONE_NEWNS|CLONE_NEWNET|CLONE_NEWPID)
and forks the child.

And a setup_mounts()

method that does all the mount() calls.

> +
> +static void setup_proc_self_fd(struct lo_data *lo)
> +{
> +	lo->proc_self_fd = open("/proc/self/fd", O_PATH);
> +	if (lo->proc_self_fd == -1) {
> +		fuse_log(FUSE_LOG_ERR, "open(/proc/self/fd, O_PATH): %m\n");
> +		exit(1);
> +	}
> +}
> +
>  /* This magic is based on lxc's lxc_pivot_root() */
>  static void setup_pivot_root(const char *source)
>  {
> @@ -2803,20 +2872,10 @@ static void setup_pivot_root(const char *source)
>  
>  /*
>   * Make the source directory our root so symlinks cannot escape and no other
> - * files are accessible.
> + * files are accessible.  Assumes unshare(CLONE_NEWNS) was already called.
>   */
>  static void setup_mount_namespace(const char *source)
>  {
> -	if (unshare(CLONE_NEWNS) != 0) {
> -		fuse_log(FUSE_LOG_ERR, "unshare(CLONE_NEWNS): %m\n");
> -		exit(1);
> -	}
> -
> -	if (mount(NULL, "/", NULL, MS_REC|MS_SLAVE, NULL) < 0) {
> -		fuse_log(FUSE_LOG_ERR, "mount(/, MS_REC|MS_PRIVATE): %m\n");
> -		exit(1);
> -	}
> -
>  	if (mount(source, source, NULL, MS_BIND, NULL) < 0) {
>  		fuse_log(FUSE_LOG_ERR, "mount(%s, %s, MS_BIND): %m\n", source, source);
>  		exit(1);
> @@ -2831,6 +2890,8 @@ static void setup_mount_namespace(const char *source)
>   */
>  static void setup_sandbox(struct lo_data *lo, bool enable_syslog)
>  {
> +	setup_pid_namespace();
> +	setup_proc_self_fd(lo);
>  	setup_net_namespace();
>  	setup_mount_namespace(lo->source);
>  	setup_seccomp(enable_syslog);
> @@ -2860,15 +2921,6 @@ static void setup_root(struct lo_data *lo, struct lo_inode *root)
>  	g_atomic_int_set(&root->refcount, 2);
>  }
>  
> -static void setup_proc_self_fd(struct lo_data *lo)
> -{
> -	lo->proc_self_fd = open("/proc/self/fd", O_PATH);
> -	if (lo->proc_self_fd == -1) {
> -		fuse_log(FUSE_LOG_ERR, "open(/proc/self/fd, O_PATH): %m\n");
> -		exit(1);
> -	}
> -}
> -
>  /* Raise the maximum number of open file descriptors to the system limit */
>  static void setup_nofile_rlimit(void)
>  {
> @@ -3110,9 +3162,6 @@ int main(int argc, char *argv[])
>  		get_shared(&lo, &lo.root);
>  	}
>  
> -	/* Must be after daemonize to get the right /proc/self/fd */
> -	setup_proc_self_fd(&lo);
> -
>  	setup_sandbox(&lo, opts.syslog);
>  
>  	setup_root(&lo, &lo.root);
> -- 
> 2.21.0
> 
> 

Regards,
Daniel
-- 
|: https://berrange.com      -o-    https://www.flickr.com/photos/dberrange :|
|: https://libvirt.org         -o-            https://fstop138.berrange.com :|
|: https://entangle-photo.org    -o-    https://www.instagram.com/dberrange :|



  parent reply	other threads:[~2019-10-24 10:26 UTC|newest]

Thread overview: 16+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2019-10-16 16:01 [Virtio-fs] [PATCH 0/2] virtiofsd: add net and pid namespace sandboxing Stefan Hajnoczi
2019-10-16 16:01 ` Stefan Hajnoczi
2019-10-16 16:01 ` [Virtio-fs] [PATCH 1/2] virtiofsd: move to an empty network namespace Stefan Hajnoczi
2019-10-16 16:01   ` Stefan Hajnoczi
2019-10-23  9:34   ` [Virtio-fs] " Dr. David Alan Gilbert
2019-10-23  9:34     ` Dr. David Alan Gilbert
2019-10-16 16:01 ` [Virtio-fs] [PATCH 2/2] virtiofsd: move to a new pid namespace Stefan Hajnoczi
2019-10-16 16:01   ` Stefan Hajnoczi
2019-10-17 14:45   ` [Virtio-fs] " Vivek Goyal
2019-10-17 16:11     ` Stefan Hajnoczi
2019-10-23  9:46   ` Dr. David Alan Gilbert
2019-10-23  9:46     ` Dr. David Alan Gilbert
2019-10-24 10:26   ` Daniel P. Berrangé [this message]
2019-10-24 10:26     ` Daniel P. Berrangé
2019-10-25 12:53     ` [Virtio-fs] " Stefan Hajnoczi
2019-10-25 12:53       ` Stefan Hajnoczi

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20191024102611.GF3700@redhat.com \
    --to=berrange@redhat.com \
    --cc=qemu-devel@nongnu.org \
    --cc=stefanha@redhat.com \
    --cc=virtio-fs@redhat.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.