All of lore.kernel.org
 help / color / mirror / Atom feed
From: Hajime Tazaki <thehajime@gmail.com>
To: benjamin@sipsolutions.net
Cc: linux-um@lists.infradead.org, johannes@sipsolutions.net,
	benjamin.berg@intel.com
Subject: Re: [PATCH 7/9] um: Implement kernel side of SECCOMP based process handling
Date: Fri, 07 Mar 2025 16:04:50 +0900	[thread overview]
Message-ID: <m2wmd1z5gt.wl-thehajime@gmail.com> (raw)
In-Reply-To: <20250224181827.647129-8-benjamin@sipsolutions.net>


Hello,

thanks for the update; was waiting for this.

On Tue, 25 Feb 2025 03:18:25 +0900,
Benjamin Berg wrote:
> 
> This adds the kernel side of the seccomp based process handling.
> 
> Co-authored-by: Johannes Berg <johannes@sipsolutions.net>
> Signed-off-by: Benjamin Berg <benjamin@sipsolutions.net>
> Signed-off-by: Benjamin Berg <benjamin.berg@intel.com>
(snip)
> diff --git a/arch/um/kernel/skas/mmu.c b/arch/um/kernel/skas/mmu.c
> index f8ee5d612c47..0abc509e3f4c 100644
> --- a/arch/um/kernel/skas/mmu.c
> +++ b/arch/um/kernel/skas/mmu.c
> @@ -38,14 +38,11 @@ int init_new_context(struct task_struct *task, struct mm_struct *mm)
>  	scoped_guard(spinlock_irqsave, &mm_list_lock) {
>  		/* Insert into list, used for lookups when the child dies */
>  		list_add(&mm->context.list, &mm_list);
> -

maybe this is not needed.

>  	}
>  
> -	new_id->pid = start_userspace(stack);
> -	if (new_id->pid < 0) {
> -		ret = new_id->pid;
> +	ret = start_userspace(new_id);
> +	if (ret < 0)
>  		goto out_free;
> -	}
>  
>  	/* Ensure the new MM is clean and nothing unwanted is mapped */
>  	unmap(new_id, 0, STUB_START);
> diff --git a/arch/um/kernel/skas/stub_exe.c b/arch/um/kernel/skas/stub_exe.c
> index 23c99b285e82..f40f2332b676 100644
> --- a/arch/um/kernel/skas/stub_exe.c
> +++ b/arch/um/kernel/skas/stub_exe.c
> @@ -3,6 +3,9 @@
>  #include <asm/unistd.h>
>  #include <sysdep/stub.h>
>  #include <stub-data.h>
> +#include <linux/filter.h>
> +#include <linux/seccomp.h>
> +#include <generated/asm-offsets.h>
>  
>  void _start(void);
>  
> @@ -25,8 +28,6 @@ noinline static void real_init(void)
>  	} sa = {
>  		/* Need to set SA_RESTORER (but the handler never returns) */
>  		.sa_flags = SA_ONSTACK | SA_NODEFER | SA_SIGINFO | 0x04000000,
> -		/* no need to mask any signals */
> -		.sa_mask = 0,
>  	};
>  
>  	/* set a nice name */
> @@ -35,6 +36,9 @@ noinline static void real_init(void)
>  	/* Make sure this process dies if the kernel dies */
>  	stub_syscall2(__NR_prctl, PR_SET_PDEATHSIG, SIGKILL);
>  
> +	/* Needed in SECCOMP mode (and safe to do anyway) */
> +	stub_syscall5(__NR_prctl, PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
> +
>  	/* read information from STDIN and close it */
>  	res = stub_syscall3(__NR_read, 0,
>  			    (unsigned long)&init_data, sizeof(init_data));
> @@ -63,18 +67,133 @@ noinline static void real_init(void)
>  	stack.ss_sp = (void *)init_data.stub_start + UM_KERN_PAGE_SIZE;
>  	stub_syscall2(__NR_sigaltstack, (unsigned long)&stack, 0);
>  
> -	/* register SIGSEGV handler */
> -	sa.sa_handler_ = (void *) init_data.segv_handler;
> -	res = stub_syscall4(__NR_rt_sigaction, SIGSEGV, (unsigned long)&sa, 0,
> -			    sizeof(sa.sa_mask));
> -	if (res != 0)
> -		stub_syscall1(__NR_exit, 13);
> +	/* register signal handlers */
> +	sa.sa_handler_ = (void *) init_data.signal_handler;
> +	sa.sa_restorer = (void *) init_data.signal_restorer;
> +	if (!init_data.seccomp) {
> +		/* In ptrace mode, the SIGSEGV handler never returns */
> +		sa.sa_mask = 0;
> +
> +		res = stub_syscall4(__NR_rt_sigaction, SIGSEGV,
> +				    (unsigned long)&sa, 0, sizeof(sa.sa_mask));
> +		if (res != 0)
> +			stub_syscall1(__NR_exit, 13);
> +	} else {
> +		/* SECCOMP mode uses rt_sigreturn, need to mask all signals */
> +		sa.sa_mask = ~0ULL;
> +
> +		res = stub_syscall4(__NR_rt_sigaction, SIGSEGV,
> +				    (unsigned long)&sa, 0, sizeof(sa.sa_mask));
> +		if (res != 0)
> +			stub_syscall1(__NR_exit, 14);
> +
> +		res = stub_syscall4(__NR_rt_sigaction, SIGSYS,
> +				    (unsigned long)&sa, 0, sizeof(sa.sa_mask));
> +		if (res != 0)
> +			stub_syscall1(__NR_exit, 15);
> +
> +		res = stub_syscall4(__NR_rt_sigaction, SIGALRM,
> +				    (unsigned long)&sa, 0, sizeof(sa.sa_mask));
> +		if (res != 0)
> +			stub_syscall1(__NR_exit, 16);
> +
> +		res = stub_syscall4(__NR_rt_sigaction, SIGTRAP,
> +				    (unsigned long)&sa, 0, sizeof(sa.sa_mask));
> +		if (res != 0)
> +			stub_syscall1(__NR_exit, 17);
> +
> +		res = stub_syscall4(__NR_rt_sigaction, SIGILL,
> +				    (unsigned long)&sa, 0, sizeof(sa.sa_mask));
> +		if (res != 0)
> +			stub_syscall1(__NR_exit, 18);
> +
> +		res = stub_syscall4(__NR_rt_sigaction, SIGFPE,
> +				    (unsigned long)&sa, 0, sizeof(sa.sa_mask));
> +		if (res != 0)
> +			stub_syscall1(__NR_exit, 19);
> +	}
> +
> +	/*
> +	 * If in seccomp mode, install the SECCOMP filter and trigger a syscall.
> +	 * Otherwise set PTRACE_TRACEME and do a SIGSTOP.
> +	 */
> +	if (init_data.seccomp) {
> +		struct sock_filter filter[] = {
> +#if __BITS_PER_LONG > 32
> +			/* [0] Load upper 32bit of instruction pointer from seccomp_data */
> +			BPF_STMT(BPF_LD | BPF_W | BPF_ABS,
> +				 (offsetof(struct seccomp_data, instruction_pointer) + 4)),
> +
> +			/* [1] Jump forward 3 instructions if the upper address is not identical */
> +			BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, (init_data.stub_start) >> 32, 0, 3),
> +#endif
> +			/* [2] Load lower 32bit of instruction pointer from seccomp_data */
> +			BPF_STMT(BPF_LD | BPF_W | BPF_ABS,
> +				 (offsetof(struct seccomp_data, instruction_pointer))),
> +
> +			/* [3] Mask out lower bits */
> +			BPF_STMT(BPF_ALU | BPF_AND | BPF_K, 0xfffff000),
> +
> +			/* [4] Jump to [6] if the lower bits are not on the expected page */
> +			BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, (init_data.stub_start) & 0xfffff000, 1, 0),
> +
> +			/* [5] Trap call, allow */
> +			BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_TRAP),
> +
> +			/* [6,7] Check architecture */
> +			BPF_STMT(BPF_LD | BPF_W | BPF_ABS,
> +				 offsetof(struct seccomp_data, arch)),
> +			BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K,
> +				 UM_SECCOMP_ARCH_NATIVE, 1, 0),
> +
> +			/* [8] Kill (for architecture check) */
> +			BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_KILL_PROCESS),
> +
> +			/* [9] Load syscall number */
> +			BPF_STMT(BPF_LD | BPF_W | BPF_ABS,
> +				 offsetof(struct seccomp_data, nr)),
> +
> +			/* [10-14] Check against permitted syscalls */
> +			BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, __NR_futex,
> +				 5, 0),
> +			BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, STUB_MMAP_NR,
> +				 4, 0),
> +			BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, __NR_munmap,
> +				 3, 0),
> +#ifdef __i386__
> +			BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, __NR_set_thread_area,
> +				 2, 0),
> +#else
> +			BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, __NR_arch_prctl,
> +				 2, 0),
> +#endif
> +			BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, __NR_rt_sigreturn,
> +				 1, 0),

I was trying to understand what you mean 'permitted syscalls' here.
Is this a list of syscall used by UML itself, or something else ?

and should the list be maintained/updated if UML expands the permitted
syscalls ?

> +			/* [15] Not one of the permitted syscalls */
> +			BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_KILL_PROCESS),
> +
> +			/* [16] Permitted call for the stub */
> +			BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_ALLOW),
> +		};
> +		struct sock_fprog prog = {
> +			.len = sizeof(filter) / sizeof(filter[0]),
> +			.filter = filter,
> +		};
> +
> +		if (stub_syscall3(__NR_seccomp, SECCOMP_SET_MODE_FILTER,
> +				  SECCOMP_FILTER_FLAG_TSYNC,
> +				  (unsigned long)&prog) != 0)
> +			stub_syscall1(__NR_exit, 20);
>  
> -	stub_syscall4(__NR_ptrace, PTRACE_TRACEME, 0, 0, 0);
> +		/* Fall through, the exit syscall will cause SIGSYS */
> +	} else {
> +		stub_syscall4(__NR_ptrace, PTRACE_TRACEME, 0, 0, 0);
>  
> -	stub_syscall2(__NR_kill, stub_syscall0(__NR_getpid), SIGSTOP);
> +		stub_syscall2(__NR_kill, stub_syscall0(__NR_getpid), SIGSTOP);
> +	}
>  
> -	stub_syscall1(__NR_exit, 14);
> +	stub_syscall1(__NR_exit, 30);
>  
>  	__builtin_unreachable();
>  }

I was thinking that if I can clean up (or share) the seccomp filter
code of nommu UML with this, but it is not likely as the memory layout
is different.  I would think that the detection part might be useful
as well for nommu.

-- Hajime


  reply	other threads:[~2025-03-07  7:05 UTC|newest]

Thread overview: 13+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2025-02-24 18:18 [PATCH 0/9] SECCOMP based userspace for UML Benjamin Berg
2025-02-24 18:18 ` [PATCH 1/9] um: Store full CSGSFS and SS register from mcontext Benjamin Berg
2025-02-24 18:18 ` [PATCH 2/9] um: Move faultinfo extraction into userspace routine Benjamin Berg
2025-03-18 10:25   ` Johannes Berg
2025-02-24 18:18 ` [PATCH 3/9] um: Add stub side of SECCOMP/futex based process handling Benjamin Berg
2025-02-24 18:18 ` [PATCH 4/9] um: Add helper functions to get/set state for SECCOMP Benjamin Berg
2025-02-24 18:18 ` [PATCH 5/9] um: Add SECCOMP support detection and initialization Benjamin Berg
2025-02-24 18:18 ` [PATCH 6/9] um: Track userspace children dying in SECCOMP mode Benjamin Berg
2025-02-24 18:18 ` [PATCH 7/9] um: Implement kernel side of SECCOMP based process handling Benjamin Berg
2025-03-07  7:04   ` Hajime Tazaki [this message]
2025-03-07 10:27     ` Benjamin Berg
2025-02-24 18:18 ` [PATCH 8/9] um: pass FD for memory operations when needed Benjamin Berg
2025-02-24 18:18 ` [PATCH 9/9] um: Add UML_SECCOMP configuration option Benjamin Berg

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=m2wmd1z5gt.wl-thehajime@gmail.com \
    --to=thehajime@gmail.com \
    --cc=benjamin.berg@intel.com \
    --cc=benjamin@sipsolutions.net \
    --cc=johannes@sipsolutions.net \
    --cc=linux-um@lists.infradead.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.