secure computing for 2.6.7

public inbox for linux-kernel@vger.kernel.org
 help / color / mirror / Atom feed

* secure computing for 2.6.7
@ 2004-07-04 17:39 andrea
  2004-07-04 21:35 ` Andrew Morton
  2004-07-07 19:27 ` Hans Reiser
  0 siblings, 2 replies; 42+ messages in thread
From: andrea @ 2004-07-04 17:39 UTC (permalink / raw)
  To: linux-kernel; +Cc: Andrew Morton

Hello,

I need this new kernel feature for a reseach spare time project I'm
developing in the weekends.  The fast path cost is basically only the
s/testb/testw/ change in entry.S. (and even that might be removed with a
more signficant effort but I don't think anybody could worry about that
change).

This might be better off for 2.7 but I would like if people could have a
look, and it's simple enough that it might be included in 2.6 too later
on. (it just need to be ported to the other archs, only x86 is
implemented here, but that's easy)

Especially I would like to know if anybody can see an hole in this. This
is an order of magnitude more secure of chroot and of capabilities and
much simpler and it doesn't require root privilegies to activate. I
wasn't forced to take secure computing down into kernel space but I
believe it's the simplest and most secure and most efficient approch. An
userspace alternative would been to elaborate this below bytecode
userspace approch but besides being an order of magnitude slower it also
is a lot more complicated and less secure, and it keeps into the
equation the virtual machine that executes the code later on:

	http://aspn.activestate.com/ASPN/Cookbook/Python/Recipe/286134

Furthermore I much prefer to run the bytecode on the bare hardware for
performance reasons, and the less layering the more secure.

I tested it with this:

#include <stdio.h>
#include <signal.h>
#include <unistd.h>

static void sigint(int s)
{
	printf("SIGINT\n");
}
static void sigpipe(int s)
{
	printf("SIGPIPE\n");
	pause();
}

int main(void) {

        signal(SIGINT, sigint);
        signal(SIGPIPE, sigpipe);
	printf("start\n");

	while (1);
        return 0;
}

on one shell:

andrea@xeon:~> echo 1 > /proc/`pidof seccomp`/seccomp
andrea@xeon:~> kill -SIGINT `pidof seccomp`
andrea@xeon:~> kill -SIGINT `pidof seccomp`
andrea@xeon:~> kill -SIGINT `pidof seccomp`
andrea@xeon:~> kill -SIGINT `pidof seccomp`
andrea@xeon:~> kill -SIGINT `pidof seccomp`
andrea@xeon:~> kill -SIGINT `pidof seccomp`
andrea@xeon:~> kill -SIGINT `pidof seccomp`
andrea@xeon:~> kill -SIGPIPE `pidof seccomp`
andrea@xeon:~> 

on the other:
andrea@xeon:~> ./seccomp
start
SIGINT
SIGINT
SIGINT
SIGINT
SIGINT
SIGINT
SIGINT
SIGPIPE
Killed
andrea@xeon:~> echo $?
137
andrea@xeon:~> 

(pause isn't allowed and the secure computing sigkill the task)

diff -urNp --exclude CVS --exclude BitKeeper --exclude {arch} --exclude .arch-ids 2.6.7/arch/i386/kernel/entry.S seccomp/arch/i386/kernel/entry.S
--- 2.6.7/arch/i386/kernel/entry.S	2004-05-10 08:59:10.000000000 +0200
+++ seccomp/arch/i386/kernel/entry.S	2004-07-04 18:22:23.862198096 +0200
@@ -163,12 +163,19 @@ do_lcall:
 	movl %edx,EIP(%ebp)	# Now we move them to their "normal" places
 	movl %ecx,CS(%ebp)	#
 	GET_THREAD_INFO_WITH_ESP(%ebp)	# GET_THREAD_INFO
+	/* call gates cannot run with SECCOMP enabled */
+	testw $(_TIF_SECCOMP),TI_FLAGS(%ebp)
+	jnz sigkill
 	movl TI_EXEC_DOMAIN(%ebp), %edx	# Get the execution domain
 	call *4(%edx)		# Call the lcall7 handler for the domain
 	addl $4, %esp
 	popl %eax
 	jmp resume_userspace
 
+sigkill:
+	pushl $9
+	call do_exit		
+
 ENTRY(lcall27)
 	pushfl			# We get a different stack layout with call
 				# gates, which has to be cleaned up later..
@@ -264,7 +271,7 @@ sysenter_past_esp:
 	cmpl $(nr_syscalls), %eax
 	jae syscall_badsys
 
-	testb $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT),TI_FLAGS(%ebp)
+	testw $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SECCOMP),TI_FLAGS(%ebp)
 	jnz syscall_trace_entry
 	call *sys_call_table(,%eax,4)
 	movl %eax,EAX(%esp)
@@ -287,7 +294,7 @@ ENTRY(system_call)
 	cmpl $(nr_syscalls), %eax
 	jae syscall_badsys
 					# system call tracing in operation
-	testb $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT),TI_FLAGS(%ebp)
+	testw $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SECCOMP),TI_FLAGS(%ebp)
 	jnz syscall_trace_entry
 syscall_call:
 	call *sys_call_table(,%eax,4)
diff -urNp --exclude CVS --exclude BitKeeper --exclude {arch} --exclude .arch-ids 2.6.7/arch/i386/kernel/ptrace.c seccomp/arch/i386/kernel/ptrace.c
--- 2.6.7/arch/i386/kernel/ptrace.c	2004-05-10 08:59:10.000000000 +0200
+++ seccomp/arch/i386/kernel/ptrace.c	2004-07-04 18:23:28.597356856 +0200
@@ -15,6 +15,7 @@
 #include <linux/user.h>
 #include <linux/security.h>
 #include <linux/audit.h>
+#include <linux/seccomp.h>
 
 #include <asm/uaccess.h>
 #include <asm/pgtable.h>
@@ -534,6 +535,8 @@ void do_syscall_trace(struct pt_regs *re
 			audit_syscall_exit(current, regs->eax);
 	}
 
+	if (unlikely(test_thread_flag(TIF_SECCOMP)))
+		secure_computing(regs->orig_eax);
 	if (!test_thread_flag(TIF_SYSCALL_TRACE))
 		return;
 	if (!(current->ptrace & PT_PTRACED))
diff -urNp --exclude CVS --exclude BitKeeper --exclude {arch} --exclude .arch-ids 2.6.7/fs/proc/base.c seccomp/fs/proc/base.c
--- 2.6.7/fs/proc/base.c	2004-05-10 08:59:34.000000000 +0200
+++ seccomp/fs/proc/base.c	2004-07-04 18:43:37.103635976 +0200
@@ -32,6 +32,7 @@
 #include <linux/mount.h>
 #include <linux/security.h>
 #include <linux/ptrace.h>
+#include <linux/seccomp.h>
 
 /*
  * For hysterical raisins we keep the same inumbers as in the old procfs.
@@ -48,6 +49,7 @@ enum pid_directory_inos {
 	PROC_TGID_TASK,
 	PROC_TGID_STATUS,
 	PROC_TGID_MEM,
+	PROC_TGID_SECCOMP,
 	PROC_TGID_CWD,
 	PROC_TGID_ROOT,
 	PROC_TGID_EXE,
@@ -71,6 +73,7 @@ enum pid_directory_inos {
 	PROC_TID_INO,
 	PROC_TID_STATUS,
 	PROC_TID_MEM,
+	PROC_TID_SECCOMP,
 	PROC_TID_CWD,
 	PROC_TID_ROOT,
 	PROC_TID_EXE,
@@ -113,6 +116,7 @@ static struct pid_entry tgid_base_stuff[
 	E(PROC_TGID_STATM,     "statm",   S_IFREG|S_IRUGO),
 	E(PROC_TGID_MAPS,      "maps",    S_IFREG|S_IRUGO),
 	E(PROC_TGID_MEM,       "mem",     S_IFREG|S_IRUSR|S_IWUSR),
+	E(PROC_TGID_SECCOMP,   "seccomp", S_IFREG|S_IRUSR|S_IWUSR),
 	E(PROC_TGID_CWD,       "cwd",     S_IFLNK|S_IRWXUGO),
 	E(PROC_TGID_ROOT,      "root",    S_IFLNK|S_IRWXUGO),
 	E(PROC_TGID_EXE,       "exe",     S_IFLNK|S_IRWXUGO),
@@ -135,6 +139,7 @@ static struct pid_entry tid_base_stuff[]
 	E(PROC_TID_STATM,      "statm",   S_IFREG|S_IRUGO),
 	E(PROC_TID_MAPS,       "maps",    S_IFREG|S_IRUGO),
 	E(PROC_TID_MEM,        "mem",     S_IFREG|S_IRUSR|S_IWUSR),
+	E(PROC_TID_SECCOMP,    "seccomp", S_IFREG|S_IRUSR|S_IWUSR),
 	E(PROC_TID_CWD,        "cwd",     S_IFLNK|S_IRWXUGO),
 	E(PROC_TID_ROOT,       "root",    S_IFLNK|S_IRWXUGO),
 	E(PROC_TID_EXE,        "exe",     S_IFLNK|S_IRWXUGO),
@@ -689,6 +694,58 @@ static struct inode_operations proc_mem_
 	.permission	= proc_permission,
 };
 
+static ssize_t seccomp_read(struct file * file, char * buf,
+			    size_t count, loff_t *ppos)
+{
+	struct task_struct * tsk = proc_task(file->f_dentry->d_inode);
+	char __buf[20];
+	loff_t __ppos = *ppos;
+	size_t len;
+
+	len = sprintf(__buf, "%u\n", tsk->seccomp_mode) + 1;
+	if (__ppos >= len)
+		return 0;
+	if (count > len-__ppos)
+		count = len-__ppos;
+	if (copy_to_user(buf, __buf + __ppos, count))
+		return -EFAULT;
+	*ppos += count;
+	return count;
+}
+
+static ssize_t seccomp_write(struct file * file, const char * buf,
+			     size_t count, loff_t *ppos)
+{
+	struct task_struct * tsk = proc_task(file->f_dentry->d_inode);
+	char __buf[20], * end;
+	unsigned int seccomp_mode;
+
+	/* can set it only once to be even more secure */
+	if (unlikely(tsk->seccomp_mode))
+		return -EPERM;
+
+	memset(__buf, 0, 20);
+	if (count > 19)
+		count = 19;
+	if (copy_from_user(__buf, buf, count))
+		return -EFAULT;
+	seccomp_mode = simple_strtoul(__buf, &end, 0);
+	if (*end == '\n')
+		end++;
+	if (seccomp_mode && seccomp_mode <= NR_SECCOMP_MODES) {
+		tsk->seccomp_mode = seccomp_mode;
+		set_tsk_thread_flag(tsk, TIF_SECCOMP);
+	}
+	if (unlikely(!(end - __buf)))
+		return -EIO;
+	return end - __buf;
+}
+
+static struct file_operations proc_seccomp_operations = {
+	.read		= seccomp_read,
+	.write		= seccomp_write,
+};
+
 static int proc_pid_follow_link(struct dentry *dentry, struct nameidata *nd)
 {
 	struct inode *inode = dentry->d_inode;
@@ -1342,6 +1399,10 @@ static struct dentry *proc_pident_lookup
 			inode->i_op = &proc_mem_inode_operations;
 			inode->i_fop = &proc_mem_operations;
 			break;
+		case PROC_TID_SECCOMP:
+		case PROC_TGID_SECCOMP:
+			inode->i_fop = &proc_seccomp_operations;
+			break;
 		case PROC_TID_MOUNTS:
 		case PROC_TGID_MOUNTS:
 			inode->i_fop = &proc_mounts_operations;
Files 2.6.7/ID and seccomp/ID differ
diff -urNp --exclude CVS --exclude BitKeeper --exclude {arch} --exclude .arch-ids 2.6.7/include/asm-i386/thread_info.h seccomp/include/asm-i386/thread_info.h
--- 2.6.7/include/asm-i386/thread_info.h	2004-05-10 08:59:36.000000000 +0200
+++ seccomp/include/asm-i386/thread_info.h	2004-07-04 18:25:17.304830808 +0200
@@ -152,6 +152,7 @@ static inline unsigned long current_stac
 #define TIF_SINGLESTEP		4	/* restore singlestep on return to user mode */
 #define TIF_IRET		5	/* return with iret */
 #define TIF_SYSCALL_AUDIT	7	/* syscall auditing active */
+#define TIF_SECCOMP		8	/* secure computing */
 #define TIF_POLLING_NRFLAG	16	/* true if poll_idle() is polling TIF_NEED_RESCHED */
 
 #define _TIF_SYSCALL_TRACE	(1<<TIF_SYSCALL_TRACE)
@@ -161,12 +162,13 @@ static inline unsigned long current_stac
 #define _TIF_SINGLESTEP		(1<<TIF_SINGLESTEP)
 #define _TIF_IRET		(1<<TIF_IRET)
 #define _TIF_SYSCALL_AUDIT	(1<<TIF_SYSCALL_AUDIT)
+#define _TIF_SECCOMP		(1<<TIF_SECCOMP)
 #define _TIF_POLLING_NRFLAG	(1<<TIF_POLLING_NRFLAG)
 
 /* work to do on interrupt/exception return */
 #define _TIF_WORK_MASK \
-  (0x0000FFFF & ~(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT))
-#define _TIF_ALLWORK_MASK	0x0000FFFF	/* work to do on any return to u-space */
+  (0x0000FFFF & ~(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SECCOMP))
+#define _TIF_ALLWORK_MASK	(0x0000FFFF & ~_TIF_SECCOMP) /* work to do on any return to u-space */
 
 /*
  * Thread-synchronous status.
diff -urNp --exclude CVS --exclude BitKeeper --exclude {arch} --exclude .arch-ids 2.6.7/include/linux/sched.h seccomp/include/linux/sched.h
--- 2.6.7/include/linux/sched.h	2004-05-10 08:59:41.000000000 +0200
+++ seccomp/include/linux/sched.h	2004-07-04 17:34:34.601392040 +0200
@@ -480,6 +480,7 @@ struct task_struct {
 	
 	void *security;
 	struct audit_context *audit_context;
+	unsigned int seccomp_mode;
 
 /* Thread group tracking */
    	u32 parent_exec_id;
diff -urNp --exclude CVS --exclude BitKeeper --exclude {arch} --exclude .arch-ids 2.6.7/include/linux/seccomp.h seccomp/include/linux/seccomp.h
--- 2.6.7/include/linux/seccomp.h	1970-01-01 01:00:00.000000000 +0100
+++ seccomp/include/linux/seccomp.h	2004-07-04 17:39:40.097949504 +0200
@@ -0,0 +1,8 @@
+#ifndef _LINUX_SECCOMP_H
+#define _LINUX_SECCOMP_H
+
+#define NR_SECCOMP_MODES 1
+
+extern void secure_computing(int);
+
+#endif /* _LINUX_SECCOMP_H */
diff -urNp --exclude CVS --exclude BitKeeper --exclude {arch} --exclude .arch-ids 2.6.7/kernel/Makefile seccomp/kernel/Makefile
--- 2.6.7/kernel/Makefile	2004-05-10 08:59:41.000000000 +0200
+++ seccomp/kernel/Makefile	2004-07-04 18:28:31.347331864 +0200
@@ -7,7 +7,7 @@ obj-y     = sched.o fork.o exec_domain.o
 	    sysctl.o capability.o ptrace.o timer.o user.o \
 	    signal.o sys.o kmod.o workqueue.o pid.o \
 	    rcupdate.o intermodule.o extable.o params.o posix-timers.o \
-	    kthread.o
+	    kthread.o seccomp.o
 
 obj-$(CONFIG_FUTEX) += futex.o
 obj-$(CONFIG_GENERIC_ISA_DMA) += dma.o
diff -urNp --exclude CVS --exclude BitKeeper --exclude {arch} --exclude .arch-ids 2.6.7/kernel/seccomp.c seccomp/kernel/seccomp.c
--- 2.6.7/kernel/seccomp.c	1970-01-01 01:00:00.000000000 +0100
+++ seccomp/kernel/seccomp.c	2004-07-04 19:12:51.063993472 +0200
@@ -0,0 +1,54 @@
+/*
+ * linux/kernel/seccomp.c
+ *
+ * Copyright 2004  Andrea Arcangeli <andrea@cpushare.com>
+ *
+ * This defines a simple but solid secure-computing mode.
+ */
+
+#include <linux/seccomp.h>
+#include <linux/sched.h>
+#include <asm/unistd.h>
+
+/* #define SECCOMP_DEBUG 1 */
+
+/*
+ * Secure computing mode 1 allows only read/write/close/exit.
+ * To be fully secure this must be combined with rlimit
+ * to limit the stack allocations too.
+ */
+static int mode1_syscalls[] = {
+	__NR_read, __NR_write, __NR_exit,
+	/*
+	 * Allow either sigreturn or rt_sigreturn, newer archs
+	 * like x86-64 only defines __NR_rt_sigreturn.
+	 */
+#ifdef __NR_sigreturn
+	__NR_sigreturn,
+#else
+	__NR_rt_sigreturn,
+#endif
+};
+
+void secure_computing(int this_syscall)
+{
+	int mode = current->seccomp_mode;
+	int * syscall;
+
+	switch (mode) {
+	case 1:
+		for (syscall = mode1_syscalls;
+		     syscall < mode1_syscalls + sizeof(mode1_syscalls)/sizeof(int);
+		     syscall++)
+			if (*syscall == this_syscall)
+				return;
+		break;
+	default:
+		BUG();
+	}
+
+#ifdef SECCOMP_DEBUG
+	dump_stack();
+#endif
+	do_exit(SIGKILL);
+}

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: secure computing for 2.6.7
  2004-07-04 17:39 secure computing for 2.6.7 andrea
@ 2004-07-04 21:35 ` Andrew Morton
  2004-07-04 23:32   ` andrea
  2004-10-12 14:24   ` Andrea Arcangeli
  2004-07-07 19:27 ` Hans Reiser
  1 sibling, 2 replies; 42+ messages in thread
From: Andrew Morton @ 2004-07-04 21:35 UTC (permalink / raw)
  To: andrea; +Cc: linux-kernel

andrea@cpushare.com wrote:
>
> I need this new kernel feature for a reseach spare time project I'm
>  developing in the weekends.  The fast path cost is basically only the
>  s/testb/testw/ change in entry.S. (and even that might be removed with a
>  more signficant effort but I don't think anybody could worry about that
>  change).
> 
>  This might be better off for 2.7 but I would like if people could have a
>  look, and it's simple enough that it might be included in 2.6 too later
>  on. (it just need to be ported to the other archs, only x86 is
>  implemented here, but that's easy)
> 
>  Especially I would like to know if anybody can see an hole in this. This
>  is an order of magnitude more secure of chroot and of capabilities and
>  much simpler and it doesn't require root privilegies to activate. I
>  wasn't forced to take secure computing down into kernel space but I
>  believe it's the simplest and most secure and most efficient approch. An
>  userspace alternative would been to elaborate this below bytecode
>  userspace approch but besides being an order of magnitude slower it also
>  is a lot more complicated and less secure, and it keeps into the
>  equation the virtual machine that executes the code later on:
> 
>  	http://aspn.activestate.com/ASPN/Cookbook/Python/Recipe/286134

I'm not sure what to say about this, really.

Of course, yes, the patch is sufficiently safe and simple for it to be
mergeable in 2.6, if this is the way we want to do secure computing.  I'd
wonder whether the API should be syscall-based rather than /proc-based, and
whether there should be a config option for it.

But the wider questions are stuff like "where is all this coming from",
"where will it all end up" and "what are the alternatives".


^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: secure computing for 2.6.7
  2004-07-04 21:35 ` Andrew Morton
@ 2004-07-04 23:32   ` andrea
  2004-07-05  0:37     ` Phy Prabab
  2004-10-12 14:24   ` Andrea Arcangeli
  1 sibling, 1 reply; 42+ messages in thread
From: andrea @ 2004-07-04 23:32 UTC (permalink / raw)
  To: Andrew Morton; +Cc: linux-kernel

On Sun, Jul 04, 2004 at 02:35:26PM -0700, Andrew Morton wrote:
> Of course, yes, the patch is sufficiently safe and simple for it to be

Ok, great.

> mergeable in 2.6, if this is the way we want to do secure computing.  I'd

In the last weekends I evaluated many different ways to solve the issue
(most of them in userspace because they would have the huge advantage of
working in other OS too, the python way that parsed the bytecode looked
quite intriguing, but it's an order of magnitude slower compared to x86
bytecode and it was a lot more complex to make it work with the math
module and similar other safe operations, plus it was non portable to
non-x86 arch [though portable to other x86 OS] and I believe it was less
secure since the virtual machine was still involved).

At the end this linux centric kernel-space solution I'm proposing is the
only simple enough way that I would be confortable enough to trust
myself without feeling to risk anything, plus it will run the stuff at
full speed and with zero memory resource waste for another virtual
machine. This approach basically can only break if the cpu has bugs
(like 0xf00f or an mmx capable processor on a non-mmx aware OS, mmx is
not backwards compatible cpu feature w.r.t. security) but linux is
getting everything right in terms of cpu bugs.

BTW, of course this will also require a "safe" userspace loader, that
will take care of closing all file descriptors and to set the stack
rlimit before enabling the kernel feature, but that's very easy to
implement safely (even easier than the kernel side).

One interesting thing is that the vsyscalls will make gettimeofday
available too, but I don't think the output of gettimeofday can be
considered sensitive data. Though I need to keep an eye open on the
vsyscall page to be sure nothing sensitive goes in there.

> wonder whether the API should be syscall-based rather than /proc-based, and

I find the /proc-based simpler, but I certainly wouldn't be against
making it a syscall. So just let me know if you prefer to change it to a
syscall. The syscall would be a bit faster to run but it's not a fast
path.

> whether there should be a config option for it.

I don't think it worth to have a config option for this (you could
return to use testb instead of testw but it doesn't seem to be
significant enough to require a config option),

> But the wider questions are stuff like "where is all this coming from",
> "where will it all end up" and "what are the alternatives".

I'm not ready to talk about my usage, but it has absolutely nothing to do
with the kernel (except for needing this kind of feature from either the
kernel, or from an higher level virtual machine). So this probably
wouldn't be the appropriate forum.

Thanks a lot for the quick and positive comments.

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: secure computing for 2.6.7
  2004-07-04 23:32   ` andrea
@ 2004-07-05  0:37     ` Phy Prabab
  0 siblings, 0 replies; 42+ messages in thread
From: Phy Prabab @ 2004-07-05  0:37 UTC (permalink / raw)
  To: andrea, Andrew Morton; +Cc: linux-kernel

I noticed that there is a website (www.cpushare.com)
for what you are speaking of, however, I do not
clearly understand the scope of your project.  Is
there a breif definition/scope of what you are trying
to accomplish?

Thank you for your time.
Phy

--- andrea@cpushare.com wrote:
> On Sun, Jul 04, 2004 at 02:35:26PM -0700, Andrew
> Morton wrote:
> > Of course, yes, the patch is sufficiently safe and
> simple for it to be
> 
> Ok, great.
> 
> > mergeable in 2.6, if this is the way we want to do
> secure computing.  I'd
> 
> In the last weekends I evaluated many different ways
> to solve the issue
> (most of them in userspace because they would have
> the huge advantage of
> working in other OS too, the python way that parsed
> the bytecode looked
> quite intriguing, but it's an order of magnitude
> slower compared to x86
> bytecode and it was a lot more complex to make it
> work with the math
> module and similar other safe operations, plus it
> was non portable to
> non-x86 arch [though portable to other x86 OS] and I
> believe it was less
> secure since the virtual machine was still
> involved).
> 
> At the end this linux centric kernel-space solution
> I'm proposing is the
> only simple enough way that I would be confortable
> enough to trust
> myself without feeling to risk anything, plus it
> will run the stuff at
> full speed and with zero memory resource waste for
> another virtual
> machine. This approach basically can only break if
> the cpu has bugs
> (like 0xf00f or an mmx capable processor on a
> non-mmx aware OS, mmx is
> not backwards compatible cpu feature w.r.t.
> security) but linux is
> getting everything right in terms of cpu bugs.
> 
> BTW, of course this will also require a "safe"
> userspace loader, that
> will take care of closing all file descriptors and
> to set the stack
> rlimit before enabling the kernel feature, but
> that's very easy to
> implement safely (even easier than the kernel side).
> 
> One interesting thing is that the vsyscalls will
> make gettimeofday
> available too, but I don't think the output of
> gettimeofday can be
> considered sensitive data. Though I need to keep an
> eye open on the
> vsyscall page to be sure nothing sensitive goes in
> there.
> 
> > wonder whether the API should be syscall-based
> rather than /proc-based, and
> 
> I find the /proc-based simpler, but I certainly
> wouldn't be against
> making it a syscall. So just let me know if you
> prefer to change it to a
> syscall. The syscall would be a bit faster to run
> but it's not a fast
> path.
> 
> > whether there should be a config option for it.
> 
> I don't think it worth to have a config option for
> this (you could
> return to use testb instead of testw but it doesn't
> seem to be
> significant enough to require a config option),
> 
> > But the wider questions are stuff like "where is
> all this coming from",
> > "where will it all end up" and "what are the
> alternatives".
> 
> I'm not ready to talk about my usage, but it has
> absolutely nothing to do
> with the kernel (except for needing this kind of
> feature from either the
> kernel, or from an higher level virtual machine). So
> this probably
> wouldn't be the appropriate forum.
> 
> Thanks a lot for the quick and positive comments.
> -
> To unsubscribe from this list: send the line
> "unsubscribe linux-kernel" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at 
> http://vger.kernel.org/majordomo-info.html
> Please read the FAQ at  http://www.tux.org/lkml/
> 



		
__________________________________
Do you Yahoo!?
New and Improved Yahoo! Mail - Send 10MB messages!
http://promotions.yahoo.com/new_mail 

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: secure computing for 2.6.7
  2004-07-04 17:39 secure computing for 2.6.7 andrea
  2004-07-04 21:35 ` Andrew Morton
@ 2004-07-07 19:27 ` Hans Reiser
  2004-08-01 10:22   ` Andrea Arcangeli
  2004-08-03 12:40   ` Stephen Smalley
  1 sibling, 2 replies; 42+ messages in thread
From: Hans Reiser @ 2004-07-07 19:27 UTC (permalink / raw)
  To: andrea; +Cc: linux-kernel, Andrew Morton

Am I right to think that this could complement nicely our plans 
described at www.namesys.com/blackbox_security.html

?

If I understand right from the summaries, what Andrea does is exactly 
what we specify we won't do but is important in our proposal.

If anyone is interested, DARPA is funding our proposal, and I need to 
find a US based researcher to work with me on it (I hate nationalism and 
national discrimination in research projects..... oh well, my Russians 
will get paid better as a result of doing the less interesting private 
sector work instead....)

Hans

andrea@cpushare.com wrote:

>Hello,
>
>I need this new kernel feature for a reseach spare time project I'm
>developing in the weekends.  The fast path cost is basically only the
>s/testb/testw/ change in entry.S. (and even that might be removed with a
>more signficant effort but I don't think anybody could worry about that
>change).
>
>This might be better off for 2.7 but I would like if people could have a
>look, and it's simple enough that it might be included in 2.6 too later
>on. (it just need to be ported to the other archs, only x86 is
>implemented here, but that's easy)
>
>Especially I would like to know if anybody can see an hole in this. This
>is an order of magnitude more secure of chroot and of capabilities and
>much simpler and it doesn't require root privilegies to activate. I
>wasn't forced to take secure computing down into kernel space but I
>believe it's the simplest and most secure and most efficient approch. An
>userspace alternative would been to elaborate this below bytecode
>userspace approch but besides being an order of magnitude slower it also
>is a lot more complicated and less secure, and it keeps into the
>equation the virtual machine that executes the code later on:
>
>	http://aspn.activestate.com/ASPN/Cookbook/Python/Recipe/286134
>
>Furthermore I much prefer to run the bytecode on the bare hardware for
>performance reasons, and the less layering the more secure.
>
>I tested it with this:
>
>#include <stdio.h>
>#include <signal.h>
>#include <unistd.h>
>
>static void sigint(int s)
>{
>	printf("SIGINT\n");
>}
>static void sigpipe(int s)
>{
>	printf("SIGPIPE\n");
>	pause();
>}
>
>int main(void) {
>
>        signal(SIGINT, sigint);
>        signal(SIGPIPE, sigpipe);
>	printf("start\n");
>
>	while (1);
>        return 0;
>}
>
>on one shell:
>
>andrea@xeon:~> echo 1 > /proc/`pidof seccomp`/seccomp
>andrea@xeon:~> kill -SIGINT `pidof seccomp`
>andrea@xeon:~> kill -SIGINT `pidof seccomp`
>andrea@xeon:~> kill -SIGINT `pidof seccomp`
>andrea@xeon:~> kill -SIGINT `pidof seccomp`
>andrea@xeon:~> kill -SIGINT `pidof seccomp`
>andrea@xeon:~> kill -SIGINT `pidof seccomp`
>andrea@xeon:~> kill -SIGINT `pidof seccomp`
>andrea@xeon:~> kill -SIGPIPE `pidof seccomp`
>andrea@xeon:~> 
>
>on the other:
>andrea@xeon:~> ./seccomp
>start
>SIGINT
>SIGINT
>SIGINT
>SIGINT
>SIGINT
>SIGINT
>SIGINT
>SIGPIPE
>Killed
>andrea@xeon:~> echo $?
>137
>andrea@xeon:~> 
>
>(pause isn't allowed and the secure computing sigkill the task)
>
>diff -urNp --exclude CVS --exclude BitKeeper --exclude {arch} --exclude .arch-ids 2.6.7/arch/i386/kernel/entry.S seccomp/arch/i386/kernel/entry.S
>--- 2.6.7/arch/i386/kernel/entry.S	2004-05-10 08:59:10.000000000 +0200
>+++ seccomp/arch/i386/kernel/entry.S	2004-07-04 18:22:23.862198096 +0200
>@@ -163,12 +163,19 @@ do_lcall:
> 	movl %edx,EIP(%ebp)	# Now we move them to their "normal" places
> 	movl %ecx,CS(%ebp)	#
> 	GET_THREAD_INFO_WITH_ESP(%ebp)	# GET_THREAD_INFO
>+	/* call gates cannot run with SECCOMP enabled */
>+	testw $(_TIF_SECCOMP),TI_FLAGS(%ebp)
>+	jnz sigkill
> 	movl TI_EXEC_DOMAIN(%ebp), %edx	# Get the execution domain
> 	call *4(%edx)		# Call the lcall7 handler for the domain
> 	addl $4, %esp
> 	popl %eax
> 	jmp resume_userspace
> 
>+sigkill:
>+	pushl $9
>+	call do_exit		
>+
> ENTRY(lcall27)
> 	pushfl			# We get a different stack layout with call
> 				# gates, which has to be cleaned up later..
>@@ -264,7 +271,7 @@ sysenter_past_esp:
> 	cmpl $(nr_syscalls), %eax
> 	jae syscall_badsys
> 
>-	testb $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT),TI_FLAGS(%ebp)
>+	testw $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SECCOMP),TI_FLAGS(%ebp)
> 	jnz syscall_trace_entry
> 	call *sys_call_table(,%eax,4)
> 	movl %eax,EAX(%esp)
>@@ -287,7 +294,7 @@ ENTRY(system_call)
> 	cmpl $(nr_syscalls), %eax
> 	jae syscall_badsys
> 					# system call tracing in operation
>-	testb $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT),TI_FLAGS(%ebp)
>+	testw $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SECCOMP),TI_FLAGS(%ebp)
> 	jnz syscall_trace_entry
> syscall_call:
> 	call *sys_call_table(,%eax,4)
>diff -urNp --exclude CVS --exclude BitKeeper --exclude {arch} --exclude .arch-ids 2.6.7/arch/i386/kernel/ptrace.c seccomp/arch/i386/kernel/ptrace.c
>--- 2.6.7/arch/i386/kernel/ptrace.c	2004-05-10 08:59:10.000000000 +0200
>+++ seccomp/arch/i386/kernel/ptrace.c	2004-07-04 18:23:28.597356856 +0200
>@@ -15,6 +15,7 @@
> #include <linux/user.h>
> #include <linux/security.h>
> #include <linux/audit.h>
>+#include <linux/seccomp.h>
> 
> #include <asm/uaccess.h>
> #include <asm/pgtable.h>
>@@ -534,6 +535,8 @@ void do_syscall_trace(struct pt_regs *re
> 			audit_syscall_exit(current, regs->eax);
> 	}
> 
>+	if (unlikely(test_thread_flag(TIF_SECCOMP)))
>+		secure_computing(regs->orig_eax);
> 	if (!test_thread_flag(TIF_SYSCALL_TRACE))
> 		return;
> 	if (!(current->ptrace & PT_PTRACED))
>diff -urNp --exclude CVS --exclude BitKeeper --exclude {arch} --exclude .arch-ids 2.6.7/fs/proc/base.c seccomp/fs/proc/base.c
>--- 2.6.7/fs/proc/base.c	2004-05-10 08:59:34.000000000 +0200
>+++ seccomp/fs/proc/base.c	2004-07-04 18:43:37.103635976 +0200
>@@ -32,6 +32,7 @@
> #include <linux/mount.h>
> #include <linux/security.h>
> #include <linux/ptrace.h>
>+#include <linux/seccomp.h>
> 
> /*
>  * For hysterical raisins we keep the same inumbers as in the old procfs.
>@@ -48,6 +49,7 @@ enum pid_directory_inos {
> 	PROC_TGID_TASK,
> 	PROC_TGID_STATUS,
> 	PROC_TGID_MEM,
>+	PROC_TGID_SECCOMP,
> 	PROC_TGID_CWD,
> 	PROC_TGID_ROOT,
> 	PROC_TGID_EXE,
>@@ -71,6 +73,7 @@ enum pid_directory_inos {
> 	PROC_TID_INO,
> 	PROC_TID_STATUS,
> 	PROC_TID_MEM,
>+	PROC_TID_SECCOMP,
> 	PROC_TID_CWD,
> 	PROC_TID_ROOT,
> 	PROC_TID_EXE,
>@@ -113,6 +116,7 @@ static struct pid_entry tgid_base_stuff[
> 	E(PROC_TGID_STATM,     "statm",   S_IFREG|S_IRUGO),
> 	E(PROC_TGID_MAPS,      "maps",    S_IFREG|S_IRUGO),
> 	E(PROC_TGID_MEM,       "mem",     S_IFREG|S_IRUSR|S_IWUSR),
>+	E(PROC_TGID_SECCOMP,   "seccomp", S_IFREG|S_IRUSR|S_IWUSR),
> 	E(PROC_TGID_CWD,       "cwd",     S_IFLNK|S_IRWXUGO),
> 	E(PROC_TGID_ROOT,      "root",    S_IFLNK|S_IRWXUGO),
> 	E(PROC_TGID_EXE,       "exe",     S_IFLNK|S_IRWXUGO),
>@@ -135,6 +139,7 @@ static struct pid_entry tid_base_stuff[]
> 	E(PROC_TID_STATM,      "statm",   S_IFREG|S_IRUGO),
> 	E(PROC_TID_MAPS,       "maps",    S_IFREG|S_IRUGO),
> 	E(PROC_TID_MEM,        "mem",     S_IFREG|S_IRUSR|S_IWUSR),
>+	E(PROC_TID_SECCOMP,    "seccomp", S_IFREG|S_IRUSR|S_IWUSR),
> 	E(PROC_TID_CWD,        "cwd",     S_IFLNK|S_IRWXUGO),
> 	E(PROC_TID_ROOT,       "root",    S_IFLNK|S_IRWXUGO),
> 	E(PROC_TID_EXE,        "exe",     S_IFLNK|S_IRWXUGO),
>@@ -689,6 +694,58 @@ static struct inode_operations proc_mem_
> 	.permission	= proc_permission,
> };
> 
>+static ssize_t seccomp_read(struct file * file, char * buf,
>+			    size_t count, loff_t *ppos)
>+{
>+	struct task_struct * tsk = proc_task(file->f_dentry->d_inode);
>+	char __buf[20];
>+	loff_t __ppos = *ppos;
>+	size_t len;
>+
>+	len = sprintf(__buf, "%u\n", tsk->seccomp_mode) + 1;
>+	if (__ppos >= len)
>+		return 0;
>+	if (count > len-__ppos)
>+		count = len-__ppos;
>+	if (copy_to_user(buf, __buf + __ppos, count))
>+		return -EFAULT;
>+	*ppos += count;
>+	return count;
>+}
>+
>+static ssize_t seccomp_write(struct file * file, const char * buf,
>+			     size_t count, loff_t *ppos)
>+{
>+	struct task_struct * tsk = proc_task(file->f_dentry->d_inode);
>+	char __buf[20], * end;
>+	unsigned int seccomp_mode;
>+
>+	/* can set it only once to be even more secure */
>+	if (unlikely(tsk->seccomp_mode))
>+		return -EPERM;
>+
>+	memset(__buf, 0, 20);
>+	if (count > 19)
>+		count = 19;
>+	if (copy_from_user(__buf, buf, count))
>+		return -EFAULT;
>+	seccomp_mode = simple_strtoul(__buf, &end, 0);
>+	if (*end == '\n')
>+		end++;
>+	if (seccomp_mode && seccomp_mode <= NR_SECCOMP_MODES) {
>+		tsk->seccomp_mode = seccomp_mode;
>+		set_tsk_thread_flag(tsk, TIF_SECCOMP);
>+	}
>+	if (unlikely(!(end - __buf)))
>+		return -EIO;
>+	return end - __buf;
>+}
>+
>+static struct file_operations proc_seccomp_operations = {
>+	.read		= seccomp_read,
>+	.write		= seccomp_write,
>+};
>+
> static int proc_pid_follow_link(struct dentry *dentry, struct nameidata *nd)
> {
> 	struct inode *inode = dentry->d_inode;
>@@ -1342,6 +1399,10 @@ static struct dentry *proc_pident_lookup
> 			inode->i_op = &proc_mem_inode_operations;
> 			inode->i_fop = &proc_mem_operations;
> 			break;
>+		case PROC_TID_SECCOMP:
>+		case PROC_TGID_SECCOMP:
>+			inode->i_fop = &proc_seccomp_operations;
>+			break;
> 		case PROC_TID_MOUNTS:
> 		case PROC_TGID_MOUNTS:
> 			inode->i_fop = &proc_mounts_operations;
>Files 2.6.7/ID and seccomp/ID differ
>diff -urNp --exclude CVS --exclude BitKeeper --exclude {arch} --exclude .arch-ids 2.6.7/include/asm-i386/thread_info.h seccomp/include/asm-i386/thread_info.h
>--- 2.6.7/include/asm-i386/thread_info.h	2004-05-10 08:59:36.000000000 +0200
>+++ seccomp/include/asm-i386/thread_info.h	2004-07-04 18:25:17.304830808 +0200
>@@ -152,6 +152,7 @@ static inline unsigned long current_stac
> #define TIF_SINGLESTEP		4	/* restore singlestep on return to user mode */
> #define TIF_IRET		5	/* return with iret */
> #define TIF_SYSCALL_AUDIT	7	/* syscall auditing active */
>+#define TIF_SECCOMP		8	/* secure computing */
> #define TIF_POLLING_NRFLAG	16	/* true if poll_idle() is polling TIF_NEED_RESCHED */
> 
> #define _TIF_SYSCALL_TRACE	(1<<TIF_SYSCALL_TRACE)
>@@ -161,12 +162,13 @@ static inline unsigned long current_stac
> #define _TIF_SINGLESTEP		(1<<TIF_SINGLESTEP)
> #define _TIF_IRET		(1<<TIF_IRET)
> #define _TIF_SYSCALL_AUDIT	(1<<TIF_SYSCALL_AUDIT)
>+#define _TIF_SECCOMP		(1<<TIF_SECCOMP)
> #define _TIF_POLLING_NRFLAG	(1<<TIF_POLLING_NRFLAG)
> 
> /* work to do on interrupt/exception return */
> #define _TIF_WORK_MASK \
>-  (0x0000FFFF & ~(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT))
>-#define _TIF_ALLWORK_MASK	0x0000FFFF	/* work to do on any return to u-space */
>+  (0x0000FFFF & ~(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SECCOMP))
>+#define _TIF_ALLWORK_MASK	(0x0000FFFF & ~_TIF_SECCOMP) /* work to do on any return to u-space */
> 
> /*
>  * Thread-synchronous status.
>diff -urNp --exclude CVS --exclude BitKeeper --exclude {arch} --exclude .arch-ids 2.6.7/include/linux/sched.h seccomp/include/linux/sched.h
>--- 2.6.7/include/linux/sched.h	2004-05-10 08:59:41.000000000 +0200
>+++ seccomp/include/linux/sched.h	2004-07-04 17:34:34.601392040 +0200
>@@ -480,6 +480,7 @@ struct task_struct {
> 	
> 	void *security;
> 	struct audit_context *audit_context;
>+	unsigned int seccomp_mode;
> 
> /* Thread group tracking */
>    	u32 parent_exec_id;
>diff -urNp --exclude CVS --exclude BitKeeper --exclude {arch} --exclude .arch-ids 2.6.7/include/linux/seccomp.h seccomp/include/linux/seccomp.h
>--- 2.6.7/include/linux/seccomp.h	1970-01-01 01:00:00.000000000 +0100
>+++ seccomp/include/linux/seccomp.h	2004-07-04 17:39:40.097949504 +0200
>@@ -0,0 +1,8 @@
>+#ifndef _LINUX_SECCOMP_H
>+#define _LINUX_SECCOMP_H
>+
>+#define NR_SECCOMP_MODES 1
>+
>+extern void secure_computing(int);
>+
>+#endif /* _LINUX_SECCOMP_H */
>diff -urNp --exclude CVS --exclude BitKeeper --exclude {arch} --exclude .arch-ids 2.6.7/kernel/Makefile seccomp/kernel/Makefile
>--- 2.6.7/kernel/Makefile	2004-05-10 08:59:41.000000000 +0200
>+++ seccomp/kernel/Makefile	2004-07-04 18:28:31.347331864 +0200
>@@ -7,7 +7,7 @@ obj-y     = sched.o fork.o exec_domain.o
> 	    sysctl.o capability.o ptrace.o timer.o user.o \
> 	    signal.o sys.o kmod.o workqueue.o pid.o \
> 	    rcupdate.o intermodule.o extable.o params.o posix-timers.o \
>-	    kthread.o
>+	    kthread.o seccomp.o
> 
> obj-$(CONFIG_FUTEX) += futex.o
> obj-$(CONFIG_GENERIC_ISA_DMA) += dma.o
>diff -urNp --exclude CVS --exclude BitKeeper --exclude {arch} --exclude .arch-ids 2.6.7/kernel/seccomp.c seccomp/kernel/seccomp.c
>--- 2.6.7/kernel/seccomp.c	1970-01-01 01:00:00.000000000 +0100
>+++ seccomp/kernel/seccomp.c	2004-07-04 19:12:51.063993472 +0200
>@@ -0,0 +1,54 @@
>+/*
>+ * linux/kernel/seccomp.c
>+ *
>+ * Copyright 2004  Andrea Arcangeli <andrea@cpushare.com>
>+ *
>+ * This defines a simple but solid secure-computing mode.
>+ */
>+
>+#include <linux/seccomp.h>
>+#include <linux/sched.h>
>+#include <asm/unistd.h>
>+
>+/* #define SECCOMP_DEBUG 1 */
>+
>+/*
>+ * Secure computing mode 1 allows only read/write/close/exit.
>+ * To be fully secure this must be combined with rlimit
>+ * to limit the stack allocations too.
>+ */
>+static int mode1_syscalls[] = {
>+	__NR_read, __NR_write, __NR_exit,
>+	/*
>+	 * Allow either sigreturn or rt_sigreturn, newer archs
>+	 * like x86-64 only defines __NR_rt_sigreturn.
>+	 */
>+#ifdef __NR_sigreturn
>+	__NR_sigreturn,
>+#else
>+	__NR_rt_sigreturn,
>+#endif
>+};
>+
>+void secure_computing(int this_syscall)
>+{
>+	int mode = current->seccomp_mode;
>+	int * syscall;
>+
>+	switch (mode) {
>+	case 1:
>+		for (syscall = mode1_syscalls;
>+		     syscall < mode1_syscalls + sizeof(mode1_syscalls)/sizeof(int);
>+		     syscall++)
>+			if (*syscall == this_syscall)
>+				return;
>+		break;
>+	default:
>+		BUG();
>+	}
>+
>+#ifdef SECCOMP_DEBUG
>+	dump_stack();
>+#endif
>+	do_exit(SIGKILL);
>+}
>-
>To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
>the body of a message to majordomo@vger.kernel.org
>More majordomo info at  http://vger.kernel.org/majordomo-info.html
>Please read the FAQ at  http://www.tux.org/lkml/
>
>
>  
>


^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: secure computing for 2.6.7
  2004-07-07 19:27 ` Hans Reiser
@ 2004-08-01 10:22   ` Andrea Arcangeli
  2004-08-01 12:01     ` chris
  2004-08-01 14:55     ` Bernd Eckenfels
  2004-08-03 12:40   ` Stephen Smalley
  1 sibling, 2 replies; 42+ messages in thread
From: Andrea Arcangeli @ 2004-08-01 10:22 UTC (permalink / raw)
  To: Hans Reiser; +Cc: linux-kernel, Andrew Morton

On Wed, Jul 07, 2004 at 12:27:18PM -0700, Hans Reiser wrote:
> Am I right to think that this could complement nicely our plans 
> described at www.namesys.com/blackbox_security.html
> 
> ?

sounds like yes. However what I'm doing with the seccomp [2] mode is
much order of magnitude simpler and less generic, so I don't expect it
will be useful to many apps. When you mention in your document that
chroot "... is currently too much work for it to see as much usage as I
would like it to.", the seccomp mode doesn't help here since it's an
order of magnitude harder to jail stuff inside the seccomp mode than it
is for chroot, however the security guarantees are many more orders of
magnitude better than the ones provided by chroot so it very much worth
to use the seccomp mode it if you really need the maximal degree of
security like I do.

the seccomp mode is pratically only about computing securely inside an
"unbreakable" jail, not much else. Everything can be offloaded to a
seccomp computing process in theory and you can build an API with file
descriptors communicating with pipes or sockets. However it's not easy
to implement for general purpose applications like web browsers where
the untrusted scripts normally have to draw on the screen something too,
which means the API to the seccomp task would be quite huge, plus the
slowdown could be significant (though you could in theory very well
share xshm of the xserver with the seccomp-mode task). I've put a
demostration code in the website [3] that shows a task running in seccomp
mode, that is the right way to use it, no execve involved. Possibly the
only minor dependency is about the popen3 to close all file descriptors
before starting the task, but I've a quick check in the child too that
bugs out if something went wrong shall the behaviour ever change on the
python side.

Note that to get maximum security I cannot even trust the elf loader in
the kernel. there were bugs in the past that crashed the kernel by just
executing ./vmlinux on x86 ;). That code is way too complex to remotely
trust it. In short there's no way I can trust the ELF parsing and the
exec syscall at all.  All I trust is read/write/sigreturn/exit + the cpu
hardware and the context switch. I feel I can trust that just fine.
Statistically speaking the only thing that could have escaped seccomp
mode was the mmx feature that wasn't backwards compatible with previous
x86 cpus and people could sniff the mmx status of the other tasks, I was
involved myself in the kernel fix for that a few years ago, that's the
only scary bit (i.e. the cpu hardware dependency). well in theory I
could run an interpreter in seccomp mode, but that would be hundred
times slower and nuking 99/100 of the power of the thing isn't exactly
nice, so I'm going to trust the cpu hardware for now.

Note that I don't care much about whatever _DoS_ in the cpus like f00f
or kernel bugs leading to DoS like the TF bitflag being set in the
instruction before lcall or the fnclex, those will be autodetected and
stopped immediatly and IP address and time of the bad guy will be
trivial to identify. When any bad guy does attack the system it's
trivial to catch, the only thing I couldn't catch is the mmx sniffing
if nobody notices it from the sourcecode, but I'm confortable there are
no bugs like that anymore.

Now the last thing that would be nice to have on the kernel side as far
as cpushare is concerned (and the main reason for this email) is that
I'd like a more reliable way to know if a certain kernel is secure
enough to run stuff in seccomp mode. This is about disaster recovery,
after something very bad has happened if the seccomp mode jail broke.
Let's assume there's another cpu bug in the future in some future cpu
that will be shipped years from now, that will allow sniffing data from
other tasks like it happened with the mmx in the past. cpushare will
react to any security bug immediatly as soon as they're disclosed.
But then to restart the thing I need to identify reliably which users
can still run stuff in seccomp mode safely. I could use the kernel
version returned by uname -r, but think if an user applies his security
fix to his previous kernel and reboots the machine with the "fixed"
patch after a minute or if a vendor doesn't change its uname -r string
after applying a fix (not the case for SUSE of course, we always bump
the uname -r string after an update, but I don't know if every possible
vendor does that and I don't know if every possible linux user is using
vendor kernels).  If there was something like
/proc/sys/kernel/security_sequence [1] or anything like that that we
could increase every time we fix a kernel bug (note, it's not mandatory
to increase it every time, and we could increase it even weeks after the
bug has been fixed after somebody ask for it, and it would still work
fine) that would make life much easier since I wouldn't need to build an
ugly database of all kernel releases covering every possible
distribution (and I couldn't cover the people using self-built kernels
or cvs/bk-revisions).  Some ugly database may be needed anyways
eventually, like if the bug can only happen on a certain cpu revision, I
sure don't want to force everyone to upgrade nuking all userbase at
once, if the user enabled auto-upgrade I'd offload some python bytecode
that will check the cpu and it will give a warning to the users with the
"old cpus" and I'd only really stop the "new cpus" from executing in
seccomp mode unless they fix the bug and they bump the security_sequence
accordingly.  If the bug is very very bad and it can even modify stuff
outside the jail (I cannot exclude that could ever happen and I must be
ready to react for it anytime) I'd have to ask the user to reinstall his
machine from scratch (I'll keep track of every user security_sequence
and cpu revision of course). If it's an hardware bug and it cannot be
worked around in the kernel then I'll giveup even before checking the
security_sequence.

Especially considering the spread of virus on the internet in some
insecure OS (and considering nobody should ever trust an antivirus or a
virus definition installed _after_ a virus has already spread on the
machine), and considering the probability that the seccomp jail could
break, I believe cpushare is very safe and I couldn't be designed in a
safer way with current x86 hardware.

Somebody suggested ptrace could do something similar, but the tracer
could get killed with an oom, I need something unbreakable. The selinux
could do something similar but not everyone has it enabled and it
slowsdown things a bit (fixable for some big spinlock) but the seccomp
mode is also much simpler and in turn more secure than selinux (of
course this leads it to be less generic but I can live with that).

If people forgets to bump the sequence number with the fix that's fine
too, nothing bad will happen, but maybe Linus will get a patch from me
after a while that bumps the security number. The one thing I care is to
identify the buggy kernels reliably, I don't need to identify the secure
kernels reliably, fase positives are fine, false negatives not, and a
sequence number would work fine for that.

I could call it /proc/sys/kernel/seccomp_security_sequence too, but I
thought that calling it security_sequence and to have it more generic
would be fine. I will check that it's higher than a certain number, but
even if it's even higher I'm fine. I mean I don't expect to ever check
that sysctl for a value > 0, but I'm fine if every single security
bugfix bumps the kernel version, but it's up to the security people to
evaluate if it makes any sense to bump the kernel version or not.

To make an example when sendmail broke because they were the only ones
using the capability syscall of the kernel, they could have changed
their source to check for security_sequence to be > something.  Then
everybody downloading the new sendmail version would had no risk to run
their application on a insecure kernel. Checking for a certain
security_sequence at daemon-startup seems low-overhead enough that may
be useful not just for cpushare (under an #ifdef __linux__).

Note: I don't even need this patch applied right now, since I can
very easily interpret the missing "sysctl" as security_sequence -1, but
after a bug like fnclex that happened a few months ago I would need to
have this patch applied or I'd be _forced_ in a very big `uname -r`
total nightmare.

this wastes 4 bytes for the integer and a few bytes for the sysctl data
structure.

Comments/suggestions?

security_sequence patch against kernel CVS follows:

Signed-off-by: Andrea Arcangeli <andrea@cpushare.com>

Index: security-sequence/include/linux/sysctl.h
===================================================================
RCS file: /home/andrea/crypto/cvs/linux-2.5/include/linux/sysctl.h,v
retrieving revision 1.75
diff -u -p -r1.75 sysctl.h
--- security-sequence/include/linux/sysctl.h	24 Jun 2004 15:54:04 -0000	1.75
+++ security-sequence/include/linux/sysctl.h	1 Aug 2004 09:20:58 -0000
@@ -133,6 +133,7 @@ enum
 	KERN_NGROUPS_MAX=63,	/* int: NGROUPS_MAX */
 	KERN_SPARC_SCONS_PWROFF=64, /* int: serial console power-off halt */
 	KERN_HZ_TIMER=65,	/* int: hz timer on or off */
+	KERN_SECURITY_SEQUENCE=66,	/* int: security sequence number */
 };

Index: security-sequence/kernel/sysctl.c
===================================================================
RCS file: /home/andrea/crypto/cvs/linux-2.5/kernel/sysctl.c,v
retrieving revision 1.83
diff -u -p -r1.83 sysctl.c
--- security-sequence/kernel/sysctl.c	31 Jul 2004 05:49:36 -0000	1.83
+++ security-sequence/kernel/sysctl.c	1 Aug 2004 09:55:44 -0000
@@ -71,6 +71,15 @@ static int minolduid;

 static int ngroups_max = NGROUPS_MAX;

+/*
+ * bump this sequence number after fixing any kernel security bug
+ * that could render insecure some userspace application. This
+ * way future versions of the userpace application will be able
+ * to reliably make sure to run on a secure kernel.
+ * I hope 31bit are enough... ;).
+ */
+static int security_sequence;
+
 #ifdef CONFIG_KMOD
 extern char modprobe_path[];
 #endif
@@ -620,6 +629,14 @@ static ctl_table kern_table[] = {
 		.mode		= 0444,
 		.proc_handler	= &proc_dointvec,
 	},
+	{
+		.ctl_name	= KERN_SECURITY_SEQUENCE,
+		.procname	= "security_sequence",
+		.data		= &security_sequence,
+		.maxlen		= sizeof (int),
+		.mode		= 0444,
+		.proc_handler	= &proc_dointvec,
+	},
 	{ .ctl_name = 0 }
 };

[1]	http://www.us.kernel.org/pub/linux/kernel/people/andrea/patches/v2.6/2.6.8-rc2/security_sequence-1
[2]	http://www.us.kernel.org/pub/linux/kernel/people/andrea/patches/v2.6/2.6.8-rc2/seccomp-2
[3]	http://www.cpushare.com/download/download.php

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: secure computing for 2.6.7
  2004-08-01 10:22   ` Andrea Arcangeli
@ 2004-08-01 12:01     ` chris
  2004-08-01 15:01       ` Andrea Arcangeli
  2004-08-01 14:55     ` Bernd Eckenfels
  1 sibling, 1 reply; 42+ messages in thread
From: chris @ 2004-08-01 12:01 UTC (permalink / raw)
  To: Andrea Arcangeli; +Cc: Hans Reiser, linux-kernel, Andrew Morton

On Sun, 1 Aug 2004, Andrea Arcangeli wrote:

> sounds like yes. However what I'm doing with the seccomp [2] mode is
> much order of magnitude simpler and less generic, so I don't expect it
> will be useful to many apps. When you mention in your document that

Hi Andrea,

Do you have plans to generalize seccomp into somelike like a "syscall
firewall"? This _would_ be useful to many apps, and provide good security
benefits - for example, vsftpd does not need most of the previously-buggy
syscalls such as sysctl(), mremap() and execve(). But it does need more
than just read(), write() and exit()!

Cheers
Chris

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: secure computing for 2.6.7
  2004-08-01 10:22   ` Andrea Arcangeli
  2004-08-01 12:01     ` chris
@ 2004-08-01 14:55     ` Bernd Eckenfels
  2004-08-01 15:51       ` Andrea Arcangeli
  1 sibling, 1 reply; 42+ messages in thread
From: Bernd Eckenfels @ 2004-08-01 14:55 UTC (permalink / raw)
  To: linux-kernel

In article <20040801102231.GB6295@dualathlon.random> you wrote:
> /proc/sys/kernel/security_sequence [1] or anything like that that we
> could increase every time we fix a kernel bug (note, it's not mandatory
> to increase it every time, and we could increase it even weeks after the
> bug has been fixed after somebody ask for it, and it would still work
> fine)

Hmm.. yes, kind of patchlevel/build number.

I feel that you miss a solution for different branches, backporting and
partial fixes. Since that way it can only work for the official branches of
the kernel, it would be enough to check the minor number.

Personally I think it is better to keep that in the utsname.release. If you
realy want to have an integer, then add it for easy parsing and allow it to
have multiple parallel issuers:

For example like:

2.6.9_XXX(linux26.3501,MM.123)

where this has applied fix 3501 in the 2.6 branch and 123 according to
vendor MM, so you do not need to understand vendors XXX schema. However I am
not sure if you are willing to accept the fact, that backporters will then
raise the level to a value you will only expect for more recent versions,
i.e.:

2.6.9(linux26.3501)
-> security bug1 is discoverd and fixed
2.6.10pre1(linux26.3502)
-> security bug2 is discoverd and fixed
-> features are added
2.6.10(linux26.3503)

Now  somebody  decides to backport the bug1 fix:

2.6.9-2(linux26.3502)

and the bug2 fix:

2.6.9-3(linux26.3503) <- is level 3503 but does not have all 2.6.10 features?!

And it gets even more hairy: if only the bug2 fix is
backported, how can an application state that it needs that (without
impliciteley also reling on bug1 to be fixed)

Greetings
Bernd
-- 
eckes privat - http://www.eckes.org/
Project Freefire - http://www.freefire.org/

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: secure computing for 2.6.7
  2004-08-01 12:01     ` chris
@ 2004-08-01 15:01       ` Andrea Arcangeli
  2004-08-01 17:29         ` chris
  2004-08-03 12:48         ` Stephen Smalley
  0 siblings, 2 replies; 42+ messages in thread
From: Andrea Arcangeli @ 2004-08-01 15:01 UTC (permalink / raw)
  To: chris; +Cc: Hans Reiser, linux-kernel, Andrew Morton

On Sun, Aug 01, 2004 at 01:01:10PM +0100, chris@scary.beasts.org wrote:
> Hi Andrea,
> 
> Do you have plans to generalize seccomp into somelike like a "syscall
> firewall"? This _would_ be useful to many apps, and provide good security
> benefits - for example, vsftpd does not need most of the previously-buggy
> syscalls such as sysctl(), mremap() and execve(). But it does need more
> than just read(), write() and exit()!

Seems like a few people is interested in what you suggest above. it'd be
very trivial to add a seccomp-mode = 2 that adds more syscalls like the
socket syscalls like accept/sendfile/send/recv and also the open syscall
(which means you want to use chroot still).  In the code you can see I
wrote it so that more modes can be added freely. I mean it has some
flexibility already.  vsftpd could enable the seccomp mode 2 on itself
after it has initialized.

(this is only a trivial patch example of the extension) 

--- security-sequence/kernel/seccomp.c.~1~	2004-08-01 16:10:46.970806680 +0200
+++ security-sequence/kernel/seccomp.c	2004-08-01 16:17:17.537431528 +0200
@@ -30,12 +30,31 @@ static int mode1_syscalls[] = {
 #endif
 };

+/*
+ * Secure computing mode 2 is for network daemons.
+ */
+static int mode2_syscalls[] = {
+#ifdef __NR_sigreturn
+	__NR_rt_sigreturn,
+#endif
+	__NR_open, __NR_sendfile, __NR_sendfile64, __NR_close,
+	__NR_poll, __NR_fork, __NR_wait4, __NR_socketcall, __NR_getdents,
+	__NR_mmap2, __NR_munmap,
+};
+
 void secure_computing(int this_syscall)
 {
 	int mode = current->seccomp_mode;
 	int * syscall;

 	switch (mode) {
+	case 2:
+		for (syscall = mode2_syscalls;
+		     syscall < mode2_syscalls + sizeof(mode2_syscalls)/sizeof(int);
+		     syscall++)
+			if (*syscall == this_syscall)
+				return;
+		/* mode 2 extends mode 1: fallthrough */
 	case 1:
 		for (syscall = mode1_syscalls;
 		     syscall < mode1_syscalls + sizeof(mode1_syscalls)/sizeof(int);

the above might be enough to make a network daemon work, but it probably
would still need some userspace modification to ensure it fits, and you
may have to add some reasonably safe syscall that I might have forgotten
in the example.

plus you'd still need a chroot to limit the scope of the "open" syscall.

to make threading work futex and clone and some other is going to be
needed by glibc.

After that it would be _very_ secure thanks to the seccomp mode 2.
Especially the fact they've no way to exec is quite nice since they
always try to find the /bin/sh string somewhere to make the thing work.

I can imagine some people may want a true firewall configurable via
userspace, so that they can filter the syscall parameters too and they
can customize it as they need. the seccomp patch conceptually fits that
need just fine too but you've to write the code and extended it for that ;).
That would be seccomp mode >=3.

I'm posting these emails so much in advance just to raise discussion of
what people would like to see implemented so that it benefits everybody
so any extension is welcome.  OTOH while I will certainly help auditing
any extension of the seccomp mode I'm probably not going to have spare
resources to spend in writing that fully featured syscalltables firewall
mode >=3 that some people would like to see. I hope somebody else will
volounteer for that if there's an agreement that's the way to go ;). The
basic seccomp infrastructure/entry-point is there to build it. The
syscall parameters can be trivially passed down by adding a few more
params to the secure_computing function.

The important thing is to verify the API is extendible and I think it
is. The way this could work is to merge the patch as-is and to later add
a seccomp-mode == 2 for relaxed network daemon usage, and to later make
all seccomp modes from 3 to max-int to be configurable with a firewall
ala iptables when somebody volounteers to implement that. I really like
keeping mode 1 static and dumb (that is a feature), this way I'll never
risk somebody to mess the syscalltables firewall and to create an hole
in its own machine, and most important I would never need to depend on
the code in "case 3:" to be safe, since that is going to be a lot more
complicated and in turn less secure than the "case 1:" code.

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: secure computing for 2.6.7
  2004-08-01 14:55     ` Bernd Eckenfels
@ 2004-08-01 15:51       ` Andrea Arcangeli
  2004-08-01 17:24         ` Bernd Eckenfels
  2004-08-02  3:17         ` Horst von Brand
  0 siblings, 2 replies; 42+ messages in thread
From: Andrea Arcangeli @ 2004-08-01 15:51 UTC (permalink / raw)
  To: Bernd Eckenfels; +Cc: linux-kernel

On Sun, Aug 01, 2004 at 04:55:47PM +0200, Bernd Eckenfels wrote:
> Personally I think it is better to keep that in the utsname.release. If you
> realy want to have an integer, then add it for easy parsing and allow it to
> have multiple parallel issuers:
> 
> For example like:
> 
> 2.6.9_XXX(linux26.3501,MM.123)

I think it's overkill to pollue the `uname -r` with that detail. The
`uname -r` is about the release version nothing else.

> where this has applied fix 3501 in the 2.6 branch and 123 according to
> vendor MM, so you do not need to understand vendors XXX schema. However I am

if all vendors uses different numbers (i.e. vendor MM.123) then I can as
well build the ugly database in function of the `uname -r`
vendorization, building a database of uname -r or a database of
MM/linux-26/whatever isn't going to be any different.

> not sure if you are willing to accept the fact, that backporters will then
> raise the level to a value you will only expect for more recent versions,
> i.e.:
> 
> 2.6.9(linux26.3501)
> -> security bug1 is discoverd and fixed
> 2.6.10pre1(linux26.3502)
> -> security bug2 is discoverd and fixed
> -> features are added
> 2.6.10(linux26.3503)
> 
> Now  somebody  decides to backport the bug1 fix:
> 
> 2.6.9-2(linux26.3502)
> 
> and the bug2 fix:
> 
> 2.6.9-3(linux26.3503) <- is level 3503 but does not have all 2.6.10 features?!
> 
> And it gets even more hairy: if only the bug2 fix is
> backported, how can an application state that it needs that (without
> impliciteley also reling on bug1 to be fixed)

note this isn't a build number (the features in 2.6.10 don't matter at
all, the only thing it matters is that all security bugs up to 3503 are
included). This would be kernel wide for 2.4, 2.2 and 2.6. If 2.6 has a
bug and 2.4 not, then 2.4 can bump the number without any other
modification. because this isn't a build number, there are very few
fixes that would require bumping the security_sequence number, so it's
not like we're going to ever see 3503 in our life. The last one that
would require bumping the sequence number was probably fnclex months
ago, previously there was mremap (mremap wasn't relevant for seccomp but
some app might want to know about it). the the only one relevant for
seccomp mode 1 was fnclex and the only previous one I can recall
relevant for seccomp was the mmx sniffing a few years ago.

If a vendor ships you a kernel with security_sequence 3502 but it misses
the _few_ needed bugfixes that are supposedly included in a kernel with
security sequence 3502, than such a vendor could as well ship you a
kernel with seccomp mode 1 doing nothing or if you prefer they could
ship a kernel with a bug in the tcp stack that gains root by sending an
escape sequence on the wire, you get the idea. Then you could just
prefer to change vendor and run a secure OS ;). I exclude that a vendor
could intentionally not want to backport all needed fixes that requires
bumping the sequence number.  And if they don't want to ship a "secure"
kernel they've simply not to touch the value of their security_sequence
and everything will work securely, they can just delete the rejects on
the sequence number modifications. You will definitely get a reject if
you apply a fix that bumps the sequence number and you miss the previous
fix that bumped the sequence number, so it can't go unnoticed (and
vendors are required to check rejects while applying patches). If
everyone bumps the number with security fixes this would actually _help_
increasing security, since even people self compiling kernels would
notice they miss some bugfix as they get the reject.

If people don't like the idea no problem, as worse I can build a
vendor-tuned security database in function of `uname -r` but it'll be
less secure and it'll be less finegrined than it could. I just thought a
sequence_number could be useful to something more than just knowing when
I can run safely in seccomp mode. Or if you prefer I can rename it to
seccomp_security_sequence, this way it would need a change only once
every few years at most and none of the kernel developers will have to
care about it and I'll take care to update it myself when needed (i.e.
ideally never ;).

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: secure computing for 2.6.7
  2004-08-01 15:51       ` Andrea Arcangeli
@ 2004-08-01 17:24         ` Bernd Eckenfels
  2004-08-02  3:17         ` Horst von Brand
  1 sibling, 0 replies; 42+ messages in thread
From: Bernd Eckenfels @ 2004-08-01 17:24 UTC (permalink / raw)
  To: linux-kernel

In article <20040801155128.GG6295@dualathlon.random> you wrote:
>> where this has applied fix 3501 in the 2.6 branch and 123 according to
>> vendor MM, so you do not need to understand vendors XXX schema. However I am
> 
> if all vendors uses different numbers (i.e. vendor MM.123) then I can as
> well build the ugly database in function of the `uname -r`
> vendorization, building a database of uname -r or a database of
> MM/linux-26/whatever isn't going to be any different.

This is not about different vendors certifying the same level. Mainstream
software will always require a general fix-level, however internal software
may require some other hardening/configuration. Distributions and even end
users could use that string to certify "this build is compliant to
requirement level 123"

But I agree, it might be overkill for the utsname, however I feel leaving
that out of a secure level syscall may be underkill .)

Greetings
Bernd
-- 
eckes privat - http://www.eckes.org/
Project Freefire - http://www.freefire.org/

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: secure computing for 2.6.7
  2004-08-01 15:01       ` Andrea Arcangeli
@ 2004-08-01 17:29         ` chris
  2004-08-01 18:52           ` Bernd Eckenfels
                             ` (2 more replies)
  2004-08-03 12:48         ` Stephen Smalley
  1 sibling, 3 replies; 42+ messages in thread
From: chris @ 2004-08-01 17:29 UTC (permalink / raw)
  To: Andrea Arcangeli; +Cc: chris, Hans Reiser, linux-kernel, Andrew Morton

On Sun, 1 Aug 2004, Andrea Arcangeli wrote:

> On Sun, Aug 01, 2004 at 01:01:10PM +0100, chris@scary.beasts.org wrote:
> > Hi Andrea,
> >
> > Do you have plans to generalize seccomp into somelike like a "syscall
> > firewall"? This _would_ be useful to many apps, and provide good security

[...]

> Seems like a few people is interested in what you suggest above. it'd be
> very trivial to add a seccomp-mode = 2 that adds more syscalls like the
> socket syscalls like accept/sendfile/send/recv and also the open syscall
> (which means you want to use chroot still).  In the code you can see I
> wrote it so that more modes can be added freely. I mean it has some
> flexibility already.  vsftpd could enable the seccomp mode 2 on itself
> after it has initialized.

Using the above approach, we (the app writers) would never agree on the
syscall lists required for different seccomp modes ;-)

How hard would it be to have a per-task bitmap of syscalls allowed? This
way, a task could restrict to the exact subset of syscalls required for
maximum security.
The bitmap would
- Be allocated on demand (for no overhead in the common case)
- Deny all syscalls not covered by the supplied bitmap, to cater for
syscall table expansion
- Be inherited across fork and (probably) shared across clone

Cheers
Chris

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: secure computing for 2.6.7
  2004-08-01 17:29         ` chris
@ 2004-08-01 18:52           ` Bernd Eckenfels
  2004-08-01 20:45           ` Alan Cox
  2004-08-01 23:06           ` Andrea Arcangeli
  2 siblings, 0 replies; 42+ messages in thread
From: Bernd Eckenfels @ 2004-08-01 18:52 UTC (permalink / raw)
  To: linux-kernel

In article <Pine.LNX.4.58.0408011801260.1368@sphinx.mythic-beasts.com> you wrote:
> How hard would it be to have a per-task bitmap of syscalls allowed? This
> way, a task could restrict to the exact subset of syscalls required for
> maximum security.

Which somewhat overlaps with the user-priveldeges patches and some lsm
modules, so i am not sure if this needs to be small and lean to be useful on
its own.

Bernd
-- 
eckes privat - http://www.eckes.org/
Project Freefire - http://www.freefire.org/

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: secure computing for 2.6.7
  2004-08-01 17:29         ` chris
  2004-08-01 18:52           ` Bernd Eckenfels
@ 2004-08-01 20:45           ` Alan Cox
  2004-08-01 23:10             ` Andrea Arcangeli
  2004-08-01 23:06           ` Andrea Arcangeli
  2 siblings, 1 reply; 42+ messages in thread
From: Alan Cox @ 2004-08-01 20:45 UTC (permalink / raw)
  To: chris
  Cc: Andrea Arcangeli, Hans Reiser, Linux Kernel Mailing List,
	Andrew Morton

On Sul, 2004-08-01 at 18:29, chris@scary.beasts.org wrote:
> How hard would it be to have a per-task bitmap of syscalls allowed? This
> way, a task could restrict to the exact subset of syscalls required for
> maximum security.

Very easy indeed, although you might have to do a tiny bit of tweaking
for kernel made syscalls (eg hotplug triggers).

You can already do all of this using several user space applications
that manage it via ptrace. They do have a performance hit however.


^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: secure computing for 2.6.7
  2004-08-01 17:29         ` chris
  2004-08-01 18:52           ` Bernd Eckenfels
  2004-08-01 20:45           ` Alan Cox
@ 2004-08-01 23:06           ` Andrea Arcangeli
  2004-08-02  6:52             ` David Wagner
  2 siblings, 1 reply; 42+ messages in thread
From: Andrea Arcangeli @ 2004-08-01 23:06 UTC (permalink / raw)
  To: chris; +Cc: Hans Reiser, linux-kernel, Andrew Morton

On Sun, Aug 01, 2004 at 06:29:05PM +0100, chris@scary.beasts.org wrote:
> Using the above approach, we (the app writers) would never agree on the
> syscall lists required for different seccomp modes ;-)

I see the problem ;).

> How hard would it be to have a per-task bitmap of syscalls allowed? This
> way, a task could restrict to the exact subset of syscalls required for
> maximum security.
> The bitmap would
> - Be allocated on demand (for no overhead in the common case)
> - Deny all syscalls not covered by the supplied bitmap, to cater for
> syscall table expansion
> - Be inherited across fork and (probably) shared across clone

your app will have then to learn about the syscall details of every
arch (which is normally a kernel internal thing), the most obvious
example is the difference between the sigreturn and rt_sigreturn, plus
the syscall numbers vary across every arch and the bitmap will have to
differ depending on the architecture (while the seccomp mode number is a
fixed interface for all archs and it hides all internal details like
sigreturn/rt_sigreturn). The one thing I don't like is that if somebody
changes the signal frame to use a new_rt_sigreturn the app will break
and I'll have to upload an update and I'll have again to check for uname
-r to know which kernel has to enable what. I mean when the new
behaviour is introduced it won't be too bad, it'll just get a false
positive sigkill, it could happen as well if somebody forgets to update
seccomp.c after changing the signal frame.

While I only get disavantages from the bitmap, if people really want the
arch dependent bitmap I'm certainly able to put kernel architectural
internal knowledge into some python code that will build the right
bitmap depending on the arch and depending on the uname -r.

So it's up to you. Feel free to discuss and choose what you prefer. I'm
biased and I prefer seccomp, you can still implement the bitmap on top
of seccomp as seccomp mode == 2. I'm not saying you shouldn't get the
bitmap, my previous suggestion of syscalltable that would parse as well
the parameters was a lot more complicated than the bitmap, doing the
bitmap on top of seccomp will be easy (we could add some more storage
into the seccomp file too, so if you write number 2 followed by data,
the kernel will allocate such later data afte the first 32bits, as a
bitmap). And still the seccomp mode will provide you the infrastructure
and the entry point. This is actually simple enough (not comparable to
the syscalltables) that I can implement it myself if you agree on this
direction (next weekend).

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: secure computing for 2.6.7
  2004-08-01 23:10             ` Andrea Arcangeli
@ 2004-08-01 23:08               ` Alan Cox
  2004-08-02 10:25                 ` Andrea Arcangeli
  0 siblings, 1 reply; 42+ messages in thread
From: Alan Cox @ 2004-08-01 23:08 UTC (permalink / raw)
  To: Andrea Arcangeli
  Cc: chris, Hans Reiser, Linux Kernel Mailing List, Andrew Morton

On Llu, 2004-08-02 at 00:10, Andrea Arcangeli wrote:
> On Sun, Aug 01, 2004 at 09:45:14PM +0100, Alan Cox wrote:
> > You can already do all of this using several user space applications
> > that manage it via ptrace. They do have a performance hit however.
> 
> the tracer can be killed by oom due some other random app in the
> machine, plus SIGCHLD may confuse the tracer, then it needs to know
> about arch details again (like the bitmap), and the whole ptrace
> infastructure is a lot more complicate and in turn less secure. syscall
> performance is the last worry (at least for my usage).

syscall performance is something the other 99.99% of users not using the
feature will care about however. 

One of the things that you can sensibly do I think which will also avoid
a performance hit is to use the same kernel path as strace and friends
do for syscall tracing but capture and verify the syscall in kernel mode
rather than trapping back out. That will at least keep the usual fast
path unharmed by the security toys.



^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: secure computing for 2.6.7
  2004-08-01 20:45           ` Alan Cox
@ 2004-08-01 23:10             ` Andrea Arcangeli
  2004-08-01 23:08               ` Alan Cox
  0 siblings, 1 reply; 42+ messages in thread
From: Andrea Arcangeli @ 2004-08-01 23:10 UTC (permalink / raw)
  To: Alan Cox
  Cc: chris, Andrea Arcangeli, Hans Reiser, Linux Kernel Mailing List,
	Andrew Morton

On Sun, Aug 01, 2004 at 09:45:14PM +0100, Alan Cox wrote:
> You can already do all of this using several user space applications
> that manage it via ptrace. They do have a performance hit however.

the tracer can be killed by oom due some other random app in the
machine, plus SIGCHLD may confuse the tracer, then it needs to know
about arch details again (like the bitmap), and the whole ptrace
infastructure is a lot more complicate and in turn less secure. syscall
performance is the last worry (at least for my usage).

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: secure computing for 2.6.7
       [not found]   ` <2olLt-4wI-5@gated-at.bofh.it>
@ 2004-08-02  0:05     ` Andi Kleen
  2004-08-02 10:19       ` Andrea Arcangeli
  2004-08-04 13:18       ` V13
  0 siblings, 2 replies; 42+ messages in thread
From: Andi Kleen @ 2004-08-02  0:05 UTC (permalink / raw)
  To: Andrea Arcangeli; +Cc: linux-kernel

Andrea Arcangeli <andrea@cpushare.com> writes:

> +/*
> + * bump this sequence number after fixing any kernel security bug
> + * that could render insecure some userspace application. This
> + * way future versions of the userpace application will be able
> + * to reliably make sure to run on a secure kernel.
> + * I hope 31bit are enough... ;).
> + */
> +static int security_sequence;

I don't think a sequence number is a good idea. Consider a
vendor/third party kernel fixing a security bug, but mainline hasn't
taken the patch yet for some reason.

The vendor kernel could not safely increase this number, because it 
could conflict with some other security bug fixed in mainline at the 
same time. 

The end result would be that the kernel would be fixed, but 
the application has no way to tell.

Better may be a bitmap, but even there you still have an problem 
with allocating these bits. 

A safe solution would be a file in /proc that lists CAN idenitifiers of
fixed bugs or similar, but that may be quite some overhead to maintain
and parse.

-Andi

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: secure computing for 2.6.7
  2004-08-01 15:51       ` Andrea Arcangeli
  2004-08-01 17:24         ` Bernd Eckenfels
@ 2004-08-02  3:17         ` Horst von Brand
  2004-08-02 16:31           ` Andrea Arcangeli
  1 sibling, 1 reply; 42+ messages in thread
From: Horst von Brand @ 2004-08-02  3:17 UTC (permalink / raw)
  To: Andrea Arcangeli; +Cc: Bernd Eckenfels, linux-kernel

Andrea Arcangeli <andrea@suse.de> said:

[...]

> note this isn't a build number (the features in 2.6.10 don't matter at
> all, the only thing it matters is that all security bugs up to 3503 are
> included).

Pray tell, how do you know if a random "compiler warning fix" isn't a plug
for an exploitable hole, and if a "security fix" really does fix a real
security problem that can be abused?

Truth is, you can never know. So, this degenerates into sequential patch
numbering, which is completely hopeless.
-- 
Dr. Horst H. von Brand                   User #22616 counter.li.org
Departamento de Informatica                     Fono: +56 32 654431
Universidad Tecnica Federico Santa Maria              +56 32 654239
Casilla 110-V, Valparaiso, Chile                Fax:  +56 32 797513

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: secure computing for 2.6.7
  2004-08-01 23:06           ` Andrea Arcangeli
@ 2004-08-02  6:52             ` David Wagner
  0 siblings, 0 replies; 42+ messages in thread
From: David Wagner @ 2004-08-02  6:52 UTC (permalink / raw)
  To: linux-kernel

Andrea Arcangeli  wrote:
>On Sun, Aug 01, 2004 at 06:29:05PM +0100, chris@scary.beasts.org wrote:
>> How hard would it be to have a per-task bitmap of syscalls allowed?
>
>your app will have then to learn about the syscall details of every
>arch (which is normally a kernel internal thing),

I'm not convinced this is a big deal.  In security, you always white
list known safe operations (never black list unsafe ones!).  Therefore,
you only white list the ones you know, and the result will be fail-safe
when porting to new architectures.

If the only hard case is *sigreturn(), it's not too hard to hard-code
that once and be done with it.

It seems like a bitmap will be much more flexible.  I already spotted
issues with the list of syscalls someone else posted (it included open(),
if I recall correctly), and I bet others would dislike any list I would
come up with.  My experience working with experimental tools like these
is that different apps may need different restrictions.

>the syscall numbers vary across every arch 

Isn't that what #include <sys/syscall.h> is for?  

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: secure computing for 2.6.7
  2004-08-02  0:05     ` Andi Kleen
@ 2004-08-02 10:19       ` Andrea Arcangeli
  2004-08-02 19:06         ` Rik van Riel
  2004-08-04 13:18       ` V13
  1 sibling, 1 reply; 42+ messages in thread
From: Andrea Arcangeli @ 2004-08-02 10:19 UTC (permalink / raw)
  To: Andi Kleen; +Cc: linux-kernel

Hi Andi,

On Mon, Aug 02, 2004 at 02:05:20AM +0200, Andi Kleen wrote:
> I don't think a sequence number is a good idea. Consider a
> vendor/third party kernel fixing a security bug, but mainline hasn't
> taken the patch yet for some reason.

does this really happen? The thing I'm thinking about here is the fnclex
and f00f kind of bugs. Or even worse ones. Those bugs area _always_
fixed in mainline too. Maybe I should simply rename it to
seccomp_security_sequence instead so nobody has to touch it for years
(ideally forever ;).

Also note, not increasing the sequence number isn't fatal, false
positives are fine, what I cannot accept are false negatives. So if
mainline didn't issue the new number yet, nobody would be forced to
increase that number. As far as I'm concerned (for my usage) I would
take care of pushing the fix into mainline myself, so I'm not worried
about mainline not including the security fixes.

This is all about disaster recovery, it doesn't necessairly need to be
efficient if the thing is difficult to fix (like workarounding future
nasty hardware bugs, which may take time), it only must be safe. Even if
mainline increases the number after months I don't mind as far as no
false negatives are generated.

For relatively easy things like fnclex that didn't clear the exceptions,
vendor-sec would be the place assigned to increase the numbers so that
new kernels would be synchronized and released with a new number all at
the same time (including mainline).

> The vendor kernel could not safely increase this number, because it 
> could conflict with some other security bug fixed in mainline at the 
> same time. 

Yep. But the vendor kernel isn't forced to increase it, as far as it
doesn't create false negatives that's fine, it's not that urgent to
increase it.

> A safe solution would be a file in /proc that lists CAN idenitifiers of
> fixed bugs or similar, but that may be quite some overhead to maintain
> and parse.

this would be perfectly fine too, the only bad thing is the wasting of
kernel memory for that. 1 byte of securty_sequence was a minor issue. If
we go this way, then we should be build knowledge into the boot process
so that this information is dumped into /var/log/kernel_security as
world readable during boot time, and then released from the kernel (this
shouldn't be too difficult to implement).

building the `uname -r` universal vendor database for every possible
distro version is doable too, I don't strictly need to add this
security_sequence (or even better a CAN list to dump in a
kernel_security directory), but I feel much safer (and I'd be a lot
simpler) in having a vendor neutral standard way to check if the kernel
is secure enough to run stuff in seccomp mode than to relay on `uname
-r` checks.

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: secure computing for 2.6.7
  2004-08-01 23:08               ` Alan Cox
@ 2004-08-02 10:25                 ` Andrea Arcangeli
  0 siblings, 0 replies; 42+ messages in thread
From: Andrea Arcangeli @ 2004-08-02 10:25 UTC (permalink / raw)
  To: Alan Cox; +Cc: chris, Hans Reiser, Linux Kernel Mailing List, Andrew Morton

Hi Alan,

On Mon, Aug 02, 2004 at 12:08:00AM +0100, Alan Cox wrote:
> One of the things that you can sensibly do I think which will also avoid
> a performance hit is to use the same kernel path as strace and friends
> do for syscall tracing but capture and verify the syscall in kernel mode
> rather than trapping back out. That will at least keep the usual fast
> path unharmed by the security toys.

The usual fast path is already unharmed by the security toys like
syscall auditing and seccomp mode (my only slowdown I introduced with
seccomp was to replace a testb with a testw, that doesn't count as a
slowdown, does it?). The only way to speedup the usual fast path is by
removing ptrace, everything already hooks into the ptrace testb/w.

About ptrace major issue being speed that's mostly true, but I'm not
sure what happens to uml if the tracer gets killed by oom, maybe it just
crashes at the next syscall, maybe not, dunno. I feel a lot safer in
having the secure computing happening inside seccomp than inside uml
from my part. I considered using uml before doing seccomp, but seccomp
was going to be a lot simpler and safer (and faster but I don't need
throughput in read/write/sigreturn/exit ;).

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: secure computing for 2.6.7
  2004-08-02  3:17         ` Horst von Brand
@ 2004-08-02 16:31           ` Andrea Arcangeli
  0 siblings, 0 replies; 42+ messages in thread
From: Andrea Arcangeli @ 2004-08-02 16:31 UTC (permalink / raw)
  To: Horst von Brand; +Cc: Bernd Eckenfels, linux-kernel

On Sun, Aug 01, 2004 at 11:17:19PM -0400, Horst von Brand wrote:
> Andrea Arcangeli <andrea@suse.de> said:
> 
> [...]
> 
> > note this isn't a build number (the features in 2.6.10 don't matter at
> > all, the only thing it matters is that all security bugs up to 3503 are
> > included).
> 
> Pray tell, how do you know if a random "compiler warning fix" isn't a plug
> for an exploitable hole, and if a "security fix" really does fix a real
> security problem that can be abused?
> 
> Truth is, you can never know. So, this degenerates into sequential patch
> numbering, which is completely hopeless.

nothing is perfect. keeping track of a few sporadic kernel builds with
unsafe compiler with `uname -r` is quite easy compared to keeping track
of every security `uname -r` out there. It's about the common case
working well (common case is like fnclex), corner cases will have to be
handled with a db anyways, but it'll be much simpler to single out a few
spoardic `uname -r` than to keep track of everything in the common cases
too.

For example if a new bug triggers only on a certain buggy future cpu, I
don't want to shutdown the whole thing but I'll have a db that will
single out only such specific cpu if the security_sequence is lower than
N.

But anyways I start to think I should probably rename it to
seccomp_security_sequence, so that it's not going to degenerate in the
sequential patch numbering and it'll really work well for the common
case since there's a seccomp relevant bug less than once every 2 years
or less (and half the time they're hardware related and not a software
issues).

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: secure computing for 2.6.7
  2004-08-02 10:19       ` Andrea Arcangeli
@ 2004-08-02 19:06         ` Rik van Riel
  2004-08-02 21:35           ` Andrea Arcangeli
  0 siblings, 1 reply; 42+ messages in thread
From: Rik van Riel @ 2004-08-02 19:06 UTC (permalink / raw)
  To: Andrea Arcangeli; +Cc: Andi Kleen, linux-kernel

On Mon, 2 Aug 2004, Andrea Arcangeli wrote:
> On Mon, Aug 02, 2004 at 02:05:20AM +0200, Andi Kleen wrote:
> > I don't think a sequence number is a good idea. Consider a
> > vendor/third party kernel fixing a security bug, but mainline hasn't
> > taken the patch yet for some reason.
> 
> does this really happen?

Think EVMS in a certain SuSE kernel.  Hard to imagine
no security bugs got fixed in that code ;)

-- 
"Debugging is twice as hard as writing the code in the first place.
Therefore, if you write the code as cleverly as possible, you are,
by definition, not smart enough to debug it." - Brian W. Kernighan


^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: secure computing for 2.6.7
  2004-08-02 19:06         ` Rik van Riel
@ 2004-08-02 21:35           ` Andrea Arcangeli
  0 siblings, 0 replies; 42+ messages in thread
From: Andrea Arcangeli @ 2004-08-02 21:35 UTC (permalink / raw)
  To: Rik van Riel; +Cc: Andi Kleen, linux-kernel

On Mon, Aug 02, 2004 at 03:06:48PM -0400, Rik van Riel wrote:
> Think EVMS in a certain SuSE kernel.  Hard to imagine
> no security bugs got fixed in that code ;)

we make sure they're obviously safe in security terms before applying so
that was really a bad example.

But let's assume there's a real seccomp relevant bug in a RH kernel,
it's still zerocost to bump the security sequence all over the place (in
SUSE and mainline too), just like 2.4 would need to bump the sequence
number too if we find a 2.6-only bug. So there's absolutely no problem
at all even in such a case.

The only issue I can see after the complains I heard so far, is that
it could be too complicated for the community to synchronize and agree
on the ID for every security related patch (rejects pain or inefficient
communication could make it not feasible).

But seccomp bugs are so rare and so extremely severe for the whole
userbase (not only for people using seccomp mode, think f00f or fnclex
or mmx sniffing) that this will actually work fine, just like I hope we
can successfuly agree and synchronize on the syscall numbers that also
are added rarely.

What I mean is that the seccomp_security_sequence is going to work fine
as far as the syscalls works fine, and that's the only thing I need as
far as cpushare is concerned.

But I certainly agree with Andi that we might prefer to take the CAN
way, that way it won't help only seccomp userbase, and it'll be possibly
easier to maintain since we don't need to synchronize ourself, but we'll
relay on somebody else to issue unique ID for us which makes the ID
selection a no brainer. plus it provides a bit more of information just
in case somebody forgot to fix a security bug. Though I'd expect heavy
rejects on that file if you forget to apply a security fix (which to me
was a feature but apparently somebody thinks is just lower flexibility
to get rejects if your kernel is going to be insecure).

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: secure computing for 2.6.7
  2004-07-07 19:27 ` Hans Reiser
  2004-08-01 10:22   ` Andrea Arcangeli
@ 2004-08-03 12:40   ` Stephen Smalley
  2004-08-03 21:02     ` Alexander Lyamin
                       ` (2 more replies)
  1 sibling, 3 replies; 42+ messages in thread
From: Stephen Smalley @ 2004-08-03 12:40 UTC (permalink / raw)
  To: Hans Reiser; +Cc: andrea, lkml, Andrew Morton

On Wed, 2004-07-07 at 15:27, Hans Reiser wrote:
> Am I right to think that this could complement nicely our plans 
> described at www.namesys.com/blackbox_security.html

Hi Hans,

Out of curiosity, what do you think that this proposal will achieve that
cannot already be done via SELinux policy?  SELinux policy can already
express access rules based not only on the executable and user, but even
the entire call chain that led to a given executable.

-- 
Stephen Smalley <sds@epoch.ncsc.mil>
National Security Agency


^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: secure computing for 2.6.7
  2004-08-01 15:01       ` Andrea Arcangeli
  2004-08-01 17:29         ` chris
@ 2004-08-03 12:48         ` Stephen Smalley
  1 sibling, 0 replies; 42+ messages in thread
From: Stephen Smalley @ 2004-08-03 12:48 UTC (permalink / raw)
  To: Andrea Arcangeli; +Cc: chris, Hans Reiser, lkml, Andrew Morton

On Sun, 2004-08-01 at 11:01, Andrea Arcangeli wrote:
> Seems like a few people is interested in what you suggest above. it'd be
> very trivial to add a seccomp-mode = 2 that adds more syscalls like the
> socket syscalls like accept/sendfile/send/recv and also the open syscall
> (which means you want to use chroot still).  In the code you can see I
> wrote it so that more modes can be added freely. I mean it has some
> flexibility already.  vsftpd could enable the seccomp mode 2 on itself
> after it has initialized.

As you begin moving toward increasingly general modes of operation, and
get to the point of having to filter actual system call parameters, I
think you would be better served by using an LSM and mediating access to
the actual kernel objects.

For your own particular mode of operation, have you considered running
the untrusted code in a virtual machine, using something like Xen,
rather than trying to create a "safe" subset of kernel calls for a
single kernel instance?

-- 
Stephen Smalley <sds@epoch.ncsc.mil>
National Security Agency

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re:  secure computing for 2.6.7
  2004-08-03 12:40   ` Stephen Smalley
@ 2004-08-03 21:02     ` Alexander Lyamin
  2004-08-05 11:47       ` Stephen Smalley
  2004-08-04  8:57     ` Hans Reiser
  2004-08-07 23:20     ` Hans Reiser
  2 siblings, 1 reply; 42+ messages in thread
From: Alexander Lyamin @ 2004-08-03 21:02 UTC (permalink / raw)
  To: Stephen Smalley; +Cc: Hans Reiser, andrea, lkml, Andrew Morton

Tue, Aug 03, 2004 at 08:40:45AM -0400, Stephen Smalley wrote:
> On Wed, 2004-07-07 at 15:27, Hans Reiser wrote:
> > Am I right to think that this could complement nicely our plans 
> > described at www.namesys.com/blackbox_security.html
> Hi Hans,
> 
> Out of curiosity, what do you think that this proposal will achieve that
> cannot already be done via SELinux policy?  SELinux policy can already
> express access rules based not only on the executable and user, but even
> the entire call chain that led to a given executable.

convinience ? speed ?

RBAC is a Good Thing, but I wonder if it could provide throughout syntax analysis
for vfs related syscalls. As it is now.

At least what declared in their docs, fs-wise they are somewhat like this

Macro Name	Description
stat_file_perms	Permissions to call stat or access on a file.
x_file_perms	Permissions to execute a file.
r_file_perms	Permissions to read a file.
rx_file_perms	Permissions to read and execute a file.
rw_file_perms	Permissions to read and write a file.
ra_file_perms	Permissions to read and append to a file.
link_file_perms	Permissions to link, unlink, or rename a file.
create_file_perms	Permissions to create, access, and delete a file.
r_dir_perms	Permissions to read and search a directory.
rw_dir_perms	Permissions to read and modify a directory.
ra_dir_perms	Permissions to read and add entries to a directory.
create_dir_perms	Permissions to create, access, and delete a directory.
mount_fs_perms	Permissions to mount and unmount a filesystem.

*shrugs*
Well, I am probably wrong...

p.s. _AND_ if I remember correctly reiser4 supposed to provide finer-then-file grain security.
well, at least it easily could, being truly semantic-enabled fs.

-- 
"the liberation loophole will make it clear.."
lex lyamin

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: secure computing for 2.6.7
  2004-08-03 12:40   ` Stephen Smalley
  2004-08-03 21:02     ` Alexander Lyamin
@ 2004-08-04  8:57     ` Hans Reiser
  2004-08-05 11:48       ` Stephen Smalley
  2004-08-07 23:20     ` Hans Reiser
  2 siblings, 1 reply; 42+ messages in thread
From: Hans Reiser @ 2004-08-04  8:57 UTC (permalink / raw)
  To: Stephen Smalley; +Cc: andrea, lkml, Andrew Morton

Stephen Smalley wrote:

>On Wed, 2004-07-07 at 15:27, Hans Reiser wrote:
>  
>
>>Am I right to think that this could complement nicely our plans 
>>described at www.namesys.com/blackbox_security.html
>>    
>>
>
>Hi Hans,
>
>Out of curiosity, what do you think that this proposal 
>
before I answer, "this proposal" refers to my proposal or Andrea's?  
Kind of necessary information to my formulating an answer.;-)

>will achieve that
>cannot already be done via SELinux policy?  SELinux policy can already
>express access rules based not only on the executable and user, but even
>the entire call chain that led to a given executable.
>
>  
>


^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: secure computing for 2.6.7
  2004-08-02  0:05     ` Andi Kleen
  2004-08-02 10:19       ` Andrea Arcangeli
@ 2004-08-04 13:18       ` V13
  1 sibling, 0 replies; 42+ messages in thread
From: V13 @ 2004-08-04 13:18 UTC (permalink / raw)
  To: Andi Kleen; +Cc: Andrea Arcangeli, linux-kernel

On Monday 02 August 2004 03:05, Andi Kleen wrote:
> I don't think a sequence number is a good idea. Consider a
> vendor/third party kernel fixing a security bug, but mainline hasn't
> taken the patch yet for some reason.
>
> The vendor kernel could not safely increase this number, because it
> could conflict with some other security bug fixed in mainline at the
> same time.
>
> The end result would be that the kernel would be fixed, but
> the application has no way to tell.
>
> Better may be a bitmap, but even there you still have an problem
> with allocating these bits.
>
> A safe solution would be a file in /proc that lists CAN idenitifiers of
> fixed bugs or similar, but that may be quite some overhead to maintain
> and parse.

What about using the kernel version (instead of a seq #) plus a /proc file 
which lists the fixed CAN ids? This way a patch to a kernel will add an entry 
to /proc/koko and the program whould check the kernel version. If the kernel 
version is less or equal to X then it will read the /proc/koko for applied 
patches.

When a new version of the kernel is released then the /proc/koko file will be 
cleared a bit since version X.Y.Z means that the patches were added.

This leads to a hole between releasing a new version of the program and 
releasing a new kernel version, since the author will not be able to know if 
the next version of the kernel will have this bug fixed or not, so he cannot 
safely check the kernel version for a >X.Y.Z.

This can be solved in combination with a user space library that maintains a 
list of known kernel fixes and an API like: int can_is_fixed(...); which will 
combine the /proc information with the kernel version. The library (i.e 
an /etc file) will maintain a list of known fixes and the kernel version it 
was applied and will read the /proc/koko file for extra information. This may 
lead to false positives in case the library is an older version and the 
kernel is upgraded (since the lib will not know about the applied patch) but:

a) This is acceptable by the conditions you've set
b) Can be partialy solved by keeping CAN ids in /proc/koko for N versions of 
kernel (or for N months)

> -Andi
<<V13>>

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re:  secure computing for 2.6.7
  2004-08-03 21:02     ` Alexander Lyamin
@ 2004-08-05 11:47       ` Stephen Smalley
  0 siblings, 0 replies; 42+ messages in thread
From: Stephen Smalley @ 2004-08-05 11:47 UTC (permalink / raw)
  To: flx; +Cc: Hans Reiser, andrea, lkml, Andrew Morton

On Tue, 2004-08-03 at 17:02, Alexander Lyamin wrote:
> convinience ? speed ?
> 
> 
> RBAC is a Good Thing, but I wonder if it could provide throughout syntax analysis
> for vfs related syscalls. As it is now.
> 
> At least what declared in their docs, fs-wise they are somewhat like this
> 
> Macro Name	Description
> stat_file_perms	Permissions to call stat or access on a file.
> x_file_perms	Permissions to execute a file.
> r_file_perms	Permissions to read a file.
> rx_file_perms	Permissions to read and execute a file.
> rw_file_perms	Permissions to read and write a file.
> ra_file_perms	Permissions to read and append to a file.
> link_file_perms	Permissions to link, unlink, or rename a file.
> create_file_perms	Permissions to create, access, and delete a file.
> r_dir_perms	Permissions to read and search a directory.
> rw_dir_perms	Permissions to read and modify a directory.
> ra_dir_perms	Permissions to read and add entries to a directory.
> create_dir_perms	Permissions to create, access, and delete a directory.
> mount_fs_perms	Permissions to mount and unmount a filesystem.

I'm not sure I understand what you are trying to say.  What you list
above are common sets of permissions defined as macros for convenience,
but you still have the freedom to specify individual permissions.
  
-- 
Stephen Smalley <sds@epoch.ncsc.mil>
National Security Agency


^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: secure computing for 2.6.7
  2004-08-04  8:57     ` Hans Reiser
@ 2004-08-05 11:48       ` Stephen Smalley
  0 siblings, 0 replies; 42+ messages in thread
From: Stephen Smalley @ 2004-08-05 11:48 UTC (permalink / raw)
  To: Hans Reiser; +Cc: andrea, lkml, Andrew Morton

On Wed, 2004-08-04 at 04:57, Hans Reiser wrote:
> before I answer, "this proposal" refers to my proposal or Andrea's?  
> Kind of necessary information to my formulating an answer.;-)

Your proposal.

-- 
Stephen Smalley <sds@epoch.ncsc.mil>
National Security Agency


^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: secure computing for 2.6.7
  2004-08-03 12:40   ` Stephen Smalley
  2004-08-03 21:02     ` Alexander Lyamin
  2004-08-04  8:57     ` Hans Reiser
@ 2004-08-07 23:20     ` Hans Reiser
  2004-08-09 12:35       ` Stephen Smalley
  2 siblings, 1 reply; 42+ messages in thread
From: Hans Reiser @ 2004-08-07 23:20 UTC (permalink / raw)
  To: Stephen Smalley; +Cc: andrea, lkml, Andrew Morton

Stephen Smalley wrote:

>On Wed, 2004-07-07 at 15:27, Hans Reiser wrote:
>  
>
>>Am I right to think that this could complement nicely our plans 
>>described at www.namesys.com/blackbox_security.html
>>    
>>
>
>Hi Hans,
>
>Out of curiosity, what do you think that this proposal will achieve that
>cannot already be done via SELinux policy?  SELinux policy can already
>express access rules based not only on the executable and user, but even
>the entire call chain that led to a given executable.
>
>  
>
Where do you store the access rules?  With the executable?   How do you 
automate their determination?

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: secure computing for 2.6.7
  2004-08-07 23:20     ` Hans Reiser
@ 2004-08-09 12:35       ` Stephen Smalley
  0 siblings, 0 replies; 42+ messages in thread
From: Stephen Smalley @ 2004-08-09 12:35 UTC (permalink / raw)
  To: Hans Reiser; +Cc: andrea, lkml, Andrew Morton

On Sat, 2004-08-07 at 19:20, Hans Reiser wrote:
> Where do you store the access rules?  With the executable?   How do you 
> automate their determination?

Executables, like other files, are assigned security types (security
equivalence classes for objects) stored as extended attributes.  Policy
rules based on security domains (security equivalence classes for
processes) and security types are defined in a separate security policy
configuration that is compiled into an internal form by a policy
compiler and loaded into the kernel by early userspace (presently by a
modified /sbin/init).   With regard to automating their determination,
SELinux has some rudimentary features for collecting audit data
(optionally in a permissive mode where access denials are merely logged,
not denied) and generating policy rules from such audit data, and there
is work underway to develop better tools for policy generation,
including back ends for analysis of generated policy rules against
security objectives.  You have to be rather careful about such automated
generation, as many programs and library functions probe for access that
is not truly needed for their operation and some code actually varies
its behavior based on such probes (e.g. falling back to a less
privileged mode of operation if the probe fails).

-- 
Stephen Smalley <sds@epoch.ncsc.mil>
National Security Agency

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: secure computing for 2.6.7
  2004-07-04 21:35 ` Andrew Morton
  2004-07-04 23:32   ` andrea
@ 2004-10-12 14:24   ` Andrea Arcangeli
  2004-10-12 15:32     ` Rik van Riel
  1 sibling, 1 reply; 42+ messages in thread
From: Andrea Arcangeli @ 2004-10-12 14:24 UTC (permalink / raw)
  To: Andrew Morton; +Cc: linux-kernel

On Sun, Jul 04, 2004 at 02:35:26PM -0700, Andrew Morton wrote:
> Of course, yes, the patch is sufficiently safe and simple for it to be
> mergeable in 2.6, if this is the way we want to do secure computing.  I'd
> wonder whether the API should be syscall-based rather than /proc-based, and
> whether there should be a config option for it.

here a new patch, possibly candidate for merging in 2.6.10pre?

	http://www.kernel.org/pub/linux/kernel/people/andrea/patches/v2.6/2.6.9-rc4/seccomp

I added the config option, mostly to be sure archs will show the seccomp
file only if they really support the feature interally.

For my purpose seccomp is the most robust and secure API I could desire.
Adding genericity isn't the object, the object is to keep it simple and
obviously safe and as hard as possible to break.  I plan to eventually
go a bit more complex (and in turn a bit less secure from the point of
view of the seller) with xen-like trusted computing later once there
will be enough hardware in the market to make it worthwhile. As for the
syscall vs /proc, it's not performance critical, and I find this more
usable (plus currently I'm firing it on with python and excuting a new
syscalls with python isn't as quick as a file('/proc/' + pid +
'/seccomp', 'w').write('1').

Also note, I don't mind if the seccomp file could be removed from /proc
eventually, as far as I have the guarantee that when it's in there it
implements the feature. Ideally the seccomp.c file should be pretty much
fixed in stone and not subject to any further kernel development.

To receive the data asynchronously SIGIO can be set by the
seccomp-loader, or it can simply retry some read syscall from the socket
once every couple of seconds if the buffer isn't already full (socket
can be set in nonblocking mode).  That's all userspace stuff that
belongs to the seccomp loader. On the kernel side I will make it with
only read/write/exit/sigreturn.  Even once trusted computing will be
enabled I will only allow those few operations to communicate with the
untrusted world. So the model is going to stay and this also means
ideally no bytecode would require modification to run in trusted
computing mode by just creating a proper trusted-seccomp-loader (we'll
see if this is really true, I think it's at least theoretically
feasible, but it's not a short term matter).

It's a pain to program inside the seccomp mode for the programmer, but
the power he/she will get if he does I believe could make it worthwhile
and the whole thing worth a quick try.

Another reason for merging this is that projects like BOINC should start
using seccomp too. They write in their webpage "Accidental abuse of
participant hosts by projects: BOINC does nothing to prevent this. The
chances of it happening can be minimized by pre-released application
testing. Projects should test their applications thoroughly on all
platforms and with all input data scenarios before promoting them to
production status.".  seccomp will fix it completely, which means the
userbase could increase significantly too, beacuse the seller will not
have to trust the buyer not to have bugs.

Relaying on seccomp not to break (like I'm doing) is no different from
relaying on the netfilter code not to break and it's no different from
relaying on the openssh code not to break, and again no different from
relaying on the IPSEC code not to break. Except this is an order of
magnitude simpler to guarantee as obviously safe since much less kernel
code is involved in these secure paths. Plus this is a lot more secure
too since if something breaks I will force an upgrade immediatly on
every single client connected and I'll notify via email as well anybody
who could have been affeected, something not enforceable on firewall
kernel code for example on a random computer on the internet.

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: secure computing for 2.6.7
  2004-10-12 14:24   ` Andrea Arcangeli
@ 2004-10-12 15:32     ` Rik van Riel
  2004-10-12 15:59       ` Andrea Arcangeli
  0 siblings, 1 reply; 42+ messages in thread
From: Rik van Riel @ 2004-10-12 15:32 UTC (permalink / raw)
  To: Andrea Arcangeli; +Cc: Andrew Morton, linux-kernel

On Tue, 12 Oct 2004, Andrea Arcangeli wrote:

> here a new patch, possibly candidate for merging in 2.6.10pre?
> 
> 	http://www.kernel.org/pub/linux/kernel/people/andrea/patches/v2.6/2.6.9-rc4/seccomp

How do you start a seccomp process in a secure way ?


-- 
"Debugging is twice as hard as writing the code in the first place.
Therefore, if you write the code as cleverly as possible, you are,
by definition, not smart enough to debug it." - Brian W. Kernighan


^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: secure computing for 2.6.7
  2004-10-12 15:32     ` Rik van Riel
@ 2004-10-12 15:59       ` Andrea Arcangeli
  2004-10-12 16:28         ` Rik van Riel
  0 siblings, 1 reply; 42+ messages in thread
From: Andrea Arcangeli @ 2004-10-12 15:59 UTC (permalink / raw)
  To: Rik van Riel; +Cc: Andrew Morton, linux-kernel

On Tue, Oct 12, 2004 at 11:32:57AM -0400, Rik van Riel wrote:
> On Tue, 12 Oct 2004, Andrea Arcangeli wrote:
> 
> > here a new patch, possibly candidate for merging in 2.6.10pre?
> > 
> > 	http://www.kernel.org/pub/linux/kernel/people/andrea/patches/v2.6/2.6.9-rc4/seccomp
> 
> How do you start a seccomp process in a secure way ?

there's an example here:

http://www.cpushare.com/download/cpushare-0.4.tar.bz2

check python seccomp_test.py and seccomp-loader.c.

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: secure computing for 2.6.7
  2004-10-12 15:59       ` Andrea Arcangeli
@ 2004-10-12 16:28         ` Rik van Riel
  2004-10-12 17:46           ` Andrea Arcangeli
  0 siblings, 1 reply; 42+ messages in thread
From: Rik van Riel @ 2004-10-12 16:28 UTC (permalink / raw)
  To: Andrea Arcangeli; +Cc: Andrew Morton, linux-kernel

On Tue, 12 Oct 2004, Andrea Arcangeli wrote:

> http://www.cpushare.com/download/cpushare-0.4.tar.bz2
> 
> check python seccomp_test.py and seccomp-loader.c.

Looks like it should work, though really only for the
purposes of cpushare and nothing else.

-- 
"Debugging is twice as hard as writing the code in the first place.
Therefore, if you write the code as cleverly as possible, you are,
by definition, not smart enough to debug it." - Brian W. Kernighan


^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: secure computing for 2.6.7
  2004-10-12 16:28         ` Rik van Riel
@ 2004-10-12 17:46           ` Andrea Arcangeli
  2004-10-12 18:04             ` Rik van Riel
  2004-10-12 18:10             ` Rik van Riel
  0 siblings, 2 replies; 42+ messages in thread
From: Andrea Arcangeli @ 2004-10-12 17:46 UTC (permalink / raw)
  To: Rik van Riel; +Cc: Andrew Morton, linux-kernel

On Tue, Oct 12, 2004 at 12:28:48PM -0400, Rik van Riel wrote:
> Looks like it should work, though really only for the
> purposes of cpushare and nothing else.

in the short term I sure agree, and in my humble opinion this is true
for trusted computing too.

However as said boinc and seti would better start using it too.

And people could start using it for other things too every time they
deal with untrusted data or bytecode. The parsing untrusted data case is
especially easy since it won't even require the seccom-loader (since the
executable is trusted before it starts managing the untrusted data
coming from the network). For example you can parse the jpeg into a
seccomp mode task, that gets the jpeg in input of the pipe and it throws
the uncompressed bitmap in output ready to be written in the
framebuffer. Basically every decompression scheme can run in a task
running in seccomp mode and for most usages the only risk is to see or
listen to garbage, but no exploit once the raw data is pushed into the
hardware as "raw data".

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: secure computing for 2.6.7
  2004-10-12 17:46           ` Andrea Arcangeli
@ 2004-10-12 18:04             ` Rik van Riel
  2004-10-12 18:10             ` Rik van Riel
  1 sibling, 0 replies; 42+ messages in thread
From: Rik van Riel @ 2004-10-12 18:04 UTC (permalink / raw)
  To: Andrea Arcangeli; +Cc: Andrew Morton, linux-kernel

On Tue, 12 Oct 2004, Andrea Arcangeli wrote:
> On Tue, Oct 12, 2004 at 12:28:48PM -0400, Rik van Riel wrote:
> > Looks like it should work, though really only for the
> > purposes of cpushare and nothing else.

> However as said boinc and seti would better start using it too.

Are they interested ?

> And people could start using it for other things too every time they
> deal with untrusted data or bytecode.

Would be interesting for eg. browser plugins, though I don't
know whether the current seccomp infrastructure is powerful
enough for that ...

-- 
"Debugging is twice as hard as writing the code in the first place.
Therefore, if you write the code as cleverly as possible, you are,
by definition, not smart enough to debug it." - Brian W. Kernighan


^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: secure computing for 2.6.7
  2004-10-12 17:46           ` Andrea Arcangeli
  2004-10-12 18:04             ` Rik van Riel
@ 2004-10-12 18:10             ` Rik van Riel
  2004-10-12 18:29               ` Andrea Arcangeli
  1 sibling, 1 reply; 42+ messages in thread
From: Rik van Riel @ 2004-10-12 18:10 UTC (permalink / raw)
  To: Andrea Arcangeli; +Cc: Andrew Morton, linux-kernel

On Tue, 12 Oct 2004, Andrea Arcangeli wrote:

> However as said boinc and seti would better start using it too.

Thinking about it some more, I'm not convinced they can.

After all, they need to get new data to perform calculations
on, and pass the results of previous calculations on to the
server.

In order to do that, the user needs to run code that's not
restricted by seccomp. Taking that into account, what's the
point ?

-- 
"Debugging is twice as hard as writing the code in the first place.
Therefore, if you write the code as cleverly as possible, you are,
by definition, not smart enough to debug it." - Brian W. Kernighan

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: secure computing for 2.6.7
  2004-10-12 18:10             ` Rik van Riel
@ 2004-10-12 18:29               ` Andrea Arcangeli
  0 siblings, 0 replies; 42+ messages in thread
From: Andrea Arcangeli @ 2004-10-12 18:29 UTC (permalink / raw)
  To: Rik van Riel; +Cc: Andrew Morton, linux-kernel

On Tue, Oct 12, 2004 at 02:10:52PM -0400, Rik van Riel wrote:
> On Tue, 12 Oct 2004, Andrea Arcangeli wrote:
> 
> > However as said boinc and seti would better start using it too.
> 
> Thinking about it some more, I'm not convinced they can.
> 
> After all, they need to get new data to perform calculations
> on, and pass the results of previous calculations on to the
> server.
> 
> In order to do that, the user needs to run code that's not
> restricted by seccomp. [..]

Getting new data to performance calculations and pass the results up to
the buyer is what I'm doing too and it's the ideal workload to use
with seccomp or trusted computing. But this is very offtopic discussion
for this list.

jpeg sure can be decompressed too.

^ permalink raw reply	[flat|nested] 42+ messages in thread

end of thread, other threads:[~2004-10-12 18:29 UTC | newest]

Thread overview: 42+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2004-07-04 17:39 secure computing for 2.6.7 andrea
2004-07-04 21:35 ` Andrew Morton
2004-07-04 23:32   ` andrea
2004-07-05  0:37     ` Phy Prabab
2004-10-12 14:24   ` Andrea Arcangeli
2004-10-12 15:32     ` Rik van Riel
2004-10-12 15:59       ` Andrea Arcangeli
2004-10-12 16:28         ` Rik van Riel
2004-10-12 17:46           ` Andrea Arcangeli
2004-10-12 18:04             ` Rik van Riel
2004-10-12 18:10             ` Rik van Riel
2004-10-12 18:29               ` Andrea Arcangeli
2004-07-07 19:27 ` Hans Reiser
2004-08-01 10:22   ` Andrea Arcangeli
2004-08-01 12:01     ` chris
2004-08-01 15:01       ` Andrea Arcangeli
2004-08-01 17:29         ` chris
2004-08-01 18:52           ` Bernd Eckenfels
2004-08-01 20:45           ` Alan Cox
2004-08-01 23:10             ` Andrea Arcangeli
2004-08-01 23:08               ` Alan Cox
2004-08-02 10:25                 ` Andrea Arcangeli
2004-08-01 23:06           ` Andrea Arcangeli
2004-08-02  6:52             ` David Wagner
2004-08-03 12:48         ` Stephen Smalley
2004-08-01 14:55     ` Bernd Eckenfels
2004-08-01 15:51       ` Andrea Arcangeli
2004-08-01 17:24         ` Bernd Eckenfels
2004-08-02  3:17         ` Horst von Brand
2004-08-02 16:31           ` Andrea Arcangeli
2004-08-03 12:40   ` Stephen Smalley
2004-08-03 21:02     ` Alexander Lyamin
2004-08-05 11:47       ` Stephen Smalley
2004-08-04  8:57     ` Hans Reiser
2004-08-05 11:48       ` Stephen Smalley
2004-08-07 23:20     ` Hans Reiser
2004-08-09 12:35       ` Stephen Smalley
     [not found] <2ejhQ-4lc-5@gated-at.bofh.it>
     [not found] ` <2fqhq-1RU-45@gated-at.bofh.it>
     [not found]   ` <2olLt-4wI-5@gated-at.bofh.it>
2004-08-02  0:05     ` Andi Kleen
2004-08-02 10:19       ` Andrea Arcangeli
2004-08-02 19:06         ` Rik van Riel
2004-08-02 21:35           ` Andrea Arcangeli
2004-08-04 13:18       ` V13

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox