From: Vasiliy Kulikov <segoon@openwall.com>
To: kernel-hardening@lists.openwall.com
Subject: Re: [kernel-hardening] 32/64 bitness restriction for pid namespace
Date: Fri, 12 Aug 2011 16:07:48 +0400 [thread overview]
Message-ID: <20110812120747.GA14598@albatros> (raw)
In-Reply-To: <20110810164225.GA32177@openwall.com>
Hi,
This is the updated version. It tries to handle denied syscalls as if
they were disabled (i.e. as if the MSR/IDT entry were not initialized). I've
copied the handlers' code from the interrupt handlers and removed the kprobes
code, which looks dead in this specific case.
---
arch/x86/ia32/ia32entry.S | 33 +++++
arch/x86/include/asm/elf.h | 5 +-
arch/x86/include/asm/thread_info.h | 13 ++-
arch/x86/kernel/Makefile | 1 +
arch/x86/kernel/entry_64.S | 12 ++-
arch/x86/kernel/syscall_restrict.c | 229 ++++++++++++++++++++++++++++++++++++
arch/x86/kernel/traps.c | 2 +-
kernel/fork.c | 5 +
8 files changed, 293 insertions(+), 7 deletions(-)
diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S
index a0e866d..5bc1882 100644
--- a/arch/x86/ia32/ia32entry.S
+++ b/arch/x86/ia32/ia32entry.S
@@ -151,6 +151,8 @@ ENTRY(ia32_sysenter_target)
.quad 1b,ia32_badarg
.previous
GET_THREAD_INFO(%r10)
+ testl $_TIF_SYSCALL32_DENIED,TI_flags(%r10)
+ jnz ia32_denied_sysenter
orl $TS_COMPAT,TI_status(%r10)
testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%r10)
CFI_REMEMBER_STATE
@@ -310,6 +312,8 @@ ENTRY(ia32_cstar_target)
.quad 1b,ia32_badarg
.previous
GET_THREAD_INFO(%r10)
+ testl $_TIF_SYSCALL32_DENIED,TI_flags(%r10)
+ jnz ia32_denied_syscall
orl $TS_COMPAT,TI_status(%r10)
testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%r10)
CFI_REMEMBER_STATE
@@ -421,6 +425,8 @@ ENTRY(ia32_syscall)
this could be a problem. */
SAVE_ARGS 0,1,0
GET_THREAD_INFO(%r10)
+ testl $_TIF_SYSCALL32_DENIED,TI_flags(%r10)
+ jnz ia32_denied_int
orl $TS_COMPAT,TI_status(%r10)
testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%r10)
jnz ia32_tracesys
@@ -453,6 +459,33 @@ ia32_badsys:
movq $-ENOSYS,%rax
jmp ia32_sysret
+ia32_denied_sysenter: /* reached from ia32_sysenter_target when _TIF_SYSCALL32_DENIED is set */
+ SAVE_REST
+ CLEAR_RREGS
+ movq %rsp,%rdi /* &pt_regs -> arg1 */
+ call do_ia32_denied_sysenter
+ LOAD_ARGS32 ARGOFFSET /* reload args from stack (the ptrace wording was inherited from the tracesys path) */
+ RESTORE_REST
+ jmp int_ret_from_sys_call /* NOTE(review): nothing in this stub writes RAX(%rsp) - confirm the value user space sees */
+
+ia32_denied_syscall: /* reached from ia32_cstar_target when _TIF_SYSCALL32_DENIED is set */
+ SAVE_REST
+ CLEAR_RREGS
+ movq %rsp,%rdi /* &pt_regs -> arg1; NOTE(review): no call follows in this stub, so the value is unused */
+ movq $-ENOSYS,%rax /* NOTE(review): sets the register, not RAX(%rsp); the sibling stubs call a C handler here - confirm intent */
+ LOAD_ARGS32 ARGOFFSET /* reload args from stack (the ptrace wording was inherited from the tracesys path) */
+ RESTORE_REST
+ jmp int_ret_from_sys_call /* NOTE(review): nothing in this stub writes RAX(%rsp) - confirm the value user space sees */
+
+ia32_denied_int: /* reached from ia32_syscall (int $0x80) when _TIF_SYSCALL32_DENIED is set */
+ SAVE_REST
+ CLEAR_RREGS
+ movq %rsp,%rdi /* &pt_regs -> arg1 */
+ call do_ia32_denied_int
+ LOAD_ARGS32 ARGOFFSET /* reload args from stack (the ptrace wording was inherited from the tracesys path) */
+ RESTORE_REST
+ jmp int_ret_from_sys_call /* NOTE(review): nothing in this stub writes RAX(%rsp) - confirm the value user space sees */
+
quiet_ni_syscall:
movq $-ENOSYS,%rax
ret
diff --git a/arch/x86/include/asm/elf.h b/arch/x86/include/asm/elf.h
index f2ad216..fb054c7 100644
--- a/arch/x86/include/asm/elf.h
+++ b/arch/x86/include/asm/elf.h
@@ -153,9 +153,10 @@ do { \
* This is used to ensure we don't load something for the wrong architecture.
*/
#define elf_check_arch(x) \
- ((x)->e_machine == EM_X86_64)
+ ((x)->e_machine == EM_X86_64 && !test_thread_flag(TIF_SYSCALL64_DENIED))
-#define compat_elf_check_arch(x) elf_check_arch_ia32(x)
+#define compat_elf_check_arch(x) \
+ (elf_check_arch_ia32(x) && !test_thread_flag(TIF_SYSCALL32_DENIED))
static inline void elf_common_init(struct thread_struct *t,
struct pt_regs *regs, const u16 ds)
diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h
index a1fe5c1..1e93040 100644
--- a/arch/x86/include/asm/thread_info.h
+++ b/arch/x86/include/asm/thread_info.h
@@ -95,6 +95,8 @@ struct thread_info {
#define TIF_BLOCKSTEP 25 /* set when we want DEBUGCTLMSR_BTF */
#define TIF_LAZY_MMU_UPDATES 27 /* task is updating the mmu lazily */
#define TIF_SYSCALL_TRACEPOINT 28 /* syscall tracepoint instrumentation */
+#define TIF_SYSCALL32_DENIED 29 /* 32 bit syscalls are denied */
+#define TIF_SYSCALL64_DENIED 30 /* 64 bit syscalls are denied */
#define _TIF_SYSCALL_TRACE (1 << TIF_SYSCALL_TRACE)
#define _TIF_NOTIFY_RESUME (1 << TIF_NOTIFY_RESUME)
@@ -117,6 +119,8 @@ struct thread_info {
#define _TIF_BLOCKSTEP (1 << TIF_BLOCKSTEP)
#define _TIF_LAZY_MMU_UPDATES (1 << TIF_LAZY_MMU_UPDATES)
#define _TIF_SYSCALL_TRACEPOINT (1 << TIF_SYSCALL_TRACEPOINT)
+#define _TIF_SYSCALL32_DENIED (1 << TIF_SYSCALL32_DENIED)
+#define _TIF_SYSCALL64_DENIED (1 << TIF_SYSCALL64_DENIED)
/* work to do in syscall_trace_enter() */
#define _TIF_WORK_SYSCALL_ENTRY \
@@ -259,9 +263,14 @@ static inline void set_restore_sigmask(void)
ti->status |= TS_RESTORE_SIGMASK;
set_bit(TIF_SIGPENDING, (unsigned long *)&ti->flags);
}
-#endif /* !__ASSEMBLY__ */
-#ifndef __ASSEMBLY__
+#ifdef CONFIG_IA32_EMULATION
+#define __HAVE_ARCH_POST_FORK
+
+extern void arch_post_fork(struct task_struct *task);
+
+#endif /* CONFIG_IA32_EMULATION */
+
extern void arch_task_cache_init(void);
extern void free_thread_info(struct thread_info *ti);
extern int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src);
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index 0410557..a200ff3 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -86,6 +86,7 @@ obj-$(CONFIG_DOUBLEFAULT) += doublefault_32.o
obj-$(CONFIG_KGDB) += kgdb.o
obj-$(CONFIG_VM86) += vm86_32.o
obj-$(CONFIG_EARLY_PRINTK) += early_printk.o
+obj-$(CONFIG_SYSCTL) += syscall_restrict.o
obj-$(CONFIG_HPET_TIMER) += hpet.o
obj-$(CONFIG_APB_TIMER) += apb_timer.o
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index e13329d..b184a45 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -474,6 +474,8 @@ ENTRY(system_call_after_swapgs)
movq %rcx,RIP-ARGOFFSET(%rsp)
CFI_REL_OFFSET rip,RIP-ARGOFFSET
GET_THREAD_INFO(%rcx)
+ testl $_TIF_SYSCALL64_DENIED,TI_flags(%rcx)
+ jnz denied_sys
testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%rcx)
jnz tracesys
system_call_fastpath:
@@ -539,8 +541,14 @@ sysret_signal:
jmp int_check_syscall_exit_work
badsys:
movq $-ENOSYS,RAX-ARGOFFSET(%rsp)
jmp ret_from_sys_call
+
+/*
+ * 64-bit SYSCALL from a task with _TIF_SYSCALL64_DENIED set (jumped to from
+ * system_call_after_swapgs).  Kept separate from badsys so that an
+ * out-of-range syscall number still just returns -ENOSYS.
+ */
+denied_sys:
+	SAVE_REST
+	movq $-ENOSYS,RAX(%rsp)		/* NOTE(review): value left in the saved rax - confirm it is what user space should see */
+	FIXUP_TOP_OF_STACK %rdi
+	movq %rsp,%rdi			/* &pt_regs -> arg1 */
+	call do_denied_syscall
+	LOAD_ARGS ARGOFFSET, 1
+	RESTORE_REST
+	jmp int_ret_from_sys_call
#ifdef CONFIG_AUDITSYSCALL
/*
diff --git a/arch/x86/kernel/syscall_restrict.c b/arch/x86/kernel/syscall_restrict.c
new file mode 100644
index 0000000..a676f22
--- /dev/null
+++ b/arch/x86/kernel/syscall_restrict.c
@@ -0,0 +1,229 @@
+#include <linux/thread_info.h>
+#include <linux/pid_namespace.h>
+#include <linux/sysctl.h>
+#include <linux/kprobes.h>
+#include <asm/kdebug.h>
+#include <linux/kdebug.h>
+
+#ifdef CONFIG_IA32_EMULATION
+
+/* Defined in arch/x86/kernel/traps.c; this patch drops its "static" so it can be called from here. */
+void __kprobes do_trap(int trapnr, int signr, char *str,
+		       struct pt_regs *regs, long error_code,
+		       siginfo_t *info);
+
+/*
+ * C half of the ia32_denied_sysenter entry stub: a task flagged with
+ * TIF_SYSCALL32_DENIED executed SYSENTER.
+ *
+ * Records trap number 13 with a zero error code in the thread state, emits
+ * a rate-limited log line and sends SIGSEGV to the current task.
+ *
+ * @regs: saved user registers; only ip and sp are read, for the log line.
+ */
+asmlinkage
+void do_ia32_denied_sysenter(struct pt_regs *regs)
+{
+	struct task_struct *tsk = current;
+
+	/* NOTE(review): 13 is presumably the x86 #GP vector - confirm */
+	tsk->thread.trap_no = 13;
+	tsk->thread.error_code = 0;
+
+	if (printk_ratelimit()) {
+		pr_err("%s[%d] attempt to use denied 32-bit sysenter, ip:%lx sp:%lx",
+		       tsk->comm, task_pid_nr(tsk), regs->ip, regs->sp);
+		print_vma_addr(" in ", regs->ip);
+		printk("\n");
+	}
+
+	force_sig(SIGSEGV, tsk);
+}
+
+/*
+ * C half of the ia32_denied_int entry stub: a task flagged with
+ * TIF_SYSCALL32_DENIED executed int $0x80.
+ *
+ * Emits a rate-limited log line, then raises trap 11 ("segment not present")
+ * with SIGBUS through do_trap().
+ *
+ * @regs: saved user registers, logged and passed on to do_trap().
+ */
+asmlinkage
+void do_ia32_denied_int(struct pt_regs *regs)
+{
+	if (printk_ratelimit()) {
+		/* Same "ip:%lx sp:%lx" layout as the sysenter/syscall messages. */
+		pr_err("%s[%d] attempt to use denied 32-bit int80h, ip:%lx sp:%lx",
+		       current->comm, task_pid_nr(current),
+		       regs->ip, regs->sp);
+		print_vma_addr(" in ", regs->ip);
+		printk("\n");
+	}
+
+	do_trap(11, SIGBUS, "segment not present", regs, 0, NULL);
+}
+
+/*
+ * C half of the 64-bit denied-syscall entry path: a task flagged with
+ * TIF_SYSCALL64_DENIED executed SYSCALL.
+ *
+ * Emits a rate-limited log line, then raises trap 6 ("invalid opcode") with
+ * SIGILL/ILL_ILLOPN through do_trap(), reporting the user ip as the
+ * faulting address.
+ *
+ * @regs: saved user registers, logged and passed on to do_trap().
+ */
+asmlinkage
+void do_denied_syscall(struct pt_regs *regs)
+{
+	siginfo_t si = {
+		.si_signo = SIGILL,
+		.si_errno = 0,
+		.si_code = ILL_ILLOPN,
+		.si_addr = (void __user *)regs->ip
+	};
+	struct task_struct *tsk = current;
+
+	if (printk_ratelimit()) {
+		pr_err("%s[%d] attempt to use denied 64-bit syscall, ip:%lx sp:%lx",
+		       tsk->comm, task_pid_nr(tsk), regs->ip, regs->sp);
+		print_vma_addr(" in ", regs->ip);
+		printk("\n");
+	}
+
+	do_trap(6, SIGILL, "invalid opcode", regs, 0, &si);
+}
+
+/* Return 32 if @task has TIF_IA32 set in its thread_info, otherwise 64. */
+static int task_get_bitness(struct task_struct *task)
+{
+	bool compat = test_ti_thread_flag(task_thread_info(task), TIF_IA32);
+
+	return compat ? 32 : 64;
+}
+
+/*
+ * A pid namespace counts as locked when its child reaper carries either of
+ * the TIF_SYSCALL32_DENIED / TIF_SYSCALL64_DENIED flags.
+ */
+static bool pidns_locked(struct pid_namespace *pid_ns)
+{
+	struct thread_info *reaper_ti = task_thread_info(pid_ns->child_reaper);
+
+	if (test_ti_thread_flag(reaper_ti, TIF_SYSCALL32_DENIED))
+		return true;
+
+	return test_ti_thread_flag(reaper_ti, TIF_SYSCALL64_DENIED);
+}
+
+/*
+ * Map the allowed bitness to the TIF bit *number* of the opposite, denied
+ * ABI: 32 -> TIF_SYSCALL64_DENIED, anything else -> TIF_SYSCALL32_DENIED.
+ */
+static int bits_to_flags(int bits)
+{
+	return (bits == 32) ? TIF_SYSCALL64_DENIED : TIF_SYSCALL32_DENIED;
+}
+
+/*
+ * Fork hook: when the forking task's pid namespace is locked, mark the new
+ * @task as denied the ABI opposite to the forking task's own bitness.
+ * Does nothing when the namespace is not locked.
+ */
+void arch_post_fork(struct task_struct *task)
+{
+	if (pidns_locked(current->nsproxy->pid_ns))
+		set_tsk_thread_flag(task,
+				    bits_to_flags(task_get_bitness(current)));
+}
+
+/*
+ * Check that every task found in @pid_ns already runs with @bits bitness,
+ * so that locking the namespace to @bits would not strand a task of the
+ * other ABI.
+ *
+ * Returns 0 when all tasks match, -EINVAL (after logging) on the first
+ * mismatch.
+ *
+ * Called under rcu_read_lock and write_lock_irq(tasklist).
+ */
+static int __pidns_may_lock_bitness(struct pid_namespace *pid_ns, int bits)
+{
+	int nr;
+
+	for (nr = next_pidmap(pid_ns, 0); nr > 0; nr = next_pidmap(pid_ns, nr)) {
+		struct task_struct *task;
+		int old_bits;
+
+		/* Resolve nr in the namespace being scanned, not in current's. */
+		task = pid_task(find_pid_ns(nr, pid_ns), PIDTYPE_PID);
+		if (!task)
+			continue;
+
+		old_bits = task_get_bitness(task);
+		if (old_bits != bits) {
+			pr_err("Inconsistent syscall restriction detected! "
+			       "Parent ns tries to restrict syscalls to %d "
+			       "bits while some task is %d bit.\n",
+			       bits, old_bits);
+			return -EINVAL;
+		}
+	}
+
+	return 0;
+}
+
+/*
+ * Set, on every task found in @pid_ns, the TIF flag that denies the ABI
+ * opposite to @bits.
+ *
+ * Called under rcu_read_lock and write_lock_irq(tasklist).
+ */
+static void __bitness_lock(struct pid_namespace *pid_ns, int bits)
+{
+	/* bits_to_flags() returns an int bit number; keep the same type. */
+	int denied_bit_nr = bits_to_flags(bits);
+	int nr;
+
+	for (nr = next_pidmap(pid_ns, 0); nr > 0; nr = next_pidmap(pid_ns, nr)) {
+		struct task_struct *task;
+
+		/* Resolve nr in the namespace being locked, not in current's. */
+		task = pid_task(find_pid_ns(nr, pid_ns), PIDTYPE_PID);
+		if (task)
+			set_tsk_thread_flag(task, denied_bit_nr);
+	}
+}
+
+/*
+ * Lock @pid_ns to the bitness of its child reaper.
+ *
+ * Under rcu_read_lock and the tasklist write lock, first verifies that all
+ * tasks share that bitness and only then flags them.  Returns 0 on success
+ * or the error from __pidns_may_lock_bitness().
+ */
+static int bitness_lock(struct pid_namespace *pid_ns)
+{
+	int bits;
+	int err;
+
+	rcu_read_lock();
+	write_lock_irq(&tasklist_lock);
+
+	bits = task_get_bitness(pid_ns->child_reaper);
+	err = __pidns_may_lock_bitness(pid_ns, bits);
+	if (err == 0)
+		__bitness_lock(pid_ns, bits);
+
+	write_unlock_irq(&tasklist_lock);
+	rcu_read_unlock();
+
+	return err;
+}
+
+/*
+ * sysctl handler for abi.bitness_locked.
+ *
+ * Reads report whether the caller's pid namespace is locked (0 or 1).
+ * Writes are parsed by proc_dointvec() into a local int, then:
+ *   - caller lacks CAP_SYS_ADMIN:            -EACCES
+ *   - zero while locked:                     -EACCES (a lock cannot be undone)
+ *   - zero while unlocked:                   no-op, 0
+ *   - non-zero while already locked:         no-op, 0
+ *   - non-zero while unlocked:               lock via bitness_lock()
+ *
+ * Returns 0 on success or a negative errno.
+ */
+static int bitness_locked_handler(struct ctl_table *table, int write,
+				  void __user *buffer, size_t *lenp,
+				  loff_t *ppos)
+{
+	int rc, new_bits, old_bits;
+	/* Stack copy of the entry so proc_dointvec() operates on new_bits. */
+	struct ctl_table tbl = {
+		.procname = table->procname,
+		.data = &new_bits,
+		.maxlen = sizeof(unsigned int),
+		.mode = 0644,
+	};
+
+	old_bits = new_bits = pidns_locked(current->nsproxy->pid_ns);
+	rc = proc_dointvec(&tbl, write, buffer, lenp, ppos);
+	if (rc || !write)
+		return rc;
+
+	if (!capable(CAP_SYS_ADMIN))
+		return -EACCES;
+
+	/*
+	 * Writing 0 never locks: it is refused when already locked and is a
+	 * no-op otherwise (previously it fell through to bitness_lock()).
+	 */
+	if (!new_bits)
+		return old_bits ? -EACCES : 0;
+
+	if (old_bits)
+		return 0;
+
+	return bitness_lock(current->nsproxy->pid_ns);
+}
+
+/* Entries of the "abi" sysctl directory when IA32 emulation is built in. */
+static struct ctl_table abi_syscall_restrict[] = {
+	{
+		/* State is derived per pid namespace by the handler; no .data. */
+		.proc_handler = bitness_locked_handler,
+		.procname = "bitness_locked",
+		.mode = 0644
+	},
+	{}
+};
+
+#else /* CONFIG_IA32_EMULATION */
+
+/* Backing value and both bounds for the entry below. */
+static int one = 1;
+
+/*
+ * Entries of the "abi" sysctl directory without IA32 emulation:
+ * bitness_locked is backed by a constant 1 with min == max == 1.
+ */
+static struct ctl_table abi_syscall_restrict[] = {
+	{
+		.procname = "bitness_locked",
+		.mode = 0644,
+		.proc_handler = proc_dointvec_minmax,
+		.data = &one,
+		.maxlen = sizeof(int),
+		.extra1 = &one,
+		.extra2 = &one,
+	},
+	{}
+};
+
+#endif /* CONFIG_IA32_EMULATION */
+
+
+/* Root table: the "abi" directory (mode 0555) holding abi_syscall_restrict. */
+static struct ctl_table abi_root[] = {
+	{
+		.child = abi_syscall_restrict,
+		.procname = "abi",
+		.mode = 0555
+	},
+	{}
+};
+
+/*
+ * Register the "abi" sysctl directory at device_initcall time.
+ *
+ * Returns 0 on success, or -ENOMEM when register_sysctl_table() returns
+ * NULL, so a failed registration is reported instead of silently ignored.
+ */
+__init int syscall_restrict_init(void)
+{
+	if (!register_sysctl_table(abi_root))
+		return -ENOMEM;
+
+	return 0;
+}
+device_initcall(syscall_restrict_init);
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index 9682ec5..a9bf9cf 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -116,7 +116,7 @@ static inline void preempt_conditional_cli(struct pt_regs *regs)
dec_preempt_count();
}
-static void __kprobes
+void __kprobes
do_trap(int trapnr, int signr, char *str, struct pt_regs *regs,
long error_code, siginfo_t *info)
{
diff --git a/kernel/fork.c b/kernel/fork.c
index e7ceaca..55e4455 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1039,6 +1039,10 @@ static void posix_cpu_timers_init(struct task_struct *tsk)
INIT_LIST_HEAD(&tsk->cpu_timers[2]);
}
+#ifndef __HAVE_ARCH_POST_FORK
+#define arch_post_fork(p)
+#endif
+
/*
* This creates a new process as a copy of the old one,
* but does not actually start it yet.
@@ -1374,6 +1378,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
total_forks++;
spin_unlock(¤t->sighand->siglock);
write_unlock_irq(&tasklist_lock);
+ arch_post_fork(p);
proc_fork_connector(p);
cgroup_post_fork(p);
if (clone_flags & CLONE_THREAD)
--
next prev parent reply other threads:[~2011-08-12 12:07 UTC|newest]
Thread overview: 32+ messages / expand[flat|nested] mbox.gz Atom feed top
2011-08-07 11:00 [kernel-hardening] 32/64 bitness restriction for pid namespace Vasiliy Kulikov
2011-08-08 17:39 ` [kernel-hardening] " Vasiliy Kulikov
2011-08-10 9:52 ` Vasiliy Kulikov
2011-08-10 13:03 ` [kernel-hardening] " Solar Designer
2011-08-10 13:27 ` Vasiliy Kulikov
2011-08-10 14:26 ` Solar Designer
2011-08-10 15:02 ` Vasiliy Kulikov
2011-08-10 15:40 ` Solar Designer
2011-08-10 16:21 ` Vasiliy Kulikov
2011-08-10 16:42 ` Solar Designer
2011-08-12 12:07 ` Vasiliy Kulikov [this message]
2011-08-12 12:23 ` Solar Designer
2011-08-13 15:12 ` Vasiliy Kulikov
2011-08-13 15:19 ` Solar Designer
2011-08-13 16:55 ` Vasiliy Kulikov
2011-08-13 17:31 ` Vasiliy Kulikov
2011-08-13 19:25 ` Solar Designer
2011-08-13 19:22 ` Solar Designer
2011-08-14 9:50 ` Solar Designer
2011-08-14 10:16 ` Vasiliy Kulikov
2011-08-14 11:29 ` Solar Designer
2011-08-14 11:55 ` Vasiliy Kulikov
2011-08-14 12:04 ` Solar Designer
2011-08-14 12:16 ` Vasiliy Kulikov
2011-08-15 15:38 ` Vasiliy Kulikov
2011-08-15 21:33 ` Solar Designer
2011-08-16 6:39 ` Vasiliy Kulikov
2011-08-15 21:46 ` Solar Designer
2011-08-16 6:25 ` Vasiliy Kulikov
2011-08-18 10:34 ` Solar Designer
2011-08-18 14:42 ` Vasiliy Kulikov
2011-08-12 9:09 ` Vasiliy Kulikov
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20110812120747.GA14598@albatros \
--to=segoon@openwall.com \
--cc=kernel-hardening@lists.openwall.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.