* [PATCH v3 01/13] tracing: split out filter initialization and clean up.
2011-05-26 18:49 ` Will Drewry
@ 2011-06-01 3:10 ` Will Drewry
2011-06-01 3:10 ` [PATCH v3 02/13] tracing: split out syscall_trace_enter construction Will Drewry
` (11 subsequent siblings)
12 siblings, 0 replies; 91+ messages in thread
From: Will Drewry @ 2011-06-01 3:10 UTC (permalink / raw)
To: linux-kernel
Cc: kees.cook, torvalds, tglx, mingo, rostedt, jmorris, Will Drewry,
Peter Zijlstra, Paul Mackerras, Arnaldo Carvalho de Melo,
Frederic Weisbecker
Moves the perf-specific profile event allocation and freeing code into
kernel/perf_event.c, where it is called from. Two symbols are exported
via ftrace_event.h for instantiating struct event_filters without
requiring a change to the core tracing code.
The change allows globally registered ftrace events to be used in
event_filter structs. perf is the current consumer, but a possible
future consumer is system call filtering using the secure computing
hooks (and the existing syscalls subsystem events).
Signed-off-by: Will Drewry <wad@chromium.org>
---
include/linux/ftrace_event.h | 9 +++--
kernel/perf_event.c | 7 +++-
kernel/trace/trace_events_filter.c | 60 ++++++++++++++++++++++--------------
3 files changed, 48 insertions(+), 28 deletions(-)
diff --git a/include/linux/ftrace_event.h b/include/linux/ftrace_event.h
index 22b32af..fea9d98 100644
--- a/include/linux/ftrace_event.h
+++ b/include/linux/ftrace_event.h
@@ -216,6 +216,12 @@ extern int filter_current_check_discard(struct ring_buffer *buffer,
void *rec,
struct ring_buffer_event *event);
+extern void ftrace_free_filter(struct event_filter *filter);
+extern int ftrace_parse_filter(struct event_filter **filter,
+ int event_id,
+ const char *filter_str);
+extern const char *ftrace_get_filter_string(const struct event_filter *filter);
+
enum {
FILTER_OTHER = 0,
FILTER_STATIC_STRING,
@@ -266,9 +272,6 @@ extern int perf_trace_init(struct perf_event *event);
extern void perf_trace_destroy(struct perf_event *event);
extern int perf_trace_add(struct perf_event *event, int flags);
extern void perf_trace_del(struct perf_event *event, int flags);
-extern int ftrace_profile_set_filter(struct perf_event *event, int event_id,
- char *filter_str);
-extern void ftrace_profile_free_filter(struct perf_event *event);
extern void *perf_trace_buf_prepare(int size, unsigned short type,
struct pt_regs *regs, int *rctxp);
diff --git a/kernel/perf_event.c b/kernel/perf_event.c
index 8e81a98..1da45e7 100644
--- a/kernel/perf_event.c
+++ b/kernel/perf_event.c
@@ -5588,7 +5588,8 @@ static int perf_event_set_filter(struct perf_event *event, void __user *arg)
if (IS_ERR(filter_str))
return PTR_ERR(filter_str);
- ret = ftrace_profile_set_filter(event, event->attr.config, filter_str);
+ ret = ftrace_parse_filter(&event->filter, event->attr.config,
+ filter_str);
kfree(filter_str);
return ret;
@@ -5596,7 +5597,9 @@ static int perf_event_set_filter(struct perf_event *event, void __user *arg)
static void perf_event_free_filter(struct perf_event *event)
{
- ftrace_profile_free_filter(event);
+ struct event_filter *filter = event->filter;
+ event->filter = NULL;
+ ftrace_free_filter(filter);
}
#else
diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c
index 8008ddc..787b174 100644
--- a/kernel/trace/trace_events_filter.c
+++ b/kernel/trace/trace_events_filter.c
@@ -522,7 +522,7 @@ static void remove_filter_string(struct event_filter *filter)
}
static int replace_filter_string(struct event_filter *filter,
- char *filter_string)
+ const char *filter_string)
{
kfree(filter->filter_string);
filter->filter_string = kstrdup(filter_string, GFP_KERNEL);
@@ -1936,21 +1936,27 @@ out_unlock:
return err;
}
-#ifdef CONFIG_PERF_EVENTS
-
-void ftrace_profile_free_filter(struct perf_event *event)
+/* ftrace_free_filter - frees a parsed filter and its internal structures.
+ *
+ * @filter: pointer to the event_filter to free.
+ */
+void ftrace_free_filter(struct event_filter *filter)
{
- struct event_filter *filter = event->filter;
-
- event->filter = NULL;
- __free_filter(filter);
+ if (filter)
+ __free_filter(filter);
}
+EXPORT_SYMBOL_GPL(ftrace_free_filter);
-int ftrace_profile_set_filter(struct perf_event *event, int event_id,
- char *filter_str)
+/* ftrace_parse_filter - allocates and populates a new event_filter
+ *
+ * @event_id: may be something like syscalls::sys_event_tkill's id.
+ * @filter_str: pointer to the filter string. Ownership is NOT taken; it is copied.
+ */
+int ftrace_parse_filter(struct event_filter **filter,
+ int event_id,
+ const char *filter_str)
{
int err;
- struct event_filter *filter;
struct filter_parse_state *ps;
struct ftrace_event_call *call = NULL;
@@ -1966,12 +1972,12 @@ int ftrace_profile_set_filter(struct perf_event *event, int event_id,
goto out_unlock;
err = -EEXIST;
- if (event->filter)
+ if (*filter)
goto out_unlock;
- filter = __alloc_filter();
- if (!filter) {
- err = PTR_ERR(filter);
+ *filter = __alloc_filter();
+ if (IS_ERR_OR_NULL(*filter)) {
+ err = PTR_ERR(*filter);
goto out_unlock;
}
@@ -1980,14 +1986,14 @@ int ftrace_profile_set_filter(struct perf_event *event, int event_id,
if (!ps)
goto free_filter;
- parse_init(ps, filter_ops, filter_str);
+ replace_filter_string(*filter, filter_str);
+
+ parse_init(ps, filter_ops, (*filter)->filter_string);
err = filter_parse(ps);
if (err)
goto free_ps;
- err = replace_preds(call, filter, ps, filter_str, false);
- if (!err)
- event->filter = filter;
+ err = replace_preds(call, *filter, ps, (*filter)->filter_string, false);
free_ps:
filter_opstack_clear(ps);
@@ -1995,14 +2001,22 @@ free_ps:
kfree(ps);
free_filter:
- if (err)
- __free_filter(filter);
+ if (err) {
+ __free_filter(*filter);
+ *filter = NULL;
+ }
out_unlock:
mutex_unlock(&event_mutex);
return err;
}
+EXPORT_SYMBOL_GPL(ftrace_parse_filter);
-#endif /* CONFIG_PERF_EVENTS */
-
+const char *ftrace_get_filter_string(const struct event_filter *filter)
+{
+ if (!filter)
+ return NULL;
+ return filter->filter_string;
+}
+EXPORT_SYMBOL_GPL(ftrace_get_filter_string);
--
1.7.0.4
^ permalink raw reply related [flat|nested] 91+ messages in thread* [PATCH v3 02/13] tracing: split out syscall_trace_enter construction
2011-05-26 18:49 ` Will Drewry
2011-06-01 3:10 ` [PATCH v3 01/13] tracing: split out filter initialization and clean up Will Drewry
@ 2011-06-01 3:10 ` Will Drewry
2011-06-01 7:00 ` Ingo Molnar
2011-06-01 3:10 ` [PATCH v3 03/13] seccomp_filters: new mode with configurable syscall filters Will Drewry
` (10 subsequent siblings)
12 siblings, 1 reply; 91+ messages in thread
From: Will Drewry @ 2011-06-01 3:10 UTC (permalink / raw)
To: linux-kernel
Cc: kees.cook, torvalds, tglx, mingo, rostedt, jmorris, Will Drewry,
Frederic Weisbecker, Ingo Molnar
perf appears to be the primary consumer of the CONFIG_FTRACE_SYSCALLS
infrastructure. As such, many of the helpers targeted at perf can be split
into a perf-focused helper and a generic CONFIG_FTRACE_SYSCALLS
consumer interface.
This change splits out syscall_trace_enter construction from
perf_syscall_enter for current into two helpers:
- ftrace_syscall_enter_state
- ftrace_syscall_enter_state_size
And adds another helper for completeness:
- ftrace_syscall_exit_state_size
These helpers allow for shared code between perf ftrace events and
any other consumers of CONFIG_FTRACE_SYSCALLS events. The proposed
seccomp_filter patches use this code.
Signed-off-by: Will Drewry <wad@chromium.org>
---
include/trace/syscall.h | 4 ++
kernel/trace/trace_syscalls.c | 96 +++++++++++++++++++++++++++++++++++------
2 files changed, 86 insertions(+), 14 deletions(-)
diff --git a/include/trace/syscall.h b/include/trace/syscall.h
index 31966a4..242ae04 100644
--- a/include/trace/syscall.h
+++ b/include/trace/syscall.h
@@ -41,6 +41,10 @@ extern int reg_event_syscall_exit(struct ftrace_event_call *call);
extern void unreg_event_syscall_exit(struct ftrace_event_call *call);
extern int
ftrace_format_syscall(struct ftrace_event_call *call, struct trace_seq *s);
+extern int ftrace_syscall_enter_state(u8 *buf, size_t available,
+ struct trace_entry **entry);
+extern size_t ftrace_syscall_enter_state_size(int nb_args);
+extern size_t ftrace_syscall_exit_state_size(void);
enum print_line_t print_syscall_enter(struct trace_iterator *iter, int flags,
struct trace_event *event);
enum print_line_t print_syscall_exit(struct trace_iterator *iter, int flags,
diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c
index ee7b5a0..f37f120 100644
--- a/kernel/trace/trace_syscalls.c
+++ b/kernel/trace/trace_syscalls.c
@@ -95,7 +95,7 @@ find_syscall_meta(unsigned long syscall)
return NULL;
}
-static struct syscall_metadata *syscall_nr_to_meta(int nr)
+struct syscall_metadata *syscall_nr_to_meta(int nr)
{
if (!syscalls_metadata || nr >= NR_syscalls || nr < 0)
return NULL;
@@ -498,7 +498,7 @@ static int sys_perf_refcount_exit;
static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id)
{
struct syscall_metadata *sys_data;
- struct syscall_trace_enter *rec;
+ void *buf;
struct hlist_head *head;
int syscall_nr;
int rctx;
@@ -513,25 +513,22 @@ static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id)
return;
/* get the size after alignment with the u32 buffer size field */
- size = sizeof(unsigned long) * sys_data->nb_args + sizeof(*rec);
- size = ALIGN(size + sizeof(u32), sizeof(u64));
- size -= sizeof(u32);
+ size = ftrace_syscall_enter_state_size(sys_data->nb_args);
if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE,
"perf buffer not large enough"))
return;
- rec = (struct syscall_trace_enter *)perf_trace_buf_prepare(size,
- sys_data->enter_event->event.type, regs, &rctx);
- if (!rec)
+ buf = perf_trace_buf_prepare(size, sys_data->enter_event->event.type,
+ regs, &rctx);
+ if (!buf)
return;
- rec->nr = syscall_nr;
- syscall_get_arguments(current, regs, 0, sys_data->nb_args,
- (unsigned long *)&rec->args);
+ /* The only error conditions in this helper are handled above. */
+ ftrace_syscall_enter_state(buf, size, NULL);
head = this_cpu_ptr(sys_data->enter_event->perf_events);
- perf_trace_buf_submit(rec, size, rctx, 0, 1, regs, head);
+ perf_trace_buf_submit(buf, size, rctx, 0, 1, regs, head);
}
int perf_sysenter_enable(struct ftrace_event_call *call)
@@ -587,8 +584,7 @@ static void perf_syscall_exit(void *ignore, struct pt_regs *regs, long ret)
return;
/* We can probably do that at build time */
- size = ALIGN(sizeof(*rec) + sizeof(u32), sizeof(u64));
- size -= sizeof(u32);
+ size = ftrace_syscall_exit_state_size();
/*
* Impossible, but be paranoid with the future
@@ -688,3 +684,75 @@ static int syscall_exit_register(struct ftrace_event_call *event,
}
return 0;
}
+
+/* ftrace_syscall_enter_state_size - returns the state size required.
+ *
+ * @nb_args: number of system call args expected.
+ * a negative value implies the maximum allowed.
+ */
+size_t ftrace_syscall_enter_state_size(int nb_args)
+{
+ /* syscall_get_arguments only supports up to 6 arguments. */
+ int arg_count = (nb_args >= 0 ? nb_args : 6);
+ size_t size = (sizeof(unsigned long) * arg_count) +
+ sizeof(struct syscall_trace_enter);
+ size = ALIGN(size + sizeof(u32), sizeof(u64));
+ size -= sizeof(u32);
+ return size;
+}
+EXPORT_SYMBOL_GPL(ftrace_syscall_enter_state_size);
+
+size_t ftrace_syscall_exit_state_size(void)
+{
+ return ALIGN(sizeof(struct syscall_trace_exit) + sizeof(u32),
+ sizeof(u64)) - sizeof(u32);
+}
+EXPORT_SYMBOL_GPL(ftrace_syscall_exit_state_size);
+
+/* ftrace_syscall_enter_state - build state for filter matching
+ *
+ * @buf: buffer to populate with current task state for matching
+ * @available: size available for use in the buffer.
+ * @entry: optional pointer to the trace_entry member of the state.
+ *
+ * Returns 0 on success and non-zero otherwise.
+ * If @entry is NULL, it will be ignored.
+ */
+int ftrace_syscall_enter_state(u8 *buf, size_t available,
+ struct trace_entry **entry)
+{
+ struct syscall_trace_enter *sys_enter;
+ struct syscall_metadata *sys_data;
+ int size;
+ int syscall_nr;
+ struct pt_regs *regs = task_pt_regs(current);
+
+ syscall_nr = syscall_get_nr(current, regs);
+ if (syscall_nr < 0)
+ return -EINVAL;
+
+ sys_data = syscall_nr_to_meta(syscall_nr);
+ if (!sys_data)
+ return -EINVAL;
+
+ /* Determine the actual size needed. */
+ size = sizeof(unsigned long) * sys_data->nb_args +
+ sizeof(struct syscall_trace_enter);
+ size = ALIGN(size + sizeof(u32), sizeof(u64));
+ size -= sizeof(u32);
+
+ BUG_ON(size > available);
+ sys_enter = (struct syscall_trace_enter *)buf;
+
+ /* Populating the struct trace_entry is left to the caller, but
+ * a pointer is returned to encourage opacity.
+ */
+ if (entry)
+ *entry = &sys_enter->ent;
+
+ sys_enter->nr = syscall_nr;
+ syscall_get_arguments(current, regs, 0, sys_data->nb_args,
+ sys_enter->args);
+ return 0;
+}
+EXPORT_SYMBOL_GPL(ftrace_syscall_enter_state);
--
1.7.0.4
^ permalink raw reply related [flat|nested] 91+ messages in thread* Re: [PATCH v3 02/13] tracing: split out syscall_trace_enter construction
2011-06-01 3:10 ` [PATCH v3 02/13] tracing: split out syscall_trace_enter construction Will Drewry
@ 2011-06-01 7:00 ` Ingo Molnar
2011-06-01 17:15 ` Will Drewry
0 siblings, 1 reply; 91+ messages in thread
From: Ingo Molnar @ 2011-06-01 7:00 UTC (permalink / raw)
To: Will Drewry
Cc: linux-kernel, kees.cook, torvalds, tglx, rostedt, jmorris,
Frederic Weisbecker, Ingo Molnar
* Will Drewry <wad@chromium.org> wrote:
> perf appears to be the primary consumer of the CONFIG_FTRACE_SYSCALLS
> infrastructure. As such, many of the helpers targeted at perf can be split
> into a perf-focused helper and a generic CONFIG_FTRACE_SYSCALLS
> consumer interface.
>
> This change splits out syscall_trace_enter construction from
> perf_syscall_enter for current into two helpers:
> - ftrace_syscall_enter_state
> - ftrace_syscall_enter_state_size
>
> And adds another helper for completeness:
> - ftrace_syscall_exit_state_size
>
> These helpers allow for shared code between perf ftrace events and
> any other consumers of CONFIG_FTRACE_SYSCALLS events. The proposed
> seccomp_filter patches use this code.
>
> Signed-off-by: Will Drewry <wad@chromium.org>
> ---
> include/trace/syscall.h | 4 ++
> kernel/trace/trace_syscalls.c | 96 +++++++++++++++++++++++++++++++++++------
> 2 files changed, 86 insertions(+), 14 deletions(-)
So, looking at the diffstat comparison again:
bitmask (2009): 6 files changed, 194 insertions(+), 22 deletions(-)
filter engine (2010): 18 files changed, 1100 insertions(+), 21 deletions(-)
event filters (2011): 5 files changed, 82 insertions(+), 16 deletions(-)
you went back to the middle solution again which is the worst of them
- why?
If you want this to be a stupid, limited hack then go for the v1
bitmask.
If you agree with my observation that filters allow the clean
user-space implementation of LSM equivalent security solutions (of
which sandboxes are just a *narrow special case*) then please use the
main highlevel abstraction we have defined around them: event
filters.
Now, my observation was not uncontested so let me try to sum up the
rather large discussion that erupted around it, as i see it.
I saw four main counter arguments:
- "Sandboxing is special and should stay separate from LSMs."
I think this is a technically bogus argument, see:
https://lkml.org/lkml/2011/5/26/85
That answer of mine went unchallenged.
- "Events should only be observers."
Even ignoring the question of why on earth it should be a problem
for a willing call-site to use event filtering results sensibly,
this argument misses the plain fact that events are *already*
active participants, see:
http://www.spinics.net/lists/mips/msg41075.html
That answer of mine went unchallenged too.
- "This feature is too simplistic."
That's wrong i think, the feature is highly flexible:
http://www.mail-archive.com/linuxppc-dev@lists.ozlabs.org/msg51387.html
This reply of mine went unchallenged as well.
- "Is this feature actually useful enough for applications, does it
justify the complexity?"
This is the *only* valid technical counter-argument i saw, and it's
a crucial one that is not fully answered yet. Since i think the feature
is an LSM equivalent i think it's at least as useful as any LSM is.
- [ if i missed any important argument then someone please insert it
here. ]
But what you do here is to use the filter engine directly which is
both a limited hack *and* complex (beyond the linecount it doubles
our ABI exposure, amongst other things), so i find that approach
rather counter-productive, now that i've seen the real thing.
Will this feature be just another example of the LSM status quo
dragging down a newcomer into the mud, until it's just as sucky and
limited as any existing LSMs? That would be a sad outcome!
Thanks,
Ingo
ps. Please start a new discussion thread for the next iteration!
This one is *way* too deep already.
^ permalink raw reply [flat|nested] 91+ messages in thread* Re: [PATCH v3 02/13] tracing: split out syscall_trace_enter construction
2011-06-01 7:00 ` Ingo Molnar
@ 2011-06-01 17:15 ` Will Drewry
2011-06-02 14:29 ` Ingo Molnar
0 siblings, 1 reply; 91+ messages in thread
From: Will Drewry @ 2011-06-01 17:15 UTC (permalink / raw)
To: Ingo Molnar
Cc: linux-kernel, kees.cook, torvalds, tglx, rostedt, jmorris,
Frederic Weisbecker, Ingo Molnar
On Wed, Jun 1, 2011 at 2:00 AM, Ingo Molnar <mingo@elte.hu> wrote:
>
> * Will Drewry <wad@chromium.org> wrote:
>
>> perf appears to be the primary consumer of the CONFIG_FTRACE_SYSCALLS
>> infrastructure. As such, many of the helpers targeted at perf can be split
>> into a perf-focused helper and a generic CONFIG_FTRACE_SYSCALLS
>> consumer interface.
>>
>> This change splits out syscall_trace_enter construction from
>> perf_syscall_enter for current into two helpers:
>> - ftrace_syscall_enter_state
>> - ftrace_syscall_enter_state_size
>>
>> And adds another helper for completeness:
>> - ftrace_syscall_exit_state_size
>>
>> These helpers allow for shared code between perf ftrace events and
>> any other consumers of CONFIG_FTRACE_SYSCALLS events. The proposed
>> seccomp_filter patches use this code.
>>
>> Signed-off-by: Will Drewry <wad@chromium.org>
>> ---
>> include/trace/syscall.h | 4 ++
>> kernel/trace/trace_syscalls.c | 96 +++++++++++++++++++++++++++++++++++------
>> 2 files changed, 86 insertions(+), 14 deletions(-)
>
> So, looking at the diffstat comparison again:
>
> bitmask (2009): 6 files changed, 194 insertions(+), 22 deletions(-)
> filter engine (2010): 18 files changed, 1100 insertions(+), 21 deletions(-)
> event filters (2011): 5 files changed, 82 insertions(+), 16 deletions(-)
>
> you went back to the middle solution again which is the worst of them
> - why?
In short, design for the future and implement now. I'll elaborate a
bit more below.
> If you want this to be a stupid, limited hack then go for the v1
> bitmask.
I only aim for the finest!
(bitmasks were bad for the other consumers of this patch series:
socketcall multiplexing issues and ioctl # filtering).
> If you agree with my observation that filters allow the clean
> user-space implementation of LSM equivalent security solutions (of
> which sandboxes are just a *narrow special case*) then please use the
> main highlevel abstraction we have defined around them: event
> filters.
I agree that LSM-equivalent security solutions can be moved over to an
ftrace based infrastructure. However, LSMs and seccomp have different
semantics. Reducing the kernel attack surface in a
"sandboxing"-sort-of-way requires a default-deny interface that is
resilient to kernel changes (like new system calls) without
immediately degrading robustness. LSMs provide a fail-open mechanism
for taking an active role in kernel-defined pinch points. It is
possible to implement a default-deny LSM, but it requires a "hook" for
every security event and the addition of a security event results in a
hole in the not-so-default-deny infrastructure. ftrace + event
filters are the same.
Based on my observations while exploring the code, it appears that the
LSM security_* calls could easily become active trace events and the
LSM infrastructure moved over to use those as tracepoints or via
event_filters. There will be a need for new predicates for the
various new types (inode *, etc), and so on. However, the
trace_sys_enter/__secure_computing model will still be a special case.
Even if they fed into security event subsystem or something like
that, the absence of filters on a traced process would need to
default-deny as well as when there are no active matches. So while a
brand-new shared ABI may be possible (security_event_open,
active_event_open, ?), there will still be trickiness in making the
behaviors not have implicit side effects and ensure that newly added
system calls, for instance, that lack the macro wrapper don't poke a
hole in the "sandbox" model. There are a lot of options for designing
it though. Like making TIF_SECCOMP mean that any security_* filter
failure or match count of 0 == process death. It's just that
designing this new approach will be incredibly hairy, and we really
lack many of the concrete requirements that would be needed, in my
opinion.
> Now, my observation was not uncontested so let me try to sum up the
> rather large discussion that erupted around it, as i see it.
>
> I saw four main counter arguments:
>
> - "Sandboxing is special and should stay separate from LSMs."
>
> I think this is a technically bogus argument, see:
>
> https://lkml.org/lkml/2011/5/26/85
>
> That answer of mine went unchallenged.
I may have spoken to this above. I dunno.
> - "Events should only be observers."
>
> Even ignoring the question of why on earth it should be a problem
> for a willing call-site to use event filtering results sensibly,
> this argument misses the plain fact that events are *already*
> active participants, see:
>
> http://www.spinics.net/lists/mips/msg41075.html
>
> That answer of mine went unchallenged too.
>
> - "This feature is too simplistic."
>
> That's wrong i think, the feature is highly flexible:
>
> http://www.mail-archive.com/linuxppc-dev@lists.ozlabs.org/msg51387.html
>
> This reply of mine went unchallenged as well.
Well I did only implement a PoC. It couldn't handle attack surface
reduction after-the-fact, nor did I add a GET_FILTER call, etc. The
code was minimal in many ways because the functionality was too.
> - "Is this feature actually useful enough for applications, does it
> justify the complexity?"
>
> This is the *only* valid technical counter-argument i saw, and it's
> a crucial one that is not fully answered yet. Since i think the feature
> is an LSM equivalent i think it's at least as useful as any LSM is.
>
> - [ if i missed any important argument then someone please insert it
> here. ]
>
> But what you do here is to use the filter engine directly which is
> both a limited hack *and* complex (beyond the linecount it doubles
> our ABI exposure, amongst other things), so i find that approach
> rather counter-productive, now that i've seen the real thing.
>
> Will this feature be just another example of the LSM status quo
> dragging down a newcomer into the mud, until it's just as sucky and
> limited as any existing LSMs? That would be a sad outcome!
I hope not. I believe it will be easy to move the backend of
seccomp_filter over to a per-task ftrace event filter infrastructure
when that comes in the future. But for now, I'm trying to meet the
needs of possible consumers now: chromium, qemu, lxc, and lay
groundwork for a ftrace-future.
If this is a total fail, then perhaps we should have a separate
discussion over how we can tackle a lot of these needs. I was hoping
that we could push some of that off to the LinuxSecuritySummit -- I've
proposed/requested a QA panel on this topic :) But I'd love to not
wait until then for everything.
> ps. Please start a new discussion thread for the next iteration!
> This one is *way* too deep already.
Sorry - will do!
thanks!
will
^ permalink raw reply [flat|nested] 91+ messages in thread
* Re: [PATCH v3 02/13] tracing: split out syscall_trace_enter construction
2011-06-01 17:15 ` Will Drewry
@ 2011-06-02 14:29 ` Ingo Molnar
2011-06-02 15:18 ` Will Drewry
0 siblings, 1 reply; 91+ messages in thread
From: Ingo Molnar @ 2011-06-02 14:29 UTC (permalink / raw)
To: Will Drewry
Cc: linux-kernel, kees.cook, torvalds, tglx, rostedt, jmorris,
Frederic Weisbecker, Ingo Molnar
* Will Drewry <wad@chromium.org> wrote:
> > If you agree with my observation that filters allow the clean
> > user-space implementation of LSM equivalent security solutions
> > (of which sandboxes are just a *narrow special case*) then please
> > use the main highlevel abstraction we have defined around them:
> > event filters.
>
> I agree that LSM-equivalent security solutions can be moved over to
> an ftrace based infrastructure. However, LSMs and seccomp have
> different semantics. Reducing the kernel attack surface in a
> "sandboxing"-sort-of-way requires a default-deny interface that is
> resilient to kernel changes (like new system calls) without
> immediately degrading robustness. [...]
Correct. Because seccomp is the user of those syscall-surface events
it can use them in such a way - i see no problem there: unknown or
not permitted syscalls get denied for seccomp-mode-2 tasks.
> [...] LSMs provide a fail-open mechanism for taking an active role
> in kernel-defined pinch points. It is possible to implement a
> default-deny LSM, but it requires a "hook" for every security event
> and the addition of a security event results in a hole in the
> not-so-default-deny infrastructure. ftrace + event filters are the
> same.
Well, i only suggested that it's LSM-equivalent security
functionality, i did not suggest that you should implement an LSM in
security/. I do not think the LSM modularization is particularly well
fit for seccomp.
> Based on my observations while exploring the code, it appears that
> the LSM security_* calls could easily become active trace events
> and the LSM infrastructure moved over to use those as tracepoints
> or via event_filters. There will be a need for new predicates for
> the various new types (inode *, etc), and so on. However, the
> trace_sys_enter/__secure_computing model will still be a special
> case.
Yes, and that special event will not go away!
I did not suggest to *replace* those events with the security events.
I suggested to *combine* them - or at least have a model that
smoothly extends to those events as well and does not limit itself to
the syscall surface alone.
We'll want to have both.
But by hardcoding to only those events, and creating a
syscall-numbering special ABI, a wall will be raised between this
implementation and any future enhancement to cover other events. My
suggestion would be to use the event filter approach - that way
there's not a wall but an open door towards future extensions ;-)
Thanks,
Ingo
^ permalink raw reply [flat|nested] 91+ messages in thread
* Re: [PATCH v3 02/13] tracing: split out syscall_trace_enter construction
2011-06-02 14:29 ` Ingo Molnar
@ 2011-06-02 15:18 ` Will Drewry
0 siblings, 0 replies; 91+ messages in thread
From: Will Drewry @ 2011-06-02 15:18 UTC (permalink / raw)
To: Ingo Molnar
Cc: linux-kernel, kees.cook, torvalds, tglx, rostedt, jmorris,
Frederic Weisbecker, Ingo Molnar
On Thu, Jun 2, 2011 at 9:29 AM, Ingo Molnar <mingo@elte.hu> wrote:
>
> * Will Drewry <wad@chromium.org> wrote:
>
[...]
>
>> Based on my observations while exploring the code, it appears that
>> the LSM security_* calls could easily become active trace events
>> and the LSM infrastructure moved over to use those as tracepoints
>> or via event_filters. There will be a need for new predicates for
>> the various new types (inode *, etc), and so on. However, the
>> trace_sys_enter/__secure_computing model will still be a special
>> case.
>
> Yes, and that special event will not go away!
>
> I did not suggest to *replace* those events with the security events.
> I suggested to *combine* them - or at least have a model that
> smoothly extends to those events as well and does not limit itself to
> the syscall surface alone.
>
> We'll want to have both.
>
> But by hardcoding to only those events, and creating a
> syscall-numbering special ABI, a wall will be raised between this
> implementation and any future enhancement to cover other events. My
> suggestion would be to use the event filter approach - that way
> there's not a wall but an open door towards future extensions ;-)
Yeah, I can definitely see that. We could have the prctl interface
take in the event id, but that introduces dependency on
CONFIG_PERF_EVENTS in addition
(to get the id exported) and means we'll have much more limited
coverage of syscalls until the syscall wrapping matures.
Could this be resolved in the proposed change by supporting both
mechanisms? Or is that just asking for trouble?
E.g., it could be an extra field:
prctl(PR_SET_SECCOMP_FILTER, PR_SECCOMP_FILTER_TYPE_EVENT, event_id,
filter_string);
prctl(PR_SET_SECCOMP_FILTER, PR_SECCOMP_FILTER_TYPE_SYSCALL,
__NR_somesyscall, filter_string);
[and the same for CLEAR_FILTER and GET_FILTER]
or even reserve negative values for event ids and positive for
syscalls (which feels more hackish). Adding event_id support wouldn't
be much more additional code (since it's just a layer of
dereferencing). Since there will likely be syscall-indexed entry
behavior no matter what (like there is for ftrace/perf_sysenter), it
won't necessarily be a large diversion in the future either.
If not, seccomp_filter could depend on both FTRACE_SYSCALLS and
exported PERF_EVENTS (or make "id"s not perf_event specific), then it
could just use the sys_enter event ids. Doing so does have some other
properties that I'm not as fond of, like requiring debugfs to be
compiled in, mounted, and readable by the caller in order to construct
a filterset, so I can still see some benefit for the syscall number
use in some cases (much easier to deploy on a server without debugfs
access, etc). Right now, having both interfaces doesn't really give
us anything, but having the field set aside for future exploration
isn't necessarily a bad thing!
What do you think? Would a change to support both be too crazy/dumb or
just crazy/dumb enough? Or do you see another path that could avoid
isolating any current work from a more fruitful future?
thanks!
will
^ permalink raw reply [flat|nested] 91+ messages in thread
* [PATCH v3 03/13] seccomp_filters: new mode with configurable syscall filters
2011-05-26 18:49 ` Will Drewry
2011-06-01 3:10 ` [PATCH v3 01/13] tracing: split out filter initialization and clean up Will Drewry
2011-06-01 3:10 ` [PATCH v3 02/13] tracing: split out syscall_trace_enter construction Will Drewry
@ 2011-06-01 3:10 ` Will Drewry
2011-06-02 17:36 ` Paul E. McKenney
2011-06-01 3:10 ` [PATCH v3 04/13] seccomp_filter: add process state reporting Will Drewry
` (9 subsequent siblings)
12 siblings, 1 reply; 91+ messages in thread
From: Will Drewry @ 2011-06-01 3:10 UTC (permalink / raw)
To: linux-kernel
Cc: kees.cook, torvalds, tglx, mingo, rostedt, jmorris, Will Drewry,
Peter Zijlstra, Frederic Weisbecker, linux-security-module
This change adds a new seccomp mode which specifies the allowed system
calls dynamically. When in the new mode (2), all system calls are
checked against process-defined filters - first by system call number,
then by a filter string. If an entry exists for a given system call and
all filter predicates evaluate to true, then the task may proceed.
Otherwise, the task is killed.
Filter string parsing and evaluation is handled by the ftrace filter
engine. Related patches tweak the perf filter trace and free
allowing the calls to be shared. Filters inherit their understanding of
types and arguments for each system call from the CONFIG_FTRACE_SYSCALLS
subsystem which already populates this information in syscall_metadata
associated enter_event (and exit_event) structures. If
CONFIG_FTRACE_SYSCALLS is not compiled in, only filter strings of "1"
will be allowed.
The net result is a process may have its system calls filtered using the
ftrace filter engine's inherent understanding of system calls. The set
of filters is specified through the PR_SET_SECCOMP_FILTER argument in
prctl(). For example, a filterset for a process, like pdftotext, that
should only process read-only input could (roughly) look like:
sprintf(rdonly, "flags == %u", O_RDONLY|O_LARGEFILE);
prctl(PR_SET_SECCOMP_FILTER, __NR_open, rdonly);
prctl(PR_SET_SECCOMP_FILTER, __NR__llseek, "1");
prctl(PR_SET_SECCOMP_FILTER, __NR_brk, "1");
prctl(PR_SET_SECCOMP_FILTER, __NR_close, "1");
prctl(PR_SET_SECCOMP_FILTER, __NR_exit_group, "1");
prctl(PR_SET_SECCOMP_FILTER, __NR_fstat64, "1");
prctl(PR_SET_SECCOMP_FILTER, __NR_mmap2, "1");
prctl(PR_SET_SECCOMP_FILTER, __NR_munmap, "1");
prctl(PR_SET_SECCOMP_FILTER, __NR_read, "1");
prctl(PR_SET_SECCOMP_FILTER, __NR_write, "(fd == 1 || fd == 2)");
prctl(PR_SET_SECCOMP, 2);
Subsequent calls to PR_SET_SECCOMP_FILTER for the same system call will
be &&'d together to ensure that attack surface may only be reduced:
prctl(PR_SET_SECCOMP_FILTER, __NR_write, "fd != 2");
With the earlier example, the active filter becomes:
"(fd == 1 || fd == 2) && fd != 2"
The patch also adds PR_CLEAR_SECCOMP_FILTER and PR_GET_SECCOMP_FILTER.
The latter returns the current filter for a system call to userspace:
prctl(PR_GET_SECCOMP_FILTER, __NR_write, buf, bufsize);
while the former clears any filters for a given system call changing it
back to a default deny:
prctl(PR_CLEAR_SECCOMP_FILTER, __NR_write);
v3: - always block execve calls (as per linus torvalds)
- add __NR_seccomp_execve(_32) to seccomp-supporting arches
- ensure compat tasks can't reach ftrace:syscalls
- dropped new defines for seccomp modes.
- two level array instead of hlists (sugg. by olof johansson)
- added generic Kconfig entry that is not connected.
- dropped internal seccomp.h
- move prctl helpers to seccomp_filter
- killed seccomp_t typedef (as per checkpatch)
v2: - changed to use the existing syscall number ABI.
- prctl changes to minimize parsing in the kernel:
prctl(PR_SET_SECCOMP, {0 | 1 | 2 }, { 0 | ON_EXEC });
prctl(PR_SET_SECCOMP_FILTER, __NR_read, "fd == 5");
prctl(PR_CLEAR_SECCOMP_FILTER, __NR_read);
prctl(PR_GET_SECCOMP_FILTER, __NR_read, buf, bufsize);
- defined PR_SECCOMP_MODE_STRICT and ..._FILTER
- added flags
- provide a default fail syscall_nr_to_meta in ftrace
- provides fallback for unhooked system calls
- use -ENOSYS and ERR_PTR(-ENOSYS) for stubbed functionality
- added kernel/seccomp.h to share seccomp.c/seccomp_filter.c
- moved to a hlist and 4 bit hash of linked lists
- added support to operate without CONFIG_FTRACE_SYSCALLS
- moved Kconfig support next to SECCOMP
- made Kconfig entries dependent on EXPERIMENTAL
- added macros to avoid ifdefs from kernel/fork.c
- added compat task/filter matching
- drop seccomp.h inclusion in sched.h and drop seccomp_t
- added Filtering to "show" output
- added on_exec state dup'ing when enabling after a fast-path accept.
Signed-off-by: Will Drewry <wad@chromium.org>
---
include/linux/prctl.h | 5 +
include/linux/sched.h | 2 +-
include/linux/seccomp.h | 98 ++++++-
include/trace/syscall.h | 7 +
kernel/Makefile | 3 +
kernel/fork.c | 3 +
kernel/seccomp.c | 38 ++-
kernel/seccomp_filter.c | 784 +++++++++++++++++++++++++++++++++++++++++++++++
kernel/sys.c | 13 +-
security/Kconfig | 17 +
10 files changed, 954 insertions(+), 16 deletions(-)
create mode 100644 kernel/seccomp_filter.c
diff --git a/include/linux/prctl.h b/include/linux/prctl.h
index a3baeb2..44723ce 100644
--- a/include/linux/prctl.h
+++ b/include/linux/prctl.h
@@ -64,6 +64,11 @@
#define PR_GET_SECCOMP 21
#define PR_SET_SECCOMP 22
+/* Get/set process seccomp filters */
+#define PR_GET_SECCOMP_FILTER 35
+#define PR_SET_SECCOMP_FILTER 36
+#define PR_CLEAR_SECCOMP_FILTER 37
+
/* Get/set the capability bounding set (as per security/commoncap.c) */
#define PR_CAPBSET_READ 23
#define PR_CAPBSET_DROP 24
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 18d63ce..3f0bc8d 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1374,7 +1374,7 @@ struct task_struct {
uid_t loginuid;
unsigned int sessionid;
#endif
- seccomp_t seccomp;
+ struct seccomp_struct seccomp;
/* Thread group tracking */
u32 parent_exec_id;
diff --git a/include/linux/seccomp.h b/include/linux/seccomp.h
index 167c333..f4434ca 100644
--- a/include/linux/seccomp.h
+++ b/include/linux/seccomp.h
@@ -1,13 +1,33 @@
#ifndef _LINUX_SECCOMP_H
#define _LINUX_SECCOMP_H
+struct seq_file;
#ifdef CONFIG_SECCOMP
+#include <linux/errno.h>
#include <linux/thread_info.h>
+#include <linux/types.h>
#include <asm/seccomp.h>
-typedef struct { int mode; } seccomp_t;
+struct seccomp_filters;
+/**
+ * struct seccomp_struct - the state of a seccomp'ed process
+ *
+ * @mode:
+ * if this is 1, the process is under standard seccomp rules
+ * is 2, the process is only allowed to make system calls where
+ * associated filters evaluate successfully.
+ * @filters: Metadata for filters if using CONFIG_SECCOMP_FILTER.
+ * filters assignment/use should be RCU-protected and its contents
+ * should never be modified when attached to a seccomp_struct.
+ */
+struct seccomp_struct {
+ uint16_t mode;
+#ifdef CONFIG_SECCOMP_FILTER
+ struct seccomp_filters *filters;
+#endif
+};
extern void __secure_computing(int);
static inline void secure_computing(int this_syscall)
@@ -16,15 +36,14 @@ static inline void secure_computing(int this_syscall)
__secure_computing(this_syscall);
}
-extern long prctl_get_seccomp(void);
extern long prctl_set_seccomp(unsigned long);
+extern long prctl_get_seccomp(void);
#else /* CONFIG_SECCOMP */
#include <linux/errno.h>
-typedef struct { } seccomp_t;
-
+struct seccomp_struct { };
#define secure_computing(x) do { } while (0)
static inline long prctl_get_seccomp(void)
@@ -32,11 +51,80 @@ static inline long prctl_get_seccomp(void)
return -EINVAL;
}
-static inline long prctl_set_seccomp(unsigned long arg2)
+/* Stub for !CONFIG_SECCOMP builds: setting a seccomp mode always fails. */
+static inline long prctl_set_seccomp(unsigned long a2)
{
return -EINVAL;
}
#endif /* CONFIG_SECCOMP */
+#ifdef CONFIG_SECCOMP_FILTER
+
+#define inherit_tsk_seccomp(_child, _orig) do { \
+ _child->seccomp.mode = _orig->seccomp.mode; \
+ _child->seccomp.filters = get_seccomp_filters(_orig->seccomp.filters); \
+ } while (0)
+#define put_tsk_seccomp(_tsk) put_seccomp_filters(_tsk->seccomp.filters)
+
+extern int seccomp_show_filters(struct seccomp_filters *filters,
+ struct seq_file *);
+extern long seccomp_set_filter(int, char *);
+extern long seccomp_clear_filter(int);
+extern long seccomp_get_filter(int, char *, unsigned long);
+
+extern long prctl_set_seccomp_filter(unsigned long, char __user *);
+extern long prctl_get_seccomp_filter(unsigned long, char __user *,
+ unsigned long);
+extern long prctl_clear_seccomp_filter(unsigned long);
+
+extern struct seccomp_filters *get_seccomp_filters(struct seccomp_filters *);
+extern void put_seccomp_filters(struct seccomp_filters *);
+
+extern int seccomp_test_filters(int);
+extern void seccomp_filter_log_failure(int);
+
+#else /* CONFIG_SECCOMP_FILTER */
+
+struct seccomp_filters { };
+#define inherit_tsk_seccomp(_child, _orig) do { } while (0)
+#define put_tsk_seccomp(_tsk) do { } while (0)
+
+static inline int seccomp_show_filters(struct seccomp_filters *filters,
+ struct seq_file *m)
+{
+ return -ENOSYS;
+}
+
+static inline long seccomp_set_filter(int syscall_nr, char *filter)
+{
+ return -ENOSYS;
+}
+
+static inline long seccomp_clear_filter(int syscall_nr)
+{
+ return -ENOSYS;
+}
+
+static inline long seccomp_get_filter(int syscall_nr,
+ char *buf, unsigned long available)
+{
+ return -ENOSYS;
+}
+
+static inline long prctl_set_seccomp_filter(unsigned long a2, char __user *a3)
+{
+ return -ENOSYS;
+}
+
+static inline long prctl_clear_seccomp_filter(unsigned long a2)
+{
+ return -ENOSYS;
+}
+
+static inline long prctl_get_seccomp_filter(unsigned long a2, char __user *a3,
+ unsigned long a4)
+{
+ return -ENOSYS;
+}
+#endif /* CONFIG_SECCOMP_FILTER */
#endif /* _LINUX_SECCOMP_H */
diff --git a/include/trace/syscall.h b/include/trace/syscall.h
index 242ae04..e061ad0 100644
--- a/include/trace/syscall.h
+++ b/include/trace/syscall.h
@@ -35,6 +35,8 @@ struct syscall_metadata {
extern unsigned long arch_syscall_addr(int nr);
extern int init_syscall_trace(struct ftrace_event_call *call);
+extern struct syscall_metadata *syscall_nr_to_meta(int);
+
extern int reg_event_syscall_enter(struct ftrace_event_call *call);
extern void unreg_event_syscall_enter(struct ftrace_event_call *call);
extern int reg_event_syscall_exit(struct ftrace_event_call *call);
@@ -49,6 +51,11 @@ enum print_line_t print_syscall_enter(struct trace_iterator *iter, int flags,
struct trace_event *event);
enum print_line_t print_syscall_exit(struct trace_iterator *iter, int flags,
struct trace_event *event);
+#else
+static inline struct syscall_metadata *syscall_nr_to_meta(int nr)
+{
+ return NULL;
+}
#endif
#ifdef CONFIG_PERF_EVENTS
diff --git a/kernel/Makefile b/kernel/Makefile
index 85cbfb3..84e7dfb 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -81,6 +81,9 @@ obj-$(CONFIG_DETECT_HUNG_TASK) += hung_task.o
obj-$(CONFIG_LOCKUP_DETECTOR) += watchdog.o
obj-$(CONFIG_GENERIC_HARDIRQS) += irq/
obj-$(CONFIG_SECCOMP) += seccomp.o
+ifeq ($(CONFIG_SECCOMP_FILTER),y)
+obj-$(CONFIG_SECCOMP) += seccomp_filter.o
+endif
obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o
obj-$(CONFIG_TREE_RCU) += rcutree.o
obj-$(CONFIG_TREE_PREEMPT_RCU) += rcutree.o
diff --git a/kernel/fork.c b/kernel/fork.c
index e7548de..6f835e0 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -34,6 +34,7 @@
#include <linux/cgroup.h>
#include <linux/security.h>
#include <linux/hugetlb.h>
+#include <linux/seccomp.h>
#include <linux/swap.h>
#include <linux/syscalls.h>
#include <linux/jiffies.h>
@@ -169,6 +170,7 @@ void free_task(struct task_struct *tsk)
free_thread_info(tsk->stack);
rt_mutex_debug_task_free(tsk);
ftrace_graph_exit_task(tsk);
+ put_tsk_seccomp(tsk);
free_task_struct(tsk);
}
EXPORT_SYMBOL(free_task);
@@ -280,6 +282,7 @@ static struct task_struct *dup_task_struct(struct task_struct *orig)
if (err)
goto out;
+ inherit_tsk_seccomp(tsk, orig);
setup_thread_stack(tsk, orig);
clear_user_return_notifier(tsk);
clear_tsk_need_resched(tsk);
diff --git a/kernel/seccomp.c b/kernel/seccomp.c
index 57d4b13..0a942be 100644
--- a/kernel/seccomp.c
+++ b/kernel/seccomp.c
@@ -2,16 +2,20 @@
* linux/kernel/seccomp.c
*
* Copyright 2004-2005 Andrea Arcangeli <andrea@cpushare.com>
+ * Copyright (C) 2011 The Chromium OS Authors <chromium-os-dev@chromium.org>
*
* This defines a simple but solid secure-computing mode.
*/
#include <linux/seccomp.h>
#include <linux/sched.h>
+#include <linux/slab.h>
#include <linux/compat.h>
+#include <linux/unistd.h>
+#include <linux/ftrace_event.h>
+#define SECCOMP_MAX_FILTER_LENGTH MAX_FILTER_STR_VAL
/* #define SECCOMP_DEBUG 1 */
-#define NR_SECCOMP_MODES 1
/*
* Secure computing mode 1 allows only read/write/exit/sigreturn.
@@ -32,10 +36,9 @@ static int mode1_syscalls_32[] = {
void __secure_computing(int this_syscall)
{
- int mode = current->seccomp.mode;
int * syscall;
- switch (mode) {
+ switch (current->seccomp.mode) {
case 1:
syscall = mode1_syscalls;
#ifdef CONFIG_COMPAT
@@ -47,6 +50,17 @@ void __secure_computing(int this_syscall)
return;
} while (*++syscall);
break;
+#ifdef CONFIG_SECCOMP_FILTER
+ case 2:
+ if (this_syscall >= NR_syscalls || this_syscall < 0)
+ break;
+
+ if (!seccomp_test_filters(this_syscall))
+ return;
+
+ seccomp_filter_log_failure(this_syscall);
+ break;
+#endif
default:
BUG();
}
@@ -71,16 +85,22 @@ long prctl_set_seccomp(unsigned long seccomp_mode)
if (unlikely(current->seccomp.mode))
goto out;
- ret = -EINVAL;
- if (seccomp_mode && seccomp_mode <= NR_SECCOMP_MODES) {
- current->seccomp.mode = seccomp_mode;
- set_thread_flag(TIF_SECCOMP);
+ ret = 0;
+ switch (seccomp_mode) {
+ case 1:
#ifdef TIF_NOTSC
disable_TSC();
#endif
- ret = 0;
+#ifdef CONFIG_SECCOMP_FILTER
+ case 2:
+#endif
+ current->seccomp.mode = seccomp_mode;
+ set_thread_flag(TIF_SECCOMP);
+ break;
+ default:
+ ret = -EINVAL;
}
- out:
+out:
return ret;
}
diff --git a/kernel/seccomp_filter.c b/kernel/seccomp_filter.c
new file mode 100644
index 0000000..9782f25
--- /dev/null
+++ b/kernel/seccomp_filter.c
@@ -0,0 +1,784 @@
+/* filter engine-based seccomp system call filtering
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ *
+ * Copyright (C) 2011 The Chromium OS Authors <chromium-os-dev@chromium.org>
+ */
+
+#include <linux/compat.h>
+#include <linux/err.h>
+#include <linux/errno.h>
+#include <linux/ftrace_event.h>
+#include <linux/seccomp.h>
+#include <linux/seq_file.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/uaccess.h>
+
+#include <asm/syscall.h>
+#include <trace/syscall.h>
+
+
+#define SECCOMP_MAX_FILTER_LENGTH MAX_FILTER_STR_VAL
+
+#define SECCOMP_FILTER_ALLOW "1"
+#define SECCOMP_ACTION_DENY 0xffff
+#define SECCOMP_ACTION_ALLOW 0xfffe
+
+/**
+ * struct seccomp_filters - container for seccomp filterset
+ *
+ * @syscalls: array of 16-bit indices into @event_filters by syscall_nr
+ * May also be SECCOMP_ACTION_DENY or SECCOMP_ACTION_ALLOW
+ * @event_filters: array of pointers to ftrace event objects
+ * @count: size of @event_filters
+ * @flags: anonymous struct to wrap filters-specific flags
+ * @usage: reference count to simplify use.
+ */
+struct seccomp_filters {
+ uint16_t syscalls[NR_syscalls];
+ struct event_filter **event_filters;
+ uint16_t count;
+ struct {
+ uint32_t compat:1,
+ __reserved:31;
+ } flags;
+ atomic_t usage;
+};
+
+/* Handle ftrace symbol non-existence */
+#ifdef CONFIG_FTRACE_SYSCALLS
+#define create_event_filter(_ef_pptr, _event_type, _str) \
+ ftrace_parse_filter(_ef_pptr, _event_type, _str)
+#define get_filter_string(_ef) ftrace_get_filter_string(_ef)
+#define free_event_filter(_f) ftrace_free_filter(_f)
+
+#else
+
+#define create_event_filter(_ef_pptr, _event_type, _str) (-ENOSYS)
+#define get_filter_string(_ef) (NULL)
+#define free_event_filter(_f) do { } while (0)
+#endif
+
+/**
+ * seccomp_filters_new - allocates a new filters object
+ * @count: count to allocate for the event_filters array
+ *
+ * Returns ERR_PTR on error or an allocated object.
+ */
+static struct seccomp_filters *seccomp_filters_new(uint16_t count)
+{
+ struct seccomp_filters *f;
+
+ if (count >= SECCOMP_ACTION_ALLOW)
+ return ERR_PTR(-EINVAL);
+
+ f = kzalloc(sizeof(struct seccomp_filters), GFP_KERNEL);
+ if (!f)
+ return ERR_PTR(-ENOMEM);
+
+ /* Lazy SECCOMP_ACTION_DENY assignment. */
+ memset(f->syscalls, 0xff, sizeof(f->syscalls));
+ atomic_set(&f->usage, 1);
+
+ f->event_filters = NULL;
+ f->count = count;
+ if (!count)
+ return f;
+
+ f->event_filters = kzalloc(count * sizeof(struct event_filter *),
+ GFP_KERNEL);
+ if (!f->event_filters) {
+ kfree(f);
+ f = ERR_PTR(-ENOMEM);
+ }
+ return f;
+}
+
+/**
+ * seccomp_filters_free - cleans up the filter list and frees the table
+ * @filters: NULL or live object to be completely destructed.
+ */
+static void seccomp_filters_free(struct seccomp_filters *filters)
+{
+ uint16_t count = 0;
+ if (!filters)
+ return;
+ while (count < filters->count) {
+ struct event_filter *f = filters->event_filters[count];
+ free_event_filter(f);
+ count++;
+ }
+ kfree(filters->event_filters);
+ kfree(filters);
+}
+
+static void __put_seccomp_filters(struct seccomp_filters *orig)
+{
+ WARN_ON(atomic_read(&orig->usage));
+ seccomp_filters_free(orig);
+}
+
+#define seccomp_filter_allow(_id) ((_id) == SECCOMP_ACTION_ALLOW)
+#define seccomp_filter_deny(_id) ((_id) == SECCOMP_ACTION_DENY)
+#define seccomp_filter_dynamic(_id) \
+ (!seccomp_filter_allow(_id) && !seccomp_filter_deny(_id))
+static inline uint16_t seccomp_filter_id(const struct seccomp_filters *f,
+ int syscall_nr)
+{
+ if (!f)
+ return SECCOMP_ACTION_DENY;
+ return f->syscalls[syscall_nr];
+}
+
+static inline struct event_filter *seccomp_dynamic_filter(
+ const struct seccomp_filters *filters, uint16_t id)
+{
+ if (!seccomp_filter_dynamic(id))
+ return NULL;
+ return filters->event_filters[id];
+}
+
+static inline void set_seccomp_filter_id(struct seccomp_filters *filters,
+ int syscall_nr, uint16_t id)
+{
+ filters->syscalls[syscall_nr] = id;
+}
+
+static inline void set_seccomp_filter(struct seccomp_filters *filters,
+ int syscall_nr, uint16_t id,
+ struct event_filter *dynamic_filter)
+{
+ filters->syscalls[syscall_nr] = id;
+ if (seccomp_filter_dynamic(id))
+ filters->event_filters[id] = dynamic_filter;
+}
+
+static struct event_filter *alloc_event_filter(int syscall_nr,
+ const char *filter_string)
+{
+ struct syscall_metadata *data;
+ struct event_filter *filter = NULL;
+ int err;
+
+ data = syscall_nr_to_meta(syscall_nr);
+ /* Argument-based filtering only works on ftrace-hooked syscalls. */
+ err = -ENOSYS;
+ if (!data)
+ goto fail;
+ err = create_event_filter(&filter,
+ data->enter_event->event.type,
+ filter_string);
+ if (err)
+ goto fail;
+
+ return filter;
+fail:
+ kfree(filter);
+ return ERR_PTR(err);
+}
+
+/**
+ * seccomp_filters_copy - copies filters from src to dst.
+ *
+ * @dst: seccomp_filters to populate.
+ * @src: table to read from.
+ * @skip: specifies an entry, by system call, to skip.
+ *
+ * Returns non-zero on failure.
+ * Both the source and the destination should have no simultaneous
+ * writers, and dst should be exclusive to the caller.
+ * If @skip is < 0, it is ignored.  Otherwise the @skip entry is set to
+ * SECCOMP_ACTION_DENY in @dst, whether or not @src holds any dynamic
+ * filters.
+ */
+static int seccomp_filters_copy(struct seccomp_filters *dst,
+				const struct seccomp_filters *src,
+				int skip)
+{
+	int id = 0, ret = 0, nr;
+	memcpy(&dst->flags, &src->flags, sizeof(src->flags));
+	memcpy(dst->syscalls, src->syscalls, sizeof(dst->syscalls));
+	/*
+	 * Drop the skipped entry up front: the early exit below would
+	 * otherwise leave a copied SECCOMP_ACTION_ALLOW entry for @skip in
+	 * place when @src has no dynamic filters.
+	 */
+	if (skip >= 0 && skip < NR_syscalls)
+		set_seccomp_filter_id(dst, skip, SECCOMP_ACTION_DENY);
+	if (!src->count)
+		goto done;
+	for (nr = 0; nr < NR_syscalls; ++nr) {
+		struct event_filter *filter;
+		const char *str;
+		uint16_t src_id = seccomp_filter_id(src, nr);
+		/* Already marked as denied above. */
+		if (nr == skip)
+			continue;
+		if (!seccomp_filter_dynamic(src_id))
+			continue;
+		/* dst must have been sized for every dynamic entry kept. */
+		if (id >= dst->count) {
+			ret = -EINVAL;
+			goto done;
+		}
+		/* Filters are duplicated by re-parsing their filter string. */
+		str = get_filter_string(seccomp_dynamic_filter(src, src_id));
+		filter = alloc_event_filter(nr, str);
+		if (IS_ERR(filter)) {
+			ret = PTR_ERR(filter);
+			goto done;
+		}
+		set_seccomp_filter(dst, nr, id, filter);
+		id++;
+	}
+
+done:
+	return ret;
+}
+
+
+/**
+ * seccomp_extend_filter - appends more text to a syscall_nr's filter
+ * @filters: unattached filter object to operate on
+ * @syscall_nr: syscall number to update filters for
+ * @filter_string: string to append to the existing filter
+ *
+ * The new string will be &&'d to the original filter string to ensure that it
+ * always matches the existing predicates or less:
+ * (old_filter) && @filter_string
+ * Returns 0 on success and a negative errno on failure; @filters is
+ * updated in place.
+ */
+static int seccomp_extend_filter(struct seccomp_filters *filters,
+ int syscall_nr, char *filter_string)
+{
+ struct event_filter *filter;
+ uint16_t id = seccomp_filter_id(filters, syscall_nr);
+ char *merged = NULL;
+ int ret = -EINVAL, expected;
+
+ /* No extending with a "1". */
+ if (!strcmp(SECCOMP_FILTER_ALLOW, filter_string))
+ goto out;
+
+ filter = seccomp_dynamic_filter(filters, id);
+ ret = -ENOENT;
+ if (!filter)
+ goto out;
+
+ merged = kzalloc(SECCOMP_MAX_FILTER_LENGTH + 1, GFP_KERNEL);
+ ret = -ENOMEM;
+ if (!merged)
+ goto out;
+
+ expected = snprintf(merged, SECCOMP_MAX_FILTER_LENGTH, "(%s) && %s",
+ get_filter_string(filter), filter_string);
+ ret = -E2BIG;
+ if (expected >= SECCOMP_MAX_FILTER_LENGTH || expected < 0)
+ goto out;
+
+ /* Free the old filter */
+ free_event_filter(filter);
+ set_seccomp_filter(filters, syscall_nr, id, NULL);
+
+ /* Replace it */
+ filter = alloc_event_filter(syscall_nr, merged);
+ if (IS_ERR(filter)) {
+ ret = PTR_ERR(filter);
+ goto out;
+ }
+ set_seccomp_filter(filters, syscall_nr, id, filter);
+ ret = 0;
+
+out:
+ kfree(merged);
+ return ret;
+}
+
+/**
+ * seccomp_add_filter - adds a filter for an unfiltered syscall
+ * @filters: filters object to add a filter/action to
+ * @syscall_nr: system call number to add a filter for
+ * @filter_string: the filter string to apply
+ *
+ * Returns 0 on success and non-zero otherwise.
+ */
+static int seccomp_add_filter(struct seccomp_filters *filters, int syscall_nr,
+ char *filter_string)
+{
+ struct event_filter *filter;
+ int ret = 0;
+
+ if (!strcmp(SECCOMP_FILTER_ALLOW, filter_string)) {
+ set_seccomp_filter(filters, syscall_nr,
+ SECCOMP_ACTION_ALLOW, NULL);
+ goto out;
+ }
+
+ filter = alloc_event_filter(syscall_nr, filter_string);
+ if (IS_ERR(filter)) {
+ ret = PTR_ERR(filter);
+ goto out;
+ }
+ /* Always add to the last slot available since additions are
+ * are only done one at a time.
+ */
+ set_seccomp_filter(filters, syscall_nr, filters->count - 1, filter);
+out:
+ return ret;
+}
+
+/* Wrap optional ftrace syscall support. Returns 1 on match or 0 otherwise. */
+static int filter_match_current(struct event_filter *event_filter)
+{
+ int err = 0;
+#ifdef CONFIG_FTRACE_SYSCALLS
+ uint8_t syscall_state[64];
+
+ memset(syscall_state, 0, sizeof(syscall_state));
+
+ /* The generic tracing entry can remain zeroed. */
+ err = ftrace_syscall_enter_state(syscall_state, sizeof(syscall_state),
+ NULL);
+ if (err)
+ return 0;
+
+ err = filter_match_preds(event_filter, syscall_state);
+#endif
+ return err;
+}
+
+static const char *syscall_nr_to_name(int syscall)
+{
+ const char *syscall_name = "unknown";
+ struct syscall_metadata *data = syscall_nr_to_meta(syscall);
+ if (data)
+ syscall_name = data->name;
+ return syscall_name;
+}
+
+static void filters_set_compat(struct seccomp_filters *filters)
+{
+#ifdef CONFIG_COMPAT
+ if (is_compat_task())
+ filters->flags.compat = 1;
+#endif
+}
+
+/*
+ * filters_compat_mismatch - checks current against the filters' compat flag
+ * @filters: filters object to check, or NULL
+ *
+ * Returns 1 when CONFIG_COMPAT is set and the compat state of the current
+ * task (is_compat_task()) differs from the compat flag recorded in @filters.
+ * Returns 0 otherwise, including when @filters is NULL.
+ */
+static inline int filters_compat_mismatch(struct seccomp_filters *filters)
+{
+	int ret = 0;
+	if (!filters)
+		return 0;
+#ifdef CONFIG_COMPAT
+	/* A mismatch means the two values differ, not that they agree. */
+	if (!!(is_compat_task()) != filters->flags.compat)
+		ret = 1;
+#endif
+	return ret;
+}
+
+static inline int syscall_is_execve(int syscall)
+{
+ int nr = __NR_execve;
+#ifdef CONFIG_COMPAT
+ if (is_compat_task())
+ nr = __NR_seccomp_execve_32;
+#endif
+ return syscall == nr;
+}
+
+#ifndef KSTK_EIP
+#define KSTK_EIP(x) 0L
+#endif
+
+void seccomp_filter_log_failure(int syscall)
+{
+ pr_info("%s[%d]: system call %d (%s) blocked at 0x%lx\n",
+ current->comm, task_pid_nr(current), syscall,
+ syscall_nr_to_name(syscall), KSTK_EIP(current));
+}
+
+/*
+ * put_seccomp_filters - decrements the reference count of @orig and may free.
+ *
+ * NULL and ERR_PTR() values are ignored: the shared "out:" paths of
+ * seccomp_set_filter() and seccomp_clear_filter() call this on the result of
+ * seccomp_filters_new(), which returns an ERR_PTR() on failure.
+ */
+void put_seccomp_filters(struct seccomp_filters *orig)
+{
+	if (IS_ERR_OR_NULL(orig))
+		return;
+
+	if (atomic_dec_and_test(&orig->usage))
+		__put_seccomp_filters(orig);
+}
+
+/* get_seccomp_filters - increments the reference count of @orig */
+struct seccomp_filters *get_seccomp_filters(struct seccomp_filters *orig)
+{
+ if (!orig)
+ return NULL;
+ atomic_inc(&orig->usage);
+ return orig;
+}
+
+/**
+ * seccomp_test_filters - tests 'current' against the given syscall
+ * @syscall: number of the system call to test
+ *
+ * The filters are read from current->seccomp.filters.
+ *
+ * Returns 0 on ok and non-zero on error/failure.
+ */
+int seccomp_test_filters(int syscall)
+{
+ uint16_t id;
+ struct event_filter *filter;
+ struct seccomp_filters *filters;
+ int ret = -EACCES;
+
+ rcu_read_lock();
+ filters = get_seccomp_filters(current->seccomp.filters);
+ rcu_read_unlock();
+
+ if (!filters)
+ goto out;
+
+ if (filters_compat_mismatch(filters)) {
+ pr_info("%s[%d]: seccomp_filter compat() mismatch.\n",
+ current->comm, task_pid_nr(current));
+ goto out;
+ }
+
+ /* execve is never allowed. */
+ if (syscall_is_execve(syscall))
+ goto out;
+
+ ret = 0;
+ id = seccomp_filter_id(filters, syscall);
+ if (seccomp_filter_allow(id))
+ goto out;
+
+ ret = -EACCES;
+ if (!seccomp_filter_dynamic(id))
+ goto out;
+
+ filter = seccomp_dynamic_filter(filters, id);
+ if (filter && filter_match_current(filter))
+ ret = 0;
+out:
+ put_seccomp_filters(filters);
+ return ret;
+}
+
+/**
+ * seccomp_show_filters - prints the current filter state to a seq_file
+ * @filters: properly get()'d filters object
+ * @m: the prepared seq_file to receive the data
+ *
+ * Returns 0 on a successful write.
+ */
+int seccomp_show_filters(struct seccomp_filters *filters, struct seq_file *m)
+{
+ int syscall;
+ seq_printf(m, "Mode: %d\n", current->seccomp.mode);
+ if (!filters)
+ goto out;
+
+ for (syscall = 0; syscall < NR_syscalls; ++syscall) {
+ uint16_t id = seccomp_filter_id(filters, syscall);
+ const char *filter_string = SECCOMP_FILTER_ALLOW;
+ if (seccomp_filter_deny(id))
+ continue;
+ seq_printf(m, "%d (%s): ",
+ syscall,
+ syscall_nr_to_name(syscall));
+ if (seccomp_filter_dynamic(id))
+ filter_string = get_filter_string(
+ seccomp_dynamic_filter(filters, id));
+ seq_printf(m, "%s\n", filter_string);
+ }
+out:
+ return 0;
+}
+EXPORT_SYMBOL_GPL(seccomp_show_filters);
+
+/**
+ * seccomp_get_filter - copies the filter_string into "buf"
+ * @syscall_nr: system call number to look up
+ * @buf: destination buffer
+ * @bufsize: available space in the buffer.
+ *
+ * Context: User context only. This function may sleep on allocation and
+ * operates on current. current must be attempting a system call
+ * when this is called.
+ *
+ * Looks up the filter for the given system call number on current. If found,
+ * the string length of the NUL-terminated buffer is returned and < 0 is
+ * returned on error. The NUL byte is not included in the length.
+ */
+long seccomp_get_filter(int syscall_nr, char *buf, unsigned long bufsize)
+{
+ struct seccomp_filters *filters;
+ struct event_filter *filter;
+ long ret = -EINVAL;
+ uint16_t id;
+
+ if (bufsize > SECCOMP_MAX_FILTER_LENGTH)
+ bufsize = SECCOMP_MAX_FILTER_LENGTH;
+
+ rcu_read_lock();
+ filters = get_seccomp_filters(current->seccomp.filters);
+ rcu_read_unlock();
+
+ if (!filters)
+ goto out;
+
+ ret = -ENOENT;
+ id = seccomp_filter_id(filters, syscall_nr);
+ if (seccomp_filter_deny(id))
+ goto out;
+
+ if (seccomp_filter_allow(id)) {
+ ret = strlcpy(buf, SECCOMP_FILTER_ALLOW, bufsize);
+ goto copied;
+ }
+
+ filter = seccomp_dynamic_filter(filters, id);
+ if (!filter)
+ goto out;
+ ret = strlcpy(buf, get_filter_string(filter), bufsize);
+
+copied:
+ if (ret >= bufsize) {
+ ret = -ENOSPC;
+ goto out;
+ }
+ /* Zero out any remaining buffer, just in case. */
+ memset(buf + ret, 0, bufsize - ret);
+out:
+ put_seccomp_filters(filters);
+ return ret;
+}
+EXPORT_SYMBOL_GPL(seccomp_get_filter);
+
+/**
+ * seccomp_clear_filter: clears the seccomp filter for a syscall.
+ * @syscall_nr: the system call number to clear filters for.
+ *
+ * Context: User context only. This function may sleep on allocation and
+ * operates on current. current must be attempting a system call
+ * when this is called.
+ *
+ * Returns 0 on success.
+ */
+long seccomp_clear_filter(int syscall_nr)
+{
+ struct seccomp_filters *filters = NULL, *orig_filters;
+ uint16_t id;
+ int ret = -EINVAL;
+
+ rcu_read_lock();
+ orig_filters = get_seccomp_filters(current->seccomp.filters);
+ rcu_read_unlock();
+
+ if (!orig_filters)
+ goto out;
+
+ if (filters_compat_mismatch(orig_filters))
+ goto out;
+
+ id = seccomp_filter_id(orig_filters, syscall_nr);
+ if (seccomp_filter_deny(id))
+ goto out;
+
+ /* Create a new filters object for the task */
+ if (seccomp_filter_dynamic(id))
+ filters = seccomp_filters_new(orig_filters->count - 1);
+ else
+ filters = seccomp_filters_new(orig_filters->count);
+
+ if (IS_ERR(filters)) {
+ ret = PTR_ERR(filters);
+ goto out;
+ }
+
+ /* Copy, but drop the requested entry. */
+ ret = seccomp_filters_copy(filters, orig_filters, syscall_nr);
+ if (ret)
+ goto out;
+ get_seccomp_filters(filters); /* simplify the out: path */
+
+ rcu_assign_pointer(current->seccomp.filters, filters);
+ synchronize_rcu();
+ put_seccomp_filters(orig_filters); /* for the task */
+out:
+ put_seccomp_filters(orig_filters); /* for the get */
+ put_seccomp_filters(filters); /* for the extra get */
+ return ret;
+}
+EXPORT_SYMBOL_GPL(seccomp_clear_filter);
+
+/**
+ * seccomp_set_filter: - Adds/extends a seccomp filter for a syscall.
+ * @syscall_nr: system call number to apply the filter to.
+ * @filter: ftrace filter string to apply.
+ *
+ * Context: User context only. This function may sleep on allocation and
+ * operates on current. current must be attempting a system call
+ * when this is called.
+ *
+ * New filters may be added for system calls when the current task is
+ * not in a secure computing mode (seccomp). Otherwise, existing filters may
+ * be extended.
+ *
+ * Returns 0 on success or an errno on failure.
+ */
+long seccomp_set_filter(int syscall_nr, char *filter)
+{
+ struct seccomp_filters *filters = NULL, *orig_filters = NULL;
+ uint16_t id;
+ long ret = -EINVAL;
+ uint16_t filters_needed;
+
+ if (!filter)
+ goto out;
+
+ filter = strstrip(filter);
+ /* Disallow empty strings. */
+ if (filter[0] == 0)
+ goto out;
+
+ rcu_read_lock();
+ orig_filters = get_seccomp_filters(current->seccomp.filters);
+ rcu_read_unlock();
+
+ /* After the first call, compatibility mode is selected permanently. */
+ ret = -EACCES;
+ if (filters_compat_mismatch(orig_filters))
+ goto out;
+
+ filters_needed = orig_filters ? orig_filters->count : 0;
+ id = seccomp_filter_id(orig_filters, syscall_nr);
+ if (seccomp_filter_deny(id)) {
+ /* Don't allow DENYs to be changed when in a seccomp mode */
+ ret = -EACCES;
+ if (current->seccomp.mode)
+ goto out;
+ filters_needed++;
+ }
+
+ filters = seccomp_filters_new(filters_needed);
+ if (IS_ERR(filters)) {
+ ret = PTR_ERR(filters);
+ goto out;
+ }
+
+ filters_set_compat(filters);
+ if (orig_filters) {
+ ret = seccomp_filters_copy(filters, orig_filters, -1);
+ if (ret)
+ goto out;
+ }
+
+ if (seccomp_filter_deny(id))
+ ret = seccomp_add_filter(filters, syscall_nr, filter);
+ else
+ ret = seccomp_extend_filter(filters, syscall_nr, filter);
+ if (ret)
+ goto out;
+ get_seccomp_filters(filters); /* simplify the error paths */
+
+ rcu_assign_pointer(current->seccomp.filters, filters);
+ synchronize_rcu();
+ put_seccomp_filters(orig_filters); /* for the task */
+out:
+ put_seccomp_filters(orig_filters); /* for the get */
+ put_seccomp_filters(filters); /* for get or task, on err */
+ return ret;
+}
+EXPORT_SYMBOL_GPL(seccomp_set_filter);
+
+long prctl_set_seccomp_filter(unsigned long syscall_nr,
+ char __user *user_filter)
+{
+ int nr;
+ long ret;
+ char *filter = NULL;
+
+ ret = -EINVAL;
+ if (syscall_nr >= NR_syscalls)
+ goto out;
+
+ ret = -EFAULT;
+ if (!user_filter)
+ goto out;
+
+ filter = kzalloc(SECCOMP_MAX_FILTER_LENGTH + 1, GFP_KERNEL);
+ ret = -ENOMEM;
+ if (!filter)
+ goto out;
+
+ ret = -EFAULT;
+ if (strncpy_from_user(filter, user_filter,
+ SECCOMP_MAX_FILTER_LENGTH - 1) < 0)
+ goto out;
+
+ nr = (int) syscall_nr;
+ ret = seccomp_set_filter(nr, filter);
+
+out:
+ kfree(filter);
+ return ret;
+}
+
+long prctl_clear_seccomp_filter(unsigned long syscall_nr)
+{
+ int nr = -1;
+ long ret;
+
+ ret = -EINVAL;
+ if (syscall_nr >= NR_syscalls)
+ goto out;
+
+ nr = (int) syscall_nr;
+ ret = seccomp_clear_filter(nr);
+
+out:
+ return ret;
+}
+
+long prctl_get_seccomp_filter(unsigned long syscall_nr, char __user *dst,
+ unsigned long available)
+{
+ int ret, nr;
+ unsigned long copied;
+ char *buf = NULL;
+ ret = -EINVAL;
+ if (!available)
+ goto out;
+ /* Ignore extra buffer space. */
+ if (available > SECCOMP_MAX_FILTER_LENGTH)
+ available = SECCOMP_MAX_FILTER_LENGTH;
+
+ ret = -EINVAL;
+ if (syscall_nr >= NR_syscalls)
+ goto out;
+ nr = (int) syscall_nr;
+
+ ret = -ENOMEM;
+ buf = kmalloc(available, GFP_KERNEL);
+ if (!buf)
+ goto out;
+
+ ret = seccomp_get_filter(nr, buf, available);
+ if (ret < 0)
+ goto out;
+
+ /* Include the NUL byte in the copy. */
+ copied = copy_to_user(dst, buf, ret + 1);
+ ret = -ENOSPC;
+ if (copied)
+ goto out;
+ ret = 0;
+out:
+ kfree(buf);
+ return ret;
+}
diff --git a/kernel/sys.c b/kernel/sys.c
index af468ed..ed60d06 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -1698,13 +1698,24 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
case PR_SET_ENDIAN:
error = SET_ENDIAN(me, arg2);
break;
-
case PR_GET_SECCOMP:
error = prctl_get_seccomp();
break;
case PR_SET_SECCOMP:
error = prctl_set_seccomp(arg2);
break;
+ case PR_SET_SECCOMP_FILTER:
+ error = prctl_set_seccomp_filter(arg2,
+ (char __user *) arg3);
+ break;
+ case PR_CLEAR_SECCOMP_FILTER:
+ error = prctl_clear_seccomp_filter(arg2);
+ break;
+ case PR_GET_SECCOMP_FILTER:
+ error = prctl_get_seccomp_filter(arg2,
+ (char __user *) arg3,
+ arg4);
+ break;
case PR_GET_TSC:
error = GET_TSC_CTL(arg2);
break;
diff --git a/security/Kconfig b/security/Kconfig
index 95accd4..c76adf2 100644
--- a/security/Kconfig
+++ b/security/Kconfig
@@ -2,6 +2,10 @@
# Security configuration
#
+# Make seccomp filter Kconfig switch below available
+config HAVE_SECCOMP_FILTER
+ bool
+
menu "Security options"
config KEYS
@@ -82,6 +86,19 @@ config SECURITY_DMESG_RESTRICT
If you are unsure how to answer this question, answer N.
+config SECCOMP_FILTER
+ bool "Enable seccomp-based system call filtering"
+ select SECCOMP
+ depends on HAVE_SECCOMP_FILTER && EXPERIMENTAL
+ help
+ This kernel feature expands CONFIG_SECCOMP to allow computing
+ in environments with reduced kernel access dictated by the
+ application itself through prctl calls. If
+ CONFIG_FTRACE_SYSCALLS is available, then system call
+ argument-based filtering predicates may be used.
+
+ See Documentation/prctl/seccomp_filter.txt for more detail.
+
config SECURITY
bool "Enable different security models"
depends on SYSFS
--
1.7.0.4
^ permalink raw reply related [flat|nested] 91+ messages in thread* Re: [PATCH v3 03/13] seccomp_filters: new mode with configurable syscall filters
2011-06-01 3:10 ` [PATCH v3 03/13] seccomp_filters: new mode with configurable syscall filters Will Drewry
@ 2011-06-02 17:36 ` Paul E. McKenney
2011-06-02 18:14 ` Will Drewry
0 siblings, 1 reply; 91+ messages in thread
From: Paul E. McKenney @ 2011-06-02 17:36 UTC (permalink / raw)
To: Will Drewry
Cc: linux-kernel, kees.cook, torvalds, tglx, mingo, rostedt, jmorris,
Peter Zijlstra, Frederic Weisbecker, linux-security-module
On Tue, May 31, 2011 at 10:10:35PM -0500, Will Drewry wrote:
> This change adds a new seccomp mode which specifies the allowed system
> calls dynamically. When in the new mode (2), all system calls are
> checked against process-defined filters - first by system call number,
> then by a filter string. If an entry exists for a given system call and
> all filter predicates evaluate to true, then the task may proceed.
> Otherwise, the task is killed.
A few questions below -- I can't say that I understand the RCU usage.
Thanx, Paul
> Filter string parsing and evaluation is handled by the ftrace filter
> engine. Related patches tweak to the perf filter trace and free
> allowing the calls to be shared. Filters inherit their understanding of
> types and arguments for each system call from the CONFIG_FTRACE_SYSCALLS
> subsystem which already populates this information in syscall_metadata
> associated enter_event (and exit_event) structures. If
> CONFIG_FTRACE_SYSCALLS is not compiled in, only filter strings of "1"
> will be allowed.
>
> The net result is a process may have its system calls filtered using the
> ftrace filter engine's inherent understanding of system calls. The set
> of filters is specified through the PR_SET_SECCOMP_FILTER argument in
> prctl(). For example, a filterset for a process, like pdftotext, that
> should only process read-only input could (roughly) look like:
> sprintf(rdonly, "flags == %u", O_RDONLY|O_LARGEFILE);
> prctl(PR_SET_SECCOMP_FILTER, __NR_open, rdonly);
> prctl(PR_SET_SECCOMP_FILTER, __NR__llseek, "1");
> prctl(PR_SET_SECCOMP_FILTER, __NR_brk, "1");
> prctl(PR_SET_SECCOMP_FILTER, __NR_close, "1");
> prctl(PR_SET_SECCOMP_FILTER, __NR_exit_group, "1");
> prctl(PR_SET_SECCOMP_FILTER, __NR_fstat64, "1");
> prctl(PR_SET_SECCOMP_FILTER, __NR_mmap2, "1");
> prctl(PR_SET_SECCOMP_FILTER, __NR_munmap, "1");
> prctl(PR_SET_SECCOMP_FILTER, __NR_read, "1");
> prctl(PR_SET_SECCOMP_FILTER, __NR_write, "(fd == 1 || fd == 2)");
> prctl(PR_SET_SECCOMP, 2);
>
> Subsequent calls to PR_SET_SECCOMP_FILTER for the same system call will
> be &&'d together to ensure that attack surface may only be reduced:
> prctl(PR_SET_SECCOMP_FILTER, __NR_write, "fd != 2");
>
> With the earlier example, the active filter becomes:
> "(fd == 1 || fd == 2) && fd != 2"
>
> The patch also adds PR_CLEAR_SECCOMP_FILTER and PR_GET_SECCOMP_FILTER.
> The latter returns the current filter for a system call to userspace:
>
> prctl(PR_GET_SECCOMP_FILTER, __NR_write, buf, bufsize);
>
> while the former clears any filters for a given system call changing it
> back to a default deny:
>
> prctl(PR_CLEAR_SECCOMP_FILTER, __NR_write);
>
> v3: - always block execve calls (as per linus torvalds)
> - add __NR_seccomp_execve(_32) to seccomp-supporting arches
> - ensure compat tasks can't reach ftrace:syscalls
> - dropped new defines for seccomp modes.
> - two level array instead of hlists (sugg. by olof johansson)
> - added generic Kconfig entry that is not connected.
> - dropped internal seccomp.h
> - move prctl helpers to seccomp_filter
> - killed seccomp_t typedef (as per checkpatch)
> v2: - changed to use the existing syscall number ABI.
> - prctl changes to minimize parsing in the kernel:
> prctl(PR_SET_SECCOMP, {0 | 1 | 2 }, { 0 | ON_EXEC });
> prctl(PR_SET_SECCOMP_FILTER, __NR_read, "fd == 5");
> prctl(PR_CLEAR_SECCOMP_FILTER, __NR_read);
> prctl(PR_GET_SECCOMP_FILTER, __NR_read, buf, bufsize);
> - defined PR_SECCOMP_MODE_STRICT and ..._FILTER
> - added flags
> - provide a default fail syscall_nr_to_meta in ftrace
> - provides fallback for unhooked system calls
> - use -ENOSYS and ERR_PTR(-ENOSYS) for stubbed functionality
> - added kernel/seccomp.h to share seccomp.c/seccomp_filter.c
> - moved to a hlist and 4 bit hash of linked lists
> - added support to operate without CONFIG_FTRACE_SYSCALLS
> - moved Kconfig support next to SECCOMP
> - made Kconfig entries dependent on EXPERIMENTAL
> - added macros to avoid ifdefs from kernel/fork.c
> - added compat task/filter matching
> - drop seccomp.h inclusion in sched.h and drop seccomp_t
> - added Filtering to "show" output
> - added on_exec state dup'ing when enabling after a fast-path accept.
>
> Signed-off-by: Will Drewry <wad@chromium.org>
> ---
> include/linux/prctl.h | 5 +
> include/linux/sched.h | 2 +-
> include/linux/seccomp.h | 98 ++++++-
> include/trace/syscall.h | 7 +
> kernel/Makefile | 3 +
> kernel/fork.c | 3 +
> kernel/seccomp.c | 38 ++-
> kernel/seccomp_filter.c | 784 +++++++++++++++++++++++++++++++++++++++++++++++
> kernel/sys.c | 13 +-
> security/Kconfig | 17 +
> 10 files changed, 954 insertions(+), 16 deletions(-)
> create mode 100644 kernel/seccomp_filter.c
>
> diff --git a/include/linux/prctl.h b/include/linux/prctl.h
> index a3baeb2..44723ce 100644
> --- a/include/linux/prctl.h
> +++ b/include/linux/prctl.h
> @@ -64,6 +64,11 @@
> #define PR_GET_SECCOMP 21
> #define PR_SET_SECCOMP 22
>
> +/* Get/set process seccomp filters */
> +#define PR_GET_SECCOMP_FILTER 35
> +#define PR_SET_SECCOMP_FILTER 36
> +#define PR_CLEAR_SECCOMP_FILTER 37
> +
> /* Get/set the capability bounding set (as per security/commoncap.c) */
> #define PR_CAPBSET_READ 23
> #define PR_CAPBSET_DROP 24
> diff --git a/include/linux/sched.h b/include/linux/sched.h
> index 18d63ce..3f0bc8d 100644
> --- a/include/linux/sched.h
> +++ b/include/linux/sched.h
> @@ -1374,7 +1374,7 @@ struct task_struct {
> uid_t loginuid;
> unsigned int sessionid;
> #endif
> - seccomp_t seccomp;
> + struct seccomp_struct seccomp;
>
> /* Thread group tracking */
> u32 parent_exec_id;
> diff --git a/include/linux/seccomp.h b/include/linux/seccomp.h
> index 167c333..f4434ca 100644
> --- a/include/linux/seccomp.h
> +++ b/include/linux/seccomp.h
> @@ -1,13 +1,33 @@
> #ifndef _LINUX_SECCOMP_H
> #define _LINUX_SECCOMP_H
>
> +struct seq_file;
>
> #ifdef CONFIG_SECCOMP
>
> +#include <linux/errno.h>
> #include <linux/thread_info.h>
> +#include <linux/types.h>
> #include <asm/seccomp.h>
>
> -typedef struct { int mode; } seccomp_t;
> +struct seccomp_filters;
> +/**
> + * struct seccomp_struct - the state of a seccomp'ed process
> + *
> + * @mode:
> + * if this is 1, the process is under standard seccomp rules
> + * is 2, the process is only allowed to make system calls where
> + * associated filters evaluate successfully.
> + * @filters: Metadata for filters if using CONFIG_SECCOMP_FILTER.
> + * filters assignment/use should be RCU-protected and its contents
> + * should never be modified when attached to a seccomp_struct.
> + */
> +struct seccomp_struct {
> + uint16_t mode;
> +#ifdef CONFIG_SECCOMP_FILTER
> + struct seccomp_filters *filters;
> +#endif
> +};
>
> extern void __secure_computing(int);
> static inline void secure_computing(int this_syscall)
> @@ -16,15 +36,14 @@ static inline void secure_computing(int this_syscall)
> __secure_computing(this_syscall);
> }
>
> -extern long prctl_get_seccomp(void);
> extern long prctl_set_seccomp(unsigned long);
> +extern long prctl_get_seccomp(void);
>
> #else /* CONFIG_SECCOMP */
>
> #include <linux/errno.h>
>
> -typedef struct { } seccomp_t;
> -
> +struct seccomp_struct { };
> #define secure_computing(x) do { } while (0)
>
> static inline long prctl_get_seccomp(void)
> @@ -32,11 +51,80 @@ static inline long prctl_get_seccomp(void)
> return -EINVAL;
> }
>
> -static inline long prctl_set_seccomp(unsigned long arg2)
> +/* Stub for kernels built without CONFIG_SECCOMP: always fails with -EINVAL. */
> +static inline long prctl_set_seccomp(unsigned long a2)
> {
> return -EINVAL;
> }
>
> #endif /* CONFIG_SECCOMP */
>
> +#ifdef CONFIG_SECCOMP_FILTER
> +
> +#define inherit_tsk_seccomp(_child, _orig) do { \
> + _child->seccomp.mode = _orig->seccomp.mode; \
> + _child->seccomp.filters = get_seccomp_filters(_orig->seccomp.filters); \
> + } while (0)
> +#define put_tsk_seccomp(_tsk) put_seccomp_filters(_tsk->seccomp.filters)
> +
> +extern int seccomp_show_filters(struct seccomp_filters *filters,
> + struct seq_file *);
> +extern long seccomp_set_filter(int, char *);
> +extern long seccomp_clear_filter(int);
> +extern long seccomp_get_filter(int, char *, unsigned long);
> +
> +extern long prctl_set_seccomp_filter(unsigned long, char __user *);
> +extern long prctl_get_seccomp_filter(unsigned long, char __user *,
> + unsigned long);
> +extern long prctl_clear_seccomp_filter(unsigned long);
> +
> +extern struct seccomp_filters *get_seccomp_filters(struct seccomp_filters *);
> +extern void put_seccomp_filters(struct seccomp_filters *);
> +
> +extern int seccomp_test_filters(int);
> +extern void seccomp_filter_log_failure(int);
> +
> +#else /* CONFIG_SECCOMP_FILTER */
> +
> +struct seccomp_filters { };
> +#define inherit_tsk_seccomp(_child, _orig) do { } while (0)
> +#define put_tsk_seccomp(_tsk) do { } while (0)
> +
> +static inline int seccomp_show_filters(struct seccomp_filters *filters,
> + struct seq_file *m)
> +{
> + return -ENOSYS;
> +}
> +
> +static inline long seccomp_set_filter(int syscall_nr, char *filter)
> +{
> + return -ENOSYS;
> +}
> +
> +static inline long seccomp_clear_filter(int syscall_nr)
> +{
> + return -ENOSYS;
> +}
> +
> +static inline long seccomp_get_filter(int syscall_nr,
> + char *buf, unsigned long available)
> +{
> + return -ENOSYS;
> +}
> +
> +static inline long prctl_set_seccomp_filter(unsigned long a2, char __user *a3)
> +{
> + return -ENOSYS;
> +}
> +
> +static inline long prctl_clear_seccomp_filter(unsigned long a2)
> +{
> + return -ENOSYS;
> +}
> +
> +static inline long prctl_get_seccomp_filter(unsigned long a2, char __user *a3,
> + unsigned long a4)
> +{
> + return -ENOSYS;
> +}
> +#endif /* CONFIG_SECCOMP_FILTER */
> #endif /* _LINUX_SECCOMP_H */
> diff --git a/include/trace/syscall.h b/include/trace/syscall.h
> index 242ae04..e061ad0 100644
> --- a/include/trace/syscall.h
> +++ b/include/trace/syscall.h
> @@ -35,6 +35,8 @@ struct syscall_metadata {
> extern unsigned long arch_syscall_addr(int nr);
> extern int init_syscall_trace(struct ftrace_event_call *call);
>
> +extern struct syscall_metadata *syscall_nr_to_meta(int);
> +
> extern int reg_event_syscall_enter(struct ftrace_event_call *call);
> extern void unreg_event_syscall_enter(struct ftrace_event_call *call);
> extern int reg_event_syscall_exit(struct ftrace_event_call *call);
> @@ -49,6 +51,11 @@ enum print_line_t print_syscall_enter(struct trace_iterator *iter, int flags,
> struct trace_event *event);
> enum print_line_t print_syscall_exit(struct trace_iterator *iter, int flags,
> struct trace_event *event);
> +#else
> +static inline struct syscall_metadata *syscall_nr_to_meta(int nr)
> +{
> + return NULL;
> +}
> #endif
>
> #ifdef CONFIG_PERF_EVENTS
> diff --git a/kernel/Makefile b/kernel/Makefile
> index 85cbfb3..84e7dfb 100644
> --- a/kernel/Makefile
> +++ b/kernel/Makefile
> @@ -81,6 +81,9 @@ obj-$(CONFIG_DETECT_HUNG_TASK) += hung_task.o
> obj-$(CONFIG_LOCKUP_DETECTOR) += watchdog.o
> obj-$(CONFIG_GENERIC_HARDIRQS) += irq/
> obj-$(CONFIG_SECCOMP) += seccomp.o
> +ifeq ($(CONFIG_SECCOMP_FILTER),y)
> +obj-$(CONFIG_SECCOMP) += seccomp_filter.o
> +endif
> obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o
> obj-$(CONFIG_TREE_RCU) += rcutree.o
> obj-$(CONFIG_TREE_PREEMPT_RCU) += rcutree.o
> diff --git a/kernel/fork.c b/kernel/fork.c
> index e7548de..6f835e0 100644
> --- a/kernel/fork.c
> +++ b/kernel/fork.c
> @@ -34,6 +34,7 @@
> #include <linux/cgroup.h>
> #include <linux/security.h>
> #include <linux/hugetlb.h>
> +#include <linux/seccomp.h>
> #include <linux/swap.h>
> #include <linux/syscalls.h>
> #include <linux/jiffies.h>
> @@ -169,6 +170,7 @@ void free_task(struct task_struct *tsk)
> free_thread_info(tsk->stack);
> rt_mutex_debug_task_free(tsk);
> ftrace_graph_exit_task(tsk);
> + put_tsk_seccomp(tsk);
> free_task_struct(tsk);
> }
> EXPORT_SYMBOL(free_task);
> @@ -280,6 +282,7 @@ static struct task_struct *dup_task_struct(struct task_struct *orig)
> if (err)
> goto out;
>
> + inherit_tsk_seccomp(tsk, orig);
> setup_thread_stack(tsk, orig);
> clear_user_return_notifier(tsk);
> clear_tsk_need_resched(tsk);
> diff --git a/kernel/seccomp.c b/kernel/seccomp.c
> index 57d4b13..0a942be 100644
> --- a/kernel/seccomp.c
> +++ b/kernel/seccomp.c
> @@ -2,16 +2,20 @@
> * linux/kernel/seccomp.c
> *
> * Copyright 2004-2005 Andrea Arcangeli <andrea@cpushare.com>
> + * Copyright (C) 2011 The Chromium OS Authors <chromium-os-dev@chromium.org>
> *
> * This defines a simple but solid secure-computing mode.
> */
>
> #include <linux/seccomp.h>
> #include <linux/sched.h>
> +#include <linux/slab.h>
> #include <linux/compat.h>
> +#include <linux/unistd.h>
> +#include <linux/ftrace_event.h>
>
> +#define SECCOMP_MAX_FILTER_LENGTH MAX_FILTER_STR_VAL
> /* #define SECCOMP_DEBUG 1 */
> -#define NR_SECCOMP_MODES 1
>
> /*
> * Secure computing mode 1 allows only read/write/exit/sigreturn.
> @@ -32,10 +36,9 @@ static int mode1_syscalls_32[] = {
>
> void __secure_computing(int this_syscall)
> {
> - int mode = current->seccomp.mode;
> int * syscall;
>
> - switch (mode) {
> + switch (current->seccomp.mode) {
> case 1:
> syscall = mode1_syscalls;
> #ifdef CONFIG_COMPAT
> @@ -47,6 +50,17 @@ void __secure_computing(int this_syscall)
> return;
> } while (*++syscall);
> break;
> +#ifdef CONFIG_SECCOMP_FILTER
> + case 2:
> + if (this_syscall >= NR_syscalls || this_syscall < 0)
> + break;
> +
> + if (!seccomp_test_filters(this_syscall))
> + return;
> +
> + seccomp_filter_log_failure(this_syscall);
> + break;
> +#endif
> default:
> BUG();
> }
> @@ -71,16 +85,22 @@ long prctl_set_seccomp(unsigned long seccomp_mode)
> if (unlikely(current->seccomp.mode))
> goto out;
>
> - ret = -EINVAL;
> - if (seccomp_mode && seccomp_mode <= NR_SECCOMP_MODES) {
> - current->seccomp.mode = seccomp_mode;
> - set_thread_flag(TIF_SECCOMP);
> + ret = 0;
> + switch (seccomp_mode) {
> + case 1:
> #ifdef TIF_NOTSC
> disable_TSC();
> #endif
> - ret = 0;
> +#ifdef CONFIG_SECCOMP_FILTER
> + case 2:
> +#endif
> + current->seccomp.mode = seccomp_mode;
> + set_thread_flag(TIF_SECCOMP);
> + break;
> + default:
> + ret = -EINVAL;
> }
>
> - out:
> +out:
> return ret;
> }
> diff --git a/kernel/seccomp_filter.c b/kernel/seccomp_filter.c
> new file mode 100644
> index 0000000..9782f25
> --- /dev/null
> +++ b/kernel/seccomp_filter.c
> @@ -0,0 +1,784 @@
> +/* filter engine-based seccomp system call filtering
> + *
> + * This program is free software; you can redistribute it and/or modify
> + * it under the terms of the GNU General Public License as published by
> + * the Free Software Foundation; either version 2 of the License, or
> + * (at your option) any later version.
> + *
> + * This program is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
> + * GNU General Public License for more details.
> + *
> + * You should have received a copy of the GNU General Public License
> + * along with this program; if not, write to the Free Software
> + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
> + *
> + * Copyright (C) 2011 The Chromium OS Authors <chromium-os-dev@chromium.org>
> + */
> +
> +#include <linux/compat.h>
> +#include <linux/err.h>
> +#include <linux/errno.h>
> +#include <linux/ftrace_event.h>
> +#include <linux/seccomp.h>
> +#include <linux/seq_file.h>
> +#include <linux/sched.h>
> +#include <linux/slab.h>
> +#include <linux/uaccess.h>
> +
> +#include <asm/syscall.h>
> +#include <trace/syscall.h>
> +
> +
> +#define SECCOMP_MAX_FILTER_LENGTH MAX_FILTER_STR_VAL
> +
> +#define SECCOMP_FILTER_ALLOW "1"
> +#define SECCOMP_ACTION_DENY 0xffff
> +#define SECCOMP_ACTION_ALLOW 0xfffe
> +
> +/**
> + * struct seccomp_filters - container for seccomp filterset
> + *
> + * @syscalls: array of 16-bit indices into @event_filters by syscall_nr
> + * May also be SECCOMP_ACTION_DENY or SECCOMP_ACTION_ALLOW
> + * @event_filters: array of pointers to ftrace event objects
> + * @count: size of @event_filters
> + * @flags: anonymous struct to wrap filters-specific flags
> + * @usage: reference count to simplify use.
> + */
> +struct seccomp_filters {
> + uint16_t syscalls[NR_syscalls];
> + struct event_filter **event_filters;
> + uint16_t count;
> + struct {
> + uint32_t compat:1,
> + __reserved:31;
> + } flags;
> + atomic_t usage;
> +};
> +
> +/* Handle ftrace symbol non-existence */
> +#ifdef CONFIG_FTRACE_SYSCALLS
> +#define create_event_filter(_ef_pptr, _event_type, _str) \
> + ftrace_parse_filter(_ef_pptr, _event_type, _str)
> +#define get_filter_string(_ef) ftrace_get_filter_string(_ef)
> +#define free_event_filter(_f) ftrace_free_filter(_f)
> +
> +#else
> +
> +#define create_event_filter(_ef_pptr, _event_type, _str) (-ENOSYS)
> +#define get_filter_string(_ef) (NULL)
> +#define free_event_filter(_f) do { } while (0)
> +#endif
> +
> +/**
> + * seccomp_filters_new - allocates a new filters object
> + * @count: count to allocate for the event_filters array
> + *
> + * Returns ERR_PTR on error or an allocated object.
> + */
> +static struct seccomp_filters *seccomp_filters_new(uint16_t count)
> +{
> + struct seccomp_filters *f;
> +
> + if (count >= SECCOMP_ACTION_ALLOW)
> + return ERR_PTR(-EINVAL);
> +
> + f = kzalloc(sizeof(struct seccomp_filters), GFP_KERNEL);
> + if (!f)
> + return ERR_PTR(-ENOMEM);
> +
> + /* Lazy SECCOMP_ACTION_DENY assignment. */
> + memset(f->syscalls, 0xff, sizeof(f->syscalls));
> + atomic_set(&f->usage, 1);
> +
> + f->event_filters = NULL;
> + f->count = count;
> + if (!count)
> + return f;
> +
> + f->event_filters = kzalloc(count * sizeof(struct event_filter *),
> + GFP_KERNEL);
> + if (!f->event_filters) {
> + kfree(f);
> + f = ERR_PTR(-ENOMEM);
> + }
> + return f;
> +}
> +
> +/**
> + * seccomp_filters_free - cleans up the filter list and frees the table
> + * @filters: NULL or live object to be completely destructed.
> + */
> +static void seccomp_filters_free(struct seccomp_filters *filters)
> +{
> + uint16_t count = 0;
> + if (!filters)
> + return;
> + while (count < filters->count) {
> + struct event_filter *f = filters->event_filters[count];
> + free_event_filter(f);
> + count++;
> + }
> + kfree(filters->event_filters);
> + kfree(filters);
> +}
> +
> +static void __put_seccomp_filters(struct seccomp_filters *orig)
> +{
> + WARN_ON(atomic_read(&orig->usage));
> + seccomp_filters_free(orig);
> +}
> +
> +#define seccomp_filter_allow(_id) ((_id) == SECCOMP_ACTION_ALLOW)
> +#define seccomp_filter_deny(_id) ((_id) == SECCOMP_ACTION_DENY)
> +#define seccomp_filter_dynamic(_id) \
> + (!seccomp_filter_allow(_id) && !seccomp_filter_deny(_id))
> +static inline uint16_t seccomp_filter_id(const struct seccomp_filters *f,
> + int syscall_nr)
> +{
> + if (!f)
> + return SECCOMP_ACTION_DENY;
> + return f->syscalls[syscall_nr];
> +}
> +
> +static inline struct event_filter *seccomp_dynamic_filter(
> + const struct seccomp_filters *filters, uint16_t id)
> +{
> + if (!seccomp_filter_dynamic(id))
> + return NULL;
> + return filters->event_filters[id];
> +}
> +
> +static inline void set_seccomp_filter_id(struct seccomp_filters *filters,
> + int syscall_nr, uint16_t id)
> +{
> + filters->syscalls[syscall_nr] = id;
> +}
> +
> +static inline void set_seccomp_filter(struct seccomp_filters *filters,
> + int syscall_nr, uint16_t id,
> + struct event_filter *dynamic_filter)
> +{
> + filters->syscalls[syscall_nr] = id;
> + if (seccomp_filter_dynamic(id))
> + filters->event_filters[id] = dynamic_filter;
> +}
> +
> +static struct event_filter *alloc_event_filter(int syscall_nr,
> + const char *filter_string)
> +{
> + struct syscall_metadata *data;
> + struct event_filter *filter = NULL;
> + int err;
> +
> + data = syscall_nr_to_meta(syscall_nr);
> + /* Argument-based filtering only works on ftrace-hooked syscalls. */
> + err = -ENOSYS;
> + if (!data)
> + goto fail;
> + err = create_event_filter(&filter,
> + data->enter_event->event.type,
> + filter_string);
> + if (err)
> + goto fail;
> +
> + return filter;
> +fail:
> + kfree(filter);
> + return ERR_PTR(err);
> +}
> +
> +/**
> + * seccomp_filters_copy - copies filters from src to dst.
> + *
> + * @dst: seccomp_filters to populate.
> + * @src: table to read from.
> + * @skip: specifies an entry, by system call, to skip.
> + *
> + * Returns non-zero on failure.
> + * Both the source and the destination should have no simultaneous
> + * writers, and dst should be exclusive to the caller.
> + * If @skip is < 0, it is ignored.  Otherwise the entry for @skip is set to
> + * SECCOMP_ACTION_DENY in @dst, whether or not @src holds any dynamic
> + * filters.
> + */
> +static int seccomp_filters_copy(struct seccomp_filters *dst,
> +				const struct seccomp_filters *src,
> +				int skip)
> +{
> +	int id = 0, ret = 0, nr;
> +
> +	memcpy(&dst->flags, &src->flags, sizeof(src->flags));
> +	memcpy(dst->syscalls, src->syscalls, sizeof(dst->syscalls));
> +
> +	/*
> +	 * Drop the skipped entry before the early exit below: an
> +	 * allow-all entry has to be cleared even when src has no dynamic
> +	 * filters to duplicate.
> +	 */
> +	if (skip >= 0 && skip < NR_syscalls)
> +		set_seccomp_filter(dst, skip, SECCOMP_ACTION_DENY, NULL);
> +
> +	if (!src->count)
> +		goto done;
> +
> +	for (nr = 0; nr < NR_syscalls; ++nr) {
> +		struct event_filter *filter;
> +		const char *str;
> +		uint16_t src_id;
> +
> +		if (nr == skip)
> +			continue;
> +
> +		src_id = seccomp_filter_id(src, nr);
> +		if (!seccomp_filter_dynamic(src_id))
> +			continue;
> +
> +		/* dst has fewer event_filters slots than src needs. */
> +		if (id >= dst->count) {
> +			ret = -EINVAL;
> +			goto done;
> +		}
> +
> +		/* Rebuild the filter from its string form for dst. */
> +		str = get_filter_string(seccomp_dynamic_filter(src, src_id));
> +		filter = alloc_event_filter(nr, str);
> +		if (IS_ERR(filter)) {
> +			ret = PTR_ERR(filter);
> +			goto done;
> +		}
> +
> +		/* Slots in dst are handed out in increasing order from 0. */
> +		set_seccomp_filter(dst, nr, id, filter);
> +		id++;
> +	}
> +
> +done:
> +	return ret;
> +}
> +
> +/**
> + * seccomp_extend_filter - appends more text to a syscall_nr's filter
> + * @filters: unattached filter object to operate on
> + * @syscall_nr: syscall number to update filters for
> + * @filter_string: string to append to the existing filter
> + *
> + * The new string will be &&'d to the original filter string to ensure that it
> + * always matches the existing predicates or less:
> + * (old_filter) && @filter_string
> + * @filters is updated in place.  Returns 0 on success and a negative errno
> + * on failure.
> + */
> +static int seccomp_extend_filter(struct seccomp_filters *filters,
> + int syscall_nr, char *filter_string)
> +{
> + struct event_filter *filter;
> + uint16_t id = seccomp_filter_id(filters, syscall_nr);
> + char *merged = NULL;
> + int ret = -EINVAL, expected;
> +
> + /* No extending with a "1". */
> + if (!strcmp(SECCOMP_FILTER_ALLOW, filter_string))
> + goto out;
> +
> + filter = seccomp_dynamic_filter(filters, id);
> + ret = -ENOENT;
> + if (!filter)
> + goto out;
> +
> + merged = kzalloc(SECCOMP_MAX_FILTER_LENGTH + 1, GFP_KERNEL);
> + ret = -ENOMEM;
> + if (!merged)
> + goto out;
> +
> + expected = snprintf(merged, SECCOMP_MAX_FILTER_LENGTH, "(%s) && %s",
> + get_filter_string(filter), filter_string);
> + ret = -E2BIG;
> + if (expected >= SECCOMP_MAX_FILTER_LENGTH || expected < 0)
> + goto out;
> +
> + /* Free the old filter */
> + free_event_filter(filter);
> + set_seccomp_filter(filters, syscall_nr, id, NULL);
> +
> + /* Replace it */
> + filter = alloc_event_filter(syscall_nr, merged);
> + if (IS_ERR(filter)) {
> + ret = PTR_ERR(filter);
> + goto out;
> + }
> + set_seccomp_filter(filters, syscall_nr, id, filter);
> + ret = 0;
> +
> +out:
> + kfree(merged);
> + return ret;
> +}
> +
> +/**
> + * seccomp_add_filter - adds a filter for an unfiltered syscall
> + * @filters: filters object to add a filter/action to
> + * @syscall_nr: system call number to add a filter for
> + * @filter_string: the filter string to apply
> + *
> + * A @filter_string equal to SECCOMP_FILTER_ALLOW marks the syscall as
> + * SECCOMP_ACTION_ALLOW without allocating an event filter.
> + *
> + * Returns 0 on success and non-zero otherwise.
> + */
> +static int seccomp_add_filter(struct seccomp_filters *filters, int syscall_nr,
> + char *filter_string)
> +{
> + struct event_filter *filter;
> + int ret = 0;
> +
> + /* An allow-all filter needs no event_filters slot. */
> + if (!strcmp(SECCOMP_FILTER_ALLOW, filter_string)) {
> + set_seccomp_filter(filters, syscall_nr,
> + SECCOMP_ACTION_ALLOW, NULL);
> + goto out;
> + }
> +
> + filter = alloc_event_filter(syscall_nr, filter_string);
> + if (IS_ERR(filter)) {
> + ret = PTR_ERR(filter);
> + goto out;
> + }
> + /* Always add to the last slot available since additions are
> + * only done one at a time.
> + */
> + set_seccomp_filter(filters, syscall_nr, filters->count - 1, filter);
> +out:
> + return ret;
> +}
> +
> +/* Wrap optional ftrace syscall support. Returns 1 on match or 0 otherwise. */
> +static int filter_match_current(struct event_filter *event_filter)
> +{
> + int err = 0;
> +#ifdef CONFIG_FTRACE_SYSCALLS
> + uint8_t syscall_state[64];
> +
> + memset(syscall_state, 0, sizeof(syscall_state));
> +
> + /* The generic tracing entry can remain zeroed. */
> + err = ftrace_syscall_enter_state(syscall_state, sizeof(syscall_state),
> + NULL);
> + if (err)
> + return 0;
> +
> + err = filter_match_preds(event_filter, syscall_state);
> +#endif
> + return err;
> +}
> +
> +static const char *syscall_nr_to_name(int syscall)
> +{
> + const char *syscall_name = "unknown";
> + struct syscall_metadata *data = syscall_nr_to_meta(syscall);
> + if (data)
> + syscall_name = data->name;
> + return syscall_name;
> +}
> +
> +static void filters_set_compat(struct seccomp_filters *filters)
> +{
> +#ifdef CONFIG_COMPAT
> + if (is_compat_task())
> + filters->flags.compat = 1;
> +#endif
> +}
> +
> +/*
> + * filters_compat_mismatch - reports whether current's compat state differs
> + * from the compat flag recorded in @filters.
> + *
> + * Returns 1 on mismatch and 0 otherwise.  A NULL @filters never mismatches,
> + * and without CONFIG_COMPAT nothing is compared.
> + */
> +static inline int filters_compat_mismatch(struct seccomp_filters *filters)
> +{
> +	int ret = 0;
> +
> +	if (!filters)
> +		return 0;
> +#ifdef CONFIG_COMPAT
> +	/* Mismatch means the two states differ, not that they agree. */
> +	if (!!(is_compat_task()) != filters->flags.compat)
> +		ret = 1;
> +#endif
> +	return ret;
> +}
> +
> +static inline int syscall_is_execve(int syscall)
> +{
> + int nr = __NR_execve;
> +#ifdef CONFIG_COMPAT
> + if (is_compat_task())
> + nr = __NR_seccomp_execve_32;
> +#endif
> + return syscall == nr;
> +}
> +
> +#ifndef KSTK_EIP
> +#define KSTK_EIP(x) 0L
> +#endif
> +
> +void seccomp_filter_log_failure(int syscall)
> +{
> + pr_info("%s[%d]: system call %d (%s) blocked at 0x%lx\n",
> + current->comm, task_pid_nr(current), syscall,
> + syscall_nr_to_name(syscall), KSTK_EIP(current));
> +}
> +
> +/*
> + * put_seccomp_filters - decrements the reference count of @orig and may free.
> + *
> + * A NULL @orig is ignored.  When the count reaches zero the object is
> + * released through __put_seccomp_filters().
> + */
> +void put_seccomp_filters(struct seccomp_filters *orig)
> +{
> + if (!orig)
> + return;
> +
> + if (atomic_dec_and_test(&orig->usage))
> + __put_seccomp_filters(orig);
> +}
> +
> +/* get_seccomp_state - increments the reference count of @orig */
> +struct seccomp_filters *get_seccomp_filters(struct seccomp_filters *orig)
Nit: the name does not match the comment.
> +{
> + if (!orig)
> + return NULL;
> + atomic_inc(&orig->usage);
> + return orig;
This is called in an RCU read-side critical section. What exactly is
RCU protecting? I would expect an rcu_dereference() or one of the
RCU list-traversal primitives somewhere, either here or at the caller.
> +}
> +
> +/**
> + * seccomp_test_filters - tests 'current' against the given syscall
> + * @syscall: number of the system call to test
> + *
> + * The filters are taken from current->seccomp.filters.
> + * Returns 0 on ok and non-zero on error/failure.
> + */
> +int seccomp_test_filters(int syscall)
> +{
> + uint16_t id;
> + struct event_filter *filter;
> + struct seccomp_filters *filters;
> + int ret = -EACCES;
> +
> + rcu_read_lock();
> + filters = get_seccomp_filters(current->seccomp.filters);
> + rcu_read_unlock();
> +
> + if (!filters)
> + goto out;
> +
> + if (filters_compat_mismatch(filters)) {
> + pr_info("%s[%d]: seccomp_filter compat() mismatch.\n",
> + current->comm, task_pid_nr(current));
> + goto out;
> + }
> +
> + /* execve is never allowed. */
> + if (syscall_is_execve(syscall))
> + goto out;
> +
> + ret = 0;
> + id = seccomp_filter_id(filters, syscall);
> + if (seccomp_filter_allow(id))
> + goto out;
> +
> + ret = -EACCES;
> + if (!seccomp_filter_dynamic(id))
> + goto out;
> +
> + filter = seccomp_dynamic_filter(filters, id);
> + if (filter && filter_match_current(filter))
> + ret = 0;
> +out:
> + put_seccomp_filters(filters);
> + return ret;
> +}
> +
> +/**
> + * seccomp_show_filters - prints the current filter state to a seq_file
> + * @filters: properly get()'d filters object
> + * @m: the prepared seq_file to receive the data
> + *
> + * Returns 0 on a successful write.
> + */
> +int seccomp_show_filters(struct seccomp_filters *filters, struct seq_file *m)
> +{
> + int syscall;
> + seq_printf(m, "Mode: %d\n", current->seccomp.mode);
> + if (!filters)
> + goto out;
> +
> + for (syscall = 0; syscall < NR_syscalls; ++syscall) {
> + uint16_t id = seccomp_filter_id(filters, syscall);
> + const char *filter_string = SECCOMP_FILTER_ALLOW;
> + if (seccomp_filter_deny(id))
> + continue;
> + seq_printf(m, "%d (%s): ",
> + syscall,
> + syscall_nr_to_name(syscall));
> + if (seccomp_filter_dynamic(id))
> + filter_string = get_filter_string(
> + seccomp_dynamic_filter(filters, id));
> + seq_printf(m, "%s\n", filter_string);
> + }
> +out:
> + return 0;
> +}
> +EXPORT_SYMBOL_GPL(seccomp_show_filters);
> +
> +/**
> + * seccomp_get_filter - copies the filter_string into "buf"
> + * @syscall_nr: system call number to look up
> + * @buf: destination buffer
> + * @bufsize: available space in the buffer.
> + *
> + * Context: User context only. This function may sleep on allocation and
> + * operates on current. current must be attempting a system call
> + * when this is called.
> + *
> + * Looks up the filter for the given system call number on current. If found,
> + * the string length of the NUL-terminated buffer is returned and < 0 is
> + * returned on error. The NUL byte is not included in the length.
> + */
> +long seccomp_get_filter(int syscall_nr, char *buf, unsigned long bufsize)
> +{
> + struct seccomp_filters *filters;
> + struct event_filter *filter;
> + long ret = -EINVAL;
> + uint16_t id;
> +
> + if (bufsize > SECCOMP_MAX_FILTER_LENGTH)
> + bufsize = SECCOMP_MAX_FILTER_LENGTH;
> +
> + rcu_read_lock();
> + filters = get_seccomp_filters(current->seccomp.filters);
> + rcu_read_unlock();
> +
> + if (!filters)
> + goto out;
> +
> + ret = -ENOENT;
> + id = seccomp_filter_id(filters, syscall_nr);
> + if (seccomp_filter_deny(id))
> + goto out;
> +
> + if (seccomp_filter_allow(id)) {
> + ret = strlcpy(buf, SECCOMP_FILTER_ALLOW, bufsize);
> + goto copied;
> + }
> +
> + filter = seccomp_dynamic_filter(filters, id);
> + if (!filter)
> + goto out;
> + ret = strlcpy(buf, get_filter_string(filter), bufsize);
> +
> +copied:
> + if (ret >= bufsize) {
> + ret = -ENOSPC;
> + goto out;
> + }
> + /* Zero out any remaining buffer, just in case. */
> + memset(buf + ret, 0, bufsize - ret);
> +out:
> + put_seccomp_filters(filters);
> + return ret;
> +}
> +EXPORT_SYMBOL_GPL(seccomp_get_filter);
> +
> +/**
> + * seccomp_clear_filter: clears the seccomp filter for a syscall.
> + * @syscall_nr: the system call number to clear filters for.
> + *
> + * Context: User context only. This function may sleep on allocation and
> + * operates on current. current must be attempting a system call
> + * when this is called.
> + *
> + * Returns 0 on success.
> + */
> +long seccomp_clear_filter(int syscall_nr)
> +{
> + struct seccomp_filters *filters = NULL, *orig_filters;
> + uint16_t id;
> + int ret = -EINVAL;
> +
> + rcu_read_lock();
> + orig_filters = get_seccomp_filters(current->seccomp.filters);
> + rcu_read_unlock();
> +
> + if (!orig_filters)
> + goto out;
> +
> + if (filters_compat_mismatch(orig_filters))
> + goto out;
> +
> + id = seccomp_filter_id(orig_filters, syscall_nr);
> + if (seccomp_filter_deny(id))
> + goto out;
> +
> + /* Create a new filters object for the task */
> + if (seccomp_filter_dynamic(id))
> + filters = seccomp_filters_new(orig_filters->count - 1);
> + else
> + filters = seccomp_filters_new(orig_filters->count);
> +
> + if (IS_ERR(filters)) {
> + ret = PTR_ERR(filters);
> + goto out;
> + }
> +
> + /* Copy, but drop the requested entry. */
> + ret = seccomp_filters_copy(filters, orig_filters, syscall_nr);
> + if (ret)
> + goto out;
> + get_seccomp_filters(filters); /* simplify the out: path */
> +
> + rcu_assign_pointer(current->seccomp.filters, filters);
What prevents two copies of seccomp_clear_filter() from running
concurrently?
> + synchronize_rcu();
> + put_seccomp_filters(orig_filters); /* for the task */
> +out:
> + put_seccomp_filters(orig_filters); /* for the get */
> + put_seccomp_filters(filters); /* for the extra get */
> + return ret;
> +}
> +EXPORT_SYMBOL_GPL(seccomp_clear_filter);
> +
> +/**
> + * seccomp_set_filter - adds/extends a seccomp filter for a syscall.
> + * @syscall_nr: system call number to apply the filter to.
> + * @filter: ftrace filter string to apply.
> + *
> + * Context: User context only. This function may sleep on allocation and
> + * operates on current. current must be attempting a system call
> + * when this is called.
> + *
> + * New filters may be added for system calls when the current task is
> + * not in a secure computing mode (seccomp). Otherwise, existing filters may
> + * be extended.
> + *
> + * Returns 0 on success or an errno on failure.
> + */
> +long seccomp_set_filter(int syscall_nr, char *filter)
> +{
> + struct seccomp_filters *filters = NULL, *orig_filters = NULL;
> + uint16_t id;
> + long ret = -EINVAL;
> + uint16_t filters_needed;
> +
> + if (!filter)
> + goto out;
> +
> + filter = strstrip(filter);
> + /* Disallow empty strings. */
> + if (filter[0] == 0)
> + goto out;
> +
> + rcu_read_lock();
> + orig_filters = get_seccomp_filters(current->seccomp.filters);
> + rcu_read_unlock();
> +
> + /* After the first call, compatibility mode is selected permanently. */
> + ret = -EACCES;
> + if (filters_compat_mismatch(orig_filters))
> + goto out;
> +
> + filters_needed = orig_filters ? orig_filters->count : 0;
> + id = seccomp_filter_id(orig_filters, syscall_nr);
> + if (seccomp_filter_deny(id)) {
> + /* Don't allow DENYs to be changed when in a seccomp mode */
> + ret = -EACCES;
> + if (current->seccomp.mode)
> + goto out;
> + filters_needed++;
> + }
> +
> + filters = seccomp_filters_new(filters_needed);
> + if (IS_ERR(filters)) {
> + ret = PTR_ERR(filters);
> + goto out;
> + }
> +
> + filters_set_compat(filters);
> + if (orig_filters) {
> + ret = seccomp_filters_copy(filters, orig_filters, -1);
> + if (ret)
> + goto out;
> + }
> +
> + if (seccomp_filter_deny(id))
> + ret = seccomp_add_filter(filters, syscall_nr, filter);
> + else
> + ret = seccomp_extend_filter(filters, syscall_nr, filter);
> + if (ret)
> + goto out;
> + get_seccomp_filters(filters); /* simplify the error paths */
> +
> + rcu_assign_pointer(current->seccomp.filters, filters);
Again, what prevents two copies of seccomp_set_filter() from running
concurrently?
> + synchronize_rcu();
> + put_seccomp_filters(orig_filters); /* for the task */
> +out:
> + put_seccomp_filters(orig_filters); /* for the get */
> + put_seccomp_filters(filters); /* for get or task, on err */
> + return ret;
> +}
> +EXPORT_SYMBOL_GPL(seccomp_set_filter);
> +
> +long prctl_set_seccomp_filter(unsigned long syscall_nr,
> + char __user *user_filter)
> +{
> + int nr;
> + long ret;
> + char *filter = NULL;
> +
> + ret = -EINVAL;
> + if (syscall_nr >= NR_syscalls)
> + goto out;
> +
> + ret = -EFAULT;
> + if (!user_filter)
> + goto out;
> +
> + filter = kzalloc(SECCOMP_MAX_FILTER_LENGTH + 1, GFP_KERNEL);
> + ret = -ENOMEM;
> + if (!filter)
> + goto out;
> +
> + ret = -EFAULT;
> + if (strncpy_from_user(filter, user_filter,
> + SECCOMP_MAX_FILTER_LENGTH - 1) < 0)
> + goto out;
> +
> + nr = (int) syscall_nr;
> + ret = seccomp_set_filter(nr, filter);
> +
> +out:
> + kfree(filter);
> + return ret;
> +}
> +
> +long prctl_clear_seccomp_filter(unsigned long syscall_nr)
> +{
> + int nr = -1;
> + long ret;
> +
> + ret = -EINVAL;
> + if (syscall_nr >= NR_syscalls)
> + goto out;
> +
> + nr = (int) syscall_nr;
> + ret = seccomp_clear_filter(nr);
> +
> +out:
> + return ret;
> +}
> +
> +long prctl_get_seccomp_filter(unsigned long syscall_nr, char __user *dst,
> + unsigned long available)
> +{
> + int ret, nr;
> + unsigned long copied;
> + char *buf = NULL;
> + ret = -EINVAL;
> + if (!available)
> + goto out;
> + /* Ignore extra buffer space. */
> + if (available > SECCOMP_MAX_FILTER_LENGTH)
> + available = SECCOMP_MAX_FILTER_LENGTH;
> +
> + ret = -EINVAL;
> + if (syscall_nr >= NR_syscalls)
> + goto out;
> + nr = (int) syscall_nr;
> +
> + ret = -ENOMEM;
> + buf = kmalloc(available, GFP_KERNEL);
> + if (!buf)
> + goto out;
> +
> + ret = seccomp_get_filter(nr, buf, available);
> + if (ret < 0)
> + goto out;
> +
> + /* Include the NUL byte in the copy. */
> + copied = copy_to_user(dst, buf, ret + 1);
> + ret = -ENOSPC;
> + if (copied)
> + goto out;
> + ret = 0;
> +out:
> + kfree(buf);
> + return ret;
> +}
> diff --git a/kernel/sys.c b/kernel/sys.c
> index af468ed..ed60d06 100644
> --- a/kernel/sys.c
> +++ b/kernel/sys.c
> @@ -1698,13 +1698,24 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
> case PR_SET_ENDIAN:
> error = SET_ENDIAN(me, arg2);
> break;
> -
> case PR_GET_SECCOMP:
> error = prctl_get_seccomp();
> break;
> case PR_SET_SECCOMP:
> error = prctl_set_seccomp(arg2);
> break;
> + case PR_SET_SECCOMP_FILTER:
> + error = prctl_set_seccomp_filter(arg2,
> + (char __user *) arg3);
> + break;
> + case PR_CLEAR_SECCOMP_FILTER:
> + error = prctl_clear_seccomp_filter(arg2);
> + break;
> + case PR_GET_SECCOMP_FILTER:
> + error = prctl_get_seccomp_filter(arg2,
> + (char __user *) arg3,
> + arg4);
> + break;
> case PR_GET_TSC:
> error = GET_TSC_CTL(arg2);
> break;
> diff --git a/security/Kconfig b/security/Kconfig
> index 95accd4..c76adf2 100644
> --- a/security/Kconfig
> +++ b/security/Kconfig
> @@ -2,6 +2,10 @@
> # Security configuration
> #
>
> +# Make seccomp filter Kconfig switch below available
> +config HAVE_SECCOMP_FILTER
> + bool
> +
> menu "Security options"
>
> config KEYS
> @@ -82,6 +86,19 @@ config SECURITY_DMESG_RESTRICT
>
> If you are unsure how to answer this question, answer N.
>
> +config SECCOMP_FILTER
> + bool "Enable seccomp-based system call filtering"
> + select SECCOMP
> + depends on HAVE_SECCOMP_FILTER && EXPERIMENTAL
> + help
> + This kernel feature expands CONFIG_SECCOMP to allow computing
> + in environments with reduced kernel access dictated by the
> + application itself through prctl calls. If
> + CONFIG_FTRACE_SYSCALLS is available, then system call
> + argument-based filtering predicates may be used.
> +
> + See Documentation/prctl/seccomp_filter.txt for more detail.
> +
> config SECURITY
> bool "Enable different security models"
> depends on SYSFS
> --
> 1.7.0.4
>
> --
> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at http://vger.kernel.org/majordomo-info.html
> Please read the FAQ at http://www.tux.org/lkml/
^ permalink raw reply [flat|nested] 91+ messages in thread
* Re: [PATCH v3 03/13] seccomp_filters: new mode with configurable syscall filters
2011-06-02 17:36 ` Paul E. McKenney
@ 2011-06-02 18:14 ` Will Drewry
2011-06-02 19:42 ` Paul E. McKenney
0 siblings, 1 reply; 91+ messages in thread
From: Will Drewry @ 2011-06-02 18:14 UTC (permalink / raw)
To: paulmck
Cc: linux-kernel, kees.cook, torvalds, tglx, mingo, rostedt, jmorris,
Peter Zijlstra, Frederic Weisbecker, linux-security-module
On Thu, Jun 2, 2011 at 12:36 PM, Paul E. McKenney
<paulmck@linux.vnet.ibm.com> wrote:
> On Tue, May 31, 2011 at 10:10:35PM -0500, Will Drewry wrote:
>> This change adds a new seccomp mode which specifies the allowed system
>> calls dynamically. When in the new mode (2), all system calls are
>> checked against process-defined filters - first by system call number,
>> then by a filter string. If an entry exists for a given system call and
>> all filter predicates evaluate to true, then the task may proceed.
>> Otherwise, the task is killed.
>
> A few questions below -- I can't say that I understand the RCU usage.
>
> Thanx, Paul
>
>> Filter string parsing and evaluation is handled by the ftrace filter
>> engine. Related patches tweak the perf filter trace and free,
>> allowing the calls to be shared. Filters inherit their understanding of
>> types and arguments for each system call from the CONFIG_FTRACE_SYSCALLS
>> subsystem which already populates this information in syscall_metadata
>> associated enter_event (and exit_event) structures. If
>> CONFIG_FTRACE_SYSCALLS is not compiled in, only filter strings of "1"
>> will be allowed.
>>
>> The net result is a process may have its system calls filtered using the
>> ftrace filter engine's inherent understanding of system calls. The set
>> of filters is specified through the PR_SET_SECCOMP_FILTER argument in
>> prctl(). For example, a filterset for a process, like pdftotext, that
>> should only process read-only input could (roughly) look like:
>> sprintf(rdonly, "flags == %u", O_RDONLY|O_LARGEFILE);
>> prctl(PR_SET_SECCOMP_FILTER, __NR_open, rdonly);
>> prctl(PR_SET_SECCOMP_FILTER, __NR__llseek, "1");
>> prctl(PR_SET_SECCOMP_FILTER, __NR_brk, "1");
>> prctl(PR_SET_SECCOMP_FILTER, __NR_close, "1");
>> prctl(PR_SET_SECCOMP_FILTER, __NR_exit_group, "1");
>> prctl(PR_SET_SECCOMP_FILTER, __NR_fstat64, "1");
>> prctl(PR_SET_SECCOMP_FILTER, __NR_mmap2, "1");
>> prctl(PR_SET_SECCOMP_FILTER, __NR_munmap, "1");
>> prctl(PR_SET_SECCOMP_FILTER, __NR_read, "1");
>> prctl(PR_SET_SECCOMP_FILTER, __NR_write, "(fd == 1 || fd == 2)");
>> prctl(PR_SET_SECCOMP, 2);
>>
>> Subsequent calls to PR_SET_SECCOMP_FILTER for the same system call will
>> be &&'d together to ensure that attack surface may only be reduced:
>> prctl(PR_SET_SECCOMP_FILTER, __NR_write, "fd != 2");
>>
>> With the earlier example, the active filter becomes:
>> "(fd == 1 || fd == 2) && fd != 2"
>>
>> The patch also adds PR_CLEAR_SECCOMP_FILTER and PR_GET_SECCOMP_FILTER.
>> The latter returns the current filter for a system call to userspace:
>>
>> prctl(PR_GET_SECCOMP_FILTER, __NR_write, buf, bufsize);
>>
>> while the former clears any filters for a given system call changing it
>> back to a default deny:
>>
>> prctl(PR_CLEAR_SECCOMP_FILTER, __NR_write);
>>
>> v3: - always block execve calls (as per linus torvalds)
>> - add __NR_seccomp_execve(_32) to seccomp-supporting arches
>> - ensure compat tasks can't reach ftrace:syscalls
>> - dropped new defines for seccomp modes.
>> - two level array instead of hlists (sugg. by olof johansson)
>> - added generic Kconfig entry that is not connected.
>> - dropped internal seccomp.h
>> - move prctl helpers to seccomp_filter
>> - killed seccomp_t typedef (as per checkpatch)
>> v2: - changed to use the existing syscall number ABI.
>> - prctl changes to minimize parsing in the kernel:
>> prctl(PR_SET_SECCOMP, {0 | 1 | 2 }, { 0 | ON_EXEC });
>> prctl(PR_SET_SECCOMP_FILTER, __NR_read, "fd == 5");
>> prctl(PR_CLEAR_SECCOMP_FILTER, __NR_read);
>> prctl(PR_GET_SECCOMP_FILTER, __NR_read, buf, bufsize);
>> - defined PR_SECCOMP_MODE_STRICT and ..._FILTER
>> - added flags
>> - provide a default fail syscall_nr_to_meta in ftrace
>> - provides fallback for unhooked system calls
>> - use -ENOSYS and ERR_PTR(-ENOSYS) for stubbed functionality
>> - added kernel/seccomp.h to share seccomp.c/seccomp_filter.c
>> - moved to a hlist and 4 bit hash of linked lists
>> - added support to operate without CONFIG_FTRACE_SYSCALLS
>> - moved Kconfig support next to SECCOMP
>> - made Kconfig entries dependent on EXPERIMENTAL
>> - added macros to avoid ifdefs from kernel/fork.c
>> - added compat task/filter matching
>> - drop seccomp.h inclusion in sched.h and drop seccomp_t
>> - added Filtering to "show" output
>> - added on_exec state dup'ing when enabling after a fast-path accept.
>>
>> Signed-off-by: Will Drewry <wad@chromium.org>
>> ---
>> include/linux/prctl.h | 5 +
>> include/linux/sched.h | 2 +-
>> include/linux/seccomp.h | 98 ++++++-
>> include/trace/syscall.h | 7 +
>> kernel/Makefile | 3 +
>> kernel/fork.c | 3 +
>> kernel/seccomp.c | 38 ++-
>> kernel/seccomp_filter.c | 784 +++++++++++++++++++++++++++++++++++++++++++++++
>> kernel/sys.c | 13 +-
>> security/Kconfig | 17 +
>> 10 files changed, 954 insertions(+), 16 deletions(-)
>> create mode 100644 kernel/seccomp_filter.c
>>
>> diff --git a/include/linux/prctl.h b/include/linux/prctl.h
>> index a3baeb2..44723ce 100644
>> --- a/include/linux/prctl.h
>> +++ b/include/linux/prctl.h
>> @@ -64,6 +64,11 @@
>> #define PR_GET_SECCOMP 21
>> #define PR_SET_SECCOMP 22
>>
>> +/* Get/set process seccomp filters */
>> +#define PR_GET_SECCOMP_FILTER 35
>> +#define PR_SET_SECCOMP_FILTER 36
>> +#define PR_CLEAR_SECCOMP_FILTER 37
>> +
>> /* Get/set the capability bounding set (as per security/commoncap.c) */
>> #define PR_CAPBSET_READ 23
>> #define PR_CAPBSET_DROP 24
>> diff --git a/include/linux/sched.h b/include/linux/sched.h
>> index 18d63ce..3f0bc8d 100644
>> --- a/include/linux/sched.h
>> +++ b/include/linux/sched.h
>> @@ -1374,7 +1374,7 @@ struct task_struct {
>> uid_t loginuid;
>> unsigned int sessionid;
>> #endif
>> - seccomp_t seccomp;
>> + struct seccomp_struct seccomp;
>>
>> /* Thread group tracking */
>> u32 parent_exec_id;
>> diff --git a/include/linux/seccomp.h b/include/linux/seccomp.h
>> index 167c333..f4434ca 100644
>> --- a/include/linux/seccomp.h
>> +++ b/include/linux/seccomp.h
>> @@ -1,13 +1,33 @@
>> #ifndef _LINUX_SECCOMP_H
>> #define _LINUX_SECCOMP_H
>>
>> +struct seq_file;
>>
>> #ifdef CONFIG_SECCOMP
>>
>> +#include <linux/errno.h>
>> #include <linux/thread_info.h>
>> +#include <linux/types.h>
>> #include <asm/seccomp.h>
>>
>> -typedef struct { int mode; } seccomp_t;
>> +struct seccomp_filters;
>> +/**
>> + * struct seccomp_struct - the state of a seccomp'ed process
>> + *
>> + * @mode:
>> + * if this is 1, the process is under standard seccomp rules;
>> + * if this is 2, the process is only allowed to make system calls
>> + * where associated filters evaluate successfully.
>> + * @filters: Metadata for filters if using CONFIG_SECCOMP_FILTER.
>> + * filters assignment/use should be RCU-protected and its contents
>> + * should never be modified when attached to a seccomp_struct.
>> + */
>> +struct seccomp_struct {
>> + uint16_t mode;
>> +#ifdef CONFIG_SECCOMP_FILTER
>> + struct seccomp_filters *filters;
>> +#endif
>> +};
>>
>> extern void __secure_computing(int);
>> static inline void secure_computing(int this_syscall)
>> @@ -16,15 +36,14 @@ static inline void secure_computing(int this_syscall)
>> __secure_computing(this_syscall);
>> }
>>
>> -extern long prctl_get_seccomp(void);
>> extern long prctl_set_seccomp(unsigned long);
>> +extern long prctl_get_seccomp(void);
>>
>> #else /* CONFIG_SECCOMP */
>>
>> #include <linux/errno.h>
>>
>> -typedef struct { } seccomp_t;
>> -
>> +struct seccomp_struct { };
>> #define secure_computing(x) do { } while (0)
>>
>> static inline long prctl_get_seccomp(void)
>> @@ -32,11 +51,80 @@ static inline long prctl_get_seccomp(void)
>> return -EINVAL;
>> }
>>
>> -static inline long prctl_set_seccomp(unsigned long arg2)
>> +static inline long prctl_set_seccomp(unsigned long a2);
>> {
>> return -EINVAL;
>> }
>>
>> #endif /* CONFIG_SECCOMP */
>>
>> +#ifdef CONFIG_SECCOMP_FILTER
>> +
>> +#define inherit_tsk_seccomp(_child, _orig) do { \
>> + _child->seccomp.mode = _orig->seccomp.mode; \
>> + _child->seccomp.filters = get_seccomp_filters(_orig->seccomp.filters); \
>> + } while (0)
>> +#define put_tsk_seccomp(_tsk) put_seccomp_filters(_tsk->seccomp.filters)
>> +
>> +extern int seccomp_show_filters(struct seccomp_filters *filters,
>> + struct seq_file *);
>> +extern long seccomp_set_filter(int, char *);
>> +extern long seccomp_clear_filter(int);
>> +extern long seccomp_get_filter(int, char *, unsigned long);
>> +
>> +extern long prctl_set_seccomp_filter(unsigned long, char __user *);
>> +extern long prctl_get_seccomp_filter(unsigned long, char __user *,
>> + unsigned long);
>> +extern long prctl_clear_seccomp_filter(unsigned long);
>> +
>> +extern struct seccomp_filters *get_seccomp_filters(struct seccomp_filters *);
>> +extern void put_seccomp_filters(struct seccomp_filters *);
>> +
>> +extern int seccomp_test_filters(int);
>> +extern void seccomp_filter_log_failure(int);
>> +
>> +#else /* CONFIG_SECCOMP_FILTER */
>> +
>> +struct seccomp_filters { };
>> +#define inherit_tsk_seccomp(_child, _orig) do { } while (0)
>> +#define put_tsk_seccomp(_tsk) do { } while (0)
>> +
>> +static inline int seccomp_show_filters(struct seccomp_filters *filters,
>> + struct seq_file *m)
>> +{
>> + return -ENOSYS;
>> +}
>> +
>> +static inline long seccomp_set_filter(int syscall_nr, char *filter)
>> +{
>> + return -ENOSYS;
>> +}
>> +
>> +static inline long seccomp_clear_filter(int syscall_nr)
>> +{
>> + return -ENOSYS;
>> +}
>> +
>> +static inline long seccomp_get_filter(int syscall_nr,
>> + char *buf, unsigned long available)
>> +{
>> + return -ENOSYS;
>> +}
>> +
>> +static inline long prctl_set_seccomp_filter(unsigned long a2, char __user *a3)
>> +{
>> + return -ENOSYS;
>> +}
>> +
>> +static inline long prctl_clear_seccomp_filter(unsigned long a2)
>> +{
>> + return -ENOSYS;
>> +}
>> +
>> +static inline long prctl_get_seccomp_filter(unsigned long a2, char __user *a3,
>> + unsigned long a4)
>> +{
>> + return -ENOSYS;
>> +}
>> +#endif /* CONFIG_SECCOMP_FILTER */
>> #endif /* _LINUX_SECCOMP_H */
>> diff --git a/include/trace/syscall.h b/include/trace/syscall.h
>> index 242ae04..e061ad0 100644
>> --- a/include/trace/syscall.h
>> +++ b/include/trace/syscall.h
>> @@ -35,6 +35,8 @@ struct syscall_metadata {
>> extern unsigned long arch_syscall_addr(int nr);
>> extern int init_syscall_trace(struct ftrace_event_call *call);
>>
>> +extern struct syscall_metadata *syscall_nr_to_meta(int);
>> +
>> extern int reg_event_syscall_enter(struct ftrace_event_call *call);
>> extern void unreg_event_syscall_enter(struct ftrace_event_call *call);
>> extern int reg_event_syscall_exit(struct ftrace_event_call *call);
>> @@ -49,6 +51,11 @@ enum print_line_t print_syscall_enter(struct trace_iterator *iter, int flags,
>> struct trace_event *event);
>> enum print_line_t print_syscall_exit(struct trace_iterator *iter, int flags,
>> struct trace_event *event);
>> +#else
>> +static inline struct syscall_metadata *syscall_nr_to_meta(int nr)
>> +{
>> + return NULL;
>> +}
>> #endif
>>
>> #ifdef CONFIG_PERF_EVENTS
>> diff --git a/kernel/Makefile b/kernel/Makefile
>> index 85cbfb3..84e7dfb 100644
>> --- a/kernel/Makefile
>> +++ b/kernel/Makefile
>> @@ -81,6 +81,9 @@ obj-$(CONFIG_DETECT_HUNG_TASK) += hung_task.o
>> obj-$(CONFIG_LOCKUP_DETECTOR) += watchdog.o
>> obj-$(CONFIG_GENERIC_HARDIRQS) += irq/
>> obj-$(CONFIG_SECCOMP) += seccomp.o
>> +ifeq ($(CONFIG_SECCOMP_FILTER),y)
>> +obj-$(CONFIG_SECCOMP) += seccomp_filter.o
>> +endif
>> obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o
>> obj-$(CONFIG_TREE_RCU) += rcutree.o
>> obj-$(CONFIG_TREE_PREEMPT_RCU) += rcutree.o
>> diff --git a/kernel/fork.c b/kernel/fork.c
>> index e7548de..6f835e0 100644
>> --- a/kernel/fork.c
>> +++ b/kernel/fork.c
>> @@ -34,6 +34,7 @@
>> #include <linux/cgroup.h>
>> #include <linux/security.h>
>> #include <linux/hugetlb.h>
>> +#include <linux/seccomp.h>
>> #include <linux/swap.h>
>> #include <linux/syscalls.h>
>> #include <linux/jiffies.h>
>> @@ -169,6 +170,7 @@ void free_task(struct task_struct *tsk)
>> free_thread_info(tsk->stack);
>> rt_mutex_debug_task_free(tsk);
>> ftrace_graph_exit_task(tsk);
>> + put_tsk_seccomp(tsk);
>> free_task_struct(tsk);
>> }
>> EXPORT_SYMBOL(free_task);
>> @@ -280,6 +282,7 @@ static struct task_struct *dup_task_struct(struct task_struct *orig)
>> if (err)
>> goto out;
>>
>> + inherit_tsk_seccomp(tsk, orig);
>> setup_thread_stack(tsk, orig);
>> clear_user_return_notifier(tsk);
>> clear_tsk_need_resched(tsk);
>> diff --git a/kernel/seccomp.c b/kernel/seccomp.c
>> index 57d4b13..0a942be 100644
>> --- a/kernel/seccomp.c
>> +++ b/kernel/seccomp.c
>> @@ -2,16 +2,20 @@
>> * linux/kernel/seccomp.c
>> *
>> * Copyright 2004-2005 Andrea Arcangeli <andrea@cpushare.com>
>> + * Copyright (C) 2011 The Chromium OS Authors <chromium-os-dev@chromium.org>
>> *
>> * This defines a simple but solid secure-computing mode.
>> */
>>
>> #include <linux/seccomp.h>
>> #include <linux/sched.h>
>> +#include <linux/slab.h>
>> #include <linux/compat.h>
>> +#include <linux/unistd.h>
>> +#include <linux/ftrace_event.h>
>>
>> +#define SECCOMP_MAX_FILTER_LENGTH MAX_FILTER_STR_VAL
>> /* #define SECCOMP_DEBUG 1 */
>> -#define NR_SECCOMP_MODES 1
>>
>> /*
>> * Secure computing mode 1 allows only read/write/exit/sigreturn.
>> @@ -32,10 +36,9 @@ static int mode1_syscalls_32[] = {
>>
>> void __secure_computing(int this_syscall)
>> {
>> - int mode = current->seccomp.mode;
>> int * syscall;
>>
>> - switch (mode) {
>> + switch (current->seccomp.mode) {
>> case 1:
>> syscall = mode1_syscalls;
>> #ifdef CONFIG_COMPAT
>> @@ -47,6 +50,17 @@ void __secure_computing(int this_syscall)
>> return;
>> } while (*++syscall);
>> break;
>> +#ifdef CONFIG_SECCOMP_FILTER
>> + case 2:
>> + if (this_syscall >= NR_syscalls || this_syscall < 0)
>> + break;
>> +
>> + if (!seccomp_test_filters(this_syscall))
>> + return;
>> +
>> + seccomp_filter_log_failure(this_syscall);
>> + break;
>> +#endif
>> default:
>> BUG();
>> }
>> @@ -71,16 +85,22 @@ long prctl_set_seccomp(unsigned long seccomp_mode)
>> if (unlikely(current->seccomp.mode))
>> goto out;
>>
>> - ret = -EINVAL;
>> - if (seccomp_mode && seccomp_mode <= NR_SECCOMP_MODES) {
>> - current->seccomp.mode = seccomp_mode;
>> - set_thread_flag(TIF_SECCOMP);
>> + ret = 0;
>> + switch (seccomp_mode) {
>> + case 1:
>> #ifdef TIF_NOTSC
>> disable_TSC();
>> #endif
>> - ret = 0;
>> +#ifdef CONFIG_SECCOMP_FILTER
>> + case 2:
>> +#endif
>> + current->seccomp.mode = seccomp_mode;
>> + set_thread_flag(TIF_SECCOMP);
>> + break;
>> + default:
>> + ret = -EINVAL;
>> }
>>
>> - out:
>> +out:
>> return ret;
>> }
>> diff --git a/kernel/seccomp_filter.c b/kernel/seccomp_filter.c
>> new file mode 100644
>> index 0000000..9782f25
>> --- /dev/null
>> +++ b/kernel/seccomp_filter.c
>> @@ -0,0 +1,784 @@
>> +/* filter engine-based seccomp system call filtering
>> + *
>> + * This program is free software; you can redistribute it and/or modify
>> + * it under the terms of the GNU General Public License as published by
>> + * the Free Software Foundation; either version 2 of the License, or
>> + * (at your option) any later version.
>> + *
>> + * This program is distributed in the hope that it will be useful,
>> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
>> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
>> + * GNU General Public License for more details.
>> + *
>> + * You should have received a copy of the GNU General Public License
>> + * along with this program; if not, write to the Free Software
>> + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
>> + *
>> + * Copyright (C) 2011 The Chromium OS Authors <chromium-os-dev@chromium.org>
>> + */
>> +
>> +#include <linux/compat.h>
>> +#include <linux/err.h>
>> +#include <linux/errno.h>
>> +#include <linux/ftrace_event.h>
>> +#include <linux/seccomp.h>
>> +#include <linux/seq_file.h>
>> +#include <linux/sched.h>
>> +#include <linux/slab.h>
>> +#include <linux/uaccess.h>
>> +
>> +#include <asm/syscall.h>
>> +#include <trace/syscall.h>
>> +
>> +
>> +#define SECCOMP_MAX_FILTER_LENGTH MAX_FILTER_STR_VAL
>> +
>> +#define SECCOMP_FILTER_ALLOW "1"
>> +#define SECCOMP_ACTION_DENY 0xffff
>> +#define SECCOMP_ACTION_ALLOW 0xfffe
>> +
>> +/**
>> + * struct seccomp_filters - container for seccomp filterset
>> + *
>> + * @syscalls: array of 16-bit indices into @event_filters by syscall_nr
>> + * May also be SECCOMP_ACTION_DENY or SECCOMP_ACTION_ALLOW
>> + * @event_filters: array of pointers to ftrace event objects
>> + * @count: size of @event_filters
>> + * @flags: anonymous struct to wrap filters-specific flags
>> + * @usage: reference count to simplify use.
>> + */
>> +struct seccomp_filters {
>> + uint16_t syscalls[NR_syscalls];
>> + struct event_filter **event_filters;
>> + uint16_t count;
>> + struct {
>> + uint32_t compat:1,
>> + __reserved:31;
>> + } flags;
>> + atomic_t usage;
>> +};
>> +
>> +/* Handle ftrace symbol non-existence */
>> +#ifdef CONFIG_FTRACE_SYSCALLS
>> +#define create_event_filter(_ef_pptr, _event_type, _str) \
>> + ftrace_parse_filter(_ef_pptr, _event_type, _str)
>> +#define get_filter_string(_ef) ftrace_get_filter_string(_ef)
>> +#define free_event_filter(_f) ftrace_free_filter(_f)
>> +
>> +#else
>> +
>> +#define create_event_filter(_ef_pptr, _event_type, _str) (-ENOSYS)
>> +#define get_filter_string(_ef) (NULL)
>> +#define free_event_filter(_f) do { } while (0)
>> +#endif
>> +
>> +/**
>> + * seccomp_filters_new - allocates a new filters object
>> + * @count: count to allocate for the event_filters array
>> + *
>> + * Returns ERR_PTR on error or an allocated object.
>> + */
>> +static struct seccomp_filters *seccomp_filters_new(uint16_t count)
>> +{
>> + struct seccomp_filters *f;
>> +
>> + if (count >= SECCOMP_ACTION_ALLOW)
>> + return ERR_PTR(-EINVAL);
>> +
>> + f = kzalloc(sizeof(struct seccomp_filters), GFP_KERNEL);
>> + if (!f)
>> + return ERR_PTR(-ENOMEM);
>> +
>> + /* Lazy SECCOMP_ACTION_DENY assignment. */
>> + memset(f->syscalls, 0xff, sizeof(f->syscalls));
>> + atomic_set(&f->usage, 1);
>> +
>> + f->event_filters = NULL;
>> + f->count = count;
>> + if (!count)
>> + return f;
>> +
>> + f->event_filters = kzalloc(count * sizeof(struct event_filter *),
>> + GFP_KERNEL);
>> + if (!f->event_filters) {
>> + kfree(f);
>> + f = ERR_PTR(-ENOMEM);
>> + }
>> + return f;
>> +}
>> +
>> +/**
>> + * seccomp_filters_free - cleans up the filter list and frees the table
>> + * @filters: NULL or live object to be completely destructed.
>> + */
>> +static void seccomp_filters_free(struct seccomp_filters *filters)
>> +{
>> + uint16_t count = 0;
>> + if (!filters)
>> + return;
>> + while (count < filters->count) {
>> + struct event_filter *f = filters->event_filters[count];
>> + free_event_filter(f);
>> + count++;
>> + }
>> + kfree(filters->event_filters);
>> + kfree(filters);
>> +}
>> +
>> +static void __put_seccomp_filters(struct seccomp_filters *orig)
>> +{
>> + WARN_ON(atomic_read(&orig->usage));
>> + seccomp_filters_free(orig);
>> +}
>> +
>> +#define seccomp_filter_allow(_id) ((_id) == SECCOMP_ACTION_ALLOW)
>> +#define seccomp_filter_deny(_id) ((_id) == SECCOMP_ACTION_DENY)
>> +#define seccomp_filter_dynamic(_id) \
>> + (!seccomp_filter_allow(_id) && !seccomp_filter_deny(_id))
>> +static inline uint16_t seccomp_filter_id(const struct seccomp_filters *f,
>> + int syscall_nr)
>> +{
>> + if (!f)
>> + return SECCOMP_ACTION_DENY;
>> + return f->syscalls[syscall_nr];
>> +}
>> +
>> +static inline struct event_filter *seccomp_dynamic_filter(
>> + const struct seccomp_filters *filters, uint16_t id)
>> +{
>> + if (!seccomp_filter_dynamic(id))
>> + return NULL;
>> + return filters->event_filters[id];
>> +}
>> +
>> +static inline void set_seccomp_filter_id(struct seccomp_filters *filters,
>> + int syscall_nr, uint16_t id)
>> +{
>> + filters->syscalls[syscall_nr] = id;
>> +}
>> +
>> +static inline void set_seccomp_filter(struct seccomp_filters *filters,
>> + int syscall_nr, uint16_t id,
>> + struct event_filter *dynamic_filter)
>> +{
>> + filters->syscalls[syscall_nr] = id;
>> + if (seccomp_filter_dynamic(id))
>> + filters->event_filters[id] = dynamic_filter;
>> +}
>> +
>> +static struct event_filter *alloc_event_filter(int syscall_nr,
>> + const char *filter_string)
>> +{
>> + struct syscall_metadata *data;
>> + struct event_filter *filter = NULL;
>> + int err;
>> +
>> + data = syscall_nr_to_meta(syscall_nr);
>> + /* Argument-based filtering only works on ftrace-hooked syscalls. */
>> + err = -ENOSYS;
>> + if (!data)
>> + goto fail;
>> + err = create_event_filter(&filter,
>> + data->enter_event->event.type,
>> + filter_string);
>> + if (err)
>> + goto fail;
>> +
>> + return filter;
>> +fail:
>> + kfree(filter);
>> + return ERR_PTR(err);
>> +}
>> +
>> +/**
>> + * seccomp_filters_copy - copies filters from src to dst.
>> + *
>> + * @dst: seccomp_filters to populate.
>> + * @src: table to read from.
>> + * @skip: specifies an entry, by system call, to skip.
>> + *
>> + * Returns non-zero on failure.
>> + * Both the source and the destination should have no simultaneous
>> + * writers, and dst should be exclusive to the caller.
>> + * If @skip is < 0, it is ignored.
>> + */
>> +static int seccomp_filters_copy(struct seccomp_filters *dst,
>> + const struct seccomp_filters *src,
>> + int skip)
>> +{
>> + int id = 0, ret = 0, nr;
>> + memcpy(&dst->flags, &src->flags, sizeof(src->flags));
>> + memcpy(dst->syscalls, src->syscalls, sizeof(dst->syscalls));
>> + if (!src->count)
>> + goto done;
>> + for (nr = 0; nr < NR_syscalls; ++nr) {
>> + struct event_filter *filter;
>> + const char *str;
>> + uint16_t src_id = seccomp_filter_id(src, nr);
>> + if (nr == skip) {
>> + set_seccomp_filter(dst, nr, SECCOMP_ACTION_DENY,
>> + NULL);
>> + continue;
>> + }
>> + if (!seccomp_filter_dynamic(src_id))
>> + continue;
>> + if (id >= dst->count) {
>> + ret = -EINVAL;
>> + goto done;
>> + }
>> + str = get_filter_string(seccomp_dynamic_filter(src, src_id));
>> + filter = alloc_event_filter(nr, str);
>> + if (IS_ERR(filter)) {
>> + ret = PTR_ERR(filter);
>> + goto done;
>> + }
>> + set_seccomp_filter(dst, nr, id, filter);
>> + id++;
>> + }
>> +
>> +done:
>> + return ret;
>> +}
>> +
>> +/**
>> + * seccomp_extend_filter - appends more text to a syscall_nr's filter
>> + * @filters: unattached filter object to operate on
>> + * @syscall_nr: syscall number to update filters for
>> + * @filter_string: string to append to the existing filter
>> + *
>> + * The new string will be &&'d to the original filter string to ensure that it
>> + * always matches the existing predicates or less:
>> + * (old_filter) && @filter_string
>> + * Returns 0 on success and a negative errno on failure; @filters is
>> + * updated in place.
>> + */
>> +static int seccomp_extend_filter(struct seccomp_filters *filters,
>> + int syscall_nr, char *filter_string)
>> +{
>> + struct event_filter *filter;
>> + uint16_t id = seccomp_filter_id(filters, syscall_nr);
>> + char *merged = NULL;
>> + int ret = -EINVAL, expected;
>> +
>> + /* No extending with a "1". */
>> + if (!strcmp(SECCOMP_FILTER_ALLOW, filter_string))
>> + goto out;
>> +
>> + filter = seccomp_dynamic_filter(filters, id);
>> + ret = -ENOENT;
>> + if (!filter)
>> + goto out;
>> +
>> + merged = kzalloc(SECCOMP_MAX_FILTER_LENGTH + 1, GFP_KERNEL);
>> + ret = -ENOMEM;
>> + if (!merged)
>> + goto out;
>> +
>> + expected = snprintf(merged, SECCOMP_MAX_FILTER_LENGTH, "(%s) && %s",
>> + get_filter_string(filter), filter_string);
>> + ret = -E2BIG;
>> + if (expected >= SECCOMP_MAX_FILTER_LENGTH || expected < 0)
>> + goto out;
>> +
>> + /* Free the old filter */
>> + free_event_filter(filter);
>> + set_seccomp_filter(filters, syscall_nr, id, NULL);
>> +
>> + /* Replace it */
>> + filter = alloc_event_filter(syscall_nr, merged);
>> + if (IS_ERR(filter)) {
>> + ret = PTR_ERR(filter);
>> + goto out;
>> + }
>> + set_seccomp_filter(filters, syscall_nr, id, filter);
>> + ret = 0;
>> +
>> +out:
>> + kfree(merged);
>> + return ret;
>> +}
>> +
>> +/**
>> + * seccomp_add_filter - adds a filter for an unfiltered syscall
>> + * @filters: filters object to add a filter/action to
>> + * @syscall_nr: system call number to add a filter for
>> + * @filter_string: the filter string to apply
>> + *
>> + * Returns 0 on success and the alloc_event_filter() error otherwise.
>> + */
>> +static int seccomp_add_filter(struct seccomp_filters *filters, int syscall_nr,
>> +			      char *filter_string)
>> +{
>> +	struct event_filter *new_filter;
>> +
>> +	/* The plain allow string only records the action, no filter object. */
>> +	if (strcmp(SECCOMP_FILTER_ALLOW, filter_string) == 0) {
>> +		set_seccomp_filter(filters, syscall_nr,
>> +				   SECCOMP_ACTION_ALLOW, NULL);
>> +		return 0;
>> +	}
>> +
>> +	new_filter = alloc_event_filter(syscall_nr, filter_string);
>> +	if (IS_ERR(new_filter))
>> +		return PTR_ERR(new_filter);
>> +
>> +	/*
>> +	 * Always add to the last slot available since additions are only
>> +	 * done one at a time.
>> +	 */
>> +	set_seccomp_filter(filters, syscall_nr, filters->count - 1, new_filter);
>> +	return 0;
>> +}
>> +
>> +/*
>> + * Wrap optional ftrace syscall support.  Returns 1 on match or 0 otherwise;
>> + * without CONFIG_FTRACE_SYSCALLS the result is always 0.
>> + */
>> +static int filter_match_current(struct event_filter *event_filter)
>> +{
>> +#ifdef CONFIG_FTRACE_SYSCALLS
>> +	uint8_t state_buf[64];
>> +
>> +	/* The generic tracing entry can remain zeroed. */
>> +	memset(state_buf, 0, sizeof(state_buf));
>> +
>> +	/* A failure to capture the syscall state counts as "no match". */
>> +	if (ftrace_syscall_enter_state(state_buf, sizeof(state_buf), NULL))
>> +		return 0;
>> +
>> +	return filter_match_preds(event_filter, state_buf);
>> +#else
>> +	return 0;
>> +#endif
>> +}
>> +
>> +/* Maps @syscall to its metadata name, or "unknown" when no metadata exists. */
>> +static const char *syscall_nr_to_name(int syscall)
>> +{
>> +	struct syscall_metadata *meta = syscall_nr_to_meta(syscall);
>> +
>> +	return meta ? meta->name : "unknown";
>> +}
>> +
>> +/* Sets the compat flag on @filters when current is a compat task. */
>> +static void filters_set_compat(struct seccomp_filters *filters)
>> +{
>> +#ifdef CONFIG_COMPAT
>> +	if (!is_compat_task())
>> +		return;
>> +	filters->flags.compat = 1;
>> +#endif
>> +}
>> +
>> +/*
>> + * filters_compat_mismatch - compares current's compat state with @filters
>> + *
>> + * Returns 1 when the compat state of current differs from the compat flag
>> + * stored in @filters.  Returns 0 when they agree, when @filters is NULL, or
>> + * when CONFIG_COMPAT is not set.
>> + */
>> +static inline int filters_compat_mismatch(struct seccomp_filters *filters)
>> +{
>> +	int ret = 0;
>> +
>> +	if (!filters)
>> +		return 0;
>> +#ifdef CONFIG_COMPAT
>> +	/* !! normalizes is_compat_task() to 0/1 before the comparison. */
>> +	if (!!(is_compat_task()) != filters->flags.compat)
>> +		ret = 1;
>> +#endif
>> +	return ret;
>> +}
>> +
>> +/*
>> + * Returns non-zero when @syscall is the execve number: __NR_execve, or
>> + * __NR_seccomp_execve_32 for compat tasks.
>> + */
>> +static inline int syscall_is_execve(int syscall)
>> +{
>> +#ifdef CONFIG_COMPAT
>> +	if (is_compat_task())
>> +		return syscall == __NR_seccomp_execve_32;
>> +#endif
>> +	return syscall == __NR_execve;
>> +}
>> +
>> +#ifndef KSTK_EIP
>> +#define KSTK_EIP(x) 0L
>> +#endif
>> +
>> +/* Logs the blocked @syscall with current's comm, pid and KSTK_EIP() value. */
>> +void seccomp_filter_log_failure(int syscall)
>> +{
>> +	const char *name = syscall_nr_to_name(syscall);
>> +
>> +	pr_info("%s[%d]: system call %d (%s) blocked at 0x%lx\n",
>> +		current->comm, task_pid_nr(current), syscall, name,
>> +		KSTK_EIP(current));
>> +}
>> +
>> +/*
>> + * put_seccomp_filters - decrements the reference count of @orig and may free.
>> + *
>> + * NULL and ERR_PTR() values are ignored: seccomp_clear_filter() and
>> + * seccomp_set_filter() reach their common exit path holding the ERR_PTR()
>> + * returned by a failed seccomp_filters_new().
>> + */
>> +void put_seccomp_filters(struct seccomp_filters *orig)
>> +{
>> +	if (IS_ERR_OR_NULL(orig))
>> +		return;
>> +
>> +	/* The last reference releases the object. */
>> +	if (atomic_dec_and_test(&orig->usage))
>> +		__put_seccomp_filters(orig);
>> +}
>> +
>> +/* get_seccomp_state - increments the reference count of @orig */
>> +struct seccomp_filters *get_seccomp_filters(struct seccomp_filters *orig)
>
> Nit: the name does not match the comment.
Will fix it here and above. Thanks!
>> +{
>> + if (!orig)
>> + return NULL;
>> + atomic_inc(&orig->usage);
>> + return orig;
>
> This is called in an RCU read-side critical section. What exactly is
> RCU protecting? I would expect an rcu_dereference() or one of the
> RCU list-traversal primitives somewhere, either here or at the caller.
Ah, I spaced on rcu_dereference(). The goal was to make the
assignment and replacement of the seccomp_filters pointer
RCU-protected (in seccomp_state) so there's no concern over it being
partially replaced on platforms where pointer assignments are non-atomic
- such as via /proc/<pid>/seccomp_filters access or a call via the
exported symbols. Object lifetime is managed by reference counting so
that I don't have to worry about extending the RCU read-side critical
section by much or deal with pre-allocations.
I'll add rcu_dereference() to all the get_seccomp_filters() uses where
it makes sense, so that it is called safely. Just to make sure, does
it make sense to continue to rcu protect the specific pointer?
>> +}
>> +
>> +/**
>> + * seccomp_test_filters - tests 'current' against the given syscall
>> + * @syscall: number of the system call to test
>> + *
>> + * Holds a reference on current's filters for the duration of the check.
>> + *
>> + * Returns 0 when the call is permitted and -EACCES otherwise.
>> + */
>> +int seccomp_test_filters(int syscall)
>> +{
>> +	struct seccomp_filters *filters;
>> +	struct event_filter *filter;
>> +	uint16_t id;
>> +	int ret = -EACCES;
>> +
>> +	rcu_read_lock();
>> +	filters = get_seccomp_filters(current->seccomp.filters);
>> +	rcu_read_unlock();
>> +
>> +	/* No filters attached: deny. */
>> +	if (!filters)
>> +		goto out;
>> +
>> +	if (filters_compat_mismatch(filters)) {
>> +		pr_info("%s[%d]: seccomp_filter compat() mismatch.\n",
>> +			current->comm, task_pid_nr(current));
>> +		goto out;
>> +	}
>> +
>> +	/* execve is never allowed. */
>> +	if (syscall_is_execve(syscall))
>> +		goto out;
>> +
>> +	id = seccomp_filter_id(filters, syscall);
>> +	if (seccomp_filter_allow(id)) {
>> +		ret = 0;
>> +	} else if (seccomp_filter_dynamic(id)) {
>> +		/* Dynamic entries are allowed only when the filter matches. */
>> +		filter = seccomp_dynamic_filter(filters, id);
>> +		if (filter && filter_match_current(filter))
>> +			ret = 0;
>> +	}
>> +out:
>> +	put_seccomp_filters(filters);
>> +	return ret;
>> +}
>> +
>> +/**
>> + * seccomp_show_filters - prints the current filter state to a seq_file
>> + * @filters: properly get()'d filters object
>> + * @m: the prepared seq_file to receive the data
>> + *
>> + * Prints current's seccomp mode, then one line for every syscall whose
>> + * entry is not a deny.
>> + *
>> + * Always returns 0.
>> + */
>> +int seccomp_show_filters(struct seccomp_filters *filters, struct seq_file *m)
>> +{
>> +	int nr;
>> +
>> +	seq_printf(m, "Mode: %d\n", current->seccomp.mode);
>> +	if (!filters)
>> +		return 0;
>> +
>> +	for (nr = 0; nr < NR_syscalls; nr++) {
>> +		uint16_t id = seccomp_filter_id(filters, nr);
>> +		const char *text;
>> +
>> +		/* Denied syscalls are not listed. */
>> +		if (seccomp_filter_deny(id))
>> +			continue;
>> +
>> +		seq_printf(m, "%d (%s): ", nr, syscall_nr_to_name(nr));
>> +
>> +		/* Dynamic entries print their filter string, others the allow string. */
>> +		if (seccomp_filter_dynamic(id))
>> +			text = get_filter_string(
>> +					seccomp_dynamic_filter(filters, id));
>> +		else
>> +			text = SECCOMP_FILTER_ALLOW;
>> +		seq_printf(m, "%s\n", text);
>> +	}
>> +	return 0;
>> +}
>> +EXPORT_SYMBOL_GPL(seccomp_show_filters);
>> +
>> +/**
>> + * seccomp_get_filter - copies the filter_string into "buf"
>> + * @syscall_nr: system call number to look up
>> + * @buf: destination buffer
>> + * @bufsize: available space in the buffer.
>> + *
>> + * Context: User context only. This function may sleep on allocation and
>> + *          operates on current. current must be attempting a system call
>> + *          when this is called.
>> + *
>> + * Looks up the filter for the given system call number on current.  If found,
>> + * the string length of the NUL-terminated buffer is returned (the NUL byte is
>> + * not included in the length).  Returns -EINVAL when current has no filters,
>> + * -ENOENT when the syscall is denied or has no filter object, and -ENOSPC
>> + * when the string does not fit in @bufsize.
>> + */
>> +long seccomp_get_filter(int syscall_nr, char *buf, unsigned long bufsize)
>> +{
>> +	struct seccomp_filters *filters;
>> +	const char *text;
>> +	long ret = -EINVAL;
>> +	uint16_t id;
>> +
>> +	/* Never use more than SECCOMP_MAX_FILTER_LENGTH bytes of @buf. */
>> +	if (bufsize > SECCOMP_MAX_FILTER_LENGTH)
>> +		bufsize = SECCOMP_MAX_FILTER_LENGTH;
>> +
>> +	rcu_read_lock();
>> +	filters = get_seccomp_filters(current->seccomp.filters);
>> +	rcu_read_unlock();
>> +
>> +	if (!filters)
>> +		goto out;
>> +
>> +	ret = -ENOENT;
>> +	id = seccomp_filter_id(filters, syscall_nr);
>> +	if (seccomp_filter_deny(id))
>> +		goto out;
>> +
>> +	if (seccomp_filter_allow(id)) {
>> +		text = SECCOMP_FILTER_ALLOW;
>> +	} else {
>> +		struct event_filter *filter;
>> +
>> +		filter = seccomp_dynamic_filter(filters, id);
>> +		if (!filter)
>> +			goto out;
>> +		text = get_filter_string(filter);
>> +	}
>> +
>> +	/* strlcpy() reports the source length, so >= bufsize means truncation. */
>> +	ret = strlcpy(buf, text, bufsize);
>> +	if (ret >= bufsize) {
>> +		ret = -ENOSPC;
>> +		goto out;
>> +	}
>> +	/* Zero out any remaining buffer, just in case. */
>> +	memset(buf + ret, 0, bufsize - ret);
>> +out:
>> +	put_seccomp_filters(filters);
>> +	return ret;
>> +}
>> +EXPORT_SYMBOL_GPL(seccomp_get_filter);
>> +
>> +/**
>> + * seccomp_clear_filter: clears the seccomp filter for a syscall.
>> + * @syscall_nr: the system call number to clear filters for.
>> + *
>> + * Context: User context only. This function may sleep on allocation and
>> + * operates on current. current must be attempting a system call
>> + * when this is called.
>> + *
>> + * Returns 0 on success.
>> + */
>> +long seccomp_clear_filter(int syscall_nr)
>> +{
>> + struct seccomp_filters *filters = NULL, *orig_filters;
>> + uint16_t id;
>> + int ret = -EINVAL;
>> +
>> + rcu_read_lock();
>> + orig_filters = get_seccomp_filters(current->seccomp.filters);
>> + rcu_read_unlock();
>> +
>> + if (!orig_filters)
>> + goto out;
>> +
>> + if (filters_compat_mismatch(orig_filters))
>> + goto out;
>> +
>> + id = seccomp_filter_id(orig_filters, syscall_nr);
>> + if (seccomp_filter_deny(id))
>> + goto out;
>> +
>> + /* Create a new filters object for the task */
>> + if (seccomp_filter_dynamic(id))
>> + filters = seccomp_filters_new(orig_filters->count - 1);
>> + else
>> + filters = seccomp_filters_new(orig_filters->count);
>> +
>> + if (IS_ERR(filters)) {
>> + ret = PTR_ERR(filters);
>> + goto out;
>> + }
>> +
>> + /* Copy, but drop the requested entry. */
>> + ret = seccomp_filters_copy(filters, orig_filters, syscall_nr);
>> + if (ret)
>> + goto out;
>> + get_seccomp_filters(filters); /* simplify the out: path */
>> +
>> + rcu_assign_pointer(current->seccomp.filters, filters);
>
> What prevents two copies of seccomp_clear_filter() from running
> concurrently?
Nothing - the last one wins assignment, but the objects themselves
should be internally consistent to the parallel calls. If that's a
concern, a per-task writer mutex could be used just to ensure
simultaneous calls to clear and set are performed serially. Would
that make more sense?
>> + synchronize_rcu();
>> + put_seccomp_filters(orig_filters); /* for the task */
>> +out:
>> + put_seccomp_filters(orig_filters); /* for the get */
>> + put_seccomp_filters(filters); /* for the extra get */
>> + return ret;
>> +}
>> +EXPORT_SYMBOL_GPL(seccomp_clear_filter);
>> +
>> +/**
>> + * seccomp_set_filter: - Adds/extends a seccomp filter for a syscall.
>> + * @syscall_nr: system call number to apply the filter to.
>> + * @filter: ftrace filter string to apply.
>> + *
>> + * Context: User context only. This function may sleep on allocation and
>> + * operates on current. current must be attempting a system call
>> + * when this is called.
>> + *
>> + * New filters may be added for system calls when the current task is
>> + * not in a secure computing mode (seccomp). Otherwise, existing filters may
>> + * be extended.
>> + *
>> + * Returns 0 on success or an errno on failure.
>> + */
>> +long seccomp_set_filter(int syscall_nr, char *filter)
>> +{
>> + struct seccomp_filters *filters = NULL, *orig_filters = NULL;
>> + uint16_t id;
>> + long ret = -EINVAL;
>> + uint16_t filters_needed;
>> +
>> + if (!filter)
>> + goto out;
>> +
>> + filter = strstrip(filter);
>> + /* Disallow empty strings. */
>> + if (filter[0] == 0)
>> + goto out;
>> +
>> + rcu_read_lock();
>> + orig_filters = get_seccomp_filters(current->seccomp.filters);
>> + rcu_read_unlock();
>> +
>> + /* After the first call, compatibility mode is selected permanently. */
>> + ret = -EACCES;
>> + if (filters_compat_mismatch(orig_filters))
>> + goto out;
>> +
>> + filters_needed = orig_filters ? orig_filters->count : 0;
>> + id = seccomp_filter_id(orig_filters, syscall_nr);
>> + if (seccomp_filter_deny(id)) {
>> + /* Don't allow DENYs to be changed when in a seccomp mode */
>> + ret = -EACCES;
>> + if (current->seccomp.mode)
>> + goto out;
>> + filters_needed++;
>> + }
>> +
>> + filters = seccomp_filters_new(filters_needed);
>> + if (IS_ERR(filters)) {
>> + ret = PTR_ERR(filters);
>> + goto out;
>> + }
>> +
>> + filters_set_compat(filters);
>> + if (orig_filters) {
>> + ret = seccomp_filters_copy(filters, orig_filters, -1);
>> + if (ret)
>> + goto out;
>> + }
>> +
>> + if (seccomp_filter_deny(id))
>> + ret = seccomp_add_filter(filters, syscall_nr, filter);
>> + else
>> + ret = seccomp_extend_filter(filters, syscall_nr, filter);
>> + if (ret)
>> + goto out;
>> + get_seccomp_filters(filters); /* simplify the error paths */
>> +
>> + rcu_assign_pointer(current->seccomp.filters, filters);
>
> Again, what prevents two copies of seccomp_set_filter() from running
> concurrently?
Same deal - nothing, but I'd be happy to add a guard if it makes sense.
Thanks!
>> + synchronize_rcu();
>> + put_seccomp_filters(orig_filters); /* for the task */
>> +out:
>> + put_seccomp_filters(orig_filters); /* for the get */
>> + put_seccomp_filters(filters); /* for get or task, on err */
>> + return ret;
>> +}
>> +EXPORT_SYMBOL_GPL(seccomp_set_filter);
>> +
>> +/**
>> + * prctl_set_seccomp_filter - PR_SET_SECCOMP_FILTER handler
>> + * @syscall_nr: system call number to apply the filter to
>> + * @user_filter: userspace pointer to the NUL-terminated filter string
>> + *
>> + * Copies the filter string from userspace and passes it on to
>> + * seccomp_set_filter().  A string that does not fit is rejected instead of
>> + * being truncated, because truncation could silently drop trailing
>> + * predicates from the filter.
>> + *
>> + * Returns -EINVAL for an out-of-range @syscall_nr, -EFAULT for a NULL or
>> + * unreadable @user_filter, -ENOMEM on allocation failure, -E2BIG when the
>> + * string is SECCOMP_MAX_FILTER_LENGTH characters or longer, and otherwise
>> + * the result of seccomp_set_filter().
>> + */
>> +long prctl_set_seccomp_filter(unsigned long syscall_nr,
>> +			      char __user *user_filter)
>> +{
>> +	long ret, len;
>> +	char *filter = NULL;
>> +
>> +	ret = -EINVAL;
>> +	if (syscall_nr >= NR_syscalls)
>> +		goto out;
>> +
>> +	ret = -EFAULT;
>> +	if (!user_filter)
>> +		goto out;
>> +
>> +	filter = kzalloc(SECCOMP_MAX_FILTER_LENGTH + 1, GFP_KERNEL);
>> +	ret = -ENOMEM;
>> +	if (!filter)
>> +		goto out;
>> +
>> +	len = strncpy_from_user(filter, user_filter,
>> +				SECCOMP_MAX_FILTER_LENGTH);
>> +	ret = -EFAULT;
>> +	if (len < 0)
>> +		goto out;
>> +
>> +	/*
>> +	 * strncpy_from_user() returns the count when no NUL was found within
>> +	 * it, i.e. the user string is too long to be used as a filter.
>> +	 */
>> +	ret = -E2BIG;
>> +	if (len >= SECCOMP_MAX_FILTER_LENGTH)
>> +		goto out;
>> +
>> +	/* syscall_nr was range-checked above, so the cast cannot truncate. */
>> +	ret = seccomp_set_filter((int) syscall_nr, filter);
>> +
>> +out:
>> +	kfree(filter);
>> +	return ret;
>> +}
>> +
>> +/*
>> + * prctl_clear_seccomp_filter - PR_CLEAR_SECCOMP_FILTER handler
>> + *
>> + * Returns -EINVAL for an out-of-range @syscall_nr, otherwise the result of
>> + * seccomp_clear_filter().
>> + */
>> +long prctl_clear_seccomp_filter(unsigned long syscall_nr)
>> +{
>> +	if (syscall_nr >= NR_syscalls)
>> +		return -EINVAL;
>> +
>> +	return seccomp_clear_filter((int) syscall_nr);
>> +}
>> +
>> +/**
>> + * prctl_get_seccomp_filter - PR_GET_SECCOMP_FILTER handler
>> + * @syscall_nr: system call number to look up
>> + * @dst: userspace buffer receiving the NUL-terminated filter string
>> + * @available: size of @dst in bytes
>> + *
>> + * Returns 0 on success, -EINVAL when @available is 0 or @syscall_nr is out
>> + * of range, -ENOMEM on allocation failure, -EFAULT when @dst cannot be
>> + * written, or the error returned by seccomp_get_filter().
>> + */
>> +long prctl_get_seccomp_filter(unsigned long syscall_nr, char __user *dst,
>> +			      unsigned long available)
>> +{
>> +	long ret;
>> +	char *buf = NULL;
>> +
>> +	ret = -EINVAL;
>> +	if (!available)
>> +		goto out;
>> +	/* Ignore extra buffer space. */
>> +	if (available > SECCOMP_MAX_FILTER_LENGTH)
>> +		available = SECCOMP_MAX_FILTER_LENGTH;
>> +
>> +	if (syscall_nr >= NR_syscalls)
>> +		goto out;
>> +
>> +	ret = -ENOMEM;
>> +	buf = kmalloc(available, GFP_KERNEL);
>> +	if (!buf)
>> +		goto out;
>> +
>> +	/* On success ret is the string length, always below @available. */
>> +	ret = seccomp_get_filter((int) syscall_nr, buf, available);
>> +	if (ret < 0)
>> +		goto out;
>> +
>> +	/* Include the NUL byte in the copy; a short copy means a fault. */
>> +	if (copy_to_user(dst, buf, ret + 1)) {
>> +		ret = -EFAULT;
>> +		goto out;
>> +	}
>> +	ret = 0;
>> +out:
>> +	kfree(buf);
>> +	return ret;
>> +}
>> diff --git a/kernel/sys.c b/kernel/sys.c
>> index af468ed..ed60d06 100644
>> --- a/kernel/sys.c
>> +++ b/kernel/sys.c
>> @@ -1698,13 +1698,24 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
>> case PR_SET_ENDIAN:
>> error = SET_ENDIAN(me, arg2);
>> break;
>> -
>> case PR_GET_SECCOMP:
>> error = prctl_get_seccomp();
>> break;
>> case PR_SET_SECCOMP:
>> error = prctl_set_seccomp(arg2);
>> break;
>> + case PR_SET_SECCOMP_FILTER:
>> + error = prctl_set_seccomp_filter(arg2,
>> + (char __user *) arg3);
>> + break;
>> + case PR_CLEAR_SECCOMP_FILTER:
>> + error = prctl_clear_seccomp_filter(arg2);
>> + break;
>> + case PR_GET_SECCOMP_FILTER:
>> + error = prctl_get_seccomp_filter(arg2,
>> + (char __user *) arg3,
>> + arg4);
>> + break;
>> case PR_GET_TSC:
>> error = GET_TSC_CTL(arg2);
>> break;
>> diff --git a/security/Kconfig b/security/Kconfig
>> index 95accd4..c76adf2 100644
>> --- a/security/Kconfig
>> +++ b/security/Kconfig
>> @@ -2,6 +2,10 @@
>> # Security configuration
>> #
>>
>> +# Make seccomp filter Kconfig switch below available
>> +config HAVE_SECCOMP_FILTER
>> + bool
>> +
>> menu "Security options"
>>
>> config KEYS
>> @@ -82,6 +86,19 @@ config SECURITY_DMESG_RESTRICT
>>
>> If you are unsure how to answer this question, answer N.
>>
>> +config SECCOMP_FILTER
>> + bool "Enable seccomp-based system call filtering"
>> + select SECCOMP
>> + depends on HAVE_SECCOMP_FILTER && EXPERIMENTAL
>> + help
>> + This kernel feature expands CONFIG_SECCOMP to allow computing
>> + in environments with reduced kernel access dictated by the
>> + application itself through prctl calls. If
>> + CONFIG_FTRACE_SYSCALLS is available, then system call
>> + argument-based filtering predicates may be used.
>> +
>> + See Documentation/prctl/seccomp_filter.txt for more detail.
>> +
>> config SECURITY
>> bool "Enable different security models"
>> depends on SYSFS
>> --
>> 1.7.0.4
>>
>> --
>> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
>> the body of a message to majordomo@vger.kernel.org
>> More majordomo info at http://vger.kernel.org/majordomo-info.html
>> Please read the FAQ at http://www.tux.org/lkml/
>
^ permalink raw reply [flat|nested] 91+ messages in thread* Re: [PATCH v3 03/13] seccomp_filters: new mode with configurable syscall filters
2011-06-02 18:14 ` Will Drewry
@ 2011-06-02 19:42 ` Paul E. McKenney
2011-06-02 20:28 ` Will Drewry
0 siblings, 1 reply; 91+ messages in thread
From: Paul E. McKenney @ 2011-06-02 19:42 UTC (permalink / raw)
To: Will Drewry
Cc: linux-kernel, kees.cook, torvalds, tglx, mingo, rostedt, jmorris,
Peter Zijlstra, Frederic Weisbecker, linux-security-module
On Thu, Jun 02, 2011 at 01:14:54PM -0500, Will Drewry wrote:
> On Thu, Jun 2, 2011 at 12:36 PM, Paul E. McKenney
> <paulmck@linux.vnet.ibm.com> wrote:
> > On Tue, May 31, 2011 at 10:10:35PM -0500, Will Drewry wrote:
> >> This change adds a new seccomp mode which specifies the allowed system
> >> calls dynamically. When in the new mode (2), all system calls are
> >> checked against process-defined filters - first by system call number,
> >> then by a filter string. If an entry exists for a given system call and
> >> all filter predicates evaluate to true, then the task may proceed.
> >> Otherwise, the task is killed.
> >
> > A few questions below -- I can't say that I understand the RCU usage.
> >
> > Thanx, Paul
> >
> >> Filter string parsing and evaluation is handled by the ftrace filter
> >> engine. Related patches tweak the perf filter trace and free code,
> >> allowing the calls to be shared. Filters inherit their understanding of
> >> types and arguments for each system call from the CONFIG_FTRACE_SYSCALLS
> >> subsystem which already populates this information in syscall_metadata
> >> associated enter_event (and exit_event) structures. If
> >> CONFIG_FTRACE_SYSCALLS is not compiled in, only filter strings of "1"
> >> will be allowed.
> >>
> >> The net result is a process may have its system calls filtered using the
> >> ftrace filter engine's inherent understanding of system calls. The set
> >> of filters is specified through the PR_SET_SECCOMP_FILTER argument in
> >> prctl(). For example, a filterset for a process, like pdftotext, that
> >> should only process read-only input could (roughly) look like:
> >> sprintf(rdonly, "flags == %u", O_RDONLY|O_LARGEFILE);
> >> prctl(PR_SET_SECCOMP_FILTER, __NR_open, rdonly);
> >> prctl(PR_SET_SECCOMP_FILTER, __NR__llseek, "1");
> >> prctl(PR_SET_SECCOMP_FILTER, __NR_brk, "1");
> >> prctl(PR_SET_SECCOMP_FILTER, __NR_close, "1");
> >> prctl(PR_SET_SECCOMP_FILTER, __NR_exit_group, "1");
> >> prctl(PR_SET_SECCOMP_FILTER, __NR_fstat64, "1");
> >> prctl(PR_SET_SECCOMP_FILTER, __NR_mmap2, "1");
> >> prctl(PR_SET_SECCOMP_FILTER, __NR_munmap, "1");
> >> prctl(PR_SET_SECCOMP_FILTER, __NR_read, "1");
> >> prctl(PR_SET_SECCOMP_FILTER, __NR_write, "(fd == 1 || fd == 2)");
> >> prctl(PR_SET_SECCOMP, 2);
> >>
> >> Subsequent calls to PR_SET_SECCOMP_FILTER for the same system call will
> >> be &&'d together to ensure that attack surface may only be reduced:
> >> prctl(PR_SET_SECCOMP_FILTER, __NR_write, "fd != 2");
> >>
> >> With the earlier example, the active filter becomes:
> >> "(fd == 1 || fd == 2) && fd != 2"
> >>
> >> The patch also adds PR_CLEAR_SECCOMP_FILTER and PR_GET_SECCOMP_FILTER.
> >> The latter returns the current filter for a system call to userspace:
> >>
> >> prctl(PR_GET_SECCOMP_FILTER, __NR_write, buf, bufsize);
> >>
> >> while the former clears any filters for a given system call changing it
> >> back to a default deny:
> >>
> >> prctl(PR_CLEAR_SECCOMP_FILTER, __NR_write);
> >>
> >> v3: - always block execve calls (as per linus torvalds)
> >> - add __NR_seccomp_execve(_32) to seccomp-supporting arches
> >> - ensure compat tasks can't reach ftrace:syscalls
> >> - dropped new defines for seccomp modes.
> >> - two level array instead of hlists (sugg. by olof johansson)
> >> - added generic Kconfig entry that is not connected.
> >> - dropped internal seccomp.h
> >> - move prctl helpers to seccomp_filter
> >> - killed seccomp_t typedef (as per checkpatch)
> >> v2: - changed to use the existing syscall number ABI.
> >> - prctl changes to minimize parsing in the kernel:
> >> prctl(PR_SET_SECCOMP, {0 | 1 | 2 }, { 0 | ON_EXEC });
> >> prctl(PR_SET_SECCOMP_FILTER, __NR_read, "fd == 5");
> >> prctl(PR_CLEAR_SECCOMP_FILTER, __NR_read);
> >> prctl(PR_GET_SECCOMP_FILTER, __NR_read, buf, bufsize);
> >> - defined PR_SECCOMP_MODE_STRICT and ..._FILTER
> >> - added flags
> >> - provide a default fail syscall_nr_to_meta in ftrace
> >> - provides fallback for unhooked system calls
> >> - use -ENOSYS and ERR_PTR(-ENOSYS) for stubbed functionality
> >> - added kernel/seccomp.h to share seccomp.c/seccomp_filter.c
> >> - moved to a hlist and 4 bit hash of linked lists
> >> - added support to operate without CONFIG_FTRACE_SYSCALLS
> >> - moved Kconfig support next to SECCOMP
> >> - made Kconfig entries dependent on EXPERIMENTAL
> >> - added macros to avoid ifdefs from kernel/fork.c
> >> - added compat task/filter matching
> >> - drop seccomp.h inclusion in sched.h and drop seccomp_t
> >> - added Filtering to "show" output
> >> - added on_exec state dup'ing when enabling after a fast-path accept.
> >>
> >> Signed-off-by: Will Drewry <wad@chromium.org>
> >> ---
> >> include/linux/prctl.h | 5 +
> >> include/linux/sched.h | 2 +-
> >> include/linux/seccomp.h | 98 ++++++-
> >> include/trace/syscall.h | 7 +
> >> kernel/Makefile | 3 +
> >> kernel/fork.c | 3 +
> >> kernel/seccomp.c | 38 ++-
> >> kernel/seccomp_filter.c | 784 +++++++++++++++++++++++++++++++++++++++++++++++
> >> kernel/sys.c | 13 +-
> >> security/Kconfig | 17 +
> >> 10 files changed, 954 insertions(+), 16 deletions(-)
> >> create mode 100644 kernel/seccomp_filter.c
> >>
> >> diff --git a/include/linux/prctl.h b/include/linux/prctl.h
> >> index a3baeb2..44723ce 100644
> >> --- a/include/linux/prctl.h
> >> +++ b/include/linux/prctl.h
> >> @@ -64,6 +64,11 @@
> >> #define PR_GET_SECCOMP 21
> >> #define PR_SET_SECCOMP 22
> >>
> >> +/* Get/set process seccomp filters */
> >> +#define PR_GET_SECCOMP_FILTER 35
> >> +#define PR_SET_SECCOMP_FILTER 36
> >> +#define PR_CLEAR_SECCOMP_FILTER 37
> >> +
> >> /* Get/set the capability bounding set (as per security/commoncap.c) */
> >> #define PR_CAPBSET_READ 23
> >> #define PR_CAPBSET_DROP 24
> >> diff --git a/include/linux/sched.h b/include/linux/sched.h
> >> index 18d63ce..3f0bc8d 100644
> >> --- a/include/linux/sched.h
> >> +++ b/include/linux/sched.h
> >> @@ -1374,7 +1374,7 @@ struct task_struct {
> >> uid_t loginuid;
> >> unsigned int sessionid;
> >> #endif
> >> - seccomp_t seccomp;
> >> + struct seccomp_struct seccomp;
> >>
> >> /* Thread group tracking */
> >> u32 parent_exec_id;
> >> diff --git a/include/linux/seccomp.h b/include/linux/seccomp.h
> >> index 167c333..f4434ca 100644
> >> --- a/include/linux/seccomp.h
> >> +++ b/include/linux/seccomp.h
> >> @@ -1,13 +1,33 @@
> >> #ifndef _LINUX_SECCOMP_H
> >> #define _LINUX_SECCOMP_H
> >>
> >> +struct seq_file;
> >>
> >> #ifdef CONFIG_SECCOMP
> >>
> >> +#include <linux/errno.h>
> >> #include <linux/thread_info.h>
> >> +#include <linux/types.h>
> >> #include <asm/seccomp.h>
> >>
> >> -typedef struct { int mode; } seccomp_t;
> >> +struct seccomp_filters;
> >> +/**
> >> + * struct seccomp_struct - the state of a seccomp'ed process
> >> + *
> >> + * @mode:
> >> + * if this is 1, the process is under standard seccomp rules
> >> + * is 2, the process is only allowed to make system calls where
> >> + * associated filters evaluate successfully.
> >> + * @filters: Metadata for filters if using CONFIG_SECCOMP_FILTER.
> >> + * filters assignment/use should be RCU-protected and its contents
> >> + * should never be modified when attached to a seccomp_struct.
> >> + */
> >> +struct seccomp_struct {
> >> + uint16_t mode;
> >> +#ifdef CONFIG_SECCOMP_FILTER
> >> + struct seccomp_filters *filters;
> >> +#endif
> >> +};
> >>
> >> extern void __secure_computing(int);
> >> static inline void secure_computing(int this_syscall)
> >> @@ -16,15 +36,14 @@ static inline void secure_computing(int this_syscall)
> >> __secure_computing(this_syscall);
> >> }
> >>
> >> -extern long prctl_get_seccomp(void);
> >> extern long prctl_set_seccomp(unsigned long);
> >> +extern long prctl_get_seccomp(void);
> >>
> >> #else /* CONFIG_SECCOMP */
> >>
> >> #include <linux/errno.h>
> >>
> >> -typedef struct { } seccomp_t;
> >> -
> >> +struct seccomp_struct { };
> >> #define secure_computing(x) do { } while (0)
> >>
> >> static inline long prctl_get_seccomp(void)
> >> @@ -32,11 +51,80 @@ static inline long prctl_get_seccomp(void)
> >> return -EINVAL;
> >> }
> >>
> >> -static inline long prctl_set_seccomp(unsigned long arg2)
> >> +/* !CONFIG_SECCOMP stub: setting a seccomp mode always fails with -EINVAL. */
> >> +static inline long prctl_set_seccomp(unsigned long a2)
> >> {
> >> return -EINVAL;
> >> }
> >>
> >> #endif /* CONFIG_SECCOMP */
> >>
> >> +#ifdef CONFIG_SECCOMP_FILTER
> >> +
> >> +#define inherit_tsk_seccomp(_child, _orig) do { \
> >> + _child->seccomp.mode = _orig->seccomp.mode; \
> >> + _child->seccomp.filters = get_seccomp_filters(_orig->seccomp.filters); \
> >> + } while (0)
> >> +#define put_tsk_seccomp(_tsk) put_seccomp_filters(_tsk->seccomp.filters)
> >> +
> >> +extern int seccomp_show_filters(struct seccomp_filters *filters,
> >> + struct seq_file *);
> >> +extern long seccomp_set_filter(int, char *);
> >> +extern long seccomp_clear_filter(int);
> >> +extern long seccomp_get_filter(int, char *, unsigned long);
> >> +
> >> +extern long prctl_set_seccomp_filter(unsigned long, char __user *);
> >> +extern long prctl_get_seccomp_filter(unsigned long, char __user *,
> >> + unsigned long);
> >> +extern long prctl_clear_seccomp_filter(unsigned long);
> >> +
> >> +extern struct seccomp_filters *get_seccomp_filters(struct seccomp_filters *);
> >> +extern void put_seccomp_filters(struct seccomp_filters *);
> >> +
> >> +extern int seccomp_test_filters(int);
> >> +extern void seccomp_filter_log_failure(int);
> >> +
> >> +#else /* CONFIG_SECCOMP_FILTER */
> >> +
> >> +struct seccomp_filters { };
> >> +#define inherit_tsk_seccomp(_child, _orig) do { } while (0)
> >> +#define put_tsk_seccomp(_tsk) do { } while (0)
> >> +
> >> +static inline int seccomp_show_filters(struct seccomp_filters *filters,
> >> + struct seq_file *m)
> >> +{
> >> + return -ENOSYS;
> >> +}
> >> +
> >> +static inline long seccomp_set_filter(int syscall_nr, char *filter)
> >> +{
> >> + return -ENOSYS;
> >> +}
> >> +
> >> +static inline long seccomp_clear_filter(int syscall_nr)
> >> +{
> >> + return -ENOSYS;
> >> +}
> >> +
> >> +static inline long seccomp_get_filter(int syscall_nr,
> >> + char *buf, unsigned long available)
> >> +{
> >> + return -ENOSYS;
> >> +}
> >> +
> >> +static inline long prctl_set_seccomp_filter(unsigned long a2, char __user *a3)
> >> +{
> >> + return -ENOSYS;
> >> +}
> >> +
> >> +static inline long prctl_clear_seccomp_filter(unsigned long a2)
> >> +{
> >> + return -ENOSYS;
> >> +}
> >> +
> >> +static inline long prctl_get_seccomp_filter(unsigned long a2, char __user *a3,
> >> + unsigned long a4)
> >> +{
> >> + return -ENOSYS;
> >> +}
> >> +#endif /* CONFIG_SECCOMP_FILTER */
> >> #endif /* _LINUX_SECCOMP_H */
> >> diff --git a/include/trace/syscall.h b/include/trace/syscall.h
> >> index 242ae04..e061ad0 100644
> >> --- a/include/trace/syscall.h
> >> +++ b/include/trace/syscall.h
> >> @@ -35,6 +35,8 @@ struct syscall_metadata {
> >> extern unsigned long arch_syscall_addr(int nr);
> >> extern int init_syscall_trace(struct ftrace_event_call *call);
> >>
> >> +extern struct syscall_metadata *syscall_nr_to_meta(int);
> >> +
> >> extern int reg_event_syscall_enter(struct ftrace_event_call *call);
> >> extern void unreg_event_syscall_enter(struct ftrace_event_call *call);
> >> extern int reg_event_syscall_exit(struct ftrace_event_call *call);
> >> @@ -49,6 +51,11 @@ enum print_line_t print_syscall_enter(struct trace_iterator *iter, int flags,
> >> struct trace_event *event);
> >> enum print_line_t print_syscall_exit(struct trace_iterator *iter, int flags,
> >> struct trace_event *event);
> >> +#else
> >> +static inline struct syscall_metadata *syscall_nr_to_meta(int nr)
> >> +{
> >> + return NULL;
> >> +}
> >> #endif
> >>
> >> #ifdef CONFIG_PERF_EVENTS
> >> diff --git a/kernel/Makefile b/kernel/Makefile
> >> index 85cbfb3..84e7dfb 100644
> >> --- a/kernel/Makefile
> >> +++ b/kernel/Makefile
> >> @@ -81,6 +81,9 @@ obj-$(CONFIG_DETECT_HUNG_TASK) += hung_task.o
> >> obj-$(CONFIG_LOCKUP_DETECTOR) += watchdog.o
> >> obj-$(CONFIG_GENERIC_HARDIRQS) += irq/
> >> obj-$(CONFIG_SECCOMP) += seccomp.o
> >> +ifeq ($(CONFIG_SECCOMP_FILTER),y)
> >> +obj-$(CONFIG_SECCOMP) += seccomp_filter.o
> >> +endif
> >> obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o
> >> obj-$(CONFIG_TREE_RCU) += rcutree.o
> >> obj-$(CONFIG_TREE_PREEMPT_RCU) += rcutree.o
> >> diff --git a/kernel/fork.c b/kernel/fork.c
> >> index e7548de..6f835e0 100644
> >> --- a/kernel/fork.c
> >> +++ b/kernel/fork.c
> >> @@ -34,6 +34,7 @@
> >> #include <linux/cgroup.h>
> >> #include <linux/security.h>
> >> #include <linux/hugetlb.h>
> >> +#include <linux/seccomp.h>
> >> #include <linux/swap.h>
> >> #include <linux/syscalls.h>
> >> #include <linux/jiffies.h>
> >> @@ -169,6 +170,7 @@ void free_task(struct task_struct *tsk)
> >> free_thread_info(tsk->stack);
> >> rt_mutex_debug_task_free(tsk);
> >> ftrace_graph_exit_task(tsk);
> >> + put_tsk_seccomp(tsk);
> >> free_task_struct(tsk);
> >> }
> >> EXPORT_SYMBOL(free_task);
> >> @@ -280,6 +282,7 @@ static struct task_struct *dup_task_struct(struct task_struct *orig)
> >> if (err)
> >> goto out;
> >>
> >> + inherit_tsk_seccomp(tsk, orig);
> >> setup_thread_stack(tsk, orig);
> >> clear_user_return_notifier(tsk);
> >> clear_tsk_need_resched(tsk);
> >> diff --git a/kernel/seccomp.c b/kernel/seccomp.c
> >> index 57d4b13..0a942be 100644
> >> --- a/kernel/seccomp.c
> >> +++ b/kernel/seccomp.c
> >> @@ -2,16 +2,20 @@
> >> * linux/kernel/seccomp.c
> >> *
> >> * Copyright 2004-2005 Andrea Arcangeli <andrea@cpushare.com>
> >> + * Copyright (C) 2011 The Chromium OS Authors <chromium-os-dev@chromium.org>
> >> *
> >> * This defines a simple but solid secure-computing mode.
> >> */
> >>
> >> #include <linux/seccomp.h>
> >> #include <linux/sched.h>
> >> +#include <linux/slab.h>
> >> #include <linux/compat.h>
> >> +#include <linux/unistd.h>
> >> +#include <linux/ftrace_event.h>
> >>
> >> +#define SECCOMP_MAX_FILTER_LENGTH MAX_FILTER_STR_VAL
> >> /* #define SECCOMP_DEBUG 1 */
> >> -#define NR_SECCOMP_MODES 1
> >>
> >> /*
> >> * Secure computing mode 1 allows only read/write/exit/sigreturn.
> >> @@ -32,10 +36,9 @@ static int mode1_syscalls_32[] = {
> >>
> >> void __secure_computing(int this_syscall)
> >> {
> >> - int mode = current->seccomp.mode;
> >> int * syscall;
> >>
> >> - switch (mode) {
> >> + switch (current->seccomp.mode) {
> >> case 1:
> >> syscall = mode1_syscalls;
> >> #ifdef CONFIG_COMPAT
> >> @@ -47,6 +50,17 @@ void __secure_computing(int this_syscall)
> >> return;
> >> } while (*++syscall);
> >> break;
> >> +#ifdef CONFIG_SECCOMP_FILTER
> >> + case 2:
> >> + if (this_syscall >= NR_syscalls || this_syscall < 0)
> >> + break;
> >> +
> >> + if (!seccomp_test_filters(this_syscall))
> >> + return;
> >> +
> >> + seccomp_filter_log_failure(this_syscall);
> >> + break;
> >> +#endif
> >> default:
> >> BUG();
> >> }
> >> @@ -71,16 +85,22 @@ long prctl_set_seccomp(unsigned long seccomp_mode)
> >> if (unlikely(current->seccomp.mode))
> >> goto out;
> >>
> >> - ret = -EINVAL;
> >> - if (seccomp_mode && seccomp_mode <= NR_SECCOMP_MODES) {
> >> - current->seccomp.mode = seccomp_mode;
> >> - set_thread_flag(TIF_SECCOMP);
> >> + ret = 0;
> >> + switch (seccomp_mode) {
> >> + case 1:
> >> #ifdef TIF_NOTSC
> >> disable_TSC();
> >> #endif
> >> - ret = 0;
> >> +#ifdef CONFIG_SECCOMP_FILTER
> >> + case 2:
> >> +#endif
> >> + current->seccomp.mode = seccomp_mode;
> >> + set_thread_flag(TIF_SECCOMP);
> >> + break;
> >> + default:
> >> + ret = -EINVAL;
> >> }
> >>
> >> - out:
> >> +out:
> >> return ret;
> >> }
> >> diff --git a/kernel/seccomp_filter.c b/kernel/seccomp_filter.c
> >> new file mode 100644
> >> index 0000000..9782f25
> >> --- /dev/null
> >> +++ b/kernel/seccomp_filter.c
> >> @@ -0,0 +1,784 @@
> >> +/* filter engine-based seccomp system call filtering
> >> + *
> >> + * This program is free software; you can redistribute it and/or modify
> >> + * it under the terms of the GNU General Public License as published by
> >> + * the Free Software Foundation; either version 2 of the License, or
> >> + * (at your option) any later version.
> >> + *
> >> + * This program is distributed in the hope that it will be useful,
> >> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> >> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
> >> + * GNU General Public License for more details.
> >> + *
> >> + * You should have received a copy of the GNU General Public License
> >> + * along with this program; if not, write to the Free Software
> >> + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
> >> + *
> >> + * Copyright (C) 2011 The Chromium OS Authors <chromium-os-dev@chromium.org>
> >> + */
> >> +
> >> +#include <linux/compat.h>
> >> +#include <linux/err.h>
> >> +#include <linux/errno.h>
> >> +#include <linux/ftrace_event.h>
> >> +#include <linux/seccomp.h>
> >> +#include <linux/seq_file.h>
> >> +#include <linux/sched.h>
> >> +#include <linux/slab.h>
> >> +#include <linux/uaccess.h>
> >> +
> >> +#include <asm/syscall.h>
> >> +#include <trace/syscall.h>
> >> +
> >> +
> >> +#define SECCOMP_MAX_FILTER_LENGTH MAX_FILTER_STR_VAL
> >> +
> >> +#define SECCOMP_FILTER_ALLOW "1"
> >> +#define SECCOMP_ACTION_DENY 0xffff
> >> +#define SECCOMP_ACTION_ALLOW 0xfffe
> >> +
> >> +/**
> >> + * struct seccomp_filters - container for seccomp filterset
> >> + *
> >> + * @syscalls: array of 16-bit indices into @event_filters by syscall_nr
> >> + * May also be SECCOMP_ACTION_DENY or SECCOMP_ACTION_ALLOW
> >> + * @event_filters: array of pointers to ftrace event objects
> >> + * @count: size of @event_filters
> >> + * @flags: anonymous struct to wrap filters-specific flags
> >> + * @usage: reference count to simplify use.
> >> + */
> >> +struct seccomp_filters {
> >> + uint16_t syscalls[NR_syscalls];
> >> + struct event_filter **event_filters;
> >> + uint16_t count;
> >> + struct {
> >> + uint32_t compat:1,
> >> + __reserved:31;
> >> + } flags;
> >> + atomic_t usage;
> >> +};
> >> +
> >> +/* Handle ftrace symbol non-existence */
> >> +#ifdef CONFIG_FTRACE_SYSCALLS
> >> +#define create_event_filter(_ef_pptr, _event_type, _str) \
> >> + ftrace_parse_filter(_ef_pptr, _event_type, _str)
> >> +#define get_filter_string(_ef) ftrace_get_filter_string(_ef)
> >> +#define free_event_filter(_f) ftrace_free_filter(_f)
> >> +
> >> +#else
> >> +
> >> +#define create_event_filter(_ef_pptr, _event_type, _str) (-ENOSYS)
> >> +#define get_filter_string(_ef) (NULL)
> >> +#define free_event_filter(_f) do { } while (0)
> >> +#endif
> >> +
> >> +/**
> >> + * seccomp_filters_new - allocates a new filters object
> >> + * @count: count to allocate for the event_filters array
> >> + *
> >> + * Returns ERR_PTR on error or an allocated object.
> >> + */
> >> +static struct seccomp_filters *seccomp_filters_new(uint16_t count)
> >> +{
> >> + struct seccomp_filters *f;
> >> +
> >> + if (count >= SECCOMP_ACTION_ALLOW)
> >> + return ERR_PTR(-EINVAL);
> >> +
> >> + f = kzalloc(sizeof(struct seccomp_filters), GFP_KERNEL);
> >> + if (!f)
> >> + return ERR_PTR(-ENOMEM);
> >> +
> >> + /* Lazy SECCOMP_ACTION_DENY assignment. */
> >> + memset(f->syscalls, 0xff, sizeof(f->syscalls));
> >> + atomic_set(&f->usage, 1);
> >> +
> >> + f->event_filters = NULL;
> >> + f->count = count;
> >> + if (!count)
> >> + return f;
> >> +
> >> + f->event_filters = kzalloc(count * sizeof(struct event_filter *),
> >> + GFP_KERNEL);
> >> + if (!f->event_filters) {
> >> + kfree(f);
> >> + f = ERR_PTR(-ENOMEM);
> >> + }
> >> + return f;
> >> +}
> >> +
> >> +/**
> >> + * seccomp_filters_free - cleans up the filter list and frees the table
> >> + * @filters: NULL or live object to be completely destructed.
> >> + */
> >> +static void seccomp_filters_free(struct seccomp_filters *filters)
> >> +{
> >> + uint16_t count = 0;
> >> + if (!filters)
> >> + return;
> >> + while (count < filters->count) {
> >> + struct event_filter *f = filters->event_filters[count];
> >> + free_event_filter(f);
> >> + count++;
> >> + }
> >> + kfree(filters->event_filters);
> >> + kfree(filters);
> >> +}
> >> +
> >> +static void __put_seccomp_filters(struct seccomp_filters *orig)
> >> +{
> >> + WARN_ON(atomic_read(&orig->usage));
> >> + seccomp_filters_free(orig);
> >> +}
> >> +
> >> +#define seccomp_filter_allow(_id) ((_id) == SECCOMP_ACTION_ALLOW)
> >> +#define seccomp_filter_deny(_id) ((_id) == SECCOMP_ACTION_DENY)
> >> +#define seccomp_filter_dynamic(_id) \
> >> + (!seccomp_filter_allow(_id) && !seccomp_filter_deny(_id))
> >> +static inline uint16_t seccomp_filter_id(const struct seccomp_filters *f,
> >> + int syscall_nr)
> >> +{
> >> + if (!f)
> >> + return SECCOMP_ACTION_DENY;
> >> + return f->syscalls[syscall_nr];
> >> +}
> >> +
> >> +static inline struct event_filter *seccomp_dynamic_filter(
> >> + const struct seccomp_filters *filters, uint16_t id)
> >> +{
> >> + if (!seccomp_filter_dynamic(id))
> >> + return NULL;
> >> + return filters->event_filters[id];
> >> +}
> >> +
> >> +static inline void set_seccomp_filter_id(struct seccomp_filters *filters,
> >> + int syscall_nr, uint16_t id)
> >> +{
> >> + filters->syscalls[syscall_nr] = id;
> >> +}
> >> +
> >> +static inline void set_seccomp_filter(struct seccomp_filters *filters,
> >> + int syscall_nr, uint16_t id,
> >> + struct event_filter *dynamic_filter)
> >> +{
> >> + filters->syscalls[syscall_nr] = id;
> >> + if (seccomp_filter_dynamic(id))
> >> + filters->event_filters[id] = dynamic_filter;
> >> +}
> >> +
> >> +static struct event_filter *alloc_event_filter(int syscall_nr,
> >> + const char *filter_string)
> >> +{
> >> + struct syscall_metadata *data;
> >> + struct event_filter *filter = NULL;
> >> + int err;
> >> +
> >> + data = syscall_nr_to_meta(syscall_nr);
> >> + /* Argument-based filtering only works on ftrace-hooked syscalls. */
> >> + err = -ENOSYS;
> >> + if (!data)
> >> + goto fail;
> >> + err = create_event_filter(&filter,
> >> + data->enter_event->event.type,
> >> + filter_string);
> >> + if (err)
> >> + goto fail;
> >> +
> >> + return filter;
> >> +fail:
> >> + kfree(filter);
> >> + return ERR_PTR(err);
> >> +}
> >> +
> >> +/**
> >> + * seccomp_filters_copy - copies filters from src to dst.
> >> + *
> >> + * @dst: seccomp_filters to populate.
> >> + * @src: table to read from.
> >> + * @skip: specifies an entry, by system call, to skip.
> >> + *
> >> + * Returns non-zero on failure.
> >> + * Both the source and the destination should have no simultaneous
> >> + * writers, and dst should be exclusive to the caller.
> >> + * If @skip is < 0, it is ignored.
> >> + */
> >> +static int seccomp_filters_copy(struct seccomp_filters *dst,
> >> + const struct seccomp_filters *src,
> >> + int skip)
> >> +{
> >> + int id = 0, ret = 0, nr;
> >> + memcpy(&dst->flags, &src->flags, sizeof(src->flags));
> >> + memcpy(dst->syscalls, src->syscalls, sizeof(dst->syscalls));
> >> + if (!src->count)
> >> + goto done;
> >> + for (nr = 0; nr < NR_syscalls; ++nr) {
> >> + struct event_filter *filter;
> >> + const char *str;
> >> + uint16_t src_id = seccomp_filter_id(src, nr);
> >> + if (nr == skip) {
> >> + set_seccomp_filter(dst, nr, SECCOMP_ACTION_DENY,
> >> + NULL);
> >> + continue;
> >> + }
> >> + if (!seccomp_filter_dynamic(src_id))
> >> + continue;
> >> + if (id >= dst->count) {
> >> + ret = -EINVAL;
> >> + goto done;
> >> + }
> >> + str = get_filter_string(seccomp_dynamic_filter(src, src_id));
> >> + filter = alloc_event_filter(nr, str);
> >> + if (IS_ERR(filter)) {
> >> + ret = PTR_ERR(filter);
> >> + goto done;
> >> + }
> >> + set_seccomp_filter(dst, nr, id, filter);
> >> + id++;
> >> + }
> >> +
> >> +done:
> >> + return ret;
> >> +}
> >> +
> >> +/**
> >> + * seccomp_extend_filter - appends more text to a syscall_nr's filter
> >> + * @filters: unattached filter object to operate on
> >> + * @syscall_nr: syscall number to update filters for
> >> + * @filter_string: string to append to the existing filter
> >> + *
> >> + * The new string will be &&'d to the original filter string to ensure that it
> >> + * always matches the existing predicates or less:
> >> + * (old_filter) && @filter_string
> >> + * A new seccomp_filters instance is returned on success and a ERR_PTR on
> >> + * failure.
> >> + */
> >> +static int seccomp_extend_filter(struct seccomp_filters *filters,
> >> + int syscall_nr, char *filter_string)
> >> +{
> >> + struct event_filter *filter;
> >> + uint16_t id = seccomp_filter_id(filters, syscall_nr);
> >> + char *merged = NULL;
> >> + int ret = -EINVAL, expected;
> >> +
> >> + /* No extending with a "1". */
> >> + if (!strcmp(SECCOMP_FILTER_ALLOW, filter_string))
> >> + goto out;
> >> +
> >> + filter = seccomp_dynamic_filter(filters, id);
> >> + ret = -ENOENT;
> >> + if (!filter)
> >> + goto out;
> >> +
> >> + merged = kzalloc(SECCOMP_MAX_FILTER_LENGTH + 1, GFP_KERNEL);
> >> + ret = -ENOMEM;
> >> + if (!merged)
> >> + goto out;
> >> +
> >> + expected = snprintf(merged, SECCOMP_MAX_FILTER_LENGTH, "(%s) && %s",
> >> + get_filter_string(filter), filter_string);
> >> + ret = -E2BIG;
> >> + if (expected >= SECCOMP_MAX_FILTER_LENGTH || expected < 0)
> >> + goto out;
> >> +
> >> + /* Free the old filter */
> >> + free_event_filter(filter);
> >> + set_seccomp_filter(filters, syscall_nr, id, NULL);
> >> +
> >> + /* Replace it */
> >> + filter = alloc_event_filter(syscall_nr, merged);
> >> + if (IS_ERR(filter)) {
> >> + ret = PTR_ERR(filter);
> >> + goto out;
> >> + }
> >> + set_seccomp_filter(filters, syscall_nr, id, filter);
> >> + ret = 0;
> >> +
> >> +out:
> >> + kfree(merged);
> >> + return ret;
> >> +}
> >> +
> >> +/**
> >> + * seccomp_add_filter - adds a filter for an unfiltered syscall
> >> + * @filters: filters object to add a filter/action to
> >> + * @syscall_nr: system call number to add a filter for
> >> + * @filter_string: the filter string to apply
> >> + *
> >> + * Returns 0 on success and non-zero otherwise.
> >> + */
> >> +static int seccomp_add_filter(struct seccomp_filters *filters, int syscall_nr,
> >> + char *filter_string)
> >> +{
> >> + struct event_filter *filter;
> >> + int ret = 0;
> >> +
> >> + if (!strcmp(SECCOMP_FILTER_ALLOW, filter_string)) {
> >> + set_seccomp_filter(filters, syscall_nr,
> >> + SECCOMP_ACTION_ALLOW, NULL);
> >> + goto out;
> >> + }
> >> +
> >> + filter = alloc_event_filter(syscall_nr, filter_string);
> >> + if (IS_ERR(filter)) {
> >> + ret = PTR_ERR(filter);
> >> + goto out;
> >> + }
> >> + /* Always add to the last slot available since additions are
> >> + * only done one at a time.
> >> + */
> >> + set_seccomp_filter(filters, syscall_nr, filters->count - 1, filter);
> >> +out:
> >> + return ret;
> >> +}
> >> +
> >> +/* Wrap optional ftrace syscall support. Returns 1 on match or 0 otherwise. */
> >> +static int filter_match_current(struct event_filter *event_filter)
> >> +{
> >> + int err = 0;
> >> +#ifdef CONFIG_FTRACE_SYSCALLS
> >> + uint8_t syscall_state[64];
> >> +
> >> + memset(syscall_state, 0, sizeof(syscall_state));
> >> +
> >> + /* The generic tracing entry can remain zeroed. */
> >> + err = ftrace_syscall_enter_state(syscall_state, sizeof(syscall_state),
> >> + NULL);
> >> + if (err)
> >> + return 0;
> >> +
> >> + err = filter_match_preds(event_filter, syscall_state);
> >> +#endif
> >> + return err;
> >> +}
> >> +
> >> +static const char *syscall_nr_to_name(int syscall)
> >> +{
> >> + const char *syscall_name = "unknown";
> >> + struct syscall_metadata *data = syscall_nr_to_meta(syscall);
> >> + if (data)
> >> + syscall_name = data->name;
> >> + return syscall_name;
> >> +}
> >> +
> >> +static void filters_set_compat(struct seccomp_filters *filters)
> >> +{
> >> +#ifdef CONFIG_COMPAT
> >> + if (is_compat_task())
> >> + filters->flags.compat = 1;
> >> +#endif
> >> +}
> >> +
> >> +static inline int filters_compat_mismatch(struct seccomp_filters *filters)
> >> +{
> >> + int ret = 0;
> >> + if (!filters)
> >> + return 0;
> >> +#ifdef CONFIG_COMPAT
> >> + if (!!(is_compat_task()) == filters->flags.compat)
> >> + ret = 1;
> >> +#endif
> >> + return ret;
> >> +}
> >> +
> >> +static inline int syscall_is_execve(int syscall)
> >> +{
> >> + int nr = __NR_execve;
> >> +#ifdef CONFIG_COMPAT
> >> + if (is_compat_task())
> >> + nr = __NR_seccomp_execve_32;
> >> +#endif
> >> + return syscall == nr;
> >> +}
> >> +
> >> +#ifndef KSTK_EIP
> >> +#define KSTK_EIP(x) 0L
> >> +#endif
> >> +
> >> +void seccomp_filter_log_failure(int syscall)
> >> +{
> >> + pr_info("%s[%d]: system call %d (%s) blocked at 0x%lx\n",
> >> + current->comm, task_pid_nr(current), syscall,
> >> + syscall_nr_to_name(syscall), KSTK_EIP(current));
> >> +}
> >> +
> >> +/* put_seccomp_state - decrements the reference count of @orig and may free. */
> >> +void put_seccomp_filters(struct seccomp_filters *orig)
> >> +{
> >> + if (!orig)
> >> + return;
> >> +
> >> + if (atomic_dec_and_test(&orig->usage))
> >> + __put_seccomp_filters(orig);
> >> +}
> >> +
> >> +/* get_seccomp_state - increments the reference count of @orig */
> >> +struct seccomp_filters *get_seccomp_filters(struct seccomp_filters *orig)
> >
> > Nit: the name does not match the comment.
>
> Will fix it here and above. Thanks!
>
> >> +{
> >> + if (!orig)
> >> + return NULL;
> >> + atomic_inc(&orig->usage);
> >> + return orig;
> >
> > This is called in an RCU read-side critical section. What exactly is
> > RCU protecting? I would expect an rcu_dereference() or one of the
> > RCU list-traversal primitives somewhere, either here or at the caller.
>
> Ah, I spaced on rcu_dereference(). The goal was to make the
> assignment and replacement of the seccomp_filters pointer
> RCU-protected (in seccomp_state) so there's no concern over it being
> replaced partially on platforms where pointer assignments are non-atomic
> - such as via /proc/<pid>/seccomp_filters access or a call via the
> exported symbols. Object lifetime is managed by reference counting so
> that I don't have to worry about extending the RCU read-side critical
> section by much or deal with pre-allocations.
>
> I'll add rcu_dereference() to all the get_seccomp_filters() uses where
> it makes sense, so that it is called safely. Just to make sure, does
> it make sense to continue to rcu protect the specific pointer?
It might. The usual other option is to use a lock outside of the element
containing the reference count to protect reference-count manipulation.
If there is some convenient lock, especially if it is already held where
needed, then locking is more straightforward. Otherwise, RCU is usually
a reasonable option.
> >> +}
> >> +
> >> +/**
> >> + * seccomp_test_filters - tests 'current' against the given syscall
> >> + * @state: seccomp_state of current to use.
> >> + * @syscall: number of the system call to test
> >> + *
> >> + * Returns 0 on ok and non-zero on error/failure.
> >> + */
> >> +int seccomp_test_filters(int syscall)
> >> +{
> >> + uint16_t id;
> >> + struct event_filter *filter;
> >> + struct seccomp_filters *filters;
> >> + int ret = -EACCES;
> >> +
> >> + rcu_read_lock();
> >> + filters = get_seccomp_filters(current->seccomp.filters);
> >> + rcu_read_unlock();
> >> +
> >> + if (!filters)
> >> + goto out;
> >> +
> >> + if (filters_compat_mismatch(filters)) {
> >> + pr_info("%s[%d]: seccomp_filter compat() mismatch.\n",
> >> + current->comm, task_pid_nr(current));
> >> + goto out;
> >> + }
> >> +
> >> + /* execve is never allowed. */
> >> + if (syscall_is_execve(syscall))
> >> + goto out;
> >> +
> >> + ret = 0;
> >> + id = seccomp_filter_id(filters, syscall);
> >> + if (seccomp_filter_allow(id))
> >> + goto out;
> >> +
> >> + ret = -EACCES;
> >> + if (!seccomp_filter_dynamic(id))
> >> + goto out;
> >> +
> >> + filter = seccomp_dynamic_filter(filters, id);
> >> + if (filter && filter_match_current(filter))
> >> + ret = 0;
> >> +out:
> >> + put_seccomp_filters(filters);
> >> + return ret;
> >> +}
> >> +
> >> +/**
> >> + * seccomp_show_filters - prints the current filter state to a seq_file
> >> + * @filters: properly get()'d filters object
> >> + * @m: the prepared seq_file to receive the data
> >> + *
> >> + * Returns 0 on a successful write.
> >> + */
> >> +int seccomp_show_filters(struct seccomp_filters *filters, struct seq_file *m)
> >> +{
> >> + int syscall;
> >> + seq_printf(m, "Mode: %d\n", current->seccomp.mode);
> >> + if (!filters)
> >> + goto out;
> >> +
> >> + for (syscall = 0; syscall < NR_syscalls; ++syscall) {
> >> + uint16_t id = seccomp_filter_id(filters, syscall);
> >> + const char *filter_string = SECCOMP_FILTER_ALLOW;
> >> + if (seccomp_filter_deny(id))
> >> + continue;
> >> + seq_printf(m, "%d (%s): ",
> >> + syscall,
> >> + syscall_nr_to_name(syscall));
> >> + if (seccomp_filter_dynamic(id))
> >> + filter_string = get_filter_string(
> >> + seccomp_dynamic_filter(filters, id));
> >> + seq_printf(m, "%s\n", filter_string);
> >> + }
> >> +out:
> >> + return 0;
> >> +}
> >> +EXPORT_SYMBOL_GPL(seccomp_show_filters);
> >> +
> >> +/**
> >> + * seccomp_get_filter - copies the filter_string into "buf"
> >> + * @syscall_nr: system call number to look up
> >> + * @buf: destination buffer
> >> + * @bufsize: available space in the buffer.
> >> + *
> >> + * Context: User context only. This function may sleep on allocation and
> >> + * operates on current. current must be attempting a system call
> >> + * when this is called.
> >> + *
> >> + * Looks up the filter for the given system call number on current. If found,
> >> + * the string length of the NUL-terminated buffer is returned and < 0 is
> >> + * returned on error. The NUL byte is not included in the length.
> >> + */
> >> +long seccomp_get_filter(int syscall_nr, char *buf, unsigned long bufsize)
> >> +{
> >> + struct seccomp_filters *filters;
> >> + struct event_filter *filter;
> >> + long ret = -EINVAL;
> >> + uint16_t id;
> >> +
> >> + if (bufsize > SECCOMP_MAX_FILTER_LENGTH)
> >> + bufsize = SECCOMP_MAX_FILTER_LENGTH;
> >> +
> >> + rcu_read_lock();
> >> + filters = get_seccomp_filters(current->seccomp.filters);
> >> + rcu_read_unlock();
> >> +
> >> + if (!filters)
> >> + goto out;
> >> +
> >> + ret = -ENOENT;
> >> + id = seccomp_filter_id(filters, syscall_nr);
> >> + if (seccomp_filter_deny(id))
> >> + goto out;
> >> +
> >> + if (seccomp_filter_allow(id)) {
> >> + ret = strlcpy(buf, SECCOMP_FILTER_ALLOW, bufsize);
> >> + goto copied;
> >> + }
> >> +
> >> + filter = seccomp_dynamic_filter(filters, id);
> >> + if (!filter)
> >> + goto out;
> >> + ret = strlcpy(buf, get_filter_string(filter), bufsize);
> >> +
> >> +copied:
> >> + if (ret >= bufsize) {
> >> + ret = -ENOSPC;
> >> + goto out;
> >> + }
> >> + /* Zero out any remaining buffer, just in case. */
> >> + memset(buf + ret, 0, bufsize - ret);
> >> +out:
> >> + put_seccomp_filters(filters);
> >> + return ret;
> >> +}
> >> +EXPORT_SYMBOL_GPL(seccomp_get_filter);
> >> +
> >> +/**
> >> + * seccomp_clear_filter: clears the seccomp filter for a syscall.
> >> + * @syscall_nr: the system call number to clear filters for.
> >> + *
> >> + * Context: User context only. This function may sleep on allocation and
> >> + * operates on current. current must be attempting a system call
> >> + * when this is called.
> >> + *
> >> + * Returns 0 on success.
> >> + */
> >> +long seccomp_clear_filter(int syscall_nr)
> >> +{
> >> + struct seccomp_filters *filters = NULL, *orig_filters;
> >> + uint16_t id;
> >> + int ret = -EINVAL;
> >> +
> >> + rcu_read_lock();
> >> + orig_filters = get_seccomp_filters(current->seccomp.filters);
> >> + rcu_read_unlock();
> >> +
> >> + if (!orig_filters)
> >> + goto out;
> >> +
> >> + if (filters_compat_mismatch(orig_filters))
> >> + goto out;
> >> +
> >> + id = seccomp_filter_id(orig_filters, syscall_nr);
> >> + if (seccomp_filter_deny(id))
> >> + goto out;
> >> +
> >> + /* Create a new filters object for the task */
> >> + if (seccomp_filter_dynamic(id))
> >> + filters = seccomp_filters_new(orig_filters->count - 1);
> >> + else
> >> + filters = seccomp_filters_new(orig_filters->count);
> >> +
> >> + if (IS_ERR(filters)) {
> >> + ret = PTR_ERR(filters);
> >> + goto out;
> >> + }
> >> +
> >> + /* Copy, but drop the requested entry. */
> >> + ret = seccomp_filters_copy(filters, orig_filters, syscall_nr);
> >> + if (ret)
> >> + goto out;
> >> + get_seccomp_filters(filters); /* simplify the out: path */
> >> +
> >> + rcu_assign_pointer(current->seccomp.filters, filters);
> >
> > What prevents two copies of seccomp_clear_filter() from running
> > concurrently?
>
> Nothing - the last one wins assignment, but the objects themselves
> should be internally consistent to the parallel calls. If that's a
> concern, a per-task writer mutex could be used just to ensure
> simultaneous calls to clear and set are performed serially. Would
> that make more sense?
Here is the sequence of events that I am concerned about:
o CPU 0 sets orig_filters to point to the current filters.
o CPU 1 sets its local orig_filters to point to the current
set of filters.
o Both CPUs allocate new filters and use rcu_assign_pointer()
to do the update. As you say, the last one wins, but it appears
to me that the first one leaks memory.
o Both CPUs free the object referenced by their orig_filters,
which might or might not result in a double free, depending
on exactly what happens below. (You might actually be OK,
I didn't check -- leaking memory was enough for me to call
attention to this.)
So yes, please use some kind of mutual exclusion. Not sure what you
mean by "per-task mutex", but whatever it is must prevent two different
tasks from acting on the same set of filters at the same time. The
thing that I call "per-task mutex" would -not- do that.
> >> + synchronize_rcu();
> >> + put_seccomp_filters(orig_filters); /* for the task */
> >> +out:
> >> + put_seccomp_filters(orig_filters); /* for the get */
> >> + put_seccomp_filters(filters); /* for the extra get */
> >> + return ret;
> >> +}
> >> +EXPORT_SYMBOL_GPL(seccomp_clear_filter);
> >> +
> >> +/**
> >> + * seccomp_set_filter: - Adds/extends a seccomp filter for a syscall.
> >> + * @syscall_nr: system call number to apply the filter to.
> >> + * @filter: ftrace filter string to apply.
> >> + *
> >> + * Context: User context only. This function may sleep on allocation and
> >> + * operates on current. current must be attempting a system call
> >> + * when this is called.
> >> + *
> >> + * New filters may be added for system calls when the current task is
> >> + * not in a secure computing mode (seccomp). Otherwise, existing filters may
> >> + * be extended.
> >> + *
> >> + * Returns 0 on success or an errno on failure.
> >> + */
> >> +long seccomp_set_filter(int syscall_nr, char *filter)
> >> +{
> >> + struct seccomp_filters *filters = NULL, *orig_filters = NULL;
> >> + uint16_t id;
> >> + long ret = -EINVAL;
> >> + uint16_t filters_needed;
> >> +
> >> + if (!filter)
> >> + goto out;
> >> +
> >> + filter = strstrip(filter);
> >> + /* Disallow empty strings. */
> >> + if (filter[0] == 0)
> >> + goto out;
> >> +
> >> + rcu_read_lock();
> >> + orig_filters = get_seccomp_filters(current->seccomp.filters);
> >> + rcu_read_unlock();
> >> +
> >> + /* After the first call, compatibility mode is selected permanently. */
> >> + ret = -EACCES;
> >> + if (filters_compat_mismatch(orig_filters))
> >> + goto out;
> >> +
> >> + filters_needed = orig_filters ? orig_filters->count : 0;
> >> + id = seccomp_filter_id(orig_filters, syscall_nr);
> >> + if (seccomp_filter_deny(id)) {
> >> + /* Don't allow DENYs to be changed when in a seccomp mode */
> >> + ret = -EACCES;
> >> + if (current->seccomp.mode)
> >> + goto out;
> >> + filters_needed++;
> >> + }
> >> +
> >> + filters = seccomp_filters_new(filters_needed);
> >> + if (IS_ERR(filters)) {
> >> + ret = PTR_ERR(filters);
> >> + goto out;
> >> + }
> >> +
> >> + filters_set_compat(filters);
> >> + if (orig_filters) {
> >> + ret = seccomp_filters_copy(filters, orig_filters, -1);
> >> + if (ret)
> >> + goto out;
> >> + }
> >> +
> >> + if (seccomp_filter_deny(id))
> >> + ret = seccomp_add_filter(filters, syscall_nr, filter);
> >> + else
> >> + ret = seccomp_extend_filter(filters, syscall_nr, filter);
> >> + if (ret)
> >> + goto out;
> >> + get_seccomp_filters(filters); /* simplify the error paths */
> >> +
> >> + rcu_assign_pointer(current->seccomp.filters, filters);
> >
> > Again, what prevents two copies of seccomp_set_filter() from running
> > concurrently?
>
> Same deal - nothing, but I'd be happy to add a guard if it makes sense.
>
> Thanks!
>
> >> + synchronize_rcu();
> >> + put_seccomp_filters(orig_filters); /* for the task */
> >> +out:
> >> + put_seccomp_filters(orig_filters); /* for the get */
> >> + put_seccomp_filters(filters); /* for get or task, on err */
> >> + return ret;
> >> +}
> >> +EXPORT_SYMBOL_GPL(seccomp_set_filter);
> >> +
> >> +long prctl_set_seccomp_filter(unsigned long syscall_nr,
> >> + char __user *user_filter)
> >> +{
> >> + int nr;
> >> + long ret;
> >> + char *filter = NULL;
> >> +
> >> + ret = -EINVAL;
> >> + if (syscall_nr >= NR_syscalls)
> >> + goto out;
> >> +
> >> + ret = -EFAULT;
> >> + if (!user_filter)
> >> + goto out;
> >> +
> >> + filter = kzalloc(SECCOMP_MAX_FILTER_LENGTH + 1, GFP_KERNEL);
> >> + ret = -ENOMEM;
> >> + if (!filter)
> >> + goto out;
> >> +
> >> + ret = -EFAULT;
> >> + if (strncpy_from_user(filter, user_filter,
> >> + SECCOMP_MAX_FILTER_LENGTH - 1) < 0)
> >> + goto out;
> >> +
> >> + nr = (int) syscall_nr;
> >> + ret = seccomp_set_filter(nr, filter);
> >> +
> >> +out:
> >> + kfree(filter);
> >> + return ret;
> >> +}
> >> +
> >> +long prctl_clear_seccomp_filter(unsigned long syscall_nr)
> >> +{
> >> + int nr = -1;
> >> + long ret;
> >> +
> >> + ret = -EINVAL;
> >> + if (syscall_nr >= NR_syscalls)
> >> + goto out;
> >> +
> >> + nr = (int) syscall_nr;
> >> + ret = seccomp_clear_filter(nr);
> >> +
> >> +out:
> >> + return ret;
> >> +}
> >> +
> >> +long prctl_get_seccomp_filter(unsigned long syscall_nr, char __user *dst,
> >> + unsigned long available)
> >> +{
> >> + int ret, nr;
> >> + unsigned long copied;
> >> + char *buf = NULL;
> >> + ret = -EINVAL;
> >> + if (!available)
> >> + goto out;
> >> + /* Ignore extra buffer space. */
> >> + if (available > SECCOMP_MAX_FILTER_LENGTH)
> >> + available = SECCOMP_MAX_FILTER_LENGTH;
> >> +
> >> + ret = -EINVAL;
> >> + if (syscall_nr >= NR_syscalls)
> >> + goto out;
> >> + nr = (int) syscall_nr;
> >> +
> >> + ret = -ENOMEM;
> >> + buf = kmalloc(available, GFP_KERNEL);
> >> + if (!buf)
> >> + goto out;
> >> +
> >> + ret = seccomp_get_filter(nr, buf, available);
> >> + if (ret < 0)
> >> + goto out;
> >> +
> >> + /* Include the NUL byte in the copy. */
> >> + copied = copy_to_user(dst, buf, ret + 1);
> >> + ret = -ENOSPC;
> >> + if (copied)
> >> + goto out;
> >> + ret = 0;
> >> +out:
> >> + kfree(buf);
> >> + return ret;
> >> +}
> >> diff --git a/kernel/sys.c b/kernel/sys.c
> >> index af468ed..ed60d06 100644
> >> --- a/kernel/sys.c
> >> +++ b/kernel/sys.c
> >> @@ -1698,13 +1698,24 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
> >> case PR_SET_ENDIAN:
> >> error = SET_ENDIAN(me, arg2);
> >> break;
> >> -
> >> case PR_GET_SECCOMP:
> >> error = prctl_get_seccomp();
> >> break;
> >> case PR_SET_SECCOMP:
> >> error = prctl_set_seccomp(arg2);
> >> break;
> >> + case PR_SET_SECCOMP_FILTER:
> >> + error = prctl_set_seccomp_filter(arg2,
> >> + (char __user *) arg3);
> >> + break;
> >> + case PR_CLEAR_SECCOMP_FILTER:
> >> + error = prctl_clear_seccomp_filter(arg2);
> >> + break;
> >> + case PR_GET_SECCOMP_FILTER:
> >> + error = prctl_get_seccomp_filter(arg2,
> >> + (char __user *) arg3,
> >> + arg4);
> >> + break;
> >> case PR_GET_TSC:
> >> error = GET_TSC_CTL(arg2);
> >> break;
> >> diff --git a/security/Kconfig b/security/Kconfig
> >> index 95accd4..c76adf2 100644
> >> --- a/security/Kconfig
> >> +++ b/security/Kconfig
> >> @@ -2,6 +2,10 @@
> >> # Security configuration
> >> #
> >>
> >> +# Make seccomp filter Kconfig switch below available
> >> +config HAVE_SECCOMP_FILTER
> >> + bool
> >> +
> >> menu "Security options"
> >>
> >> config KEYS
> >> @@ -82,6 +86,19 @@ config SECURITY_DMESG_RESTRICT
> >>
> >> If you are unsure how to answer this question, answer N.
> >>
> >> +config SECCOMP_FILTER
> >> + bool "Enable seccomp-based system call filtering"
> >> + select SECCOMP
> >> + depends on HAVE_SECCOMP_FILTER && EXPERIMENTAL
> >> + help
> >> + This kernel feature expands CONFIG_SECCOMP to allow computing
> >> + in environments with reduced kernel access dictated by the
> >> + application itself through prctl calls. If
> >> + CONFIG_FTRACE_SYSCALLS is available, then system call
> >> + argument-based filtering predicates may be used.
> >> +
> >> + See Documentation/prctl/seccomp_filter.txt for more detail.
> >> +
> >> config SECURITY
> >> bool "Enable different security models"
> >> depends on SYSFS
> >> --
> >> 1.7.0.4
> >>
> >> --
> >> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
> >> the body of a message to majordomo@vger.kernel.org
> >> More majordomo info at http://vger.kernel.org/majordomo-info.html
> >> Please read the FAQ at http://www.tux.org/lkml/
> >
> --
> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at http://vger.kernel.org/majordomo-info.html
> Please read the FAQ at http://www.tux.org/lkml/
^ permalink raw reply [flat|nested] 91+ messages in thread
* Re: [PATCH v3 03/13] seccomp_filters: new mode with configurable syscall filters
2011-06-02 19:42 ` Paul E. McKenney
@ 2011-06-02 20:28 ` Will Drewry
2011-06-02 20:46 ` Steven Rostedt
0 siblings, 1 reply; 91+ messages in thread
From: Will Drewry @ 2011-06-02 20:28 UTC (permalink / raw)
To: paulmck
Cc: linux-kernel, kees.cook, torvalds, tglx, mingo, rostedt, jmorris,
Peter Zijlstra, Frederic Weisbecker, linux-security-module
On Thu, Jun 2, 2011 at 2:42 PM, Paul E. McKenney
<paulmck@linux.vnet.ibm.com> wrote:
> On Thu, Jun 02, 2011 at 01:14:54PM -0500, Will Drewry wrote:
>> On Thu, Jun 2, 2011 at 12:36 PM, Paul E. McKenney
>> <paulmck@linux.vnet.ibm.com> wrote:
>> > On Tue, May 31, 2011 at 10:10:35PM -0500, Will Drewry wrote:
>> >> This change adds a new seccomp mode which specifies the allowed system
>> >> calls dynamically. When in the new mode (2), all system calls are
>> >> checked against process-defined filters - first by system call number,
>> >> then by a filter string. If an entry exists for a given system call and
>> >> all filter predicates evaluate to true, then the task may proceed.
>> >> Otherwise, the task is killed.
>> >
>> > A few questions below -- I can't say that I understand the RCU usage.
>> >
>> > Thanx, Paul
>> >
>> >> Filter string parsing and evaluation is handled by the ftrace filter
>> >> engine. Related patches tweak the perf filter trace and free code,
>> >> allowing the calls to be shared. Filters inherit their understanding of
>> >> types and arguments for each system call from the CONFIG_FTRACE_SYSCALLS
>> >> subsystem which already populates this information in syscall_metadata
>> >> associated enter_event (and exit_event) structures. If
>> >> CONFIG_FTRACE_SYSCALLS is not compiled in, only filter strings of "1"
>> >> will be allowed.
>> >>
>> >> The net result is a process may have its system calls filtered using the
>> >> ftrace filter engine's inherent understanding of system calls. The set
>> >> of filters is specified through the PR_SET_SECCOMP_FILTER argument in
>> >> prctl(). For example, a filterset for a process, like pdftotext, that
>> >> should only process read-only input could (roughly) look like:
>> >> sprintf(rdonly, "flags == %u", O_RDONLY|O_LARGEFILE);
>> >> prctl(PR_SET_SECCOMP_FILTER, __NR_open, rdonly);
>> >> prctl(PR_SET_SECCOMP_FILTER, __NR__llseek, "1");
>> >> prctl(PR_SET_SECCOMP_FILTER, __NR_brk, "1");
>> >> prctl(PR_SET_SECCOMP_FILTER, __NR_close, "1");
>> >> prctl(PR_SET_SECCOMP_FILTER, __NR_exit_group, "1");
>> >> prctl(PR_SET_SECCOMP_FILTER, __NR_fstat64, "1");
>> >> prctl(PR_SET_SECCOMP_FILTER, __NR_mmap2, "1");
>> >> prctl(PR_SET_SECCOMP_FILTER, __NR_munmap, "1");
>> >> prctl(PR_SET_SECCOMP_FILTER, __NR_read, "1");
>> >> prctl(PR_SET_SECCOMP_FILTER, __NR_write, "(fd == 1 || fd == 2)");
>> >> prctl(PR_SET_SECCOMP, 2);
>> >>
>> >> Subsequent calls to PR_SET_SECCOMP_FILTER for the same system call will
>> >> be &&'d together to ensure that attack surface may only be reduced:
>> >> prctl(PR_SET_SECCOMP_FILTER, __NR_write, "fd != 2");
>> >>
>> >> With the earlier example, the active filter becomes:
>> >> "(fd == 1 || fd == 2) && fd != 2"
>> >>
>> >> The patch also adds PR_CLEAR_SECCOMP_FILTER and PR_GET_SECCOMP_FILTER.
>> >> The latter returns the current filter for a system call to userspace:
>> >>
>> >> prctl(PR_GET_SECCOMP_FILTER, __NR_write, buf, bufsize);
>> >>
>> >> while the former clears any filters for a given system call changing it
>> >> back to a default deny:
>> >>
>> >> prctl(PR_CLEAR_SECCOMP_FILTER, __NR_write);
>> >>
>> >> v3: - always block execve calls (as per linus torvalds)
>> >> - add __NR_seccomp_execve(_32) to seccomp-supporting arches
>> >> - ensure compat tasks can't reach ftrace:syscalls
>> >> - dropped new defines for seccomp modes.
>> >> - two level array instead of hlists (sugg. by olof johansson)
>> >> - added generic Kconfig entry that is not connected.
>> >> - dropped internal seccomp.h
>> >> - move prctl helpers to seccomp_filter
>> >> - killed seccomp_t typedef (as per checkpatch)
>> >> v2: - changed to use the existing syscall number ABI.
>> >> - prctl changes to minimize parsing in the kernel:
>> >> prctl(PR_SET_SECCOMP, {0 | 1 | 2 }, { 0 | ON_EXEC });
>> >> prctl(PR_SET_SECCOMP_FILTER, __NR_read, "fd == 5");
>> >> prctl(PR_CLEAR_SECCOMP_FILTER, __NR_read);
>> >> prctl(PR_GET_SECCOMP_FILTER, __NR_read, buf, bufsize);
>> >> - defined PR_SECCOMP_MODE_STRICT and ..._FILTER
>> >> - added flags
>> >> - provide a default fail syscall_nr_to_meta in ftrace
>> >> - provides fallback for unhooked system calls
>> >> - use -ENOSYS and ERR_PTR(-ENOSYS) for stubbed functionality
>> >> - added kernel/seccomp.h to share seccomp.c/seccomp_filter.c
>> >> - moved to a hlist and 4 bit hash of linked lists
>> >> - added support to operate without CONFIG_FTRACE_SYSCALLS
>> >> - moved Kconfig support next to SECCOMP
>> >> - made Kconfig entries dependent on EXPERIMENTAL
>> >> - added macros to avoid ifdefs from kernel/fork.c
>> >> - added compat task/filter matching
>> >> - drop seccomp.h inclusion in sched.h and drop seccomp_t
>> >> - added Filtering to "show" output
>> >> - added on_exec state dup'ing when enabling after a fast-path accept.
>> >>
>> >> Signed-off-by: Will Drewry <wad@chromium.org>
>> >> ---
>> >> include/linux/prctl.h | 5 +
>> >> include/linux/sched.h | 2 +-
>> >> include/linux/seccomp.h | 98 ++++++-
>> >> include/trace/syscall.h | 7 +
>> >> kernel/Makefile | 3 +
>> >> kernel/fork.c | 3 +
>> >> kernel/seccomp.c | 38 ++-
>> >> kernel/seccomp_filter.c | 784 +++++++++++++++++++++++++++++++++++++++++++++++
>> >> kernel/sys.c | 13 +-
>> >> security/Kconfig | 17 +
>> >> 10 files changed, 954 insertions(+), 16 deletions(-)
>> >> create mode 100644 kernel/seccomp_filter.c
>> >>
>> >> diff --git a/include/linux/prctl.h b/include/linux/prctl.h
>> >> index a3baeb2..44723ce 100644
>> >> --- a/include/linux/prctl.h
>> >> +++ b/include/linux/prctl.h
>> >> @@ -64,6 +64,11 @@
>> >> #define PR_GET_SECCOMP 21
>> >> #define PR_SET_SECCOMP 22
>> >>
>> >> +/* Get/set process seccomp filters */
>> >> +#define PR_GET_SECCOMP_FILTER 35
>> >> +#define PR_SET_SECCOMP_FILTER 36
>> >> +#define PR_CLEAR_SECCOMP_FILTER 37
>> >> +
>> >> /* Get/set the capability bounding set (as per security/commoncap.c) */
>> >> #define PR_CAPBSET_READ 23
>> >> #define PR_CAPBSET_DROP 24
>> >> diff --git a/include/linux/sched.h b/include/linux/sched.h
>> >> index 18d63ce..3f0bc8d 100644
>> >> --- a/include/linux/sched.h
>> >> +++ b/include/linux/sched.h
>> >> @@ -1374,7 +1374,7 @@ struct task_struct {
>> >> uid_t loginuid;
>> >> unsigned int sessionid;
>> >> #endif
>> >> - seccomp_t seccomp;
>> >> + struct seccomp_struct seccomp;
>> >>
>> >> /* Thread group tracking */
>> >> u32 parent_exec_id;
>> >> diff --git a/include/linux/seccomp.h b/include/linux/seccomp.h
>> >> index 167c333..f4434ca 100644
>> >> --- a/include/linux/seccomp.h
>> >> +++ b/include/linux/seccomp.h
>> >> @@ -1,13 +1,33 @@
>> >> #ifndef _LINUX_SECCOMP_H
>> >> #define _LINUX_SECCOMP_H
>> >>
>> >> +struct seq_file;
>> >>
>> >> #ifdef CONFIG_SECCOMP
>> >>
>> >> +#include <linux/errno.h>
>> >> #include <linux/thread_info.h>
>> >> +#include <linux/types.h>
>> >> #include <asm/seccomp.h>
>> >>
>> >> -typedef struct { int mode; } seccomp_t;
>> >> +struct seccomp_filters;
>> >> +/**
>> >> + * struct seccomp_struct - the state of a seccomp'ed process
>> >> + *
>> >> + * @mode:
>> >> + * if this is 1, the process is under standard seccomp rules
>> >> + * is 2, the process is only allowed to make system calls where
>> >> + * associated filters evaluate successfully.
>> >> + * @filters: Metadata for filters if using CONFIG_SECCOMP_FILTER.
>> >> + * filters assignment/use should be RCU-protected and its contents
>> >> + * should never be modified when attached to a seccomp_struct.
>> >> + */
>> >> +struct seccomp_struct {
>> >> + uint16_t mode;
>> >> +#ifdef CONFIG_SECCOMP_FILTER
>> >> + struct seccomp_filters *filters;
>> >> +#endif
>> >> +};
>> >>
>> >> extern void __secure_computing(int);
>> >> static inline void secure_computing(int this_syscall)
>> >> @@ -16,15 +36,14 @@ static inline void secure_computing(int this_syscall)
>> >> __secure_computing(this_syscall);
>> >> }
>> >>
>> >> -extern long prctl_get_seccomp(void);
>> >> extern long prctl_set_seccomp(unsigned long);
>> >> +extern long prctl_get_seccomp(void);
>> >>
>> >> #else /* CONFIG_SECCOMP */
>> >>
>> >> #include <linux/errno.h>
>> >>
>> >> -typedef struct { } seccomp_t;
>> >> -
>> >> +struct seccomp_struct { };
>> >> #define secure_computing(x) do { } while (0)
>> >>
>> >> static inline long prctl_get_seccomp(void)
>> >> @@ -32,11 +51,80 @@ static inline long prctl_get_seccomp(void)
>> >> return -EINVAL;
>> >> }
>> >>
>> >> -static inline long prctl_set_seccomp(unsigned long arg2)
>> >> +static inline long prctl_set_seccomp(unsigned long a2)
>> >> {
>> >> return -EINVAL;
>> >> }
>> >>
>> >> #endif /* CONFIG_SECCOMP */
>> >>
>> >> +#ifdef CONFIG_SECCOMP_FILTER
>> >> +
>> >> +#define inherit_tsk_seccomp(_child, _orig) do { \
>> >> + _child->seccomp.mode = _orig->seccomp.mode; \
>> >> + _child->seccomp.filters = get_seccomp_filters(_orig->seccomp.filters); \
>> >> + } while (0)
>> >> +#define put_tsk_seccomp(_tsk) put_seccomp_filters(_tsk->seccomp.filters)
>> >> +
>> >> +extern int seccomp_show_filters(struct seccomp_filters *filters,
>> >> + struct seq_file *);
>> >> +extern long seccomp_set_filter(int, char *);
>> >> +extern long seccomp_clear_filter(int);
>> >> +extern long seccomp_get_filter(int, char *, unsigned long);
>> >> +
>> >> +extern long prctl_set_seccomp_filter(unsigned long, char __user *);
>> >> +extern long prctl_get_seccomp_filter(unsigned long, char __user *,
>> >> + unsigned long);
>> >> +extern long prctl_clear_seccomp_filter(unsigned long);
>> >> +
>> >> +extern struct seccomp_filters *get_seccomp_filters(struct seccomp_filters *);
>> >> +extern void put_seccomp_filters(struct seccomp_filters *);
>> >> +
>> >> +extern int seccomp_test_filters(int);
>> >> +extern void seccomp_filter_log_failure(int);
>> >> +
>> >> +#else /* CONFIG_SECCOMP_FILTER */
>> >> +
>> >> +struct seccomp_filters { };
>> >> +#define inherit_tsk_seccomp(_child, _orig) do { } while (0)
>> >> +#define put_tsk_seccomp(_tsk) do { } while (0)
>> >> +
>> >> +static inline int seccomp_show_filters(struct seccomp_filters *filters,
>> >> + struct seq_file *m)
>> >> +{
>> >> + return -ENOSYS;
>> >> +}
>> >> +
>> >> +static inline long seccomp_set_filter(int syscall_nr, char *filter)
>> >> +{
>> >> + return -ENOSYS;
>> >> +}
>> >> +
>> >> +static inline long seccomp_clear_filter(int syscall_nr)
>> >> +{
>> >> + return -ENOSYS;
>> >> +}
>> >> +
>> >> +static inline long seccomp_get_filter(int syscall_nr,
>> >> + char *buf, unsigned long available)
>> >> +{
>> >> + return -ENOSYS;
>> >> +}
>> >> +
>> >> +static inline long prctl_set_seccomp_filter(unsigned long a2, char __user *a3)
>> >> +{
>> >> + return -ENOSYS;
>> >> +}
>> >> +
>> >> +static inline long prctl_clear_seccomp_filter(unsigned long a2)
>> >> +{
>> >> + return -ENOSYS;
>> >> +}
>> >> +
>> >> +static inline long prctl_get_seccomp_filter(unsigned long a2, char __user *a3,
>> >> + unsigned long a4)
>> >> +{
>> >> + return -ENOSYS;
>> >> +}
>> >> +#endif /* CONFIG_SECCOMP_FILTER */
>> >> #endif /* _LINUX_SECCOMP_H */
>> >> diff --git a/include/trace/syscall.h b/include/trace/syscall.h
>> >> index 242ae04..e061ad0 100644
>> >> --- a/include/trace/syscall.h
>> >> +++ b/include/trace/syscall.h
>> >> @@ -35,6 +35,8 @@ struct syscall_metadata {
>> >> extern unsigned long arch_syscall_addr(int nr);
>> >> extern int init_syscall_trace(struct ftrace_event_call *call);
>> >>
>> >> +extern struct syscall_metadata *syscall_nr_to_meta(int);
>> >> +
>> >> extern int reg_event_syscall_enter(struct ftrace_event_call *call);
>> >> extern void unreg_event_syscall_enter(struct ftrace_event_call *call);
>> >> extern int reg_event_syscall_exit(struct ftrace_event_call *call);
>> >> @@ -49,6 +51,11 @@ enum print_line_t print_syscall_enter(struct trace_iterator *iter, int flags,
>> >> struct trace_event *event);
>> >> enum print_line_t print_syscall_exit(struct trace_iterator *iter, int flags,
>> >> struct trace_event *event);
>> >> +#else
>> >> +static inline struct syscall_metadata *syscall_nr_to_meta(int nr)
>> >> +{
>> >> + return NULL;
>> >> +}
>> >> #endif
>> >>
>> >> #ifdef CONFIG_PERF_EVENTS
>> >> diff --git a/kernel/Makefile b/kernel/Makefile
>> >> index 85cbfb3..84e7dfb 100644
>> >> --- a/kernel/Makefile
>> >> +++ b/kernel/Makefile
>> >> @@ -81,6 +81,9 @@ obj-$(CONFIG_DETECT_HUNG_TASK) += hung_task.o
>> >> obj-$(CONFIG_LOCKUP_DETECTOR) += watchdog.o
>> >> obj-$(CONFIG_GENERIC_HARDIRQS) += irq/
>> >> obj-$(CONFIG_SECCOMP) += seccomp.o
>> >> +ifeq ($(CONFIG_SECCOMP_FILTER),y)
>> >> +obj-$(CONFIG_SECCOMP) += seccomp_filter.o
>> >> +endif
>> >> obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o
>> >> obj-$(CONFIG_TREE_RCU) += rcutree.o
>> >> obj-$(CONFIG_TREE_PREEMPT_RCU) += rcutree.o
>> >> diff --git a/kernel/fork.c b/kernel/fork.c
>> >> index e7548de..6f835e0 100644
>> >> --- a/kernel/fork.c
>> >> +++ b/kernel/fork.c
>> >> @@ -34,6 +34,7 @@
>> >> #include <linux/cgroup.h>
>> >> #include <linux/security.h>
>> >> #include <linux/hugetlb.h>
>> >> +#include <linux/seccomp.h>
>> >> #include <linux/swap.h>
>> >> #include <linux/syscalls.h>
>> >> #include <linux/jiffies.h>
>> >> @@ -169,6 +170,7 @@ void free_task(struct task_struct *tsk)
>> >> free_thread_info(tsk->stack);
>> >> rt_mutex_debug_task_free(tsk);
>> >> ftrace_graph_exit_task(tsk);
>> >> + put_tsk_seccomp(tsk);
>> >> free_task_struct(tsk);
>> >> }
>> >> EXPORT_SYMBOL(free_task);
>> >> @@ -280,6 +282,7 @@ static struct task_struct *dup_task_struct(struct task_struct *orig)
>> >> if (err)
>> >> goto out;
>> >>
>> >> + inherit_tsk_seccomp(tsk, orig);
>> >> setup_thread_stack(tsk, orig);
>> >> clear_user_return_notifier(tsk);
>> >> clear_tsk_need_resched(tsk);
>> >> diff --git a/kernel/seccomp.c b/kernel/seccomp.c
>> >> index 57d4b13..0a942be 100644
>> >> --- a/kernel/seccomp.c
>> >> +++ b/kernel/seccomp.c
>> >> @@ -2,16 +2,20 @@
>> >> * linux/kernel/seccomp.c
>> >> *
>> >> * Copyright 2004-2005 Andrea Arcangeli <andrea@cpushare.com>
>> >> + * Copyright (C) 2011 The Chromium OS Authors <chromium-os-dev@chromium.org>
>> >> *
>> >> * This defines a simple but solid secure-computing mode.
>> >> */
>> >>
>> >> #include <linux/seccomp.h>
>> >> #include <linux/sched.h>
>> >> +#include <linux/slab.h>
>> >> #include <linux/compat.h>
>> >> +#include <linux/unistd.h>
>> >> +#include <linux/ftrace_event.h>
>> >>
>> >> +#define SECCOMP_MAX_FILTER_LENGTH MAX_FILTER_STR_VAL
>> >> /* #define SECCOMP_DEBUG 1 */
>> >> -#define NR_SECCOMP_MODES 1
>> >>
>> >> /*
>> >> * Secure computing mode 1 allows only read/write/exit/sigreturn.
>> >> @@ -32,10 +36,9 @@ static int mode1_syscalls_32[] = {
>> >>
>> >> void __secure_computing(int this_syscall)
>> >> {
>> >> - int mode = current->seccomp.mode;
>> >> int * syscall;
>> >>
>> >> - switch (mode) {
>> >> + switch (current->seccomp.mode) {
>> >> case 1:
>> >> syscall = mode1_syscalls;
>> >> #ifdef CONFIG_COMPAT
>> >> @@ -47,6 +50,17 @@ void __secure_computing(int this_syscall)
>> >> return;
>> >> } while (*++syscall);
>> >> break;
>> >> +#ifdef CONFIG_SECCOMP_FILTER
>> >> + case 2:
>> >> + if (this_syscall >= NR_syscalls || this_syscall < 0)
>> >> + break;
>> >> +
>> >> + if (!seccomp_test_filters(this_syscall))
>> >> + return;
>> >> +
>> >> + seccomp_filter_log_failure(this_syscall);
>> >> + break;
>> >> +#endif
>> >> default:
>> >> BUG();
>> >> }
>> >> @@ -71,16 +85,22 @@ long prctl_set_seccomp(unsigned long seccomp_mode)
>> >> if (unlikely(current->seccomp.mode))
>> >> goto out;
>> >>
>> >> - ret = -EINVAL;
>> >> - if (seccomp_mode && seccomp_mode <= NR_SECCOMP_MODES) {
>> >> - current->seccomp.mode = seccomp_mode;
>> >> - set_thread_flag(TIF_SECCOMP);
>> >> + ret = 0;
>> >> + switch (seccomp_mode) {
>> >> + case 1:
>> >> #ifdef TIF_NOTSC
>> >> disable_TSC();
>> >> #endif
>> >> - ret = 0;
>> >> +#ifdef CONFIG_SECCOMP_FILTER
>> >> + case 2:
>> >> +#endif
>> >> + current->seccomp.mode = seccomp_mode;
>> >> + set_thread_flag(TIF_SECCOMP);
>> >> + break;
>> >> + default:
>> >> + ret = -EINVAL;
>> >> }
>> >>
>> >> - out:
>> >> +out:
>> >> return ret;
>> >> }
>> >> diff --git a/kernel/seccomp_filter.c b/kernel/seccomp_filter.c
>> >> new file mode 100644
>> >> index 0000000..9782f25
>> >> --- /dev/null
>> >> +++ b/kernel/seccomp_filter.c
>> >> @@ -0,0 +1,784 @@
>> >> +/* filter engine-based seccomp system call filtering
>> >> + *
>> >> + * This program is free software; you can redistribute it and/or modify
>> >> + * it under the terms of the GNU General Public License as published by
>> >> + * the Free Software Foundation; either version 2 of the License, or
>> >> + * (at your option) any later version.
>> >> + *
>> >> + * This program is distributed in the hope that it will be useful,
>> >> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
>> >> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
>> >> + * GNU General Public License for more details.
>> >> + *
>> >> + * You should have received a copy of the GNU General Public License
>> >> + * along with this program; if not, write to the Free Software
>> >> + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
>> >> + *
>> >> + * Copyright (C) 2011 The Chromium OS Authors <chromium-os-dev@chromium.org>
>> >> + */
>> >> +
>> >> +#include <linux/compat.h>
>> >> +#include <linux/err.h>
>> >> +#include <linux/errno.h>
>> >> +#include <linux/ftrace_event.h>
>> >> +#include <linux/seccomp.h>
>> >> +#include <linux/seq_file.h>
>> >> +#include <linux/sched.h>
>> >> +#include <linux/slab.h>
>> >> +#include <linux/uaccess.h>
>> >> +
>> >> +#include <asm/syscall.h>
>> >> +#include <trace/syscall.h>
>> >> +
>> >> +
>> >> +#define SECCOMP_MAX_FILTER_LENGTH MAX_FILTER_STR_VAL
>> >> +
>> >> +#define SECCOMP_FILTER_ALLOW "1"
>> >> +#define SECCOMP_ACTION_DENY 0xffff
>> >> +#define SECCOMP_ACTION_ALLOW 0xfffe
>> >> +
>> >> +/**
>> >> + * struct seccomp_filters - container for seccomp filterset
>> >> + *
>> >> + * @syscalls: array of 16-bit indices into @event_filters by syscall_nr
>> >> + * May also be SECCOMP_ACTION_DENY or SECCOMP_ACTION_ALLOW
>> >> + * @event_filters: array of pointers to ftrace event objects
>> >> + * @count: size of @event_filters
>> >> + * @flags: anonymous struct to wrap filters-specific flags
>> >> + * @usage: reference count to simplify use.
>> >> + */
>> >> +struct seccomp_filters {
>> >> + uint16_t syscalls[NR_syscalls];
>> >> + struct event_filter **event_filters;
>> >> + uint16_t count;
>> >> + struct {
>> >> + uint32_t compat:1,
>> >> + __reserved:31;
>> >> + } flags;
>> >> + atomic_t usage;
>> >> +};
>> >> +
>> >> +/* Handle ftrace symbol non-existence */
>> >> +#ifdef CONFIG_FTRACE_SYSCALLS
>> >> +#define create_event_filter(_ef_pptr, _event_type, _str) \
>> >> + ftrace_parse_filter(_ef_pptr, _event_type, _str)
>> >> +#define get_filter_string(_ef) ftrace_get_filter_string(_ef)
>> >> +#define free_event_filter(_f) ftrace_free_filter(_f)
>> >> +
>> >> +#else
>> >> +
>> >> +#define create_event_filter(_ef_pptr, _event_type, _str) (-ENOSYS)
>> >> +#define get_filter_string(_ef) (NULL)
>> >> +#define free_event_filter(_f) do { } while (0)
>> >> +#endif
>> >> +
>> >> +/**
>> >> + * seccomp_filters_new - allocates a new filters object
>> >> + * @count: count to allocate for the event_filters array
>> >> + *
>> >> + * Returns ERR_PTR on error or an allocated object.
>> >> + */
>> >> +static struct seccomp_filters *seccomp_filters_new(uint16_t count)
>> >> +{
>> >> + struct seccomp_filters *f;
>> >> +
>> >> + if (count >= SECCOMP_ACTION_ALLOW)
>> >> + return ERR_PTR(-EINVAL);
>> >> +
>> >> + f = kzalloc(sizeof(struct seccomp_filters), GFP_KERNEL);
>> >> + if (!f)
>> >> + return ERR_PTR(-ENOMEM);
>> >> +
>> >> + /* Lazy SECCOMP_ACTION_DENY assignment. */
>> >> + memset(f->syscalls, 0xff, sizeof(f->syscalls));
>> >> + atomic_set(&f->usage, 1);
>> >> +
>> >> + f->event_filters = NULL;
>> >> + f->count = count;
>> >> + if (!count)
>> >> + return f;
>> >> +
>> >> + f->event_filters = kzalloc(count * sizeof(struct event_filter *),
>> >> + GFP_KERNEL);
>> >> + if (!f->event_filters) {
>> >> + kfree(f);
>> >> + f = ERR_PTR(-ENOMEM);
>> >> + }
>> >> + return f;
>> >> +}
>> >> +
>> >> +/**
>> >> + * seccomp_filters_free - cleans up the filter list and frees the table
>> >> + * @filters: NULL or live object to be completely destructed.
>> >> + */
>> >> +static void seccomp_filters_free(struct seccomp_filters *filters)
>> >> +{
>> >> + uint16_t count = 0;
>> >> + if (!filters)
>> >> + return;
>> >> + while (count < filters->count) {
>> >> + struct event_filter *f = filters->event_filters[count];
>> >> + free_event_filter(f);
>> >> + count++;
>> >> + }
>> >> + kfree(filters->event_filters);
>> >> + kfree(filters);
>> >> +}
>> >> +
>> >> +static void __put_seccomp_filters(struct seccomp_filters *orig)
>> >> +{
>> >> + WARN_ON(atomic_read(&orig->usage));
>> >> + seccomp_filters_free(orig);
>> >> +}
>> >> +
>> >> +#define seccomp_filter_allow(_id) ((_id) == SECCOMP_ACTION_ALLOW)
>> >> +#define seccomp_filter_deny(_id) ((_id) == SECCOMP_ACTION_DENY)
>> >> +#define seccomp_filter_dynamic(_id) \
>> >> + (!seccomp_filter_allow(_id) && !seccomp_filter_deny(_id))
>> >> +static inline uint16_t seccomp_filter_id(const struct seccomp_filters *f,
>> >> + int syscall_nr)
>> >> +{
>> >> + if (!f)
>> >> + return SECCOMP_ACTION_DENY;
>> >> + return f->syscalls[syscall_nr];
>> >> +}
>> >> +
>> >> +static inline struct event_filter *seccomp_dynamic_filter(
>> >> + const struct seccomp_filters *filters, uint16_t id)
>> >> +{
>> >> + if (!seccomp_filter_dynamic(id))
>> >> + return NULL;
>> >> + return filters->event_filters[id];
>> >> +}
>> >> +
>> >> +static inline void set_seccomp_filter_id(struct seccomp_filters *filters,
>> >> + int syscall_nr, uint16_t id)
>> >> +{
>> >> + filters->syscalls[syscall_nr] = id;
>> >> +}
>> >> +
>> >> +static inline void set_seccomp_filter(struct seccomp_filters *filters,
>> >> + int syscall_nr, uint16_t id,
>> >> + struct event_filter *dynamic_filter)
>> >> +{
>> >> + filters->syscalls[syscall_nr] = id;
>> >> + if (seccomp_filter_dynamic(id))
>> >> + filters->event_filters[id] = dynamic_filter;
>> >> +}
>> >> +
>> >> +static struct event_filter *alloc_event_filter(int syscall_nr,
>> >> + const char *filter_string)
>> >> +{
>> >> + struct syscall_metadata *data;
>> >> + struct event_filter *filter = NULL;
>> >> + int err;
>> >> +
>> >> + data = syscall_nr_to_meta(syscall_nr);
>> >> + /* Argument-based filtering only works on ftrace-hooked syscalls. */
>> >> + err = -ENOSYS;
>> >> + if (!data)
>> >> + goto fail;
>> >> + err = create_event_filter(&filter,
>> >> + data->enter_event->event.type,
>> >> + filter_string);
>> >> + if (err)
>> >> + goto fail;
>> >> +
>> >> + return filter;
>> >> +fail:
>> >> + kfree(filter);
>> >> + return ERR_PTR(err);
>> >> +}
>> >> +
>> >> +/**
>> >> + * seccomp_filters_copy - copies filters from src to dst.
>> >> + *
>> >> + * @dst: seccomp_filters to populate.
>> >> + * @src: table to read from.
>> >> + * @skip: specifies an entry, by system call, to skip.
>> >> + *
>> >> + * Returns non-zero on failure.
>> >> + * Both the source and the destination should have no simultaneous
>> >> + * writers, and dst should be exclusive to the caller.
>> >> + * If @skip is < 0, it is ignored.
>> >> + */
>> >> +static int seccomp_filters_copy(struct seccomp_filters *dst,
>> >> + const struct seccomp_filters *src,
>> >> + int skip)
>> >> +{
>> >> + int id = 0, ret = 0, nr;
>> >> + memcpy(&dst->flags, &src->flags, sizeof(src->flags));
>> >> + memcpy(dst->syscalls, src->syscalls, sizeof(dst->syscalls));
>> >> + if (!src->count)
>> >> + goto done;
>> >> + for (nr = 0; nr < NR_syscalls; ++nr) {
>> >> + struct event_filter *filter;
>> >> + const char *str;
>> >> + uint16_t src_id = seccomp_filter_id(src, nr);
>> >> + if (nr == skip) {
>> >> + set_seccomp_filter(dst, nr, SECCOMP_ACTION_DENY,
>> >> + NULL);
>> >> + continue;
>> >> + }
>> >> + if (!seccomp_filter_dynamic(src_id))
>> >> + continue;
>> >> + if (id >= dst->count) {
>> >> + ret = -EINVAL;
>> >> + goto done;
>> >> + }
>> >> + str = get_filter_string(seccomp_dynamic_filter(src, src_id));
>> >> + filter = alloc_event_filter(nr, str);
>> >> + if (IS_ERR(filter)) {
>> >> + ret = PTR_ERR(filter);
>> >> + goto done;
>> >> + }
>> >> + set_seccomp_filter(dst, nr, id, filter);
>> >> + id++;
>> >> + }
>> >> +
>> >> +done:
>> >> + return ret;
>> >> +}
>> >> +
>> >> +/**
>> >> + * seccomp_extend_filter - appends more text to a syscall_nr's filter
>> >> + * @filters: unattached filter object to operate on
>> >> + * @syscall_nr: syscall number to update filters for
>> >> + * @filter_string: string to append to the existing filter
>> >> + *
>> >> + * The new string will be &&'d to the original filter string to ensure that it
>> >> + * always matches the existing predicates or less:
>> >> + * (old_filter) && @filter_string
>> >> + * Returns 0 on success and a negative errno value on
>> >> + * failure.
>> >> + */
>> >> +static int seccomp_extend_filter(struct seccomp_filters *filters,
>> >> + int syscall_nr, char *filter_string)
>> >> +{
>> >> + struct event_filter *filter;
>> >> + uint16_t id = seccomp_filter_id(filters, syscall_nr);
>> >> + char *merged = NULL;
>> >> + int ret = -EINVAL, expected;
>> >> +
>> >> + /* No extending with a "1". */
>> >> + if (!strcmp(SECCOMP_FILTER_ALLOW, filter_string))
>> >> + goto out;
>> >> +
>> >> + filter = seccomp_dynamic_filter(filters, id);
>> >> + ret = -ENOENT;
>> >> + if (!filter)
>> >> + goto out;
>> >> +
>> >> + merged = kzalloc(SECCOMP_MAX_FILTER_LENGTH + 1, GFP_KERNEL);
>> >> + ret = -ENOMEM;
>> >> + if (!merged)
>> >> + goto out;
>> >> +
>> >> + expected = snprintf(merged, SECCOMP_MAX_FILTER_LENGTH, "(%s) && %s",
>> >> + get_filter_string(filter), filter_string);
>> >> + ret = -E2BIG;
>> >> + if (expected >= SECCOMP_MAX_FILTER_LENGTH || expected < 0)
>> >> + goto out;
>> >> +
>> >> + /* Free the old filter */
>> >> + free_event_filter(filter);
>> >> + set_seccomp_filter(filters, syscall_nr, id, NULL);
>> >> +
>> >> + /* Replace it */
>> >> + filter = alloc_event_filter(syscall_nr, merged);
>> >> + if (IS_ERR(filter)) {
>> >> + ret = PTR_ERR(filter);
>> >> + goto out;
>> >> + }
>> >> + set_seccomp_filter(filters, syscall_nr, id, filter);
>> >> + ret = 0;
>> >> +
>> >> +out:
>> >> + kfree(merged);
>> >> + return ret;
>> >> +}
>> >> +
>> >> +/**
>> >> + * seccomp_add_filter - adds a filter for an unfiltered syscall
>> >> + * @filters: filters object to add a filter/action to
>> >> + * @syscall_nr: system call number to add a filter for
>> >> + * @filter_string: the filter string to apply
>> >> + *
>> >> + * Returns 0 on success and non-zero otherwise.
>> >> + */
>> >> +static int seccomp_add_filter(struct seccomp_filters *filters, int syscall_nr,
>> >> + char *filter_string)
>> >> +{
>> >> + struct event_filter *filter;
>> >> + int ret = 0;
>> >> +
>> >> + if (!strcmp(SECCOMP_FILTER_ALLOW, filter_string)) {
>> >> + set_seccomp_filter(filters, syscall_nr,
>> >> + SECCOMP_ACTION_ALLOW, NULL);
>> >> + goto out;
>> >> + }
>> >> +
>> >> + filter = alloc_event_filter(syscall_nr, filter_string);
>> >> + if (IS_ERR(filter)) {
>> >> + ret = PTR_ERR(filter);
>> >> + goto out;
>> >> + }
>> >> + /* Always add to the last slot available since additions are
>> >> + * only done one at a time.
>> >> + */
>> >> + set_seccomp_filter(filters, syscall_nr, filters->count - 1, filter);
>> >> +out:
>> >> + return ret;
>> >> +}
>> >> +
>> >> +/* Wrap optional ftrace syscall support. Returns 1 on match or 0 otherwise. */
>> >> +static int filter_match_current(struct event_filter *event_filter)
>> >> +{
>> >> + int err = 0;
>> >> +#ifdef CONFIG_FTRACE_SYSCALLS
>> >> + uint8_t syscall_state[64];
>> >> +
>> >> + memset(syscall_state, 0, sizeof(syscall_state));
>> >> +
>> >> + /* The generic tracing entry can remain zeroed. */
>> >> + err = ftrace_syscall_enter_state(syscall_state, sizeof(syscall_state),
>> >> + NULL);
>> >> + if (err)
>> >> + return 0;
>> >> +
>> >> + err = filter_match_preds(event_filter, syscall_state);
>> >> +#endif
>> >> + return err;
>> >> +}
>> >> +
>> >> +static const char *syscall_nr_to_name(int syscall)
>> >> +{
>> >> + const char *syscall_name = "unknown";
>> >> + struct syscall_metadata *data = syscall_nr_to_meta(syscall);
>> >> + if (data)
>> >> + syscall_name = data->name;
>> >> + return syscall_name;
>> >> +}
>> >> +
>> >> +static void filters_set_compat(struct seccomp_filters *filters)
>> >> +{
>> >> +#ifdef CONFIG_COMPAT
>> >> + if (is_compat_task())
>> >> + filters->flags.compat = 1;
>> >> +#endif
>> >> +}
>> >> +
>> >> +static inline int filters_compat_mismatch(struct seccomp_filters *filters)
>> >> +{
>> >> + int ret = 0;
>> >> + if (!filters)
>> >> + return 0;
>> >> +#ifdef CONFIG_COMPAT
>> >> + if (!!(is_compat_task()) != filters->flags.compat)
>> >> + ret = 1;
>> >> +#endif
>> >> + return ret;
>> >> +}
>> >> +
>> >> +static inline int syscall_is_execve(int syscall)
>> >> +{
>> >> + int nr = __NR_execve;
>> >> +#ifdef CONFIG_COMPAT
>> >> + if (is_compat_task())
>> >> + nr = __NR_seccomp_execve_32;
>> >> +#endif
>> >> + return syscall == nr;
>> >> +}
>> >> +
>> >> +#ifndef KSTK_EIP
>> >> +#define KSTK_EIP(x) 0L
>> >> +#endif
>> >> +
>> >> +void seccomp_filter_log_failure(int syscall)
>> >> +{
>> >> + pr_info("%s[%d]: system call %d (%s) blocked at 0x%lx\n",
>> >> + current->comm, task_pid_nr(current), syscall,
>> >> + syscall_nr_to_name(syscall), KSTK_EIP(current));
>> >> +}
>> >> +
>> >> +/* put_seccomp_state - decrements the reference count of @orig and may free. */
>> >> +void put_seccomp_filters(struct seccomp_filters *orig)
>> >> +{
>> >> + if (!orig)
>> >> + return;
>> >> +
>> >> + if (atomic_dec_and_test(&orig->usage))
>> >> + __put_seccomp_filters(orig);
>> >> +}
>> >> +
>> >> +/* get_seccomp_state - increments the reference count of @orig */
>> >> +struct seccomp_filters *get_seccomp_filters(struct seccomp_filters *orig)
>> >
>> > Nit: the name does not match the comment.
>>
>> Will fix it here and above. Thanks!
>>
>> >> +{
>> >> + if (!orig)
>> >> + return NULL;
>> >> + atomic_inc(&orig->usage);
>> >> + return orig;
>> >
>> > This is called in an RCU read-side critical section. What exactly is
>> > RCU protecting? I would expect an rcu_dereference() or one of the
>> > RCU list-traversal primitives somewhere, either here or at the caller.
>>
>> Ah, I spaced on rcu_dereference(). The goal was to make the
>> assignment and replacement of the seccomp_filters pointer
>> RCU-protected (in seccomp_state) so there's no concern over it being
>> replaced partial on platforms where pointer assignments are non-atomic
>> - such as via /proc/<pid>/seccomp_filters access or a call via the
>> exported symbols. Object lifetime is managed by reference counting so
>> that I don't have to worry about extending the RCU read-side critical
>> section by much or deal with pre-allocations.
>>
>> I'll add rcu_dereference() to all the get_seccomp_filters() uses where
>> it makes sense, so that it is called safely. Just to make sure, does
>> it make sense to continue to rcu protect the specific pointer?
>
> It might. The usual other options is to use a lock outside of the element
> containing the reference count to protect reference-count manipulation.
> If there is some convenient lock, especially if it is already held where
> needed, then locking is more straightforward. Otherwise, RCU is usually
> a reasonable option.
I was concerned about the overhead a lock would have at each system
call entry, but I didn't benchmark it to see. I'll add the
rcu_dereference right away, then look into seeing whether there's a
cleaner approach. I was trying to be overly protective of mutating
any data internal to the filters through complete replacement on any
change. I'll take a step back and see if
>> >> +}
>> >> +
>> >> +/**
>> >> + * seccomp_test_filters - tests 'current' against the given syscall
>> >> + * @state: seccomp_state of current to use.
>> >> + * @syscall: number of the system call to test
>> >> + *
>> >> + * Returns 0 on ok and non-zero on error/failure.
>> >> + */
>> >> +int seccomp_test_filters(int syscall)
>> >> +{
>> >> + uint16_t id;
>> >> + struct event_filter *filter;
>> >> + struct seccomp_filters *filters;
>> >> + int ret = -EACCES;
>> >> +
>> >> + rcu_read_lock();
>> >> + filters = get_seccomp_filters(current->seccomp.filters);
>> >> + rcu_read_unlock();
>> >> +
>> >> + if (!filters)
>> >> + goto out;
>> >> +
>> >> + if (filters_compat_mismatch(filters)) {
>> >> + pr_info("%s[%d]: seccomp_filter compat() mismatch.\n",
>> >> + current->comm, task_pid_nr(current));
>> >> + goto out;
>> >> + }
>> >> +
>> >> + /* execve is never allowed. */
>> >> + if (syscall_is_execve(syscall))
>> >> + goto out;
>> >> +
>> >> + ret = 0;
>> >> + id = seccomp_filter_id(filters, syscall);
>> >> + if (seccomp_filter_allow(id))
>> >> + goto out;
>> >> +
>> >> + ret = -EACCES;
>> >> + if (!seccomp_filter_dynamic(id))
>> >> + goto out;
>> >> +
>> >> + filter = seccomp_dynamic_filter(filters, id);
>> >> + if (filter && filter_match_current(filter))
>> >> + ret = 0;
>> >> +out:
>> >> + put_seccomp_filters(filters);
>> >> + return ret;
>> >> +}
>> >> +
>> >> +/**
>> >> + * seccomp_show_filters - prints the current filter state to a seq_file
>> >> + * @filters: properly get()'d filters object
>> >> + * @m: the prepared seq_file to receive the data
>> >> + *
>> >> + * Returns 0 on a successful write.
>> >> + */
>> >> +int seccomp_show_filters(struct seccomp_filters *filters, struct seq_file *m)
>> >> +{
>> >> + int syscall;
>> >> + seq_printf(m, "Mode: %d\n", current->seccomp.mode);
>> >> + if (!filters)
>> >> + goto out;
>> >> +
>> >> + for (syscall = 0; syscall < NR_syscalls; ++syscall) {
>> >> + uint16_t id = seccomp_filter_id(filters, syscall);
>> >> + const char *filter_string = SECCOMP_FILTER_ALLOW;
>> >> + if (seccomp_filter_deny(id))
>> >> + continue;
>> >> + seq_printf(m, "%d (%s): ",
>> >> + syscall,
>> >> + syscall_nr_to_name(syscall));
>> >> + if (seccomp_filter_dynamic(id))
>> >> + filter_string = get_filter_string(
>> >> + seccomp_dynamic_filter(filters, id));
>> >> + seq_printf(m, "%s\n", filter_string);
>> >> + }
>> >> +out:
>> >> + return 0;
>> >> +}
>> >> +EXPORT_SYMBOL_GPL(seccomp_show_filters);
>> >> +
>> >> +/**
>> >> + * seccomp_get_filter - copies the filter_string into "buf"
>> >> + * @syscall_nr: system call number to look up
>> >> + * @buf: destination buffer
>> >> + * @bufsize: available space in the buffer.
>> >> + *
>> >> + * Context: User context only. This function may sleep on allocation and
>> >> + * operates on current. current must be attempting a system call
>> >> + * when this is called.
>> >> + *
>> >> + * Looks up the filter for the given system call number on current. If found,
>> >> + * the string length of the NUL-terminated buffer is returned and < 0 is
>> >> + * returned on error. The NUL byte is not included in the length.
>> >> + */
>> >> +long seccomp_get_filter(int syscall_nr, char *buf, unsigned long bufsize)
>> >> +{
>> >> + struct seccomp_filters *filters;
>> >> + struct event_filter *filter;
>> >> + long ret = -EINVAL;
>> >> + uint16_t id;
>> >> +
>> >> + if (bufsize > SECCOMP_MAX_FILTER_LENGTH)
>> >> + bufsize = SECCOMP_MAX_FILTER_LENGTH;
>> >> +
>> >> + rcu_read_lock();
>> >> + filters = get_seccomp_filters(current->seccomp.filters);
>> >> + rcu_read_unlock();
>> >> +
>> >> + if (!filters)
>> >> + goto out;
>> >> +
>> >> + ret = -ENOENT;
>> >> + id = seccomp_filter_id(filters, syscall_nr);
>> >> + if (seccomp_filter_deny(id))
>> >> + goto out;
>> >> +
>> >> + if (seccomp_filter_allow(id)) {
>> >> + ret = strlcpy(buf, SECCOMP_FILTER_ALLOW, bufsize);
>> >> + goto copied;
>> >> + }
>> >> +
>> >> + filter = seccomp_dynamic_filter(filters, id);
>> >> + if (!filter)
>> >> + goto out;
>> >> + ret = strlcpy(buf, get_filter_string(filter), bufsize);
>> >> +
>> >> +copied:
>> >> + if (ret >= bufsize) {
>> >> + ret = -ENOSPC;
>> >> + goto out;
>> >> + }
>> >> + /* Zero out any remaining buffer, just in case. */
>> >> + memset(buf + ret, 0, bufsize - ret);
>> >> +out:
>> >> + put_seccomp_filters(filters);
>> >> + return ret;
>> >> +}
>> >> +EXPORT_SYMBOL_GPL(seccomp_get_filter);
>> >> +
>> >> +/**
>> >> + * seccomp_clear_filter: clears the seccomp filter for a syscall.
>> >> + * @syscall_nr: the system call number to clear filters for.
>> >> + *
>> >> + * Context: User context only. This function may sleep on allocation and
>> >> + * operates on current. current must be attempting a system call
>> >> + * when this is called.
>> >> + *
>> >> + * Returns 0 on success.
>> >> + */
>> >> +long seccomp_clear_filter(int syscall_nr)
>> >> +{
>> >> + struct seccomp_filters *filters = NULL, *orig_filters;
>> >> + uint16_t id;
>> >> + int ret = -EINVAL;
>> >> +
>> >> + rcu_read_lock();
>> >> + orig_filters = get_seccomp_filters(current->seccomp.filters);
>> >> + rcu_read_unlock();
>> >> +
>> >> + if (!orig_filters)
>> >> + goto out;
>> >> +
>> >> + if (filters_compat_mismatch(orig_filters))
>> >> + goto out;
>> >> +
>> >> + id = seccomp_filter_id(orig_filters, syscall_nr);
>> >> + if (seccomp_filter_deny(id))
>> >> + goto out;
>> >> +
>> >> + /* Create a new filters object for the task */
>> >> + if (seccomp_filter_dynamic(id))
>> >> + filters = seccomp_filters_new(orig_filters->count - 1);
>> >> + else
>> >> + filters = seccomp_filters_new(orig_filters->count);
>> >> +
>> >> + if (IS_ERR(filters)) {
>> >> + ret = PTR_ERR(filters);
>> >> + goto out;
>> >> + }
>> >> +
>> >> + /* Copy, but drop the requested entry. */
>> >> + ret = seccomp_filters_copy(filters, orig_filters, syscall_nr);
>> >> + if (ret)
>> >> + goto out;
>> >> + get_seccomp_filters(filters); /* simplify the out: path */
>> >> +
>> >> + rcu_assign_pointer(current->seccomp.filters, filters);
>> >
>> > What prevents two copies of seccomp_clear_filter() from running
>> > concurrently?
>>
>> Nothing - the last one wins assignment, but the objects themselves
>> should be internally consistent to the parallel calls. If that's a
>> concern, a per-task writer mutex could be used just to ensure
>> simultaneous calls to clear and set are performed serially. Would
>> that make more sense?
>
> Here is the sequence of events that I am concerned about:
>
> o CPU 0 sets orig_filters to point to the current filters.
>
> o CPU 1 sets its local orig_filters to point to the current
> set of filters.
>
> o Both CPUs allocate new filters and use rcu_assign_pointer()
> to do the update. As you say, the last one wins, but it appears
> to me that the first one leaks memory.
>
> o Both CPUs free the object referenced by their orig_filters,
> which might or might not result in a double free, depending
> on exactly what happens below. (You might actually be OK,
> I didn't check -- leaking memory was enough for me to call
> attention to this.)
>
> So yes, please use some kind of mutual exclusion. Not sure what you
> mean by "per-task mutex", but whatever it is must prevent two different
> tasks from acting on the same set of filters at the same time. The
> thing that I call "per-task mutex" would -not- do that.
Ah nice. Yeah that would most definitely leak as there would be a
remaining increment for the task that isn't dropped.
Since those interfaces acquire the filter itself from
current->seccomp.filters, I was thinking a mutex in current, e.g.,
current->seccomp.write_mutex, would fit the bill to ensure that
pointer isn't accessed for replacement. I'll look at this and the rcu
usage to see if I'm taking the most logical approach. I pretty much
always get locking wrong in some way and perhaps I can simplify
further and get the correct guarantees. I really appreciate the keen
observation and explanation!
>> >> + synchronize_rcu();
>> >> + put_seccomp_filters(orig_filters); /* for the task */
>> >> +out:
>> >> + put_seccomp_filters(orig_filters); /* for the get */
>> >> + put_seccomp_filters(filters); /* for the extra get */
>> >> + return ret;
>> >> +}
>> >> +EXPORT_SYMBOL_GPL(seccomp_clear_filter);
>> >> +
>> >> +/**
>> >> + * seccomp_set_filter: - Adds/extends a seccomp filter for a syscall.
>> >> + * @syscall_nr: system call number to apply the filter to.
>> >> + * @filter: ftrace filter string to apply.
>> >> + *
>> >> + * Context: User context only. This function may sleep on allocation and
>> >> + * operates on current. current must be attempting a system call
>> >> + * when this is called.
>> >> + *
>> >> + * New filters may be added for system calls when the current task is
>> >> + * not in a secure computing mode (seccomp). Otherwise, existing filters may
>> >> + * be extended.
>> >> + *
>> >> + * Returns 0 on success or an errno on failure.
>> >> + */
>> >> +long seccomp_set_filter(int syscall_nr, char *filter)
>> >> +{
>> >> + struct seccomp_filters *filters = NULL, *orig_filters = NULL;
>> >> + uint16_t id;
>> >> + long ret = -EINVAL;
>> >> + uint16_t filters_needed;
>> >> +
>> >> + if (!filter)
>> >> + goto out;
>> >> +
>> >> + filter = strstrip(filter);
>> >> + /* Disallow empty strings. */
>> >> + if (filter[0] == 0)
>> >> + goto out;
>> >> +
>> >> + rcu_read_lock();
>> >> + orig_filters = get_seccomp_filters(current->seccomp.filters);
>> >> + rcu_read_unlock();
>> >> +
>> >> + /* After the first call, compatibility mode is selected permanently. */
>> >> + ret = -EACCES;
>> >> + if (filters_compat_mismatch(orig_filters))
>> >> + goto out;
>> >> +
>> >> + filters_needed = orig_filters ? orig_filters->count : 0;
>> >> + id = seccomp_filter_id(orig_filters, syscall_nr);
>> >> + if (seccomp_filter_deny(id)) {
>> >> + /* Don't allow DENYs to be changed when in a seccomp mode */
>> >> + ret = -EACCES;
>> >> + if (current->seccomp.mode)
>> >> + goto out;
>> >> + filters_needed++;
>> >> + }
>> >> +
>> >> + filters = seccomp_filters_new(filters_needed);
>> >> + if (IS_ERR(filters)) {
>> >> + ret = PTR_ERR(filters);
>> >> + goto out;
>> >> + }
>> >> +
>> >> + filters_set_compat(filters);
>> >> + if (orig_filters) {
>> >> + ret = seccomp_filters_copy(filters, orig_filters, -1);
>> >> + if (ret)
>> >> + goto out;
>> >> + }
>> >> +
>> >> + if (seccomp_filter_deny(id))
>> >> + ret = seccomp_add_filter(filters, syscall_nr, filter);
>> >> + else
>> >> + ret = seccomp_extend_filter(filters, syscall_nr, filter);
>> >> + if (ret)
>> >> + goto out;
>> >> + get_seccomp_filters(filters); /* simplify the error paths */
>> >> +
>> >> + rcu_assign_pointer(current->seccomp.filters, filters);
>> >
>> > Again, what prevents two copies of seccomp_set_filter() from running
>> > concurrently?
>>
>> Same deal - nothing, but I'd be happy to add a guard if it makes sense.
>>
>> Thanks!
>>
>> >> + synchronize_rcu();
>> >> + put_seccomp_filters(orig_filters); /* for the task */
>> >> +out:
>> >> + put_seccomp_filters(orig_filters); /* for the get */
>> >> + put_seccomp_filters(filters); /* for get or task, on err */
>> >> + return ret;
>> >> +}
>> >> +EXPORT_SYMBOL_GPL(seccomp_set_filter);
>> >> +
>> >> +long prctl_set_seccomp_filter(unsigned long syscall_nr,
>> >> + char __user *user_filter)
>> >> +{
>> >> + int nr;
>> >> + long ret;
>> >> + char *filter = NULL;
>> >> +
>> >> + ret = -EINVAL;
>> >> + if (syscall_nr >= NR_syscalls)
>> >> + goto out;
>> >> +
>> >> + ret = -EFAULT;
>> >> + if (!user_filter)
>> >> + goto out;
>> >> +
>> >> + filter = kzalloc(SECCOMP_MAX_FILTER_LENGTH + 1, GFP_KERNEL);
>> >> + ret = -ENOMEM;
>> >> + if (!filter)
>> >> + goto out;
>> >> +
>> >> + ret = -EFAULT;
>> >> + if (strncpy_from_user(filter, user_filter,
>> >> + SECCOMP_MAX_FILTER_LENGTH - 1) < 0)
>> >> + goto out;
>> >> +
>> >> + nr = (int) syscall_nr;
>> >> + ret = seccomp_set_filter(nr, filter);
>> >> +
>> >> +out:
>> >> + kfree(filter);
>> >> + return ret;
>> >> +}
>> >> +
>> >> +long prctl_clear_seccomp_filter(unsigned long syscall_nr)
>> >> +{
>> >> + int nr = -1;
>> >> + long ret;
>> >> +
>> >> + ret = -EINVAL;
>> >> + if (syscall_nr >= NR_syscalls)
>> >> + goto out;
>> >> +
>> >> + nr = (int) syscall_nr;
>> >> + ret = seccomp_clear_filter(nr);
>> >> +
>> >> +out:
>> >> + return ret;
>> >> +}
>> >> +
>> >> +long prctl_get_seccomp_filter(unsigned long syscall_nr, char __user *dst,
>> >> + unsigned long available)
>> >> +{
>> >> + int ret, nr;
>> >> + unsigned long copied;
>> >> + char *buf = NULL;
>> >> + ret = -EINVAL;
>> >> + if (!available)
>> >> + goto out;
>> >> + /* Ignore extra buffer space. */
>> >> + if (available > SECCOMP_MAX_FILTER_LENGTH)
>> >> + available = SECCOMP_MAX_FILTER_LENGTH;
>> >> +
>> >> + ret = -EINVAL;
>> >> + if (syscall_nr >= NR_syscalls)
>> >> + goto out;
>> >> + nr = (int) syscall_nr;
>> >> +
>> >> + ret = -ENOMEM;
>> >> + buf = kmalloc(available, GFP_KERNEL);
>> >> + if (!buf)
>> >> + goto out;
>> >> +
>> >> + ret = seccomp_get_filter(nr, buf, available);
>> >> + if (ret < 0)
>> >> + goto out;
>> >> +
>> >> + /* Include the NUL byte in the copy. */
>> >> + copied = copy_to_user(dst, buf, ret + 1);
>> >> + ret = -ENOSPC;
>> >> + if (copied)
>> >> + goto out;
>> >> + ret = 0;
>> >> +out:
>> >> + kfree(buf);
>> >> + return ret;
>> >> +}
>> >> diff --git a/kernel/sys.c b/kernel/sys.c
>> >> index af468ed..ed60d06 100644
>> >> --- a/kernel/sys.c
>> >> +++ b/kernel/sys.c
>> >> @@ -1698,13 +1698,24 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
>> >> case PR_SET_ENDIAN:
>> >> error = SET_ENDIAN(me, arg2);
>> >> break;
>> >> -
>> >> case PR_GET_SECCOMP:
>> >> error = prctl_get_seccomp();
>> >> break;
>> >> case PR_SET_SECCOMP:
>> >> error = prctl_set_seccomp(arg2);
>> >> break;
>> >> + case PR_SET_SECCOMP_FILTER:
>> >> + error = prctl_set_seccomp_filter(arg2,
>> >> + (char __user *) arg3);
>> >> + break;
>> >> + case PR_CLEAR_SECCOMP_FILTER:
>> >> + error = prctl_clear_seccomp_filter(arg2);
>> >> + break;
>> >> + case PR_GET_SECCOMP_FILTER:
>> >> + error = prctl_get_seccomp_filter(arg2,
>> >> + (char __user *) arg3,
>> >> + arg4);
>> >> + break;
>> >> case PR_GET_TSC:
>> >> error = GET_TSC_CTL(arg2);
>> >> break;
>> >> diff --git a/security/Kconfig b/security/Kconfig
>> >> index 95accd4..c76adf2 100644
>> >> --- a/security/Kconfig
>> >> +++ b/security/Kconfig
>> >> @@ -2,6 +2,10 @@
>> >> # Security configuration
>> >> #
>> >>
>> >> +# Make seccomp filter Kconfig switch below available
>> >> +config HAVE_SECCOMP_FILTER
>> >> + bool
>> >> +
>> >> menu "Security options"
>> >>
>> >> config KEYS
>> >> @@ -82,6 +86,19 @@ config SECURITY_DMESG_RESTRICT
>> >>
>> >> If you are unsure how to answer this question, answer N.
>> >>
>> >> +config SECCOMP_FILTER
>> >> + bool "Enable seccomp-based system call filtering"
>> >> + select SECCOMP
>> >> + depends on HAVE_SECCOMP_FILTER && EXPERIMENTAL
>> >> + help
>> >> + This kernel feature expands CONFIG_SECCOMP to allow computing
>> >> + in environments with reduced kernel access dictated by the
>> >> + application itself through prctl calls. If
>> >> + CONFIG_FTRACE_SYSCALLS is available, then system call
>> >> + argument-based filtering predicates may be used.
>> >> +
>> >> + See Documentation/prctl/seccomp_filter.txt for more detail.
>> >> +
>> >> config SECURITY
>> >> bool "Enable different security models"
>> >> depends on SYSFS
>> >> --
>> >> 1.7.0.4
>> >>
>> >> --
>> >> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
>> >> the body of a message to majordomo@vger.kernel.org
>> >> More majordomo info at http://vger.kernel.org/majordomo-info.html
>> >> Please read the FAQ at http://www.tux.org/lkml/
>> >
>> --
>> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
>> the body of a message to majordomo@vger.kernel.org
>> More majordomo info at http://vger.kernel.org/majordomo-info.html
>> Please read the FAQ at http://www.tux.org/lkml/
>
^ permalink raw reply [flat|nested] 91+ messages in thread
* Re: [PATCH v3 03/13] seccomp_filters: new mode with configurable syscall filters
2011-06-02 20:28 ` Will Drewry
@ 2011-06-02 20:46 ` Steven Rostedt
2011-06-02 21:12 ` Paul E. McKenney
0 siblings, 1 reply; 91+ messages in thread
From: Steven Rostedt @ 2011-06-02 20:46 UTC (permalink / raw)
To: Will Drewry
Cc: paulmck, linux-kernel, kees.cook, torvalds, tglx, mingo, jmorris,
Peter Zijlstra, Frederic Weisbecker, linux-security-module
On Thu, 2011-06-02 at 15:28 -0500, Will Drewry wrote:
[ Snipped 860 lines of non relevant text ]
Seriously guys, Please trim your replies. These last few messages were
ridiculous. I spent more than 30 seconds searching for what the email was
about. That's too much wasted time.
-- Steve
> >> Ah, I spaced on rcu_dereference(). The goal was to make the
> >> assignment and replacement of the seccomp_filters pointer
> >> RCU-protected (in seccomp_state) so there's no concern over it being
> >> replaced partial on platforms where pointer assignments are non-atomic
> >> - such as via /proc/<pid>/seccomp_filters access or a call via the
> >> exported symbols. Object lifetime is managed by reference counting so
> >> that I don't have to worry about extending the RCU read-side critical
> >> section by much or deal with pre-allocations.
> >>
> >> I'll add rcu_dereference() to all the get_seccomp_filters() uses where
> >> it makes sense, so that it is called safely. Just to make sure, does
> >> it make sense to continue to rcu protect the specific pointer?
> >
> > It might. The usual other options is to use a lock outside of the element
> > containing the reference count to protect reference-count manipulation.
> > If there is some convenient lock, especially if it is already held where
> > needed, then locking is more straightforward. Otherwise, RCU is usually
> > a reasonable option.
>
> I was concerned about the overhead a lock would have at each system
> call entry, but I didn't benchmark it to see. I'll add the
> rcu_dereference right away, then look into seeing whether there's a
> cleaner approach. I was trying to be overly protective of mutating
> any data internal to the filters through complete replacement on any
> change. I'll take a step back and see if
>
^ permalink raw reply [flat|nested] 91+ messages in thread
* Re: [PATCH v3 03/13] seccomp_filters: new mode with configurable syscall filters
2011-06-02 20:46 ` Steven Rostedt
@ 2011-06-02 21:12 ` Paul E. McKenney
0 siblings, 0 replies; 91+ messages in thread
From: Paul E. McKenney @ 2011-06-02 21:12 UTC (permalink / raw)
To: Steven Rostedt
Cc: Will Drewry, linux-kernel, kees.cook, torvalds, tglx, mingo,
jmorris, Peter Zijlstra, Frederic Weisbecker,
linux-security-module
On Thu, Jun 02, 2011 at 04:46:07PM -0400, Steven Rostedt wrote:
> On Thu, 2011-06-02 at 15:28 -0500, Will Drewry wrote:
>
> [ Snipped 860 lines of non relevant text ]
>
> Seriously guys, Please trim your replies. These last few messages were
> ridicules. I spent more than 30 seconds searching for what the email was
> about. That's too much wasted time.
Because every time I do trim the messages, I get a response from the
reviewee of the form "Oh, I take care of that in function foo()."
And of course function foo() will be in the part I trimmed. So I then
have to find the earlier message, copy the function back in, and by
that time something else has distracted me.
Thanx, Paul
> -- Steve
>
>
> > >> Ah, I spaced on rcu_dereference(). The goal was to make the
> > >> assignment and replacement of the seccomp_filters pointer
> > >> RCU-protected (in seccomp_state) so there's no concern over it being
> > >> replaced partial on platforms where pointer assignments are non-atomic
> > >> - such as via /proc/<pid>/seccomp_filters access or a call via the
> > >> exported symbols. Object lifetime is managed by reference counting so
> > >> that I don't have to worry about extending the RCU read-side critical
> > >> section by much or deal with pre-allocations.
> > >>
> > >> I'll add rcu_dereference() to all the get_seccomp_filters() uses where
> > >> it makes sense, so that it is called safely. Just to make sure, does
> > >> it make sense to continue to rcu protect the specific pointer?
> > >
> > > It might. The usual other options is to use a lock outside of the element
> > > containing the reference count to protect reference-count manipulation.
> > > If there is some convenient lock, especially if it is already held where
> > > needed, then locking is more straightforward. Otherwise, RCU is usually
> > > a reasonable option.
> >
> > I was concerned about the overhead a lock would have at each system
> > call entry, but I didn't benchmark it to see. I'll add the
> > rcu_dereference right away, then look into seeing whether there's a
> > cleaner approach. I was trying to be overly protective of mutating
> > any data internal to the filters through complete replacement on any
> > change. I'll take a step back and see if
> >
>
>
> --
> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at http://vger.kernel.org/majordomo-info.html
> Please read the FAQ at http://www.tux.org/lkml/
^ permalink raw reply [flat|nested] 91+ messages in thread
* [PATCH v3 04/13] seccomp_filter: add process state reporting
2011-05-26 18:49 ` Will Drewry
` (2 preceding siblings ...)
2011-06-01 3:10 ` [PATCH v3 03/13] seccomp_filters: new mode with configurable syscall filters Will Drewry
@ 2011-06-01 3:10 ` Will Drewry
2011-06-01 3:10 ` [PATCH v3 05/13] seccomp_filter: Document what seccomp_filter is and how it works Will Drewry
` (8 subsequent siblings)
12 siblings, 0 replies; 91+ messages in thread
From: Will Drewry @ 2011-06-01 3:10 UTC (permalink / raw)
To: linux-kernel
Cc: kees.cook, torvalds, tglx, mingo, rostedt, jmorris, Will Drewry
Adds seccomp and seccomp_filter status reporting to proc.
/proc/<pid>/seccomp_filter provides the current seccomp mode
and the list of allowed or dynamically filtered system calls.
v3: changed to using filters directly.
v2: removed status entry, added seccomp file.
(requested by kosaki.motohiro@jp.fujitsu.com)
allowed S_IRUGO reading of entries
(requested by viro@zeniv.linux.org.uk)
added flags
got rid of the seccomp_t type
dropped seccomp file
Signed-off-by: Will Drewry <wad@chromium.org>
---
fs/proc/base.c | 25 +++++++++++++++++++++++++
1 files changed, 25 insertions(+), 0 deletions(-)
diff --git a/fs/proc/base.c b/fs/proc/base.c
index dfa5327..01473fe 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -73,6 +73,7 @@
#include <linux/security.h>
#include <linux/ptrace.h>
#include <linux/tracehook.h>
+#include <linux/seccomp.h>
#include <linux/cgroup.h>
#include <linux/cpuset.h>
#include <linux/audit.h>
@@ -579,6 +580,24 @@ static int proc_pid_syscall(struct task_struct *task, char *buffer)
}
#endif /* CONFIG_HAVE_ARCH_TRACEHOOK */
+/*
+ * Print out the current seccomp filter set for the task.
+ */
+#ifdef CONFIG_SECCOMP_FILTER
+int proc_pid_seccomp_filter_show(struct seq_file *m, struct pid_namespace *ns,
+ struct pid *pid, struct task_struct *task)
+{
+ struct seccomp_filters *filters;
+
+ rcu_read_lock();
+ filters = get_seccomp_filters(task->seccomp.filters);
+ rcu_read_unlock();
+ seccomp_show_filters(filters, m);
+ put_seccomp_filters(filters);
+ return 0;
+}
+#endif /* CONFIG_SECCOMP_FILTER */
+
/************************************************************************/
/* Here the fs part begins */
/************************************************************************/
@@ -2838,6 +2857,9 @@ static const struct pid_entry tgid_base_stuff[] = {
#ifdef CONFIG_HAVE_ARCH_TRACEHOOK
INF("syscall", S_IRUGO, proc_pid_syscall),
#endif
+#ifdef CONFIG_SECCOMP_FILTER
+ ONE("seccomp_filter", S_IRUGO, proc_pid_seccomp_filter_show),
+#endif
INF("cmdline", S_IRUGO, proc_pid_cmdline),
ONE("stat", S_IRUGO, proc_tgid_stat),
ONE("statm", S_IRUGO, proc_pid_statm),
@@ -3180,6 +3202,9 @@ static const struct pid_entry tid_base_stuff[] = {
#ifdef CONFIG_HAVE_ARCH_TRACEHOOK
INF("syscall", S_IRUGO, proc_pid_syscall),
#endif
+#ifdef CONFIG_SECCOMP_FILTER
+ ONE("seccomp_filter", S_IRUGO, proc_pid_seccomp_filter_show),
+#endif
INF("cmdline", S_IRUGO, proc_pid_cmdline),
ONE("stat", S_IRUGO, proc_tid_stat),
ONE("statm", S_IRUGO, proc_pid_statm),
--
1.7.0.4
^ permalink raw reply related [flat|nested] 91+ messages in thread
* [PATCH v3 05/13] seccomp_filter: Document what seccomp_filter is and how it works.
2011-05-26 18:49 ` Will Drewry
` (3 preceding siblings ...)
2011-06-01 3:10 ` [PATCH v3 04/13] seccomp_filter: add process state reporting Will Drewry
@ 2011-06-01 3:10 ` Will Drewry
2011-06-01 21:23 ` Kees Cook
2011-06-01 3:10 ` [PATCH v3 06/13] x86: add HAVE_SECCOMP_FILTER and seccomp_execve Will Drewry
` (7 subsequent siblings)
12 siblings, 1 reply; 91+ messages in thread
From: Will Drewry @ 2011-06-01 3:10 UTC (permalink / raw)
To: linux-kernel
Cc: kees.cook, torvalds, tglx, mingo, rostedt, jmorris, Will Drewry,
Randy Dunlap, linux-doc
Adds a text file covering what CONFIG_SECCOMP_FILTER is, how it is
implemented presently, and what it may be used for. In addition,
the limitations and caveats of the proposed implementation are
included.
v3: a little more cleanup
v2: moved to prctl/
updated for the v2 syntax.
adds a note about compat behavior
Signed-off-by: Will Drewry <wad@chromium.org>
---
Documentation/prctl/seccomp_filter.txt | 145 ++++++++++++++++++++++++++++++++
1 files changed, 145 insertions(+), 0 deletions(-)
create mode 100644 Documentation/prctl/seccomp_filter.txt
diff --git a/Documentation/prctl/seccomp_filter.txt b/Documentation/prctl/seccomp_filter.txt
new file mode 100644
index 0000000..27ac5af
--- /dev/null
+++ b/Documentation/prctl/seccomp_filter.txt
@@ -0,0 +1,145 @@
+ Seccomp filtering
+ =================
+
+Introduction
+------------
+
+A large number of system calls are exposed to every userland process
+with many of them going unused for the entire lifetime of the process.
+As system calls change and mature, bugs are found and eradicated. A
+certain subset of userland applications benefit by having a reduce set
+of available system calls. The reduced set reduces the total kernel
+surface exposed to the application. System call filtering is meant for
+use with those applications.
+
+The implementation currently leverages both the existing seccomp
+infrastructure and the kernel tracing infrastructure. By centralizing
+hooks for attack surface reduction in seccomp, it is possible to assure
+attention to security that is less relevant in normal ftrace scenarios,
+such as time-of-check, time-of-use attacks. However, ftrace provides a
+rich, human-friendly environment for interfacing with system call
+specific arguments. (As such, this requires FTRACE_SYSCALLS for any
+introspective filtering support.)
+
+
+What it isn't
+-------------
+
+System call filtering isn't a sandbox. It provides a clearly defined
+mechanism for minimizing the exposed kernel surface. Beyond that,
+policy for logical behavior and information flow should be managed with
+a combination of other system hardening techniques and, potentially, a
+LSM of your choosing. Expressive, dynamic filters based on the ftrace
+filter engine provide further options down this path (avoiding
+pathological sizes or selecting which of the multiplexed system calls in
+socketcall() is allowed, for instance) which could be construed,
+incorrectly, as a more complete sandboxing solution.
+
+
+Usage
+-----
+
+An additional seccomp mode is exposed through mode '2'.
+This mode depends on CONFIG_SECCOMP_FILTER. By default, it provides
+only the most trivial of filter support: "1" or cleared. However, if
+CONFIG_FTRACE_SYSCALLS is enabled, the ftrace filter engine may be used
+for more expressive filters.
+
+A collection of filters may be supplied via prctl, and the current set
+of filters is exposed in /proc/<pid>/seccomp_filter.
+
+Interacting with seccomp filters can be done through three new prctl calls
+and one existing one.
+
+PR_SET_SECCOMP:
+ A pre-existing option for enabling strict seccomp mode (1) or
+ filtering seccomp (2).
+
+ Usage:
+ prctl(PR_SET_SECCOMP, 1); /* strict */
+ prctl(PR_SET_SECCOMP, 2); /* filters */
+
+PR_SET_SECCOMP_FILTER:
+ Allows the specification of a new filter for a given system
+ call, by number, and filter string. If CONFIG_FTRACE_SYSCALLS is
+ supported, the filter string may be any valid value for the
+ given system call. If it is not supported, the filter string
+ may only be "1".
+
+ All calls to PR_SET_SECCOMP_FILTER for a given system
+ call will append the supplied string to any existing filters.
+ Filter construction looks as follows:
+ (Nothing) + "fd == 1 || fd == 2" => fd == 1 || fd == 2
+ ... + "fd != 2" => (fd == 1 || fd == 2) && fd != 2
+ ... + "size < 100" =>
+ ((fd == 1 || fd == 2) && fd != 2) && size < 100
+ If there is no filter and the seccomp mode has already
+ transitioned to filtering, additions cannot be made. Filters
+ may only be added that reduce the available kernel surface.
+
+ Usage (per the construction example above):
+ prctl(PR_SET_SECCOMP_FILTER, __NR_write, "fd == 1 || fd == 2");
+ prctl(PR_SET_SECCOMP_FILTER, __NR_write, "fd != 2");
+ prctl(PR_SET_SECCOMP_FILTER, __NR_write, "size < 100");
+
+PR_CLEAR_SECCOMP_FILTER:
+ Removes all filter entries for a given system call number. When
+ called prior to entering seccomp filtering mode, it allows for
+ new filters to be applied to the same system call. After
+ transition, however, it completely drops access to the call.
+
+ Usage:
+ prctl(PR_CLEAR_SECCOMP_FILTER, __NR_open);
+
+PR_GET_SECCOMP_FILTER: Returns the aggregated filter string for a system
+ call into a user-supplied buffer of a given length.
+
+ Usage:
+ prctl(PR_GET_SECCOMP_FILTER, __NR_write, buf,
+ sizeof(buf));
+
+All of the above calls return 0 on success and non-zero on error.
+
+
+Example
+-------
+
+Assume a process would like to cleanly read and write to stdin/out/err
+as well as access its filters after seccomp enforcement begins. This
+may be done as follows:
+
+ prctl(PR_SET_SECCOMP_FILTER, __NR_read, "fd == 0");
+ prctl(PR_SET_SECCOMP_FILTER, __NR_write, "fd == 1 || fd == 2");
+ prctl(PR_SET_SECCOMP_FILTER, __NR_exit, "1");
+ prctl(PR_SET_SECCOMP_FILTER, __NR_prctl, "1");
+
+ prctl(PR_SET_SECCOMP, PR_SECCOMP_MODE_FILTER, 0);
+
+ /* Do stuff with fdset . . .*/
+
+ /* Drop read access and keep only write access to fd 1. */
+ prctl(PR_CLEAR_SECCOMP_FILTER, __NR_read);
+ prctl(PR_SET_SECCOMP_FILTER, __NR_write, "fd != 2");
+
+ /* Perform any final processing . . . */
+ syscall(__NR_exit, 0);
+
+
+Caveats
+-------
+
+- The filter event subsystem comes from CONFIG_TRACE_EVENTS, and the
+system call events come from CONFIG_FTRACE_SYSCALLS. However, if
+neither is available, a filter string of "1" will be honored, and it may
+be removed using PR_CLEAR_SECCOMP_FILTER. With ftrace filtering,
+calling PR_SET_SECCOMP_FILTER with a filter of "0" would have similar
+effect but would not be consistent on a kernel without the support.
+
+- Some platforms support a 32-bit userspace with 64-bit kernels. In
+these cases (CONFIG_COMPAT), system call numbers may not match across
+64-bit and 32-bit system calls. When the first PR_SET_SECCOMP_FILTER
+is called, the in-memory filters state is annotated with whether the
+call has been made via the compat interface. All subsequent calls will
+be checked for compat call mismatch. In the long run, it may make sense
+to store compat and non-compat filters separately, but that is not
+supported at present.
--
1.7.0.4
^ permalink raw reply related [flat|nested] 91+ messages in thread
* Re: [PATCH v3 05/13] seccomp_filter: Document what seccomp_filter is and how it works.
2011-06-01 3:10 ` [PATCH v3 05/13] seccomp_filter: Document what seccomp_filter is and how it works Will Drewry
@ 2011-06-01 21:23 ` Kees Cook
2011-06-01 23:03 ` Will Drewry
0 siblings, 1 reply; 91+ messages in thread
From: Kees Cook @ 2011-06-01 21:23 UTC (permalink / raw)
To: Will Drewry
Cc: linux-kernel, torvalds, tglx, mingo, rostedt, jmorris,
Randy Dunlap, linux-doc
Hi Will,
Minor typo corrections below...
On Tue, May 31, 2011 at 10:10:37PM -0500, Will Drewry wrote:
> Adds a text file covering what CONFIG_SECCOMP_FILTER is, how it is
> implemented presently, and what it may be used for. In addition,
> the limitations and caveats of the proposed implementation are
> included.
>
> --- /dev/null
> +++ b/Documentation/prctl/seccomp_filter.txt
> @@ -0,0 +1,145 @@
> ...
> +certain subset of userland applications benefit by having a reduce set
reduced
> +of available system calls. The reduced set reduces the total kernel
Maybe "The resulting set reduces ... " ?
-Kees
--
Kees Cook
Ubuntu Security Team
^ permalink raw reply [flat|nested] 91+ messages in thread
* Re: [PATCH v3 05/13] seccomp_filter: Document what seccomp_filter is and how it works.
2011-06-01 21:23 ` Kees Cook
@ 2011-06-01 23:03 ` Will Drewry
0 siblings, 0 replies; 91+ messages in thread
From: Will Drewry @ 2011-06-01 23:03 UTC (permalink / raw)
To: Kees Cook
Cc: linux-kernel, torvalds, tglx, mingo, rostedt, jmorris,
Randy Dunlap, linux-doc
On Wed, Jun 1, 2011 at 4:23 PM, Kees Cook <kees.cook@canonical.com> wrote:
> Hi Will,
>
> Minor typo corrections below...
>
> On Tue, May 31, 2011 at 10:10:37PM -0500, Will Drewry wrote:
>> Adds a text file covering what CONFIG_SECCOMP_FILTER is, how it is
>> implemented presently, and what it may be used for. In addition,
>> the limitations and caveats of the proposed implementation are
>> included.
>>
>> --- /dev/null
>> +++ b/Documentation/prctl/seccomp_filter.txt
>> @@ -0,0 +1,145 @@
>> ...
>> +certain subset of userland applications benefit by having a reduce set
> reduced
>
>> +of available system calls. The reduced set reduces the total kernel
>
> Maybe "The resulting set reduces ... " ?
Cool - I'll clean it up in the next cut.
Thanks!
will
^ permalink raw reply [flat|nested] 91+ messages in thread
* [PATCH v3 06/13] x86: add HAVE_SECCOMP_FILTER and seccomp_execve
2011-05-26 18:49 ` Will Drewry
` (4 preceding siblings ...)
2011-06-01 3:10 ` [PATCH v3 05/13] seccomp_filter: Document what seccomp_filter is and how it works Will Drewry
@ 2011-06-01 3:10 ` Will Drewry
2011-06-01 3:10 ` [PATCH v3 07/13] arm: select HAVE_SECCOMP_FILTER Will Drewry
` (6 subsequent siblings)
12 siblings, 0 replies; 91+ messages in thread
From: Will Drewry @ 2011-06-01 3:10 UTC (permalink / raw)
To: linux-kernel
Cc: kees.cook, torvalds, tglx, mingo, rostedt, jmorris, Will Drewry,
Ingo Molnar, H. Peter Anvin, x86
Adds support to the x86 architecture by providing a compatibility-mode
wrapper for sys_execve's number and selecting HAVE_SECCOMP_FILTER.
Signed-off-by: Will Drewry <wad@chromium.org>
---
arch/x86/Kconfig | 1 +
arch/x86/include/asm/ia32_unistd.h | 1 +
arch/x86/include/asm/seccomp_64.h | 2 ++
3 files changed, 4 insertions(+), 0 deletions(-)
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index cc6c53a..1843d17 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -64,6 +64,7 @@ config X86
select HAVE_TEXT_POKE_SMP
select HAVE_GENERIC_HARDIRQS
select HAVE_SPARSE_IRQ
+ select HAVE_SECCOMP_FILTER
select GENERIC_FIND_FIRST_BIT
select GENERIC_FIND_NEXT_BIT
select GENERIC_IRQ_PROBE
diff --git a/arch/x86/include/asm/ia32_unistd.h b/arch/x86/include/asm/ia32_unistd.h
index 976f6ec..8ed2922 100644
--- a/arch/x86/include/asm/ia32_unistd.h
+++ b/arch/x86/include/asm/ia32_unistd.h
@@ -12,6 +12,7 @@
#define __NR_ia32_exit 1
#define __NR_ia32_read 3
#define __NR_ia32_write 4
+#define __NR_ia32_execve 11
#define __NR_ia32_sigreturn 119
#define __NR_ia32_rt_sigreturn 173
diff --git a/arch/x86/include/asm/seccomp_64.h b/arch/x86/include/asm/seccomp_64.h
index 84ec1bd..85c4219 100644
--- a/arch/x86/include/asm/seccomp_64.h
+++ b/arch/x86/include/asm/seccomp_64.h
@@ -8,10 +8,12 @@
#define __NR_seccomp_write __NR_write
#define __NR_seccomp_exit __NR_exit
#define __NR_seccomp_sigreturn __NR_rt_sigreturn
+#define __NR_seccomp_execve __NR_execve
#define __NR_seccomp_read_32 __NR_ia32_read
#define __NR_seccomp_write_32 __NR_ia32_write
#define __NR_seccomp_exit_32 __NR_ia32_exit
#define __NR_seccomp_sigreturn_32 __NR_ia32_sigreturn
+#define __NR_seccomp_execve_32 __NR_ia32_execve
#endif /* _ASM_X86_SECCOMP_64_H */
--
1.7.0.4
^ permalink raw reply related [flat|nested] 91+ messages in thread
* [PATCH v3 07/13] arm: select HAVE_SECCOMP_FILTER
2011-05-26 18:49 ` Will Drewry
` (5 preceding siblings ...)
2011-06-01 3:10 ` [PATCH v3 06/13] x86: add HAVE_SECCOMP_FILTER and seccomp_execve Will Drewry
@ 2011-06-01 3:10 ` Will Drewry
2011-06-01 3:10 ` [PATCH v3 08/13] microblaze: select HAVE_SECCOMP_FILTER and provide seccomp_execve Will Drewry
` (5 subsequent siblings)
12 siblings, 0 replies; 91+ messages in thread
From: Will Drewry @ 2011-06-01 3:10 UTC (permalink / raw)
To: linux-kernel
Cc: kees.cook, torvalds, tglx, mingo, rostedt, jmorris, Will Drewry,
Russell King, linux-arm-kernel
Enable support for CONFIG_SECCOMP_FILTER by selecting HAVE_SECCOMP_FILTER by
default.
Signed-off-by: Will Drewry <wad@chromium.org>
---
arch/arm/Kconfig | 1 +
1 files changed, 1 insertions(+), 0 deletions(-)
diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig
index 377a7a5..4725fbc 100644
--- a/arch/arm/Kconfig
+++ b/arch/arm/Kconfig
@@ -16,6 +16,7 @@ config ARM
select HAVE_FTRACE_MCOUNT_RECORD if (!XIP_KERNEL)
select HAVE_DYNAMIC_FTRACE if (!XIP_KERNEL)
select HAVE_FUNCTION_GRAPH_TRACER if (!THUMB2_KERNEL)
+ select HAVE_SECCOMP_FILTER
select HAVE_GENERIC_DMA_COHERENT
select HAVE_KERNEL_GZIP
select HAVE_KERNEL_LZO
--
1.7.0.4
^ permalink raw reply related [flat|nested] 91+ messages in thread
* [PATCH v3 08/13] microblaze: select HAVE_SECCOMP_FILTER and provide seccomp_execve
2011-05-26 18:49 ` Will Drewry
` (6 preceding siblings ...)
2011-06-01 3:10 ` [PATCH v3 07/13] arm: select HAVE_SECCOMP_FILTER Will Drewry
@ 2011-06-01 3:10 ` Will Drewry
2011-06-01 5:37 ` Michal Simek
2011-06-01 3:10 ` [PATCH v3 09/13] mips: " Will Drewry
` (4 subsequent siblings)
12 siblings, 1 reply; 91+ messages in thread
From: Will Drewry @ 2011-06-01 3:10 UTC (permalink / raw)
To: linux-kernel
Cc: kees.cook, torvalds, tglx, mingo, rostedt, jmorris, Will Drewry,
Michal Simek, microblaze-uclinux
Facilitate the use of CONFIG_SECCOMP_FILTER by wrapping compatibility
system call numbering for execve and selecting HAVE_SECCOMP_FILTER.
Signed-off-by: Will Drewry <wad@chromium.org>
---
arch/microblaze/Kconfig | 1 +
arch/microblaze/include/asm/seccomp.h | 2 ++
2 files changed, 3 insertions(+), 0 deletions(-)
diff --git a/arch/microblaze/Kconfig b/arch/microblaze/Kconfig
index eccdefe..30ef677 100644
--- a/arch/microblaze/Kconfig
+++ b/arch/microblaze/Kconfig
@@ -1,6 +1,7 @@
config MICROBLAZE
def_bool y
select HAVE_MEMBLOCK
+ select HAVE_SECCOMP_FILTER
select HAVE_FUNCTION_TRACER
select HAVE_FUNCTION_TRACE_MCOUNT_TEST
select HAVE_FUNCTION_GRAPH_TRACER
diff --git a/arch/microblaze/include/asm/seccomp.h b/arch/microblaze/include/asm/seccomp.h
index 0d91275..0e38eed 100644
--- a/arch/microblaze/include/asm/seccomp.h
+++ b/arch/microblaze/include/asm/seccomp.h
@@ -7,10 +7,12 @@
#define __NR_seccomp_write __NR_write
#define __NR_seccomp_exit __NR_exit
#define __NR_seccomp_sigreturn __NR_sigreturn
+#define __NR_seccomp_execve __NR_execve
#define __NR_seccomp_read_32 __NR_read
#define __NR_seccomp_write_32 __NR_write
#define __NR_seccomp_exit_32 __NR_exit
#define __NR_seccomp_sigreturn_32 __NR_sigreturn
+#define __NR_seccomp_execve_32 __NR_execve
#endif /* _ASM_MICROBLAZE_SECCOMP_H */
--
1.7.0.4
^ permalink raw reply related [flat|nested] 91+ messages in thread
* Re: [PATCH v3 08/13] microblaze: select HAVE_SECCOMP_FILTER and provide seccomp_execve
2011-06-01 3:10 ` [PATCH v3 08/13] microblaze: select HAVE_SECCOMP_FILTER and provide seccomp_execve Will Drewry
@ 2011-06-01 5:37 ` Michal Simek
0 siblings, 0 replies; 91+ messages in thread
From: Michal Simek @ 2011-06-01 5:37 UTC (permalink / raw)
To: Will Drewry
Cc: linux-kernel, kees.cook, torvalds, tglx, mingo, rostedt, jmorris,
microblaze-uclinux
Will Drewry wrote:
> Facilitate the use of CONFIG_SECCOMP_FILTER by wrapping compatibility
> system call numbering for execve and selecting HAVE_SECCOMP_FILTER.
>
> Signed-off-by: Will Drewry <wad@chromium.org>
> ---
> arch/microblaze/Kconfig | 1 +
> arch/microblaze/include/asm/seccomp.h | 2 ++
> 2 files changed, 3 insertions(+), 0 deletions(-)
Acked-by: Michal Simek <monstr@monstr.eu>
--
Michal Simek, Ing. (M.Eng)
w: www.monstr.eu p: +42-0-721842854
Maintainer of Linux kernel 2.6 Microblaze Linux - http://www.monstr.eu/fdt/
Microblaze U-BOOT custodian
^ permalink raw reply [flat|nested] 91+ messages in thread
* [PATCH v3 09/13] mips: select HAVE_SECCOMP_FILTER and provide seccomp_execve
2011-05-26 18:49 ` Will Drewry
` (7 preceding siblings ...)
2011-06-01 3:10 ` [PATCH v3 08/13] microblaze: select HAVE_SECCOMP_FILTER and provide seccomp_execve Will Drewry
@ 2011-06-01 3:10 ` Will Drewry
2011-06-01 3:10 ` [PATCH v3 10/13] s390: " Will Drewry
` (3 subsequent siblings)
12 siblings, 0 replies; 91+ messages in thread
From: Will Drewry @ 2011-06-01 3:10 UTC (permalink / raw)
To: linux-kernel
Cc: kees.cook, torvalds, tglx, mingo, rostedt, jmorris, Will Drewry,
Ralf Baechle, linux-mips
Facilitate the use of CONFIG_SECCOMP_FILTER by wrapping compatibility
system call numbering for execve and selecting HAVE_SECCOMP_FILTER.
Signed-off-by: Will Drewry <wad@chromium.org>
---
arch/mips/Kconfig | 1 +
arch/mips/include/asm/seccomp.h | 3 +++
2 files changed, 4 insertions(+), 0 deletions(-)
diff --git a/arch/mips/Kconfig b/arch/mips/Kconfig
index 8e256cc..d376f68 100644
--- a/arch/mips/Kconfig
+++ b/arch/mips/Kconfig
@@ -10,6 +10,7 @@ config MIPS
select HAVE_ARCH_KGDB
select HAVE_FUNCTION_TRACER
select HAVE_FUNCTION_TRACE_MCOUNT_TEST
+ select HAVE_SECCOMP_FILTER
select HAVE_DYNAMIC_FTRACE
select HAVE_FTRACE_MCOUNT_RECORD
select HAVE_C_RECORDMCOUNT
diff --git a/arch/mips/include/asm/seccomp.h b/arch/mips/include/asm/seccomp.h
index ae6306e..4014a3a 100644
--- a/arch/mips/include/asm/seccomp.h
+++ b/arch/mips/include/asm/seccomp.h
@@ -6,6 +6,7 @@
#define __NR_seccomp_write __NR_write
#define __NR_seccomp_exit __NR_exit
#define __NR_seccomp_sigreturn __NR_rt_sigreturn
+#define __NR_seccomp_execve __NR_execve
/*
* Kludge alert:
@@ -19,6 +20,7 @@
#define __NR_seccomp_write_32 4004
#define __NR_seccomp_exit_32 4001
#define __NR_seccomp_sigreturn_32 4193 /* rt_sigreturn */
+#define __NR_seccomp_execve_32 4011
#elif defined(CONFIG_MIPS32_N32)
@@ -26,6 +28,7 @@
#define __NR_seccomp_write_32 6001
#define __NR_seccomp_exit_32 6058
#define __NR_seccomp_sigreturn_32 6211 /* rt_sigreturn */
+#define __NR_seccomp_execve_32 6057
#endif /* CONFIG_MIPS32_O32 */
--
1.7.0.4
^ permalink raw reply related [flat|nested] 91+ messages in thread
* [PATCH v3 10/13] s390: select HAVE_SECCOMP_FILTER and provide seccomp_execve
2011-05-26 18:49 ` Will Drewry
` (8 preceding siblings ...)
2011-06-01 3:10 ` [PATCH v3 09/13] mips: " Will Drewry
@ 2011-06-01 3:10 ` Will Drewry
2011-06-01 3:10 ` [PATCH v3 11/13] powerpc: " Will Drewry
` (2 subsequent siblings)
12 siblings, 0 replies; 91+ messages in thread
From: Will Drewry @ 2011-06-01 3:10 UTC (permalink / raw)
To: linux-kernel
Cc: kees.cook, torvalds, tglx, mingo, rostedt, jmorris, Will Drewry,
Martin Schwidefsky, Heiko Carstens, linux390, linux-s390
Facilitate the use of CONFIG_SECCOMP_FILTER by wrapping compatibility
system call numbering for execve and selecting HAVE_SECCOMP_FILTER.
Signed-off-by: Will Drewry <wad@chromium.org>
---
arch/s390/Kconfig | 1 +
arch/s390/include/asm/seccomp.h | 2 ++
2 files changed, 3 insertions(+), 0 deletions(-)
diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig
index 2508a6f..9382198 100644
--- a/arch/s390/Kconfig
+++ b/arch/s390/Kconfig
@@ -64,6 +64,7 @@ config ARCH_SUPPORTS_DEBUG_PAGEALLOC
config S390
def_bool y
select USE_GENERIC_SMP_HELPERS if SMP
+ select HAVE_SECCOMP_FILTER
select HAVE_SYSCALL_WRAPPERS
select HAVE_FUNCTION_TRACER
select HAVE_FUNCTION_TRACE_MCOUNT_TEST
diff --git a/arch/s390/include/asm/seccomp.h b/arch/s390/include/asm/seccomp.h
index 781a9cf..e5792f5 100644
--- a/arch/s390/include/asm/seccomp.h
+++ b/arch/s390/include/asm/seccomp.h
@@ -7,10 +7,12 @@
#define __NR_seccomp_write __NR_write
#define __NR_seccomp_exit __NR_exit
#define __NR_seccomp_sigreturn __NR_sigreturn
+#define __NR_seccomp_execve __NR_execve
#define __NR_seccomp_read_32 __NR_read
#define __NR_seccomp_write_32 __NR_write
#define __NR_seccomp_exit_32 __NR_exit
#define __NR_seccomp_sigreturn_32 __NR_sigreturn
+#define __NR_seccomp_execve_32 __NR_execve
#endif /* _ASM_S390_SECCOMP_H */
--
1.7.0.4
^ permalink raw reply related [flat|nested] 91+ messages in thread
* [PATCH v3 11/13] powerpc: select HAVE_SECCOMP_FILTER and provide seccomp_execve
2011-05-26 18:49 ` Will Drewry
` (9 preceding siblings ...)
2011-06-01 3:10 ` [PATCH v3 10/13] s390: " Will Drewry
@ 2011-06-01 3:10 ` Will Drewry
2011-06-01 3:10 ` [PATCH v3 12/13] sparc: " Will Drewry
2011-06-01 3:10 ` [PATCH v3 13/13] sh: select HAVE_SECCOMP_FILTER Will Drewry
12 siblings, 0 replies; 91+ messages in thread
From: Will Drewry @ 2011-06-01 3:10 UTC (permalink / raw)
To: linux-kernel
Cc: kees.cook, torvalds, tglx, mingo, rostedt, jmorris, Will Drewry,
Benjamin Herrenschmidt, Paul Mackerras, linuxppc-dev
Facilitate the use of CONFIG_SECCOMP_FILTER by wrapping compatibility
system call numbering for execve and selecting HAVE_SECCOMP_FILTER.
Signed-off-by: Will Drewry <wad@chromium.org>
---
arch/powerpc/Kconfig | 1 +
arch/powerpc/include/asm/seccomp.h | 2 ++
2 files changed, 3 insertions(+), 0 deletions(-)
diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index 8f4d50b..0bd4574 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -137,6 +137,7 @@ config PPC
select HAVE_HW_BREAKPOINT if PERF_EVENTS && PPC_BOOK3S_64
select HAVE_GENERIC_HARDIRQS
select HAVE_SPARSE_IRQ
+ select HAVE_SECCOMP_FILTER
select IRQ_PER_CPU
select GENERIC_IRQ_SHOW
select GENERIC_IRQ_SHOW_LEVEL
diff --git a/arch/powerpc/include/asm/seccomp.h b/arch/powerpc/include/asm/seccomp.h
index 00c1d91..3cb9cc1 100644
--- a/arch/powerpc/include/asm/seccomp.h
+++ b/arch/powerpc/include/asm/seccomp.h
@@ -7,10 +7,12 @@
#define __NR_seccomp_write __NR_write
#define __NR_seccomp_exit __NR_exit
#define __NR_seccomp_sigreturn __NR_rt_sigreturn
+#define __NR_seccomp_execve __NR_execve
#define __NR_seccomp_read_32 __NR_read
#define __NR_seccomp_write_32 __NR_write
#define __NR_seccomp_exit_32 __NR_exit
#define __NR_seccomp_sigreturn_32 __NR_sigreturn
+#define __NR_seccomp_execve_32 __NR_execve
#endif /* _ASM_POWERPC_SECCOMP_H */
--
1.7.0.4
^ permalink raw reply related [flat|nested] 91+ messages in thread
* [PATCH v3 12/13] sparc: select HAVE_SECCOMP_FILTER and provide seccomp_execve
2011-05-26 18:49 ` Will Drewry
` (10 preceding siblings ...)
2011-06-01 3:10 ` [PATCH v3 11/13] powerpc: " Will Drewry
@ 2011-06-01 3:10 ` Will Drewry
2011-06-01 3:35 ` David Miller
2011-06-01 3:10 ` [PATCH v3 13/13] sh: select HAVE_SECCOMP_FILTER Will Drewry
12 siblings, 1 reply; 91+ messages in thread
From: Will Drewry @ 2011-06-01 3:10 UTC (permalink / raw)
To: linux-kernel
Cc: kees.cook, torvalds, tglx, mingo, rostedt, jmorris, Will Drewry,
David S. Miller, sparclinux
Facilitate the use of CONFIG_SECCOMP_FILTER by wrapping compatibility
system call numbering for execve and selecting HAVE_SECCOMP_FILTER.
Signed-off-by: Will Drewry <wad@chromium.org>
---
arch/sparc/Kconfig | 2 ++
arch/sparc/include/asm/seccomp.h | 2 ++
2 files changed, 4 insertions(+), 0 deletions(-)
diff --git a/arch/sparc/Kconfig b/arch/sparc/Kconfig
index e560d10..5249760 100644
--- a/arch/sparc/Kconfig
+++ b/arch/sparc/Kconfig
@@ -25,6 +25,7 @@ config SPARC
select HAVE_DMA_ATTRS
select HAVE_DMA_API_DEBUG
select HAVE_ARCH_JUMP_LABEL
+ select HAVE_SECCOMP_FILTER
config SPARC32
def_bool !64BIT
@@ -39,6 +40,7 @@ config SPARC64
select HAVE_KRETPROBES
select HAVE_KPROBES
select HAVE_MEMBLOCK
+ select HAVE_SECCOMP_FILTER
select HAVE_SYSCALL_WRAPPERS
select HAVE_DYNAMIC_FTRACE
select HAVE_FTRACE_MCOUNT_RECORD
diff --git a/arch/sparc/include/asm/seccomp.h b/arch/sparc/include/asm/seccomp.h
index adca1bc..a1dac08 100644
--- a/arch/sparc/include/asm/seccomp.h
+++ b/arch/sparc/include/asm/seccomp.h
@@ -6,10 +6,12 @@
#define __NR_seccomp_write __NR_write
#define __NR_seccomp_exit __NR_exit
#define __NR_seccomp_sigreturn __NR_rt_sigreturn
+#define __NR_seccomp_execve __NR_execve
#define __NR_seccomp_read_32 __NR_read
#define __NR_seccomp_write_32 __NR_write
#define __NR_seccomp_exit_32 __NR_exit
#define __NR_seccomp_sigreturn_32 __NR_sigreturn
+#define __NR_seccomp_execve_32 __NR_execve
#endif /* _ASM_SECCOMP_H */
--
1.7.0.4
^ permalink raw reply related [flat|nested] 91+ messages in thread
* Re: [PATCH v3 12/13] sparc: select HAVE_SECCOMP_FILTER and provide seccomp_execve
2011-06-01 3:10 ` [PATCH v3 12/13] sparc: " Will Drewry
@ 2011-06-01 3:35 ` David Miller
0 siblings, 0 replies; 91+ messages in thread
From: David Miller @ 2011-06-01 3:35 UTC (permalink / raw)
To: wad
Cc: linux-kernel, kees.cook, torvalds, tglx, mingo, rostedt, jmorris,
sparclinux
From: Will Drewry <wad@chromium.org>
Date: Tue, 31 May 2011 22:10:44 -0500
> Facilitate the use of CONFIG_SECCOMP_FILTER by wrapping compatibility
> system call numbering for execve and selecting HAVE_SECCOMP_FILTER.
>
> Signed-off-by: Will Drewry <wad@chromium.org>
Acked-by: David S. Miller <davem@davemloft.net>
^ permalink raw reply [flat|nested] 91+ messages in thread
* [PATCH v3 13/13] sh: select HAVE_SECCOMP_FILTER
2011-05-26 18:49 ` Will Drewry
` (11 preceding siblings ...)
2011-06-01 3:10 ` [PATCH v3 12/13] sparc: " Will Drewry
@ 2011-06-01 3:10 ` Will Drewry
2011-06-02 5:27 ` Paul Mundt
12 siblings, 1 reply; 91+ messages in thread
From: Will Drewry @ 2011-06-01 3:10 UTC (permalink / raw)
To: linux-kernel
Cc: kees.cook, torvalds, tglx, mingo, rostedt, jmorris, Will Drewry,
Paul Mundt, linux-sh
Add support for CONFIG_SECCOMP_FILTER by selecting HAVE_SECCOMP_FILTER.
---
arch/sh/Kconfig | 1 +
1 files changed, 1 insertions(+), 0 deletions(-)
diff --git a/arch/sh/Kconfig b/arch/sh/Kconfig
index 4b89da2..41ea3a7 100644
--- a/arch/sh/Kconfig
+++ b/arch/sh/Kconfig
@@ -10,6 +10,7 @@ config SUPERH
select HAVE_DMA_API_DEBUG
select HAVE_DMA_ATTRS
select HAVE_IRQ_WORK
+ select HAVE_SECCOMP_FILTER
select HAVE_PERF_EVENTS
select PERF_USE_VMALLOC
select HAVE_KERNEL_GZIP
--
1.7.0.4
^ permalink raw reply related [flat|nested] 91+ messages in thread
* Re: [PATCH v3 13/13] sh: select HAVE_SECCOMP_FILTER
2011-06-01 3:10 ` [PATCH v3 13/13] sh: select HAVE_SECCOMP_FILTER Will Drewry
@ 2011-06-02 5:27 ` Paul Mundt
0 siblings, 0 replies; 91+ messages in thread
From: Paul Mundt @ 2011-06-02 5:27 UTC (permalink / raw)
To: Will Drewry
Cc: linux-kernel, kees.cook, torvalds, tglx, mingo, rostedt, jmorris,
linux-sh
On Tue, May 31, 2011 at 10:10:45PM -0500, Will Drewry wrote:
> Add support for CONFIG_SECCOMP_FILTER by selecting HAVE_SECCOMP_FILTER.
> ---
> arch/sh/Kconfig | 1 +
> 1 files changed, 1 insertions(+), 0 deletions(-)
>
Acked-by: Paul Mundt <lethal@linux-sh.org>
^ permalink raw reply [flat|nested] 91+ messages in thread