* [RFC v2 1/2] proc connector: add namespace events
From: Alban Crequy @ 2016-10-15 12:26 UTC
To: linux-kernel-u79uwXL29TY76Z2rM5mHXA,
containers-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA,
netdev-u79uwXL29TY76Z2rM5mHXA
Cc: Iago Lopez Galeiras, Aaron Campbell, Jiri Benc, Jesper Derehag,
Alban Crequy, Tejun Heo, Evgeniy Polyakov, Dimitri John Ledkov
From: Alban Crequy <alban-lYLaGTFnO9sWenYVfaLwtA@public.gmane.org>
The act of a process creating or joining a namespace via clone(),
unshare() or setns() is a useful signal for monitoring applications.
I am working on a monitoring application that keeps track of all the
containers and all the processes inside each container. The current way
to do this is to poll /proc regularly for the list of processes, and
/proc/*/ns/* to learn which namespaces each process belongs to. This is
inefficient on systems with many containers and many processes.
Instead, I would like to inspect /proc only once and then receive
updates through the proc connector. Unfortunately, the proc connector
reports the list of processes but does not notify me when a process
changes namespaces, so I would still need to poll /proc/*/ns/*.
This patch adds namespace events to the proc connector: an event is
generated each time a process changes namespaces via clone(), unshare()
or setns().
For example, the following command:
| # unshare -n -i -f ls -l /proc/self/ns/
| total 0
| lrwxrwxrwx 1 root root 0 Sep 25 22:31 cgroup -> 'cgroup:[4026531835]'
| lrwxrwxrwx 1 root root 0 Sep 25 22:31 ipc -> 'ipc:[4026532208]'
| lrwxrwxrwx 1 root root 0 Sep 25 22:31 mnt -> 'mnt:[4026531840]'
| lrwxrwxrwx 1 root root 0 Sep 25 22:31 net -> 'net:[4026532210]'
| lrwxrwxrwx 1 root root 0 Sep 25 22:31 pid -> 'pid:[4026531836]'
| lrwxrwxrwx 1 root root 0 Sep 25 22:31 user -> 'user:[4026531837]'
| lrwxrwxrwx 1 root root 0 Sep 25 22:31 uts -> 'uts:[4026531838]'
causes the proc connector to generate the following events:
| fork: ppid=691 pid=808
| exec: pid=808
| ns: pid=808 reason=unshare count=2
| type=ipc 4026531839 -> 4026532208
| type=net 4026531957 -> 4026532210
| fork: ppid=808 pid=809
| exec: pid=809
| exit: pid=809
| exit: pid=808
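For illustration (not part of the patch), a minimal sketch of a
userspace listener for these events; error handling is omitted, it
requires CAP_NET_ADMIN in the initial namespaces, and it assumes the
uapi additions below:

#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/socket.h>
#include <linux/netlink.h>
#include <linux/connector.h>
#include <linux/cn_proc.h>

int main(void)
{
	struct sockaddr_nl sa = {
		.nl_family = AF_NETLINK,
		.nl_groups = CN_IDX_PROC,
		.nl_pid    = getpid(),
	};
	struct __attribute__((aligned(NLMSG_ALIGNTO))) {
		struct nlmsghdr nl_hdr;
		struct __attribute__((__packed__)) {
			struct cn_msg cn_msg;
			enum proc_cn_mcast_op cn_mcast;
		};
	} nlcn_msg;
	int sock;

	sock = socket(PF_NETLINK, SOCK_DGRAM, NETLINK_CONNECTOR);
	bind(sock, (struct sockaddr *)&sa, sizeof(sa));

	/* Subscribe to proc connector events. */
	memset(&nlcn_msg, 0, sizeof(nlcn_msg));
	nlcn_msg.nl_hdr.nlmsg_len = sizeof(nlcn_msg);
	nlcn_msg.nl_hdr.nlmsg_pid = getpid();
	nlcn_msg.nl_hdr.nlmsg_type = NLMSG_DONE;
	nlcn_msg.cn_msg.id.idx = CN_IDX_PROC;
	nlcn_msg.cn_msg.id.val = CN_VAL_PROC;
	nlcn_msg.cn_msg.len = sizeof(enum proc_cn_mcast_op);
	nlcn_msg.cn_mcast = PROC_CN_MCAST_LISTEN;
	send(sock, &nlcn_msg, sizeof(nlcn_msg), 0);

	for (;;) {
		struct __attribute__((aligned(NLMSG_ALIGNTO))) {
			struct nlmsghdr nl_hdr;
			struct __attribute__((__packed__)) {
				struct cn_msg cn_msg;
				struct proc_event proc_ev;
			};
		} ev;
		__u32 i;

		if (recv(sock, &ev, sizeof(ev), 0) <= 0)
			break;
		if (ev.proc_ev.what != PROC_EVENT_NS)
			continue;
		printf("ns: pid=%d reason=%u count=%u\n",
		       ev.proc_ev.event_data.ns.process_pid,
		       (unsigned)ev.proc_ev.event_data.ns.reason,
		       ev.proc_ev.event_data.ns.count);
		/* One item per namespace that changed. */
		for (i = 0; i < ev.proc_ev.event_data.ns.count; i++)
			printf("  type=0x%x %llu -> %llu\n",
			       ev.proc_ev.event_data.ns.items[i].type,
			       (unsigned long long)ev.proc_ev.event_data.ns.items[i].old_inum,
			       (unsigned long long)ev.proc_ev.event_data.ns.items[i].inum);
	}
	return 0;
}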
Signed-off-by: Alban Crequy <alban-lYLaGTFnO9sWenYVfaLwtA@public.gmane.org>
---
drivers/connector/cn_proc.c | 138 +++++++++++++++++++++++++++++++++++++++++++
include/linux/cn_proc.h | 25 ++++++++
include/uapi/linux/cn_proc.h | 23 +++++++-
kernel/fork.c | 10 ++++
kernel/nsproxy.c | 6 ++
5 files changed, 201 insertions(+), 1 deletion(-)
diff --git a/drivers/connector/cn_proc.c b/drivers/connector/cn_proc.c
index a782ce8..c38733d 100644
--- a/drivers/connector/cn_proc.c
+++ b/drivers/connector/cn_proc.c
@@ -30,8 +30,13 @@
#include <linux/ptrace.h>
#include <linux/atomic.h>
#include <linux/pid_namespace.h>
+#include <linux/ipc_namespace.h>
+#include <linux/utsname.h>
+#include <net/net_namespace.h>
+#include <linux/mnt_namespace.h>
#include <linux/cn_proc.h>
+#include <linux/proc_ns.h>
/*
* Size of a cn_msg followed by a proc_event structure. Since the
@@ -296,6 +301,139 @@ void proc_exit_connector(struct task_struct *task)
send_msg(msg);
}
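+
+/*
+ * Record the namespace inode numbers of the current task before a
+ * clone(), unshare() or setns() operation.  proc_ns_connector_send()
+ * compares this snapshot against the task's namespaces after the
+ * operation and reports only those that changed.
+ */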
+void proc_ns_connector_prepare(struct ns_event_prepare *prepare, u16 reason)
+{
+ struct nsproxy *ns = current->nsproxy;
+ struct ns_common *mntns;
+
+ prepare->num_listeners = atomic_read(&proc_event_num_listeners);
+
+ if (prepare->num_listeners < 1)
+ return;
+
+ prepare->reason = reason;
+
+ prepare->user_inum = current->cred->user_ns->ns.inum;
+ prepare->uts_inum = ns->uts_ns->ns.inum;
+ prepare->ipc_inum = ns->ipc_ns->ns.inum;
+
+ mntns = mntns_operations.get(current);
+ if (mntns) {
+ prepare->mnt_inum = mntns->inum;
+ mntns_operations.put(mntns);
+	} else {
+		prepare->mnt_inum = 0;
+	}
+
+ prepare->pid_inum = ns->pid_ns_for_children->ns.inum;
+ prepare->net_inum = ns->net_ns->ns.inum;
+ prepare->cgroup_inum = ns->cgroup_ns->ns.inum;
+}
+
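+/*
+ * Emit a PROC_EVENT_NS message for @task, listing every namespace whose
+ * inode number differs from the snapshot taken by
+ * proc_ns_connector_prepare().  Nothing is sent when no namespace
+ * changed.
+ */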
+void proc_ns_connector_send(struct ns_event_prepare *prepare, struct task_struct *task)
+{
+ struct nsproxy *ns = task->nsproxy;
+ struct ns_common *mntns;
+ struct cn_msg *msg;
+ struct proc_event *ev;
+ __u8 buffer[CN_PROC_MSG_SIZE] __aligned(8);
+ int count;
+
+ if (prepare->num_listeners < 1)
+ return;
+
+ if (atomic_read(&proc_event_num_listeners) < 1)
+ return;
+
+ msg = buffer_to_cn_msg(buffer);
+ ev = (struct proc_event *)msg->data;
+ memset(&ev->event_data, 0, sizeof(ev->event_data));
+ ev->timestamp_ns = ktime_get_ns();
+ ev->what = PROC_EVENT_NS;
+
+ ev->event_data.ns.process_pid = task->pid;
+ ev->event_data.ns.process_tgid = task->tgid;
+ ev->event_data.ns.reason = prepare->reason;
+ count = 0;
+
+ /* user */
+ if (prepare->user_inum != task->cred->user_ns->ns.inum) {
+ ev->event_data.ns.items[count].type = CLONE_NEWUSER;
+ ev->event_data.ns.items[count].flags = 0;
+ ev->event_data.ns.items[count].old_inum = prepare->user_inum;
+ ev->event_data.ns.items[count].inum = task->cred->user_ns->ns.inum;
+ count++;
+ }
+
+ /* uts */
+ if (prepare->uts_inum != ns->uts_ns->ns.inum) {
+ ev->event_data.ns.items[count].type = CLONE_NEWUTS;
+ ev->event_data.ns.items[count].flags = 0;
+ ev->event_data.ns.items[count].old_inum = prepare->uts_inum;
+ ev->event_data.ns.items[count].inum = ns->uts_ns->ns.inum;
+ count++;
+ }
+
+ /* ipc */
+ if (prepare->ipc_inum != ns->ipc_ns->ns.inum) {
+ ev->event_data.ns.items[count].type = CLONE_NEWIPC;
+ ev->event_data.ns.items[count].flags = 0;
+ ev->event_data.ns.items[count].old_inum = prepare->ipc_inum;
+ ev->event_data.ns.items[count].inum = ns->ipc_ns->ns.inum;
+ count++;
+ }
+
+ /* mnt */
+ mntns = mntns_operations.get(task);
+ if (mntns) {
+		if (prepare->mnt_inum != mntns->inum) {
+ ev->event_data.ns.items[count].type = CLONE_NEWNS;
+ ev->event_data.ns.items[count].flags = 0;
+ ev->event_data.ns.items[count].old_inum = prepare->mnt_inum;
+ ev->event_data.ns.items[count].inum = mntns->inum;
+ count++;
+ }
+ mntns_operations.put(mntns);
+ }
+
+ /* pid */
+ if (prepare->pid_inum != ns->pid_ns_for_children->ns.inum) {
+ ev->event_data.ns.items[count].type = CLONE_NEWPID;
+ ev->event_data.ns.items[count].flags = 0;
+ ev->event_data.ns.items[count].old_inum = prepare->pid_inum;
+ ev->event_data.ns.items[count].inum = ns->pid_ns_for_children->ns.inum;
+ count++;
+ }
+
+ /* net */
+ if (prepare->net_inum != ns->net_ns->ns.inum) {
+ ev->event_data.ns.items[count].type = CLONE_NEWNET;
+ ev->event_data.ns.items[count].flags = 0;
+ ev->event_data.ns.items[count].old_inum = prepare->net_inum;
+ ev->event_data.ns.items[count].inum = ns->net_ns->ns.inum;
+ count++;
+ }
+
+ /* cgroup */
+ if (prepare->cgroup_inum != ns->cgroup_ns->ns.inum) {
+		ev->event_data.ns.items[count].type = CLONE_NEWCGROUP;
+ ev->event_data.ns.items[count].flags = 0;
+ ev->event_data.ns.items[count].old_inum = prepare->cgroup_inum;
+ ev->event_data.ns.items[count].inum = ns->cgroup_ns->ns.inum;
+ count++;
+ }
+
+ if (count == 0)
+ return;
+
+ ev->event_data.ns.count = count;
+
+ memcpy(&msg->id, &cn_proc_event_id, sizeof(msg->id));
+ msg->ack = 0; /* not used */
+ msg->len = sizeof(*ev);
+ msg->flags = 0; /* not used */
+ send_msg(msg);
+}
+
/*
* Send an acknowledgement message to userspace
*
diff --git a/include/linux/cn_proc.h b/include/linux/cn_proc.h
index 1d5b02a..8bf42f4 100644
--- a/include/linux/cn_proc.h
+++ b/include/linux/cn_proc.h
@@ -19,6 +19,20 @@
#include <uapi/linux/cn_proc.h>
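+
+/*
+ * Snapshot of a task's namespace inode numbers, filled in by
+ * proc_ns_connector_prepare() and consumed by proc_ns_connector_send().
+ */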
+struct ns_event_prepare {
+ int num_listeners;
+
+ u16 reason;
+
+ u64 user_inum;
+ u64 uts_inum;
+ u64 ipc_inum;
+ u64 mnt_inum;
+ u64 pid_inum;
+ u64 net_inum;
+ u64 cgroup_inum;
+};
+
#ifdef CONFIG_PROC_EVENTS
void proc_fork_connector(struct task_struct *task);
void proc_exec_connector(struct task_struct *task);
@@ -28,6 +42,9 @@ void proc_ptrace_connector(struct task_struct *task, int which_id);
void proc_comm_connector(struct task_struct *task);
void proc_coredump_connector(struct task_struct *task);
void proc_exit_connector(struct task_struct *task);
+
+void proc_ns_connector_prepare(struct ns_event_prepare *prepare, u16 reason);
+void proc_ns_connector_send(struct ns_event_prepare *prepare, struct task_struct *task);
#else
static inline void proc_fork_connector(struct task_struct *task)
{}
@@ -54,5 +71,13 @@ static inline void proc_coredump_connector(struct task_struct *task)
static inline void proc_exit_connector(struct task_struct *task)
{}
+
+static inline void proc_ns_connector_prepare(struct ns_event_prepare *prepare,
+ u16 reason)
+{}
+
+static inline void proc_ns_connector_send(struct ns_event_prepare *prepare,
+ struct task_struct *task)
+{}
#endif /* CONFIG_PROC_EVENTS */
#endif /* CN_PROC_H */
diff --git a/include/uapi/linux/cn_proc.h b/include/uapi/linux/cn_proc.h
index f6c2710..3270e8c 100644
--- a/include/uapi/linux/cn_proc.h
+++ b/include/uapi/linux/cn_proc.h
@@ -55,7 +55,8 @@ struct proc_event {
PROC_EVENT_SID = 0x00000080,
PROC_EVENT_PTRACE = 0x00000100,
PROC_EVENT_COMM = 0x00000200,
- /* "next" should be 0x00000400 */
+ PROC_EVENT_NS = 0x00000400,
+ /* "next" should be 0x00000800 */
/* "last" is the last process event: exit,
* while "next to last" is coredumping event */
PROC_EVENT_COREDUMP = 0x40000000,
@@ -112,6 +113,26 @@ struct proc_event {
char comm[16];
} comm;
+	/* There are 7 kinds of namespaces */
+ #define MAX_NS_PROC_EVENT_COUNT 7
+ struct ns_proc_event {
+ __kernel_pid_t process_pid;
+ __kernel_pid_t process_tgid;
+ enum reason {
+ PROC_NS_REASON_CLONE = 0x00000001,
+ PROC_NS_REASON_SETNS = 0x00000002,
+ PROC_NS_REASON_UNSHARE = 0x00000003,
+ PROC_NS_REASON_LAST = 0x80000000,
+ } reason;
+ __u32 count;
+ struct {
+ __u32 type; /* CLONE_NEWNS, CLONE_NEWPID, ... */
+ __u32 flags; /* unused */
+ __u64 old_inum;
+ __u64 inum;
+ } items[MAX_NS_PROC_EVENT_COUNT];
+ } ns;
+
struct coredump_proc_event {
__kernel_pid_t process_pid;
__kernel_pid_t process_tgid;
diff --git a/kernel/fork.c b/kernel/fork.c
index beb3172..a625394 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1759,6 +1759,7 @@ long _do_fork(unsigned long clone_flags,
struct task_struct *p;
int trace = 0;
long nr;
+ struct ns_event_prepare ns_event;
/*
* Determine whether and which event to report to ptracer. When
@@ -1778,8 +1779,11 @@ long _do_fork(unsigned long clone_flags,
trace = 0;
}
+ proc_ns_connector_prepare(&ns_event, PROC_NS_REASON_CLONE);
p = copy_process(clone_flags, stack_start, stack_size,
child_tidptr, NULL, trace, tls, NUMA_NO_NODE);
+	if (!IS_ERR(p))
+		proc_ns_connector_send(&ns_event, p);
+
/*
* Do this prior waking up the new thread - the thread pointer
* might get invalid after that point, if the thread exits quickly.
@@ -2024,6 +2028,7 @@ SYSCALL_DEFINE1(unshare, unsigned long, unshare_flags)
struct nsproxy *new_nsproxy = NULL;
int do_sysvsem = 0;
int err;
+ struct ns_event_prepare ns_event;
/*
* If unsharing a user namespace must also unshare the thread group
@@ -2050,6 +2055,9 @@ SYSCALL_DEFINE1(unshare, unsigned long, unshare_flags)
err = check_unshare_flags(unshare_flags);
if (err)
goto bad_unshare_out;
+
+ proc_ns_connector_prepare(&ns_event, PROC_NS_REASON_UNSHARE);
+
/*
* CLONE_NEWIPC must also detach from the undolist: after switching
* to a new ipc namespace, the semaphore arrays from the old
@@ -2115,6 +2123,8 @@ SYSCALL_DEFINE1(unshare, unsigned long, unshare_flags)
}
}
+ proc_ns_connector_send(&ns_event, current);
+
bad_unshare_cleanup_cred:
if (new_cred)
put_cred(new_cred);
diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c
index 782102e..16721fa 100644
--- a/kernel/nsproxy.c
+++ b/kernel/nsproxy.c
@@ -26,6 +26,7 @@
#include <linux/file.h>
#include <linux/syscalls.h>
#include <linux/cgroup.h>
+#include <linux/cn_proc.h>
static struct kmem_cache *nsproxy_cachep;
@@ -239,6 +240,7 @@ SYSCALL_DEFINE2(setns, int, fd, int, nstype)
struct nsproxy *new_nsproxy;
struct file *file;
struct ns_common *ns;
+ struct ns_event_prepare ns_event;
int err;
file = proc_ns_fget(fd);
@@ -250,6 +252,8 @@ SYSCALL_DEFINE2(setns, int, fd, int, nstype)
if (nstype && (ns->ops->type != nstype))
goto out;
+ proc_ns_connector_prepare(&ns_event, PROC_NS_REASON_SETNS);
+
new_nsproxy = create_new_namespaces(0, tsk, current_user_ns(), tsk->fs);
if (IS_ERR(new_nsproxy)) {
err = PTR_ERR(new_nsproxy);
@@ -262,6 +266,8 @@ SYSCALL_DEFINE2(setns, int, fd, int, nstype)
goto out;
}
switch_task_namespaces(tsk, new_nsproxy);
+
+ proc_ns_connector_send(&ns_event, current);
out:
fput(file);
return err;
--
2.7.4
* [RFC v2 2/2] proc connector: add a "get feature" op
From: Alban Crequy @ 2016-10-15 12:26 UTC
To: linux-kernel-u79uwXL29TY76Z2rM5mHXA,
containers-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA,
netdev-u79uwXL29TY76Z2rM5mHXA
Cc: Iago Lopez Galeiras, Aaron Campbell, Jiri Benc, Jesper Derehag,
Alban Crequy, Tejun Heo, Evgeniy Polyakov, Dimitri John Ledkov
From: Alban Crequy <alban-lYLaGTFnO9sWenYVfaLwtA@public.gmane.org>
As more kinds of events are being added to the proc connector, userspace
needs a way to detect whether the kernel supports those new events.
When a kind of event is not supported, userspace should report an error
properly, or fall back to other methods (regular polling of procfs).
The events fork, exec, uid, gid, sid, ptrace, comm and exit were added
together. Then commit 2b5faa4c ("connector: Added coredumping event to
the process connector") added coredump events, but without a way for
userspace to detect whether the kernel will emit them. So I am grouping
them all together in PROC_CN_FEATURE_BASIC.
- PROC_CN_FEATURE_BASIC: supports fork, exec, uid, gid, sid, ptrace,
comm, exit, coredump.
- PROC_CN_FEATURE_NS: supports ns.
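For illustration (not part of the patch), a sketch of how a client could
query these bits. The feature mask is returned in the flags field of the
acknowledgement cn_msg; a robust client would also check that ev->what
is PROC_EVENT_NONE and match msg->ack against its sequence number:

#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/socket.h>
#include <linux/netlink.h>
#include <linux/connector.h>
#include <linux/cn_proc.h>

int main(void)
{
	struct sockaddr_nl sa = {
		.nl_family = AF_NETLINK,
		.nl_groups = CN_IDX_PROC,
		.nl_pid    = getpid(),
	};
	struct __attribute__((aligned(NLMSG_ALIGNTO))) {
		struct nlmsghdr nl_hdr;
		struct __attribute__((__packed__)) {
			struct cn_msg cn_msg;
			enum proc_cn_mcast_op cn_op;
		};
	} req;
	char buf[256] __attribute__((aligned(NLMSG_ALIGNTO)));
	struct cn_msg *reply;
	int sock;

	sock = socket(PF_NETLINK, SOCK_DGRAM, NETLINK_CONNECTOR);
	bind(sock, (struct sockaddr *)&sa, sizeof(sa));

	/* Ask the kernel which proc connector features it supports. */
	memset(&req, 0, sizeof(req));
	req.nl_hdr.nlmsg_len = sizeof(req);
	req.nl_hdr.nlmsg_pid = getpid();
	req.nl_hdr.nlmsg_type = NLMSG_DONE;
	req.cn_msg.id.idx = CN_IDX_PROC;
	req.cn_msg.id.val = CN_VAL_PROC;
	req.cn_msg.len = sizeof(enum proc_cn_mcast_op);
	req.cn_op = PROC_CN_GET_FEATURES;
	send(sock, &req, sizeof(req), 0);

	if (recv(sock, buf, sizeof(buf), 0) > 0) {
		reply = NLMSG_DATA((struct nlmsghdr *)buf);
		if (reply->flags & PROC_CN_FEATURE_NS)
			printf("namespace events supported\n");
	}
	return 0;
}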
Signed-off-by: Alban Crequy <alban-lYLaGTFnO9sWenYVfaLwtA@public.gmane.org>
---
drivers/connector/cn_proc.c | 25 +++++++++++++++----------
include/uapi/linux/cn_proc.h | 4 ++++
2 files changed, 19 insertions(+), 10 deletions(-)
diff --git a/drivers/connector/cn_proc.c b/drivers/connector/cn_proc.c
index c38733d..5f9ace6 100644
--- a/drivers/connector/cn_proc.c
+++ b/drivers/connector/cn_proc.c
@@ -442,15 +442,12 @@ void proc_ns_connector_send(struct ns_event_prepare *prepare, struct task_struct
* values because it's not being returned via syscall return
* mechanisms.
*/
-static void cn_proc_ack(int err, int rcvd_seq, int rcvd_ack)
+static void cn_proc_ack(int err, u16 flags, int rcvd_seq, int rcvd_ack)
{
struct cn_msg *msg;
struct proc_event *ev;
__u8 buffer[CN_PROC_MSG_SIZE] __aligned(8);
- if (atomic_read(&proc_event_num_listeners) < 1)
- return;
-
msg = buffer_to_cn_msg(buffer);
ev = (struct proc_event *)msg->data;
memset(&ev->event_data, 0, sizeof(ev->event_data));
@@ -462,7 +459,7 @@ static void cn_proc_ack(int err, int rcvd_seq, int rcvd_ack)
memcpy(&msg->id, &cn_proc_event_id, sizeof(msg->id));
msg->ack = rcvd_ack + 1;
msg->len = sizeof(*ev);
- msg->flags = 0; /* not used */
+ msg->flags = flags;
send_msg(msg);
}
@@ -475,9 +472,12 @@ static void cn_proc_mcast_ctl(struct cn_msg *msg,
{
enum proc_cn_mcast_op *mc_op = NULL;
int err = 0;
+ u16 flags = 0;
- if (msg->len != sizeof(*mc_op))
- return;
+ if (msg->len != sizeof(*mc_op)) {
+ err = EINVAL;
+ goto out;
+ }
/*
* Events are reported with respect to the initial pid
@@ -485,8 +485,10 @@ static void cn_proc_mcast_ctl(struct cn_msg *msg,
* other namespaces.
*/
if ((current_user_ns() != &init_user_ns) ||
- (task_active_pid_ns(current) != &init_pid_ns))
- return;
+ (task_active_pid_ns(current) != &init_pid_ns)) {
+ err = EPERM;
+ goto out;
+ }
/* Can only change if privileged. */
if (!__netlink_ns_capable(nsp, &init_user_ns, CAP_NET_ADMIN)) {
@@ -496,6 +498,9 @@ static void cn_proc_mcast_ctl(struct cn_msg *msg,
mc_op = (enum proc_cn_mcast_op *)msg->data;
switch (*mc_op) {
+ case PROC_CN_GET_FEATURES:
+ flags = PROC_CN_FEATURE_BASIC | PROC_CN_FEATURE_NS;
+ break;
case PROC_CN_MCAST_LISTEN:
atomic_inc(&proc_event_num_listeners);
break;
@@ -508,7 +513,7 @@ static void cn_proc_mcast_ctl(struct cn_msg *msg,
}
out:
- cn_proc_ack(err, msg->seq, msg->ack);
+ cn_proc_ack(err, flags, msg->seq, msg->ack);
}
/*
diff --git a/include/uapi/linux/cn_proc.h b/include/uapi/linux/cn_proc.h
index 3270e8c..2ea0e5d 100644
--- a/include/uapi/linux/cn_proc.h
+++ b/include/uapi/linux/cn_proc.h
@@ -25,10 +25,14 @@
* for events on the connector.
*/
enum proc_cn_mcast_op {
+ PROC_CN_GET_FEATURES = 0,
PROC_CN_MCAST_LISTEN = 1,
PROC_CN_MCAST_IGNORE = 2
};
+#define PROC_CN_FEATURE_BASIC 0x0001
+#define PROC_CN_FEATURE_NS 0x0002
+
/*
* From the user's point of view, the process
* ID is the thread group ID and thread ID is the internal
--
2.7.4