Linux userland API discussions

Linux userland API discussions
 help / color / mirror / Atom feed

* [PATCH 3/7] task-diag: add ability to get information about all tasks
From: Andrey Vagin @ 2015-02-17  8:20 UTC (permalink / raw)
  To: linux-kernel-u79uwXL29TY76Z2rM5mHXA
  Cc: linux-api-u79uwXL29TY76Z2rM5mHXA, Oleg Nesterov, Andrew Morton,
	Cyrill Gorcunov, Pavel Emelyanov, Roger Luethi, Andrey Vagin
In-Reply-To: <1424161226-15176-1-git-send-email-avagin-GEFAQzZX7r8dnm+yROfE0A@public.gmane.org>

To do that, NLM_F_DUMP must be set in the request. Currently there are
no filters. Any suggestions are welcome.

I think we can add requests for children, threads, session or group
members.

Signed-off-by: Andrey Vagin <avagin-GEFAQzZX7r8dnm+yROfE0A@public.gmane.org>
---
 kernel/taskdiag.c | 41 +++++++++++++++++++++++++++++++++++++++++
 1 file changed, 41 insertions(+)

diff --git a/kernel/taskdiag.c b/kernel/taskdiag.c
index 5faf3f0..da4a51b 100644
--- a/kernel/taskdiag.c
+++ b/kernel/taskdiag.c
@@ -102,6 +102,46 @@ err:
 	return err;
 }
 
+static int taskdiag_dumpid(struct sk_buff *skb, struct netlink_callback *cb)
+{
+	struct pid_namespace *ns = task_active_pid_ns(current);
+	struct tgid_iter iter;
+	struct nlattr *na;
+	struct task_diag_pid *req;
+	int rc;
+
+	if (nlmsg_len(cb->nlh) < GENL_HDRLEN + sizeof(*req))
+		return -EINVAL;
+
+	na = nlmsg_data(cb->nlh) + GENL_HDRLEN;
+	if (na->nla_type < 0)
+		return -EINVAL;
+
+	req = (struct task_diag_pid *) nla_data(na);
+
+	iter.tgid = cb->args[0];
+	iter.task = NULL;
+	for (iter = next_tgid(ns, iter);
+	     iter.task;
+	     iter.tgid += 1, iter = next_tgid(ns, iter)) {
+		if (!ptrace_may_access(iter.task, PTRACE_MODE_READ))
+			continue;
+
+		rc = task_diag_fill(iter.task, skb, req->show_flags,
+				NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq);
+		if (rc < 0) {
+			put_task_struct(iter.task);
+			if (rc != -EMSGSIZE)
+				return rc;
+			break;
+		}
+	}
+
+	cb->args[0] = iter.tgid;
+
+	return skb->len;
+}
+
 static int taskdiag_doit(struct sk_buff *skb, struct genl_info *info)
 {
 	struct task_struct *tsk = NULL;
@@ -161,6 +201,7 @@ static const struct genl_ops taskdiag_ops[] = {
 	{
 		.cmd		= TASKDIAG_CMD_GET,
 		.doit		= taskdiag_doit,
+		.dumpit		= taskdiag_dumpid,
 		.policy		= taskstats_cmd_get_policy,
 	},
 };
-- 
2.1.0

^ permalink raw reply related

* [PATCH 4/7] task-diag: add a new group to get process credentials
From: Andrey Vagin @ 2015-02-17  8:20 UTC (permalink / raw)
  To: linux-kernel-u79uwXL29TY76Z2rM5mHXA
  Cc: linux-api-u79uwXL29TY76Z2rM5mHXA, Oleg Nesterov, Andrew Morton,
	Cyrill Gorcunov, Pavel Emelyanov, Roger Luethi, Andrey Vagin
In-Reply-To: <1424161226-15176-1-git-send-email-avagin-GEFAQzZX7r8dnm+yROfE0A@public.gmane.org>

A response is represented by the task_diag_creds structure:

struct task_diag_creds {
       struct task_diag_caps cap_inheritable;
       struct task_diag_caps cap_permitted;
       struct task_diag_caps cap_effective;
       struct task_diag_caps cap_bset;

       __u32 uid;
       __u32 euid;
       __u32 suid;
       __u32 fsuid;
       __u32 gid;
       __u32 egid;
       __u32 sgid;
       __u32 fsgid;
};

This group is optional and is filled only if show_flags contains
TASK_DIAG_SHOW_CRED.

Signed-off-by: Andrey Vagin <avagin-GEFAQzZX7r8dnm+yROfE0A@public.gmane.org>
---
 include/uapi/linux/taskdiag.h | 23 ++++++++++++++++++
 kernel/taskdiag.c             | 55 ++++++++++++++++++++++++++++++++++++++++++-
 2 files changed, 77 insertions(+), 1 deletion(-)

diff --git a/include/uapi/linux/taskdiag.h b/include/uapi/linux/taskdiag.h
index e1feb35..db12f6d 100644
--- a/include/uapi/linux/taskdiag.h
+++ b/include/uapi/linux/taskdiag.h
@@ -9,11 +9,14 @@
 
 enum {
 	/* optional attributes which can be specified in show_flags */
+	TASK_DIAG_CRED,
 
 	/* other attributes */
 	TASK_DIAG_MSG = 64,
 };
 
+#define TASK_DIAG_SHOW_CRED (1ULL << TASK_DIAG_CRED)
+
 enum {
 	TASK_DIAG_RUNNING,
 	TASK_DIAG_INTERRUPTIBLE,
@@ -37,6 +40,26 @@ struct task_diag_msg {
 	char	comm[TASK_DIAG_COMM_LEN];
 };
 
+struct task_diag_caps {
+	__u32 cap[_LINUX_CAPABILITY_U32S_3];
+};
+
+struct task_diag_creds {
+	struct task_diag_caps cap_inheritable;
+	struct task_diag_caps cap_permitted;
+	struct task_diag_caps cap_effective;
+	struct task_diag_caps cap_bset;
+
+	__u32 uid;
+	__u32 euid;
+	__u32 suid;
+	__u32 fsuid;
+	__u32 gid;
+	__u32 egid;
+	__u32 sgid;
+	__u32 fsgid;
+};
+
 enum {
 	TASKDIAG_CMD_UNSPEC = 0,	/* Reserved */
 	TASKDIAG_CMD_GET,
diff --git a/kernel/taskdiag.c b/kernel/taskdiag.c
index da4a51b..6ccbcaf 100644
--- a/kernel/taskdiag.c
+++ b/kernel/taskdiag.c
@@ -15,7 +15,14 @@ static struct genl_family family = {
 
 static size_t taskdiag_packet_size(u64 show_flags)
 {
-	return nla_total_size(sizeof(struct task_diag_msg));
+	size_t size;
+
+	size = nla_total_size(sizeof(struct task_diag_msg));
+
+	if (show_flags & TASK_DIAG_SHOW_CRED)
+		size += nla_total_size(sizeof(struct task_diag_creds));
+
+	return size;
 }
 
 /*
@@ -82,6 +89,46 @@ static int fill_task_msg(struct task_struct *p, struct sk_buff *skb)
 	return 0;
 }
 
+static inline void caps2diag(struct task_diag_caps *diag, const kernel_cap_t *cap)
+{
+	int i;
+
+	for (i = 0; i < _LINUX_CAPABILITY_U32S_3; i++)
+		diag->cap[i] = cap->cap[i];
+}
+
+static int fill_creds(struct task_struct *p, struct sk_buff *skb)
+{
+	struct user_namespace *user_ns = current_user_ns();
+	struct task_diag_creds *diag_cred;
+	const struct cred *cred;
+	struct nlattr *attr;
+
+	attr = nla_reserve(skb, TASK_DIAG_CRED, sizeof(struct task_diag_creds));
+	if (!attr)
+		return -EMSGSIZE;
+
+	diag_cred = nla_data(attr);
+
+	cred = get_task_cred(p);
+
+	caps2diag(&diag_cred->cap_inheritable, &cred->cap_inheritable);
+	caps2diag(&diag_cred->cap_permitted, &cred->cap_permitted);
+	caps2diag(&diag_cred->cap_effective, &cred->cap_effective);
+	caps2diag(&diag_cred->cap_bset, &cred->cap_bset);
+
+	diag_cred->uid   = from_kuid_munged(user_ns, cred->uid);
+	diag_cred->euid  = from_kuid_munged(user_ns, cred->euid);
+	diag_cred->suid  = from_kuid_munged(user_ns, cred->suid);
+	diag_cred->fsuid = from_kuid_munged(user_ns, cred->fsuid);
+	diag_cred->gid   = from_kgid_munged(user_ns, cred->gid);
+	diag_cred->egid  = from_kgid_munged(user_ns, cred->egid);
+	diag_cred->sgid  = from_kgid_munged(user_ns, cred->sgid);
+	diag_cred->fsgid = from_kgid_munged(user_ns, cred->fsgid);
+
+	return 0;
+}
+
 static int task_diag_fill(struct task_struct *tsk, struct sk_buff *skb,
 				u64 show_flags, u32 portid, u32 seq)
 {
@@ -96,6 +143,12 @@ static int task_diag_fill(struct task_struct *tsk, struct sk_buff *skb,
 	if (err)
 		goto err;
 
+	if (show_flags & TASK_DIAG_SHOW_CRED) {
+		err = fill_creds(tsk, skb);
+		if (err)
+			goto err;
+	}
+
 	return genlmsg_end(skb, reply);
 err:
 	genlmsg_cancel(skb, reply);
-- 
2.1.0

^ permalink raw reply related

* [PATCH 5/7] kernel: add ability to iterate children of a specified task
From: Andrey Vagin @ 2015-02-17  8:20 UTC (permalink / raw)
  To: linux-kernel
  Cc: linux-api, Oleg Nesterov, Andrew Morton, Cyrill Gorcunov,
	Pavel Emelyanov, Roger Luethi, Andrey Vagin
In-Reply-To: <1424161226-15176-1-git-send-email-avagin@openvz.org>

The interface is similar to the tgid iterator. It is used in
procfs and it will be used in task_diag.

Signed-off-by: Andrey Vagin <avagin@openvz.org>
---
 fs/proc/array.c         | 58 +++++++++++++------------------------------------
 include/linux/proc_fs.h |  6 +++++
 kernel/pid.c            | 55 ++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 76 insertions(+), 43 deletions(-)

diff --git a/fs/proc/array.c b/fs/proc/array.c
index bd117d0..7197c6a 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -579,54 +579,26 @@ get_children_pid(struct inode *inode, struct pid *pid_prev, loff_t pos)
 {
 	struct task_struct *start, *task;
 	struct pid *pid = NULL;
+	struct child_iter iter;
 
-	read_lock(&tasklist_lock);
-
-	start = pid_task(proc_pid(inode), PIDTYPE_PID);
+	start = get_proc_task(inode);
 	if (!start)
-		goto out;
+		return NULL;
 
-	/*
-	 * Lets try to continue searching first, this gives
-	 * us significant speedup on children-rich processes.
-	 */
-	if (pid_prev) {
-		task = pid_task(pid_prev, PIDTYPE_PID);
-		if (task && task->real_parent == start &&
-		    !(list_empty(&task->sibling))) {
-			if (list_is_last(&task->sibling, &start->children))
-				goto out;
-			task = list_first_entry(&task->sibling,
-						struct task_struct, sibling);
-			pid = get_pid(task_pid(task));
-			goto out;
-		}
-	}
+	if (pid_prev)
+		task = get_pid_task(pid_prev, PIDTYPE_PID);
+	else
+		task = NULL;
 
-	/*
-	 * Slow search case.
-	 *
-	 * We might miss some children here if children
-	 * are exited while we were not holding the lock,
-	 * but it was never promised to be accurate that
-	 * much.
-	 *
-	 * "Just suppose that the parent sleeps, but N children
-	 *  exit after we printed their tids. Now the slow paths
-	 *  skips N extra children, we miss N tasks." (c)
-	 *
-	 * So one need to stop or freeze the leader and all
-	 * its children to get a precise result.
-	 */
-	list_for_each_entry(task, &start->children, sibling) {
-		if (pos-- == 0) {
-			pid = get_pid(task_pid(task));
-			break;
-		}
-	}
+	iter.parent = start;
+	iter.task = task;
+	iter.pos = pos;
+
+	iter = next_child(iter);
 
-out:
-	read_unlock(&tasklist_lock);
+	put_task_struct(start);
+	if (iter.task)
+		pid = get_pid(task_pid(iter.task));
 	return pid;
 }
 
diff --git a/include/linux/proc_fs.h b/include/linux/proc_fs.h
index 136b6ed..eba98bc 100644
--- a/include/linux/proc_fs.h
+++ b/include/linux/proc_fs.h
@@ -89,4 +89,10 @@ struct tgid_iter {
 
 struct tgid_iter next_tgid(struct pid_namespace *ns, struct tgid_iter iter);
 
+struct child_iter {
+	struct task_struct      *task, *parent;
+	unsigned int		pos;
+};
+
+struct child_iter next_child(struct child_iter iter);
 #endif /* _LINUX_PROC_FS_H */
diff --git a/kernel/pid.c b/kernel/pid.c
index 082307a..6e3e42a 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -606,6 +606,61 @@ retry:
 	return iter;
 }
 
+struct child_iter next_child(struct child_iter iter)
+{
+	struct task_struct *task;
+	loff_t pos = iter.pos;
+
+	read_lock(&tasklist_lock);
+
+	/*
+	 * Lets try to continue searching first, this gives
+	 * us significant speedup on children-rich processes.
+	 */
+	if (iter.task) {
+		task = iter.task;
+		if (task && task->real_parent == iter.parent &&
+		    !(list_empty(&task->sibling))) {
+			if (list_is_last(&task->sibling, &iter.parent->children)) {
+				task = NULL;
+				goto out;
+			}
+			task = list_first_entry(&task->sibling,
+						struct task_struct, sibling);
+			goto out;
+		}
+	}
+
+	/*
+	 * Slow search case.
+	 *
+	 * We might miss some children here if children
+	 * are exited while we were not holding the lock,
+	 * but it was never promised to be accurate that
+	 * much.
+	 *
+	 * "Just suppose that the parent sleeps, but N children
+	 *  exit after we printed their tids. Now the slow paths
+	 *  skips N extra children, we miss N tasks." (c)
+	 *
+	 * So one need to stop or freeze the leader and all
+	 * its children to get a precise result.
+	 */
+	list_for_each_entry(task, &iter.parent->children, sibling) {
+		if (pos-- == 0)
+			goto out;
+	}
+	task = NULL;
+out:
+	if (iter.task)
+		put_task_struct(iter.task);
+	if (task)
+		get_task_struct(task);
+	iter.task = task;
+	read_unlock(&tasklist_lock);
+	return iter;
+}
+
 /*
  * The pid hash table is scaled according to the amount of memory in the
  * machine.  From a minimum of 16 slots up to 4096 slots at one gigabyte or
-- 
2.1.0

^ permalink raw reply related

* [PATCH 6/7] task_diag: add ability to dump children
From: Andrey Vagin @ 2015-02-17  8:20 UTC (permalink / raw)
  To: linux-kernel
  Cc: linux-api, Oleg Nesterov, Andrew Morton, Cyrill Gorcunov,
	Pavel Emelyanov, Roger Luethi, Andrey Vagin
In-Reply-To: <1424161226-15176-1-git-send-email-avagin@openvz.org>

Now we can dump all tasks or the children of a specified task.
This is an example of how this interface can be expanded for different
use cases.

Signed-off-by: Andrey Vagin <avagin@openvz.org>
---
 include/uapi/linux/taskdiag.h |  1 +
 kernel/taskdiag.c             | 83 +++++++++++++++++++++++++++++++++++++------
 2 files changed, 73 insertions(+), 11 deletions(-)

diff --git a/include/uapi/linux/taskdiag.h b/include/uapi/linux/taskdiag.h
index db12f6d..d8a9e92 100644
--- a/include/uapi/linux/taskdiag.h
+++ b/include/uapi/linux/taskdiag.h
@@ -68,6 +68,7 @@ enum {
 #define TASKDIAG_CMD_MAX (__TASKDIAG_CMD_MAX - 1)
 
 #define TASK_DIAG_DUMP_ALL	0
+#define TASK_DIAG_DUMP_CHILDREN	1
 
 struct task_diag_pid {
 	__u64	show_flags;
diff --git a/kernel/taskdiag.c b/kernel/taskdiag.c
index 6ccbcaf..951ecbd 100644
--- a/kernel/taskdiag.c
+++ b/kernel/taskdiag.c
@@ -155,12 +155,71 @@ err:
 	return err;
 }
 
+struct task_iter {
+	struct task_diag_pid *req;
+	struct pid_namespace *ns;
+	struct netlink_callback *cb;
+
+	union {
+		struct tgid_iter tgid;
+		struct child_iter child;
+	};
+};
+
+static struct task_struct *iter_start(struct task_iter *iter)
+{
+	switch (iter->req->dump_stratagy) {
+	case TASK_DIAG_DUMP_CHILDREN:
+		rcu_read_lock();
+		iter->child.parent = find_task_by_pid_ns(iter->req->pid, iter->ns);
+		if (iter->child.parent)
+			get_task_struct(iter->child.parent);
+		rcu_read_unlock();
+
+		if (iter->child.parent == NULL)
+			return ERR_PTR(-ESRCH);
+
+		iter->child.pos = iter->cb->args[0];
+		iter->child.task = NULL;
+		iter->child = next_child(iter->child);
+		return iter->child.task;
+
+	case TASK_DIAG_DUMP_ALL:
+		iter->tgid.tgid = iter->cb->args[0];
+		iter->tgid.task = NULL;
+		iter->tgid = next_tgid(iter->ns, iter->tgid);
+		return iter->tgid.task;
+	}
+
+	return ERR_PTR(-EINVAL);
+}
+
+static struct task_struct *iter_next(struct task_iter *iter)
+{
+	switch (iter->req->dump_stratagy) {
+	case TASK_DIAG_DUMP_CHILDREN:
+		iter->child.pos += 1;
+		iter->child = next_child(iter->child);
+		iter->cb->args[0] = iter->child.pos;
+		return iter->child.task;
+
+	case TASK_DIAG_DUMP_ALL:
+		iter->tgid.tgid += 1;
+		iter->tgid = next_tgid(iter->ns, iter->tgid);
+		iter->cb->args[0] = iter->tgid.tgid;
+		return iter->tgid.task;
+	}
+
+	return NULL;
+}
+
 static int taskdiag_dumpid(struct sk_buff *skb, struct netlink_callback *cb)
 {
 	struct pid_namespace *ns = task_active_pid_ns(current);
-	struct tgid_iter iter;
+	struct task_iter iter;
 	struct nlattr *na;
 	struct task_diag_pid *req;
+	struct task_struct *task;
 	int rc;
 
 	if (nlmsg_len(cb->nlh) < GENL_HDRLEN + sizeof(*req))
@@ -172,26 +231,28 @@ static int taskdiag_dumpid(struct sk_buff *skb, struct netlink_callback *cb)
 
 	req = (struct task_diag_pid *) nla_data(na);
 
-	iter.tgid = cb->args[0];
-	iter.task = NULL;
-	for (iter = next_tgid(ns, iter);
-	     iter.task;
-	     iter.tgid += 1, iter = next_tgid(ns, iter)) {
-		if (!ptrace_may_access(iter.task, PTRACE_MODE_READ))
+	iter.req = req;
+	iter.ns  = ns;
+	iter.cb  = cb;
+
+	task = iter_start(&iter);
+	if (IS_ERR(task) < 0)
+		return PTR_ERR(task);
+
+	for (; task; task = iter_next(&iter)) {
+		if (!ptrace_may_access(task, PTRACE_MODE_READ))
 			continue;
 
-		rc = task_diag_fill(iter.task, skb, req->show_flags,
+		rc = task_diag_fill(task, skb, req->show_flags,
 				NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq);
 		if (rc < 0) {
-			put_task_struct(iter.task);
+			put_task_struct(task);
 			if (rc != -EMSGSIZE)
 				return rc;
 			break;
 		}
 	}
 
-	cb->args[0] = iter.tgid;
-
 	return skb->len;
 }
 
-- 
2.1.0

^ permalink raw reply related

* [PATCH 7/7] selftest: check the task_diag functionality
From: Andrey Vagin @ 2015-02-17  8:20 UTC (permalink / raw)
  To: linux-kernel
  Cc: linux-api, Oleg Nesterov, Andrew Morton, Cyrill Gorcunov,
	Pavel Emelyanov, Roger Luethi, Andrey Vagin
In-Reply-To: <1424161226-15176-1-git-send-email-avagin@openvz.org>

Here are two test (example) programs.

task_diag - request information for two processes.
task_diag_all - request information about all processes

Signed-off-by: Andrey Vagin <avagin@openvz.org>
---
 tools/testing/selftests/Makefile                   |   1 +
 tools/testing/selftests/task_diag/Makefile         |  16 ++
 tools/testing/selftests/task_diag/task_diag.c      |  56 ++++++
 tools/testing/selftests/task_diag/task_diag_all.c  |  82 ++++++++
 tools/testing/selftests/task_diag/task_diag_comm.c | 206 +++++++++++++++++++++
 tools/testing/selftests/task_diag/task_diag_comm.h |  47 +++++
 tools/testing/selftests/task_diag/taskdiag.h       |   1 +
 7 files changed, 409 insertions(+)
 create mode 100644 tools/testing/selftests/task_diag/Makefile
 create mode 100644 tools/testing/selftests/task_diag/task_diag.c
 create mode 100644 tools/testing/selftests/task_diag/task_diag_all.c
 create mode 100644 tools/testing/selftests/task_diag/task_diag_comm.c
 create mode 100644 tools/testing/selftests/task_diag/task_diag_comm.h
 create mode 120000 tools/testing/selftests/task_diag/taskdiag.h

diff --git a/tools/testing/selftests/Makefile b/tools/testing/selftests/Makefile
index 4e51122..c73d888 100644
--- a/tools/testing/selftests/Makefile
+++ b/tools/testing/selftests/Makefile
@@ -17,6 +17,7 @@ TARGETS += sysctl
 TARGETS += timers
 TARGETS += user
 TARGETS += vm
+TARGETS += task_diag
 #Please keep the TARGETS list alphabetically sorted
 
 TARGETS_HOTPLUG = cpu-hotplug
diff --git a/tools/testing/selftests/task_diag/Makefile b/tools/testing/selftests/task_diag/Makefile
new file mode 100644
index 0000000..d6583c4
--- /dev/null
+++ b/tools/testing/selftests/task_diag/Makefile
@@ -0,0 +1,16 @@
+all: task_diag task_diag_all
+
+run_tests: all
+	@./task_diag && ./task_diag_all && echo "task_diag: [PASS]" || echo "task_diag: [FAIL]"
+
+CFLAGS += -Wall -O2
+
+task_diag.o: task_diag.c task_diag_comm.h
+task_diag_all.o: task_diag_all.c task_diag_comm.h
+task_diag_comm.o: task_diag_comm.c task_diag_comm.h
+
+task_diag_all: task_diag_all.o task_diag_comm.o
+task_diag: task_diag.o task_diag_comm.o
+
+clean:
+	rm -rf task_diag task_diag_all task_diag_comm.o task_diag_all.o task_diag.o
diff --git a/tools/testing/selftests/task_diag/task_diag.c b/tools/testing/selftests/task_diag/task_diag.c
new file mode 100644
index 0000000..fafeeac
--- /dev/null
+++ b/tools/testing/selftests/task_diag/task_diag.c
@@ -0,0 +1,56 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <errno.h>
+#include <unistd.h>
+#include <poll.h>
+#include <string.h>
+#include <fcntl.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <sys/socket.h>
+#include <sys/wait.h>
+#include <signal.h>
+
+#include <linux/genetlink.h>
+#include "taskdiag.h"
+#include "task_diag_comm.h"
+
+int main(int argc, char *argv[])
+{
+	int exit_status = 1;
+	int rc, rep_len, id;
+	int nl_sd = -1;
+	struct task_diag_pid req;
+	char buf[4096];
+
+	req.show_flags = TASK_DIAG_SHOW_CRED;
+	req.pid = getpid();
+
+	nl_sd = create_nl_socket(NETLINK_GENERIC);
+	if (nl_sd < 0)
+		return -1;
+
+	id = get_family_id(nl_sd);
+	if (!id)
+		goto err;
+
+	rc = send_cmd(nl_sd, id, getpid(), TASKDIAG_CMD_GET,
+		      TASKDIAG_CMD_ATTR_GET, &req, sizeof(req), 0);
+	pr_info("Sent pid/tgid, retval %d\n", rc);
+	if (rc < 0)
+		goto err;
+
+	rep_len = recv(nl_sd, buf, sizeof(buf), 0);
+	if (rep_len < 0) {
+		pr_perror("Unable to receive a response\n");
+		goto err;
+	}
+	pr_info("received %d bytes\n", rep_len);
+
+	nlmsg_receive(buf, rep_len, &show_task);
+
+	exit_status = 0;
+err:
+	close(nl_sd);
+	return exit_status;
+}
diff --git a/tools/testing/selftests/task_diag/task_diag_all.c b/tools/testing/selftests/task_diag/task_diag_all.c
new file mode 100644
index 0000000..85e1a0a
--- /dev/null
+++ b/tools/testing/selftests/task_diag/task_diag_all.c
@@ -0,0 +1,82 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <errno.h>
+#include <unistd.h>
+#include <poll.h>
+#include <string.h>
+#include <fcntl.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <sys/socket.h>
+#include <sys/wait.h>
+#include <signal.h>
+
+#include "task_diag_comm.h"
+#include "taskdiag.h"
+
+int tasks;
+
+
+extern int _show_task(struct nlmsghdr *hdr)
+{
+	tasks++;
+	return show_task(hdr);
+}
+
+int main(int argc, char *argv[])
+{
+	int exit_status = 1;
+	int rc, rep_len, id;
+	int nl_sd = -1;
+	struct {
+		struct task_diag_pid req;
+	} pid_req;
+	char buf[4096];
+
+	quiet = 0;
+
+	pid_req.req.show_flags = 0;
+	pid_req.req.dump_stratagy = TASK_DIAG_DUMP_ALL;
+	pid_req.req.pid = 1;
+
+	nl_sd = create_nl_socket(NETLINK_GENERIC);
+	if (nl_sd < 0)
+		return -1;
+
+	id = get_family_id(nl_sd);
+	if (!id)
+		goto err;
+
+	rc = send_cmd(nl_sd, id, getpid(), TASKDIAG_CMD_GET,
+		      TASKDIAG_CMD_ATTR_GET, &pid_req, sizeof(pid_req), 1);
+	pr_info("Sent pid/tgid, retval %d\n", rc);
+	if (rc < 0)
+		goto err;
+
+	while (1) {
+		int err;
+
+		rep_len = recv(nl_sd, buf, sizeof(buf), 0);
+		pr_info("received %d bytes\n", rep_len);
+
+		if (rep_len < 0) {
+			pr_perror("Unable to receive a response\n");
+			goto err;
+		}
+
+		if (rep_len == 0)
+			break;
+
+		err = nlmsg_receive(buf, rep_len, &_show_task);
+		if (err < 0)
+			goto err;
+		if (err == 0)
+			break;
+	}
+	printf("tasks: %d\n", tasks);
+
+	exit_status = 0;
+err:
+	close(nl_sd);
+	return exit_status;
+}
diff --git a/tools/testing/selftests/task_diag/task_diag_comm.c b/tools/testing/selftests/task_diag/task_diag_comm.c
new file mode 100644
index 0000000..df7780d
--- /dev/null
+++ b/tools/testing/selftests/task_diag/task_diag_comm.c
@@ -0,0 +1,206 @@
+#include <errno.h>
+#include <string.h>
+#include <unistd.h>
+
+#include <sys/types.h>
+#include <sys/socket.h>
+#include <linux/genetlink.h>
+
+#include "taskdiag.h"
+#include "task_diag_comm.h"
+
+int quiet = 0;
+
+/*
+ * Create a raw netlink socket and bind
+ */
+int create_nl_socket(int protocol)
+{
+	int fd;
+	struct sockaddr_nl local;
+
+	fd = socket(AF_NETLINK, SOCK_RAW, protocol);
+	if (fd < 0)
+		return -1;
+
+	memset(&local, 0, sizeof(local));
+	local.nl_family = AF_NETLINK;
+
+	if (bind(fd, (struct sockaddr *) &local, sizeof(local)) < 0)
+		goto error;
+
+	return fd;
+error:
+	close(fd);
+	return -1;
+}
+
+
+int send_cmd(int sd, __u16 nlmsg_type, __u32 nlmsg_pid,
+	     __u8 genl_cmd, __u16 nla_type,
+	     void *nla_data, int nla_len, int dump)
+{
+	struct nlattr *na;
+	struct sockaddr_nl nladdr;
+	int r, buflen;
+	char *buf;
+
+	struct msgtemplate msg;
+
+	msg.n.nlmsg_len = NLMSG_LENGTH(GENL_HDRLEN);
+	msg.n.nlmsg_type = nlmsg_type;
+	msg.n.nlmsg_flags = NLM_F_REQUEST;
+	if (dump)
+		msg.n.nlmsg_flags |= NLM_F_DUMP;
+	msg.n.nlmsg_seq = 0;
+	msg.n.nlmsg_pid = nlmsg_pid;
+	msg.g.cmd = genl_cmd;
+	msg.g.version = 0x1;
+	na = (struct nlattr *) GENLMSG_DATA(&msg);
+	na->nla_type = nla_type;
+	na->nla_len = nla_len + 1 + NLA_HDRLEN;
+	memcpy(NLA_DATA(na), nla_data, nla_len);
+	msg.n.nlmsg_len += NLMSG_ALIGN(na->nla_len);
+
+	buf = (char *) &msg;
+	buflen = msg.n.nlmsg_len;
+	memset(&nladdr, 0, sizeof(nladdr));
+	nladdr.nl_family = AF_NETLINK;
+	r = sendto(sd, buf, buflen, 0, (struct sockaddr *) &nladdr,
+			   sizeof(nladdr));
+	if (r != buflen) {
+		pr_perror("Unable to send %d (%d)", r, buflen);
+		return -1;
+	}
+	return 0;
+}
+
+
+/*
+ * Probe the controller in genetlink to find the family id
+ * for the TASKDIAG family
+ */
+int get_family_id(int sd)
+{
+	char name[100];
+	struct msgtemplate ans;
+
+	int id = 0, rc;
+	struct nlattr *na;
+	int rep_len;
+
+	strcpy(name, TASKDIAG_GENL_NAME);
+	rc = send_cmd(sd, GENL_ID_CTRL, getpid(), CTRL_CMD_GETFAMILY,
+			CTRL_ATTR_FAMILY_NAME, (void *)name,
+			strlen(TASKDIAG_GENL_NAME) + 1, 0);
+	if (rc < 0)
+		return -1;
+
+	rep_len = recv(sd, &ans, sizeof(ans), 0);
+	if (ans.n.nlmsg_type == NLMSG_ERROR ||
+	    (rep_len < 0) || !NLMSG_OK((&ans.n), rep_len))
+		return 0;
+
+	na = (struct nlattr *) GENLMSG_DATA(&ans);
+	na = (struct nlattr *) ((char *) na + NLA_ALIGN(na->nla_len));
+	if (na->nla_type == CTRL_ATTR_FAMILY_ID)
+		id = *(__u16 *) NLA_DATA(na);
+
+	return id;
+}
+
+int nlmsg_receive(void *buf, int len, int (*cb)(struct nlmsghdr *))
+{
+	struct nlmsghdr *hdr;
+
+	for (hdr = (struct nlmsghdr *)buf;
+			NLMSG_OK(hdr, len); hdr = NLMSG_NEXT(hdr, len)) {
+
+		if (hdr->nlmsg_type == NLMSG_DONE) {
+			int *len = (int *)NLMSG_DATA(hdr);
+
+			if (*len < 0) {
+				pr_err("ERROR %d reported by netlink (%s)\n",
+					*len, strerror(-*len));
+				return *len;
+			}
+
+			return 0;
+		}
+
+		if (hdr->nlmsg_type == NLMSG_ERROR) {
+			struct nlmsgerr *err = (struct nlmsgerr *)NLMSG_DATA(hdr);
+
+			if (hdr->nlmsg_len - sizeof(*hdr) < sizeof(struct nlmsgerr)) {
+				pr_err("ERROR truncated\n");
+				return -1;
+			}
+
+			if (err->error == 0)
+				return 0;
+
+			return -1;
+		}
+		if (cb && cb(hdr))
+			return -1;
+	}
+
+	return 1;
+}
+
+int show_task(struct nlmsghdr *hdr)
+{
+	int msg_len;
+	struct msgtemplate *msg;
+	struct nlattr *na;
+	int len;
+
+	msg_len = GENLMSG_PAYLOAD(hdr);
+
+	msg = (struct msgtemplate *)hdr;
+	na = (struct nlattr *) GENLMSG_DATA(msg);
+	len = 0;
+	while (len < msg_len) {
+		len += NLA_ALIGN(na->nla_len);
+		switch (na->nla_type) {
+		case TASK_DIAG_MSG:
+		{
+			struct task_diag_msg *msg;
+
+			/* For nested attributes, na follows */
+			msg = (struct task_diag_msg *) NLA_DATA(na);
+			pr_info("pid %d ppid %d comm %s\n", msg->pid, msg->ppid, msg->comm);
+			break;
+		}
+		case TASK_DIAG_CRED:
+		{
+			struct task_diag_creds *creds;
+
+			creds = (struct task_diag_creds *) NLA_DATA(na);
+			pr_info("uid: %d %d %d %d\n", creds->uid,
+					creds->euid, creds->suid, creds->fsuid);
+			pr_info("gid: %d %d %d %d\n", creds->uid,
+					creds->euid, creds->suid, creds->fsuid);
+			pr_info("CapInh: %08x%08x\n",
+						creds->cap_inheritable.cap[1],
+						creds->cap_inheritable.cap[0]);
+			pr_info("CapPrm: %08x%08x\n",
+						creds->cap_permitted.cap[1],
+						creds->cap_permitted.cap[0]);
+			pr_info("CapEff: %08x%08x\n",
+						creds->cap_effective.cap[1],
+						creds->cap_effective.cap[0]);
+			pr_info("CapBnd: %08x%08x\n", creds->cap_bset.cap[1],
+						creds->cap_bset.cap[0]);
+			break;
+		}
+		default:
+			pr_err("Unknown nla_type %d\n",
+				na->nla_type);
+			return -1;
+		}
+		na = (struct nlattr *) (GENLMSG_DATA(msg) + len);
+	}
+
+	return 0;
+}
diff --git a/tools/testing/selftests/task_diag/task_diag_comm.h b/tools/testing/selftests/task_diag/task_diag_comm.h
new file mode 100644
index 0000000..42f2088
--- /dev/null
+++ b/tools/testing/selftests/task_diag/task_diag_comm.h
@@ -0,0 +1,47 @@
+#ifndef __TASK_DIAG_COMM__
+#define __TASK_DIAG_COMM__
+
+#include <stdio.h>
+
+#include <linux/genetlink.h>
+#include "taskdiag.h"
+
+/*
+ * Generic macros for dealing with netlink sockets. Might be duplicated
+ * elsewhere. It is recommended that commercial grade applications use
+ * libnl or libnetlink and use the interfaces provided by the library
+ */
+#define GENLMSG_DATA(glh)	((void *)(NLMSG_DATA(glh) + GENL_HDRLEN))
+#define GENLMSG_PAYLOAD(glh)	(NLMSG_PAYLOAD(glh, 0) - GENL_HDRLEN)
+#define NLA_DATA(na)		((void *)((char *)(na) + NLA_HDRLEN))
+#define NLA_PAYLOAD(len)	(len - NLA_HDRLEN)
+
+#define pr_err(fmt, ...)				\
+		fprintf(stderr, fmt, ##__VA_ARGS__)
+
+#define pr_perror(fmt, ...)				\
+		fprintf(stderr, fmt " : %m\n", ##__VA_ARGS__)
+
+extern int quiet;
+#define pr_info(fmt, arg...)			\
+	do {					\
+		if (!quiet)			\
+			printf(fmt, ##arg);	\
+	} while (0)				\
+
+struct msgtemplate {
+	struct nlmsghdr n;
+	struct genlmsghdr g;
+	char body[4096];
+};
+
+extern int create_nl_socket(int protocol);
+extern int send_cmd(int sd, __u16 nlmsg_type, __u32 nlmsg_pid,
+	     __u8 genl_cmd, __u16 nla_type,
+	     void *nla_data, int nla_len, int dump);
+
+extern int get_family_id(int sd);
+extern int nlmsg_receive(void *buf, int len, int (*cb)(struct nlmsghdr *));
+extern int show_task(struct nlmsghdr *hdr);
+
+#endif /* __TASK_DIAG_COMM__ */
diff --git a/tools/testing/selftests/task_diag/taskdiag.h b/tools/testing/selftests/task_diag/taskdiag.h
new file mode 120000
index 0000000..83e857e
--- /dev/null
+++ b/tools/testing/selftests/task_diag/taskdiag.h
@@ -0,0 +1 @@
+../../../../include/uapi/linux/taskdiag.h
\ No newline at end of file
-- 
2.1.0

^ permalink raw reply related

* Re: [PATCH 0/7] [RFC] kernel: add a netlink interface to get information about processes
From: Arnd Bergmann @ 2015-02-17  8:53 UTC (permalink / raw)
  To: Andrey Vagin
  Cc: linux-kernel-u79uwXL29TY76Z2rM5mHXA,
	linux-api-u79uwXL29TY76Z2rM5mHXA, Oleg Nesterov, Andrew Morton,
	Cyrill Gorcunov, Pavel Emelyanov, Roger Luethi
In-Reply-To: <1424161226-15176-1-git-send-email-avagin-GEFAQzZX7r8dnm+yROfE0A@public.gmane.org>

On Tuesday 17 February 2015 11:20:19 Andrey Vagin wrote:
> task_diag is based on netlink sockets and looks like socket-diag, which
> is used to get information about sockets.
> 
> A request is described by the task_diag_pid structure:
> 
> struct task_diag_pid {
>        __u64   show_flags;      /* specify which information are required */
>        __u64   dump_stratagy;   /* specify a group of processes */
> 
>        __u32   pid;
> };

Can you explain how the interface relates to the 'taskstats' genetlink
API? Did you consider extending that interface to provide the
information you need instead of basing it on socket-diag?

	Arnd

^ permalink raw reply

* Re: [PATCH 15/45] dm-log-userspace.h: include stdint.h in userspace
From: Arnd Bergmann @ 2015-02-17  9:08 UTC (permalink / raw)
  To: Mikko Rapeli
  Cc: linux-kernel-u79uwXL29TY76Z2rM5mHXA, Alasdair Kergon,
	Mike Snitzer, dm-devel-H+wXaHxf7aLQT0dZR+AlfA,
	linux-api-u79uwXL29TY76Z2rM5mHXA
In-Reply-To: <1424127948-22484-16-git-send-email-mikko.rapeli-X3B1VOXEql0@public.gmane.org>

On Tuesday 17 February 2015 00:05:18 Mikko Rapeli wrote:
> --- a/include/uapi/linux/dm-log-userspace.h
> +++ b/include/uapi/linux/dm-log-userspace.h
> @@ -7,6 +7,11 @@
>  #ifndef __DM_LOG_USERSPACE_H__
>  #define __DM_LOG_USERSPACE_H__
>  
> +#ifdef __KERNEL__
> +#include <linux/types.h>
> +#else
> +#include <stdint.h>
> +#endif
>  #include <linux/dm-ioctl.h> /* For DM_UUID_LEN */
> 

The normal way to do this in kernel headers is to use linux/types.h
but change the data structures to use __u64 instead of uint64_t
to avoid the build error.

It's possible that the maintainers of this code have a strong opinion
on this matter, but try doing that first.

	Arnd

^ permalink raw reply

* Re: [PATCH 45/45] include/uapi/asm-generic/ucontext.h: include signal.h and sigcontext.h
From: Arnd Bergmann @ 2015-02-17  9:10 UTC (permalink / raw)
  To: Mikko Rapeli; +Cc: linux-kernel, linux-arch, linux-api
In-Reply-To: <1424127948-22484-46-git-send-email-mikko.rapeli@iki.fi>

On Tuesday 17 February 2015 00:05:48 Mikko Rapeli wrote:
>  #ifndef __ASM_GENERIC_UCONTEXT_H
>  #define __ASM_GENERIC_UCONTEXT_H
>  
> +#include <asm-generic/signal.h>
> +#include <asm/sigcontext.h>
> +
>  struct ucontext {
> 

Including another asm-generic header here is a bad idea: it breaks
if an architecture overrides asm/signal.h with its own version
but wants to use the asm-generic/ucontext.h file.

It would be best to just use linux/signal.h here, which includes
the correct architecture specific files.

	Arnd

^ permalink raw reply

* Re: [PATCH 37/45] include/uapi/linux/socket.h: include sys/socket.h in userspace
From: Arnd Bergmann @ 2015-02-17  9:14 UTC (permalink / raw)
  To: Mikko Rapeli
  Cc: linux-kernel-u79uwXL29TY76Z2rM5mHXA,
	linux-api-u79uwXL29TY76Z2rM5mHXA
In-Reply-To: <1424127948-22484-38-git-send-email-mikko.rapeli-X3B1VOXEql0@public.gmane.org>

On Tuesday 17 February 2015 00:05:40 Mikko Rapeli wrote:
> This libc header has sockaddr definition for userspace.
> Fixes compilation errors like:
> error: field ‘ifru_addr’ has incomplete type
> struct sockaddr ifru_addr;
> 
> Signed-off-by: Mikko Rapeli <mikko.rapeli-X3B1VOXEql0@public.gmane.org>
> ---
>  include/uapi/linux/socket.h | 4 ++++
>  1 file changed, 4 insertions(+)

This looks wrong: where do you see a use of ifru_addr
in uapi/linux/socket.h?

	Arnd

^ permalink raw reply

* Re: [PATCH 34/45] include/uapi/asm-generic/signal.h: hide sigset_t definition in userspace
From: Arnd Bergmann @ 2015-02-17  9:16 UTC (permalink / raw)
  To: Mikko Rapeli
  Cc: linux-kernel-u79uwXL29TY76Z2rM5mHXA,
	linux-arch-u79uwXL29TY76Z2rM5mHXA,
	linux-api-u79uwXL29TY76Z2rM5mHXA
In-Reply-To: <1424127948-22484-35-git-send-email-mikko.rapeli-X3B1VOXEql0@public.gmane.org>

On Tuesday 17 February 2015 00:05:37 Mikko Rapeli wrote:
> +#ifdef __KERNEL__
> +/* already defined in userspace via stdlib.h */
>  typedef struct {
>         unsigned long sig[_NSIG_WORDS];
>  } sigset_t;
> +#endif /* __KERNEL__ */

I'm not sure here: Is this structure always identical to the one
we see in user space?

If not, we might want to rename the typedef to __kernel_sigset_t
instead and use that inside of the uapi headers.

	Arnd

^ permalink raw reply

* Re: [PATCH 33/45] include/uapi/asm-generic/signal.h: include stdlib.h in userspace
From: Arnd Bergmann @ 2015-02-17  9:44 UTC (permalink / raw)
  To: Mikko Rapeli
  Cc: linux-kernel-u79uwXL29TY76Z2rM5mHXA,
	linux-arch-u79uwXL29TY76Z2rM5mHXA,
	linux-api-u79uwXL29TY76Z2rM5mHXA
In-Reply-To: <1424127948-22484-34-git-send-email-mikko.rapeli-X3B1VOXEql0@public.gmane.org>

On Tuesday 17 February 2015 00:05:36 Mikko Rapeli wrote:
> Fixes compiler warning:
> error: unknown type name ‘size_t’
> 
> Signed-off-by: Mikko Rapeli <mikko.rapeli-X3B1VOXEql0@public.gmane.org>
> ---
>  include/uapi/asm-generic/signal.h | 4 ++++
>  1 file changed, 4 insertions(+)
> 
> diff --git a/include/uapi/asm-generic/signal.h b/include/uapi/asm-generic/signal.h
> index 9df61f1..8a341a2 100644
> --- a/include/uapi/asm-generic/signal.h
> +++ b/include/uapi/asm-generic/signal.h
> @@ -3,6 +3,10 @@
>  
>  #include <linux/types.h>
>  
> +#ifndef __KERNEL__
> +#include <stdlib.h>
> +#endif
> +

Hmm, maybe we should use __kernel_size_t instead?

	Arnd

^ permalink raw reply

* Re: [PATCH 20/45] sctp.h: include stdint.h in userspace
From: Neil Horman @ 2015-02-17 13:11 UTC (permalink / raw)
  To: Mikko Rapeli
  Cc: linux-kernel-u79uwXL29TY76Z2rM5mHXA, Vlad Yasevich,
	linux-sctp-u79uwXL29TY76Z2rM5mHXA,
	linux-api-u79uwXL29TY76Z2rM5mHXA
In-Reply-To: <1424127948-22484-21-git-send-email-mikko.rapeli-X3B1VOXEql0@public.gmane.org>

On Tue, Feb 17, 2015 at 12:05:23AM +0100, Mikko Rapeli wrote:
> Fixes compilation error:
> 
> linux/sctp.h:652:2: error: unknown type name ‘uint32_t’
> 
> Signed-off-by: Mikko Rapeli <mikko.rapeli-X3B1VOXEql0@public.gmane.org>
Acked-by: Neil Horman <nhorman-2XuSBdqkA4R54TAoqtyWWQ@public.gmane.org>

> ---
>  include/uapi/linux/sctp.h | 4 ++++
>  1 file changed, 4 insertions(+)
> 
> diff --git a/include/uapi/linux/sctp.h b/include/uapi/linux/sctp.h
> index ce70fe6..9fd31cf 100644
> --- a/include/uapi/linux/sctp.h
> +++ b/include/uapi/linux/sctp.h
> @@ -53,7 +53,11 @@
>  #ifndef _UAPI_SCTP_H
>  #define _UAPI_SCTP_H
>  
> +#ifdef __KERNEL__
>  #include <linux/types.h>
> +#else
> +#include <stdint.h>
> +#endif
>  #include <linux/socket.h>
>  
>  typedef __s32 sctp_assoc_t;
> -- 
> 2.1.4
> 
> --
> To unsubscribe from this list: send the line "unsubscribe linux-sctp" in
> the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> 

^ permalink raw reply

* Re: [PATCH 15/45] dm-log-userspace.h: include stdint.h in userspace
From: Alasdair G Kergon @ 2015-02-17 13:38 UTC (permalink / raw)
  To: Arnd Bergmann
  Cc: Mikko Rapeli, linux-kernel, Alasdair Kergon, Mike Snitzer,
	dm-devel, linux-api
In-Reply-To: <2309592.iOl8K3u7d8@wuerfel>

On Tue, Feb 17, 2015 at 10:08:56AM +0100, Arnd Bergmann wrote:
> The normal way to do this in kernel headers is to use linux/types.h
> but change the data structures to use __u64 instead of uint64_t
> to avoid the build error.
 
That's what happened to dm-ioctl.h.
(Or someone could adjust linux/types.h to include these as standard.)

Alasdair

^ permalink raw reply

* Re: [PATCH 15/45] dm-log-userspace.h: include stdint.h in userspace
From: Arnd Bergmann @ 2015-02-17 13:55 UTC (permalink / raw)
  To: Alasdair G Kergon
  Cc: Mikko Rapeli, linux-kernel-u79uwXL29TY76Z2rM5mHXA, Mike Snitzer,
	dm-devel-H+wXaHxf7aLQT0dZR+AlfA, linux-api-u79uwXL29TY76Z2rM5mHXA
In-Reply-To: <20150217133806.GA5220-FDJ95KluN3Z0klwcnFlA1dvLeJWuRmrY@public.gmane.org>

On Tuesday 17 February 2015 13:38:06 Alasdair G Kergon wrote:
> On Tue, Feb 17, 2015 at 10:08:56AM +0100, Arnd Bergmann wrote:
> > The normal way to do this in kernel headers is to use linux/types.h
> > but change the data structures to use __u64 instead of uint64_t
> > to avoid the build error.
>  
> That's what happened to dm-ioctl.h.

Ah, indeed. It turns out that it was my own change that did this:


commit 9adfbfb611307060db54691bc7e6d53fdc12312b
Author: Arnd Bergmann <arnd-r2nGTMty4D4@public.gmane.org>
Date:   Thu Feb 26 00:51:40 2009 +0100

    make most exported headers use strict integer types
    
    This takes care of all files that have only a small number
    of non-strict integer type uses.

> (Or someone could adjust linux/types.h to include these as standard.)

No, that wouldn't work. The C user space headers are not meant to
be included implicitly by any standard headers, which might pull
in linux/types.h implicitly.

I think it would be best to change all patches in the new series
in the same way for consistency and try to avoid using stdint.h
as much as we can.

	Arnd

^ permalink raw reply

* Re: [PATCH 0/7] [RFC] kernel: add a netlink interface to get information about processes
From: David Ahern @ 2015-02-17 16:09 UTC (permalink / raw)
  To: Andrey Vagin, linux-kernel-u79uwXL29TY76Z2rM5mHXA
  Cc: linux-api-u79uwXL29TY76Z2rM5mHXA, Oleg Nesterov, Andrew Morton,
	Cyrill Gorcunov, Pavel Emelyanov, Roger Luethi
In-Reply-To: <1424161226-15176-1-git-send-email-avagin-GEFAQzZX7r8dnm+yROfE0A@public.gmane.org>

On 2/17/15 1:20 AM, Andrey Vagin wrote:
> And here are statistics about syscalls which were called by each
> command.
> $ perf stat -e syscalls:sys_exit* -- ps ax -o pid,ppid  2>&1 | grep syscalls | sort -n -r | head -n 5
>              20,713      syscalls:sys_exit_open
>              20,710      syscalls:sys_exit_close
>              20,708      syscalls:sys_exit_read
>              10,348      syscalls:sys_exit_newstat
>                  31      syscalls:sys_exit_write
>
> $ perf stat -e syscalls:sys_exit* -- ./task_diag_all  2>&1 | grep syscalls | sort -n -r | head -n 5
>                 114      syscalls:sys_exit_recvfrom
>                  49      syscalls:sys_exit_write
>                   8      syscalls:sys_exit_mmap
>                   4      syscalls:sys_exit_mprotect
>                   3      syscalls:sys_exit_newfstat

'perf trace -s' gives the summary with stats.
e.g., perf trace -s --  ps ax -o pid,ppid

  ps (23850), 3117 events, 99.3%, 0.000 msec

    syscall            calls      min       avg       max      stddev
                                (msec)    (msec)    (msec)        (%)
    --------------- -------- --------- --------- ---------     ------
    read                 353     0.000     0.010     0.035      3.14%
    write                166     0.006     0.012     0.045      3.03%
    open                 365     0.002     0.005     0.178     11.29%
    close                354     0.001     0.002     0.024      3.57%
    stat                 170     0.002     0.007     0.662     52.99%
    fstat                 19     0.002     0.003     0.003      2.31%
    lseek                  2     0.003     0.003     0.003      6.49%
    mmap                  50     0.004     0.006     0.013      3.40%
...

^ permalink raw reply

* Re: [PATCH 0/7] [RFC] kernel: add a netlink interface to get information about processes
From: Andy Lutomirski @ 2015-02-17 19:05 UTC (permalink / raw)
  To: Andrey Vagin
  Cc: Pavel Emelyanov, Roger Luethi, Oleg Nesterov, Cyrill Gorcunov,
	Andrew Morton, Linux API, linux-kernel@vger.kernel.org
In-Reply-To: <1424161226-15176-1-git-send-email-avagin@openvz.org>

On Feb 17, 2015 12:40 AM, "Andrey Vagin" <avagin@openvz.org> wrote:
>
> Here is a preview version. It provides restricted set of functionality.
> I would like to collect feedback about this idea.
>
> Currently we use the proc file system, where all information are
> presented in text files, what is convenient for humans.  But if we need
> to get information about processes from code (e.g. in C), the procfs
> doesn't look so cool.
>
> From code we would prefer to get information in binary format and to be
> able to specify which information and for which tasks are required. Here
> is a new interface with all these features, which is called task_diag.
> In addition it's much faster than procfs.
>
> task_diag is based on netlink sockets and looks like socket-diag, which
> is used to get information about sockets.
>
> A request is described by the task_diag_pid structure:
>
> struct task_diag_pid {
>        __u64   show_flags;      /* specify which information are required */
>        __u64   dump_stratagy;   /* specify a group of processes */
>
>        __u32   pid;
> };
>
> A respone is a set of netlink messages. Each message describes one task.
> All task properties are divided on groups. A message contains the
> TASK_DIAG_MSG group and other groups if they have been requested in
> show_flags. For example, if show_flags contains TASK_DIAG_SHOW_CRED, a
> response will contain the TASK_DIAG_CRED group which is described by the
> task_diag_creds structure.
>
> struct task_diag_msg {
>         __u32   tgid;
>         __u32   pid;
>         __u32   ppid;
>         __u32   tpid;
>         __u32   sid;
>         __u32   pgid;
>         __u8    state;
>         char    comm[TASK_DIAG_COMM_LEN];
> };
>
> Another good feature of task_diag is an ability to request information
> for a few processes. Currently here are two stratgies
> TASK_DIAG_DUMP_ALL      - get information for all tasks
> TASK_DIAG_DUMP_CHILDREN - get information for children of a specified
>                           tasks
>
> The task diag is much faster than the proc file system. We don't need to
> create a new file descriptor for each task. We need to send a request
> and get a response. It allows to get information for a few task in one
> request-response iteration.
>
> I have compared performance of procfs and task-diag for the
> "ps ax -o pid,ppid" command.
>
> A test stand contains 10348 processes.
> $ ps ax -o pid,ppid | wc -l
> 10348
>
> $ time ps ax -o pid,ppid > /dev/null
>
> real    0m1.073s
> user    0m0.086s
> sys     0m0.903s
>
> $ time ./task_diag_all > /dev/null
>
> real    0m0.037s
> user    0m0.004s
> sys     0m0.020s
>
> And here are statistics about syscalls which were called by each
> command.
> $ perf stat -e syscalls:sys_exit* -- ps ax -o pid,ppid  2>&1 | grep syscalls | sort -n -r | head -n 5
>             20,713      syscalls:sys_exit_open
>             20,710      syscalls:sys_exit_close
>             20,708      syscalls:sys_exit_read
>             10,348      syscalls:sys_exit_newstat
>                 31      syscalls:sys_exit_write
>
> $ perf stat -e syscalls:sys_exit* -- ./task_diag_all  2>&1 | grep syscalls | sort -n -r | head -n 5
>                114      syscalls:sys_exit_recvfrom
>                 49      syscalls:sys_exit_write
>                  8      syscalls:sys_exit_mmap
>                  4      syscalls:sys_exit_mprotect
>                  3      syscalls:sys_exit_newfstat
>
> You can find the test program from this experiment in the last patch.
>
> The idea of this functionality was suggested by Pavel Emelyanov
> (xemul@), when he found that operations with /proc forms a significant
> part of a checkpointing time.
>
> Ten years ago here was attempt to add a netlink interface to access to /proc
> information:
> http://lwn.net/Articles/99600/

I don't suppose this could use real syscalls instead of netlink.  If
nothing else, netlink seems to conflate pid and net namespaces.

Also, using an asynchronous interface (send, poll?, recv) for
something that's inherently synchronous (ask the kernel a local
question) seems awkward to me.

--Andy

^ permalink raw reply

* [PATCH v2 0/2] Add epoll round robin wakeup mode
From: Jason Baron @ 2015-02-17 19:33 UTC (permalink / raw)
  To: peterz, mingo, viro
  Cc: akpm, normalperson, davidel, mtk.manpages, linux-kernel,
	linux-fsdevel, linux-api

When we are sharing a wakeup source among multiple epoll fds, we end up with
thundering herd wakeups, since there is currently no way to add to the
wakeup source exclusively. This series introduces 2 new epoll flags:
EPOLLEXCLUSIVE, for adding to a wakeup source exclusively, and EPOLLROUNDROBIN,
which is to be used in conjunction with EPOLLEXCLUSIVE to evenly
distribute the wakeups. This patch was originally motivated by a desire to
improve wakeup balance and cpu usage for a listen socket() shared amongst
multiple epoll fd sets.

See: http://lwn.net/Articles/632590/ for previous test program and testing
results.

Epoll manpage text:

EPOLLEXCLUSIVE
        Provides exclusive wakeups when attaching multiple epoll fds to a
        shared wakeup source. Must be specified with an EPOLL_CTL_ADD operation.

EPOLLROUNDROBIN
        Provides balancing for exclusive wakeups when attaching multiple epoll
        fds to a shared wakeup source. Depends on EPOLLEXCLUSIVE being set and
        must be specified with an EPOLL_CTL_ADD operation.

Thanks,

-Jason

Jason Baron (2):
  sched/wait: add round robin wakeup mode
  epoll: introduce EPOLLEXCLUSIVE and EPOLLROUNDROBIN

 fs/eventpoll.c                 | 25 ++++++++++++++++++++-----
 include/linux/wait.h           | 11 +++++++++++
 include/uapi/linux/eventpoll.h |  6 ++++++
 kernel/sched/wait.c            | 10 ++++++++--
 4 files changed, 45 insertions(+), 7 deletions(-)

-- 
1.8.2.rc2

^ permalink raw reply

* [PATCH v2 1/2] sched/wait: add round robin wakeup mode
From: Jason Baron @ 2015-02-17 19:33 UTC (permalink / raw)
  To: peterz, mingo, viro
  Cc: akpm, normalperson, davidel, mtk.manpages, linux-kernel,
	linux-fsdevel, linux-api
In-Reply-To: <cover.1424200151.git.jbaron@akamai.com>

The motivation for this flag is to allow the distribution of wakeups from
a shared source in a balanced manner. Currently, we can add threads exclusively
but that often results in the same thread being woken up again and again. In the case
where we are trying to balance work across threads this is not desirable.

The WQ_FLAG_ROUND_ROBIN is restricted to being exclusive as well, otherwise we
do not know who is being woken up.

Signed-off-by: Jason Baron <jbaron@akamai.com>
---
 include/linux/wait.h | 11 +++++++++++
 kernel/sched/wait.c  | 10 ++++++++--
 2 files changed, 19 insertions(+), 2 deletions(-)

diff --git a/include/linux/wait.h b/include/linux/wait.h
index 2232ed1..bbdef98 100644
--- a/include/linux/wait.h
+++ b/include/linux/wait.h
@@ -16,6 +16,7 @@ int default_wake_function(wait_queue_t *wait, unsigned mode, int flags, void *ke
 /* __wait_queue::flags */
 #define WQ_FLAG_EXCLUSIVE	0x01
 #define WQ_FLAG_WOKEN		0x02
+#define WQ_FLAG_ROUND_ROBIN	0x04
 
 struct __wait_queue {
 	unsigned int		flags;
@@ -109,6 +110,16 @@ static inline int waitqueue_active(wait_queue_head_t *q)
 
 extern void add_wait_queue(wait_queue_head_t *q, wait_queue_t *wait);
 extern void add_wait_queue_exclusive(wait_queue_head_t *q, wait_queue_t *wait);
+
+/*
+ * rr relies on exclusive, otherwise we don't know which entry was woken
+ */
+static inline void add_wait_queue_rr(wait_queue_head_t *q, wait_queue_t *wait)
+{
+	wait->flags |= WQ_FLAG_ROUND_ROBIN;
+	add_wait_queue_exclusive(q, wait);
+}
+
 extern void remove_wait_queue(wait_queue_head_t *q, wait_queue_t *wait);
 
 static inline void __add_wait_queue(wait_queue_head_t *head, wait_queue_t *new)
diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c
index 852143a..dcb75dd 100644
--- a/kernel/sched/wait.c
+++ b/kernel/sched/wait.c
@@ -66,14 +66,20 @@ static void __wake_up_common(wait_queue_head_t *q, unsigned int mode,
 			int nr_exclusive, int wake_flags, void *key)
 {
 	wait_queue_t *curr, *next;
+	LIST_HEAD(rotate_list);
 
 	list_for_each_entry_safe(curr, next, &q->task_list, task_list) {
 		unsigned flags = curr->flags;
 
 		if (curr->func(curr, mode, wake_flags, key) &&
-				(flags & WQ_FLAG_EXCLUSIVE) && !--nr_exclusive)
-			break;
+					(flags & WQ_FLAG_EXCLUSIVE)) {
+			if ((flags & WQ_FLAG_ROUND_ROBIN) && (nr_exclusive > 0))
+				list_move_tail(&curr->task_list, &rotate_list);
+			if (!--nr_exclusive)
+				break;
+		}
 	}
+	list_splice_tail(&rotate_list, &q->task_list);
 }
 
 /**
-- 
1.8.2.rc2

^ permalink raw reply related

* [PATCH v2 2/2] epoll: introduce EPOLLEXCLUSIVE and EPOLLROUNDROBIN
From: Jason Baron @ 2015-02-17 19:33 UTC (permalink / raw)
  To: peterz, mingo, viro
  Cc: akpm, normalperson, davidel, mtk.manpages, linux-kernel,
	linux-fsdevel, linux-api
In-Reply-To: <cover.1424200151.git.jbaron@akamai.com>

Epoll file descriptors that are added to a shared wakeup source are always
added in a non-exclusive manner. That means that when we have multiple epoll
fds attached to a shared wakeup source they are all woken up. This can
lead to excessive cpu usage and uneven load distribution.

This patch introduces two new 'events' flags that are intended to be used
with EPOLL_CTL_ADD operations. EPOLLEXCLUSIVE, adds the epoll fd to the event
source in an exclusive manner such that the minimum number of threads are
woken. EPOLLROUNDROBIN, which depends on EPOLLEXCLUSIVE also being set, can
also be added to the 'events' flag, such that we round robin through the set
of waiting threads.

An implementation note is that in the epoll wakeup routine,
'ep_poll_callback()', if EPOLLROUNDROBIN is set, we return 1, for a successful
wakeup, only when there are current waiters. The idea is to use this additional
heuristic in order to minimize wakeup latencies.

Signed-off-by: Jason Baron <jbaron@akamai.com>
---
 fs/eventpoll.c                 | 25 ++++++++++++++++++++-----
 include/uapi/linux/eventpoll.h |  6 ++++++
 2 files changed, 26 insertions(+), 5 deletions(-)

diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index d77f944..382c832 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -92,7 +92,8 @@
  */
 
 /* Epoll private bits inside the event mask */
-#define EP_PRIVATE_BITS (EPOLLWAKEUP | EPOLLONESHOT | EPOLLET)
+#define EP_PRIVATE_BITS (EPOLLWAKEUP | EPOLLONESHOT | EPOLLET | \
+			 EPOLLEXCLUSIVE | EPOLLROUNDROBIN)
 
 /* Maximum number of nesting allowed inside epoll sets */
 #define EP_MAX_NESTS 4
@@ -1002,6 +1003,7 @@ static int ep_poll_callback(wait_queue_t *wait, unsigned mode, int sync, void *k
 	unsigned long flags;
 	struct epitem *epi = ep_item_from_wait(wait);
 	struct eventpoll *ep = epi->ep;
+	int ewake = 0;
 
 	if ((unsigned long)key & POLLFREE) {
 		ep_pwq_from_wait(wait)->whead = NULL;
@@ -1066,8 +1068,10 @@ static int ep_poll_callback(wait_queue_t *wait, unsigned mode, int sync, void *k
 	 * Wake up ( if active ) both the eventpoll wait list and the ->poll()
 	 * wait list.
 	 */
-	if (waitqueue_active(&ep->wq))
+	if (waitqueue_active(&ep->wq)) {
+		ewake = 1;
 		wake_up_locked(&ep->wq);
+	}
 	if (waitqueue_active(&ep->poll_wait))
 		pwake++;
 
@@ -1078,6 +1082,8 @@ out_unlock:
 	if (pwake)
 		ep_poll_safewake(&ep->poll_wait);
 
+	if (epi->event.events & EPOLLROUNDROBIN)
+		return ewake;
 	return 1;
 }
 
@@ -1095,7 +1101,12 @@ static void ep_ptable_queue_proc(struct file *file, wait_queue_head_t *whead,
 		init_waitqueue_func_entry(&pwq->wait, ep_poll_callback);
 		pwq->whead = whead;
 		pwq->base = epi;
-		add_wait_queue(whead, &pwq->wait);
+		if (epi->event.events & EPOLLROUNDROBIN)
+			add_wait_queue_rr(whead, &pwq->wait);
+		else if (epi->event.events & EPOLLEXCLUSIVE)
+			add_wait_queue_exclusive(whead, &pwq->wait);
+		else
+			add_wait_queue(whead, &pwq->wait);
 		list_add_tail(&pwq->llink, &epi->pwqlist);
 		epi->nwait++;
 	} else {
@@ -1820,8 +1831,7 @@ SYSCALL_DEFINE1(epoll_create, int, size)
 SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,
 		struct epoll_event __user *, event)
 {
-	int error;
-	int full_check = 0;
+	int error, full_check = 0, wait_flags = 0;
 	struct fd f, tf;
 	struct eventpoll *ep;
 	struct epitem *epi;
@@ -1861,6 +1871,11 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,
 	if (f.file == tf.file || !is_file_epoll(f.file))
 		goto error_tgt_fput;
 
+	wait_flags = epds.events & (EPOLLEXCLUSIVE | EPOLLROUNDROBIN);
+	if (wait_flags && ((op == EPOLL_CTL_MOD) || ((op == EPOLL_CTL_ADD) &&
+	    ((wait_flags == EPOLLROUNDROBIN) || (is_file_epoll(tf.file))))))
+		goto error_tgt_fput;
+
 	/*
 	 * At this point it is safe to assume that the "private_data" contains
 	 * our own data structure.
diff --git a/include/uapi/linux/eventpoll.h b/include/uapi/linux/eventpoll.h
index bc81fb2..10260a1 100644
--- a/include/uapi/linux/eventpoll.h
+++ b/include/uapi/linux/eventpoll.h
@@ -26,6 +26,12 @@
 #define EPOLL_CTL_DEL 2
 #define EPOLL_CTL_MOD 3
 
+/* Balance wakeups for a shared event source */
+#define EPOLLROUNDROBIN (1 << 27)
+
+/* Add exclusively */
+#define EPOLLEXCLUSIVE (1 << 28)
+
 /*
  * Request the handling of system wakeup events so as to prevent system suspends
  * from happening while those events are being processed.
-- 
1.8.2.rc2

^ permalink raw reply related

* Re: [PATCH v2 0/2] Add epoll round robin wakeup mode
From: Andy Lutomirski @ 2015-02-17 19:46 UTC (permalink / raw)
  To: Jason Baron
  Cc: Peter Zijlstra, Ingo Molnar, Al Viro, Andrew Morton, Eric Wong,
	Davide Libenzi, Michael Kerrisk-manpages,
	linux-kernel@vger.kernel.org, Linux FS Devel, Linux API
In-Reply-To: <cover.1424200151.git.jbaron@akamai.com>

On Tue, Feb 17, 2015 at 11:33 AM, Jason Baron <jbaron@akamai.com> wrote:
> When we are sharing a wakeup source among multiple epoll fds, we end up with
> thundering herd wakeups, since there is currently no way to add to the
> wakeup source exclusively. This series introduces 2 new epoll flags,
> EPOLLEXCLUSIVE for adding to a wakeup source exclusively. And EPOLLROUNDROBIN
> which is to be used in conjunction to EPOLLEXCLUSIVE to evenly
> distribute the wakeups. This patch was originally motivated by a desire to
> improve wakeup balance and cpu usage for a listen socket() shared amongst
> multiple epoll fd sets.
>
> See: http://lwn.net/Articles/632590/ for previous test program and testing
> results.
>
> Epoll manpage text:
>
> EPOLLEXCLUSIVE
>         Provides exclusive wakeups when attaching multiple epoll fds to a
>         shared wakeup source. Must be specified with an EPOLL_CTL_ADD operation.
>
> EPOLLROUNDROBIN
>         Provides balancing for exclusive wakeups when attaching multiple epoll
>         fds to a shared wakeup source. Depends on EPOLLEXCLUSIVE being set and
>         must be specified with an EPOLL_CTL_ADD operation.
>
> Thanks,

What permissions do you need on the file descriptor to do this?  This
will be the first case where a poll-like operation has side effects,
and that's rather weird IMO.

--Andy

>
> -Jason
>
>
> Jason Baron (2):
>   sched/wait: add round robin wakeup mode
>   epoll: introduce EPOLLEXCLUSIVE and EPOLLROUNDROBIN
>
>  fs/eventpoll.c                 | 25 ++++++++++++++++++++-----
>  include/linux/wait.h           | 11 +++++++++++
>  include/uapi/linux/eventpoll.h |  6 ++++++
>  kernel/sched/wait.c            | 10 ++++++++--
>  4 files changed, 45 insertions(+), 7 deletions(-)
>
> --
> 1.8.2.rc2
>
> --
> To unsubscribe from this list: send the line "unsubscribe linux-api" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html



-- 
Andy Lutomirski
AMA Capital Management, LLC

^ permalink raw reply

* Re: [PATCH 0/7] [RFC] kernel: add a netlink interface to get information about processes
From: Andrew Vagin @ 2015-02-17 20:32 UTC (permalink / raw)
  To: David Ahern
  Cc: Andrey Vagin, linux-kernel-u79uwXL29TY76Z2rM5mHXA,
	linux-api-u79uwXL29TY76Z2rM5mHXA, Oleg Nesterov, Andrew Morton,
	Cyrill Gorcunov, Pavel Emelyanov, Roger Luethi
In-Reply-To: <54E367CB.9030309-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org>

On Tue, Feb 17, 2015 at 09:09:47AM -0700, David Ahern wrote:
> On 2/17/15 1:20 AM, Andrey Vagin wrote:
> >And here are statistics about syscalls which were called by each
> >command.
> >$ perf stat -e syscalls:sys_exit* -- ps ax -o pid,ppid  2>&1 | grep syscalls | sort -n -r | head -n 5
> >             20,713      syscalls:sys_exit_open
> >             20,710      syscalls:sys_exit_close
> >             20,708      syscalls:sys_exit_read
> >             10,348      syscalls:sys_exit_newstat
> >                 31      syscalls:sys_exit_write
> >
> >$ perf stat -e syscalls:sys_exit* -- ./task_diag_all  2>&1 | grep syscalls | sort -n -r | head -n 5
> >                114      syscalls:sys_exit_recvfrom
> >                 49      syscalls:sys_exit_write
> >                  8      syscalls:sys_exit_mmap
> >                  4      syscalls:sys_exit_mprotect
> >                  3      syscalls:sys_exit_newfstat
> 
> 'perf trace -s' gives the summary with stats.
> e.g., perf trace -s --  ps ax -o pid,ppid

Thank you for this command, I haven't used it before.

 ps (21301), 145271 events, 100.0%, 0.000 msec

   syscall            calls      min       avg       max      stddev
                               (msec)    (msec)    (msec)        (%)
   --------------- -------- --------- --------- ---------     ------
   read               20717     0.000     0.020     1.631      0.64%
   write                  1     0.019     0.019     0.019      0.00%
   open               20722     0.025     0.035     3.624      0.93%
   close              20719     0.006     0.009     1.059      0.95%
   stat               10352     0.015     0.025     1.748      0.95%
   fstat                 12     0.010     0.012     0.020      6.17%
   lseek                  2     0.011     0.012     0.012      3.08%
   mmap                  30     0.012     0.034     0.094      9.35%
   mprotect              17     0.034     0.045     0.067      4.86%
   munmap                 3     0.028     0.058     0.108     44.12%
   brk                    4     0.011     0.015     0.019     11.24%
   rt_sigaction          25     0.011     0.011     0.014      1.27%
   rt_sigprocmask         1     0.012     0.012     0.012      0.00%
   ioctl                  4     0.010     0.012     0.014      6.94%
   access                 1     0.034     0.034     0.034      0.00%
   execve                 6     0.000     0.496     2.794     92.58%
   uname                  1     0.015     0.015     0.015      0.00%
   getdents              12     0.019     0.691     1.158     13.04%
   getrlimit              1     0.012     0.012     0.012      0.00%
   geteuid                1     0.012     0.012     0.012      0.00%
   arch_prctl             1     0.013     0.013     0.013      0.00%
   futex                  1     0.020     0.020     0.020      0.00%
   set_tid_address        1     0.012     0.012     0.012      0.00%
   openat                 1     0.030     0.030     0.030      0.00%
   set_robust_list        1     0.011     0.011     0.011      0.00%


 task_diag_all (21304), 569 events, 98.6%, 0.000 msec

   syscall            calls      min       avg       max      stddev
                               (msec)    (msec)    (msec)        (%)
   --------------- -------- --------- --------- ---------     ------
   read                   2     0.000     0.045     0.090    100.00%
   write                 77     0.010     0.013     0.083      7.93%
   open                   2     0.031     0.038     0.045     19.64%
   close                  3     0.010     0.014     0.017     13.43%
   fstat                  3     0.011     0.011     0.012      3.79%
   mmap                   8     0.013     0.027     0.049     16.72%
   mprotect               4     0.034     0.043     0.052      8.86%
   munmap                 1     0.031     0.031     0.031      0.00%
   brk                    1     0.014     0.014     0.014      0.00%
   ioctl                  1     0.010     0.010     0.010      0.00%
   access                 1     0.030     0.030     0.030      0.00%
   getpid                 1     0.011     0.011     0.011      0.00%
   socket                 1     0.045     0.045     0.045      0.00%
   sendto                 2     0.091     0.104     0.117     12.63%
   recvfrom             175     0.026     0.093     0.141      1.10%
   bind                   1     0.014     0.014     0.014      0.00%
   execve                 1     0.000     0.000     0.000      0.00%
   arch_prctl             1     0.011     0.011     0.011      0.00%

> 
>  ps (23850), 3117 events, 99.3%, 0.000 msec
> 
>    syscall            calls      min       avg       max      stddev
>                                (msec)    (msec)    (msec)        (%)
>    --------------- -------- --------- --------- ---------     ------
>    read                 353     0.000     0.010     0.035      3.14%
>    write                166     0.006     0.012     0.045      3.03%
>    open                 365     0.002     0.005     0.178     11.29%
>    close                354     0.001     0.002     0.024      3.57%
>    stat                 170     0.002     0.007     0.662     52.99%
>    fstat                 19     0.002     0.003     0.003      2.31%
>    lseek                  2     0.003     0.003     0.003      6.49%
>    mmap                  50     0.004     0.006     0.013      3.40%
> ...

^ permalink raw reply

* Re: [PATCH v2 0/2] Add epoll round robin wakeup mode
From: Jason Baron @ 2015-02-17 20:33 UTC (permalink / raw)
  To: Andy Lutomirski
  Cc: Peter Zijlstra, Ingo Molnar, Al Viro, Andrew Morton, Eric Wong,
	Davide Libenzi, Michael Kerrisk-manpages,
	linux-kernel@vger.kernel.org, Linux FS Devel, Linux API
In-Reply-To: <CALCETrWAH7SbQm0T+_7wviiU6pJ0isBdWE-cQBGk-joAx2OzFw@mail.gmail.com>

On 02/17/2015 02:46 PM, Andy Lutomirski wrote:
> On Tue, Feb 17, 2015 at 11:33 AM, Jason Baron <jbaron@akamai.com> wrote:
>> When we are sharing a wakeup source among multiple epoll fds, we end up with
>> thundering herd wakeups, since there is currently no way to add to the
>> wakeup source exclusively. This series introduces 2 new epoll flags,
>> EPOLLEXCLUSIVE for adding to a wakeup source exclusively. And EPOLLROUNDROBIN
>> which is to be used in conjunction to EPOLLEXCLUSIVE to evenly
>> distribute the wakeups. This patch was originally motivated by a desire to
>> improve wakeup balance and cpu usage for a listen socket() shared amongst
>> multiple epoll fd sets.
>>
>> See: http://lwn.net/Articles/632590/ for previous test program and testing
>> results.
>>
>> Epoll manpage text:
>>
>> EPOLLEXCLUSIVE
>>         Provides exclusive wakeups when attaching multiple epoll fds to a
>>         shared wakeup source. Must be specified with an EPOLL_CTL_ADD operation.
>>
>> EPOLLROUNDROBIN
>>         Provides balancing for exclusive wakeups when attaching multiple epoll
>>         fds to a shared wakeup source. Depends on EPOLLEXCLUSIVE being set and
>>         must be specified with an EPOLL_CTL_ADD operation.
>>
>> Thanks,
> What permissions do you need on the file descriptor to do this?  This
> will be the first case where a poll-like operation has side effects,
> and that's rather weird IMO.
>

So in the case where you have both non-exclusive and exclusive
waiters, all of the non-exclusive waiters will continue to get woken
up. However, I think you're getting at having multiple exclusive
waiters and potentially 'starving' out other exclusive waiters.

In general, I think wait queues are associated with a 'struct file',
so I think unless you are sharing your fd table, this isn't an issue.
However, there may be cases where this is not true? In which
case, perhaps, we could limit this to CAP_SYS_ADMIN...

Thanks,

-Jason


^ permalink raw reply

* Re: [PATCH v2 0/2] Add epoll round robin wakeup mode
From: Andy Lutomirski @ 2015-02-17 21:09 UTC (permalink / raw)
  To: Jason Baron
  Cc: Peter Zijlstra, Ingo Molnar, Al Viro, Andrew Morton, Eric Wong,
	Davide Libenzi, Michael Kerrisk-manpages,
	linux-kernel-u79uwXL29TY76Z2rM5mHXA@public.gmane.org,
	Linux FS Devel, Linux API
In-Reply-To: <54E3A591.2050806-JqFfY2XvxFXQT0dZR+AlfA@public.gmane.org>

On Tue, Feb 17, 2015 at 12:33 PM, Jason Baron <jbaron-JqFfY2XvxFXQT0dZR+AlfA@public.gmane.org> wrote:
> On 02/17/2015 02:46 PM, Andy Lutomirski wrote:
>> On Tue, Feb 17, 2015 at 11:33 AM, Jason Baron <jbaron-JqFfY2XvxFXQT0dZR+AlfA@public.gmane.org> wrote:
>>> When we are sharing a wakeup source among multiple epoll fds, we end up with
>>> thundering herd wakeups, since there is currently no way to add to the
>>> wakeup source exclusively. This series introduces 2 new epoll flags,
>>> EPOLLEXCLUSIVE for adding to a wakeup source exclusively. And EPOLLROUNDROBIN
>>> which is to be used in conjunction to EPOLLEXCLUSIVE to evenly
>>> distribute the wakeups. This patch was originally motivated by a desire to
>>> improve wakeup balance and cpu usage for a listen socket() shared amongst
>>> multiple epoll fd sets.
>>>
>>> See: http://lwn.net/Articles/632590/ for previous test program and testing
>>> results.
>>>
>>> Epoll manpage text:
>>>
>>> EPOLLEXCLUSIVE
>>>         Provides exclusive wakeups when attaching multiple epoll fds to a
>>>         shared wakeup source. Must be specified with an EPOLL_CTL_ADD operation.
>>>
>>> EPOLLROUNDROBIN
>>>         Provides balancing for exclusive wakeups when attaching multiple epoll
>>>         fds to a shared wakeup source. Depends on EPOLLEXCLUSIVE being set and
>>>         must be specified with an EPOLL_CTL_ADD operation.
>>>
>>> Thanks,
>> What permissions do you need on the file descriptor to do this?  This
>> will be the first case where a poll-like operation has side effects,
>> and that's rather weird IMO.
>>
>
> So in the case where you have both non-exclusive and exclusive
> waiters, all of the non-exclusive waiters will continue to get woken
> up. However, I think you're getting at having multiple exclusive
> waiters and potentially 'starving' out other exclusive waiters.
>
> In general, I think wait queues are associated with a 'struct file',
> so I think unless you are sharing your fd table, this isn't an issue.
> However, there may be cases where this is not true? In which
> case, perhaps, we could limit this to CAP_SYS_ADMIN...

There's also SCM_RIGHTS, which can be used in conjunction with file
sealing and such.

In general, I feel like this patch series solves a problem that isn't
well understood and does it by adding a rather strange new mechanism.
Is there really a problem that can't be addressed by more normal epoll
features?

--Andy

>
> Thanks,
>
> -Jason
>



-- 
Andy Lutomirski
AMA Capital Management, LLC

^ permalink raw reply

* Re: [PATCH 0/7] [RFC] kernel: add a netlink interface to get information about processes
From: Andrew Vagin @ 2015-02-17 21:33 UTC (permalink / raw)
  To: Arnd Bergmann
  Cc: Andrey Vagin, linux-kernel-u79uwXL29TY76Z2rM5mHXA,
	linux-api-u79uwXL29TY76Z2rM5mHXA, Oleg Nesterov, Andrew Morton,
	Cyrill Gorcunov, Pavel Emelyanov, Roger Luethi
In-Reply-To: <3238376.TxgiSSEfbZ@wuerfel>

On Tue, Feb 17, 2015 at 09:53:09AM +0100, Arnd Bergmann wrote:
> On Tuesday 17 February 2015 11:20:19 Andrey Vagin wrote:
> > task_diag is based on netlink sockets and looks like socket-diag, which
> > is used to get information about sockets.
> > 
> > A request is described by the task_diag_pid structure:
> > 
> > struct task_diag_pid {
> >        __u64   show_flags;      /* specify which information are required */
> >        __u64   dump_stratagy;   /* specify a group of processes */
> > 
> >        __u32   pid;
> > };
> 
> Can you explain how the interface relates to the 'taskstats' genetlink
> API? Did you consider extending that interface to provide the
> information you need instead of basing on the socket-diag?

It isn't based on socket-diag; it just looks like socket-diag.

The current task_diag registers a new genl family, but we can use the taskstats
family and add task_diag commands to it.

Thanks,
Andrew

> 
> 	Arnd

^ permalink raw reply

* Re: [PATCH v2 0/2] Add epoll round robin wakeup mode
From: Jason Baron @ 2015-02-18  3:15 UTC (permalink / raw)
  To: Andy Lutomirski
  Cc: Peter Zijlstra, Ingo Molnar, Al Viro, Andrew Morton, Eric Wong,
	Davide Libenzi, Michael Kerrisk-manpages,
	linux-kernel-u79uwXL29TY76Z2rM5mHXA@public.gmane.org,
	Linux FS Devel, Linux API, Linus Torvalds, Mathieu Desnoyers,
	edumazet-hpIqsD4AKlfQT0dZR+AlfA
In-Reply-To: <CALCETrWg9sdyoKg0-BkwKQgyANvJybQ_wqjTfvYEGW1+S1J5Bw-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>

On 02/17/2015 04:09 PM, Andy Lutomirski wrote:
> On Tue, Feb 17, 2015 at 12:33 PM, Jason Baron <jbaron-JqFfY2XvxFXQT0dZR+AlfA@public.gmane.org> wrote:
>> On 02/17/2015 02:46 PM, Andy Lutomirski wrote:
>>> On Tue, Feb 17, 2015 at 11:33 AM, Jason Baron <jbaron-JqFfY2XvxFXQT0dZR+AlfA@public.gmane.org> wrote:
>>>> When we are sharing a wakeup source among multiple epoll fds, we end up with
>>>> thundering herd wakeups, since there is currently no way to add to the
>>>> wakeup source exclusively. This series introduces 2 new epoll flags,
>>>> EPOLLEXCLUSIVE for adding to a wakeup source exclusively. And EPOLLROUNDROBIN
>>>> which is to be used in conjunction to EPOLLEXCLUSIVE to evenly
>>>> distribute the wakeups. This patch was originally motivated by a desire to
>>>> improve wakeup balance and cpu usage for a listen socket() shared amongst
>>>> multiple epoll fd sets.
>>>>
>>>> See: http://lwn.net/Articles/632590/ for previous test program and testing
>>>> results.
>>>>
>>>> Epoll manpage text:
>>>>
>>>> EPOLLEXCLUSIVE
>>>>         Provides exclusive wakeups when attaching multiple epoll fds to a
>>>>         shared wakeup source. Must be specified with an EPOLL_CTL_ADD operation.
>>>>
>>>> EPOLLROUNDROBIN
>>>>         Provides balancing for exclusive wakeups when attaching multiple epoll
>>>>         fds to a shared wakeup source. Depends on EPOLLEXCLUSIVE being set and
>>>>         must be specified with an EPOLL_CTL_ADD operation.
>>>>
>>>> Thanks,
>>> What permissions do you need on the file descriptor to do this?  This
>>> will be the first case where a poll-like operation has side effects,
>>> and that's rather weird IMO.
>>>
>> So in the case where you have both non-exclusive and exclusive
>> waiters, all of the non-exclusive waiters will continue to get woken
>> up. However, I think you're getting at having multiple exclusive
>> waiters and potentially 'starving' out other exclusive waiters.
>>
>> In general, I think wait queues are associated with a 'struct file',
>> so I think unless you are sharing your fd table, this isn't an issue.
>> However, there may be cases where this is not true? In which
>> case, perhaps, we could limit this to CAP_SYS_ADMIN...
> There's also SCM_RIGHTS, which can be used in conjunction with file
> sealing and such.
>
> In general, I feel like this patch series solves a problem that isn't
> well understood and does it by adding a rather strange new mechanism.
> Is there really a problem that can't be addressed by more normal epoll
> features?
>
> --Andy

Hmm... so I dug through some of the Linux archives a bit, and this
problem seems to crop up every so often without resolution.
So I do believe that it's an issue that people are more generally
interested in.

See:

http://lkml.iu.edu/hypermail/linux/kernel/1201.1/02620.html
http://marc.info/?l=linux-kernel&m=128638781921073&w=2

In the latter thread, Linus suggests adding it to the "requested events"
field to poll: http://marc.info/?l=linux-kernel&m=128639416832335&w=2

So, I think that this series at least moves in that suggested direction.

Thanks,

-Jason

^ permalink raw reply

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox