From: menage@google.com
To: akpm@linux-foundation.org, dev@sw.ru, xemul@sw.ru,
serue@us.ibm.com, vatsa@in.ibm.com, ebiederm@xmission.com,
haveblue@us.ibm.com, svaidy@linux.vnet.ibm.com,
balbir@in.ibm.com, pj@sgi.com, cpw@sgi.com
Cc: ckrm-tech@lists.sourceforge.net, linux-kernel@vger.kernel.org,
containers@lists.osdl.org, mbligh@google.com,
rohitseth@google.com, devel@openvz.org
Subject: [PATCH 03/10] Containers(V10): Add tasks file interface
Date: Tue, 29 May 2007 06:01:07 -0700 [thread overview]
Message-ID: <20070529132142.428218000@menage.corp.google.com> (raw)
In-Reply-To: 20070529130104.461765000@menage.corp.google.com
[-- Attachment #1: container_tasks.patch --]
[-- Type: text/plain, Size: 10646 bytes --]
This patch adds the per-directory "tasks" file for containerfs mounts;
this allows the user to determine which tasks are members of a
container by reading a container's "tasks", and to move a task into a
container by writing its pid to its "tasks".
Signed-off-by: Paul Menage <menage@google.com>
---
include/linux/container.h | 10 +
kernel/container.c | 335 ++++++++++++++++++++++++++++++++++++++++++++++
2 files changed, 345 insertions(+)
Index: container-2.6.22-rc2-mm1/include/linux/container.h
===================================================================
--- container-2.6.22-rc2-mm1.orig/include/linux/container.h
+++ container-2.6.22-rc2-mm1/include/linux/container.h
@@ -128,6 +128,16 @@ int container_is_removed(const struct co
int container_path(const struct container *cont, char *buf, int buflen);
+int __container_task_count(const struct container *cont);
+static inline int container_task_count(const struct container *cont)
+{
+ int task_count;
+ rcu_read_lock();
+ task_count = __container_task_count(cont);
+ rcu_read_unlock();
+ return task_count;
+}
+
/* Return true if the container is a descendant of the current container */
int container_is_descendant(const struct container *cont);
Index: container-2.6.22-rc2-mm1/kernel/container.c
===================================================================
--- container-2.6.22-rc2-mm1.orig/kernel/container.c
+++ container-2.6.22-rc2-mm1/kernel/container.c
@@ -679,6 +679,109 @@ static inline void get_first_subsys(cons
*subsys_id = test_ss->subsys_id;
}
+/*
+ * Attach task 'tsk' to container 'cont'
+ *
+ * Call holding container_mutex. May take task_lock of
+ * the task 'pid' during call.
+ */
+
+static int attach_task(struct container *cont, struct task_struct *tsk)
+{
+ int retval = 0;
+ struct container_subsys *ss;
+ struct container *oldcont;
+ struct css_group *cg = &tsk->containers;
+ struct containerfs_root *root = cont->root;
+ int i;
+
+ int subsys_id;
+ get_first_subsys(cont, NULL, &subsys_id);
+
+ /* Nothing to do if the task is already in that container */
+ oldcont = task_container(tsk, subsys_id);
+ if (cont == oldcont)
+ return 0;
+
+ for_each_subsys(root, ss) {
+ if (ss->can_attach) {
+ retval = ss->can_attach(ss, cont, tsk);
+ if (retval) {
+ return retval;
+ }
+ }
+ }
+
+ task_lock(tsk);
+ if (tsk->flags & PF_EXITING) {
+ task_unlock(tsk);
+ return -ESRCH;
+ }
+ /* Update the css_group pointers for the subsystems in this
+ * hierarchy */
+ for (i = 0; i < CONTAINER_SUBSYS_COUNT; i++) {
+ if (root->subsys_bits & (1ull << i)) {
+ /* Subsystem is in this hierarchy. So we want
+ * the subsystem state from the new
+ * container. Transfer the refcount from the
+ * old to the new */
+ atomic_inc(&cont->count);
+ atomic_dec(&cg->subsys[i]->container->count);
+ rcu_assign_pointer(cg->subsys[i], cont->subsys[i]);
+ }
+ }
+ task_unlock(tsk);
+
+ for_each_subsys(root, ss) {
+ if (ss->attach) {
+ ss->attach(ss, cont, oldcont, tsk);
+ }
+ }
+
+ synchronize_rcu();
+ return 0;
+}
+
+/*
+ * Attach task with pid 'pid' to container 'cont'. Call with
+ * container_mutex, may take task_lock of task
+ *
+ */
+
+static int attach_task_by_pid(struct container *cont, char *pidbuf)
+{
+ pid_t pid;
+ struct task_struct *tsk;
+ int ret;
+
+ if (sscanf(pidbuf, "%d", &pid) != 1)
+ return -EIO;
+
+ if (pid) {
+ rcu_read_lock();
+ tsk = find_task_by_pid(pid);
+ if (!tsk || tsk->flags & PF_EXITING) {
+ rcu_read_unlock();
+ return -ESRCH;
+ }
+ get_task_struct(tsk);
+ rcu_read_unlock();
+
+ if ((current->euid) && (current->euid != tsk->uid)
+ && (current->euid != tsk->suid)) {
+ put_task_struct(tsk);
+ return -EACCES;
+ }
+ } else {
+ tsk = current;
+ get_task_struct(tsk);
+ }
+
+ ret = attach_task(cont, tsk);
+ put_task_struct(tsk);
+ return ret;
+}
+
/* The various types of files and directories in a container file system */
typedef enum {
@@ -687,6 +790,54 @@ typedef enum {
FILE_TASKLIST,
} container_filetype_t;
+static ssize_t container_common_file_write(struct container *cont,
+ struct cftype *cft,
+ struct file *file,
+ const char __user *userbuf,
+ size_t nbytes, loff_t *unused_ppos)
+{
+ container_filetype_t type = cft->private;
+ char *buffer;
+ int retval = 0;
+
+ if (nbytes >= PATH_MAX)
+ return -E2BIG;
+
+ /* +1 for nul-terminator */
+ if ((buffer = kmalloc(nbytes + 1, GFP_KERNEL)) == 0)
+ return -ENOMEM;
+
+ if (copy_from_user(buffer, userbuf, nbytes)) {
+ retval = -EFAULT;
+ goto out1;
+ }
+ buffer[nbytes] = 0; /* nul-terminate */
+
+ mutex_lock(&container_mutex);
+
+ if (container_is_removed(cont)) {
+ retval = -ENODEV;
+ goto out2;
+ }
+
+ switch (type) {
+ case FILE_TASKLIST:
+ retval = attach_task_by_pid(cont, buffer);
+ break;
+ default:
+ retval = -EINVAL;
+ goto out2;
+ }
+
+ if (retval == 0)
+ retval = nbytes;
+out2:
+ mutex_unlock(&container_mutex);
+out1:
+ kfree(buffer);
+ return retval;
+}
+
static ssize_t container_file_write(struct file *file, const char __user *buf,
size_t nbytes, loff_t *ppos)
{
@@ -875,6 +1026,187 @@ int container_add_files(struct container
return 0;
}
+/* Count the number of tasks in a container. Could be made more
+ * time-efficient but less space-efficient with more linked lists
+ * running through each container and the css_group structures that
+ * referenced it. Must be called with tasklist_lock held for read or
+ * write or in an rcu critical section. */
+
+int __container_task_count(const struct container *cont)
+{
+ int count = 0;
+ struct task_struct *g, *p;
+ struct container_subsys_state *css;
+ int subsys_id;
+ get_first_subsys(cont, &css, &subsys_id);
+
+ do_each_thread(g, p) {
+ if (task_subsys_state(p, subsys_id) == css)
+ count ++;
+ } while_each_thread(g, p);
+ return count;
+}
+
+/*
+ * Stuff for reading the 'tasks' file.
+ *
+ * Reading this file can return large amounts of data if a container has
+ * *lots* of attached tasks. So it may need several calls to read(),
+ * but we cannot guarantee that the information we produce is correct
+ * unless we produce it entirely atomically.
+ *
+ * Upon tasks file open(), a struct ctr_struct is allocated, that
+ * will have a pointer to an array (also allocated here). The struct
+ * ctr_struct * is stored in file->private_data. Its resources will
+ * be freed by release() when the file is closed. The array is used
+ * to sprintf the PIDs and then used by read().
+ */
+
+/* containers_tasks_read array */
+
+struct ctr_struct {
+ char *buf;
+ int bufsz;
+};
+
+/*
+ * Load into 'pidarray' up to 'npids' of the tasks using container
+ * 'cont'. Return actual number of pids loaded. No need to
+ * task_lock(p) when reading out p->container, since we're in an RCU
+ * read section, so the css_group can't go away, and is
+ * immutable after creation.
+ */
+static int pid_array_load(pid_t *pidarray, int npids, struct container *cont)
+{
+ int n = 0;
+ struct task_struct *g, *p;
+ struct container_subsys_state *css;
+ int subsys_id;
+ get_first_subsys(cont, &css, &subsys_id);
+ rcu_read_lock();
+
+ do_each_thread(g, p) {
+ if (task_subsys_state(p, subsys_id) == css) {
+ pidarray[n++] = pid_nr(task_pid(p));
+ if (unlikely(n == npids))
+ goto array_full;
+ }
+ } while_each_thread(g, p);
+
+array_full:
+ rcu_read_unlock();
+ return n;
+}
+
+static int cmppid(const void *a, const void *b)
+{
+ return *(pid_t *)a - *(pid_t *)b;
+}
+
+/*
+ * Convert array 'a' of 'npids' pid_t's to a string of newline separated
+ * decimal pids in 'buf'. Don't write more than 'sz' chars, but return
+ * count 'cnt' of how many chars would be written if buf were large enough.
+ */
+static int pid_array_to_buf(char *buf, int sz, pid_t *a, int npids)
+{
+ int cnt = 0;
+ int i;
+
+ for (i = 0; i < npids; i++)
+ cnt += snprintf(buf + cnt, max(sz - cnt, 0), "%d\n", a[i]);
+ return cnt;
+}
+
+/*
+ * Handle an open on 'tasks' file. Prepare a buffer listing the
+ * process id's of tasks currently attached to the container being opened.
+ *
+ * Does not require any specific container mutexes, and does not take any.
+ */
+static int container_tasks_open(struct inode *unused, struct file *file)
+{
+ struct container *cont = __d_cont(file->f_dentry->d_parent);
+ struct ctr_struct *ctr;
+ pid_t *pidarray;
+ int npids;
+ char c;
+
+ if (!(file->f_mode & FMODE_READ))
+ return 0;
+
+ ctr = kmalloc(sizeof(*ctr), GFP_KERNEL);
+ if (!ctr)
+ goto err0;
+
+ /*
+ * If container gets more users after we read count, we won't have
+ * enough space - tough. This race is indistinguishable to the
+ * caller from the case that the additional container users didn't
+ * show up until sometime later on.
+ */
+ npids = container_task_count(cont);
+ pidarray = kmalloc(npids * sizeof(pid_t), GFP_KERNEL);
+ if (!pidarray)
+ goto err1;
+
+ npids = pid_array_load(pidarray, npids, cont);
+ sort(pidarray, npids, sizeof(pid_t), cmppid, NULL);
+
+ /* Call pid_array_to_buf() twice, first just to get bufsz */
+ ctr->bufsz = pid_array_to_buf(&c, sizeof(c), pidarray, npids) + 1;
+ ctr->buf = kmalloc(ctr->bufsz, GFP_KERNEL);
+ if (!ctr->buf)
+ goto err2;
+ ctr->bufsz = pid_array_to_buf(ctr->buf, ctr->bufsz, pidarray, npids);
+
+ kfree(pidarray);
+ file->private_data = ctr;
+ return 0;
+
+err2:
+ kfree(pidarray);
+err1:
+ kfree(ctr);
+err0:
+ return -ENOMEM;
+}
+
+static ssize_t container_tasks_read(struct container *cont,
+ struct cftype *cft,
+ struct file *file, char __user *buf,
+ size_t nbytes, loff_t *ppos)
+{
+ struct ctr_struct *ctr = file->private_data;
+
+ return simple_read_from_buffer(buf, nbytes, ppos, ctr->buf, ctr->bufsz);
+}
+
+static int container_tasks_release(struct inode *unused_inode, struct file *file)
+{
+ struct ctr_struct *ctr;
+
+ if (file->f_mode & FMODE_READ) {
+ ctr = file->private_data;
+ kfree(ctr->buf);
+ kfree(ctr);
+ }
+ return 0;
+}
+
+/*
+ * for the common functions, 'private' gives the type of file
+ */
+
+static struct cftype cft_tasks = {
+ .name = "tasks",
+ .open = container_tasks_open,
+ .read = container_tasks_read,
+ .write = container_common_file_write,
+ .release = container_tasks_release,
+ .private = FILE_TASKLIST,
+};
+
static int container_populate_dir(struct container *cont)
{
int err;
@@ -883,6 +1215,9 @@ static int container_populate_dir(struct
/* First clear out any existing files */
container_clear_directory(cont->dentry);
+ if ((err = container_add_file(cont, &cft_tasks)) < 0)
+ return err;
+
for_each_subsys(cont->root, ss) {
if (ss->populate && (err = ss->populate(ss, cont)) < 0)
return err;
--
next prev parent reply other threads:[~2007-05-29 13:24 UTC|newest]
Thread overview: 58+ messages / expand[flat|nested] mbox.gz Atom feed top
2007-05-29 13:01 [PATCH 00/10] Containers(V10): Generic Process Containers menage
2007-05-29 13:01 ` [PATCH 01/10] Containers(V10): Basic container framework menage
2007-05-30 7:15 ` Andrew Morton
2007-05-30 7:15 ` Andrew Morton
2007-05-30 14:02 ` Paul Menage
2007-05-30 16:00 ` Andrew Morton
2007-06-13 10:17 ` Dhaval Giani
2007-05-29 13:01 ` [PATCH 02/10] Containers(V10): Example CPU accounting subsystem menage
2007-05-30 7:16 ` Andrew Morton
2007-05-29 13:01 ` menage [this message]
2007-06-07 14:00 ` [PATCH 03/10] Containers(V10): Add tasks file interface Cedric Le Goater
2007-06-07 17:12 ` Paul Menage
2007-05-29 13:01 ` [PATCH 04/10] Containers(V10): Add fork/exit hooks menage
2007-05-29 13:01 ` [PATCH 05/10] Containers(V10): Add container_clone() interface menage
2007-05-30 7:16 ` Andrew Morton
2007-05-31 19:56 ` Serge E. Hallyn
2007-05-29 13:01 ` [PATCH 06/10] Containers(V10): Add procfs interface menage
2007-05-29 13:01 ` [PATCH 07/10] Containers(V10): Make cpusets a client of containers menage
2007-05-29 13:01 ` [PATCH 08/10] Containers(V10): Share css_group arrays between tasks with same container memberships menage
2007-05-29 13:01 ` [PATCH 09/10] Containers(V10): Simple debug info subsystem menage
2007-05-29 13:01 ` [PATCH 10/10] Containers(V10): Support for automatic userspace release agents menage
2007-05-30 7:14 ` [PATCH 00/10] Containers(V10): Generic Process Containers Andrew Morton
2007-05-30 7:39 ` William Lee Irwin III
2007-06-28 21:27 ` Paul Menage
2007-06-28 22:13 ` [ckrm-tech] " Srivatsa Vaddagiri
2007-05-30 8:09 ` Balbir Singh
2007-05-30 9:02 ` Pavel Emelianov
2007-05-30 9:02 ` Balbir Singh
2007-05-30 10:48 ` Pavel Emelianov
2007-06-04 19:14 ` Serge E. Hallyn
2007-06-04 19:31 ` Paul Jackson
2007-06-04 20:30 ` Paul Menage
2007-06-04 20:37 ` Paul Jackson
2007-06-04 20:41 ` Serge E. Hallyn
2007-06-04 21:05 ` Paul Jackson
2007-06-06 22:39 ` Serge E. Hallyn
2007-06-06 22:43 ` Paul Jackson
2007-06-07 0:05 ` Serge E. Hallyn
2007-06-07 0:46 ` [ckrm-tech] " Paul Jackson
2007-06-07 18:01 ` Serge E. Hallyn
2007-06-07 19:21 ` Paul Jackson
2007-06-07 20:17 ` Serge E. Hallyn
2007-06-07 22:01 ` Paul Jackson
2007-06-08 14:32 ` Serge E. Hallyn
2007-06-08 15:55 ` Paul Menage
2007-06-08 16:08 ` Serge E. Hallyn
2007-06-08 16:16 ` Paul Menage
2007-06-08 18:08 ` Serge E. Hallyn
2007-06-08 18:13 ` Paul Menage
2007-06-08 19:42 ` Serge E. Hallyn
2007-06-08 20:05 ` Paul Jackson
2007-06-08 17:37 ` Paul Jackson
2007-06-04 20:32 ` Paul Menage
2007-06-04 20:51 ` Serge E. Hallyn
2007-06-04 20:56 ` Paul Menage
2007-06-04 21:11 ` Serge E. Hallyn
2007-06-04 21:16 ` Paul Jackson
2007-06-04 21:09 ` [ckrm-tech] " Paul Jackson
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20070529132142.428218000@menage.corp.google.com \
--to=menage@google.com \
--cc=akpm@linux-foundation.org \
--cc=balbir@in.ibm.com \
--cc=ckrm-tech@lists.sourceforge.net \
--cc=containers@lists.osdl.org \
--cc=cpw@sgi.com \
--cc=dev@sw.ru \
--cc=devel@openvz.org \
--cc=ebiederm@xmission.com \
--cc=haveblue@us.ibm.com \
--cc=linux-kernel@vger.kernel.org \
--cc=mbligh@google.com \
--cc=pj@sgi.com \
--cc=rohitseth@google.com \
--cc=serue@us.ibm.com \
--cc=svaidy@linux.vnet.ibm.com \
--cc=vatsa@in.ibm.com \
--cc=xemul@sw.ru \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.