* [PATCH 1/6] Adds a read-only "procs" file similar to "tasks" that shows only unique tgids
[not found] ` <20090724032033.2463.79256.stgit-/yCBOHwbXCxd3OlUiQof+WCaruZE5nAUZeezCHUQhQ4@public.gmane.org>
@ 2009-07-24 3:21 ` Ben Blum
2009-07-24 3:21 ` [PATCH 2/6] Ensures correct concurrent opening/reading of pidlists across pid namespaces Ben Blum
` (4 subsequent siblings)
5 siblings, 0 replies; 29+ messages in thread
From: Ben Blum @ 2009-07-24 3:21 UTC (permalink / raw)
To: linux-kernel-u79uwXL29TY76Z2rM5mHXA,
containers-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA,
akpm-de/tnXTf+JLsfHDXvbKv3WD2FQJk+8+b,
serue-r/Jw6+rmf7HQT0dZR+AlfA, lizf-BthXqXjhjHXQFUHtdCDX3A,
menage-hpIqsD4AKlfQT0dZR+AlfA, bblum-hpIqsD4AKlfQT0dZR+AlfA
Adds a read-only "procs" file similar to "tasks" that shows only unique tgids
struct cgroup used to have a bunch of fields for keeping track of the pidlist
for the tasks file. Those are now separated into a new struct cgroup_pidlist,
of which two are had, one for procs and one for tasks. The way the seq_file
operations are set up is changed so that just the pidlist struct gets passed
around as the private data.
Interface example: Suppose a multithreaded process has pid 1000 and other
threads with ids 1001, 1002, 1003:
$ cat tasks
1000
1001
1002
1003
$ cat cgroup.procs
1000
$
Signed-off-by: Ben Blum <bblum-hpIqsD4AKlfQT0dZR+AlfA@public.gmane.org>
---
include/linux/cgroup.h | 22 ++--
kernel/cgroup.c | 278 ++++++++++++++++++++++++++++++------------------
2 files changed, 186 insertions(+), 114 deletions(-)
diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index 665fa70..8a3a3ac 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -141,6 +141,17 @@ enum {
CGRP_WAIT_ON_RMDIR,
};
+struct cgroup_pidlist {
+ /* protects the other fields */
+ struct rw_semaphore mutex;
+ /* array of xids */
+ pid_t *list;
+ /* how many elements the above list has */
+ int length;
+ /* how many files are using the current array */
+ int use_count;
+};
+
struct cgroup {
unsigned long flags; /* "unsigned long" so bitops work */
@@ -179,14 +190,9 @@ struct cgroup {
*/
struct list_head release_list;
- /* pids_mutex protects the fields below */
- struct rw_semaphore pids_mutex;
- /* Array of process ids in the cgroup */
- pid_t *tasks_pids;
- /* How many files are using the current tasks_pids array */
- int pids_use_count;
- /* Length of the current tasks_pids array */
- int pids_length;
+ /* we will have two separate pidlists, one for pids (the tasks file)
+ * and one for tgids (the procs file). */
+ struct cgroup_pidlist tasks, procs;
/* For RCU-protected deletion */
struct rcu_head rcu_head;
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 3737a68..943d59b 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -960,7 +960,8 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp)
INIT_LIST_HEAD(&cgrp->children);
INIT_LIST_HEAD(&cgrp->css_sets);
INIT_LIST_HEAD(&cgrp->release_list);
- init_rwsem(&cgrp->pids_mutex);
+ init_rwsem(&(cgrp->tasks.mutex));
+ init_rwsem(&(cgrp->procs.mutex));
}
static void init_cgroup_root(struct cgroupfs_root *root)
{
@@ -1408,15 +1409,6 @@ static int cgroup_tasks_write(struct cgroup *cgrp, struct cftype *cft, u64 pid)
return ret;
}
-/* The various types of files and directories in a cgroup file system */
-enum cgroup_filetype {
- FILE_ROOT,
- FILE_DIR,
- FILE_TASKLIST,
- FILE_NOTIFY_ON_RELEASE,
- FILE_RELEASE_AGENT,
-};
-
/**
* cgroup_lock_live_group - take cgroup_mutex and check that cgrp is alive.
* @cgrp: the cgroup to be checked for liveness
@@ -2114,7 +2106,7 @@ int cgroup_scan_tasks(struct cgroup_scanner *scan)
}
/*
- * Stuff for reading the 'tasks' file.
+ * Stuff for reading the 'tasks'/'procs' files.
*
* Reading this file can return large amounts of data if a cgroup has
* *lots* of attached tasks. So it may need several calls to read(),
@@ -2124,27 +2116,106 @@ int cgroup_scan_tasks(struct cgroup_scanner *scan)
*/
/*
- * Load into 'pidarray' up to 'npids' of the tasks using cgroup
- * 'cgrp'. Return actual number of pids loaded. No need to
- * task_lock(p) when reading out p->cgroup, since we're in an RCU
- * read section, so the css_set can't go away, and is
- * immutable after creation.
+ * pidlist_uniq - given a kmalloc()ed list, strip out all duplicate entries
+ * If the new stripped list is sufficiently smaller and there's enough memory
+ * to allocate a new buffer, will let go of the unneeded memory. Returns the
+ * number of unique elements.
*/
-static int pid_array_load(pid_t *pidarray, int npids, struct cgroup *cgrp)
+/* is the size difference enough that we should re-allocate the array? */
+#define PIDLIST_REALLOC_DIFFERENCE(old,new) ((old) - PAGE_SIZE >= (new))
+static int pidlist_uniq(pid_t **p, int length)
{
- int n = 0, pid;
+ int src, dest = 1;
+ pid_t *list = *p;
+ pid_t *newlist;
+
+ /*
+ * we presume the 0th element is unique, so i starts at 1. trivial
+ * edge cases first; no work needs to be done for either
+ */
+ if (length == 0 || length == 1)
+ return length;
+ /* src and dest walk down the list; dest counts unique elements */
+ for (src = 1; src < length; src++) {
+ /* find next unique element */
+ while (list[src] == list[src-1]) {
+ src++;
+ if (src == length)
+ goto after;
+ }
+ /* dest always points to where the next unique element goes */
+ list[dest] = list[src];
+ dest++;
+ }
+after:
+ /*
+ * if the length difference is large enough, we want to allocate a
+ * smaller buffer to save memory. if this fails due to out of memory,
+ * we'll just stay with what we've got.
+ */
+ if (PIDLIST_REALLOC_DIFFERENCE(length, dest)) {
+ newlist = krealloc(list, dest * sizeof(pid_t), GFP_KERNEL);
+ if (newlist)
+ *p = newlist;
+ }
+ return dest;
+}
+
+static int cmppid(const void *a, const void *b)
+{
+ return *(pid_t *)a - *(pid_t *)b;
+}
+
+/*
+ * Load a cgroup's pidarray with either procs' tgids or tasks' pids
+ */
+static int pidlist_array_load(struct cgroup *cgrp, bool procs)
+{
+ pid_t *array;
+ int length;
+ int pid, n = 0; /* used for populating the array */
struct cgroup_iter it;
struct task_struct *tsk;
+ struct cgroup_pidlist *l;
+
+ /*
+ * If cgroup gets more users after we read count, we won't have
+ * enough space - tough. This race is indistinguishable to the
+ * caller from the case that the additional cgroup users didn't
+ * show up until sometime later on.
+ */
+ length = cgroup_task_count(cgrp);
+ array = kmalloc(length * sizeof(pid_t), GFP_KERNEL);
+ if (!array)
+ return -ENOMEM;
+ /* now, populate the array */
cgroup_iter_start(cgrp, &it);
while ((tsk = cgroup_iter_next(cgrp, &it))) {
- if (unlikely(n == npids))
+ if (unlikely(n == length))
break;
- pid = task_pid_vnr(tsk);
- if (pid > 0)
- pidarray[n++] = pid;
+ /* get tgid or pid for procs or tasks file respectively */
+ pid = (procs ? task_tgid_vnr(tsk) : task_pid_vnr(tsk));
+ if (pid > 0) /* make sure to only use valid results */
+ array[n++] = pid;
}
cgroup_iter_end(cgrp, &it);
- return n;
+ length = n;
+ /* now sort & (if procs) strip out duplicates */
+ sort(array, length, sizeof(pid_t), cmppid, NULL);
+ if (procs) {
+ length = pidlist_uniq(&array, length);
+ l = &(cgrp->procs);
+ } else {
+ l = &(cgrp->tasks);
+ }
+ /* store array in cgroup, freeing old if necessary */
+ down_write(&l->mutex);
+ kfree(l->list);
+ l->list = array;
+ l->length = length;
+ l->use_count++;
+ up_write(&l->mutex);
+ return 0;
}
/**
@@ -2201,19 +2272,14 @@ err:
return ret;
}
-static int cmppid(const void *a, const void *b)
-{
- return *(pid_t *)a - *(pid_t *)b;
-}
-
/*
- * seq_file methods for the "tasks" file. The seq_file position is the
+ * seq_file methods for the tasks/procs files. The seq_file position is the
* next pid to display; the seq_file iterator is a pointer to the pid
- * in the cgroup->tasks_pids array.
+ * in the cgroup->l->list array.
*/
-static void *cgroup_tasks_start(struct seq_file *s, loff_t *pos)
+static void *cgroup_pidlist_start(struct seq_file *s, loff_t *pos)
{
/*
* Initially we receive a position value that corresponds to
@@ -2221,46 +2287,45 @@ static void *cgroup_tasks_start(struct seq_file *s, loff_t *pos)
* after a seek to the start). Use a binary-search to find the
* next pid to display, if any
*/
- struct cgroup *cgrp = s->private;
+ struct cgroup_pidlist *l = s->private;
int index = 0, pid = *pos;
int *iter;
- down_read(&cgrp->pids_mutex);
+ down_read(&l->mutex);
if (pid) {
- int end = cgrp->pids_length;
+ int end = l->length;
while (index < end) {
int mid = (index + end) / 2;
- if (cgrp->tasks_pids[mid] == pid) {
+ if (l->list[mid] == pid) {
index = mid;
break;
- } else if (cgrp->tasks_pids[mid] <= pid)
+ } else if (l->list[mid] <= pid)
index = mid + 1;
else
end = mid;
}
}
/* If we're off the end of the array, we're done */
- if (index >= cgrp->pids_length)
+ if (index >= l->length)
return NULL;
/* Update the abstract position to be the actual pid that we found */
- iter = cgrp->tasks_pids + index;
+ iter = l->list + index;
*pos = *iter;
return iter;
}
-static void cgroup_tasks_stop(struct seq_file *s, void *v)
+static void cgroup_pidlist_stop(struct seq_file *s, void *v)
{
- struct cgroup *cgrp = s->private;
- up_read(&cgrp->pids_mutex);
+ struct cgroup_pidlist *l = s->private;
+ up_read(&l->mutex);
}
-static void *cgroup_tasks_next(struct seq_file *s, void *v, loff_t *pos)
+static void *cgroup_pidlist_next(struct seq_file *s, void *v, loff_t *pos)
{
- struct cgroup *cgrp = s->private;
- int *p = v;
- int *end = cgrp->tasks_pids + cgrp->pids_length;
-
+ struct cgroup_pidlist *l = s->private;
+ pid_t *p = v;
+ pid_t *end = l->list + l->length;
/*
* Advance to the next pid in the array. If this goes off the
* end, we're done
@@ -2274,98 +2339,94 @@ static void *cgroup_tasks_next(struct seq_file *s, void *v, loff_t *pos)
}
}
-static int cgroup_tasks_show(struct seq_file *s, void *v)
+static int cgroup_pidlist_show(struct seq_file *s, void *v)
{
return seq_printf(s, "%d\n", *(int *)v);
}
-static struct seq_operations cgroup_tasks_seq_operations = {
- .start = cgroup_tasks_start,
- .stop = cgroup_tasks_stop,
- .next = cgroup_tasks_next,
- .show = cgroup_tasks_show,
+/*
+ * seq_operations functions for iterating on pidlists through seq_file -
+ * independent of whether it's tasks or procs
+ */
+static const struct seq_operations cgroup_pidlist_seq_operations = {
+ .start = cgroup_pidlist_start,
+ .stop = cgroup_pidlist_stop,
+ .next = cgroup_pidlist_next,
+ .show = cgroup_pidlist_show,
};
-static void release_cgroup_pid_array(struct cgroup *cgrp)
+static void cgroup_release_pid_array(struct cgroup_pidlist *l)
{
- down_write(&cgrp->pids_mutex);
- BUG_ON(!cgrp->pids_use_count);
- if (!--cgrp->pids_use_count) {
- kfree(cgrp->tasks_pids);
- cgrp->tasks_pids = NULL;
- cgrp->pids_length = 0;
+ down_write(&l->mutex);
+ BUG_ON(!l->use_count);
+ if (!--l->use_count) {
+ kfree(l->list);
+ l->list = NULL;
+ l->length = 0;
}
- up_write(&cgrp->pids_mutex);
+ up_write(&l->mutex);
}
-static int cgroup_tasks_release(struct inode *inode, struct file *file)
+static int cgroup_pidlist_release(struct inode *inode, struct file *file)
{
- struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent);
-
+ struct cgroup_pidlist *l;
if (!(file->f_mode & FMODE_READ))
return 0;
-
- release_cgroup_pid_array(cgrp);
+ /*
+ * the seq_file will only be initialized if the file was opened for
+ * reading; hence we check if it's not null only in that case.
+ */
+ l = ((struct seq_file *)file->private_data)->private;
+ cgroup_release_pid_array(l);
return seq_release(inode, file);
}
-static struct file_operations cgroup_tasks_operations = {
+static const struct file_operations cgroup_pidlist_operations = {
.read = seq_read,
.llseek = seq_lseek,
.write = cgroup_file_write,
- .release = cgroup_tasks_release,
+ .release = cgroup_pidlist_release,
};
/*
- * Handle an open on 'tasks' file. Prepare an array containing the
- * process id's of tasks currently attached to the cgroup being opened.
+ * The following functions handle opens on a file that displays a pidlist
+ * (tasks or procs). Prepare an array of the process/thread IDs of whoever's
+ * in the cgroup.
*/
-
-static int cgroup_tasks_open(struct inode *unused, struct file *file)
+/* helper function for the two below it */
+static int cgroup_pidlist_open(struct file *file, bool procs)
{
struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent);
- pid_t *pidarray;
- int npids;
+ struct cgroup_pidlist *l = (procs ? &cgrp->procs : &cgrp->tasks);
int retval;
/* Nothing to do for write-only files */
if (!(file->f_mode & FMODE_READ))
return 0;
- /*
- * If cgroup gets more users after we read count, we won't have
- * enough space - tough. This race is indistinguishable to the
- * caller from the case that the additional cgroup users didn't
- * show up until sometime later on.
- */
- npids = cgroup_task_count(cgrp);
- pidarray = kmalloc(npids * sizeof(pid_t), GFP_KERNEL);
- if (!pidarray)
- return -ENOMEM;
- npids = pid_array_load(pidarray, npids, cgrp);
- sort(pidarray, npids, sizeof(pid_t), cmppid, NULL);
-
- /*
- * Store the array in the cgroup, freeing the old
- * array if necessary
- */
- down_write(&cgrp->pids_mutex);
- kfree(cgrp->tasks_pids);
- cgrp->tasks_pids = pidarray;
- cgrp->pids_length = npids;
- cgrp->pids_use_count++;
- up_write(&cgrp->pids_mutex);
-
- file->f_op = &cgroup_tasks_operations;
+ /* have the array populated */
+ retval = pidlist_array_load(cgrp, procs);
+ if (retval)
+ return retval;
+ /* configure file information */
+ file->f_op = &cgroup_pidlist_operations;
- retval = seq_open(file, &cgroup_tasks_seq_operations);
+ retval = seq_open(file, &cgroup_pidlist_seq_operations);
if (retval) {
- release_cgroup_pid_array(cgrp);
+ cgroup_release_pid_array(l);
return retval;
}
- ((struct seq_file *)file->private_data)->private = cgrp;
+ ((struct seq_file *)file->private_data)->private = l;
return 0;
}
+static int cgroup_tasks_open(struct inode *unused, struct file *file)
+{
+ return cgroup_pidlist_open(file, false);
+}
+static int cgroup_procs_open(struct inode *unused, struct file *file)
+{
+ return cgroup_pidlist_open(file, true);
+}
static u64 cgroup_read_notify_on_release(struct cgroup *cgrp,
struct cftype *cft)
@@ -2388,21 +2449,27 @@ static int cgroup_write_notify_on_release(struct cgroup *cgrp,
/*
* for the common functions, 'private' gives the type of file
*/
+/* for hysterical raisins, we can't put this on the older files */
+#define CGROUP_FILE_GENERIC_PREFIX "cgroup."
static struct cftype files[] = {
{
.name = "tasks",
.open = cgroup_tasks_open,
.write_u64 = cgroup_tasks_write,
- .release = cgroup_tasks_release,
- .private = FILE_TASKLIST,
+ .release = cgroup_pidlist_release,
.mode = S_IRUGO | S_IWUSR,
},
-
+ {
+ .name = CGROUP_FILE_GENERIC_PREFIX "procs",
+ .open = cgroup_procs_open,
+ /* .write_u64 = cgroup_procs_write, TODO */
+ .release = cgroup_pidlist_release,
+ .mode = S_IRUGO,
+ },
{
.name = "notify_on_release",
.read_u64 = cgroup_read_notify_on_release,
.write_u64 = cgroup_write_notify_on_release,
- .private = FILE_NOTIFY_ON_RELEASE,
},
};
@@ -2411,7 +2478,6 @@ static struct cftype cft_release_agent = {
.read_seq_string = cgroup_release_agent_show,
.write_string = cgroup_release_agent_write,
.max_write_len = PATH_MAX,
- .private = FILE_RELEASE_AGENT,
};
static int cgroup_populate_dir(struct cgroup *cgrp)
^ permalink raw reply related [flat|nested] 29+ messages in thread* [PATCH 2/6] Ensures correct concurrent opening/reading of pidlists across pid namespaces
[not found] ` <20090724032033.2463.79256.stgit-/yCBOHwbXCxd3OlUiQof+WCaruZE5nAUZeezCHUQhQ4@public.gmane.org>
2009-07-24 3:21 ` [PATCH 1/6] Adds a read-only "procs" file similar to "tasks" that shows only unique tgids Ben Blum
@ 2009-07-24 3:21 ` Ben Blum
2009-07-24 3:21 ` [PATCH 3/6] Quick vmalloc vs kmalloc fix to the case where array size is too large Ben Blum
` (3 subsequent siblings)
5 siblings, 0 replies; 29+ messages in thread
From: Ben Blum @ 2009-07-24 3:21 UTC (permalink / raw)
To: linux-kernel-u79uwXL29TY76Z2rM5mHXA,
containers-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA,
akpm-de/tnXTf+JLsfHDXvbKv3WD2FQJk+8+b,
serue-r/Jw6+rmf7HQT0dZR+AlfA, lizf-BthXqXjhjHXQFUHtdCDX3A,
menage-hpIqsD4AKlfQT0dZR+AlfA, bblum-hpIqsD4AKlfQT0dZR+AlfA
Ensures correct concurrent opening/reading of pidlists across pid namespaces
Previously there was the problem in which two processes from different pid
namespaces reading the tasks or procs file could result in one process seeing
results from the other's namespace. Rather than one pidlist for each file in a
cgroup, we now keep a list of pidlists keyed by namespace and file type (tasks
versus procs) in which entries are placed on demand. Each pidlist has its own
lock, and that the pidlists themselves are passed around in the seq_file's
private pointer means we don't have to touch the cgroup or its master list
except when creating and destroying entries.
Signed-off-by: Ben Blum <bblum-hpIqsD4AKlfQT0dZR+AlfA@public.gmane.org>
---
include/linux/cgroup.h | 34 +++++++++++++--
kernel/cgroup.c | 109 +++++++++++++++++++++++++++++++++++++++++-------
2 files changed, 121 insertions(+), 22 deletions(-)
diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index 8a3a3ac..b934b72 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -141,15 +141,36 @@ enum {
CGRP_WAIT_ON_RMDIR,
};
+/* which pidlist file are we talking about? */
+enum cgroup_filetype {
+ CGROUP_FILE_PROCS,
+ CGROUP_FILE_TASKS,
+};
+
+/*
+ * A pidlist is a list of pids that virtually represents the contents of one
+ * of the cgroup files ("procs" or "tasks"). We keep a list of such pidlists,
+ * a pair (one each for procs, tasks) for each pid namespace that's relevant
+ * to the cgroup.
+ */
struct cgroup_pidlist {
- /* protects the other fields */
- struct rw_semaphore mutex;
+ /*
+ * used to find which pidlist is wanted. doesn't change as long as
+ * this particular list stays in the list.
+ */
+ struct { enum cgroup_filetype type; struct pid_namespace *ns; } key;
/* array of xids */
pid_t *list;
/* how many elements the above list has */
int length;
/* how many files are using the current array */
int use_count;
+ /* each of these stored in a list by its cgroup */
+ struct list_head links;
+ /* pointer to the cgroup we belong to, for list removal purposes */
+ struct cgroup *owner;
+ /* protects the other fields */
+ struct rw_semaphore mutex;
};
struct cgroup {
@@ -190,9 +211,12 @@ struct cgroup {
*/
struct list_head release_list;
- /* we will have two separate pidlists, one for pids (the tasks file)
- * and one for tgids (the procs file). */
- struct cgroup_pidlist tasks, procs;
+ /*
+ * list of pidlists, up to two for each namespace (one for procs, one
+ * for tasks); created on demand.
+ */
+ struct list_head pidlists;
+ struct mutex pidlist_mutex;
/* For RCU-protected deletion */
struct rcu_head rcu_head;
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 943d59b..b3773a5 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -47,6 +47,7 @@
#include <linux/hash.h>
#include <linux/namei.h>
#include <linux/smp_lock.h>
+#include <linux/pid_namespace.h>
#include <asm/atomic.h>
@@ -675,6 +676,12 @@ static void cgroup_diput(struct dentry *dentry, struct inode *inode)
*/
deactivate_super(cgrp->root->sb);
+ /*
+ * if we're getting rid of the cgroup, refcount should ensure
+ * that there are no pidlists left.
+ */
+ BUG_ON(!list_empty(&cgrp->pidlists));
+
call_rcu(&cgrp->rcu_head, free_cgroup_rcu);
}
iput(inode);
@@ -960,8 +967,8 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp)
INIT_LIST_HEAD(&cgrp->children);
INIT_LIST_HEAD(&cgrp->css_sets);
INIT_LIST_HEAD(&cgrp->release_list);
- init_rwsem(&(cgrp->tasks.mutex));
- init_rwsem(&(cgrp->procs.mutex));
+ INIT_LIST_HEAD(&cgrp->pidlists);
+ mutex_init(&cgrp->pidlist_mutex);
}
static void init_cgroup_root(struct cgroupfs_root *root)
{
@@ -2167,9 +2174,59 @@ static int cmppid(const void *a, const void *b)
}
/*
+ * find the appropriate pidlist for our purpose (given procs vs tasks)
+ * returns with the lock on that pidlist already held, and takes care
+ * of the use count, or returns NULL with no locks held if we're out of
+ * memory.
+ */
+static struct cgroup_pidlist *cgroup_pidlist_find(struct cgroup *cgrp,
+ enum cgroup_filetype type)
+{
+ struct cgroup_pidlist *l;
+ /* don't need task_nsproxy() if we're looking at ourself */
+ struct pid_namespace *ns = get_pid_ns(current->nsproxy->pid_ns);
+ /*
+ * We can't drop the pidlist_mutex before taking the l->mutex in case
+ * the last ref-holder is trying to remove l from the list at the same
+ * time. Holding the pidlist_mutex precludes somebody taking whichever
+ * list we find out from under us - compare release_pid_array().
+ */
+ mutex_lock(&cgrp->pidlist_mutex);
+ list_for_each_entry(l, &cgrp->pidlists, links) {
+ if (l->key.type == type && l->key.ns == ns) {
+ /* found a matching list - drop the extra refcount */
+ put_pid_ns(ns);
+ /* make sure l doesn't vanish out from under us */
+ down_write(&l->mutex);
+ mutex_unlock(&cgrp->pidlist_mutex);
+ l->use_count++;
+ return l;
+ }
+ }
+ /* entry not found; create a new one */
+ l = kmalloc(sizeof(struct cgroup_pidlist), GFP_KERNEL);
+ if (!l) {
+ mutex_unlock(&cgrp->pidlist_mutex);
+ put_pid_ns(ns);
+ return l;
+ }
+ init_rwsem(&l->mutex);
+ down_write(&l->mutex);
+ l->key.type = type;
+ l->key.ns = ns;
+ l->use_count = 0; /* don't increment here */
+ l->list = NULL;
+ l->owner = cgrp;
+ list_add(&l->links, &cgrp->pidlists);
+ mutex_unlock(&cgrp->pidlist_mutex);
+ return l;
+}
+
+/*
* Load a cgroup's pidarray with either procs' tgids or tasks' pids
*/
-static int pidlist_array_load(struct cgroup *cgrp, bool procs)
+static int pidlist_array_load(struct cgroup *cgrp, enum cgroup_filetype type,
+ struct cgroup_pidlist **lp)
{
pid_t *array;
int length;
@@ -2194,7 +2251,10 @@ static int pidlist_array_load(struct cgroup *cgrp, bool procs)
if (unlikely(n == length))
break;
/* get tgid or pid for procs or tasks file respectively */
- pid = (procs ? task_tgid_vnr(tsk) : task_pid_vnr(tsk));
+ if (type == CGROUP_FILE_PROCS)
+ pid = task_tgid_vnr(tsk);
+ else
+ pid = task_pid_vnr(tsk);
if (pid > 0) /* make sure to only use valid results */
array[n++] = pid;
}
@@ -2202,19 +2262,21 @@ static int pidlist_array_load(struct cgroup *cgrp, bool procs)
length = n;
/* now sort & (if procs) strip out duplicates */
sort(array, length, sizeof(pid_t), cmppid, NULL);
- if (procs) {
+ if (type == CGROUP_FILE_PROCS) {
length = pidlist_uniq(&array, length);
- l = &(cgrp->procs);
- } else {
- l = &(cgrp->tasks);
}
- /* store array in cgroup, freeing old if necessary */
- down_write(&l->mutex);
+ l = cgroup_pidlist_find(cgrp, type);
+ if (!l) {
+ kfree(array);
+ return -ENOMEM;
+ }
+ /* store array, freeing old if necessary - lock already held */
kfree(l->list);
l->list = array;
l->length = length;
l->use_count++;
up_write(&l->mutex);
+ *lp = l;
return 0;
}
@@ -2357,13 +2419,26 @@ static const struct seq_operations cgroup_pidlist_seq_operations = {
static void cgroup_release_pid_array(struct cgroup_pidlist *l)
{
+ /*
+ * the case where we're the last user of this particular pidlist will
+ * have us remove it from the cgroup's list, which entails taking the
+ * mutex. since in pidlist_find the pidlist->lock depends on cgroup->
+ * pidlist_mutex, we have to take pidlist_mutex first.
+ */
+ mutex_lock(&l->owner->pidlist_mutex);
down_write(&l->mutex);
BUG_ON(!l->use_count);
if (!--l->use_count) {
+ /* we're the last user if refcount is 0; remove and free */
+ list_del(&l->links);
+ mutex_unlock(&l->owner->pidlist_mutex);
kfree(l->list);
- l->list = NULL;
- l->length = 0;
+ put_pid_ns(l->key.ns);
+ up_write(&l->mutex);
+ kfree(l);
+ return;
}
+ mutex_unlock(&l->owner->pidlist_mutex);
up_write(&l->mutex);
}
@@ -2394,10 +2469,10 @@ static const struct file_operations cgroup_pidlist_operations = {
* in the cgroup.
*/
/* helper function for the two below it */
-static int cgroup_pidlist_open(struct file *file, bool procs)
+static int cgroup_pidlist_open(struct file *file, enum cgroup_filetype type)
{
struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent);
- struct cgroup_pidlist *l = (procs ? &cgrp->procs : &cgrp->tasks);
+ struct cgroup_pidlist *l;
int retval;
/* Nothing to do for write-only files */
@@ -2405,7 +2480,7 @@ static int cgroup_pidlist_open(struct file *file, bool procs)
return 0;
/* have the array populated */
- retval = pidlist_array_load(cgrp, procs);
+ retval = pidlist_array_load(cgrp, type, &l);
if (retval)
return retval;
/* configure file information */
@@ -2421,11 +2496,11 @@ static int cgroup_pidlist_open(struct file *file, bool procs)
}
static int cgroup_tasks_open(struct inode *unused, struct file *file)
{
- return cgroup_pidlist_open(file, false);
+ return cgroup_pidlist_open(file, CGROUP_FILE_TASKS);
}
static int cgroup_procs_open(struct inode *unused, struct file *file)
{
- return cgroup_pidlist_open(file, true);
+ return cgroup_pidlist_open(file, CGROUP_FILE_PROCS);
}
static u64 cgroup_read_notify_on_release(struct cgroup *cgrp,
^ permalink raw reply related [flat|nested] 29+ messages in thread* [PATCH 3/6] Quick vmalloc vs kmalloc fix to the case where array size is too large
[not found] ` <20090724032033.2463.79256.stgit-/yCBOHwbXCxd3OlUiQof+WCaruZE5nAUZeezCHUQhQ4@public.gmane.org>
2009-07-24 3:21 ` [PATCH 1/6] Adds a read-only "procs" file similar to "tasks" that shows only unique tgids Ben Blum
2009-07-24 3:21 ` [PATCH 2/6] Ensures correct concurrent opening/reading of pidlists across pid namespaces Ben Blum
@ 2009-07-24 3:21 ` Ben Blum
[not found] ` <20090724032150.2463.49019.stgit-/yCBOHwbXCxd3OlUiQof+WCaruZE5nAUZeezCHUQhQ4@public.gmane.org>
[not found] ` <4A6D37A4.6020605@cn.fujitsu.com>
2009-07-24 3:21 ` [PATCH 4/6] Changes css_set freeing mechanism to be under RCU Ben Blum
` (2 subsequent siblings)
5 siblings, 2 replies; 29+ messages in thread
From: Ben Blum @ 2009-07-24 3:21 UTC (permalink / raw)
To: linux-kernel-u79uwXL29TY76Z2rM5mHXA,
containers-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA,
akpm-de/tnXTf+JLsfHDXvbKv3WD2FQJk+8+b,
serue-r/Jw6+rmf7HQT0dZR+AlfA, lizf-BthXqXjhjHXQFUHtdCDX3A,
menage-hpIqsD4AKlfQT0dZR+AlfA, bblum-hpIqsD4AKlfQT0dZR+AlfA
Quick vmalloc vs kmalloc fix to the case where array size is too large
Separates all pidlist allocation requests to a separate function that judges
based on the requested size whether or not the array needs to be vmalloced or
can be gotten via kmalloc, and similar for kfree/vfree. Should be replaced
entirely with a kernel-wide solution to this general problem.
Depends on cgroup-pidlist-namespace.patch, cgroup-procs.patch
Signed-off-by: Ben Blum <bblum-hpIqsD4AKlfQT0dZR+AlfA@public.gmane.org>
---
kernel/cgroup.c | 47 ++++++++++++++++++++++++++++++++++++++++++-----
1 files changed, 42 insertions(+), 5 deletions(-)
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index b3773a5..7e6b183 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -48,6 +48,7 @@
#include <linux/namei.h>
#include <linux/smp_lock.h>
#include <linux/pid_namespace.h>
+#include <linux/vmalloc.h> /* TODO: replace with more sophisticated array */
#include <asm/atomic.h>
@@ -2123,6 +2124,42 @@ int cgroup_scan_tasks(struct cgroup_scanner *scan)
*/
/*
+ * The following two functions "fix" the issue where there are more pids
+ * than kmalloc will give memory for; in such cases, we use vmalloc/vfree.
+ * TODO: replace with a kernel-wide solution to this problem
+ */
+#define PIDLIST_TOO_LARGE(c) ((c) * sizeof(pid_t) > (PAGE_SIZE * 2))
+static void *pidlist_allocate(int count)
+{
+ if (PIDLIST_TOO_LARGE(count))
+ return vmalloc(count * sizeof(pid_t));
+ else
+ return kmalloc(count * sizeof(pid_t), GFP_KERNEL);
+}
+static void pidlist_free(void *p)
+{
+ if (is_vmalloc_addr(p))
+ vfree(p);
+ else
+ kfree(p);
+}
+static void *pidlist_resize(void *p, int newcount)
+{
+ void *newlist;
+ /* note: if new alloc fails, old p will still be valid either way */
+ if (is_vmalloc_addr(p)) {
+ newlist = vmalloc(newcount * sizeof(pid_t));
+ if (!newlist)
+ return NULL;
+ memcpy(newlist, p, newcount * sizeof(pid_t));
+ vfree(p);
+ } else {
+ newlist = krealloc(p, newcount * sizeof(pid_t), GFP_KERNEL);
+ }
+ return newlist;
+}
+
+/*
* pidlist_uniq - given a kmalloc()ed list, strip out all duplicate entries
* If the new stripped list is sufficiently smaller and there's enough memory
* to allocate a new buffer, will let go of the unneeded memory. Returns the
@@ -2161,7 +2198,7 @@ after:
* we'll just stay with what we've got.
*/
if (PIDLIST_REALLOC_DIFFERENCE(length, dest)) {
- newlist = krealloc(list, dest * sizeof(pid_t), GFP_KERNEL);
+ newlist = pidlist_resize(list, dest);
if (newlist)
*p = newlist;
}
@@ -2242,7 +2279,7 @@ static int pidlist_array_load(struct cgroup *cgrp, enum cgroup_filetype type,
* show up until sometime later on.
*/
length = cgroup_task_count(cgrp);
- array = kmalloc(length * sizeof(pid_t), GFP_KERNEL);
+ array = pidlist_allocate(length);
if (!array)
return -ENOMEM;
/* now, populate the array */
@@ -2267,11 +2304,11 @@ static int pidlist_array_load(struct cgroup *cgrp, enum cgroup_filetype type,
}
l = cgroup_pidlist_find(cgrp, type);
if (!l) {
- kfree(array);
+ pidlist_free(array);
return -ENOMEM;
}
/* store array, freeing old if necessary - lock already held */
- kfree(l->list);
+ pidlist_free(l->list);
l->list = array;
l->length = length;
l->use_count++;
@@ -2432,7 +2469,7 @@ static void cgroup_release_pid_array(struct cgroup_pidlist *l)
/* we're the last user if refcount is 0; remove and free */
list_del(&l->links);
mutex_unlock(&l->owner->pidlist_mutex);
- kfree(l->list);
+ pidlist_free(l->list);
put_pid_ns(l->key.ns);
up_write(&l->mutex);
kfree(l);
^ permalink raw reply related [flat|nested] 29+ messages in thread* [PATCH 4/6] Changes css_set freeing mechanism to be under RCU
[not found] ` <20090724032033.2463.79256.stgit-/yCBOHwbXCxd3OlUiQof+WCaruZE5nAUZeezCHUQhQ4@public.gmane.org>
` (2 preceding siblings ...)
2009-07-24 3:21 ` [PATCH 3/6] Quick vmalloc vs kmalloc fix to the case where array size is too large Ben Blum
@ 2009-07-24 3:21 ` Ben Blum
2009-07-24 3:22 ` [PATCH 5/6] Makes procs file writable to move all threads by tgid at once Ben Blum
2009-07-24 3:22 ` [PATCH 6/6] Lets ss->can_attach and ss->attach do whole threadgroups at a time Ben Blum
5 siblings, 0 replies; 29+ messages in thread
From: Ben Blum @ 2009-07-24 3:21 UTC (permalink / raw)
To: linux-kernel-u79uwXL29TY76Z2rM5mHXA,
containers-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA,
akpm-de/tnXTf+JLsfHDXvbKv3WD2FQJk+8+b,
serue-r/Jw6+rmf7HQT0dZR+AlfA, lizf-BthXqXjhjHXQFUHtdCDX3A,
menage-hpIqsD4AKlfQT0dZR+AlfA, bblum-hpIqsD4AKlfQT0dZR+AlfA
Changes css_set freeing mechanism to be under RCU
This is a prepatch for making the procs file writable. In order to free the
old css_sets for each task to be moved as they're being moved, the freeing
mechanism must be RCU-protected, or else we would have to have a call to
synchronize_rcu() for each task before freeing its old css_set.
Signed-off-by: Ben Blum <bblum-hpIqsD4AKlfQT0dZR+AlfA@public.gmane.org>
---
include/linux/cgroup.h | 3 +++
kernel/cgroup.c | 8 +++++++-
2 files changed, 10 insertions(+), 1 deletions(-)
diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index b934b72..24e3f1a 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -260,6 +260,9 @@ struct css_set {
* during subsystem registration (at boot time).
*/
struct cgroup_subsys_state *subsys[CGROUP_SUBSYS_COUNT];
+
+ /* For RCU-protected deletion */
+ struct rcu_head rcu_head;
};
/*
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 7e6b183..637a54e 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -288,6 +288,12 @@ static void unlink_css_set(struct css_set *cg)
}
}
+static void free_css_set_rcu(struct rcu_head *obj)
+{
+ struct css_set *cg = container_of(obj, struct css_set, rcu_head);
+ kfree(cg);
+}
+
static void __put_css_set(struct css_set *cg, int taskexit)
{
int i;
@@ -317,7 +323,7 @@ static void __put_css_set(struct css_set *cg, int taskexit)
}
}
rcu_read_unlock();
- kfree(cg);
+ call_rcu(&cg->rcu_head, free_css_set_rcu);
}
/*
^ permalink raw reply related [flat|nested] 29+ messages in thread* [PATCH 5/6] Makes procs file writable to move all threads by tgid at once
[not found] ` <20090724032033.2463.79256.stgit-/yCBOHwbXCxd3OlUiQof+WCaruZE5nAUZeezCHUQhQ4@public.gmane.org>
` (3 preceding siblings ...)
2009-07-24 3:21 ` [PATCH 4/6] Changes css_set freeing mechanism to be under RCU Ben Blum
@ 2009-07-24 3:22 ` Ben Blum
[not found] ` <20090724100220.GF11101@hawkmoon.kerlabs.com>
` (3 more replies)
2009-07-24 3:22 ` [PATCH 6/6] Lets ss->can_attach and ss->attach do whole threadgroups at a time Ben Blum
5 siblings, 4 replies; 29+ messages in thread
From: Ben Blum @ 2009-07-24 3:22 UTC (permalink / raw)
To: linux-kernel-u79uwXL29TY76Z2rM5mHXA,
containers-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA,
akpm-de/tnXTf+JLsfHDXvbKv3WD2FQJk+8+b,
serue-r/Jw6+rmf7HQT0dZR+AlfA, lizf-BthXqXjhjHXQFUHtdCDX3A,
menage-hpIqsD4AKlfQT0dZR+AlfA, bblum-hpIqsD4AKlfQT0dZR+AlfA
Makes procs file writable to move all threads by tgid at once
This patch adds functionality that enables users to move all threads in a
threadgroup at once to a cgroup by writing the tgid to the 'cgroup.procs'
file. This current implementation makes use of a rwsem that's taken for
reading in the fork() path to prevent newly forking threads within the
threadgroup from "escaping" while moving is in progress.
Signed-off-by: Ben Blum <bblum-hpIqsD4AKlfQT0dZR+AlfA@public.gmane.org>
---
Documentation/cgroups/cgroups.txt | 12 +
include/linux/cgroup.h | 2
kernel/cgroup.c | 422 +++++++++++++++++++++++++++++++++----
kernel/fork.c | 2
4 files changed, 393 insertions(+), 45 deletions(-)
diff --git a/Documentation/cgroups/cgroups.txt b/Documentation/cgroups/cgroups.txt
index 6eb1a97..d579346 100644
--- a/Documentation/cgroups/cgroups.txt
+++ b/Documentation/cgroups/cgroups.txt
@@ -228,6 +228,7 @@ Each cgroup is represented by a directory in the cgroup file system
containing the following files describing that cgroup:
- tasks: list of tasks (by pid) attached to that cgroup
+ - cgroup.procs: list of unique tgids in the cgroup
- notify_on_release flag: run the release agent on exit?
- release_agent: the path to use for release notifications (this file
exists in the top cgroup only)
@@ -374,7 +375,7 @@ Now you want to do something with this cgroup.
In this directory you can find several files:
# ls
-notify_on_release tasks
+cgroup.procs notify_on_release tasks
(plus whatever files added by the attached subsystems)
Now attach your shell to this cgroup:
@@ -408,6 +409,15 @@ You can attach the current shell task by echoing 0:
# echo 0 > tasks
+The cgroup.procs file is useful for managing all tasks in a threadgroup at
+once. It works the same way as the tasks file, but moves all tasks in the
+threadgroup with the specified tgid.
+
+Writing the pid of a task that's not the threadgroup leader (i.e., a pid
+that isn't a tgid) is treated as invalid. Writing a '0' to cgroup.procs will
+attach the writing task and all tasks in its threadgroup, but is invalid if
+the writing task is not the leader of the threadgroup.
+
3. Kernel API
=============
diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index 24e3f1a..cae7d3e 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -34,6 +34,7 @@ extern void cgroup_fork(struct task_struct *p);
extern void cgroup_fork_callbacks(struct task_struct *p);
extern void cgroup_post_fork(struct task_struct *p);
extern void cgroup_exit(struct task_struct *p, int run_callbacks);
+extern void cgroup_fork_failed(struct task_struct *p, int run_callbacks);
extern int cgroupstats_build(struct cgroupstats *stats,
struct dentry *dentry);
@@ -554,6 +555,7 @@ static inline void cgroup_fork(struct task_struct *p) {}
static inline void cgroup_fork_callbacks(struct task_struct *p) {}
static inline void cgroup_post_fork(struct task_struct *p) {}
static inline void cgroup_exit(struct task_struct *p, int callbacks) {}
+static inline void cgroup_fork_failed(struct task_struct *p, int callbacks) {}
static inline void cgroup_lock(void) {}
static inline void cgroup_unlock(void) {}
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 637a54e..3f8d323 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -254,6 +254,18 @@ static struct hlist_head *css_set_hash(struct cgroup_subsys_state *css[])
* reduces the fork()/exit() overhead for people who have cgroups
* compiled into their kernel but not actually in use */
static int use_task_css_set_links __read_mostly;
+/* This rwsem locks out cgroup_attach_proc() from races with fork().
+ * If a thread with a tgid that's being moved via the procs file tries
+ * to fork, its child thread could escape the iteration across the
+ * threadgroup if it copies its parent cgroup pointer before the parent
+ * gets moved but doesn't add itself to the threadgroup list or finish
+ * cgroup fork routines until afterwards. The way we solve this is by
+ * taking this lock in read mode in the fork path across the sensitive
+ * section (at the cost of a cache miss when there's no contention),
+ * and as a write lock in cgroup_attach_proc(). Note of course that
+ * there will never be more than one cgroup_attach_proc contending, as
+ * it needs to be holding the cgroup_mutex to begin with. */
+static DECLARE_RWSEM(cgroup_fork_mutex);
/* When we create or destroy a css_set, the operation simply
* takes/releases a reference count on all the cgroups referenced
@@ -1297,6 +1309,87 @@ static void get_first_subsys(const struct cgroup *cgrp,
*subsys_id = test_ss->subsys_id;
}
+/*
+ * cgroup_task_migrate - move a task from one cgroup to another.
+ *
+ * 'guarantee' is set if the caller promises that a new css_set for the task
+ * will already exist. If not set, this function might sleep, and can fail
+ * with -ENOMEM. Otherwise, it can only fail with -ESRCH.
+ */
+static int cgroup_task_migrate(struct cgroup *cgrp, struct cgroup *oldcgrp,
+ struct task_struct *tsk, int guarantee)
+{
+ struct css_set *oldcg;
+ struct css_set *newcg;
+
+ /*
+ * get old css_set. we need to take task_lock and refcount it, because
+ * an exiting task can change its css_set to init_css_set and drop its
+ * old one without taking cgroup_mutex.
+ */
+ task_lock(tsk);
+ oldcg = tsk->cgroups;
+ get_css_set(oldcg);
+ task_unlock(tsk);
+ /*
+ * locate or allocate a new css_set for this task. 'guarantee' tells
+ * us whether or not we are sure that a new css_set already exists;
+ * in that case, we are not allowed to fail, as we won't need malloc.
+ */
+ if (guarantee) {
+ /*
+ * our caller promises us that the css_set we want already
+ * exists, so we use find_existing_css_set directly.
+ */
+ struct cgroup_subsys_state *template[CGROUP_SUBSYS_COUNT];
+ read_lock(&css_set_lock);
+ newcg = find_existing_css_set(oldcg, cgrp, template);
+ BUG_ON(!newcg);
+ get_css_set(newcg);
+ read_unlock(&css_set_lock);
+ } else {
+ might_sleep();
+ /* find_css_set will give us newcg already referenced. */
+ newcg = find_css_set(oldcg, cgrp);
+ if (!newcg) {
+ put_css_set(oldcg);
+ return -ENOMEM;
+ }
+ }
+ put_css_set(oldcg);
+
+ /*
+ * we cannot move a task that's declared itself as exiting, as once
+ * PF_EXITING is set, the tsk->cgroups pointer is no longer safe.
+ */
+ task_lock(tsk);
+ if (tsk->flags & PF_EXITING) {
+ task_unlock(tsk);
+ put_css_set(newcg);
+ return -ESRCH;
+ }
+ rcu_assign_pointer(tsk->cgroups, newcg);
+ task_unlock(tsk);
+
+ /* Update the css_set linked lists if we're using them */
+ write_lock(&css_set_lock);
+ if (!list_empty(&tsk->cg_list)) {
+ list_del(&tsk->cg_list);
+ list_add(&tsk->cg_list, &newcg->tasks);
+ }
+ write_unlock(&css_set_lock);
+
+ /*
+ * We just gained a reference on oldcg by taking it from the task. As
+ * trading it for newcg is protected by cgroup_mutex, we're safe to
+ * drop it here; it will be freed under RCU.
+ */
+ put_css_set(oldcg);
+
+ set_bit(CGRP_RELEASABLE, &oldcgrp->flags);
+ return 0;
+}
+
/**
* cgroup_attach_task - attach task 'tsk' to cgroup 'cgrp'
* @cgrp: the cgroup the task is attaching to
@@ -1307,11 +1400,9 @@ static void get_first_subsys(const struct cgroup *cgrp,
*/
int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
{
- int retval = 0;
+ int retval;
struct cgroup_subsys *ss;
struct cgroup *oldcgrp;
- struct css_set *cg;
- struct css_set *newcg;
struct cgroupfs_root *root = cgrp->root;
int subsys_id;
@@ -1330,75 +1421,294 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
}
}
- task_lock(tsk);
- cg = tsk->cgroups;
- get_css_set(cg);
- task_unlock(tsk);
+ retval = cgroup_task_migrate(cgrp, oldcgrp, tsk, 0);
+ if (retval)
+ return retval;
+
+ for_each_subsys(root, ss) {
+ if (ss->attach)
+ ss->attach(ss, cgrp, oldcgrp, tsk);
+ }
+
+ synchronize_rcu();
+
/*
- * Locate or allocate a new css_set for this task,
- * based on its final set of cgroups
+ * wake up rmdir() waiter. the rmdir should fail since the cgroup
+ * is no longer empty.
*/
+ cgroup_wakeup_rmdir_waiters(cgrp);
+ return 0;
+}
+
+/*
+ * cgroup_attach_proc works in two stages, the first of which prefetches all
+ * new css_sets needed (to make sure we have enough memory before committing
+ * to the move) and stores them in a list, of entries of the following type.
+ * TODO: possible optimization: use css_set->rcu_head for chaining instead
+ */
+struct cg_list_entry {
+ struct css_set *cg;
+ struct list_head links;
+};
+
+static int css_set_check_fetched(struct cgroup *cgrp, struct task_struct *tsk,
+ struct css_set *cg,
+ struct list_head *newcg_list)
+{
+ struct css_set *newcg;
+ struct cg_list_entry *cg_entry;
+ struct cgroup_subsys_state *template[CGROUP_SUBSYS_COUNT];
+ read_lock(&css_set_lock);
+ newcg = find_existing_css_set(cg, cgrp, template);
+ if (newcg)
+ get_css_set(newcg);
+ read_unlock(&css_set_lock);
+ /* doesn't exist at all? */
+ if (!newcg)
+ return 1;
+ /* see if it's already in the list */
+ list_for_each_entry(cg_entry, newcg_list, links) {
+ if (cg_entry->cg == newcg) {
+ put_css_set(newcg);
+ return 0;
+ }
+ }
+ /* not found */
+ put_css_set(newcg);
+ return 1;
+}
+
+/*
+ * Find the new css_set and store it in the list in preparation for moving
+ * the given task to the given cgroup. Returns 0 on success, -ENOMEM if we
+ * run out of memory.
+ */
+static int css_set_prefetch(struct cgroup *cgrp, struct css_set *cg,
+ struct list_head *newcg_list)
+{
+ struct css_set *newcg;
+ struct cg_list_entry *cg_entry;
+ /* ensure a new css_set will exist for this thread */
newcg = find_css_set(cg, cgrp);
- put_css_set(cg);
if (!newcg)
return -ENOMEM;
-
- task_lock(tsk);
- if (tsk->flags & PF_EXITING) {
- task_unlock(tsk);
+ /* add new element to list */
+ cg_entry = kmalloc(sizeof(struct cg_list_entry), GFP_KERNEL);
+ if (!cg_entry) {
put_css_set(newcg);
- return -ESRCH;
+ return -ENOMEM;
}
- rcu_assign_pointer(tsk->cgroups, newcg);
- task_unlock(tsk);
+ cg_entry->cg = newcg;
+ list_add(&cg_entry->links, newcg_list);
+ return 0;
+}
- /* Update the css_set linked lists if we're using them */
- write_lock(&css_set_lock);
- if (!list_empty(&tsk->cg_list)) {
- list_del(&tsk->cg_list);
- list_add(&tsk->cg_list, &newcg->tasks);
+/**
+ * cgroup_attach_proc - attach all threads in a threadgroup to a cgroup
+ * @cgrp: the cgroup to attach to
+ * @leader: the threadgroup leader task_struct of the group to be attached
+ *
+ * Call holding cgroup_mutex. Will take task_lock of each thread in leader's
+ * threadgroup individually in turn.
+ */
+int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader)
+{
+ int retval;
+ struct cgroup_subsys *ss;
+ struct cgroup *oldcgrp;
+ struct css_set *oldcg;
+ struct cgroupfs_root *root = cgrp->root;
+ int subsys_id;
+ /* threadgroup list cursor */
+ struct task_struct *tsk;
+ /*
+ * we need to make sure we have css_sets for all the tasks we're
+ * going to move -before- we actually start moving them, so that in
+ * case we get an ENOMEM we can bail out before making any changes.
+ */
+ struct list_head newcg_list;
+ struct cg_list_entry *cg_entry;
+
+ /* first, make sure this came from a valid tgid */
+ if (!thread_group_leader(leader))
+ return -EINVAL;
+ /*
+ * check that we can legitimately attach to the cgroup.
+ */
+ for_each_subsys(root, ss) {
+ if (ss->can_attach) {
+ retval = ss->can_attach(ss, cgrp, leader);
+ if (retval)
+ return retval;
+ }
}
- write_unlock(&css_set_lock);
+ get_first_subsys(cgrp, NULL, &subsys_id);
+
+ /*
+ * step 1: make sure css_sets exist for all threads to be migrated.
+ * we use find_css_set, which allocates a new one if necessary.
+ */
+ INIT_LIST_HEAD(&newcg_list);
+ oldcgrp = task_cgroup(leader, subsys_id);
+ if (cgrp != oldcgrp) {
+ /* get old css_set */
+ task_lock(leader);
+ if (leader->flags & PF_EXITING) {
+ task_unlock(leader);
+ retval = -ESRCH;
+ goto list_teardown;
+ }
+ oldcg = leader->cgroups;
+ get_css_set(oldcg);
+ task_unlock(leader);
+ /* acquire new one */
+ retval = css_set_prefetch(cgrp, oldcg, &newcg_list);
+ put_css_set(oldcg);
+ if (retval)
+ goto list_teardown;
+ }
+again:
+ rcu_read_lock();
+ /*
+ * if we need to fetch a new css_set for this task, we must exit the
+ * rcu_read section because allocating it can sleep. afterwards, we'll
+ * need to restart iteration on the threadgroup list - the whole thing
+ * will be O(nm) in the number of threads and css_sets; as the typical
+ * case only has one css_set for all of them, usually O(n).
+ */
+ list_for_each_entry_rcu(tsk, &leader->thread_group, thread_group) {
+ /* nothing to do if this task is already in the cgroup */
+ oldcgrp = task_cgroup(tsk, subsys_id);
+ if (cgrp == oldcgrp)
+ continue;
+ /* get old css_set pointer */
+ task_lock(tsk);
+ if (tsk->flags & PF_EXITING) {
+ /* ignore this task if it's going away */
+ task_unlock(tsk);
+ continue;
+ }
+ oldcg = tsk->cgroups;
+ get_css_set(oldcg);
+ task_unlock(tsk);
+ /* see if the new one for us is already in the list? */
+ retval = css_set_check_fetched(cgrp, tsk, oldcg, &newcg_list);
+ if (retval) {
+ /* we don't already have it. get new one. */
+ rcu_read_unlock();
+ retval = css_set_prefetch(cgrp, oldcg, &newcg_list);
+ put_css_set(oldcg);
+ if (retval)
+ goto list_teardown;
+ /* begin iteration again. */
+ goto again;
+ } else {
+ /* was already there, nothing to do. */
+ put_css_set(oldcg);
+ }
+ }
+ rcu_read_unlock();
+
+ /*
+ * step 2: now that we're guaranteed success wrt the css_sets, proceed
+ * to move all tasks to the new cgroup. the only fail case henceforth
+ * is if the threadgroup leader has PF_EXITING set (in which case all
+ * the other threads get killed) - if other threads happen to be
+ * exiting, we just ignore them and move on.
+ */
+ oldcgrp = task_cgroup(leader, subsys_id);
+ /* if leader is already there, skip moving him */
+ if (cgrp != oldcgrp) {
+ retval = cgroup_task_migrate(cgrp, oldcgrp, leader, 1);
+ if (retval) {
+ BUG_ON(retval != -ESRCH);
+ goto list_teardown;
+ }
+ }
+ /*
+ * now move all the rest of the threads - need to lock against
+ * possible races with fork().
+ */
+ down_write(&cgroup_fork_mutex);
+ rcu_read_lock();
+ list_for_each_entry_rcu(tsk, &leader->thread_group, thread_group) {
+ /* leave current thread as it is if it's already there */
+ oldcgrp = task_cgroup(tsk, subsys_id);
+ if (cgrp == oldcgrp)
+ continue;
+ /* we don't care whether these threads are exiting */
+ retval = cgroup_task_migrate(cgrp, oldcgrp, tsk, 1);
+ BUG_ON(retval != 0 && retval != -ESRCH);
+ }
+ rcu_read_unlock();
+ up_write(&cgroup_fork_mutex);
+
+ /*
+ * step 3: attach whole threadgroup to each subsystem
+ */
for_each_subsys(root, ss) {
if (ss->attach)
- ss->attach(ss, cgrp, oldcgrp, tsk);
+ ss->attach(ss, cgrp, oldcgrp, leader);
}
- set_bit(CGRP_RELEASABLE, &oldcgrp->flags);
- synchronize_rcu();
- put_css_set(cg);
/*
- * wake up rmdir() waiter. the rmdir should fail since the cgroup
- * is no longer empty.
+ * step 4: success! ...and cleanup
*/
+ synchronize_rcu();
cgroup_wakeup_rmdir_waiters(cgrp);
- return 0;
+ retval = 0;
+list_teardown:
+ /* no longer need the list of css_sets, so get rid of it */
+ while (!list_empty(&newcg_list)) {
+ /* pop from the list */
+ cg_entry = list_first_entry(&newcg_list, struct cg_list_entry,
+ links);
+ list_del(&cg_entry->links);
+ /* drop the refcount */
+ put_css_set(cg_entry->cg);
+ kfree(cg_entry);
+ }
+ /* done! */
+ return retval;
}
/*
- * Attach task with pid 'pid' to cgroup 'cgrp'. Call with cgroup_mutex
- * held. May take task_lock of task
+ * Find the task_struct of the task to attach by vpid and pass it along to the
+ * function to attach either it or all tasks in its threadgroup. Will take
+ * cgroup_mutex; may take task_lock of task.
*/
-static int attach_task_by_pid(struct cgroup *cgrp, u64 pid)
+static int attach_task_by_pid(struct cgroup *cgrp, u64 pid,
+ int attach(struct cgroup *,
+ struct task_struct *))
{
struct task_struct *tsk;
const struct cred *cred = current_cred(), *tcred;
int ret;
+ if (!cgroup_lock_live_group(cgrp))
+ return -ENODEV;
+
if (pid) {
rcu_read_lock();
tsk = find_task_by_vpid(pid);
if (!tsk || tsk->flags & PF_EXITING) {
rcu_read_unlock();
+ cgroup_unlock();
return -ESRCH;
}
-
+ /*
+ * even if we're attaching all tasks in the thread group, we
+ * only need to check permissions on the group leader, because
+ * even if another task has different permissions, the group
+ * leader will have sufficient access to change it.
+ */
tcred = __task_cred(tsk);
if (cred->euid &&
cred->euid != tcred->uid &&
cred->euid != tcred->suid) {
rcu_read_unlock();
+ cgroup_unlock();
return -EACCES;
}
get_task_struct(tsk);
@@ -1408,19 +1718,25 @@ static int attach_task_by_pid(struct cgroup *cgrp, u64 pid)
get_task_struct(tsk);
}
- ret = cgroup_attach_task(cgrp, tsk);
+ /*
+ * Note that the check for whether the task is its threadgroup leader
+ * is done in cgroup_attach_proc. This means that writing 0 to the
+ * procs file will only work if the writing task is the leader.
+ */
+ ret = attach(cgrp, tsk);
put_task_struct(tsk);
+ cgroup_unlock();
return ret;
}
static int cgroup_tasks_write(struct cgroup *cgrp, struct cftype *cft, u64 pid)
{
- int ret;
- if (!cgroup_lock_live_group(cgrp))
- return -ENODEV;
- ret = attach_task_by_pid(cgrp, pid);
- cgroup_unlock();
- return ret;
+ return attach_task_by_pid(cgrp, pid, cgroup_attach_task);
+}
+
+static int cgroup_procs_write(struct cgroup *cgrp, struct cftype *cft, u64 tgid)
+{
+ return attach_task_by_pid(cgrp, tgid, cgroup_attach_proc);
}
/**
@@ -2580,9 +2896,9 @@ static struct cftype files[] = {
{
.name = CGROUP_FILE_GENERIC_PREFIX "procs",
.open = cgroup_procs_open,
- /* .write_u64 = cgroup_procs_write, TODO */
+ .write_u64 = cgroup_procs_write,
.release = cgroup_pidlist_release,
- .mode = S_IRUGO,
+ .mode = S_IRUGO | S_IWUSR,
},
{
.name = "notify_on_release",
@@ -3185,6 +3501,7 @@ static struct file_operations proc_cgroupstats_operations = {
*/
void cgroup_fork(struct task_struct *child)
{
+ down_read(&cgroup_fork_mutex);
task_lock(current);
child->cgroups = current->cgroups;
get_css_set(child->cgroups);
@@ -3231,6 +3548,7 @@ void cgroup_post_fork(struct task_struct *child)
task_unlock(child);
write_unlock(&css_set_lock);
}
+ up_read(&cgroup_fork_mutex);
}
/**
* cgroup_exit - detach cgroup from exiting task
@@ -3302,6 +3620,24 @@ void cgroup_exit(struct task_struct *tsk, int run_callbacks)
}
/**
+ * cgroup_fork_failed - undo operations for fork failure
+ * @tsk: pointer to task_struct of exiting process
+ * @run_callback: run exit callbacks?
+ *
+ * Description: Undo cgroup operations after cgroup_fork in fork failure.
+ *
+ * We release the read lock that was taken in cgroup_fork(), since it is
+ * supposed to be dropped in cgroup_post_fork in the success case. The other
+ * thing that wants to be done is detaching the failed child task from the
+ * cgroup, so we wrap cgroup_exit.
+ */
+void cgroup_fork_failed(struct task_struct *tsk, int run_callbacks)
+{
+ up_read(&cgroup_fork_mutex);
+ cgroup_exit(tsk, run_callbacks);
+}
+
+/**
* cgroup_clone - clone the cgroup the given subsystem is attached to
* @tsk: the task to be moved
* @subsys: the given subsystem
diff --git a/kernel/fork.c b/kernel/fork.c
index 926c117..027ec16 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1300,7 +1300,7 @@ bad_fork_cleanup_policy:
mpol_put(p->mempolicy);
bad_fork_cleanup_cgroup:
#endif
- cgroup_exit(p, cgroup_callbacks_done);
+ cgroup_fork_failed(p, cgroup_callbacks_done);
delayacct_tsk_free(p);
if (p->binfmt)
module_put(p->binfmt->module);
^ permalink raw reply related [flat|nested] 29+ messages in thread[parent not found: <20090724100220.GF11101@hawkmoon.kerlabs.com>]
[parent not found: <20090724155041.GF5878@count0.beaverton.ibm.com>]
[parent not found: <20090724032200.2463.82408.stgit-/yCBOHwbXCxd3OlUiQof+WCaruZE5nAUZeezCHUQhQ4@public.gmane.org>]
* Re: [PATCH 5/6] Makes procs file writable to move all threads by tgid at once
[not found] ` <20090724032200.2463.82408.stgit-/yCBOHwbXCxd3OlUiQof+WCaruZE5nAUZeezCHUQhQ4@public.gmane.org>
@ 2009-07-24 10:02 ` Louis Rilling
2009-07-24 15:50 ` Matt Helsley
2009-11-09 17:07 ` Daniel Lezcano
2 siblings, 0 replies; 29+ messages in thread
From: Louis Rilling @ 2009-07-24 10:02 UTC (permalink / raw)
To: Ben Blum
Cc: containers-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA,
linux-kernel-u79uwXL29TY76Z2rM5mHXA,
menage-hpIqsD4AKlfQT0dZR+AlfA,
akpm-de/tnXTf+JLsfHDXvbKv3WD2FQJk+8+b
[-- Attachment #1.1: Type: text/plain, Size: 17216 bytes --]
Hi Ben,
On 23/07/09 20:22 -0700, Ben Blum wrote:
> Makes procs file writable to move all threads by tgid at once
>
> This patch adds functionality that enables users to move all threads in a
> threadgroup at once to a cgroup by writing the tgid to the 'cgroup.procs'
> file. This current implementation makes use of a rwsem that's taken for
> reading in the fork() path to prevent newly forking threads within the
> threadgroup from "escaping" while moving is in progress.
>
> Signed-off-by: Ben Blum <bblum-hpIqsD4AKlfQT0dZR+AlfA@public.gmane.org>
Thank you for working on this interface. This can indeed be very useful. Please
find comments below, hoping that this will help making it better.
[...]
> diff --git a/Documentation/cgroups/cgroups.txt b/Documentation/cgroups/cgroups.txt
> index 6eb1a97..d579346 100644
> --- a/Documentation/cgroups/cgroups.txt
> +++ b/Documentation/cgroups/cgroups.txt
[...]
> @@ -408,6 +409,15 @@ You can attach the current shell task by echoing 0:
>
> # echo 0 > tasks
>
> +The cgroup.procs file is useful for managing all tasks in a threadgroup at
> +once. It works the same way as the tasks file, but moves all tasks in the
> +threadgroup with the specified tgid.
> +
> +Writing the pid of a task that's not the threadgroup leader (i.e., a pid
> +that isn't a tgid) is treated as invalid. Writing a '0' to cgroup.procs will
> +attach the writing task and all tasks in its threadgroup, but is invalid if
> +the writing task is not the leader of the threadgroup.
> +
This restriction sounds unfortunate and I'm not sure that there are good reasons
for it (see below).
[...]
> diff --git a/kernel/cgroup.c b/kernel/cgroup.c
> index 637a54e..3f8d323 100644
> --- a/kernel/cgroup.c
> +++ b/kernel/cgroup.c
[...]
> @@ -1330,75 +1421,294 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
> }
> }
>
> - task_lock(tsk);
> - cg = tsk->cgroups;
> - get_css_set(cg);
> - task_unlock(tsk);
> + retval = cgroup_task_migrate(cgrp, oldcgrp, tsk, 0);
> + if (retval)
> + return retval;
> +
> + for_each_subsys(root, ss) {
> + if (ss->attach)
> + ss->attach(ss, cgrp, oldcgrp, tsk);
> + }
> +
> + synchronize_rcu();
> +
> /*
> - * Locate or allocate a new css_set for this task,
> - * based on its final set of cgroups
> + * wake up rmdir() waiter. the rmdir should fail since the cgroup
> + * is no longer empty.
> */
> + cgroup_wakeup_rmdir_waiters(cgrp);
> + return 0;
> +}
> +
> +/*
> + * cgroup_attach_proc works in two stages, the first of which prefetches all
> + * new css_sets needed (to make sure we have enough memory before committing
> + * to the move) and stores them in a list, of entries of the following type.
> + * TODO: possible optimization: use css_set->rcu_head for chaining instead
> + */
> +struct cg_list_entry {
> + struct css_set *cg;
> + struct list_head links;
> +};
> +
> +static int css_set_check_fetched(struct cgroup *cgrp, struct task_struct *tsk,
> + struct css_set *cg,
> + struct list_head *newcg_list)
> +{
> + struct css_set *newcg;
> + struct cg_list_entry *cg_entry;
> + struct cgroup_subsys_state *template[CGROUP_SUBSYS_COUNT];
> + read_lock(&css_set_lock);
> + newcg = find_existing_css_set(cg, cgrp, template);
> + if (newcg)
> + get_css_set(newcg);
> + read_unlock(&css_set_lock);
> + /* doesn't exist at all? */
> + if (!newcg)
> + return 1;
> + /* see if it's already in the list */
> + list_for_each_entry(cg_entry, newcg_list, links) {
> + if (cg_entry->cg == newcg) {
> + put_css_set(newcg);
> + return 0;
> + }
> + }
> + /* not found */
> + put_css_set(newcg);
> + return 1;
> +}
> +
> +/*
> + * Find the new css_set and store it in the list in preparation for moving
> + * the given task to the given cgroup. Returns 0 on success, -ENOMEM if we
> + * run out of memory.
> + */
> +static int css_set_prefetch(struct cgroup *cgrp, struct css_set *cg,
> + struct list_head *newcg_list)
> +{
> + struct css_set *newcg;
> + struct cg_list_entry *cg_entry;
> + /* ensure a new css_set will exist for this thread */
> newcg = find_css_set(cg, cgrp);
> - put_css_set(cg);
> if (!newcg)
> return -ENOMEM;
> -
> - task_lock(tsk);
> - if (tsk->flags & PF_EXITING) {
> - task_unlock(tsk);
> + /* add new element to list */
> + cg_entry = kmalloc(sizeof(struct cg_list_entry), GFP_KERNEL);
> + if (!cg_entry) {
> put_css_set(newcg);
> - return -ESRCH;
> + return -ENOMEM;
> }
> - rcu_assign_pointer(tsk->cgroups, newcg);
> - task_unlock(tsk);
> + cg_entry->cg = newcg;
> + list_add(&cg_entry->links, newcg_list);
> + return 0;
> +}
>
> - /* Update the css_set linked lists if we're using them */
> - write_lock(&css_set_lock);
> - if (!list_empty(&tsk->cg_list)) {
> - list_del(&tsk->cg_list);
> - list_add(&tsk->cg_list, &newcg->tasks);
> +/**
> + * cgroup_attach_proc - attach all threads in a threadgroup to a cgroup
> + * @cgrp: the cgroup to attach to
> + * @leader: the threadgroup leader task_struct of the group to be attached
> + *
> + * Call holding cgroup_mutex. Will take task_lock of each thread in leader's
> + * threadgroup individually in turn.
> + */
> +int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader)
> +{
> + int retval;
> + struct cgroup_subsys *ss;
> + struct cgroup *oldcgrp;
> + struct css_set *oldcg;
> + struct cgroupfs_root *root = cgrp->root;
> + int subsys_id;
> + /* threadgroup list cursor */
> + struct task_struct *tsk;
> + /*
> + * we need to make sure we have css_sets for all the tasks we're
> + * going to move -before- we actually start moving them, so that in
> + * case we get an ENOMEM we can bail out before making any changes.
> + */
> + struct list_head newcg_list;
> + struct cg_list_entry *cg_entry;
> +
> + /* first, make sure this came from a valid tgid */
> + if (!thread_group_leader(leader))
> + return -EINVAL;
> + /*
> + * check that we can legitimately attach to the cgroup.
> + */
> + for_each_subsys(root, ss) {
> + if (ss->can_attach) {
> + retval = ss->can_attach(ss, cgrp, leader);
> + if (retval)
> + return retval;
> + }
> }
So the semantics of ->can_attach() becomes: if called for a thread group leader,
the result should be valid for the whole thread group, even if only the thread
group leader is being attached. This looks a bit fuzzy and thus not desirable.
Why not checking ->can_attach() for all threads (and lock cgroup_fork_mutex
earlier)?
> - write_unlock(&css_set_lock);
>
> + get_first_subsys(cgrp, NULL, &subsys_id);
> +
> + /*
> + * step 1: make sure css_sets exist for all threads to be migrated.
> + * we use find_css_set, which allocates a new one if necessary.
> + */
> + INIT_LIST_HEAD(&newcg_list);
> + oldcgrp = task_cgroup(leader, subsys_id);
> + if (cgrp != oldcgrp) {
> + /* get old css_set */
> + task_lock(leader);
> + if (leader->flags & PF_EXITING) {
> + task_unlock(leader);
> + retval = -ESRCH;
> + goto list_teardown;
> + }
> + oldcg = leader->cgroups;
> + get_css_set(oldcg);
> + task_unlock(leader);
> + /* acquire new one */
> + retval = css_set_prefetch(cgrp, oldcg, &newcg_list);
> + put_css_set(oldcg);
> + if (retval)
> + goto list_teardown;
> + }
The only difference between leader's case (above) and other threads' case
(below) is the check for PF_EXITING. If all threads were handled equally in the
->can_attach check, this special handling would be pointless.
> +again:
> + rcu_read_lock();
> + /*
> + * if we need to fetch a new css_set for this task, we must exit the
> + * rcu_read section because allocating it can sleep. afterwards, we'll
> + * need to restart iteration on the threadgroup list - the whole thing
> + * will be O(nm) in the number of threads and css_sets; as the typical
> + * case only has one css_set for all of them, usually O(n).
> + */
> + list_for_each_entry_rcu(tsk, &leader->thread_group, thread_group) {
> + /* nothing to do if this task is already in the cgroup */
> + oldcgrp = task_cgroup(tsk, subsys_id);
> + if (cgrp == oldcgrp)
> + continue;
> + /* get old css_set pointer */
> + task_lock(tsk);
> + if (tsk->flags & PF_EXITING) {
> + /* ignore this task if it's going away */
> + task_unlock(tsk);
> + continue;
> + }
> + oldcg = tsk->cgroups;
> + get_css_set(oldcg);
> + task_unlock(tsk);
> + /* see if the new one for us is already in the list? */
> + retval = css_set_check_fetched(cgrp, tsk, oldcg, &newcg_list);
> + if (retval) {
> + /* we don't already have it. get new one. */
> + rcu_read_unlock();
> + retval = css_set_prefetch(cgrp, oldcg, &newcg_list);
> + put_css_set(oldcg);
> + if (retval)
> + goto list_teardown;
> + /* begin iteration again. */
> + goto again;
> + } else {
> + /* was already there, nothing to do. */
> + put_css_set(oldcg);
> + }
> + }
> + rcu_read_unlock();
> +
> + /*
> + * step 2: now that we're guaranteed success wrt the css_sets, proceed
I don't see how css_sets are guaranteed while cgroup_fork_mutex is not held and
thus does not prevent new threads from being created right now. Could you
elaborate on that?
> + * to move all tasks to the new cgroup. the only fail case henceforth
> + * is if the threadgroup leader has PF_EXITING set (in which case all
> + * the other threads get killed) - if other threads happen to be
This statement is wrong. A thread group leader can have PF_EXITING (and even
become zombie) while other sub-threads continue their execution. For instance it
is perfectly valid for a thread group leader to call sys_exit(), and no other
thread will be affected.
> + * exiting, we just ignore them and move on.
> + */
> + oldcgrp = task_cgroup(leader, subsys_id);
> + /* if leader is already there, skip moving him */
> + if (cgrp != oldcgrp) {
> + retval = cgroup_task_migrate(cgrp, oldcgrp, leader, 1);
> + if (retval) {
> + BUG_ON(retval != -ESRCH);
> + goto list_teardown;
> + }
> + }
> + /*
> + * now move all the rest of the threads - need to lock against
> + * possible races with fork().
> + */
> + down_write(&cgroup_fork_mutex);
> + rcu_read_lock();
> + list_for_each_entry_rcu(tsk, &leader->thread_group, thread_group) {
> + /* leave current thread as it is if it's already there */
> + oldcgrp = task_cgroup(tsk, subsys_id);
> + if (cgrp == oldcgrp)
> + continue;
> + /* we don't care whether these threads are exiting */
> + retval = cgroup_task_migrate(cgrp, oldcgrp, tsk, 1);
> + BUG_ON(retval != 0 && retval != -ESRCH);
> + }
> + rcu_read_unlock();
> + up_write(&cgroup_fork_mutex);
> +
> + /*
> + * step 3: attach whole threadgroup to each subsystem
> + */
> for_each_subsys(root, ss) {
> if (ss->attach)
> - ss->attach(ss, cgrp, oldcgrp, tsk);
> + ss->attach(ss, cgrp, oldcgrp, leader);
> }
So ->attach called for leader should attach all sub-threads too? Does this mean
that all subsystems should be changed accordingly? Again this looks making
->attach semantics fuzzy, and thus not desirable.
Thank,
Louis
> - set_bit(CGRP_RELEASABLE, &oldcgrp->flags);
> - synchronize_rcu();
> - put_css_set(cg);
>
> /*
> - * wake up rmdir() waiter. the rmdir should fail since the cgroup
> - * is no longer empty.
> + * step 4: success! ...and cleanup
> */
> + synchronize_rcu();
> cgroup_wakeup_rmdir_waiters(cgrp);
> - return 0;
> + retval = 0;
> +list_teardown:
> + /* no longer need the list of css_sets, so get rid of it */
> + while (!list_empty(&newcg_list)) {
> + /* pop from the list */
> + cg_entry = list_first_entry(&newcg_list, struct cg_list_entry,
> + links);
> + list_del(&cg_entry->links);
> + /* drop the refcount */
> + put_css_set(cg_entry->cg);
> + kfree(cg_entry);
> + }
> + /* done! */
> + return retval;
> }
>
> /*
> - * Attach task with pid 'pid' to cgroup 'cgrp'. Call with cgroup_mutex
> - * held. May take task_lock of task
> + * Find the task_struct of the task to attach by vpid and pass it along to the
> + * function to attach either it or all tasks in its threadgroup. Will take
> + * cgroup_mutex; may take task_lock of task.
> */
> -static int attach_task_by_pid(struct cgroup *cgrp, u64 pid)
> +static int attach_task_by_pid(struct cgroup *cgrp, u64 pid,
> + int attach(struct cgroup *,
> + struct task_struct *))
> {
> struct task_struct *tsk;
> const struct cred *cred = current_cred(), *tcred;
> int ret;
>
> + if (!cgroup_lock_live_group(cgrp))
> + return -ENODEV;
> +
> if (pid) {
> rcu_read_lock();
> tsk = find_task_by_vpid(pid);
> if (!tsk || tsk->flags & PF_EXITING) {
> rcu_read_unlock();
> + cgroup_unlock();
> return -ESRCH;
> }
> -
> + /*
> + * even if we're attaching all tasks in the thread group, we
> + * only need to check permissions on the group leader, because
> + * even if another task has different permissions, the group
> + * leader will have sufficient access to change it.
> + */
> tcred = __task_cred(tsk);
> if (cred->euid &&
> cred->euid != tcred->uid &&
> cred->euid != tcred->suid) {
> rcu_read_unlock();
> + cgroup_unlock();
> return -EACCES;
> }
> get_task_struct(tsk);
> @@ -1408,19 +1718,25 @@ static int attach_task_by_pid(struct cgroup *cgrp, u64 pid)
> get_task_struct(tsk);
> }
>
> - ret = cgroup_attach_task(cgrp, tsk);
> + /*
> + * Note that the check for whether the task is its threadgroup leader
> + * is done in cgroup_attach_proc. This means that writing 0 to the
> + * procs file will only work if the writing task is the leader.
> + */
> + ret = attach(cgrp, tsk);
> put_task_struct(tsk);
> + cgroup_unlock();
> return ret;
> }
>
> static int cgroup_tasks_write(struct cgroup *cgrp, struct cftype *cft, u64 pid)
> {
> - int ret;
> - if (!cgroup_lock_live_group(cgrp))
> - return -ENODEV;
> - ret = attach_task_by_pid(cgrp, pid);
> - cgroup_unlock();
> - return ret;
> + return attach_task_by_pid(cgrp, pid, cgroup_attach_task);
> +}
> +
> +static int cgroup_procs_write(struct cgroup *cgrp, struct cftype *cft, u64 tgid)
> +{
> + return attach_task_by_pid(cgrp, tgid, cgroup_attach_proc);
> }
>
> /**
> @@ -2580,9 +2896,9 @@ static struct cftype files[] = {
> {
> .name = CGROUP_FILE_GENERIC_PREFIX "procs",
> .open = cgroup_procs_open,
> - /* .write_u64 = cgroup_procs_write, TODO */
> + .write_u64 = cgroup_procs_write,
> .release = cgroup_pidlist_release,
> - .mode = S_IRUGO,
> + .mode = S_IRUGO | S_IWUSR,
> },
> {
> .name = "notify_on_release",
> @@ -3185,6 +3501,7 @@ static struct file_operations proc_cgroupstats_operations = {
> */
> void cgroup_fork(struct task_struct *child)
> {
> + down_read(&cgroup_fork_mutex);
> task_lock(current);
> child->cgroups = current->cgroups;
> get_css_set(child->cgroups);
> @@ -3231,6 +3548,7 @@ void cgroup_post_fork(struct task_struct *child)
> task_unlock(child);
> write_unlock(&css_set_lock);
> }
> + up_read(&cgroup_fork_mutex);
> }
> /**
> * cgroup_exit - detach cgroup from exiting task
> @@ -3302,6 +3620,24 @@ void cgroup_exit(struct task_struct *tsk, int run_callbacks)
> }
>
> /**
> + * cgroup_fork_failed - undo operations for fork failure
> + * @tsk: pointer to task_struct of exiting process
> + * @run_callback: run exit callbacks?
> + *
> + * Description: Undo cgroup operations after cgroup_fork in fork failure.
> + *
> + * We release the read lock that was taken in cgroup_fork(), since it is
> + * supposed to be dropped in cgroup_post_fork in the success case. The other
> + * thing that wants to be done is detaching the failed child task from the
> + * cgroup, so we wrap cgroup_exit.
> + */
> +void cgroup_fork_failed(struct task_struct *tsk, int run_callbacks)
> +{
> + up_read(&cgroup_fork_mutex);
> + cgroup_exit(tsk, run_callbacks);
> +}
> +
> +/**
> * cgroup_clone - clone the cgroup the given subsystem is attached to
> * @tsk: the task to be moved
> * @subsys: the given subsystem
> diff --git a/kernel/fork.c b/kernel/fork.c
> index 926c117..027ec16 100644
> --- a/kernel/fork.c
> +++ b/kernel/fork.c
> @@ -1300,7 +1300,7 @@ bad_fork_cleanup_policy:
> mpol_put(p->mempolicy);
> bad_fork_cleanup_cgroup:
> #endif
> - cgroup_exit(p, cgroup_callbacks_done);
> + cgroup_fork_failed(p, cgroup_callbacks_done);
> delayacct_tsk_free(p);
> if (p->binfmt)
> module_put(p->binfmt->module);
>
> _______________________________________________
> Containers mailing list
> Containers-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA@public.gmane.org
> https://lists.linux-foundation.org/mailman/listinfo/containers
--
Dr Louis Rilling Kerlabs
Skype: louis.rilling Batiment Germanium
Phone: (+33|0) 6 80 89 08 23 80 avenue des Buttes de Coesmes
http://www.kerlabs.com/ 35700 Rennes
[-- Attachment #1.2: Digital signature --]
[-- Type: application/pgp-signature, Size: 197 bytes --]
[-- Attachment #2: Type: text/plain, Size: 206 bytes --]
_______________________________________________
Containers mailing list
Containers-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA@public.gmane.org
https://lists.linux-foundation.org/mailman/listinfo/containers
^ permalink raw reply [flat|nested] 29+ messages in thread* Re: [PATCH 5/6] Makes procs file writable to move all threads by tgid at once
[not found] ` <20090724032200.2463.82408.stgit-/yCBOHwbXCxd3OlUiQof+WCaruZE5nAUZeezCHUQhQ4@public.gmane.org>
2009-07-24 10:02 ` Louis Rilling
@ 2009-07-24 15:50 ` Matt Helsley
2009-11-09 17:07 ` Daniel Lezcano
2 siblings, 0 replies; 29+ messages in thread
From: Matt Helsley @ 2009-07-24 15:50 UTC (permalink / raw)
To: Ben Blum
Cc: containers-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA,
linux-kernel-u79uwXL29TY76Z2rM5mHXA,
menage-hpIqsD4AKlfQT0dZR+AlfA,
akpm-de/tnXTf+JLsfHDXvbKv3WD2FQJk+8+b
On Thu, Jul 23, 2009 at 08:22:00PM -0700, Ben Blum wrote:
> Makes procs file writable to move all threads by tgid at once
>
> This patch adds functionality that enables users to move all threads in a
> threadgroup at once to a cgroup by writing the tgid to the 'cgroup.procs'
> file. This current implementation makes use of a rwsem that's taken for
> reading in the fork() path to prevent newly forking threads within the
There is much ado about not taking additional "global locks" in fork()
paths.
* The fork and exit callbacks cgroup_fork() and cgroup_exit(), don't
* (usually) take cgroup_mutex. These are the two most performance
* critical pieces of code here.
...
and as I recall cgroup_fork() doesn't ever take cgroup_mutex because it is
so performance critical. Assuming the above comments in kernel/cgroup.c
are correct then this patch adds a performance regression by introducing a
global mutex in the fork path, doesn't it?
Cheers,
-Matt Helsley
^ permalink raw reply [flat|nested] 29+ messages in thread* Re: [PATCH 5/6] Makes procs file writable to move all threads by tgid at once
[not found] ` <20090724032200.2463.82408.stgit-/yCBOHwbXCxd3OlUiQof+WCaruZE5nAUZeezCHUQhQ4@public.gmane.org>
2009-07-24 10:02 ` Louis Rilling
2009-07-24 15:50 ` Matt Helsley
@ 2009-11-09 17:07 ` Daniel Lezcano
2 siblings, 0 replies; 29+ messages in thread
From: Daniel Lezcano @ 2009-11-09 17:07 UTC (permalink / raw)
To: Ben Blum
Cc: containers-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA,
linux-kernel-u79uwXL29TY76Z2rM5mHXA,
menage-hpIqsD4AKlfQT0dZR+AlfA,
akpm-de/tnXTf+JLsfHDXvbKv3WD2FQJk+8+b
[-- Attachment #1: Type: text/plain, Size: 3163 bytes --]
Ben Blum wrote:
> Makes procs file writable to move all threads by tgid at once
>
> This patch adds functionality that enables users to move all threads in a
> threadgroup at once to a cgroup by writing the tgid to the 'cgroup.procs'
> file. This current implementation makes use of a rwsem that's taken for
> reading in the fork() path to prevent newly forking threads within the
> threadgroup from "escaping" while moving is in progress.
>
> Signed-off-by: Ben Blum <bblum-hpIqsD4AKlfQT0dZR+AlfA@public.gmane.org>
>
[ cut ]
> /**
> + * cgroup_fork_failed - undo operations for fork failure
> + * @tsk: pointer to task_struct of exiting process
> + * @run_callback: run exit callbacks?
> + *
> + * Description: Undo cgroup operations after cgroup_fork in fork failure.
> + *
> + * We release the read lock that was taken in cgroup_fork(), since it is
> + * supposed to be dropped in cgroup_post_fork in the success case. The other
> + * thing that wants to be done is detaching the failed child task from the
> + * cgroup, so we wrap cgroup_exit.
> + */
> +void cgroup_fork_failed(struct task_struct *tsk, int run_callbacks)
> +{
> + up_read(&cgroup_fork_mutex);
> + cgroup_exit(tsk, run_callbacks);
> +}
> +
> +/**
> * cgroup_clone - clone the cgroup the given subsystem is attached to
> * @tsk: the task to be moved
> * @subsys: the given subsystem
> diff --git a/kernel/fork.c b/kernel/fork.c
> index 926c117..027ec16 100644
> --- a/kernel/fork.c
> +++ b/kernel/fork.c
> @@ -1300,7 +1300,7 @@ bad_fork_cleanup_policy:
> mpol_put(p->mempolicy);
> bad_fork_cleanup_cgroup:
> #endif
> - cgroup_exit(p, cgroup_callbacks_done);
> + cgroup_fork_failed(p, cgroup_callbacks_done);
> delayacct_tsk_free(p);
> if (p->binfmt)
> module_put(p->binfmt->module);
>
Hi Ben,
The current code (with or without your patch) may lead to an error
because the fork hook can fail and the exit hook is called in all the
cases making the fork / exit asymmetric.
I will take the usual example with a cgroup with a counter of tasks, in
the fork hook it increments the counter, in the exit hook it decrements
the counter. There is one process in the cgroup, hence the counter value
is 1. Now this process forks and the fork hook fails before the task
counter is incremented to 2, this is not detected in copy process
function because the cgroup_fork_callbacks does not return an error, so
the process will be forked without error and when the process will exits
the counter will be decremented reaching 0 instead of 1.
IMO, the correct fix should be to make the fork hook to return an error
and have the cgroup to call the exit method of the subsystem where the
fork hook was called. For example, there are 10 subsystems using the
fork / exit hooks, when the a process forks, the fork callbacks is
called for these subsystems but, let's say, the 3rd fails. So we undo,
by calling the exit hooks of the first two.
I wrote a patchset to consolidate the hooks called in the cgroup for
fork and exit, and one of them does a rollback for the fork hook when an
error occurs. I added an attachment the patch as an example.
Thanks
-- Daniel
>
[-- Attachment #2: 0001-cgroup-make-fork-hook-to-return-an-error.patch --]
[-- Type: text/x-diff, Size: 7208 bytes --]
Subject: make the fork hook to return an error
From: Daniel Lezcano <dlezcano-NmTC/0ZBporQT0dZR+AlfA@public.gmane.org>
The fork hooks does not do any error checking making possible to have
a cgroup subsystem to fail without detecting an error.
This patch makes the fork hook to return an error if a fork hook failed
in one of the subsystems. When a fork hook fails for a subsystem,
the function 'cgroup_fork_callbacks' rollbacks, by calling the exit
callback for all the subsystem the fork callback were called at this
point.
Signed-off-by: Daniel Lezcano <dlezcano-NmTC/0ZBporQT0dZR+AlfA@public.gmane.org>
Signed-off-by: Cedric Le Goater <clg-NmTC/0ZBporQT0dZR+AlfA@public.gmane.org>
---
include/linux/cgroup.h | 8 ++++----
kernel/cgroup.c | 37 +++++++++++++++++++++++++++----------
kernel/cgroup_freezer.c | 6 ++++--
kernel/exit.c | 2 +-
kernel/fork.c | 13 +++++++------
5 files changed, 43 insertions(+), 23 deletions(-)
Index: linux-2.6/include/linux/cgroup.h
===================================================================
--- linux-2.6.orig/include/linux/cgroup.h
+++ linux-2.6/include/linux/cgroup.h
@@ -31,9 +31,9 @@ extern void cgroup_lock(void);
extern bool cgroup_lock_live_group(struct cgroup *cgrp);
extern void cgroup_unlock(void);
extern void cgroup_fork(struct task_struct *p);
-extern void cgroup_fork_callbacks(struct task_struct *p);
+extern int cgroup_fork_callbacks(struct task_struct *p);
extern void cgroup_post_fork(struct task_struct *p);
-extern void cgroup_exit(struct task_struct *p, int run_callbacks);
+extern void cgroup_exit(struct task_struct *p);
extern int cgroupstats_build(struct cgroupstats *stats,
struct dentry *dentry);
@@ -430,7 +430,7 @@ struct cgroup_subsys {
void (*attach)(struct cgroup_subsys *ss, struct cgroup *cgrp,
struct cgroup *old_cgrp, struct task_struct *tsk,
bool threadgroup);
- void (*fork)(struct cgroup_subsys *ss, struct task_struct *task);
+ int (*fork)(struct cgroup_subsys *ss, struct task_struct *task);
void (*exit)(struct cgroup_subsys *ss, struct task_struct *task);
int (*populate)(struct cgroup_subsys *ss,
struct cgroup *cgrp);
@@ -569,7 +569,7 @@ unsigned short css_depth(struct cgroup_s
static inline int cgroup_init_early(void) { return 0; }
static inline int cgroup_init(void) { return 0; }
static inline void cgroup_fork(struct task_struct *p) {}
-static inline void cgroup_fork_callbacks(struct task_struct *p) {}
+static inline int cgroup_fork_callbacks(struct task_struct *p) {}
static inline void cgroup_post_fork(struct task_struct *p) {}
static inline void cgroup_exit(struct task_struct *p, int callbacks) {}
Index: linux-2.6/kernel/cgroup.c
===================================================================
--- linux-2.6.orig/kernel/cgroup.c
+++ linux-2.6/kernel/cgroup.c
@@ -3436,16 +3436,33 @@ void cgroup_fork(struct task_struct *chi
* tasklist. No need to take any locks since no-one can
* be operating on this task.
*/
-void cgroup_fork_callbacks(struct task_struct *child)
+int cgroup_fork_callbacks(struct task_struct *child)
{
- if (need_forkexit_callback) {
- int i;
- for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
- struct cgroup_subsys *ss = subsys[i];
- if (ss->fork)
- ss->fork(ss, child);
- }
+ int i, ret = 0;
+ struct cgroup_subsys *ss;
+
+ if (!need_forkexit_callback)
+ goto out;
+
+ for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
+ ss = subsys[i];
+ if (!ss->fork)
+ continue;
+ ret = ss->fork(ss, child);
+ if (ret)
+ goto out_exit;
}
+out:
+ return ret;
+
+out_exit:
+ for (i--; i >= 0; i--) {
+ ss = subsys[i];
+ if (!ss->exit)
+ continue;
+ ss->exit(ss, child);
+ }
+ goto out;
}
/**
@@ -3503,12 +3520,12 @@ void cgroup_post_fork(struct task_struct
* which wards off any cgroup_attach_task() attempts, or task is a failed
* fork, never visible to cgroup_attach_task.
*/
-void cgroup_exit(struct task_struct *tsk, int run_callbacks)
+void cgroup_exit(struct task_struct *tsk)
{
int i;
struct css_set *cg;
- if (run_callbacks && need_forkexit_callback) {
+ if (need_forkexit_callback) {
for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
struct cgroup_subsys *ss = subsys[i];
if (ss->exit)
Index: linux-2.6/kernel/cgroup_freezer.c
===================================================================
--- linux-2.6.orig/kernel/cgroup_freezer.c
+++ linux-2.6/kernel/cgroup_freezer.c
@@ -193,7 +193,7 @@ static int freezer_can_attach(struct cgr
return 0;
}
-static void freezer_fork(struct cgroup_subsys *ss, struct task_struct *task)
+static int freezer_fork(struct cgroup_subsys *ss, struct task_struct *task)
{
struct freezer *freezer;
@@ -210,7 +210,7 @@ static void freezer_fork(struct cgroup_s
* following check.
*/
if (!freezer->css.cgroup->parent)
- return;
+ goto out;
spin_lock_irq(&freezer->lock);
BUG_ON(freezer->state == CGROUP_FROZEN);
@@ -219,6 +219,8 @@ static void freezer_fork(struct cgroup_s
if (freezer->state == CGROUP_FREEZING)
freeze_task(task, true);
spin_unlock_irq(&freezer->lock);
+out:
+ return 0;
}
/*
Index: linux-2.6/kernel/exit.c
===================================================================
--- linux-2.6.orig/kernel/exit.c
+++ linux-2.6/kernel/exit.c
@@ -968,7 +968,7 @@ NORET_TYPE void do_exit(long code)
exit_fs(tsk);
check_stack_usage();
exit_thread();
- cgroup_exit(tsk, 1);
+ cgroup_exit(tsk);
if (group_dead && tsk->signal->leader)
disassociate_ctty(1);
Index: linux-2.6/kernel/fork.c
===================================================================
--- linux-2.6.orig/kernel/fork.c
+++ linux-2.6/kernel/fork.c
@@ -979,7 +979,6 @@ static struct task_struct *copy_process(
{
int retval;
struct task_struct *p;
- int cgroup_callbacks_done = 0;
if ((clone_flags & (CLONE_NEWNS|CLONE_FS)) == (CLONE_NEWNS|CLONE_FS))
return ERR_PTR(-EINVAL);
@@ -1217,13 +1216,14 @@ static struct task_struct *copy_process(
/* Now that the task is set up, run cgroup callbacks if
* necessary. We need to run them before the task is visible
* on the tasklist. */
- cgroup_fork_callbacks(p);
- cgroup_callbacks_done = 1;
+ retval = cgroup_fork_callbacks(p);
+ if (retval)
+ goto bad_fork_free_pid;
if (current->nsproxy != p->nsproxy) {
retval = ns_cgroup_clone(p, pid);
if (retval)
- goto bad_fork_free_pid;
+ goto bad_fork_cgroup_callbacks;
}
/* Need tasklist lock for parent etc handling! */
@@ -1268,7 +1268,7 @@ static struct task_struct *copy_process(
spin_unlock(¤t->sighand->siglock);
write_unlock_irq(&tasklist_lock);
retval = -ERESTARTNOINTR;
- goto bad_fork_free_pid;
+ goto bad_fork_cgroup_callbacks;
}
if (clone_flags & CLONE_THREAD) {
@@ -1306,6 +1306,8 @@ static struct task_struct *copy_process(
perf_event_fork(p);
return p;
+bad_fork_cgroup_callbacks:
+ cgroup_exit(p);
bad_fork_free_pid:
if (pid != &init_struct_pid)
free_pid(pid);
@@ -1335,7 +1337,6 @@ bad_fork_cleanup_policy:
mpol_put(p->mempolicy);
bad_fork_cleanup_cgroup:
#endif
- cgroup_exit(p, cgroup_callbacks_done);
delayacct_tsk_free(p);
module_put(task_thread_info(p)->exec_domain->module);
bad_fork_cleanup_count:
[-- Attachment #3: Type: text/plain, Size: 206 bytes --]
_______________________________________________
Containers mailing list
Containers-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA@public.gmane.org
https://lists.linux-foundation.org/mailman/listinfo/containers
^ permalink raw reply [flat|nested] 29+ messages in thread
[parent not found: <4AF84C68.7010803@free.fr>]
* [PATCH 6/6] Lets ss->can_attach and ss->attach do whole threadgroups at a time
[not found] ` <20090724032033.2463.79256.stgit-/yCBOHwbXCxd3OlUiQof+WCaruZE5nAUZeezCHUQhQ4@public.gmane.org>
` (4 preceding siblings ...)
2009-07-24 3:22 ` [PATCH 5/6] Makes procs file writable to move all threads by tgid at once Ben Blum
@ 2009-07-24 3:22 ` Ben Blum
5 siblings, 0 replies; 29+ messages in thread
From: Ben Blum @ 2009-07-24 3:22 UTC (permalink / raw)
To: linux-kernel-u79uwXL29TY76Z2rM5mHXA,
containers-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA,
akpm-de/tnXTf+JLsfHDXvbKv3WD2FQJk+8+b,
serue-r/Jw6+rmf7HQT0dZR+AlfA, lizf-BthXqXjhjHXQFUHtdCDX3A,
menage-hpIqsD4AKlfQT0dZR+AlfA, bblum-hpIqsD4AKlfQT0dZR+AlfA
Lets ss->can_attach and ss->attach do whole threadgroups at a time
This patch alters the ss->can_attach and ss->attach functions to be able to
deal with a whole threadgroup at a time, for use in cgroup_attach_proc. (This
is a post-patch to cgroup-procs-writable.patch.)
Signed-off-by: Ben Blum <bblum-hpIqsD4AKlfQT0dZR+AlfA@public.gmane.org>
---
include/linux/cgroup.h | 7 +++--
kernel/cgroup.c | 8 +++---
kernel/cgroup_freezer.c | 15 +++++++++--
kernel/cpuset.c | 65 ++++++++++++++++++++++++++++++++++++----------
kernel/ns_cgroup.c | 16 ++++++++++-
kernel/sched.c | 37 ++++++++++++++++++++++++--
mm/memcontrol.c | 3 +-
security/device_cgroup.c | 3 +-
8 files changed, 124 insertions(+), 30 deletions(-)
diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index cae7d3e..5a4383c 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -409,10 +409,11 @@ struct cgroup_subsys {
struct cgroup *cgrp);
int (*pre_destroy)(struct cgroup_subsys *ss, struct cgroup *cgrp);
void (*destroy)(struct cgroup_subsys *ss, struct cgroup *cgrp);
- int (*can_attach)(struct cgroup_subsys *ss,
- struct cgroup *cgrp, struct task_struct *tsk);
+ int (*can_attach)(struct cgroup_subsys *ss, struct cgroup *cgrp,
+ struct task_struct *tsk, bool threadgroup);
void (*attach)(struct cgroup_subsys *ss, struct cgroup *cgrp,
- struct cgroup *old_cgrp, struct task_struct *tsk);
+ struct cgroup *old_cgrp, struct task_struct *tsk,
+ bool threadgroup);
void (*fork)(struct cgroup_subsys *ss, struct task_struct *task);
void (*exit)(struct cgroup_subsys *ss, struct task_struct *task);
int (*populate)(struct cgroup_subsys *ss,
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 3f8d323..abdfa5a 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -1415,7 +1415,7 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
for_each_subsys(root, ss) {
if (ss->can_attach) {
- retval = ss->can_attach(ss, cgrp, tsk);
+ retval = ss->can_attach(ss, cgrp, tsk, false);
if (retval)
return retval;
}
@@ -1427,7 +1427,7 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
for_each_subsys(root, ss) {
if (ss->attach)
- ss->attach(ss, cgrp, oldcgrp, tsk);
+ ss->attach(ss, cgrp, oldcgrp, tsk, false);
}
synchronize_rcu();
@@ -1537,7 +1537,7 @@ int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader)
*/
for_each_subsys(root, ss) {
if (ss->can_attach) {
- retval = ss->can_attach(ss, cgrp, leader);
+ retval = ss->can_attach(ss, cgrp, leader, true);
if (retval)
return retval;
}
@@ -1649,7 +1649,7 @@ again:
*/
for_each_subsys(root, ss) {
if (ss->attach)
- ss->attach(ss, cgrp, oldcgrp, leader);
+ ss->attach(ss, cgrp, oldcgrp, leader, true);
}
/*
diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c
index fb249e2..4e352ab 100644
--- a/kernel/cgroup_freezer.c
+++ b/kernel/cgroup_freezer.c
@@ -159,10 +159,9 @@ static bool is_task_frozen_enough(struct task_struct *task)
*/
static int freezer_can_attach(struct cgroup_subsys *ss,
struct cgroup *new_cgroup,
- struct task_struct *task)
+ struct task_struct *task, bool threadgroup)
{
struct freezer *freezer;
-
/*
* Anything frozen can't move or be moved to/from.
*
@@ -177,6 +176,18 @@ static int freezer_can_attach(struct cgroup_subsys *ss,
if (freezer->state == CGROUP_FROZEN)
return -EBUSY;
+ if (threadgroup) {
+ struct task_struct *c;
+ rcu_read_lock();
+ list_for_each_entry_rcu(c, &task->thread_group, thread_group) {
+ if (is_task_frozen_enough(c)) {
+ rcu_read_unlock();
+ return -EBUSY;
+ }
+ }
+ rcu_read_unlock();
+ }
+
return 0;
}
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 7e75a41..86397f4 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -1324,9 +1324,10 @@ static int fmeter_getrate(struct fmeter *fmp)
static cpumask_var_t cpus_attach;
/* Called by cgroups to determine if a cpuset is usable; cgroup_mutex held */
-static int cpuset_can_attach(struct cgroup_subsys *ss,
- struct cgroup *cont, struct task_struct *tsk)
+static int cpuset_can_attach(struct cgroup_subsys *ss, struct cgroup *cont,
+ struct task_struct *tsk, bool threadgroup)
{
+ int ret;
struct cpuset *cs = cgroup_cs(cont);
if (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed))
@@ -1343,18 +1344,50 @@ static int cpuset_can_attach(struct cgroup_subsys *ss,
if (tsk->flags & PF_THREAD_BOUND)
return -EINVAL;
- return security_task_setscheduler(tsk, 0, NULL);
+ ret = security_task_setscheduler(tsk, 0, NULL);
+ if (ret)
+ return ret;
+ if (threadgroup) {
+ struct task_struct *c;
+ rcu_read_lock();
+ list_for_each_entry_rcu(c, &tsk->thread_group, thread_group) {
+ ret = security_task_setscheduler(c, 0, NULL);
+ if (ret) {
+ rcu_read_unlock();
+ return ret;
+ }
+ }
+ rcu_read_unlock();
+ }
+ return 0;
+}
+
+static void cpuset_attach_task(struct task_struct *tsk, nodemask_t *to,
+ struct cpuset *cs)
+{
+ int err;
+ /*
+ * can_attach beforehand should guarantee that this doesn't fail.
+ * TODO: have a better way to handle failure here
+ */
+ err = set_cpus_allowed_ptr(tsk, cpus_attach);
+ WARN_ON_ONCE(err);
+
+ task_lock(tsk);
+ cpuset_change_task_nodemask(tsk, to);
+ task_unlock(tsk);
+ cpuset_update_task_spread_flag(cs, tsk);
+
}
-static void cpuset_attach(struct cgroup_subsys *ss,
- struct cgroup *cont, struct cgroup *oldcont,
- struct task_struct *tsk)
+static void cpuset_attach(struct cgroup_subsys *ss, struct cgroup *cont,
+ struct cgroup *oldcont, struct task_struct *tsk,
+ bool threadgroup)
{
nodemask_t from, to;
struct mm_struct *mm;
struct cpuset *cs = cgroup_cs(cont);
struct cpuset *oldcs = cgroup_cs(oldcont);
- int err;
if (cs == &top_cpuset) {
cpumask_copy(cpus_attach, cpu_possible_mask);
@@ -1363,15 +1396,19 @@ static void cpuset_attach(struct cgroup_subsys *ss,
guarantee_online_cpus(cs, cpus_attach);
guarantee_online_mems(cs, &to);
}
- err = set_cpus_allowed_ptr(tsk, cpus_attach);
- if (err)
- return;
- task_lock(tsk);
- cpuset_change_task_nodemask(tsk, &to);
- task_unlock(tsk);
- cpuset_update_task_spread_flag(cs, tsk);
+ /* do per-task migration stuff possibly for each in the threadgroup */
+ cpuset_attach_task(tsk, &to, cs);
+ if (threadgroup) {
+ struct task_struct *c;
+ rcu_read_lock();
+ list_for_each_entry_rcu(c, &tsk->thread_group, thread_group) {
+ cpuset_attach_task(c, &to, cs);
+ }
+ rcu_read_unlock();
+ }
+ /* change mm; only needs to be done once even if threadgroup */
from = oldcs->mems_allowed;
to = cs->mems_allowed;
mm = get_task_mm(tsk);
diff --git a/kernel/ns_cgroup.c b/kernel/ns_cgroup.c
index 5aa854f..2a5dfec 100644
--- a/kernel/ns_cgroup.c
+++ b/kernel/ns_cgroup.c
@@ -42,8 +42,8 @@ int ns_cgroup_clone(struct task_struct *task, struct pid *pid)
* (hence either you are in the same cgroup as task, or in an
* ancestor cgroup thereof)
*/
-static int ns_can_attach(struct cgroup_subsys *ss,
- struct cgroup *new_cgroup, struct task_struct *task)
+static int ns_can_attach(struct cgroup_subsys *ss, struct cgroup *new_cgroup,
+ struct task_struct *task, bool threadgroup)
{
if (current != task) {
if (!capable(CAP_SYS_ADMIN))
@@ -56,6 +56,18 @@ static int ns_can_attach(struct cgroup_subsys *ss,
if (!cgroup_is_descendant(new_cgroup, task))
return -EPERM;
+ if (threadgroup) {
+ struct task_struct *c;
+ rcu_read_lock();
+ list_for_each_entry_rcu(c, &task->thread_group, thread_group) {
+ if (!cgroup_is_descendant(new_cgroup, c)) {
+ rcu_read_unlock();
+ return -EPERM;
+ }
+ }
+ rcu_read_unlock();
+ }
+
return 0;
}
diff --git a/kernel/sched.c b/kernel/sched.c
index 3393c18..b5e371b 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -10194,8 +10194,7 @@ cpu_cgroup_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp)
}
static int
-cpu_cgroup_can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
- struct task_struct *tsk)
+cpu_cgroup_can_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
{
#ifdef CONFIG_RT_GROUP_SCHED
if (!sched_rt_can_attach(cgroup_tg(cgrp), tsk))
@@ -10209,11 +10208,43 @@ cpu_cgroup_can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
return 0;
}
+static int
+cpu_cgroup_can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
+ struct task_struct *tsk, bool threadgroup)
+{
+ int retval = cpu_cgroup_can_attach_task(cgrp, tsk);
+ if (retval)
+ return retval;
+ if (threadgroup) {
+ struct task_struct *c;
+ rcu_read_lock();
+ list_for_each_entry_rcu(c, &tsk->thread_group, thread_group) {
+ retval = cpu_cgroup_can_attach_task(cgrp, c);
+ if (retval) {
+ rcu_read_unlock();
+ return retval;
+ }
+ }
+ rcu_read_unlock();
+ }
+ return 0;
+
+}
+
static void
cpu_cgroup_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
- struct cgroup *old_cont, struct task_struct *tsk)
+ struct cgroup *old_cont, struct task_struct *tsk
+ bool threadgroup)
{
sched_move_task(tsk);
+ if (threadgroup) {
+ struct task_struct *c;
+ rcu_read_lock();
+ list_for_each_entry_rcu(c, &tsk->thread_group, thread_group) {
+ sched_move_task(c);
+ }
+ rcu_read_unlock();
+ }
}
#ifdef CONFIG_FAIR_GROUP_SCHED
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 6ceb6f2..d9e9cf4 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -2584,7 +2584,8 @@ static int mem_cgroup_populate(struct cgroup_subsys *ss,
static void mem_cgroup_move_task(struct cgroup_subsys *ss,
struct cgroup *cont,
struct cgroup *old_cont,
- struct task_struct *p)
+ struct task_struct *p,
+ bool threadgroup)
{
mutex_lock(&memcg_tasklist);
/*
diff --git a/security/device_cgroup.c b/security/device_cgroup.c
index b8186ba..6cf8fd2 100644
--- a/security/device_cgroup.c
+++ b/security/device_cgroup.c
@@ -61,7 +61,8 @@ static inline struct dev_cgroup *task_devcgroup(struct task_struct *task)
struct cgroup_subsys devices_subsys;
static int devcgroup_can_attach(struct cgroup_subsys *ss,
- struct cgroup *new_cgroup, struct task_struct *task)
+ struct cgroup *new_cgroup, struct task_struct *task,
+ bool threadgroup)
{
if (current != task && !capable(CAP_SYS_ADMIN))
return -EPERM;
^ permalink raw reply related [flat|nested] 29+ messages in thread