public inbox for linux-kernel@vger.kernel.org
 help / color / mirror / Atom feed
* [PATCH -mm] cgroup,cpuset: use alternative malloc to allocate large memory buf for tasks
@ 2008-09-11 10:30 Lai Jiangshan
  2008-09-11 10:56 ` Paul Jackson
  2008-09-11 16:45 ` Paul Menage
  0 siblings, 2 replies; 4+ messages in thread
From: Lai Jiangshan @ 2008-09-11 10:30 UTC (permalink / raw)
  To: Andrew Morton; +Cc: Paul Menage, Paul Jackson, Linux Kernel Mailing List

This new alternative allocation implementation can allocate up to
64MB of memory on 32-bit systems or 512MB on 64-bit systems.

This patch fixes the problem for really large cgroups.

Signed-off-by: Lai Jiangshan <laijs@cn.fujitsu.com>
---
diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index bb298de..974e898 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -403,6 +403,18 @@ void cgroup_iter_end(struct cgroup *cgrp, struct cgroup_iter *it);
 int cgroup_scan_tasks(struct cgroup_scanner *scan);
 int cgroup_attach_task(struct cgroup *, struct task_struct *);
 
+/*
+ * Descriptor for a cgroup huge-memory allocation; the typedef hides
+ * the implementation from users of the allocator.
+ */
+typedef struct {
+	struct page **page_array;
+	size_t page_count;
+} cgroup_huge_mem_t;
+
+void *cgroup_huge_mem_alloc(size_t size, cgroup_huge_mem_t *huge);
+void cgroup_huge_mem_free(void *ptr, cgroup_huge_mem_t *huge);
+
 #else /* !CONFIG_CGROUPS */
 
 static inline int cgroup_init_early(void) { return 0; }
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 996865a..3ad4ff0 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -142,6 +142,55 @@ static int notify_on_release(const struct cgroup *cgrp)
 	return test_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);
 }
 
+#define CGROUP_HUGE_PAGES_THRESHOLD 4
+
+/*
+ * cgroup_huge_mem_alloc - allocate @size bytes, using vmap() of
+ * individually allocated pages when the request is large.
+ *
+ * Requests smaller than CGROUP_HUGE_PAGES_THRESHOLD pages are served
+ * by plain kmalloc().  Larger requests allocate single pages and map
+ * them virtually contiguous with vmap(), so no large physically
+ * contiguous region is required.
+ *
+ * On return @huge records which path was taken: page_count == 0 means
+ * the kmalloc() path, otherwise page_array/page_count describe the
+ * backing pages.  The same @huge must be passed to
+ * cgroup_huge_mem_free() to release the buffer.
+ * Returns the buffer, or NULL on allocation failure.
+ */
+void *cgroup_huge_mem_alloc(size_t size, cgroup_huge_mem_t *huge)
+{
+	unsigned int i, j, n_pages;
+	struct page **pages;
+	void *mem;
+
+	/* Default to the "small allocation" state so the matching
+	 * free knows to use kfree(). */
+	huge->page_array = NULL;
+	huge->page_count = 0;
+	if (size < PAGE_SIZE * CGROUP_HUGE_PAGES_THRESHOLD)
+		return kmalloc(size, GFP_KERNEL);
+
+	/* Round up to whole pages.  NOTE(review): the addition can wrap
+	 * for sizes near SIZE_MAX -- assumed not to happen since callers
+	 * pass task-count-derived sizes; confirm if new callers appear. */
+	n_pages = (size + PAGE_SIZE - 1) >> PAGE_SHIFT;
+	pages = kmalloc(sizeof(*pages) * n_pages, GFP_KERNEL);
+	if (!pages)
+		return NULL;
+
+	for (i = 0; i < n_pages; i++) {
+		pages[i] = alloc_page(GFP_KERNEL);
+		if (unlikely(!pages[i]))
+			goto depopulate;
+	}
+	mem = vmap(pages, n_pages, VM_MAP, PAGE_KERNEL);
+	if (mem) {
+		huge->page_array = pages;
+		huge->page_count = n_pages;
+		return mem;
+	}
+
+depopulate:
+	/* Unwind: free the i pages allocated so far, then the array.
+	 * Also reached when vmap() itself fails (i == n_pages). */
+	for (j = 0; j < i; j++)
+		__free_page(pages[j]);
+	kfree(pages);
+	return NULL;
+}
+
+/*
+ * cgroup_huge_mem_free - release a buffer from cgroup_huge_mem_alloc().
+ *
+ * @huge must be the descriptor filled in by the matching allocation:
+ * a non-zero page_count selects the vunmap()-and-free-pages teardown,
+ * otherwise @ptr is a plain kmalloc() allocation.
+ */
+void cgroup_huge_mem_free(void *ptr, cgroup_huge_mem_t *huge)
+{
+	if (huge->page_count) {
+		unsigned int i;
+		vunmap(ptr);
+		for (i = 0; i < huge->page_count; i++)
+			__free_page(huge->page_array[i]);
+		kfree(huge->page_array);
+	} else
+		kfree(ptr);
+}
+
 /*
  * for_each_subsys() allows you to iterate on each subsystem attached to
  * an active hierarchy
@@ -2106,7 +2155,6 @@ static void *cgroup_tasks_start(struct seq_file *s, loff_t *pos)
 	down_read(&cgrp->pids_mutex);
 	if (pid) {
 		int end = cgrp->pids_length;
-		int i;
 		while (index < end) {
 			int mid = (index + end) / 2;
 			if (cgrp->tasks_pids[mid] == pid) {
@@ -2164,12 +2212,35 @@ static struct seq_operations cgroup_tasks_seq_operations = {
 	.show = cgroup_tasks_show,
 };
 
+
+/*
+ * Allocate a pid array of @size bytes.  The cgroup_huge_mem_t
+ * descriptor is stashed in a hidden header placed just before the
+ * returned pointer, so cgroup_pid_array_free() can recover it from
+ * the pointer alone.  Returns NULL on allocation failure.
+ *
+ * NOTE(review): "mem + sizeof(huge)" relies on the kernel's void *
+ * arithmetic extension; the returned pointer is aligned only to
+ * sizeof(cgroup_huge_mem_t) past a page/kmalloc boundary -- adequate
+ * for the pid_t arrays stored here.
+ */
+static void *cgroup_pid_array_alloc(size_t size)
+{
+	cgroup_huge_mem_t huge;
+	void *mem = cgroup_huge_mem_alloc(size + sizeof(huge), &huge);
+	if (mem) {
+		/* Copy the descriptor into the hidden header. */
+		*(cgroup_huge_mem_t *)mem = huge;
+		return mem + sizeof(huge);
+	}
+	return NULL;
+}
+
+/*
+ * Free an array obtained from cgroup_pid_array_alloc().  A NULL @ptr
+ * is a no-op.  The descriptor hidden just before @ptr is read back
+ * and handed to cgroup_huge_mem_free() to pick the right teardown.
+ */
+static void cgroup_pid_array_free(void *ptr)
+{
+	if (ptr) {
+		cgroup_huge_mem_t huge;
+		void *mem = ptr - sizeof(huge);
+
+		huge = *(cgroup_huge_mem_t *)mem;
+		cgroup_huge_mem_free(mem, &huge);
+	}
+}
+
 static void release_cgroup_pid_array(struct cgroup *cgrp)
 {
 	down_write(&cgrp->pids_mutex);
 	BUG_ON(!cgrp->pids_use_count);
 	if (!--cgrp->pids_use_count) {
-		kfree(cgrp->tasks_pids);
+		cgroup_pid_array_free(cgrp->tasks_pids);
 		cgrp->tasks_pids = NULL;
 		cgrp->pids_length = 0;
 	}
@@ -2217,7 +2288,7 @@ static int cgroup_tasks_open(struct inode *unused, struct file *file)
 	 * show up until sometime later on.
 	 */
 	npids = cgroup_task_count(cgrp);
-	pidarray = kmalloc(npids * sizeof(pid_t), GFP_KERNEL);
+	pidarray = cgroup_pid_array_alloc(npids * sizeof(pid_t));
 	if (!pidarray)
 		return -ENOMEM;
 	npids = pid_array_load(pidarray, npids, cgrp);
@@ -2228,7 +2299,7 @@ static int cgroup_tasks_open(struct inode *unused, struct file *file)
 	 * array if necessary
 	 */
 	down_write(&cgrp->pids_mutex);
-	kfree(cgrp->tasks_pids);
+	cgroup_pid_array_free(cgrp->tasks_pids);
 	cgrp->tasks_pids = pidarray;
 	cgrp->pids_length = npids;
 	cgrp->pids_use_count++;
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index f227bc1..38fde1e 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -999,6 +999,7 @@ static int update_tasks_nodemask(struct cpuset *cs, const nodemask_t *oldmem)
 {
 	struct task_struct *p;
 	struct mm_struct **mmarray;
+	cgroup_huge_mem_t huge;
 	int i, n, ntasks;
 	int migrate;
 	int fudge;
@@ -1021,14 +1022,15 @@ static int update_tasks_nodemask(struct cpuset *cs, const nodemask_t *oldmem)
 	while (1) {
 		ntasks = cgroup_task_count(cs->css.cgroup);  /* guess */
 		ntasks += fudge;
-		mmarray = kmalloc(ntasks * sizeof(*mmarray), GFP_KERNEL);
+		mmarray = cgroup_huge_mem_alloc(ntasks * sizeof(*mmarray),
+				&huge);
 		if (!mmarray)
 			goto done;
 		read_lock(&tasklist_lock);		/* block fork */
 		if (cgroup_task_count(cs->css.cgroup) <= ntasks)
 			break;				/* got enough */
 		read_unlock(&tasklist_lock);		/* try again */
-		kfree(mmarray);
+		cgroup_huge_mem_free(mmarray, &huge);
 	}
 
 	n = 0;
@@ -1075,7 +1077,7 @@ static int update_tasks_nodemask(struct cpuset *cs, const nodemask_t *oldmem)
 	}
 
 	/* We're done rebinding vmas to this cpuset's new mems_allowed. */
-	kfree(mmarray);
+	cgroup_huge_mem_free(mmarray, &huge);
 	cpuset_being_rebound = NULL;
 	retval = 0;
 done:


^ permalink raw reply related	[flat|nested] 4+ messages in thread

end of thread, other threads:[~2008-09-11 19:55 UTC | newest]

Thread overview: 4+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2008-09-11 10:30 [PATCH -mm] cgroup,cpuset: use alternative malloc to allocate large memory buf for tasks Lai Jiangshan
2008-09-11 10:56 ` Paul Jackson
2008-09-11 16:45 ` Paul Menage
2008-09-11 19:54   ` Paul Menage

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox