public inbox for linux-kernel@vger.kernel.org
 help / color / mirror / Atom feed
From: Lai Jiangshan <laijs@cn.fujitsu.com>
To: Andrew Morton <akpm@linux-foundation.org>
Cc: Paul Menage <menage@google.com>, Paul Jackson <pj@sgi.com>,
	Linux Kernel Mailing List <linux-kernel@vger.kernel.org>
Subject: [PATCH -mm] cgroup,cpuset: use alternative malloc to allocate large memory buf for tasks
Date: Thu, 11 Sep 2008 18:30:06 +0800	[thread overview]
Message-ID: <48C8F32E.2020004@cn.fujitsu.com> (raw)

This new alternative allocation implementation can allocate memory
buffers of up to 64MB on 32-bit systems or 512MB on 64-bit systems.

This patch fixes the problem for really large cgroups.

Signed-off-by: Lai Jiangshan <laijs@cn.fujitsu.com>
---
diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index bb298de..974e898 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -403,6 +403,18 @@ void cgroup_iter_end(struct cgroup *cgrp, struct cgroup_iter *it);
 int cgroup_scan_tasks(struct cgroup_scanner *scan);
 int cgroup_attach_task(struct cgroup *, struct task_struct *);
 
+/*
+ * Bookkeeping for a "huge" cgroup allocation: large buffers are built
+ * from individual pages stitched together with vmap(), and
+ * page_array/page_count record those pages so the mapping can later be
+ * torn down.  page_count == 0 means the buffer came from plain
+ * kmalloc().  The typedef hides the implementation from callers.
+ */
+typedef struct {
+	struct page **page_array;	/* pages backing the vmap()ed area */
+	size_t page_count;		/* 0 => buffer was kmalloc()ed */
+} cgroup_huge_mem_t;
+
+/*
+ * Allocator pair; each successful *_alloc must be matched by a *_free
+ * passing the same cgroup_huge_mem_t descriptor.
+ * NOTE(review): vmalloc()/vfree() already provide page-backed,
+ * virtually contiguous allocations — consider using them instead of
+ * open-coding this.
+ */
+void *cgroup_huge_mem_alloc(size_t size, cgroup_huge_mem_t *huge);
+void cgroup_huge_mem_free(void *ptr, cgroup_huge_mem_t *huge);
+
+
 #else /* !CONFIG_CGROUPS */
 
 static inline int cgroup_init_early(void) { return 0; }
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 996865a..3ad4ff0 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -142,6 +142,55 @@ static int notify_on_release(const struct cgroup *cgrp)
 	return test_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);
 }
 
+/* Requests smaller than this many pages are served by plain kmalloc(). */
+#define CGROUP_HUGE_PAGES_THRESHOLD 4
+
+/*
+ * Allocate @size bytes for cgroup task arrays.  Small requests fall
+ * back to kmalloc(); larger ones allocate individual pages and vmap()
+ * them, so no physically contiguous memory is required.  On success
+ * *huge records how the buffer was obtained (page_count == 0 means
+ * kmalloc); pass it unchanged to cgroup_huge_mem_free().  Returns NULL
+ * on failure.
+ *
+ * NOTE(review): the pages[] index array itself is a single kmalloc();
+ * for the largest requests (~128K pages at 4K PAGE_SIZE) that is about
+ * 1MB of contiguous memory, which may exceed the kmalloc limit —
+ * another reason vmalloc() would be preferable here.
+ */
+void *cgroup_huge_mem_alloc(size_t size, cgroup_huge_mem_t *huge)
+{
+	unsigned int i, j, n_pages;
+	struct page **pages;
+	void *mem;
+
+	/* Default to the kmalloc convention so the free path works. */
+	huge->page_array = NULL;
+	huge->page_count = 0;
+	if (size < PAGE_SIZE * CGROUP_HUGE_PAGES_THRESHOLD)
+		return kmalloc(size, GFP_KERNEL);
+
+	n_pages = (size + PAGE_SIZE - 1) >> PAGE_SHIFT;	/* round up */
+	pages = kmalloc(sizeof(*pages) * n_pages, GFP_KERNEL);
+	if (!pages)
+		return NULL;
+
+	for (i = 0; i < n_pages; i++) {
+		pages[i] = alloc_page(GFP_KERNEL);
+		if (unlikely(!pages[i]))
+			goto depopulate;
+	}
+	/* Map the pages into one virtually contiguous kernel range. */
+	mem = vmap(pages, n_pages, VM_MAP, PAGE_KERNEL);
+	if (mem) {
+		huge->page_array = pages;
+		huge->page_count = n_pages;
+		return mem;
+	}
+
+depopulate:
+	/* Unwind: free the i pages allocated so far, then the index. */
+	for (j = 0; j < i; j++)
+		__free_page(pages[j]);
+	kfree(pages);
+	return NULL;
+}
+
+/*
+ * Release a buffer obtained from cgroup_huge_mem_alloc().  @huge must
+ * be the descriptor that allocation filled in: a nonzero page_count
+ * means @ptr is a vmap()ed area backed by page_array; zero means it
+ * was a plain kmalloc() buffer.
+ */
+void cgroup_huge_mem_free(void *ptr, cgroup_huge_mem_t *huge)
+{
+	if (huge->page_count) {
+		unsigned int i;
+		vunmap(ptr);	/* tear down the mapping before freeing pages */
+		for (i = 0; i < huge->page_count; i++)
+			__free_page(huge->page_array[i]);
+		kfree(huge->page_array);
+	} else
+		kfree(ptr);
+}
+
 /*
  * for_each_subsys() allows you to iterate on each subsystem attached to
  * an active hierarchy
@@ -2106,7 +2155,6 @@ static void *cgroup_tasks_start(struct seq_file *s, loff_t *pos)
 	down_read(&cgrp->pids_mutex);
 	if (pid) {
 		int end = cgrp->pids_length;
-		int i;
 		while (index < end) {
 			int mid = (index + end) / 2;
 			if (cgrp->tasks_pids[mid] == pid) {
@@ -2164,12 +2212,35 @@ static struct seq_operations cgroup_tasks_seq_operations = {
 	.show = cgroup_tasks_show,
 };
 
+
+/*
+ * Allocate a pid array of @size bytes.  The cgroup_huge_mem_t
+ * descriptor is stashed immediately before the returned pointer, so
+ * the matching cgroup_pid_array_free() can recover it from the
+ * pointer alone.  Returns NULL on allocation failure.
+ * NOTE(review): relies on sizeof(cgroup_huge_mem_t) keeping the pid_t
+ * payload sufficiently aligned — true for this two-pointer-sized
+ * struct, but worth a comment if the struct ever grows.
+ */
+static void *cgroup_pid_array_alloc(size_t size)
+{
+	cgroup_huge_mem_t huge;
+	void *mem = cgroup_huge_mem_alloc(size + sizeof(huge), &huge);
+	if (mem) {
+		*(cgroup_huge_mem_t *)mem = huge;	/* hidden header */
+		return mem + sizeof(huge);
+	}
+	return NULL;
+}
+
+/*
+ * Free a buffer from cgroup_pid_array_alloc(); NULL is a no-op.
+ * Recovers the cgroup_huge_mem_t header stored just before @ptr and
+ * hands both back to cgroup_huge_mem_free().
+ */
+static void cgroup_pid_array_free(void *ptr)
+{
+	if (ptr) {
+		cgroup_huge_mem_t huge;
+		void *mem = ptr - sizeof(huge);	/* back up to the header */
+
+		huge = *(cgroup_huge_mem_t *)mem;
+		cgroup_huge_mem_free(mem, &huge);
+	}
+}
+
 static void release_cgroup_pid_array(struct cgroup *cgrp)
 {
 	down_write(&cgrp->pids_mutex);
 	BUG_ON(!cgrp->pids_use_count);
 	if (!--cgrp->pids_use_count) {
-		kfree(cgrp->tasks_pids);
+		cgroup_pid_array_free(cgrp->tasks_pids);
 		cgrp->tasks_pids = NULL;
 		cgrp->pids_length = 0;
 	}
@@ -2217,7 +2288,7 @@ static int cgroup_tasks_open(struct inode *unused, struct file *file)
 	 * show up until sometime later on.
 	 */
 	npids = cgroup_task_count(cgrp);
-	pidarray = kmalloc(npids * sizeof(pid_t), GFP_KERNEL);
+	pidarray = cgroup_pid_array_alloc(npids * sizeof(pid_t));
 	if (!pidarray)
 		return -ENOMEM;
 	npids = pid_array_load(pidarray, npids, cgrp);
@@ -2228,7 +2299,7 @@ static int cgroup_tasks_open(struct inode *unused, struct file *file)
 	 * array if necessary
 	 */
 	down_write(&cgrp->pids_mutex);
-	kfree(cgrp->tasks_pids);
+	cgroup_pid_array_free(cgrp->tasks_pids);
 	cgrp->tasks_pids = pidarray;
 	cgrp->pids_length = npids;
 	cgrp->pids_use_count++;
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index f227bc1..38fde1e 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -999,6 +999,7 @@ static int update_tasks_nodemask(struct cpuset *cs, const nodemask_t *oldmem)
 {
 	struct task_struct *p;
 	struct mm_struct **mmarray;
+	cgroup_huge_mem_t huge;
 	int i, n, ntasks;
 	int migrate;
 	int fudge;
@@ -1021,14 +1022,15 @@ static int update_tasks_nodemask(struct cpuset *cs, const nodemask_t *oldmem)
 	while (1) {
 		ntasks = cgroup_task_count(cs->css.cgroup);  /* guess */
 		ntasks += fudge;
-		mmarray = kmalloc(ntasks * sizeof(*mmarray), GFP_KERNEL);
+		mmarray = cgroup_huge_mem_alloc(ntasks * sizeof(*mmarray),
+				&huge);
 		if (!mmarray)
 			goto done;
 		read_lock(&tasklist_lock);		/* block fork */
 		if (cgroup_task_count(cs->css.cgroup) <= ntasks)
 			break;				/* got enough */
 		read_unlock(&tasklist_lock);		/* try again */
-		kfree(mmarray);
+		cgroup_huge_mem_free(mmarray, &huge);
 	}
 
 	n = 0;
@@ -1075,7 +1077,7 @@ static int update_tasks_nodemask(struct cpuset *cs, const nodemask_t *oldmem)
 	}
 
 	/* We're done rebinding vmas to this cpuset's new mems_allowed. */
-	kfree(mmarray);
+	cgroup_huge_mem_free(mmarray, &huge);
 	cpuset_being_rebound = NULL;
 	retval = 0;
 done:


             reply	other threads:[~2008-09-11 10:32 UTC|newest]

Thread overview: 4+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2008-09-11 10:30 Lai Jiangshan [this message]
2008-09-11 10:56 ` [PATCH -mm] cgroup,cpuset: use alternative malloc to allocate large memory buf for tasks Paul Jackson
2008-09-11 16:45 ` Paul Menage
2008-09-11 19:54   ` Paul Menage

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=48C8F32E.2020004@cn.fujitsu.com \
    --to=laijs@cn.fujitsu.com \
    --cc=akpm@linux-foundation.org \
    --cc=linux-kernel@vger.kernel.org \
    --cc=menage@google.com \
    --cc=pj@sgi.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox