* [PATCH -mm] cgroup,cpuset: use alternative malloc to allocate large memory buf for tasks
@ 2008-09-11 10:30 Lai Jiangshan
2008-09-11 10:56 ` Paul Jackson
2008-09-11 16:45 ` Paul Menage
0 siblings, 2 replies; 4+ messages in thread
From: Lai Jiangshan @ 2008-09-11 10:30 UTC (permalink / raw)
To: Andrew Morton; +Cc: Paul Menage, Paul Jackson, Linux Kernel Mailing List
This new alternative allocation implementation can allocate memory
up to 64M on 32-bit systems or 512M on 64-bit systems.
This patch fixes the problem for a really large cgroup.
Signed-off-by: Lai Jiangshan <laijs@cn.fujitsu.com>
---
diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index bb298de..974e898 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -403,6 +403,18 @@ void cgroup_iter_end(struct cgroup *cgrp, struct cgroup_iter *it);
int cgroup_scan_tasks(struct cgroup_scanner *scan);
int cgroup_attach_task(struct cgroup *, struct task_struct *);
+/*
+ * Basic struct of cgroup huge memory allocation,
+ * use typedef to hide its implementation.
+ */
+typedef struct {
+ struct page **page_array;
+ size_t page_count;
+} cgroup_huge_mem_t;
+
+void *cgroup_huge_mem_alloc(size_t size, cgroup_huge_mem_t *huge);
+void cgroup_huge_mem_free(void *ptr, cgroup_huge_mem_t *huge);
+
#else /* !CONFIG_CGROUPS */
static inline int cgroup_init_early(void) { return 0; }
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 996865a..3ad4ff0 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -142,6 +142,55 @@ static int notify_on_release(const struct cgroup *cgrp)
return test_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);
}
+#define CGROUP_HUGE_PAGES_THRESHOLD 4
+
+void *cgroup_huge_mem_alloc(size_t size, cgroup_huge_mem_t *huge)
+{
+ unsigned int i, j, n_pages;
+ struct page **pages;
+ void *mem;
+
+ huge->page_array = NULL;
+ huge->page_count = 0;
+ if (size < PAGE_SIZE * CGROUP_HUGE_PAGES_THRESHOLD)
+ return kmalloc(size, GFP_KERNEL);
+
+ n_pages = (size + PAGE_SIZE - 1) >> PAGE_SHIFT;
+ pages = kmalloc(sizeof(*pages) * n_pages, GFP_KERNEL);
+ if (!pages)
+ return NULL;
+
+ for (i = 0; i < n_pages; i++) {
+ pages[i] = alloc_page(GFP_KERNEL);
+ if (unlikely(!pages[i]))
+ goto depopulate;
+ }
+ mem = vmap(pages, n_pages, VM_MAP, PAGE_KERNEL);
+ if (mem) {
+ huge->page_array = pages;
+ huge->page_count = n_pages;
+ return mem;
+ }
+
+depopulate:
+ for (j = 0; j < i; j++)
+ __free_page(pages[j]);
+ kfree(pages);
+ return NULL;
+}
+
+void cgroup_huge_mem_free(void *ptr, cgroup_huge_mem_t *huge)
+{
+ if (huge->page_count) {
+ unsigned int i;
+ vunmap(ptr);
+ for (i = 0; i < huge->page_count; i++)
+ __free_page(huge->page_array[i]);
+ kfree(huge->page_array);
+ } else
+ kfree(ptr);
+}
+
/*
* for_each_subsys() allows you to iterate on each subsystem attached to
* an active hierarchy
@@ -2106,7 +2155,6 @@ static void *cgroup_tasks_start(struct seq_file *s, loff_t *pos)
down_read(&cgrp->pids_mutex);
if (pid) {
int end = cgrp->pids_length;
- int i;
while (index < end) {
int mid = (index + end) / 2;
if (cgrp->tasks_pids[mid] == pid) {
@@ -2164,12 +2212,35 @@ static struct seq_operations cgroup_tasks_seq_operations = {
.show = cgroup_tasks_show,
};
+
+static void *cgroup_pid_array_alloc(size_t size)
+{
+ cgroup_huge_mem_t huge;
+ void *mem = cgroup_huge_mem_alloc(size + sizeof(huge), &huge);
+ if (mem) {
+ *(cgroup_huge_mem_t *)mem = huge;
+ return mem + sizeof(huge);
+ }
+ return NULL;
+}
+
+static void cgroup_pid_array_free(void *ptr)
+{
+ if (ptr) {
+ cgroup_huge_mem_t huge;
+ void *mem = ptr - sizeof(huge);
+
+ huge = *(cgroup_huge_mem_t *)mem;
+ cgroup_huge_mem_free(mem, &huge);
+ }
+}
+
static void release_cgroup_pid_array(struct cgroup *cgrp)
{
down_write(&cgrp->pids_mutex);
BUG_ON(!cgrp->pids_use_count);
if (!--cgrp->pids_use_count) {
- kfree(cgrp->tasks_pids);
+ cgroup_pid_array_free(cgrp->tasks_pids);
cgrp->tasks_pids = NULL;
cgrp->pids_length = 0;
}
@@ -2217,7 +2288,7 @@ static int cgroup_tasks_open(struct inode *unused, struct file *file)
* show up until sometime later on.
*/
npids = cgroup_task_count(cgrp);
- pidarray = kmalloc(npids * sizeof(pid_t), GFP_KERNEL);
+ pidarray = cgroup_pid_array_alloc(npids * sizeof(pid_t));
if (!pidarray)
return -ENOMEM;
npids = pid_array_load(pidarray, npids, cgrp);
@@ -2228,7 +2299,7 @@ static int cgroup_tasks_open(struct inode *unused, struct file *file)
* array if necessary
*/
down_write(&cgrp->pids_mutex);
- kfree(cgrp->tasks_pids);
+ cgroup_pid_array_free(cgrp->tasks_pids);
cgrp->tasks_pids = pidarray;
cgrp->pids_length = npids;
cgrp->pids_use_count++;
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index f227bc1..38fde1e 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -999,6 +999,7 @@ static int update_tasks_nodemask(struct cpuset *cs, const nodemask_t *oldmem)
{
struct task_struct *p;
struct mm_struct **mmarray;
+ cgroup_huge_mem_t huge;
int i, n, ntasks;
int migrate;
int fudge;
@@ -1021,14 +1022,15 @@ static int update_tasks_nodemask(struct cpuset *cs, const nodemask_t *oldmem)
while (1) {
ntasks = cgroup_task_count(cs->css.cgroup); /* guess */
ntasks += fudge;
- mmarray = kmalloc(ntasks * sizeof(*mmarray), GFP_KERNEL);
+ mmarray = cgroup_huge_mem_alloc(ntasks * sizeof(*mmarray),
+ &huge);
if (!mmarray)
goto done;
read_lock(&tasklist_lock); /* block fork */
if (cgroup_task_count(cs->css.cgroup) <= ntasks)
break; /* got enough */
read_unlock(&tasklist_lock); /* try again */
- kfree(mmarray);
+ cgroup_huge_mem_free(mmarray, &huge);
}
n = 0;
@@ -1075,7 +1077,7 @@ static int update_tasks_nodemask(struct cpuset *cs, const nodemask_t *oldmem)
}
/* We're done rebinding vmas to this cpuset's new mems_allowed. */
- kfree(mmarray);
+ cgroup_huge_mem_free(mmarray, &huge);
cpuset_being_rebound = NULL;
retval = 0;
done:
^ permalink raw reply related [flat|nested] 4+ messages in thread
* Re: [PATCH -mm] cgroup,cpuset: use alternative malloc to allocate large memory buf for tasks
2008-09-11 10:30 [PATCH -mm] cgroup,cpuset: use alternative malloc to allocate large memory buf for tasks Lai Jiangshan
@ 2008-09-11 10:56 ` Paul Jackson
2008-09-11 16:45 ` Paul Menage
1 sibling, 0 replies; 4+ messages in thread
From: Paul Jackson @ 2008-09-11 10:56 UTC (permalink / raw)
To: Lai Jiangshan; +Cc: akpm, menage, linux-kernel
Lai Jiangshan wrote:
> This new alternative allocation implementation can allocate memory
> up to 64M in 32bits system or 512M in 64bits system.
Just a random idea - it seems to me that this new allocation
implementation might be more generally useful than just for cgroups.
So, instead of having cgroup_huge_mem_alloc() and cgroup_huge_mem_free()
in kernel/cgroup.c, one might have say big_kmalloc() and big_kfree() in
some mm/*.c file. I used "big" instead of "huge" or "compound" or
"large", as these other adjectives already have other meanings in these
allocator functions.
However ... I would suggest that you do not spend any time implementing
the above idea unless either (1) it strikes you as absolutely brilliant,
or (2) Paul Menage endorses it. Paul M has been paying closer attention
to this change than I have, so his recommendations are worth far more
than mine are here.
--
I won't rest till it's the best ...
Programmer, Linux Scalability
Paul Jackson <pj@sgi.com> 1.940.382.4214
^ permalink raw reply [flat|nested] 4+ messages in thread
* Re: [PATCH -mm] cgroup,cpuset: use alternative malloc to allocate large memory buf for tasks
2008-09-11 10:30 [PATCH -mm] cgroup,cpuset: use alternative malloc to allocate large memory buf for tasks Lai Jiangshan
2008-09-11 10:56 ` Paul Jackson
@ 2008-09-11 16:45 ` Paul Menage
2008-09-11 19:54 ` Paul Menage
1 sibling, 1 reply; 4+ messages in thread
From: Paul Menage @ 2008-09-11 16:45 UTC (permalink / raw)
To: Lai Jiangshan; +Cc: Andrew Morton, Paul Jackson, Linux Kernel Mailing List
On Thu, Sep 11, 2008 at 3:30 AM, Lai Jiangshan <laijs@cn.fujitsu.com> wrote:
> This new alternative allocation implementation can allocate memory
> up to 64M in 32bits system or 512M in 64bits system.
Isn't a lot of this patch just reimplementing vmalloc()?
Paul
>
> This patch fix the problem for a really large cgroup.
>
> Signed-off-by: Lai Jiangshan <laijs@cn.fujitsu.com>
> ---
> diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
> index bb298de..974e898 100644
> --- a/include/linux/cgroup.h
> +++ b/include/linux/cgroup.h
> @@ -403,6 +403,18 @@ void cgroup_iter_end(struct cgroup *cgrp, struct cgroup_iter *it);
> int cgroup_scan_tasks(struct cgroup_scanner *scan);
> int cgroup_attach_task(struct cgroup *, struct task_struct *);
>
> +/*
> + * Basic struct of cgroup huge memory allocation,
> + * use typedef to hide its implementation.
> + */
> +typedef struct {
> + struct page **page_array;
> + size_t page_count;
> +} cgroup_huge_mem_t;
> +
> +void *cgroup_huge_mem_alloc(size_t size, cgroup_huge_mem_t *huge);
> +void cgroup_huge_mem_free(void *ptr, cgroup_huge_mem_t *huge);
> +
> #else /* !CONFIG_CGROUPS */
>
> static inline int cgroup_init_early(void) { return 0; }
> diff --git a/kernel/cgroup.c b/kernel/cgroup.c
> index 996865a..3ad4ff0 100644
> --- a/kernel/cgroup.c
> +++ b/kernel/cgroup.c
> @@ -142,6 +142,55 @@ static int notify_on_release(const struct cgroup *cgrp)
> return test_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);
> }
>
> +#define CGROUP_HUGE_PAGES_THRESHOLD 4
> +
> +void *cgroup_huge_mem_alloc(size_t size, cgroup_huge_mem_t *huge)
> +{
> + unsigned int i, j, n_pages;
> + struct page **pages;
> + void *mem;
> +
> + huge->page_array = NULL;
> + huge->page_count = 0;
> + if (size < PAGE_SIZE * CGROUP_HUGE_PAGES_THRESHOLD)
> + return kmalloc(size, GFP_KERNEL);
> +
> + n_pages = (size + PAGE_SIZE - 1) >> PAGE_SHIFT;
> + pages = kmalloc(sizeof(*pages) * n_pages, GFP_KERNEL);
> + if (!pages)
> + return NULL;
> +
> + for (i = 0; i < n_pages; i++) {
> + pages[i] = alloc_page(GFP_KERNEL);
> + if (unlikely(!pages[i]))
> + goto depopulate;
> + }
> + mem = vmap(pages, n_pages, VM_MAP, PAGE_KERNEL);
> + if (mem) {
> + huge->page_array = pages;
> + huge->page_count = n_pages;
> + return mem;
> + }
> +
> +depopulate:
> + for (j = 0; j < i; j++)
> + __free_page(pages[j]);
> + kfree(pages);
> + return NULL;
> +}
> +
> +void cgroup_huge_mem_free(void *ptr, cgroup_huge_mem_t *huge)
> +{
> + if (huge->page_count) {
> + unsigned int i;
> + vunmap(ptr);
> + for (i = 0; i < huge->page_count; i++)
> + __free_page(huge->page_array[i]);
> + kfree(huge->page_array);
> + } else
> + kfree(ptr);
> +}
> +
> /*
> * for_each_subsys() allows you to iterate on each subsystem attached to
> * an active hierarchy
> @@ -2106,7 +2155,6 @@ static void *cgroup_tasks_start(struct seq_file *s, loff_t *pos)
> down_read(&cgrp->pids_mutex);
> if (pid) {
> int end = cgrp->pids_length;
> - int i;
> while (index < end) {
> int mid = (index + end) / 2;
> if (cgrp->tasks_pids[mid] == pid) {
> @@ -2164,12 +2212,35 @@ static struct seq_operations cgroup_tasks_seq_operations = {
> .show = cgroup_tasks_show,
> };
>
> +
> +static void *cgroup_pid_array_alloc(size_t size)
> +{
> + cgroup_huge_mem_t huge;
> + void *mem = cgroup_huge_mem_alloc(size + sizeof(huge), &huge);
> + if (mem) {
> + *(cgroup_huge_mem_t *)mem = huge;
> + return mem + sizeof(huge);
> + }
> + return NULL;
> +}
> +
> +static void cgroup_pid_array_free(void *ptr)
> +{
> + if (ptr) {
> + cgroup_huge_mem_t huge;
> + void *mem = ptr - sizeof(huge);
> +
> + huge = *(cgroup_huge_mem_t *)mem;
> + cgroup_huge_mem_free(mem, &huge);
> + }
> +}
> +
> static void release_cgroup_pid_array(struct cgroup *cgrp)
> {
> down_write(&cgrp->pids_mutex);
> BUG_ON(!cgrp->pids_use_count);
> if (!--cgrp->pids_use_count) {
> - kfree(cgrp->tasks_pids);
> + cgroup_pid_array_free(cgrp->tasks_pids);
> cgrp->tasks_pids = NULL;
> cgrp->pids_length = 0;
> }
> @@ -2217,7 +2288,7 @@ static int cgroup_tasks_open(struct inode *unused, struct file *file)
> * show up until sometime later on.
> */
> npids = cgroup_task_count(cgrp);
> - pidarray = kmalloc(npids * sizeof(pid_t), GFP_KERNEL);
> + pidarray = cgroup_pid_array_alloc(npids * sizeof(pid_t));
> if (!pidarray)
> return -ENOMEM;
> npids = pid_array_load(pidarray, npids, cgrp);
> @@ -2228,7 +2299,7 @@ static int cgroup_tasks_open(struct inode *unused, struct file *file)
> * array if necessary
> */
> down_write(&cgrp->pids_mutex);
> - kfree(cgrp->tasks_pids);
> + cgroup_pid_array_free(cgrp->tasks_pids);
> cgrp->tasks_pids = pidarray;
> cgrp->pids_length = npids;
> cgrp->pids_use_count++;
> diff --git a/kernel/cpuset.c b/kernel/cpuset.c
> index f227bc1..38fde1e 100644
> --- a/kernel/cpuset.c
> +++ b/kernel/cpuset.c
> @@ -999,6 +999,7 @@ static int update_tasks_nodemask(struct cpuset *cs, const nodemask_t *oldmem)
> {
> struct task_struct *p;
> struct mm_struct **mmarray;
> + cgroup_huge_mem_t huge;
> int i, n, ntasks;
> int migrate;
> int fudge;
> @@ -1021,14 +1022,15 @@ static int update_tasks_nodemask(struct cpuset *cs, const nodemask_t *oldmem)
> while (1) {
> ntasks = cgroup_task_count(cs->css.cgroup); /* guess */
> ntasks += fudge;
> - mmarray = kmalloc(ntasks * sizeof(*mmarray), GFP_KERNEL);
> + mmarray = cgroup_huge_mem_alloc(ntasks * sizeof(*mmarray),
> + &huge);
> if (!mmarray)
> goto done;
> read_lock(&tasklist_lock); /* block fork */
> if (cgroup_task_count(cs->css.cgroup) <= ntasks)
> break; /* got enough */
> read_unlock(&tasklist_lock); /* try again */
> - kfree(mmarray);
> + cgroup_huge_mem_free(mmarray, &huge);
> }
>
> n = 0;
> @@ -1075,7 +1077,7 @@ static int update_tasks_nodemask(struct cpuset *cs, const nodemask_t *oldmem)
> }
>
> /* We're done rebinding vmas to this cpuset's new mems_allowed. */
> - kfree(mmarray);
> + cgroup_huge_mem_free(mmarray, &huge);
> cpuset_being_rebound = NULL;
> retval = 0;
> done:
>
>
^ permalink raw reply [flat|nested] 4+ messages in thread
* Re: [PATCH -mm] cgroup,cpuset: use alternative malloc to allocate large memory buf for tasks
2008-09-11 16:45 ` Paul Menage
@ 2008-09-11 19:54 ` Paul Menage
0 siblings, 0 replies; 4+ messages in thread
From: Paul Menage @ 2008-09-11 19:54 UTC (permalink / raw)
To: Lai Jiangshan
Cc: Andrew Morton, Paul Jackson, Linux Kernel Mailing List,
linux-mm@kvack.org
On Thu, Sep 11, 2008 at 9:45 AM, Paul Menage <menage@google.com> wrote:
> On Thu, Sep 11, 2008 at 3:30 AM, Lai Jiangshan <laijs@cn.fujitsu.com> wrote:
>> This new alternative allocation implementation can allocate memory
>> up to 64M in 32bits system or 512M in 64bits system.
>
> Isn't a lot of this patch just reimplementing vmalloc()?
To extend on this, I think there are two ways of fixing the large
allocation problem:
1) just use vmalloc() rather than kmalloc() when the pid array is over
a certain threshold (probably 1 page?)
2) allocate pages/chunks in a similar way to your CL, but don't bother
mapping them. Instead we'd use the fact that each record (pid) is the
same size, and hence we can very easily use the high bits of an index
to select the chunk and the low bits to select the pid within the
chunk - no need to suffer the overhead of setting up and tearing down
ptes in order for the MMU do the same operation for us in hardware.
Obviously option 1 is a lot simpler, but option 2 avoids a
vmap()/vunmap() on every open/close of a tasks file. I'm not familiar
enough with the performance of vmap/vunmap on typical
hardware/workloads to know how high this overhead is - maybe a VM
guru can comment?
Paul
^ permalink raw reply [flat|nested] 4+ messages in thread
end of thread, other threads:[~2008-09-11 19:55 UTC | newest]
Thread overview: 4+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2008-09-11 10:30 [PATCH -mm] cgroup,cpuset: use alternative malloc to allocate large memory buf for tasks Lai Jiangshan
2008-09-11 10:56 ` Paul Jackson
2008-09-11 16:45 ` Paul Menage
2008-09-11 19:54 ` Paul Menage
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox