[PATCH mmotm] memcg use generic percpu allocator instead of private one

linux-mm.kvack.org archive mirror
 help / color / mirror / Atom feed

* [PATCH mmotm] memcg use generic percpu allocator instead of private one
@ 2010-01-20  7:18 KAMEZAWA Hiroyuki
  2010-01-20  9:37 ` Balbir Singh
  0 siblings, 1 reply; 6+ messages in thread
From: KAMEZAWA Hiroyuki @ 2010-01-20  7:18 UTC (permalink / raw)
  To: linux-mm@kvack.org
  Cc: linux-kernel@vger.kernel.org, balbir@linux.vnet.ibm.com,
	nishimura@mxp.nes.nec.co.jp, kirill

This patch is onto mmotm Jan/15.
=
From: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>

When the per-cpu counter for memcg was implemented, the dynamic percpu
allocator was not very good. But now we have a good one, and useful macros.
This patch replaces memcg's private percpu counter implementation with
the generic dynamic percpu allocator and macros.

The benefits are
	- We can remove private implementation.
	- The counters will be NUMA-aware. (Current one is not...)
	- This patch reduces sizeof(struct mem_cgroup). As a result,
	  struct mem_cgroup may fit within a single page on small configs.

By this, size of text is reduced.
 [Before]
 [kamezawa@bluextal mmotm-2.6.33-Jan15]$ size mm/memcontrol.o
   text    data     bss     dec     hex filename
  24373    2528    4132   31033    7939 mm/memcontrol.o
 [After]
 [kamezawa@bluextal mmotm-2.6.33-Jan15]$ size mm/memcontrol.o
   text    data     bss     dec     hex filename
  23913    2528    4132   30573    776d mm/memcontrol.o

This includes no functional changes.

Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
---
 mm/memcontrol.c |  184 +++++++++++++++++++-------------------------------------
 1 file changed, 63 insertions(+), 121 deletions(-)

Index: mmotm-2.6.33-Jan15/mm/memcontrol.c
===================================================================
--- mmotm-2.6.33-Jan15.orig/mm/memcontrol.c
+++ mmotm-2.6.33-Jan15/mm/memcontrol.c
@@ -89,54 +89,8 @@ enum mem_cgroup_stat_index {
 
 struct mem_cgroup_stat_cpu {
 	s64 count[MEM_CGROUP_STAT_NSTATS];
-} ____cacheline_aligned_in_smp;
-
-struct mem_cgroup_stat {
-	struct mem_cgroup_stat_cpu cpustat[0];
 };
 
-static inline void
-__mem_cgroup_stat_set_safe(struct mem_cgroup_stat_cpu *stat,
-				enum mem_cgroup_stat_index idx, s64 val)
-{
-	stat->count[idx] = val;
-}
-
-static inline s64
-__mem_cgroup_stat_read_local(struct mem_cgroup_stat_cpu *stat,
-				enum mem_cgroup_stat_index idx)
-{
-	return stat->count[idx];
-}
-
-/*
- * For accounting under irq disable, no need for increment preempt count.
- */
-static inline void __mem_cgroup_stat_add_safe(struct mem_cgroup_stat_cpu *stat,
-		enum mem_cgroup_stat_index idx, int val)
-{
-	stat->count[idx] += val;
-}
-
-static s64 mem_cgroup_read_stat(struct mem_cgroup_stat *stat,
-		enum mem_cgroup_stat_index idx)
-{
-	int cpu;
-	s64 ret = 0;
-	for_each_possible_cpu(cpu)
-		ret += stat->cpustat[cpu].count[idx];
-	return ret;
-}
-
-static s64 mem_cgroup_local_usage(struct mem_cgroup_stat *stat)
-{
-	s64 ret;
-
-	ret = mem_cgroup_read_stat(stat, MEM_CGROUP_STAT_CACHE);
-	ret += mem_cgroup_read_stat(stat, MEM_CGROUP_STAT_RSS);
-	return ret;
-}
-
 /*
  * per-zone information in memory controller.
  */
@@ -270,9 +224,9 @@ struct mem_cgroup {
 	unsigned long 	move_charge_at_immigrate;
 
 	/*
-	 * statistics. This must be placed at the end of memcg.
+	 * percpu counter.
 	 */
-	struct mem_cgroup_stat stat;
+	struct mem_cgroup_stat_cpu *stat;
 };
 
 /* Stuffs for move charges at task migration. */
@@ -441,19 +395,14 @@ mem_cgroup_remove_exceeded(struct mem_cg
 static bool mem_cgroup_soft_limit_check(struct mem_cgroup *mem)
 {
 	bool ret = false;
-	int cpu;
 	s64 val;
-	struct mem_cgroup_stat_cpu *cpustat;
 
-	cpu = get_cpu();
-	cpustat = &mem->stat.cpustat[cpu];
-	val = __mem_cgroup_stat_read_local(cpustat, MEM_CGROUP_STAT_SOFTLIMIT);
+	val = this_cpu_read(mem->stat->count[MEM_CGROUP_STAT_SOFTLIMIT]);
 	if (unlikely(val < 0)) {
-		__mem_cgroup_stat_set_safe(cpustat, MEM_CGROUP_STAT_SOFTLIMIT,
+		this_cpu_write(mem->stat->count[MEM_CGROUP_STAT_SOFTLIMIT],
 				SOFTLIMIT_EVENTS_THRESH);
 		ret = true;
 	}
-	put_cpu();
 	return ret;
 }
 
@@ -549,17 +498,31 @@ mem_cgroup_largest_soft_limit_node(struc
 	return mz;
 }
 
+static s64 mem_cgroup_read_stat(struct mem_cgroup *mem,
+		enum mem_cgroup_stat_index idx)
+{
+	int cpu;
+	s64 val = 0;
+
+	for_each_possible_cpu(cpu)
+		val += per_cpu(mem->stat->count[idx], cpu);
+	return val;
+}
+
+static s64 mem_cgroup_local_usage(struct mem_cgroup *mem)
+{
+	s64 ret;
+
+	ret = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_RSS);
+	ret += mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_CACHE);
+	return ret;
+}
+
 static void mem_cgroup_swap_statistics(struct mem_cgroup *mem,
 					 bool charge)
 {
 	int val = (charge) ? 1 : -1;
-	struct mem_cgroup_stat *stat = &mem->stat;
-	struct mem_cgroup_stat_cpu *cpustat;
-	int cpu = get_cpu();
-
-	cpustat = &stat->cpustat[cpu];
-	__mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_SWAPOUT, val);
-	put_cpu();
+	this_cpu_add(mem->stat->count[MEM_CGROUP_STAT_SWAPOUT], val);
 }
 
 static void mem_cgroup_charge_statistics(struct mem_cgroup *mem,
@@ -567,26 +530,22 @@ static void mem_cgroup_charge_statistics
 					 bool charge)
 {
 	int val = (charge) ? 1 : -1;
-	struct mem_cgroup_stat *stat = &mem->stat;
-	struct mem_cgroup_stat_cpu *cpustat;
-	int cpu = get_cpu();
 
-	cpustat = &stat->cpustat[cpu];
+	preempt_disable();
+
 	if (PageCgroupCache(pc))
-		__mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_CACHE, val);
+		__this_cpu_add(mem->stat->count[MEM_CGROUP_STAT_CACHE], val);
 	else
-		__mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_RSS, val);
+		__this_cpu_add(mem->stat->count[MEM_CGROUP_STAT_RSS], val);
 
 	if (charge)
-		__mem_cgroup_stat_add_safe(cpustat,
-				MEM_CGROUP_STAT_PGPGIN_COUNT, 1);
+		__this_cpu_inc(mem->stat->count[MEM_CGROUP_STAT_PGPGIN_COUNT]);
 	else
-		__mem_cgroup_stat_add_safe(cpustat,
-				MEM_CGROUP_STAT_PGPGOUT_COUNT, 1);
-	__mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_SOFTLIMIT, -1);
-	__mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_THRESHOLDS, -1);
+		__this_cpu_inc(mem->stat->count[MEM_CGROUP_STAT_PGPGOUT_COUNT]);
+	__this_cpu_dec(mem->stat->count[MEM_CGROUP_STAT_SOFTLIMIT]);
+	__this_cpu_dec(mem->stat->count[MEM_CGROUP_STAT_THRESHOLDS]);
 
-	put_cpu();
+	preempt_enable();
 }
 
 static unsigned long mem_cgroup_get_local_zonestat(struct mem_cgroup *mem,
@@ -1244,7 +1203,7 @@ static int mem_cgroup_hierarchical_recla
 				}
 			}
 		}
-		if (!mem_cgroup_local_usage(&victim->stat)) {
+		if (!mem_cgroup_local_usage(victim)) {
 			/* this cgroup's local usage == 0 */
 			css_put(&victim->css);
 			continue;
@@ -1310,9 +1269,6 @@ static void record_last_oom(struct mem_c
 void mem_cgroup_update_file_mapped(struct page *page, int val)
 {
 	struct mem_cgroup *mem;
-	struct mem_cgroup_stat *stat;
-	struct mem_cgroup_stat_cpu *cpustat;
-	int cpu;
 	struct page_cgroup *pc;
 
 	pc = lookup_page_cgroup(page);
@@ -1328,13 +1284,10 @@ void mem_cgroup_update_file_mapped(struc
 		goto done;
 
 	/*
-	 * Preemption is already disabled, we don't need get_cpu()
+	 * Preemption is already disabled. We can use __this_cpu_xxx
 	 */
-	cpu = smp_processor_id();
-	stat = &mem->stat;
-	cpustat = &stat->cpustat[cpu];
+	__this_cpu_add(mem->stat->count[MEM_CGROUP_STAT_FILE_MAPPED], val);
 
-	__mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_FILE_MAPPED, val);
 done:
 	unlock_page_cgroup(pc);
 }
@@ -1761,9 +1714,6 @@ static void __mem_cgroup_move_account(st
 	struct mem_cgroup *from, struct mem_cgroup *to, bool uncharge)
 {
 	struct page *page;
-	int cpu;
-	struct mem_cgroup_stat *stat;
-	struct mem_cgroup_stat_cpu *cpustat;
 
 	VM_BUG_ON(from == to);
 	VM_BUG_ON(PageLRU(pc->page));
@@ -1773,18 +1723,11 @@ static void __mem_cgroup_move_account(st
 
 	page = pc->page;
 	if (page_mapped(page) && !PageAnon(page)) {
-		cpu = smp_processor_id();
-		/* Update mapped_file data for mem_cgroup "from" */
-		stat = &from->stat;
-		cpustat = &stat->cpustat[cpu];
-		__mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_FILE_MAPPED,
-						-1);
-
-		/* Update mapped_file data for mem_cgroup "to" */
-		stat = &to->stat;
-		cpustat = &stat->cpustat[cpu];
-		__mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_FILE_MAPPED,
-						1);
+		/* Update mapped_file data for mem_cgroup */
+		preempt_disable();
+		__this_cpu_dec(from->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]);
+		__this_cpu_inc(to->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]);
+		preempt_enable();
 	}
 	mem_cgroup_charge_statistics(from, pc, false);
 	if (uncharge)
@@ -2885,7 +2828,7 @@ static int
 mem_cgroup_get_idx_stat(struct mem_cgroup *mem, void *data)
 {
 	struct mem_cgroup_idx_data *d = data;
-	d->val += mem_cgroup_read_stat(&mem->stat, d->idx);
+	d->val += mem_cgroup_read_stat(mem, d->idx);
 	return 0;
 }
 
@@ -3126,18 +3069,18 @@ static int mem_cgroup_get_local_stat(str
 	s64 val;
 
 	/* per cpu stat */
-	val = mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_CACHE);
+	val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_CACHE);
 	s->stat[MCS_CACHE] += val * PAGE_SIZE;
-	val = mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_RSS);
+	val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_RSS);
 	s->stat[MCS_RSS] += val * PAGE_SIZE;
-	val = mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_FILE_MAPPED);
+	val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_FILE_MAPPED);
 	s->stat[MCS_FILE_MAPPED] += val * PAGE_SIZE;
-	val = mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_PGPGIN_COUNT);
+	val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_PGPGIN_COUNT);
 	s->stat[MCS_PGPGIN] += val;
-	val = mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_PGPGOUT_COUNT);
+	val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_PGPGOUT_COUNT);
 	s->stat[MCS_PGPGOUT] += val;
 	if (do_swap_account) {
-		val = mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_SWAPOUT);
+		val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_SWAPOUT);
 		s->stat[MCS_SWAP] += val * PAGE_SIZE;
 	}
 
@@ -3287,19 +3230,14 @@ static int mem_cgroup_swappiness_write(s
 static bool mem_cgroup_threshold_check(struct mem_cgroup *mem)
 {
 	bool ret = false;
-	int cpu;
 	s64 val;
-	struct mem_cgroup_stat_cpu *cpustat;
 
-	cpu = get_cpu();
-	cpustat = &mem->stat.cpustat[cpu];
-	val = __mem_cgroup_stat_read_local(cpustat, MEM_CGROUP_STAT_THRESHOLDS);
+	val = this_cpu_read(mem->stat->count[MEM_CGROUP_STAT_THRESHOLDS]);
 	if (unlikely(val < 0)) {
-		__mem_cgroup_stat_set_safe(cpustat, MEM_CGROUP_STAT_THRESHOLDS,
+		this_cpu_write(mem->stat->count[MEM_CGROUP_STAT_THRESHOLDS],
 				THRESHOLDS_EVENTS_THRESH);
 		ret = true;
 	}
-	put_cpu();
 	return ret;
 }
 
@@ -3687,17 +3625,12 @@ static void free_mem_cgroup_per_zone_inf
 	kfree(mem->info.nodeinfo[node]);
 }
 
-static int mem_cgroup_size(void)
-{
-	int cpustat_size = nr_cpu_ids * sizeof(struct mem_cgroup_stat_cpu);
-	return sizeof(struct mem_cgroup) + cpustat_size;
-}
-
 static struct mem_cgroup *mem_cgroup_alloc(void)
 {
 	struct mem_cgroup *mem;
-	int size = mem_cgroup_size();
+	int size = sizeof(struct mem_cgroup);
 
+	/* Can be very big if MAX_NUMNODES is very big */
 	if (size < PAGE_SIZE)
 		mem = kmalloc(size, GFP_KERNEL);
 	else
@@ -3705,6 +3638,14 @@ static struct mem_cgroup *mem_cgroup_all
 
 	if (mem)
 		memset(mem, 0, size);
+	mem->stat = alloc_percpu(struct mem_cgroup_stat_cpu);
+	if (!mem->stat) {
+		if (size < PAGE_SIZE)
+			kfree(mem);
+		else
+			vfree(mem);
+		mem = NULL;
+	}
 	return mem;
 }
 
@@ -3729,7 +3670,8 @@ static void __mem_cgroup_free(struct mem
 	for_each_node_state(node, N_POSSIBLE)
 		free_mem_cgroup_per_zone_info(mem, node);
 
-	if (mem_cgroup_size() < PAGE_SIZE)
+	free_percpu(mem->stat);
+	if (sizeof(struct mem_cgroup) < PAGE_SIZE)
 		kfree(mem);
 	else
 		vfree(mem);

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [PATCH mmotm] memcg use generic percpu allocator instead of private one
  2010-01-20  7:18 [PATCH mmotm] memcg use generic percpu allocator instead of private one KAMEZAWA Hiroyuki
@ 2010-01-20  9:37 ` Balbir Singh
  2010-01-20  9:47   ` KAMEZAWA Hiroyuki
  2010-01-21  2:07   ` KAMEZAWA Hiroyuki
  0 siblings, 2 replies; 6+ messages in thread
From: Balbir Singh @ 2010-01-20  9:37 UTC (permalink / raw)
  To: KAMEZAWA Hiroyuki
  Cc: linux-mm@kvack.org, linux-kernel@vger.kernel.org,
	nishimura@mxp.nes.nec.co.jp, kirill

On Wednesday 20 January 2010 12:48 PM, KAMEZAWA Hiroyuki wrote:
> This patch is onto mmotm Jan/15.
> =
> From: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
> 
> When per-cpu counter for memcg was implemneted, dynamic percpu allocator
> was not very good. But now, we have good one and useful macros.
> This patch replaces memcg's private percpu counter implementation with
> generic dynamic percpu allocator and macros.
> 
> The benefits are
> 	- We can remove private implementation.
> 	- The counters will be NUMA-aware. (Current one is not...)
> 	- This patch reduces sizeof(struct mem_cgroup). Then,
> 	  struct mem_cgroup may be fit in page size on small config.
> 
> By this, size of text is reduced.
>  [Before]
>  [kamezawa@bluextal mmotm-2.6.33-Jan15]$ size mm/memcontrol.o
>    text    data     bss     dec     hex filename
>   24373    2528    4132   31033    7939 mm/memcontrol.o
>  [After]
>  [kamezawa@bluextal mmotm-2.6.33-Jan15]$ size mm/memcontrol.o
>    text    data     bss     dec     hex filename
>   23913    2528    4132   30573    776d mm/memcontrol.o
> 
> This includes no functional changes.
> 
> Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>


Before review, could you please post parallel pagefault data on a large
system, since root now uses these per cpu counters and its overhead is
now dependent on these counters. Also the data read from root cgroup is
also dependent on these, could you make sure that is not broken.

Balbir

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [PATCH mmotm] memcg use generic percpu allocator instead of private one
  2010-01-20  9:37 ` Balbir Singh
@ 2010-01-20  9:47   ` KAMEZAWA Hiroyuki
  2010-01-20 12:46     ` Balbir Singh
  2010-01-21  2:07   ` KAMEZAWA Hiroyuki
  1 sibling, 1 reply; 6+ messages in thread
From: KAMEZAWA Hiroyuki @ 2010-01-20  9:47 UTC (permalink / raw)
  To: balbir
  Cc: linux-mm@kvack.org, linux-kernel@vger.kernel.org,
	nishimura@mxp.nes.nec.co.jp, kirill

On Wed, 20 Jan 2010 15:07:52 +0530
Balbir Singh <balbir@linux.vnet.ibm.com> wrote:

> On Wednesday 20 January 2010 12:48 PM, KAMEZAWA Hiroyuki wrote:
> > This patch is onto mmotm Jan/15.
> > =
> > From: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
> > 
> > When per-cpu counter for memcg was implemneted, dynamic percpu allocator
> > was not very good. But now, we have good one and useful macros.
> > This patch replaces memcg's private percpu counter implementation with
> > generic dynamic percpu allocator and macros.
> > 
> > The benefits are
> > 	- We can remove private implementation.
> > 	- The counters will be NUMA-aware. (Current one is not...)
> > 	- This patch reduces sizeof(struct mem_cgroup). Then,
> > 	  struct mem_cgroup may be fit in page size on small config.
> > 
> > By this, size of text is reduced.
> >  [Before]
> >  [kamezawa@bluextal mmotm-2.6.33-Jan15]$ size mm/memcontrol.o
> >    text    data     bss     dec     hex filename
> >   24373    2528    4132   31033    7939 mm/memcontrol.o
> >  [After]
> >  [kamezawa@bluextal mmotm-2.6.33-Jan15]$ size mm/memcontrol.o
> >    text    data     bss     dec     hex filename
> >   23913    2528    4132   30573    776d mm/memcontrol.o
> > 
> > This includes no functional changes.
> > 
> > Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
> 
> 
> Before review, could you please post parallel pagefault data on a large
> system, since root now uses these per cpu counters and its overhead is
> now dependent on these counters. Also the data read from root cgroup is
> also dependent on these, could you make sure that is not broken.
> 
There is no difference in the numbers before/after the patch in my quick SMP test.
But I don't have a NUMA machine. Could you test on NUMA?

I'll measure again tomorrow if I have machine time.

Thanks,
-Kame



--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [PATCH mmotm] memcg use generic percpu allocator instead of private one
  2010-01-20  9:47   ` KAMEZAWA Hiroyuki
@ 2010-01-20 12:46     ` Balbir Singh
  0 siblings, 0 replies; 6+ messages in thread
From: Balbir Singh @ 2010-01-20 12:46 UTC (permalink / raw)
  To: KAMEZAWA Hiroyuki
  Cc: linux-mm@kvack.org, linux-kernel@vger.kernel.org,
	nishimura@mxp.nes.nec.co.jp, kirill

On Wednesday 20 January 2010 03:17 PM, KAMEZAWA Hiroyuki wrote:
> On Wed, 20 Jan 2010 15:07:52 +0530
> Balbir Singh <balbir@linux.vnet.ibm.com> wrote:
> 
>> On Wednesday 20 January 2010 12:48 PM, KAMEZAWA Hiroyuki wrote:
>>> This patch is onto mmotm Jan/15.
>>> =
>>> From: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
>>>
>>> When per-cpu counter for memcg was implemneted, dynamic percpu allocator
>>> was not very good. But now, we have good one and useful macros.
>>> This patch replaces memcg's private percpu counter implementation with
>>> generic dynamic percpu allocator and macros.
>>>
>>> The benefits are
>>> 	- We can remove private implementation.
>>> 	- The counters will be NUMA-aware. (Current one is not...)
>>> 	- This patch reduces sizeof(struct mem_cgroup). Then,
>>> 	  struct mem_cgroup may be fit in page size on small config.
>>>
>>> By this, size of text is reduced.
>>>  [Before]
>>>  [kamezawa@bluextal mmotm-2.6.33-Jan15]$ size mm/memcontrol.o
>>>    text    data     bss     dec     hex filename
>>>   24373    2528    4132   31033    7939 mm/memcontrol.o
>>>  [After]
>>>  [kamezawa@bluextal mmotm-2.6.33-Jan15]$ size mm/memcontrol.o
>>>    text    data     bss     dec     hex filename
>>>   23913    2528    4132   30573    776d mm/memcontrol.o
>>>
>>> This includes no functional changes.
>>>
>>> Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
>>
>>
>> Before review, could you please post parallel pagefault data on a large
>> system, since root now uses these per cpu counters and its overhead is
>> now dependent on these counters. Also the data read from root cgroup is
>> also dependent on these, could you make sure that is not broken.
>>
> No number difference before/after patch on my SMP quick test.
> But I don't have NUMA. Could you test on NUMA ?
> 
> I'll measure again tomorrow if I have machine time.

I'll do the same as well if possible.

Balbir

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [PATCH mmotm] memcg use generic percpu allocator instead of private one
  2010-01-20  9:37 ` Balbir Singh
  2010-01-20  9:47   ` KAMEZAWA Hiroyuki
@ 2010-01-21  2:07   ` KAMEZAWA Hiroyuki
  2010-01-21 15:30     ` Balbir Singh
  1 sibling, 1 reply; 6+ messages in thread
From: KAMEZAWA Hiroyuki @ 2010-01-21  2:07 UTC (permalink / raw)
  To: balbir
  Cc: linux-mm@kvack.org, linux-kernel@vger.kernel.org,
	nishimura@mxp.nes.nec.co.jp, kirill

[-- Attachment #1: Type: text/plain, Size: 1592 bytes --]

On Wed, 20 Jan 2010 15:07:52 +0530
Balbir Singh <balbir@linux.vnet.ibm.com> wrote:
 
> > This includes no functional changes.
> > 
> > Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
> 
> 
> Before review, could you please post parallel pagefault data on a large
> system, since root now uses these per cpu counters and its overhead is
> now dependent on these counters. Also the data read from root cgroup is
> also dependent on these, could you make sure that is not broken.
> 
Hmm, I rewrote the test program to avoid mmap_sem contention. This version uses
fork() instead of pthread_create() and measures parallel-process page fault speed.

[Before patch]
[root@bluextal memory]# /root/bin/perf stat -e page-faults,cache-misses --repeat 5 ./multi-fault-fork 8

 Performance counter stats for './multi-fault-fork 8' (5 runs):

       45256919  page-faults                ( +-   0.851% )
      602230144  cache-misses               ( +-   0.187% )

   61.020533723  seconds time elapsed   ( +-   0.002% 

[After patch]
[root@bluextal memory]# /root/bin/perf stat -e page-faults,cache-misses --repeat 5 ./multi-fault-fork 8

 Performance counter stats for './multi-fault-fork 8' (5 runs):

       46007166  page-faults                ( +-   0.339% )
      599553505  cache-misses               ( +-   0.298% )

   61.020937843  seconds time elapsed   ( +-   0.004% )

Slightly improved? But this test program exhibits rather extreme behavior, and
I don't think you would see a difference in real-world applications.
So I guess this is within the error range for well-known (not small) benchmarks.

Thanks,
-Kame

[-- Attachment #2: multi-fault-fork.c --]
[-- Type: text/x-csrc, Size: 1455 bytes --]

/*
 * multi-fault-fork.c :: causes 60secs of parallel page faults in multiple
 * processes (fork() is used instead of threads).
 * % gcc -O2 -o multi-fault-fork multi-fault-fork.c
 * % multi-fault-fork # of cpus.
 */

#define _GNU_SOURCE
#include <errno.h>
#include <fcntl.h>
#include <limits.h>
#include <sched.h>
#include <signal.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <sys/types.h>
#include <unistd.h>

/*
 * For avoiding contention in page table lock, FAULT area is
 * sparse. If FAULT_LENGTH is too large for your cpus, decrease it.
 *
 * FAULT_LENGTH is the size in bytes of the anonymous mapping that each
 * worker() creates and then touches repeatedly.
 * PAGE_SIZE is the stride of worker()'s touch loop: one byte is written
 * every PAGE_SIZE bytes across the mapping.
 * NOTE(review): 4096 is hard-coded; presumably it is meant to match the
 * page size of the machine under test -- confirm before running elsewhere.
 */
#define FAULT_LENGTH	(2 * 1024 * 1024)
#define PAGE_SIZE	4096

/*
 * Signal handler that deliberately does nothing.
 * @sig: number of the delivered signal (ignored).
 *
 * main() installs it for SIGALRM, so delivery of that signal runs this
 * empty handler instead of the default action.
 */
void alarm_handler(int sig)
{
	(void)sig;	/* the signal number is not needed */
}

/*
 * worker - pin the calling process to one CPU and touch memory forever.
 * @cpu:  index of the CPU the process asks to be bound to.
 * @ppid: pid passed in by the caller; not used by this function.
 *
 * Maps FAULT_LENGTH bytes of private anonymous memory, then blocks in
 * pause() until a signal handler has run.  After that it loops without
 * end: one byte is written every PAGE_SIZE bytes across the mapping and
 * the whole range is then handed to madvise(MADV_DONTNEED), so that the
 * next pass has to fault the pages in again.
 *
 * The loop has no exit condition, so the function does not return in
 * practice; the process is expected to be killed from outside.  The
 * process exits with status 1 if mmap() or madvise() fails.
 */
void *worker(int cpu, int ppid)
{
	char *start, *end, *c;
	cpu_set_t set;

	(void)ppid;

	CPU_ZERO(&set);
	CPU_SET(cpu, &set);
	/* Not fatal: the run still works unpinned, but the user should know. */
	if (sched_setaffinity(0, sizeof(set), &set) < 0)
		perror("sched_setaffinity");

	/* fd is -1: some implementations require that for MAP_ANONYMOUS. */
	start = mmap(NULL, FAULT_LENGTH, PROT_READ|PROT_WRITE,
			MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	if (start == MAP_FAILED) {
		perror("mmap");
		exit(1);
	}
	/* char pointers: arithmetic on void * is a GNU extension. */
	end = start + FAULT_LENGTH;

	/* Wait here until a signal handler has run. */
	pause();
	//fprintf(stderr, "run%d", cpu);
	while (1) {
		for (c = start; c < end; c += PAGE_SIZE)
			*c = 0;
		/* Without a successful madvise the loop stops faulting. */
		if (madvise(start, FAULT_LENGTH, MADV_DONTNEED) < 0) {
			perror("madvise");
			exit(1);
		}
	}
	return NULL;
}

/*
 * main - start argv[1] worker processes and stop them after 60 seconds.
 *
 * The process makes itself a process-group leader and installs
 * alarm_handler() for SIGALRM.  It then forks in a chain: after each
 * fork() the parent side calls worker(), which does not return in
 * practice, and the child side goes on to the next iteration.  The process
 * that leaves the loop sleeps one second, sends SIGALRM to the whole group
 * to release the workers from pause(), sleeps 60 seconds and finally sends
 * SIGKILL to the whole group (itself included).
 *
 * Returns 0 when no argument is given, 1 when the argument is not a
 * positive decimal integer or when fork() fails.
 */
int main(int argc, char *argv[])
{
	char *endp;
	long val;
	int num, i;
	pid_t pid, child;

	if (argc < 2) {
		fprintf(stderr, "usage: multi-fault-fork <# of cpus>\n");
		return 0;
	}

	/* strtol instead of atoi so that bad input is detected. */
	errno = 0;
	val = strtol(argv[1], &endp, 10);
	if (endp == argv[1] || *endp != '\0' || errno == ERANGE ||
	    val < 1 || val > INT_MAX) {
		fprintf(stderr, "invalid number of cpus: %s\n", argv[1]);
		return 1;
	}
	num = (int)val;

	setpgid(0, 0);
	signal(SIGALRM, alarm_handler);
	pid = getpid();

	for (i = 0; i < num; ++i) {
		child = fork();
		if (child < 0) {
			/*
			 * Treating -1 as "parent" would park this process in
			 * worker() with nobody left to wake or kill the group.
			 * Take down the workers started so far instead.
			 */
			perror("fork");
			kill(-pid, SIGKILL);
			return 1;
		}
		if (child > 0) {
			/* Parent side becomes worker i; the child continues. */
			worker(i, pid);
		}
	}
	sleep(1);
	kill(-pid, SIGALRM);
	sleep(60);
	kill(-pid, SIGKILL);
	return 0;
}

^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [PATCH mmotm] memcg use generic percpu allocator instead of private one
  2010-01-21  2:07   ` KAMEZAWA Hiroyuki
@ 2010-01-21 15:30     ` Balbir Singh
  0 siblings, 0 replies; 6+ messages in thread
From: Balbir Singh @ 2010-01-21 15:30 UTC (permalink / raw)
  To: KAMEZAWA Hiroyuki
  Cc: linux-mm@kvack.org, linux-kernel@vger.kernel.org,
	nishimura@mxp.nes.nec.co.jp, kirill

On Thursday 21 January 2010 07:37 AM, KAMEZAWA Hiroyuki wrote:
> On Wed, 20 Jan 2010 15:07:52 +0530
> Balbir Singh <balbir@linux.vnet.ibm.com> wrote:
> 
>>> This includes no functional changes.
>>>
>>> Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
>>
>>
>> Before review, could you please post parallel pagefault data on a large
>> system, since root now uses these per cpu counters and its overhead is
>> now dependent on these counters. Also the data read from root cgroup is
>> also dependent on these, could you make sure that is not broken.
>>
> Hmm, I rewrote test program for avoidng mmap_sem. This version does fork()
> instead of pthread_create() and meausre parallel-process page fault speed.
> 
> [Before patch]
> [root@bluextal memory]# /root/bin/perf stat -e page-faults,cache-misses --repeat 5 ./multi-fault-fork 8
> 
>  Performance counter stats for './multi-fault-fork 8' (5 runs):
> 
>        45256919  page-faults                ( +-   0.851% )
>       602230144  cache-misses               ( +-   0.187% )
> 
>    61.020533723  seconds time elapsed   ( +-   0.002% 
> 
> [After patch]
> [root@bluextal memory]# /root/bin/perf stat -e page-faults,cache-misses --repeat 5 ./multi-fault-fork 8
> 
>  Performance counter stats for './multi-fault-fork 8' (5 runs):
> 
>        46007166  page-faults                ( +-   0.339% )
>       599553505  cache-misses               ( +-   0.298% )
> 
>    61.020937843  seconds time elapsed   ( +-   0.004% )
> 
> slightly improved ? But this test program does some extreme behavior and
> you can't see difference in real-world applications, I think.
> So, I guess this is in error-range in famous (not small) benchmarks.

Looks good. Please give me a couple of days to test; I'll get back to you
with numbers and a review.

-- 
Three Cheers,
Balbir Singh

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 6+ messages in thread

end of thread, other threads:[~2010-01-21 15:30 UTC | newest]

Thread overview: 6+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2010-01-20  7:18 [PATCH mmotm] memcg use generic percpu allocator instead of private one KAMEZAWA Hiroyuki
2010-01-20  9:37 ` Balbir Singh
2010-01-20  9:47   ` KAMEZAWA Hiroyuki
2010-01-20 12:46     ` Balbir Singh
2010-01-21  2:07   ` KAMEZAWA Hiroyuki
2010-01-21 15:30     ` Balbir Singh

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).