All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH 0/6] xen,xend,tools: NUMA support for Xen
@ 2006-07-11 15:35 Ryan Harper
  2006-07-11 15:57 ` Keir Fraser
  0 siblings, 1 reply; 7+ messages in thread
From: Ryan Harper @ 2006-07-11 15:35 UTC (permalink / raw)
  To: xen-devel

Reposting the latest patches, no significant changes since May when I
last received feedback.  I've done some simple overhead and performance
numbers for these patches.

Measuring NUMA allocator patch overhead via balloon driver and DOM0
involved starting DOM0 with all memory (4G) and ballooning down (256M)
as a starting point, [1]tracking the time it takes to balloon back up
to full memory.  The test was done with and without NUMA patches.

With NUMA:

Try1: 911ms
Try2: 907ms
Try3: 910ms

Without NUMA:

Try1: 606ms
Try2: 604ms
Try3: 608ms


Measuring NUMA allocator patch overhead via increase_reservation memory op
during domain creation we [2]modified the increase_reservation op to
track the time using get_s_time() from start to finish and sampled the
times for various memory sizes.

With NUMA:

MemSize  128M 512M 1G   2G    3G    
-------------------------------------
Try1:    6ms  26ms 53ms 221ms 390ms
Try2:    6ms  26ms 48ms 212ms 390ms
Try3:    6ms  26ms 48ms 212ms 390ms

Without NUMA:
MemSize  128M 512M 1G   2G    3G    
-------------------------------------
Try1:    4ms  16ms 25ms 70ms  100ms
Try2:    3ms  14ms 28ms 56ms  109ms
Try3:    3ms  14ms 23ms 56ms   95ms


Using a microbenchmark which mallocs memory and touches each byte, we
can observe the effects of local memory versus remote.  The domain is
created with 1GB memory, and 1 VCPU coming from the same node.  The
microbenchmark forks off one child per-VCPU and malloc/memsets 512M
buffer each.  We then compare worst-case (All non-local memory) and
best-case (all-local).

Machine Topology:
node_to_cpu            : node0:0
                         node1:1

Domain's memory placement
(bebop) tmp # xen_numastat -d 8
DOM8: NODE0: PAGES: 5
DOM8: NODE1: PAGES: 262144

Domain's vcpu placement
(bebop) tmp # xm vcpu-list 8
Name                              ID  VCPU  CPU  State  Time(s)  CPU
Affinity
hungerforce                        8     0    0   -b-      12.0  0

All-remote memory:
root@amd64-domU:/usr/src # while true; do ./memwrite -j1 -m512M; sleep 1; done
Time to write '0' to 512.000 MiB 1168711 usecs.  Throughput: 438.090 MiB/sec
Time to write '0' to 512.000 MiB 1175179 usecs.  Throughput: 435.678 MiB/sec
Time to write '0' to 512.000 MiB 1172454 usecs.  Throughput: 436.691 MiB/sec
Time to write '0' to 512.000 MiB 1170378 usecs.  Throughput: 437.466 MiB/sec

Domain's vcpu placement
(bebop) tmp # xm vcpu-list 8
Name                              ID  VCPU  CPU  State  Time(s)  CPU
Affinity
hungerforce                        8     0    0   -b-      15.9  1

All-local memory:
root@amd64-domU:/usr/src # while true; do ./memwrite -j1 -m512M; sleep 1; done
Time to write '0' to 512.000 MiB 759186 usecs.  Throughput: 674.406 MiB/sec
Time to write '0' to 512.000 MiB 765143 usecs.  Throughput: 669.156 MiB/sec
Time to write '0' to 512.000 MiB 768462 usecs.  Throughput: 666.266 MiB/sec
Time to write '0' to 512.000 MiB 763406 usecs.  Throughput: 670.679 MiB/sec


1.  diff -r ae245d35457b linux-2.6-xen-sparse/drivers/xen/balloon/balloon.c
--- a/linux-2.6-xen-sparse/drivers/xen/balloon/balloon.c	Wed Jun 28 12:59:29 2006
+++ b/linux-2.6-xen-sparse/drivers/xen/balloon/balloon.c	Mon Jun 26 14:54:10 2006
@@ -44,6 +44,7 @@
 #include <linux/bootmem.h>
 #include <linux/highmem.h>
 #include <linux/vmalloc.h>
+#include <linux/time.h>
 #include <xen/xen_proc.h>
 #include <asm/hypervisor.h>
 #include <xen/balloon.h>
@@ -63,6 +64,9 @@
 #endif
 
 static DECLARE_MUTEX(balloon_mutex);
+
+static struct timeval start, stop, delay;
+static int calc_delay = 0;
 
 /*
  * Protects atomic reservation decrease/increase against concurrent increases.
@@ -337,6 +341,14 @@
 	return need_sleep;
 }
 
+/** Convert to milliseconds */
+static inline __u64 tv_to_ms(const struct timeval* tv)
+{
+        __u64 ms = tv->tv_usec / 1000;
+        ms += (__u64)tv->tv_sec * (__u64)1000;
+        return ms;
+}
+
 /*
  * We avoid multiple worker processes conflicting via the balloon mutex.
  * We may of course race updates of the target counts (which are protected
@@ -350,6 +362,11 @@
 
 	down(&balloon_mutex);
 
+	if (calc_delay) {
+		do_gettimeofday(&delay);
+		calc_delay = 0;
+	}
+ 
 	do {
 		credit = current_target() - current_pages;
 		if (credit > 0)
@@ -366,6 +383,13 @@
 	/* Schedule more work if there is some still to be done. */
 	if (current_target() != current_pages)
 		mod_timer(&balloon_timer, jiffies + HZ);
+	else {
+		/* We've hit target, notify completion */
+		do_gettimeofday(&stop);
+		printk(KERN_WARNING "Ballooning complete.  startup delay: %lums", 
+					tv_to_ms(&delay)-tv_to_ms(&start));
+		printk(" total time: %lums\n", tv_to_ms(&stop)-tv_to_ms(&start));
+	}
 
 	up(&balloon_mutex);
 }
@@ -376,6 +400,11 @@
 	/* No need for lock. Not read-modify-write updates. */
 	hard_limit   = ~0UL;
 	target_pages = target;
+
+	/* note start time of balloon process */
+	do_gettimeofday(&start);
+	calc_delay = 1;
+
 	schedule_work(&balloon_worker);
 }

2.  diff -r c257ac74b5c7 xen/common/memory.c
--- a/xen/common/memory.c	Tue Jul  4 16:31:13 2006
+++ b/xen/common/memory.c	Wed Jul  5 12:12:43 2006
@@ -27,6 +27,10 @@
  * high-order bits of the @cmd parameter, which are otherwise unused and zero.
  */
 #define START_EXTENT_SHIFT 4 /* cmd[:4] == start_extent */
+
+static int calc_start = 1;
+static unsigned long initial_extent;
+static s_time_t start, stop;
 
 static long
 increase_reservation(
@@ -574,6 +578,13 @@
         switch ( op )
         {
         case XENMEM_increase_reservation:
+            if ( calc_start ) {
+                 printk("WARK: calcing start time on IR of %lu pages\n",
+                         reservation.nr_extents);
+                 initial_extent = reservation.nr_extents;
+                 start = get_s_time();
+                 calc_start = 0;
+            }
             rc = increase_reservation(
                 d,
                 reservation.extent_start,
@@ -612,6 +623,12 @@
                 __HYPERVISOR_memory_op, "lh",
                 op | (rc << START_EXTENT_SHIFT), arg);
 
+        if ( op == XENMEM_increase_reservation ) {
+            stop = get_s_time();
+            printk("WARK: increase_reservation of %lu pages took->%lums\n", 
+                   initial_extent, ((u64)stop-(u64)start)/1000000 );
+            calc_start = 1;
+        }
         break;
 
     case XENMEM_exchange:
 
-- 
Ryan Harper
Software Engineer; Linux Technology Center
IBM Corp., Austin, Tx
(512) 838-9253   T/L: 678-9253
ryanh@us.ibm.com

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [PATCH 0/6] xen,xend,tools: NUMA support for Xen
  2006-07-11 15:35 [PATCH 0/6] xen,xend,tools: NUMA support for Xen Ryan Harper
@ 2006-07-11 15:57 ` Keir Fraser
  2006-07-11 17:47   ` Ryan Harper
  0 siblings, 1 reply; 7+ messages in thread
From: Keir Fraser @ 2006-07-11 15:57 UTC (permalink / raw)
  To: Ryan Harper; +Cc: xen-devel


On 11 Jul 2006, at 16:35, Ryan Harper wrote:

> Reposting the latest patches, no significant changes since May when I
> last received feedback.  I've done some simple overhead and performance
> numbers for these patches.
>
> Measuring NUMA allocator patch overhead via balloon driver and DOM0
> involved starting DOM0 with all memory (4G) and ballooning down (256M)
> as a starting point, [1]tracking the time it takes to balloon back up
> to full memory.  The test was done with and without NUMA patches.

What sort of box are these numbers taken from? If it's not a NUMA 
system then the slowdowns are rather poor. We're particularly 
interested in not slowing down non-NUMA and small-NUMA (e.g., AMD K8) 
x86 systems. They are what we really want to see measurements from.

  -- Keir

^ permalink raw reply	[flat|nested] 7+ messages in thread

* RE: [PATCH 0/6] xen,xend,tools: NUMA support for Xen
@ 2006-07-11 16:40 Lu, Yinghai
  0 siblings, 0 replies; 7+ messages in thread
From: Lu, Yinghai @ 2006-07-11 16:40 UTC (permalink / raw)
  To: Keir Fraser, Ryan Harper; +Cc: xen-devel

So the virtual firmware needs to have an SRAT table in the ACPI tables, or
ioemu needs to have an NB device, so the guest OS can get the NUMA info.

Also windows smp guest will need info to understand core to socket
mapping so it could enable two sockets ( 4 cores ).
So Xen needs to pass enough info about the nodes-cores mapping to the virtual
firmware and ioemu, so that ioemu can put the info in the NB conf and the
virtual firmware can put it in the ACPI tables. (The SVM MSR intercept related
to the cores setting needs to be changed too.)

YH


-----Original Message-----
From: xen-devel-bounces@lists.xensource.com
[mailto:xen-devel-bounces@lists.xensource.com] On Behalf Of Keir Fraser
Sent: Tuesday, July 11, 2006 8:58 AM
To: Ryan Harper
Cc: xen-devel@lists.xensource.com
Subject: Re: [Xen-devel] [PATCH 0/6] xen,xend,tools: NUMA support for
Xen


On 11 Jul 2006, at 16:35, Ryan Harper wrote:

> Reposting the latest patches, no significant changes since May when I
> last received feedback.  I've done some simple overhead and
performance
> numbers for these patches.
>
> Measuring NUMA allocator patch overhead via balloon driver and DOM0
> involved starting DOM0 with all memory (4G) and ballooning down (256M)
> as a starting point, [1]tracking the time it takes to balloon back up
> to full memory.  The test was done with and without NUMA patches.

What sort of box are these numbers taken from? If it's not a NUMA 
system then the slowdowns are rather poor. We're particularly 
interested in not slowing down non-NUMA and small-NUMA (e.g., AMD K8) 
x86 systems. They are what we really want to see measurements from.

  -- Keir


_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xensource.com
http://lists.xensource.com/xen-devel

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [PATCH 0/6] xen,xend,tools: NUMA support for Xen
  2006-07-11 15:57 ` Keir Fraser
@ 2006-07-11 17:47   ` Ryan Harper
  0 siblings, 0 replies; 7+ messages in thread
From: Ryan Harper @ 2006-07-11 17:47 UTC (permalink / raw)
  To: Keir Fraser; +Cc: xen-devel

* Keir Fraser <Keir.Fraser@cl.cam.ac.uk> [2006-07-11 10:59]:
> 
> On 11 Jul 2006, at 16:35, Ryan Harper wrote:
> 
> >Reposting the latest patches, no significant changes since May when I
> >last received feedback.  I've done some simple overhead and performance
> >numbers for these patches.
> >
> >Measuring NUMA allocator patch overhead via balloon driver and DOM0
> >involved starting DOM0 with all memory (4G) and ballooning down (256M)
> >as a starting point, [1]tracking the time it takes to balloon back up
> >to full memory.  The test was done with and without NUMA patches.
> 
> What sort of box are these numbers taken from? If it's not a NUMA 
> system then the slowdowns are rather poor. We're particularly 
> interested in not slowing down non-NUMA and small-NUMA (e.g., AMD K8) 
> x86 systems. They are what we really want to see measurements from.

The measurements are taken from a two-way Opteron 248 (2.1GHz),
small-NUMA.  I agree that there is significant overhead, however, we
aren't talking about fast path here; correct me if I'm wrong.   We
are only adding overhead during domain startup.  The end result
being we pay for local memory allocation at creation time while
benefiting from local memory access for the lifetime of the domain.

I'm going to gather some oprofile data to see if I missed something
obvious, but in general I think that having local memory is of greater
benefit for the lifetime of a domain than the cost we incur during its
creation.

-- 
Ryan Harper
Software Engineer; Linux Technology Center
IBM Corp., Austin, Tx
(512) 838-9253   T/L: 678-9253
ryanh@us.ibm.com

^ permalink raw reply	[flat|nested] 7+ messages in thread

* RE: [PATCH 0/6] xen,xend,tools: NUMA support for Xen
@ 2006-07-11 21:28 Ian Pratt
  2006-07-12  1:23 ` Ryan Harper
  0 siblings, 1 reply; 7+ messages in thread
From: Ian Pratt @ 2006-07-11 21:28 UTC (permalink / raw)
  To: Ryan Harper, Keir Fraser; +Cc: xen-devel

> > What sort of box are these numbers taken from? If it's not a NUMA
> > system then the slowdowns are rather poor. We're particularly
> > interested in not slowing down non-NUMA and small-NUMA (e.g., AMD
K8)
> > x86 systems. They are what we really want to see measurements from.
> 
> The measurements are taken from a two-way Operton 248 (2.1Ghz) ,
> small-NUMA.  I agree that there is significant overhead, however, we
> aren't talking about fast path here; correct me if I'm wrong.   We
> are only adding overhead to during domain startup.  The end result
> being we pay for local memory allocation at creation time while
> benefiting from local memory access for the lifetime of the domain.
> 
> I'm going to gather some oprofile data to see if I missed something
> obvious, but in general I think that having local memory is of greater
> benefit for the lifetime of a domain than the cost we incur during its
> creation.

What do the numbers look like on a 1 node system?

The shadow mode code potentially churns the page allocator a fair bit.
It'll be disappointing if we have to add complexity of quicklists etc. 

It does kind of surprise me that the overhead is as high as you've
measured. In the case where there's memory available in the favoured
node I'd expect allocation performance to be very similar. 4 times
slower and worsening for large allocations seems odd -- 0.3 microseconds
a page is a bit more than I'd expect during back-to-back allocations.
It's certainly worth trying to understand the overhead a bit more.

Ian

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [PATCH 0/6] xen,xend,tools: NUMA support for Xen
  2006-07-11 21:28 Ian Pratt
@ 2006-07-12  1:23 ` Ryan Harper
  2006-07-12 20:30   ` Ryan Harper
  0 siblings, 1 reply; 7+ messages in thread
From: Ryan Harper @ 2006-07-12  1:23 UTC (permalink / raw)
  To: Ian Pratt; +Cc: xen-devel

* Ian Pratt <m+Ian.Pratt@cl.cam.ac.uk> [2006-07-11 16:28]:
> > > What sort of box are these numbers taken from? If it's not a NUMA
> > > system then the slowdowns are rather poor. We're particularly
> > > interested in not slowing down non-NUMA and small-NUMA (e.g., AMD
> K8)
> > > x86 systems. They are what we really want to see measurements from.
> > 
> > The measurements are taken from a two-way Operton 248 (2.1Ghz) ,
> > small-NUMA.  I agree that there is significant overhead, however, we
> > aren't talking about fast path here; correct me if I'm wrong.   We
> > are only adding overhead to during domain startup.  The end result
> > being we pay for local memory allocation at creation time while
> > benefiting from local memory access for the lifetime of the domain.
> > 
> > I'm going to gather some oprofile data to see if I missed something
> > obvious, but in general I think that having local memory is of greater
> > benefit for the lifetime of a domain than the cost we incur during its
> > creation.
> 
> What do the numbers look like on a 1 node system?

For K8 small numa, I don't have a 1 node system available.

> 
> The shadow mode code potentially churns the page allocator a fair bit.
> It'll be disappointing if we have to add complexity of quicklists etc. 

Yeah, I forgot about shadow mode; good point.

> 
> It does kind of surprise me that the overhead is as high as you've
> measured. In the case where there's memory available in the favoured
> node I'd expect allocation performance to be very similar. 4 times
> slower and worsening for large allocations seems odd -- 0.3 microseconds
> a page is a bit more than I'd expect during back-to-back allocations.
> It's certainly worth trying to understand the overhead a bit more.

I agree.  I'm a little mystified by the overhead as well.  On the larger
system, ballooning up to 23G had something like 11% overhead, which was
more reasonable, though the domain creation tests showed more than 11%
on that system as well.  I'll get the oprofile data and take a look.

-- 
Ryan Harper
Software Engineer; Linux Technology Center
IBM Corp., Austin, Tx
(512) 838-9253   T/L: 678-9253
ryanh@us.ibm.com

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [PATCH 0/6] xen,xend,tools: NUMA support for Xen
  2006-07-12  1:23 ` Ryan Harper
@ 2006-07-12 20:30   ` Ryan Harper
  0 siblings, 0 replies; 7+ messages in thread
From: Ryan Harper @ 2006-07-12 20:30 UTC (permalink / raw)
  To: Ian Pratt, Keir Fraser; +Cc: xen-devel

* Ryan Harper <ryanh@us.ibm.com> [2006-07-11 20:24]:
> * Ian Pratt <m+Ian.Pratt@cl.cam.ac.uk> [2006-07-11 16:28]:
> > It does kind of surprise me that the overhead is as high as you've
> > measured. In the case where there's memory available in the favoured
> > node I'd expect allocation performance to be very similar. 4 times
> > slower and worsening for large allocations seems odd -- 0.3 microseconds
> > a page is a bit more than I'd expect during back-to-back allocations.
> > It's certainly worth trying to understand the overhead a bit more.
> 
> I agree.  I'm a little mystified by the overhead as well.  On the larger
> system, ballooning up to 23G had something like 11% overhead, which was
> more reasonable, though the domain creation tests showed more than 11%
> on that system as well.  I'll get the oprofile data and take a look.

Using oprofile I have some optimizations as well as some further
understanding of the behavior.  I've removed as much logic from the fast
path as possible:

1. Dropped some superfluous calls to num_online_nodes(); that doesn't
   change.
2. Only calculate the next node when the current node's memory has been
   exhausted.
3. Don't bother distributing pages evenly across vcpus in all cases.

I've a couple thoughts here: 

1) only distribute if the domains' processors are spanning more than one
node. This requires additional code to track on which nodes the domain
is running.  Fairly trivial for sedf to update the domain nodemask
during vcpu affinity ops, a bit more hairy for credit scheduler. 

2) We can take the easy path and just use vcpu0's processor as a chooser
of which node to pull memory from.  We currently tune our domU configs
based on topology info exported anyhow to keep the domU within the node.

4. Don't bother looking for pages in an empty node. ie, check if target
zone/node can support the request.

These changes brought all the allocations down; matching without-NUMA up
to 512M allocations, though there is still overhead.  Looking at the
oprofile data I collected over several runs, I'm not seeing anything
else sticking out.  Also we were seeing worse times for larger
allocations because we were exhausting memory from one node and
pulling from a second which resulted in a large number of lookups in the
empty node.  Optimization (4) addresses that issue.

Here is the new data.  I can attach some of the oprofile data if anyone
is interested.  I'll update patches 2 and 3 if the current overhead
is acceptable.

Balloon up:
With NUMA
Try1: 911ms
Try2: 907ms
Try3: 910ms

With NUMA+optimizations
Try1: 709ms
Try2: 701ms
Try3: 703ms

Without NUMA:
Try1: 606ms
Try2: 604ms
Try3: 608ms


Increase reservation
With NUMA:
MemSize  128M 512M 1G   2G    3G
-------------------------------------
Try1:    6ms  26ms 53ms 221ms 390ms
Try2:    6ms  26ms 48ms 212ms 390ms
Try3:    6ms  26ms 48ms 212ms 390ms

With NUMA + optimizations
MemSize  128M 512M 1G   2G   3G
-------------------------------------
Try1:    4ms  15ms 30ms 80ms 150ms
Try2:    3ms  14ms 30ms 80ms 159ms
Try3:    4ms  15ms 33ms 80ms 159ms

Without NUMA:
MemSize  128M 512M 1G   2G   3G
-------------------------------------
Try1:    4ms  16ms 25ms 70ms 100ms
Try2:    3ms  14ms 28ms 56ms 109ms
Try3:    3ms  14ms 23ms 56ms  95ms


-- 
Ryan Harper
Software Engineer; Linux Technology Center
IBM Corp., Austin, Tx
(512) 838-9253   T/L: 678-9253
ryanh@us.ibm.com

^ permalink raw reply	[flat|nested] 7+ messages in thread

end of thread, other threads:[~2006-07-12 20:30 UTC | newest]

Thread overview: 7+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2006-07-11 15:35 [PATCH 0/6] xen,xend,tools: NUMA support for Xen Ryan Harper
2006-07-11 15:57 ` Keir Fraser
2006-07-11 17:47   ` Ryan Harper
  -- strict thread matches above, loose matches on Subject: below --
2006-07-11 16:40 Lu, Yinghai
2006-07-11 21:28 Ian Pratt
2006-07-12  1:23 ` Ryan Harper
2006-07-12 20:30   ` Ryan Harper

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.