[Patch 2 of 2]: PV-domain SMP performance Linux-part

All of lore.kernel.org
 help / color / mirror / Atom feed

* [Patch 2 of 2]: PV-domain SMP performance Linux-part
@ 2008-12-17 12:22 Juergen Gross
  2008-12-17 15:06 ` Jan Beulich
  0 siblings, 1 reply; 37+ messages in thread
From: Juergen Gross @ 2008-12-17 12:22 UTC (permalink / raw)
  To: xen-devel@lists.xensource.com

[-- Attachment #1: Type: text/plain, Size: 351 bytes --]


-- 
Juergen Gross                             Principal Developer
IP SW OS6                      Telephone: +49 (0) 89 636 47950
Fujitsu Siemens Computers         e-mail: juergen.gross@fujitsu-siemens.com
Otto-Hahn-Ring 6                Internet: www.fujitsu-siemens.com
D-81739 Muenchen         Company details: www.fujitsu-siemens.com/imprint.html

[-- Attachment #2: vcpu_desched-linux.patch --]
[-- Type: text/x-patch, Size: 3944 bytes --]

Mark vcpu as no_desched when interrupts are disabled.

Signed-off-by: juergen.gross@fujitsu-siemens.com

# HG changeset patch
# User juergen.gross@fujitsu-siemens.com
# Date 1229515973 -3600
# Node ID b5bbaec70b34d990e64e730caf712bb236490f95
# Parent  ff9683032b76f533509191bb9532df10cbb9830b
added support of vcpu preempt disable

diff -r ff9683032b76 -r b5bbaec70b34 arch/x86_64/kernel/xen_entry.S
--- a/arch/x86_64/kernel/xen_entry.S	Sat Dec 13 16:00:43 2008 +0000
+++ b/arch/x86_64/kernel/xen_entry.S	Wed Dec 17 13:12:53 2008 +0100
@@ -4,6 +4,7 @@
 /* Offsets into shared_info_t. */                
 #define evtchn_upcall_pending		/* 0 */
 #define evtchn_upcall_mask		1
+#define no_desched			2
 
 #define sizeof_vcpu_shift		6
 
@@ -25,8 +26,10 @@
 #define XEN_PUT_VCPU_INFO_fixup
 #endif
 
-#define XEN_LOCKED_BLOCK_EVENTS(reg)	movb $1,evtchn_upcall_mask(reg)
-#define XEN_LOCKED_UNBLOCK_EVENTS(reg)	movb $0,evtchn_upcall_mask(reg)
+#define XEN_LOCKED_BLOCK_EVENTS(reg)	movb $1,evtchn_upcall_mask(reg)	; \
+					movb $1,no_desched(reg)
+#define XEN_LOCKED_UNBLOCK_EVENTS(reg)	movb $0,evtchn_upcall_mask(reg)	; \
+					movb $0,no_desched(reg)
 #define XEN_BLOCK_EVENTS(reg)	XEN_GET_VCPU_INFO(reg)			; \
 				XEN_LOCKED_BLOCK_EVENTS(reg)		; \
     				XEN_PUT_VCPU_INFO(reg)
diff -r ff9683032b76 -r b5bbaec70b34 include/asm-x86_64/mach-xen/asm/irqflags.h
--- a/include/asm-x86_64/mach-xen/asm/irqflags.h	Sat Dec 13 16:00:43 2008 +0000
+++ b/include/asm-x86_64/mach-xen/asm/irqflags.h	Wed Dec 17 13:12:53 2008 +0100
@@ -33,8 +33,12 @@ do {									\
 	vcpu_info_t *_vcpu;						\
 	barrier();							\
 	_vcpu = current_vcpu_info();		\
-	if ((_vcpu->evtchn_upcall_mask = (x)) == 0) {			\
+	if ( !(x) ) {							\
+		_vcpu->no_desched = 0;					\
+	        _vcpu->evtchn_upcall_mask = 0;				\
 		barrier(); /* unmask then check (avoid races) */	\
+		if ( unlikely(_vcpu->desched_delay) )			\
+			(void)((HYPERVISOR_sched_op(SCHEDOP_yield, _vcpu))?:0); \
 		if ( unlikely(_vcpu->evtchn_upcall_pending) )		\
 			force_evtchn_callback();			\
 	}								\
@@ -69,7 +73,10 @@ static inline int raw_irqs_disabled_flag
 
 #define raw_local_irq_disable()						\
 do {									\
-	current_vcpu_info()->evtchn_upcall_mask = 1;					\
+	vcpu_info_t *_vcpu;						\
+	_vcpu = current_vcpu_info();		\
+	_vcpu->evtchn_upcall_mask = 1;					\
+	_vcpu->no_desched = 1;						\
 	barrier();							\
 } while (0)
 
@@ -78,8 +85,11 @@ do {									\
 	vcpu_info_t *_vcpu;						\
 	barrier();							\
 	_vcpu = current_vcpu_info();		\
+	_vcpu->no_desched = 0;						\
 	_vcpu->evtchn_upcall_mask = 0;					\
 	barrier(); /* unmask then check (avoid races) */		\
+	if ( unlikely(_vcpu->desched_delay) )				\
+		(void)((HYPERVISOR_sched_op(SCHEDOP_yield, _vcpu))?:0);	\
 	if ( unlikely(_vcpu->evtchn_upcall_pending) )			\
 		force_evtchn_callback();				\
 } while (0)
diff -r ff9683032b76 -r b5bbaec70b34 include/xen/interface/xen.h
--- a/include/xen/interface/xen.h	Sat Dec 13 16:00:43 2008 +0000
+++ b/include/xen/interface/xen.h	Wed Dec 17 13:12:53 2008 +0100
@@ -434,9 +434,18 @@ struct vcpu_info {
      * non-zero mask therefore guarantees that the VCPU will not receive
      * an upcall activation. The mask is cleared when the VCPU requests
      * to block: this avoids wakeup-waiting races.
+     *
+     * The guest can set 'no_desched' to a non-zero value to avoid being
+     * descheduled. If the hypervisor didn't deschedule the VCPU due to
+     * 'no_desched' being set, it will itself set 'desched_delay' to inform
+     * the guest to give up control voluntaryly later. This is just a wish
+     * of the guest which the hypervisor may not obey (and it will deschedule
+     * the guest after a reasonable time anyway).
      */
     uint8_t evtchn_upcall_pending;
     uint8_t evtchn_upcall_mask;
+    uint8_t no_desched;
+    uint8_t desched_delay;
     unsigned long evtchn_pending_sel;
     struct arch_vcpu_info arch;
     struct vcpu_time_info time;

[-- Attachment #3: Type: text/plain, Size: 138 bytes --]

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xensource.com
http://lists.xensource.com/xen-devel

^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [Patch 2 of 2]: PV-domain SMP performance Linux-part
  2008-12-17 12:22 [Patch 2 of 2]: PV-domain SMP performance Linux-part Juergen Gross
@ 2008-12-17 15:06 ` Jan Beulich
  2008-12-18  7:18   ` Juergen Gross
  0 siblings, 1 reply; 37+ messages in thread
From: Jan Beulich @ 2008-12-17 15:06 UTC (permalink / raw)
  To: Juergen Gross; +Cc: xen-devel@lists.xensource.com

>--- a/include/asm-x86_64/mach-xen/asm/irqflags.h	Sat Dec 13 16:00:43 2008 +0000
>+++ b/include/asm-x86_64/mach-xen/asm/irqflags.h	Wed Dec 17 13:12:53 2008 +0100
>@@ -33,8 +33,12 @@ do {									\
> 	vcpu_info_t *_vcpu;						\
> 	barrier();							\
> 	_vcpu = current_vcpu_info();		\
>-	if ((_vcpu->evtchn_upcall_mask = (x)) == 0) {			\
>+	if ( !(x) ) {							\

This isn't correct, as it breaks 0->1 transitions (there are a few instances of
this in the kernel).

>+		_vcpu->no_desched = 0;					\
>+	        _vcpu->evtchn_upcall_mask = 0;				\
> 		barrier(); /* unmask then check (avoid races) */	\
>+		if ( unlikely(_vcpu->desched_delay) )			\
>+			(void)((HYPERVISOR_sched_op(SCHEDOP_yield, _vcpu))?:0); \

Why not just cast the function result to void? Likewise further below...

> 		if ( unlikely(_vcpu->evtchn_upcall_pending) )		\
> 			force_evtchn_callback();			\
> 	}								\

Jan

^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [Patch 2 of 2]: PV-domain SMP performance Linux-part
  2008-12-17 15:06 ` Jan Beulich
@ 2008-12-18  7:18   ` Juergen Gross
  2008-12-18  7:41     ` Jan Beulich
  0 siblings, 1 reply; 37+ messages in thread
From: Juergen Gross @ 2008-12-18  7:18 UTC (permalink / raw)
  To: Jan Beulich; +Cc: xen-devel@lists.xensource.com

Jan Beulich wrote:
>> --- a/include/asm-x86_64/mach-xen/asm/irqflags.h	Sat Dec 13 16:00:43 2008 +0000
>> +++ b/include/asm-x86_64/mach-xen/asm/irqflags.h	Wed Dec 17 13:12:53 2008 +0100
>> @@ -33,8 +33,12 @@ do {									\
>> 	vcpu_info_t *_vcpu;						\
>> 	barrier();							\
>> 	_vcpu = current_vcpu_info();		\
>> -	if ((_vcpu->evtchn_upcall_mask = (x)) == 0) {			\
>> +	if ( !(x) ) {							\
> 
> This isn't correct, as it breaks 0->1 transitions (there are a few instances of
> this in the kernel).

Thanks!
I will correct it.

> 
>> +		_vcpu->no_desched = 0;					\
>> +	        _vcpu->evtchn_upcall_mask = 0;				\
>> 		barrier(); /* unmask then check (avoid races) */	\
>> +		if ( unlikely(_vcpu->desched_delay) )			\
>> +			(void)((HYPERVISOR_sched_op(SCHEDOP_yield, _vcpu))?:0); \
> 
> Why not just cast the function result to void? Likewise further below...

I took that from include/xen/hypercall.h, which mentioned problems with just
casting the function result.


Juergen

-- 
Juergen Gross                             Principal Developer
IP SW OS6                      Telephone: +49 (0) 89 636 47950
Fujitsu Siemens Computers         e-mail: juergen.gross@fujitsu-siemens.com
Otto-Hahn-Ring 6                Internet: www.fujitsu-siemens.com
D-81739 Muenchen         Company details: www.fujitsu-siemens.com/imprint.html

^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [Patch 2 of 2]: PV-domain SMP performance Linux-part
  2008-12-18  7:18   ` Juergen Gross
@ 2008-12-18  7:41     ` Jan Beulich
  2008-12-19  8:12       ` Juergen Gross
  0 siblings, 1 reply; 37+ messages in thread
From: Jan Beulich @ 2008-12-18  7:41 UTC (permalink / raw)
  To: Juergen Gross; +Cc: xen-devel@lists.xensource.com

>>> Juergen Gross <juergen.gross@fujitsu-siemens.com> 18.12.08 08:18 >>>
>Jan Beulich wrote:
>> 
>>> +		_vcpu->no_desched = 0;					\
>>> +	        _vcpu->evtchn_upcall_mask = 0;				\
>>> 		barrier(); /* unmask then check (avoid races) */	\
>>> +		if ( unlikely(_vcpu->desched_delay) )			\
>>> +			(void)((HYPERVISOR_sched_op(SCHEDOP_yield, _vcpu))?:0); \
>> 
>> Why not just cast the function result to void? Likewise further below...
>
>I took that from include/xen/hypercall.h, which mentioned problems with just
>casting the function result.

Ah, yes, I recall. But then why don't you use that macro? After all, hypercall.h
must have been included if you're able to call HYPERVISOR_sched_op().

Jan

^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [Patch 2 of 2]: PV-domain SMP performance Linux-part
  2008-12-18  7:41     ` Jan Beulich
@ 2008-12-19  8:12       ` Juergen Gross
  2008-12-19  9:10         ` Keir Fraser
  0 siblings, 1 reply; 37+ messages in thread
From: Juergen Gross @ 2008-12-19  8:12 UTC (permalink / raw)
  To: Jan Beulich, Keir Fraser; +Cc: xen-devel@lists.xensource.com

[-- Attachment #1: Type: text/plain, Size: 1503 bytes --]

Jan Beulich wrote:
>>>> Juergen Gross <juergen.gross@fujitsu-siemens.com> 18.12.08 08:18 >>>
>> Jan Beulich wrote:
>>>> +		_vcpu->no_desched = 0;					\
>>>> +	        _vcpu->evtchn_upcall_mask = 0;				\
>>>> 		barrier(); /* unmask then check (avoid races) */	\
>>>> +		if ( unlikely(_vcpu->desched_delay) )			\
>>>> +			(void)((HYPERVISOR_sched_op(SCHEDOP_yield, _vcpu))?:0); \
>>> Why not just cast the function result to void? Likewise further below...
>> I took that from include/xen/hypercall.h, which mentioned problems with just
>> casting the function result.
> 
> Ah, yes, I recall. But then why don't you use that macro? After all, hypercall.h
> must have been included if you're able to call HYPERVISOR_sched_op().

This was an artefact from previous tests, sorry.
Attached patch should be clean now.

Keir, what is your main objection?
Putting this interface into the hypervisor or only the Linux part?
I still think this would be good to have in linux, but our BS2000 port would
really be much easier having this interface in the hypervisor!
So if you don't mind adding only the first patch, this would be absolutely
okay for us.


Juergen

-- 
Juergen Gross                             Principal Developer
IP SW OS6                      Telephone: +49 (0) 89 636 47950
Fujitsu Siemens Computers         e-mail: juergen.gross@fujitsu-siemens.com
Otto-Hahn-Ring 6                Internet: www.fujitsu-siemens.com
D-81739 Muenchen         Company details: www.fujitsu-siemens.com/imprint.html

[-- Attachment #2: vcpu_desched-linux.patch --]
[-- Type: text/x-patch, Size: 3824 bytes --]

Mark vcpu as no_desched when interrupts are disabled.

Signed-off-by: juergen.gross@fujitsu-siemens.com

# HG changeset patch
# User juergen.gross@fujitsu-siemens.com
# Date 1229673855 -3600
# Node ID 0b6ef982e456e4adbd2a547406d475d1b554fcbe
# Parent  ff9683032b76f533509191bb9532df10cbb9830b
added support of vcpu preempt disable

diff -r ff9683032b76 -r 0b6ef982e456 arch/x86_64/kernel/xen_entry.S
--- a/arch/x86_64/kernel/xen_entry.S	Sat Dec 13 16:00:43 2008 +0000
+++ b/arch/x86_64/kernel/xen_entry.S	Fri Dec 19 09:04:15 2008 +0100
@@ -4,6 +4,7 @@
 /* Offsets into shared_info_t. */                
 #define evtchn_upcall_pending		/* 0 */
 #define evtchn_upcall_mask		1
+#define no_desched			2
 
 #define sizeof_vcpu_shift		6
 
@@ -25,8 +26,10 @@
 #define XEN_PUT_VCPU_INFO_fixup
 #endif
 
-#define XEN_LOCKED_BLOCK_EVENTS(reg)	movb $1,evtchn_upcall_mask(reg)
-#define XEN_LOCKED_UNBLOCK_EVENTS(reg)	movb $0,evtchn_upcall_mask(reg)
+#define XEN_LOCKED_BLOCK_EVENTS(reg)	movb $1,evtchn_upcall_mask(reg)	; \
+					movb $1,no_desched(reg)
+#define XEN_LOCKED_UNBLOCK_EVENTS(reg)	movb $0,evtchn_upcall_mask(reg)	; \
+					movb $0,no_desched(reg)
 #define XEN_BLOCK_EVENTS(reg)	XEN_GET_VCPU_INFO(reg)			; \
 				XEN_LOCKED_BLOCK_EVENTS(reg)		; \
     				XEN_PUT_VCPU_INFO(reg)
diff -r ff9683032b76 -r 0b6ef982e456 include/asm-x86_64/mach-xen/asm/irqflags.h
--- a/include/asm-x86_64/mach-xen/asm/irqflags.h	Sat Dec 13 16:00:43 2008 +0000
+++ b/include/asm-x86_64/mach-xen/asm/irqflags.h	Fri Dec 19 09:04:15 2008 +0100
@@ -34,7 +34,10 @@ do {									\
 	barrier();							\
 	_vcpu = current_vcpu_info();		\
 	if ((_vcpu->evtchn_upcall_mask = (x)) == 0) {			\
+		_vcpu->no_desched = 0;					\
 		barrier(); /* unmask then check (avoid races) */	\
+		if ( unlikely(_vcpu->desched_delay) )			\
+			VOID(HYPERVISOR_sched_op(SCHEDOP_yield, 0));	\
 		if ( unlikely(_vcpu->evtchn_upcall_pending) )		\
 			force_evtchn_callback();			\
 	}								\
@@ -69,7 +72,10 @@ static inline int raw_irqs_disabled_flag
 
 #define raw_local_irq_disable()						\
 do {									\
-	current_vcpu_info()->evtchn_upcall_mask = 1;					\
+	vcpu_info_t *_vcpu;						\
+	_vcpu = current_vcpu_info();		\
+	_vcpu->evtchn_upcall_mask = 1;					\
+	_vcpu->no_desched = 1;						\
 	barrier();							\
 } while (0)
 
@@ -78,8 +84,11 @@ do {									\
 	vcpu_info_t *_vcpu;						\
 	barrier();							\
 	_vcpu = current_vcpu_info();		\
+	_vcpu->no_desched = 0;						\
 	_vcpu->evtchn_upcall_mask = 0;					\
 	barrier(); /* unmask then check (avoid races) */		\
+	if ( unlikely(_vcpu->desched_delay) )				\
+		VOID(HYPERVISOR_sched_op(SCHEDOP_yield, 0));		\
 	if ( unlikely(_vcpu->evtchn_upcall_pending) )			\
 		force_evtchn_callback();				\
 } while (0)
diff -r ff9683032b76 -r 0b6ef982e456 include/xen/interface/xen.h
--- a/include/xen/interface/xen.h	Sat Dec 13 16:00:43 2008 +0000
+++ b/include/xen/interface/xen.h	Fri Dec 19 09:04:15 2008 +0100
@@ -434,9 +434,18 @@ struct vcpu_info {
      * non-zero mask therefore guarantees that the VCPU will not receive
      * an upcall activation. The mask is cleared when the VCPU requests
      * to block: this avoids wakeup-waiting races.
+     *
+     * The guest can set 'no_desched' to a non-zero value to avoid being
+     * descheduled. If the hypervisor didn't deschedule the VCPU due to
+     * 'no_desched' being set, it will itself set 'desched_delay' to inform
+     * the guest to give up control voluntaryly later. This is just a wish
+     * of the guest which the hypervisor may not obey (and it will deschedule
+     * the guest after a reasonable time anyway).
      */
     uint8_t evtchn_upcall_pending;
     uint8_t evtchn_upcall_mask;
+    uint8_t no_desched;
+    uint8_t desched_delay;
     unsigned long evtchn_pending_sel;
     struct arch_vcpu_info arch;
     struct vcpu_time_info time;

[-- Attachment #3: Type: text/plain, Size: 138 bytes --]

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xensource.com
http://lists.xensource.com/xen-devel

^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [Patch 2 of 2]: PV-domain SMP performance Linux-part
  2008-12-19  8:12       ` Juergen Gross
@ 2008-12-19  9:10         ` Keir Fraser
  2008-12-19  9:25           ` Juergen Gross
  2008-12-19  9:33           ` Jan Beulich
  0 siblings, 2 replies; 37+ messages in thread
From: Keir Fraser @ 2008-12-19  9:10 UTC (permalink / raw)
  To: Juergen Gross, Jan Beulich; +Cc: xen-devel@lists.xensource.com

On 19/12/2008 08:12, "Juergen Gross" <juergen.gross@fujitsu-siemens.com>
wrote:

>> Ah, yes, I recall. But then why don't you use that macro? After all,
>> hypercall.h
>> must have been included if you're able to call HYPERVISOR_sched_op().
> 
> This was an artefact from previous tests, sorry.
> Attached patch should be clean now.
> 
> Keir, what is your main objection?
> Putting this interface into the hypervisor or only the Linux part?
> I still think this would be good to have in linux, but our BS2000 port would
> really be much easier having this interface in the hypervisor!
> So if you don't mind adding only the first patch, this would be absolutely
> okay for us.

I haven't seen any win on any real world setup. So I remain unconvinced, and
it'll need more than you alone championing the patch to get it in. There
have been no other general comments so far (Jan's have been about specific
details).

 -- Keir

^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [Patch 2 of 2]: PV-domain SMP performance Linux-part
  2008-12-19  9:10         ` Keir Fraser
@ 2008-12-19  9:25           ` Juergen Gross
  2008-12-19  9:56             ` Keir Fraser
  2008-12-19  9:33           ` Jan Beulich
  1 sibling, 1 reply; 37+ messages in thread
From: Juergen Gross @ 2008-12-19  9:25 UTC (permalink / raw)
  To: Keir Fraser; +Cc: xen-devel@lists.xensource.com

Keir Fraser wrote:
> On 19/12/2008 08:12, "Juergen Gross" <juergen.gross@fujitsu-siemens.com>
> wrote:
>> Keir, what is your main objection?
>> Putting this interface into the hypervisor or only the Linux part?
>> I still think this would be good to have in linux, but our BS2000 port would
>> really be much easier having this interface in the hypervisor!
>> So if you don't mind adding only the first patch, this would be absolutely
>> okay for us.
> 
> I haven't seen any win on any real world setup. So I remain unconvinced, and
> it'll need more than you alone championing the patch to get it in. There
> have been no other general comments so far (Jan's have been about specific
> details).

Okay, would the following scenario be "real world" enough?

Multiple domUs being busy leading to enough vcpu scheduling, several parallel
kernel builds in dom0 acting as benchmark.

Juergen

-- 
Juergen Gross                             Principal Developer
IP SW OS6                      Telephone: +49 (0) 89 636 47950
Fujitsu Siemens Computers         e-mail: juergen.gross@fujitsu-siemens.com
Otto-Hahn-Ring 6                Internet: www.fujitsu-siemens.com
D-81739 Muenchen         Company details: www.fujitsu-siemens.com/imprint.html

^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [Patch 2 of 2]: PV-domain SMP performance Linux-part
  2008-12-19  9:10         ` Keir Fraser
  2008-12-19  9:25           ` Juergen Gross
@ 2008-12-19  9:33           ` Jan Beulich
  2008-12-19  9:56             ` Keir Fraser
  2008-12-19 10:06             ` Juergen Gross
  1 sibling, 2 replies; 37+ messages in thread
From: Jan Beulich @ 2008-12-19  9:33 UTC (permalink / raw)
  To: Keir Fraser; +Cc: Juergen Gross, xen-devel@lists.xensource.com

>>> Keir Fraser <keir.fraser@eu.citrix.com> 19.12.08 10:10 >>>
>I haven't seen any win on any real world setup. So I remain unconvinced, and
>it'll need more than you alone championing the patch to get it in. There
>have been no other general comments so far (Jan's have been about specific
>details).

I think I'd generally welcome a change like this, but I'm not certain how far
I feel convinced that the submission meets one very basic criteria:
avoidance of mis-use of the feature by a domain (simply stating that a vCPU
will be de-scheduled after 1ms anyway doesn't seem sufficient to me). This
might need to include ways to differentiate between Dom0/DomU and/or
CPU- vs IO-bound vCPU-s.

Beyond that, it seems questionable that tying this to event delivery being
disabled on the vCPU is very useful - Xen could do this on its own, without
needing a second flag. Having an extra flag really seems useful only when
one can set it while holding spin locks, which at least the Linux part of the
patch doesn't seem to aim at.

Jan

^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [Patch 2 of 2]: PV-domain SMP performance Linux-part
  2008-12-19  9:25           ` Juergen Gross
@ 2008-12-19  9:56             ` Keir Fraser
  2009-01-16  7:16               ` Juergen Gross
  0 siblings, 1 reply; 37+ messages in thread
From: Keir Fraser @ 2008-12-19  9:56 UTC (permalink / raw)
  To: Juergen Gross; +Cc: George Dunlap, xen-devel@lists.xensource.com

On 19/12/2008 09:25, "Juergen Gross" <juergen.gross@fujitsu-siemens.com>
wrote:

>> I haven't seen any win on any real world setup. So I remain unconvinced, and
>> it'll need more than you alone championing the patch to get it in. There
>> have been no other general comments so far (Jan's have been about specific
>> details).
> 
> Okay, would the following scenario be "real world" enough?
> 
> Multiple domUs being busy leading to enough vcpu scheduling, several parallel
> kernel builds in dom0 acting as benchmark.

Something like that would be better. Of course you'd need to measure work
done in the domUs as well, as one of the critical factors for this patch
would be how it affects fairness. It's one reason I'm leery of this patch --
our scheduler is unpredictable enough as it is without giving domains
another lever to pull!

 -- Keir

^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [Patch 2 of 2]: PV-domain SMP performance Linux-part
  2008-12-19  9:33           ` Jan Beulich
@ 2008-12-19  9:56             ` Keir Fraser
  2008-12-19 15:15               ` George Dunlap
  2008-12-19 10:06             ` Juergen Gross
  1 sibling, 1 reply; 37+ messages in thread
From: Keir Fraser @ 2008-12-19  9:56 UTC (permalink / raw)
  To: Jan Beulich; +Cc: George Dunlap, Juergen Gross, xen-devel@lists.xensource.com

On 19/12/2008 09:33, "Jan Beulich" <jbeulich@novell.com> wrote:

>>>> Keir Fraser <keir.fraser@eu.citrix.com> 19.12.08 10:10 >>>
>> I haven't seen any win on any real world setup. So I remain unconvinced, and
>> it'll need more than you alone championing the patch to get it in. There
>> have been no other general comments so far (Jan's have been about specific
>> details).
> 
> I think I'd generally welcome a change like this, but I'm not certain how far
> I feel convinced that the submission meets one very basic criteria:
> avoidance of mis-use of the feature by a domain (simply stating that a vCPU
> will be de-scheduled after 1ms anyway doesn't seem sufficient to me). This
> might need to include ways to differentiate between Dom0/DomU and/or
> CPU- vs IO-bound vCPU-s.

The most likely person to comment on that in the coming weeks would be
George, who's kindly signed up to do some design work on the scheduler.

 -- Keir

^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [Patch 2 of 2]: PV-domain SMP performance Linux-part
  2008-12-19  9:33           ` Jan Beulich
  2008-12-19  9:56             ` Keir Fraser
@ 2008-12-19 10:06             ` Juergen Gross
  2008-12-19 10:36               ` Jan Beulich
  1 sibling, 1 reply; 37+ messages in thread
From: Juergen Gross @ 2008-12-19 10:06 UTC (permalink / raw)
  To: Jan Beulich; +Cc: xen-devel@lists.xensource.com, Keir Fraser

Jan Beulich wrote:
>>>> Keir Fraser <keir.fraser@eu.citrix.com> 19.12.08 10:10 >>>
>> I haven't seen any win on any real world setup. So I remain unconvinced, and
>> it'll need more than you alone championing the patch to get it in. There
>> have been no other general comments so far (Jan's have been about specific
>> details).
> 
> I think I'd generally welcome a change like this, but I'm not certain how far
> I feel convinced that the submission meets one very basic criteria:
> avoidance of mis-use of the feature by a domain (simply stating that a vCPU
> will be de-scheduled after 1ms anyway doesn't seem sufficient to me). This
> might need to include ways to differentiate between Dom0/DomU and/or
> CPU- vs IO-bound vCPU-s.

It would be possible to sum up the mis-usages. The extra 1 msec could be
allowed only if the total consumed time of the vcpu is at least 1000 times
the extra time spent so far. Or we could allow it only if no mis-usage had
been recorded in the last second.

> 
> Beyond that, it seems questionable that tying this to event delivery being
> disabled on the vCPU is very useful - Xen could do this on its own, without
> needing a second flag. Having an extra flag really seems useful only when
> one can set it while holding spin locks, which at least the Linux part of the
> patch doesn't seem to aim at.

My first test was not limited to the irq disabling. I tried to tie it to the
preemption_count of the current thread by adding special functions for
manipulating the preemption_count. I found this approach was to complicated
for a first test as I obviously missed some details (the resulting system
did work, but was much slower as before).
I think it would be interesting to spend some more time to tune this, but I
wanted to get some feedback first. And, to be honest, I don't think I am
the right person to do this, as I'm really no Linux scheduling specialist :-)

Regarding handling this in Xen only: not to deschedule a vcpu in case of
interrupts disabled is easy, but how would you deschedule it after interrupts
are allowed again?

Juergen

-- 
Juergen Gross                             Principal Developer
IP SW OS6                      Telephone: +49 (0) 89 636 47950
Fujitsu Siemens Computers         e-mail: juergen.gross@fujitsu-siemens.com
Otto-Hahn-Ring 6                Internet: www.fujitsu-siemens.com
D-81739 Muenchen         Company details: www.fujitsu-siemens.com/imprint.html

^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [Patch 2 of 2]: PV-domain SMP performance Linux-part
  2008-12-19 10:06             ` Juergen Gross
@ 2008-12-19 10:36               ` Jan Beulich
  2008-12-19 10:42                 ` Juergen Gross
  2008-12-19 10:48                 ` Juergen Gross
  0 siblings, 2 replies; 37+ messages in thread
From: Jan Beulich @ 2008-12-19 10:36 UTC (permalink / raw)
  To: Juergen Gross; +Cc: xen-devel@lists.xensource.com, Keir Fraser

>>> Juergen Gross <juergen.gross@fujitsu-siemens.com> 19.12.08 11:06 >>>
>Regarding handling this in Xen only: not to deschedule a vcpu in case of
>interrupts disabled is easy, but how would you deschedule it after interrupts
>are allowed again?

It's all the same as with the newly added flag of yours - it requires cooperation
from the guest (plus the forced de-schedule if it fails to do so). It's just that
you don't need to set two flags when disabling interrupts, that second flag
would only be needed when you want to avoid being de-scheduled for
reasons other than event delivery being disabled.

Jan

^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [Patch 2 of 2]: PV-domain SMP performance  Linux-part
  2008-12-19 10:36               ` Jan Beulich
@ 2008-12-19 10:42                 ` Juergen Gross
  2008-12-19 10:48                 ` Juergen Gross
  1 sibling, 0 replies; 37+ messages in thread
From: Juergen Gross @ 2008-12-19 10:42 UTC (permalink / raw)
  To: Jan Beulich; +Cc: xen-devel@lists.xensource.com, Keir Fraser

Jan Beulich wrote:
>>>> Juergen Gross <juergen.gross@fujitsu-siemens.com> 19.12.08 11:06 >>>
>> Regarding handling this in Xen only: not to deschedule a vcpu in case of
>> interrupts disabled is easy, but how would you deschedule it after interrupts
>> are allowed again?
> 
> It's all the same as with the newly added flag of yours - it requires cooperation
> from the guest (plus the forced de-schedule if it fails to do so). It's just that
> you don't need to set two flags when disabling interrupts, that second flag
> would only be needed when you want to avoid being de-scheduled for
> reasons other than event delivery being disabled.

I just wanted to be independent from the event mechanism. I'm fine with
merging the two features, but it would be more limiting than necessary.
It's like disabling IRQs and avoiding preemption of a thread: you can avoid
preemption by disabling IRQs, but this is not the best way to do it...


Juergen

-- 
Juergen Gross                             Principal Developer
IP SW OS6                      Telephone: +49 (0) 89 636 47950
Fujitsu Siemens Computers         e-mail: juergen.gross@fujitsu-siemens.com
Otto-Hahn-Ring 6                Internet: www.fujitsu-siemens.com
D-81739 Muenchen         Company details: www.fujitsu-siemens.com/imprint.html

^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [Patch 2 of 2]: PV-domain SMP performance Linux-part
  2008-12-19 10:36               ` Jan Beulich
  2008-12-19 10:42                 ` Juergen Gross
@ 2008-12-19 10:48                 ` Juergen Gross
  1 sibling, 0 replies; 37+ messages in thread
From: Juergen Gross @ 2008-12-19 10:48 UTC (permalink / raw)
  To: Jan Beulich; +Cc: xen-devel@lists.xensource.com, Keir Fraser

Jan Beulich wrote:
>>>> Juergen Gross <juergen.gross@fujitsu-siemens.com> 19.12.08 11:06 >>>
>> Regarding handling this in Xen only: not to deschedule a vcpu in case of
>> interrupts disabled is easy, but how would you deschedule it after interrupts
>> are allowed again?
> 
> It's all the same as with the newly added flag of yours - it requires cooperation
> from the guest (plus the forced de-schedule if it fails to do so). It's just that
> you don't need to set two flags when disabling interrupts, that second flag
> would only be needed when you want to avoid being de-scheduled for
> reasons other than event delivery being disabled.

Just another thought:
My approach was more compatible. Only a guest which is aware of the new flag
will set it and in turn respect the request of the hypervisor to give up
control after enabling de-scheduling again.
Old guests would always be regarded as non-cooperative.

Juergen

-- 
Juergen Gross                             Principal Developer
IP SW OS6                      Telephone: +49 (0) 89 636 47950
Fujitsu Siemens Computers         e-mail: juergen.gross@fujitsu-siemens.com
Otto-Hahn-Ring 6                Internet: www.fujitsu-siemens.com
D-81739 Muenchen         Company details: www.fujitsu-siemens.com/imprint.html

^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [Patch 2 of 2]: PV-domain SMP performance Linux-part
  2008-12-19  9:56             ` Keir Fraser
@ 2008-12-19 15:15               ` George Dunlap
  2009-01-12 12:55                 ` Juergen Gross
  0 siblings, 1 reply; 37+ messages in thread
From: George Dunlap @ 2008-12-19 15:15 UTC (permalink / raw)
  To: Keir Fraser; +Cc: Juergen Gross, xen-devel@lists.xensource.com

[-- Attachment #1: Type: text/plain, Size: 2593 bytes --]

The general idea seems interesting.  I think we've kicked it around
internally before, but ended up sticking with a "yield after spinning
for awhile" strategy just for simplicity.  However, as Jeurgen says,
this flag could, in principle, save all of the "spin for a while"
timewasting in the first place.

As for mis-use: If we do things right, a guest shouldn't be able to
get an advantage from setting the flag when it doesn't need to. If we
add the ability to preempt it after 1ms, and deduct the extra credits
from the VM for the extra time run, then it will only run a little
longer, and then have to wait longer to be scheduled again time.  (I
think the more accurate credit accounting part of Naoki's patches are
sure to be included in the scheduler revision.)  If it doesn't yield
after the critical section is over, it risks  being pre-empted at the
next critical section.

The thing to test would be concurrent kernel builds and dbench, with
multiple domains, each domain vcpus == pcpus.

Would you mind coding up a yield-after-spinning-awhile patch, and
comparing the results to your "don't-deschedule-me" patch kernel build
at least, and possibly dbench?  I'm including some patches which
should be included when testing the "yield after spinning awhile"
patch, otherwise nothing interesting will happen.  They're a bit
hackish, but seem to work pretty well for their purpose.

 -George


On Fri, Dec 19, 2008 at 9:56 AM, Keir Fraser <keir.fraser@eu.citrix.com> wrote:
> On 19/12/2008 09:33, "Jan Beulich" <jbeulich@novell.com> wrote:
>
>>>>> Keir Fraser <keir.fraser@eu.citrix.com> 19.12.08 10:10 >>>
>>> I haven't seen any win on any real world setup. So I remain unconvinced, and
>>> it'll need more than you alone championing the patch to get it in. There
>>> have been no other general comments so far (Jan's have been about specific
>>> details).
>>
>> I think I'd generally welcome a change like this, but I'm not certain how far
>> I feel convinced that the submission meets one very basic criteria:
>> avoidance of mis-use of the feature by a domain (simply stating that a vCPU
>> will be de-scheduled after 1ms anyway doesn't seem sufficient to me). This
>> might need to include ways to differentiate between Dom0/DomU and/or
>> CPU- vs IO-bound vCPU-s.
>
> The most likely person to comment on that in the coming weeks would be
> George, who's kindly signed up to do some design work on the scheduler.
>
>  -- Keir
>
>
>
> _______________________________________________
> Xen-devel mailing list
> Xen-devel@lists.xensource.com
> http://lists.xensource.com/xen-devel
>

[-- Warning: decoded text below may be mangled, UTF-8 assumed --]
[-- Attachment #2: scheduler.cpu_pick-avoids-redundancy.patch --]
[-- Type: text/x-diff; name=scheduler.cpu_pick-avoids-redundancy.patch, Size: 1051 bytes --]

diff -r 121d3e592294 xen/common/sched_credit.c
--- a/xen/common/sched_credit.c	Wed Aug 15 14:33:28 2007 +0100
+++ b/xen/common/sched_credit.c	Tue Aug 21 11:14:46 2007 +0100
@@ -418,15 +418,29 @@ static int
 static int
 csched_cpu_pick(struct vcpu *vc)
 {
-    cpumask_t cpus;
+    cpumask_t cpus, cpus_avoid_same_domain;
     cpumask_t idlers;
     int cpu;
+    struct vcpu *vp;
 
     /*
      * Pick from online CPUs in VCPU's affinity mask, giving a
      * preference to its current processor if it's in there.
      */
     cpus_and(cpus, cpu_online_map, vc->cpu_affinity);
+
+    /* Avoid putting vcpus from the same domain on the same cpu */
+    cpus_avoid_same_domain=cpus;
+    for_each_vcpu( vc->domain, vp )
+    {
+        if(vp == vc)
+            continue;
+        cpu_clear(vp->processor, cpus_avoid_same_domain);
+    }
+
+    if( !cpus_empty(cpus_avoid_same_domain) )
+        cpus = cpus_avoid_same_domain;
+
     cpu = cpu_isset(vc->processor, cpus)
             ? vc->processor
             : __cycle_cpu(vc->processor, &cpus);

[-- Warning: decoded text below may be mangled, UTF-8 assumed --]
[-- Attachment #3: scheduler.push-redundant-vcpus.patch --]
[-- Type: text/x-diff; name=scheduler.push-redundant-vcpus.patch, Size: 2748 bytes --]

diff -r 7280488d98d1 xen/common/sched_credit.c
--- a/xen/common/sched_credit.c	Fri Aug 31 15:56:20 2007 +0100
+++ b/xen/common/sched_credit.c	Fri Aug 31 16:18:46 2007 +0100
@@ -1066,6 +1066,50 @@ csched_tick(void *_cpu)
     set_timer(&spc->ticker, NOW() + MILLISECS(CSCHED_MSECS_PER_TICK));
 }
 
+static void
+csched_distribute_domain_vcpus(int cpu, struct domain * ddom)
+{
+    const struct csched_pcpu * const pcpu = CSCHED_PCPU(cpu);
+    struct csched_vcpu *sp;
+    struct list_head * iter, *n;
+
+    list_for_each_safe( iter, n, &pcpu->runq )
+    {
+        sp = __runq_elem(iter);
+        if( ( sp->vcpu->domain == ddom ) && vcpu_runnable(sp->vcpu) ) {
+            /*
+             * There is a short period of time, between the beginning
+             * of csched_schedule() and context_switch(), where the
+             * currently running vcpu (aka prev in schedule.c) is on
+             * the runqueue, and the newly chosen vcpu (next) is off
+             * the runqueue but not yet running.  Scheduling the
+             * running vcpu on another cpu before it's context
+             * switched out is disastrous.
+             *
+             * Instead we set the migrating bit.  This is checked
+             * after context_switch() occurs.  vcpu_migrate will
+             * call cpu_pick() to choose another cpu at that time.
+             */
+            if( sp->vcpu->is_running )
+            {
+                __runq_remove(sp);
+                set_bit(_VPF_migrating, &sp->vcpu->pause_flags);
+            }
+            else 
+            {
+                int peer_cpu = csched_cpu_pick(sp->vcpu);
+                if ( spin_trylock(&per_cpu(schedule_data, peer_cpu).schedule_lock) )
+                {
+                    __runq_remove(sp);
+                    sp->vcpu->processor = peer_cpu;
+                    csched_vcpu_wake(sp->vcpu);
+                    spin_unlock(&per_cpu(schedule_data, peer_cpu).schedule_lock);
+                } 
+            }
+        }
+    }
+}
+
 static struct csched_vcpu *
 csched_runq_steal(int peer_cpu, int cpu, int pri)
 {
@@ -1220,6 +1264,13 @@ csched_schedule(s_time_t now)
     {
         cpu_clear(cpu, csched_priv.idlers);
     }
+
+    /* 
+     * Distrubute vcpus from the same domain from this
+     * runqueue to others
+     */
+    if( !is_idle_vcpu(snext->vcpu) )
+        csched_distribute_domain_vcpus(cpu, snext->vcpu->domain);
 
     /*
      * Return task to run next...
@@ -1407,3 +1458,13 @@ struct scheduler sched_credit_def = {
     .dump_settings  = csched_dump,
     .init           = csched_init,
 };
+
+/*
+ * Local variables:
+ * mode: C
+ * c-set-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */

[-- Warning: decoded text below may be mangled, UTF-8 assumed --]
[-- Attachment #4: scheduler.yield-reduces-priority.patch --]
[-- Type: text/x-diff; name=scheduler.yield-reduces-priority.patch, Size: 3314 bytes --]

diff -r bb9a7d40d0b5 xen/common/sched_credit.c
--- a/xen/common/sched_credit.c	Mon Sep 01 13:01:23 2008 +0100
+++ b/xen/common/sched_credit.c	Mon Sep 01 13:01:37 2008 +0100
@@ -61,7 +61,9 @@
 /*
  * Flags
  */
-#define CSCHED_FLAG_VCPU_PARKED 0x0001  /* VCPU over capped credits */
+#define CSCHED_FLAG_VCPU_PARKED    0x0001  /* VCPU over capped credits */
+#define CSCHED_FLAG_VCPU_YIELD     0x0002  /* VCPU yielding */
+#define CSCHED_FLAG_VCPU_YIELD_PRI 0x0004  /* VCPU's priority reduced to force a yield */
 
 
 /*
@@ -711,6 +713,15 @@ csched_vcpu_wake(struct vcpu *vc)
     __runq_tickle(cpu, svc);
 }
 
+static void
+csched_vcpu_yield(struct vcpu *v) 
+{
+    struct csched_vcpu * const sv = CSCHED_VCPU(v);
+
+    /* Temporarily lower priority of vcpus that yield */
+    sv->flags |= CSCHED_FLAG_VCPU_YIELD;
+}
+
 static int
 csched_dom_cntl(
     struct domain *d,
@@ -1231,12 +1242,38 @@ csched_schedule(s_time_t now)
     /*
      * Select next runnable local VCPU (ie top of local runq)
      */
+retry:
     if ( vcpu_runnable(current) )
         __runq_insert(cpu, scurr);
     else
         BUG_ON( is_idle_vcpu(current) || list_empty(runq) );
 
     snext = __runq_elem(runq->next);
+
+    /*
+     * Check to see if yield worked; if not, temporarily reduce priority
+     */
+    if ( scurr->flags & CSCHED_FLAG_VCPU_YIELD )
+    {
+        scurr->flags &= ~(CSCHED_FLAG_VCPU_YIELD);
+        /* Check to see if yield worked.  If not, reduce priority and retry. */
+        if ( (snext == scurr)
+            && (scurr->pri > CSCHED_PRI_TS_OVER) )
+        {
+            __runq_remove(scurr);
+            scurr->flags |= CSCHED_FLAG_VCPU_YIELD_PRI;
+            scurr->pri = CSCHED_PRI_TS_OVER;
+            goto retry;
+        }
+    }
+
+    /* If we reduced priority due to yielding, reinstate it. */
+    if ( snext->flags & CSCHED_FLAG_VCPU_YIELD_PRI )
+    {
+        snext->flags &= ~CSCHED_FLAG_VCPU_YIELD_PRI;
+        if ( snext->pri == CSCHED_PRI_TS_OVER )
+            snext->pri = CSCHED_PRI_TS_UNDER;
+    }
 
     /*
      * SMP Load balance:
@@ -1448,6 +1485,7 @@ struct scheduler sched_credit_def = {
 
     .sleep          = csched_vcpu_sleep,
     .wake           = csched_vcpu_wake,
+    .yield          = csched_vcpu_yield,
 
     .adjust         = csched_dom_cntl,
 
diff -r bb9a7d40d0b5 xen/common/schedule.c
--- a/xen/common/schedule.c	Mon Sep 01 13:01:23 2008 +0100
+++ b/xen/common/schedule.c	Mon Sep 01 13:01:25 2008 +0100
@@ -386,6 +386,12 @@ static long do_poll(struct sched_poll *s
 /* Voluntarily yield the processor for this allocation. */
 static long do_yield(void)
 {
+    struct vcpu * v=current;
+
+    vcpu_schedule_lock_irq(v);
+    SCHED_OP(yield, current);
+    vcpu_schedule_unlock_irq(v);
+
     TRACE_2D(TRC_SCHED_YIELD, current->domain->domain_id, current->vcpu_id);
     raise_softirq(SCHEDULE_SOFTIRQ);
     return 0;
diff -r bb9a7d40d0b5 xen/include/xen/sched-if.h
--- a/xen/include/xen/sched-if.h	Mon Sep 01 13:01:23 2008 +0100
+++ b/xen/include/xen/sched-if.h	Mon Sep 01 13:01:25 2008 +0100
@@ -69,6 +69,7 @@ struct scheduler {
 
     void         (*sleep)          (struct vcpu *);
     void         (*wake)           (struct vcpu *);
+    void         (*yield)          (struct vcpu *);
 
     struct task_slice (*do_schedule) (s_time_t);
 

[-- Attachment #5: Type: text/plain, Size: 138 bytes --]

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xensource.com
http://lists.xensource.com/xen-devel

^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [Patch 2 of 2]: PV-domain SMP performance Linux-part
  2008-12-19 15:15               ` George Dunlap
@ 2009-01-12 12:55                 ` Juergen Gross
  2009-01-19 17:32                   ` George Dunlap
  0 siblings, 1 reply; 37+ messages in thread
From: Juergen Gross @ 2009-01-12 12:55 UTC (permalink / raw)
  To: George Dunlap; +Cc: xen-devel@lists.xensource.com, Keir Fraser

George Dunlap wrote:
> The general idea seems interesting.  I think we've kicked it around
> internally before, but ended up sticking with a "yield after spinning
> for awhile" strategy just for simplicity.  However, as Jeurgen says,
> this flag could, in principle, save all of the "spin for a while"
> timewasting in the first place.
> 
> As for mis-use: If we do things right, a guest shouldn't be able to
> get an advantage from setting the flag when it doesn't need to. If we
> add the ability to preempt it after 1ms, and deduct the extra credits
> from the VM for the extra time run, then it will only run a little
> longer, and then have to wait longer to be scheduled again time.  (I
> think the more accurate credit accounting part of Naoki's patches are
> sure to be included in the scheduler revision.)  If it doesn't yield
> after the critical section is over, it risks  being pre-empted at the
> next critical section.
> 
> The thing to test would be concurrent kernel builds and dbench, with
> multiple domains, each domain vcpus == pcpus.
> 
> Would you mind coding up a yield-after-spinning-awhile patch, and
> comparing the results to your "don't-deschedule-me" patch kernel build
> at least, and possibly dbench?  I'm including some patches which
> should be included when testing the "yield after spinning awhile"
> patch, otherwise nothing interesting will happen.  They're a bit
> hackish, but seem to work pretty well for their purpose.

It took some time (other problems, as always ;-) ), but here are the results:

Hardware: 4 cpu x86_64 machine, 8 GB memory.
Domain 0 with 4 vcpus, 8 other domains with 1 vcpu each spinning to force vcpu
scheduling.
8 parallel xen hypervisor builds on domain 0 plus scp from another machine to
have some network load.
Additional test with dbench after the build jobs.

Results with patched system (no deschedule):
--------------------------------------------
Domain 0 consumed 581.2 seconds, the other domains each about 535 seconds.
While the builds were running 60 scp jobs finished.
Real time for the build was between 1167 and 1214 seconds (av. 1192 seconds).
Summed user time was 562.77 seconds, system time 12.17 seconds.
dbench: Throughput 141.764 MB/sec 10 procs
System reaction to shell commands: okay

Original system:
----------------
Domain 0 consumed 583.8 seconds, the other domains each about 540 seconds.
While the builds were running 60 scp jobs finished.
Real time for the build was between 1181 and 1222 seconds (av. 1204 seconds).
Summed user time was 563.02 seconds, system time 12.65 seconds.
dbench: Throughput 133.249 MB/sec 10 procs
System reaction to shell commands: slower than patched system

Yield in spinlock:
------------------
Domain 0 consumed 582.2 seconds, the other domains each about 555 seconds.
While the builds were running 50 scp jobs finished.
Real time for the build was between 1226 and 1254 seconds (av. 1244 seconds).
Summed user time was 563.43 seconds, system time 12.63 seconds.
dbench: Throughput 145.218 MB/sec 10 procs
System reaction to shell commands: sometimes "hickups" for up to 30 seconds
Included were the hypervisor patches of George


Conclusion:
-----------
Differences not really big, but my "no deschedule" patch had least elapsed
time for build-jobs, while scp was able to transfer same amount of data as
in slower original system.
The "Yield in spinlock" patch had slightly better dbench performance, but
interactive shell commands were a pain sometimes! I suspect some problem in
George's patches during low system load to be the main reason for this
behaviour. Without George's patches the "Yield in spinlock" was very similar
to the original system.


Juergen

-- 
Juergen Gross                             Principal Developer
IP SW OS6                      Telephone: +49 (0) 89 636 47950
Fujitsu Siemens Computers         e-mail: juergen.gross@fujitsu-siemens.com
Otto-Hahn-Ring 6                Internet: www.fujitsu-siemens.com
D-81739 Muenchen         Company details: www.fujitsu-siemens.com/imprint.html

^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [Patch 2 of 2]: PV-domain SMP performance Linux-part
  2008-12-19  9:56             ` Keir Fraser
@ 2009-01-16  7:16               ` Juergen Gross
  2009-01-16  7:38                 ` Venefax
  2009-01-16  8:17                 ` Keir Fraser
  0 siblings, 2 replies; 37+ messages in thread
From: Juergen Gross @ 2009-01-16  7:16 UTC (permalink / raw)
  To: Keir Fraser; +Cc: George Dunlap, xen-devel@lists.xensource.com

Keir Fraser wrote:
> On 19/12/2008 09:25, "Juergen Gross" <juergen.gross@fujitsu-siemens.com>
> wrote:
> 
>>> I haven't seen any win on any real world setup. So I remain unconvinced, and
>>> it'll need more than you alone championing the patch to get it in. There
>>> have been no other general comments so far (Jan's have been about specific
>>> details).
>> Okay, would the following scenario be "real world" enough?
>>
>> Multiple domUs being busy leading to enough vcpu scheduling, several parallel
>> kernel builds in dom0 acting as benchmark.
> 
> Something like that would be better. Of course you'd need to measure work
> done in the domUs as well, as one of the critical factors for this patch
> would be how it affects fairness. It's one reason I'm leery of this patch --
> our scheduler is unpredictable enough as it is without giving domains
> another lever to pull!

Keir, is the data I posted recently okay?
I think my approach requires less changes than the "yield after spin" variant,
which needed more patches in the hypervisor and didn't seem to be settled.
Having my patches in the hypervisor at least would make life much easier for
our BS2000 system...
I would add some code to ensure a domain isn't misusing the new interface.

Juergen

-- 
Juergen Gross                             Principal Developer
IP SW OS6                      Telephone: +49 (0) 89 636 47950
Fujitsu Siemens Computers         e-mail: juergen.gross@fujitsu-siemens.com
Otto-Hahn-Ring 6                Internet: www.fujitsu-siemens.com
D-81739 Muenchen         Company details: www.fujitsu-siemens.com/imprint.html

^ permalink raw reply	[flat|nested] 37+ messages in thread

* RE: [Patch 2 of 2]: PV-domain SMP performance Linux-part
  2009-01-16  7:16               ` Juergen Gross
@ 2009-01-16  7:38                 ` Venefax
  2009-01-16  7:48                   ` Juergen Gross
  2009-01-16  8:17                 ` Keir Fraser
  1 sibling, 1 reply; 37+ messages in thread
From: Venefax @ 2009-01-16  7:38 UTC (permalink / raw)
  To: 'Juergen Gross', 'Keir Fraser'
  Cc: 'George Dunlap', xen-devel

I can test it a real-world situation.
I have SUSE 10-SP2 and have terrible performance issues with fully
virtualized SMP machines. I had to start using Standard PC as HAL to avoid
the penalty.
Federico


-----Original Message-----
From: xen-devel-bounces@lists.xensource.com
[mailto:xen-devel-bounces@lists.xensource.com] On Behalf Of Juergen Gross
Sent: Friday, January 16, 2009 2:16 AM
To: Keir Fraser
Cc: George Dunlap; xen-devel@lists.xensource.com
Subject: Re: [Xen-devel] [Patch 2 of 2]: PV-domain SMP performance
Linux-part

Keir Fraser wrote:
> On 19/12/2008 09:25, "Juergen Gross" <juergen.gross@fujitsu-siemens.com>
> wrote:
> 
>>> I haven't seen any win on any real world setup. So I remain unconvinced,
and
>>> it'll need more than you alone championing the patch to get it in. There
>>> have been no other general comments so far (Jan's have been about
specific
>>> details).
>> Okay, would the following scenario be "real world" enough?
>>
>> Multiple domUs being busy leading to enough vcpu scheduling, several
parallel
>> kernel builds in dom0 acting as benchmark.
> 
> Something like that would be better. Of course you'd need to measure work
> done in the domUs as well, as one of the critical factors for this patch
> would be how it affects fairness. It's one reason I'm leery of this patch
--
> our scheduler is unpredictable enough as it is without giving domains
> another lever to pull!

Keir, is the data I posted recently okay?
I think my approach requires less changes than the "yield after spin"
variant,
which needed more patches in the hypervisor and didn't seem to be settled.
Having my patches in the hypervisor at least would make life much easier for
our BS2000 system...
I would add some code to ensure a domain isn't misusing the new interface.

Juergen

-- 
Juergen Gross                             Principal Developer
IP SW OS6                      Telephone: +49 (0) 89 636 47950
Fujitsu Siemens Computers         e-mail: juergen.gross@fujitsu-siemens.com
Otto-Hahn-Ring 6                Internet: www.fujitsu-siemens.com
D-81739 Muenchen         Company details:
www.fujitsu-siemens.com/imprint.html

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xensource.com
http://lists.xensource.com/xen-devel

^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [Patch 2 of 2]: PV-domain SMP performance Linux-part
  2009-01-16  7:38                 ` Venefax
@ 2009-01-16  7:48                   ` Juergen Gross
  2009-01-16  7:57                     ` Venefax
  2009-01-16 10:16                     ` James Harper
  0 siblings, 2 replies; 37+ messages in thread
From: Juergen Gross @ 2009-01-16  7:48 UTC (permalink / raw)
  To: Venefax; +Cc: 'George Dunlap', xen-devel, 'Keir Fraser'

Venefax wrote:
> I can test it a real-world situation.
> I have SUSE 10-SP2 and have terrible performance issues with fully
> virtualized SMP machines. I had to start using Standard PC as HAL to avoid
> the penalty.
> Federico

Federico, thanks for your support. But my patches are for PV-domains (or at
least for domains with PV-drivers) only. I think you are using Windows as
guest, so you would have to build a XEN-aware HAL...

If you are testing with PV-domains, too, you are welcome, of course!


Juergen

-- 
Juergen Gross                             Principal Developer
IP SW OS6                      Telephone: +49 (0) 89 636 47950
Fujitsu Siemens Computers         e-mail: juergen.gross@fujitsu-siemens.com
Otto-Hahn-Ring 6                Internet: www.fujitsu-siemens.com
D-81739 Muenchen         Company details: www.fujitsu-siemens.com/imprint.html

^ permalink raw reply	[flat|nested] 37+ messages in thread

* RE: [Patch 2 of 2]: PV-domain SMP performance Linux-part
  2009-01-16  7:48                   ` Juergen Gross
@ 2009-01-16  7:57                     ` Venefax
  2009-01-16  8:19                       ` Juergen Gross
  2009-01-16 10:16                     ` James Harper
  1 sibling, 1 reply; 37+ messages in thread
From: Venefax @ 2009-01-16  7:57 UTC (permalink / raw)
  To: 'Juergen Gross'
  Cc: 'George Dunlap', xen-devel, 'Keir Fraser'

Just for my education and the rest of the list, what are you talking about?
What is a PV domain compared to a Windows guest? I use the GPLPV drivers
from James.

-----Original Message-----
From: Juergen Gross [mailto:juergen.gross@fujitsu-siemens.com] 
Sent: Friday, January 16, 2009 2:49 AM
To: Venefax
Cc: 'Keir Fraser'; 'George Dunlap'; xen-devel@lists.xensource.com
Subject: Re: [Xen-devel] [Patch 2 of 2]: PV-domain SMP performance
Linux-part

Venefax wrote:
> I can test it a real-world situation.
> I have SUSE 10-SP2 and have terrible performance issues with fully
> virtualized SMP machines. I had to start using Standard PC as HAL to avoid
> the penalty.
> Federico

Federico, thanks for your support. But my patches are for PV-domains (or at
least for domains with PV-drivers) only. I think you are using Windows as
guest, so you would have to build a XEN-aware HAL...

If you are testing with PV-domains, too, you are welcome, of course!


Juergen

-- 
Juergen Gross                             Principal Developer
IP SW OS6                      Telephone: +49 (0) 89 636 47950
Fujitsu Siemens Computers         e-mail: juergen.gross@fujitsu-siemens.com
Otto-Hahn-Ring 6                Internet: www.fujitsu-siemens.com
D-81739 Muenchen         Company details:
www.fujitsu-siemens.com/imprint.html

^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [Patch 2 of 2]: PV-domain SMP performance Linux-part
  2009-01-16  7:16               ` Juergen Gross
  2009-01-16  7:38                 ` Venefax
@ 2009-01-16  8:17                 ` Keir Fraser
  2009-01-16  9:36                   ` Juergen Gross
  1 sibling, 1 reply; 37+ messages in thread
From: Keir Fraser @ 2009-01-16  8:17 UTC (permalink / raw)
  To: Juergen Gross; +Cc: George Dunlap, xen-devel@lists.xensource.com

On 16/01/2009 07:16, "Juergen Gross" <juergen.gross@fujitsu-siemens.com>
wrote:

>> Something like that would be better. Of course you'd need to measure work
>> done in the domUs as well, as one of the critical factors for this patch
>> would be how it affects fairness. It's one reason I'm leery of this patch --
>> our scheduler is unpredictable enough as it is without giving domains
>> another lever to pull!
> 
> Keir, is the data I posted recently okay?
> I think my approach requires less changes than the "yield after spin" variant,
> which needed more patches in the hypervisor and didn't seem to be settled.
> Having my patches in the hypervisor at least would make life much easier for
> our BS2000 system...
> I would add some code to ensure a domain isn't misusing the new interface.

It didn't sound like there was much average difference between the two
approaches, also that George's patches may be going in anyway for general
scheduling stability reasons, and also that any other observed hiccups may
also simply point to limitations of the scheduler implementation which
George may look at further.

Do you have an explanation for why shell commands behave differently with
your patch, or alternatively why they can be delayed so long with the yield
approach?

The approach taken in Linux is not merely 'yield on spinlock' by the way, it
is 'block on event channel on spinlock' essentially turning a contended
spinlock into a sleeping mutex. I think that is quite different behaviour
from merely yielding, and expecting the scheduler to do something sensible
with your yield request.

Overall I think George should consider your patch as part of his overall
scheduler refurbishment work. I personally remain unconvinced that the
reactive approach cannot get predictable performance close to your approach,
and without needing new hypervisor interfaces.

 -- Keir

^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [Patch 2 of 2]: PV-domain SMP performance Linux-part
  2009-01-16  7:57                     ` Venefax
@ 2009-01-16  8:19                       ` Juergen Gross
  0 siblings, 0 replies; 37+ messages in thread
From: Juergen Gross @ 2009-01-16  8:19 UTC (permalink / raw)
  To: Venefax; +Cc: xen-devel

Venefax wrote:
> Just for my education and the rest of the list, what are you talking about?
> What is a PV domain compared to a Windows guest? I use the GPLPV drivers
> from James.

PV-domains are adapted to XEN in many aspects (memory management, I/O, trap
handling, ...). Instead of using the privileged x86-instructions they normally
call the hypervisor via hypercalls to perform privileged operations.

HVM-domains like Windows require virtualization support by the processor (VT-i
for INTEL, Pacifica for AMD). To boost I/O-performance, it is possible to use
PV-drivers which use a virtual PCI-device to do I/O, but most of the
privileged actions are performed as on native systems, trapping into the
hypervisor if necessary.

You could read some of the links in
http://wiki.xensource.com/xenwiki/XenArchitecture especially
http://wiki.xensource.com/xenwiki/XenArchitecture?action=AttachFile&do=get&target=Xen+Architecture_Q1+2008.pdf
http://www.linuxjournal.com/article/8540 and
http://www.linuxjournal.com/article/8909 might be useful.

My patches require special actions in the domain to avoid descheduling in
critical paths. In Windows this is normally an area covered by HAL, so you
would have to port HAL to be XEN-aware, which I think would be a task for
Microsoft to do...

Juergen

-- 
Juergen Gross                             Principal Developer
IP SW OS6                      Telephone: +49 (0) 89 636 47950
Fujitsu Siemens Computers         e-mail: juergen.gross@fujitsu-siemens.com
Otto-Hahn-Ring 6                Internet: www.fujitsu-siemens.com
D-81739 Muenchen         Company details: www.fujitsu-siemens.com/imprint.html

^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [Patch 2 of 2]: PV-domain SMP performance Linux-part
  2009-01-16  8:17                 ` Keir Fraser
@ 2009-01-16  9:36                   ` Juergen Gross
  2009-01-16  9:53                     ` Keir Fraser
  0 siblings, 1 reply; 37+ messages in thread
From: Juergen Gross @ 2009-01-16  9:36 UTC (permalink / raw)
  To: Keir Fraser; +Cc: George Dunlap, xen-devel@lists.xensource.com

Keir Fraser wrote:
> On 16/01/2009 07:16, "Juergen Gross" <juergen.gross@fujitsu-siemens.com>
> wrote:
> 
>>> Something like that would be better. Of course you'd need to measure work
>>> done in the domUs as well, as one of the critical factors for this patch
>>> would be how it affects fairness. It's one reason I'm leery of this patch --
>>> our scheduler is unpredictable enough as it is without giving domains
>>> another lever to pull!
>> Keir, is the data I posted recently okay?
>> I think my approach requires less changes than the "yield after spin" variant,
>> which needed more patches in the hypervisor and didn't seem to be settled.
>> Having my patches in the hypervisor at least would make life much easier for
>> our BS2000 system...
>> I would add some code to ensure a domain isn't misusing the new interface.
> 
> It didn't sound like there was much average difference between the two
> approaches, also that George's patches may be going in anyway for general
> scheduling stability reasons, and also that any other observed hiccups may
> also simply point to limitations of the scheduler implementation which
> George may look at further.

I think in extreme situations my approach will give better results.
The higher the number of vcpus the better it will be. Avoiding descheduling in
a critical path should always be preferred to a statistical search for the
processor locking a resource.

> 
> Do you have an explanation for why shell commands behave differently with
> your patch, or alternatively why they can be delayed so long with the yield
> approach?

No hard data. It must be related to the yield in my spinlock patch somehow,
as the problem did not occur with the same hypervisor and the "no deschedule"
patch in Linux. But the problem requires George's hypervisor patches to show
up.

> 
> The approach taken in Linux is not merely 'yield on spinlock' by the way, it
> is 'block on event channel on spinlock' essentially turning a contended
> spinlock into a sleeping mutex. I think that is quite different behaviour
> from merely yielding, and expecting the scheduler to do something sensible
> with your yield request.

Could you explain this a little bit more in detail, please?

> 
> Overall I think George should consider your patch as part of his overall
> scheduler refurbishment work. I personally remain unconvinced that the
> reactive approach cannot get predictable performance close to your approach,
> and without needing new hypervisor interfaces.

Perhaps a combination could be even better. My approach reduces latency while
holding a lock, while the 'block on event channel on spinlock' approach will
use the time of an otherwise spinning vcpu for productive work.


Juergen

-- 
Juergen Gross                             Principal Developer
IP SW OS6                      Telephone: +49 (0) 89 636 47950
Fujitsu Siemens Computers         e-mail: juergen.gross@fujitsu-siemens.com
Otto-Hahn-Ring 6                Internet: www.fujitsu-siemens.com
D-81739 Muenchen         Company details: www.fujitsu-siemens.com/imprint.html

^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [Patch 2 of 2]: PV-domain SMP performance Linux-part
  2009-01-16  9:36                   ` Juergen Gross
@ 2009-01-16  9:53                     ` Keir Fraser
  2009-01-16 17:41                       ` Jeremy Fitzhardinge
  0 siblings, 1 reply; 37+ messages in thread
From: Keir Fraser @ 2009-01-16  9:53 UTC (permalink / raw)
  To: Juergen Gross
  Cc: George Dunlap, Jeremy Fitzhardinge, xen-devel@lists.xensource.com

On 16/01/2009 09:36, "Juergen Gross" <juergen.gross@fujitsu-siemens.com>
wrote:

>> The approach taken in Linux is not merely 'yield on spinlock' by the way, it
>> is 'block on event channel on spinlock' essentially turning a contended
>> spinlock into a sleeping mutex. I think that is quite different behaviour
>> from merely yielding, and expecting the scheduler to do something sensible
>> with your yield request.
> 
> Could you explain this a little bit more in detail, please?

Jeremy Fitzhardinge did the implementation for Linux, so I'm cc'ing him in
case he remembers more details than me.

Basically each CPU allocates itself an IPI event channel at start of day.
When a CPU attempts to acquire a spinlock it spins a short while (perhaps a
few microseconds?) and then adds itself to a bitmap stored in the lock
structure (I think, or it might be a linked list of sleepers?). It then
calls SCHEDOP_poll listing its IPI evtchn as its wakeup requirement. When
the lock holder releases the lock it checks for sleepers and if it sees one
then it pings one of them (or is it all of them?) on its event channel, thus
waking it to take the lock.

 -- Keir

^ permalink raw reply	[flat|nested] 37+ messages in thread

* RE: [Patch 2 of 2]: PV-domain SMP performance Linux-part
  2009-01-16  7:48                   ` Juergen Gross
  2009-01-16  7:57                     ` Venefax
@ 2009-01-16 10:16                     ` James Harper
  2009-01-16 10:31                       ` Juergen Gross
                                         ` (2 more replies)
  1 sibling, 3 replies; 37+ messages in thread
From: James Harper @ 2009-01-16 10:16 UTC (permalink / raw)
  To: Juergen Gross, Venefax; +Cc: George Dunlap, xen-devel, Keir Fraser

> 
> Venefax wrote:
> > I can test it a real-world situation.
> > I have SUSE 10-SP2 and have terrible performance issues with fully
> > virtualized SMP machines. I had to start using Standard PC as HAL to
> avoid
> > the penalty.
> > Federico
> 
> Federico, thanks for your support. But my patches are for PV-domains
(or
> at
> least for domains with PV-drivers) only. I think you are using Windows
as
> guest, so you would have to build a XEN-aware HAL...
> 
> If you are testing with PV-domains, too, you are welcome, of course!
> 

Juergen,

Do you think your changes could be applicable to HVM domains with
appropriately patched kernel spinlock routines?

I had previously wondered about optimizing spinlocks, my idea was
basically for Xen to set a bit in a structure to indicate what vcpus are
currently scheduled, and my modified spinlock acquire routine would
check if the current vcpu wants a spinlock that is held by a currently
unscheduled vcpu, and if so yield to Xen to let the other vcpu schedule.

The only thing I would need from Xen is to know which vcpus were
currently scheduled, the rest would be DomU based.

Does that approximate what you do? I'll re-read your patch, I seem to
remember something about borrowing time from Xen to keep the vcpu a
little longer if a spinlock was held, so maybe you are taking a
proactive approach to my reactive approach?

The likelihood of this actually doing anything useful assumes that:
. Windows always uses the KeAcquireXxx and KeReleaseXxx calls and there
is no inlined spinlock access in the kernel (which would bypass my
hooks)
. That when Windows spins, it doesn't yield already
. That Xen actually deschedules a vcpu with a spinlock held often enough
for this to matter

Kernel patching only works on 32 bits though, so I'm not sure I'll
bother.

James 

^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [Patch 2 of 2]: PV-domain SMP performance Linux-part
  2009-01-16 10:16                     ` James Harper
@ 2009-01-16 10:31                       ` Juergen Gross
  2009-01-16 10:41                       ` Keir Fraser
  2009-01-16 17:43                       ` Jeremy Fitzhardinge
  2 siblings, 0 replies; 37+ messages in thread
From: Juergen Gross @ 2009-01-16 10:31 UTC (permalink / raw)
  To: James Harper; +Cc: xen-devel

James Harper wrote:
> Juergen,
> 
> Do you think your changes could be applicable to HVM domains with
> appropriately patched kernel spinlock routines?

Absolutely, yes.
I'm doing this with our BS2000-port to x86/XEN, which is running as a
HVM-domain using some PV-interfaces.

> 
> I had previously wondered about optimizing spinlocks, my idea was
> basically for Xen to set a bit in a structure to indicate what vcpus are
> currently scheduled, and my modified spinlock acquire routine would
> check if the current vcpu wants a spinlock that is held by a currently
> unscheduled vcpu, and if so yield to Xen to let the other vcpu schedule.
> 
> The only thing I would need from Xen is to know which vcpus were
> currently scheduled, the rest would be DomU based.
> 
> Does that approximate what you do? I'll re-read your patch, I seem to
> remember something about borrowing time from Xen to keep the vcpu a
> little longer if a spinlock was held, so maybe you are taking a
> proactive approach to my reactive approach?

Correct. I avoid losing the physical cpu in critical paths.
In my case I've used my new interface always when asynchronous interrupts
are disabled explicitly, as this is done always when entering critical
code paths.


Juergen

-- 
Juergen Gross                             Principal Developer
IP SW OS6                      Telephone: +49 (0) 89 636 47950
Fujitsu Siemens Computers         e-mail: juergen.gross@fujitsu-siemens.com
Otto-Hahn-Ring 6                Internet: www.fujitsu-siemens.com
D-81739 Muenchen         Company details: www.fujitsu-siemens.com/imprint.html

^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [Patch 2 of 2]: PV-domain SMP performance Linux-part
  2009-01-16 10:16                     ` James Harper
  2009-01-16 10:31                       ` Juergen Gross
@ 2009-01-16 10:41                       ` Keir Fraser
  2009-01-16 11:01                         ` James Harper
  2009-01-16 17:43                       ` Jeremy Fitzhardinge
  2 siblings, 1 reply; 37+ messages in thread
From: Keir Fraser @ 2009-01-16 10:41 UTC (permalink / raw)
  To: James Harper, Juergen Gross, Venefax; +Cc: George Dunlap, xen-devel

On 16/01/2009 10:16, "James Harper" <james.harper@bendigoit.com.au> wrote:

> I had previously wondered about optimizing spinlocks, my idea was
> basically for Xen to set a bit in a structure to indicate what vcpus are
> currently scheduled, and my modified spinlock acquire routine would
> check if the current vcpu wants a spinlock that is held by a currently
> unscheduled vcpu, and if so yield to Xen to let the other vcpu schedule.

That's a lot more like our existing Linux pv_ops spinlock handling
(yield/block instead of spin) than Juergen's patch (don't deschedule me
while in a critical section). The difference from what you suggest is that
we heuristically detect unscheduled lock holders by spinning a short while.

You can pv up your Windows spinlocks in the block-instead-of-spin way
already (and yield-instead-of-spin is obviously even easier).

 -- Keir

^ permalink raw reply	[flat|nested] 37+ messages in thread

* RE: [Patch 2 of 2]: PV-domain SMP performance Linux-part
  2009-01-16 10:41                       ` Keir Fraser
@ 2009-01-16 11:01                         ` James Harper
  2009-01-16 11:14                           ` Keir Fraser
                                             ` (2 more replies)
  0 siblings, 3 replies; 37+ messages in thread
From: James Harper @ 2009-01-16 11:01 UTC (permalink / raw)
  To: Keir Fraser, Juergen Gross, Venefax; +Cc: George Dunlap, xen-devel

> On 16/01/2009 10:16, "James Harper" <james.harper@bendigoit.com.au>
wrote:
> 
> > I had previously wondered about optimizing spinlocks, my idea was
> > basically for Xen to set a bit in a structure to indicate what vcpus
are
> > currently scheduled, and my modified spinlock acquire routine would
> > check if the current vcpu wants a spinlock that is held by a
currently
> > unscheduled vcpu, and if so yield to Xen to let the other vcpu
schedule.
> 
> That's a lot more like our existing Linux pv_ops spinlock handling
> (yield/block instead of spin) than Juergen's patch (don't deschedule
me
> while in a critical section). The difference from what you suggest is
that
> we heuristically detect unscheduled lock holders by spinning a short
> while.
> 
> You can pv up your Windows spinlocks in the block-instead-of-spin way
> already (and yield-instead-of-spin is obviously even easier).
> 

But only in spinlocks that I 'own' completely right? I'm more concerned
about spinlocks that I share with Windows (eg in NDIS).

James

^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [Patch 2 of 2]: PV-domain SMP performance Linux-part
  2009-01-16 11:01                         ` James Harper
@ 2009-01-16 11:14                           ` Keir Fraser
  2009-01-16 11:18                           ` Jan Beulich
  2009-01-16 14:40                           ` Steve Prochniak
  2 siblings, 0 replies; 37+ messages in thread
From: Keir Fraser @ 2009-01-16 11:14 UTC (permalink / raw)
  To: James Harper, Juergen Gross, Venefax; +Cc: George Dunlap, xen-devel

On 16/01/2009 11:01, "James Harper" <james.harper@bendigoit.com.au> wrote:

> But only in spinlocks that I 'own' completely right? I'm more concerned
> about spinlocks that I share with Windows (eg in NDIS).

So only some critical sections will be pv'ed? Then the best you can
currently do is yield-on-spin in your custom spin_lock().

 -- Keir

^ permalink raw reply	[flat|nested] 37+ messages in thread

* RE: [Patch 2 of 2]: PV-domain SMP performance Linux-part
  2009-01-16 11:01                         ` James Harper
  2009-01-16 11:14                           ` Keir Fraser
@ 2009-01-16 11:18                           ` Jan Beulich
  2009-01-16 14:40                           ` Steve Prochniak
  2 siblings, 0 replies; 37+ messages in thread
From: Jan Beulich @ 2009-01-16 11:18 UTC (permalink / raw)
  To: James Harper
  Cc: George Dunlap, xen-devel, Juergen Gross, Keir Fraser, Venefax

>>> "James Harper" <james.harper@bendigoit.com.au> 16.01.09 12:01 >>>
>> You can pv up your Windows spinlocks in the block-instead-of-spin way
>> already (and yield-instead-of-spin is obviously even easier).
>> 
>
>But only in spinlocks that I 'own' completely right? I'm more concerned
>about spinlocks that I share with Windows (eg in NDIS).

As long as you can hook the respective OS interface (and you know it is
always used), you could do this on all spinlocks, since all that's needed is
logic in the acquire/release code (unless you want to add state to each
lock [like the spinning CPUs bitmap Keir mentioned], but that shouldn't
be necessary to achieve the intended effect).

Jan

^ permalink raw reply	[flat|nested] 37+ messages in thread

* RE: [Patch 2 of 2]: PV-domain SMP performance Linux-part
  2009-01-16 11:01                         ` James Harper
  2009-01-16 11:14                           ` Keir Fraser
  2009-01-16 11:18                           ` Jan Beulich
@ 2009-01-16 14:40                           ` Steve Prochniak
  2 siblings, 0 replies; 37+ messages in thread
From: Steve Prochniak @ 2009-01-16 14:40 UTC (permalink / raw)
  To: James Harper, Keir Fraser, Juergen Gross, Venefax
  Cc: George Dunlap, xen-devel

'enlightened' windows OSs have SpinLock routines that make a hypercall
to yield CPU to another VM.  So if you comply with the windows
hypervisor spec, you can get a performance boost when virtualized.  This
doesn't help you out with Pre - vista versions though...  Look at the
disassembly for KfAcquireSpinLock.

-----Original Message-----
From: xen-devel-bounces@lists.xensource.com
[mailto:xen-devel-bounces@lists.xensource.com] On Behalf Of James Harper
Sent: Friday, January 16, 2009 6:02 AM
To: Keir Fraser; Juergen Gross; Venefax
Cc: George Dunlap; xen-devel@lists.xensource.com
Subject: RE: [Xen-devel] [Patch 2 of 2]: PV-domain SMP performance
Linux-part

> On 16/01/2009 10:16, "James Harper" <james.harper@bendigoit.com.au>
wrote:
> 
> > I had previously wondered about optimizing spinlocks, my idea was
> > basically for Xen to set a bit in a structure to indicate what vcpus
are
> > currently scheduled, and my modified spinlock acquire routine would
> > check if the current vcpu wants a spinlock that is held by a
currently
> > unscheduled vcpu, and if so yield to Xen to let the other vcpu
schedule.
> 
> That's a lot more like our existing Linux pv_ops spinlock handling
> (yield/block instead of spin) than Juergen's patch (don't deschedule
me
> while in a critical section). The difference from what you suggest is
that
> we heuristically detect unscheduled lock holders by spinning a short
> while.
> 
> You can pv up your Windows spinlocks in the block-instead-of-spin way
> already (and yield-instead-of-spin is obviously even easier).
> 

But only in spinlocks that I 'own' completely right? I'm more concerned
about spinlocks that I share with Windows (eg in NDIS).

James

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xensource.com
http://lists.xensource.com/xen-devel

^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [Patch 2 of 2]: PV-domain SMP performance Linux-part
  2009-01-16  9:53                     ` Keir Fraser
@ 2009-01-16 17:41                       ` Jeremy Fitzhardinge
  2009-01-19 17:15                         ` George Dunlap
  0 siblings, 1 reply; 37+ messages in thread
From: Jeremy Fitzhardinge @ 2009-01-16 17:41 UTC (permalink / raw)
  To: Keir Fraser; +Cc: George Dunlap, Juergen Gross, xen-devel@lists.xensource.com

Keir Fraser wrote:
> On 16/01/2009 09:36, "Juergen Gross" <juergen.gross@fujitsu-siemens.com>
> wrote:
>
>   
>>> The approach taken in Linux is not merely 'yield on spinlock' by the way, it
>>> is 'block on event channel on spinlock' essentially turning a contended
>>> spinlock into a sleeping mutex. I think that is quite different behaviour
>>> from merely yielding, and expecting the scheduler to do something sensible
>>> with your yield request.
>>>       
>> Could you explain this a little bit more in detail, please?
>>     
>
> Jeremy Fitzhardinge did the implementation for Linux, so I'm cc'ing him in
> case he remembers more details than me.
>
> Basically each CPU allocates itself an IPI event channel at start of day.
> When a CPU attempts to acquire a spinlock it spins a short while (perhaps a
> few microseconds?) and then adds itself to a bitmap stored in the lock
> structure (I think, or it might be a linked list of sleepers?). It then
> calls SCHEDOP_poll listing its IPI evtchn as its wakeup requirement. When
> the lock holder releases the lock it checks for sleepers and if it sees one
> then it pings one of them (or is it all of them?) on its event channel, thus
> waking it to take the lock.
>   

Yes, that's more or less right.  Each lock has a count of how many cpus 
are waiting for the lock; if its non-zero on unlock, the unlocker kicks 
all the waiting cpus via IPI.  There's a per-cpu variable of "lock I am 
waiting for"; the kicker looks at each cpu's entry and kicks it if its 
waiting for the lock being unlocked.

The locking side does the expected "spin for a while, then block on 
timeout".  The timeout is settable if you have the appropriate debugfs 
option enabled (which also produces quite a lot of detailed stats about 
locking behaviour).  The IPI is never delivered as an event BTW; the 
locker uses the event poll hypercall to block until the event is pending 
(this hypercall had some performance problems until relatively recent 
versions of Xen; I'm not sure which release versions has the fix).

The lock itself is a simple byte spinlock, with no fairness guarantees; 
I'm assuming (hoping) that the pathological cases that ticket locks were 
introduced to solve will be mitigated by the timeout/blocking path 
(and/or less likely in a virtual environment anyway).

I measured a small performance improvement within the domain with this 
patch (kernbench-type workload), but an overall 10% reduction in 
system-wide CPU use with multiple competing domains.

    J

^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [Patch 2 of 2]: PV-domain SMP performance Linux-part
  2009-01-16 10:16                     ` James Harper
  2009-01-16 10:31                       ` Juergen Gross
  2009-01-16 10:41                       ` Keir Fraser
@ 2009-01-16 17:43                       ` Jeremy Fitzhardinge
  2 siblings, 0 replies; 37+ messages in thread
From: Jeremy Fitzhardinge @ 2009-01-16 17:43 UTC (permalink / raw)
  To: James Harper
  Cc: George Dunlap, Juergen Gross, Keir Fraser, xen-devel, Venefax

James Harper wrote:
> Do you think your changes could be applicable to HVM domains with
> appropriately patched kernel spinlock routines?
>
> I had previously wondered about optimizing spinlocks, my idea was
> basically for Xen to set a bit in a structure to indicate what vcpus are
> currently scheduled, and my modified spinlock acquire routine would
> check if the current vcpu wants a spinlock that is held by a currently
> unscheduled vcpu, and if so yield to Xen to let the other vcpu schedule.
>
> The only thing I would need from Xen is to know which vcpus were
> currently scheduled, the rest would be DomU based.
>   

In a PV domain you can already get that information from the 
runstate_info structure, which can be mapped into the domain's memory 
and just read directly.  I don't know if its available to an hvm domain, 
but I don't think it would be hard to implement if it isn't.

    J

^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [Patch 2 of 2]: PV-domain SMP performance Linux-part
  2009-01-16 17:41                       ` Jeremy Fitzhardinge
@ 2009-01-19 17:15                         ` George Dunlap
  2009-01-20 20:12                           ` Jeremy Fitzhardinge
  0 siblings, 1 reply; 37+ messages in thread
From: George Dunlap @ 2009-01-19 17:15 UTC (permalink / raw)
  To: Jeremy Fitzhardinge
  Cc: xen-devel@lists.xensource.com, Juergen Gross, Keir Fraser

On Fri, Jan 16, 2009 at 5:41 PM, Jeremy Fitzhardinge <jeremy@goop.org> wrote:
> Yes, that's more or less right.  Each lock has a count of how many cpus are
> waiting for the lock; if its non-zero on unlock, the unlocker kicks all the
> waiting cpus via IPI.  There's a per-cpu variable of "lock I am waiting
> for"; the kicker looks at each cpu's entry and kicks it if its waiting for
> the lock being unlocked.
>
> The locking side does the expected "spin for a while, then block on
> timeout".  The timeout is settable if you have the appropriate debugfs
> option enabled (which also produces quite a lot of detailed stats about
> locking behaviour).  The IPI is never delivered as an event BTW; the locker
> uses the event poll hypercall to block until the event is pending (this
> hypercall had some performance problems until relatively recent versions of
> Xen; I'm not sure which release versions has the fix).
>
> The lock itself is a simple byte spinlock, with no fairness guarantees; I'm
> assuming (hoping) that the pathological cases that ticket locks were
> introduced to solve will be mitigated by the timeout/blocking path (and/or
> less likely in a virtual environment anyway).
>
> I measured a small performance improvement within the domain with this patch
> (kernbench-type workload), but an overall 10% reduction in system-wide CPU
> use with multiple competing domains.

This is in the pv-ops kernel; is it in the Xen 2.6.18 kernel yet?

The advantage of the block approach over yielding is that you don't
have these crazy priority problems:  The reason v0 (who is waiting for
the spinlock) is running right now and v1 (which holds the spinlock)
is not is usually because v1 is out of credits and v0 isn't; so
calling "schedule" often just results in v0 being chosen as the "best
candidate" over again.  The solution in the patch I sent is to
temporarily reduce the priority on a yield; but that's inherently a
little unpredictable.  (Another option might be to re-balance credits
to vcpus on a yield.)

The disadvantage of this approach is that it is rather complicated,
and would have to be re-implemented for each OS.  In theory it should
be able to be implemented in Windows, but it may not be that simple.
And it's got to be implemented all-or-nothing for each spinlock; i.e.,
if any caller of the spin_lock() for a given lock blocks, all callers
of spin_unlock() on that lock need to know to wake the blocker up.  I
don't expect that to be a problem in Windows, but it may be.

Another thing to consider is how the approach applies to a related
problem, that of "syncronous" IPI function calls: i.e., when v0 sends
an IPI to v1 to do something, and spins waiting for it to be done,
expecting it to be finished pretty quickly.  But v1 is over credits,
so it doesn't get to run, and v0 burns its credits waiting.

At any rate, I'm working on the scheduler now, and I'll be considering
the "don't-deschedule" option in due time. :-)

Peace,
 -George

^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [Patch 2 of 2]: PV-domain SMP performance Linux-part
  2009-01-12 12:55                 ` Juergen Gross
@ 2009-01-19 17:32                   ` George Dunlap
  2009-01-20  7:56                     ` Juergen Gross
  0 siblings, 1 reply; 37+ messages in thread
From: George Dunlap @ 2009-01-19 17:32 UTC (permalink / raw)
  To: Juergen Gross; +Cc: xen-devel@lists.xensource.com, Keir Fraser

On Mon, Jan 12, 2009 at 12:55 PM, Juergen Gross
<juergen.gross@fujitsu-siemens.com> wrote:
> Conclusion:
> -----------
> Differences not really big, but my "no deschedule" patch had least elapsed
> time for build-jobs, while scp was able to transfer same amount of data as
> in slower original system.
> The "Yield in spinlock" patch had slightly better dbench performance, but
> interactive shell commands were a pain sometimes! I suspect some problem in
> George's patches during low system load to be the main reason for this
> behaviour. Without George's patches the "Yield in spinlock" was very similar
> to the original system.

Hmm, the shell performance is a little worrying.  There may be
something strange going on...

Without my patches (at least, without the "yield reduces priority"
patch), "yield" is basically a no-op, so "yield in spinlock" is
functionally equivalent to the original system.

According to your numbers, the "user time" and "system time" were
exactly the same (only 0.6 seconds longer on system time), even though
the overall build took 52 seconds longer.  Is it possible that the
"yield" patches actually made it run less often?

scp works over tcp, which is often sensitive to latency; so it's
possible that the lowered priority on yield caused "hiccoughs", both
in the scp connections, and the interactive shell performance.

Anyway, I'll be looking into it after doing a scheduler update.

Peace,
 -George

^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [Patch 2 of 2]: PV-domain SMP performance Linux-part
  2009-01-19 17:32                   ` George Dunlap
@ 2009-01-20  7:56                     ` Juergen Gross
  0 siblings, 0 replies; 37+ messages in thread
From: Juergen Gross @ 2009-01-20  7:56 UTC (permalink / raw)
  To: George Dunlap; +Cc: xen-devel@lists.xensource.com, Keir Fraser

George Dunlap wrote:
> On Mon, Jan 12, 2009 at 12:55 PM, Juergen Gross
> <juergen.gross@fujitsu-siemens.com> wrote:
>> Conclusion:
>> -----------
>> Differences not really big, but my "no deschedule" patch had least elapsed
>> time for build-jobs, while scp was able to transfer same amount of data as
>> in slower original system.
>> The "Yield in spinlock" patch had slightly better dbench performance, but
>> interactive shell commands were a pain sometimes! I suspect some problem in
>> George's patches during low system load to be the main reason for this
>> behaviour. Without George's patches the "Yield in spinlock" was very similar
>> to the original system.
> 
> Hmm, the shell performance is a little worrying.  There may be
> something strange going on...
> 
> Without my patches (at least, without the "yield reduces priority"
> patch), "yield" is basically a no-op, so "yield in spinlock" is
> functionally equivalent to the original system.
> 
> According to your numbers, the "user time" and "system time" were
> exactly the same (only 0.6 seconds longer on system time), even though
> the overall build took 52 seconds longer.  Is it possible that the
> "yield" patches actually made it run less often?

I assume this could be possible. Do you think the following is reasonable?

If a vcpu is waiting for a lock it will yield, which will reduce it's
priority. This will increase the latency, if other vcpus are ready to run.
Normally the vcpu waiting for a lock is in some kind of critical path which
might be delayed significantly by the yield.
In sum the complete machine isn't burning cycles spinning, but the code
paths with lock conflicts are the loosers...

> 
> scp works over tcp, which is often sensitive to latency; so it's
> possible that the lowered priority on yield caused "hiccoughs", both
> in the scp connections, and the interactive shell performance.

This sounds reasonable to me.

Juergen

-- 
Juergen Gross                             Principal Developer
IP SW OS6                      Telephone: +49 (0) 89 636 47950
Fujitsu Siemens Computers         e-mail: juergen.gross@fujitsu-siemens.com
Otto-Hahn-Ring 6                Internet: www.fujitsu-siemens.com
D-81739 Muenchen         Company details: www.fujitsu-siemens.com/imprint.html

^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [Patch 2 of 2]: PV-domain SMP performance Linux-part
  2009-01-19 17:15                         ` George Dunlap
@ 2009-01-20 20:12                           ` Jeremy Fitzhardinge
  0 siblings, 0 replies; 37+ messages in thread
From: Jeremy Fitzhardinge @ 2009-01-20 20:12 UTC (permalink / raw)
  To: George Dunlap; +Cc: xen-devel@lists.xensource.com, Juergen Gross, Keir Fraser

George Dunlap wrote:
> On Fri, Jan 16, 2009 at 5:41 PM, Jeremy Fitzhardinge <jeremy@goop.org> wrote:
>   
>> Yes, that's more or less right.  Each lock has a count of how many cpus are
>> waiting for the lock; if its non-zero on unlock, the unlocker kicks all the
>> waiting cpus via IPI.  There's a per-cpu variable of "lock I am waiting
>> for"; the kicker looks at each cpu's entry and kicks it if its waiting for
>> the lock being unlocked.
>>
>> The locking side does the expected "spin for a while, then block on
>> timeout".  The timeout is settable if you have the appropriate debugfs
>> option enabled (which also produces quite a lot of detailed stats about
>> locking behaviour).  The IPI is never delivered as an event BTW; the locker
>> uses the event poll hypercall to block until the event is pending (this
>> hypercall had some performance problems until relatively recent versions of
>> Xen; I'm not sure which release versions has the fix).
>>
>> The lock itself is a simple byte spinlock, with no fairness guarantees; I'm
>> assuming (hoping) that the pathological cases that ticket locks were
>> introduced to solve will be mitigated by the timeout/blocking path (and/or
>> less likely in a virtual environment anyway).
>>
>> I measured a small performance improvement within the domain with this patch
>> (kernbench-type workload), but an overall 10% reduction in system-wide CPU
>> use with multiple competing domains.
>>     
>
> This is in the pv-ops kernel; is it in the Xen 2.6.18 kernel yet?
>   

Yes.  No plans to backport.

> Another thing to consider is how the approach applies to a related
> problem, that of "syncronous" IPI function calls: i.e., when v0 sends
> an IPI to v1 to do something, and spins waiting for it to be done,
> expecting it to be finished pretty quickly.  But v1 is over credits,
> so it doesn't get to run, and v0 burns its credits waiting.
>   

Yes.  Some kind of direct yield might work in that case.  In practice it 
hasn't been a huge problem in Linux because most synchronous IPIs are 
for cross-cpu TLB flushes, which we use a hypercall for anyway.

    J

^ permalink raw reply	[flat|nested] 37+ messages in thread

end of thread, other threads:[~2009-01-20 20:12 UTC | newest]

Thread overview: 37+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2008-12-17 12:22 [Patch 2 of 2]: PV-domain SMP performance Linux-part Juergen Gross
2008-12-17 15:06 ` Jan Beulich
2008-12-18  7:18   ` Juergen Gross
2008-12-18  7:41     ` Jan Beulich
2008-12-19  8:12       ` Juergen Gross
2008-12-19  9:10         ` Keir Fraser
2008-12-19  9:25           ` Juergen Gross
2008-12-19  9:56             ` Keir Fraser
2009-01-16  7:16               ` Juergen Gross
2009-01-16  7:38                 ` Venefax
2009-01-16  7:48                   ` Juergen Gross
2009-01-16  7:57                     ` Venefax
2009-01-16  8:19                       ` Juergen Gross
2009-01-16 10:16                     ` James Harper
2009-01-16 10:31                       ` Juergen Gross
2009-01-16 10:41                       ` Keir Fraser
2009-01-16 11:01                         ` James Harper
2009-01-16 11:14                           ` Keir Fraser
2009-01-16 11:18                           ` Jan Beulich
2009-01-16 14:40                           ` Steve Prochniak
2009-01-16 17:43                       ` Jeremy Fitzhardinge
2009-01-16  8:17                 ` Keir Fraser
2009-01-16  9:36                   ` Juergen Gross
2009-01-16  9:53                     ` Keir Fraser
2009-01-16 17:41                       ` Jeremy Fitzhardinge
2009-01-19 17:15                         ` George Dunlap
2009-01-20 20:12                           ` Jeremy Fitzhardinge
2008-12-19  9:33           ` Jan Beulich
2008-12-19  9:56             ` Keir Fraser
2008-12-19 15:15               ` George Dunlap
2009-01-12 12:55                 ` Juergen Gross
2009-01-19 17:32                   ` George Dunlap
2009-01-20  7:56                     ` Juergen Gross
2008-12-19 10:06             ` Juergen Gross
2008-12-19 10:36               ` Jan Beulich
2008-12-19 10:42                 ` Juergen Gross
2008-12-19 10:48                 ` Juergen Gross

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.