* [PATCH][KVM] Add support for Pause Filtering to AMD SVM
@ 2009-05-05 14:09 Mark Langsdorf
2009-05-05 16:05 ` Bert Wesarg
` (2 more replies)
0 siblings, 3 replies; 44+ messages in thread
From: Mark Langsdorf @ 2009-05-05 14:09 UTC (permalink / raw)
To: joerg.roedel, linux-kernel
commit 6f15c833f56267baf5abdd0fbc90a81489573053
Author: Mark Langsdorf <mlangsdo@wshpnow.amd.com>
Date: Mon May 4 15:02:38 2009 -0500
New AMD processors will support the Pause Filter Feature.
This feature creates a new field in the VMCB called Pause
Filter Count. If Pause Filter Count is greater than 0 and
intercepting PAUSEs is enabled, the processor will increment
an internal counter when a PAUSE instruction occurs instead
of intercepting. When the internal counter reaches the
Pause Filter Count value, a PAUSE intercept will occur.
This feature can be used to detect contended spinlocks,
especially when the lock holding VCPU is not scheduled.
Rescheduling another VCPU prevents the VCPU seeking the
lock from wasting its quantum by spinning idly.
Experimental results show that most spinlocks are held
for less than 1000 PAUSE cycles or more than a few
thousand. Default the Pause Filter Counter to 3000 to
detect the contended spinlocks.
Processor support for this feature is indicated by a CPUID
bit.
On a 24 core system running 4 guests each with 16 VCPUs,
this patch improved overall performance of each guest's
32 job kernbench by approximately 1%. Further performance
improvement may be possible with a more sophisticated
yield algorithm.
-Mark Langsdorf
Operating System Research Center
AMD
Signed-off-by: Mark Langsdorf <mark.langsdorf@amd.com>
diff --git a/arch/x86/include/asm/svm.h b/arch/x86/include/asm/svm.h
index 85574b7..1fecb7e 100644
--- a/arch/x86/include/asm/svm.h
+++ b/arch/x86/include/asm/svm.h
@@ -57,7 +57,8 @@ struct __attribute__ ((__packed__)) vmcb_control_area {
u16 intercept_dr_write;
u32 intercept_exceptions;
u64 intercept;
- u8 reserved_1[44];
+ u8 reserved_1[42];
+ u16 pause_filter_count;
u64 iopm_base_pa;
u64 msrpm_base_pa;
u64 tsc_offset;
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index ef43a18..14dab13 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -45,6 +45,7 @@ MODULE_LICENSE("GPL");
#define SVM_FEATURE_NPT (1 << 0)
#define SVM_FEATURE_LBRV (1 << 1)
#define SVM_FEATURE_SVML (1 << 2)
+#define SVM_FEATURE_PAUSE_FILTER (1 << 10)
#define DEBUGCTL_RESERVED_BITS (~(0x3fULL))
@@ -575,6 +576,12 @@ static void init_vmcb(struct vcpu_svm *svm)
svm->nested_vmcb = 0;
svm->vcpu.arch.hflags = HF_GIF_MASK;
+
+ if (svm_has(SVM_FEATURE_PAUSE_FILTER)) {
+ control->pause_filter_count = 5000;
+ control->intercept |= (1ULL << INTERCEPT_PAUSE);
+ }
+
}
static int svm_vcpu_reset(struct kvm_vcpu *vcpu)
@@ -2087,6 +2094,15 @@ static int interrupt_window_interception(struct vcpu_svm *svm,
return 1;
}
+static int pause_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
+{
+ /* Simple yield */
+ vcpu_put(&svm->vcpu);
+ schedule();
+ vcpu_load(&svm->vcpu);
+ return 1;
+}
+
static int (*svm_exit_handlers[])(struct vcpu_svm *svm,
struct kvm_run *kvm_run) = {
[SVM_EXIT_READ_CR0] = emulate_on_interception,
@@ -2123,6 +2139,7 @@ static int (*svm_exit_handlers[])(struct vcpu_svm *svm,
[SVM_EXIT_CPUID] = cpuid_interception,
[SVM_EXIT_IRET] = iret_interception,
[SVM_EXIT_INVD] = emulate_on_interception,
+ [SVM_EXIT_PAUSE] = pause_interception,
[SVM_EXIT_HLT] = halt_interception,
[SVM_EXIT_INVLPG] = invlpg_interception,
[SVM_EXIT_INVLPGA] = invalid_op_interception,
@@ -2227,6 +2244,7 @@ static void pre_svm_run(struct vcpu_svm *svm)
if (svm->vcpu.cpu != cpu ||
svm->asid_generation != svm_data->asid_generation)
new_asid(svm, svm_data);
+
}
static void svm_drop_interrupt_shadow(struct kvm_vcpu *vcpu)
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 2b73e19..e2b730d 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -710,6 +710,7 @@ void vcpu_load(struct kvm_vcpu *vcpu)
kvm_arch_vcpu_load(vcpu, cpu);
put_cpu();
}
+EXPORT_SYMBOL_GPL(vcpu_load);
void vcpu_put(struct kvm_vcpu *vcpu)
{
@@ -719,6 +720,7 @@ void vcpu_put(struct kvm_vcpu *vcpu)
preempt_enable();
mutex_unlock(&vcpu->mutex);
}
+EXPORT_SYMBOL_GPL(vcpu_put);
static void ack_flush(void *_completed)
{
* Re: [PATCH][KVM] Add support for Pause Filtering to AMD SVM
2009-05-05 14:09 [PATCH][KVM] Add support for Pause Filtering to AMD SVM Mark Langsdorf
@ 2009-05-05 16:05 ` Bert Wesarg
2009-05-07 13:55 ` Joerg Roedel
2009-05-11 14:38 ` [PATCH][KVM] " Peter Zijlstra
2 siblings, 0 replies; 44+ messages in thread
From: Bert Wesarg @ 2009-05-05 16:05 UTC (permalink / raw)
To: Mark Langsdorf; +Cc: joerg.roedel, linux-kernel
On Tue, May 5, 2009 at 16:09, Mark Langsdorf <mark.langsdorf@amd.com> wrote:
> commit 6f15c833f56267baf5abdd0fbc90a81489573053
> Author: Mark Langsdorf <mlangsdo@wshpnow.amd.com>
> Date: Mon May 4 15:02:38 2009 -0500
>
> New AMD processors will support the Pause Filter Feature.
> This feature creates a new field in the VMCB called Pause
> Filter Count. If Pause Filter Count is greater than 0 and
> intercepting PAUSEs is enabled, the processor will increment
> an internal counter when a PAUSE instruction occurs instead
> of intercepting. When the internal counter reaches the
> Pause Filter Count value, a PAUSE intercept will occur.
>
> This feature can be used to detect contended spinlocks,
> especially when the lock holding VCPU is not scheduled.
> Rescheduling another VCPU prevents the VCPU seeking the
> lock from wasting its quantum by spinning idly.
>
> Experimental results show that most spinlocks are held
> for less than 1000 PAUSE cycles or more than a few
> thousand. Default the Pause Filter Counter to 3000 to
> detect the contended spinlocks.
>
> Processor support for this feature is indicated by a CPUID
> bit.
>
> On a 24 core system running 4 guests each with 16 VCPUs,
> this patch improved overall performance of each guest's
> 32 job kernbench by approximately 1%. Further performance
> improvement may be possible with a more sophisticated
> yield algorithm.
>
> -Mark Langsdorf
> Operating System Research Center
> AMD
>
> Signed-off-by: Mark Langsdorf <mark.langsdorf@amd.com>
>
> diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
> index ef43a18..14dab13 100644
> --- a/arch/x86/kvm/svm.c
> +++ b/arch/x86/kvm/svm.c
> @@ -575,6 +576,12 @@ static void init_vmcb(struct vcpu_svm *svm)
>
> svm->nested_vmcb = 0;
> svm->vcpu.arch.hflags = HF_GIF_MASK;
> +
> + if (svm_has(SVM_FEATURE_PAUSE_FILTER)) {
> + control->pause_filter_count = 5000;
The commit message says something about 3000.
> + control->intercept |= (1ULL << INTERCEPT_PAUSE);
> + }
> +
> }
>
> static int svm_vcpu_reset(struct kvm_vcpu *vcpu)
* Re: [PATCH][KVM] Add support for Pause Filtering to AMD SVM
2009-05-05 14:09 [PATCH][KVM] Add support for Pause Filtering to AMD SVM Mark Langsdorf
2009-05-05 16:05 ` Bert Wesarg
@ 2009-05-07 13:55 ` Joerg Roedel
2009-05-07 15:00 ` [PATCH][KVM][retry 1] " Mark Langsdorf
2009-05-11 14:38 ` [PATCH][KVM] " Peter Zijlstra
2 siblings, 1 reply; 44+ messages in thread
From: Joerg Roedel @ 2009-05-07 13:55 UTC (permalink / raw)
To: Mark Langsdorf; +Cc: linux-kernel
Besides the small style problem, the patch looks good. Please remove this
empty line from the patch and resubmit. Please also add avi@redhat.com
and kvm@vger.kernel.org to the CC list.
Joerg
On Tue, May 05, 2009 at 09:09:58AM -0500, Mark Langsdorf wrote:
> commit 6f15c833f56267baf5abdd0fbc90a81489573053
> Author: Mark Langsdorf <mlangsdo@wshpnow.amd.com>
> Date: Mon May 4 15:02:38 2009 -0500
>
> New AMD processors will support the Pause Filter Feature.
> This feature creates a new field in the VMCB called Pause
> Filter Count. If Pause Filter Count is greater than 0 and
> intercepting PAUSEs is enabled, the processor will increment
> an internal counter when a PAUSE instruction occurs instead
> of intercepting. When the internal counter reaches the
> Pause Filter Count value, a PAUSE intercept will occur.
>
> This feature can be used to detect contended spinlocks,
> especially when the lock holding VCPU is not scheduled.
> Rescheduling another VCPU prevents the VCPU seeking the
> lock from wasting its quantum by spinning idly.
>
> Experimental results show that most spinlocks are held
> for less than 1000 PAUSE cycles or more than a few
> thousand. Default the Pause Filter Counter to 3000 to
> detect the contended spinlocks.
>
> Processor support for this feature is indicated by a CPUID
> bit.
>
> On a 24 core system running 4 guests each with 16 VCPUs,
> this patch improved overall performance of each guest's
> 32 job kernbench by approximately 1%. Further performance
> improvement may be possible with a more sophisticated
> yield algorithm.
>
> -Mark Langsdorf
> Operating System Research Center
> AMD
>
> Signed-off-by: Mark Langsdorf <mark.langsdorf@amd.com>
>
> diff --git a/arch/x86/include/asm/svm.h b/arch/x86/include/asm/svm.h
> index 85574b7..1fecb7e 100644
> --- a/arch/x86/include/asm/svm.h
> +++ b/arch/x86/include/asm/svm.h
> @@ -57,7 +57,8 @@ struct __attribute__ ((__packed__)) vmcb_control_area {
> u16 intercept_dr_write;
> u32 intercept_exceptions;
> u64 intercept;
> - u8 reserved_1[44];
> + u8 reserved_1[42];
> + u16 pause_filter_count;
> u64 iopm_base_pa;
> u64 msrpm_base_pa;
> u64 tsc_offset;
> diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
> index ef43a18..14dab13 100644
> --- a/arch/x86/kvm/svm.c
> +++ b/arch/x86/kvm/svm.c
> @@ -45,6 +45,7 @@ MODULE_LICENSE("GPL");
> #define SVM_FEATURE_NPT (1 << 0)
> #define SVM_FEATURE_LBRV (1 << 1)
> #define SVM_FEATURE_SVML (1 << 2)
> +#define SVM_FEATURE_PAUSE_FILTER (1 << 10)
>
> #define DEBUGCTL_RESERVED_BITS (~(0x3fULL))
>
> @@ -575,6 +576,12 @@ static void init_vmcb(struct vcpu_svm *svm)
>
> svm->nested_vmcb = 0;
> svm->vcpu.arch.hflags = HF_GIF_MASK;
> +
> + if (svm_has(SVM_FEATURE_PAUSE_FILTER)) {
> + control->pause_filter_count = 5000;
> + control->intercept |= (1ULL << INTERCEPT_PAUSE);
> + }
> +
> }
>
> static int svm_vcpu_reset(struct kvm_vcpu *vcpu)
> @@ -2087,6 +2094,15 @@ static int interrupt_window_interception(struct vcpu_svm *svm,
> return 1;
> }
>
> +static int pause_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
> +{
> + /* Simple yield */
> + vcpu_put(&svm->vcpu);
> + schedule();
> + vcpu_load(&svm->vcpu);
> + return 1;
> +}
> +
> static int (*svm_exit_handlers[])(struct vcpu_svm *svm,
> struct kvm_run *kvm_run) = {
> [SVM_EXIT_READ_CR0] = emulate_on_interception,
> @@ -2123,6 +2139,7 @@ static int (*svm_exit_handlers[])(struct vcpu_svm *svm,
> [SVM_EXIT_CPUID] = cpuid_interception,
> [SVM_EXIT_IRET] = iret_interception,
> [SVM_EXIT_INVD] = emulate_on_interception,
> + [SVM_EXIT_PAUSE] = pause_interception,
> [SVM_EXIT_HLT] = halt_interception,
> [SVM_EXIT_INVLPG] = invlpg_interception,
> [SVM_EXIT_INVLPGA] = invalid_op_interception,
> @@ -2227,6 +2244,7 @@ static void pre_svm_run(struct vcpu_svm *svm)
> if (svm->vcpu.cpu != cpu ||
> svm->asid_generation != svm_data->asid_generation)
> new_asid(svm, svm_data);
> +
> }
Minor style nit, please remove this empty line from the patch.
>
> static void svm_drop_interrupt_shadow(struct kvm_vcpu *vcpu)
> diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
> index 2b73e19..e2b730d 100644
> --- a/virt/kvm/kvm_main.c
> +++ b/virt/kvm/kvm_main.c
> @@ -710,6 +710,7 @@ void vcpu_load(struct kvm_vcpu *vcpu)
> kvm_arch_vcpu_load(vcpu, cpu);
> put_cpu();
> }
> +EXPORT_SYMBOL_GPL(vcpu_load);
>
> void vcpu_put(struct kvm_vcpu *vcpu)
> {
> @@ -719,6 +720,7 @@ void vcpu_put(struct kvm_vcpu *vcpu)
> preempt_enable();
> mutex_unlock(&vcpu->mutex);
> }
> +EXPORT_SYMBOL_GPL(vcpu_put);
>
> static void ack_flush(void *_completed)
> {
--
| Advanced Micro Devices GmbH
Operating | Karl-Hammerschmidt-Str. 34, 85609 Dornach bei München
System |
Research | Geschäftsführer: Thomas M. McCoy, Giuliano Meroni
Center | Sitz: Dornach, Gemeinde Aschheim, Landkreis München
| Registergericht München, HRB Nr. 43632
* [PATCH][KVM][retry 1] Add support for Pause Filtering to AMD SVM
2009-05-07 13:55 ` Joerg Roedel
@ 2009-05-07 15:00 ` Mark Langsdorf
2009-05-07 15:31 ` Avi Kivity
2009-05-08 17:03 ` [PATCH][KVM][retry 2] " Mark Langsdorf
0 siblings, 2 replies; 44+ messages in thread
From: Mark Langsdorf @ 2009-05-07 15:00 UTC (permalink / raw)
To: Joerg Roedel, avi, kvm; +Cc: linux-kernel
commit 01813db8627e74018c8cec90df7e345839351f23
Author: root <root@xendinar01.amd.com>
Date: Thu May 7 09:44:10 2009 -0500
New AMD processors will support the Pause Filter Feature.
This feature creates a new field in the VMCB called Pause
Filter Count. If Pause Filter Count is greater than 0 and
intercepting PAUSEs is enabled, the processor will increment
an internal counter when a PAUSE instruction occurs instead
of intercepting. When the internal counter reaches the
Pause Filter Count value, a PAUSE intercept will occur.
This feature can be used to detect contended spinlocks,
especially when the lock holding VCPU is not scheduled.
Rescheduling another VCPU prevents the VCPU seeking the
lock from wasting its quantum by spinning idly.
Experimental results show that most spinlocks are held
for less than 1000 PAUSE cycles or more than a few
thousand. Default the Pause Filter Counter to 3000 to
detect the contended spinlocks.
Processor support for this feature is indicated by a CPUID
bit.
On a 24 core system running 4 guests each with 16 VCPUs,
this patch improved overall performance of each guest's
32 job kernbench by approximately 1%. Further performance
improvement may be possible with a more sophisticated
yield algorithm.
-Mark Langsdorf
Operating System Research Center
AMD
Signed-off-by: Mark Langsdorf <mark.langsdorf@amd.com>
diff --git a/arch/x86/include/asm/svm.h b/arch/x86/include/asm/svm.h
index 85574b7..1fecb7e 100644
--- a/arch/x86/include/asm/svm.h
+++ b/arch/x86/include/asm/svm.h
@@ -57,7 +57,8 @@ struct __attribute__ ((__packed__)) vmcb_control_area {
u16 intercept_dr_write;
u32 intercept_exceptions;
u64 intercept;
- u8 reserved_1[44];
+ u8 reserved_1[42];
+ u16 pause_filter_count;
u64 iopm_base_pa;
u64 msrpm_base_pa;
u64 tsc_offset;
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index ef43a18..4279141 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -45,6 +45,7 @@ MODULE_LICENSE("GPL");
#define SVM_FEATURE_NPT (1 << 0)
#define SVM_FEATURE_LBRV (1 << 1)
#define SVM_FEATURE_SVML (1 << 2)
+#define SVM_FEATURE_PAUSE_FILTER (1 << 10)
#define DEBUGCTL_RESERVED_BITS (~(0x3fULL))
@@ -575,6 +576,12 @@ static void init_vmcb(struct vcpu_svm *svm)
svm->nested_vmcb = 0;
svm->vcpu.arch.hflags = HF_GIF_MASK;
+
+ if (svm_has(SVM_FEATURE_PAUSE_FILTER)) {
+ control->pause_filter_count = 5000;
+ control->intercept |= (1ULL << INTERCEPT_PAUSE);
+ }
+
}
static int svm_vcpu_reset(struct kvm_vcpu *vcpu)
@@ -2087,6 +2094,15 @@ static int interrupt_window_interception(struct vcpu_svm *svm,
return 1;
}
+static int pause_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
+{
+ /* Simple yield */
+ vcpu_put(&svm->vcpu);
+ schedule();
+ vcpu_load(&svm->vcpu);
+ return 1;
+}
+
static int (*svm_exit_handlers[])(struct vcpu_svm *svm,
struct kvm_run *kvm_run) = {
[SVM_EXIT_READ_CR0] = emulate_on_interception,
@@ -2123,6 +2139,7 @@ static int (*svm_exit_handlers[])(struct vcpu_svm *svm,
[SVM_EXIT_CPUID] = cpuid_interception,
[SVM_EXIT_IRET] = iret_interception,
[SVM_EXIT_INVD] = emulate_on_interception,
+ [SVM_EXIT_PAUSE] = pause_interception,
[SVM_EXIT_HLT] = halt_interception,
[SVM_EXIT_INVLPG] = invlpg_interception,
[SVM_EXIT_INVLPGA] = invalid_op_interception,
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 2b73e19..e2b730d 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -710,6 +710,7 @@ void vcpu_load(struct kvm_vcpu *vcpu)
kvm_arch_vcpu_load(vcpu, cpu);
put_cpu();
}
+EXPORT_SYMBOL_GPL(vcpu_load);
void vcpu_put(struct kvm_vcpu *vcpu)
{
@@ -719,6 +720,7 @@ void vcpu_put(struct kvm_vcpu *vcpu)
preempt_enable();
mutex_unlock(&vcpu->mutex);
}
+EXPORT_SYMBOL_GPL(vcpu_put);
static void ack_flush(void *_completed)
{
* Re: [PATCH][KVM][retry 1] Add support for Pause Filtering to AMD SVM
2009-05-07 15:00 ` [PATCH][KVM][retry 1] " Mark Langsdorf
@ 2009-05-07 15:31 ` Avi Kivity
2009-05-11 14:15 ` Ingo Molnar
2009-05-08 17:03 ` [PATCH][KVM][retry 2] " Mark Langsdorf
1 sibling, 1 reply; 44+ messages in thread
From: Avi Kivity @ 2009-05-07 15:31 UTC (permalink / raw)
To: Mark Langsdorf; +Cc: Joerg Roedel, kvm, linux-kernel, Ingo Molnar
(copying Ingo)
Mark Langsdorf wrote:
> commit 01813db8627e74018c8cec90df7e345839351f23
> Author: root <root@xendinar01.amd.com>
> Date: Thu May 7 09:44:10 2009 -0500
>
> New AMD processors will support the Pause Filter Feature.
> This feature creates a new field in the VMCB called Pause
> Filter Count. If Pause Filter Count is greater than 0 and
> intercepting PAUSEs is enabled, the processor will increment
> an internal counter when a PAUSE instruction occurs instead
> of intercepting. When the internal counter reaches the
> Pause Filter Count value, a PAUSE intercept will occur.
>
> This feature can be used to detect contended spinlocks,
> especially when the lock holding VCPU is not scheduled.
> Rescheduling another VCPU prevents the VCPU seeking the
> lock from wasting its quantum by spinning idly.
>
> Experimental results show that most spinlocks are held
> for less than 1000 PAUSE cycles or more than a few
> thousand. Default the Pause Filter Counter to 3000 to
> detect the contended spinlocks.
>
> Processor support for this feature is indicated by a CPUID
> bit.
>
> On a 24 core system running 4 guests each with 16 VCPUs,
> this patch improved overall performance of each guest's
> 32 job kernbench by approximately 1%. Further performance
> improvement may be possible with a more sophisticated
> yield algorithm.
>
> -Mark Langsdorf
> Operating System Research Center
> AMD
>
> Signed-off-by: Mark Langsdorf <mark.langsdorf@amd.com>
>
(please use git format-patch rather than git show, and set up user.name
and user.email properly)
>
> svm->nested_vmcb = 0;
> svm->vcpu.arch.hflags = HF_GIF_MASK;
> +
> + if (svm_has(SVM_FEATURE_PAUSE_FILTER)) {
> + control->pause_filter_count = 5000;
> + control->intercept |= (1ULL << INTERCEPT_PAUSE);
> + }
> +
> }
3000 or 5000?
>
> +static int pause_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
> +{
> + /* Simple yield */
> + vcpu_put(&svm->vcpu);
> + schedule();
> + vcpu_load(&svm->vcpu);
> + return 1;
> +
Ingo, will this do anything under CFS, or will CFS note that nothing has
changed in the accounting and reschedule us immediately?
--
error compiling committee.c: too many arguments to function
* [PATCH][KVM][retry 2] Add support for Pause Filtering to AMD SVM
2009-05-07 15:00 ` [PATCH][KVM][retry 1] " Mark Langsdorf
2009-05-07 15:31 ` Avi Kivity
@ 2009-05-08 17:03 ` Mark Langsdorf
2009-05-08 18:44 ` Avi Kivity
2009-05-19 18:56 ` [PATCH][KVM][retry 3] " Mark Langsdorf
1 sibling, 2 replies; 44+ messages in thread
From: Mark Langsdorf @ 2009-05-08 17:03 UTC (permalink / raw)
To: Joerg Roedel; +Cc: avi, kvm, linux-kernel
From 01813db8627e74018c8cec90df7e345839351f23 Mon Sep 17 00:00:00 2001
From: Mark Langsdorf <mark.langsdorf@amd.com>
Date: Thu, 7 May 2009 09:44:10 -0500
Subject: [PATCH] Add support for Pause Filtering to AMD SVM
This feature creates a new field in the VMCB called Pause
Filter Count. If Pause Filter Count is greater than 0 and
intercepting PAUSEs is enabled, the processor will increment
an internal counter when a PAUSE instruction occurs instead
of intercepting. When the internal counter reaches the
Pause Filter Count value, a PAUSE intercept will occur.
This feature can be used to detect contended spinlocks,
especially when the lock holding VCPU is not scheduled.
Rescheduling another VCPU prevents the VCPU seeking the
lock from wasting its quantum by spinning idly.
Experimental results show that most spinlocks are held
for less than 1000 PAUSE cycles or more than a few
thousand. Default the Pause Filter Counter to 3000 to
detect the contended spinlocks.
Processor support for this feature is indicated by a CPUID
bit.
On a 24 core system running 4 guests each with 16 VCPUs,
this patch improved overall performance of each guest's
32 job kernbench by approximately 1%. Further performance
improvement may be possible with a more sophisticated
yield algorithm.
-Mark Langsdorf
Operating System Research Center
AMD
Signed-off-by: Mark Langsdorf <mark.langsdorf@amd.com>
---
arch/x86/include/asm/svm.h | 3 ++-
arch/x86/kvm/svm.c | 17 +++++++++++++++++
virt/kvm/kvm_main.c | 2 ++
3 files changed, 21 insertions(+), 1 deletions(-)
diff --git a/arch/x86/include/asm/svm.h b/arch/x86/include/asm/svm.h
index 85574b7..1fecb7e 100644
--- a/arch/x86/include/asm/svm.h
+++ b/arch/x86/include/asm/svm.h
@@ -57,7 +57,8 @@ struct __attribute__ ((__packed__)) vmcb_control_area {
u16 intercept_dr_write;
u32 intercept_exceptions;
u64 intercept;
- u8 reserved_1[44];
+ u8 reserved_1[42];
+ u16 pause_filter_count;
u64 iopm_base_pa;
u64 msrpm_base_pa;
u64 tsc_offset;
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index ef43a18..4279141 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -45,6 +45,7 @@ MODULE_LICENSE("GPL");
#define SVM_FEATURE_NPT (1 << 0)
#define SVM_FEATURE_LBRV (1 << 1)
#define SVM_FEATURE_SVML (1 << 2)
+#define SVM_FEATURE_PAUSE_FILTER (1 << 10)
#define DEBUGCTL_RESERVED_BITS (~(0x3fULL))
@@ -575,6 +576,12 @@ static void init_vmcb(struct vcpu_svm *svm)
svm->nested_vmcb = 0;
svm->vcpu.arch.hflags = HF_GIF_MASK;
+
+ if (svm_has(SVM_FEATURE_PAUSE_FILTER)) {
+ control->pause_filter_count = 5000;
+ control->intercept |= (1ULL << INTERCEPT_PAUSE);
+ }
+
}
static int svm_vcpu_reset(struct kvm_vcpu *vcpu)
@@ -2087,6 +2094,15 @@ static int interrupt_window_interception(struct vcpu_svm *svm,
return 1;
}
+static int pause_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
+{
+ /* Simple yield */
+ vcpu_put(&svm->vcpu);
+ schedule();
+ vcpu_load(&svm->vcpu);
+ return 1;
+}
+
static int (*svm_exit_handlers[])(struct vcpu_svm *svm,
struct kvm_run *kvm_run) = {
[SVM_EXIT_READ_CR0] = emulate_on_interception,
@@ -2123,6 +2139,7 @@ static int (*svm_exit_handlers[])(struct vcpu_svm *svm,
[SVM_EXIT_CPUID] = cpuid_interception,
[SVM_EXIT_IRET] = iret_interception,
[SVM_EXIT_INVD] = emulate_on_interception,
+ [SVM_EXIT_PAUSE] = pause_interception,
[SVM_EXIT_HLT] = halt_interception,
[SVM_EXIT_INVLPG] = invlpg_interception,
[SVM_EXIT_INVLPGA] = invalid_op_interception,
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 2b73e19..e2b730d 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -710,6 +710,7 @@ void vcpu_load(struct kvm_vcpu *vcpu)
kvm_arch_vcpu_load(vcpu, cpu);
put_cpu();
}
+EXPORT_SYMBOL_GPL(vcpu_load);
void vcpu_put(struct kvm_vcpu *vcpu)
{
@@ -719,6 +720,7 @@ void vcpu_put(struct kvm_vcpu *vcpu)
preempt_enable();
mutex_unlock(&vcpu->mutex);
}
+EXPORT_SYMBOL_GPL(vcpu_put);
static void ack_flush(void *_completed)
{
--
1.6.0.2
* Re: [PATCH][KVM][retry 2] Add support for Pause Filtering to AMD SVM
2009-05-08 17:03 ` [PATCH][KVM][retry 2] " Mark Langsdorf
@ 2009-05-08 18:44 ` Avi Kivity
2009-05-08 18:47 ` Langsdorf, Mark
2009-05-19 18:56 ` [PATCH][KVM][retry 3] " Mark Langsdorf
1 sibling, 1 reply; 44+ messages in thread
From: Avi Kivity @ 2009-05-08 18:44 UTC (permalink / raw)
To: Mark Langsdorf; +Cc: Joerg Roedel, kvm, linux-kernel
Mark Langsdorf wrote:
> From 01813db8627e74018c8cec90df7e345839351f23 Mon Sep 17 00:00:00 2001
> From: Mark Langsdorf <mark.langsdorf@amd.com>
> Date: Thu, 7 May 2009 09:44:10 -0500
> Subject: [PATCH] Add support for Pause Filtering to AMD SVM
>
What's the differences wrt retry 1?
> This feature creates a new field in the VMCB called Pause
> Filter Count. If Pause Filter Count is greater than 0 and
> intercepting PAUSEs is enabled, the processor will increment
> an internal counter when a PAUSE instruction occurs instead
> of intercepting. When the internal counter reaches the
> Pause Filter Count value, a PAUSE intercept will occur.
>
> This feature can be used to detect contended spinlocks,
> especially when the lock holding VCPU is not scheduled.
> Rescheduling another VCPU prevents the VCPU seeking the
> lock from wasting its quantum by spinning idly.
>
> Experimental results show that most spinlocks are held
> for less than 1000 PAUSE cycles or more than a few
> thousand. Default the Pause Filter Counter to 3000 to
> detect the contended spinlocks.
>
3000.
> Processor support for this feature is indicated by a CPUID
> bit.
>
> On a 24 core system running 4 guests each with 16 VCPUs,
> this patch improved overall performance of each guest's
> 32 job kernbench by approximately 1%. Further performance
> improvement may be possible with a more sophisticated
> yield algorithm.
>
Like I mentioned earlier, I don't think schedule() does anything on CFS.
Try sched_yield(), but set /proc/sys/kernel/sched_compat_yield.
> +
> + if (svm_has(SVM_FEATURE_PAUSE_FILTER)) {
> + control->pause_filter_count = 5000;
> + control->intercept |= (1ULL << INTERCEPT_PAUSE);
> + }
> +
>
Here, 5000?
> }
>
> static int svm_vcpu_reset(struct kvm_vcpu *vcpu)
> @@ -2087,6 +2094,15 @@ static int interrupt_window_interception(struct vcpu_svm *svm,
> return 1;
> }
>
> +static int pause_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
> +{
> + /* Simple yield */
> + vcpu_put(&svm->vcpu);
> + schedule();
> + vcpu_load(&svm->vcpu);
> + return 1;
> +}
> +
>
You don't need to vcpu_put() and vcpu_load(). The scheduler will call
them for you if/when it switches tasks.
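A minimal sketch of the handler with both review comments applied, i.e. no
explicit vcpu_put()/vcpu_load() and the in-kernel yield() instead of a bare
schedule(); this is only an illustration of the suggestions, not the posted
patch:

static int pause_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
{
        /*
         * The preempt notifiers registered by vcpu_load() already save and
         * restore vcpu state across a task switch, so yielding is enough.
         */
        yield();
        return 1;
}

With /proc/sys/kernel/sched_compat_yield set (as suggested above), yield()
requeues the task behind the other runnable tasks rather than being a near
no-op under CFS.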
--
I have a truly marvellous patch that fixes the bug which this
signature is too narrow to contain.
* RE: [PATCH][KVM][retry 2] Add support for Pause Filtering to AMD SVM
2009-05-08 18:44 ` Avi Kivity
@ 2009-05-08 18:47 ` Langsdorf, Mark
0 siblings, 0 replies; 44+ messages in thread
From: Langsdorf, Mark @ 2009-05-08 18:47 UTC (permalink / raw)
To: Avi Kivity; +Cc: Roedel, Joerg, kvm, linux-kernel
> What's the differences wrt retry 1?
I'm using git format-patch as you requested.
> > This feature creates a new field in the VMCB called Pause
> > Filter Count. If Pause Filter Count is greater than 0 and
> > intercepting PAUSEs is enabled, the processor will increment
> > an internal counter when a PAUSE instruction occurs instead
> > of intercepting. When the internal counter reaches the
> > Pause Filter Count value, a PAUSE intercept will occur.
> >
> > This feature can be used to detect contended spinlocks,
> > especially when the lock holding VCPU is not scheduled.
> > Rescheduling another VCPU prevents the VCPU seeking the
> > lock from wasting its quantum by spinning idly.
> >
> > Experimental results show that most spinlocks are held
> > for less than 1000 PAUSE cycles or more than a few
> > thousand. Default the Pause Filter Counter to 3000 to
> > detect the contended spinlocks.
>
> 3000.
Thanks, I keep missing that.
> > On a 24 core system running 4 guests each with 16 VCPUs,
> > this patch improved overall performance of each guest's
> > 32 job kernbench by approximately 1%. Further performance
> > improvement may be possible with a more sophisticated
> > yield algorithm.
> >
>
> Like I mentioned earlier, I don't think schedule() does
> anything on CFS.
>
> Try sched_yield(), but set /proc/sys/kernel/sched_compat_yield.
Will do.
> > +static int pause_interception(struct vcpu_svm *svm, struct
> kvm_run *kvm_run)
> > +{
> > + /* Simple yield */
> > + vcpu_put(&svm->vcpu);
> > + schedule();
> > + vcpu_load(&svm->vcpu);
> > + return 1;
> > +}
> > +
> >
>
> You don't need to vcpu_put() and vcpu_load(). The scheduler
> will call them for you if/when it switches tasks.
I was waiting for feedback from Ingo on that issue, but I'll
try sched_yield() instead.
-Mark Langsdorf
Operating System Research Center
AMD
* Re: [PATCH][KVM][retry 1] Add support for Pause Filtering to AMD SVM
2009-05-07 15:31 ` Avi Kivity
@ 2009-05-11 14:15 ` Ingo Molnar
2009-05-11 14:24 ` Avi Kivity
0 siblings, 1 reply; 44+ messages in thread
From: Ingo Molnar @ 2009-05-11 14:15 UTC (permalink / raw)
To: Avi Kivity, Peter Zijlstra
Cc: Mark Langsdorf, Joerg Roedel, kvm, linux-kernel
* Avi Kivity <avi@redhat.com> wrote:
>> +static int pause_interception(struct vcpu_svm *svm, struct kvm_run
>> *kvm_run)
>> +{
>> + /* Simple yield */
>> + vcpu_put(&svm->vcpu);
>> + schedule();
>> + vcpu_load(&svm->vcpu);
>> + return 1;
>> +
>
> Ingo, will this do anything under CFS, or will CFS note that
> nothing has changed in the accounting and reschedule us
> immediately?
The scheduler will yield to another task only if the current task
has become ineligible. I.e schedule() is largely a NOP on
TASK_RUNNING tasks (i.e. here).
I.e. this is a somewhat poor solution as far as scheduling goes. But
i'm wondering what the CPU side does. Can REP-NOP really take
thousands of cycles? If yes, under what circumstances?
Ingo
* Re: [PATCH][KVM][retry 1] Add support for Pause Filtering to AMD SVM
2009-05-11 14:15 ` Ingo Molnar
@ 2009-05-11 14:24 ` Avi Kivity
2009-05-11 14:33 ` Ingo Molnar
2009-05-11 14:42 ` Peter Zijlstra
0 siblings, 2 replies; 44+ messages in thread
From: Avi Kivity @ 2009-05-11 14:24 UTC (permalink / raw)
To: Ingo Molnar
Cc: Peter Zijlstra, Mark Langsdorf, Joerg Roedel, kvm, linux-kernel
Ingo Molnar wrote:
>
>>> +static int pause_interception(struct vcpu_svm *svm, struct kvm_run
>>> *kvm_run)
>>> +{
>>> + /* Simple yield */
>>> + vcpu_put(&svm->vcpu);
>>> + schedule();
>>> + vcpu_load(&svm->vcpu);
>>> + return 1;
>>> +
>>>
>> Ingo, will this do anything under CFS, or will CFS note that
>> nothing has changed in the accounting and reschedule us
>> immediately?
>>
>
> The scheduler will yield to another task only if the current task
> has become ineligible. I.e schedule() is largely a NOP on
> TASK_RUNNING tasks (i.e. here).
>
Especially on preemptible kernels, where the schedule() would have
already happened if it could cause anything, IIUC.
> I.e. this is a somewhat poor solution as far as scheduling goes. But
> i'm wondering what the CPU side does. Can REP-NOP really take
> thousands of cycles? If yes, under what circumstances?
>
The guest is running rep-nop in a loop while trying to acquire a
spinlock. The hardware detects this (most likely, repeated rep-nop with
the same rip) and exits. We can program the loop count; obviously if
we're spinning for only a short while it's better to keep spinning while
hoping the lock will be released soon.
The idea is to detect that the guest is not making forward progress and
yield. If I could tell the scheduler, you may charge me a couple of
milliseconds, I promise not to sue, that would be ideal. Other tasks
can become eligible, hopefully the task holding the spinlock, and by the
time we're scheduled back the long running task will have finished and
released the lock.
For newer Linux as a guest we're better off paravirtualizing this, so we
can tell the host which vcpu holds the lock; in this case kvm will want
to say, take a couple milliseconds off my account and transfer it to
this task (so called directed yield). However there's no reason to
paravirtualize all cpu_relax() calls.
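As a rough illustration of the "charge me a couple of milliseconds" idea, the
intercept handler could hand the scheduler a time penalty instead of calling
schedule(); sched_defer_current() is an invented name here, standing in for
the kind of helper Ingo sketches further down the thread:

static int pause_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
{
        /*
         * Push our CFS timeline ~1 ms into the future so other tasks,
         * hopefully including the lock holder, become eligible.
         */
        sched_defer_current(NSEC_PER_MSEC);
        return 1;
}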
--
error compiling committee.c: too many arguments to function
* Re: [PATCH][KVM][retry 1] Add support for Pause Filtering to AMD SVM
2009-05-11 14:24 ` Avi Kivity
@ 2009-05-11 14:33 ` Ingo Molnar
2009-05-11 14:51 ` Avi Kivity
2009-05-11 14:42 ` Peter Zijlstra
1 sibling, 1 reply; 44+ messages in thread
From: Ingo Molnar @ 2009-05-11 14:33 UTC (permalink / raw)
To: Avi Kivity
Cc: Peter Zijlstra, Mark Langsdorf, Joerg Roedel, kvm, linux-kernel
* Avi Kivity <avi@redhat.com> wrote:
>> I.e. this is a somewhat poor solution as far as scheduling goes.
>> But i'm wondering what the CPU side does. Can REP-NOP really take
>> thousands of cycles? If yes, under what circumstances?
>
> The guest is running rep-nop in a loop while trying to acquire a
> spinlock. The hardware detects this (most likely, repeated
> rep-nop with the same rip) and exits. We can program the loop
> count; obviously if we're spinning for only a short while it's
> better to keep spinning while hoping the lock will be released
> soon.
>
> The idea is to detect that the guest is not making forward
> progress and yield. If I could tell the scheduler, you may charge
> me a couple of milliseconds, I promise not to sue, that would be
> ideal. [...]
Ok, with such a waiver, who could refuse?
This really needs a new kernel-internal scheduler API though, which
does a lot of fancy things to do:
se->vruntime += 1000000;
i.e. add 1 msec worth of nanoseconds to the task's timeline. (first
remove it from the rbtree, then add it back, and nice-weight it as
well) And only do it if there's other tasks running on this CPU or
so.
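A minimal sketch of such a helper, assuming it lives next to the CFS
internals in kernel/sched.c (the name sched_defer_current() is invented;
task_rq_lock(), calc_delta_fair() and resched_task() are scheduler-private
helpers of that era):

void sched_defer_current(u64 delay_ns)
{
        unsigned long flags;
        struct rq *rq = task_rq_lock(current, &flags);

        /* only warp the timeline if someone else can actually run here */
        if (rq->nr_running > 1) {
                /* nice-weight the penalty, as for any other vruntime delta */
                current->se.vruntime += calc_delta_fair(delay_ns, &current->se);
                resched_task(rq->curr);
        }
        task_rq_unlock(rq, &flags);
}

The currently running entity is not sitting in the rbtree, so warping its
vruntime and forcing a reschedule is enough; an already-enqueued entity would
need the dequeue/re-enqueue dance mentioned above.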
_That_ would be pretty efficient, and would do the right thing when
two (or more) vcpus run on the same CPU, and it would also do the
right thing if there are repeated VM-exits due to pause filtering.
Please dont even think about using yield for this though - that will
just add a huge hit to this task and wont result in any sane
behavior - and yield is bound to some historic user-space behavior
as well.
A gradual and linear back-off from the current timeline is more of a
fair negotiation process between vcpus and results in more or less
sane (and fair) scheduling, and no unnecessary looping.
You could even do an exponential backoff up to a limit of 1-10 msecs
or so, starting at 100 usecs.
Ingo
* Re: [PATCH][KVM] Add support for Pause Filtering to AMD SVM
2009-05-05 14:09 [PATCH][KVM] Add support for Pause Filtering to AMD SVM Mark Langsdorf
2009-05-05 16:05 ` Bert Wesarg
2009-05-07 13:55 ` Joerg Roedel
@ 2009-05-11 14:38 ` Peter Zijlstra
2009-05-11 14:51 ` Ingo Molnar
2 siblings, 1 reply; 44+ messages in thread
From: Peter Zijlstra @ 2009-05-11 14:38 UTC (permalink / raw)
To: Mark Langsdorf; +Cc: joerg.roedel, linux-kernel, Ingo Molnar
On Tue, 2009-05-05 at 09:09 -0500, Mark Langsdorf wrote:
> commit 6f15c833f56267baf5abdd0fbc90a81489573053
> Author: Mark Langsdorf <mlangsdo@wshpnow.amd.com>
> Date: Mon May 4 15:02:38 2009 -0500
>
> New AMD processors will support the Pause Filter Feature.
> This feature creates a new field in the VMCB called Pause
> Filter Count. If Pause Filter Count is greater than 0 and
> intercepting PAUSEs is enabled, the processor will increment
> an internal counter when a PAUSE instruction occurs instead
> of intercepting. When the internal counter reaches the
> Pause Filter Count value, a PAUSE intercept will occur.
>
> This feature can be used to detect contended spinlocks,
> especially when the lock holding VCPU is not scheduled.
> Rescheduling another VCPU prevents the VCPU seeking the
> lock from wasting its quantum by spinning idly.
>
> Experimental results show that most spinlocks are held
> for less than 1000 PAUSE cycles or more than a few
> thousand. Default the Pause Filter Counter to 3000 to
> detect the contended spinlocks.
>
> Processor support for this feature is indicated by a CPUID
> bit.
>
> On a 24 core system running 4 guests each with 16 VCPUs,
> this patch improved overall performance of each guest's
> 32 job kernbench by approximately 1%. Further performance
> improvement may be possible with a more sophisticated
> yield algorithm.
Isn't a much better solution to the spinlock problem a usable
monitor-wait implementation?
If we implement virt spinlocks using monitor-wait they don't spin but
simply wait in place, the HV could then decide to run someone else.
This is the HV equivalent to futexes.
The only problem with this is that the current hardware has horrid mwait
wakeup latencies. If this were (much) improved you don't need such ugly
yield hacks like this.
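For reference, a guest-side wait loop built on monitor/mwait might look
roughly like this, assuming the hypervisor exposes MONITOR/MWAIT to the guest
(the helper name is illustrative; __monitor()/__mwait() are the existing x86
wrappers):

#include <asm/processor.h>

/* illustrative only: wait for the lock word to change without spinning */
static inline void virt_spin_wait(volatile unsigned int *lock, unsigned int busy)
{
        while (*lock == busy) {
                __monitor((const void *)lock, 0, 0); /* arm monitor on the lock line */
                if (*lock != busy)                   /* re-check to avoid a lost wakeup */
                        break;
                __mwait(0, 0);                       /* sleep until the line is written */
        }
}

The hypervisor could then trap or run someone else at the mwait, which is the
point above; the wakeup-latency caveat still applies.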
* Re: [PATCH][KVM][retry 1] Add support for Pause Filtering to AMD SVM
2009-05-11 14:24 ` Avi Kivity
2009-05-11 14:33 ` Ingo Molnar
@ 2009-05-11 14:42 ` Peter Zijlstra
2009-05-11 15:05 ` Avi Kivity
1 sibling, 1 reply; 44+ messages in thread
From: Peter Zijlstra @ 2009-05-11 14:42 UTC (permalink / raw)
To: Avi Kivity; +Cc: Ingo Molnar, Mark Langsdorf, Joerg Roedel, kvm, linux-kernel
On Mon, 2009-05-11 at 17:24 +0300, Avi Kivity wrote:
> > I.e. this is a somewhat poor solution as far as scheduling goes. But
> > i'm wondering what the CPU side does. Can REP-NOP really take
> > thousands of cycles? If yes, under what circumstances?
> >
>
> The guest is running rep-nop in a loop while trying to acquire a
> spinlock. The hardware detects this (most likely, repeated rep-nop with
> the same rip) and exits. We can program the loop count; obviously if
> we're spinning for only a short while it's better to keep spinning while
> hoping the lock will be released soon.
>
> The idea is to detect that the guest is not making forward progress and
> yield. If I could tell the scheduler, you may charge me a couple of
> milliseconds, I promise not to sue, that would be ideal. Other tasks
> can become eligible, hopefully the task holding the spinlock, and by the
> time we're scheduled back the long running task will have finished and
> released the lock.
>
> For newer Linux as a guest we're better off paravirtualizing this, so we
> can tell the host which vcpu holds the lock; in this case kvm will want
> to say, take a couple milliseconds off my account and transfer it to
> this task (so called directed yield). However there's no reason to
> paravirtualize all cpu_relax() calls.
So we're now officially giving up on (soft) realtime virtualization?
* Re: [PATCH][KVM][retry 1] Add support for Pause Filtering to AMD SVM
2009-05-11 14:33 ` Ingo Molnar
@ 2009-05-11 14:51 ` Avi Kivity
2009-05-11 14:59 ` Ingo Molnar
2009-05-11 15:01 ` [PATCH][KVM][retry 1] Add support for Pause Filtering to AMD SVM Peter Zijlstra
0 siblings, 2 replies; 44+ messages in thread
From: Avi Kivity @ 2009-05-11 14:51 UTC (permalink / raw)
To: Ingo Molnar
Cc: Peter Zijlstra, Mark Langsdorf, Joerg Roedel, kvm, linux-kernel
Ingo Molnar wrote:
> * Avi Kivity <avi@redhat.com> wrote:
>
>
>>> I.e. this is a somewhat poor solution as far as scheduling goes.
>>> But i'm wondering what the CPU side does. Can REP-NOP really take
>>> thousands of cycles? If yes, under what circumstances?
>>>
>> The guest is running rep-nop in a loop while trying to acquire a
>> spinlock. The hardware detects this (most likely, repeated
>> rep-nop with the same rip) and exits. We can program the loop
>> count; obviously if we're spinning for only a short while it's
>> better to keep spinning while hoping the lock will be released
>> soon.
>>
>> The idea is to detect that the guest is not making forward
>> progress and yield. If I could tell the scheduler, you may charge
>> me a couple of milliseconds, I promise not to sue, that would be
>> ideal. [...]
>>
>
> Ok, with such a waiver, who could refuse?
>
> This really needs a new kernel-internal scheduler API though, which
> does a lot of fancy things to do:
>
> se->vruntime += 1000000;
>
> i.e. add 1 msec worth of nanoseconds to the task's timeline. (first
> remove it from the rbtree, then add it back, and nice-weight it as
> well)
I suspected it would be as simple as this.
> And only do it if there's other tasks running on this CPU or
> so.
>
What would happen if there weren't? I'd guess the task would continue
running (but with a warped vruntime)?
> _That_ would be pretty efficient, and would do the right thing when
> two (or more) vcpus run on the same CPU, and it would also do the
> right thing if there are repeated VM-exits due to pause filtering.
>
> Please dont even think about using yield for this though - that will
> just add a huge hit to this task and wont result in any sane
> behavior - and yield is bound to some historic user-space behavior
> as well.
>
> A gradual and linear back-off from the current timeline is more of a
> fair negotiation process between vcpus and results in more or less
> sane (and fair) scheduling, and no unnecessary looping.
>
> You could even do an exponential backoff up to a limit of 1-10 msecs
> or so, starting at 100 usecs.
>
Good idea, it eliminates another variable to be tuned.
--
error compiling committee.c: too many arguments to function
* Re: [PATCH][KVM] Add support for Pause Filtering to AMD SVM
2009-05-11 14:38 ` [PATCH][KVM] " Peter Zijlstra
@ 2009-05-11 14:51 ` Ingo Molnar
0 siblings, 0 replies; 44+ messages in thread
From: Ingo Molnar @ 2009-05-11 14:51 UTC (permalink / raw)
To: Peter Zijlstra; +Cc: Mark Langsdorf, joerg.roedel, linux-kernel
* Peter Zijlstra <peterz@infradead.org> wrote:
> On Tue, 2009-05-05 at 09:09 -0500, Mark Langsdorf wrote:
> > commit 6f15c833f56267baf5abdd0fbc90a81489573053
> > Author: Mark Langsdorf <mlangsdo@wshpnow.amd.com>
> > Date: Mon May 4 15:02:38 2009 -0500
> >
> > New AMD processors will support the Pause Filter Feature.
> > This feature creates a new field in the VMCB called Pause
> > Filter Count. If Pause Filter Count is greater than 0 and
> > ntercepting PAUSEs is enabled, the processor will increment
> > an internal counter when a PAUSE instruction occurs instead
> > of intercepting. When the internal counter reaches the
> > Pause Filter Count value, a PAUSE intercept will occur.
> >
> > This feature can be used to detect contended spinlocks,
> > especially when the lock holding VCPU is not scheduled.
> > Rescheduling another VCPU prevents the VCPU seeking the
> > lock from wasting its quantum by spinning idly.
> >
> > Experimental results show that most spinlocks are held
> > for less than 1000 PAUSE cycles or more than a few
> > thousand. Default the Pause Filter Counter to 3000 to
> > detect the contended spinlocks.
> >
> > Processor support for this feature is indicated by a CPUID
> > bit.
> >
> > On a 24 core system running 4 guests each with 16 VCPUs,
> > this patch improved overall performance of each guest's
> > 32 job kernbench by approximately 1%. Further performance
> > improvement may be possible with a more sophisticated
> > yield algorithm.
>
> Isn't a much better solution to the spinlock problem a usable
> monitor-wait implementation?
>
> If we implement virt spinlocks using monitor-wait they don't spin
> but simply wait in place, the HV could then decide to run someone
> else.
>
> This is the HV equivalent to futexes.
>
> The only problem with this is that the current hardware has horrid
> mwait wakeup latencies. If this were (much) improved you don't
> need such ugly yield hacks like this.
I've considered MWAIT, but its really hard on the hw side:
the hardware would have to generate a 'wakeup', meaning it either
has to trap out, or has to send an irq.
Trapping out is only possible on the release-the-lock side - which
is usually on the wrong physical CPU, and it also happens _too late_
- such monitor/wait thingies are usually based on MESI cache, and
the originating CPU does not wait for everything to happen.
An irq (on the target CPU that notices the cacheline flush) is more
feasible, but it is several thousand cycles to begin with.
Irqs/vectors are a lot harder to add in general as well, and incur a
cost of several years of CPU-design-cycle latency. Furthermore, the
bits around MESI updates are _very_ sensitive codepaths of the CPU,
while REP; NOP is a slowpath to begin with.
But ... especially as SMT techniques spread, something like that
will have to happen as well - but it will take years. Meanwhile,
this particular CPU feature is there, it's fairly intuitive (plus a
VM exit doesnt really change the CPU's behavior materially, so a lot
easier to validate and get OS support for), so we could use it.
Ingo
* Re: [PATCH][KVM][retry 1] Add support for Pause Filtering to AMD SVM
2009-05-11 14:51 ` Avi Kivity
@ 2009-05-11 14:59 ` Ingo Molnar
2009-05-11 15:12 ` Avi Kivity
2009-05-11 15:58 ` [PATCH][KVM][retry 1] Add support for Pause Filtering to AMD SVM Langsdorf, Mark
2009-05-11 15:01 ` [PATCH][KVM][retry 1] Add support for Pause Filtering to AMD SVM Peter Zijlstra
1 sibling, 2 replies; 44+ messages in thread
From: Ingo Molnar @ 2009-05-11 14:59 UTC (permalink / raw)
To: Avi Kivity
Cc: Peter Zijlstra, Mark Langsdorf, Joerg Roedel, kvm, linux-kernel
* Avi Kivity <avi@redhat.com> wrote:
>> And only do it if there's other tasks running on this CPU or so.
>
> What would happen if there weren't? I'd guess the task would
> continue running (but with a warped vruntime)?
We dont want that warping to occur - we just want to go back and
burn CPU time in VM context. The problem is (as Peter pointed it
out) that this hw facility is incomplete and does not give us any
event (interrupt) and does not give us any event key (address we are
waiting for) either.
So the next best thing to do is to go back to the guest, because
that is where we make the most progress, more likely, and that is
where we want to be to make progress immediately, with the shortest
latency.
( Perhaps we could also increase vruntime beyond the standard
latency value to make sure any freshly woken task gets executed
first if we are still looping. )
>> _That_ would be pretty efficient, and would do the right thing when
>> two (or more) vcpus run on the same CPU, and it would also do the
>> right thing if there are repeated VM-exits due to pause filtering.
>>
>> Please dont even think about using yield for this though - that will
>> just add a huge hit to this task and wont result in any sane behavior -
>> and yield is bound to some historic user-space behavior as well.
>>
>> A gradual and linear back-off from the current timeline is more of a
>> fair negotiation process between vcpus and results in more or less
>> sane (and fair) scheduling, and no unnecessary looping.
>>
>> You could even do an exponential backoff up to a limit of 1-10 msecs
>> or so, starting at 100 usecs.
>>
>
> Good idea, it eliminates another variable to be tuned.
It could be made fully self-tuning, if the filter threshold can be
tuned fast enough. (an MSR write? A VM context field update?)
I.e. the 3000 cycles value itself could be eliminated as well. (with
just a common-sense max of say 100,000 cycles enforced)
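A sketch of what self-tuning could look like on the KVM side, adjusting the
VMCB field on each PAUSE exit; the grow/shrink policy and the helper name are
assumptions, not part of the posted patch, and the ceiling stays below 65536
because pause_filter_count is a 16-bit field:

#define PFC_MIN   1000
#define PFC_MAX  60000  /* VMCB field is u16, so stay under 65536 */

static void update_pause_filter_count(struct vcpu_svm *svm, bool still_spinning)
{
        u32 count = svm->vmcb->control.pause_filter_count;

        if (still_spinning)
                count = min_t(u32, count * 2, PFC_MAX); /* tolerate longer spins */
        else
                count = max_t(u32, count / 2, PFC_MIN); /* exit earlier next time */

        svm->vmcb->control.pause_filter_count = count;
}

The host could then widen or narrow the spin window on the fly instead of
relying on a fixed 3000.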
Ingo
* Re: [PATCH][KVM][retry 1] Add support for Pause Filtering to AMD SVM
2009-05-11 14:51 ` Avi Kivity
2009-05-11 14:59 ` Ingo Molnar
@ 2009-05-11 15:01 ` Peter Zijlstra
2009-05-11 15:06 ` Avi Kivity
1 sibling, 1 reply; 44+ messages in thread
From: Peter Zijlstra @ 2009-05-11 15:01 UTC (permalink / raw)
To: Avi Kivity; +Cc: Ingo Molnar, Mark Langsdorf, Joerg Roedel, kvm, linux-kernel
On Mon, 2009-05-11 at 17:51 +0300, Avi Kivity wrote:
> Ingo Molnar wrote:
> > * Avi Kivity <avi@redhat.com> wrote:
> >
> >
> >>> I.e. this is a somewhat poor solution as far as scheduling goes.
> >>> But i'm wondering what the CPU side does. Can REP-NOP really take
> >>> thousands of cycles? If yes, under what circumstances?
> >>>
> >> The guest is running rep-nop in a loop while trying to acquire a
> >> spinlock. The hardware detects this (most likely, repeated
> >> rep-nop with the same rip) and exits. We can program the loop
> >> count; obviously if we're spinning for only a short while it's
> >> better to keep spinning while hoping the lock will be released
> >> soon.
> >>
> >> The idea is to detect that the guest is not making forward
> >> progress and yield. If I could tell the scheduler, you may charge
> >> me a couple of milliseconds, I promise not to sue, that would be
> >> ideal. [...]
> >>
> >
> > Ok, with such a waiver, who could refuse?
> >
> > This really needs a new kernel-internal scheduler API though, which
> > does a lot of fancy things to do:
> >
> > se->vruntime += 1000000;
> >
> > i.e. add 1 msec worth of nanoseconds to the task's timeline. (first
> > remove it from the rbtree, then add it back, and nice-weight it as
> > well)
>
> I suspected it would be as simple as this.
Is that thread guaranteed to run as SCHED_OTHER?
* Re: [PATCH][KVM][retry 1] Add support for Pause Filtering to AMD SVM
2009-05-11 14:42 ` Peter Zijlstra
@ 2009-05-11 15:05 ` Avi Kivity
0 siblings, 0 replies; 44+ messages in thread
From: Avi Kivity @ 2009-05-11 15:05 UTC (permalink / raw)
To: Peter Zijlstra
Cc: Ingo Molnar, Mark Langsdorf, Joerg Roedel, kvm, linux-kernel
Peter Zijlstra wrote:
> On Mon, 2009-05-11 at 17:24 +0300, Avi Kivity wrote:
>
>
>>> I.e. this is a somewhat poor solution as far as scheduling goes. But
>>> i'm wondering what the CPU side does. Can REP-NOP really take
>>> thousands of cycles? If yes, under what circumstances?
>>>
>>>
>> The guest is running rep-nop in a loop while trying to acquire a
>> spinlock. The hardware detects this (most likely, repeated rep-nop with
>> the same rip) and exits. We can program the loop count; obviously if
>> we're spinning for only a short while it's better to keep spinning while
>> hoping the lock will be released soon.
>>
>> The idea is to detect that the guest is not making forward progress and
>> yield. If I could tell the scheduler, you may charge me a couple of
>> milliseconds, I promise not to sue, that would be ideal. Other tasks
>> can become eligible, hopefully the task holding the spinlock, and by the
>> time we're scheduled back the long running task will have finished and
>> released the lock.
>>
>> For newer Linux as a guest we're better off paravirtualizing this, so we
>> can tell the host which vcpu holds the lock; in this case kvm will want
>> to say, take a couple milliseconds off my account and transfer it to
>> this task (so called directed yield). However there's no reason to
>> paravirtualize all cpu_relax() calls.
>>
>
> So we're now officially giving up on (soft) realtime virtualization?
>
>
Wouldn't realtime guests be in a realtime scheduling class? That ought
to ignore this time_yield() (or however it is eventually named).
--
error compiling committee.c: too many arguments to function
* Re: [PATCH][KVM][retry 1] Add support for Pause Filtering to AMD SVM
2009-05-11 15:01 ` [PATCH][KVM][retry 1] Add support for Pause Filtering to AMD SVM Peter Zijlstra
@ 2009-05-11 15:06 ` Avi Kivity
0 siblings, 0 replies; 44+ messages in thread
From: Avi Kivity @ 2009-05-11 15:06 UTC (permalink / raw)
To: Peter Zijlstra
Cc: Ingo Molnar, Mark Langsdorf, Joerg Roedel, kvm, linux-kernel
Peter Zijlstra wrote:
>>> This really needs a new kernel-internal scheduler API though, which
>>> does a lot of fancy things to do:
>>>
>>> se->vruntime += 1000000;
>>>
>>> i.e. add 1 msec worth of nanoseconds to the task's timeline. (first
>>> remove it from the rbtree, then add it back, and nice-weight it as
>>> well)
>>>
>> I suspected it would be as simple as this.
>>
>
> Is that thread guaranteed to run as SCHED_OTHER?
>
No, it's user specified. But if it isn't SCHED_OTHER, we don't mind if
it spins (and we hope realtime systems aren't overcommitted).
--
error compiling committee.c: too many arguments to function
* Re: [PATCH][KVM][retry 1] Add support for Pause Filtering to AMD SVM
2009-05-11 14:59 ` Ingo Molnar
@ 2009-05-11 15:12 ` Avi Kivity
2009-05-11 15:18 ` Ingo Molnar
2009-05-11 15:58 ` [PATCH][KVM][retry 1] Add support for Pause Filtering to AMD SVM Langsdorf, Mark
1 sibling, 1 reply; 44+ messages in thread
From: Avi Kivity @ 2009-05-11 15:12 UTC (permalink / raw)
To: Ingo Molnar
Cc: Peter Zijlstra, Mark Langsdorf, Joerg Roedel, kvm, linux-kernel
Ingo Molnar wrote:
> * Avi Kivity <avi@redhat.com> wrote:
>
>
>>> And only do it if there's other tasks running on this CPU or so.
>>>
>> What would happen if there weren't? I'd guess the task would
>> continue running (but with a warped vruntime)?
>>
>
> We dont want that warping to occur - we just want to go back and
> burn CPU time in VM context. The problem is (as Peter pointed it
> out) that this hw facility is incomplete and does not give us any
> event (interrupt) and does not give us any event key (address we are
> waiting for) either.
>
> So the next best thing to do is to go back to the guest, because
> that is where we make the most progress, more likely, and that is
> where we want to be to make progress immediately, with the shortest
> latency.
>
Right, but I thought vruntime += blah would go back into the guest if
there were no other runnable tasks.
Oh I see -- we'd exit immediately and warp vruntime very fast as long as
we're spinning.
> ( Perhaps we could also increase vruntime beyond the standard
> latency value to make sure any freshly woken task gets executed
> first if we are still looping. )
>
We could be halfway through our remaining time. We could set it to the
next task + epsilon.
But that may be too aggressive. If the lock holder is on another cpu,
it may well complete before the standard latency value expires. Since
we're giving up cpu time potentially to other guests, we don't want to
give too much.
If we wake up too soon, we'll spin for a few microseconds and yield
again. So I think your 100 us + exponential backoff is best.
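A sketch of the per-vcpu backoff state, assuming a hypothetical
yield_penalty_ns field in the arch vcpu struct (not something the posted
patch adds):

#define YIELD_PENALTY_MIN_NS     100000ULL  /* 100 us */
#define YIELD_PENALTY_MAX_NS   10000000ULL  /* 10 ms */

static u64 next_yield_penalty(struct kvm_vcpu *vcpu)
{
        u64 p = vcpu->arch.yield_penalty_ns;

        /* start at 100 us, then double up to the 10 ms ceiling */
        p = p ? min_t(u64, p * 2, YIELD_PENALTY_MAX_NS) : YIELD_PENALTY_MIN_NS;
        vcpu->arch.yield_penalty_ns = p;    /* reset to 0 once the vcpu makes progress */
        return p;
}

The returned penalty would feed the vruntime warp discussed earlier and be
cleared as soon as the vcpu stops triggering PAUSE exits.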
>>> _That_ would be pretty efficient, and would do the right thing when
>>> two (or more) vcpus run on the same CPU, and it would also do the
>>> right thing if there are repeated VM-exits due to pause filtering.
>>>
>>> Please dont even think about using yield for this though - that will
>>> just add a huge hit to this task and wont result in any sane behavior -
>>> and yield is bound to some historic user-space behavior as well.
>>>
>>> A gradual and linear back-off from the current timeline is more of a
>>> fair negotiation process between vcpus and results in more or less
>>> sane (and fair) scheduling, and no unnecessary looping.
>>>
>>> You could even do an exponential backoff up to a limit of 1-10 msecs
>>> or so, starting at 100 usecs.
>>>
>>>
>> Good idea, it eliminates another variable to be tuned.
>>
>
> It could be made fully self-tuning, if the filter threshold can be
> tuned fast enough. (an MSR write? A VM context field update?)
>
The latter.
> I.e. the 3000 cycles value itself could be eliminated as well. (with
> just a common-sense max of say 100,000 cycles enforced)
>
Yeah, though that has a much smaller effect as it's only responsible for
a few microseconds of spinning.
--
error compiling committee.c: too many arguments to function
* Re: [PATCH][KVM][retry 1] Add support for Pause Filtering to AMD SVM
2009-05-11 15:12 ` Avi Kivity
@ 2009-05-11 15:18 ` Ingo Molnar
2009-05-11 15:28 ` Avi Kivity
0 siblings, 1 reply; 44+ messages in thread
From: Ingo Molnar @ 2009-05-11 15:18 UTC (permalink / raw)
To: Avi Kivity
Cc: Peter Zijlstra, Mark Langsdorf, Joerg Roedel, kvm, linux-kernel
* Avi Kivity <avi@redhat.com> wrote:
>> I.e. the 3000 cycles value itself could be eliminated as well.
>> (with just a common-sense max of say 100,000 cycles enforced)
>
> Yeah, though that has a much smaller effect as it's only
> responsible for a few microseconds of spinning.
3000 cycles would be 1-2 usecs. Isnt the VM exit+entry cost still in
that range?
Ingo
* Re: [PATCH][KVM][retry 1] Add support for Pause Filtering to AMD SVM
2009-05-11 15:18 ` Ingo Molnar
@ 2009-05-11 15:28 ` Avi Kivity
2009-05-11 15:36 ` Langsdorf, Mark
0 siblings, 1 reply; 44+ messages in thread
From: Avi Kivity @ 2009-05-11 15:28 UTC (permalink / raw)
To: Ingo Molnar
Cc: Peter Zijlstra, Mark Langsdorf, Joerg Roedel, kvm, linux-kernel
Ingo Molnar wrote:
> * Avi Kivity <avi@redhat.com> wrote:
>
>
>>> I.e. the 3000 cycles value itself could be eliminated as well.
>>> (with just a common-sense max of say 100,000 cycles enforced)
>>>
>> Yeah, though that has a much smaller effect as it's only
>> responsible for a few microseconds of spinning.
>>
>
> 3000 cycles would be 1-2 usecs. Isnt the VM exit+entry cost still in
> that range?
>
It's 3000 executions of rep nop, so you need to account for the entire
spinlock loop body.
The Linux spinlock is
"1:\t"
"cmpl %0, %2\n\t"
"je 2f\n\t"
"rep ; nop\n\t"
"movzwl %1, %2\n\t"
/* don't need lfence here, because loads are in-order */
"jmp 1b\n"
5 instructions, maybe 2-3 cycles, not counting any special rep nop
overhead. Mark, any idea what the spin time is?
VM entry/exit is around 1us on the newer processors.
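As a back-of-the-envelope check only (assumed numbers, nothing here was measured in the thread), converting 3000 iterations into time for a couple of plausible per-iteration costs shows the spin window is several times the ~1 us exit+entry cost even at the low end:

#include <stdio.h>

int main(void)
{
        const double ghz = 3.0;                         /* assumed clock */
        const double iters = 3000;                      /* default filter count */
        const double cyc_per_iter[] = { 3.0, 10.0 };    /* assumed loop cost */
        int i;

        for (i = 0; i < 2; i++) {
                double cycles = iters * cyc_per_iter[i];
                printf("%4.0f cyc/iter -> %6.0f cycles, %4.1f us\n",
                       cyc_per_iter[i], cycles, cycles / (ghz * 1e3));
        }
        return 0;       /* prints ~3 us and ~10 us at 3 GHz */
}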
--
error compiling committee.c: too many arguments to function
* RE: [PATCH][KVM][retry 1] Add support for Pause Filtering to AMD SVM
2009-05-11 15:28 ` Avi Kivity
@ 2009-05-11 15:36 ` Langsdorf, Mark
2009-05-11 15:40 ` Avi Kivity
0 siblings, 1 reply; 44+ messages in thread
From: Langsdorf, Mark @ 2009-05-11 15:36 UTC (permalink / raw)
To: Avi Kivity, Ingo Molnar; +Cc: Peter Zijlstra, Roedel, Joerg, kvm, linux-kernel
> Ingo Molnar wrote:
> > * Avi Kivity <avi@redhat.com> wrote:
> >
> >
> >>> I.e. the 3000 cycles value itself could be eliminated as well.
> >>> (with just a common-sense max of say 100,000 cycles enforced)
> >>>
> >> Yeah, though that has a much smaller effect as it's only
> >> responsible for a few microseconds of spinning.
> >>
> >
> > 3000 cycles would be 1-2 usecs. Isn't the VM exit+entry cost
> > still in that range?
For the processors that support this feature, VM exit+entry is
a little over 2000 cycles.
>
> It's 3000 executions of rep nop, so you need to account for
> the entire
> spinlock loop body.
>
> The Linux spinlock is
>
> "1:\t"
> "cmpl %0, %2\n\t"
> "je 2f\n\t"
> "rep ; nop\n\t"
> "movzwl %1, %2\n\t"
> /* don't need lfence here, because loads are in-order */
> "jmp 1b\n"
>
> 5 instructions, maybe 2-3 cycles, not counting any special rep nop
> overhead. Mark, any idea what the spin time is?
If I'm understanding the question right, the contested
spin locks are being held for 5K to 10K iterations of PAUSE.
So 10K to 30K cycles if your estimate of the spinlock
cycle time is correct.
-Mark Langsdorf
Operating System Research Center
AMD
* Re: [PATCH][KVM][retry 1] Add support for Pause Filtering to AMD SVM
2009-05-11 15:36 ` Langsdorf, Mark
@ 2009-05-11 15:40 ` Avi Kivity
0 siblings, 0 replies; 44+ messages in thread
From: Avi Kivity @ 2009-05-11 15:40 UTC (permalink / raw)
To: Langsdorf, Mark
Cc: Ingo Molnar, Peter Zijlstra, Roedel, Joerg, kvm, linux-kernel
Langsdorf, Mark wrote:
>> The Linux spinlock is
>>
>> "1:\t"
>> "cmpl %0, %2\n\t"
>> "je 2f\n\t"
>> "rep ; nop\n\t"
>> "movzwl %1, %2\n\t"
>> /* don't need lfence here, because loads are in-order */
>> "jmp 1b\n"
>>
>> 5 instructions, maybe 2-3 cycles, not counting any special rep nop
>> overhead. Mark, any idea what the spin time is?
>>
>
> If I'm understanding the question right, the contested
> spin locks are being held for 5K to 10K iterations of PAUSE.
> So 10K to 30K cycles if your estimate of the spinlock
> cycle time is correct.
>
My estimate is not very reliable. Can you measure this?
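One way to get an actual number rather than an estimate is a tiny userspace microbenchmark that times the same compare/pause loop with rdtsc (a sketch for gcc on x86; the loop never takes the lock, so it always spins the full count):

#include <stdio.h>
#include <stdint.h>

static inline uint64_t rdtsc(void)
{
        uint32_t lo, hi;

        __asm__ __volatile__("rdtsc" : "=a"(lo), "=d"(hi));
        return ((uint64_t)hi << 32) | lo;
}

int main(void)
{
        volatile unsigned int lock = 1; /* never released: always spins */
        const unsigned int iters = 3000;
        unsigned int i;
        uint64_t t0, t1;

        t0 = rdtsc();
        for (i = 0; i < iters; i++) {
                if (lock == 0)
                        break;
                __asm__ __volatile__("rep; nop");       /* pause */
        }
        t1 = rdtsc();

        printf("%.1f cycles per spin iteration\n", (double)(t1 - t0) / iters);
        return 0;
}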
btw, I'd expect you'd get a much more significant improvement on Windows.
Booting 16-vcpu Windows 2008 x64 took forever on my dual core host, and
my guess is spinlock contention. Of course you'd need a working yield;
as Ingo said, schedule() does very little.
--
error compiling committee.c: too many arguments to function
* RE: [PATCH][KVM][retry 1] Add support for Pause Filtering to AMDSVM
2009-05-11 14:59 ` Ingo Molnar
2009-05-11 15:12 ` Avi Kivity
@ 2009-05-11 15:58 ` Langsdorf, Mark
1 sibling, 0 replies; 44+ messages in thread
From: Langsdorf, Mark @ 2009-05-11 15:58 UTC (permalink / raw)
To: Ingo Molnar, Avi Kivity; +Cc: Peter Zijlstra, Roedel, Joerg, kvm, linux-kernel
> >> Please don't even think about using yield for this though -
Oops. I'm glad I waited to get some benchmark results before
submitting that version.
> >> A gradual and linear back-off from the current timeline is
> >> more of a fair negotiation process between vcpus and
> >> results in more or less
> >> sane (and fair) scheduling, and no unnecessary looping.
> >>
> >> You could even do an exponential backoff up to a limit of
> >> 1-10 msecs or so, starting at 100 usecs.
> >
> > Good idea, it eliminates another variable to be tuned.
>
> It could be made fully self-tuning, if the filter threshold can be
> tuned fast enough. (an MSR write? A VM context field update?)
VMCB field update.
So the suggestion is to add a function similar to set_task_cpu()
that increases the vruntime with an exponential backoff? Is
that sufficient to cause a new VCPU to step in? I'm obviously
not very familiar with the scheduler code.
> I.e. the 3000 cycles value itself could be eliminated as well. (with
> just a common-sense max of say 100,000 cycles enforced)
I don't understand what you're saying here. There needs to be
some value in the pause filter counter to trigger the intercept.
-Mark Langsdorf
Operating System Research Center
AMD
* [PATCH][KVM][retry 3] Add support for Pause Filtering to AMD SVM
2009-05-08 17:03 ` [PATCH][KVM][retry 2] " Mark Langsdorf
2009-05-08 18:44 ` Avi Kivity
@ 2009-05-19 18:56 ` Mark Langsdorf
2009-05-20 7:40 ` Ingo Molnar
` (3 more replies)
1 sibling, 4 replies; 44+ messages in thread
From: Mark Langsdorf @ 2009-05-19 18:56 UTC (permalink / raw)
To: Joerg Roedel; +Cc: avi, kvm, linux-kernel
From 67f831e825b64be5dedae9936ff8a60b884959f2 Mon Sep 17 00:00:00 2001
From: mark.langsdorf@amd.com
Date: Tue, 19 May 2009 07:46:11 -0500
Subject: [PATCH]
This feature creates a new field in the VMCB called Pause
Filter Count. If Pause Filter Count is greater than 0 and
intercepting PAUSEs is enabled, the processor will increment
an internal counter when a PAUSE instruction occurs instead
of intercepting. When the internal counter reaches the
Pause Filter Count value, a PAUSE intercept will occur.
This feature can be used to detect contended spinlocks,
especially when the lock holding VCPU is not scheduled.
Rescheduling another VCPU prevents the VCPU seeking the
lock from wasting its quantum by spinning idly. Perform
the reschedule by increasing the credited time on
the VCPU.
Experimental results show that most spinlocks are held
for less than 1000 PAUSE cycles or more than a few
thousand. Default the Pause Filter Counter to 5000 to
detect the contended spinlocks.
Processor support for this feature is indicated by a CPUID
bit.
On a 24 core system running 4 guests each with 16 VCPUs,
this patch improved overall performance of each guest's
32 job kernbench by approximately 1%. Further performance
improvement may be possible with a more sophisticated
yield algorithm.
-Mark Langsdorf
Operating System Research Center
AMD
Signed-off-by: Mark Langsdorf <mark.langsdorf@amd.com>
---
arch/x86/include/asm/svm.h | 3 ++-
arch/x86/kvm/svm.c | 13 +++++++++++++
include/linux/sched.h | 7 +++++++
kernel/sched.c | 5 +++++
4 files changed, 27 insertions(+), 1 deletions(-)
diff --git a/arch/x86/include/asm/svm.h b/arch/x86/include/asm/svm.h
index 85574b7..1fecb7e 100644
--- a/arch/x86/include/asm/svm.h
+++ b/arch/x86/include/asm/svm.h
@@ -57,7 +57,8 @@ struct __attribute__ ((__packed__)) vmcb_control_area {
u16 intercept_dr_write;
u32 intercept_exceptions;
u64 intercept;
- u8 reserved_1[44];
+ u8 reserved_1[42];
+ u16 pause_filter_count;
u64 iopm_base_pa;
u64 msrpm_base_pa;
u64 tsc_offset;
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index ef43a18..86df191 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -45,6 +45,7 @@ MODULE_LICENSE("GPL");
#define SVM_FEATURE_NPT (1 << 0)
#define SVM_FEATURE_LBRV (1 << 1)
#define SVM_FEATURE_SVML (1 << 2)
+#define SVM_FEATURE_PAUSE_FILTER (1 << 10)
#define DEBUGCTL_RESERVED_BITS (~(0x3fULL))
@@ -575,6 +576,11 @@ static void init_vmcb(struct vcpu_svm *svm)
svm->nested_vmcb = 0;
svm->vcpu.arch.hflags = HF_GIF_MASK;
+
+ if (svm_has(SVM_FEATURE_PAUSE_FILTER)) {
+ control->pause_filter_count = 3000;
+ control->intercept |= (1ULL << INTERCEPT_PAUSE);
+ }
}
static int svm_vcpu_reset(struct kvm_vcpu *vcpu)
@@ -2087,6 +2093,12 @@ static int interrupt_window_interception(struct vcpu_svm *svm,
return 1;
}
+static int pause_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
+{
+ set_task_delay(current, 1000000);
+ return 1;
+}
+
static int (*svm_exit_handlers[])(struct vcpu_svm *svm,
struct kvm_run *kvm_run) = {
[SVM_EXIT_READ_CR0] = emulate_on_interception,
@@ -2123,6 +2135,7 @@ static int (*svm_exit_handlers[])(struct vcpu_svm *svm,
[SVM_EXIT_CPUID] = cpuid_interception,
[SVM_EXIT_IRET] = iret_interception,
[SVM_EXIT_INVD] = emulate_on_interception,
+ [SVM_EXIT_PAUSE] = pause_interception,
[SVM_EXIT_HLT] = halt_interception,
[SVM_EXIT_INVLPG] = invlpg_interception,
[SVM_EXIT_INVLPGA] = invalid_op_interception,
diff --git a/include/linux/sched.h b/include/linux/sched.h
index b4c38bc..683bc65 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -2283,6 +2283,9 @@ static inline unsigned int task_cpu(const struct task_struct *p)
return task_thread_info(p)->cpu;
}
+extern void set_task_delay(struct task_struct *p, unsigned int delay);
+
+
extern void set_task_cpu(struct task_struct *p, unsigned int cpu);
#else
@@ -2292,6 +2295,10 @@ static inline unsigned int task_cpu(const struct task_struct *p)
return 0;
}
+void set_task_delay(struct task_struct *p, unsigned int delay)
+{
+}
+
static inline void set_task_cpu(struct task_struct *p, unsigned int cpu)
{
}
diff --git a/kernel/sched.c b/kernel/sched.c
index b902e58..3174620 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -1947,6 +1947,11 @@ task_hot(struct task_struct *p, u64 now, struct sched_domain *sd)
return delta < (s64)sysctl_sched_migration_cost;
}
+void set_task_delay(struct task_struct *p, unsigned int delay)
+{
+ p->se.vruntime += delay;
+}
+EXPORT_SYMBOL(set_task_delay);
void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
{
--
1.6.0.2
* Re: [PATCH][KVM][retry 3] Add support for Pause Filtering to AMD SVM
2009-05-19 18:56 ` [PATCH][KVM][retry 3] " Mark Langsdorf
@ 2009-05-20 7:40 ` Ingo Molnar
2009-05-20 7:59 ` Peter Zijlstra
` (2 subsequent siblings)
3 siblings, 0 replies; 44+ messages in thread
From: Ingo Molnar @ 2009-05-20 7:40 UTC (permalink / raw)
To: Mark Langsdorf, Peter Zijlstra; +Cc: Joerg Roedel, avi, kvm, linux-kernel
* Mark Langsdorf <mark.langsdorf@amd.com> wrote:
> diff --git a/include/linux/sched.h b/include/linux/sched.h
> index b4c38bc..683bc65 100644
> --- a/include/linux/sched.h
> +++ b/include/linux/sched.h
> @@ -2283,6 +2283,9 @@ static inline unsigned int task_cpu(const struct task_struct *p)
> return task_thread_info(p)->cpu;
> }
>
> +extern void set_task_delay(struct task_struct *p, unsigned int delay);
> +++ b/kernel/sched.c
> @@ -1947,6 +1947,11 @@ task_hot(struct task_struct *p, u64 now, struct sched_domain *sd)
> return delta < (s64)sysctl_sched_migration_cost;
> }
>
> +void set_task_delay(struct task_struct *p, unsigned int delay)
> +{
> + p->se.vruntime += delay;
> +}
> +EXPORT_SYMBOL(set_task_delay);
vruntime is nice-level scaled, so this is broken. Please run this
through the scheduler folks; we can get a facility into an isolated
branch for Avi to pull, but it needs some thought.
Ingo
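To make the "nice level scaled" point concrete: CFS advances vruntime in weighted time, so charging a wall-clock delay fairly means converting it with the task's load weight rather than adding raw nanoseconds. A sketch of what that could look like, assuming it lives next to the CFS code so the calc_delta_fair() helper of that era is visible (illustrative only, not a posted patch):

/* Charge 'ns' of wall time to the current task in weighted (vruntime)
 * units, then reschedule. Only meaningful for current, and only for
 * tasks in the fair class. */
void sched_delay_yield_scaled(unsigned long ns)
{
        struct task_struct *curr = current;
        struct sched_entity *se = &curr->se;

        if (curr->sched_class != &fair_sched_class) {
                yield();
                return;
        }

        /* calc_delta_fair() converts wall time into weighted vruntime,
         * so the charge matches how vruntime normally accrues for this
         * task's nice level. */
        se->vruntime += calc_delta_fair(ns, se);
        schedule();
}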
* Re: [PATCH][KVM][retry 3] Add support for Pause Filtering to AMD SVM
2009-05-19 18:56 ` [PATCH][KVM][retry 3] " Mark Langsdorf
2009-05-20 7:40 ` Ingo Molnar
@ 2009-05-20 7:59 ` Peter Zijlstra
2009-05-20 8:38 ` Avi Kivity
2009-05-20 12:00 ` Avi Kivity
2009-05-20 22:25 ` [PATCH][KVM][retry 4] " Mark Langsdorf
3 siblings, 1 reply; 44+ messages in thread
From: Peter Zijlstra @ 2009-05-20 7:59 UTC (permalink / raw)
To: Mark Langsdorf; +Cc: Joerg Roedel, avi, kvm, linux-kernel
On Tue, 2009-05-19 at 13:56 -0500, Mark Langsdorf wrote:
> @@ -1947,6 +1947,11 @@ task_hot(struct task_struct *p, u64 now, struct sched_domain *sd)
> return delta < (s64)sysctl_sched_migration_cost;
> }
>
> +void set_task_delay(struct task_struct *p, unsigned int delay)
> +{
> + p->se.vruntime += delay;
> +}
> +EXPORT_SYMBOL(set_task_delay);
That's broken: you cannot assume that a task is SCHED_OTHER like that.
Furthermore, you cannot simply change the vruntime of any odd task; this
only works for current. Also, you really need to call schedule() after
doing this for it to have any immediate effect.
Also, if you mean delay to be ns, you need to scale it. Furthermore, I
would really really want to export this as GPL only (ok, preferably not
at all).
That said, I still thoroughly dislike this whole approach.
/*
 * Dumb broken yield like interface -- use at your own peril and know
 * RT people will hate you.
 *
 * Like yield, except for SCHED_OTHER/BATCH, where it will give up @ns
 * time for the 'good' cause.
 */
void sched_delay_yield(unsigned long ns)
{
        struct task_struct *curr = current;

        if (curr->sched_class == &fair_sched_class) {
                struct sched_entity *se = &curr->se;

                __update_curr(cfs_rq_of(se), se, ns);
                schedule();
                /* XXX: task accounting ? */
        } else
                sched_yield();
}
EXPORT_SYMBOL_GPL(sched_delay_yield);
* Re: [PATCH][KVM][retry 3] Add support for Pause Filtering to AMD SVM
2009-05-20 7:59 ` Peter Zijlstra
@ 2009-05-20 8:38 ` Avi Kivity
2009-05-20 8:42 ` Peter Zijlstra
0 siblings, 1 reply; 44+ messages in thread
From: Avi Kivity @ 2009-05-20 8:38 UTC (permalink / raw)
To: Peter Zijlstra; +Cc: Mark Langsdorf, Joerg Roedel, kvm, linux-kernel
Peter Zijlstra wrote:
> That said, I still thoroughly dislike this whole approach.
>
Can you explain why? We have a thread that has detected that it's
spinning. Keeping on spinning is a waste of cpu time. Why not let
something else use the cpu?
--
error compiling committee.c: too many arguments to function
* Re: [PATCH][KVM][retry 3] Add support for Pause Filtering to AMD SVM
2009-05-20 8:38 ` Avi Kivity
@ 2009-05-20 8:42 ` Peter Zijlstra
2009-05-20 8:49 ` Avi Kivity
0 siblings, 1 reply; 44+ messages in thread
From: Peter Zijlstra @ 2009-05-20 8:42 UTC (permalink / raw)
To: Avi Kivity; +Cc: Mark Langsdorf, Joerg Roedel, kvm, linux-kernel
On Wed, 2009-05-20 at 11:38 +0300, Avi Kivity wrote:
> Peter Zijlstra wrote:
> > That said, I still thoroughly dislike this whole approach.
> >
>
> Can you explain why? We have a thread that has detected that it's
> spinning. Keeping on spinning is a waste of cpu time. Why not let
> something else use the cpu?
Because it's a polling interface. I much prefer it if we were to get a
wakeup notification when the vcpu holding the lock releases it.
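Purely to illustrate the shape of the interface being asked for here: a wait/kick protocol needs guest cooperation, with the lock slowpath blocking in the hypervisor and the unlocker kicking the waiter. Everything below is made up (the hypercall numbers and the hypercall2() wrapper are hypothetical), and it is exactly the guest-side patching discussed in the replies:

#define HC_LOCK_WAIT    0x100   /* hypothetical hypercall numbers */
#define HC_LOCK_KICK    0x101

/* guest lock slowpath: sleep in the hypervisor until kicked */
static void lock_wait(unsigned long lock_gpa, unsigned int ticket)
{
        hypercall2(HC_LOCK_WAIT, lock_gpa, ticket);
}

/* guest unlock path: wake whichever vcpu is waiting on this ticket */
static void lock_kick(unsigned long lock_gpa, unsigned int ticket)
{
        hypercall2(HC_LOCK_KICK, lock_gpa, ticket);
}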
* Re: [PATCH][KVM][retry 3] Add support for Pause Filtering to AMD SVM
2009-05-20 8:42 ` Peter Zijlstra
@ 2009-05-20 8:49 ` Avi Kivity
2009-05-20 8:54 ` Peter Zijlstra
0 siblings, 1 reply; 44+ messages in thread
From: Avi Kivity @ 2009-05-20 8:49 UTC (permalink / raw)
To: Peter Zijlstra; +Cc: Mark Langsdorf, Joerg Roedel, kvm, linux-kernel
Peter Zijlstra wrote:
>>> That said, I still thoroughly dislike this whole approach.
>>>
>>>
>> Can you explain why? We have a thread that has detected that it's
>> spinning. Keeping on spinning is a waste of cpu time. Why not let
>> something else use the cpu?
>>
>
> Because it's a polling interface. I much prefer it if we were to get a
> wakeup notification when the vcpu holding the lock releases it.
>
It's a fully virtualized guest. There's no way to get this without
patching the guest kernel.
--
error compiling committee.c: too many arguments to function
* Re: [PATCH][KVM][retry 3] Add support for Pause Filtering to AMD SVM
2009-05-20 8:49 ` Avi Kivity
@ 2009-05-20 8:54 ` Peter Zijlstra
2009-05-20 9:04 ` Avi Kivity
0 siblings, 1 reply; 44+ messages in thread
From: Peter Zijlstra @ 2009-05-20 8:54 UTC (permalink / raw)
To: Avi Kivity; +Cc: Mark Langsdorf, Joerg Roedel, kvm, linux-kernel
On Wed, 2009-05-20 at 11:49 +0300, Avi Kivity wrote:
> Peter Zijlstra wrote:
> >>> That said, I still thoroughly dislike this whole approach.
> >>>
> >>>
> >> Can you explain why? We have a thread that has detected that it's
> >> spinning. Keeping on spinning is a waste of cpu time. Why not let
> >> something else use the cpu?
> >>
> >
> > Because it's a polling interface. I much prefer it if we were to get a
> > wakeup notification when the vcpu holding the lock releases it.
> >
>
> It's a fully virtualized guest. There's no way to get this without
> patching the guest kernel.
Yes there is.. virtualized monitor-wait stuff coupled with a
monitor-wait based spinlock implementation.
Once we go change silicon, you might as well do it right.
* Re: [PATCH][KVM][retry 3] Add support for Pause Filtering to AMD SVM
2009-05-20 8:54 ` Peter Zijlstra
@ 2009-05-20 9:04 ` Avi Kivity
2009-05-20 9:10 ` Peter Zijlstra
0 siblings, 1 reply; 44+ messages in thread
From: Avi Kivity @ 2009-05-20 9:04 UTC (permalink / raw)
To: Peter Zijlstra; +Cc: Mark Langsdorf, Joerg Roedel, kvm, linux-kernel
Peter Zijlstra wrote:
>> It's a fully virtualized guest. There's no way to get this without
>> patching the guest kernel.
>>
>
> Yes there is.. virtualized monitor-wait stuff coupled with a
> monitor-wait based spinlock implementation.
>
That only works if the guest uses monitor/mwait. Not all of the guests
are under our control. I don't know whether Windows uses
monitor/mwait. Further, we don't have timed exits on mwait like we do
with pause.
I've also heard that monitor/mwait are very slow and only usable on idle
loop stuff.
> Once we go change silicon, you might as well do it right.
>
None of the major x86 vendors are under my control.
--
error compiling committee.c: too many arguments to function
* Re: [PATCH][KVM][retry 3] Add support for Pause Filtering to AMD SVM
2009-05-20 9:04 ` Avi Kivity
@ 2009-05-20 9:10 ` Peter Zijlstra
2009-05-20 9:17 ` Avi Kivity
2009-05-20 13:52 ` Langsdorf, Mark
0 siblings, 2 replies; 44+ messages in thread
From: Peter Zijlstra @ 2009-05-20 9:10 UTC (permalink / raw)
To: Avi Kivity; +Cc: Mark Langsdorf, Joerg Roedel, kvm, linux-kernel
On Wed, 2009-05-20 at 12:04 +0300, Avi Kivity wrote:
> Peter Zijlstra wrote:
> >> It's a fully virtualized guest. There's no way to get this without
> >> patching the guest kernel.
> >>
> >
> > Yes there is.. virtualized monitor-wait stuff coupled with a
> > monitor-wait based spinlock implementation.
> >
>
> That only works if the guest uses monitor/mwait. Not all of the guests
> are under our control. I don't know whether Windows uses
> monitor/mwait. Further, we don't have timed exits on mwait like we do
> with pause.
Ugh, you really care about crap like windows?
> I've also heard that monitor/mwait are very slow and only usable on idle
> loop stuff.
Yeah, current implementations suck; that doesn't mean it has to stay that
way.
> > Once we go change silicon, you might as well do it right.
> >
>
> None of the major x86 vendors are under my control.
I thought this patch came from AMD, who changed their silicon to 'solve'
one of these virt problems.
/me goes hide again, and pretend all of virt doesn't exist :-) Think
happy thoughts.
* Re: [PATCH][KVM][retry 3] Add support for Pause Filtering to AMD SVM
2009-05-20 9:10 ` Peter Zijlstra
@ 2009-05-20 9:17 ` Avi Kivity
2009-05-20 13:52 ` Langsdorf, Mark
1 sibling, 0 replies; 44+ messages in thread
From: Avi Kivity @ 2009-05-20 9:17 UTC (permalink / raw)
To: Peter Zijlstra; +Cc: Mark Langsdorf, Joerg Roedel, kvm, linux-kernel
Peter Zijlstra wrote:
>>>> It's a fully virtualized guest. There's no way to get this without
>>>> patching the guest kernel.
>>>>
>>>>
>>> Yes there is.. virtualized monitor-wait stuff coupled with a
>>> monitor-wait based spinlock implementation.
>>>
>>>
>> That only works if the guest uses monitor/mwait. Not all of the guests
>> are under our control. I don't know whether Windows uses
>> monitor/mwait. Further, we don't have timed exits on mwait like we do
>> with pause.
>>
>
> Ugh, you really care about crap like windows?
>
Yes, it is used by my users. Either we convince them not to use
Windows, or we find a way to support it well.
>> I've also heard that monitor/mwait are very slow and only usable on idle
>> loop stuff.
>>
>
> Yeah, current implementations suck; that doesn't mean it has to stay that
> way.
>
Well, I'm not speculating on future cpu changes. I'd like to support
current and near-future software and hardware, not how-it-should-have-been-done
software running on how-it-should-have-been-done hardware.
>>> Once we go change silicon, you might as well do it right.
>>>
>>>
>> None of the major x86 vendors are under my control.
>>
>
> I thought this patch came from AMD, who changed their silicon to 'solve'
> one of these virt problems.
>
They changed the silicon to support existing guests. For both Linux and
Windows, the pause instruction is the only indication the guest is spinning.
> /me goes hide again, and pretend all of virt doesn't exist :-) Think
> happy thoughts.
>
You'll end up running permanently in a guest, with no way out.
--
error compiling committee.c: too many arguments to function
* Re: [PATCH][KVM][retry 3] Add support for Pause Filtering to AMD SVM
2009-05-19 18:56 ` [PATCH][KVM][retry 3] " Mark Langsdorf
2009-05-20 7:40 ` Ingo Molnar
2009-05-20 7:59 ` Peter Zijlstra
@ 2009-05-20 12:00 ` Avi Kivity
2009-05-20 22:25 ` [PATCH][KVM][retry 4] " Mark Langsdorf
3 siblings, 0 replies; 44+ messages in thread
From: Avi Kivity @ 2009-05-20 12:00 UTC (permalink / raw)
To: Mark Langsdorf; +Cc: Joerg Roedel, kvm, linux-kernel
Mark Langsdorf wrote:
> On a 24 core system running 4 guests each with 16 VCPUs,
> this patch improved overall performance of each guest's
> 32 job kernbench by approximately 1%. Further performance
> improvement may be possible with a more sophisticated
> yield algorithm.
>
>
This result is approximately what you got on your previous patch. Did
you measure with the new patch? Approximately 1% seems too low.
--
error compiling committee.c: too many arguments to function
* RE: [PATCH][KVM][retry 3] Add support for Pause Filtering to AMD SVM
2009-05-20 9:10 ` Peter Zijlstra
2009-05-20 9:17 ` Avi Kivity
@ 2009-05-20 13:52 ` Langsdorf, Mark
1 sibling, 0 replies; 44+ messages in thread
From: Langsdorf, Mark @ 2009-05-20 13:52 UTC (permalink / raw)
To: Peter Zijlstra, Avi Kivity; +Cc: Roedel, Joerg, kvm, linux-kernel
> > > Once we go change silicon, you might as well do it right.
> > >
> >
> > None of the major x86 vendors are under my control.
>
> I thought this patch came from AMD, who changed their silicon
> to 'solve' one of these virt problems.
Right, and the change we came up with was to provide
a method for filtering PAUSEs and then intercepting
after a certain number.
It may not be the solution you wanted, but I can't
change the silicon design that has the feature at
this point.
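For reference, the CPUID bit the patch keys off via svm_has() lives in the SVM feature leaf: CPUID Fn8000_000A, EDX bit 10. A small userspace check (gcc on an AMD host) could look like this:

#include <stdio.h>
#include <cpuid.h>

int main(void)
{
        unsigned int eax, ebx, ecx, edx;

        if (!__get_cpuid(0x8000000a, &eax, &ebx, &ecx, &edx)) {
                printf("SVM feature leaf not available\n");
                return 1;
        }
        printf("Pause Filter: %s\n",
               (edx & (1u << 10)) ? "supported" : "not supported");
        return 0;
}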
-Mark Langsdorf
Operating System Research Center
AMD
* [PATCH][KVM][retry 4] Add support for Pause Filtering to AMD SVM
2009-05-19 18:56 ` [PATCH][KVM][retry 3] " Mark Langsdorf
` (2 preceding siblings ...)
2009-05-20 12:00 ` Avi Kivity
@ 2009-05-20 22:25 ` Mark Langsdorf
2009-05-21 8:47 ` Avi Kivity
2009-07-08 5:19 ` Sheng Yang
3 siblings, 2 replies; 44+ messages in thread
From: Mark Langsdorf @ 2009-05-20 22:25 UTC (permalink / raw)
To: Joerg Roedel, peterz, Ingo Molnar; +Cc: avi, kvm, linux-kernel
This feature creates a new field in the VMCB called Pause
Filter Count. If Pause Filter Count is greater than 0 and
intercepting PAUSEs is enabled, the processor will increment
an internal counter when a PAUSE instruction occurs instead
of intercepting. When the internal counter reaches the
Pause Filter Count value, a PAUSE intercept will occur.
This feature can be used to detect contended spinlocks,
especially when the lock holding VCPU is not scheduled.
Rescheduling another VCPU prevents the VCPU seeking the
lock from wasting its quantum by spinning idly. Perform
the reschedule by increasing the credited time on
the VCPU.
Experimental results show that most spinlocks are held
for less than 1000 PAUSE cycles or more than a few
thousand. Default the Pause Filter Counter to 3000 to
detect the contended spinlocks.
Processor support for this feature is indicated by a CPUID
bit.
On a 24 core system running 4 guests each with 16 VCPUs,
this patch improved overall performance of each guest's
32 job kernbench by approximately 1%. Further performance
improvement may be possible with a more sophisticated
yield algorithm.
-Mark Langsdorf
Operating System Research Center
AMD
Signed-off-by: Mark Langsdorf <mark.langsdorf@amd.com>
---
arch/x86/include/asm/svm.h | 3 ++-
arch/x86/kvm/svm.c | 13 +++++++++++++
include/linux/sched.h | 7 +++++++
kernel/sched.c | 18 ++++++++++++++++++
4 files changed, 40 insertions(+), 1 deletions(-)
diff --git a/arch/x86/include/asm/svm.h b/arch/x86/include/asm/svm.h
index 85574b7..1fecb7e 100644
--- a/arch/x86/include/asm/svm.h
+++ b/arch/x86/include/asm/svm.h
@@ -57,7 +57,8 @@ struct __attribute__ ((__packed__)) vmcb_control_area {
u16 intercept_dr_write;
u32 intercept_exceptions;
u64 intercept;
- u8 reserved_1[44];
+ u8 reserved_1[42];
+ u16 pause_filter_count;
u64 iopm_base_pa;
u64 msrpm_base_pa;
u64 tsc_offset;
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index ef43a18..dad6c4b 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -45,6 +45,7 @@ MODULE_LICENSE("GPL");
#define SVM_FEATURE_NPT (1 << 0)
#define SVM_FEATURE_LBRV (1 << 1)
#define SVM_FEATURE_SVML (1 << 2)
+#define SVM_FEATURE_PAUSE_FILTER (1 << 10)
#define DEBUGCTL_RESERVED_BITS (~(0x3fULL))
@@ -575,6 +576,11 @@ static void init_vmcb(struct vcpu_svm *svm)
svm->nested_vmcb = 0;
svm->vcpu.arch.hflags = HF_GIF_MASK;
+
+ if (svm_has(SVM_FEATURE_PAUSE_FILTER)) {
+ control->pause_filter_count = 3000;
+ control->intercept |= (1ULL << INTERCEPT_PAUSE);
+ }
}
static int svm_vcpu_reset(struct kvm_vcpu *vcpu)
@@ -2087,6 +2093,12 @@ static int interrupt_window_interception(struct vcpu_svm *svm,
return 1;
}
+static int pause_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
+{
+ sched_delay_yield(1000000);
+ return 1;
+}
+
static int (*svm_exit_handlers[])(struct vcpu_svm *svm,
struct kvm_run *kvm_run) = {
[SVM_EXIT_READ_CR0] = emulate_on_interception,
@@ -2123,6 +2135,7 @@ static int (*svm_exit_handlers[])(struct vcpu_svm *svm,
[SVM_EXIT_CPUID] = cpuid_interception,
[SVM_EXIT_IRET] = iret_interception,
[SVM_EXIT_INVD] = emulate_on_interception,
+ [SVM_EXIT_PAUSE] = pause_interception,
[SVM_EXIT_HLT] = halt_interception,
[SVM_EXIT_INVLPG] = invlpg_interception,
[SVM_EXIT_INVLPGA] = invalid_op_interception,
diff --git a/include/linux/sched.h b/include/linux/sched.h
index b4c38bc..9cde585 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -2283,6 +2283,9 @@ static inline unsigned int task_cpu(const struct task_struct *p)
return task_thread_info(p)->cpu;
}
+extern void sched_delay_yield(unsigned long ns);
+
+
extern void set_task_cpu(struct task_struct *p, unsigned int cpu);
#else
@@ -2292,6 +2295,10 @@ static inline unsigned int task_cpu(const struct task_struct *p)
return 0;
}
+void sched_delay_yield(struct task_struct *p, unsigned int delay)
+{
+}
+
static inline void set_task_cpu(struct task_struct *p, unsigned int cpu)
{
}
diff --git a/kernel/sched.c b/kernel/sched.c
index b902e58..3aed2f6 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -1947,6 +1947,24 @@ task_hot(struct task_struct *p, u64 now, struct sched_domain *sd)
return delta < (s64)sysctl_sched_migration_cost;
}
+/*
+ * Interface for yielding a thread by delaying it for a known
+ * interval. Use at your own risk and not with real-time.
+ *
+ * Like yield, except for SCHED_OTHER/BATCH, where it will
+ * give us @ns time for the 'good' cause.
+ */
+void sched_delay_yield(unsigned long ns)
+{
+ struct task_struct *curr = current;
+ if (curr->sched_class == &fair_sched_class) {
+ struct sched_entity *se = &curr->se;
+ __update_curr(cfs_rq_of(se), se, ns);
+ schedule();
+ } else
+ yield();
+}
+EXPORT_SYMBOL_GPL(sched_delay_yield);
void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
{
--
1.6.0.2
* Re: [PATCH][KVM][retry 4] Add support for Pause Filtering to AMD SVM
2009-05-20 22:25 ` [PATCH][KVM][retry 4] " Mark Langsdorf
@ 2009-05-21 8:47 ` Avi Kivity
2009-07-08 5:19 ` Sheng Yang
1 sibling, 0 replies; 44+ messages in thread
From: Avi Kivity @ 2009-05-21 8:47 UTC (permalink / raw)
To: Mark Langsdorf; +Cc: Joerg Roedel, peterz, Ingo Molnar, kvm, linux-kernel
Mark Langsdorf wrote:
> This feature creates a new field in the VMCB called Pause
> Filter Count. If Pause Filter Count is greater than 0 and
> intercepting PAUSEs is enabled, the processor will increment
> an internal counter when a PAUSE instruction occurs instead
> of intercepting. When the internal counter reaches the
> Pause Filter Count value, a PAUSE intercept will occur.
>
> This feature can be used to detect contended spinlocks,
> especially when the lock holding VCPU is not scheduled.
> Rescheduling another VCPU prevents the VCPU seeking the
> lock from wasting its quantum by spinning idly. Perform
> the reschedule by increasing the credited time on
> the VCPU.
>
> Experimental results show that most spinlocks are held
> for less than 1000 PAUSE cycles or more than a few
> thousand. Default the Pause Filter Counter to 3000 to
> detect the contended spinlocks.
>
> Processor support for this feature is indicated by a CPUID
> bit.
>
> On a 24 core system running 4 guests each with 16 VCPUs,
> this patch improved overall performance of each guest's
> 32 job kernbench by approximately 1%. Further performance
> improvement may be possible with a more sophisticated
> yield algorithm.
>
Please split this into a scheduler patch and a kvm patch.
--
error compiling committee.c: too many arguments to function
* Re: [PATCH][KVM][retry 4] Add support for Pause Filtering to AMD SVM
2009-05-20 22:25 ` [PATCH][KVM][retry 4] " Mark Langsdorf
2009-05-21 8:47 ` Avi Kivity
@ 2009-07-08 5:19 ` Sheng Yang
2009-07-08 14:59 ` Langsdorf, Mark
1 sibling, 1 reply; 44+ messages in thread
From: Sheng Yang @ 2009-07-08 5:19 UTC (permalink / raw)
To: Mark Langsdorf; +Cc: Joerg Roedel, peterz, Ingo Molnar, avi, kvm, linux-kernel
On Thursday 21 May 2009 06:25:17 Mark Langsdorf wrote:
> This feature creates a new field in the VMCB called Pause
> Filter Count. If Pause Filter Count is greater than 0 and
> intercepting PAUSEs is enabled, the processor will increment
> an internal counter when a PAUSE instruction occurs instead
> of intercepting. When the internal counter reaches the
> Pause Filter Count value, a PAUSE intercept will occur.
>
(dig it from archives...)
Any update on the patch (I mean the scheduler part)? I think people agreed on
the approach?
--
regards
Yang, Sheng
> This feature can be used to detect contended spinlocks,
> especially when the lock holding VCPU is not scheduled.
> Rescheduling another VCPU prevents the VCPU seeking the
> lock from wasting its quantum by spinning idly. Perform
> > the reschedule by increasing the credited time on
> the VCPU.
>
> Experimental results show that most spinlocks are held
> for less than 1000 PAUSE cycles or more than a few
> thousand. Default the Pause Filter Counter to 3000 to
> detect the contended spinlocks.
>
> Processor support for this feature is indicated by a CPUID
> bit.
>
> On a 24 core system running 4 guests each with 16 VCPUs,
> this patch improved overall performance of each guest's
> 32 job kernbench by approximately 1%. Further performance
> improvement may be possible with a more sophisticated
> yield algorithm.
>
> -Mark Langsdorf
> Operating System Research Center
> AMD
>
> Signed-off-by: Mark Langsdorf <mark.langsdorf@amd.com>
> ---
> arch/x86/include/asm/svm.h | 3 ++-
> arch/x86/kvm/svm.c | 13 +++++++++++++
> include/linux/sched.h | 7 +++++++
> kernel/sched.c | 18 ++++++++++++++++++
> 4 files changed, 40 insertions(+), 1 deletions(-)
>
> diff --git a/arch/x86/include/asm/svm.h b/arch/x86/include/asm/svm.h
> index 85574b7..1fecb7e 100644
> --- a/arch/x86/include/asm/svm.h
> +++ b/arch/x86/include/asm/svm.h
> @@ -57,7 +57,8 @@ struct __attribute__ ((__packed__)) vmcb_control_area {
> u16 intercept_dr_write;
> u32 intercept_exceptions;
> u64 intercept;
> - u8 reserved_1[44];
> + u8 reserved_1[42];
> + u16 pause_filter_count;
> u64 iopm_base_pa;
> u64 msrpm_base_pa;
> u64 tsc_offset;
> diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
> index ef43a18..dad6c4b 100644
> --- a/arch/x86/kvm/svm.c
> +++ b/arch/x86/kvm/svm.c
> @@ -45,6 +45,7 @@ MODULE_LICENSE("GPL");
> #define SVM_FEATURE_NPT (1 << 0)
> #define SVM_FEATURE_LBRV (1 << 1)
> #define SVM_FEATURE_SVML (1 << 2)
> +#define SVM_FEATURE_PAUSE_FILTER (1 << 10)
>
> #define DEBUGCTL_RESERVED_BITS (~(0x3fULL))
>
> @@ -575,6 +576,11 @@ static void init_vmcb(struct vcpu_svm *svm)
>
> svm->nested_vmcb = 0;
> svm->vcpu.arch.hflags = HF_GIF_MASK;
> +
> + if (svm_has(SVM_FEATURE_PAUSE_FILTER)) {
> + control->pause_filter_count = 3000;
> + control->intercept |= (1ULL << INTERCEPT_PAUSE);
> + }
> }
>
> static int svm_vcpu_reset(struct kvm_vcpu *vcpu)
> @@ -2087,6 +2093,12 @@ static int interrupt_window_interception(struct
> vcpu_svm *svm, return 1;
> }
>
> +static int pause_interception(struct vcpu_svm *svm, struct kvm_run
> *kvm_run) +{
> + sched_delay_yield(1000000);
> + return 1;
> +}
> +
> static int (*svm_exit_handlers[])(struct vcpu_svm *svm,
> struct kvm_run *kvm_run) = {
> [SVM_EXIT_READ_CR0] = emulate_on_interception,
> @@ -2123,6 +2135,7 @@ static int (*svm_exit_handlers[])(struct vcpu_svm
> *svm, [SVM_EXIT_CPUID] = cpuid_interception,
> [SVM_EXIT_IRET] = iret_interception,
> [SVM_EXIT_INVD] = emulate_on_interception,
> + [SVM_EXIT_PAUSE] = pause_interception,
> [SVM_EXIT_HLT] = halt_interception,
> [SVM_EXIT_INVLPG] = invlpg_interception,
> [SVM_EXIT_INVLPGA] = invalid_op_interception,
> diff --git a/include/linux/sched.h b/include/linux/sched.h
> index b4c38bc..9cde585 100644
> --- a/include/linux/sched.h
> +++ b/include/linux/sched.h
> @@ -2283,6 +2283,9 @@ static inline unsigned int task_cpu(const struct
> task_struct *p) return task_thread_info(p)->cpu;
> }
>
> +extern void sched_delay_yield(unsigned long ns);
> +
> +
> extern void set_task_cpu(struct task_struct *p, unsigned int cpu);
>
> #else
> @@ -2292,6 +2295,10 @@ static inline unsigned int task_cpu(const struct
> task_struct *p) return 0;
> }
>
> +void sched_delay_yield(struct task_struct *p, unsigned int delay)
> +{
> +}
> +
> static inline void set_task_cpu(struct task_struct *p, unsigned int cpu)
> {
> }
> diff --git a/kernel/sched.c b/kernel/sched.c
> index b902e58..3aed2f6 100644
> --- a/kernel/sched.c
> +++ b/kernel/sched.c
> @@ -1947,6 +1947,24 @@ task_hot(struct task_struct *p, u64 now, struct
> sched_domain *sd) return delta < (s64)sysctl_sched_migration_cost;
> }
>
> +/*
> + * Interface for yielding a thread by delaying it for a known
> + * interval. Use at your own risk and not with real-time.
> + *
> + * Like yield, except for SCHED_OTHER/BATCH, where it will
> + * give us @ns time for the 'good' cause.
> + */
> +void sched_delay_yield(unsigned long ns)
> +{
> + struct task_struct *curr = current;
> + if (curr->sched_class == &fair_sched_class) {
> + struct sched_entity *se = &curr->se;
> + __update_curr(cfs_rq_of(se), se, ns);
> + schedule();
> + } else
> + yield();
> +}
> +EXPORT_SYMBOL_GPL(sched_delay_yield);
>
> void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
> {
* RE: [PATCH][KVM][retry 4] Add support for Pause Filtering to AMD SVM
2009-07-08 5:19 ` Sheng Yang
@ 2009-07-08 14:59 ` Langsdorf, Mark
2009-07-09 1:50 ` Sheng Yang
0 siblings, 1 reply; 44+ messages in thread
From: Langsdorf, Mark @ 2009-07-08 14:59 UTC (permalink / raw)
To: Sheng Yang; +Cc: Roedel, Joerg, peterz, Ingo Molnar, avi, kvm, linux-kernel
The last variant of the scheduler that I tried
showed worse performance for both the baseline
case (no pause filter enabled) and the test
case (pause filter enabled) versus not changing
the scheduler.
Some other work came up and I haven't had a
chance to experiment with this for a while.
-Mark Langsdorf
Operating System Research Center
AMD
> -----Original Message-----
> From: Sheng Yang [mailto:sheng@linux.intel.com]
> Sent: Wednesday, July 08, 2009 12:20 AM
> To: Langsdorf, Mark
> Cc: Roedel, Joerg; peterz@infradead.org; Ingo Molnar;
> avi@redhat.com; kvm@vger.kernel.org; linux-kernel@vger.kernel.org
> Subject: Re: [PATCH][KVM][retry 4] Add support for Pause
> Filtering to AMD SVM
>
> On Thursday 21 May 2009 06:25:17 Mark Langsdorf wrote:
> > This feature creates a new field in the VMCB called Pause
> > Filter Count. If Pause Filter Count is greater than 0 and
> > intercepting PAUSEs is enabled, the processor will increment
> > an internal counter when a PAUSE instruction occurs instead
> > of intercepting. When the internal counter reaches the
> > Pause Filter Count value, a PAUSE intercept will occur.
> >
>
> (dig it from archives...)
>
> Any update on the patch (I mean the scheduler part)? I think
> people agreed on
> the approach?
>
> --
> regards
> Yang, Sheng
>
> > This feature can be used to detect contended spinlocks,
> > especially when the lock holding VCPU is not scheduled.
> > Rescheduling another VCPU prevents the VCPU seeking the
> > lock from wasting its quantum by spinning idly. Perform
> > the reschedule by increasing the credited time on
> > the VCPU.
> >
> > Experimental results show that most spinlocks are held
> > for less than 1000 PAUSE cycles or more than a few
> > thousand. Default the Pause Filter Counter to 3000 to
> > detect the contended spinlocks.
> >
> > Processor support for this feature is indicated by a CPUID
> > bit.
> >
> > On a 24 core system running 4 guests each with 16 VCPUs,
> > this patch improved overall performance of each guest's
> > 32 job kernbench by approximately 1%. Further performance
> > improvement may be possible with a more sophisticated
> > yield algorithm.
> >
> > -Mark Langsdorf
> > Operating System Research Center
> > AMD
> >
> > Signed-off-by: Mark Langsdorf <mark.langsdorf@amd.com>
> > ---
> > arch/x86/include/asm/svm.h | 3 ++-
> > arch/x86/kvm/svm.c | 13 +++++++++++++
> > include/linux/sched.h | 7 +++++++
> > kernel/sched.c | 18 ++++++++++++++++++
> > 4 files changed, 40 insertions(+), 1 deletions(-)
> >
> > diff --git a/arch/x86/include/asm/svm.h b/arch/x86/include/asm/svm.h
> > index 85574b7..1fecb7e 100644
> > --- a/arch/x86/include/asm/svm.h
> > +++ b/arch/x86/include/asm/svm.h
> > @@ -57,7 +57,8 @@ struct __attribute__ ((__packed__))
> vmcb_control_area {
> > u16 intercept_dr_write;
> > u32 intercept_exceptions;
> > u64 intercept;
> > - u8 reserved_1[44];
> > + u8 reserved_1[42];
> > + u16 pause_filter_count;
> > u64 iopm_base_pa;
> > u64 msrpm_base_pa;
> > u64 tsc_offset;
> > diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
> > index ef43a18..dad6c4b 100644
> > --- a/arch/x86/kvm/svm.c
> > +++ b/arch/x86/kvm/svm.c
> > @@ -45,6 +45,7 @@ MODULE_LICENSE("GPL");
> > #define SVM_FEATURE_NPT (1 << 0)
> > #define SVM_FEATURE_LBRV (1 << 1)
> > #define SVM_FEATURE_SVML (1 << 2)
> > +#define SVM_FEATURE_PAUSE_FILTER (1 << 10)
> >
> > #define DEBUGCTL_RESERVED_BITS (~(0x3fULL))
> >
> > @@ -575,6 +576,11 @@ static void init_vmcb(struct vcpu_svm *svm)
> >
> > svm->nested_vmcb = 0;
> > svm->vcpu.arch.hflags = HF_GIF_MASK;
> > +
> > + if (svm_has(SVM_FEATURE_PAUSE_FILTER)) {
> > + control->pause_filter_count = 3000;
> > + control->intercept |= (1ULL << INTERCEPT_PAUSE);
> > + }
> > }
> >
> > static int svm_vcpu_reset(struct kvm_vcpu *vcpu)
> > @@ -2087,6 +2093,12 @@ static int
> interrupt_window_interception(struct
> > vcpu_svm *svm, return 1;
> > }
> >
> > +static int pause_interception(struct vcpu_svm *svm, struct kvm_run
> > *kvm_run) +{
> > + sched_delay_yield(1000000);
> > + return 1;
> > +}
> > +
> > static int (*svm_exit_handlers[])(struct vcpu_svm *svm,
> > struct kvm_run *kvm_run) = {
> > [SVM_EXIT_READ_CR0] =
> emulate_on_interception,
> > @@ -2123,6 +2135,7 @@ static int
> (*svm_exit_handlers[])(struct vcpu_svm
> > *svm, [SVM_EXIT_CPUID] = cpuid_interception,
> > [SVM_EXIT_IRET] = iret_interception,
> > [SVM_EXIT_INVD] =
> emulate_on_interception,
> > + [SVM_EXIT_PAUSE] = pause_interception,
> > [SVM_EXIT_HLT] = halt_interception,
> > [SVM_EXIT_INVLPG] = invlpg_interception,
> > [SVM_EXIT_INVLPGA] =
> invalid_op_interception,
> > diff --git a/include/linux/sched.h b/include/linux/sched.h
> > index b4c38bc..9cde585 100644
> > --- a/include/linux/sched.h
> > +++ b/include/linux/sched.h
> > @@ -2283,6 +2283,9 @@ static inline unsigned int
> task_cpu(const struct
> > task_struct *p) return task_thread_info(p)->cpu;
> > }
> >
> > +extern void sched_delay_yield(unsigned long ns);
> > +
> > +
> > extern void set_task_cpu(struct task_struct *p, unsigned int cpu);
> >
> > #else
> > @@ -2292,6 +2295,10 @@ static inline unsigned int
> task_cpu(const struct
> > task_struct *p) return 0;
> > }
> >
> > +void sched_delay_yield(struct task_struct *p, unsigned int delay)
> > +{
> > +}
> > +
> > static inline void set_task_cpu(struct task_struct *p,
> unsigned int cpu)
> > {
> > }
> > diff --git a/kernel/sched.c b/kernel/sched.c
> > index b902e58..3aed2f6 100644
> > --- a/kernel/sched.c
> > +++ b/kernel/sched.c
> > @@ -1947,6 +1947,24 @@ task_hot(struct task_struct *p, u64
> now, struct
> > sched_domain *sd) return delta < (s64)sysctl_sched_migration_cost;
> > }
> >
> > +/*
> > + * Interface for yielding a thread by delaying it for a known
> > + * interval. Use at your own risk and not with real-time.
> > + *
> > + * Like yield, except for SCHED_OTHER/BATCH, where it will
> > + * give us @ns time for the 'good' cause.
> > + */
> > +void sched_delay_yield(unsigned long ns)
> > +{
> > + struct task_struct *curr = current;
> > + if (curr->sched_class == &fair_sched_class) {
> > + struct sched_entity *se = &curr->se;
> > + __update_curr(cfs_rq_of(se), se, ns);
> > + schedule();
> > + } else
> > + yield();
> > +}
> > +EXPORT_SYMBOL_GPL(sched_delay_yield);
> >
> > void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
> > {
>
>
>
>
* Re: [PATCH][KVM][retry 4] Add support for Pause Filtering to AMD SVM
2009-07-08 14:59 ` Langsdorf, Mark
@ 2009-07-09 1:50 ` Sheng Yang
2009-07-22 22:40 ` Langsdorf, Mark
0 siblings, 1 reply; 44+ messages in thread
From: Sheng Yang @ 2009-07-09 1:50 UTC (permalink / raw)
To: Langsdorf, Mark
Cc: Roedel, Joerg, peterz, Ingo Molnar, avi, kvm, linux-kernel,
Edwin Zhai
On Wednesday 08 July 2009 22:59:55 Langsdorf, Mark wrote:
> The last variant of the scheduler that I tried
> showed worse performance for both the baseline
> case (no pause filter enabled) and the test
> case (pause filter enabled) versus not changing
> the scheduler.
>
> Some other work came up and I haven't had a
> chance to experiment with this for a while.
Um, I am afraid we have a different result... With your scheduler patch, we
got 1% more performance improvement in the quick test. Of course more tests
are needed to find a better value of delay.
Do you have time to work on it soon? Maybe we can help to push the
scheduler part. (oh, as you know, we need to push our PLE...)
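One way to make the delay easy to experiment with, without rebuilding for every candidate value, would be a module parameter on kvm-amd. A sketch only; the pause_delay_ns name is made up and not in any posted patch:

static unsigned long pause_delay_ns = 1000000;  /* 1 ms, as in retry 4 */
module_param(pause_delay_ns, ulong, 0644);
MODULE_PARM_DESC(pause_delay_ns, "ns credited to a vcpu on a PAUSE intercept");

static int pause_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
{
        sched_delay_yield(pause_delay_ns);
        return 1;
}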
--
regards
Yang, Sheng
> -Mark Langsdorf
> Operating System Research Center
> AMD
>
> > -----Original Message-----
> > From: Sheng Yang [mailto:sheng@linux.intel.com]
> > Sent: Wednesday, July 08, 2009 12:20 AM
> > To: Langsdorf, Mark
> > Cc: Roedel, Joerg; peterz@infradead.org; Ingo Molnar;
> > avi@redhat.com; kvm@vger.kernel.org; linux-kernel@vger.kernel.org
> > Subject: Re: [PATCH][KVM][retry 4] Add support for Pause
> > Filtering to AMD SVM
> >
> > On Thursday 21 May 2009 06:25:17 Mark Langsdorf wrote:
> > > This feature creates a new field in the VMCB called Pause
> > > Filter Count. If Pause Filter Count is greater than 0 and
> > > intercepting PAUSEs is enabled, the processor will increment
> > > an internal counter when a PAUSE instruction occurs instead
> > > of intercepting. When the internal counter reaches the
> > > Pause Filter Count value, a PAUSE intercept will occur.
> >
> > (dig it from archives...)
> >
> > Any update on the patch (I mean the scheduler part)? I think
> > people agreed on
> > the approach?
> >
> > --
> > regards
> > Yang, Sheng
> >
> > > This feature can be used to detect contended spinlocks,
> > > especially when the lock holding VCPU is not scheduled.
> > > Rescheduling another VCPU prevents the VCPU seeking the
> > > lock from wasting its quantum by spinning idly. Perform
> > > the reschedule by increasing the credited time on
> > > the VCPU.
> > >
> > > Experimental results show that most spinlocks are held
> > > for less than 1000 PAUSE cycles or more than a few
> > > thousand. Default the Pause Filter Counter to 3000 to
> > > detect the contended spinlocks.
> > >
> > > Processor support for this feature is indicated by a CPUID
> > > bit.
> > >
> > > On a 24 core system running 4 guests each with 16 VCPUs,
> > > this patch improved overall performance of each guest's
> > > 32 job kernbench by approximately 1%. Further performance
> > > improvement may be possible with a more sophisticated
> > > yield algorithm.
> > >
> > > -Mark Langsdorf
> > > Operating System Research Center
> > > AMD
> > >
> > > Signed-off-by: Mark Langsdorf <mark.langsdorf@amd.com>
> > > ---
> > > arch/x86/include/asm/svm.h | 3 ++-
> > > arch/x86/kvm/svm.c | 13 +++++++++++++
> > > include/linux/sched.h | 7 +++++++
> > > kernel/sched.c | 18 ++++++++++++++++++
> > > 4 files changed, 40 insertions(+), 1 deletions(-)
> > >
> > > diff --git a/arch/x86/include/asm/svm.h b/arch/x86/include/asm/svm.h
> > > index 85574b7..1fecb7e 100644
> > > --- a/arch/x86/include/asm/svm.h
> > > +++ b/arch/x86/include/asm/svm.h
> > > @@ -57,7 +57,8 @@ struct __attribute__ ((__packed__))
> >
> > vmcb_control_area {
> >
> > > u16 intercept_dr_write;
> > > u32 intercept_exceptions;
> > > u64 intercept;
> > > - u8 reserved_1[44];
> > > + u8 reserved_1[42];
> > > + u16 pause_filter_count;
> > > u64 iopm_base_pa;
> > > u64 msrpm_base_pa;
> > > u64 tsc_offset;
> > > diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
> > > index ef43a18..dad6c4b 100644
> > > --- a/arch/x86/kvm/svm.c
> > > +++ b/arch/x86/kvm/svm.c
> > > @@ -45,6 +45,7 @@ MODULE_LICENSE("GPL");
> > > #define SVM_FEATURE_NPT (1 << 0)
> > > #define SVM_FEATURE_LBRV (1 << 1)
> > > #define SVM_FEATURE_SVML (1 << 2)
> > > +#define SVM_FEATURE_PAUSE_FILTER (1 << 10)
> > >
> > > #define DEBUGCTL_RESERVED_BITS (~(0x3fULL))
> > >
> > > @@ -575,6 +576,11 @@ static void init_vmcb(struct vcpu_svm *svm)
> > >
> > > svm->nested_vmcb = 0;
> > > svm->vcpu.arch.hflags = HF_GIF_MASK;
> > > +
> > > + if (svm_has(SVM_FEATURE_PAUSE_FILTER)) {
> > > + control->pause_filter_count = 3000;
> > > + control->intercept |= (1ULL << INTERCEPT_PAUSE);
> > > + }
> > > }
> > >
> > > static int svm_vcpu_reset(struct kvm_vcpu *vcpu)
> > > @@ -2087,6 +2093,12 @@ static int
> >
> > interrupt_window_interception(struct
> >
> > > vcpu_svm *svm, return 1;
> > > }
> > >
> > > +static int pause_interception(struct vcpu_svm *svm, struct kvm_run
> > > *kvm_run) +{
> > > + sched_delay_yield(1000000);
> > > + return 1;
> > > +}
> > > +
> > > static int (*svm_exit_handlers[])(struct vcpu_svm *svm,
> > > struct kvm_run *kvm_run) = {
> > > [SVM_EXIT_READ_CR0] =
> >
> > emulate_on_interception,
> >
> > > @@ -2123,6 +2135,7 @@ static int
> >
> > (*svm_exit_handlers[])(struct vcpu_svm
> >
> > > *svm, [SVM_EXIT_CPUID] = cpuid_interception,
> > > [SVM_EXIT_IRET] = iret_interception,
> > > [SVM_EXIT_INVD] =
> >
> > emulate_on_interception,
> >
> > > + [SVM_EXIT_PAUSE] = pause_interception,
> > > [SVM_EXIT_HLT] = halt_interception,
> > > [SVM_EXIT_INVLPG] = invlpg_interception,
> > > [SVM_EXIT_INVLPGA] =
> >
> > invalid_op_interception,
> >
> > > diff --git a/include/linux/sched.h b/include/linux/sched.h
> > > index b4c38bc..9cde585 100644
> > > --- a/include/linux/sched.h
> > > +++ b/include/linux/sched.h
> > > @@ -2283,6 +2283,9 @@ static inline unsigned int
> >
> > task_cpu(const struct
> >
> > > task_struct *p) return task_thread_info(p)->cpu;
> > > }
> > >
> > > +extern void sched_delay_yield(unsigned long ns);
> > > +
> > > +
> > > extern void set_task_cpu(struct task_struct *p, unsigned int cpu);
> > >
> > > #else
> > > @@ -2292,6 +2295,10 @@ static inline unsigned int
> >
> > task_cpu(const struct
> >
> > > task_struct *p) return 0;
> > > }
> > >
> > > +void sched_delay_yield(struct task_struct *p, unsigned int delay)
> > > +{
> > > +}
> > > +
> > > static inline void set_task_cpu(struct task_struct *p,
> >
> > unsigned int cpu)
> >
> > > {
> > > }
> > > diff --git a/kernel/sched.c b/kernel/sched.c
> > > index b902e58..3aed2f6 100644
> > > --- a/kernel/sched.c
> > > +++ b/kernel/sched.c
> > > @@ -1947,6 +1947,24 @@ task_hot(struct task_struct *p, u64
> >
> > now, struct
> >
> > > sched_domain *sd) return delta < (s64)sysctl_sched_migration_cost;
> > > }
> > >
> > > +/*
> > > + * Interface for yielding a thread by delaying it for a known
> > > + * interval. Use at your own risk and not with real-time.
> > > + *
> > > + * Like yield, except for SCHED_OTHER/BATCH, where it will
> > > + * give us @ns time for the 'good' cause.
> > > + */
> > > +void sched_delay_yield(unsigned long ns)
> > > +{
> > > + struct task_struct *curr = current;
> > > + if (curr->sched_class == &fair_sched_class) {
> > > + struct sched_entity *se = &curr->se;
> > > + __update_curr(cfs_rq_of(se), se, ns);
> > > + schedule();
> > > + } else
> > > + yield();
> > > +}
> > > +EXPORT_SYMBOL_GPL(sched_delay_yield);
> > >
> > > void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
> > > {
* RE: [PATCH][KVM][retry 4] Add support for Pause Filtering to AMD SVM
2009-07-09 1:50 ` Sheng Yang
@ 2009-07-22 22:40 ` Langsdorf, Mark
2009-08-05 9:08 ` Zhai, Edwin
0 siblings, 1 reply; 44+ messages in thread
From: Langsdorf, Mark @ 2009-07-22 22:40 UTC (permalink / raw)
To: Sheng Yang
Cc: Roedel, Joerg, peterz, Ingo Molnar, avi, kvm, linux-kernel,
Edwin Zhai
> Um, I am afraid we have a different result... With your
> scheduler patch, we got 1% more performance improvement
> in the quick test. Of course more tests are needed to
> find a better value of delay.
What was your test case? How many runs did you do?
My results had a lot of variance in them.
> Do you have time to work on it soon?
I've been working with the former VI guys on some scheduler
improvements for Xen, and hope to get back to this sometime
next week.
-Mark Langsdorf
Operating System Research Center
AMD
* Re: [PATCH][KVM][retry 4] Add support for Pause Filtering to AMD SVM
2009-07-22 22:40 ` Langsdorf, Mark
@ 2009-08-05 9:08 ` Zhai, Edwin
0 siblings, 0 replies; 44+ messages in thread
From: Zhai, Edwin @ 2009-08-05 9:08 UTC (permalink / raw)
To: Langsdorf, Mark
Cc: Sheng Yang, Roedel, Joerg, peterz@infradead.org, Ingo Molnar,
avi@redhat.com, kvm@vger.kernel.org, linux-kernel@vger.kernel.org,
Zhai, Edwin
Mark,
Do you have time to push the Linux scheduler changes for Pause Filtering
now? It's almost done after your work. If you need any help, please let me
know.
Thanks,
edwin
Langsdorf, Mark wrote:
>> Um, I am afraid we have a different result... With your
>> scheduler patch, we got 1% more performance improvement
>> in the quick test. Of course more tests are needed to
>> find a better value of delay.
>>
>
> What was your test case? How many runs did you do?
> My results had a lot of variance in them.
>
>
>> Do you have time to work on it soon?
>>
>
> I've been working with the former VI guys on some scheduler
> improvements for Xen, and hope to get back to this sometime
> next week.
>
> -Mark Langsdorf
> Operating System Research Center
> AMD
>
>
Thread overview: 44+ messages
2009-05-05 14:09 [PATCH][KVM] Add support for Pause Filtering to AMD SVM Mark Langsdorf
2009-05-05 16:05 ` Bert Wesarg
2009-05-07 13:55 ` Joerg Roedel
2009-05-07 15:00 ` [PATCH][KVM][retry 1] " Mark Langsdorf
2009-05-07 15:31 ` Avi Kivity
2009-05-11 14:15 ` Ingo Molnar
2009-05-11 14:24 ` Avi Kivity
2009-05-11 14:33 ` Ingo Molnar
2009-05-11 14:51 ` Avi Kivity
2009-05-11 14:59 ` Ingo Molnar
2009-05-11 15:12 ` Avi Kivity
2009-05-11 15:18 ` Ingo Molnar
2009-05-11 15:28 ` Avi Kivity
2009-05-11 15:36 ` Langsdorf, Mark
2009-05-11 15:40 ` Avi Kivity
2009-05-11 15:58 ` [PATCH][KVM][retry 1] Add support for Pause Filtering to AMDSVM Langsdorf, Mark
2009-05-11 15:01 ` [PATCH][KVM][retry 1] Add support for Pause Filtering to AMD SVM Peter Zijlstra
2009-05-11 15:06 ` Avi Kivity
2009-05-11 14:42 ` Peter Zijlstra
2009-05-11 15:05 ` Avi Kivity
2009-05-08 17:03 ` [PATCH][KVM][retry 2] " Mark Langsdorf
2009-05-08 18:44 ` Avi Kivity
2009-05-08 18:47 ` Langsdorf, Mark
2009-05-19 18:56 ` [PATCH][KVM][retry 3] " Mark Langsdorf
2009-05-20 7:40 ` Ingo Molnar
2009-05-20 7:59 ` Peter Zijlstra
2009-05-20 8:38 ` Avi Kivity
2009-05-20 8:42 ` Peter Zijlstra
2009-05-20 8:49 ` Avi Kivity
2009-05-20 8:54 ` Peter Zijlstra
2009-05-20 9:04 ` Avi Kivity
2009-05-20 9:10 ` Peter Zijlstra
2009-05-20 9:17 ` Avi Kivity
2009-05-20 13:52 ` Langsdorf, Mark
2009-05-20 12:00 ` Avi Kivity
2009-05-20 22:25 ` [PATCH][KVM][retry 4] " Mark Langsdorf
2009-05-21 8:47 ` Avi Kivity
2009-07-08 5:19 ` Sheng Yang
2009-07-08 14:59 ` Langsdorf, Mark
2009-07-09 1:50 ` Sheng Yang
2009-07-22 22:40 ` Langsdorf, Mark
2009-08-05 9:08 ` Zhai, Edwin
2009-05-11 14:38 ` [PATCH][KVM] " Peter Zijlstra
2009-05-11 14:51 ` Ingo Molnar