public inbox for virtualization@lists.linux-foundation.org
 help / color / mirror / Atom feed
* [RFC PATCH v2 1/7] x86/paravirt: Add pv_idle_ops to paravirt ops
       [not found] <1504007201-12904-1-git-send-email-yang.zhang.wz@gmail.com>
@ 2017-08-29 11:46 ` Yang Zhang
  2017-08-29 13:55   ` Konrad Rzeszutek Wilk
  2017-08-29 11:46 ` [RFC PATCH v2 4/7] x86/paravirt: Add update in x86/paravirt pv_idle_ops Yang Zhang
                   ` (2 subsequent siblings)
  3 siblings, 1 reply; 7+ messages in thread
From: Yang Zhang @ 2017-08-29 11:46 UTC (permalink / raw)
  To: linux-kernel
  Cc: Yang Zhang, Jeremy Fitzhardinge, kvm, rkrcmar, peterz, Pan Xinhui,
	virtualization, H. Peter Anvin, Alok Kataria, wanpeng.li, x86,
	Ingo Molnar, Kees Cook, Chris Wright, Andy Lutomirski, dmatlack,
	tglx, Quan Xu, linux-doc, mst, pbonzini, Kirill A. Shutemov

So far, pv_idle_ops.poll is the only ops for pv_idle. .poll is called in
idle path which will polling for a while before we enter the real idle
state.

In virtualization, idle path includes several heavy operations
includes timer access(LAPIC timer or TSC deadline timer) which will hurt
performance especially for latency intensive workload like message
passing task. The cost is mainly come from the vmexit which is a
hardware context switch between VM and hypervisor. Our solution is to
poll for a while and do not enter real idle path if we can get the
schedule event during polling.

Poll may cause the CPU waste so we adopt a smart polling mechanism to
reduce the useless poll.

Signed-off-by: Yang Zhang <yang.zhang.wz@gmail.com>
Signed-off-by: Quan Xu <quan.xu0@gmail.com>
Cc: Jeremy Fitzhardinge <jeremy@goop.org>
Cc: Chris Wright <chrisw@sous-sol.org>
Cc: Alok Kataria <akataria@vmware.com>
Cc: Rusty Russell <rusty@rustcorp.com.au>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: x86@kernel.org
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com>
Cc: Pan Xinhui <xinhui.pan@linux.vnet.ibm.com>
Cc: Kees Cook <keescook@chromium.org>
Cc: virtualization@lists.linux-foundation.org
Cc: linux-kernel@vger.kernel.org
---
 arch/x86/include/asm/paravirt.h       | 5 +++++
 arch/x86/include/asm/paravirt_types.h | 6 ++++++
 arch/x86/kernel/paravirt.c            | 6 ++++++
 3 files changed, 17 insertions(+)

diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h
index 9ccac19..6d46760 100644
--- a/arch/x86/include/asm/paravirt.h
+++ b/arch/x86/include/asm/paravirt.h
@@ -202,6 +202,11 @@ static inline unsigned long long paravirt_read_pmc(int counter)
 
 #define rdpmcl(counter, val) ((val) = paravirt_read_pmc(counter))
 
+static inline void paravirt_idle_poll(void)
+{
+	PVOP_VCALL0(pv_idle_ops.poll);
+}
+
 static inline void paravirt_alloc_ldt(struct desc_struct *ldt, unsigned entries)
 {
 	PVOP_VCALL2(pv_cpu_ops.alloc_ldt, ldt, entries);
diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h
index 9ffc36b..cf45726 100644
--- a/arch/x86/include/asm/paravirt_types.h
+++ b/arch/x86/include/asm/paravirt_types.h
@@ -324,6 +324,10 @@ struct pv_lock_ops {
 	struct paravirt_callee_save vcpu_is_preempted;
 } __no_randomize_layout;
 
+struct pv_idle_ops {
+	void (*poll)(void);
+} __no_randomize_layout;
+
 /* This contains all the paravirt structures: we get a convenient
  * number for each function using the offset which we use to indicate
  * what to patch. */
@@ -334,6 +338,7 @@ struct paravirt_patch_template {
 	struct pv_irq_ops pv_irq_ops;
 	struct pv_mmu_ops pv_mmu_ops;
 	struct pv_lock_ops pv_lock_ops;
+	struct pv_idle_ops pv_idle_ops;
 } __no_randomize_layout;
 
 extern struct pv_info pv_info;
@@ -343,6 +348,7 @@ struct paravirt_patch_template {
 extern struct pv_irq_ops pv_irq_ops;
 extern struct pv_mmu_ops pv_mmu_ops;
 extern struct pv_lock_ops pv_lock_ops;
+extern struct pv_idle_ops pv_idle_ops;
 
 #define PARAVIRT_PATCH(x)					\
 	(offsetof(struct paravirt_patch_template, x) / sizeof(void *))
diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c
index bc0a849..1b5b247 100644
--- a/arch/x86/kernel/paravirt.c
+++ b/arch/x86/kernel/paravirt.c
@@ -128,6 +128,7 @@ static void *get_call_destination(u8 type)
 #ifdef CONFIG_PARAVIRT_SPINLOCKS
 		.pv_lock_ops = pv_lock_ops,
 #endif
+		.pv_idle_ops = pv_idle_ops,
 	};
 	return *((void **)&tmpl + type);
 }
@@ -312,6 +313,10 @@ struct pv_time_ops pv_time_ops = {
 	.steal_clock = native_steal_clock,
 };
 
+struct pv_idle_ops pv_idle_ops = {
+	.poll = paravirt_nop,
+};
+
 __visible struct pv_irq_ops pv_irq_ops = {
 	.save_fl = __PV_IS_CALLEE_SAVE(native_save_fl),
 	.restore_fl = __PV_IS_CALLEE_SAVE(native_restore_fl),
@@ -471,3 +476,4 @@ struct pv_mmu_ops pv_mmu_ops __ro_after_init = {
 EXPORT_SYMBOL    (pv_mmu_ops);
 EXPORT_SYMBOL_GPL(pv_info);
 EXPORT_SYMBOL    (pv_irq_ops);
+EXPORT_SYMBOL    (pv_idle_ops);
-- 
1.8.3.1

^ permalink raw reply related	[flat|nested] 7+ messages in thread

* [RFC PATCH v2 4/7] x86/paravirt: Add update in x86/paravirt pv_idle_ops
       [not found] <1504007201-12904-1-git-send-email-yang.zhang.wz@gmail.com>
  2017-08-29 11:46 ` [RFC PATCH v2 1/7] x86/paravirt: Add pv_idle_ops to paravirt ops Yang Zhang
@ 2017-08-29 11:46 ` Yang Zhang
  2017-08-29 11:46 ` [RFC PATCH v2 5/7] Documentation: Add three sysctls for smart idle poll Yang Zhang
       [not found] ` <1504007201-12904-6-git-send-email-yang.zhang.wz@gmail.com>
  3 siblings, 0 replies; 7+ messages in thread
From: Yang Zhang @ 2017-08-29 11:46 UTC (permalink / raw)
  To: linux-kernel
  Cc: Yang Zhang, Jeremy Fitzhardinge, kvm, rkrcmar, peterz, Pan Xinhui,
	virtualization, H. Peter Anvin, Alok Kataria, wanpeng.li, x86,
	Ingo Molnar, Kees Cook, Chris Wright, Andy Lutomirski, dmatlack,
	tglx, Quan Xu, linux-doc, mst, pbonzini, Kirill A. Shutemov

.update is used to adjust the next poll time.

Signed-off-by: Yang Zhang <yang.zhang.wz@gmail.com>
Signed-off-by: Quan Xu <quan.xu0@gmail.com>
Cc: Jeremy Fitzhardinge <jeremy@goop.org>
Cc: Chris Wright <chrisw@sous-sol.org>
Cc: Alok Kataria <akataria@vmware.com>
Cc: Rusty Russell <rusty@rustcorp.com.au>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: x86@kernel.org
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com>
Cc: Pan Xinhui <xinhui.pan@linux.vnet.ibm.com>
Cc: Kees Cook <keescook@chromium.org>
Cc: virtualization@lists.linux-foundation.org
Cc: linux-kernel@vger.kernel.org
---
 arch/x86/include/asm/paravirt.h       | 5 +++++
 arch/x86/include/asm/paravirt_types.h | 1 +
 arch/x86/kernel/paravirt.c            | 1 +
 3 files changed, 7 insertions(+)

diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h
index 6d46760..32e1c06 100644
--- a/arch/x86/include/asm/paravirt.h
+++ b/arch/x86/include/asm/paravirt.h
@@ -207,6 +207,11 @@ static inline void paravirt_idle_poll(void)
 	PVOP_VCALL0(pv_idle_ops.poll);
 }
 
+static inline void paravirt_idle_update_poll_duration(unsigned long duration)
+{
+	PVOP_VCALL1(pv_idle_ops.update, duration);
+}
+
 static inline void paravirt_alloc_ldt(struct desc_struct *ldt, unsigned entries)
 {
 	PVOP_VCALL2(pv_cpu_ops.alloc_ldt, ldt, entries);
diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h
index cf45726..3b4f95a 100644
--- a/arch/x86/include/asm/paravirt_types.h
+++ b/arch/x86/include/asm/paravirt_types.h
@@ -326,6 +326,7 @@ struct pv_lock_ops {
 
 struct pv_idle_ops {
 	void (*poll)(void);
+	void (*update)(unsigned long);
 } __no_randomize_layout;
 
 /* This contains all the paravirt structures: we get a convenient
diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c
index 1b5b247..a11b2c2 100644
--- a/arch/x86/kernel/paravirt.c
+++ b/arch/x86/kernel/paravirt.c
@@ -315,6 +315,7 @@ struct pv_time_ops pv_time_ops = {
 
 struct pv_idle_ops pv_idle_ops = {
 	.poll = paravirt_nop,
+	.update = paravirt_nop,
 };
 
 __visible struct pv_irq_ops pv_irq_ops = {
-- 
1.8.3.1

^ permalink raw reply related	[flat|nested] 7+ messages in thread

* [RFC PATCH v2 5/7] Documentation: Add three sysctls for smart idle poll
       [not found] <1504007201-12904-1-git-send-email-yang.zhang.wz@gmail.com>
  2017-08-29 11:46 ` [RFC PATCH v2 1/7] x86/paravirt: Add pv_idle_ops to paravirt ops Yang Zhang
  2017-08-29 11:46 ` [RFC PATCH v2 4/7] x86/paravirt: Add update in x86/paravirt pv_idle_ops Yang Zhang
@ 2017-08-29 11:46 ` Yang Zhang
       [not found] ` <1504007201-12904-6-git-send-email-yang.zhang.wz@gmail.com>
  3 siblings, 0 replies; 7+ messages in thread
From: Yang Zhang @ 2017-08-29 11:46 UTC (permalink / raw)
  To: linux-kernel
  Cc: Yang Zhang, Jeremy Fitzhardinge, kvm, rkrcmar, peterz,
	virtualization, H. Peter Anvin, Alok Kataria, wanpeng.li,
	Jessica Yu, Baoquan He, Jonathan Corbet, x86, Krzysztof Kozlowski,
	Ingo Molnar, zijun_hu, Petr Mladek, Kees Cook, Johannes Berg,
	Chris Wright, Ian Abbott, Josh Poimboeuf, dmatlack, tglx,
	Mauro Carvalho Chehab, Quan Xu, linux-doc, Luis R. Rodriguez

To reduce the cost of poll, we introduce three sysctl to control the
poll time.

Signed-off-by: Yang Zhang <yang.zhang.wz@gmail.com>
Signed-off-by: Quan Xu <quan.xu0@gmail.com>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Jeremy Fitzhardinge <jeremy@goop.org>
Cc: Chris Wright <chrisw@sous-sol.org>
Cc: Alok Kataria <akataria@vmware.com>
Cc: Rusty Russell <rusty@rustcorp.com.au>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: x86@kernel.org
Cc: "Luis R. Rodriguez" <mcgrof@kernel.org>
Cc: Kees Cook <keescook@chromium.org>
Cc: Mauro Carvalho Chehab <mchehab@kernel.org>
Cc: Krzysztof Kozlowski <krzk@kernel.org>
Cc: Josh Poimboeuf <jpoimboe@redhat.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Petr Mladek <pmladek@suse.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Jessica Yu <jeyu@redhat.com>
Cc: Larry Finger <Larry.Finger@lwfinger.net>
Cc: zijun_hu <zijun_hu@htc.com>
Cc: Baoquan He <bhe@redhat.com>
Cc: Johannes Berg <johannes.berg@intel.com>
Cc: Ian Abbott <abbotti@mev.co.uk>
Cc: linux-doc@vger.kernel.org
Cc: linux-kernel@vger.kernel.org
Cc: virtualization@lists.linux-foundation.org
Cc: linux-fsdevel@vger.kernel.org
---
 Documentation/sysctl/kernel.txt | 25 +++++++++++++++++++++++++
 arch/x86/kernel/paravirt.c      |  4 ++++
 include/linux/kernel.h          |  6 ++++++
 kernel/sysctl.c                 | 23 +++++++++++++++++++++++
 4 files changed, 58 insertions(+)

diff --git a/Documentation/sysctl/kernel.txt b/Documentation/sysctl/kernel.txt
index bac23c1..67447b8 100644
--- a/Documentation/sysctl/kernel.txt
+++ b/Documentation/sysctl/kernel.txt
@@ -63,6 +63,9 @@ show up in /proc/sys/kernel:
 - perf_event_max_stack
 - perf_event_max_contexts_per_stack
 - pid_max
+- poll_grow                   [ X86 only ]
+- poll_shrink                 [ X86 only ]
+- poll_threshold_ns           [ X86 only ]
 - powersave-nap               [ PPC only ]
 - printk
 - printk_delay
@@ -702,6 +705,28 @@ kernel tries to allocate a number starting from this one.
 
 ==============================================================
 
+poll_grow: (X86 only)
+
+This parameter is multiplied in the grow_poll_ns() to increase the poll time.
+By default, the values is 2.
+
+==============================================================
+poll_shrink: (X86 only)
+
+This parameter is divided in the shrink_poll_ns() to reduce the poll time.
+By default, the values is 2.
+
+==============================================================
+poll_threshold_ns: (X86 only)
+
+This parameter controls the max poll time before entering real idle path.
+This parameter is expected to take effect only when running inside a VM.
+It would make no sense to turn on it in bare mental.
+By default, the values is 0 means don't poll. It is recommended to change
+the value to non-zero if running latency-bound workloads inside VM.
+
+==============================================================
+
 powersave-nap: (PPC only)
 
 If set, Linux-PPC will use the 'nap' mode of powersaving,
diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c
index a11b2c2..0b92f8f 100644
--- a/arch/x86/kernel/paravirt.c
+++ b/arch/x86/kernel/paravirt.c
@@ -318,6 +318,10 @@ struct pv_idle_ops pv_idle_ops = {
 	.update = paravirt_nop,
 };
 
+unsigned long poll_threshold_ns;
+unsigned int poll_shrink = 2;
+unsigned int poll_grow = 2;
+
 __visible struct pv_irq_ops pv_irq_ops = {
 	.save_fl = __PV_IS_CALLEE_SAVE(native_save_fl),
 	.restore_fl = __PV_IS_CALLEE_SAVE(native_restore_fl),
diff --git a/include/linux/kernel.h b/include/linux/kernel.h
index bd6d96c..6cb2820 100644
--- a/include/linux/kernel.h
+++ b/include/linux/kernel.h
@@ -462,6 +462,12 @@ extern __scanf(2, 0)
 
 extern bool crash_kexec_post_notifiers;
 
+#ifdef CONFIG_PARAVIRT
+extern unsigned long poll_threshold_ns;
+extern unsigned int poll_shrink;
+extern unsigned int poll_grow;
+#endif
+
 /*
  * panic_cpu is used for synchronizing panic() and crash_kexec() execution. It
  * holds a CPU number which is executing panic() currently. A value of
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 6648fbb..9b86a8f 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -1229,6 +1229,29 @@ static int sysrq_sysctl_handler(struct ctl_table *table, int write,
 		.extra2		= &one,
 	},
 #endif
+#ifdef CONFIG_PARAVIRT
+	{
+		.procname       = "halt_poll_threshold",
+		.data           = &poll_threshold_ns,
+		.maxlen         = sizeof(unsigned long),
+		.mode           = 0644,
+		.proc_handler   = proc_dointvec,
+	},
+	{
+		.procname       = "halt_poll_grow",
+		.data           = &poll_grow,
+		.maxlen         = sizeof(unsigned int),
+		.mode           = 0644,
+		.proc_handler   = proc_dointvec,
+	},
+	{
+		.procname       = "halt_poll_shrink",
+		.data           = &poll_shrink,
+		.maxlen         = sizeof(unsigned int),
+		.mode           = 0644,
+		.proc_handler   = proc_dointvec,
+	},
+#endif
 	{ }
 };
 
-- 
1.8.3.1

^ permalink raw reply related	[flat|nested] 7+ messages in thread

* Re: [RFC PATCH v2 1/7] x86/paravirt: Add pv_idle_ops to paravirt ops
  2017-08-29 11:46 ` [RFC PATCH v2 1/7] x86/paravirt: Add pv_idle_ops to paravirt ops Yang Zhang
@ 2017-08-29 13:55   ` Konrad Rzeszutek Wilk
  2017-08-30  7:33     ` Juergen Gross
  2017-09-01  6:50     ` Yang Zhang
  0 siblings, 2 replies; 7+ messages in thread
From: Konrad Rzeszutek Wilk @ 2017-08-29 13:55 UTC (permalink / raw)
  To: Yang Zhang, xen-devel, jgross, Boris Ostrovsky
  Cc: Jeremy Fitzhardinge, rkrcmar, kvm, mst, peterz, Pan Xinhui,
	virtualization, H. Peter Anvin, Alok Kataria, wanpeng.li, x86,
	Ingo Molnar, Kees Cook, Chris Wright, Andy Lutomirski, dmatlack,
	tglx, Quan Xu, linux-doc, linux-kernel, pbonzini,
	Kirill A. Shutemov

On Tue, Aug 29, 2017 at 11:46:35AM +0000, Yang Zhang wrote:
> So far, pv_idle_ops.poll is the only ops for pv_idle. .poll is called in
> idle path which will polling for a while before we enter the real idle
> state.
> 
> In virtualization, idle path includes several heavy operations
> includes timer access(LAPIC timer or TSC deadline timer) which will hurt
> performance especially for latency intensive workload like message
> passing task. The cost is mainly come from the vmexit which is a
> hardware context switch between VM and hypervisor. Our solution is to
> poll for a while and do not enter real idle path if we can get the
> schedule event during polling.
> 
> Poll may cause the CPU waste so we adopt a smart polling mechanism to
> reduce the useless poll.
> 
> Signed-off-by: Yang Zhang <yang.zhang.wz@gmail.com>
> Signed-off-by: Quan Xu <quan.xu0@gmail.com>
> Cc: Jeremy Fitzhardinge <jeremy@goop.org>
> Cc: Chris Wright <chrisw@sous-sol.org>
> Cc: Alok Kataria <akataria@vmware.com>
> Cc: Rusty Russell <rusty@rustcorp.com.au>
> Cc: Thomas Gleixner <tglx@linutronix.de>
> Cc: Ingo Molnar <mingo@redhat.com>
> Cc: "H. Peter Anvin" <hpa@zytor.com>
> Cc: x86@kernel.org
> Cc: Peter Zijlstra <peterz@infradead.org>
> Cc: Andy Lutomirski <luto@kernel.org>
> Cc: "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com>
> Cc: Pan Xinhui <xinhui.pan@linux.vnet.ibm.com>
> Cc: Kees Cook <keescook@chromium.org>
> Cc: virtualization@lists.linux-foundation.org
> Cc: linux-kernel@vger.kernel.org

Adding xen-devel.

Juergen, we really should replace Jeremy's name with xen-devel or
your name.. Wasn't there an patch by you that took some of the 
mainternship over it?

> ---
>  arch/x86/include/asm/paravirt.h       | 5 +++++
>  arch/x86/include/asm/paravirt_types.h | 6 ++++++
>  arch/x86/kernel/paravirt.c            | 6 ++++++
>  3 files changed, 17 insertions(+)
> 
> diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h
> index 9ccac19..6d46760 100644
> --- a/arch/x86/include/asm/paravirt.h
> +++ b/arch/x86/include/asm/paravirt.h
> @@ -202,6 +202,11 @@ static inline unsigned long long paravirt_read_pmc(int counter)
>  
>  #define rdpmcl(counter, val) ((val) = paravirt_read_pmc(counter))
>  
> +static inline void paravirt_idle_poll(void)
> +{
> +	PVOP_VCALL0(pv_idle_ops.poll);
> +}
> +
>  static inline void paravirt_alloc_ldt(struct desc_struct *ldt, unsigned entries)
>  {
>  	PVOP_VCALL2(pv_cpu_ops.alloc_ldt, ldt, entries);
> diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h
> index 9ffc36b..cf45726 100644
> --- a/arch/x86/include/asm/paravirt_types.h
> +++ b/arch/x86/include/asm/paravirt_types.h
> @@ -324,6 +324,10 @@ struct pv_lock_ops {
>  	struct paravirt_callee_save vcpu_is_preempted;
>  } __no_randomize_layout;
>  
> +struct pv_idle_ops {
> +	void (*poll)(void);
> +} __no_randomize_layout;
> +
>  /* This contains all the paravirt structures: we get a convenient
>   * number for each function using the offset which we use to indicate
>   * what to patch. */
> @@ -334,6 +338,7 @@ struct paravirt_patch_template {
>  	struct pv_irq_ops pv_irq_ops;
>  	struct pv_mmu_ops pv_mmu_ops;
>  	struct pv_lock_ops pv_lock_ops;
> +	struct pv_idle_ops pv_idle_ops;
>  } __no_randomize_layout;
>  
>  extern struct pv_info pv_info;
> @@ -343,6 +348,7 @@ struct paravirt_patch_template {
>  extern struct pv_irq_ops pv_irq_ops;
>  extern struct pv_mmu_ops pv_mmu_ops;
>  extern struct pv_lock_ops pv_lock_ops;
> +extern struct pv_idle_ops pv_idle_ops;
>  
>  #define PARAVIRT_PATCH(x)					\
>  	(offsetof(struct paravirt_patch_template, x) / sizeof(void *))
> diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c
> index bc0a849..1b5b247 100644
> --- a/arch/x86/kernel/paravirt.c
> +++ b/arch/x86/kernel/paravirt.c
> @@ -128,6 +128,7 @@ static void *get_call_destination(u8 type)
>  #ifdef CONFIG_PARAVIRT_SPINLOCKS
>  		.pv_lock_ops = pv_lock_ops,
>  #endif
> +		.pv_idle_ops = pv_idle_ops,
>  	};
>  	return *((void **)&tmpl + type);
>  }
> @@ -312,6 +313,10 @@ struct pv_time_ops pv_time_ops = {
>  	.steal_clock = native_steal_clock,
>  };
>  
> +struct pv_idle_ops pv_idle_ops = {
> +	.poll = paravirt_nop,
> +};
> +
>  __visible struct pv_irq_ops pv_irq_ops = {
>  	.save_fl = __PV_IS_CALLEE_SAVE(native_save_fl),
>  	.restore_fl = __PV_IS_CALLEE_SAVE(native_restore_fl),
> @@ -471,3 +476,4 @@ struct pv_mmu_ops pv_mmu_ops __ro_after_init = {
>  EXPORT_SYMBOL    (pv_mmu_ops);
>  EXPORT_SYMBOL_GPL(pv_info);
>  EXPORT_SYMBOL    (pv_irq_ops);
> +EXPORT_SYMBOL    (pv_idle_ops);
> -- 
> 1.8.3.1
> 

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [RFC PATCH v2 5/7] Documentation: Add three sysctls for smart idle poll
       [not found] ` <1504007201-12904-6-git-send-email-yang.zhang.wz@gmail.com>
@ 2017-08-29 17:20   ` Luis R. Rodriguez
  0 siblings, 0 replies; 7+ messages in thread
From: Luis R. Rodriguez @ 2017-08-29 17:20 UTC (permalink / raw)
  To: Yang Zhang
  Cc: Jeremy Fitzhardinge, rkrcmar, kvm, mst, peterz, virtualization,
	H. Peter Anvin, Alok Kataria, wanpeng.li, Jessica Yu, Baoquan He,
	Jonathan Corbet, x86, Krzysztof Kozlowski, Ingo Molnar, zijun_hu,
	Petr Mladek, Kees Cook, Johannes Berg, Rusty Russell,
	Chris Wright, Ian Abbott, Josh Poimboeuf, dmatlack, tglx,
	Mauro Carvalho Chehab, Juergen Gross, Quan Xu, linux-doc

On Tue, Aug 29, 2017 at 11:46:39AM +0000, Yang Zhang wrote:
> To reduce the cost of poll, we introduce three sysctl to control the
> poll time.

This commit does not describe in any way the fact that these knobs are
all for and only for PARAVIRT.

> diff --git a/Documentation/sysctl/kernel.txt b/Documentation/sysctl/kernel.txt
> index bac23c1..67447b8 100644
> --- a/Documentation/sysctl/kernel.txt
> +++ b/Documentation/sysctl/kernel.txt
> @@ -63,6 +63,9 @@ show up in /proc/sys/kernel:
>  - perf_event_max_stack
>  - perf_event_max_contexts_per_stack
>  - pid_max
> +- poll_grow                   [ X86 only ]
> +- poll_shrink                 [ X86 only ]
> +- poll_threshold_ns           [ X86 only ]

How about paravirt_ prefix?

>  - powersave-nap               [ PPC only ]
>  - printk
>  - printk_delay
> @@ -702,6 +705,28 @@ kernel tries to allocate a number starting from this one.
>  
>  ==============================================================
>  
> +poll_grow: (X86 only)
> +
> +This parameter is multiplied in the grow_poll_ns() to increase the poll time.
> +By default, the values is 2.
> +
> +==============================================================
> +poll_shrink: (X86 only)
> +
> +This parameter is divided in the shrink_poll_ns() to reduce the poll time.
> +By default, the values is 2.

These don't say anything about this being related to paravirt.

> +
> +==============================================================
> +poll_threshold_ns: (X86 only)
> +
> +This parameter controls the max poll time before entering real idle path.
> +This parameter is expected to take effect only when running inside a VM.
> +It would make no sense to turn on it in bare mental.

turn it on? Or modify, or use it?

> +By default, the values is 0 means don't poll. It is recommended to change
> +the value to non-zero if running latency-bound workloads inside VM.
> +
> +==============================================================
> +
>  powersave-nap: (PPC only)
>  
>  If set, Linux-PPC will use the 'nap' mode of powersaving,
> diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c
> index a11b2c2..0b92f8f 100644
> --- a/arch/x86/kernel/paravirt.c
> +++ b/arch/x86/kernel/paravirt.c
> @@ -318,6 +318,10 @@ struct pv_idle_ops pv_idle_ops = {
>  	.update = paravirt_nop,
>  };
>  
> +unsigned long poll_threshold_ns;
> +unsigned int poll_shrink = 2;
> +unsigned int poll_grow = 2;
> +
>  __visible struct pv_irq_ops pv_irq_ops = {
>  	.save_fl = __PV_IS_CALLEE_SAVE(native_save_fl),
>  	.restore_fl = __PV_IS_CALLEE_SAVE(native_restore_fl),
> diff --git a/include/linux/kernel.h b/include/linux/kernel.h
> index bd6d96c..6cb2820 100644
> --- a/include/linux/kernel.h
> +++ b/include/linux/kernel.h
> @@ -462,6 +462,12 @@ extern __scanf(2, 0)
>  
>  extern bool crash_kexec_post_notifiers;
>  
> +#ifdef CONFIG_PARAVIRT
> +extern unsigned long poll_threshold_ns;
> +extern unsigned int poll_shrink;
> +extern unsigned int poll_grow;
> +#endif

What are these if we are on bare metal but support
paravirt? The docs are not clear in any way about it.

> +
>  /*
>   * panic_cpu is used for synchronizing panic() and crash_kexec() execution. It
>   * holds a CPU number which is executing panic() currently. A value of
> diff --git a/kernel/sysctl.c b/kernel/sysctl.c
> index 6648fbb..9b86a8f 100644
> --- a/kernel/sysctl.c
> +++ b/kernel/sysctl.c
> @@ -1229,6 +1229,29 @@ static int sysrq_sysctl_handler(struct ctl_table *table, int write,
>  		.extra2		= &one,
>  	},
>  #endif
> +#ifdef CONFIG_PARAVIRT
> +	{
> +		.procname       = "halt_poll_threshold",
> +		.data           = &poll_threshold_ns,
> +		.maxlen         = sizeof(unsigned long),
> +		.mode           = 0644,
> +		.proc_handler   = proc_dointvec,
> +	},

proc_doulongvec_minmax() may be more appropriate here? What is the range?
Also what user did you see used proc_dointvec() but had unsigned long?
If the test below reveal this is invalid we would need to go fix them
as well.

> +	{
> +		.procname       = "halt_poll_grow",
> +		.data           = &poll_grow,
> +		.maxlen         = sizeof(unsigned int),
> +		.mode           = 0644,
> +		.proc_handler   = proc_dointvec,
> +	},
> +	{
> +		.procname       = "halt_poll_shrink",
> +		.data           = &poll_shrink,
> +		.maxlen         = sizeof(unsigned int),
> +		.mode           = 0644,
> +		.proc_handler   = proc_dointvec,

We have now a much more appropriate proc_douintvec() proc_douintvec_minmax().
It seems you want to support only unsigned int for two of these so that would
be more appropriate.

To test things out you can replicate your values using or extending the
test driver lib/test_sysctl.c CONFIG_TEST_SYSCTL=m with your type of
values and then fuzz testing them with arbitrary values to ensure you get
only as input what you really think these values should get.

Once done please extend the script:

tools/testing/selftests/sysctl/sysctl.sh

to cover the different tests you have run, we want tests to be generic.

  Luis

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [RFC PATCH v2 1/7] x86/paravirt: Add pv_idle_ops to paravirt ops
  2017-08-29 13:55   ` Konrad Rzeszutek Wilk
@ 2017-08-30  7:33     ` Juergen Gross
  2017-09-01  6:50     ` Yang Zhang
  1 sibling, 0 replies; 7+ messages in thread
From: Juergen Gross @ 2017-08-30  7:33 UTC (permalink / raw)
  To: Konrad Rzeszutek Wilk, Yang Zhang, xen-devel, Boris Ostrovsky
  Cc: Jeremy Fitzhardinge, rkrcmar, kvm, mst, peterz, Pan Xinhui,
	virtualization, H. Peter Anvin, Alok Kataria, wanpeng.li, x86,
	Ingo Molnar, Kees Cook, Chris Wright, Andy Lutomirski, dmatlack,
	tglx, Quan Xu, linux-doc, linux-kernel, pbonzini,
	Kirill A. Shutemov

On 29/08/17 15:55, Konrad Rzeszutek Wilk wrote:
> On Tue, Aug 29, 2017 at 11:46:35AM +0000, Yang Zhang wrote:
>> So far, pv_idle_ops.poll is the only ops for pv_idle. .poll is called in
>> idle path which will polling for a while before we enter the real idle
>> state.
>>
>> In virtualization, idle path includes several heavy operations
>> includes timer access(LAPIC timer or TSC deadline timer) which will hurt
>> performance especially for latency intensive workload like message
>> passing task. The cost is mainly come from the vmexit which is a
>> hardware context switch between VM and hypervisor. Our solution is to
>> poll for a while and do not enter real idle path if we can get the
>> schedule event during polling.
>>
>> Poll may cause the CPU waste so we adopt a smart polling mechanism to
>> reduce the useless poll.
>>
>> Signed-off-by: Yang Zhang <yang.zhang.wz@gmail.com>
>> Signed-off-by: Quan Xu <quan.xu0@gmail.com>
>> Cc: Jeremy Fitzhardinge <jeremy@goop.org>
>> Cc: Chris Wright <chrisw@sous-sol.org>
>> Cc: Alok Kataria <akataria@vmware.com>
>> Cc: Rusty Russell <rusty@rustcorp.com.au>
>> Cc: Thomas Gleixner <tglx@linutronix.de>
>> Cc: Ingo Molnar <mingo@redhat.com>
>> Cc: "H. Peter Anvin" <hpa@zytor.com>
>> Cc: x86@kernel.org
>> Cc: Peter Zijlstra <peterz@infradead.org>
>> Cc: Andy Lutomirski <luto@kernel.org>
>> Cc: "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com>
>> Cc: Pan Xinhui <xinhui.pan@linux.vnet.ibm.com>
>> Cc: Kees Cook <keescook@chromium.org>
>> Cc: virtualization@lists.linux-foundation.org
>> Cc: linux-kernel@vger.kernel.org
> 
> Adding xen-devel.
> 
> Juergen, we really should replace Jeremy's name with xen-devel or
> your name..

I wouldn't mind being added. What does Jeremy think of being removed?

> Wasn't there an patch by you that took some of the 
> mainternship over it?

I added include/linux/hypervisor.h to the PARAVIRT section and offered
to maintain it in case the PARAVIRT maintainers didn't want to.


Juergen

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [RFC PATCH v2 1/7] x86/paravirt: Add pv_idle_ops to paravirt ops
  2017-08-29 13:55   ` Konrad Rzeszutek Wilk
  2017-08-30  7:33     ` Juergen Gross
@ 2017-09-01  6:50     ` Yang Zhang
  1 sibling, 0 replies; 7+ messages in thread
From: Yang Zhang @ 2017-09-01  6:50 UTC (permalink / raw)
  To: Konrad Rzeszutek Wilk, xen-devel, jgross, Boris Ostrovsky
  Cc: Jeremy Fitzhardinge, rkrcmar, kvm, mst, peterz, Pan Xinhui,
	virtualization, H. Peter Anvin, Alok Kataria, wanpeng.li, x86,
	Ingo Molnar, Kees Cook, Chris Wright, Andy Lutomirski, dmatlack,
	tglx, Quan Xu, linux-doc, linux-kernel, pbonzini,
	Kirill A. Shutemov

On 2017/8/29 21:55, Konrad Rzeszutek Wilk wrote:
> On Tue, Aug 29, 2017 at 11:46:35AM +0000, Yang Zhang wrote:
>> So far, pv_idle_ops.poll is the only ops for pv_idle. .poll is called in
>> idle path which will polling for a while before we enter the real idle
>> state.
>>
>> In virtualization, idle path includes several heavy operations
>> includes timer access(LAPIC timer or TSC deadline timer) which will hurt
>> performance especially for latency intensive workload like message
>> passing task. The cost is mainly come from the vmexit which is a
>> hardware context switch between VM and hypervisor. Our solution is to
>> poll for a while and do not enter real idle path if we can get the
>> schedule event during polling.
>>
>> Poll may cause the CPU waste so we adopt a smart polling mechanism to
>> reduce the useless poll.
>>
>> Signed-off-by: Yang Zhang <yang.zhang.wz@gmail.com>
>> Signed-off-by: Quan Xu <quan.xu0@gmail.com>
>> Cc: Jeremy Fitzhardinge <jeremy@goop.org>
>> Cc: Chris Wright <chrisw@sous-sol.org>
>> Cc: Alok Kataria <akataria@vmware.com>
>> Cc: Rusty Russell <rusty@rustcorp.com.au>
>> Cc: Thomas Gleixner <tglx@linutronix.de>
>> Cc: Ingo Molnar <mingo@redhat.com>
>> Cc: "H. Peter Anvin" <hpa@zytor.com>
>> Cc: x86@kernel.org
>> Cc: Peter Zijlstra <peterz@infradead.org>
>> Cc: Andy Lutomirski <luto@kernel.org>
>> Cc: "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com>
>> Cc: Pan Xinhui <xinhui.pan@linux.vnet.ibm.com>
>> Cc: Kees Cook <keescook@chromium.org>
>> Cc: virtualization@lists.linux-foundation.org
>> Cc: linux-kernel@vger.kernel.org
> 
> Adding xen-devel.
> 
> Juergen, we really should replace Jeremy's name with xen-devel or
> your name.. Wasn't there an patch by you that took some of the
> mainternship over it?

Hi Konard, I didn't test it in Xen side since i don't have the 
environment but i can add it for Xen in next version if you think it is 
useful to Xen as well.

> 
>> ---
>>   arch/x86/include/asm/paravirt.h       | 5 +++++
>>   arch/x86/include/asm/paravirt_types.h | 6 ++++++
>>   arch/x86/kernel/paravirt.c            | 6 ++++++
>>   3 files changed, 17 insertions(+)
>>
>> diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h
>> index 9ccac19..6d46760 100644
>> --- a/arch/x86/include/asm/paravirt.h
>> +++ b/arch/x86/include/asm/paravirt.h
>> @@ -202,6 +202,11 @@ static inline unsigned long long paravirt_read_pmc(int counter)
>>   
>>   #define rdpmcl(counter, val) ((val) = paravirt_read_pmc(counter))
>>   
>> +static inline void paravirt_idle_poll(void)
>> +{
>> +	PVOP_VCALL0(pv_idle_ops.poll);
>> +}
>> +
>>   static inline void paravirt_alloc_ldt(struct desc_struct *ldt, unsigned entries)
>>   {
>>   	PVOP_VCALL2(pv_cpu_ops.alloc_ldt, ldt, entries);
>> diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h
>> index 9ffc36b..cf45726 100644
>> --- a/arch/x86/include/asm/paravirt_types.h
>> +++ b/arch/x86/include/asm/paravirt_types.h
>> @@ -324,6 +324,10 @@ struct pv_lock_ops {
>>   	struct paravirt_callee_save vcpu_is_preempted;
>>   } __no_randomize_layout;
>>   
>> +struct pv_idle_ops {
>> +	void (*poll)(void);
>> +} __no_randomize_layout;
>> +
>>   /* This contains all the paravirt structures: we get a convenient
>>    * number for each function using the offset which we use to indicate
>>    * what to patch. */
>> @@ -334,6 +338,7 @@ struct paravirt_patch_template {
>>   	struct pv_irq_ops pv_irq_ops;
>>   	struct pv_mmu_ops pv_mmu_ops;
>>   	struct pv_lock_ops pv_lock_ops;
>> +	struct pv_idle_ops pv_idle_ops;
>>   } __no_randomize_layout;
>>   
>>   extern struct pv_info pv_info;
>> @@ -343,6 +348,7 @@ struct paravirt_patch_template {
>>   extern struct pv_irq_ops pv_irq_ops;
>>   extern struct pv_mmu_ops pv_mmu_ops;
>>   extern struct pv_lock_ops pv_lock_ops;
>> +extern struct pv_idle_ops pv_idle_ops;
>>   
>>   #define PARAVIRT_PATCH(x)					\
>>   	(offsetof(struct paravirt_patch_template, x) / sizeof(void *))
>> diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c
>> index bc0a849..1b5b247 100644
>> --- a/arch/x86/kernel/paravirt.c
>> +++ b/arch/x86/kernel/paravirt.c
>> @@ -128,6 +128,7 @@ static void *get_call_destination(u8 type)
>>   #ifdef CONFIG_PARAVIRT_SPINLOCKS
>>   		.pv_lock_ops = pv_lock_ops,
>>   #endif
>> +		.pv_idle_ops = pv_idle_ops,
>>   	};
>>   	return *((void **)&tmpl + type);
>>   }
>> @@ -312,6 +313,10 @@ struct pv_time_ops pv_time_ops = {
>>   	.steal_clock = native_steal_clock,
>>   };
>>   
>> +struct pv_idle_ops pv_idle_ops = {
>> +	.poll = paravirt_nop,
>> +};
>> +
>>   __visible struct pv_irq_ops pv_irq_ops = {
>>   	.save_fl = __PV_IS_CALLEE_SAVE(native_save_fl),
>>   	.restore_fl = __PV_IS_CALLEE_SAVE(native_restore_fl),
>> @@ -471,3 +476,4 @@ struct pv_mmu_ops pv_mmu_ops __ro_after_init = {
>>   EXPORT_SYMBOL    (pv_mmu_ops);
>>   EXPORT_SYMBOL_GPL(pv_info);
>>   EXPORT_SYMBOL    (pv_irq_ops);
>> +EXPORT_SYMBOL    (pv_idle_ops);
>> -- 
>> 1.8.3.1
>>


-- 
Yang
Alibaba Cloud Computing

^ permalink raw reply	[flat|nested] 7+ messages in thread

end of thread, other threads:[~2017-09-01  6:50 UTC | newest]

Thread overview: 7+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
     [not found] <1504007201-12904-1-git-send-email-yang.zhang.wz@gmail.com>
2017-08-29 11:46 ` [RFC PATCH v2 1/7] x86/paravirt: Add pv_idle_ops to paravirt ops Yang Zhang
2017-08-29 13:55   ` Konrad Rzeszutek Wilk
2017-08-30  7:33     ` Juergen Gross
2017-09-01  6:50     ` Yang Zhang
2017-08-29 11:46 ` [RFC PATCH v2 4/7] x86/paravirt: Add update in x86/paravirt pv_idle_ops Yang Zhang
2017-08-29 11:46 ` [RFC PATCH v2 5/7] Documentation: Add three sysctls for smart idle poll Yang Zhang
     [not found] ` <1504007201-12904-6-git-send-email-yang.zhang.wz@gmail.com>
2017-08-29 17:20   ` Luis R. Rodriguez

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox