Linux-HyperV List
 help / color / mirror / Atom feed
* [PATCH 1/2] clocksource/Hyper-v: Allocate Hyper-V tsc page statically
From: lantianyu1986 @ 2019-07-29  7:52 UTC (permalink / raw)
  To: luto, tglx, mingo, bp, hpa, x86, kys, haiyangz, sthemmin, sashal,
	daniel.lezcano, arnd, michael.h.kelley, ashal
  Cc: Tianyu Lan, linux-kernel, linux-hyperv, linux-arch
In-Reply-To: <20190729075243.22745-1-Tianyu.Lan@microsoft.com>

From: Tianyu Lan <Tianyu.Lan@microsoft.com>

This is to prepare to add Hyper-V sched clock callback and move
Hyper-V reference TSC initialization much earlier in the boot
process when timestamp is 0. So no discontinuity is observed
when pv_ops.time.sched_clock to calculate its offset. This earlier
initialization requires that the Hyper-V TSC page be allocated
statically instead of with vmalloc(), so fixup the references
to the TSC page and the method of getting its physical address.

Signed-off-by: Tianyu Lan <Tianyu.Lan@microsoft.com>
---
 arch/x86/entry/vdso/vma.c          |  2 +-
 drivers/clocksource/hyperv_timer.c | 12 ++++--------
 2 files changed, 5 insertions(+), 9 deletions(-)

diff --git a/arch/x86/entry/vdso/vma.c b/arch/x86/entry/vdso/vma.c
index 349a61d8bf34..f5937742b290 100644
--- a/arch/x86/entry/vdso/vma.c
+++ b/arch/x86/entry/vdso/vma.c
@@ -122,7 +122,7 @@ static vm_fault_t vvar_fault(const struct vm_special_mapping *sm,
 
 		if (tsc_pg && vclock_was_used(VCLOCK_HVCLOCK))
 			return vmf_insert_pfn(vma, vmf->address,
-					vmalloc_to_pfn(tsc_pg));
+					virt_to_phys(tsc_pg) >> PAGE_SHIFT);
 	}
 
 	return VM_FAULT_SIGBUS;
diff --git a/drivers/clocksource/hyperv_timer.c b/drivers/clocksource/hyperv_timer.c
index ba2c79e6a0ee..86764ec9a854 100644
--- a/drivers/clocksource/hyperv_timer.c
+++ b/drivers/clocksource/hyperv_timer.c
@@ -214,17 +214,17 @@ EXPORT_SYMBOL_GPL(hyperv_cs);
 
 #ifdef CONFIG_HYPERV_TSCPAGE
 
-static struct ms_hyperv_tsc_page *tsc_pg;
+static struct ms_hyperv_tsc_page tsc_pg __aligned(PAGE_SIZE);
 
 struct ms_hyperv_tsc_page *hv_get_tsc_page(void)
 {
-	return tsc_pg;
+	return &tsc_pg;
 }
 EXPORT_SYMBOL_GPL(hv_get_tsc_page);
 
 static u64 notrace read_hv_sched_clock_tsc(void)
 {
-	u64 current_tick = hv_read_tsc_page(tsc_pg);
+	u64 current_tick = hv_read_tsc_page(&tsc_pg);
 
 	if (current_tick == U64_MAX)
 		hv_get_time_ref_count(current_tick);
@@ -280,12 +280,8 @@ static bool __init hv_init_tsc_clocksource(void)
 	if (!(ms_hyperv.features & HV_MSR_REFERENCE_TSC_AVAILABLE))
 		return false;
 
-	tsc_pg = vmalloc(PAGE_SIZE);
-	if (!tsc_pg)
-		return false;
-
 	hyperv_cs = &hyperv_cs_tsc;
-	phys_addr = page_to_phys(vmalloc_to_page(tsc_pg));
+	phys_addr = virt_to_phys(&tsc_pg) & PAGE_MASK;
 
 	/*
 	 * The Hyper-V TLFS specifies to preserve the value of reserved
-- 
2.14.5


^ permalink raw reply related

* [PATCH 2/2] clocksource/Hyper-V:  Add Hyper-V specific sched clock function
From: lantianyu1986 @ 2019-07-29  7:52 UTC (permalink / raw)
  To: kys, haiyangz, sthemmin, sashal, tglx, mingo, bp, hpa, x86,
	daniel.lezcano, arnd, michael.h.kelley, ashal
  Cc: Tianyu Lan, linux-hyperv, linux-kernel, linux-arch
In-Reply-To: <20190729075243.22745-1-Tianyu.Lan@microsoft.com>

From: Tianyu Lan <Tianyu.Lan@microsoft.com>

Hyper-V guests use the default native_sched_clock() in pv_ops.time.sched_clock
on x86.  But native_sched_clock() directly uses the raw TSC value, which
can be discontinuous in a Hyper-V VM. Add the generic hv_setup_sched_clock()
to set the sched clock function appropriately. On x86, this sets pv_ops.time.
sched_clock to read the Hyper-V reference TSC value that is scaled and adjusted
to be continuous.

Also move the Hyper-V reference TSC initialization much earlier in the boot
process so no discontinuity is observed when pv_ops.time.sched_clock
calculates its offset.

Signed-off-by: Tianyu Lan <Tianyu.Lan@microsoft.com>
---
 arch/x86/hyperv/hv_init.c          |  2 --
 arch/x86/kernel/cpu/mshyperv.c     |  8 ++++++++
 drivers/clocksource/hyperv_timer.c | 22 ++++++++++++----------
 include/asm-generic/mshyperv.h     |  1 +
 4 files changed, 21 insertions(+), 12 deletions(-)

diff --git a/arch/x86/hyperv/hv_init.c b/arch/x86/hyperv/hv_init.c
index 0d258688c8cf..866dfb3dca48 100644
--- a/arch/x86/hyperv/hv_init.c
+++ b/arch/x86/hyperv/hv_init.c
@@ -301,8 +301,6 @@ void __init hyperv_init(void)
 
 	x86_init.pci.arch_init = hv_pci_init;
 
-	/* Register Hyper-V specific clocksource */
-	hv_init_clocksource();
 	return;
 
 remove_cpuhp_state:
diff --git a/arch/x86/kernel/cpu/mshyperv.c b/arch/x86/kernel/cpu/mshyperv.c
index 062f77279ce3..53afd33990eb 100644
--- a/arch/x86/kernel/cpu/mshyperv.c
+++ b/arch/x86/kernel/cpu/mshyperv.c
@@ -29,6 +29,7 @@
 #include <asm/timer.h>
 #include <asm/reboot.h>
 #include <asm/nmi.h>
+#include <clocksource/hyperv_timer.h>
 
 struct ms_hyperv_info ms_hyperv;
 EXPORT_SYMBOL_GPL(ms_hyperv);
@@ -338,9 +339,16 @@ static void __init ms_hyperv_init_platform(void)
 		x2apic_phys = 1;
 # endif
 
+	/* Register Hyper-V specific clocksource */
+	hv_init_clocksource();
 #endif
 }
 
+void hv_setup_sched_clock(void *sched_clock)
+{
+	pv_ops.time.sched_clock = sched_clock;
+}
+
 const __initconst struct hypervisor_x86 x86_hyper_ms_hyperv = {
 	.name			= "Microsoft Hyper-V",
 	.detect			= ms_hyperv_platform,
diff --git a/drivers/clocksource/hyperv_timer.c b/drivers/clocksource/hyperv_timer.c
index 86764ec9a854..eafca89b44d7 100644
--- a/drivers/clocksource/hyperv_timer.c
+++ b/drivers/clocksource/hyperv_timer.c
@@ -215,6 +215,7 @@ EXPORT_SYMBOL_GPL(hyperv_cs);
 #ifdef CONFIG_HYPERV_TSCPAGE
 
 static struct ms_hyperv_tsc_page tsc_pg __aligned(PAGE_SIZE);
+static u64 hv_sched_clock_offset __ro_after_init;
 
 struct ms_hyperv_tsc_page *hv_get_tsc_page(void)
 {
@@ -222,7 +223,7 @@ struct ms_hyperv_tsc_page *hv_get_tsc_page(void)
 }
 EXPORT_SYMBOL_GPL(hv_get_tsc_page);
 
-static u64 notrace read_hv_sched_clock_tsc(void)
+static u64 notrace read_hv_clock_tsc(struct clocksource *arg)
 {
 	u64 current_tick = hv_read_tsc_page(&tsc_pg);
 
@@ -232,9 +233,9 @@ static u64 notrace read_hv_sched_clock_tsc(void)
 	return current_tick;
 }
 
-static u64 read_hv_clock_tsc(struct clocksource *arg)
+static u64 read_hv_sched_clock_tsc(void)
 {
-	return read_hv_sched_clock_tsc();
+	return read_hv_clock_tsc(NULL) - hv_sched_clock_offset;
 }
 
 static struct clocksource hyperv_cs_tsc = {
@@ -246,7 +247,7 @@ static struct clocksource hyperv_cs_tsc = {
 };
 #endif
 
-static u64 notrace read_hv_sched_clock_msr(void)
+static u64 notrace read_hv_clock_msr(struct clocksource *arg)
 {
 	u64 current_tick;
 	/*
@@ -258,9 +259,9 @@ static u64 notrace read_hv_sched_clock_msr(void)
 	return current_tick;
 }
 
-static u64 read_hv_clock_msr(struct clocksource *arg)
+static u64 read_hv_sched_clock_msr(void)
 {
-	return read_hv_sched_clock_msr();
+	return read_hv_clock_msr(NULL) - hv_sched_clock_offset;
 }
 
 static struct clocksource hyperv_cs_msr = {
@@ -298,8 +299,9 @@ static bool __init hv_init_tsc_clocksource(void)
 	hv_set_clocksource_vdso(hyperv_cs_tsc);
 	clocksource_register_hz(&hyperv_cs_tsc, NSEC_PER_SEC/100);
 
-	/* sched_clock_register is needed on ARM64 but is a no-op on x86 */
-	sched_clock_register(read_hv_sched_clock_tsc, 64, HV_CLOCK_HZ);
+	hv_sched_clock_offset = hyperv_cs->read(hyperv_cs);
+	hv_setup_sched_clock(read_hv_sched_clock_tsc);
+
 	return true;
 }
 #else
@@ -329,7 +331,7 @@ void __init hv_init_clocksource(void)
 	hyperv_cs = &hyperv_cs_msr;
 	clocksource_register_hz(&hyperv_cs_msr, NSEC_PER_SEC/100);
 
-	/* sched_clock_register is needed on ARM64 but is a no-op on x86 */
-	sched_clock_register(read_hv_sched_clock_msr, 64, HV_CLOCK_HZ);
+	hv_sched_clock_offset = hyperv_cs->read(hyperv_cs);
+	hv_setup_sched_clock(read_hv_sched_clock_msr);
 }
 EXPORT_SYMBOL_GPL(hv_init_clocksource);
diff --git a/include/asm-generic/mshyperv.h b/include/asm-generic/mshyperv.h
index 0becb7d9704d..18d8e2d8210f 100644
--- a/include/asm-generic/mshyperv.h
+++ b/include/asm-generic/mshyperv.h
@@ -167,6 +167,7 @@ void hyperv_report_panic(struct pt_regs *regs, long err);
 void hyperv_report_panic_msg(phys_addr_t pa, size_t size);
 bool hv_is_hyperv_initialized(void);
 void hyperv_cleanup(void);
+void hv_setup_sched_clock(void *sched_clock);
 #else /* CONFIG_HYPERV */
 static inline bool hv_is_hyperv_initialized(void) { return false; }
 static inline void hyperv_cleanup(void) {}
-- 
2.14.5


^ permalink raw reply related

* Re: [PATCH 0/2] clocksource/Hyper-V: Add Hyper-V specific sched clock function
From: Vitaly Kuznetsov @ 2019-07-29 10:59 UTC (permalink / raw)
  To: lantianyu1986
  Cc: Tianyu Lan, linux-arch, linux-hyperv, linux-kernel, luto, tglx,
	mingo, bp, hpa, x86, kys, haiyangz, sthemmin, sashal,
	daniel.lezcano, arnd, michael.h.kelley, ashal
In-Reply-To: <20190729075243.22745-1-Tianyu.Lan@microsoft.com>

lantianyu1986@gmail.com writes:

> From: Tianyu Lan <Tianyu.Lan@microsoft.com>
>
> Hyper-V guests use the default native_sched_clock() in pv_ops.time.sched_clock
> on x86.  But native_sched_clock() directly uses the raw TSC value, which
> can be discontinuous in a Hyper-V VM.   Add the generic hv_setup_sched_clock()
> to set the sched clock function appropriately.  On x86, this sets
> pv_ops.time.sched_clock to read the Hyper-V reference TSC value that is
> scaled and adjusted to be continuous.

Hypervisor can, in theory, disable TSC page and then we're forced to use
MSR-based clocksource but using it as sched_clock() can be very slow,
I'm afraid.

On the other hand, what we have now is probably worse: TSC can,
actually, jump backwards (e.g. on migration) and we're breaking the
requirements for sched_clock().

-- 
Vitaly

^ permalink raw reply

* Re: [PATCH 0/2] clocksource/Hyper-V: Add Hyper-V specific sched clock function
From: Peter Zijlstra @ 2019-07-29 11:09 UTC (permalink / raw)
  To: Vitaly Kuznetsov
  Cc: lantianyu1986, Tianyu Lan, linux-arch, linux-hyperv, linux-kernel,
	luto, tglx, mingo, bp, hpa, x86, kys, haiyangz, sthemmin, sashal,
	daniel.lezcano, arnd, michael.h.kelley, ashal
In-Reply-To: <87zhkxksxd.fsf@vitty.brq.redhat.com>

On Mon, Jul 29, 2019 at 12:59:26PM +0200, Vitaly Kuznetsov wrote:
> lantianyu1986@gmail.com writes:
> 
> > From: Tianyu Lan <Tianyu.Lan@microsoft.com>
> >
> > Hyper-V guests use the default native_sched_clock() in pv_ops.time.sched_clock
> > on x86.  But native_sched_clock() directly uses the raw TSC value, which
> > can be discontinuous in a Hyper-V VM.   Add the generic hv_setup_sched_clock()
> > to set the sched clock function appropriately.  On x86, this sets
> > pv_ops.time.sched_clock to read the Hyper-V reference TSC value that is
> > scaled and adjusted to be continuous.
> 
> Hypervisor can, in theory, disable TSC page and then we're forced to use
> MSR-based clocksource but using it as sched_clock() can be very slow,
> I'm afraid.
> 
> On the other hand, what we have now is probably worse: TSC can,
> actually, jump backwards (e.g. on migration) and we're breaking the
> requirements for sched_clock().

That (obviously) also breaks the requirements for using TSC as
clocksource.

IOW, it breaks the entire purpose of having TSC in the first place.

^ permalink raw reply

* Re: [PATCH 0/2] clocksource/Hyper-V: Add Hyper-V specific sched clock function
From: Vitaly Kuznetsov @ 2019-07-29 12:13 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: lantianyu1986, Tianyu Lan, linux-arch, linux-hyperv, linux-kernel,
	luto, tglx, mingo, bp, hpa, x86, kys, haiyangz, sthemmin, sashal,
	daniel.lezcano, arnd, michael.h.kelley, ashal
In-Reply-To: <20190729110927.GC31398@hirez.programming.kicks-ass.net>

Peter Zijlstra <peterz@infradead.org> writes:

> On Mon, Jul 29, 2019 at 12:59:26PM +0200, Vitaly Kuznetsov wrote:
>> lantianyu1986@gmail.com writes:
>> 
>> > From: Tianyu Lan <Tianyu.Lan@microsoft.com>
>> >
>> > Hyper-V guests use the default native_sched_clock() in pv_ops.time.sched_clock
>> > on x86.  But native_sched_clock() directly uses the raw TSC value, which
>> > can be discontinuous in a Hyper-V VM.   Add the generic hv_setup_sched_clock()
>> > to set the sched clock function appropriately.  On x86, this sets
>> > pv_ops.time.sched_clock to read the Hyper-V reference TSC value that is
>> > scaled and adjusted to be continuous.
>> 
>> Hypervisor can, in theory, disable TSC page and then we're forced to use
>> MSR-based clocksource but using it as sched_clock() can be very slow,
>> I'm afraid.
>> 
>> On the other hand, what we have now is probably worse: TSC can,
>> actually, jump backwards (e.g. on migration) and we're breaking the
>> requirements for sched_clock().
>
> That (obviously) also breaks the requirements for using TSC as
> clocksource.
>
> IOW, it breaks the entire purpose of having TSC in the first place.

Currently, we mark raw TSC as unstable when running on Hyper-V (see
88c9281a9fba6), 'TSC page' (which is TSC * scale + offset) is being used
instead. The problem is that 'TSC page' can be disabled by the
hypervisor and in that case the only remaining clocksource is MSR-based
(slow).

-- 
Vitaly

^ permalink raw reply

* RE: [PATCH net] hv_sock: Fix hang when a connection is closed
From: Sunil Muthuswamy @ 2019-07-29 17:21 UTC (permalink / raw)
  To: Dexuan Cui, David Miller, netdev@vger.kernel.org
  Cc: KY Srinivasan, Haiyang Zhang, Stephen Hemminger,
	sashal@kernel.org, Michael Kelley, linux-hyperv@vger.kernel.org,
	linux-kernel@vger.kernel.org, olaf@aepfle.de, apw@canonical.com,
	jasowang@redhat.com, vkuznets, marcelo.cerri@canonical.com
In-Reply-To: <PU1P153MB01690A7767ECDF420FF78D66BFC20@PU1P153MB0169.APCP153.PROD.OUTLOOK.COM>



> -----Original Message-----
> From: Dexuan Cui <decui@microsoft.com>
> Sent: Sunday, July 28, 2019 11:32 AM
> To: Sunil Muthuswamy <sunilmut@microsoft.com>; David Miller <davem@davemloft.net>; netdev@vger.kernel.org
> Cc: KY Srinivasan <kys@microsoft.com>; Haiyang Zhang <haiyangz@microsoft.com>; Stephen Hemminger
> <sthemmin@microsoft.com>; sashal@kernel.org; Michael Kelley <mikelley@microsoft.com>; linux-hyperv@vger.kernel.org; linux-
> kernel@vger.kernel.org; olaf@aepfle.de; apw@canonical.com; jasowang@redhat.com; vkuznets <vkuznets@redhat.com>;
> marcelo.cerri@canonical.com
> Subject: [PATCH net] hv_sock: Fix hang when a connection is closed
> 
> 
> hvs_do_close_lock_held() may decrease the reference count to 0 and free the
> sk struct completely, and then the following release_sock(sk) may hang.
> 
> Fixes: a9eeb998c28d ("hv_sock: Add support for delayed close")
> Signed-off-by: Dexuan Cui <decui@microsoft.com>
> Cc: stable@vger.kernel.org
> 
> ---
> With the proper kernel debugging options enabled, first a warning can
> appear:
> 
> kworker/1:0/4467 is freeing memory ..., with a lock still held there!
> stack backtrace:
> Workqueue: events vmbus_onmessage_work [hv_vmbus]
> Call Trace:
>  dump_stack+0x67/0x90
>  debug_check_no_locks_freed.cold.52+0x78/0x7d
>  slab_free_freelist_hook+0x85/0x140
>  kmem_cache_free+0xa5/0x380
>  __sk_destruct+0x150/0x260
>  hvs_close_connection+0x24/0x30 [hv_sock]
>  vmbus_onmessage_work+0x1d/0x30 [hv_vmbus]
>  process_one_work+0x241/0x600
>  worker_thread+0x3c/0x390
>  kthread+0x11b/0x140
>  ret_from_fork+0x24/0x30
> 
> and then the following release_sock(sk) can hang:
> 
> watchdog: BUG: soft lockup - CPU#1 stuck for 22s! [kworker/1:0:4467]
> ...
> irq event stamp: 62890
> CPU: 1 PID: 4467 Comm: kworker/1:0 Tainted: G        W         5.2.0+ #39
> Workqueue: events vmbus_onmessage_work [hv_vmbus]
> RIP: 0010:queued_spin_lock_slowpath+0x2b/0x1e0
> ...
> Call Trace:
>  do_raw_spin_lock+0xab/0xb0
>  release_sock+0x19/0xb0
>  vmbus_onmessage_work+0x1d/0x30 [hv_vmbus]
>  process_one_work+0x241/0x600
>  worker_thread+0x3c/0x390
>  kthread+0x11b/0x140
>  ret_from_fork+0x24/0x30
> 
>  net/vmw_vsock/hyperv_transport.c | 7 +++++++
>  1 file changed, 7 insertions(+)
> 
> diff --git a/net/vmw_vsock/hyperv_transport.c b/net/vmw_vsock/hyperv_transport.c
> index f2084e3f7aa4..efbda8ef1eff 100644
> --- a/net/vmw_vsock/hyperv_transport.c
> +++ b/net/vmw_vsock/hyperv_transport.c
> @@ -309,9 +309,16 @@ static void hvs_close_connection(struct vmbus_channel *chan)
>  {
>  	struct sock *sk = get_per_channel_state(chan);
> 
> +	/* Grab an extra reference since hvs_do_close_lock_held() may decrease
> +	 * the reference count to 0 by calling sock_put(sk).
> +	 */
> +	sock_hold(sk);
> +

To me, it seems like when 'hvs_close_connection' is called, there should always be
an outstanding reference to the socket. The reference that is dropped by
' hvs_do_close_lock_held' is a legitimate reference that was taken by 'hvs_close_lock_held'.
Or, in other words, I think the right solution is to always maintain a reference to socket
until this routine is called and drop that here. That can be done by taking the reference to
the socket prior to ' vmbus_set_chn_rescind_callback(chan, hvs_close_connection)' and
dropping that reference at the end of 'hvs_close_connection'.

>  	lock_sock(sk);
>  	hvs_do_close_lock_held(vsock_sk(sk), true);
>  	release_sock(sk);
> +
> +	sock_put(sk);
>  }
> 
>  static void hvs_open_connection(struct vmbus_channel *chan)
> --
> 2.19.1

Thanks for taking a look at this. We should queue this fix and the other hvsocket fixes
for the stable branch.

^ permalink raw reply

* RE: [PATCH net] hv_sock: Fix hang when a connection is closed
From: Dexuan Cui @ 2019-07-29 20:23 UTC (permalink / raw)
  To: Sunil Muthuswamy, David Miller, netdev@vger.kernel.org
  Cc: KY Srinivasan, Haiyang Zhang, Stephen Hemminger,
	sashal@kernel.org, Michael Kelley, linux-hyperv@vger.kernel.org,
	linux-kernel@vger.kernel.org, olaf@aepfle.de, apw@canonical.com,
	jasowang@redhat.com, vkuznets, marcelo.cerri@canonical.com
In-Reply-To: <MW2PR2101MB1116DC8461F1B02C232019E2C0DD0@MW2PR2101MB1116.namprd21.prod.outlook.com>

> From: Sunil Muthuswamy <sunilmut@microsoft.com>
> Sent: Monday, July 29, 2019 10:21 AM
> > --- a/net/vmw_vsock/hyperv_transport.c
> > +++ b/net/vmw_vsock/hyperv_transport.c
> > @@ -309,9 +309,16 @@ static void hvs_close_connection(struct
> vmbus_channel *chan)
> >  {
> >  	struct sock *sk = get_per_channel_state(chan);
> >
> > +	/* Grab an extra reference since hvs_do_close_lock_held() may decrease
> > +	 * the reference count to 0 by calling sock_put(sk).
> > +	 */
> > +	sock_hold(sk);
> > +
> 
> To me, it seems like when 'hvs_close_connection' is called, there should always
> be an outstanding reference to the socket. 

I agree. There *should* be, but it turns out there is race condition: 

For an established connectin that is being closed by the guest, the refcnt is 4
at the end of hvs_release() (Note: here the 'remove_sock' is false):

1 for the initial value;
1 for the sk being in the bound list;
1 for the sk being in the connected list;
1 for the delayed close_work.

After hvs_release() finishes, __vsock_release() -> sock_put(sk) *may* decrease
the refcnt to 3. 

Concurrently, hvs_close_connection() runs in another thread:
  calls vsock_remove_sock() to decrease the refcnt by 2;
  call sock_put() to decrease the refcnt to 0, and free the sk;
  Next, the "release_sock(sk)" may hang due to use-after-free.

In the above, after hvs_release() finishes, if hvs_close_connection() runs
faster than "__vsock_release() -> sock_put(sk)", then there is not any issue,
because at the beginning of hvs_close_connection(), the refcnt is still 4.

So, this patch can work, but it's not the right fix. 
Your suggestion is correct and here is the patch. 
I'll give it more tests and send a v2.

--- a/net/vmw_vsock/hyperv_transport.c
+++ b/net/vmw_vsock/hyperv_transport.c
@@ -312,6 +312,11 @@ static void hvs_close_connection(struct vmbus_channel *chan)
        lock_sock(sk);
        hvs_do_close_lock_held(vsock_sk(sk), true);
        release_sock(sk);
+
+       /* Release the refcnt for the channel that's opened in
+        * hvs_open_connection().
+        */
+       sock_put(sk);
 }

 static void hvs_open_connection(struct vmbus_channel *chan)
@@ -407,6 +412,9 @@ static void hvs_open_connection(struct vmbus_channel *chan)
        }

        set_per_channel_state(chan, conn_from_host ? new : sk);
+
+       /* This reference will be dropped by hvs_close_connection(). */
+       sock_hold(conn_from_host ? new: sk);
        vmbus_set_chn_rescind_callback(chan, hvs_close_connection);

        /* Set the pending send size to max packet size to always get


> The reference that is dropped by
> ' hvs_do_close_lock_held' is a legitimate reference that was taken by
> 'hvs_close_lock_held'.

Correct.

> Or, in other words, I think the right solution is to always maintain a reference to
> socket
> until this routine is called and drop that here. That can be done by taking the
> reference to
> the socket prior to ' vmbus_set_chn_rescind_callback(chan,
> hvs_close_connection)' and
> dropping that reference at the end of 'hvs_close_connection'.
> 
> >  	lock_sock(sk);
> >  	hvs_do_close_lock_held(vsock_sk(sk), true);
> >  	release_sock(sk);
> > +
> > +	sock_put(sk);
> 
> Thanks for taking a look at this. We should queue this fix and the other
> hvsocket fixes
> for the stable branch.

I added a "Cc: stable@vger.kernel.org" tag so this pach will go to the
stable kernels automatically.

Your previous two fixes are in the v5.2.4 stable kernel, but not in the other
longterm stable kernels 4.19 and 4.14:
https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/log/?h=v5.2.4&qt=author&q=Muthuswamy

I'll request them to be backported for 4.19 and 4.14.
I'll also request the patch "vsock: correct removal of socket from the list"
to be backported.

The other two "hv_sock: perf" patches are more of features rather than
fixes. Usually the stable kernel maintaners don't backport feature patches.

Thanks,
-- Dexuan

^ permalink raw reply

* [PATCH 0/2] Drivers: hv: Remove dependencies on guest page size
From: Himadri Pandya @ 2019-07-30  9:49 UTC (permalink / raw)
  To: mikelley, kys, haiyangz, sthemmin, sashal, tglx, mingo, bp, hpa
  Cc: x86, linux-hyperv, linux-kernel, Himadri Pandya

Hyper-V assumes page size to be 4KB. This might not be the case on ARM64
architecture. The first patch in this patchset introduces a hyer-v
specific function for allocating a zeroed page which can have a
different implementation on ARM64 to address the issue of different
guest and host page sizes. The second patch removes dependencies on
guest page size in vmbus by using hyper-v specific page symbol and
functions. 

Himadri Pandya (2):
  x86: hv: Add function to allocate zeroed page for Hyper-V
  Drivers: hv: vmbus: Remove dependencies on guest page size

 arch/x86/hyperv/hv_init.c       |  8 ++++++++
 arch/x86/include/asm/mshyperv.h |  1 +
 drivers/hv/connection.c         | 14 +++++++-------
 drivers/hv/vmbus_drv.c          |  6 +++---
 4 files changed, 19 insertions(+), 10 deletions(-)

-- 
2.17.1


^ permalink raw reply

* [PATCH 1/2] x86: hv: Add function to allocate zeroed page for Hyper-V
From: Himadri Pandya @ 2019-07-30  9:49 UTC (permalink / raw)
  To: mikelley, kys, haiyangz, sthemmin, sashal, tglx, mingo, bp, hpa
  Cc: x86, linux-hyperv, linux-kernel, Himadri Pandya
In-Reply-To: <20190730094944.96007-1-himadri18.07@gmail.com>

Hyper-V assumes page size to be 4K. While this assumption holds true on
x86 architecture, it might not  be true for ARM64 architecture. Hence
define hyper-v specific function to allocate a zeroed page which can
have a different implementation on ARM64 architecture to handle the
conflict between hyper-v's assumed page size and actual guest page size.

Signed-off-by: Himadri Pandya <himadri18.07@gmail.com>
---
 arch/x86/hyperv/hv_init.c       | 8 ++++++++
 arch/x86/include/asm/mshyperv.h | 1 +
 2 files changed, 9 insertions(+)

diff --git a/arch/x86/hyperv/hv_init.c b/arch/x86/hyperv/hv_init.c
index d314cf1e15fd..2d0b9b2bddf7 100644
--- a/arch/x86/hyperv/hv_init.c
+++ b/arch/x86/hyperv/hv_init.c
@@ -45,6 +45,14 @@ void *hv_alloc_hyperv_page(void)
 }
 EXPORT_SYMBOL_GPL(hv_alloc_hyperv_page);
 
+void *hv_alloc_hyperv_zeroed_page(void)
+{
+        BUILD_BUG_ON(PAGE_SIZE != HV_HYP_PAGE_SIZE);
+
+        return (void *)__get_free_page(GFP_KERNEL | __GFP_ZERO);
+}
+EXPORT_SYMBOL_GPL(hv_alloc_hyperv_zeroed_page);
+
 void hv_free_hyperv_page(unsigned long addr)
 {
 	free_page(addr);
diff --git a/arch/x86/include/asm/mshyperv.h b/arch/x86/include/asm/mshyperv.h
index f4138aeb4280..6b79515abb82 100644
--- a/arch/x86/include/asm/mshyperv.h
+++ b/arch/x86/include/asm/mshyperv.h
@@ -219,6 +219,7 @@ static inline struct hv_vp_assist_page *hv_get_vp_assist_page(unsigned int cpu)
 void __init hyperv_init(void);
 void hyperv_setup_mmu_ops(void);
 void *hv_alloc_hyperv_page(void);
+void *hv_alloc_hyperv_zeroed_page(void);
 void hv_free_hyperv_page(unsigned long addr);
 void hyperv_reenlightenment_intr(struct pt_regs *regs);
 void set_hv_tscchange_cb(void (*cb)(void));
-- 
2.17.1


^ permalink raw reply related

* [PATCH 2/2] Drivers: hv: vmbus: Remove dependencies on guest page size
From: Himadri Pandya @ 2019-07-30  9:49 UTC (permalink / raw)
  To: mikelley, kys, haiyangz, sthemmin, sashal, tglx, mingo, bp, hpa
  Cc: x86, linux-hyperv, linux-kernel, Himadri Pandya
In-Reply-To: <20190730094944.96007-1-himadri18.07@gmail.com>

Hyper-V assumes page size to be 4K. This might not be the case for ARM64
architecture. Hence use hyper-v page size and page allocation function
to avoid conflicts between different host and guest page size on ARM64.

Signed-off-by: Himadri Pandya <himadri18.07@gmail.com>
---
 drivers/hv/connection.c | 14 +++++++-------
 drivers/hv/vmbus_drv.c  |  6 +++---
 2 files changed, 10 insertions(+), 10 deletions(-)

diff --git a/drivers/hv/connection.c b/drivers/hv/connection.c
index 09829e15d4a0..dcb8f6a8c08c 100644
--- a/drivers/hv/connection.c
+++ b/drivers/hv/connection.c
@@ -202,7 +202,7 @@ int vmbus_connect(void)
 	 * abstraction stuff
 	 */
 	vmbus_connection.int_page =
-	(void *)__get_free_pages(GFP_KERNEL|__GFP_ZERO, 0);
+	(void *)hv_alloc_hyperv_zeroed_page();
 	if (vmbus_connection.int_page == NULL) {
 		ret = -ENOMEM;
 		goto cleanup;
@@ -211,14 +211,14 @@ int vmbus_connect(void)
 	vmbus_connection.recv_int_page = vmbus_connection.int_page;
 	vmbus_connection.send_int_page =
 		(void *)((unsigned long)vmbus_connection.int_page +
-			(PAGE_SIZE >> 1));
+			(HV_HYP_PAGE_SIZE >> 1));
 
 	/*
 	 * Setup the monitor notification facility. The 1st page for
 	 * parent->child and the 2nd page for child->parent
 	 */
-	vmbus_connection.monitor_pages[0] = (void *)__get_free_pages((GFP_KERNEL|__GFP_ZERO), 0);
-	vmbus_connection.monitor_pages[1] = (void *)__get_free_pages((GFP_KERNEL|__GFP_ZERO), 0);
+	vmbus_connection.monitor_pages[0] = (void *)hv_alloc_hyperv_zeroed_page();
+	vmbus_connection.monitor_pages[1] = (void *)hv_alloc_hyperv_zeroed_page();
 	if ((vmbus_connection.monitor_pages[0] == NULL) ||
 	    (vmbus_connection.monitor_pages[1] == NULL)) {
 		ret = -ENOMEM;
@@ -291,12 +291,12 @@ void vmbus_disconnect(void)
 		destroy_workqueue(vmbus_connection.work_queue);
 
 	if (vmbus_connection.int_page) {
-		free_pages((unsigned long)vmbus_connection.int_page, 0);
+		hv_free_hyperv_page((unsigned long)vmbus_connection.int_page);
 		vmbus_connection.int_page = NULL;
 	}
 
-	free_pages((unsigned long)vmbus_connection.monitor_pages[0], 0);
-	free_pages((unsigned long)vmbus_connection.monitor_pages[1], 0);
+	hv_free_hyperv_page((unsigned long)vmbus_connection.monitor_pages[0]);
+	hv_free_hyperv_page((unsigned long)vmbus_connection.monitor_pages[1]);
 	vmbus_connection.monitor_pages[0] = NULL;
 	vmbus_connection.monitor_pages[1] = NULL;
 }
diff --git a/drivers/hv/vmbus_drv.c b/drivers/hv/vmbus_drv.c
index ebd35fc35290..2ee388a23c8f 100644
--- a/drivers/hv/vmbus_drv.c
+++ b/drivers/hv/vmbus_drv.c
@@ -1186,7 +1186,7 @@ static void hv_kmsg_dump(struct kmsg_dumper *dumper,
 	 * Write dump contents to the page. No need to synchronize; panic should
 	 * be single-threaded.
 	 */
-	kmsg_dump_get_buffer(dumper, true, hv_panic_page, PAGE_SIZE,
+	kmsg_dump_get_buffer(dumper, true, hv_panic_page, HV_HYP_PAGE_SIZE,
 			     &bytes_written);
 	if (bytes_written)
 		hyperv_report_panic_msg(panic_pa, bytes_written);
@@ -1290,7 +1290,7 @@ static int vmbus_bus_init(void)
 		 */
 		hv_get_crash_ctl(hyperv_crash_ctl);
 		if (hyperv_crash_ctl & HV_CRASH_CTL_CRASH_NOTIFY_MSG) {
-			hv_panic_page = (void *)get_zeroed_page(GFP_KERNEL);
+			hv_panic_page = (void *)hv_alloc_hyperv_zeroed_page();
 			if (hv_panic_page) {
 				ret = kmsg_dump_register(&hv_kmsg_dumper);
 				if (ret)
@@ -1319,7 +1319,7 @@ static int vmbus_bus_init(void)
 	hv_remove_vmbus_irq();
 
 	bus_unregister(&hv_bus);
-	free_page((unsigned long)hv_panic_page);
+	hv_free_hyperv_page((unsigned long)hv_panic_page);
 	unregister_sysctl_table(hv_ctl_table_hdr);
 	hv_ctl_table_hdr = NULL;
 	return ret;
-- 
2.17.1


^ permalink raw reply related

* Re: [PATCH 0/2] clocksource/Hyper-V: Add Hyper-V specific sched clock function
From: Tianyu Lan @ 2019-07-30 13:41 UTC (permalink / raw)
  To: Vitaly Kuznetsov
  Cc: Peter Zijlstra, Tianyu Lan, linux-arch, linux-hyperv,
	linux-kernel@vger kernel org, Andy Lutomirski, Thomas Gleixner,
	Ingo Molnar, Borislav Petkov, H. Peter Anvin,
	the arch/x86 maintainers, KY Srinivasan, Haiyang Zhang,
	Stephen Hemminger, Sasha Levin, Daniel Lezcano, Arnd Bergmann,
	michael.h.kelley, ashal
In-Reply-To: <87wog1kpib.fsf@vitty.brq.redhat.com>

Hi Vitaly & Peter:
    Thanks for your review.

On Mon, Jul 29, 2019 at 8:13 PM Vitaly Kuznetsov <vkuznets@redhat.com> wrote:
>
> Peter Zijlstra <peterz@infradead.org> writes:
>
> > On Mon, Jul 29, 2019 at 12:59:26PM +0200, Vitaly Kuznetsov wrote:
> >> lantianyu1986@gmail.com writes:
> >>
> >> > From: Tianyu Lan <Tianyu.Lan@microsoft.com>
> >> >
> >> > Hyper-V guests use the default native_sched_clock() in pv_ops.time.sched_clock
> >> > on x86.  But native_sched_clock() directly uses the raw TSC value, which
> >> > can be discontinuous in a Hyper-V VM.   Add the generic hv_setup_sched_clock()
> >> > to set the sched clock function appropriately.  On x86, this sets
> >> > pv_ops.time.sched_clock to read the Hyper-V reference TSC value that is
> >> > scaled and adjusted to be continuous.
> >>
> >> Hypervisor can, in theory, disable TSC page and then we're forced to use
> >> MSR-based clocksource but using it as sched_clock() can be very slow,
> >> I'm afraid.
> >>
> >> On the other hand, what we have now is probably worse: TSC can,
> >> actually, jump backwards (e.g. on migration) and we're breaking the
> >> requirements for sched_clock().
> >
> > That (obviously) also breaks the requirements for using TSC as
> > clocksource.
> >
> > IOW, it breaks the entire purpose of having TSC in the first place.
>
> Currently, we mark raw TSC as unstable when running on Hyper-V (see
> 88c9281a9fba6), 'TSC page' (which is TSC * scale + offset) is being used
> instead. The problem is that 'TSC page' can be disabled by the
> hypervisor and in that case the only remaining clocksource is MSR-based
> (slow).
>

Yes, that will be slow if Hyper-V doesn't expose hv tsc page and
kernel uses MSR based
clocksource. Each MSR read will trigger one VM-EXIT. This also happens on other
hypervisors (e,g, KVM doesn't expose KVM clock). Hypervisor should
take this into
account and determine which clocksource should be exposed or not.

-- 
Best regards
Tianyu Lan

^ permalink raw reply

* RE: [PATCH v2] x86/hyper-v: Zero out the VP ASSIST PAGE to fix CPU offlining
From: Michael Kelley @ 2019-07-30 20:22 UTC (permalink / raw)
  To: Dexuan Cui, Thomas Gleixner, vkuznets, Haiyang Zhang,
	KY Srinivasan, Stephen Hemminger, Sasha Levin,
	linux-hyperv@vger.kernel.org, Long Li, x86@kernel.org
  Cc: Ingo Molnar, Borislav Petkov, H. Peter Anvin, jasowang@redhat.com,
	driverdev-devel@linuxdriverproject.org,
	linux-kernel@vger.kernel.org, apw@canonical.com,
	marcelo.cerri@canonical.com, olaf@aepfle.de
In-Reply-To: <PU1P153MB0169B716A637FABF07433C04BFCB0@PU1P153MB0169.APCP153.PROD.OUTLOOK.COM>

From: Dexuan Cui <decui@microsoft.com> Sent: Thursday, July 18, 2019 8:23 PM
> 
> The VP ASSIST PAGE is an "overlay" page (see Hyper-V TLFS's Section
> 5.2.1 "GPA Overlay Pages" for the details) and here is an excerpt:
> 
> "
> The hypervisor defines several special pages that "overlay" the guest's
> Guest Physical Addresses (GPA) space. Overlays are addressed GPA but are
> not included in the normal GPA map maintained internally by the hypervisor.
> Conceptually, they exist in a separate map that overlays the GPA map.
> 
> If a page within the GPA space is overlaid, any SPA page mapped to the
> GPA page is effectively "obscured" and generally unreachable by the
> virtual processor through processor memory accesses.
> 
> If an overlay page is disabled, the underlying GPA page is "uncovered",
> and an existing mapping becomes accessible to the guest.
> "
> 
> SPA = System Physical Address = the final real physical address.
> 
> When a CPU (e.g. CPU1) is being onlined, in hv_cpu_init(), we allocate the
> VP ASSIST PAGE and enable the EOI optimization for this CPU by writing the
> MSR HV_X64_MSR_VP_ASSIST_PAGE. From now on, hvp->apic_assist belongs to the
> special SPA page, and this CPU *always* uses hvp->apic_assist (which is
> shared with the hypervisor) to decide if it needs to write the EOI MSR.
> 
> When a CPU (e.g. CPU1) is being offlined, on this CPU, we do:
> 1. in hv_cpu_die(), we disable the EOI optimizaton for this CPU, and from
>    now on hvp->apic_assist belongs to the original "normal" SPA page;
> 2. we finish the remaining work of stopping this CPU;
> 3. this CPU is completely stopped.
> 
> Between 1 and 3, this CPU can still receive interrupts (e.g. reschedule
> IPIs from CPU0, and Local APIC timer interrupts), and this CPU *must* write
> the EOI MSR for every interrupt received, otherwise the hypervisor may not
> deliver further interrupts, which may be needed to completely stop the CPU.
> 
> So, after we disable the EOI optimization in hv_cpu_die(), we need to make
> sure hvp->apic_assist's bit0 is zero. The easiest way is we just zero out
> the page when it's allocated in hv_cpu_init().
> 
> Note 1: after the "normal" SPA page is allocted and zeroed out, neither the
> hypervisor nor the guest writes into the page, so the page remains with
> zeros.
> 
> Note 2: see Section 10.3.5 "EOI Assist" for the details of the EOI
> optimization. When the optimization is enabled, the guest can still write
> the EOI MSR register irrespective of the "No EOI required" value, though
> by doing so we can't benefit from the optimization.
> 
> Fixes: ba696429d290 ("x86/hyper-v: Implement EOI assist")
> Signed-off-by: Dexuan Cui <decui@microsoft.com>
> ---
> 
> v2: there is no code change. I just improved the comment and the changelog
> according to the discussion with tglx:
> 
> https://lkml.org/lkml/2019/7/17/781
> https://lkml.org/lkml/2019/7/18/91 
> 
>  arch/x86/hyperv/hv_init.c | 10 +++++++++-
>  1 file changed, 9 insertions(+), 1 deletion(-)
> 
> diff --git a/arch/x86/hyperv/hv_init.c b/arch/x86/hyperv/hv_init.c
> index 0e033ef11a9f..d26832cb38bb 100644
> --- a/arch/x86/hyperv/hv_init.c
> +++ b/arch/x86/hyperv/hv_init.c
> @@ -60,8 +60,16 @@ static int hv_cpu_init(unsigned int cpu)
>  	if (!hv_vp_assist_page)
>  		return 0;
> 
> +	/*
> +	 * The VP ASSIST PAGE is an "overlay" page (see Hyper-V TLFS's Section
> +	 * 5.2.1 "GPA Overlay Pages"). Here it must be zeroed out to make sure
> +	 * we always write the EOI MSR in hv_apic_eoi_write() *after* the
> +	 * EOI optimization is disabled in hv_cpu_die(), otherwise a CPU may
> +	 * not be stopped in the case of CPU offlining and the VM will hang.
> +	 */
>  	if (!*hvp)
> -		*hvp = __vmalloc(PAGE_SIZE, GFP_KERNEL, PAGE_KERNEL);
> +		*hvp = __vmalloc(PAGE_SIZE, GFP_KERNEL | __GFP_ZERO,
> +				 PAGE_KERNEL);
> 
>  	if (*hvp) {
>  		u64 val;
> --
> 2.19.1

Reviewed-by:  Michael Kelley <mikelley@microsoft.com>


^ permalink raw reply

* RE: [PATCH 1/7] x86/hyper-v: Suspend/resume the hypercall page for hibernation
From: Michael Kelley @ 2019-07-30 22:18 UTC (permalink / raw)
  To: Dexuan Cui, linux-hyperv@vger.kernel.org,
	gregkh@linuxfoundation.org, Stephen Hemminger, Sasha Levin,
	sashal@kernel.org, Haiyang Zhang, KY Srinivasan,
	tglx@linutronix.de
  Cc: linux-kernel@vger.kernel.org
In-Reply-To: <1562650084-99874-2-git-send-email-decui@microsoft.com>

From: Dexuan Cui <decui@microsoft.com> Sent: Monday, July 8, 2019 10:29 PM
> 
> This is needed for hibernation, e.g. when we resume the old kernel, we need
> to disable the "current" kernel's hypercall page and then resume the old
> kernel's.
> 
> Signed-off-by: Dexuan Cui <decui@microsoft.com>
> ---
>  arch/x86/hyperv/hv_init.c | 34 ++++++++++++++++++++++++++++++++++
>  1 file changed, 34 insertions(+)
> 
> diff --git a/arch/x86/hyperv/hv_init.c b/arch/x86/hyperv/hv_init.c
> index 0e033ef..3005871 100644
> --- a/arch/x86/hyperv/hv_init.c
> +++ b/arch/x86/hyperv/hv_init.c
> @@ -20,6 +20,7 @@
>  #include <linux/hyperv.h>
>  #include <linux/slab.h>
>  #include <linux/cpuhotplug.h>
> +#include <linux/syscore_ops.h>
>  #include <clocksource/hyperv_timer.h>
> 
>  void *hv_hypercall_pg;
> @@ -214,6 +215,34 @@ static int __init hv_pci_init(void)
>  	return 1;
>  }
> 
> +static int hv_suspend(void)
> +{
> +	union hv_x64_msr_hypercall_contents hypercall_msr;
> +
> +	/* Reset the hypercall page */
> +	rdmsrl(HV_X64_MSR_HYPERCALL, hypercall_msr.as_uint64);
> +	hypercall_msr.enable = 0;
> +	wrmsrl(HV_X64_MSR_HYPERCALL, hypercall_msr.as_uint64);
> +
> +	return 0;
> +}
> +
> +static void hv_resume(void)
> +{
> +	union hv_x64_msr_hypercall_contents hypercall_msr;
> +
> +	/* Re-enable the hypercall page */
> +	rdmsrl(HV_X64_MSR_HYPERCALL, hypercall_msr.as_uint64);
> +	hypercall_msr.enable = 1;
> +	hypercall_msr.guest_physical_address = vmalloc_to_pfn(hv_hypercall_pg);
> +	wrmsrl(HV_X64_MSR_HYPERCALL, hypercall_msr.as_uint64);
> +}
> +
> +static struct syscore_ops hv_syscore_ops = {
> +	.suspend = hv_suspend,
> +	.resume = hv_resume,
> +};
> +
>  /*
>   * This function is to be invoked early in the boot sequence after the
>   * hypervisor has been detected.
> @@ -294,6 +323,9 @@ void __init hyperv_init(void)
> 
>  	/* Register Hyper-V specific clocksource */
>  	hv_init_clocksource();
> +
> +	register_syscore_ops(&hv_syscore_ops);
> +
>  	return;
> 
>  remove_cpuhp_state:
> @@ -313,6 +345,8 @@ void hyperv_cleanup(void)
>  {
>  	union hv_x64_msr_hypercall_contents hypercall_msr;
> 
> +	unregister_syscore_ops(&hv_syscore_ops);
> +
>  	/* Reset our OS id */
>  	wrmsrl(HV_X64_MSR_GUEST_OS_ID, 0);
> 
> --
> 1.8.3.1

Reviewed-by: Michael Kelley <mikelley@microsoft.com>


^ permalink raw reply

* RE: [PATCH 2/7] clocksource/drivers: Suspend/resume Hyper-V clocksource for hibernation
From: Michael Kelley @ 2019-07-30 22:23 UTC (permalink / raw)
  To: Dexuan Cui, linux-hyperv@vger.kernel.org,
	gregkh@linuxfoundation.org, Stephen Hemminger, Sasha Levin,
	sashal@kernel.org, Haiyang Zhang, KY Srinivasan,
	tglx@linutronix.de
  Cc: linux-kernel@vger.kernel.org
In-Reply-To: <1562650084-99874-3-git-send-email-decui@microsoft.com>

From: Dexuan Cui <decui@microsoft.com> Sent: Monday, July 8, 2019 10:29 PM
> 
> This is needed for hibernation, e.g. when we resume the old kernel, we need
> to disable the "current" kernel's TSC page and then resume the old kernel's.
> 
> Signed-off-by: Dexuan Cui <decui@microsoft.com>
> ---
>  drivers/clocksource/hyperv_timer.c | 25 +++++++++++++++++++++++++
>  1 file changed, 25 insertions(+)
> 
> diff --git a/drivers/clocksource/hyperv_timer.c b/drivers/clocksource/hyperv_timer.c
> index ba2c79e6..41c31a7 100644
> --- a/drivers/clocksource/hyperv_timer.c
> +++ b/drivers/clocksource/hyperv_timer.c
> @@ -237,12 +237,37 @@ static u64 read_hv_clock_tsc(struct clocksource *arg)
>  	return read_hv_sched_clock_tsc();
>  }
> 
> +static void suspend_hv_clock_tsc(struct clocksource *arg)
> +{
> +	u64 tsc_msr;
> +
> +	/* Disable the TSC page */
> +	hv_get_reference_tsc(tsc_msr);
> +	tsc_msr &= ~BIT_ULL(0);
> +	hv_set_reference_tsc(tsc_msr);
> +}
> +
> +
> +static void resume_hv_clock_tsc(struct clocksource *arg)
> +{
> +	phys_addr_t phys_addr = page_to_phys(vmalloc_to_page(tsc_pg));
> +	u64 tsc_msr;
> +
> +	/* Re-enable the TSC page */
> +	hv_get_reference_tsc(tsc_msr);
> +	tsc_msr &= GENMASK_ULL(11, 0);
> +	tsc_msr |= BIT_ULL(0) | (u64)phys_addr;
> +	hv_set_reference_tsc(tsc_msr);
> +}
> +
>  static struct clocksource hyperv_cs_tsc = {
>  	.name	= "hyperv_clocksource_tsc_page",
>  	.rating	= 400,
>  	.read	= read_hv_clock_tsc,
>  	.mask	= CLOCKSOURCE_MASK(64),
>  	.flags	= CLOCK_SOURCE_IS_CONTINUOUS,
> +	.suspend= suspend_hv_clock_tsc,
> +	.resume	= resume_hv_clock_tsc,
>  };
>  #endif
> 
> --
> 1.8.3.1

Reviewed-by: Michael Kelley <mikelley@microsoft.com>


^ permalink raw reply

* Re: [PATCH] hv: Use the correct style for SPDX License Identifier
From: Sasha Levin @ 2019-07-30 22:31 UTC (permalink / raw)
  To: Greg Kroah-Hartman
  Cc: Nishad Kamdar, K. Y. Srinivasan, Haiyang Zhang, Stephen Hemminger,
	Joe Perches, Uwe Kleine-König, linux-hyperv, linux-kernel
In-Reply-To: <20190722140809.GA29862@kroah.com>

On Mon, Jul 22, 2019 at 04:08:09PM +0200, Greg Kroah-Hartman wrote:
>On Mon, Jul 22, 2019 at 07:01:17PM +0530, Nishad Kamdar wrote:
>> This patch corrects the SPDX License Identifier style
>> in the trace header file related to Microsoft Hyper-V
>> client drivers.
>> For C header files Documentation/process/license-rules.rst
>> mandates C-like comments (opposed to C source files where
>> C++ style should be used)
>>
>> Changes made by using a script provided by Joe Perches here:
>> https://lkml.org/lkml/2019/2/7/46
>>
>> Suggested-by: Joe Perches <joe@perches.com>
>> Signed-off-by: Nishad Kamdar <nishadkamdar@gmail.com>
>
>Acked-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>

Queued up for hyperv-fixes, thanks!

--
Thanks,
Sasha

^ permalink raw reply

* RE: [PATCH 3/7] Drivers: hv: vmbus: Split hv_synic_init/cleanup into regs and timer settings
From: Michael Kelley @ 2019-07-30 22:35 UTC (permalink / raw)
  To: Dexuan Cui, linux-hyperv@vger.kernel.org,
	gregkh@linuxfoundation.org, Stephen Hemminger, Sasha Levin,
	sashal@kernel.org, Haiyang Zhang, KY Srinivasan,
	tglx@linutronix.de
  Cc: linux-kernel@vger.kernel.org
In-Reply-To: <1562650084-99874-4-git-send-email-decui@microsoft.com>

From: Dexuan Cui <decui@microsoft.com>  Sent: Monday, July 8, 2019 10:29 PM
> 
> There is only one functional change: the unnecessary check
> "if (sctrl.enable != 1) return -EFAULT;" is removed, because when we're in
> hv_synic_cleanup(), we're absolutely sure sctrl.enable must be 1.
> 
> The new functions hv_synic_disable/enable_regs() will be used by a later patch
> to support hibernation.

Seems like this commit message doesn't really describe the main change.
How about:

Break out synic enable and disable operations into separate
hv_synic_disable_regs() and hv_synic_enable_regs() functions for use by a
later patch to support hibernation.

There is no functional change except the unnecessary check
"if (sctrl.enable != 1) return -EFAULT;" is removed, because when we're in
hv_synic_cleanup(), we're absolutely sure sctrl.enable must be 1.

Otherwise,

Reviewed-by:  Michael Kelley <mikelley@microsoft.com>

> 
> Signed-off-by: Dexuan Cui <decui@microsoft.com>
> ---
>  drivers/hv/hv.c           | 66 ++++++++++++++++++++++++++---------------------
>  drivers/hv/hyperv_vmbus.h |  2 ++
>  2 files changed, 39 insertions(+), 29 deletions(-)
> 
> diff --git a/drivers/hv/hv.c b/drivers/hv/hv.c
> index 6188fb7..fcc5279 100644
> --- a/drivers/hv/hv.c
> +++ b/drivers/hv/hv.c
> @@ -154,7 +154,7 @@ void hv_synic_free(void)
>   * retrieve the initialized message and event pages.  Otherwise, we create and
>   * initialize the message and event pages.
>   */
> -int hv_synic_init(unsigned int cpu)
> +void hv_synic_enable_regs(unsigned int cpu)
>  {
>  	struct hv_per_cpu_context *hv_cpu
>  		= per_cpu_ptr(hv_context.cpu_context, cpu);
> @@ -196,6 +196,11 @@ int hv_synic_init(unsigned int cpu)
>  	sctrl.enable = 1;
> 
>  	hv_set_synic_state(sctrl.as_uint64);
> +}
> +
> +int hv_synic_init(unsigned int cpu)
> +{
> +	hv_synic_enable_regs(cpu);
> 
>  	hv_stimer_init(cpu);
> 
> @@ -205,20 +210,45 @@ int hv_synic_init(unsigned int cpu)
>  /*
>   * hv_synic_cleanup - Cleanup routine for hv_synic_init().
>   */
> -int hv_synic_cleanup(unsigned int cpu)
> +void hv_synic_disable_regs(unsigned int cpu)
>  {
>  	union hv_synic_sint shared_sint;
>  	union hv_synic_simp simp;
>  	union hv_synic_siefp siefp;
>  	union hv_synic_scontrol sctrl;
> +
> +	hv_get_synint_state(VMBUS_MESSAGE_SINT, shared_sint.as_uint64);
> +
> +	shared_sint.masked = 1;
> +
> +	/* Need to correctly cleanup in the case of SMP!!! */
> +	/* Disable the interrupt */
> +	hv_set_synint_state(VMBUS_MESSAGE_SINT, shared_sint.as_uint64);
> +
> +	hv_get_simp(simp.as_uint64);
> +	simp.simp_enabled = 0;
> +	simp.base_simp_gpa = 0;
> +
> +	hv_set_simp(simp.as_uint64);
> +
> +	hv_get_siefp(siefp.as_uint64);
> +	siefp.siefp_enabled = 0;
> +	siefp.base_siefp_gpa = 0;
> +
> +	hv_set_siefp(siefp.as_uint64);
> +
> +	/* Disable the global synic bit */
> +	hv_get_synic_state(sctrl.as_uint64);
> +	sctrl.enable = 0;
> +	hv_set_synic_state(sctrl.as_uint64);
> +}
> +
> +int hv_synic_cleanup(unsigned int cpu)
> +{
>  	struct vmbus_channel *channel, *sc;
>  	bool channel_found = false;
>  	unsigned long flags;
> 
> -	hv_get_synic_state(sctrl.as_uint64);
> -	if (sctrl.enable != 1)
> -		return -EFAULT;
> -
>  	/*
>  	 * Search for channels which are bound to the CPU we're about to
>  	 * cleanup. In case we find one and vmbus is still connected we need to
> @@ -249,29 +279,7 @@ int hv_synic_cleanup(unsigned int cpu)
> 
>  	hv_stimer_cleanup(cpu);
> 
> -	hv_get_synint_state(VMBUS_MESSAGE_SINT, shared_sint.as_uint64);
> -
> -	shared_sint.masked = 1;
> -
> -	/* Need to correctly cleanup in the case of SMP!!! */
> -	/* Disable the interrupt */
> -	hv_set_synint_state(VMBUS_MESSAGE_SINT, shared_sint.as_uint64);
> -
> -	hv_get_simp(simp.as_uint64);
> -	simp.simp_enabled = 0;
> -	simp.base_simp_gpa = 0;
> -
> -	hv_set_simp(simp.as_uint64);
> -
> -	hv_get_siefp(siefp.as_uint64);
> -	siefp.siefp_enabled = 0;
> -	siefp.base_siefp_gpa = 0;
> -
> -	hv_set_siefp(siefp.as_uint64);
> -
> -	/* Disable the global synic bit */
> -	sctrl.enable = 0;
> -	hv_set_synic_state(sctrl.as_uint64);
> +	hv_synic_disable_regs(cpu);
> 
>  	return 0;
>  }
> diff --git a/drivers/hv/hyperv_vmbus.h b/drivers/hv/hyperv_vmbus.h
> index 362e70e..26ea161 100644
> --- a/drivers/hv/hyperv_vmbus.h
> +++ b/drivers/hv/hyperv_vmbus.h
> @@ -171,8 +171,10 @@ extern int hv_post_message(union hv_connection_id
> connection_id,
> 
>  extern void hv_synic_free(void);
> 
> +extern void hv_synic_enable_regs(unsigned int cpu);
>  extern int hv_synic_init(unsigned int cpu);
> 
> +extern void hv_synic_disable_regs(unsigned int cpu);
>  extern int hv_synic_cleanup(unsigned int cpu);
> 
>  /* Interface */
> --
> 1.8.3.1


^ permalink raw reply

* Re: [PATCH 1/2] hv_balloon: Use a static page for the balloon_up send buffer
From: Sasha Levin @ 2019-07-30 22:36 UTC (permalink / raw)
  To: Dexuan Cui
  Cc: linux-hyperv@vger.kernel.org, gregkh@linuxfoundation.org,
	Stephen Hemminger, Sasha Levin, Haiyang Zhang, KY Srinivasan,
	linux-kernel@vger.kernel.org, Michael Kelley, Tianyu Lan,
	olaf@aepfle.de, apw@canonical.com, jasowang@redhat.com, vkuznets,
	marcelo.cerri@canonical.com
In-Reply-To: <1560537692-37400-1-git-send-email-decui@microsoft.com>

On Fri, Jun 14, 2019 at 06:42:17PM +0000, Dexuan Cui wrote:
>It's unnecessary to dynamically allocate the buffer.
>
>Signed-off-by: Dexuan Cui <decui@microsoft.com>

I've queued these up for hyperv-next, thanks!

--
Thanks,
Sasha

^ permalink raw reply

* RE: [PATCH 5/7] Drivers: hv: vmbus: Ignore the offers when resuming from hibernation
From: Michael Kelley @ 2019-07-30 23:07 UTC (permalink / raw)
  To: Dexuan Cui, linux-hyperv@vger.kernel.org,
	gregkh@linuxfoundation.org, Stephen Hemminger, Sasha Levin,
	sashal@kernel.org, Haiyang Zhang, KY Srinivasan,
	tglx@linutronix.de
  Cc: linux-kernel@vger.kernel.org
In-Reply-To: <1562650084-99874-6-git-send-email-decui@microsoft.com>

From: Dexuan Cui <decui@microsoft.com> Sent: Monday, July 8, 2019 10:29 PM
> 
> When the VM resumes, the host re-sends the offers. We should not add the
> offers to the global vmbus_connection.chn_list again.
> 
> Added some debug code, in case the host screws up the exact info related to
> the offers.
> 
> Signed-off-by: Dexuan Cui <decui@microsoft.com>
> ---
>  drivers/hv/channel_mgmt.c | 28 +++++++++++++++++++++++++++-
>  1 file changed, 27 insertions(+), 1 deletion(-)
> 
> diff --git a/drivers/hv/channel_mgmt.c b/drivers/hv/channel_mgmt.c
> index addcef5..a9aeeab 100644
> --- a/drivers/hv/channel_mgmt.c
> +++ b/drivers/hv/channel_mgmt.c
> @@ -854,12 +854,38 @@ void vmbus_initiate_unload(bool crash)
>  static void vmbus_onoffer(struct vmbus_channel_message_header *hdr)
>  {
>  	struct vmbus_channel_offer_channel *offer;
> -	struct vmbus_channel *newchannel;
> +	struct vmbus_channel *oldchannel, *newchannel;
> +	size_t offer_sz;
> 
>  	offer = (struct vmbus_channel_offer_channel *)hdr;
> 
>  	trace_vmbus_onoffer(offer);
> 
> +	mutex_lock(&vmbus_connection.channel_mutex);
> +	oldchannel = relid2channel(offer->child_relid);
> +	mutex_unlock(&vmbus_connection.channel_mutex);
> +
> +	if (oldchannel != NULL) {
> +		atomic_dec(&vmbus_connection.offer_in_progress);
> +
> +		/*
> +		 * We're resuming from hibernation: we expect the host to send
> +		 * exactly the same offers that we had before the hibernation.
> +		 */
> +		offer_sz = sizeof(*offer);
> +		if (memcmp(offer, &oldchannel->offermsg, offer_sz) == 0)
> +			return;

The offermsg contains "reserved" and "padding" fields.  Does Hyper-V
guarantee that all these fields are the same in the new offer after resuming
from hibernation?  Or should a less stringent check be made?  For example,
I could imagine a newer version of Hyper-V allowing a VM that was
hibernated on an older version to be resumed.  But one of the reserved fields
might be used in the newer version, and the comparison could fail
unnecessarily.

> +
> +		pr_err("Mismatched offer from the host (relid=%d)!\n",
> +		       offer->child_relid);
> +
> +		print_hex_dump_debug("Old vmbus offer: ", DUMP_PREFIX_OFFSET, 4,
> +				     4, &oldchannel->offermsg, offer_sz, false);
> +		print_hex_dump_debug("New vmbus offer: ", DUMP_PREFIX_OFFSET, 4,
> +				     4, offer, offer_sz, false);

The third argument to print_hex_dump() is the rowsize and is specified as must
be 16 or 32.  

> +		return;
> +	}
> +
>  	/* Allocate the channel object and save this offer. */
>  	newchannel = alloc_channel();
>  	if (!newchannel) {
> --
> 1.8.3.1


^ permalink raw reply

* RE: [PATCH 3/7] Drivers: hv: vmbus: Split hv_synic_init/cleanup into regs and timer settings
From: Dexuan Cui @ 2019-07-30 23:18 UTC (permalink / raw)
  To: Michael Kelley, linux-hyperv@vger.kernel.org,
	gregkh@linuxfoundation.org, Stephen Hemminger, Sasha Levin,
	sashal@kernel.org, Haiyang Zhang, KY Srinivasan,
	tglx@linutronix.de
  Cc: linux-kernel@vger.kernel.org
In-Reply-To: <MWHPR21MB0784D7CFED4961C5B3D310B4D7DC0@MWHPR21MB0784.namprd21.prod.outlook.com>

> From: Michael Kelley <mikelley@microsoft.com>
> Sent: Tuesday, July 30, 2019 3:36 PM
> 
> From: Dexuan Cui <decui@microsoft.com>  Sent: Monday, July 8, 2019 10:29
> PM
> >
> > There is only one functional change: the unnecessary check
> > "if (sctrl.enable != 1) return -EFAULT;" is removed, because when we're in
> > hv_synic_cleanup(), we're absolutely sure sctrl.enable must be 1.
> >
> > The new functions hv_synic_disable/enable_regs() will be used by a later
> patch
> > to support hibernation.
> 
> Seems like this commit message doesn't really describe the main change.
> How about:
> 
> Break out synic enable and disable operations into separate
> hv_synic_disable_regs() and hv_synic_enable_regs() functions for use by a
> later patch to support hibernation.
> 
> There is no functional change except the unnecessary check
> "if (sctrl.enable != 1) return -EFAULT;" is removed, because when we're in
> hv_synic_cleanup(), we're absolutely sure sctrl.enable must be 1.
> 
> Otherwise,
> 
> Reviewed-by:  Michael Kelley <mikelley@microsoft.com>

Thanks! I'll use your version as the changelog of v2. I'll change the 
Subject accordingly.

Thanks,
-- Dexuan

^ permalink raw reply

* RE: [PATCH 6/7] Drivers: hv: vmbus: Suspend/resume the vmbus itself for hibernation
From: Michael Kelley @ 2019-07-30 23:25 UTC (permalink / raw)
  To: Dexuan Cui, linux-hyperv@vger.kernel.org,
	gregkh@linuxfoundation.org, Stephen Hemminger, Sasha Levin,
	sashal@kernel.org, Haiyang Zhang, KY Srinivasan,
	tglx@linutronix.de
  Cc: linux-kernel@vger.kernel.org
In-Reply-To: <1562650084-99874-7-git-send-email-decui@microsoft.com>

From: Dexuan Cui <decui@microsoft.com>  Sent: Monday, July 8, 2019 10:30 PM
> 
> This is needed when we resume the old kernel from the "current" kernel.

Perhaps a bit more descriptive commit message could be supplied?

> 
> Signed-off-by: Dexuan Cui <decui@microsoft.com>
> ---
>  drivers/hv/connection.c   |  3 +--
>  drivers/hv/hyperv_vmbus.h |  2 ++
>  drivers/hv/vmbus_drv.c    | 40 ++++++++++++++++++++++++++++++++++++++++
>  3 files changed, 43 insertions(+), 2 deletions(-)
> 
> diff --git a/drivers/hv/connection.c b/drivers/hv/connection.c
> index 09829e1..806319c 100644
> --- a/drivers/hv/connection.c
> +++ b/drivers/hv/connection.c
> @@ -59,8 +59,7 @@ static __u32 vmbus_get_next_version(__u32 current_version)
>  	}
>  }
> 
> -static int vmbus_negotiate_version(struct vmbus_channel_msginfo *msginfo,
> -					__u32 version)
> +int vmbus_negotiate_version(struct vmbus_channel_msginfo *msginfo, u32 version)
>  {
>  	int ret = 0;
>  	unsigned int cur_cpu;
> diff --git a/drivers/hv/hyperv_vmbus.h b/drivers/hv/hyperv_vmbus.h
> index 26ea161..e657197 100644
> --- a/drivers/hv/hyperv_vmbus.h
> +++ b/drivers/hv/hyperv_vmbus.h
> @@ -274,6 +274,8 @@ struct vmbus_msginfo {
> 
>  extern struct vmbus_connection vmbus_connection;
> 
> +int vmbus_negotiate_version(struct vmbus_channel_msginfo *msginfo, u32 version);
> +
>  static inline void vmbus_send_interrupt(u32 relid)
>  {
>  	sync_set_bit(relid, vmbus_connection.send_int_page);
> diff --git a/drivers/hv/vmbus_drv.c b/drivers/hv/vmbus_drv.c
> index 1c2d935..1730e7b 100644
> --- a/drivers/hv/vmbus_drv.c
> +++ b/drivers/hv/vmbus_drv.c
> @@ -2045,6 +2045,41 @@ static int vmbus_acpi_add(struct acpi_device *device)
>  	return ret_val;
>  }
> 
> +static int vmbus_bus_suspend(struct device *dev)
> +{
> +	vmbus_initiate_unload(false);
> +
> +	vmbus_connection.conn_state = DISCONNECTED;
> +
> +	return 0;
> +}
> +
> +static int vmbus_bus_resume(struct device *dev)
> +{
> +	struct vmbus_channel_msginfo *msginfo;
> +	size_t msgsize;
> +	int ret;
> +
> +	msgsize = sizeof(*msginfo) +
> +		  sizeof(struct vmbus_channel_initiate_contact);
> +
> +	msginfo = kzalloc(msgsize, GFP_KERNEL);
> +
> +	if (msginfo == NULL)
> +		return -ENOMEM;
> +
> +	ret = vmbus_negotiate_version(msginfo, vmbus_proto_version);

I think this code answers my earlier question:  Upon resume, we negotiate the same
VMbus protocol version we had at time of hibernation, even if running on a newer
version of Hyper-V that might support newer protocol versions.  Hence the offer
messages should not have any previously reserved fields now being used.   A
comment to this effect would be useful.

> +
> +	kfree(msginfo);
> +
> +	if (ret != 0)
> +		return ret;
> +
> +	vmbus_request_offers();
> +
> +	return 0;
> +}
> +
>  static const struct acpi_device_id vmbus_acpi_device_ids[] = {
>  	{"VMBUS", 0},
>  	{"VMBus", 0},
> @@ -2052,6 +2087,10 @@ static int vmbus_acpi_add(struct acpi_device *device)
>  };
>  MODULE_DEVICE_TABLE(acpi, vmbus_acpi_device_ids);
> 
> +static const struct dev_pm_ops vmbus_bus_pm = {
> +	SET_SYSTEM_SLEEP_PM_OPS(vmbus_bus_suspend, vmbus_bus_resume)
> +};
> +
>  static struct acpi_driver vmbus_acpi_driver = {
>  	.name = "vmbus",
>  	.ids = vmbus_acpi_device_ids,
> @@ -2059,6 +2098,7 @@ static int vmbus_acpi_add(struct acpi_device *device)
>  		.add = vmbus_acpi_add,
>  		.remove = vmbus_acpi_remove,
>  	},
> +	.drv.pm = &vmbus_bus_pm,
>  };
> 
>  static void hv_kexec_handler(void)
> --
> 1.8.3.1


^ permalink raw reply

* RE: [PATCH 7/7] Drivers: hv: vmbus: Implement suspend/resume for VSC drivers for hibernation
From: Michael Kelley @ 2019-07-30 23:38 UTC (permalink / raw)
  To: Dexuan Cui, linux-hyperv@vger.kernel.org,
	gregkh@linuxfoundation.org, Stephen Hemminger, Sasha Levin,
	sashal@kernel.org, Haiyang Zhang, KY Srinivasan,
	tglx@linutronix.de
  Cc: linux-kernel@vger.kernel.org
In-Reply-To: <1562650084-99874-8-git-send-email-decui@microsoft.com>

From: Dexuan Cui <decui@microsoft.com> Sent: Monday, July 8, 2019 10:30 PM
> 
> The high-level VSC drivers will implement device-specific callbacks.
> 
> Signed-off-by: Dexuan Cui <decui@microsoft.com>
> ---
>  drivers/hv/vmbus_drv.c | 42 ++++++++++++++++++++++++++++++++++++++++++
>  include/linux/hyperv.h |  3 +++
>  2 files changed, 45 insertions(+)
> 
> diff --git a/drivers/hv/vmbus_drv.c b/drivers/hv/vmbus_drv.c
> index 1730e7b..e29e2171 100644
> --- a/drivers/hv/vmbus_drv.c
> +++ b/drivers/hv/vmbus_drv.c
> @@ -911,6 +911,43 @@ static void vmbus_shutdown(struct device *child_device)
>  		drv->shutdown(dev);
>  }
> 
> +/*
> + * vmbus_suspend - Suspend a vmbus device
> + */
> +static int vmbus_suspend(struct device *child_device)
> +{
> +	struct hv_driver *drv;
> +	struct hv_device *dev = device_to_hv_device(child_device);
> +
> +	/* The device may not be attached yet */
> +	if (!child_device->driver)
> +		return 0;
> +
> +	drv = drv_to_hv_drv(child_device->driver);
> +	if (!drv->suspend)
> +		return -EOPNOTSUPP;
> +
> +	return drv->suspend(dev);
> +}
> +
> +/*
> + * vmbus_resume - Resume a vmbus device
> + */
> +static int vmbus_resume(struct device *child_device)
> +{
> +	struct hv_driver *drv;
> +	struct hv_device *dev = device_to_hv_device(child_device);
> +
> +	/* The device may not be attached yet */
> +	if (!child_device->driver)
> +		return 0;
> +
> +	drv = drv_to_hv_drv(child_device->driver);
> +	if (!drv->resume)
> +		return -EOPNOTSUPP;
> +
> +	return drv->resume(dev);
> +}
> 
>  /*
>   * vmbus_device_release - Final callback release of the vmbus child device
> @@ -926,6 +963,10 @@ static void vmbus_device_release(struct device *device)
>  	kfree(hv_dev);
>  }
> 
> +static const struct dev_pm_ops vmbus_pm = {
> +	SET_SYSTEM_SLEEP_PM_OPS(vmbus_suspend, vmbus_resume)
> +};
> +
>  /* The one and only one */
>  static struct bus_type  hv_bus = {
>  	.name =		"vmbus",
> @@ -936,6 +977,7 @@ static void vmbus_device_release(struct device *device)
>  	.uevent =		vmbus_uevent,
>  	.dev_groups =		vmbus_dev_groups,
>  	.drv_groups =		vmbus_drv_groups,
> +	.pm =			&vmbus_pm,
>  };
> 
>  struct onmessage_work_context {
> diff --git a/include/linux/hyperv.h b/include/linux/hyperv.h
> index 6256cc3..94443c4 100644
> --- a/include/linux/hyperv.h
> +++ b/include/linux/hyperv.h
> @@ -1149,6 +1149,9 @@ struct hv_driver {
>  	int (*remove)(struct hv_device *);
>  	void (*shutdown)(struct hv_device *);
> 
> +	int (*suspend)(struct hv_device *);
> +	int (*resume)(struct hv_device *);
> +
>  };
> 
>  /* Base device object */
> --
> 1.8.3.1

Reviewed-by: Michael Kelley <mikelley@microsoft.com>


^ permalink raw reply

* RE: [PATCH 5/7] Drivers: hv: vmbus: Ignore the offers when resuming from hibernation
From: Dexuan Cui @ 2019-07-31  0:01 UTC (permalink / raw)
  To: Michael Kelley, linux-hyperv@vger.kernel.org,
	gregkh@linuxfoundation.org, Stephen Hemminger, Sasha Levin,
	sashal@kernel.org, Haiyang Zhang, KY Srinivasan,
	tglx@linutronix.de
  Cc: linux-kernel@vger.kernel.org
In-Reply-To: <MWHPR21MB078481A7C7F65E0297135D41D7DC0@MWHPR21MB0784.namprd21.prod.outlook.com>

> From: Michael Kelley <mikelley@microsoft.com>
> Sent: Tuesday, July 30, 2019 4:07 PM
> > +
> > +	if (oldchannel != NULL) {
> > +		atomic_dec(&vmbus_connection.offer_in_progress);
> > +
> > +		/*
> > +		 * We're resuming from hibernation: we expect the host to send
> > +		 * exactly the same offers that we had before the hibernation.
> > +		 */
> > +		offer_sz = sizeof(*offer);
> > +		if (memcmp(offer, &oldchannel->offermsg, offer_sz) == 0)
> > +			return;
> 
> The offermsg contains "reserved" and "padding" fields.  Does Hyper-V
> guarantee that all these fields are the same in the new offer after resuming
> from hibernation? 

Yes. I confirmed this with Hyper-V team. The reserved/padding fields don't change
across hiberantion. BTW, the fields are filled with zeros since they're not used.

>  Or should a less stringent check be made?  For example,
> I could imagine a newer version of Hyper-V allowing a VM that was
> hibernated on an older version to be resumed.  But one of the reserved fields
> might be used in the newer version, and the comparison could fail
> unnecessarily.

Upon resume, Linux VM always uses the same version, which was used when the
VM firstly booted up before suspend, to re-negotiate with the host.
 
> > +
> > +		pr_err("Mismatched offer from the host (relid=%d)!\n",
> > +		       offer->child_relid);
> > +
> > +		print_hex_dump_debug("Old vmbus offer: ", DUMP_PREFIX_OFFSET,
> 4,
> > +				     4, &oldchannel->offermsg, offer_sz, false);
> > +		print_hex_dump_debug("New vmbus offer: ",
> DUMP_PREFIX_OFFSET, 4,
> > +				     4, offer, offer_sz, false);
> 
> The third argument to print_hex_dump() is the rowsize and is specified as must
> be 16 or 32.

Thanks! I misunderstood the argument. I'll change it to 16.

Thanks,
-- Dexuan

^ permalink raw reply

* RE: [PATCH v3 4/9] x86/mm/tlb: Flush remote and local TLBs concurrently
From: Michael Kelley @ 2019-07-31  0:13 UTC (permalink / raw)
  To: Nadav Amit, Andy Lutomirski, Dave Hansen
  Cc: x86@kernel.org, linux-kernel@vger.kernel.org, Peter Zijlstra,
	Thomas Gleixner, Ingo Molnar, KY Srinivasan, Haiyang Zhang,
	Stephen Hemminger, Sasha Levin, Borislav Petkov, Juergen Gross,
	Paolo Bonzini, Boris Ostrovsky, linux-hyperv@vger.kernel.org,
	virtualization@lists.linux-foundation.org, kvm@vger.kernel.org,
	xen-devel@lists.xenproject.org
In-Reply-To: <20190719005837.4150-5-namit@vmware.com>

From: Nadav Amit <namit@vmware.com> Sent: Thursday, July 18, 2019 5:59 PM
> 
> To improve TLB shootdown performance, flush the remote and local TLBs
> concurrently. Introduce flush_tlb_multi() that does so. Introduce
> paravirtual versions of flush_tlb_multi() for KVM, Xen and hyper-v (Xen
> and hyper-v are only compile-tested).
> 
> While the updated smp infrastructure is capable of running a function on
> a single local core, it is not optimized for this case. The multiple
> function calls and the indirect branch introduce some overhead, and
> might make local TLB flushes slower than they were before the recent
> changes.
> 
> Before calling the SMP infrastructure, check if only a local TLB flush
> is needed to restore the lost performance in this common case. This
> requires to check mm_cpumask() one more time, but unless this mask is
> updated very frequently, this should impact performance negatively.
> 
> Cc: "K. Y. Srinivasan" <kys@microsoft.com>
> Cc: Haiyang Zhang <haiyangz@microsoft.com>
> Cc: Stephen Hemminger <sthemmin@microsoft.com>
> Cc: Sasha Levin <sashal@kernel.org>
> Cc: Thomas Gleixner <tglx@linutronix.de>
> Cc: Ingo Molnar <mingo@redhat.com>
> Cc: Borislav Petkov <bp@alien8.de>
> Cc: x86@kernel.org
> Cc: Juergen Gross <jgross@suse.com>
> Cc: Paolo Bonzini <pbonzini@redhat.com>
> Cc: Dave Hansen <dave.hansen@linux.intel.com>
> Cc: Andy Lutomirski <luto@kernel.org>
> Cc: Peter Zijlstra <peterz@infradead.org>
> Cc: Boris Ostrovsky <boris.ostrovsky@oracle.com>
> Cc: linux-hyperv@vger.kernel.org
> Cc: linux-kernel@vger.kernel.org
> Cc: virtualization@lists.linux-foundation.org
> Cc: kvm@vger.kernel.org
> Cc: xen-devel@lists.xenproject.org
> Signed-off-by: Nadav Amit <namit@vmware.com>
> ---
>  arch/x86/hyperv/mmu.c                 | 10 +++---
>  arch/x86/include/asm/paravirt.h       |  6 ++--
>  arch/x86/include/asm/paravirt_types.h |  4 +--
>  arch/x86/include/asm/tlbflush.h       |  8 ++---
>  arch/x86/include/asm/trace/hyperv.h   |  2 +-
>  arch/x86/kernel/kvm.c                 | 11 +++++--
>  arch/x86/kernel/paravirt.c            |  2 +-
>  arch/x86/mm/tlb.c                     | 47 ++++++++++++++++++---------
>  arch/x86/xen/mmu_pv.c                 | 11 +++----
>  include/trace/events/xen.h            |  2 +-
>  10 files changed, 62 insertions(+), 41 deletions(-)
> 

For the Hyper-V parts --
Reviewed-by: Michael Kelley <mikelley@microsoft.com>


^ permalink raw reply

* RE: [PATCH 6/7] Drivers: hv: vmbus: Suspend/resume the vmbus itself for hibernation
From: Dexuan Cui @ 2019-07-31  0:16 UTC (permalink / raw)
  To: Michael Kelley, linux-hyperv@vger.kernel.org,
	gregkh@linuxfoundation.org, Stephen Hemminger, Sasha Levin,
	sashal@kernel.org, Haiyang Zhang, KY Srinivasan,
	tglx@linutronix.de
  Cc: linux-kernel@vger.kernel.org
In-Reply-To: <MWHPR21MB078461F2DB6FE0A6D0647B70D7DC0@MWHPR21MB0784.namprd21.prod.outlook.com>

> From: Michael Kelley <mikelley@microsoft.com>
> Sent: Tuesday, July 30, 2019 4:26 PM
> From: Dexuan Cui <decui@microsoft.com>  Sent: Monday, July 8, 2019 10:30
> PM
> >
> > This is needed when we resume the old kernel from the "current" kernel.
> 
> Perhaps a bit more descriptive commit message could be supplied?

I'll use this as the new changelog:

Before Linux enters hibernation, it sends the CHANNELMSG_UNLOAD message to
the host so all the offers are gone. After hibernation, Linux needs to re-negotiate
with the host using the same vmbus protocol version (which was in use
before hibernation), and ask the host to re-offer the vmbus devices. 

> > +
> > +	ret = vmbus_negotiate_version(msginfo, vmbus_proto_version);
> 
> I think this code answers my earlier question:  Upon resume, we negotiate
> the same
> VMbus protocol version we had at time of hibernation, even if running on a
> newer
> version of Hyper-V that might support newer protocol versions.  Hence the
> offer
> messages should not have any previously reserved fields now being used.   A
> comment to this effect would be useful.

Ok, let me add a comment before the line.

I'll post a v2 of the patchset, including the "[PATCH 4/7]", which needs a small change.

Thanks,
-- Dexuan

^ permalink raw reply

* [PATCH v2 net] hv_sock: Fix hang when a connection is closed
From: Dexuan Cui @ 2019-07-31  1:25 UTC (permalink / raw)
  To: Sunil Muthuswamy, David Miller, netdev@vger.kernel.org
  Cc: KY Srinivasan, Haiyang Zhang, Stephen Hemminger,
	sashal@kernel.org, Michael Kelley, linux-hyperv@vger.kernel.org,
	linux-kernel@vger.kernel.org, olaf@aepfle.de, apw@canonical.com,
	jasowang@redhat.com, vkuznets, marcelo.cerri@canonical.com


There is a race condition for an established connection that is being closed
by the guest: the refcnt is 4 at the end of hvs_release() (Note: here the
'remove_sock' is false):

1 for the initial value;
1 for the sk being in the bound list;
1 for the sk being in the connected list;
1 for the delayed close_work.

After hvs_release() finishes, __vsock_release() -> sock_put(sk) *may*
decrease the refcnt to 3.

Concurrently, hvs_close_connection() runs in another thread:
  calls vsock_remove_sock() to decrease the refcnt by 2;
  call sock_put() to decrease the refcnt to 0, and free the sk;
  next, the "release_sock(sk)" may hang due to use-after-free.

In the above, after hvs_release() finishes, if hvs_close_connection() runs
faster than "__vsock_release() -> sock_put(sk)", then there is not any issue,
because at the beginning of hvs_close_connection(), the refcnt is still 4.

The issue can be resolved if an extra reference is taken when the
connection is established.

Fixes: a9eeb998c28d ("hv_sock: Add support for delayed close")
Signed-off-by: Dexuan Cui <decui@microsoft.com>
Cc: stable@vger.kernel.org

---

Changes in v2: 
  Changed the location of the sock_hold() call. 
  Updated the changelog accordingly.
  
  Thanks Sunil for the suggestion!


With the proper kernel debugging options enabled, first a warning can
appear:

kworker/1:0/4467 is freeing memory ..., with a lock still held there!
stack backtrace:
Workqueue: events vmbus_onmessage_work [hv_vmbus]
Call Trace:
 dump_stack+0x67/0x90
 debug_check_no_locks_freed.cold.52+0x78/0x7d
 slab_free_freelist_hook+0x85/0x140
 kmem_cache_free+0xa5/0x380
 __sk_destruct+0x150/0x260
 hvs_close_connection+0x24/0x30 [hv_sock]
 vmbus_onmessage_work+0x1d/0x30 [hv_vmbus]
 process_one_work+0x241/0x600
 worker_thread+0x3c/0x390
 kthread+0x11b/0x140
 ret_from_fork+0x24/0x30

and then the following release_sock(sk) can hang:

watchdog: BUG: soft lockup - CPU#1 stuck for 22s! [kworker/1:0:4467]
...
irq event stamp: 62890
CPU: 1 PID: 4467 Comm: kworker/1:0 Tainted: G        W         5.2.0+ #39
Workqueue: events vmbus_onmessage_work [hv_vmbus]
RIP: 0010:queued_spin_lock_slowpath+0x2b/0x1e0
...
Call Trace:
 do_raw_spin_lock+0xab/0xb0
 release_sock+0x19/0xb0
 vmbus_onmessage_work+0x1d/0x30 [hv_vmbus]
 process_one_work+0x241/0x600
 worker_thread+0x3c/0x390
 kthread+0x11b/0x140
 ret_from_fork+0x24/0x30

 net/vmw_vsock/hyperv_transport.c | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/net/vmw_vsock/hyperv_transport.c b/net/vmw_vsock/hyperv_transport.c
index f2084e3f7aa4..9d864ebeb7b3 100644
--- a/net/vmw_vsock/hyperv_transport.c
+++ b/net/vmw_vsock/hyperv_transport.c
@@ -312,6 +312,11 @@ static void hvs_close_connection(struct vmbus_channel *chan)
 	lock_sock(sk);
 	hvs_do_close_lock_held(vsock_sk(sk), true);
 	release_sock(sk);
+
+	/* Release the refcnt for the channel that's opened in
+	 * hvs_open_connection().
+	 */
+	sock_put(sk);
 }
 
 static void hvs_open_connection(struct vmbus_channel *chan)
@@ -407,6 +412,9 @@ static void hvs_open_connection(struct vmbus_channel *chan)
 	}
 
 	set_per_channel_state(chan, conn_from_host ? new : sk);
+
+	/* This reference will be dropped by hvs_close_connection(). */
+	sock_hold(conn_from_host ? new : sk);
 	vmbus_set_chn_rescind_callback(chan, hvs_close_connection);
 
 	/* Set the pending send size to max packet size to always get
-- 
2.19.1


^ permalink raw reply related


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox