Linux-HyperV List

Linux-HyperV List
 help / color / mirror / Atom feed

* Re: [PATCH 4/9] x86/mm/tlb: Flush remote and local TLBs concurrently
From: Nadav Amit @ 2019-06-26  2:35 UTC (permalink / raw)
  To: Dave Hansen
  Cc: Peter Zijlstra, Andy Lutomirski, LKML, Ingo Molnar,
	Borislav Petkov, the arch/x86 maintainers, Thomas Gleixner,
	Dave Hansen, K. Y. Srinivasan, Haiyang Zhang, Stephen Hemminger,
	Sasha Levin, Juergen Gross, Paolo Bonzini, Boris Ostrovsky,
	linux-hyperv@vger.kernel.org,
	virtualization@lists.linux-foundation.org, kvm@vger.kernel.org,
	xen-devel@lists.xenproject.org
In-Reply-To: <723d63ee-c8cb-14a1-0eb9-265e580360f4@intel.com>

> On Jun 25, 2019, at 2:29 PM, Dave Hansen <dave.hansen@intel.com> wrote:
> 
> On 6/12/19 11:48 PM, Nadav Amit wrote:
>> To improve TLB shootdown performance, flush the remote and local TLBs
>> concurrently. Introduce flush_tlb_multi() that does so. The current
>> flush_tlb_others() interface is kept, since paravirtual interfaces need
>> to be adapted first before it can be removed. This is left for future
>> work. In such PV environments, TLB flushes are not performed, at this
>> time, concurrently.
>> 
>> Add a static key to tell whether this new interface is supported.
>> 
>> Cc: "K. Y. Srinivasan" <kys@microsoft.com>
>> Cc: Haiyang Zhang <haiyangz@microsoft.com>
>> Cc: Stephen Hemminger <sthemmin@microsoft.com>
>> Cc: Sasha Levin <sashal@kernel.org>
>> Cc: Thomas Gleixner <tglx@linutronix.de>
>> Cc: Ingo Molnar <mingo@redhat.com>
>> Cc: Borislav Petkov <bp@alien8.de>
>> Cc: x86@kernel.org
>> Cc: Juergen Gross <jgross@suse.com>
>> Cc: Paolo Bonzini <pbonzini@redhat.com>
>> Cc: Dave Hansen <dave.hansen@linux.intel.com>
>> Cc: Andy Lutomirski <luto@kernel.org>
>> Cc: Peter Zijlstra <peterz@infradead.org>
>> Cc: Boris Ostrovsky <boris.ostrovsky@oracle.com>
>> Cc: linux-hyperv@vger.kernel.org
>> Cc: linux-kernel@vger.kernel.org
>> Cc: virtualization@lists.linux-foundation.org
>> Cc: kvm@vger.kernel.org
>> Cc: xen-devel@lists.xenproject.org
>> Signed-off-by: Nadav Amit <namit@vmware.com>
>> ---
>> arch/x86/hyperv/mmu.c                 |  2 +
>> arch/x86/include/asm/paravirt.h       |  8 +++
>> arch/x86/include/asm/paravirt_types.h |  6 +++
>> arch/x86/include/asm/tlbflush.h       |  6 +++
>> arch/x86/kernel/kvm.c                 |  1 +
>> arch/x86/kernel/paravirt.c            |  3 ++
>> arch/x86/mm/tlb.c                     | 71 ++++++++++++++++++++++-----
>> arch/x86/xen/mmu_pv.c                 |  2 +
>> 8 files changed, 87 insertions(+), 12 deletions(-)
>> 
>> diff --git a/arch/x86/hyperv/mmu.c b/arch/x86/hyperv/mmu.c
>> index e65d7fe6489f..ca28b400c87c 100644
>> --- a/arch/x86/hyperv/mmu.c
>> +++ b/arch/x86/hyperv/mmu.c
>> @@ -233,4 +233,6 @@ void hyperv_setup_mmu_ops(void)
>> 	pr_info("Using hypercall for remote TLB flush\n");
>> 	pv_ops.mmu.flush_tlb_others = hyperv_flush_tlb_others;
>> 	pv_ops.mmu.tlb_remove_table = tlb_remove_table;
>> +
>> +	static_key_disable(&flush_tlb_multi_enabled.key);
>> }
>> diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h
>> index c25c38a05c1c..192be7254457 100644
>> --- a/arch/x86/include/asm/paravirt.h
>> +++ b/arch/x86/include/asm/paravirt.h
>> @@ -47,6 +47,8 @@ static inline void slow_down_io(void)
>> #endif
>> }
>> 
>> +DECLARE_STATIC_KEY_TRUE(flush_tlb_multi_enabled);
>> +
>> static inline void __flush_tlb(void)
>> {
>> 	PVOP_VCALL0(mmu.flush_tlb_user);
>> @@ -62,6 +64,12 @@ static inline void __flush_tlb_one_user(unsigned long addr)
>> 	PVOP_VCALL1(mmu.flush_tlb_one_user, addr);
>> }
>> 
>> +static inline void flush_tlb_multi(const struct cpumask *cpumask,
>> +				   const struct flush_tlb_info *info)
>> +{
>> +	PVOP_VCALL2(mmu.flush_tlb_multi, cpumask, info);
>> +}
>> +
>> static inline void flush_tlb_others(const struct cpumask *cpumask,
>> 				    const struct flush_tlb_info *info)
>> {
>> diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h
>> index 946f8f1f1efc..b93b3d90729a 100644
>> --- a/arch/x86/include/asm/paravirt_types.h
>> +++ b/arch/x86/include/asm/paravirt_types.h
>> @@ -211,6 +211,12 @@ struct pv_mmu_ops {
>> 	void (*flush_tlb_user)(void);
>> 	void (*flush_tlb_kernel)(void);
>> 	void (*flush_tlb_one_user)(unsigned long addr);
>> +	/*
>> +	 * flush_tlb_multi() is the preferred interface, which is capable to
>> +	 * flush both local and remote CPUs.
>> +	 */
>> +	void (*flush_tlb_multi)(const struct cpumask *cpus,
>> +				const struct flush_tlb_info *info);
>> 	void (*flush_tlb_others)(const struct cpumask *cpus,
>> 				 const struct flush_tlb_info *info);
>> 
>> diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h
>> index dee375831962..79272938cf79 100644
>> --- a/arch/x86/include/asm/tlbflush.h
>> +++ b/arch/x86/include/asm/tlbflush.h
>> @@ -569,6 +569,9 @@ static inline void flush_tlb_page(struct vm_area_struct *vma, unsigned long a)
>> 	flush_tlb_mm_range(vma->vm_mm, a, a + PAGE_SIZE, PAGE_SHIFT, false);
>> }
>> 
>> +void native_flush_tlb_multi(const struct cpumask *cpumask,
>> +			     const struct flush_tlb_info *info);
>> +
>> void native_flush_tlb_others(const struct cpumask *cpumask,
>> 			     const struct flush_tlb_info *info);
>> 
>> @@ -593,6 +596,9 @@ static inline void arch_tlbbatch_add_mm(struct arch_tlbflush_unmap_batch *batch,
>> extern void arch_tlbbatch_flush(struct arch_tlbflush_unmap_batch *batch);
>> 
>> #ifndef CONFIG_PARAVIRT
>> +#define flush_tlb_multi(mask, info)	\
>> +	native_flush_tlb_multi(mask, info)
>> +
>> #define flush_tlb_others(mask, info)	\
>> 	native_flush_tlb_others(mask, info)
>> 
>> diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
>> index 5169b8cc35bb..00d81e898717 100644
>> --- a/arch/x86/kernel/kvm.c
>> +++ b/arch/x86/kernel/kvm.c
>> @@ -630,6 +630,7 @@ static void __init kvm_guest_init(void)
>> 	    kvm_para_has_feature(KVM_FEATURE_STEAL_TIME)) {
>> 		pv_ops.mmu.flush_tlb_others = kvm_flush_tlb_others;
>> 		pv_ops.mmu.tlb_remove_table = tlb_remove_table;
>> +		static_key_disable(&flush_tlb_multi_enabled.key);
>> 	}
>> 
>> 	if (kvm_para_has_feature(KVM_FEATURE_PV_EOI))
>> diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c
>> index 98039d7fb998..ac00afed5570 100644
>> --- a/arch/x86/kernel/paravirt.c
>> +++ b/arch/x86/kernel/paravirt.c
>> @@ -159,6 +159,8 @@ unsigned paravirt_patch_insns(void *insn_buff, unsigned len,
>> 	return insn_len;
>> }
>> 
>> +DEFINE_STATIC_KEY_TRUE(flush_tlb_multi_enabled);
>> +
>> static void native_flush_tlb(void)
>> {
>> 	__native_flush_tlb();
>> @@ -363,6 +365,7 @@ struct paravirt_patch_template pv_ops = {
>> 	.mmu.flush_tlb_user	= native_flush_tlb,
>> 	.mmu.flush_tlb_kernel	= native_flush_tlb_global,
>> 	.mmu.flush_tlb_one_user	= native_flush_tlb_one_user,
>> +	.mmu.flush_tlb_multi	= native_flush_tlb_multi,
>> 	.mmu.flush_tlb_others	= native_flush_tlb_others,
>> 	.mmu.tlb_remove_table	=
>> 			(void (*)(struct mmu_gather *, void *))tlb_remove_page,
>> diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
>> index c34bcf03f06f..db73d5f1dd43 100644
>> --- a/arch/x86/mm/tlb.c
>> +++ b/arch/x86/mm/tlb.c
>> @@ -551,7 +551,7 @@ static void flush_tlb_func_common(const struct flush_tlb_info *f,
>> 		 * garbage into our TLB.  Since switching to init_mm is barely
>> 		 * slower than a minimal flush, just switch to init_mm.
>> 		 *
>> -		 * This should be rare, with native_flush_tlb_others skipping
>> +		 * This should be rare, with native_flush_tlb_multi skipping
>> 		 * IPIs to lazy TLB mode CPUs.
>> 		 */
> 
> Nit, since we're messing with this, it can now be
> "native_flush_tlb_multi()" since it is a function.

Sure.

> 
>> switch_mm_irqs_off(NULL, &init_mm, NULL);
>> @@ -635,9 +635,12 @@ static void flush_tlb_func_common(const struct flush_tlb_info *f,
>> 	this_cpu_write(cpu_tlbstate.ctxs[loaded_mm_asid].tlb_gen, mm_tlb_gen);
>> }
>> 
>> -static void flush_tlb_func_local(const void *info, enum tlb_flush_reason reason)
>> +static void flush_tlb_func_local(void *info)
>> {
>> 	const struct flush_tlb_info *f = info;
>> +	enum tlb_flush_reason reason;
>> +
>> +	reason = (f->mm == NULL) ? TLB_LOCAL_SHOOTDOWN : TLB_LOCAL_MM_SHOOTDOWN;
> 
> Should we just add the "reason" to flush_tlb_info?  It's OK-ish to imply
> it like this, but seems like it would be nicer and easier to track down
> the origins of these things if we did this at the caller.

I prefer not to. I want later to inline flush_tlb_info into the same
cacheline that holds call_function_data. Increasing the size of
flush_tlb_info for no good reason will not help…

>> flush_tlb_func_common(f, true, reason);
>> }
>> @@ -655,14 +658,21 @@ static void flush_tlb_func_remote(void *info)
>> 	flush_tlb_func_common(f, false, TLB_REMOTE_SHOOTDOWN);
>> }
>> 
>> -static bool tlb_is_not_lazy(int cpu, void *data)
>> +static inline bool tlb_is_not_lazy(int cpu)
>> {
>> 	return !per_cpu(cpu_tlbstate.is_lazy, cpu);
>> }
> 
> Nit: the compiler will probably inline this sucker anyway.  So, for
> these kinds of patches, I'd resist the urge to do these kinds of tweaks,
> especially since it starts to hide the important change on the line.

Of course.

> 
>> -void native_flush_tlb_others(const struct cpumask *cpumask,
>> -			     const struct flush_tlb_info *info)
>> +static DEFINE_PER_CPU(cpumask_t, flush_tlb_mask);
>> +
>> +void native_flush_tlb_multi(const struct cpumask *cpumask,
>> +			    const struct flush_tlb_info *info)
>> {
>> +	/*
>> +	 * Do accounting and tracing. Note that there are (and have always been)
>> +	 * cases in which a remote TLB flush will be traced, but eventually
>> +	 * would not happen.
>> +	 */
>> 	count_vm_tlb_event(NR_TLB_REMOTE_FLUSH);
>> 	if (info->end == TLB_FLUSH_ALL)
>> 		trace_tlb_flush(TLB_REMOTE_SEND_IPI, TLB_FLUSH_ALL);
>> @@ -682,10 +692,14 @@ void native_flush_tlb_others(const struct cpumask *cpumask,
>> 		 * means that the percpu tlb_gen variables won't be updated
>> 		 * and we'll do pointless flushes on future context switches.
>> 		 *
>> -		 * Rather than hooking native_flush_tlb_others() here, I think
>> +		 * Rather than hooking native_flush_tlb_multi() here, I think
>> 		 * that UV should be updated so that smp_call_function_many(),
>> 		 * etc, are optimal on UV.
>> 		 */
>> +		local_irq_disable();
>> +		flush_tlb_func_local((__force void *)info);
>> +		local_irq_enable();
>> +
>> 		cpumask = uv_flush_tlb_others(cpumask, info);
>> 		if (cpumask)
>> 			smp_call_function_many(cpumask, flush_tlb_func_remote,
>> @@ -704,11 +718,39 @@ void native_flush_tlb_others(const struct cpumask *cpumask,
>> 	 * doing a speculative memory access.
>> 	 */
>> 	if (info->freed_tables)
>> -		smp_call_function_many(cpumask, flush_tlb_func_remote,
>> -			       (void *)info, 1);
>> -	else
>> -		on_each_cpu_cond_mask(tlb_is_not_lazy, flush_tlb_func_remote,
>> -				(void *)info, 1, GFP_ATOMIC, cpumask);
>> +		__smp_call_function_many(cpumask, flush_tlb_func_remote,
>> +					 flush_tlb_func_local, (void *)info, 1);
>> +	else {
> 
> I prefer brackets be added for 'if' blocks like this since it doesn't
> take up any meaningful space and makes it less prone to compile errors.

If you say so.

> 
>> +		/*
>> +		 * Although we could have used on_each_cpu_cond_mask(),
>> +		 * open-coding it has several performance advantages: (1) we can
>> +		 * use specialized functions for remote and local flushes; (2)
>> +		 * no need for indirect branch to test if TLB is lazy; (3) we
>> +		 * can use a designated cpumask for evaluating the condition
>> +		 * instead of allocating a new one.
>> +		 *
>> +		 * This works under the assumption that there are no nested TLB
>> +		 * flushes, an assumption that is already made in
>> +		 * flush_tlb_mm_range().
>> +		 */
>> +		struct cpumask *cond_cpumask = this_cpu_ptr(&flush_tlb_mask);
> 
> This is logically a stack-local variable, right?  But, since we've got
> preempt off and cpumasks can be huge, we don't want to allocate it on
> the stack.  That might be worth a comment somewhere.

I will add a comment here.

> 
>> +		int cpu;
>> +
>> +		cpumask_clear(cond_cpumask);
>> +
>> +		for_each_cpu(cpu, cpumask) {
>> +			if (tlb_is_not_lazy(cpu))
>> +				__cpumask_set_cpu(cpu, cond_cpumask);
>> +		}
> 
> FWIW, it's probably worth calling out in the changelog that this loop
> exists in on_each_cpu_cond_mask() too.  It looks bad here, but it's no
> worse than what it replaces.

Added.

> 
>> +		__smp_call_function_many(cond_cpumask, flush_tlb_func_remote,
>> +					 flush_tlb_func_local, (void *)info, 1);
>> +	}
>> +}
> 
> There was a __force on an earlier 'info' cast.  Could you talk about
> that for a minute an explain why that one is needed?

I have no idea where the __force came from. I’ll remove it.

> 
>> +void native_flush_tlb_others(const struct cpumask *cpumask,
>> +			     const struct flush_tlb_info *info)
>> +{
>> +	native_flush_tlb_multi(cpumask, info);
>> }
>> 
>> /*
>> @@ -774,10 +816,15 @@ static void flush_tlb_on_cpus(const cpumask_t *cpumask,
>> {
>> 	int this_cpu = smp_processor_id();
>> 
>> +	if (static_branch_likely(&flush_tlb_multi_enabled)) {
>> +		flush_tlb_multi(cpumask, info);
>> +		return;
>> +	}
> 
> Probably needs a comment for posterity above the if()^^:
> 
> 	/* Use the optimized flush_tlb_multi() where we can. */

Right.

> 
>> --- a/arch/x86/xen/mmu_pv.c
>> +++ b/arch/x86/xen/mmu_pv.c
>> @@ -2474,6 +2474,8 @@ void __init xen_init_mmu_ops(void)
>> 
>> 	pv_ops.mmu = xen_mmu_ops;
>> 
>> +	static_key_disable(&flush_tlb_multi_enabled.key);
>> +
>> 	memset(dummy_mapping, 0xff, PAGE_SIZE);
>> }
> 
> More comments, please.  Perhaps:
> 
> 	Existing paravirt TLB flushes are incompatible with
> 	flush_tlb_multi() because....  Disable it when they are
> 	in use.

There is no inherent reason for them to be incompatible. Someone needs to
adapt them. I will use my affiliation as an excuse for the question “why
don’t you do it?” ;-)

Anyhow, I will add a comment.


^ permalink raw reply

* Re: [PATCH 4/9] x86/mm/tlb: Flush remote and local TLBs concurrently
From: Dave Hansen @ 2019-06-25 21:29 UTC (permalink / raw)
  To: Nadav Amit, Peter Zijlstra, Andy Lutomirski
  Cc: linux-kernel, Ingo Molnar, Borislav Petkov, x86, Thomas Gleixner,
	Dave Hansen, K. Y. Srinivasan, Haiyang Zhang, Stephen Hemminger,
	Sasha Levin, Juergen Gross, Paolo Bonzini, Boris Ostrovsky,
	linux-hyperv, virtualization, kvm, xen-devel
In-Reply-To: <20190613064813.8102-5-namit@vmware.com>

On 6/12/19 11:48 PM, Nadav Amit wrote:
> To improve TLB shootdown performance, flush the remote and local TLBs
> concurrently. Introduce flush_tlb_multi() that does so. The current
> flush_tlb_others() interface is kept, since paravirtual interfaces need
> to be adapted first before it can be removed. This is left for future
> work. In such PV environments, TLB flushes are not performed, at this
> time, concurrently.
> 
> Add a static key to tell whether this new interface is supported.
> 
> Cc: "K. Y. Srinivasan" <kys@microsoft.com>
> Cc: Haiyang Zhang <haiyangz@microsoft.com>
> Cc: Stephen Hemminger <sthemmin@microsoft.com>
> Cc: Sasha Levin <sashal@kernel.org>
> Cc: Thomas Gleixner <tglx@linutronix.de>
> Cc: Ingo Molnar <mingo@redhat.com>
> Cc: Borislav Petkov <bp@alien8.de>
> Cc: x86@kernel.org
> Cc: Juergen Gross <jgross@suse.com>
> Cc: Paolo Bonzini <pbonzini@redhat.com>
> Cc: Dave Hansen <dave.hansen@linux.intel.com>
> Cc: Andy Lutomirski <luto@kernel.org>
> Cc: Peter Zijlstra <peterz@infradead.org>
> Cc: Boris Ostrovsky <boris.ostrovsky@oracle.com>
> Cc: linux-hyperv@vger.kernel.org
> Cc: linux-kernel@vger.kernel.org
> Cc: virtualization@lists.linux-foundation.org
> Cc: kvm@vger.kernel.org
> Cc: xen-devel@lists.xenproject.org
> Signed-off-by: Nadav Amit <namit@vmware.com>
> ---
>  arch/x86/hyperv/mmu.c                 |  2 +
>  arch/x86/include/asm/paravirt.h       |  8 +++
>  arch/x86/include/asm/paravirt_types.h |  6 +++
>  arch/x86/include/asm/tlbflush.h       |  6 +++
>  arch/x86/kernel/kvm.c                 |  1 +
>  arch/x86/kernel/paravirt.c            |  3 ++
>  arch/x86/mm/tlb.c                     | 71 ++++++++++++++++++++++-----
>  arch/x86/xen/mmu_pv.c                 |  2 +
>  8 files changed, 87 insertions(+), 12 deletions(-)
> 
> diff --git a/arch/x86/hyperv/mmu.c b/arch/x86/hyperv/mmu.c
> index e65d7fe6489f..ca28b400c87c 100644
> --- a/arch/x86/hyperv/mmu.c
> +++ b/arch/x86/hyperv/mmu.c
> @@ -233,4 +233,6 @@ void hyperv_setup_mmu_ops(void)
>  	pr_info("Using hypercall for remote TLB flush\n");
>  	pv_ops.mmu.flush_tlb_others = hyperv_flush_tlb_others;
>  	pv_ops.mmu.tlb_remove_table = tlb_remove_table;
> +
> +	static_key_disable(&flush_tlb_multi_enabled.key);
>  }
> diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h
> index c25c38a05c1c..192be7254457 100644
> --- a/arch/x86/include/asm/paravirt.h
> +++ b/arch/x86/include/asm/paravirt.h
> @@ -47,6 +47,8 @@ static inline void slow_down_io(void)
>  #endif
>  }
>  
> +DECLARE_STATIC_KEY_TRUE(flush_tlb_multi_enabled);
> +
>  static inline void __flush_tlb(void)
>  {
>  	PVOP_VCALL0(mmu.flush_tlb_user);
> @@ -62,6 +64,12 @@ static inline void __flush_tlb_one_user(unsigned long addr)
>  	PVOP_VCALL1(mmu.flush_tlb_one_user, addr);
>  }
>  
> +static inline void flush_tlb_multi(const struct cpumask *cpumask,
> +				   const struct flush_tlb_info *info)
> +{
> +	PVOP_VCALL2(mmu.flush_tlb_multi, cpumask, info);
> +}
> +
>  static inline void flush_tlb_others(const struct cpumask *cpumask,
>  				    const struct flush_tlb_info *info)
>  {
> diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h
> index 946f8f1f1efc..b93b3d90729a 100644
> --- a/arch/x86/include/asm/paravirt_types.h
> +++ b/arch/x86/include/asm/paravirt_types.h
> @@ -211,6 +211,12 @@ struct pv_mmu_ops {
>  	void (*flush_tlb_user)(void);
>  	void (*flush_tlb_kernel)(void);
>  	void (*flush_tlb_one_user)(unsigned long addr);
> +	/*
> +	 * flush_tlb_multi() is the preferred interface, which is capable to
> +	 * flush both local and remote CPUs.
> +	 */
> +	void (*flush_tlb_multi)(const struct cpumask *cpus,
> +				const struct flush_tlb_info *info);
>  	void (*flush_tlb_others)(const struct cpumask *cpus,
>  				 const struct flush_tlb_info *info);
>  
> diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h
> index dee375831962..79272938cf79 100644
> --- a/arch/x86/include/asm/tlbflush.h
> +++ b/arch/x86/include/asm/tlbflush.h
> @@ -569,6 +569,9 @@ static inline void flush_tlb_page(struct vm_area_struct *vma, unsigned long a)
>  	flush_tlb_mm_range(vma->vm_mm, a, a + PAGE_SIZE, PAGE_SHIFT, false);
>  }
>  
> +void native_flush_tlb_multi(const struct cpumask *cpumask,
> +			     const struct flush_tlb_info *info);
> +
>  void native_flush_tlb_others(const struct cpumask *cpumask,
>  			     const struct flush_tlb_info *info);
>  
> @@ -593,6 +596,9 @@ static inline void arch_tlbbatch_add_mm(struct arch_tlbflush_unmap_batch *batch,
>  extern void arch_tlbbatch_flush(struct arch_tlbflush_unmap_batch *batch);
>  
>  #ifndef CONFIG_PARAVIRT
> +#define flush_tlb_multi(mask, info)	\
> +	native_flush_tlb_multi(mask, info)
> +
>  #define flush_tlb_others(mask, info)	\
>  	native_flush_tlb_others(mask, info)
>  
> diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
> index 5169b8cc35bb..00d81e898717 100644
> --- a/arch/x86/kernel/kvm.c
> +++ b/arch/x86/kernel/kvm.c
> @@ -630,6 +630,7 @@ static void __init kvm_guest_init(void)
>  	    kvm_para_has_feature(KVM_FEATURE_STEAL_TIME)) {
>  		pv_ops.mmu.flush_tlb_others = kvm_flush_tlb_others;
>  		pv_ops.mmu.tlb_remove_table = tlb_remove_table;
> +		static_key_disable(&flush_tlb_multi_enabled.key);
>  	}
>  
>  	if (kvm_para_has_feature(KVM_FEATURE_PV_EOI))
> diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c
> index 98039d7fb998..ac00afed5570 100644
> --- a/arch/x86/kernel/paravirt.c
> +++ b/arch/x86/kernel/paravirt.c
> @@ -159,6 +159,8 @@ unsigned paravirt_patch_insns(void *insn_buff, unsigned len,
>  	return insn_len;
>  }
>  
> +DEFINE_STATIC_KEY_TRUE(flush_tlb_multi_enabled);
> +
>  static void native_flush_tlb(void)
>  {
>  	__native_flush_tlb();
> @@ -363,6 +365,7 @@ struct paravirt_patch_template pv_ops = {
>  	.mmu.flush_tlb_user	= native_flush_tlb,
>  	.mmu.flush_tlb_kernel	= native_flush_tlb_global,
>  	.mmu.flush_tlb_one_user	= native_flush_tlb_one_user,
> +	.mmu.flush_tlb_multi	= native_flush_tlb_multi,
>  	.mmu.flush_tlb_others	= native_flush_tlb_others,
>  	.mmu.tlb_remove_table	=
>  			(void (*)(struct mmu_gather *, void *))tlb_remove_page,
> diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
> index c34bcf03f06f..db73d5f1dd43 100644
> --- a/arch/x86/mm/tlb.c
> +++ b/arch/x86/mm/tlb.c
> @@ -551,7 +551,7 @@ static void flush_tlb_func_common(const struct flush_tlb_info *f,
>  		 * garbage into our TLB.  Since switching to init_mm is barely
>  		 * slower than a minimal flush, just switch to init_mm.
>  		 *
> -		 * This should be rare, with native_flush_tlb_others skipping
> +		 * This should be rare, with native_flush_tlb_multi skipping
>  		 * IPIs to lazy TLB mode CPUs.
>  		 */

Nit, since we're messing with this, it can now be
"native_flush_tlb_multi()" since it is a function.

>  		switch_mm_irqs_off(NULL, &init_mm, NULL);
> @@ -635,9 +635,12 @@ static void flush_tlb_func_common(const struct flush_tlb_info *f,
>  	this_cpu_write(cpu_tlbstate.ctxs[loaded_mm_asid].tlb_gen, mm_tlb_gen);
>  }
>  
> -static void flush_tlb_func_local(const void *info, enum tlb_flush_reason reason)
> +static void flush_tlb_func_local(void *info)
>  {
>  	const struct flush_tlb_info *f = info;
> +	enum tlb_flush_reason reason;
> +
> +	reason = (f->mm == NULL) ? TLB_LOCAL_SHOOTDOWN : TLB_LOCAL_MM_SHOOTDOWN;

Should we just add the "reason" to flush_tlb_info?  It's OK-ish to imply
it like this, but seems like it would be nicer and easier to track down
the origins of these things if we did this at the caller.

>  	flush_tlb_func_common(f, true, reason);
>  }
> @@ -655,14 +658,21 @@ static void flush_tlb_func_remote(void *info)
>  	flush_tlb_func_common(f, false, TLB_REMOTE_SHOOTDOWN);
>  }
>  
> -static bool tlb_is_not_lazy(int cpu, void *data)
> +static inline bool tlb_is_not_lazy(int cpu)
>  {
>  	return !per_cpu(cpu_tlbstate.is_lazy, cpu);
>  }

Nit: the compiler will probably inline this sucker anyway.  So, for
these kinds of patches, I'd resist the urge to do these kinds of tweaks,
especially since it starts to hide the important change on the line.

> -void native_flush_tlb_others(const struct cpumask *cpumask,
> -			     const struct flush_tlb_info *info)
> +static DEFINE_PER_CPU(cpumask_t, flush_tlb_mask);
> +
> +void native_flush_tlb_multi(const struct cpumask *cpumask,
> +			    const struct flush_tlb_info *info)
>  {
> +	/*
> +	 * Do accounting and tracing. Note that there are (and have always been)
> +	 * cases in which a remote TLB flush will be traced, but eventually
> +	 * would not happen.
> +	 */
>  	count_vm_tlb_event(NR_TLB_REMOTE_FLUSH);
>  	if (info->end == TLB_FLUSH_ALL)
>  		trace_tlb_flush(TLB_REMOTE_SEND_IPI, TLB_FLUSH_ALL);
> @@ -682,10 +692,14 @@ void native_flush_tlb_others(const struct cpumask *cpumask,
>  		 * means that the percpu tlb_gen variables won't be updated
>  		 * and we'll do pointless flushes on future context switches.
>  		 *
> -		 * Rather than hooking native_flush_tlb_others() here, I think
> +		 * Rather than hooking native_flush_tlb_multi() here, I think
>  		 * that UV should be updated so that smp_call_function_many(),
>  		 * etc, are optimal on UV.
>  		 */
> +		local_irq_disable();
> +		flush_tlb_func_local((__force void *)info);
> +		local_irq_enable();
> +
>  		cpumask = uv_flush_tlb_others(cpumask, info);
>  		if (cpumask)
>  			smp_call_function_many(cpumask, flush_tlb_func_remote,
> @@ -704,11 +718,39 @@ void native_flush_tlb_others(const struct cpumask *cpumask,
>  	 * doing a speculative memory access.
>  	 */
>  	if (info->freed_tables)
> -		smp_call_function_many(cpumask, flush_tlb_func_remote,
> -			       (void *)info, 1);
> -	else
> -		on_each_cpu_cond_mask(tlb_is_not_lazy, flush_tlb_func_remote,
> -				(void *)info, 1, GFP_ATOMIC, cpumask);
> +		__smp_call_function_many(cpumask, flush_tlb_func_remote,
> +					 flush_tlb_func_local, (void *)info, 1);
> +	else {

I prefer brackets be added for 'if' blocks like this since it doesn't
take up any meaningful space and makes it less prone to compile errors.

> +		/*
> +		 * Although we could have used on_each_cpu_cond_mask(),
> +		 * open-coding it has several performance advantages: (1) we can
> +		 * use specialized functions for remote and local flushes; (2)
> +		 * no need for indirect branch to test if TLB is lazy; (3) we
> +		 * can use a designated cpumask for evaluating the condition
> +		 * instead of allocating a new one.
> +		 *
> +		 * This works under the assumption that there are no nested TLB
> +		 * flushes, an assumption that is already made in
> +		 * flush_tlb_mm_range().
> +		 */
> +		struct cpumask *cond_cpumask = this_cpu_ptr(&flush_tlb_mask);

This is logically a stack-local variable, right?  But, since we've got
preempt off and cpumasks can be huge, we don't want to allocate it on
the stack.  That might be worth a comment somewhere.

> +		int cpu;
> +
> +		cpumask_clear(cond_cpumask);
> +
> +		for_each_cpu(cpu, cpumask) {
> +			if (tlb_is_not_lazy(cpu))
> +				__cpumask_set_cpu(cpu, cond_cpumask);
> +		}

FWIW, it's probably worth calling out in the changelog that this loop
exists in on_each_cpu_cond_mask() too.  It looks bad here, but it's no
worse than what it replaces.

> +		__smp_call_function_many(cond_cpumask, flush_tlb_func_remote,
> +					 flush_tlb_func_local, (void *)info, 1);
> +	}
> +}

There was a __force on an earlier 'info' cast.  Could you talk about
that for a minute an explain why that one is needed?

> +void native_flush_tlb_others(const struct cpumask *cpumask,
> +			     const struct flush_tlb_info *info)
> +{
> +	native_flush_tlb_multi(cpumask, info);
>  }
>  
>  /*
> @@ -774,10 +816,15 @@ static void flush_tlb_on_cpus(const cpumask_t *cpumask,
>  {
>  	int this_cpu = smp_processor_id();
>  
> +	if (static_branch_likely(&flush_tlb_multi_enabled)) {
> +		flush_tlb_multi(cpumask, info);
> +		return;
> +	}

Probably needs a comment for posterity above the if()^^:

	/* Use the optimized flush_tlb_multi() where we can. */

> --- a/arch/x86/xen/mmu_pv.c
> +++ b/arch/x86/xen/mmu_pv.c
> @@ -2474,6 +2474,8 @@ void __init xen_init_mmu_ops(void)
>  
>  	pv_ops.mmu = xen_mmu_ops;
>  
> +	static_key_disable(&flush_tlb_multi_enabled.key);
> +
>  	memset(dummy_mapping, 0xff, PAGE_SIZE);
>  }

More comments, please.  Perhaps:

	Existing paravirt TLB flushes are incompatible with
	flush_tlb_multi() because....  Disable it when they are
	in use.

^ permalink raw reply

* [PATCH v2 6/7] locking/spinlocks, paravirt, hyperv: Correct the hv_nopvspin case
From: Zhenzhong Duan @ 2019-06-24 12:02 UTC (permalink / raw)
  To: linux-kernel
  Cc: tglx, mingo, bp, hpa, boris.ostrovsky, jgross, sstabellini,
	peterz, srinivas.eeda, Zhenzhong Duan, Waiman Long,
	K. Y. Srinivasan, Haiyang Zhang, Stephen Hemminger, Sasha Levin,
	Ingo Molnar, linux-hyperv
In-Reply-To: <1561377779-28036-1-git-send-email-zhenzhong.duan@oracle.com>

With the boot parameter "hv_nopvspin" specified a Hyperv guest should
not make use of paravirt spinlocks, but behave as if running on bare
metal. This is not true, however, as the qspinlock code will fall back
to a test-and-set scheme when it is detecting a hypervisor.

In order to avoid this disable the virt_spin_lock_key.

Same change for XEN is already in Commit e6fd28eb3522
("locking/spinlocks, paravirt, xen: Correct the xen_nopvspin case")

Signed-off-by: Zhenzhong Duan <zhenzhong.duan@oracle.com>
Cc: Waiman Long <longman@redhat.com>
Cc: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: "K. Y. Srinivasan" <kys@microsoft.com>
Cc: Haiyang Zhang <haiyangz@microsoft.com>
Cc: Stephen Hemminger <sthemmin@microsoft.com>
Cc: Sasha Levin <sashal@kernel.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: linux-hyperv@vger.kernel.org
---
 arch/x86/hyperv/hv_spinlock.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/arch/x86/hyperv/hv_spinlock.c b/arch/x86/hyperv/hv_spinlock.c
index 07f21a0..d90b4b0 100644
--- a/arch/x86/hyperv/hv_spinlock.c
+++ b/arch/x86/hyperv/hv_spinlock.c
@@ -64,6 +64,9 @@ __visible bool hv_vcpu_is_preempted(int vcpu)
 
 void __init hv_init_spinlocks(void)
 {
+	if (unlikely(!hv_pvspin))
+		static_branch_disable(&virt_spin_lock_key);
+
 	if (!hv_pvspin || !apic ||
 	    !(ms_hyperv.hints & HV_X64_CLUSTER_IPI_RECOMMENDED) ||
 	    !(ms_hyperv.features & HV_X64_MSR_GUEST_IDLE_AVAILABLE)) {
-- 
1.8.3.1


^ permalink raw reply related

* Re: [PATCH 6/8] IB/srp: set virt_boundary_mask in the scsi host
From: Max Gurtovoy @ 2019-06-24 22:33 UTC (permalink / raw)
  To: Christoph Hellwig, Martin K . Petersen
  Cc: Sagi Grimberg, Bart Van Assche, linux-rdma, linux-scsi,
	megaraidlinux.pdl, MPT-FusionLinux.pdl, linux-hyperv,
	linux-kernel
In-Reply-To: <20190617122000.22181-7-hch@lst.de>


On 6/17/2019 3:19 PM, Christoph Hellwig wrote:
> This ensures all proper DMA layer handling is taken care of by the
> SCSI midlayer.
>
> Signed-off-by: Christoph Hellwig <hch@lst.de>

Looks good,

Reviewed-by: Max Gurtovoy <maxg@mellanox.com>


^ permalink raw reply

* Re: [PATCH 5/8] IB/iser: set virt_boundary_mask in the scsi host
From: Max Gurtovoy @ 2019-06-24 22:32 UTC (permalink / raw)
  To: Christoph Hellwig, Martin K . Petersen
  Cc: Sagi Grimberg, Bart Van Assche, linux-rdma, linux-scsi,
	megaraidlinux.pdl, MPT-FusionLinux.pdl, linux-hyperv,
	linux-kernel
In-Reply-To: <20190617122000.22181-6-hch@lst.de>

On 6/17/2019 3:19 PM, Christoph Hellwig wrote:
> This ensures all proper DMA layer handling is taken care of by the
> SCSI midlayer.
>
> Signed-off-by: Christoph Hellwig <hch@lst.de>


Looks good,

Reviewed-by: Max Gurtovoy <maxg@mellanox.com>


^ permalink raw reply

* [PATCH 5/6] locking/spinlocks, paravirt, hyperv: Correct the hv_nopvspin case
From: Zhenzhong Duan @ 2019-06-23 13:01 UTC (permalink / raw)
  To: linux-kernel
  Cc: tglx, mingo, bp, hpa, boris.ostrovsky, jgross, sstabellini,
	Zhenzhong Duan, K. Y. Srinivasan, Haiyang Zhang,
	Stephen Hemminger, Sasha Levin, linux-hyperv
In-Reply-To: <1561294903-6166-1-git-send-email-zhenzhong.duan@oracle.com>

With the boot parameter "hv_nopvspin" specified a Hyperv guest should
not make use of paravirt spinlocks, but behave as if running on bare
metal. This is not true, however, as the qspinlock code will fall back
to a test-and-set scheme when it is detecting a hypervisor.

In order to avoid this disable the virt_spin_lock_key.

Same change for XEN is already in Commit e6fd28eb3522
("locking/spinlocks, paravirt, xen: Correct the xen_nopvspin case")

Signed-off-by: Zhenzhong Duan <zhenzhong.duan@oracle.com>
Cc: "K. Y. Srinivasan" <kys@microsoft.com>
Cc: Haiyang Zhang <haiyangz@microsoft.com>
Cc: Stephen Hemminger <sthemmin@microsoft.com>
Cc: Sasha Levin <sashal@kernel.org>
Cc: linux-hyperv@vger.kernel.org
---
 arch/x86/hyperv/hv_spinlock.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/arch/x86/hyperv/hv_spinlock.c b/arch/x86/hyperv/hv_spinlock.c
index 07f21a0..210495b 100644
--- a/arch/x86/hyperv/hv_spinlock.c
+++ b/arch/x86/hyperv/hv_spinlock.c
@@ -64,6 +64,9 @@ __visible bool hv_vcpu_is_preempted(int vcpu)
 
 void __init hv_init_spinlocks(void)
 {
+	if (!hv_pvspin)
+		static_branch_disable(&virt_spin_lock_key);
+
 	if (!hv_pvspin || !apic ||
 	    !(ms_hyperv.hints & HV_X64_CLUSTER_IPI_RECOMMENDED) ||
 	    !(ms_hyperv.features & HV_X64_MSR_GUEST_IDLE_AVAILABLE)) {
-- 
1.8.3.1


^ permalink raw reply related

* Re: [PATCH v6 18/19] x86: Add support for generic vDSO
From: Thomas Gleixner @ 2019-06-23 22:12 UTC (permalink / raw)
  To: Sasha Levin
  Cc: Michael Kelley, Vincenzo Frascino, linux-arch@vger.kernel.org,
	linux-arm-kernel@lists.infradead.org,
	linux-kernel@vger.kernel.org, linux-mips@vger.kernel.org,
	linux-kselftest@vger.kernel.org, Catalin Marinas, Will Deacon,
	Arnd Bergmann, Russell King, Ralf Baechle, Paul Burton,
	Daniel Lezcano, Mark Salyzyn, Peter Collingbourne, Shuah Khan,
	Dmitry Safonov, Rasmus Villemoes, Huw Davies, linux-hyperv,
	Greg KH, Stephen Rothwell
In-Reply-To: <20190623190929.GL2226@sasha-vm>

Sasha,

On Sun, 23 Jun 2019, Sasha Levin wrote:
> On Sat, Jun 22, 2019 at 04:46:28PM +0200, Thomas Gleixner wrote:
> > Folks, please stop chosing Cc lists as you like. We have well established
> > rules for that. And please stop queueing random unreviewed patches in
> > next. Next is not a playground for not ready and unreviewed stuff. No, the
> > hyper-v inbreed Reviewed-by is not sufficient for anything x86 and
> > clocksource related.
> 
> I'm sorry for this, you were supposed to be Cc'ed on these patches and I
> see that you were not.

All good. I've vented steam and am back to normal pressure :)

> > After chasing and looking at those patches, which have horrible subject
> > lines and changelogs btw, I was not able to judge quickly whether that
> > stuff is self contained or not. So no, I fixed up the fallout and rebased
> > Vincenzos VDSO stuff on mainline w/o those hyperv changes simply because if
> > they are not self contained they will break bisection badly.
> > 
> > I'm going to push out the VDSO series later today. That will nicely break

Not yet, but soon :)

> > in combination with the hyper-next branch. Stephen, please drop that and do
> > not try to handle the fallout. That stuff needs to go through the proper
> > channels or at least be acked/reviewed by the relevant maintainers. So the
> > hyper-v folks can rebase themself and post it proper.
> 
> Okay, thank you. We'll rebase and resend.

I have no objections if you collect hyper-v stuff, quite the contrary, but
changes which touch other subsystems need to be coordinated upfront. That's
all I'm asking for.

Btw, that clocksource stuff looks good code wise, just the change logs need
some care and after the VDSO stuff hits next we need to sort out the
logistics. I hope these changes are completely self contained. If not we'll
find a solution.

Thanks,

	tglx

^ permalink raw reply

* RE: [PATCH v6 18/19] x86: Add support for generic vDSO
From: Thomas Gleixner @ 2019-06-24  0:25 UTC (permalink / raw)
  To: Michael Kelley
  Cc: Sasha Levin, Vincenzo Frascino, linux-arch@vger.kernel.org,
	linux-arm-kernel@lists.infradead.org,
	linux-kernel@vger.kernel.org, linux-mips@vger.kernel.org,
	linux-kselftest@vger.kernel.org, Catalin Marinas, Will Deacon,
	Arnd Bergmann, Russell King, Ralf Baechle, Paul Burton,
	Daniel Lezcano, Mark Salyzyn, Peter Collingbourne, Shuah Khan,
	Dmitry Safonov, Rasmus Villemoes, Huw Davies,
	linux-hyperv@vger.kernel.org, Greg KH, Stephen Rothwell
In-Reply-To: <BYAPR21MB135202F46C4B023B51EBBFD0D7E00@BYAPR21MB1352.namprd21.prod.outlook.com>

On Mon, 24 Jun 2019, Michael Kelley wrote:
> From: Thomas Gleixner <tglx@linutronix.de> Sent: Sunday, June 23, 2019 3:13 PM
> > 
> > I have no objections if you collect hyper-v stuff, quite the contrary, but
> > changes which touch other subsystems need to be coordinated upfront. That's
> > all I'm asking for.
> > 
> > Btw, that clocksource stuff looks good code wise, just the change logs need
> > some care and after the VDSO stuff hits next we need to sort out the
> > logistics. I hope these changes are completely self contained. If not we'll
> > find a solution.
> >
> 
> In my view, the only thing that potentially needs a solution is where the
> Hyper-V clock code used by VDSO ends up in the code tree.  I think the
> right long term place is include/clocksource/hyperv_timer.h.   That location
> is architecture neutral, and the same Hyper-V clock code will be shared by
> the Hyper-V on ARM64 support that's in process.
> 
> Vincenzo's patch set creates a new file arch/x86/include/asm/mshyperv-tsc.h,
> which I will want to move when creating the separate Hyper-V clocksource
> driver.   If you're OK with that file existing for a release and then going away,
> that's fine.  Alternatively, put the code in include/clocksource/hyperv_timer.h
> now as part of the VDSO patch set so it's in the right place from the start.  My
> subsequent patch set will add a few additional tweaks to remove x86-isms
> and fully integrate with the separate Hyper-V clocksource driver.

I don't care whether this goes into 5.3 or later. If you can provide me
rebased self contained patches on top of

  git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git timers/vdso

I'm happy to pull them in on top.

Thanks,

	tglx

^ permalink raw reply

* Re: [PATCH v6 18/19] x86: Add support for generic vDSO
From: Stephen Rothwell @ 2019-06-23 21:58 UTC (permalink / raw)
  To: Sasha Levin
  Cc: Thomas Gleixner, Michael Kelley, Vincenzo Frascino,
	linux-arch@vger.kernel.org, linux-arm-kernel@lists.infradead.org,
	linux-kernel@vger.kernel.org, linux-mips@vger.kernel.org,
	linux-kselftest@vger.kernel.org, Catalin Marinas, Will Deacon,
	Arnd Bergmann, Russell King, Ralf Baechle, Paul Burton,
	Daniel Lezcano, Mark Salyzyn, Peter Collingbourne, Shuah Khan,
	Dmitry Safonov, Rasmus Villemoes, Huw Davies, linux-hyperv,
	Greg KH
In-Reply-To: <20190623190929.GL2226@sasha-vm>

[-- Attachment #1: Type: text/plain, Size: 359 bytes --]

Hi Sasha,

On Sun, 23 Jun 2019 15:09:29 -0400 Sasha Levin <sashal@kernel.org> wrote:
>
> Appologies about this. I ended up with way more travel than I would have
> liked (writing this from an airport). I've reset our hyperv-next branch
> to remove these 3 commits until we figure this out.

But not pushed out, yet?

-- 
Cheers,
Stephen Rothwell

[-- Attachment #2: OpenPGP digital signature --]
[-- Type: application/pgp-signature, Size: 488 bytes --]

^ permalink raw reply

* Re: [PATCH v6 18/19] x86: Add support for generic vDSO
From: Stephen Rothwell @ 2019-06-24  1:20 UTC (permalink / raw)
  To: Sasha Levin
  Cc: Thomas Gleixner, Michael Kelley, Vincenzo Frascino,
	linux-arch@vger.kernel.org, linux-arm-kernel@lists.infradead.org,
	linux-kernel@vger.kernel.org, linux-mips@vger.kernel.org,
	linux-kselftest@vger.kernel.org, Catalin Marinas, Will Deacon,
	Arnd Bergmann, Russell King, Ralf Baechle, Paul Burton,
	Daniel Lezcano, Mark Salyzyn, Peter Collingbourne, Shuah Khan,
	Dmitry Safonov, Rasmus Villemoes, Huw Davies, linux-hyperv,
	Greg KH
In-Reply-To: <20190624002430.GN2226@sasha-vm>

[-- Attachment #1: Type: text/plain, Size: 204 bytes --]

Hi Sasha,

On Sun, 23 Jun 2019 20:24:30 -0400 Sasha Levin <sashal@kernel.org> wrote:
>
> Pushed now. For some reason the airport wifi was blocking ssh :/

Thanks.

-- 
Cheers,
Stephen Rothwell

[-- Attachment #2: OpenPGP digital signature --]
[-- Type: application/pgp-signature, Size: 488 bytes --]

^ permalink raw reply

* RE: [PATCH v6 18/19] x86: Add support for generic vDSO
From: Michael Kelley @ 2019-06-24  0:04 UTC (permalink / raw)
  To: Thomas Gleixner, Sasha Levin
  Cc: Vincenzo Frascino, linux-arch@vger.kernel.org,
	linux-arm-kernel@lists.infradead.org,
	linux-kernel@vger.kernel.org, linux-mips@vger.kernel.org,
	linux-kselftest@vger.kernel.org, Catalin Marinas, Will Deacon,
	Arnd Bergmann, Russell King, Ralf Baechle, Paul Burton,
	Daniel Lezcano, Mark Salyzyn, Peter Collingbourne, Shuah Khan,
	Dmitry Safonov, Rasmus Villemoes, Huw Davies,
	linux-hyperv@vger.kernel.org, Greg KH, Stephen Rothwell
In-Reply-To: <alpine.DEB.2.21.1906240006090.32342@nanos.tec.linutronix.de>

From: Thomas Gleixner <tglx@linutronix.de> Sent: Sunday, June 23, 2019 3:13 PM
> 
> I have no objections if you collect hyper-v stuff, quite the contrary, but
> changes which touch other subsystems need to be coordinated upfront. That's
> all I'm asking for.
> 
> Btw, that clocksource stuff looks good code wise, just the change logs need
> some care and after the VDSO stuff hits next we need to sort out the
> logistics. I hope these changes are completely self contained. If not we'll
> find a solution.
>

In my view, the only thing that potentially needs a solution is where the
Hyper-V clock code used by VDSO ends up in the code tree.  I think the
right long term place is include/clocksource/hyperv_timer.h.   That location
is architecture neutral, and the same Hyper-V clock code will be shared by
the Hyper-V on ARM64 support that's in process.

Vincenzo's patch set creates a new file arch/x86/include/asm/mshyperv-tsc.h,
which I will want to move when creating the separate Hyper-V clocksource
driver.   If you're OK with that file existing for a release and then going away,
that's fine.  Alternatively, put the code in include/clocksource/hyperv_timer.h
now as part of the VDSO patch set so it's in the right place from the start.  My
subsequent patch set will add a few additional tweaks to remove x86-isms
and fully integrate with the separate Hyper-V clocksource driver.

Michael 

^ permalink raw reply

* Re: [PATCH v6 18/19] x86: Add support for generic vDSO
From: Sasha Levin @ 2019-06-24  0:24 UTC (permalink / raw)
  To: Stephen Rothwell
  Cc: Thomas Gleixner, Michael Kelley, Vincenzo Frascino,
	linux-arch@vger.kernel.org, linux-arm-kernel@lists.infradead.org,
	linux-kernel@vger.kernel.org, linux-mips@vger.kernel.org,
	linux-kselftest@vger.kernel.org, Catalin Marinas, Will Deacon,
	Arnd Bergmann, Russell King, Ralf Baechle, Paul Burton,
	Daniel Lezcano, Mark Salyzyn, Peter Collingbourne, Shuah Khan,
	Dmitry Safonov, Rasmus Villemoes, Huw Davies, linux-hyperv,
	Greg KH
In-Reply-To: <20190624075834.2491a61a@canb.auug.org.au>

On Mon, Jun 24, 2019 at 07:58:34AM +1000, Stephen Rothwell wrote:
>Hi Sasha,
>
>On Sun, 23 Jun 2019 15:09:29 -0400 Sasha Levin <sashal@kernel.org> wrote:
>>
>> Appologies about this. I ended up with way more travel than I would have
>> liked (writing this from an airport). I've reset our hyperv-next branch
>> to remove these 3 commits until we figure this out.
>
>But not pushed out, yet?

Pushed now. For some reason the airport wifi was blocking ssh :/

--
Thanks,
Sasha

^ permalink raw reply

* Re: [PATCH v6 18/19] x86: Add support for generic vDSO
From: Sasha Levin @ 2019-06-23 19:09 UTC (permalink / raw)
  To: Thomas Gleixner
  Cc: Michael Kelley, Vincenzo Frascino, linux-arch@vger.kernel.org,
	linux-arm-kernel@lists.infradead.org,
	linux-kernel@vger.kernel.org, linux-mips@vger.kernel.org,
	linux-kselftest@vger.kernel.org, Catalin Marinas, Will Deacon,
	Arnd Bergmann, Russell King, Ralf Baechle, Paul Burton,
	Daniel Lezcano, Mark Salyzyn, Peter Collingbourne, Shuah Khan,
	Dmitry Safonov, Rasmus Villemoes, Huw Davies, linux-hyperv,
	Greg KH, Stephen Rothwell
In-Reply-To: <alpine.DEB.2.21.1906221542270.5503@nanos.tec.linutronix.de>

On Sat, Jun 22, 2019 at 04:46:28PM +0200, Thomas Gleixner wrote:
>On Fri, 14 Jun 2019, Sasha Levin wrote:
>> On Fri, Jun 14, 2019 at 01:15:23PM +0200, Thomas Gleixner wrote:
>> > On Thu, 30 May 2019, Michael Kelley wrote:
>> > > Vincenzo -- these changes for Hyper-V are a subset of a larger patch set
>> > > I have that moves all of the Hyper-V clock/timer code into a separate
>> > > clocksource driver in drivers/clocksource, with an include file in
>> > > includes/clocksource.  That new include file should be able to work
>> > > instead of your new mshyperv-tsc.h.  It also has the benefit of being
>> > > ISA neutral, so it will work with my in-progress patch set to support
>> > > Linux on Hyper-V on ARM64.  See https://lkml.org/lkml/2019/5/27/231
>> > > for the new clocksource driver patch set.
>> >
>> > Grrr. That's queued in hyperv-next for whatever reasons.
>>
>> I queue up our future pull requests there to give them some soaking in
>> -next.
>
>What? You queue completely unreviewed stuff which touches two other
>subsystems to let it soak in next?

It was out on LKML for 2+ weeks before I've pulled it in. As it mostly
touches hyperv bits I felt comfortable to give it time in -next (but not
actually to try and merge it until it gets a few acks).

>> > Sasha, can you please provide me the branch to pull from so I can have a
>> > common base for all the various changes floating around?
>>
>> I'll send you a unified pull request for these changes.
>
>Which has not materialized yet.

Appologies about this. I ended up with way more travel than I would have
liked (writing this from an airport). I've reset our hyperv-next branch
to remove these 3 commits until we figure this out.

>TBH, I'm pretty grumpy about those clocksource changes. Here is the
>diffstat:
>
> MAINTAINERS                          |    2
> arch/x86/entry/vdso/vclock_gettime.c |    1
> arch/x86/entry/vdso/vma.c            |    2
> arch/x86/hyperv/hv_init.c            |   91 ---------
> arch/x86/include/asm/hyperv-tlfs.h   |    6
> arch/x86/include/asm/mshyperv.h      |   81 +-------
> arch/x86/kernel/cpu/mshyperv.c       |    2
> arch/x86/kvm/x86.c                   |    1
> drivers/clocksource/Makefile         |    1
> drivers/clocksource/hyperv_timer.c   |  322 +++++++++++++++++++++++++++++++++++
> drivers/hv/Kconfig                   |    3
> drivers/hv/hv.c                      |  156 ----------------
> drivers/hv/hv_util.c                 |    1
> drivers/hv/hyperv_vmbus.h            |    3
> drivers/hv/vmbus_drv.c               |   42 ++--
> include/clocksource/hyperv_timer.h   |  105 +++++++++++
>
>While the world and some more people have been CC'ed on those patches,
>neither the clocksource nor the x86 maintainer have been.
>
>When I gave Vincenzo the advise to base his code on that hyper-v branch, I
>expected that I find the related patches in my mail backlog. No, they have
>not been there because I was not on CC.
>
>Folks, please stop chosing Cc lists as you like. We have well established
>rules for that. And please stop queueing random unreviewed patches in
>next. Next is not a playground for not ready and unreviewed stuff. No, the
>hyper-v inbreed Reviewed-by is not sufficient for anything x86 and
>clocksource related.

I'm sorry for this, you were supposed to be Cc'ed on these patches and I
see that you were not.

>After chasing and looking at those patches, which have horrible subject
>lines and changelogs btw, I was not able to judge quickly whether that
>stuff is self contained or not. So no, I fixed up the fallout and rebased
>Vincenzos VDSO stuff on mainline w/o those hyperv changes simply because if
>they are not self contained they will break bisection badly.
>
>I'm going to push out the VDSO series later today. That will nicely break
>in combination with the hyper-next branch. Stephen, please drop that and do
>not try to handle the fallout. That stuff needs to go through the proper
>channels or at least be acked/reviewed by the relevant maintainers. So the
>hyper-v folks can rebase themself and post it proper.

Okay, thank you. We'll rebase and resend.

--
Thanks,
Sasha

^ permalink raw reply

* Re: [PATCH v6 18/19] x86: Add support for generic vDSO
From: Thomas Gleixner @ 2019-06-22 14:46 UTC (permalink / raw)
  To: Sasha Levin
  Cc: Michael Kelley, Vincenzo Frascino, linux-arch@vger.kernel.org,
	linux-arm-kernel@lists.infradead.org,
	linux-kernel@vger.kernel.org, linux-mips@vger.kernel.org,
	linux-kselftest@vger.kernel.org, Catalin Marinas, Will Deacon,
	Arnd Bergmann, Russell King, Ralf Baechle, Paul Burton,
	Daniel Lezcano, Mark Salyzyn, Peter Collingbourne, Shuah Khan,
	Dmitry Safonov, Rasmus Villemoes, Huw Davies, linux-hyperv,
	Greg KH, Stephen Rothwell
In-Reply-To: <20190614211710.GQ1513@sasha-vm>

On Fri, 14 Jun 2019, Sasha Levin wrote:
> On Fri, Jun 14, 2019 at 01:15:23PM +0200, Thomas Gleixner wrote:
> > On Thu, 30 May 2019, Michael Kelley wrote:
> > > Vincenzo -- these changes for Hyper-V are a subset of a larger patch set
> > > I have that moves all of the Hyper-V clock/timer code into a separate
> > > clocksource driver in drivers/clocksource, with an include file in
> > > includes/clocksource.  That new include file should be able to work
> > > instead of your new mshyperv-tsc.h.  It also has the benefit of being
> > > ISA neutral, so it will work with my in-progress patch set to support
> > > Linux on Hyper-V on ARM64.  See https://lkml.org/lkml/2019/5/27/231
> > > for the new clocksource driver patch set.
> > 
> > Grrr. That's queued in hyperv-next for whatever reasons.
> 
> I queue up our future pull requests there to give them some soaking in
> -next.

What? You queue completely unreviewed stuff which touches two other
subsystems to let it soak in next?

> > Sasha, can you please provide me the branch to pull from so I can have a
> > common base for all the various changes floating around?
> 
> I'll send you a unified pull request for these changes.

Which has not materialized yet.

TBH, I'm pretty grumpy about those clocksource changes. Here is the
diffstat:

 MAINTAINERS                          |    2 
 arch/x86/entry/vdso/vclock_gettime.c |    1 
 arch/x86/entry/vdso/vma.c            |    2 
 arch/x86/hyperv/hv_init.c            |   91 ---------
 arch/x86/include/asm/hyperv-tlfs.h   |    6 
 arch/x86/include/asm/mshyperv.h      |   81 +-------
 arch/x86/kernel/cpu/mshyperv.c       |    2 
 arch/x86/kvm/x86.c                   |    1 
 drivers/clocksource/Makefile         |    1 
 drivers/clocksource/hyperv_timer.c   |  322 +++++++++++++++++++++++++++++++++++
 drivers/hv/Kconfig                   |    3 
 drivers/hv/hv.c                      |  156 ----------------
 drivers/hv/hv_util.c                 |    1 
 drivers/hv/hyperv_vmbus.h            |    3 
 drivers/hv/vmbus_drv.c               |   42 ++--
 include/clocksource/hyperv_timer.h   |  105 +++++++++++

While the world and some more people have been CC'ed on those patches,
neither the clocksource nor the x86 maintainer have been.

When I gave Vincenzo the advise to base his code on that hyper-v branch, I
expected that I find the related patches in my mail backlog. No, they have
not been there because I was not on CC.

Folks, please stop chosing Cc lists as you like. We have well established
rules for that. And please stop queueing random unreviewed patches in
next. Next is not a playground for not ready and unreviewed stuff. No, the
hyper-v inbreed Reviewed-by is not sufficient for anything x86 and
clocksource related.

After chasing and looking at those patches, which have horrible subject
lines and changelogs btw, I was not able to judge quickly whether that
stuff is self contained or not. So no, I fixed up the fallout and rebased
Vincenzos VDSO stuff on mainline w/o those hyperv changes simply because if
they are not self contained they will break bisection badly.

I'm going to push out the VDSO series later today. That will nicely break
in combination with the hyper-next branch. Stephen, please drop that and do
not try to handle the fallout. That stuff needs to go through the proper
channels or at least be acked/reviewed by the relevant maintainers. So the
hyper-v folks can rebase themself and post it proper.

Yours grumpy,

	tglx

^ permalink raw reply

* RE: [PATCH v2] PCI: hv: Fix a use-after-free bug in hv_eject_device_work()
From: Michael Kelley @ 2019-06-21 23:58 UTC (permalink / raw)
  To: Dexuan Cui, linux-pci@vger.kernel.org, Lorenzo Pieralisi,
	bhelgaas@google.com, Haiyang Zhang, KY Srinivasan,
	Stephen Hemminger, Sasha Levin, linux-hyperv@vger.kernel.org,
	olaf@aepfle.de, apw@canonical.com, jasowang@redhat.com, vkuznets,
	marcelo.cerri@canonical.com
  Cc: Lili Deng (Wicresoft North America Ltd),
	linux-kernel@vger.kernel.org,
	driverdev-devel@linuxdriverproject.org
In-Reply-To: <PU1P153MB0169D420EAB61757DF4B337FBFE70@PU1P153MB0169.APCP153.PROD.OUTLOOK.COM>

From: Dexuan Cui <decui@microsoft.com> Sent: Friday, June 21, 2019 4:45 PM
> 
> The commit 05f151a73ec2 itself is correct, but it exposes this
> use-after-free bug, which is caught by some memory debug options.
> 
> Add a Fixes tag to indicate the dependency.
> 
> Fixes: 05f151a73ec2 ("PCI: hv: Fix a memory leak in hv_eject_device_work()")
> Signed-off-by: Dexuan Cui <decui@microsoft.com>
> Cc: stable@vger.kernel.org
> ---
> 
> In v2:
> Replaced "hpdev->hbus" with "hbus", since we have the new "hbus" variable. [Michael
> Kelley]
> 
>  drivers/pci/controller/pci-hyperv.c | 15 +++++++++------
>  1 file changed, 9 insertions(+), 6 deletions(-)
> 
> diff --git a/drivers/pci/controller/pci-hyperv.c b/drivers/pci/controller/pci-hyperv.c
> index 808a182830e5..5dadc964ad3b 100644
> --- a/drivers/pci/controller/pci-hyperv.c
> +++ b/drivers/pci/controller/pci-hyperv.c
> @@ -1880,6 +1880,7 @@ static void hv_pci_devices_present(struct hv_pcibus_device
> *hbus,
>  static void hv_eject_device_work(struct work_struct *work)
>  {
>  	struct pci_eject_response *ejct_pkt;
> +	struct hv_pcibus_device *hbus;
>  	struct hv_pci_dev *hpdev;
>  	struct pci_dev *pdev;
>  	unsigned long flags;
> @@ -1890,6 +1891,7 @@ static void hv_eject_device_work(struct work_struct *work)
>  	} ctxt;
> 
>  	hpdev = container_of(work, struct hv_pci_dev, wrk);
> +	hbus = hpdev->hbus;
> 
>  	WARN_ON(hpdev->state != hv_pcichild_ejecting);
> 
> @@ -1900,8 +1902,7 @@ static void hv_eject_device_work(struct work_struct *work)
>  	 * because hbus->pci_bus may not exist yet.
>  	 */
>  	wslot = wslot_to_devfn(hpdev->desc.win_slot.slot);
> -	pdev = pci_get_domain_bus_and_slot(hpdev->hbus->sysdata.domain, 0,
> -					   wslot);
> +	pdev = pci_get_domain_bus_and_slot(hbus->sysdata.domain, 0, wslot);
>  	if (pdev) {
>  		pci_lock_rescan_remove();
>  		pci_stop_and_remove_bus_device(pdev);
> @@ -1909,9 +1910,9 @@ static void hv_eject_device_work(struct work_struct *work)
>  		pci_unlock_rescan_remove();
>  	}
> 
> -	spin_lock_irqsave(&hpdev->hbus->device_list_lock, flags);
> +	spin_lock_irqsave(&hbus->device_list_lock, flags);
>  	list_del(&hpdev->list_entry);
> -	spin_unlock_irqrestore(&hpdev->hbus->device_list_lock, flags);
> +	spin_unlock_irqrestore(&hbus->device_list_lock, flags);
> 
>  	if (hpdev->pci_slot)
>  		pci_destroy_slot(hpdev->pci_slot);
> @@ -1920,7 +1921,7 @@ static void hv_eject_device_work(struct work_struct *work)
>  	ejct_pkt = (struct pci_eject_response *)&ctxt.pkt.message;
>  	ejct_pkt->message_type.type = PCI_EJECTION_COMPLETE;
>  	ejct_pkt->wslot.slot = hpdev->desc.win_slot.slot;
> -	vmbus_sendpacket(hpdev->hbus->hdev->channel, ejct_pkt,
> +	vmbus_sendpacket(hbus->hdev->channel, ejct_pkt,
>  			 sizeof(*ejct_pkt), (unsigned long)&ctxt.pkt,
>  			 VM_PKT_DATA_INBAND, 0);
> 
> @@ -1929,7 +1930,9 @@ static void hv_eject_device_work(struct work_struct *work)
>  	/* For the two refs got in new_pcichild_device() */
>  	put_pcichild(hpdev);
>  	put_pcichild(hpdev);
> -	put_hvpcibus(hpdev->hbus);
> +	/* hpdev has been freed. Do not use it any more. */
> +
> +	put_hvpcibus(hbus);
>  }
> 
>  /**
> --
> 2.17.1

Reviewed-by: Michael Kelley <mikelley@microsoft.com>


^ permalink raw reply

* [PATCH v2] PCI: hv: Fix a use-after-free bug in hv_eject_device_work()
From: Dexuan Cui @ 2019-06-21 23:45 UTC (permalink / raw)
  To: linux-pci@vger.kernel.org, Lorenzo Pieralisi, bhelgaas@google.com,
	Haiyang Zhang, KY Srinivasan, Stephen Hemminger, Sasha Levin,
	linux-hyperv@vger.kernel.org, olaf@aepfle.de, apw@canonical.com,
	jasowang@redhat.com, vkuznets, marcelo.cerri@canonical.com,
	Michael Kelley
  Cc: Lili Deng (Wicresoft North America Ltd),
	linux-kernel@vger.kernel.org,
	driverdev-devel@linuxdriverproject.org


The commit 05f151a73ec2 itself is correct, but it exposes this
use-after-free bug, which is caught by some memory debug options.

Add a Fixes tag to indicate the dependency.

Fixes: 05f151a73ec2 ("PCI: hv: Fix a memory leak in hv_eject_device_work()")
Signed-off-by: Dexuan Cui <decui@microsoft.com>
Cc: stable@vger.kernel.org
---

In v2:
Replaced "hpdev->hbus" with "hbus", since we have the new "hbus" variable. [Michael Kelley]

 drivers/pci/controller/pci-hyperv.c | 15 +++++++++------
 1 file changed, 9 insertions(+), 6 deletions(-)

diff --git a/drivers/pci/controller/pci-hyperv.c b/drivers/pci/controller/pci-hyperv.c
index 808a182830e5..5dadc964ad3b 100644
--- a/drivers/pci/controller/pci-hyperv.c
+++ b/drivers/pci/controller/pci-hyperv.c
@@ -1880,6 +1880,7 @@ static void hv_pci_devices_present(struct hv_pcibus_device *hbus,
 static void hv_eject_device_work(struct work_struct *work)
 {
 	struct pci_eject_response *ejct_pkt;
+	struct hv_pcibus_device *hbus;
 	struct hv_pci_dev *hpdev;
 	struct pci_dev *pdev;
 	unsigned long flags;
@@ -1890,6 +1891,7 @@ static void hv_eject_device_work(struct work_struct *work)
 	} ctxt;
 
 	hpdev = container_of(work, struct hv_pci_dev, wrk);
+	hbus = hpdev->hbus;
 
 	WARN_ON(hpdev->state != hv_pcichild_ejecting);
 
@@ -1900,8 +1902,7 @@ static void hv_eject_device_work(struct work_struct *work)
 	 * because hbus->pci_bus may not exist yet.
 	 */
 	wslot = wslot_to_devfn(hpdev->desc.win_slot.slot);
-	pdev = pci_get_domain_bus_and_slot(hpdev->hbus->sysdata.domain, 0,
-					   wslot);
+	pdev = pci_get_domain_bus_and_slot(hbus->sysdata.domain, 0, wslot);
 	if (pdev) {
 		pci_lock_rescan_remove();
 		pci_stop_and_remove_bus_device(pdev);
@@ -1909,9 +1910,9 @@ static void hv_eject_device_work(struct work_struct *work)
 		pci_unlock_rescan_remove();
 	}
 
-	spin_lock_irqsave(&hpdev->hbus->device_list_lock, flags);
+	spin_lock_irqsave(&hbus->device_list_lock, flags);
 	list_del(&hpdev->list_entry);
-	spin_unlock_irqrestore(&hpdev->hbus->device_list_lock, flags);
+	spin_unlock_irqrestore(&hbus->device_list_lock, flags);
 
 	if (hpdev->pci_slot)
 		pci_destroy_slot(hpdev->pci_slot);
@@ -1920,7 +1921,7 @@ static void hv_eject_device_work(struct work_struct *work)
 	ejct_pkt = (struct pci_eject_response *)&ctxt.pkt.message;
 	ejct_pkt->message_type.type = PCI_EJECTION_COMPLETE;
 	ejct_pkt->wslot.slot = hpdev->desc.win_slot.slot;
-	vmbus_sendpacket(hpdev->hbus->hdev->channel, ejct_pkt,
+	vmbus_sendpacket(hbus->hdev->channel, ejct_pkt,
 			 sizeof(*ejct_pkt), (unsigned long)&ctxt.pkt,
 			 VM_PKT_DATA_INBAND, 0);
 
@@ -1929,7 +1930,9 @@ static void hv_eject_device_work(struct work_struct *work)
 	/* For the two refs got in new_pcichild_device() */
 	put_pcichild(hpdev);
 	put_pcichild(hpdev);
-	put_hvpcibus(hpdev->hbus);
+	/* hpdev has been freed. Do not use it any more. */
+
+	put_hvpcibus(hbus);
 }
 
 /**
-- 
2.17.1


^ permalink raw reply related

* RE: [PATCH] PCI: hv: Fix a use-after-free bug in hv_eject_device_work()
From: Dexuan Cui @ 2019-06-21 23:31 UTC (permalink / raw)
  To: Michael Kelley, linux-pci@vger.kernel.org, Lorenzo Pieralisi,
	bhelgaas@google.com, Haiyang Zhang, KY Srinivasan,
	Stephen Hemminger, Sasha Levin, linux-hyperv@vger.kernel.org,
	olaf@aepfle.de, apw@canonical.com, jasowang@redhat.com, vkuznets,
	marcelo.cerri@canonical.com
  Cc: Lili Deng (Wicresoft North America Ltd),
	linux-kernel@vger.kernel.org,
	driverdev-devel@linuxdriverproject.org
In-Reply-To: <BYAPR21MB13524CDAAD1F9A19935E5F9BD7E70@BYAPR21MB1352.namprd21.prod.outlook.com>


> From: Michael Kelley <mikelley@microsoft.com>
> > @@ -1880,6 +1880,7 @@ static void hv_pci_devices_present(struct
> hv_pcibus_device
> > *hbus,
> >  static void hv_eject_device_work(struct work_struct *work)
> >  {
> >  	struct pci_eject_response *ejct_pkt;
> > +	struct hv_pcibus_device *hbus;
> >  	struct hv_pci_dev *hpdev;
> >  	struct pci_dev *pdev;
> >  	unsigned long flags;
> > @@ -1890,6 +1891,7 @@ static void hv_eject_device_work(struct
> work_struct *work)
> >  	} ctxt;
> >
> >  	hpdev = container_of(work, struct hv_pci_dev, wrk);
> > +	hbus = hpdev->hbus;
> 
> In the lines of code following this new assignment, there are four uses of
> hpdev->hbus besides the one at the bottom of the function that causes the
> use-after-free error.  With 'hbus' now available as a local variable, it looks
> rather strange to have those other places still using hpdev->hbus.  I'm
> thinking
> they should be shortened to just 'hbus' for consistency, even though such
> changes aren't directly related to fixing the bug.
> 
> Michael
 
Ok, let me post a v2 for this.

Thanks,
Dexuan

^ permalink raw reply

* RE: [PATCH] PCI: hv: Fix a use-after-free bug in hv_eject_device_work()
From: Michael Kelley @ 2019-06-21 23:24 UTC (permalink / raw)
  To: Dexuan Cui, linux-pci@vger.kernel.org, Lorenzo Pieralisi,
	bhelgaas@google.com, Haiyang Zhang, KY Srinivasan,
	Stephen Hemminger, Sasha Levin, linux-hyperv@vger.kernel.org,
	olaf@aepfle.de, apw@canonical.com, jasowang@redhat.com, vkuznets,
	marcelo.cerri@canonical.com
  Cc: Lili Deng (Wicresoft North America Ltd),
	linux-kernel@vger.kernel.org,
	driverdev-devel@linuxdriverproject.org
In-Reply-To: <PU1P153MB01691036654142C7972F3ACDBFE70@PU1P153MB0169.APCP153.PROD.OUTLOOK.COM>

From: Dexuan Cui <decui@microsoft.com> Sent: Friday, June 21, 2019 12:02 PM
> 
> The commit 05f151a73ec2 itself is correct, but it exposes this
> use-after-free bug, which is caught by some memory debug options.
> 
> Add the Fixes tag to indicate the dependency.
> 
> Fixes: 05f151a73ec2 ("PCI: hv: Fix a memory leak in hv_eject_device_work()")
> Signed-off-by: Dexuan Cui <decui@microsoft.com>
> Cc: stable@vger.kernel.org
> ---
> Sorry for not spotting the bug when sending 05f151a73ec2.
> 
> Now I have enabled the mm debug options to help catch such mistakes in future.
> 
>  drivers/pci/controller/pci-hyperv.c | 6 +++++-
>  1 file changed, 5 insertions(+), 1 deletion(-)
> 
> diff --git a/drivers/pci/controller/pci-hyperv.c b/drivers/pci/controller/pci-hyperv.c
> index 808a182830e5..42ace1a690f9 100644
> --- a/drivers/pci/controller/pci-hyperv.c
> +++ b/drivers/pci/controller/pci-hyperv.c
> @@ -1880,6 +1880,7 @@ static void hv_pci_devices_present(struct hv_pcibus_device
> *hbus,
>  static void hv_eject_device_work(struct work_struct *work)
>  {
>  	struct pci_eject_response *ejct_pkt;
> +	struct hv_pcibus_device *hbus;
>  	struct hv_pci_dev *hpdev;
>  	struct pci_dev *pdev;
>  	unsigned long flags;
> @@ -1890,6 +1891,7 @@ static void hv_eject_device_work(struct work_struct *work)
>  	} ctxt;
> 
>  	hpdev = container_of(work, struct hv_pci_dev, wrk);
> +	hbus = hpdev->hbus;

In the lines of code following this new assignment, there are four uses of
hpdev->hbus besides the one at the bottom of the function that causes the
use-after-free error.  With 'hbus' now available as a local variable, it looks
rather strange to have those other places still using hpdev->hbus.  I'm thinking
they should be shortened to just 'hbus' for consistency, even though such
changes aren't directly related to fixing the bug.

Michael

> 
>  	WARN_ON(hpdev->state != hv_pcichild_ejecting);
> 
> @@ -1929,7 +1931,9 @@ static void hv_eject_device_work(struct work_struct *work)
>  	/* For the two refs got in new_pcichild_device() */
>  	put_pcichild(hpdev);
>  	put_pcichild(hpdev);
> -	put_hvpcibus(hpdev->hbus);
> +	/* hpdev has been freed. Do not use it any more. */
> +
> +	put_hvpcibus(hbus);
>  }
> 
>  /**
> --
> 2.17.1


^ permalink raw reply

* [PATCH] PCI: hv: Fix a use-after-free bug in hv_eject_device_work()
From: Dexuan Cui @ 2019-06-21 19:02 UTC (permalink / raw)
  To: linux-pci@vger.kernel.org, Lorenzo Pieralisi, bhelgaas@google.com,
	Haiyang Zhang, KY Srinivasan, Stephen Hemminger, Sasha Levin,
	linux-hyperv@vger.kernel.org, olaf@aepfle.de, apw@canonical.com,
	jasowang@redhat.com, vkuznets, marcelo.cerri@canonical.com,
	Michael Kelley
  Cc: Lili Deng (Wicresoft North America Ltd),
	linux-kernel@vger.kernel.org,
	driverdev-devel@linuxdriverproject.org


The commit 05f151a73ec2 itself is correct, but it exposes this
use-after-free bug, which is caught by some memory debug options.

Add the Fixes tag to indicate the dependency.

Fixes: 05f151a73ec2 ("PCI: hv: Fix a memory leak in hv_eject_device_work()")
Signed-off-by: Dexuan Cui <decui@microsoft.com>
Cc: stable@vger.kernel.org
---
Sorry for not spotting the bug when sending 05f151a73ec2. 

Now I have enabled the mm debug options to help catch such mistakes in future.

 drivers/pci/controller/pci-hyperv.c | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/drivers/pci/controller/pci-hyperv.c b/drivers/pci/controller/pci-hyperv.c
index 808a182830e5..42ace1a690f9 100644
--- a/drivers/pci/controller/pci-hyperv.c
+++ b/drivers/pci/controller/pci-hyperv.c
@@ -1880,6 +1880,7 @@ static void hv_pci_devices_present(struct hv_pcibus_device *hbus,
 static void hv_eject_device_work(struct work_struct *work)
 {
 	struct pci_eject_response *ejct_pkt;
+	struct hv_pcibus_device *hbus;
 	struct hv_pci_dev *hpdev;
 	struct pci_dev *pdev;
 	unsigned long flags;
@@ -1890,6 +1891,7 @@ static void hv_eject_device_work(struct work_struct *work)
 	} ctxt;
 
 	hpdev = container_of(work, struct hv_pci_dev, wrk);
+	hbus = hpdev->hbus;
 
 	WARN_ON(hpdev->state != hv_pcichild_ejecting);
 
@@ -1929,7 +1931,9 @@ static void hv_eject_device_work(struct work_struct *work)
 	/* For the two refs got in new_pcichild_device() */
 	put_pcichild(hpdev);
 	put_pcichild(hpdev);
-	put_hvpcibus(hpdev->hbus);
+	/* hpdev has been freed. Do not use it any more. */
+
+	put_hvpcibus(hbus);
 }
 
 /**
-- 
2.17.1


^ permalink raw reply related

* RE: [PATCH] ACPI: PM: Export the function acpi_sleep_state_supported()
From: Dexuan Cui @ 2019-06-21  7:15 UTC (permalink / raw)
  To: Pavel Machek
  Cc: Lorenzo Pieralisi, linux-acpi@vger.kernel.org, rjw@rjwysocki.net,
	lenb@kernel.org, robert.moore@intel.com, erik.schmauss@intel.com,
	Russell King, Russ Dill, Sebastian Capella, Michael Kelley,
	linux-hyperv@vger.kernel.org, linux-kernel@vger.kernel.org,
	KY Srinivasan, Stephen Hemminger, Haiyang Zhang, Sasha Levin,
	olaf@aepfle.de, apw@canonical.com, jasowang@redhat.com, vkuznets,
	marcelo.cerri@canonical.com
In-Reply-To: <20190620113057.GA16460@atrey.karlin.mff.cuni.cz>

> From: linux-hyperv-owner@vger.kernel.org
> > ...
> > When a Linux guest runs on Hyper-V (x86_32, x86_64, or ARM64) , we have a
> > front-end balloon driver in the guest, which balloons up/down and
> > hot adds/removes the guest's memory when the host requests that. The
> > problem
> > is: the back-end driver on the host can not really save and restore the states
> > related to the front-end balloon driver on guest hibernation, so we made the
> > decision that balloon up/down and hot-add/remove are not supported when
> > we enable hibernation for a guest; BTW, we still want to load the front-end
> > driver in the guest, because the driver has a functionality of reporting the
> > guest's memory pressure to the host, which we think is useful.
> >
> > On x86_32 and x86_64, we enable hibernation for a guest by enabling
> > the virtual ACPI S4 state for the guest; on ARM64, so far we don't have the
> > host side changes required to support guest hibernation, so the details are
> > still unclear.
> >
> > After I discussed with Michael Kelley, it looks we don't really need to
> > export drivers/acpi/sleep.c: acpi_sleep_state_supported(), but I think we do
> > need to make it non-static.
> >
> > Now I propose the below changes. I plan to submit a patch first for the
> > changes made to drivers/acpi/sleep.c and include/acpi/acpi_bus.h in a few
> > days, if there is no objection.
> >
> > Please let me know how you think of this. Thanks!
> 
> No.
> 
> Hibernation should be always supported, no matter what firmware. If it
> can powerdown, it can hibernate.
> 
> That is for x86-32/64, too.
> 						Pavel

Hi Pavel,
Yes, I totally agree hibernation should be always supported, as long as the
system can power down. This is also true for Linux guest running on
Hyper-V, when the pava-virtualized drivers are not loaded in the guest

However, unluckily the situation is a little different when the pava-virtualized
drivers are loaded in the guest:

1) Old Hyper-V hosts are unable to support guest hibernation and these hosts
don't support the virtual ACPI S4 state.

2) The recent Hyper-V host is able to support guest hibernation, but when
guest hibernation is used, the guest needs to disable some features in the
guest balloon driver, as I explained in the previous mail; on the other hand,
we also want to keep the ability to enable the features, so the Hyper-V guys 
decided to use the absence of the virtual ACPI S4 state to signify the ability
of enabling the features, and there is a host command tool which can
enable or disable the virtual ACPI S4 state for a guest.

In summary:
1) On old Hyper-V hosts or a recent host, the virtual ACPI S4 state is
unsupported or disabled, respectively, and we don't allow Linux guest
hibernation so we can use the full features of the guest balloon driver.

2) On a recent host, after we use the host command tool to enable the
virtual ACPI S4 state, we allow Linux guest hibernation and we only use a
subset of the full features of the guest balloon driver. 

This is why I need to know if the virtual ACPI S4 state is supported-and-enabled.

Exporting acpi_sleep_state_supported() may be overkill, and now making it
non-static seems the only way to help me out. IMO the function is unlikely
to change in the future, and it should be stable enough to become non-static.

Looking forward to your insights!

Thanks,
-- Dexuan

^ permalink raw reply

* Re: [PATCH] ACPI: PM: Export the function acpi_sleep_state_supported()
From: Pavel Machek @ 2019-06-20 11:30 UTC (permalink / raw)
  To: Dexuan Cui
  Cc: Lorenzo Pieralisi, linux-acpi@vger.kernel.org, rjw@rjwysocki.net,
	lenb@kernel.org, robert.moore@intel.com, erik.schmauss@intel.com,
	Russell King, Russ Dill, Sebastian Capella, Michael Kelley,
	linux-hyperv@vger.kernel.org, linux-kernel@vger.kernel.org,
	KY Srinivasan, Stephen Hemminger, Haiyang Zhang, Sasha Levin,
	olaf@aepfle.de, apw@canonical.com, jasowang@redhat.com, vkuznets,
	marcelo.cerri@canonical.com
In-Reply-To: <PU1P153MB016902786ABA34BD01430F83BFE50@PU1P153MB0169.APCP153.PROD.OUTLOOK.COM>

> > From: linux-hyperv-owner@vger.kernel.org
> > <linux-hyperv-owner@vger.kernel.org> On Behalf Of Lorenzo Pieralisi
> > Sent: Monday, June 17, 2019 9:15 AM
> > > ...
> > > + some ARM experts who worked on arch/arm/kernel/hibernate.c.
> > >
> > > drivers/acpi/sleep.c is only built if ACPI_SYSTEM_POWER_STATES_SUPPORT
> > > is defined, but it looks this option is not defined on ARM.
> > >
> > > It looks ARM does not support the ACPI S4 state, then how do we know
> > > if an ARM host supports hibernation or not?
> > 
> > Maybe we should start from understanding why you need to know whether
> > Hibernate is possible to answer your question ?
> > 
> > On ARM64 platforms system states are entered through PSCI firmware
> > interface that works for ACPI and device tree alike.
> > 
> > Lorenzo
> 
> Hi Lorenzo,
> It looks I may have confused you as I didn't realize the word "ARM" only means
> 32-bit ARM. It looks the "ARM" arch and the "ARM64" arch are very different.
> 
> As far as I know, Hyper-V only supports x86 and "ARM64", and it's unlikely to
> support 32-bit ARM in the future, so actually I don't really need to know if and
> how a 32-bit ARM machine supports hibernation.
> 
> When a Linux guest runs on Hyper-V (x86_32, x86_64, or ARM64) , we have a
> front-end balloon driver in the guest, which balloons up/down and
> hot adds/removes the guest's memory when the host requests that. The problem
> is: the back-end driver on the host can not really save and restore the states
> related to the front-end balloon driver on guest hibernation, so we made the
> decision that balloon up/down and hot-add/remove are not supported when
> we enable hibernation for a guest; BTW, we still want to load the front-end
> driver in the guest, because the dirver has a functionality of reporting the
> guest's memory pressure to the host, which we think is useful.
> 
> On x86_32 and x86_64, we enable hibernation for a guest by enabling
> the virtual ACPI S4 state for the guest; on ARM64, so far we don't have the
> host side changes required to support guest hibernation, so the details are
> still unclear.
> 
> After I discussed with Michael Kelley, it looks we don't really need to
> export drivers/acpi/sleep.c: acpi_sleep_state_supported(), but I think we do
> need to make it non-static.
> 
> Now I propose the below changes. I plan to submit a patch first for the
> changes made to drivers/acpi/sleep.c and include/acpi/acpi_bus.h in a few
> days, if there is no objection.
> 
> Please let me know how you think of this. Thanks!

No.

Hibernation should be always supported, no matter what firmware. If it
can powerdown, it can hibernate.

That is for x86-32/64, too.
									Pavel

-- 
(english) http://www.livejournal.com/~pavelmachek
(cesky, pictures) http://atrey.karlin.mff.cuni.cz/~pavel/picture/horses/blog.html

^ permalink raw reply

* Re: [PATCH 7/8] mpt3sas: set an unlimited max_segment_size for SAS 3.0 HBAs
From: Suganath Prabu Subramani @ 2019-06-20  7:08 UTC (permalink / raw)
  To: Ming Lei
  Cc: Christoph Hellwig, Martin K . Petersen, Sagi Grimberg,
	Max Gurtovoy, Bart Van Assche, linux-rdma, Linux SCSI List,
	megaraidlinux.pdl, PDL-MPT-FUSIONLINUX, linux-hyperv,
	Linux Kernel Mailing List
In-Reply-To: <CA+RiK64sFfY79i7q2YbN5HcZ4wzVOcLWgDJnPbf6=ycdcmC-Mg@mail.gmail.com>

Acked-by: Suganath Prabu <suganath-prabu.subramani@broadcom.com>

On Thu, Jun 20, 2019 at 12:34 PM Suganath Prabu Subramani
<suganath-prabu.subramani@broadcom.com> wrote:
>
> Please consider this as Acked-by: Suganath Prabu
> <suganath-prabu.subramani@broadcom.com>
>
>
> On Tue, Jun 18, 2019 at 6:16 AM Ming Lei <tom.leiming@gmail.com> wrote:
> >
> > On Mon, Jun 17, 2019 at 8:21 PM Christoph Hellwig <hch@lst.de> wrote:
> > >
> > > When using a virt_boundary_mask, as done for NVMe devices attached to
> > > mpt3sas controllers we require an unlimited max_segment_size, as the
> > > virt boundary merging code assumes that.  But we also need to propagate
> > > that to the DMA mapping layer to make dma-debug happy.  The SCSI layer
> > > takes care of that when using the per-host virt_boundary setting, but
> > > given that mpt3sas only wants to set the virt_boundary for actual
> > > NVMe devices we can't rely on that.  The DMA layer maximum segment
> > > is global to the HBA however, so we have to set it explicitly.  This
> > > patch assumes that mpt3sas does not have a segment size limitation,
> > > which seems true based on the SGL format, but will need to be verified.
> > >
> > > Signed-off-by: Christoph Hellwig <hch@lst.de>
> > > ---
> > >  drivers/scsi/mpt3sas/mpt3sas_scsih.c | 1 +
> > >  1 file changed, 1 insertion(+)
> > >
> > > diff --git a/drivers/scsi/mpt3sas/mpt3sas_scsih.c b/drivers/scsi/mpt3sas/mpt3sas_scsih.c
> > > index 1ccfbc7eebe0..c719b807f6d8 100644
> > > --- a/drivers/scsi/mpt3sas/mpt3sas_scsih.c
> > > +++ b/drivers/scsi/mpt3sas/mpt3sas_scsih.c
> > > @@ -10222,6 +10222,7 @@ static struct scsi_host_template mpt3sas_driver_template = {
> > >         .this_id                        = -1,
> > >         .sg_tablesize                   = MPT3SAS_SG_DEPTH,
> > >         .max_sectors                    = 32767,
> > > +       .max_segment_size               = 0xffffffff,
> >
> > .max_segment_size should be aligned, either setting it here correctly or
> > forcing to make it aligned in scsi-core.
> >
> > Thanks,
> > Ming Lei

^ permalink raw reply

* Re: [PATCH 7/8] mpt3sas: set an unlimited max_segment_size for SAS 3.0 HBAs
From: Suganath Prabu Subramani @ 2019-06-20  7:04 UTC (permalink / raw)
  To: Ming Lei
  Cc: Christoph Hellwig, Martin K . Petersen, Sagi Grimberg,
	Max Gurtovoy, Bart Van Assche, linux-rdma, Linux SCSI List,
	megaraidlinux.pdl, PDL-MPT-FUSIONLINUX, linux-hyperv,
	Linux Kernel Mailing List
In-Reply-To: <CACVXFVObpdjN6V9qS-C9NG5xcrPqmx-X22qVamOSZf81Vog6zw@mail.gmail.com>

Please consider this as Acked-by: Suganath Prabu
<suganath-prabu.subramani@broadcom.com>


On Tue, Jun 18, 2019 at 6:16 AM Ming Lei <tom.leiming@gmail.com> wrote:
>
> On Mon, Jun 17, 2019 at 8:21 PM Christoph Hellwig <hch@lst.de> wrote:
> >
> > When using a virt_boundary_mask, as done for NVMe devices attached to
> > mpt3sas controllers we require an unlimited max_segment_size, as the
> > virt boundary merging code assumes that.  But we also need to propagate
> > that to the DMA mapping layer to make dma-debug happy.  The SCSI layer
> > takes care of that when using the per-host virt_boundary setting, but
> > given that mpt3sas only wants to set the virt_boundary for actual
> > NVMe devices we can't rely on that.  The DMA layer maximum segment
> > is global to the HBA however, so we have to set it explicitly.  This
> > patch assumes that mpt3sas does not have a segment size limitation,
> > which seems true based on the SGL format, but will need to be verified.
> >
> > Signed-off-by: Christoph Hellwig <hch@lst.de>
> > ---
> >  drivers/scsi/mpt3sas/mpt3sas_scsih.c | 1 +
> >  1 file changed, 1 insertion(+)
> >
> > diff --git a/drivers/scsi/mpt3sas/mpt3sas_scsih.c b/drivers/scsi/mpt3sas/mpt3sas_scsih.c
> > index 1ccfbc7eebe0..c719b807f6d8 100644
> > --- a/drivers/scsi/mpt3sas/mpt3sas_scsih.c
> > +++ b/drivers/scsi/mpt3sas/mpt3sas_scsih.c
> > @@ -10222,6 +10222,7 @@ static struct scsi_host_template mpt3sas_driver_template = {
> >         .this_id                        = -1,
> >         .sg_tablesize                   = MPT3SAS_SG_DEPTH,
> >         .max_sectors                    = 32767,
> > +       .max_segment_size               = 0xffffffff,
>
> .max_segment_size should be aligned, either setting it here correctly or
> forcing to make it aligned in scsi-core.
>
> Thanks,
> Ming Lei

^ permalink raw reply

* RE: [PATCH 1/8] scsi: add a host / host template field for the virt boundary
From: Kashyap Desai @ 2019-06-20  6:17 UTC (permalink / raw)
  To: Ming Lei, Christoph Hellwig
  Cc: Martin K . Petersen, Sagi Grimberg, Max Gurtovoy, Bart Van Assche,
	linux-rdma, Linux SCSI List, PDL,MEGARAIDLINUX,
	PDL-MPT-FUSIONLINUX, linux-hyperv, Linux Kernel Mailing List,
	Kashyap Desai
In-Reply-To: <CACVXFVOwCeM2JzefBpKsVZrEaWpSBR0DF8qp4oKfoHm+pwLBYw@mail.gmail.com>

> -----Original Message-----
> From: megaraidlinux.pdl@broadcom.com
> [mailto:megaraidlinux.pdl@broadcom.com] On Behalf Of Ming Lei
> Sent: Tuesday, June 18, 2019 6:05 AM
> To: Christoph Hellwig <hch@lst.de>
> Cc: Martin K . Petersen <martin.petersen@oracle.com>; Sagi Grimberg
> <sagi@grimberg.me>; Max Gurtovoy <maxg@mellanox.com>; Bart Van
> Assche <bvanassche@acm.org>; linux-rdma <linux-rdma@vger.kernel.org>;
> Linux SCSI List <linux-scsi@vger.kernel.org>;
> megaraidlinux.pdl@broadcom.com; MPT-FusionLinux.pdl@broadcom.com;
> linux-hyperv@vger.kernel.org; Linux Kernel Mailing List <linux-
> kernel@vger.kernel.org>
> Subject: Re: [PATCH 1/8] scsi: add a host / host template field for the
> virt
> boundary
>
> On Mon, Jun 17, 2019 at 8:21 PM Christoph Hellwig <hch@lst.de> wrote:
> >
> > This allows drivers setting it up easily instead of branching out to
> > block layer calls in slave_alloc, and ensures the upgraded
> > max_segment_size setting gets picked up by the DMA layer.
> >
> > Signed-off-by: Christoph Hellwig <hch@lst.de>
> > ---
> >  drivers/scsi/hosts.c     | 3 +++
> >  drivers/scsi/scsi_lib.c  | 3 ++-
> >  include/scsi/scsi_host.h | 3 +++
> >  3 files changed, 8 insertions(+), 1 deletion(-)
> >
> > diff --git a/drivers/scsi/hosts.c b/drivers/scsi/hosts.c index
> > ff0d8c6a8d0c..55522b7162d3 100644
> > --- a/drivers/scsi/hosts.c
> > +++ b/drivers/scsi/hosts.c
> > @@ -462,6 +462,9 @@ struct Scsi_Host *scsi_host_alloc(struct
> scsi_host_template *sht, int privsize)
> >         else
> >                 shost->dma_boundary = 0xffffffff;
> >
> > +       if (sht->virt_boundary_mask)
> > +               shost->virt_boundary_mask = sht->virt_boundary_mask;
> > +
> >         device_initialize(&shost->shost_gendev);
> >         dev_set_name(&shost->shost_gendev, "host%d", shost->host_no);
> >         shost->shost_gendev.bus = &scsi_bus_type; diff --git
> > a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c index
> > 65d0a10c76ad..d333bb6b1c59 100644
> > --- a/drivers/scsi/scsi_lib.c
> > +++ b/drivers/scsi/scsi_lib.c
> > @@ -1775,7 +1775,8 @@ void __scsi_init_queue(struct Scsi_Host *shost,
> struct request_queue *q)
> >         dma_set_seg_boundary(dev, shost->dma_boundary);
> >
> >         blk_queue_max_segment_size(q, shost->max_segment_size);
> > -       dma_set_max_seg_size(dev, shost->max_segment_size);
> > +       blk_queue_virt_boundary(q, shost->virt_boundary_mask);
> > +       dma_set_max_seg_size(dev, queue_max_segment_size(q));
>
> The patch looks fine, also suggest to make sure that max_segment_size is
> block-size aligned, and un-aligned max segment size has caused trouble on
> mmc.

I validated changes on latest and few older series controllers.
Post changes, I noticed max_segment_size is updated.
find /sys/ -name max_segment_size  |grep sdb |xargs grep -r .
/sys/devices/pci0000:3a/0000:3a:00.0/0000:3b:00.0/0000:3c:04.0/0000:40:00.0/0000:41:00.0/0000:42:00.0/host0/target0:2:12/0:2:12:0/block/sdb/queue/max_segment_size:4294967295

I verify that single SGE having 1MB transfer length is working fine.

Acked-by: Kashyap Desai < kashyap.desai@broadcom.com>

>
> Thanks,
> Ming Lei
>

^ permalink raw reply

* RE: [PATCH] ACPI: PM: Export the function acpi_sleep_state_supported()
From: Dexuan Cui @ 2019-06-19 19:54 UTC (permalink / raw)
  To: Lorenzo Pieralisi, linux-acpi@vger.kernel.org, rjw@rjwysocki.net,
	lenb@kernel.org, robert.moore@intel.com, erik.schmauss@intel.com,
	Russell King, Russ Dill, Sebastian Capella, Pavel Machek
  Cc: Michael Kelley, linux-hyperv@vger.kernel.org,
	linux-kernel@vger.kernel.org, KY Srinivasan, Stephen Hemminger,
	Haiyang Zhang, Sasha Levin, olaf@aepfle.de, apw@canonical.com,
	jasowang@redhat.com, vkuznets, marcelo.cerri@canonical.com
In-Reply-To: <20190617161454.GB27113@e121166-lin.cambridge.arm.com>

> From: linux-hyperv-owner@vger.kernel.org
> <linux-hyperv-owner@vger.kernel.org> On Behalf Of Lorenzo Pieralisi
> Sent: Monday, June 17, 2019 9:15 AM
> > ...
> > + some ARM experts who worked on arch/arm/kernel/hibernate.c.
> >
> > drivers/acpi/sleep.c is only built if ACPI_SYSTEM_POWER_STATES_SUPPORT
> > is defined, but it looks this option is not defined on ARM.
> >
> > It looks ARM does not support the ACPI S4 state, then how do we know
> > if an ARM host supports hibernation or not?
> 
> Maybe we should start from understanding why you need to know whether
> Hibernate is possible to answer your question ?
> 
> On ARM64 platforms system states are entered through PSCI firmware
> interface that works for ACPI and device tree alike.
> 
> Lorenzo

Hi Lorenzo,
It looks I may have confused you as I didn't realize the word "ARM" only means
32-bit ARM. It looks the "ARM" arch and the "ARM64" arch are very different.

As far as I know, Hyper-V only supports x86 and "ARM64", and it's unlikely to
support 32-bit ARM in the future, so actually I don't really need to know if and
how a 32-bit ARM machine supports hibernation.

When a Linux guest runs on Hyper-V (x86_32, x86_64, or ARM64) , we have a
front-end balloon driver in the guest, which balloons up/down and
hot adds/removes the guest's memory when the host requests that. The problem
is: the back-end driver on the host can not really save and restore the states
related to the front-end balloon driver on guest hibernation, so we made the
decision that balloon up/down and hot-add/remove are not supported when
we enable hibernation for a guest; BTW, we still want to load the front-end
driver in the guest, because the dirver has a functionality of reporting the
guest's memory pressure to the host, which we think is useful.

On x86_32 and x86_64, we enable hibernation for a guest by enabling
the virtual ACPI S4 state for the guest; on ARM64, so far we don't have the
host side changes required to support guest hibernation, so the details are
still unclear.

After I discussed with Michael Kelley, it looks we don't really need to
export drivers/acpi/sleep.c: acpi_sleep_state_supported(), but I think we do
need to make it non-static.

Now I propose the below changes. I plan to submit a patch first for the
changes made to drivers/acpi/sleep.c and include/acpi/acpi_bus.h in a few
days, if there is no objection.

Please let me know how you think of this. Thanks!

diff --git a/drivers/acpi/sleep.c b/drivers/acpi/sleep.c
index a34deccd7317..18d2e5922448 100644
--- a/drivers/acpi/sleep.c
+++ b/drivers/acpi/sleep.c
@@ -79,7 +79,7 @@ static int acpi_sleep_prepare(u32 acpi_state)
 	return 0;
 }
 
-static bool acpi_sleep_state_supported(u8 sleep_state)
+bool acpi_sleep_state_supported(u8 sleep_state)
 {
 	acpi_status status;
 	u8 type_a, type_b;
diff --git a/include/acpi/acpi_bus.h b/include/acpi/acpi_bus.h
index 31b6c87d6240..3e6563e1a2c0 100644
--- a/include/acpi/acpi_bus.h
+++ b/include/acpi/acpi_bus.h
@@ -651,6 +651,12 @@ static inline int acpi_pm_set_bridge_wakeup(struct device *dev, bool enable)
 }
 #endif
 
+#ifdef CONFIG_ACPI_SYSTEM_POWER_STATES_SUPPORT
+bool acpi_sleep_state_supported(u8 sleep_state);
+#else
+bool acpi_sleep_state_supported(u8 sleep_state) { return false; }
+#endif
+
 #ifdef CONFIG_ACPI_SLEEP
 u32 acpi_target_system_state(void);
 #else
diff --git a/arch/x86/hyperv/hv_init.c b/arch/x86/hyperv/hv_init.c
index 79f626ab8306..197e8fa32871 100644
--- a/arch/x86/hyperv/hv_init.c
+++ b/arch/x86/hyperv/hv_init.c
@@ -17,6 +17,7 @@
  *
  */
 
+#include <linux/acpi.h>
 #include <linux/efi.h>
 #include <linux/types.h>
 #include <asm/apic.h>
@@ -420,3 +421,9 @@ bool hv_is_hyperv_initialized(void)
 	return hypercall_msr.enable;
 }
 EXPORT_SYMBOL_GPL(hv_is_hyperv_initialized);
+
+bool hv_is_hibernation_supported(void)
+{
+	return acpi_sleep_state_supported(ACPI_STATE_S4);
+}
+EXPORT_SYMBOL_GPL(hv_is_hibernation_supported);
diff --git a/include/asm-generic/mshyperv.h b/include/asm-generic/mshyperv.h
index 0becb7d9704d..1cb40016ee53 100644
--- a/include/asm-generic/mshyperv.h
+++ b/include/asm-generic/mshyperv.h
@@ -166,9 +166,11 @@ static inline int cpumask_to_vpset(struct hv_vpset *vpset,
 void hyperv_report_panic(struct pt_regs *regs, long err);
 void hyperv_report_panic_msg(phys_addr_t pa, size_t size);
 bool hv_is_hyperv_initialized(void);
+bool hv_is_hibernation_supported(void);
 void hyperv_cleanup(void);
 #else /* CONFIG_HYPERV */
 static inline bool hv_is_hyperv_initialized(void) { return false; }
+static inline bool hv_is_hibernation_supported(void) { return false; }
 static inline void hyperv_cleanup(void) {}
 #endif /* CONFIG_HYPERV */
 
diff --git a/drivers/hv/hv_balloon.c b/drivers/hv/hv_balloon.c
index 111ea3599659..39fd4e4a8fd7 100644
--- a/drivers/hv/hv_balloon.c
+++ b/drivers/hv/hv_balloon.c
@@ -34,6 +34,8 @@
 
 #include <linux/hyperv.h>
 
+#include <asm/mshyperv.h>
+
 #define CREATE_TRACE_POINTS
 #include "hv_trace_balloon.h"
 
@@ -1682,6 +1684,9 @@ static int balloon_probe(struct hv_device *dev,
 {
 	int ret;
 
+	if (hv_is_hibernation_supported())
+		hot_add = false;
+
 #ifdef CONFIG_MEMORY_HOTPLUG
 	do_hot_add = hot_add;
 #else


Thanks,
-- Dexuan

^ permalink raw reply related

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox