Linux-HyperV List

Linux-HyperV List
 help / color / mirror / Atom feed

* RE: [PATCH V2 1/9] x86/hyperv: Add sev-snp enlightened guest static key
From: Michael Kelley (LINUX) @ 2023-07-04 14:17 UTC (permalink / raw)
  To: Tianyu Lan, KY Srinivasan, Haiyang Zhang, wei.liu@kernel.org,
	Dexuan Cui, tglx@linutronix.de, mingo@redhat.com, bp@alien8.de,
	dave.hansen@linux.intel.com, x86@kernel.org, hpa@zytor.com,
	daniel.lezcano@linaro.org, arnd@arndb.de
  Cc: Tianyu Lan, linux-arch@vger.kernel.org,
	linux-hyperv@vger.kernel.org, linux-kernel@vger.kernel.org,
	vkuznets@redhat.com
In-Reply-To: <20230627032248.2170007-2-ltykernel@gmail.com>

From: Tianyu Lan <ltykernel@gmail.com> Sent: Monday, June 26, 2023 8:23 PM
> 
> Introduce static key isolation_type_en_snp for enlightened
> sev-snp guest check.
> 
> Signed-off-by: Tianyu Lan <tiala@microsoft.com>
> ---
>  arch/x86/hyperv/ivm.c           | 11 +++++++++++
>  arch/x86/include/asm/mshyperv.h |  3 +++
>  arch/x86/kernel/cpu/mshyperv.c  |  9 +++++++--
>  drivers/hv/hv_common.c          |  6 ++++++
>  include/asm-generic/mshyperv.h  | 12 +++++++++---
>  5 files changed, 36 insertions(+), 5 deletions(-)
> 
> diff --git a/arch/x86/hyperv/ivm.c b/arch/x86/hyperv/ivm.c
> index cc92388b7a99..5d3ee3124e00 100644
> --- a/arch/x86/hyperv/ivm.c
> +++ b/arch/x86/hyperv/ivm.c
> @@ -409,3 +409,14 @@ bool hv_isolation_type_snp(void)
>  {
>  	return static_branch_unlikely(&isolation_type_snp);
>  }
> +
> +DEFINE_STATIC_KEY_FALSE(isolation_type_en_snp);
> +/*
> + * hv_isolation_type_en_snp - Check system runs in the AMD SEV-SNP based
> + * isolation enlightened VM.
> + */
> +bool hv_isolation_type_en_snp(void)
> +{
> +	return static_branch_unlikely(&isolation_type_en_snp);
> +}
> +

Vitaly had suggested some renaming of this and related variables and
functions, which you had agreed to do.  But I don't see that renaming
reflected in this patch or throughout the full patch set.

Michael

> diff --git a/arch/x86/include/asm/mshyperv.h b/arch/x86/include/asm/mshyperv.h
> index 49bb4f2bd300..31c476f4e656 100644
> --- a/arch/x86/include/asm/mshyperv.h
> +++ b/arch/x86/include/asm/mshyperv.h
> @@ -26,6 +26,7 @@
>  union hv_ghcb;
> 
>  DECLARE_STATIC_KEY_FALSE(isolation_type_snp);
> +DECLARE_STATIC_KEY_FALSE(isolation_type_en_snp);
> 
>  typedef int (*hyperv_fill_flush_list_func)(
>  		struct hv_guest_mapping_flush_list *flush,
> @@ -45,6 +46,8 @@ extern void *hv_hypercall_pg;
> 
>  extern u64 hv_current_partition_id;
> 
> +extern bool hv_isolation_type_en_snp(void);
> +
>  extern union hv_ghcb * __percpu *hv_ghcb_pg;
> 
>  int hv_call_deposit_pages(int node, u64 partition_id, u32 num_pages);
> diff --git a/arch/x86/kernel/cpu/mshyperv.c b/arch/x86/kernel/cpu/mshyperv.c
> index c7969e806c64..5398fb2f4d39 100644
> --- a/arch/x86/kernel/cpu/mshyperv.c
> +++ b/arch/x86/kernel/cpu/mshyperv.c
> @@ -402,8 +402,12 @@ static void __init ms_hyperv_init_platform(void)
>  		pr_info("Hyper-V: Isolation Config: Group A 0x%x, Group B 0x%x\n",
>  			ms_hyperv.isolation_config_a, ms_hyperv.isolation_config_b);
> 
> -		if (hv_get_isolation_type() == HV_ISOLATION_TYPE_SNP)
> +
> +		if (cc_platform_has(CC_ATTR_GUEST_SEV_SNP)) {
> +			static_branch_enable(&isolation_type_en_snp);
> +		} else if (hv_get_isolation_type() == HV_ISOLATION_TYPE_SNP) {
>  			static_branch_enable(&isolation_type_snp);
> +		}
>  	}
> 
>  	if (hv_max_functions_eax >= HYPERV_CPUID_NESTED_FEATURES) {
> @@ -473,7 +477,8 @@ static void __init ms_hyperv_init_platform(void)
> 
>  #if IS_ENABLED(CONFIG_HYPERV)
>  	if ((hv_get_isolation_type() == HV_ISOLATION_TYPE_VBS) ||
> -	    (hv_get_isolation_type() == HV_ISOLATION_TYPE_SNP))
> +	    ((hv_get_isolation_type() == HV_ISOLATION_TYPE_SNP) &&
> +	    ms_hyperv.paravisor_present))
>  		hv_vtom_init();
>  	/*
>  	 * Setup the hook to get control post apic initialization.
> diff --git a/drivers/hv/hv_common.c b/drivers/hv/hv_common.c
> index 542a1d53b303..4b4aa53c34c2 100644
> --- a/drivers/hv/hv_common.c
> +++ b/drivers/hv/hv_common.c
> @@ -502,6 +502,12 @@ bool __weak hv_isolation_type_snp(void)
>  }
>  EXPORT_SYMBOL_GPL(hv_isolation_type_snp);
> 
> +bool __weak hv_isolation_type_en_snp(void)
> +{
> +	return false;
> +}
> +EXPORT_SYMBOL_GPL(hv_isolation_type_en_snp);
> +
>  void __weak hv_setup_vmbus_handler(void (*handler)(void))
>  {
>  }
> diff --git a/include/asm-generic/mshyperv.h b/include/asm-generic/mshyperv.h
> index 402a8c1c202d..6b5c41f90398 100644
> --- a/include/asm-generic/mshyperv.h
> +++ b/include/asm-generic/mshyperv.h
> @@ -36,15 +36,21 @@ struct ms_hyperv_info {
>  	u32 nested_features;
>  	u32 max_vp_index;
>  	u32 max_lp_index;
> -	u32 isolation_config_a;
> +	union {
> +		u32 isolation_config_a;
> +		struct {
> +			u32 paravisor_present : 1;
> +			u32 reserved_a1 : 31;
> +		};
> +	};
>  	union {
>  		u32 isolation_config_b;
>  		struct {
>  			u32 cvm_type : 4;
> -			u32 reserved1 : 1;
> +			u32 reserved_b1 : 1;
>  			u32 shared_gpa_boundary_active : 1;
>  			u32 shared_gpa_boundary_bits : 6;
> -			u32 reserved2 : 20;
> +			u32 reserved_b2 : 20;
>  		};
>  	};
>  	u64 shared_gpa_boundary;
> --
> 2.25.1


^ permalink raw reply

* RE: [EXTERNAL] Re: [PATCH V4 net] net: mana: Fix MANA VF unload when host is unresponsive
From: Haiyang Zhang @ 2023-07-04 13:42 UTC (permalink / raw)
  To: Souradeep Chakrabarti, Alexander Lobakin, souradeep chakrabarti
  Cc: KY Srinivasan, wei.liu@kernel.org, Dexuan Cui,
	davem@davemloft.net, edumazet@google.com, kuba@kernel.org,
	pabeni@redhat.com, Long Li, Ajay Sharma, leon@kernel.org,
	cai.huoqing@linux.dev, ssengar@linux.microsoft.com,
	vkuznets@redhat.com, tglx@linutronix.de,
	linux-hyperv@vger.kernel.org, netdev@vger.kernel.org,
	linux-kernel@vger.kernel.org, linux-rdma@vger.kernel.org,
	stable@vger.kernel.org
In-Reply-To: <PUZP153MB07880E6D692FD5D13C508694CC29A@PUZP153MB0788.APCP153.PROD.OUTLOOK.COM>



> -----Original Message-----
> From: Souradeep Chakrabarti <schakrabarti@microsoft.com>
> Sent: Monday, July 3, 2023 3:55 PM
> To: Alexander Lobakin <aleksander.lobakin@intel.com>; souradeep chakrabarti
> <schakrabarti@linux.microsoft.com>
> Cc: KY Srinivasan <kys@microsoft.com>; Haiyang Zhang
> <haiyangz@microsoft.com>; wei.liu@kernel.org; Dexuan Cui
> <decui@microsoft.com>; davem@davemloft.net; edumazet@google.com;
> kuba@kernel.org; pabeni@redhat.com; Long Li <longli@microsoft.com>; Ajay
> Sharma <sharmaajay@microsoft.com>; leon@kernel.org;
> cai.huoqing@linux.dev; ssengar@linux.microsoft.com; vkuznets@redhat.com;
> tglx@linutronix.de; linux-hyperv@vger.kernel.org; netdev@vger.kernel.org;
> linux-kernel@vger.kernel.org; linux-rdma@vger.kernel.org;
> stable@vger.kernel.org
> Subject: RE: [EXTERNAL] Re: [PATCH V4 net] net: mana: Fix MANA VF unload
> when host is unresponsive
> 
> 
> 
> >-----Original Message-----
> >From: Alexander Lobakin <aleksander.lobakin@intel.com>
> >Sent: Monday, July 3, 2023 10:18 PM
> >To: souradeep chakrabarti <schakrabarti@linux.microsoft.com>
> >Cc: KY Srinivasan <kys@microsoft.com>; Haiyang Zhang
> ><haiyangz@microsoft.com>; wei.liu@kernel.org; Dexuan Cui
> ><decui@microsoft.com>; davem@davemloft.net; edumazet@google.com;
> >kuba@kernel.org; pabeni@redhat.com; Long Li <longli@microsoft.com>; Ajay
> >Sharma <sharmaajay@microsoft.com>; leon@kernel.org;
> >cai.huoqing@linux.dev; ssengar@linux.microsoft.com; vkuznets@redhat.com;
> >tglx@linutronix.de; linux-hyperv@vger.kernel.org; netdev@vger.kernel.org;
> >linux-kernel@vger.kernel.org; linux-rdma@vger.kernel.org;
> >stable@vger.kernel.org; Souradeep Chakrabarti
> <schakrabarti@microsoft.com>
> >Subject: [EXTERNAL] Re: [PATCH V4 net] net: mana: Fix MANA VF unload when
> >host is unresponsive
> >
> >From: Souradeep Chakrabarti <schakrabarti@linux.microsoft.com>
> >Date: Mon,  3 Jul 2023 01:49:31 -0700
> >
> >> From: Souradeep Chakrabarti <schakrabarti@linux.microsoft.com>
> >
> >Please sync your Git name and Git mail account settings, so that your own
> >patches won't have "From:" when sending. From what I see, you need to
> >correct first letters of name and surname to capital in the Git email settings
> >block.
> Thank you for pointing, I will fix it.
> >
> >>
> >> When unloading the MANA driver, mana_dealloc_queues() waits for the
> >> MANA hardware to complete any inflight packets and set the pending
> >> send count to zero. But if the hardware has failed,
> >> mana_dealloc_queues() could wait forever.
> >>
> >> Fix this by adding a timeout to the wait. Set the timeout to 120
> >> seconds, which is a somewhat arbitrary value that is more than long
> >> enough for functional hardware to complete any sends.
> >>
> >> Signed-off-by: Souradeep Chakrabarti
> >> <schakrabarti@linux.microsoft.com>
> >
> >Where's "Fixes:" tagging the blamed commit?
> This is present from the day zero of the mana driver code.
> It has not been introduced in the code by any commit.
> >
> >> ---
> >> V3 -> V4:
> >> * Fixed the commit message to describe the context.
> >> * Removed the vf_unload_timeout, as it is not required.
> >> ---
> >>  drivers/net/ethernet/microsoft/mana/mana_en.c | 26
> >> ++++++++++++++++---
> >>  1 file changed, 23 insertions(+), 3 deletions(-)
> >>
> >> diff --git a/drivers/net/ethernet/microsoft/mana/mana_en.c
> >> b/drivers/net/ethernet/microsoft/mana/mana_en.c
> >> index a499e460594b..d26f1da70411 100644
> >> --- a/drivers/net/ethernet/microsoft/mana/mana_en.c
> >> +++ b/drivers/net/ethernet/microsoft/mana/mana_en.c
> >> @@ -2346,7 +2346,10 @@ static int mana_dealloc_queues(struct
> >> net_device *ndev)  {
> >>  	struct mana_port_context *apc = netdev_priv(ndev);
> >>  	struct gdma_dev *gd = apc->ac->gdma_dev;
> >> +	unsigned long timeout;
> >>  	struct mana_txq *txq;
> >> +	struct sk_buff *skb;
> >> +	struct mana_cq *cq;
> >>  	int i, err;
> >>
> >>  	if (apc->port_is_up)
> >> @@ -2363,15 +2366,32 @@ static int mana_dealloc_queues(struct
> >net_device *ndev)
> >>  	 * to false, but it doesn't matter since mana_start_xmit() drops any
> >>  	 * new packets due to apc->port_is_up being false.
> >>  	 *
> >> -	 * Drain all the in-flight TX packets
> >> +	 * Drain all the in-flight TX packets.
> >> +	 * A timeout of 120 seconds for all the queues is used.
> >> +	 * This will break the while loop when h/w is not responding.
> >> +	 * This value of 120 has been decided here considering max
> >> +	 * number of queues.
> >>  	 */
> >> +
> >> +	timeout = jiffies + 120 * HZ;
> >
> >Why not initialize it right when declaring?
> I will fix it in next version.
> >
> >>  	for (i = 0; i < apc->num_queues; i++) {
> >>  		txq = &apc->tx_qp[i].txq;
> >> -
> >> -		while (atomic_read(&txq->pending_sends) > 0)
> >> +		while (atomic_read(&txq->pending_sends) > 0 &&
> >> +		       time_before(jiffies, timeout)) {
> >>  			usleep_range(1000, 2000);> +		}
> >>  	}
> >
> >120 seconds by 2 msec step is 60000 iterations, by 1 msec is 120000
> >iterations. I know usleep_range() often is much less precise, but still.
> >Do you really need that much time? Has this been measured during the tests
> >that it can take up to 120 seconds or is it just some random value that "should
> >be enough"?
> >If you really need 120 seconds, I'd suggest using a timer / delayed work
> instead
> >of wasting resources.
> Here the intent is not waiting for 120 seconds, rather than avoid continue
> checking the
> pending_sends  of each tx queues for an indefinite time, before freeing
> sk_buffs.
> The pending_sends can only get decreased for a tx queue,  if mana_poll_tx_cq()
> gets called for a completion notification and increased by xmit.
> 
> In this particular bug, apc->port_is_up is not set to false, causing
> xmit to keep increasing the pending_sends for the queue and
> mana_poll_tx_cq()
> not getting called for the queue.
> 
> If we see the comment in the function mana_dealloc_queues(), it mentions it :
> 
> 2346     /* No packet can be transmitted now since apc->port_is_up is false.
> 2347      * There is still a tiny chance that mana_poll_tx_cq() can re-enable
> 2348      * a txq because it may not timely see apc->port_is_up being cleared
> 2349      * to false, but it doesn't matter since mana_start_xmit() drops any
> 2350      * new packets due to apc->port_is_up being false.
> 
> The value 120 seconds has been decided here based on maximum number of
> queues
> are allowed in this specific hardware, it is a safe assumption.

I agree. Also, this waiting time is usually much shorter than 120 sec. The long 
wait only happens in rare and unexpected NIC HW non-responding cases. To 
further reduce the resource consumption, we can double the usleep_range() 
time in every iteration. So, the number of iterations will be greatly reduced 
before reaching 120 sec.

Thanks,
- Haiyang

^ permalink raw reply

* Re: [EXTERNAL] Re: [PATCH V4 net] net: mana: Fix MANA VF unload when host is unresponsive
From: Paolo Abeni @ 2023-07-04  6:59 UTC (permalink / raw)
  To: Souradeep Chakrabarti, Alexander Lobakin, souradeep chakrabarti
  Cc: KY Srinivasan, Haiyang Zhang, wei.liu@kernel.org, Dexuan Cui,
	davem@davemloft.net, edumazet@google.com, kuba@kernel.org,
	Long Li, Ajay Sharma, leon@kernel.org, cai.huoqing@linux.dev,
	ssengar@linux.microsoft.com, vkuznets@redhat.com,
	tglx@linutronix.de, linux-hyperv@vger.kernel.org,
	netdev@vger.kernel.org, linux-kernel@vger.kernel.org,
	linux-rdma@vger.kernel.org, stable@vger.kernel.org
In-Reply-To: <PUZP153MB07880E6D692FD5D13C508694CC29A@PUZP153MB0788.APCP153.PROD.OUTLOOK.COM>

On Mon, 2023-07-03 at 19:55 +0000, Souradeep Chakrabarti wrote:
> > -----Original Message-----
> > From: Alexander Lobakin <aleksander.lobakin@intel.com>
> > Sent: Monday, July 3, 2023 10:18 PM
> > To: souradeep chakrabarti <schakrabarti@linux.microsoft.com>
> > Cc: KY Srinivasan <kys@microsoft.com>; Haiyang Zhang
> > <haiyangz@microsoft.com>; wei.liu@kernel.org; Dexuan Cui
> > <decui@microsoft.com>; davem@davemloft.net; edumazet@google.com;
> > kuba@kernel.org; pabeni@redhat.com; Long Li <longli@microsoft.com>; Ajay
> > Sharma <sharmaajay@microsoft.com>; leon@kernel.org;
> > cai.huoqing@linux.dev; ssengar@linux.microsoft.com; vkuznets@redhat.com;
> > tglx@linutronix.de; linux-hyperv@vger.kernel.org; netdev@vger.kernel.org;
> > linux-kernel@vger.kernel.org; linux-rdma@vger.kernel.org;
> > stable@vger.kernel.org; Souradeep Chakrabarti <schakrabarti@microsoft.com>
> > Subject: [EXTERNAL] Re: [PATCH V4 net] net: mana: Fix MANA VF unload when
> > host is unresponsive
> > 
> > From: Souradeep Chakrabarti <schakrabarti@linux.microsoft.com>
> > Date: Mon,  3 Jul 2023 01:49:31 -0700
> > 
> > > From: Souradeep Chakrabarti <schakrabarti@linux.microsoft.com>
> > 
> > Please sync your Git name and Git mail account settings, so that your own
> > patches won't have "From:" when sending. From what I see, you need to
> > correct first letters of name and surname to capital in the Git email settings
> > block.
> Thank you for pointing, I will fix it.
> > 
> > > 
> > > When unloading the MANA driver, mana_dealloc_queues() waits for the
> > > MANA hardware to complete any inflight packets and set the pending
> > > send count to zero. But if the hardware has failed,
> > > mana_dealloc_queues() could wait forever.
> > > 
> > > Fix this by adding a timeout to the wait. Set the timeout to 120
> > > seconds, which is a somewhat arbitrary value that is more than long
> > > enough for functional hardware to complete any sends.
> > > 
> > > Signed-off-by: Souradeep Chakrabarti
> > > <schakrabarti@linux.microsoft.com>
> > 
> > Where's "Fixes:" tagging the blamed commit?
> This is present from the day zero of the mana driver code.
> It has not been introduced in the code by any commit.
> 

Then the fixes tag should be:

Fixes: ca9c54d2d6a5 ("net: mana: Add a driver for Microsoft Azure Network Adapter (MANA)")

Cheers,

Paolo


^ permalink raw reply

* Re: [PATCH v3] hv/hv_kvp_daemon: Add support for keyfile config based connection profile in NM
From: Ani Sinha @ 2023-07-04  6:25 UTC (permalink / raw)
  To: Shradha Gupta
  Cc: Wei Liu, Olaf Hering, linux-kernel, linux-hyperv,
	K. Y. Srinivasan, Haiyang Zhang, Stephen Hemminger, Dexuan Cui,
	Long Li, Michael Kelley
In-Reply-To: <20230523053627.GA10913@linuxonhyperv3.guj3yctzbm1etfxqx2vob5hsef.xx.internal.cloudapp.net>



> On 23-May-2023, at 11:06 AM, Shradha Gupta <shradhagupta@linux.microsoft.com> wrote:
> 
> On Mon, May 08, 2023 at 05:16:19PM +0000, Wei Liu wrote:
>> On Mon, May 08, 2023 at 07:12:46PM +0200, Olaf Hering wrote:
>>> Mon, 8 May 2023 16:47:54 +0000 Wei Liu <wei.liu@kernel.org>:
>>> 
>>>> Olaf, is this a reviewed-by from you? :-)
>>> 
>>> Sorry, I did not review the new functionality, just tried to make sure there will be no regression for existing consumers.
>> 
>> Okay, this is fine, too. Thank you for looking into this.
>> 

Folks, can we close on this patchset sooner? We at Red Hat also are interested in seeing this feature merged.

Thanks,


^ permalink raw reply

* RE: [EXTERNAL] Re: [PATCH V4 net] net: mana: Fix MANA VF unload when host is unresponsive
From: Souradeep Chakrabarti @ 2023-07-03 19:55 UTC (permalink / raw)
  To: Alexander Lobakin, souradeep chakrabarti
  Cc: KY Srinivasan, Haiyang Zhang, wei.liu@kernel.org, Dexuan Cui,
	davem@davemloft.net, edumazet@google.com, kuba@kernel.org,
	pabeni@redhat.com, Long Li, Ajay Sharma, leon@kernel.org,
	cai.huoqing@linux.dev, ssengar@linux.microsoft.com,
	vkuznets@redhat.com, tglx@linutronix.de,
	linux-hyperv@vger.kernel.org, netdev@vger.kernel.org,
	linux-kernel@vger.kernel.org, linux-rdma@vger.kernel.org,
	stable@vger.kernel.org
In-Reply-To: <83ef6401-8736-8416-c898-2fbbb786726e@intel.com>



>-----Original Message-----
>From: Alexander Lobakin <aleksander.lobakin@intel.com>
>Sent: Monday, July 3, 2023 10:18 PM
>To: souradeep chakrabarti <schakrabarti@linux.microsoft.com>
>Cc: KY Srinivasan <kys@microsoft.com>; Haiyang Zhang
><haiyangz@microsoft.com>; wei.liu@kernel.org; Dexuan Cui
><decui@microsoft.com>; davem@davemloft.net; edumazet@google.com;
>kuba@kernel.org; pabeni@redhat.com; Long Li <longli@microsoft.com>; Ajay
>Sharma <sharmaajay@microsoft.com>; leon@kernel.org;
>cai.huoqing@linux.dev; ssengar@linux.microsoft.com; vkuznets@redhat.com;
>tglx@linutronix.de; linux-hyperv@vger.kernel.org; netdev@vger.kernel.org;
>linux-kernel@vger.kernel.org; linux-rdma@vger.kernel.org;
>stable@vger.kernel.org; Souradeep Chakrabarti <schakrabarti@microsoft.com>
>Subject: [EXTERNAL] Re: [PATCH V4 net] net: mana: Fix MANA VF unload when
>host is unresponsive
>
>From: Souradeep Chakrabarti <schakrabarti@linux.microsoft.com>
>Date: Mon,  3 Jul 2023 01:49:31 -0700
>
>> From: Souradeep Chakrabarti <schakrabarti@linux.microsoft.com>
>
>Please sync your Git name and Git mail account settings, so that your own
>patches won't have "From:" when sending. From what I see, you need to
>correct first letters of name and surname to capital in the Git email settings
>block.
Thank you for pointing, I will fix it.
>
>>
>> When unloading the MANA driver, mana_dealloc_queues() waits for the
>> MANA hardware to complete any inflight packets and set the pending
>> send count to zero. But if the hardware has failed,
>> mana_dealloc_queues() could wait forever.
>>
>> Fix this by adding a timeout to the wait. Set the timeout to 120
>> seconds, which is a somewhat arbitrary value that is more than long
>> enough for functional hardware to complete any sends.
>>
>> Signed-off-by: Souradeep Chakrabarti
>> <schakrabarti@linux.microsoft.com>
>
>Where's "Fixes:" tagging the blamed commit?
This is present from the day zero of the mana driver code.
It has not been introduced in the code by any commit.
>
>> ---
>> V3 -> V4:
>> * Fixed the commit message to describe the context.
>> * Removed the vf_unload_timeout, as it is not required.
>> ---
>>  drivers/net/ethernet/microsoft/mana/mana_en.c | 26
>> ++++++++++++++++---
>>  1 file changed, 23 insertions(+), 3 deletions(-)
>>
>> diff --git a/drivers/net/ethernet/microsoft/mana/mana_en.c
>> b/drivers/net/ethernet/microsoft/mana/mana_en.c
>> index a499e460594b..d26f1da70411 100644
>> --- a/drivers/net/ethernet/microsoft/mana/mana_en.c
>> +++ b/drivers/net/ethernet/microsoft/mana/mana_en.c
>> @@ -2346,7 +2346,10 @@ static int mana_dealloc_queues(struct
>> net_device *ndev)  {
>>  	struct mana_port_context *apc = netdev_priv(ndev);
>>  	struct gdma_dev *gd = apc->ac->gdma_dev;
>> +	unsigned long timeout;
>>  	struct mana_txq *txq;
>> +	struct sk_buff *skb;
>> +	struct mana_cq *cq;
>>  	int i, err;
>>
>>  	if (apc->port_is_up)
>> @@ -2363,15 +2366,32 @@ static int mana_dealloc_queues(struct
>net_device *ndev)
>>  	 * to false, but it doesn't matter since mana_start_xmit() drops any
>>  	 * new packets due to apc->port_is_up being false.
>>  	 *
>> -	 * Drain all the in-flight TX packets
>> +	 * Drain all the in-flight TX packets.
>> +	 * A timeout of 120 seconds for all the queues is used.
>> +	 * This will break the while loop when h/w is not responding.
>> +	 * This value of 120 has been decided here considering max
>> +	 * number of queues.
>>  	 */
>> +
>> +	timeout = jiffies + 120 * HZ;
>
>Why not initialize it right when declaring?
I will fix it in next version.
>
>>  	for (i = 0; i < apc->num_queues; i++) {
>>  		txq = &apc->tx_qp[i].txq;
>> -
>> -		while (atomic_read(&txq->pending_sends) > 0)
>> +		while (atomic_read(&txq->pending_sends) > 0 &&
>> +		       time_before(jiffies, timeout)) {
>>  			usleep_range(1000, 2000);> +		}
>>  	}
>
>120 seconds by 2 msec step is 60000 iterations, by 1 msec is 120000
>iterations. I know usleep_range() often is much less precise, but still.
>Do you really need that much time? Has this been measured during the tests
>that it can take up to 120 seconds or is it just some random value that "should
>be enough"?
>If you really need 120 seconds, I'd suggest using a timer / delayed work instead
>of wasting resources.
Here the intent is not waiting for 120 seconds, rather than avoid continue checking the 
pending_sends  of each tx queues for an indefinite time, before freeing sk_buffs.
The pending_sends can only get decreased for a tx queue,  if mana_poll_tx_cq()
gets called for a completion notification and increased by xmit.

In this particular bug, apc->port_is_up is not set to false, causing
xmit to keep increasing the pending_sends for the queue and mana_poll_tx_cq()
not getting called for the queue.

If we see the comment in the function mana_dealloc_queues(), it mentions it :

2346     /* No packet can be transmitted now since apc->port_is_up is false.
2347      * There is still a tiny chance that mana_poll_tx_cq() can re-enable
2348      * a txq because it may not timely see apc->port_is_up being cleared
2349      * to false, but it doesn't matter since mana_start_xmit() drops any
2350      * new packets due to apc->port_is_up being false.

The value 120 seconds has been decided here based on maximum number of queues
are allowed in this specific hardware, it is a safe assumption.
>
>>
>> +	for (i = 0; i < apc->num_queues; i++) {
>> +		txq = &apc->tx_qp[i].txq;
>> +		cq = &apc->tx_qp[i].tx_cq;
>
>cq can be just &txq->tx_cq.
mana_txq  structure does not have a pointer to mana_cq.
>
>> +		while (atomic_read(&txq->pending_sends)) {
>> +			skb = skb_dequeue(&txq->pending_skbs);
>> +			mana_unmap_skb(skb, apc);
>> +			napi_consume_skb(skb, cq->budget);
>
>(you already have comment about this one)
>
>> +			atomic_sub(1, &txq->pending_sends);
>> +		}
>> +	}
>>  	/* We're 100% sure the queues can no longer be woken up, because
>>  	 * we're sure now mana_poll_tx_cq() can't be running.
>>  	 */
>
>Thanks,
>Olek

^ permalink raw reply

* Re: [PATCH V4 net] net: mana: Fix MANA VF unload when host is unresponsive
From: Alexander Lobakin @ 2023-07-03 16:47 UTC (permalink / raw)
  To: souradeep chakrabarti
  Cc: kys, haiyangz, wei.liu, decui, davem, edumazet, kuba, pabeni,
	longli, sharmaajay, leon, cai.huoqing, ssengar, vkuznets, tglx,
	linux-hyperv, netdev, linux-kernel, linux-rdma, stable,
	schakrabarti
In-Reply-To: <1688374171-10534-1-git-send-email-schakrabarti@linux.microsoft.com>

From: Souradeep Chakrabarti <schakrabarti@linux.microsoft.com>
Date: Mon,  3 Jul 2023 01:49:31 -0700

> From: Souradeep Chakrabarti <schakrabarti@linux.microsoft.com>

Please sync your Git name and Git mail account settings, so that your
own patches won't have "From:" when sending. From what I see, you need
to correct first letters of name and surname to capital in the Git email
settings block.

> 
> When unloading the MANA driver, mana_dealloc_queues() waits for the MANA
> hardware to complete any inflight packets and set the pending send count
> to zero. But if the hardware has failed, mana_dealloc_queues()
> could wait forever.
> 
> Fix this by adding a timeout to the wait. Set the timeout to 120 seconds,
> which is a somewhat arbitrary value that is more than long enough for
> functional hardware to complete any sends.
> 
> Signed-off-by: Souradeep Chakrabarti <schakrabarti@linux.microsoft.com>

Where's "Fixes:" tagging the blamed commit?

> ---
> V3 -> V4:
> * Fixed the commit message to describe the context.
> * Removed the vf_unload_timeout, as it is not required.
> ---
>  drivers/net/ethernet/microsoft/mana/mana_en.c | 26 ++++++++++++++++---
>  1 file changed, 23 insertions(+), 3 deletions(-)
> 
> diff --git a/drivers/net/ethernet/microsoft/mana/mana_en.c b/drivers/net/ethernet/microsoft/mana/mana_en.c
> index a499e460594b..d26f1da70411 100644
> --- a/drivers/net/ethernet/microsoft/mana/mana_en.c
> +++ b/drivers/net/ethernet/microsoft/mana/mana_en.c
> @@ -2346,7 +2346,10 @@ static int mana_dealloc_queues(struct net_device *ndev)
>  {
>  	struct mana_port_context *apc = netdev_priv(ndev);
>  	struct gdma_dev *gd = apc->ac->gdma_dev;
> +	unsigned long timeout;
>  	struct mana_txq *txq;
> +	struct sk_buff *skb;
> +	struct mana_cq *cq;
>  	int i, err;
>  
>  	if (apc->port_is_up)
> @@ -2363,15 +2366,32 @@ static int mana_dealloc_queues(struct net_device *ndev)
>  	 * to false, but it doesn't matter since mana_start_xmit() drops any
>  	 * new packets due to apc->port_is_up being false.
>  	 *
> -	 * Drain all the in-flight TX packets
> +	 * Drain all the in-flight TX packets.
> +	 * A timeout of 120 seconds for all the queues is used.
> +	 * This will break the while loop when h/w is not responding.
> +	 * This value of 120 has been decided here considering max
> +	 * number of queues.
>  	 */
> +
> +	timeout = jiffies + 120 * HZ;

Why not initialize it right when declaring?

>  	for (i = 0; i < apc->num_queues; i++) {
>  		txq = &apc->tx_qp[i].txq;
> -
> -		while (atomic_read(&txq->pending_sends) > 0)
> +		while (atomic_read(&txq->pending_sends) > 0 &&
> +		       time_before(jiffies, timeout)) {
>  			usleep_range(1000, 2000);> +		}
>  	}

120 seconds by 2 msec step is 60000 iterations, by 1 msec is 120000
iterations. I know usleep_range() often is much less precise, but still.
Do you really need that much time? Has this been measured during the
tests that it can take up to 120 seconds or is it just some random value
that "should be enough"?
If you really need 120 seconds, I'd suggest using a timer / delayed work
instead of wasting resources.

>  
> +	for (i = 0; i < apc->num_queues; i++) {
> +		txq = &apc->tx_qp[i].txq;
> +		cq = &apc->tx_qp[i].tx_cq;

cq can be just &txq->tx_cq.

> +		while (atomic_read(&txq->pending_sends)) {
> +			skb = skb_dequeue(&txq->pending_skbs);
> +			mana_unmap_skb(skb, apc);
> +			napi_consume_skb(skb, cq->budget);

(you already have comment about this one)

> +			atomic_sub(1, &txq->pending_sends);
> +		}
> +	}
>  	/* We're 100% sure the queues can no longer be woken up, because
>  	 * we're sure now mana_poll_tx_cq() can't be running.
>  	 */

Thanks,
Olek

^ permalink raw reply

* RE: [PATCH V4 net] net: mana: Fix MANA VF unload when host is unresponsive
From: Haiyang Zhang @ 2023-07-03 15:51 UTC (permalink / raw)
  To: souradeep chakrabarti, KY Srinivasan, wei.liu@kernel.org,
	Dexuan Cui, davem@davemloft.net, edumazet@google.com,
	kuba@kernel.org, pabeni@redhat.com, Long Li, Ajay Sharma,
	leon@kernel.org, cai.huoqing@linux.dev,
	ssengar@linux.microsoft.com, vkuznets@redhat.com,
	tglx@linutronix.de, linux-hyperv@vger.kernel.org,
	netdev@vger.kernel.org, linux-kernel@vger.kernel.org,
	linux-rdma@vger.kernel.org
  Cc: stable@vger.kernel.org, Souradeep Chakrabarti
In-Reply-To: <1688374171-10534-1-git-send-email-schakrabarti@linux.microsoft.com>



> -----Original Message-----
> From: souradeep chakrabarti <schakrabarti@linux.microsoft.com>
> Sent: Monday, July 3, 2023 4:50 AM
> To: KY Srinivasan <kys@microsoft.com>; Haiyang Zhang
> <haiyangz@microsoft.com>; wei.liu@kernel.org; Dexuan Cui
> <decui@microsoft.com>; davem@davemloft.net; edumazet@google.com;
> kuba@kernel.org; pabeni@redhat.com; Long Li <longli@microsoft.com>; Ajay
> Sharma <sharmaajay@microsoft.com>; leon@kernel.org;
> cai.huoqing@linux.dev; ssengar@linux.microsoft.com; vkuznets@redhat.com;
> tglx@linutronix.de; linux-hyperv@vger.kernel.org; netdev@vger.kernel.org;
> linux-kernel@vger.kernel.org; linux-rdma@vger.kernel.org
> Cc: stable@vger.kernel.org; Souradeep Chakrabarti
> <schakrabarti@microsoft.com>; Souradeep Chakrabarti
> <schakrabarti@linux.microsoft.com>
> Subject: [PATCH V4 net] net: mana: Fix MANA VF unload when host is
> unresponsive
> 
> From: Souradeep Chakrabarti <schakrabarti@linux.microsoft.com>
> 
> When unloading the MANA driver, mana_dealloc_queues() waits for the MANA
> hardware to complete any inflight packets and set the pending send count
> to zero. But if the hardware has failed, mana_dealloc_queues()
> could wait forever.
> 
> Fix this by adding a timeout to the wait. Set the timeout to 120 seconds,
> which is a somewhat arbitrary value that is more than long enough for
> functional hardware to complete any sends.
> 
> Signed-off-by: Souradeep Chakrabarti <schakrabarti@linux.microsoft.com>
> ---
> V3 -> V4:
> * Fixed the commit message to describe the context.
> * Removed the vf_unload_timeout, as it is not required.
> ---
>  drivers/net/ethernet/microsoft/mana/mana_en.c | 26 ++++++++++++++++---
>  1 file changed, 23 insertions(+), 3 deletions(-)
> 
> diff --git a/drivers/net/ethernet/microsoft/mana/mana_en.c
> b/drivers/net/ethernet/microsoft/mana/mana_en.c
> index a499e460594b..d26f1da70411 100644
> --- a/drivers/net/ethernet/microsoft/mana/mana_en.c
> +++ b/drivers/net/ethernet/microsoft/mana/mana_en.c
> @@ -2346,7 +2346,10 @@ static int mana_dealloc_queues(struct net_device
> *ndev)
>  {
>  	struct mana_port_context *apc = netdev_priv(ndev);
>  	struct gdma_dev *gd = apc->ac->gdma_dev;
> +	unsigned long timeout;
>  	struct mana_txq *txq;
> +	struct sk_buff *skb;
> +	struct mana_cq *cq;
>  	int i, err;
> 
>  	if (apc->port_is_up)
> @@ -2363,15 +2366,32 @@ static int mana_dealloc_queues(struct net_device
> *ndev)
>  	 * to false, but it doesn't matter since mana_start_xmit() drops any
>  	 * new packets due to apc->port_is_up being false.
>  	 *
> -	 * Drain all the in-flight TX packets
> +	 * Drain all the in-flight TX packets.
> +	 * A timeout of 120 seconds for all the queues is used.
> +	 * This will break the while loop when h/w is not responding.
> +	 * This value of 120 has been decided here considering max
> +	 * number of queues.
>  	 */
> +
> +	timeout = jiffies + 120 * HZ;
>  	for (i = 0; i < apc->num_queues; i++) {
>  		txq = &apc->tx_qp[i].txq;
> -
> -		while (atomic_read(&txq->pending_sends) > 0)
> +		while (atomic_read(&txq->pending_sends) > 0 &&
> +		       time_before(jiffies, timeout)) {
>  			usleep_range(1000, 2000);
> +		}
>  	}
> 
> +	for (i = 0; i < apc->num_queues; i++) {
> +		txq = &apc->tx_qp[i].txq;
> +		cq = &apc->tx_qp[i].tx_cq;
> +		while (atomic_read(&txq->pending_sends)) {
> +			skb = skb_dequeue(&txq->pending_skbs);
> +			mana_unmap_skb(skb, apc);
> +			napi_consume_skb(skb, cq->budget);

This is not in NAPI context, so it should be dev_consume_skb_any()

Thanks,
- Haiyang



^ permalink raw reply

* Re: [Patch v3] net: mana: Batch ringing RX queue doorbell on receiving packets
From: Paolo Abeni @ 2023-07-03 10:14 UTC (permalink / raw)
  To: Long Li, Jakub Kicinski
  Cc: Greg KH, longli@linuxonhyperv.com, Jason Gunthorpe,
	Leon Romanovsky, Ajay Sharma, Dexuan Cui, KY Srinivasan,
	Haiyang Zhang, Wei Liu, David S. Miller, Eric Dumazet,
	linux-rdma@vger.kernel.org, linux-hyperv@vger.kernel.org,
	netdev@vger.kernel.org, linux-kernel@vger.kernel.org,
	stable@vger.kernel.org
In-Reply-To: <PH7PR21MB3263ED62B45BF78370350AD7CE28A@PH7PR21MB3263.namprd21.prod.outlook.com>

On Sun, 2023-07-02 at 20:18 +0000, Long Li wrote:
> > > > > > > > Subject: Re: [Patch v3] net: mana: Batch ringing RX
> > > > > > > > queue
> > > > > > > > doorbell
> > > > > > > > on receiving
> > > > > > > > packets
> > > > > > > > 
> > > > > > > > On Fri, 30 Jun 2023 20:42:28 +0000 Long Li wrote:
> > > > > > > > > > > > > > > > > > > > 5.15 and kernel 6.1. (those
> > > > > > > > > > > > > > > > > > > > kernels are longterm)
> > > > > > > > > > > > > > > > > > > > They need
> > > > > > > > > > > > > > > > > > > > this
> > > > > > > > > > > > > > > > > > > > fix to achieve the performance
> > > > > > > > > > > > > > > > > > > > target.
> > > > > > > > > > > > > > > > 
> > > > > > > > > > > > > > > > Why can't they be upgraded to get that
> > > > > > > > > > > > > > > > performance
> > > > > > > > > > > > > > > > target, and
> > > > > > > > > > > > > > > > all
> > > > > > > > > > > > > > > > the other goodness that those kernels
> > > > > > > > > > > > > > > > have? We don't
> > > > > > > > > > > > > > > > normally
> > > > > > > > > > > > > > > > backport new features, right?
> > > > > > > > > > > > 
> > > > > > > > > > > > I think this should be considered as a fix, not
> > > > > > > > > > > > a new
> > > > > > > > > > > > feature.
> > > > > > > > > > > > 
> > > > > > > > > > > > MANA is designed to be 200GB full duplex at the
> > > > > > > > > > > > start. Due
> > > > > > > > > > > > to
> > > > > > > > > > > > lack of
> > > > > > > > > > > > hardware testing capability at early stage of
> > > > > > > > > > > > the project,
> > > > > > > > > > > > we
> > > > > > > > > > > > could
> > > > > > > > > > > > only test 100GB for the Linux driver. When
> > > > > > > > > > > > hardware is
> > > > > > > > > > > > fully
> > > > > > > > > > > > capable
> > > > > > > > > > > > of reaching designed spec, this bug in the
> > > > > > > > > > > > Linux driver
> > > > > > > > > > > > shows up.
> > > > > > > > 
> > > > > > > > That part we understand.
> > > > > > > > 
> > > > > > > > If I were you I'd try to convince Greg and Paolo that
> > > > > > > > the
> > > > > > > > change is
> > > > > > > > small and
> > > > > > > > significant for user experience. And answer Greg's
> > > > > > > > question why
> > > > > > > > upgrading the
> > > > > > > > kernel past 6.1 is a challenge in your environment.
> > > > 
> > > > I was under the impression that this patch was considered to be
> > > > a
> > > > feature, 
> > > > not a bug fix. I was trying to justify that the "Fixes:" tag
> > > > was
> > > > needed. 
> > > > 
> > > > I apologize for misunderstanding this.
> > > > 
> > > > Without this fix, it's not possible to run a typical workload
> > > > designed for 200Gb
> > > > physical link speed.
> > > > 
> > > > We see a large number of customers and Linux distributions
> > > > committed
> > > > on 5.15 
> > > > and 6.1 kernels. They planned the product cycles and
> > > > certification
> > > > processes 
> > > > around these longterm kernel versions. It's difficult for them
> > > > to
> > > > upgrade to newer
> > > > kernel versions.

I think there are some misunderstanding WRT distros and stable kernels.
(Commercial) distros will backport the patch as needed, regardless such
patch landing in the 5.15 upstream tree or not. Individual users
running their own vanilla 5.15 kernel can't expect performance
improvement landing there.

All in all I feel undecided. I would endorse this change going trough
net-next (without the stable tag). I would feel less torn with this
change targeting -net without the stable tag. Targeting -net with the
stable tag sounds a bit too much to me.

Cheers,
Paolo


^ permalink raw reply

* [PATCH V4 net] net: mana: Fix MANA VF unload when host is unresponsive
From: souradeep chakrabarti @ 2023-07-03  8:49 UTC (permalink / raw)
  To: kys, haiyangz, wei.liu, decui, davem, edumazet, kuba, pabeni,
	longli, sharmaajay, leon, cai.huoqing, ssengar, vkuznets, tglx,
	linux-hyperv, netdev, linux-kernel, linux-rdma
  Cc: stable, schakrabarti, Souradeep Chakrabarti

From: Souradeep Chakrabarti <schakrabarti@linux.microsoft.com>

When unloading the MANA driver, mana_dealloc_queues() waits for the MANA
hardware to complete any inflight packets and set the pending send count
to zero. But if the hardware has failed, mana_dealloc_queues()
could wait forever.

Fix this by adding a timeout to the wait. Set the timeout to 120 seconds,
which is a somewhat arbitrary value that is more than long enough for
functional hardware to complete any sends.

Signed-off-by: Souradeep Chakrabarti <schakrabarti@linux.microsoft.com>
---
V3 -> V4:
* Fixed the commit message to describe the context.
* Removed the vf_unload_timeout, as it is not required.
---
 drivers/net/ethernet/microsoft/mana/mana_en.c | 26 ++++++++++++++++---
 1 file changed, 23 insertions(+), 3 deletions(-)

diff --git a/drivers/net/ethernet/microsoft/mana/mana_en.c b/drivers/net/ethernet/microsoft/mana/mana_en.c
index a499e460594b..d26f1da70411 100644
--- a/drivers/net/ethernet/microsoft/mana/mana_en.c
+++ b/drivers/net/ethernet/microsoft/mana/mana_en.c
@@ -2346,7 +2346,10 @@ static int mana_dealloc_queues(struct net_device *ndev)
 {
 	struct mana_port_context *apc = netdev_priv(ndev);
 	struct gdma_dev *gd = apc->ac->gdma_dev;
+	unsigned long timeout;
 	struct mana_txq *txq;
+	struct sk_buff *skb;
+	struct mana_cq *cq;
 	int i, err;
 
 	if (apc->port_is_up)
@@ -2363,15 +2366,32 @@ static int mana_dealloc_queues(struct net_device *ndev)
 	 * to false, but it doesn't matter since mana_start_xmit() drops any
 	 * new packets due to apc->port_is_up being false.
 	 *
-	 * Drain all the in-flight TX packets
+	 * Drain all the in-flight TX packets.
+	 * A timeout of 120 seconds for all the queues is used.
+	 * This will break the while loop when h/w is not responding.
+	 * This value of 120 has been decided here considering max
+	 * number of queues.
 	 */
+
+	timeout = jiffies + 120 * HZ;
 	for (i = 0; i < apc->num_queues; i++) {
 		txq = &apc->tx_qp[i].txq;
-
-		while (atomic_read(&txq->pending_sends) > 0)
+		while (atomic_read(&txq->pending_sends) > 0 &&
+		       time_before(jiffies, timeout)) {
 			usleep_range(1000, 2000);
+		}
 	}
 
+	for (i = 0; i < apc->num_queues; i++) {
+		txq = &apc->tx_qp[i].txq;
+		cq = &apc->tx_qp[i].tx_cq;
+		while (atomic_read(&txq->pending_sends)) {
+			skb = skb_dequeue(&txq->pending_skbs);
+			mana_unmap_skb(skb, apc);
+			napi_consume_skb(skb, cq->budget);
+			atomic_sub(1, &txq->pending_sends);
+		}
+	}
 	/* We're 100% sure the queues can no longer be woken up, because
 	 * we're sure now mana_poll_tx_cq() can't be running.
 	 */
-- 
2.34.1


^ permalink raw reply related

* Re: [PATCH] hv_netvsc: support a new host capability AllowRscDisabledStatus
From: Shradha Gupta @ 2023-07-03  4:37 UTC (permalink / raw)
  To: Haiyang Zhang
  Cc: linux-kernel@vger.kernel.org, linux-hyperv@vger.kernel.org,
	netdev@vger.kernel.org, Eric Dumazet, Jakub Kicinski, Paolo Abeni,
	KY Srinivasan, Wei Liu, Dexuan Cui, Long Li,
	Michael Kelley (LINUX), David S. Miller
In-Reply-To: <PH7PR21MB3116F77C196628B6BBADA3C7CA25A@PH7PR21MB3116.namprd21.prod.outlook.com>

On Thu, Jun 29, 2023 at 12:44:26PM +0000, Haiyang Zhang wrote:
> 
> 
> > -----Original Message-----
> > From: Shradha Gupta <shradhagupta@linux.microsoft.com>
> > Sent: Thursday, June 29, 2023 5:59 AM
> > To: linux-kernel@vger.kernel.org; linux-hyperv@vger.kernel.org;
> > netdev@vger.kernel.org
> > Cc: Shradha Gupta <shradhagupta@linux.microsoft.com>; Eric Dumazet
> > <edumazet@google.com>; Jakub Kicinski <kuba@kernel.org>; Paolo Abeni
> > <pabeni@redhat.com>; KY Srinivasan <kys@microsoft.com>; Haiyang Zhang
> > <haiyangz@microsoft.com>; Wei Liu <wei.liu@kernel.org>; Dexuan Cui
> > <decui@microsoft.com>; Long Li <longli@microsoft.com>; Michael Kelley
> > (LINUX) <mikelley@microsoft.com>; David S. Miller <davem@davemloft.net>
> > Subject: [PATCH] hv_netvsc: support a new host capability
> > AllowRscDisabledStatus
> > 
> > A future Azure host update has the potential to change RSC behavior
> > in the VMs. To avoid this invisble change, Vswitch will check the
> > netvsc version of a VM before sending its RSC capabilities, and will
> > always indicate that the host performs RSC if the VM doesn't have an
> > updated netvsc driver regardless of the actual host RSC capabilities.
> > Netvsc now advertises a new capability: AllowRscDisabledStatus
> > The host will check for this capability before sending RSC status,
> > and if a VM does not have this capability it will send RSC enabled
> > status regardless of host RSC settings
> > 
> > Signed-off-by: Shradha Gupta <shradhagupta@linux.microsoft.com>
> > ---
> >  drivers/net/hyperv/hyperv_net.h | 3 +++
> >  drivers/net/hyperv/netvsc.c     | 8 ++++++++
> >  2 files changed, 11 insertions(+)
> > 
> > diff --git a/drivers/net/hyperv/hyperv_net.h b/drivers/net/hyperv/hyperv_net.h
> > index dd5919ec408b..218e0f31dd66 100644
> > --- a/drivers/net/hyperv/hyperv_net.h
> > +++ b/drivers/net/hyperv/hyperv_net.h
> > @@ -572,6 +572,9 @@ struct nvsp_2_vsc_capability {
> >  			u64 teaming:1;
> >  			u64 vsubnetid:1;
> >  			u64 rsc:1;
> > +			u64 timestamp:1;
> > +			u64 reliablecorrelationid:1;
> > +			u64 allowrscdisabledstatus:1;
> >  		};
> >  	};
> >  } __packed;
> > diff --git a/drivers/net/hyperv/netvsc.c b/drivers/net/hyperv/netvsc.c
> > index da737d959e81..2eb1e85ba940 100644
> > --- a/drivers/net/hyperv/netvsc.c
> > +++ b/drivers/net/hyperv/netvsc.c
> > @@ -619,6 +619,14 @@ static int negotiate_nvsp_ver(struct hv_device
> > *device,
> >  	init_packet->msg.v2_msg.send_ndis_config.mtu = ndev->mtu +
> > ETH_HLEN;
> >  	init_packet->msg.v2_msg.send_ndis_config.capability.ieee8021q = 1;
> > 
> > +	/* Don't need a version check while setting this bit because if we
> > +	 * have a New VM on an old host, the VM will set the bit but the host
> > +	 * won't check it. If we have an old VM on a new host, the host will
> > +	 * check the bit, see its zero, and it'll know the VM has an
> > +	 * older NetVsc
> > +	 */
> > +	init_packet-
> > >msg.v2_msg.send_ndis_config.capability.allowrscdisabledstatus = 1;
> 
> Have you tested on the new host to verify: Before this patch, the host shows
> RSC status on, and after this patch the host shows it's off?
I have completed the patch sanilty tests. We are working on an upgraded host setup
to test the rsc specific changes, will update with results soon.
> 
> Thanks,
> - Haiyang

^ permalink raw reply

* RE: [Patch v3] net: mana: Batch ringing RX queue doorbell on receiving packets
From: Long Li @ 2023-07-02 20:18 UTC (permalink / raw)
  To: Jakub Kicinski
  Cc: Greg KH, Paolo Abeni, longli@linuxonhyperv.com, Jason Gunthorpe,
	Leon Romanovsky, Ajay Sharma, Dexuan Cui, KY Srinivasan,
	Haiyang Zhang, Wei Liu, David S. Miller, Eric Dumazet,
	linux-rdma@vger.kernel.org, linux-hyperv@vger.kernel.org,
	netdev@vger.kernel.org, linux-kernel@vger.kernel.org,
	stable@vger.kernel.org
In-Reply-To: <20230630163805.79c0bdf5@kernel.org>

> Subject: Re: [Patch v3] net: mana: Batch ringing RX queue doorbell on receiving
> packets
> 
> On Fri, 30 Jun 2023 20:42:28 +0000 Long Li wrote:
> > > > 5.15 and kernel 6.1. (those kernels are longterm) They need this
> > > > fix to achieve the performance target.
> > >
> > > Why can't they be upgraded to get that performance target, and all
> > > the other goodness that those kernels have?  We don't normally
> > > backport new features, right?
> >
> > I think this should be considered as a fix, not a new feature.
> >
> > MANA is designed to be 200GB full duplex at the start.  Due to lack of
> > hardware testing capability at early stage of the project, we could
> > only test 100GB for the Linux driver. When hardware is fully capable
> > of reaching designed spec, this bug in the Linux driver shows up.
> 
> That part we understand.
> 
> If I were you I'd try to convince Greg and Paolo that the change is small and
> significant for user experience. And answer Greg's question why upgrading the
> kernel past 6.1 is a challenge in your environment.

I was under the impression that this patch was considered to be a feature, 
not a bug fix. I was trying to justify that the "Fixes:" tag was needed. 

I apologize for misunderstanding this.

Without this fix, it's not possible to run a typical workload designed for 200Gb
physical link speed.

We see a large number of customers and Linux distributions committed on 5.15 
and 6.1 kernels. They planned the product cycles and certification processes 
around these longterm kernel versions. It's difficult for them to upgrade to newer
kernel versions.

Thanks,

Long

^ permalink raw reply

* Re: [Patch v3] net: mana: Batch ringing RX queue doorbell on receiving packets
From: Jakub Kicinski @ 2023-06-30 23:38 UTC (permalink / raw)
  To: Long Li
  Cc: Greg KH, Paolo Abeni, longli@linuxonhyperv.com, Jason Gunthorpe,
	Leon Romanovsky, Ajay Sharma, Dexuan Cui, KY Srinivasan,
	Haiyang Zhang, Wei Liu, David S. Miller, Eric Dumazet,
	linux-rdma@vger.kernel.org, linux-hyperv@vger.kernel.org,
	netdev@vger.kernel.org, linux-kernel@vger.kernel.org,
	stable@vger.kernel.org
In-Reply-To: <PH7PR21MB3263330E6A32D81D52B955FBCE2AA@PH7PR21MB3263.namprd21.prod.outlook.com>

On Fri, 30 Jun 2023 20:42:28 +0000 Long Li wrote:
> > > 5.15 and kernel 6.1. (those kernels are longterm) They need this fix to achieve
> > > the performance target.
> > 
> > Why can't they be upgraded to get that performance target, and all the other
> > goodness that those kernels have?  We don't normally backport new features,
> > right?  
> 
> I think this should be considered as a fix, not a new feature.
> 
> MANA is designed to be 200GB full duplex at the start.  Due to lack of
> hardware testing capability at early stage of the project, we could only test 100GB
> for the Linux driver. When hardware is fully capable of reaching designed spec,
> this bug in the Linux driver shows up.

That part we understand.

If I were you I'd try to convince Greg and Paolo that the change is
small and significant for user experience. And answer Greg's question
why upgrading the kernel past 6.1 is a challenge in your environment.

^ permalink raw reply

* RE: [Patch v3] net: mana: Batch ringing RX queue doorbell on receiving packets
From: Long Li @ 2023-06-30 20:42 UTC (permalink / raw)
  To: Greg KH
  Cc: Paolo Abeni, longli@linuxonhyperv.com, Jason Gunthorpe,
	Leon Romanovsky, Ajay Sharma, Dexuan Cui, KY Srinivasan,
	Haiyang Zhang, Wei Liu, David S. Miller, Eric Dumazet,
	Jakub Kicinski, linux-rdma@vger.kernel.org,
	linux-hyperv@vger.kernel.org, netdev@vger.kernel.org,
	linux-kernel@vger.kernel.org, stable@vger.kernel.org
In-Reply-To: <2023063001-agenda-spent-83c6@gregkh>

> Subject: Re: [Patch v3] net: mana: Batch ringing RX queue doorbell on
> receiving packets
> 
> On Fri, Jun 30, 2023 at 05:31:48PM +0000, Long Li wrote:
> > > Subject: Re: [Patch v3] net: mana: Batch ringing RX queue doorbell
> > > on receiving packets
> > >
> > > On Thu, 2023-06-29 at 18:18 +0000, Long Li wrote:
> > > > > Subject: Re: [Patch v3] net: mana: Batch ringing RX queue
> > > > > doorbell on receiving packets
> > > > >
> > > > > On Mon, 2023-06-26 at 16:57 -0700, longli@linuxonhyperv.com
> wrote:
> > > > > > From: Long Li <longli@microsoft.com>
> > > > > >
> > > > > > It's inefficient to ring the doorbell page every time a WQE is
> > > > > > posted to the received queue. Excessive MMIO writes result in
> > > > > > CPU spending more time waiting on LOCK instructions (atomic
> > > > > > operations), resulting in poor scaling performance.
> > > > > >
> > > > > > Move the code for ringing doorbell page to where after we have
> > > > > > posted all WQEs to the receive queue during a callback from
> > > > > > napi_poll().
> > > > > >
> > > > > > With this change, tests showed an improvement from 120G/s to
> > > > > > 160G/s on a 200G physical link, with 16 or 32 hardware queues.
> > > > > >
> > > > > > Tests showed no regression in network latency benchmarks on
> > > > > > single connection.
> > > > > >
> > > > > > While we are making changes in this code path, change the code
> > > > > > for ringing doorbell to set the WQE_COUNT to 0 for Receive
> > > > > > Queue. The hardware specification specifies that it should set to 0.
> > > > > > Although
> > > > > > currently the hardware doesn't enforce the check, in the
> > > > > > future releases it may do.
> > > > > >
> > > > > > Cc: stable@vger.kernel.org
> > > > > > Fixes: ca9c54d2d6a5 ("net: mana: Add a driver for Microsoft
> > > > > > Azure Network Adapter (MANA)")
> > > > >
> > > > > Uhmmm... this looks like a performance improvement to me, more
> > > > > suitable for the net-next tree ?!? (Note that net-next is closed
> > > > > now).
> > > >
> > > > This issue is a blocker for usage on 200G physical link. I think
> > > > it can be categorized as a fix.
> > >
> > > Let me ask the question the other way around: is there any specific
> > > reason to have this fix into 6.5 and all the way back to 5.13?
> > > Especially the latest bit (CC-ing stable) looks at least debatable.
> >
> > There are many deployed Linux distributions with MANA driver on kernel
> 5.15 and kernel 6.1. (those kernels are longterm) They need this fix to achieve
> the performance target.
> 
> Why can't they be upgraded to get that performance target, and all the other
> goodness that those kernels have?  We don't normally backport new features,
> right?

I think this should be considered as a fix, not a new feature.

MANA is designed to be 200GB full duplex at the start.  Due to lack of
hardware testing capability at early stage of the project, we could only test 100GB
for the Linux driver. When hardware is fully capable of reaching designed spec,
this bug in the Linux driver shows up.

Thanks,

Long

> 
> thanks,
> 
> greg k-h

^ permalink raw reply

* Re: [Patch v3] net: mana: Batch ringing RX queue doorbell on receiving packets
From: Greg KH @ 2023-06-30 19:46 UTC (permalink / raw)
  To: Long Li
  Cc: Paolo Abeni, longli@linuxonhyperv.com, Jason Gunthorpe,
	Leon Romanovsky, Ajay Sharma, Dexuan Cui, KY Srinivasan,
	Haiyang Zhang, Wei Liu, David S. Miller, Eric Dumazet,
	Jakub Kicinski, linux-rdma@vger.kernel.org,
	linux-hyperv@vger.kernel.org, netdev@vger.kernel.org,
	linux-kernel@vger.kernel.org, stable@vger.kernel.org
In-Reply-To: <PH7PR21MB326330931CFDDA96E287E470CE2AA@PH7PR21MB3263.namprd21.prod.outlook.com>

On Fri, Jun 30, 2023 at 05:31:48PM +0000, Long Li wrote:
> > Subject: Re: [Patch v3] net: mana: Batch ringing RX queue doorbell on
> > receiving packets
> > 
> > On Thu, 2023-06-29 at 18:18 +0000, Long Li wrote:
> > > > Subject: Re: [Patch v3] net: mana: Batch ringing RX queue doorbell
> > > > on receiving packets
> > > >
> > > > On Mon, 2023-06-26 at 16:57 -0700, longli@linuxonhyperv.com wrote:
> > > > > From: Long Li <longli@microsoft.com>
> > > > >
> > > > > It's inefficient to ring the doorbell page every time a WQE is
> > > > > posted to the received queue. Excessive MMIO writes result in CPU
> > > > > spending more time waiting on LOCK instructions (atomic
> > > > > operations), resulting in poor scaling performance.
> > > > >
> > > > > Move the code for ringing doorbell page to where after we have
> > > > > posted all WQEs to the receive queue during a callback from
> > > > > napi_poll().
> > > > >
> > > > > With this change, tests showed an improvement from 120G/s to
> > > > > 160G/s on a 200G physical link, with 16 or 32 hardware queues.
> > > > >
> > > > > Tests showed no regression in network latency benchmarks on single
> > > > > connection.
> > > > >
> > > > > While we are making changes in this code path, change the code for
> > > > > ringing doorbell to set the WQE_COUNT to 0 for Receive Queue. The
> > > > > hardware specification specifies that it should set to 0.
> > > > > Although
> > > > > currently the hardware doesn't enforce the check, in the future
> > > > > releases it may do.
> > > > >
> > > > > Cc: stable@vger.kernel.org
> > > > > Fixes: ca9c54d2d6a5 ("net: mana: Add a driver for Microsoft Azure
> > > > > Network Adapter (MANA)")
> > > >
> > > > Uhmmm... this looks like a performance improvement to me, more
> > > > suitable for the net-next tree ?!? (Note that net-next is closed
> > > > now).
> > >
> > > This issue is a blocker for usage on 200G physical link. I think it
> > > can be categorized as a fix.
> > 
> > Let me ask the question the other way around: is there any specific reason to
> > have this fix into 6.5 and all the way back to 5.13?
> > Especially the latest bit (CC-ing stable) looks at least debatable.
> 
> There are many deployed Linux distributions with MANA driver on kernel 5.15 and kernel 6.1. (those kernels are longterm) They need this fix to achieve the performance target.

Why can't they be upgraded to get that performance target, and all the
other goodness that those kernels have?  We don't normally backport new
features, right?

thanks,

greg k-h

^ permalink raw reply

* RE: [Patch v3] net: mana: Batch ringing RX queue doorbell on receiving packets
From: Long Li @ 2023-06-30 17:31 UTC (permalink / raw)
  To: Paolo Abeni, longli@linuxonhyperv.com, Jason Gunthorpe,
	Leon Romanovsky, Ajay Sharma, Dexuan Cui, KY Srinivasan,
	Haiyang Zhang, Wei Liu, David S. Miller, Eric Dumazet,
	Jakub Kicinski
  Cc: linux-rdma@vger.kernel.org, linux-hyperv@vger.kernel.org,
	netdev@vger.kernel.org, linux-kernel@vger.kernel.org,
	stable@vger.kernel.org
In-Reply-To: <e5c3e5e5033290c2228bbad0307334a964eb065e.camel@redhat.com>

> Subject: Re: [Patch v3] net: mana: Batch ringing RX queue doorbell on
> receiving packets
> 
> On Thu, 2023-06-29 at 18:18 +0000, Long Li wrote:
> > > Subject: Re: [Patch v3] net: mana: Batch ringing RX queue doorbell
> > > on receiving packets
> > >
> > > On Mon, 2023-06-26 at 16:57 -0700, longli@linuxonhyperv.com wrote:
> > > > From: Long Li <longli@microsoft.com>
> > > >
> > > > It's inefficient to ring the doorbell page every time a WQE is
> > > > posted to the received queue. Excessive MMIO writes result in CPU
> > > > spending more time waiting on LOCK instructions (atomic
> > > > operations), resulting in poor scaling performance.
> > > >
> > > > Move the code for ringing doorbell page to where after we have
> > > > posted all WQEs to the receive queue during a callback from
> > > > napi_poll().
> > > >
> > > > With this change, tests showed an improvement from 120G/s to
> > > > 160G/s on a 200G physical link, with 16 or 32 hardware queues.
> > > >
> > > > Tests showed no regression in network latency benchmarks on single
> > > > connection.
> > > >
> > > > While we are making changes in this code path, change the code for
> > > > ringing doorbell to set the WQE_COUNT to 0 for Receive Queue. The
> > > > hardware specification specifies that it should set to 0.
> > > > Although
> > > > currently the hardware doesn't enforce the check, in the future
> > > > releases it may do.
> > > >
> > > > Cc: stable@vger.kernel.org
> > > > Fixes: ca9c54d2d6a5 ("net: mana: Add a driver for Microsoft Azure
> > > > Network Adapter (MANA)")
> > >
> > > Uhmmm... this looks like a performance improvement to me, more
> > > suitable for the net-next tree ?!? (Note that net-next is closed
> > > now).
> >
> > This issue is a blocker for usage on 200G physical link. I think it
> > can be categorized as a fix.
> 
> Let me ask the question the other way around: is there any specific reason to
> have this fix into 6.5 and all the way back to 5.13?
> Especially the latest bit (CC-ing stable) looks at least debatable.

There are many deployed Linux distributions with MANA driver on kernel 5.15 and kernel 6.1. (those kernels are longterm) They need this fix to achieve the performance target.

Thanks,

Long

^ permalink raw reply

* Re: [PATCH 07/12] arch/x86: Declare edid_info in <asm/screen_info.h>
From: Arnd Bergmann @ 2023-06-30 11:53 UTC (permalink / raw)
  To: Thomas Zimmermann, Helge Deller, Daniel Vetter, Dave Airlie
  Cc: linux-kernel, linux-alpha, linux-arm-kernel, linux-efi,
	linux-csky@vger.kernel.org, linux-hexagon, linux-ia64, loongarch,
	linux-mips, linuxppc-dev, linux-riscv, linux-sh, sparclinux,
	dri-devel, linux-hyperv, linux-fbdev, linux-staging, Linux-Arch,
	Thomas Gleixner, Ingo Molnar, Borislav Petkov, Dave Hansen, x86,
	H. Peter Anvin, Kees Cook, Paul E. McKenney, Peter Zijlstra,
	Andrew Morton, Frederic Weisbecker, Nicholas Piggin,
	Ard Biesheuvel, Sami Tolvanen, Juerg Haefliger
In-Reply-To: <ef7b3899-7d18-8018-47fa-aac0efaa61f4@suse.de>

On Fri, Jun 30, 2023, at 09:46, Thomas Zimmermann wrote:
> Am 29.06.23 um 15:21 schrieb Arnd Bergmann:
>> On Thu, Jun 29, 2023, at 15:01, Thomas Zimmermann wrote:
>>> Am 29.06.23 um 14:35 schrieb Arnd Bergmann:
>>>> On Thu, Jun 29, 2023, at 13:45, Thomas Zimmermann wrote:
>
>>>
>>> FIRMWARE_EDID is a user-selectable feature, while ARCH_HAS_EDID_INFO
>>> announces an architecture feature. They do different things.
>> 
>> I still have trouble seeing the difference.
>
> The idea here is that ARCH_HAS_ signals the architecture's support for 
> the feature.  Drivers set 'depends on' in their Kconfig.
>
> Another Kconfig token, VIDEO_SCREEN_INFO or FIRMWARE_EDID, would then 
> actually enable the feature.  Drivers select VIDEO_SCREEN_INFO or 
> FIRMWARE_EDID and the architectures contains code like

Fair enough. In that case, I guess FIRMWARE_EDID will just depend on
ARCH_HAS_EDID_INFO, or possibly "depends on FIRMWARE_EDID || EFI"
after it starts calling into an EFI specific function, right?

> #ifdef VIDEO_SCREEN_INFO
> struct screen_info screen_info = {
> 	/* set values here */
> }
> #endif
>
> This allows us to disable code that requires screen_info/edid_info, but 
> also disable screen_info/edid_info unless such code has been enabled in 
> the kernel config.
>
> Some architectures currently mimic this by guarding screen_info with 
> ifdef CONFIG_VT or similar. I'd like to make this more flexible. The 
> cost of a few more internal Kconfig tokens seems negligible.

I definitely get it for the screen_info, which needs the complexity.
For ARCHARCH_HAS_EDID_INFO I would hope that it's never selected by
anything other than x86, so I would still go with just a dependency
on x86 for simplicity, but I don't mind having the extra symbol if that
keeps it more consistent with how the screen_info is handled.

>> I suppose you could use FIRMWARE_EDID on EFI or OF systems without
>> the need for a global edid_info structure, but that would not
>> share any code with the current fb_firmware_edid() function.
>
> The current code is build on top of screen_info and edid_info. I'd 
> preferably not replace that, if possible.

One way I could imagine this looking in the end would be
something like

struct screen_info *fb_screen_info(struct device *dev)
{
      struct screen_info *si = NULL;

      if (IS_ENABLED(CONFIG_EFI))
            si = efi_get_screen_info(dev);

      if (IS_ENABLED(CONFIG_ARCH_HAS_SCREEN_INFO) && !si)
            si = screen_info;

      return si;
}

corresponding to fb_firmware_edid(). With this, any driver
that wants to access screen_info would call this function
instead of using the global pointer, plus either NULL pointer
check or a CONFIG_ARCH_HAS_SCREEN_INFO dependency.

This way we could completely eliminate the global screen_info
on arm64, riscv, and loongarch but still use the efi and
hyperv framebuffer/drm drivers.

    Arnd

^ permalink raw reply

* Re: [Patch v3] net: mana: Batch ringing RX queue doorbell on receiving packets
From: Paolo Abeni @ 2023-06-30 10:36 UTC (permalink / raw)
  To: Long Li, longli@linuxonhyperv.com, Jason Gunthorpe,
	Leon Romanovsky, Ajay Sharma, Dexuan Cui, KY Srinivasan,
	Haiyang Zhang, Wei Liu, David S. Miller, Eric Dumazet,
	Jakub Kicinski
  Cc: linux-rdma@vger.kernel.org, linux-hyperv@vger.kernel.org,
	netdev@vger.kernel.org, linux-kernel@vger.kernel.org,
	stable@vger.kernel.org
In-Reply-To: <PH7PR21MB3263B266E381BA15DCE45820CE25A@PH7PR21MB3263.namprd21.prod.outlook.com>

On Thu, 2023-06-29 at 18:18 +0000, Long Li wrote:
> > Subject: Re: [Patch v3] net: mana: Batch ringing RX queue doorbell
> > on receiving
> > packets
> > 
> > On Mon, 2023-06-26 at 16:57 -0700, longli@linuxonhyperv.com wrote:
> > > From: Long Li <longli@microsoft.com>
> > > 
> > > It's inefficient to ring the doorbell page every time a WQE is
> > > posted
> > > to the received queue. Excessive MMIO writes result in CPU
> > > spending
> > > more time waiting on LOCK instructions (atomic operations),
> > > resulting
> > > in poor scaling performance.
> > > 
> > > Move the code for ringing doorbell page to where after we have
> > > posted
> > > all WQEs to the receive queue during a callback from napi_poll().
> > > 
> > > With this change, tests showed an improvement from 120G/s to
> > > 160G/s on
> > > a 200G physical link, with 16 or 32 hardware queues.
> > > 
> > > Tests showed no regression in network latency benchmarks on
> > > single
> > > connection.
> > > 
> > > While we are making changes in this code path, change the code
> > > for
> > > ringing doorbell to set the WQE_COUNT to 0 for Receive Queue. The
> > > hardware specification specifies that it should set to 0.
> > > Although
> > > currently the hardware doesn't enforce the check, in the future
> > > releases it may do.
> > > 
> > > Cc: stable@vger.kernel.org
> > > Fixes: ca9c54d2d6a5 ("net: mana: Add a driver for Microsoft Azure
> > > Network Adapter (MANA)")
> > 
> > Uhmmm... this looks like a performance improvement to me, more
> > suitable for
> > the net-next tree ?!? (Note that net-next is closed now).
> 
> This issue is a blocker for usage on 200G physical link. I think it
> can be categorized as a fix.

Let me ask the question the other way around: is there any specific
reason to have this fix into 6.5 and all the way back to 5.13?
Especially the latest bit (CC-ing stable) looks at least debatable.

Thanks,

Paolo


^ permalink raw reply

* Re: [PATCH 07/12] arch/x86: Declare edid_info in <asm/screen_info.h>
From: Thomas Zimmermann @ 2023-06-30  7:46 UTC (permalink / raw)
  To: Arnd Bergmann, Helge Deller, Daniel Vetter, Dave Airlie
  Cc: linux-kernel, linux-alpha, linux-arm-kernel, linux-efi,
	linux-csky@vger.kernel.org, linux-hexagon, linux-ia64, loongarch,
	linux-mips, linuxppc-dev, linux-riscv, linux-sh, sparclinux,
	dri-devel, linux-hyperv, linux-fbdev, linux-staging, Linux-Arch,
	Thomas Gleixner, Ingo Molnar, Borislav Petkov, Dave Hansen, x86,
	H. Peter Anvin, Kees Cook, Paul E. McKenney, Peter Zijlstra,
	Andrew Morton, Frederic Weisbecker, Nicholas Piggin,
	Ard Biesheuvel, Sami Tolvanen, Juerg Haefliger
In-Reply-To: <0dbbdfc4-0e91-4be4-9ca0-d8ba6f18453d@app.fastmail.com>


[-- Attachment #1.1: Type: text/plain, Size: 2989 bytes --]

Hi

Am 29.06.23 um 15:21 schrieb Arnd Bergmann:
> On Thu, Jun 29, 2023, at 15:01, Thomas Zimmermann wrote:
>> Am 29.06.23 um 14:35 schrieb Arnd Bergmann:
>>> On Thu, Jun 29, 2023, at 13:45, Thomas Zimmermann wrote:
>>>> The global variable edid_info contains the firmware's EDID information
>>>> as an extension to the regular screen_info on x86. Therefore move it to
>>>> <asm/screen_info.h>.
>>>>
>>>> Add the Kconfig token ARCH_HAS_EDID_INFO to guard against access on
>>>> architectures that don't provide edid_info. Select it on x86.
>>>
>>> I'm not sure we need another symbol in addition to
>>> CONFIG_FIRMWARE_EDID. Since all the code behind that
>>> existing symbol is also x86 specific, would it be enough
>>> to just add either 'depends on X86' or 'depends on X86 ||
>>> COMPILE_TEST' there?
>>
>> FIRMWARE_EDID is a user-selectable feature, while ARCH_HAS_EDID_INFO
>> announces an architecture feature. They do different things.
> 
> I still have trouble seeing the difference.

The idea here is that ARCH_HAS_ signals the architecture's support for 
the feature.  Drivers set 'depends on' in their Kconfig.

Another Kconfig token, VIDEO_SCREEN_INFO or FIRMWARE_EDID, would then 
actually enable the feature.  Drivers select VIDEO_SCREEN_INFO or 
FIRMWARE_EDID and the architectures contains code like

#ifdef VIDEO_SCREEN_INFO
struct screen_info screen_info = {
	/* set values here */
}
#endif

This allows us to disable code that requires screen_info/edid_info, but 
also disable screen_info/edid_info unless such code has been enabled in 
the kernel config.

Some architectures currently mimic this by guarding screen_info with 
ifdef CONFIG_VT or similar. I'd like to make this more flexible. The 
cost of a few more internal Kconfig tokens seems negligible.

> 
>> Right now, ARCH_HAS_EDID_INFO only works on the old BIOS-based VESA
>> systems. In the future, I want to add support for EDID data from EFI and
>> OF as well. It would be stored in edid_info. I assume that the new
>> symbol will become useful then.
> 
> I don't see why an OF based system would have the same limitation
> as legacy BIOS with supporting only a single monitor, if we need
> to have a generic representation of EDID data in DT, that would
> probably be in a per device property anyway.

Sorry that was my mistake. OF has nothing to do with this.

> 
> I suppose you could use FIRMWARE_EDID on EFI or OF systems without
> the need for a global edid_info structure, but that would not
> share any code with the current fb_firmware_edid() function.

The current code is build on top of screen_info and edid_info. I'd 
preferably not replace that, if possible.

Best regards
Thomas

> 
>       Arnd

-- 
Thomas Zimmermann
Graphics Driver Developer
SUSE Software Solutions Germany GmbH
Frankenstrasse 146, 90461 Nuernberg, Germany
GF: Ivo Totev, Andrew Myers, Andrew McDonald, Boudien Moerman
HRB 36809 (AG Nuernberg)

[-- Attachment #2: OpenPGP digital signature --]
[-- Type: application/pgp-signature, Size: 840 bytes --]

^ permalink raw reply

* Re: [PATCH v3 1/1] RDMA/mana_ib: Add EQ interrupt support to mana ib driver.
From: kernel test robot @ 2023-06-30  3:19 UTC (permalink / raw)
  To: Wei Hu, netdev, linux-hyperv, linux-rdma, longli, sharmaajay, jgg,
	leon, kys, haiyangz, wei.liu, decui, davem, edumazet, kuba,
	pabeni, vkuznets, ssengar, shradhagupta
  Cc: llvm, oe-kbuild-all
In-Reply-To: <20230615111412.1687573-1-weh@microsoft.com>

Hi Wei,

kernel test robot noticed the following build warnings:

[auto build test WARNING on linus/master]
[also build test WARNING on horms-ipvs/master v6.4 next-20230629]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch#_base_tree_information]

url:    https://github.com/intel-lab-lkp/linux/commits/Wei-Hu/RDMA-mana_ib-Add-EQ-interrupt-support-to-mana-ib-driver/20230615-191709
base:   linus/master
patch link:    https://lore.kernel.org/r/20230615111412.1687573-1-weh%40microsoft.com
patch subject: [PATCH v3 1/1] RDMA/mana_ib: Add EQ interrupt support to mana ib driver.
config: x86_64-allyesconfig (https://download.01.org/0day-ci/archive/20230630/202306301145.qNZGZ0MP-lkp@intel.com/config)
compiler: clang version 15.0.7 (https://github.com/llvm/llvm-project.git 8dfdcc7b7bf66834a761bd8de445840ef68e4d1a)
reproduce: (https://download.01.org/0day-ci/archive/20230630/202306301145.qNZGZ0MP-lkp@intel.com/reproduce)

If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <lkp@intel.com>
| Closes: https://lore.kernel.org/oe-kbuild-all/202306301145.qNZGZ0MP-lkp@intel.com/

All warnings (new ones prefixed by >>):

>> drivers/infiniband/hw/mana/main.c:150:20: warning: unused variable 'ibdev' [-Wunused-variable]
           struct ib_device *ibdev = ucontext->ibucontext.device;
                             ^
   1 warning generated.

vim +/ibdev +150 drivers/infiniband/hw/mana/main.c

   145	
   146	static void mana_ib_destroy_eq(struct mana_ib_ucontext *ucontext,
   147				       struct mana_ib_dev *mdev)
   148	{
   149		struct gdma_context *gc = mdev->gdma_dev->gdma_context;
 > 150		struct ib_device *ibdev = ucontext->ibucontext.device;
   151		struct gdma_queue *eq;
   152		int i;
   153	
   154		if (!ucontext->eqs)
   155			return;
   156	
   157		for (i = 0; i < gc->max_num_queues; i++) {
   158			eq = ucontext->eqs[i].eq;
   159			if (!eq)
   160				continue;
   161	
   162			mana_gd_destroy_queue(gc, eq);
   163		}
   164	
   165		kfree(ucontext->eqs);
   166		ucontext->eqs = NULL;
   167	}
   168	

-- 
0-DAY CI Kernel Test Service
https://github.com/intel/lkp-tests/wiki

^ permalink raw reply

* RE: [PATCH AUTOSEL 6.3 07/17] arm64/hyperv: Use CPUHP_AP_HYPERV_ONLINE state to fix CPU online sequencing
From: Michael Kelley (LINUX) @ 2023-06-29 22:18 UTC (permalink / raw)
  To: Sasha Levin, linux-kernel@vger.kernel.org, stable@vger.kernel.org
  Cc: Dexuan Cui, Wei Liu, KY Srinivasan, Haiyang Zhang,
	catalin.marinas@arm.com, will@kernel.org,
	linux-hyperv@vger.kernel.org,
	linux-arm-kernel@lists.infradead.org
In-Reply-To: <20230629190049.907558-7-sashal@kernel.org>

From: Sasha Levin <sashal@kernel.org> Sent: Thursday, June 29, 2023 12:01 PM
> 
> From: Michael Kelley <mikelley@microsoft.com>
> 
> [ Upstream commit 52ae076c3a9b366b6fa9f7c7e67aed8b28716ed9 ]
> 
> State CPUHP_AP_HYPERV_ONLINE has been introduced to correctly sequence the
> initialization of hyperv_pcpu_input_arg. Use this new state for Hyper-V
> initialization so that hyperv_pcpu_input_arg is allocated early enough.
> 

State CPUHP_AP_HYPERV_ONLINE was introduced in 6.4.  There's no need
to backport this patch to stable releases.

Michael

^ permalink raw reply

* RE: [PATCH AUTOSEL 6.3 06/17] x86/hyperv: Fix hyperv_pcpu_input_arg handling when CPUs go online/offline
From: Michael Kelley (LINUX) @ 2023-06-29 22:17 UTC (permalink / raw)
  To: Sasha Levin, linux-kernel@vger.kernel.org, stable@vger.kernel.org
  Cc: Vitaly Kuznetsov, Borislav Petkov, Dexuan Cui, Wei Liu,
	KY Srinivasan, Haiyang Zhang, tglx@linutronix.de,
	mingo@redhat.com, dave.hansen@linux.intel.com, x86@kernel.org,
	peterz@infradead.org, linux-hyperv@vger.kernel.org
In-Reply-To: <20230629190049.907558-6-sashal@kernel.org>

From: Sasha Levin <sashal@kernel.org> Sent: Thursday, June 29, 2023 12:01 PM
> 
> From: Michael Kelley <mikelley@microsoft.com>
> 
> [ Upstream commit 9636be85cc5bdd8b7a7f6a53405cbcc52161c93c ]
> 
> These commits
> 
> a494aef23dfc ("PCI: hv: Replace retarget_msi_interrupt_params with
> hyperv_pcpu_input_arg")
> 2c6ba4216844 ("PCI: hv: Enable PCI pass-thru devices in Confidential VMs")
> 
> update the Hyper-V virtual PCI driver to use the hyperv_pcpu_input_arg
> because that memory will be correctly marked as decrypted or encrypted
> for all VM types (CoCo or normal). But problems ensue when CPUs in the
> VM go online or offline after virtual PCI devices have been configured.

The two mentioned commits first appeared in 6.4.  So the problem fixed
by this patch doesn't occur with 6.3 and earlier.  There's no need to backport
this patch to stable releases.

Michael

^ permalink raw reply

* [PATCH net v4 2/2] net: mana: Use the correct WQE count for ringing RQ doorbell
From: longli @ 2023-06-29 22:09 UTC (permalink / raw)
  To: K. Y. Srinivasan, Haiyang Zhang, Wei Liu, Dexuan Cui,
	David S. Miller, Eric Dumazet, Jakub Kicinski, Paolo Abeni,
	Leon Romanovsky, Shradha Gupta, Ajay Sharma, Shachar Raindel,
	Stephen Hemminger, linux-hyperv, netdev, linux-kernel
  Cc: linux-rdma, Long Li, stable
In-Reply-To: <1688076571-24938-1-git-send-email-longli@linuxonhyperv.com>

From: Long Li <longli@microsoft.com>

The hardware specification specifies that WQE_COUNT should set to 0 for
the Receive Queue. Although currently the hardware doesn't enforce the
check, in the future releases it may check on this value.

Cc: stable@vger.kernel.org
Fixes: ca9c54d2d6a5 ("net: mana: Add a driver for Microsoft Azure Network Adapter (MANA)")
Reviewed-by: Haiyang Zhang <haiyangz@microsoft.com>
Reviewed-by: Dexuan Cui <decui@microsoft.com>
Signed-off-by: Long Li <longli@microsoft.com>
---
Change log:
v4:
Split the original patch into two: one for batching doorbell, one for setting the correct wqe count

 drivers/net/ethernet/microsoft/mana/gdma_main.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/microsoft/mana/gdma_main.c b/drivers/net/ethernet/microsoft/mana/gdma_main.c
index 8f3f78b68592..3765d3389a9a 100644
--- a/drivers/net/ethernet/microsoft/mana/gdma_main.c
+++ b/drivers/net/ethernet/microsoft/mana/gdma_main.c
@@ -300,8 +300,11 @@ static void mana_gd_ring_doorbell(struct gdma_context *gc, u32 db_index,
 
 void mana_gd_wq_ring_doorbell(struct gdma_context *gc, struct gdma_queue *queue)
 {
+	/* Hardware Spec specifies that software client should set 0 for
+	 * wqe_cnt for Receive Queues. This value is not used in Send Queues.
+	 */
 	mana_gd_ring_doorbell(gc, queue->gdma_dev->doorbell, queue->type,
-			      queue->id, queue->head * GDMA_WQE_BU_SIZE, 1);
+			      queue->id, queue->head * GDMA_WQE_BU_SIZE, 0);
 }
 
 void mana_gd_ring_cq(struct gdma_queue *cq, u8 arm_bit)
-- 
2.34.1


^ permalink raw reply related

* [PATCH net v4 1/2] net: mana: Batch ringing RX queue doorbell on receiving packets
From: longli @ 2023-06-29 22:09 UTC (permalink / raw)
  To: K. Y. Srinivasan, Haiyang Zhang, Wei Liu, Dexuan Cui,
	David S. Miller, Eric Dumazet, Jakub Kicinski, Paolo Abeni,
	Leon Romanovsky, Shradha Gupta, Ajay Sharma, Shachar Raindel,
	Stephen Hemminger, linux-hyperv, netdev, linux-kernel
  Cc: linux-rdma, Long Li, stable
In-Reply-To: <1688076571-24938-1-git-send-email-longli@linuxonhyperv.com>

From: Long Li <longli@microsoft.com>

It's inefficient to ring the doorbell page every time a WQE is posted to
the received queue. Excessive MMIO writes result in CPU spending more
time waiting on LOCK instructions (atomic operations), resulting in
poor scaling performance.

Move the code for ringing doorbell page to where after we have posted all
WQEs to the receive queue during a callback from napi_poll().

With this change, tests showed an improvement from 120G/s to 160G/s on a
200G physical link, with 16 or 32 hardware queues.

Tests showed no regression in network latency benchmarks on single
connection.

Cc: stable@vger.kernel.org
Fixes: ca9c54d2d6a5 ("net: mana: Add a driver for Microsoft Azure Network Adapter (MANA)")
Reviewed-by: Haiyang Zhang <haiyangz@microsoft.com>
Reviewed-by: Dexuan Cui <decui@microsoft.com>
Signed-off-by: Long Li <longli@microsoft.com>
---
Change log:
v2:
Check for comp_read > 0 as it might be negative on completion error.
Set rq.wqe_cnt to 0 according to BNIC spec.

v3:
Add details in the commit on the reason of performance increase and test numbers.
Add details in the commit on why rq.wqe_cnt should be set to 0 according to hardware spec.
Add "Reviewed-by" from Haiyang and Dexuan.

v4:
Split the original patch into two: one for batching doorbell, one for setting the correct wqe count

 drivers/net/ethernet/microsoft/mana/mana_en.c | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/microsoft/mana/mana_en.c b/drivers/net/ethernet/microsoft/mana/mana_en.c
index cd4d5ceb9f2d..1d8abe63fcb8 100644
--- a/drivers/net/ethernet/microsoft/mana/mana_en.c
+++ b/drivers/net/ethernet/microsoft/mana/mana_en.c
@@ -1383,8 +1383,8 @@ static void mana_post_pkt_rxq(struct mana_rxq *rxq)
 
 	recv_buf_oob = &rxq->rx_oobs[curr_index];
 
-	err = mana_gd_post_and_ring(rxq->gdma_rq, &recv_buf_oob->wqe_req,
-				    &recv_buf_oob->wqe_inf);
+	err = mana_gd_post_work_request(rxq->gdma_rq, &recv_buf_oob->wqe_req,
+					&recv_buf_oob->wqe_inf);
 	if (WARN_ON_ONCE(err))
 		return;
 
@@ -1654,6 +1654,12 @@ static void mana_poll_rx_cq(struct mana_cq *cq)
 		mana_process_rx_cqe(rxq, cq, &comp[i]);
 	}
 
+	if (comp_read > 0) {
+		struct gdma_context *gc = rxq->gdma_rq->gdma_dev->gdma_context;
+
+		mana_gd_wq_ring_doorbell(gc, rxq->gdma_rq);
+	}
+
 	if (rxq->xdp_flush)
 		xdp_do_flush();
 }
-- 
2.34.1


^ permalink raw reply related

* [PATCH net v4 0/2] net: mana: Fix doorbell access for receive queues
From: longli @ 2023-06-29 22:09 UTC (permalink / raw)
  To: K. Y. Srinivasan, Haiyang Zhang, Wei Liu, Dexuan Cui,
	David S. Miller, Eric Dumazet, Jakub Kicinski, Paolo Abeni,
	Leon Romanovsky, Shradha Gupta, Ajay Sharma, Shachar Raindel,
	Stephen Hemminger, linux-hyperv, netdev, linux-kernel
  Cc: linux-rdma, Long Li

From: Long Li <longli@microsoft.com>

This patchset fixes the issues discovered during 200G physical link
tests. It fixes doorbell usage and WQE format for receive queues.

Long Li (2):
  net: mana: Batch ringing RX queue doorbell on receiving packets
  net: mana: Use the correct WQE count for ringing RQ doorbell

 drivers/net/ethernet/microsoft/mana/gdma_main.c |  5 ++++-
 drivers/net/ethernet/microsoft/mana/mana_en.c   | 10 ++++++++--
 2 files changed, 12 insertions(+), 3 deletions(-)

-- 
2.34.1


^ permalink raw reply

* [PATCH AUTOSEL 6.3 06/17] x86/hyperv: Fix hyperv_pcpu_input_arg handling when CPUs go online/offline
From: Sasha Levin @ 2023-06-29 19:00 UTC (permalink / raw)
  To: linux-kernel, stable
  Cc: Michael Kelley, Vitaly Kuznetsov, Borislav Petkov, Dexuan Cui,
	Wei Liu, Sasha Levin, kys, haiyangz, tglx, mingo, dave.hansen,
	x86, peterz, linux-hyperv
In-Reply-To: <20230629190049.907558-1-sashal@kernel.org>

From: Michael Kelley <mikelley@microsoft.com>

[ Upstream commit 9636be85cc5bdd8b7a7f6a53405cbcc52161c93c ]

These commits

a494aef23dfc ("PCI: hv: Replace retarget_msi_interrupt_params with hyperv_pcpu_input_arg")
2c6ba4216844 ("PCI: hv: Enable PCI pass-thru devices in Confidential VMs")

update the Hyper-V virtual PCI driver to use the hyperv_pcpu_input_arg
because that memory will be correctly marked as decrypted or encrypted
for all VM types (CoCo or normal). But problems ensue when CPUs in the
VM go online or offline after virtual PCI devices have been configured.

When a CPU is brought online, the hyperv_pcpu_input_arg for that CPU is
initialized by hv_cpu_init() running under state CPUHP_AP_ONLINE_DYN.
But this state occurs after state CPUHP_AP_IRQ_AFFINITY_ONLINE, which
may call the virtual PCI driver and fault trying to use the as yet
uninitialized hyperv_pcpu_input_arg. A similar problem occurs in a CoCo
VM if the MMIO read and write hypercalls are used from state
CPUHP_AP_IRQ_AFFINITY_ONLINE.

When a CPU is taken offline, IRQs may be reassigned in state
CPUHP_TEARDOWN_CPU. Again, the virtual PCI driver may fault trying to
use the hyperv_pcpu_input_arg that has already been freed by a
higher state.

Fix the onlining problem by adding state CPUHP_AP_HYPERV_ONLINE
immediately after CPUHP_AP_ONLINE_IDLE (similar to CPUHP_AP_KVM_ONLINE)
and before CPUHP_AP_IRQ_AFFINITY_ONLINE. Use this new state for
Hyper-V initialization so that hyperv_pcpu_input_arg is allocated
early enough.

Fix the offlining problem by not freeing hyperv_pcpu_input_arg when
a CPU goes offline. Retain the allocated memory, and reuse it if
the CPU comes back online later.

Signed-off-by: Michael Kelley <mikelley@microsoft.com>
Reviewed-by: Vitaly Kuznetsov <vkuznets@redhat.com>
Acked-by: Borislav Petkov (AMD) <bp@alien8.de>
Reviewed-by: Dexuan Cui <decui@microsoft.com>
Link: https://lore.kernel.org/r/1684862062-51576-1-git-send-email-mikelley@microsoft.com
Signed-off-by: Wei Liu <wei.liu@kernel.org>
Signed-off-by: Sasha Levin <sashal@kernel.org>
---
 arch/x86/hyperv/hv_init.c  |  2 +-
 drivers/hv/hv_common.c     | 48 +++++++++++++++++++-------------------
 include/linux/cpuhotplug.h |  1 +
 3 files changed, 26 insertions(+), 25 deletions(-)

diff --git a/arch/x86/hyperv/hv_init.c b/arch/x86/hyperv/hv_init.c
index 41ef036ebb7ba..5e26ac43552b0 100644
--- a/arch/x86/hyperv/hv_init.c
+++ b/arch/x86/hyperv/hv_init.c
@@ -414,7 +414,7 @@ void __init hyperv_init(void)
 			goto free_vp_assist_page;
 	}
 
-	cpuhp = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "x86/hyperv_init:online",
+	cpuhp = cpuhp_setup_state(CPUHP_AP_HYPERV_ONLINE, "x86/hyperv_init:online",
 				  hv_cpu_init, hv_cpu_die);
 	if (cpuhp < 0)
 		goto free_ghcb_page;
diff --git a/drivers/hv/hv_common.c b/drivers/hv/hv_common.c
index 52a6f89ccdbd4..2d19118adf95b 100644
--- a/drivers/hv/hv_common.c
+++ b/drivers/hv/hv_common.c
@@ -133,13 +133,20 @@ int hv_common_cpu_init(unsigned int cpu)
 	flags = irqs_disabled() ? GFP_ATOMIC : GFP_KERNEL;
 
 	inputarg = (void **)this_cpu_ptr(hyperv_pcpu_input_arg);
-	*inputarg = kmalloc(pgcount * HV_HYP_PAGE_SIZE, flags);
-	if (!(*inputarg))
-		return -ENOMEM;
 
-	if (hv_root_partition) {
-		outputarg = (void **)this_cpu_ptr(hyperv_pcpu_output_arg);
-		*outputarg = (char *)(*inputarg) + HV_HYP_PAGE_SIZE;
+	/*
+	 * hyperv_pcpu_input_arg and hyperv_pcpu_output_arg memory is already
+	 * allocated if this CPU was previously online and then taken offline
+	 */
+	if (!*inputarg) {
+		*inputarg = kmalloc(pgcount * HV_HYP_PAGE_SIZE, flags);
+		if (!(*inputarg))
+			return -ENOMEM;
+
+		if (hv_root_partition) {
+			outputarg = (void **)this_cpu_ptr(hyperv_pcpu_output_arg);
+			*outputarg = (char *)(*inputarg) + HV_HYP_PAGE_SIZE;
+		}
 	}
 
 	msr_vp_index = hv_get_register(HV_REGISTER_VP_INDEX);
@@ -154,24 +161,17 @@ int hv_common_cpu_init(unsigned int cpu)
 
 int hv_common_cpu_die(unsigned int cpu)
 {
-	unsigned long flags;
-	void **inputarg, **outputarg;
-	void *mem;
-
-	local_irq_save(flags);
-
-	inputarg = (void **)this_cpu_ptr(hyperv_pcpu_input_arg);
-	mem = *inputarg;
-	*inputarg = NULL;
-
-	if (hv_root_partition) {
-		outputarg = (void **)this_cpu_ptr(hyperv_pcpu_output_arg);
-		*outputarg = NULL;
-	}
-
-	local_irq_restore(flags);
-
-	kfree(mem);
+	/*
+	 * The hyperv_pcpu_input_arg and hyperv_pcpu_output_arg memory
+	 * is not freed when the CPU goes offline as the hyperv_pcpu_input_arg
+	 * may be used by the Hyper-V vPCI driver in reassigning interrupts
+	 * as part of the offlining process.  The interrupt reassignment
+	 * happens *after* the CPUHP_AP_HYPERV_ONLINE state has run and
+	 * called this function.
+	 *
+	 * If a previously offlined CPU is brought back online again, the
+	 * originally allocated memory is reused in hv_common_cpu_init().
+	 */
 
 	return 0;
 }
diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h
index 0f1001dca0e00..3ceb9dfa09933 100644
--- a/include/linux/cpuhotplug.h
+++ b/include/linux/cpuhotplug.h
@@ -200,6 +200,7 @@ enum cpuhp_state {
 
 	/* Online section invoked on the hotplugged CPU from the hotplug thread */
 	CPUHP_AP_ONLINE_IDLE,
+	CPUHP_AP_HYPERV_ONLINE,
 	CPUHP_AP_KVM_ONLINE,
 	CPUHP_AP_SCHED_WAIT_EMPTY,
 	CPUHP_AP_SMPBOOT_THREADS,
-- 
2.39.2


^ permalink raw reply related

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox