Linux-HyperV List
 help / color / mirror / Atom feed
* RE: [PATCH v2 1/3] drivers: hv: vmbus: Introduce latency testing
From: Michael Kelley @ 2019-08-21 22:52 UTC (permalink / raw)
  To: brandonbonaby94, KY Srinivasan, Haiyang Zhang, Stephen Hemminger,
	sashal@kernel.org
  Cc: brandonbonaby94, linux-hyperv@vger.kernel.org,
	linux-kernel@vger.kernel.org
In-Reply-To: <ebca54bf70d2af53de419c1b7ac8db5b77b888cb.1566266609.git.brandonbonaby94@gmail.com>

From: Branden Bonaby <brandonbonaby94@gmail.com> Sent: Monday, August 19, 2019 7:45 PM
> 
> Introduce user specified latency in the packet reception path.
> 
> Signed-off-by: Branden Bonaby <brandonbonaby94@gmail.com>
> ---
> Changes in v2:
>  - Add #ifdef in Kconfig file so test code will not interfere
>    with non-test code.
>  - Move test code functions for delay to hyperv_vmbus header
>    file.
>  - Wrap test code under #ifdef statement.
> 
>  drivers/hv/Kconfig        |  7 +++++++
>  drivers/hv/connection.c   |  3 +++
>  drivers/hv/hyperv_vmbus.h | 20 ++++++++++++++++++++
>  drivers/hv/ring_buffer.c  |  7 +++++++
>  include/linux/hyperv.h    | 21 +++++++++++++++++++++
>  5 files changed, 58 insertions(+)
> 
> diff --git a/drivers/hv/Kconfig b/drivers/hv/Kconfig
> index 9a59957922d4..d97437ba0626 100644
> --- a/drivers/hv/Kconfig
> +++ b/drivers/hv/Kconfig
> @@ -29,4 +29,11 @@ config HYPERV_BALLOON
>  	help
>  	  Select this option to enable Hyper-V Balloon driver.
> 
> +config HYPERV_TESTING
> +        bool "Hyper-V testing"
> +        default n
> +        depends on HYPERV && DEBUG_FS
> +        help
> +          Select this option to enable Hyper-V vmbus testing.
> +
>  endmenu
> diff --git a/drivers/hv/connection.c b/drivers/hv/connection.c
> index 09829e15d4a0..c9c63a4033cd 100644
> --- a/drivers/hv/connection.c
> +++ b/drivers/hv/connection.c
> @@ -357,6 +357,9 @@ void vmbus_on_event(unsigned long data)
> 
>  	trace_vmbus_on_event(channel);
> 
> +#ifdef CONFIG_HYPERV_TESTING
> +	hv_debug_delay_test(channel, INTERRUPT_DELAY);
> +#endif /* CONFIG_HYPERV_TESTING */

You are following Vitaly's suggestion to use #ifdef's so no code is
generated when HYPERV_TESTING is not enabled.  However, this
direct approach to using #ifdef's really clutters the code and makes
it harder to read and follow.  The better approach is to use the
#ifdef in the include file where the functions are defined.  If
HYPERV_TESTING is not enabled, provide a #else that defines
the function with an empty implementation for which the compiler
will generate no code.   As an example, see the function definition
for hyperv_init() in arch/x86/include/asm/mshyperv.h.  There are
several functions treated similarly in that include file.


>  	do {
>  		void (*callback_fn)(void *);
> 
> diff --git a/drivers/hv/hyperv_vmbus.h b/drivers/hv/hyperv_vmbus.h
> index 362e70e9d145..edf14f596d8c 100644
> --- a/drivers/hv/hyperv_vmbus.h
> +++ b/drivers/hv/hyperv_vmbus.h
> @@ -357,4 +357,24 @@ enum hvutil_device_state {
>  	HVUTIL_DEVICE_DYING,     /* driver unload is in progress */
>  };
> 
> +#ifdef CONFIG_HYPERV_TESTING
> +#include <linux/debugfs.h>
> +#include <linux/delay.h>
> +#include <linux/err.h>

Generally #include files should go at the top of the file, even if they
are only needed conditionally.

> +#define TESTING "hyperv"

I'm not seeing what this line is for, or how it is used.

> +
> +enum delay {
> +	INTERRUPT_DELAY	= 0,
> +	MESSAGE_DELAY   = 1,
> +};
> +
> +int hv_debug_delay_files(struct hv_device *dev, struct dentry *root);
> +int hv_debug_add_dev_dir(struct hv_device *dev);
> +void hv_debug_rm_dev_dir(struct hv_device *dev);
> +void hv_debug_rm_all_dir(void);
> +void hv_debug_set_dir_dentry(struct hv_device *dev, struct dentry *root);
> +void hv_debug_delay_test(struct vmbus_channel *channel, enum delay delay_type);
> +

This is where you could put a #else and the null implementation of
the above functions.

> +#endif /* CONFIG_HYPERV_TESTING */
> +
>  #endif /* _HYPERV_VMBUS_H */
> diff --git a/drivers/hv/ring_buffer.c b/drivers/hv/ring_buffer.c
> index 9a03b163cbbd..51adda23b398 100644
> --- a/drivers/hv/ring_buffer.c
> +++ b/drivers/hv/ring_buffer.c
> @@ -396,6 +396,10 @@ struct vmpacket_descriptor *hv_pkt_iter_first(struct
> vmbus_channel *channel)
>  	struct hv_ring_buffer_info *rbi = &channel->inbound;
>  	struct vmpacket_descriptor *desc;
> 
> +#ifdef CONFIG_HYPERV_TESTING
> +	hv_debug_delay_test(channel, MESSAGE_DELAY);
> +#endif /* CONFIG_HYPERV_TESTING */
> +
>  	if (hv_pkt_iter_avail(rbi) < sizeof(struct vmpacket_descriptor))
>  		return NULL;
> 
> @@ -421,6 +425,9 @@ __hv_pkt_iter_next(struct vmbus_channel *channel,
>  	u32 packetlen = desc->len8 << 3;
>  	u32 dsize = rbi->ring_datasize;
> 
> +#ifdef CONFIG_HYPERV_TESTING
> +	hv_debug_delay_test(channel, MESSAGE_DELAY);
> +#endif /* CONFIG_HYPERV_TESTING */
>  	/* bump offset to next potential packet */
>  	rbi->priv_read_index += packetlen + VMBUS_PKT_TRAILER;
>  	if (rbi->priv_read_index >= dsize)
> diff --git a/include/linux/hyperv.h b/include/linux/hyperv.h
> index 6256cc34c4a6..6bf8ef5c780c 100644
> --- a/include/linux/hyperv.h
> +++ b/include/linux/hyperv.h
> @@ -926,6 +926,21 @@ struct vmbus_channel {
>  	 * full outbound ring buffer.
>  	 */
>  	u64 out_full_first;
> +
> +#ifdef CONFIG_HYPERV_TESTING
> +	/* enabling/disabling fuzz testing on the channel (default is false)*/
> +	bool fuzz_testing_state;
> +
> +	/* Interrupt delay will delay the guest from emptying the ring buffer
> +	 * for a specific amount of time. The delay is in microseconds and will
> +	 * be between 1 to a maximum of 1000, its default is 0 (no delay).
> +	 * The  Message delay will delay guest reading on a per message basis
> +	 * in microseconds between 1 to 1000 with the default being 0
> +	 * (no delay).
> +	 */
> +	u32 fuzz_testing_interrupt_delay;
> +	u32 fuzz_testing_message_delay;
> +#endif /* CONFIG_HYPERV_TESTING */

For fields in a data structure like this, you don't have much choice
but to put the #ifdef directly inline.  However, for small fields like this
and where the data structure isn't size sensitive, you could consider
omitting the #ifdef and just always including the fields even when
HYPERV_TESTING is not enabled.  I don't have a strong preference
either way.

>  };
> 
>  static inline bool is_hvsock_channel(const struct vmbus_channel *c)
> @@ -1166,6 +1181,12 @@ struct hv_device {
> 
>  	struct vmbus_channel *channel;
>  	struct kset	     *channels_kset;
> +
> +#ifdef CONFIG_HYPERV_TESTING
> +	/* place holder to keep track of the dir for hv device in debugfs */
> +	struct dentry *debug_dir;
> +#endif /* CONFIG_HYPERV_TESTING */

Same here.

Michael

> +
>  };
> 
> 
> --
> 2.17.1


^ permalink raw reply

* RE: [PATCH v3 2/3] drivers: hv: vmbus: add test attributes to debugfs
From: Michael Kelley @ 2019-08-21 23:10 UTC (permalink / raw)
  To: brandonbonaby94, KY Srinivasan, Haiyang Zhang, Stephen Hemminger,
	sashal@kernel.org
  Cc: brandonbonaby94, linux-hyperv@vger.kernel.org,
	linux-kernel@vger.kernel.org
In-Reply-To: <a17474c59601a98576f1e002a57192f6314b4aaf.1566340843.git.brandonbonaby94@gmail.com>

From: Branden Bonaby <brandonbonaby94@gmail.com> Sent: Tuesday, August 20, 2019 4:39 PM
> 
> Expose the test parameters as part of the debugfs channel attributes.
> We will control the testing state via these attributes.
> 
> Signed-off-by: Branden Bonaby <brandonbonaby94@gmail.com>
> ---
> Changes in v3:
>  - Change call to IS_ERR_OR_NULL, to IS_ERR.
> 
> Changes in v2:
>  - Move test attributes to debugfs.
>  - Wrap test code under #ifdef statements.
>  - Add new documentation file under Documentation/ABI/testing.
>  - Make commit message reflect the change from from sysfs to debugfs.
> 
>  Documentation/ABI/testing/debugfs-hyperv |  21 +++
>  MAINTAINERS                              |   1 +
>  drivers/hv/vmbus_drv.c                   | 167 +++++++++++++++++++++++
>  3 files changed, 189 insertions(+)
>  create mode 100644 Documentation/ABI/testing/debugfs-hyperv
> 
> diff --git a/Documentation/ABI/testing/debugfs-hyperv
> b/Documentation/ABI/testing/debugfs-hyperv
> new file mode 100644
> index 000000000000..b25f751fafa8
> --- /dev/null
> +++ b/Documentation/ABI/testing/debugfs-hyperv
> @@ -0,0 +1,21 @@
> +What:           /sys/kernel/debug/hyperv/<UUID>/fuzz_test_state
> +Date:           August 2019
> +KernelVersion:  5.3
> +Contact:        Branden Bonaby <brandonbonaby94@gmail.com>
> +Description:    Fuzz testing status of a vmbus device, whether its in an ON
> +                state or a OFF state

Document what values are actually returned?  

> +Users:          Debugging tools
> +
> +What:           /sys/kernel/debug/hyperv/<UUID>/delay/fuzz_test_buffer_interrupt_delay
> +Date:           August 2019
> +KernelVersion:  5.3
> +Contact:        Branden Bonaby <brandonbonaby94@gmail.com>
> +Description:    Fuzz testing buffer delay value between 0 - 1000

It would be helpful to document the units -- I think this is 0 to 1000
microseconds.

> +Users:          Debugging tools
> +
> +What:           /sys/kernel/debug/hyperv/<UUID>/delay/fuzz_test_message_delay
> +Date:           August 2019
> +KernelVersion:  5.3
> +Contact:        Branden Bonaby <brandonbonaby94@gmail.com>
> +Description:    Fuzz testing message delay value between 0 - 1000

Same here.

> +Users:          Debugging tools
> diff --git a/MAINTAINERS b/MAINTAINERS
> index e81e60bd7c26..120284a8185f 100644
> --- a/MAINTAINERS
> +++ b/MAINTAINERS
> @@ -7460,6 +7460,7 @@ F:	include/uapi/linux/hyperv.h
>  F:	include/asm-generic/mshyperv.h
>  F:	tools/hv/
>  F:	Documentation/ABI/stable/sysfs-bus-vmbus
> +F:	Documentation/ABI/testing/debugfs-hyperv
> 
>  HYPERBUS SUPPORT
>  M:	Vignesh Raghavendra <vigneshr@ti.com>
> diff --git a/drivers/hv/vmbus_drv.c b/drivers/hv/vmbus_drv.c
> index ebd35fc35290..d2e47f04d172 100644
> --- a/drivers/hv/vmbus_drv.c
> +++ b/drivers/hv/vmbus_drv.c
> @@ -919,6 +919,10 @@ static void vmbus_device_release(struct device *device)
>  	struct hv_device *hv_dev = device_to_hv_device(device);
>  	struct vmbus_channel *channel = hv_dev->channel;
> 
> +#ifdef CONFIG_HYPERV_TESTING
> +	hv_debug_rm_dev_dir(hv_dev);
> +#endif /* CONFIG_HYPERV_TESTING */

Same comment as in the previous patch about #ifdef inline in the code,
and similarly for other occurrences in this patch.

> +
>  	mutex_lock(&vmbus_connection.channel_mutex);
>  	hv_process_channel_removal(channel);
>  	mutex_unlock(&vmbus_connection.channel_mutex);
> @@ -1727,6 +1731,9 @@ int vmbus_device_register(struct hv_device *child_device_obj)
>  		pr_err("Unable to register primary channeln");
>  		goto err_kset_unregister;
>  	}
> +#ifdef CONFIG_HYPERV_TESTING
> +	hv_debug_add_dev_dir(child_device_obj);
> +#endif /* CONFIG_HYPERV_TESTING */
> 
>  	return 0;
> 
> @@ -2086,6 +2093,159 @@ static void hv_crash_handler(struct pt_regs *regs)
>  	hyperv_cleanup();
>  };
> 
> +#ifdef CONFIG_HYPERV_TESTING
> +
> +struct dentry *hv_root;
> +
> +static int hv_debugfs_delay_get(void *data, u64 *val)
> +{
> +	*val = *(u32 *)data;
> +	return 0;
> +}
> +
> +static int hv_debugfs_delay_set(void *data, u64 val)
> +{
> +	if (val >= 1 && val <= 1000)
> +		*(u32 *)data = val;
> +	/*Best to not use else statement here since we want
> +	 * the delay to remain the same if val > 1000
> +	 */

The standard multi-line comment style would be:

	/*
	 * Best to not use else statement here since we want
	 * the delay to remain the same if val > 1000
	 */

> +	else if (val <= 0)
> +		*(u32 *)data = 0;

You could consider returning an error for an invalid
value (< 0, or > 1000).

> +	return 0;
> +}
> +
> +DEFINE_DEBUGFS_ATTRIBUTE(hv_debugfs_delay_fops, hv_debugfs_delay_get,
> +			 hv_debugfs_delay_set, "%llu\n");
> +

Michael

^ permalink raw reply

* RE: [PATCH v2] video: hyperv: hyperv_fb: Obtain screen resolution from Hyper-V host
From: Michael Kelley @ 2019-08-21 23:48 UTC (permalink / raw)
  To: Wei Hu, b.zolnierkie@samsung.com, linux-hyperv@vger.kernel.org,
	dri-devel@lists.freedesktop.org, linux-fbdev@vger.kernel.org,
	linux-kernel@vger.kernel.org, sashal@kernel.org,
	Stephen Hemminger, Haiyang Zhang, KY Srinivasan, Dexuan Cui
  Cc: Iouri Tarassov
In-Reply-To: <20190821111007.3490-1-weh@microsoft.com>

From: Wei Hu <weh@microsoft.com> Sent: Wednesday, August 21, 2019 4:11 AM
> 
> Beginning from Windows 10 RS5+, VM screen resolution is obtained from host.
> The "video=hyperv_fb" boot time option is not needed, but still can be
> used to overwrite what the host specifies. The VM resolution on the host
> could be set by executing the powershell "set-vmvideo" command.
> 
> v2:
> - Implemented fallback when version negotiation failed.
> - Defined full size for supported_resolution array.
> 
> Signed-off-by: Iouri Tarassov <iourit@microsoft.com>
> Signed-off-by: Wei Hu <weh@microsoft.com>
> Reviewed-by: Michael Kelley <mikelley@microsoft.com>

Reviewed-by: lines should not be added to patches until the reviewer
has actually given a "Reviewed-by:" statement, and I haven't done that
yet. :-)  Such statements are typically not given until review
comments have been addressed and re-reviewed as necessary.

> ---
>  drivers/video/fbdev/hyperv_fb.c | 145 +++++++++++++++++++++++++++++---
>  1 file changed, 133 insertions(+), 12 deletions(-)
> 
> +
> +struct synthvid_supported_resolution_resp {
> +	u8 edid_block[SYNTHVID_EDID_BLOCK_SIZE];
> +	u8 resolution_count;
> +	u8 default_resolution_index;
> +	u8 is_standard;
> +	struct hvd_screen_info
> +		supported_resolution[SYNTHVID_MAX_RESOLUTION_COUNT];

Is there extra whitespace on this line?  Just wondering why it doesn't
line up.

> +} __packed;
> +
> @@ -448,11 +542,27 @@ static int synthvid_connect_vsp(struct hv_device *hdev)
>  	}
> 
>  	/* Negotiate the protocol version with host */
> -	if (vmbus_proto_version == VERSION_WS2008 ||
> -	    vmbus_proto_version == VERSION_WIN7)
> -		ret = synthvid_negotiate_ver(hdev, SYNTHVID_VERSION_WIN7);
> -	else
> +	switch (vmbus_proto_version) {
> +	case VERSION_WIN10:
> +	case VERSION_WIN10_V5:
> +		ret = synthvid_negotiate_ver(hdev, SYNTHVID_VERSION_WIN10);
> +		if (!ret)
> +			break;
> +		/* Fallthrough */
> +	case VERSION_WIN8:
> +	case VERSION_WIN8_1:
>  		ret = synthvid_negotiate_ver(hdev, SYNTHVID_VERSION_WIN8);
> +		if (!ret)
> +			break;
> +		/* Fallthrough */
> +	case VERSION_WS2008:
> +	case VERSION_WIN7:
> +		ret = synthvid_negotiate_ver(hdev, SYNTHVID_VERSION_WIN7);
> +		break;
> +	default:
> +		ret = synthvid_negotiate_ver(hdev, SYNTHVID_VERSION_WIN10);
> +		break;

I'm tempted to put "default:" up with VERSION_WIN10 and VERSION_WIN10_V5
so that it can also fall back to earlier versions.  You would have a couple fewer
lines of code.  But arguably newer versions should always go with
SYNTHVID_VERSION_WIN10 and not fall back.  I don't have a strong opinion
either way.

> +	}
> 
>  	if (ret) {
>  		pr_err("Synthetic video device version not accepted\n");
> @@ -464,6 +574,12 @@ static int synthvid_connect_vsp(struct hv_device *hdev)
>  	else
>  		screen_depth = SYNTHVID_DEPTH_WIN8;
> 
> +	if (par->synthvid_version >= SYNTHVID_VERSION_WIN10) {

Unfortunately, this "greater than" comparison won't work correctly because
the minor version is stored in the high order bits.  Version 4.0 would compare
as less than version 3.5 (which is what SYNTHVID_VERSION_WIN10 is).

> +		ret = synthvid_get_supported_resolution(hdev);
> +		if (ret)
> +			pr_info("Failed to get supported resolution from host, use
> default\n");
> +	}
> +
>  	screen_fb_size = hdev->channel->offermsg.offer.
>  				mmio_megabytes * 1024 * 1024;
> 
> @@ -653,6 +769,8 @@ static void hvfb_get_option(struct fb_info *info)
>  	}
> 
>  	if (x < HVFB_WIDTH_MIN || y < HVFB_HEIGHT_MIN ||
> +	    (par->synthvid_version >= SYNTHVID_VERSION_WIN10 &&

Same comparison problem here.

> +	    (x > screen_width_max || y > screen_height_max)) ||
>  	    (par->synthvid_version == SYNTHVID_VERSION_WIN8 &&
>  	     x * y * screen_depth / 8 > SYNTHVID_FB_SIZE_WIN8) ||
>  	    (par->synthvid_version == SYNTHVID_VERSION_WIN7 &&
> @@ -689,8 +807,12 @@ static int hvfb_getmem(struct hv_device *hdev, struct fb_info
> *info)
>  		}
> 
>  		if (!(pci_resource_flags(pdev, 0) & IORESOURCE_MEM) ||
> -		    pci_resource_len(pdev, 0) < screen_fb_size)
> +		    pci_resource_len(pdev, 0) < screen_fb_size) {
> +			pr_err("Resource not available or (0x%lx < 0x%lx)\n",
> +			       (unsigned long) pci_resource_len(pdev, 0),
> +			       (unsigned long) screen_fb_size);
>  			goto err1;
> +		}

Michael

^ permalink raw reply

* RE: [PATCH v3 3/3] tools: hv: add vmbus testing tool
From: Harry Zhang @ 2019-08-22  1:36 UTC (permalink / raw)
  To: brandonbonaby94, KY Srinivasan, Haiyang Zhang, Stephen Hemminger,
	sashal@kernel.org
  Cc: brandonbonaby94, linux-hyperv@vger.kernel.org,
	linux-kernel@vger.kernel.org, Harry Zhang
In-Reply-To: <c63cae8e916cbfa4a3fe627da3a545736d0b45dc.1566340843.git.brandonbonaby94@gmail.com>

Tool function issue:  Please report an error for invalid '-p' / '--path' arguments, either in validate_args_path() or right after calling it.

Comments of functionality:
-	It's confusing that when fuzz testing is OFF on all devices and the user runs ' python3 /home/lisa/vmbus_testing -p /sys/kernel/debug/hyperv/000d3a6e-4548-000d-3a6e-4548000d3a6e delay -d 0 0 -D ', the testing state gets enabled on all devices ('Y' in the state files), even though I used the "-D" ("--dis_all") param.
-	If we have a "disable-all" subparser for the testing tool, then we probably don't need the mutually_exclusive_group under the "delay" subparser.
-	The path argument (-p) could be an argument of the "delay" and "view" subparsers only.

Regards,
Harry

-----Original Message-----
From: linux-hyperv-owner@vger.kernel.org <linux-hyperv-owner@vger.kernel.org> On Behalf Of Branden Bonaby
Sent: Tuesday, August 20, 2019 4:40 PM
To: KY Srinivasan <kys@microsoft.com>; Haiyang Zhang <haiyangz@microsoft.com>; Stephen Hemminger <sthemmin@microsoft.com>; sashal@kernel.org
Cc: brandonbonaby94 <brandonbonaby94@gmail.com>; linux-hyperv@vger.kernel.org; linux-kernel@vger.kernel.org
Subject: [PATCH v3 3/3] tools: hv: add vmbus testing tool

This is a userspace tool to drive the testing. Currently it supports introducing user specified delay in the host to guest communication path on a per-channel basis.

Signed-off-by: Branden Bonaby <brandonbonaby94@gmail.com>
---
Changes in v3:
 - Align python tool to match Linux coding style.

Changes in v2:
 - Move testing location to new location in debugfs.

 tools/hv/vmbus_testing | 342 +++++++++++++++++++++++++++++++++++++++++
 1 file changed, 342 insertions(+)
 create mode 100644 tools/hv/vmbus_testing

diff --git a/tools/hv/vmbus_testing b/tools/hv/vmbus_testing new file mode 100644 index 000000000000..0f249f6ee698
--- /dev/null
+++ b/tools/hv/vmbus_testing
@@ -0,0 +1,342 @@
+#!/usr/bin/env python3
+# SPDX-License-Identifier: GPL-2.0
+#
+# Program to allow users to fuzz test Hyper-V drivers # by interfacing 
+with Hyper-V debugfs directories # author: Branden Bonaby
+
+import os
+import cmd
+import argparse
+from collections import defaultdict
+from argparse import RawDescriptionHelpFormatter
+
+# debugfs paths for vmbus must exist (same as in lsvmbus) 
+debugfs_sys_path = "/sys/kernel/debug/hyperv"
+if not os.path.isdir(debugfs_sys_path):
+        print("{} doesn't exist/check permissions".format(debugfs_sys_path))
+        exit(-1)
+# Do not change unless, you change the debugfs attributes # in 
+"/sys/kernel/debug/hyperv/<UUID>/". All fuzz testing # attributes will 
+start with "fuzz_test".
+pathlen = len(debugfs_sys_path)
+fuzz_state_location = "fuzz_test_state"
+fuzz_states = {
+        0 : "Disable",
+        1 : "Enable"
+}
+
+fuzz_methods = {
+        1 : "Delay_testing"
+}
+
+fuzz_delay_types = {
+        1 : "fuzz_test_buffer_interrupt_delay",
+        2 : "fuzz_test_message_delay"
+}
+
+def parse_args():
+        parser = argparse.ArgumentParser(description = "vmbus_testing "
+                "[-s] [0|1] [-q] [-p] <debugfs-path>\n""vmbus_testing [-s]"
+                " [0|1] [-q][-p] <debugfs-path> delay [-d] [val][val] [-E|-D]\n"
+                "vmbus_testing [-q] disable-all\n"
+                "vmbus_testing [-q] view [-v|-V]\n"
+                "vmbus_testing --version",
+                epilog = "Current testing options {}".format(fuzz_methods),
+                prog = 'vmbus_testing',
+                formatter_class = RawDescriptionHelpFormatter)
+        subparsers = parser.add_subparsers(dest = "action")
+        parser.add_argument("--version", action = "version",
+                        version = '%(prog)s 1.0')
+        parser.add_argument("-q","--quiet", action = "store_true",
+                        help = "silence none important test messages")
+        parser.add_argument("-s","--state", metavar = "", type = int,
+                        choices = range(0, 2),
+                        help = "Turn testing ON or OFF for a single device."
+                        " The value (1) will turn testing ON. The value"
+                        " of (0) will turn testing OFF with the default set"
+                        " to (0).")
+        parser.add_argument("-p","--path", metavar = "",
+                        help = "Refers to the debugfs path to a vmbus device."
+                        " If the path is not a valid path to a vmbus device,"
+                        " the program will exit. The path must be the"
+                        " absolute path; use the lsvmbus command to find"
+                        " the path.")
+        parser_delay = subparsers.add_parser("delay",
+                        help = "Delay buffer/message reads in microseconds.",
+                        description = "vmbus_testing -s [0|1] [-q] -p "
+                        "<debugfs-path> delay -d "
+                        "[buffer-delay-value] [message-delay-value]\n"
+                        "vmbus_testing [-q] delay [buffer-delay-value] "
+                                "[message-delay-value] -E\n"
+                        "vmbus_testing [-q] delay [buffer-delay-value] "
+                                "[message-delay-value] -D",
+                        formatter_class = RawDescriptionHelpFormatter)
+        delay_group = parser_delay.add_mutually_exclusive_group()
+        delay_group.add_argument("-E", "--en_all", action = "store_true",
+                        help = "Enable Buffer/Message Delay testing on ALL"
+                        " devices. Use -d option with this to set the values"
+                        " for both the buffer delay and the message delay. No"
+                        " value can be (0) or less than (-1). If testing is"
+                        " disabled on a device prior to running this command,"
+                        " testing will be enabled on the device as a result"
+                        " of this command.")
+        delay_group.add_argument("-D", "--dis_all", action = "store_true",
+                        help = "Disable Buffer/Message delay testing on ALL"
+                        " devices. A  value equal to (-1) will keep the"
+                        " current delay value, and a value equal to (0) will"
+                        " remove delay testing for the specfied delay column."
+                        " only values (-1) and (0) will be accepted but at"
+                        " least one value must be a (0) or a (-1).")
+        parser_delay.add_argument("-d", "--delay_time", metavar = "", nargs = 2,
+                        type = check_range, default = [0, 0], required = (True),
+                        help = "Buffer/message delay time. A value of (0) will"
+                        "disable delay testing on the specified delay column,"
+                        " while a value of (-1) will ignore the specified"
+                        " delay column. The default values are [0] & [0]."
+                        " The first column represents the buffer delay value"
+                        " and the second represents the message delay value."
+                        " Value constraints: -1 <= value <= 1000.")
+        parser_dis_all = subparsers.add_parser("disable-all",
+                        help = "Disable ALL testing on all vmbus devices.",
+                        description = "vmbus_testing disable-all",
+                        formatter_class = RawDescriptionHelpFormatter)
+        parser_view = subparsers.add_parser("view",
+                        help = "View testing on vmbus devices.",
+                        description = "vmbus_testing view -V\n"
+                        "vmbus_testing -p <debugfs-path> view -v",
+                        formatter_class = RawDescriptionHelpFormatter)
+        view_group = parser_view.add_mutually_exclusive_group()
+        view_group.add_argument("-V", "--view_all", action = "store_true",
+                        help = "View the test status for all vmbus devices.")
+        view_group.add_argument("-v", "--view_single", action = "store_true",
+                        help = "View test values for a single vmbus 
+device.")
+
+        return  parser.parse_args()
+
+# value checking for range checking input in parser def 
+check_range(arg1):
+        try:
+                val = int(arg1)
+        except ValueError as err:
+                raise argparse.ArgumentTypeError(str(err))
+        if val < -1 or val > 1000:
+                message = ("\n\nExpected -1 <= value <= 1000, got value"
+                            " {}\n").format(val)
+                raise argparse.ArgumentTypeError(message)
+        return val
+
+def main():
+        try:
+                dev_list = []
+                for dir in os.listdir(debugfs_sys_path):
+                        dev_list.append(os.path.join(debugfs_sys_path, dir))
+                #key value, pairs
+                #key = debugfs device path
+                #value = list of fuzz testing attributes.
+                dev_files = defaultdict(list)
+                for dev in dev_list:
+                        path = os.path.join(dev, "delay")
+                        for f in os.listdir(path):
+                                if (f.startswith("fuzz_test")):
+                                        dev_files[path].append(f)
+
+                dev_files.default_factory = None
+                args = parse_args()
+                path = args.path
+                state = args.state
+                quiet = args.quiet
+                if (not quiet):
+                        print("*** Use lsvmbus to get vmbus device type"
+                                " information.*** ")
+                if (state is not None and validate_args_path(path, dev_list)):
+                        if (state is not get_test_state(path)):
+                                change_test_state(path, quiet)
+                        state = get_test_state(path)
+                if (state == 0 and path is not None):
+                        disable_testing_single_device(path, 0, quiet)
+                        return
+                #Use subparsers as the key for different fuzz testing methods
+                if (args.action == "delay"):
+                        delay = args.delay_time
+                        if (validate_delay_values(args, delay)):
+                                delay_test_all_devices(dev_list, delay, quiet)
+                        elif (validate_args_path(path, dev_list)):
+                                if(get_test_state(path) == 1):
+                                        delay_test_store(path, delay, quiet)
+                                        return
+                                print("device testing OFF, use -s 1 to turn ON")
+                elif (args.action == "disable-all"):
+                        disable_all_testing(dev_list, quiet)
+                elif (args.action == "view"):
+                        if (args.view_all):
+                                all_devices_test_status(dev_list)
+                        elif (args.view_single):
+                                if (validate_args_path(path, dev_list)):
+                                        device_test_values(dev_files, path)
+                                        return
+                                print("Error,(check path) usage: -p"\
+                                            " <debugfs device path> view -v")
+        except AttributeError:
+                print("check usage, 1 or more elements not provided")
+                exit(-1)
+
+# Validate delay values to make sure they are acceptable to # to either 
+enable all delays on a device or disable all # delays on a device def 
+validate_delay_values(args, delay):
+        if (args.en_all):
+                for i in delay:
+                        if (i < -1 or i == 0):
+                                print("\nError, Values must be"
+                                        " equal to -1 or be > 0, use"
+                                        " -d option")
+                                exit(-1)
+                return True
+        elif (args.dis_all):
+                for i in delay:
+                        if (i < -1 or i > 0):
+                                print("\nError, at least 1 value"
+                                        " is not a (0) or a (-1)")
+                                exit(-1)
+                return True
+        else:
+                return False
+
+
+# Validate argument path
+def validate_args_path(path, dev_list):
+        if (path in dev_list):
+                return True
+        else:
+                return False
+
+# display Testing status of single device def 
+device_test_values(dev_files, path):
+
+        delay_path = os.path.join(path, 'delay')
+        for test in dev_files.get(delay_path):
+                print("{}".format(test), end = '')
+                print((" value =  {}")\
+                        
+ .format(read_test_files(os.path.join(delay_path, test))))
+
+# display Testing state of devices
+def all_devices_test_status(dev_list):
+    for device in dev_list:
+        if (get_test_state(device) is 1):
+                print("Testing = ON for: {}".format(device.split("/")[5]))
+        else:
+                print("Testing = OFF for: 
+{}".format(device.split("/")[5]))
+
+# read the vmbus device files, path must be absolute path before 
+calling def read_test_files(path):
+        try:
+                with open(path,"r") as f:
+                        state = f.readline().strip()
+                        if (state == 'N'):
+                                state = 0
+                        elif (state == 'Y'):
+                                state = 1
+                return int(state)
+
+        except IOError as e:
+                errno, strerror = e.args
+                print("I/O error({0}): {1} on file {2}"
+                        .format(errno, strerror, path))
+                exit(-1)
+        except ValueError:
+                print ("Element to int conversion error in: \n{}".format(path))
+                exit(-1)
+
+# writing to vmbus device files, path must be absolute path before 
+calling def write_test_files(path, value):
+        try:
+                with open(path,"w") as f:
+                        f.write("{}".format(value))
+        except IOError as e:
+                errno, strerror = e.args
+                print("I/O error({0}): {1} on file {2}"
+                        .format(errno, strerror, path))
+                exit(-1)
+
+# change testing state of device
+def change_test_state(device, quiet):
+        """Toggle the testing state of `device`.
+
+        If the current state reads as 0 the state file is written with 1,
+        otherwise it is written with 0. Unless `quiet` is set, the new
+        state is printed.
+        """
+        state_path = os.path.join(device, fuzz_state_location)
+        # compare by value: 'is' tests object identity and only happens to
+        # work for small ints on CPython
+        if (get_test_state(device) == 0):
+                write_test_files(state_path, 1)
+                if (not quiet):
+                        print("Testing = ON for device: {}"
+                                .format(state_path.split("/")[5]))
+        else:
+                write_test_files(state_path, 0)
+                if (not quiet):
+                        print("Testing = OFF for device: {}"
+                                .format(state_path.split("/")[5]))
+
+# get testing state of device
+def get_test_state(device):
+        """Return the value read from the state file of `device`."""
+        # 1 -> test = ON
+        # 0 -> test = OFF
+        state_file = os.path.join(device, fuzz_state_location)
+        return read_test_files(state_file)
+
+# Enter 1 - 1000 microseconds, into a single device using the
+# fuzz_test_buffer_interrupt_delay and fuzz_test_message_delay
+# debugfs attributes
+def delay_test_store(device,delay_length, quiet):
+        """Write the buffer and message delays of `device`.
+
+        `delay_length` is a two element sequence: index 0 is the buffer
+        delay, index 1 is the message delay. A negative entry leaves the
+        corresponding file untouched. Unless `quiet` is set, both values
+        are read back and printed.
+        """
+        try:
+                # delay[0]- buffer delay, delay[1]- message delay
+                buff_test = os.path.join(os.path.sep,device, 'delay',
+                                            fuzz_delay_types.get(1))
+                mess_test = os.path.join(os.path.sep,device, 'delay',
+                                            fuzz_delay_types.get(2))
+
+                if (delay_length[0] >= 0):
+                        write_test_files(buff_test, delay_length[0])
+                if (delay_length[1] >= 0):
+                        write_test_files(mess_test, delay_length[1])
+                if (not quiet):
+                        print("Buffer delay testing = {} for: {}"
+                                .format(read_test_files(buff_test),
+                                buff_test.split("/")[5]))
+                        print("Message delay testing = {} for: {}"
+                                .format(read_test_files(mess_test),
+                                mess_test.split("/")[5]))
+        except IOError as e:
+                errno, strerror = e.args
+                print("I/O error({0}): {1} on files {2}{3}"
+                        .format(errno, strerror, buff_test, mess_test))
+                exit(-1)
+
+#enabling/disabling delay testing on all devices
+def delay_test_all_devices(dev_list,delay,quiet):
+        """Apply `delay` to every device in `dev_list`.
+
+        A device whose state reads as 0 is first switched on through
+        change_test_state(); the delays are then stored with
+        delay_test_store().
+        """
+        for device in (dev_list):
+                # compare by value, not by object identity
+                if (get_test_state(device) == 0):
+                        change_test_state(device,quiet)
+                delay_test_store(device, delay, quiet)
+
+#disabling testing on single device
+def disable_testing_single_device(device,test_type,quiet):
+        """Disable delay testing on one device.
+
+        Only test types 0 and 1 are acted upon; any other value makes the
+        call a no-op. A device whose state reads as 1 is toggled through
+        change_test_state(), then both delays are written as 0.
+        """
+        #test_type represents corresponding key
+        #delay method in delay_methods dict.
+        #special type 0 , used to disable all
+        #testing on SINGLE device.
+
+        # compare by value: 'is' tests object identity, not equality
+        if (test_type == 1 or test_type == 0):
+                #disable list [buffer,message]
+                disable_delay = [0, 0]
+                if (get_test_state(device) == 1):
+                        change_test_state(device, quiet)
+                delay_test_store(device, disable_delay, quiet)
+
+#disabling testing on ALL devices
+def disable_all_testing(dev_list,quiet):
+        """Call disable_testing_single_device() with type 0 on each device."""
+        for dev in dev_list:
+                # type 0 is the "disable everything on this device" request
+                disable_testing_single_device(dev, 0, quiet)
+
+# run the tool only when executed as a script, not when imported
+if __name__ == "__main__":
+        main()
--
2.17.1


^ permalink raw reply

* Re: [PATCH v3 3/3] tools: hv: add vmbus testing tool
From: Branden Bonaby @ 2019-08-22  3:16 UTC (permalink / raw)
  To: Harry Zhang
  Cc: KY Srinivasan, Haiyang Zhang, Stephen Hemminger,
	sashal@kernel.org, linux-hyperv@vger.kernel.org,
	linux-kernel@vger.kernel.org
In-Reply-To: <PS1P15301MB02497E19A17D71913DA32857C0A50@PS1P15301MB0249.APCP153.PROD.OUTLOOK.COM>

On Thu, Aug 22, 2019 at 01:36:09AM +0000, Harry Zhang wrote:
> Tool function issues:  Please validate args errors for  '-p' and '--path',  in or following validate_args_path().  
> 
> Comments of functionality:
> -	it's confusing when fuzz_testing are all OFF, then user run ' python3 /home/lisa/vmbus_testing -p /sys/kernel/debug/hyperv/000d3a6e-4548-000d-3a6e-4548000d3a6e delay -d 0 0 -D ' which will enable all delay testing state ('Y' in state files).  even I used "-D", "--dis_all" param. 
> -	if we have subparsers of "disable-all" for the testing tool, then probably we don't need the mutually_exclusive_group under subparsers of "delay"
> -	the path argument (-p) could be an argument for subparsers of "delay" and "view" only.
> 
> Regards,
> Harry
>

So I made the choice to keep disabling the state and disabling delay
testing separate, because once we start adding other testing options
you wouldn't want to inadvertently disable all testing, especially
if you were doing more than one type of test at a time.
So with your configuration

'python3 /home/lisa/vmbus_testing -p /sys/kernel/debug/hyperv/000d3a6e-4548-000d-3a6e-4548000d3a6e delay -d 0 0 -D '

this would stop all delay testing on all the devices but wouldn't change
their test state to OFF 'N'. So that's why I have the option -s --state to
change the state to OFF with a -s 0. Then, to disable all types of testing
and change the state to OFF, that's where the 'disable-all' subparser comes in,
with:

'python3 /home/lisa/vmbus_testing disable-all'

For that last point, I don't understand what you mean. Are you saying it would be
better to have something like this, using delay as an example?

'python3 /home/lisa/vmbus_testing delay -p /sys/kernel/debug/hyperv/000d3a6e-4548-000d-3a6e-4548000d3a6e'

If that's what you mean, I figured it was better to make the -p accessible
to all test types, so I made it a part of the main parser. This would allow
us to just have it there once instead of having to make a -p for every
subparser.

Also, maybe I need to change the examples and the help text,
because with the -D option for delay you wouldn't actually need to put in
the path, as

'python3 /home/lisa/vmbus_testing delay -d 0 0 -D '

would suffice to stop delay testing on all devices; -E would enable
it for all devices and change the state to On 'Y' if it wasn't already.

let me know your thoughts

branden bonaby

^ permalink raw reply

* Re: [PATCH v3 2/3] drivers: hv: vmbus: add test attributes to debugfs
From: Branden Bonaby @ 2019-08-22  3:36 UTC (permalink / raw)
  To: Michael Kelley
  Cc: KY Srinivasan, Haiyang Zhang, Stephen Hemminger,
	sashal@kernel.org, linux-hyperv@vger.kernel.org,
	linux-kernel@vger.kernel.org
In-Reply-To: <DM5PR21MB0137B4071E64688C5F902E83D7AA0@DM5PR21MB0137.namprd21.prod.outlook.com>

> > +What:           /sys/kernel/debug/hyperv/<UUID>/fuzz_test_state
> > +Date:           August 2019
> > +KernelVersion:  5.3
> > +Contact:        Branden Bonaby <brandonbonaby94@gmail.com>
> > +Description:    Fuzz testing status of a vmbus device, whether its in an ON
> > +                state or a OFF state
> 
> Document what values are actually returned?  
> 
> > +Users:          Debugging tools
> > +
> > +What:           /sys/kernel/debug/hyperv/<UUID>/delay/fuzz_test_buffer_interrupt_delay
> > +Date:           August 2019
> > +KernelVersion:  5.3
> > +Contact:        Branden Bonaby <brandonbonaby94@gmail.com>
> > +Description:    Fuzz testing buffer delay value between 0 - 1000
> 
> It would be helpful to document the units -- I think this is 0 to 1000
> microseconds.

you're right, that makes sense I'll add that information in. Also 
to confirm, it is microseconds like you said.

> > +static int hv_debugfs_delay_set(void *data, u64 val)
> > +{
> > +	if (val >= 1 && val <= 1000)
> > +		*(u32 *)data = val;
> > +	/*Best to not use else statement here since we want
> > +	 * the delay to remain the same if val > 1000
> > +	 */
> 
> The standard multi-line comment style would be:
> 
> 	/*
> 	 * Best to not use else statement here since we want
> 	 * the delay to remain the same if val > 1000
> 	 */
>

will change

> > +	else if (val <= 0)
> > +		*(u32 *)data = 0;
> 
> You could consider returning an error for an invalid
> value (< 0, or > 1000).
> 

It's subtle, but it does make sense and shows anyone
reading, at a glance, that the only acceptable values in the
function are 0 <= val <= 1000. I'll add
that in.


^ permalink raw reply

* Re: [PATCH net-next,v3, 0/6] Add software backchannel and mlx5e HV VHCA stats
From: David Miller @ 2019-08-22  4:09 UTC (permalink / raw)
  To: haiyangz
  Cc: sashal, saeedm, leon, eranbe, lorenzo.pieralisi, bhelgaas,
	linux-pci, linux-hyperv, netdev, kys, sthemmin, linux-kernel
In-Reply-To: <1566346948-69497-1-git-send-email-haiyangz@microsoft.com>

From: Haiyang Zhang <haiyangz@microsoft.com>
Date: Wed, 21 Aug 2019 00:23:19 +0000

> This patch set adds paravirtual backchannel in software in pci_hyperv,
> which is required by the mlx5e driver HV VHCA stats agent.
> 
> The stats agent is responsible on running a periodic rx/tx packets/bytes
> stats update.

These patches don't apply cleanly to net-next, probably due to some recent
mlx5 driver changes.

Please respin.

^ permalink raw reply

* RE: [PATCH net-next,v3, 0/6] Add software backchannel and mlx5e HV VHCA stats
From: Haiyang Zhang @ 2019-08-22  4:13 UTC (permalink / raw)
  To: David Miller
  Cc: sashal@kernel.org, saeedm@mellanox.com, leon@kernel.org,
	eranbe@mellanox.com, lorenzo.pieralisi@arm.com,
	bhelgaas@google.com, linux-pci@vger.kernel.org,
	linux-hyperv@vger.kernel.org, netdev@vger.kernel.org,
	KY Srinivasan, Stephen Hemminger, linux-kernel@vger.kernel.org
In-Reply-To: <20190821.210907.884869474698105971.davem@davemloft.net>



> -----Original Message-----
> From: linux-hyperv-owner@vger.kernel.org <linux-hyperv-
> owner@vger.kernel.org> On Behalf Of David Miller
> Sent: Wednesday, August 21, 2019 9:09 PM
> To: Haiyang Zhang <haiyangz@microsoft.com>
> Cc: sashal@kernel.org; saeedm@mellanox.com; leon@kernel.org;
> eranbe@mellanox.com; lorenzo.pieralisi@arm.com; bhelgaas@google.com;
> linux-pci@vger.kernel.org; linux-hyperv@vger.kernel.org;
> netdev@vger.kernel.org; KY Srinivasan <kys@microsoft.com>; Stephen
> Hemminger <sthemmin@microsoft.com>; linux-kernel@vger.kernel.org
> Subject: Re: [PATCH net-next,v3, 0/6] Add software backchannel and mlx5e
> HV VHCA stats
> 
> From: Haiyang Zhang <haiyangz@microsoft.com>
> Date: Wed, 21 Aug 2019 00:23:19 +0000
> 
> > This patch set adds paravirtual backchannel in software in pci_hyperv,
> > which is required by the mlx5e driver HV VHCA stats agent.
> >
> > The stats agent is responsible on running a periodic rx/tx
> > packets/bytes stats update.
> 
> These patches don't apply cleanly to net-next, probably due to some recent
> mlx5 driver changes.
> 
> Please respin.

I will do.
Thanks,

- Haiyang

^ permalink raw reply

* [PATCH net-next,v4, 0/6] Add software backchannel and mlx5e HV VHCA stats
From: Haiyang Zhang @ 2019-08-22  5:05 UTC (permalink / raw)
  To: sashal@kernel.org, davem@davemloft.net, saeedm@mellanox.com,
	leon@kernel.org, eranbe@mellanox.com, lorenzo.pieralisi@arm.com,
	bhelgaas@google.com, linux-pci@vger.kernel.org,
	linux-hyperv@vger.kernel.org, netdev@vger.kernel.org
  Cc: Haiyang Zhang, KY Srinivasan, Stephen Hemminger,
	linux-kernel@vger.kernel.org

This patch set adds a paravirtual backchannel in software in pci_hyperv,
which is required by the mlx5e driver HV VHCA stats agent.

The stats agent is responsible for running a periodic rx/tx packets/bytes
stats update.

Dexuan Cui (1):
  PCI: hv: Add a paravirtual backchannel in software

Eran Ben Elisha (4):
  net/mlx5: Add wrappers for HyperV PCIe operations
  net/mlx5: Add HV VHCA infrastructure
  net/mlx5: Add HV VHCA control agent
  net/mlx5e: Add mlx5e HV VHCA stats agent

Haiyang Zhang (1):
  PCI: hv: Add a Hyper-V PCI interface driver for software backchannel
    interface

 MAINTAINERS                                        |   1 +
 drivers/net/ethernet/mellanox/mlx5/core/Makefile   |   2 +
 drivers/net/ethernet/mellanox/mlx5/core/en.h       |  13 +
 .../ethernet/mellanox/mlx5/core/en/hv_vhca_stats.c | 162 +++++++++
 .../ethernet/mellanox/mlx5/core/en/hv_vhca_stats.h |  25 ++
 drivers/net/ethernet/mellanox/mlx5/core/en_main.c  |   3 +
 drivers/net/ethernet/mellanox/mlx5/core/lib/hv.c   |  64 ++++
 drivers/net/ethernet/mellanox/mlx5/core/lib/hv.h   |  22 ++
 .../net/ethernet/mellanox/mlx5/core/lib/hv_vhca.c  | 371 +++++++++++++++++++++
 .../net/ethernet/mellanox/mlx5/core/lib/hv_vhca.h  | 104 ++++++
 drivers/net/ethernet/mellanox/mlx5/core/main.c     |   7 +
 drivers/pci/Kconfig                                |   1 +
 drivers/pci/controller/Kconfig                     |   7 +
 drivers/pci/controller/Makefile                    |   1 +
 drivers/pci/controller/pci-hyperv-intf.c           |  67 ++++
 drivers/pci/controller/pci-hyperv.c                | 308 +++++++++++++++++
 include/linux/hyperv.h                             |  29 ++
 include/linux/mlx5/driver.h                        |   2 +
 18 files changed, 1189 insertions(+)
 create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/en/hv_vhca_stats.c
 create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/en/hv_vhca_stats.h
 create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/lib/hv.c
 create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/lib/hv.h
 create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/lib/hv_vhca.c
 create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/lib/hv_vhca.h
 create mode 100644 drivers/pci/controller/pci-hyperv-intf.c

-- 
1.8.3.1


^ permalink raw reply

* [PATCH net-next,v4, 6/6] net/mlx5e: Add mlx5e HV VHCA stats agent
From: Haiyang Zhang @ 2019-08-22  5:06 UTC (permalink / raw)
  To: sashal@kernel.org, davem@davemloft.net, saeedm@mellanox.com,
	leon@kernel.org, eranbe@mellanox.com, lorenzo.pieralisi@arm.com,
	bhelgaas@google.com, linux-pci@vger.kernel.org,
	linux-hyperv@vger.kernel.org, netdev@vger.kernel.org
  Cc: Haiyang Zhang, KY Srinivasan, Stephen Hemminger,
	linux-kernel@vger.kernel.org
In-Reply-To: <1566450236-36757-1-git-send-email-haiyangz@microsoft.com>

From: Eran Ben Elisha <eranbe@mellanox.com>

The HV VHCA stats agent is responsible for running a periodic rx/tx
packets/bytes stats update. Currently the supported format is version
MLX5_HV_VHCA_STATS_VERSION. Block ID 1 is dedicated to statistics data
transfer from the VF to the PF.

The reporter fetches the statistics data from all opened channels, fills it
into a buffer and sends it to mlx5_hv_vhca_agent_write.

As the stats layer should include some metadata per block (sequence and
offset), the HV VHCA layer shall modify the buffer before actually sending it
over block 1.

Signed-off-by: Eran Ben Elisha <eranbe@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
Signed-off-by: Haiyang Zhang <haiyangz@microsoft.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/Makefile   |   1 +
 drivers/net/ethernet/mellanox/mlx5/core/en.h       |  13 ++
 .../ethernet/mellanox/mlx5/core/en/hv_vhca_stats.c | 162 +++++++++++++++++++++
 .../ethernet/mellanox/mlx5/core/en/hv_vhca_stats.h |  25 ++++
 drivers/net/ethernet/mellanox/mlx5/core/en_main.c  |   3 +
 .../net/ethernet/mellanox/mlx5/core/lib/hv_vhca.h  |   1 +
 6 files changed, 205 insertions(+)
 create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/en/hv_vhca_stats.c
 create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/en/hv_vhca_stats.h

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/Makefile b/drivers/net/ethernet/mellanox/mlx5/core/Makefile
index 8d443fc..f4de9cc 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/Makefile
+++ b/drivers/net/ethernet/mellanox/mlx5/core/Makefile
@@ -36,6 +36,7 @@ mlx5_core-$(CONFIG_MLX5_CORE_EN_DCB) += en_dcbnl.o en/port_buffer.o
 mlx5_core-$(CONFIG_MLX5_ESWITCH)     += en_rep.o en_tc.o en/tc_tun.o lib/port_tun.o lag_mp.o \
 					lib/geneve.o en/tc_tun_vxlan.o en/tc_tun_gre.o \
 					en/tc_tun_geneve.o diag/en_tc_tracepoint.o
+mlx5_core-$(CONFIG_PCI_HYPERV_INTERFACE) += en/hv_vhca_stats.o
 
 #
 # Core extra
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en.h b/drivers/net/ethernet/mellanox/mlx5/core/en.h
index 7316571..4467927 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en.h
@@ -54,6 +54,7 @@
 #include "mlx5_core.h"
 #include "en_stats.h"
 #include "en/fs.h"
+#include "lib/hv_vhca.h"
 
 extern const struct net_device_ops mlx5e_netdev_ops;
 struct page_pool;
@@ -782,6 +783,15 @@ struct mlx5e_modify_sq_param {
 	int rl_index;
 };
 
+#if IS_ENABLED(CONFIG_PCI_HYPERV_INTERFACE)
+/* State of the HV VHCA statistics agent embedded in struct mlx5e_priv. */
+struct mlx5e_hv_vhca_stats_agent {
+	/* handle returned by mlx5_hv_vhca_agent_create() */
+	struct mlx5_hv_vhca_agent *agent;
+	/* delayed work that collects the stats and writes them out */
+	struct delayed_work        work;
+	/* re-arm period in jiffies; 0 means the work is not queued again */
+	u16                        delay;
+	/* staging buffer, one per-ring stats record per channel */
+	void                      *buf;
+};
+#endif
+
 struct mlx5e_xsk {
 	/* UMEMs are stored separately from channels, because we don't want to
 	 * lose them when channels are recreated. The kernel also stores UMEMs,
@@ -853,6 +863,9 @@ struct mlx5e_priv {
 	struct devlink_health_reporter *tx_reporter;
 	struct devlink_health_reporter *rx_reporter;
 	struct mlx5e_xsk           xsk;
+#if IS_ENABLED(CONFIG_PCI_HYPERV_INTERFACE)
+	struct mlx5e_hv_vhca_stats_agent stats_agent;
+#endif
 };
 
 struct mlx5e_profile {
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/hv_vhca_stats.c b/drivers/net/ethernet/mellanox/mlx5/core/en/hv_vhca_stats.c
new file mode 100644
index 0000000..c37b4ac
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en/hv_vhca_stats.c
@@ -0,0 +1,162 @@
+// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
+// Copyright (c) 2018 Mellanox Technologies
+
+#include "en.h"
+#include "en/hv_vhca_stats.h"
+#include "lib/hv_vhca.h"
+#include "lib/hv.h"
+
+/* Counters reported for one channel; four u64 values per record. */
+struct mlx5e_hv_vhca_per_ring_stats {
+	u64     rx_packets;
+	u64     rx_bytes;
+	/* the TX counters are summed over the opened traffic classes */
+	u64     tx_packets;
+	u64     tx_bytes;
+};
+
+/*
+ * Copy the RX counters of channel @ch into @data and add the TX counters of
+ * every opened traffic class on top of whatever @data already holds.
+ */
+static void
+mlx5e_hv_vhca_fill_ring_stats(struct mlx5e_priv *priv, int ch,
+			      struct mlx5e_hv_vhca_per_ring_stats *data)
+{
+	struct mlx5e_channel_stats *stats = &priv->channel_stats[ch];
+	int tc = 0;
+
+	data->rx_packets = stats->rq.packets;
+	data->rx_bytes   = stats->rq.bytes;
+
+	/* TX fields are accumulated, not assigned */
+	while (tc < priv->max_opened_tc) {
+		data->tx_packets += stats->sq[tc].packets;
+		data->tx_bytes   += stats->sq[tc].bytes;
+		tc++;
+	}
+}
+
+/*
+ * Fill @data with one struct mlx5e_hv_vhca_per_ring_stats record per channel.
+ *
+ * @buf_len is the size of @data in bytes. The walk stops with a one-time
+ * warning as soon as the next record would not fit.
+ */
+static void mlx5e_hv_vhca_fill_stats(struct mlx5e_priv *priv, u64 *data,
+				     int buf_len)
+{
+	const size_t rec_size = sizeof(struct mlx5e_hv_vhca_per_ring_stats);
+	size_t off = 0;
+	int ch;
+
+	for (ch = 0; ch < priv->max_nch; ch++) {
+		/*
+		 * Do the bounds check in bytes. Adding a byte count to a
+		 * u64 pointer scales it by sizeof(u64), which made the
+		 * previous check far too permissive.
+		 */
+		if (WARN_ON_ONCE(buf_len < 0 ||
+				 off + rec_size > (size_t)buf_len))
+			return;
+
+		mlx5e_hv_vhca_fill_ring_stats(priv, ch,
+					      (struct mlx5e_hv_vhca_per_ring_stats *)
+					      ((u8 *)data + off));
+		off += rec_size;
+	}
+}
+
+/* Size in bytes of a buffer holding one stats record per channel. */
+static int mlx5e_hv_vhca_stats_buf_size(struct mlx5e_priv *priv)
+{
+	size_t bytes = sizeof(struct mlx5e_hv_vhca_per_ring_stats) *
+		       priv->max_nch;
+
+	return bytes;
+}
+
+/*
+ * Delayed work handler: rebuild the stats buffer from scratch, hand it to
+ * the agent and, when a non-zero period is configured, queue the next run.
+ * A failed write is logged and stops the periodic update.
+ */
+static void mlx5e_hv_vhca_stats_work(struct work_struct *work)
+{
+	struct delayed_work *dwork = to_delayed_work(work);
+	struct mlx5e_hv_vhca_stats_agent *sagent =
+		container_of(dwork, struct mlx5e_hv_vhca_stats_agent, work);
+	struct mlx5e_priv *priv =
+		container_of(sagent, struct mlx5e_priv, stats_agent);
+	int buf_len = mlx5e_hv_vhca_stats_buf_size(priv);
+	struct mlx5_hv_vhca_agent *agent = sagent->agent;
+	void *buf = sagent->buf;
+	int err;
+
+	/* the TX counters are accumulated, so start from a zeroed buffer */
+	memset(buf, 0, buf_len);
+	mlx5e_hv_vhca_fill_stats(priv, buf, buf_len);
+
+	err = mlx5_hv_vhca_agent_write(agent, buf, buf_len);
+	if (err) {
+		mlx5_core_err(priv->mdev,
+			      "%s: Failed to write stats, err = %d\n",
+			      __func__, err);
+		return;
+	}
+
+	if (!sagent->delay)
+		return;
+
+	queue_delayed_work(priv->wq, &sagent->work, sagent->delay);
+}
+
+enum {
+	/* value stored in the control block's version field */
+	MLX5_HV_VHCA_STATS_VERSION     = 1,
+	/* command value that requests a single, non-periodic update */
+	MLX5_HV_VHCA_STATS_UPDATE_ONCE = 0xFFFF,
+};
+
+/*
+ * Control callback of the stats agent.
+ *
+ * Fills in the version and ring count of @block, then acts on the command:
+ * 0 cancels the work, MLX5_HV_VHCA_STATS_UPDATE_ONCE queues a single run,
+ * any other value is used as a period in units of 100 ms.
+ */
+static void mlx5e_hv_vhca_stats_control(struct mlx5_hv_vhca_agent *agent,
+					struct mlx5_hv_vhca_control_block *block)
+{
+	struct mlx5e_priv *priv = mlx5_hv_vhca_agent_priv(agent);
+	struct mlx5e_hv_vhca_stats_agent *sagent = &priv->stats_agent;
+
+	block->version = MLX5_HV_VHCA_STATS_VERSION;
+	block->rings   = priv->max_nch;
+
+	if (!block->command) {
+		cancel_delayed_work_sync(&sagent->work);
+		return;
+	}
+
+	/* NOTE(review): delay is a u16, large periods may be truncated */
+	if (block->command == MLX5_HV_VHCA_STATS_UPDATE_ONCE)
+		sagent->delay = 0;
+	else
+		sagent->delay = msecs_to_jiffies(block->command * 100);
+
+	queue_delayed_work(priv->wq, &sagent->work, sagent->delay);
+}
+
+/* Cleanup callback: wait for the stats work to finish and cancel it. */
+static void mlx5e_hv_vhca_stats_cleanup(struct mlx5_hv_vhca_agent *agent)
+{
+	struct mlx5e_priv *priv;
+
+	priv = mlx5_hv_vhca_agent_priv(agent);
+	cancel_delayed_work_sync(&priv->stats_agent.work);
+}
+
+/*
+ * Allocate the stats staging buffer and register the stats agent.
+ *
+ * Returns 0 on success or a negative errno. On failure nothing stays
+ * allocated and priv->stats_agent.buf is reset to NULL.
+ */
+int mlx5e_hv_vhca_stats_create(struct mlx5e_priv *priv)
+{
+	int buf_len = mlx5e_hv_vhca_stats_buf_size(priv);
+	struct mlx5_hv_vhca_agent *agent;
+	int err;
+
+	priv->stats_agent.buf = kvzalloc(buf_len, GFP_KERNEL);
+	if (!priv->stats_agent.buf)
+		return -ENOMEM;
+
+	/*
+	 * Initialize the work before the agent is registered, so that no
+	 * callback can ever operate on an uninitialized work item.
+	 */
+	INIT_DELAYED_WORK(&priv->stats_agent.work, mlx5e_hv_vhca_stats_work);
+
+	agent = mlx5_hv_vhca_agent_create(priv->mdev->hv_vhca,
+					  MLX5_HV_VHCA_AGENT_STATS,
+					  mlx5e_hv_vhca_stats_control, NULL,
+					  mlx5e_hv_vhca_stats_cleanup,
+					  priv);
+
+	if (IS_ERR_OR_NULL(agent)) {
+		if (IS_ERR(agent)) {
+			err = PTR_ERR(agent);
+			netdev_warn(priv->netdev,
+				    "Failed to create hv vhca stats agent, err = %ld\n",
+				    PTR_ERR(agent));
+		} else {
+			err = -ENODEV;
+		}
+
+		/* memory from kvzalloc() must be released with kvfree() */
+		kvfree(priv->stats_agent.buf);
+		priv->stats_agent.buf = NULL;
+		return err;
+	}
+
+	priv->stats_agent.agent = agent;
+
+	return 0;
+}
+
+/*
+ * Destroy the stats agent and free its staging buffer. Does nothing when
+ * no agent is registered; the pointers are cleared so a repeated call is
+ * harmless.
+ */
+void mlx5e_hv_vhca_stats_destroy(struct mlx5e_priv *priv)
+{
+	if (IS_ERR_OR_NULL(priv->stats_agent.agent))
+		return;
+
+	mlx5_hv_vhca_agent_destroy(priv->stats_agent.agent);
+	priv->stats_agent.agent = NULL;
+
+	/* the buffer comes from kvzalloc(), so it must go through kvfree() */
+	kvfree(priv->stats_agent.buf);
+	priv->stats_agent.buf = NULL;
+}
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/hv_vhca_stats.h b/drivers/net/ethernet/mellanox/mlx5/core/en/hv_vhca_stats.h
new file mode 100644
index 0000000..664463f
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en/hv_vhca_stats.h
@@ -0,0 +1,25 @@
+/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */
+/* Copyright (c) 2019 Mellanox Technologies. */
+
+#ifndef __MLX5_EN_STATS_VHCA_H__
+#define __MLX5_EN_STATS_VHCA_H__
+#include "en.h"
+
+#if IS_ENABLED(CONFIG_PCI_HYPERV_INTERFACE)
+
+/* Allocate the stats buffer and register the stats agent; 0 on success. */
+int mlx5e_hv_vhca_stats_create(struct mlx5e_priv *priv);
+/* Destroy the stats agent and release the stats buffer. */
+void mlx5e_hv_vhca_stats_destroy(struct mlx5e_priv *priv);
+
+#else
+
+/* No-op stubs used when CONFIG_PCI_HYPERV_INTERFACE is disabled. */
+static inline int mlx5e_hv_vhca_stats_create(struct mlx5e_priv *priv)
+{
+	return 0;
+}
+
+static inline void mlx5e_hv_vhca_stats_destroy(struct mlx5e_priv *priv)
+{
+}
+#endif
+
+#endif /* __MLX5_EN_STATS_VHCA_H__ */
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
index 7fdea64..fa4bf2d 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
@@ -62,6 +62,7 @@
 #include "en/xsk/setup.h"
 #include "en/xsk/rx.h"
 #include "en/xsk/tx.h"
+#include "en/hv_vhca_stats.h"
 
 
 bool mlx5e_check_fragmented_striding_rq_cap(struct mlx5_core_dev *mdev)
@@ -5109,6 +5110,7 @@ static void mlx5e_nic_enable(struct mlx5e_priv *priv)
 	if (mlx5e_monitor_counter_supported(priv))
 		mlx5e_monitor_counter_init(priv);
 
+	mlx5e_hv_vhca_stats_create(priv);
 	if (netdev->reg_state != NETREG_REGISTERED)
 		return;
 #ifdef CONFIG_MLX5_CORE_EN_DCB
@@ -5141,6 +5143,7 @@ static void mlx5e_nic_disable(struct mlx5e_priv *priv)
 
 	queue_work(priv->wq, &priv->set_rx_mode_work);
 
+	mlx5e_hv_vhca_stats_destroy(priv);
 	if (mlx5e_monitor_counter_supported(priv))
 		mlx5e_monitor_counter_cleanup(priv);
 
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lib/hv_vhca.h b/drivers/net/ethernet/mellanox/mlx5/core/lib/hv_vhca.h
index 984e7ad..4bad6a5 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/lib/hv_vhca.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/lib/hv_vhca.h
@@ -13,6 +13,7 @@
 
 enum mlx5_hv_vhca_agent_type {
 	MLX5_HV_VHCA_AGENT_CONTROL = 0,
+	/* type used by the mlx5e statistics agent */
+	MLX5_HV_VHCA_AGENT_STATS   = 1,
 	MLX5_HV_VHCA_AGENT_MAX = 32,
 };
 
-- 
1.8.3.1


^ permalink raw reply related

* [PATCH net-next,v4, 4/6] net/mlx5: Add HV VHCA infrastructure
From: Haiyang Zhang @ 2019-08-22  5:05 UTC (permalink / raw)
  To: sashal@kernel.org, davem@davemloft.net, saeedm@mellanox.com,
	leon@kernel.org, eranbe@mellanox.com, lorenzo.pieralisi@arm.com,
	bhelgaas@google.com, linux-pci@vger.kernel.org,
	linux-hyperv@vger.kernel.org, netdev@vger.kernel.org
  Cc: Haiyang Zhang, KY Srinivasan, Stephen Hemminger,
	linux-kernel@vger.kernel.org
In-Reply-To: <1566450236-36757-1-git-send-email-haiyangz@microsoft.com>

From: Eran Ben Elisha <eranbe@mellanox.com>

HV VHCA is a layer which provides a PF to VF communication channel based on
the HyperV PCI config channel. It implements Mellanox's Inter VHCA control
communication protocol. The protocol contains a control block in order to
pass messages between the PF and VF drivers, and data blocks in order to
pass actual data.

The infrastructure is agent based. Each agent will be responsible for
contiguous buffer blocks in the VHCA config space. This infrastructure will
bind agents to their blocks, and those agents can only read/write
the buffer blocks assigned to them. Each agent will provide three
callbacks (control, invalidate, cleanup). Control will be invoked when
block-0 is invalidated with a command that concerns this agent. The invalidate
callback will be invoked if one of the blocks assigned to this agent was
invalidated. Cleanup will be invoked before the agent is freed, in
order to clean up all of its open resources or deferred works.

Block-0 serves as the control block. All execution commands from the PF
will be written by the PF over this block. The VF will ack those by
writing to block-0 as well. Its format is described by the struct
mlx5_hv_vhca_control_block layout.

Signed-off-by: Eran Ben Elisha <eranbe@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
Signed-off-by: Haiyang Zhang <haiyangz@microsoft.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/Makefile   |   2 +-
 .../net/ethernet/mellanox/mlx5/core/lib/hv_vhca.c  | 253 +++++++++++++++++++++
 .../net/ethernet/mellanox/mlx5/core/lib/hv_vhca.h  | 102 +++++++++
 drivers/net/ethernet/mellanox/mlx5/core/main.c     |   7 +
 include/linux/mlx5/driver.h                        |   2 +
 5 files changed, 365 insertions(+), 1 deletion(-)
 create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/lib/hv_vhca.c
 create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/lib/hv_vhca.h

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/Makefile b/drivers/net/ethernet/mellanox/mlx5/core/Makefile
index fd32a5b..8d443fc 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/Makefile
+++ b/drivers/net/ethernet/mellanox/mlx5/core/Makefile
@@ -45,7 +45,7 @@ mlx5_core-$(CONFIG_MLX5_ESWITCH)   += eswitch.o eswitch_offloads.o eswitch_offlo
 mlx5_core-$(CONFIG_MLX5_MPFS)      += lib/mpfs.o
 mlx5_core-$(CONFIG_VXLAN)          += lib/vxlan.o
 mlx5_core-$(CONFIG_PTP_1588_CLOCK) += lib/clock.o
-mlx5_core-$(CONFIG_PCI_HYPERV_INTERFACE) += lib/hv.o
+mlx5_core-$(CONFIG_PCI_HYPERV_INTERFACE) += lib/hv.o lib/hv_vhca.o
 
 #
 # Ipoib netdev
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lib/hv_vhca.c b/drivers/net/ethernet/mellanox/mlx5/core/lib/hv_vhca.c
new file mode 100644
index 0000000..84d1d75
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlx5/core/lib/hv_vhca.c
@@ -0,0 +1,253 @@
+// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
+// Copyright (c) 2018 Mellanox Technologies
+
+#include <linux/hyperv.h>
+#include "mlx5_core.h"
+#include "lib/hv.h"
+#include "lib/hv_vhca.h"
+
+/* Top-level HV VHCA object, one per mlx5 core device. */
+struct mlx5_hv_vhca {
+	/* core device passed to mlx5_hv_vhca_create() */
+	struct mlx5_core_dev       *dev;
+	/* single-threaded workqueue that runs the invalidate work items */
+	struct workqueue_struct    *work_queue;
+	/* registered agents, indexed by agent type */
+	struct mlx5_hv_vhca_agent  *agents[MLX5_HV_VHCA_AGENT_MAX];
+	struct mutex                agents_lock; /* Protect agents array */
+};
+
+/* One deferred invalidation; freed by the work handler once it has run. */
+struct mlx5_hv_vhca_work {
+	struct work_struct     invalidate_work;
+	struct mlx5_hv_vhca   *hv_vhca;
+	/* mask handed to the invalidate callback; bit N selects agent type N */
+	u64                    block_mask;
+};
+
+/* Layout of one data block written through mlx5_hv_write_config(). */
+struct mlx5_hv_vhca_data_block {
+	/* sequence number of the agent at the time of the write */
+	u16     sequence;
+	/* index of this block within one mlx5_hv_vhca_agent_write() call */
+	u16     offset;
+	u8      reserved[4];
+	/* payload, up to 15 * 8 = 120 bytes */
+	u64     data[15];
+};
+
+/* An agent registered in one slot of mlx5_hv_vhca::agents. */
+struct mlx5_hv_vhca_agent {
+	/* slot index in the agents array */
+	enum mlx5_hv_vhca_agent_type	 type;
+	struct mlx5_hv_vhca		*hv_vhca;
+	/* caller context, returned by mlx5_hv_vhca_agent_priv() */
+	void				*priv;
+	/* incremented after every completed mlx5_hv_vhca_agent_write() */
+	u16                              seq;
+	/* callbacks supplied at creation time; invalidate and cleanup may be NULL */
+	void (*control)(struct mlx5_hv_vhca_agent *agent,
+			struct mlx5_hv_vhca_control_block *block);
+	void (*invalidate)(struct mlx5_hv_vhca_agent *agent,
+			   u64 block_mask);
+	void (*cleanup)(struct mlx5_hv_vhca_agent *agent);
+};
+
+/*
+ * Allocate an HV VHCA object for @dev together with its workqueue.
+ * Returns the object, or ERR_PTR(-ENOMEM) when either allocation fails.
+ */
+struct mlx5_hv_vhca *mlx5_hv_vhca_create(struct mlx5_core_dev *dev)
+{
+	struct mlx5_hv_vhca *vhca = kzalloc(sizeof(*vhca), GFP_KERNEL);
+
+	if (!vhca)
+		return ERR_PTR(-ENOMEM);
+
+	vhca->work_queue = create_singlethread_workqueue("mlx5_hv_vhca");
+	if (vhca->work_queue) {
+		vhca->dev = dev;
+		mutex_init(&vhca->agents_lock);
+		return vhca;
+	}
+
+	kfree(vhca);
+	return ERR_PTR(-ENOMEM);
+}
+
+/* Destroy the workqueue and free @hv_vhca; NULL and ERR_PTR are ignored. */
+void mlx5_hv_vhca_destroy(struct mlx5_hv_vhca *hv_vhca)
+{
+	if (!IS_ERR_OR_NULL(hv_vhca)) {
+		destroy_workqueue(hv_vhca->work_queue);
+		kfree(hv_vhca);
+	}
+}
+
+/*
+ * Work handler: call the invalidate callback of every registered agent
+ * whose type bit is set in the block mask, then free the work item.
+ */
+static void mlx5_hv_vhca_invalidate_work(struct work_struct *work)
+{
+	struct mlx5_hv_vhca_work *hwork =
+		container_of(work, struct mlx5_hv_vhca_work, invalidate_work);
+	struct mlx5_hv_vhca *hv_vhca = hwork->hv_vhca;
+	struct mlx5_hv_vhca_agent *agent;
+	int slot;
+
+	mutex_lock(&hv_vhca->agents_lock);
+	for (slot = 0; slot < MLX5_HV_VHCA_AGENT_MAX; slot++) {
+		agent = hv_vhca->agents[slot];
+
+		/* skip empty slots, agents without a callback and clear bits */
+		if (agent && agent->invalidate &&
+		    (BIT(agent->type) & hwork->block_mask))
+			agent->invalidate(agent, hwork->block_mask);
+	}
+	mutex_unlock(&hv_vhca->agents_lock);
+
+	kfree(hwork);
+}
+
+/*
+ * Invalidate callback: queue a work item carrying @block_mask on the
+ * HV VHCA workqueue. The item is allocated with GFP_ATOMIC; when that
+ * allocation fails the notification is dropped.
+ */
+void mlx5_hv_vhca_invalidate(void *context, u64 block_mask)
+{
+	struct mlx5_hv_vhca *hv_vhca = context;
+	struct mlx5_hv_vhca_work *hwork = kzalloc(sizeof(*hwork), GFP_ATOMIC);
+
+	if (!hwork)
+		return;
+
+	hwork->block_mask = block_mask;
+	hwork->hv_vhca    = hv_vhca;
+	INIT_WORK(&hwork->invalidate_work, mlx5_hv_vhca_invalidate_work);
+
+	queue_work(hv_vhca->work_queue, &hwork->invalidate_work);
+}
+
+/*
+ * Register mlx5_hv_vhca_invalidate() as the invalidate callback of the
+ * device. For a NULL or ERR_PTR @hv_vhca the function returns 1, not a
+ * negative errno.
+ */
+int mlx5_hv_vhca_init(struct mlx5_hv_vhca *hv_vhca)
+{
+	if (!IS_ERR_OR_NULL(hv_vhca))
+		return mlx5_hv_register_invalidate(hv_vhca->dev, hv_vhca,
+						   mlx5_hv_vhca_invalidate);
+
+	return 1;
+}
+
+/*
+ * Warn about every agent that is still registered, then unregister the
+ * invalidate callback of the device. NULL and ERR_PTR are ignored.
+ */
+void mlx5_hv_vhca_cleanup(struct mlx5_hv_vhca *hv_vhca)
+{
+	int slot = 0;
+
+	if (IS_ERR_OR_NULL(hv_vhca))
+		return;
+
+	mutex_lock(&hv_vhca->agents_lock);
+	while (slot < MLX5_HV_VHCA_AGENT_MAX) {
+		WARN_ON(hv_vhca->agents[slot]);
+		slot++;
+	}
+	mutex_unlock(&hv_vhca->agents_lock);
+
+	mlx5_hv_unregister_invalidate(hv_vhca->dev);
+}
+
+/*
+ * Create an agent of @type and register it in @hv_vhca.
+ *
+ * @control, @invalidate and @cleanup are stored as the agent callbacks and
+ * @priv as its context. Returns the agent, or an ERR_PTR: -ENOMEM for an
+ * invalid @hv_vhca or a failed allocation, -EINVAL for an out-of-range
+ * @type or a slot that is already taken.
+ */
+struct mlx5_hv_vhca_agent *
+mlx5_hv_vhca_agent_create(struct mlx5_hv_vhca *hv_vhca,
+			  enum mlx5_hv_vhca_agent_type type,
+			  void (*control)(struct mlx5_hv_vhca_agent*,
+					  struct mlx5_hv_vhca_control_block *block),
+			  void (*invalidate)(struct mlx5_hv_vhca_agent*,
+					     u64 block_mask),
+			  void (*cleanup)(struct mlx5_hv_vhca_agent *agent),
+			  void *priv)
+{
+	struct mlx5_hv_vhca_agent *agent;
+
+	if (IS_ERR_OR_NULL(hv_vhca))
+		return ERR_PTR(-ENOMEM);
+
+	if (type >= MLX5_HV_VHCA_AGENT_MAX)
+		return ERR_PTR(-EINVAL);
+
+	agent = kzalloc(sizeof(*agent), GFP_KERNEL);
+	if (!agent)
+		return ERR_PTR(-ENOMEM);
+
+	agent->type      = type;
+	agent->hv_vhca   = hv_vhca;
+	agent->priv      = priv;
+	agent->control   = control;
+	agent->invalidate = invalidate;
+	agent->cleanup   = cleanup;
+
+	/*
+	 * Check the slot and claim it in one critical section. Dropping the
+	 * lock in between would let two creators of the same type both pass
+	 * the check, and the second would overwrite the first.
+	 */
+	mutex_lock(&hv_vhca->agents_lock);
+	if (hv_vhca->agents[type]) {
+		mutex_unlock(&hv_vhca->agents_lock);
+		kfree(agent);
+		return ERR_PTR(-EINVAL);
+	}
+	hv_vhca->agents[type] = agent;
+	mutex_unlock(&hv_vhca->agents_lock);
+
+	return agent;
+}
+
+/*
+ * Unregister @agent, run its cleanup callback if it has one, and free it.
+ * Warns and does nothing when @agent is not the one stored in its slot.
+ */
+void mlx5_hv_vhca_agent_destroy(struct mlx5_hv_vhca_agent *agent)
+{
+	struct mlx5_hv_vhca *hv_vhca = agent->hv_vhca;
+	bool registered;
+
+	mutex_lock(&hv_vhca->agents_lock);
+	registered = !WARN_ON(agent != hv_vhca->agents[agent->type]);
+	if (registered)
+		hv_vhca->agents[agent->type] = NULL;
+	mutex_unlock(&hv_vhca->agents_lock);
+
+	if (!registered)
+		return;
+
+	/* the callback runs after the lock has been dropped */
+	if (agent->cleanup)
+		agent->cleanup(agent);
+
+	kfree(agent);
+}
+
+/*
+ * Fill @data_block with the agent sequence number, the current block
+ * index (*@offset, which is then incremented) and up to
+ * sizeof(data_block->data) bytes from @src. Returns the number of bytes
+ * copied.
+ */
+static int mlx5_hv_vhca_data_block_prepare(struct mlx5_hv_vhca_agent *agent,
+					   struct mlx5_hv_vhca_data_block *data_block,
+					   void *src, int len, int *offset)
+{
+	int chunk = min_t(int, (int)sizeof(data_block->data), len);
+
+	data_block->sequence = agent->seq;
+	data_block->offset   = *offset;
+	*offset += 1;
+	memcpy(data_block->data, src, chunk);
+
+	return chunk;
+}
+
+/* Advance the sequence number of @agent by one. */
+static void mlx5_hv_vhca_agent_seq_update(struct mlx5_hv_vhca_agent *agent)
+{
+	agent->seq += 1;
+}
+
+/*
+ * Send @len bytes from @buf as a series of data blocks.
+ *
+ * Every block is written to the same config offset, derived from the
+ * agent type, and carries an increasing block index. The agent sequence
+ * number is advanced once all blocks were written. Returns 0 on success,
+ * -ENOMEM when a block would be empty, or the mlx5_hv_write_config() error.
+ */
+int mlx5_hv_vhca_agent_write(struct mlx5_hv_vhca_agent *agent,
+			     void *buf, int len)
+{
+	int offset = agent->type * HV_CONFIG_BLOCK_SIZE_MAX;
+	int block_offset = 0;
+	int done = 0;
+
+	while (len) {
+		struct mlx5_hv_vhca_data_block data_block = {0};
+		int chunk, err;
+
+		chunk = mlx5_hv_vhca_data_block_prepare(agent, &data_block,
+							buf + done,
+							len, &block_offset);
+		if (!chunk)
+			return -ENOMEM;
+
+		err = mlx5_hv_write_config(agent->hv_vhca->dev, &data_block,
+					   sizeof(data_block), offset);
+		if (err)
+			return err;
+
+		done += chunk;
+		len  -= chunk;
+	}
+
+	mlx5_hv_vhca_agent_seq_update(agent);
+
+	return 0;
+}
+
+/* Return the context pointer that was stored when @agent was created. */
+void *mlx5_hv_vhca_agent_priv(struct mlx5_hv_vhca_agent *agent)
+{
+	void *ctx = agent->priv;
+
+	return ctx;
+}
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lib/hv_vhca.h b/drivers/net/ethernet/mellanox/mlx5/core/lib/hv_vhca.h
new file mode 100644
index 0000000..cdf1303
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlx5/core/lib/hv_vhca.h
@@ -0,0 +1,102 @@
+/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */
+/* Copyright (c) 2019 Mellanox Technologies. */
+
+#ifndef __LIB_HV_VHCA_H__
+#define __LIB_HV_VHCA_H__
+
+#include "en.h"
+#include "lib/hv.h"
+
+struct mlx5_hv_vhca_agent;
+struct mlx5_hv_vhca;
+struct mlx5_hv_vhca_control_block;
+
+enum mlx5_hv_vhca_agent_type {
+	MLX5_HV_VHCA_AGENT_MAX = 32,
+};
+
+#if IS_ENABLED(CONFIG_PCI_HYPERV_INTERFACE)
+
+struct mlx5_hv_vhca_control_block {
+	u32     capabilities;
+	u32     control;
+	u16     command;
+	u16     command_ack;
+	u16     version;
+	u16     rings;
+	u32     reserved1[28];
+};
+
+struct mlx5_hv_vhca *mlx5_hv_vhca_create(struct mlx5_core_dev *dev);
+void mlx5_hv_vhca_destroy(struct mlx5_hv_vhca *hv_vhca);
+int mlx5_hv_vhca_init(struct mlx5_hv_vhca *hv_vhca);
+void mlx5_hv_vhca_cleanup(struct mlx5_hv_vhca *hv_vhca);
+void mlx5_hv_vhca_invalidate(void *context, u64 block_mask);
+
+struct mlx5_hv_vhca_agent *
+mlx5_hv_vhca_agent_create(struct mlx5_hv_vhca *hv_vhca,
+			  enum mlx5_hv_vhca_agent_type type,
+			  void (*control)(struct mlx5_hv_vhca_agent*,
+					  struct mlx5_hv_vhca_control_block *block),
+			  void (*invalidate)(struct mlx5_hv_vhca_agent*,
+					     u64 block_mask),
+			  void (*cleanup)(struct mlx5_hv_vhca_agent *agent),
+			  void *context);
+
+void mlx5_hv_vhca_agent_destroy(struct mlx5_hv_vhca_agent *agent);
+int mlx5_hv_vhca_agent_write(struct mlx5_hv_vhca_agent *agent,
+			     void *buf, int len);
+void *mlx5_hv_vhca_agent_priv(struct mlx5_hv_vhca_agent *agent);
+
+#else
+
+static inline struct mlx5_hv_vhca *
+mlx5_hv_vhca_create(struct mlx5_core_dev *dev)
+{
+	return NULL;
+}
+
+static inline void mlx5_hv_vhca_destroy(struct mlx5_hv_vhca *hv_vhca)
+{
+}
+
+static inline int mlx5_hv_vhca_init(struct mlx5_hv_vhca *hv_vhca)
+{
+	return 0;
+}
+
+static inline void mlx5_hv_vhca_cleanup(struct mlx5_hv_vhca *hv_vhca)
+{
+}
+
+static inline void mlx5_hv_vhca_invalidate(void *context,
+					   u64 block_mask)
+{
+}
+
+static inline struct mlx5_hv_vhca_agent *
+mlx5_hv_vhca_agent_create(struct mlx5_hv_vhca *hv_vhca,
+			  enum mlx5_hv_vhca_agent_type type,
+			  void (*control)(struct mlx5_hv_vhca_agent*,
+					  struct mlx5_hv_vhca_control_block *block),
+			  void (*invalidate)(struct mlx5_hv_vhca_agent*,
+					     u64 block_mask),
+			  void (*cleanup)(struct mlx5_hv_vhca_agent *agent),
+			  void *context)
+{
+	return NULL;
+}
+
+static inline void mlx5_hv_vhca_agent_destroy(struct mlx5_hv_vhca_agent *agent)
+{
+}
+
+static inline int
+mlx5_hv_vhca_write_agent(struct mlx5_hv_vhca_agent *agent,
+			 void *buf, int len)
+{
+	return 0;
+}
+#endif
+
+#endif /* __LIB_HV_VHCA_H__ */
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/main.c b/drivers/net/ethernet/mellanox/mlx5/core/main.c
index 0b70b1d..61388ca 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/main.c
@@ -69,6 +69,7 @@
 #include "lib/pci_vsc.h"
 #include "diag/fw_tracer.h"
 #include "ecpf.h"
+#include "lib/hv_vhca.h"
 
 MODULE_AUTHOR("Eli Cohen <eli@mellanox.com>");
 MODULE_DESCRIPTION("Mellanox 5th generation network adapters (ConnectX series) core driver");
@@ -870,6 +871,7 @@ static int mlx5_init_once(struct mlx5_core_dev *dev)
 	}
 
 	dev->tracer = mlx5_fw_tracer_create(dev);
+	dev->hv_vhca = mlx5_hv_vhca_create(dev);
 
 	return 0;
 
@@ -900,6 +902,7 @@ static int mlx5_init_once(struct mlx5_core_dev *dev)
 
 static void mlx5_cleanup_once(struct mlx5_core_dev *dev)
 {
+	mlx5_hv_vhca_destroy(dev->hv_vhca);
 	mlx5_fw_tracer_destroy(dev->tracer);
 	mlx5_fpga_cleanup(dev);
 	mlx5_eswitch_cleanup(dev->priv.eswitch);
@@ -1067,6 +1070,8 @@ static int mlx5_load(struct mlx5_core_dev *dev)
 		goto err_fw_tracer;
 	}
 
+	mlx5_hv_vhca_init(dev->hv_vhca);
+
 	err = mlx5_fpga_device_start(dev);
 	if (err) {
 		mlx5_core_err(dev, "fpga device start failed %d\n", err);
@@ -1122,6 +1127,7 @@ static int mlx5_load(struct mlx5_core_dev *dev)
 err_ipsec_start:
 	mlx5_fpga_device_stop(dev);
 err_fpga_start:
+	mlx5_hv_vhca_cleanup(dev->hv_vhca);
 	mlx5_fw_tracer_cleanup(dev->tracer);
 err_fw_tracer:
 	mlx5_eq_table_destroy(dev);
@@ -1142,6 +1148,7 @@ static void mlx5_unload(struct mlx5_core_dev *dev)
 	mlx5_accel_ipsec_cleanup(dev);
 	mlx5_accel_tls_cleanup(dev);
 	mlx5_fpga_device_stop(dev);
+	mlx5_hv_vhca_cleanup(dev->hv_vhca);
 	mlx5_fw_tracer_cleanup(dev->tracer);
 	mlx5_eq_table_destroy(dev);
 	mlx5_irq_table_destroy(dev);
diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h
index df23f17..13b4cf2 100644
--- a/include/linux/mlx5/driver.h
+++ b/include/linux/mlx5/driver.h
@@ -659,6 +659,7 @@ struct mlx5_clock {
 struct mlx5_fw_tracer;
 struct mlx5_vxlan;
 struct mlx5_geneve;
+struct mlx5_hv_vhca;
 
 struct mlx5_core_dev {
 	struct device *device;
@@ -706,6 +707,7 @@ struct mlx5_core_dev {
 	struct mlx5_ib_clock_info  *clock_info;
 	struct mlx5_fw_tracer   *tracer;
 	u32                      vsc_addr;
+	struct mlx5_hv_vhca	*hv_vhca;
 };
 
 struct mlx5_db {
-- 
1.8.3.1


^ permalink raw reply related

* [PATCH net-next,v4, 5/6] net/mlx5: Add HV VHCA control agent
From: Haiyang Zhang @ 2019-08-22  5:05 UTC (permalink / raw)
  To: sashal@kernel.org, davem@davemloft.net, saeedm@mellanox.com,
	leon@kernel.org, eranbe@mellanox.com, lorenzo.pieralisi@arm.com,
	bhelgaas@google.com, linux-pci@vger.kernel.org,
	linux-hyperv@vger.kernel.org, netdev@vger.kernel.org
  Cc: Haiyang Zhang, KY Srinivasan, Stephen Hemminger,
	linux-kernel@vger.kernel.org
In-Reply-To: <1566450236-36757-1-git-send-email-haiyangz@microsoft.com>

From: Eran Ben Elisha <eranbe@mellanox.com>

The control agent is responsible for the control block (ID 0). It should
update the PF via this block about every capability change. In addition,
upon block 0 invalidate, it should activate all other supported agents
with data requests from the PF.

Upon agent create/destroy, the invalidate callback of the control agent
is called in order to update the PF driver about this change.

The control agent is an integral part of HV VHCA and will be created
and destroyed as part of the HV VHCA init/cleanup flow.

Signed-off-by: Eran Ben Elisha <eranbe@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
Signed-off-by: Haiyang Zhang <haiyangz@microsoft.com>
---
 .../net/ethernet/mellanox/mlx5/core/lib/hv_vhca.c  | 122 ++++++++++++++++++++-
 .../net/ethernet/mellanox/mlx5/core/lib/hv_vhca.h  |   1 +
 2 files changed, 121 insertions(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lib/hv_vhca.c b/drivers/net/ethernet/mellanox/mlx5/core/lib/hv_vhca.c
index 84d1d75..4047629 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/lib/hv_vhca.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/lib/hv_vhca.c
@@ -109,22 +109,131 @@ void mlx5_hv_vhca_invalidate(void *context, u64 block_mask)
 	queue_work(hv_vhca->work_queue, &work->invalidate_work);
 }
 
+#define AGENT_MASK(type) (type ? BIT(type - 1) : 0 /* control */)
+
+static void mlx5_hv_vhca_agents_control(struct mlx5_hv_vhca *hv_vhca,
+					struct mlx5_hv_vhca_control_block *block)
+{
+	int i;
+
+	for (i = 0; i < MLX5_HV_VHCA_AGENT_MAX; i++) {
+		struct mlx5_hv_vhca_agent *agent = hv_vhca->agents[i];
+
+		if (!agent || !agent->control)
+			continue;
+
+		if (!(AGENT_MASK(agent->type) & block->control))
+			continue;
+
+		agent->control(agent, block);
+	}
+}
+
+static void mlx5_hv_vhca_capabilities(struct mlx5_hv_vhca *hv_vhca,
+				      u32 *capabilities)
+{
+	int i;
+
+	for (i = 0; i < MLX5_HV_VHCA_AGENT_MAX; i++) {
+		struct mlx5_hv_vhca_agent *agent = hv_vhca->agents[i];
+
+		if (agent)
+			*capabilities |= AGENT_MASK(agent->type);
+	}
+}
+
+static void
+mlx5_hv_vhca_control_agent_invalidate(struct mlx5_hv_vhca_agent *agent,
+				      u64 block_mask)
+{
+	struct mlx5_hv_vhca *hv_vhca = agent->hv_vhca;
+	struct mlx5_core_dev *dev = hv_vhca->dev;
+	struct mlx5_hv_vhca_control_block *block;
+	u32 capabilities = 0;
+	int err;
+
+	block = kzalloc(sizeof(*block), GFP_KERNEL);
+	if (!block)
+		return;
+
+	err = mlx5_hv_read_config(dev, block, sizeof(*block), 0);
+	if (err)
+		goto free_block;
+
+	mlx5_hv_vhca_capabilities(hv_vhca, &capabilities);
+
+	/* In case no capabilities, send empty block in return */
+	if (!capabilities) {
+		memset(block, 0, sizeof(*block));
+		goto write;
+	}
+
+	if (block->capabilities != capabilities)
+		block->capabilities = capabilities;
+
+	if (block->control & ~capabilities)
+		goto free_block;
+
+	mlx5_hv_vhca_agents_control(hv_vhca, block);
+	block->command_ack = block->command;
+
+write:
+	mlx5_hv_write_config(dev, block, sizeof(*block), 0);
+
+free_block:
+	kfree(block);
+}
+
+static struct mlx5_hv_vhca_agent *
+mlx5_hv_vhca_control_agent_create(struct mlx5_hv_vhca *hv_vhca)
+{
+	return mlx5_hv_vhca_agent_create(hv_vhca, MLX5_HV_VHCA_AGENT_CONTROL,
+					 NULL,
+					 mlx5_hv_vhca_control_agent_invalidate,
+					 NULL, NULL);
+}
+
+static void mlx5_hv_vhca_control_agent_destroy(struct mlx5_hv_vhca_agent *agent)
+{
+	mlx5_hv_vhca_agent_destroy(agent);
+}
+
 int mlx5_hv_vhca_init(struct mlx5_hv_vhca *hv_vhca)
 {
+	struct mlx5_hv_vhca_agent *agent;
+	int err;
+
 	if (IS_ERR_OR_NULL(hv_vhca))
 		return IS_ERR_OR_NULL(hv_vhca);
 
-	return mlx5_hv_register_invalidate(hv_vhca->dev, hv_vhca,
-					   mlx5_hv_vhca_invalidate);
+	err = mlx5_hv_register_invalidate(hv_vhca->dev, hv_vhca,
+					  mlx5_hv_vhca_invalidate);
+	if (err)
+		return err;
+
+	agent = mlx5_hv_vhca_control_agent_create(hv_vhca);
+	if (IS_ERR_OR_NULL(agent)) {
+		mlx5_hv_unregister_invalidate(hv_vhca->dev);
+		return IS_ERR_OR_NULL(agent);
+	}
+
+	hv_vhca->agents[MLX5_HV_VHCA_AGENT_CONTROL] = agent;
+
+	return 0;
 }
 
 void mlx5_hv_vhca_cleanup(struct mlx5_hv_vhca *hv_vhca)
 {
+	struct mlx5_hv_vhca_agent *agent;
 	int i;
 
 	if (IS_ERR_OR_NULL(hv_vhca))
 		return;
 
+	agent = hv_vhca->agents[MLX5_HV_VHCA_AGENT_CONTROL];
+	if (agent)
+		mlx5_hv_vhca_control_agent_destroy(agent);
+
 	mutex_lock(&hv_vhca->agents_lock);
 	for (i = 0; i < MLX5_HV_VHCA_AGENT_MAX; i++)
 		WARN_ON(hv_vhca->agents[i]);
@@ -134,6 +243,11 @@ void mlx5_hv_vhca_cleanup(struct mlx5_hv_vhca *hv_vhca)
 	mlx5_hv_unregister_invalidate(hv_vhca->dev);
 }
 
+static void mlx5_hv_vhca_agents_update(struct mlx5_hv_vhca *hv_vhca)
+{
+	mlx5_hv_vhca_invalidate(hv_vhca, BIT(MLX5_HV_VHCA_AGENT_CONTROL));
+}
+
 struct mlx5_hv_vhca_agent *
 mlx5_hv_vhca_agent_create(struct mlx5_hv_vhca *hv_vhca,
 			  enum mlx5_hv_vhca_agent_type type,
@@ -174,6 +288,8 @@ struct mlx5_hv_vhca_agent *
 	hv_vhca->agents[type] = agent;
 	mutex_unlock(&hv_vhca->agents_lock);
 
+	mlx5_hv_vhca_agents_update(hv_vhca);
+
 	return agent;
 }
 
@@ -195,6 +311,8 @@ void mlx5_hv_vhca_agent_destroy(struct mlx5_hv_vhca_agent *agent)
 		agent->cleanup(agent);
 
 	kfree(agent);
+
+	mlx5_hv_vhca_agents_update(hv_vhca);
 }
 
 static int mlx5_hv_vhca_data_block_prepare(struct mlx5_hv_vhca_agent *agent,
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lib/hv_vhca.h b/drivers/net/ethernet/mellanox/mlx5/core/lib/hv_vhca.h
index cdf1303..984e7ad 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/lib/hv_vhca.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/lib/hv_vhca.h
@@ -12,6 +12,7 @@
 struct mlx5_hv_vhca_control_block;
 
 enum mlx5_hv_vhca_agent_type {
+	MLX5_HV_VHCA_AGENT_CONTROL = 0,
 	MLX5_HV_VHCA_AGENT_MAX = 32,
 };
 
-- 
1.8.3.1


^ permalink raw reply related

* [PATCH net-next,v4, 3/6] net/mlx5: Add wrappers for HyperV PCIe operations
From: Haiyang Zhang @ 2019-08-22  5:05 UTC (permalink / raw)
  To: sashal@kernel.org, davem@davemloft.net, saeedm@mellanox.com,
	leon@kernel.org, eranbe@mellanox.com, lorenzo.pieralisi@arm.com,
	bhelgaas@google.com, linux-pci@vger.kernel.org,
	linux-hyperv@vger.kernel.org, netdev@vger.kernel.org
  Cc: Haiyang Zhang, KY Srinivasan, Stephen Hemminger,
	linux-kernel@vger.kernel.org
In-Reply-To: <1566450236-36757-1-git-send-email-haiyangz@microsoft.com>

From: Eran Ben Elisha <eranbe@mellanox.com>

Add wrapper functions for HyperV PCIe read / write /
block_invalidate_register operations.  This will be used as an
infrastructure in the downstream patch for software communication.

This will be enabled by default if CONFIG_PCI_HYPERV_INTERFACE is set.

Signed-off-by: Eran Ben Elisha <eranbe@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
Signed-off-by: Haiyang Zhang <haiyangz@microsoft.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/Makefile |  1 +
 drivers/net/ethernet/mellanox/mlx5/core/lib/hv.c | 64 ++++++++++++++++++++++++
 drivers/net/ethernet/mellanox/mlx5/core/lib/hv.h | 22 ++++++++
 3 files changed, 87 insertions(+)
 create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/lib/hv.c
 create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/lib/hv.h

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/Makefile b/drivers/net/ethernet/mellanox/mlx5/core/Makefile
index bcf3655..fd32a5b 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/Makefile
+++ b/drivers/net/ethernet/mellanox/mlx5/core/Makefile
@@ -45,6 +45,7 @@ mlx5_core-$(CONFIG_MLX5_ESWITCH)   += eswitch.o eswitch_offloads.o eswitch_offlo
 mlx5_core-$(CONFIG_MLX5_MPFS)      += lib/mpfs.o
 mlx5_core-$(CONFIG_VXLAN)          += lib/vxlan.o
 mlx5_core-$(CONFIG_PTP_1588_CLOCK) += lib/clock.o
+mlx5_core-$(CONFIG_PCI_HYPERV_INTERFACE) += lib/hv.o
 
 #
 # Ipoib netdev
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lib/hv.c b/drivers/net/ethernet/mellanox/mlx5/core/lib/hv.c
new file mode 100644
index 0000000..cf08d02
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlx5/core/lib/hv.c
@@ -0,0 +1,64 @@
+// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
+// Copyright (c) 2018 Mellanox Technologies
+
+#include <linux/hyperv.h>
+#include "mlx5_core.h"
+#include "lib/hv.h"
+
+static int mlx5_hv_config_common(struct mlx5_core_dev *dev, void *buf, int len,
+				 int offset, bool read)
+{
+	int rc = -EOPNOTSUPP;
+	int bytes_returned;
+	int block_id;
+
+	if (offset % HV_CONFIG_BLOCK_SIZE_MAX || len % HV_CONFIG_BLOCK_SIZE_MAX)
+		return -EINVAL;
+
+	block_id = offset / HV_CONFIG_BLOCK_SIZE_MAX;
+
+	rc = read ?
+	     hyperv_read_cfg_blk(dev->pdev, buf,
+				 HV_CONFIG_BLOCK_SIZE_MAX, block_id,
+				 &bytes_returned) :
+	     hyperv_write_cfg_blk(dev->pdev, buf,
+				  HV_CONFIG_BLOCK_SIZE_MAX, block_id);
+
+	/* Make sure len bytes were read successfully  */
+	if (read)
+		rc |= !(len == bytes_returned);
+
+	if (rc) {
+		mlx5_core_err(dev, "Failed to %s hv config, err = %d, len = %d, offset = %d\n",
+			      read ? "read" : "write", rc, len,
+			      offset);
+		return rc;
+	}
+
+	return 0;
+}
+
+int mlx5_hv_read_config(struct mlx5_core_dev *dev, void *buf, int len,
+			int offset)
+{
+	return mlx5_hv_config_common(dev, buf, len, offset, true);
+}
+
+int mlx5_hv_write_config(struct mlx5_core_dev *dev, void *buf, int len,
+			 int offset)
+{
+	return mlx5_hv_config_common(dev, buf, len, offset, false);
+}
+
+int mlx5_hv_register_invalidate(struct mlx5_core_dev *dev, void *context,
+				void (*block_invalidate)(void *context,
+							 u64 block_mask))
+{
+	return hyperv_reg_block_invalidate(dev->pdev, context,
+					   block_invalidate);
+}
+
+void mlx5_hv_unregister_invalidate(struct mlx5_core_dev *dev)
+{
+	hyperv_reg_block_invalidate(dev->pdev, NULL, NULL);
+}
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lib/hv.h b/drivers/net/ethernet/mellanox/mlx5/core/lib/hv.h
new file mode 100644
index 0000000..f9a4557
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlx5/core/lib/hv.h
@@ -0,0 +1,22 @@
+/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */
+/* Copyright (c) 2019 Mellanox Technologies. */
+
+#ifndef __LIB_HV_H__
+#define __LIB_HV_H__
+
+#if IS_ENABLED(CONFIG_PCI_HYPERV_INTERFACE)
+
+#include <linux/hyperv.h>
+#include <linux/mlx5/driver.h>
+
+int mlx5_hv_read_config(struct mlx5_core_dev *dev, void *buf, int len,
+			int offset);
+int mlx5_hv_write_config(struct mlx5_core_dev *dev, void *buf, int len,
+			 int offset);
+int mlx5_hv_register_invalidate(struct mlx5_core_dev *dev, void *context,
+				void (*block_invalidate)(void *context,
+							 u64 block_mask));
+void mlx5_hv_unregister_invalidate(struct mlx5_core_dev *dev);
+#endif
+
+#endif /* __LIB_HV_H__ */
-- 
1.8.3.1


^ permalink raw reply related

* [PATCH net-next,v4, 2/6] PCI: hv: Add a Hyper-V PCI interface driver for software backchannel interface
From: Haiyang Zhang @ 2019-08-22  5:05 UTC (permalink / raw)
  To: sashal@kernel.org, davem@davemloft.net, saeedm@mellanox.com,
	leon@kernel.org, eranbe@mellanox.com, lorenzo.pieralisi@arm.com,
	bhelgaas@google.com, linux-pci@vger.kernel.org,
	linux-hyperv@vger.kernel.org, netdev@vger.kernel.org
  Cc: Haiyang Zhang, KY Srinivasan, Stephen Hemminger,
	linux-kernel@vger.kernel.org
In-Reply-To: <1566450236-36757-1-git-send-email-haiyangz@microsoft.com>

This interface driver is a helper driver that allows other drivers to
have a common interface with the Hyper-V PCI frontend driver.

Signed-off-by: Haiyang Zhang <haiyangz@microsoft.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 MAINTAINERS                              |  1 +
 drivers/pci/Kconfig                      |  1 +
 drivers/pci/controller/Kconfig           |  7 ++++
 drivers/pci/controller/Makefile          |  1 +
 drivers/pci/controller/pci-hyperv-intf.c | 67 ++++++++++++++++++++++++++++++++
 drivers/pci/controller/pci-hyperv.c      | 12 ++++--
 include/linux/hyperv.h                   | 30 ++++++++++----
 7 files changed, 108 insertions(+), 11 deletions(-)
 create mode 100644 drivers/pci/controller/pci-hyperv-intf.c

diff --git a/MAINTAINERS b/MAINTAINERS
index a406947..9860853 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -7469,6 +7469,7 @@ F:	drivers/hid/hid-hyperv.c
 F:	drivers/hv/
 F:	drivers/input/serio/hyperv-keyboard.c
 F:	drivers/pci/controller/pci-hyperv.c
+F:	drivers/pci/controller/pci-hyperv-intf.c
 F:	drivers/net/hyperv/
 F:	drivers/scsi/storvsc_drv.c
 F:	drivers/uio/uio_hv_generic.c
diff --git a/drivers/pci/Kconfig b/drivers/pci/Kconfig
index 2ab9240..c313de9 100644
--- a/drivers/pci/Kconfig
+++ b/drivers/pci/Kconfig
@@ -182,6 +182,7 @@ config PCI_LABEL
 config PCI_HYPERV
         tristate "Hyper-V PCI Frontend"
         depends on X86 && HYPERV && PCI_MSI && PCI_MSI_IRQ_DOMAIN && X86_64
+	select PCI_HYPERV_INTERFACE
         help
           The PCI device frontend driver allows the kernel to import arbitrary
           PCI devices from a PCI backend to support PCI driver domains.
diff --git a/drivers/pci/controller/Kconfig b/drivers/pci/controller/Kconfig
index fe9f9f1..70e0782 100644
--- a/drivers/pci/controller/Kconfig
+++ b/drivers/pci/controller/Kconfig
@@ -281,5 +281,12 @@ config VMD
 	  To compile this driver as a module, choose M here: the
 	  module will be called vmd.
 
+config PCI_HYPERV_INTERFACE
+	tristate "Hyper-V PCI Interface"
+	depends on X86 && HYPERV && PCI_MSI && PCI_MSI_IRQ_DOMAIN && X86_64
+	help
+	  The Hyper-V PCI Interface is a helper driver that lets other drivers
+	  have a common interface with the Hyper-V PCI frontend driver.
+
 source "drivers/pci/controller/dwc/Kconfig"
 endmenu
diff --git a/drivers/pci/controller/Makefile b/drivers/pci/controller/Makefile
index d56a507..a2a22c9 100644
--- a/drivers/pci/controller/Makefile
+++ b/drivers/pci/controller/Makefile
@@ -4,6 +4,7 @@ obj-$(CONFIG_PCIE_CADENCE_HOST) += pcie-cadence-host.o
 obj-$(CONFIG_PCIE_CADENCE_EP) += pcie-cadence-ep.o
 obj-$(CONFIG_PCI_FTPCI100) += pci-ftpci100.o
 obj-$(CONFIG_PCI_HYPERV) += pci-hyperv.o
+obj-$(CONFIG_PCI_HYPERV_INTERFACE) += pci-hyperv-intf.o
 obj-$(CONFIG_PCI_MVEBU) += pci-mvebu.o
 obj-$(CONFIG_PCI_AARDVARK) += pci-aardvark.o
 obj-$(CONFIG_PCI_TEGRA) += pci-tegra.o
diff --git a/drivers/pci/controller/pci-hyperv-intf.c b/drivers/pci/controller/pci-hyperv-intf.c
new file mode 100644
index 0000000..cc96be4
--- /dev/null
+++ b/drivers/pci/controller/pci-hyperv-intf.c
@@ -0,0 +1,67 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) Microsoft Corporation.
+ *
+ * Author:
+ *   Haiyang Zhang <haiyangz@microsoft.com>
+ *
+ * This small module is a helper driver that allows other drivers to
+ * have a common interface with the Hyper-V PCI frontend driver.
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/hyperv.h>
+
+struct hyperv_pci_block_ops hvpci_block_ops;
+EXPORT_SYMBOL_GPL(hvpci_block_ops);
+
+int hyperv_read_cfg_blk(struct pci_dev *dev, void *buf, unsigned int buf_len,
+			unsigned int block_id, unsigned int *bytes_returned)
+{
+	if (!hvpci_block_ops.read_block)
+		return -EOPNOTSUPP;
+
+	return hvpci_block_ops.read_block(dev, buf, buf_len, block_id,
+					  bytes_returned);
+}
+EXPORT_SYMBOL_GPL(hyperv_read_cfg_blk);
+
+int hyperv_write_cfg_blk(struct pci_dev *dev, void *buf, unsigned int len,
+			 unsigned int block_id)
+{
+	if (!hvpci_block_ops.write_block)
+		return -EOPNOTSUPP;
+
+	return hvpci_block_ops.write_block(dev, buf, len, block_id);
+}
+EXPORT_SYMBOL_GPL(hyperv_write_cfg_blk);
+
+int hyperv_reg_block_invalidate(struct pci_dev *dev, void *context,
+				void (*block_invalidate)(void *context,
+							 u64 block_mask))
+{
+	if (!hvpci_block_ops.reg_blk_invalidate)
+		return -EOPNOTSUPP;
+
+	return hvpci_block_ops.reg_blk_invalidate(dev, context,
+						  block_invalidate);
+}
+EXPORT_SYMBOL_GPL(hyperv_reg_block_invalidate);
+
+static void __exit exit_hv_pci_intf(void)
+{
+}
+
+static int __init init_hv_pci_intf(void)
+{
+	return 0;
+}
+
+module_init(init_hv_pci_intf);
+module_exit(exit_hv_pci_intf);
+
+MODULE_DESCRIPTION("Hyper-V PCI Interface");
+MODULE_LICENSE("GPL v2");
diff --git a/drivers/pci/controller/pci-hyperv.c b/drivers/pci/controller/pci-hyperv.c
index 57adeca..9c93ac2 100644
--- a/drivers/pci/controller/pci-hyperv.c
+++ b/drivers/pci/controller/pci-hyperv.c
@@ -983,7 +983,6 @@ int hv_read_config_block(struct pci_dev *pdev, void *buf, unsigned int len,
 	*bytes_returned = comp_pkt.bytes_returned;
 	return 0;
 }
-EXPORT_SYMBOL(hv_read_config_block);
 
 /**
  * hv_pci_write_config_compl() - Invoked when a response packet for a write
@@ -1070,7 +1069,6 @@ int hv_write_config_block(struct pci_dev *pdev, void *buf, unsigned int len,
 
 	return 0;
 }
-EXPORT_SYMBOL(hv_write_config_block);
 
 /**
  * hv_register_block_invalidate() - Invoked when a config block invalidation
@@ -1101,7 +1099,6 @@ int hv_register_block_invalidate(struct pci_dev *pdev, void *context,
 	return 0;
 
 }
-EXPORT_SYMBOL(hv_register_block_invalidate);
 
 /* Interrupt management hooks */
 static void hv_int_desc_free(struct hv_pci_dev *hpdev,
@@ -3045,10 +3042,19 @@ static int hv_pci_remove(struct hv_device *hdev)
 static void __exit exit_hv_pci_drv(void)
 {
 	vmbus_driver_unregister(&hv_pci_drv);
+
+	hvpci_block_ops.read_block = NULL;
+	hvpci_block_ops.write_block = NULL;
+	hvpci_block_ops.reg_blk_invalidate = NULL;
 }
 
 static int __init init_hv_pci_drv(void)
 {
+	/* Initialize PCI block r/w interface */
+	hvpci_block_ops.read_block = hv_read_config_block;
+	hvpci_block_ops.write_block = hv_write_config_block;
+	hvpci_block_ops.reg_blk_invalidate = hv_register_block_invalidate;
+
 	return vmbus_driver_register(&hv_pci_drv);
 }
 
diff --git a/include/linux/hyperv.h b/include/linux/hyperv.h
index 9d37f8c..2afe6fd 100644
--- a/include/linux/hyperv.h
+++ b/include/linux/hyperv.h
@@ -1579,18 +1579,32 @@ struct vmpacket_descriptor *
 	    pkt = hv_pkt_iter_next(channel, pkt))
 
 /*
- * Functions for passing data between SR-IOV PF and VF drivers.  The VF driver
+ * Interface for passing data between SR-IOV PF and VF drivers. The VF driver
  * sends requests to read and write blocks. Each block must be 128 bytes or
  * smaller. Optionally, the VF driver can register a callback function which
  * will be invoked when the host says that one or more of the first 64 block
  * IDs is "invalid" which means that the VF driver should reread them.
  */
 #define HV_CONFIG_BLOCK_SIZE_MAX 128
-int hv_read_config_block(struct pci_dev *dev, void *buf, unsigned int buf_len,
-			 unsigned int block_id, unsigned int *bytes_returned);
-int hv_write_config_block(struct pci_dev *dev, void *buf, unsigned int len,
-			  unsigned int block_id);
-int hv_register_block_invalidate(struct pci_dev *dev, void *context,
-				 void (*block_invalidate)(void *context,
-							  u64 block_mask));
+
+int hyperv_read_cfg_blk(struct pci_dev *dev, void *buf, unsigned int buf_len,
+			unsigned int block_id, unsigned int *bytes_returned);
+int hyperv_write_cfg_blk(struct pci_dev *dev, void *buf, unsigned int len,
+			 unsigned int block_id);
+int hyperv_reg_block_invalidate(struct pci_dev *dev, void *context,
+				void (*block_invalidate)(void *context,
+							 u64 block_mask));
+
+struct hyperv_pci_block_ops {
+	int (*read_block)(struct pci_dev *dev, void *buf, unsigned int buf_len,
+			  unsigned int block_id, unsigned int *bytes_returned);
+	int (*write_block)(struct pci_dev *dev, void *buf, unsigned int len,
+			   unsigned int block_id);
+	int (*reg_blk_invalidate)(struct pci_dev *dev, void *context,
+				  void (*block_invalidate)(void *context,
+							   u64 block_mask));
+};
+
+extern struct hyperv_pci_block_ops hvpci_block_ops;
+
 #endif /* _HYPERV_H */
-- 
1.8.3.1


^ permalink raw reply related

* [PATCH net-next,v4, 1/6] PCI: hv: Add a paravirtual backchannel in software
From: Haiyang Zhang @ 2019-08-22  5:05 UTC (permalink / raw)
  To: sashal@kernel.org, davem@davemloft.net, saeedm@mellanox.com,
	leon@kernel.org, eranbe@mellanox.com, lorenzo.pieralisi@arm.com,
	bhelgaas@google.com, linux-pci@vger.kernel.org,
	linux-hyperv@vger.kernel.org, netdev@vger.kernel.org
  Cc: Haiyang Zhang, KY Srinivasan, Stephen Hemminger,
	linux-kernel@vger.kernel.org, Dexuan Cui, Jake Oshins
In-Reply-To: <1566450236-36757-1-git-send-email-haiyangz@microsoft.com>

From: Dexuan Cui <decui@microsoft.com>

Windows SR-IOV provides a backchannel mechanism in software for communication
between a VF driver and a PF driver.  These "configuration blocks" are
similar in concept to PCI configuration space, but instead of doing reads and
writes in 32-bit chunks through a very slow path, packets of up to 128 bytes
can be sent or received asynchronously.

Nearly every SR-IOV device contains just such a communications channel in
hardware, so using this one in software is usually optional.  Using the
software channel, however, allows driver implementers to leverage software
tools that fuzz the communications channel looking for vulnerabilities.

The usage model for these packets puts the responsibility for reading or
writing on the VF driver.  The VF driver sends a read or a write packet,
indicating which "block" is being referred to by number.

If the PF driver wishes to initiate communication, it can "invalidate" one or
more of the first 64 blocks.  This invalidation is delivered via a callback
supplied by the VF driver to this driver.

No protocol is implied, except that supplied by the PF and VF drivers.

Signed-off-by: Jake Oshins <jakeo@microsoft.com>
Signed-off-by: Dexuan Cui <decui@microsoft.com>
Cc: Haiyang Zhang <haiyangz@microsoft.com>
Cc: K. Y. Srinivasan <kys@microsoft.com>
Cc: Stephen Hemminger <sthemmin@microsoft.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
Signed-off-by: Haiyang Zhang <haiyangz@microsoft.com>
---
 drivers/pci/controller/pci-hyperv.c | 302 ++++++++++++++++++++++++++++++++++++
 include/linux/hyperv.h              |  15 ++
 2 files changed, 317 insertions(+)

diff --git a/drivers/pci/controller/pci-hyperv.c b/drivers/pci/controller/pci-hyperv.c
index 40b6254..57adeca 100644
--- a/drivers/pci/controller/pci-hyperv.c
+++ b/drivers/pci/controller/pci-hyperv.c
@@ -365,6 +365,39 @@ struct pci_delete_interrupt {
 	struct tran_int_desc int_desc;
 } __packed;
 
+/*
+ * Note: the VM must pass a valid block id, wslot and bytes_requested.
+ */
+struct pci_read_block {
+	struct pci_message message_type;
+	u32 block_id;
+	union win_slot_encoding wslot;
+	u32 bytes_requested;
+} __packed;
+
+struct pci_read_block_response {
+	struct vmpacket_descriptor hdr;
+	u32 status;
+	u8 bytes[HV_CONFIG_BLOCK_SIZE_MAX];
+} __packed;
+
+/*
+ * Note: the VM must pass a valid block id, wslot and byte_count.
+ */
+struct pci_write_block {
+	struct pci_message message_type;
+	u32 block_id;
+	union win_slot_encoding wslot;
+	u32 byte_count;
+	u8 bytes[HV_CONFIG_BLOCK_SIZE_MAX];
+} __packed;
+
+struct pci_dev_inval_block {
+	struct pci_incoming_message incoming;
+	union win_slot_encoding wslot;
+	u64 block_mask;
+} __packed;
+
 struct pci_dev_incoming {
 	struct pci_incoming_message incoming;
 	union win_slot_encoding wslot;
@@ -499,6 +532,9 @@ struct hv_pci_dev {
 	struct hv_pcibus_device *hbus;
 	struct work_struct wrk;
 
+	void (*block_invalidate)(void *context, u64 block_mask);
+	void *invalidate_context;
+
 	/*
 	 * What would be observed if one wrote 0xFFFFFFFF to a BAR and then
 	 * read it back, for each of the BAR offsets within config space.
@@ -817,6 +853,256 @@ static int hv_pcifront_write_config(struct pci_bus *bus, unsigned int devfn,
 	.write = hv_pcifront_write_config,
 };
 
+/*
+ * Paravirtual backchannel
+ *
+ * Hyper-V SR-IOV provides a backchannel mechanism in software for
+ * communication between a VF driver and a PF driver.  These
+ * "configuration blocks" are similar in concept to PCI configuration space,
+ * but instead of doing reads and writes in 32-bit chunks through a very slow
+ * path, packets of up to 128 bytes can be sent or received asynchronously.
+ *
+ * Nearly every SR-IOV device contains just such a communications channel in
+ * hardware, so using this one in software is usually optional.  Using the
+ * software channel, however, allows driver implementers to leverage software
+ * tools that fuzz the communications channel looking for vulnerabilities.
+ *
+ * The usage model for these packets puts the responsibility for reading or
+ * writing on the VF driver.  The VF driver sends a read or a write packet,
+ * indicating which "block" is being referred to by number.
+ *
+ * If the PF driver wishes to initiate communication, it can "invalidate" one or
+ * more of the first 64 blocks.  This invalidation is delivered via a callback
+ * supplied by the VF driver to this driver.
+ *
+ * No protocol is implied, except that supplied by the PF and VF drivers.
+ */
+
+struct hv_read_config_compl {
+	struct hv_pci_compl comp_pkt;
+	void *buf;
+	unsigned int len;
+	unsigned int bytes_returned;
+};
+
+/**
+ * hv_pci_read_config_compl() - Invoked when a response packet
+ * for a read config block operation arrives.
+ * @context:		Identifies the read config operation
+ * @resp:		The response packet itself
+ * @resp_packet_size:	Size in bytes of the response packet
+ */
+static void hv_pci_read_config_compl(void *context, struct pci_response *resp,
+				     int resp_packet_size)
+{
+	struct hv_read_config_compl *comp = context;
+	struct pci_read_block_response *read_resp =
+		(struct pci_read_block_response *)resp;
+	unsigned int data_len, hdr_len;
+
+	hdr_len = offsetof(struct pci_read_block_response, bytes);
+	if (resp_packet_size < hdr_len) {
+		comp->comp_pkt.completion_status = -1;
+		goto out;
+	}
+
+	data_len = resp_packet_size - hdr_len;
+	if (data_len > 0 && read_resp->status == 0) {
+		comp->bytes_returned = min(comp->len, data_len);
+		memcpy(comp->buf, read_resp->bytes, comp->bytes_returned);
+	} else {
+		comp->bytes_returned = 0;
+	}
+
+	comp->comp_pkt.completion_status = read_resp->status;
+out:
+	complete(&comp->comp_pkt.host_event);
+}
+
+/**
+ * hv_read_config_block() - Sends a read config block request to
+ * the back-end driver running in the Hyper-V parent partition.
+ * @pdev:		The PCI driver's representation for this device.
+ * @buf:		Buffer into which the config block will be copied.
+ * @len:		Size in bytes of buf.
+ * @block_id:		Identifies the config block which has been requested.
+ * @bytes_returned:	Size which came back from the back-end driver.
+ *
+ * Return: 0 on success, -errno on failure
+ */
+int hv_read_config_block(struct pci_dev *pdev, void *buf, unsigned int len,
+			 unsigned int block_id, unsigned int *bytes_returned)
+{
+	struct hv_pcibus_device *hbus =
+		container_of(pdev->bus->sysdata, struct hv_pcibus_device,
+			     sysdata);
+	struct {
+		struct pci_packet pkt;
+		char buf[sizeof(struct pci_read_block)];
+	} pkt;
+	struct hv_read_config_compl comp_pkt;
+	struct pci_read_block *read_blk;
+	int ret;
+
+	if (len == 0 || len > HV_CONFIG_BLOCK_SIZE_MAX)
+		return -EINVAL;
+
+	init_completion(&comp_pkt.comp_pkt.host_event);
+	comp_pkt.buf = buf;
+	comp_pkt.len = len;
+
+	memset(&pkt, 0, sizeof(pkt));
+	pkt.pkt.completion_func = hv_pci_read_config_compl;
+	pkt.pkt.compl_ctxt = &comp_pkt;
+	read_blk = (struct pci_read_block *)&pkt.pkt.message;
+	read_blk->message_type.type = PCI_READ_BLOCK;
+	read_blk->wslot.slot = devfn_to_wslot(pdev->devfn);
+	read_blk->block_id = block_id;
+	read_blk->bytes_requested = len;
+
+	ret = vmbus_sendpacket(hbus->hdev->channel, read_blk,
+			       sizeof(*read_blk), (unsigned long)&pkt.pkt,
+			       VM_PKT_DATA_INBAND,
+			       VMBUS_DATA_PACKET_FLAG_COMPLETION_REQUESTED);
+	if (ret)
+		return ret;
+
+	ret = wait_for_response(hbus->hdev, &comp_pkt.comp_pkt.host_event);
+	if (ret)
+		return ret;
+
+	if (comp_pkt.comp_pkt.completion_status != 0 ||
+	    comp_pkt.bytes_returned == 0) {
+		dev_err(&hbus->hdev->device,
+			"Read Config Block failed: 0x%x, bytes_returned=%d\n",
+			comp_pkt.comp_pkt.completion_status,
+			comp_pkt.bytes_returned);
+		return -EIO;
+	}
+
+	*bytes_returned = comp_pkt.bytes_returned;
+	return 0;
+}
+EXPORT_SYMBOL(hv_read_config_block);
+
+/**
+ * hv_pci_write_config_compl() - Invoked when a response packet for a write
+ * config block operation arrives.
+ * @context:		Identifies the write config operation
+ * @resp:		The response packet itself
+ * @resp_packet_size:	Size in bytes of the response packet
+ */
+static void hv_pci_write_config_compl(void *context, struct pci_response *resp,
+				      int resp_packet_size)
+{
+	struct hv_pci_compl *comp_pkt = context;
+
+	comp_pkt->completion_status = resp->status;
+	complete(&comp_pkt->host_event);
+}
+
+/**
+ * hv_write_config_block() - Sends a write config block request to the
+ * back-end driver running in the Hyper-V parent partition.
+ * @pdev:		The PCI driver's representation for this device.
+ * @buf:		Buffer from which the config block will	be copied.
+ * @len:		Size in bytes of buf.
+ * @block_id:		Identifies the config block which is being written.
+ *
+ * Return: 0 on success, -errno on failure
+ */
+int hv_write_config_block(struct pci_dev *pdev, void *buf, unsigned int len,
+			  unsigned int block_id)
+{
+	struct hv_pcibus_device *hbus =
+		container_of(pdev->bus->sysdata, struct hv_pcibus_device,
+			     sysdata);
+	struct {
+		struct pci_packet pkt;
+		char buf[sizeof(struct pci_write_block)];
+		u32 reserved;
+	} pkt;
+	struct hv_pci_compl comp_pkt;
+	struct pci_write_block *write_blk;
+	u32 pkt_size;
+	int ret;
+
+	if (len == 0 || len > HV_CONFIG_BLOCK_SIZE_MAX)
+		return -EINVAL;
+
+	init_completion(&comp_pkt.host_event);
+
+	memset(&pkt, 0, sizeof(pkt));
+	pkt.pkt.completion_func = hv_pci_write_config_compl;
+	pkt.pkt.compl_ctxt = &comp_pkt;
+	write_blk = (struct pci_write_block *)&pkt.pkt.message;
+	write_blk->message_type.type = PCI_WRITE_BLOCK;
+	write_blk->wslot.slot = devfn_to_wslot(pdev->devfn);
+	write_blk->block_id = block_id;
+	write_blk->byte_count = len;
+	memcpy(write_blk->bytes, buf, len);
+	pkt_size = offsetof(struct pci_write_block, bytes) + len;
+	/*
+	 * This quirk is required on some hosts shipped around 2018, because
+	 * these hosts don't check the pkt_size correctly (new hosts have been
+	 * fixed since early 2019). The quirk is also safe on very old hosts
+	 * and new hosts, because, on them, what really matters is the length
+	 * specified in write_blk->byte_count.
+	 */
+	pkt_size += sizeof(pkt.reserved);
+
+	ret = vmbus_sendpacket(hbus->hdev->channel, write_blk, pkt_size,
+			       (unsigned long)&pkt.pkt, VM_PKT_DATA_INBAND,
+			       VMBUS_DATA_PACKET_FLAG_COMPLETION_REQUESTED);
+	if (ret)
+		return ret;
+
+	ret = wait_for_response(hbus->hdev, &comp_pkt.host_event);
+	if (ret)
+		return ret;
+
+	if (comp_pkt.completion_status != 0) {
+		dev_err(&hbus->hdev->device,
+			"Write Config Block failed: 0x%x\n",
+			comp_pkt.completion_status);
+		return -EIO;
+	}
+
+	return 0;
+}
+EXPORT_SYMBOL(hv_write_config_block);
+
+/**
+ * hv_register_block_invalidate() - Registers the callback invoked when a
+ * config block invalidation arrives from the back-end driver.
+ * @pdev:		The PCI driver's representation for this device.
+ * @context:		Opaque pointer passed back to the callback.
+ * @block_invalidate:	Callback that receives the invalidated block mask.
+ *
+ * Return: 0 on success, -errno on failure
+ */
+int hv_register_block_invalidate(struct pci_dev *pdev, void *context,
+				 void (*block_invalidate)(void *context,
+							  u64 block_mask))
+{
+	struct hv_pcibus_device *hbus =
+		container_of(pdev->bus->sysdata, struct hv_pcibus_device,
+			     sysdata);
+	struct hv_pci_dev *hpdev;
+
+	hpdev = get_pcichild_wslot(hbus, devfn_to_wslot(pdev->devfn));
+	if (!hpdev)
+		return -ENODEV;
+
+	hpdev->block_invalidate = block_invalidate;
+	hpdev->invalidate_context = context;
+
+	put_pcichild(hpdev);
+	return 0;
+
+}
+EXPORT_SYMBOL(hv_register_block_invalidate);
+
 /* Interrupt management hooks */
 static void hv_int_desc_free(struct hv_pci_dev *hpdev,
 			     struct tran_int_desc *int_desc)
@@ -1968,6 +2254,7 @@ static void hv_pci_onchannelcallback(void *context)
 	struct pci_response *response;
 	struct pci_incoming_message *new_message;
 	struct pci_bus_relations *bus_rel;
+	struct pci_dev_inval_block *inval;
 	struct pci_dev_incoming *dev_message;
 	struct hv_pci_dev *hpdev;
 
@@ -2045,6 +2332,21 @@ static void hv_pci_onchannelcallback(void *context)
 				}
 				break;
 
+			case PCI_INVALIDATE_BLOCK:
+
+				inval = (struct pci_dev_inval_block *)buffer;
+				hpdev = get_pcichild_wslot(hbus,
+							   inval->wslot.slot);
+				if (hpdev) {
+					if (hpdev->block_invalidate) {
+						hpdev->block_invalidate(
+						    hpdev->invalidate_context,
+						    inval->block_mask);
+					}
+					put_pcichild(hpdev);
+				}
+				break;
+
 			default:
 				dev_warn(&hbus->hdev->device,
 					"Unimplemented protocol message %x\n",
diff --git a/include/linux/hyperv.h b/include/linux/hyperv.h
index 6256cc3..9d37f8c 100644
--- a/include/linux/hyperv.h
+++ b/include/linux/hyperv.h
@@ -1578,4 +1578,19 @@ struct vmpacket_descriptor *
 	for (pkt = hv_pkt_iter_first(channel); pkt; \
 	    pkt = hv_pkt_iter_next(channel, pkt))
 
+/*
+ * Functions for passing data between SR-IOV PF and VF drivers.  The VF driver
+ * sends requests to read and write blocks. Each block must be 128 bytes or
+ * smaller. Optionally, the VF driver can register a callback function which
+ * will be invoked when the host says that one or more of the first 64 block
+ * IDs is "invalid" which means that the VF driver should reread them.
+ */
+#define HV_CONFIG_BLOCK_SIZE_MAX 128
+int hv_read_config_block(struct pci_dev *dev, void *buf, unsigned int buf_len,
+			 unsigned int block_id, unsigned int *bytes_returned);
+int hv_write_config_block(struct pci_dev *dev, void *buf, unsigned int len,
+			  unsigned int block_id);
+int hv_register_block_invalidate(struct pci_dev *dev, void *context,
+				 void (*block_invalidate)(void *context,
+							  u64 block_mask));
 #endif /* _HYPERV_H */
-- 
1.8.3.1


^ permalink raw reply related

* [PATCH] x86/Hyper-V: Fix build error with CONFIG_HYPERV_TSCPAGE=N
From: lantianyu1986 @ 2019-08-22  5:38 UTC (permalink / raw)
  To: kys, haiyangz, sthemmin, sashal, tglx, mingo, bp, hpa, x86,
	daniel.lezcano, michael.h.kelley
  Cc: Tianyu Lan, linux-hyperv, linux-kernel

From: Tianyu Lan <Tianyu.Lan@microsoft.com>

Both Hyper-V tsc page and Hyper-V tsc MSR code use variable
hv_sched_clock_offset for their sched clock callback and so
define the variable regardless of CONFIG_HYPERV_TSCPAGE setting.

Signed-off-by: Tianyu Lan <Tianyu.Lan@microsoft.com>
---
This patch is based on the top of "git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git
timers/core".

 drivers/clocksource/hyperv_timer.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/clocksource/hyperv_timer.c b/drivers/clocksource/hyperv_timer.c
index dad8af198e20..c322ab4d3689 100644
--- a/drivers/clocksource/hyperv_timer.c
+++ b/drivers/clocksource/hyperv_timer.c
@@ -22,6 +22,7 @@
 #include <asm/mshyperv.h>
 
 static struct clock_event_device __percpu *hv_clock_event;
+static u64 hv_sched_clock_offset __ro_after_init;
 
 /*
  * If false, we're using the old mechanism for stimer0 interrupts
@@ -215,7 +216,6 @@ EXPORT_SYMBOL_GPL(hyperv_cs);
 #ifdef CONFIG_HYPERV_TSCPAGE
 
 static struct ms_hyperv_tsc_page tsc_pg __aligned(PAGE_SIZE);
-static u64 hv_sched_clock_offset __ro_after_init;
 
 struct ms_hyperv_tsc_page *hv_get_tsc_page(void)
 {
-- 
2.14.5


^ permalink raw reply related

* Re: [PATCH] x86/hyper-v: enable TSC page clocksource on 32bit
From: Vitaly Kuznetsov @ 2019-08-22  7:52 UTC (permalink / raw)
  To: Thomas Gleixner
  Cc: linux-hyperv, linux-kernel, x86, Ingo Molnar, Borislav Petkov,
	H. Peter Anvin, K. Y. Srinivasan, Haiyang Zhang,
	Stephen Hemminger, Sasha Levin, Daniel Lezcano, Michael Kelley,
	Tianyu Lan, Peter Zijlstra
In-Reply-To: <alpine.DEB.2.21.1908212321320.1983@nanos.tec.linutronix.de>

Thomas Gleixner <tglx@linutronix.de> writes:

> On Wed, 21 Aug 2019, Thomas Gleixner wrote:
>
>> On Wed, 21 Aug 2019, Vitaly Kuznetsov wrote:
>> 
>> > There is no particular reason to not enable TSC page clocksource
>> > on 32-bit. mul_u64_u64_shr() is available and despite the increased
>> > computational complexity (compared to 64bit) TSC page is still a huge
>> > win compared to MSR-based clocksource.
>> > 
>> > In-kernel reads:
>> >   MSR based clocksource: 3361 cycles
>> >   TSC page clocksource: 49 cycles
>> > 
>> > Reads from userspace (unilizing vDSO in case of TSC page):
>> >   MSR based clocksource: 5664 cycles
>> >   TSC page clocksource: 131 cycles
>> > 
>> > Enabling TSC page on 32bits allows us to get rid of CONFIG_HYPERV_TSCPAGE
>> 
>> s/allows us/allows/
>> 
>> > as it is now not any different from CONFIG_HYPERV_TIMER.
>> > 
>> > Signed-off-by: Vitaly Kuznetsov <vkuznets@redhat.com>
>> > ---
>> >  arch/x86/include/asm/vdso/gettimeofday.h |  6 +++---
>> >  drivers/clocksource/hyperv_timer.c       | 11 -----------
>> >  drivers/hv/Kconfig                       |  3 ---
>> >  include/clocksource/hyperv_timer.h       |  6 ++----
>> >  4 files changed, 5 insertions(+), 21 deletions(-)
>> 
>> Really nice cleanup as a side effect of adding functionality.
>
> That said, could you please rebase that on
>
>   git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git timers/core
>
> as I just applied the TSC page patches there and this conflicts left and
> right.

Sure, v2 is coming!

-- 
Vitaly

^ permalink raw reply

* [PATCH v2] x86/hyper-v: enable TSC page clocksource on 32bit
From: Vitaly Kuznetsov @ 2019-08-22  8:36 UTC (permalink / raw)
  To: linux-hyperv
  Cc: linux-kernel, x86, Thomas Gleixner, Ingo Molnar, Borislav Petkov,
	H. Peter Anvin, K. Y. Srinivasan, Haiyang Zhang,
	Stephen Hemminger, Sasha Levin, Daniel Lezcano, Michael Kelley,
	Tianyu Lan, Peter Zijlstra

There is no particular reason to not enable TSC page clocksource
on 32-bit. mul_u64_u64_shr() is available and despite the increased
computational complexity (compared to 64bit) TSC page is still a huge
win compared to MSR-based clocksource.

In-kernel reads:
  MSR based clocksource: 3361 cycles
  TSC page clocksource: 49 cycles

Reads from userspace (utilizing vDSO in case of TSC page):
  MSR based clocksource: 5664 cycles
  TSC page clocksource: 131 cycles

Enabling TSC page on 32bit allows getting rid of CONFIG_HYPERV_TSCPAGE as
it is no longer any different from CONFIG_HYPERV_TIMER.

Reviewed-by: Michael Kelley <mikelley@microsoft.com>
Signed-off-by: Vitaly Kuznetsov <vkuznets@redhat.com>
---
Changes since v1:
- Fix a couple of nits in changelog [Michael Kelley, Thomas Gleixner]
- Fixed a comment with CONFIG_HYPERV_TSC_PAGE [Michael Kelley]
- Added Michael's Reviewed-by: tag
- Rebased on timers/core [Thomas Gleixner]
---
 arch/x86/include/asm/vdso/gettimeofday.h |  6 +++---
 drivers/clocksource/hyperv_timer.c       | 11 -----------
 drivers/hv/Kconfig                       |  3 ---
 include/clocksource/hyperv_timer.h       |  8 +++-----
 4 files changed, 6 insertions(+), 22 deletions(-)

diff --git a/arch/x86/include/asm/vdso/gettimeofday.h b/arch/x86/include/asm/vdso/gettimeofday.h
index ae91429129a6..bcbf901befbe 100644
--- a/arch/x86/include/asm/vdso/gettimeofday.h
+++ b/arch/x86/include/asm/vdso/gettimeofday.h
@@ -51,7 +51,7 @@ extern struct pvclock_vsyscall_time_info pvclock_page
 	__attribute__((visibility("hidden")));
 #endif
 
-#ifdef CONFIG_HYPERV_TSCPAGE
+#ifdef CONFIG_HYPERV_TIMER
 extern struct ms_hyperv_tsc_page hvclock_page
 	__attribute__((visibility("hidden")));
 #endif
@@ -192,7 +192,7 @@ static u64 vread_pvclock(void)
 }
 #endif
 
-#ifdef CONFIG_HYPERV_TSCPAGE
+#ifdef CONFIG_HYPERV_TIMER
 static u64 vread_hvclock(void)
 {
 	return hv_read_tsc_page(&hvclock_page);
@@ -215,7 +215,7 @@ static inline u64 __arch_get_hw_counter(s32 clock_mode)
 		return vread_pvclock();
 	}
 #endif
-#ifdef CONFIG_HYPERV_TSCPAGE
+#ifdef CONFIG_HYPERV_TIMER
 	if (clock_mode == VCLOCK_HVCLOCK) {
 		barrier();
 		return vread_hvclock();
diff --git a/drivers/clocksource/hyperv_timer.c b/drivers/clocksource/hyperv_timer.c
index dad8af198e20..51b4d7ba959c 100644
--- a/drivers/clocksource/hyperv_timer.c
+++ b/drivers/clocksource/hyperv_timer.c
@@ -212,8 +212,6 @@ EXPORT_SYMBOL_GPL(hv_stimer_global_cleanup);
 struct clocksource *hyperv_cs;
 EXPORT_SYMBOL_GPL(hyperv_cs);
 
-#ifdef CONFIG_HYPERV_TSCPAGE
-
 static struct ms_hyperv_tsc_page tsc_pg __aligned(PAGE_SIZE);
 static u64 hv_sched_clock_offset __ro_after_init;
 
@@ -245,7 +243,6 @@ static struct clocksource hyperv_cs_tsc = {
 	.mask	= CLOCKSOURCE_MASK(64),
 	.flags	= CLOCK_SOURCE_IS_CONTINUOUS,
 };
-#endif
 
 static u64 notrace read_hv_clock_msr(struct clocksource *arg)
 {
@@ -272,7 +269,6 @@ static struct clocksource hyperv_cs_msr = {
 	.flags	= CLOCK_SOURCE_IS_CONTINUOUS,
 };
 
-#ifdef CONFIG_HYPERV_TSCPAGE
 static bool __init hv_init_tsc_clocksource(void)
 {
 	u64		tsc_msr;
@@ -304,13 +300,6 @@ static bool __init hv_init_tsc_clocksource(void)
 
 	return true;
 }
-#else
-static bool __init hv_init_tsc_clocksource(void)
-{
-	return false;
-}
-#endif
-
 
 void __init hv_init_clocksource(void)
 {
diff --git a/drivers/hv/Kconfig b/drivers/hv/Kconfig
index 9a59957922d4..79e5356a737a 100644
--- a/drivers/hv/Kconfig
+++ b/drivers/hv/Kconfig
@@ -14,9 +14,6 @@ config HYPERV
 config HYPERV_TIMER
 	def_bool HYPERV
 
-config HYPERV_TSCPAGE
-       def_bool HYPERV && X86_64
-
 config HYPERV_UTILS
 	tristate "Microsoft Hyper-V Utilities driver"
 	depends on HYPERV && CONNECTOR && NLS
diff --git a/include/clocksource/hyperv_timer.h b/include/clocksource/hyperv_timer.h
index a821deb8ecb2..422f5e5237be 100644
--- a/include/clocksource/hyperv_timer.h
+++ b/include/clocksource/hyperv_timer.h
@@ -28,12 +28,10 @@ extern void hv_stimer_cleanup(unsigned int cpu);
 extern void hv_stimer_global_cleanup(void);
 extern void hv_stimer0_isr(void);
 
-#if IS_ENABLED(CONFIG_HYPERV)
+#ifdef CONFIG_HYPERV_TIMER
 extern struct clocksource *hyperv_cs;
 extern void hv_init_clocksource(void);
-#endif /* CONFIG_HYPERV */
 
-#ifdef CONFIG_HYPERV_TSCPAGE
 extern struct ms_hyperv_tsc_page *hv_get_tsc_page(void);
 
 static inline notrace u64
@@ -91,7 +89,7 @@ hv_read_tsc_page(const struct ms_hyperv_tsc_page *tsc_pg)
 	return hv_read_tsc_page_tsc(tsc_pg, &cur_tsc);
 }
 
-#else /* CONFIG_HYPERV_TSC_PAGE */
+#else /* CONFIG_HYPERV_TIMER */
 static inline struct ms_hyperv_tsc_page *hv_get_tsc_page(void)
 {
 	return NULL;
@@ -102,6 +100,6 @@ static inline u64 hv_read_tsc_page_tsc(const struct ms_hyperv_tsc_page *tsc_pg,
 {
 	return U64_MAX;
 }
-#endif /* CONFIG_HYPERV_TSCPAGE */
+#endif /* CONFIG_HYPERV_TIMER */
 
 #endif
-- 
2.20.1


^ permalink raw reply related

* Re: [PATCH] x86/Hyper-V: Fix build error with CONFIG_HYPERV_TSCPAGE=N
From: Vitaly Kuznetsov @ 2019-08-22  8:39 UTC (permalink / raw)
  To: lantianyu1986
  Cc: Tianyu Lan, linux-hyperv, linux-kernel, kys, haiyangz, sthemmin,
	sashal, tglx, mingo, bp, hpa, x86, daniel.lezcano,
	michael.h.kelley
In-Reply-To: <20190822053852.239309-1-Tianyu.Lan@microsoft.com>

lantianyu1986@gmail.com writes:

> From: Tianyu Lan <Tianyu.Lan@microsoft.com>
>
> Both Hyper-V tsc page and Hyper-V tsc MSR code use variable
> hv_sched_clock_offset for their sched clock callback and so
> define the variable regardless of CONFIG_HYPERV_TSCPAGE setting.

CONFIG_HYPERV_TSCPAGE is gone after my "x86/hyper-v: enable TSC page
clocksource on 32bit" patch. Do we still have an issue to fix?

>
> Signed-off-by: Tianyu Lan <Tianyu.Lan@microsoft.com>
> ---
> This patch is based on the top of "git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git
> timers/core".
>
>  drivers/clocksource/hyperv_timer.c | 2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)
>
> diff --git a/drivers/clocksource/hyperv_timer.c b/drivers/clocksource/hyperv_timer.c
> index dad8af198e20..c322ab4d3689 100644
> --- a/drivers/clocksource/hyperv_timer.c
> +++ b/drivers/clocksource/hyperv_timer.c
> @@ -22,6 +22,7 @@
>  #include <asm/mshyperv.h>
>  
>  static struct clock_event_device __percpu *hv_clock_event;
> +static u64 hv_sched_clock_offset __ro_after_init;
>  
>  /*
>   * If false, we're using the old mechanism for stimer0 interrupts
> @@ -215,7 +216,6 @@ EXPORT_SYMBOL_GPL(hyperv_cs);
>  #ifdef CONFIG_HYPERV_TSCPAGE
>  
>  static struct ms_hyperv_tsc_page tsc_pg __aligned(PAGE_SIZE);
> -static u64 hv_sched_clock_offset __ro_after_init;
>  
>  struct ms_hyperv_tsc_page *hv_get_tsc_page(void)
>  {

-- 
Vitaly

^ permalink raw reply

* Re: [PATCH] x86/Hyper-V: Fix build error with CONFIG_HYPERV_TSCPAGE=N
From: Tianyu Lan @ 2019-08-22  9:14 UTC (permalink / raw)
  To: Vitaly Kuznetsov
  Cc: Tianyu Lan, linux-hyperv, linux-kernel@vger kernel org,
	KY Srinivasan, Haiyang Zhang, Stephen Hemminger, Sasha Levin,
	Thomas Gleixner, Ingo Molnar, Borislav Petkov, H. Peter Anvin,
	the arch/x86 maintainers, Daniel Lezcano, michael.h.kelley
In-Reply-To: <87zhk1pp9p.fsf@vitty.brq.redhat.com>

On Thu, Aug 22, 2019 at 4:39 PM Vitaly Kuznetsov <vkuznets@redhat.com> wrote:
>
> lantianyu1986@gmail.com writes:
>
> > From: Tianyu Lan <Tianyu.Lan@microsoft.com>
> >
> > Both Hyper-V tsc page and Hyper-V tsc MSR code use variable
> > hv_sched_clock_offset for their sched clock callback and so
> > define the variable regardless of CONFIG_HYPERV_TSCPAGE setting.
>
> CONFIG_HYPERV_TSCPAGE is gone after my "x86/hyper-v: enable TSC page
> clocksource on 32bit" patch. Do we still have an issue to fix?
>
Hi Vitaly:
             Your patch also fixes the build issue. If it's not
necessary to have a dedicated patch
to fix the issue, please ignore this one. Thanks.

> >
> > Signed-off-by: Tianyu Lan <Tianyu.Lan@microsoft.com>
> > ---
> > This patch is based on the top of "git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git
> > timers/core".
> >
> >  drivers/clocksource/hyperv_timer.c | 2 +-
> >  1 file changed, 1 insertion(+), 1 deletion(-)
> >
> > diff --git a/drivers/clocksource/hyperv_timer.c b/drivers/clocksource/hyperv_timer.c
> > index dad8af198e20..c322ab4d3689 100644
> > --- a/drivers/clocksource/hyperv_timer.c
> > +++ b/drivers/clocksource/hyperv_timer.c
> > @@ -22,6 +22,7 @@
> >  #include <asm/mshyperv.h>
> >
> >  static struct clock_event_device __percpu *hv_clock_event;
> > +static u64 hv_sched_clock_offset __ro_after_init;
> >
> >  /*
> >   * If false, we're using the old mechanism for stimer0 interrupts
> > @@ -215,7 +216,6 @@ EXPORT_SYMBOL_GPL(hyperv_cs);
> >  #ifdef CONFIG_HYPERV_TSCPAGE
> >
> >  static struct ms_hyperv_tsc_page tsc_pg __aligned(PAGE_SIZE);
> > -static u64 hv_sched_clock_offset __ro_after_init;
> >
> >  struct ms_hyperv_tsc_page *hv_get_tsc_page(void)
> >  {
>
> --
> Vitaly



-- 
Best regards
Tianyu Lan

^ permalink raw reply

* Re: [PATCH] vsock: Fix a lockdep warning in __vsock_release()
From: Stefano Garzarella @ 2019-08-22 10:25 UTC (permalink / raw)
  To: Dexuan Cui
  Cc: jhansen@vmware.com, davem@davemloft.net, stefanha@redhat.com,
	netdev@vger.kernel.org, Stephen Hemminger, Sasha Levin,
	sashal@kernel.org, Haiyang Zhang, KY Srinivasan, Michael Kelley,
	linux-hyperv@vger.kernel.org, gregkh@linuxfoundation.org,
	linux-kernel@vger.kernel.org
In-Reply-To: <1566270830-28981-1-git-send-email-decui@microsoft.com>

On Tue, Aug 20, 2019 at 03:14:22AM +0000, Dexuan Cui wrote:
> Lockdep is unhappy if two locks from the same class are held.
> 
> Fix the below warning by making __vsock_release() non-recursive -- this
> patch is kind of ugly, but it looks to me there is not a better way to
> deal with the problem here.
> 
> ============================================
> WARNING: possible recursive locking detected
> 5.2.0+ #6 Not tainted
> --------------------------------------------
> a.out/1020 is trying to acquire lock:
> 0000000074731a98 (sk_lock-AF_VSOCK){+.+.}, at: hvs_release+0x10/0x120 [hv_sock]
> 
> but task is already holding lock:
> 0000000014ff8397 (sk_lock-AF_VSOCK){+.+.}, at: __vsock_release+0x2e/0xf0 [vsock]
> 
> other info that might help us debug this:
>  Possible unsafe locking scenario:
> 
>        CPU0
>        ----
>   lock(sk_lock-AF_VSOCK);
>   lock(sk_lock-AF_VSOCK);
> 
>  *** DEADLOCK ***
> 
>  May be due to missing lock nesting notation
> 
> 2 locks held by a.out/1020:
>  #0: 00000000f8bceaa7 (&sb->s_type->i_mutex_key#10){+.+.}, at: __sock_release+0x2d/0xa0
>  #1: 0000000014ff8397 (sk_lock-AF_VSOCK){+.+.}, at: __vsock_release+0x2e/0xf0 [vsock]
> 
> stack backtrace:
> CPU: 7 PID: 1020 Comm: a.out Not tainted 5.2.0+ #6
> Call Trace:
>  dump_stack+0x67/0x90
>  __lock_acquire.cold.66+0x14d/0x1f8
>  lock_acquire+0xb5/0x1c0
>  lock_sock_nested+0x6d/0x90
>  hvs_release+0x10/0x120 [hv_sock]
>  __vsock_release+0x24/0xf0 [vsock]
>  __vsock_release+0xa0/0xf0 [vsock]
>  vsock_release+0x12/0x30 [vsock]
>  __sock_release+0x37/0xa0
>  sock_close+0x14/0x20
>  __fput+0xc1/0x250
>  task_work_run+0x98/0xc0
>  do_exit+0x3dd/0xc60
>  do_group_exit+0x47/0xc0
>  get_signal+0x169/0xc60
>  do_signal+0x30/0x710
>  exit_to_usermode_loop+0x50/0xa0
>  do_syscall_64+0x1fc/0x220
>  entry_SYSCALL_64_after_hwframe+0x49/0xbe
> 
> Signed-off-by: Dexuan Cui <decui@microsoft.com>
> ---
>  net/vmw_vsock/af_vsock.c         | 33 ++++++++++++++++++++++++++++++++-
>  net/vmw_vsock/hyperv_transport.c |  2 +-
>  2 files changed, 33 insertions(+), 2 deletions(-)
> 
> diff --git a/net/vmw_vsock/af_vsock.c b/net/vmw_vsock/af_vsock.c
> index ab47bf3..420f605 100644
> --- a/net/vmw_vsock/af_vsock.c
> +++ b/net/vmw_vsock/af_vsock.c
> @@ -638,6 +638,37 @@ struct sock *__vsock_create(struct net *net,
>  }
>  EXPORT_SYMBOL_GPL(__vsock_create);
>  
> +static void __vsock_release2(struct sock *sk)
> +{
> +	if (sk) {
> +		struct sk_buff *skb;
> +		struct vsock_sock *vsk;
> +
> +		vsk = vsock_sk(sk);
> +
> +		/* The release call is supposed to use lock_sock_nested()
> +		 * rather than lock_sock(), if a lock should be acquired.
> +		 */
> +		transport->release(vsk);
> +
> +		/* Use the nested version to avoid the warning
> +		 * "possible recursive locking detected".
> +		 */
> +		lock_sock_nested(sk, SINGLE_DEPTH_NESTING);

What about using lock_sock_nested() in __vsock_release() itself, without
defining this new function?

> +		sock_orphan(sk);
> +		sk->sk_shutdown = SHUTDOWN_MASK;
> +
> +		while ((skb = skb_dequeue(&sk->sk_receive_queue)))
> +			kfree_skb(skb);
> +
> +		/* This sk can not be a listener, so it's unnecessary
> +		 * to call vsock_dequeue_accept().
> +		 */
> +		release_sock(sk);
> +		sock_put(sk);
> +	}
> +}
> +
>  static void __vsock_release(struct sock *sk)
>  {
>  	if (sk) {
> @@ -659,7 +690,7 @@ static void __vsock_release(struct sock *sk)
>  
>  		/* Clean up any sockets that never were accepted. */
>  		while ((pending = vsock_dequeue_accept(sk)) != NULL) {
> -			__vsock_release(pending);
> +			__vsock_release2(pending);
>  			sock_put(pending);
>  		}
>  
> diff --git a/net/vmw_vsock/hyperv_transport.c b/net/vmw_vsock/hyperv_transport.c
> index 9d864eb..4b126b2 100644
> --- a/net/vmw_vsock/hyperv_transport.c
> +++ b/net/vmw_vsock/hyperv_transport.c
> @@ -559,7 +559,7 @@ static void hvs_release(struct vsock_sock *vsk)
>  	struct sock *sk = sk_vsock(vsk);
>  	bool remove_sock;
>  
> -	lock_sock(sk);
> +	lock_sock_nested(sk, SINGLE_DEPTH_NESTING);

Should we also update the other transports?

Thanks,
Stefano

^ permalink raw reply

* Re: [PATCH] storvsc: setup 1:1 mapping between hardware queue and CPU queue
From: Ming Lei @ 2019-08-22 10:55 UTC (permalink / raw)
  To: longli
  Cc: K. Y. Srinivasan, Haiyang Zhang, Stephen Hemminger, Sasha Levin,
	James E.J. Bottomley, Martin K. Petersen, linux-hyperv,
	Linux SCSI List, Linux Kernel Mailing List, Long Li
In-Reply-To: <1566243316-113690-1-git-send-email-longli@linuxonhyperv.com>

On Tue, Aug 20, 2019 at 3:36 AM <longli@linuxonhyperv.com> wrote:
>
> From: Long Li <longli@microsoft.com>
>
> storvsc doesn't use a dedicated hardware queue for a given CPU queue. When
> issuing I/O, it selects returning CPU (hardware queue) dynamically based on
> vmbus channel usage across all channels.
>
> This patch sets up a 1:1 mapping between hardware queue and CPU queue, thus
> avoiding unnecessary locking at upper layer when issuing I/O.
>
> Signed-off-by: Long Li <longli@microsoft.com>
> ---
>  drivers/scsi/storvsc_drv.c | 16 ++++++++++++++--
>  1 file changed, 14 insertions(+), 2 deletions(-)
>
> diff --git a/drivers/scsi/storvsc_drv.c b/drivers/scsi/storvsc_drv.c
> index b89269120a2d..26c16d40ec46 100644
> --- a/drivers/scsi/storvsc_drv.c
> +++ b/drivers/scsi/storvsc_drv.c
> @@ -1682,6 +1682,18 @@ static int storvsc_queuecommand(struct Scsi_Host *host, struct scsi_cmnd *scmnd)
>         return 0;
>  }
>
> +static int storvsc_map_queues(struct Scsi_Host *shost)
> +{
> +       unsigned int cpu;
> +       struct blk_mq_queue_map *qmap = &shost->tag_set.map[HCTX_TYPE_DEFAULT];
> +
> +       for_each_possible_cpu(cpu) {
> +               qmap->mq_map[cpu] = cpu;
> +       }

The block layer provides the blk_mq_map_queues() helper, so I suggest you use
the default CPU mapping instead of inventing a new one.

thanks,
Ming Lei

^ permalink raw reply

* [PATCH V4 0/3] KVM/Hyper-V: Add Hyper-V direct tlb flush support
From: lantianyu1986 @ 2019-08-22 14:30 UTC (permalink / raw)
  To: pbonzini, rkrcmar, corbet, kys, haiyangz, sthemmin, sashal, tglx,
	mingo, bp, hpa, x86, michael.h.kelley
  Cc: Tianyu Lan, kvm, linux-doc, linux-hyperv, linux-kernel, vkuznets

From: Tianyu Lan <Tianyu.Lan@microsoft.com>

This patchset adds Hyper-V direct tlb flush support to KVM. Hyper-V
in L0 can delegate the L1 hypervisor to handle tlb flush requests from
the L2 guest when direct tlb flush is enabled in L1.

Patch 2 introduces a new capability, KVM_CAP_HYPERV_DIRECT_TLBFLUSH, to enable
the feature from user space. User space should enable this feature only
when the Hyper-V hypervisor capability is exposed to the guest and the KVM
profile is hidden. There is a parameter conflict between KVM and Hyper-V
hypercalls. We hope the L2 guest doesn't use KVM hypercalls when the feature is
enabled. For details, please see the description of the new API "KVM_CAP_HYPERV_DIRECT_TLBFLUSH".

Change since v3:
       - Update changelog in each patches. 

Change since v2:
       - Move hv assist page(hv_pa_pg) from struct kvm  to struct kvm_hv.

Change since v1:
       - Fix offset issue in the patch 1.
       - Update description of KVM KVM_CAP_HYPERV_DIRECT_TLBFLUSH.

Tianyu Lan (2):
  x86/Hyper-V: Fix definition of struct hv_vp_assist_page
  KVM/Hyper-V: Add new KVM capability KVM_CAP_HYPERV_DIRECT_TLBFLUSH

Vitaly Kuznetsov (1):
  KVM/Hyper-V/VMX: Add direct tlb flush support

 Documentation/virtual/kvm/api.txt  | 13 +++++++++++++
 arch/x86/include/asm/hyperv-tlfs.h | 24 ++++++++++++++++++-----
 arch/x86/include/asm/kvm_host.h    |  4 ++++
 arch/x86/kvm/vmx/evmcs.h           |  2 ++
 arch/x86/kvm/vmx/vmx.c             | 39 ++++++++++++++++++++++++++++++++++++++
 arch/x86/kvm/x86.c                 |  8 ++++++++
 include/uapi/linux/kvm.h           |  1 +
 7 files changed, 86 insertions(+), 5 deletions(-)

-- 
2.14.5


^ permalink raw reply

* [PATCH V4 1/3] x86/Hyper-V: Fix definition of struct hv_vp_assist_page
From: lantianyu1986 @ 2019-08-22 14:30 UTC (permalink / raw)
  To: pbonzini, rkrcmar, corbet, kys, haiyangz, sthemmin, sashal, tglx,
	mingo, bp, hpa, x86, michael.h.kelley
  Cc: Tianyu Lan, kvm, linux-doc, linux-kernel, linux-hyperv, vkuznets
In-Reply-To: <20190822143021.7518-1-Tianyu.Lan@microsoft.com>

From: Tianyu Lan <Tianyu.Lan@microsoft.com>

The struct hv_vp_assist_page was defined incorrectly.
The "vtl_control" should be u64[3],
"nested_enlightenments_control" should be a u64 and there are
7 reserved bytes following "enlighten_vmentry". Fix the definition.

Signed-off-by: Tianyu Lan <Tianyu.Lan@microsoft.com>
---
Change since v3:
	- Update changelog 
Change since v1:
	- Move definition of struct hv_nested_enlightenments_control
       into this patch to fix offset issue.
---
 arch/x86/include/asm/hyperv-tlfs.h | 20 +++++++++++++++-----
 1 file changed, 15 insertions(+), 5 deletions(-)

diff --git a/arch/x86/include/asm/hyperv-tlfs.h b/arch/x86/include/asm/hyperv-tlfs.h
index af78cd72b8f3..cf0b2a04271d 100644
--- a/arch/x86/include/asm/hyperv-tlfs.h
+++ b/arch/x86/include/asm/hyperv-tlfs.h
@@ -514,14 +514,24 @@ struct hv_timer_message_payload {
 	__u64 delivery_time;	/* When the message was delivered */
 } __packed;
 
+struct hv_nested_enlightenments_control {
+	struct {
+		__u32 directhypercall:1;
+		__u32 reserved:31;
+	} features;
+	struct {
+		__u32 reserved;
+	} hypercallControls;
+} __packed;
+
 /* Define virtual processor assist page structure. */
 struct hv_vp_assist_page {
 	__u32 apic_assist;
-	__u32 reserved;
-	__u64 vtl_control[2];
-	__u64 nested_enlightenments_control[2];
-	__u32 enlighten_vmentry;
-	__u32 padding;
+	__u32 reserved1;
+	__u64 vtl_control[3];
+	struct hv_nested_enlightenments_control nested_control;
+	__u8 enlighten_vmentry;
+	__u8 reserved2[7];
 	__u64 current_nested_vmcs;
 } __packed;
 
-- 
2.14.5


^ permalink raw reply related

* [PATCH V4 2/3] KVM/Hyper-V: Add new KVM capability KVM_CAP_HYPERV_DIRECT_TLBFLUSH
From: lantianyu1986 @ 2019-08-22 14:30 UTC (permalink / raw)
  To: pbonzini, rkrcmar, corbet, kys, haiyangz, sthemmin, sashal, tglx,
	mingo, bp, hpa, x86, michael.h.kelley
  Cc: Tianyu Lan, kvm, linux-doc, linux-kernel, linux-hyperv, vkuznets
In-Reply-To: <20190822143021.7518-1-Tianyu.Lan@microsoft.com>

From: Tianyu Lan <Tianyu.Lan@microsoft.com>

The Hyper-V direct TLB flush function should only be enabled for
guests that use only Hyper-V hypercalls. A user space
hypervisor (e.g. Qemu) can disable KVM identification in
CPUID and expose only Hyper-V identification to ensure this
precondition is met. Add a new KVM capability,
KVM_CAP_HYPERV_DIRECT_TLBFLUSH, for user space to enable the
Hyper-V direct TLB flush function; it is disabled by default
in KVM.

Signed-off-by: Tianyu Lan <Tianyu.Lan@microsoft.com>
---
Change since v3:
	- Update Changelog.
Change since v1:
	- Update description of KVM_CAP_HYPERV_DIRECT_TLBFLUSH
        in the KVM API doc.
---
 Documentation/virtual/kvm/api.txt | 13 +++++++++++++
 arch/x86/include/asm/kvm_host.h   |  2 ++
 arch/x86/kvm/x86.c                |  8 ++++++++
 include/uapi/linux/kvm.h          |  1 +
 4 files changed, 24 insertions(+)

diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt
index 2cd6250b2896..0c6e1b25d0c8 100644
--- a/Documentation/virtual/kvm/api.txt
+++ b/Documentation/virtual/kvm/api.txt
@@ -5289,3 +5289,16 @@ Architectures: x86
 This capability indicates that KVM supports paravirtualized Hyper-V IPI send
 hypercalls:
 HvCallSendSyntheticClusterIpi, HvCallSendSyntheticClusterIpiEx.
+8.21 KVM_CAP_HYPERV_DIRECT_TLBFLUSH
+
+Architectures: x86
+
+This capability indicates that KVM running on top of Hyper-V hypervisor
+enables Direct TLB flush for its guests meaning that TLB flush
+hypercalls are handled by Level 0 hypervisor (Hyper-V) bypassing KVM.
+Due to the different ABI for hypercall parameters between Hyper-V and
+KVM, enabling this capability effectively disables all hypercall
+handling by KVM (as some KVM hypercall may be mistakenly treated as TLB
+flush hypercalls by Hyper-V) so userspace should disable KVM identification
+in CPUID and only expose Hyper-V identification. In this case, the guest
+thinks it's running on Hyper-V and only uses Hyper-V hypercalls.
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 0cc5b611a113..667d154e89d4 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -1205,6 +1205,8 @@ struct kvm_x86_ops {
 	uint16_t (*nested_get_evmcs_version)(struct kvm_vcpu *vcpu);
 
 	bool (*need_emulation_on_page_fault)(struct kvm_vcpu *vcpu);
+
+	int (*enable_direct_tlbflush)(struct kvm_vcpu *vcpu);
 };
 
 struct kvm_arch_async_pf {
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 9d7b9e6a0939..a9d8ee7f7bf0 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -3183,6 +3183,9 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
 		r = kvm_x86_ops->get_nested_state ?
 			kvm_x86_ops->get_nested_state(NULL, NULL, 0) : 0;
 		break;
+	case KVM_CAP_HYPERV_DIRECT_TLBFLUSH:
+		r = kvm_x86_ops->enable_direct_tlbflush ? 1 : 0;
+		break;
 	default:
 		break;
 	}
@@ -3953,6 +3956,11 @@ static int kvm_vcpu_ioctl_enable_cap(struct kvm_vcpu *vcpu,
 				r = -EFAULT;
 		}
 		return r;
+	case KVM_CAP_HYPERV_DIRECT_TLBFLUSH:
+		if (!kvm_x86_ops->enable_direct_tlbflush)
+			return -ENOTTY;
+
+		return kvm_x86_ops->enable_direct_tlbflush(vcpu);
 
 	default:
 		return -EINVAL;
diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
index a7c19540ce21..cb959bc925b1 100644
--- a/include/uapi/linux/kvm.h
+++ b/include/uapi/linux/kvm.h
@@ -996,6 +996,7 @@ struct kvm_ppc_resize_hpt {
 #define KVM_CAP_ARM_PTRAUTH_ADDRESS 171
 #define KVM_CAP_ARM_PTRAUTH_GENERIC 172
 #define KVM_CAP_PMU_EVENT_FILTER 173
+#define KVM_CAP_HYPERV_DIRECT_TLBFLUSH 174
 
 #ifdef KVM_CAP_IRQ_ROUTING
 
-- 
2.14.5


^ permalink raw reply related


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox