Linux-HyperV List
 help / color / mirror / Atom feed
* [PATCH 09/15] Drivers: hv: Introduce hv_output_arg_exists in hv_common
From: Nuno Das Neves @ 2023-07-27 19:54 UTC (permalink / raw)
  To: linux-hyperv, linux-kernel, x86, linux-arm-kernel, linux-arch
  Cc: mikelley, kys, wei.liu, haiyangz, decui, ssengar, mukeshrathor,
	stanislav.kinsburskiy, jinankjain, apais, Tianyu.Lan, vkuznets,
	tglx, mingo, bp, dave.hansen, hpa, will, catalin.marinas
In-Reply-To: <1690487690-2428-1-git-send-email-nunodasneves@linux.microsoft.com>

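Introduce hv_output_arg_exists() as a more flexible way of determining
whether to allocate the per-cpu hypercall output page. It returns true
when running as the root partition or when CONFIG_MSHV_VTL is enabled,
so the output page is available to both mshv_vtl and the root partition.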

Signed-off-by: Nuno Das Neves <nunodasneves@linux.microsoft.com>
---
 drivers/hv/hv_common.c | 30 +++++++++++++++++++++++++-----
 1 file changed, 25 insertions(+), 5 deletions(-)

diff --git a/drivers/hv/hv_common.c b/drivers/hv/hv_common.c
index 99d9b262b8a7..16f069beda78 100644
--- a/drivers/hv/hv_common.c
+++ b/drivers/hv/hv_common.c
@@ -57,6 +57,18 @@ EXPORT_SYMBOL_GPL(hyperv_pcpu_input_arg);
 void * __percpu *hyperv_pcpu_output_arg;
 EXPORT_SYMBOL_GPL(hyperv_pcpu_output_arg);
 
+/*
+ * Determine whether output arg is in use, for allocation/deallocation
+ */
+static bool hv_output_arg_exists(void)
+{
+	bool ret = hv_root_partition ? true : false;
+#ifdef CONFIG_MSHV_VTL
+	ret = true;
+#endif
+	return ret;
+}
+
 static void hv_kmsg_dump_unregister(void);
 
 static struct ctl_table_header *hv_ctl_table_hdr;
@@ -338,10 +350,12 @@ int __init hv_common_init(void)
 	hyperv_pcpu_input_arg = alloc_percpu(void  *);
 	BUG_ON(!hyperv_pcpu_input_arg);
 
-	/* Allocate the per-CPU state for output arg for root */
-	if (hv_root_partition) {
+	if (hv_output_arg_exists()) {
 		hyperv_pcpu_output_arg = alloc_percpu(void *);
 		BUG_ON(!hyperv_pcpu_output_arg);
+	}
+
+	if (hv_root_partition) {
 		hv_synic_eventring_tail = alloc_percpu(u8 *);
 		BUG_ON(hv_synic_eventring_tail == NULL);
 	}
@@ -371,7 +385,7 @@ int hv_common_cpu_init(unsigned int cpu)
 	u8 **synic_eventring_tail;
 	u64 msr_vp_index;
 	gfp_t flags;
-	int pgcount = hv_root_partition ? 2 : 1;
+	int pgcount = hv_output_arg_exists() ? 2 : 1;
 
 	/* hv_cpu_init() can be called with IRQs disabled from hv_resume() */
 	flags = irqs_disabled() ? GFP_ATOMIC : GFP_KERNEL;
@@ -381,9 +395,12 @@ int hv_common_cpu_init(unsigned int cpu)
 	if (!(*inputarg))
 		return -ENOMEM;
 
-	if (hv_root_partition) {
+	if (hv_output_arg_exists()) {
 		outputarg = (void **)this_cpu_ptr(hyperv_pcpu_output_arg);
 		*outputarg = (char *)(*inputarg) + HV_HYP_PAGE_SIZE;
+	}
+
+	if (hv_root_partition) {
 		synic_eventring_tail = (u8 **)this_cpu_ptr(hv_synic_eventring_tail);
 		*synic_eventring_tail = kcalloc(HV_SYNIC_SINT_COUNT, sizeof(u8),
 						flags);
@@ -417,9 +434,12 @@ int hv_common_cpu_die(unsigned int cpu)
 	mem = *inputarg;
 	*inputarg = NULL;
 
-	if (hv_root_partition) {
+	if (hv_output_arg_exists()) {
 		outputarg = (void **)this_cpu_ptr(hyperv_pcpu_output_arg);
 		*outputarg = NULL;
+	}
+
+	if (hv_root_partition) {
 		synic_eventring_tail = (u8 **)this_cpu_ptr(hv_synic_eventring_tail);
 		kfree(*synic_eventring_tail);
 		*synic_eventring_tail = NULL;
-- 
2.25.1


^ permalink raw reply related

* [PATCH 08/15] Drivers: hv: Introduce per-cpu event ring tail
From: Nuno Das Neves @ 2023-07-27 19:54 UTC (permalink / raw)
  To: linux-hyperv, linux-kernel, x86, linux-arm-kernel, linux-arch
  Cc: mikelley, kys, wei.liu, haiyangz, decui, ssengar, mukeshrathor,
	stanislav.kinsburskiy, jinankjain, apais, Tianyu.Lan, vkuznets,
	tglx, mingo, bp, dave.hansen, hpa, will, catalin.marinas
In-Reply-To: <1690487690-2428-1-git-send-email-nunodasneves@linux.microsoft.com>

Add a pointer hv_synic_eventring_tail to track the tail pointer for the
SynIC event ring buffer for each SINT.
This will be used by the mshv driver, but must be tracked independently
since the driver module could be removed and re-inserted.

Signed-off-by: Nuno Das Neves <nunodasneves@linux.microsoft.com>
---
 drivers/hv/hv_common.c         | 25 +++++++++++++++++++++++++
 include/asm-generic/mshyperv.h |  2 ++
 2 files changed, 27 insertions(+)

diff --git a/drivers/hv/hv_common.c b/drivers/hv/hv_common.c
index 9f9c3dc89bb2..99d9b262b8a7 100644
--- a/drivers/hv/hv_common.c
+++ b/drivers/hv/hv_common.c
@@ -61,6 +61,16 @@ static void hv_kmsg_dump_unregister(void);
 
 static struct ctl_table_header *hv_ctl_table_hdr;
 
+/*
+ * Per-cpu array holding the tail pointer for the SynIC event ring buffer
+ * for each SINT.
+ *
+ * We cannot maintain this in mshv driver because the tail pointer should
+ * persist even if the mshv driver is unloaded.
+ */
+u8 __percpu **hv_synic_eventring_tail;
+EXPORT_SYMBOL_GPL(hv_synic_eventring_tail);
+
 /*
  * Hyper-V specific initialization and shutdown code that is
  * common across all architectures.  Called from architecture
@@ -332,6 +342,8 @@ int __init hv_common_init(void)
 	if (hv_root_partition) {
 		hyperv_pcpu_output_arg = alloc_percpu(void *);
 		BUG_ON(!hyperv_pcpu_output_arg);
+		hv_synic_eventring_tail = alloc_percpu(u8 *);
+		BUG_ON(hv_synic_eventring_tail == NULL);
 	}
 
 	hv_vp_index = kmalloc_array(num_possible_cpus(), sizeof(*hv_vp_index),
@@ -356,6 +368,7 @@ int __init hv_common_init(void)
 int hv_common_cpu_init(unsigned int cpu)
 {
 	void **inputarg, **outputarg;
+	u8 **synic_eventring_tail;
 	u64 msr_vp_index;
 	gfp_t flags;
 	int pgcount = hv_root_partition ? 2 : 1;
@@ -371,6 +384,14 @@ int hv_common_cpu_init(unsigned int cpu)
 	if (hv_root_partition) {
 		outputarg = (void **)this_cpu_ptr(hyperv_pcpu_output_arg);
 		*outputarg = (char *)(*inputarg) + HV_HYP_PAGE_SIZE;
+		synic_eventring_tail = (u8 **)this_cpu_ptr(hv_synic_eventring_tail);
+		*synic_eventring_tail = kcalloc(HV_SYNIC_SINT_COUNT, sizeof(u8),
+						flags);
+
+		if (unlikely(!*synic_eventring_tail)) {
+			kfree(*inputarg);
+			return -ENOMEM;
+		}
 	}
 
 	msr_vp_index = hv_get_register(HV_MSR_VP_INDEX);
@@ -387,6 +408,7 @@ int hv_common_cpu_die(unsigned int cpu)
 {
 	unsigned long flags;
 	void **inputarg, **outputarg;
+	u8 **synic_eventring_tail;
 	void *mem;
 
 	local_irq_save(flags);
@@ -398,6 +420,9 @@ int hv_common_cpu_die(unsigned int cpu)
 	if (hv_root_partition) {
 		outputarg = (void **)this_cpu_ptr(hyperv_pcpu_output_arg);
 		*outputarg = NULL;
+		synic_eventring_tail = (u8 **)this_cpu_ptr(hv_synic_eventring_tail);
+		kfree(*synic_eventring_tail);
+		*synic_eventring_tail = NULL;
 	}
 
 	local_irq_restore(flags);
diff --git a/include/asm-generic/mshyperv.h b/include/asm-generic/mshyperv.h
index 0c94d20b4d44..9118d678b27a 100644
--- a/include/asm-generic/mshyperv.h
+++ b/include/asm-generic/mshyperv.h
@@ -73,6 +73,8 @@ extern bool hv_nested;
 extern void * __percpu *hyperv_pcpu_input_arg;
 extern void * __percpu *hyperv_pcpu_output_arg;
 
+extern u8 __percpu **hv_synic_eventring_tail;
+
 extern u64 hv_do_hypercall(u64 control, void *inputaddr, void *outputaddr);
 extern u64 hv_do_fast_hypercall8(u16 control, u64 input8);
 extern bool hv_isolation_type_snp(void);
-- 
2.25.1


^ permalink raw reply related

* [PATCH 07/15] Drivers: hv: Move hv_call_deposit_pages and hv_call_create_vp to common code
From: Nuno Das Neves @ 2023-07-27 19:54 UTC (permalink / raw)
  To: linux-hyperv, linux-kernel, x86, linux-arm-kernel, linux-arch
  Cc: mikelley, kys, wei.liu, haiyangz, decui, ssengar, mukeshrathor,
	stanislav.kinsburskiy, jinankjain, apais, Tianyu.Lan, vkuznets,
	tglx, mingo, bp, dave.hansen, hpa, will, catalin.marinas
In-Reply-To: <1690487690-2428-1-git-send-email-nunodasneves@linux.microsoft.com>

These hypercalls are not arch-specific.
Move them to common code.

Signed-off-by: Nuno Das Neves <nunodasneves@linux.microsoft.com>
---
 arch/x86/hyperv/hv_proc.c       | 152 --------------------------------
 arch/x86/include/asm/mshyperv.h |   2 -
 drivers/hv/hv_common.c          | 147 ++++++++++++++++++++++++++++++
 include/asm-generic/mshyperv.h  |   2 +
 4 files changed, 149 insertions(+), 154 deletions(-)

diff --git a/arch/x86/hyperv/hv_proc.c b/arch/x86/hyperv/hv_proc.c
index ed80da64649e..0a35cb865427 100644
--- a/arch/x86/hyperv/hv_proc.c
+++ b/arch/x86/hyperv/hv_proc.c
@@ -3,7 +3,6 @@
 #include <linux/vmalloc.h>
 #include <linux/mm.h>
 #include <linux/clockchips.h>
-#include <linux/acpi.h>
 #include <linux/hyperv.h>
 #include <linux/slab.h>
 #include <linux/cpuhotplug.h>
@@ -14,106 +13,6 @@
 
 #include <asm/trace/hyperv.h>
 
-/*
- * See struct hv_deposit_memory. The first u64 is partition ID, the rest
- * are GPAs.
- */
-#define HV_DEPOSIT_MAX (HV_HYP_PAGE_SIZE / sizeof(u64) - 1)
-
-/* Deposits exact number of pages. Must be called with interrupts enabled.  */
-int hv_call_deposit_pages(int node, u64 partition_id, u32 num_pages)
-{
-	struct page **pages, *page;
-	int *counts;
-	int num_allocations;
-	int i, j, page_count;
-	int order;
-	u64 status;
-	int ret;
-	u64 base_pfn;
-	struct hv_deposit_memory *input_page;
-	unsigned long flags;
-
-	if (num_pages > HV_DEPOSIT_MAX)
-		return -E2BIG;
-	if (!num_pages)
-		return 0;
-
-	/* One buffer for page pointers and counts */
-	page = alloc_page(GFP_KERNEL);
-	if (!page)
-		return -ENOMEM;
-	pages = page_address(page);
-
-	counts = kcalloc(HV_DEPOSIT_MAX, sizeof(int), GFP_KERNEL);
-	if (!counts) {
-		free_page((unsigned long)pages);
-		return -ENOMEM;
-	}
-
-	/* Allocate all the pages before disabling interrupts */
-	i = 0;
-
-	while (num_pages) {
-		/* Find highest order we can actually allocate */
-		order = 31 - __builtin_clz(num_pages);
-
-		while (1) {
-			pages[i] = alloc_pages_node(node, GFP_KERNEL, order);
-			if (pages[i])
-				break;
-			if (!order) {
-				ret = -ENOMEM;
-				num_allocations = i;
-				goto err_free_allocations;
-			}
-			--order;
-		}
-
-		split_page(pages[i], order);
-		counts[i] = 1 << order;
-		num_pages -= counts[i];
-		i++;
-	}
-	num_allocations = i;
-
-	local_irq_save(flags);
-
-	input_page = *this_cpu_ptr(hyperv_pcpu_input_arg);
-
-	input_page->partition_id = partition_id;
-
-	/* Populate gpa_page_list - these will fit on the input page */
-	for (i = 0, page_count = 0; i < num_allocations; ++i) {
-		base_pfn = page_to_pfn(pages[i]);
-		for (j = 0; j < counts[i]; ++j, ++page_count)
-			input_page->gpa_page_list[page_count] = base_pfn + j;
-	}
-	status = hv_do_rep_hypercall(HVCALL_DEPOSIT_MEMORY,
-				     page_count, 0, input_page, NULL);
-	local_irq_restore(flags);
-	if (!hv_result_success(status)) {
-		pr_err("Failed to deposit pages: %lld\n", status);
-		ret = hv_result(status);
-		goto err_free_allocations;
-	}
-
-	ret = 0;
-	goto free_buf;
-
-err_free_allocations:
-	for (i = 0; i < num_allocations; ++i) {
-		base_pfn = page_to_pfn(pages[i]);
-		for (j = 0; j < counts[i]; ++j)
-			__free_page(pfn_to_page(base_pfn + j));
-	}
-
-free_buf:
-	free_page((unsigned long)pages);
-	kfree(counts);
-	return ret;
-}
-
 int hv_call_add_logical_proc(int node, u32 lp_index, u32 apic_id)
 {
 	struct hv_add_logical_processor_in *input;
@@ -156,54 +55,3 @@ int hv_call_add_logical_proc(int node, u32 lp_index, u32 apic_id)
 	return ret;
 }
 
-int hv_call_create_vp(int node, u64 partition_id, u32 vp_index, u32 flags)
-{
-	struct hv_create_vp *input;
-	u64 status;
-	unsigned long irq_flags;
-	int ret = HV_STATUS_SUCCESS;
-	int pxm = node_to_pxm(node);
-
-	/* Root VPs don't seem to need pages deposited */
-	if (partition_id != hv_current_partition_id) {
-		/* The value 90 is empirically determined. It may change. */
-		ret = hv_call_deposit_pages(node, partition_id, 90);
-		if (ret)
-			return ret;
-	}
-
-	do {
-		local_irq_save(irq_flags);
-
-		input = *this_cpu_ptr(hyperv_pcpu_input_arg);
-
-		input->partition_id = partition_id;
-		input->vp_index = vp_index;
-		input->flags = flags;
-		input->subnode_type = HvSubnodeAny;
-		if (node != NUMA_NO_NODE) {
-			input->proximity_domain_info.domain_id = pxm;
-			input->proximity_domain_info.flags.reserved = 0;
-			input->proximity_domain_info.flags.proximity_info_valid = 1;
-			input->proximity_domain_info.flags.proximity_preferred = 1;
-		} else {
-			input->proximity_domain_info.as_uint64 = 0;
-		}
-		status = hv_do_hypercall(HVCALL_CREATE_VP, input, NULL);
-		local_irq_restore(irq_flags);
-
-		if (hv_result(status) != HV_STATUS_INSUFFICIENT_MEMORY) {
-			if (!hv_result_success(status)) {
-				pr_err("%s: vcpu %u, lp %u, %lld\n", __func__,
-				       vp_index, flags, status);
-				ret = hv_result(status);
-			}
-			break;
-		}
-		ret = hv_call_deposit_pages(node, partition_id, 1);
-
-	} while (!ret);
-
-	return ret;
-}
-
diff --git a/arch/x86/include/asm/mshyperv.h b/arch/x86/include/asm/mshyperv.h
index 23cdcf6525dc..1a0655a93672 100644
--- a/arch/x86/include/asm/mshyperv.h
+++ b/arch/x86/include/asm/mshyperv.h
@@ -47,9 +47,7 @@ extern u64 hv_current_partition_id;
 
 extern union hv_ghcb * __percpu *hv_ghcb_pg;
 
-int hv_call_deposit_pages(int node, u64 partition_id, u32 num_pages);
 int hv_call_add_logical_proc(int node, u32 lp_index, u32 acpi_id);
-int hv_call_create_vp(int node, u64 partition_id, u32 vp_index, u32 flags);
 
 static inline u64 hv_do_hypercall(u64 control, void *input, void *output)
 {
diff --git a/drivers/hv/hv_common.c b/drivers/hv/hv_common.c
index 522d57a5e8a0..9f9c3dc89bb2 100644
--- a/drivers/hv/hv_common.c
+++ b/drivers/hv/hv_common.c
@@ -542,3 +542,150 @@ u64 __weak hv_ghcb_hypercall(u64 control, void *input, void *output, u32 input_s
 	return HV_STATUS_INVALID_PARAMETER;
 }
 EXPORT_SYMBOL_GPL(hv_ghcb_hypercall);
+
+int hv_call_create_vp(int node, u64 partition_id, u32 vp_index, u32 flags)
+{
+	struct hv_create_vp *input;
+	u64 status;
+	unsigned long irq_flags;
+	int ret = HV_STATUS_SUCCESS;
+
+	/* Root VPs don't seem to need pages deposited */
+	if (partition_id != hv_current_partition_id) {
+		/* The value 90 is empirically determined. It may change. */
+		ret = hv_call_deposit_pages(node, partition_id, 90);
+		if (ret)
+			return ret;
+	}
+
+	do {
+		local_irq_save(irq_flags);
+
+		input = *this_cpu_ptr(hyperv_pcpu_input_arg);
+
+		input->partition_id = partition_id;
+		input->vp_index = vp_index;
+		input->flags = flags;
+		input->subnode_type = HvSubnodeAny;
+		input->proximity_domain_info =
+			numa_node_to_proximity_domain_info(node);
+		status = hv_do_hypercall(HVCALL_CREATE_VP, input, NULL);
+		local_irq_restore(irq_flags);
+
+		if (hv_result(status) != HV_STATUS_INSUFFICIENT_MEMORY) {
+			if (!hv_result_success(status)) {
+				pr_err("%s: vcpu %u, lp %u, %s\n", __func__,
+				       vp_index, flags, hv_status_to_string(status));
+				ret = hv_status_to_errno(status);
+			}
+			break;
+		}
+		ret = hv_call_deposit_pages(node, partition_id, 1);
+
+	} while (!ret);
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(hv_call_create_vp);
+
+/*
+ * See struct hv_deposit_memory. The first u64 is partition ID, the rest
+ * are GPAs.
+ */
+#define HV_DEPOSIT_MAX (HV_HYP_PAGE_SIZE / sizeof(u64) - 1)
+
+/* Deposits exact number of pages. Must be called with interrupts enabled.  */
+int hv_call_deposit_pages(int node, u64 partition_id, u32 num_pages)
+{
+	struct page **pages, *page;
+	int *counts;
+	int num_allocations;
+	int i, j, page_count;
+	int order;
+	u64 status;
+	int ret;
+	u64 base_pfn;
+	struct hv_deposit_memory *input_page;
+	unsigned long flags;
+
+	if (num_pages > HV_DEPOSIT_MAX)
+		return -E2BIG;
+	if (!num_pages)
+		return 0;
+
+	/* One buffer for page pointers and counts */
+	page = alloc_page(GFP_KERNEL);
+	if (!page)
+		return -ENOMEM;
+	pages = page_address(page);
+
+	counts = kcalloc(HV_DEPOSIT_MAX, sizeof(int), GFP_KERNEL);
+	if (!counts) {
+		free_page((unsigned long)pages);
+		return -ENOMEM;
+	}
+
+	/* Allocate all the pages before disabling interrupts */
+	i = 0;
+
+	while (num_pages) {
+		/* Find highest order we can actually allocate */
+		order = 31 - __builtin_clz(num_pages);
+
+		while (1) {
+			pages[i] = alloc_pages_node(node, GFP_KERNEL, order);
+			if (pages[i])
+				break;
+			if (!order) {
+				ret = -ENOMEM;
+				num_allocations = i;
+				goto err_free_allocations;
+			}
+			--order;
+		}
+
+		split_page(pages[i], order);
+		counts[i] = 1 << order;
+		num_pages -= counts[i];
+		i++;
+	}
+	num_allocations = i;
+
+	local_irq_save(flags);
+
+	input_page = *this_cpu_ptr(hyperv_pcpu_input_arg);
+
+	input_page->partition_id = partition_id;
+
+	/* Populate gpa_page_list - these will fit on the input page */
+	for (i = 0, page_count = 0; i < num_allocations; ++i) {
+		base_pfn = page_to_pfn(pages[i]);
+		for (j = 0; j < counts[i]; ++j, ++page_count)
+			input_page->gpa_page_list[page_count] = base_pfn + j;
+	}
+	status = hv_do_rep_hypercall(HVCALL_DEPOSIT_MEMORY,
+				     page_count, 0, input_page, NULL);
+	local_irq_restore(flags);
+	if (!hv_result_success(status)) {
+		pr_err("Failed to deposit pages: %s\n", hv_status_to_string(status));
+		ret = hv_status_to_errno(status);
+		goto err_free_allocations;
+	}
+
+	ret = 0;
+	goto free_buf;
+
+err_free_allocations:
+	for (i = 0; i < num_allocations; ++i) {
+		base_pfn = page_to_pfn(pages[i]);
+		for (j = 0; j < counts[i]; ++j)
+			__free_page(pfn_to_page(base_pfn + j));
+	}
+
+free_buf:
+	free_page((unsigned long)pages);
+	kfree(counts);
+	return ret;
+}
+EXPORT_SYMBOL_GPL(hv_call_deposit_pages);
+
diff --git a/include/asm-generic/mshyperv.h b/include/asm-generic/mshyperv.h
index bf87721828f6..0c94d20b4d44 100644
--- a/include/asm-generic/mshyperv.h
+++ b/include/asm-generic/mshyperv.h
@@ -343,6 +343,8 @@ u64 hv_ghcb_hypercall(u64 control, void *input, void *output, u32 input_size);
 void hyperv_cleanup(void);
 bool hv_query_ext_cap(u64 cap_query);
 void hv_setup_dma_ops(struct device *dev, bool coherent);
+int hv_call_create_vp(int node, u64 partition_id, u32 vp_index, u32 flags);
+int hv_call_deposit_pages(int node, u64 partition_id, u32 num_pages);
 #else /* CONFIG_HYPERV */
 static inline bool hv_is_hyperv_initialized(void) { return false; }
 static inline bool hv_is_hibernation_supported(void) { return false; }
-- 
2.25.1


^ permalink raw reply related

* [PATCH 00/15] Introduce /dev/mshv drivers
From: Nuno Das Neves @ 2023-07-27 19:54 UTC (permalink / raw)
  To: linux-hyperv, linux-kernel, x86, linux-arm-kernel, linux-arch
  Cc: mikelley, kys, wei.liu, haiyangz, decui, ssengar, mukeshrathor,
	stanislav.kinsburskiy, jinankjain, apais, Tianyu.Lan, vkuznets,
	tglx, mingo, bp, dave.hansen, hpa, will, catalin.marinas

This series introduces support for creating and running guest machines
while running on the Microsoft Hypervisor. [0]
This is done via an IOCTL interface accessed through /dev/mshv, similar to
/dev/kvm. Another series introducing this support was previously posted.
[1]

These interfaces support VMMs running in:
1. The root partition - provided in the mshv_root module, and
2. VTL 2 - provided in the mshv_vtl module [2]

Patches breakdown
-----------------
The first 7 patches are refactoring and adding some helper functions.
They provide some benefit on their own and could be applied independently
as cleanup patches.

The following 5 patches just set things up for the driver code to come.
These are very small. They are separated so that the remaining patches are
more self-contained.

The final 3 patches are the meat of the series:
- Patch 13 contains new header files used by the driver.
  These are designed to mirror the ABI headers exported by Hyper-V. This is
  done to avoid polluting hyperv-tlfs.h and help track changes to the ABIs
  that are still unstable. (See FAQ below).
- Patch 14 conditionally includes these new header files into mshyperv.h
  and linux/hyperv.h, in order to be able to use these files in the new
  drivers while remaining independent from hyperv-tlfs.h.
- Patch 15 contains the new driver code located in drivers/hv. This is a
  large amount of code and new files, but it is mostly self-contained and
  all within drivers/hv - apart from the IOCTL interface itself in uapi.

FAQ on include/uapi/hyperv/*.h
------------------------------
Q:
Why not just add these definitions to hyperv-tlfs.h?
A:
The intention of hyperv-tlfs.h is to contain stable definitions documented
in the public TLFS document. These new definitions don't meet that criterion,
so they should be separate.

Q:
Why are these files named hvgdk.h, hvgdk_mini.h, hvhdk.h and hvhdk_mini.h?
A:
The precise meaning of the names reflects conventions used internally at
Microsoft.
Naming them this way makes it easy to find where particular Hyper-V
definitions come from, and check their correctness.
It also facilitates the future work of automatically generating these files.

Q:
Why are they in uapi?
A:
In short, to keep things simple. There are many definitions needed in both
the kernel and the VMM in userspace. Separating them doesn't serve much
purpose, and makes it more laborious to import definitions from Hyper-V
code.

Q:
The new headers redefine many things that are already in hyperv-tlfs.h - why?
A:
Some definitions are extended compared to what is documented in the TLFS.
In order to avoid adding undocumented or unstable definitions to hyperv-tlfs.h,
the new headers must compile independently.
Therefore, the new headers must redefine many things in hyperv-tlfs.h in order
to compile.

--------------------------
[0] "Hyper-V" is the better-known name, but it really refers to the whole stack
    including the hypervisor and other components that run in Windows
    kernel and userspace.
[1] Previous /dev/mshv patch series and discussion:
    https://lore.kernel.org/linux-hyperv/1632853875-20261-1-git-send-email-nunodasneves@linux.microsoft.com/
[2] Virtual Secure Mode (VSM) and Virtual Trust Levels (VTL):
    https://learn.microsoft.com/en-us/virtualization/hyper-v-on-windows/tlfs/vsm

Nuno Das Neves (15):
  hyperv-tlfs: Change shared HV_REGISTER_* defines to HV_MSR_*
  mshyperv: Introduce hv_get_hypervisor_version
  mshyperv: Introduce numa_node_to_proximity_domain_info
  asm-generic/mshyperv: Introduce hv_recommend_using_aeoi()
  hyperv: Move hv_connection_id to hyperv-tlfs
  hyperv-tlfs: Introduce hv_status_to_string and hv_status_to_errno
  Drivers: hv: Move hv_call_deposit_pages and hv_call_create_vp to
    common code
  Drivers: hv: Introduce per-cpu event ring tail
  Drivers: hv: Introduce hv_output_arg_exists in hv_common
  x86: hyperv: Add mshv_handler irq handler and setup function
  Drivers: hv: export vmbus_isr, hv_context and hv_post_message
  Documentation: Reserve ioctl number for mshv driver
  uapi: hyperv: Add mshv driver headers hvhdk.h, hvhdk_mini.h, hvgdk.h,
    hvgdk_mini.h
  asm-generic: hyperv: Use mshv headers conditionally. Add
    asm-generic/hyperv-defs.h
  Drivers: hv: Add modules to expose /dev/mshv to VMMs running on
    Hyper-V

 .../userspace-api/ioctl/ioctl-number.rst      |    2 +
 arch/arm64/hyperv/mshyperv.c                  |   23 +-
 arch/arm64/include/asm/hyperv-tlfs.h          |   25 +
 arch/arm64/include/asm/mshyperv.h             |    2 +-
 arch/x86/hyperv/hv_init.c                     |    2 +-
 arch/x86/hyperv/hv_proc.c                     |  166 +-
 arch/x86/include/asm/hyperv-tlfs.h            |  137 +-
 arch/x86/include/asm/mshyperv.h               |   13 +-
 arch/x86/kernel/cpu/mshyperv.c                |   71 +-
 drivers/acpi/numa/srat.c                      |    1 +
 drivers/clocksource/hyperv_timer.c            |   24 +-
 drivers/hv/Kconfig                            |   54 +
 drivers/hv/Makefile                           |   21 +
 drivers/hv/hv.c                               |   46 +-
 drivers/hv/hv_call.c                          |  119 +
 drivers/hv/hv_common.c                        |  225 +-
 drivers/hv/hyperv_vmbus.h                     |    2 +-
 drivers/hv/mshv.h                             |  156 ++
 drivers/hv/mshv_eventfd.c                     |  758 +++++++
 drivers/hv/mshv_eventfd.h                     |   80 +
 drivers/hv/mshv_main.c                        |  208 ++
 drivers/hv/mshv_msi.c                         |  129 ++
 drivers/hv/mshv_portid_table.c                |   84 +
 drivers/hv/mshv_root.h                        |  194 ++
 drivers/hv/mshv_root_hv_call.c                | 1064 +++++++++
 drivers/hv/mshv_root_main.c                   | 1964 +++++++++++++++++
 drivers/hv/mshv_synic.c                       |  689 ++++++
 drivers/hv/mshv_vtl.h                         |   52 +
 drivers/hv/mshv_vtl_main.c                    | 1541 +++++++++++++
 drivers/hv/vmbus_drv.c                        |    3 +-
 drivers/hv/xfer_to_guest.c                    |   28 +
 include/asm-generic/hyperv-defs.h             |   26 +
 include/asm-generic/hyperv-tlfs.h             |   77 +-
 include/asm-generic/mshyperv.h                |   76 +-
 include/linux/hyperv.h                        |   11 +-
 include/uapi/hyperv/hvgdk.h                   |   41 +
 include/uapi/hyperv/hvgdk_mini.h              | 1077 +++++++++
 include/uapi/hyperv/hvhdk.h                   | 1352 ++++++++++++
 include/uapi/hyperv/hvhdk_mini.h              |  164 ++
 include/uapi/linux/mshv.h                     |  298 +++
 40 files changed, 10653 insertions(+), 352 deletions(-)
 create mode 100644 drivers/hv/hv_call.c
 create mode 100644 drivers/hv/mshv.h
 create mode 100644 drivers/hv/mshv_eventfd.c
 create mode 100644 drivers/hv/mshv_eventfd.h
 create mode 100644 drivers/hv/mshv_main.c
 create mode 100644 drivers/hv/mshv_msi.c
 create mode 100644 drivers/hv/mshv_portid_table.c
 create mode 100644 drivers/hv/mshv_root.h
 create mode 100644 drivers/hv/mshv_root_hv_call.c
 create mode 100644 drivers/hv/mshv_root_main.c
 create mode 100644 drivers/hv/mshv_synic.c
 create mode 100644 drivers/hv/mshv_vtl.h
 create mode 100644 drivers/hv/mshv_vtl_main.c
 create mode 100644 drivers/hv/xfer_to_guest.c
 create mode 100644 include/asm-generic/hyperv-defs.h
 create mode 100644 include/uapi/hyperv/hvgdk.h
 create mode 100644 include/uapi/hyperv/hvgdk_mini.h
 create mode 100644 include/uapi/hyperv/hvhdk.h
 create mode 100644 include/uapi/hyperv/hvhdk_mini.h
 create mode 100644 include/uapi/linux/mshv.h

-- 
2.25.1


^ permalink raw reply

* [PATCH 06/15] hyperv-tlfs: Introduce hv_status_to_string and hv_status_to_errno
From: Nuno Das Neves @ 2023-07-27 19:54 UTC (permalink / raw)
  To: linux-hyperv, linux-kernel, x86, linux-arm-kernel, linux-arch
  Cc: mikelley, kys, wei.liu, haiyangz, decui, ssengar, mukeshrathor,
	stanislav.kinsburskiy, jinankjain, apais, Tianyu.Lan, vkuznets,
	tglx, mingo, bp, dave.hansen, hpa, will, catalin.marinas
In-Reply-To: <1690487690-2428-1-git-send-email-nunodasneves@linux.microsoft.com>

hv_status_to_errno translates hyperv statuses to linux error codes.
This is useful for returning something linux-friendly from a hypercall
helper function.
hv_status_to_string improves clarity of error messages.

Signed-off-by: Nuno Das Neves <nunodasneves@linux.microsoft.com>
---
 arch/x86/hyperv/hv_init.c         |  2 +-
 arch/x86/hyperv/hv_proc.c         |  6 ++---
 include/asm-generic/hyperv-tlfs.h | 45 ++++++++++++++++++++++---------
 include/asm-generic/mshyperv.h    | 33 +++++++++++++++++++++++
 4 files changed, 70 insertions(+), 16 deletions(-)

diff --git a/arch/x86/hyperv/hv_init.c b/arch/x86/hyperv/hv_init.c
index a5f9474f08e1..460e09c3cdf9 100644
--- a/arch/x86/hyperv/hv_init.c
+++ b/arch/x86/hyperv/hv_init.c
@@ -371,7 +371,7 @@ static void __init hv_get_partition_id(void)
 	status = hv_do_hypercall(HVCALL_GET_PARTITION_ID, NULL, output_page);
 	if (!hv_result_success(status)) {
 		/* No point in proceeding if this failed */
-		pr_err("Failed to get partition ID: %lld\n", status);
+		pr_err("Failed to get partition ID: %s\n", hv_status_to_string(status));
 		BUG();
 	}
 	hv_current_partition_id = output_page->partition_id;
diff --git a/arch/x86/hyperv/hv_proc.c b/arch/x86/hyperv/hv_proc.c
index 5ba5ca1b2089..ed80da64649e 100644
--- a/arch/x86/hyperv/hv_proc.c
+++ b/arch/x86/hyperv/hv_proc.c
@@ -144,9 +144,9 @@ int hv_call_add_logical_proc(int node, u32 lp_index, u32 apic_id)
 
 		if (hv_result(status) != HV_STATUS_INSUFFICIENT_MEMORY) {
 			if (!hv_result_success(status)) {
-				pr_err("%s: cpu %u apic ID %u, %lld\n", __func__,
-				       lp_index, apic_id, status);
-				ret = hv_result(status);
+				pr_err("%s: cpu %u apic ID %u, %s\n", __func__,
+				       lp_index, apic_id, hv_status_to_string(status));
+				ret = hv_status_to_errno(status);
 			}
 			break;
 		}
diff --git a/include/asm-generic/hyperv-tlfs.h b/include/asm-generic/hyperv-tlfs.h
index 8fc5e5a9d7cb..e7b468f06de7 100644
--- a/include/asm-generic/hyperv-tlfs.h
+++ b/include/asm-generic/hyperv-tlfs.h
@@ -212,18 +212,39 @@ enum HV_GENERIC_SET_FORMAT {
 					 HV_HYPERCALL_RSVD2_MASK)
 
 /* hypercall status code */
-#define HV_STATUS_SUCCESS			0
-#define HV_STATUS_INVALID_HYPERCALL_CODE	2
-#define HV_STATUS_INVALID_HYPERCALL_INPUT	3
-#define HV_STATUS_INVALID_ALIGNMENT		4
-#define HV_STATUS_INVALID_PARAMETER		5
-#define HV_STATUS_ACCESS_DENIED			6
-#define HV_STATUS_OPERATION_DENIED		8
-#define HV_STATUS_INSUFFICIENT_MEMORY		11
-#define HV_STATUS_INVALID_PORT_ID		17
-#define HV_STATUS_INVALID_CONNECTION_ID		18
-#define HV_STATUS_INSUFFICIENT_BUFFERS		19
-#define HV_STATUS_VTL_ALREADY_ENABLED		134
+#define __HV_STATUS_DEF(OP) \
+	OP(HV_STATUS_SUCCESS,				0x0) \
+	OP(HV_STATUS_INVALID_HYPERCALL_CODE,		0x2) \
+	OP(HV_STATUS_INVALID_HYPERCALL_INPUT,		0x3) \
+	OP(HV_STATUS_INVALID_ALIGNMENT,			0x4) \
+	OP(HV_STATUS_INVALID_PARAMETER,			0x5) \
+	OP(HV_STATUS_ACCESS_DENIED,			0x6) \
+	OP(HV_STATUS_INVALID_PARTITION_STATE,		0x7) \
+	OP(HV_STATUS_OPERATION_DENIED,			0x8) \
+	OP(HV_STATUS_UNKNOWN_PROPERTY,			0x9) \
+	OP(HV_STATUS_PROPERTY_VALUE_OUT_OF_RANGE,	0xA) \
+	OP(HV_STATUS_INSUFFICIENT_MEMORY,		0xB) \
+	OP(HV_STATUS_INVALID_PARTITION_ID,		0xD) \
+	OP(HV_STATUS_INVALID_VP_INDEX,			0xE) \
+	OP(HV_STATUS_NOT_FOUND,				0x10) \
+	OP(HV_STATUS_INVALID_PORT_ID,			0x11) \
+	OP(HV_STATUS_INVALID_CONNECTION_ID,		0x12) \
+	OP(HV_STATUS_INSUFFICIENT_BUFFERS,		0x13) \
+	OP(HV_STATUS_NOT_ACKNOWLEDGED,			0x14) \
+	OP(HV_STATUS_INVALID_VP_STATE,			0x15) \
+	OP(HV_STATUS_NO_RESOURCES,			0x1D) \
+	OP(HV_STATUS_PROCESSOR_FEATURE_NOT_SUPPORTED,	0x20) \
+	OP(HV_STATUS_INVALID_LP_INDEX,			0x41) \
+	OP(HV_STATUS_INVALID_REGISTER_VALUE,		0x50) \
+	OP(HV_STATUS_CALL_PENDING,			0x79) \
+	OP(HV_STATUS_VTL_ALREADY_ENABLED,		0x86)
+
+#define __HV_MAKE_HV_STATUS_ENUM(NAME, VAL) NAME = (VAL),
+#define __HV_MAKE_HV_STATUS_CASE(NAME, VAL) case (NAME): return (#NAME);
+
+enum hv_status {
+	__HV_STATUS_DEF(__HV_MAKE_HV_STATUS_ENUM)
+};
 
 /*
  * The Hyper-V TimeRefCount register and the TSC
diff --git a/include/asm-generic/mshyperv.h b/include/asm-generic/mshyperv.h
index 90fcbb95f1ee..bf87721828f6 100644
--- a/include/asm-generic/mshyperv.h
+++ b/include/asm-generic/mshyperv.h
@@ -300,6 +300,39 @@ static inline int cpumask_to_vpset_skip(struct hv_vpset *vpset,
 	return __cpumask_to_vpset(vpset, cpus, func);
 }
 
+
+static inline int hv_status_to_errno(u64 hv_status)
+{
+	switch (hv_result(hv_status)) {
+	case HV_STATUS_SUCCESS:
+		return 0;
+	case HV_STATUS_INVALID_PARAMETER:
+	case HV_STATUS_UNKNOWN_PROPERTY:
+	case HV_STATUS_PROPERTY_VALUE_OUT_OF_RANGE:
+	case HV_STATUS_INVALID_VP_INDEX:
+	case HV_STATUS_INVALID_REGISTER_VALUE:
+	case HV_STATUS_INVALID_LP_INDEX:
+	case HV_STATUS_PROCESSOR_FEATURE_NOT_SUPPORTED:
+		return -EINVAL;
+	case HV_STATUS_ACCESS_DENIED:
+	case HV_STATUS_OPERATION_DENIED:
+		return -EACCES;
+	case HV_STATUS_NOT_ACKNOWLEDGED:
+	case HV_STATUS_INVALID_VP_STATE:
+	case HV_STATUS_INVALID_PARTITION_STATE:
+		return -EBADFD;
+	}
+	return -ENOTRECOVERABLE;
+}
+
+static inline const char *hv_status_to_string(u64 hv_status)
+{
+	switch (hv_result(hv_status)) {
+	__HV_STATUS_DEF(__HV_MAKE_HV_STATUS_CASE)
+	default : return "Unknown";
+	}
+}
+
 void hyperv_report_panic(struct pt_regs *regs, long err, bool in_die);
 bool hv_is_hyperv_initialized(void);
 bool hv_is_hibernation_supported(void);
-- 
2.25.1


^ permalink raw reply related

* [PATCH 05/15] hyperv: Move hv_connection_id to hyperv-tlfs
From: Nuno Das Neves @ 2023-07-27 19:54 UTC (permalink / raw)
  To: linux-hyperv, linux-kernel, x86, linux-arm-kernel, linux-arch
  Cc: mikelley, kys, wei.liu, haiyangz, decui, ssengar, mukeshrathor,
	stanislav.kinsburskiy, jinankjain, apais, Tianyu.Lan, vkuznets,
	tglx, mingo, bp, dave.hansen, hpa, will, catalin.marinas
In-Reply-To: <1690487690-2428-1-git-send-email-nunodasneves@linux.microsoft.com>

This union belongs in hyperv-tlfs.h anyway, since it is part of the
TLFS document.
Its definition conflicts with the one added in hvgdk.h as part of the
mshv driver, so it must be moved to hyperv-tlfs.h.

Signed-off-by: Nuno Das Neves <nunodasneves@linux.microsoft.com>
---
 include/asm-generic/hyperv-tlfs.h | 9 +++++++++
 include/linux/hyperv.h            | 9 ---------
 2 files changed, 9 insertions(+), 9 deletions(-)

diff --git a/include/asm-generic/hyperv-tlfs.h b/include/asm-generic/hyperv-tlfs.h
index 373f26efa18a..8fc5e5a9d7cb 100644
--- a/include/asm-generic/hyperv-tlfs.h
+++ b/include/asm-generic/hyperv-tlfs.h
@@ -845,4 +845,13 @@ struct hv_mmio_write_input {
 	u8 data[HV_HYPERCALL_MMIO_MAX_DATA_LENGTH];
 } __packed;
 
+/* Define connection identifier type. */
+union hv_connection_id {
+	u32 asu32;
+	struct {
+		u32 id:24;
+		u32 reserved:8;
+	} u;
+};
+
 #endif
diff --git a/include/linux/hyperv.h b/include/linux/hyperv.h
index bfbc37ce223b..f90de5abcd50 100644
--- a/include/linux/hyperv.h
+++ b/include/linux/hyperv.h
@@ -748,15 +748,6 @@ struct vmbus_close_msg {
 	struct vmbus_channel_close_channel msg;
 };
 
-/* Define connection identifier type. */
-union hv_connection_id {
-	u32 asu32;
-	struct {
-		u32 id:24;
-		u32 reserved:8;
-	} u;
-};
-
 enum vmbus_device_type {
 	HV_IDE = 0,
 	HV_SCSI,
-- 
2.25.1


^ permalink raw reply related

* [PATCH 01/15] hyperv-tlfs: Change shared HV_REGISTER_* defines to HV_MSR_*
From: Nuno Das Neves @ 2023-07-27 19:54 UTC (permalink / raw)
  To: linux-hyperv, linux-kernel, x86, linux-arm-kernel, linux-arch
  Cc: mikelley, kys, wei.liu, haiyangz, decui, ssengar, mukeshrathor,
	stanislav.kinsburskiy, jinankjain, apais, Tianyu.Lan, vkuznets,
	tglx, mingo, bp, dave.hansen, hpa, will, catalin.marinas
In-Reply-To: <1690487690-2428-1-git-send-email-nunodasneves@linux.microsoft.com>

In x86 hyperv-tlfs, HV_REGISTER_ prefix is used to indicate MSRs
accessed via rdmsrl/wrmsrl. But in ARM64, HV_REGISTER_ instead indicates
VP registers accessed via get/set vp registers hypercall.

This is due to HV_REGISTER_* names being used by hv_set/get_register,
with the arch-specific version delegating to the appropriate mechanism.

The problem is, using prefix HV_REGISTER_ for MSRs will conflict with
VP registers when they are introduced for x86 in the future.

This patch solves the issue by:

1. Defining all the x86 MSRs with a consistent prefix: HV_X64_MSR_.
   This is so HV_REGISTER_ can be reserved for VP registers.

2. Change the non-arch-specific alias used by hv_set/get_register to
   HV_MSR_. This also happens to be the same name HyperV uses for this
   purpose.

Signed-off-by: Nuno Das Neves <nunodasneves@linux.microsoft.com>
---
 arch/arm64/include/asm/hyperv-tlfs.h |  25 +++++
 arch/x86/include/asm/hyperv-tlfs.h   | 137 +++++++++++++--------------
 arch/x86/include/asm/mshyperv.h      |   8 +-
 arch/x86/kernel/cpu/mshyperv.c       |  22 ++---
 drivers/clocksource/hyperv_timer.c   |  24 ++---
 drivers/hv/hv.c                      |  32 +++----
 drivers/hv/hv_common.c               |  18 ++--
 include/asm-generic/mshyperv.h       |   2 +-
 8 files changed, 146 insertions(+), 122 deletions(-)

diff --git a/arch/arm64/include/asm/hyperv-tlfs.h b/arch/arm64/include/asm/hyperv-tlfs.h
index bc6c7ac934a1..a6e852c2fc3a 100644
--- a/arch/arm64/include/asm/hyperv-tlfs.h
+++ b/arch/arm64/include/asm/hyperv-tlfs.h
@@ -64,6 +64,31 @@
 #define HV_REGISTER_STIMER0_CONFIG	0x000B0000
 #define HV_REGISTER_STIMER0_COUNT	0x000B0001
 
+/*
+ * To support non-arch-specific code calling hv_set/get_register:
+ * - On x86, HV_MSR_ indicates an MSR accessed via rdmsrl/wrmsrl
+ * - On ARM, HV_MSR_ indicates a VP register accessed via hypercall
+ */
+#define HV_MSR_VP_INDEX		(HV_REGISTER_VP_INDEX)
+#define HV_MSR_TIME_REF_COUNT	(HV_REGISTER_TIME_REF_COUNT)
+#define HV_MSR_REFERENCE_TSC	(HV_REGISTER_REFERENCE_TSC)
+
+#define HV_MSR_STIMER0_CONFIG	(HV_REGISTER_STIMER0_CONFIG)
+#define HV_MSR_STIMER0_COUNT	(HV_REGISTER_STIMER0_COUNT)
+
+#define HV_MSR_SCONTROL		(HV_REGISTER_SCONTROL)
+#define HV_MSR_SIEFP		(HV_REGISTER_SIEFP)
+#define HV_MSR_SIMP		(HV_REGISTER_SIMP)
+#define HV_MSR_EOM		(HV_REGISTER_EOM)
+#define HV_MSR_SINT0		(HV_REGISTER_SINT0)
+
+#define HV_MSR_CRASH_P0		(HV_REGISTER_CRASH_P0)
+#define HV_MSR_CRASH_P1		(HV_REGISTER_CRASH_P1)
+#define HV_MSR_CRASH_P2		(HV_REGISTER_CRASH_P2)
+#define HV_MSR_CRASH_P3		(HV_REGISTER_CRASH_P3)
+#define HV_MSR_CRASH_P4		(HV_REGISTER_CRASH_P4)
+#define HV_MSR_CRASH_CTL	(HV_REGISTER_CRASH_CTL)
+
 union hv_msi_entry {
 	u64 as_uint64[2];
 	struct {
diff --git a/arch/x86/include/asm/hyperv-tlfs.h b/arch/x86/include/asm/hyperv-tlfs.h
index cea95dcd27c2..40902a767733 100644
--- a/arch/x86/include/asm/hyperv-tlfs.h
+++ b/arch/x86/include/asm/hyperv-tlfs.h
@@ -181,7 +181,7 @@ enum hv_isolation_type {
 #define HV_X64_MSR_HYPERCALL			0x40000001
 
 /* MSR used to provide vcpu index */
-#define HV_REGISTER_VP_INDEX			0x40000002
+#define HV_X64_MSR_VP_INDEX			0x40000002
 
 /* MSR used to reset the guest OS. */
 #define HV_X64_MSR_RESET			0x40000003
@@ -190,10 +190,10 @@ enum hv_isolation_type {
 #define HV_X64_MSR_VP_RUNTIME			0x40000010
 
 /* MSR used to read the per-partition time reference counter */
-#define HV_REGISTER_TIME_REF_COUNT		0x40000020
+#define HV_X64_MSR_TIME_REF_COUNT		0x40000020
 
 /* A partition's reference time stamp counter (TSC) page */
-#define HV_REGISTER_REFERENCE_TSC		0x40000021
+#define HV_X64_MSR_REFERENCE_TSC		0x40000021
 
 /* MSR used to retrieve the TSC frequency */
 #define HV_X64_MSR_TSC_FREQUENCY		0x40000022
@@ -208,61 +208,61 @@ enum hv_isolation_type {
 #define HV_X64_MSR_VP_ASSIST_PAGE		0x40000073
 
 /* Define synthetic interrupt controller model specific registers. */
-#define HV_REGISTER_SCONTROL			0x40000080
-#define HV_REGISTER_SVERSION			0x40000081
-#define HV_REGISTER_SIEFP			0x40000082
-#define HV_REGISTER_SIMP			0x40000083
-#define HV_REGISTER_EOM				0x40000084
-#define HV_REGISTER_SINT0			0x40000090
-#define HV_REGISTER_SINT1			0x40000091
-#define HV_REGISTER_SINT2			0x40000092
-#define HV_REGISTER_SINT3			0x40000093
-#define HV_REGISTER_SINT4			0x40000094
-#define HV_REGISTER_SINT5			0x40000095
-#define HV_REGISTER_SINT6			0x40000096
-#define HV_REGISTER_SINT7			0x40000097
-#define HV_REGISTER_SINT8			0x40000098
-#define HV_REGISTER_SINT9			0x40000099
-#define HV_REGISTER_SINT10			0x4000009A
-#define HV_REGISTER_SINT11			0x4000009B
-#define HV_REGISTER_SINT12			0x4000009C
-#define HV_REGISTER_SINT13			0x4000009D
-#define HV_REGISTER_SINT14			0x4000009E
-#define HV_REGISTER_SINT15			0x4000009F
+#define HV_X64_MSR_SCONTROL			0x40000080
+#define HV_X64_MSR_SVERSION			0x40000081
+#define HV_X64_MSR_SIEFP			0x40000082
+#define HV_X64_MSR_SIMP				0x40000083
+#define HV_X64_MSR_EOM				0x40000084
+#define HV_X64_MSR_SINT0			0x40000090
+#define HV_X64_MSR_SINT1			0x40000091
+#define HV_X64_MSR_SINT2			0x40000092
+#define HV_X64_MSR_SINT3			0x40000093
+#define HV_X64_MSR_SINT4			0x40000094
+#define HV_X64_MSR_SINT5			0x40000095
+#define HV_X64_MSR_SINT6			0x40000096
+#define HV_X64_MSR_SINT7			0x40000097
+#define HV_X64_MSR_SINT8			0x40000098
+#define HV_X64_MSR_SINT9			0x40000099
+#define HV_X64_MSR_SINT10			0x4000009A
+#define HV_X64_MSR_SINT11			0x4000009B
+#define HV_X64_MSR_SINT12			0x4000009C
+#define HV_X64_MSR_SINT13			0x4000009D
+#define HV_X64_MSR_SINT14			0x4000009E
+#define HV_X64_MSR_SINT15			0x4000009F
 
 /*
  * Define synthetic interrupt controller model specific registers for
  * nested hypervisor.
  */
-#define HV_REGISTER_NESTED_SCONTROL            0x40001080
-#define HV_REGISTER_NESTED_SVERSION            0x40001081
-#define HV_REGISTER_NESTED_SIEFP               0x40001082
-#define HV_REGISTER_NESTED_SIMP                0x40001083
-#define HV_REGISTER_NESTED_EOM                 0x40001084
-#define HV_REGISTER_NESTED_SINT0               0x40001090
+#define HV_X64_MSR_NESTED_SCONTROL		0x40001080
+#define HV_X64_MSR_NESTED_SVERSION		0x40001081
+#define HV_X64_MSR_NESTED_SIEFP			0x40001082
+#define HV_X64_MSR_NESTED_SIMP			0x40001083
+#define HV_X64_MSR_NESTED_EOM			0x40001084
+#define HV_X64_MSR_NESTED_SINT0			0x40001090
 
 /*
  * Synthetic Timer MSRs. Four timers per vcpu.
  */
-#define HV_REGISTER_STIMER0_CONFIG		0x400000B0
-#define HV_REGISTER_STIMER0_COUNT		0x400000B1
-#define HV_REGISTER_STIMER1_CONFIG		0x400000B2
-#define HV_REGISTER_STIMER1_COUNT		0x400000B3
-#define HV_REGISTER_STIMER2_CONFIG		0x400000B4
-#define HV_REGISTER_STIMER2_COUNT		0x400000B5
-#define HV_REGISTER_STIMER3_CONFIG		0x400000B6
-#define HV_REGISTER_STIMER3_COUNT		0x400000B7
+#define HV_X64_MSR_STIMER0_CONFIG		0x400000B0
+#define HV_X64_MSR_STIMER0_COUNT		0x400000B1
+#define HV_X64_MSR_STIMER1_CONFIG		0x400000B2
+#define HV_X64_MSR_STIMER1_COUNT		0x400000B3
+#define HV_X64_MSR_STIMER2_CONFIG		0x400000B4
+#define HV_X64_MSR_STIMER2_COUNT		0x400000B5
+#define HV_X64_MSR_STIMER3_CONFIG		0x400000B6
+#define HV_X64_MSR_STIMER3_COUNT		0x400000B7
 
 /* Hyper-V guest idle MSR */
 #define HV_X64_MSR_GUEST_IDLE			0x400000F0
 
 /* Hyper-V guest crash notification MSR's */
-#define HV_REGISTER_CRASH_P0			0x40000100
-#define HV_REGISTER_CRASH_P1			0x40000101
-#define HV_REGISTER_CRASH_P2			0x40000102
-#define HV_REGISTER_CRASH_P3			0x40000103
-#define HV_REGISTER_CRASH_P4			0x40000104
-#define HV_REGISTER_CRASH_CTL			0x40000105
+#define HV_X64_MSR_CRASH_P0			0x40000100
+#define HV_X64_MSR_CRASH_P1			0x40000101
+#define HV_X64_MSR_CRASH_P2			0x40000102
+#define HV_X64_MSR_CRASH_P3			0x40000103
+#define HV_X64_MSR_CRASH_P4			0x40000104
+#define HV_X64_MSR_CRASH_CTL			0x40000105
 
 /* TSC emulation after migration */
 #define HV_X64_MSR_REENLIGHTENMENT_CONTROL	0x40000106
@@ -275,31 +275,30 @@ enum hv_isolation_type {
 /* HV_X64_MSR_TSC_INVARIANT_CONTROL bits */
 #define HV_EXPOSE_INVARIANT_TSC		BIT_ULL(0)
 
-/* Register name aliases for temporary compatibility */
-#define HV_X64_MSR_STIMER0_COUNT	HV_REGISTER_STIMER0_COUNT
-#define HV_X64_MSR_STIMER0_CONFIG	HV_REGISTER_STIMER0_CONFIG
-#define HV_X64_MSR_STIMER1_COUNT	HV_REGISTER_STIMER1_COUNT
-#define HV_X64_MSR_STIMER1_CONFIG	HV_REGISTER_STIMER1_CONFIG
-#define HV_X64_MSR_STIMER2_COUNT	HV_REGISTER_STIMER2_COUNT
-#define HV_X64_MSR_STIMER2_CONFIG	HV_REGISTER_STIMER2_CONFIG
-#define HV_X64_MSR_STIMER3_COUNT	HV_REGISTER_STIMER3_COUNT
-#define HV_X64_MSR_STIMER3_CONFIG	HV_REGISTER_STIMER3_CONFIG
-#define HV_X64_MSR_SCONTROL		HV_REGISTER_SCONTROL
-#define HV_X64_MSR_SVERSION		HV_REGISTER_SVERSION
-#define HV_X64_MSR_SIMP			HV_REGISTER_SIMP
-#define HV_X64_MSR_SIEFP		HV_REGISTER_SIEFP
-#define HV_X64_MSR_VP_INDEX		HV_REGISTER_VP_INDEX
-#define HV_X64_MSR_EOM			HV_REGISTER_EOM
-#define HV_X64_MSR_SINT0		HV_REGISTER_SINT0
-#define HV_X64_MSR_SINT15		HV_REGISTER_SINT15
-#define HV_X64_MSR_CRASH_P0		HV_REGISTER_CRASH_P0
-#define HV_X64_MSR_CRASH_P1		HV_REGISTER_CRASH_P1
-#define HV_X64_MSR_CRASH_P2		HV_REGISTER_CRASH_P2
-#define HV_X64_MSR_CRASH_P3		HV_REGISTER_CRASH_P3
-#define HV_X64_MSR_CRASH_P4		HV_REGISTER_CRASH_P4
-#define HV_X64_MSR_CRASH_CTL		HV_REGISTER_CRASH_CTL
-#define HV_X64_MSR_TIME_REF_COUNT	HV_REGISTER_TIME_REF_COUNT
-#define HV_X64_MSR_REFERENCE_TSC	HV_REGISTER_REFERENCE_TSC
+/*
+ * To support non-arch-specific code calling hv_set/get_register:
+ * - On x86, HV_MSR_ indicates an MSR accessed via rdmsrl/wrmsrl
+ * - On ARM, HV_MSR_ indicates a VP register accessed via hypercall
+ */
+#define HV_MSR_VP_INDEX		(HV_X64_MSR_VP_INDEX)
+#define HV_MSR_TIME_REF_COUNT	(HV_X64_MSR_TIME_REF_COUNT)
+#define HV_MSR_REFERENCE_TSC	(HV_X64_MSR_REFERENCE_TSC)
+
+#define HV_MSR_STIMER0_CONFIG	(HV_X64_MSR_STIMER0_CONFIG)
+#define HV_MSR_STIMER0_COUNT	(HV_X64_MSR_STIMER0_COUNT)
+
+#define HV_MSR_SCONTROL		(HV_X64_MSR_SCONTROL)
+#define HV_MSR_SIEFP		(HV_X64_MSR_SIEFP)
+#define HV_MSR_SIMP		(HV_X64_MSR_SIMP)
+#define HV_MSR_EOM		(HV_X64_MSR_EOM)
+#define HV_MSR_SINT0		(HV_X64_MSR_SINT0)
+
+#define HV_MSR_CRASH_P0		(HV_X64_MSR_CRASH_P0)
+#define HV_MSR_CRASH_P1		(HV_X64_MSR_CRASH_P1)
+#define HV_MSR_CRASH_P2		(HV_X64_MSR_CRASH_P2)
+#define HV_MSR_CRASH_P3		(HV_X64_MSR_CRASH_P3)
+#define HV_MSR_CRASH_P4		(HV_X64_MSR_CRASH_P4)
+#define HV_MSR_CRASH_CTL	(HV_X64_MSR_CRASH_CTL)
 
 /* Hyper-V memory host visibility */
 enum hv_mem_host_visibility {
diff --git a/arch/x86/include/asm/mshyperv.h b/arch/x86/include/asm/mshyperv.h
index b445e252aa83..23cdcf6525dc 100644
--- a/arch/x86/include/asm/mshyperv.h
+++ b/arch/x86/include/asm/mshyperv.h
@@ -242,14 +242,14 @@ extern bool hv_isolation_type_snp(void);
 
 static inline bool hv_is_synic_reg(unsigned int reg)
 {
-	return (reg >= HV_REGISTER_SCONTROL) &&
-	       (reg <= HV_REGISTER_SINT15);
+	return (reg >= HV_X64_MSR_SCONTROL) &&
+	       (reg <= HV_X64_MSR_SINT15);
 }
 
 static inline bool hv_is_sint_reg(unsigned int reg)
 {
-	return (reg >= HV_REGISTER_SINT0) &&
-	       (reg <= HV_REGISTER_SINT15);
+	return (reg >= HV_X64_MSR_SINT0) &&
+	       (reg <= HV_X64_MSR_SINT15);
 }
 
 u64 hv_get_register(unsigned int reg);
diff --git a/arch/x86/kernel/cpu/mshyperv.c b/arch/x86/kernel/cpu/mshyperv.c
index 29e1c0d5347a..57f6a5879b30 100644
--- a/arch/x86/kernel/cpu/mshyperv.c
+++ b/arch/x86/kernel/cpu/mshyperv.c
@@ -43,19 +43,19 @@ struct ms_hyperv_info ms_hyperv;
 static inline unsigned int hv_get_nested_reg(unsigned int reg)
 {
 	if (hv_is_sint_reg(reg))
-		return reg - HV_REGISTER_SINT0 + HV_REGISTER_NESTED_SINT0;
+		return reg - HV_X64_MSR_SINT0 + HV_X64_MSR_NESTED_SINT0;
 
 	switch (reg) {
-	case HV_REGISTER_SIMP:
-		return HV_REGISTER_NESTED_SIMP;
-	case HV_REGISTER_SIEFP:
-		return HV_REGISTER_NESTED_SIEFP;
-	case HV_REGISTER_SVERSION:
-		return HV_REGISTER_NESTED_SVERSION;
-	case HV_REGISTER_SCONTROL:
-		return HV_REGISTER_NESTED_SCONTROL;
-	case HV_REGISTER_EOM:
-		return HV_REGISTER_NESTED_EOM;
+	case HV_X64_MSR_SIMP:
+		return HV_X64_MSR_NESTED_SIMP;
+	case HV_X64_MSR_SIEFP:
+		return HV_X64_MSR_NESTED_SIEFP;
+	case HV_X64_MSR_SVERSION:
+		return HV_X64_MSR_NESTED_SVERSION;
+	case HV_X64_MSR_SCONTROL:
+		return HV_X64_MSR_NESTED_SCONTROL;
+	case HV_X64_MSR_EOM:
+		return HV_X64_MSR_NESTED_EOM;
 	default:
 		return reg;
 	}
diff --git a/drivers/clocksource/hyperv_timer.c b/drivers/clocksource/hyperv_timer.c
index bcd9042a0c9f..0e82b0db4f32 100644
--- a/drivers/clocksource/hyperv_timer.c
+++ b/drivers/clocksource/hyperv_timer.c
@@ -81,14 +81,14 @@ static int hv_ce_set_next_event(unsigned long delta,
 
 	current_tick = hv_read_reference_counter();
 	current_tick += delta;
-	hv_set_register(HV_REGISTER_STIMER0_COUNT, current_tick);
+	hv_set_register(HV_MSR_STIMER0_COUNT, current_tick);
 	return 0;
 }
 
 static int hv_ce_shutdown(struct clock_event_device *evt)
 {
-	hv_set_register(HV_REGISTER_STIMER0_COUNT, 0);
-	hv_set_register(HV_REGISTER_STIMER0_CONFIG, 0);
+	hv_set_register(HV_MSR_STIMER0_COUNT, 0);
+	hv_set_register(HV_MSR_STIMER0_CONFIG, 0);
 	if (direct_mode_enabled && stimer0_irq >= 0)
 		disable_percpu_irq(stimer0_irq);
 
@@ -119,7 +119,7 @@ static int hv_ce_set_oneshot(struct clock_event_device *evt)
 		timer_cfg.direct_mode = 0;
 		timer_cfg.sintx = stimer0_message_sint;
 	}
-	hv_set_register(HV_REGISTER_STIMER0_CONFIG, timer_cfg.as_uint64);
+	hv_set_register(HV_MSR_STIMER0_CONFIG, timer_cfg.as_uint64);
 	return 0;
 }
 
@@ -398,7 +398,7 @@ static u64 notrace read_hv_clock_tsc(void)
 	u64 current_tick = hv_read_tsc_page(hv_get_tsc_page());
 
 	if (current_tick == U64_MAX)
-		current_tick = hv_get_register(HV_REGISTER_TIME_REF_COUNT);
+		current_tick = hv_get_register(HV_MSR_TIME_REF_COUNT);
 
 	return current_tick;
 }
@@ -419,9 +419,9 @@ static void suspend_hv_clock_tsc(struct clocksource *arg)
 	union hv_reference_tsc_msr tsc_msr;
 
 	/* Disable the TSC page */
-	tsc_msr.as_uint64 = hv_get_register(HV_REGISTER_REFERENCE_TSC);
+	tsc_msr.as_uint64 = hv_get_register(HV_MSR_REFERENCE_TSC);
 	tsc_msr.enable = 0;
-	hv_set_register(HV_REGISTER_REFERENCE_TSC, tsc_msr.as_uint64);
+	hv_set_register(HV_MSR_REFERENCE_TSC, tsc_msr.as_uint64);
 }
 
 
@@ -430,10 +430,10 @@ static void resume_hv_clock_tsc(struct clocksource *arg)
 	union hv_reference_tsc_msr tsc_msr;
 
 	/* Re-enable the TSC page */
-	tsc_msr.as_uint64 = hv_get_register(HV_REGISTER_REFERENCE_TSC);
+	tsc_msr.as_uint64 = hv_get_register(HV_MSR_REFERENCE_TSC);
 	tsc_msr.enable = 1;
 	tsc_msr.pfn = tsc_pfn;
-	hv_set_register(HV_REGISTER_REFERENCE_TSC, tsc_msr.as_uint64);
+	hv_set_register(HV_MSR_REFERENCE_TSC, tsc_msr.as_uint64);
 }
 
 #ifdef HAVE_VDSO_CLOCKMODE_HVCLOCK
@@ -467,7 +467,7 @@ static u64 notrace read_hv_clock_msr(void)
 	 * is set to 0 when the partition is created and is incremented in
 	 * 100 nanosecond units.
 	 */
-	return hv_get_register(HV_REGISTER_TIME_REF_COUNT);
+	return hv_get_register(HV_MSR_TIME_REF_COUNT);
 }
 
 static u64 notrace read_hv_clock_msr_cs(struct clocksource *arg)
@@ -554,14 +554,14 @@ static bool __init hv_init_tsc_clocksource(void)
 	 * thus TSC clocksource will work even without the real TSC page
 	 * mapped.
 	 */
-	tsc_msr.as_uint64 = hv_get_register(HV_REGISTER_REFERENCE_TSC);
+	tsc_msr.as_uint64 = hv_get_register(HV_MSR_REFERENCE_TSC);
 	if (hv_root_partition)
 		tsc_pfn = tsc_msr.pfn;
 	else
 		tsc_pfn = HVPFN_DOWN(virt_to_phys(tsc_page));
 	tsc_msr.enable = 1;
 	tsc_msr.pfn = tsc_pfn;
-	hv_set_register(HV_REGISTER_REFERENCE_TSC, tsc_msr.as_uint64);
+	hv_set_register(HV_MSR_REFERENCE_TSC, tsc_msr.as_uint64);
 
 	clocksource_register_hz(&hyperv_cs_tsc, NSEC_PER_SEC/100);
 
diff --git a/drivers/hv/hv.c b/drivers/hv/hv.c
index 7f6b00af7f01..bffd15ce06f3 100644
--- a/drivers/hv/hv.c
+++ b/drivers/hv/hv.c
@@ -172,7 +172,7 @@ void hv_synic_enable_regs(unsigned int cpu)
 	union hv_synic_scontrol sctrl;
 
 	/* Setup the Synic's message page */
-	simp.as_uint64 = hv_get_register(HV_REGISTER_SIMP);
+	simp.as_uint64 = hv_get_register(HV_MSR_SIMP);
 	simp.simp_enabled = 1;
 
 	if (hv_isolation_type_snp() || hv_root_partition) {
@@ -188,10 +188,10 @@ void hv_synic_enable_regs(unsigned int cpu)
 			>> HV_HYP_PAGE_SHIFT;
 	}
 
-	hv_set_register(HV_REGISTER_SIMP, simp.as_uint64);
+	hv_set_register(HV_MSR_SIMP, simp.as_uint64);
 
 	/* Setup the Synic's event page */
-	siefp.as_uint64 = hv_get_register(HV_REGISTER_SIEFP);
+	siefp.as_uint64 = hv_get_register(HV_MSR_SIEFP);
 	siefp.siefp_enabled = 1;
 
 	if (hv_isolation_type_snp() || hv_root_partition) {
@@ -207,12 +207,12 @@ void hv_synic_enable_regs(unsigned int cpu)
 			>> HV_HYP_PAGE_SHIFT;
 	}
 
-	hv_set_register(HV_REGISTER_SIEFP, siefp.as_uint64);
+	hv_set_register(HV_MSR_SIEFP, siefp.as_uint64);
 
 	/* Setup the shared SINT. */
 	if (vmbus_irq != -1)
 		enable_percpu_irq(vmbus_irq, 0);
-	shared_sint.as_uint64 = hv_get_register(HV_REGISTER_SINT0 +
+	shared_sint.as_uint64 = hv_get_register(HV_MSR_SINT0 +
 					VMBUS_MESSAGE_SINT);
 
 	shared_sint.vector = vmbus_interrupt;
@@ -228,14 +228,14 @@ void hv_synic_enable_regs(unsigned int cpu)
 #else
 	shared_sint.auto_eoi = 0;
 #endif
-	hv_set_register(HV_REGISTER_SINT0 + VMBUS_MESSAGE_SINT,
+	hv_set_register(HV_MSR_SINT0 + VMBUS_MESSAGE_SINT,
 				shared_sint.as_uint64);
 
 	/* Enable the global synic bit */
-	sctrl.as_uint64 = hv_get_register(HV_REGISTER_SCONTROL);
+	sctrl.as_uint64 = hv_get_register(HV_MSR_SCONTROL);
 	sctrl.enable = 1;
 
-	hv_set_register(HV_REGISTER_SCONTROL, sctrl.as_uint64);
+	hv_set_register(HV_MSR_SCONTROL, sctrl.as_uint64);
 }
 
 int hv_synic_init(unsigned int cpu)
@@ -259,17 +259,17 @@ void hv_synic_disable_regs(unsigned int cpu)
 	union hv_synic_siefp siefp;
 	union hv_synic_scontrol sctrl;
 
-	shared_sint.as_uint64 = hv_get_register(HV_REGISTER_SINT0 +
+	shared_sint.as_uint64 = hv_get_register(HV_MSR_SINT0 +
 					VMBUS_MESSAGE_SINT);
 
 	shared_sint.masked = 1;
 
 	/* Need to correctly cleanup in the case of SMP!!! */
 	/* Disable the interrupt */
-	hv_set_register(HV_REGISTER_SINT0 + VMBUS_MESSAGE_SINT,
+	hv_set_register(HV_MSR_SINT0 + VMBUS_MESSAGE_SINT,
 				shared_sint.as_uint64);
 
-	simp.as_uint64 = hv_get_register(HV_REGISTER_SIMP);
+	simp.as_uint64 = hv_get_register(HV_MSR_SIMP);
 	/*
 	 * In Isolation VM, sim and sief pages are allocated by
 	 * paravisor. These pages also will be used by kdump
@@ -284,9 +284,9 @@ void hv_synic_disable_regs(unsigned int cpu)
 		simp.base_simp_gpa = 0;
 	}
 
-	hv_set_register(HV_REGISTER_SIMP, simp.as_uint64);
+	hv_set_register(HV_MSR_SIMP, simp.as_uint64);
 
-	siefp.as_uint64 = hv_get_register(HV_REGISTER_SIEFP);
+	siefp.as_uint64 = hv_get_register(HV_MSR_SIEFP);
 	siefp.siefp_enabled = 0;
 
 	if (hv_isolation_type_snp() || hv_root_partition) {
@@ -296,12 +296,12 @@ void hv_synic_disable_regs(unsigned int cpu)
 		siefp.base_siefp_gpa = 0;
 	}
 
-	hv_set_register(HV_REGISTER_SIEFP, siefp.as_uint64);
+	hv_set_register(HV_MSR_SIEFP, siefp.as_uint64);
 
 	/* Disable the global synic bit */
-	sctrl.as_uint64 = hv_get_register(HV_REGISTER_SCONTROL);
+	sctrl.as_uint64 = hv_get_register(HV_MSR_SCONTROL);
 	sctrl.enable = 0;
-	hv_set_register(HV_REGISTER_SCONTROL, sctrl.as_uint64);
+	hv_set_register(HV_MSR_SCONTROL, sctrl.as_uint64);
 
 	if (vmbus_irq != -1)
 		disable_percpu_irq(vmbus_irq);
diff --git a/drivers/hv/hv_common.c b/drivers/hv/hv_common.c
index 64f9ceca887b..522d57a5e8a0 100644
--- a/drivers/hv/hv_common.c
+++ b/drivers/hv/hv_common.c
@@ -227,17 +227,17 @@ static void hv_kmsg_dump(struct kmsg_dumper *dumper,
 	 * contain the size of the panic data in that page. Rest of the
 	 * registers are no-op when the NOTIFY_MSG flag is set.
 	 */
-	hv_set_register(HV_REGISTER_CRASH_P0, 0);
-	hv_set_register(HV_REGISTER_CRASH_P1, 0);
-	hv_set_register(HV_REGISTER_CRASH_P2, 0);
-	hv_set_register(HV_REGISTER_CRASH_P3, virt_to_phys(hv_panic_page));
-	hv_set_register(HV_REGISTER_CRASH_P4, bytes_written);
+	hv_set_register(HV_MSR_CRASH_P0, 0);
+	hv_set_register(HV_MSR_CRASH_P1, 0);
+	hv_set_register(HV_MSR_CRASH_P2, 0);
+	hv_set_register(HV_MSR_CRASH_P3, virt_to_phys(hv_panic_page));
+	hv_set_register(HV_MSR_CRASH_P4, bytes_written);
 
 	/*
 	 * Let Hyper-V know there is crash data available along with
 	 * the panic message.
 	 */
-	hv_set_register(HV_REGISTER_CRASH_CTL,
+	hv_set_register(HV_MSR_CRASH_CTL,
 			(HV_CRASH_CTL_CRASH_NOTIFY |
 			 HV_CRASH_CTL_CRASH_NOTIFY_MSG));
 }
@@ -310,7 +310,7 @@ int __init hv_common_init(void)
 		 * Register for panic kmsg callback only if the right
 		 * capability is supported by the hypervisor.
 		 */
-		hyperv_crash_ctl = hv_get_register(HV_REGISTER_CRASH_CTL);
+		hyperv_crash_ctl = hv_get_register(HV_MSR_CRASH_CTL);
 		if (hyperv_crash_ctl & HV_CRASH_CTL_CRASH_NOTIFY_MSG)
 			hv_kmsg_dump_register();
 
@@ -373,7 +373,7 @@ int hv_common_cpu_init(unsigned int cpu)
 		*outputarg = (char *)(*inputarg) + HV_HYP_PAGE_SIZE;
 	}
 
-	msr_vp_index = hv_get_register(HV_REGISTER_VP_INDEX);
+	msr_vp_index = hv_get_register(HV_MSR_VP_INDEX);
 
 	hv_vp_index[cpu] = msr_vp_index;
 
@@ -477,7 +477,7 @@ EXPORT_SYMBOL_GPL(hv_is_hibernation_supported);
  */
 static u64 __hv_read_ref_counter(void)
 {
-	return hv_get_register(HV_REGISTER_TIME_REF_COUNT);
+	return hv_get_register(HV_MSR_TIME_REF_COUNT);
 }
 
 u64 (*hv_read_reference_counter)(void) = __hv_read_ref_counter;
diff --git a/include/asm-generic/mshyperv.h b/include/asm-generic/mshyperv.h
index 402a8c1c202d..094c57320ed1 100644
--- a/include/asm-generic/mshyperv.h
+++ b/include/asm-generic/mshyperv.h
@@ -149,7 +149,7 @@ static inline void vmbus_signal_eom(struct hv_message *msg, u32 old_msg_type)
 		 * possibly deliver another msg from the
 		 * hypervisor
 		 */
-		hv_set_register(HV_REGISTER_EOM, 0);
+		hv_set_register(HV_MSR_EOM, 0);
 	}
 }
 
-- 
2.25.1


^ permalink raw reply related

* [PATCH 04/15] asm-generic/mshyperv: Introduce hv_recommend_using_aeoi()
From: Nuno Das Neves @ 2023-07-27 19:54 UTC (permalink / raw)
  To: linux-hyperv, linux-kernel, x86, linux-arm-kernel, linux-arch
  Cc: mikelley, kys, wei.liu, haiyangz, decui, ssengar, mukeshrathor,
	stanislav.kinsburskiy, jinankjain, apais, Tianyu.Lan, vkuznets,
	tglx, mingo, bp, dave.hansen, hpa, will, catalin.marinas
In-Reply-To: <1690487690-2428-1-git-send-email-nunodasneves@linux.microsoft.com>

Factor out logic for determining if we should set the auto eoi flag in SINT
register.

Signed-off-by: Nuno Das Neves <nunodasneves@linux.microsoft.com>
---
 drivers/hv/hv.c                | 12 +-----------
 include/asm-generic/mshyperv.h | 13 +++++++++++++
 2 files changed, 14 insertions(+), 11 deletions(-)

diff --git a/drivers/hv/hv.c b/drivers/hv/hv.c
index bffd15ce06f3..a897951634af 100644
--- a/drivers/hv/hv.c
+++ b/drivers/hv/hv.c
@@ -217,17 +217,7 @@ void hv_synic_enable_regs(unsigned int cpu)
 
 	shared_sint.vector = vmbus_interrupt;
 	shared_sint.masked = false;
-
-	/*
-	 * On architectures where Hyper-V doesn't support AEOI (e.g., ARM64),
-	 * it doesn't provide a recommendation flag and AEOI must be disabled.
-	 */
-#ifdef HV_DEPRECATING_AEOI_RECOMMENDED
-	shared_sint.auto_eoi =
-			!(ms_hyperv.hints & HV_DEPRECATING_AEOI_RECOMMENDED);
-#else
-	shared_sint.auto_eoi = 0;
-#endif
+	shared_sint.auto_eoi = hv_recommend_using_aeoi();
 	hv_set_register(HV_MSR_SINT0 + VMBUS_MESSAGE_SINT,
 				shared_sint.as_uint64);
 
diff --git a/include/asm-generic/mshyperv.h b/include/asm-generic/mshyperv.h
index 447e7ebe67ee..90fcbb95f1ee 100644
--- a/include/asm-generic/mshyperv.h
+++ b/include/asm-generic/mshyperv.h
@@ -77,6 +77,19 @@ extern u64 hv_do_hypercall(u64 control, void *inputaddr, void *outputaddr);
 extern u64 hv_do_fast_hypercall8(u16 control, u64 input8);
 extern bool hv_isolation_type_snp(void);
 
+/*
+ * On architectures where Hyper-V doesn't support AEOI (e.g., ARM64),
+ * it doesn't provide a recommendation flag and AEOI must be disabled.
+ */
+static inline bool hv_recommend_using_aeoi(void)
+{
+#ifdef HV_DEPRECATING_AEOI_RECOMMENDED
+	return !(ms_hyperv.hints & HV_DEPRECATING_AEOI_RECOMMENDED);
+#else
+	return false;
+#endif
+}
+
 /* Helper functions that provide a consistent pattern for checking Hyper-V hypercall status. */
 static inline int hv_result(u64 status)
 {
-- 
2.25.1


^ permalink raw reply related

* [PATCH 03/15] mshyperv: Introduce numa_node_to_proximity_domain_info
From: Nuno Das Neves @ 2023-07-27 19:54 UTC (permalink / raw)
  To: linux-hyperv, linux-kernel, x86, linux-arm-kernel, linux-arch
  Cc: mikelley, kys, wei.liu, haiyangz, decui, ssengar, mukeshrathor,
	stanislav.kinsburskiy, jinankjain, apais, Tianyu.Lan, vkuznets,
	tglx, mingo, bp, dave.hansen, hpa, will, catalin.marinas
In-Reply-To: <1690487690-2428-1-git-send-email-nunodasneves@linux.microsoft.com>

Factor out logic for converting numa node to proximity domain info into
a helper function, and export it.

Signed-off-by: Nuno Das Neves <nunodasneves@linux.microsoft.com>
---
 arch/x86/hyperv/hv_proc.c      |  8 ++------
 drivers/acpi/numa/srat.c       |  1 +
 include/asm-generic/mshyperv.h | 18 ++++++++++++++++++
 3 files changed, 21 insertions(+), 6 deletions(-)

diff --git a/arch/x86/hyperv/hv_proc.c b/arch/x86/hyperv/hv_proc.c
index 68a0843d4750..5ba5ca1b2089 100644
--- a/arch/x86/hyperv/hv_proc.c
+++ b/arch/x86/hyperv/hv_proc.c
@@ -121,7 +121,6 @@ int hv_call_add_logical_proc(int node, u32 lp_index, u32 apic_id)
 	u64 status;
 	unsigned long flags;
 	int ret = HV_STATUS_SUCCESS;
-	int pxm = node_to_pxm(node);
 
 	/*
 	 * When adding a logical processor, the hypervisor may return
@@ -137,11 +136,8 @@ int hv_call_add_logical_proc(int node, u32 lp_index, u32 apic_id)
 
 		input->lp_index = lp_index;
 		input->apic_id = apic_id;
-		input->flags = 0;
-		input->proximity_domain_info.domain_id = pxm;
-		input->proximity_domain_info.flags.reserved = 0;
-		input->proximity_domain_info.flags.proximity_info_valid = 1;
-		input->proximity_domain_info.flags.proximity_preferred = 1;
+		input->proximity_domain_info =
+			numa_node_to_proximity_domain_info(node);
 		status = hv_do_hypercall(HVCALL_ADD_LOGICAL_PROCESSOR,
 					 input, output);
 		local_irq_restore(flags);
diff --git a/drivers/acpi/numa/srat.c b/drivers/acpi/numa/srat.c
index 1f4fc5f8a819..0cf9f0574495 100644
--- a/drivers/acpi/numa/srat.c
+++ b/drivers/acpi/numa/srat.c
@@ -48,6 +48,7 @@ int node_to_pxm(int node)
 		return PXM_INVAL;
 	return node_to_pxm_map[node];
 }
+EXPORT_SYMBOL(node_to_pxm);
 
 static void __acpi_map_pxm_to_node(int pxm, int node)
 {
diff --git a/include/asm-generic/mshyperv.h b/include/asm-generic/mshyperv.h
index 233c976344e5..447e7ebe67ee 100644
--- a/include/asm-generic/mshyperv.h
+++ b/include/asm-generic/mshyperv.h
@@ -21,6 +21,7 @@
 #include <linux/types.h>
 #include <linux/atomic.h>
 #include <linux/bitops.h>
+#include <acpi/acpi_numa.h>
 #include <linux/cpumask.h>
 #include <linux/nmi.h>
 #include <asm/ptrace.h>
@@ -28,6 +29,23 @@
 
 #define VTPM_BASE_ADDRESS 0xfed40000
 
+static inline union hv_proximity_domain_info
+numa_node_to_proximity_domain_info(int node)
+{
+	union hv_proximity_domain_info proximity_domain_info;
+
+	if (node != NUMA_NO_NODE) {
+		proximity_domain_info.domain_id = node_to_pxm(node);
+		proximity_domain_info.flags.reserved = 0;
+		proximity_domain_info.flags.proximity_info_valid = 1;
+		proximity_domain_info.flags.proximity_preferred = 1;
+	} else {
+		proximity_domain_info.as_uint64 = 0;
+	}
+
+	return proximity_domain_info;
+}
+
 struct ms_hyperv_info {
 	u32 features;
 	u32 priv_high;
-- 
2.25.1


^ permalink raw reply related

* Re: [PATCH RFC net-next v5 13/14] virtio/vsock: implement datagram support
From: Arseniy Krasnov @ 2023-07-27  8:09 UTC (permalink / raw)
  To: Bobby Eshleman
  Cc: Bobby Eshleman, Stefan Hajnoczi, Stefano Garzarella,
	Michael S. Tsirkin, Jason Wang, Xuan Zhuo, David S. Miller,
	Eric Dumazet, Jakub Kicinski, Paolo Abeni, K. Y. Srinivasan,
	Haiyang Zhang, Wei Liu, Dexuan Cui, Bryan Tan, Vishnu Dasa,
	VMware PV-Drivers Reviewers, Dan Carpenter, Simon Horman, kvm,
	virtualization, netdev, linux-kernel, linux-hyperv, bpf
In-Reply-To: <ZMFetBpO0OdzXtnK@bullseye>



On 26.07.2023 20:58, Bobby Eshleman wrote:
> On Sat, Jul 22, 2023 at 11:45:29AM +0300, Arseniy Krasnov wrote:
>>
>>
>> On 19.07.2023 03:50, Bobby Eshleman wrote:
>>> This commit implements datagram support for virtio/vsock by teaching
>>> virtio to use the general virtio transport ->dgram_addr_init() function
>>> and implementing a new version of ->dgram_allow().
>>>
>>> Additionally, it drops virtio_transport_dgram_allow() as an exported
>>> symbol because it is no longer used in other transports.
>>>
>>> Signed-off-by: Bobby Eshleman <bobby.eshleman@bytedance.com>
>>> ---
>>>  include/linux/virtio_vsock.h            |  1 -
>>>  net/vmw_vsock/virtio_transport.c        | 24 +++++++++++++++++++++++-
>>>  net/vmw_vsock/virtio_transport_common.c |  6 ------
>>>  3 files changed, 23 insertions(+), 8 deletions(-)
>>>
>>> diff --git a/include/linux/virtio_vsock.h b/include/linux/virtio_vsock.h
>>> index b3856b8a42b3..d0a4f08b12c1 100644
>>> --- a/include/linux/virtio_vsock.h
>>> +++ b/include/linux/virtio_vsock.h
>>> @@ -211,7 +211,6 @@ void virtio_transport_notify_buffer_size(struct vsock_sock *vsk, u64 *val);
>>>  u64 virtio_transport_stream_rcvhiwat(struct vsock_sock *vsk);
>>>  bool virtio_transport_stream_is_active(struct vsock_sock *vsk);
>>>  bool virtio_transport_stream_allow(u32 cid, u32 port);
>>> -bool virtio_transport_dgram_allow(u32 cid, u32 port);
>>>  void virtio_transport_dgram_addr_init(struct sk_buff *skb,
>>>  				      struct sockaddr_vm *addr);
>>>  
>>> diff --git a/net/vmw_vsock/virtio_transport.c b/net/vmw_vsock/virtio_transport.c
>>> index ac2126c7dac5..713718861bd4 100644
>>> --- a/net/vmw_vsock/virtio_transport.c
>>> +++ b/net/vmw_vsock/virtio_transport.c
>>> @@ -63,6 +63,7 @@ struct virtio_vsock {
>>>  
>>>  	u32 guest_cid;
>>>  	bool seqpacket_allow;
>>> +	bool dgram_allow;
>>>  };
>>>  
>>>  static u32 virtio_transport_get_local_cid(void)
>>> @@ -413,6 +414,7 @@ static void virtio_vsock_rx_done(struct virtqueue *vq)
>>>  	queue_work(virtio_vsock_workqueue, &vsock->rx_work);
>>>  }
>>>  
>>> +static bool virtio_transport_dgram_allow(u32 cid, u32 port);
>>
>> Maybe add the function body here, without the forward declaration? Same for loopback and vhost.
>>
> 
> Sounds okay with me, but this seems to go against the pattern
> established by seqpacket. Any reason why?

Stefano Garzarella <sgarzare@redhat.com> commented on my patch with the same approach:

https://lore.kernel.org/netdev/lex6l5suez7azhirt22lidndtjomkbagfbpvvi5p7c2t7klzas@4l2qly7at37c/

Thanks, Arseniy


> 
>>>  static bool virtio_transport_seqpacket_allow(u32 remote_cid);
>>>  
>>>  static struct virtio_transport virtio_transport = {
>>> @@ -430,6 +432,7 @@ static struct virtio_transport virtio_transport = {
>>>  
>>>  		.dgram_enqueue            = virtio_transport_dgram_enqueue,
>>>  		.dgram_allow              = virtio_transport_dgram_allow,
>>> +		.dgram_addr_init          = virtio_transport_dgram_addr_init,
>>>  
>>>  		.stream_dequeue           = virtio_transport_stream_dequeue,
>>>  		.stream_enqueue           = virtio_transport_stream_enqueue,
>>> @@ -462,6 +465,21 @@ static struct virtio_transport virtio_transport = {
>>>  	.send_pkt = virtio_transport_send_pkt,
>>>  };
>>>  
>>> +static bool virtio_transport_dgram_allow(u32 cid, u32 port)
>>> +{
>>> +	struct virtio_vsock *vsock;
>>> +	bool dgram_allow;
>>> +
>>> +	dgram_allow = false;
>>> +	rcu_read_lock();
>>> +	vsock = rcu_dereference(the_virtio_vsock);
>>> +	if (vsock)
>>> +		dgram_allow = vsock->dgram_allow;
>>> +	rcu_read_unlock();
>>> +
>>> +	return dgram_allow;
>>> +}
>>> +
>>>  static bool virtio_transport_seqpacket_allow(u32 remote_cid)
>>>  {
>>>  	struct virtio_vsock *vsock;
>>> @@ -655,6 +673,9 @@ static int virtio_vsock_probe(struct virtio_device *vdev)
>>>  	if (virtio_has_feature(vdev, VIRTIO_VSOCK_F_SEQPACKET))
>>>  		vsock->seqpacket_allow = true;
>>>  
>>> +	if (virtio_has_feature(vdev, VIRTIO_VSOCK_F_DGRAM))
>>> +		vsock->dgram_allow = true;
>>> +
>>>  	vdev->priv = vsock;
>>>  
>>>  	ret = virtio_vsock_vqs_init(vsock);
>>> @@ -747,7 +768,8 @@ static struct virtio_device_id id_table[] = {
>>>  };
>>>  
>>>  static unsigned int features[] = {
>>> -	VIRTIO_VSOCK_F_SEQPACKET
>>> +	VIRTIO_VSOCK_F_SEQPACKET,
>>> +	VIRTIO_VSOCK_F_DGRAM
>>>  };
>>>  
>>>  static struct virtio_driver virtio_vsock_driver = {
>>> diff --git a/net/vmw_vsock/virtio_transport_common.c b/net/vmw_vsock/virtio_transport_common.c
>>> index 96118e258097..77898f5325cd 100644
>>> --- a/net/vmw_vsock/virtio_transport_common.c
>>> +++ b/net/vmw_vsock/virtio_transport_common.c
>>> @@ -783,12 +783,6 @@ bool virtio_transport_stream_allow(u32 cid, u32 port)
>>>  }
>>>  EXPORT_SYMBOL_GPL(virtio_transport_stream_allow);
>>>  
>>> -bool virtio_transport_dgram_allow(u32 cid, u32 port)
>>> -{
>>> -	return false;
>>> -}
>>> -EXPORT_SYMBOL_GPL(virtio_transport_dgram_allow);
>>> -
>>>  int virtio_transport_connect(struct vsock_sock *vsk)
>>>  {
>>>  	struct virtio_vsock_pkt_info info = {
>>>
>>
>> Thanks, Arseniy
> 
> Thanks,
> Bobby

^ permalink raw reply

* Re: [PATCH RFC net-next v5 11/14] vhost/vsock: implement datagram support
From: Arseniy Krasnov @ 2023-07-27  8:04 UTC (permalink / raw)
  To: Bobby Eshleman
  Cc: Bobby Eshleman, Stefan Hajnoczi, Stefano Garzarella,
	Michael S. Tsirkin, Jason Wang, Xuan Zhuo, David S. Miller,
	Eric Dumazet, Jakub Kicinski, Paolo Abeni, K. Y. Srinivasan,
	Haiyang Zhang, Wei Liu, Dexuan Cui, Bryan Tan, Vishnu Dasa,
	VMware PV-Drivers Reviewers, Dan Carpenter, Simon Horman, kvm,
	virtualization, netdev, linux-kernel, linux-hyperv, bpf
In-Reply-To: <ZMFd+Jd/LrfpJsVA@bullseye>



On 26.07.2023 20:55, Bobby Eshleman wrote:
> On Sat, Jul 22, 2023 at 11:42:38AM +0300, Arseniy Krasnov wrote:
>>
>>
>> On 19.07.2023 03:50, Bobby Eshleman wrote:
>>> This commit implements datagram support for vhost/vsock by teaching
>>> vhost to use the common virtio transport datagram functions.
>>>
>>> If the virtio RX buffer is too small, then the transmission is
>>> abandoned, the packet dropped, and EHOSTUNREACH is added to the socket's
>>> error queue.
>>>
>>> Signed-off-by: Bobby Eshleman <bobby.eshleman@bytedance.com>
>>> ---
>>>  drivers/vhost/vsock.c    | 62 +++++++++++++++++++++++++++++++++++++++++++++---
>>>  net/vmw_vsock/af_vsock.c |  5 +++-
>>>  2 files changed, 63 insertions(+), 4 deletions(-)
>>>
>>> diff --git a/drivers/vhost/vsock.c b/drivers/vhost/vsock.c
>>> index d5d6a3c3f273..da14260c6654 100644
>>> --- a/drivers/vhost/vsock.c
>>> +++ b/drivers/vhost/vsock.c
>>> @@ -8,6 +8,7 @@
>>>   */
>>>  #include <linux/miscdevice.h>
>>>  #include <linux/atomic.h>
>>> +#include <linux/errqueue.h>
>>>  #include <linux/module.h>
>>>  #include <linux/mutex.h>
>>>  #include <linux/vmalloc.h>
>>> @@ -32,7 +33,8 @@
>>>  enum {
>>>  	VHOST_VSOCK_FEATURES = VHOST_FEATURES |
>>>  			       (1ULL << VIRTIO_F_ACCESS_PLATFORM) |
>>> -			       (1ULL << VIRTIO_VSOCK_F_SEQPACKET)
>>> +			       (1ULL << VIRTIO_VSOCK_F_SEQPACKET) |
>>> +			       (1ULL << VIRTIO_VSOCK_F_DGRAM)
>>>  };
>>>  
>>>  enum {
>>> @@ -56,6 +58,7 @@ struct vhost_vsock {
>>>  	atomic_t queued_replies;
>>>  
>>>  	u32 guest_cid;
>>> +	bool dgram_allow;
>>>  	bool seqpacket_allow;
>>>  };
>>>  
>>> @@ -86,6 +89,32 @@ static struct vhost_vsock *vhost_vsock_get(u32 guest_cid)
>>>  	return NULL;
>>>  }
>>>  
>>> +/* Claims ownership of the skb, do not free the skb after calling! */
>>> +static void
>>> +vhost_transport_error(struct sk_buff *skb, int err)
>>> +{
>>> +	struct sock_exterr_skb *serr;
>>> +	struct sock *sk = skb->sk;
>>> +	struct sk_buff *clone;
>>> +
>>> +	serr = SKB_EXT_ERR(skb);
>>> +	memset(serr, 0, sizeof(*serr));
>>> +	serr->ee.ee_errno = err;
>>> +	serr->ee.ee_origin = SO_EE_ORIGIN_NONE;
>>> +
>>> +	clone = skb_clone(skb, GFP_KERNEL);
>>
>> Maybe for the skb which is the error carrier we can use 'sock_omalloc()', not 'skb_clone()'? TCP uses skbs
>> allocated by this function as carriers of the error structure. I guess 'skb_clone()' also clones the data of the original,
>> but I think there is no need for the data, as we insert it into the error queue of the socket.
>>
>> What do You think?
> 
> IIUC skb_clone() is often used in this scenario so that the user can
> retrieve the error-causing packet from the error queue.  Is there some
> reason we shouldn't do this?
> 
> I'm seeing that the serr bits need to occur on the clone here, not the
> original. I didn't realize the SKB_EXT_ERR() is a skb->cb cast. I'm not
> actually sure how this passes the test case since ->cb isn't cloned.
> 
>>
>>> +	if (!clone)
>>> +		return;
>>
>> What will happen here if 'clone' is NULL? Will the skb leak, as it was already removed from the queue?
>>
> 
> Ah yes, true.
> 
>>> +
>>> +	if (sock_queue_err_skb(sk, clone))
>>> +		kfree_skb(clone);
>>> +
>>> +	sk->sk_err = err;
>>> +	sk_error_report(sk);
>>> +
>>> +	kfree_skb(skb);
>>> +}
>>> +
>>>  static void
>>>  vhost_transport_do_send_pkt(struct vhost_vsock *vsock,
>>>  			    struct vhost_virtqueue *vq)
>>> @@ -160,9 +189,15 @@ vhost_transport_do_send_pkt(struct vhost_vsock *vsock,
>>>  		hdr = virtio_vsock_hdr(skb);
>>>  
>>>  		/* If the packet is greater than the space available in the
>>> -		 * buffer, we split it using multiple buffers.
>>> +		 * buffer, we split it using multiple buffers for connectible
>>> +		 * sockets and drop the packet for datagram sockets.
>>>  		 */
>>>  		if (payload_len > iov_len - sizeof(*hdr)) {
>>> +			if (le16_to_cpu(hdr->type) == VIRTIO_VSOCK_TYPE_DGRAM) {
>>> +				vhost_transport_error(skb, EHOSTUNREACH);
>>> +				continue;
>>> +			}
>>> +
>>>  			payload_len = iov_len - sizeof(*hdr);
>>>  
>>>  			/* As we are copying pieces of large packet's buffer to
>>> @@ -394,6 +429,7 @@ static bool vhost_vsock_more_replies(struct vhost_vsock *vsock)
>>>  	return val < vq->num;
>>>  }
>>>  
>>> +static bool vhost_transport_dgram_allow(u32 cid, u32 port);
>>>  static bool vhost_transport_seqpacket_allow(u32 remote_cid);
>>>  
>>>  static struct virtio_transport vhost_transport = {
>>> @@ -410,7 +446,8 @@ static struct virtio_transport vhost_transport = {
>>>  		.cancel_pkt               = vhost_transport_cancel_pkt,
>>>  
>>>  		.dgram_enqueue            = virtio_transport_dgram_enqueue,
>>> -		.dgram_allow              = virtio_transport_dgram_allow,
>>> +		.dgram_allow              = vhost_transport_dgram_allow,
>>> +		.dgram_addr_init          = virtio_transport_dgram_addr_init,
>>>  
>>>  		.stream_enqueue           = virtio_transport_stream_enqueue,
>>>  		.stream_dequeue           = virtio_transport_stream_dequeue,
>>> @@ -443,6 +480,22 @@ static struct virtio_transport vhost_transport = {
>>>  	.send_pkt = vhost_transport_send_pkt,
>>>  };
>>>  
>>> +static bool vhost_transport_dgram_allow(u32 cid, u32 port)
>>> +{
>>> +	struct vhost_vsock *vsock;
>>> +	bool dgram_allow = false;
>>> +
>>> +	rcu_read_lock();
>>> +	vsock = vhost_vsock_get(cid);
>>> +
>>> +	if (vsock)
>>> +		dgram_allow = vsock->dgram_allow;
>>> +
>>> +	rcu_read_unlock();
>>> +
>>> +	return dgram_allow;
>>> +}
>>> +
>>>  static bool vhost_transport_seqpacket_allow(u32 remote_cid)
>>>  {
>>>  	struct vhost_vsock *vsock;
>>> @@ -799,6 +852,9 @@ static int vhost_vsock_set_features(struct vhost_vsock *vsock, u64 features)
>>>  	if (features & (1ULL << VIRTIO_VSOCK_F_SEQPACKET))
>>>  		vsock->seqpacket_allow = true;
>>>  
>>> +	if (features & (1ULL << VIRTIO_VSOCK_F_DGRAM))
>>> +		vsock->dgram_allow = true;
>>> +
>>>  	for (i = 0; i < ARRAY_SIZE(vsock->vqs); i++) {
>>>  		vq = &vsock->vqs[i];
>>>  		mutex_lock(&vq->mutex);
>>> diff --git a/net/vmw_vsock/af_vsock.c b/net/vmw_vsock/af_vsock.c
>>> index e73f3b2c52f1..449ed63ac2b0 100644
>>> --- a/net/vmw_vsock/af_vsock.c
>>> +++ b/net/vmw_vsock/af_vsock.c
>>> @@ -1427,9 +1427,12 @@ int vsock_dgram_recvmsg(struct socket *sock, struct msghdr *msg,
>>>  		return prot->recvmsg(sk, msg, len, flags, NULL);
>>>  #endif
>>>  
>>> -	if (flags & MSG_OOB || flags & MSG_ERRQUEUE)
>>> +	if (unlikely(flags & MSG_OOB))
>>>  		return -EOPNOTSUPP;
>>>  
>>> +	if (unlikely(flags & MSG_ERRQUEUE))
>>> +		return sock_recv_errqueue(sk, msg, len, SOL_VSOCK, 0);
>>> +
>>
>> Sorry, but I get a build error here, because SOL_VSOCK is undefined. I think it should be added to
>> include/linux/socket.h and also to the uapi files for future use in userspace.
>>
> 
> Strange, I built each patch individually without issue. My base is
> netdev/main with your SOL_VSOCK patch applied. I will look today and see
> if I'm missing something.

I see, that is the difference: I'm trying to run this patchset on the latest net-next (as it is
supposed to be merged to net-next). I guess you should add this define anyway when you are ready to
be merged to net-next (I really don't know which SOL_VSOCK will be merged first - "yours" or "mine" :) )

Thanks, Arseniy

> 
>> Also, Stefano Garzarella <sgarzare@redhat.com> suggested adding a define, something like VSOCK_RECVERR,
>> in the same way as IP_RECVERR, and using it as the last parameter of 'sock_recv_errqueue()'.
>>
> 
> Got it, thanks.
> 
>>>  	transport = vsk->transport;
>>>  
>>>  	/* Retrieve the head sk_buff from the socket's receive queue. */
>>>
>>
>> Thanks, Arseniy
> 
> Thanks,
> Bobby

^ permalink raw reply

* Re: [PATCH RFC net-next v5 11/14] vhost/vsock: implement datagram support
From: Arseniy Krasnov @ 2023-07-27  8:00 UTC (permalink / raw)
  To: Bobby Eshleman
  Cc: Bobby Eshleman, Stefan Hajnoczi, Stefano Garzarella,
	Michael S. Tsirkin, Jason Wang, Xuan Zhuo, David S. Miller,
	Eric Dumazet, Jakub Kicinski, Paolo Abeni, K. Y. Srinivasan,
	Haiyang Zhang, Wei Liu, Dexuan Cui, Bryan Tan, Vishnu Dasa,
	VMware PV-Drivers Reviewers, Dan Carpenter, Simon Horman, kvm,
	virtualization, netdev, linux-kernel, linux-hyperv, bpf
In-Reply-To: <ZMFd+Jd/LrfpJsVA@bullseye>



On 26.07.2023 20:55, Bobby Eshleman wrote:
> On Sat, Jul 22, 2023 at 11:42:38AM +0300, Arseniy Krasnov wrote:
>>
>>
>> On 19.07.2023 03:50, Bobby Eshleman wrote:
>>> This commit implements datagram support for vhost/vsock by teaching
>>> vhost to use the common virtio transport datagram functions.
>>>
>>> If the virtio RX buffer is too small, then the transmission is
>>> abandoned, the packet dropped, and EHOSTUNREACH is added to the socket's
>>> error queue.
>>>
>>> Signed-off-by: Bobby Eshleman <bobby.eshleman@bytedance.com>
>>> ---
>>>  drivers/vhost/vsock.c    | 62 +++++++++++++++++++++++++++++++++++++++++++++---
>>>  net/vmw_vsock/af_vsock.c |  5 +++-
>>>  2 files changed, 63 insertions(+), 4 deletions(-)
>>>
>>> diff --git a/drivers/vhost/vsock.c b/drivers/vhost/vsock.c
>>> index d5d6a3c3f273..da14260c6654 100644
>>> --- a/drivers/vhost/vsock.c
>>> +++ b/drivers/vhost/vsock.c
>>> @@ -8,6 +8,7 @@
>>>   */
>>>  #include <linux/miscdevice.h>
>>>  #include <linux/atomic.h>
>>> +#include <linux/errqueue.h>
>>>  #include <linux/module.h>
>>>  #include <linux/mutex.h>
>>>  #include <linux/vmalloc.h>
>>> @@ -32,7 +33,8 @@
>>>  enum {
>>>  	VHOST_VSOCK_FEATURES = VHOST_FEATURES |
>>>  			       (1ULL << VIRTIO_F_ACCESS_PLATFORM) |
>>> -			       (1ULL << VIRTIO_VSOCK_F_SEQPACKET)
>>> +			       (1ULL << VIRTIO_VSOCK_F_SEQPACKET) |
>>> +			       (1ULL << VIRTIO_VSOCK_F_DGRAM)
>>>  };
>>>  
>>>  enum {
>>> @@ -56,6 +58,7 @@ struct vhost_vsock {
>>>  	atomic_t queued_replies;
>>>  
>>>  	u32 guest_cid;
>>> +	bool dgram_allow;
>>>  	bool seqpacket_allow;
>>>  };
>>>  
>>> @@ -86,6 +89,32 @@ static struct vhost_vsock *vhost_vsock_get(u32 guest_cid)
>>>  	return NULL;
>>>  }
>>>  
>>> +/* Claims ownership of the skb, do not free the skb after calling! */
>>> +static void
>>> +vhost_transport_error(struct sk_buff *skb, int err)
>>> +{
>>> +	struct sock_exterr_skb *serr;
>>> +	struct sock *sk = skb->sk;
>>> +	struct sk_buff *clone;
>>> +
>>> +	serr = SKB_EXT_ERR(skb);
>>> +	memset(serr, 0, sizeof(*serr));
>>> +	serr->ee.ee_errno = err;
>>> +	serr->ee.ee_origin = SO_EE_ORIGIN_NONE;
>>> +
>>> +	clone = skb_clone(skb, GFP_KERNEL);
>>
>> Maybe for the skb which is the error carrier we can use 'sock_omalloc()', not 'skb_clone()'? TCP uses skbs
>> allocated by this function as carriers of the error structure. I guess 'skb_clone()' also clones the data of the original,
>> but I think there is no need for the data, as we insert it into the error queue of the socket.
>>
>> What do You think?
> 
> IIUC skb_clone() is often used in this scenario so that the user can
> retrieve the error-causing packet from the error queue.  Is there some
> reason we shouldn't do this?
> 
> I'm seeing that the serr bits need to occur on the clone here, not the
> original. I didn't realize the SKB_EXT_ERR() is a skb->cb cast. I'm not
> actually sure how this passes the test case since ->cb isn't cloned.

Ah yes, sorry, you are right, I just confused this case with zerocopy completion
handling - there we allocate an "empty" skb which carries completion metadata in its
'cb' field.

Hm, but can't we just reinsert the current skb (updating its 'cb' as 'sock_exterr_skb')
into the error queue of the socket without cloning it?

Thanks, Arseniy

> 
>>
>>> +	if (!clone)
>>> +		return;
>>
>> What will happen here if 'clone' is NULL? Will the skb leak, as it was already removed from the queue?
>>
> 
> Ah yes, true.
> 
>>> +
>>> +	if (sock_queue_err_skb(sk, clone))
>>> +		kfree_skb(clone);
>>> +
>>> +	sk->sk_err = err;
>>> +	sk_error_report(sk);
>>> +
>>> +	kfree_skb(skb);
>>> +}
>>> +
>>>  static void
>>>  vhost_transport_do_send_pkt(struct vhost_vsock *vsock,
>>>  			    struct vhost_virtqueue *vq)
>>> @@ -160,9 +189,15 @@ vhost_transport_do_send_pkt(struct vhost_vsock *vsock,
>>>  		hdr = virtio_vsock_hdr(skb);
>>>  
>>>  		/* If the packet is greater than the space available in the
>>> -		 * buffer, we split it using multiple buffers.
>>> +		 * buffer, we split it using multiple buffers for connectible
>>> +		 * sockets and drop the packet for datagram sockets.
>>>  		 */
>>>  		if (payload_len > iov_len - sizeof(*hdr)) {
>>> +			if (le16_to_cpu(hdr->type) == VIRTIO_VSOCK_TYPE_DGRAM) {
>>> +				vhost_transport_error(skb, EHOSTUNREACH);
>>> +				continue;
>>> +			}
>>> +
>>>  			payload_len = iov_len - sizeof(*hdr);
>>>  
>>>  			/* As we are copying pieces of large packet's buffer to
>>> @@ -394,6 +429,7 @@ static bool vhost_vsock_more_replies(struct vhost_vsock *vsock)
>>>  	return val < vq->num;
>>>  }
>>>  
>>> +static bool vhost_transport_dgram_allow(u32 cid, u32 port);
>>>  static bool vhost_transport_seqpacket_allow(u32 remote_cid);
>>>  
>>>  static struct virtio_transport vhost_transport = {
>>> @@ -410,7 +446,8 @@ static struct virtio_transport vhost_transport = {
>>>  		.cancel_pkt               = vhost_transport_cancel_pkt,
>>>  
>>>  		.dgram_enqueue            = virtio_transport_dgram_enqueue,
>>> -		.dgram_allow              = virtio_transport_dgram_allow,
>>> +		.dgram_allow              = vhost_transport_dgram_allow,
>>> +		.dgram_addr_init          = virtio_transport_dgram_addr_init,
>>>  
>>>  		.stream_enqueue           = virtio_transport_stream_enqueue,
>>>  		.stream_dequeue           = virtio_transport_stream_dequeue,
>>> @@ -443,6 +480,22 @@ static struct virtio_transport vhost_transport = {
>>>  	.send_pkt = vhost_transport_send_pkt,
>>>  };
>>>  
>>> +static bool vhost_transport_dgram_allow(u32 cid, u32 port)
>>> +{
>>> +	struct vhost_vsock *vsock;
>>> +	bool dgram_allow = false;
>>> +
>>> +	rcu_read_lock();
>>> +	vsock = vhost_vsock_get(cid);
>>> +
>>> +	if (vsock)
>>> +		dgram_allow = vsock->dgram_allow;
>>> +
>>> +	rcu_read_unlock();
>>> +
>>> +	return dgram_allow;
>>> +}
>>> +
>>>  static bool vhost_transport_seqpacket_allow(u32 remote_cid)
>>>  {
>>>  	struct vhost_vsock *vsock;
>>> @@ -799,6 +852,9 @@ static int vhost_vsock_set_features(struct vhost_vsock *vsock, u64 features)
>>>  	if (features & (1ULL << VIRTIO_VSOCK_F_SEQPACKET))
>>>  		vsock->seqpacket_allow = true;
>>>  
>>> +	if (features & (1ULL << VIRTIO_VSOCK_F_DGRAM))
>>> +		vsock->dgram_allow = true;
>>> +
>>>  	for (i = 0; i < ARRAY_SIZE(vsock->vqs); i++) {
>>>  		vq = &vsock->vqs[i];
>>>  		mutex_lock(&vq->mutex);
>>> diff --git a/net/vmw_vsock/af_vsock.c b/net/vmw_vsock/af_vsock.c
>>> index e73f3b2c52f1..449ed63ac2b0 100644
>>> --- a/net/vmw_vsock/af_vsock.c
>>> +++ b/net/vmw_vsock/af_vsock.c
>>> @@ -1427,9 +1427,12 @@ int vsock_dgram_recvmsg(struct socket *sock, struct msghdr *msg,
>>>  		return prot->recvmsg(sk, msg, len, flags, NULL);
>>>  #endif
>>>  
>>> -	if (flags & MSG_OOB || flags & MSG_ERRQUEUE)
>>> +	if (unlikely(flags & MSG_OOB))
>>>  		return -EOPNOTSUPP;
>>>  
>>> +	if (unlikely(flags & MSG_ERRQUEUE))
>>> +		return sock_recv_errqueue(sk, msg, len, SOL_VSOCK, 0);
>>> +
>>
>> Sorry, but I get a build error here, because SOL_VSOCK is undefined. I think it should be added to
>> include/linux/socket.h and also to the uapi files for future use in userspace.
>>
> 
> Strange, I built each patch individually without issue. My base is
> netdev/main with your SOL_VSOCK patch applied. I will look today and see
> if I'm missing something.
> 
>> Also, Stefano Garzarella <sgarzare@redhat.com> suggested adding a define, something like VSOCK_RECVERR,
>> in the same way as IP_RECVERR, and using it as the last parameter of 'sock_recv_errqueue()'.
>>
> 
> Got it, thanks.
> 
>>>  	transport = vsk->transport;
>>>  
>>>  	/* Retrieve the head sk_buff from the socket's receive queue. */
>>>
>>
>> Thanks, Arseniy
> 
> Thanks,
> Bobby

^ permalink raw reply

* Re: [PATCH RFC net-next v5 07/14] virtio/vsock: add common datagram send path
From: Arseniy Krasnov @ 2023-07-27  7:57 UTC (permalink / raw)
  To: Bobby Eshleman
  Cc: Bobby Eshleman, Stefan Hajnoczi, Stefano Garzarella,
	Michael S. Tsirkin, Jason Wang, Xuan Zhuo, David S. Miller,
	Eric Dumazet, Jakub Kicinski, Paolo Abeni, K. Y. Srinivasan,
	Haiyang Zhang, Wei Liu, Dexuan Cui, Bryan Tan, Vishnu Dasa,
	VMware PV-Drivers Reviewers, Dan Carpenter, Simon Horman, kvm,
	virtualization, netdev, linux-kernel, linux-hyperv, bpf
In-Reply-To: <ZMFS+MlAPTso6wjQ@bullseye>



On 26.07.2023 20:08, Bobby Eshleman wrote:
> On Sat, Jul 22, 2023 at 11:16:05AM +0300, Arseniy Krasnov wrote:
>>
>>
>> On 19.07.2023 03:50, Bobby Eshleman wrote:
>>> This commit implements the common function
>>> virtio_transport_dgram_enqueue for enqueueing datagrams. It does not add
>>> usage in either vhost or virtio yet.
>>>
>>> Signed-off-by: Bobby Eshleman <bobby.eshleman@bytedance.com>
>>> ---
>>>  net/vmw_vsock/virtio_transport_common.c | 76 ++++++++++++++++++++++++++++++++-
>>>  1 file changed, 75 insertions(+), 1 deletion(-)
>>>
>>> diff --git a/net/vmw_vsock/virtio_transport_common.c b/net/vmw_vsock/virtio_transport_common.c
>>> index ffcbdd77feaa..3bfaff758433 100644
>>> --- a/net/vmw_vsock/virtio_transport_common.c
>>> +++ b/net/vmw_vsock/virtio_transport_common.c
>>> @@ -819,7 +819,81 @@ virtio_transport_dgram_enqueue(struct vsock_sock *vsk,
>>>  			       struct msghdr *msg,
>>>  			       size_t dgram_len)
>>>  {
>>> -	return -EOPNOTSUPP;
>>> +	/* Here we are only using the info struct to retain style uniformity
>>> +	 * and to ease future refactoring and merging.
>>> +	 */
>>> +	struct virtio_vsock_pkt_info info_stack = {
>>> +		.op = VIRTIO_VSOCK_OP_RW,
>>> +		.msg = msg,
>>> +		.vsk = vsk,
>>> +		.type = VIRTIO_VSOCK_TYPE_DGRAM,
>>> +	};
>>> +	const struct virtio_transport *t_ops;
>>> +	struct virtio_vsock_pkt_info *info;
>>> +	struct sock *sk = sk_vsock(vsk);
>>> +	struct virtio_vsock_hdr *hdr;
>>> +	u32 src_cid, src_port;
>>> +	struct sk_buff *skb;
>>> +	void *payload;
>>> +	int noblock;
>>> +	int err;
>>> +
>>> +	info = &info_stack;
>>
>> I think 'info' assignment could be moved below, to the place where it is used
>> first time.
>>
>>> +
>>> +	if (dgram_len > VIRTIO_VSOCK_MAX_PKT_BUF_SIZE)
>>> +		return -EMSGSIZE;
>>> +
>>> +	t_ops = virtio_transport_get_ops(vsk);
>>> +	if (unlikely(!t_ops))
>>> +		return -EFAULT;
>>> +
>>> +	/* Unlike some of our other sending functions, this function is not
>>> +	 * intended for use without a msghdr.
>>> +	 */
>>> +	if (WARN_ONCE(!msg, "vsock dgram bug: no msghdr found for dgram enqueue\n"))
>>> +		return -EFAULT;
>>
>> Sorry, but is that possible? I thought 'msg' is always provided by general socket layer (e.g. before
>> af_vsock.c code) and can't be NULL for DGRAM. Please correct me if i'm wrong.
>>
>> Also I see, that in af_vsock.c , 'vsock_dgram_sendmsg()' dereferences 'msg' for checking MSG_OOB without any
>> checks (before calling transport callback - this function in case of virtio). So I think if we want to keep
>> this type of check - such check must be placed in af_vsock.c or somewhere before first dereference of this pointer.
>>
> 
> There is some talk about dgram sockets adding additional message types
> in the future that help with congestion control. Those messages won't
> come from the socket layer, so msghdr will be null. Since there is no
> other function for sending datagrams, it seemed likely that this
> function would be reworked for that purpose. I felt that adding this
> check was a direct way to make it explicit that this function is
> currently designed only for the socket-layer caller.
> 
> Perhaps a comment would suffice?

I see, thanks, it is for future usage. Sorry for the dumb question, but if msg is NULL, how
will we decide what to do in this call? Will the interface of this callback be updated, or
will some fields of 'vsock_sock' contain the type of such messages?

Thanks, Arseniy

> 
>>> +
>>> +	noblock = msg->msg_flags & MSG_DONTWAIT;
>>> +
>>> +	/* Use sock_alloc_send_skb to throttle by sk_sndbuf. This helps avoid
>>> +	 * triggering the OOM.
>>> +	 */
>>> +	skb = sock_alloc_send_skb(sk, dgram_len + VIRTIO_VSOCK_SKB_HEADROOM,
>>> +				  noblock, &err);
>>> +	if (!skb)
>>> +		return err;
>>> +
>>> +	skb_reserve(skb, VIRTIO_VSOCK_SKB_HEADROOM);
>>> +
>>> +	src_cid = t_ops->transport.get_local_cid();
>>> +	src_port = vsk->local_addr.svm_port;
>>> +
>>> +	hdr = virtio_vsock_hdr(skb);
>>> +	hdr->type	= cpu_to_le16(info->type);
>>> +	hdr->op		= cpu_to_le16(info->op);
>>> +	hdr->src_cid	= cpu_to_le64(src_cid);
>>> +	hdr->dst_cid	= cpu_to_le64(remote_addr->svm_cid);
>>> +	hdr->src_port	= cpu_to_le32(src_port);
>>> +	hdr->dst_port	= cpu_to_le32(remote_addr->svm_port);
>>> +	hdr->flags	= cpu_to_le32(info->flags);
>>> +	hdr->len	= cpu_to_le32(dgram_len);
>>> +
>>> +	skb_set_owner_w(skb, sk);
>>> +
>>> +	payload = skb_put(skb, dgram_len);
>>> +	err = memcpy_from_msg(payload, msg, dgram_len);
>>> +	if (err)
>>> +		return err;
>>
>> Do we need to free the allocated skb here?
>>
> 
> Yep, thanks.
> 
>>> +
>>> +	trace_virtio_transport_alloc_pkt(src_cid, src_port,
>>> +					 remote_addr->svm_cid,
>>> +					 remote_addr->svm_port,
>>> +					 dgram_len,
>>> +					 info->type,
>>> +					 info->op,
>>> +					 0);
>>> +
>>> +	return t_ops->send_pkt(skb);
>>>  }
>>>  EXPORT_SYMBOL_GPL(virtio_transport_dgram_enqueue);
>>>  
>>>
>>
>> Thanks, Arseniy
> 
> Thanks for the review!
> 
> Best,
> Bobby

^ permalink raw reply

* Re: [PATCH RFC net-next v5 00/14] virtio/vsock: support datagrams
From: Michael S. Tsirkin @ 2023-07-27  7:51 UTC (permalink / raw)
  To: Bobby Eshleman
  Cc: Stefan Hajnoczi, Stefano Garzarella, Jason Wang, Xuan Zhuo,
	David S. Miller, Eric Dumazet, Jakub Kicinski, Paolo Abeni,
	K. Y. Srinivasan, Haiyang Zhang, Wei Liu, Dexuan Cui, Bryan Tan,
	Vishnu Dasa, VMware PV-Drivers Reviewers, Dan Carpenter,
	Simon Horman, Krasnov Arseniy, kvm, virtualization, netdev,
	linux-kernel, linux-hyperv, bpf, Jiang Wang
In-Reply-To: <20230413-b4-vsock-dgram-v5-0-581bd37fdb26@bytedance.com>

On Wed, Jul 19, 2023 at 12:50:04AM +0000, Bobby Eshleman wrote:
> Hey all!
> 
> This series introduces support for datagrams to virtio/vsock.
> 
> It is a spin-off (and smaller version) of this series from the summer:
>   https://lore.kernel.org/all/cover.1660362668.git.bobby.eshleman@bytedance.com/
> 
> Please note that this is an RFC and should not be merged until
> associated changes are made to the virtio specification, which will
> follow after discussion from this series.
> 
> Another aside, the v4 of the series has only been mildly tested with a
> run of tools/testing/vsock/vsock_test. Some code likely needs cleaning
> up, but I'm hoping to get some of the design choices agreed upon before
> spending too much time making it pretty.
> 
> This series first supports datagrams in a basic form for virtio, and
> then optimizes the sendpath for all datagram transports.
> 
> The result is a very fast datagram communication protocol that
> outperforms even UDP on multi-queue virtio-net w/ vhost on a variety
> of multi-threaded workload samples.
> 
> For those that are curious, some summary data comparing UDP and VSOCK
> DGRAM (N=5):
> 
> 	vCPUS: 16
> 	virtio-net queues: 16
> 	payload size: 4KB
> 	Setup: bare metal + vm (non-nested)
> 
> 	UDP: 287.59 MB/s
> 	VSOCK DGRAM: 509.2 MB/s
> 
> Some notes about the implementation...
> 
> This datagram implementation forces datagrams to self-throttle according
> to the threshold set by sk_sndbuf. It behaves similar to the credits
> used by streams in its effect on throughput and memory consumption, but
> it is not influenced by the receiving socket as credits are.
> 
> The device drops packets silently.
> 
> As discussed previously, this series introduces datagrams and defers
> fairness to future work. See discussion in v2 for more context around
> datagrams, fairness, and this implementation.

it's a big thread - can't you summarize here?


> Signed-off-by: Bobby Eshleman <bobby.eshleman@bytedance.com>


could you give a bit more motivation? which applications do
you have in mind? for example, on localhost loopback datagrams
are actually reliable and a bunch of apps came to depend
on that even if they shouldn't.



> ---
> Changes in v5:
> - teach vhost to drop dgram when a datagram exceeds the receive buffer
>   - now uses MSG_ERRQUEUE and depends on Arseniy's zerocopy patch:
> 	"vsock: read from socket's error queue"
> - replace multiple ->dgram_* callbacks with single ->dgram_addr_init()
>   callback
> - refactor virtio dgram skb allocator to reduce conflicts w/ zerocopy series
> - add _fallback/_FALLBACK suffix to dgram transport variables/macros
> - add WARN_ONCE() for table_size / VSOCK_HASH issue
> - add static to vsock_find_bound_socket_common
> - dedupe code in vsock_dgram_sendmsg() using module_got var
> - drop concurrent sendmsg() for dgram and defer to future series
> - Add more tests
>   - test EHOSTUNREACH in errqueue
>   - test stream + dgram address collision
> - improve clarity of dgram msg bounds test code
> - Link to v4: https://lore.kernel.org/r/20230413-b4-vsock-dgram-v4-0-0cebbb2ae899@bytedance.com
> 
> Changes in v4:
> - style changes
>   - vsock: use sk_vsock(vsk) in vsock_dgram_recvmsg instead of
>     &sk->vsk
>   - vsock: fix xmas tree declaration
>   - vsock: fix spacing issues
>   - virtio/vsock: virtio_transport_recv_dgram returns void because err
>     unused
> - sparse analysis warnings/errors
>   - virtio/vsock: fix uninitialized skerr on destroy
>   - virtio/vsock: fix uninitialized err var on goto out
>   - vsock: fix declarations that need static
>   - vsock: fix __rcu annotation order
> - bugs
>   - vsock: fix null ptr in remote_info code
>   - vsock/dgram: make transport_dgram a fallback instead of first
>     priority
>   - vsock: remove redundant rcu read lock acquire in getname()
> - tests
>   - add more tests (message bounds and more)
>   - add vsock_dgram_bind() helper
>   - add vsock_dgram_connect() helper
> 
> Changes in v3:
> - Support multi-transport dgram, changing logic in connect/bind
>   to support VMCI case
> - Support per-pkt transport lookup for sendto() case
> - Fix dgram_allow() implementation
> - Fix dgram feature bit number (now it is 3)
> - Fix binding so dgram and connectible (cid,port) spaces are
>   non-overlapping
> - RCU protect transport ptr so connect() calls never leave
>   a lockless read of the transport and remote_addr are always
>   in sync
> - Link to v2: https://lore.kernel.org/r/20230413-b4-vsock-dgram-v2-0-079cc7cee62e@bytedance.com
> 
> ---
> Bobby Eshleman (13):
>       af_vsock: generalize vsock_dgram_recvmsg() to all transports
>       af_vsock: refactor transport lookup code
>       af_vsock: support multi-transport datagrams
>       af_vsock: generalize bind table functions
>       af_vsock: use a separate dgram bind table
>       virtio/vsock: add VIRTIO_VSOCK_TYPE_DGRAM
>       virtio/vsock: add common datagram send path
>       af_vsock: add vsock_find_bound_dgram_socket()
>       virtio/vsock: add common datagram recv path
>       virtio/vsock: add VIRTIO_VSOCK_F_DGRAM feature bit
>       vhost/vsock: implement datagram support
>       vsock/loopback: implement datagram support
>       virtio/vsock: implement datagram support
> 
> Jiang Wang (1):
>       test/vsock: add vsock dgram tests
> 
>  drivers/vhost/vsock.c                   |  64 ++-
>  include/linux/virtio_vsock.h            |  10 +-
>  include/net/af_vsock.h                  |  14 +-
>  include/uapi/linux/virtio_vsock.h       |   2 +
>  net/vmw_vsock/af_vsock.c                | 281 ++++++++++---
>  net/vmw_vsock/hyperv_transport.c        |  13 -
>  net/vmw_vsock/virtio_transport.c        |  26 +-
>  net/vmw_vsock/virtio_transport_common.c | 190 +++++++--
>  net/vmw_vsock/vmci_transport.c          |  60 +--
>  net/vmw_vsock/vsock_loopback.c          |  10 +-
>  tools/testing/vsock/util.c              | 141 ++++++-
>  tools/testing/vsock/util.h              |   6 +
>  tools/testing/vsock/vsock_test.c        | 680 ++++++++++++++++++++++++++++++++
>  13 files changed, 1320 insertions(+), 177 deletions(-)
> ---
> base-commit: 37cadc266ebdc7e3531111c2b3304fa01b2131e8
> change-id: 20230413-b4-vsock-dgram-3b6eba6a64e5
> 
> Best regards,
> -- 
> Bobby Eshleman <bobby.eshleman@bytedance.com>


^ permalink raw reply

* Re: [PATCH RFC net-next v5 01/14] af_vsock: generalize vsock_dgram_recvmsg() to all transports
From: Arseniy Krasnov @ 2023-07-27  7:53 UTC (permalink / raw)
  To: Bobby Eshleman
  Cc: Bobby Eshleman, Stefan Hajnoczi, Stefano Garzarella,
	Michael S. Tsirkin, Jason Wang, Xuan Zhuo, David S. Miller,
	Eric Dumazet, Jakub Kicinski, Paolo Abeni, K. Y. Srinivasan,
	Haiyang Zhang, Wei Liu, Dexuan Cui, Bryan Tan, Vishnu Dasa,
	VMware PV-Drivers Reviewers, Dan Carpenter, Simon Horman, kvm,
	virtualization, netdev, linux-kernel, linux-hyperv, bpf
In-Reply-To: <ZMFkFE0AqaOUfric@bullseye>



On 26.07.2023 21:21, Bobby Eshleman wrote:
> On Mon, Jul 24, 2023 at 09:11:44PM +0300, Arseniy Krasnov wrote:
>>
>>
>> On 19.07.2023 03:50, Bobby Eshleman wrote:
>>> This commit drops the transport->dgram_dequeue callback and makes
>>> vsock_dgram_recvmsg() generic to all transports.
>>>
>>> To make this possible, two transport-level changes are introduced:
>>> - implementation of the ->dgram_addr_init() callback to initialize
>>>   the sockaddr_vm structure with data from incoming socket buffers.
>>> - transport implementations set the skb->data pointer to the beginning
>>>   of the payload prior to adding the skb to the socket's receive queue.
>>>   That is, they must use skb_pull() before enqueuing. This is an
>>>   agreement between the transport and the socket layer that skb->data
>>>   always points to the beginning of the payload (and not, for example,
>>>   the packet header).
>>>
>>> Signed-off-by: Bobby Eshleman <bobby.eshleman@bytedance.com>
>>> ---
>>>  drivers/vhost/vsock.c                   |  1 -
>>>  include/linux/virtio_vsock.h            |  5 ---
>>>  include/net/af_vsock.h                  |  3 +-
>>>  net/vmw_vsock/af_vsock.c                | 40 ++++++++++++++++++++++-
>>>  net/vmw_vsock/hyperv_transport.c        |  7 ----
>>>  net/vmw_vsock/virtio_transport.c        |  1 -
>>>  net/vmw_vsock/virtio_transport_common.c |  9 -----
>>>  net/vmw_vsock/vmci_transport.c          | 58 ++++++---------------------------
>>>  net/vmw_vsock/vsock_loopback.c          |  1 -
>>>  9 files changed, 50 insertions(+), 75 deletions(-)
>>>
>>> diff --git a/drivers/vhost/vsock.c b/drivers/vhost/vsock.c
>>> index 6578db78f0ae..ae8891598a48 100644
>>> --- a/drivers/vhost/vsock.c
>>> +++ b/drivers/vhost/vsock.c
>>> @@ -410,7 +410,6 @@ static struct virtio_transport vhost_transport = {
>>>  		.cancel_pkt               = vhost_transport_cancel_pkt,
>>>  
>>>  		.dgram_enqueue            = virtio_transport_dgram_enqueue,
>>> -		.dgram_dequeue            = virtio_transport_dgram_dequeue,
>>>  		.dgram_bind               = virtio_transport_dgram_bind,
>>>  		.dgram_allow              = virtio_transport_dgram_allow,
>>>  
>>> diff --git a/include/linux/virtio_vsock.h b/include/linux/virtio_vsock.h
>>> index c58453699ee9..18cbe8d37fca 100644
>>> --- a/include/linux/virtio_vsock.h
>>> +++ b/include/linux/virtio_vsock.h
>>> @@ -167,11 +167,6 @@ virtio_transport_stream_dequeue(struct vsock_sock *vsk,
>>>  				size_t len,
>>>  				int type);
>>>  int
>>> -virtio_transport_dgram_dequeue(struct vsock_sock *vsk,
>>> -			       struct msghdr *msg,
>>> -			       size_t len, int flags);
>>> -
>>> -int
>>>  virtio_transport_seqpacket_enqueue(struct vsock_sock *vsk,
>>>  				   struct msghdr *msg,
>>>  				   size_t len);
>>> diff --git a/include/net/af_vsock.h b/include/net/af_vsock.h
>>> index 0e7504a42925..305d57502e89 100644
>>> --- a/include/net/af_vsock.h
>>> +++ b/include/net/af_vsock.h
>>> @@ -120,11 +120,10 @@ struct vsock_transport {
>>>  
>>>  	/* DGRAM. */
>>>  	int (*dgram_bind)(struct vsock_sock *, struct sockaddr_vm *);
>>> -	int (*dgram_dequeue)(struct vsock_sock *vsk, struct msghdr *msg,
>>> -			     size_t len, int flags);
>>>  	int (*dgram_enqueue)(struct vsock_sock *, struct sockaddr_vm *,
>>>  			     struct msghdr *, size_t len);
>>>  	bool (*dgram_allow)(u32 cid, u32 port);
>>> +	void (*dgram_addr_init)(struct sk_buff *skb, struct sockaddr_vm *addr);
>>>  
>>>  	/* STREAM. */
>>>  	/* TODO: stream_bind() */
>>> diff --git a/net/vmw_vsock/af_vsock.c b/net/vmw_vsock/af_vsock.c
>>> index deb72a8c44a7..ad71e084bf2f 100644
>>> --- a/net/vmw_vsock/af_vsock.c
>>> +++ b/net/vmw_vsock/af_vsock.c
>>> @@ -1272,11 +1272,15 @@ static int vsock_dgram_connect(struct socket *sock,
>>>  int vsock_dgram_recvmsg(struct socket *sock, struct msghdr *msg,
>>>  			size_t len, int flags)
>>>  {
>>> +	const struct vsock_transport *transport;
>>>  #ifdef CONFIG_BPF_SYSCALL
>>>  	const struct proto *prot;
>>>  #endif
>>>  	struct vsock_sock *vsk;
>>> +	struct sk_buff *skb;
>>> +	size_t payload_len;
>>>  	struct sock *sk;
>>> +	int err;
>>>  
>>>  	sk = sock->sk;
>>>  	vsk = vsock_sk(sk);
>>> @@ -1287,7 +1291,41 @@ int vsock_dgram_recvmsg(struct socket *sock, struct msghdr *msg,
>>>  		return prot->recvmsg(sk, msg, len, flags, NULL);
>>>  #endif
>>>  
>>> -	return vsk->transport->dgram_dequeue(vsk, msg, len, flags);
>>> +	if (flags & MSG_OOB || flags & MSG_ERRQUEUE)
>>> +		return -EOPNOTSUPP;
>>> +
>>> +	transport = vsk->transport;
>>> +
>>> +	/* Retrieve the head sk_buff from the socket's receive queue. */
>>> +	err = 0;
>>> +	skb = skb_recv_datagram(sk_vsock(vsk), flags, &err);
>>> +	if (!skb)
>>> +		return err;
>>> +
>>> +	payload_len = skb->len;
>>> +
>>> +	if (payload_len > len) {
>>> +		payload_len = len;
>>> +		msg->msg_flags |= MSG_TRUNC;
>>> +	}
>>> +
>>> +	/* Place the datagram payload in the user's iovec. */
>>> +	err = skb_copy_datagram_msg(skb, 0, msg, payload_len);
>>> +	if (err)
>>> +		goto out;
>>> +
>>> +	if (msg->msg_name) {
>>> +		/* Provide the address of the sender. */
>>> +		DECLARE_SOCKADDR(struct sockaddr_vm *, vm_addr, msg->msg_name);
>>> +
>>> +		transport->dgram_addr_init(skb, vm_addr);
>>
>> Do we need to check that dgram_addr_init != NULL? I see that not all transports have this
>> callback set in this patch
>>
> 
> How about adding the check somewhere outside of the hotpath, such as
> when the transport is assigned?

Yes, maybe we can return ESOCKTNOSUPPORT if this callback is not provided by the transport (as we dereference
it here without any checks).

Thanks, Arseniy

> 
>>> +		msg->msg_namelen = sizeof(*vm_addr);
>>> +	}
>>> +	err = payload_len;
>>> +
>>> +out:
>>> +	skb_free_datagram(&vsk->sk, skb);
>>> +	return err;
>>>  }
>>>  EXPORT_SYMBOL_GPL(vsock_dgram_recvmsg);
>>>  
>>> diff --git a/net/vmw_vsock/hyperv_transport.c b/net/vmw_vsock/hyperv_transport.c
>>> index 7cb1a9d2cdb4..7f1ea434656d 100644
>>> --- a/net/vmw_vsock/hyperv_transport.c
>>> +++ b/net/vmw_vsock/hyperv_transport.c
>>> @@ -556,12 +556,6 @@ static int hvs_dgram_bind(struct vsock_sock *vsk, struct sockaddr_vm *addr)
>>>  	return -EOPNOTSUPP;
>>>  }
>>>  
>>> -static int hvs_dgram_dequeue(struct vsock_sock *vsk, struct msghdr *msg,
>>> -			     size_t len, int flags)
>>> -{
>>> -	return -EOPNOTSUPP;
>>> -}
>>> -
>>>  static int hvs_dgram_enqueue(struct vsock_sock *vsk,
>>>  			     struct sockaddr_vm *remote, struct msghdr *msg,
>>>  			     size_t dgram_len)
>>> @@ -833,7 +827,6 @@ static struct vsock_transport hvs_transport = {
>>>  	.shutdown                 = hvs_shutdown,
>>>  
>>>  	.dgram_bind               = hvs_dgram_bind,
>>> -	.dgram_dequeue            = hvs_dgram_dequeue,
>>>  	.dgram_enqueue            = hvs_dgram_enqueue,
>>>  	.dgram_allow              = hvs_dgram_allow,
>>>  
>>> diff --git a/net/vmw_vsock/virtio_transport.c b/net/vmw_vsock/virtio_transport.c
>>> index e95df847176b..66edffdbf303 100644
>>> --- a/net/vmw_vsock/virtio_transport.c
>>> +++ b/net/vmw_vsock/virtio_transport.c
>>> @@ -429,7 +429,6 @@ static struct virtio_transport virtio_transport = {
>>>  		.cancel_pkt               = virtio_transport_cancel_pkt,
>>>  
>>>  		.dgram_bind               = virtio_transport_dgram_bind,
>>> -		.dgram_dequeue            = virtio_transport_dgram_dequeue,
>>>  		.dgram_enqueue            = virtio_transport_dgram_enqueue,
>>>  		.dgram_allow              = virtio_transport_dgram_allow,
>>>  
>>> diff --git a/net/vmw_vsock/virtio_transport_common.c b/net/vmw_vsock/virtio_transport_common.c
>>> index b769fc258931..01ea1402ad40 100644
>>> --- a/net/vmw_vsock/virtio_transport_common.c
>>> +++ b/net/vmw_vsock/virtio_transport_common.c
>>> @@ -583,15 +583,6 @@ virtio_transport_seqpacket_enqueue(struct vsock_sock *vsk,
>>>  }
>>>  EXPORT_SYMBOL_GPL(virtio_transport_seqpacket_enqueue);
>>>  
>>> -int
>>> -virtio_transport_dgram_dequeue(struct vsock_sock *vsk,
>>> -			       struct msghdr *msg,
>>> -			       size_t len, int flags)
>>> -{
>>> -	return -EOPNOTSUPP;
>>> -}
>>> -EXPORT_SYMBOL_GPL(virtio_transport_dgram_dequeue);
>>> -
>>>  s64 virtio_transport_stream_has_data(struct vsock_sock *vsk)
>>>  {
>>>  	struct virtio_vsock_sock *vvs = vsk->trans;
>>> diff --git a/net/vmw_vsock/vmci_transport.c b/net/vmw_vsock/vmci_transport.c
>>> index b370070194fa..0bbbdb222245 100644
>>> --- a/net/vmw_vsock/vmci_transport.c
>>> +++ b/net/vmw_vsock/vmci_transport.c
>>> @@ -641,6 +641,7 @@ static int vmci_transport_recv_dgram_cb(void *data, struct vmci_datagram *dg)
>>>  	sock_hold(sk);
>>>  	skb_put(skb, size);
>>>  	memcpy(skb->data, dg, size);
>>> +	skb_pull(skb, VMCI_DG_HEADERSIZE);
>>>  	sk_receive_skb(sk, skb, 0);
>>>  
>>>  	return VMCI_SUCCESS;
>>> @@ -1731,57 +1732,18 @@ static int vmci_transport_dgram_enqueue(
>>>  	return err - sizeof(*dg);
>>>  }
>>>  
>>> -static int vmci_transport_dgram_dequeue(struct vsock_sock *vsk,
>>> -					struct msghdr *msg, size_t len,
>>> -					int flags)
>>> +static void vmci_transport_dgram_addr_init(struct sk_buff *skb,
>>> +					   struct sockaddr_vm *addr)
>>>  {
>>> -	int err;
>>>  	struct vmci_datagram *dg;
>>> -	size_t payload_len;
>>> -	struct sk_buff *skb;
>>> -
>>> -	if (flags & MSG_OOB || flags & MSG_ERRQUEUE)
>>> -		return -EOPNOTSUPP;
>>> -
>>> -	/* Retrieve the head sk_buff from the socket's receive queue. */
>>> -	err = 0;
>>> -	skb = skb_recv_datagram(&vsk->sk, flags, &err);
>>> -	if (!skb)
>>> -		return err;
>>> -
>>> -	dg = (struct vmci_datagram *)skb->data;
>>> -	if (!dg)
>>> -		/* err is 0, meaning we read zero bytes. */
>>> -		goto out;
>>> -
>>> -	payload_len = dg->payload_size;
>>> -	/* Ensure the sk_buff matches the payload size claimed in the packet. */
>>> -	if (payload_len != skb->len - sizeof(*dg)) {
>>> -		err = -EINVAL;
>>> -		goto out;
>>> -	}
>>> -
>>> -	if (payload_len > len) {
>>> -		payload_len = len;
>>> -		msg->msg_flags |= MSG_TRUNC;
>>> -	}
>>> +	unsigned int cid, port;
>>>  
>>> -	/* Place the datagram payload in the user's iovec. */
>>> -	err = skb_copy_datagram_msg(skb, sizeof(*dg), msg, payload_len);
>>> -	if (err)
>>> -		goto out;
>>> -
>>> -	if (msg->msg_name) {
>>> -		/* Provide the address of the sender. */
>>> -		DECLARE_SOCKADDR(struct sockaddr_vm *, vm_addr, msg->msg_name);
>>> -		vsock_addr_init(vm_addr, dg->src.context, dg->src.resource);
>>> -		msg->msg_namelen = sizeof(*vm_addr);
>>> -	}
>>> -	err = payload_len;
>>> +	WARN_ONCE(skb->head == skb->data, "vmci vsock bug: bad dgram skb");
>>>  
>>> -out:
>>> -	skb_free_datagram(&vsk->sk, skb);
>>> -	return err;
>>> +	dg = (struct vmci_datagram *)skb->head;
>>> +	cid = dg->src.context;
>>> +	port = dg->src.resource;
>>> +	vsock_addr_init(addr, cid, port);
>>
>> I think we
>>
>> 1) can shorten this to:
>>
>> vsock_addr_init(addr, dg->src.context, dg->src.resource);
>>
>> 2) w/o previous point, cid and port better be u32, as VMCI structure has u32 fields 'context' and
>>    'resource' and 'vsock_addr_init()' also has u32 type for both arguments.
>>
>> Thanks, Arseniy
> 
> Sounds good, thanks.
> 
>>
>>>  }
>>>  
>>>  static bool vmci_transport_dgram_allow(u32 cid, u32 port)
>>> @@ -2040,9 +2002,9 @@ static struct vsock_transport vmci_transport = {
>>>  	.release = vmci_transport_release,
>>>  	.connect = vmci_transport_connect,
>>>  	.dgram_bind = vmci_transport_dgram_bind,
>>> -	.dgram_dequeue = vmci_transport_dgram_dequeue,
>>>  	.dgram_enqueue = vmci_transport_dgram_enqueue,
>>>  	.dgram_allow = vmci_transport_dgram_allow,
>>> +	.dgram_addr_init = vmci_transport_dgram_addr_init,
>>>  	.stream_dequeue = vmci_transport_stream_dequeue,
>>>  	.stream_enqueue = vmci_transport_stream_enqueue,
>>>  	.stream_has_data = vmci_transport_stream_has_data,
>>> diff --git a/net/vmw_vsock/vsock_loopback.c b/net/vmw_vsock/vsock_loopback.c
>>> index 5c6360df1f31..2a59dd177c74 100644
>>> --- a/net/vmw_vsock/vsock_loopback.c
>>> +++ b/net/vmw_vsock/vsock_loopback.c
>>> @@ -62,7 +62,6 @@ static struct virtio_transport loopback_transport = {
>>>  		.cancel_pkt               = vsock_loopback_cancel_pkt,
>>>  
>>>  		.dgram_bind               = virtio_transport_dgram_bind,
>>> -		.dgram_dequeue            = virtio_transport_dgram_dequeue,
>>>  		.dgram_enqueue            = virtio_transport_dgram_enqueue,
>>>  		.dgram_allow              = virtio_transport_dgram_allow,
>>>  
>>>
> 
> Thanks,
> Bobby

^ permalink raw reply

* Re: [PATCH RFC net-next v5 10/14] virtio/vsock: add VIRTIO_VSOCK_F_DGRAM feature bit
From: Stefano Garzarella @ 2023-07-27  7:48 UTC (permalink / raw)
  To: Michael S. Tsirkin, Bobby Eshleman
  Cc: Stefan Hajnoczi, Jason Wang, Xuan Zhuo, David S. Miller,
	Eric Dumazet, Jakub Kicinski, Paolo Abeni, K. Y. Srinivasan,
	Haiyang Zhang, Wei Liu, Dexuan Cui, Bryan Tan, Vishnu Dasa,
	VMware PV-Drivers Reviewers, Dan Carpenter, Simon Horman,
	Krasnov Arseniy, kvm, virtualization, netdev, linux-kernel,
	linux-hyperv, bpf, Jiang Wang
In-Reply-To: <20230726143736-mutt-send-email-mst@kernel.org>

On Wed, Jul 26, 2023 at 02:38:08PM -0400, Michael S. Tsirkin wrote:
>On Wed, Jul 19, 2023 at 12:50:14AM +0000, Bobby Eshleman wrote:
>> This commit adds a feature bit for virtio vsock to support datagrams.
>>
>> Signed-off-by: Jiang Wang <jiang.wang@bytedance.com>
>> Signed-off-by: Bobby Eshleman <bobby.eshleman@bytedance.com>
>> ---
>>  include/uapi/linux/virtio_vsock.h | 1 +
>>  1 file changed, 1 insertion(+)
>>
>> diff --git a/include/uapi/linux/virtio_vsock.h b/include/uapi/linux/virtio_vsock.h
>> index 331be28b1d30..27b4b2b8bf13 100644
>> --- a/include/uapi/linux/virtio_vsock.h
>> +++ b/include/uapi/linux/virtio_vsock.h
>> @@ -40,6 +40,7 @@
>>
>>  /* The feature bitmap for virtio vsock */
>>  #define VIRTIO_VSOCK_F_SEQPACKET	1	/* SOCK_SEQPACKET supported */
>> +#define VIRTIO_VSOCK_F_DGRAM		3	/* SOCK_DGRAM supported */
>>
>>  struct virtio_vsock_config {
>>  	__le64 guest_cid;
>
>pls do not add interface without first getting it accepted in the
>virtio spec.

Yep, fortunately this series is still RFC.
I think by now we've seen that the implementation is doable, so we
should discuss the changes to the specification ASAP. Then we can
merge the series.

@Bobby can you start the discussion about spec changes?

Thanks,
Stefano


^ permalink raw reply

* Re: [PATCH V6 net] net: mana: Fix MANA VF unload when hardware is
From: Souradeep Chakrabarti @ 2023-07-27  5:13 UTC (permalink / raw)
  To: Zhu Yanjun
  Cc: Souradeep Chakrabarti, kys, haiyangz, wei.liu, decui, davem,
	edumazet, kuba, pabeni, longli, sharmaajay, leon, cai.huoqing,
	ssengar, vkuznets, tglx, linux-hyperv, netdev, linux-kernel,
	linux-rdma, schakrabarti, stable
In-Reply-To: <519602aa-0a6a-70a5-23c7-ce190045e4af@linux.dev>

On Thu, Jul 27, 2023 at 9:07 AM Zhu Yanjun <yanjun.zhu@linux.dev> wrote:
>
> 在 2023/7/26 21:15, Souradeep Chakrabarti 写道:
> > When unloading the MANA driver, mana_dealloc_queues() waits for the MANA
> > hardware to complete any inflight packets and set the pending send count
> > to zero. But if the hardware has failed, mana_dealloc_queues()
> > could wait forever.
> >
> > Fix this by adding a timeout to the wait. Set the timeout to 120 seconds,
> > which is a somewhat arbitrary value that is more than long enough for
> > functional hardware to complete any sends.
> >
> > Cc: stable@vger.kernel.org
> > Fixes: ca9c54d2d6a5 ("net: mana: Add a driver for Microsoft Azure Network Adapter (MANA)")
> >
> > Signed-off-by: Souradeep Chakrabarti <schakrabarti@linux.microsoft.com>
> > ---
> > V5 -> V6:
> > * Added pcie_flr to reset the pci after timeout.
> > * Fixed the position of changelog.
> > * Removed unused variable like cq.
> >
> > V4 -> V5:
> > * Added fixes tag
> > * Changed the usleep_range from static to incremental value.
> > * Initialized timeout in the beginning.
> >
> > V3 -> V4:
> > * Removed the unnecessary braces from mana_dealloc_queues().
> >
> > V2 -> V3:
> > * Removed the unnecessary braces from mana_dealloc_queues().
> >
> > V1 -> V2:
> > * Added net branch
> > * Removed the typecasting to (struct mana_context*) of void pointer
> > * Repositioned timeout variable in mana_dealloc_queues()
> > * Repositioned vf_unload_timeout in mana_context struct, to utilise the
> >   6 bytes hole
> > ---
> >   drivers/net/ethernet/microsoft/mana/mana_en.c | 38 +++++++++++++++++--
> >   1 file changed, 34 insertions(+), 4 deletions(-)
> >
> > diff --git a/drivers/net/ethernet/microsoft/mana/mana_en.c b/drivers/net/ethernet/microsoft/mana/mana_en.c
> > index a499e460594b..ea039e2d4c4b 100644
> > --- a/drivers/net/ethernet/microsoft/mana/mana_en.c
> > +++ b/drivers/net/ethernet/microsoft/mana/mana_en.c
> > @@ -8,6 +8,7 @@
> >   #include <linux/ethtool.h>
> >   #include <linux/filter.h>
> >   #include <linux/mm.h>
> > +#include <linux/pci.h>
> >
> >   #include <net/checksum.h>
> >   #include <net/ip6_checksum.h>
> > @@ -2345,9 +2346,12 @@ int mana_attach(struct net_device *ndev)
> >   static int mana_dealloc_queues(struct net_device *ndev)
> >   {
> >       struct mana_port_context *apc = netdev_priv(ndev);
> > +     unsigned long timeout = jiffies + 120 * HZ;
> >       struct gdma_dev *gd = apc->ac->gdma_dev;
> >       struct mana_txq *txq;
> > +     struct sk_buff *skb;
> >       int i, err;
> > +     u32 tsleep;
> >
> >       if (apc->port_is_up)
> >               return -EINVAL;
> > @@ -2363,15 +2367,41 @@ static int mana_dealloc_queues(struct net_device *ndev)
> >        * to false, but it doesn't matter since mana_start_xmit() drops any
> >        * new packets due to apc->port_is_up being false.
> >        *
> > -      * Drain all the in-flight TX packets
> > +      * Drain all the in-flight TX packets.
> > +      * A timeout of 120 seconds for all the queues is used.
> > +      * This will break the while loop when h/w is not responding.
> > +      * This value of 120 has been decided here considering max
> > +      * number of queues.
> >        */
> > +
> >       for (i = 0; i < apc->num_queues; i++) {
> >               txq = &apc->tx_qp[i].txq;
> > -
> > -             while (atomic_read(&txq->pending_sends) > 0)
> > -                     usleep_range(1000, 2000);
> > +             tsleep = 1000;
> > +             while (atomic_read(&txq->pending_sends) > 0 &&
> > +                    time_before(jiffies, timeout)) {
> > +                     usleep_range(tsleep, tsleep + 1000);
> > +                     tsleep <<= 1;
> > +             }
> > +             if (atomic_read(&txq->pending_sends)) {
> > +                     err  = pcie_flr(to_pci_dev(gd->gdma_context->dev));
> > +                     if (err) {
> > +                             netdev_err(ndev, "flr failed %d with %d pkts pending in txq %u\n",
> > +                                        err, atomic_read(&txq->pending_sends),
> > +                                        txq->gdma_txq_id);
> > +                     }
> > +                     break;
> > +             }
> >       }
> >
> > +     for (i = 0; i < apc->num_queues; i++) {
> > +             txq = &apc->tx_qp[i].txq;
> > +             while (atomic_read(&txq->pending_sends)) {
> > +                     skb = skb_dequeue(&txq->pending_skbs);
> > +                     mana_unmap_skb(skb, apc);
> > +                     dev_consume_skb_any(skb);
> > +                     atomic_sub(1, &txq->pending_sends);
> > +             }
> If I get this commit correctly, txq->pending_sends should be equal to
> the length of txq->pending_skbs?
>
> If yes, can we only handle the pending_skbs?
>
> the above snippet can be changed to as below? So the performance is better?
> "
>                 while ((skb = skb_dequeue(&txq->pending_skbs))) {
>                         mana_unmap_skb(skb, apc);
>                         dev_consume_skb_any(skb);
>                 }
>                 atomic_set(&txq->pending_sends, 0);
> "
>
> Zhu Yanjun
Yes, we can do that, thanks for pointing that out. Will take care of it in the next version.
>
> > +     }
> >       /* We're 100% sure the queues can no longer be woken up, because
> >        * we're sure now mana_poll_tx_cq() can't be running.
> >        */
>

^ permalink raw reply

* Re: [PATCH V6 net] net: mana: Fix MANA VF unload when hardware is
From: Zhu Yanjun @ 2023-07-27  3:10 UTC (permalink / raw)
  To: Souradeep Chakrabarti, kys, haiyangz, wei.liu, decui, davem,
	edumazet, kuba, pabeni, longli, sharmaajay, leon, cai.huoqing,
	ssengar, vkuznets, tglx, linux-hyperv, netdev, linux-kernel,
	linux-rdma
  Cc: schakrabarti, stable
In-Reply-To: <1690377336-1353-1-git-send-email-schakrabarti@linux.microsoft.com>

在 2023/7/26 21:15, Souradeep Chakrabarti 写道:
> When unloading the MANA driver, mana_dealloc_queues() waits for the MANA
> hardware to complete any inflight packets and set the pending send count
> to zero. But if the hardware has failed, mana_dealloc_queues()
> could wait forever.
> 
> Fix this by adding a timeout to the wait. Set the timeout to 120 seconds,
> which is a somewhat arbitrary value that is more than long enough for
> functional hardware to complete any sends.
> 
> Cc: stable@vger.kernel.org
> Fixes: ca9c54d2d6a5 ("net: mana: Add a driver for Microsoft Azure Network Adapter (MANA)")
> 
> Signed-off-by: Souradeep Chakrabarti <schakrabarti@linux.microsoft.com>
> ---
> V5 -> V6:
> * Added pcie_flr to reset the pci after timeout.
> * Fixed the position of changelog.
> * Removed unused variable like cq.
> 
> V4 -> V5:
> * Added fixes tag
> * Changed the usleep_range from static to incremental value.
> * Initialized timeout in the beginning.
> 
> V3 -> V4:
> * Removed the unnecessary braces from mana_dealloc_queues().
> 
> V2 -> V3:
> * Removed the unnecessary braces from mana_dealloc_queues().
> 
> V1 -> V2:
> * Added net branch
> * Removed the typecasting to (struct mana_context*) of void pointer
> * Repositioned timeout variable in mana_dealloc_queues()
> * Repositioned vf_unload_timeout in mana_context struct, to utilise the
>   6 bytes hole
> ---
>   drivers/net/ethernet/microsoft/mana/mana_en.c | 38 +++++++++++++++++--
>   1 file changed, 34 insertions(+), 4 deletions(-)
> 
> diff --git a/drivers/net/ethernet/microsoft/mana/mana_en.c b/drivers/net/ethernet/microsoft/mana/mana_en.c
> index a499e460594b..ea039e2d4c4b 100644
> --- a/drivers/net/ethernet/microsoft/mana/mana_en.c
> +++ b/drivers/net/ethernet/microsoft/mana/mana_en.c
> @@ -8,6 +8,7 @@
>   #include <linux/ethtool.h>
>   #include <linux/filter.h>
>   #include <linux/mm.h>
> +#include <linux/pci.h>
>   
>   #include <net/checksum.h>
>   #include <net/ip6_checksum.h>
> @@ -2345,9 +2346,12 @@ int mana_attach(struct net_device *ndev)
>   static int mana_dealloc_queues(struct net_device *ndev)
>   {
>   	struct mana_port_context *apc = netdev_priv(ndev);
> +	unsigned long timeout = jiffies + 120 * HZ;
>   	struct gdma_dev *gd = apc->ac->gdma_dev;
>   	struct mana_txq *txq;
> +	struct sk_buff *skb;
>   	int i, err;
> +	u32 tsleep;
>   
>   	if (apc->port_is_up)
>   		return -EINVAL;
> @@ -2363,15 +2367,41 @@ static int mana_dealloc_queues(struct net_device *ndev)
>   	 * to false, but it doesn't matter since mana_start_xmit() drops any
>   	 * new packets due to apc->port_is_up being false.
>   	 *
> -	 * Drain all the in-flight TX packets
> +	 * Drain all the in-flight TX packets.
> +	 * A timeout of 120 seconds for all the queues is used.
> +	 * This will break the while loop when h/w is not responding.
> +	 * This value of 120 has been decided here considering max
> +	 * number of queues.
>   	 */
> +
>   	for (i = 0; i < apc->num_queues; i++) {
>   		txq = &apc->tx_qp[i].txq;
> -
> -		while (atomic_read(&txq->pending_sends) > 0)
> -			usleep_range(1000, 2000);
> +		tsleep = 1000;
> +		while (atomic_read(&txq->pending_sends) > 0 &&
> +		       time_before(jiffies, timeout)) {
> +			usleep_range(tsleep, tsleep + 1000);
> +			tsleep <<= 1;
> +		}
> +		if (atomic_read(&txq->pending_sends)) {
> +			err  = pcie_flr(to_pci_dev(gd->gdma_context->dev));
> +			if (err) {
> +				netdev_err(ndev, "flr failed %d with %d pkts pending in txq %u\n",
> +					   err, atomic_read(&txq->pending_sends),
> +					   txq->gdma_txq_id);
> +			}
> +			break;
> +		}
>   	}
>   
> +	for (i = 0; i < apc->num_queues; i++) {
> +		txq = &apc->tx_qp[i].txq;
> +		while (atomic_read(&txq->pending_sends)) {
> +			skb = skb_dequeue(&txq->pending_skbs);
> +			mana_unmap_skb(skb, apc);
> +			dev_consume_skb_any(skb);
> +			atomic_sub(1, &txq->pending_sends);
> +		}
If I get this commit correctly, txq->pending_sends should be equal to 
the length of txq->pending_skbs?

If yes, can we only handle the pending_skbs?

Could the above snippet be changed as below, so that the performance is better?
"
		while ((skb = skb_dequeue(&txq->pending_skbs))) {
			mana_unmap_skb(skb, apc);
			dev_consume_skb_any(skb);
		}
		atomic_set(&txq->pending_sends, 0);
"

Zhu Yanjun

> +	}
>   	/* We're 100% sure the queues can no longer be woken up, because
>   	 * we're sure now mana_poll_tx_cq() can't be running.
>   	 */


^ permalink raw reply

* RE: [PATCH v9 0/2] Support TDX guests on Hyper-V (the x86/tdx part)
From: Dexuan Cui @ 2023-07-27  0:29 UTC (permalink / raw)
  To: Wei Liu, dave.hansen@intel.com, dave.hansen@linux.intel.com,
	bp@alien8.de, kirill.shutemov@linux.intel.com
  Cc: ak@linux.intel.com, arnd@arndb.de, brijesh.singh@amd.com,
	dan.j.williams@intel.com, Haiyang Zhang, hpa@zytor.com,
	jane.chu@oracle.com, KY Srinivasan, linux-arch@vger.kernel.org,
	linux-hyperv@vger.kernel.org, luto@kernel.org, mingo@redhat.com,
	peterz@infradead.org, rostedt@goodmis.org,
	sathyanarayanan.kuppuswamy@linux.intel.com, seanjc@google.com,
	tglx@linutronix.de, tony.luck@intel.com, x86@kernel.org,
	Michael Kelley (LINUX), linux-kernel@vger.kernel.org, Tianyu Lan,
	rick.p.edgecombe@intel.com, Sebastian, Shiny
In-Reply-To: <SA1PR21MB133517719A03FCE05A9251C0BF30A@SA1PR21MB1335.namprd21.prod.outlook.com>

> From: Dexuan Cui
> Sent: Monday, July 10, 2023 10:21 AM
> To: Dexuan Cui <decui@microsoft.com>; Wei Liu <wei.liu@kernel.org>;
> dave.hansen@intel.com; dave.hansen@linux.intel.com; bp@alien8.de;
> kirill.shutemov@linux.intel.com
> Cc: ak@linux.intel.com; arnd@arndb.de; brijesh.singh@amd.com;
> dan.j.williams@intel.com; Haiyang Zhang <haiyangz@microsoft.com>;
> hpa@zytor.com; jane.chu@oracle.com; KY Srinivasan <kys@microsoft.com>;
> linux-arch@vger.kernel.org; linux-hyperv@vger.kernel.org; luto@kernel.org;
> mingo@redhat.com; peterz@infradead.org; rostedt@goodmis.org;
> sathyanarayanan.kuppuswamy@linux.intel.com; seanjc@google.com;
> tglx@linutronix.de; tony.luck@intel.com; x86@kernel.org; Michael Kelley
> (LINUX) <mikelley@microsoft.com>; linux-kernel@vger.kernel.org; Tianyu Lan
> <Tianyu.Lan@microsoft.com>; rick.p.edgecombe@intel.com
> Subject: RE: [PATCH v9 0/2] Support TDX guests on Hyper-V (the x86/tdx part)
> 
> > From: Dexuan Cui <decui@microsoft.com>
> > Sent: Wednesday, June 28, 2023 11:45 AM
> > To: Wei Liu <wei.liu@kernel.org>
> > ...
> > > From: Wei Liu <wei.liu@kernel.org>
> > > Sent: Wednesday, June 28, 2023 11:06 AM
> > > To: Dexuan Cui <decui@microsoft.com>
> > > Subject: Re: [PATCH v9 0/2] Support TDX guests on Hyper-V (the x86/tdx
> > > part)
> > >
> > > On Wed, Jun 21, 2023 at 12:13:15PM -0700, Dexuan Cui wrote:
> > > > The two patches are based on today's tip.git's master branch.
> > > >
> > > > Note: the two patches don't apply to the current x86/tdx branch, which
> > > > doesn't have commit 75d090fd167a ("x86/tdx: Add unaccepted
> memory
> > > support").
> > > >
> > > > As Dave suggested, I moved some local variables of tdx_map_gpa() to
> > > > inside the loop. I added Sathyanarayanan's Reviewed-by.
> > > >
> > > > Please review.
> > > > ...
> > > > Dexuan Cui (2):
> > > >   x86/tdx: Retry TDVMCALL_MAP_GPA() when needed
> > > >   x86/tdx: Support vmalloc() for tdx_enc_status_changed()
> > > ...
> > > Dexuan, do you expect these to go through the Hyper-V tree?
> > >
> > > Thanks,
> > > Wei.
> >
> > I suppose Dave and/or other x86 folks would like the 2 patches to go
> > through the tip tree if the patches look good.
> >
> > Hi Dave, any comments on the patches?
> 
> Hi Dave, would you please take a look at the 2 patches?
> 
> The patches have got Reviewed-by/Acked-by from Kirill, Sathyanarayanan
> and Michael.
> The patches can still apply cleanly on today's tip tree's master branch.
> 
> Thanks,
> Dexuan

Hi Dave, kindly ping.

^ permalink raw reply

* RE: [PATCH v9 1/2] x86/tdx: Retry TDVMCALL_MAP_GPA() when needed
From: Dexuan Cui @ 2023-07-27  0:19 UTC (permalink / raw)
  To: Xiaoyao Li, ak@linux.intel.com, arnd@arndb.de, bp@alien8.de,
	brijesh.singh@amd.com, dan.j.williams@intel.com,
	dave.hansen@intel.com, dave.hansen@linux.intel.com, Haiyang Zhang,
	hpa@zytor.com, jane.chu@oracle.com,
	kirill.shutemov@linux.intel.com, KY Srinivasan,
	linux-arch@vger.kernel.org, linux-hyperv@vger.kernel.org,
	luto@kernel.org, mingo@redhat.com, peterz@infradead.org,
	rostedt@goodmis.org, sathyanarayanan.kuppuswamy@linux.intel.com,
	seanjc@google.com, tglx@linutronix.de, tony.luck@intel.com,
	wei.liu@kernel.org, x86@kernel.org, Michael Kelley (LINUX)
  Cc: linux-kernel@vger.kernel.org, Tianyu Lan,
	rick.p.edgecombe@intel.com
In-Reply-To: <e3d2c81f-16e9-9a62-9fcb-d9552c3f12d2@intel.com>

> From: Xiaoyao Li <xiaoyao.li@intel.com>
> Sent: Tuesday, July 18, 2023 7:31 PM
> ...
> On 6/22/2023 3:13 AM, Dexuan Cui wrote:
> > GHCI spec for TDX 1.0 says that the MapGPA call may fail with the R10
> > error code = TDG.VP.VMCALL_RETRY (1), and the guest must retry this
> > operation for the pages in the region starting at the GPA specified
> > in R11.
> >
> > When a fully enlightened TDX guest runs on Hyper-V, Hyper-V can return
> > the retry error when set_memory_decrypted() is called to decrypt up to
> > 1GB of swiotlb bounce buffers.
> 
> just out of curiosity, what size does Hyper-v handle at most in one call?

In my test, Hyper-V can process 1 to 9 pages in one call. Most of the time,
Hyper-V only processes 2 pages or 1 page, in one call.

Thanks,
Dexuan

^ permalink raw reply

* RE: Hyper-V vsock streams do not fill the supplied buffer in full
From: Dexuan Cui @ 2023-07-26 21:34 UTC (permalink / raw)
  To: Stefano Garzarella, Gary Guo
  Cc: KY Srinivasan, Haiyang Zhang, Wei Liu,
	linux-hyperv@vger.kernel.org,
	virtualization@lists.linux-foundation.org, netdev@vger.kernel.org,
	linux-kernel@vger.kernel.org, Nischala Yelchuri
In-Reply-To: <CAGxU2F4_br6e3hEELXP_wpQSZTs5FYhQ-iahiZKzMMRYWpFXdA@mail.gmail.com>

> -----Original Message-----
> From: Stefano Garzarella <sgarzare@redhat.com>
> Sent: Thursday, July 6, 2023 3:02 AM
> To: Gary Guo <gary@garyguo.net>; Dexuan Cui <decui@microsoft.com>
> Cc: KY Srinivasan <kys@microsoft.com>; Haiyang Zhang
> <haiyangz@microsoft.com>; Wei Liu <wei.liu@kernel.org>; linux-
> hyperv@vger.kernel.org; virtualization@lists.linux-foundation.org;
> netdev@vger.kernel.org; linux-kernel@vger.kernel.org
> Subject: Re: Hyper-V vsock streams do not fill the supplied buffer in full
> 
> Hi Gary,
> 
> On Wed, Jul 5, 2023 at 12:45 AM Gary Guo <gary@garyguo.net> wrote:
> >
> > When a vsock stream is called with recvmsg with a buffer, it only fills
> > the buffer with data from the first single VM packet. Even if there are
> > more VM packets at the time and the buffer is still not completely
> > filled, it will just leave the buffer partially filled.
> >
> > This causes some issues in WSLD, which uses the vsock in
> > non-blocking mode and uses epoll.
> >
> > For stream-oriented sockets, the epoll man page [1] says that
> >
> > > For stream-oriented files (e.g., pipe, FIFO, stream socket),
> > > the condition that the read/write I/O space is exhausted can
> > > also be detected by checking the amount of data read from /
> > > written to the target file descriptor.  For example, if you
> > > call read(2) by asking to read a certain amount of data and
> > > read(2) returns a lower number of bytes, you can be sure of
> > > having exhausted the read I/O space for the file descriptor.
> >
> > This has been used as an optimisation in the wild for reducing the number
> > of syscalls required for stream sockets (by asserting that the socket
> > will not have to be polled to EAGAIN in edge-trigger mode, if the buffer
> > given to recvmsg is not filled completely). An example is Tokio, which
> > does this starting in v1.21.0 [2].
> >
> > When this optimisation combines with the behaviour of Hyper-V vsock, it
> > causes an issue in this scenario:
> > * the VM host sends data to the guest, and it's split into multiple
> >   VM packets
> > * sk_data_ready is called and epoll returns, notifying the userspace
> >   that the socket is ready
> > * userspace calls recvmsg with a buffer, and it's partially filled
> > * userspace assumes that the stream socket is depleted, and if new data
> >   arrives epoll will notify it again.
> > * kernel always considers the socket to be ready, and since it's in
> >   edge-trigger mode, the epoll instance will never be notified again.
> >
> > This different realisation of the readiness causes the userspace to
> > block forever.
> 
> Thanks for the detailed description of the problem.
> 
> I think we should fix the hvs_stream_dequeue() in
> net/vmw_vsock/hyperv_transport.c.
> We can do something similar to what we do in
> virtio_transport_stream_do_dequeue() in
> net/vmw_vsock/virtio_transport_common.c
> 
> @Dexuan WDYT?
> 
> Thanks,
> Stefano

(Sorry for the late response...)

Thanks Gary Guo for the good analysis!

I didn't realize that hvs_stream_dequeue() is supposed to
copy as much data as possible to the userspace in the case
of EPOLLET mode. 

Yes, I think we should fix hvs_stream_dequeue(). We'll try to get
this fixed asap.

Thanks,
-- Dexuan

^ permalink raw reply

* [Patch v3 3/4] RDMA/mana_ib : Create adapter and Add error eq
From: sharmaajay @ 2023-07-26 20:08 UTC (permalink / raw)
  To: Jason Gunthorpe, Leon Romanovsky, Dexuan Cui, Wei Liu,
	David S. Miller, Eric Dumazet, Jakub Kicinski, Paolo Abeni
  Cc: linux-rdma, linux-hyperv, netdev, linux-kernel, Ajay Sharma
In-Reply-To: <1690402104-29518-1-git-send-email-sharmaajay@linuxonhyperv.com>

From: Ajay Sharma <sharmaajay@microsoft.com>

Create an adapter object as a container for VF resources.
Add the error EQ, which is needed for adapter creation and is
later used for notifications from the management SW. The
management software uses this channel to send messages or
error notifications back to the client.

Signed-off-by: Ajay Sharma <sharmaajay@microsoft.com>
---
 drivers/infiniband/hw/mana/device.c           |  22 ++-
 drivers/infiniband/hw/mana/main.c             |  95 ++++++++++++
 drivers/infiniband/hw/mana/mana_ib.h          |  33 ++++
 .../net/ethernet/microsoft/mana/gdma_main.c   | 146 ++++++++++--------
 drivers/net/ethernet/microsoft/mana/mana_en.c |   3 +
 include/net/mana/gdma.h                       |  13 +-
 6 files changed, 242 insertions(+), 70 deletions(-)

diff --git a/drivers/infiniband/hw/mana/device.c b/drivers/infiniband/hw/mana/device.c
index ea4c8c8fc10d..4077e440657a 100644
--- a/drivers/infiniband/hw/mana/device.c
+++ b/drivers/infiniband/hw/mana/device.c
@@ -68,7 +68,7 @@ static int mana_ib_probe(struct auxiliary_device *adev,
 	ibdev_dbg(&mib_dev->ib_dev, "mdev=%p id=%d num_ports=%d\n", mdev,
 		  mdev->dev_id.as_uint32, mib_dev->ib_dev.phys_port_cnt);
 
-	mib_dev->gdma_dev = mdev;
+	mib_dev->gc = mdev->gdma_context;
 	mib_dev->ib_dev.node_type = RDMA_NODE_IB_CA;
 
 	/*
@@ -85,15 +85,31 @@ static int mana_ib_probe(struct auxiliary_device *adev,
 		goto free_ib_device;
 	}
 
+	ret = mana_ib_create_error_eq(mib_dev);
+	if (ret) {
+		ibdev_err(&mib_dev->ib_dev, "Failed to allocate err eq");
+		goto deregister_device;
+	}
+
+	ret = mana_ib_create_adapter(mib_dev);
+	if (ret) {
+		ibdev_err(&mib_dev->ib_dev, "Failed to create adapter");
+		goto free_error_eq;
+	}
+
 	ret = ib_register_device(&mib_dev->ib_dev, "mana_%d",
 				 mdev->gdma_context->dev);
 	if (ret)
-		goto deregister_device;
+		goto destroy_adapter;
 
 	dev_set_drvdata(&adev->dev, mib_dev);
 
 	return 0;
 
+destroy_adapter:
+	mana_ib_destroy_adapter(mib_dev);
+free_error_eq:
+	mana_gd_destroy_queue(mib_dev->gc, mib_dev->fatal_err_eq);
 deregister_device:
 	mana_gd_deregister_device(&mib_dev->gc->mana_ib);
 free_ib_device:
@@ -105,6 +121,8 @@ static void mana_ib_remove(struct auxiliary_device *adev)
 {
 	struct mana_ib_dev *mib_dev = dev_get_drvdata(&adev->dev);
 
+	mana_gd_destroy_queue(mib_dev->gc, mib_dev->fatal_err_eq);
+	mana_ib_destroy_adapter(mib_dev);
 	mana_gd_deregister_device(&mib_dev->gc->mana_ib);
 	ib_unregister_device(&mib_dev->ib_dev);
 	ib_dealloc_device(&mib_dev->ib_dev);
diff --git a/drivers/infiniband/hw/mana/main.c b/drivers/infiniband/hw/mana/main.c
index 2c4e3c496644..1b1a8670d0fa 100644
--- a/drivers/infiniband/hw/mana/main.c
+++ b/drivers/infiniband/hw/mana/main.c
@@ -504,3 +504,98 @@ int mana_ib_query_gid(struct ib_device *ibdev, u32 port, int index,
 void mana_ib_disassociate_ucontext(struct ib_ucontext *ibcontext)
 {
 }
+
+int mana_ib_destroy_adapter(struct mana_ib_dev *mib_dev)
+{
+	struct mana_ib_destroy_adapter_resp resp = {};
+	struct mana_ib_destroy_adapter_req req = {};
+	struct gdma_context *gc;
+	int err;
+
+	gc = mib_dev->gc;
+
+	mana_gd_init_req_hdr(&req.hdr, MANA_IB_DESTROY_ADAPTER, sizeof(req),
+			     sizeof(resp));
+	req.adapter = mib_dev->adapter_handle;
+	req.hdr.dev_id = gc->mana_ib.dev_id;
+
+	err = mana_gd_send_request(gc, sizeof(req), &req, sizeof(resp), &resp);
+
+	if (err) {
+		ibdev_err(&mib_dev->ib_dev, "Failed to destroy adapter err %d", err);
+		return err;
+	}
+
+	return 0;
+}
+
+int mana_ib_create_adapter(struct mana_ib_dev *mib_dev)
+{
+	struct mana_ib_create_adapter_resp resp = {};
+	struct mana_ib_create_adapter_req req = {};
+	struct gdma_context *gc;
+	int err;
+
+	gc = mib_dev->gc;
+
+	mana_gd_init_req_hdr(&req.hdr, MANA_IB_CREATE_ADAPTER, sizeof(req),
+			     sizeof(resp));
+	req.notify_eq_id = mib_dev->fatal_err_eq->id;
+	req.hdr.dev_id = gc->mana_ib.dev_id;
+
+	err = mana_gd_send_request(gc, sizeof(req), &req, sizeof(resp), &resp);
+
+	if (err) {
+		ibdev_err(&mib_dev->ib_dev, "Failed to create adapter err %d",
+			  err);
+		return err;
+	}
+
+	mib_dev->adapter_handle = resp.adapter;
+
+	return 0;
+}
+
+static void mana_ib_soc_event_handler(void *ctx, struct gdma_queue *queue,
+				      struct gdma_event *event)
+{
+	struct mana_ib_dev *mib_dev = (struct mana_ib_dev *)ctx;
+
+	switch (event->type) {
+	case GDMA_EQE_SOC_EVENT_NOTIFICATION:
+		ibdev_info(&mib_dev->ib_dev, "Received SOC Notification");
+		break;
+	case GDMA_EQE_SOC_EVENT_TEST:
+		ibdev_info(&mib_dev->ib_dev, "Received SoC Test");
+		break;
+	default:
+		ibdev_dbg(&mib_dev->ib_dev, "Received unsolicited evt %d",
+			  event->type);
+	}
+}
+
+int mana_ib_create_error_eq(struct mana_ib_dev *mib_dev)
+{
+	struct gdma_queue_spec spec = {};
+	int err;
+
+	spec.type = GDMA_EQ;
+	spec.monitor_avl_buf = false;
+	spec.queue_size = EQ_SIZE;
+	spec.eq.callback = mana_ib_soc_event_handler;
+	spec.eq.context = mib_dev;
+	spec.eq.log2_throttle_limit = LOG2_EQ_THROTTLE;
+	spec.eq.msix_allocated = true;
+	spec.eq.msix_index = 0;
+	spec.doorbell = mib_dev->gc->mana_ib.doorbell;
+	spec.pdid = mib_dev->gc->mana_ib.pdid;
+
+	err = mana_gd_create_mana_eq(&mib_dev->gc->mana_ib, &spec,
+				     &mib_dev->fatal_err_eq);
+	if (err)
+		return err;
+
+	mib_dev->fatal_err_eq->eq.disable_needed = true;
+
+	return 0;
+}
diff --git a/drivers/infiniband/hw/mana/mana_ib.h b/drivers/infiniband/hw/mana/mana_ib.h
index 3a2ba6b96f15..8a652bccd978 100644
--- a/drivers/infiniband/hw/mana/mana_ib.h
+++ b/drivers/infiniband/hw/mana/mana_ib.h
@@ -31,6 +31,8 @@ struct mana_ib_dev {
 	struct ib_device ib_dev;
 	struct gdma_dev *gdma_dev;
 	struct gdma_context *gc;
+	struct gdma_queue *fatal_err_eq;
+	mana_handle_t adapter_handle;
 };
 
 struct mana_ib_wq {
@@ -93,6 +95,31 @@ struct mana_ib_rwq_ind_table {
 	struct ib_rwq_ind_table ib_ind_table;
 };
 
+enum mana_ib_command_code {
+	MANA_IB_CREATE_ADAPTER  = 0x30002,
+	MANA_IB_DESTROY_ADAPTER = 0x30003,
+};
+
+struct mana_ib_create_adapter_req {
+	struct gdma_req_hdr hdr;
+	u32 notify_eq_id;
+	u32 reserved;
+}; /*HW Data */
+
+struct mana_ib_create_adapter_resp {
+	struct gdma_resp_hdr hdr;
+	mana_handle_t adapter;
+}; /* HW Data */
+
+struct mana_ib_destroy_adapter_req {
+	struct gdma_req_hdr hdr;
+	mana_handle_t adapter;
+}; /*HW Data */
+
+struct mana_ib_destroy_adapter_resp {
+	struct gdma_resp_hdr hdr;
+}; /* HW Data */
+
 int mana_ib_gd_create_dma_region(struct mana_ib_dev *mib_dev,
 				 struct ib_umem *umem,
 				 mana_handle_t *gdma_region);
@@ -161,4 +188,10 @@ int mana_ib_query_gid(struct ib_device *ibdev, u32 port, int index,
 
 void mana_ib_disassociate_ucontext(struct ib_ucontext *ibcontext);
 
+int mana_ib_create_error_eq(struct mana_ib_dev *mib_dev);
+
+int mana_ib_create_adapter(struct mana_ib_dev *mib_dev);
+
+int mana_ib_destroy_adapter(struct mana_ib_dev *mib_dev);
+
 #endif
diff --git a/drivers/net/ethernet/microsoft/mana/gdma_main.c b/drivers/net/ethernet/microsoft/mana/gdma_main.c
index 9fa7a2d6c2b2..55e194c9d84e 100644
--- a/drivers/net/ethernet/microsoft/mana/gdma_main.c
+++ b/drivers/net/ethernet/microsoft/mana/gdma_main.c
@@ -185,7 +185,8 @@ void mana_gd_free_memory(struct gdma_mem_info *gmi)
 }
 
 static int mana_gd_create_hw_eq(struct gdma_context *gc,
-				struct gdma_queue *queue)
+				struct gdma_queue *queue,
+				u32 doorbell, u32 pdid)
 {
 	struct gdma_create_queue_resp resp = {};
 	struct gdma_create_queue_req req = {};
@@ -199,8 +200,8 @@ static int mana_gd_create_hw_eq(struct gdma_context *gc,
 
 	req.hdr.dev_id = queue->gdma_dev->dev_id;
 	req.type = queue->type;
-	req.pdid = queue->gdma_dev->pdid;
-	req.doolbell_id = queue->gdma_dev->doorbell;
+	req.pdid = pdid;
+	req.doolbell_id = doorbell;
 	req.gdma_region = queue->mem_info.dma_region_handle;
 	req.queue_size = queue->queue_size;
 	req.log2_throttle_limit = queue->eq.log2_throttle_limit;
@@ -371,53 +372,51 @@ static void mana_gd_process_eqe(struct gdma_queue *eq)
 	}
 }
 
-static void mana_gd_process_eq_events(void *arg)
+static void mana_gd_process_eq_events(struct list_head *eq_list)
 {
 	u32 owner_bits, new_bits, old_bits;
 	union gdma_eqe_info eqe_info;
 	struct gdma_eqe *eq_eqe_ptr;
-	struct gdma_queue *eq = arg;
+	struct gdma_queue *eq;
 	struct gdma_context *gc;
 	struct gdma_eqe *eqe;
 	u32 head, num_eqe;
 	int i;
 
-	gc = eq->gdma_dev->gdma_context;
-
-	num_eqe = eq->queue_size / GDMA_EQE_SIZE;
-	eq_eqe_ptr = eq->queue_mem_ptr;
-
-	/* Process up to 5 EQEs at a time, and update the HW head. */
-	for (i = 0; i < 5; i++) {
-		eqe = &eq_eqe_ptr[eq->head % num_eqe];
-		eqe_info.as_uint32 = eqe->eqe_info;
-		owner_bits = eqe_info.owner_bits;
-
-		old_bits = (eq->head / num_eqe - 1) & GDMA_EQE_OWNER_MASK;
-		/* No more entries */
-		if (owner_bits == old_bits)
-			break;
-
-		new_bits = (eq->head / num_eqe) & GDMA_EQE_OWNER_MASK;
-		if (owner_bits != new_bits) {
-			dev_err(gc->dev, "EQ %d: overflow detected\n", eq->id);
-			break;
+	list_for_each_entry_rcu(eq, eq_list, entry) {
+		gc = eq->gdma_dev->gdma_context;
+
+		num_eqe = eq->queue_size / GDMA_EQE_SIZE;
+		eq_eqe_ptr = eq->queue_mem_ptr;
+		/* Process up to 5 EQEs at a time, and update the HW head. */
+		for (i = 0; i < 5; i++) {
+			eqe = &eq_eqe_ptr[eq->head % num_eqe];
+			eqe_info.as_uint32 = eqe->eqe_info;
+			owner_bits = eqe_info.owner_bits;
+
+			old_bits = (eq->head / num_eqe - 1) & GDMA_EQE_OWNER_MASK;
+			/* No more entries */
+			if (owner_bits == old_bits)
+				break;
+
+			new_bits = (eq->head / num_eqe) & GDMA_EQE_OWNER_MASK;
+			if (owner_bits != new_bits) {
+				dev_err(gc->dev, "EQ %d: overflow detected\n",
+					eq->id);
+				break;
+			}
+			/* Per GDMA spec, rmb is necessary after checking owner_bits, before
+			 * reading eqe.
+			 */
+			rmb();
+			mana_gd_process_eqe(eq);
+			eq->head++;
 		}
 
-		/* Per GDMA spec, rmb is necessary after checking owner_bits, before
-		 * reading eqe.
-		 */
-		rmb();
-
-		mana_gd_process_eqe(eq);
-
-		eq->head++;
+		head = eq->head % (num_eqe << GDMA_EQE_OWNER_BITS);
+		mana_gd_ring_doorbell(gc, eq->gdma_dev->doorbell, eq->type,
+				      eq->id, head, SET_ARM_BIT);
 	}
-
-	head = eq->head % (num_eqe << GDMA_EQE_OWNER_BITS);
-
-	mana_gd_ring_doorbell(gc, eq->gdma_dev->doorbell, eq->type, eq->id,
-			      head, SET_ARM_BIT);
 }
 
 static int mana_gd_register_irq(struct gdma_queue *queue,
@@ -435,44 +434,47 @@ static int mana_gd_register_irq(struct gdma_queue *queue,
 	gc = gd->gdma_context;
 	r = &gc->msix_resource;
 	dev = gc->dev;
+	msi_index = spec->eq.msix_index;
 
 	spin_lock_irqsave(&r->lock, flags);
 
-	msi_index = find_first_zero_bit(r->map, r->size);
-	if (msi_index >= r->size || msi_index >= gc->num_msix_usable) {
-		err = -ENOSPC;
-	} else {
-		bitmap_set(r->map, msi_index, 1);
-		queue->eq.msix_index = msi_index;
-	}
-
-	spin_unlock_irqrestore(&r->lock, flags);
+	if (!spec->eq.msix_allocated) {
+		msi_index = find_first_zero_bit(r->map, r->size);
 
-	if (err) {
-		dev_err(dev, "Register IRQ err:%d, msi:%u rsize:%u, nMSI:%u",
-			err, msi_index, r->size, gc->num_msix_usable);
+		if (msi_index >= r->size ||
+		    msi_index >= gc->num_msix_usable)
+			err = -ENOSPC;
+		else
+			bitmap_set(r->map, msi_index, 1);
 
-		return err;
+		if (err) {
+			dev_err(dev, "Register IRQ err:%d, msi:%u rsize:%u, nMSI:%u",
+				err, msi_index, r->size, gc->num_msix_usable);
+				goto out;
+		}
 	}
 
+	queue->eq.msix_index = msi_index;
 	gic = &gc->irq_contexts[msi_index];
 
-	WARN_ON(gic->handler || gic->arg);
-
-	gic->arg = queue;
+	list_add_rcu(&queue->entry, &gic->eq_list);
 
 	gic->handler = mana_gd_process_eq_events;
 
-	return 0;
+out:
+	spin_unlock_irqrestore(&r->lock, flags);
+	return err;
 }
 
-static void mana_gd_deregiser_irq(struct gdma_queue *queue)
+static void mana_gd_deregister_irq(struct gdma_queue *queue)
 {
 	struct gdma_dev *gd = queue->gdma_dev;
 	struct gdma_irq_context *gic;
 	struct gdma_context *gc;
 	struct gdma_resource *r;
 	unsigned int msix_index;
+	struct list_head *p, *n;
+	struct gdma_queue *eq;
 	unsigned long flags;
 
 	gc = gd->gdma_context;
@@ -483,14 +485,23 @@ static void mana_gd_deregiser_irq(struct gdma_queue *queue)
 	if (WARN_ON(msix_index >= gc->num_msix_usable))
 		return;
 
+	spin_lock_irqsave(&r->lock, flags);
+
 	gic = &gc->irq_contexts[msix_index];
-	gic->handler = NULL;
-	gic->arg = NULL;
+	list_for_each_safe(p, n, &gic->eq_list) {
+		eq = list_entry(p, struct gdma_queue, entry);
+		if (queue == eq) {
+			list_del(&eq->entry);
+			break;
+		}
+	}
 
-	spin_lock_irqsave(&r->lock, flags);
-	bitmap_clear(r->map, msix_index, 1);
-	spin_unlock_irqrestore(&r->lock, flags);
+	if (list_empty(&gic->eq_list)) {
+		gic->handler = NULL;
+		bitmap_clear(r->map, msix_index, 1);
+	}
 
+	spin_unlock_irqrestore(&r->lock, flags);
 	queue->eq.msix_index = INVALID_PCI_MSIX_INDEX;
 }
 
@@ -553,7 +564,7 @@ static void mana_gd_destroy_eq(struct gdma_context *gc, bool flush_evenets,
 			dev_warn(gc->dev, "Failed to flush EQ: %d\n", err);
 	}
 
-	mana_gd_deregiser_irq(queue);
+	mana_gd_deregister_irq(queue);
 
 	if (queue->eq.disable_needed)
 		mana_gd_disable_queue(queue);
@@ -568,7 +579,7 @@ static int mana_gd_create_eq(struct gdma_dev *gd,
 	u32 log2_num_entries;
 	int err;
 
-	queue->eq.msix_index = INVALID_PCI_MSIX_INDEX;
+	queue->eq.msix_index = spec->eq.msix_index;
 
 	log2_num_entries = ilog2(queue->queue_size / GDMA_EQE_SIZE);
 
@@ -590,7 +601,8 @@ static int mana_gd_create_eq(struct gdma_dev *gd,
 	queue->eq.log2_throttle_limit = spec->eq.log2_throttle_limit ?: 1;
 
 	if (create_hwq) {
-		err = mana_gd_create_hw_eq(gc, queue);
+		err = mana_gd_create_hw_eq(gc, queue,
+					   spec->doorbell, spec->pdid);
 		if (err)
 			goto out;
 
@@ -800,6 +812,7 @@ int mana_gd_create_mana_eq(struct gdma_dev *gd,
 	kfree(queue);
 	return err;
 }
+EXPORT_SYMBOL(mana_gd_create_mana_eq);
 
 int mana_gd_create_mana_wq_cq(struct gdma_dev *gd,
 			      const struct gdma_queue_spec *spec,
@@ -876,6 +889,7 @@ void mana_gd_destroy_queue(struct gdma_context *gc, struct gdma_queue *queue)
 	mana_gd_free_memory(gmi);
 	kfree(queue);
 }
+EXPORT_SYMBOL(mana_gd_destroy_queue);
 
 int mana_gd_verify_vf_version(struct pci_dev *pdev)
 {
@@ -1193,7 +1207,7 @@ static irqreturn_t mana_gd_intr(int irq, void *arg)
 	struct gdma_irq_context *gic = arg;
 
 	if (gic->handler)
-		gic->handler(gic->arg);
+		gic->handler(&gic->eq_list);
 
 	return IRQ_HANDLED;
 }
@@ -1246,7 +1260,7 @@ static int mana_gd_setup_irqs(struct pci_dev *pdev)
 	for (i = 0; i < nvec; i++) {
 		gic = &gc->irq_contexts[i];
 		gic->handler = NULL;
-		gic->arg = NULL;
+		INIT_LIST_HEAD(&gic->eq_list);
 
 		if (!i)
 			snprintf(gic->name, MANA_IRQ_NAME_SZ, "mana_hwc@pci:%s",
diff --git a/drivers/net/ethernet/microsoft/mana/mana_en.c b/drivers/net/ethernet/microsoft/mana/mana_en.c
index a499e460594b..d2ba7de8b512 100644
--- a/drivers/net/ethernet/microsoft/mana/mana_en.c
+++ b/drivers/net/ethernet/microsoft/mana/mana_en.c
@@ -1167,6 +1167,9 @@ static int mana_create_eq(struct mana_context *ac)
 	spec.eq.callback = NULL;
 	spec.eq.context = ac->eqs;
 	spec.eq.log2_throttle_limit = LOG2_EQ_THROTTLE;
+	spec.eq.msix_allocated = false;
+	spec.doorbell = gd->doorbell;
+	spec.pdid = gd->pdid;
 
 	for (i = 0; i < gc->max_num_queues; i++) {
 		err = mana_gd_create_mana_eq(gd, &spec, &ac->eqs[i].eq);
diff --git a/include/net/mana/gdma.h b/include/net/mana/gdma.h
index e2b212dd722b..aee8e8fa1ea6 100644
--- a/include/net/mana/gdma.h
+++ b/include/net/mana/gdma.h
@@ -57,6 +57,10 @@ enum gdma_eqe_type {
 	GDMA_EQE_HWC_INIT_EQ_ID_DB	= 129,
 	GDMA_EQE_HWC_INIT_DATA		= 130,
 	GDMA_EQE_HWC_INIT_DONE		= 131,
+
+	/* IB NiC  Events start at 176*/
+	GDMA_EQE_SOC_EVENT_NOTIFICATION = 176,
+	GDMA_EQE_SOC_EVENT_TEST,
 };
 
 enum {
@@ -291,6 +295,7 @@ struct gdma_queue {
 
 	u32 head;
 	u32 tail;
+	struct list_head entry;
 
 	/* Extra fields specific to EQ/CQ. */
 	union {
@@ -318,6 +323,8 @@ struct gdma_queue_spec {
 	enum gdma_queue_type type;
 	bool monitor_avl_buf;
 	unsigned int queue_size;
+	u32 doorbell;
+	u32 pdid;
 
 	/* Extra fields specific to EQ/CQ. */
 	union {
@@ -326,6 +333,8 @@ struct gdma_queue_spec {
 			void *context;
 
 			unsigned long log2_throttle_limit;
+			bool msix_allocated;
+			unsigned int msix_index;
 		} eq;
 
 		struct {
@@ -341,8 +350,8 @@ struct gdma_queue_spec {
 #define MANA_IRQ_NAME_SZ 32
 
 struct gdma_irq_context {
-	void (*handler)(void *arg);
-	void *arg;
+	void (*handler)(struct list_head *arg);
+	struct list_head eq_list;
 	char name[MANA_IRQ_NAME_SZ];
 };
 
-- 
2.25.1


^ permalink raw reply related

* [Patch v3 4/4] RDMA/mana_ib : Query adapter capabilities
From: sharmaajay @ 2023-07-26 20:08 UTC (permalink / raw)
  To: Jason Gunthorpe, Leon Romanovsky, Dexuan Cui, Wei Liu,
	David S. Miller, Eric Dumazet, Jakub Kicinski, Paolo Abeni
  Cc: linux-rdma, linux-hyperv, netdev, linux-kernel, Ajay Sharma
In-Reply-To: <1690402104-29518-1-git-send-email-sharmaajay@linuxonhyperv.com>

From: Ajay Sharma <sharmaajay@microsoft.com>

Query the adapter capabilities to expose to
other clients and VF. This checks against
the user supplied values and protects against
overflows.

Signed-off-by: Ajay Sharma <sharmaajay@microsoft.com>
---
 drivers/infiniband/hw/mana/device.c  |  4 ++
 drivers/infiniband/hw/mana/main.c    | 66 +++++++++++++++++++++++++---
 drivers/infiniband/hw/mana/mana_ib.h | 53 +++++++++++++++++++++-
 3 files changed, 115 insertions(+), 8 deletions(-)

diff --git a/drivers/infiniband/hw/mana/device.c b/drivers/infiniband/hw/mana/device.c
index 4077e440657a..e15da43c73a0 100644
--- a/drivers/infiniband/hw/mana/device.c
+++ b/drivers/infiniband/hw/mana/device.c
@@ -97,6 +97,10 @@ static int mana_ib_probe(struct auxiliary_device *adev,
 		goto free_error_eq;
 	}
 
+	ret = mana_ib_query_adapter_caps(mib_dev);
+	if (ret)
+		ibdev_dbg(&mib_dev->ib_dev, "Failed to get caps, use defaults");
+
 	ret = ib_register_device(&mib_dev->ib_dev, "mana_%d",
 				 mdev->gdma_context->dev);
 	if (ret)
diff --git a/drivers/infiniband/hw/mana/main.c b/drivers/infiniband/hw/mana/main.c
index 1b1a8670d0fa..512815e1e64d 100644
--- a/drivers/infiniband/hw/mana/main.c
+++ b/drivers/infiniband/hw/mana/main.c
@@ -469,21 +469,27 @@ int mana_ib_get_port_immutable(struct ib_device *ibdev, u32 port_num,
 int mana_ib_query_device(struct ib_device *ibdev, struct ib_device_attr *props,
 			 struct ib_udata *uhw)
 {
+	struct mana_ib_dev *mib_dev = container_of(ibdev,
+			struct mana_ib_dev, ib_dev);
+
 	props->max_qp = MANA_MAX_NUM_QUEUES;
 	props->max_qp_wr = MAX_SEND_BUFFERS_PER_QUEUE;
-
-	/*
-	 * max_cqe could be potentially much bigger.
-	 * As this version of driver only support RAW QP, set it to the same
-	 * value as max_qp_wr
-	 */
 	props->max_cqe = MAX_SEND_BUFFERS_PER_QUEUE;
-
 	props->max_mr_size = MANA_IB_MAX_MR_SIZE;
 	props->max_mr = MANA_IB_MAX_MR;
 	props->max_send_sge = MAX_TX_WQE_SGL_ENTRIES;
 	props->max_recv_sge = MAX_RX_WQE_SGL_ENTRIES;
 
+	/* If the Management SW is updated and supports adapter creation */
+	if (mib_dev->adapter_handle) {
+		props->max_qp = mib_dev->adapter_caps.max_qp_count;
+		props->max_qp_wr = mib_dev->adapter_caps.max_requester_sq_size;
+		props->max_cqe = mib_dev->adapter_caps.max_requester_sq_size;
+		props->max_mr = mib_dev->adapter_caps.max_mr_count;
+		props->max_send_sge = mib_dev->adapter_caps.max_send_wqe_size;
+		props->max_recv_sge = mib_dev->adapter_caps.max_recv_wqe_size;
+	}
+
 	return 0;
 }
 
@@ -599,3 +605,49 @@ int mana_ib_create_error_eq(struct mana_ib_dev *mib_dev)
 
 	return 0;
 }
+
+static void assign_caps(struct mana_ib_adapter_caps *caps,
+			struct mana_ib_query_adapter_caps_resp *resp)
+{
+	caps->max_sq_id = resp->max_sq_id;
+	caps->max_rq_id = resp->max_rq_id;
+	caps->max_cq_id = resp->max_cq_id;
+	caps->max_qp_count = resp->max_qp_count;
+	caps->max_cq_count = resp->max_cq_count;
+	caps->max_mr_count = resp->max_mr_count;
+	caps->max_pd_count = resp->max_pd_count;
+	caps->max_inbound_read_limit = resp->max_inbound_read_limit;
+	caps->max_outbound_read_limit = resp->max_outbound_read_limit;
+	caps->mw_count = resp->mw_count;
+	caps->max_srq_count = resp->max_srq_count;
+	caps->max_requester_sq_size = resp->max_requester_sq_size;
+	caps->max_responder_sq_size = resp->max_responder_sq_size;
+	caps->max_requester_rq_size = resp->max_requester_rq_size;
+	caps->max_responder_rq_size = resp->max_responder_rq_size;
+	caps->max_send_wqe_size = resp->max_send_wqe_size;
+	caps->max_recv_wqe_size = resp->max_recv_wqe_size;
+	caps->max_inline_data_size = resp->max_inline_data_size;
+}
+
+int mana_ib_query_adapter_caps(struct mana_ib_dev *mib_dev)
+{
+	struct mana_ib_query_adapter_caps_resp resp = {};
+	struct mana_ib_query_adapter_caps_req req = {};
+	int err;
+
+	mana_gd_init_req_hdr(&req.hdr, MANA_IB_GET_ADAPTER_CAP, sizeof(req),
+			     sizeof(resp));
+	req.hdr.resp.msg_version = MANA_IB__GET_ADAPTER_CAP_RESPONSE_V3;
+	req.hdr.dev_id = mib_dev->gc->mana_ib.dev_id;
+
+	err = mana_gd_send_request(mib_dev->gc, sizeof(req), &req,
+				   sizeof(resp), &resp);
+
+	if (err) {
+		ibdev_err(&mib_dev->ib_dev, "Failed to query adapter caps err %d", err);
+		return err;
+	}
+
+	assign_caps(&mib_dev->adapter_caps, &resp);
+	return 0;
+}
diff --git a/drivers/infiniband/hw/mana/mana_ib.h b/drivers/infiniband/hw/mana/mana_ib.h
index 8a652bccd978..1044358230d3 100644
--- a/drivers/infiniband/hw/mana/mana_ib.h
+++ b/drivers/infiniband/hw/mana/mana_ib.h
@@ -20,19 +20,41 @@
 
 /* MANA doesn't have any limit for MR size */
 #define MANA_IB_MAX_MR_SIZE	U64_MAX
-
+#define MANA_IB__GET_ADAPTER_CAP_RESPONSE_V3 3
 /*
  * The hardware limit of number of MRs is greater than maximum number of MRs
  * that can possibly represent in 24 bits
  */
 #define MANA_IB_MAX_MR		0xFFFFFFu
 
+struct mana_ib_adapter_caps {
+	u32 max_sq_id;
+	u32 max_rq_id;
+	u32 max_cq_id;
+	u32 max_qp_count;
+	u32 max_cq_count;
+	u32 max_mr_count;
+	u32 max_pd_count;
+	u32 max_inbound_read_limit;
+	u32 max_outbound_read_limit;
+	u32 mw_count;
+	u32 max_srq_count;
+	u32 max_requester_sq_size;
+	u32 max_responder_sq_size;
+	u32 max_requester_rq_size;
+	u32 max_responder_rq_size;
+	u32 max_send_wqe_size;
+	u32 max_recv_wqe_size;
+	u32 max_inline_data_size;
+};
+
 struct mana_ib_dev {
 	struct ib_device ib_dev;
 	struct gdma_dev *gdma_dev;
 	struct gdma_context *gc;
 	struct gdma_queue *fatal_err_eq;
 	mana_handle_t adapter_handle;
+	struct mana_ib_adapter_caps adapter_caps;
 };
 
 struct mana_ib_wq {
@@ -96,6 +118,7 @@ struct mana_ib_rwq_ind_table {
 };
 
 enum mana_ib_command_code {
+	MANA_IB_GET_ADAPTER_CAP = 0x30001,
 	MANA_IB_CREATE_ADAPTER  = 0x30002,
 	MANA_IB_DESTROY_ADAPTER = 0x30003,
 };
@@ -120,6 +143,32 @@ struct mana_ib_destroy_adapter_resp {
 	struct gdma_resp_hdr hdr;
 }; /* HW Data */
 
+struct mana_ib_query_adapter_caps_req {
+	struct gdma_req_hdr hdr;
+}; /*HW Data */
+
+struct mana_ib_query_adapter_caps_resp {
+	struct gdma_resp_hdr hdr;
+	u32 max_sq_id;
+	u32 max_rq_id;
+	u32 max_cq_id;
+	u32 max_qp_count;
+	u32 max_cq_count;
+	u32 max_mr_count;
+	u32 max_pd_count;
+	u32 max_inbound_read_limit;
+	u32 max_outbound_read_limit;
+	u32 mw_count;
+	u32 max_srq_count;
+	u32 max_requester_sq_size;
+	u32 max_responder_sq_size;
+	u32 max_requester_rq_size;
+	u32 max_responder_rq_size;
+	u32 max_send_wqe_size;
+	u32 max_recv_wqe_size;
+	u32 max_inline_data_size;
+}; /* HW Data */
+
 int mana_ib_gd_create_dma_region(struct mana_ib_dev *mib_dev,
 				 struct ib_umem *umem,
 				 mana_handle_t *gdma_region);
@@ -194,4 +243,6 @@ int mana_ib_create_adapter(struct mana_ib_dev *mib_dev);
 
 int mana_ib_destroy_adapter(struct mana_ib_dev *mib_dev);
 
+int mana_ib_query_adapter_caps(struct mana_ib_dev *mib_dev);
+
 #endif
-- 
2.25.1


^ permalink raw reply related

* [Patch v3 1/4] RDMA/mana_ib : Rename all mana_ib_dev type variables to mib_dev
From: sharmaajay @ 2023-07-26 20:08 UTC (permalink / raw)
  To: Jason Gunthorpe, Leon Romanovsky, Dexuan Cui, Wei Liu,
	David S. Miller, Eric Dumazet, Jakub Kicinski, Paolo Abeni
  Cc: linux-rdma, linux-hyperv, netdev, linux-kernel, Ajay Sharma
In-Reply-To: <1690402104-29518-1-git-send-email-sharmaajay@linuxonhyperv.com>

From: Ajay Sharma <sharmaajay@microsoft.com>

This patch does not introduce any functional changes. It
creates a naming convention to distinguish the variables,
especially when used in the same function. Rename all
mana_ib_dev type variables to mib_dev to have clean
separation between eth dev and ibdev variables.

Signed-off-by: Ajay Sharma <sharmaajay@microsoft.com>
---
 drivers/infiniband/hw/mana/cq.c      | 12 ++--
 drivers/infiniband/hw/mana/device.c  | 34 +++++------
 drivers/infiniband/hw/mana/main.c    | 87 ++++++++++++++--------------
 drivers/infiniband/hw/mana/mana_ib.h |  9 +--
 drivers/infiniband/hw/mana/mr.c      | 29 +++++-----
 drivers/infiniband/hw/mana/qp.c      | 82 +++++++++++++-------------
 drivers/infiniband/hw/mana/wq.c      | 21 +++----
 7 files changed, 140 insertions(+), 134 deletions(-)

diff --git a/drivers/infiniband/hw/mana/cq.c b/drivers/infiniband/hw/mana/cq.c
index d141cab8a1e6..1aed4e6360ba 100644
--- a/drivers/infiniband/hw/mana/cq.c
+++ b/drivers/infiniband/hw/mana/cq.c
@@ -11,10 +11,10 @@ int mana_ib_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr,
 	struct mana_ib_cq *cq = container_of(ibcq, struct mana_ib_cq, ibcq);
 	struct ib_device *ibdev = ibcq->device;
 	struct mana_ib_create_cq ucmd = {};
-	struct mana_ib_dev *mdev;
+	struct mana_ib_dev *mib_dev;
 	int err;
 
-	mdev = container_of(ibdev, struct mana_ib_dev, ib_dev);
+	mib_dev = container_of(ibdev, struct mana_ib_dev, ib_dev);
 
 	if (udata->inlen < sizeof(ucmd))
 		return -EINVAL;
@@ -41,7 +41,7 @@ int mana_ib_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr,
 		return err;
 	}
 
-	err = mana_ib_gd_create_dma_region(mdev, cq->umem, &cq->gdma_region);
+	err = mana_ib_gd_create_dma_region(mib_dev, cq->umem, &cq->gdma_region);
 	if (err) {
 		ibdev_dbg(ibdev,
 			  "Failed to create dma region for create cq, %d\n",
@@ -68,11 +68,11 @@ int mana_ib_destroy_cq(struct ib_cq *ibcq, struct ib_udata *udata)
 {
 	struct mana_ib_cq *cq = container_of(ibcq, struct mana_ib_cq, ibcq);
 	struct ib_device *ibdev = ibcq->device;
-	struct mana_ib_dev *mdev;
+	struct mana_ib_dev *mib_dev;
 
-	mdev = container_of(ibdev, struct mana_ib_dev, ib_dev);
+	mib_dev = container_of(ibdev, struct mana_ib_dev, ib_dev);
 
-	mana_ib_gd_destroy_dma_region(mdev, cq->gdma_region);
+	mana_ib_gd_destroy_dma_region(mib_dev, cq->gdma_region);
 	ib_umem_release(cq->umem);
 
 	return 0;
diff --git a/drivers/infiniband/hw/mana/device.c b/drivers/infiniband/hw/mana/device.c
index d4541b8707e4..083f27246ba8 100644
--- a/drivers/infiniband/hw/mana/device.c
+++ b/drivers/infiniband/hw/mana/device.c
@@ -51,51 +51,51 @@ static int mana_ib_probe(struct auxiliary_device *adev,
 {
 	struct mana_adev *madev = container_of(adev, struct mana_adev, adev);
 	struct gdma_dev *mdev = madev->mdev;
+	struct mana_ib_dev *mib_dev;
 	struct mana_context *mc;
-	struct mana_ib_dev *dev;
 	int ret;
 
 	mc = mdev->driver_data;
 
-	dev = ib_alloc_device(mana_ib_dev, ib_dev);
-	if (!dev)
+	mib_dev = ib_alloc_device(mana_ib_dev, ib_dev);
+	if (!mib_dev)
 		return -ENOMEM;
 
-	ib_set_device_ops(&dev->ib_dev, &mana_ib_dev_ops);
+	ib_set_device_ops(&mib_dev->ib_dev, &mana_ib_dev_ops);
 
-	dev->ib_dev.phys_port_cnt = mc->num_ports;
+	mib_dev->ib_dev.phys_port_cnt = mc->num_ports;
 
-	ibdev_dbg(&dev->ib_dev, "mdev=%p id=%d num_ports=%d\n", mdev,
-		  mdev->dev_id.as_uint32, dev->ib_dev.phys_port_cnt);
+	ibdev_dbg(&mib_dev->ib_dev, "mdev=%p id=%d num_ports=%d\n", mdev,
+		  mdev->dev_id.as_uint32, mib_dev->ib_dev.phys_port_cnt);
 
-	dev->gdma_dev = mdev;
-	dev->ib_dev.node_type = RDMA_NODE_IB_CA;
+	mib_dev->gdma_dev = mdev;
+	mib_dev->ib_dev.node_type = RDMA_NODE_IB_CA;
 
 	/*
 	 * num_comp_vectors needs to set to the max MSIX index
 	 * when interrupts and event queues are implemented
 	 */
-	dev->ib_dev.num_comp_vectors = 1;
-	dev->ib_dev.dev.parent = mdev->gdma_context->dev;
+	mib_dev->ib_dev.num_comp_vectors = 1;
+	mib_dev->ib_dev.dev.parent = mdev->gdma_context->dev;
 
-	ret = ib_register_device(&dev->ib_dev, "mana_%d",
+	ret = ib_register_device(&mib_dev->ib_dev, "mana_%d",
 				 mdev->gdma_context->dev);
 	if (ret) {
-		ib_dealloc_device(&dev->ib_dev);
+		ib_dealloc_device(&mib_dev->ib_dev);
 		return ret;
 	}
 
-	dev_set_drvdata(&adev->dev, dev);
+	dev_set_drvdata(&adev->dev, mib_dev);
 
 	return 0;
 }
 
 static void mana_ib_remove(struct auxiliary_device *adev)
 {
-	struct mana_ib_dev *dev = dev_get_drvdata(&adev->dev);
+	struct mana_ib_dev *mib_dev = dev_get_drvdata(&adev->dev);
 
-	ib_unregister_device(&dev->ib_dev);
-	ib_dealloc_device(&dev->ib_dev);
+	ib_unregister_device(&mib_dev->ib_dev);
+	ib_dealloc_device(&mib_dev->ib_dev);
 }
 
 static const struct auxiliary_device_id mana_id_table[] = {
diff --git a/drivers/infiniband/hw/mana/main.c b/drivers/infiniband/hw/mana/main.c
index 7be4c3adb4e2..189e774cdab6 100644
--- a/drivers/infiniband/hw/mana/main.c
+++ b/drivers/infiniband/hw/mana/main.c
@@ -5,10 +5,10 @@
 
 #include "mana_ib.h"
 
-void mana_ib_uncfg_vport(struct mana_ib_dev *dev, struct mana_ib_pd *pd,
+void mana_ib_uncfg_vport(struct mana_ib_dev *mib_dev, struct mana_ib_pd *pd,
 			 u32 port)
 {
-	struct gdma_dev *gd = dev->gdma_dev;
+	struct gdma_dev *gd = mib_dev->gdma_dev;
 	struct mana_port_context *mpc;
 	struct net_device *ndev;
 	struct mana_context *mc;
@@ -28,10 +28,11 @@ void mana_ib_uncfg_vport(struct mana_ib_dev *dev, struct mana_ib_pd *pd,
 	mutex_unlock(&pd->vport_mutex);
 }
 
-int mana_ib_cfg_vport(struct mana_ib_dev *dev, u32 port, struct mana_ib_pd *pd,
+int mana_ib_cfg_vport(struct mana_ib_dev *mib_dev, u32 port,
+		      struct mana_ib_pd *pd,
 		      u32 doorbell_id)
 {
-	struct gdma_dev *mdev = dev->gdma_dev;
+	struct gdma_dev *mdev = mib_dev->gdma_dev;
 	struct mana_port_context *mpc;
 	struct mana_context *mc;
 	struct net_device *ndev;
@@ -45,7 +46,7 @@ int mana_ib_cfg_vport(struct mana_ib_dev *dev, u32 port, struct mana_ib_pd *pd,
 
 	pd->vport_use_count++;
 	if (pd->vport_use_count > 1) {
-		ibdev_dbg(&dev->ib_dev,
+		ibdev_dbg(&mib_dev->ib_dev,
 			  "Skip as this PD is already configured vport\n");
 		mutex_unlock(&pd->vport_mutex);
 		return 0;
@@ -56,7 +57,8 @@ int mana_ib_cfg_vport(struct mana_ib_dev *dev, u32 port, struct mana_ib_pd *pd,
 		pd->vport_use_count--;
 		mutex_unlock(&pd->vport_mutex);
 
-		ibdev_dbg(&dev->ib_dev, "Failed to configure vPort %d\n", err);
+		ibdev_dbg(&mib_dev->ib_dev, "Failed to configure vPort %d\n",
+			  err);
 		return err;
 	}
 
@@ -65,7 +67,7 @@ int mana_ib_cfg_vport(struct mana_ib_dev *dev, u32 port, struct mana_ib_pd *pd,
 	pd->tx_shortform_allowed = mpc->tx_shortform_allowed;
 	pd->tx_vp_offset = mpc->tx_vp_offset;
 
-	ibdev_dbg(&dev->ib_dev, "vport handle %llx pdid %x doorbell_id %x\n",
+	ibdev_dbg(&mib_dev->ib_dev, "vport handle %llx pdid %x doorbell_id %x\n",
 		  mpc->port_handle, pd->pdn, doorbell_id);
 
 	return 0;
@@ -78,12 +80,12 @@ int mana_ib_alloc_pd(struct ib_pd *ibpd, struct ib_udata *udata)
 	struct gdma_create_pd_resp resp = {};
 	struct gdma_create_pd_req req = {};
 	enum gdma_pd_flags flags = 0;
-	struct mana_ib_dev *dev;
+	struct mana_ib_dev *mib_dev;
 	struct gdma_dev *mdev;
 	int err;
 
-	dev = container_of(ibdev, struct mana_ib_dev, ib_dev);
-	mdev = dev->gdma_dev;
+	mib_dev = container_of(ibdev, struct mana_ib_dev, ib_dev);
+	mdev = mib_dev->gdma_dev;
 
 	mana_gd_init_req_hdr(&req.hdr, GDMA_CREATE_PD, sizeof(req),
 			     sizeof(resp));
@@ -93,7 +95,7 @@ int mana_ib_alloc_pd(struct ib_pd *ibpd, struct ib_udata *udata)
 				   sizeof(resp), &resp);
 
 	if (err || resp.hdr.status) {
-		ibdev_dbg(&dev->ib_dev,
+		ibdev_dbg(&mib_dev->ib_dev,
 			  "Failed to get pd_id err %d status %u\n", err,
 			  resp.hdr.status);
 		if (!err)
@@ -104,7 +106,7 @@ int mana_ib_alloc_pd(struct ib_pd *ibpd, struct ib_udata *udata)
 
 	pd->pd_handle = resp.pd_handle;
 	pd->pdn = resp.pd_id;
-	ibdev_dbg(&dev->ib_dev, "pd_handle 0x%llx pd_id %d\n",
+	ibdev_dbg(&mib_dev->ib_dev, "pd_handle 0x%llx pd_id %d\n",
 		  pd->pd_handle, pd->pdn);
 
 	mutex_init(&pd->vport_mutex);
@@ -118,12 +120,12 @@ int mana_ib_dealloc_pd(struct ib_pd *ibpd, struct ib_udata *udata)
 	struct ib_device *ibdev = ibpd->device;
 	struct gdma_destory_pd_resp resp = {};
 	struct gdma_destroy_pd_req req = {};
-	struct mana_ib_dev *dev;
+	struct mana_ib_dev *mib_dev;
 	struct gdma_dev *mdev;
 	int err;
 
-	dev = container_of(ibdev, struct mana_ib_dev, ib_dev);
-	mdev = dev->gdma_dev;
+	mib_dev = container_of(ibdev, struct mana_ib_dev, ib_dev);
+	mdev = mib_dev->gdma_dev;
 
 	mana_gd_init_req_hdr(&req.hdr, GDMA_DESTROY_PD, sizeof(req),
 			     sizeof(resp));
@@ -133,7 +135,7 @@ int mana_ib_dealloc_pd(struct ib_pd *ibpd, struct ib_udata *udata)
 				   sizeof(resp), &resp);
 
 	if (err || resp.hdr.status) {
-		ibdev_dbg(&dev->ib_dev,
+		ibdev_dbg(&mib_dev->ib_dev,
 			  "Failed to destroy pd_handle 0x%llx err %d status %u",
 			  pd->pd_handle, err, resp.hdr.status);
 		if (!err)
@@ -204,14 +206,14 @@ int mana_ib_alloc_ucontext(struct ib_ucontext *ibcontext,
 	struct mana_ib_ucontext *ucontext =
 		container_of(ibcontext, struct mana_ib_ucontext, ibucontext);
 	struct ib_device *ibdev = ibcontext->device;
-	struct mana_ib_dev *mdev;
+	struct mana_ib_dev *mib_dev;
 	struct gdma_context *gc;
 	struct gdma_dev *dev;
 	int doorbell_page;
 	int ret;
 
-	mdev = container_of(ibdev, struct mana_ib_dev, ib_dev);
-	dev = mdev->gdma_dev;
+	mib_dev = container_of(ibdev, struct mana_ib_dev, ib_dev);
+	dev = mib_dev->gdma_dev;
 	gc = dev->gdma_context;
 
 	/* Allocate a doorbell page index */
@@ -233,12 +235,12 @@ void mana_ib_dealloc_ucontext(struct ib_ucontext *ibcontext)
 	struct mana_ib_ucontext *mana_ucontext =
 		container_of(ibcontext, struct mana_ib_ucontext, ibucontext);
 	struct ib_device *ibdev = ibcontext->device;
-	struct mana_ib_dev *mdev;
+	struct mana_ib_dev *mib_dev;
 	struct gdma_context *gc;
 	int ret;
 
-	mdev = container_of(ibdev, struct mana_ib_dev, ib_dev);
-	gc = mdev->gdma_dev->gdma_context;
+	mib_dev = container_of(ibdev, struct mana_ib_dev, ib_dev);
+	gc = mib_dev->gdma_dev->gdma_context;
 
 	ret = mana_gd_destroy_doorbell_page(gc, mana_ucontext->doorbell);
 	if (ret)
@@ -246,7 +248,7 @@ void mana_ib_dealloc_ucontext(struct ib_ucontext *ibcontext)
 }
 
 static int
-mana_ib_gd_first_dma_region(struct mana_ib_dev *dev,
+mana_ib_gd_first_dma_region(struct mana_ib_dev *mib_dev,
 			    struct gdma_context *gc,
 			    struct gdma_create_dma_region_req *create_req,
 			    size_t num_pages, mana_handle_t *gdma_region,
@@ -263,7 +265,7 @@ mana_ib_gd_first_dma_region(struct mana_ib_dev *dev,
 	err = mana_gd_send_request(gc, create_req_msg_size, create_req,
 				   sizeof(create_resp), &create_resp);
 	if (err || create_resp.hdr.status != expected_status) {
-		ibdev_dbg(&dev->ib_dev,
+		ibdev_dbg(&mib_dev->ib_dev,
 			  "Failed to create DMA region: %d, 0x%x\n",
 			  err, create_resp.hdr.status);
 		if (!err)
@@ -273,14 +275,14 @@ mana_ib_gd_first_dma_region(struct mana_ib_dev *dev,
 	}
 
 	*gdma_region = create_resp.dma_region_handle;
-	ibdev_dbg(&dev->ib_dev, "Created DMA region handle 0x%llx\n",
+	ibdev_dbg(&mib_dev->ib_dev, "Created DMA region handle 0x%llx\n",
 		  *gdma_region);
 
 	return 0;
 }
 
 static int
-mana_ib_gd_add_dma_region(struct mana_ib_dev *dev, struct gdma_context *gc,
+mana_ib_gd_add_dma_region(struct mana_ib_dev *mib_dev, struct gdma_context *gc,
 			  struct gdma_dma_region_add_pages_req *add_req,
 			  unsigned int num_pages, u32 expected_status)
 {
@@ -296,7 +298,7 @@ mana_ib_gd_add_dma_region(struct mana_ib_dev *dev, struct gdma_context *gc,
 	err = mana_gd_send_request(gc, add_req_msg_size, add_req,
 				   sizeof(add_resp), &add_resp);
 	if (err || add_resp.hdr.status != expected_status) {
-		ibdev_dbg(&dev->ib_dev,
+		ibdev_dbg(&mib_dev->ib_dev,
 			  "Failed to create DMA region: %d, 0x%x\n",
 			  err, add_resp.hdr.status);
 
@@ -309,7 +311,8 @@ mana_ib_gd_add_dma_region(struct mana_ib_dev *dev, struct gdma_context *gc,
 	return 0;
 }
 
-int mana_ib_gd_create_dma_region(struct mana_ib_dev *dev, struct ib_umem *umem,
+int mana_ib_gd_create_dma_region(struct mana_ib_dev *mib_dev,
+				 struct ib_umem *umem,
 				 mana_handle_t *gdma_region)
 {
 	struct gdma_dma_region_add_pages_req *add_req = NULL;
@@ -329,14 +332,14 @@ int mana_ib_gd_create_dma_region(struct mana_ib_dev *dev, struct ib_umem *umem,
 	void *request_buf;
 	int err;
 
-	mdev = dev->gdma_dev;
+	mdev = mib_dev->gdma_dev;
 	gc = mdev->gdma_context;
 	hwc = gc->hwc.driver_data;
 
 	/* Hardware requires dma region to align to chosen page size */
 	page_sz = ib_umem_find_best_pgsz(umem, PAGE_SZ_BM, 0);
 	if (!page_sz) {
-		ibdev_dbg(&dev->ib_dev, "failed to find page size.\n");
+		ibdev_dbg(&mib_dev->ib_dev, "failed to find page size.\n");
 		return -ENOMEM;
 	}
 	num_pages_total = ib_umem_num_dma_blocks(umem, page_sz);
@@ -362,13 +365,13 @@ int mana_ib_gd_create_dma_region(struct mana_ib_dev *dev, struct ib_umem *umem,
 	create_req->gdma_page_type = order_base_2(page_sz) - PAGE_SHIFT;
 	create_req->page_count = num_pages_total;
 
-	ibdev_dbg(&dev->ib_dev, "size_dma_region %lu num_pages_total %lu\n",
+	ibdev_dbg(&mib_dev->ib_dev, "size_dma_region %lu num_pages_total %lu\n",
 		  umem->length, num_pages_total);
 
-	ibdev_dbg(&dev->ib_dev, "page_sz %lu offset_in_page %u\n",
+	ibdev_dbg(&mib_dev->ib_dev, "page_sz %lu offset_in_page %u\n",
 		  page_sz, create_req->offset_in_page);
 
-	ibdev_dbg(&dev->ib_dev, "num_pages_to_handle %lu, gdma_page_type %u",
+	ibdev_dbg(&mib_dev->ib_dev, "num_pages_to_handle %lu, gdma_page_type %u",
 		  num_pages_to_handle, create_req->gdma_page_type);
 
 	page_addr_list = create_req->page_addr_list;
@@ -385,7 +388,7 @@ int mana_ib_gd_create_dma_region(struct mana_ib_dev *dev, struct ib_umem *umem,
 
 		if (!num_pages_processed) {
 			/* First create message */
-			err = mana_ib_gd_first_dma_region(dev, gc, create_req,
+			err = mana_ib_gd_first_dma_region(mib_dev, gc, create_req,
 							  tail, gdma_region,
 							  expected_status);
 			if (err)
@@ -400,7 +403,7 @@ int mana_ib_gd_create_dma_region(struct mana_ib_dev *dev, struct ib_umem *umem,
 			page_addr_list = add_req->page_addr_list;
 		} else {
 			/* Subsequent create messages */
-			err = mana_ib_gd_add_dma_region(dev, gc, add_req, tail,
+			err = mana_ib_gd_add_dma_region(mib_dev, gc, add_req, tail,
 							expected_status);
 			if (err)
 				break;
@@ -417,20 +420,20 @@ int mana_ib_gd_create_dma_region(struct mana_ib_dev *dev, struct ib_umem *umem,
 	}
 
 	if (err)
-		mana_ib_gd_destroy_dma_region(dev, *gdma_region);
+		mana_ib_gd_destroy_dma_region(mib_dev, *gdma_region);
 
 out:
 	kfree(request_buf);
 	return err;
 }
 
-int mana_ib_gd_destroy_dma_region(struct mana_ib_dev *dev, u64 gdma_region)
+int mana_ib_gd_destroy_dma_region(struct mana_ib_dev *mib_dev, u64 gdma_region)
 {
-	struct gdma_dev *mdev = dev->gdma_dev;
+	struct gdma_dev *mdev = mib_dev->gdma_dev;
 	struct gdma_context *gc;
 
 	gc = mdev->gdma_context;
-	ibdev_dbg(&dev->ib_dev, "destroy dma region 0x%llx\n", gdma_region);
+	ibdev_dbg(&mib_dev->ib_dev, "destroy dma region 0x%llx\n", gdma_region);
 
 	return mana_gd_destroy_dma_region(gc, gdma_region);
 }
@@ -440,14 +443,14 @@ int mana_ib_mmap(struct ib_ucontext *ibcontext, struct vm_area_struct *vma)
 	struct mana_ib_ucontext *mana_ucontext =
 		container_of(ibcontext, struct mana_ib_ucontext, ibucontext);
 	struct ib_device *ibdev = ibcontext->device;
-	struct mana_ib_dev *mdev;
+	struct mana_ib_dev *mib_dev;
 	struct gdma_context *gc;
 	phys_addr_t pfn;
 	pgprot_t prot;
 	int ret;
 
-	mdev = container_of(ibdev, struct mana_ib_dev, ib_dev);
-	gc = mdev->gdma_dev->gdma_context;
+	mib_dev = container_of(ibdev, struct mana_ib_dev, ib_dev);
+	gc = mib_dev->gdma_dev->gdma_context;
 
 	if (vma->vm_pgoff != 0) {
 		ibdev_dbg(ibdev, "Unexpected vm_pgoff %lu\n", vma->vm_pgoff);
diff --git a/drivers/infiniband/hw/mana/mana_ib.h b/drivers/infiniband/hw/mana/mana_ib.h
index 502cc8672eef..ee4efd0af278 100644
--- a/drivers/infiniband/hw/mana/mana_ib.h
+++ b/drivers/infiniband/hw/mana/mana_ib.h
@@ -92,10 +92,11 @@ struct mana_ib_rwq_ind_table {
 	struct ib_rwq_ind_table ib_ind_table;
 };
 
-int mana_ib_gd_create_dma_region(struct mana_ib_dev *dev, struct ib_umem *umem,
+int mana_ib_gd_create_dma_region(struct mana_ib_dev *mib_dev,
+				 struct ib_umem *umem,
 				 mana_handle_t *gdma_region);
 
-int mana_ib_gd_destroy_dma_region(struct mana_ib_dev *dev,
+int mana_ib_gd_destroy_dma_region(struct mana_ib_dev *mib_dev,
 				  mana_handle_t gdma_region);
 
 struct ib_wq *mana_ib_create_wq(struct ib_pd *pd,
@@ -129,9 +130,9 @@ int mana_ib_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr,
 
 int mana_ib_destroy_qp(struct ib_qp *ibqp, struct ib_udata *udata);
 
-int mana_ib_cfg_vport(struct mana_ib_dev *dev, u32 port_id,
+int mana_ib_cfg_vport(struct mana_ib_dev *mib_dev, u32 port_id,
 		      struct mana_ib_pd *pd, u32 doorbell_id);
-void mana_ib_uncfg_vport(struct mana_ib_dev *dev, struct mana_ib_pd *pd,
+void mana_ib_uncfg_vport(struct mana_ib_dev *mib_dev, struct mana_ib_pd *pd,
 			 u32 port);
 
 int mana_ib_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr,
diff --git a/drivers/infiniband/hw/mana/mr.c b/drivers/infiniband/hw/mana/mr.c
index 351207c60eb6..f6a53906204d 100644
--- a/drivers/infiniband/hw/mana/mr.c
+++ b/drivers/infiniband/hw/mana/mr.c
@@ -25,12 +25,13 @@ mana_ib_verbs_to_gdma_access_flags(int access_flags)
 	return flags;
 }
 
-static int mana_ib_gd_create_mr(struct mana_ib_dev *dev, struct mana_ib_mr *mr,
+static int mana_ib_gd_create_mr(struct mana_ib_dev *mib_dev,
+				struct mana_ib_mr *mr,
 				struct gdma_create_mr_params *mr_params)
 {
+	struct gdma_dev *mdev = mib_dev->gdma_dev;
 	struct gdma_create_mr_response resp = {};
 	struct gdma_create_mr_request req = {};
-	struct gdma_dev *mdev = dev->gdma_dev;
 	struct gdma_context *gc;
 	int err;
 
@@ -49,7 +50,7 @@ static int mana_ib_gd_create_mr(struct mana_ib_dev *dev, struct mana_ib_mr *mr,
 		break;
 
 	default:
-		ibdev_dbg(&dev->ib_dev,
+		ibdev_dbg(&mib_dev->ib_dev,
 			  "invalid param (GDMA_MR_TYPE) passed, type %d\n",
 			  req.mr_type);
 		return -EINVAL;
@@ -58,7 +59,7 @@ static int mana_ib_gd_create_mr(struct mana_ib_dev *dev, struct mana_ib_mr *mr,
 	err = mana_gd_send_request(gc, sizeof(req), &req, sizeof(resp), &resp);
 
 	if (err || resp.hdr.status) {
-		ibdev_dbg(&dev->ib_dev, "Failed to create mr %d, %u", err,
+		ibdev_dbg(&mib_dev->ib_dev, "Failed to create mr %d, %u", err,
 			  resp.hdr.status);
 		if (!err)
 			err = -EPROTO;
@@ -73,11 +74,11 @@ static int mana_ib_gd_create_mr(struct mana_ib_dev *dev, struct mana_ib_mr *mr,
 	return 0;
 }
 
-static int mana_ib_gd_destroy_mr(struct mana_ib_dev *dev, u64 mr_handle)
+static int mana_ib_gd_destroy_mr(struct mana_ib_dev *mib_dev, u64 mr_handle)
 {
 	struct gdma_destroy_mr_response resp = {};
+	struct gdma_dev *mdev = mib_dev->gdma_dev;
 	struct gdma_destroy_mr_request req = {};
-	struct gdma_dev *mdev = dev->gdma_dev;
 	struct gdma_context *gc;
 	int err;
 
@@ -107,12 +108,12 @@ struct ib_mr *mana_ib_reg_user_mr(struct ib_pd *ibpd, u64 start, u64 length,
 	struct mana_ib_pd *pd = container_of(ibpd, struct mana_ib_pd, ibpd);
 	struct gdma_create_mr_params mr_params = {};
 	struct ib_device *ibdev = ibpd->device;
-	struct mana_ib_dev *dev;
+	struct mana_ib_dev *mib_dev;
 	struct mana_ib_mr *mr;
 	u64 dma_region_handle;
 	int err;
 
-	dev = container_of(ibdev, struct mana_ib_dev, ib_dev);
+	mib_dev = container_of(ibdev, struct mana_ib_dev, ib_dev);
 
 	ibdev_dbg(ibdev,
 		  "start 0x%llx, iova 0x%llx length 0x%llx access_flags 0x%x",
@@ -133,7 +134,7 @@ struct ib_mr *mana_ib_reg_user_mr(struct ib_pd *ibpd, u64 start, u64 length,
 		goto err_free;
 	}
 
-	err = mana_ib_gd_create_dma_region(dev, mr->umem, &dma_region_handle);
+	err = mana_ib_gd_create_dma_region(mib_dev, mr->umem, &dma_region_handle);
 	if (err) {
 		ibdev_dbg(ibdev, "Failed create dma region for user-mr, %d\n",
 			  err);
@@ -151,7 +152,7 @@ struct ib_mr *mana_ib_reg_user_mr(struct ib_pd *ibpd, u64 start, u64 length,
 	mr_params.gva.access_flags =
 		mana_ib_verbs_to_gdma_access_flags(access_flags);
 
-	err = mana_ib_gd_create_mr(dev, mr, &mr_params);
+	err = mana_ib_gd_create_mr(mib_dev, mr, &mr_params);
 	if (err)
 		goto err_dma_region;
 
@@ -164,7 +165,7 @@ struct ib_mr *mana_ib_reg_user_mr(struct ib_pd *ibpd, u64 start, u64 length,
 	return &mr->ibmr;
 
 err_dma_region:
-	mana_gd_destroy_dma_region(dev->gdma_dev->gdma_context,
+	mana_gd_destroy_dma_region(mib_dev->gdma_dev->gdma_context,
 				   dma_region_handle);
 
 err_umem:
@@ -179,12 +180,12 @@ int mana_ib_dereg_mr(struct ib_mr *ibmr, struct ib_udata *udata)
 {
 	struct mana_ib_mr *mr = container_of(ibmr, struct mana_ib_mr, ibmr);
 	struct ib_device *ibdev = ibmr->device;
-	struct mana_ib_dev *dev;
+	struct mana_ib_dev *mib_dev;
 	int err;
 
-	dev = container_of(ibdev, struct mana_ib_dev, ib_dev);
+	mib_dev = container_of(ibdev, struct mana_ib_dev, ib_dev);
 
-	err = mana_ib_gd_destroy_mr(dev, mr->mr_handle);
+	err = mana_ib_gd_destroy_mr(mib_dev, mr->mr_handle);
 	if (err)
 		return err;
 
diff --git a/drivers/infiniband/hw/mana/qp.c b/drivers/infiniband/hw/mana/qp.c
index 4b3b5b274e84..2e3a57123ed7 100644
--- a/drivers/infiniband/hw/mana/qp.c
+++ b/drivers/infiniband/hw/mana/qp.c
@@ -5,7 +5,7 @@
 
 #include "mana_ib.h"
 
-static int mana_ib_cfg_vport_steering(struct mana_ib_dev *dev,
+static int mana_ib_cfg_vport_steering(struct mana_ib_dev *mib_dev,
 				      struct net_device *ndev,
 				      mana_handle_t default_rxobj,
 				      mana_handle_t ind_table[],
@@ -21,7 +21,7 @@ static int mana_ib_cfg_vport_steering(struct mana_ib_dev *dev,
 	u32 req_buf_size;
 	int i, err;
 
-	mdev = dev->gdma_dev;
+	mdev = mib_dev->gdma_dev;
 	gc = mdev->gdma_context;
 
 	req_buf_size =
@@ -55,10 +55,10 @@ static int mana_ib_cfg_vport_steering(struct mana_ib_dev *dev,
 	 * MANA_INDIRECT_TABLE_SIZE entries. Adjust the verb
 	 * ind_table to MANA_INDIRECT_TABLE_SIZE if required
 	 */
-	ibdev_dbg(&dev->ib_dev, "ind table size %u\n", 1 << log_ind_tbl_size);
+	ibdev_dbg(&mib_dev->ib_dev, "ind table size %u\n", 1 << log_ind_tbl_size);
 	for (i = 0; i < MANA_INDIRECT_TABLE_SIZE; i++) {
 		req_indir_tab[i] = ind_table[i % (1 << log_ind_tbl_size)];
-		ibdev_dbg(&dev->ib_dev, "index %u handle 0x%llx\n", i,
+		ibdev_dbg(&mib_dev->ib_dev, "index %u handle 0x%llx\n", i,
 			  req_indir_tab[i]);
 	}
 
@@ -68,7 +68,7 @@ static int mana_ib_cfg_vport_steering(struct mana_ib_dev *dev,
 	else
 		netdev_rss_key_fill(req->hashkey, MANA_HASH_KEY_SIZE);
 
-	ibdev_dbg(&dev->ib_dev, "vport handle %llu default_rxobj 0x%llx\n",
+	ibdev_dbg(&mib_dev->ib_dev, "vport handle %llu default_rxobj 0x%llx\n",
 		  req->vport, default_rxobj);
 
 	err = mana_gd_send_request(gc, req_buf_size, req, sizeof(resp), &resp);
@@ -97,12 +97,12 @@ static int mana_ib_create_qp_rss(struct ib_qp *ibqp, struct ib_pd *pd,
 				 struct ib_udata *udata)
 {
 	struct mana_ib_qp *qp = container_of(ibqp, struct mana_ib_qp, ibqp);
-	struct mana_ib_dev *mdev =
+	struct mana_ib_dev *mib_dev =
 		container_of(pd->device, struct mana_ib_dev, ib_dev);
 	struct ib_rwq_ind_table *ind_tbl = attr->rwq_ind_tbl;
 	struct mana_ib_create_qp_rss_resp resp = {};
 	struct mana_ib_create_qp_rss ucmd = {};
-	struct gdma_dev *gd = mdev->gdma_dev;
+	struct gdma_dev *gd = mib_dev->gdma_dev;
 	mana_handle_t *mana_ind_table;
 	struct mana_port_context *mpc;
 	struct mana_context *mc;
@@ -123,21 +123,21 @@ static int mana_ib_create_qp_rss(struct ib_qp *ibqp, struct ib_pd *pd,
 
 	ret = ib_copy_from_udata(&ucmd, udata, min(sizeof(ucmd), udata->inlen));
 	if (ret) {
-		ibdev_dbg(&mdev->ib_dev,
+		ibdev_dbg(&mib_dev->ib_dev,
 			  "Failed copy from udata for create rss-qp, err %d\n",
 			  ret);
 		return ret;
 	}
 
 	if (attr->cap.max_recv_wr > MAX_SEND_BUFFERS_PER_QUEUE) {
-		ibdev_dbg(&mdev->ib_dev,
+		ibdev_dbg(&mib_dev->ib_dev,
 			  "Requested max_recv_wr %d exceeding limit\n",
 			  attr->cap.max_recv_wr);
 		return -EINVAL;
 	}
 
 	if (attr->cap.max_recv_sge > MAX_RX_WQE_SGL_ENTRIES) {
-		ibdev_dbg(&mdev->ib_dev,
+		ibdev_dbg(&mib_dev->ib_dev,
 			  "Requested max_recv_sge %d exceeding limit\n",
 			  attr->cap.max_recv_sge);
 		return -EINVAL;
@@ -145,14 +145,14 @@ static int mana_ib_create_qp_rss(struct ib_qp *ibqp, struct ib_pd *pd,
 
 	ind_tbl_size = 1 << ind_tbl->log_ind_tbl_size;
 	if (ind_tbl_size > MANA_INDIRECT_TABLE_SIZE) {
-		ibdev_dbg(&mdev->ib_dev,
+		ibdev_dbg(&mib_dev->ib_dev,
 			  "Indirect table size %d exceeding limit\n",
 			  ind_tbl_size);
 		return -EINVAL;
 	}
 
 	if (ucmd.rx_hash_function != MANA_IB_RX_HASH_FUNC_TOEPLITZ) {
-		ibdev_dbg(&mdev->ib_dev,
+		ibdev_dbg(&mib_dev->ib_dev,
 			  "RX Hash function is not supported, %d\n",
 			  ucmd.rx_hash_function);
 		return -EINVAL;
@@ -161,14 +161,14 @@ static int mana_ib_create_qp_rss(struct ib_qp *ibqp, struct ib_pd *pd,
 	/* IB ports start with 1, MANA start with 0 */
 	port = ucmd.port;
 	if (port < 1 || port > mc->num_ports) {
-		ibdev_dbg(&mdev->ib_dev, "Invalid port %u in creating qp\n",
+		ibdev_dbg(&mib_dev->ib_dev, "Invalid port %u in creating qp\n",
 			  port);
 		return -EINVAL;
 	}
 	ndev = mc->ports[port - 1];
 	mpc = netdev_priv(ndev);
 
-	ibdev_dbg(&mdev->ib_dev, "rx_hash_function %d port %d\n",
+	ibdev_dbg(&mib_dev->ib_dev, "rx_hash_function %d port %d\n",
 		  ucmd.rx_hash_function, port);
 
 	mana_ind_table = kcalloc(ind_tbl_size, sizeof(mana_handle_t),
@@ -210,7 +210,7 @@ static int mana_ib_create_qp_rss(struct ib_qp *ibqp, struct ib_pd *pd,
 		wq->id = wq_spec.queue_index;
 		cq->id = cq_spec.queue_index;
 
-		ibdev_dbg(&mdev->ib_dev,
+		ibdev_dbg(&mib_dev->ib_dev,
 			  "ret %d rx_object 0x%llx wq id %llu cq id %llu\n",
 			  ret, wq->rx_object, wq->id, cq->id);
 
@@ -221,7 +221,7 @@ static int mana_ib_create_qp_rss(struct ib_qp *ibqp, struct ib_pd *pd,
 	}
 	resp.num_entries = i;
 
-	ret = mana_ib_cfg_vport_steering(mdev, ndev, wq->rx_object,
+	ret = mana_ib_cfg_vport_steering(mib_dev, ndev, wq->rx_object,
 					 mana_ind_table,
 					 ind_tbl->log_ind_tbl_size,
 					 ucmd.rx_hash_key_len,
@@ -231,7 +231,7 @@ static int mana_ib_create_qp_rss(struct ib_qp *ibqp, struct ib_pd *pd,
 
 	ret = ib_copy_to_udata(udata, &resp, sizeof(resp));
 	if (ret) {
-		ibdev_dbg(&mdev->ib_dev,
+		ibdev_dbg(&mib_dev->ib_dev,
 			  "Failed to copy to udata create rss-qp, %d\n",
 			  ret);
 		goto fail;
@@ -259,7 +259,7 @@ static int mana_ib_create_qp_raw(struct ib_qp *ibqp, struct ib_pd *ibpd,
 {
 	struct mana_ib_pd *pd = container_of(ibpd, struct mana_ib_pd, ibpd);
 	struct mana_ib_qp *qp = container_of(ibqp, struct mana_ib_qp, ibqp);
-	struct mana_ib_dev *mdev =
+	struct mana_ib_dev *mib_dev =
 		container_of(ibpd->device, struct mana_ib_dev, ib_dev);
 	struct mana_ib_cq *send_cq =
 		container_of(attr->send_cq, struct mana_ib_cq, ibcq);
@@ -267,7 +267,7 @@ static int mana_ib_create_qp_raw(struct ib_qp *ibqp, struct ib_pd *ibpd,
 		rdma_udata_to_drv_context(udata, struct mana_ib_ucontext,
 					  ibucontext);
 	struct mana_ib_create_qp_resp resp = {};
-	struct gdma_dev *gd = mdev->gdma_dev;
+	struct gdma_dev *gd = mib_dev->gdma_dev;
 	struct mana_ib_create_qp ucmd = {};
 	struct mana_obj_spec wq_spec = {};
 	struct mana_obj_spec cq_spec = {};
@@ -285,7 +285,7 @@ static int mana_ib_create_qp_raw(struct ib_qp *ibqp, struct ib_pd *ibpd,
 
 	err = ib_copy_from_udata(&ucmd, udata, min(sizeof(ucmd), udata->inlen));
 	if (err) {
-		ibdev_dbg(&mdev->ib_dev,
+		ibdev_dbg(&mib_dev->ib_dev,
 			  "Failed to copy from udata create qp-raw, %d\n", err);
 		return err;
 	}
@@ -296,14 +296,14 @@ static int mana_ib_create_qp_raw(struct ib_qp *ibqp, struct ib_pd *ibpd,
 		return -EINVAL;
 
 	if (attr->cap.max_send_wr > MAX_SEND_BUFFERS_PER_QUEUE) {
-		ibdev_dbg(&mdev->ib_dev,
+		ibdev_dbg(&mib_dev->ib_dev,
 			  "Requested max_send_wr %d exceeding limit\n",
 			  attr->cap.max_send_wr);
 		return -EINVAL;
 	}
 
 	if (attr->cap.max_send_sge > MAX_TX_WQE_SGL_ENTRIES) {
-		ibdev_dbg(&mdev->ib_dev,
+		ibdev_dbg(&mib_dev->ib_dev,
 			  "Requested max_send_sge %d exceeding limit\n",
 			  attr->cap.max_send_sge);
 		return -EINVAL;
@@ -311,38 +311,38 @@ static int mana_ib_create_qp_raw(struct ib_qp *ibqp, struct ib_pd *ibpd,
 
 	ndev = mc->ports[port - 1];
 	mpc = netdev_priv(ndev);
-	ibdev_dbg(&mdev->ib_dev, "port %u ndev %p mpc %p\n", port, ndev, mpc);
+	ibdev_dbg(&mib_dev->ib_dev, "port %u ndev %p mpc %p\n", port, ndev, mpc);
 
-	err = mana_ib_cfg_vport(mdev, port - 1, pd, mana_ucontext->doorbell);
+	err = mana_ib_cfg_vport(mib_dev, port - 1, pd, mana_ucontext->doorbell);
 	if (err)
 		return -ENODEV;
 
 	qp->port = port;
 
-	ibdev_dbg(&mdev->ib_dev, "ucmd sq_buf_addr 0x%llx port %u\n",
+	ibdev_dbg(&mib_dev->ib_dev, "ucmd sq_buf_addr 0x%llx port %u\n",
 		  ucmd.sq_buf_addr, ucmd.port);
 
 	umem = ib_umem_get(ibpd->device, ucmd.sq_buf_addr, ucmd.sq_buf_size,
 			   IB_ACCESS_LOCAL_WRITE);
 	if (IS_ERR(umem)) {
 		err = PTR_ERR(umem);
-		ibdev_dbg(&mdev->ib_dev,
+		ibdev_dbg(&mib_dev->ib_dev,
 			  "Failed to get umem for create qp-raw, err %d\n",
 			  err);
 		goto err_free_vport;
 	}
 	qp->sq_umem = umem;
 
-	err = mana_ib_gd_create_dma_region(mdev, qp->sq_umem,
+	err = mana_ib_gd_create_dma_region(mib_dev, qp->sq_umem,
 					   &qp->sq_gdma_region);
 	if (err) {
-		ibdev_dbg(&mdev->ib_dev,
+		ibdev_dbg(&mib_dev->ib_dev,
 			  "Failed to create dma region for create qp-raw, %d\n",
 			  err);
 		goto err_release_umem;
 	}
 
-	ibdev_dbg(&mdev->ib_dev,
+	ibdev_dbg(&mib_dev->ib_dev,
 		  "mana_ib_gd_create_dma_region ret %d gdma_region 0x%llx\n",
 		  err, qp->sq_gdma_region);
 
@@ -358,7 +358,7 @@ static int mana_ib_create_qp_raw(struct ib_qp *ibqp, struct ib_pd *ibpd,
 	err = mana_create_wq_obj(mpc, mpc->port_handle, GDMA_SQ, &wq_spec,
 				 &cq_spec, &qp->tx_object);
 	if (err) {
-		ibdev_dbg(&mdev->ib_dev,
+		ibdev_dbg(&mib_dev->ib_dev,
 			  "Failed to create wq for create raw-qp, err %d\n",
 			  err);
 		goto err_destroy_dma_region;
@@ -371,7 +371,7 @@ static int mana_ib_create_qp_raw(struct ib_qp *ibqp, struct ib_pd *ibpd,
 	qp->sq_id = wq_spec.queue_index;
 	send_cq->id = cq_spec.queue_index;
 
-	ibdev_dbg(&mdev->ib_dev,
+	ibdev_dbg(&mib_dev->ib_dev,
 		  "ret %d qp->tx_object 0x%llx sq id %llu cq id %llu\n", err,
 		  qp->tx_object, qp->sq_id, send_cq->id);
 
@@ -381,7 +381,7 @@ static int mana_ib_create_qp_raw(struct ib_qp *ibqp, struct ib_pd *ibpd,
 
 	err = ib_copy_to_udata(udata, &resp, sizeof(resp));
 	if (err) {
-		ibdev_dbg(&mdev->ib_dev,
+		ibdev_dbg(&mib_dev->ib_dev,
 			  "Failed copy udata for create qp-raw, %d\n",
 			  err);
 		goto err_destroy_wq_obj;
@@ -393,13 +393,13 @@ static int mana_ib_create_qp_raw(struct ib_qp *ibqp, struct ib_pd *ibpd,
 	mana_destroy_wq_obj(mpc, GDMA_SQ, qp->tx_object);
 
 err_destroy_dma_region:
-	mana_ib_gd_destroy_dma_region(mdev, qp->sq_gdma_region);
+	mana_ib_gd_destroy_dma_region(mib_dev, qp->sq_gdma_region);
 
 err_release_umem:
 	ib_umem_release(umem);
 
 err_free_vport:
-	mana_ib_uncfg_vport(mdev, pd, port - 1);
+	mana_ib_uncfg_vport(mib_dev, pd, port - 1);
 
 	return err;
 }
@@ -435,9 +435,9 @@ static int mana_ib_destroy_qp_rss(struct mana_ib_qp *qp,
 				  struct ib_rwq_ind_table *ind_tbl,
 				  struct ib_udata *udata)
 {
-	struct mana_ib_dev *mdev =
+	struct mana_ib_dev *mib_dev =
 		container_of(qp->ibqp.device, struct mana_ib_dev, ib_dev);
-	struct gdma_dev *gd = mdev->gdma_dev;
+	struct gdma_dev *gd = mib_dev->gdma_dev;
 	struct mana_port_context *mpc;
 	struct mana_context *mc;
 	struct net_device *ndev;
@@ -452,7 +452,7 @@ static int mana_ib_destroy_qp_rss(struct mana_ib_qp *qp,
 	for (i = 0; i < (1 << ind_tbl->log_ind_tbl_size); i++) {
 		ibwq = ind_tbl->ind_tbl[i];
 		wq = container_of(ibwq, struct mana_ib_wq, ibwq);
-		ibdev_dbg(&mdev->ib_dev, "destroying wq->rx_object %llu\n",
+		ibdev_dbg(&mib_dev->ib_dev, "destroying wq->rx_object %llu\n",
 			  wq->rx_object);
 		mana_destroy_wq_obj(mpc, GDMA_RQ, wq->rx_object);
 	}
@@ -462,9 +462,9 @@ static int mana_ib_destroy_qp_rss(struct mana_ib_qp *qp,
 
 static int mana_ib_destroy_qp_raw(struct mana_ib_qp *qp, struct ib_udata *udata)
 {
-	struct mana_ib_dev *mdev =
+	struct mana_ib_dev *mib_dev =
 		container_of(qp->ibqp.device, struct mana_ib_dev, ib_dev);
-	struct gdma_dev *gd = mdev->gdma_dev;
+	struct gdma_dev *gd = mib_dev->gdma_dev;
 	struct ib_pd *ibpd = qp->ibqp.pd;
 	struct mana_port_context *mpc;
 	struct mana_context *mc;
@@ -479,11 +479,11 @@ static int mana_ib_destroy_qp_raw(struct mana_ib_qp *qp, struct ib_udata *udata)
 	mana_destroy_wq_obj(mpc, GDMA_SQ, qp->tx_object);
 
 	if (qp->sq_umem) {
-		mana_ib_gd_destroy_dma_region(mdev, qp->sq_gdma_region);
+		mana_ib_gd_destroy_dma_region(mib_dev, qp->sq_gdma_region);
 		ib_umem_release(qp->sq_umem);
 	}
 
-	mana_ib_uncfg_vport(mdev, pd, qp->port - 1);
+	mana_ib_uncfg_vport(mib_dev, pd, qp->port - 1);
 
 	return 0;
 }
diff --git a/drivers/infiniband/hw/mana/wq.c b/drivers/infiniband/hw/mana/wq.c
index 372d361510e0..56bc2b8b6690 100644
--- a/drivers/infiniband/hw/mana/wq.c
+++ b/drivers/infiniband/hw/mana/wq.c
@@ -9,7 +9,7 @@ struct ib_wq *mana_ib_create_wq(struct ib_pd *pd,
 				struct ib_wq_init_attr *init_attr,
 				struct ib_udata *udata)
 {
-	struct mana_ib_dev *mdev =
+	struct mana_ib_dev *mib_dev =
 		container_of(pd->device, struct mana_ib_dev, ib_dev);
 	struct mana_ib_create_wq ucmd = {};
 	struct mana_ib_wq *wq;
@@ -21,7 +21,7 @@ struct ib_wq *mana_ib_create_wq(struct ib_pd *pd,
 
 	err = ib_copy_from_udata(&ucmd, udata, min(sizeof(ucmd), udata->inlen));
 	if (err) {
-		ibdev_dbg(&mdev->ib_dev,
+		ibdev_dbg(&mib_dev->ib_dev,
 			  "Failed to copy from udata for create wq, %d\n", err);
 		return ERR_PTR(err);
 	}
@@ -30,13 +30,14 @@ struct ib_wq *mana_ib_create_wq(struct ib_pd *pd,
 	if (!wq)
 		return ERR_PTR(-ENOMEM);
 
-	ibdev_dbg(&mdev->ib_dev, "ucmd wq_buf_addr 0x%llx\n", ucmd.wq_buf_addr);
+	ibdev_dbg(&mib_dev->ib_dev, "ucmd wq_buf_addr 0x%llx\n",
+		  ucmd.wq_buf_addr);
 
 	umem = ib_umem_get(pd->device, ucmd.wq_buf_addr, ucmd.wq_buf_size,
 			   IB_ACCESS_LOCAL_WRITE);
 	if (IS_ERR(umem)) {
 		err = PTR_ERR(umem);
-		ibdev_dbg(&mdev->ib_dev,
+		ibdev_dbg(&mib_dev->ib_dev,
 			  "Failed to get umem for create wq, err %d\n", err);
 		goto err_free_wq;
 	}
@@ -46,15 +47,15 @@ struct ib_wq *mana_ib_create_wq(struct ib_pd *pd,
 	wq->wq_buf_size = ucmd.wq_buf_size;
 	wq->rx_object = INVALID_MANA_HANDLE;
 
-	err = mana_ib_gd_create_dma_region(mdev, wq->umem, &wq->gdma_region);
+	err = mana_ib_gd_create_dma_region(mib_dev, wq->umem, &wq->gdma_region);
 	if (err) {
-		ibdev_dbg(&mdev->ib_dev,
+		ibdev_dbg(&mib_dev->ib_dev,
 			  "Failed to create dma region for create wq, %d\n",
 			  err);
 		goto err_release_umem;
 	}
 
-	ibdev_dbg(&mdev->ib_dev,
+	ibdev_dbg(&mib_dev->ib_dev,
 		  "mana_ib_gd_create_dma_region ret %d gdma_region 0x%llx\n",
 		  err, wq->gdma_region);
 
@@ -82,11 +83,11 @@ int mana_ib_destroy_wq(struct ib_wq *ibwq, struct ib_udata *udata)
 {
 	struct mana_ib_wq *wq = container_of(ibwq, struct mana_ib_wq, ibwq);
 	struct ib_device *ib_dev = ibwq->device;
-	struct mana_ib_dev *mdev;
+	struct mana_ib_dev *mib_dev;
 
-	mdev = container_of(ib_dev, struct mana_ib_dev, ib_dev);
+	mib_dev = container_of(ib_dev, struct mana_ib_dev, ib_dev);
 
-	mana_ib_gd_destroy_dma_region(mdev, wq->gdma_region);
+	mana_ib_gd_destroy_dma_region(mib_dev, wq->gdma_region);
 	ib_umem_release(wq->umem);
 
 	kfree(wq);
-- 
2.25.1


^ permalink raw reply related

* [Patch v3 2/4] RDMA/mana_ib : Register Mana IB  device with Management SW
From: sharmaajay @ 2023-07-26 20:08 UTC (permalink / raw)
  To: Jason Gunthorpe, Leon Romanovsky, Dexuan Cui, Wei Liu,
	David S. Miller, Eric Dumazet, Jakub Kicinski, Paolo Abeni
  Cc: linux-rdma, linux-hyperv, netdev, linux-kernel, Ajay Sharma
In-Reply-To: <1690402104-29518-1-git-send-email-sharmaajay@linuxonhyperv.com>

From: Ajay Sharma <sharmaajay@microsoft.com>

Each MANA InfiniBand device must be registered with the management
software in order to request services/resources.
Register the MANA IB device with the management software, which
later helps to obtain an adapter handle.

Signed-off-by: Ajay Sharma <sharmaajay@microsoft.com>
---
 drivers/infiniband/hw/mana/device.c           | 20 +++++--
 drivers/infiniband/hw/mana/main.c             | 58 ++++++-------------
 drivers/infiniband/hw/mana/mana_ib.h          |  1 +
 drivers/infiniband/hw/mana/mr.c               | 17 ++----
 drivers/infiniband/hw/mana/qp.c               | 10 ++--
 .../net/ethernet/microsoft/mana/gdma_main.c   |  5 ++
 include/net/mana/gdma.h                       |  3 +
 7 files changed, 55 insertions(+), 59 deletions(-)

diff --git a/drivers/infiniband/hw/mana/device.c b/drivers/infiniband/hw/mana/device.c
index 083f27246ba8..ea4c8c8fc10d 100644
--- a/drivers/infiniband/hw/mana/device.c
+++ b/drivers/infiniband/hw/mana/device.c
@@ -78,22 +78,34 @@ static int mana_ib_probe(struct auxiliary_device *adev,
 	mib_dev->ib_dev.num_comp_vectors = 1;
 	mib_dev->ib_dev.dev.parent = mdev->gdma_context->dev;
 
-	ret = ib_register_device(&mib_dev->ib_dev, "mana_%d",
-				 mdev->gdma_context->dev);
+	ret = mana_gd_register_device(&mib_dev->gc->mana_ib);
 	if (ret) {
-		ib_dealloc_device(&mib_dev->ib_dev);
-		return ret;
+		ibdev_err(&mib_dev->ib_dev, "Failed to register device, ret %d",
+			  ret);
+		goto free_ib_device;
 	}
 
+	ret = ib_register_device(&mib_dev->ib_dev, "mana_%d",
+				 mdev->gdma_context->dev);
+	if (ret)
+		goto deregister_device;
+
 	dev_set_drvdata(&adev->dev, mib_dev);
 
 	return 0;
+
+deregister_device:
+	mana_gd_deregister_device(&mib_dev->gc->mana_ib);
+free_ib_device:
+	ib_dealloc_device(&mib_dev->ib_dev);
+	return ret;
 }
 
 static void mana_ib_remove(struct auxiliary_device *adev)
 {
 	struct mana_ib_dev *mib_dev = dev_get_drvdata(&adev->dev);
 
+	mana_gd_deregister_device(&mib_dev->gc->mana_ib);
 	ib_unregister_device(&mib_dev->ib_dev);
 	ib_dealloc_device(&mib_dev->ib_dev);
 }
diff --git a/drivers/infiniband/hw/mana/main.c b/drivers/infiniband/hw/mana/main.c
index 189e774cdab6..2c4e3c496644 100644
--- a/drivers/infiniband/hw/mana/main.c
+++ b/drivers/infiniband/hw/mana/main.c
@@ -8,7 +8,7 @@
 void mana_ib_uncfg_vport(struct mana_ib_dev *mib_dev, struct mana_ib_pd *pd,
 			 u32 port)
 {
-	struct gdma_dev *gd = mib_dev->gdma_dev;
+	struct gdma_dev *gd = &mib_dev->gc->mana;
 	struct mana_port_context *mpc;
 	struct net_device *ndev;
 	struct mana_context *mc;
@@ -32,7 +32,7 @@ int mana_ib_cfg_vport(struct mana_ib_dev *mib_dev, u32 port,
 		      struct mana_ib_pd *pd,
 		      u32 doorbell_id)
 {
-	struct gdma_dev *mdev = mib_dev->gdma_dev;
+	struct gdma_dev *mdev = &mib_dev->gc->mana;
 	struct mana_port_context *mpc;
 	struct mana_context *mc;
 	struct net_device *ndev;
@@ -81,17 +81,16 @@ int mana_ib_alloc_pd(struct ib_pd *ibpd, struct ib_udata *udata)
 	struct gdma_create_pd_req req = {};
 	enum gdma_pd_flags flags = 0;
 	struct mana_ib_dev *mib_dev;
-	struct gdma_dev *mdev;
+
 	int err;
 
 	mib_dev = container_of(ibdev, struct mana_ib_dev, ib_dev);
-	mdev = mib_dev->gdma_dev;
 
 	mana_gd_init_req_hdr(&req.hdr, GDMA_CREATE_PD, sizeof(req),
 			     sizeof(resp));
 
 	req.flags = flags;
-	err = mana_gd_send_request(mdev->gdma_context, sizeof(req), &req,
+	err = mana_gd_send_request(mib_dev->gc, sizeof(req), &req,
 				   sizeof(resp), &resp);
 
 	if (err || resp.hdr.status) {
@@ -121,17 +120,15 @@ int mana_ib_dealloc_pd(struct ib_pd *ibpd, struct ib_udata *udata)
 	struct gdma_destory_pd_resp resp = {};
 	struct gdma_destroy_pd_req req = {};
 	struct mana_ib_dev *mib_dev;
-	struct gdma_dev *mdev;
 	int err;
 
 	mib_dev = container_of(ibdev, struct mana_ib_dev, ib_dev);
-	mdev = mib_dev->gdma_dev;
 
 	mana_gd_init_req_hdr(&req.hdr, GDMA_DESTROY_PD, sizeof(req),
 			     sizeof(resp));
 
 	req.pd_handle = pd->pd_handle;
-	err = mana_gd_send_request(mdev->gdma_context, sizeof(req), &req,
+	err = mana_gd_send_request(mib_dev->gc, sizeof(req), &req,
 				   sizeof(resp), &resp);
 
 	if (err || resp.hdr.status) {
@@ -207,17 +204,13 @@ int mana_ib_alloc_ucontext(struct ib_ucontext *ibcontext,
 		container_of(ibcontext, struct mana_ib_ucontext, ibucontext);
 	struct ib_device *ibdev = ibcontext->device;
 	struct mana_ib_dev *mib_dev;
-	struct gdma_context *gc;
-	struct gdma_dev *dev;
 	int doorbell_page;
 	int ret;
 
 	mib_dev = container_of(ibdev, struct mana_ib_dev, ib_dev);
-	dev = mib_dev->gdma_dev;
-	gc = dev->gdma_context;
 
 	/* Allocate a doorbell page index */
-	ret = mana_gd_allocate_doorbell_page(gc, &doorbell_page);
+	ret = mana_gd_allocate_doorbell_page(mib_dev->gc, &doorbell_page);
 	if (ret) {
 		ibdev_dbg(ibdev, "Failed to allocate doorbell page %d\n", ret);
 		return ret;
@@ -236,20 +229,17 @@ void mana_ib_dealloc_ucontext(struct ib_ucontext *ibcontext)
 		container_of(ibcontext, struct mana_ib_ucontext, ibucontext);
 	struct ib_device *ibdev = ibcontext->device;
 	struct mana_ib_dev *mib_dev;
-	struct gdma_context *gc;
 	int ret;
 
 	mib_dev = container_of(ibdev, struct mana_ib_dev, ib_dev);
-	gc = mib_dev->gdma_dev->gdma_context;
 
-	ret = mana_gd_destroy_doorbell_page(gc, mana_ucontext->doorbell);
+	ret = mana_gd_destroy_doorbell_page(mib_dev->gc, mana_ucontext->doorbell);
 	if (ret)
 		ibdev_dbg(ibdev, "Failed to destroy doorbell page %d\n", ret);
 }
 
 static int
 mana_ib_gd_first_dma_region(struct mana_ib_dev *mib_dev,
-			    struct gdma_context *gc,
 			    struct gdma_create_dma_region_req *create_req,
 			    size_t num_pages, mana_handle_t *gdma_region,
 			    u32 expected_status)
@@ -262,7 +252,7 @@ mana_ib_gd_first_dma_region(struct mana_ib_dev *mib_dev,
 		struct_size(create_req, page_addr_list, num_pages);
 	create_req->page_addr_list_len = num_pages;
 
-	err = mana_gd_send_request(gc, create_req_msg_size, create_req,
+	err = mana_gd_send_request(mib_dev->gc, create_req_msg_size, create_req,
 				   sizeof(create_resp), &create_resp);
 	if (err || create_resp.hdr.status != expected_status) {
 		ibdev_dbg(&mib_dev->ib_dev,
@@ -282,7 +272,7 @@ mana_ib_gd_first_dma_region(struct mana_ib_dev *mib_dev,
 }
 
 static int
-mana_ib_gd_add_dma_region(struct mana_ib_dev *mib_dev, struct gdma_context *gc,
+mana_ib_gd_add_dma_region(struct mana_ib_dev *mib_dev,
 			  struct gdma_dma_region_add_pages_req *add_req,
 			  unsigned int num_pages, u32 expected_status)
 {
@@ -295,7 +285,7 @@ mana_ib_gd_add_dma_region(struct mana_ib_dev *mib_dev, struct gdma_context *gc,
 			     add_req_msg_size, sizeof(add_resp));
 	add_req->page_addr_list_len = num_pages;
 
-	err = mana_gd_send_request(gc, add_req_msg_size, add_req,
+	err = mana_gd_send_request(mib_dev->gc, add_req_msg_size, add_req,
 				   sizeof(add_resp), &add_resp);
 	if (err || add_resp.hdr.status != expected_status) {
 		ibdev_dbg(&mib_dev->ib_dev,
@@ -323,18 +313,14 @@ int mana_ib_gd_create_dma_region(struct mana_ib_dev *mib_dev,
 	struct ib_block_iter biter;
 	size_t max_pgs_add_cmd = 0;
 	size_t max_pgs_create_cmd;
-	struct gdma_context *gc;
 	size_t num_pages_total;
-	struct gdma_dev *mdev;
 	unsigned long page_sz;
 	unsigned int tail = 0;
 	u64 *page_addr_list;
 	void *request_buf;
 	int err;
 
-	mdev = mib_dev->gdma_dev;
-	gc = mdev->gdma_context;
-	hwc = gc->hwc.driver_data;
+	hwc = mib_dev->gc->hwc.driver_data;
 
 	/* Hardware requires dma region to align to chosen page size */
 	page_sz = ib_umem_find_best_pgsz(umem, PAGE_SZ_BM, 0);
@@ -388,7 +374,7 @@ int mana_ib_gd_create_dma_region(struct mana_ib_dev *mib_dev,
 
 		if (!num_pages_processed) {
 			/* First create message */
-			err = mana_ib_gd_first_dma_region(mib_dev, gc, create_req,
+			err = mana_ib_gd_first_dma_region(mib_dev, create_req,
 							  tail, gdma_region,
 							  expected_status);
 			if (err)
@@ -403,7 +389,7 @@ int mana_ib_gd_create_dma_region(struct mana_ib_dev *mib_dev,
 			page_addr_list = add_req->page_addr_list;
 		} else {
 			/* Subsequent create messages */
-			err = mana_ib_gd_add_dma_region(mib_dev, gc, add_req, tail,
+			err = mana_ib_gd_add_dma_region(mib_dev, add_req, tail,
 							expected_status);
 			if (err)
 				break;
@@ -429,13 +415,9 @@ int mana_ib_gd_create_dma_region(struct mana_ib_dev *mib_dev,
 
 int mana_ib_gd_destroy_dma_region(struct mana_ib_dev *mib_dev, u64 gdma_region)
 {
-	struct gdma_dev *mdev = mib_dev->gdma_dev;
-	struct gdma_context *gc;
-
-	gc = mdev->gdma_context;
 	ibdev_dbg(&mib_dev->ib_dev, "destroy dma region 0x%llx\n", gdma_region);
 
-	return mana_gd_destroy_dma_region(gc, gdma_region);
+	return mana_gd_destroy_dma_region(mib_dev->gc, gdma_region);
 }
 
 int mana_ib_mmap(struct ib_ucontext *ibcontext, struct vm_area_struct *vma)
@@ -444,13 +426,11 @@ int mana_ib_mmap(struct ib_ucontext *ibcontext, struct vm_area_struct *vma)
 		container_of(ibcontext, struct mana_ib_ucontext, ibucontext);
 	struct ib_device *ibdev = ibcontext->device;
 	struct mana_ib_dev *mib_dev;
-	struct gdma_context *gc;
 	phys_addr_t pfn;
 	pgprot_t prot;
 	int ret;
 
 	mib_dev = container_of(ibdev, struct mana_ib_dev, ib_dev);
-	gc = mib_dev->gdma_dev->gdma_context;
 
 	if (vma->vm_pgoff != 0) {
 		ibdev_dbg(ibdev, "Unexpected vm_pgoff %lu\n", vma->vm_pgoff);
@@ -458,18 +438,18 @@ int mana_ib_mmap(struct ib_ucontext *ibcontext, struct vm_area_struct *vma)
 	}
 
 	/* Map to the page indexed by ucontext->doorbell */
-	pfn = (gc->phys_db_page_base +
-	       gc->db_page_size * mana_ucontext->doorbell) >>
+	pfn = (mib_dev->gc->phys_db_page_base +
+	       mib_dev->gc->db_page_size * mana_ucontext->doorbell) >>
 	      PAGE_SHIFT;
 	prot = pgprot_writecombine(vma->vm_page_prot);
 
-	ret = rdma_user_mmap_io(ibcontext, vma, pfn, gc->db_page_size, prot,
-				NULL);
+	ret = rdma_user_mmap_io(ibcontext, vma, pfn, mib_dev->gc->db_page_size,
+				prot, NULL);
 	if (ret)
 		ibdev_dbg(ibdev, "can't rdma_user_mmap_io ret %d\n", ret);
 	else
 		ibdev_dbg(ibdev, "mapped I/O pfn 0x%llx page_size %u, ret %d\n",
-			  pfn, gc->db_page_size, ret);
+			  pfn, mib_dev->gc->db_page_size, ret);
 
 	return ret;
 }
diff --git a/drivers/infiniband/hw/mana/mana_ib.h b/drivers/infiniband/hw/mana/mana_ib.h
index ee4efd0af278..3a2ba6b96f15 100644
--- a/drivers/infiniband/hw/mana/mana_ib.h
+++ b/drivers/infiniband/hw/mana/mana_ib.h
@@ -30,6 +30,7 @@
 struct mana_ib_dev {
 	struct ib_device ib_dev;
 	struct gdma_dev *gdma_dev;
+	struct gdma_context *gc;
 };
 
 struct mana_ib_wq {
diff --git a/drivers/infiniband/hw/mana/mr.c b/drivers/infiniband/hw/mana/mr.c
index f6a53906204d..3106d1bce837 100644
--- a/drivers/infiniband/hw/mana/mr.c
+++ b/drivers/infiniband/hw/mana/mr.c
@@ -29,13 +29,10 @@ static int mana_ib_gd_create_mr(struct mana_ib_dev *mib_dev,
 				struct mana_ib_mr *mr,
 				struct gdma_create_mr_params *mr_params)
 {
-	struct gdma_dev *mdev = mib_dev->gdma_dev;
 	struct gdma_create_mr_response resp = {};
 	struct gdma_create_mr_request req = {};
-	struct gdma_context *gc;
 	int err;
 
-	gc = mdev->gdma_context;
 
 	mana_gd_init_req_hdr(&req.hdr, GDMA_CREATE_MR, sizeof(req),
 			     sizeof(resp));
@@ -56,7 +53,8 @@ static int mana_ib_gd_create_mr(struct mana_ib_dev *mib_dev,
 		return -EINVAL;
 	}
 
-	err = mana_gd_send_request(gc, sizeof(req), &req, sizeof(resp), &resp);
+	err = mana_gd_send_request(mib_dev->gc, sizeof(req), &req,
+				   sizeof(resp), &resp);
 
 	if (err || resp.hdr.status) {
 		ibdev_dbg(&mib_dev->ib_dev, "Failed to create mr %d, %u", err,
@@ -77,22 +75,19 @@ static int mana_ib_gd_create_mr(struct mana_ib_dev *mib_dev,
 static int mana_ib_gd_destroy_mr(struct mana_ib_dev *mib_dev, u64 mr_handle)
 {
 	struct gdma_destroy_mr_response resp = {};
-	struct gdma_dev *mdev = mib_dev->gdma_dev;
 	struct gdma_destroy_mr_request req = {};
-	struct gdma_context *gc;
 	int err;
 
-	gc = mdev->gdma_context;
-
 	mana_gd_init_req_hdr(&req.hdr, GDMA_DESTROY_MR, sizeof(req),
 			     sizeof(resp));
 
 	req.mr_handle = mr_handle;
 
-	err = mana_gd_send_request(gc, sizeof(req), &req, sizeof(resp), &resp);
+	err = mana_gd_send_request(mib_dev->gc, sizeof(req), &req,
+				   sizeof(resp), &resp);
 	if (err || resp.hdr.status) {
-		dev_err(gc->dev, "Failed to destroy MR: %d, 0x%x\n", err,
-			resp.hdr.status);
+		dev_err(mib_dev->gc->dev, "Failed to destroy MR: %d, 0x%x\n",
+			err, resp.hdr.status);
 		if (!err)
 			err = -EPROTO;
 		return err;
diff --git a/drivers/infiniband/hw/mana/qp.c b/drivers/infiniband/hw/mana/qp.c
index 2e3a57123ed7..874cfd794825 100644
--- a/drivers/infiniband/hw/mana/qp.c
+++ b/drivers/infiniband/hw/mana/qp.c
@@ -21,7 +21,7 @@ static int mana_ib_cfg_vport_steering(struct mana_ib_dev *mib_dev,
 	u32 req_buf_size;
 	int i, err;
 
-	mdev = mib_dev->gdma_dev;
+	mdev = &mib_dev->gc->mana;
 	gc = mdev->gdma_context;
 
 	req_buf_size =
@@ -102,7 +102,7 @@ static int mana_ib_create_qp_rss(struct ib_qp *ibqp, struct ib_pd *pd,
 	struct ib_rwq_ind_table *ind_tbl = attr->rwq_ind_tbl;
 	struct mana_ib_create_qp_rss_resp resp = {};
 	struct mana_ib_create_qp_rss ucmd = {};
-	struct gdma_dev *gd = mib_dev->gdma_dev;
+	struct gdma_dev *gd = &mib_dev->gc->mana;
 	mana_handle_t *mana_ind_table;
 	struct mana_port_context *mpc;
 	struct mana_context *mc;
@@ -267,7 +267,7 @@ static int mana_ib_create_qp_raw(struct ib_qp *ibqp, struct ib_pd *ibpd,
 		rdma_udata_to_drv_context(udata, struct mana_ib_ucontext,
 					  ibucontext);
 	struct mana_ib_create_qp_resp resp = {};
-	struct gdma_dev *gd = mib_dev->gdma_dev;
+	struct gdma_dev *gd = &mib_dev->gc->mana;
 	struct mana_ib_create_qp ucmd = {};
 	struct mana_obj_spec wq_spec = {};
 	struct mana_obj_spec cq_spec = {};
@@ -437,7 +437,7 @@ static int mana_ib_destroy_qp_rss(struct mana_ib_qp *qp,
 {
 	struct mana_ib_dev *mib_dev =
 		container_of(qp->ibqp.device, struct mana_ib_dev, ib_dev);
-	struct gdma_dev *gd = mib_dev->gdma_dev;
+	struct gdma_dev *gd = &mib_dev->gc->mana;
 	struct mana_port_context *mpc;
 	struct mana_context *mc;
 	struct net_device *ndev;
@@ -464,7 +464,7 @@ static int mana_ib_destroy_qp_raw(struct mana_ib_qp *qp, struct ib_udata *udata)
 {
 	struct mana_ib_dev *mib_dev =
 		container_of(qp->ibqp.device, struct mana_ib_dev, ib_dev);
-	struct gdma_dev *gd = mib_dev->gdma_dev;
+	struct gdma_dev *gd = &mib_dev->gc->mana;
 	struct ib_pd *ibpd = qp->ibqp.pd;
 	struct mana_port_context *mpc;
 	struct mana_context *mc;
diff --git a/drivers/net/ethernet/microsoft/mana/gdma_main.c b/drivers/net/ethernet/microsoft/mana/gdma_main.c
index 8f3f78b68592..9fa7a2d6c2b2 100644
--- a/drivers/net/ethernet/microsoft/mana/gdma_main.c
+++ b/drivers/net/ethernet/microsoft/mana/gdma_main.c
@@ -139,6 +139,9 @@ static int mana_gd_detect_devices(struct pci_dev *pdev)
 		if (dev_type == GDMA_DEVICE_MANA) {
 			gc->mana.gdma_context = gc;
 			gc->mana.dev_id = dev;
+		} else if (dev_type == GDMA_DEVICE_MANA_IB) {
+			gc->mana_ib.dev_id = dev;
+			gc->mana_ib.gdma_context = gc;
 		}
 	}
 
@@ -940,6 +943,7 @@ int mana_gd_register_device(struct gdma_dev *gd)
 
 	return 0;
 }
+EXPORT_SYMBOL(mana_gd_register_device);
 
 int mana_gd_deregister_device(struct gdma_dev *gd)
 {
@@ -970,6 +974,7 @@ int mana_gd_deregister_device(struct gdma_dev *gd)
 
 	return err;
 }
+EXPORT_SYMBOL(mana_gd_deregister_device);
 
 u32 mana_gd_wq_avail_space(struct gdma_queue *wq)
 {
diff --git a/include/net/mana/gdma.h b/include/net/mana/gdma.h
index 96c120160f15..e2b212dd722b 100644
--- a/include/net/mana/gdma.h
+++ b/include/net/mana/gdma.h
@@ -63,6 +63,7 @@ enum {
 	GDMA_DEVICE_NONE	= 0,
 	GDMA_DEVICE_HWC		= 1,
 	GDMA_DEVICE_MANA	= 2,
+	GDMA_DEVICE_MANA_IB	= 3,
 };
 
 struct gdma_resource {
@@ -384,6 +385,8 @@ struct gdma_context {
 
 	/* Azure network adapter */
 	struct gdma_dev		mana;
+	/* rdma device */
+	struct gdma_dev		mana_ib;
 };
 
 #define MAX_NUM_GDMA_DEVICES	4
-- 
2.25.1


^ permalink raw reply related


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox