AMD-GFX Archive on lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH 1/3] drm/amdgpu: Refactor sriov xgmi topology filling to common code
@ 2025-10-08 13:32 will.aitken
  2025-10-08 13:32 ` [PATCH 2/3] drm/amdgpu: Update headers for sriov xgmi ext peer link support feature flag will.aitken
                   ` (2 more replies)
  0 siblings, 3 replies; 6+ messages in thread
From: will.aitken @ 2025-10-08 13:32 UTC (permalink / raw)
  To: amd-gfx; +Cc: victor.skvortsov, lijo.lazar, Will Aitken

From: Will Aitken <wiaitken@amd.com>

amdgpu_xgmi_fill_topology_info and psp_xgmi_reflect_topology_info
perform the same logic of copying topology info of one node to every
other node in the hive. Instead of having two functions that purport to
do the same thing, this refactoring moves the logic of the fill function
to the reflect function and adds reflecting port number info as well for
complete functionality.

Signed-off-by: Will Aitken <wiaitken@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c  | 19 ++++++++++++-----
 drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c | 27 ------------------------
 2 files changed, 14 insertions(+), 32 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
index 6208a49c9f23..82500ade240d 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
@@ -1539,6 +1539,7 @@ static void psp_xgmi_reflect_topology_info(struct psp_context *psp,
 	uint64_t src_node_id = psp->adev->gmc.xgmi.node_id;
 	uint64_t dst_node_id = node_info.node_id;
 	uint8_t dst_num_hops = node_info.num_hops;
+	uint8_t dst_is_sharing_enabled = node_info.is_sharing_enabled;
 	uint8_t dst_num_links = node_info.num_links;
 
 	hive = amdgpu_get_xgmi_hive(psp->adev);
@@ -1558,13 +1559,20 @@ static void psp_xgmi_reflect_topology_info(struct psp_context *psp,
 				continue;
 
 			mirror_top_info->nodes[j].num_hops = dst_num_hops;
-			/*
-			 * prevent 0 num_links value re-reflection since reflection
+			mirror_top_info->nodes[j].is_sharing_enabled = dst_is_sharing_enabled;
+			/* prevent 0 num_links value re-reflection since reflection
 			 * criteria is based on num_hops (direct or indirect).
-			 *
 			 */
-			if (dst_num_links)
+			if (dst_num_links) {
 				mirror_top_info->nodes[j].num_links = dst_num_links;
+				/* swap src and dst due to frame of reference */
+				for (int k = 0; k < dst_num_links; k++) {
+					mirror_top_info->nodes[j].port_num[k].src_xgmi_port_num =
+						node_info.port_num[k].dst_xgmi_port_num;
+					mirror_top_info->nodes[j].port_num[k].dst_xgmi_port_num =
+						node_info.port_num[k].src_xgmi_port_num;
+				}
+			}
 
 			break;
 		}
@@ -1639,7 +1647,8 @@ int psp_xgmi_get_topology_info(struct psp_context *psp,
 			amdgpu_ip_version(psp->adev, MP0_HWIP, 0) ==
 				IP_VERSION(13, 0, 6) ||
 			amdgpu_ip_version(psp->adev, MP0_HWIP, 0) ==
-				IP_VERSION(13, 0, 14);
+				IP_VERSION(13, 0, 14) ||
+			amdgpu_sriov_vf(psp->adev);
 		bool ta_port_num_support = amdgpu_sriov_vf(psp->adev) ? 0 :
 				psp->xgmi_context.xgmi_ta_caps & EXTEND_PEER_LINK_INFO_CMD_FLAG;
 
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
index 1ede308a7c67..2e70b84a8c3b 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
@@ -958,28 +958,6 @@ static int amdgpu_xgmi_initialize_hive_get_data_partition(struct amdgpu_hive_inf
 	return 0;
 }
 
-static void amdgpu_xgmi_fill_topology_info(struct amdgpu_device *adev,
-	struct amdgpu_device *peer_adev)
-{
-	struct psp_xgmi_topology_info *top_info = &adev->psp.xgmi_context.top_info;
-	struct psp_xgmi_topology_info *peer_info = &peer_adev->psp.xgmi_context.top_info;
-
-	for (int i = 0; i < peer_info->num_nodes; i++) {
-		if (peer_info->nodes[i].node_id == adev->gmc.xgmi.node_id) {
-			for (int j = 0; j < top_info->num_nodes; j++) {
-				if (top_info->nodes[j].node_id == peer_adev->gmc.xgmi.node_id) {
-					peer_info->nodes[i].num_hops = top_info->nodes[j].num_hops;
-					peer_info->nodes[i].is_sharing_enabled =
-							top_info->nodes[j].is_sharing_enabled;
-					peer_info->nodes[i].num_links =
-							top_info->nodes[j].num_links;
-					return;
-				}
-			}
-		}
-	}
-}
-
 int amdgpu_xgmi_add_device(struct amdgpu_device *adev)
 {
 	struct psp_xgmi_topology_info *top_info;
@@ -1065,11 +1043,6 @@ int amdgpu_xgmi_add_device(struct amdgpu_device *adev)
 				/* To do: continue with some node failed or disable the whole hive*/
 				goto exit_unlock;
 			}
-
-			/* fill the topology info for peers instead of getting from PSP */
-			list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) {
-				amdgpu_xgmi_fill_topology_info(adev, tmp_adev);
-			}
 		} else {
 			/* get latest topology info for each device from psp */
 			list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) {
-- 
2.43.0


^ permalink raw reply related	[flat|nested] 6+ messages in thread

* [PATCH 2/3] drm/amdgpu: Update headers for sriov xgmi ext peer link support feature flag
  2025-10-08 13:32 [PATCH 1/3] drm/amdgpu: Refactor sriov xgmi topology filling to common code will.aitken
@ 2025-10-08 13:32 ` will.aitken
  2025-10-08 13:32 ` [PATCH 3/3] drm/amdgpu: Enable xgmi extended peer links for sriov guest will.aitken
  2025-10-28 13:16 ` [PATCH 1/3] drm/amdgpu: Refactor sriov xgmi topology filling to common code Lazar, Lijo
  2 siblings, 0 replies; 6+ messages in thread
From: will.aitken @ 2025-10-08 13:32 UTC (permalink / raw)
  To: amd-gfx; +Cc: victor.skvortsov, lijo.lazar, Will Aitken

From: Will Aitken <wiaitken@amd.com>

Adds new sriov msg flag to match host, feature flag in the amdgim
enum, and a wrapper macro to check it.

Signed-off-by: Will Aitken <wiaitken@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h    | 4 ++++
 drivers/gpu/drm/amd/amdgpu/amdgv_sriovmsg.h | 3 ++-
 2 files changed, 6 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
index d1172c8e58c4..73b7ead88aa3 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
@@ -144,6 +144,7 @@ enum AMDGIM_FEATURE_FLAG {
 	AMDGIM_FEATURE_RAS_CAPS = (1 << 9),
 	AMDGIM_FEATURE_RAS_TELEMETRY = (1 << 10),
 	AMDGIM_FEATURE_RAS_CPER = (1 << 11),
+	AMDGIM_FEATURE_XGMI_TA_EXT_PEER_LINK = (1 << 12),
 };
 
 enum AMDGIM_REG_ACCESS_FLAG {
@@ -378,6 +379,9 @@ struct amdgpu_video_codec_info;
 #define amdgpu_sriov_ras_cper_en(adev) \
 ((adev)->virt.gim_feature & AMDGIM_FEATURE_RAS_CPER)
 
+#define amdgpu_sriov_xgmi_ta_ext_peer_link_en(adev) \
+((adev)->virt.gim_feature & AMDGIM_FEATURE_XGMI_TA_EXT_PEER_LINK)
+
 static inline bool is_virtual_machine(void)
 {
 #if defined(CONFIG_X86)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgv_sriovmsg.h b/drivers/gpu/drm/amd/amdgpu/amdgv_sriovmsg.h
index 3a79ed7d8031..652d809bba06 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgv_sriovmsg.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgv_sriovmsg.h
@@ -102,7 +102,8 @@ union amd_sriov_msg_feature_flags {
 		uint32_t ras_caps		: 1;
 		uint32_t ras_telemetry		: 1;
 		uint32_t ras_cper		: 1;
-		uint32_t reserved		: 20;
+		uint32_t xgmi_ta_ext_peer_link	: 1;
+		uint32_t reserved		: 19;
 	} flags;
 	uint32_t all;
 };
-- 
2.43.0


^ permalink raw reply related	[flat|nested] 6+ messages in thread

* [PATCH 3/3] drm/amdgpu: Enable xgmi extended peer links for sriov guest
  2025-10-08 13:32 [PATCH 1/3] drm/amdgpu: Refactor sriov xgmi topology filling to common code will.aitken
  2025-10-08 13:32 ` [PATCH 2/3] drm/amdgpu: Update headers for sriov xgmi ext peer link support feature flag will.aitken
@ 2025-10-08 13:32 ` will.aitken
  2025-10-28 13:16 ` [PATCH 1/3] drm/amdgpu: Refactor sriov xgmi topology filling to common code Lazar, Lijo
  2 siblings, 0 replies; 6+ messages in thread
From: will.aitken @ 2025-10-08 13:32 UTC (permalink / raw)
  To: amd-gfx; +Cc: victor.skvortsov, lijo.lazar, Will Aitken

From: Will Aitken <wiaitken@amd.com>

The amd-smi tool relies on extended peer link information to report xgmi
link metrics. The necessary xgmi ta command, GET_EXTEND_PEER_LINKS, has
been enabled in the host driver and this change is necessary for the
guest to make use of it. To handle the case where the host driver does
not have the latest xgmi ta, the guest driver checks for guest support
through a pf2vf feature flag before invoking psp.

Signed-off-by: Will Aitken <wiaitken@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
index 82500ade240d..b69e376372d4 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
@@ -1649,8 +1649,8 @@ int psp_xgmi_get_topology_info(struct psp_context *psp,
 			amdgpu_ip_version(psp->adev, MP0_HWIP, 0) ==
 				IP_VERSION(13, 0, 14) ||
 			amdgpu_sriov_vf(psp->adev);
-		bool ta_port_num_support = amdgpu_sriov_vf(psp->adev) ? 0 :
-				psp->xgmi_context.xgmi_ta_caps & EXTEND_PEER_LINK_INFO_CMD_FLAG;
+		bool ta_port_num_support = psp->xgmi_context.xgmi_ta_caps & EXTEND_PEER_LINK_INFO_CMD_FLAG ||
+			amdgpu_sriov_xgmi_ta_ext_peer_link_en(psp->adev);
 
 		/* popluate the shared output buffer rather than the cmd input buffer
 		 * with node_ids as the input for GET_PEER_LINKS command execution.
-- 
2.43.0


^ permalink raw reply related	[flat|nested] 6+ messages in thread

* Re: [PATCH 1/3] drm/amdgpu: Refactor sriov xgmi topology filling to common code
  2025-10-08 13:32 [PATCH 1/3] drm/amdgpu: Refactor sriov xgmi topology filling to common code will.aitken
  2025-10-08 13:32 ` [PATCH 2/3] drm/amdgpu: Update headers for sriov xgmi ext peer link support feature flag will.aitken
  2025-10-08 13:32 ` [PATCH 3/3] drm/amdgpu: Enable xgmi extended peer links for sriov guest will.aitken
@ 2025-10-28 13:16 ` Lazar, Lijo
       [not found]   ` <MW4PR12MB71918A5EFF20C239743AC346F1FAA@MW4PR12MB7191.namprd12.prod.outlook.com>
  2 siblings, 1 reply; 6+ messages in thread
From: Lazar, Lijo @ 2025-10-28 13:16 UTC (permalink / raw)
  To: will.aitken, amd-gfx; +Cc: victor.skvortsov, Will Aitken



On 10/8/2025 7:02 PM, will.aitken@amd.com wrote:
> From: Will Aitken <wiaitken@amd.com>
> 
> amdgpu_xgmi_fill_topology_info and psp_xgmi_reflect_topology_info
> perform the same logic of copying topology info of one node to every
> other node in the hive. Instead of having two functions that purport to
> do the same thing, this refactoring moves the logic of the fill function
> to the reflect function and adds reflecting port number info as well for
> complete functionality.
> 
> Signed-off-by: Will Aitken <wiaitken@amd.com>
> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c  | 19 ++++++++++++-----
>   drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c | 27 ------------------------
>   2 files changed, 14 insertions(+), 32 deletions(-)
> 
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
> index 6208a49c9f23..82500ade240d 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
> @@ -1539,6 +1539,7 @@ static void psp_xgmi_reflect_topology_info(struct psp_context *psp,
>   	uint64_t src_node_id = psp->adev->gmc.xgmi.node_id;
>   	uint64_t dst_node_id = node_info.node_id;
>   	uint8_t dst_num_hops = node_info.num_hops;
> +	uint8_t dst_is_sharing_enabled = node_info.is_sharing_enabled;
>   	uint8_t dst_num_links = node_info.num_links;
>   
>   	hive = amdgpu_get_xgmi_hive(psp->adev);
> @@ -1558,13 +1559,20 @@ static void psp_xgmi_reflect_topology_info(struct psp_context *psp,
>   				continue;
>   
>   			mirror_top_info->nodes[j].num_hops = dst_num_hops;
> -			/*
> -			 * prevent 0 num_links value re-reflection since reflection
> +			mirror_top_info->nodes[j].is_sharing_enabled = dst_is_sharing_enabled;
> +			/* prevent 0 num_links value re-reflection since reflection
>   			 * criteria is based on num_hops (direct or indirect).
> -			 *
>   			 */
> -			if (dst_num_links)
> +			if (dst_num_links) {

Patches look fine. One clarification - do you need to fill this 
information if sharing is disabled?

Thanks,
Lijo

>   				mirror_top_info->nodes[j].num_links = dst_num_links;
> +				/* swap src and dst due to frame of reference */
> +				for (int k = 0; k < dst_num_links; k++) {
> +					mirror_top_info->nodes[j].port_num[k].src_xgmi_port_num =
> +						node_info.port_num[k].dst_xgmi_port_num;
> +					mirror_top_info->nodes[j].port_num[k].dst_xgmi_port_num =
> +						node_info.port_num[k].src_xgmi_port_num;
> +				}
> +			}
>   
>   			break;
>   		}
> @@ -1639,7 +1647,8 @@ int psp_xgmi_get_topology_info(struct psp_context *psp,
>   			amdgpu_ip_version(psp->adev, MP0_HWIP, 0) ==
>   				IP_VERSION(13, 0, 6) ||
>   			amdgpu_ip_version(psp->adev, MP0_HWIP, 0) ==
> -				IP_VERSION(13, 0, 14);
> +				IP_VERSION(13, 0, 14) ||
> +			amdgpu_sriov_vf(psp->adev);
>   		bool ta_port_num_support = amdgpu_sriov_vf(psp->adev) ? 0 :
>   				psp->xgmi_context.xgmi_ta_caps & EXTEND_PEER_LINK_INFO_CMD_FLAG;
>   
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
> index 1ede308a7c67..2e70b84a8c3b 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
> @@ -958,28 +958,6 @@ static int amdgpu_xgmi_initialize_hive_get_data_partition(struct amdgpu_hive_inf
>   	return 0;
>   }
>   
> -static void amdgpu_xgmi_fill_topology_info(struct amdgpu_device *adev,
> -	struct amdgpu_device *peer_adev)
> -{
> -	struct psp_xgmi_topology_info *top_info = &adev->psp.xgmi_context.top_info;
> -	struct psp_xgmi_topology_info *peer_info = &peer_adev->psp.xgmi_context.top_info;
> -
> -	for (int i = 0; i < peer_info->num_nodes; i++) {
> -		if (peer_info->nodes[i].node_id == adev->gmc.xgmi.node_id) {
> -			for (int j = 0; j < top_info->num_nodes; j++) {
> -				if (top_info->nodes[j].node_id == peer_adev->gmc.xgmi.node_id) {
> -					peer_info->nodes[i].num_hops = top_info->nodes[j].num_hops;
> -					peer_info->nodes[i].is_sharing_enabled =
> -							top_info->nodes[j].is_sharing_enabled;
> -					peer_info->nodes[i].num_links =
> -							top_info->nodes[j].num_links;
> -					return;
> -				}
> -			}
> -		}
> -	}
> -}
> -
>   int amdgpu_xgmi_add_device(struct amdgpu_device *adev)
>   {
>   	struct psp_xgmi_topology_info *top_info;
> @@ -1065,11 +1043,6 @@ int amdgpu_xgmi_add_device(struct amdgpu_device *adev)
>   				/* To do: continue with some node failed or disable the whole hive*/
>   				goto exit_unlock;
>   			}
> -
> -			/* fill the topology info for peers instead of getting from PSP */
> -			list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) {
> -				amdgpu_xgmi_fill_topology_info(adev, tmp_adev);
> -			}
>   		} else {
>   			/* get latest topology info for each device from psp */
>   			list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) {


^ permalink raw reply	[flat|nested] 6+ messages in thread

* RE: [PATCH 1/3] drm/amdgpu: Refactor sriov xgmi topology filling to common code
       [not found]   ` <MW4PR12MB71918A5EFF20C239743AC346F1FAA@MW4PR12MB7191.namprd12.prod.outlook.com>
@ 2025-11-03 20:24     ` Aitken, Will
  2025-11-06 16:27       ` Lazar, Lijo
  0 siblings, 1 reply; 6+ messages in thread
From: Aitken, Will @ 2025-11-03 20:24 UTC (permalink / raw)
  To: Lazar, Lijo, amd-gfx@lists.freedesktop.org; +Cc: Skvortsov, Victor

[-- Attachment #1: Type: text/plain, Size: 7676 bytes --]

[AMD Official Use Only - AMD Internal Distribution Only]

Hi Lijo,

Just ran some tests and yes, I think we should still reflect everything. Even with extended peer link command disabled, if we do not reflect, the table will only be filled beneath the diagonal (e.g. attached). So I think the patch is good as is for that concern.

Let me know if there's anything else you want me to look into.
Thanks,
Will
-----Original Message-----
From: Aitken, Will
Sent: Wednesday, October 29, 2025 9:16 AM
To: Lazar, Lijo <Lijo.Lazar@amd.com>; amd-gfx@lists.freedesktop.org
Cc: Skvortsov, Victor <Victor.Skvortsov@amd.com>
Subject: RE: [PATCH 1/3] drm/amdgpu: Refactor sriov xgmi topology filling to common code

Hi Lijo,

Thanks for reviewing. Good question. I don't think it's necessary since it wasn't being done before (the equivalent xgmi_fill function was only called if sharing was enabled). I don't think it hurts to do it, but it might be redundant (the port num copying at least would be zeros for the other nodes since it does not receive extended data). Not sure about the is_sharing_enabled. I will run some experiments and add checks to only reflect if necessary.

Thanks,
Will

-----Original Message-----
From: Lazar, Lijo <Lijo.Lazar@amd.com>
Sent: Tuesday, October 28, 2025 9:16 AM
To: Aitken, Will <Will.Aitken@amd.com>; amd-gfx@lists.freedesktop.org
Cc: Skvortsov, Victor <Victor.Skvortsov@amd.com>; Aitken, Will <Will.Aitken@amd.com>
Subject: Re: [PATCH 1/3] drm/amdgpu: Refactor sriov xgmi topology filling to common code



On 10/8/2025 7:02 PM, will.aitken@amd.com wrote:
> From: Will Aitken <wiaitken@amd.com>
>
> amdgpu_xgmi_fill_topology_info and psp_xgmi_reflect_topology_info
> perform the same logic of copying topology info of one node to every
> other node in the hive. Instead of having two functions that purport
> to do the same thing, this refactoring moves the logic of the fill
> function to the reflect function and adds reflecting port number info
> as well for complete functionality.
>
> Signed-off-by: Will Aitken <wiaitken@amd.com>
> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c  | 19 ++++++++++++-----
>   drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c | 27 ------------------------
>   2 files changed, 14 insertions(+), 32 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
> index 6208a49c9f23..82500ade240d 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
> @@ -1539,6 +1539,7 @@ static void psp_xgmi_reflect_topology_info(struct psp_context *psp,
>       uint64_t src_node_id = psp->adev->gmc.xgmi.node_id;
>       uint64_t dst_node_id = node_info.node_id;
>       uint8_t dst_num_hops = node_info.num_hops;
> +     uint8_t dst_is_sharing_enabled = node_info.is_sharing_enabled;
>       uint8_t dst_num_links = node_info.num_links;
>
>       hive = amdgpu_get_xgmi_hive(psp->adev); @@ -1558,13 +1559,20 @@
> static void psp_xgmi_reflect_topology_info(struct psp_context *psp,
>                               continue;
>
>                       mirror_top_info->nodes[j].num_hops = dst_num_hops;
> -                     /*
> -                      * prevent 0 num_links value re-reflection since reflection
> +                     mirror_top_info->nodes[j].is_sharing_enabled = dst_is_sharing_enabled;
> +                     /* prevent 0 num_links value re-reflection since reflection
>                        * criteria is based on num_hops (direct or indirect).
> -                      *
>                        */
> -                     if (dst_num_links)
> +                     if (dst_num_links) {

Patches look fine. One clarification - do you need to fill this information if sharing is disabled?

Thanks,
Lijo

>                               mirror_top_info->nodes[j].num_links = dst_num_links;
> +                             /* swap src and dst due to frame of reference */
> +                             for (int k = 0; k < dst_num_links; k++) {
> +                                     mirror_top_info->nodes[j].port_num[k].src_xgmi_port_num =
> +                                             node_info.port_num[k].dst_xgmi_port_num;
> +                                     mirror_top_info->nodes[j].port_num[k].dst_xgmi_port_num =
> +                                             node_info.port_num[k].src_xgmi_port_num;
> +                             }
> +                     }
>
>                       break;
>               }
> @@ -1639,7 +1647,8 @@ int psp_xgmi_get_topology_info(struct psp_context *psp,
>                       amdgpu_ip_version(psp->adev, MP0_HWIP, 0) ==
>                               IP_VERSION(13, 0, 6) ||
>                       amdgpu_ip_version(psp->adev, MP0_HWIP, 0) ==
> -                             IP_VERSION(13, 0, 14);
> +                             IP_VERSION(13, 0, 14) ||
> +                     amdgpu_sriov_vf(psp->adev);
>               bool ta_port_num_support = amdgpu_sriov_vf(psp->adev) ? 0 :
>                               psp->xgmi_context.xgmi_ta_caps & EXTEND_PEER_LINK_INFO_CMD_FLAG;
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
> index 1ede308a7c67..2e70b84a8c3b 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
> @@ -958,28 +958,6 @@ static int amdgpu_xgmi_initialize_hive_get_data_partition(struct amdgpu_hive_inf
>       return 0;
>   }
>
> -static void amdgpu_xgmi_fill_topology_info(struct amdgpu_device *adev,
> -     struct amdgpu_device *peer_adev)
> -{
> -     struct psp_xgmi_topology_info *top_info = &adev->psp.xgmi_context.top_info;
> -     struct psp_xgmi_topology_info *peer_info = &peer_adev->psp.xgmi_context.top_info;
> -
> -     for (int i = 0; i < peer_info->num_nodes; i++) {
> -             if (peer_info->nodes[i].node_id == adev->gmc.xgmi.node_id) {
> -                     for (int j = 0; j < top_info->num_nodes; j++) {
> -                             if (top_info->nodes[j].node_id == peer_adev->gmc.xgmi.node_id) {
> -                                     peer_info->nodes[i].num_hops = top_info->nodes[j].num_hops;
> -                                     peer_info->nodes[i].is_sharing_enabled =
> -                                                     top_info->nodes[j].is_sharing_enabled;
> -                                     peer_info->nodes[i].num_links =
> -                                                     top_info->nodes[j].num_links;
> -                                     return;
> -                             }
> -                     }
> -             }
> -     }
> -}
> -
>   int amdgpu_xgmi_add_device(struct amdgpu_device *adev)
>   {
>       struct psp_xgmi_topology_info *top_info; @@ -1065,11 +1043,6 @@ int
> amdgpu_xgmi_add_device(struct amdgpu_device *adev)
>                               /* To do: continue with some node failed or disable the whole hive*/
>                               goto exit_unlock;
>                       }
> -
> -                     /* fill the topology info for peers instead of getting from PSP */
> -                     list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) {
> -                             amdgpu_xgmi_fill_topology_info(adev, tmp_adev);
> -                     }
>               } else {
>                       /* get latest topology info for each device from psp */
>                       list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head)
> {


[-- Attachment #2: link_metrics_not_reflected.png --]
[-- Type: image/png, Size: 27956 bytes --]

^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [PATCH 1/3] drm/amdgpu: Refactor sriov xgmi topology filling to common code
  2025-11-03 20:24     ` Aitken, Will
@ 2025-11-06 16:27       ` Lazar, Lijo
  0 siblings, 0 replies; 6+ messages in thread
From: Lazar, Lijo @ 2025-11-06 16:27 UTC (permalink / raw)
  To: Aitken, Will, amd-gfx@lists.freedesktop.org; +Cc: Skvortsov, Victor

Hi Will/Victor,

Thanks for all the clarifications. As discussed offline, sharing 
disabled scenario can be taken up separately.

Series is -

	Reviewed-by: Lijo Lazar <lijo.lazar@amd.com>

Thanks,
Lijo

On 11/4/2025 1:54 AM, Aitken, Will wrote:
> [AMD Official Use Only - AMD Internal Distribution Only]
> 
> Hi Lijo,
> 
> Just ran some tests and yes, I think we should still reflect everything. Even with extended peer link command disabled, if we do not reflect, the table will only be filled beneath the diagonal (e.g. attached). So I think the patch is good as is for that concern.
> 
> Let me know if there's anything else you want me to look into.
> Thanks,
> Will
> -----Original Message-----
> From: Aitken, Will
> Sent: Wednesday, October 29, 2025 9:16 AM
> To: Lazar, Lijo <Lijo.Lazar@amd.com>; amd-gfx@lists.freedesktop.org
> Cc: Skvortsov, Victor <Victor.Skvortsov@amd.com>
> Subject: RE: [PATCH 1/3] drm/amdgpu: Refactor sriov xgmi topology filling to common code
> 
> Hi Lijo,
> 
> Thanks for reviewing. Good question. I don't think it's necessary since it wasn't being done before (the equivalent xgmi_fill function was only called if sharing was enabled). I don't think it hurts to do it, but it might be redundant (the port num copying at least would be zeros for the other nodes since it does not receive extended data). Not sure about the is_sharing_enabled. I will run some experiments and add checks to only reflect if necessary.
> 
> Thanks,
> Will
> 
> -----Original Message-----
> From: Lazar, Lijo <Lijo.Lazar@amd.com>
> Sent: Tuesday, October 28, 2025 9:16 AM
> To: Aitken, Will <Will.Aitken@amd.com>; amd-gfx@lists.freedesktop.org
> Cc: Skvortsov, Victor <Victor.Skvortsov@amd.com>; Aitken, Will <Will.Aitken@amd.com>
> Subject: Re: [PATCH 1/3] drm/amdgpu: Refactor sriov xgmi topology filling to common code
> 
> 
> 
> On 10/8/2025 7:02 PM, will.aitken@amd.com wrote:
>> From: Will Aitken <wiaitken@amd.com>
>>
>> amdgpu_xgmi_fill_topology_info and psp_xgmi_reflect_topology_info
>> perform the same logic of copying topology info of one node to every
>> other node in the hive. Instead of having two functions that purport
>> to do the same thing, this refactoring moves the logic of the fill
>> function to the reflect function and adds reflecting port number info
>> as well for complete functionality.
>>
>> Signed-off-by: Will Aitken <wiaitken@amd.com>
>> ---
>>    drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c  | 19 ++++++++++++-----
>>    drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c | 27 ------------------------
>>    2 files changed, 14 insertions(+), 32 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
>> index 6208a49c9f23..82500ade240d 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
>> @@ -1539,6 +1539,7 @@ static void psp_xgmi_reflect_topology_info(struct psp_context *psp,
>>        uint64_t src_node_id = psp->adev->gmc.xgmi.node_id;
>>        uint64_t dst_node_id = node_info.node_id;
>>        uint8_t dst_num_hops = node_info.num_hops;
>> +     uint8_t dst_is_sharing_enabled = node_info.is_sharing_enabled;
>>        uint8_t dst_num_links = node_info.num_links;
>>
>>        hive = amdgpu_get_xgmi_hive(psp->adev); @@ -1558,13 +1559,20 @@
>> static void psp_xgmi_reflect_topology_info(struct psp_context *psp,
>>                                continue;
>>
>>                        mirror_top_info->nodes[j].num_hops = dst_num_hops;
>> -                     /*
>> -                      * prevent 0 num_links value re-reflection since reflection
>> +                     mirror_top_info->nodes[j].is_sharing_enabled = dst_is_sharing_enabled;
>> +                     /* prevent 0 num_links value re-reflection since reflection
>>                         * criteria is based on num_hops (direct or indirect).
>> -                      *
>>                         */
>> -                     if (dst_num_links)
>> +                     if (dst_num_links) {
> 
> Patches look fine. One clarification - do you need to fill this information if sharing is disabled?
> 
> Thanks,
> Lijo
> 
>>                                mirror_top_info->nodes[j].num_links = dst_num_links;
>> +                             /* swap src and dst due to frame of reference */
>> +                             for (int k = 0; k < dst_num_links; k++) {
>> +                                     mirror_top_info->nodes[j].port_num[k].src_xgmi_port_num =
>> +                                             node_info.port_num[k].dst_xgmi_port_num;
>> +                                     mirror_top_info->nodes[j].port_num[k].dst_xgmi_port_num =
>> +                                             node_info.port_num[k].src_xgmi_port_num;
>> +                             }
>> +                     }
>>
>>                        break;
>>                }
>> @@ -1639,7 +1647,8 @@ int psp_xgmi_get_topology_info(struct psp_context *psp,
>>                        amdgpu_ip_version(psp->adev, MP0_HWIP, 0) ==
>>                                IP_VERSION(13, 0, 6) ||
>>                        amdgpu_ip_version(psp->adev, MP0_HWIP, 0) ==
>> -                             IP_VERSION(13, 0, 14);
>> +                             IP_VERSION(13, 0, 14) ||
>> +                     amdgpu_sriov_vf(psp->adev);
>>                bool ta_port_num_support = amdgpu_sriov_vf(psp->adev) ? 0 :
>>                                psp->xgmi_context.xgmi_ta_caps & EXTEND_PEER_LINK_INFO_CMD_FLAG;
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
>> index 1ede308a7c67..2e70b84a8c3b 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
>> @@ -958,28 +958,6 @@ static int amdgpu_xgmi_initialize_hive_get_data_partition(struct amdgpu_hive_inf
>>        return 0;
>>    }
>>
>> -static void amdgpu_xgmi_fill_topology_info(struct amdgpu_device *adev,
>> -     struct amdgpu_device *peer_adev)
>> -{
>> -     struct psp_xgmi_topology_info *top_info = &adev->psp.xgmi_context.top_info;
>> -     struct psp_xgmi_topology_info *peer_info = &peer_adev->psp.xgmi_context.top_info;
>> -
>> -     for (int i = 0; i < peer_info->num_nodes; i++) {
>> -             if (peer_info->nodes[i].node_id == adev->gmc.xgmi.node_id) {
>> -                     for (int j = 0; j < top_info->num_nodes; j++) {
>> -                             if (top_info->nodes[j].node_id == peer_adev->gmc.xgmi.node_id) {
>> -                                     peer_info->nodes[i].num_hops = top_info->nodes[j].num_hops;
>> -                                     peer_info->nodes[i].is_sharing_enabled =
>> -                                                     top_info->nodes[j].is_sharing_enabled;
>> -                                     peer_info->nodes[i].num_links =
>> -                                                     top_info->nodes[j].num_links;
>> -                                     return;
>> -                             }
>> -                     }
>> -             }
>> -     }
>> -}
>> -
>>    int amdgpu_xgmi_add_device(struct amdgpu_device *adev)
>>    {
>>        struct psp_xgmi_topology_info *top_info; @@ -1065,11 +1043,6 @@ int
>> amdgpu_xgmi_add_device(struct amdgpu_device *adev)
>>                                /* To do: continue with some node failed or disable the whole hive*/
>>                                goto exit_unlock;
>>                        }
>> -
>> -                     /* fill the topology info for peers instead of getting from PSP */
>> -                     list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) {
>> -                             amdgpu_xgmi_fill_topology_info(adev, tmp_adev);
>> -                     }
>>                } else {
>>                        /* get latest topology info for each device from psp */
>>                        list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head)
>> {
> 


^ permalink raw reply	[flat|nested] 6+ messages in thread

end of thread, other threads:[~2025-11-06 16:27 UTC | newest]

Thread overview: 6+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2025-10-08 13:32 [PATCH 1/3] drm/amdgpu: Refactor sriov xgmi topology filling to common code will.aitken
2025-10-08 13:32 ` [PATCH 2/3] drm/amdgpu: Update headers for sriov xgmi ext peer link support feature flag will.aitken
2025-10-08 13:32 ` [PATCH 3/3] drm/amdgpu: Enable xgmi extended peer links for sriov guest will.aitken
2025-10-28 13:16 ` [PATCH 1/3] drm/amdgpu: Refactor sriov xgmi topology filling to common code Lazar, Lijo
     [not found]   ` <MW4PR12MB71918A5EFF20C239743AC346F1FAA@MW4PR12MB7191.namprd12.prod.outlook.com>
2025-11-03 20:24     ` Aitken, Will
2025-11-06 16:27       ` Lazar, Lijo

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox