AMD-GFX Archive on lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH] drm/amdkfd: sever xgmi io link if host driver has disable sharing
@ 2024-10-16 15:58 jokim
  2024-10-16 19:08 ` Kim, Jonathan
  0 siblings, 1 reply; 3+ messages in thread
From: jokim @ 2024-10-16 15:58 UTC (permalink / raw)
  To: amd-gfx
  Cc: Harish.Kasiviswanathan, Felix.Kuehling, Jonathan Kim,
	Jonathan Kim, James Yao

From: Jonathan Kim <Jonathan.Kim@amd.com>

Host drivers can create partial hives per guest by disabling xgmi sharing
between certain peers in the main hive.
Typically, these partial hives are fully connected per guest session.
In the event that the host makes a mistake by adding a non-shared node
to a guest session, have the KFD reflect that sharing is disabled by
severing the IO link.

Signed-off-by: Jonathan Kim <jonathan.kim@amd.com>
Tested-by: James Yao <yiqing@yao.amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c | 17 +++++++++++++++++
 drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h |  2 ++
 drivers/gpu/drm/amd/amdkfd/kfd_crat.c    |  3 +++
 3 files changed, 22 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
index fcdbcff57632..1d50f327eb08 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
@@ -801,6 +801,23 @@ int amdgpu_xgmi_get_num_links(struct amdgpu_device *adev,
 	return	-EINVAL;
 }
 
+bool amdgpu_xgmi_get_is_sharing_enabled(struct amdgpu_device *adev,
+					struct amdgpu_device *peer_adev)
+{
+	struct psp_xgmi_topology_info *top = &adev->psp.xgmi_context.top_info;
+	int i;
+
+	/* Sharing should always be enabled for non-SRIOV. */
+	if (!amdgpu_sriov_vf(adev))
+		return true;
+
+	for (i = 0 ; i < top->num_nodes; ++i)
+		if (top->nodes[i].node_id == peer_adev->gmc.xgmi.node_id)
+			return !!top->nodes[i].is_sharing_enabled;
+
+	return false;
+}
+
 /*
  * Devices that support extended data require the entire hive to initialize with
  * the shared memory buffer flag set.
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h
index 41d5f97fc77a..8cc7ab38db7c 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h
@@ -66,6 +66,8 @@ int amdgpu_xgmi_get_hops_count(struct amdgpu_device *adev,
 		struct amdgpu_device *peer_adev);
 int amdgpu_xgmi_get_num_links(struct amdgpu_device *adev,
 		struct amdgpu_device *peer_adev);
+bool amdgpu_xgmi_get_is_sharing_enabled(struct amdgpu_device *adev,
+					struct amdgpu_device *peer_adev);
 uint64_t amdgpu_xgmi_get_relative_phy_addr(struct amdgpu_device *adev,
 					   uint64_t addr);
 static inline bool amdgpu_xgmi_same_hive(struct amdgpu_device *adev,
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_crat.c b/drivers/gpu/drm/amd/amdkfd/kfd_crat.c
index 48caecf7e72e..723f1220e1cc 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_crat.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_crat.c
@@ -28,6 +28,7 @@
 #include "kfd_topology.h"
 #include "amdgpu.h"
 #include "amdgpu_amdkfd.h"
+#include "amdgpu_xgmi.h"
 
 /* GPU Processor ID base for dGPUs for which VCRAT needs to be created.
  * GPU processor ID are expressed with Bit[31]=1.
@@ -2329,6 +2330,8 @@ static int kfd_create_vcrat_image_gpu(void *pcrat_image,
 				continue;
 			if (peer_dev->gpu->kfd->hive_id != kdev->kfd->hive_id)
 				continue;
+			if (!amdgpu_xgmi_get_is_sharing_enabled(kdev->adev, peer_dev->gpu->adev))
+				continue;
 			sub_type_hdr = (typeof(sub_type_hdr))(
 				(char *)sub_type_hdr +
 				sizeof(struct crat_subtype_iolink));
-- 
2.34.1


^ permalink raw reply related	[flat|nested] 3+ messages in thread

* RE: [PATCH] drm/amdkfd: sever xgmi io link if host driver has disable sharing
  2024-10-16 15:58 [PATCH] drm/amdkfd: sever xgmi io link if host driver has disable sharing jokim
@ 2024-10-16 19:08 ` Kim, Jonathan
  2024-10-21 17:35   ` Harish Kasiviswanathan
  0 siblings, 1 reply; 3+ messages in thread
From: Kim, Jonathan @ 2024-10-16 19:08 UTC (permalink / raw)
  To: amd-gfx@lists.freedesktop.org
  Cc: Kasiviswanathan, Harish, Kuehling, Felix, Yao, Yiqing(James)

[Public]

Messed up James' email in Tested-by tag.  CC'ing James.

> -----Original Message-----
> From: Kim, Jonathan <Jonathan.Kim@amd.com>
> Sent: Wednesday, October 16, 2024 11:59 AM
> To: amd-gfx@lists.freedesktop.org
> Cc: Kasiviswanathan, Harish <Harish.Kasiviswanathan@amd.com>; Kuehling, Felix
> <Felix.Kuehling@amd.com>; Kim, Jonathan <Jonathan.Kim@amd.com>; Kim,
> Jonathan <Jonathan.Kim@amd.com>; James Yao <yiqing@yao.amd.com>
> Subject: [PATCH] drm/amdkfd: sever xgmi io link if host driver has disable sharing
>
> From: Jonathan Kim <Jonathan.Kim@amd.com>
>
> Host drivers can create partial hives per guest by disabling xgmi sharing
> between certain peers in the main hive.
> Typically, these partial hives are fully connected per guest session.
> In the event that the host makes a mistake by adding a non-shared node
> to a guest session, have the KFD reflect sharing disabled by severing
> the IO link.
>
> Signed-off-by: Jonathan Kim <jonathan.kim@amd.com>
> Tested-by: James Yao <yiqing@yao.amd.com>
> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c | 17 +++++++++++++++++
>  drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h |  2 ++
>  drivers/gpu/drm/amd/amdkfd/kfd_crat.c    |  3 +++
>  3 files changed, 22 insertions(+)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
> index fcdbcff57632..1d50f327eb08 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
> @@ -801,6 +801,23 @@ int amdgpu_xgmi_get_num_links(struct amdgpu_device
> *adev,
>       return  -EINVAL;
>  }
>
> +bool amdgpu_xgmi_get_is_sharing_enabled(struct amdgpu_device *adev,
> +                                     struct amdgpu_device *peer_adev)
> +{
> +     struct psp_xgmi_topology_info *top = &adev->psp.xgmi_context.top_info;
> +     int i;
> +
> +     /* Sharing should always be enabled for non-SRIOV. */
> +     if (!amdgpu_sriov_vf(adev))
> +             return true;
> +
> +     for (i = 0 ; i < top->num_nodes; ++i)
> +             if (top->nodes[i].node_id == peer_adev->gmc.xgmi.node_id)
> +                     return !!top->nodes[i].is_sharing_enabled;
> +
> +     return false;
> +}
> +
>  /*
>   * Devices that support extended data require the entire hive to initialize with
>   * the shared memory buffer flag set.
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h
> index 41d5f97fc77a..8cc7ab38db7c 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h
> @@ -66,6 +66,8 @@ int amdgpu_xgmi_get_hops_count(struct amdgpu_device
> *adev,
>               struct amdgpu_device *peer_adev);
>  int amdgpu_xgmi_get_num_links(struct amdgpu_device *adev,
>               struct amdgpu_device *peer_adev);
> +bool amdgpu_xgmi_get_is_sharing_enabled(struct amdgpu_device *adev,
> +                                     struct amdgpu_device *peer_adev);
>  uint64_t amdgpu_xgmi_get_relative_phy_addr(struct amdgpu_device *adev,
>                                          uint64_t addr);
>  static inline bool amdgpu_xgmi_same_hive(struct amdgpu_device *adev,
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_crat.c
> b/drivers/gpu/drm/amd/amdkfd/kfd_crat.c
> index 48caecf7e72e..723f1220e1cc 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_crat.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_crat.c
> @@ -28,6 +28,7 @@
>  #include "kfd_topology.h"
>  #include "amdgpu.h"
>  #include "amdgpu_amdkfd.h"
> +#include "amdgpu_xgmi.h"
>
>  /* GPU Processor ID base for dGPUs for which VCRAT needs to be created.
>   * GPU processor ID are expressed with Bit[31]=1.
> @@ -2329,6 +2330,8 @@ static int kfd_create_vcrat_image_gpu(void *pcrat_image,
>                               continue;
>                       if (peer_dev->gpu->kfd->hive_id != kdev->kfd->hive_id)
>                               continue;
> +                     if (!amdgpu_xgmi_get_is_sharing_enabled(kdev->adev,
> peer_dev->gpu->adev))
> +                             continue;
>                       sub_type_hdr = (typeof(sub_type_hdr))(
>                               (char *)sub_type_hdr +
>                               sizeof(struct crat_subtype_iolink));
> --
> 2.34.1


^ permalink raw reply	[flat|nested] 3+ messages in thread

* Re: [PATCH] drm/amdkfd: sever xgmi io link if host driver has disable sharing
  2024-10-16 19:08 ` Kim, Jonathan
@ 2024-10-21 17:35   ` Harish Kasiviswanathan
  0 siblings, 0 replies; 3+ messages in thread
From: Harish Kasiviswanathan @ 2024-10-21 17:35 UTC (permalink / raw)
  To: Kim, Jonathan, amd-gfx@lists.freedesktop.org
  Cc: Kuehling, Felix, Yao, Yiqing(James)

Reviewed-by: Harish Kasiviswanathan <Harish.Kasiviswanathan@amd.com>

On 2024-10-16 15:08, Kim, Jonathan wrote:
> [Public]
> 
> Messed up James' email in Tested-by tag.  CC'ing James.
> 
>> -----Original Message-----
>> From: Kim, Jonathan <Jonathan.Kim@amd.com>
>> Sent: Wednesday, October 16, 2024 11:59 AM
>> To: amd-gfx@lists.freedesktop.org
>> Cc: Kasiviswanathan, Harish <Harish.Kasiviswanathan@amd.com>; Kuehling, Felix
>> <Felix.Kuehling@amd.com>; Kim, Jonathan <Jonathan.Kim@amd.com>; Kim,
>> Jonathan <Jonathan.Kim@amd.com>; James Yao <yiqing@yao.amd.com>
>> Subject: [PATCH] drm/amdkfd: sever xgmi io link if host driver has disable sharing
>>
>> From: Jonathan Kim <Jonathan.Kim@amd.com>
>>
>> Host drivers can create partial hives per guest by disabling xgmi sharing
>> between certain peers in the main hive.
>> Typically, these partial hives are fully connected per guest session.
>> In the event that the host makes a mistake by adding a non-shared node
>> to a guest session, have the KFD reflect sharing disabled by severing
>> the IO link.
>>
>> Signed-off-by: Jonathan Kim <jonathan.kim@amd.com>
>> Tested-by: James Yao <yiqing@yao.amd.com>
>> ---
>>  drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c | 17 +++++++++++++++++
>>  drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h |  2 ++
>>  drivers/gpu/drm/amd/amdkfd/kfd_crat.c    |  3 +++
>>  3 files changed, 22 insertions(+)
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
>> index fcdbcff57632..1d50f327eb08 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
>> @@ -801,6 +801,23 @@ int amdgpu_xgmi_get_num_links(struct amdgpu_device
>> *adev,
>>       return  -EINVAL;
>>  }
>>
>> +bool amdgpu_xgmi_get_is_sharing_enabled(struct amdgpu_device *adev,
>> +                                     struct amdgpu_device *peer_adev)
>> +{
>> +     struct psp_xgmi_topology_info *top = &adev->psp.xgmi_context.top_info;
>> +     int i;
>> +
>> +     /* Sharing should always be enabled for non-SRIOV. */
>> +     if (!amdgpu_sriov_vf(adev))
>> +             return true;
>> +
>> +     for (i = 0 ; i < top->num_nodes; ++i)
>> +             if (top->nodes[i].node_id == peer_adev->gmc.xgmi.node_id)
>> +                     return !!top->nodes[i].is_sharing_enabled;
>> +
>> +     return false;
>> +}
>> +
>>  /*
>>   * Devices that support extended data require the entire hive to initialize with
>>   * the shared memory buffer flag set.
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h
>> index 41d5f97fc77a..8cc7ab38db7c 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h
>> @@ -66,6 +66,8 @@ int amdgpu_xgmi_get_hops_count(struct amdgpu_device
>> *adev,
>>               struct amdgpu_device *peer_adev);
>>  int amdgpu_xgmi_get_num_links(struct amdgpu_device *adev,
>>               struct amdgpu_device *peer_adev);
>> +bool amdgpu_xgmi_get_is_sharing_enabled(struct amdgpu_device *adev,
>> +                                     struct amdgpu_device *peer_adev);
>>  uint64_t amdgpu_xgmi_get_relative_phy_addr(struct amdgpu_device *adev,
>>                                          uint64_t addr);
>>  static inline bool amdgpu_xgmi_same_hive(struct amdgpu_device *adev,
>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_crat.c
>> b/drivers/gpu/drm/amd/amdkfd/kfd_crat.c
>> index 48caecf7e72e..723f1220e1cc 100644
>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_crat.c
>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_crat.c
>> @@ -28,6 +28,7 @@
>>  #include "kfd_topology.h"
>>  #include "amdgpu.h"
>>  #include "amdgpu_amdkfd.h"
>> +#include "amdgpu_xgmi.h"
>>
>>  /* GPU Processor ID base for dGPUs for which VCRAT needs to be created.
>>   * GPU processor ID are expressed with Bit[31]=1.
>> @@ -2329,6 +2330,8 @@ static int kfd_create_vcrat_image_gpu(void *pcrat_image,
>>                               continue;
>>                       if (peer_dev->gpu->kfd->hive_id != kdev->kfd->hive_id)
>>                               continue;
>> +                     if (!amdgpu_xgmi_get_is_sharing_enabled(kdev->adev,
>> peer_dev->gpu->adev))
>> +                             continue;
>>                       sub_type_hdr = (typeof(sub_type_hdr))(
>>                               (char *)sub_type_hdr +
>>                               sizeof(struct crat_subtype_iolink));
>> --
>> 2.34.1
> 

^ permalink raw reply	[flat|nested] 3+ messages in thread

end of thread, other threads:[~2024-10-21 17:35 UTC | newest]

Thread overview: 3+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2024-10-16 15:58 [PATCH] drm/amdkfd: sever xgmi io link if host driver has disable sharing jokim
2024-10-16 19:08 ` Kim, Jonathan
2024-10-21 17:35   ` Harish Kasiviswanathan

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox